JOBSCRAPER-1 Implement a bit of AI
* Moved to mongo
* UI updated to use mongo
* UI is a bit fancier now
* Import sql to mongo
This commit is contained in:
parent
1938bbeb5f
commit
f2880b661e
1
.gitignore
vendored
1
.gitignore
vendored
@ -147,3 +147,4 @@ fabric.properties
|
|||||||
/live/
|
/live/
|
||||||
!/output/
|
!/output/
|
||||||
/db/jobs.db
|
/db/jobs.db
|
||||||
|
!/db/
|
||||||
|
7179
biglist.json
7179
biglist.json
File diff suppressed because one or more lines are too long
14
lib/base.js
14
lib/base.js
@ -10,6 +10,8 @@ const filterAccept = require('../lib/filter_md_jobs');
|
|||||||
const dbmanager = require('../lib/dbmanager');
|
const dbmanager = require('../lib/dbmanager');
|
||||||
const JobsModel = require('../lib/mongoManager');
|
const JobsModel = require('../lib/mongoManager');
|
||||||
|
|
||||||
|
const SHA = require('crypto-js/sha256');
|
||||||
|
|
||||||
const { Utils } = require('@rakh/utils');
|
const { Utils } = require('@rakh/utils');
|
||||||
const { Corpus } = require('./corpus');
|
const { Corpus } = require('./corpus');
|
||||||
|
|
||||||
@ -75,7 +77,7 @@ class MasterBase {
|
|||||||
newJob.save().then((m) => {
|
newJob.save().then((m) => {
|
||||||
console.log('m', m.details.title);
|
console.log('m', m.details.title);
|
||||||
}).catch((err) => {
|
}).catch((err) => {
|
||||||
console.error(err);
|
console.error('m', err);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -112,10 +114,20 @@ class MasterBase {
|
|||||||
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
|
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
|
||||||
*/
|
*/
|
||||||
reduceData(d) {
|
reduceData(d) {
|
||||||
|
const clearPremium = /(\n+)(Featured|Premium)/gi;
|
||||||
|
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
|
||||||
|
|
||||||
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
|
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
|
||||||
|
|
||||||
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
|
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
|
||||||
|
|
||||||
|
outObj.details.title = outObj.details.title.replace(clearPremium, '');
|
||||||
|
outObj.details.title = outObj.details.title.replace(otherStupid, '');
|
||||||
|
outObj.details.hashed = SHA(outObj.details.summary);
|
||||||
|
|
||||||
|
outObj.data.read = 0;
|
||||||
|
outObj.data.applied = d.applied || 0;
|
||||||
|
|
||||||
outObj.data.jobtype = this.analyseRate(d.salary);
|
outObj.data.jobtype = this.analyseRate(d.salary);
|
||||||
outObj.data.autoclass = Corpus.process(d.summary);
|
outObj.data.autoclass = Corpus.process(d.summary);
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
|
|||||||
const deSpace = /\s+/g;
|
const deSpace = /\s+/g;
|
||||||
|
|
||||||
function cleanText(intext) {
|
function cleanText(intext) {
|
||||||
if (arguments.length === 0 || intext === null ) return '';
|
if (arguments.length === 0 || typeof intext === 'undefined' || intext === null ) return '';
|
||||||
|
|
||||||
return intext.replace(emailRegex, ' ').replace(detagRegex, ' ').replace(desymbolNumberRegex, ' ').replace(deSpace, ' ').trim().toLowerCase();
|
return intext.replace(emailRegex, ' ').replace(detagRegex, ' ').replace(desymbolNumberRegex, ' ').replace(deSpace, ' ').trim().toLowerCase();
|
||||||
}
|
}
|
||||||
|
@ -18,9 +18,15 @@ require('dotenv').config();
|
|||||||
|
|
||||||
logger.level = 'debug';
|
logger.level = 'debug';
|
||||||
|
|
||||||
logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
|
const mongoConnect = process.env.MONGOCONNECT;
|
||||||
|
|
||||||
mongoose.connect(`mongodb://martin:1V3D4m526i@127.0.0.1/jobs`);
|
// logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
|
||||||
|
|
||||||
|
// mongoose.connect(`mongodb://martin:1V3D4m526i@127.0.0.1/jobs`);
|
||||||
|
|
||||||
|
logger.debug(mongoConnect);
|
||||||
|
|
||||||
|
mongoose.connect(mongoConnect);
|
||||||
|
|
||||||
const mDB = mongoose.connection;
|
const mDB = mongoose.connection;
|
||||||
mDB.on('error', console.error.bind(console, 'connection error:'));
|
mDB.on('error', console.error.bind(console, 'connection error:'));
|
||||||
|
@ -22,7 +22,7 @@ class MasterScraper extends MasterBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @param url
|
* @param url
|
||||||
* @param useStone
|
* @param useStone
|
||||||
* @returns {Promise<unknown>}
|
* @returns {Promise<unknown>}
|
||||||
@ -59,23 +59,28 @@ fetch(url, {
|
|||||||
resolve(response.body);
|
resolve(response.body);
|
||||||
})
|
})
|
||||||
.catch((e) => {
|
.catch((e) => {
|
||||||
|
console.error('getContent', e );
|
||||||
reject(e.response.body);
|
reject(e.response.body);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
async savePage(html) {
|
||||||
*
|
const now = fecha.format(new Date(), 'YYYY-MM-DD--hh');
|
||||||
* @returns {Promise<void>}
|
|
||||||
*/
|
const filename = `pages/${this.siteid}-${now}.html`;
|
||||||
|
|
||||||
|
fs.writeFileSync(filename, html);
|
||||||
|
}
|
||||||
|
|
||||||
async getPage() {
|
async getPage() {
|
||||||
console.log('>> getPage: fetching', this.url);
|
console.log('>> getPage: fetching', this.url);
|
||||||
const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss');
|
|
||||||
const filename = `${this.siteid}-${now}.html`;
|
|
||||||
|
|
||||||
await this.getContent(this.url, this.useStone)
|
await this.getContent(this.url, this.useStone)
|
||||||
.then((html) => {
|
.then((html) => {
|
||||||
fs.writeFileSync(filename, html);
|
// console.log('>> getPage:: got', html);
|
||||||
|
console.log('>> getPage:: OK');
|
||||||
|
if (this.saveFile) this.savePage(html);
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
this.loadPage($);
|
this.loadPage($);
|
||||||
})
|
})
|
||||||
|
45
migrate.js
45
migrate.js
@ -12,7 +12,12 @@ const { Utils } = require('@rakh/utils');
|
|||||||
|
|
||||||
const { Corpus } = require('./lib/corpus');
|
const { Corpus } = require('./lib/corpus');
|
||||||
|
|
||||||
|
const SHA = require('crypto-js/sha256');
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
||||||
|
2604
|
||||||
|
|
||||||
const mongoose = require('mongoose');
|
const mongoose = require('mongoose');
|
||||||
const log4js = require('log4js');
|
const log4js = require('log4js');
|
||||||
const logger = log4js.getLogger();
|
const logger = log4js.getLogger();
|
||||||
@ -49,7 +54,8 @@ const migrate = (function() {
|
|||||||
else if ((item > 100) && (item < 5000)) outVal = 1;
|
else if ((item > 100) && (item < 5000)) outVal = 1;
|
||||||
else if (item >= 5000) outVal = 2;
|
else if (item >= 5000) outVal = 2;
|
||||||
}
|
}
|
||||||
|
else return 0;
|
||||||
|
|
||||||
return outVal;
|
return outVal;
|
||||||
}
|
}
|
||||||
function reduceData(d) {
|
function reduceData(d) {
|
||||||
@ -62,6 +68,7 @@ const migrate = (function() {
|
|||||||
|
|
||||||
outObj.details.title = outObj.details.title.replace(clearPremium, '');
|
outObj.details.title = outObj.details.title.replace(clearPremium, '');
|
||||||
outObj.details.title = outObj.details.title.replace(otherStupid, '');
|
outObj.details.title = outObj.details.title.replace(otherStupid, '');
|
||||||
|
outObj.details.hashed = SHA(outObj.details.summary);
|
||||||
|
|
||||||
// outObj.data.read = d.read || 0;
|
// outObj.data.read = d.read || 0;
|
||||||
outObj.data.read = 0;
|
outObj.data.read = 0;
|
||||||
@ -94,8 +101,8 @@ const migrate = (function() {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function start() {
|
async function start() {
|
||||||
getCurrent().then((d) => {
|
await getCurrent().then(async (d) => {
|
||||||
logger.debug(d.length);
|
logger.debug(d.length);
|
||||||
|
|
||||||
for (let t = 0;t < (d.length - 1);t++) {
|
for (let t = 0;t < (d.length - 1);t++) {
|
||||||
@ -105,10 +112,10 @@ const migrate = (function() {
|
|||||||
|
|
||||||
const newJob = Jobs(newD);
|
const newJob = Jobs(newD);
|
||||||
|
|
||||||
newJob.save().then((m) => {
|
await newJob.save().then((m) => {
|
||||||
logger.debug('m', m.details.title);
|
logger.debug('m', m.details.title);
|
||||||
}).catch((err) => {
|
}).catch((err) => {
|
||||||
logger.error(err);
|
logger.error(err.keyPattern);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}).then(() => {
|
}).then(() => {
|
||||||
@ -116,14 +123,34 @@ const migrate = (function() {
|
|||||||
Corpus.exportUnused();
|
Corpus.exportUnused();
|
||||||
})
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
logger.error(err);
|
logger.error(err.keyPattern);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function deleteOld() {
|
||||||
|
const oneDay = 86400000;
|
||||||
|
const twoWeeksAgo = new Date().getTime() - ( 14 * oneDay);
|
||||||
|
|
||||||
|
logger.debug('Delete older than: ', new Date(twoWeeksAgo), twoWeeksAgo);
|
||||||
|
|
||||||
|
logger.debug({ 'data.timestamp': { '$lt': twoWeeksAgo } });
|
||||||
|
Jobs.deleteMany({ 'data.timestamp': { '$lt': twoWeeksAgo }, 'data.applied': 0 }).then((m) => {
|
||||||
|
logger.debug('m', m);
|
||||||
|
}).catch((err) => {
|
||||||
|
logger.error(err);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// newJob.find({ 'data': { 'timestamp': { '$lt': 1587034346000 } } });
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'start':start
|
'start':start,
|
||||||
|
'deleteOld': deleteOld
|
||||||
};
|
};
|
||||||
})();
|
})();
|
||||||
|
|
||||||
migrate.start();
|
(async function() {
|
||||||
logger.info('Done??');
|
await migrate.start();
|
||||||
|
await migrate.deleteOld();
|
||||||
|
logger.info('Done??');
|
||||||
|
})();
|
||||||
|
@ -20,7 +20,8 @@ const jobSchema = new Schema({
|
|||||||
'postdate': String,
|
'postdate': String,
|
||||||
'salary': String,
|
'salary': String,
|
||||||
'easyapply': Number,
|
'easyapply': Number,
|
||||||
'timestamp': Number
|
'timestamp': Number,
|
||||||
|
'hashed' : { 'type': String, 'required':true, 'unique':true }
|
||||||
},
|
},
|
||||||
'data': {
|
'data': {
|
||||||
'read': { 'type': Number, 'default': 0 },
|
'read': { 'type': Number, 'default': 0 },
|
||||||
|
66
onetime.js
Normal file
66
onetime.js
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 16/04/2020
|
||||||
|
* Time: 23:35
|
||||||
|
|
||||||
|
*/
|
||||||
|
const CronJob = require('cron').CronJob;
|
||||||
|
const IndeedScraper = require('./scrapers/indeed');
|
||||||
|
const TotaljobsScraper = require('./scrapers/totaljobs');
|
||||||
|
const CwjobsScraper = require('./scrapers/cwjobs');
|
||||||
|
const JobserveScraper = require('./scrapers/rss.jobserve');
|
||||||
|
const RssS1Jobs = require('./scrapers/rss.s1jobs');
|
||||||
|
const RssTechnojobs = require('./scrapers/rss.technojobs');
|
||||||
|
|
||||||
|
(async function () {
|
||||||
|
console.log('Started..');
|
||||||
|
const indeedScraper = new IndeedScraper();
|
||||||
|
const totaljobsScraper = new TotaljobsScraper();
|
||||||
|
const cwjobsScraper = new CwjobsScraper();
|
||||||
|
const jobserveScraper = new JobserveScraper();
|
||||||
|
const s1jobsScraper = new RssS1Jobs();
|
||||||
|
const technojobsScraper = new RssTechnojobs();
|
||||||
|
|
||||||
|
await indeedScraper.go('london');
|
||||||
|
await totaljobsScraper.go('london');
|
||||||
|
await cwjobsScraper.go('london');
|
||||||
|
|
||||||
|
await indeedScraper.go('glasgow');
|
||||||
|
await totaljobsScraper.go('glasgow');
|
||||||
|
await cwjobsScraper.go('glasgow');
|
||||||
|
await indeedScraper.go('edinburgh');
|
||||||
|
await totaljobsScraper.go('edinburgh');
|
||||||
|
await cwjobsScraper.go('edinburgh');
|
||||||
|
await indeedScraper.go('milton keynes');
|
||||||
|
await totaljobsScraper.go('milton keynes');
|
||||||
|
await cwjobsScraper.go('milton keynes');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/C7B25D86D0844A.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss');
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss'); // javascript node 2 Jul 2020
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/4C67595E323E3453.rss'); // vuejs 2 Jul 2020
|
||||||
|
await jobserveScraper.go('https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss'); // svelte 2 Jul 2020
|
||||||
|
|
||||||
|
await s1jobsScraper.go('http://www.s1jobs.com/xml/m7dp711z2r.xml');
|
||||||
|
await s1jobsScraper.go('http://www.s1jobs.com/xml/pfvf7o7z2r.xml');
|
||||||
|
await s1jobsScraper.go('http://www.s1jobs.com/xml/lluqnt8z2r.xml');
|
||||||
|
await s1jobsScraper.go('http://www.s1jobs.com/xml/tu33qt8z2r.xml');
|
||||||
|
await s1jobsScraper.go('http://www.s1jobs.com/xml/u3btnz8z2r.xml');
|
||||||
|
await s1jobsScraper.go('http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml');
|
||||||
|
await s1jobsScraper.go('http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml');
|
||||||
|
|
||||||
|
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
|
||||||
|
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
|
||||||
|
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
|
||||||
|
|
||||||
|
})();
|
1411
package-lock.json
generated
1411
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -9,12 +9,13 @@
|
|||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@rakh/utils": "file:../utils",
|
"@rakh/utils": "^1.0.0",
|
||||||
"axios": "^0.19.2",
|
"axios": "^0.19.2",
|
||||||
"bayes": "^1.0.0",
|
"bayes": "^1.0.0",
|
||||||
"body-parser": "^1.19.0",
|
"body-parser": "^1.19.0",
|
||||||
"cheerio": "^1.0.0-rc.3",
|
"cheerio": "^1.0.0-rc.3",
|
||||||
"cron": "^1.8.2",
|
"cron": "^1.8.2",
|
||||||
|
"crypto-js": "^4.0.0",
|
||||||
"dotenv": "^8.2.0",
|
"dotenv": "^8.2.0",
|
||||||
"eslint": "^6.8.0",
|
"eslint": "^6.8.0",
|
||||||
"express": "^4.17.1",
|
"express": "^4.17.1",
|
||||||
|
@ -139,7 +139,9 @@ class IndeedScraper extends MasterScraper {
|
|||||||
async go(location = 'london') {
|
async go(location = 'london') {
|
||||||
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
|
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
|
||||||
|
|
||||||
await this.processSite();
|
await this.processSite().catch((err) => {
|
||||||
|
console.error('Indeed Go', err);
|
||||||
|
});
|
||||||
|
|
||||||
console.log(`Indeed ${location} completed`);
|
console.log(`Indeed ${location} completed`);
|
||||||
}
|
}
|
||||||
|
@ -35,7 +35,7 @@ function reduceRecord(record) {
|
|||||||
exports.getList = (req, res) => {
|
exports.getList = (req, res) => {
|
||||||
console.log('>getList req', req.params);
|
console.log('>getList req', req.params);
|
||||||
|
|
||||||
Jobs.find( {}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).then((doc) => {
|
Jobs.find({}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).sort( { 'data.timestamp': -1 } ).then((doc) => {
|
||||||
if (doc) {
|
if (doc) {
|
||||||
|
|
||||||
res.send(reduceList(doc));
|
res.send(reduceList(doc));
|
||||||
|
2
server/dist/build/bundle.css
vendored
2
server/dist/build/bundle.css
vendored
File diff suppressed because one or more lines are too long
6
server/dist/build/bundle.css.map
vendored
6
server/dist/build/bundle.css.map
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js
vendored
2
server/dist/build/bundle.js
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js.map
vendored
2
server/dist/build/bundle.js.map
vendored
File diff suppressed because one or more lines are too long
@ -41,7 +41,9 @@ test.skip('Test Indeed scraper', async t => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
test.skip('Test full run Indeed scraper', async t => {
|
test.skip('Test full run Indeed scraper', async t => {
|
||||||
await indeedScraper.go('london');
|
await indeedScraper.go('london').catch((err) => {
|
||||||
|
console.error('Indeed GO', err);
|
||||||
|
});
|
||||||
|
|
||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user