JOBSCRAPER-1 Implement a bit of AI

* Moved to mongo
* UI updated to use mongo
* UI is a bit fancier now
* Import sql to mongo
This commit is contained in:
Martin Donnelly 2020-09-10 14:13:08 +01:00
parent 1938bbeb5f
commit f2880b661e
19 changed files with 636 additions and 8143 deletions

1
.gitignore vendored
View File

@ -147,3 +147,4 @@ fabric.properties
/live/ /live/
!/output/ !/output/
/db/jobs.db /db/jobs.db
!/db/

File diff suppressed because one or more lines are too long

View File

@ -10,6 +10,8 @@ const filterAccept = require('../lib/filter_md_jobs');
const dbmanager = require('../lib/dbmanager'); const dbmanager = require('../lib/dbmanager');
const JobsModel = require('../lib/mongoManager'); const JobsModel = require('../lib/mongoManager');
const SHA = require('crypto-js/sha256');
const { Utils } = require('@rakh/utils'); const { Utils } = require('@rakh/utils');
const { Corpus } = require('./corpus'); const { Corpus } = require('./corpus');
@ -75,7 +77,7 @@ class MasterBase {
newJob.save().then((m) => { newJob.save().then((m) => {
console.log('m', m.details.title); console.log('m', m.details.title);
}).catch((err) => { }).catch((err) => {
console.error(err); console.error('m', err);
}); });
} }
} }
@ -112,10 +114,20 @@ class MasterBase {
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}} * @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
*/ */
reduceData(d) { reduceData(d) {
const clearPremium = /(\n+)(Featured|Premium)/gi;
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } }; const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']); outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
outObj.details.title = outObj.details.title.replace(clearPremium, '');
outObj.details.title = outObj.details.title.replace(otherStupid, '');
outObj.details.hashed = SHA(outObj.details.summary);
outObj.data.read = 0;
outObj.data.applied = d.applied || 0;
outObj.data.jobtype = this.analyseRate(d.salary); outObj.data.jobtype = this.analyseRate(d.salary);
outObj.data.autoclass = Corpus.process(d.summary); outObj.data.autoclass = Corpus.process(d.summary);

View File

@ -18,7 +18,7 @@ const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
const deSpace = /\s+/g; const deSpace = /\s+/g;
function cleanText(intext) { function cleanText(intext) {
if (arguments.length === 0 || intext === null ) return ''; if (arguments.length === 0 || typeof intext === 'undefined' || intext === null ) return '';
return intext.replace(emailRegex, ' ').replace(detagRegex, ' ').replace(desymbolNumberRegex, ' ').replace(deSpace, ' ').trim().toLowerCase(); return intext.replace(emailRegex, ' ').replace(detagRegex, ' ').replace(desymbolNumberRegex, ' ').replace(deSpace, ' ').trim().toLowerCase();
} }

View File

@ -18,9 +18,15 @@ require('dotenv').config();
logger.level = 'debug'; logger.level = 'debug';
logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`); const mongoConnect = process.env.MONGOCONNECT;
mongoose.connect(`mongodb://martin:1V3D4m526i@127.0.0.1/jobs`); // logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
// mongoose.connect(`mongodb://martin:1V3D4m526i@127.0.0.1/jobs`);
logger.debug(mongoConnect);
mongoose.connect(mongoConnect);
const mDB = mongoose.connection; const mDB = mongoose.connection;
mDB.on('error', console.error.bind(console, 'connection error:')); mDB.on('error', console.error.bind(console, 'connection error:'));

View File

@ -22,7 +22,7 @@ class MasterScraper extends MasterBase {
} }
/** /**
* *
* @param url * @param url
* @param useStone * @param useStone
* @returns {Promise<unknown>} * @returns {Promise<unknown>}
@ -59,23 +59,28 @@ fetch(url, {
resolve(response.body); resolve(response.body);
}) })
.catch((e) => { .catch((e) => {
console.error('getContent', e );
reject(e.response.body); reject(e.response.body);
}); });
}); });
}; };
/** async savePage(html) {
* const now = fecha.format(new Date(), 'YYYY-MM-DD--hh');
* @returns {Promise<void>}
*/ const filename = `pages/${this.siteid}-${now}.html`;
fs.writeFileSync(filename, html);
}
async getPage() { async getPage() {
console.log('>> getPage: fetching', this.url); console.log('>> getPage: fetching', this.url);
const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss');
const filename = `${this.siteid}-${now}.html`;
await this.getContent(this.url, this.useStone) await this.getContent(this.url, this.useStone)
.then((html) => { .then((html) => {
fs.writeFileSync(filename, html); // console.log('>> getPage:: got', html);
console.log('>> getPage:: OK');
if (this.saveFile) this.savePage(html);
const $ = cheerio.load(html); const $ = cheerio.load(html);
this.loadPage($); this.loadPage($);
}) })

View File

@ -12,7 +12,12 @@ const { Utils } = require('@rakh/utils');
const { Corpus } = require('./lib/corpus'); const { Corpus } = require('./lib/corpus');
const SHA = require('crypto-js/sha256');
/* /*
2604
const mongoose = require('mongoose'); const mongoose = require('mongoose');
const log4js = require('log4js'); const log4js = require('log4js');
const logger = log4js.getLogger(); const logger = log4js.getLogger();
@ -49,7 +54,8 @@ const migrate = (function() {
else if ((item > 100) && (item < 5000)) outVal = 1; else if ((item > 100) && (item < 5000)) outVal = 1;
else if (item >= 5000) outVal = 2; else if (item >= 5000) outVal = 2;
} }
else return 0;
return outVal; return outVal;
} }
function reduceData(d) { function reduceData(d) {
@ -62,6 +68,7 @@ const migrate = (function() {
outObj.details.title = outObj.details.title.replace(clearPremium, ''); outObj.details.title = outObj.details.title.replace(clearPremium, '');
outObj.details.title = outObj.details.title.replace(otherStupid, ''); outObj.details.title = outObj.details.title.replace(otherStupid, '');
outObj.details.hashed = SHA(outObj.details.summary);
// outObj.data.read = d.read || 0; // outObj.data.read = d.read || 0;
outObj.data.read = 0; outObj.data.read = 0;
@ -94,8 +101,8 @@ const migrate = (function() {
}); });
} }
function start() { async function start() {
getCurrent().then((d) => { await getCurrent().then(async (d) => {
logger.debug(d.length); logger.debug(d.length);
for (let t = 0;t < (d.length - 1);t++) { for (let t = 0;t < (d.length - 1);t++) {
@ -105,10 +112,10 @@ const migrate = (function() {
const newJob = Jobs(newD); const newJob = Jobs(newD);
newJob.save().then((m) => { await newJob.save().then((m) => {
logger.debug('m', m.details.title); logger.debug('m', m.details.title);
}).catch((err) => { }).catch((err) => {
logger.error(err); logger.error(err.keyPattern);
}); });
} }
}).then(() => { }).then(() => {
@ -116,14 +123,34 @@ const migrate = (function() {
Corpus.exportUnused(); Corpus.exportUnused();
}) })
.catch((err) => { .catch((err) => {
logger.error(err); logger.error(err.keyPattern);
}); });
} }
/**
 * Deletes stored jobs whose `data.timestamp` is more than fourteen days old
 * and whose `data.applied` is 0.
 *
 * The delete is awaited so that a caller doing `await deleteOld()` only
 * continues once the operation has finished. Failures are logged and not
 * rethrown.
 *
 * @returns {Promise<void>}
 */
async function deleteOld() {
  const oneDay = 86400000; // milliseconds in 24 hours
  const twoWeeksAgo = Date.now() - (14 * oneDay);
  // NOTE(review): assumes data.timestamp is stored as epoch milliseconds - confirm against the schema
  const query = { 'data.timestamp': { '$lt': twoWeeksAgo }, 'data.applied': 0 };

  logger.debug('Delete older than: ', new Date(twoWeeksAgo), twoWeeksAgo);
  // Log the exact filter that is executed, including the data.applied condition.
  logger.debug(query);

  try {
    const m = await Jobs.deleteMany(query);
    logger.debug('m', m);
  }
  catch (err) {
    logger.error(err);
  }
}
// newJob.find({ 'data': { 'timestamp': { '$lt': 1587034346000 } } });
return { return {
'start':start 'start':start,
'deleteOld': deleteOld
}; };
})(); })();
migrate.start(); (async function() {
logger.info('Done??'); await migrate.start();
await migrate.deleteOld();
logger.info('Done??');
})();

View File

@ -20,7 +20,8 @@ const jobSchema = new Schema({
'postdate': String, 'postdate': String,
'salary': String, 'salary': String,
'easyapply': Number, 'easyapply': Number,
'timestamp': Number 'timestamp': Number,
'hashed' : { 'type': String, 'required':true, 'unique':true }
}, },
'data': { 'data': {
'read': { 'type': Number, 'default': 0 }, 'read': { 'type': Number, 'default': 0 },

66
onetime.js Normal file
View File

@ -0,0 +1,66 @@
/**
* Created by WebStorm.
* User: martin
* Date: 16/04/2020
* Time: 23:35
*/
const CronJob = require('cron').CronJob;
const IndeedScraper = require('./scrapers/indeed');
const TotaljobsScraper = require('./scrapers/totaljobs');
const CwjobsScraper = require('./scrapers/cwjobs');
const JobserveScraper = require('./scrapers/rss.jobserve');
const RssS1Jobs = require('./scrapers/rss.s1jobs');
const RssTechnojobs = require('./scrapers/rss.technojobs');
/**
 * One-off run: executes every configured scrape exactly once, strictly in
 * sequence, in the order listed below.
 *
 * Each target is isolated: if a scraper's go() rejects, the failure is logged
 * and the run carries on with the next target instead of abandoning the rest.
 * Anything unexpected outside of a go() call (e.g. a constructor throwing) is
 * caught by the trailing handler so the promise is never left floating.
 */
(async function () {
  console.log('Started..');

  const indeedScraper = new IndeedScraper();
  const totaljobsScraper = new TotaljobsScraper();
  const cwjobsScraper = new CwjobsScraper();
  const jobserveScraper = new JobserveScraper();
  const s1jobsScraper = new RssS1Jobs();
  const technojobsScraper = new RssTechnojobs();

  // Site scrapers are run once per location, in this scraper order.
  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];
  const siteScrapers = [indeedScraper, totaljobsScraper, cwjobsScraper];

  const jobserveFeeds = [
    'https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss',
    'https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss',
    'https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss',
    'https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss',
    'https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss',
    'https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss',
    'https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss',
    'https://www.jobserve.com/MySearch/C7B25D86D0844A.rss',
    'https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss',
    'https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss',
    'https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss',
    'https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss',
    'https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss', // javascript node 2 Jul 2020
    'https://www.jobserve.com/MySearch/4C67595E323E3453.rss', // vuejs 2 Jul 2020
    'https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss' // svelte 2 Jul 2020
  ];

  const s1jobsFeeds = [
    'http://www.s1jobs.com/xml/m7dp711z2r.xml',
    'http://www.s1jobs.com/xml/pfvf7o7z2r.xml',
    'http://www.s1jobs.com/xml/lluqnt8z2r.xml',
    'http://www.s1jobs.com/xml/tu33qt8z2r.xml',
    'http://www.s1jobs.com/xml/u3btnz8z2r.xml',
    'http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml',
    'http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml'
  ];

  const technojobsFeeds = [
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1',
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1',
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1'
  ];

  // Runs a single scrape and logs (rather than propagates) a failure, so one
  // bad target cannot stop the targets that follow it.
  const runOne = async (scraper, target) => {
    try {
      await scraper.go(target);
    }
    catch (err) {
      console.error('onetime: scrape failed for', target, err);
    }
  };

  for (const location of locations) {
    for (const scraper of siteScrapers) {
      await runOne(scraper, location);
    }
  }

  const feedRuns = [
    [jobserveScraper, jobserveFeeds],
    [s1jobsScraper, s1jobsFeeds],
    [technojobsScraper, technojobsFeeds]
  ];

  for (const [scraper, feeds] of feedRuns) {
    for (const feed of feeds) {
      await runOne(scraper, feed);
    }
  }
})().catch((err) => {
  console.error('onetime', err);
  process.exitCode = 1;
});

1411
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -9,12 +9,13 @@
"author": "", "author": "",
"license": "ISC", "license": "ISC",
"dependencies": { "dependencies": {
"@rakh/utils": "file:../utils", "@rakh/utils": "^1.0.0",
"axios": "^0.19.2", "axios": "^0.19.2",
"bayes": "^1.0.0", "bayes": "^1.0.0",
"body-parser": "^1.19.0", "body-parser": "^1.19.0",
"cheerio": "^1.0.0-rc.3", "cheerio": "^1.0.0-rc.3",
"cron": "^1.8.2", "cron": "^1.8.2",
"crypto-js": "^4.0.0",
"dotenv": "^8.2.0", "dotenv": "^8.2.0",
"eslint": "^6.8.0", "eslint": "^6.8.0",
"express": "^4.17.1", "express": "^4.17.1",

View File

@ -139,7 +139,9 @@ class IndeedScraper extends MasterScraper {
async go(location = 'london') { async go(location = 'london') {
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`); this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
await this.processSite(); await this.processSite().catch((err) => {
console.error('Indeed Go', err);
});
console.log(`Indeed ${location} completed`); console.log(`Indeed ${location} completed`);
} }

View File

@ -35,7 +35,7 @@ function reduceRecord(record) {
exports.getList = (req, res) => { exports.getList = (req, res) => {
console.log('>getList req', req.params); console.log('>getList req', req.params);
Jobs.find( {}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).then((doc) => { Jobs.find({}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).sort( { 'data.timestamp': -1 } ).then((doc) => {
if (doc) { if (doc) {
res.send(reduceList(doc)); res.send(reduceList(doc));

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -41,7 +41,9 @@ test.skip('Test Indeed scraper', async t => {
}); });
test.skip('Test full run Indeed scraper', async t => { test.skip('Test full run Indeed scraper', async t => {
await indeedScraper.go('london'); await indeedScraper.go('london').catch((err) => {
console.error('Indeed GO', err);
});
t.end(); t.end();
}); });

File diff suppressed because one or more lines are too long