JOBSCRAPER-1 Implement a bit of AI
* Moved to mongo * UI updated to use mongo * UI is a bit fancier now * Import sql to mongo
This commit is contained in:
parent
1938bbeb5f
commit
f2880b661e
1
.gitignore
vendored
1
.gitignore
vendored
@ -147,3 +147,4 @@ fabric.properties
|
||||
/live/
|
||||
!/output/
|
||||
/db/jobs.db
|
||||
!/db/
|
||||
|
7179
biglist.json
7179
biglist.json
File diff suppressed because one or more lines are too long
14
lib/base.js
14
lib/base.js
@ -10,6 +10,8 @@ const filterAccept = require('../lib/filter_md_jobs');
|
||||
const dbmanager = require('../lib/dbmanager');
|
||||
const JobsModel = require('../lib/mongoManager');
|
||||
|
||||
const SHA = require('crypto-js/sha256');
|
||||
|
||||
const { Utils } = require('@rakh/utils');
|
||||
const { Corpus } = require('./corpus');
|
||||
|
||||
@ -75,7 +77,7 @@ class MasterBase {
|
||||
newJob.save().then((m) => {
|
||||
console.log('m', m.details.title);
|
||||
}).catch((err) => {
|
||||
console.error(err);
|
||||
console.error('m', err);
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -112,10 +114,20 @@ class MasterBase {
|
||||
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
|
||||
*/
|
||||
reduceData(d) {
|
||||
const clearPremium = /(\n+)(Featured|Premium)/gi;
|
||||
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
|
||||
|
||||
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
|
||||
|
||||
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
|
||||
|
||||
outObj.details.title = outObj.details.title.replace(clearPremium, '');
|
||||
outObj.details.title = outObj.details.title.replace(otherStupid, '');
|
||||
outObj.details.hashed = SHA(outObj.details.summary);
|
||||
|
||||
outObj.data.read = 0;
|
||||
outObj.data.applied = d.applied || 0;
|
||||
|
||||
outObj.data.jobtype = this.analyseRate(d.salary);
|
||||
outObj.data.autoclass = Corpus.process(d.summary);
|
||||
|
||||
|
@ -18,7 +18,7 @@ const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
|
||||
const deSpace = /\s+/g;
|
||||
|
||||
/**
 * Normalise a piece of free text before it is handed to the classifier.
 *
 * The text is passed, in order, through `emailRegex`, `detagRegex`,
 * `desymbolNumberRegex` and `deSpace`; every match is replaced by a single
 * space. The result is then trimmed and lower-cased.
 *
 * @param {*} intext - Text to clean. `null`, `undefined` or a missing
 *   argument yield an empty string; any other non-string value is coerced
 *   with `String()` instead of throwing on `.replace`.
 * @returns {string} The cleaned, lower-case text ('' when there is no input).
 */
function cleanText(intext) {
  // `== null` matches both null and undefined, which also covers a call made
  // with no arguments at all.
  if (intext == null) return '';

  // Scraped fields are occasionally numbers; coerce rather than crash.
  const text = typeof intext === 'string' ? intext : String(intext);

  return text
    .replace(emailRegex, ' ')
    .replace(detagRegex, ' ')
    .replace(desymbolNumberRegex, ' ')
    .replace(deSpace, ' ')
    .trim()
    .toLowerCase();
}
|
||||
|
@ -18,9 +18,15 @@ require('dotenv').config();
|
||||
|
||||
logger.level = 'debug';

// The full connection string (credentials included) is read from the
// environment, which dotenv has populated above, so that no user name or
// password is kept in the source tree.
// NOTE(review): credentials that were previously committed in this file
// should be treated as leaked and rotated.
const mongoConnect = process.env.MONGOCONNECT;

if (!mongoConnect) {
  logger.error('MONGOCONNECT is not set');
}
else {
  // Mask the `user:password@` part so the secret never reaches the logs.
  logger.debug(mongoConnect.replace(/\/\/[^@/]*@/, '//***@'));
}

mongoose.connect(mongoConnect);

const mDB = mongoose.connection;

// Surface connection-level errors on stderr.
mDB.on('error', console.error.bind(console, 'connection error:'));
|
||||
|
@ -59,23 +59,28 @@ fetch(url, {
|
||||
resolve(response.body);
|
||||
})
|
||||
.catch((e) => {
|
||||
console.error('getContent', e );
|
||||
reject(e.response.body);
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async savePage(html) {
|
||||
const now = fecha.format(new Date(), 'YYYY-MM-DD--hh');
|
||||
|
||||
const filename = `pages/${this.siteid}-${now}.html`;
|
||||
|
||||
fs.writeFileSync(filename, html);
|
||||
}
|
||||
|
||||
async getPage() {
|
||||
console.log('>> getPage: fetching', this.url);
|
||||
const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss');
|
||||
const filename = `${this.siteid}-${now}.html`;
|
||||
|
||||
await this.getContent(this.url, this.useStone)
|
||||
.then((html) => {
|
||||
fs.writeFileSync(filename, html);
|
||||
// console.log('>> getPage:: got', html);
|
||||
console.log('>> getPage:: OK');
|
||||
if (this.saveFile) this.savePage(html);
|
||||
const $ = cheerio.load(html);
|
||||
this.loadPage($);
|
||||
})
|
||||
|
41
migrate.js
41
migrate.js
@ -12,7 +12,12 @@ const { Utils } = require('@rakh/utils');
|
||||
|
||||
const { Corpus } = require('./lib/corpus');
|
||||
|
||||
const SHA = require('crypto-js/sha256');
|
||||
|
||||
/*
|
||||
|
||||
2604
|
||||
|
||||
const mongoose = require('mongoose');
|
||||
const log4js = require('log4js');
|
||||
const logger = log4js.getLogger();
|
||||
@ -49,6 +54,7 @@ const migrate = (function() {
|
||||
else if ((item > 100) && (item < 5000)) outVal = 1;
|
||||
else if (item >= 5000) outVal = 2;
|
||||
}
|
||||
else return 0;
|
||||
|
||||
return outVal;
|
||||
}
|
||||
@ -62,6 +68,7 @@ const migrate = (function() {
|
||||
|
||||
outObj.details.title = outObj.details.title.replace(clearPremium, '');
|
||||
outObj.details.title = outObj.details.title.replace(otherStupid, '');
|
||||
outObj.details.hashed = SHA(outObj.details.summary);
|
||||
|
||||
// outObj.data.read = d.read || 0;
|
||||
outObj.data.read = 0;
|
||||
@ -94,8 +101,8 @@ const migrate = (function() {
|
||||
});
|
||||
}
|
||||
|
||||
function start() {
|
||||
getCurrent().then((d) => {
|
||||
async function start() {
|
||||
await getCurrent().then(async (d) => {
|
||||
logger.debug(d.length);
|
||||
|
||||
for (let t = 0;t < (d.length - 1);t++) {
|
||||
@ -105,10 +112,10 @@ const migrate = (function() {
|
||||
|
||||
const newJob = Jobs(newD);
|
||||
|
||||
newJob.save().then((m) => {
|
||||
await newJob.save().then((m) => {
|
||||
logger.debug('m', m.details.title);
|
||||
}).catch((err) => {
|
||||
logger.error(err);
|
||||
logger.error(err.keyPattern);
|
||||
});
|
||||
}
|
||||
}).then(() => {
|
||||
@ -116,14 +123,34 @@ const migrate = (function() {
|
||||
Corpus.exportUnused();
|
||||
})
|
||||
.catch((err) => {
|
||||
logger.error(err.keyPattern);
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Delete jobs whose `data.timestamp` is more than 14 days in the past and
 * whose `data.applied` is 0.
 *
 * The returned promise settles only once the delete has finished, so callers
 * that `await` it really do wait for the database. Failures are logged and
 * not re-thrown (best-effort clean-up).
 *
 * NOTE(review): reduceData copies `timestamp` into `details`; confirm that
 * stored documents actually carry `data.timestamp`, otherwise this filter
 * matches nothing. Timestamps are assumed to be epoch milliseconds.
 *
 * @returns {Promise<void>}
 */
async function deleteOld() {
  const oneDay = 86400000; // milliseconds in 24 hours
  const twoWeeksAgo = Date.now() - (14 * oneDay);

  logger.debug('Delete older than: ', new Date(twoWeeksAgo), twoWeeksAgo);
  logger.debug({ 'data.timestamp': { '$lt': twoWeeksAgo } });

  try {
    // Awaited on purpose: without it this function resolved immediately and
    // the caller carried on while the delete was still in flight.
    const m = await Jobs.deleteMany({ 'data.timestamp': { '$lt': twoWeeksAgo }, 'data.applied': 0 });

    logger.debug('m', m);
  }
  catch (err) {
    logger.error(err);
  }
}
|
||||
|
||||
// newJob.find({ 'data': { 'timestamp': { '$lt': 1587034346000 } } });
|
||||
|
||||
return {
|
||||
'start':start
|
||||
'start':start,
|
||||
'deleteOld': deleteOld
|
||||
};
|
||||
})();
|
||||
|
||||
migrate.start();
|
||||
logger.info('Done??');
|
||||
(async function() {
|
||||
await migrate.start();
|
||||
await migrate.deleteOld();
|
||||
logger.info('Done??');
|
||||
})();
|
||||
|
@ -20,7 +20,8 @@ const jobSchema = new Schema({
|
||||
'postdate': String,
|
||||
'salary': String,
|
||||
'easyapply': Number,
|
||||
'timestamp': Number
|
||||
'timestamp': Number,
|
||||
'hashed' : { 'type': String, 'required':true, 'unique':true }
|
||||
},
|
||||
'data': {
|
||||
'read': { 'type': Number, 'default': 0 },
|
||||
|
66
onetime.js
Normal file
66
onetime.js
Normal file
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 16/04/2020
|
||||
* Time: 23:35
|
||||
|
||||
*/
|
||||
const CronJob = require('cron').CronJob;
|
||||
const IndeedScraper = require('./scrapers/indeed');
|
||||
const TotaljobsScraper = require('./scrapers/totaljobs');
|
||||
const CwjobsScraper = require('./scrapers/cwjobs');
|
||||
const JobserveScraper = require('./scrapers/rss.jobserve');
|
||||
const RssS1Jobs = require('./scrapers/rss.s1jobs');
|
||||
const RssTechnojobs = require('./scrapers/rss.technojobs');
|
||||
|
||||
/**
 * One-off, sequential run of every configured scrape.
 *
 * Order is unchanged: for each location the three site scrapers
 * (Indeed, Totaljobs, CWJobs) run in turn, followed by the Jobserve,
 * s1jobs and Technojobs RSS feeds. Each scrape is isolated: a failure is
 * logged and the run carries on with the next target. The process exit code
 * is set to 1 when anything failed.
 */
(async function () {
  console.log('Started..');

  const indeedScraper = new IndeedScraper();
  const totaljobsScraper = new TotaljobsScraper();
  const cwjobsScraper = new CwjobsScraper();
  const jobserveScraper = new JobserveScraper();
  const s1jobsScraper = new RssS1Jobs();
  const technojobsScraper = new RssTechnojobs();

  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];
  const siteScrapers = [indeedScraper, totaljobsScraper, cwjobsScraper];

  const jobserveFeeds = [
    'https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss',
    'https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss',
    'https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss',
    'https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss',
    'https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss',
    'https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss',
    'https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss',
    'https://www.jobserve.com/MySearch/C7B25D86D0844A.rss',
    'https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss',
    'https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss',
    'https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss',
    'https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss',
    'https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss', // javascript node 2 Jul 2020
    'https://www.jobserve.com/MySearch/4C67595E323E3453.rss', // vuejs 2 Jul 2020
    'https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss' // svelte 2 Jul 2020
  ];

  const s1jobsFeeds = [
    'http://www.s1jobs.com/xml/m7dp711z2r.xml',
    'http://www.s1jobs.com/xml/pfvf7o7z2r.xml',
    'http://www.s1jobs.com/xml/lluqnt8z2r.xml',
    'http://www.s1jobs.com/xml/tu33qt8z2r.xml',
    'http://www.s1jobs.com/xml/u3btnz8z2r.xml',
    'http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml',
    'http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml'
  ];

  // The three Technojobs feeds differ only in the (already URL-encoded)
  // location segment.
  const technojobsFeeds = ['glasgow', 'LONDON', 'Milton%20Keynes'].map((place) => `https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/location${place}/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1`);

  // Flatten everything into one ordered list of [scraper, target] pairs.
  const runs = [];

  for (const location of locations) {
    for (const scraper of siteScrapers) runs.push([scraper, location]);
  }
  for (const feed of jobserveFeeds) runs.push([jobserveScraper, feed]);
  for (const feed of s1jobsFeeds) runs.push([s1jobsScraper, feed]);
  for (const feed of technojobsFeeds) runs.push([technojobsScraper, feed]);

  let failures = 0;

  // Strictly sequential, as before; only the failure handling is new.
  for (const [scraper, target] of runs) {
    try {
      await scraper.go(target);
    }
    catch (err) {
      failures += 1;
      console.error(`${scraper.constructor.name} failed for ${target}`, err);
    }
  }

  if (failures > 0) process.exitCode = 1;
})().catch((err) => {
  // Anything thrown outside the per-scrape guard (e.g. while constructing a
  // scraper) ends up here instead of becoming an unhandled rejection.
  console.error('onetime run failed', err);
  process.exitCode = 1;
});
|
1353
package-lock.json
generated
1353
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -9,12 +9,13 @@
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@rakh/utils": "file:../utils",
|
||||
"@rakh/utils": "^1.0.0",
|
||||
"axios": "^0.19.2",
|
||||
"bayes": "^1.0.0",
|
||||
"body-parser": "^1.19.0",
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"cron": "^1.8.2",
|
||||
"crypto-js": "^4.0.0",
|
||||
"dotenv": "^8.2.0",
|
||||
"eslint": "^6.8.0",
|
||||
"express": "^4.17.1",
|
||||
|
@ -139,7 +139,9 @@ class IndeedScraper extends MasterScraper {
|
||||
async go(location = 'london') {
|
||||
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
|
||||
|
||||
await this.processSite();
|
||||
await this.processSite().catch((err) => {
|
||||
console.error('Indeed Go', err);
|
||||
});
|
||||
|
||||
console.log(`Indeed ${location} completed`);
|
||||
}
|
||||
|
@ -35,7 +35,7 @@ function reduceRecord(record) {
|
||||
exports.getList = (req, res) => {
|
||||
console.log('>getList req', req.params);
|
||||
|
||||
Jobs.find( {}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).then((doc) => {
|
||||
Jobs.find({}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).sort( { 'data.timestamp': -1 } ).then((doc) => {
|
||||
if (doc) {
|
||||
|
||||
res.send(reduceList(doc));
|
||||
|
2
server/dist/build/bundle.css
vendored
2
server/dist/build/bundle.css
vendored
File diff suppressed because one or more lines are too long
6
server/dist/build/bundle.css.map
vendored
6
server/dist/build/bundle.css.map
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js
vendored
2
server/dist/build/bundle.js
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js.map
vendored
2
server/dist/build/bundle.js.map
vendored
File diff suppressed because one or more lines are too long
@ -41,7 +41,9 @@ test.skip('Test Indeed scraper', async t => {
|
||||
});
|
||||
|
||||
test.skip('Test full run Indeed scraper', async t => {
|
||||
await indeedScraper.go('london');
|
||||
await indeedScraper.go('london').catch((err) => {
|
||||
console.error('Indeed GO', err);
|
||||
});
|
||||
|
||||
t.end();
|
||||
});
|
||||
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user