Merge branch 'JOBSCRAPER-1' into 'development'
Resolve JOBSCRAPER-1 See merge request martind2000/jobscraper!1
This commit is contained in:
commit
1513ea5010
32
.edditorconfig
Normal file
32
.edditorconfig
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
; http://editorconfig.org
|
||||||
|
|
||||||
|
root = true
|
||||||
|
|
||||||
|
[*]
|
||||||
|
charset = utf-8
|
||||||
|
end_of_line = lf
|
||||||
|
insert_final_newline = true
|
||||||
|
trim_trailing_whitespace = true
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
[*.txt]
|
||||||
|
insert_final_newline = false
|
||||||
|
trim_trailing_whitespace = false
|
||||||
|
|
||||||
|
[*.py]
|
||||||
|
indent_size = 4
|
||||||
|
|
||||||
|
[*.m]
|
||||||
|
indent_size = 4
|
||||||
|
|
||||||
|
[Makefile]
|
||||||
|
indent_style = tab
|
||||||
|
indent_size = 8
|
||||||
|
|
||||||
|
[*.{js,json}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
[*.md]
|
||||||
|
trim_trailing_whitespace = false
|
@ -9,7 +9,7 @@
|
|||||||
"env": {
|
"env": {
|
||||||
"browser": true,
|
"browser": true,
|
||||||
"node": true,
|
"node": true,
|
||||||
"es6": true
|
"es2017": true
|
||||||
},
|
},
|
||||||
"rules": {
|
"rules": {
|
||||||
"arrow-spacing": "error",
|
"arrow-spacing": "error",
|
||||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -147,3 +147,4 @@ fabric.properties
|
|||||||
/live/
|
/live/
|
||||||
!/output/
|
!/output/
|
||||||
/db/jobs.db
|
/db/jobs.db
|
||||||
|
!/db/
|
||||||
|
1
biglist.json
Normal file
1
biglist.json
Normal file
File diff suppressed because one or more lines are too long
204
brain.json
Normal file
204
brain.json
Normal file
@ -0,0 +1,204 @@
|
|||||||
|
{
|
||||||
|
"categories": {
|
||||||
|
"good": true,
|
||||||
|
"bad": true
|
||||||
|
},
|
||||||
|
"docCount": {
|
||||||
|
"good": 43,
|
||||||
|
"bad": 5
|
||||||
|
},
|
||||||
|
"totalDocuments": 48,
|
||||||
|
"vocabulary": {
|
||||||
|
"tsql": true,
|
||||||
|
"developer": true,
|
||||||
|
"contract": true,
|
||||||
|
"web": true,
|
||||||
|
"javascript": true,
|
||||||
|
"js": true,
|
||||||
|
"node": true,
|
||||||
|
"es": true,
|
||||||
|
"agile": true,
|
||||||
|
"nodejs": true,
|
||||||
|
"london": true,
|
||||||
|
"aws": true,
|
||||||
|
"sql": true,
|
||||||
|
"postgresql": true,
|
||||||
|
"mysql": true,
|
||||||
|
"docker": true,
|
||||||
|
"ecs": true,
|
||||||
|
"automation": true,
|
||||||
|
"jslint": true,
|
||||||
|
"jshint": true,
|
||||||
|
"vuejs": true,
|
||||||
|
"vue": true,
|
||||||
|
"nginx": true,
|
||||||
|
"remotely": true,
|
||||||
|
"mvc": true,
|
||||||
|
"remote": true,
|
||||||
|
"iot": true,
|
||||||
|
"mqtt": true,
|
||||||
|
"es6": true,
|
||||||
|
"es2016": true,
|
||||||
|
"es2017": true,
|
||||||
|
"es2018": true,
|
||||||
|
"react": true,
|
||||||
|
"redux": true,
|
||||||
|
"graphql": true,
|
||||||
|
"java": true,
|
||||||
|
"reactjs": true,
|
||||||
|
"apps": true,
|
||||||
|
"html": true,
|
||||||
|
"css": true,
|
||||||
|
"code": true,
|
||||||
|
"angular": true,
|
||||||
|
"ember": true,
|
||||||
|
"restful": true,
|
||||||
|
"apis": true,
|
||||||
|
"infrastructure": true,
|
||||||
|
"software": true,
|
||||||
|
"native": true,
|
||||||
|
"med": true,
|
||||||
|
"mobile": true,
|
||||||
|
"client": true,
|
||||||
|
"applications": true,
|
||||||
|
"digital": true,
|
||||||
|
"analytics": true,
|
||||||
|
"dashboarding": true,
|
||||||
|
"online": true,
|
||||||
|
"analyse": true,
|
||||||
|
"dashboards": true,
|
||||||
|
"google": true,
|
||||||
|
"query": true,
|
||||||
|
"data": true,
|
||||||
|
"stakeholders": true,
|
||||||
|
"enhancements": true,
|
||||||
|
"requirements": true,
|
||||||
|
"c": true,
|
||||||
|
"net": true,
|
||||||
|
"technologies": true,
|
||||||
|
"azure": true,
|
||||||
|
"understanding": true,
|
||||||
|
"devops": true,
|
||||||
|
"tools": true,
|
||||||
|
"frameworks": true,
|
||||||
|
"scotland": true,
|
||||||
|
"responsibility": true,
|
||||||
|
"programme": true,
|
||||||
|
"functions": true,
|
||||||
|
"asp": true,
|
||||||
|
"project": true,
|
||||||
|
"transform": true,
|
||||||
|
"collaborative": true,
|
||||||
|
"technical": true,
|
||||||
|
"framework": true,
|
||||||
|
"nhibernate": true,
|
||||||
|
"server": true,
|
||||||
|
"api": true,
|
||||||
|
"development": true,
|
||||||
|
"lifecycle": true,
|
||||||
|
"specification": true,
|
||||||
|
"appointments": true
|
||||||
|
},
|
||||||
|
"vocabularySize": 89,
|
||||||
|
"wordCount": {
|
||||||
|
"good": 157,
|
||||||
|
"bad": 5
|
||||||
|
},
|
||||||
|
"wordFrequencyCount": {
|
||||||
|
"good": {
|
||||||
|
"tsql": 1,
|
||||||
|
"developer": 6,
|
||||||
|
"contract": 9,
|
||||||
|
"web": 6,
|
||||||
|
"javascript": 7,
|
||||||
|
"js": 3,
|
||||||
|
"node": 2,
|
||||||
|
"es": 1,
|
||||||
|
"agile": 2,
|
||||||
|
"nodejs": 1,
|
||||||
|
"london": 3,
|
||||||
|
"aws": 3,
|
||||||
|
"sql": 3,
|
||||||
|
"postgresql": 1,
|
||||||
|
"mysql": 1,
|
||||||
|
"docker": 1,
|
||||||
|
"ecs": 1,
|
||||||
|
"automation": 1,
|
||||||
|
"jslint": 1,
|
||||||
|
"jshint": 1,
|
||||||
|
"vuejs": 1,
|
||||||
|
"vue": 2,
|
||||||
|
"nginx": 1,
|
||||||
|
"remotely": 1,
|
||||||
|
"mvc": 5,
|
||||||
|
"remote": 2,
|
||||||
|
"iot": 1,
|
||||||
|
"mqtt": 1,
|
||||||
|
"es6": 1,
|
||||||
|
"es2016": 1,
|
||||||
|
"es2017": 1,
|
||||||
|
"es2018": 1,
|
||||||
|
"apps": 1,
|
||||||
|
"html": 5,
|
||||||
|
"css": 5,
|
||||||
|
"code": 2,
|
||||||
|
"react": 2,
|
||||||
|
"angular": 1,
|
||||||
|
"ember": 1,
|
||||||
|
"restful": 1,
|
||||||
|
"apis": 1,
|
||||||
|
"infrastructure": 1,
|
||||||
|
"software": 2,
|
||||||
|
"native": 1,
|
||||||
|
"med": 1,
|
||||||
|
"mobile": 1,
|
||||||
|
"client": 4,
|
||||||
|
"applications": 2,
|
||||||
|
"digital": 2,
|
||||||
|
"analytics": 1,
|
||||||
|
"dashboarding": 1,
|
||||||
|
"online": 1,
|
||||||
|
"analyse": 1,
|
||||||
|
"dashboards": 1,
|
||||||
|
"google": 1,
|
||||||
|
"query": 1,
|
||||||
|
"data": 1,
|
||||||
|
"stakeholders": 1,
|
||||||
|
"enhancements": 3,
|
||||||
|
"requirements": 3,
|
||||||
|
"c": 4,
|
||||||
|
"net": 5,
|
||||||
|
"technologies": 4,
|
||||||
|
"azure": 2,
|
||||||
|
"understanding": 1,
|
||||||
|
"devops": 2,
|
||||||
|
"tools": 1,
|
||||||
|
"frameworks": 1,
|
||||||
|
"scotland": 1,
|
||||||
|
"responsibility": 1,
|
||||||
|
"programme": 1,
|
||||||
|
"functions": 1,
|
||||||
|
"asp": 1,
|
||||||
|
"project": 1,
|
||||||
|
"transform": 1,
|
||||||
|
"collaborative": 1,
|
||||||
|
"technical": 1,
|
||||||
|
"framework": 1,
|
||||||
|
"nhibernate": 1,
|
||||||
|
"server": 1,
|
||||||
|
"api": 1,
|
||||||
|
"development": 1,
|
||||||
|
"lifecycle": 1,
|
||||||
|
"specification": 1,
|
||||||
|
"appointments": 1
|
||||||
|
},
|
||||||
|
"bad": {
|
||||||
|
"react": 1,
|
||||||
|
"redux": 1,
|
||||||
|
"graphql": 1,
|
||||||
|
"java": 1,
|
||||||
|
"reactjs": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"options": {}
|
||||||
|
}
|
BIN
db/jobs.db
BIN
db/jobs.db
Binary file not shown.
84
lib/base.js
84
lib/base.js
@ -8,6 +8,12 @@
|
|||||||
const filterReject = require('../lib/filter_reject');
|
const filterReject = require('../lib/filter_reject');
|
||||||
const filterAccept = require('../lib/filter_md_jobs');
|
const filterAccept = require('../lib/filter_md_jobs');
|
||||||
const dbmanager = require('../lib/dbmanager');
|
const dbmanager = require('../lib/dbmanager');
|
||||||
|
const JobsModel = require('../lib/mongoManager');
|
||||||
|
|
||||||
|
const SHA = require('crypto-js/sha256');
|
||||||
|
|
||||||
|
const { Utils } = require('@rakh/utils');
|
||||||
|
const { Corpus } = require('./corpus');
|
||||||
|
|
||||||
class MasterBase {
|
class MasterBase {
|
||||||
|
|
||||||
@ -57,6 +63,79 @@ class MasterBase {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
addToMongo() {
|
||||||
|
console.log('>> ADD TO MONGO!');
|
||||||
|
|
||||||
|
for(const item of this.items) {
|
||||||
|
// console.log('add', item);
|
||||||
|
const newObj = this.reduceData(item);
|
||||||
|
const newJob = new JobsModel(newObj);
|
||||||
|
|
||||||
|
newJob.save().then((m) => {
|
||||||
|
console.log('m', m.details.title);
|
||||||
|
}).catch((err) => {
|
||||||
|
console.error('m', err);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param inval
|
||||||
|
* @returns {number}
|
||||||
|
*/
|
||||||
|
analyseRate(inval) {
|
||||||
|
console.log('analyseRate', inval);
|
||||||
|
let outVal = 0;
|
||||||
|
const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
|
||||||
|
const clearSpace = /\s+/g;
|
||||||
|
|
||||||
|
const result = inval.replace(cleanerReg, '').replace(clearSpace, ' ');
|
||||||
|
const resultArray = result.trim().split((' '));
|
||||||
|
|
||||||
|
if (resultArray.length > 0) {
|
||||||
|
const item = parseInt(resultArray[0], 10);
|
||||||
|
|
||||||
|
if (item < 100) outVal = 0;
|
||||||
|
else if ((item > 100) && (item < 5000)) outVal = 1;
|
||||||
|
else if (item >= 5000) outVal = 2;
|
||||||
|
}
|
||||||
|
else return 0;
|
||||||
|
|
||||||
|
return outVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param d
|
||||||
|
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
|
||||||
|
*/
|
||||||
|
reduceData(d) {
|
||||||
|
const clearPremium = /(\n+)(Featured|Premium)/gi;
|
||||||
|
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
|
||||||
|
|
||||||
|
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
|
||||||
|
|
||||||
|
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
|
||||||
|
|
||||||
|
outObj.details.title = outObj.details.title.replace(clearPremium, '');
|
||||||
|
outObj.details.title = outObj.details.title.replace(otherStupid, '');
|
||||||
|
outObj.details.hashed = SHA(outObj.details.summary);
|
||||||
|
|
||||||
|
outObj.data.read = 0;
|
||||||
|
outObj.data.applied = d.applied || 0;
|
||||||
|
|
||||||
|
outObj.data.jobtype = this.analyseRate(d.salary);
|
||||||
|
outObj.data.autoclass = Corpus.process(d.summary);
|
||||||
|
|
||||||
|
outObj.data.timestamp = d.timestamp * 1000;
|
||||||
|
|
||||||
|
return outObj;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @returns {Promise<void>}
|
* @returns {Promise<void>}
|
||||||
@ -120,10 +199,15 @@ class MasterBase {
|
|||||||
return `https://image.silvrtree.co.uk/q${q}/${url}`;
|
return `https://image.silvrtree.co.uk/q${q}/${url}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async go() {
|
async go() {
|
||||||
this.items = [];
|
this.items = [];
|
||||||
this.rawItems = [];
|
this.rawItems = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = MasterBase;
|
module.exports = MasterBase;
|
||||||
|
90
lib/corpus.js
Normal file
90
lib/corpus.js
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
const jsonfile = require('jsonfile');
|
||||||
|
|
||||||
|
const words = require('../lib/wordlist.json');
|
||||||
|
const wordsAdditional = require('../lib/wordlistAdditional.json');
|
||||||
|
|
||||||
|
// Per-word tally, incremented once for each processed text that contains the word.
const bigList = new Map();

// Words that add to a text's score.
const goodWords = [
  'tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es',
  'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker',
  'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely',
  'mvc', 'remote', 'iot', 'mqtt'
];

// Words that subtract from a text's score.
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];

// Words seen so far that are in neither list.
let unrated = [];

// The Corpus object is shared through the global object, and reused if one
// is already registered there. `var` is kept on purpose so the declarations
// behave exactly as before.
var _global = (typeof global !== 'undefined') ? global : window;
var Corpus = _global.Corpus || {};

_global.Corpus = Corpus;
|
||||||
|
|
||||||
|
// E-mail addresses. Global and case-insensitive so that every address is
// removed, whatever its capitalisation (the text is only lower-cased at the
// very end of cleanText).
const emailRegex = /[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/gi;
// <script> and <style> elements including their content, HTML comments, and any other tag.
const detagRegex = /(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/gi;
// Newlines, tabs, the listed punctuation/currency symbols, and runs of digits.
const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
const deSpace = /\s+/g;

/**
 * Reduce raw advert text to lower-case words separated by single spaces.
 *
 * E-mail addresses, HTML markup, the symbols matched by
 * `desymbolNumberRegex` and digits are each replaced by a space before
 * whitespace is collapsed and the result is trimmed.
 *
 * @param {*} intext text to clean; non-string values are converted with String()
 * @returns {string} the cleaned text, or '' when intext is missing or null
 */
function cleanText(intext) {
  if (intext === undefined || intext === null) return '';

  return String(intext)
    .replace(emailRegex, ' ')
    .replace(detagRegex, ' ')
    .replace(desymbolNumberRegex, ' ')
    .replace(deSpace, ' ')
    .trim()
    .toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Remove repeated values from an iterable, keeping first occurrences in order.
 *
 * @param intext any iterable (array, string, ...)
 * @returns {Array} the distinct values; an empty array when intext is
 *   missing, undefined or null
 */
function dedupe(intext) {
  const nothingToDo = intext === undefined || intext === null;

  return nothingToDo ? [] : Array.from(new Set(intext));
}
|
||||||
|
|
||||||
|
/**
 * Add one to the tally kept for `item` in `bigList`, starting at 1 the first
 * time the item is seen.
 *
 * @param item the key to count
 */
function incItem(item) {
  const previous = bigList.has(item) ? bigList.get(item) : 0;

  bigList.set(item, previous + 1);
}
|
||||||
|
|
||||||
|
// Both ignore lists merged once at load time, so each membership test is a
// constant-time lookup instead of a linear scan over thousands of entries.
const ignoredWords = new Set([...words, ...wordsAdditional]);

/**
 * Score a piece of advert text against the good and bad word lists.
 *
 * The text is cleaned, split into distinct words and stripped of ignored
 * words. As side effects, every remaining word is tallied in `bigList` and
 * the words that are neither good nor bad are appended to `unrated`.
 *
 * @param intext the text to process
 * @returns {{good: string[], bad: string[], score: number, words: string[]}}
 *   the matching good and bad words, `score` = good count minus five times
 *   the bad count, and all remaining distinct words
 */
Corpus.process = function(intext) {
  const tokens = cleanText(intext).split(' ');

  // ''.split(' ') yields [''], so empty input would otherwise be counted as
  // containing one (empty) word.
  const cleanedArray = dedupe(tokens).filter((v) => v !== '' && !ignoredWords.has(v));

  const good = cleanedArray.filter((v) => goodWords.includes(v));
  const bad = cleanedArray.filter((v) => badWords.includes(v));
  const unused = cleanedArray.filter((v) => !goodWords.includes(v) && !badWords.includes(v));

  for (const item of cleanedArray) incItem(item);

  // Append in place rather than copying the ever-growing array on every call.
  unrated.push(...unused);

  const score = good.length - (bad.length * 5);

  // console.log('unused', unused);

  return { good, bad, score, 'words': cleanedArray };
};
|
||||||
|
|
||||||
|
/**
 * Dump the collected statistics to disk and to the console.
 *
 * Writes the distinct unrated words to ./unused.json and the word tallies,
 * highest count first, to ./biglist.json; then logs the tallies in insertion
 * order.
 */
Corpus.exportUnused = function() {
  const uniqueUnrated = dedupe(unrated);
  const tallies = Array.from(bigList);
  const talliesByCount = tallies.slice().sort(([, left], [, right]) => right - left);

  jsonfile.writeFileSync('./unused.json', uniqueUnrated);
  jsonfile.writeFileSync('./biglist.json', talliesByCount);

  console.log(tallies);
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if (typeof module !== 'undefined')
|
||||||
|
module.exports = {
|
||||||
|
'Corpus': Corpus
|
||||||
|
};
|
34
lib/mongoManager.js
Normal file
34
lib/mongoManager.js
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 22/07/2020
|
||||||
|
* Time: 17:00
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const mongoose = require('mongoose');
|
||||||
|
const log4js = require('log4js');
|
||||||
|
const logger = log4js.getLogger();
|
||||||
|
|
||||||
|
const JobsModel = require('../models/jobs');
|
||||||
|
|
||||||
|
const { Utils } = require('@rakh/utils');
|
||||||
|
|
||||||
|
require('dotenv').config();
|
||||||
|
|
||||||
|
logger.level = 'debug';

// Full MongoDB connection URI, supplied through the environment (.env).
const mongoConnect = process.env.MONGOCONNECT;

if (!mongoConnect)
  // Report the problem but do not throw: this module is required by the
  // scraper base class, which must keep working without MongoDB.
  logger.error('MONGOCONNECT is not set; MongoDB connection not started');
else {
  // Never write credentials to the log: mask the "user:password@" part.
  logger.debug(mongoConnect.replace(/\/\/[^@/]*@/, '//***@'));

  // The initial connection failure arrives through the returned promise;
  // handle it so it does not surface as an unhandled rejection.
  mongoose.connect(mongoConnect).catch((err) => {
    logger.error('mongo connection failed:', err);
  });
}

const mDB = mongoose.connection;

mDB.on('error', console.error.bind(console, 'connection error:'));

module.exports = JobsModel;
|
@ -89,6 +89,7 @@ class MasterRSS extends MasterBase {
|
|||||||
await this.filterAdverts();
|
await this.filterAdverts();
|
||||||
|
|
||||||
if (this.items.length > 0) await this.addToDB();
|
if (this.items.length > 0) await this.addToDB();
|
||||||
|
if (this.items.length > 0) await this.addToMongo();
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
console.log('No items to process');
|
console.log('No items to process');
|
||||||
|
@ -21,9 +21,14 @@ class MasterScraper extends MasterBase {
|
|||||||
super();
|
super();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param url
|
||||||
|
* @param useStone
|
||||||
|
* @returns {Promise<unknown>}
|
||||||
|
*/
|
||||||
getContent(url, useStone = false) {
|
getContent(url, useStone = false) {
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
let headers = new Headers({
|
let headers = new Headers({
|
||||||
"Accept" : "application/json",
|
"Accept" : "application/json",
|
||||||
@ -54,19 +59,28 @@ fetch(url, {
|
|||||||
resolve(response.body);
|
resolve(response.body);
|
||||||
})
|
})
|
||||||
.catch((e) => {
|
.catch((e) => {
|
||||||
|
console.error('getContent', e );
|
||||||
reject(e.response.body);
|
reject(e.response.body);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
async savePage(html) {
|
||||||
|
const now = fecha.format(new Date(), 'YYYY-MM-DD--hh');
|
||||||
|
|
||||||
|
const filename = `pages/${this.siteid}-${now}.html`;
|
||||||
|
|
||||||
|
fs.writeFileSync(filename, html);
|
||||||
|
}
|
||||||
|
|
||||||
async getPage() {
|
async getPage() {
|
||||||
console.log('>> getPage: fetching', this.url);
|
console.log('>> getPage: fetching', this.url);
|
||||||
const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss');
|
|
||||||
const filename = `${this.siteid}-${now}.html`;
|
|
||||||
|
|
||||||
await this.getContent(this.url, this.useStone)
|
await this.getContent(this.url, this.useStone)
|
||||||
.then((html) => {
|
.then((html) => {
|
||||||
fs.writeFileSync(filename, html);
|
// console.log('>> getPage:: got', html);
|
||||||
|
console.log('>> getPage:: OK');
|
||||||
|
if (this.saveFile) this.savePage(html);
|
||||||
const $ = cheerio.load(html);
|
const $ = cheerio.load(html);
|
||||||
this.loadPage($);
|
this.loadPage($);
|
||||||
})
|
})
|
||||||
@ -75,30 +89,59 @@ fetch(url, {
|
|||||||
|
|
||||||
// Site specific parts below here
|
// Site specific parts below here
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Break each page into items
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async breakPage() {
|
async breakPage() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param part
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async extractDetails(part) {
|
async extractDetails(part) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async checkNext() {
|
async checkNext() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async processSite() {
|
async processSite() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async getIndividualPage() {
|
async getIndividualPage() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async getJobPages() {
|
async getJobPages() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async go() {
|
async go() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
1007
lib/wordlist.json
Normal file
1007
lib/wordlist.json
Normal file
File diff suppressed because it is too large
Load Diff
8790
lib/wordlistAdditional.json
Normal file
8790
lib/wordlistAdditional.json
Normal file
File diff suppressed because it is too large
Load Diff
559
limited.json
Normal file
559
limited.json
Normal file
@ -0,0 +1,559 @@
|
|||||||
|
[
|
||||||
|
"experienced",
|
||||||
|
"exceptional",
|
||||||
|
"maintaining",
|
||||||
|
"familiarity",
|
||||||
|
"commodities",
|
||||||
|
"opportunity",
|
||||||
|
"possibility",
|
||||||
|
"integration",
|
||||||
|
"engineering",
|
||||||
|
"derivatives",
|
||||||
|
"prefferable",
|
||||||
|
"nutritional",
|
||||||
|
"performance",
|
||||||
|
"immediately",
|
||||||
|
"information",
|
||||||
|
"responsible",
|
||||||
|
"environment",
|
||||||
|
"stakeholder",
|
||||||
|
"proactively",
|
||||||
|
"requirement",
|
||||||
|
"temporarily",
|
||||||
|
"interrogate",
|
||||||
|
"effectively",
|
||||||
|
"progressing",
|
||||||
|
"substantial",
|
||||||
|
"identifying",
|
||||||
|
"maintenance",
|
||||||
|
"workarounds",
|
||||||
|
"departments",
|
||||||
|
"consultancy",
|
||||||
|
"regulations",
|
||||||
|
"statistical",
|
||||||
|
"previously·",
|
||||||
|
"euromonitor",
|
||||||
|
"documenting",
|
||||||
|
"bookkeeping",
|
||||||
|
"reconciling",
|
||||||
|
"hardworking",
|
||||||
|
"themselves!",
|
||||||
|
"appropriate",
|
||||||
|
"socialising",
|
||||||
|
"fundraising",
|
||||||
|
"initiatives",
|
||||||
|
"sponsorship",
|
||||||
|
"orientation",
|
||||||
|
"competitive",
|
||||||
|
"illustrator",
|
||||||
|
"outstanding",
|
||||||
|
"interaction",
|
||||||
|
"consistency",
|
||||||
|
"touchpoints",
|
||||||
|
"freshtechit",
|
||||||
|
"recruitment",
|
||||||
|
"catastrophe",
|
||||||
|
"accountable",
|
||||||
|
"workstreams",
|
||||||
|
"scalability",
|
||||||
|
"undertaking",
|
||||||
|
"interacting",
|
||||||
|
"significant",
|
||||||
|
"considering",
|
||||||
|
"independent",
|
||||||
|
"collaborate",
|
||||||
|
"arrangement",
|
||||||
|
"unsolicited",
|
||||||
|
"empowerment",
|
||||||
|
"connections",
|
||||||
|
"specialists",
|
||||||
|
"credentials",
|
||||||
|
"personality",
|
||||||
|
"established",
|
||||||
|
"northampton",
|
||||||
|
"advertising",
|
||||||
|
"operational",
|
||||||
|
"mathematics",
|
||||||
|
"contractors",
|
||||||
|
"instruments",
|
||||||
|
"referencing",
|
||||||
|
"locationsco",
|
||||||
|
"disciplines",
|
||||||
|
"corporation",
|
||||||
|
"investments",
|
||||||
|
"conferences",
|
||||||
|
"demonstrate",
|
||||||
|
"directorate",
|
||||||
|
"acknowledge",
|
||||||
|
"legislation",
|
||||||
|
"designgreat",
|
||||||
|
"understands",
|
||||||
|
"perspective",
|
||||||
|
"association",
|
||||||
|
"enforcement",
|
||||||
|
"prestigious",
|
||||||
|
"individuals",
|
||||||
|
"alternative",
|
||||||
|
"technically",
|
||||||
|
"challenging",
|
||||||
|
"discussions",
|
||||||
|
"lifeworking",
|
||||||
|
"interactive",
|
||||||
|
"storyboards",
|
||||||
|
"communicate",
|
||||||
|
"abilitywork",
|
||||||
|
"englishgood",
|
||||||
|
"detailbonus",
|
||||||
|
"angularwhat",
|
||||||
|
"neededabout",
|
||||||
|
"innovations",
|
||||||
|
"enthusiasts",
|
||||||
|
"instructors",
|
||||||
|
"prospective",
|
||||||
|
"comfortable",
|
||||||
|
"involvement",
|
||||||
|
"adventurous",
|
||||||
|
"marketplace",
|
||||||
|
"forecasting",
|
||||||
|
"contractual",
|
||||||
|
"underpinned",
|
||||||
|
"acquisition",
|
||||||
|
"microsoft’s",
|
||||||
|
"progression",
|
||||||
|
"suggestions",
|
||||||
|
"proficiency",
|
||||||
|
"participate",
|
||||||
|
"joblocation",
|
||||||
|
"methodology",
|
||||||
|
"continually",
|
||||||
|
"cataloguing",
|
||||||
|
"projectgood",
|
||||||
|
"incremental",
|
||||||
|
"overarching",
|
||||||
|
"confidently",
|
||||||
|
"circulatory",
|
||||||
|
"adjustments",
|
||||||
|
"interesting",
|
||||||
|
"consultants",
|
||||||
|
"experienceb",
|
||||||
|
"hourscasual",
|
||||||
|
"switzerland",
|
||||||
|
"contributes",
|
||||||
|
"participant",
|
||||||
|
"improvement",
|
||||||
|
"articulates",
|
||||||
|
"contributed",
|
||||||
|
"comfortably",
|
||||||
|
"deployments",
|
||||||
|
"integrating",
|
||||||
|
"configuring",
|
||||||
|
"platforming",
|
||||||
|
"educatedday",
|
||||||
|
"contracting",
|
||||||
|
"monthstotal",
|
||||||
|
"outsourcing",
|
||||||
|
"designswork",
|
||||||
|
"ideasdesign",
|
||||||
|
"deviceswork",
|
||||||
|
"fundamental",
|
||||||
|
"businessjob",
|
||||||
|
"implemented",
|
||||||
|
"transaction",
|
||||||
|
"reliability",
|
||||||
|
"upgradesyou",
|
||||||
|
"uncertainty",
|
||||||
|
"enterpriser",
|
||||||
|
"teamprovide",
|
||||||
|
"trafficking",
|
||||||
|
"doubleclick",
|
||||||
|
"communities",
|
||||||
|
"‘forestlink",
|
||||||
|
"dimensional",
|
||||||
|
"coordinator",
|
||||||
|
"spreadsheet",
|
||||||
|
"pressurised",
|
||||||
|
"assignments",
|
||||||
|
"willingness",
|
||||||
|
"certificate",
|
||||||
|
"summaryrole",
|
||||||
|
"institution",
|
||||||
|
"segregation",
|
||||||
|
"preparation",
|
||||||
|
"electronics",
|
||||||
|
"duplication",
|
||||||
|
"surrounding",
|
||||||
|
"informatica",
|
||||||
|
"blackfriars",
|
||||||
|
"terminology",
|
||||||
|
"shabarinath",
|
||||||
|
"interfacing",
|
||||||
|
"expectation",
|
||||||
|
"proprietary",
|
||||||
|
"conflicting",
|
||||||
|
"itecopeople",
|
||||||
|
"opowershell",
|
||||||
|
"submissions",
|
||||||
|
"negotiating",
|
||||||
|
"escalations",
|
||||||
|
"transferred",
|
||||||
|
"protections",
|
||||||
|
"customizing",
|
||||||
|
"oxfordshire",
|
||||||
|
"progressive",
|
||||||
|
"bishopsgate",
|
||||||
|
"partnership",
|
||||||
|
"futureheads",
|
||||||
|
"permissions",
|
||||||
|
"efficiently",
|
||||||
|
"unspecified",
|
||||||
|
"potentially",
|
||||||
|
"disclaimers",
|
||||||
|
"foreseeable",
|
||||||
|
"sustainable",
|
||||||
|
"calculation",
|
||||||
|
"replication",
|
||||||
|
"constitutes",
|
||||||
|
"recommended",
|
||||||
|
"enterprises",
|
||||||
|
"negotiation",
|
||||||
|
"imaginative",
|
||||||
|
"differences",
|
||||||
|
"nationality",
|
||||||
|
"impediments",
|
||||||
|
"refinements",
|
||||||
|
"translating",
|
||||||
|
"obligations",
|
||||||
|
"flexibility",
|
||||||
|
"unashamedly",
|
||||||
|
"exclusively",
|
||||||
|
"replacement",
|
||||||
|
"essentially",
|
||||||
|
"artifactory",
|
||||||
|
"theoretical",
|
||||||
|
"probability",
|
||||||
|
"integrators",
|
||||||
|
"contractor?",
|
||||||
|
"interested?",
|
||||||
|
"functioning",
|
||||||
|
"chamberlain",
|
||||||
|
"inclusivity",
|
||||||
|
"iteratively",
|
||||||
|
"enhancement",
|
||||||
|
"constraints",
|
||||||
|
"establishes",
|
||||||
|
"qualitative",
|
||||||
|
"influencing",
|
||||||
|
"procurement",
|
||||||
|
"experiences",
|
||||||
|
"furthermore",
|
||||||
|
"disciplined",
|
||||||
|
"unnecessary",
|
||||||
|
"bureaucracy",
|
||||||
|
"represented",
|
||||||
|
"siteimprove",
|
||||||
|
"lokhandwala",
|
||||||
|
"specialises",
|
||||||
|
"rationalize",
|
||||||
|
"competncies",
|
||||||
|
"restoration",
|
||||||
|
"allocations",
|
||||||
|
"admittances",
|
||||||
|
"furnishings",
|
||||||
|
"cleanliness",
|
||||||
|
"residential",
|
||||||
|
"contactable",
|
||||||
|
"conventions",
|
||||||
|
"translation",
|
||||||
|
"approaching",
|
||||||
|
"intecselect",
|
||||||
|
"linguistics",
|
||||||
|
"southampton",
|
||||||
|
"beautifully",
|
||||||
|
"estimations",
|
||||||
|
"newsletters",
|
||||||
|
"summarising",
|
||||||
|
"simulations",
|
||||||
|
"portfolio's",
|
||||||
|
"coronavirus",
|
||||||
|
"opoortunity",
|
||||||
|
"unavailable",
|
||||||
|
"accordingly",
|
||||||
|
"penetration",
|
||||||
|
"remediation",
|
||||||
|
"elimination",
|
||||||
|
"achievement",
|
||||||
|
"facilitator",
|
||||||
|
"westminster",
|
||||||
|
"introducing",
|
||||||
|
"businesses'",
|
||||||
|
"capitalists",
|
||||||
|
"investigate",
|
||||||
|
"countryside",
|
||||||
|
"problematic",
|
||||||
|
"coordinates",
|
||||||
|
"components'",
|
||||||
|
"supervision",
|
||||||
|
"bonavolonta",
|
||||||
|
"proposition",
|
||||||
|
"foundations",
|
||||||
|
"suitability",
|
||||||
|
"researchers",
|
||||||
|
"explanation",
|
||||||
|
"commitments",
|
||||||
|
"computation",
|
||||||
|
"questioning",
|
||||||
|
"experiments",
|
||||||
|
"visualfiles",
|
||||||
|
"cloudstream",
|
||||||
|
"determining",
|
||||||
|
"deliverable",
|
||||||
|
"inquisitive",
|
||||||
|
"backgrounds",
|
||||||
|
"thoughtspot",
|
||||||
|
"specialized",
|
||||||
|
"veloppement",
|
||||||
|
"importantes",
|
||||||
|
"typedscript",
|
||||||
|
"restaurants",
|
||||||
|
"prophylaxis",
|
||||||
|
"transmitted",
|
||||||
|
"appointment",
|
||||||
|
"encouraging",
|
||||||
|
"aggregating",
|
||||||
|
"championing",
|
||||||
|
"conjunction",
|
||||||
|
"customising",
|
||||||
|
"photography",
|
||||||
|
"authorities",
|
||||||
|
"competition",
|
||||||
|
"collections",
|
||||||
|
"contraintes",
|
||||||
|
"fonctionnel",
|
||||||
|
"adaptabilit",
|
||||||
|
"changements",
|
||||||
|
"conceptions",
|
||||||
|
"utilisation",
|
||||||
|
"shortlisted",
|
||||||
|
"reusability",
|
||||||
|
"recognizing",
|
||||||
|
"decisioning",
|
||||||
|
"accommodate",
|
||||||
|
"limitations",
|
||||||
|
"resourceful",
|
||||||
|
"algorithmic",
|
||||||
|
"unconcerned",
|
||||||
|
"intelligent",
|
||||||
|
"considerate",
|
||||||
|
"clientbased",
|
||||||
|
"accelerator",
|
||||||
|
"dreamweaver",
|
||||||
|
"applicant's",
|
||||||
|
"proactivity",
|
||||||
|
"aggregation",
|
||||||
|
"restriction",
|
||||||
|
"traditional",
|
||||||
|
"corporately",
|
||||||
|
"memberships",
|
||||||
|
"standardise",
|
||||||
|
"theecsgroup",
|
||||||
|
"scarchitect",
|
||||||
|
"consolidate",
|
||||||
|
"extensively",
|
||||||
|
"afghanistan",
|
||||||
|
"encompasses",
|
||||||
|
"distinctive",
|
||||||
|
"professions",
|
||||||
|
"interviewed",
|
||||||
|
"formulation",
|
||||||
|
"transitions",
|
||||||
|
"aspirations",
|
||||||
|
"ingredients",
|
||||||
|
"setterfield",
|
||||||
|
"candidate’s",
|
||||||
|
"leatherhead",
|
||||||
|
"publication",
|
||||||
|
"undoubtedly",
|
||||||
|
"basingstoke",
|
||||||
|
"underground",
|
||||||
|
"reinsurance",
|
||||||
|
"exemplifies",
|
||||||
|
"civiization",
|
||||||
|
"developer's",
|
||||||
|
"bazzelgette",
|
||||||
|
"adjacencies",
|
||||||
|
"feasibility",
|
||||||
|
"frontinvest",
|
||||||
|
"neogotiable",
|
||||||
|
"unconnected",
|
||||||
|
"conditional",
|
||||||
|
"bottlenecks",
|
||||||
|
"productions",
|
||||||
|
"pharmacists",
|
||||||
|
"technicians",
|
||||||
|
"prescribing",
|
||||||
|
"stewardship",
|
||||||
|
"recognising",
|
||||||
|
"convictions",
|
||||||
|
"subscribing",
|
||||||
|
"transparent",
|
||||||
|
"wireframing",
|
||||||
|
"insidehmcts",
|
||||||
|
"justicejobs",
|
||||||
|
"criminology",
|
||||||
|
"hospitality",
|
||||||
|
"structuring",
|
||||||
|
"educational",
|
||||||
|
"substantive",
|
||||||
|
"secondments",
|
||||||
|
"transgender",
|
||||||
|
"smartphones",
|
||||||
|
"microsoft's",
|
||||||
|
"definitions",
|
||||||
|
"validations",
|
||||||
|
"prioritised",
|
||||||
|
"autoscaling",
|
||||||
|
"abstraction",
|
||||||
|
"correlation",
|
||||||
|
"recognition",
|
||||||
|
"contributor",
|
||||||
|
"apigedevops",
|
||||||
|
"incorporate",
|
||||||
|
"woocommerce",
|
||||||
|
"informatics",
|
||||||
|
"adfadc@apps",
|
||||||
|
"automations",
|
||||||
|
"formulating",
|
||||||
|
"beneficiary",
|
||||||
|
"referential",
|
||||||
|
"jsdevsecops",
|
||||||
|
"solutioning",
|
||||||
|
"measurement",
|
||||||
|
"familiarise",
|
||||||
|
"eligibility",
|
||||||
|
"standardize",
|
||||||
|
"experience?",
|
||||||
|
"bournemouth",
|
||||||
|
"implementer",
|
||||||
|
"agilesphere",
|
||||||
|
"assumptions",
|
||||||
|
"accountancy",
|
||||||
|
"cockroachdb",
|
||||||
|
"promotional",
|
||||||
|
"facilitates",
|
||||||
|
"discoveries",
|
||||||
|
"bladecenter",
|
||||||
|
"considered!",
|
||||||
|
"cooperation",
|
||||||
|
"exploration",
|
||||||
|
"angulareact",
|
||||||
|
"preferabbly",
|
||||||
|
"harmonising",
|
||||||
|
"convenience",
|
||||||
|
"inclusively",
|
||||||
|
"strategists",
|
||||||
|
"attribution",
|
||||||
|
"fromscratch",
|
||||||
|
"combination",
|
||||||
|
"solutionize",
|
||||||
|
"accelerated",
|
||||||
|
"diagnostics",
|
||||||
|
"sensibility",
|
||||||
|
"informative",
|
||||||
|
"intellegnce",
|
||||||
|
"specilisits",
|
||||||
|
"projections",
|
||||||
|
"associative",
|
||||||
|
"personalize",
|
||||||
|
"farnborough",
|
||||||
|
"necessarily",
|
||||||
|
"nservicebus",
|
||||||
|
"constrained",
|
||||||
|
"prioritized",
|
||||||
|
"behavioural",
|
||||||
|
"chakraborty",
|
||||||
|
"leaderships",
|
||||||
|
"flourishing",
|
||||||
|
"uniqstudios",
|
||||||
|
"simplifying",
|
||||||
|
"realisation",
|
||||||
|
"extensions!",
|
||||||
|
"prioritises",
|
||||||
|
"experience!",
|
||||||
|
"candidates!",
|
||||||
|
"inclination",
|
||||||
|
"stimulating",
|
||||||
|
"appreciated",
|
||||||
|
"reinventing",
|
||||||
|
"compression",
|
||||||
|
"jscybsecdev",
|
||||||
|
"equirements",
|
||||||
|
"generalized",
|
||||||
|
"compressors",
|
||||||
|
"assessments",
|
||||||
|
"beyondtrust",
|
||||||
|
"engagements",
|
||||||
|
"numerically",
|
||||||
|
"electricity",
|
||||||
|
"interchange",
|
||||||
|
"jsswift_dev",
|
||||||
|
"circulating",
|
||||||
|
"attachments",
|
||||||
|
"credibility",
|
||||||
|
"vnetpeering",
|
||||||
|
"territories",
|
||||||
|
"staggering!",
|
||||||
|
"developers!",
|
||||||
|
"peripherals",
|
||||||
|
"virtualized",
|
||||||
|
"bitdefender",
|
||||||
|
"jssitecorjs",
|
||||||
|
"positioning",
|
||||||
|
"appreciates",
|
||||||
|
"chessington",
|
||||||
|
"controllers",
|
||||||
|
"controlling",
|
||||||
|
"quantifying",
|
||||||
|
"virtualised",
|
||||||
|
"manufacture",
|
||||||
|
"fluorescent",
|
||||||
|
"governments",
|
||||||
|
"bigcommerce",
|
||||||
|
"therapeutic",
|
||||||
|
"importantly",
|
||||||
|
"differently",
|
||||||
|
"rigourously",
|
||||||
|
"shareholder",
|
||||||
|
"copywriting",
|
||||||
|
"anticipated",
|
||||||
|
"approximate",
|
||||||
|
"behdarvandi",
|
||||||
|
"testability",
|
||||||
|
"beneficial!",
|
||||||
|
"jswmibmcraw",
|
||||||
|
"exhibitions",
|
||||||
|
"talentpoint",
|
||||||
|
"propagation",
|
||||||
|
"interviews!",
|
||||||
|
"solutionise",
|
||||||
|
"elasticache",
|
||||||
|
"manoeuvring",
|
||||||
|
"teamservice",
|
||||||
|
"geographies",
|
||||||
|
"efficientip",
|
||||||
|
"organically",
|
||||||
|
"advancement",
|
||||||
|
"jshodanular",
|
||||||
|
"wholesalers",
|
||||||
|
"multitenant",
|
||||||
|
"encouraged?",
|
||||||
|
"freelancers",
|
||||||
|
"composition",
|
||||||
|
"#jobswagger",
|
||||||
|
"typographic",
|
||||||
|
"stereotypes",
|
||||||
|
"clerkenwell",
|
||||||
|
"sacrificing",
|
||||||
|
"resolutions",
|
||||||
|
"technology?",
|
||||||
|
"advantagous"
|
||||||
|
]
|
22
mapbuilder.js
Normal file
22
mapbuilder.js
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 27/07/2020
|
||||||
|
* Time: 15:34
|
||||||
|
|
||||||
|
*/
|
||||||
|
const jsonfile = require('jsonfile');
|
||||||
|
|
||||||
|
// Words that count in favour of a job advert, and words that count against it.
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt'];
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];

// Score stored against every word of the respective list.
const goodScore = 3;
const badScore = -5;

const brain = new Map();

// Walk the complete lists. The previous `i < length - 1` bound silently
// dropped the final entry of each list ('mqtt' and 'shopify').
for (const word of goodWords) {
  brain.set(word, goodScore);
}

for (const word of badWords) {
  brain.set(word, badScore);
}

// A Map cannot be serialised directly, so it is written out as an array of
// [word, score] pairs.
jsonfile.writeFileSync('brain.json', [...brain]);
|
||||||
|
|
156
migrate.js
Normal file
156
migrate.js
Normal file
@ -0,0 +1,156 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 22/07/2020
|
||||||
|
* Time: 10:20
|
||||||
|
|
||||||
|
*/
|
||||||
|
const db = require('./lib/connect');
|
||||||
|
const log4js = require('log4js');
|
||||||
|
const logger = log4js.getLogger();
|
||||||
|
const { Utils } = require('@rakh/utils');
|
||||||
|
|
||||||
|
const { Corpus } = require('./lib/corpus');
|
||||||
|
|
||||||
|
const SHA = require('crypto-js/sha256');
|
||||||
|
|
||||||
|
/*
|
||||||
|
|
||||||
|
2604
|
||||||
|
|
||||||
|
const mongoose = require('mongoose');
|
||||||
|
const log4js = require('log4js');
|
||||||
|
const logger = log4js.getLogger();
|
||||||
|
|
||||||
|
const Jobs = require('./models/jobs');
|
||||||
|
|
||||||
|
require('dotenv').config();
|
||||||
|
|
||||||
|
logger.level = 'debug';
|
||||||
|
|
||||||
|
logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
|
||||||
|
|
||||||
|
mongoose.connect(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
|
||||||
|
|
||||||
|
const mDB = mongoose.connection;
|
||||||
|
mDB.on('error', console.error.bind(console, 'connection error:'));
|
||||||
|
*/
|
||||||
|
|
||||||
|
const Jobs = require('./lib/mongoManager');
|
||||||
|
|
||||||
|
const migrate = (function() {
|
||||||
|
/**
 * Buckets a free-text salary/rate string by the size of the first figure
 * left in it once currency symbols, letters, punctuation and decimal parts
 * have been stripped.
 *
 * @param {string} inval - raw salary text, e.g. '£450 per day'
 * @returns {number} 0 when there is no usable figure or it is below 100,
 *   1 for figures from 100 up to 4999, 2 for 5000 and above
 */
function analyseRate(inval) {
  // A missing salary cannot be classified; previously this threw on `.replace`.
  if (typeof inval !== 'string') return 0;

  const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
  const clearSpace = /\s+/g;

  // NOTE(review): '-' is removed rather than replaced by a space, so an
  // unspaced range such as '£400-£500' collapses into the single figure
  // 400500 — confirm whether that is intended.
  const [first] = inval.replace(cleanerReg, '').replace(clearSpace, ' ').trim().split(' ');
  const item = parseInt(first, 10);

  if (Number.isNaN(item) || item < 100) return 0;

  // The lower bound is inclusive: exactly 100 used to match no branch at all.
  return item < 5000 ? 1 : 2;
}
|
||||||
|
/**
 * Reshapes one row of the legacy jobs table into the `{ details, data }`
 * object that is handed to the Jobs model.
 *
 * @param {Object} d - legacy row; reads title, summary, salary, applied and
 *   timestamp in addition to the fields copied verbatim into `details`
 * @returns {{details: Object, data: Object}} the reshaped record
 */
function reduceData(d) {
  // Both patterns strip a trailing "Featured"/"Premium" marker from the title.
  const clearPremium = /(\n+)(Featured|Premium)/gi;
  const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;

  const details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);

  details.title = details.title.replace(clearPremium, '').replace(otherStupid, '');
  details.hashed = SHA(details.summary);

  const data = {
    // The read marker from the legacy row is deliberately not carried over.
    'read': 0,
    'applied': d.applied || 0,
    'jobtype': analyseRate(d.salary),
    'class': 0,
    'autoclass': Corpus.process(d.summary),
    'timestamp': d.timestamp * 1000
  };

  return { 'details': details, 'data': data };
}
|
||||||
|
|
||||||
|
/**
 * Reads every job row from the legacy database, joined with its applied and
 * read markers, ordered by `_id`.
 *
 * @returns {Promise<Object[]>} resolves with a fresh array of the rows;
 *   rejects with the error reported by `db.all`
 */
function getCurrent() {
  console.log('get version');
  const sql = 'select jobs.*, applied.a as applied, read.d as read from jobs left join applied on applied.aid = jobs._id left join read on read.rid = jobs._id order by _id asc;';

  return new Promise((resolve, reject) => {
    db.all(sql, [], (err, rows) => {
      if (err) {
        // Stop here: without the return the code went on to iterate `rows`,
        // which is not usable on failure, and threw inside the callback.
        reject(err);

        return;
      }

      resolve([...rows]);
    });
  });
}
|
||||||
|
|
||||||
|
/**
 * Copies every row returned by getCurrent() into the Jobs collection, one
 * save at a time, then asks the corpus to export its unused words.
 *
 * A row that fails to save is logged and skipped; any other failure is
 * logged and ends the run. The returned promise never rejects.
 *
 * @returns {Promise<void>}
 */
async function start() {
  try {
    const rows = await getCurrent();

    logger.debug(rows.length);

    // Visit every row: the earlier `t < length - 1` bound never migrated the
    // last one.
    for (const row of rows) {
      const newJob = Jobs(reduceData(row));

      try {
        const saved = await newJob.save();

        logger.debug('m', saved.details.title);
      }
      catch (err) {
        // keyPattern is not present on every error; fall back to the error
        // itself so a failure is never logged as `undefined`.
        logger.error(err.keyPattern || err);
      }
    }

    logger.debug('SAVING!!');
    Corpus.exportUnused();
  }
  catch (err) {
    logger.error(err.keyPattern || err);
  }
}
|
||||||
|
|
||||||
|
/**
 * Deletes jobs whose `data.timestamp` is more than fourteen days old and
 * whose `data.applied` is 0.
 *
 * The deletion is awaited, so awaiting this function really waits for it to
 * finish. Failures are logged, not rethrown.
 *
 * @returns {Promise<void>}
 */
async function deleteOld() {
  const oneDay = 86400000;
  const twoWeeksAgo = new Date().getTime() - (14 * oneDay);

  logger.debug('Delete older than: ', new Date(twoWeeksAgo), twoWeeksAgo);
  logger.debug({ 'data.timestamp': { '$lt': twoWeeksAgo } });

  try {
    // Previously fire-and-forget: the caller logged completion while the
    // delete was still in flight.
    const m = await Jobs.deleteMany({ 'data.timestamp': { '$lt': twoWeeksAgo }, 'data.applied': 0 });

    logger.debug('m', m);
  }
  catch (err) {
    logger.error(err);
  }
}
|
||||||
|
|
||||||
|
// newJob.find({ 'data': { 'timestamp': { '$lt': 1587034346000 } } });
|
||||||
|
|
||||||
|
return {
|
||||||
|
'start':start,
|
||||||
|
'deleteOld': deleteOld
|
||||||
|
};
|
||||||
|
})();
|
||||||
|
|
||||||
|
// Script entry point: migrate first, prune old jobs second, then report.
(async () => {
  await migrate.start();
  await migrate.deleteOld();

  logger.info('Done??');
})();
|
47
models/jobs.js
Normal file
47
models/jobs.js
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 22/07/2020
|
||||||
|
* Time: 14:18
|
||||||
|
|
||||||
|
*/
|
||||||
|
const mongoose = require('mongoose');
|
||||||
|
const Schema = mongoose.Schema;
|
||||||
|
|
||||||
|
// Builders for the two field shapes that repeat throughout the schema. Each
// call returns a fresh object so no definition is shared between fields.
const requiredString = () => ({ 'type': String, 'required': true });
const zeroedNumber = () => ({ 'type': Number, 'default': 0 });

// Fields under `details`; `url` and `hashed` are both declared unique.
const detailsFields = {
  'title': requiredString(),
  'site': requiredString(),
  'url': { 'type': String, 'required': true, 'unique': true },
  'id': String,
  'summary': String,
  'company': String,
  'location': String,
  'postdate': String,
  'salary': String,
  'easyapply': Number,
  'timestamp': Number,
  'hashed' : { 'type': String, 'required':true, 'unique':true }
};

// Fields under `autoclass`: three arrays plus a numeric score.
const autoclassFields = {
  'good': Array,
  'bad': Array,
  'words': Array,
  'score': zeroedNumber()
};

// Fields under `data`; every numeric field defaults to 0.
const dataFields = {
  'read': zeroedNumber(),
  'applied': zeroedNumber(),
  'jobtype': zeroedNumber(),
  'class': zeroedNumber(),
  'autoclass': autoclassFields,
  'timestamp': zeroedNumber(),
  'created_at': { 'type': Date, 'default': Date.now }
};

const jobSchema = new Schema({
  'details': detailsFields,
  'data': dataFields
});

mongoose.set('useFindAndModify', false);

const Jobs = mongoose.model('Jobs', jobSchema);

module.exports = Jobs;
|
66
onetime.js
Normal file
66
onetime.js
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 16/04/2020
|
||||||
|
* Time: 23:35
|
||||||
|
|
||||||
|
*/
|
||||||
|
const CronJob = require('cron').CronJob;
|
||||||
|
const IndeedScraper = require('./scrapers/indeed');
|
||||||
|
const TotaljobsScraper = require('./scrapers/totaljobs');
|
||||||
|
const CwjobsScraper = require('./scrapers/cwjobs');
|
||||||
|
const JobserveScraper = require('./scrapers/rss.jobserve');
|
||||||
|
const RssS1Jobs = require('./scrapers/rss.s1jobs');
|
||||||
|
const RssTechnojobs = require('./scrapers/rss.technojobs');
|
||||||
|
|
||||||
|
// One-off run: every scraper is invoked once, strictly one call after the
// other, in the order laid out by the tables below.
(async function () {
  console.log('Started..');

  const indeedScraper = new IndeedScraper();
  const totaljobsScraper = new TotaljobsScraper();
  const cwjobsScraper = new CwjobsScraper();
  const jobserveScraper = new JobserveScraper();
  const s1jobsScraper = new RssS1Jobs();
  const technojobsScraper = new RssTechnojobs();

  // Location-driven scrapers: each location is run through all three sites
  // before moving on to the next location.
  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];
  const siteScrapers = [indeedScraper, totaljobsScraper, cwjobsScraper];

  const jobserveFeeds = [
    'https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss',
    'https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss',
    'https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss',
    'https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss',
    'https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss',
    'https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss',
    'https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss',
    'https://www.jobserve.com/MySearch/C7B25D86D0844A.rss',
    'https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss',
    'https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss',
    'https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss',
    'https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss',
    'https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss', // javascript node 2 Jul 2020
    'https://www.jobserve.com/MySearch/4C67595E323E3453.rss', // vuejs 2 Jul 2020
    'https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss' // svelte 2 Jul 2020
  ];

  const s1jobsFeeds = [
    'http://www.s1jobs.com/xml/m7dp711z2r.xml',
    'http://www.s1jobs.com/xml/pfvf7o7z2r.xml',
    'http://www.s1jobs.com/xml/lluqnt8z2r.xml',
    'http://www.s1jobs.com/xml/tu33qt8z2r.xml',
    'http://www.s1jobs.com/xml/u3btnz8z2r.xml',
    'http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml',
    'http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml'
  ];

  const technojobsFeeds = [
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1',
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1',
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1'
  ];

  for (const location of locations) {
    for (const scraper of siteScrapers) {
      await scraper.go(location);
    }
  }

  for (const feed of jobserveFeeds) {
    await jobserveScraper.go(feed);
  }

  for (const feed of s1jobsFeeds) {
    await s1jobsScraper.go(feed);
  }

  for (const feed of technojobsFeeds) {
    await technojobsScraper.go(feed);
  }
})();
|
1294
package-lock.json
generated
1294
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -9,15 +9,21 @@
|
|||||||
"author": "",
|
"author": "",
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@rakh/utils": "^1.0.0",
|
||||||
"axios": "^0.19.2",
|
"axios": "^0.19.2",
|
||||||
|
"bayes": "^1.0.0",
|
||||||
"body-parser": "^1.19.0",
|
"body-parser": "^1.19.0",
|
||||||
"cheerio": "^1.0.0-rc.3",
|
"cheerio": "^1.0.0-rc.3",
|
||||||
"cron": "^1.8.2",
|
"cron": "^1.8.2",
|
||||||
|
"crypto-js": "^4.0.0",
|
||||||
"dotenv": "^8.2.0",
|
"dotenv": "^8.2.0",
|
||||||
"eslint": "^6.8.0",
|
"eslint": "^6.8.0",
|
||||||
"express": "^4.17.1",
|
"express": "^4.17.1",
|
||||||
"fecha": "^4.2.0",
|
"fecha": "^4.2.0",
|
||||||
"got": "^11.2.0",
|
"got": "^11.2.0",
|
||||||
|
"jsonfile": "^6.0.1",
|
||||||
|
"log4js": "^6.3.0",
|
||||||
|
"mongoose": "^5.9.25",
|
||||||
"present": "^1.0.0",
|
"present": "^1.0.0",
|
||||||
"rss-parser": "^3.8.0",
|
"rss-parser": "^3.8.0",
|
||||||
"sqlite3": "^4.1.1",
|
"sqlite3": "^4.1.1",
|
||||||
|
45
preload.js
Normal file
45
preload.js
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 28/07/2020
|
||||||
|
* Time: 10:51
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const fs = require('fs');
|
||||||
|
|
||||||
|
var bayes = require('bayes');
|
||||||
|
|
||||||
|
// The classifier is fed comma-joined word lists, so a token is simply each
// comma-separated item.
const splitOnComma = (text) => text.split(',');

var classifier = bayes({ 'tokenizer': splitOnComma });
|
||||||
|
|
||||||
|
// teach it positive phrases
|
||||||
|
|
||||||
|
/**
 * Seeds the classifier with the hand-picked word lists, prints two sample
 * categorisations and writes the classifier state to `brain.json`.
 *
 * @returns {Promise<void>}
 */
async function load() {
  const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt', 'es6', 'es2016', 'es2017', 'es2018', 'freelance'];
  const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];

  // Teach every word. The earlier `i < length - 1` bound skipped the final
  // entry of each list ('freelance' and 'shopify').
  for (const word of goodWords) {
    await classifier.learn(word, 'good');
  }

  for (const word of badWords) {
    await classifier.learn(word, 'bad');
  }

  // now ask it to categorize documents it has never seen before
  console.log(await classifier.categorize(['ui', 'developer', 'london', 'react'].join(',')));
  console.log(await classifier.categorize(['mysql', 'react', 'js', 'node', 'docker', 'kubernetes', 'google'].join(',')));

  // serialize the classifier's state as a JSON string.
  const stateJson = classifier.toJson();

  console.log(stateJson);

  fs.writeFileSync('brain.json', stateJson);
}
|
||||||
|
|
||||||
|
load();
|
@ -133,12 +133,15 @@ class IndeedScraper extends MasterScraper {
|
|||||||
await this.filterAdverts();
|
await this.filterAdverts();
|
||||||
|
|
||||||
await this.addToDB();
|
await this.addToDB();
|
||||||
|
await this.addToMongo();
|
||||||
}
|
}
|
||||||
|
|
||||||
async go(location = 'london') {
|
async go(location = 'london') {
|
||||||
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
|
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
|
||||||
|
|
||||||
await this.processSite();
|
await this.processSite().catch((err) => {
|
||||||
|
console.error('Indeed Go', err);
|
||||||
|
});
|
||||||
|
|
||||||
console.log(`Indeed ${location} completed`);
|
console.log(`Indeed ${location} completed`);
|
||||||
}
|
}
|
||||||
|
@ -140,6 +140,7 @@ class IndeedMobileScraper extends MasterScraper {
|
|||||||
await this.filterAdverts();
|
await this.filterAdverts();
|
||||||
|
|
||||||
await this.addToDB();
|
await this.addToDB();
|
||||||
|
await this.addToMongo();
|
||||||
}
|
}
|
||||||
|
|
||||||
async go(location = 'london') {
|
async go(location = 'london') {
|
||||||
|
@ -22,7 +22,10 @@ class TotaljobsScraper extends MasterScraper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Site specific parts below here
|
// Site specific parts below here
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async breakPage() {
|
async breakPage() {
|
||||||
const $ = this.currentPage;
|
const $ = this.currentPage;
|
||||||
const ads = [];
|
const ads = [];
|
||||||
@ -39,6 +42,11 @@ class TotaljobsScraper extends MasterScraper {
|
|||||||
this.items = [...this.items, ...ads];
|
this.items = [...this.items, ...ads];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param part
|
||||||
|
* @returns {Promise<{}>}
|
||||||
|
*/
|
||||||
async extractDetails(part) {
|
async extractDetails(part) {
|
||||||
const newObj = {};
|
const newObj = {};
|
||||||
const $part = cheerio.load(part);
|
const $part = cheerio.load(part);
|
||||||
@ -61,6 +69,11 @@ class TotaljobsScraper extends MasterScraper {
|
|||||||
return newObj;
|
return newObj;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param item
|
||||||
|
* @returns {Promise<*>}
|
||||||
|
*/
|
||||||
async getIndividualPage(item) {
|
async getIndividualPage(item) {
|
||||||
const newItem = {...item};
|
const newItem = {...item};
|
||||||
console.log('Getting', item.url);
|
console.log('Getting', item.url);
|
||||||
@ -75,6 +88,10 @@ class TotaljobsScraper extends MasterScraper {
|
|||||||
return newItem;
|
return newItem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async getJobPages() {
|
async getJobPages() {
|
||||||
const newItems = [];
|
const newItems = [];
|
||||||
for (let item of this.items) {
|
for (let item of this.items) {
|
||||||
@ -86,6 +103,10 @@ class TotaljobsScraper extends MasterScraper {
|
|||||||
this.items = [...newItems];
|
this.items = [...newItems];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async checkNext() {
|
async checkNext() {
|
||||||
const $ = this.currentPage;
|
const $ = this.currentPage;
|
||||||
const next = $('.pagination > *:last-child').attr('href') || '';
|
const next = $('.pagination > *:last-child').attr('href') || '';
|
||||||
@ -96,6 +117,10 @@ class TotaljobsScraper extends MasterScraper {
|
|||||||
console.log(next);
|
console.log(next);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async processSite() {
|
async processSite() {
|
||||||
console.log('Processing...');
|
console.log('Processing...');
|
||||||
|
|
||||||
@ -121,8 +146,14 @@ class TotaljobsScraper extends MasterScraper {
|
|||||||
await this.filterAdverts();
|
await this.filterAdverts();
|
||||||
|
|
||||||
await this.addToDB();
|
await this.addToDB();
|
||||||
|
await this.addToMongo();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param location
|
||||||
|
* @returns {Promise<void>}
|
||||||
|
*/
|
||||||
async go(location = 'london') {
|
async go(location = 'london') {
|
||||||
this.setStartUrl(`https://www.totaljobs.com/jobs/contract/html-or-vue-or-vuejs-or-web-or-sql-or-delphi-or-vb-or-vbscript-or-php-or-ajax-or-mysql-or-sqlserver-or-javascript-or-node-or-nodejs-or-svelte-or-sveltejs-not-react/in-${encodeURIComponent(location)}?q=Html+Or+Vue+Or+Vuejs+Or+Web+Or+Sql+Or+Delphi+Or+Vb+Or+Vbscript+Or+Php+Or+Ajax+Or+Mysql+Or+Sqlserver+Or+Javascript+Or+Node+Or+Nodejs+Or+Svelte+Or+Sveltejs+NOT+React&postedwithin=3&radius=20`);
|
this.setStartUrl(`https://www.totaljobs.com/jobs/contract/html-or-vue-or-vuejs-or-web-or-sql-or-delphi-or-vb-or-vbscript-or-php-or-ajax-or-mysql-or-sqlserver-or-javascript-or-node-or-nodejs-or-svelte-or-sveltejs-not-react/in-${encodeURIComponent(location)}?q=Html+Or+Vue+Or+Vuejs+Or+Web+Or+Sql+Or+Delphi+Or+Vb+Or+Vbscript+Or+Php+Or+Ajax+Or+Mysql+Or+Sqlserver+Or+Javascript+Or+Node+Or+Nodejs+Or+Svelte+Or+Sveltejs+NOT+React&postedwithin=3&radius=20`);
|
||||||
// this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+nodejs&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=7&limit=10&sort=date&psf=advsrch&from=advancedsearch');
|
// this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+nodejs&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=7&limit=10&sort=date&psf=advsrch&from=advancedsearch');
|
||||||
|
124
server/controllers/jobs.v2.controller.js
Normal file
124
server/controllers/jobs.v2.controller.js
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 24/07/2020
|
||||||
|
* Time: 11:45
|
||||||
|
|
||||||
|
*/
|
||||||
|
const Jobs = require('../../lib/mongoManager');
|
||||||
|
const { Utils } = require('@rakh/utils');
|
||||||
|
|
||||||
|
const killNLDoubleSpace = /(\\n)\s{2,}|(\\n)|\s{2,}/g;
|
||||||
|
|
||||||
|
/**
 * Flattens a list of job documents into the compact records used by the
 * list endpoint.
 *
 * @param {Object[]} [data] - job documents, each with `details`, `data` and `_id`
 * @returns {Object[]|string} one flattened record per document, or '' when
 *   `data` is missing, undefined or null
 */
function reduceList(data) {
  // An explicit `undefined` used to slip past the old `arguments`-based
  // check and crash on `.map`; it is now treated like a missing argument.
  if (data === undefined || data === null) return '';

  const listFields = ['title','site', 'company', 'timestamp', 'read', 'applied', 'jobtype', 'class', 'autoclass'];

  return data.map((v) => {
    const o = Utils.extractFromObj({...v.details,...v.data, _id:v._id}, listFields);

    // _id is not in listFields, so it is attached explicitly.
    o._id = v._id;

    return o;
  });
}
|
||||||
|
|
||||||
|
/**
 * Flattens a single job record: the `details` fields are lifted to the top
 * level, with `data` and `_id` placed alongside them.
 *
 * @param {Object} record - object with `details`, `data` and `_id`
 * @returns {Object} new object; `data` and `_id` win over same-named detail fields
 */
function reduceRecord(record) {
  return Object.assign({}, record.details, { 'data': record.data, '_id': record._id });
}
|
||||||
|
|
||||||
|
exports.getList = (req, res) => {
|
||||||
|
console.log('>getList req', req.params);
|
||||||
|
|
||||||
|
Jobs.find({}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).sort( { 'data.timestamp': -1 } ).then((doc) => {
|
||||||
|
if (doc) {
|
||||||
|
|
||||||
|
res.send(reduceList(doc));
|
||||||
|
}
|
||||||
|
}).catch((err) => {
|
||||||
|
console.error(err.message);
|
||||||
|
res.status(500).send({
|
||||||
|
'message': err.message || 'Some error occurred while querying the database.'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.getJob = (req, res) => {
|
||||||
|
console.log('>getJob req', req.params);
|
||||||
|
|
||||||
|
if(!req.params.id)
|
||||||
|
return res.status(500).send({
|
||||||
|
'message': 'Job id missing'
|
||||||
|
});
|
||||||
|
|
||||||
|
const id = req.params.id;
|
||||||
|
|
||||||
|
Jobs.findById(id).then((doc) => {
|
||||||
|
if (doc) {
|
||||||
|
|
||||||
|
const item = reduceRecord(doc._doc);
|
||||||
|
const date = new Date( item.timestamp * 1000);
|
||||||
|
|
||||||
|
console.log(item);
|
||||||
|
item.date = date.toLocaleString();
|
||||||
|
item.title = item.title.replace(killNLDoubleSpace, ' ');
|
||||||
|
|
||||||
|
res.send(item);
|
||||||
|
}
|
||||||
|
}).catch((err) => {
|
||||||
|
console.error(err.message);
|
||||||
|
res.status(500).send({
|
||||||
|
'message': err.message || 'Some error occurred while querying the database.'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.readJob = (req, res) => {
|
||||||
|
console.log('>readJob req', req.params);
|
||||||
|
|
||||||
|
let id;
|
||||||
|
if(!req.params.id)
|
||||||
|
return res.status(500).send({
|
||||||
|
'message': 'Job id missing'
|
||||||
|
});
|
||||||
|
else
|
||||||
|
id = req.params.id;
|
||||||
|
|
||||||
|
Jobs.findById(id).then((doc) => {
|
||||||
|
if (doc) {
|
||||||
|
|
||||||
|
let fullDoc = Object.assign({}, doc._doc);
|
||||||
|
|
||||||
|
console.log('fullDoc', fullDoc);
|
||||||
|
|
||||||
|
if (!Utils.isEmpty(fullDoc)){
|
||||||
|
fullDoc.data.read = new Date().getTime();
|
||||||
|
|
||||||
|
Jobs.findByIdAndUpdate(id, fullDoc, {'new':true}).then((doc) => {
|
||||||
|
console.log(doc._doc);
|
||||||
|
res.status(200).end();
|
||||||
|
}).catch((err) => {
|
||||||
|
console.error('inside',err.message);
|
||||||
|
res.status(500).send({
|
||||||
|
'message': err.message || 'Some error occurred while querying the database.'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}).catch((err) => {
|
||||||
|
console.error('outer', err.message);
|
||||||
|
res.status(500).send({
|
||||||
|
'message': err.message || 'Some error occurred while querying the database.'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
89
server/controllers/vote.controller.js
Normal file
89
server/controllers/vote.controller.js
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 28/07/2020
|
||||||
|
* Time: 11:08
|
||||||
|
|
||||||
|
*/
|
||||||
|
const Jobs = require('../../lib/mongoManager');
|
||||||
|
const { Utils } = require('@rakh/utils');
|
||||||
|
|
||||||
|
const fs = require('fs');
|
||||||
|
|
||||||
|
var bayes = require('bayes');
|
||||||
|
|
||||||
|
// Votes feed the classifier comma-joined word lists, so a token is simply
// each comma-separated item.
const tokenizer = (text) => text.split(',');

let classifier = bayes({ 'tokenizer': tokenizer });

/**
 * Replaces the classifier with the state stored in `brain.json`.
 */
function load() {
  // Read as text: fromJson is handed a JSON string rather than a Buffer.
  const file = fs.readFileSync('brain.json', 'utf8');

  classifier = bayes.fromJson(file);

  // A function cannot survive the JSON round trip, so the restored
  // classifier would not carry the comma tokenizer; put it back.
  // NOTE(review): relies on the classifier reading its `tokenizer`
  // property — confirm against the installed bayes version.
  classifier.tokenizer = tokenizer;
}

/**
 * Writes the classifier's current state to `brain.json`.
 */
function save() {
  const stateJson = classifier.toJson();

  console.log(stateJson);

  fs.writeFileSync('brain.json', stateJson);
}

load();
|
||||||
|
|
||||||
|
exports.upvote = (req, res) => {
|
||||||
|
console.log('>upvote req', req.params);
|
||||||
|
|
||||||
|
if(!req.params.id)
|
||||||
|
return res.status(500).send({
|
||||||
|
'message': 'Job id missing'
|
||||||
|
});
|
||||||
|
|
||||||
|
const id = req.params.id;
|
||||||
|
|
||||||
|
Jobs.findById(id).then(async (doc) => {
|
||||||
|
if (doc) {
|
||||||
|
const words = doc._doc.data.autoclass.words.join(',');
|
||||||
|
|
||||||
|
await classifier.learn(words, 'good');
|
||||||
|
|
||||||
|
save();
|
||||||
|
res.status(200).end();
|
||||||
|
}
|
||||||
|
}).catch((err) => {
|
||||||
|
console.error(err.message);
|
||||||
|
res.status(500).send({
|
||||||
|
'message': err.message || 'Some error occurred while querying the database.'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
exports.downvote = (req, res) => {
|
||||||
|
console.log('>upvote req', req.params);
|
||||||
|
|
||||||
|
if(!req.params.id)
|
||||||
|
return res.status(500).send({
|
||||||
|
'message': 'Job id missing'
|
||||||
|
});
|
||||||
|
|
||||||
|
const id = req.params.id;
|
||||||
|
|
||||||
|
Jobs.findById(id).then(async (doc) => {
|
||||||
|
if (doc) {
|
||||||
|
const words = doc._doc.data.autoclass.words.join(',');
|
||||||
|
|
||||||
|
await classifier.learn(words, 'bad');
|
||||||
|
|
||||||
|
save();
|
||||||
|
res.status(200).end();
|
||||||
|
}
|
||||||
|
}).catch((err) => {
|
||||||
|
console.error(err.message);
|
||||||
|
res.status(500).send({
|
||||||
|
'message': err.message || 'Some error occurred while querying the database.'
|
||||||
|
});
|
||||||
|
});
|
||||||
|
};
|
2
server/dist/build/bundle.css
vendored
2
server/dist/build/bundle.css
vendored
File diff suppressed because one or more lines are too long
6
server/dist/build/bundle.css.map
vendored
6
server/dist/build/bundle.css.map
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js
vendored
2
server/dist/build/bundle.js
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js.map
vendored
2
server/dist/build/bundle.js.map
vendored
File diff suppressed because one or more lines are too long
17
server/routes/jobs.v2.route.js
Normal file
17
server/routes/jobs.v2.route.js
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 24/07/2020
|
||||||
|
* Time: 11:42
|
||||||
|
|
||||||
|
*/
|
||||||
|
const jobs = require('../controllers/jobs.v2.controller');
|
||||||
|
|
||||||
|
module.exports = (app) => {
|
||||||
|
app.route('/v2/jobs')
|
||||||
|
.get(jobs.getList);
|
||||||
|
|
||||||
|
app.route('/v2/jobs/:id')
|
||||||
|
.get(jobs.getJob)
|
||||||
|
.put(jobs.readJob);
|
||||||
|
};
|
17
server/routes/vote.route.js
Normal file
17
server/routes/vote.route.js
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 28/07/2020
|
||||||
|
* Time: 11:07
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const vote = require('../controllers/vote.controller');
|
||||||
|
|
||||||
|
module.exports = (app) => {
|
||||||
|
app.route('/vote/up/:id')
|
||||||
|
.put(vote.upvote);
|
||||||
|
|
||||||
|
app.route('/vote/down/:id')
|
||||||
|
.put(vote.downvote);
|
||||||
|
};
|
@ -58,7 +58,9 @@ app.use(bodyParser.json());
|
|||||||
app.post('/auth', auth.auth);
|
app.post('/auth', auth.auth);
|
||||||
|
|
||||||
require('./routes/jobs.route')(app);
|
require('./routes/jobs.route')(app);
|
||||||
|
require('./routes/jobs.v2.route')(app);
|
||||||
require('./routes/apply.route')(app);
|
require('./routes/apply.route')(app);
|
||||||
|
require('./routes/vote.route')(app);
|
||||||
|
|
||||||
app.listen(serverPort, () => {
|
app.listen(serverPort, () => {
|
||||||
console.log(`Server is listening on port ${serverPort}`);
|
console.log(`Server is listening on port ${serverPort}`);
|
||||||
|
1986
test/indeed-2020-07-22--051214.html
Normal file
1986
test/indeed-2020-07-22--051214.html
Normal file
File diff suppressed because one or more lines are too long
@ -20,7 +20,7 @@ const indeedScraper = new IndeedScraper();
|
|||||||
// const page = fs.readFileSync('data/indeed/indeed-2020-04-16--092311.html');
|
// const page = fs.readFileSync('data/indeed/indeed-2020-04-16--092311.html');
|
||||||
const page = fs.readFileSync('data/indeed/page2.html');
|
const page = fs.readFileSync('data/indeed/page2.html');
|
||||||
|
|
||||||
test.test('Test Indeed scraper', async t => {
|
test.skip('Test Indeed scraper', async t => {
|
||||||
const $ = cheerio.load(page);
|
const $ = cheerio.load(page);
|
||||||
|
|
||||||
indeedScraper.loadPage($);
|
indeedScraper.loadPage($);
|
||||||
@ -35,13 +35,36 @@ test.test('Test Indeed scraper', async t => {
|
|||||||
|
|
||||||
await indeedScraper.filterAdverts();
|
await indeedScraper.filterAdverts();
|
||||||
|
|
||||||
// await indeedScraper.addToDB();
|
await indeedScraper.addToMongo();
|
||||||
|
|
||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
|
||||||
test.test('Test full run Indeed scraper', async t => {
|
test.skip('Test full run Indeed scraper', async t => {
|
||||||
await indeedScraper.go('london');
|
await indeedScraper.go('london').catch((err) => {
|
||||||
|
console.error('Indeed GO', err);
|
||||||
|
});
|
||||||
|
|
||||||
|
t.end();
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
|
test.test('Test Indeed scraper -- MONGO', async t => {
|
||||||
|
const $ = cheerio.load(page);
|
||||||
|
|
||||||
|
indeedScraper.loadPage($);
|
||||||
|
|
||||||
|
await indeedScraper.breakPage();
|
||||||
|
|
||||||
|
// await indeedScraper.getJobPages();
|
||||||
|
|
||||||
|
// console.log(await indeedScraper.checkNext());
|
||||||
|
|
||||||
|
// console.log(indeedScraper.items);
|
||||||
|
|
||||||
|
// await indeedScraper.filterAdverts();
|
||||||
|
|
||||||
|
await indeedScraper.addToMongo();
|
||||||
|
|
||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
@ -26,13 +26,14 @@ const s1jobsScraper = new RssS1Jobs();
|
|||||||
const feed = fs.readFileSync('test/data/s1jobs/m7dp711z2r.xml');
|
const feed = fs.readFileSync('test/data/s1jobs/m7dp711z2r.xml');
|
||||||
|
|
||||||
test.test('Test Jobserve scraper', async t => {
|
test.test('Test Jobserve scraper', async t => {
|
||||||
|
let url = 'http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml';
|
||||||
await s1jobsScraper.setStartUrl(url);
|
await s1jobsScraper.setStartUrl(url);
|
||||||
|
|
||||||
|
|
||||||
s1jobsScraper.reduceItems();
|
s1jobsScraper.reduceItems();
|
||||||
|
|
||||||
await s1jobsScraper.filterAdverts();
|
await s1jobsScraper.filterAdverts();
|
||||||
await s1jobsScraper.addToDB();
|
// await s1jobsScraper.addToDB();
|
||||||
|
|
||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
@ -19,17 +19,17 @@ const testScraper = new RssTechnojobs();
|
|||||||
const feed = fs.readFileSync('test/data/technojobs/page1');
|
const feed = fs.readFileSync('test/data/technojobs/page1');
|
||||||
|
|
||||||
test.test('Test Technojobs scraper', async t => {
|
test.test('Test Technojobs scraper', async t => {
|
||||||
// await testScraper.loadFeed(feed);
|
await testScraper.loadFeed('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
|
||||||
|
|
||||||
// testScraper.reduceItems();
|
await testScraper.reduceItems();
|
||||||
|
|
||||||
// await s1jobsScraper.filterAdverts();
|
await s1jobsScraper.filterAdverts();
|
||||||
// await s1jobsScraper.addToDB();
|
// await s1jobsScraper.addToDB();
|
||||||
|
|
||||||
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
/* await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
||||||
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
||||||
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
||||||
|
*/
|
||||||
|
|
||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
@ -22,20 +22,20 @@ console.log(`${__dirname}`);
|
|||||||
const page = fs.readFileSync(`${__dirname}/data/totaljobs/totaljobs-2020-04-16--121504.html`);
|
const page = fs.readFileSync(`${__dirname}/data/totaljobs/totaljobs-2020-04-16--121504.html`);
|
||||||
|
|
||||||
test.test('Test Totaljobs scraper', async t => {
|
test.test('Test Totaljobs scraper', async t => {
|
||||||
const $ = cheerio.load(page);
|
const $ = cheerio.load(page);
|
||||||
|
|
||||||
totaljobsScraper.loadPage($);
|
totaljobsScraper.loadPage($);
|
||||||
|
|
||||||
await totaljobsScraper.breakPage();
|
await totaljobsScraper.breakPage();
|
||||||
|
|
||||||
await totaljobsScraper.getJobPages();
|
await totaljobsScraper.getJobPages();
|
||||||
// console.log(await indeedScraper.checkNext());
|
// console.log(await indeedScraper.checkNext());
|
||||||
|
|
||||||
console.log(totaljobsScraper.items);
|
// console.log(totaljobsScraper.items);
|
||||||
|
|
||||||
await totaljobsScraper.filterAdverts();
|
await totaljobsScraper.filterAdverts();
|
||||||
|
|
||||||
// await totaljobsScraper.addToDB();
|
// await totaljobsScraper.addToDB();
|
||||||
|
|
||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
14
test/wip.js
Normal file
14
test/wip.js
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 23/07/2020
|
||||||
|
* Time: 09:26
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const { Corpus } = require('../lib/corpus');
|
||||||
|
|
||||||
|
// Scratch input: a sample advert snippet fed through Corpus.process so the
// result can be inspected on the console.
const sampleAdvert = 'ESTAMP DEVELOPER 6 month contract £450-525 / day Developer, SQL, Photoshop, Javascript, … NET, C#, Javascript Advanced knowledge of SQL Server TSQL Experience of the design and … PDF stamp development E-STAMP DEVELOPER 6 month contract';

console.log(Corpus.process(sampleAdvert));
|
71
testgrabber.js
Normal file
71
testgrabber.js
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 16/04/2020
|
||||||
|
* Time: 23:35
|
||||||
|
|
||||||
|
*/
|
||||||
|
const CronJob = require('cron').CronJob;
|
||||||
|
const IndeedScraper = require('./scrapers/indeed');
|
||||||
|
const TotaljobsScraper = require('./scrapers/totaljobs');
|
||||||
|
const CwjobsScraper = require('./scrapers/cwjobs');
|
||||||
|
const JobserveScraper = require('./scrapers/rss.jobserve');
|
||||||
|
const RssS1Jobs = require('./scrapers/rss.s1jobs');
|
||||||
|
const RssTechnojobs = require('./scrapers/rss.technojobs');
|
||||||
|
|
||||||
|
/**
 * Manual grabber run.
 *
 * Runs the Indeed, Totaljobs and CWJobs scrapers, in that order, for each
 * location in turn. Every `go()` call is awaited before the next one starts,
 * so the first failure stops the run. The failure is reported by the
 * `.catch` handler at the bottom rather than surfacing as an unhandled
 * promise rejection.
 */
(async function () {
  console.log('Started..');

  const indeedScraper = new IndeedScraper();
  const totaljobsScraper = new TotaljobsScraper();
  const cwjobsScraper = new CwjobsScraper();

  // Only referenced by the disabled RSS runs further down; still constructed
  // so that re-enabling that section needs no other change.
  const jobserveScraper = new JobserveScraper();
  const s1jobsScraper = new RssS1Jobs();
  const technojobsScraper = new RssTechnojobs();

  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];
  const siteScrapers = [indeedScraper, totaljobsScraper, cwjobsScraper];

  // Deliberately sequential: one request stream at a time, same order as the
  // original hand-written list of calls.
  for (const location of locations) {
    for (const scraper of siteScrapers) {
      await scraper.go(location);
    }
  }

  /*
  await jobserveScraper.go('https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/C7B25D86D0844A.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss');
  await jobserveScraper.go('https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss'); // javascript node 2 Jul 2020
  await jobserveScraper.go('https://www.jobserve.com/MySearch/4C67595E323E3453.rss'); // vuejs 2 Jul 2020
  await jobserveScraper.go('https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss'); // svelte 2 Jul 2020

  await s1jobsScraper.go('http://www.s1jobs.com/xml/m7dp711z2r.xml');
  await s1jobsScraper.go('http://www.s1jobs.com/xml/pfvf7o7z2r.xml');
  await s1jobsScraper.go('http://www.s1jobs.com/xml/lluqnt8z2r.xml');
  await s1jobsScraper.go('http://www.s1jobs.com/xml/tu33qt8z2r.xml');
  await s1jobsScraper.go('http://www.s1jobs.com/xml/u3btnz8z2r.xml');
  await s1jobsScraper.go('http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml');
  await s1jobsScraper.go('http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml');

  await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
  await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
  await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');

  */
})().catch((err) => {
  // Report the failure explicitly and make the process exit non-zero.
  console.error('Grabber run failed', err);
  process.exitCode = 1;
});
|
1
unused.json
Normal file
1
unused.json
Normal file
File diff suppressed because one or more lines are too long
22
words.js
Normal file
22
words.js
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
/**
|
||||||
|
* Created by WebStorm.
|
||||||
|
* User: martin
|
||||||
|
* Date: 27/07/2020
|
||||||
|
* Time: 10:08
|
||||||
|
|
||||||
|
*/
|
||||||
|
|
||||||
|
const jsonfile = require('jsonfile');
|
||||||
|
|
||||||
|
const data = require('./unused.json');
|
||||||
|
|
||||||
|
/**
 * Writes the distinct entries of `data` whose `length` equals `size` to
 * `limited.json`, keeping first-occurrence order, then logs 'done'.
 *
 * @param {number} size - exact `length` an entry must have to be kept.
 */
function show(size) {
  const unique = new Set();

  for (const entry of data) {
    if (entry.length === size) {
      unique.add(entry);
    }
  }

  jsonfile.writeFileSync('limited.json', Array.from(unique));
  console.log('done');
}

show(11);
|
Loading…
Reference in New Issue
Block a user