Merge branch 'JOBSCRAPER-1' into 'development'
Resolve JOBSCRAPER-1 See merge request martind2000/jobscraper!1
This commit is contained in:
commit
1513ea5010
32
.edditorconfig
Normal file
32
.edditorconfig
Normal file
@ -0,0 +1,32 @@
|
||||
; http://editorconfig.org
|
||||
|
||||
root = true
|
||||
|
||||
[*]
|
||||
charset = utf-8
|
||||
end_of_line = lf
|
||||
insert_final_newline = true
|
||||
trim_trailing_whitespace = true
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
|
||||
[*.txt]
|
||||
insert_final_newline = false
|
||||
trim_trailing_whitespace = false
|
||||
|
||||
[*.py]
|
||||
indent_size = 4
|
||||
|
||||
[*.m]
|
||||
indent_size = 4
|
||||
|
||||
[Makefile]
|
||||
indent_style = tab
|
||||
indent_size = 8
|
||||
|
||||
[*.{js,json}]
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
|
||||
[*.md]
|
||||
trim_trailing_whitespace = false
|
@ -9,7 +9,7 @@
|
||||
"env": {
|
||||
"browser": true,
|
||||
"node": true,
|
||||
"es6": true
|
||||
"es2017": true
|
||||
},
|
||||
"rules": {
|
||||
"arrow-spacing": "error",
|
||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -147,3 +147,4 @@ fabric.properties
|
||||
/live/
|
||||
!/output/
|
||||
/db/jobs.db
|
||||
!/db/
|
||||
|
1
biglist.json
Normal file
1
biglist.json
Normal file
File diff suppressed because one or more lines are too long
204
brain.json
Normal file
204
brain.json
Normal file
@ -0,0 +1,204 @@
|
||||
{
|
||||
"categories": {
|
||||
"good": true,
|
||||
"bad": true
|
||||
},
|
||||
"docCount": {
|
||||
"good": 43,
|
||||
"bad": 5
|
||||
},
|
||||
"totalDocuments": 48,
|
||||
"vocabulary": {
|
||||
"tsql": true,
|
||||
"developer": true,
|
||||
"contract": true,
|
||||
"web": true,
|
||||
"javascript": true,
|
||||
"js": true,
|
||||
"node": true,
|
||||
"es": true,
|
||||
"agile": true,
|
||||
"nodejs": true,
|
||||
"london": true,
|
||||
"aws": true,
|
||||
"sql": true,
|
||||
"postgresql": true,
|
||||
"mysql": true,
|
||||
"docker": true,
|
||||
"ecs": true,
|
||||
"automation": true,
|
||||
"jslint": true,
|
||||
"jshint": true,
|
||||
"vuejs": true,
|
||||
"vue": true,
|
||||
"nginx": true,
|
||||
"remotely": true,
|
||||
"mvc": true,
|
||||
"remote": true,
|
||||
"iot": true,
|
||||
"mqtt": true,
|
||||
"es6": true,
|
||||
"es2016": true,
|
||||
"es2017": true,
|
||||
"es2018": true,
|
||||
"react": true,
|
||||
"redux": true,
|
||||
"graphql": true,
|
||||
"java": true,
|
||||
"reactjs": true,
|
||||
"apps": true,
|
||||
"html": true,
|
||||
"css": true,
|
||||
"code": true,
|
||||
"angular": true,
|
||||
"ember": true,
|
||||
"restful": true,
|
||||
"apis": true,
|
||||
"infrastructure": true,
|
||||
"software": true,
|
||||
"native": true,
|
||||
"med": true,
|
||||
"mobile": true,
|
||||
"client": true,
|
||||
"applications": true,
|
||||
"digital": true,
|
||||
"analytics": true,
|
||||
"dashboarding": true,
|
||||
"online": true,
|
||||
"analyse": true,
|
||||
"dashboards": true,
|
||||
"google": true,
|
||||
"query": true,
|
||||
"data": true,
|
||||
"stakeholders": true,
|
||||
"enhancements": true,
|
||||
"requirements": true,
|
||||
"c": true,
|
||||
"net": true,
|
||||
"technologies": true,
|
||||
"azure": true,
|
||||
"understanding": true,
|
||||
"devops": true,
|
||||
"tools": true,
|
||||
"frameworks": true,
|
||||
"scotland": true,
|
||||
"responsibility": true,
|
||||
"programme": true,
|
||||
"functions": true,
|
||||
"asp": true,
|
||||
"project": true,
|
||||
"transform": true,
|
||||
"collaborative": true,
|
||||
"technical": true,
|
||||
"framework": true,
|
||||
"nhibernate": true,
|
||||
"server": true,
|
||||
"api": true,
|
||||
"development": true,
|
||||
"lifecycle": true,
|
||||
"specification": true,
|
||||
"appointments": true
|
||||
},
|
||||
"vocabularySize": 89,
|
||||
"wordCount": {
|
||||
"good": 157,
|
||||
"bad": 5
|
||||
},
|
||||
"wordFrequencyCount": {
|
||||
"good": {
|
||||
"tsql": 1,
|
||||
"developer": 6,
|
||||
"contract": 9,
|
||||
"web": 6,
|
||||
"javascript": 7,
|
||||
"js": 3,
|
||||
"node": 2,
|
||||
"es": 1,
|
||||
"agile": 2,
|
||||
"nodejs": 1,
|
||||
"london": 3,
|
||||
"aws": 3,
|
||||
"sql": 3,
|
||||
"postgresql": 1,
|
||||
"mysql": 1,
|
||||
"docker": 1,
|
||||
"ecs": 1,
|
||||
"automation": 1,
|
||||
"jslint": 1,
|
||||
"jshint": 1,
|
||||
"vuejs": 1,
|
||||
"vue": 2,
|
||||
"nginx": 1,
|
||||
"remotely": 1,
|
||||
"mvc": 5,
|
||||
"remote": 2,
|
||||
"iot": 1,
|
||||
"mqtt": 1,
|
||||
"es6": 1,
|
||||
"es2016": 1,
|
||||
"es2017": 1,
|
||||
"es2018": 1,
|
||||
"apps": 1,
|
||||
"html": 5,
|
||||
"css": 5,
|
||||
"code": 2,
|
||||
"react": 2,
|
||||
"angular": 1,
|
||||
"ember": 1,
|
||||
"restful": 1,
|
||||
"apis": 1,
|
||||
"infrastructure": 1,
|
||||
"software": 2,
|
||||
"native": 1,
|
||||
"med": 1,
|
||||
"mobile": 1,
|
||||
"client": 4,
|
||||
"applications": 2,
|
||||
"digital": 2,
|
||||
"analytics": 1,
|
||||
"dashboarding": 1,
|
||||
"online": 1,
|
||||
"analyse": 1,
|
||||
"dashboards": 1,
|
||||
"google": 1,
|
||||
"query": 1,
|
||||
"data": 1,
|
||||
"stakeholders": 1,
|
||||
"enhancements": 3,
|
||||
"requirements": 3,
|
||||
"c": 4,
|
||||
"net": 5,
|
||||
"technologies": 4,
|
||||
"azure": 2,
|
||||
"understanding": 1,
|
||||
"devops": 2,
|
||||
"tools": 1,
|
||||
"frameworks": 1,
|
||||
"scotland": 1,
|
||||
"responsibility": 1,
|
||||
"programme": 1,
|
||||
"functions": 1,
|
||||
"asp": 1,
|
||||
"project": 1,
|
||||
"transform": 1,
|
||||
"collaborative": 1,
|
||||
"technical": 1,
|
||||
"framework": 1,
|
||||
"nhibernate": 1,
|
||||
"server": 1,
|
||||
"api": 1,
|
||||
"development": 1,
|
||||
"lifecycle": 1,
|
||||
"specification": 1,
|
||||
"appointments": 1
|
||||
},
|
||||
"bad": {
|
||||
"react": 1,
|
||||
"redux": 1,
|
||||
"graphql": 1,
|
||||
"java": 1,
|
||||
"reactjs": 1
|
||||
}
|
||||
},
|
||||
"options": {}
|
||||
}
|
BIN
db/jobs.db
BIN
db/jobs.db
Binary file not shown.
84
lib/base.js
84
lib/base.js
@ -8,6 +8,12 @@
|
||||
const filterReject = require('../lib/filter_reject');
|
||||
const filterAccept = require('../lib/filter_md_jobs');
|
||||
const dbmanager = require('../lib/dbmanager');
|
||||
const JobsModel = require('../lib/mongoManager');
|
||||
|
||||
const SHA = require('crypto-js/sha256');
|
||||
|
||||
const { Utils } = require('@rakh/utils');
|
||||
const { Corpus } = require('./corpus');
|
||||
|
||||
class MasterBase {
|
||||
|
||||
@ -57,6 +63,79 @@ class MasterBase {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
addToMongo() {
|
||||
console.log('>> ADD TO MONGO!');
|
||||
|
||||
for(const item of this.items) {
|
||||
// console.log('add', item);
|
||||
const newObj = this.reduceData(item);
|
||||
const newJob = new JobsModel(newObj);
|
||||
|
||||
newJob.save().then((m) => {
|
||||
console.log('m', m.details.title);
|
||||
}).catch((err) => {
|
||||
console.error('m', err);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param inval
|
||||
* @returns {number}
|
||||
*/
|
||||
analyseRate(inval) {
|
||||
console.log('analyseRate', inval);
|
||||
let outVal = 0;
|
||||
const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
|
||||
const clearSpace = /\s+/g;
|
||||
|
||||
const result = inval.replace(cleanerReg, '').replace(clearSpace, ' ');
|
||||
const resultArray = result.trim().split((' '));
|
||||
|
||||
if (resultArray.length > 0) {
|
||||
const item = parseInt(resultArray[0], 10);
|
||||
|
||||
if (item < 100) outVal = 0;
|
||||
else if ((item > 100) && (item < 5000)) outVal = 1;
|
||||
else if (item >= 5000) outVal = 2;
|
||||
}
|
||||
else return 0;
|
||||
|
||||
return outVal;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param d
|
||||
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
|
||||
*/
|
||||
reduceData(d) {
|
||||
const clearPremium = /(\n+)(Featured|Premium)/gi;
|
||||
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
|
||||
|
||||
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
|
||||
|
||||
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
|
||||
|
||||
outObj.details.title = outObj.details.title.replace(clearPremium, '');
|
||||
outObj.details.title = outObj.details.title.replace(otherStupid, '');
|
||||
outObj.details.hashed = SHA(outObj.details.summary);
|
||||
|
||||
outObj.data.read = 0;
|
||||
outObj.data.applied = d.applied || 0;
|
||||
|
||||
outObj.data.jobtype = this.analyseRate(d.salary);
|
||||
outObj.data.autoclass = Corpus.process(d.summary);
|
||||
|
||||
outObj.data.timestamp = d.timestamp * 1000;
|
||||
|
||||
return outObj;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
@ -120,10 +199,15 @@ class MasterBase {
|
||||
return `https://image.silvrtree.co.uk/q${q}/${url}`;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async go() {
|
||||
this.items = [];
|
||||
this.rawItems = [];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = MasterBase;
|
||||
|
90
lib/corpus.js
Normal file
90
lib/corpus.js
Normal file
@ -0,0 +1,90 @@
|
||||
const jsonfile = require('jsonfile');
|
||||
|
||||
const words = require('../lib/wordlist.json');
|
||||
const wordsAdditional = require('../lib/wordlistAdditional.json');
|
||||
|
||||
const bigList = new Map([]);
|
||||
|
||||
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt'];
|
||||
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];
|
||||
let unrated = [];
|
||||
|
||||
var _global = typeof global === 'undefined' ? window : global;
|
||||
var Corpus = (_global.Corpus = _global.Corpus || {});
|
||||
|
||||
const emailRegex = /[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/;
|
||||
const detagRegex = /(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/gi;
|
||||
const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
|
||||
const deSpace = /\s+/g;
|
||||
|
||||
/**
 * Normalise free text into a lower-case, single-spaced string of plain words.
 *
 * Email addresses, the markup matched by `detagRegex`, and the digits / punctuation
 * matched by `desymbolNumberRegex` are each replaced by a space before whitespace
 * is collapsed and the result is trimmed.
 *
 * @param {string} intext raw text
 * @returns {string} cleaned text, or '' when `intext` is missing
 */
function cleanText(intext) {
  if (intext === undefined || intext === null) return '';

  // `emailRegex` is declared without flags, so used directly it removes only the
  // first, lower-case address; rebuild it as global and case-insensitive.
  const everyEmail = new RegExp(emailRegex.source, 'gi');

  return intext
    .replace(everyEmail, ' ')
    .replace(detagRegex, ' ')
    .replace(desymbolNumberRegex, ' ')
    .replace(deSpace, ' ')
    .trim()
    .toLowerCase();
}
|
||||
|
||||
/**
 * Return the distinct values of an iterable, keeping first-seen order.
 *
 * @param intext iterable of values (array, string, ...)
 * @returns {Array} unique values; empty when `intext` is missing
 */
function dedupe(intext) {
  const source = (intext === null || intext === undefined) ? [] : intext;

  return Array.from(new Set(source));
}
|
||||
|
||||
/**
 * Add one to the running count kept for `item` in `bigList`, starting at 1 on first sight.
 *
 * @param item word to count
 */
function incItem(item) {
  const seenSoFar = bigList.has(item) ? bigList.get(item) : 0;

  bigList.set(item, seenSoFar + 1);
}
|
||||
|
||||
/**
 * Score a piece of text against the good / bad word lists.
 *
 * The text is cleaned, split into unique words and stripped of anything found in
 * `words` or `wordsAdditional`. Every remaining word is counted in `bigList`, and
 * the ones on neither the good nor the bad list are appended to `unrated`.
 *
 * @param intext raw text to score
 * @returns {{good: string[], bad: string[], score: number, words: string[]}}
 * `score` is one point per good word minus five per bad word; `words` is the filtered word list
 */
Corpus.process = function(intext) {
  const workText = cleanText(intext);

  // ''.split(' ') yields [''], so blank input must not be counted as a word.
  const cleanedArray = dedupe(workText.split(' ')).filter((v) => {
    return v !== '' && !words.includes(v) && !wordsAdditional.includes(v);
  });

  const good = cleanedArray.filter((v) => goodWords.includes(v));

  const bad = cleanedArray.filter((v) => badWords.includes(v));

  const unused = cleanedArray.filter((v) => !badWords.includes(v) && !goodWords.includes(v));

  // Counting is a side effect only, so iterate instead of building a throw-away array.
  for (const item of cleanedArray)
    incItem(item);

  unrated = [...unrated, ...unused];

  const score = good.length - (bad.length * 5);

  return { good, bad, score, 'words':cleanedArray };
};
|
||||
|
||||
/**
 * Dump the word statistics collected so far to disk.
 *
 * Writes the de-duplicated `unrated` words to ./unused.json and the `bigList`
 * [word, count] pairs, highest count first, to ./biglist.json, then prints the pairs.
 */
Corpus.exportUnused = function() {
  const uniqueUnrated = dedupe(unrated);

  jsonfile.writeFileSync('./unused.json', uniqueUnrated);

  const byCountDescending = Array.from(bigList).sort((left, right) => right[1] - left[1]);

  jsonfile.writeFileSync('./biglist.json', byCountDescending);

  console.log(Array.from(bigList));
};
|
||||
|
||||
|
||||
|
||||
if (typeof module !== 'undefined')
|
||||
module.exports = {
|
||||
'Corpus': Corpus
|
||||
};
|
34
lib/mongoManager.js
Normal file
34
lib/mongoManager.js
Normal file
@ -0,0 +1,34 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 22/07/2020
|
||||
* Time: 17:00
|
||||
|
||||
*/
|
||||
|
||||
const mongoose = require('mongoose');
|
||||
const log4js = require('log4js');
|
||||
const logger = log4js.getLogger();
|
||||
|
||||
const JobsModel = require('../models/jobs');
|
||||
|
||||
const { Utils } = require('@rakh/utils');
|
||||
|
||||
require('dotenv').config();
|
||||
|
||||
logger.level = 'debug';
|
||||
|
||||
const mongoConnect = process.env.MONGOCONNECT;

// The connection string carries the database user and password, so only a
// redacted form is ever written to the log.
const redactedConnect = (typeof mongoConnect === 'string')
  ? mongoConnect.replace(/\/\/[^/@]*@/, '//***@')
  : mongoConnect;

logger.debug(redactedConnect);

// Log a failed initial connection instead of leaving the rejection unhandled.
mongoose.connect(mongoConnect).catch((err) => {
  logger.error('initial connection error:', err);
});

const mDB = mongoose.connection;

mDB.on('error', console.error.bind(console, 'connection error:'));
|
||||
|
||||
module.exports = JobsModel;
|
@ -89,6 +89,7 @@ class MasterRSS extends MasterBase {
|
||||
await this.filterAdverts();
|
||||
|
||||
if (this.items.length > 0) await this.addToDB();
|
||||
if (this.items.length > 0) await this.addToMongo();
|
||||
}
|
||||
else
|
||||
console.log('No items to process');
|
||||
|
@ -20,9 +20,14 @@ class MasterScraper extends MasterBase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
getContent(url, useStone = false) {
|
||||
|
||||
/**
|
||||
*
|
||||
* @param url
|
||||
* @param useStone
|
||||
* @returns {Promise<unknown>}
|
||||
*/
|
||||
getContent(url, useStone = false) {
|
||||
|
||||
/*
|
||||
let headers = new Headers({
|
||||
@ -54,19 +59,28 @@ fetch(url, {
|
||||
resolve(response.body);
|
||||
})
|
||||
.catch((e) => {
|
||||
console.error('getContent', e );
|
||||
reject(e.response.body);
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
async savePage(html) {
|
||||
const now = fecha.format(new Date(), 'YYYY-MM-DD--hh');
|
||||
|
||||
const filename = `pages/${this.siteid}-${now}.html`;
|
||||
|
||||
fs.writeFileSync(filename, html);
|
||||
}
|
||||
|
||||
async getPage() {
|
||||
console.log('>> getPage: fetching', this.url);
|
||||
const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss');
|
||||
const filename = `${this.siteid}-${now}.html`;
|
||||
|
||||
await this.getContent(this.url, this.useStone)
|
||||
.then((html) => {
|
||||
fs.writeFileSync(filename, html);
|
||||
// console.log('>> getPage:: got', html);
|
||||
console.log('>> getPage:: OK');
|
||||
if (this.saveFile) this.savePage(html);
|
||||
const $ = cheerio.load(html);
|
||||
this.loadPage($);
|
||||
})
|
||||
@ -75,30 +89,59 @@ fetch(url, {
|
||||
|
||||
// Site specific parts below here
|
||||
|
||||
/**
|
||||
* Break each page into items
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async breakPage() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param part
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractDetails(part) {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async checkNext() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processSite() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async getIndividualPage() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async getJobPages() {
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async go() {
|
||||
|
||||
}
|
||||
|
1007
lib/wordlist.json
Normal file
1007
lib/wordlist.json
Normal file
File diff suppressed because it is too large
Load Diff
8790
lib/wordlistAdditional.json
Normal file
8790
lib/wordlistAdditional.json
Normal file
File diff suppressed because it is too large
Load Diff
559
limited.json
Normal file
559
limited.json
Normal file
@ -0,0 +1,559 @@
|
||||
[
|
||||
"experienced",
|
||||
"exceptional",
|
||||
"maintaining",
|
||||
"familiarity",
|
||||
"commodities",
|
||||
"opportunity",
|
||||
"possibility",
|
||||
"integration",
|
||||
"engineering",
|
||||
"derivatives",
|
||||
"prefferable",
|
||||
"nutritional",
|
||||
"performance",
|
||||
"immediately",
|
||||
"information",
|
||||
"responsible",
|
||||
"environment",
|
||||
"stakeholder",
|
||||
"proactively",
|
||||
"requirement",
|
||||
"temporarily",
|
||||
"interrogate",
|
||||
"effectively",
|
||||
"progressing",
|
||||
"substantial",
|
||||
"identifying",
|
||||
"maintenance",
|
||||
"workarounds",
|
||||
"departments",
|
||||
"consultancy",
|
||||
"regulations",
|
||||
"statistical",
|
||||
"previously·",
|
||||
"euromonitor",
|
||||
"documenting",
|
||||
"bookkeeping",
|
||||
"reconciling",
|
||||
"hardworking",
|
||||
"themselves!",
|
||||
"appropriate",
|
||||
"socialising",
|
||||
"fundraising",
|
||||
"initiatives",
|
||||
"sponsorship",
|
||||
"orientation",
|
||||
"competitive",
|
||||
"illustrator",
|
||||
"outstanding",
|
||||
"interaction",
|
||||
"consistency",
|
||||
"touchpoints",
|
||||
"freshtechit",
|
||||
"recruitment",
|
||||
"catastrophe",
|
||||
"accountable",
|
||||
"workstreams",
|
||||
"scalability",
|
||||
"undertaking",
|
||||
"interacting",
|
||||
"significant",
|
||||
"considering",
|
||||
"independent",
|
||||
"collaborate",
|
||||
"arrangement",
|
||||
"unsolicited",
|
||||
"empowerment",
|
||||
"connections",
|
||||
"specialists",
|
||||
"credentials",
|
||||
"personality",
|
||||
"established",
|
||||
"northampton",
|
||||
"advertising",
|
||||
"operational",
|
||||
"mathematics",
|
||||
"contractors",
|
||||
"instruments",
|
||||
"referencing",
|
||||
"locationsco",
|
||||
"disciplines",
|
||||
"corporation",
|
||||
"investments",
|
||||
"conferences",
|
||||
"demonstrate",
|
||||
"directorate",
|
||||
"acknowledge",
|
||||
"legislation",
|
||||
"designgreat",
|
||||
"understands",
|
||||
"perspective",
|
||||
"association",
|
||||
"enforcement",
|
||||
"prestigious",
|
||||
"individuals",
|
||||
"alternative",
|
||||
"technically",
|
||||
"challenging",
|
||||
"discussions",
|
||||
"lifeworking",
|
||||
"interactive",
|
||||
"storyboards",
|
||||
"communicate",
|
||||
"abilitywork",
|
||||
"englishgood",
|
||||
"detailbonus",
|
||||
"angularwhat",
|
||||
"neededabout",
|
||||
"innovations",
|
||||
"enthusiasts",
|
||||
"instructors",
|
||||
"prospective",
|
||||
"comfortable",
|
||||
"involvement",
|
||||
"adventurous",
|
||||
"marketplace",
|
||||
"forecasting",
|
||||
"contractual",
|
||||
"underpinned",
|
||||
"acquisition",
|
||||
"microsoft’s",
|
||||
"progression",
|
||||
"suggestions",
|
||||
"proficiency",
|
||||
"participate",
|
||||
"joblocation",
|
||||
"methodology",
|
||||
"continually",
|
||||
"cataloguing",
|
||||
"projectgood",
|
||||
"incremental",
|
||||
"overarching",
|
||||
"confidently",
|
||||
"circulatory",
|
||||
"adjustments",
|
||||
"interesting",
|
||||
"consultants",
|
||||
"experienceb",
|
||||
"hourscasual",
|
||||
"switzerland",
|
||||
"contributes",
|
||||
"participant",
|
||||
"improvement",
|
||||
"articulates",
|
||||
"contributed",
|
||||
"comfortably",
|
||||
"deployments",
|
||||
"integrating",
|
||||
"configuring",
|
||||
"platforming",
|
||||
"educatedday",
|
||||
"contracting",
|
||||
"monthstotal",
|
||||
"outsourcing",
|
||||
"designswork",
|
||||
"ideasdesign",
|
||||
"deviceswork",
|
||||
"fundamental",
|
||||
"businessjob",
|
||||
"implemented",
|
||||
"transaction",
|
||||
"reliability",
|
||||
"upgradesyou",
|
||||
"uncertainty",
|
||||
"enterpriser",
|
||||
"teamprovide",
|
||||
"trafficking",
|
||||
"doubleclick",
|
||||
"communities",
|
||||
"‘forestlink",
|
||||
"dimensional",
|
||||
"coordinator",
|
||||
"spreadsheet",
|
||||
"pressurised",
|
||||
"assignments",
|
||||
"willingness",
|
||||
"certificate",
|
||||
"summaryrole",
|
||||
"institution",
|
||||
"segregation",
|
||||
"preparation",
|
||||
"electronics",
|
||||
"duplication",
|
||||
"surrounding",
|
||||
"informatica",
|
||||
"blackfriars",
|
||||
"terminology",
|
||||
"shabarinath",
|
||||
"interfacing",
|
||||
"expectation",
|
||||
"proprietary",
|
||||
"conflicting",
|
||||
"itecopeople",
|
||||
"opowershell",
|
||||
"submissions",
|
||||
"negotiating",
|
||||
"escalations",
|
||||
"transferred",
|
||||
"protections",
|
||||
"customizing",
|
||||
"oxfordshire",
|
||||
"progressive",
|
||||
"bishopsgate",
|
||||
"partnership",
|
||||
"futureheads",
|
||||
"permissions",
|
||||
"efficiently",
|
||||
"unspecified",
|
||||
"potentially",
|
||||
"disclaimers",
|
||||
"foreseeable",
|
||||
"sustainable",
|
||||
"calculation",
|
||||
"replication",
|
||||
"constitutes",
|
||||
"recommended",
|
||||
"enterprises",
|
||||
"negotiation",
|
||||
"imaginative",
|
||||
"differences",
|
||||
"nationality",
|
||||
"impediments",
|
||||
"refinements",
|
||||
"translating",
|
||||
"obligations",
|
||||
"flexibility",
|
||||
"unashamedly",
|
||||
"exclusively",
|
||||
"replacement",
|
||||
"essentially",
|
||||
"artifactory",
|
||||
"theoretical",
|
||||
"probability",
|
||||
"integrators",
|
||||
"contractor?",
|
||||
"interested?",
|
||||
"functioning",
|
||||
"chamberlain",
|
||||
"inclusivity",
|
||||
"iteratively",
|
||||
"enhancement",
|
||||
"constraints",
|
||||
"establishes",
|
||||
"qualitative",
|
||||
"influencing",
|
||||
"procurement",
|
||||
"experiences",
|
||||
"furthermore",
|
||||
"disciplined",
|
||||
"unnecessary",
|
||||
"bureaucracy",
|
||||
"represented",
|
||||
"siteimprove",
|
||||
"lokhandwala",
|
||||
"specialises",
|
||||
"rationalize",
|
||||
"competncies",
|
||||
"restoration",
|
||||
"allocations",
|
||||
"admittances",
|
||||
"furnishings",
|
||||
"cleanliness",
|
||||
"residential",
|
||||
"contactable",
|
||||
"conventions",
|
||||
"translation",
|
||||
"approaching",
|
||||
"intecselect",
|
||||
"linguistics",
|
||||
"southampton",
|
||||
"beautifully",
|
||||
"estimations",
|
||||
"newsletters",
|
||||
"summarising",
|
||||
"simulations",
|
||||
"portfolio's",
|
||||
"coronavirus",
|
||||
"opoortunity",
|
||||
"unavailable",
|
||||
"accordingly",
|
||||
"penetration",
|
||||
"remediation",
|
||||
"elimination",
|
||||
"achievement",
|
||||
"facilitator",
|
||||
"westminster",
|
||||
"introducing",
|
||||
"businesses'",
|
||||
"capitalists",
|
||||
"investigate",
|
||||
"countryside",
|
||||
"problematic",
|
||||
"coordinates",
|
||||
"components'",
|
||||
"supervision",
|
||||
"bonavolonta",
|
||||
"proposition",
|
||||
"foundations",
|
||||
"suitability",
|
||||
"researchers",
|
||||
"explanation",
|
||||
"commitments",
|
||||
"computation",
|
||||
"questioning",
|
||||
"experiments",
|
||||
"visualfiles",
|
||||
"cloudstream",
|
||||
"determining",
|
||||
"deliverable",
|
||||
"inquisitive",
|
||||
"backgrounds",
|
||||
"thoughtspot",
|
||||
"specialized",
|
||||
"veloppement",
|
||||
"importantes",
|
||||
"typedscript",
|
||||
"restaurants",
|
||||
"prophylaxis",
|
||||
"transmitted",
|
||||
"appointment",
|
||||
"encouraging",
|
||||
"aggregating",
|
||||
"championing",
|
||||
"conjunction",
|
||||
"customising",
|
||||
"photography",
|
||||
"authorities",
|
||||
"competition",
|
||||
"collections",
|
||||
"contraintes",
|
||||
"fonctionnel",
|
||||
"adaptabilit",
|
||||
"changements",
|
||||
"conceptions",
|
||||
"utilisation",
|
||||
"shortlisted",
|
||||
"reusability",
|
||||
"recognizing",
|
||||
"decisioning",
|
||||
"accommodate",
|
||||
"limitations",
|
||||
"resourceful",
|
||||
"algorithmic",
|
||||
"unconcerned",
|
||||
"intelligent",
|
||||
"considerate",
|
||||
"clientbased",
|
||||
"accelerator",
|
||||
"dreamweaver",
|
||||
"applicant's",
|
||||
"proactivity",
|
||||
"aggregation",
|
||||
"restriction",
|
||||
"traditional",
|
||||
"corporately",
|
||||
"memberships",
|
||||
"standardise",
|
||||
"theecsgroup",
|
||||
"scarchitect",
|
||||
"consolidate",
|
||||
"extensively",
|
||||
"afghanistan",
|
||||
"encompasses",
|
||||
"distinctive",
|
||||
"professions",
|
||||
"interviewed",
|
||||
"formulation",
|
||||
"transitions",
|
||||
"aspirations",
|
||||
"ingredients",
|
||||
"setterfield",
|
||||
"candidate’s",
|
||||
"leatherhead",
|
||||
"publication",
|
||||
"undoubtedly",
|
||||
"basingstoke",
|
||||
"underground",
|
||||
"reinsurance",
|
||||
"exemplifies",
|
||||
"civiization",
|
||||
"developer's",
|
||||
"bazzelgette",
|
||||
"adjacencies",
|
||||
"feasibility",
|
||||
"frontinvest",
|
||||
"neogotiable",
|
||||
"unconnected",
|
||||
"conditional",
|
||||
"bottlenecks",
|
||||
"productions",
|
||||
"pharmacists",
|
||||
"technicians",
|
||||
"prescribing",
|
||||
"stewardship",
|
||||
"recognising",
|
||||
"convictions",
|
||||
"subscribing",
|
||||
"transparent",
|
||||
"wireframing",
|
||||
"insidehmcts",
|
||||
"justicejobs",
|
||||
"criminology",
|
||||
"hospitality",
|
||||
"structuring",
|
||||
"educational",
|
||||
"substantive",
|
||||
"secondments",
|
||||
"transgender",
|
||||
"smartphones",
|
||||
"microsoft's",
|
||||
"definitions",
|
||||
"validations",
|
||||
"prioritised",
|
||||
"autoscaling",
|
||||
"abstraction",
|
||||
"correlation",
|
||||
"recognition",
|
||||
"contributor",
|
||||
"apigedevops",
|
||||
"incorporate",
|
||||
"woocommerce",
|
||||
"informatics",
|
||||
"adfadc@apps",
|
||||
"automations",
|
||||
"formulating",
|
||||
"beneficiary",
|
||||
"referential",
|
||||
"jsdevsecops",
|
||||
"solutioning",
|
||||
"measurement",
|
||||
"familiarise",
|
||||
"eligibility",
|
||||
"standardize",
|
||||
"experience?",
|
||||
"bournemouth",
|
||||
"implementer",
|
||||
"agilesphere",
|
||||
"assumptions",
|
||||
"accountancy",
|
||||
"cockroachdb",
|
||||
"promotional",
|
||||
"facilitates",
|
||||
"discoveries",
|
||||
"bladecenter",
|
||||
"considered!",
|
||||
"cooperation",
|
||||
"exploration",
|
||||
"angulareact",
|
||||
"preferabbly",
|
||||
"harmonising",
|
||||
"convenience",
|
||||
"inclusively",
|
||||
"strategists",
|
||||
"attribution",
|
||||
"fromscratch",
|
||||
"combination",
|
||||
"solutionize",
|
||||
"accelerated",
|
||||
"diagnostics",
|
||||
"sensibility",
|
||||
"informative",
|
||||
"intellegnce",
|
||||
"specilisits",
|
||||
"projections",
|
||||
"associative",
|
||||
"personalize",
|
||||
"farnborough",
|
||||
"necessarily",
|
||||
"nservicebus",
|
||||
"constrained",
|
||||
"prioritized",
|
||||
"behavioural",
|
||||
"chakraborty",
|
||||
"leaderships",
|
||||
"flourishing",
|
||||
"uniqstudios",
|
||||
"simplifying",
|
||||
"realisation",
|
||||
"extensions!",
|
||||
"prioritises",
|
||||
"experience!",
|
||||
"candidates!",
|
||||
"inclination",
|
||||
"stimulating",
|
||||
"appreciated",
|
||||
"reinventing",
|
||||
"compression",
|
||||
"jscybsecdev",
|
||||
"equirements",
|
||||
"generalized",
|
||||
"compressors",
|
||||
"assessments",
|
||||
"beyondtrust",
|
||||
"engagements",
|
||||
"numerically",
|
||||
"electricity",
|
||||
"interchange",
|
||||
"jsswift_dev",
|
||||
"circulating",
|
||||
"attachments",
|
||||
"credibility",
|
||||
"vnetpeering",
|
||||
"territories",
|
||||
"staggering!",
|
||||
"developers!",
|
||||
"peripherals",
|
||||
"virtualized",
|
||||
"bitdefender",
|
||||
"jssitecorjs",
|
||||
"positioning",
|
||||
"appreciates",
|
||||
"chessington",
|
||||
"controllers",
|
||||
"controlling",
|
||||
"quantifying",
|
||||
"virtualised",
|
||||
"manufacture",
|
||||
"fluorescent",
|
||||
"governments",
|
||||
"bigcommerce",
|
||||
"therapeutic",
|
||||
"importantly",
|
||||
"differently",
|
||||
"rigourously",
|
||||
"shareholder",
|
||||
"copywriting",
|
||||
"anticipated",
|
||||
"approximate",
|
||||
"behdarvandi",
|
||||
"testability",
|
||||
"beneficial!",
|
||||
"jswmibmcraw",
|
||||
"exhibitions",
|
||||
"talentpoint",
|
||||
"propagation",
|
||||
"interviews!",
|
||||
"solutionise",
|
||||
"elasticache",
|
||||
"manoeuvring",
|
||||
"teamservice",
|
||||
"geographies",
|
||||
"efficientip",
|
||||
"organically",
|
||||
"advancement",
|
||||
"jshodanular",
|
||||
"wholesalers",
|
||||
"multitenant",
|
||||
"encouraged?",
|
||||
"freelancers",
|
||||
"composition",
|
||||
"#jobswagger",
|
||||
"typographic",
|
||||
"stereotypes",
|
||||
"clerkenwell",
|
||||
"sacrificing",
|
||||
"resolutions",
|
||||
"technology?",
|
||||
"advantagous"
|
||||
]
|
22
mapbuilder.js
Normal file
22
mapbuilder.js
Normal file
@ -0,0 +1,22 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 27/07/2020
|
||||
* Time: 15:34
|
||||
|
||||
*/
|
||||
const jsonfile = require('jsonfile');
|
||||
|
||||
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt'];
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];

// Word -> weight lookup: 3 for every good word, -5 for every bad word.
const brain = new Map([]);

// Iterate the lists themselves so that the final entry of each
// ('mqtt' and 'shopify') is weighted as well.
for (const word of goodWords)
  brain.set(word, 3);

// Bad words are applied second, so a word on both lists ends up with -5.
for (const word of badWords)
  brain.set(word, -5);

// Spread the Map so it is written as a list of [word, weight] pairs.
jsonfile.writeFileSync('brain.json', [...brain]);
|
||||
|
156
migrate.js
Normal file
156
migrate.js
Normal file
@ -0,0 +1,156 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 22/07/2020
|
||||
* Time: 10:20
|
||||
|
||||
*/
|
||||
const db = require('./lib/connect');
|
||||
const log4js = require('log4js');
|
||||
const logger = log4js.getLogger();
|
||||
const { Utils } = require('@rakh/utils');
|
||||
|
||||
const { Corpus } = require('./lib/corpus');
|
||||
|
||||
const SHA = require('crypto-js/sha256');
|
||||
|
||||
/*
|
||||
|
||||
2604
|
||||
|
||||
const mongoose = require('mongoose');
|
||||
const log4js = require('log4js');
|
||||
const logger = log4js.getLogger();
|
||||
|
||||
const Jobs = require('./models/jobs');
|
||||
|
||||
require('dotenv').config();
|
||||
|
||||
logger.level = 'debug';
|
||||
|
||||
logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
|
||||
|
||||
mongoose.connect(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
|
||||
|
||||
const mDB = mongoose.connection;
|
||||
mDB.on('error', console.error.bind(console, 'connection error:'));
|
||||
*/
|
||||
|
||||
const Jobs = require('./lib/mongoManager');
|
||||
|
||||
const migrate = (function() {
|
||||
/**
 * Classify a salary / rate string by the size of the first number it contains.
 *
 * Currency symbols, letters, punctuation, the text "ir35" and a trailing ".d" / ".dd"
 * fraction are stripped before the first remaining figure is parsed.
 *
 * @param {string} inval raw salary text, e.g. '£400 - £450 per day'
 * @returns {number} 0 when no figure is found or it is below 100,
 * 1 from 100 up to (not including) 5000, 2 for 5000 and above
 */
function analyseRate(inval) {
  // A row without a salary would otherwise throw on `.replace`.
  if (inval === null || inval === undefined) return 0;

  const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
  const clearSpace = /\s+/g;

  const firstFigure = String(inval).replace(cleanerReg, '').replace(clearSpace, ' ').trim().split(' ')[0];
  const item = parseInt(firstFigure, 10);

  // NaN means nothing numeric survived the clean-up.
  if (Number.isNaN(item) || item < 100) return 0;

  // The lower bound is inclusive so that exactly 100 is classified with the 100-4999 band.
  if (item < 5000) return 1;

  return 2;
}
|
||||
/**
 * Convert one row of the old jobs table into the `{details, data}` shape handed to the jobs model.
 *
 * `details` holds the copied advert fields plus a SHA-256 of the summary; the title has
 * any "Featured" / "Premium" suffix that follows newlines or "↵" markers removed.
 * `data.read` is always reset to 0 (the row's own `read` value is not carried over),
 * `applied` is kept, `jobtype` comes from `analyseRate`, `autoclass` from `Corpus.process`,
 * and `timestamp` is the row's timestamp multiplied by 1000.
 *
 * @param d row from the old database
 * @returns {{details: {}, data: {read: number, applied: number, jobtype: number, class: number, autoclass: *, timestamp: number}}}
 */
function reduceData(d) {
  const copiedFields = ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp'];
  const details = Utils.extractFromObj(d, copiedFields);

  details.title = details.title
    .replace(/(\n+)(Featured|Premium)/gi, '')
    .replace(/((↵\s+)+)(Featured|Premium)/gi, '');
  details.hashed = SHA(details.summary);

  return {
    'details': details,
    'data': {
      'read': 0,
      'applied': d.applied || 0,
      'jobtype': analyseRate(d.salary),
      'class': 0,
      'autoclass': Corpus.process(d.summary),
      'timestamp': d.timestamp * 1000
    }
  };
}
|
||||
|
||||
/**
 * Reads every row of the legacy `jobs` table, left-joined with its `applied`
 * and `read` markers, ordered by ascending `_id`.
 *
 * @returns {Promise<Object[]>} Resolves with a copy of the row list; rejects
 *   with the error reported by `db.all`.
 */
function getCurrent() {
  console.log('get version');

  const sql = 'select jobs.*, applied.a as applied, read.d as read from jobs left join applied on applied.aid = jobs._id left join read on read.rid = jobs._id order by _id asc;';

  return new Promise((resolve, reject) => {
    db.all(sql, [], (err, rows) => {
      if (err) {
        // Stop here: without the return, the code below would touch `rows`
        // (undefined on failure) and throw inside the driver callback.
        reject(err);

        return;
      }

      resolve([...rows]);
    });
  });
}
|
||||
|
||||
/**
 * Migrates every legacy row into the Jobs collection, one save at a time, and
 * then asks the Corpus to export its unused words.
 *
 * A row whose save fails is logged and skipped so the rest of the migration
 * continues. A failure while loading or transforming rows aborts the run and
 * is logged.
 *
 * @returns {Promise<void>} Never rejects; all failures are logged.
 */
async function start() {
  // Duplicate-key errors expose `keyPattern`; fall back to the whole error so
  // other failures are not logged as `undefined`.
  const describe = (err) => err.keyPattern || err;

  try {
    const rows = await getCurrent();

    logger.debug(rows.length);

    // Iterate over *all* rows: the earlier `t < length - 1` bound dropped the last one.
    for (const row of rows) {
      const newJob = Jobs(reduceData(row));

      try {
        const saved = await newJob.save();

        logger.debug('m', saved.details.title);
      }
      catch (err) {
        logger.error(describe(err));
      }
    }

    logger.debug('SAVING!!');
    Corpus.exportUnused();
  }
  catch (err) {
    logger.error(describe(err));
  }
}
|
||||
|
||||
/**
 * Deletes jobs whose `data.timestamp` is more than 14 days old and whose
 * `data.applied` is 0.
 *
 * @returns {Promise<void>} Resolves once the delete has finished; failures are
 *   logged rather than thrown.
 */
async function deleteOld() {
  const oneDay = 86400000;
  const twoWeeksAgo = Date.now() - (14 * oneDay);
  const query = { 'data.timestamp': { '$lt': twoWeeksAgo }, 'data.applied': 0 };

  logger.debug('Delete older than: ', new Date(twoWeeksAgo), twoWeeksAgo);
  logger.debug(query);

  try {
    // Await the delete so callers only continue once it has actually completed.
    const m = await Jobs.deleteMany(query);

    logger.debug('m', m);
  }
  catch (err) {
    logger.error(err);
  }
}
|
||||
|
||||
// newJob.find({ 'data': { 'timestamp': { '$lt': 1587034346000 } } });
|
||||
|
||||
return {
|
||||
'start':start,
|
||||
'deleteOld': deleteOld
|
||||
};
|
||||
})();
|
||||
|
||||
// Script entry point: run the migration, then prune old jobs, then log completion.
(async () => {
  await migrate.start();
  await migrate.deleteOld();

  logger.info('Done??');
})();
|
47
models/jobs.js
Normal file
47
models/jobs.js
Normal file
@ -0,0 +1,47 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 22/07/2020
|
||||
* Time: 14:18
|
||||
|
||||
*/
|
||||
const mongoose = require('mongoose');
|
||||
const Schema = mongoose.Schema;
|
||||
|
||||
// Field-option factories. Each call returns a fresh options object so that no
// two schema paths share the same object instance.
const requiredString = () => ({ 'type': String, 'required': true });
const uniqueRequiredString = () => ({ 'type': String, 'required': true, 'unique': true });
const numberDefaultZero = () => ({ 'type': Number, 'default': 0 });

// Advert fields; `url` and `hashed` are both required and unique.
const jobDetails = {
  'title': requiredString(),
  'site': requiredString(),
  'url': uniqueRequiredString(),
  'id': String,
  'summary': String,
  'company': String,
  'location': String,
  'postdate': String,
  'salary': String,
  'easyapply': Number,
  'timestamp': Number,
  'hashed' : uniqueRequiredString()
};

// Per-job state and classification results.
const jobData = {
  'read': numberDefaultZero(),
  'applied': numberDefaultZero(),
  'jobtype': numberDefaultZero(),
  'class': numberDefaultZero(),
  'autoclass': {
    'good': Array,
    'bad': Array,
    'words': Array,
    'score': numberDefaultZero()
  },
  'timestamp': numberDefaultZero(),
  'created_at': { 'type': Date, 'default': Date.now }
};

const jobSchema = new Schema({
  'details': jobDetails,
  'data': jobData
});

mongoose.set('useFindAndModify', false);

const Jobs = mongoose.model('Jobs', jobSchema);

module.exports = Jobs;
|
66
onetime.js
Normal file
66
onetime.js
Normal file
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 16/04/2020
|
||||
* Time: 23:35
|
||||
|
||||
*/
|
||||
const CronJob = require('cron').CronJob;
|
||||
const IndeedScraper = require('./scrapers/indeed');
|
||||
const TotaljobsScraper = require('./scrapers/totaljobs');
|
||||
const CwjobsScraper = require('./scrapers/cwjobs');
|
||||
const JobserveScraper = require('./scrapers/rss.jobserve');
|
||||
const RssS1Jobs = require('./scrapers/rss.s1jobs');
|
||||
const RssTechnojobs = require('./scrapers/rss.technojobs');
|
||||
|
||||
// One-off run: scrape every site for every location, then every RSS feed,
// strictly one request sequence at a time.
(async () => {
  console.log('Started..');

  const indeedScraper = new IndeedScraper();
  const totaljobsScraper = new TotaljobsScraper();
  const cwjobsScraper = new CwjobsScraper();
  const jobserveScraper = new JobserveScraper();
  const s1jobsScraper = new RssS1Jobs();
  const technojobsScraper = new RssTechnojobs();

  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];
  const siteScrapers = [indeedScraper, totaljobsScraper, cwjobsScraper];

  const jobserveFeeds = [
    'https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss',
    'https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss',
    'https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss',
    'https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss',
    'https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss',
    'https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss',
    'https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss',
    'https://www.jobserve.com/MySearch/C7B25D86D0844A.rss',
    'https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss',
    'https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss',
    'https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss',
    'https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss',
    'https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss', // javascript node 2 Jul 2020
    'https://www.jobserve.com/MySearch/4C67595E323E3453.rss', // vuejs 2 Jul 2020
    'https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss' // svelte 2 Jul 2020
  ];

  const s1jobsFeeds = [
    'http://www.s1jobs.com/xml/m7dp711z2r.xml',
    'http://www.s1jobs.com/xml/pfvf7o7z2r.xml',
    'http://www.s1jobs.com/xml/lluqnt8z2r.xml',
    'http://www.s1jobs.com/xml/tu33qt8z2r.xml',
    'http://www.s1jobs.com/xml/u3btnz8z2r.xml',
    'http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml',
    'http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml'
  ];

  const technojobsFeeds = [
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1',
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1',
    'https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1'
  ];

  // Location-major order: indeed, totaljobs, cwjobs for london, then glasgow, ...
  for (const location of locations) {
    for (const scraper of siteScrapers) {
      await scraper.go(location);
    }
  }

  for (const feed of jobserveFeeds) {
    await jobserveScraper.go(feed);
  }

  for (const feed of s1jobsFeeds) {
    await s1jobsScraper.go(feed);
  }

  for (const feed of technojobsFeeds) {
    await technojobsScraper.go(feed);
  }
})();
|
1294
package-lock.json
generated
1294
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -9,15 +9,21 @@
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@rakh/utils": "^1.0.0",
|
||||
"axios": "^0.19.2",
|
||||
"bayes": "^1.0.0",
|
||||
"body-parser": "^1.19.0",
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"cron": "^1.8.2",
|
||||
"crypto-js": "^4.0.0",
|
||||
"dotenv": "^8.2.0",
|
||||
"eslint": "^6.8.0",
|
||||
"express": "^4.17.1",
|
||||
"fecha": "^4.2.0",
|
||||
"got": "^11.2.0",
|
||||
"jsonfile": "^6.0.1",
|
||||
"log4js": "^6.3.0",
|
||||
"mongoose": "^5.9.25",
|
||||
"present": "^1.0.0",
|
||||
"rss-parser": "^3.8.0",
|
||||
"sqlite3": "^4.1.1",
|
||||
|
45
preload.js
Normal file
45
preload.js
Normal file
@ -0,0 +1,45 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 28/07/2020
|
||||
* Time: 10:51
|
||||
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
var bayes = require('bayes');
|
||||
|
||||
var classifier = bayes({
|
||||
'tokenizer': function (text) {
|
||||
return text.split(',');
|
||||
}
|
||||
});
|
||||
|
||||
// teach it positive phrases
|
||||
|
||||
/**
 * Seeds the module-level classifier with hand-picked "good" and "bad" words,
 * prints two sample categorisations, and writes the serialised classifier
 * state to `brain.json`.
 *
 * @returns {Promise<void>}
 */
async function load() {
  const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt', 'es6', 'es2016', 'es2017', 'es2018', 'freelance'];
  const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];

  // Teach every word: the earlier `i < length - 1` bound skipped the final
  // entry of each list ('freelance' and 'shopify').
  for (const word of goodWords) {
    await classifier.learn(word, 'good');
  }

  for (const word of badWords) {
    await classifier.learn(word, 'bad');
  }

  // now ask it to categorize a document it has never seen before
  console.log(await classifier.categorize(['ui', 'developer', 'london', 'react'].join(',')));
  console.log(await classifier.categorize(['mysql', 'react', 'js', 'node', 'docker', 'kubernetes', 'google'].join(',')));

  // serialize the classifier's state as a JSON string.
  const stateJson = classifier.toJson();

  console.log(stateJson);

  fs.writeFileSync('brain.json', stateJson);
}
|
||||
|
||||
load();
|
@ -133,12 +133,15 @@ class IndeedScraper extends MasterScraper {
|
||||
await this.filterAdverts();
|
||||
|
||||
await this.addToDB();
|
||||
await this.addToMongo();
|
||||
}
|
||||
|
||||
async go(location = 'london') {
|
||||
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
|
||||
|
||||
await this.processSite();
|
||||
await this.processSite().catch((err) => {
|
||||
console.error('Indeed Go', err);
|
||||
});
|
||||
|
||||
console.log(`Indeed ${location} completed`);
|
||||
}
|
||||
|
@ -140,6 +140,7 @@ class IndeedMobileScraper extends MasterScraper {
|
||||
await this.filterAdverts();
|
||||
|
||||
await this.addToDB();
|
||||
await this.addToMongo();
|
||||
}
|
||||
|
||||
async go(location = 'london') {
|
||||
|
@ -22,7 +22,10 @@ class TotaljobsScraper extends MasterScraper {
|
||||
}
|
||||
|
||||
// Site specific parts below here
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async breakPage() {
|
||||
const $ = this.currentPage;
|
||||
const ads = [];
|
||||
@ -39,6 +42,11 @@ class TotaljobsScraper extends MasterScraper {
|
||||
this.items = [...this.items, ...ads];
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param part
|
||||
* @returns {Promise<{}>}
|
||||
*/
|
||||
async extractDetails(part) {
|
||||
const newObj = {};
|
||||
const $part = cheerio.load(part);
|
||||
@ -61,6 +69,11 @@ class TotaljobsScraper extends MasterScraper {
|
||||
return newObj;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param item
|
||||
* @returns {Promise<*>}
|
||||
*/
|
||||
async getIndividualPage(item) {
|
||||
const newItem = {...item};
|
||||
console.log('Getting', item.url);
|
||||
@ -75,6 +88,10 @@ class TotaljobsScraper extends MasterScraper {
|
||||
return newItem;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async getJobPages() {
|
||||
const newItems = [];
|
||||
for (let item of this.items) {
|
||||
@ -86,6 +103,10 @@ class TotaljobsScraper extends MasterScraper {
|
||||
this.items = [...newItems];
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async checkNext() {
|
||||
const $ = this.currentPage;
|
||||
const next = $('.pagination > *:last-child').attr('href') || '';
|
||||
@ -96,6 +117,10 @@ class TotaljobsScraper extends MasterScraper {
|
||||
console.log(next);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processSite() {
|
||||
console.log('Processing...');
|
||||
|
||||
@ -121,8 +146,14 @@ class TotaljobsScraper extends MasterScraper {
|
||||
await this.filterAdverts();
|
||||
|
||||
await this.addToDB();
|
||||
await this.addToMongo();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param location
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async go(location = 'london') {
|
||||
this.setStartUrl(`https://www.totaljobs.com/jobs/contract/html-or-vue-or-vuejs-or-web-or-sql-or-delphi-or-vb-or-vbscript-or-php-or-ajax-or-mysql-or-sqlserver-or-javascript-or-node-or-nodejs-or-svelte-or-sveltejs-not-react/in-${encodeURIComponent(location)}?q=Html+Or+Vue+Or+Vuejs+Or+Web+Or+Sql+Or+Delphi+Or+Vb+Or+Vbscript+Or+Php+Or+Ajax+Or+Mysql+Or+Sqlserver+Or+Javascript+Or+Node+Or+Nodejs+Or+Svelte+Or+Sveltejs+NOT+React&postedwithin=3&radius=20`);
|
||||
// this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+nodejs&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=7&limit=10&sort=date&psf=advsrch&from=advancedsearch');
|
||||
|
124
server/controllers/jobs.v2.controller.js
Normal file
124
server/controllers/jobs.v2.controller.js
Normal file
@ -0,0 +1,124 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 24/07/2020
|
||||
* Time: 11:45
|
||||
|
||||
*/
|
||||
const Jobs = require('../../lib/mongoManager');
|
||||
const { Utils } = require('@rakh/utils');
|
||||
|
||||
const killNLDoubleSpace = /(\\n)\s{2,}|(\\n)|\s{2,}/g;
|
||||
|
||||
/**
 * Flattens job documents into the compact records used by the list view.
 *
 * @param {Object[]|null} [data] Job documents, each with `details`, `data` and `_id`.
 * @returns {Object[]|string} One flat record per document (selected detail and
 *   data fields plus `_id`), or '' when no list was supplied.
 */
function reduceList(data) {
  // Treat a missing, null or undefined list alike; the previous
  // `arguments`-based check let an explicit `undefined` through to `.map`.
  if (data === null || data === undefined) return '';

  const listFields = ['title', 'site', 'company', 'timestamp', 'read', 'applied', 'jobtype', 'class', 'autoclass'];

  return data.map((v) => {
    const o = Utils.extractFromObj({ ...v.details, ...v.data, _id: v._id }, listFields);

    o._id = v._id;

    return o;
  });
}
|
||||
|
||||
/**
 * Flattens a single job record: the `details` fields move to the top level,
 * alongside the untouched `data` object and the record's `_id`.
 *
 * @param {Object} record Raw job record with `details`, `data` and `_id`.
 * @returns {Object} New object; `data` is the same reference as `record.data`.
 */
function reduceRecord(record) {
  const { details, data, _id } = record;

  return Object.assign({}, details, { data, _id });
}
|
||||
|
||||
exports.getList = (req, res) => {
|
||||
console.log('>getList req', req.params);
|
||||
|
||||
Jobs.find({}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).sort( { 'data.timestamp': -1 } ).then((doc) => {
|
||||
if (doc) {
|
||||
|
||||
res.send(reduceList(doc));
|
||||
}
|
||||
}).catch((err) => {
|
||||
console.error(err.message);
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
exports.getJob = (req, res) => {
|
||||
console.log('>getJob req', req.params);
|
||||
|
||||
if(!req.params.id)
|
||||
return res.status(500).send({
|
||||
'message': 'Job id missing'
|
||||
});
|
||||
|
||||
const id = req.params.id;
|
||||
|
||||
Jobs.findById(id).then((doc) => {
|
||||
if (doc) {
|
||||
|
||||
const item = reduceRecord(doc._doc);
|
||||
const date = new Date( item.timestamp * 1000);
|
||||
|
||||
console.log(item);
|
||||
item.date = date.toLocaleString();
|
||||
item.title = item.title.replace(killNLDoubleSpace, ' ');
|
||||
|
||||
res.send(item);
|
||||
}
|
||||
}).catch((err) => {
|
||||
console.error(err.message);
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
exports.readJob = (req, res) => {
|
||||
console.log('>readJob req', req.params);
|
||||
|
||||
let id;
|
||||
if(!req.params.id)
|
||||
return res.status(500).send({
|
||||
'message': 'Job id missing'
|
||||
});
|
||||
else
|
||||
id = req.params.id;
|
||||
|
||||
Jobs.findById(id).then((doc) => {
|
||||
if (doc) {
|
||||
|
||||
let fullDoc = Object.assign({}, doc._doc);
|
||||
|
||||
console.log('fullDoc', fullDoc);
|
||||
|
||||
if (!Utils.isEmpty(fullDoc)){
|
||||
fullDoc.data.read = new Date().getTime();
|
||||
|
||||
Jobs.findByIdAndUpdate(id, fullDoc, {'new':true}).then((doc) => {
|
||||
console.log(doc._doc);
|
||||
res.status(200).end();
|
||||
}).catch((err) => {
|
||||
console.error('inside',err.message);
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
}).catch((err) => {
|
||||
console.error('outer', err.message);
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
};
|
||||
|
||||
|
89
server/controllers/vote.controller.js
Normal file
89
server/controllers/vote.controller.js
Normal file
@ -0,0 +1,89 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 28/07/2020
|
||||
* Time: 11:08
|
||||
|
||||
*/
|
||||
const Jobs = require('../../lib/mongoManager');
|
||||
const { Utils } = require('@rakh/utils');
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
var bayes = require('bayes');
|
||||
|
||||
var classifier = bayes({
|
||||
'tokenizer': function (text) {
|
||||
return text.split(',');
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Replaces the module-level classifier with one rebuilt from the state stored
 * in `brain.json` (read synchronously).
 *
 * NOTE(review): the custom tokenizer passed when `classifier` was first built
 * is a function and presumably does not survive toJson()/fromJson(); confirm
 * whether it needs re-attaching here.
 */
function load() {
  classifier = bayes.fromJson(fs.readFileSync('brain.json'));
}
|
||||
|
||||
/**
 * Serialises the module-level classifier, echoes the JSON to the console and
 * writes it synchronously to `brain.json`.
 */
function save() {
  const serialised = classifier.toJson();

  console.log(serialised);
  fs.writeFileSync('brain.json', serialised);
}
|
||||
|
||||
load();
|
||||
|
||||
exports.upvote = (req, res) => {
|
||||
console.log('>upvote req', req.params);
|
||||
|
||||
if(!req.params.id)
|
||||
return res.status(500).send({
|
||||
'message': 'Job id missing'
|
||||
});
|
||||
|
||||
const id = req.params.id;
|
||||
|
||||
Jobs.findById(id).then(async (doc) => {
|
||||
if (doc) {
|
||||
const words = doc._doc.data.autoclass.words.join(',');
|
||||
|
||||
await classifier.learn(words, 'good');
|
||||
|
||||
save();
|
||||
res.status(200).end();
|
||||
}
|
||||
}).catch((err) => {
|
||||
console.error(err.message);
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
exports.downvote = (req, res) => {
|
||||
console.log('>upvote req', req.params);
|
||||
|
||||
if(!req.params.id)
|
||||
return res.status(500).send({
|
||||
'message': 'Job id missing'
|
||||
});
|
||||
|
||||
const id = req.params.id;
|
||||
|
||||
Jobs.findById(id).then(async (doc) => {
|
||||
if (doc) {
|
||||
const words = doc._doc.data.autoclass.words.join(',');
|
||||
|
||||
await classifier.learn(words, 'bad');
|
||||
|
||||
save();
|
||||
res.status(200).end();
|
||||
}
|
||||
}).catch((err) => {
|
||||
console.error(err.message);
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
};
|
2
server/dist/build/bundle.css
vendored
2
server/dist/build/bundle.css
vendored
File diff suppressed because one or more lines are too long
6
server/dist/build/bundle.css.map
vendored
6
server/dist/build/bundle.css.map
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js
vendored
2
server/dist/build/bundle.js
vendored
File diff suppressed because one or more lines are too long
2
server/dist/build/bundle.js.map
vendored
2
server/dist/build/bundle.js.map
vendored
File diff suppressed because one or more lines are too long
17
server/routes/jobs.v2.route.js
Normal file
17
server/routes/jobs.v2.route.js
Normal file
@ -0,0 +1,17 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 24/07/2020
|
||||
* Time: 11:42
|
||||
|
||||
*/
|
||||
const jobs = require('../controllers/jobs.v2.controller');
|
||||
|
||||
module.exports = (app) => {
|
||||
app.route('/v2/jobs')
|
||||
.get(jobs.getList);
|
||||
|
||||
app.route('/v2/jobs/:id')
|
||||
.get(jobs.getJob)
|
||||
.put(jobs.readJob);
|
||||
};
|
17
server/routes/vote.route.js
Normal file
17
server/routes/vote.route.js
Normal file
@ -0,0 +1,17 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 28/07/2020
|
||||
* Time: 11:07
|
||||
|
||||
*/
|
||||
|
||||
const vote = require('../controllers/vote.controller');
|
||||
|
||||
module.exports = (app) => {
|
||||
app.route('/vote/up/:id')
|
||||
.put(vote.upvote);
|
||||
|
||||
app.route('/vote/down/:id')
|
||||
.put(vote.downvote);
|
||||
};
|
@ -58,7 +58,9 @@ app.use(bodyParser.json());
|
||||
app.post('/auth', auth.auth);
|
||||
|
||||
require('./routes/jobs.route')(app);
|
||||
require('./routes/jobs.v2.route')(app);
|
||||
require('./routes/apply.route')(app);
|
||||
require('./routes/vote.route')(app);
|
||||
|
||||
app.listen(serverPort, () => {
|
||||
console.log(`Server is listening on port ${serverPort}`);
|
||||
|
1986
test/indeed-2020-07-22--051214.html
Normal file
1986
test/indeed-2020-07-22--051214.html
Normal file
File diff suppressed because one or more lines are too long
@ -20,7 +20,7 @@ const indeedScraper = new IndeedScraper();
|
||||
// const page = fs.readFileSync('data/indeed/indeed-2020-04-16--092311.html');
|
||||
const page = fs.readFileSync('data/indeed/page2.html');
|
||||
|
||||
test.test('Test Indeed scraper', async t => {
|
||||
test.skip('Test Indeed scraper', async t => {
|
||||
const $ = cheerio.load(page);
|
||||
|
||||
indeedScraper.loadPage($);
|
||||
@ -35,13 +35,36 @@ test.test('Test Indeed scraper', async t => {
|
||||
|
||||
await indeedScraper.filterAdverts();
|
||||
|
||||
// await indeedScraper.addToDB();
|
||||
await indeedScraper.addToMongo();
|
||||
|
||||
t.end();
|
||||
});
|
||||
|
||||
test.test('Test full run Indeed scraper', async t => {
|
||||
await indeedScraper.go('london');
|
||||
test.skip('Test full run Indeed scraper', async t => {
|
||||
await indeedScraper.go('london').catch((err) => {
|
||||
console.error('Indeed GO', err);
|
||||
});
|
||||
|
||||
t.end();
|
||||
});
|
||||
|
||||
|
||||
test.test('Test Indeed scraper -- MONGO', async t => {
|
||||
const $ = cheerio.load(page);
|
||||
|
||||
indeedScraper.loadPage($);
|
||||
|
||||
await indeedScraper.breakPage();
|
||||
|
||||
// await indeedScraper.getJobPages();
|
||||
|
||||
// console.log(await indeedScraper.checkNext());
|
||||
|
||||
// console.log(indeedScraper.items);
|
||||
|
||||
// await indeedScraper.filterAdverts();
|
||||
|
||||
await indeedScraper.addToMongo();
|
||||
|
||||
t.end();
|
||||
});
|
||||
|
@ -26,13 +26,14 @@ const s1jobsScraper = new RssS1Jobs();
|
||||
const feed = fs.readFileSync('test/data/s1jobs/m7dp711z2r.xml');
|
||||
|
||||
test.test('Test Jobserve scraper', async t => {
|
||||
let url = 'http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml';
|
||||
await s1jobsScraper.setStartUrl(url);
|
||||
|
||||
|
||||
s1jobsScraper.reduceItems();
|
||||
|
||||
await s1jobsScraper.filterAdverts();
|
||||
await s1jobsScraper.addToDB();
|
||||
// await s1jobsScraper.addToDB();
|
||||
|
||||
t.end();
|
||||
});
|
||||
|
@ -19,17 +19,17 @@ const testScraper = new RssTechnojobs();
|
||||
const feed = fs.readFileSync('test/data/technojobs/page1');
|
||||
|
||||
test.test('Test Technojobs scraper', async t => {
|
||||
// await testScraper.loadFeed(feed);
|
||||
await testScraper.loadFeed('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
|
||||
|
||||
// testScraper.reduceItems();
|
||||
await testScraper.reduceItems();
|
||||
|
||||
// await s1jobsScraper.filterAdverts();
|
||||
await s1jobsScraper.filterAdverts();
|
||||
// await s1jobsScraper.addToDB();
|
||||
|
||||
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
||||
/* await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
||||
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
||||
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
|
||||
|
||||
*/
|
||||
|
||||
t.end();
|
||||
});
|
||||
|
@ -22,20 +22,20 @@ console.log(`${__dirname}`);
|
||||
const page = fs.readFileSync(`${__dirname}/data/totaljobs/totaljobs-2020-04-16--121504.html`);
|
||||
|
||||
test.test('Test Totaljobs scraper', async t => {
|
||||
const $ = cheerio.load(page);
|
||||
const $ = cheerio.load(page);
|
||||
|
||||
totaljobsScraper.loadPage($);
|
||||
totaljobsScraper.loadPage($);
|
||||
|
||||
await totaljobsScraper.breakPage();
|
||||
await totaljobsScraper.breakPage();
|
||||
|
||||
await totaljobsScraper.getJobPages();
|
||||
// console.log(await indeedScraper.checkNext());
|
||||
await totaljobsScraper.getJobPages();
|
||||
// console.log(await indeedScraper.checkNext());
|
||||
|
||||
console.log(totaljobsScraper.items);
|
||||
// console.log(totaljobsScraper.items);
|
||||
|
||||
await totaljobsScraper.filterAdverts();
|
||||
await totaljobsScraper.filterAdverts();
|
||||
|
||||
// await totaljobsScraper.addToDB();
|
||||
// await totaljobsScraper.addToDB();
|
||||
|
||||
t.end();
|
||||
t.end();
|
||||
});
|
||||
|
14
test/wip.js
Normal file
14
test/wip.js
Normal file
@ -0,0 +1,14 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 23/07/2020
|
||||
* Time: 09:26
|
||||
|
||||
*/
|
||||
|
||||
const { Corpus } = require('../lib/corpus');
|
||||
|
||||
const text = 'ESTAMP DEVELOPER 6 month contract £450-525 / day Developer, SQL, Photoshop, Javascript, … NET, C#, Javascript Advanced knowledge of SQL Server TSQL Experience of the design and … PDF stamp development E-STAMP DEVELOPER 6 month contract';
|
||||
const out = Corpus.process(text);
|
||||
|
||||
console.log(out);
|
71
testgrabber.js
Normal file
71
testgrabber.js
Normal file
@ -0,0 +1,71 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 16/04/2020
|
||||
* Time: 23:35
|
||||
|
||||
*/
|
||||
const CronJob = require('cron').CronJob;
|
||||
const IndeedScraper = require('./scrapers/indeed');
|
||||
const TotaljobsScraper = require('./scrapers/totaljobs');
|
||||
const CwjobsScraper = require('./scrapers/cwjobs');
|
||||
const JobserveScraper = require('./scrapers/rss.jobserve');
|
||||
const RssS1Jobs = require('./scrapers/rss.s1jobs');
|
||||
const RssTechnojobs = require('./scrapers/rss.technojobs');
|
||||
|
||||
// Test run: scrape the three HTML sites for every location, one at a time.
(async () => {
  console.log('Started..');

  const indeedScraper = new IndeedScraper();
  const totaljobsScraper = new TotaljobsScraper();
  const cwjobsScraper = new CwjobsScraper();
  // The RSS scrapers are still constructed, but none of their go() calls are
  // active in this script (the feed runs were commented out).
  const jobserveScraper = new JobserveScraper();
  const s1jobsScraper = new RssS1Jobs();
  const technojobsScraper = new RssTechnojobs();

  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];
  const siteScrapers = [indeedScraper, totaljobsScraper, cwjobsScraper];

  // Location-major order: indeed, totaljobs, cwjobs for london, then glasgow, ...
  for (const location of locations) {
    for (const scraper of siteScrapers) {
      await scraper.go(location);
    }
  }
})();
|
1
unused.json
Normal file
1
unused.json
Normal file
File diff suppressed because one or more lines are too long
22
words.js
Normal file
22
words.js
Normal file
@ -0,0 +1,22 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 27/07/2020
|
||||
* Time: 10:08
|
||||
|
||||
*/
|
||||
|
||||
const jsonfile = require('jsonfile');
|
||||
|
||||
const data = require('./unused.json');
|
||||
|
||||
/**
 * Writes to `limited.json` the distinct entries of `data` whose `length`
 * equals `size`, then logs 'done'.
 *
 * @param {number} size Exact length an entry must have to be kept.
 */
function show(size) {
  const matching = data.filter((entry) => entry.length === size);
  const distinct = Array.from(new Set(matching));

  jsonfile.writeFileSync('limited.json', distinct);
  console.log('done');
}
|
||||
|
||||
show(11);
|
Loading…
Reference in New Issue
Block a user