Merge branch 'JOBSCRAPER-1' into 'development'

Resolve JOBSCRAPER-1

See merge request martind2000/jobscraper!1
Martin Donnelly 2020-09-10 13:14:18 +00:00
commit 1513ea5010
42 changed files with 14399 additions and 553 deletions

32
.edditorconfig Normal file

@@ -0,0 +1,32 @@
; http://editorconfig.org
root = true
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
indent_style = space
indent_size = 2
[*.txt]
insert_final_newline = false
trim_trailing_whitespace = false
[*.py]
indent_size = 4
[*.m]
indent_size = 4
[Makefile]
indent_style = tab
indent_size = 8
[*.{js,json}]
indent_style = space
indent_size = 2
[*.md]
trim_trailing_whitespace = false


@@ -9,7 +9,7 @@
"env": {
"browser": true,
"node": true,
"es6": true
"es2017": true
},
"rules": {
"arrow-spacing": "error",

1
.gitignore vendored

@@ -147,3 +147,4 @@ fabric.properties
/live/
!/output/
/db/jobs.db
!/db/

1
biglist.json Normal file

File diff suppressed because one or more lines are too long

204
brain.json Normal file

@@ -0,0 +1,204 @@
{
"categories": {
"good": true,
"bad": true
},
"docCount": {
"good": 43,
"bad": 5
},
"totalDocuments": 48,
"vocabulary": {
"tsql": true,
"developer": true,
"contract": true,
"web": true,
"javascript": true,
"js": true,
"node": true,
"es": true,
"agile": true,
"nodejs": true,
"london": true,
"aws": true,
"sql": true,
"postgresql": true,
"mysql": true,
"docker": true,
"ecs": true,
"automation": true,
"jslint": true,
"jshint": true,
"vuejs": true,
"vue": true,
"nginx": true,
"remotely": true,
"mvc": true,
"remote": true,
"iot": true,
"mqtt": true,
"es6": true,
"es2016": true,
"es2017": true,
"es2018": true,
"react": true,
"redux": true,
"graphql": true,
"java": true,
"reactjs": true,
"apps": true,
"html": true,
"css": true,
"code": true,
"angular": true,
"ember": true,
"restful": true,
"apis": true,
"infrastructure": true,
"software": true,
"native": true,
"med": true,
"mobile": true,
"client": true,
"applications": true,
"digital": true,
"analytics": true,
"dashboarding": true,
"online": true,
"analyse": true,
"dashboards": true,
"google": true,
"query": true,
"data": true,
"stakeholders": true,
"enhancements": true,
"requirements": true,
"c": true,
"net": true,
"technologies": true,
"azure": true,
"understanding": true,
"devops": true,
"tools": true,
"frameworks": true,
"scotland": true,
"responsibility": true,
"programme": true,
"functions": true,
"asp": true,
"project": true,
"transform": true,
"collaborative": true,
"technical": true,
"framework": true,
"nhibernate": true,
"server": true,
"api": true,
"development": true,
"lifecycle": true,
"specification": true,
"appointments": true
},
"vocabularySize": 89,
"wordCount": {
"good": 157,
"bad": 5
},
"wordFrequencyCount": {
"good": {
"tsql": 1,
"developer": 6,
"contract": 9,
"web": 6,
"javascript": 7,
"js": 3,
"node": 2,
"es": 1,
"agile": 2,
"nodejs": 1,
"london": 3,
"aws": 3,
"sql": 3,
"postgresql": 1,
"mysql": 1,
"docker": 1,
"ecs": 1,
"automation": 1,
"jslint": 1,
"jshint": 1,
"vuejs": 1,
"vue": 2,
"nginx": 1,
"remotely": 1,
"mvc": 5,
"remote": 2,
"iot": 1,
"mqtt": 1,
"es6": 1,
"es2016": 1,
"es2017": 1,
"es2018": 1,
"apps": 1,
"html": 5,
"css": 5,
"code": 2,
"react": 2,
"angular": 1,
"ember": 1,
"restful": 1,
"apis": 1,
"infrastructure": 1,
"software": 2,
"native": 1,
"med": 1,
"mobile": 1,
"client": 4,
"applications": 2,
"digital": 2,
"analytics": 1,
"dashboarding": 1,
"online": 1,
"analyse": 1,
"dashboards": 1,
"google": 1,
"query": 1,
"data": 1,
"stakeholders": 1,
"enhancements": 3,
"requirements": 3,
"c": 4,
"net": 5,
"technologies": 4,
"azure": 2,
"understanding": 1,
"devops": 2,
"tools": 1,
"frameworks": 1,
"scotland": 1,
"responsibility": 1,
"programme": 1,
"functions": 1,
"asp": 1,
"project": 1,
"transform": 1,
"collaborative": 1,
"technical": 1,
"framework": 1,
"nhibernate": 1,
"server": 1,
"api": 1,
"development": 1,
"lifecycle": 1,
"specification": 1,
"appointments": 1
},
"bad": {
"react": 1,
"redux": 1,
"graphql": 1,
"java": 1,
"reactjs": 1
}
},
"options": {}
}
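
For reference, this brain.json is the serialized state of the `bayes` naive-Bayes classifier: preload.js (further down) writes it with classifier.toJson(), and the vote controller reloads and retrains it. A minimal sketch, not part of the commit, of reading the state back and classifying a keyword list:

const fs = require('fs');
const bayes = require('bayes');

// Restore the classifier from the serialized state above, the same way vote.controller.js does.
const classifier = bayes.fromJson(fs.readFileSync('brain.json', 'utf8'));

(async () => {
  // Resolves to 'good' or 'bad' based on the stored word frequencies.
  console.log(await classifier.categorize(['javascript', 'node', 'contract', 'remote'].join(',')));
})();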

Binary file not shown.


@@ -8,6 +8,12 @@
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
const dbmanager = require('../lib/dbmanager');
const JobsModel = require('../lib/mongoManager');
const SHA = require('crypto-js/sha256');
const { Utils } = require('@rakh/utils');
const { Corpus } = require('./corpus');
class MasterBase {
@@ -57,6 +63,79 @@ class MasterBase {
});
}
/**
* Save the scraped items to MongoDB, one Jobs document per item.
*/
addToMongo() {
console.log('>> ADD TO MONGO!');
for(const item of this.items) {
// console.log('add', item);
const newObj = this.reduceData(item);
const newJob = new JobsModel(newObj);
newJob.save().then((m) => {
console.log('m', m.details.title);
}).catch((err) => {
console.error('m', err);
});
}
}
/**
* Bucket a salary string by its first numeric value: 0 if under 100 (or no number found),
* 1 if between 100 and 5000, 2 if 5000 or more.
* @param inval
* @returns {number}
*/
analyseRate(inval) {
console.log('analyseRate', inval);
let outVal = 0;
const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
const clearSpace = /\s+/g;
const result = inval.replace(cleanerReg, '').replace(clearSpace, ' ');
const resultArray = result.trim().split((' '));
if (resultArray.length > 0) {
const item = parseInt(resultArray[0], 10);
if (item < 100) outVal = 0;
else if ((item > 100) && (item < 5000)) outVal = 1;
else if (item >= 5000) outVal = 2;
}
else return 0;
return outVal;
}
/**
* Reshape a raw scraped record into the { details, data } form used by the Jobs schema,
* cleaning the title, hashing the summary and auto-classifying it via the Corpus.
* @param d
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
*/
reduceData(d) {
const clearPremium = /(\n+)(Featured|Premium)/gi;
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
outObj.details.title = outObj.details.title.replace(clearPremium, '');
outObj.details.title = outObj.details.title.replace(otherStupid, '');
outObj.details.hashed = SHA(outObj.details.summary);
outObj.data.read = 0;
outObj.data.applied = d.applied || 0;
outObj.data.jobtype = this.analyseRate(d.salary);
outObj.data.autoclass = Corpus.process(d.summary);
outObj.data.timestamp = d.timestamp * 1000;
return outObj;
}
/**
*
* @returns {Promise<void>}
@@ -120,10 +199,15 @@ class MasterBase {
return `https://image.silvrtree.co.uk/q${q}/${url}`;
}
/**
*
* @returns {Promise<void>}
*/
async go() {
this.items = [];
this.rawItems = [];
}
}
module.exports = MasterBase;
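
As a worked example of the salary bucketing above (not part of the commit): for '£450 - £525 per day' the cleaning regex strips the currency symbols, letters and dashes, leaving '450 525'; parseInt picks up 450, which sits between 100 and 5000, so analyseRate returns 1. '£55,000 per annum' reduces to '55000' and returns 2, while a non-numeric string such as 'Competitive' parses to NaN and falls through to 0.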

90
lib/corpus.js Normal file

@@ -0,0 +1,90 @@
const jsonfile = require('jsonfile');
const words = require('../lib/wordlist.json');
const wordsAdditional = require('../lib/wordlistAdditional.json');
const bigList = new Map([]);
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt'];
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];
let unrated = [];
var _global = typeof global === 'undefined' ? window : global;
var Corpus = (_global.Corpus = _global.Corpus || {});
const emailRegex = /[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?/;
const detagRegex = /(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/gi;
const desymbolNumberRegex = /[\n\t+$,\?\.\%\*=&:;()\\/\-£…"]|\d+/gi;
const deSpace = /\s+/g;
function cleanText(intext) {
if (arguments.length === 0 || typeof intext === 'undefined' || intext === null ) return '';
return intext.replace(emailRegex, ' ').replace(detagRegex, ' ').replace(desymbolNumberRegex, ' ').replace(deSpace, ' ').trim().toLowerCase();
}
function dedupe(intext) {
if (arguments.length === 0 || intext === null ) return [];
return [...new Set(intext)];
}
function incItem(item) {
if (bigList.has(item))
bigList.set(item, bigList.get(item) + 1);
else
bigList.set(item, 1);
}
/**
* Process the body
* @param intext
* @returns {{score: number, bad: *, good: *}}
*/
Corpus.process = function(intext) {
const workText = cleanText(intext);
const workArray = workText.split(' ');
const cleanedArray = dedupe(workArray).filter((v) => {
return (words.indexOf(v) === -1 && wordsAdditional.indexOf(v) === -1);
});
const good = cleanedArray.filter((v) => {
return (goodWords.indexOf(v) !== -1);
});
const bad = cleanedArray.filter((v) => {
return (badWords.indexOf(v) !== -1);
});
const unused = cleanedArray.filter((v) => {
return ((badWords.indexOf(v) === -1) && (goodWords.indexOf(v) === -1));
});
cleanedArray.map((item)=> {
incItem(item);
});
unrated = [...unrated, ...unused];
const score = good.length - (bad.length * 5);
// console.log('unused', unused);
return { good, bad, score, 'words':cleanedArray };
};
Corpus.exportUnused = function() {
jsonfile.writeFileSync('./unused.json', dedupe(unrated));
jsonfile.writeFileSync('./biglist.json', [...bigList].sort((a, b) => b[1] - a[1]));
console.log([...bigList]);
};
if (typeof module !== 'undefined')
module.exports = {
'Corpus': Corpus
};
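
A minimal usage sketch, not part of the commit (test/wip.js further down does the same thing): Corpus.process cleans the text, keeps the words that are not in the two word lists, and returns the matched good and bad keywords plus a score of good.length - (bad.length * 5).

const { Corpus } = require('./lib/corpus');

const result = Corpus.process('Contract NodeJS / Vue developer, remote, £450 per day. React experience a plus.');
// result.good  -> e.g. ['contract', 'nodejs', 'vue', 'developer', 'remote']
// result.bad   -> e.g. ['react']
// result.words -> the deduplicated, filtered token list
console.log(result.score, result.good, result.bad);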

34
lib/mongoManager.js Normal file

@@ -0,0 +1,34 @@
/**
* Created by WebStorm.
* User: martin
* Date: 22/07/2020
* Time: 17:00
*/
const mongoose = require('mongoose');
const log4js = require('log4js');
const logger = log4js.getLogger();
const JobsModel = require('../models/jobs');
const { Utils } = require('@rakh/utils');
require('dotenv').config();
logger.level = 'debug';
const mongoConnect = process.env.MONGOCONNECT;
// logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
// mongoose.connect(`mongodb://martin:1V3D4m526i@127.0.0.1/jobs`);
logger.debug(mongoConnect);
mongoose.connect(mongoConnect);
const mDB = mongoose.connection;
mDB.on('error', console.error.bind(console, 'connection error:'));
module.exports = JobsModel;


@@ -89,6 +89,7 @@ class MasterRSS extends MasterBase {
await this.filterAdverts();
if (this.items.length > 0) await this.addToDB();
if (this.items.length > 0) await this.addToMongo();
}
else
console.log('No items to process');


@@ -20,9 +20,14 @@ class MasterScraper extends MasterBase {
constructor() {
super();
}
getContent(url, useStone = false) {
/**
*
* @param url
* @param useStone
* @returns {Promise<unknown>}
*/
getContent(url, useStone = false) {
/*
let headers = new Headers({
@ -54,19 +59,28 @@ fetch(url, {
resolve(response.body);
})
.catch((e) => {
console.error('getContent', e );
reject(e.response.body);
});
});
};
async savePage(html) {
const now = fecha.format(new Date(), 'YYYY-MM-DD--hh');
const filename = `pages/${this.siteid}-${now}.html`;
fs.writeFileSync(filename, html);
}
async getPage() {
console.log('>> getPage: fetching', this.url);
const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss');
const filename = `${this.siteid}-${now}.html`;
await this.getContent(this.url, this.useStone)
.then((html) => {
fs.writeFileSync(filename, html);
// console.log('>> getPage:: got', html);
console.log('>> getPage:: OK');
if (this.saveFile) this.savePage(html);
const $ = cheerio.load(html);
this.loadPage($);
})
@ -75,30 +89,59 @@ fetch(url, {
// Site specific parts below here
/**
* Break each page into items
* @returns {Promise<void>}
*/
async breakPage() {
}
/**
*
* @param part
* @returns {Promise<void>}
*/
async extractDetails(part) {
}
/**
*
* @returns {Promise<void>}
*/
async checkNext() {
}
/**
*
* @returns {Promise<void>}
*/
async processSite() {
}
/**
*
* @returns {Promise<void>}
*/
async getIndividualPage() {
}
/**
*
* @returns {Promise<void>}
*/
async getJobPages() {
}
/**
*
* @returns {Promise<void>}
*/
async go() {
}

1007
lib/wordlist.json Normal file

File diff suppressed because it is too large

8790
lib/wordlistAdditional.json Normal file

File diff suppressed because it is too large

559
limited.json Normal file

@@ -0,0 +1,559 @@
[
"experienced",
"exceptional",
"maintaining",
"familiarity",
"commodities",
"opportunity",
"possibility",
"integration",
"engineering",
"derivatives",
"prefferable",
"nutritional",
"performance",
"immediately",
"information",
"responsible",
"environment",
"stakeholder",
"proactively",
"requirement",
"temporarily",
"interrogate",
"effectively",
"progressing",
"substantial",
"identifying",
"maintenance",
"workarounds",
"departments",
"consultancy",
"regulations",
"statistical",
"previously·",
"euromonitor",
"documenting",
"bookkeeping",
"reconciling",
"hardworking",
"themselves!",
"appropriate",
"socialising",
"fundraising",
"initiatives",
"sponsorship",
"orientation",
"competitive",
"illustrator",
"outstanding",
"interaction",
"consistency",
"touchpoints",
"freshtechit",
"recruitment",
"catastrophe",
"accountable",
"workstreams",
"scalability",
"undertaking",
"interacting",
"significant",
"considering",
"independent",
"collaborate",
"arrangement",
"unsolicited",
"empowerment",
"connections",
"specialists",
"credentials",
"personality",
"established",
"northampton",
"advertising",
"operational",
"mathematics",
"contractors",
"instruments",
"referencing",
"locationsco",
"disciplines",
"corporation",
"investments",
"conferences",
"demonstrate",
"directorate",
"acknowledge",
"legislation",
"designgreat",
"understands",
"perspective",
"association",
"enforcement",
"prestigious",
"individuals",
"alternative",
"technically",
"challenging",
"discussions",
"lifeworking",
"interactive",
"storyboards",
"communicate",
"abilitywork",
"englishgood",
"detailbonus",
"angularwhat",
"neededabout",
"innovations",
"enthusiasts",
"instructors",
"prospective",
"comfortable",
"involvement",
"adventurous",
"marketplace",
"forecasting",
"contractual",
"underpinned",
"acquisition",
"microsofts",
"progression",
"suggestions",
"proficiency",
"participate",
"joblocation",
"methodology",
"continually",
"cataloguing",
"projectgood",
"incremental",
"overarching",
"confidently",
"circulatory",
"adjustments",
"interesting",
"consultants",
"experienceb",
"hourscasual",
"switzerland",
"contributes",
"participant",
"improvement",
"articulates",
"contributed",
"comfortably",
"deployments",
"integrating",
"configuring",
"platforming",
"educatedday",
"contracting",
"monthstotal",
"outsourcing",
"designswork",
"ideasdesign",
"deviceswork",
"fundamental",
"businessjob",
"implemented",
"transaction",
"reliability",
"upgradesyou",
"uncertainty",
"enterpriser",
"teamprovide",
"trafficking",
"doubleclick",
"communities",
"forestlink",
"dimensional",
"coordinator",
"spreadsheet",
"pressurised",
"assignments",
"willingness",
"certificate",
"summaryrole",
"institution",
"segregation",
"preparation",
"electronics",
"duplication",
"surrounding",
"informatica",
"blackfriars",
"terminology",
"shabarinath",
"interfacing",
"expectation",
"proprietary",
"conflicting",
"itecopeople",
"opowershell",
"submissions",
"negotiating",
"escalations",
"transferred",
"protections",
"customizing",
"oxfordshire",
"progressive",
"bishopsgate",
"partnership",
"futureheads",
"permissions",
"efficiently",
"unspecified",
"potentially",
"disclaimers",
"foreseeable",
"sustainable",
"calculation",
"replication",
"constitutes",
"recommended",
"enterprises",
"negotiation",
"imaginative",
"differences",
"nationality",
"impediments",
"refinements",
"translating",
"obligations",
"flexibility",
"unashamedly",
"exclusively",
"replacement",
"essentially",
"artifactory",
"theoretical",
"probability",
"integrators",
"contractor?",
"interested?",
"functioning",
"chamberlain",
"inclusivity",
"iteratively",
"enhancement",
"constraints",
"establishes",
"qualitative",
"influencing",
"procurement",
"experiences",
"furthermore",
"disciplined",
"unnecessary",
"bureaucracy",
"represented",
"siteimprove",
"lokhandwala",
"specialises",
"rationalize",
"competncies",
"restoration",
"allocations",
"admittances",
"furnishings",
"cleanliness",
"residential",
"contactable",
"conventions",
"translation",
"approaching",
"intecselect",
"linguistics",
"southampton",
"beautifully",
"estimations",
"newsletters",
"summarising",
"simulations",
"portfolio's",
"coronavirus",
"opoortunity",
"unavailable",
"accordingly",
"penetration",
"remediation",
"elimination",
"achievement",
"facilitator",
"westminster",
"introducing",
"businesses'",
"capitalists",
"investigate",
"countryside",
"problematic",
"coordinates",
"components'",
"supervision",
"bonavolonta",
"proposition",
"foundations",
"suitability",
"researchers",
"explanation",
"commitments",
"computation",
"questioning",
"experiments",
"visualfiles",
"cloudstream",
"determining",
"deliverable",
"inquisitive",
"backgrounds",
"thoughtspot",
"specialized",
"veloppement",
"importantes",
"typedscript",
"restaurants",
"prophylaxis",
"transmitted",
"appointment",
"encouraging",
"aggregating",
"championing",
"conjunction",
"customising",
"photography",
"authorities",
"competition",
"collections",
"contraintes",
"fonctionnel",
"adaptabilit",
"changements",
"conceptions",
"utilisation",
"shortlisted",
"reusability",
"recognizing",
"decisioning",
"accommodate",
"limitations",
"resourceful",
"algorithmic",
"unconcerned",
"intelligent",
"considerate",
"clientbased",
"accelerator",
"dreamweaver",
"applicant's",
"proactivity",
"aggregation",
"restriction",
"traditional",
"corporately",
"memberships",
"standardise",
"theecsgroup",
"scarchitect",
"consolidate",
"extensively",
"afghanistan",
"encompasses",
"distinctive",
"professions",
"interviewed",
"formulation",
"transitions",
"aspirations",
"ingredients",
"setterfield",
"candidates",
"leatherhead",
"publication",
"undoubtedly",
"basingstoke",
"underground",
"reinsurance",
"exemplifies",
"civiization",
"developer's",
"bazzelgette",
"adjacencies",
"feasibility",
"frontinvest",
"neogotiable",
"unconnected",
"conditional",
"bottlenecks",
"productions",
"pharmacists",
"technicians",
"prescribing",
"stewardship",
"recognising",
"convictions",
"subscribing",
"transparent",
"wireframing",
"insidehmcts",
"justicejobs",
"criminology",
"hospitality",
"structuring",
"educational",
"substantive",
"secondments",
"transgender",
"smartphones",
"microsoft's",
"definitions",
"validations",
"prioritised",
"autoscaling",
"abstraction",
"correlation",
"recognition",
"contributor",
"apigedevops",
"incorporate",
"woocommerce",
"informatics",
"adfadc@apps",
"automations",
"formulating",
"beneficiary",
"referential",
"jsdevsecops",
"solutioning",
"measurement",
"familiarise",
"eligibility",
"standardize",
"experience?",
"bournemouth",
"implementer",
"agilesphere",
"assumptions",
"accountancy",
"cockroachdb",
"promotional",
"facilitates",
"discoveries",
"bladecenter",
"considered!",
"cooperation",
"exploration",
"angulareact",
"preferabbly",
"harmonising",
"convenience",
"inclusively",
"strategists",
"attribution",
"fromscratch",
"combination",
"solutionize",
"accelerated",
"diagnostics",
"sensibility",
"informative",
"intellegnce",
"specilisits",
"projections",
"associative",
"personalize",
"farnborough",
"necessarily",
"nservicebus",
"constrained",
"prioritized",
"behavioural",
"chakraborty",
"leaderships",
"flourishing",
"uniqstudios",
"simplifying",
"realisation",
"extensions!",
"prioritises",
"experience!",
"candidates!",
"inclination",
"stimulating",
"appreciated",
"reinventing",
"compression",
"jscybsecdev",
"equirements",
"generalized",
"compressors",
"assessments",
"beyondtrust",
"engagements",
"numerically",
"electricity",
"interchange",
"jsswift_dev",
"circulating",
"attachments",
"credibility",
"vnetpeering",
"territories",
"staggering!",
"developers!",
"peripherals",
"virtualized",
"bitdefender",
"jssitecorjs",
"positioning",
"appreciates",
"chessington",
"controllers",
"controlling",
"quantifying",
"virtualised",
"manufacture",
"fluorescent",
"governments",
"bigcommerce",
"therapeutic",
"importantly",
"differently",
"rigourously",
"shareholder",
"copywriting",
"anticipated",
"approximate",
"behdarvandi",
"testability",
"beneficial!",
"jswmibmcraw",
"exhibitions",
"talentpoint",
"propagation",
"interviews!",
"solutionise",
"elasticache",
"manoeuvring",
"teamservice",
"geographies",
"efficientip",
"organically",
"advancement",
"jshodanular",
"wholesalers",
"multitenant",
"encouraged?",
"freelancers",
"composition",
"#jobswagger",
"typographic",
"stereotypes",
"clerkenwell",
"sacrificing",
"resolutions",
"technology?",
"advantagous"
]

22
mapbuilder.js Normal file

@@ -0,0 +1,22 @@
/**
* Created by WebStorm.
* User: martin
* Date: 27/07/2020
* Time: 15:34
*/
const jsonfile = require('jsonfile');
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt'];
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];
const brain = new Map([]);
for(let i = 0;i < goodWords.length;i++)
brain.set(goodWords[i], 3);
for(let i = 0;i < badWords.length;i++)
brain.set(badWords[i], -5);
jsonfile.writeFileSync('brain.json', [...brain]);

156
migrate.js Normal file

@@ -0,0 +1,156 @@
/**
* Created by WebStorm.
* User: martin
* Date: 22/07/2020
* Time: 10:20
*/
const db = require('./lib/connect');
const log4js = require('log4js');
const logger = log4js.getLogger();
const { Utils } = require('@rakh/utils');
const { Corpus } = require('./lib/corpus');
const SHA = require('crypto-js/sha256');
/*
2604
const mongoose = require('mongoose');
const log4js = require('log4js');
const logger = log4js.getLogger();
const Jobs = require('./models/jobs');
require('dotenv').config();
logger.level = 'debug';
logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
mongoose.connect(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
const mDB = mongoose.connection;
mDB.on('error', console.error.bind(console, 'connection error:'));
*/
const Jobs = require('./lib/mongoManager');
const migrate = (function() {
function analyseRate(inval) {
let outVal = 0;
const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
const clearSpace = /\s+/g;
const result = inval.replace(cleanerReg, '').replace(clearSpace, ' ');
const resultArray = result.trim().split((' '));
if (resultArray.length > 0) {
const item = parseInt(resultArray[0], 10);
if (item < 100) outVal = 0;
else if ((item > 100) && (item < 5000)) outVal = 1;
else if (item >= 5000) outVal = 2;
}
else return 0;
return outVal;
}
function reduceData(d) {
const clearPremium = /(\n+)(Featured|Premium)/gi;
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
outObj.details.title = outObj.details.title.replace(clearPremium, '');
outObj.details.title = outObj.details.title.replace(otherStupid, '');
outObj.details.hashed = SHA(outObj.details.summary);
// outObj.data.read = d.read || 0;
outObj.data.read = 0;
outObj.data.applied = d.applied || 0;
outObj.data.jobtype = analyseRate(d.salary);
outObj.data.autoclass = Corpus.process(d.summary);
outObj.data.timestamp = d.timestamp * 1000;
return outObj;
}
function getCurrent() {
const outgoing = [];
console.log('get version');
const sql = 'select jobs.*, applied.a as applied, read.d as read from jobs left join applied on applied.aid = jobs._id left join read on read.rid = jobs._id order by _id asc;';
return new Promise((resolve, reject) => {
db.all(sql, [], (err, rows) => {
if (err)
reject(err);
rows.forEach((row) => {
outgoing.push(row);
});
resolve(outgoing) ;
});
});
}
async function start() {
await getCurrent().then(async (d) => {
logger.debug(d.length);
for (let t = 0;t < d.length;t++) {
const newD = reduceData(d[t]);
// logger.debug(newD);
const newJob = Jobs(newD);
await newJob.save().then((m) => {
logger.debug('m', m.details.title);
}).catch((err) => {
logger.error(err.keyPattern);
});
}
}).then(() => {
logger.debug('SAVING!!');
Corpus.exportUnused();
})
.catch((err) => {
logger.error(err.keyPattern);
});
}
async function deleteOld() {
const oneDay = 86400000;
const twoWeeksAgo = new Date().getTime() - ( 14 * oneDay);
logger.debug('Delete older than: ', new Date(twoWeeksAgo), twoWeeksAgo);
logger.debug({ 'data.timestamp': { '$lt': twoWeeksAgo } });
Jobs.deleteMany({ 'data.timestamp': { '$lt': twoWeeksAgo }, 'data.applied': 0 }).then((m) => {
logger.debug('m', m);
}).catch((err) => {
logger.error(err);
});
}
// newJob.find({ 'data': { 'timestamp': { '$lt': 1587034346000 } } });
return {
'start':start,
'deleteOld': deleteOld
};
})();
(async function() {
await migrate.start();
await migrate.deleteOld();
logger.info('Done??');
})();

47
models/jobs.js Normal file

@@ -0,0 +1,47 @@
/**
* Created by WebStorm.
* User: martin
* Date: 22/07/2020
* Time: 14:18
*/
const mongoose = require('mongoose');
const Schema = mongoose.Schema;
const jobSchema = new Schema({
'details': {
'title': { 'type': String, 'required': true },
'site': { 'type': String, 'required': true },
'url': { 'type': String, 'required': true, 'unique': true },
'id': String,
'summary': String,
'company': String,
'location': String,
'postdate': String,
'salary': String,
'easyapply': Number,
'timestamp': Number,
'hashed' : { 'type': String, 'required':true, 'unique':true }
},
'data': {
'read': { 'type': Number, 'default': 0 },
'applied': { 'type': Number, 'default': 0 },
'jobtype': { 'type': Number, 'default': 0 },
'class': { 'type': Number, 'default': 0 },
'autoclass': {
'good': Array,
'bad': Array,
'words': Array,
'score': { 'type': Number, 'default': 0 }
},
'timestamp': { 'type': Number, 'default': 0 },
'created_at': { 'type': Date, 'default': Date.now }
}
});
mongoose.set('useFindAndModify', false);
const Jobs = mongoose.model('Jobs', jobSchema);
module.exports = Jobs;
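
A minimal sketch, not part of the commit and with placeholder values, of saving one document against this schema the way masterbase.addToMongo and migrate.js do (requiring lib/mongoManager opens the mongoose connection and returns this model):

const Jobs = require('./lib/mongoManager');
const SHA = require('crypto-js/sha256');

const summary = 'Contract Node.js / Vue developer, remote, £450 per day';
const job = new Jobs({
  'details': {
    'title': 'Contract Node.js Developer',        // required
    'site': 'example',                            // required
    'url': 'https://example.com/job/123',         // required and unique
    'summary': summary,
    'hashed': SHA(summary).toString()             // required and unique
  },
  'data': { 'jobtype': 1 }                        // everything else falls back to the schema defaults
});

job.save()
  .then((doc) => console.log('saved', doc.details.title))
  .catch((err) => console.error(err));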

66
onetime.js Normal file

@@ -0,0 +1,66 @@
/**
* Created by WebStorm.
* User: martin
* Date: 16/04/2020
* Time: 23:35
*/
const CronJob = require('cron').CronJob;
const IndeedScraper = require('./scrapers/indeed');
const TotaljobsScraper = require('./scrapers/totaljobs');
const CwjobsScraper = require('./scrapers/cwjobs');
const JobserveScraper = require('./scrapers/rss.jobserve');
const RssS1Jobs = require('./scrapers/rss.s1jobs');
const RssTechnojobs = require('./scrapers/rss.technojobs');
(async function () {
console.log('Started..');
const indeedScraper = new IndeedScraper();
const totaljobsScraper = new TotaljobsScraper();
const cwjobsScraper = new CwjobsScraper();
const jobserveScraper = new JobserveScraper();
const s1jobsScraper = new RssS1Jobs();
const technojobsScraper = new RssTechnojobs();
await indeedScraper.go('london');
await totaljobsScraper.go('london');
await cwjobsScraper.go('london');
await indeedScraper.go('glasgow');
await totaljobsScraper.go('glasgow');
await cwjobsScraper.go('glasgow');
await indeedScraper.go('edinburgh');
await totaljobsScraper.go('edinburgh');
await cwjobsScraper.go('edinburgh');
await indeedScraper.go('milton keynes');
await totaljobsScraper.go('milton keynes');
await cwjobsScraper.go('milton keynes');
await jobserveScraper.go('https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/C7B25D86D0844A.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss'); // javascript node 2 Jul 2020
await jobserveScraper.go('https://www.jobserve.com/MySearch/4C67595E323E3453.rss'); // vuejs 2 Jul 2020
await jobserveScraper.go('https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss'); // svelte 2 Jul 2020
await s1jobsScraper.go('http://www.s1jobs.com/xml/m7dp711z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/pfvf7o7z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/lluqnt8z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/tu33qt8z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/u3btnz8z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
})();

1294
package-lock.json generated

File diff suppressed because it is too large


@@ -9,15 +9,21 @@
"author": "",
"license": "ISC",
"dependencies": {
"@rakh/utils": "^1.0.0",
"axios": "^0.19.2",
"bayes": "^1.0.0",
"body-parser": "^1.19.0",
"cheerio": "^1.0.0-rc.3",
"cron": "^1.8.2",
"crypto-js": "^4.0.0",
"dotenv": "^8.2.0",
"eslint": "^6.8.0",
"express": "^4.17.1",
"fecha": "^4.2.0",
"got": "^11.2.0",
"jsonfile": "^6.0.1",
"log4js": "^6.3.0",
"mongoose": "^5.9.25",
"present": "^1.0.0",
"rss-parser": "^3.8.0",
"sqlite3": "^4.1.1",

45
preload.js Normal file

@@ -0,0 +1,45 @@
/**
* Created by WebStorm.
* User: martin
* Date: 28/07/2020
* Time: 10:51
*/
const fs = require('fs');
var bayes = require('bayes');
var classifier = bayes({
'tokenizer': function (text) {
return text.split(',');
}
});
// teach it positive phrases
async function load() {
const goodWords = ['tsql', 'developer', 'contract', 'web', 'javascript', 'js', 'node', 'es', 'agile', 'nodejs', 'london', 'aws', 'sql', 'postgresql', 'mysql', 'docker', 'ecs', 'automation', 'jslint', 'jshint', 'vuejs', 'vue', 'nginx', 'remotely', 'mvc', 'remote', 'iot', 'mqtt', 'es6', 'es2016', 'es2017', 'es2018', 'freelance'];
const badWords = ['react', 'redux', 'graphql', 'java', 'reactjs', 'shopify'];
for(let i = 0;i < goodWords.length;i++)
await classifier.learn(goodWords[i], 'good');
for(let i = 0;i < badWords.length;i++)
await classifier.learn(badWords[i], 'bad');
// now ask it to categorize a document it has never seen before
console.log(await classifier.categorize(['ui', 'developer', 'london', 'react'].join(',')));
console.log(await classifier.categorize(['mysql', 'react', 'js', 'node', 'docker', 'kubernetes', 'google'].join(',')));
// serialize the classifier's state as a JSON string.
var stateJson = classifier.toJson();
console.log(stateJson);
fs.writeFileSync('brain.json', stateJson);
}
load();


@@ -133,12 +133,15 @@ class IndeedScraper extends MasterScraper {
await this.filterAdverts();
await this.addToDB();
await this.addToMongo();
}
async go(location = 'london') {
this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);
await this.processSite();
await this.processSite().catch((err) => {
console.error('Indeed Go', err);
});
console.log(`Indeed ${location} completed`);
}


@@ -140,6 +140,7 @@ class IndeedMobileScraper extends MasterScraper {
await this.filterAdverts();
await this.addToDB();
await this.addToMongo();
}
async go(location = 'london') {


@@ -22,7 +22,10 @@ class TotaljobsScraper extends MasterScraper {
}
// Site specific parts below here
/**
*
* @returns {Promise<void>}
*/
async breakPage() {
const $ = this.currentPage;
const ads = [];
@@ -39,6 +42,11 @@ class TotaljobsScraper extends MasterScraper {
this.items = [...this.items, ...ads];
}
/**
*
* @param part
* @returns {Promise<{}>}
*/
async extractDetails(part) {
const newObj = {};
const $part = cheerio.load(part);
@@ -61,6 +69,11 @@ class TotaljobsScraper extends MasterScraper {
return newObj;
}
/**
*
* @param item
* @returns {Promise<*>}
*/
async getIndividualPage(item) {
const newItem = {...item};
console.log('Getting', item.url);
@@ -75,6 +88,10 @@ class TotaljobsScraper extends MasterScraper {
return newItem;
}
/**
*
* @returns {Promise<void>}
*/
async getJobPages() {
const newItems = [];
for (let item of this.items) {
@@ -86,6 +103,10 @@ class TotaljobsScraper extends MasterScraper {
this.items = [...newItems];
}
/**
*
* @returns {Promise<void>}
*/
async checkNext() {
const $ = this.currentPage;
const next = $('.pagination > *:last-child').attr('href') || '';
@@ -96,6 +117,10 @@ class TotaljobsScraper extends MasterScraper {
console.log(next);
}
/**
*
* @returns {Promise<void>}
*/
async processSite() {
console.log('Processing...');
@@ -121,8 +146,14 @@ class TotaljobsScraper extends MasterScraper {
await this.filterAdverts();
await this.addToDB();
await this.addToMongo();
}
/**
*
* @param location
* @returns {Promise<void>}
*/
async go(location = 'london') {
this.setStartUrl(`https://www.totaljobs.com/jobs/contract/html-or-vue-or-vuejs-or-web-or-sql-or-delphi-or-vb-or-vbscript-or-php-or-ajax-or-mysql-or-sqlserver-or-javascript-or-node-or-nodejs-or-svelte-or-sveltejs-not-react/in-${encodeURIComponent(location)}?q=Html+Or+Vue+Or+Vuejs+Or+Web+Or+Sql+Or+Delphi+Or+Vb+Or+Vbscript+Or+Php+Or+Ajax+Or+Mysql+Or+Sqlserver+Or+Javascript+Or+Node+Or+Nodejs+Or+Svelte+Or+Sveltejs+NOT+React&postedwithin=3&radius=20`);
// this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+nodejs&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=7&limit=10&sort=date&psf=advsrch&from=advancedsearch');


@@ -0,0 +1,124 @@
/**
* Created by WebStorm.
* User: martin
* Date: 24/07/2020
* Time: 11:45
*/
const Jobs = require('../../lib/mongoManager');
const { Utils } = require('@rakh/utils');
const killNLDoubleSpace = /(\\n)\s{2,}|(\\n)|\s{2,}/g;
function reduceList(data) {
if (arguments.length === 0 || arguments[0] === null ) return '';
const outObj = data.map((v) => {
const o = Utils.extractFromObj({...v.details,...v.data, _id:v._id},['title','site', 'company', 'timestamp', 'read', 'applied', 'jobtype', 'class', 'autoclass']);
o._id = v._id;
return o;
});
// console.log(data);
return outObj;
}
function reduceRecord(record) {
// console.log('reduceRecord', record);
let outRec = {...record.details,data:record.data,_id:record._id};
return outRec;
}
exports.getList = (req, res) => {
console.log('>getList req', req.params);
Jobs.find({}, { 'details.title':1, 'details.site':1, 'details.company':1, 'data':1, '_id':1 }).limit(200).sort( { 'data.timestamp': -1 } ).then((doc) => {
if (doc) {
res.send(reduceList(doc));
}
}).catch((err) => {
console.error(err.message);
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
};
exports.getJob = (req, res) => {
console.log('>getJob req', req.params);
if(!req.params.id)
return res.status(500).send({
'message': 'Job id missing'
});
const id = req.params.id;
Jobs.findById(id).then((doc) => {
if (doc) {
const item = reduceRecord(doc._doc);
const date = new Date( item.timestamp * 1000);
console.log(item);
item.date = date.toLocaleString();
item.title = item.title.replace(killNLDoubleSpace, ' ');
res.send(item);
}
}).catch((err) => {
console.error(err.message);
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
};
exports.readJob = (req, res) => {
console.log('>readJob req', req.params);
let id;
if(!req.params.id)
return res.status(500).send({
'message': 'Job id missing'
});
else
id = req.params.id;
Jobs.findById(id).then((doc) => {
if (doc) {
let fullDoc = Object.assign({}, doc._doc);
console.log('fullDoc', fullDoc);
if (!Utils.isEmpty(fullDoc)){
fullDoc.data.read = new Date().getTime();
Jobs.findByIdAndUpdate(id, fullDoc, {'new':true}).then((doc) => {
console.log(doc._doc);
res.status(200).end();
}).catch((err) => {
console.error('inside',err.message);
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
}
}
}).catch((err) => {
console.error('outer', err.message);
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
};


@@ -0,0 +1,89 @@
/**
* Created by WebStorm.
* User: martin
* Date: 28/07/2020
* Time: 11:08
*/
const Jobs = require('../../lib/mongoManager');
const { Utils } = require('@rakh/utils');
const fs = require('fs');
var bayes = require('bayes');
var classifier = bayes({
'tokenizer': function (text) {
return text.split(',');
}
});
function load() {
const file = fs.readFileSync('brain.json');
classifier = bayes.fromJson(file);
}
function save() {
var stateJson = classifier.toJson();
console.log(stateJson);
fs.writeFileSync('brain.json', stateJson);
}
load();
exports.upvote = (req, res) => {
console.log('>upvote req', req.params);
if(!req.params.id)
return res.status(500).send({
'message': 'Job id missing'
});
const id = req.params.id;
Jobs.findById(id).then(async (doc) => {
if (doc) {
const words = doc._doc.data.autoclass.words.join(',');
await classifier.learn(words, 'good');
save();
res.status(200).end();
}
}).catch((err) => {
console.error(err.message);
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
};
exports.downvote = (req, res) => {
console.log('>downvote req', req.params);
if(!req.params.id)
return res.status(500).send({
'message': 'Job id missing'
});
const id = req.params.id;
Jobs.findById(id).then(async (doc) => {
if (doc) {
const words = doc._doc.data.autoclass.words.join(',');
await classifier.learn(words, 'bad');
save();
res.status(200).end();
}
}).catch((err) => {
console.error(err.message);
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
};

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,17 @@
/**
* Created by WebStorm.
* User: martin
* Date: 24/07/2020
* Time: 11:42
*/
const jobs = require('../controllers/jobs.v2.controller');
module.exports = (app) => {
app.route('/v2/jobs')
.get(jobs.getList);
app.route('/v2/jobs/:id')
.get(jobs.getJob)
.put(jobs.readJob);
};


@@ -0,0 +1,17 @@
/**
* Created by WebStorm.
* User: martin
* Date: 28/07/2020
* Time: 11:07
*/
const vote = require('../controllers/vote.controller');
module.exports = (app) => {
app.route('/vote/up/:id')
.put(vote.upvote);
app.route('/vote/down/:id')
.put(vote.downvote);
};


@@ -58,7 +58,9 @@ app.use(bodyParser.json());
app.post('/auth', auth.auth);
require('./routes/jobs.route')(app);
require('./routes/jobs.v2.route')(app);
require('./routes/apply.route')(app);
require('./routes/vote.route')(app);
app.listen(serverPort, () => {
console.log(`Server is listening on port ${serverPort}`);
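
A quick sketch of exercising the new routes with axios (not part of the commit; the port value and the assumption that no auth middleware guards these paths are mine):

const axios = require('axios');
const base = 'http://localhost:3000'; // serverPort in server.js; 3000 is just an assumed value

(async () => {
  const { data: jobs } = await axios.get(`${base}/v2/jobs`);                 // reduced list from jobs.v2.controller.getList
  const { data: job } = await axios.get(`${base}/v2/jobs/${jobs[0]._id}`);   // full record via getJob
  await axios.put(`${base}/vote/up/${jobs[0]._id}`);                         // retrains the bayes classifier as 'good'
  console.log(job.title, job.data.autoclass.score);
})();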

File diff suppressed because one or more lines are too long


@@ -20,7 +20,7 @@ const indeedScraper = new IndeedScraper();
// const page = fs.readFileSync('data/indeed/indeed-2020-04-16--092311.html');
const page = fs.readFileSync('data/indeed/page2.html');
test.test('Test Indeed scraper', async t => {
test.skip('Test Indeed scraper', async t => {
const $ = cheerio.load(page);
indeedScraper.loadPage($);
@@ -35,13 +35,36 @@ test.test('Test Indeed scraper', async t => {
await indeedScraper.filterAdverts();
// await indeedScraper.addToDB();
await indeedScraper.addToMongo();
t.end();
});
test.test('Test full run Indeed scraper', async t => {
await indeedScraper.go('london');
test.skip('Test full run Indeed scraper', async t => {
await indeedScraper.go('london').catch((err) => {
console.error('Indeed GO', err);
});
t.end();
});
test.test('Test Indeed scraper -- MONGO', async t => {
const $ = cheerio.load(page);
indeedScraper.loadPage($);
await indeedScraper.breakPage();
// await indeedScraper.getJobPages();
// console.log(await indeedScraper.checkNext());
// console.log(indeedScraper.items);
// await indeedScraper.filterAdverts();
await indeedScraper.addToMongo();
t.end();
});


@@ -26,13 +26,14 @@ const s1jobsScraper = new RssS1Jobs();
const feed = fs.readFileSync('test/data/s1jobs/m7dp711z2r.xml');
test.test('Test Jobserve scraper', async t => {
let url = 'http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml';
await s1jobsScraper.setStartUrl(url);
s1jobsScraper.reduceItems();
await s1jobsScraper.filterAdverts();
await s1jobsScraper.addToDB();
// await s1jobsScraper.addToDB();
t.end();
});


@@ -19,17 +19,17 @@ const testScraper = new RssTechnojobs();
const feed = fs.readFileSync('test/data/technojobs/page1');
test.test('Test Technojobs scraper', async t => {
// await testScraper.loadFeed(feed);
await testScraper.loadFeed('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
// testScraper.reduceItems();
await testScraper.reduceItems();
// await s1jobsScraper.filterAdverts();
await s1jobsScraper.filterAdverts();
// await s1jobsScraper.addToDB();
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
/* await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
await testScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1')
*/
t.end();
});


@@ -22,20 +22,20 @@ console.log(`${__dirname}`);
const page = fs.readFileSync(`${__dirname}/data/totaljobs/totaljobs-2020-04-16--121504.html`);
test.test('Test Totaljobs scraper', async t => {
const $ = cheerio.load(page);
const $ = cheerio.load(page);
totaljobsScraper.loadPage($);
totaljobsScraper.loadPage($);
await totaljobsScraper.breakPage();
await totaljobsScraper.breakPage();
await totaljobsScraper.getJobPages();
// console.log(await indeedScraper.checkNext());
await totaljobsScraper.getJobPages();
// console.log(await indeedScraper.checkNext());
console.log(totaljobsScraper.items);
// console.log(totaljobsScraper.items);
await totaljobsScraper.filterAdverts();
await totaljobsScraper.filterAdverts();
// await totaljobsScraper.addToDB();
// await totaljobsScraper.addToDB();
t.end();
t.end();
});

14
test/wip.js Normal file

@@ -0,0 +1,14 @@
/**
* Created by WebStorm.
* User: martin
* Date: 23/07/2020
* Time: 09:26
*/
const { Corpus } = require('../lib/corpus');
const text = 'ESTAMP DEVELOPER 6 month contract £450-525 / day Developer, SQL, Photoshop, Javascript,  NET, C#, Javascript Advanced knowledge of SQL Server TSQL Experience of the design and  PDF stamp development E-STAMP DEVELOPER 6 month contract';
const out = Corpus.process(text);
console.log(out);

71
testgrabber.js Normal file

@@ -0,0 +1,71 @@
/**
* Created by WebStorm.
* User: martin
* Date: 16/04/2020
* Time: 23:35
*/
const CronJob = require('cron').CronJob;
const IndeedScraper = require('./scrapers/indeed');
const TotaljobsScraper = require('./scrapers/totaljobs');
const CwjobsScraper = require('./scrapers/cwjobs');
const JobserveScraper = require('./scrapers/rss.jobserve');
const RssS1Jobs = require('./scrapers/rss.s1jobs');
const RssTechnojobs = require('./scrapers/rss.technojobs');
(async function () {
console.log('Started..');
const indeedScraper = new IndeedScraper();
const totaljobsScraper = new TotaljobsScraper();
const cwjobsScraper = new CwjobsScraper();
const jobserveScraper = new JobserveScraper();
const s1jobsScraper = new RssS1Jobs();
const technojobsScraper = new RssTechnojobs();
await indeedScraper.go('london');
await totaljobsScraper.go('london');
await cwjobsScraper.go('london');
await indeedScraper.go('glasgow');
await totaljobsScraper.go('glasgow');
await cwjobsScraper.go('glasgow');
await indeedScraper.go('edinburgh');
await totaljobsScraper.go('edinburgh');
await cwjobsScraper.go('edinburgh');
await indeedScraper.go('milton keynes');
await totaljobsScraper.go('milton keynes');
await cwjobsScraper.go('milton keynes');
/*
await jobserveScraper.go('https://www.jobserve.com/MySearch/BAEBF3BDF82B8FEF.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/9BCBF25C586A0E3F.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/F3A56475D5FD4966.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/4E2AC50E02AD128B.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/6DA9769BA89834AA.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/EDF47BEA6B31EF.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/3CAD044BEF2BFA.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/C7B25D86D0844A.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/64A3EEF615FA4C.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/6FC7E9ED5F042ECB.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/CA49421A86CA3F74.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss');
await jobserveScraper.go('https://www.jobserve.com/MySearch/ED1708BF42EF3513.rss'); // javascript node 2 Jul 2020
await jobserveScraper.go('https://www.jobserve.com/MySearch/4C67595E323E3453.rss'); // vuejs 2 Jul 2020
await jobserveScraper.go('https://www.jobserve.com/MySearch/DCD6B8CE431FE402.rss'); // svelte 2 Jul 2020
await s1jobsScraper.go('http://www.s1jobs.com/xml/m7dp711z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/pfvf7o7z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/lluqnt8z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/tu33qt8z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/u3btnz8z2r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
*/
})();

1
unused.json Normal file

File diff suppressed because one or more lines are too long

22
words.js Normal file

@@ -0,0 +1,22 @@
/**
* Created by WebStorm.
* User: martin
* Date: 27/07/2020
* Time: 10:08
*/
const jsonfile = require('jsonfile');
const data = require('./unused.json');
function show(size) {
const f = data.filter((v) => {
return (v.length === size);
});
jsonfile.writeFileSync('limited.json', [...new Set(f)]);
console.log('done');
}
show(11);