This commit is contained in:
Martin Donnelly 2020-05-19 10:05:04 +01:00
commit 5001bbd798
43 changed files with 19464 additions and 0 deletions

55
.eslintrc.json Normal file
View File

@ -0,0 +1,55 @@
{
"parserOptions": {
"ecmaVersion": 2017,
"sourceType": "module",
"ecmaFeatures": {
"jsx": false
}
},
"env": {
"browser": true,
"node": true,
"es6": true
},
"rules": {
"arrow-spacing": "error",
"block-scoped-var": "error",
"block-spacing": "error",
"brace-style": ["error", "stroustrup", {}],
"camelcase": "error",
"comma-dangle": ["error", "never"],
"comma-spacing": ["error", { "before": false, "after": true }],
"comma-style": [1, "last"],
"consistent-this": [1, "_this"],
"curly": [1, "multi"],
"eol-last": 1,
"eqeqeq": 1,
"func-names": 1,
"indent": ["error", 2, { "SwitchCase": 1 }],
"lines-around-comment": ["error", { "beforeBlockComment": true, "allowArrayStart": true }],
"max-len": [1, 180, 2], // warn above 180 chars per line; a tab counts as 2 characters
"new-cap": 1,
"newline-before-return": "error",
"no-array-constructor": 1,
"no-inner-declarations": [1, "both"],
"no-mixed-spaces-and-tabs": 1,
"no-multi-spaces": 2,
"no-new-object": 1,
"no-shadow-restricted-names": 1,
"object-curly-spacing": ["error", "always"],
"padded-blocks": ["error", { "blocks": "never", "switches": "always" }],
"prefer-const": "error",
"prefer-template": "error",
"one-var": 0,
"quote-props": ["error", "always"],
"quotes": [1, "single"],
"radix": 1,
"semi": [1, "always"],
"space-before-blocks": [1, "always"],
"space-infix-ops": 1,
"vars-on-top": 1,
"no-multiple-empty-lines": ["error", { "max": 1, "maxEOF": 1 }],
"spaced-comment": ["error", "always", { "markers": ["/"] }]
}
}

148
.gitignore vendored Normal file
View File

@ -0,0 +1,148 @@
# Created by .ignore support plugin (hsz.mobi)
### Node template
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Typescript v1 declaration files
typings/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
### macOS template
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
.idea/
# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries
# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
# Gradle:
.idea/**/gradle.xml
.idea/**/libraries
# CMake
cmake-build-debug/
# Mongo Explorer plugin:
.idea/**/mongoSettings.xml
## File-based project format:
*.iws
## Plugin-specific files:
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Elastic Beanstalk Files
.elasticbeanstalk/*
!.elasticbeanstalk/*.cfg.yml
!.elasticbeanstalk/*.global.yml
/src/bundle.js
/src/bundle.js.map
/live/
!/output/

BIN
db/jobs.db Normal file

Binary file not shown.

0
db/menu.db Normal file
View File

6
dist/build/bundle.css vendored Normal file

File diff suppressed because one or more lines are too long

18
dist/build/bundle.css.map vendored Normal file

File diff suppressed because one or more lines are too long

1
dist/build/bundle.js vendored Normal file

File diff suppressed because one or more lines are too long

1
dist/build/bundle.js.map vendored Normal file

File diff suppressed because one or more lines are too long

BIN
dist/favicon.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

0
dist/global.css vendored Normal file
View File

18
dist/index.html vendored Normal file
View File

@ -0,0 +1,18 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset='utf-8'>
<meta name='viewport' content='width=device-width,initial-scale=1'>
<title>Svelte app</title>
<link rel='icon' type='image/png' href='/favicon.png'>
<link rel='stylesheet' href='/global.css'>
<link rel='stylesheet' href='/build/bundle.css'>
<script defer src='/build/bundle.js'></script>
</head>
<body>
</body>
</html>

27
grabber.js Normal file
View File

@ -0,0 +1,27 @@
/**
* Created by WebStorm.
* User: martin
* Date: 16/04/2020
* Time: 23:35
*/
const CronJob = require('cron').CronJob;
const IndeedScraper = require('./scrapers/indeed');
const TotaljobsScraper = require('./scrapers/totaljobs');
/**
 * Entry point: creates one scraper per job site and registers a cron job that
 * runs every scraper for every search location, one after the other.
 *
 * The cron pattern '5 6-23/3 * * *' fires at minute 5 of every third hour
 * from 06:00 through 23:00. The job is started immediately (fourth CronJob
 * argument is `true`).
 */
(async function () {
  console.log('Started..');

  // Visited in this order on every tick; each location is scraped on Indeed first, then Totaljobs.
  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];
  const scrapers = [new IndeedScraper(), new TotaljobsScraper()];

  new CronJob('5 6-23/3 * * *', async function() {
    for (const location of locations) {
      for (const scraper of scrapers) {
        try {
          await scraper.go(location);
        }
        catch (err) {
          // Without this, one failing site rejected the whole callback (unhandled)
          // and every remaining site/location was skipped for that tick.
          console.error(`Scrape failed for '${location}' (${scraper.siteid}):`, err);
        }
      }
    }
  }, null, true);
})();

6
lib/connect.js Normal file
View File

@ -0,0 +1,6 @@
const sqlite3 = require('sqlite3').verbose();
// Location of the SQLite database file, expressed relative to this module's directory.
const dbFile = `${__dirname}/../db/jobs.db`;

// Single shared connection, opened when this module is first required.
const db = new sqlite3.Database(dbFile);

// Echo the path that was opened.
console.log(dbFile);

module.exports = db;

66
lib/dbmanager.js Normal file
View File

@ -0,0 +1,66 @@
/**
* Created by WebStorm.
* User: martin
* Date: 16/04/2020
* Time: 10:00
*/
const db = require('./connect');
/**
 * Builds the storable form of a job record.
 *
 * @param {Object} _obj - job record; left untouched.
 * @returns {Object} shallow copy of `_obj` whose `isEasyApply` is 1 when the
 *   original value is truthy and 0 otherwise.
 */
function prepareData(_obj) {
  const easyApplyFlag = _obj.isEasyApply ? 1 : 0;

  return Object.assign({}, _obj, { 'isEasyApply': easyApplyFlag });
}
exports.insertOne = (data) => {
const sql = 'INSERT INTO jobs VALUES (?,?,?,?,?,?,?,?,?,?,?,?)';
const workObj = prepareData(data);
return new Promise((resolve, reject) => {
db.run(sql, [null, workObj.title, workObj.site, workObj.url, workObj.id, workObj.summary, workObj.company, workObj.location, workObj.postDate, workObj.salary, workObj.isEasyApply, workObj.timestamp], function(err) {
if (err)
reject(err);
resolve({ 'msg':'Row inserted', '_id': this.lastID });
});
});
};
exports.getList = () => {
const outgoing = [];
const sql = 'select _id, title, site, company, timestamp from jobs order by _id desc';
return new Promise((resolve, reject) => {
db.all(sql, [], (err, rows) => {
if (err)
reject(err);
rows.forEach((row) => {
outgoing.push(row);
});
resolve(outgoing) ;
});
});
};
exports.getOne = (id) => {
const sql = 'SELECT * FROM jobs WHERE _id = ?';
return new Promise((resolve, reject) => {
db.get(sql, [id], (err, row) => {
if (err)
reject(err);
if (!err) resolve(row);
});
});
};
// select _id, title, site, company, timestamp from jobs order by _id desc;

18
lib/filter_3_days.js Normal file
View File

@ -0,0 +1,18 @@
module.exports = function (item, itemOptions, source) {
function inDays(d1, d2) {
var t2 = d2.getTime();
var t1 = d1.getTime();
return parseInt((t2-t1)/(24*3600*1000));
}
var now = new Date();
var then = new Date(itemOptions.date);
var d = inDays(then,now);
if (d < 3)
return itemOptions;
else
return -1;
};

View File

@ -0,0 +1,14 @@
module.exports = function (item, itemOptions, source) {
if (!item || !itemOptions) {
return;
}
/*
return only:
item.description, item.title contains: ajax, asp, javascript, php, vmware, sql, classic
*/
// This plugin removes all items by returning -1 instead of the processed itemOptions
return itemOptions;
};

7
lib/filter_glasgow.js Normal file
View File

@ -0,0 +1,7 @@
module.exports = function (item, itemOptions, source) {
if(itemOptions.description.indexOf('Glasgow') > 0 || itemOptions.description.indexOf('London') > 0)
return itemOptions;
else
return -1;
};

18
lib/filter_last_week.js Normal file
View File

@ -0,0 +1,18 @@
module.exports = function (item, itemOptions, source) {
function inDays(d1, d2) {
var t2 = d2.getTime();
var t1 = d1.getTime();
return parseInt((t2-t1)/(24*3600*1000));
}
var now = new Date();
var then = new Date(itemOptions.date);
var d = inDays(then,now);
if (d <= 7)
return itemOptions;
else
return -1;
};

8
lib/filter_location.js Normal file
View File

@ -0,0 +1,8 @@
module.exports = function (item, itemOptions, source) {
const patt = /(glasgow|london|edinburgh|milton keynes)/ig;
const result = patt.test(itemOptions.description);
const resultB = patt.test(itemOptions.title);
console.log('Location:', (result || resultB === true) ? 'Pass' : 'Reject');
return (result || resultB === true) ? itemOptions : -1;
};

16
lib/filter_md_jobs.js Normal file
View File

@ -0,0 +1,16 @@
module.exports = function (item) {
const patt = /(full\s?stack|front\s?end|html|html5|es6|react|knockout|ember|vue|riotjs|css|javascript|sql|node|backbone|git|gulp|jquery|express|£\dk|Data Warehouse Developer|iot|internet of things)\W/ig;
const result = patt.test(item.title);
const resultB = patt.test(item.summary);
console.log('My Filter:', (result || resultB === true) ? 'Pass' : 'Reject');
if (!(result || resultB === true)) {
console.log('Result', result);
console.log('ResultB', resultB);
console.log(item);
}
return (result || resultB === true) ;
};

View File

@ -0,0 +1,8 @@
module.exports = function (item, itemOptions, source) {
if (!item || !itemOptions) {
return;
}
// This plugin removes all items by returning -1 instead of the processed itemOptions
return -1;
};

18
lib/filter_reject.js Normal file
View File

@ -0,0 +1,18 @@
module.exports = function (item) {
const patt = /(Teachers?|Technical Writer|Data Analyst|WebLogic|WebSphere|Data Scientist|Change Manager|T24|Test Analyst|Insight Analyst|application tester|senior tester|Salesforce|QlikView|Navision|Murex|seo|django|drupal|SHAREPOINT|per annum|ServiceNow|Test Lead|User Researcher|Service Management|\(PERM\)|£\d.K|Remedy|ITSM|Symfony|Zend|Full Time|Technical Business Analyst|BUSINESS ANALYST|AUTOMATION TESTER|FIELD TECHNICIAN|websphere administrator|Research Data Scientist)/ig;
const engineers = /(Support|Devops|Planning|security|Postgresql|network|sccm|test|data|imac|firewall|vmware)+(?:\s)(?=Engineer)/ig;
const developers = /(Java|PHP|Graduate|Access|Oracle ADF|SHAREPOINT|Ruby on Rails|Java Software|IOS|Qlikview|c#|c\+\+|\.net|bi|go lang)+(?:\s)(?=Developer)/ig;
const architects = /(Java|PHP|Microsoft)+(?:\s)(?=Architect)/ig;
const antiAd = /sja\d+/gi;
const result = patt.test(item.summary) || engineers.test(item.summary) || developers.test(item.summary) || architects.test(item.summary);
const resultB = patt.test(item.title) || engineers.test(item.title) || developers.test(item.title) || architects.test(item.title);
const resultC = antiAd.test(item.id);
console.log('Reject:', (result || resultB || resultC === true) ? 'Reject' : 'Pass');
return (!(result || resultB || resultC === true));
};

6
lib/filter_show.js Normal file
View File

@ -0,0 +1,6 @@
module.exports = function (item, itemOptions, source) {
console.log('---');
console.log(itemOptions.title);
console.log(itemOptions.permalink || itemOptions.url);
return itemOptions;
};

20
lib/filter_today_only.js Normal file
View File

@ -0,0 +1,20 @@
module.exports = function (item, itemOptions, source) {
function inDays(d1, d2) {
var t2 = d2.getTime();
var t1 = d1.getTime();
return parseInt((t2 - t1) / (24 * 3600 * 1000));
}
var now = new Date();
var then = new Date(itemOptions.date);
var d = inDays(then, now);
console.log('Today:', (d === 0) ? 'Pass' : 'Reject');
if (d === 0)
return itemOptions;
else
return -1;
};

165
lib/scraper.js Normal file
View File

@ -0,0 +1,165 @@
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const request = require('request');
const axios = require('axios');
const fecha = require('fecha');
const fs = require('fs');
const dbmanager = require('../lib/dbmanager');
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
/**
 * Base class for the per-site scrapers.
 *
 * Provides page fetching, advert filtering and persistence. The site-specific
 * steps (breakPage, extractDetails, checkNext, processSite, getIndividualPage,
 * getJobPages, go) are empty hooks for subclasses to override.
 */
class MasterScraper {
  constructor() {
    this.url = '';
    this.items = [];
    this.currentPage = null;
    this.hosturl = '';
    // Host name used by makeUrl()/makeProxyUrl(); subclasses assign their own.
    this.siteurl = '';
    this.siteid = '';
    this.useStone = false;
    // Default options handed to `request`; subclasses may replace them.
    this.requestOptions = {
      'url' : '',
      'proxy' : 'http://uk.proxymesh.com:31280',
      'tunnel' : true
    };
  }

  /**
   * Sets the URL that getPage() will fetch.
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores an already-parsed page (a cheerio root function) as the current page.
   * @param {Function} page
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Downloads a URL and resolves with the response body.
   *
   * @param {string} url - address to fetch.
   * @param {boolean} [useStone=false] - when true the request goes to a relay
   *   at a fixed address with `url` percent-encoded into the path
   *   (presumably a fetch proxy — confirm).
   * @returns {Promise<string>} response body; rejects with the request error.
   */
  getContent(url, useStone = false) {
    return new Promise((resolve, reject) => {
      const options = Object.assign({}, this.requestOptions);

      if (useStone)
        options.url = `http://45.33.114.116:8080/${encodeURIComponent(url)}`;
      else
        options.url = url;

      console.log(options);

      request(options, (err, _res, body) => {
        if (!err)
          resolve(body);
        else
          reject(err);
      });
    });
  }

  /**
   * Fetches `this.url` and loads the parsed result as the current page.
   *
   * On failure the error is logged and an EMPTY document is loaded, so that
   * a page left over from an earlier run is never processed a second time.
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    try {
      const html = await this.getContent(this.url, this.useStone);

      this.loadPage(cheerio.load(html));
    }
    catch (err) {
      console.error(err);
      this.loadPage(cheerio.load(''));
    }
  }

  /**
   * Inserts every collected item into the database and waits until all
   * inserts have settled. A failed insert is logged and does not stop the others.
   */
  async addToDB() {
    const inserts = this.items.map((item) => {
      console.log(item);

      return dbmanager.insertOne(item)
        .then((data) => {
          console.log(data);
        })
        .catch((err) => {
          console.error(err.message || 'Some error occurred while querying the database.');
        });
    });

    await Promise.all(inserts);
  }

  /**
   * Applies the reject filter, then the accept filter, to `this.items`,
   * logging the item count after each stage.
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);
    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);
    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
  }

  /**
   * Builds an absolute https URL on this scraper's host.
   * @param {string} appended - path (and query) to append after the host.
   * @returns {string}
   */
  makeUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Currently produces exactly the same URL as makeUrl().
   * @param {string} appended - path (and query) to append after the host.
   * @returns {string}
   */
  makeProxyUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  // Site specific parts below here — intentionally empty, overridden by subclasses.

  async breakPage() {
  }

  async extractDetails(part) {
  }

  async checkNext() {
  }

  async processSite() {
  }

  async getIndividualPage() {
  }

  async getJobPages() {
  }

  async go() {
  }
}
module.exports = MasterScraper;

2763
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

31
package.json Normal file
View File

@ -0,0 +1,31 @@
{
"name": "jobscraper",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"grabber": "node grabber.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"axios": "^0.19.2",
"body-parser": "^1.19.0",
"cheerio": "^1.0.0-rc.3",
"cron": "^1.8.2",
"dotenv": "^8.2.0",
"eslint": "^6.8.0",
"express": "^4.17.1",
"fecha": "^4.2.0",
"present": "^1.0.0",
"rss-parser": "^3.7.6",
"sqlite3": "^4.1.1",
"tape": "^4.13.2",
"tape-promise": "^4.0.0"
},
"devDependencies": {
"cors": "^2.8.5",
"express-session": "^1.17.1",
"helmet": "^3.22.0"
}
}

150
scrapers/indeed.js Normal file
View File

@ -0,0 +1,150 @@
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const MasterScraper = require('../lib/scraper');
/**
 * Scraper for contract listings on www.indeed.co.uk.
 *
 * Implements the site-specific hooks of MasterScraper: splitting a results
 * page into adverts, reading each advert's fields, fetching the full job
 * description and driving the whole run via go().
 */
class IndeedScraper extends MasterScraper {
  constructor() {
    super();
    this.siteurl = 'www.indeed.co.uk';
    this.siteid = 'indeed';
    // Results pages are fetched through the `useStone` branch of getContent().
    this.useStone = true;
    // Replaces the inherited request options with a bare `url` entry.
    this.requestOptions = {
      'url' : ''
    };
    this.antiAd = /sja\d+/gi;
  }

  // Site specific parts below here

  /**
   * Extracts every advert on the current page and appends them to `this.items`.
   * An advert that cannot be parsed is logged and skipped. Does nothing
   * (apart from logging) when no page has been loaded.
   */
  async breakPage() {
    const $ = this.currentPage;

    if (!$) {
      console.error('breakPage: no page loaded');

      return;
    }

    const ads = [];

    // Plain loop instead of an async callback inside `.each()`: `.each()` does
    // not wait for promises, so failures there surfaced as unhandled rejections.
    for (const section of $('div.row.result').toArray()) {
      try {
        ads.push(await this.extractDetails(section));
      }
      catch (err) {
        console.error('Skipping advert that could not be parsed:', err.message);
      }
    }

    this.items = [...this.items, ...ads];
  }

  /**
   * Reads the fields of a single advert.
   *
   * @param {Object} part - DOM node of one result row.
   * @returns {Promise<Object>} advert record.
   * @throws {TypeError} when the advert has no `h2.title a` element with an id.
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const text = (selector) => $part(selector).text().trim();

    return {
      'title': text('.jobtitle'),
      'site': this.siteid,
      'url': this.makeUrl($part('.jobtitle').attr('href')),
      'id': $part('h2.title a').attr('id').trim(),
      'summary': text('.summary'),
      'company': text('.company') || null,
      'location': text('.location'),
      'postDate': text('.date'),
      'salary': text('.salary.no-wrap'),
      'isEasyApply': text('.iaLabel') === 'Easily apply',
      // Unix time in whole seconds (Math.floor, not `~~`, which wraps at 32 bits).
      'timestamp': Math.floor(Date.now() / 1000)
    };
  }

  /**
   * Fetches an advert's own page and replaces its summary with the full job
   * description. If the fetch fails the error is logged and the summary is
   * left as it was.
   *
   * @param {Object} item - advert record with a `url`.
   * @returns {Promise<Object>} a copy of `item`.
   */
  async getIndividualPage(item) {
    const newItem = {...item};

    console.log('Getting', item.url);

    try {
      const html = await this.getContent(item.url);
      const $ = cheerio.load(html);

      newItem.summary = $('#jobDescriptionText').text().trim();
    }
    catch (err) {
      console.error(err);
    }

    return newItem;
  }

  /**
   * Replaces `this.items` with copies enriched by getIndividualPage(),
   * fetching one advert page at a time.
   */
  async getJobPages() {
    const newItems = [];

    for (const item of this.items)
      newItems.push(await this.getIndividualPage(item));

    this.items = newItems;
  }

  /**
   * Works out the address of the next results page.
   *
   * @returns {Promise<string>} absolute URL taken from the last pagination
   *   element, or '' when there is none or no page is loaded.
   */
  async checkNext() {
    const $ = this.currentPage;

    if (!$)
      return '';

    const href = $('.pagination > *:last-child').attr('href') || '';
    // Previously the makeUrl() result was thrown away and nothing was returned.
    const next = (href !== '') ? this.makeUrl(href) : '';

    console.log(next);

    return next;
  }

  /**
   * Runs one complete scrape of the start URL: fetch, split into adverts,
   * fetch each advert's detail page, filter, store. Only the first results
   * page is processed; pagination is not followed.
   */
  async processSite() {
    console.log('Processing...');

    this.items = [];

    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.getJobPages();
    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Scrapes the advanced-search results for a location.
   * @param {string} [location='london'] - search location; URL-encoded into the query.
   */
  async go(location = 'london') {
    this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);

    await this.processSite();
  }
}
module.exports = IndeedScraper;

184
scrapers/indeed.orig.js Normal file
View File

@ -0,0 +1,184 @@
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const axios = require('axios');
const fecha = require('fecha');
const fs = require('fs');
const dbmanager = require('../lib/dbmanager');
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
/**
 * Earlier, self-contained version of the Indeed scraper (does not extend a
 * base class). Fetches one results page with axios, extracts the adverts,
 * filters them and stores them.
 */
class IndeedScraper {
  constructor() {
    this.url = '';
    this.items = [];
    this.currentPage = null;
    this.host = 'www.indeed.co.uk';
  }

  /**
   * Sets the URL that getPage() will fetch.
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores an already-parsed page (a cheerio root function) as the current page.
   * @param {Function} page
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Fetches `this.url` and, on HTTP 200, loads the parsed body as the current
   * page. A failed request is logged and leaves the current page untouched.
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    let response;

    try {
      response = await axios.get(this.url);
    }
    catch (err) {
      // Previously execution continued and dereferenced an undefined response.
      console.error(err);

      return;
    }

    console.log(response.status);

    if (response.status === 200) {
      console.log(response);
      this.loadPage(cheerio.load(response.data));
    }
  }

  /**
   * Inserts every collected item into the database and waits until all
   * inserts have settled. A failed insert is logged and does not stop the others.
   */
  async addToDB() {
    const inserts = this.items.map((item) => {
      console.log(item);

      return dbmanager.insertOne(item)
        .then((data) => {
          console.log(data);
        })
        .catch((err) => {
          console.error(err.message || 'Some error occurred while querying the database.');
        });
    });

    await Promise.all(inserts);
  }

  /**
   * Applies the reject filter, then the accept filter, to `this.items`,
   * logging the item count after each stage.
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);
    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);
    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
  }

  // Site specific parts below here

  /**
   * Extracts every advert on the current page and appends them to `this.items`.
   * An advert that cannot be parsed is logged and skipped. Does nothing
   * (apart from logging) when no page has been loaded.
   */
  async breakPage() {
    const $ = this.currentPage;

    if (!$) {
      console.error('breakPage: no page loaded');

      return;
    }

    const ads = [];

    // Plain loop instead of an async callback inside `.each()`, which does not
    // wait for promises.
    for (const section of $('div.row.result').toArray()) {
      try {
        ads.push(await this.extractDetails(section));
      }
      catch (err) {
        console.error('Skipping advert that could not be parsed:', err.message);
      }
    }

    this.items = [...this.items, ...ads];
  }

  /**
   * Reads the fields of a single advert.
   *
   * @param {Object} part - DOM node of one result row.
   * @returns {Promise<Object>} advert record.
   * @throws {TypeError} when the advert has no `h2.title a` element with an id.
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const text = (selector) => $part(selector).text().trim();

    return {
      'title': text('.jobtitle'),
      'site': 'indeed',
      'url': `https://${ this.host }${$part('.jobtitle').attr('href')}`,
      'id': $part('h2.title a').attr('id').trim(),
      'summary': text('.summary'),
      'company': text('.company') || null,
      'location': text('.location'),
      'postDate': text('.date'),
      'salary': text('.salary.no-wrap'),
      'isEasyApply': text('.iaLabel') === 'Easily apply',
      // Unix time in whole seconds (Math.floor, not `~~`, which wraps at 32 bits).
      'timestamp': Math.floor(Date.now() / 1000)
    };
  }

  /**
   * Works out the address of the next results page.
   *
   * @returns {Promise<string>} absolute URL taken from the last pagination
   *   element, or '' when there is none or no page is loaded.
   */
  async checkNext() {
    const $ = this.currentPage;

    if (!$)
      return '';

    const href = $('.pagination > *:last-child').attr('href') || '';
    const next = (href !== '') ? `https://${ this.host }${href}` : '';

    console.log(next);

    return next;
  }

  /**
   * Runs one complete scrape of the start URL: fetch, split into adverts,
   * filter, store. Only the first results page is processed; pagination is
   * not followed.
   */
  async processSite() {
    console.log('Processing...');

    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Scrapes the fixed London "javascript node" contract search.
   */
  async go() {
    this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+node&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=1&limit=50&sort=date&psf=advsrch&from=advancedsearch');

    await this.processSite();
  }
}
// Start a scrape only when this file is executed directly; merely requiring
// the module (it exports the class below) must not trigger network traffic.
if (require.main === module) {
  const ind = new IndeedScraper();

  // Report a failed run instead of leaving the promise unhandled.
  ind.go().catch((err) => console.error(err));
}
module.exports = IndeedScraper;

87
scrapers/jobserve.js Normal file
View File

@ -0,0 +1,87 @@
/**
* Created by WebStorm.
* User: martin
* Date: 16/04/2020
* Time: 16:46
*/
const Parser = require('rss-parser');
/**
 * Base class for RSS-feed based job readers: holds the feed URL and the
 * parsed feed items, and offers a generic HTTP fetch helper.
 */
class MasterReader {
  constructor() {
    this.url = '';
    this.items = [];
    this.feeditems = [];
    this.currentPage = null;
    this.hosturl = '';
    this.siteid = '';
    // Default options handed to `request` by getContent().
    this.requestOptions = {
      'url' : '',
      'proxy' : 'http://uk.proxymesh.com:31280',
      'tunnel' : true
    };
  }

  /**
   * Downloads a URL using the configured request options and resolves with
   * the response body.
   *
   * @param {string} url - address to fetch.
   * @returns {Promise<string>} response body; rejects with the request error
   *   (or with the load error if the `request` module is unavailable).
   */
  getContent(url) {
    return new Promise((resolve, reject) => {
      // This file has no top-level require for `request`, so the bare
      // identifier was undefined here; load it locally.
      const request = require('request');
      const options = Object.assign({}, this.requestOptions, { 'url': url });

      console.log(options);

      request(options, (err, _res, body) => {
        if (!err)
          resolve(body);
        else
          reject(err);
      });
    });
  }

  /**
   * Sets the feed URL that getFeed() will read.
   * @param {string} newUrl
   */
  setFeed(newUrl) {
    this.url = newUrl;
  }

  /**
   * Downloads and parses the feed at `this.url`, storing a copy of its items
   * in `this.feeditems`. Parser/network errors propagate to the caller.
   */
  async getFeed() {
    console.log('>> getFeed: fetching', this.url);

    const parser = new Parser();
    const feed = await parser.parseURL(this.url);

    console.log(feed);
    this.feeditems = [...feed.items];
  }
}
/**
 * Feed reader preconfigured for jobserve.com.
 */
class JobserveReader extends MasterReader {
  constructor(props) {
    super(props);
    this.hosturl = 'https://jobserve.com';
    this.siteid = 'jobserve';
  }

  /**
   * Loads the feed. Items are not processed any further yet; a message is
   * logged when the feed turned out to be empty.
   */
  async processFeed() {
    await this.getFeed();

    const hasItems = this.feeditems.length > 0;

    if (!hasItems)
      console.log('Nothing to process');
  }
}
// Script entry: read the saved-search feed once.
const jobServeReader = new JobserveReader();

jobServeReader.setFeed('https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss');

// processFeed() is async; report a failed fetch/parse instead of leaving the
// rejection unhandled.
jobServeReader.processFeed()
  .catch((err) => console.error(err));

138
scrapers/totaljobs.js Normal file
View File

@ -0,0 +1,138 @@
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const MasterScraper = require('../lib/scraper');
/**
 * Scraper for contract listings on www.totaljobs.com.
 *
 * Implements the site-specific hooks of MasterScraper: splitting a results
 * page into adverts, reading each advert's fields and driving the whole run
 * via go().
 */
class TotaljobsScraper extends MasterScraper {
  constructor() {
    super();
    this.siteurl = 'www.totaljobs.com';
    this.siteid = 'totaljobs';
    // Replaces the inherited request options with a bare `url` entry.
    this.requestOptions = {
      'url' : ''
    };
  }

  // Site specific parts below here

  /**
   * Extracts every advert on the current page and appends them to `this.items`.
   * An advert that cannot be parsed is logged and skipped. Does nothing
   * (apart from logging) when no page has been loaded.
   */
  async breakPage() {
    const $ = this.currentPage;

    if (!$) {
      console.error('breakPage: no page loaded');

      return;
    }

    const ads = [];

    // Plain loop instead of an async callback inside `.each()`: `.each()` does
    // not wait for promises, so failures there surfaced as unhandled rejections.
    for (const section of $('div.job').toArray()) {
      try {
        ads.push(await this.extractDetails(section));
      }
      catch (err) {
        console.error('Skipping advert that could not be parsed:', err.message);
      }
    }

    this.items = [...this.items, ...ads];
  }

  /**
   * Reads the fields of a single advert.
   *
   * @param {Object} part - DOM node of one `div.job` element.
   * @returns {Promise<Object>} advert record; `isEasyApply` is always false for this site.
   * @throws {TypeError} when no `div.job` element with an id is found.
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const text = (selector) => $part(selector).text().trim();

    return {
      'title': text('.job-title'),
      'url': $part('.job-title a').attr('href'),
      'id': $part('div.job').attr('id').trim(),
      'summary': text('p.job-intro'),
      'company': text('.company') || null,
      'location': text('.location > span'),
      'postDate': text('.date-posted'),
      'salary': text('.salary'),
      'isEasyApply': false,
      'site': this.siteid,
      // Unix time in whole seconds (Math.floor, not `~~`, which wraps at 32 bits).
      'timestamp': Math.floor(Date.now() / 1000)
    };
  }

  /**
   * Fetches an advert's own page and replaces its summary with the full job
   * description. If the fetch fails the error is logged and the summary is
   * left as it was.
   *
   * @param {Object} item - advert record with a `url`.
   * @returns {Promise<Object>} a copy of `item`.
   */
  async getIndividualPage(item) {
    const newItem = {...item};

    console.log('Getting', item.url);

    try {
      const html = await this.getContent(item.url);

      console.log(html);

      const $ = cheerio.load(html);

      newItem.summary = $('div.job-description').text().trim();
    }
    catch (err) {
      console.error(err);
    }

    return newItem;
  }

  /**
   * Replaces `this.items` with copies enriched by getIndividualPage(),
   * fetching one advert page at a time.
   */
  async getJobPages() {
    const newItems = [];

    for (const item of this.items) {
      console.log(item.title);
      newItems.push(await this.getIndividualPage(item));
    }

    this.items = newItems;
  }

  /**
   * Works out the address of the next results page.
   *
   * @returns {Promise<string>} absolute URL taken from the last pagination
   *   element, or '' when there is none or no page is loaded.
   */
  async checkNext() {
    const $ = this.currentPage;

    if (!$)
      return '';

    const href = $('.pagination > *:last-child').attr('href') || '';
    // Previously the makeUrl() result was thrown away and nothing was returned.
    const next = (href !== '') ? this.makeUrl(href) : '';

    console.log(next);

    return next;
  }

  /**
   * Runs one complete scrape of the start URL: fetch, split into adverts,
   * filter, store. Only the first results page is processed, and the
   * per-advert detail pages are not fetched in this flow (getJobPages() is
   * not called).
   */
  async processSite() {
    console.log('Processing...');

    this.items = [];

    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Scrapes the contract search results for a location.
   * @param {string} [location='london'] - search location; URL-encoded into the path.
   */
  async go(location = 'london') {
    this.setStartUrl(`https://www.totaljobs.com/jobs/contract/html-or-vue-or-vuejs-or-web-or-sql-or-delphi-or-vb-or-vbscript-or-php-or-ajax-or-mysql-or-sqlserver-or-javascript-or-node-or-nodejs-or-svelte-or-sveltejs-not-react/in-${encodeURIComponent(location)}?q=Html+Or+Vue+Or+Vuejs+Or+Web+Or+Sql+Or+Delphi+Or+Vb+Or+Vbscript+Or+Php+Or+Ajax+Or+Mysql+Or+Sqlserver+Or+Javascript+Or+Node+Or+Nodejs+Or+Svelte+Or+Sveltejs+NOT+React&postedwithin=3&radius=20`);

    await this.processSite();
  }
}
module.exports = TotaljobsScraper;

View File

@ -0,0 +1,62 @@
/**
* Created by WebStorm.
* User: martin
* Date: 18/05/2020
* Time: 13:39
*/
const dbmanager = require('../../lib/dbmanager');
const killNLDoubleSpace = /(\\n)\s{2,}|(\\n)|\s{2,}/g;
exports.getList = (req, res) => {
console.log('>getList req', req.params);
/* if(!req.params.id)
return res.status(400).send({
'message': 'Job id missing'
});*/
dbmanager.getList()
.then((data) => {
const processed = data.map((item) => {
const date = new Date( item.timestamp * 1000);
item.date = date.toLocaleString();
item.title = item.title.replace(killNLDoubleSpace, ' ');
return item;
});
res.send(processed);
})
.catch((err) => {
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
};
exports.getJob = (req, res) => {
console.log('>getJob req', req.params);
if(!req.params.id)
return res.status(500).send({
'message': 'Job id missing'
});
const id = req.params.id;
dbmanager.getOne(id)
.then((data) => {
const item = Object.assign({}, data);
const date = new Date( item.timestamp * 1000);
item.date = date.toLocaleString();
item.title = item.title.replace(killNLDoubleSpace, ' ');
res.send(item);
})
.catch((err) => {
res.status(500).send({
'message': err.message || 'Some error occurred while querying the database.'
});
});
};

View File

@ -0,0 +1,17 @@
/**
* Created by WebStorm.
* User: martin
* Date: 18/05/2020
* Time: 13:39
*/
const jobs = require('../controllers/jobs.controller');
module.exports = (app) => {
app.route('/jobs')
.get(jobs.getList);
app.route('/jobs/:id')
.get(jobs.getJob);
};

49
server/server.js Normal file
View File

@ -0,0 +1,49 @@
/**
* Created by WebStorm.
* User: martin
* Date: 14/05/2020
* Time: 09:13
*/
require('dotenv').config();
const express = require('express');
const bodyParser = require('body-parser');
const session = require('express-session');
const path = require('path');
const helmet = require('helmet');
const cors = require('cors');
// Express application serving the static front end and the jobs API.
// Environment variables from .env are loaded once by the dotenv call at the top of this file.
const app = express();

const serverPort = process.env.PORT || 3000;
const sitePath = '../live';

// Take the session secret from the environment when provided. The literal
// fallback keeps existing set-ups working, but a secret committed to source
// control should be replaced by setting SESSION_SECRET.
const sessionSecret = process.env.SESSION_SECRET || 'Z4hc5.64X1e';

// app.use(cors());
// app.use(helmet());

app.use(session({
  'secret': sessionSecret,
  'resave': true,
  'saveUninitialized': true
}));

// Static files are served from `sitePath`, relative to this file.
app.use(express.static(path.join(__dirname, sitePath)));

// parse requests of content-type - application/x-www-form-urlencoded
app.use(bodyParser.urlencoded({ 'extended': true }));

// parse requests of content-type - application/json
app.use(bodyParser.json());

require('./routes/jobs.route')(app);

app.listen(serverPort, () => {
  console.log(`Server is listening on port ${serverPort}`);
});

console.log('Job Server started');

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1779
test/data/indeed/page.html Normal file

File diff suppressed because one or more lines are too long

2438
test/data/indeed/page2.html Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

41
test/indeed.js Normal file
View File

@ -0,0 +1,41 @@
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:56
*/
const tape = require('tape');
const _test = require('tape-promise').default; // <---- notice 'default'
const test = _test(tape); // decorate tape
const fs = require('fs');
const cheerio = require('cheerio');
const IndeedScraper = require('../scrapers/indeed');
const indeedScraper = new IndeedScraper();

// Saved results page used as the fixture. Resolved relative to this file (as
// test/totaljobs.js does) so the test does not depend on the working directory.
const page = fs.readFileSync(`${__dirname}/data/indeed/page2.html`);

// Parses the saved page, extracts its adverts and runs them through the
// filters. Detail pages are not fetched and nothing is written to the database.
test.test('Test Indeed scraper', async t => {
  const $ = cheerio.load(page);

  indeedScraper.loadPage($);

  await indeedScraper.breakPage();
  await indeedScraper.filterAdverts();

  t.end();
});

41
test/totaljobs.js Normal file
View File

@ -0,0 +1,41 @@
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:56
*/
const tape = require('tape');
const _test = require('tape-promise').default; // <---- notice 'default'
const test = _test(tape); // decorate tape
const fs = require('fs');
const cheerio = require('cheerio');
const TotaljobsScraper = require('../scrapers/totaljobs');
const totaljobsScraper = new TotaljobsScraper();

console.log(`${__dirname}`);

// Saved results page used as the fixture, located next to this test file.
const fixtureFile = `${__dirname}/data/totaljobs/totaljobs-2020-04-16--121504.html`;
const page = fs.readFileSync(fixtureFile);

// Parses the saved page, extracts its adverts, fetches each advert's detail
// page, prints the collected items and runs them through the filters.
// Nothing is written to the database.
test.test('Test Totaljobs scraper', async (t) => {
  totaljobsScraper.loadPage(cheerio.load(page));

  await totaljobsScraper.breakPage();
  await totaljobsScraper.getJobPages();

  console.log(totaljobsScraper.items);

  await totaljobsScraper.filterAdverts();

  t.end();
});
});