init
This commit is contained in:
commit
5001bbd798
55
.eslintrc.json
Normal file
55
.eslintrc.json
Normal file
@ -0,0 +1,55 @@
|
||||
{
|
||||
"parserOptions": {
|
||||
"ecmaVersion": 2017,
|
||||
"sourceType": "module",
|
||||
"ecmaFeatures": {
|
||||
"jsx": false
|
||||
}
|
||||
},
|
||||
"env": {
|
||||
"browser": true,
|
||||
"node": true,
|
||||
"es6": true
|
||||
},
|
||||
"rules": {
|
||||
"arrow-spacing": "error",
|
||||
"block-scoped-var": "error",
|
||||
"block-spacing": "error",
|
||||
"brace-style": ["error", "stroustrup", {}],
|
||||
"camelcase": "error",
|
||||
"comma-dangle": ["error", "never"],
|
||||
"comma-spacing": ["error", { "before": false, "after": true }],
|
||||
"comma-style": [1, "last"],
|
||||
"consistent-this": [1, "_this"],
|
||||
"curly": [1, "multi"],
|
||||
"eol-last": 1,
|
||||
"eqeqeq": 1,
|
||||
"func-names": 1,
|
||||
"indent": ["error", 2, { "SwitchCase": 1 }],
|
||||
"lines-around-comment": ["error", { "beforeBlockComment": true, "allowArrayStart": true }],
|
||||
"max-len": [1, 180, 2], // max 180 chars per line, a tab counts as 2 characters
|
||||
"new-cap": 1,
|
||||
"newline-before-return": "error",
|
||||
"no-array-constructor": 1,
|
||||
"no-inner-declarations": [1, "both"],
|
||||
"no-mixed-spaces-and-tabs": 1,
|
||||
"no-multi-spaces": 2,
|
||||
"no-new-object": 1,
|
||||
"no-shadow-restricted-names": 1,
|
||||
"object-curly-spacing": ["error", "always"],
|
||||
"padded-blocks": ["error", { "blocks": "never", "switches": "always" }],
|
||||
"prefer-const": "error",
|
||||
"prefer-template": "error",
|
||||
"one-var": 0,
|
||||
"quote-props": ["error", "always"],
|
||||
"quotes": [1, "single"],
|
||||
"radix": 1,
|
||||
"semi": [1, "always"],
|
||||
"space-before-blocks": [1, "always"],
|
||||
"space-infix-ops": 1,
|
||||
"vars-on-top": 1,
|
||||
"no-multiple-empty-lines": ["error", { "max": 1, "maxEOF": 1 }],
|
||||
"spaced-comment": ["error", "always", { "markers": ["/"] }]
|
||||
}
|
||||
|
||||
}
|
148
.gitignore
vendored
Normal file
148
.gitignore
vendored
Normal file
@ -0,0 +1,148 @@
|
||||
# Created by .ignore support plugin (hsz.mobi)
|
||||
### Node template
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
|
||||
# Runtime data
|
||||
pids
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
coverage
|
||||
|
||||
# nyc test coverage
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
node_modules/
|
||||
jspm_packages/
|
||||
|
||||
# Typescript v1 declaration files
|
||||
typings/
|
||||
|
||||
# Optional npm cache directory
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
.eslintcache
|
||||
|
||||
# Optional REPL history
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variables file
|
||||
.env
|
||||
|
||||
### macOS template
|
||||
# General
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Icon must end with two \r
|
||||
Icon
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear in the root of a volume
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Directories potentially created on remote AFP share
|
||||
.AppleDB
|
||||
.AppleDesktop
|
||||
Network Trash Folder
|
||||
Temporary Items
|
||||
.apdisk
|
||||
### JetBrains template
|
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
.idea/
|
||||
# User-specific stuff:
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
.idea/dictionaries
|
||||
|
||||
# Sensitive or high-churn files:
|
||||
.idea/**/dataSources/
|
||||
.idea/**/dataSources.ids
|
||||
.idea/**/dataSources.xml
|
||||
.idea/**/dataSources.local.xml
|
||||
.idea/**/sqlDataSources.xml
|
||||
.idea/**/dynamic.xml
|
||||
.idea/**/uiDesigner.xml
|
||||
|
||||
# Gradle:
|
||||
.idea/**/gradle.xml
|
||||
.idea/**/libraries
|
||||
|
||||
# CMake
|
||||
cmake-build-debug/
|
||||
|
||||
# Mongo Explorer plugin:
|
||||
.idea/**/mongoSettings.xml
|
||||
|
||||
## File-based project format:
|
||||
*.iws
|
||||
|
||||
## Plugin-specific files:
|
||||
|
||||
# IntelliJ
|
||||
out/
|
||||
|
||||
# mpeltonen/sbt-idea plugin
|
||||
.idea_modules/
|
||||
|
||||
# JIRA plugin
|
||||
atlassian-ide-plugin.xml
|
||||
|
||||
# Cursive Clojure plugin
|
||||
.idea/replstate.xml
|
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||
com_crashlytics_export_strings.xml
|
||||
crashlytics.properties
|
||||
crashlytics-build.properties
|
||||
fabric.properties
|
||||
|
||||
# Elastic Beanstalk Files
|
||||
.elasticbeanstalk/*
|
||||
!.elasticbeanstalk/*.cfg.yml
|
||||
!.elasticbeanstalk/*.global.yml
|
||||
/src/bundle.js
|
||||
/src/bundle.js.map
|
||||
/live/
|
||||
!/output/
|
BIN
db/jobs.db
Normal file
BIN
db/jobs.db
Normal file
Binary file not shown.
0
db/menu.db
Normal file
0
db/menu.db
Normal file
6
dist/build/bundle.css
vendored
Normal file
6
dist/build/bundle.css
vendored
Normal file
File diff suppressed because one or more lines are too long
18
dist/build/bundle.css.map
vendored
Normal file
18
dist/build/bundle.css.map
vendored
Normal file
File diff suppressed because one or more lines are too long
1
dist/build/bundle.js
vendored
Normal file
1
dist/build/bundle.js
vendored
Normal file
File diff suppressed because one or more lines are too long
1
dist/build/bundle.js.map
vendored
Normal file
1
dist/build/bundle.js.map
vendored
Normal file
File diff suppressed because one or more lines are too long
BIN
dist/favicon.png
vendored
Normal file
BIN
dist/favicon.png
vendored
Normal file
Binary file not shown.
After Width: | Height: | Size: 25 KiB |
0
dist/global.css
vendored
Normal file
0
dist/global.css
vendored
Normal file
18
dist/index.html
vendored
Normal file
18
dist/index.html
vendored
Normal file
@ -0,0 +1,18 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset='utf-8'>
|
||||
<meta name='viewport' content='width=device-width,initial-scale=1'>
|
||||
|
||||
<title>Svelte app</title>
|
||||
|
||||
<link rel='icon' type='image/png' href='/favicon.png'>
|
||||
<link rel='stylesheet' href='/global.css'>
|
||||
<link rel='stylesheet' href='/build/bundle.css'>
|
||||
|
||||
<script defer src='/build/bundle.js'></script>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
</body>
|
||||
</html>
|
27
grabber.js
Normal file
27
grabber.js
Normal file
@ -0,0 +1,27 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 16/04/2020
|
||||
* Time: 23:35
|
||||
|
||||
*/
|
||||
const CronJob = require('cron').CronJob;
|
||||
const IndeedScraper = require('./scrapers/indeed');
|
||||
const TotaljobsScraper = require('./scrapers/totaljobs');
|
||||
|
||||
/**
 * Entry point: builds one scraper per job site and schedules them with cron.
 *
 * The pattern '5 6-23/3 * * *' fires at minute 5 of every third hour from
 * 06:00 through 23:00. The trailing `null, true` arguments are passed to
 * CronJob as-is (no completion callback, start immediately — per the `cron`
 * package's constructor signature; confirm against the installed version).
 */
(async function () {
  console.log('Started..');

  const indeedScraper = new IndeedScraper();
  const totaljobsScraper = new TotaljobsScraper();

  // For every location Indeed is scraped first, then Totaljobs.
  const scrapers = [indeedScraper, totaljobsScraper];
  const locations = ['london', 'glasgow', 'edinburgh', 'milton keynes'];

  new CronJob('5 6-23/3 * * *', async function() {
    for (const location of locations) {
      for (const scraper of scrapers) {
        try {
          // Runs are strictly sequential so the sites are never hit in parallel.
          await scraper.go(location);
        }
        catch (err) {
          // One failing site/location must not abort the rest of the tick or
          // surface as an unhandled promise rejection.
          console.error(`Scrape failed for '${location}':`, err);
        }
      }
    }
  }, null, true);
})();
|
6
lib/connect.js
Normal file
6
lib/connect.js
Normal file
@ -0,0 +1,6 @@
|
||||
const sqlite3 = require('sqlite3').verbose();
|
||||
// Location of the SQLite file, resolved relative to this module's directory.
const dbPath = `${__dirname}/../db/jobs.db`;

// Single shared connection; every module requiring this file gets the same handle.
const db = new sqlite3.Database(dbPath);

// Log the resolved path so it is visible which database file is in use.
console.log(dbPath);

module.exports = db;
|
66
lib/dbmanager.js
Normal file
66
lib/dbmanager.js
Normal file
@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 16/04/2020
|
||||
* Time: 10:00
|
||||
|
||||
*/
|
||||
|
||||
const db = require('./connect');
|
||||
|
||||
/**
 * Returns a shallow copy of a job record that is ready for insertion.
 *
 * The only field that is transformed is `isEasyApply`, which is converted
 * from a truthy/falsy value to the integers 1/0. The input is not mutated.
 *
 * @param {Object} _obj - job record as produced by a scraper
 * @returns {Object} copy of `_obj` with `isEasyApply` set to 1 or 0
 */
function prepareData(_obj) {
  return {
    ..._obj,
    'isEasyApply': _obj.isEasyApply ? 1 : 0
  };
}
|
||||
|
||||
exports.insertOne = (data) => {
|
||||
const sql = 'INSERT INTO jobs VALUES (?,?,?,?,?,?,?,?,?,?,?,?)';
|
||||
|
||||
const workObj = prepareData(data);
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
db.run(sql, [null, workObj.title, workObj.site, workObj.url, workObj.id, workObj.summary, workObj.company, workObj.location, workObj.postDate, workObj.salary, workObj.isEasyApply, workObj.timestamp], function(err) {
|
||||
if (err)
|
||||
reject(err);
|
||||
|
||||
resolve({ 'msg':'Row inserted', '_id': this.lastID });
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
exports.getList = () => {
|
||||
const outgoing = [];
|
||||
const sql = 'select _id, title, site, company, timestamp from jobs order by _id desc';
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
db.all(sql, [], (err, rows) => {
|
||||
if (err)
|
||||
reject(err);
|
||||
|
||||
rows.forEach((row) => {
|
||||
outgoing.push(row);
|
||||
});
|
||||
|
||||
resolve(outgoing) ;
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
exports.getOne = (id) => {
|
||||
const sql = 'SELECT * FROM jobs WHERE _id = ?';
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
db.get(sql, [id], (err, row) => {
|
||||
if (err)
|
||||
reject(err);
|
||||
|
||||
if (!err) resolve(row);
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
// select _id, title, site, company, timestamp from jobs order by _id desc;
|
18
lib/filter_3_days.js
Normal file
18
lib/filter_3_days.js
Normal file
@ -0,0 +1,18 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
function inDays(d1, d2) {
|
||||
var t2 = d2.getTime();
|
||||
var t1 = d1.getTime();
|
||||
|
||||
return parseInt((t2-t1)/(24*3600*1000));
|
||||
}
|
||||
|
||||
var now = new Date();
|
||||
var then = new Date(itemOptions.date);
|
||||
|
||||
var d = inDays(then,now);
|
||||
|
||||
if (d < 3)
|
||||
return itemOptions;
|
||||
else
|
||||
return -1;
|
||||
};
|
14
lib/filter_for_local_jobs.js
Normal file
14
lib/filter_for_local_jobs.js
Normal file
@ -0,0 +1,14 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
if (!item || !itemOptions) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
return only:
|
||||
item.description, item.title contains: ajax, asp, javascript, php, vmware, sql, classic
|
||||
|
||||
*/
|
||||
|
||||
// This plugin removes all items by returning -1 instead of the processed itemOptions
|
||||
return itemOptions;
|
||||
};
|
7
lib/filter_glasgow.js
Normal file
7
lib/filter_glasgow.js
Normal file
@ -0,0 +1,7 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
if(itemOptions.description.indexOf('Glasgow') > 0 || itemOptions.description.indexOf('London') > 0)
|
||||
|
||||
return itemOptions;
|
||||
else
|
||||
return -1;
|
||||
};
|
18
lib/filter_last_week.js
Normal file
18
lib/filter_last_week.js
Normal file
@ -0,0 +1,18 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
function inDays(d1, d2) {
|
||||
var t2 = d2.getTime();
|
||||
var t1 = d1.getTime();
|
||||
|
||||
return parseInt((t2-t1)/(24*3600*1000));
|
||||
}
|
||||
|
||||
var now = new Date();
|
||||
var then = new Date(itemOptions.date);
|
||||
|
||||
var d = inDays(then,now);
|
||||
|
||||
if (d <= 7)
|
||||
return itemOptions;
|
||||
else
|
||||
return -1;
|
||||
};
|
8
lib/filter_location.js
Normal file
8
lib/filter_location.js
Normal file
@ -0,0 +1,8 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
const patt = /(glasgow|london|edinburgh|milton keynes)/ig;
|
||||
const result = patt.test(itemOptions.description);
|
||||
const resultB = patt.test(itemOptions.title);
|
||||
console.log('Location:', (result || resultB === true) ? 'Pass' : 'Reject');
|
||||
|
||||
return (result || resultB === true) ? itemOptions : -1;
|
||||
};
|
16
lib/filter_md_jobs.js
Normal file
16
lib/filter_md_jobs.js
Normal file
@ -0,0 +1,16 @@
|
||||
module.exports = function (item) {
|
||||
const patt = /(full\s?stack|front\s?end|html|html5|es6|react|knockout|ember|vue|riotjs|css|javascript|sql|node|backbone|git|gulp|jquery|express|£\dk|Data Warehouse Developer|iot|internet of things)\W/ig;
|
||||
const result = patt.test(item.title);
|
||||
const resultB = patt.test(item.summary);
|
||||
|
||||
console.log('My Filter:', (result || resultB === true) ? 'Pass' : 'Reject');
|
||||
|
||||
if (!(result || resultB === true)) {
|
||||
console.log('Result', result);
|
||||
console.log('ResultB', resultB);
|
||||
console.log(item);
|
||||
}
|
||||
|
||||
return (result || resultB === true) ;
|
||||
|
||||
};
|
8
lib/filter_out_all_articles.js
Normal file
8
lib/filter_out_all_articles.js
Normal file
@ -0,0 +1,8 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
if (!item || !itemOptions) {
|
||||
return;
|
||||
}
|
||||
|
||||
// This plugin removes all items by returning -1 instead of the processed itemOptions
|
||||
return -1;
|
||||
};
|
18
lib/filter_reject.js
Normal file
18
lib/filter_reject.js
Normal file
@ -0,0 +1,18 @@
|
||||
module.exports = function (item) {
|
||||
const patt = /(Teachers?|Technical Writer|Data Analyst|WebLogic|WebSphere|Data Scientist|Change Manager|T24|Test Analyst|Insight Analyst|application tester|senior tester|Salesforce|QlikView|Navision|Murex|seo|django|drupal|SHAREPOINT|per annum|ServiceNow|Test Lead|User Researcher|Service Management|\(PERM\)|£\d.K|Remedy|ITSM|Symfony|Zend|Full Time|Technical Business Analyst|BUSINESS ANALYST|AUTOMATION TESTER|FIELD TECHNICIAN|websphere administrator|Research Data Scientist)/ig;
|
||||
|
||||
const engineers = /(Support|Devops|Planning|security|Postgresql|network|sccm|test|data|imac|firewall|vmware)+(?:\s)(?=Engineer)/ig;
|
||||
const developers = /(Java|PHP|Graduate|Access|Oracle ADF|SHAREPOINT|Ruby on Rails|Java Software|IOS|Qlikview|c#|c\+\+|\.net|bi|go lang)+(?:\s)(?=Developer)/ig;
|
||||
const architects = /(Java|PHP|Microsoft)+(?:\s)(?=Architect)/ig;
|
||||
|
||||
const antiAd = /sja\d+/gi;
|
||||
|
||||
|
||||
const result = patt.test(item.summary) || engineers.test(item.summary) || developers.test(item.summary) || architects.test(item.summary);
|
||||
const resultB = patt.test(item.title) || engineers.test(item.title) || developers.test(item.title) || architects.test(item.title);
|
||||
const resultC = antiAd.test(item.id);
|
||||
|
||||
console.log('Reject:', (result || resultB || resultC === true) ? 'Reject' : 'Pass');
|
||||
|
||||
return (!(result || resultB || resultC === true));
|
||||
};
|
6
lib/filter_show.js
Normal file
6
lib/filter_show.js
Normal file
@ -0,0 +1,6 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
console.log('---');
|
||||
console.log(itemOptions.title);
|
||||
console.log(itemOptions.permalink || itemOptions.url);
|
||||
return itemOptions;
|
||||
};
|
20
lib/filter_today_only.js
Normal file
20
lib/filter_today_only.js
Normal file
@ -0,0 +1,20 @@
|
||||
module.exports = function (item, itemOptions, source) {
|
||||
function inDays(d1, d2) {
|
||||
var t2 = d2.getTime();
|
||||
var t1 = d1.getTime();
|
||||
|
||||
return parseInt((t2 - t1) / (24 * 3600 * 1000));
|
||||
}
|
||||
|
||||
var now = new Date();
|
||||
var then = new Date(itemOptions.date);
|
||||
|
||||
var d = inDays(then, now);
|
||||
|
||||
console.log('Today:', (d === 0) ? 'Pass' : 'Reject');
|
||||
|
||||
if (d === 0)
|
||||
return itemOptions;
|
||||
else
|
||||
return -1;
|
||||
};
|
165
lib/scraper.js
Normal file
165
lib/scraper.js
Normal file
@ -0,0 +1,165 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 15/04/2020
|
||||
* Time: 11:55
|
||||
|
||||
*/
|
||||
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
const request = require('request');
|
||||
const axios = require('axios');
|
||||
const fecha = require('fecha');
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
const dbmanager = require('../lib/dbmanager');
|
||||
|
||||
const filterReject = require('../lib/filter_reject');
|
||||
const filterAccept = require('../lib/filter_md_jobs');
|
||||
|
||||
/**
 * Base class for the site scrapers.
 *
 * Holds the state of one scraping run (start URL, parsed page, collected
 * adverts) and the site-independent steps: fetching a page, filtering the
 * collected adverts and writing them to the database. Site-specific behaviour
 * lives in the empty hook methods at the bottom, which subclasses override.
 */
class MasterScraper {

  constructor() {
    this.url = '';            // URL fetched by getPage()
    this.items = [];          // adverts collected so far
    this.currentPage = null;  // cheerio handle of the last page loaded
    this.hosturl = '';
    this.siteid = '';
    this.useStone = false;    // passed to getContent() by getPage()
    this.requestOptions = {
      'url' : '',
      'proxy' : 'http://uk.proxymesh.com:31280',
      'tunnel' : true
    };
  }

  /**
   * Sets the URL that the next getPage() call will fetch.
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores a parsed page as the current page.
   * @param {Function} page - cheerio handle returned by cheerio.load()
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Downloads a URL with the `request` library, using a copy of
   * this.requestOptions.
   *
   * @param {string} url - address to fetch
   * @param {boolean} [useStone=false] - when true the request goes to
   *   `http://45.33.114.116:8080/<url-encoded target>` instead of `url`
   *   (presumably a relay service — confirm)
   * @returns {Promise<string>} resolves with the response body; rejects with
   *   the request error
   */
  getContent(url, useStone = false) {
    return new Promise((resolve, reject) => {
      const options = Object.assign({}, this.requestOptions);

      if (useStone)
        options.url = `http://45.33.114.116:8080/${encodeURIComponent(url)}`;
      else
        options.url = url;

      console.log(options);

      request(options, (err, _res, body) => {
        if (!err)
          resolve(body);
        else
          reject(err);
      });
    });
  }

  /**
   * Fetches this.url and stores the parsed document via loadPage().
   *
   * Failures are logged and not rethrown; in that case currentPage keeps its
   * previous value.
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    try {
      const html = await this.getContent(this.url, this.useStone);
      const $ = cheerio.load(html);

      this.loadPage($);
    }
    catch (err) {
      console.error(err);
    }
  }

  /**
   * Inserts every collected advert into the database, one after another.
   *
   * Each insert is awaited, so the returned promise settles only after all
   * rows have been attempted. A failed insert is logged and does not stop the
   * remaining ones.
   */
  async addToDB() {
    for (const item of this.items) {
      console.log(item);

      try {
        const data = await dbmanager.insertOne(item);

        console.log(data);
      }
      catch (err) {
        console.error(err.message || 'Some error occurred while querying the database.');
      }
    }
  }

  /**
   * Narrows this.items: first drops adverts refused by the reject filter,
   * then keeps only those the accept filter approves. Logs the item count
   * before and after each step.
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);

    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);

    console.log(`After accept ${this.items.length} items...`);
  }

  /**
   * Builds an absolute https URL on the scraper's site.
   * Reads this.siteurl, which this base class does not set itself.
   * @param {string} appended - path and query to append to the host
   * @returns {string}
   */
  makeUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Currently produces exactly the same URL as makeUrl().
   * @param {string} appended - path and query to append to the host
   * @returns {string}
   */
  makeProxyUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  // Site specific parts below here — empty hooks for subclasses to override.

  async breakPage() {

  }

  async extractDetails(part) {

  }

  async checkNext() {

  }

  async processSite() {

  }

  async getIndividualPage() {

  }

  async getJobPages() {

  }

  async go() {

  }

}
|
||||
|
||||
module.exports = MasterScraper;
|
2763
package-lock.json
generated
Normal file
2763
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
31
package.json
Normal file
31
package.json
Normal file
@ -0,0 +1,31 @@
|
||||
{
|
||||
"name": "jobscraper",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"grabber": "node grabber.js"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"axios": "^0.19.2",
|
||||
"body-parser": "^1.19.0",
|
||||
"cheerio": "^1.0.0-rc.3",
|
||||
"cron": "^1.8.2",
|
||||
"dotenv": "^8.2.0",
|
||||
"eslint": "^6.8.0",
|
||||
"express": "^4.17.1",
|
||||
"fecha": "^4.2.0",
|
||||
"present": "^1.0.0",
|
||||
"rss-parser": "^3.7.6",
|
||||
"sqlite3": "^4.1.1",
|
||||
"tape": "^4.13.2",
|
||||
"tape-promise": "^4.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"cors": "^2.8.5",
|
||||
"express-session": "^1.17.1",
|
||||
"helmet": "^3.22.0"
|
||||
}
|
||||
}
|
150
scrapers/indeed.js
Normal file
150
scrapers/indeed.js
Normal file
@ -0,0 +1,150 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 15/04/2020
|
||||
* Time: 11:55
|
||||
|
||||
*/
|
||||
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
const MasterScraper = require('../lib/scraper');
|
||||
|
||||
/**
 * Scraper for www.indeed.co.uk search results.
 *
 * Supplies the site-specific steps on top of MasterScraper: splitting a result
 * page into adverts, extracting advert fields, fetching each advert's own
 * page for the full description, and building the search URL.
 */
class IndeedScraper extends MasterScraper {

  constructor() {
    super();
    this.siteurl = 'www.indeed.co.uk';
    this.siteid = 'indeed';
    this.useStone = true;

    // Replaces the base class options, so no proxy settings are carried over.
    this.requestOptions = {
      'url' : ''
    };

    this.antiAd = /sja\d+/gi;
  }

  // Site specific parts below here

  /**
   * Splits the current result page into adverts and appends them to
   * this.items.
   *
   * Adverts are processed one at a time. An advert whose details cannot be
   * extracted is logged and skipped instead of rejecting from inside a
   * cheerio `.each` callback, where nothing would catch it. Does nothing
   * when no page has been loaded.
   */
  async breakPage() {
    const $ = this.currentPage;

    if (typeof $ !== 'function') {
      console.error('breakPage: no page loaded, nothing to break up');

      return;
    }

    const ads = [];
    const sections = $('div.row.result').toArray();

    for (const section of sections) {
      try {
        const ad = await this.extractDetails(section);

        if (ad !== null)
          ads.push(ad);
      }
      catch (err) {
        console.error('breakPage: skipping advert that could not be parsed:', err.message);
      }
    }

    console.log(ads);

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert record from one result element.
   *
   * @param {Object} part - one `div.row.result` element from the result page
   * @returns {Promise<Object>} record with title, site, url, id, summary,
   *   company (null when empty), location, postDate, salary, isEasyApply and
   *   timestamp (Unix time in whole seconds)
   * @throws {TypeError} when the advert has no `h2.title a` id attribute
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const textOf = (selector) => $part(selector).text().trim();

    // Math.floor rather than `~~`, which wraps once the value exceeds 2^31 - 1.
    const now = Math.floor(Date.now() / 1000);

    return {
      'title': textOf('.jobtitle'),
      'site': this.siteid,
      'url': this.makeUrl($part('.jobtitle').attr('href')),
      'id': $part('h2.title a').attr('id').trim(),
      'summary': textOf('.summary'),
      'company': textOf('.company') || null,
      'location': textOf('.location'),
      'postDate': textOf('.date'),
      'salary': textOf('.salary.no-wrap'),
      'isEasyApply': textOf('.iaLabel') === 'Easily apply',
      'timestamp': now
    };
  }

  /**
   * Fetches an advert's own page and returns a copy of the advert whose
   * summary is replaced by the text of `#jobDescriptionText`.
   *
   * On a fetch or parse failure the error is logged and the copy is returned
   * with its original summary.
   *
   * @param {Object} item - advert record with a `url`
   * @returns {Promise<Object>} shallow copy of `item`
   */
  async getIndividualPage(item) {
    const newItem = {...item};

    console.log('Getting', item.url);

    try {
      const html = await this.getContent(item.url);
      const $ = cheerio.load(html);

      newItem.summary = $('#jobDescriptionText').text().trim();
    }
    catch (err) {
      console.error(err);
    }

    return newItem;
  }

  /**
   * Replaces every advert in this.items with its detailed version, fetching
   * the advert pages one after another.
   */
  async getJobPages() {
    const detailed = [];

    for (const item of this.items)
      detailed.push(await this.getIndividualPage(item));

    this.items = detailed;
  }

  /**
   * Reads the link of the last pagination element on the current page.
   *
   * @returns {Promise<string>} absolute URL built from that link, or '' when
   *   the element has no href
   */
  async checkNext() {
    const $ = this.currentPage;
    const href = $('.pagination > *:last-child').attr('href') || '';
    const next = (href !== '') ? this.makeUrl(href) : '';

    console.log(next);

    return next;
  }

  /**
   * Runs one full pass over the start URL: fetch, split into adverts, fetch
   * each advert page, filter, and store. Only the first result page is
   * processed; the next-page link is looked up but not followed.
   */
  async processSite() {
    console.log('Processing...');

    this.items = [];

    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.getJobPages();
    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Scrapes contract adverts for one location.
   * @param {string} [location='london'] - value for the search's `l` parameter
   */
  async go(location = 'london') {
    this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);

    await this.processSite();
  }

}
|
||||
|
||||
module.exports = IndeedScraper;
|
184
scrapers/indeed.orig.js
Normal file
184
scrapers/indeed.orig.js
Normal file
@ -0,0 +1,184 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 15/04/2020
|
||||
* Time: 11:55
|
||||
|
||||
*/
|
||||
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
const axios = require('axios');
|
||||
const fecha = require('fecha');
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
const dbmanager = require('../lib/dbmanager');
|
||||
|
||||
const filterReject = require('../lib/filter_reject');
|
||||
const filterAccept = require('../lib/filter_md_jobs');
|
||||
|
||||
/**
 * Stand-alone Indeed scraper (original version, without a shared base class).
 *
 * Fetches one search-result page from www.indeed.co.uk with axios, extracts
 * the adverts, filters them and stores the survivors in the database.
 */
class IndeedScraper {

  constructor() {
    this.url = '';            // URL fetched by getPage()
    this.items = [];          // adverts collected so far
    this.currentPage = null;  // cheerio handle of the last page loaded
    this.host = 'www.indeed.co.uk';
  }

  /**
   * Sets the URL that the next getPage() call will fetch.
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores a parsed page as the current page.
   * @param {Function} page - cheerio handle returned by cheerio.load()
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Fetches this.url and, on HTTP 200, stores the parsed document.
   *
   * A failed request is logged and ends the method; currentPage then keeps
   * its previous value. (Without the early return a failed request left
   * `response` undefined and the status check threw a TypeError.)
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    let response;

    try {
      response = await axios.get(this.url);
    }
    catch (err) {
      console.error(err);

      return;
    }

    console.log(response.status);

    if (response.status === 200) {
      const $ = cheerio.load(response.data);

      this.loadPage($);
    }
  }

  /**
   * Inserts every collected advert into the database, one after another.
   * Each insert is awaited; a failed insert is logged and the rest continue.
   */
  async addToDB() {
    for (const item of this.items) {
      console.log(item);

      try {
        const data = await dbmanager.insertOne(item);

        console.log(data);
      }
      catch (err) {
        console.error(err.message || 'Some error occurred while querying the database.');
      }
    }
  }

  /**
   * Narrows this.items with the reject filter and then the accept filter,
   * logging the item count before and after each step.
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);

    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);

    console.log(`After accept ${this.items.length} items...`);
  }

  // Site specific parts below here

  /**
   * Splits the current result page into adverts and appends them to
   * this.items. An advert that cannot be parsed is logged and skipped.
   * Does nothing when no page has been loaded.
   */
  async breakPage() {
    const $ = this.currentPage;

    if (typeof $ !== 'function') {
      console.error('breakPage: no page loaded, nothing to break up');

      return;
    }

    const ads = [];

    // A plain loop instead of an async callback inside cheerio's .each(),
    // whose rejections nothing would catch.
    for (const section of $('div.row.result').toArray()) {
      try {
        ads.push(await this.extractDetails(section));
      }
      catch (err) {
        console.error('breakPage: skipping advert that could not be parsed:', err.message);
      }
    }

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert record from one result element.
   *
   * @param {Object} part - one `div.row.result` element from the result page
   * @returns {Promise<Object>} record with title, site, url, id, summary,
   *   company (null when empty), location, postDate, salary, isEasyApply and
   *   timestamp (Unix time in whole seconds)
   * @throws {TypeError} when the advert has no `h2.title a` id attribute
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const textOf = (selector) => $part(selector).text().trim();

    // Math.floor rather than `~~`, which wraps once the value exceeds 2^31 - 1.
    const now = Math.floor(Date.now() / 1000);

    return {
      'title': textOf('.jobtitle'),
      'site': 'indeed',
      'url': `https://${ this.host }${$part('.jobtitle').attr('href')}`,
      'id': $part('h2.title a').attr('id').trim(),
      'summary': textOf('.summary'),
      'company': textOf('.company') || null,
      'location': textOf('.location'),
      'postDate': textOf('.date'),
      'salary': textOf('.salary.no-wrap'),
      'isEasyApply': textOf('.iaLabel') === 'Easily apply',
      'timestamp': now
    };
  }

  /**
   * Reads the link of the last pagination element on the current page.
   *
   * @returns {Promise<string>} absolute URL built from that link, or '' when
   *   the element has no href
   */
  async checkNext() {
    const $ = this.currentPage;
    const href = $('.pagination > *:last-child').attr('href') || '';
    const next = (href !== '') ? `https://${ this.host }${href}` : '';

    console.log(next);

    return next;
  }

  /**
   * Runs one pass over the start URL: fetch, split into adverts, look up the
   * next-page link (logged only, not followed), filter and store.
   */
  async processSite() {
    console.log('Processing...');

    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Scrapes the fixed London javascript/node contract search.
   */
  async go() {
    this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+node&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=1&limit=50&sort=date&psf=advsrch&from=advancedsearch');

    await this.processSite();
  }

}
|
||||
|
||||
// Loading this file starts a scrape immediately.
const ind = new IndeedScraper();

// go() is asynchronous: attach a handler so a failed run is reported instead
// of surfacing as an unhandled promise rejection.
ind.go().catch((err) => {
  console.error('Indeed scrape failed:', err);
});

module.exports = IndeedScraper;
|
87
scrapers/jobserve.js
Normal file
87
scrapers/jobserve.js
Normal file
@ -0,0 +1,87 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 16/04/2020
|
||||
* Time: 16:46
|
||||
|
||||
*/
|
||||
|
||||
const Parser = require('rss-parser');
|
||||
|
||||
/**
 * Base class for RSS feed readers.
 *
 * Holds the feed URL, the parsed feed items and the default options used by
 * getContent() for proxied HTTP requests.
 */
class MasterReader {
  constructor() {
    Object.assign(this, {
      'url': '',
      'items': [],
      'feeditems': [],
      'currentPage': null,
      'hosturl': '',
      'siteid': '',
      // Defaults copied for every getContent() call; `url` is filled in per request.
      'requestOptions': {
        'url': '',
        'proxy': 'http://uk.proxymesh.com:31280',
        'tunnel': true
      }
    });
  }

  /**
   * Fetches `url` using a copy of `this.requestOptions`.
   *
   * NOTE(review): `request` is not required anywhere in the visible part of
   * this file, so as written this call fails with a ReferenceError (surfaced
   * as a rejection of the returned promise) - confirm and add the require.
   *
   * @param {string} url - Address to fetch.
   * @returns {Promise<*>} Resolves with the `body` passed to the request
   *   callback; rejects with its `err`.
   */
  getContent(url) {
    return new Promise((resolve, reject) => {
      // Work on a copy so the shared defaults are never modified.
      const options = Object.assign({}, this.requestOptions);

      // Logged before and after the requested url is filled in.
      console.log(options);
      options.url = url;
      console.log(options);

      request(options, (err, _res, body) => {
        if (err) {
          reject(err);

          return;
        }

        resolve(body);
      });
    });
  }

  /**
   * Sets the feed URL that getFeed() will fetch.
   *
   * @param {string} newUrl - Feed address.
   */
  setFeed(newUrl) {
    this.url = newUrl;
  }

  /**
   * Fetches and parses the feed at `this.url` with rss-parser and stores a
   * copy of its items in `this.feeditems`.
   *
   * @returns {Promise<void>}
   */
  async getFeed() {
    console.log('>> getFeed: fetching', this.url);

    const feed = await new Parser().parseURL(this.url);

    console.log(feed);

    this.feeditems = [...feed.items];
  }
}
|
||||
|
||||
/**
 * Feed reader for Jobserve saved-search RSS feeds.
 */
class JobserveReader extends MasterReader {
  /**
   * @param {*} props - Forwarded unchanged to the MasterReader constructor.
   */
  constructor(props) {
    super(props);

    Object.assign(this, {
      'hosturl': 'https://jobserve.com',
      'siteid': 'jobserve'
    });
  }

  /**
   * Fetches the feed and reports when it yielded no items.
   *
   * @returns {Promise<void>}
   */
  async processFeed() {
    await this.getFeed();

    if (this.feeditems.length > 0) {
      // Handling of fetched items is not implemented yet.
      return;
    }

    console.log('Nothing to process');
  }
}
|
||||
|
||||
const jobServeReader = new JobserveReader();

jobServeReader.setFeed('https://www.jobserve.com/MySearch/846CDA8658FF93A3.rss');

// processFeed() is async: report a failed fetch/parse instead of leaving the
// rejection unhandled, and signal the failure through the exit code.
jobServeReader.processFeed().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
|
138
scrapers/totaljobs.js
Normal file
138
scrapers/totaljobs.js
Normal file
@ -0,0 +1,138 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 15/04/2020
|
||||
* Time: 11:55
|
||||
|
||||
*/
|
||||
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
const MasterScraper = require('../lib/scraper');
|
||||
|
||||
/**
 * Scraper for totaljobs.com search-result pages.
 *
 * getPage, getContent, makeUrl, setStartUrl, filterAdverts and addToDB are
 * called here but not defined in this class; they are presumably provided by
 * MasterScraper (../lib/scraper) - confirm against that module.
 */
class TotaljobsScraper extends MasterScraper {
  constructor() {
    super();
    this.siteurl = 'www.totaljobs.com';
    this.siteid = 'totaljobs';
    this.requestOptions = {
      'url' : ''
    };
  }

  // Site specific parts below here

  /**
   * Splits the loaded results page (`this.currentPage`) into adverts, one
   * per `div.job` element, and appends them to `this.items`.
   *
   * @returns {Promise<void>}
   */
  async breakPage() {
    const $ = this.currentPage;
    const ads = [];

    // cheerio's .each() does not wait for an async callback, so collecting
    // results inside it only worked through microtask ordering. Iterate the
    // matched elements explicitly so every advert is awaited before
    // this.items is updated.
    for (const item of $('div.job').toArray())
      ads.push(await this.extractDetails(item));

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert record from a single `div.job` element.
   *
   * @param {*} part - Element (or markup) handed to `cheerio.load`.
   * @returns {Promise<Object>} Record with title, url, id, summary, company,
   *   location, postDate, salary, isEasyApply, site and timestamp.
   */
  async extractDetails(part) {
    const newObj = {};
    const $part = cheerio.load(part);

    // Whole seconds since the epoch. Math.floor is used instead of `~~`,
    // which truncates to a signed 32-bit integer.
    const now = Math.floor(Date.now() / 1000);

    newObj.title = $part('.job-title').text().trim();
    newObj.url = $part('.job-title a').attr('href');

    // attr() yields undefined when the attribute is missing; fall back to ''
    // so one malformed advert cannot abort the whole page.
    newObj.id = ($part('div.job').attr('id') || '').trim();
    newObj.summary = $part('p.job-intro').text().trim();
    newObj.company = $part('.company').text().trim() || null;
    newObj.location = $part('.location > span').text().trim();
    newObj.postDate = $part('.date-posted').text().trim();
    newObj.salary = $part('.salary').text().trim();
    newObj.isEasyApply = false;

    newObj.site = this.siteid;
    newObj.timestamp = now;

    return newObj;
  }

  /**
   * Fetches an advert's own page and replaces its summary with the text of
   * `div.job-description`.
   *
   * Best effort: a failed fetch is logged and the copy is returned with its
   * original summary.
   *
   * @param {Object} item - Advert record; `item.url` is fetched.
   * @returns {Promise<Object>} Shallow copy of `item`, possibly with a new summary.
   */
  async getIndividualPage(item) {
    const newItem = { ...item };

    console.log('Getting', item.url);

    try {
      const html = await this.getContent(item.url);
      const $ = cheerio.load(html);

      newItem.summary = $('div.job-description').text().trim();
    }
    catch (err) {
      console.error(err);
    }

    return newItem;
  }

  /**
   * Replaces every entry of `this.items` with the result of
   * getIndividualPage(), fetching one advert at a time.
   *
   * @returns {Promise<void>}
   */
  async getJobPages() {
    const newItems = [];

    for (const item of this.items) {
      console.log(item.title);
      newItems.push(await this.getIndividualPage(item));
    }

    this.items = newItems;
  }

  /**
   * Looks up the "next page" link: the `href` of the last child of the
   * `.pagination` element in `this.currentPage`.
   *
   * @returns {Promise<string>} The next-page URL, or '' when there is none.
   */
  async checkNext() {
    const $ = this.currentPage;
    const href = $('.pagination > *:last-child').attr('href') || '';
    let next = href;

    if (href !== '') {
      // NOTE(review): makeUrl() is not defined in this file. It is assumed to
      // return the absolute URL for `href` (it replaced an inline
      // `https://${siteurl}${next}` build) - confirm. Its result used to be
      // discarded; the raw href is kept when it returns no usable string.
      const built = this.makeUrl(href);

      if (typeof built === 'string' && built !== '')
        next = built;
    }

    console.log(next);

    return next;
  }

  /**
   * Runs one scrape pass over the current start URL: reset the collected
   * items, fetch the page, split it into adverts, look up the next-page link,
   * then filter and store the adverts.
   *
   * @returns {Promise<void>}
   */
  async processSite() {
    console.log('Processing...');

    this.items = [];

    await this.getPage();
    await this.breakPage();

    // TODO: pagination and getJobPages() are not enabled yet. The intended
    // loop feeds the URL from checkNext() back through setStartUrl() and
    // repeats until it is '' or equal to the page that was just processed.
    await this.checkNext();

    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Entry point: sets the Totaljobs contract search for `location` as the
   * start URL and runs processSite() against it.
   *
   * @param {string} [location='london'] - Location slug; URI-encoded before
   *   being placed in the URL path.
   * @returns {Promise<void>}
   */
  async go(location = 'london') {
    this.setStartUrl(`https://www.totaljobs.com/jobs/contract/html-or-vue-or-vuejs-or-web-or-sql-or-delphi-or-vb-or-vbscript-or-php-or-ajax-or-mysql-or-sqlserver-or-javascript-or-node-or-nodejs-or-svelte-or-sveltejs-not-react/in-${encodeURIComponent(location)}?q=Html+Or+Vue+Or+Vuejs+Or+Web+Or+Sql+Or+Delphi+Or+Vb+Or+Vbscript+Or+Php+Or+Ajax+Or+Mysql+Or+Sqlserver+Or+Javascript+Or+Node+Or+Nodejs+Or+Svelte+Or+Sveltejs+NOT+React&postedwithin=3&radius=20`);

    await this.processSite();
  }
}
|
||||
|
||||
module.exports = TotaljobsScraper;
|
62
server/controllers/jobs.controller.js
Normal file
62
server/controllers/jobs.controller.js
Normal file
@ -0,0 +1,62 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 18/05/2020
|
||||
* Time: 13:39
|
||||
|
||||
*/
|
||||
const dbmanager = require('../../lib/dbmanager');
|
||||
|
||||
// Matches, globally: a literal backslash-n sequence (the two characters `\`
// and `n`, not a newline character) followed by two or more whitespace
// characters; a literal backslash-n on its own; or any run of two or more
// whitespace characters. The handlers below replace each match in a job
// title with a single space.
const killNLDoubleSpace = /(\\n)\s{2,}|(\\n)|\s{2,}/g;
|
||||
|
||||
exports.getList = (req, res) => {
|
||||
console.log('>getList req', req.params);
|
||||
|
||||
/* if(!req.params.id)
|
||||
return res.status(400).send({
|
||||
'message': 'Job id missing'
|
||||
});*/
|
||||
|
||||
dbmanager.getList()
|
||||
.then((data) => {
|
||||
const processed = data.map((item) => {
|
||||
const date = new Date( item.timestamp * 1000);
|
||||
item.date = date.toLocaleString();
|
||||
item.title = item.title.replace(killNLDoubleSpace, ' ');
|
||||
|
||||
return item;
|
||||
});
|
||||
res.send(processed);
|
||||
})
|
||||
.catch((err) => {
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
};
|
||||
|
||||
exports.getJob = (req, res) => {
|
||||
console.log('>getJob req', req.params);
|
||||
|
||||
if(!req.params.id)
|
||||
return res.status(500).send({
|
||||
'message': 'Job id missing'
|
||||
});
|
||||
|
||||
const id = req.params.id;
|
||||
|
||||
dbmanager.getOne(id)
|
||||
.then((data) => {
|
||||
const item = Object.assign({}, data);
|
||||
const date = new Date( item.timestamp * 1000);
|
||||
item.date = date.toLocaleString();
|
||||
item.title = item.title.replace(killNLDoubleSpace, ' ');
|
||||
|
||||
res.send(item);
|
||||
})
|
||||
.catch((err) => {
|
||||
res.status(500).send({
|
||||
'message': err.message || 'Some error occurred while querying the database.'
|
||||
});
|
||||
});
|
||||
};
|
17
server/routes/jobs.route.js
Normal file
17
server/routes/jobs.route.js
Normal file
@ -0,0 +1,17 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 18/05/2020
|
||||
* Time: 13:39
|
||||
|
||||
*/
|
||||
|
||||
const jobs = require('../controllers/jobs.controller');
|
||||
|
||||
module.exports = (app) => {
|
||||
app.route('/jobs')
|
||||
.get(jobs.getList);
|
||||
|
||||
app.route('/jobs/:id')
|
||||
.get(jobs.getJob);
|
||||
};
|
49
server/server.js
Normal file
49
server/server.js
Normal file
@ -0,0 +1,49 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 14/05/2020
|
||||
* Time: 09:13
|
||||
|
||||
*/
|
||||
require('dotenv').config();
|
||||
const express = require('express');
|
||||
const bodyParser = require('body-parser');
|
||||
const session = require('express-session');
|
||||
const path = require('path');
|
||||
const helmet = require('helmet');
|
||||
const cors = require('cors');
|
||||
|
||||
const app = express();

// The environment is already loaded by the require('dotenv').config() call
// at the top of this file; the second call that used to sit here was redundant.

const serverPort = process.env.PORT || 3000;

// Static site directory, resolved relative to this file.
const sitePath = '../live';

// TODO: cors() and helmet() are required above but not enabled yet.

// NOTE(security): the session secret should come from the environment
// (SESSION_SECRET). The previous hard-coded value is kept only as a fallback
// so existing deployments keep working; it is committed to source control and
// should be rotated.
app.use(session({
  'secret': process.env.SESSION_SECRET || 'Z4hc5.64X1e',
  'resave': true,
  'saveUninitialized': true
}));

app.use(express.static(path.join(__dirname, sitePath)));

// parse requests of content-type - application/x-www-form-urlencoded
app.use(bodyParser.urlencoded({ 'extended': true }));

// parse requests of content-type - application/json
app.use(bodyParser.json());

require('./routes/jobs.route')(app);

app.listen(serverPort, () => {
  console.log(`Server is listening on port ${serverPort}`);
});

// Logged synchronously at load time, i.e. before the listen callback fires.
console.log('Job Server started');
|
1931
test/data/indeed/indeed-2020-04-16--013308.html
Normal file
1931
test/data/indeed/indeed-2020-04-16--013308.html
Normal file
File diff suppressed because one or more lines are too long
1984
test/data/indeed/indeed-2020-04-16--092311.html
Normal file
1984
test/data/indeed/indeed-2020-04-16--092311.html
Normal file
File diff suppressed because one or more lines are too long
1782
test/data/indeed/indeed-2020-04-16--105727.html
Normal file
1782
test/data/indeed/indeed-2020-04-16--105727.html
Normal file
File diff suppressed because one or more lines are too long
1778
test/data/indeed/indeed-2020-04-16--105848.html
Normal file
1778
test/data/indeed/indeed-2020-04-16--105848.html
Normal file
File diff suppressed because one or more lines are too long
1779
test/data/indeed/page.html
Normal file
1779
test/data/indeed/page.html
Normal file
File diff suppressed because one or more lines are too long
2438
test/data/indeed/page2.html
Normal file
2438
test/data/indeed/page2.html
Normal file
File diff suppressed because one or more lines are too long
3565
test/data/totaljobs/totaljobs-2020-04-16--121504.html
Normal file
3565
test/data/totaljobs/totaljobs-2020-04-16--121504.html
Normal file
File diff suppressed because it is too large
Load Diff
41
test/indeed.js
Normal file
41
test/indeed.js
Normal file
@ -0,0 +1,41 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 15/04/2020
|
||||
* Time: 11:56
|
||||
|
||||
*/
|
||||
|
||||
const tape = require('tape');
|
||||
const _test = require('tape-promise').default; // <---- notice 'default'
|
||||
const test = _test(tape); // decorate tape
|
||||
|
||||
const fs = require('fs');
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
const IndeedScraper = require('../scrapers/indeed');
|
||||
|
||||
const indeedScraper = new IndeedScraper();

// Resolve the fixture relative to this file rather than the current working
// directory, so the test can be started from any directory (test/totaljobs.js
// loads its fixture the same way).
// Alternative fixture: data/indeed/indeed-2020-04-16--092311.html
const page = fs.readFileSync(`${__dirname}/data/indeed/page2.html`);

test.test('Test Indeed scraper', async t => {
  const $ = cheerio.load(page);

  indeedScraper.loadPage($);

  await indeedScraper.breakPage();

  // Not exercised here: getJobPages(), checkNext() and addToDB().

  await indeedScraper.filterAdverts();

  t.end();
});
|
41
test/totaljobs.js
Normal file
41
test/totaljobs.js
Normal file
@ -0,0 +1,41 @@
|
||||
/**
|
||||
* Created by WebStorm.
|
||||
* User: martin
|
||||
* Date: 15/04/2020
|
||||
* Time: 11:56
|
||||
|
||||
*/
|
||||
|
||||
const tape = require('tape');
|
||||
const _test = require('tape-promise').default; // <---- notice 'default'
|
||||
const test = _test(tape); // decorate tape
|
||||
|
||||
const fs = require('fs');
|
||||
const cheerio = require('cheerio');
|
||||
|
||||
const TotaljobsScraper = require('../scrapers/totaljobs');
|
||||
|
||||
const totaljobsScraper = new TotaljobsScraper();

console.log(__dirname);

// Saved Totaljobs results page, resolved relative to this test file.
const fixturePath = `${__dirname}/data/totaljobs/totaljobs-2020-04-16--121504.html`;
const page = fs.readFileSync(fixturePath);

test.test('Test Totaljobs scraper', async t => {
  // Feed the saved page to the scraper in place of a live fetch.
  totaljobsScraper.loadPage(cheerio.load(page));

  await totaljobsScraper.breakPage();

  // getJobPages() calls getContent() for every advert found in the fixture.
  await totaljobsScraper.getJobPages();

  console.log(totaljobsScraper.items);

  await totaljobsScraper.filterAdverts();

  // addToDB() is deliberately not exercised here.

  t.end();
});
|
Loading…
Reference in New Issue
Block a user