Compare commits

...

1 Commits

Author SHA1 Message Date
Martin Donnelly
39f399593a Dockerising the service 2023-12-22 09:56:28 +00:00
16 changed files with 1262 additions and 16 deletions

25
Docker/dev/Dockerfile Normal file
View File

@ -0,0 +1,25 @@
# Development image for the job scraper service.
# Only the entry script, the npm manifests and a few top-level files are copied
# in; the source directories are created empty here.
FROM git.caliban.io/martin/node-python:10
#FROM martind2000/node-python3:18

# VERSION can be supplied with --build-arg; it falls back to "development".
ARG VERSION
# key=value form: the whitespace-separated `ENV name value` form is legacy syntax.
ENV VERSION=${VERSION:-development}

WORKDIR /app
COPY ./Docker/start.sh ./package*.json ./grabber.js ./ecosystem.config.js ./brain.json /app/

# Empty placeholders for the application source directories.
RUN mkdir -p /app/db /app/lib /app/models /app/scrapers /app/server
# COPY ./src /app/src
# COPY ./types /app/types

# Install the project dependencies, then pm2 globally.
RUN set -x \
    && npm install \
    && npm install -g pm2

RUN chmod +x /app/start.sh
EXPOSE 8120
ENTRYPOINT ["/app/start.sh"]

9
Docker/start.sh Normal file
View File

@ -0,0 +1,9 @@
#!/bin/sh
# Container entry point.
# -e: abort on the first failing command; -x: echo every command as it runs.
set -ex
# The real start commands are currently disabled:
# npm run dev
# npm run start
# Placeholder: block forever so the container stays up without starting the app.
while true; do sleep infinity; done

Binary file not shown.

27
docker-compose.yml Normal file
View File

@ -0,0 +1,27 @@
# Compose definition for the job scraper development container.
version: '3.5'

services:
  jubilee-src:
    container_name: jobscraper
    # Built from the repository root using the dev Dockerfile.
    build:
      context: .
      dockerfile: ./Docker/dev/Dockerfile
    image: jobscraper
    # Cap container logs: five files of at most 1 MB each.
    logging:
      options:
        max-size: '1m'
        max-file: '5'
    # restart: always
    # env_file:
    #   - .env
    ports:
      - '8120:8120'
    # Bind-mount the source directories from the host into /app.
    volumes:
      - ./db:/app/db
      - ./lib:/app/lib
      - ./models:/app/models
      - ./scrapers:/app/scrapers
      - ./server:/app/server

View File

@ -22,7 +22,7 @@ const RssTechnojobs = require('./scrapers/rss.technojobs');
const s1jobsScraper = new RssS1Jobs();
const technojobsScraper = new RssTechnojobs();
new CronJob('5 6-23/3 * * *', async function() {
/* new CronJob('5 6-23/3 * * *', async function() {
await indeedScraper.go('london');
await totaljobsScraper.go('london');
await cwjobsScraper.go('london');
@ -35,7 +35,7 @@ const RssTechnojobs = require('./scrapers/rss.technojobs');
await indeedScraper.go('milton keynes');
await totaljobsScraper.go('milton keynes');
await cwjobsScraper.go('milton keynes');
}, null, true);
}, null, true);*/
new CronJob('0 6-23/1 * * *', async function() {
await jobserveScraper.go('https://www.jobserve.com/MySearch/D48462060FB24B6C.rss');
@ -63,8 +63,8 @@ const RssTechnojobs = require('./scrapers/rss.technojobs');
await s1jobsScraper.go('http://www.s1jobs.com/xml/b1d7e6c3a9a11964z3r.xml');
await s1jobsScraper.go('http://www.s1jobs.com/xml/ddeded091b6f6d33z3r.xml');*/
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
// await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationglasgow/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
// await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationLONDON/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
// await technojobsScraper.go('https://www.technojobs.co.uk/rss.php/html%20OR%20node%20OR%20web%20OR%20sql%20OR%20delphi%20OR%20javascript%20OR%20ajax/excludekeywords/locationMilton%20Keynes/radius25/termsin0/salary0/postedwithinall/jobtypeall/searchfieldRSearchIndex/page1');
}, null, true);
})();

View File

@ -8,7 +8,7 @@
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
const dbmanager = require('../lib/dbmanager');
const JobsModel = require('../lib/mongoManager');
// const JobsModel = require('../lib/mongoManager');
const SHA = require('crypto-js/sha256');
@ -67,7 +67,8 @@ class MasterBase {
*
*/
addToMongo() {
console.log('>> ADD TO MONGO!');
console.log('>> no ADD TO MONGO!');
return;
for(const item of this.items) {
// console.log('add', item);

View File

@ -89,7 +89,7 @@ class MasterRSS extends MasterBase {
await this.filterAdverts();
if (this.items.length > 0) await this.addToDB();
if (this.items.length > 0) await this.addToMongo();
// if (this.items.length > 0) await this.addToMongo();
}
else
console.log('No items to process');

View File

@ -133,7 +133,7 @@ class IndeedScraper extends MasterScraper {
await this.filterAdverts();
await this.addToDB();
await this.addToMongo();
// await this.addToMongo();
}
async go(location = 'london') {

View File

@ -140,7 +140,7 @@ class IndeedMobileScraper extends MasterScraper {
await this.filterAdverts();
await this.addToDB();
await this.addToMongo();
// await this.addToMongo();
}
async go(location = 'london') {

View File

@ -146,7 +146,7 @@ class TotaljobsScraper extends MasterScraper {
await this.filterAdverts();
await this.addToDB();
await this.addToMongo();
// await this.addToMongo();
}
/**

View File

@ -5,7 +5,10 @@
* Time: 11:08
*/
const Jobs = require('../../lib/mongoManager');
// const Jobs = require('../../lib/mongoManager');
const Jobs = {};
const { Utils } = require('@rakh/utils');
const fs = require('fs');
@ -19,7 +22,7 @@ var classifier = bayes({
});
function load() {
const file = fs.readFileSync('brain.json');
const file = fs.readFileSync('/app/brain.json');
classifier = bayes.fromJson(file);
}

File diff suppressed because one or more lines are too long

View File

@ -58,8 +58,8 @@ app.use(bodyParser.json());
app.post('/auth', auth.auth);
require('./routes/jobs.route')(app);
require('./routes/jobs.v2.route')(app);
require('./routes/apply.v2.route')(app);
// require('./routes/jobs.v2.route')(app);
// require('./routes/apply.v2.route')(app);
require('./routes/vote.route')(app);
app.listen(serverPort, () => {

214
v2/lib/base.js Normal file
View File

@ -0,0 +1,214 @@
/**
* Created by WebStorm.
* User: martin
* Date: 22/05/2020
* Time: 12:01
*/
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
const dbmanager = require('../lib/dbmanager');
// const JobsModel = require('../lib/mongoManager');
const SHA = require('crypto-js/sha256');
const { Utils } = require('@rakh/utils');
const { Corpus } = require('./corpus');
/**
 * Base class for the scrapers: holds the scraped items plus the shared
 * filtering, persistence and URL-building helpers that subclasses reuse.
 */
class MasterBase {
  /**
   * Initialises empty scraper state and the default proxied request options.
   */
  constructor() {
    this.url = '';
    this.items = [];
    this.currentPage = null;
    this.hosturl = '';
    this.siteid = '';
    this.useStone = false;
    this.saveFile = false;
    this.requestOptions = {
      'url' : '',
      'proxy' : 'http://uk.proxymesh.com:31280',
      'tunnel' : true
    };
  }

  /**
   * Builds an empty job record stamped with the current time and this scraper's site id.
   *
   * @returns {{summary: string, site: string, postDate: string, location: string, company: string, id: string, title: string, isEasyApply: boolean, salary: string, url: string, timestamp: number}}
   */
  newRecord() {
    // Unix time in whole seconds. Math.floor is used instead of `~~`, which
    // truncates to a signed 32-bit integer.
    const now = Math.floor(Date.now() / 1000);

    return { 'title': '', 'site': this.siteid || '', 'url':'', 'id':'', 'summary':'', 'postDate':'', 'isEasyApply':false, 'location': '', 'company': '', 'salary': '', 'timestamp':now };
  }

  /**
   * Inserts every item through dbmanager. The inserts are started together and
   * the returned promise settles only once all of them have finished, so
   * `await addToDB()` really waits for the writes. A failed insert is logged
   * and does not reject the returned promise.
   *
   * @returns {Promise<void>}
   */
  async addToDB() {
    const pending = this.items.map((item) =>
      dbmanager.insertOne(item)
        .then((data) => {
          console.log(data);
        })
        .catch((err) => {
          console.error(`${this.siteid} db error`);
          console.error(err.message || 'Some error occurred while querying the database.');
        }));

    await Promise.all(pending);
  }

  /**
   * Saves every item through the Mongo model, when one is loaded.
   * The JobsModel require is currently commented out in this file, so the
   * method logs and returns without touching the items.
   */
  addToMongo() {
    // `typeof` is safe on an undeclared identifier; this replaces the bare
    // `return` that left the loop below unreachable.
    if (typeof JobsModel === 'undefined') {
      console.log('>> no ADD TO MONGO!');

      return;
    }

    console.log('>> ADD TO MONGO!');

    for (const item of this.items) {
      // console.log('add', item);
      const newObj = this.reduceData(item);
      const newJob = new JobsModel(newObj);

      newJob.save().then((m) => {
        console.log('m', m.details.title);
      }).catch((err) => {
        console.error('m', err);
      });
    }
  }

  /**
   * Buckets a free-text salary by the first number found in it.
   *
   * @param inval salary text; a missing value is treated as an empty string
   * @returns {number} 0 when no number is found or it is below 100,
   *                   1 for 100 up to 4999, 2 for 5000 and above
   */
  analyseRate(inval) {
    console.log('analyseRate', inval);

    // Strips "ir35", currency/punctuation characters, all letters and a
    // trailing ".d" / ".dd" decimal part, leaving space-separated digit runs.
    const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
    const clearSpace = /\s+/g;

    const text = (inval === undefined || inval === null) ? '' : String(inval);
    const cleaned = text.replace(cleanerReg, '').replace(clearSpace, ' ').trim();
    const first = parseInt(cleaned.split(' ')[0], 10);

    if (Number.isNaN(first) || first < 100) return 0;
    // Inclusive lower bound: exactly 100 used to fall between the two ranges.
    if (first < 5000) return 1;

    return 2;
  }

  /**
   * Converts a scraped record into the { details, data } shape passed to the Mongo model.
   * NOTE(review): the keys requested here are 'postdate' and 'easyapply', while
   * newRecord() creates 'postDate' and 'isEasyApply' - confirm which is intended.
   *
   * @param d scraped record
   * @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}}
   */
  reduceData(d) {
    const clearPremium = /(\n+)(Featured|Premium)/gi;
    const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;

    const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };

    outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);

    // Drop "Featured" / "Premium" markers that follow newlines in the title.
    outObj.details.title = outObj.details.title.replace(clearPremium, '');
    outObj.details.title = outObj.details.title.replace(otherStupid, '');
    outObj.details.hashed = SHA(outObj.details.summary);

    outObj.data.read = 0;
    outObj.data.applied = d.applied || 0;
    outObj.data.jobtype = this.analyseRate(d.salary);
    outObj.data.autoclass = Corpus.process(d.summary);
    // The record timestamp is in seconds; stored here in milliseconds.
    outObj.data.timestamp = d.timestamp * 1000;

    return outObj;
  }

  /**
   * Runs the reject filter and then the accept filter over this.items,
   * logging the item count after each stage.
   *
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);
    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);
    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
    // console.log(this.items);
  }

  /**
   * Stores the URL the scrape starts from.
   *
   * @param newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores the page that is currently being processed.
   *
   * @param page
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Prefixes a path with https:// and this.siteurl.
   * NOTE(review): the constructor initialises `hosturl`, not `siteurl` -
   * presumably subclasses set `siteurl`; verify.
   *
   * @param appended
   * @returns {string}
   */
  makeUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Currently builds exactly the same URL as makeUrl().
   *
   * @param appended
   * @returns {string}
   */
  makeProxyUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Builds an image.silvrtree.co.uk URL for the given image URL and quality.
   *
   * @param url
   * @param q quality value placed in the path (default 75)
   * @returns {string}
   */
  makeImg(url, q = 75) {
    return `https://image.silvrtree.co.uk/q${q}/${url}`;
  }

  /**
   * Resets the collected items before a run.
   *
   * @returns {Promise<void>}
   */
  async go() {
    this.items = [];
    this.rawItems = [];
  }
}
module.exports = MasterBase;

16
v2/lib/filter_md_jobs.js Normal file
View File

@ -0,0 +1,16 @@
module.exports = function (item) {
const patt = /(full\s?stack|front\s?end|html|html5|es6|react|knockout|ember|vue|riotjs|css|javascript|sql|node|backbone|git|gulp|jquery|express|£\dk|Data Warehouse Developer|iot|internet of things)\W/ig;
const result = patt.test(item.title);
const resultB = patt.test(item.summary);
console.log('My Filter:', (result || resultB === true) ? 'Pass' : 'Reject');
/* if (!(result || resultB === true)) {
console.log('Result', result);
console.log('ResultB', resultB);
console.log(item);
}*/
return (result || resultB === true) ;
};

17
v2/lib/filter_reject.js Normal file
View File

@ -0,0 +1,17 @@
module.exports = function (item) {
const patt = /(Simply Education|Splunk|Coordinators?|Teachers?|Technical Writers?|Data Analyst|WebLogic|WebSphere|Data Scientist|Change Managers?|T24|Test Analyst|Insight Analyst|application tester|senior tester|Salesforce|QlikView|Navision|Murex|seo|django|drupal|SHAREPOINT|per annum|ServiceNow|Test Lead|User Researcher|Service Management|\(PERM\)|£\d.K|Remedy|ITSM|Symfony|Zend|Full Time|Technical Business Analyst|BUSINESS ANALYST|AUTOMATION TESTER|FIELD TECHNICIAN|websphere administrator|Research Data Scientist)/ig;
const engineers = /(Support|Devops|Planning|security|Postgresql|network|sccm|test|data|imac|firewall|vmware)+(?:\s)(?=Engineer)/ig;
const developers = /(Big Data|Java Server Side|Java|PHP|Graduate|Access|Oracle ADF|SHAREPOINT|Ruby on Rails|Java Software|IOS|Qlikview|c#|c\+\+|\.net|bi|go lang|Python)+(?:\s)(?=Developer)/ig;
const architects = /(Java|PHP|Microsoft)+(?:\s)(?=Architect)/ig;
const antiAd = /sja\d+/gi;
const result = patt.test(item.summary) || engineers.test(item.summary) || developers.test(item.summary) || architects.test(item.summary);
const resultB = patt.test(item.title) || engineers.test(item.title) || developers.test(item.title) || architects.test(item.title);
const resultC = antiAd.test(item.id);
console.log('Reject:', (result || resultB || resultC === true) ? 'Reject' : 'Pass');
return (!(result || resultB || resultC === true));
};