jobscraper/lib/base.js
Martin Donnelly f2880b661e JOBSCRAPER-1 Implement a bit of AI
* Moved to mongo
* UI updated to use mongo
* UI is a bit fancier now
* Import sql to mongo
2020-09-10 14:13:08 +01:00

214 lines
4.8 KiB
JavaScript

/**
* Created by WebStorm.
* User: martin
* Date: 22/05/2020
* Time: 12:01
*/
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
const dbmanager = require('../lib/dbmanager');
const JobsModel = require('../lib/mongoManager');
const SHA = require('crypto-js/sha256');
const { Utils } = require('@rakh/utils');
const { Corpus } = require('./corpus');
class MasterBase {
  /**
   * Initialise an empty scraper: no start URL, no collected items, and the
   * default proxied request options.
   *
   * NOTE(review): makeUrl()/makeProxyUrl() read `this.siteurl`, which is not
   * initialised here — presumably subclasses assign it; confirm.
   */
  constructor() {
    this.url = '';
    this.items = [];
    // Declared up front so the field exists before go() is first called.
    this.rawItems = [];
    this.currentPage = null;
    this.hosturl = '';
    this.siteid = '';
    this.useStone = false;
    this.saveFile = false;
    this.requestOptions = {
      'url' : '',
      'proxy' : 'http://uk.proxymesh.com:31280',
      'tunnel' : true
    };
  }

  /**
   * Build a blank job record stamped with this scraper's site id and the
   * current time.
   *
   * @returns {{summary: string, site: string, postDate: string, location: string, company: string, id: string, title: string, isEasyApply: boolean, salary: string, url: string, timestamp: number}}
   *   Empty record; `timestamp` is whole seconds since the Unix epoch.
   */
  newRecord() {
    // Math.floor rather than `~~`: the bitwise form truncates to a signed
    // 32-bit integer and would wrap once epoch seconds exceed 2^31 - 1.
    const now = Math.floor(Date.now() / 1000);

    return {
      'title': '',
      'site': this.siteid || '',
      'url': '',
      'id': '',
      'summary': '',
      'postDate': '',
      'isEasyApply': false,
      'location': '',
      'company': '',
      'salary': '',
      'timestamp': now
    };
  }

  /**
   * Insert every collected item through `dbmanager.insertOne`.
   *
   * All inserts are started together. A failed insert is logged and does not
   * stop the others or reject the returned promise.
   *
   * @returns {Promise<void>} Resolves once every insert has settled.
   */
  async addToDB() {
    const pending = this.items.map((item) =>
      dbmanager.insertOne(item)
        .then((data) => {
          console.log(data);
        })
        .catch((err) => {
          console.error(`${this.siteid} db error`);
          console.error(err.message || 'Some error occurred while querying the database.');
        }));

    // Each chain ends in its own catch, so this never rejects; awaiting it
    // simply keeps the inserts from being left as floating promises.
    await Promise.all(pending);
  }

  /**
   * Convert every collected item with reduceData() and save it as a
   * `JobsModel` document.
   *
   * Deliberately not `async`: an exception thrown synchronously while an item
   * is being converted still propagates synchronously to the caller.
   *
   * @returns {Promise<void[]>} Settles once every save has finished. Save
   *   failures are logged, not propagated. Callers may ignore the return value.
   */
  addToMongo() {
    console.log('>> ADD TO MONGO!');

    const saves = [];

    for (const item of this.items) {
      const newObj = this.reduceData(item);
      const newJob = new JobsModel(newObj);

      saves.push(newJob.save().then((m) => {
        console.log('m', m.details.title);
      }).catch((err) => {
        console.error('m', err);
      }));
    }

    return Promise.all(saves);
  }

  /**
   * Bucket a free-text salary string by the first number found in it.
   *
   * Currency symbols, punctuation, letters, the token "ir35" and a trailing
   * one- or two-digit decimal part are stripped; the first remaining run of
   * digits is the amount.
   *
   * NOTE(review): reduceData() stores the result as `jobtype`; the buckets
   * presumably separate day rates from annual salaries — confirm.
   *
   * @param {*} inval Salary text; null/undefined count as "no salary given".
   * @returns {number} 0 when there is no amount or it is <= 100,
   *   1 when 100 < amount < 5000, 2 when amount >= 5000.
   */
  analyseRate(inval) {
    console.log('analyseRate', inval);

    if (inval === null || inval === undefined) return 0;

    const rangeDash = /-/g;
    const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
    const clearSpace = /\s+/g;

    // A hyphen separates the two ends of a range ("£400-£500"). Turn it into
    // a space first, otherwise stripping it would fuse both ends into one
    // number ("400500").
    const result = String(inval)
      .replace(rangeDash, ' ')
      .replace(cleanerReg, '')
      .replace(clearSpace, ' ');

    const item = parseInt(result.trim().split(' ')[0], 10);

    // NaN (no digits at all) belongs in the lowest bucket.
    if (Number.isNaN(item) || item <= 100) return 0;

    return (item < 5000) ? 1 : 2;
  }

  /**
   * Reshape a scraped record into the `{details, data}` object that
   * addToMongo() passes to `JobsModel`.
   *
   * NOTE(review): the extracted keys include `postdate` and `easyapply`, while
   * newRecord() produces `postDate` and `isEasyApply` — verify against
   * `Utils.extractFromObj` and the Mongo schema whether those two are ever
   * copied.
   *
   * @param {Object} d Scraped record, e.g. one created by newRecord().
   * @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number, timestamp: number}, details: {}}}
   */
  reduceData(d) {
    const clearPremium = /(\n+)(Featured|Premium)/gi;
    const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;

    const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };

    outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);

    // Strip "Featured"/"Premium" badges that follow the title. Guarded so a
    // record without a string title does not abort the whole conversion.
    if (typeof outObj.details.title === 'string') {
      outObj.details.title = outObj.details.title
        .replace(clearPremium, '')
        .replace(otherStupid, '');
    }

    outObj.details.hashed = SHA(outObj.details.summary);

    outObj.data.read = 0;
    outObj.data.applied = d.applied || 0;
    outObj.data.jobtype = this.analyseRate(d.salary);
    outObj.data.autoclass = Corpus.process(d.summary);
    // The record carries epoch seconds; stored here multiplied by 1000.
    outObj.data.timestamp = d.timestamp * 1000;

    return outObj;
  }

  /**
   * Narrow `this.items` by applying the reject filter and then the accept
   * filter, logging the item count after each stage.
   *
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
  }

  /**
   * Set the URL the scrape starts from.
   *
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Store the page that is currently being processed.
   *
   * @param {*} page
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Prefix a path with `https://` and this scraper's `siteurl`.
   *
   * @param {string} appended Path (and query) to append after the host.
   * @returns {string}
   */
  makeUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Build the URL to request through the proxy. Currently produces exactly
   * the same string as makeUrl().
   *
   * @param {string} appended Path (and query) to append after the host.
   * @returns {string}
   */
  makeProxyUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Build an image.silvrtree.co.uk URL for an image at the given quality.
   *
   * @param {string} url Source image URL, appended as-is.
   * @param {number} [q=75] Value placed in the `q<N>` path segment.
   * @returns {string}
   */
  makeImg(url, q = 75) {
    return `https://image.silvrtree.co.uk/q${q}/${url}`;
  }

  /**
   * Reset the collected items ahead of a scrape run.
   *
   * @returns {Promise<void>}
   */
  async go() {
    this.items = [];
    this.rawItems = [];
  }
}
// Expose MasterBase as this module's sole export.
module.exports = MasterBase;