jobscraper/migrate.js

157 lines
4.1 KiB
JavaScript
Raw Permalink Normal View History

2020-08-24 08:35:30 +00:00
/**
* Created by WebStorm.
* User: martin
* Date: 22/07/2020
* Time: 10:20
*/
const db = require('./lib/connect');
const log4js = require('log4js');
const logger = log4js.getLogger();
const { Utils } = require('@rakh/utils');
const { Corpus } = require('./lib/corpus');
const SHA = require('crypto-js/sha256');
2020-08-24 08:35:30 +00:00
/*
2604
2020-08-24 08:35:30 +00:00
const mongoose = require('mongoose');
const log4js = require('log4js');
const logger = log4js.getLogger();
const Jobs = require('./models/jobs');
require('dotenv').config();
logger.level = 'debug';
logger.debug(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
mongoose.connect(`mongodb://martin:1V3D4m526i@${ process.env.DBHOST }/${ process.env.DBNAME}`);
const mDB = mongoose.connection;
mDB.on('error', console.error.bind(console, 'connection error:'));
*/
const Jobs = require('./lib/mongoManager');
const migrate = (function() {
function analyseRate(inval) {
let outVal = 0;
const cleanerReg = /ir35|[+$#,=&:;()\\/\-£a-z]|\.\d{1,2}/gi;
const clearSpace = /\s+/g;
const result = inval.replace(cleanerReg, '').replace(clearSpace, ' ');
const resultArray = result.trim().split((' '));
if (resultArray.length > 0) {
const item = parseInt(resultArray[0], 10);
if (item < 100) outVal = 0;
else if ((item > 100) && (item < 5000)) outVal = 1;
else if (item >= 5000) outVal = 2;
}
else return 0;
2020-08-24 08:35:30 +00:00
return outVal;
}
function reduceData(d) {
const clearPremium = /(\n+)(Featured|Premium)/gi;
const otherStupid = /((↵\s+)+)(Featured|Premium)/gi;
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
outObj.details = Utils.extractFromObj(d, ['title', 'site', 'url', 'id', 'summary', 'company', 'location', 'postdate', 'salary', 'easyapply', 'timestamp']);
outObj.details.title = outObj.details.title.replace(clearPremium, '');
outObj.details.title = outObj.details.title.replace(otherStupid, '');
outObj.details.hashed = SHA(outObj.details.summary);
2020-08-24 08:35:30 +00:00
// outObj.data.read = d.read || 0;
outObj.data.read = 0;
outObj.data.applied = d.applied || 0;
outObj.data.jobtype = analyseRate(d.salary);
outObj.data.autoclass = Corpus.process(d.summary);
outObj.data.timestamp = d.timestamp * 1000;
return outObj;
}
function getCurrent() {
const outgoing = [];
console.log('get version');
const sql = 'select jobs.*, applied.a as applied, read.d as read from jobs left join applied on applied.aid = jobs._id left join read on read.rid = jobs._id order by _id asc;';
return new Promise((resolve, reject) => {
db.all(sql, [], (err, rows) => {
if (err)
reject(err);
rows.forEach((row) => {
outgoing.push(row);
});
resolve(outgoing) ;
});
});
}
async function start() {
await getCurrent().then(async (d) => {
2020-08-24 08:35:30 +00:00
logger.debug(d.length);
for (let t = 0;t < (d.length - 1);t++) {
const newD = reduceData(d[t]);
// logger.debug(newD);
const newJob = Jobs(newD);
await newJob.save().then((m) => {
2020-08-24 08:35:30 +00:00
logger.debug('m', m.details.title);
}).catch((err) => {
logger.error(err.keyPattern);
2020-08-24 08:35:30 +00:00
});
}
}).then(() => {
logger.debug('SAVING!!');
Corpus.exportUnused();
})
.catch((err) => {
logger.error(err.keyPattern);
2020-08-24 08:35:30 +00:00
});
}
async function deleteOld() {
const oneDay = 86400000;
const twoWeeksAgo = new Date().getTime() - ( 14 * oneDay);
logger.debug('Delete older than: ', new Date(twoWeeksAgo), twoWeeksAgo);
logger.debug({ 'data.timestamp': { '$lt': twoWeeksAgo } });
Jobs.deleteMany({ 'data.timestamp': { '$lt': twoWeeksAgo }, 'data.applied': 0 }).then((m) => {
logger.debug('m', m);
}).catch((err) => {
logger.error(err);
});
}
// newJob.find({ 'data': { 'timestamp': { '$lt': 1587034346000 } } });
2020-08-24 08:35:30 +00:00
return {
'start':start,
'deleteOld': deleteOld
2020-08-24 08:35:30 +00:00
};
})();
(async function() {
await migrate.start();
await migrate.deleteOld();
logger.info('Done??');
})();