JOBSCRAPER-1 Implement a bit of AI

This commit is contained in:
Martin Donnelly 2020-09-01 12:44:42 +01:00
parent 6a23583b5b
commit 1938bbeb5f
4 changed files with 93 additions and 8 deletions

View File

@ -8,7 +8,7 @@
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
const dbmanager = require('../lib/dbmanager');
const Jobs = require('../lib/mongoManager');
const JobsModel = require('../lib/mongoManager');
const { Utils } = require('@rakh/utils');
const { Corpus } = require('./corpus');
@ -61,13 +61,16 @@ class MasterBase {
});
}
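/**
* Save every entry of `this.items` to MongoDB: each item is passed through
* `reduceData()` and stored as a new `JobsModel` document.
*/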
addToMongo() {
console.log('>> ADD TO MONGO!');
for(const item of this.items) {
// console.log('add', item);
const newObj = this.reduceData(item);
const newJob = Jobs(newObj);
const newJob = new JobsModel(newObj);
newJob.save().then((m) => {
console.log('m', m.details.title);
@ -77,6 +80,11 @@ class MasterBase {
}
}
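/**
* Analyse `inval` and return a numeric result.
* @param inval - the value to analyse (it is also logged to the console)
* @returns {number} the analysed value; the result variable starts at 0
*/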
analyseRate(inval) {
console.log('analyseRate', inval);
let outVal = 0;
@ -98,6 +106,11 @@ class MasterBase {
return outVal;
}
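/**
* Build the reduced record for one item.
* @param d - the item to reduce (`addToMongo()` passes each entry of `this.items`)
* @returns {{data: {read: number, autoclass: number, applied: number, jobtype: number, class: number}, details: {}}} a `details` object plus a `data` object whose fields are all initialised to 0
*/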
reduceData(d) {
const outObj = { 'details':{}, 'data':{ 'read':0, 'applied':0, 'jobtype': 0, 'class':0, 'autoclass':0 } };
@ -174,6 +187,10 @@ class MasterBase {
return `https://image.silvrtree.co.uk/q${q}/${url}`;
}
/**
* Start a run: first resets `this.items` and `this.rawItems` to empty arrays.
* @returns {Promise<void>}
*/
async go() {
this.items = [];
this.rawItems = [];

View File

@ -10,7 +10,7 @@ const mongoose = require('mongoose');
const log4js = require('log4js');
const logger = log4js.getLogger();
const Jobs = require('../models/jobs');
const JobsModel = require('../models/jobs');
const { Utils } = require('@rakh/utils');
@ -25,4 +25,4 @@ mongoose.connect(`mongodb://martin:1V3D4m526i@127.0.0.1/jobs`);
const mDB = mongoose.connection;
mDB.on('error', console.error.bind(console, 'connection error:'));
module.exports = Jobs;
module.exports = JobsModel;

View File

@ -21,9 +21,14 @@ class MasterScraper extends MasterBase {
super();
}
/**
*
* @param url
* @param useStone
* @returns {Promise<unknown>}
*/
getContent(url, useStone = false) {
/*
let headers = new Headers({
"Accept" : "application/json",
@ -59,6 +64,10 @@ fetch(url, {
});
};
/**
* Fetch the page at `this.url`; the URL is logged before fetching.
* @returns {Promise<void>}
*/
async getPage() {
console.log('>> getPage: fetching', this.url);
const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss');
@ -75,30 +84,59 @@ fetch(url, {
// Site specific parts below here
/**
* Break each page into items
* @returns {Promise<void>}
*/
async breakPage() {
  // Intentionally empty in the base scraper: this is one of the site-specific
  // hooks. Subclasses such as TotaljobsScraper override it to break the
  // current page into items.
}
/**
*
* @param part
* @returns {Promise<void>}
*/
async extractDetails(part) {
  // Intentionally empty in the base scraper: site-specific hook.
  // TotaljobsScraper overrides it to build and return a details object
  // from `part`; this stub resolves to undefined.
}
/**
*
* @returns {Promise<void>}
*/
async checkNext() {
  // Intentionally empty in the base scraper: site-specific hook.
  // TotaljobsScraper overrides it to inspect the pagination links of the
  // current page.
}
/**
*
* @returns {Promise<void>}
*/
async processSite() {
  // Intentionally empty in the base scraper: site-specific hook that
  // subclasses such as TotaljobsScraper override with their own processing.
}
/**
*
* @returns {Promise<void>}
*/
async getIndividualPage() {
  // Intentionally empty in the base scraper: site-specific hook.
  // NOTE(review): the TotaljobsScraper override is declared as
  // getIndividualPage(item) and returns a new item object, whereas this stub
  // declares no parameter and resolves to undefined.
}
/**
*
* @returns {Promise<void>}
*/
async getJobPages() {
  // Intentionally empty in the base scraper: site-specific hook.
  // TotaljobsScraper overrides it to walk `this.items` and replace the list
  // with newly built items.
}
/**
*
* @returns {Promise<void>}
*/
async go() {
  // Intentionally empty in the base scraper: site-specific entry point.
  // NOTE(review): the TotaljobsScraper override is declared as
  // go(location = 'london'), whereas this stub declares no parameter.
}

View File

@ -22,7 +22,10 @@ class TotaljobsScraper extends MasterScraper {
}
// Site specific parts below here
/**
* Break the current page (`this.currentPage`) into ads and append them to
* `this.items`.
* @returns {Promise<void>}
*/
async breakPage() {
const $ = this.currentPage;
const ads = [];
@ -39,6 +42,11 @@ class TotaljobsScraper extends MasterScraper {
this.items = [...this.items, ...ads];
}
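/**
* Extract the details of one page part into a new object.
* @param part - the part to examine; it is loaded with `cheerio.load()`
* @returns {Promise<{}>} the newly built details object
*/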
async extractDetails(part) {
const newObj = {};
const $part = cheerio.load(part);
@ -61,6 +69,11 @@ class TotaljobsScraper extends MasterScraper {
return newObj;
}
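/**
* Process the individual page for `item` and return the result as a new object.
* @param item - the item to look up; its `url` is logged first
* @returns {Promise<*>} a shallow copy of `item`, presumably extended with data from the fetched page (confirm against the body)
*/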
async getIndividualPage(item) {
const newItem = {...item};
console.log('Getting', item.url);
@ -75,6 +88,10 @@ class TotaljobsScraper extends MasterScraper {
return newItem;
}
/**
* Walk `this.items` and replace the list with the newly built items.
* @returns {Promise<void>}
*/
async getJobPages() {
const newItems = [];
for (let item of this.items) {
@ -86,6 +103,10 @@ class TotaljobsScraper extends MasterScraper {
this.items = [...newItems];
}
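/**
* Look for the next page: reads the `href` of the last child of `.pagination`
* on the current page, falling back to an empty string when there is none.
* @returns {Promise<void>}
*/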
async checkNext() {
const $ = this.currentPage;
const next = $('.pagination > *:last-child').attr('href') || '';
@ -96,6 +117,10 @@ class TotaljobsScraper extends MasterScraper {
console.log(next);
}
/**
* Process the site; the last visible step stores the collected items via
* `addToMongo()`.
* @returns {Promise<void>}
*/
async processSite() {
console.log('Processing...');
@ -124,6 +149,11 @@ class TotaljobsScraper extends MasterScraper {
await this.addToMongo();
}
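/**
* Set the start URL to the totaljobs.com contract-job search for `location`.
* @param {string} location - place name inserted (URL-encoded) into the search URL; defaults to 'london'
* @returns {Promise<void>}
*/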
async go(location = 'london') {
this.setStartUrl(`https://www.totaljobs.com/jobs/contract/html-or-vue-or-vuejs-or-web-or-sql-or-delphi-or-vb-or-vbscript-or-php-or-ajax-or-mysql-or-sqlserver-or-javascript-or-node-or-nodejs-or-svelte-or-sveltejs-not-react/in-${encodeURIComponent(location)}?q=Html+Or+Vue+Or+Vuejs+Or+Web+Or+Sql+Or+Delphi+Or+Vb+Or+Vbscript+Or+Php+Or+Ajax+Or+Mysql+Or+Sqlserver+Or+Javascript+Or+Node+Or+Nodejs+Or+Svelte+Or+Sveltejs+NOT+React&postedwithin=3&radius=20`);
// this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+nodejs&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=7&limit=10&sort=date&psf=advsrch&from=advancedsearch');