jobscraper/scrapers/indeed.orig.js
Martin Donnelly 5001bbd798 init
2020-05-19 10:05:04 +01:00

185 lines
4.6 KiB
JavaScript

/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const axios = require('axios');
const fecha = require('fecha');
const fs = require('fs');
const dbmanager = require('../lib/dbmanager');
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
/**
 * Scraper for the job listings served by www.indeed.co.uk.
 *
 * Typical flow (see processSite): fetch the page at `this.url`, split it into
 * adverts, run the adverts through the reject/accept filters and pass the
 * remaining ones to the db manager.
 */
class IndeedScraper {
  constructor() {
    // URL of the results page that getPage() will fetch.
    this.url = '';
    // Adverts collected so far (objects built by extractDetails).
    this.items = [];
    // Cheerio root function of the most recently loaded page, or null.
    this.currentPage = null;
    // Host used to turn the relative links found in the page into absolute URLs.
    this.host = 'www.indeed.co.uk';
  }

  /**
   * Sets the URL that the next getPage() call will fetch.
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores an already-parsed page as the current page.
   * @param {Function} page - cheerio root function (the result of cheerio.load).
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Fetches `this.url` and stores the parsed document as the current page.
   * @returns {Promise<void>}
   * @throws the axios error when the request fails, or an Error when the
   *   response status is not 200.
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    let response;
    try {
      response = await axios.get(this.url);
    } catch (err) {
      // Report which URL failed, then propagate the real cause instead of
      // carrying on with an undefined response.
      console.error(`Failed to fetch ${this.url}: ${err.message}`);
      throw err;
    }

    console.log(response.status);
    if (response.status !== 200) {
      throw new Error(`Unexpected HTTP status ${response.status} for ${this.url}`);
    }

    this.loadPage(cheerio.load(response.data));
  }

  /**
   * Inserts every collected advert through the db manager.
   *
   * Inserts are started together; the returned promise resolves once all of
   * them have settled. A failed insert is logged and does not reject.
   * @returns {Promise<void>}
   */
  async addToDB() {
    const insert = async (item) => {
      console.log(item);
      try {
        const data = await dbmanager.insertOne(item);
        console.log(data);
      } catch (err) {
        console.error(err.message || 'Some error occurred while querying the database.');
      }
    };

    await Promise.all(this.items.map(insert));
  }

  /**
   * Narrows `this.items` by applying the reject filter followed by the
   * accept filter, logging the item count after each step.
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);
    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);
    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
  }

  // Site specific parts below here

  /**
   * Splits the current page into adverts and appends them to `this.items`.
   *
   * An advert whose details cannot be extracted is logged and skipped so that
   * one malformed entry does not discard the rest of the page.
   * @returns {Promise<void>}
   */
  async breakPage() {
    const $ = this.currentPage;

    // Collect the elements synchronously; `.each` does not wait for promises,
    // so the async extraction has to happen outside of its callback.
    const elements = [];
    $('div.row.result').each((index, element) => {
      elements.push(element);
    });

    const ads = [];
    for (const element of elements) {
      try {
        ads.push(await this.extractDetails(element));
      } catch (err) {
        console.error(`Skipping advert: ${err.message}`);
      }
    }

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert object from one result element.
   * @param {*} part - a result element of the page (anything cheerio.load accepts).
   * @returns {Promise<Object>} advert with title, site, url, id, summary,
   *   company (null when empty), location, postDate, salary, isEasyApply and
   *   timestamp (whole seconds since the Unix epoch).
   * @throws {Error} when the advert has no id attribute on `h2.title a`.
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);

    const id = $part('h2.title a').attr('id');
    if (typeof id !== 'string') {
      throw new Error('advert has no id attribute on "h2.title a"');
    }

    const textOf = (selector) => $part(selector).text().trim();

    return {
      title: textOf('.jobtitle'),
      site: 'indeed',
      url: `https://${ this.host }${$part('.jobtitle').attr('href')}`,
      id: id.trim(),
      summary: textOf('.summary'),
      company: textOf('.company') || null,
      location: textOf('.location'),
      postDate: textOf('.date'),
      salary: textOf('.salary.no-wrap'),
      isEasyApply: textOf('.iaLabel') === 'Easily apply',
      timestamp: Math.floor(Date.now() / 1000),
    };
  }

  /**
   * Looks up the link held by the last child of the pagination element.
   * @returns {Promise<string>} absolute URL built from that link's href, or ''
   *   when the element has no href.
   */
  async checkNext() {
    const $ = this.currentPage;
    const href = $('.pagination > *:last-child').attr('href') || '';
    const next = href === '' ? '' : `https://${ this.host }${href}`;
    console.log(next);
    return next;
  }

  /**
   * Runs one full pass: fetch, split into adverts, filter and store.
   *
   * Only the page at `this.url` is processed; the URL reported by checkNext()
   * is logged but not followed.
   * @returns {Promise<void>}
   */
  async processSite() {
    console.log('Processing...');
    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Entry point: points the scraper at the advanced-search URL below and
   * processes it.
   * @returns {Promise<void>}
   */
  async go() {
    this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+node&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=1&limit=50&sort=date&psf=advsrch&from=advancedsearch');
    await this.processSite();
  }
}
// Loading this file starts a scrape immediately; the class is also exported.
const ind = new IndeedScraper();
ind.go().catch((err) => {
  // Without a handler a failed run is an unhandled promise rejection.
  // Log it and mark the process as failed so the failure stays visible.
  console.error(err);
  process.exitCode = 1;
});
module.exports = IndeedScraper;