185 lines
4.6 KiB
JavaScript
185 lines
4.6 KiB
JavaScript
|
/**
|
||
|
* Created by WebStorm.
|
||
|
* User: martin
|
||
|
* Date: 15/04/2020
|
||
|
* Time: 11:55
|
||
|
|
||
|
*/
|
||
|
|
||
|
const cheerio = require('cheerio');
|
||
|
|
||
|
const axios = require('axios');
|
||
|
const fecha = require('fecha');
|
||
|
|
||
|
const fs = require('fs');
|
||
|
|
||
|
const dbmanager = require('../lib/dbmanager');
|
||
|
|
||
|
const filterReject = require('../lib/filter_reject');
|
||
|
const filterAccept = require('../lib/filter_md_jobs');
|
||
|
|
||
|
/**
 * Scrapes job adverts from Indeed search result pages, filters them and
 * stores the survivors through `dbmanager`.
 *
 * Typical use is `new IndeedScraper().go()`; the individual steps are exposed
 * as methods so they can be driven separately.
 */
class IndeedScraper {
  constructor() {
    // URL of the result page that getPage() will fetch next.
    this.url = '';
    // Adverts collected so far (plain objects built by extractDetails()).
    this.items = [];
    // Loaded cheerio document of the most recently fetched page, or null.
    this.currentPage = null;
    // Host used to turn the relative links found in the page into absolute URLs.
    this.host = 'www.indeed.co.uk';
  }

  /**
   * Sets the URL of the next page to fetch.
   *
   * @param {string} newUrl - Absolute URL of a search result page.
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Makes an already loaded page the current one.
   *
   * @param {Function} page - Loaded document as returned by `cheerio.load()`.
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Fetches `this.url` and loads the returned HTML as the current page.
   *
   * A failed request is logged instead of thrown; in that case (and for any
   * status other than 200) `currentPage` is left as `null`, so a stale page
   * is never processed twice.
   *
   * @returns {Promise<boolean>} `true` when a page was loaded, `false` otherwise.
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);
    this.currentPage = null;

    let response;
    try {
      response = await axios.get(this.url);
    } catch (err) {
      console.error(err);
      return false;
    }

    console.log(response.status);
    if (response.status !== 200) {
      return false;
    }

    this.loadPage(cheerio.load(response.data));
    return true;
  }

  /**
   * Inserts every collected advert through `dbmanager.insertOne`.
   *
   * All inserts are started together and awaited, so the returned promise
   * settles only once every insert has finished. A failing insert is logged
   * and does not prevent the remaining adverts from being stored.
   *
   * @returns {Promise<void>}
   */
  async addToDB() {
    const inserts = this.items.map(async (item) => {
      console.log(item);

      try {
        const data = await dbmanager.insertOne(item);
        console.log(data);
      } catch (err) {
        console.error((err && err.message) || 'Some error occurred while querying the database.');
      }
    });

    await Promise.all(inserts);
  }

  /**
   * Narrows `this.items` by applying the reject filter and then the accept
   * filter, logging the item count after each step.
   *
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);

    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);

    console.log(`After accept ${this.items.length} items...`);
  }

  // Site specific parts below here

  /**
   * Extracts every advert from the current page and appends them to
   * `this.items` in document order.
   *
   * Adverts are awaited one by one (cheerio's `.each()` does not wait for
   * async callbacks). An advert that cannot be parsed is logged and skipped.
   * Does nothing when no page is loaded.
   *
   * @returns {Promise<void>}
   */
  async breakPage() {
    const $ = this.currentPage;
    if (!$) {
      console.warn('>> breakPage: no page loaded, nothing to extract');
      return;
    }

    const ads = [];
    for (const element of $('div.row.result').toArray()) {
      try {
        ads.push(await this.extractDetails(element));
      } catch (err) {
        console.error('>> breakPage: skipping advert:', (err && err.message) || err);
      }
    }

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert object from one search result element.
   *
   * @param {*} part - One `div.row.result` element of the current page.
   * @returns {Promise<Object>} Advert with `title`, `site`, `url`, `id`,
   *   `summary`, `company`, `location`, `postDate`, `salary`, `isEasyApply`
   *   and `timestamp` (seconds since the Unix epoch). `company` is `null`
   *   when empty; `url` is `null` when the title carries no link.
   * @throws {Error} When the advert has no id attribute.
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const textOf = (selector) => $part(selector).text().trim();

    const id = $part('h2.title a').attr('id');
    if (typeof id !== 'string') {
      throw new Error('Advert has no id attribute on "h2.title a".');
    }

    const href = $part('.jobtitle').attr('href');

    return {
      title: textOf('.jobtitle'),
      site: 'indeed',
      // Links in the page are relative to the host.
      url: href ? `https://${this.host}${href}` : null,
      id: id.trim(),
      summary: textOf('.summary'),
      company: textOf('.company') || null,
      location: textOf('.location'),
      postDate: textOf('.date'),
      salary: textOf('.salary.no-wrap'),
      isEasyApply: textOf('.iaLabel') === 'Easily apply',
      // Math.floor instead of ~~, which truncates to a signed 32-bit integer.
      timestamp: Math.floor(Date.now() / 1000),
    };
  }

  /**
   * Reads the link of the last pagination element on the current page.
   *
   * @returns {Promise<string>} Absolute URL of that link, or `''` when the
   *   element has no href or no page is loaded.
   */
  async checkNext() {
    const $ = this.currentPage;
    if (!$) {
      return '';
    }

    const href = $('.pagination > *:last-child').attr('href') || '';
    const next = href === '' ? '' : `https://${this.host}${href}`;

    console.log(next);
    return next;
  }

  /**
   * Runs the whole pipeline: fetch and parse result pages, filter the
   * collected adverts and store them.
   *
   * @param {number} [maxPages=1] - Maximum number of result pages to fetch.
   *   Pagination links are followed until this limit is reached, there is no
   *   further link, or a link points at a page that was already fetched.
   * @returns {Promise<void>}
   */
  async processSite(maxPages = 1) {
    console.log('Processing...');

    const pageLimit = Math.max(1, Number(maxPages) || 1);
    const visited = new Set();

    // Pages depend on each other (each one yields the next URL), so they are
    // fetched sequentially.
    for (let fetched = 1; fetched <= pageLimit; fetched += 1) {
      visited.add(this.url);

      await this.getPage();
      await this.breakPage();
      const nextPage = await this.checkNext();

      if (fetched === pageLimit || nextPage === '' || visited.has(nextPage)) {
        break;
      }
      this.setStartUrl(nextPage);
    }

    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Entry point: points the scraper at the default search and processes it.
   *
   * @returns {Promise<void>}
   */
  async go() {
    this.setStartUrl('https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+node&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=1&limit=50&sort=date&psf=advsrch&from=advancedsearch');
    // Alternative query (last 7 days, 10 results per page):
    // 'https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+node&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=7&limit=10&sort=date&psf=advsrch&from=advancedsearch'

    await this.processSite();
  }
}
|
||
|
|
||
|
// NOTE(review): the scrape starts as soon as this module is loaded, including
// when it is only require()d for the exported class. Kept for existing callers.
const ind = new IndeedScraper();

// go() is async: without a handler a failed run would surface only as an
// unhandled promise rejection. Log it and flag the process as failed instead.
ind.go().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

module.exports = IndeedScraper;
|