/**
 * Created by WebStorm.
 * User: martin
 * Date: 15/04/2020
 * Time: 11:55
 */
const cheerio = require('cheerio');
const axios = require('axios');
const fecha = require('fecha');
const fs = require('fs');
const dbmanager = require('../lib/dbmanager');
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');

// Search used by go() when the caller does not supply a start URL.
const DEFAULT_SEARCH_URL = 'https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=javascript+node&as_not=&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=25&l=london&fromage=1&limit=50&sort=date&psf=advsrch&from=advancedsearch';

/**
 * Scrapes one Indeed search-results page, turns each result into a plain
 * advert object, runs the adverts through the reject/accept filters and hands
 * the survivors to the database manager.
 */
class IndeedScraper {
  constructor() {
    this.url = '';
    this.items = [];
    // Cheerio root of the most recently fetched page, or null when none is loaded.
    this.currentPage = null;
    this.host = 'www.indeed.co.uk';
  }

  /**
   * Sets the URL that the next getPage() call will fetch.
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores a loaded cheerio root as the current page.
   * @param {Function|null} page - result of cheerio.load(), or null to clear.
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Fetches this.url and, on HTTP 200, loads the body into cheerio as the
   * current page. A failed request is logged and leaves currentPage as null
   * rather than throwing, so the rest of the pipeline simply sees no adverts.
   * @returns {Promise<void>}
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    // Clear any previous page first so a failed fetch can never cause the
    // old page to be parsed a second time.
    this.loadPage(null);

    let response;
    try {
      response = await axios.get(this.url);
    } catch (err) {
      console.error(err);
      return;
    }

    console.log(response.status);
    if (response.status !== 200) {
      return;
    }

    // Debug aid (disabled): the raw HTML can be dumped with
    // fs.writeFileSync(`../test/data/indeed/indeed-${timestamp}.html`, response.data),
    // where timestamp = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss').
    this.loadPage(cheerio.load(response.data));
  }

  /**
   * Inserts every item into the database and waits for all inserts to settle.
   * Failures are logged per item and do not stop the remaining inserts.
   * @returns {Promise<void>}
   */
  async addToDB() {
    await Promise.all(this.items.map(async (item) => {
      console.log(item);
      try {
        const data = await dbmanager.insertOne(item);
        console.log(data);
      } catch (err) {
        console.error(err.message || 'Some error occurred while querying the database.');
      }
    }));
  }

  /**
   * Applies the reject filter and then the accept filter to this.items,
   * logging the item count before and after each pass.
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
  }

  // Site specific parts below here

  /**
   * Splits the current page into result sections, extracts an advert from
   * each and appends them to this.items. Does nothing when no page is loaded.
   * @returns {Promise<void>}
   */
  async breakPage() {
    const $ = this.currentPage;
    if (!$) {
      return;
    }

    // cheerio's each() is synchronous and does not wait for async callbacks,
    // so collect the elements and await the extractions explicitly.
    const sections = $('div.row.result').toArray();
    const ads = await Promise.all(sections.map((item) => this.extractDetails(item)));

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert object from a single search-result element.
   * @param {*} part - one 'div.row.result' element from the current page.
   * @returns {Promise<Object>} advert with title, site, url, id, summary,
   *   company, location, postDate, salary, isEasyApply and timestamp (Unix
   *   seconds). url and id are null when the markup does not provide them.
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const $title = $part('.jobtitle');
    const href = $title.attr('href');
    const rawId = $part('h2.title a').attr('id');

    return {
      title: $title.text().trim(),
      site: 'indeed',
      // Avoid producing "https://<host>undefined" when the link is missing.
      url: href ? `https://${this.host}${href}` : null,
      // attr() returns undefined for a missing element/attribute; don't trim that.
      id: rawId ? rawId.trim() : null,
      summary: $part('.summary').text().trim(),
      company: $part('.company').text().trim() || null,
      location: $part('.location').text().trim(),
      postDate: $part('.date').text().trim(),
      salary: $part('.salary.no-wrap').text().trim(),
      isEasyApply: $part('.iaLabel').text().trim() === 'Easily apply',
      // Math.floor instead of ~~, which truncates to a signed 32-bit integer.
      timestamp: Math.floor(Date.now() / 1000),
    };
  }

  /**
   * Reads the href of the last child of the pagination element.
   * @returns {Promise<string>} absolute URL built from that href, or '' when
   *   there is no href or no page is loaded.
   */
  async checkNext() {
    const $ = this.currentPage;
    if (!$) {
      return '';
    }

    const href = $('.pagination > *:last-child').attr('href') || '';
    const next = href === '' ? '' : `https://${this.host}${href}`;

    console.log(next);
    return next;
  }

  /**
   * Runs the pipeline for the current start URL: fetch, split into adverts,
   * filter, store.
   * @returns {Promise<void>}
   */
  async processSite() {
    console.log('Processing...');

    await this.getPage();
    await this.breakPage();

    // TODO: pagination is not followed yet. checkNext() returns the next-page
    // URL; a loop would call setStartUrl() with it and repeat until the URL is
    // empty or equal to the page just processed.
    await this.checkNext();

    await this.filterAdverts();
    await this.addToDB();
  }

  /**
   * Entry point: sets the start URL and processes it.
   * @param {string} [startUrl] - search-results URL; defaults to the built-in
   *   search (DEFAULT_SEARCH_URL).
   * @returns {Promise<void>}
   */
  async go(startUrl = DEFAULT_SEARCH_URL) {
    this.setStartUrl(startUrl);
    await this.processSite();
  }
}

const ind = new IndeedScraper();
// Don't leave the run as a floating promise: report the failure and flag a
// non-zero exit code.
ind.go().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

module.exports = IndeedScraper;