/**
 * Created by WebStorm.
 * User: martin
 * Date: 15/04/2020
 * Time: 11:55
 */
const cheerio = require('cheerio');
const MasterScraper = require('../lib/scraper');

/**
 * Scraper for Indeed UK contract job listings.
 *
 * Several members used below (`currentPage`, `items`, `getPage`, `getContent`,
 * `makeUrl`, `setStartUrl`, `filterAdverts`, `addToDB`, `addToMongo`) are not
 * defined in this file; they are presumably provided by MasterScraper.
 */
class IndeedMobileScraper extends MasterScraper {
  /**
   * Sets the site identity, the mobile browser User-Agent header and the
   * request options used for this site.
   */
  constructor() {
    super();

    this.siteurl = 'www.indeed.co.uk/m/';
    this.siteid = 'indeed';
    this.useStone = true; // NOTE(review): not read in this file; presumably a base-class switch

    this.headers = {
      'User-Agent': 'Mozilla/5.0 (Linux; Android 9; SM-G960F Build/PPR1.180610.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.157 Mobile Safari/537.36',
    };

    this.requestOptions = {
      url: '',
      headers: this.headers,
    };

    // NOTE(review): not used in this file. The `g` flag makes `.test()` and
    // `.exec()` stateful through `lastIndex` - confirm how the consumer uses it.
    this.antiAd = /sja\d+/gi;
  }

  // Site specific parts below here

  /**
   * Looks up the results container in the current page and appends any
   * extracted adverts to `this.items`.
   *
   * @returns {Promise<void>}
   */
  async breakPage() {
    const $ = this.currentPage;
    const ads = [];
    const sections = $('#jobResults.results');

    console.log('Sections:', $(sections).html());

    // NOTE(review): per-advert extraction is currently disabled, so `ads`
    // stays empty and no new items are added. The disabled code was:
    //
    //   await sections.each(async (index, item) => {
    //     const ad = await this.extractDetails(item);
    //     if (ad !== null) ads.push(ad);
    //     console.log(ads);
    //   });
    //
    // If it is re-enabled, note that awaiting `.each()` does not wait for an
    // async callback; iterate the elements with `for...of` instead.

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert record from one search-result fragment.
   *
   * @param {*} part Markup or DOM node for a single result; passed to `cheerio.load`.
   * @returns {Promise<Object>} Record with `title`, `site`, `url`, `id`,
   *   `summary`, `company`, `location`, `postDate`, `salary`, `isEasyApply`
   *   and `timestamp` (Unix time in whole seconds).
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);

    // Math.floor rather than `~~`: the double bitwise NOT coerces to a signed
    // 32-bit integer, which wraps for timestamps after January 2038.
    const now = Math.floor(Date.now() / 1000);

    // The anchor or its id may be absent; fall back to null instead of
    // throwing on `undefined.trim()`.
    const rawId = $part('h2.title a').attr('id');

    return {
      title: $part('.jobtitle').text().trim(),
      site: this.siteid,
      url: this.makeUrl($part('.jobtitle').attr('href')),
      id: typeof rawId === 'string' ? rawId.trim() : null,
      summary: $part('.summary').text().trim(),
      company: $part('.company').text().trim() || null,
      location: $part('.location').text().trim(),
      postDate: $part('.date').text().trim(),
      salary: $part('.salary.no-wrap').text().trim(),
      isEasyApply: $part('.iaLabel').text().trim() === 'Easily apply',
      timestamp: now,
    };
  }

  /**
   * Fetches an advert's own page and replaces its summary with the HTML of
   * the `#jobDescriptionText` element.
   *
   * Failures are logged and otherwise ignored, in which case the returned
   * copy keeps the original summary.
   *
   * @param {Object} item Advert record; `item.url` is fetched.
   * @returns {Promise<Object>} Shallow copy of `item`, possibly with a new `summary`.
   */
  async getIndividualPage(item) {
    const newItem = { ...item };

    console.log('Getting', item.url);

    try {
      const html = await this.getContent(item.url);
      const $ = cheerio.load(html);
      newItem.summary = $('#jobDescriptionText').html();
    } catch (err) {
      // Best effort: one failed page must not abort the whole run.
      console.error(err);
    }

    return newItem;
  }

  /**
   * Replaces `this.items` with copies enriched by `getIndividualPage`.
   * Pages are fetched one at a time, in order.
   *
   * @returns {Promise<void>}
   */
  async getJobPages() {
    const newItems = [];

    for (const item of this.items) {
      newItems.push(await this.getIndividualPage(item));
    }

    this.items = newItems;
  }

  /**
   * Reads the href of the last child of `.pagination` in the current page.
   *
   * @returns {Promise<string>} The href passed through `makeUrl`, or `''`
   *   when the page has no such link.
   */
  async checkNext() {
    const $ = this.currentPage;
    const next = $('.pagination > *:last-child').attr('href') || '';
    const nextUrl = next !== '' ? this.makeUrl(next) : '';

    console.log(next);

    return nextUrl;
  }

  /**
   * Runs one scrape pass: resets `this.items`, loads and breaks up the start
   * page, checks for a next page, fetches each advert page, then filters and
   * stores the results.
   *
   * @returns {Promise<void>}
   */
  async processSite() {
    console.log('Processing...');

    // TODO: pagination is disabled, so only the start page is processed. The
    // disabled loop repeated the first four steps below while
    // `nextPage = await this.checkNext()` was non-empty and different from the
    // previous `this.url`, calling `this.setStartUrl(nextPage)` each time.
    this.items = [];
    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.getJobPages();

    await this.filterAdverts();
    await this.addToDB();
    await this.addToMongo();
  }

  /**
   * Entry point: sets the start URL and processes the site.
   *
   * @param {string} [location='london'] Search location; URL-encoded into the
   *   `l` query parameter of the second start URL.
   * @returns {Promise<void>}
   */
  async go(location = 'london') {
    // NOTE(review): the start URL is set twice. The first URL hard-codes
    // `l=london`; presumably the second call overrides it - confirm against
    // setStartUrl before removing either call.
    this.setStartUrl(`https://www.indeed.co.uk/m/jobs?q=%28Html+or+Web+or+Sql+or+Delphi+or+Vb+or+Vbscript+or+Php+or+Ajax+or+Mysql+or+Sqlserver+or+Javascript+or+Nodejs+or+vuejs+or+sveltejs%29+-React&l=london&radius=0&jt=contract&rq=1&rsIdx=0&fromage=last&newcount=187`);
    this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);

    await this.processSite();

    console.log(`Indeed ${location} completed`);
  }
}

module.exports = IndeedMobileScraper;