jobscraper/scrapers/indeedMobile.js
2020-08-24 09:35:30 +01:00

158 lines
4.1 KiB
JavaScript

/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const MasterScraper = require('../lib/scraper');
/**
 * Scraper for Indeed UK contract-job listings.
 *
 * Extends MasterScraper. Several methods used below (getPage, getContent,
 * makeUrl, setStartUrl, filterAdverts, addToDB, addToMongo) are not defined
 * in this class and are presumably inherited from MasterScraper.
 */
class IndeedMobileScraper extends MasterScraper {
  /**
   * Sets the site identity, the mobile User-Agent header and the request
   * options read by the rest of the scraper.
   */
  constructor() {
    super();

    this.siteurl = 'www.indeed.co.uk/m/';
    this.siteid = 'indeed';
    this.useStone = true;
    this.headers = {
      "User-Agent" : "Mozilla/5.0 (Linux; Android 9; SM-G960F Build/PPR1.180610.011; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/74.0.3729.157 Mobile Safari/537.36"
    };
    this.requestOptions = {
      'url' : '',
      headers : this.headers
    };
    // NOTE(review): not used in this class; presumably consumed by the
    // inherited filterAdverts(). The `g` flag makes .test()/.exec() stateful
    // through lastIndex — confirm the consumer accounts for that.
    this.antiAd = /sja\d+/gi;
  }

  // Site specific parts below here

  /**
   * Splits the current results page into adverts and appends them to
   * `this.items`.
   *
   * Per-advert extraction is currently disabled, so this only logs the
   * markup of the results container and leaves `this.items` unchanged in
   * content.
   *
   * @returns {Promise<void>}
   */
  async breakPage() {
    const $ = this.currentPage;
    const ads = [];
    const sections = $('#jobResults.results');

    console.log('Sections:', $(sections).html());

    // NOTE(review): extraction loop disabled in the original source. If it is
    // re-enabled, note that `.each()` does not await async callbacks, so
    // `ads` would still be empty when it is merged below; iterate with
    // for...of and await each extractDetails() call instead.
    /*await sections.each(async (index, item) => {
      // console.log($(item).html());
      const ad = await this.extractDetails(item);
      if (ad !== null)
        ads.push(ad);
      console.log(ads);
      // console.log('<<<<<<<<<>>>>>>>>>');
    });*/

    this.items = [...this.items, ...ads];
  }

  /**
   * Builds an advert record from one result fragment.
   *
   * @param {*} part - Markup (or node) for a single result, passed to
   *   `cheerio.load`.
   * @returns {Promise<Object|null>} The advert record, or `null` when the
   *   title anchor carries no `id` attribute (previously this case threw a
   *   TypeError). The disabled caller in breakPage() already skips `null`.
   */
  async extractDetails(part) {
    const $part = cheerio.load(part);
    const textOf = (selector) => $part(selector).text().trim();

    const rawId = $part('h2.title a').attr('id');
    if (typeof rawId !== 'string') {
      return null;
    }

    return {
      title: textOf('.jobtitle'),
      site: this.siteid,
      url: this.makeUrl($part('.jobtitle').attr('href')),
      id: rawId.trim(),
      summary: textOf('.summary'),
      company: textOf('.company') || null,
      location: textOf('.location'),
      postDate: textOf('.date'),
      salary: textOf('.salary.no-wrap'),
      isEasyApply: textOf('.iaLabel') === 'Easily apply',
      // Unix time in whole seconds. Math.floor replaces the original `~~`,
      // which truncates to a signed 32-bit integer and turns negative after
      // 2038-01-19.
      timestamp: Math.floor(Date.now() / 1000),
    };
  }

  /**
   * Fetches an advert's own page and replaces its summary with the HTML of
   * `#jobDescriptionText`.
   *
   * Best effort: a failed fetch or parse is logged and the advert is
   * returned with its existing summary.
   *
   * @param {Object} item - Advert record; `item.url` is fetched.
   * @returns {Promise<Object>} A shallow copy of `item`, updated on success.
   */
  async getIndividualPage(item) {
    const newItem = {...item};

    console.log('Getting', item.url);

    try {
      const html = await this.getContent(item.url);
      const $ = cheerio.load(html);
      newItem.summary = $('#jobDescriptionText').html();
    } catch (err) {
      console.error(err);
    }

    return newItem;
  }

  /**
   * Replaces every entry of `this.items` with its detailed version from
   * getIndividualPage(). Pages are fetched one at a time, in order, as in
   * the original implementation.
   *
   * @returns {Promise<void>}
   */
  async getJobPages() {
    const detailed = [];

    for (const item of this.items) {
      detailed.push(await this.getIndividualPage(item));
    }

    this.items = detailed;
  }

  /**
   * Looks up the link to the next results page on the current page.
   *
   * The original computed the URL with makeUrl() but discarded it and
   * returned nothing; the resolved URL is now returned so that pagination
   * can consume it.
   *
   * @returns {Promise<string>} URL built by makeUrl() from the `href` of the
   *   last child of `.pagination`, or `''` when there is none.
   */
  async checkNext() {
    const $ = this.currentPage;
    const href = $('.pagination > *:last-child').attr('href') || '';
    const nextUrl = href !== '' ? this.makeUrl(href) : '';

    console.log(nextUrl);

    return nextUrl;
  }

  /**
   * Runs one scrape pass: load the page, split it, look up the next page,
   * fetch every advert page, then filter and persist the results.
   *
   * @returns {Promise<void>}
   */
  async processSite() {
    console.log('Processing...');

    // TODO: pagination is not wired up. The original had a disabled
    // do/while loop that fed the result of checkNext() into setStartUrl()
    // until the next URL was empty or equal to the previous page's URL.
    this.items = [];

    await this.getPage();
    await this.breakPage();
    await this.checkNext();
    await this.getJobPages();

    await this.filterAdverts();
    await this.addToDB();
    await this.addToMongo();
  }

  /**
   * Entry point: sets the search URL for a location and processes the site.
   *
   * @param {string} [location='london'] - Search location; URL-encoded into
   *   the `l` query parameter of the advanced-search URL.
   * @returns {Promise<void>}
   */
  async go(location = 'london') {
    // NOTE(review): this mobile-site URL is hard-coded to london and is
    // immediately followed by a second setStartUrl() call. If setStartUrl()
    // only stores the URL, this call is dead and can be removed — confirm
    // against MasterScraper.
    this.setStartUrl(`https://www.indeed.co.uk/m/jobs?q=%28Html+or+Web+or+Sql+or+Delphi+or+Vb+or+Vbscript+or+Php+or+Ajax+or+Mysql+or+Sqlserver+or+Javascript+or+Nodejs+or+vuejs+or+sveltejs%29+-React&l=london&radius=0&jt=contract&rq=1&rsIdx=0&fromage=last&newcount=187`);
    this.setStartUrl(`https://www.indeed.co.uk/jobs?as_and=&as_phr=&as_any=Html+Web+Sql+Delphi+Vb+Vbscript+Php+Ajax+Mysql+Sqlserver+Javascript+Nodejs+vuejs+sveltejs&as_not=React&as_ttl=&as_cmp=&jt=contract&st=&as_src=&salary=&radius=0&l=${encodeURIComponent(location)}&fromage=1&limit=50&sort=&psf=advsrch&from=advancedsearch`);

    await this.processSite();

    console.log(`Indeed ${location} completed`);
  }
}
// CommonJS export: the scraper class is this module's only export.
module.exports = IndeedMobileScraper;