/** * Created by WebStorm. * User: martin * Date: 15/04/2020 * Time: 11:55 */ const cheerio = require('cheerio'); const request = require('request'); const got = require('got'); const axios = require('axios'); const fecha = require('fecha'); const fs = require('fs'); const dbmanager = require('../lib/dbmanager'); const filterReject = require('../lib/filter_reject'); const filterAccept = require('../lib/filter_md_jobs'); class MasterScraper { constructor() { this.url = ''; this.items = []; this.currentPage = null; this.hosturl = ''; this.siteid = ''; this.useStone = false; this.requestOptions = { 'url' : '', 'proxy' : 'http://uk.proxymesh.com:31280', 'tunnel' : true }; } setStartUrl(newUrl) { this.url = newUrl; } loadPage(page) { this.currentPage = page; } getContent(url, useStone = false) { // return new pending promise return new Promise((resolve, reject) => { // select http or https module, depending on reqested url // const lib = url.startsWith('https') ? require('https') : require('http'); const options = Object.assign({}, this.requestOptions); if (useStone) options.url = `http://45.33.114.116:8080/${encodeURIComponent(url)}`; else options.url = url; console.log(options); got(options.url).then((response) => { resolve(response.body); }) .catch((e) => { reject(e.response.body); }); /* request(options, (err, _res, body) => { if (!err) resolve(body); else reject(err); }); */ }); }; async getPage() { console.log('>> getPage: fetching', this.url); const now = fecha.format(new Date(), 'YYYY-MM-DD--hhmmss'); const filename = `${this.siteid}-${now}.html`; await this.getContent(this.url, this.useStone) .then((html) => { const $ = cheerio.load(html); this.loadPage($); }) .catch((err) => console.error(err)); // console.log(response.status); /* if (response.status === 200) { // console.log(response.status); try{ console.log(`Saving ${__dirname}/../test/data/${this.siteid}/${filename}`); await fs.writeFileSync(`${__dirname}/../test/data/${this.siteid}/${filename}`, response.data); } catch(err) { console.error(err); } const $ = cheerio.load(response.data); this.loadPage($); }*/ } async addToDB() { console.log(`+ ${this.siteid} addToDB`); for(const item of this.items) { console.log(item); await dbmanager.insertOne(item) .then((data) => { console.log(data); }) .catch((err) => { console.error(`${this.siteid} db error`); console.error(err.message || 'Some error occurred while querying the database.'); }); } console.log(`- ${this.siteid} addToDB`); } async filterAdverts() { console.log('>> FilterAdverts'); console.log(`Currently ${this.items.length} items...`); this.items = this.items.filter(filterReject); console.log(`After reject ${this.items.length} items...`); this.items = this.items.filter(filterAccept); console.log(`After accept ${this.items.length} items...`); } makeUrl(appended) { return `https://${ this.siteurl }${appended}`; } makeProxyUrl(appended) { return `https://${ this.siteurl }${appended}`; } // Site specific parts below here async breakPage() { } async extractDetails(part) { } async checkNext() { } async processSite() { } async getIndividualPage() { } async getJobPages() { } async go() { } } module.exports = MasterScraper;