/**
 * Created by WebStorm.
 * User: martin
 * Date: 15/04/2020
 * Time: 11:55
 */
const fs = require('fs');
const MasterBase = require('./base');
const cheerio = require('cheerio');
const got = require('got');
const fecha = require('fecha');

// Base address used when `useStone` is set: the percent-encoded target URL
// is appended to it as the request path.
// NOTE(review): presumably a forwarding proxy; hard-coded host, consider
// moving it to configuration.
const STONE_PROXY_BASE = 'http://45.33.114.116:8080/';

/**
 * Shared scraping plumbing: downloads a page, saves a snapshot of the raw
 * HTML to disk and hands the parsed document to `loadPage`.
 *
 * Reads `this.url`, `this.siteid`, `this.useStone` and `this.requestOptions`,
 * and calls `this.loadPage`; none of these are defined in this file
 * (presumably they come from MasterBase or a subclass).
 */
class MasterScraper extends MasterBase {
  constructor() {
    super();
  }

  /**
   * Fetch the body of `url` with an HTTP GET.
   *
   * @param {string} url - Address of the page to download.
   * @param {boolean} [useStone=false] - When true, the request is sent to
   *   STONE_PROXY_BASE with the encoded `url` as its path instead of to
   *   `url` directly.
   * @returns {Promise<string>} Resolves with the response body.
   *   Rejects with the response body when the failure carries a response
   *   (kept as-is for existing callers), otherwise with the original error
   *   (network failure, timeout, ...).
   */
  async getContent(url, useStone = false) {
    // Copy so the shared request options are never mutated.
    const options = Object.assign({}, this.requestOptions);
    options.url = useStone
      ? `${STONE_PROXY_BASE}${encodeURIComponent(url)}`
      : url;
    console.log(options);

    try {
      // NOTE(review): only the URL is passed on; the other request options
      // are logged above but not applied to the request.
      const response = await got(options.url);
      return response.body;
    } catch (err) {
      // Errors raised before any response arrived have no `response`;
      // dereferencing it unconditionally used to throw inside the handler
      // and leave the returned promise pending forever.
      const hasBody = Boolean(err && err.response) && err.response.body !== undefined;
      throw hasBody ? err.response.body : err;
    }
  }

  /**
   * Download `this.url`, write the raw HTML to
   * `<siteid>-<YYYY-MM-DD--HHmmss>.html` in the current working directory,
   * parse it with cheerio and pass the result to `this.loadPage`.
   *
   * Failures (download, file write, parsing, a synchronous throw from
   * `loadPage`) are logged with `console.error` and not rethrown.
   *
   * @returns {Promise<void>}
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    // 24-hour `HH`: the 12-hour `hh` token without an AM/PM marker made
    // morning and afternoon snapshots share a filename.
    const now = fecha.format(new Date(), 'YYYY-MM-DD--HHmmss');
    const filename = `${this.siteid}-${now}.html`;

    try {
      const html = await this.getContent(this.url, this.useStone);
      fs.writeFileSync(filename, html);
      const $ = cheerio.load(html);
      // NOTE(review): not awaited; if loadPage is asynchronous its
      // rejection is not caught here. Confirm against its definition.
      this.loadPage($);
    } catch (err) {
      console.error(err);
    }
  }

  // Site specific parts below here.
  // Every method from here on is an empty placeholder in this class.

  /**
   * Break each page into items.
   * No-op in this class.
   *
   * @returns {Promise<void>}
   */
  async breakPage() {
  }

  /**
   * No-op in this class.
   *
   * @param part - Unused here (presumably one item produced by breakPage).
   * @returns {Promise<void>}
   */
  async extractDetails(part) {
  }

  /**
   * No-op in this class.
   *
   * @returns {Promise<void>}
   */
  async checkNext() {
  }

  /**
   * No-op in this class.
   *
   * @returns {Promise<void>}
   */
  async processSite() {
  }

  /**
   * No-op in this class.
   *
   * @returns {Promise<void>}
   */
  async getIndividualPage() {
  }

  /**
   * No-op in this class.
   *
   * @returns {Promise<void>}
   */
  async getJobPages() {
  }

  /**
   * No-op in this class.
   *
   * @returns {Promise<void>}
   */
  async go() {
  }
}

module.exports = MasterScraper;