jobscraper/lib/scraper.js
2020-07-21 12:05:01 +01:00

109 lines
1.9 KiB
JavaScript

/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const fs = require('fs');
const MasterBase = require('./base');
const cheerio = require('cheerio');
const got = require('got');
const fecha = require('fecha');
/**
 * Generic scraper: downloads a page, stores a snapshot of the raw HTML on disk
 * and hands the parsed document to `loadPage`.
 *
 * This class reads `this.requestOptions`, `this.url`, `this.siteid` and
 * `this.useStone` and calls `this.loadPage`; none of them are defined in this
 * file — presumably they come from MasterBase or a subclass (TODO confirm).
 */
class MasterScraper extends MasterBase {
  /**
   * Fetch `url` with `got` and return the response body.
   *
   * @param {string} url - Address of the page to download.
   * @param {boolean} [useStone=false] - When true, the request is sent to
   *   `http://45.33.114.116:8080/` with the URI-encoded target URL as its path
   *   instead of being sent to `url` directly.
   * @returns {Promise<string>} Resolves with the response body. Rejects with
   *   the response body when the failure carries a response, otherwise with
   *   the original error object.
   */
  async getContent(url, useStone = false) {
    const options = Object.assign({}, this.requestOptions);
    options.url = useStone
      ? `http://45.33.114.116:8080/${encodeURIComponent(url)}`
      : url;

    console.log(options);

    // NOTE(review): only the URL is passed to got; everything else copied from
    // requestOptions is logged above but not applied to the request — confirm
    // whether that is intended.
    try {
      const response = await got(options.url);
      return response.body;
    } catch (err) {
      // Failures that never produced a response have no `response` property.
      // Dereferencing it unguarded used to throw inside the handler and left
      // the returned promise pending forever, so fall back to the error itself.
      // The response body is still what callers receive for errors that carry
      // a response.
      throw err && err.response ? err.response.body : err;
    }
  }

  /**
   * Download `this.url`, write the raw HTML to `<siteid>-<timestamp>.html`
   * (relative path) and pass the cheerio-loaded document to `this.loadPage`.
   *
   * Any failure (download, file write, parsing, a synchronous throw from
   * `loadPage`) is logged with `console.error` and not rethrown, so the
   * returned promise still resolves.
   *
   * @returns {Promise<void>}
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    // `HH` is fecha's 24-hour token. The previous `hh` (12-hour, no AM/PM
    // marker) made morning and afternoon snapshots share the same file name.
    const now = fecha.format(new Date(), 'YYYY-MM-DD--HHmmss');
    const filename = `${this.siteid}-${now}.html`;

    try {
      const html = await this.getContent(this.url, this.useStone);
      fs.writeFileSync(filename, html);
      const $ = cheerio.load(html);
      // NOTE(review): the return value of loadPage is not awaited, matching the
      // previous behaviour; if loadPage is asynchronous its errors are not
      // caught here — confirm.
      this.loadPage($);
    } catch (err) {
      console.error(err);
    }
  }

  // Site specific parts below here.
  // Every method below is an empty async stub in this class; presumably each
  // is meant to be overridden by a site-specific subclass (TODO confirm).

  /** Empty stub; takes no arguments and resolves to undefined. */
  async breakPage() {
  }

  /**
   * Empty stub; resolves to undefined.
   *
   * @param {*} part - Unused in this class.
   */
  async extractDetails(part) {
  }

  /** Empty stub; takes no arguments and resolves to undefined. */
  async checkNext() {
  }

  /** Empty stub; takes no arguments and resolves to undefined. */
  async processSite() {
  }

  /** Empty stub; takes no arguments and resolves to undefined. */
  async getIndividualPage() {
  }

  /** Empty stub; takes no arguments and resolves to undefined. */
  async getJobPages() {
  }

  /** Empty stub; takes no arguments and resolves to undefined. */
  async go() {
  }
}
module.exports = MasterScraper;