2020-05-19 09:05:04 +00:00
|
|
|
/**
 * Created by WebStorm.
 * User: martin
 * Date: 15/04/2020
 * Time: 11:55
 */
|
2020-07-21 11:05:01 +00:00
|
|
|
const fs = require('fs');
|
|
|
|
|
|
|
|
const MasterBase = require('./base');
|
2020-05-19 09:05:04 +00:00
|
|
|
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
|
2020-06-01 08:56:48 +00:00
|
|
|
const got = require('got');
|
|
|
|
|
2020-05-19 09:05:04 +00:00
|
|
|
const fecha = require('fecha');
|
|
|
|
|
2020-07-21 11:05:01 +00:00
|
|
|
/**
 * Generic page scraper built on top of MasterBase.
 *
 * Provides the shared download plumbing (`getContent`, `getPage`) plus a set
 * of empty, site-specific hook methods.
 *
 * Instance state read by this class but not assigned here:
 * `this.requestOptions`, `this.url`, `this.siteid`, `this.useStone`, and the
 * `this.loadPage($)` method.
 * NOTE(review): presumably these come from MasterBase or a subclass — confirm.
 */
class MasterScraper extends MasterBase {
    constructor() {
        super();
    }

    /**
     * Download a URL and return the response body.
     *
     * @param {string} url - Address of the page to fetch.
     * @param {boolean} [useStone=false] - When true, the request is sent to the
     *   hard-coded relay host with the URI-encoded target URL as its path.
     * @returns {Promise<*>} Resolves with `response.body` as returned by `got`.
     *   Rejects with the HTTP response body when the failure carries a
     *   response, otherwise with the original error (e.g. DNS failure or
     *   timeout, where no response exists).
     */
    async getContent(url, useStone = false) {
        // NOTE(review): the copied requestOptions are only logged; `got` is
        // called with the URL alone, so they do not affect the request.
        const options = Object.assign({}, this.requestOptions);
        options.url = useStone
            ? `http://45.33.114.116:8080/${encodeURIComponent(url)}`
            : url;

        console.log(options);

        try {
            const response = await got(options.url);
            return response.body;
        } catch (e) {
            // Errors without an HTTP response have no `response` property.
            // Dereferencing it blindly threw inside the handler and left the
            // returned promise pending forever.
            // The body is still thrown as-is (not wrapped in an Error) so
            // existing callers see the same rejection value as before.
            throw e && e.response ? e.response.body : e;
        }
    }

    /**
     * Fetch `this.url`, write the raw HTML to `<siteid>-<timestamp>.html`
     * (relative path) and pass the cheerio-parsed document to `this.loadPage`.
     *
     * Failures while downloading, writing or parsing are logged with
     * `console.error` and not rethrown, so the returned promise still resolves.
     *
     * @returns {Promise<void>}
     */
    async getPage() {
        console.log('>> getPage: fetching', this.url);

        // `HH` is the 24-hour token. The previous `hh` (12-hour, no AM/PM)
        // produced the same filename for e.g. 01:05:00 and 13:05:00, so the
        // afternoon dump silently overwrote the morning one.
        const now = fecha.format(new Date(), 'YYYY-MM-DD--HHmmss');
        const filename = `${this.siteid}-${now}.html`;

        try {
            const html = await this.getContent(this.url, this.useStone);
            fs.writeFileSync(filename, html);

            const $ = cheerio.load(html);
            // Awaited so that an asynchronous loadPage is waited for and its
            // rejection is logged below, as with the former promise chain.
            await this.loadPage($);
        } catch (err) {
            console.error(err);
        }
    }

    // Site specific parts below here.
    // The methods below are empty in this class.
    // NOTE(review): presumably hooks for per-site subclasses to override — confirm.

    /** Site-specific hook; empty in this class. */
    async breakPage() {

    }

    /**
     * Site-specific hook; empty in this class.
     * @param {*} part - Unused here.
     */
    async extractDetails(part) {

    }

    /** Site-specific hook; empty in this class. */
    async checkNext() {

    }

    /** Site-specific hook; empty in this class. */
    async processSite() {

    }

    /** Site-specific hook; empty in this class. */
    async getIndividualPage() {

    }

    /** Site-specific hook; empty in this class. */
    async getJobPages() {

    }

    /** Site-specific hook; empty in this class. */
    async go() {

    }
}
|
|
|
|
|
|
|
|
module.exports = MasterScraper;
|