f2880b661e
* Moved to mongo * UI updated to use mongo * UI is a bit fancier now * Import sql to mongo
152 lines
2.6 KiB
JavaScript
152 lines
2.6 KiB
JavaScript
/**
 * Created by WebStorm.
 * User: martin
 * Date: 15/04/2020
 * Time: 11:55
 */
|
|
const fs = require('fs');
|
|
|
|
const MasterBase = require('./base');
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const got = require('got');
|
|
|
|
const fecha = require('fecha');
|
|
|
|
/**
 * Common scraper scaffolding built on top of MasterBase.
 *
 * Provides the generic "download a page, optionally keep a copy on disk,
 * parse it with cheerio" flow. The remaining methods are empty hooks for the
 * site-specific parts.
 *
 * The class reads `this.requestOptions`, `this.siteid`, `this.url`,
 * `this.useStone` and `this.saveFile`, and calls `this.loadPage($)`; none of
 * these are defined here, so they are presumably supplied by MasterBase or by
 * a subclass (verify against ./base).
 */
class MasterScraper extends MasterBase {
    constructor() {
        super();
    }

    /**
     * Download `url` and return the response body.
     *
     * @param {string} url - Address of the page to fetch.
     * @param {boolean} [useStone=false] - When true, the request is sent to
     *   `http://45.33.114.116:8080/<url-encoded target>` instead of directly
     *   to `url` (presumably a relay/proxy service - confirm with the owner).
     * @returns {Promise<*>} Fulfils with `response.body` as returned by got.
     *   Rejects with the error response body when the server answered with an
     *   error, or with the original error object when there was no response
     *   at all (DNS failure, refused connection, timeout, ...).
     */
    async getContent(url, useStone = false) {
        // NOTE(review): this copy of requestOptions is only logged below; got is
        // called with the URL alone, so none of these options reach the request.
        const options = Object.assign({}, this.requestOptions);
        options.url = useStone
            ? `http://45.33.114.116:8080/${encodeURIComponent(url)}`
            : url;

        console.log(options);

        try {
            const response = await got(options.url);
            return response.body;
        } catch (e) {
            console.error('getContent', e );
            // Callers have always received the raw error body on HTTP failures,
            // so keep that contract. Errors without a response used to blow up
            // here on `e.response.body` and leave the promise pending forever;
            // surface the error itself in that case.
            throw e && e.response ? e.response.body : e;
        }
    }

    /**
     * Write a fetched page to `pages/<siteid>-<timestamp>.html`.
     *
     * The write is synchronous; the method is `async` only so that callers
     * can treat it like the other promise-returning methods.
     *
     * @param {string} html - Page contents to store.
     * @returns {Promise<void>}
     */
    async savePage(html) {
        // NOTE(review): `hh` is fecha's 12-hour token, so a morning and an
        // afternoon run in the same clock hour share one file name. Switch to
        // `HH` if that overwrite is not intended.
        const now = fecha.format(new Date(), 'YYYY-MM-DD--hh');
        const filename = `pages/${this.siteid}-${now}.html`;

        fs.writeFileSync(filename, html);
    }

    /**
     * Fetch `this.url`, optionally save the raw HTML, parse it with cheerio
     * and pass the parsed document to `this.loadPage`.
     *
     * Failures are logged with `console.error` and not rethrown.
     *
     * @returns {Promise<void>}
     */
    async getPage() {
        console.log('>> getPage: fetching', this.url);

        try {
            const html = await this.getContent(this.url, this.useStone);
            console.log('>> getPage:: OK');

            if (this.saveFile) {
                // Keeping a copy is best effort: a failed write is reported but
                // must not stop the page from being parsed.
                try {
                    await this.savePage(html);
                } catch (saveErr) {
                    console.error(saveErr);
                }
            }

            const $ = cheerio.load(html);
            // NOTE(review): loadPage is not visible in this file; if it returns
            // a promise it is not awaited here (same as before).
            this.loadPage($);
        } catch (err) {
            console.error(err);
        }
    }

    // Site specific parts below here.
    // Every method from here on is an empty placeholder in this class.

    /**
     * Break each page into items.
     * Empty here; intended as a site-specific hook.
     * @returns {Promise<void>}
     */
    async breakPage() {
    }

    /**
     * Extract the details from one part of a page.
     * Empty here; intended as a site-specific hook.
     * @param part - The piece of the page to inspect (shape is site specific).
     * @returns {Promise<void>}
     */
    async extractDetails(part) {
    }

    /**
     * Site-specific hook, empty here. Judging by the name it checks for a
     * following page - confirm against the concrete scrapers.
     * @returns {Promise<void>}
     */
    async checkNext() {
    }

    /**
     * Site-specific hook, empty here.
     * @returns {Promise<void>}
     */
    async processSite() {
    }

    /**
     * Site-specific hook, empty here.
     * @returns {Promise<void>}
     */
    async getIndividualPage() {
    }

    /**
     * Site-specific hook, empty here.
     * @returns {Promise<void>}
     */
    async getJobPages() {
    }

    /**
     * Site-specific hook, empty here. Presumably the entry point of a
     * concrete scraper - confirm against the callers.
     * @returns {Promise<void>}
     */
    async go() {
    }
}
|
|
|
|
// The MasterScraper class is this module's only CommonJS export.
module.exports = MasterScraper;
|