jobscraper/lib/scraper.js

152 lines
2.6 KiB
JavaScript
Raw Normal View History

2020-05-19 09:05:04 +00:00
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
2020-07-21 11:05:01 +00:00
const fs = require('fs');
const MasterBase = require('./base');
2020-05-19 09:05:04 +00:00
const cheerio = require('cheerio');
2020-06-01 08:56:48 +00:00
const got = require('got');
2020-05-19 09:05:04 +00:00
const fecha = require('fecha');
2020-07-21 11:05:01 +00:00
/**
 * Base class for site scrapers.
 *
 * Provides the shared plumbing: fetching a page (`getContent`), optionally
 * archiving the raw HTML to disk (`savePage`), and parsing it with cheerio
 * before handing it to `loadPage` (`getPage`). The remaining methods are
 * empty, site-specific hooks.
 *
 * Reads `this.requestOptions`, `this.siteid`, `this.url`, `this.useStone`,
 * `this.saveFile` and calls `this.loadPage`; none of these are defined in
 * this class, so they are presumably supplied by MasterBase or by a
 * subclass — confirm against ./base.
 */
class MasterScraper extends MasterBase {
  constructor() {
    super();
  }

  /**
   * Fetch the body of `url` with `got`.
   *
   * @param {string} url - Address of the page to download.
   * @param {boolean} [useStone=false] - When true, the request is sent to
   *   the fixed relay at http://45.33.114.116:8080/ with the target URL
   *   percent-encoded as the path, instead of directly to `url`.
   * @returns {Promise<string>} Resolves with the response body. Rejects with
   *   the error response's body when the server answered, or with the
   *   original error object when there was no response at all.
   */
  async getContent(url, useStone = false) {
    // NOTE(review): only `options.url` is handed to `got`; anything copied
    // from `this.requestOptions` is logged but otherwise unused. Confirm
    // whether the options were meant to be passed through.
    const options = Object.assign({}, this.requestOptions);
    options.url = useStone
      ? `http://45.33.114.116:8080/${encodeURIComponent(url)}`
      : url;
    console.log(options);

    try {
      const response = await got(options.url);
      return response.body;
    } catch (e) {
      console.error('getContent', e);
      // Connection-level failures (DNS, refused, timeout) carry no
      // `response`; dereferencing it would throw inside the handler and
      // leave the caller without a settled promise.
      if (e && e.response) {
        throw e.response.body;
      }
      throw e;
    }
  }

  /**
   * Write a fetched page to `pages/<siteid>-<YYYY-MM-DD--HH>.html`.
   *
   * @param {string} html - Raw page content to store.
   * @returns {Promise<void>}
   */
  async savePage(html) {
    // 24-hour `HH` keeps morning and afternoon snapshots of the same day
    // from landing on the same filename (12-hour `hh` made them collide).
    const now = fecha.format(new Date(), 'YYYY-MM-DD--HH');
    const filename = `pages/${this.siteid}-${now}.html`;
    // Make sure the target directory exists; a no-op when it already does.
    fs.mkdirSync('pages', { recursive: true });
    fs.writeFileSync(filename, html);
  }

  /**
   * Download `this.url`, optionally archive it, parse it with cheerio and
   * pass the resulting document to `this.loadPage`.
   *
   * Failures at any step are logged with `console.error` and not rethrown,
   * so the returned promise always resolves.
   *
   * @returns {Promise<void>}
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);
    try {
      const html = await this.getContent(this.url, this.useStone);
      console.log('>> getPage:: OK');
      // Awaited so a failed write is reported by the catch below rather
      // than surfacing as an unhandled rejection.
      if (this.saveFile) await this.savePage(html);

      const $ = cheerio.load(html);
      // `await` is harmless if loadPage is synchronous and ensures its
      // errors are caught here if it returns a promise.
      await this.loadPage($);
    } catch (err) {
      console.error(err);
    }
  }

  // Site specific parts below here

  /**
   * Break each page into items.
   * Empty in this base class.
   * @returns {Promise<void>}
   */
  async breakPage() {
  }

  /**
   * Site-specific hook; empty in this base class.
   * @param part - presumably one item produced by `breakPage`; confirm
   *   against the subclasses.
   * @returns {Promise<void>}
   */
  async extractDetails(part) {
  }

  /**
   * Site-specific hook; empty in this base class.
   * @returns {Promise<void>}
   */
  async checkNext() {
  }

  /**
   * Site-specific hook; empty in this base class.
   * @returns {Promise<void>}
   */
  async processSite() {
  }

  /**
   * Site-specific hook; empty in this base class.
   * @returns {Promise<void>}
   */
  async getIndividualPage() {
  }

  /**
   * Site-specific hook; empty in this base class.
   * @returns {Promise<void>}
   */
  async getJobPages() {
  }

  /**
   * Site-specific hook; empty in this base class.
   * @returns {Promise<void>}
   */
  async go() {
  }
}
// Expose the MasterScraper class as this module's export (CommonJS).
module.exports = MasterScraper;