166 lines
3.4 KiB
JavaScript
166 lines
3.4 KiB
JavaScript
/**
 * Created by WebStorm.
 * User: martin
 * Date: 15/04/2020
 * Time: 11:55
 */
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const request = require('request');
|
|
const axios = require('axios');
|
|
const fecha = require('fecha');
|
|
|
|
const fs = require('fs');
|
|
|
|
const dbmanager = require('../lib/dbmanager');
|
|
|
|
const filterReject = require('../lib/filter_reject');
|
|
const filterAccept = require('../lib/filter_md_jobs');
|
|
|
|
/**
 * Base class for site scrapers.
 *
 * Provides the shared plumbing (fetching a page through the configured
 * proxy, parsing it with cheerio, filtering collected items and storing them
 * through dbmanager). The site-specific steps at the bottom of the class are
 * empty here and are meant to be overridden.
 */
class MasterScraper {
  /**
   * Initialises an empty scraper; callers/subclasses fill in the fields.
   */
  constructor() {
    // URL fetched by getPage().
    this.url = '';
    // Items collected so far; consumed by filterAdverts() and addToDB().
    this.items = [];
    // Parsed page (cheerio root) set by loadPage(); null until a page loads.
    this.currentPage = null;
    // Host name used by makeUrl()/makeProxyUrl() when `siteurl` is not set.
    this.hosturl = '';
    this.siteid = '';
    // When true, getPage() asks getContent() to go through the relay host.
    this.useStone = false;
    // Base options handed to `request`; `url` is filled in per call.
    this.requestOptions = {
      'url': '',
      'proxy': 'http://uk.proxymesh.com:31280',
      'tunnel': true,
    };
  }

  /**
   * Sets the URL that the next getPage() call will fetch.
   *
   * @param {string} newUrl
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores an already parsed page as the current page.
   *
   * @param {*} page - parsed document (getPage() passes a cheerio root).
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Fetches `url` with the `request` library using this.requestOptions.
   *
   * @param {string} url - address to fetch.
   * @param {boolean} [useStone=false] - when true the request is sent to
   *   http://45.33.114.116:8080/ with the URI-encoded target appended.
   * @returns {Promise<*>} resolves with the response body; rejects with the
   *   error reported by `request`. The HTTP status code is not inspected.
   */
  getContent(url, useStone = false) {
    const target = useStone
      ? `http://45.33.114.116:8080/${encodeURIComponent(url)}`
      : url;
    // Copy so the shared requestOptions object is never mutated.
    const options = Object.assign({}, this.requestOptions, { url: target });

    console.log(options);

    // `request` is callback based, so it is adapted to a promise here.
    return new Promise((resolve, reject) => {
      request(options, (err, _res, body) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(body);
      });
    });
  }

  /**
   * Downloads this.url, parses it with cheerio and stores the result via
   * loadPage().
   *
   * Failures (network or parsing) are logged and swallowed: the returned
   * promise still resolves and this.currentPage keeps its previous value.
   *
   * @returns {Promise<void>}
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    try {
      const html = await this.getContent(this.url, this.useStone);
      this.loadPage(cheerio.load(html));
    } catch (err) {
      console.error(err);
    }
  }

  /**
   * Inserts every entry of this.items through dbmanager.insertOne().
   *
   * All inserts are started immediately (in item order) and the returned
   * promise resolves only once every insert has settled. A failed insert is
   * logged and does not stop the others or reject the returned promise.
   *
   * @returns {Promise<void>}
   */
  async addToDB() {
    const pending = this.items.map((item) => {
      console.log(item);

      return dbmanager.insertOne(item)
        .then((data) => {
          console.log(data);
        })
        .catch((err) => {
          console.error(err.message || 'Some error occurred while querying the database.');
        });
    });

    // Without this await callers could not tell when the inserts are done.
    await Promise.all(pending);
  }

  /**
   * Narrows this.items in two passes: first keeps the items for which
   * filterReject returns truthy, then those for which filterAccept does.
   * The item count is logged before and after each pass.
   *
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);

    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);

    console.log(`After accept ${this.items.length} items...`);
  }

  /**
   * Builds an absolute https URL on the scraper's host.
   *
   * @param {string} appended - path (and query) to append after the host.
   * @returns {string}
   */
  makeUrl(appended) {
    // `siteurl` is not initialised by the constructor; fall back to
    // `hosturl` so the result is never "https://undefined...".
    const host = this.siteurl || this.hosturl;
    return `https://${ host }${appended}`;
  }

  /**
   * Builds an absolute https URL on the scraper's host.
   *
   * NOTE(review): this currently produces exactly the same URL as makeUrl();
   * confirm whether a proxy-specific form was intended.
   *
   * @param {string} appended - path (and query) to append after the host.
   * @returns {string}
   */
  makeProxyUrl(appended) {
    const host = this.siteurl || this.hosturl;
    return `https://${ host }${appended}`;
  }

  // Site specific parts below here. They are intentionally empty in the base
  // class.

  /** Site-specific hook; no-op here. */
  async breakPage() {
  }

  /**
   * Site-specific hook; no-op here.
   *
   * @param {*} part - unused by the base implementation.
   */
  async extractDetails(part) {
  }

  /** Site-specific hook; no-op here. */
  async checkNext() {
  }

  /** Site-specific hook; no-op here. */
  async processSite() {
  }

  /** Site-specific hook; no-op here. */
  async getIndividualPage() {
  }

  /** Site-specific hook; no-op here. */
  async getJobPages() {
  }

  /** Site-specific hook; no-op here. */
  async go() {
  }
}
|
|
|
|
module.exports = MasterScraper;
|