jobscraper/lib/scraper.js
2020-06-01 09:33:03 +01:00

169 lines
3.4 KiB
JavaScript

/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const request = require('request');
const axios = require('axios');
const fecha = require('fecha');
const fs = require('fs');
const dbmanager = require('../lib/dbmanager');
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
/**
 * Base scraper holding the state and helpers shared by the job-site scrapers.
 *
 * It downloads a page with the `request` library, parses it with cheerio,
 * filters the collected adverts and writes them to the database. The methods
 * in the "site specific" section at the bottom are empty hooks — presumably
 * overridden by one subclass per site (the subclasses are not part of this
 * file; confirm against them).
 */
class MasterScraper {
  /**
   * Initialises the scraper with empty state and the default proxy settings.
   */
  constructor() {
    this.url = '';
    this.items = [];
    this.currentPage = null;
    this.hosturl = '';
    this.siteid = '';
    this.useStone = false;
    // Template for every outgoing call; getContent() copies it and fills in `url`.
    this.requestOptions = {
      url: '',
      proxy: 'http://uk.proxymesh.com:31280',
      tunnel: true
    };
  }

  /**
   * Sets the URL that the next getPage() call will fetch.
   *
   * @param {string} newUrl - Address of the page to fetch.
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Stores a parsed page as the current page.
   *
   * @param {*} page - Stored as-is; getPage() passes the value returned by
   *   `cheerio.load`.
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Downloads a URL and resolves with the response body.
   *
   * Only a transport-level error reported by `request` rejects the promise;
   * the HTTP status code of the response is not inspected.
   *
   * @param {string} url - Address to download.
   * @param {boolean} [useStone=false] - When true the call is sent to
   *   `http://45.33.114.116:8080/` with the URI-encoded target appended as the
   *   path, instead of to the target itself.
   * @returns {Promise<*>} The body handed back by `request`.
   */
  getContent(url, useStone = false) {
    return new Promise((resolve, reject) => {
      // Work on a copy so the shared template is never modified.
      const options = Object.assign({}, this.requestOptions);
      options.url = useStone
        ? `http://45.33.114.116:8080/${encodeURIComponent(url)}`
        : url;

      console.log(options);

      request(options, (err, _res, body) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(body);
      });
    });
  }

  /**
   * Fetches `this.url` and stores the parsed document as the current page.
   *
   * Failures are logged and swallowed (best effort): the returned promise
   * still resolves and `currentPage` keeps whatever value it had before.
   *
   * @returns {Promise<void>}
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    try {
      const html = await this.getContent(this.url, this.useStone);
      this.loadPage(cheerio.load(html));
    } catch (err) {
      console.error(err);
    }
  }

  /**
   * Inserts every collected item into the database.
   *
   * All inserts are started together and the method resolves only once each
   * of them has settled, so the closing log line really marks the end of the
   * work. A failed insert is logged and does not stop the others.
   *
   * @returns {Promise<void>}
   */
  async addToDB() {
    console.log(`+ ${this.siteid} addToDB`);

    const insertItem = async (item) => {
      console.log(item);
      try {
        const data = await dbmanager.insertOne(item);
        console.log(data);
      } catch (err) {
        console.error(err.message || 'Some error occurred while querying the database.');
      }
    };

    // Wait for the inserts; previously they were left running in the background.
    await Promise.all(this.items.map(insertItem));

    console.log(`- ${this.siteid} addToDB`);
  }

  /**
   * Narrows `this.items` by applying the reject filter, then the accept
   * filter, logging the item count after each step.
   *
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
  }

  /**
   * Builds an absolute https URL on the site from a path.
   *
   * NOTE(review): this reads `this.siteurl`, which the constructor does not
   * initialise (it sets `hosturl`). Unless something else assigns `siteurl`
   * the result contains the text "undefined" — confirm which property is
   * intended.
   *
   * @param {string} appended - Text appended directly after the host.
   * @returns {string} `https://<siteurl><appended>`.
   */
  makeUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Currently builds exactly the same URL as makeUrl(); no proxy-specific
   * rewriting is applied. The note on `this.siteurl` there applies here too.
   *
   * @param {string} appended - Text appended directly after the host.
   * @returns {string} `https://<siteurl><appended>`.
   */
  makeProxyUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  // Site specific parts below here.
  // Every method in this section is an intentionally empty async hook that
  // resolves with undefined.

  /** Hook, empty in the base class. */
  async breakPage() {
  }

  /**
   * Hook, empty in the base class.
   *
   * @param {*} part - Unused here.
   */
  async extractDetails(part) {
  }

  /** Hook, empty in the base class. */
  async checkNext() {
  }

  /** Hook, empty in the base class. */
  async processSite() {
  }

  /** Hook, empty in the base class. */
  async getIndividualPage() {
  }

  /** Hook, empty in the base class. */
  async getJobPages() {
  }

  /** Hook, empty in the base class. */
  async go() {
  }
}
// CommonJS export: the MasterScraper class is this module's only export.
module.exports = MasterScraper;