// jobscraper/lib/scraper.js
/**
* Created by WebStorm.
* User: martin
* Date: 15/04/2020
* Time: 11:55
*/
const cheerio = require('cheerio');
const request = require('request');
// revision timestamp (repository web-view residue): 2020-06-01 08:56:48 +00:00
const got = require('got');
// revision timestamp (repository web-view residue): 2020-05-19 09:05:04 +00:00
const axios = require('axios');
const fecha = require('fecha');
const fs = require('fs');
const dbmanager = require('../lib/dbmanager');
const filterReject = require('../lib/filter_reject');
const filterAccept = require('../lib/filter_md_jobs');
/**
 * Common plumbing shared by the scrapers: fetching a page, parsing it with
 * cheerio, filtering the collected items and writing them to the database.
 *
 * The methods after the "Site specific parts" marker are empty here;
 * presumably each site-specific scraper overrides them (verify against the
 * subclasses).
 */
class MasterScraper {
  /**
   * Initialise an idle scraper: no start URL, no items, no parsed page.
   */
  constructor() {
    this.url = '';
    this.items = [];
    this.currentPage = null;
    this.hosturl = '';
    this.siteid = '';
    this.useStone = false;
    this.requestOptions = {
      'url' : '',
      'proxy' : 'http://uk.proxymesh.com:31280',
      'tunnel' : true
    };
  }

  /**
   * Set the URL that getPage() will fetch.
   *
   * @param {string} newUrl - URL to fetch next.
   */
  setStartUrl(newUrl) {
    this.url = newUrl;
  }

  /**
   * Store a parsed page as the current page.
   *
   * @param {*} page - Parsed document; getPage() passes the value returned by cheerio.load().
   */
  loadPage(page) {
    this.currentPage = page;
  }

  /**
   * Fetch a URL and resolve with the response body.
   *
   * @param {string} url - Address to fetch.
   * @param {boolean} [useStone=false] - When true, the request is sent to
   *   http://45.33.114.116:8080/ with `url` percent-encoded as the path
   *   instead of being sent to `url` directly.
   * @returns {Promise<string>} The response body.
   * @throws Rejects with the error's response body when the failure carries a
   *   response (unchanged from earlier behaviour), otherwise with the error
   *   object itself.
   */
  async getContent(url, useStone = false) {
    // Work on a copy so the shared defaults in this.requestOptions are never mutated.
    const options = Object.assign({}, this.requestOptions);
    options.url = useStone
      ? `http://45.33.114.116:8080/${encodeURIComponent(url)}`
      : url;

    console.log(options);

    try {
      // Only the URL is handed to got; the proxy/tunnel entries in `options`
      // are logged above but not applied to this request.
      const response = await got(options.url);
      return response.body;
    } catch (e) {
      // Errors raised before any response arrives (DNS failure, refused
      // connection, timeout) have no `response`. Reading `.body` from them
      // used to throw inside the handler and leave the caller waiting forever.
      if (e && e.response) {
        throw e.response.body;
      }
      throw e;
    }
  }

  /**
   * Fetch this.url, parse it with cheerio and store the result through
   * loadPage(). Failures are logged and swallowed, so this.currentPage keeps
   * its previous value when the fetch or the parse fails.
   *
   * @returns {Promise<void>}
   */
  async getPage() {
    console.log('>> getPage: fetching', this.url);

    try {
      const html = await this.getContent(this.url, this.useStone);
      this.loadPage(cheerio.load(html));
    } catch (err) {
      console.error(err);
    }
  }

  /**
   * Insert every entry of this.items through dbmanager.insertOne(), one at a
   * time and in order. A failed insert is logged and the loop carries on with
   * the next item.
   *
   * @returns {Promise<void>}
   */
  async addToDB() {
    console.log(`+ ${this.siteid} addToDB`);

    for (const item of this.items) {
      console.log(item);
      try {
        const data = await dbmanager.insertOne(item);
        console.log(data);
      } catch (err) {
        console.error(`${this.siteid} db error`);
        console.error(err.message || 'Some error occurred while querying the database.');
      }
    }

    console.log(`- ${this.siteid} addToDB`);
  }

  /**
   * Narrow this.items in two passes: first with the filterReject predicate,
   * then with the filterAccept predicate, logging the item count after each.
   *
   * @returns {Promise<void>}
   */
  async filterAdverts() {
    console.log('>> FilterAdverts');
    console.log(`Currently ${this.items.length} items...`);

    this.items = this.items.filter(filterReject);
    console.log(`After reject ${this.items.length} items...`);

    this.items = this.items.filter(filterAccept);
    console.log(`After accept ${this.items.length} items...`);
  }

  /**
   * Build an absolute https URL on this.siteurl.
   *
   * NOTE(review): the constructor initialises `hosturl` but never `siteurl`;
   * presumably whoever uses this class assigns `siteurl` first - confirm.
   *
   * @param {string} appended - Text placed directly after the host.
   * @returns {string} `https://<siteurl><appended>`.
   */
  makeUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  /**
   * Currently builds exactly the same URL as makeUrl().
   *
   * @param {string} appended - Text placed directly after the host.
   * @returns {string} `https://<siteurl><appended>`.
   */
  makeProxyUrl(appended) {
    return `https://${ this.siteurl }${appended}`;
  }

  // Site specific parts below here

  /** Empty in this class. */
  async breakPage() {
  }

  /**
   * Empty in this class.
   *
   * @param {*} part - Not used here.
   */
  async extractDetails(part) {
  }

  /** Empty in this class. */
  async checkNext() {
  }

  /** Empty in this class. */
  async processSite() {
  }

  /** Empty in this class. */
  async getIndividualPage() {
  }

  /** Empty in this class. */
  async getJobPages() {
  }

  /** Empty in this class. */
  async go() {
  }
}
// Export the class itself (not an instance) as this module's only export.
module.exports = MasterScraper;