155 lines
3.6 KiB
JavaScript
155 lines
3.6 KiB
JavaScript
const Scraper = require('../lib/scraper');
|
|
const cheerio = require('cheerio');
|
|
const path = require('path');
|
|
const logger = require('log4js').getLogger('RC');
|
|
const LocalStorage = require('node-localstorage').LocalStorage;
|
|
const fs = require('fs');
|
|
|
|
const HtmlDiffer = require('html-differ').HtmlDiffer;
|
|
|
|
const diffLogger = require('html-differ/lib/logger');
|
|
|
|
|
|
const Diff = require('text-diff');
|
|
|
|
// Log verbosity comes from the LOGGER_LEVEL environment variable; falls back to 'debug' when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'debug';
|
|
|
|
/**
 * Scraper that loads the Harman Kardon UK outlet page and compares the page's
 * visible text with the snapshot saved by the previous run.
 *
 * The snapshot lives in `${this.path}/previous.html`. When the text differs,
 * the diff is logged at debug level and the snapshot is overwritten.
 */
class OutletScrape extends Scraper {
  /**
   * Registers the scraper under the 'OUTLET' id and exposes `run` as a
   * debounced wrapper around `__run`.
   */
  constructor() {
    super();

    this.setID('OUTLET');

    // 5000 is handed straight to the inherited _debounce helper
    // (presumably a delay in milliseconds — confirm in lib/scraper).
    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);
  }

  /**
   * Reads the visible text of the current page and compares it with the
   * stored snapshot.
   *
   * - No snapshot yet: the current text is written as the first snapshot.
   * - Snapshot exists: a text diff is computed; if the Levenshtein distance
   *   is non-zero the diff is logged and the snapshot is replaced.
   *
   * Expects `this.page` and `this.path` to have been set up by `start()`.
   *
   * @returns {Promise<void>}
   */
  async process() {
    const oldFile = `${this.path}/previous.html`;

    // Runs inside the browser page: only the rendered text is compared,
    // not the markup.
    const innerText = await this.page.evaluate(() => {
      return {
        'body': document.body.innerText
      };
    });

    if (!fs.existsSync(oldFile)) {
      // First run: nothing to compare against, so just store the baseline.
      // (Previously this wrote `body.body`, but `body` was never declared,
      // so the first run always failed with a ReferenceError.)
      fs.writeFileSync(oldFile, innerText.body, 'utf-8');

      return;
    }

    const previousFile = fs.readFileSync(oldFile, 'utf-8');

    const diff = new Diff();
    const textDiff = diff.main(previousFile, innerText.body);
    const levenshtein = diff.levenshtein(textDiff);

    logger.debug('levenshtein:', levenshtein);

    if (levenshtein !== 0) {
      logger.debug(diff.prettyHtml(textDiff));

      // Keep the snapshot current so the next run diffs against this text.
      fs.writeFileSync(oldFile, innerText.body, 'utf-8');
    }
  }

  /**
   * Prepares the scraper for a run: sets the start page and artefact path,
   * initialises the browser and page, sets the viewport, navigates to the
   * start page and then waits via the inherited `_randomWait` helper.
   *
   * @returns {Promise<void>}
   * @throws {Error} Any failure after `super._start()` is rethrown as an
   *   Error whose message is the stringified original; the original error
   *   is attached as `cause` so its stack is not lost.
   */
  async start() {
    await super._start();

    try {
      this.startPage = 'https://www.harmankardon.co.uk/outlet/';

      // The return value used to be the click delay for a page.click step
      // that is no longer present. The call itself is kept because any side
      // effects of notARobot() are not visible from this file.
      OutletScrape.notARobot();

      this.setPath(path.resolve(`${__dirname }/../artefacts/outlet`));

      // NOTE(review): the meaning of the `true` flag is defined by
      // _initBrowser in lib/scraper — confirm there.
      await this._initBrowser(true);
      await this._createBrowserPage();

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);

      // NOTE(review): 3 and 5 look like wait bounds — confirm units against
      // _randomWait in lib/scraper.
      await this._randomWait(this.page, 3, 5);

      logger.debug('loaded..');
    }
    catch (e) {
      throw new Error(e, { 'cause': e });
    }
  }

  /**
   * Performs one full scrape: `start()` followed by `process()`.
   * Invoked through the debounced `run` wrapper created in the constructor.
   *
   * @returns {Promise<void>}
   * @throws {Error} Any failure from `start()` or `process()` is rethrown as
   *   an Error whose message is the stringified original; the original error
   *   is attached as `cause`.
   */
  async __run() {
    try {
      logger.debug('run');

      await this.start();
      await this.process();

      logger.debug('Done...');
    }
    catch (e) {
      throw new Error(e, { 'cause': e });
    }
  }
}
|
|
|
|
// The class itself is the module's export (CommonJS); consumers instantiate it with `new`.
module.exports = OutletScrape;
|