changedetection/scrapers/scraper.js
Martin Donnelly cd3248340d Docker file working
Build working
docker-compose.yml working
2020-07-09 16:34:01 +01:00

337 lines
8.7 KiB
JavaScript

const Scraper = require('../lib/scraper');
const path = require('path');
const logger = require('log4js').getLogger('RC');
const fs = require('fs');
const dateFormat = require('dateformat');
const _ = require('lodash');
const jsonfile = require('jsonfile');
const Diff = require('text-diff');
const time = require('time-since');
const pug = require('pug');
const PNG = require('pngjs').PNG;
const pixelmatch = require('pixelmatch');
const compareImages = require('resemblejs/compareImages');
// Settings handed to smtp-email-sender. Host and credentials are read from the
// HOST, USER and PASS environment variables; the port is fixed at 465.
// NOTE(review): USER is normally already set to the login name on Unix-like
// systems — confirm it is overridden with the SMTP account name on deployment.
const smtpSettings = {
  'host': process.env.HOST,
  'port': '465',
  'auth': {
    'user': process.env.USER,
    'pass': process.env.PASS,
    'type': 'LOGIN' // PLAIN, LOGIN, MD5 etc...
  },
  'secure': 'secure'
};

// `email(message)` sends a single message using the settings above.
const email = require('smtp-email-sender')(smtpSettings);

// Log verbosity comes from LOGGER_LEVEL and falls back to 'debug' when unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'debug';
/**
 * Scraper that watches a list of pages for textual changes.
 *
 * For every entry in `settings.json` the page's `document.body.innerText` is
 * compared with the copy saved by the previous run. When the text differs a
 * new screenshot is taken, an image diff against the previous screenshot is
 * generated and an e-mail describing the change is sent.
 */
class ChangeDetection extends Scraper {
  constructor() {
    super();
    this.setID('CD');

    // Public entry point. It is an arrow function, so `this` is captured and
    // `run` can be handed to a scheduler as a plain callback.
    this.run = async () => {
      await this.__run();
    };
  }

  /**
   * Render the e-mail template and write the resulting HTML to the debug log.
   *
   * @param {Object} data - Locals passed to the pug template.
   * @param {string} newpath - Directory that contains `pug/email.pug`.
   */
  pugTest(data, newpath) {
    logger.debug(pug.renderFile(`${newpath}/pug/email.pug`, data));
  }

  /**
   * Send the change notification e-mail for one item.
   *
   * @param {Object} data - Template locals; `name` is used in the subject and
   *   `diffPNG` (when set) is attached to the message.
   * @param {string} newPath - Directory that contains `pug/email.pug`.
   * @returns {Promise<void>}
   */
  async sendSMTP(data, newPath) {
    const message = {
      'from': 'Aida <aida@caliban.io>',
      'to': 'Martin <martind2000@gmail.com>',
      'subject': `ChangeDetection: ${data.name}`,
      'html': pug.renderFile(`${newPath}/pug/email.pug`, data)
    };

    // Only attach the diff image when one was actually produced; an
    // attachment without a path is of no use to the mailer.
    if (data.diffPNG) {
      message.attachments = [{ 'path': data.diffPNG }];
    }

    // NOTE(review): the result of email() is not awaited, as in the original
    // code — confirm how smtp-email-sender reports delivery failures.
    email(message);
  }

  /**
   * Compare two PNG files with resemblejs and store the diff image next to
   * `today`, using the same file name with `_diff` appended.
   *
   * @param {string} previous - Path of the older PNG.
   * @param {string} today - Path of the newer PNG.
   * @returns {Promise<string>} Path of the diff image that was written.
   */
  async generateDiffScreenshotV2(previous, today) {
    const options = {
      'output': {
        'errorColor': {
          'red': 255,
          'green': 0,
          'blue': 255
        },
        'errorType': 'movement',
        'transparency': 0.3,
        'largeImageThreshold': 1200,
        'useCrossOrigin': false,
        'outputDiff': true
      },
      'scaleToSameSize': false,
      'ignore': 'colors'
    };

    const data = await compareImages(fs.readFileSync(previous), fs.readFileSync(today), options);

    const endFilename = diffPathFor(today);
    logger.debug('diffFilename', endFilename);
    fs.writeFileSync(endFilename, data.getBuffer());

    return endFilename;
  }

  /**
   * Compare two PNG files with pixelmatch and store the diff image next to
   * `today`, using the same file name with `_diff` appended.
   *
   * The diff canvas takes its dimensions from `previous`.
   * NOTE(review): pixelmatch is expected to reject images of different sizes —
   * confirm before relying on this variant.
   *
   * @param {string} previous - Path of the older PNG.
   * @param {string} today - Path of the newer PNG.
   * @returns {Promise<string>} Path of the diff image that was written.
   */
  async generateDiffScreenshot(previous, today) {
    const img1 = PNG.sync.read(fs.readFileSync(previous));
    const img2 = PNG.sync.read(fs.readFileSync(today));
    const { width, height } = img1;
    const diff = new PNG({ width, height });

    pixelmatch(img1.data, img2.data, diff.data, width, height, { 'threshold': 0.1 });

    const endFilename = diffPathFor(today);
    logger.debug('diffFilename', endFilename);
    fs.writeFileSync(endFilename, PNG.sync.write(diff));

    return endFilename;
  }

  /**
   * Visit one watched page and record / report any change in its text.
   *
   * On the first visit the text and a screenshot are stored as the baseline.
   * On later visits the text is diffed against the stored copy; when it
   * differs the stats record is updated, the baseline is replaced and a
   * notification e-mail is sent.
   *
   * @param {{name: string, url: string}} item - Entry from `settings.json`.
   * @returns {Promise<void>}
   */
  async processItem(item) {
    logger.debug(`Processing ${item.name}...`);

    const now = new Date();
    const filename = _.kebabCase(item.name);
    const oldFile = `${this.path}/${filename}.html`;

    // Work on a copy so that a failure part-way through never leaves a
    // half-updated record inside this.stats.
    const saved = this.stats.get(filename);
    const stats = saved ? { ...saved } : { 'lastSaved': now, 'lastChanged': null };

    await this._goto(item.url);
    await this._randomWait(this.page, 3, 5);

    const bodyText = await this.page.evaluate(() => document.body.innerText);

    const timestamp = dateFormat(now, 'yyyymmddHHMM');
    const screenshotPath = `${this.path}/screenshots/${filename}-${timestamp}`;

    if (!fs.existsSync(oldFile)) {
      // First visit: take the screenshot before writing the baseline so a
      // failed screenshot does not leave a baseline without an image.
      await this._makeScreenshotV2(this.page, screenshotPath, null);
      fs.writeFileSync(oldFile, bodyText, 'utf-8');
      stats.screenshot = screenshotPath;
      this.stats.set(filename, stats);

      return;
    }

    const previousText = fs.readFileSync(oldFile, 'utf-8');
    const diff = new Diff();
    const textDiff = diff.main(previousText, bodyText);

    // cleanupSemantic() tidies the diff array in place (see the text-diff
    // usage notes), so the cleaned result is textDiff itself.
    diff.cleanupSemantic(textDiff);

    const levenshtein = diff.levenshtein(textDiff);
    logger.debug('levenshtein:', levenshtein);
    logger.debug('cleanedDiff:', textDiff);

    if (levenshtein === 0) {
      logger.debug('No change...');

      return;
    }

    logger.info('Changed...');

    const previousScreenshot = stats.screenshot;
    const previousChange = stats.lastSaved;

    await this._makeScreenshotV2(this.page, screenshotPath, null);

    // A previous screenshot can be missing (stats were never saved, or the
    // file was removed); in that case report the change without an image diff
    // instead of failing the whole run.
    const previousPng = previousScreenshot ? `${previousScreenshot}.png` : null;
    let diffPNG;
    if (previousPng && fs.existsSync(previousPng)) {
      diffPNG = await this.generateDiffScreenshotV2(previousPng, `${screenshotPath}.png`);
    }
    else {
      logger.warn(`No previous screenshot for ${item.name}; skipping image diff`);
    }

    // Highlight inserted text in the HTML that ends up in the e-mail.
    const changed = diff.prettyHtml(textDiff)
      .replace(/<ins>/g, `<ins style="background-color: #ffff99;display:inline;">`);

    Object.assign(stats, {
      previousScreenshot,
      previousChange,
      'lastSaved': now,
      'lastChanged': now,
      'screenshot': screenshotPath,
      changed,
      levenshtein,
      'since': time.since(new Date(previousChange)).days(),
      diffPNG
    });

    await this._randomWait(this.page, 3, 5);

    fs.writeFileSync(oldFile, bodyText, 'utf-8');
    this.stats.set(filename, stats);

    // Item fields win over stats fields of the same name. './' makes the
    // template path relative to the process working directory.
    await this.sendSMTP({ ...stats, ...item }, './');
  }

  /**
   * Process every configured item, one after the other, on the shared page.
   *
   * @returns {Promise<void>}
   */
  async processItems() {
    for (const item of this.settings) {
      await this.processItem(item);
    }
  }

  /**
   * Earlier single-page variant: diff the text of the page that is currently
   * loaded against `previous.html` and log the differences.
   *
   * @returns {Promise<void>}
   */
  async processOld() {
    const oldFile = `${this.path}/previous.html`;
    const bodyText = await this.page.evaluate(() => document.body.innerText);

    if (!fs.existsSync(oldFile)) {
      // First run: store the captured text as the baseline.
      fs.writeFileSync(oldFile, bodyText, 'utf-8');

      return;
    }

    const previousText = fs.readFileSync(oldFile, 'utf-8');
    const diff = new Diff();
    const textDiff = diff.main(previousText, bodyText);
    const levenshtein = diff.levenshtein(textDiff);

    logger.debug('levenshtein:', levenshtein);

    if (levenshtein !== 0) {
      logger.debug(diff.prettyHtml(textDiff));
      fs.writeFileSync(oldFile, bodyText, 'utf-8');
    }
  }

  /**
   * Prepare the artefact directories and open a browser page.
   *
   * Errors are allowed to propagate unchanged so callers see the original
   * error type and stack trace.
   *
   * @returns {Promise<void>}
   */
  async start() {
    await super._start();

    // Recorded on the instance but not navigated to here; every item's own
    // URL is loaded in processItem().
    this.startPage = 'https://www.harmankardon.co.uk/outlet/';

    await this.setPath(path.resolve(__dirname, '..', 'artefacts'));
    await this._createDirectory(`${this.path}/screenshots`);

    // NOTE(review): the meaning of the `true` flag is defined by the Scraper
    // base class — presumably headless mode; confirm.
    await this._initBrowser(true);
    await this._createBrowserPage();
    await this.page.setViewport({ 'width': 1200, 'height': 800 });
    await this._randomWait(this.page, 3, 5);

    logger.debug('Started..');
  }

  /**
   * Load the watch list and the stats recorded by earlier runs.
   *
   * `settings.json` is read relative to the process working directory;
   * `stats.json` is read from the artefact directory when it exists.
   *
   * @returns {Promise<void>}
   */
  async loadSettings() {
    logger.debug('Load settings...');

    const statsFile = `${this.path}/stats.json`;

    this.settings = jsonfile.readFileSync('settings.json');

    const saved = fs.existsSync(statsFile) ? jsonfile.readFileSync(statsFile) : [];

    // stats.json holds the Map as an array of [key, value] pairs. Anything
    // else would make `new Map()` throw, so start from an empty Map instead.
    if (!Array.isArray(saved)) {
      logger.warn(`Ignoring ${statsFile}: expected an array of entries`);
    }

    this.stats = new Map(Array.isArray(saved) ? saved : []);
  }

  /**
   * Write the stats Map to `stats.json` as an array of [key, value] pairs.
   *
   * @returns {Promise<void>}
   */
  async saveSettings() {
    logger.debug('Save settings...');

    const statsFile = `${this.path}/stats.json`;

    jsonfile.writeFileSync(statsFile, [...this.stats]);
  }

  /**
   * Run one complete pass: start the browser, load settings, process every
   * item, save the stats and finish.
   *
   * Stats are saved and `_done()` is called even when an item fails, so the
   * records of items that were already processed are not lost; the original
   * error is still rethrown to the caller.
   * NOTE(review): `_done()` comes from the Scraper base class — presumably it
   * closes the browser; confirm it is safe to call after a failed run.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.debug('run');

    await this.start();

    try {
      await this.loadSettings();

      logger.debug('Running...');

      try {
        await this.processItems();
      }
      finally {
        await this.saveSettings();
      }
    }
    finally {
      await this._done();
    }
  }
}

/**
 * Build the path of the diff image that belongs to a screenshot: same
 * directory and extension, with `_diff` appended to the file name
 * (for example `shots/page.png` becomes `shots/page_diff.png`).
 *
 * @param {string} screenshot - Path of the screenshot file.
 * @returns {string} Path for the matching diff image.
 */
function diffPathFor(screenshot) {
  const { dir, root, ext, name } = path.parse(screenshot);

  return path.format({ dir, root, ext, 'name': `${name}_diff` });
}
// Export the class itself; consumers create an instance and call its run() method.
module.exports = ChangeDetection;