/*
 * changedetection/lib/scraper.js
 * Martin Donnelly 9858a90912 init
 * 2019-10-21 23:38:27 +01:00
 *
 * 792 lines
 * 17 KiB
 * JavaScript
 */

// File-system helpers (fs-extra is a superset of the core `fs` module).
const fs = require('fs-extra');
// NOTE(review): `path` does not appear to be referenced by the Scraper class in this file.
const path = require('path');
// Used by Scraper#explodeURL (url.parse).
const url = require('url');
const log4js = require('log4js');
// Module-level logger. Declared with `let` because Scraper#setID replaces it
// with a logger named after the scraper id.
let logger = log4js.getLogger('Scraper');
const EventEmitter = require('events');
const dateFormat = require('dateformat');
const puppeteer = require('puppeteer');
// Default verbosity for the module-level logger; override with LOGGER_LEVEL.
logger.level = process.env.LOGGER_LEVEL || 'debug';
/**
 * Base class for Puppeteer-driven scrapers.
 *
 * Owns a single browser (`this.browser`) and a single page (`this.page`),
 * blocks requests whose URL matches `this.filters`, and emits a `'recover'`
 * event when the browser or page goes away unexpectedly.
 *
 * NOTE(review): several methods reference names that are neither imported nor
 * defined in this file (`del`, `removeAccents`, `crc`, `camelCase`, `archiver`,
 * `tldExtract`) and members that are not defined on this class (`recover`,
 * `_uploadError`, `modePrefix`, `mode`). They are presumably supplied by
 * subclasses or by imports that are missing here -- confirm.
 */
class Scraper extends EventEmitter {
  constructor() {
    super(); // must call super for "this" to be defined.

    // URL substrings; _createBrowserPage aborts any request whose URL contains one.
    this.filters = [
      'livefyre',
      'moatad',
      'analytics',
      'controltag',
      'chartbeat',
      'siteimprove',
      'hotjar',
      '/plugins/cookie-notice/',
      'addthis',
      'facebook.',
      'linkedin',
      'googletagmanager',
      'swiftypecdn.com',
      '-social-tracking.',
      'demdex.net',
      'adobedtm.com'
    ];

    // Run counters. Only `started` is written inside this class (see _start).
    this.perf = {
      'started': 0,
      'finished': 0,
      'time': 0,
      'scraped': 0
    };

    this.browserCrashed = false;
    this.crashLog = new Map([]);
    this.browser = null;
    this.page = null;

    // Must start as `false`: the 'disconnected' and 'close' handlers compare
    // with `=== false`, so an undefined value silently disabled recovery.
    this.detatchable = false;
  }

  /**
   * Sets the scraper id and swaps the module-level logger for one named after it.
   *
   * Note that the logger is shared by every Scraper instance in this module,
   * and that the default level here is 'warn' (the module default is 'debug').
   *
   * @param newID identifier used in log output and in the debug path
   */
  setID(newID) {
    logger = log4js.getLogger(`Scraper (${newID})`);
    logger.level = process.env.LOGGER_LEVEL || 'warn';
    this.id = newID;
  }

  /**
   * Deletes `path`, but only when NODE_ENV is 'production'.
   *
   * NOTE(review): `del` is not imported in this file, so in production this
   * call fails with a ReferenceError unless `del` is provided elsewhere.
   *
   * @param path
   * @returns {Promise<void>}
   */
  async emptyPath(path) {
    if (process.env.NODE_ENV !== 'production') {
      return;
    }

    const paths = await del([path]);

    logger.warn('Deleted files and folders:\n', paths.join('\n'));
  }

  /**
   * Sets the output path (emptying it first in production) and the per-scraper
   * debug path, creating both directories.
   *
   * @param newPath output directory
   * @returns {Promise<void>}
   */
  async setPath(newPath) {
    await this.emptyPath(newPath);

    this.path = `${newPath}`;
    this.debugPath = `${__dirname }/../debug/${this.id}`;

    await this._createDirectory(this.path);
    await this._createDirectory(this.debugPath);
  }

  /**
   * 'Human' like click delay
   * @returns {number} an integer between 90 and 118 inclusive
   */
  static notARobot() {
    return 90 + Math.floor(Math.random() * (30 - 1));
  }

  /**
   * Marks the browser/page as safe to close: once set, the 'disconnected',
   * 'close' and 'error' handlers no longer emit 'recover'.
   */
  canDetach() {
    this.detatchable = true;
  }

  /**
   * Closes the current page and browser, if any. Errors are logged, never
   * thrown, and `this.browser` is always reset to null.
   *
   * @returns {Promise<void>}
   * @private
   */
  async _killRunningBrowser() {
    if (!this.browser) {
      return;
    }

    try {
      logger.info('Trying to close hanging / running browser');
      await this._forcePageClose();
      // Drop our 'disconnected' handler first so a deliberate close does not trigger recovery.
      this.browser.removeAllListeners('disconnected');
      await this.browser.close();
    }
    catch (err) {
      logger.error('Closing browser', err);
    }
    finally {
      this.browser = null;
    }
  }

  /**
   * Launches a fresh browser, closing any existing one first.
   *
   * @param headless requested headless mode; forced to true when NODE_ENV is 'production'
   * @returns {Promise<void>}
   * @throws rethrows the puppeteer launch error after logging it
   * @private
   */
  async _initBrowser(headless = true) {
    // Force headless when running in production
    const realHeadless = (process.env.NODE_ENV === 'production') ? true : headless;

    await this._killRunningBrowser();
    this.browserCrashed = false;

    logger.info('Puppeteer.launch', realHeadless);
    logger.debug('Using proxy:', process.env.PROXY_URI);

    const args = [
      '--disable-dev-shm-usage',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      // Chromium expects "width,height" here, not "widthxheight".
      '--window-size=1920,1080',
      '--hide-scrollbars',
      '--disable-default-apps'
    ];

    // Use proxy so FCA wont block us -- but only when one is configured,
    // otherwise Chromium would be told to use a proxy called "undefined".
    if (process.env.PROXY_URI) {
      args.unshift(`--proxy-server=${process.env.PROXY_URI}`);
    }

    try {
      this.browser = await puppeteer.launch({ 'headless': realHeadless, 'args': args });
    }
    catch (err) {
      // Previously the error was swallowed and the next line crashed with an
      // unrelated TypeError on the undefined browser.
      logger.error('Puppeteer failed to launch');
      logger.error(err);
      throw err;
    }

    const browserVersion = await this.browser.version();

    logger.info(`Browser version ${browserVersion}`);

    this.browser.on('disconnected', () => {
      logger.warn('Browser has become detached!');
      if (this.detatchable === false) {
        this.browserCrashed = true;
        logger.warn('browser.onDisconnected::emit recover');
        this.emit('recover');
      }
    });
  }

  /**
   * Closes the current page, if any. Errors are logged, never thrown, and
   * `this.page` is always reset to null.
   *
   * @returns {Promise<void>}
   * @private
   */
  async _forcePageClose() {
    if (!this.page) {
      return;
    }

    try {
      logger.warn('Browser Page exists: DESTROYING');
      // Drop our 'close' handler first so a deliberate close does not trigger recovery.
      this.page.removeAllListeners('close');
      await this.page.close().catch((e) => {
        logger.debug(e);
      });
    }
    catch (err) {
      logger.error(err);
    }
    finally {
      this.page = null;
    }
  }

  /**
   * Replaces `this.page` with a new page that has 90 s timeouts, request
   * filtering and the logging / recovery event handlers attached.
   *
   * @returns {Promise<void>}
   * @private
   */
  async _createBrowserPage() {
    // Must be awaited: _forcePageClose nulls this.page in its `finally`, which
    // could otherwise run after the new page has been assigned below.
    await this._forcePageClose();

    this.page = await this.browser.newPage();

    try {
      await this.page.setDefaultNavigationTimeout(90000);
      await this.page.setDefaultTimeout(90000);
    }
    catch (err) {
      logger.debug(err);
    }

    await this.page.setRequestInterception(true);

    this.page.on('request', (request) => {
      const requestUrl = request.url();

      logger.trace('request', requestUrl);

      const shouldAbort = this.filters.some((urlPart) => requestUrl.includes(urlPart));

      if (shouldAbort) {
        request.abort();
      }
      else {
        request.continue();
      }
    });

    this.page.on('dialog', async dialog => {
      logger.warn('Dialog Box', dialog.message());
      await dialog.dismiss();
    });

    this.page.on('error', async err => {
      logger.warn('Page crashed', err);
      if (!this.detatchable) {
        // _uploadError is not defined on this class; guard the call so a
        // missing or failing uploader cannot prevent recovery.
        try {
          if (typeof this._uploadError === 'function') {
            await this._uploadError();
          }
        }
        catch (uploadErr) {
          logger.error('_uploadError', uploadErr);
        }

        logger.warn('page.onError::emit recover');
        this.emit('recover');
      }
    });

    this.page.on('pageerror', async err => {
      logger.trace('pageerror', err);
    });

    this.page.on('requestfailed', async request => {
      // Use the public accessor rather than the private `_url` field.
      const failedUrl = request.url();
      const blocked = this.filters.some((urlPart) => failedUrl.includes(urlPart));

      // Requests we aborted ourselves are expected failures; log them quietly.
      if (blocked) {
        logger.trace('🚫', failedUrl);
      }
      else {
        logger.warn('requestfailed', failedUrl);
      }
    });

    this.page.on('close', () => {
      logger.warn('Browser Page has closed');
      if (this.detatchable === false) {
        logger.warn('page.onClose::emit recover');
        this.emit('recover');
      }
    });
  }

  /**
   * Applies a 1200x1200 touch/mobile viewport and a 90 s navigation timeout to
   * the current page.
   *
   * NOTE(review): the `name` and `userAgent` entries below are never applied;
   * only `viewport` is used.
   *
   * @returns {Promise<void>}
   * @private
   */
  async _makeResponsive() {
    const viewPort = {
      'name': 'Responsive',
      'userAgent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3494.0 Safari/537.36',
      'viewport': {
        'width': 1200,
        'height': 1200,
        'deviceScaleFactor': 4.5,
        'isMobile': true,
        'hasTouch': true,
        'isLandscape': true
      }
    };

    await this.page.setViewport(viewPort.viewport);
    await this.page.setDefaultNavigationTimeout(90000);
  }

  /**
   * Builds a file name from `id`: non-word characters become spaces, accents
   * are removed, the result is camel-cased and prefixed with
   * `this.modePrefix[this.mode]`. Names longer than 175 characters are cut to
   * 175 and suffixed with `_` plus the hex CRC32 of `id`.
   *
   * NOTE(review): `removeAccents`, `crc` and `camelCase` are not imported in
   * this file, and `modePrefix` / `mode` are not set by this class.
   *
   * @param id
   * @returns {string}
   * @private
   */
  _makeFileName(id) {
    const nonWordChars = /\W/g;
    const maxChars = 175;
    const entity = removeAccents.remove(id.replace(nonWordChars, ' ').trim());
    const _crc = crc.crc32(id).toString(16);
    const output = [this.modePrefix[this.mode], camelCase(entity)].join('');

    if (output.length > maxChars) {
      return output.substring(0, maxChars).concat('_', _crc);
    }

    return output;
  }

  /**
   * Joins `this.path` with the file name for `id`, capped at 240 characters.
   *
   * @param id
   * @returns {Promise<string>}
   * @private
   */
  async _makeFilePath(id) {
    return `${this.path}/${this._makeFileName(id)}`.substring(0, 240);
  }

  /**
   * Takes a full-page PNG screenshot of `page` at a 1200x800 viewport.
   * All failures are logged rather than thrown.
   *
   * @param page page to capture
   * @param destPath destination path without the `.png` extension
   * @param waitFor optional value handed to `page.waitFor` before capturing
   * @returns {Promise<void>}
   * @private
   */
  async _makeScreenshotV2(page, destPath, waitFor = null) {
    try {
      if (waitFor) {
        await page.waitFor(waitFor);
      }

      // Checked after the wait: the scraper's page may have been closed meanwhile.
      if (!this.page) {
        logger.warn('_makeScreenshotV2: No Page -- Not taking screenshot');

        return;
      }

      logger.debug('Snapshot', `${destPath}.png`);

      await page.setViewport({ 'width': 1200, 'height': 800 });
      await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => {
        logger.error('Screenshot', err);
      });
    }
    catch (err) {
      logger.error('_makeScreenshotV2', err);
    }
  }

  /**
   * Waits a random whole number of seconds between `minTime` and `maxTime`
   * (both inclusive).
   *
   * @param page page whose `waitFor` is used
   * @param minTime lower bound in seconds
   * @param maxTime upper bound in seconds
   * @param msg optional text added to the debug log line
   * @returns {Promise<void>}
   * @private
   */
  async _randomWait(page, minTime = 2, maxTime = 10, msg = '') {
    const insertedMsg = (msg.length > 0) ? `${this.id} ${msg} - ` : `${this.id} `;
    const waitTime = Math.floor(Math.random() * (maxTime - minTime + 1) + minTime);

    logger.debug(`${insertedMsg}Waiting ${waitTime} seconds...`);

    await page.waitFor(waitTime * 1000);
  }

  /**
   * Waits `waitTime` tenths of a second (`waitTime * 100` ms).
   *
   * @param page page whose `waitFor` is used
   * @param waitTime number of 100 ms units
   * @param msg optional text; the wait is only logged when this is non-empty
   * @returns {Promise<void>}
   * @private
   */
  async _microWait(page, waitTime, msg = '') {
    const insertedMsg = (msg.length > 0) ? `${msg} - ` : '';

    if (msg !== '') {
      logger.debug(`${insertedMsg}Waiting ${waitTime * 100} ms...`);
    }

    await page.waitFor(waitTime * 100);
  }

  /**
   * Waits `waitTime` hundredths of a second (`waitTime * 10` ms).
   *
   * @param page page whose `waitFor` is used
   * @param waitTime number of 10 ms units
   * @param msg optional text; the wait is only logged when this is non-empty
   * @returns {Promise<void>}
   * @private
   */
  async _nanoWait(page, waitTime, msg = '') {
    const insertedMsg = (msg.length > 0) ? `${msg} - ` : '';

    if (msg !== '') {
      logger.debug(`${insertedMsg}Waiting ${waitTime * 10} ms...`);
    }

    await page.waitFor(waitTime * 10);
  }

  /**
   * Writes `data` to `<this dir>/../artefacts/<destPath>`.
   * Use for artefacts saving only.
   *
   * @param destPath path relative to the artefacts directory
   * @param data
   * @returns {Promise<string>} confirmation message; rejects with the write error
   * @private
   */
  async _saveToFile(destPath, data) {
    const fullPath = `${__dirname}/../artefacts/${destPath}`;

    // fs-extra returns a promise when no callback is supplied.
    await fs.writeFile(fullPath, data);

    return `File saved to '${fullPath}'`;
  }

  /**
   * Writes `data` to `destPath` exactly as given.
   *
   * @param destPath
   * @param data
   * @returns {Promise<string>} confirmation message; rejects with the write error
   * @private
   */
  async _dumpFile(destPath, data) {
    await fs.writeFile(destPath, data);

    return `File saved to '${destPath}'`;
  }

  /**
   * Ensures `<destPath>/<yyyymmddHHMM>` exists and returns that path.
   *
   * @param destPath parent directory
   * @returns {Promise<string>}
   * @private
   */
  async _createTimestampDirectory(destPath = null) {
    const timestamp = dateFormat(new Date(), 'yyyymmddHHMM');
    const fullPath = `${destPath}/${timestamp}`;

    logger.info('fullPath', fullPath);

    // ensureDir is a no-op when the directory already exists.
    await fs.ensureDir(fullPath);

    return fullPath;
  }

  /**
   * Ensures `destPath` exists. Failures are logged, not thrown.
   *
   * @param destPath
   * @returns {Promise<*>} `destPath`, whether or not creation succeeded
   * @private
   */
  async _createDirectory(destPath = null) {
    try {
      await fs.ensureDir(destPath);
    }
    catch (err) {
      logger.error('_createDirectory', err);
    }

    return destPath;
  }

  /**
   * Adds `destPath` (a directory, or a glob pattern when `glob` is true) to an
   * archive and finalizes it.
   *
   * NOTE(review): `archiver` is not imported in this file, `filename` is passed
   * where archiver takes a format name, and the archive is not piped anywhere
   * -- _createArchiveV2 looks like the working variant.
   *
   * @param destPath
   * @param filename
   * @param glob treat `destPath` as a glob pattern
   * @returns {Promise<*>} rejects when a path is missing or archiving fails
   * @private
   */
  async _createArchive(destPath = null, filename = null, glob = false) {
    return new Promise((resolve, reject) => {
      if (!destPath || !filename) {
        const e = new Error('Missing paths');

        logger.error(e);
        reject(e);

        // Stop here; previously execution carried on with the missing paths.
        return;
      }

      const archive = archiver(filename, {
        'zlib': { 'level': 9 } // Sets the compression level.
      });

      archive.on('error', reject);

      if (glob) {
        archive.glob(`${destPath}`);
      }
      else {
        archive.directory(`${destPath}/`);
      }

      archive.finalize().then(() => {
        logger.debug('Archive finished');
        resolve();
      }).catch(reject);
    });
  }

  /**
   * Streams `destPath` (a directory, or a glob pattern when `glob` is true)
   * into a zip file at `filename`.
   *
   * NOTE(review): `archiver` is not imported in this file.
   *
   * @param destPath
   * @param filename
   * @param glob treat `destPath` as a glob pattern
   * @returns {Promise<*>} resolves once the output file has been closed;
   *   rejects when a path is missing or archiving / writing fails
   * @private
   */
  async _createArchiveV2(destPath = null, filename = null, glob = false) {
    logger.debug('=== _createArchiveV2 :: STREAMING ===');

    return new Promise((resolve, reject) => {
      if (!destPath || !filename) {
        const e = new Error('Missing paths');

        logger.error(e);
        reject(e);

        // Stop here; previously a write stream was still opened on a null filename.
        return;
      }

      const output = fs.createWriteStream(filename);
      const archive = archiver('zip', {
        'TransformOptions': {
          'objectMode':true
        },
        'zlib': { 'level': 6 } // Sets the compression level.
      });

      // The zip is only complete once the file stream has closed, so resolve
      // there rather than when finalize() returns.
      output.on('close', () => {
        logger.debug('Archive finished');
        resolve();
      });
      output.on('error', reject);
      archive.on('error', reject);

      archive.pipe(output);

      if (glob) {
        archive.glob(`${destPath}`);
      }
      else {
        archive.directory(`${destPath}/`);
      }

      archive.finalize().catch(reject);
    });
  }

  /**
   * Parses `urlStr` and passes its host to `tldExtract.parse_host`.
   *
   * Note that a failure is RETURNED (the caught error object), not thrown.
   * NOTE(review): `tldExtract` is not imported in this file.
   *
   * @param urlStr
   * @returns {*} null for empty input, otherwise the parse result or the caught error
   */
  explodeURL (urlStr = null) {
    if (!urlStr || urlStr === '') {
      return (null);
    }

    try {
      const workURL = url.parse(urlStr);

      return tldExtract.parse_host( workURL.host);
    }
    catch (e) {
      return e;
    }
  }

  /**
   * Get Params from a url string
   *
   * Only the text between the first and second `?` is parsed. A value keeps
   * everything after its first `=`; a parameter without `=` maps to undefined.
   * Malformed percent-escapes do not throw: the raw string is parsed instead.
   *
   * @param url url string; other values are converted to a string first
   * @returns {Object} map of parameter name to value
   */
  _getParamsFromUrl(url) {
    const raw = String(url);
    let decoded;

    try {
      decoded = decodeURI(raw);
    }
    catch (err) {
      // URIError on malformed escapes; fall back to the undecoded text.
      decoded = raw;
    }

    const obj = {};
    const sections = decoded.split('?');

    if (sections.length > 1) {
      for (const param of sections[1].split('&')) {
        const eq = param.indexOf('=');

        if (eq === -1) {
          obj[param] = undefined;
        }
        else {
          // Split on the first '=' only, so values such as base64 padding survive.
          obj[param.slice(0, eq)] = param.slice(eq + 1);
        }
      }
    }

    return obj;
  }

  /**
   * Removes every newline, collapses remaining whitespace runs to a single
   * space and trims the result.
   *
   * @param text
   * @returns {string} '' for falsy input
   * @private
   */
  _cleanUp(text) {
    if (!text) return '';

    // Global flag required: without it only the first newline was removed and
    // later ones were turned into spaces instead.
    const regexNewLine = /\n/g;
    const regexCollapseWS = /\s+/g;

    return text.replace(regexNewLine, '').replace(regexCollapseWS, ' ').trim();
  }

  /**
   * Turns free text into a field name: cleaned up, accents removed,
   * punctuation stripped, then camel-cased.
   *
   * NOTE(review): `removeAccents` and `camelCase` are not imported in this file.
   *
   * @param text
   * @returns {string} '' for falsy input
   * @private
   */
  _makeFieldName(text) {
    if (!text) return '';

    const removePunctuation = /([^A-Za-z0-9\s])+/g;
    const cleaned = this._cleanUp(text);
    const unaccented = removeAccents.remove(cleaned);

    return camelCase(unaccented.replace(removePunctuation, ''));
  }

  /**
   * Renames `origFN` to `newFN` when `origFN` exists. Failures are logged,
   * not thrown.
   *
   * @param origFN
   * @param newFN
   * @returns {Promise<void>}
   * @private
   */
  async _renameFile(origFN, newFN) {
    try {
      // Previously relied on a `checkFileExists` helper that is not defined
      // in this file; fs-extra provides the same check.
      const exists = await fs.pathExists(origFN);

      logger.debug(`file exists: ${exists}`);

      if (exists) {
        await fs.rename(origFN, newFN);
      }
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Records the start time and hooks `this.recover()` to the 'recover' event.
   *
   * NOTE(review): `recover` is not defined on this class; presumably
   * subclasses implement it.
   *
   * @returns {Promise<void>}
   * @private
   */
  async _start() {
    logger.debug(`<=- START ${this.id}-=>`);

    this.perf.started = Date.now();

    if (!this._recoverHandler) {
      this._recoverHandler = async () => {
        try {
          await this.recover();
        }
        catch (err) {
          logger.error('recover', err);
        }
      };
    }

    // Register at most once, so repeated _start() calls do not stack handlers
    // and run recover() several times per event.
    if (!this.listeners('recover').includes(this._recoverHandler)) {
      this.on('recover', this._recoverHandler);
    }
  }

  /**
   * Marks the scraper detachable, then closes the page and browser.
   *
   * @returns {Promise<void>}
   * @private
   */
  async _done() {
    logger.info('<=- DONE -=>');

    // OK To close the browser window now
    this.canDetach();

    await this._forcePageClose();
    await this._killRunningBrowser();
    await this._complete();
  }

  /**
   * Runs the garbage collector when it is exposed (`node --expose-gc`).
   *
   * @returns {Promise<void>}
   * @private
   */
  async _complete() {
    try {
      if (global.gc) {
        global.gc();
      }
    }
    catch (e) {
      logger.warn('`node --expose-gc`');
    }

    logger.info('<=- COMPLETE -=>');
  }

  /**
   * Navigates the current page to `url` and remembers it in `this.lastUrl`.
   * Navigation errors are logged, never thrown.
   *
   * @param url
   * @param options merged over `{ timeout: 90000, waitUntil: 'networkidle0' }`
   * @param noRecover when true, a failed navigation does not emit 'recover'
   * @returns {Promise<void>}
   * @private
   */
  async _goto(url, options = {}, noRecover = false) {
    this.lastUrl = url;

    const newOptions = Object.assign({ 'timeout':90000, 'waitUntil':'networkidle0' }, options);

    logger.debug(newOptions);

    try {
      logger.info('Goto:', url);

      // Navigation failures are handled here and may trigger recovery ...
      await this.page.goto(url, newOptions).catch((err) => {
        logger.error('GOTO', err);

        if (err.message.includes('net::ERR_FAILED')) {
          this.browserCrashed = true;
        }

        if (!noRecover) {
          this.emit('recover');
        }
      });
    }
    catch (error) {
      // ... whereas anything thrown before the navigation starts (for example
      // a missing page) is only logged.
      logger.error(error);
      logger.error(url, options);
    }
  }

  /**
   * Wraps `fn` so that it only runs after `time` ms have passed without a
   * further call; the last call's `this` and arguments are used.
   *
   * @param fn
   * @param time delay in milliseconds
   * @returns {Function}
   * @private
   */
  _debounce(fn, time) {
    let timeout;

    return function (...args) { // <-- not an arrow function
      const functionCall = () => fn.apply(this, args);

      clearTimeout(timeout);
      timeout = setTimeout(functionCall, time);
    };
  }

  /**
   * Wraps `callback` so that it runs at most once per `limit` ms; calls made
   * during the cool-down are dropped.
   *
   * @param callback
   * @param limit cool-down in milliseconds
   * @returns {Function}
   * @private
   */
  _throttle (callback, limit) {
    let wait = false;

    return function (...args) { // <-- not an arrow function
      if (wait) {
        return;
      }

      // Forward the caller's `this`, as _debounce and _once do.
      callback.apply(this, args);
      wait = true;

      setTimeout(() => {
        wait = false;
      }, limit);
    };
  }

  /**
   * Wraps `func` so that it runs on the first call only; later calls return
   * the first result.
   *
   * @param func
   * @returns {function(): *}
   * @private
   */
  _once(func) {
    let alreadyCalled = false;
    let result;

    return function (...args) { // <-- not an arrow function
      if (!alreadyCalled) {
        result = func.apply(this, args);
        alreadyCalled = true;
      }

      return result;
    };
  }

  /**
   * Navigates to `restartURL`, or to the last visited url when it is omitted.
   *
   * @param restartURL
   * @returns {Promise<void>}
   */
  async restart(restartURL) {
    const rURL = restartURL || this.lastUrl;

    logger.info(`Restarting ${this.id} // Going to ${rURL}`);

    await this._goto(rURL);
  }

  /**
   * Writes `data` to `filename`. Failures are logged, not thrown.
   *
   * @param filename
   * @param data
   * @returns {Promise<void>}
   */
  async saveFile(filename, data) {
    try {
      await fs.writeFile(filename, data);
    }
    catch (err) {
      logger.error(err);
    }
  }
}
// The Scraper class is this module's only export.
module.exports = Scraper;