const fs = require('fs-extra');
const path = require('path');
const url = require('url');
const log4js = require('log4js');
const del = require('del');
const EventEmitter = require('events');
const dateFormat = require('dateformat');
const puppeteer = require('puppeteer');

// NOTE(review): `removeAccents`, `crc`, `camelCase`, `archiver` and
// `tldExtract` are referenced by methods below but are never required in this
// file. Those methods throw a ReferenceError until the matching modules are
// imported here -- TODO confirm the intended packages and add the requires.

let logger = log4js.getLogger('Scraper');
logger.level = process.env.LOGGER_LEVEL || 'debug';

/** Navigation / default timeout applied to pages, in milliseconds. */
const PAGE_TIMEOUT_MS = 90000;

/**
 * Base class wrapping a Puppeteer browser and a single page.
 *
 * Emits `'recover'` when the browser disconnects, the page closes or crashes,
 * or a navigation fails, unless `canDetach()` has been called. `_start()`
 * wires that event to `this.recover()`, which is not defined in this class
 * (presumably supplied by subclasses -- verify). `_uploadError()`,
 * `this.mode` and `this.modePrefix` are likewise used but not defined here.
 */
class Scraper extends EventEmitter {
  constructor() {
    super(); // must call super for "this" to be defined.

    // Requests whose URL contains any of these fragments are aborted.
    this.filters = [
      'livefyre',
      'moatad',
      'analytics',
      'controltag',
      'chartbeat',
      'siteimprove',
      'hotjar',
      '/plugins/cookie-notice/',
      'addthis',
      'facebook.',
      'linkedin',
      'googletagmanager',
      'swiftypecdn.com',
      '-social-tracking.',
      'demdex.net',
      'adobedtm.com'
    ];

    this.perf = {
      'started': 0,
      'finished': 0,
      'time': 0,
      'scraped': 0
    };

    this.browserCrashed = false;
    this.crashLog = new Map([]);
    this.browser = null;
    this.page = null;

    // Must start as an explicit `false`: the 'disconnected' and 'close'
    // handlers compare with `=== false`, which never matched `undefined`.
    this.detatchable = false;
  }

  /**
   * Sets the scraper id and replaces the module logger with one named after it.
   *
   * @param newID identifier used in the logger name and in `debugPath`
   */
  setID(newID) {
    logger = log4js.getLogger(`Scraper (${newID})`);
    logger.level = process.env.LOGGER_LEVEL || 'warn';
    this.id = newID;
  }

  /**
   * Deletes the given path, but only when NODE_ENV is 'production'.
   *
   * @param path path (or glob) handed to `del`
   * @returns {Promise}
   */
  async emptyPath(path) {
    if (process.env.NODE_ENV !== 'production') return;

    const paths = await del([path]);
    logger.warn('Deleted files and folders:\n', paths.join('\n'));
  }

  /**
   * Sets the output path and the per-scraper debug path, creating both.
   *
   * The output directory is neither emptied nor timestamped here; earlier
   * code that did so was disabled.
   *
   * @param newPath output directory
   * @returns {Promise}
   */
  async setPath(newPath) {
    this.path = `${newPath}`;
    this.debugPath = `${__dirname }/../debug/${this.id}`;

    await this._createDirectory(this.path);
    await this._createDirectory(this.debugPath);
  }

  /**
   * 'Human' like click delay
   *
   * @returns {number} an integer between 90 and 118 inclusive
   */
  static notARobot() {
    return 90 + Math.floor(Math.random() * (30 - 1));
  }

  /**
   * Marks the scraper as detachable: browser / page shutdown after this call
   * no longer emits 'recover'.
   */
  canDetach() {
    this.detatchable = true;
  }

  /**
   * Closes the current page and browser, if a browser exists.
   * Errors are logged, never thrown; `this.browser` is always reset to null.
   *
   * @returns {Promise}
   * @private
   */
  async _killRunningBrowser() {
    if (!this.browser) return;

    try {
      logger.info('Trying to close hanging / running browser');
      await this._forcePageClose();
      // Drop the handler first so a deliberate close is not treated as a crash.
      this.browser.removeAllListeners('disconnected');
      await this.browser.close();
    }
    catch (err) {
      logger.error('Closing browser', err);
    }
    finally {
      this.browser = null;
    }
  }

  /**
   * Launches a fresh browser, closing any existing one first.
   *
   * @param headless requested headless mode; forced to true in production
   * @returns {Promise}
   * @throws the launch error, after logging it, when Puppeteer cannot start
   * @private
   */
  async _initBrowser(headless = true) {
    // Force headless when running in production
    const realHeadless = (process.env.NODE_ENV === 'production') ? true : headless;

    await this._killRunningBrowser();
    this.browserCrashed = false;

    logger.info('Puppeteer.launch', realHeadless);
    logger.debug('Using proxy:', process.env.PROXY_URI);

    const args = [
      '--disable-dev-shm-usage',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      // Chromium expects "width,height" for this switch.
      '--window-size=1920,1080',
      '--hide-scrollbars',
      '--disable-default-apps'
    ];

    // Use proxy so FCA wont block us -- but only when one is configured,
    // otherwise the browser would be told to use a proxy named "undefined".
    if (process.env.PROXY_URI) args.unshift(`--proxy-server=${process.env.PROXY_URI}`);

    try {
      this.browser = await puppeteer.launch({
        'headless': realHeadless,
        'args': args
      });
    }
    catch (err) {
      logger.error('Puppeteer failed to launch');
      logger.error(err);
      this.browser = null;
      // Surface the real failure instead of a TypeError on `this.browser` below.
      throw err;
    }

    const browserVersion = await this.browser.version();
    logger.info(`Browser version ${browserVersion}`);

    this.browser.on('disconnected', () => {
      logger.warn('Browser has become detached!');
      if (this.detatchable === false) {
        this.browserCrashed = true;
        logger.warn('browser.onDisconnected::emit recover');
        this.emit('recover');
      }
    });
  }

  /**
   * Closes the current page, if any, without triggering its 'close' handler.
   * Errors are logged, never thrown; `this.page` is always reset to null.
   *
   * @returns {Promise}
   * @private
   */
  async _forcePageClose() {
    if (!this.page) return;

    try {
      logger.warn('Browser Page exists: DESTROYING');
      this.page.removeAllListeners('close');
      await this.page.close().catch((e) => {
        logger.debug(e);
      });
    }
    catch (err) {
      logger.error(err);
    }
    finally {
      this.page = null;
    }
  }

  /**
   * Opens a new page on the current browser and attaches request filtering,
   * dialog dismissal and crash / close recovery handlers.
   *
   * @returns {Promise}
   * @private
   */
  async _createBrowserPage() {
    // Awaited so the old page is closed and nulled before the new one is stored.
    await this._forcePageClose();

    this.page = await this.browser.newPage();

    try {
      await this.page.setDefaultNavigationTimeout(PAGE_TIMEOUT_MS);
      await this.page.setDefaultTimeout(PAGE_TIMEOUT_MS);
    }
    catch (err) {
      logger.debug(err);
    }

    await this.page.setRequestInterception(true);

    this.page.on('request', (request) => {
      const requestUrl = request.url();
      logger.trace('request', requestUrl);

      if (this._isFiltered(requestUrl)) request.abort();
      else request.continue();
    });

    this.page.on('dialog', async dialog => {
      logger.warn('Dialog Box', dialog.message());
      await dialog.dismiss();
    });

    this.page.on('error', async err => {
      logger.warn('Page crashed', err);
      if (this.detatchable) return;

      try {
        await this._uploadError();
      }
      catch (uploadErr) {
        // A failed upload must not prevent recovery.
        logger.error(uploadErr);
      }

      logger.warn('page.onError::emit recover');
      this.emit('recover');
    });

    this.page.on('pageerror', async err => {
      logger.trace('pageerror', err);
    });

    this.page.on('requestfailed', async request => {
      const failedUrl = request.url();

      if (this._isFiltered(failedUrl)) logger.trace('🚫', failedUrl);
      else logger.warn('requestfailed', failedUrl);
    });

    this.page.on('close', () => {
      logger.warn('Browser Page has closed');
      if (this.detatchable === false) {
        logger.warn('page.onClose::emit recover');
        this.emit('recover');
      }
    });
  }

  /**
   * Tells whether a URL contains any of the fragments in `this.filters`.
   *
   * @param requestUrl URL string to test
   * @returns {boolean}
   * @private
   */
  _isFiltered(requestUrl) {
    return this.filters.some((urlPart) => requestUrl.includes(urlPart));
  }

  /**
   * Applies the 'Responsive' viewport and the navigation timeout to the page.
   *
   * @returns {Promise}
   * @private
   */
  async _makeResponsive() {
    const viewPort = {
      'name': 'Responsive',
      // NOTE(review): this user agent is declared but never applied to the page.
      'userAgent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3494.0 Safari/537.36',
      'viewport': {
        'width': 1200,
        'height': 1200,
        'deviceScaleFactor': 4.5,
        'isMobile': true,
        'hasTouch': true,
        'isLandscape': true
      }
    };

    await this.page.setViewport(viewPort.viewport);
    await this.page.setDefaultNavigationTimeout(PAGE_TIMEOUT_MS);
  }

  /**
   * Builds a file name from an id: the prefix for the current mode followed by
   * the camel-cased id. Names longer than 175 characters are cut to 175 and
   * suffixed with '_' plus the hex CRC32 of the id.
   *
   * @param id source string
   * @returns {string}
   * @private
   */
  _makeFileName(id) {
    const nonWordChars = /\W/g;
    const maxChars = 175;

    const entity = removeAccents.remove(id.replace(nonWordChars, ' ').trim());
    const _crc = crc.crc32(id).toString(16);
    const output = [this.modePrefix[this.mode], camelCase(entity)].join('');

    if (output.length <= maxChars) return output;

    return output.substring(0, maxChars).concat('_', _crc);
  }

  /**
   * Builds the file path for an id inside `this.path`, capped at 240 characters.
   *
   * @param id source string
   * @returns {Promise}
   * @private
   */
  async _makeFilePath(id) {
    return `${this.path}/${this._makeFileName(id)}`.substring(0, 240);
  }

  /**
   * Takes a full-page PNG screenshot at a 1200x800 viewport.
   * Does nothing when the scraper has no current page; errors are logged only.
   *
   * @param page page to capture
   * @param destPath destination path without the '.png' extension
   * @param waitFor optional value handed to `page.waitFor` first
   * @returns {Promise}
   * @private
   */
  async _makeScreenshotV2(page, destPath, waitFor = null) {
    try {
      if (waitFor) await page.waitFor(waitFor);

      if (!this.page) {
        logger.warn('_makeScreenshotV2: No Page -- Not taking screenshot');

        return;
      }

      logger.debug('Snapshot', `${destPath}.png`);

      await page.setViewport({ 'width': 1200, 'height': 800 });
      await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => {
        logger.error('Screenshot', err);
      });
    }
    catch (err) {
      logger.error('_makeScreenshotV2', err);
    }
  }

  /**
   * Waits a random whole number of seconds between minTime and maxTime inclusive.
   *
   * @param page page whose `waitFor` is used
   * @param minTime lower bound in seconds
   * @param maxTime upper bound in seconds
   * @param msg optional text added to the log line
   * @returns {Promise}
   * @private
   */
  async _randomWait(page, minTime = 2, maxTime = 10, msg = '') {
    const insertedMsg = (msg.length > 0) ? `${this.id} ${msg} - ` : `${this.id} `;
    const waitTime = Math.floor(Math.random() * (maxTime - minTime + 1) + minTime);

    logger.debug(`${insertedMsg}Waiting ${waitTime} seconds...`);

    await page.waitFor(waitTime * 1000);
  }

  /**
   * Waits `waitTime` x 100 milliseconds; logs only when msg is given.
   *
   * @param page page whose `waitFor` is used
   * @param waitTime number of 100 ms units
   * @param msg optional text for the log line
   * @returns {Promise}
   * @private
   */
  async _microWait(page, waitTime, msg = '') {
    const duration = waitTime * 100;

    if (msg !== '') logger.debug(`${msg} - Waiting ${duration} ms...`);

    await page.waitFor(duration);
  }

  /**
   * Waits `waitTime` x 10 milliseconds; logs only when msg is given.
   *
   * @param page page whose `waitFor` is used
   * @param waitTime number of 10 ms units
   * @param msg optional text for the log line
   * @returns {Promise}
   * @private
   */
  async _nanoWait(page, waitTime, msg = '') {
    const duration = waitTime * 10;

    if (msg !== '') logger.debug(`${msg} - Waiting ${duration} ms...`);

    await page.waitFor(duration);
  }

  /**
   * Writes data to a file under the '../artefacts' directory next to this module.
   *
   * @param destPath path relative to the artefacts directory
   * @param data content to write
   * @returns {Promise<*>} resolves with a confirmation message
   * @private
   */
  async _saveToFile(destPath, data) {
    // use for artefacts saving only
    const fullPath = `${__dirname}/../artefacts/${destPath}`;

    await fs.writeFile(fullPath, data);

    return `File saved to '${fullPath}'`;
  }

  /**
   * Writes data to the given path.
   *
   * @param destPath destination file path
   * @param data content to write
   * @returns {Promise<*>} resolves with a confirmation message
   * @private
   */
  async _dumpFile(destPath, data) {
    await fs.writeFile(destPath, data);

    return `File saved to '${destPath}'`;
  }

  /**
   * Creates (if missing) a 'yyyymmddHHMM' sub-directory of destPath.
   *
   * @param destPath parent directory
   * @returns {Promise} resolves with the full path of the sub-directory
   * @private
   */
  async _createTimestampDirectory(destPath = null) {
    const timestamp = dateFormat(new Date(), 'yyyymmddHHMM');
    const fullPath = `${destPath}/${timestamp}`;

    logger.info('fullPath', fullPath);

    if (!fs.existsSync(fullPath)) fs.ensureDirSync(fullPath);

    return fullPath;
  }

  /**
   * Creates the directory if it does not exist. Errors are logged, not thrown.
   *
   * @param destPath directory to create
   * @returns {Promise<*>} resolves with destPath
   * @private
   */
  async _createDirectory(destPath = null) {
    try {
      if (!fs.existsSync(destPath)) fs.ensureDirSync(destPath);
    }
    catch (err) {
      logger.error('_createDirectory', err);
    }

    return destPath;
  }

  /**
   * Builds an archive of a directory or glob.
   *
   * NOTE(review): `filename` is passed as the first argument to `archiver`
   * (where `_createArchiveV2` passes 'zip') and the archive is never piped to
   * an output -- prefer `_createArchiveV2`; confirm whether this is still used.
   *
   * @param destPath directory, or glob pattern when `glob` is true
   * @param filename first argument given to `archiver`
   * @param glob treat destPath as a glob pattern
   * @returns {Promise<*>} rejects when a path is missing or archiving fails
   * @private
   */
  async _createArchive(destPath = null, filename = null, glob = false) {
    return new Promise((resolve, reject) => {
      if (!destPath || !filename) {
        const e = new Error('Missing paths');
        logger.error(e);
        reject(e);

        // Stop here: nothing can be archived without both values.
        return;
      }

      const archive = archiver(filename, {
        'zlib': { 'level': 9 } // Sets the compression level.
      });

      archive.on('error', reject);

      if (glob) archive.glob(`${destPath}`);
      else archive.directory(`${destPath}/`);

      archive.finalize().then(() => {
        logger.debug('Archive finished');
        resolve();
      }).catch(reject);
    });
  }

  /**
   * Streams a zip archive of a directory or glob into `filename`.
   *
   * @param destPath directory, or glob pattern when `glob` is true
   * @param filename path of the zip file to write
   * @param glob treat destPath as a glob pattern
   * @returns {Promise<*>} rejects when a path is missing or archiving fails
   * @private
   */
  async _createArchiveV2(destPath = null, filename = null, glob = false) {
    logger.debug('=== _createArchiveV2 :: STREAMING ===');

    return new Promise((resolve, reject) => {
      if (!destPath || !filename) {
        const e = new Error('Missing paths');
        logger.error(e);
        reject(e);

        // Stop here: do not create an output file for an invalid request.
        return;
      }

      const output = fs.createWriteStream(filename);
      const archive = archiver('zip', {
        'TransformOptions': { 'objectMode':true },
        'zlib': { 'level': 6 } // Sets the compression level.
      });

      output.on('error', reject);
      archive.on('error', reject);

      archive.pipe(output);

      if (glob) archive.glob(`${destPath}`);
      else archive.directory(`${destPath}/`);

      archive.finalize().then(() => {
        logger.debug('Archive finished');
        resolve();
      }).catch(reject);
    });
  }

  /**
   * Parses a URL and hands its host to `tldExtract.parse_host`.
   *
   * @param urlStr URL to analyse
   * @returns {*} null for an empty input; the parse result; or, when parsing
   *   throws, the caught error object (returned, not thrown)
   */
  explodeURL (urlStr = null) {
    if (!urlStr || urlStr === '') return (null);

    try {
      const workURL = url.parse(urlStr);

      return tldExtract.parse_host(workURL.host);
    }
    catch (e) {
      return e;
    }
  }

  /**
   * Get Params from a url string
   *
   * Only the text between the first and second '?' is read. Values are not
   * decoded beyond the initial `decodeURI`; a key without '=' maps to undefined.
   *
   * @param url URL string
   * @returns {Object} key / value pairs of the query string
   */
  _getParamsFromUrl(url) {
    // decodeURI always returns a string, so no further type check is needed.
    const sections = decodeURI(url).split('?');
    const obj = {};

    if (sections.length > 1)
      for (const param of sections[1].split('&')) {
        const [key, value] = param.split('=');
        obj[key] = value;
      }

    return obj;
  }

  /**
   * Normalises whitespace in a string.
   *
   * Removes the FIRST newline only (the pattern has no `g` flag), then
   * collapses every whitespace run into a single space and trims.
   * NOTE(review): the non-global newline pattern may be unintended, but field
   * names derived from this output depend on it -- confirm before changing.
   *
   * @param text input text
   * @returns {string} cleaned text, or '' for a falsy input
   * @private
   */
  _cleanUp(text) {
    if (!text) return '';

    const regexNewLine = /\n/;
    const regexCollapseWS = /\s+/g;

    return text.replace(regexNewLine, '').replace(regexCollapseWS, ' ').trim();
  }

  /**
   * Turns free text into a camel-cased field name: cleans whitespace, strips
   * accents, removes everything except ASCII letters, digits and whitespace.
   *
   * @param text input text
   * @returns {string} field name, or '' for a falsy input
   * @private
   */
  _makeFieldName(text) {
    if (!text) return '';

    const removePunctuation = /([^A-Za-z0-9\s])+/g;
    const cleaned = this._cleanUp(text);
    const unaccented = removeAccents.remove(cleaned);

    return camelCase(unaccented.replace(removePunctuation, ''));
  }

  /**
   * Renames a file when the original exists. Errors are logged, not thrown.
   *
   * @param origFN current file path
   * @param newFN new file path
   * @returns {Promise}
   * @private
   */
  async _renameFile(origFN, newFN) {
    try {
      const exists = await fs.pathExists(origFN);
      logger.debug(`file exists: ${exists}`);

      if (exists) await fs.rename(origFN, newFN);
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Records the start time and wires the 'recover' event to `this.recover()`.
   * Safe to call repeatedly: the handler is attached exactly once.
   *
   * @returns {Promise}
   * @private
   */
  async _start() {
    logger.debug(`<=- START ${this.id}-=>`);

    this.perf.started = new Date().getTime();

    if (!this._onRecover)
      this._onRecover = async () => {
        try {
          await this.recover();
        }
        catch (err) {
          // An async listener has no caller to report to; log instead of
          // leaving an unhandled rejection.
          logger.error('recover', err);
        }
      };

    // Remove before adding so repeated starts never stack duplicate handlers.
    this.removeListener('recover', this._onRecover);
    this.on('recover', this._onRecover);
  }

  /**
   * Shuts down the page and browser without triggering recovery.
   *
   * @returns {Promise}
   * @private
   */
  async _done() {
    logger.info('<=- DONE -=>');

    // OK To close the browser window now
    this.canDetach();

    await this._forcePageClose();
    await this._killRunningBrowser();
    await this._complete();
  }

  /**
   * Runs the garbage collector when it is exposed, then logs completion.
   *
   * @returns {Promise}
   * @private
   */
  async _complete() {
    try {
      if (global.gc) global.gc();
    }
    catch (e) {
      logger.warn('`node --expose-gc`');
    }

    logger.info('<=- COMPLETE -=>');
  }

  /**
   * Navigates the current page. A failed navigation is logged and, unless
   * `noRecover` is set, emits 'recover'; this method itself never rejects.
   *
   * @param url address to open; remembered as `this.lastUrl`
   * @param options overrides for the default goto options
   * @param noRecover when true a failed navigation does not emit 'recover'
   * @returns {Promise}
   * @private
   */
  async _goto(url, options = {}, noRecover = false) {
    this.lastUrl = url;

    const newOptions = Object.assign({ 'timeout': PAGE_TIMEOUT_MS, 'waitUntil':'networkidle0' }, options);
    logger.debug(newOptions);

    try {
      logger.info('Goto:', url);

      // Started outside the inner try: a missing page is a programming error
      // that is only logged below, not a navigation failure to recover from.
      const navigation = this.page.goto(url, newOptions);

      try {
        await navigation;
      }
      catch (err) {
        logger.error('GOTO', err);

        if (err.message.includes('net::ERR_FAILED')) this.browserCrashed = true;

        if (!noRecover) this.emit('recover');
      }
    }
    catch (error) {
      logger.error(error);
      logger.error(url, options);
    }
  }

  /**
   * Wraps fn so it runs only after `time` ms have passed without another call.
   *
   * @param fn function to debounce
   * @param time quiet period in milliseconds
   * @returns {Function}
   * @private
   */
  _debounce(fn, time) {
    let timeout;

    return function (...args) { // <-- not an arrow function, keeps the caller's `this`
      clearTimeout(timeout);
      timeout = setTimeout(() => fn.apply(this, args), time);
    };
  }

  /**
   * Wraps callback so it runs at most once per `limit` ms; calls made during
   * the waiting period are dropped.
   *
   * @param callback function to throttle (invoked with a null `this`)
   * @param limit minimum gap between invocations in milliseconds
   * @returns {Function}
   * @private
   */
  _throttle (callback, limit) {
    let wait = false;

    return function (...args) {
      if (wait) return;

      callback.apply(null, args);
      wait = true;
      setTimeout(() => {
        wait = false;
      }, limit);
    };
  }

  /**
   * Wraps func so it runs only on the first call; later calls return the
   * first result.
   *
   * @param func function to guard
   * @returns {function(): *}
   * @private
   */
  _once(func) {
    let alreadyCalled = false;
    let result;

    return function (...args) {
      if (!alreadyCalled) {
        result = func.apply(this, args);
        alreadyCalled = true;
      }

      return result;
    };
  }

  /**
   * Navigates to restartURL, or to the last visited URL when none is given.
   *
   * @param restartURL optional address to restart from
   * @returns {Promise}
   */
  async restart(restartURL) {
    const rURL = restartURL || this.lastUrl;

    logger.info(`Restarting ${this.id} // Going to ${rURL}`);

    await this._goto(rURL);
  }

  /**
   * Synchronously writes data to filename. Errors are logged, not thrown.
   *
   * @param filename destination file path
   * @param data content to write
   * @returns {Promise}
   */
  async saveFile(filename, data) {
    try {
      fs.writeFileSync(filename, data);
    }
    catch (err) {
      logger.error(err);
    }
  }
}

module.exports = Scraper;