1792 lines
41 KiB
JavaScript
1792 lines
41 KiB
JavaScript
const fs = require('fs-extra');
|
|
const path = require('path');
|
|
const url = require('url');
|
|
const dns = require('dns');
|
|
const AWS = require('aws-sdk');
|
|
const puppeteer = require('puppeteer');
|
|
const archiver = require('archiver-promise');
|
|
const dateFormat = require('dateformat');
|
|
const whois = require('whois');
|
|
const whoisJSON = require('whois-json');
|
|
const sslCertificate = require('get-ssl-certificate');
|
|
const tldExtract = require('tld-extract');
|
|
const log4js = require('log4js');
|
|
// const logger = require('log4js').getLogger('Scraper');
|
|
const EventEmitter = require('events');
|
|
const dig = require('./dig');
|
|
const jsonfile = require('jsonfile');
|
|
const TimeFormat = require('hh-mm-ss');
|
|
const removeAccents = require('remove-accents-diacritics');
|
|
const del = require('del');
|
|
const camelCase = require('camelcase');
|
|
const crc = require('crc');
|
|
// Shared logger; setID() replaces it with an instance-specific one.
let logger = log4js.getLogger('Scraper');

const { promisify } = require('util');

// Promise-returning wrappers around callback-style APIs.
const whoisAsync = promisify(whois.lookup);
const readFileAsync = promisify(fs.readFile);

// Resolves true when fs.access succeeds on `s`, false otherwise; never rejects.
const checkFileExists = s => new Promise((r) => {
    fs.access(s, fs.F_OK, (e) => {
        r(!e);
    });
});

// Load environment variables from the .env file one level above this directory.
require('dotenv').config({ 'path': `${__dirname}/../.env` });

logger.level = process.env.LOGGER_LEVEL || 'warn';

// This keeps the process persistent & stops the constant restart loop in PM2.
// `done` never receives a truthy value, so the timer re-arms every second.
var done = (function wait () {
    if (done) {
        return;
    }

    setTimeout(wait, 1000);
}());

// Per-TLD server passed as `options.server` to the whois lookups below.
const dnsServers = {
    'fr' : 'whois.afnic.fr:43',
    'cy' : 'whois.cynic.dns.cy:43',
    'mt' : 'whois.nic.org.mt:43',
    'com.mt' : 'whois.nic.org.mt:43'
};

// TLDs that are resolved with the local `dig` helper instead of whois.
const useDig = ['cy'];

// Default the region
AWS.config.update({ 'region': 'eu-west-1' });

// Outside production the credentials (and optionally the region) come from the environment.
if (process.env.NODE_ENV !== 'production') {
    AWS.config.update({
        'accessKeyId': process.env.AWS_ACCESS_KEY_ID,
        'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY,
        'region': process.env.AWS_REGION || 'eu-west-1'
    });
}

const s3 = new AWS.S3();
const sns = new AWS.SNS();

/**
 * Catch all unhandled promises
 */
process.on('unhandledRejection', (reason, p) => {
    logger.error('⚡ Unhandled Rejection at: Promise', p, 'reason:', reason);
});
|
|
|
|
class Scraper extends EventEmitter {
|
|
|
|
constructor() {
|
|
super(); // must call super for "this" to be defined.
|
|
// this.dateTime = moment.calendarFormat("YYYYMMDD-HH-mm-ss");
|
|
|
|
process.on('uncaughtException', err => {
|
|
logger.error('Uncaught', err);
|
|
});
|
|
|
|
this.uriBase = process.env.SCRAPE_BASE_URI || 'https://register.fca.org.uk/ShPo_HomePage';
|
|
this.nonrepudation = {};
|
|
this.pathList = [];
|
|
|
|
this.filters = [
|
|
'livefyre',
|
|
'moatad',
|
|
'analytics',
|
|
'controltag',
|
|
'chartbeat',
|
|
'siteimprove',
|
|
'hotjar',
|
|
'/plugins/cookie-notice/',
|
|
'addthis',
|
|
'facebook.',
|
|
'linkedin',
|
|
'googletagmanager'
|
|
];
|
|
|
|
this.perf = {
|
|
'started': 0,
|
|
'finished': 0,
|
|
'time': 0,
|
|
'scraped': 0
|
|
};
|
|
|
|
this.lastUrl = '';
|
|
this.detatchable = false;
|
|
this.browserCrashed = false;
|
|
this.crashLog = new Map([]);
|
|
|
|
this.page = null;
|
|
|
|
this.modePrefix = ['ps_', 'em_', 'ci_'];
|
|
|
|
this.modeNames = ['paymentServices', 'emoneyServices', 'creditServices'];
|
|
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];
|
|
|
|
this.dictionary = new Map();
|
|
|
|
this.recover = this._debounce(async () => {
|
|
await this.__recover();
|
|
}, 30000);
|
|
}
|
|
|
|
setID(newID) {
|
|
logger = log4js.getLogger(`Scraper (${newID})`);
|
|
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
this.id = newID;
|
|
}
|
|
|
|
/**
|
|
* Add items to the URL filter
|
|
* @param items
|
|
*/
|
|
addToBlockFilters(items = []) {
|
|
// Consult uBlock Origin to see wht should be blocked on the page
|
|
this.filters = this.filters.concat(items);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param path
|
|
* @returns {Promise<void>}
|
|
*/
|
|
|
|
async emptyPath(path) {
|
|
if (process.env.NODE_ENV === 'production')
|
|
await del([path]).then(paths => {
|
|
logger.warn('Deleted files and folders:\n', paths.join('\n'));
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param newPath
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async setPath(newPath) {
|
|
const now = new Date();
|
|
const timestamp = dateFormat(now, 'yyyymmdd');
|
|
|
|
await this.emptyPath(newPath);
|
|
|
|
this.path = `${newPath}/${timestamp}`;
|
|
this.debugPath = `${__dirname }/../debug/${this.id}`;
|
|
await this._createDirectory(this.path);
|
|
await this._createDirectory(this.debugPath);
|
|
}
|
|
|
|
/**
|
|
* 'Human' like click delay
|
|
* @returns {number}
|
|
*/
|
|
static notARobot() {
|
|
return 90 + Math.floor(Math.random() * (30 - 1));
|
|
}
|
|
|
|
/**
|
|
*
|
|
*/
|
|
canDetach() {
|
|
this.detatchable = true;
|
|
}
|
|
|
|
async _killRunningBrowser() {
|
|
// if (typeof(this.browser) !== 'undefined' && this.browser !== null) {
|
|
if (this.browser)
|
|
try{
|
|
logger.info('Trying to close hanging / running browser');
|
|
|
|
await this._forcePageClose();
|
|
|
|
await this.browser.removeAllListeners('disconnected');
|
|
|
|
await this.browser.close();
|
|
}
|
|
catch(err) {
|
|
logger.error('Closing browser', err);
|
|
}
|
|
finally {
|
|
this.browser = null;
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param headless
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _initBrowser(headless = true) {
|
|
// Force headless when running in production
|
|
|
|
const realHeadless = (process.env.NODE_ENV === 'production') ? true : headless;
|
|
|
|
await this._killRunningBrowser();
|
|
|
|
this.browserCrashed = false;
|
|
|
|
logger.info('Puppeteer.launch', realHeadless);
|
|
|
|
logger.debug('Using proxy:', process.env.PROXY_URI);
|
|
this.browser = await puppeteer.launch({
|
|
'headless': realHeadless,
|
|
'args': [
|
|
// Use proxy so FCA wont block us
|
|
`--proxy-server=${process.env.PROXY_URI}`,
|
|
'--disable-dev-shm-usage',
|
|
'--no-sandbox',
|
|
'--disable-setuid-sandbox',
|
|
'--disable-accelerated-2d-canvas',
|
|
'--disable-gpu',
|
|
'--window-size=1920x1080',
|
|
'--hide-scrollbars',
|
|
'--disable-default-apps'
|
|
]
|
|
}).catch((err) => {
|
|
logger.error('Puppeteer failed to launch');
|
|
logger.error(err);
|
|
});
|
|
|
|
const browserVersion = await this.browser.version();
|
|
|
|
logger.info(`Browser version ${browserVersion}`);
|
|
|
|
this.browser.on('disconnected', () => {
|
|
logger.warn('Browser has become detached!');
|
|
|
|
if (this.detatchable === false) {
|
|
this.browserCrashed = true;
|
|
|
|
logger.warn('browser.onDisconnected::emit recover');
|
|
this.emit('recover');
|
|
}
|
|
});
|
|
}
|
|
|
|
async _forcePageClose() {
|
|
// if (this.page !== null) {
|
|
if (this.page)
|
|
|
|
try{
|
|
logger.warn('Browser Page exists: DESTROYING');
|
|
|
|
await this.page.removeAllListeners('close');
|
|
// this.page.on('close', () => {});
|
|
|
|
await this.page.close().catch((e) => {
|
|
logger.debug(e);
|
|
});
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
finally {
|
|
this.page = null;
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _createBrowserPage() {
|
|
this._forcePageClose();
|
|
|
|
this.page = await this.browser.newPage();
|
|
|
|
try{
|
|
await this.page.setDefaultNavigationTimeout(90000);
|
|
|
|
await this.page.setDefaultTimeout(90000);
|
|
}
|
|
catch(err) {
|
|
logger.debug(err);
|
|
}
|
|
|
|
await this.page.setRequestInterception(true);
|
|
|
|
this.page.on('request', (request) => {
|
|
const url = request.url();
|
|
logger.trace('request', url);
|
|
const shouldAbort = this.filters.some((urlPart) => url.includes(urlPart));
|
|
if (shouldAbort) request.abort();
|
|
else request.continue();
|
|
});
|
|
|
|
this.page.on('dialog', async dialog => {
|
|
logger.warn('Dialog Box', dialog.message());
|
|
await dialog.dismiss();
|
|
});
|
|
|
|
this.page.on('error', async err => {
|
|
logger.warn('Page crashed', err);
|
|
if (!this.detatchable) {
|
|
await this._uploadError();
|
|
logger.warn('page.onError::emit recover');
|
|
this.emit('recover');
|
|
}
|
|
});
|
|
|
|
this.page.on('pageerror', async err => {
|
|
logger.trace('pageerror', err);
|
|
});
|
|
|
|
this.page.on('requestfailed', async err => {
|
|
const url = err['_url'];
|
|
const blocked = this.filters.some((urlPart) => url.includes(urlPart));
|
|
|
|
if (blocked)
|
|
logger.trace('🚫', err['_url']);
|
|
else
|
|
logger.warn('requestfailed', err['_url']);
|
|
});
|
|
|
|
this.page.on('close', () => {
|
|
logger.warn('Browser Page has closed');
|
|
|
|
if (this.detatchable === false) {
|
|
logger.warn('page.onClose::emit recover');
|
|
this.emit('recover');
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _makeResponsive() {
|
|
const viewPort = {
|
|
'name': 'Responsive',
|
|
'userAgent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3494.0 Safari/537.36',
|
|
'viewport': {
|
|
'width': 1200,
|
|
'height': 1200,
|
|
'deviceScaleFactor': 4.5,
|
|
'isMobile': true,
|
|
'hasTouch': true,
|
|
'isLandscape': true
|
|
}
|
|
};
|
|
|
|
await this.page.setViewport(viewPort.viewport);
|
|
|
|
await this.page.setDefaultNavigationTimeout(90000);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param id
|
|
* @returns {string}
|
|
* @private
|
|
*/
|
|
_makeFileName(id) {
|
|
const noWhiteSpace = /\W/g;
|
|
const maxChars = 175;
|
|
const entity = removeAccents.remove(id.replace(noWhiteSpace, ' ').trim());
|
|
|
|
const _crc = crc.crc32(id).toString(16);
|
|
|
|
const output = [this.modePrefix[this.mode], camelCase(entity)].join('');
|
|
|
|
return (output.length > maxChars) ? output.substring(0, maxChars).concat('_', _crc) : output;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param id
|
|
* @returns {Promise<string>}
|
|
* @private
|
|
*/
|
|
async _makeFilePath(id) {
|
|
return `${this.path}/${this._makeFileName(id)}`.substring(0, 240);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param page
|
|
* @param destPath
|
|
* @param waitFor
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _makeScreenshot(page, destPath, waitFor = null) {
|
|
if (waitFor)
|
|
await page.waitFor(waitFor);
|
|
|
|
await page.setViewport({ 'width': 1200, 'height': 800 });
|
|
await page.screenshot({ 'path': `artefacts/screenshots/${destPath}.png`, 'fullPage': true }).catch((err) => {
|
|
logger.error('Screenshot', err);
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param page
|
|
* @param destPath
|
|
* @param waitFor
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _makeScreenshotV2(page, destPath, waitFor = null) {
|
|
try{
|
|
if (waitFor)
|
|
await page.waitFor(waitFor);
|
|
|
|
if(!this.page) {
|
|
logger.warn('_makeScreenshotV2: No Page -- Not taking screenshot');
|
|
|
|
return;
|
|
}
|
|
|
|
logger.debug('Snapshot', `${destPath}.png`);
|
|
await page.setViewport({ 'width': 1200, 'height': 800 });
|
|
await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => {
|
|
logger.error('Screenshot', err);
|
|
});
|
|
}
|
|
catch( err) {
|
|
logger.error('_makeScreenshotV2', err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param page
|
|
* @param minTime
|
|
* @param maxTime
|
|
* @param msg
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _randomWait(page, minTime = 2, maxTime = 10, msg = '') {
|
|
const insertedMsg = (msg.length > 0) ? `${this.id} ${msg} - ` : `${this.id} `;
|
|
|
|
const waitTime = Math.floor(Math.random() * (maxTime - minTime + 1) + minTime);
|
|
logger.debug(`${insertedMsg}Waiting ${waitTime} seconds...`);
|
|
await page.waitFor(waitTime * 1000);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param page
|
|
* @param waitTime
|
|
* @param msg
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _microWait(page, waitTime, msg = '') {
|
|
const insertedMsg = (msg.length > 0) ? `${msg} - ` : '';
|
|
|
|
if (msg !== '') logger.debug(`${insertedMsg}Waiting ${waitTime * 100} ms...`);
|
|
await page.waitFor(waitTime * 100);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param page
|
|
* @param waitTime
|
|
* @param msg
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _nanoWait(page, waitTime, msg = '') {
|
|
const insertedMsg = (msg.length > 0) ? `${msg} - ` : '';
|
|
|
|
if (msg !== '') logger.debug(`${insertedMsg}Waiting ${waitTime * 10} ms...`);
|
|
await page.waitFor(waitTime * 10);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param data
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _saveToFile(destPath, data) {
|
|
// use for artefacts saving only
|
|
return new Promise((resolve, reject) => {
|
|
const fullPath = `${__dirname}/../artefacts/${destPath}`;
|
|
fs.writeFile(fullPath, data, function(err) {
|
|
if(err)
|
|
reject(err);
|
|
else
|
|
resolve(`File saved to '${fullPath}'`);
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param data
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _dumpFile(destPath, data) {
|
|
return new Promise((resolve, reject) => {
|
|
fs.writeFile(destPath, data, function(err) {
|
|
if(err)
|
|
reject(err);
|
|
else
|
|
resolve(`File saved to '${destPath}'`);
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @returns {Promise<string>}
|
|
* @private
|
|
*/
|
|
async _createTimestampDirectory(destPath = null) {
|
|
const now = new Date();
|
|
|
|
const timestamp = dateFormat(now, 'yyyymmddHHMM');
|
|
const fullPath = `${destPath}/${timestamp}`;
|
|
|
|
logger.info('fullPath', fullPath);
|
|
|
|
if (!fs.existsSync(fullPath))
|
|
fs.ensureDirSync(fullPath);
|
|
|
|
return fullPath;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _createDirectory(destPath = null) {
|
|
try{
|
|
if (!fs.existsSync(destPath))
|
|
fs.ensureDirSync(destPath);
|
|
}
|
|
catch( err) {
|
|
logger.error('_createDirectory', err);
|
|
}
|
|
|
|
return destPath;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param filename
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _createArchive(destPath = null, filename = null, glob = false) {
|
|
return new Promise((resolve, reject) => {
|
|
if (!destPath || !filename) {
|
|
const e = new Error('Missing paths');
|
|
logger.error(e);
|
|
reject(e);
|
|
}
|
|
const archive = archiver(filename, {
|
|
'zlib': { 'level': 9 } // Sets the compression level.
|
|
});
|
|
|
|
if (glob)
|
|
archive.glob(`${destPath}`);
|
|
else
|
|
archive.directory(`${destPath}/`);
|
|
|
|
archive.finalize().then(() => {
|
|
logger.debug('Archive finished');
|
|
resolve();
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param filename
|
|
* @param glob
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _createArchiveV2(destPath = null, filename = null, glob = false) {
|
|
logger.debug('=== _createArchiveV2 :: STREAMING ===');
|
|
|
|
return new Promise((resolve, reject) => {
|
|
if (!destPath || !filename) {
|
|
const e = new Error('Missing paths');
|
|
logger.error(e);
|
|
reject(e);
|
|
}
|
|
|
|
const output = fs.createWriteStream(filename);
|
|
|
|
const archive = archiver('zip', {
|
|
'TransformOptions': {
|
|
'objectMode':true
|
|
},
|
|
'zlib': { 'level': 6 } // Sets the compression level.
|
|
});
|
|
|
|
archive.pipe(output);
|
|
|
|
if (glob)
|
|
archive.glob(`${destPath}`);
|
|
else
|
|
archive.directory(`${destPath}/`);
|
|
|
|
archive.finalize().then(() => {
|
|
logger.debug('Archive finished');
|
|
resolve();
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param urlStr
|
|
* @returns {*}
|
|
*/
|
|
explodeURL (urlStr = null) {
|
|
if (!urlStr || urlStr === '')
|
|
return (null);
|
|
|
|
try {
|
|
const workURL = url.parse(urlStr);
|
|
|
|
return tldExtract.parse_host( workURL.host);
|
|
}
|
|
catch(e) {
|
|
return e;
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param withPrefix
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _getWhoIsRaw(destPath = null, withPrefix = false) {
|
|
const options = {};
|
|
|
|
logger.debug('_getWhoIsRaw', destPath);
|
|
if (!destPath)
|
|
throw new Error('No destination path');
|
|
|
|
const explodedURL = this.explodeURL(destPath);
|
|
|
|
if (dnsServers.hasOwnProperty(explodedURL.tld))
|
|
options.server = dnsServers[explodedURL.tld];
|
|
|
|
const lookup = (withPrefix) ? `${explodedURL.sub}.${explodedURL.domain}` : `${explodedURL.domain}`;
|
|
|
|
logger.debug('_getWhoIsRaw', lookup);
|
|
|
|
if (useDig.indexOf(explodedURL.tld) > -1)
|
|
|
|
return await dig(`${explodedURL.sub}.${explodedURL.domain}`);
|
|
|
|
else
|
|
return await whoisAsync(lookup, options).catch((err) => {
|
|
logger.error('_getWhoIsRaw', err);
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param withPrefix
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _getWhoIsJSON(destPath = null, withPrefix = false) {
|
|
const options = { };
|
|
|
|
if (!destPath)
|
|
throw new Error('No destination path');
|
|
|
|
const explodedURL = this.explodeURL(destPath);
|
|
|
|
if (dnsServers.hasOwnProperty(explodedURL.tld))
|
|
options.server = dnsServers[explodedURL.tld];
|
|
|
|
const lookup = (withPrefix) ? `${explodedURL.sub}.${explodedURL.domain}` : `${explodedURL.domain}`;
|
|
|
|
logger.debug('_getWhoIsJSON', options);
|
|
if (useDig.indexOf(explodedURL.tld) > -1)
|
|
return {};
|
|
|
|
else
|
|
return await whoisJSON(lookup, options).catch((err) => {
|
|
logger.error('_getWhoIsJSON', err);
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _getWhoIsIPJSON(destPath = null) {
|
|
return new Promise((resolve, reject) => {
|
|
if (!destPath)
|
|
reject(new Error('No destination path'));
|
|
|
|
const workURL = url.parse(destPath);
|
|
|
|
dns.lookup(workURL.host, (err, address, family) => {
|
|
whoisJSON(address).catch((e) => {
|
|
logger.error(e);
|
|
reject(e);
|
|
}).then((result) => {
|
|
resolve(result);
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _getWhoIsIPRaw(destPath = null) {
|
|
return new Promise((resolve, reject) => {
|
|
if (!destPath)
|
|
reject(new Error('No destination path'));
|
|
|
|
const workURL = url.parse(destPath);
|
|
|
|
dns.lookup(workURL.host, (err, address, family) => {
|
|
if (err)
|
|
reject(err);
|
|
|
|
whois.lookup(address, (err, data) => {
|
|
if (err)
|
|
reject(err);
|
|
|
|
resolve(data);
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param prefix
|
|
* @returns {Promise<Error>}
|
|
* @private
|
|
*/
|
|
async _getSSLCert(destPath = null, prefix = false) {
|
|
if (!destPath)
|
|
return(new Error('No destination path'));
|
|
|
|
const explodedURL = this.explodeURL(destPath);
|
|
const searchFor = (prefix) ? `${explodedURL.sub}.${explodedURL.domain}` : `${explodedURL.domain}`;
|
|
|
|
logger.debug('Cert for:', searchFor);
|
|
|
|
return sslCertificate.get(searchFor, 5000);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param destPath
|
|
* @param options
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _populateNonRepudiation(destPath = null, options = {}) {
|
|
this.nonrepudation.whois = {};
|
|
this.nonrepudation.ipwhois = {};
|
|
|
|
const whoisWithPrefix = options.whoisWithPrefix || false;
|
|
const sslWithPrefix = options.sslWithPrefix || false;
|
|
const skipSsl = options.skipSsl || false;
|
|
|
|
logger.debug('Non Repudiation Data for', destPath);
|
|
|
|
await this._getWhoIsJSON(destPath, whoisWithPrefix).then((r) => {
|
|
this.nonrepudation.whois.json = r;
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
});
|
|
|
|
await this._getWhoIsRaw(destPath, whoisWithPrefix).then((r) => {
|
|
this.nonrepudation.whois.raw = r;
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
});
|
|
|
|
await this._getWhoIsIPRaw(destPath).then((r) => {
|
|
this.nonrepudation.ipwhois.raw = r;
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
});
|
|
|
|
await this._getWhoIsIPJSON(destPath).then((r) => {
|
|
this.nonrepudation.ipwhois.json = r;
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
});
|
|
|
|
if (options.skipSsl)
|
|
return;
|
|
|
|
await this._getSSLCert(destPath, sslWithPrefix).then((r) => {
|
|
this.nonrepudation.sslcertificate = r;
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
});
|
|
|
|
if (this.nonrepudation.sslcertificate === null || typeof(this.nonrepudation.sslcertificate) === 'undefined') {
|
|
logger.warn('Trying to retrieve SSL certificate with domain prefix.');
|
|
await this._getSSLCert(destPath, true).then((r) => {
|
|
this.nonrepudation.sslcertificate = r;
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
});
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param str
|
|
* @param length
|
|
* @returns {string}
|
|
* @private
|
|
*/
|
|
_zeroPad(str, length) {
|
|
const spaces = ' '.repeat(length);
|
|
|
|
return `${spaces}${str}`.slice((length * -1));
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param selector
|
|
* @param text
|
|
* @param url
|
|
* @returns {Promise<boolean>}
|
|
* @private
|
|
*/
|
|
async _findAndClick(selector, text = null, url = null) {
|
|
try {
|
|
logger.debug('_findAndClick selector', selector);
|
|
const mouseDownDuration = Scraper.notARobot();
|
|
|
|
if (!text && !url) {
|
|
logger.debug('Just clicking element');
|
|
await this.page.waitForSelector(selector, { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
await elm.click({ 'delay':mouseDownDuration });
|
|
});
|
|
}
|
|
else {
|
|
const clickableLinks = await this.page.$$(selector);
|
|
let innerText;
|
|
let href;
|
|
|
|
await this.page.hover(selector);
|
|
await this.page.waitForSelector(selector);
|
|
|
|
if (clickableLinks.length > 0)
|
|
for (const item of clickableLinks) {
|
|
innerText = await this.page.evaluate(el => el.innerText, item);
|
|
href = await this.page.evaluate(el => el.href, item);
|
|
|
|
if( (text && innerText === text ) || (url && href === url )) {
|
|
logger.debug('Matched item');
|
|
await item.click({ 'delay':mouseDownDuration });
|
|
await this._randomWait(this.page, 5, 10, 'After click');
|
|
// we need to break out of this for loop
|
|
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
}
|
|
catch(err) {
|
|
logger.error('_findAndClick', err);
|
|
this._uploadError();
|
|
this.emit('stall');
|
|
// process.exit(-99);
|
|
}
|
|
// selector `[id="${id}"] p a`
|
|
}
|
|
|
|
/**
|
|
* Get Params from a url string
|
|
*/
|
|
_getParamsFromUrl(url) {
|
|
url = decodeURI(url);
|
|
if (typeof url === 'string') {
|
|
const params = url.split('?');
|
|
|
|
const obj = {};
|
|
if (params.length > 1) {
|
|
const eachParamsArr = params[1].split('&');
|
|
|
|
if (eachParamsArr && eachParamsArr.length)
|
|
eachParamsArr.map(param => {
|
|
const keyValuePair = param.split('=');
|
|
const key = keyValuePair[0];
|
|
const value = keyValuePair[1];
|
|
obj[key] = value;
|
|
});
|
|
}
|
|
|
|
return obj;
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param text
|
|
* @returns {string}
|
|
* @private
|
|
*/
|
|
_cleanUp(text) {
|
|
if (!text) return '';
|
|
const regexNewLine = /\n/;
|
|
const regexCollapseWS = /\s+/g;
|
|
|
|
return text.replace(regexNewLine, '').replace(regexCollapseWS, ' ').trim();
|
|
}
|
|
|
|
_makeFieldName(text) {
|
|
const removePunctuation = /([^A-Za-z0-9\s])+/g;
|
|
|
|
if (!text) return '';
|
|
let workString = this._cleanUp(text);
|
|
workString = removeAccents.remove(workString);
|
|
workString = workString.replace(removePunctuation, '');
|
|
|
|
workString = camelCase(workString);
|
|
|
|
return workString;
|
|
}
|
|
|
|
async _renameFile(origFN, newFN) {
|
|
await checkFileExists(origFN)
|
|
.then(async exists => {
|
|
console.log(`file exists: ${exists}`);
|
|
|
|
if (exists)
|
|
await fs.renameSync(origFN, newFN);
|
|
}).catch((e) => {
|
|
logger.error(e);
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _loadDictionaryOld() {
|
|
await checkFileExists(`helpers/dictionary/${this.id}.json`)
|
|
.then(exists => {
|
|
console.log(`file exists: ${exists}`);
|
|
|
|
if (exists) {
|
|
const dictionary = jsonfile.readFileSync(`helpers/dictionary/${this.id}.json`);
|
|
this.dictionary = new Map(dictionary);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _loadDictionary() {
|
|
const langFileName = `lang.${this.id.toLowerCase()}.json`;
|
|
// _checkS3FileExists
|
|
|
|
await this._checkS3FileExists(langFileName)
|
|
.then(exists => {
|
|
if (exists)
|
|
return new Promise((resolve, reject) => { // (*)
|
|
this._getFileS3(langFileName).then((data) => {
|
|
this.dictionary = new Map(JSON.parse(data));
|
|
logger.info(`${this.id} dictionary loaded with ${this.dictionary.size} entries.`);
|
|
|
|
resolve(this.dictionary);
|
|
}).catch((err) => {
|
|
reject(err);
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<Promise<any>|undefined>}
|
|
* @private
|
|
*/
|
|
async _saveDictionary() {
|
|
if (this.dictionary.size > 0) {
|
|
logger.debug('Save dictionary', this.dictionary.size);
|
|
|
|
return new Promise((resolve, reject) => {
|
|
const langFileName = `lang.${this.id.toLowerCase()}.json`;
|
|
const arrayedMap = JSON.stringify([...this.dictionary]);
|
|
// const base64data = new Buffer.from(arrayedMap, 'binary');
|
|
const base64data = new Buffer.from(arrayedMap);
|
|
|
|
const s3Obj = {
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': langFileName,
|
|
'Body': base64data,
|
|
'ACL': 'public-read'
|
|
};
|
|
// await s3.deleteObject(params).promise().then((data) => {
|
|
s3.upload(s3Obj).promise()
|
|
.then((data) => {
|
|
return resolve(data);
|
|
})
|
|
.catch((err) => {
|
|
logger.error(err);
|
|
|
|
return reject(err);
|
|
});
|
|
});
|
|
}
|
|
|
|
// jsonfile.writeFileSync(`helpers/dictionary/${this.id}.json`, [...this.dictionary]);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param phrase
|
|
* @returns {string|any}
|
|
* @private
|
|
*/
|
|
_translate(phrase) {
|
|
if (!this.dictionary.get(phrase)) {
|
|
this.dictionary.set(phrase, '');
|
|
|
|
return '';
|
|
}
|
|
else
|
|
return this.dictionary.get(phrase);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @private
|
|
*/
|
|
async _start() {
|
|
logger.debug(`<=- START ${this.id}-=>`);
|
|
const now = new Date();
|
|
this.perf.started = now.getTime();
|
|
|
|
this.on('recover', async () => {
|
|
await this.recover();
|
|
});
|
|
|
|
await this._createLock();
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _done() {
|
|
logger.info('<=- DONE -=>');
|
|
|
|
// OK To close the browser window now
|
|
this.canDetach();
|
|
|
|
const now = new Date();
|
|
|
|
this.perf.finished = now.getTime();
|
|
this.perf.duration = this.perf.finished - this.perf.started;
|
|
|
|
this.perf.human = {};
|
|
this.perf.human.duration = TimeFormat.fromMs(this.perf.duration, 'hh:mm:ss');
|
|
|
|
jsonfile.writeFileSync(`${this.path}/perfdata.json`, this.perf);
|
|
|
|
if (this.page.tracing._recording)
|
|
await this.page.tracing.stop();
|
|
|
|
await this._archive();
|
|
|
|
await this._forcePageClose();
|
|
|
|
await this._killRunningBrowser();
|
|
|
|
await this._complete();
|
|
}
|
|
|
|
/**
|
|
* Stream a file to S3
|
|
* @param filename
|
|
* @returns {Promise<ManagedUpload.SendData>}
|
|
* @private
|
|
*/
|
|
async _uploadV2(filename) {
|
|
try {
|
|
logger.info('^^^ UPLOADING V2 :: STREAMING ^^^');
|
|
const filePath = path.parse(filename);
|
|
|
|
const body = fs.createReadStream(filename);
|
|
|
|
const s3Obj = {
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': filePath.base,
|
|
'Body': body,
|
|
'ACL': 'public-read'
|
|
};
|
|
|
|
return await s3.upload(s3Obj).promise()
|
|
.then((data) => {
|
|
logger.info('Successfully uploaded file.');
|
|
|
|
return data;
|
|
})
|
|
.catch((err) => {
|
|
logger.error(err);
|
|
|
|
return err;
|
|
});
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Upload a file to S3
|
|
* @param filename
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _upload(filename) {
|
|
try {
|
|
logger.info('^^^ UPLOADING ^^^');
|
|
const filePath = path.parse(filename);
|
|
|
|
await readFileAsync(filename).then(async (data) => {
|
|
const base64data = new Buffer.from(data, 'binary');
|
|
|
|
const s3Obj = {
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': filePath.base,
|
|
'Body': base64data,
|
|
'ACL': 'public-read'
|
|
};
|
|
|
|
return await s3.upload(s3Obj).promise()
|
|
.then((data) => {
|
|
logger.info('Successfully uploaded file.');
|
|
|
|
return data;
|
|
})
|
|
.catch((err) => {
|
|
logger.error(err);
|
|
|
|
return err;
|
|
});
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
});
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Upload an Error zip file to S3
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _uploadError() {
|
|
const now = new Date();
|
|
const timestamp = dateFormat(now, 'yyyymmdd-HHMMss');
|
|
|
|
const errorFilePath = `${ this.path}/${this.id}-error-${timestamp}`;
|
|
|
|
await this._makeScreenshotV2(this.page, errorFilePath, null);
|
|
|
|
const body = await this.page.content();
|
|
|
|
await this._dumpFile(`${errorFilePath}.html`, body);
|
|
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
jsonfile.writeFileSync(`${errorFilePath}.json`, pageUrl);
|
|
|
|
logger.info('!!! ARCHIVING ERROR !!!<');
|
|
|
|
await this._createDirectory('dist');
|
|
|
|
const filename = `dist/${this.id}-error-${timestamp}.zip`;
|
|
|
|
logger.debug('errorFilePath', `${errorFilePath}.*`);
|
|
await this._createArchive(`${errorFilePath}.*`, filename, true);
|
|
await this._upload(filename);
|
|
|
|
logger.info('^^! UPLOADING ERROR !^^');
|
|
const filePath = path.parse(filename);
|
|
|
|
await fs.readFile(filename, async (err, data) => {
|
|
if (err) throw err;
|
|
|
|
const base64data = new Buffer.from(data, 'binary');
|
|
|
|
// const s3 = new AWS.S3();
|
|
|
|
await s3.upload({
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': filePath.base,
|
|
'Body': base64data,
|
|
'ACL': 'public-read'
|
|
}, (err, data) => {
|
|
if (err) logger.error(err);
|
|
logger.info('Successfully uploaded error package.');
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _createLock() {
|
|
logger.info('Locking...');
|
|
|
|
const base64data = Buffer.alloc(0);
|
|
|
|
await s3.upload({
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': `${this.id}.lock`,
|
|
'Body': base64data,
|
|
'ACL': 'public-read'
|
|
}, (err, data) => {
|
|
if (err) logger.error('_createLock', err);
|
|
logger.info(`${this.id} LOCKED`);
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _removeLock() {
|
|
const params = {
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': `${this.id}.lock`
|
|
};
|
|
|
|
await s3.deleteObject(params).promise().then((data) => {
|
|
logger.debug(`${this.id} Unlocked`, data);
|
|
}).catch((err) => {
|
|
logger.error(err, err.stack);
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param filename
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _checkS3FileExists(filename = null ) {
|
|
var params = {
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': filename
|
|
};
|
|
|
|
return new Promise((resolve, reject) => {
|
|
if (filename === null)
|
|
return reject(Error('No filename for S3'));
|
|
|
|
s3.headObject(params).promise().then((i) => {
|
|
logger.debug(`${filename} exists`);
|
|
|
|
return resolve(true);
|
|
}).catch((e) => {
|
|
return resolve(false);
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param filename
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
async _getFileS3(filename = null) {
|
|
var params = {
|
|
'Bucket': process.env.S3_BUCKET,
|
|
'Key': filename
|
|
};
|
|
|
|
return new Promise((resolve, reject) => {
|
|
if (filename === null)
|
|
return reject(Error('No filename for S3'));
|
|
|
|
s3.getObject(params).promise().then((data) => {
|
|
return resolve(data.Body.toString());
|
|
}).catch((e) => {
|
|
return reject(e);
|
|
});
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<*>}
|
|
* @private
|
|
*/
|
|
|
|
async _checkLock() {
|
|
// return await this._checkS3FileExists(`${this.id}.lock`);
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* Broadcast a message using AWS SQS
|
|
* @param id
|
|
* @param msg
|
|
* @param msgBody
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _sendMessage(id, msg, msgBody = 'New upload') {
|
|
logger.debug('+ _sendMessage', process.env.SQS_ID);
|
|
if (typeof process.env.SQS_ID !== 'undefined' && process.env.SQS_ID !== null)
|
|
try {
|
|
const sqs = new AWS.SQS({ 'apiVersion': '2012-11-05' });
|
|
|
|
const params = {
|
|
'DelaySeconds': 10,
|
|
'MessageAttributes': {
|
|
'id': {
|
|
'DataType': 'String',
|
|
'StringValue': id
|
|
},
|
|
'filename': {
|
|
'DataType': 'String',
|
|
'StringValue': msg
|
|
}
|
|
},
|
|
'MessageBody': msgBody,
|
|
'QueueUrl': process.env.SQS_ID
|
|
};
|
|
|
|
logger.info('SQS:', JSON.stringify(params));
|
|
|
|
await sqs.sendMessage(params).promise().then((data) => {
|
|
logger.debug('SQS Success', data.MessageId);
|
|
|
|
return data;
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
|
|
return err;
|
|
});
|
|
}
|
|
catch (err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Broadcast a message using AWS SNS
|
|
* @param mesg
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _publish(mesg) {
|
|
var publishParams = {
|
|
'TopicArn' : process.env.SQS_ARN,
|
|
'Message': mesg
|
|
};
|
|
|
|
await sns.publish(publishParams).promise().then((data) => {
|
|
logger.debug('>>> PUBLISH >>>', data);
|
|
|
|
return data;
|
|
}).catch((err) => {
|
|
return err;
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _archive() {
|
|
logger.info('>-< ARCHIVING >-<');
|
|
const now = new Date();
|
|
|
|
await this._createDirectory('dist');
|
|
|
|
const timestamp = dateFormat(now, process.env.FILE_DATE_FOTMAT || 'yyyymmdd');
|
|
const filename = `dist/${this.id}-${timestamp}.zip`;
|
|
await this._createArchiveV2(`artefacts/${this.id}/`, filename);
|
|
// await this._upload(filename);
|
|
await this._uploadV2(filename);
|
|
await this._sendMessage(this.id, `${this.id}-${timestamp}.zip`);
|
|
// await this._publish(JSON.stringify({ 'id':this.id, 'filename':`${this.id}-${timestamp}.zip`, 'msgBody':'complete' }));
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _complete() {
|
|
await this._removeLock();
|
|
try {
|
|
if (global.gc) global.gc();
|
|
}
|
|
catch (e) {
|
|
logger.warn('`node --expose-gc`');
|
|
}
|
|
|
|
logger.info('<=- COMPLETE -=>');
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param skip
|
|
* @param options
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _doNonRepudiation(skip = false, options = {}) {
|
|
if (!skip)
|
|
|
|
try{
|
|
if (typeof this.startPage === 'undefined' || this.startPage === null)
|
|
throw new Error('No startpage defined');
|
|
|
|
const pageUrl = url.parse(this.startPage);
|
|
|
|
const lookup = `${pageUrl.protocol}//${pageUrl.hostname}`;
|
|
|
|
await this._populateNonRepudiation(lookup, options).catch((err) => {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
});
|
|
|
|
await jsonfile.writeFileSync(`${ this.path}/nonrepudiation.json`, this.nonrepudation);
|
|
}
|
|
catch(err) {
|
|
logger.error(err);
|
|
throw Error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param page
|
|
* @param filePath
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _saveLocalStorage(page, filePath) {
|
|
const json = await this.page.evaluate(() => {
|
|
const json = {};
|
|
for (let i = 0; i < localStorage.length; i++) {
|
|
const key = localStorage.key(i);
|
|
json[key] = localStorage.getItem(key);
|
|
}
|
|
|
|
return json;
|
|
});
|
|
await jsonfile.writeFileSync(filePath, json);
|
|
}
|
|
|
|
async _getLocalStorage( ) {
|
|
return await this.page.evaluate(() => {
|
|
const json = {};
|
|
for (let i = 0; i < localStorage.length; i++) {
|
|
const key = localStorage.key(i);
|
|
json[key] = localStorage.getItem(key);
|
|
}
|
|
|
|
return json;
|
|
});
|
|
}
|
|
|
|
_checkFileExistsSync(filePath) {
|
|
try {
|
|
fs.accessSync(filePath, fs.F_OK);
|
|
|
|
return true;
|
|
}
|
|
catch (err) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param page
|
|
* @param filePath
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _restoreLocalStorage(page, filePath) {
|
|
await checkFileExists(filePath)
|
|
.then(async exists => {
|
|
if (exists) {
|
|
const json = jsonfile.readFileSync(filePath);
|
|
await this.page.evaluate(json => {
|
|
localStorage.clear();
|
|
for (const key in json)
|
|
localStorage.setItem(key, json[key]);
|
|
}, json);
|
|
}
|
|
}
|
|
);
|
|
};
|
|
|
|
/**
|
|
*
|
|
* @param url
|
|
* @param options
|
|
* @param noRecover
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async _goto(url, options = {}, noRecover = false) {
|
|
this.lastUrl = url;
|
|
|
|
const newOptions = Object.assign({ 'timeout':90000, 'waitUntil':'networkidle0' }, options);
|
|
|
|
logger.debug(newOptions);
|
|
|
|
try {
|
|
logger.info('Goto:', url);
|
|
await this.page.goto(url, newOptions).catch((err) => {
|
|
logger.error('GOTO', err);
|
|
|
|
if (err.message.indexOf('net::ERR_FAILED') !== -1)
|
|
this.browserCrashed = true;
|
|
|
|
if (!noRecover)
|
|
this.emit('recover');
|
|
});
|
|
}
|
|
catch (error) {
|
|
logger.error(error);
|
|
logger.error(url, options);
|
|
// if (error === 'net::ERR_CONNECTION_TIMED_OUT')
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param fn
|
|
* @param time
|
|
* @returns {Function}
|
|
* @private
|
|
*/
|
|
_debounce(fn, time) {
|
|
let timeout;
|
|
|
|
return function (...args) { // <-- not an arrow function
|
|
const functionCall = () => fn.apply(this, args);
|
|
|
|
clearTimeout(timeout);
|
|
timeout = setTimeout(functionCall, time);
|
|
};
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param callback
|
|
* @param limit
|
|
* @returns {Function}
|
|
* @private
|
|
*/
|
|
_throttle (callback, limit) {
|
|
var wait = false;
|
|
|
|
return function () {
|
|
if (!wait) {
|
|
callback.apply(null, arguments);
|
|
wait = true;
|
|
setTimeout(function () {
|
|
wait = false;
|
|
}, limit);
|
|
}
|
|
};
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param func
|
|
* @returns {function(): *}
|
|
* @private
|
|
*/
|
|
|
|
_once(func) {
|
|
var alreadyCalled = false;
|
|
var result;
|
|
|
|
return function() {
|
|
if (!alreadyCalled) {
|
|
result = func.apply(this, arguments);
|
|
alreadyCalled = true;
|
|
}
|
|
|
|
return result;
|
|
};
|
|
};
|
|
|
|
async _paymentServicesDone() {
|
|
logger.warn('paymentServicesDone');
|
|
try{
|
|
this.paymentServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
|
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
await this._goto(this.emoneyServices.urls[0]);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
* @private
|
|
*/
|
|
async __recover(restartURL) {
|
|
logger.warn(`*** RECONNECTING ${this.id} PAGE ***`);
|
|
|
|
let crashCount = 0;
|
|
if (this.crashLog.has(this.lastUrl)) {
|
|
crashCount = this.crashLog.get(this.lastUrl);
|
|
crashCount++;
|
|
this.crashLog.set(this.lastUrl, crashCount);
|
|
|
|
if (crashCount >= 3)
|
|
logger.error('The page has crashed more than 3 times', this.lastUrl);
|
|
|
|
if (crashCount >= 10) {
|
|
logger.error('10 times on the same page is enough', this.lastUrl);
|
|
|
|
return;
|
|
}
|
|
}
|
|
else
|
|
this.crashLog.set(this.lastUrl, 1);
|
|
|
|
if (crashCount < 10) {
|
|
if (this.browserCrashed) await this._initBrowser(true);
|
|
|
|
await this._createBrowserPage();
|
|
|
|
logger.debug('Reattach processNewPage', (typeof this.processNewPage === 'function') ? 'Yes' : 'No');
|
|
if (typeof this.processNewPage === 'function')
|
|
this.page.on('domcontentloaded', () => {
|
|
this.processNewPage();
|
|
});
|
|
|
|
const onHold = (crashCount >= 3) ? (90000 * crashCount) : 0;
|
|
const antiCollision = 125 + (Math.floor(Math.random() * (15 - 1)) * 500);
|
|
const timeout = 90000 + antiCollision + onHold;
|
|
|
|
logger.info(`🚨 Restarting in ${(timeout / 1000).toFixed(2)} seconds.`);
|
|
|
|
setTimeout(async() => {
|
|
logger.warn(`Attempting recovery to ${restartURL}`);
|
|
|
|
await this.restart(restartURL);
|
|
}, timeout);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param restartURL
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async restart(restartURL) {
|
|
const rURL = restartURL || this.lastUrl;
|
|
logger.info(`Restarting ${this.id} // Going to ${rURL}`);
|
|
|
|
await this._goto(rURL);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param filename
|
|
* @param data
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async saveFile(filename, data) {
|
|
try{
|
|
fs.writeFileSync(filename, data);
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param s
|
|
* @returns {string}
|
|
*/
|
|
soundex(s) {
|
|
const a = s.toLowerCase().split(''),
|
|
|
|
codes = {
|
|
'a': '', 'e': '', 'i': '', 'o': '', 'u': '',
|
|
'b': 1, 'f': 1, 'p': 1, 'v': 1,
|
|
'c': 2, 'g': 2, 'j': 2, 'k': 2, 'q': 2, 's': 2, 'x': 2, 'z': 2,
|
|
'd': 3, 't': 3,
|
|
'l': 4,
|
|
'm': 5, 'n': 5,
|
|
'r': 6
|
|
};
|
|
|
|
const f = a.shift();
|
|
let r = '';
|
|
|
|
r = f +
|
|
a
|
|
.map((v, i, a) => {
|
|
return codes[v];
|
|
})
|
|
.filter((v, i, a) => {
|
|
return ((i === 0) ? v !== codes[f] : v !== a[i - 1]);
|
|
})
|
|
.join('');
|
|
|
|
return (`${r }000`).slice(0, 4).toUpperCase();
|
|
};
|
|
|
|
}
|
|
|
|
module.exports = Scraper;
|