cc-tracking/tasks/DIN-136 Batch2/scraper.js.md
2019-05-21 16:40:16 +01:00

40 KiB

const fs = require('fs-extra');
const path = require('path');
const url = require('url');
const dns = require('dns');
const AWS = require('aws-sdk');
const puppeteer = require('puppeteer');
const archiver = require('archiver-promise');
const dateFormat = require('dateformat');
const whois = require('whois');
const whoisJSON = require('whois-json');
const sslCertificate = require('get-ssl-certificate');
const tldExtract = require('tld-extract');
const log4js = require('log4js');
const EventEmitter = require('events');
const dig = require('./dig');
const jsonfile = require('jsonfile');
const TimeFormat = require('hh-mm-ss');
const removeAccents = require('remove-accents-diacritics');
const del = require('del');
const camelCase = require('camelcase');
const crc = require('crc');
const { promisify } = require('util');

// `let`, not `const`: Scraper#setID swaps this for a logger tagged with the scraper id.
let logger = log4js.getLogger('Scraper');

// Promise-returning wrappers around callback-style APIs.
const whoisAsync = promisify(whois.lookup);
const readFileAsync = promisify(fs.readFile);

/**
 * Resolve to true when the path is accessible, false otherwise (never rejects).
 * @param {string} s path to probe
 * @returns {Promise<boolean>}
 */
const checkFileExists = s => new Promise(r => fs.access(s, fs.constants.F_OK, e => r(!e)));

// Load environment variables from the .env file one level above this module.
require('dotenv').config({ 'path': `${__dirname}/../.env` });

logger.level = process.env.LOGGER_LEVEL || 'warn';

// This keeps the process persistent & stops the constant restart loop in PM2.
// `var` is required here: the IIFE reads `done` before the assignment completes,
// which relies on hoisting (a `let`/`const` would throw in the temporal dead zone).
// The IIFE returns undefined, so `done` stays falsy and the timer re-arms forever.
var done = (function wait () {
  if (!done) setTimeout(wait, 1000);
})();

// WHOIS servers to use for TLDs that need an explicit server.
const dnsServers = {
  'fr' : 'whois.afnic.fr:43',
  'cy' : 'whois.cynic.dns.cy:43',
  'mt' : 'whois.nic.org.mt:43',
  'com.mt' : 'whois.nic.org.mt:43'
};

// TLDs that are looked up with the local `dig` helper instead of WHOIS.
const useDig = ['cy'];

// Default the region
AWS.config.update({ 'region': 'eu-west-1' });

// Outside production, credentials and region are taken explicitly from the environment.
if (process.env.NODE_ENV !== 'production')
  AWS.config.update({
    'accessKeyId': process.env.AWS_ACCESS_KEY_ID,
    'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY,
    'region': process.env.AWS_REGION || 'eu-west-1'
  });

const s3 = new AWS.S3();
const sns = new AWS.SNS();

/**
 * Catch all unhandled promises
 */
process.on('unhandledRejection', (reason, p) => {
  logger.error(' Unhandled Rejection at: Promise', p, 'reason:', reason);
});

/**
 * Base class for Puppeteer-driven scrapers.
 *
 * Owns the browser / page lifecycle, crash recovery (via the 'recover' event),
 * non-repudiation data collection (WHOIS, IP WHOIS, SSL certificate), archive
 * creation and the S3 / SQS / SNS plumbing. It reads `this.mode`,
 * `this.startPage`, `this.processNewPage`, `this.paymentServices` and
 * `this.emoneyServices` but never assigns them, so they must be supplied from
 * outside this class.
 */
class Scraper extends EventEmitter {

  constructor() {
    super(); // must call super for "this" to be defined.

    process.on('uncaughtException', err => {
      logger.error('Uncaught', err);
    });

    this.uriBase = process.env.SCRAPE_BASE_URI || 'https://register.fca.org.uk/ShPo_HomePage';
    this.nonrepudation = {};
    this.pathList = [];

    // Any request whose URL contains one of these fragments is aborted.
    this.filters = [
      'livefyre',
      'moatad',
      'analytics',
      'controltag',
      'chartbeat',
      'siteimprove',
      'hotjar',
      '/plugins/cookie-notice/',
      'addthis',
      'facebook.',
      'linkedin',
      'googletagmanager'
    ];

    this.perf = {
      'started': 0,
      'finished': 0,
      'time': 0,
      'scraped': 0
    };

    this.lastUrl = '';
    // While false, a closed page / disconnected browser is treated as a crash.
    this.detatchable = false;
    this.browserCrashed = false;
    // url -> number of recoveries attempted while on that url
    this.crashLog = new Map([]);

    this.page = null;

    this.modePrefix = ['ps_', 'em_', 'ci_'];

    this.modeNames = ['paymentServices', 'emoneyServices', 'creditServices'];
    this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];

    this.dictionary = new Map();

    // Several 'recover' events inside 30s collapse into a single recovery.
    this.recover = this._debounce(async () => {
      await this.__recover();
    }, 30000);
  }

  /**
   * Set the scraper id and switch the module logger to one tagged with it.
   * @param newID
   */
  setID(newID) {
    logger = log4js.getLogger(`Scraper (${newID})`);
    logger.level = process.env.LOGGER_LEVEL || 'warn';

    this.id = newID;
  }

  /**
   * Add items to the URL filter
   * @param items
   */
  addToBlockFilters(items = []) {
    // Consult uBlock Origin to see what should be blocked on the page
    this.filters = this.filters.concat(items);
  }

  /**
   * Delete the given path, but only when running in production.
   * @param path
   * @returns {Promise}
   */
  async emptyPath(path) {
    if (process.env.NODE_ENV === 'production')
      await del([path]).then(paths => {
        logger.warn('Deleted files and folders:\n', paths.join('\n'));
      });
  }

  /**
   * Empty `newPath` (production only), then create the dated output directory
   * and the per-scraper debug directory.
   * @param newPath
   * @returns {Promise}
   */
  async setPath(newPath) {
    const now = new Date();
    const timestamp = dateFormat(now, 'yyyymmdd');
    await this.emptyPath(newPath);

    this.path = `${newPath}/${timestamp}`;
    this.debugPath = `${__dirname}/../debug/${this.id}`;
    await this._createDirectory(this.path);
    await this._createDirectory(this.debugPath);
  }

  /**
   * 'Human' like click delay
   * @returns {number} integer in the range 90..118 (milliseconds)
   */
  static notARobot() {
    return 90 + Math.floor(Math.random() * (30 - 1));
  }

  /**
   * Mark the browser / page as safe to close without triggering recovery.
   */
  canDetach() {
    this.detatchable = true;
  }

  /**
   * Close the current page and browser, if any. Never throws; `this.browser`
   * is always reset to null afterwards.
   * @returns {Promise}
   * @private
   */
  async _killRunningBrowser() {
    if (this.browser)
      try {
        logger.info('Trying to close hanging / running browser');

        await this._forcePageClose();

        // Detach first so our own close() does not look like a crash.
        await this.browser.removeAllListeners('disconnected');

        await this.browser.close();
      }
      catch (err) {
        logger.error('Closing browser', err);
      }
      finally {
        this.browser = null;
      }
  }

  /**
   * Launch a fresh browser, closing any existing one first.
   * @param headless
   * @returns {Promise}
   * @throws the launch error, after logging it
   * @private
   */
  async _initBrowser(headless = true) {
    // Force headless when running in production
    const realHeadless = (process.env.NODE_ENV === 'production') ? true : headless;

    await this._killRunningBrowser();

    this.browserCrashed = false;

    logger.info('Puppeteer.launch', realHeadless);

    logger.debug('Using proxy:', process.env.PROXY_URI);

    const launchArgs = [
      '--disable-dev-shm-usage',
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      '--window-size=1920x1080',
      '--hide-scrollbars',
      '--disable-default-apps'
    ];

    // Use proxy so FCA wont block us. Only when configured: an unset variable
    // would otherwise be passed to Chrome as the literal "--proxy-server=undefined".
    if (process.env.PROXY_URI)
      launchArgs.unshift(`--proxy-server=${process.env.PROXY_URI}`);

    try {
      this.browser = await puppeteer.launch({
        'headless': realHeadless,
        'args': launchArgs
      });
    }
    catch (err) {
      logger.error('Puppeteer failed to launch');
      logger.error(err);

      // Surface the real cause instead of failing later on an undefined browser.
      throw err;
    }

    const browserVersion = await this.browser.version();

    logger.info(`Browser version ${browserVersion}`);

    this.browser.on('disconnected', () => {
      logger.warn('Browser has become detached!');

      if (this.detatchable === false) {
        this.browserCrashed = true;

        logger.warn('browser.onDisconnected::emit recover');
        this.emit('recover');
      }
    });
  }

  /**
   * Close the current page, if any. Never throws; `this.page` is always reset
   * to null afterwards.
   * @returns {Promise}
   * @private
   */
  async _forcePageClose() {
    if (this.page)
      try {
        logger.warn('Browser Page exists: DESTROYING');

        // Detach first so our own close() does not emit 'recover'.
        await this.page.removeAllListeners('close');

        await this.page.close().catch((e) => {
          logger.debug(e);
        });
      }
      catch (err) {
        logger.error(err);
      }
      finally {
        this.page = null;
      }
  }

  /**
   * Replace the current page with a new one and wire up request filtering,
   * dialog dismissal and the crash / close handlers.
   * @returns {Promise}
   * @private
   */
  async _createBrowserPage() {
    // Must be awaited: otherwise its `finally` can null out the page created below.
    await this._forcePageClose();
    this.page = await this.browser.newPage();

    try {
      await this.page.setDefaultNavigationTimeout(90000);

      await this.page.setDefaultTimeout(90000);
    }
    catch (err) {
      logger.debug(err);
    }

    await this.page.setRequestInterception(true);

    this.page.on('request', (request) => {
      const requestUrl = request.url();
      logger.trace('request', requestUrl);
      const shouldAbort = this.filters.some((urlPart) => requestUrl.includes(urlPart));
      if (shouldAbort) request.abort();
      else request.continue();
    });

    this.page.on('dialog', async dialog => {
      logger.warn('Dialog Box', dialog.message());
      await dialog.dismiss();
    });

    this.page.on('error', async err => {
      logger.warn('Page crashed', err);
      if (!this.detatchable) {
        await this._uploadError();
        logger.warn('page.onError::emit recover');
        this.emit('recover');
      }
    });

    this.page.on('pageerror', async err => {
      logger.trace('pageerror', err);
    });

    this.page.on('requestfailed', async request => {
      // Public accessor, same as the 'request' handler above (was the private `_url` field).
      const failedUrl = request.url();
      const blocked = this.filters.some((urlPart) => failedUrl.includes(urlPart));

      // Requests we aborted ourselves are expected noise.
      if (blocked)
        logger.trace('🚫', failedUrl);
      else
        logger.warn('requestfailed', failedUrl);
    });

    this.page.on('close', () => {
      logger.warn('Browser Page has closed');

      if (this.detatchable === false) {
        logger.warn('page.onClose::emit recover');
        this.emit('recover');
      }
    });
  }

  /**
   * Apply the 'Responsive' viewport to the current page.
   * NOTE(review): `userAgent` is declared here but never applied to the page.
   * @returns {Promise}
   * @private
   */
  async _makeResponsive() {
    const viewPort = {
      'name': 'Responsive',
      'userAgent' : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3494.0 Safari/537.36',
      'viewport': {
        'width': 1200,
        'height': 1200,
        'deviceScaleFactor': 4.5,
        'isMobile': true,
        'hasTouch': true,
        'isLandscape': true
      }
    };
    await this.page.setViewport(viewPort.viewport);

    await this.page.setDefaultNavigationTimeout(90000);
  }

  /**
   * Build a file name from an entity id: mode prefix + camelCased, accent-free
   * id. Names longer than 175 characters are truncated and suffixed with the
   * CRC32 of the original id.
   * @param id
   * @returns {string}
   * @private
   */
  _makeFileName(id) {
    const nonWordChars = /\W/g;
    const maxChars = 175;
    const entity = removeAccents.remove(id.replace(nonWordChars, ' ').trim());
    const _crc = crc.crc32(id).toString(16);

    const output = [this.modePrefix[this.mode], camelCase(entity)].join('');

    return (output.length > maxChars) ? output.substring(0, maxChars).concat('_', _crc) : output;
  }

  /**
   * Full output path (without extension) for an entity id, capped at 240 characters.
   * @param id
   * @returns {Promise<string>}
   * @private
   */
  async _makeFilePath(id) {
    return `${this.path}/${this._makeFileName(id)}`.substring(0, 240);
  }

  /**
   * Full-page screenshot into artefacts/screenshots/.
   * @param page
   * @param destPath file name without the .png extension
   * @param waitFor optional argument passed to page.waitFor first
   * @returns {Promise}
   * @private
   */
  async _makeScreenshot(page, destPath, waitFor = null) {
    if (waitFor) await page.waitFor(waitFor);
    await page.setViewport({ 'width': 1200, 'height': 800 });
    await page.screenshot({ 'path': `artefacts/screenshots/${destPath}.png`, 'fullPage': true }).catch((err) => {
      logger.error('Screenshot', err);
    });
  }

  /**
   * Full-page screenshot to `${destPath}.png`. Never throws; skipped when the
   * scraper has no current page.
   * @param page
   * @param destPath path without the .png extension
   * @param waitFor optional argument passed to page.waitFor first
   * @returns {Promise}
   * @private
   */
  async _makeScreenshotV2(page, destPath, waitFor = null) {
    try {
      if (waitFor) await page.waitFor(waitFor);

      if (!this.page) {
        logger.warn('_makeScreenshotV2: No Page -- Not taking screenshot');

        return;
      }

      logger.debug('Snapshot', `${destPath}.png`);
      await page.setViewport({ 'width': 1200, 'height': 800 });
      await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => {
        logger.error('Screenshot', err);
      });
    }
    catch (err) {
      logger.error('_makeScreenshotV2', err);
    }
  }

  /**
   * Wait a random whole number of seconds between minTime and maxTime (inclusive).
   * @param page
   * @param minTime
   * @param maxTime
   * @param msg optional text for the debug log line
   * @returns {Promise}
   * @private
   */
  async _randomWait(page, minTime = 2, maxTime = 10, msg = '') {
    const insertedMsg = (msg.length > 0) ? `${this.id} ${msg} - ` : `${this.id} `;
    const waitTime = Math.floor(Math.random() * (maxTime - minTime + 1) + minTime);
    logger.debug(`${insertedMsg}Waiting ${waitTime} seconds...`);
    await page.waitFor(waitTime * 1000);
  }

  /**
   * Wait `waitTime` tenths of a second.
   * @param page
   * @param waitTime
   * @param msg optional; the wait is only logged when a message is given
   * @returns {Promise}
   * @private
   */
  async _microWait(page, waitTime, msg = '') {
    const insertedMsg = (msg.length > 0) ? `${msg} - ` : '';
    if (msg !== '') logger.debug(`${insertedMsg}Waiting ${waitTime * 100} ms...`);
    await page.waitFor(waitTime * 100);
  }

  /**
   * Wait `waitTime` hundredths of a second.
   * @param page
   * @param waitTime
   * @param msg optional; the wait is only logged when a message is given
   * @returns {Promise}
   * @private
   */
  async _nanoWait(page, waitTime, msg = '') {
    const insertedMsg = (msg.length > 0) ? `${msg} - ` : '';
    if (msg !== '') logger.debug(`${insertedMsg}Waiting ${waitTime * 10} ms...`);
    await page.waitFor(waitTime * 10);
  }

  /**
   * Write data to a file under the artefacts directory.
   * @param destPath path relative to ../artefacts
   * @param data
   * @returns {Promise<string>} confirmation message
   * @private
   */
  async _saveToFile(destPath, data) {
    // use for artefacts saving only
    return new Promise((resolve, reject) => {
      const fullPath = `${__dirname}/../artefacts/${destPath}`;
      fs.writeFile(fullPath, data, function(err) {
        if (err) reject(err);
        else resolve(`File saved to '${fullPath}'`);
      });
    });
  }

  /**
   * Write data to the given path as-is.
   * @param destPath
   * @param data
   * @returns {Promise<string>} confirmation message
   * @private
   */
  async _dumpFile(destPath, data) {
    return new Promise((resolve, reject) => {
      fs.writeFile(destPath, data, function(err) {
        if (err) reject(err);
        else resolve(`File saved to '${destPath}'`);
      });
    });
  }

  /**
   * Ensure `${destPath}/<yyyymmddHHMM>` exists.
   * @param destPath
   * @returns {Promise<string>} the timestamped path
   * @private
   */
  async _createTimestampDirectory(destPath = null) {
    const now = new Date();
    const timestamp = dateFormat(now, 'yyyymmddHHMM');
    const fullPath = `${destPath}/${timestamp}`;

    logger.info('fullPath', fullPath);

    if (!fs.existsSync(fullPath))
      fs.ensureDirSync(fullPath);

    return fullPath;
  }

  /**
   * Ensure a directory exists. Failures are logged, not thrown.
   * @param destPath
   * @returns {Promise<*>} destPath
   * @private
   */
  async _createDirectory(destPath = null) {
    try {
      if (!fs.existsSync(destPath))
        fs.ensureDirSync(destPath);
    }
    catch (err) {
      logger.error('_createDirectory', err);
    }

    return destPath;
  }

  /**
   * Zip a directory (or a glob pattern) into `filename`.
   * @param destPath directory, or glob pattern when `glob` is true
   * @param filename
   * @param glob
   * @returns {Promise<*>}
   * @throws when a path is missing or the archive cannot be finalized
   * @private
   */
  async _createArchive(destPath = null, filename = null, glob = false) {
    if (!destPath || !filename) {
      const e = new Error('Missing paths');
      logger.error(e);

      // Stop here: previously execution fell through and built an archive from nulls.
      throw e;
    }

    const archive = archiver(filename, {
      'zlib': { 'level': 9 } // Sets the compression level.
    });

    if (glob)
      archive.glob(`${destPath}`);
    else
      archive.directory(`${destPath}/`);

    // A finalize failure now rejects instead of leaving the caller waiting forever.
    await archive.finalize();
    logger.debug('Archive finished');
  }

  /**
   * Streaming variant of _createArchive: pipes the archive into a write stream.
   * @param destPath directory, or glob pattern when `glob` is true
   * @param filename
   * @param glob
   * @returns {Promise<*>}
   * @throws when a path is missing or the archive cannot be finalized
   * @private
   */
  async _createArchiveV2(destPath = null, filename = null, glob = false) {
    logger.debug('=== _createArchiveV2 :: STREAMING ===');

    if (!destPath || !filename) {
      const e = new Error('Missing paths');
      logger.error(e);

      throw e;
    }

    const output = fs.createWriteStream(filename);

    const archive = archiver('zip', {
      'TransformOptions': {
        'objectMode':true
      },
      'zlib': { 'level': 6 } // Sets the compression level.
    });

    archive.pipe(output);

    if (glob)
      archive.glob(`${destPath}`);
    else
      archive.directory(`${destPath}/`);

    await archive.finalize();
    logger.debug('Archive finished');
  }

  /**
   * Split the host of a URL into its parts using tld-extract.
   * @param urlStr
   * @returns {*} null for empty input; the caught error object (returned, not
   *   thrown) when parsing fails; otherwise the tld-extract result.
   */
  explodeURL (urlStr = null) {
    if (!urlStr || urlStr === '') return (null);
    try {
      const workURL = url.parse(urlStr);

      return tldExtract.parse_host(workURL.host);
    }
    catch (e) {
      return e;
    }
  }

  /**
   * Raw WHOIS text for the domain of a URL (`dig` output for TLDs in useDig).
   * WHOIS lookup failures are logged and yield undefined.
   * @param destPath
   * @param withPrefix include the sub-domain in the lookup
   * @returns {Promise}
   * @private
   */
  async _getWhoIsRaw(destPath = null, withPrefix = false) {
    const options = {};
    logger.debug('_getWhoIsRaw', destPath);
    if (!destPath)
      throw new Error('No destination path');

    const explodedURL = this.explodeURL(destPath);

    if (Object.prototype.hasOwnProperty.call(dnsServers, explodedURL.tld))
      options.server = dnsServers[explodedURL.tld];

    const lookup = (withPrefix) ? `${explodedURL.sub}.${explodedURL.domain}` : `${explodedURL.domain}`;

    logger.debug('_getWhoIsRaw', lookup);

    if (useDig.includes(explodedURL.tld))
      return await dig(`${explodedURL.sub}.${explodedURL.domain}`);

    return await whoisAsync(lookup, options).catch((err) => {
      logger.error('_getWhoIsRaw', err);
    });
  }

  /**
   * Parsed WHOIS data for the domain of a URL ({} for TLDs in useDig).
   * Lookup failures are logged and yield undefined.
   * @param destPath
   * @param withPrefix include the sub-domain in the lookup
   * @returns {Promise<*>}
   * @private
   */
  async _getWhoIsJSON(destPath = null, withPrefix = false) {
    const options = { };
    if (!destPath)
      throw new Error('No destination path');

    const explodedURL = this.explodeURL(destPath);

    if (Object.prototype.hasOwnProperty.call(dnsServers, explodedURL.tld))
      options.server = dnsServers[explodedURL.tld];

    const lookup = (withPrefix) ? `${explodedURL.sub}.${explodedURL.domain}` : `${explodedURL.domain}`;

    logger.debug('_getWhoIsJSON', options);
    if (useDig.includes(explodedURL.tld))
      return {};

    return await whoisJSON(lookup, options).catch((err) => {
      logger.error('_getWhoIsJSON', err);
    });
  }

  /**
   * Parsed WHOIS data for the IP address a URL's host resolves to.
   * @param destPath
   * @returns {Promise<*>}
   * @private
   */
  async _getWhoIsIPJSON(destPath = null) {
    return new Promise((resolve, reject) => {
      if (!destPath)
        return reject(new Error('No destination path'));

      const workURL = url.parse(destPath);

      dns.lookup(workURL.host, (err, address) => {
        // Without an address there is nothing to look up.
        if (err) {
          logger.error(err);

          return reject(err);
        }

        whoisJSON(address).then(resolve).catch((e) => {
          logger.error(e);
          reject(e);
        });
      });
    });
  }

  /**
   * Raw WHOIS text for the IP address a URL's host resolves to.
   * @param destPath
   * @returns {Promise<*>}
   * @private
   */
  async _getWhoIsIPRaw(destPath = null) {
    return new Promise((resolve, reject) => {
      if (!destPath)
        return reject(new Error('No destination path'));

      const workURL = url.parse(destPath);

      dns.lookup(workURL.host, (err, address) => {
        if (err)
          return reject(err);

        whois.lookup(address, (lookupErr, data) => {
          if (lookupErr)
            return reject(lookupErr);

          resolve(data);
        });
      });
    });
  }

  /**
   * Fetch the SSL certificate for the domain of a URL (5 second timeout).
   * @param destPath
   * @param prefix include the sub-domain in the host name
   * @returns {Promise}
   * @throws when no destination path is given (consistent with the WHOIS helpers)
   * @private
   */
  async _getSSLCert(destPath = null, prefix = false) {
    if (!destPath)
      throw new Error('No destination path');

    const explodedURL = this.explodeURL(destPath);
    const searchFor = (prefix) ? `${explodedURL.sub}.${explodedURL.domain}` : `${explodedURL.domain}`;

    logger.debug('Cert for:', searchFor);

    return sslCertificate.get(searchFor, 5000);
  }

  /**
   * Collect WHOIS (JSON + raw), IP WHOIS (raw + JSON) and, unless skipped, the
   * SSL certificate into `this.nonrepudation`. Any failing step is logged and
   * rethrown with its original stack.
   * @param destPath
   * @param options { whoisWithPrefix, sslWithPrefix, skipSsl }
   * @returns {Promise}
   * @private
   */
  async _populateNonRepudiation(destPath = null, options = {}) {
    this.nonrepudation.whois = {};
    this.nonrepudation.ipwhois = {};

    const whoisWithPrefix = options.whoisWithPrefix || false;
    const sslWithPrefix = options.sslWithPrefix || false;
    const skipSsl = options.skipSsl || false;

    // Log and rethrow the original error for a single lookup.
    const logged = async (pending) => {
      try {
        return await pending;
      }
      catch (err) {
        logger.error(err);
        throw err;
      }
    };

    logger.debug('Non Repudiation Data for', destPath);

    this.nonrepudation.whois.json = await logged(this._getWhoIsJSON(destPath, whoisWithPrefix));
    this.nonrepudation.whois.raw = await logged(this._getWhoIsRaw(destPath, whoisWithPrefix));
    this.nonrepudation.ipwhois.raw = await logged(this._getWhoIsIPRaw(destPath));
    this.nonrepudation.ipwhois.json = await logged(this._getWhoIsIPJSON(destPath));

    if (skipSsl)
      return;

    this.nonrepudation.sslcertificate = await logged(this._getSSLCert(destPath, sslWithPrefix));

    if (this.nonrepudation.sslcertificate === null || typeof(this.nonrepudation.sslcertificate) === 'undefined') {
      logger.warn('Trying to retrieve SSL certificate with domain prefix.');
      this.nonrepudation.sslcertificate = await logged(this._getSSLCert(destPath, true));
    }
  }

  /**
   * Left-pad a value to a fixed width.
   * NOTE(review): despite the name this pads with SPACES, not zeros; left
   * unchanged because callers may depend on the current output.
   * @param str
   * @param length
   * @returns {string} the last `length` characters of the padded value
   * @private
   */
  _zeroPad(str, length) {
    const spaces = ' '.repeat(length);

    return `${spaces}${str}`.slice((length * -1));
  }

  /**
   * Click an element. With neither `text` nor `url`, waits for the selector to
   * become visible and clicks it. Otherwise clicks the first match whose
   * innerText equals `text` or whose href equals `url`.
   * On any error an error package is uploaded and 'stall' is emitted.
   * @param selector
   * @param text
   * @param url
   * @returns {Promise} true when a text/url match was clicked, false when none
   *   matched, undefined for the plain click and on error
   * @private
   */
  async _findAndClick(selector, text = null, url = null) {
    try {
      logger.debug('_findAndClick selector', selector);
      const mouseDownDuration = Scraper.notARobot();

      if (!text && !url) {
        logger.debug('Just clicking element');
        await this.page.waitForSelector(selector, { 'visible': true, 'timeout':90000 }).then(async (elm) => {
          await elm.click({ 'delay':mouseDownDuration });
        });
      }
      else {
        const clickableLinks = await this.page.$$(selector);
        let innerText;
        let href;

        await this.page.hover(selector);
        await this.page.waitForSelector(selector);

        if (clickableLinks.length > 0)
          for (const item of clickableLinks) {
            innerText = await this.page.evaluate(el => el.innerText, item);
            href = await this.page.evaluate(el => el.href, item);

            if ((text && innerText === text) || (url && href === url)) {
              logger.debug('Matched item');
              await item.click({ 'delay':mouseDownDuration });
              await this._randomWait(this.page, 5, 10, 'After click');

              // Returning also breaks out of the for loop.
              return true;
            }
          }

        return false;
      }
    }
    catch (err) {
      logger.error('_findAndClick', err);
      // Fire-and-forget so 'stall' is not delayed; failures are logged rather
      // than surfacing as an unhandled rejection.
      this._uploadError().catch((uploadErr) => {
        logger.error('_findAndClick', uploadErr);
      });
      this.emit('stall');
    }
  }

  /**
   * Get Params from a url string
   * @param url
   * @returns {Object} key/value pairs of the query string ({} when there is none).
   *   Each value is the text between the first and second '=' of its pair.
   */
  _getParamsFromUrl(url) {
    // decodeURI always yields a string, so no further type check is needed.
    const [, query] = decodeURI(url).split('?');
    const obj = {};

    if (typeof query === 'string')
      query.split('&').forEach(param => {
        const keyValuePair = param.split('=');
        obj[keyValuePair[0]] = keyValuePair[1];
      });

    return obj;
  }

  /**
   * Normalise whitespace: the first newline is removed outright, every
   * remaining whitespace run collapses to one space, and the result is trimmed.
   * NOTE(review): only the FIRST newline is removed without leaving a space
   * (no `g` flag); left unchanged because generated field names depend on it.
   * @param text
   * @returns {string}
   * @private
   */
  _cleanUp(text) {
    if (!text) return '';
    const regexNewLine = /\n/;
    const regexCollapseWS = /\s+/g;

    return text.replace(regexNewLine, '').replace(regexCollapseWS, ' ').trim();
  }

  /**
   * Turn free text into a camelCase field name: cleaned, accents removed,
   * punctuation stripped.
   * @param text
   * @returns {string}
   * @private
   */
  _makeFieldName(text) {
    const removePunctuation = /([^A-Za-z0-9\s])+/g;

    if (!text) return '';
    let workString = this._cleanUp(text);
    workString = removeAccents.remove(workString);
    workString = workString.replace(removePunctuation, '');

    workString = camelCase(workString);

    return workString;
  }

  /**
   * Rename a file if it exists. Failures are logged, not thrown.
   * @param origFN
   * @param newFN
   * @returns {Promise}
   * @private
   */
  async _renameFile(origFN, newFN) {
    await checkFileExists(origFN)
      .then(exists => {
        console.log(`file exists: ${exists}`);

        if (exists)
          fs.renameSync(origFN, newFN);
      }).catch((e) => {
        logger.error(e);
      });
  }

  /**
   * Load the dictionary from the local helpers/dictionary folder, if present.
   * @returns {Promise}
   * @private
   */
  async _loadDictionaryOld() {
    await checkFileExists(`helpers/dictionary/${this.id}.json`)
      .then(exists => {
        console.log(`file exists: ${exists}`);

        if (exists) {
          const dictionary = jsonfile.readFileSync(`helpers/dictionary/${this.id}.json`);
          this.dictionary = new Map(dictionary);
        }
      });
  }

  /**
   * Load the dictionary from `lang.<id>.json` in the S3 bucket, if it exists.
   * @returns {Promise}
   * @private
   */
  async _loadDictionary() {
    const langFileName = `lang.${this.id.toLowerCase()}.json`;

    const exists = await this._checkS3FileExists(langFileName);

    if (!exists)
      return;

    const data = await this._getFileS3(langFileName);

    this.dictionary = new Map(JSON.parse(data));
    logger.info(`${this.id} dictionary loaded with ${this.dictionary.size} entries.`);
  }

  /**
   * Upload the dictionary to `lang.<id>.json` in the S3 bucket. Does nothing
   * when the dictionary is empty.
   * @returns {Promise<Promise|undefined>} the S3 upload result, if any
   * @private
   */
  async _saveDictionary() {
    if (this.dictionary.size === 0)
      return undefined;

    logger.debug('Save dictionary', this.dictionary.size);

    const langFileName = `lang.${this.id.toLowerCase()}.json`;
    // A Map does not serialise directly; store it as an array of [key, value] pairs.
    const arrayedMap = JSON.stringify([...this.dictionary]);
    const body = Buffer.from(arrayedMap);

    const s3Obj = {
      'Bucket': process.env.S3_BUCKET,
      'Key': langFileName,
      'Body': body,
      'ACL': 'public-read'
    };

    try {
      return await s3.upload(s3Obj).promise();
    }
    catch (err) {
      logger.error(err);

      throw err;
    }
  }

  /**
   * Look a phrase up in the dictionary. Phrases that are missing (or have an
   * empty translation) are stored with an empty translation and yield ''.
   * @param phrase
   * @returns {string|any}
   * @private
   */
  _translate(phrase) {
    const translated = this.dictionary.get(phrase);

    if (!translated) {
      this.dictionary.set(phrase, '');

      return '';
    }

    return translated;
  }

  /**
   * Record the start time, subscribe to 'recover' and create the S3 lock.
   * @returns {Promise}
   * @private
   */
  async _start() {
    logger.debug(`<=- START ${this.id}-=>`);
    const now = new Date();
    this.perf.started = now.getTime();

    this.on('recover', async () => {
      await this.recover();
    });

    await this._createLock();
  }

  /**
   * Finish a run: write perf data, archive and upload, close the browser and
   * release the lock.
   * @returns {Promise}
   * @private
   */
  async _done() {
    logger.info('<=- DONE -=>');
    // OK To close the browser window now
    this.canDetach();

    const now = new Date();

    this.perf.finished = now.getTime();
    this.perf.duration = this.perf.finished - this.perf.started;

    this.perf.human = {};
    this.perf.human.duration = TimeFormat.fromMs(this.perf.duration, 'hh:mm:ss');

    jsonfile.writeFileSync(`${this.path}/perfdata.json`, this.perf);

    // The page may already be gone (e.g. after a crash); do not let that stop the archive.
    if (this.page && this.page.tracing && this.page.tracing._recording)
      await this.page.tracing.stop();

    await this._archive();

    await this._forcePageClose();

    await this._killRunningBrowser();

    await this._complete();
  }

  /**
   * Stream a file to S3
   * @param filename
   * @returns {Promise<ManagedUpload.SendData>} the upload result; the error
   *   object (returned, not thrown) when the upload fails
   * @private
   */
  async _uploadV2(filename) {
    try {
      logger.info('^^^ UPLOADING V2 :: STREAMING ^^^');
      const filePath = path.parse(filename);

      const body = fs.createReadStream(filename);

      const s3Obj = {
        'Bucket': process.env.S3_BUCKET,
        'Key': filePath.base,
        'Body': body,
        'ACL': 'public-read'
      };

      return await s3.upload(s3Obj).promise()
        .then((data) => {
          logger.info('Successfully uploaded file.');

          return data;
        })
        .catch((err) => {
          logger.error(err);

          return err;
        });
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Upload a file to S3
   * @param filename
   * @returns {Promise} the upload result; the error object (returned, not
   *   thrown) when the upload fails; undefined when the file cannot be read
   * @private
   */
  async _upload(filename) {
    try {
      logger.info('^^^ UPLOADING ^^^');
      const filePath = path.parse(filename);

      // readFile already yields a Buffer; no re-encoding needed.
      const body = await readFileAsync(filename);

      const s3Obj = {
        'Bucket': process.env.S3_BUCKET,
        'Key': filePath.base,
        'Body': body,
        'ACL': 'public-read'
      };

      return await s3.upload(s3Obj).promise()
        .then((data) => {
          logger.info('Successfully uploaded file.');

          return data;
        })
        .catch((err) => {
          logger.error(err);

          return err;
        });
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Upload an Error zip file to S3: screenshot, page HTML and parsed page URL.
   * Does nothing when there is no current page.
   * @returns {Promise}
   * @private
   */
  async _uploadError() {
    if (!this.page) {
      logger.warn('_uploadError: No Page -- Not uploading error package');

      return;
    }

    const now = new Date();
    const timestamp = dateFormat(now, 'yyyymmdd-HHMMss');
    const errorFilePath = `${this.path}/${this.id}-error-${timestamp}`;

    await this._makeScreenshotV2(this.page, errorFilePath, null);

    const body = await this.page.content();

    await this._dumpFile(`${errorFilePath}.html`, body);

    const pageUrl = url.parse(await this.page.url());

    jsonfile.writeFileSync(`${errorFilePath}.json`, pageUrl);

    logger.info('!!! ARCHIVING ERROR !!!<');

    await this._createDirectory('dist');

    const filename = `dist/${this.id}-error-${timestamp}.zip`;

    logger.debug('errorFilePath', `${errorFilePath}.*`);
    await this._createArchive(`${errorFilePath}.*`, filename, true);

    // One upload is enough: the zip used to be sent a second time through a
    // callback that could throw outside of any promise chain.
    logger.info('^^! UPLOADING ERROR !^^');
    const result = await this._upload(filename);

    // _upload hands back undefined or the error object on failure.
    if (result && !(result instanceof Error))
      logger.info('Successfully uploaded error package.');
  }

  /**
   * Create the `<id>.lock` marker object in the S3 bucket. Failures are logged,
   * not thrown.
   * @returns {Promise}
   * @private
   */
  async _createLock() {
    logger.info('Locking...');
    const emptyBody = Buffer.alloc(0);

    try {
      // .promise() is what makes the upload awaitable.
      await s3.upload({
        'Bucket': process.env.S3_BUCKET,
        'Key': `${this.id}.lock`,
        'Body': emptyBody,
        'ACL': 'public-read'
      }).promise();

      logger.info(`${this.id} LOCKED`);
    }
    catch (err) {
      logger.error('_createLock', err);
    }
  }

  /**
   * Delete the `<id>.lock` marker object. Failures are logged, not thrown.
   * @returns {Promise}
   * @private
   */
  async _removeLock() {
    const params = {
      'Bucket': process.env.S3_BUCKET,
      'Key': `${this.id}.lock`
    };

    await s3.deleteObject(params).promise().then((data) => {
      logger.debug(`${this.id} Unlocked`, data);
    }).catch((err) => {
      logger.error(err, err.stack);
    });
  }

  /**
   * Check whether an object exists in the S3 bucket. Any headObject failure is
   * reported as "does not exist".
   * @param filename
   * @returns {Promise<*>} true / false
   * @throws when no filename is given
   * @private
   */
  async _checkS3FileExists(filename = null) {
    if (filename === null)
      throw Error('No filename for S3');

    const params = {
      'Bucket': process.env.S3_BUCKET,
      'Key': filename
    };

    try {
      await s3.headObject(params).promise();
      logger.debug(`${filename} exists`);

      return true;
    }
    catch (e) {
      return false;
    }
  }

  /**
   * Fetch an object from the S3 bucket as a string.
   * @param filename
   * @returns {Promise<*>}
   * @throws when no filename is given or the download fails
   * @private
   */
  async _getFileS3(filename = null) {
    if (filename === null)
      throw Error('No filename for S3');

    const params = {
      'Bucket': process.env.S3_BUCKET,
      'Key': filename
    };

    const data = await s3.getObject(params).promise();

    return data.Body.toString();
  }

  /**
   * Lock check. Currently disabled: always reports "not locked".
   * @returns {Promise<*>}
   * @private
   */
  async _checkLock() {
    return false;
  }

  /**
   * Broadcast a message using AWS SQS. Skipped when SQS_ID is not set;
   * failures are logged, not thrown.
   * @param id
   * @param msg sent as the 'filename' message attribute
   * @param msgBody
   * @returns {Promise}
   * @private
   */
  async _sendMessage(id, msg, msgBody = 'New upload') {
    logger.debug('+ _sendMessage', process.env.SQS_ID);

    if (typeof process.env.SQS_ID !== 'undefined' && process.env.SQS_ID !== null)
      try {
        const sqs = new AWS.SQS({ 'apiVersion': '2012-11-05' });

        const params = {
          'DelaySeconds': 10,
          'MessageAttributes': {
            'id': {
              'DataType': 'String',
              'StringValue': id
            },
            'filename': {
              'DataType': 'String',
              'StringValue': msg
            }
          },
          'MessageBody': msgBody,
          'QueueUrl': process.env.SQS_ID
        };

        logger.info('SQS:', JSON.stringify(params));

        await sqs.sendMessage(params).promise().then((data) => {
          logger.debug('SQS Success', data.MessageId);

          return data;
        }).catch((err) => {
          logger.error(err);

          return err;
        });
      }
      catch (err) {
        logger.error(err);
      }
  }

  /**
   * Broadcast a message using AWS SNS (topic ARN is read from SQS_ARN).
   * @param mesg
   * @returns {Promise}
   * @private
   */
  async _publish(mesg) {
    const publishParams = {
      'TopicArn' : process.env.SQS_ARN,
      'Message': mesg
    };

    await sns.publish(publishParams).promise().then((data) => {
      logger.debug('>>> PUBLISH >>>', data);

      return data;
    }).catch((err) => {
      return err;
    });
  }

  /**
   * Zip artefacts/<id>/ into dist/, stream it to S3 and announce it on SQS.
   * @returns {Promise}
   * @private
   */
  async _archive() {
    logger.info('>-< ARCHIVING >-<');
    const now = new Date();
    await this._createDirectory('dist');

    // FILE_DATE_FOTMAT (sic) is the historical variable name and is still
    // honoured; the correctly spelled name takes precedence when set.
    const dateMask = process.env.FILE_DATE_FORMAT || process.env.FILE_DATE_FOTMAT || 'yyyymmdd';
    const timestamp = dateFormat(now, dateMask);
    const filename = `dist/${this.id}-${timestamp}.zip`;
    await this._createArchiveV2(`artefacts/${this.id}/`, filename);
    await this._uploadV2(filename);
    await this._sendMessage(this.id, `${this.id}-${timestamp}.zip`);
  }

  /**
   * Release the lock and, when node runs with --expose-gc, force a GC pass.
   * @returns {Promise}
   * @private
   */
  async _complete() {
    await this._removeLock();

    try {
      if (global.gc) global.gc();
    }
    catch (e) {
      logger.warn('node --expose-gc');
    }

    logger.info('<=- COMPLETE -=>');
  }

  /**
   * Collect non-repudiation data for `this.startPage` and write it to
   * nonrepudiation.json in the output path.
   * @param skip when true, do nothing
   * @param options passed to _populateNonRepudiation
   * @returns {Promise}
   * @throws when no start page is defined or a lookup fails
   * @private
   */
  async _doNonRepudiation(skip = false, options = {}) {
    if (skip)
      return;

    try {
      if (typeof this.startPage === 'undefined' || this.startPage === null)
        throw new Error('No startpage defined');

      const pageUrl = url.parse(this.startPage);

      const lookup = `${pageUrl.protocol}//${pageUrl.hostname}`;

      await this._populateNonRepudiation(lookup, options);

      jsonfile.writeFileSync(`${this.path}/nonrepudiation.json`, this.nonrepudation);
    }
    catch (err) {
      logger.error(err);

      throw err;
    }
  }

  /**
   * Dump the current page's localStorage to a JSON file.
   * NOTE(review): the `page` argument is not used; `this.page` is read instead.
   * @param page
   * @param filePath
   * @returns {Promise}
   * @private
   */
  async _saveLocalStorage(page, filePath) {
    const json = await this.page.evaluate(() => {
      const stored = {};
      for (let i = 0; i < localStorage.length; i++) {
        const key = localStorage.key(i);
        stored[key] = localStorage.getItem(key);
      }

      return stored;
    });

    jsonfile.writeFileSync(filePath, json);
  }

  /**
   * Synchronous existence check.
   * @param filePath
   * @returns {boolean}
   * @private
   */
  _checkFileExistsSync(filePath) {
    try {
      fs.accessSync(filePath, fs.constants.F_OK);

      return true;
    }
    catch (err) {
      return false;
    }
  }

  /**
   * Replace the current page's localStorage with the contents of a JSON file,
   * if that file exists.
   * NOTE(review): the `page` argument is not used; `this.page` is read instead.
   * @param page
   * @param filePath
   * @returns {Promise}
   * @private
   */
  async _restoreLocalStorage(page, filePath) {
    const exists = await checkFileExists(filePath);

    if (!exists)
      return;

    const json = jsonfile.readFileSync(filePath);

    await this.page.evaluate(stored => {
      localStorage.clear();
      for (const key in stored)
        localStorage.setItem(key, stored[key]);
    }, json);
  }

  /**
   * Navigate the current page. Never throws: navigation errors are logged and,
   * unless `noRecover` is set, 'recover' is emitted.
   * @param url
   * @param options merged over { timeout: 90000, waitUntil: 'networkidle0' }
   * @param noRecover
   * @returns {Promise}
   * @private
   */
  async _goto(url, options = {}, noRecover = false) {
    this.lastUrl = url;
    const newOptions = Object.assign({ 'timeout':90000, 'waitUntil':'networkidle0' }, options);

    logger.debug(newOptions);

    try {
      logger.info('Goto:', url);
      await this.page.goto(url, newOptions).catch((err) => {
        logger.error('GOTO', err);

        // net::ERR_FAILED is treated as a browser crash, so recovery relaunches it.
        if (err.message.includes('net::ERR_FAILED'))
          this.browserCrashed = true;

        if (!noRecover)
          this.emit('recover');
      });
    }
    catch (error) {
      logger.error(error);
      logger.error(url, options);
    }
  }

  /**
   * Wrap `fn` so it only runs once calls have stopped for `time` ms.
   * @param fn
   * @param time
   * @returns {Function}
   * @private
   */
  _debounce(fn, time) {
    let timeout;

    return function (...args) { // <-- not an arrow function, so the caller's `this` is kept
      const functionCall = () => fn.apply(this, args);

      clearTimeout(timeout);
      timeout = setTimeout(functionCall, time);
    };
  }

  /**
   * Wrap `callback` so it runs at most once per `limit` ms; calls made during
   * the wait are dropped.
   * @param callback
   * @param limit
   * @returns {Function}
   * @private
   */
  _throttle (callback, limit) {
    let wait = false;

    return function (...args) {
      if (wait)
        return;

      callback.apply(null, args);
      wait = true;
      setTimeout(function () {
        wait = false;
      }, limit);
    };
  }

  /**
   * Wrap `func` so it runs only on the first call; later calls return the
   * first result.
   * @param func
   * @returns {function(): *}
   * @private
   */
  _once(func) {
    let alreadyCalled = false;
    let result;

    return function (...args) {
      if (!alreadyCalled) {
        result = func.apply(this, args);
        alreadyCalled = true;
      }

      return result;
    };
  }

  /**
   * Mark payment services done, write their links / debug data, advance to the
   * next mode and open the first e-money URL. Failures are logged, not thrown.
   * @returns {Promise}
   * @private
   */
  async _paymentServicesDone() {
    logger.warn('paymentServicesDone');
    try {
      this.paymentServices.done = true;
      jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

      this.mode++;
      this.inProgress = false;

      await this._goto(this.emoneyServices.urls[0]);
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Recover from a page / browser crash: count the crash against `lastUrl`,
   * rebuild the page (and the browser when it crashed), then schedule a
   * restart. Gives up once the same URL has crashed 10 times.
   * @param restartURL
   * @returns {Promise}
   * @private
   */
  async __recover(restartURL) {
    logger.warn(`*** RECONNECTING ${this.id} PAGE ***`);

    let crashCount = 0;
    if (this.crashLog.has(this.lastUrl)) {
      crashCount = this.crashLog.get(this.lastUrl);
      crashCount++;
      this.crashLog.set(this.lastUrl, crashCount);

      if (crashCount >= 3)
        logger.error('The page has crashed more than 3 times', this.lastUrl);

      if (crashCount >= 10) {
        logger.error('10 times on the same page is enough', this.lastUrl);

        return;
      }
    }
    else
      this.crashLog.set(this.lastUrl, 1);

    if (this.browserCrashed) await this._initBrowser(true);

    await this._createBrowserPage();

    logger.debug('Reattach processNewPage', (typeof this.processNewPage === 'function') ? 'Yes' : 'No');
    if (typeof this.processNewPage === 'function')
      this.page.on('domcontentloaded', () => {
        this.processNewPage();
      });

    // From the third crash on, back off by an extra 90s per recorded crash.
    const onHold = (crashCount >= 3) ? (90000 * crashCount) : 0;
    // Random offset added on top of the base delay.
    const antiCollision = 125 + (Math.floor(Math.random() * (15 - 1)) * 500);
    const timeout = 90000 + antiCollision + onHold;

    logger.info(`🚨 Restarting in ${(timeout / 1000).toFixed(2)} seconds.`);

    setTimeout(async () => {
      logger.warn('Attempting recovery..');

      await this.restart(restartURL);
    }, timeout);
  }

  /**
   * Navigate to `restartURL`, or to the last visited URL when none is given.
   * @param restartURL
   * @returns {Promise}
   */
  async restart(restartURL) {
    const rURL = restartURL || this.lastUrl;
    logger.info(`Restarting ${this.id} // Going to ${rURL}`);
    await this._goto(rURL);
  }

  /**
   * Write data to a file synchronously. Failures are logged, not thrown.
   * @param filename
   * @param data
   * @returns {Promise}
   */
  async saveFile(filename, data) {
    try {
      fs.writeFileSync(filename, data);
    }
    catch (err) {
      logger.error(err);
    }
  }

}

// The module's sole export is the Scraper class itself.
module.exports = Scraper;