// obdfcascrape/ncas/ie.js

/**
 *
 * User: Martin Donnelly
 * Date: 2018-09-13
 * Time: 12:23
 *
 */

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const logger = require('log4js').getLogger('IE');

/**
 * Scraper for the Central Bank of Ireland registers site.
 *
 * Opens the registers home page, follows the "view list" link, expands a fixed
 * set of download sections and clicks every download link in each of them so
 * the files land in the artefacts directory, taking screenshots along the way.
 */
class IEScrape extends Scraper {

  /**
   * Set the scraper id, wire the 'done' event to `_done()` and build the
   * `run` entry point. In production the scraper starts itself when
   * `_checkLock()` resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('IE');

    this.on('done', () => {
      this._done();
    });

    // `run` is the debounced wrapper around `__run`.
    // NOTE(review): a rejection from `__run` is not handled here - confirm
    // whether `_debounce` (defined in the base class) deals with it.
    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if(l)
          this.run();
      });
  }

  /**
   * Prepare the browser session and navigate to the register list.
   *
   * Sets the artefact path, runs the non-repudiation step (a failure there is
   * only logged), starts the browser and tracing, loads the start page and
   * clicks the "view list" link.
   *
   * @returns {Promise<void>}
   * @throws {Error} when any browser step fails
   */
  async start() {
    await super._start();
    try{
      this.startPage = 'http://registers.centralbank.ie/Home.aspx';
      // Used as the `delay` click option, i.e. the time between mousedown and mouseup.
      const mouseDownDuration = IEScrape.notARobot();

      this.setPath(path.resolve(`${__dirname }/../artefacts/IE/CBI`));

      // Best effort: a failure here must not stop the scrape.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);

      await this._randomWait(this.page, 3, 5);
      await this.page.waitForSelector('#ctl00_cphRegistersMasterPage_lblViewList');
      await this.page.click('#ctl00_cphRegistersMasterPage_lblViewList > a', { 'delay':mouseDownDuration });
    }
    catch(e) {
      // Rethrow the original error so its stack and message are preserved.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Locate the download section heading whose text equals searchText.
   *
   * The page HTML is snapshotted and searched with cheerio. When several spans
   * carry the same text, the last one in document order wins.
   *
   * @param downloadsection CSS selector of the container holding the sections
   * @param searchText exact text of the section heading span
   * @returns {Promise<string|undefined>} the `id` attribute of the matching
   *   span, or undefined when no span matches (or the match has no id)
   * @throws {Error} when the container does not appear or the page cannot be read
   */
  async findDownloadSection(downloadsection, searchText) {
    let wantedId;
    try{
      // `downloadsection` is also used as a cheerio selector below, so it is
      // always a CSS selector; wait for it the same way `start()` does.
      await this.page.waitForSelector(downloadsection);

      const body = await this.page.evaluate(() => document.documentElement.outerHTML);
      const $ = cheerio.load(body);

      $(`${downloadsection} span`).each((i, el) => {
        if ($(el).text() === searchText)
          wantedId = $(el).attr('id');
      });

      return wantedId;
    }

    catch(e) {
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Expand the relevant section by clicking its heading span.
   * @param elmId id of the heading span
   * @returns {Promise<void>}
   */
  async expandArea(elmId) {
    await this.page.click(`span#${elmId}`);
  }

  /**
   * Find the id of the element holding the download links for a section.
   *
   * The links container is the element following the heading span's parent.
   *
   * @param elmId id of the heading span
   * @returns {Promise<string|null>} the container's `id` attribute (null when it has none)
   */
  async findDownloadsLinksID(elmId) {
    return this.page.$eval(`span#${elmId}`, e => e.parentElement.nextElementSibling.getAttribute('id'));
  }

  /**
   * Process the download links and grab the pdf files.
   *
   * A failed click on a single link is logged and reported through
   * `_uploadError()`, after which the remaining links are still processed.
   *
   * @param id id of the element containing the links
   * @returns {Promise<void>}
   * @throws {Error} when the links cannot be collected or a step other than
   *   the click itself fails
   */
  async processDownloadLinks(id) {
    try {
      // Each link is duplicated in a P and an Image. We just use the one in the P tag.
      const clickableLinks = await this.page.$$(`[id="${id}"] p a`);
      const mouseDownDuration = IEScrape.notARobot();

      for (const item of clickableLinks) {
        await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

        try {
          await item.click({ 'delay':mouseDownDuration });
        }
        catch(err) {
          // Keep going with the next link, but record why this one failed and
          // wait for the error upload instead of leaving it floating.
          logger.warn(err);
          await this._uploadError();
        }
        await this._randomWait(this.page, 5, 10);
      }
    }
    catch(e) {
      await this._uploadError();
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Expand one download section, screenshot it and download all of its files.
   *
   * @param dlSectionElm CSS selector of the container holding the sections
   * @param sectionTitle exact heading text of the section to grab
   * @returns {Promise<void>}
   * @throws {Error} when the section cannot be found or any step fails; the
   *   failure is reported through `_uploadError()` first
   */
  async grabSection(dlSectionElm, sectionTitle) {
    try {
      const section = await this.findDownloadSection(dlSectionElm, sectionTitle);

      // Fail with a meaningful message rather than clicking `span#undefined`.
      if (typeof section === 'undefined')
        throw new Error(`Download section not found: ${sectionTitle}`);

      await this.expandArea(section);

      // Awaited so the screenshot is taken before the download clicks start.
      await this._makeScreenshotV2(this.page, `${ this.path}/${sectionTitle}`, null);

      const sectionID = await this.findDownloadsLinksID(section);

      await this.processDownloadLinks(sectionID);

      await this._randomWait(this.page, 5, 10);
    }
    catch(e) {
      await this._uploadError();
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Grab the Pdf's and screenshots.
   *
   * Runs `start()`, screenshots the register list, processes each wanted
   * section in turn and finally emits 'done'.
   *
   * @returns {Promise<void>}
   * @throws {Error} when start-up or any section fails; 'done' is not emitted then
   */
  async __run() {
    try {
      await this.start();

      await this._randomWait(this.page, 5, 10);
      await this._makeScreenshotV2(this.page, `${ this.path}/Central Bank of Ireland Registers`, null);

      const sections = ['Registers of Payment Services Firms', 'Registers of E-Money Firms', 'Register of Credit Institutions'];

      // Sequential on purpose: all sections share the single browser page.
      for (const section of sections)
        await this.grabSection('#ctl00_cphRegistersMasterPage_downloadsSection', section);

      this.emit('done');
    }
    catch(e) {
      throw e instanceof Error ? e : new Error(e);
    }
  }

}

// Expose the scraper class as this module's export.
module.exports = IEScrape;