// obdfcascrape/ncas/be.js

const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('BE');
const path = require('path');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');

const Scraper = require('../helpers/scraper');

// Log verbosity is taken from the LOGGER_LEVEL environment variable and
// falls back to 'warn' when that variable is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the prudential supervision lists published on www.nbb.be.
 *
 * Three "modes" are walked in order, each with its own list of index URLs:
 *   0 - payment services, 1 - e-money services, 2 - credit institutions.
 * For every index page the entities are extracted, their linked documents are
 * downloaded, one JSON file per entity is written and a screenshot is taken.
 *
 * NOTE(review): `this.modeNames` and `this.modePrefix` are read throughout
 * this class but are not assigned anywhere in it — presumably they are
 * provided by the `Scraper` base class; confirm.
 */
class BEScrape extends Scraper {

  /**
   * Sets the scraper id, wires the 'done' event to `_done`, exposes a
   * throttled `run` and, in production, starts a run when `_checkLock`
   * resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('BE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((hasLock) => {
          if (hasLock) {
            this.run();
          }
        })
        .catch((err) => {
          // Without this handler a failing lock check would surface only as
          // an unhandled promise rejection.
          logger.error('Failed to check lock or start run', err);
        });
    }
  }

  /**
   * Derives the file name a downloaded document is expected to have on disk.
   *
   * The URL path is lower-cased, its basename taken and then URI-decoded.
   * NOTE(review): this assumes the browser stores the download under a
   * lower-case name — confirm on case-sensitive file systems.
   *
   * @param {string} docLink - Absolute URL of the document.
   * @returns {string} Expected local file name (no directory part).
   */
  getDownloadFileName(docLink) {
    const parsedUrl = url.parse(docLink);
    return decodeURI(path.basename(parsedUrl.pathname.toLowerCase()));
  }

  /**
   * Navigates the page to a document link and polls `this.path` until the
   * expected file shows up.
   *
   * @param {string} docLink - Absolute URL of the document to download.
   * @returns {Promise<boolean>} `true` once the file exists, `false` if it
   *   has not appeared after all polling attempts.
   * @throws Any navigation error other than `net::ERR_ABORTED`.
   */
  async downloadFile(docLink) {
    logger.info(`Downloading ${docLink}`);

    await this.page.goto(docLink).catch((err) => {
      if (err.message.includes('net::ERR_ABORTED')) {
        logger.info(`Ignoring expected error upon file download: ${err.message}`);
        return;
      }
      throw err;
    });

    const waitMs = 5000;
    const maxTries = 10;
    const downloadFilePath = `${this.path}/${this.getDownloadFileName(docLink)}`;

    for (let tries = 1; tries <= maxTries; tries++) {
      logger.info('Waiting...');
      await this.page.waitFor(waitMs);
      if (this._checkFileExistsSync(downloadFilePath)) {
        logger.info(`${docLink} successfully downloaded.`);
        return true;
      }
      logger.info(`Still waiting for ${docLink} to download after ${tries * waitMs / 1000} seconds...`);
    }

    // If we reach this point the download has failed. Report the time that was
    // actually waited (maxTries polls), not the post-loop counter value.
    logger.error(`${docLink} failed to download after ${maxTries * waitMs / 1000} seconds....`);
    return false;
  }

  /**
   * Rewrites document links that use the `file:///L:/PRXNWEBP/` prefix so
   * that they point at `http://www.nbb.be/` instead.
   *
   * Links already on `http(s)://www.nbb.be/` are returned untouched. Any other
   * format is logged as a warning and returned as is.
   *
   * @param {string} docLink - Link as found in the page.
   * @returns {string} Link to use for the download.
   */
  normaliseDocLink(docLink) {
    const internalPrefix = 'file:///L:/PRXNWEBP/';

    if (/^https?:\/\/www\.nbb\.be\//.test(docLink)) {
      return docLink;
    }

    if (docLink.startsWith(internalPrefix)) {
      return `http://www.nbb.be/${docLink.slice(internalPrefix.length)}`;
    }

    logger.warn(`Unable to normalise document link, unknown format, will attempt download as is: ${docLink}`);
    return docLink;
  }

  /**
   * Replaces every run of two or more whitespace characters with ', '.
   *
   * @param {string} value - Text to convert.
   * @returns {string} Converted text.
   */
  convertMulitpleSpaceToCommaSpace(value) {
    return value.replace(/\s{2,}/g, ', ');
  }

  /**
   * Extracts name, company type, address lines, unique id, listing date and
   * document link from one entity's details container.
   *
   * Field values are read from the text node that follows the child element
   * at a fixed position, so this throws if the markup does not have the
   * expected children.
   *
   * @param {*} detailsContainer - Element (or cheerio selection) holding the details.
   * @returns {Object} The extracted details; absent optional values are `null`.
   */
  extractMainDetails(detailsContainer) {
    const $ = cheerio;
    const container = $(detailsContainer);
    const lines = container.children();

    // Raw text that directly follows the child element at `position`.
    const textAfter = (position) => lines[position].next.data;
    // Value part of a "label: value" line.
    const valueAfterColon = (text) => text.split(':').pop().trim();

    const details = {};

    details['name'] = container.children('strong').text().trim();
    details['companyType'] = container.children('em').text().trim();

    details['addressOne'] = this.convertMulitpleSpaceToCommaSpace(textAfter(3).trim());
    details['addressTwo'] = this.convertMulitpleSpaceToCommaSpace(textAfter(4).trim());

    // Occasionally line 5 will contain text. If this is the case, line 5
    // contains "addressThree", and every other line moves along by one.
    let offset = 0;
    const possibleAddressThree = textAfter(5).trim();
    if (possibleAddressThree !== '') {
      offset = 1;
      details['addressThree'] = this.convertMulitpleSpaceToCommaSpace(possibleAddressThree);
    }
    else {
      details['addressThree'] = null;
    }

    details['uniqueId'] = valueAfterColon(textAfter(6 + offset));

    // The listing date line is not always present.
    details['dateOfListing'] = (lines[7 + offset] === undefined)
      ? null
      : valueAfterColon(textAfter(7 + offset));

    const docLink = container.children('a');

    if (docLink.length > 0) {
      const href = docLink.attr('href');
      details['docLink'] = href;
      details['normalisedDocLink'] = this.normaliseDocLink(href);
    }
    else {
      details['docLink'] = null;
      details['normalisedDocLink'] = null;
    }

    return details;
  }

  /**
   * Builds an object from table cells, keyed by the field name derived from
   * the header (`th`) at the same index within the enclosing table.
   *
   * @param {*} tableCells - cheerio selection of `td` elements.
   * @returns {Object} Map of field name to the cell text split on single spaces.
   */
  extractAdditionalDetails(tableCells) {
    const $ = cheerio;

    const additionalDetails = {};

    tableCells.toArray().forEach((td) => {
      const cell = $(td);
      const thText = cell.closest('table').find('th').eq(cell.index()).text();
      const fieldName = this._makeFieldName(thText);
      additionalDetails[fieldName] = cell.text().split(' '); // e.g. scrape "1 2 3" as ["1", "2", "3"]
    });

    return additionalDetails;
  }

  /**
   * Extracts all details of one entity according to the current mode.
   *
   * @param {*} fullDetailsContainer - Row (`tr`) or list item (`li`) of the entity.
   * @param {number} mode - 0 or 1 for table layouts, 2 for credit institutions.
   * @returns {Object|undefined} Entity details, or `undefined` for an unknown mode.
   */
  extractFullDetails(fullDetailsContainer, mode) {
    const $ = cheerio;

    switch (mode) {

      case 0:
      case 1: {
        // in modes 0 and 1 the main details are in the first td of the parent container
        const cells = $(fullDetailsContainer).children('td');
        const mainDetails = this.extractMainDetails(cells.eq(0));
        const additionalDetails = this.extractAdditionalDetails(cells.slice(1));
        return {...mainDetails, ...additionalDetails};
      }

      case 2:
        // in mode 2 (credit institutions) the main details are in the root.
        // no additional details for credit institutions
        return this.extractMainDetails(fullDetailsContainer);

      default:
        logger.warn(`extractFullDetails called with unknown mode: ${mode}`);
        return undefined;
    }
  }

  /**
   * Extracts every entity found directly under a `tbody` (rows) or `ul`
   * (list items) container. Any other container type yields an empty list.
   *
   * @param {*} entitiesContainer - cheerio selection of the container.
   * @param {number} mode - Current mode, forwarded to `extractFullDetails`.
   * @returns {Array<Object>} Extracted entities in document order.
   */
  extractEntitiesFromContainer(entitiesContainer, mode) {
    const $ = cheerio;

    let childSelector;

    switch ($(entitiesContainer).prop("tagName")) {
      case 'TBODY':
        childSelector = 'tr';
        break;

      case 'UL':
        childSelector = 'li';
        break;

      default:
        return [];
    }

    const entities = [];

    $(entitiesContainer).children(childSelector).each((index, item) => {
      entities.push(this.extractFullDetails(item, mode));
    });

    return entities;
  }

  /**
   * Extracts the title, description, legend, entities and change log from an
   * index page container.
   *
   * @param {*} indexContainer - cheerio selection of the list container.
   * @param {number} mode - Current mode, forwarded to the entity extraction.
   * @returns {{title: string, description: ?string, legend: ?string, entities: Array<Object>, changes: ?string}}
   */
  extractIndex(indexContainer, mode) {
    const $ = cheerio;
    const root = $(indexContainer);

    const title = root.find('div.field-name-field-page-intro > p').text().trim();

    const description = root.find('div.description').html();

    const legend = root.find('div.legend').html(); // not entirely necessary but good to keep a record

    // Only the first matching container is used.
    const entitiesContainer = root.find('ul.List1 tbody, ul.List1 ul.List2 > li > ul').eq(0);

    const entities = (entitiesContainer.length > 0)
      ? this.extractEntitiesFromContainer(entitiesContainer, mode)
      : [];

    const changes = root.find('div.changes-12').html(); // not entirely necessary but good to keep a record

    return { title, description, legend, entities, changes };
  }

  /**
   * Builds a file-name-safe id from an entity name: the name is passed through
   * `_makeFieldName`, accents are removed and every remaining non-word
   * character is replaced with an underscore.
   *
   * @param {string} name - Entity name.
   * @returns {string} Id derived from the name.
   */
  getIdByEntityName(name) {
    const nonWordCharacter = /\W/g;

    let id = this._makeFieldName(name).trim();
    id = removeAccents.remove(id);
    id = id.replace(nonWordCharacter, '_');

    return id;
  }

  /**
   * Writes `data` as JSON to `filePath`, logging (not throwing) on failure so
   * that one failed write does not abort processing of the whole index page.
   *
   * @param {string} filePath - Destination file.
   * @param {Object} data - Data to serialise.
   */
  writeJsonFile(filePath, data) {
    try {
      jsonfile.writeFileSync(filePath, data);
    }
    catch (err) {
      logger.error(`Failed to write ${filePath}`, err);
    }
  }

  /**
   * Processes the index page currently loaded in `this.page`: extracts the
   * entities, downloads and renames their documents, writes the metadata and
   * per-entity JSON files, takes a screenshot and finally moves on to the
   * next URL (or emits 'done' when there is none).
   */
  async processIndex() {
    const pageUrl = await this.page.url();

    const modeName = this.modeNames[this.mode];
    const modePrefix = this.modePrefix[this.mode];
    const urlStep = this.getCurrentMode().urlStep;

    logger.info(`Processing ${modeName} index url number ${urlStep}: ${pageUrl}`);

    await this.allowCookies();

    const body = await this.page.content();

    const $ = cheerio.load(body, { 'decodeEntities': false, 'encoding': 'utf-8' });

    logger.info('Extracting index...');
    const index = this.extractIndex($('div#PrudentialList'), this.mode);

    logger.info(`Extracted ${index.entities.length} ${modeName}.`);

    logger.info(`Downloading ${modeName} documents.`);
    // download all documents from this index page
    for (const entity of index.entities) {
      if (entity.normalisedDocLink === null) {
        continue;
      }

      const didDownload = await this.downloadFile(entity.normalisedDocLink);

      if (!didDownload) {
        entity['docLocalFilename'] = null;
        continue;
      }

      // rename the file to match the json file name format
      const originalFileName = this.getDownloadFileName(entity.normalisedDocLink);
      const originalFilePath = `${this.path}/${originalFileName}`;
      const newFileName = [modePrefix, this.getIdByEntityName(entity.name), path.extname(originalFileName)].join('');
      const newFilePath = `${this.path}/${newFileName}`;
      await this._renameFile(originalFilePath, newFilePath);
      // save new file name to entity object so it can be found later.
      entity['docLocalFilename'] = newFileName;
    }

    logger.info(`Saving metadata for ${modeName}_url_${urlStep}`);
    const { description, legend, changes } = index;
    const metadata = { description, legend, changes };
    const metadataFileName = `${modeName}_url_${urlStep}_metadata.json`;
    const metadataFilePath = `${this.path}/${metadataFileName}`;
    this.writeJsonFile(metadataFilePath, { metadata });

    for (const entity of index.entities) {
      const id = this.getIdByEntityName(entity.name);

      // create json file for each entity
      const filename = [modePrefix, id].join('');
      // The path (without extension) is capped at 240 characters.
      const filePath = `${this.path}/${filename}`.substring(0, 240);
      this.writeJsonFile(`${filePath}.json`, { 'details': entity, metadataFileName });

      // add entity details to "links" so that index file can be generated later
      this.getCurrentMode().links.push({
        'id': id,
        'href': await this.page.url(),
        'filename': filename
      });
    }

    logger.info(`Taking screenshot of: ${pageUrl}`);
    const screenshotFilename = `${modeName}_url_${urlStep}`;
    const screenshotPath = `${this.path}/${screenshotFilename}`;
    await this._makeScreenshotV2(this.page, screenshotPath);

    const nextUrl = this.getNextUrl();

    if (nextUrl !== null) {
      await this._goto(nextUrl);
    }
    else {
      this.emit('done');
    }
  }

  /**
   * Writes the collected links of the current mode to `this.path` and the
   * full mode state to `this.debugPath`. Errors are logged, not thrown.
   */
  serviceDone() {
    const modeName = this.modeNames[this.mode];

    try {
      jsonfile.writeFileSync(`${this.path}/${modeName}.json`, { 'links': this.getCurrentMode().links });
      jsonfile.writeFileSync(`${this.debugPath}/${modeName}.json`, this.getCurrentMode());

      logger.info(`${modeName} done.`);
      logger.info(`Extracted a total of ${this.getCurrentMode().links.length} ${modeName}`);
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Returns the state object (`links`, `urlStep`, `urls`) of the current mode.
   *
   * @returns {Object|undefined} The mode state, or `undefined` if `this.mode`
   *   is not 0, 1 or 2.
   */
  getCurrentMode() {
    switch (this.mode) {

      case 0:
        return this.paymentServices;

      case 1:
        return this.emoneyServices;

      case 2:
        return this.creditServices;

      default:
        return undefined;

    }
  }

  /**
   * Advances to the next index URL. When the current mode has no URLs left,
   * the mode is finalised via `serviceDone` and the next mode (if any) starts
   * at its current `urlStep`.
   *
   * @returns {?string} The next URL to visit, or `null` when every mode is done.
   */
  getNextUrl() {
    const current = this.getCurrentMode();

    if (current.urlStep < current.urls.length - 1) {
      current.urlStep++;
    }
    else {
      this.serviceDone();

      if (this.mode >= this.modeNames.length - 1) {
        return null;
      }

      this.mode++;
    }

    // Re-read the mode state: `this.mode` may have just been advanced.
    const next = this.getCurrentMode();
    return next.urls[next.urlStep];
  }

  /**
   * Clicks the cookie consent button if it is present on the page, then
   * waits a few seconds.
   */
  async allowCookies() {
    const agreeButton = await this.page.$('button.agree-button');

    if (agreeButton === null) {
      return;
    }

    logger.info('Agreeing to cookie policy.');
    await agreeButton.click();
    await this._randomWait(this.page, 3, 5);
  }

  /**
   * Handles a freshly loaded page: enables downloads into `this.path` and
   * processes the page as an index if its URL matches the known pattern.
   *
   * @throws {Error} For an unknown page when `NODE_ENV` is set (after calling
   *   `_uploadError`). Without `NODE_ENV` the unknown page is only logged.
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    // set download behaviour in case this is a new tab after a recovery
    // TODO: this could be set by default in the base class for every new tab in every scraper
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    const pageUrl = await this.page.url();

    if (pageUrl.includes('supervision-financiere/controle-prudentiel/domaines-de-controle')) {
      await this.processIndex();
    }
    else if (process.env.NODE_ENV) {
      await this._uploadError();
      throw new Error(`Unknown page: ${pageUrl}`);
    }
    else {
      logger.warn('processNewPage Fell through');
      logger.warn('currentPage.location', pageUrl);
    }
  }

  /**
   * Hook for attaching additional event listeners; intentionally empty here.
   */
  async attachEvents() {

  }

  /**
   * Initialises the per-mode state, opens the browser page, registers the
   * page-load handler and navigates to the first payment services URL.
   */
  async start() {
    super._start();

    this.setPath(path.resolve(`${__dirname}/../artefacts/BE/NBB`));

    this.mode = 0;

    this.paymentServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-15',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-14',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-16'
      ]
    };

    this.emoneyServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-9',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-17'
      ]
    };

    this.creditServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-credit/listes-7'
      ]
    };

    this.startPage = this.paymentServices.urls[0];

    // A failure here is logged but does not stop the scrape.
    await this._doNonRepudiation().catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser();
    await this._createBrowserPage();

    // Every page load is handled by processNewPage; failures are logged here
    // because nothing awaits the event handler.
    this.page.on('domcontentloaded', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    // NOTE(review): presumably guards against attaching listeners twice —
    // confirm which two event names are expected at this point.
    if (this.eventNames().length === 2) {
      await this.attachEvents();
    }

    await this._goto(this.startPage);
  }

  /**
   * Entry point used by the throttled `run`; simply delegates to `start`.
   */
  async __run() {
    await this.start();
  }
}

// Export the scraper class itself (not an instance).
module.exports = BEScrape;