cc-tracking/tasks/DIN-136 Batch2/de.js.md
2019-05-21 16:40:16 +01:00

18 KiB

// version: 0.0.1-20

const Scraper = require('../helpers/scraper'); const cheerio = require('cheerio'); const path = require('path'); const jsonfile = require('jsonfile'); const removeAccents = require('remove-accents-diacritics'); const logger = require('log4js').getLogger('(DE)'); const url = require('url');

// Log verbosity is taken from the LOGGER_LEVEL environment variable; when it is
// unset (or empty) the logger falls back to 'warn'.
logger.level = process.env.LOGGER_LEVEL || 'warn';

class DEScrape extends Scraper {

constructor() { super(); this.setID('DE');

this.on('done', () => {
  this._done();
});

this.run = this._debounce(async () => {
  await this.__run();
}, 5000);

if (process.env.NODE_ENV === 'production')
  this._checkLock().then((l) => {
    if(l)
      this.run();
  });

}

/** *

  • @returns {Promise} */ async buildSubIndex() { logger.info('Building sub-index...');
const currentPage = await this.page.evaluate(() => document);

const search = currentPage.location.search;
const params = this._getParamsFromUrl(search);

const currentPageID = params.nameZahlungsinstitut || '';

await this._makeScreenshotV2(this.page, `${this.path}/menu_${currentPageID}`, null);

await this._randomWait(this.page, 3, 5);

const links = await this.page.$$('#zahlinst > tbody > tr a');

for (const item of links) {
  const id = await this.page.evaluate(el => el.innerText, item);
  let href = await this.page.evaluate(el => el.href, item);
  const params = this._getParamsFromUrl(href);

  href = href.concat('&locale=en_GB');

  if (id !== 'Found payment institutions:')
    this.paymentServices.links.push({ id, href, params });
}

this.index.step++;

if (this.index.step < this.index.items)
  this.emit('nextsubindex');
else {
  this.subIndex.done = true;
  this.paymentServices.items = this.paymentServices.links.length;
  this.emit('subindexdone');
}

}

/** *

  • @returns {Promise} */ async buildIndex() { logger.info('Building the index...');
await this._randomWait(this.page, 3, 5);

const links = await this.page.$$('#suchform > div > div:nth-child(2) > div.navigationGruppeBuchstaben a');

for (const item of links) {
  const id = await this.page.evaluate(el => el.innerText, item);
  let href = await this.page.evaluate(el => el.href, item);

  href = href.concat('&locale=en_GB');

  this.index.links.push({ id, href });
}

this.index.done = true;
this.index.items = this.index.links.length;

this.emit('indexdone');

}

async initiateCreditIndex() { // first time around. // need to kick off the index correctly..

const options = await this.page.$$('#institutKategorie option');
const wantedOption = ['Credit institutions (BA)', 'Kreditinstitute (BA)'];
for (const item of options) {
  const text = await this.page.evaluate(el => el.innerText, item);
  const value = await this.page.evaluate(el => el.value, item);

  if (wantedOption.indexOf(text) !== -1) {
    await this.page.select('#institutKategorie', value);
    this.creditServices.started = true;
    break;
  }
}

if (this.creditServices.started)
  this._findAndClick('#sucheButtonInstitut');
else
  throw new Error('Unable to initiate CI Search');

}

async processCreditInstIndexPage() { try{ const noWhiteSpace = /\W/g; logger.info('Building CI sub-index...');

  const wantedRowType = ['CRR-Kreditinstitut'];
  const currentPage = await this.page.evaluate(() => document);
  const body = await this.page.content();
  const $ = cheerio.load(body);

  const search = currentPage.location.search;
  const params = this._getParamsFromUrl(search);

  const currentPageID = params['d-4012550-p'] || '';

  await this._makeScreenshotV2(this.page, `${this.path}/credit_instititute_menu_${currentPageID}`, null);

  await this._randomWait(this.page, 7, 10);

  const rows = $('#institut tr');

  rows.each((i, elm) => {
    const rowClass = cheerio(elm).attr('class');

    if (typeof(rowClass) !== 'undefined') {
      const children = cheerio(elm).children();

      const rowType = children.eq(1).text();

      if (wantedRowType.indexOf(rowType) !== -1) {
        const name = this._cleanUp(children.eq(0).text());
        const id = this._makeFieldName(name);
        let href = cheerio(children.eq(0)).find('a').attr('href');
        const params = this._getParamsFromUrl(href);
        href = href.concat('&locale=en_GB');

        // this is the one we want.

        this.creditServices.links.push({ name, id, href, params });
      }
    }
  });

  const clicked = await this._findAndClick('.pagelinks a', 'Next');
  if (!clicked) {
    // come to the end of the index..

    this.creditServices.done = true;
    this.creditServices.items = this.creditServices.links.length;

    this.emit('ciindexdone');
  }
}
catch( err) {
  logger.error(err);
  this.emit('recover');
}

}

async processCreditInstPage() { try{ const noWhiteSpace = /\W/g;

  const id = this.creditServices.links[this.creditServices.step].id;
  const name = this.creditServices.links[this.creditServices.step].name;
  logger.info(`Process Credit Service entity ${this.creditServices.step} of ${this.creditServices.items} // ${name}`);

  await this._randomWait(this.page, 3, 5);

  const body = await this.page.content();

  const details = await this.extractPaymentEntity(body);

  const entity = removeAccents.remove(details.description[0].trim());

  const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);

  logger.debug('filename', filename);

  const filePath = `${this.path}/${filename}`.substring(0, 240);

  await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

  jsonfile.writeFileSync(`${filePath}.json`, details);

  this.creditServices.links[this.creditServices.step].filename = `${filename}.json`;
  this.creditServices.links[this.creditServices.step].filePath = `${filePath}`;
  this.creditServices.step++;

  if (this.creditServices.step < this.creditServices.items) {
    const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;

    await this._goto(newUrl);
  }
  else
    this.emit('creditinstdone');
}
catch( err) {
  logger.error(err);
  this.emit('recover');
}

}

/** *

  • @returns {Promise} */ async processCreditInstIndex() { logger.info('Building CI Index..');
if (!this.creditServices.started)
  await this.initiateCreditIndex();
else
  await this.processCreditInstIndexPage();

}

/** *

  • @param html

  • @returns {Promise<{description: T[] | jQuery, permissions: {original: Array, translated: Array}}>} */ async extractPaymentEntity(html) { try{ const permissions = { 'original':[], 'translated':[] };

    const newLine = /\n/g; const $ = cheerio.load(html);

    let description = $('#content > p').text().split(newLine).filter(line => line.length > 0);

    description = description.map((i) => { return this._cleanUp(i.replace(/\t/g, '')).trim(); });

    description = description.filter(item => item.length > 0);

    const rows = $('#erlaubnis > tbody tr');

    rows.each((index, item) => { const cells = $(item).find('td');

    const service = $(cells.get(0)).text(); const startAuth = $(cells.get(1)).text(); const endAuth = $(cells.get(2)).text();

    const reason = (cells.length === 4) ? $(cells.get(3)).text() : '';

    const phrasing = service.split(' (§'); const translated = this._translate(phrasing[0]);

    phrasing[0] = (translated !== '') ? translated : phrasing[0];

    const newObjTrans = { 'service': phrasing.join(' (§'), startAuth, endAuth };

    const newObj = { service, startAuth, endAuth };

    if (cells.length === 4) { newObj.reason = reason; newObjTrans.reason = reason; }

    permissions.translated.push(newObjTrans);

    permissions.original.push(newObj); });

    return { description, permissions }; } catch( err) { logger.error(err); this.emit('recover'); } }

/** *

  • @returns {Promise} */ async processEntity() { try{ const noWhiteSpace = /\W/g; if (!this.subIndex.done) { // We should not be here quite yet, so add this to subindex; const currentPage = await this.page.evaluate(() => document);

    const location = currentPage.location; const id = location.search; let href = location.href; href = href.concat('&locale=en_GB');

    this.paymentServices.links.push({ id, href });

    this.index.step++;

    if (this.index.step < this.index.items) this.emit('nextsubindex'); else { logger.info('Sub indexing done...'); this.subIndex.done = true; this.paymentServices.items = this.paymentServices.links.length; this.emit('subindexdone'); } } else { const id = this.paymentServices.links[this.paymentServices.step].id; // logger.info('Process entity:', id); logger.info(Process entity ${this.paymentServices.step} of ${this.paymentServices.items} // ${id}); await this._randomWait(this.page, 3, 5);

    const body = await this.page.evaluate(() => document.documentElement.outerHTML);

    const details = await this.extractPaymentEntity(body);

    const entity = removeAccents.remove(details.description[0].trim());

    // const filename = id.indexOf('?id=') === 0 ? ps_${entity.replace(noWhiteSpace, '_')} : ps_${id.replace(noWhiteSpace, '_')};

    const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);

    logger.debug('filename', filename);

    await this._makeScreenshotV2(this.page, ${this.path}/${filename}_main, null);

    jsonfile.writeFileSync(${this.path}/${filename}.json, details); this.paymentServices.links[this.paymentServices.step].filename = ${filename}.json;

    this.paymentServices.step++;

    if (this.paymentServices.step < this.paymentServices.items) await this._goto(this.paymentServices.links[this.paymentServices.step].href); else this.emit('processdone'); } } catch( err) { logger.error(err); this.emit('reover'); } }

/** *

  • @param selector

  • @returns {Promise} */ async grabLink(selector) { try{ const clickableLinks = await this.page.$$(selector);

    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    if (clickableLinks.length > 0) for (const item of clickableLinks) { const href = await this.page.evaluate(el => el.href, item); await this._randomWait(this.page, 3, 5); await this._goto(href, { 'waitUntil': 'networkidle0' }, true).catch((err) => { // log this error but Puppeteer isn't supposed to support this sort of download....

       logger.warn(err);
       // throw(Error(err));
     });
    

    } } catch (e) { // this._uploadError(); } }

/** *

  • @returns {Promise} */ async processEMoney() { logger.info('Process EMoney:'); await this._randomWait(this.page, 3, 5);
const filename = 'e-money_Institutions';

await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);

await this._findAndClick('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > h3:nth-child(5)');

await this._makeScreenshotV2(this.page, `${this.path}/${filename}_expanded`, null);

await this.grabLink('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > ul:nth-child(6) > li > a');

await this._randomWait(this.page, 3, 5);

this.mode++;
this.emit('startcredit');

}

/** *

  • @returns {Promise} */ async processNewPage() { // give the page a few seconds to settle
const pageUrl = url.parse(await this.page.url());

if (pageUrl.href === 'chrome-error://chromewebdata/') {
  logger.warn('Directed to: chrome-error://chromewebdata/');
  this.emit('recover');

  return;
}

await this._randomWait(this.page, 3, 5);

switch (pageUrl.pathname) {

  case '/database/ZahlInstInfo/':
    await this.buildIndex();
    break;

  case '/database/ZahlInstInfo/suche.do':
    await this.buildSubIndex();
    break;
  case '/database/ZahlInstInfo/zahlinst.do':
    await this.processEntity();
    break;
  case '/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html':
    await this.processEMoney();
    break;
  case '/database/InstInfo/sucheForm.do':
    await this.processCreditInstIndex();
    // build index of credit institutes.
    break;
  case '/database/InstInfo/institutDetails.do':
    await this.processCreditInstPage();
    // build index of credit institutes.
    break;
  default:

    await this._uploadError();
    throw new Error(`Unknown page: ${pageUrl}`);
    break;

}

}

/** *

  • @returns {Promise} */ async attachEvents() { this.on('startcredit', async () => { logger.info('Starting Credit Institutes'); await this._goto(this.credit); });
this.on('processdone', async () => {
  logger.warn('Payment Entities done', this.paymentServices.items);

  jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
  jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

  this.mode++;
  await this._randomWait(this.page, 5, 10);
  await this._goto(this.emoneyUrl);
});

this.on('subindexdone', async () => {
  logger.info('Sub Index done', this.paymentServices.items);
  logger.info(this.paymentServices.links[this.paymentServices.step].href);
  await this._goto(this.paymentServices.links[this.paymentServices.step].href);
});

this.on('indexdone', async () => {
  logger.info('Index done', this.index.items);
  logger.info(this.index.links[this.index.step].href);
  await this._goto(this.index.links[this.index.step].href);
});

this.on('ciindexdone', async () => {
  logger.info('CI Index done', this.creditServices.items);
  logger.info(this.creditServices.links[this.creditServices.step].href);

  const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
  await this._goto(newUrl);
});

this.on('creditinstdone', async () => {
  logger.debug('Credit Institutes done', this.paymentServices.items);

  jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
  jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
  this.mode++;
  await this._randomWait(this.page, 5, 10);
  this.emit('done');
});

this.on('nextsubindex', async () => {
  logger.debug(this.index.links[this.index.step].href);
  await this._goto(this.index.links[this.index.step].href);
});

}

/** *

  • @returns {Promise} */ async start() { super._start();
this.mode = 0;

try {
  await this._loadDictionary();

  this.index = {
    'items': 0,
    'links': [],
    'step': 0,
    'started': false,
    'done' : false
  };

  this.subIndex = {
    'items': 0,
    'links': [],
    'step': 0,
    'started': false,
    'done' : false
  };

  this.paymentServices = {
    'items': 0,
    'links': [],
    'step': 0,
    'visited': false,
    'done' : false
  };

  this.creditServices = {
    'items': 0,
    'links': [],
    'step': 0,
    'visited': false,
    'done' : false,
    'searchDone' : false,
    'started': false
  };

  this.startPage = 'https://portal.mvp.bafin.de/database/ZahlInstInfo/?locale=en_GB';
  this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
  this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';

  this.setPath(path.resolve(`${__dirname }/../artefacts/DE/BAFIN`));

  await this._doNonRepudiation().catch((err) => {
    logger.warn(err);
  });

  await this._initBrowser(true);
  await this._createBrowserPage();

  this.page.on('domcontentloaded', this._throttle(async () => {
    this.processNewPage().catch((err) => {
      logger.error('processNewPage fail', err);
    });
  }, 2500));

  if (this.eventNames().length === 2)
    await this.attachEvents();

  await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }).catch((err) => {
    logger.error(err);
  });

  await this.page.setViewport({ 'width': 1200, 'height': 800 });

  await this._goto(this.startPage, { 'waitUntil':'networkidle2' });

  await this._randomWait(this.page, 3, 5, 'Startup');
}
catch(e) {
  throw new Error(e);
}

}

async __run() { await this.start(); }

}

// The scraper class is the module's sole export.
module.exports = DEScrape;