obdfcascrape/ncas/de.js
Martin Donnelly be5d3eae07 init
2019-05-05 20:13:56 +01:00

598 lines
17 KiB
JavaScript

// version: 0.0.1-20
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('DE');
const url = require('url');
// Log verbosity comes from the LOGGER_LEVEL environment variable; falls back to 'warn' when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Scraper for the German (DE) BaFin registers.
 *
 * A run is driven by page loads (see `processNewPage`) and by the events wired
 * up in `attachEvents`. It walks through three stages:
 *  1. payment institutions (`/database/ZahlInstInfo/`): letter index, result
 *     lists, then one page per institution;
 *  2. the e-money institutions page (`emoneyUrl`);
 *  3. credit institutions (`/database/InstInfo/`): paged search results, then
 *     one page per institution.
 *
 * Methods prefixed with `_` that are not defined in this class are expected to
 * be provided by the `Scraper` base class.
 */
class DEScrape extends Scraper {
  /**
   * Sets the scraper id, registers the 'done' handler and builds the debounced
   * `run` entry point. In production a run is started automatically once
   * `_checkLock()` resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('DE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            this.run();
          }
        })
        .catch((err) => {
          // Without a handler a failed lock check would surface as an unhandled rejection.
          logger.error('Lock check failed', err);
        });
    }
  }

  /**
   * Collects the institution links from a payment institution result list
   * (`#zahlinst`) into `this.paymentServices.links`, then either requests the
   * next index page ('nextsubindex') or finishes the sub-index ('subindexdone').
   *
   * @returns {Promise<void>}
   */
  async buildSubIndex() {
    logger.info('Building sub-index...');

    // Read the query string explicitly: a DOM `document` is not a serializable
    // return value for page.evaluate.
    const search = await this.page.evaluate(() => window.location.search);
    const pageParams = this._getParamsFromUrl(search);
    const currentPageID = pageParams.nameZahlungsinstitut || '';

    await this._makeScreenshotV2(this.page, `${this.path}/menu_${currentPageID}`, null);
    await this._randomWait(this.page, 3, 5);

    const links = await this.page.$$('#zahlinst > tbody > tr a');
    for (const item of links) {
      const anchor = await this.page.evaluate(el => ({ 'text': el.innerText, 'href': el.href }), item);
      const id = anchor.text;
      // Parameters are taken from the link as found, before the locale is appended.
      const params = this._getParamsFromUrl(anchor.href);
      const href = anchor.href.concat('&locale=en_GB');

      if (id !== 'Found payment institutions:') {
        this.paymentServices.links.push({ id, href, params });
      }
    }

    this.index.step++;
    if (this.index.step < this.index.items) {
      this.emit('nextsubindex');
    }
    else {
      this.subIndex.done = true;
      this.paymentServices.items = this.paymentServices.links.length;
      this.emit('subindexdone');
    }
  }

  /**
   * Collects the letter-navigation links of the payment institution search
   * form into `this.index.links` and emits 'indexdone'.
   *
   * @returns {Promise<void>}
   */
  async buildIndex() {
    logger.info('Building the index...');
    await this._randomWait(this.page, 3, 5);

    const links = await this.page.$$('#suchform > div > div:nth-child(2) > div.navigationGruppeBuchstaben a');
    for (const item of links) {
      const anchor = await this.page.evaluate(el => ({ 'text': el.innerText, 'href': el.href }), item);
      this.index.links.push({ 'id': anchor.text, 'href': anchor.href.concat('&locale=en_GB') });
    }

    this.index.done = true;
    this.index.items = this.index.links.length;
    this.emit('indexdone');
  }

  /**
   * First visit to the credit institution search form: selects the
   * "credit institutions" category (English or German label) and submits the
   * search.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the wanted category option is not present, or when
   *   clicking the search button fails.
   */
  async initiateCreditIndex() {
    const wantedOption = ['Credit institutions (BA)', 'Kreditinstitute (BA)'];
    const options = await this.page.$$('#institutKategorie option');

    for (const item of options) {
      const option = await this.page.evaluate(el => ({ 'text': el.innerText, 'value': el.value }), item);
      if (wantedOption.includes(option.text)) {
        await this.page.select('#institutKategorie', option.value);
        this.creditServices.started = true;
        break;
      }
    }

    if (!this.creditServices.started) {
      throw new Error('Unable to initiate CI Search');
    }

    // Awaited so that a failed click rejects this method instead of becoming
    // an unhandled rejection.
    await this._findAndClick('#sucheButtonInstitut');
  }

  /**
   * Processes one page of credit institution search results (`#institut`):
   * stores every row whose second cell is a wanted row type in
   * `this.creditServices.links`, then clicks the 'Next' page link. When no
   * such link could be clicked the index is complete and 'ciindexdone' is
   * emitted.
   *
   * @returns {Promise<void>}
   */
  async processCreditInstIndexPage() {
    logger.info('Building CI sub-index...');
    const wantedRowType = ['CRR-Kreditinstitut'];

    const search = await this.page.evaluate(() => window.location.search);
    const body = await this.page.content();
    const $ = cheerio.load(body);

    const pageParams = this._getParamsFromUrl(search);
    const currentPageID = pageParams['d-4012550-p'] || '';
    await this._makeScreenshotV2(this.page, `${this.path}/credit_instititute_menu_${currentPageID}`, null);
    await this._randomWait(this.page, 7, 10);

    $('#institut tr').each((i, elm) => {
      const row = $(elm);
      // Only rows carrying a class attribute are considered.
      if (typeof row.attr('class') === 'undefined') {
        return;
      }

      const children = row.children();
      const rowType = children.eq(1).text();
      if (!wantedRowType.includes(rowType)) {
        return;
      }

      const nameCell = children.eq(0);
      const name = this._cleanUp(nameCell.text());
      const id = this._makeFieldName(name);
      const rawHref = nameCell.find('a').attr('href');
      if (typeof rawHref !== 'string') {
        // A wanted row without a link cannot be visited; skip it rather than abort the page.
        logger.warn(`No link found for credit institution row: ${name}`);
        return;
      }

      const params = this._getParamsFromUrl(rawHref);
      const href = rawHref.concat('&locale=en_GB');
      this.creditServices.links.push({ name, id, href, params });
    });

    const clicked = await this._findAndClick('.pagelinks a', 'Next');
    if (!clicked) {
      // come to the end of the index..
      this.creditServices.done = true;
      this.creditServices.items = this.creditServices.links.length;
      this.emit('ciindexdone');
    }
  }

  /**
   * Processes the credit institution page for the current
   * `this.creditServices.step`: extracts the details, writes a screenshot and
   * a JSON file, records the file names on the link entry and then moves on to
   * the next institution, or emits 'creditinstdone' after the last one.
   *
   * @returns {Promise<void>}
   */
  async processCreditInstPage() {
    const current = this.creditServices.links[this.creditServices.step];
    const id = current.id;
    const name = current.name;
    logger.info(`Process Credit Service entity ${this.creditServices.step} of ${this.creditServices.items} // ${name}`);

    await this._randomWait(this.page, 3, 5);
    const body = await this.page.content();
    const details = await this.extractPaymentEntity(body);
    const filename = this._entityFileName(id, details);

    // The full path (without extension) is capped at 240 characters.
    // NOTE(review): the stored `filename` is not capped, so it can differ from
    // the file on disk when the cap applies.
    const filePath = `${this.path}/${filename}`.substring(0, 240);
    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
    jsonfile.writeFileSync(`${filePath}.json`, details);

    current.filename = `${filename}.json`;
    current.filePath = `${filePath}`;

    this.creditServices.step++;
    if (this.creditServices.step < this.creditServices.items) {
      // Credit institution links are stored as found in the page markup and are
      // resolved against the InstInfo base URL here.
      const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
      await this._goto(newUrl);
    }
    else {
      this.emit('creditinstdone');
    }
  }

  /**
   * Entry point for the credit institution search form page: kicks off the
   * search on the first visit, processes a result page on later visits.
   *
   * @returns {Promise<void>}
   */
  async processCreditInstIndex() {
    logger.info('Building CI Index..');
    if (!this.creditServices.started) {
      await this.initiateCreditIndex();
    }
    else {
      await this.processCreditInstIndexPage();
    }
  }

  /**
   * Extracts the description lines and the permissions table (`#erlaubnis`)
   * from an institution page.
   *
   * Every permission row is returned twice: as found (`original`) and with the
   * service name passed through `_translate` (`translated`). The text from
   * ' (§' onwards is kept untouched in both.
   *
   * @param {string} html - markup of the institution page
   * @returns {Promise<{description: string[], permissions: {original: Array, translated: Array}}>}
   */
  async extractPaymentEntity(html) {
    const permissions = { 'original': [], 'translated': [] };
    const $ = cheerio.load(html);

    const description = $('#content > p')
      .text()
      .split(/\n/g)
      .filter(line => line.length > 0)
      .map(line => this._cleanUp(line.replace(/\t/g, '')).trim())
      .filter(line => line.length > 0);

    $('#erlaubnis > tbody tr').each((index, item) => {
      const cells = $(item).find('td');
      const service = $(cells.get(0)).text();
      const startAuth = $(cells.get(1)).text();
      const endAuth = $(cells.get(2)).text();

      // Only the part before ' (§' is translated; an empty translation keeps the original wording.
      const phrasing = service.split(' (§');
      const translated = this._translate(phrasing[0]);
      if (translated !== '') {
        phrasing[0] = translated;
      }

      const original = { service, startAuth, endAuth };
      const translation = { 'service': phrasing.join(' (§'), startAuth, endAuth };

      // A fourth cell, when present, is stored as the reason.
      if (cells.length === 4) {
        const reason = $(cells.get(3)).text();
        original.reason = reason;
        translation.reason = reason;
      }

      permissions.translated.push(translation);
      permissions.original.push(original);
    });

    return { description, permissions };
  }

  /**
   * Handles a payment institution page (`zahlinst.do`).
   *
   * While the sub-index is still being built the page itself is recorded as a
   * link (presumably an index link that led straight to a single institution
   * - confirm) and indexing continues. Afterwards the page is scraped:
   * details are extracted, a screenshot and a JSON file are written, and the
   * next institution is loaded or 'processdone' is emitted.
   *
   * @returns {Promise<void>}
   */
  async processEntity() {
    if (!this.subIndex.done) {
      // We should not be here quite yet, so add this to subindex;
      const location = await this.page.evaluate(() => ({
        'search': window.location.search,
        'href': window.location.href
      }));
      this.paymentServices.links.push({
        'id': location.search,
        'href': location.href.concat('&locale=en_GB')
      });

      this.index.step++;
      if (this.index.step < this.index.items) {
        this.emit('nextsubindex');
      }
      else {
        logger.info('Sub indexing done...');
        this.subIndex.done = true;
        this.paymentServices.items = this.paymentServices.links.length;
        this.emit('subindexdone');
      }

      return;
    }

    const current = this.paymentServices.links[this.paymentServices.step];
    const id = current.id;
    logger.info(`Process entity ${this.paymentServices.step} of ${this.paymentServices.items} // ${id}`);

    await this._randomWait(this.page, 3, 5);
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const details = await this.extractPaymentEntity(body);
    const filename = this._entityFileName(id, details);

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    jsonfile.writeFileSync(`${this.path}/${filename}.json`, details);
    current.filename = `${filename}.json`;

    this.paymentServices.step++;
    if (this.paymentServices.step < this.paymentServices.items) {
      await this._goto(this.paymentServices.links[this.paymentServices.step].href);
    }
    else {
      this.emit('processdone');
    }
  }

  /**
   * Builds the output file name for an institution.
   *
   * Link ids that are a raw query string (starting with '?id=') are replaced
   * by the accent-stripped first description line; any other id is used
   * directly. When the page yielded no description the id is used as a
   * fallback.
   *
   * @param {string} id - id stored on the link entry
   * @param {{description: string[]}} details - result of `extractPaymentEntity`
   * @returns {string} file name without extension, as produced by `_makeFileName`
   */
  _entityFileName(id, details) {
    const firstLine = details.description[0];
    const entity = typeof firstLine === 'string' ? removeAccents.remove(firstLine.trim()) : '';
    if (entity === '') {
      logger.warn(`No description found for ${id}`);
    }

    const useEntity = id.indexOf('?id=') === 0 && entity !== '';
    const filename = this._makeFileName(useEntity ? entity : id);
    logger.debug('filename', filename);

    return filename;
  }

  /**
   * Visits every link matched by `selector` with downloads allowed into
   * `this.path`. This is best effort: failures are logged, never thrown.
   *
   * @param {string} selector - CSS selector matching the download links
   * @returns {Promise<void>}
   */
  async grabLink(selector) {
    try {
      const clickableLinks = await this.page.$$(selector);
      await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

      for (const item of clickableLinks) {
        const href = await this.page.evaluate(el => el.href, item);
        await this._randomWait(this.page, 3, 5);
        await this._goto(href, { 'waitUntil': 'networkidle0' }, true).catch((err) => {
          // log this error but Puppeteer isn't supposed to support this sort of download....
          logger.warn(err);
        });
      }
    }
    catch (e) {
      // Keep the scrape going, but do not hide the failure.
      logger.warn('grabLink failed', e);
    }
  }

  /**
   * Handles the e-money institutions page: takes a screenshot, expands the
   * related section, takes a second screenshot, fetches the linked files and
   * then starts the credit institution stage ('startcredit').
   *
   * @returns {Promise<void>}
   */
  async processEMoney() {
    logger.info('Process EMoney:');
    await this._randomWait(this.page, 3, 5);

    const filename = 'e-money_Institutions';
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._findAndClick('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > h3:nth-child(5)');
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_expanded`, null);
    await this.grabLink('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > ul:nth-child(6) > li > a');
    await this._randomWait(this.page, 3, 5);

    this.mode++;
    this.emit('startcredit');
  }

  /**
   * Routes a freshly loaded page to its handler, based on the URL path.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page path is not one this scraper knows about
   */
  async processNewPage() {
    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    switch (pageUrl.pathname) {
      case '/database/ZahlInstInfo/':
        await this.buildIndex();
        break;
      case '/database/ZahlInstInfo/suche.do':
        await this.buildSubIndex();
        break;
      case '/database/ZahlInstInfo/zahlinst.do':
        await this.processEntity();
        break;
      case '/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html':
        await this.processEMoney();
        break;
      case '/database/InstInfo/sucheForm.do':
        // build index of credit institutes.
        await this.processCreditInstIndex();
        break;
      case '/database/InstInfo/institutDetails.do':
        // process a single credit institute.
        await this.processCreditInstPage();
        break;
      default:
        await this._uploadError();
        // Use the URL string: interpolating the parsed object would print "[object Object]".
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Wires up the events that move the scrape from one stage to the next.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    this.on('startcredit', async () => {
      logger.info('Starting Credit Institutes');
      await this._goto(this.credit);
    });

    this.on('processdone', async () => {
      logger.warn('Payment Entities done', this.paymentServices.items);
      jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
      this.mode++;
      await this._randomWait(this.page, 5, 10);
      await this._goto(this.emoneyUrl);
    });

    this.on('subindexdone', async () => {
      logger.info('Sub Index done', this.paymentServices.items);
      logger.info(this.paymentServices.links[this.paymentServices.step].href);
      await this._goto(this.paymentServices.links[this.paymentServices.step].href);
    });

    this.on('indexdone', async () => {
      logger.info('Index done', this.index.items);
      logger.info(this.index.links[this.index.step].href);
      await this._goto(this.index.links[this.index.step].href);
    });

    this.on('ciindexdone', async () => {
      logger.info('CI Index done', this.creditServices.items);
      logger.info(this.creditServices.links[this.creditServices.step].href);
      const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
      await this._goto(newUrl);
    });

    this.on('creditinstdone', async () => {
      // Report the credit institution count (this used to log the payment services count).
      logger.debug('Credit Institutes done', this.creditServices.items);
      jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
      this.mode++;
      await this._randomWait(this.page, 5, 10);
      this.emit('done');
    });

    this.on('nextsubindex', async () => {
      logger.debug(this.index.links[this.index.step].href);
      await this._goto(this.index.links[this.index.step].href);
    });
  }

  /**
   * Resets the run state, prepares the browser page and loads the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure during start-up
   */
  async start() {
    super._start();
    this.mode = 0;

    try {
      await this._loadDictionary();

      this.index = {
        'items': 0,
        'links': [],
        'step': 0,
        'started': false,
        'done': false
      };
      this.subIndex = {
        'items': 0,
        'links': [],
        'step': 0,
        'started': false,
        'done': false
      };
      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done': false
      };
      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false
      };

      this.startPage = 'https://portal.mvp.bafin.de/database/ZahlInstInfo/?locale=en_GB';
      this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
      this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';

      this.setPath(path.resolve(`${__dirname }/../artefacts/DE/BAFIN`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      // Every page load is routed through processNewPage; failures are logged here
      // because nothing awaits the throttled callback.
      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): the stage events are attached only while exactly two event
      // names are registered - presumably the state before the first run, so that
      // a later run does not attach them twice. Confirm against Scraper.
      if (this.eventNames().length === 2) {
        await this.attachEvents();
      }

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true }).catch((err) => {
        logger.error(err);
      });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle2' });
      await this._randomWait(this.page, 3, 5, 'Startup');
    }
    catch (e) {
      // Rethrow the original error so its stack and type survive; only wrap non-Error values.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Body of the debounced `run` entry point built in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// The module's only export is the scraper class itself.
module.exports = DEScrape;