598 lines
17 KiB
JavaScript
598 lines
17 KiB
JavaScript
|
// version: 0.0.1-20
|
||
|
|
||
|
const Scraper = require('../helpers/scraper');
|
||
|
const cheerio = require('cheerio');
|
||
|
const path = require('path');
|
||
|
const jsonfile = require('jsonfile');
|
||
|
const removeAccents = require('remove-accents-diacritics');
|
||
|
const logger = require('log4js').getLogger('DE');
|
||
|
const url = require('url');
|
||
|
|
||
|
// Log verbosity is taken from the LOGGER_LEVEL environment variable and
// falls back to 'warn' when that variable is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||
|
|
||
|
class DEScrape extends Scraper {
|
||
|
|
||
|
constructor() {
|
||
|
super();
|
||
|
this.setID('DE');
|
||
|
|
||
|
this.on('done', () => {
|
||
|
this._done();
|
||
|
});
|
||
|
|
||
|
this.run = this._debounce(async () => {
|
||
|
await this.__run();
|
||
|
}, 5000);
|
||
|
|
||
|
if (process.env.NODE_ENV === 'production')
|
||
|
this._checkLock().then((l) => {
|
||
|
if(l)
|
||
|
this.run();
|
||
|
});
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async buildSubIndex() {
|
||
|
logger.info('Building sub-index...');
|
||
|
|
||
|
const currentPage = await this.page.evaluate(() => document);
|
||
|
|
||
|
const search = currentPage.location.search;
|
||
|
const params = this._getParamsFromUrl(search);
|
||
|
|
||
|
const currentPageID = params.nameZahlungsinstitut || '';
|
||
|
|
||
|
await this._makeScreenshotV2(this.page, `${this.path}/menu_${currentPageID}`, null);
|
||
|
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
const links = await this.page.$$('#zahlinst > tbody > tr a');
|
||
|
|
||
|
for (const item of links) {
|
||
|
const id = await this.page.evaluate(el => el.innerText, item);
|
||
|
let href = await this.page.evaluate(el => el.href, item);
|
||
|
const params = this._getParamsFromUrl(href);
|
||
|
|
||
|
href = href.concat('&locale=en_GB');
|
||
|
|
||
|
if (id !== 'Found payment institutions:')
|
||
|
this.paymentServices.links.push({ id, href, params });
|
||
|
}
|
||
|
|
||
|
this.index.step++;
|
||
|
|
||
|
if (this.index.step < this.index.items)
|
||
|
this.emit('nextsubindex');
|
||
|
else {
|
||
|
this.subIndex.done = true;
|
||
|
this.paymentServices.items = this.paymentServices.links.length;
|
||
|
this.emit('subindexdone');
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async buildIndex() {
|
||
|
logger.info('Building the index...');
|
||
|
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
const links = await this.page.$$('#suchform > div > div:nth-child(2) > div.navigationGruppeBuchstaben a');
|
||
|
|
||
|
for (const item of links) {
|
||
|
const id = await this.page.evaluate(el => el.innerText, item);
|
||
|
let href = await this.page.evaluate(el => el.href, item);
|
||
|
|
||
|
href = href.concat('&locale=en_GB');
|
||
|
|
||
|
this.index.links.push({ id, href });
|
||
|
}
|
||
|
|
||
|
this.index.done = true;
|
||
|
this.index.items = this.index.links.length;
|
||
|
|
||
|
this.emit('indexdone');
|
||
|
}
|
||
|
|
||
|
async initiateCreditIndex() {
|
||
|
// first time around.
|
||
|
// need to kick off the index correctly..
|
||
|
|
||
|
const options = await this.page.$$('#institutKategorie option');
|
||
|
const wantedOption = ['Credit institutions (BA)', 'Kreditinstitute (BA)'];
|
||
|
for (const item of options) {
|
||
|
const text = await this.page.evaluate(el => el.innerText, item);
|
||
|
const value = await this.page.evaluate(el => el.value, item);
|
||
|
|
||
|
if (wantedOption.indexOf(text) !== -1) {
|
||
|
await this.page.select('#institutKategorie', value);
|
||
|
this.creditServices.started = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (this.creditServices.started)
|
||
|
this._findAndClick('#sucheButtonInstitut');
|
||
|
else
|
||
|
throw new Error('Unable to initiate CI Search');
|
||
|
}
|
||
|
|
||
|
async processCreditInstIndexPage() {
|
||
|
const noWhiteSpace = /\W/g;
|
||
|
logger.info('Building CI sub-index...');
|
||
|
|
||
|
const wantedRowType = ['CRR-Kreditinstitut'];
|
||
|
const currentPage = await this.page.evaluate(() => document);
|
||
|
const body = await this.page.content();
|
||
|
const $ = cheerio.load(body);
|
||
|
|
||
|
const search = currentPage.location.search;
|
||
|
const params = this._getParamsFromUrl(search);
|
||
|
|
||
|
const currentPageID = params['d-4012550-p'] || '';
|
||
|
|
||
|
await this._makeScreenshotV2(this.page, `${this.path}/credit_instititute_menu_${currentPageID}`, null);
|
||
|
|
||
|
await this._randomWait(this.page, 7, 10);
|
||
|
|
||
|
const rows = $('#institut tr');
|
||
|
|
||
|
rows.each((i, elm) => {
|
||
|
const rowClass = cheerio(elm).attr('class');
|
||
|
|
||
|
if (typeof(rowClass) !== 'undefined') {
|
||
|
const children = cheerio(elm).children();
|
||
|
|
||
|
const rowType = children.eq(1).text();
|
||
|
|
||
|
if (wantedRowType.indexOf(rowType) !== -1) {
|
||
|
const name = this._cleanUp(children.eq(0).text());
|
||
|
const id = this._makeFieldName(name);
|
||
|
let href = cheerio(children.eq(0)).find('a').attr('href');
|
||
|
const params = this._getParamsFromUrl(href);
|
||
|
href = href.concat('&locale=en_GB');
|
||
|
|
||
|
// this is the one we want.
|
||
|
|
||
|
this.creditServices.links.push({ name, id, href, params });
|
||
|
}
|
||
|
}
|
||
|
});
|
||
|
|
||
|
const clicked = await this._findAndClick('.pagelinks a', 'Next');
|
||
|
if (!clicked) {
|
||
|
// come to the end of the index..
|
||
|
|
||
|
this.creditServices.done = true;
|
||
|
this.creditServices.items = this.creditServices.links.length;
|
||
|
|
||
|
this.emit('ciindexdone');
|
||
|
}
|
||
|
}
|
||
|
|
||
|
async processCreditInstPage() {
|
||
|
const noWhiteSpace = /\W/g;
|
||
|
|
||
|
const id = this.creditServices.links[this.creditServices.step].id;
|
||
|
const name = this.creditServices.links[this.creditServices.step].name;
|
||
|
logger.info(`Process Credit Service entity ${this.creditServices.step} of ${this.creditServices.items} // ${name}`);
|
||
|
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
const body = await this.page.content();
|
||
|
|
||
|
const details = await this.extractPaymentEntity(body);
|
||
|
|
||
|
const entity = removeAccents.remove(details.description[0].trim());
|
||
|
|
||
|
const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);
|
||
|
|
||
|
logger.debug('filename', filename);
|
||
|
|
||
|
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||
|
|
||
|
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||
|
|
||
|
jsonfile.writeFileSync(`${filePath}.json`, details);
|
||
|
|
||
|
this.creditServices.links[this.creditServices.step].filename = `${filename}.json`;
|
||
|
this.creditServices.links[this.creditServices.step].filePath = `${filePath}`;
|
||
|
this.creditServices.step++;
|
||
|
|
||
|
if (this.creditServices.step < this.creditServices.items) {
|
||
|
const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
|
||
|
|
||
|
await this._goto(newUrl);
|
||
|
}
|
||
|
else
|
||
|
this.emit('creditinstdone');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async processCreditInstIndex() {
|
||
|
logger.info('Building CI Index..');
|
||
|
|
||
|
if (!this.creditServices.started)
|
||
|
await this.initiateCreditIndex();
|
||
|
else
|
||
|
await this.processCreditInstIndexPage();
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @param html
|
||
|
* @returns {Promise<{description: T[] | jQuery, permissions: {original: Array, translated: Array}}>}
|
||
|
*/
|
||
|
async extractPaymentEntity(html) {
|
||
|
const permissions = { 'original':[], 'translated':[] };
|
||
|
|
||
|
const newLine = /\n/g;
|
||
|
const $ = cheerio.load(html);
|
||
|
|
||
|
let description = $('#content > p').text().split(newLine).filter(line => line.length > 0);
|
||
|
|
||
|
description = description.map((i) => {
|
||
|
return this._cleanUp(i.replace(/\t/g, '')).trim();
|
||
|
});
|
||
|
|
||
|
description = description.filter(item => item.length > 0);
|
||
|
|
||
|
const rows = $('#erlaubnis > tbody tr');
|
||
|
|
||
|
rows.each((index, item) => {
|
||
|
const cells = $(item).find('td');
|
||
|
|
||
|
const service = $(cells.get(0)).text();
|
||
|
const startAuth = $(cells.get(1)).text();
|
||
|
const endAuth = $(cells.get(2)).text();
|
||
|
|
||
|
const reason = (cells.length === 4) ? $(cells.get(3)).text() : '';
|
||
|
|
||
|
const phrasing = service.split(' (§');
|
||
|
const translated = this._translate(phrasing[0]);
|
||
|
|
||
|
phrasing[0] = (translated !== '') ? translated : phrasing[0];
|
||
|
|
||
|
const newObjTrans = {
|
||
|
'service': phrasing.join(' (§'),
|
||
|
startAuth,
|
||
|
endAuth
|
||
|
};
|
||
|
|
||
|
const newObj = {
|
||
|
service,
|
||
|
startAuth,
|
||
|
endAuth
|
||
|
};
|
||
|
|
||
|
if (cells.length === 4) {
|
||
|
newObj.reason = reason;
|
||
|
newObjTrans.reason = reason;
|
||
|
}
|
||
|
|
||
|
permissions.translated.push(newObjTrans);
|
||
|
|
||
|
permissions.original.push(newObj);
|
||
|
});
|
||
|
|
||
|
return { description, permissions };
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async processEntity() {
|
||
|
const noWhiteSpace = /\W/g;
|
||
|
if (!this.subIndex.done) {
|
||
|
// We should not be here quite yet, so add this to subindex;
|
||
|
const currentPage = await this.page.evaluate(() => document);
|
||
|
|
||
|
const location = currentPage.location;
|
||
|
const id = location.search;
|
||
|
let href = location.href;
|
||
|
href = href.concat('&locale=en_GB');
|
||
|
|
||
|
this.paymentServices.links.push({ id, href });
|
||
|
|
||
|
this.index.step++;
|
||
|
|
||
|
if (this.index.step < this.index.items)
|
||
|
this.emit('nextsubindex');
|
||
|
else {
|
||
|
logger.info('Sub indexing done...');
|
||
|
this.subIndex.done = true;
|
||
|
this.paymentServices.items = this.paymentServices.links.length;
|
||
|
this.emit('subindexdone');
|
||
|
}
|
||
|
}
|
||
|
else {
|
||
|
const id = this.paymentServices.links[this.paymentServices.step].id;
|
||
|
// logger.info('Process entity:', id);
|
||
|
logger.info(`Process entity ${this.paymentServices.step} of ${this.paymentServices.items} // ${id}`);
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
|
||
|
|
||
|
const details = await this.extractPaymentEntity(body);
|
||
|
|
||
|
const entity = removeAccents.remove(details.description[0].trim());
|
||
|
|
||
|
// const filename = id.indexOf('?id=') === 0 ? `ps_${entity.replace(noWhiteSpace, '_')}` : `ps_${id.replace(noWhiteSpace, '_')}`;
|
||
|
|
||
|
const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);
|
||
|
|
||
|
logger.debug('filename', filename);
|
||
|
|
||
|
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
||
|
|
||
|
jsonfile.writeFileSync(`${this.path}/${filename}.json`, details);
|
||
|
this.paymentServices.links[this.paymentServices.step].filename = `${filename}.json`;
|
||
|
|
||
|
this.paymentServices.step++;
|
||
|
|
||
|
if (this.paymentServices.step < this.paymentServices.items)
|
||
|
await this._goto(this.paymentServices.links[this.paymentServices.step].href);
|
||
|
else
|
||
|
this.emit('processdone');
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @param selector
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async grabLink(selector) {
|
||
|
try{
|
||
|
const clickableLinks = await this.page.$$(selector);
|
||
|
|
||
|
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||
|
|
||
|
if (clickableLinks.length > 0)
|
||
|
for (const item of clickableLinks) {
|
||
|
const href = await this.page.evaluate(el => el.href, item);
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
await this._goto(href, { 'waitUntil': 'networkidle0' }, true).catch((err) => {
|
||
|
// log this error but Puppeteer isn't supposed to support this sort of download....
|
||
|
|
||
|
logger.warn(err);
|
||
|
// throw(Error(err));
|
||
|
});
|
||
|
}
|
||
|
}
|
||
|
catch (e) {
|
||
|
// this._uploadError();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async processEMoney() {
|
||
|
logger.info('Process EMoney:');
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
const filename = 'e-money_Institutions';
|
||
|
|
||
|
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
||
|
|
||
|
await this._findAndClick('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > h3:nth-child(5)');
|
||
|
|
||
|
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_expanded`, null);
|
||
|
|
||
|
await this.grabLink('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > ul:nth-child(6) > li > a');
|
||
|
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
this.mode++;
|
||
|
this.emit('startcredit');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async processNewPage() {
|
||
|
// give the page a few seconds to settle
|
||
|
|
||
|
const pageUrl = url.parse(await this.page.url());
|
||
|
|
||
|
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||
|
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||
|
this.emit('recover');
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
switch (pageUrl.pathname) {
|
||
|
|
||
|
case '/database/ZahlInstInfo/':
|
||
|
await this.buildIndex();
|
||
|
break;
|
||
|
|
||
|
case '/database/ZahlInstInfo/suche.do':
|
||
|
await this.buildSubIndex();
|
||
|
break;
|
||
|
case '/database/ZahlInstInfo/zahlinst.do':
|
||
|
await this.processEntity();
|
||
|
break;
|
||
|
case '/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html':
|
||
|
await this.processEMoney();
|
||
|
break;
|
||
|
case '/database/InstInfo/sucheForm.do':
|
||
|
await this.processCreditInstIndex();
|
||
|
// build index of credit institutes.
|
||
|
break;
|
||
|
case '/database/InstInfo/institutDetails.do':
|
||
|
await this.processCreditInstPage();
|
||
|
// build index of credit institutes.
|
||
|
break;
|
||
|
default:
|
||
|
|
||
|
await this._uploadError();
|
||
|
throw new Error(`Unknown page: ${pageUrl}`);
|
||
|
break;
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async attachEvents() {
|
||
|
this.on('startcredit', async function() {
|
||
|
logger.info('Starting Credit Institutes');
|
||
|
await this._goto(this.credit);
|
||
|
});
|
||
|
|
||
|
this.on('processdone', async function() {
|
||
|
logger.warn('Payment Entities done', this.paymentServices.items);
|
||
|
|
||
|
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||
|
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||
|
|
||
|
this.mode++;
|
||
|
await this._randomWait(this.page, 5, 10);
|
||
|
await this._goto(this.emoneyUrl);
|
||
|
});
|
||
|
|
||
|
this.on('subindexdone', async function() {
|
||
|
logger.info('Sub Index done', this.paymentServices.items);
|
||
|
logger.info(this.paymentServices.links[this.paymentServices.step].href);
|
||
|
await this._goto(this.paymentServices.links[this.paymentServices.step].href);
|
||
|
});
|
||
|
|
||
|
this.on('indexdone', async function() {
|
||
|
logger.info('Index done', this.index.items);
|
||
|
logger.info(this.index.links[this.index.step].href);
|
||
|
await this._goto(this.index.links[this.index.step].href);
|
||
|
});
|
||
|
|
||
|
this.on('ciindexdone', async function() {
|
||
|
logger.info('CI Index done', this.creditServices.items);
|
||
|
logger.info(this.creditServices.links[this.creditServices.step].href);
|
||
|
|
||
|
const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
|
||
|
await this._goto(newUrl);
|
||
|
});
|
||
|
|
||
|
this.on('creditinstdone', async function() {
|
||
|
logger.debug('Credit Institutes done', this.paymentServices.items);
|
||
|
|
||
|
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||
|
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||
|
this.mode++;
|
||
|
await this._randomWait(this.page, 5, 10);
|
||
|
this.emit('done');
|
||
|
});
|
||
|
|
||
|
this.on('nextsubindex', async function() {
|
||
|
logger.debug(this.index.links[this.index.step].href);
|
||
|
await this._goto(this.index.links[this.index.step].href);
|
||
|
});
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async start() {
|
||
|
super._start();
|
||
|
|
||
|
this.mode = 0;
|
||
|
|
||
|
try {
|
||
|
await this._loadDictionary();
|
||
|
|
||
|
this.index = {
|
||
|
'items': 0,
|
||
|
'links': [],
|
||
|
'step': 0,
|
||
|
'started': false,
|
||
|
'done' : false
|
||
|
};
|
||
|
|
||
|
this.subIndex = {
|
||
|
'items': 0,
|
||
|
'links': [],
|
||
|
'step': 0,
|
||
|
'started': false,
|
||
|
'done' : false
|
||
|
};
|
||
|
|
||
|
this.paymentServices = {
|
||
|
'items': 0,
|
||
|
'links': [],
|
||
|
'step': 0,
|
||
|
'visited': false,
|
||
|
'done' : false
|
||
|
};
|
||
|
|
||
|
this.creditServices = {
|
||
|
'items': 0,
|
||
|
'links': [],
|
||
|
'step': 0,
|
||
|
'visited': false,
|
||
|
'done' : false,
|
||
|
'searchDone' : false,
|
||
|
'started': false
|
||
|
};
|
||
|
|
||
|
this.startPage = 'https://portal.mvp.bafin.de/database/ZahlInstInfo/?locale=en_GB';
|
||
|
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
|
||
|
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
|
||
|
|
||
|
this.setPath(path.resolve(`${__dirname }/../artefacts/DE/BAFIN`));
|
||
|
|
||
|
await this._doNonRepudiation().catch((err) => {
|
||
|
logger.warn(err);
|
||
|
});
|
||
|
|
||
|
await this._initBrowser(true);
|
||
|
await this._createBrowserPage();
|
||
|
|
||
|
this.page.on('domcontentloaded', this._throttle(async () => {
|
||
|
this.processNewPage().catch((err) => {
|
||
|
logger.error('processNewPage fail', err);
|
||
|
});
|
||
|
}, 2500));
|
||
|
|
||
|
if (this.eventNames().length === 2)
|
||
|
await this.attachEvents();
|
||
|
|
||
|
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }).catch((err) => {
|
||
|
logger.error(err);
|
||
|
});
|
||
|
|
||
|
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||
|
|
||
|
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
|
||
|
|
||
|
await this._randomWait(this.page, 3, 5, 'Startup');
|
||
|
}
|
||
|
catch(e) {
|
||
|
throw new Error(e);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
async __run() {
|
||
|
await this.start();
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
// The scraper class is this module's only export.
module.exports = DEScrape;
|