// version: 0.0.1-20
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('DE');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the German (DE) registers hosted on bafin.de.
 *
 * The crawl is driven by page loads: every `domcontentloaded` event is routed
 * through `processNewPage`, which picks a handler from the URL path. Handlers
 * either navigate on to the next page or emit an event (see `attachEvents`)
 * that moves the crawl to its next phase:
 * payment institutions -> e-money institutions -> credit institutions.
 */
class DEScrape extends Scraper {
  constructor() {
    super();
    this.setID('DE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    // In production the scrape starts itself once the lock check resolves truthy.
    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((ok) => {
          if (ok) this.run();
        })
        .catch((err) => {
          // Without this handler a failed lock check is an unhandled rejection.
          logger.error('Lock check failed', err);
        });
    }
  }

  /**
   * Handles one payment-institution result page (`suche.do`): collects every
   * entity link in the `#zahlinst` table into `this.paymentServices.links`,
   * then either requests the next index page or finishes the sub-index.
   *
   * Emits `nextsubindex` while index pages remain, otherwise `subindexdone`.
   *
   * @returns {Promise<void>}
   */
  async buildSubIndex() {
    logger.info('Building sub-index...');

    // Return a plain string: a DOM `document` is not a serializable evaluate() result.
    const search = await this.page.evaluate(() => window.location.search);
    const pageParams = this._getParamsFromUrl(search);
    const currentPageID = pageParams.nameZahlungsinstitut || '';

    await this._makeScreenshotV2(this.page, `${this.path}/menu_${currentPageID}`, null);
    await this._randomWait(this.page, 3, 5);

    const links = await this.page.$$('#zahlinst > tbody > tr a');

    for (const item of links) {
      const id = await this.page.evaluate(el => el.innerText, item);
      const rawHref = await this.page.evaluate(el => el.href, item);
      // Parameters are parsed from the link before the locale is appended.
      const params = this._getParamsFromUrl(rawHref);
      const href = rawHref.concat('&locale=en_GB');

      // The table caption link is not an entity.
      if (id !== 'Found payment institutions:') {
        this.paymentServices.links.push({ id, href, params });
      }
    }

    this.index.step++;

    if (this.index.step < this.index.items) {
      this.emit('nextsubindex');
    } else {
      this.subIndex.done = true;
      this.paymentServices.items = this.paymentServices.links.length;
      this.emit('subindexdone');
    }
  }

  /**
   * Handles the payment-institution start page: collects the letter-navigation
   * links into `this.index.links` and emits `indexdone`.
   *
   * @returns {Promise<void>}
   */
  async buildIndex() {
    logger.info('Building the index...');
    await this._randomWait(this.page, 3, 5);

    const links = await this.page.$$('#suchform > div > div:nth-child(2) > div.navigationGruppeBuchstaben a');

    for (const item of links) {
      const id = await this.page.evaluate(el => el.innerText, item);
      const rawHref = await this.page.evaluate(el => el.href, item);

      this.index.links.push({ id, 'href': rawHref.concat('&locale=en_GB') });
    }

    this.index.done = true;
    this.index.items = this.index.links.length;
    this.emit('indexdone');
  }

  /**
   * First visit to the credit-institution search form: selects the
   * credit-institution category (English or German label) and submits the search.
   *
   * @returns {Promise<void>}
   * @throws {Error} when no matching category option is present on the form.
   */
  async initiateCreditIndex() {
    const wantedOption = ['Credit institutions (BA)', 'Kreditinstitute (BA)'];
    const options = await this.page.$$('#institutKategorie option');

    for (const item of options) {
      const text = await this.page.evaluate(el => el.innerText, item);
      const value = await this.page.evaluate(el => el.value, item);

      if (wantedOption.includes(text)) {
        await this.page.select('#institutKategorie', value);
        this.creditServices.started = true;
        break;
      }
    }

    if (!this.creditServices.started) throw new Error('Unable to initiate CI Search');

    // Awaited so a failing click surfaces through processNewPage instead of floating.
    await this._findAndClick('#sucheButtonInstitut');
  }

  /**
   * Handles one page of credit-institution search results: records every row
   * of type `CRR-Kreditinstitut` in `this.creditServices.links`, then clicks
   * the "Next" pager link. When no such link can be clicked the index is
   * complete and `ciindexdone` is emitted.
   *
   * @returns {Promise<void>}
   */
  async processCreditInstIndexPage() {
    logger.info('Building CI sub-index...');

    const wantedRowType = ['CRR-Kreditinstitut'];

    // Return a plain string: a DOM `document` is not a serializable evaluate() result.
    const search = await this.page.evaluate(() => window.location.search);
    const body = await this.page.content();
    const $ = cheerio.load(body);

    const pageParams = this._getParamsFromUrl(search);
    // NOTE(review): 'd-4012550-p' looks like the pager's page-number parameter - confirm.
    const currentPageID = pageParams['d-4012550-p'] || '';

    await this._makeScreenshotV2(this.page, `${this.path}/credit_instititute_menu_${currentPageID}`, null);
    await this._randomWait(this.page, 7, 10);

    $('#institut tr').each((i, elm) => {
      const row = $(elm);

      // Rows without a class attribute are skipped.
      if (typeof row.attr('class') === 'undefined') return;

      const cells = row.children();

      if (!wantedRowType.includes(cells.eq(1).text())) return;

      const firstCell = cells.eq(0);
      const name = this._cleanUp(firstCell.text());
      const id = this._makeFieldName(name);
      const rawHref = firstCell.find('a').attr('href');
      const params = this._getParamsFromUrl(rawHref);
      const href = rawHref.concat('&locale=en_GB');

      this.creditServices.links.push({ name, id, href, params });
    });

    const clicked = await this._findAndClick('.pagelinks a', 'Next');

    if (!clicked) {
      // Reached the end of the index.
      this.creditServices.done = true;
      this.creditServices.items = this.creditServices.links.length;
      this.emit('ciindexdone');
    }
  }

  /**
   * Handles one credit-institution details page: extracts the entity, writes a
   * screenshot and a JSON file, records the output names on the current link
   * and navigates to the next link. Emits `creditinstdone` after the last one.
   *
   * @returns {Promise<void>}
   */
  async processCreditInstPage() {
    const current = this.creditServices.links[this.creditServices.step];
    const { id, name } = current;

    logger.info(`Process Credit Service entity ${this.creditServices.step} of ${this.creditServices.items} // ${name}`);
    await this._randomWait(this.page, 3, 5);

    const body = await this.page.content();
    const details = await this.extractPaymentEntity(body);
    const entity = removeAccents.remove(details.description[0].trim());

    // Ids that are a bare query string make poor file names; use the entity name instead.
    const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);

    logger.debug('filename', filename);

    // Cap the path at 240 characters.
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
    jsonfile.writeFileSync(`${filePath}.json`, details);

    current.filename = `${filename}.json`;
    current.filePath = `${filePath}`;

    this.creditServices.step++;

    if (this.creditServices.step < this.creditServices.items) {
      // Collected hrefs are relative to the InstInfo application root.
      const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;

      await this._goto(newUrl);
    } else {
      this.emit('creditinstdone');
    }
  }

  /**
   * Entry point for the credit-institution search form: starts the search on
   * the first visit and processes a result page on every later visit.
   *
   * @returns {Promise<void>}
   */
  async processCreditInstIndex() {
    logger.info('Building CI Index..');

    if (!this.creditServices.started) await this.initiateCreditIndex();
    else await this.processCreditInstIndexPage();
  }

  /**
   * Extracts the description lines and the permissions table from an entity page.
   *
   * Each permission is returned twice: as found on the page (`original`) and
   * with the part of the service name before ` (§` passed through
   * `this._translate` (`translated`). When `_translate` returns an empty
   * string the original wording is kept.
   *
   * @param {string} html - full HTML of the entity page.
   * @returns {Promise<{description: string[], permissions: {original: Object[], translated: Object[]}}>}
   */
  async extractPaymentEntity(html) {
    const permissions = { 'original': [], 'translated': [] };
    const newLine = /\n/g;
    const $ = cheerio.load(html);

    // One entry per non-empty line of the intro paragraphs, tabs stripped.
    const description = $('#content > p')
      .text()
      .split(newLine)
      .filter(line => line.length > 0)
      .map(line => this._cleanUp(line.replace(/\t/g, '')).trim())
      .filter(line => line.length > 0);

    $('#erlaubnis > tbody tr').each((index, item) => {
      const cells = $(item).find('td');
      const service = $(cells.get(0)).text();
      const startAuth = $(cells.get(1)).text();
      const endAuth = $(cells.get(2)).text();
      // A fourth column, when present, carries the reason.
      const hasReason = cells.length === 4;
      const reason = hasReason ? $(cells.get(3)).text() : '';

      const phrasing = service.split(' (§');
      const translated = this._translate(phrasing[0]);

      if (translated !== '') phrasing[0] = translated;

      const newObjTrans = { 'service': phrasing.join(' (§'), startAuth, endAuth };
      const newObj = { service, startAuth, endAuth };

      if (hasReason) {
        newObj.reason = reason;
        newObjTrans.reason = reason;
      }

      permissions.translated.push(newObjTrans);
      permissions.original.push(newObj);
    });

    return { description, permissions };
  }

  /**
   * Handles a payment-institution entity page (`zahlinst.do`).
   *
   * While the sub-index is still being built, landing here means an index link
   * led straight to an entity; the page itself is added to the entity list and
   * the index advances. Once the sub-index is done the entity is extracted,
   * written to disk, and the next entity is loaded (or `processdone` emitted).
   *
   * @returns {Promise<void>}
   */
  async processEntity() {
    if (!this.subIndex.done) {
      // We should not be here quite yet, so add this page to the sub-index.
      // Plain strings are returned: a DOM `document` is not a serializable result.
      const location = await this.page.evaluate(() => ({
        'search': window.location.search,
        'href': window.location.href
      }));
      const id = location.search;
      const href = location.href.concat('&locale=en_GB');

      this.paymentServices.links.push({ id, href });
      this.index.step++;

      if (this.index.step < this.index.items) {
        this.emit('nextsubindex');
      } else {
        logger.info('Sub indexing done...');
        this.subIndex.done = true;
        this.paymentServices.items = this.paymentServices.links.length;
        this.emit('subindexdone');
      }

      return;
    }

    const current = this.paymentServices.links[this.paymentServices.step];
    const id = current.id;

    logger.info(`Process entity ${this.paymentServices.step} of ${this.paymentServices.items} // ${id}`);
    await this._randomWait(this.page, 3, 5);

    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const details = await this.extractPaymentEntity(body);
    const entity = removeAccents.remove(details.description[0].trim());

    // Ids that are a bare query string make poor file names; use the entity name instead.
    const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);

    logger.debug('filename', filename);

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    jsonfile.writeFileSync(`${this.path}/${filename}.json`, details);

    current.filename = `${filename}.json`;
    this.paymentServices.step++;

    if (this.paymentServices.step < this.paymentServices.items) {
      await this._goto(this.paymentServices.links[this.paymentServices.step].href);
    } else {
      this.emit('processdone');
    }
  }

  /**
   * Visits every link matched by `selector` with downloads directed to
   * `this.path`. Best effort: failures are logged and never thrown.
   *
   * @param {string} selector - CSS selector matching the anchors to follow.
   * @returns {Promise<void>}
   */
  async grabLink(selector) {
    try {
      const clickableLinks = await this.page.$$(selector);

      // NOTE(review): `_client` is not part of Puppeteer's public API - confirm
      // it exists in the Puppeteer version in use.
      await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

      for (const item of clickableLinks) {
        const href = await this.page.evaluate(el => el.href, item);

        await this._randomWait(this.page, 3, 5);

        try {
          await this._goto(href, { 'waitUntil': 'networkidle0' }, true);
        } catch (err) {
          // Log this error, but Puppeteer isn't supposed to support this sort of download.
          logger.warn(err);
        }
      }
    } catch (e) {
      // Stay best-effort, but do not hide the failure.
      logger.warn('grabLink failed', e);
    }
  }

  /**
   * Handles the e-money institutions page: takes screenshots before and after
   * expanding the toggled section, follows the links in it, then starts the
   * credit-institution phase by emitting `startcredit`.
   *
   * @returns {Promise<void>}
   */
  async processEMoney() {
    logger.info('Process EMoney:');
    await this._randomWait(this.page, 3, 5);

    const filename = 'e-money_Institutions';

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._findAndClick('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > h3:nth-child(5)');
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_expanded`, null);
    await this.grabLink('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > ul:nth-child(6) > li > a');
    await this._randomWait(this.page, 3, 5);

    this.mode++;
    this.emit('startcredit');
  }

  /**
   * Routes a freshly loaded page to its handler based on the URL path.
   * Emits `recover` when the browser landed on its internal error page.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page path is not one this scraper knows.
   */
  async processNewPage() {
    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    // Give the page a few seconds to settle.
    await this._randomWait(this.page, 3, 5);

    switch (pageUrl.pathname) {
      case '/database/ZahlInstInfo/':
        await this.buildIndex();
        break;
      case '/database/ZahlInstInfo/suche.do':
        await this.buildSubIndex();
        break;
      case '/database/ZahlInstInfo/zahlinst.do':
        await this.processEntity();
        break;
      case '/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html':
        await this.processEMoney();
        break;
      case '/database/InstInfo/sucheForm.do':
        // Build the index of credit institutes.
        await this.processCreditInstIndex();
        break;
      case '/database/InstInfo/institutDetails.do':
        // Process a single credit institute.
        await this.processCreditInstPage();
        break;
      default:
        await this._uploadError();
        // `pageUrl` is a parsed Url object; interpolating it directly yields "[object Object]".
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Registers the listeners that move the crawl from one phase to the next.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    this.on('startcredit', async () => {
      logger.info('Starting Credit Institutes');
      await this._goto(this.credit);
    });

    this.on('processdone', async () => {
      logger.warn('Payment Entities done', this.paymentServices.items);
      jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
      this.mode++;
      await this._randomWait(this.page, 5, 10);
      await this._goto(this.emoneyUrl);
    });

    this.on('subindexdone', async () => {
      const next = this.paymentServices.links[this.paymentServices.step].href;

      logger.info('Sub Index done', this.paymentServices.items);
      logger.info(next);
      await this._goto(next);
    });

    this.on('indexdone', async () => {
      const next = this.index.links[this.index.step].href;

      logger.info('Index done', this.index.items);
      logger.info(next);
      await this._goto(next);
    });

    this.on('ciindexdone', async () => {
      const next = this.creditServices.links[this.creditServices.step].href;

      logger.info('CI Index done', this.creditServices.items);
      logger.info(next);
      // Collected hrefs are relative to the InstInfo application root.
      await this._goto(`https://portal.mvp.bafin.de/database/InstInfo/${next}`);
    });

    this.on('creditinstdone', async () => {
      logger.debug('Credit Institutes done', this.creditServices.items);
      jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
      this.mode++;
      await this._randomWait(this.page, 5, 10);
      this.emit('done');
    });

    this.on('nextsubindex', async () => {
      const next = this.index.links[this.index.step].href;

      logger.debug(next);
      await this._goto(next);
    });
  }

  /**
   * Resets crawl state, opens the browser page, wires up page-load handling
   * and navigates to the payment-institution start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure during start-up.
   */
  async start() {
    super._start();
    this.mode = 0;

    try {
      await this._loadDictionary();

      this.index = { 'items': 0, 'links': [], 'step': 0, 'started': false, 'done': false };
      this.subIndex = { 'items': 0, 'links': [], 'step': 0, 'started': false, 'done': false };
      this.paymentServices = { 'items': 0, 'links': [], 'step': 0, 'visited': false, 'done': false };
      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false
      };

      this.startPage = 'https://portal.mvp.bafin.de/database/ZahlInstInfo/?locale=en_GB';
      this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
      this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';

      this.setPath(path.resolve(`${__dirname }/../artefacts/DE/BAFIN`));

      // A non-repudiation failure is logged and does not abort the run.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): presumably exactly two registered event names means the
      // phase listeners have not been attached yet - confirm against Scraper.
      if (this.eventNames().length === 2) await this.attachEvents();

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true }).catch((err) => {
        logger.error(err);
      });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle2' });
      await this._randomWait(this.page, 3, 5, 'Startup');
    } catch (e) {
      // Rethrow the original error so its stack and type survive; wrap only non-Error values.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Target of the debounced `run`; starts the scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}

module.exports = DEScrape;