const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('BE');
const path = require('path');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const Scraper = require('../helpers/scraper');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the lists published on nbb.be (payment, e-money and credit
 * institutions). For every index page it extracts the listed entities,
 * downloads their linked documents, writes one JSON file per entity plus a
 * metadata file per page, and takes a screenshot.
 *
 * NOTE(review): `this.modeNames` and `this.modePrefix` are read throughout
 * but never assigned in this file - presumably they are provided by the
 * Scraper base class; confirm.
 */
class BEScrape extends Scraper {
  /**
   * Registers the 'done' handler and the throttled `run` entry point. When
   * NODE_ENV is 'production' the scraper starts itself once `_checkLock()`
   * resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('BE');
    this.on('done', () => {
      this._done();
    });
    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);
    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          // Previously a rejected lock check was left unhandled.
          logger.error('Lock check failed', err);
        });
    }
  }

  /**
   * Derives the file name a document link is expected to be saved under in
   * the download directory: the lower-cased, URI-decoded basename of the
   * link's path.
   *
   * @param {string} docLink - Document URL.
   * @returns {string} Expected local file name.
   */
  getDownloadFileName(docLink) {
    const parsedUrl = url.parse(docLink);
    return decodeURI(path.basename(parsedUrl.pathname.toLowerCase()));
  }

  /**
   * Navigates the page to a document link to trigger a download, then polls
   * the download directory until the file appears.
   *
   * @param {string} docLink - Document URL to download.
   * @returns {Promise<boolean>} true once the file exists in `this.path`,
   *   false if it has not appeared after all polling attempts.
   * @throws Any navigation error other than `net::ERR_ABORTED`.
   */
  async downloadFile(docLink) {
    logger.info(`Downloading ${docLink}`);
    await this.page.goto(docLink).catch((err) => {
      if (err.message.includes('net::ERR_ABORTED')) {
        logger.info(`Ignoring expected error upon file download: ${err.message}`);
      } else {
        throw err;
      }
    });

    const waitMs = 5000;
    const maxTries = 10;
    const downloadFilePath = `${this.path}/${this.getDownloadFileName(docLink)}`;

    for (let tries = 1; tries <= maxTries; tries++) {
      logger.info('Waiting...');
      await this.page.waitFor(waitMs);
      if (this._checkFileExistsSync(downloadFilePath)) {
        logger.info(`${docLink} successfully downloaded.`);
        return true;
      }
      logger.info(`Still waiting for ${docLink} to download after ${tries * waitMs / 1000} seconds...`);
    }

    // if we reach this point, download has failed
    // Report the time actually waited (maxTries polls); the loop counter is
    // one past the last attempt here and would over-report by one interval.
    logger.error(`${docLink} failed to download after ${maxTries * waitMs / 1000} seconds....`);
    return false;
  }

  /**
   * Rewrites `file:///L:/PRXNWEBP/` links to `http://www.nbb.be/`. Links
   * already starting with `http://www.nbb.be/` are returned untouched; any
   * other format is logged and returned as is.
   *
   * @param {string} docLink - Raw href taken from the page.
   * @returns {string} Normalised (or unchanged) link.
   */
  normaliseDocLink(docLink) {
    if (!docLink.startsWith('http://www.nbb.be/')) {
      // attempt to normalise document link
      if (docLink.startsWith('file:///L:/PRXNWEBP/')) {
        return docLink.replace('file:///L:/PRXNWEBP/', 'http://www.nbb.be/');
      }
      logger.warn(`Unable to normalise document link, unknown format, will attempt download as is: ${docLink}`);
    }
    return docLink;
  }

  /**
   * Replaces every run of two or more whitespace characters with ', '.
   *
   * @param {string} value
   * @returns {string}
   */
  convertMulitpleSpaceToCommaSpace(value) {
    return value.replace(/\s{2,}/g, ', ');
  }

  /**
   * Extracts name, company type, address lines, unique id, listing date and
   * document link from an entity's details container.
   *
   * The values are read from the text node following specific child
   * elements (indexes 3 onwards).
   * NOTE(review): this assumes the container always has at least seven
   * children, each followed by a text node - a shorter container will
   * throw; confirm against the live markup.
   *
   * @param {*} detailsContainer - Element (or cheerio selection) holding the details.
   * @returns {Object} Extracted details; absent optional values are null.
   */
  extractMainDetails(detailsContainer) {
    const $ = cheerio;
    const container = $(detailsContainer);
    const lines = container.children();
    const details = {};

    details['name'] = container.children('strong').text().trim();
    details['companyType'] = container.children('em').text().trim();
    details['addressOne'] = this.convertMulitpleSpaceToCommaSpace(lines[3].next.data.trim());
    details['addressTwo'] = this.convertMulitpleSpaceToCommaSpace(lines[4].next.data.trim());

    // Occasionally line 5 will contain text. If this is the case, line 5 contains
    // "addressThree", and every other line moves along by one.
    let offset = 0;
    const lineFive = lines[5].next.data.trim();
    if (lineFive !== '') {
      offset = 1;
      details['addressThree'] = this.convertMulitpleSpaceToCommaSpace(lineFive);
    } else {
      details['addressThree'] = null;
    }

    details['uniqueId'] = lines[6 + offset].next.data.split(':').pop().trim();

    const dateLine = lines[7 + offset];
    details['dateOfListing'] = (dateLine === undefined)
      ? null
      : dateLine.next.data.split(':').pop().trim();

    const docLink = container.children('a');
    if (docLink.length > 0) {
      const href = docLink.attr('href');
      details['docLink'] = href;
      details['normalisedDocLink'] = this.normaliseDocLink(href);
    } else {
      details['docLink'] = null;
      details['normalisedDocLink'] = null;
    }

    return details;
  }

  /**
   * Builds an object keyed by the (field-name-ified) header text of each
   * cell's column, with the cell text split on single spaces as the value.
   *
   * @param {*} tableCells - cheerio selection of `td` elements.
   * @returns {Object<string, string[]>}
   */
  extractAdditionalDetails(tableCells) {
    const $ = cheerio;
    const additionalDetails = {};

    for (const td of tableCells.toArray()) {
      const cell = $(td);
      const thText = cell.closest('table').find('th').eq(cell.index()).text();
      const fieldName = this._makeFieldName(thText);
      additionalDetails[fieldName] = cell.text().split(' '); // e.g. scrape "1 2 3" as ["1", "2", "3"]
    }

    return additionalDetails;
  }

  /**
   * Extracts all details for a single entity row/list item.
   *
   * @param {*} fullDetailsContainer - The `tr` or `li` element of the entity.
   * @param {number} mode - 0 or 1: main details in the first `td`, remaining
   *   `td`s hold additional details; 2: main details directly in the root.
   * @returns {Object|undefined} Entity details, or undefined for any other mode.
   */
  extractFullDetails(fullDetailsContainer, mode) {
    const $ = cheerio;

    switch (mode) {
      case 0:
      case 1: {
        // in modes 0 and 1 the main details are in the first td of the parent container
        const cells = $(fullDetailsContainer).children('td');
        const mainDetails = this.extractMainDetails(cells.eq(0));
        const additionalDetails = this.extractAdditionalDetails(cells.slice(1));
        return { ...mainDetails, ...additionalDetails };
      }
      case 2:
        // in mode 2 (credit institutions) the main details are in the root.
        return this.extractMainDetails(fullDetailsContainer); // no additional details for credit institutions
      default:
        return undefined;
    }
  }

  /**
   * Extracts every entity from a `TBODY` (one per `tr`) or `UL` (one per
   * `li`) container. Any other container tag yields an empty array.
   *
   * @param {*} entitiesContainer - cheerio selection of the container.
   * @param {number} mode - Passed through to `extractFullDetails`.
   * @returns {Object[]}
   */
  extractEntitiesFromContainer(entitiesContainer, mode) {
    const $ = cheerio;
    const container = $(entitiesContainer);
    const entities = [];
    const collect = (index, item) => {
      entities.push(this.extractFullDetails(item, mode));
    };

    switch (container.prop('tagName')) {
      case 'TBODY':
        container.children('tr').each(collect);
        break;
      case 'UL':
        container.children('li').each(collect);
        break;
      default:
        break;
    }

    return entities;
  }

  /**
   * Extracts title, description, legend, entities and the changes section
   * from an index container.
   *
   * @param {*} indexContainer - cheerio selection of the list root.
   * @param {number} mode - Passed through to entity extraction.
   * @returns {{title: string, description: ?string, legend: ?string, entities: Object[], changes: ?string}}
   */
  extractIndex(indexContainer, mode) {
    const $ = cheerio;
    const root = $(indexContainer);

    const title = root.find('div.field-name-field-page-intro > p').text().trim();
    const description = root.find('div.description').html();
    const legend = root.find('div.legend').html(); // not entirely necessary but good to keep a record

    const entitiesContainer = root.find('ul.List1 tbody, ul.List1 ul.List2 > li > ul').eq(0);
    const entities = (entitiesContainer.length > 0)
      ? this.extractEntitiesFromContainer(entitiesContainer, mode)
      : [];

    const changes = root.find('div.changes-12').html(); // not entirely necessary but good to keep a record

    return { title, description, legend, entities, changes };
  }

  /**
   * Builds a file-name-safe id from an entity name: field-name-ified,
   * trimmed, accents removed, and every non-word character replaced by '_'.
   *
   * @param {string} name - Entity name.
   * @returns {string}
   */
  getIdByEntityName(name) {
    const nonWordChars = /\W/g;
    const fieldName = this._makeFieldName(name).trim();
    return removeAccents.remove(fieldName).replace(nonWordChars, '_');
  }

  /**
   * Processes the index page currently loaded in `this.page`: extracts the
   * index, downloads and renames each entity's document, writes the
   * metadata and per-entity JSON files, records the links, takes a
   * screenshot, then moves to the next URL or emits 'done'.
   *
   * @returns {Promise<void>}
   * @throws If a JSON file cannot be written (previously such failures were
   *   lost because the asynchronous writes were never awaited).
   */
  async processIndex() {
    const pageUrl = await this.page.url();
    logger.info(`Processing ${this.modeNames[this.mode]} index url number ${this.getCurrentMode().urlStep}: ${pageUrl}`);

    await this.allowCookies();

    const body = await this.page.content();
    const $ = cheerio.load(body, { 'decodeEntities': false, 'encoding': 'utf-8' });

    logger.info('Extracting index...');
    const index = this.extractIndex($('div#PrudentialList'), this.mode);
    logger.info(`Extracted ${index.entities.length} ${this.modeNames[this.mode]}.`);

    logger.info(`Downloading ${this.modeNames[this.mode]} documents.`);
    // download all documents from this index page
    for (const entity of index.entities) {
      if (entity.normalisedDocLink !== null) {
        const didDownload = await this.downloadFile(entity.normalisedDocLink);
        if (didDownload) {
          // rename the file to match the json file name format
          const originalFileName = this.getDownloadFileName(entity.normalisedDocLink);
          const originalFilePath = `${this.path}/${originalFileName}`;
          const newFileName = [
            this.modePrefix[this.mode],
            this.getIdByEntityName(entity.name),
            path.extname(originalFileName)
          ].join('');
          const newFilePath = `${this.path}/${newFileName}`;
          await this._renameFile(originalFilePath, newFilePath);
          // save new file name to entity object so it can be found later.
          entity['docLocalFilename'] = newFileName;
        } else {
          entity['docLocalFilename'] = null;
        }
      }
    }

    logger.info(`Saving metadata for ${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`);
    const { description, legend, changes } = index;
    const metadata = { description, legend, changes };
    const metadataFileName = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}_metadata.json`;
    const metadataFilePath = `${this.path}/${metadataFileName}`;
    // Synchronous write (as in serviceDone) so failures surface here instead
    // of becoming an unhandled rejection of a floating promise.
    jsonfile.writeFileSync(metadataFilePath, { metadata });

    // Nothing navigates the page inside the loop below, so read the URL once.
    const currentUrl = await this.page.url();
    for (const entity of index.entities) {
      const id = this.getIdByEntityName(entity.name);
      // create json file for each entity
      const filename = [this.modePrefix[this.mode], id].join('');
      const filePath = `${this.path}/${filename}`.substring(0, 240);
      jsonfile.writeFileSync(`${filePath}.json`, { 'details': entity, metadataFileName });
      // add entity details to "links" so that index file can be generated later
      this.getCurrentMode().links.push({
        'id': id,
        'href': currentUrl,
        'filename': filename
      });
    }

    logger.info(`Taking screenshot of: ${pageUrl}`);
    const screenshotFilename = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`;
    const screenshotPath = `${this.path}/${screenshotFilename}`;
    await this._makeScreenshotV2(this.page, screenshotPath);

    const nextUrl = this.getNextUrl();
    if (nextUrl !== null) {
      await this._goto(nextUrl);
    } else {
      this.emit('done');
    }
  }

  /**
   * Writes the collected links for the current mode to `this.path` and the
   * full mode state to `this.debugPath`. Errors are logged, not rethrown.
   */
  serviceDone() {
    try {
      const modeName = this.modeNames[this.mode];
      jsonfile.writeFileSync(`${this.path}/${modeName}.json`, { 'links': this.getCurrentMode().links });
      jsonfile.writeFileSync(`${this.debugPath}/${modeName}.json`, this.getCurrentMode());
      logger.info(`${modeName} done.`);
      logger.info(`Extracted a total of ${this.getCurrentMode().links.length} ${modeName}`);
    } catch (e) {
      logger.error(e);
    }
  }

  /**
   * Returns the state object for the current mode.
   *
   * @returns {Object|undefined} paymentServices (0), emoneyServices (1),
   *   creditServices (2), or undefined for any other mode.
   */
  getCurrentMode() {
    switch (this.mode) {
      case 0:
        return this.paymentServices;
      case 1:
        return this.emoneyServices;
      case 2:
        return this.creditServices;
      default:
        return undefined;
    }
  }

  /**
   * Advances to the next URL. Moves to the next URL of the current mode if
   * there is one; otherwise finalises the mode via `serviceDone()` and
   * switches to the next mode.
   *
   * @returns {?string} The next URL to visit, or null when every mode is exhausted.
   */
  getNextUrl() {
    const current = this.getCurrentMode();
    if (current.urlStep < current.urls.length - 1) {
      current.urlStep++;
    } else {
      this.serviceDone();
      if (this.mode < this.modeNames.length - 1) {
        this.mode++;
      } else {
        return null;
      }
    }
    // Re-read the mode: it may have changed above.
    const next = this.getCurrentMode();
    return next.urls[next.urlStep];
  }

  /**
   * Clicks the cookie "agree" button if it is present, then waits briefly.
   *
   * @returns {Promise<void>}
   */
  async allowCookies() {
    const agreeButton = await this.page.$('button.agree-button');
    if (agreeButton !== null) {
      logger.info('Agreeing to cookie policy.');
      await agreeButton.click();
      await this._randomWait(this.page, 3, 5);
    }
  }

  /**
   * Handles a freshly loaded page: enables downloads into `this.path` and
   * dispatches to `processIndex()` when the URL is a known index page.
   *
   * @returns {Promise<void>}
   * @throws {Error} `Unknown page: ...` for an unrecognised URL when
   *   NODE_ENV is set (after `_uploadError()`); otherwise only warns.
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    // set download behaviour in case this is a new tab after a recovery
    // TODO: this could be set by default in the base class for every new tab in every scraper
    await this.page._client.send('Page.setDownloadBehavior', {
      'behavior': 'allow',
      'downloadPath': this.path
    });

    const pageUrl = await this.page.url();

    if (pageUrl.includes('supervision-financiere/controle-prudentiel/domaines-de-controle')) {
      await this.processIndex();
    } else if (process.env.NODE_ENV) {
      await this._uploadError();
      throw new Error(`Unknown page: ${pageUrl}`);
    } else {
      logger.warn('processNewPage Fell through');
      logger.warn('currentPage.location', pageUrl);
    }
  }

  /** Hook for attaching additional events; intentionally empty here. */
  async attachEvents() {
  }

  /**
   * Initialises per-run state (artefact path, mode, URL lists), opens the
   * browser page, wires the 'domcontentloaded' handler and navigates to the
   * first URL.
   *
   * @returns {Promise<void>}
   */
  async start() {
    // NOTE(review): not awaited in the original - confirm whether
    // Scraper#_start is asynchronous.
    super._start();

    this.setPath(path.resolve(`${__dirname}/../artefacts/BE/NBB`));

    this.mode = 0;

    this.paymentServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-15',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-14',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-16'
      ]
    };

    this.emoneyServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-9',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-17'
      ]
    };

    this.creditServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-credit/listes-7'
      ]
    };

    this.startPage = this.paymentServices.urls[0];

    await this._doNonRepudiation().catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser();
    await this._createBrowserPage();

    this.page.on('domcontentloaded', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    if (this.eventNames().length === 2) await this.attachEvents();

    await this._goto(this.startPage);
  }

  /** Internal entry point invoked by the throttled `run`. */
  async __run() {
    await this.start();
  }
}

module.exports = BEScrape;