434 lines
13 KiB
JavaScript
434 lines
13 KiB
JavaScript
const cheerio = require('cheerio');
|
|
const jsonfile = require('jsonfile');
|
|
const logger = require('log4js').getLogger('BE');
|
|
const path = require('path');
|
|
const removeAccents = require('remove-accents-diacritics');
|
|
const url = require('url');
|
|
|
|
const Scraper = require('../helpers/scraper');
|
|
|
|
// Log verbosity is taken from the LOGGER_LEVEL environment variable; falls back to 'warn' when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
class BEScrape extends Scraper {
|
|
|
|
constructor() {
|
|
super();
|
|
this.setID('BE');
|
|
|
|
this.on('done', () => {
|
|
this._done();
|
|
});
|
|
|
|
this.run = this._throttle(async () => {
|
|
await this.__run();
|
|
}, 5000);
|
|
|
|
if (process.env.NODE_ENV === 'production')
|
|
this._checkLock().then((l) => {
|
|
if(l)
|
|
this.run();
|
|
});
|
|
}
|
|
|
|
async downloadFile(docLink) {
|
|
logger.info(`Downloading ${docLink}`);
|
|
|
|
await this.page.goto(docLink).catch((err) => {
|
|
if (err.message.indexOf('net::ERR_ABORTED') !== -1) {
|
|
logger.info(`Ignoring expected error upon file download: ${err.message}`);
|
|
}
|
|
else
|
|
throw err;
|
|
});
|
|
|
|
const waitMs = 5000;
|
|
const parsedUrl = url.parse(docLink);
|
|
const fileName = decodeURI(path.basename(parsedUrl.pathname.toLowerCase()));
|
|
const downloadFilePath = `${this.path}/${fileName}`;
|
|
|
|
let tries;
|
|
|
|
for (tries = 1; tries <= 10; tries++) {
|
|
logger.info('Waiting...');
|
|
await this.page.waitFor(waitMs);
|
|
if (this._checkFileExistsSync(downloadFilePath)) {
|
|
logger.info(`${docLink} successfully downloaded.`);
|
|
return true;
|
|
}
|
|
else {
|
|
logger.info(`Still waiting for ${docLink} to download after ${tries * waitMs / 1000} seconds...`);
|
|
}
|
|
}
|
|
|
|
// if we reach this point, download has failed
|
|
logger.error(`${docLink} failed to download after ${tries * waitMs / 1000} seconds....`);
|
|
return false;
|
|
}
|
|
|
|
normaliseDocLink(docLink) {
|
|
if (!docLink.startsWith('http://www.nbb.be/')) {
|
|
// attempt to normalise document link
|
|
if (docLink.startsWith('file:///L:/PRXNWEBP/')) {
|
|
return docLink.replace('file:///L:/PRXNWEBP/', 'http://www.nbb.be/');
|
|
}
|
|
else {
|
|
logger.warn(`Unable to normalise document link, unknown format, will attempt download as is: ${docLink}`);
|
|
}
|
|
}
|
|
|
|
return docLink;
|
|
}
|
|
|
|
convertMulitpleSpaceToCommaSpace(value) {
|
|
return value.replace(/\s{2,}/g, ', ');
|
|
}
|
|
|
|
extractMainDetails(detailsContainer) {
|
|
const $ = require('cheerio');
|
|
|
|
const details = {};
|
|
|
|
details['name'] = $(detailsContainer).children('strong').text().trim();
|
|
details['companyType'] = $(detailsContainer).children('em').text().trim();
|
|
|
|
const lines = $(detailsContainer).children();
|
|
|
|
details['addressOne'] = this.convertMulitpleSpaceToCommaSpace(lines[3].next.data.trim());
|
|
details['addressTwo'] = this.convertMulitpleSpaceToCommaSpace(lines[4].next.data.trim());
|
|
|
|
// Occasionally line 5 will contain text. If this is the case, line 5 contains
|
|
// "addressThree", and every other line moves along by one.
|
|
let offset = 0;
|
|
if (lines[5].next.data.trim() !== '') {
|
|
offset = 1;
|
|
details['addressThree'] = this.convertMulitpleSpaceToCommaSpace(lines[5].next.data.trim());
|
|
}
|
|
else {
|
|
details['addressThree'] = null;
|
|
}
|
|
|
|
details['uniqueId'] = lines[6 + offset].next.data.split(':').pop().trim();
|
|
|
|
details['dateOfListing'] = (lines[7 + offset] === undefined) ? null : lines[7 + offset].next.data.split(':').pop().trim();
|
|
|
|
const docLink = $(detailsContainer).children('a');
|
|
|
|
if (docLink.length > 0) {
|
|
details['docLink'] = docLink.attr('href');
|
|
details['normalisedDocLink'] = this.normaliseDocLink(docLink.attr('href'));
|
|
}
|
|
else {
|
|
details['docLink'] = null;
|
|
details['normalisedDocLink'] = null;
|
|
}
|
|
|
|
return details;
|
|
}
|
|
|
|
extractAdditionalDetails(tableCells) {
|
|
const $ = require('cheerio');
|
|
|
|
const additionalDetails = {};
|
|
|
|
tableCells.toArray().map((td) => {
|
|
const thText = $(td).closest('table').find('th').eq($(td).index()).text();
|
|
const fieldName = this._makeFieldName(thText);
|
|
additionalDetails[fieldName] = $(td).text().split(' '); // e.g. scrape "1 2 3" as ["1", "2", "3"]
|
|
});
|
|
|
|
return additionalDetails;
|
|
}
|
|
|
|
extractFullDetails(fullDetailsContainer, mode) {
|
|
const $ = require('cheerio');
|
|
|
|
switch (mode) {
|
|
|
|
case 0:
|
|
case 1:
|
|
// in modes 0 and 1 the main details are in the first td of the parent container
|
|
const mainDetails = this.extractMainDetails($(fullDetailsContainer).children('td').eq(0));
|
|
const additionalDetails = this.extractAdditionalDetails($(fullDetailsContainer).children('td').slice(1));
|
|
return {...mainDetails, ...additionalDetails};
|
|
|
|
case 2:
|
|
// in mode 2 (credit institutions) the main details are in the root.
|
|
return this.extractMainDetails(fullDetailsContainer);
|
|
// no additional details for credit institutions
|
|
}
|
|
|
|
}
|
|
|
|
extractEntitiesFromContainer(entitiesContainer, mode) {
|
|
const $ = require('cheerio');
|
|
|
|
const entities = [];
|
|
|
|
switch ($(entitiesContainer).prop("tagName")) {
|
|
case 'TBODY':
|
|
$(entitiesContainer).children('tr').each((index, item) => {
|
|
entities.push(this.extractFullDetails(item, mode));
|
|
});
|
|
break;
|
|
|
|
case 'UL':
|
|
$(entitiesContainer).children('li').each((index, item) => {
|
|
entities.push(this.extractFullDetails(item, mode));
|
|
});
|
|
break;
|
|
}
|
|
|
|
return entities;
|
|
}
|
|
|
|
extractIndex(indexContainer, mode) {
|
|
const $ = require('cheerio');
|
|
|
|
const title = $(indexContainer).find('div.field-name-field-page-intro > p').text().trim();
|
|
|
|
const description = $(indexContainer).find('div.description').html();
|
|
|
|
const legend = $(indexContainer).find('div.legend').html(); // not entirely necessary but good to keep a record
|
|
|
|
const entitiesContainer = $(indexContainer).find('ul.List1 tbody, ul.List1 ul.List2 > li > ul').eq(0);
|
|
|
|
let entities;
|
|
|
|
if (entitiesContainer.length > 0) {
|
|
entities = this.extractEntitiesFromContainer(entitiesContainer, mode);
|
|
}
|
|
else {
|
|
entities = [];
|
|
}
|
|
|
|
const changes = $(indexContainer).find('div.changes-12').html(); // not entirely necessary but good to keep a record
|
|
|
|
return { title, description, legend, entities, changes };
|
|
}
|
|
|
|
getIdByEntityName(name) {
|
|
const noWhiteSpace = /\W/g;
|
|
|
|
let id = this._makeFieldName(name).trim();
|
|
id = removeAccents.remove(id);
|
|
id = id.replace(noWhiteSpace, '_');
|
|
|
|
return id;
|
|
}
|
|
|
|
async processIndex() {
|
|
const pageUrl = await this.page.url();
|
|
|
|
logger.info(`Processing ${this.modeNames[this.mode]} index url number ${this.getCurrentMode().urlStep}: ${pageUrl}`);
|
|
|
|
await this.allowCookies();
|
|
|
|
const body = await this.page.content();
|
|
|
|
const $ = cheerio.load(body, { 'decodeEntities': false, 'encoding': 'utf-8' });
|
|
|
|
logger.info('Extracting index...')
|
|
const index = this.extractIndex($('div#PrudentialList'), this.mode);
|
|
|
|
logger.info(`Extracted ${index.entities.length} ${this.modeNames[this.mode]}.`);
|
|
|
|
logger.info(`Downloading ${this.modeNames[this.mode]} documents.`);
|
|
// download all documents from this index page
|
|
for (const entity of index.entities) {
|
|
if (entity.normalisedDocLink !== null) {
|
|
const didDownload = await this.downloadFile(entity.normalisedDocLink);
|
|
|
|
if (didDownload) {
|
|
// rename the file to match the json file name format
|
|
const parsedUrl = url.parse(entity.normalisedDocLink);
|
|
const originalFileName = decodeURI(path.basename(parsedUrl.pathname.toLowerCase()));
|
|
const originalFilePath = `${this.path}/${originalFileName}`;
|
|
const newFileName = [this.modePrefix[this.mode], this.getIdByEntityName(entity.name), path.extname(originalFileName)].join('');
|
|
const newFilePath = `${this.path}/${newFileName}`;
|
|
await this._renameFile(originalFilePath, newFilePath);
|
|
// save new file name to entity object so it can be found later.
|
|
entity['docLocalFilename'] = newFileName;
|
|
}
|
|
else {
|
|
entity['docLocalFilename'] = null;
|
|
}
|
|
}
|
|
}
|
|
|
|
logger.info(`Saving metadata for ${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`);
|
|
const description = index['description'];
|
|
const legend = index['legend'];
|
|
const changes = index['changes'];
|
|
const metadata = { description, legend, changes };
|
|
const metadataFileName = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}_metadata.json`;
|
|
const metadataFilePath = `${this.path}/${metadataFileName}`
|
|
jsonfile.writeFile(metadataFilePath, { metadata });
|
|
|
|
for (const entity of index.entities) {
|
|
const id = this.getIdByEntityName(entity.name);
|
|
|
|
// create json file for each entity
|
|
const filename = [this.modePrefix[this.mode], id].join('');
|
|
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
|
jsonfile.writeFile(`${filePath}.json`, { 'details': entity , metadataFileName});
|
|
|
|
// add entity details to "links" so that index file can be generated later
|
|
this.getCurrentMode().links.push({
|
|
'id': id,
|
|
'href': await this.page.url(),
|
|
'filename': filename
|
|
});
|
|
}
|
|
|
|
logger.info(`Taking screenshot of: ${pageUrl}`);
|
|
const screenshotFilename = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`;
|
|
const screenshotPath = `${this.path}/${screenshotFilename}`;
|
|
await this._makeScreenshotV2(this.page, screenshotPath);
|
|
|
|
const nextUrl = this.getNextUrl();
|
|
|
|
if (nextUrl !== null)
|
|
await this._goto(nextUrl);
|
|
else
|
|
this.emit('done');
|
|
}
|
|
|
|
serviceDone() {
|
|
try{
|
|
jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links': this.getCurrentMode().links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, this.getCurrentMode());
|
|
|
|
logger.info(`${this.modeNames[this.mode]} done.`);
|
|
logger.info(`Extracted a total of ${this.getCurrentMode().links.length} ${this.modeNames[this.mode]}`);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
}
|
|
|
|
getCurrentMode() {
|
|
switch (this.mode) {
|
|
|
|
case 0:
|
|
return this.paymentServices;
|
|
|
|
case 1:
|
|
return this.emoneyServices;
|
|
|
|
case 2:
|
|
return this.creditServices;
|
|
|
|
}
|
|
}
|
|
|
|
getNextUrl() {
|
|
if (this.getCurrentMode().urlStep < this.getCurrentMode().urls.length - 1)
|
|
this.getCurrentMode().urlStep++;
|
|
else {
|
|
this.serviceDone();
|
|
if (this.mode < this.modeNames.length - 1)
|
|
this.mode++;
|
|
else
|
|
return null;
|
|
}
|
|
|
|
return this.getCurrentMode().urls[this.getCurrentMode().urlStep];
|
|
}
|
|
|
|
async allowCookies() {
|
|
const agreeButton = await this.page.$('button.agree-button');
|
|
if (agreeButton !== null) {
|
|
logger.info('Agreeing to cookie policy.')
|
|
await agreeButton.click();
|
|
await this._randomWait(this.page, 3, 5);
|
|
}
|
|
}
|
|
|
|
async processNewPage() {
|
|
// give the page a few seconds to settle
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
// set download behaviour in case this is a new tab after a recovery
|
|
// TODO: this could be set by default in the base class for every new tab in every scraper
|
|
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
|
|
|
const pageUrl = await this.page.url();
|
|
|
|
if (pageUrl.includes('supervision-financiere/controle-prudentiel/domaines-de-controle'))
|
|
await this.processIndex();
|
|
else if (process.env.NODE_ENV) {
|
|
await this._uploadError();
|
|
throw new Error(`Unknown page: ${pageUrl}`);
|
|
}
|
|
else {
|
|
logger.warn('processNewPage Fell through');
|
|
logger.warn('currentPage.location', pageUrl);
|
|
}
|
|
}
|
|
|
|
async attachEvents() {
|
|
|
|
}
|
|
|
|
async start() {
|
|
super._start();
|
|
|
|
this.setPath(path.resolve(`${__dirname}/../artefacts/BE/NBB`));
|
|
|
|
this.mode = 0;
|
|
|
|
this.paymentServices = {
|
|
'links': [],
|
|
'urlStep': 0,
|
|
'urls': [
|
|
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-15',
|
|
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-14',
|
|
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-16'
|
|
]
|
|
};
|
|
|
|
this.emoneyServices = {
|
|
'links': [],
|
|
'urlStep': 0,
|
|
'urls': [
|
|
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-9',
|
|
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-17'
|
|
]
|
|
};
|
|
|
|
this.creditServices = {
|
|
'links': [],
|
|
'urlStep': 0,
|
|
'urls': [
|
|
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-credit/listes-7'
|
|
]
|
|
};
|
|
|
|
this.startPage = this.paymentServices.urls[0];
|
|
|
|
await this._doNonRepudiation().catch((err) => {
|
|
logger.warn(err);
|
|
});
|
|
|
|
await this._initBrowser();
|
|
await this._createBrowserPage();
|
|
|
|
this.page.on('domcontentloaded', this._throttle(async () => {
|
|
this.processNewPage().catch((err) => {
|
|
logger.error('processNewPage fail', err);
|
|
});
|
|
}, 2500));
|
|
|
|
if (this.eventNames().length === 2)
|
|
await this.attachEvents();
|
|
|
|
await this._goto(this.startPage);
|
|
}
|
|
|
|
async __run() {
|
|
await this.start();
|
|
}
|
|
}
|
|
|
|
// The scraper class is the module's sole export.
module.exports = BEScrape;
|