// Martin Donnelly a5109efabe 2019-05-12
// 2019-05-12 18:33:09 +01:00
//
// 434 lines
// 13 KiB
// JavaScript

const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('BE');
const path = require('path');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const Scraper = require('../helpers/scraper');
// Log level is taken from the LOGGER_LEVEL environment variable; when that is
// unset or empty the logger falls back to 'warn'.
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Derives the local file name a downloaded document is looked up under:
 * the last path segment of the link, lower-cased and then URI-decoded.
 *
 * @param {string} docLink - Absolute URL of the document.
 * @returns {string} File name without any directory part.
 */
function docLinkToFileName(docLink) {
  const parsedUrl = url.parse(docLink);
  return decodeURI(path.basename(parsedUrl.pathname.toLowerCase()));
}

/**
 * Scraper for the prudential supervision lists published on nbb.be.
 *
 * The scraper walks three "modes" in order (0: payment services,
 * 1: e-money services, 2: credit services). Each mode owns a list of index
 * URLs; for every index page the entities are extracted, their linked
 * documents downloaded, one JSON file per entity written and a screenshot
 * taken.
 *
 * `this.modeNames` and `this.modePrefix` are read but never assigned in this
 * class — presumably they are provided by the Scraper base class (TODO confirm).
 */
class BEScrape extends Scraper {
  /**
   * Registers the 'done' handler, builds the throttled `run` entry point and,
   * when NODE_ENV is 'production', starts a run once the lock check resolves
   * with a truthy value.
   */
  constructor() {
    super();
    this.setID('BE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            this.run();
          }
        })
        .catch((err) => {
          // Without this handler a failed lock check was an unhandled rejection.
          logger.error('Unable to start run after lock check', err);
        });
    }
  }

  /**
   * Navigates the page to a document link and polls the download directory
   * until the file shows up.
   *
   * @param {string} docLink - Absolute URL of the document to download.
   * @returns {Promise<boolean>} true when the file appeared in `this.path`,
   *   false when it did not appear within 10 polls of 5 seconds each.
   * @throws Any navigation error other than 'net::ERR_ABORTED'.
   */
  async downloadFile(docLink) {
    const waitMs = 5000;
    const maxTries = 10;

    logger.info(`Downloading ${docLink}`);

    try {
      await this.page.goto(docLink);
    } catch (err) {
      // Navigating to a file that gets downloaded aborts the navigation;
      // that specific error is expected, everything else is re-thrown.
      if (!err.message.includes('net::ERR_ABORTED')) {
        throw err;
      }
      logger.info(`Ignoring expected error upon file download: ${err.message}`);
    }

    const downloadFilePath = `${this.path}/${docLinkToFileName(docLink)}`;

    for (let tries = 1; tries <= maxTries; tries++) {
      logger.info('Waiting...');
      await this.page.waitFor(waitMs);

      if (this._checkFileExistsSync(downloadFilePath)) {
        logger.info(`${docLink} successfully downloaded.`);
        return true;
      }

      logger.info(`Still waiting for ${docLink} to download after ${tries * waitMs / 1000} seconds...`);
    }

    // Report the time actually waited (maxTries polls), not the loop counter
    // after the loop has ended, which is one past the last poll.
    logger.error(`${docLink} failed to download after ${maxTries * waitMs / 1000} seconds....`);
    return false;
  }

  /**
   * Rewrites document links that point at the 'file:///L:/PRXNWEBP/' location
   * so that they point at 'http://www.nbb.be/' instead.
   *
   * Links already on http(s)://www.nbb.be/ are returned untouched. Any other
   * format is logged as a warning and returned as is.
   *
   * @param {string} docLink - Raw href taken from the page.
   * @returns {string} The link to use for the download.
   */
  normaliseDocLink(docLink) {
    const webRoot = 'http://www.nbb.be/';
    const secureWebRoot = 'https://www.nbb.be/';
    const fileRoot = 'file:///L:/PRXNWEBP/';

    if (docLink.startsWith(webRoot) || docLink.startsWith(secureWebRoot)) {
      return docLink;
    }

    if (docLink.startsWith(fileRoot)) {
      return docLink.replace(fileRoot, webRoot);
    }

    logger.warn(`Unable to normalise document link, unknown format, will attempt download as is: ${docLink}`);
    return docLink;
  }

  /**
   * Replaces every run of two or more whitespace characters with ', '.
   *
   * @param {string} value - Text to convert.
   * @returns {string} Converted text.
   */
  convertMulitpleSpaceToCommaSpace(value) {
    return value.replace(/\s{2,}/g, ', ');
  }

  /**
   * Extracts name, company type, address lines, unique id, listing date and
   * document link from one entity container.
   *
   * Most values are read from the node that directly follows the N-th child
   * element of the container, so the method depends on the exact child order
   * of the page markup.
   *
   * @param {*} detailsContainer - Element (or cheerio selection) holding the details.
   * @returns {Object} Details object; `addressThree`, `dateOfListing`,
   *   `docLink` and `normalisedDocLink` are null when absent.
   * @throws {Error} When a required child element has no text directly after it.
   */
  extractMainDetails(detailsContainer) {
    const container = cheerio(detailsContainer);
    const lines = container.children();

    // Text of the node directly following child number `index`; fails with a
    // descriptive message when the markup does not have the expected shape.
    const textAfter = (index) => {
      const node = lines[index];
      if (!node || !node.next || typeof node.next.data !== 'string') {
        throw new Error(`Unexpected entity layout: no text after child element ${index}`);
      }
      return node.next.data;
    };
    // Value part of a "label: value" line.
    const valueAfter = (index) => textAfter(index).split(':').pop().trim();

    const details = {};
    details['name'] = container.children('strong').text().trim();
    details['companyType'] = container.children('em').text().trim();
    details['addressOne'] = this.convertMulitpleSpaceToCommaSpace(textAfter(3).trim());
    details['addressTwo'] = this.convertMulitpleSpaceToCommaSpace(textAfter(4).trim());

    // Occasionally line 5 will contain text. If this is the case, line 5 contains
    // "addressThree", and every other line moves along by one.
    const thirdAddressLine = textAfter(5).trim();
    const offset = thirdAddressLine === '' ? 0 : 1;
    details['addressThree'] = offset === 1
      ? this.convertMulitpleSpaceToCommaSpace(thirdAddressLine)
      : null;

    details['uniqueId'] = valueAfter(6 + offset);
    details['dateOfListing'] = lines[7 + offset] === undefined ? null : valueAfter(7 + offset);

    const anchor = container.children('a');
    const href = anchor.attr('href');
    if (anchor.length > 0 && href !== undefined) {
      details['docLink'] = href;
      details['normalisedDocLink'] = this.normaliseDocLink(href);
    }
    else {
      // No anchor, or an anchor without href: there is nothing to download.
      details['docLink'] = null;
      details['normalisedDocLink'] = null;
    }

    return details;
  }

  /**
   * Builds an object from table cells, keyed by the field name derived from
   * the header (`th`) that sits at the same index as the cell.
   *
   * @param {*} tableCells - cheerio selection of `td` elements.
   * @returns {Object} Map of field name to the cell text split on single spaces.
   */
  extractAdditionalDetails(tableCells) {
    const additionalDetails = {};

    for (const td of tableCells.toArray()) {
      const cell = cheerio(td);
      const thText = cell.closest('table').find('th').eq(cell.index()).text();
      const fieldName = this._makeFieldName(thText);
      additionalDetails[fieldName] = cell.text().split(' '); // e.g. scrape "1 2 3" as ["1", "2", "3"]
    }

    return additionalDetails;
  }

  /**
   * Extracts all details of one entity row.
   *
   * @param {*} fullDetailsContainer - Row element (`tr` or `li`).
   * @param {number} mode - 0 or 1: main details in the first `td`, remaining
   *   `td`s are additional details; 2: main details directly in the container.
   * @returns {Object} Entity details.
   * @throws {Error} When `mode` is not 0, 1 or 2.
   */
  extractFullDetails(fullDetailsContainer, mode) {
    switch (mode) {
      case 0:
      case 1: {
        // in modes 0 and 1 the main details are in the first td of the parent container
        const cells = cheerio(fullDetailsContainer).children('td');
        const mainDetails = this.extractMainDetails(cells.eq(0));
        const additionalDetails = this.extractAdditionalDetails(cells.slice(1));
        return { ...mainDetails, ...additionalDetails };
      }
      case 2:
        // in mode 2 (credit institutions) the main details are in the root;
        // there are no additional details for credit institutions.
        return this.extractMainDetails(fullDetailsContainer);
      default:
        // An undefined entity would only fail later, and less clearly, in processIndex.
        throw new Error(`Unsupported mode: ${mode}`);
    }
  }

  /**
   * Extracts every entity from a list container.
   *
   * @param {*} entitiesContainer - A `tbody` (rows are `tr`) or a `ul` (rows are `li`).
   * @param {number} mode - Passed through to extractFullDetails.
   * @returns {Object[]} Extracted entities; empty for any other container tag.
   */
  extractEntitiesFromContainer(entitiesContainer, mode) {
    const container = cheerio(entitiesContainer);

    let rowSelector;
    switch (container.prop('tagName')) {
      case 'TBODY':
        rowSelector = 'tr';
        break;
      case 'UL':
        rowSelector = 'li';
        break;
      default:
        return [];
    }

    return container
      .children(rowSelector)
      .toArray()
      .map((item) => this.extractFullDetails(item, mode));
  }

  /**
   * Extracts title, description, legend, change notes and entities from the
   * index container of a list page.
   *
   * @param {*} indexContainer - Root element of the list.
   * @param {number} mode - Passed through to the entity extraction.
   * @returns {{title: string, description: ?string, legend: ?string, entities: Object[], changes: ?string}}
   */
  extractIndex(indexContainer, mode) {
    const container = cheerio(indexContainer);

    const title = container.find('div.field-name-field-page-intro > p').text().trim();
    const description = container.find('div.description').html();
    // legend and changes are not entirely necessary but good to keep a record
    const legend = container.find('div.legend').html();
    const changes = container.find('div.changes-12').html();

    // Only the first matching list container is used.
    const entitiesContainer = container.find('ul.List1 tbody, ul.List1 ul.List2 > li > ul').eq(0);
    const entities = entitiesContainer.length > 0
      ? this.extractEntitiesFromContainer(entitiesContainer, mode)
      : [];

    return { title, description, legend, entities, changes };
  }

  /**
   * Builds a file-system friendly id from an entity name: field-name
   * conversion, accent removal, then every non-word character becomes '_'.
   *
   * @param {string} name - Entity name.
   * @returns {string} Id used in file names.
   */
  getIdByEntityName(name) {
    const nonWordCharacter = /\W/g;
    const fieldName = this._makeFieldName(name).trim();
    return removeAccents.remove(fieldName).replace(nonWordCharacter, '_');
  }

  /**
   * Processes the index page currently loaded: extracts the entities,
   * downloads and renames their documents, writes the metadata and per-entity
   * JSON files, takes a screenshot and moves on to the next URL (or emits
   * 'done' when there is none).
   *
   * @returns {Promise<void>}
   */
  async processIndex() {
    const pageUrl = await this.page.url();

    // Mode state is read once so that every artefact of this page shares the
    // same names; it only changes in getNextUrl at the very end.
    const currentMode = this.getCurrentMode();
    const modeName = this.modeNames[this.mode];
    const modePrefix = this.modePrefix[this.mode];
    const urlStep = currentMode.urlStep;

    logger.info(`Processing ${modeName} index url number ${urlStep}: ${pageUrl}`);

    await this.allowCookies();

    const body = await this.page.content();
    const $ = cheerio.load(body, { 'decodeEntities': false, 'encoding': 'utf-8' });

    logger.info('Extracting index...');
    const index = this.extractIndex($('div#PrudentialList'), this.mode);
    logger.info(`Extracted ${index.entities.length} ${modeName}.`);

    logger.info(`Downloading ${modeName} documents.`);
    // download all documents from this index page
    for (const entity of index.entities) {
      if (entity.normalisedDocLink === null) {
        continue;
      }

      const didDownload = await this.downloadFile(entity.normalisedDocLink);
      if (!didDownload) {
        entity['docLocalFilename'] = null;
        continue;
      }

      // rename the file to match the json file name format
      const originalFileName = docLinkToFileName(entity.normalisedDocLink);
      const newFileName = [modePrefix, this.getIdByEntityName(entity.name), path.extname(originalFileName)].join('');
      await this._renameFile(`${this.path}/${originalFileName}`, `${this.path}/${newFileName}`);

      // save new file name to entity object so it can be found later.
      entity['docLocalFilename'] = newFileName;
    }

    logger.info(`Saving metadata for ${modeName}_url_${urlStep}`);
    const { description, legend, changes } = index;
    const metadata = { description, legend, changes };
    const metadataFileName = `${modeName}_url_${urlStep}_metadata.json`;
    // Awaited so that a failed write surfaces here instead of as an unhandled
    // rejection, and so the file exists before the scraper moves on.
    await jsonfile.writeFile(`${this.path}/${metadataFileName}`, { metadata });

    const currentUrl = await this.page.url();
    const entityWrites = [];
    for (const entity of index.entities) {
      const id = this.getIdByEntityName(entity.name);

      // create json file for each entity
      const filename = [modePrefix, id].join('');
      // NOTE(review): the path is cut at 240 characters but the untruncated
      // `filename` is what gets recorded in links — confirm this is intended.
      const filePath = `${this.path}/${filename}`.substring(0, 240);
      entityWrites.push(jsonfile.writeFile(`${filePath}.json`, { 'details': entity, metadataFileName }));

      // add entity details to "links" so that index file can be generated later
      currentMode.links.push({
        'id': id,
        'href': currentUrl,
        'filename': filename
      });
    }
    // All entity files must be on disk before navigating away or emitting 'done'.
    await Promise.all(entityWrites);

    logger.info(`Taking screenshot of: ${pageUrl}`);
    await this._makeScreenshotV2(this.page, `${this.path}/${modeName}_url_${urlStep}`);

    const nextUrl = this.getNextUrl();
    if (nextUrl !== null) {
      await this._goto(nextUrl);
    }
    else {
      this.emit('done');
    }
  }

  /**
   * Writes the collected links of the current mode to `this.path` and the
   * whole mode state to `this.debugPath`. Errors are logged, not thrown.
   */
  serviceDone() {
    try {
      const modeName = this.modeNames[this.mode];
      const currentMode = this.getCurrentMode();

      jsonfile.writeFileSync(`${this.path}/${modeName}.json`, { 'links': currentMode.links });
      jsonfile.writeFileSync(`${this.debugPath}/${modeName}.json`, currentMode);

      logger.info(`${modeName} done.`);
      logger.info(`Extracted a total of ${currentMode.links.length} ${modeName}`);
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Returns the state object (links, urlStep, urls) of the active mode.
   *
   * @returns {Object|undefined} State for mode 0, 1 or 2; undefined otherwise.
   */
  getCurrentMode() {
    switch (this.mode) {
      case 0:
        return this.paymentServices;
      case 1:
        return this.emoneyServices;
      case 2:
        return this.creditServices;
      default:
        return undefined;
    }
  }

  /**
   * Advances to the next index URL. When the current mode has no URL left,
   * its results are written via serviceDone and the next mode is entered.
   *
   * @returns {?string} Next URL to visit, or null when the last mode is finished.
   */
  getNextUrl() {
    const currentMode = this.getCurrentMode();

    if (currentMode.urlStep < currentMode.urls.length - 1) {
      currentMode.urlStep++;
      return currentMode.urls[currentMode.urlStep];
    }

    this.serviceDone();

    if (this.mode >= this.modeNames.length - 1) {
      return null;
    }

    this.mode++;
    const nextMode = this.getCurrentMode();
    return nextMode.urls[nextMode.urlStep];
  }

  /**
   * Clicks the cookie consent button when the page shows one, then waits a
   * short random time.
   *
   * @returns {Promise<void>}
   */
  async allowCookies() {
    const agreeButton = await this.page.$('button.agree-button');
    if (agreeButton === null) {
      return;
    }

    logger.info('Agreeing to cookie policy.');
    await agreeButton.click();
    await this._randomWait(this.page, 3, 5);
  }

  /**
   * Handles a freshly loaded page: enables downloads into `this.path` and
   * processes the page when its URL is one of the known index pages.
   *
   * @returns {Promise<void>}
   * @throws {Error} For an unknown page when NODE_ENV is set.
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    // set download behaviour in case this is a new tab after a recovery
    // TODO: this could be set by default in the base class for every new tab in every scraper
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    const pageUrl = await this.page.url();

    if (pageUrl.includes('supervision-financiere/controle-prudentiel/domaines-de-controle')) {
      await this.processIndex();
    }
    else if (process.env.NODE_ENV) {
      await this._uploadError();
      throw new Error(`Unknown page: ${pageUrl}`);
    }
    else {
      logger.warn('processNewPage Fell through');
      logger.warn('currentPage.location', pageUrl);
    }
  }

  /**
   * Hook for attaching additional event listeners; intentionally empty here.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
  }

  /**
   * Resets the per-mode state, opens the browser page, wires the
   * 'domcontentloaded' handler and navigates to the first index URL.
   *
   * @returns {Promise<void>}
   */
  async start() {
    // NOTE(review): not awaited — confirm that the base _start is synchronous.
    super._start();

    this.setPath(path.resolve(`${__dirname}/../artefacts/BE/NBB`));

    this.mode = 0;

    this.paymentServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-15',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-14',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-16'
      ]
    };

    this.emoneyServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-9',
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-17'
      ]
    };

    this.creditServices = {
      'links': [],
      'urlStep': 0,
      'urls': [
        'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-credit/listes-7'
      ]
    };

    this.startPage = this.paymentServices.urls[0];

    // Non-repudiation failures are logged but do not stop the run.
    await this._doNonRepudiation().catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser();
    await this._createBrowserPage();

    this.page.on('domcontentloaded', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    // NOTE(review): presumably a count of 2 means only the default listeners
    // are registered so far — confirm against the base class.
    if (this.eventNames().length === 2) {
      await this.attachEvents();
    }

    await this._goto(this.startPage);
  }

  /**
   * Entry point used by the throttled `run` function.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// Expose the scraper class as this module's sole export.
module.exports = BEScrape;