// obdfcascrape/ncas/nl.js
// Repository viewer listing: 795 lines, 22 KiB, JavaScript.
// 2019-05-05 19:13:56 +00:00
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('NL');
const url = require('url');
// Log verbosity is taken from the LOGGER_LEVEL environment variable; 'warn' is used when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
class NLScrape extends Scraper {
constructor() {
super();
this.setID('NL');
this.addToBlockFilters(['cookiebar.js', 'readspeaker']);
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
// Delays the call to 30 seconds after the last time it was called.
// Useful if the page beaks and multiple errors happen at the same time
this.recover = this._debounce(async () => {
await this.__recover();
}, 30000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
async extractDetail(body) {
const description = [];
try{
const $ = cheerio.load(body);
const rows = $('dl.extra > dd > table > tbody > tr');
rows.each((index, item) => {
let cells = $(item).find('th');
const title = this._cleanUp($(cells.get(0)).text()).replace(':', '') || '';
cells = $(item).find('td');
const detail = this._cleanUp($(cells.get(0)).text()) || '';
if (title !== '')
description.push([title, detail]);
});
}
catch( err) {
logger.error(err);
}
return description;
}
async extractActivity(body) {
const details = [];
try{
const $ = cheerio.load(body);
const rows = $('#tab2 > div > div > table > tbody > tr');
let previousFinancialService = '';
rows.each((index, item) => {
const cells = $(item).find('td');
const activity = this._cleanUp($(cells.get(0)).text()) || '';
const startDate = this._cleanUp($(cells.get(1)).text()) || '';
const endDate = this._cleanUp($(cells.get(2)).text()) || '';
const thCell = $(item).find('th');
const financialService = this._cleanUp($(thCell.get(0)).text()) || previousFinancialService;
details.push({ financialService, activity, startDate, endDate });
previousFinancialService = financialService;
});
}
catch( err) {
logger.error(err);
}
return details;
}
/**
* Extract Passporting Out Data from page
* @param body
* @returns {Promise<void>}
*/
async extractPassportingOut(body) {
const details = {};
try{
const $ = cheerio.load(body);
const rows = $('#tab6 > div > div > table > tbody > tr');
let previouseuPassportOut = '';
rows.each((index, item) => {
const cells = $(item).find('td');
const activity = this._cleanUp($(cells.get(0)).text()) || '';
const country = this._cleanUp($(cells.get(1)).text()) || '';
const startDate = this._cleanUp($(cells.get(2)).text()) || '';
const endDate = this._cleanUp($(cells.get(3)).text()) || '';
const thCell = $(item).find('th');
const euPassportOut = this._cleanUp($(thCell.get(0)).text()) || previouseuPassportOut;
if (!details.hasOwnProperty(country))
details[country] = [{ activity, startDate, endDate, euPassportOut }];
else
details[country].push({ activity, startDate, endDate, euPassportOut });
previouseuPassportOut = euPassportOut;
});
}
catch( err) {
logger.error(err);
}
return details;
}
/**
* Extract Passporting In Data from page
* @param body
* @returns {Promise<void>}
*/
async extractPassportingIn(body) {
const details = {};
try{
const $ = cheerio.load(body);
const rows = $('#tab7 > div > div > table > tbody > tr');
let previouseuPassportIn = '';
rows.each((index, item) => {
const cells = $(item).find('td');
const activity = this._cleanUp($(cells.get(0)).text()) || '';
const startDate = this._cleanUp($(cells.get(1)).text()) || '';
const thCell = $(item).find('th');
const euPassportIn = this._cleanUp($(thCell.get(0)).text()) || previouseuPassportIn;
if (!details.hasOwnProperty(euPassportIn))
details[euPassportIn] = [{ activity, startDate }];
else
details[euPassportIn].push({ activity, startDate });
previouseuPassportIn = euPassportIn;
});
}
catch( err) {
logger.error(err);
}
return details;
}
/**
* Process Entity Detail
*
* @returns {Promise<{activity: *, details: *}>}
*/
async processEntityDetail(serviceObject) {
const noWhiteSpace = /\W/g;
const urlSections = ['WFTBI', 'WFTEG', 'WFTKF'];
const id = serviceObject.links[serviceObject.step].id;
logger.info(`Process V2 ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${id}`);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = this._makeFileName(entity);
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this.page.waitForSelector('#contentcolumn > div.interactive-tabs > ol > li:nth-child(2) > a', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
}).catch(() => {
logger.debug('No activity tab');
});
await this.page.waitForSelector('div.interactive-tabs > ol > li a[href*="#tab6"]', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_passportingout`, null);
}).catch(() => {
logger.debug('No passporting Out tab');
});
await this.page.waitForSelector('div.interactive-tabs > ol > li a[href*="#tab7"]', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_passportingin`, null);
}).catch(() => {
logger.debug('No passporting In tab');
});
const body = await this.page.content();
const details = await this.extractDetail(body);
const activity = await this.extractActivity(body);
const passportingOut = await this.extractPassportingOut(body);
const passportingIn = await this.extractPassportingIn(body);
await jsonfile.writeFile(`${filePath}.json`, { details, activity, passportingOut, passportingIn });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = `https://www.dnb.nl/en/supervision/public-register/${urlSections[this.mode]}/${serviceObject.links[serviceObject.step].href}`;
await this._goto(newUrl);
}
else
this.emit('entityDone');
}
/**
 * Process WFTBI / Payment Services Detail
 *
 * Delegates to processEntityDetail with the payment services state.
 *
 * @returns {Promise<void>} resolves with no value once processEntityDetail has finished
 */
async processWFTBIDetail() {
await this.processEntityDetail(this.paymentServices);
}
/**
 * Process WFTEG / Emoney services Detail
 *
 * Delegates to processEntityDetail with the e-money services state.
 *
 * @returns {Promise<void>} resolves with no value once processEntityDetail has finished
 */
async processWFTEGDetail() {
await this.processEntityDetail(this.emoneyServices);
}
/**
 * Process WFTKF / Credit Services Details
 *
 * Delegates to processEntityDetail with the credit services state.
 *
 * @returns {Promise<void>} resolves with no value once processEntityDetail has finished
 */
async processWFTKFDetail() {
await this.processEntityDetail(this.creditServices);
}
/**
* Initiate WFTBI / Payment Services
* @returns {Promise<void>}
*/
async initiateWFTBI() {
try{
// first time around.
// need to kick off the index correctly..
const options = await this.page.$$('#ddfilter option');
const wantedOption = ['2:3c Dutch branch of payment institution (EEA incl. NL)'];
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#ddfilter', value);
break;
}
}
this._findAndClick('#search-main button');
}
catch(e) {
throw new Error(e);
}
}
/**
* Initiaite WFTEG / Emoney services
* @returns {Promise<void>}
*/
async initiateWFTEG() {
try{
// first time around.
// need to kick off the index correctly..
const options = await this.page.$$('#ddfilter option');
const wantedOption = ['2:10b Carrying on the business of an electronic money institution'];
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#ddfilter', value);
break;
}
}
this._findAndClick('#search-main button');
}
catch(e) {
throw new Error(e);
}
}
/**
* Initiate WFTKF / Credit Services
* @returns {Promise<void>}
*/
async initiateWFTKF() {
try{
// first time around.
// need to kick off the index correctly..
const options = await this.page.$$('#ddfilter option');
const selects = ['2:12(1) Carrying on the business of a bank', '2:13(1) Carrying on the business of a bank'];
const wantedOption = [];
wantedOption.push(selects[this.creditServices.step]);
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#ddfilter', value);
break;
}
}
this._findAndClick('#search-main button');
}
catch(e) {
throw new Error(e);
}
}
/**
* Process WFTBI / Payment Services
* @returns {Promise<void>}
*/
async processWFTBI() {
const nonWhiteSpace = /\W/g;
logger.info('WFTBI / Payment Services');
await this._randomWait(this.page, 3, 5);
const origUrl = await this.page.url();
const pageUrl = url.parse(origUrl);
if (pageUrl.query === null)
// we need to select the correct item from the dropdown.
this.initiateWFTBI();
else {
// crack query
const body = await this.page.content();
const $ = cheerio.load(body);
const q = this._getParamsFromUrl(origUrl);
const page = q.page || '1';
await this._makeScreenshotV2(this.page, `${this.path}/paymentServices_menu_${page}`, null);
const rows = $('#contentcolumn table tbody tr');
rows.each((i, elm) => {
const children = cheerio(elm).children();
let statutoryName = children.eq(0).text();
let tradeName = children.eq(1).text();
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
let href = cheerio(children.eq(0)).find('a').attr('href');
href = href.concat('&locale=en_GB');
// this is the one we want.
this.paymentServices.links.push({ id, href });
});
const next = $('a.next').attr('href') || '';
if (next !== '')
this._findAndClick('a.next');
else
this.emit('startProcessingPaymentServices');
}
}
/**
* Process WFTEG / Emoney services
* @returns {Promise<void>}
*/
async processWFTEG() {
const nonWhiteSpace = /\W/g;
logger.info('WFTEG / EMoney Services');
await this._randomWait(this.page, 3, 5);
const origUrl = await this.page.url();
const pageUrl = url.parse(origUrl);
if (pageUrl.query === null)
// we need to select the correct item from the dropdown.
this.initiateWFTEG();
else {
// crack query
const body = await this.page.content();
const $ = cheerio.load(body);
const q = this._getParamsFromUrl(origUrl);
const page = q.page || '1';
await this._makeScreenshotV2(this.page, `${this.path}/eMoney_menu_${page}`, null);
const rows = $('#contentcolumn table tbody tr');
rows.each((i, elm) => {
const children = cheerio(elm).children();
let statutoryName = children.eq(0).text();
let tradeName = children.eq(1).text();
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
// const id = `${statutoryName}-${tradeName}`;
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
let href = cheerio(children.eq(0)).find('a').attr('href');
href = href.concat('&locale=en_GB');
// this is the one we want.
this.emoneyServices.links.push({ id, href });
});
const next = $('a.next').attr('href') || '';
if (next !== '')
this._findAndClick('a.next');
else
this.emit('startProcessingEMoneyServices');
}
}
/**
* Process WFTKF / Credit Services
* @returns {Promise<void>}
*/
async processWFTKF() {
try {
// Credit Institute
const nonWhiteSpace = /\W/g;
logger.info('WFTKF / Credit Services');
await this._randomWait(this.page, 3, 5);
const origUrl = await this.page.url();
const pageUrl = url.parse(origUrl);
if (pageUrl.query === null)
// we need to select the correct item from the dropdown.
this.initiateWFTKF();
else {
// crack query
const body = await this.page.content();
const $ = cheerio.load(body);
const q = this._getParamsFromUrl(origUrl);
const page = q.page || '1';
await this._makeScreenshotV2(this.page, `${this.path}/creditServices_menu_${page}`, null);
const rows = $('#contentcolumn table tbody tr');
rows.each((i, elm) => {
const children = cheerio(elm).children();
let statutoryName = children.eq(0).text();
let tradeName = children.eq(1).text();
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
// const id = `${statutoryName}-${tradeName}`;
let href = cheerio(children.eq(0)).find('a').attr('href');
href = href.concat('&locale=en_GB');
// this is the one we want.
logger.debug({ id, href });
this.creditServices.links.push({ id, href });
});
const next = $('a.next').attr('href') || '';
if (next !== '')
this._findAndClick('a.next');
else
if (this.creditServices.step === 0) {
this.creditServices.step = 1;
await this._goto(this.credit);
}
else
this.emit('startProcessingCreditServices');
}
}
catch(e) {
await this._uploadError();
throw new Error(e);
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
const failedUrls = ['chrome-error://chromewebdata/'];
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (failedUrls.indexOf(pageUrl.href) !== -1) {
this.emit('recover');
return;
}
switch (pageUrl.pathname) {
case '/en/supervision/public-register/WFTBI/index.jsp':
await this.processWFTBI();
break;
case '/en/supervision/public-register/WFTBI/detail.jsp':
await this.processWFTBIDetail();
break;
case '/en/supervision/public-register/WFTEG/index.jsp':
await this.processWFTEG();
break;
case '/en/supervision/public-register/WFTEG/detail.jsp':
await this.processWFTEGDetail();
break;
case '/en/supervision/public-register/WFTKF/index.jsp':
await this.processWFTKF();
break;
case '/en/supervision/public-register/WFTKF/detail.jsp':
await this.processWFTKFDetail();
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl.href}`);
}
}
/**
*
* @returns {Promise<void>}
*/
async restart() {
logger.info(`Restarting ${this.modeTitles[this.mode]}`);
switch (this.mode) {
case 2:
this.emit('startProcessingCreditServices');
break;
case 1:
this.emit('startProcessingEMoneyServices');
break;
case 0:
default:
this.emit('startProcessingPaymentServices');
break;
}
}
/**
*
* @returns {Promise<void>}
* @private
*/
async __recover() {
logger.warn('*** RECONNECTING PAGE ***');
if (this.browserCrashed) await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', () => {
this.processNewPage();
});
const timeout = 90000;
setTimeout(async() => {
logger.warn('Attempting recovery..');
await this.restart();
}, timeout);
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('startProcessingPaymentServices', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTBI/${this.paymentServices.links[this.paymentServices.step].href}`;
logger.debug('startProcessingPaymentServices', newUrl);
await this._goto(newUrl);
});
this.on('paymentServicesDone', async () => {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
await this._goto(this.emoneyUrl);
});
this.on('startProcessingEMoneyServices', async () => {
this.mode = 1;
this.emoneyServices.items = this.emoneyServices.links.length;
logger.debug(`${this.emoneyServices.items} EMoney items indexed` );
logger.debug(this.emoneyServices.links[this.emoneyServices.step].href);
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTEG/${this.emoneyServices.links[this.emoneyServices.step].href}`;
logger.debug('startProcessingEMoneyServices', newUrl);
await this._goto(newUrl);
});
this.on('emoneyServicesDone', async () => {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
await this._goto(this.credit);
});
this.on('startProcessingCreditServices', async () => {
this.mode = 2;
this.creditServices.items = this.creditServices.links.length;
logger.debug(`${this.creditServices.items} CI items indexed` );
logger.debug(this.creditServices.links[this.creditServices.step].href);
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTKF/${this.creditServices.links[this.creditServices.step].href}`;
logger.debug('startProcessingCreditServices', newUrl);
await this._goto(newUrl);
});
this.on('creditServicesDone', async () => {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.emit('done');
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
this.mode = 0;
try {
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.startPage = 'https://www.dnb.nl/en/supervision/public-register/WFTBI/index.jsp';
this.emoneyUrl = 'https://www.dnb.nl/en/supervision/public-register/WFTEG/index.jsp';
this.credit = 'https://www.dnb.nl/en/supervision/public-register/WFTKF/index.jsp';
//
this.setPath(path.resolve(`${__dirname }/../artefacts/NL/DNB`));
await this._doNonRepudiation(false, { 'sslWithPrefix': true }).catch((err) => {
logger.warn(err);
});
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Run the scraper by delegating to start(). The constructor wraps this
 * method in the throttled `run` entry point.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
// Expose the scraper class as this module's CommonJS export.
module.exports = NLScrape;