// Martin Donnelly be5d3eae07 init
// 2019-05-05 20:13:56 +01:00
//
// 575 lines
// 16 KiB
// JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const logger = require('log4js').getLogger('ES');
const url = require('url');
const querystring = require('querystring');
const removeAccents = require('remove-accents-diacritics');
const jsonfile = require('jsonfile');
// Log verbosity is taken from the LOGGER_LEVEL environment variable; 'warn' is used when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
class ESScrape extends Scraper {
constructor() {
super();
this.id = 'ES';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param $
* @returns {Promise<Array>}
*/
async extractPassporting($) {
const passporting = [];
const headerRow = $('td.tdSubtituloSeccion:contains("PAISES EN LOS QUE OPERA")').eq(0).parent().eq(0);
const passportRows = headerRow.nextAll('tr:not([height])'); // ignore the small divider row
passportRows.each(function(i, elem) {
passporting.push(
{
'country': $(elem).find('td').eq(0).text(),
'mode': $(elem).find('td').eq(1).text()
}
);
});
return passporting;
}
/**
*
* @param $
* @returns {Promise<Array>}
*/
async extractActivities($) {
const activities = [];
const headerRow = $('td.tdSubtituloSeccion td.tdSubtituloSeccion:contains("ACTIVIDADES")').eq(0).parent().eq(0);
const activityRows = headerRow.nextAll('tr:not([height])'); // ignore the small divider row
activityRows.each(function(i, elem) {
activities.push($(elem).text());
});
for (let i = 0; i < activities.length; i++)
activities[i] = this._cleanUp(activities[i]);
return activities;
}
/**
*
* @param $
* @param details
* @returns {Promise<void>}
*/
async extractSingleFields($, details) {
const mainDiv = $('div#divSalida > table.tablaParametros > tbody > tr > td> table > tbody');
details.bancoDeEspanaCode = this._cleanUp($(mainDiv).find('input[name=CODIGO]').val());
details.bancoDeEspanaPrevCode = this._cleanUp($(mainDiv).find('input[name=CODIGO_PREVIO]').val());
details.effectiveFrom = this._cleanUp($(mainDiv).find('input[name=FechaAlta1]').val());
details.effectiveTo = this._cleanUp($(mainDiv).find('input[name=FechaBaja]').val());
details.lastUpdated = this._cleanUp($(mainDiv).find('input[name=FechaActualizacion]').val());
details.name = this._cleanUp(
// Can't find accent in "Denominación:" so search for half the word:
$(mainDiv).children('tr:contains("Denominaci")').nextAll().eq(0).find('textarea').text()
);
details.institutionType = this._cleanUp(
$(mainDiv).children('tr:contains("Tipo de entidad:")').nextAll().eq(0).find('textarea').text()
);
details.address = this._cleanUp(
$(mainDiv).children('tr:contains("Domicilio:")').nextAll().eq(0).find('textarea').text()
);
details.legalEntityIdentifierCode = this._cleanUp(
$(mainDiv).find('input[name=CODIGO_PREVIO]').parent().nextAll().eq(3).children('input').val()
);
details.shortName = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("Nombre abreviado:")').nextAll().eq(1).children('input').val()
);
details.nif = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("N.I.F.:")').nextAll().eq(1).find('td.textoCampo input').val()
);
// Can't find "Teléfono", probably due to accent. Search for "fono" instead.
details.telephone = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("fono:")').nextAll().eq(1).find('td.textoCampo input').val()
);
details.fax = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("Fax:")').nextAll().eq(1).find('td.textoCampo input').val()
);
details.website = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("Dom. / Dir. Internet:")').nextAll().eq(1).find('a').text()
);
details.safeguardOfFunds = this._cleanUp(
$(mainDiv).find('td.tdSubtituloSeccion:contains("SALVAGUARDA DE FONDOS")').parent().nextAll('tr').eq(1).text()
);
details.financialExclusivity = this._cleanUp(
$(mainDiv).find('td.tdSubtituloSeccion:contains("EXCLUSIVIDAD FINANCIERA")').parent().nextAll('tr').eq(1).text()
);
if ($(mainDiv).find('li.textoAvisoResaltado').length > 0)
details.notice = this._cleanUp(
$(mainDiv).find('li.textoAvisoResaltado').text()
);
else
details.notice = '';
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
const details = {};
const $ = cheerio.load(html);
try {
await this.extractSingleFields($, details);
details.activities = await this.extractActivities($);
details.passporting = await this.extractPassporting($);
}
catch (err) {
logger.error(err);
}
return details;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
const noResultsSelector = '//td[@class="textoEtiqueta"][contains(text(), "NO SE HAN ENCONTRADO ENTIDADES SEGUN LOS CRITERIOS DE BUSQUEDA.")]';
const paginationRowSelector = '//table[@class="tablaResultados"]//td[@colspan="4"]';
await this._randomWait(this.page, 3, 5);
// pagination row is the last to load, so wait for that before scraping the links
// Sometimes the row is empty, so look for the surrounding td with `colspan=4`
// also look for the "no results" notice in case the result set is empty
await this.page.waitForXPath(`${noResultsSelector} | ${paginationRowSelector}`);
logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);
const filename = this.modeNames[this.mode];
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}_${serviceObject.paginationStep}`, null);
if (this.page.$x(noResultsSelector).length > 0) {
logger.info(`Results page ${serviceObject.indexStep} for ${this.modeNames[this.mode]} is empty`);
return;
}
// TODO: handle when the table loads, but the entity links are missing (happens occasionally)
const body = await this.page.content();
const $ = cheerio.load(body);
const links = $('table.tablaResultados tr.estilofila a');
links.each((i, item) => {
const href = $(item).attr('href');
// ignore any javascript print links
if (href.startsWith('javascript'))
return;
const text = $(item).text().trim();
const newUrl = `http://app.bde.es${href}`;
const id = this._makeFieldName(text);
serviceObject.links.push({ 'name':text, 'href':newUrl, 'id':id });
});
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
await this._randomWait(this.page, 6, 9);
logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);
await this.processIndex(serviceObject);
const nextButtons = await this.page.$x('//a[contains(text(), \'Siguiente\')]');
if (nextButtons.length > 0) {
serviceObject.paginationStep++;
await nextButtons[0].click();
}
else if (serviceObject.indexStep < serviceObject.urls.length - 1) {
serviceObject.indexStep++;
serviceObject.paginationStep = 0;
const newUrl = serviceObject.urls[serviceObject.indexStep];
await this._goto(newUrl);
}
else
this.emit('indexdone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
logger.debug('>> indexRedirector');
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this.page.waitForSelector('td.tdContenido', { 'visible':true, 'timeout':7500 }); // Wait for buttons at bottom of table to be visible
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const details = await this.extractEntityDetails(body);
await jsonfile.writeFile(`${filePath}.json`, { details });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
const qstring = querystring.parse(pageUrl.search);
if ('TIPO' in qstring) // 'type'
await this.indexRedirector();
else if ('CODBE' in qstring) // 'code'
await this.processRedirector();
else {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('serviceDone', async function() {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} paymentServices items indexed`);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} emoneyServices items indexed`);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} creditServices items indexed`);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done' : false,
'urls': [
'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EP&DONDE=11&LEI=&ORDEN=2&RADIO=0', // Payment Entities
'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EPH&DONDE=11&LEI=&ORDEN=2&RADIO=0' // Hybrid Payment Entities
],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done' : false,
'urls': ['http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EDE&DONDE=11&LEI=&ORDEN=2&RADIO=0'], // Electronic Money Entities
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done' : false,
'urls': ['http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=BP&DONDE=11&LEI=&ORDEN=2&RADIO=0'], // Credit institutions
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/ES/BE`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Body of the throttled `run` wrapper created in the constructor: it simply awaits `start()`.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = ESScrape;