const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const logger = require('log4js').getLogger('ES');
const url = require('url');
const querystring = require('querystring');
const removeAccents = require('remove-accents-diacritics');
const jsonfile = require('jsonfile');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the Banco de España entity register (app.bde.es).
 *
 * The scrape walks three services in sequence, selected by `this.mode`:
 * 0 = payment services, 1 = e-money services, 2 = credit services. For each
 * service it first builds an index of entity links from the paginated search
 * results, then visits every link, screenshots it and writes the extracted
 * details to a JSON file.
 *
 * NOTE(review): `this.page`, `this.path`, `this.debugPath`, `this.modeTitles`,
 * `this.modeNames`, `this.modePrefix` and all `_`-prefixed helpers are not
 * defined in this file; they are presumably provided by the `Scraper` base
 * class - confirm there.
 */
class ESScrape extends Scraper {
  constructor() {
    super();

    this.id = 'ES';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            this.run();
          }
        })
        .catch((err) => {
          // Without a handler a failed lock check would be an unhandled rejection.
          logger.error(err);
        });
    }
  }

  /**
   * Reads the "PAISES EN LOS QUE OPERA" table of an entity page.
   *
   * @param $ cheerio root of the loaded entity page
   * @returns {Promise<Array<{country: string, mode: string}>>} one entry per table row,
   *   taken from the text of the first and second cell
   */
  async extractPassporting($) {
    const passporting = [];
    const headerRow = $('td.tdSubtituloSeccion:contains("PAISES EN LOS QUE OPERA")').eq(0).parent().eq(0);
    // Rows carrying a `height` attribute are the small divider rows; ignore them.
    const passportRows = headerRow.nextAll('tr:not([height])');

    passportRows.each((i, elem) => {
      const cells = $(elem).find('td');

      passporting.push({
        'country': cells.eq(0).text(),
        'mode': cells.eq(1).text()
      });
    });

    return passporting;
  }

  /**
   * Reads the rows following the "ACTIVIDADES" header of an entity page.
   *
   * @param $ cheerio root of the loaded entity page
   * @returns {Promise<Array>} the text of every activity row, passed through `_cleanUp`
   */
  async extractActivities($) {
    const activities = [];
    const headerRow = $('td.tdSubtituloSeccion td.tdSubtituloSeccion:contains("ACTIVIDADES")').eq(0).parent().eq(0);
    // Rows carrying a `height` attribute are the small divider rows; ignore them.
    const activityRows = headerRow.nextAll('tr:not([height])');

    activityRows.each((i, elem) => {
      activities.push(this._cleanUp($(elem).text()));
    });

    return activities;
  }

  /**
   * Copies the single-valued fields of an entity page onto `details`.
   *
   * @param $ cheerio root of the loaded entity page
   * @param {Object} details object that receives the extracted fields (mutated in place)
   * @returns {Promise<void>}
   */
  async extractSingleFields($, details) {
    const mainDiv = $('div#divSalida > table.tablaParametros > tbody > tr > td> table > tbody');

    // Value of the <input> with the given name attribute.
    const inputValue = (name) => this._cleanUp(mainDiv.find(`input[name=${name}]`).val());
    // Text of the <textarea> in the row right after the row containing `label`.
    const textareaAfter = (label) => this._cleanUp(
      mainDiv.children(`tr:contains("${label}")`).nextAll().eq(0).find('textarea').text()
    );
    // Second sibling after the label cell containing `label`.
    const cellAfterLabel = (label) => mainDiv.find(`td.textoEtiqueta:contains("${label}")`).nextAll().eq(1);
    // Text of the second row after the section header containing `title`.
    const sectionText = (title) => this._cleanUp(
      mainDiv.find(`td.tdSubtituloSeccion:contains("${title}")`).parent().nextAll('tr').eq(1).text()
    );

    details.bancoDeEspanaCode = inputValue('CODIGO');
    details.bancoDeEspanaPrevCode = inputValue('CODIGO_PREVIO');
    details.effectiveFrom = inputValue('FechaAlta1');
    details.effectiveTo = inputValue('FechaBaja');
    details.lastUpdated = inputValue('FechaActualizacion');

    // Can't find accent in "Denominación:" so search for half the word:
    details.name = textareaAfter('Denominaci');
    details.institutionType = textareaAfter('Tipo de entidad:');
    details.address = textareaAfter('Domicilio:');

    details.legalEntityIdentifierCode = this._cleanUp(
      mainDiv.find('input[name=CODIGO_PREVIO]').parent().nextAll().eq(3).children('input').val()
    );

    details.shortName = this._cleanUp(cellAfterLabel('Nombre abreviado:').children('input').val());
    details.nif = this._cleanUp(cellAfterLabel('N.I.F.:').find('td.textoCampo input').val());
    // Can't find "Teléfono", probably due to accent. Search for "fono" instead.
    details.telephone = this._cleanUp(cellAfterLabel('fono:').find('td.textoCampo input').val());
    details.fax = this._cleanUp(cellAfterLabel('Fax:').find('td.textoCampo input').val());
    details.website = this._cleanUp(cellAfterLabel('Dom. / Dir. Internet:').find('a').text());

    details.safeguardOfFunds = sectionText('SALVAGUARDA DE FONDOS');
    details.financialExclusivity = sectionText('EXCLUSIVIDAD FINANCIERA');

    const noticeItems = mainDiv.find('li.textoAvisoResaltado');

    details.notice = noticeItems.length > 0 ? this._cleanUp(noticeItems.text()) : '';
  }

  /**
   * Parses an entity page and extracts every field this scraper knows about.
   *
   * Extraction errors are logged, not thrown; whatever was collected before
   * the failure is still returned.
   *
   * @param {string} html full HTML of the entity page
   * @returns {Promise<Object>} the extracted details (possibly partial)
   */
  async extractEntityDetails(html) {
    const details = {};
    const $ = cheerio.load(html);

    try {
      await this.extractSingleFields($, details);

      details.activities = await this.extractActivities($);
      details.passporting = await this.extractPassporting($);
    } catch (err) {
      logger.error(err);
    }

    return details;
  }

  /**
   * Screenshots the current search-results page and appends its entity links
   * to `serviceObject.links`.
   *
   * @param {Object} serviceObject state of the service being indexed (mutated in place)
   * @returns {Promise<void>}
   */
  async processIndex(serviceObject) {
    const noResultsSelector = '//td[@class="textoEtiqueta"][contains(text(), "NO SE HAN ENCONTRADO ENTIDADES SEGUN LOS CRITERIOS DE BUSQUEDA.")]';
    const paginationRowSelector = '//table[@class="tablaResultados"]//td[@colspan="4"]';

    await this._randomWait(this.page, 3, 5);

    // pagination row is the last to load, so wait for that before scraping the links
    // Sometimes the row is empty, so look for the surrounding td with `colspan=4`
    // also look for the "no results" notice in case the result set is empty
    await this.page.waitForXPath(`${noResultsSelector} | ${paginationRowSelector}`);

    logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);

    const filename = this.modeNames[this.mode];

    // Awaited, as in processEntityDetails, so a failure surfaces here instead
    // of becoming an unhandled rejection.
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}_${serviceObject.paginationStep}`, null);

    // `$x` resolves to an array of handles; it must be awaited before `.length` is read.
    const noResultsNotices = await this.page.$x(noResultsSelector);

    if (noResultsNotices.length > 0) {
      logger.info(`Results page ${serviceObject.indexStep} for ${this.modeNames[this.mode]} is empty`);

      return;
    }

    // TODO: handle when the table loads, but the entity links are missing (happens occasionally)
    const body = await this.page.content();
    const $ = cheerio.load(body);
    const links = $('table.tablaResultados tr.estilofila a');

    links.each((i, item) => {
      const href = $(item).attr('href');

      // ignore anchors without a target and any javascript print links
      if (!href || href.startsWith('javascript')) {
        return;
      }

      const text = $(item).text().trim();
      const newUrl = `http://app.bde.es${href}`;
      const id = this._makeFieldName(text);

      serviceObject.links.push({ 'name': text, 'href': newUrl, 'id': id });
    });
  }

  /**
   * Indexes the current results page, then moves to the next results page,
   * the next search URL of the service, or emits `indexdone` when both are
   * exhausted.
   *
   * @param {Object} serviceObject state of the service being indexed (mutated in place)
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    await this._randomWait(this.page, 6, 9);

    logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);

    await this.processIndex(serviceObject);

    const nextButtons = await this.page.$x('//a[contains(text(), \'Siguiente\')]');

    if (nextButtons.length > 0) {
      serviceObject.paginationStep++;

      await nextButtons[0].click();
    } else if (serviceObject.indexStep < serviceObject.urls.length - 1) {
      serviceObject.indexStep++;
      serviceObject.paginationStep = 0;

      await this._goto(serviceObject.urls[serviceObject.indexStep]);
    } else {
      this.emit('indexdone');
    }
  }

  /**
   * Runs `buildIndex` for the service selected by `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    logger.debug('>> indexRedirector');

    switch (this.mode) {
      case 0:
        await this.buildIndex(this.paymentServices);
        break;
      case 1:
        await this.buildIndex(this.emoneyServices);
        break;
      case 2:
        await this.buildIndex(this.creditServices);
        break;
    }
  }

  /**
   * Screenshots the current entity page, writes its details to a JSON file
   * and navigates to the next entity, or emits `serviceDone` after the last.
   *
   * @param {Object} serviceObject state of the service being processed (mutated in place)
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {
    const noWhiteSpace = /\W/g;
    const { name, id } = serviceObject.links[serviceObject.step];
    const progress = `${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`;

    logger.info(`Process ${progress}`);

    // Wait for buttons at bottom of table to be visible
    await this.page.waitForSelector('td.tdContenido', { 'visible': true, 'timeout': 7500 });
    await this._randomWait(this.page, 3, 5);

    const entity = removeAccents.remove(id.trim());
    const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    await this._randomWait(this.page, 3, 5);

    logger.info(`Taking screenshot of ${progress}`);

    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const details = await this.extractEntityDetails(body);

    await jsonfile.writeFile(`${filePath}.json`, { details });
    await this._randomWait(this.page, 3, 5);

    // NOTE(review): `filePath` is capped at 240 characters but the name recorded
    // here is not, so the two can disagree for very long names - confirm
    // whether consumers of `links[].filename` rely on it matching the file on disk.
    serviceObject.links[serviceObject.step].filename = `${filename}.json`;
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items) {
      await this._goto(serviceObject.links[serviceObject.step].href);
    } else {
      this.emit('serviceDone');
    }
  }

  /**
   * Runs `processEntityDetails` for the service selected by `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    switch (this.mode) {
      case 0:
        await this.processEntityDetails(this.paymentServices);
        break;
      case 1:
        await this.processEntityDetails(this.emoneyServices);
        break;
      case 2:
        await this.processEntityDetails(this.creditServices);
        break;
    }
  }

  /**
   * Dispatches a freshly loaded page: search-result pages (query parameter
   * `TIPO`) go to the indexer, entity pages (query parameter `CODBE`) go to
   * the detail processor.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page carries neither `TIPO` nor `CODBE`
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    // Parse `query`, not `search`: `search` keeps the leading "?", which would
    // be glued onto the first parameter name and hide it from the checks below.
    const qstring = querystring.parse(pageUrl.query);

    if ('TIPO' in qstring) { // 'type'
      await this.indexRedirector();
    } else if ('CODBE' in qstring) { // 'code'
      await this.processRedirector();
    } else {
      await this._uploadError();

      // Interpolating the parsed Url object itself would print "[object Object]".
      throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Registers the listeners that drive the scrape from one phase to the next:
   * `indexdone` -> per-service `*indexdone` -> entity processing ->
   * `serviceDone` -> per-service `*ServicesDone` -> next service or `done`.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    // Starts entity processing for a service once its index is complete.
    const startEntities = async (serviceName) => {
      const service = this[serviceName];

      try {
        service.items = service.links.length;

        logger.info(`${service.items} ${serviceName} items indexed`);

        if (service.items === 0) {
          // Nothing to visit: finish the service instead of dereferencing a missing link.
          logger.warn(`No ${serviceName} links were indexed, skipping entity processing`);
          this.emit('serviceDone');

          return;
        }

        await this._goto(service.links[service.step].href);
      } catch (e) {
        logger.error(e);
      }
    };

    // Persists a finished service, advances the mode and runs `next`.
    const finishService = async (serviceName, next) => {
      logger.warn(`${serviceName}Done`);

      try {
        const service = this[serviceName];

        service.done = true;

        jsonfile.writeFileSync(`${this.path}/${serviceName}.json`, { 'links': service.links });
        jsonfile.writeFileSync(`${this.debugPath}/${serviceName}.json`, service);

        this.mode++;
        this.inProgress = false;

        await next();
      } catch (e) {
        logger.error(e);
      }
    };

    this.on('serviceDone', async () => {
      switch (this.mode) {
        case 0:
          this.emit('paymentServicesDone');
          break;
        case 1:
          this.emit('emoneyServicesDone');
          break;
        case 2:
          this.emit('creditServicesDone');
          break;
      }
    });

    this.on('psindexdone', () => startEntities('paymentServices'));
    this.on('emindexdone', () => startEntities('emoneyServices'));
    this.on('ciindexdone', () => startEntities('creditServices'));

    this.on('indexdone', async () => {
      switch (this.mode) {
        case 0:
          this.emit('psindexdone');
          break;
        case 1:
          this.emit('emindexdone');
          break;
        case 2:
          this.emit('ciindexdone');
          break;
      }
    });

    this.on('paymentServicesDone', () => finishService('paymentServices', () => this._goto(this.emoneyServices.urls[0])));
    this.on('emoneyServicesDone', () => finishService('emoneyServices', () => this._goto(this.creditServices.urls[0])));
    this.on('creditServicesDone', () => finishService('creditServices', async () => {
      this.emit('done');
    }));
  }

  /**
   * Resets the per-service state, opens the browser page and navigates to the
   * first payment-services search URL.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure during start-up
   */
  async start() {
    super._start();

    try {
      // Fresh bookkeeping object for one service.
      const makeService = (urls) => ({
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'paginationStep': 0,
        'visited': false,
        'done': false,
        'urls': urls,
        'sections': [],
        'sectionLinks': []
      });

      this.mode = 0;

      this.paymentServices = makeService([
        'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EP&DONDE=11&LEI=&ORDEN=2&RADIO=0', // Payment Entities
        'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EPH&DONDE=11&LEI=&ORDEN=2&RADIO=0' // Hybrid Payment Entities
      ]);

      this.emoneyServices = makeService([
        'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EDE&DONDE=11&LEI=&ORDEN=2&RADIO=0' // Electronic Money Entities
      ]);

      this.creditServices = makeService([
        'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=BP&DONDE=11&LEI=&ORDEN=2&RADIO=0' // Credit institutions
      ]);

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/ES/BE`));

      // A non-repudiation failure is logged but does not stop the scrape.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): presumably exactly two event names are registered before
      // attachEvents has run (the constructor's 'done' plus one from the base
      // class), so this avoids attaching the listeners twice - confirm.
      if (this.eventNames().length === 2) {
        await this.attachEvents();
      }

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Rethrow real errors untouched so the original stack trace survives.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point behind the throttled `run` created in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}

module.exports = ESScrape;