const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('MT');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the register pages configured in `start()` (MFSA licence holders).
 *
 * The flow is event driven: every `domcontentloaded` on the browser page is
 * routed through `processNewPage()`, which dispatches on the URL path to either
 * the index-building step or the entity-detail step for the current `mode`
 * (0 = payment services, 1 = e-money, 2 = credit services).
 */
class MTScrape extends Scraper {

  /**
   * Registers the `done` listener, builds the debounced `run` entry point and,
   * when NODE_ENV is `production`, starts a run if `_checkLock()` resolves truthy.
   */
  constructor() {
    super();
    this.id = 'MT';

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          // Without this a failed lock check is an unhandled rejection.
          logger.error('_checkLock fail', err);
        });
  }

  /**
   * Extracts entity details from the pre-reskin page markup
   * (`#lblName`, `div#pnlCommonDetails`, `#LHDetails`).
   * Not called from within this file; `extractEntityV2` is the one in use.
   *
   * @param html - Page HTML to parse.
   * @returns {Promise<{authorization, details}>}
   */
  async OLDextractEntity(html) {
    const $ = cheerio.load(html);
    const details = {};
    const authorization = {};

    details.name = this._cleanUp($('#lblName').text());

    const dlCells = $('div#pnlCommonDetails').children();
    const superCells = $('#LHDetails span.fix-width-caption');

    // #lblStatus
    dlCells.each((index, item) => {
      if ($(item).attr('id') === 'pnlRegDate') {
        // This panel holds "label : value" inside a single span.
        const itemText = this._cleanUp($(item).find('span').text()).split(/\s*:\s*/);
        details[itemText[0]] = itemText[1];
      }
      else {
        const current = this._cleanUp($(item).find('p').text()).replace(/\s*:\s*/, '');
        details[current] = this._cleanUp($(item).find('span').text());
      }
    });

    superCells.each((index, item) => {
      const nextElm = $($(item).next());
      const li = $(nextElm).find('li');
      const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');

      authorization[thisId] = [];

      if (li.length > 0)
        li.each((index, item) => {
          const auth = $(item).html().split(' - ');
          auth[1] = this._cleanUp(auth[1]);
          authorization[thisId].push(auth);
        });
      else {
        const itemText = this._cleanUp($(nextElm).text());
        authorization[thisId].push(itemText);
      }
    });

    return { details, authorization };
  }

  /**
   * Extracts entity details from the reskinned page markup.
   *
   * `details` is filled from the rows of `table#tableLicenceResult`: the first
   * cell (up to its first colon) is the key, the second cell the value. A row
   * whose first cell has no colon is treated as a continuation and appended to
   * the previous key's value. `authorization` is filled from the element
   * following each `#LHDetails span.fix-width-caption`.
   *
   * @param html - Page HTML to parse.
   * @returns {Promise<{authorization, details}>} Also carries an `errors` array
   *   (raw HTML fragments) when an authorization block has more than one child.
   */
  async extractEntityV2(html) {
    const trimToColon = /^.*?(?=(:))/;
    const $ = cheerio.load(html);
    const details = {};
    const authorization = {};
    const errors = [];

    details.name = this._cleanUp($('div#mainTitle > div').text());

    const dlCells = $('table#tableLicenceResult tr');
    const superCells = $('#LHDetails span.fix-width-caption');
    let previousLabel = '';

    dlCells.each((index, item) => {
      const children = $(item).children();
      const rawLabel = $(children).eq(0).text().match(trimToColon);
      const itemValue = this._cleanUp($(children).eq(1).text().trim());

      if (rawLabel !== null) {
        const itemLabel = this._cleanUp(rawLabel[0]);
        details[itemLabel] = itemValue;
        previousLabel = itemLabel;
      }
      else {
        // Continuation row. If it appears before any labelled row there is
        // nothing to append to, so store the value instead of calling
        // .concat on undefined.
        const previousValue = details[previousLabel];
        details[previousLabel] = (previousValue === undefined)
          ? itemValue
          : previousValue.concat([itemValue]);
      }
    });

    superCells.each((index, item) => {
      const nextElm = $($(item).next());
      const children = $(nextElm).children();

      if ($(children).length <= 1) {
        const li = $(nextElm).find('li');
        const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');

        authorization[thisId] = [];

        if (li.length > 0)
          li.each((index, item) => {
            const auth = $(item).text().split(' - ');
            auth[1] = this._cleanUp(auth[1]);
            if (auth[1] !== '') authorization[thisId].push(auth);
          });
        else {
          const itemText = this._cleanUp($(nextElm).text());
          authorization[thisId].push(itemText);
        }
      }
      else {
        // More than one child element is not a shape this parser handles;
        // keep the raw HTML so it can be inspected later.
        logger.warn('Possible error in the HTML');
        logger.warn($(nextElm).html());
        errors.push($(nextElm).html());
      }
    });

    const outObj = { details, authorization };

    if (errors.length > 0) outObj.errors = errors;

    return outObj;
  }

  /**
   * Collects result links from the pre-reskin paged grid and either advances to
   * the next page or emits `indexdone`.
   * Not called from within this file; `processIndexV2` is the one in use.
   *
   * @param serviceObject - Service state; `links` is appended to and `indexStep` advanced.
   * @returns {Promise}
   */
  async OLDprocessIndex(serviceObject) {
    logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
    await this._randomWait(this.page, 3, 5);

    const pagingItem = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tfoot > tr > td > table > tbody > tr > td > div.rgWrap.rgInfoPart strong');

    // The page count is read from the SECOND <strong>, so at least two are needed.
    const maxPagesText = (pagingItem.length > 1)
      ? await this.page.evaluate(el => el.innerText, pagingItem[1])
      : '0';
    const maxPages = parseInt(maxPagesText, 10);

    const links = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tbody > tr > td> a');

    for (const item of links) {
      const id = await this.page.evaluate(el => el.innerText, item);
      const href = await this.page.evaluate(el => el.href, item);
      const params = this._getParamsFromUrl(href);

      serviceObject.links.push({ id, href, 'entId': params.id, 'metaStep': serviceObject.indexMetaStep });
    }

    if (serviceObject.indexStep < (maxPages - 1)) {
      serviceObject.indexStep++;
      await this._findAndClick('input.rgPageNext');
    }
    else
      this.emit('indexdone');
  }

  /**
   * Collects every `#tableResult span` on the current result page into
   * `serviceObject.links`, then emits `indexdone`.
   *
   * The entity id is the first run of digits in the span's `onclick` attribute.
   * Spans without an `onclick`, or without digits in it, are skipped with a warning.
   *
   * @param serviceObject - Service state; `links` is appended to.
   * @returns {Promise}
   */
  async processIndexV2(serviceObject) {
    // #tableResult span
    const numberRegEx = /\d+/;

    logger.debug('+ processIndexV2');
    logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
    await this._randomWait(this.page, 3, 5);

    const links = await this.page.$$('#tableResult span');

    for (const item of links) {
      const id = await this.page.evaluate(el => el.innerText, item);
      const href = await this.page.evaluate(el => el.getAttribute('onclick'), item);

      // getAttribute returns null when the attribute is absent.
      const entIdMatch = (href || '').match(numberRegEx);

      if (entIdMatch === null) {
        logger.warn('processIndexV2: no entity id found for', id);
        continue;
      }

      serviceObject.links.push({ id, 'entId': entIdMatch[0], 'metaStep': serviceObject.indexMetaStep });
    }

    this.emit('indexdone');
  }

  /**
   * Drives the two pre-reskin combo boxes to the wanted option pair and clicks search.
   * Not called from within this file; `initiateIndexV2` is the one in use.
   *
   * @param serviceObject - Service state; `indexMeta[indexMetaStep]` is the wanted pair.
   * @returns {Promise}
   */
  async OLDinitiateIndex(serviceObject) {
    logger.debug('initiateIndex');
    const matched = { 'left':false, 'right':false };

    // first time around.
    // need to kick off the index correctly..
    await this._findAndClick('#ctl00_cphMain_RadComboBox1');
    await this._randomWait(this.page, 2, 3);

    const leftOptions = await this.page.$$('#ctl00_cphMain_RadComboBox1_DropDown > div > ul.rcbList li');
    const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];

    for (const item of leftOptions) {
      const text = await this.page.evaluate(el => el.innerText, item);

      if (wantedOption.indexOf(text) !== -1) {
        await item.click({ 'delay':95 });
        matched.left = true;
        // this element can take a while to reload..
        break;
      }
    }

    await this._randomWait(this.page, 7, 9);
    await this._findAndClick('#ctl00_cphMain_RadComboBox2_Input');
    await this._randomWait(this.page, 2, 3);

    const rightOptions = await this.page.$$('#ctl00_cphMain_RadComboBox2_DropDown > div > ul.rcbList li');

    for (const item of rightOptions) {
      const text = await this.page.evaluate(el => el.innerText, item);

      if (text === wantedOption[1]) {
        matched.right = true;
        await item.click({ 'delay':95 });
        break;
      }
    }

    // Wait for items to settle
    await this._randomWait(this.page, 2, 3);

    if (matched.left && matched.right) {
      serviceObject.started = true;
      await this._findAndClick('#cphMain_btnSearch2');
    }
    else
      logger.error('Not fully matched', matched);
  }

  /**
   * Reworked for site reskin.
   *
   * Selects the wanted pair in `select#select1` / `select#select2`, clicks
   * `button.searchButtonAdv` and emits `processIndex`. If either option cannot
   * be matched the search is not started and an error is logged.
   *
   * @param serviceObject - Service state; `indexMeta[indexMetaStep]` is the
   *   wanted `[left, right]` pair. `started` is set to true on success.
   * @returns {Promise}
   */
  async initiateIndexV2(serviceObject) {
    logger.debug('initiateIndexV2');
    const matched = { 'left':false, 'right':false };

    // first time around.
    // need to kick off the index correctly..
    // select#select1
    const leftOptions = await this.page.$$('select#select1 option');
    const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];

    for (const item of leftOptions) {
      const rawText = await this.page.evaluate(el => el.innerText, item);
      const value = await this.page.evaluate(el => el.value, item);
      const text = this._cleanUp(rawText);

      // Matches against either entry of the pair, as the original did.
      if (wantedOption.includes(text)) {
        await this.page.select('select#select1', value);
        matched.left = true;
        break;
      }
    }

    // Wait for items to settle
    await this._randomWait(this.page, 2, 3);

    const rightOptions = await this.page.$$('select#select2 option');

    for (const item of rightOptions) {
      const rawText = await this.page.evaluate(el => el.innerText, item);
      const value = await this.page.evaluate(el => el.value, item);
      const text = this._cleanUp(rawText);

      if (text === wantedOption[1]) {
        matched.right = true;
        await this.page.select('select#select2', value);
        break;
      }
    }

    await this._randomWait(this.page, 2, 2);

    if (matched.left && matched.right) {
      serviceObject.started = true;
      await this._findAndClick('button.searchButtonAdv');
      this.emit('processIndex');
    }
    else
      logger.error('Not fully matched', matched);
  }

  /**
   * Starts the search for the current meta step if it has not been started yet,
   * otherwise harvests the result list.
   *
   * @param serviceObject - Service state for the current mode.
   * @returns {Promise}
   */
  async buildIndex(serviceObject) {
    logger.debug('buildIndex');

    if (!serviceObject.started)
      await this.initiateIndexV2(serviceObject);
    else
      await this.processIndexV2(serviceObject);
  }

  /**
   * Loads the licence holder at `serviceObject.links[serviceObject.step]`.
   *
   * @param serviceObject - Service state for the current mode.
   * @returns {Promise}
   */
  async nextItem(serviceObject) {
    const entId = serviceObject.links[serviceObject.step].entId;

    logger.debug('nextItem', entId);

    await this.newLoadLicenceHolder(entId);
  }

  /**
   * Handles a landing on the register index page: builds the index while
   * `this.processing` is false, otherwise loads the next indexed entity.
   * Modes outside 0-2 are ignored.
   *
   * @returns {Promise}
   */
  async indexRedirector() {
    if (!this.processing)
      switch (this.mode) {
        case 0:
          await this.buildIndex(this.paymentServices);
          break;
        case 1:
          await this.buildIndex(this.emoneyServices);
          break;
        case 2:
          await this.buildIndex(this.creditServices);
          break;
      }
    else
      switch (this.mode) {
        case 0:
          await this.nextItem(this.paymentServices);
          break;
        case 1:
          await this.nextItem(this.emoneyServices);
          break;
        case 2:
          await this.nextItem(this.creditServices);
          break;
      }
  }

  /**
   * Captures the currently displayed entity: takes a screenshot, extracts the
   * details with `extractEntityV2` and writes them to `<filePath>.json`, then
   * advances `serviceObject.step`. Navigates back to the start page while
   * items remain, otherwise emits `serviceDone`.
   *
   * @param serviceObject - Service state for the current mode.
   * @returns {Promise}
   */
  async processEntityDetails(serviceObject) {
    const noWhiteSpace = /\W/g;
    const { id, entId } = serviceObject.links[serviceObject.step];

    logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step}:${id}`);

    await this._randomWait(this.page, 3, 5);

    const entity = removeAccents.remove(id.trim());

    // NOTE(review): this.modePrefix is not assigned anywhere in this file —
    // presumably provided by the Scraper base class; confirm.
    const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_'), `_${entId}`].join('');

    // Only the full path is capped at 240 characters; `filename` recorded
    // below is not truncated.
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const details = await this.extractEntityV2(body);

    await jsonfile.writeFile(`${filePath}.json`, { details });
    await this._randomWait(this.page, 3, 5);

    serviceObject.links[serviceObject.step].filename = `${filename}.json`;
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items)
      await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
    else
      this.emit('serviceDone');
  }

  /**
   * Listener body for the `processIndex` event: harvests the result list for
   * the current mode. Modes outside 0-2 are ignored.
   *
   * @returns {Promise}
   */
  async handleProcessIndex() {
    switch (this.mode) {
      case 0:
        await this.processIndexV2(this.paymentServices);
        break;
      case 1:
        await this.processIndexV2(this.emoneyServices);
        break;
      case 2:
        await this.processIndexV2(this.creditServices);
        break;
    }
  }

  /**
   * Handles a landing on an entity result page by processing it for the
   * current mode. Modes outside 0-2 are ignored.
   *
   * @returns {Promise}
   */
  async processRedirector() {
    switch (this.mode) {
      case 0:
        await this.processEntityDetails(this.paymentServices);
        break;
      case 1:
        await this.processEntityDetails(this.emoneyServices);
        break;
      case 2:
        await this.processEntityDetails(this.creditServices);
        break;
    }
  }

  /**
   * Dispatches on the current page URL after a page load.
   *
   * Emits `recover` when Chrome landed on its error page. For an unrecognised
   * path: when NODE_ENV is set, uploads the error, emits `backoff` and throws;
   * otherwise only logs.
   *
   * @returns {Promise}
   * @throws {Error} `Unknown page: <href>` for an unrecognised path when NODE_ENV is set.
   */
  async processNewPage() {
    // give the ajax page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    logger.debug('processNewPage', pageUrl.href);

    switch (pageUrl.pathname) {
      case '/pages/licenceholders.aspx':
      case '/financial-services-register/':
        await this.indexRedirector();
        break;

      case '/pages/licenceholder.aspx':
      case '/financial-services-register/result/':
        await this.processRedirector();
        break;

      case '/en/our-registers/company-register/gransoverskridandehandel/':
        // NOTE(review): crossBorderRedirector is not defined in this file —
        // confirm the base class provides it, or that this case is a leftover.
        await this.crossBorderRedirector();
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          this.emit('backoff');
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }
        else {
          logger.warn('processNewPage Fell through');
          // Was `pathName`, an undeclared identifier that threw a ReferenceError.
          logger.warn('pathName', pageUrl.pathname);
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }
  }

  /**
   * Replaces the goto: opens a licence holder by calling the page's own
   * `loadLicenceHolder(id)` function, after forcing `form#loadHolder` to
   * target `_self`.
   *
   * @param id - Entity id, as captured into `links[].entId`.
   * @returns {Promise}
   */
  async newLoadLicenceHolder(id) {
    // loadLicenceHolder(10966)
    const formElm = await this.page.$('form#loadHolder');

    logger.debug('loadLicenceHolder', id);

    await this.page.evaluate(x => {
      x.target = '_self';
    }, formElm);

    await this._microWait(this.page, 5);

    // `loadLicenceHolder` is resolved inside the browser page, not in Node.
    await this.page.evaluate(x => {
      return loadLicenceHolder(x);
    }, id);
  }

  /**
   * Registers the listeners that move the scrape from index building to entity
   * processing and from one service to the next.
   *
   * Every listener is registered through a wrapper that logs a rejected
   * handler, so a failing step does not surface as an unhandled rejection.
   *
   * @returns {Promise}
   */
  async attachEvents() {
    // Registers an async handler and logs, rather than drops, its rejection.
    const listen = (eventName, handler) => {
      this.on(eventName, (...args) => {
        handler(...args).catch((err) => {
          logger.error(`${eventName} handler fail`, err);
        });
      });
    };

    // Shared body of the three per-service "index done" events: either reset
    // for the next indexMeta entry or switch over to processing the links.
    const onIndexDone = async (serviceObject) => {
      serviceObject.indexMetaStep++;

      if (serviceObject.indexMetaStep < serviceObject.indexMeta.length) {
        logger.info('Resetting for next meta index...');
        // next..
        serviceObject.started = false;
        serviceObject.indexStep = 0;

        await this._goto(this.startPage);
      }
      else {
        serviceObject.items = serviceObject.links.length;
        logger.info(`${serviceObject.items} items indexed`);

        await this._goto(this.startPage, { 'waitUntil':'networkidle0' });

        logger.warn('GO THROUGH THE NEW LIST!!!!');
        this.processing = true;

        await this._randomWait(this.page, 2, 2, 'New page transition');
      }
    };

    listen('processIndex', async () => {
      await this.handleProcessIndex();
    });

    // this.on('pageChanged', this._debounce(async () => { this.processNewPage().catch((err) => { logger.error('processNewPage fail', err); }); }, 1000));

    listen('psindexdone', async () => {
      await onIndexDone(this.paymentServices);
    });

    listen('emindexdone', async () => {
      await onIndexDone(this.emoneyServices);
    });

    listen('ciindexdone', async () => {
      await onIndexDone(this.creditServices);
    });

    listen('indexdone', async () => {
      switch (this.mode) {
        case 0:
          this.emit('psindexdone');
          break;
        case 1:
          this.emit('emindexdone');
          break;
        case 2:
          this.emit('ciindexdone');
          break;
      }
    });

    listen('serviceDone', async () => {
      switch (this.mode) {
        case 0:
          this.emit('paymentServicesDone');
          break;
        case 1:
          this.emit('emoneyServicesDone');
          break;
        case 2:
          this.emit('creditServicesDone');
          break;
      }
    });

    listen('paymentServicesDone', async () => {
      this.paymentServices.done = true;
      jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

      this.mode++;
      this.processing = false;

      await this._goto(this.emoneyServices.urls[0]);
    });

    listen('emoneyServicesDone', async () => {
      this.emoneyServices.done = true;
      jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);

      this.mode++;
      this.processing = false;

      await this._goto(this.creditServices.urls[0]);
    });

    listen('creditServicesDone', async () => {
      this.creditServices.done = true;
      jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
      jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);

      this.emit('done');
    });
  }

  /**
   * Resets the per-service state, opens the browser page, wires up the page and
   * scraper events and navigates to the start page.
   *
   * @returns {Promise}
   * @throws {Error} Wraps any failure; the original error is available as `cause`.
   */
  async start() {
    super._start();

    try {
      this.mode = 0;
      this.processing = false;

      this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];

      this.paymentServices = {
        'items': 0,
        'links': [],
        // NOTE(review): the other two services start at step 0; 46 looks like
        // a debugging leftover — confirm before relying on it.
        'step': 46,
        'indexStep': 0,
        'indexMetaStep':0,
        'visited': false,
        'done' : false,
        'started': false,
        'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
        'indexMeta' : [
          ['Financial Institutions', 'Financial Institutions licensed to undertake payment services under the 2nd Schedule to the Financial Institutions Act (Payment Institutions)'],
          ['Financial Institutions', 'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to provide services outside Malta'],
          ['Financial Institutions', 'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to establish a branch outside Malta']
        ]
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'indexMetaStep':0,
        'visited': false,
        'done' : false,
        'started': false,
        'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
        'indexMeta' : [
          ['Financial Institutions', 'Financial Institutions licenced to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions)'],
          ['Financial Institutions', 'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to provide services outside Malta'],
          ['Financial Institutions', 'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to establish a branch outside Malta']
        ]
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'indexMetaStep':0,
        'visited': false,
        'done' : false,
        'started': false,
        'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
        'indexMeta' : [
          ['Credit Institutions', 'Credit Institutions'],
          ['Credit Institutions', 'Freedom of Services and Establishments - Exercise of the freedom to provide services outside Malta'],
          ['Credit Institutions', 'Freedom of Services and Establishments - Exercise of the freedom to set up an establishment outside Malta']
        ]
      };

      this.startPage = this.paymentServices.urls[0];

      // NOTE(review): these two URLs are not read anywhere in this file and
      // point at a different domain — possibly copied from another scraper.
      this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
      this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';

      this.setPath(path.resolve(`${__dirname }/../artefacts/MT/MFSA`));

      // Best effort: a failure here is logged and the run continues.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._debounce(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Attach the scraper listeners only once; a count of 2 is taken to mean
      // that attachEvents() has not run yet on this instance.
      if (this.eventNames().length === 2) await this.attachEvents();

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Same message as before, but keep the original error (and its stack)
      // reachable for whoever handles this.
      const wrapped = new Error(e);
      wrapped.cause = e;
      throw wrapped;
    }
  }

  /**
   * Entry point wrapped by the debounced `run` created in the constructor.
   *
   * @returns {Promise}
   */
  async __run() {
    await this.start();
  }

}

module.exports = MTScrape;