const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('(LU)');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Wraps `func` so that rapid successive calls collapse into a single call.
 *
 * Each invocation of the returned function restarts a `wait` ms timer. When
 * `immediate` is falsy, `func` runs once the timer expires; when `immediate`
 * is truthy, `func` runs on the first call of a burst (no timer pending) and
 * the expiring timer only re-arms the wrapper.
 *
 * The wrapper is a regular function using rest parameters: an arrow function
 * has no `arguments` / `this` of its own, so it would forward the arguments
 * of `debounce` itself rather than those of the actual call.
 *
 * @param {Function} func - function to debounce
 * @param {number} wait - quiet period in milliseconds
 * @param {boolean} [immediate] - call on the leading edge instead of the trailing edge
 * @returns {Function} debounced wrapper around `func`
 */
function debounce(func, wait, immediate) {
  let timeout = null;

  return function debounced(...args) {
    const context = this;

    const later = () => {
      timeout = null;
      if (!immediate) func.apply(context, args);
    };

    const callNow = immediate && !timeout;

    clearTimeout(timeout);
    timeout = setTimeout(later, wait);

    if (callNow) func.apply(context, args);
  };
}

/**
 * Event-driven scraper for the `index.html` single-page application whose
 * URLs are configured in `start()`.
 *
 * The scraper works through three search modes in sequence
 * (`this.mode` 0 -> `paymentServices`, 1 -> `emoneyServices`,
 * 2 -> `creditServices`). Page navigation and the URL hash drive the state
 * machine: `pageChanged` -> `processNewPage` -> `handleIndexPage`, which then
 * dispatches on the hash.
 */
class LUScrape extends Scraper {
  /**
   * Registers the `done` listener, builds the throttled `run` entry point and
   * the debounced index-page handler. In production, `run` is started
   * automatically when `_checkLock()` resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('LU');

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          // without a handler a failed lock check becomes an unhandled rejection
          logger.error(err);
        });

    this.debounceHandleIndexPage = debounce(() => {
      // the index page sometimes reloads up to 3 times..
      this.emit('handleIndexPage');
    }, 7500);
  }

  /**
   * Dispatches on the hash fragment of the current page URL.
   *
   * `#Home` / `#AdvancedSearch` run the advanced search, `#ResultResearch`
   * and `#DetailEntity` emit the matching processing events, and a missing
   * hash emits `selectSearchManually`. Any other hash is logged as an error.
   *
   * @returns {Promise<void>}
   */
  async handleIndexPage() {
    const thisUrl = await this.page.url();
    const pageUrl = url.parse(thisUrl);

    switch (pageUrl.hash) {
      case '#Home':
      case '#AdvancedSearch':
        await this.indexPageHomeMode();
        break;

      case '#ResultResearch':
        this.emit('handleEntityIndex');
        break;

      case '#DetailEntity':
        this.emit('processEntity');
        break;

      case null:
        this.emit('selectSearchManually');
        break;

      default:
        logger.error('HASH NOT RECOGNISED');
        logger.error(pageUrl);
        break;
    }
  }

  /**
   * Fills in and submits the advanced search form for the current
   * `this.mode`, then emits `pageChanged` once the results title element is
   * present. Errors are logged, not rethrown.
   *
   * @returns {Promise<void>}
   */
  async indexPageHomeMode() {
    try {
      // values passed to the `advancedsearch_type` component, indexed by this.mode
      const searchType = ['6', '7', '1'];
      // container of the selectable options, indexed by this.mode
      const bodys = [
        '#advancedsearch_paymentservicestype-body',
        '#advancedsearch_electronicmoneytype-body',
        '#advancedsearch_banktype-body'
      ];
      const bankInputs = [
        '#advancedsearch_bankgroup1_inputEl',
        '#advancedsearch_bankgroupA_inputEl',
        '#advancedsearch_bankgroupB_inputEl',
        '#advancedsearch_bankgroupC_inputEl',
        '#advancedsearch_bankgroupD_inputEl',
        '#advancedsearch_bankgroup2_inputEl',
        '#advancedsearch_bankgroup3_inputEl'
      ];

      // click the advanced search button
      await this.page.waitForSelector('#menu_advanced').then(async (elm) => {
        await elm.click({ 'delay': Scraper.notARobot() });
      });

      // click the search type selector
      await this.page.waitForSelector('#advancedsearch_type-bodyEl').then(async (elm) => {
        await elm.click({ 'delay': Scraper.notARobot() });
      });

      await this._randomWait(this.page, 2, 2);

      // call the EXT function to set the advanced search mode..
      await this.page.evaluate(x => {
        return Ext.getCmp('advancedsearch_type').setValue(x);
      }, searchType[this.mode]);

      // Mode 0 & Mode 1 have a list of options which can be iterated easily
      // Mode 2 requires a handful of different inputs to be clicked on
      await this._microWait(this.page, 7);

      if (this.mode === 0) {
        await this.page.waitForSelector('label#advancedsearch_paymentinstitutionsgroup1-boxLabelEl').then(async (elm) => {
          await elm.click({ 'delay': Scraper.notARobot() });
        });

        await this._randomWait(this.page, 2, 2);
      }

      // NOTE: this condition used `&&`, which can never be true for a single
      // value of this.mode, so the option list was never clicked.
      if (this.mode === 0 || this.mode === 1) {
        const options = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input.x-form-checkbox-default`);

        // click all the elements
        logger.debug('options length', options.length);

        for (const item of options)
          await item.click({ 'delay': Scraper.notARobot() });
      }

      if (this.mode === 2)
        for (const bI of bankInputs) {
          const input = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input${bI}`);

          await input[0].click({ 'delay': Scraper.notARobot() });
        }

      await this._randomWait(this.page, 1, 1);

      // click the search button
      await this.page.waitForSelector('#advancedsearch_searchbutton').then(async (elm) => {
        await elm.click({ 'delay': Scraper.notARobot() });
      });

      // now wait for the results to load..
      await this.page.waitForSelector('#title-1083-textEl').then(async () => {
        logger.debug('Results loaded');
        this.emit('pageChanged');
      });
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reads the paging bar of the current results page and, when the page
   * matches the expected position (`currentPageIndex === step + 1`) or the
   * result set is empty (both values 0), records the paging details on
   * `serviceObject` and marks it as visited. Errors are logged, not rethrown.
   *
   * @param {object} serviceObject - per-mode state object (see `start()`); mutated in place
   * @returns {Promise<void>}
   */
  async entityIndexFirstPass(serviceObject) {
    try {
      const body = await this.page.content();
      const $ = cheerio.load(body);

      const pageDetails = await this.extractBarDetails($);
      const { currentPageIndex, currentPageMax } = pageDetails;

      if (((currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1))) || (currentPageIndex === 0 && currentPageMax === 0)) {
        serviceObject.currentIndexLength = pageDetails.currentIndexLength;
        serviceObject.currentPageMax = currentPageMax;
        serviceObject.visited = true;
        serviceObject.currentIndex = url.parse(await this.page.url());
        serviceObject.currentMetaIndex = 0;
      }
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts paging numbers from the children of
   * `#resultresearch_paging-targetEl`.
   *
   * `page` comes from the input in child 4, `maxPages` is the first number in
   * the text of child 5, and the first three numbers in the text of the last
   * child become `currentPageIndex`, `currentPageMax` and
   * `currentIndexLength` (presumably a "from - to of total" label; confirm
   * against the live page).
   *
   * @param $ - cheerio instance loaded with the page HTML
   * @returns {Promise<{currentIndexLength: number, maxPages: number, currentPageMax: number, page: number, currentPageIndex: number}|undefined>}
   *   the paging details, or `undefined` when extraction throws (the error is logged)
   */
  async extractBarDetails($) {
    try {
      const numberExtract = /(\d+)/g;

      const pagingBar = $('#resultresearch_paging-targetEl').children();

      const page = parseInt($(pagingBar).eq(4).find('input').val(), 10);

      const workMaxPages = this._cleanUp($(pagingBar).eq(5).text());
      const maxPages = parseInt(workMaxPages.match(numberExtract)[0], 10);

      const rawDisplaying = this._cleanUp($(pagingBar).eq(pagingBar.length - 1).text());

      const [currentPageIndex, currentPageMax, currentIndexLength] = rawDisplaying.match(numberExtract).map((s) => {
        return parseInt(s, 10);
      });

      return { page, maxPages, currentPageIndex, currentPageMax, currentIndexLength };
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Handles a results (index) page for one service.
   *
   * On the first visit the paging details are captured via
   * `entityIndexFirstPass`. Afterwards the row for the current step is read
   * into `serviceObject.current` and double-clicked to open it, or, when
   * `step` has wrapped to the start of a page and `restart` is set, the
   * `#button-1052` element is clicked to move to the next index page.
   * Both paths emit `pageChanged`. Errors are logged, not rethrown.
   *
   * @param {object} serviceObject - per-mode state object; mutated in place
   * @returns {Promise<void>}
   */
  async processEntityIndex(serviceObject) {
    try {
      // column order of the cells in a result row
      const fields = ['type', 'name', 'address'];

      logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);

      await this._randomWait(this.page, 1, 2);

      if (serviceObject.visited === false) {
        logger.debug('Preparing...');
        serviceObject.restart = false;
        await this.entityIndexFirstPass(serviceObject);
      }

      if (serviceObject.visited === true) {
        // position of the wanted row within the current index page
        serviceObject.currentMetaIndex = serviceObject.step % serviceObject.currentPageMax;
        logger.debug('serviceObject.currentMetaIndex', serviceObject.currentMetaIndex);

        if ((serviceObject.step > 0) && (serviceObject.currentMetaIndex === 0) && (serviceObject.restart === true)) {
          logger.debug('Maxed out this page..');

          // serviceObject.visited = false;
          serviceObject.restart = false;

          await this.page.waitForSelector('#button-1052').then(async (elm) => {
            logger.debug('Proceeding to next index page..');
            await elm.click({ 'delay': Scraper.notARobot() });
            this.emit('pageChanged');
          });
        }
        else {
          logger.debug('dealing...');
          serviceObject.restart = true;

          logger.debug(`div#ResultResearchGridView table:nth-child(${serviceObject.currentMetaIndex + 1})`);

          const wantedRow = await this.page.$$(`div#ResultResearchGridView table:nth-child(${serviceObject.currentMetaIndex + 1})`);
          const htmlTable = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);

          const $ = cheerio.load(`${htmlTable}\n`);
          const cells = $('div.x-grid-cell-inner');

          serviceObject.current = {};

          cells.each((index, item) => {
            serviceObject.current[fields[index]] = this._cleanUp($(item).text());
          });

          if (typeof (serviceObject.current.name) !== 'undefined' && serviceObject.current.name !== '') {
            const fileName = this._makeFileName(serviceObject.current.name);

            serviceObject.current.fileName = fileName;
            // cap the path length at 240 characters
            serviceObject.current.filePath = `${this.path}/${fileName}`.substring(0, 240);
          }

          // logger.debug(serviceObject);
          await this._randomWait(this.page, 3, 5);

          // double-click the row to open it
          await wantedRow[0].click({ 'delay': 97, 'clickCount': 2 });

          await this._randomWait(this.page, 1, 1);

          this.emit('pageChanged');
        }
      }
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Converts one grid panel into an array of row objects.
   *
   * For every `table` under `${divId} div.x-grid-item-container`, each
   * `[cellSuffix, key]` pair in `sequence` reads the text of
   * `.x-grid-cell-${cellSuffix}` into `row[key]`.
   *
   * @param $ - cheerio instance
   * @param html - cheerio selection (or markup) to search within
   * @param {string} divId - selector of the grid panel body
   * @param {Array<Array<string>>} sequence - `[cellClassSuffix, outputKey]` pairs
   * @returns {Promise<Array<object>|undefined>} the rows, or `undefined` when extraction throws (the error is logged)
   */
  async extractGridPanel($, html, divId, sequence) {
    try {
      const outObj = [];

      const elms = $(html).find(`${divId} div.x-grid-item-container table`);

      elms.each((index, itm) => {
        const newObj = {};

        for (const seqItem of sequence) {
          const mclass = `.x-grid-cell-${seqItem[0]}`;
          const rowElm = $(itm).find(mclass);

          newObj[seqItem[1]] = this._cleanUp($(rowElm).text());
        }

        outObj.push(newObj);
      });

      return outObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the detail fields and grid panels of an entity page.
   *
   * Scalar fields are read by element id inside
   * `#promoteDetailEntityPanel-innerCt`; each grid panel is converted with
   * `extractGridPanel` and stored under its `id`.
   *
   * @param {string} html - full HTML of the entity detail page
   * @returns {Promise<object|undefined>} the details, or `undefined` when extraction throws (the error is logged)
   */
  async extractEntityDetails(html) {
    try {
      const details = {};

      // [element id, output key]
      const detailSequence = [
        ['detailEntity_type_inputEl', 'type'],
        ['detailEntity_number_inputEl', 'number'],
        ['detailEntity_name_inputEl', 'name'],
        ['detailEntity_address_inputEl', 'address'],
        ['detailEntity_startdate_inputEl', 'startdate'],
        ['detailEntity_closeddate_inputEl', 'closedate'],
        ['detailEntity_countrycode_inputEl', 'countrycode'],
        ['detailEntity_group_inputEl', 'group'],
        ['detailEntity_subgroup_inputEl', 'subgroup'],
        ['detailEntity_iciOutside_inputEl', 'iciOutside'],
        ['detailEntity_icilinked_inputEl', 'icilinked']
      ];

      // `sequence` entries are [cell class suffix, output key]
      const gridPanels = [
        {
          'id': 'autorisedStatus',
          'sequence': [
            ['detailEntity_autorisedStatus', 'autorisedStatus'],
            ['detailEntity_recentChangeautorisedStatus', 'recentChangeautorisedStatus'],
            ['detailEntity_recentChangeautorisedDate', 'recentChangeautorisedDate']
          ],
          'divId': '#detailEntity_autorisedStatusGridPanel-body'
        },
        {
          'id': 'agentOrBranch',
          'sequence': [
            ['detailEntity_agentorbranchData', 'agentorbranchData'],
            ['detailEntity_agentData', 'agentData'],
            ['detailEntity_branchData', 'branchData'],
            ['detailEntity_agentorbranchCountry', 'agentorbranchCountry'],
            ['detailEntity_agentorbranchAddress', 'agentorbranchAddress'],
            ['detailEntity_agentorbranchlegalstatus', 'agentorbranchlegalstatus']
          ],
          'divId': '#detailEntity_agentorbranchGridPanel-body'
        },
        {
          'id': 'iciOutsideTable',
          'sequence': [
            ['detailEntity_iciOutsideMember', 'iciOutsideMember']
          ],
          'divId': '#detailEntity_iciOutsideGridPanel-body'
        },
        {
          'id': 'icilinkedTable',
          'sequence': [
            ['detailEntity_icilinkedname', 'icilinkedname'],
            ['detailEntity_icilinkedstartingdate', 'icilinkedstartingdate'],
            ['detailEntity_icilinkedendingdate', 'icilinkedendingdate']
          ],
          'divId': '#detailEntity_icilinkedGridPanel-body'
        },
        {
          'id': 'othersStatus',
          'sequence': [
            ['detailEntity_otherStatus', 'otherStatus'],
            ['detailEntity_recentChangeotherStatus', 'recentChangeotherStatus'],
            ['detailEntity_recentChangeotherDate', 'recentChangeotherDate']
          ],
          'divId': '#detailEntity_othersStatusGridPanel-body'
        },
        {
          'id': 'services',
          'sequence': [
            ['detailEntity_service', 'service'],
            ['detailEntity_recentChangeservice', 'recentChangeservice'],
            ['detailEntity_recentChangeserviceDate', 'recentChangeserviceDate']
          ],
          'divId': '#detailEntity_servicesGridPanel-body'
        },
        {
          'id': 'ancillaryservices',
          'sequence': [
            ['detailEntity_ancillaryservice', 'ancillaryservice'],
            ['detailEntity_recentChangeancillaryservice', 'recentChangeancillaryservice'],
            ['detailEntity_recentChangeancillaryserviceDate', 'recentChangeancillaryserviceDate']
          ],
          'divId': '#detailEntity_ancillaryservicesGridPanel-body'
        },
        {
          'id': 'prestataire',
          'sequence': [
            ['detailEntity_prestatairename', 'prestatairename'],
            ['detailEntity_prestataireheadoffice', 'prestataireheadoffice'],
            ['detailEntity_prestataireauthorisation', 'prestataireauthorisation']
          ],
          'divId': '#detailEntity_prestataireGridPanel-body'
        },
        {
          'id': 'historicName',
          'sequence': [
            ['detailEntity_historicNameName', 'historicNameName'],
            ['detailEntity_historicNameDate', 'historicNameDate']
          ],
          'divId': '#detailEntity_historicNameGridPanel-body'
        }
      ];

      const $ = cheerio.load(html);

      const mainDiv = $('#promoteDetailEntityPanel-innerCt');

      for (const item of detailSequence) {
        const i = $(mainDiv).find(`#${item[0]}`);

        details[item[1]] = this._cleanUp($(i).text());
      }

      for (const grid of gridPanels)
        details[grid.id] = await this.extractGridPanel($, mainDiv, grid.divId, grid.sequence);

      return details;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Captures the currently open entity page: takes a screenshot, extracts the
   * details into `serviceObject.current.details` and emits `entityComplete`.
   * Errors are logged, not rethrown.
   *
   * @param {object} serviceObject - per-mode state object; mutated in place
   * @returns {Promise<void>}
   */
  async processEntity(serviceObject) {
    try {
      logger.info(`Process ${this.modeTitles[this.mode]} entity:${serviceObject.current.name}`);
      logger.info(`Step ${serviceObject.step} of ${serviceObject.currentIndexLength}`);

      await this._randomWait(this.page, 3, 5);

      const filePath = serviceObject.current.filePath;

      await this._randomWait(this.page, 3, 5);

      await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

      const body = await this.page.content();

      serviceObject.current.details = await this.extractEntityDetails(body);

      this.emit('entityComplete');

      logger.info('Entity complete...');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Persists the current entity as JSON, records it in
   * `serviceObject.links`, advances `step`, then either returns to the result
   * list (emitting `pageChanged`) or emits `serviceDone` when every entry has
   * been processed. Errors are logged, not rethrown.
   *
   * @param {object} serviceObject - per-mode state object; mutated in place
   * @returns {Promise<void>}
   */
  async entityCompleter(serviceObject) {
    try {
      const filename = serviceObject.current.fileName;
      const filePath = serviceObject.current.filePath;

      const newObj = {};

      logger.info(`Saving: ${filename}.json`);

      await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);

      await this._randomWait(this.page, 3, 5);

      newObj.fileName = `${filename}.json`;
      newObj.name = serviceObject.current.name;
      newObj.number = serviceObject.current.details.number || '';

      serviceObject.links.push(newObj);

      serviceObject.step++;

      if (serviceObject.step < serviceObject.currentIndexLength) {
        serviceObject.current = {};

        await this.page.waitForSelector('a#detailEntity_backtolist').then(async (elm) => {
          await elm.click({ 'delay': Scraper.notARobot() });
          this.emit('pageChanged');
        });
      }
      else
        this.emit('serviceDone');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Runs `processEntity` with the state object for the current mode
   * (1 -> emoney, 2 -> credit, anything else -> payment).
   *
   * @returns {Promise<void>}
   */
  async handleProcessEntity() {
    switch (this.mode) {
      case 1:
        await this.processEntity(this.emoneyServices);
        break;

      case 2:
        await this.processEntity(this.creditServices);
        break;

      case 0:
      default:
        await this.processEntity(this.paymentServices);
        break;
    }
  }

  /**
   * Runs `entityCompleter` with the state object for the current mode
   * (1 -> emoney, 2 -> credit, anything else -> payment).
   *
   * @returns {Promise<void>}
   */
  async handleEntityComplete() {
    switch (this.mode) {
      case 1:
        await this.entityCompleter(this.emoneyServices);
        break;

      case 2:
        await this.entityCompleter(this.creditServices);
        break;

      case 0:
      default:
        await this.entityCompleter(this.paymentServices);
        break;
    }
  }

  /**
   * Routes a page load / page change.
   *
   * A Chrome error page emits `recover`; `about:blank` is ignored;
   * `/index.html` triggers the debounced index-page handler. For any other
   * page, production uploads the error and throws, while other environments
   * only log a warning.
   *
   * @returns {Promise<void>}
   * @throws {Error} in production when the page is not recognised
   */
  async processNewPage() {
    // give the page a few seconds to settle
    // await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    if (pageUrl.href === 'about:blank') return;

    if (pageUrl.pathname === '/index.html')
      this.debounceHandleIndexPage();
    else if (process.env.NODE_ENV === 'production') {
      await this._uploadError();
      // use href: interpolating the parsed Url object yields "[object Object]"
      throw new Error(`Unknown page: ${pageUrl.href}`);
    }
    else {
      logger.warn('processNewPage Fell through');
      logger.warn('currentPage.location', pageUrl);
    }
  }

  /**
   * Registers the listeners that drive the scrape: page routing, entity
   * processing, per-mode completion (which writes the collected links to
   * disk and moves to the next mode) and the manual search fallback.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    // Need this for Angular / EXT based sites
    this.on('pageChanged', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 1000));

    this.on('entityComplete', () => {
      this.handleEntityComplete();
    });

    this.on('handleIndexPage', () => {
      this.handleIndexPage();
    });

    this.on('processEntity', () => {
      this.handleProcessEntity();
    });

    this.on('serviceDone', async () => {
      switch (this.mode) {
        case 0:
          this.emit('paymentServicesDone');
          break;

        case 1:
          this.emit('emoneyServicesDone');
          break;

        case 2:
          this.emit('creditServicesDone');
          break;
      }
    });

    this.on('handleEntityIndex', async () => {
      switch (this.mode) {
        case 1:
          await this.processEntityIndex(this.emoneyServices);
          break;

        case 2:
          await this.processEntityIndex(this.creditServices);
          break;

        case 0:
        default:
          await this.processEntityIndex(this.paymentServices);
          break;
      }
    });

    this.on('paymentServicesDone', async () => {
      logger.warn('paymentServicesDone');
      try {
        this.paymentServices.done = true;

        jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
        jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

        this.mode++;
        this.inProgress = false;

        await this._goto(this.emoneyServices.urls[0]);
        this.emit('pageChanged');
      }
      catch (e) {
        logger.error(e);
      }
    });

    this.on('emoneyServicesDone', async () => {
      logger.warn('emoneyServicesDone');
      try {
        this.emoneyServices.done = true;

        jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
        jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);

        this.mode++;
        this.inProgress = false;

        await this._goto(this.creditServices.urls[0]);
        this.emit('pageChanged');
      }
      catch (e) {
        logger.error(e);
      }
    });

    this.on('creditServicesDone', async () => {
      logger.warn('creditServicesDone');
      try {
        this.creditServices.done = true;

        jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
        jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);

        this.mode++;
        this.inProgress = false;

        this.emit('done');
      }
      catch (e) {
        logger.error(e);
      }
    });

    this.on('selectSearchManually', async () => {
      logger.debug('Locating advanced search button');

      await this.page.waitForSelector('#menu_advanced', { 'visible': true, 'timeout': 7500 }).then(async (elm) => {
        await elm.click({ 'delay': 90 });
      }).catch(() => {
        logger.error('No advanced search button');
      });

      await this.page.waitForSelector('#promoteAdvancedSearchPanel-body', { 'visible': true, 'timeout': 7500 }).then(async () => {
        await this.indexPageHomeMode();
      }).catch(() => {
        logger.error('No advanced search form');
      });
    });
  }

  /**
   * Resets the per-mode state objects, prepares the output path and browser
   * page, wires up the listeners and navigates to the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure during setup; the original error is rethrown
   *   so its stack is preserved, and non-Error values are wrapped in an Error
   */
  async start() {
    super._start();

    try {
      this.mode = 0;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
        'sections': [],
        'sectionLinks': []
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
        'sections': [],
        'sectionLinks': []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
        'sections': [],
        'sectionLinks': []
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname}/../artefacts/LU/CSSF`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 1000));

      // only attach the scrape listeners when just two event names are
      // registered so far, so repeated start() calls do not attach them twice
      if (this.eventNames().length === 2)
        await this.attachEvents();

      await this._makeResponsive();

      // await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'load' });

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // rethrow the original error to keep its stack; wrap only non-Error values
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point used by the throttled `run` wrapper built in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
module.exports = LUScrape;