const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('DK');

logger.level = process.env.LOGGER_LEVEL || 'warn';

// Crawl phases held in `this.mode`.
const MODE_PAYMENT = 0; // walking the "Payment institutions" list
const MODE_EMONEY = 1; // walking the "Electronic money institutions" list
const MODE_COMPLETE = 2; // both lists finished

/**
 * Scraper for the "companies under supervision" (VUT) register published at
 * vut.finanstilsynet.dk (Financial Supervisory Authority, Denmark —
 * https://en.wikipedia.org/wiki/Financial_Supervisory_Authority_(Denmark)).
 *
 * The crawl is event driven: `start()` opens the start page and every
 * `domcontentloaded` event triggers `processNewPage()`, which dispatches on the
 * URL path of the page that just loaded. `this.mode` records which list is
 * being walked; each per-list state object (`paymentServices`,
 * `emoneyServices`) remembers the links found and how many were processed.
 */
class DKScrape extends Scraper {
  /**
   * @param {boolean} [checkForLock=true] When true, `_checkLock()` is called
   *   and the throttled `run` is started if it resolves to a truthy value.
   */
  constructor(checkForLock = true) {
    super();
    this.id = 'DK';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (checkForLock) {
      this._checkLock()
        .then((lockResult) => {
          if (lockResult) this.run();
        })
        .catch((err) => {
          // Without this handler a failed lock check is an unhandled rejection.
          logger.error('Lock check failed', err);
        });
    }

    this.on('error', (err) => {
      logger.error('Error catcher!!', err);
    });
  }

  /**
   * Dispatches the page that has just loaded to its handler, based on the
   * page's URL path.
   *
   * @returns {Promise<void>}
   * @throws {Error} When the path is not one of the known pages, or when a
   *   handler fails.
   */
  async processNewPage() {
    // Give the page a few seconds to settle.
    await this._randomWait(this.page, 3, 5);

    // evaluate() can only hand back serialisable values, so read the location
    // fields inside the page instead of returning `document` itself.
    const { pathname, search, href } = await this.page.evaluate(() => ({
      'pathname': window.location.pathname,
      'search': window.location.search,
      'href': window.location.href
    }));

    switch (pathname) {
      case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx':
        await this.handleStartPage();
        break;
      case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx':
        await this.handleSearchResults(search);
        break;
      // The company page is reachable under three paths (two Danish, one English).
      case '/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
      case '/da/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
      case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
        // Awaited so that failures reach the caller's error handler.
        await this.processCoporation();
        break;
      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${href}`);
    }
  }

  /**
   * Handles the register's start page: opens the list that belongs to the
   * current mode, or finishes the crawl once both lists are complete.
   *
   * @returns {Promise<void>}
   */
  async handleStartPage() {
    switch (this.mode) {
      case MODE_PAYMENT:
        await this._findAndClick('ul li a', 'Payment institutions',
          'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Payment+institutions');
        break;
      case MODE_EMONEY:
        await this._findAndClick('ul li a', 'Electronic money institutions',
          'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Electronic+money+institutions');
        break;
      case MODE_COMPLETE:
        logger.info('Processing complete');
        // NOTE(review): `done()` is not defined in this file; presumably it is
        // inherited from Scraper. Everywhere else completion is signalled with
        // `this.emit('done')` — confirm which one is intended.
        this.done();
        break;
      default:
        break;
    }
  }

  /**
   * Processes a single company page: takes a screenshot, writes the extracted
   * fields to `<name>.json`, triggers the Excel download, advances the step
   * counter of the active list and navigates back to the search results.
   * If the page is shown in Danish it only clicks the language switch.
   *
   * @returns {Promise<void>}
   * @throws {Error} When the first `<h2>` is neither the Danish nor the English
   *   company-information heading.
   */
  async processCoporation() {
    await this._randomWait(this.page, 3, 5);
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);
    const heading = $('h2').eq(0).text();

    if (heading === 'Virksomhedsoplysninger') {
      // Danish heading: switch language; the reload brings us back here.
      logger.warn('Not in English, trying to switch language...');
      await this._findAndClick('#mainform > div.header > ul > li.ln > a');
      return;
    }

    if (heading !== 'Company information') {
      throw new Error('I do not understand this page...');
    }

    // Resolve the active list once so that the file name, the log line and the
    // step increment all refer to the same list.
    const service = this.mode === MODE_PAYMENT ? this.paymentServices : this.emoneyServices;
    const link = service.links[service.step];
    const ssName = link.innerText.replace(/\W/g, '_');

    // NOTE(review): not awaited in the original either; it is unclear from this
    // file whether _makeScreenshotV2 returns a promise — confirm.
    this._makeScreenshotV2(this.page, `${this.path}/${ssName}`, null);
    logger.debug('Processing:', link);

    const fields = await this.extractData(body);
    jsonfile.writeFileSync(`${this.path}/${ssName}.json`, fields);
    await this.downloadExcel();

    service.step += 1;
    await this._randomWait(this.page, 10, 15);
    // This should take us back to the search result list.
    await this._findAndClick('#divContentWidthScr li a', 'To search results');
  }

  /**
   * Turns the rows of one info table into `[name, value]` pairs.
   *
   * Rows without a `<td>`, or whose first cell yields no usable label, are
   * skipped instead of raising a TypeError.
   *
   * @param $block DOM node(s) of the table body, as returned by cheerio `.get()`.
   * @returns {Promise<Array>} One `[name, value]` pair per usable row.
   */
  async processDataBlock($block) {
    const $ = cheerio.load($block);

    // cheerio's map() flattens returned arrays and drops null/undefined, so the
    // result is a flat label, value, label, value, ... sequence.
    const flat = $('tr').map((i, row) => {
      const cells = $(row).find('td');
      const headHtml = cells.first().html();
      if (headHtml === null) return null;

      // NOTE(review): split('') breaks the HTML into single characters, so
      // index 1 is merely its second character. This looks like a separator
      // that was lost at some point — confirm the intended delimiter.
      // The original expression is kept unchanged.
      const rawLabel = headHtml.split('')[1];
      if (rawLabel === undefined) return null;

      const label = rawLabel.replace(/\n/, '').trim().replace(/\W/g, '_');
      return [label, cells.next().text()];
    }).get();

    const fields = [];
    for (let idx = 0; idx < flat.length; idx += 2) {
      fields.push([flat[idx], flat[idx + 1]]);
    }
    return fields;
  }

  /**
   * Extracts the company data from the HTML of a company page.
   *
   * @param {string} body Full HTML of the page.
   * @returns {Promise<{companyInformation: Array, presence: Array}>}
   *   `companyInformation` holds the pairs of the basic-info table followed by
   *   those of the extended-info table; `presence` holds the pairs of the
   *   "tilstedevaerelser" table.
   */
  async extractData(body) {
    const $ = cheerio.load(body);
    const container = $('.vut-data-container');
    const tableBody = (panelId) => container.find(`#${panelId} table tbody`).get();

    const basicInfo = await this.processDataBlock(tableBody('phmain_0_vut_pnl_basic_info'));
    const extendedInfo = await this.processDataBlock(tableBody('phmain_0_vut_pnl_extended_info'));
    const presence = await this.processDataBlock(tableBody('phmain_0_vut_pnl_tilstedevaerelser'));

    return { 'companyInformation': basicInfo.concat(extendedInfo), presence };
  }

  /**
   * Allows downloads into `this.path` and clicks the page's Excel export link.
   *
   * @returns {Promise<void>}
   */
  async downloadExcel() {
    await this.page._client.send('Page.setDownloadBehavior', {
      'behavior': 'allow',
      'downloadPath': this.path
    });
    logger.info('Saving excel into:', this.path);
    await this._findAndClick('#phmain_0_vut_link_button_excel');
  }

  /**
   * Handles a search-result page. The query string tells which list is shown;
   * `?restoreSearch=1` is the list re-opened from a company page, in which case
   * the current mode decides which list it is.
   *
   * @param {string} search Query string of the current page (`location.search`).
   * @returns {Promise<void>}
   */
  async handleSearchResults(search) {
    switch (search) {
      case '?aid=Payment+services+area&ctid=Payment+institutions':
        if (!this.paymentServices.done) {
          await this.handlePaymentServices();
        } else {
          // List already finished: go back to the start page.
          await this.page.goto(this.startPage);
        }
        break;
      case '?aid=Payment+services+area&ctid=Electronic+money+institutions':
        if (!this.emoneyServices.done) {
          await this.handleEmoneyServices();
        } else {
          // List already finished: go back to the start page.
          await this.page.goto(this.startPage);
        }
        break;
      case '?restoreSearch=1':
        // `else if`: a handler may change `this.mode`, and the second branch
        // must not run in the same pass.
        if (this.mode === MODE_PAYMENT) {
          if (this.paymentServices.items > 0 && !this.paymentServices.done) {
            await this.handlePaymentServices();
          } else {
            await this.page.goto(this.startPage);
          }
        } else if (this.mode === MODE_EMONEY) {
          if (this.emoneyServices.items > 0 && !this.emoneyServices.done) {
            await this.handleEmoneyServices();
          } else {
            // Same fallback as the payment list, so the crawl cannot stall here.
            await this.page.goto(this.startPage);
          }
        }
        break;
      default:
        // Unknown query string: nothing to do.
        break;
    }
  }

  /**
   * Collects the result links of the current search-result page.
   *
   * @returns {Promise<Array<{innerText: string, href: string, id: string}>>}
   */
  async extractLinks() {
    await this._randomWait(this.page, 3, 5);
    const anchors = await this.page.$$('.search-further-data tr a');

    const links = [];
    for (const anchor of anchors) {
      // One round trip per link instead of one per property.
      links.push(await this.page.evaluate((el) => ({
        'innerText': el.innerText,
        'href': el.href,
        'id': el.id
      }), anchor));
    }
    return links;
  }

  /**
   * Advances the "Electronic money institutions" list by one company, or marks
   * the list done and switches to the final mode.
   *
   * @returns {Promise<void>}
   */
  async handleEmoneyServices() {
    await this._visitNextInstitution(this.emoneyServices, MODE_COMPLETE);
  }

  /**
   * Advances the "Payment institutions" list by one company, or marks the list
   * done and switches to the e-money mode.
   *
   * @returns {Promise<void>}
   */
  async handlePaymentServices() {
    await this._visitNextInstitution(this.paymentServices, MODE_EMONEY);
  }

  /**
   * Shared body of the two list handlers: shows all results, builds the link
   * list on the first visit, then either opens the next unprocessed company or
   * finishes the list.
   *
   * @param {{items: number, links: Array, step: number, visited: boolean, done: boolean}} service
   *   State object of the list being walked; updated in place.
   * @param {number} nextMode Mode to switch to once the list is exhausted.
   * @returns {Promise<void>}
   */
  async _visitNextInstitution(service, nextMode) {
    await this._randomWait(this.page, 3, 5);
    await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');

    if (!service.visited && service.items === 0) {
      // First visit: build the list.
      service.links = await this.extractLinks();
      service.items = service.links.length;
      service.visited = true;
    }
    if (!service.visited) return;

    if (service.step < service.items) {
      const nextItem = service.links[service.step];
      // Deliberately not awaited: we want to click and leave this page without
      // getting tied up. The rejection is still handled so a failed click is
      // logged rather than left unhandled.
      Promise.resolve(this._findAndClick(`#${nextItem.id}`, nextItem.innerText))
        .catch((err) => {
          logger.error(err);
        });
      return;
    }

    // List complete: move on to the next phase.
    service.done = true;
    this.mode = nextMode;
    await this.page.goto(this.startPage);
  }

  /**
   * Resets the crawl state, opens the browser, wires `processNewPage()` to the
   * page's `domcontentloaded` event and loads the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} Whatever the setup steps throw; non-Error values are wrapped.
   */
  async start() {
    super._start();
    try {
      this.mode = MODE_PAYMENT;
      this.paymentServices = { 'items': 0, 'links': [], 'step': 0, 'visited': false, 'done': false };
      this.emoneyServices = { 'items': 0, 'links': [], 'step': 0, 'visited': false, 'done': false };
      this.startPage = 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx';
      this.setPath(path.resolve(`${__dirname}/../artefacts/DK/FSA`));

      await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
        logger.error(err);
      });

      await this._initBrowser();
      this.page = await this.browser.newPage();

      // Every page load is dispatched from here; a failure ends the crawl.
      this.page.on('domcontentloaded', () => {
        this.processNewPage().catch((err) => {
          logger.error('####', err);
          this.emit('done');
        });
      });

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
      await this.page.setViewport({ 'width': 1200, 'height': 800 });

      try {
        await this.page.goto(this.startPage);
      } catch (err) {
        logger.error(err);
        await this._uploadError();
      }

      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Rethrow the original error so its message and stack are preserved.
      throw e instanceof Error ? e : new Error(String(e));
    }
  }

  /**
   * Entry point used by the throttled `run` created in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}

module.exports = DKScrape;