414 lines
11 KiB
JavaScript
414 lines
11 KiB
JavaScript
|
const Scraper = require('../helpers/scraper');
|
||
|
const cheerio = require('cheerio');
|
||
|
const path = require('path');
|
||
|
const jsonfile = require('jsonfile');
|
||
|
const logger = require('log4js').getLogger('DK');
|
||
|
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||
|
|
||
|
class DKScrape extends Scraper {
|
||
|
|
||
|
constructor(checkForLock = true) {
|
||
|
super();
|
||
|
this.id = 'DK';
|
||
|
|
||
|
this.on('done', () => {
|
||
|
this._done();
|
||
|
});
|
||
|
|
||
|
this.run = this._throttle(async () => {
|
||
|
await this.__run();
|
||
|
}, 5000);
|
||
|
|
||
|
if (checkForLock)
|
||
|
this._checkLock().then((l) => {
|
||
|
if(l)
|
||
|
this.run();
|
||
|
});
|
||
|
|
||
|
this.on('error', (err) => {
|
||
|
logger.error('Error catcher!!', err);
|
||
|
});
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async processNewPage() {
|
||
|
// give the page a few seconds to settle
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
const currentPage = await this.page.evaluate(() => document);
|
||
|
|
||
|
const search = currentPage.location.search;
|
||
|
|
||
|
switch (currentPage.location.pathname) {
|
||
|
|
||
|
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx':
|
||
|
await this.handleStartPage();
|
||
|
break;
|
||
|
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx':
|
||
|
await this.handleSearchResults(search);
|
||
|
break;
|
||
|
|
||
|
case '/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
|
||
|
case '/da/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
|
||
|
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
|
||
|
// these are all the same page, just in Danish, Danish and English
|
||
|
this.processCoporation();
|
||
|
break;
|
||
|
default:
|
||
|
|
||
|
await this._uploadError();
|
||
|
throw new Error(`Unknown page: ${currentPage.location.href}`);
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async handleStartPage() {
|
||
|
if (this.mode === 0)
|
||
|
await this._findAndClick('ul li a', 'Payment institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Payment+institutions');
|
||
|
|
||
|
if (this.mode === 1)
|
||
|
await this._findAndClick('ul li a', 'Electronic money institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Electronic+money+institutions');
|
||
|
|
||
|
if (this.mode === 2) {
|
||
|
logger.info('Processing complete');
|
||
|
this.done();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<Error>}
|
||
|
*/
|
||
|
async processCoporation() {
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
|
||
|
const $ = cheerio.load(body);
|
||
|
|
||
|
const h2 = $('h2').eq(0).text();
|
||
|
// Virksomhedsoplysninger
|
||
|
// Company information
|
||
|
|
||
|
if (h2 === 'Virksomhedsoplysninger') {
|
||
|
logger.warn('Not in English, trying to switch language...');
|
||
|
await this._findAndClick('#mainform > div.header > ul > li.ln > a');
|
||
|
}
|
||
|
else if (h2 === 'Company information') {
|
||
|
const noWhiteSpace = /\W/g;
|
||
|
let ssName;
|
||
|
|
||
|
if (this.mode === 0)
|
||
|
ssName = this.paymentServices.links[this.paymentServices.step].innerText.replace(noWhiteSpace, '_');
|
||
|
else
|
||
|
ssName = this.emoneyServices.links[this.emoneyServices.step].innerText.replace(noWhiteSpace, '_');
|
||
|
|
||
|
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||
|
|
||
|
this._makeScreenshotV2(this.page, `${this.path}/${ssName}`, null);
|
||
|
|
||
|
logger.debug('Processing:', this.paymentServices.links[this.paymentServices.step]);
|
||
|
|
||
|
const fields = await this.extractData(body);
|
||
|
|
||
|
jsonfile.writeFileSync(`${this.path}/${ssName}.json`, fields);
|
||
|
await this.downloadExcel();
|
||
|
|
||
|
if (this.mode === 0)
|
||
|
this.paymentServices.step += 1;
|
||
|
else
|
||
|
this.emoneyServices.step += 1;
|
||
|
|
||
|
await this._randomWait(this.page, 10, 15);
|
||
|
|
||
|
// This should take us back to the search result list
|
||
|
|
||
|
await this._findAndClick('#divContentWidthScr li a', 'To search results');
|
||
|
}
|
||
|
else
|
||
|
return new Error('I do not understand this page...');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @param $block
|
||
|
* @returns {Promise<Array>}
|
||
|
*/
|
||
|
async processDataBlock($block) {
|
||
|
const $ = cheerio.load($block);
|
||
|
const noWhiteSpace = /\W/g;
|
||
|
|
||
|
const a = $('tr').map((i, el) => {
|
||
|
const head = $(el).find('td').first();
|
||
|
const data = $(el).find('td').next();
|
||
|
|
||
|
return [head.eq(-1).html().split('</div>')[1].replace(/\n/, '').trim(), data.text()];
|
||
|
});
|
||
|
|
||
|
const fields = [];
|
||
|
|
||
|
for( let step = 0;step < a.length;step = step + 2)
|
||
|
fields.push([a[step].replace(noWhiteSpace, '_'), a[step + 1]]);
|
||
|
|
||
|
return fields;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @param body
|
||
|
* @returns {Promise<{companyInformation: *[], presence: Array}>}
|
||
|
*/
|
||
|
async extractData(body) {
|
||
|
const $ = cheerio.load(body);
|
||
|
|
||
|
const vutDataContainer = $('.vut-data-container');
|
||
|
|
||
|
const $basicInfo = vutDataContainer.find('#phmain_0_vut_pnl_basic_info table tbody').get();
|
||
|
const $extendednInfo = vutDataContainer.find('#phmain_0_vut_pnl_extended_info table tbody').get();
|
||
|
|
||
|
const $presenceInfo = vutDataContainer.find('#phmain_0_vut_pnl_tilstedevaerelser table tbody').get();
|
||
|
|
||
|
let companyInformation = await this.processDataBlock($basicInfo);
|
||
|
companyInformation = companyInformation.concat(await this.processDataBlock($extendednInfo));
|
||
|
const presence = await this.processDataBlock($presenceInfo);
|
||
|
|
||
|
return { companyInformation, presence };
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async downloadExcel() {
|
||
|
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||
|
logger.info('Saving excel into:', this.path);
|
||
|
|
||
|
await this._findAndClick('#phmain_0_vut_link_button_excel');
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @param search
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async handleSearchResults(search) {
|
||
|
switch (search) {
|
||
|
|
||
|
case '?aid=Payment+services+area&ctid=Payment+institutions':
|
||
|
if (!this.paymentServices.done)
|
||
|
await this.handlePaymentServices();
|
||
|
else
|
||
|
// Are we not done yet?
|
||
|
// Restarting the page
|
||
|
await this.page.goto(this.startPage);
|
||
|
|
||
|
break;
|
||
|
case '?aid=Payment+services+area&ctid=Electronic+money+institutions':
|
||
|
if (!this.emoneyServices.done)
|
||
|
await this.handleEmoneyServices();
|
||
|
else
|
||
|
// Are we not done yet?
|
||
|
// Restarting the page
|
||
|
await this.page.goto(this.startPage);
|
||
|
|
||
|
break;
|
||
|
case '?restoreSearch=1':
|
||
|
if (this.mode === 0)
|
||
|
if (this.paymentServices.items > 0 && !this.paymentServices.done)
|
||
|
await this.handlePaymentServices();
|
||
|
else {
|
||
|
// Are we not done yet?
|
||
|
// Restarting the page
|
||
|
await this.page.goto(this.startPage);
|
||
|
}
|
||
|
|
||
|
if (this.mode === 1)
|
||
|
if (this.emoneyServices.items > 0 && !this.emoneyServices.done)
|
||
|
await this.handleEmoneyServices();
|
||
|
|
||
|
break;
|
||
|
default:
|
||
|
// Menu fell through
|
||
|
break;
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<Array>}
|
||
|
*/
|
||
|
async extractLinks() {
|
||
|
const returnObj = [];
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
const rows = await this.page.$$('.search-further-data tr a');
|
||
|
|
||
|
for (const item of rows) {
|
||
|
const innerText = await this.page.evaluate(el => el.innerText, item);
|
||
|
const href = await this.page.evaluate(el => el.href, item);
|
||
|
const id = await this.page.evaluate(el => el.id, item);
|
||
|
|
||
|
returnObj.push( {
|
||
|
innerText,
|
||
|
href,
|
||
|
id
|
||
|
});
|
||
|
}
|
||
|
|
||
|
return returnObj;
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async handleEmoneyServices() {
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');
|
||
|
|
||
|
if (!this.emoneyServices.visited)
|
||
|
if (this.emoneyServices.items === 0) {
|
||
|
// first visit, Build the list
|
||
|
|
||
|
this.emoneyServices.links = await this.extractLinks();
|
||
|
|
||
|
this.emoneyServices.items = this.emoneyServices.links.length;
|
||
|
|
||
|
this.emoneyServices.visited = true;
|
||
|
}
|
||
|
|
||
|
if (this.emoneyServices.visited)
|
||
|
|
||
|
if (this.emoneyServices.step < this.emoneyServices.items) {
|
||
|
const nextItem = this.emoneyServices.links[this.emoneyServices.step];
|
||
|
|
||
|
// Not using an await here. We want to click and exit this page so we don't get tied up
|
||
|
|
||
|
this._findAndClick(`#${nextItem.id}`, nextItem.innerText);
|
||
|
}
|
||
|
else {
|
||
|
// EMoney services complete, move onto the next service.
|
||
|
this.emoneyServices.done = true;
|
||
|
this.mode = 2;
|
||
|
await this.page.goto(this.startPage);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async handlePaymentServices() {
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
|
||
|
await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');
|
||
|
|
||
|
if (!this.paymentServices.visited)
|
||
|
if (this.paymentServices.items === 0) {
|
||
|
// first visit, Build the list
|
||
|
|
||
|
this.paymentServices.links = await this.extractLinks();
|
||
|
|
||
|
this.paymentServices.items = this.paymentServices.links.length;
|
||
|
|
||
|
this.paymentServices.visited = true;
|
||
|
}
|
||
|
|
||
|
if (this.paymentServices.visited)
|
||
|
|
||
|
if (this.paymentServices.step < this.paymentServices.items) {
|
||
|
const nextItem = this.paymentServices.links[this.paymentServices.step];
|
||
|
|
||
|
// Not using an await here. We want to click and exit this page so we don't get tied up
|
||
|
|
||
|
this._findAndClick(`#${nextItem.id}`, nextItem.innerText);
|
||
|
}
|
||
|
else {
|
||
|
// Payment services complete, move onto the next service.
|
||
|
this.paymentServices.done = true;
|
||
|
this.mode = 1;
|
||
|
await this.page.goto(this.startPage);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
|
||
|
*
|
||
|
* @returns {Promise<void>}
|
||
|
*/
|
||
|
async start() {
|
||
|
super._start();
|
||
|
try {
|
||
|
// Financial Supervisory Authority
|
||
|
// Government ministry
|
||
|
// https://en.wikipedia.org/wiki/Financial_Supervisory_Authority_(Denmark)
|
||
|
|
||
|
this.mode = 0;
|
||
|
|
||
|
this.paymentServices = {
|
||
|
'items': 0,
|
||
|
'links': {
|
||
|
|
||
|
},
|
||
|
'step': 0,
|
||
|
'visited': false,
|
||
|
'done' : false
|
||
|
};
|
||
|
|
||
|
this.emoneyServices = {
|
||
|
'items': 0,
|
||
|
'links': {
|
||
|
|
||
|
},
|
||
|
'step': 0,
|
||
|
'visited': false,
|
||
|
'done' : false
|
||
|
};
|
||
|
|
||
|
this.startPage = 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx';
|
||
|
|
||
|
this.setPath(path.resolve(`${__dirname }/../artefacts/DK/FSA`));
|
||
|
|
||
|
await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
|
||
|
logger.error(err);
|
||
|
});
|
||
|
|
||
|
await this._initBrowser();
|
||
|
this.page = await this.browser.newPage();
|
||
|
|
||
|
this.page.on('domcontentloaded', () => {
|
||
|
this.processNewPage().catch((err) => {
|
||
|
logger.error('####', err);
|
||
|
this.emit('done');
|
||
|
});
|
||
|
});
|
||
|
|
||
|
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
|
||
|
|
||
|
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||
|
await this.page.goto(this.startPage).catch((err) => {
|
||
|
logger.error(err);
|
||
|
this._uploadError();
|
||
|
});
|
||
|
|
||
|
await this._randomWait(this.page, 3, 5);
|
||
|
}
|
||
|
catch(e) {
|
||
|
throw Error(e);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
async __run() {
|
||
|
await this.start();
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
module.exports = DKScrape;
|