obdfcascrape/ncas/dk.js
Martin Donnelly be5d3eae07 init
2019-05-05 20:13:56 +01:00

414 lines
11 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('DK');
// Log verbosity is taken from the LOGGER_LEVEL environment variable; 'warn' is used when it is unset or empty
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Scraper for the Danish Financial Supervisory Authority company register
 * (vut.finanstilsynet.dk).
 *
 * Navigation is event driven: every `domcontentloaded` event raised by the page
 * calls `processNewPage()`, which dispatches on the URL path of the page that
 * has just loaded. `this.mode` selects the list currently being walked:
 * 0 = payment institutions, 1 = electronic money institutions,
 * 2 = both lists processed.
 */
class DKScrape extends Scraper {
  /**
   * @param {boolean} [checkForLock=true] when true, `_checkLock()` is consulted
   *   and the (throttled) run is started if it resolves to a truthy value
   */
  constructor(checkForLock = true) {
    super();
    this.id = 'DK';

    this.on('done', () => {
      this._done();
    });

    this.on('error', (err) => {
      logger.error('Error catcher!!', err);
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (checkForLock) {
      this._checkLock().then((l) => {
        if (l)
          this.run();
      }).catch((err) => {
        // Without this handler a failing lock check would be an unhandled rejection
        logger.error('Lock check failed', err);
      });
    }
  }

  /**
   * Returns the bookkeeping object of the list currently being walked:
   * `paymentServices` in mode 0, `emoneyServices` otherwise.
   *
   * @returns {{items: number, links: Array, step: number, visited: boolean, done: boolean}}
   */
  _activeService() {
    return this.mode === 0 ? this.paymentServices : this.emoneyServices;
  }

  /**
   * Dispatches on the path of the page that has just loaded.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the path is not one of the known register pages
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    // Only plain strings are sent back from the browser; a live `document`
    // object cannot be relied upon to survive serialisation.
    const { href, pathname, search } = await this.page.evaluate(() => ({
      'href': document.location.href,
      'pathname': document.location.pathname,
      'search': document.location.search
    }));

    switch (pathname) {
      case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx':
        await this.handleStartPage();
        break;

      case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx':
        await this.handleSearchResults(search);
        break;

      case '/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
      case '/da/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
      case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
        // these are all the same page, just in Danish, Danish and English
        // Awaited so that a failure reaches the caller's error handling
        await this.processCoporation();
        break;

      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${href}`);
    }
  }

  /**
   * Start page handler: opens the list that matches the current mode, or
   * finishes when both lists have been processed.
   *
   * @returns {Promise<void>}
   */
  async handleStartPage() {
    switch (this.mode) {
      case 0:
        await this._findAndClick('ul li a', 'Payment institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Payment+institutions');
        break;

      case 1:
        await this._findAndClick('ul li a', 'Electronic money institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Electronic+money+institutions');
        break;

      case 2:
        logger.info('Processing complete');
        // NOTE(review): elsewhere in this class completion is signalled with
        // emit('done'); confirm that the base class really provides done().
        this.done();
        break;

      default:
        break;
    }
  }

  /**
   * Company page handler. A Danish page is switched to English; an English
   * page is captured (screenshot, JSON dump of the data tables, Excel
   * download), the step counter of the active list is advanced and the
   * browser is sent back to the search results.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page heading is not recognised or no list entry
   *   is pending for the active list
   */
  async processCoporation() {
    await this._randomWait(this.page, 3, 5);

    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);
    const h2 = $('h2').eq(0).text();

    // Virksomhedsoplysninger
    // Company information
    if (h2 === 'Virksomhedsoplysninger') {
      logger.warn('Not in English, trying to switch language...');
      await this._findAndClick('#mainform > div.header > ul > li.ln > a');

      return;
    }

    if (h2 !== 'Company information')
      throw new Error('I do not understand this page...');

    const service = this._activeService();
    const link = service.links[service.step];

    if (!link)
      throw new Error('Company page loaded but no list entry is pending');

    // File names are derived from the link text with every non-word character replaced
    const ssName = link.innerText.replace(/\W/g, '_');

    await this._makeScreenshotV2(this.page, `${this.path}/${ssName}`, null);

    logger.debug('Processing:', link);

    const fields = await this.extractData(body);
    jsonfile.writeFileSync(`${this.path}/${ssName}.json`, fields);

    // downloadExcel() configures the download directory itself
    await this.downloadExcel();

    service.step += 1;

    await this._randomWait(this.page, 10, 15);

    // This should take us back to the search result list
    await this._findAndClick('#divContentWidthScr li a', 'To search results');
  }

  /**
   * Turns the rows of a data table into `[label, value]` pairs. The label is
   * the text following the first `</div>` in the first cell, with non-word
   * characters replaced by `_`; the value is the text of the following
   * cell(s). Rows without such a heading cell are skipped.
   *
   * @param $block table body element(s) to parse
   * @returns {Promise<Array>}
   */
  async processDataBlock($block) {
    const $ = cheerio.load($block);
    const nonWordChars = /\W/g;
    const fields = [];

    $('tr').each((i, el) => {
      const cells = $(el).find('td');
      const headHtml = cells.first().html();
      const label = typeof headHtml === 'string' ? headHtml.split('</div>')[1] : undefined;

      if (label === undefined) {
        // One malformed row should not abort the whole company record
        logger.warn('Skipping table row without a recognisable heading cell');

        return;
      }

      fields.push([
        label.replace(/\n/, '').trim().replace(nonWordChars, '_'),
        cells.next().text()
      ]);
    });

    return fields;
  }

  /**
   * Extracts the basic, extended and presence tables from a company page.
   * Basic and extended rows are merged into `companyInformation`.
   *
   * @param body HTML of the company page
   * @returns {Promise<{companyInformation: *[], presence: Array}>}
   */
  async extractData(body) {
    const $ = cheerio.load(body);
    const container = $('.vut-data-container');
    const tableBody = (panelId) => container.find(`#${panelId} table tbody`).get();

    const basicRows = await this.processDataBlock(tableBody('phmain_0_vut_pnl_basic_info'));
    const extendedRows = await this.processDataBlock(tableBody('phmain_0_vut_pnl_extended_info'));
    const presence = await this.processDataBlock(tableBody('phmain_0_vut_pnl_tilstedevaerelser'));

    return { 'companyInformation': basicRows.concat(extendedRows), presence };
  }

  /**
   * Allows downloads into `this.path` and clicks the Excel export button.
   *
   * @returns {Promise<void>}
   */
  async downloadExcel() {
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    logger.info('Saving excel into:', this.path);

    await this._findAndClick('#phmain_0_vut_link_button_excel');
  }

  /**
   * Search result handler. Dispatches on the query string: the two list
   * queries, or `?restoreSearch=1` which is what the "To search results" link
   * on a company page leads to. When the relevant list cannot be continued the
   * start page is reloaded.
   *
   * @param search query string of the loaded page, including the leading `?`
   * @returns {Promise<void>}
   */
  async handleSearchResults(search) {
    switch (search) {
      case '?aid=Payment+services+area&ctid=Payment+institutions':
        if (!this.paymentServices.done) {
          await this.handlePaymentServices();
        }
        else {
          // Are we not done yet?
          // Restarting the page
          await this.page.goto(this.startPage);
        }
        break;

      case '?aid=Payment+services+area&ctid=Electronic+money+institutions':
        if (!this.emoneyServices.done) {
          await this.handleEmoneyServices();
        }
        else {
          // Are we not done yet?
          // Restarting the page
          await this.page.goto(this.startPage);
        }
        break;

      case '?restoreSearch=1':
        // `else if`: the payment handler may switch the mode to 1 when it
        // finishes, which must not trigger the e-money branch in the same pass.
        if (this.mode === 0) {
          if (this.paymentServices.items > 0 && !this.paymentServices.done) {
            await this.handlePaymentServices();
          }
          else {
            // Are we not done yet?
            // Restarting the page
            await this.page.goto(this.startPage);
          }
        }
        else if (this.mode === 1) {
          if (this.emoneyServices.items > 0 && !this.emoneyServices.done) {
            await this.handleEmoneyServices();
          }
          else {
            // Same fallback as for payment institutions
            await this.page.goto(this.startPage);
          }
        }
        break;

      default:
        // Menu fell through
        break;
    }
  }

  /**
   * Collects text, target and element id of every link in the result table.
   *
   * @returns {Promise<Array>}
   */
  async extractLinks() {
    await this._randomWait(this.page, 3, 5);

    // One round trip to the browser for the whole table
    return this.page.$$eval('.search-further-data tr a', (anchors) => anchors.map((el) => ({
      'innerText': el.innerText,
      'href': el.href,
      'id': el.id
    })));
  }

  /**
   * Shared list walker. Expands the result list, builds the link list on the
   * first visit, then either opens the next pending company or marks the list
   * as done, switches to `nextMode` and reloads the start page.
   *
   * @param service bookkeeping object of the list (`paymentServices` or `emoneyServices`)
   * @param {number} nextMode mode to switch to once the list is exhausted
   * @returns {Promise<void>}
   */
  async _walkServiceList(service, nextMode) {
    await this._randomWait(this.page, 3, 5);
    await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');

    if (!service.visited && service.items === 0) {
      // first visit, Build the list
      service.links = await this.extractLinks();
      service.items = service.links.length;
      service.visited = true;
    }

    if (!service.visited)
      return;

    if (service.step < service.items) {
      const nextItem = service.links[service.step];

      // Not using an await here. We want to click and exit this page so we don't get tied up.
      // A rejection is still logged rather than left unhandled.
      Promise.resolve(this._findAndClick(`#${nextItem.id}`, nextItem.innerText)).catch((err) => {
        logger.error('Failed to open list entry', err);
      });

      return;
    }

    // List complete, move onto the next service.
    service.done = true;
    this.mode = nextMode;
    await this.page.goto(this.startPage);
  }

  /**
   * Walks the electronic money institutions list; switches to mode 2 when done.
   *
   * @returns {Promise<void>}
   */
  async handleEmoneyServices() {
    await this._walkServiceList(this.emoneyServices, 2);
  }

  /**
   * Walks the payment institutions list; switches to mode 1 when done.
   *
   * @returns {Promise<void>}
   */
  async handlePaymentServices() {
    await this._walkServiceList(this.paymentServices, 1);
  }

  /**
   * Resets the scrape state, opens the browser, wires the page-load handler
   * and loads the start page. Errors raised here propagate unchanged.
   *
   * @returns {Promise<void>}
   */
  async start() {
    super._start();

    // Financial Supervisory Authority
    // Government ministry
    // https://en.wikipedia.org/wiki/Financial_Supervisory_Authority_(Denmark)
    const newServiceState = () => ({
      'items': 0,
      // Filled with the array returned by extractLinks()
      'links': [],
      'step': 0,
      'visited': false,
      'done': false
    });

    this.mode = 0;
    this.paymentServices = newServiceState();
    this.emoneyServices = newServiceState();

    this.startPage = 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx';

    this.setPath(path.resolve(`${__dirname }/../artefacts/DK/FSA`));

    await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
      logger.error(err);
    });

    await this._initBrowser();

    this.page = await this.browser.newPage();

    // Every page load drives the next step; any failure ends the scrape
    this.page.on('domcontentloaded', () => {
      this.processNewPage().catch((err) => {
        logger.error('####', err);
        this.emit('done');
      });
    });

    await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });

    await this.page.setViewport({ 'width': 1200, 'height': 800 });

    await this.page.goto(this.startPage).catch(async (err) => {
      logger.error(err);
      await this._uploadError();
    });

    await this._randomWait(this.page, 3, 5);
  }

  /**
   * Entry point used by the throttled `run` wrapper set up in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// The scraper class is this module's only export
module.exports = DKScrape;