/*
 * obdfcascrape/ncas/at.js
 * Martin Donnelly 534fd67b5d final update
 * 2019-08-15 08:48:49 +01:00
 *
 * 315 lines
 * 9.3 KiB
 * JavaScript
 */

const cheerio = require('cheerio');
const logger = require('log4js').getLogger('AT');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const Scraper = require('../helpers/scraper');
// Log at 'warn' unless the LOGGER_LEVEL environment variable holds a non-empty value.
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Scraper for the company database published at www.fma.gv.at.
 *
 * The scraper walks a list of search-result URLs grouped into three "modes"
 * (payment services, e-money services, credit services). For every result page
 * it takes a screenshot, extracts each listed entity into its own JSON file and
 * follows the pagination until no "next" link remains, then moves on to the
 * next URL / mode.
 *
 * Helpers prefixed with `_` as well as `setID`, `setPath`, `on`, `emit`,
 * `modeNames` and `modePrefix` are not defined in this class; they are expected
 * to be provided by the `Scraper` base class.
 */
class ATScrape extends Scraper {
  /**
   * Registers the 'done' handler, builds the throttled `run` entry point and,
   * when NODE_ENV is 'production', starts a run once `_checkLock()` resolves
   * with a truthy value.
   */
  constructor() {
    super();

    this.setID('AT');

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l)
            this.run();
        })
        .catch((err) => {
          // Without a handler a failed lock check is an unhandled rejection.
          logger.error('_checkLock fail', err);
        });
    }
  }

  /**
   * Builds the search-results URL for one category of the company database.
   *
   * @param {string} category - category id placed in the `category` query parameter
   * @returns {string} the index URL (10 results per page)
   */
  getIndexUrl(category) {
    return `https://www.fma.gv.at/en/search-company-database/?cname=&place=&bic=&category=${category}&per_page=10&submitted=1&to=1`;
  }

  /**
   * Returns the given HTML fragment with every element matching `selector` removed.
   *
   * @param {?string} html - HTML fragment; `null`/`undefined` is treated as an empty fragment
   * @param {string} selector - selector of the elements to strip
   * @returns {string} the remaining HTML
   */
  getHtmlNotInMatchingElements(html, selector) {
    const $ = cheerio.load(html == null ? '' : html, { 'decodeEntities': false });

    $(selector).remove();

    // cheerio adds <html><body>... around the result, so we return the contents of <body>
    return $('body').html();
  }

  /**
   * Reads the value of a `<li><strong>Header:</strong> value</li>` entry inside
   * `div.company-details`.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup
   * @param {string} headerText - text the `<strong>` header must contain
   * @param {boolean} [extractTextFromHtml=false] - when true, return only the text
   *   content of the value instead of its HTML
   * @returns {string} the trimmed value, or '' when the header is not present
   */
  extractFieldFromLiWithStrongHeader($, headerText, extractTextFromHtml = false) {
    const fieldHeader = $(`div.company-details li > strong:contains("${headerText}")`);

    if (fieldHeader.length < 1)
      return '';

    const fieldLi = fieldHeader.parent().html();
    const fieldValue = this.getHtmlNotInMatchingElements(fieldLi, 'strong');

    if (!extractTextFromHtml)
      return fieldValue.trim();

    // Parse the value as its own document rather than calling $(fieldValue):
    // a value without any tags would otherwise be interpreted as a selector,
    // which silently yields '' or throws on characters such as '/'.
    const fragment = cheerio.load(fieldValue, { 'decodeEntities': false });

    return fragment('body').text().trim();
  }

  /**
   * Fills the single-valued fields (name, address, phone, ...) of `details`.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup
   * @param {Object} details - object that receives the extracted fields
   */
  extractSingleFields($, details) {
    details['name'] = this._cleanUp($('h3 > a').text());

    const addressRaw = this.extractFieldFromLiWithStrongHeader($, 'Address:');

    details['address'] = addressRaw
      .replace(/\s*\|\s*/g, ', ') // replace pipes with commas
      .replace(/\s+/g, ' ') // replace any non-standard spaces with simple spaces
      .trim();

    details['phone'] = this.extractFieldFromLiWithStrongHeader($, 'Phone:');
    details['email'] = this.extractFieldFromLiWithStrongHeader($, 'Email:', true);
    details['website'] = this.extractFieldFromLiWithStrongHeader($, 'Web:', true);
    details['bankIdentificationNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Bank identification number:');
    details['commercialRegisterNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Commercial register number:');
  }

  /**
   * Fills `details.categories` from the `<br>`-separated "Category:" entry.
   * The list stays empty when the entry is missing.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup
   * @param {Object} details - object that receives the `categories` array
   */
  extractMultiples($, details) {
    details['categories'] = [];

    const categoryHeader = $('div.company-details > ul > li > strong:contains("Category:")');

    // No category row: nothing to parse (and no HTML to hand to cheerio).
    if (categoryHeader.length < 1)
      return;

    const categoriesRaw = this.getHtmlNotInMatchingElements(categoryHeader.parent().html(), 'strong');

    for (const rawCategory of categoriesRaw.split(/<br>/)) {
      const category = this._cleanUp(rawCategory);

      if (category !== '')
        details['categories'].push(category);
    }
  }

  /**
   * Fills `details.permissions` with one `{ heading, body }` entry per `<h4>`
   * inside `div.modal-body`; `body` holds the non-empty `<br>`-separated lines
   * of the element that follows the heading.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup
   * @param {Object} details - object that receives the `permissions` array
   */
  extractPermissions($, details) {
    details['permissions'] = [];

    $('div.modal-body').find('h4').each((i, item) => {
      const heading = this._cleanUp($(item).text());

      // html() is null when the heading has no following sibling.
      const bodyHtml = $(item).next().html() || '';
      const body = bodyHtml
        .split('<br>')
        .map(line => this._cleanUp(line))
        .filter(line => line !== '');

      details['permissions'].push({ heading, body });
    });
  }

  /**
   * Extracts all known fields of a single entity.
   *
   * @param {string} html - markup of one `div.company-details-wrap` element
   * @returns {Object} the extracted details
   */
  extractEntityDetails(html) {
    const details = {};
    const $ = cheerio.load(html, { 'decodeEntities': false });

    this.extractSingleFields($, details);
    this.extractMultiples($, details);
    this.extractPermissions($, details);

    return details;
  }

  /**
   * @returns {string} human-readable position (mode name, url step, page) for log messages
   */
  currentPageAsString() {
    const { urlStep, paginationStep } = this.getCurrentMode();

    return `${this.modeNames[this.mode]} url ${urlStep}, page ${paginationStep}`;
  }

  /**
   * Injects CSS that makes the collapsed entity details visible and then takes
   * a screenshot of the current page.
   *
   * @returns {Promise<void>}
   */
  async expandAndScreenshot() {
    logger.info(`Expanding content on ${this.currentPageAsString()}`);

    await this.page.addStyleTag({
      'content':
`
div.company-details { /* make space for the content */
position: static;
width: auto;
height: auto;
}
div.document-description { /* make content visible */
display: block;
position: static;
opacity: 1;
}
div.modal-dialog { /* move the content back down (it's transformed up a bit by default) */
transform: none !important;
}
div.modal-content { /* remove the drop shadow (might help render faster?) */
box-shadow: none;
-webkit-box-shadow: none;
}
`
    });

    logger.info(`Taking screenshot of ${this.currentPageAsString()}`);

    const { urlStep, paginationStep } = this.getCurrentMode();
    const filename = `${this.modeNames[this.mode]}_url_${urlStep}_page_${paginationStep}`;

    // Named so that it does not shadow the imported `path` module.
    const screenshotPath = `${this.path}/${filename}`;

    await this._makeScreenshotV2(this.page, screenshotPath);
  }

  /**
   * Writes the collected links of the current mode, plus a debug dump of the
   * whole mode state. Write errors are logged, not thrown.
   */
  serviceDone() {
    const modeName = this.modeNames[this.mode];
    const currentMode = this.getCurrentMode();

    logger.info(`${modeName} done. Total of ${currentMode.links.length} ${modeName} scraped.`);

    try {
      jsonfile.writeFileSync(`${this.path}/${modeName}.json`, { 'links': currentMode.links });
      jsonfile.writeFileSync(`${this.debugPath}/${modeName}.json`, currentMode);
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Processes one search-results page: screenshots it, writes one JSON file per
   * listed entity, records the entity in the current mode's `links`, and then
   * navigates to the next results page, the next URL, or emits 'done'.
   *
   * A failure while handling one entity is logged and does not stop the
   * remaining entities from being processed.
   *
   * @returns {Promise<void>}
   */
  async entityResultsPageProcessor() {
    const body = await this.page.content();
    const $ = cheerio.load(body, { 'decodeEntities': false });

    await this.expandAndScreenshot();

    const entities = $('div.company-details-wrap');
    const href = await this.page.url();
    const nonWordChars = /\W/g;

    for (const item of entities.toArray()) {
      try {
        const details = this.extractEntityDetails($(item).html());
        const id = this._makeFieldName(details.name);
        const entity = removeAccents.remove(id.trim());
        const filename = [this.modePrefix[this.mode], entity.replace(nonWordChars, '_'), '.json'].join('');

        // NOTE(review): truncating the full path can cut off the '.json' suffix, in
        // which case the file on disk no longer matches the `filename` recorded below.
        const filePath = `${this.path}/${filename}`.substring(0, 240);

        this.getCurrentMode().links.push({ id, href, filename });

        await jsonfile.writeFile(filePath, { details });
      }
      catch (err) {
        logger.error(`Failed to process entity on ${this.currentPageAsString()}`, err);
      }
    }

    logger.info(`${entities.length} ${this.modeNames[this.mode]} entities scraped.`);

    const nextLink = await this.page.$('div.paging li.next:not(.disabled) a');

    if (nextLink !== null) {
      logger.info('Clicking through to next page.');

      this.getCurrentMode().paginationStep++;

      const nextHref = await this.page.evaluate(link => link.href, nextLink);

      await this._goto(nextHref);

      return;
    }

    // Last page of this URL: persist what was collected, then move on.
    this.serviceDone();
    this.getCurrentMode().paginationStep = 1;

    const nextUrl = this.getNextUrl();

    if (nextUrl !== null)
      await this._goto(nextUrl);
    else
      this.emit('done');
  }

  /**
   * Dispatches a freshly loaded page to its processor based on the URL path.
   *
   * @returns {Promise<void>}
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.pathname === '/en/search-company-database/')
      await this.entityResultsPageProcessor();
    else
      logger.error(`Page url not recognised: ${pageUrl.href}`);
  }

  /**
   * @returns {Object|undefined} the state object of the active mode
   *   (0: payment, 1: e-money, 2: credit), or undefined for any other mode value
   */
  getCurrentMode() {
    switch (this.mode) {
      case 0:
        return this.paymentServices;
      case 1:
        return this.emoneyServices;
      case 2:
        return this.creditServices;
      default:
        return undefined;
    }
  }

  /**
   * Advances to the next URL of the current mode or, when that mode has no
   * further URLs, to the next mode.
   *
   * @returns {?string} the URL to visit next, or null when every mode is exhausted
   */
  getNextUrl() {
    const current = this.getCurrentMode();

    if (current.urlStep < current.urls.length - 1) {
      current.urlStep++;
    }
    else if (this.mode < this.modeNames.length - 1) {
      this.mode++;
    }
    else {
      return null;
    }

    // Re-read the mode: it may just have changed.
    const active = this.getCurrentMode();

    return active.urls[active.urlStep];
  }

  /**
   * Prepares the browser page, initialises the per-mode state and navigates to
   * the first index URL. Subsequent pages are handled by the
   * 'domcontentloaded' listener registered here.
   *
   * @returns {Promise<void>}
   * @throws {Error} when setting the viewport or the initial navigation fails
   */
  async start() {
    super._start();

    await this._initBrowser();
    await this._createBrowserPage();

    this.page.on('domcontentloaded', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    this.setPath(path.resolve(`${__dirname }/../artefacts/AT/FMA`));

    // Every mode starts from the same counters and differs only in its URLs.
    const createModeState = urls => ({
      'items': 0,
      'links': [],
      'step': 0,
      'urlStep': 0,
      'paginationStep': 1,
      'urls': urls
    });

    this.mode = 0;

    this.paymentServices = createModeState([
      this.getIndexUrl('1977'), // Payment institutions - Payment Institutions licensed in Austria
      this.getIndexUrl('2798'), // Payment Institutions - Account information service provider (AISP)
      this.getIndexUrl('2799') // Payment Institutions - Payment initiation service provider (PISP)
    ]);

    this.emoneyServices = createModeState([
      this.getIndexUrl('2193') // E-Money-Institutions - E-Money-Institutions licensed in Austria
    ]);

    this.creditServices = createModeState([
      this.getIndexUrl('165') // Banks - Banks licensed in Austria
    ]);

    this.startPage = this.paymentServices.urls[0];

    await this._doNonRepudiation().catch((err) => {
      logger.warn(err);
    });

    try {
      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);
    }
    catch (e) {
      // Keep the original error (and its stack) instead of stringifying it.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Body of the throttled `run` entry point created in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// CommonJS export of the scraper class itself; no instance is created here.
module.exports = ATScrape;