// File-viewer header (not part of the program): 315 lines, 9.3 KiB, JavaScript
const cheerio = require('cheerio');
|
|
const logger = require('log4js').getLogger('AT');
|
|
const path = require('path');
|
|
const jsonfile = require('jsonfile');
|
|
const removeAccents = require('remove-accents-diacritics');
|
|
const url = require('url');
|
|
|
|
const Scraper = require('../helpers/scraper');
|
|
|
|
// Log verbosity is taken from the LOGGER_LEVEL environment variable and
// falls back to 'warn' when that variable is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
/**
 * Scraper for the company database published on www.fma.gv.at.
 *
 * The scrape walks three "modes" in order (payment, e-money and credit
 * services). Each mode owns a list of index URLs; every index URL is followed
 * through its pagination. For each results page a screenshot is taken and one
 * JSON file per listed entity is written below `this.path`.
 *
 * Helpers prefixed with a single underscore (`_goto`, `_throttle`, `_cleanUp`,
 * `_makeFieldName`, ...) as well as `modeNames`, `modePrefix` and `debugPath`
 * are not defined in this file; they are presumably provided by the `Scraper`
 * base class.
 */
class ATScrape extends Scraper {
  /**
   * Registers the 'done' handler, builds the throttled `run` entry point and,
   * when NODE_ENV is 'production', starts a run once `_checkLock()` resolves
   * to a truthy value.
   */
  constructor() {
    super();
    this.setID('AT');

    this.on('done', () => {
      this._done();
    });

    // `run` is `__run` wrapped by the base-class throttle helper (5000 is
    // passed through as its second argument).
    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((proceed) => (proceed ? this.run() : undefined))
        .catch((err) => {
          // Without this handler a failed lock check (or a failed run that
          // returns a promise) would surface as an unhandled rejection.
          logger.error('Automatic start failed', err);
        });
    }
  }

  /**
   * Builds the search URL of the company database for one category.
   *
   * @param {string|number} category - Category id used by the search form.
   * @returns {string} Absolute URL of the first results page.
   */
  getIndexUrl(category) {
    const categoryParam = encodeURIComponent(category);
    return `https://www.fma.gv.at/en/search-company-database/?cname=&place=&bic=&category=${categoryParam}&per_page=10&submitted=1&to=1`;
  }

  /**
   * Returns `html` with every element matching `selector` removed.
   *
   * @param {?string} html - HTML fragment; null/undefined yields ''.
   * @param {string} selector - Selector of the elements to drop.
   * @returns {string} Remaining markup.
   */
  getHtmlNotInMatchingElements(html, selector) {
    if (html == null) {
      return '';
    }

    const $ = cheerio.load(html, { 'decodeEntities': false });
    $(selector).remove();

    // cheerio adds <html><body>... around the result, so we return the contents of <body>
    return $('body').html() || '';
  }

  /**
   * Reads the value of a `<li><strong>Header:</strong> value</li>` row inside
   * `div.company-details`.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup.
   * @param {string} headerText - Text the `<strong>` header must contain.
   * @param {boolean} [extractTextFromHtml=false] - When true, strip markup
   *   from the value and return only its text.
   * @returns {string} Trimmed value, or '' when the header is absent.
   */
  extractFieldFromLiWithStrongHeader($, headerText, extractTextFromHtml = false) {
    const fieldHeader = $(`div.company-details li > strong:contains("${headerText}")`);
    if (fieldHeader.length < 1) {
      return '';
    }

    const fieldValue = this.getHtmlNotInMatchingElements(fieldHeader.parent().html(), 'strong');
    if (!extractTextFromHtml) {
      return fieldValue.trim();
    }

    // Put the fragment into a container before reading its text. Handing the
    // raw string to $() treats plain text (e.g. a bare e-mail address or URL)
    // as a CSS selector instead of as content.
    return $('<div></div>').html(fieldValue).text().trim();
  }

  /**
   * Fills the single-valued fields (name, address, contact data, register
   * numbers) of `details`.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup.
   * @param {Object} details - Object that receives the extracted fields.
   */
  extractSingleFields($, details) {
    details.name = this._cleanUp($('h3 > a').text());

    details.address = this.extractFieldFromLiWithStrongHeader($, 'Address:')
      .replace(/\s*\|\s*/g, ', ') // replace pipes with commas
      .replace(/\s+/g, ' ') // replace any non-standard spaces with simple spaces
      .trim();

    details.phone = this.extractFieldFromLiWithStrongHeader($, 'Phone:');
    details.email = this.extractFieldFromLiWithStrongHeader($, 'Email:', true);
    details.website = this.extractFieldFromLiWithStrongHeader($, 'Web:', true);
    details.bankIdentificationNumber = this.extractFieldFromLiWithStrongHeader($, 'Bank identification number:');
    details.commercialRegisterNumber = this.extractFieldFromLiWithStrongHeader($, 'Commercial register number:');
  }

  /**
   * Fills `details.categories` from the `<br>`-separated "Category:" row.
   * The list stays empty when the row is missing.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup.
   * @param {Object} details - Object that receives `categories`.
   */
  extractMultiples($, details) {
    details.categories = [];

    const categoryHeader = $('div.company-details > ul > li > strong:contains("Category:")');
    if (categoryHeader.length < 1) {
      return;
    }

    const categoriesRaw = this.getHtmlNotInMatchingElements(categoryHeader.parent().html(), 'strong');
    details.categories = categoriesRaw
      .split(/<br\s*\/?>/i)
      .map((category) => this._cleanUp(category))
      .filter((category) => category !== '');
  }

  /**
   * Fills `details.permissions` with one `{ heading, body }` entry per `<h4>`
   * inside `div.modal-body`; `body` holds the `<br>`-separated lines of the
   * element following the heading.
   *
   * @param {Function} $ - cheerio instance loaded with the entity markup.
   * @param {Object} details - Object that receives `permissions`.
   */
  extractPermissions($, details) {
    details.permissions = [];

    $('div.modal-body').find('h4').each((i, item) => {
      const heading = this._cleanUp($(item).text());
      // A heading without a following sibling yields null from .html().
      const bodyHtml = $(item).next().html() || '';
      const body = bodyHtml
        .split(/<br\s*\/?>/i)
        .map((line) => this._cleanUp(line))
        .filter((line) => line !== '');

      details.permissions.push({ heading, body });
    });
  }

  /**
   * Extracts every known field of one entity.
   *
   * @param {string} html - Inner HTML of one `div.company-details-wrap`.
   * @returns {Object} Details with single fields, `categories` and `permissions`.
   */
  extractEntityDetails(html) {
    const details = {};
    const $ = cheerio.load(html, { 'decodeEntities': false });

    this.extractSingleFields($, details);
    this.extractMultiples($, details);
    this.extractPermissions($, details);

    return details;
  }

  /**
   * @returns {string} Human-readable position, e.g. "<mode> url 0, page 1".
   */
  currentPageAsString() {
    const { urlStep, paginationStep } = this.getCurrentMode();
    return `${this.modeNames[this.mode]} url ${urlStep}, page ${paginationStep}`;
  }

  /**
   * Injects CSS that makes the collapsed detail sections visible, then takes
   * a screenshot of the current page.
   *
   * @returns {Promise<void>}
   */
  async expandAndScreenshot() {
    logger.info(`Expanding content on ${this.currentPageAsString()}`);

    await this.page.addStyleTag({
      'content': `
        div.company-details { /* make space for the content */
          position: static;
          width: auto;
          height: auto;
        }
        div.document-description { /* make content visible */
          display: block;
          position: static;
          opacity: 1;
        }
        div.modal-dialog { /* move the content back down (it's transformed up a bit by default) */
          transform: none !important;
        }
        div.modal-content { /* remove the drop shadow (might help render faster?) */
          box-shadow: none;
          -webkit-box-shadow: none;
        }
      `
    });

    logger.info(`Taking screenshot of ${this.currentPageAsString()}`);

    const { urlStep, paginationStep } = this.getCurrentMode();
    const filename = `${this.modeNames[this.mode]}_url_${urlStep}_page_${paginationStep}`;
    // Named `screenshotPath` so the module-level `path` import is not shadowed.
    const screenshotPath = `${this.path}/${filename}`;
    await this._makeScreenshotV2(this.page, screenshotPath);
  }

  /**
   * Logs the totals of the current mode and writes its link index (and the
   * full mode state, for debugging) to disk. Write errors are logged only.
   */
  serviceDone() {
    const modeName = this.modeNames[this.mode];
    const currentMode = this.getCurrentMode();

    logger.info(`${modeName} done. Total of ${currentMode.links.length} ${modeName} scraped.`);

    try {
      jsonfile.writeFileSync(`${this.path}/${modeName}.json`, { 'links': currentMode.links });
      jsonfile.writeFileSync(`${this.debugPath}/${modeName}.json`, currentMode);
    }
    catch (e) {
      logger.error(e);
    }
  }

  /**
   * Derives the id and output file of an entity from its display name.
   *
   * The slug is shortened when necessary so that the complete path stays
   * within 240 characters while keeping the `.json` extension, and the
   * returned `filename` is always the name of the file actually written.
   *
   * @param {string} name - Entity name as shown on the page.
   * @returns {{id: string, filename: string, filePath: string}}
   */
  buildEntityFile(name) {
    const maxPathLength = 240;
    const extension = '.json';

    const id = this._makeFieldName(name);
    // Every non-word character becomes an underscore.
    const slug = removeAccents.remove(id.trim()).replace(/\W/g, '_');

    const rawPrefix = this.modePrefix[this.mode];
    const prefix = rawPrefix == null ? '' : String(rawPrefix);

    const fixedLength = `${this.path}/${prefix}${extension}`.length;
    const room = Math.max(maxPathLength - fixedLength, 1);

    const filename = [prefix, slug.substring(0, room), extension].join('');
    return { id, filename, 'filePath': `${this.path}/${filename}` };
  }

  /**
   * Processes one results page: screenshot, one JSON file per entity, then
   * navigation to the next page, the next index URL, or emission of 'done'.
   *
   * @returns {Promise<void>} Rejects when navigation or page access fails.
   */
  async entityResultsPageProcessor() {
    const body = await this.page.content();
    const $ = cheerio.load(body, { 'decodeEntities': false });

    await this.expandAndScreenshot();

    const entities = $('div.company-details-wrap');
    const href = await this.page.url();
    const currentMode = this.getCurrentMode();

    entities.each((i, item) => {
      // One broken entity must not abort the rest of the page, and a link is
      // only recorded once its file has really been written.
      try {
        const details = this.extractEntityDetails($(item).html());
        const { id, filename, filePath } = this.buildEntityFile(details.name);
        jsonfile.writeFileSync(filePath, { details });
        currentMode.links.push({ id, href, filename });
      }
      catch (err) {
        logger.error(`Failed to store entity ${i} of ${this.currentPageAsString()}`, err);
      }
    });

    logger.info(`${entities.length} ${this.modeNames[this.mode]} entities scraped.`);

    const nextLink = await this.page.$('div.paging li.next:not(.disabled) a');
    if (nextLink !== null) {
      logger.info('Clicking through to next page.');
      currentMode.paginationStep++;
      const nextHref = await this.page.evaluate((link) => link.href, nextLink);
      // Awaited so that navigation failures reach the caller's error handler.
      await this._goto(nextHref);
      return;
    }

    this.serviceDone();
    currentMode.paginationStep = 1;

    const nextUrl = this.getNextUrl();
    if (nextUrl !== null) {
      await this._goto(nextUrl);
    }
    else {
      this.emit('done');
    }
  }

  /**
   * Dispatches a freshly loaded page to its processor based on the URL path.
   *
   * @returns {Promise<void>}
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.pathname === '/en/search-company-database/') {
      await this.entityResultsPageProcessor();
    }
    else {
      logger.error(`Page url not recognised: ${pageUrl.href}`);
    }
  }

  /**
   * @returns {Object|undefined} State object of the active mode
   *   (0 = payment, 1 = e-money, 2 = credit); undefined for any other value.
   */
  getCurrentMode() {
    const modes = [this.paymentServices, this.emoneyServices, this.creditServices];
    return modes[this.mode];
  }

  /**
   * Advances to the next index URL of the current mode or, when that mode is
   * exhausted, to the next mode.
   *
   * @returns {?string} URL to visit next, or null when everything is done.
   */
  getNextUrl() {
    const currentMode = this.getCurrentMode();

    if (currentMode.urlStep < currentMode.urls.length - 1) {
      currentMode.urlStep++;
    }
    else if (this.mode < this.modeNames.length - 1) {
      this.mode++;
    }
    else {
      return null;
    }

    // Re-read the mode: it may just have been advanced.
    const nextMode = this.getCurrentMode();
    return nextMode.urls[nextMode.urlStep];
  }

  /**
   * Prepares browser, page and per-mode state, then opens the first index
   * page. Further pages are handled by the 'domcontentloaded' listener.
   *
   * @returns {Promise<void>}
   * @throws {Error} When the viewport cannot be set or the start page cannot
   *   be opened.
   */
  async start() {
    super._start();

    await this._initBrowser();
    await this._createBrowserPage();

    this.page.on('domcontentloaded', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    this.setPath(path.resolve(__dirname, '..', 'artefacts', 'AT', 'FMA'));

    // Every mode starts from the same counters; only the URL list differs.
    const newModeState = (urls) => ({
      'items': 0,
      'links': [],
      'step': 0,
      'urlStep': 0,
      'paginationStep': 1,
      urls
    });

    this.mode = 0;

    this.paymentServices = newModeState([
      this.getIndexUrl('1977'), // Payment institutions - Payment Institutions licensed in Austria
      this.getIndexUrl('2798'), // Payment Institutions - Account information service provider (AISP)
      this.getIndexUrl('2799') // Payment Institutions - Payment initiation service provider (PISP)
    ]);

    this.emoneyServices = newModeState([
      this.getIndexUrl('2193') // E-Money-Institutions - E-Money-Institutions licensed in Austria
    ]);

    this.creditServices = newModeState([
      this.getIndexUrl('165') // Banks - Banks licensed in Austria
    ]);

    this.startPage = this.paymentServices.urls[0];

    // A failure here is only logged as a warning; the scrape continues.
    await this._doNonRepudiation().catch((err) => {
      logger.warn(err);
    });

    try {
      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);
    }
    catch (e) {
      // Rethrow the original error so its stack and type survive; only
      // non-Error values are wrapped.
      throw e instanceof Error ? e : new Error(String(e));
    }
  }

  /**
   * Body of the throttled `run` entry point.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
|
|
|
|
// The scraper class is the module's only export.
module.exports = ATScrape;
|