const cheerio = require('cheerio');
const logger = require('log4js').getLogger('AT');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const Scraper = require('../helpers/scraper');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/** Options used for every cheerio parse in this module (entities are left undecoded). */
const CHEERIO_OPTIONS = { 'decodeEntities': false };

/** Matches an HTML line break in any common spelling (`<br>`, `<br/>`, `<BR />`). */
const LINE_BREAK_TAG = /<br\s*\/?>/i;

/** Matches every character that is not a letter, digit or underscore. */
const NON_WORD_CHARS = /\W/g;

/** Upper bound, in characters, for the full path of a per-entity JSON file. */
const MAX_ENTITY_PATH_LENGTH = 240;

/** Extension of the per-entity JSON files. */
const ENTITY_FILE_EXTENSION = '.json';

/**
 * Builds the bookkeeping object kept for one scraping mode.
 *
 * @param {string[]} urls - Index URLs to visit for this mode, in order.
 * @returns {{items: number, links: Array, step: number, urlStep: number, paginationStep: number, urls: string[]}}
 *   Fresh state positioned on the first URL and the first results page.
 */
function createServiceState(urls) {
  return {
    'items': 0,
    'links': [],
    'step': 0,
    'urlStep': 0,
    'paginationStep': 1,
    urls
  };
}

/**
 * Promise adapter around the callback form of `jsonfile.writeFile`.
 *
 * @param {string} filePath - Destination file.
 * @param {Object} data - Value to serialise as JSON.
 * @returns {Promise<void>} Resolves once the file is written, rejects with the write error.
 */
function writeJsonFile(filePath, data) {
  return new Promise((resolve, reject) => {
    jsonfile.writeFile(filePath, data, (err) => {
      if (err) reject(err);
      else resolve();
    });
  });
}

/**
 * Scraper for the company database search on www.fma.gv.at.
 *
 * It walks the paginated search results for a fixed list of categories
 * (payment institutions, e-money institutions, banks), writes one JSON file
 * per listed entity plus a screenshot per results page, and emits `done`
 * when no URL is left to visit.
 */
class ATScrape extends Scraper {
  /**
   * Registers the `done` handler and the throttled `run` entry point.
   * In production the run is started once `_checkLock()` resolves to a
   * truthy value.
   */
  constructor() {
    super();
    this.setID('AT');
    this.on('done', () => {
      this._done();
    });
    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((lockResult) => {
          // NOTE(review): a truthy result presumably means the lock was obtained — confirm in Scraper.
          if (lockResult) return this.run();
          return undefined;
        })
        .catch((err) => {
          // Without this handler a failure here would be an unhandled rejection.
          logger.error('Lock check or scheduled run failed', err);
        });
    }
  }

  /**
   * Builds the search URL for one category of the company database.
   *
   * @param {string} category - Category id as used by the site's `category` query parameter.
   * @returns {string} URL of the first results page for that category.
   */
  getIndexUrl(category) {
    return `https://www.fma.gv.at/en/search-company-database/?cname=&place=&bic=&category=${category}&per_page=10&submitted=1&to=1`;
  }

  /**
   * Returns `html` with every element matching `selector` removed.
   *
   * @param {string} html - HTML fragment to filter.
   * @param {string} selector - CSS selector of the elements to drop.
   * @returns {string} The remaining markup.
   */
  getHtmlNotInMatchingElements(html, selector) {
    const $ = cheerio.load(html, CHEERIO_OPTIONS);
    $(selector).remove();
    // cheerio adds <html><head></head><body>...</body></html> around the result,
    // so we return the contents of <body>
    return $('body').html();
  }

  /**
   * Reads the value of a `<li><strong>Header:</strong> value</li>` entry
   * inside `div.company-details`.
   *
   * @param {Function} $ - cheerio instance loaded with one entity's markup.
   * @param {string} headerText - Text the `<strong>` header must contain.
   * @param {boolean} [extractTextFromHtml=false] - When true, return only the
   *   text content of the value instead of its markup.
   * @returns {string} Trimmed value, or `''` when no such header exists.
   */
  extractFieldFromLiWithStrongHeader($, headerText, extractTextFromHtml = false) {
    const fieldHeader = $(`div.company-details li > strong:contains("${headerText}")`);
    if (fieldHeader.length < 1) return '';

    const fieldLi = fieldHeader.parent().html();
    const fieldValue = this.getHtmlNotInMatchingElements(fieldLi, 'strong');
    if (!extractTextFromHtml) return fieldValue.trim();

    // Parse the value as a document instead of handing it to `$()`: a value
    // without any tag would otherwise be interpreted as a CSS selector.
    return cheerio.load(fieldValue, CHEERIO_OPTIONS).root().text().trim();
  }

  /**
   * Fills the single-valued fields (name, address, contact data, register
   * numbers) of `details`.
   *
   * @param {Function} $ - cheerio instance loaded with one entity's markup.
   * @param {Object} details - Object to populate; mutated in place.
   */
  extractSingleFields($, details) {
    details['name'] = this._cleanUp($('h3 > a').text());

    details['address'] = this.extractFieldFromLiWithStrongHeader($, 'Address:')
      .replace(/\s*\|\s*/g, ', ') // replace pipes with commas
      .replace(/\s+/g, ' ') // replace any non-standard spaces with simple spaces
      .trim();

    details['phone'] = this.extractFieldFromLiWithStrongHeader($, 'Phone:');
    details['email'] = this.extractFieldFromLiWithStrongHeader($, 'Email:', true);
    details['website'] = this.extractFieldFromLiWithStrongHeader($, 'Web:', true);
    details['bankIdentificationNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Bank identification number:');
    details['commercialRegisterNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Commercial register number:');
  }

  /**
   * Fills `details.categories` from the "Category:" list entry, one item per
   * line-break-separated segment. Leaves an empty list when the entry is
   * missing.
   *
   * @param {Function} $ - cheerio instance loaded with one entity's markup.
   * @param {Object} details - Object to populate; mutated in place.
   */
  extractMultiples($, details) {
    details['categories'] = [];

    const categoryHeader = $('div.company-details > ul > li > strong:contains("Category:")');
    if (categoryHeader.length < 1) return;

    const categoriesLi = categoryHeader.parent().html();
    const categoriesRaw = this.getHtmlNotInMatchingElements(categoriesLi, 'strong');
    details['categories'] = categoriesRaw
      .split(LINE_BREAK_TAG)
      .map((category) => this._cleanUp(category))
      .filter((category) => category !== '');
  }

  /**
   * Fills `details.permissions` with one `{ heading, body }` entry per `<h4>`
   * found in `div.modal-body`; `body` holds the non-empty,
   * line-break-separated segments of the element following the heading.
   *
   * @param {Function} $ - cheerio instance loaded with one entity's markup.
   * @param {Object} details - Object to populate; mutated in place.
   */
  extractPermissions($, details) {
    details['permissions'] = [];

    $('div.modal-body').find('h4').each((i, item) => {
      const heading = this._cleanUp($(item).text());
      // `.html()` is null when the heading has no following sibling.
      const bodyHtml = $(item).next().html() || '';
      const body = bodyHtml
        .split(LINE_BREAK_TAG)
        .map((segment) => this._cleanUp(segment))
        .filter((segment) => segment !== '');
      details['permissions'].push({ heading, body });
    });
  }

  /**
   * Extracts every supported field from the markup of one entity.
   *
   * @param {string} html - Inner HTML of one `div.company-details-wrap`.
   * @returns {Object} Details with single fields, `categories` and `permissions`.
   */
  extractEntityDetails(html) {
    const details = {};
    const $ = cheerio.load(html, CHEERIO_OPTIONS);
    this.extractSingleFields($, details);
    this.extractMultiples($, details);
    this.extractPermissions($, details);
    return details;
  }

  /**
   * @returns {string} Human-readable position (mode name, URL index, page number) for log messages.
   */
  currentPageAsString() {
    const { urlStep, paginationStep } = this.getCurrentMode();
    return `${this.modeNames[this.mode]} url ${urlStep}, page ${paginationStep}`;
  }

  /**
   * Injects CSS that lays the otherwise hidden detail panels out in the page
   * flow, then takes a screenshot named after the current mode, URL index
   * and page number.
   *
   * @returns {Promise<void>}
   */
  async expandAndScreenshot() {
    logger.info(`Expanding content on ${this.currentPageAsString()}`);
    await this.page.addStyleTag({
      'content': `
        div.company-details {
          /* make space for the content */
          position: static;
          width: auto;
          height: auto;
        }
        div.document-description {
          /* make content visible */
          display: block;
          position: static;
          opacity: 1;
        }
        div.modal-dialog {
          /* move the content back down (it's transformed up a bit by default) */
          transform: none !important;
        }
        div.modal-content {
          /* remove the drop shadow (might help render faster?) */
          box-shadow: none;
          -webkit-box-shadow: none;
        }
      `
    });

    logger.info(`Taking screenshot of ${this.currentPageAsString()}`);
    const { urlStep, paginationStep } = this.getCurrentMode();
    const filename = `${this.modeNames[this.mode]}_url_${urlStep}_page_${paginationStep}`;
    // Named so it does not shadow the `path` module.
    const screenshotPath = `${this.path}/${filename}`;
    await this._makeScreenshotV2(this.page, screenshotPath);
  }

  /**
   * Logs the number of collected links for the current mode and writes the
   * link list to `this.path` and the full mode state to `this.debugPath`.
   * Write errors are logged, not thrown.
   */
  serviceDone() {
    const modeName = this.modeNames[this.mode];
    const currentMode = this.getCurrentMode();
    logger.info(`${modeName} done. Total of ${currentMode.links.length} ${modeName} scraped.`);
    try {
      jsonfile.writeFileSync(`${this.path}/${modeName}.json`, { 'links': currentMode.links });
      jsonfile.writeFileSync(`${this.debugPath}/${modeName}.json`, currentMode);
    } catch (e) {
      logger.error(e);
    }
  }

  /**
   * Processes one search results page: takes the screenshot, writes one JSON
   * file per listed entity, records a link entry for each, then navigates to
   * the next results page, the next index URL, or emits `done` when nothing
   * is left.
   *
   * A failure while handling a single entity is logged and does not stop the
   * remaining entities or the navigation.
   *
   * @returns {Promise<void>}
   */
  async entityResultsPageProcessor() {
    const body = await this.page.content();
    const $ = cheerio.load(body, CHEERIO_OPTIONS);
    await this.expandAndScreenshot();

    const entities = $('div.company-details-wrap');
    const href = await this.page.url();
    const currentMode = this.getCurrentMode();
    // Room left for the file name stem once directory and extension are accounted for.
    const maxStemLength = Math.max(
      MAX_ENTITY_PATH_LENGTH - `${this.path}/`.length - ENTITY_FILE_EXTENSION.length,
      0
    );

    const saveEntity = async (item) => {
      const details = this.extractEntityDetails($(item).html());
      const id = this._makeFieldName(details.name);
      const entity = removeAccents.remove(id.trim());
      const stem = [this.modePrefix[this.mode], entity.replace(NON_WORD_CHARS, '_')].join('');
      // Truncate the stem, not the finished path, so the extension survives
      // and the recorded filename is the one that exists on disk.
      const filename = `${stem.substring(0, maxStemLength)}${ENTITY_FILE_EXTENSION}`;
      currentMode.links.push({ id, href, filename });
      await writeJsonFile(`${this.path}/${filename}`, { details });
    };

    // Wait for every write before navigating away; log per-entity failures
    // instead of leaving them as unhandled rejections.
    await Promise.all(entities.toArray().map((item) => saveEntity(item).catch((err) => {
      logger.error(`Failed to store entity from ${href}`, err);
    })));
    logger.info(`${entities.length} ${this.modeNames[this.mode]} entities scraped.`);

    const nextLink = await this.page.$('div.paging li.next:not(.disabled) a');
    if (nextLink !== null) {
      logger.info('Clicking through to next page.');
      currentMode.paginationStep++;
      const nextHref = await this.page.evaluate((link) => link.href, nextLink);
      await this._goto(nextHref);
      return;
    }

    this.serviceDone();
    currentMode.paginationStep = 1;
    const nextUrl = this.getNextUrl();
    if (nextUrl !== null) await this._goto(nextUrl);
    else this.emit('done');
  }

  /**
   * Handles a freshly loaded page: waits briefly, then dispatches to the
   * results page processor when the path is the company database search.
   * Any other path is logged as an error.
   *
   * @returns {Promise<void>}
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());
    if (pageUrl.pathname === '/en/search-company-database/') {
      await this.entityResultsPageProcessor();
    } else {
      logger.error(`Page url not recognised: ${pageUrl.href}`);
    }
  }

  /**
   * @returns {Object|undefined} State object of the active mode
   *   (0 payment, 1 e-money, 2 credit), or `undefined` for any other mode value.
   */
  getCurrentMode() {
    switch (this.mode) {
      case 0:
        return this.paymentServices;
      case 1:
        return this.emoneyServices;
      case 2:
        return this.creditServices;
      default:
        return undefined;
    }
  }

  /**
   * Advances to the next index URL: first within the current mode, then to
   * the next mode.
   *
   * @returns {string|null} The URL to visit next, or `null` when every URL
   *   of every mode has been visited.
   */
  getNextUrl() {
    const currentMode = this.getCurrentMode();
    if (currentMode.urlStep < currentMode.urls.length - 1) {
      currentMode.urlStep++;
    } else if (this.mode < this.modeNames.length - 1) {
      this.mode++;
    } else {
      return null;
    }

    // Re-read the mode: it may just have changed.
    const nextMode = this.getCurrentMode();
    return nextMode.urls[nextMode.urlStep];
  }

  /**
   * Prepares the browser page, the output path and the per-mode state, then
   * opens the first index URL. Subsequent pages are handled by the
   * `domcontentloaded` listener registered here.
   *
   * @returns {Promise}
   */
  async start() {
    super._start();
    await this._initBrowser();
    await this._createBrowserPage();

    this.page.on('domcontentloaded', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    this.setPath(path.resolve(`${__dirname}/../artefacts/AT/FMA`));

    this.mode = 0;
    this.paymentServices = createServiceState([
      this.getIndexUrl('1977'), // Payment institutions - Payment Institutions licensed in Austria
      this.getIndexUrl('2798'), // Payment Institutions - Account information service provider (AISP)
      this.getIndexUrl('2799') // Payment Institutions - Payment initiation service provider (PISP)
    ]);
    this.emoneyServices = createServiceState([
      this.getIndexUrl('2193') // E-Money-Institutions - E-Money-Institutions licensed in Austria
    ]);
    this.creditServices = createServiceState([
      this.getIndexUrl('165') // Banks - Banks licensed in Austria
    ]);

    this.startPage = this.paymentServices.urls[0];

    await this._doNonRepudiation().catch((err) => {
      logger.warn(err);
    });

    try {
      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);
    } catch (e) {
      // Rethrow real errors untouched so the original stack and type survive;
      // only non-Error values are wrapped.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point used by the throttled `run` wrapper.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}

module.exports = ATScrape;