const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('SE');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/** Matches every non-word character; used to turn an entity id into a file-name-safe string. */
const NON_WORD = /\W/g;

/**
 * One entry per register, in scraping order. The array index is the scraper's `mode`.
 * `key` is both the name of the state object on the scraper instance and the stem of the
 * JSON file written when the register is finished; the remaining fields are the event
 * names used to drive that register.
 */
const SERVICES = [
  { 'key': 'paymentServices', 'indexDone': 'psindexdone', 'start': 'startProcessingPaymentServices', 'done': 'paymentServicesDone' },
  { 'key': 'emoneyServices', 'indexDone': 'emindexdone', 'start': 'startProcessingEMoneyServices', 'done': 'emoneyServicesDone' },
  { 'key': 'creditServices', 'indexDone': 'ciindexdone', 'start': 'startProcessingcreditServices', 'done': 'creditServicesDone' }
];

/**
 * Returns the state object of the register selected by `scraper.mode`.
 *
 * @param {SEScrape} scraper
 * @returns {Object|undefined} undefined when `mode` does not select a known register
 */
function serviceForMode(scraper) {
  const entry = SERVICES[scraper.mode];

  return entry ? scraper[entry.key] : undefined;
}

/**
 * Derives the output file name and path for an entity id.
 *
 * NOTE(review): `modePrefix` is not assigned anywhere in this file; presumably the
 * Scraper base class provides it - confirm.
 *
 * @param {SEScrape} scraper
 * @param {string} id - entity name as read from the index page
 * @returns {{filename: string, filePath: string}} `filePath` is truncated to 240 characters
 */
function entityFileNames(scraper, id) {
  const entity = removeAccents.remove(id.trim());
  const filename = [scraper.modePrefix[scraper.mode], entity.replace(NON_WORD, '_')].join('');
  const filePath = `${scraper.path}/${filename}`.substring(0, 240);

  return { filename, filePath };
}

/**
 * Moves on to the next indexed entity, or emits 'serviceDone' when the last one has
 * been processed.
 *
 * @param {SEScrape} scraper
 * @param {Object} serviceObject - register state; `step` is incremented in place
 * @returns {Promise<void>}
 */
async function advanceToNextEntity(scraper, serviceObject) {
  serviceObject.step++;

  if (serviceObject.step < serviceObject.items)
    await scraper._goto(serviceObject.links[serviceObject.step].href);
  else
    scraper.emit('serviceDone');
}

/**
 * Scraper for the company register pages on www.fi.se. It walks three registers in
 * turn (payment services, e-money, credit services): first building an index of entity
 * links, then visiting every entity page and writing what it extracts to JSON.
 */
class SEScrape extends Scraper {

  /**
   * Registers the 'done' handler and the debounced `run` entry point. In production the
   * run is started automatically once `_checkLock()` resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('SE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l)
          this.run();
      });
  }

  /**
   * Extracts the entity name, the `dl.funky` definition list and the `ul.tillstand`
   * authorization list from an entity page.
   *
   * @param {string} html - full page markup
   * @returns {Promise<{details: Object, authorization: Array<{date: string, text: string, translated: *}>}>}
   *   `details.name` holds the cleaned `<h2>` text; every `<dt>` becomes a key whose
   *   value is the array of the following non-`<dt>` entries.
   */
  async extractEntity(html) {
    const $ = cheerio.load(html);
    const details = {};
    const authorization = [];

    details.name = this._cleanUp($('h2').text());

    const dlCells = $('dl.funky').children();
    const ulCells = $('ul.tillstand').children();
    let current = '';

    dlCells.each((index, item) => {
      const itemText = this._cleanUp($(item).text());

      if (item.name === 'dt') {
        details[itemText] = [];
        current = itemText;
      }
      else {
        // A value that appears before any <dt> has no key yet; keep it under ''
        // instead of failing on an undefined array.
        if (!details[current])
          details[current] = [];

        details[current].push(itemText);
      }
    });

    ulCells.each((index, item) => {
      const date = this._cleanUp($(item.children).eq(0).text());
      const text = this._cleanUp($(item.children).eq(1).text());

      authorization.push({ date, text, 'translated': this._translate(text) });
    });

    return { details, authorization };
  }

  /**
   * Processes the entity page that is currently loaded: takes a screenshot, extracts
   * the details and either follows the cross-border link (keeping the details on the
   * link entry for `processCrossBorderServicesV2`) or writes the JSON file and moves
   * on to the next entity.
   *
   * @param {Object} serviceObject - register state (`links`, `step`, `items`)
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {
    const link = serviceObject.links[serviceObject.step];
    const id = link.id;

    logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);

    await this._randomWait(this.page, 3, 5);

    const { filename, filePath } = entityFileNames(this, id);

    await this._randomWait(this.page, 3, 5);

    await this.page.waitForSelector('h1');

    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const $ = cheerio.load(body);
    const details = await this.extractEntity(body);
    const crossBorderExists = $('div.container a.link');

    if (crossBorderExists.length !== 0) {
      // NOTE(review): the details are stored wrapped as `{ details }` and wrapped again
      // when written by processCrossBorderServicesV2, so the JSON of entities with
      // cross-border services is nested one level deeper than the JSON written below.
      // Left as is because consumers may depend on it - confirm.
      link.data = { details };
      await this._findAndClick('div.container a.link', 'View cross border services');

      return;
    }

    await jsonfile.writeFile(`${filePath}.json`, { details });
    await this._randomWait(this.page, 3, 5);

    link.filename = `${filename}.json`;

    await advanceToNextEntity(this, serviceObject);
  }

  /**
   * Extracts the cross-border services table. Rows with a single cell are treated as
   * headings; the rows that follow are collected under the most recent heading.
   *
   * @param {string} html - full page markup
   * @returns {Promise<Object>} map of heading text to
   *   `{ authorization: Array<{date, text, translated}>, translated }`
   */
  async extractCrossBorderServices(html) {
    const services = {};
    const $ = cheerio.load(html);
    const rows = $('div.container table tbody tr');
    let current = '';

    rows.each((index, item) => {
      if ($(item).children().length === 1) {
        // this is a heading...
        const itemText = this._cleanUp($(item).text());

        services[itemText] = { 'authorization': [], 'translated': this._translate(itemText) };
        current = itemText;
      }
      else {
        const date = this._cleanUp($(item.children).eq(0).text());
        const text = this._cleanUp($(item.children).eq(1).text());
        const translated = this._translate(text);

        // A data row that precedes the first heading has no group yet; keep it under ''
        // instead of failing on an undefined entry.
        if (!services[current])
          services[current] = { 'authorization': [], 'translated': '' };

        services[current].authorization.push({ date, text, translated });
      }
    });

    return services;
  }

  /**
   * Processes the cross-border page of the current entity: takes a screenshot, extracts
   * the services, writes them together with the details stored by
   * `processEntityDetails`, then moves on to the next entity. Failures are logged and
   * not rethrown.
   *
   * @param {Object} serviceObject - register state (`links`, `step`, `items`)
   * @returns {Promise<void>}
   */
  async processCrossBorderServicesV2(serviceObject) {
    try {
      const link = serviceObject.links[serviceObject.step];
      const id = link.id;

      logger.info('Process CBS entity:', id);

      await this._randomWait(this.page, 3, 5);

      const { filename, filePath } = entityFileNames(this, id);

      await this._randomWait(this.page, 3, 5);

      await this.page.waitForSelector('h1');

      await this._makeScreenshotV2(this.page, `${filePath}_crossborder`, null);

      const body = await this.page.content();
      const crossBorderServices = await this.extractCrossBorderServices(body);

      // Take the details parked on the link entry and release them once consumed.
      const details = link.data;

      link.data = null;

      await jsonfile.writeFile(`${filePath}.json`, { details, crossBorderServices });
      await this._randomWait(this.page, 3, 5);

      link.filename = `${filename}.json`;

      await advanceToNextEntity(this, serviceObject);
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Collects the entity links from the `#institut` table of the current index page,
   * appends them to `serviceObject.links` and emits 'indexdone'.
   *
   * @param {Object} serviceObject - register state; `links`, `items` and `indexStep`
   *   are updated in place
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    logger.info(`Building the ${this.modeTitles[this.mode]} index...`);

    await this.page.waitForSelector('#institut', { 'visible': true });

    const links = await this.page.$$('#institut > tbody > tr > td > a');

    for (const item of links) {
      const id = await this.page.evaluate(el => el.innerText, item);
      const href = await this.page.evaluate(el => el.href, item);

      serviceObject.links.push({ id, 'href': `${href}&locale=en_GB` });
    }

    serviceObject.items = serviceObject.links.length;
    serviceObject.indexStep++;

    this.emit('indexdone');
  }

  /**
   * Builds the index for the register selected by the current mode.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    const service = serviceForMode(this);

    if (service)
      await this.buildIndex(service);
  }

  /**
   * Processes the entity page for the register selected by the current mode.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    const service = serviceForMode(this);

    if (service)
      await this.processEntityDetails(service);
  }

  /**
   * Processes the cross-border page for the register selected by the current mode.
   *
   * @returns {Promise<void>}
   */
  async crossBorderRedirector() {
    const service = serviceForMode(this);

    if (service)
      await this.processCrossBorderServicesV2(service);
  }

  /**
   * Dispatches on the path of the page that has just loaded: index page, entity
   * details page or cross-border page. Emits 'recover' when the browser landed on its
   * error page.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page path is not one of the known register pages
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    switch (pageUrl.pathname) {
      case '/en/our-registers/company-register/':
        await this.indexRedirector();
        break;

      case '/en/our-registers/company-register/details':
        await this.processRedirector();
        break;

      case '/en/our-registers/company-register/gransoverskridandehandel/':
        await this.crossBorderRedirector();
        break;

      default:
        await this._uploadError();
        // The parsed URL object has no string form of its own, so report its href.
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Wires up the event chain that drives the scrape. 'indexdone' and 'serviceDone' are
   * translated into the register-specific events of the current mode; each register
   * then pages through its index URLs, visits its entities and, when finished, writes
   * its link list and hands over to the next register (or emits 'done' after the last).
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    this.on('indexdone', async () => {
      const entry = SERVICES[this.mode];

      if (entry)
        this.emit(entry.indexDone);
    });

    this.on('serviceDone', async () => {
      const entry = SERVICES[this.mode];

      if (entry)
        this.emit(entry.done);
    });

    SERVICES.forEach((entry, position) => {
      // The state objects are re-created by start(), so they are looked up when the
      // event fires rather than captured here.
      this.on(entry.indexDone, async () => {
        const service = this[entry.key];

        if (service.indexStep < service.urls.length)
          await this._goto(service.urls[service.indexStep]);
        else
          this.emit(entry.start);
      });

      this.on(entry.start, async () => {
        const service = this[entry.key];

        service.items = service.links.length;
        logger.info(`${service.items} items indexed`);

        const first = service.links[service.step];

        if (!first) {
          // Nothing was indexed: finish this register instead of failing on a
          // missing link inside an event handler.
          logger.warn(`No ${entry.key} links to process`);
          this.emit('serviceDone');

          return;
        }

        await this._goto(first.href);
      });

      this.on(entry.done, async () => {
        const service = this[entry.key];
        const next = SERVICES[position + 1];

        service.done = true;

        jsonfile.writeFileSync(`${this.path}/${entry.key}.json`, { 'links': service.links });
        jsonfile.writeFileSync(`${this.debugPath}/${entry.key}.json`, service);

        if (next) {
          this.mode++;
          await this._goto(this[next.key].urls[0]);
        }
        else
          this.emit('done');
      });
    });
  }

  /**
   * Resets the per-run state, prepares the browser page and navigates to the first
   * index page. Everything after that is driven by the page's 'domcontentloaded' event.
   *
   * @returns {Promise<void>}
   * @throws {Error} whatever the set-up steps throw, rethrown unchanged
   */
  async start() {
    super._start();

    try {
      await this._loadDictionary();

      this.mode = 0;
      this.modeTitles = ['**Payment Service', 'EMoney', 'Credit Services'];

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BET&area=#results'/* , 'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BETREG&area=#results'*/]
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': [
          'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=EINST&area=#results',
          'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=REGUTG&area=#results'
        ]
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': [
          'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=BANK&area=#results',
          'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=MBANK&area=#results',
          'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=SPAR&area=#results'
        ]
      };

      this.startPage = this.paymentServices.urls[0];

      // NOTE(review): these two bafin.de URLs are not referenced anywhere in this file
      // and look like leftovers from another scraper - confirm before removing.
      this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
      this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';

      this.setPath(path.resolve(`${__dirname}/../artefacts/SE/FI`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Attach the event chain only once; start() may run again on the same instance.
      // NOTE(review): the expected count of 2 presumably covers the 'done' listener
      // from the constructor plus one registered by the base class - confirm.
      if (this.eventNames().length === 2)
        await this.attachEvents();

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true }).catch((err) => {
        logger.error(err);
      });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle2' });

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack and message survive; only wrap
      // values that are not already errors.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point used by the debounced `run`.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }

}

module.exports = SEScrape;