const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('(PT)');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * One entry per scraper mode (the array index is the value of `this.mode`).
 * `key` is the instance property holding that service's state object and the
 * prefix of its `<key>Done` event and `<key>.json` output files; `indexEvent`
 * is the event emitted once that service's index has been fully collected.
 */
const SERVICES = Object.freeze([
  { 'key': 'paymentServices', 'indexEvent': 'psindexdone' },
  { 'key': 'emoneyServices', 'indexEvent': 'emindexdone' },
  { 'key': 'creditServices', 'indexEvent': 'ciindexdone' }
]);

/**
 * Looks up the SERVICES entry for a mode.
 *
 * @param {number} mode - current scraper mode
 * @returns {{key: string, indexEvent: string}|undefined} the entry, or
 *   undefined when `mode` is not one of the known modes (0, 1, 2)
 */
const modeEntry = (mode) => (Number.isInteger(mode) ? SERVICES[mode] : undefined);

/**
 * Scraper for the authorised-entity lists on www.bportugal.pt.
 *
 * It walks three lists in sequence (payment, e-money, credit services). For
 * each one it first pages through the index collecting entity links, then
 * visits every entity page, saving a screenshot and a JSON file of details.
 *
 * Helpers prefixed with `_` (e.g. `_goto`, `_randomWait`, `_makeScreenshotV2`)
 * and properties such as `modeTitles`, `modeNames`, `modePrefix`, `path` and
 * `debugPath` are not defined in this file; presumably they come from the
 * `Scraper` base class.
 */
class PTScrape extends Scraper {
  /**
   * Sets the scraper ID, wires the 'done' event to `_done()`, and exposes
   * `run` as `__run` wrapped by `_throttle(..., 5000)`. When NODE_ENV is
   * 'production' it calls `_checkLock()` and starts a run if that resolves
   * truthy.
   */
  constructor() {
    super();
    this.setID('PT');

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          // Without this handler a failed lock check is an unhandled rejection.
          logger.error(err);
        });
    }
  }

  /**
   * Extracts the entity's details from an entity page.
   *
   * @param {string} html - full HTML of the entity page
   * @returns {Promise<Object|undefined>} an object with `name` plus one
   *   property per entry in the field table below; undefined if extraction
   *   threw (the error is logged, not rethrown)
   */
  async extractEntityDetails(html) {
    try {
      // [CSS class of the field wrapper, property name in the result]
      // NOTE(review): 'field-name-field-localidade' is stored as 'firstName',
      // which looks like a copy-paste slip; left unchanged because consumers
      // of the JSON output may rely on the key. Confirm before renaming.
      const detailSequence = [
        ['field-name-field-tipo-ent-aut', 'institutionType'],
        ['field-name-field-estado-ent', 'state'],
        ['field-name-field-morada', 'address'],
        ['field-name-field-localidade', 'firstName'],
        ['field-name-field-cod-postal', 'postcode'],
        ['field-name-field-pais', 'country'],
        ['field-name-field-data-limite', 'beginningOfActivity'],
        ['field-name-field-capital-subscrito', 'subscribedCapital'],
        ['field-name-field-capital-realizado', 'paidUpCapital'],
        ['field-name-field-jel', 'institutionCodeNumber']
      ];

      const $ = cheerio.load(html);
      const details = {};
      details.name = this._cleanUp($('h1.page-title').text());

      const mainDiv = $('div.content');
      for (const [cssClass, property] of detailSequence) {
        const field = mainDiv.find(`.${cssClass} div.field-items`);
        details[property] = this._cleanUp(field.text());
      }

      return details;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Processes the index page currently loaded in `this.page`: appends every
   * entity link on it to `serviceObject.links`, takes a screenshot, then
   * navigates to the next index page by incrementing the `page` query
   * parameter. When the page shows the empty-view marker, emits 'indexdone'
   * instead and returns.
   *
   * @param {Object} serviceObject - state object of the service being indexed;
   *   its `links` array is appended to
   * @returns {Promise<void>}
   */
  async processIndex(serviceObject) {
    logger.info(`Building the ${this.modeTitles[this.mode]} index...`);

    await this._randomWait(this.page, 3, 5);

    const body = await this.page.content();
    const $ = cheerio.load(body);

    if ($('div.view-empty').length > 0) {
      // We have reached an empty page, so we assume we've scraped all links from this index
      this.emit('indexdone');

      return;
    }

    $('div.views-field.views-field-title > span > a').each((i, item) => {
      const anchor = $(item);
      const text = anchor.text();

      serviceObject.links.push({
        'name': text,
        'href': `https://www.bportugal.pt${anchor.attr('href')}`,
        'id': this._makeFieldName(text)
      });
    });

    const filename = this.modeNames[this.mode];
    const parsedUrl = url.parse(this.page.url(), true);

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${parsedUrl.query.page}`, null);

    parsedUrl.query.page++;
    // Forces url.format to rebuild the query string from `query`, as modified on the line above
    parsedUrl.search = undefined;

    await this._goto(url.format(parsedUrl));
  }

  /**
   * Waits briefly, then processes the current index page.
   *
   * @param {Object} serviceObject - state object of the service being indexed
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    // We have stopped using the "view all" button due to it breaking.
    // Leaving the code below commented in case it is ever useful in future.
    // await this.page.waitForSelector('#block-system-main > div > div > div.view-content-wrapper > ul > li.pager__item.pager__item_all', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
    //   logger.debug('Extend menu list..');
    //   await elm.click({ 'delay':90 });
    // }).catch(() => {
    //   logger.info('No show all button');
    // });

    await this._randomWait(this.page, 6, 9);
    await this.processIndex(serviceObject);
  }

  /**
   * Handles arrival on an index page. If the 'Lista podmiotów' button shows up
   * within 7.5 s it is clicked and no indexing is done; otherwise (timeout, or
   * the click failing) the index for the current mode is built.
   *
   * NOTE(review): the button label is Polish, which is unexpected for this
   * site; the wait may always time out. Confirm whether it is still needed.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    logger.debug('>> indexRedirector');

    let doIndex;
    try {
      const elm = await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible': true, 'timeout': 7500 });
      logger.warn('Sent back to the main selector screen');
      await elm.click({ 'delay': 90 });
      doIndex = false;
    } catch (err) {
      // Selector did not appear (or the click failed): carry on with indexing.
      doIndex = true;
    }

    if (!doIndex) return;

    const entry = modeEntry(this.mode);
    if (entry) await this.buildIndex(this[entry.key]);
  }

  /**
   * Processes the entity page currently loaded in `this.page`: saves a
   * screenshot and a `<file>.json` with the extracted details, records the
   * JSON filename on the link, advances `serviceObject.step`, and then either
   * navigates to the next entity or emits 'serviceDone' after the last one.
   *
   * @param {Object} serviceObject - state object of the service being
   *   processed (`links`, `step` and `items` are read; `step` is incremented)
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {
    const nonWordChars = /\W/g;
    const current = serviceObject.links[serviceObject.step];
    const { name, id } = current;

    logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);

    await this.page.waitForSelector('h1.page-title', { 'visible': true, 'timeout': 7500 });
    await this._randomWait(this.page, 3, 5);

    const entity = removeAccents.remove(id.trim());
    const filename = [this.modePrefix[this.mode], entity.replace(nonWordChars, '_')].join('');
    // NOTE(review): the path is capped at 240 characters but the `filename`
    // stored on the link below is not, so the two can disagree for very long
    // names. Left unchanged; confirm whether consumers depend on it.
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const details = await this.extractEntityDetails(body);

    await jsonfile.writeFile(`${filePath}.json`, { details });
    await this._randomWait(this.page, 3, 5);

    current.filename = `${filename}.json`;
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items) {
      await this._goto(serviceObject.links[serviceObject.step].href);
    } else {
      this.emit('serviceDone');
    }
  }

  /**
   * Handles arrival on an entity page by processing it against the state
   * object of the current mode. Does nothing for an unknown mode.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    const entry = modeEntry(this.mode);
    if (entry) await this.processEntityDetails(this[entry.key]);
  }

  /**
   * Routes a freshly loaded page to the matching handler, based on the
   * leading '/en/<segment>/' part of its path.
   *
   * @returns {Promise<void>}
   * @throws {Error} 'Unknown page: <url>' for an unrecognised page when
   *   NODE_ENV is set (after calling `_uploadError()`)
   */
  async processNewPage() {
    // Lazy match: captures only the first '/en/<segment>/' of the path.
    const pathSplitter = /(\/en\/.+?\/)/;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    // `match` returns null when the path has no '/en/<segment>/' prefix (and
    // `pathname` itself may be null); treat both as an unknown page rather
    // than crashing on `null[0]`.
    const splitPath = pageUrl.pathname ? pageUrl.pathname.match(pathSplitter) : null;
    const pathname = splitPath ? splitPath[0] : null;

    switch (pathname) {
      case '/en/entidades-autorizadas/':
        await this.indexRedirector();
        break;

      case '/en/entidadeautorizada/':
        await this.processRedirector();
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          throw new Error(`Unknown page: ${pageUrl.href}`);
        } else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }
  }

  /**
   * Registers the event handlers that drive the scrape:
   *  - 'indexdone'   -> '<ps|em|ci>indexdone' for the current mode
   *  - '<xx>indexdone' -> record the link count and open the first entity
   *  - 'serviceDone' -> '<service>Done' for the current mode
   *  - '<service>Done' -> write the result files, advance the mode, then open
   *    the next service's index (or emit 'done' after the last service)
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    // Records how many links were indexed and navigates to the entity at the
    // service's current step.
    const openFirstEntity = async (key) => {
      const service = this[key];
      service.items = service.links.length;
      logger.info(`${service.items} items indexed`);
      await this._goto(service.links[service.step].href);
    };

    // Marks a service as done, writes its output files, advances the mode and
    // runs `next`. Errors are logged, not rethrown.
    const finalizeService = async (key, next) => {
      logger.warn(`${key}Done`);
      try {
        const service = this[key];
        service.done = true;
        jsonfile.writeFileSync(`${this.path}/${key}.json`, { 'links': service.links });
        jsonfile.writeFileSync(`${this.debugPath}/${key}.json`, service);
        this.mode++;
        this.inProgress = false;
        await next();
      } catch (e) {
        logger.error(e);
      }
    };

    this.on('entityComplete', () => {
      this.handleEntityComplete();
    });

    this.on('serviceDone', async () => {
      const entry = modeEntry(this.mode);
      if (entry) this.emit(`${entry.key}Done`);
    });

    for (const { key, indexEvent } of SERVICES) {
      this.on(indexEvent, () => openFirstEntity(key));
    }

    this.on('indexdone', async () => {
      const entry = modeEntry(this.mode);
      if (entry) this.emit(entry.indexEvent);
    });

    this.on('paymentServicesDone', () => finalizeService('paymentServices', () => this._goto(this.emoneyServices.urls[0])));

    this.on('emoneyServicesDone', () => finalizeService('emoneyServices', () => this._goto(this.creditServices.urls[0])));

    this.on('creditServicesDone', () => finalizeService('creditServices', () => this.emit('done')));
  }

  /**
   * Resets the per-service state, prepares the browser page, hooks
   * `processNewPage` to the page's 'domcontentloaded' event and opens the
   * first index page.
   *
   * @returns {Promise<void>}
   * @throws {Error} whatever the setup steps throw; non-Error values are
   *   wrapped in an Error
   */
  async start() {
    logger.debug(this.eventNames());
    super._start();

    try {
      this.mode = 0;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/75/all?page=0'],
        'sections': [],
        'sectionLinks': []
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/72/all?page=0'],
        'sections': [],
        'sectionLinks': []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/67-68-1524-69/all?page=0'],
        'sections': [],
        'sectionLinks': []
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/PT/BP`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();
      await this._makeResponsive();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage();
      }, 5000));

      // Only attach the handlers when exactly two event names are registered,
      // i.e. presumably before attachEvents has run on this instance.
      if (this.eventNames().length === 2) await this.attachEvents();

      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Rethrow the original error so its stack and type survive; wrap only
      // values that are not already Errors.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point used by the throttled `run`: starts the scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}

module.exports = PTScrape;