const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const url = require('url');
const logger = require('log4js').getLogger('CY');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Locate the element that immediately follows the LAST `<p>` whose text
 * matches `headingPattern`.
 *
 * @param {Function} $ cheerio root returned by `cheerio.load`
 * @param {RegExp} headingPattern pattern tested against each paragraph's text
 * @returns {Object|undefined} cheerio selection of the following sibling
 *   (possibly empty), or `undefined` when no paragraph matches
 */
function findSiblingAfterHeading($, headingPattern) {
  let sibling;

  $('p').each((index, elem) => {
    if (headingPattern.test($(elem).text())) sibling = $(elem).next();
  });

  return sibling;
}

/**
 * Collect the text of every `<li>` found below `node`.
 *
 * @param {Function} $ cheerio root returned by `cheerio.load`
 * @param {Object} node cheerio selection to search in
 * @returns {string[]} list item texts in document order
 */
function collectListItemTexts($, node) {
  const texts = [];

  node.find('li').each((index, elem) => {
    texts.push($(elem).text());
  });

  return texts;
}

/**
 * Apply an "enumerator + label" pattern to `text` and return the label
 * (capture group 2).
 *
 * @param {string} text text to parse
 * @param {RegExp} pattern pattern whose second capture group is the label
 * @returns {string|null} the label, or `null` when the text does not match
 */
function stripEnumerator(text, pattern) {
  const match = pattern.exec(text);

  return match ? match[2] : null;
}

/**
 * Scraper for the Central Bank of Cyprus licensing pages. It navigates the
 * payment-institution, e-money and credit-institution pages, takes
 * screenshots, triggers file downloads and writes the parsed register of
 * credit institutions to `creditInstitutes.json`.
 */
class CYScrape extends Scraper {

  constructor() {
    super();
    this.setID('CY');
    this.addToBlockFilters(['recaptcha']);

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((lockResult) => {
          if (lockResult) this.run();
        })
        .catch((err) => {
          // Without this handler a failing lock check is an unhandled rejection.
          logger.error('checkLock', err);
        });
    }
  }

  /**
   * Visit the `href` of every element matching `selector` so that the browser
   * downloads the linked file into `this.path`.
   *
   * @param {string} selector CSS selector of the link(s) to follow
   * @returns {Promise<void>}
   */
  async grabLink(selector) {
    const clickableLinks = await this.page.$$(selector);

    await this.page._client.send('Page.setDownloadBehavior', {
      'behavior': 'allow',
      'downloadPath': this.path
    });

    for (const item of clickableLinks) {
      const href = await this.page.evaluate((el) => el.href, item);

      await this._randomWait(this.page, 3, 5);
      await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
        // log this error but Puppeteer isn't supposed to support this sort of download....
        // mute the ERR_ABORTED error which happens every time but alert for everything else.
        const message = (err && err.message) || '';

        if (!message.includes('net::ERR_ABORTED')) logger.error('grabLink', err);
      });
    }
  }

  /**
   * Download one of the two documents linked from the e-money page.
   *
   * @param {number} id index (0 or 1) of the link selector to use
   * @returns {Promise<void>}
   */
  async downloadEmoney(id) {
    const selector = [
      '#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a',
      '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'
    ];

    await this.grabLink(selector[id]);
  }

  /**
   * Download the file(s) linked from the workshop article headings.
   *
   * @returns {Promise<void>}
   */
  async downloadExcel() {
    const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';

    await this.grabLink(selector);
  }

  /**
   * Handle the payment-institutions page: screenshot it, download the linked
   * file(s), then navigate to the e-money page.
   *
   * @returns {Promise<void>}
   */
  async handlePaymentInstitutions() {
    const filename = 'licensing-and-supervision-of-payment-institutions';

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);
    await this.downloadExcel();
    await this._randomWait(this.page, 3, 5);
    await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
  }

  /**
   * Handle the e-money page: screenshot it, download both linked documents,
   * then emit `startProcessingCreditServices`.
   *
   * @returns {Promise<void>}
   */
  async handleElectronicMoneyInstitutions() {
    const filename = 'licensing-and-supervision-of-electronic-money-institutions';

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);
    await this.downloadEmoney(0);
    await this._randomWait(this.page, 3, 5);
    await this.downloadEmoney(1);
    await this._randomWait(this.page, 3, 5);
    this.emit('startProcessingCreditServices');
  }

  /**
   * Extract the numbered list that follows the
   * "LOCAL AUTHORISED CREDIT INSTITUTIONS" paragraph.
   *
   * Lines that do not look like "<number>. <name>" are skipped with a warning
   * instead of aborting the whole extraction.
   *
   * @param {string} body HTML fragment to parse
   * @returns {Promise<{}|Array|undefined>} the institution names, `{}` when
   *   the heading is not found, or `undefined` when parsing throws (the error
   *   is logged)
   */
  async extractLocalCreditInstitutions(body) {
    try {
      const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
      const sanity = /(\d+\.\s)(.+)/;
      const $ = cheerio.load(body, { 'normalizeWhitespace': true });
      const listNode = findSiblingAfterHeading($, matchHeading);

      if (!listNode) return {};

      const output = [];

      for (const rawLine of listNode.text().split('\n')) {
        const line = this._cleanUp(rawLine);

        if (line === '') continue;

        const name = stripEnumerator(line, sanity);

        if (name === null) {
          logger.warn('Skipping unrecognised local credit institution line:', line);
          continue;
        }

        output.push(name);
      }

      return output;
    } catch (err) {
      logger.error(err);

      return undefined;
    }
  }

  /**
   * Extract the foreign credit institutions that follow the matching heading
   * paragraph.
   *
   * The expected layout is two groups, each made of a group heading followed
   * by two (sub-heading, list) pairs. The result maps
   * group heading -> sub-heading -> list item texts. If the page contains
   * fewer sections than expected, whatever was parsed so far is returned.
   *
   * @param {string} body HTML fragment to parse
   * @returns {Promise<Object|undefined>} the nested mapping (`{}` when the
   *   heading is not found), or `undefined` when parsing throws (the error is
   *   logged)
   */
  async extractForeignCreditInstitutions(body) {
    // Rolled out as fixed counts for ease, as the page layout could be changed by hand.
    const GROUP_COUNT = 2;
    const SUBGROUPS_PER_GROUP = 2;

    try {
      const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;
      const sanity = /(\w+\.\s+)(.+)/;
      const $ = cheerio.load(body, { 'normalizeWhitespace': true });
      const output = {};

      let cursor = findSiblingAfterHeading($, matchHeading);

      if (!cursor) return output;

      for (let group = 0; group < GROUP_COUNT; group++) {
        const groupHead = stripEnumerator(this._cleanUp(cursor.text()), sanity);

        if (groupHead === null) {
          logger.warn('Foreign credit institutions: expected a group heading, stopping early');

          return output;
        }

        output[groupHead] = {};
        cursor = cursor.next();

        for (let sub = 0; sub < SUBGROUPS_PER_GROUP; sub++) {
          const subHead = stripEnumerator(this._cleanUp(cursor.text()), sanity);

          if (subHead === null) {
            logger.warn('Foreign credit institutions: expected a sub-heading, stopping early');

            return output;
          }

          // The element after the sub-heading holds the <li> entries.
          cursor = cursor.next();
          output[groupHead][subHead] = collectListItemTexts($, cursor);
          cursor = cursor.next();
        }
      }

      return output;
    } catch (err) {
      logger.error(err);

      return undefined;
    }
  }

  /**
   * Process the credit-institutions register page: screenshot it, dump the
   * HTML, parse the local and foreign lists, write them to
   * `creditInstitutes.json` and emit `done`.
   *
   * @returns {Promise<{local: *, creditInstitutes: *}|undefined>} the parsed
   *   data, or `undefined` when a step throws (the error is logged and `done`
   *   is not emitted)
   */
  async processCreditInstitute() {
    logger.info('Credit institutes');

    try {
      await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);

      const body = await this.page.content();

      await this._dumpFile(`${this.path}/creditInstitutes.html`, body);

      const $ = cheerio.load(body);
      const introHtml = $('.generic_page-intro').html();
      const local = await this.extractLocalCreditInstitutions(introHtml);
      const creditInstitutes = await this.extractForeignCreditInstitutions(introHtml);

      await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });
      this.emit('done');

      return { local, creditInstitutes };
    } catch (err) {
      logger.error(err);

      return undefined;
    }
  }

  /**
   * Render the current page to an A4 PDF.
   *
   * @param {string} filePath path handed to `page.pdf` as the output file
   * @returns {Promise<void>}
   */
  async savePDF(filePath) {
    logger.info('Saving the pdf:', filePath);
    await this._randomWait(this.page, 5, 7);
    await this.page.pdf({ 'path': filePath, 'format': 'A4' });
    // TODO: decide whether 'startProcessingCreditServices' should be emitted here.
    logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
  }

  /**
   * Dispatch on the pathname of the page that has just loaded and run the
   * matching handler.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page is not one of the known pages
   */
  async processNewPage() {
    // Escaped and anchored: only a pathname that really ends in ".pdf" counts.
    const checkPDF = /\.pdf$/i;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    let currentPath = pageUrl.pathname;
    let pdfFile;

    if (checkPDF.test(currentPath)) {
      // Split off the file name so the switch below matches on the directory.
      const splitPath = currentPath.split('/');

      pdfFile = splitPath.pop();
      currentPath = splitPath.join('/');
    }

    switch (currentPath) {
      case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
        await this.handlePaymentInstitutions();
        break;
      case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
        await this.handleElectronicMoneyInstitutions();
        break;
      case '/images/media/redirectfile/Electronic%20Money%20Institutions':
        logger.warn('We should only arrive here when in Non-headless mode');
        // NOTE(review): pdfFile is a bare file name, so the PDF path is not
        // under this.path like the other artefacts - confirm this is intended.
        await this.savePDF(pdfFile);
        break;
      case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':
        await this.processCreditInstitute();
        break;
      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Register the listener that navigates to the credit-institutions register
   * when `startProcessingCreditServices` is emitted.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    logger.info('Attaching events');

    this.on('startProcessingCreditServices', async () => {
      try {
        await this._goto(this.credit);
      } catch (err) {
        // An async listener's rejection is otherwise unhandled.
        logger.error('startProcessingCreditServices fail', err);
      }
    });
  }

  /**
   * Initialise state, the artefact directory and the browser page, hook the
   * page-load handler, then open the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} when any set-up step fails
   */
  async start() {
    try {
      super._start();

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done': false,
        'searchDone': false
      };

      this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
      this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
      this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';

      this.path = path.resolve(`${__dirname }/../artefacts/CY/CBOC`);

      await this._createDirectory(this.path);

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Only attach the listeners once, i.e. while just two event names are registered.
      if (this.eventNames().length === 2) await this.attachEvents();

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);
      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Rethrow real errors untouched so the original stack trace survives.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point invoked by the debounced `run`.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.info('Scraping Cyprus...');
    await this.start();
  }

}

module.exports = CYScrape;