obdfcascrape/ncas/cy.js
Martin Donnelly 534fd67b5d final update
2019-08-15 08:48:49 +01:00

458 lines
12 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const url = require('url');
// Module-level log4js logger in the 'CY' category. Its level is taken from
// the LOGGER_LEVEL environment variable and falls back to 'warn' when unset.
const logger = require('log4js').getLogger('CY');
logger.level = process.env.LOGGER_LEVEL || 'warn';
// NOTE(review): no env-file loading is visible in this module; LOGGER_LEVEL and
// NODE_ENV are read directly from process.env — confirm where they get populated.
/**
 * Scraper for the Central Bank of Cyprus (centralbank.cy) registers.
 *
 * Flow, as wired up in this class:
 *   start() opens the payment-institutions page -> every 'domcontentloaded'
 *   event is routed through processNewPage(), which dispatches on the URL path:
 *     payment institutions          -> handlePaymentInstitutions()  (then goes to the e-money page)
 *     electronic money institutions -> handleElectronicMoneyInstitutions()
 *                                      (then emits 'startProcessingCreditServices')
 *     register of credit institutions -> processCreditInstitute()   (then emits 'done')
 *
 * All helpers prefixed with an underscore that are not defined here
 * (_randomWait, _goto, _makeScreenshotV2, _cleanUp, ...) come from the
 * Scraper base class.
 */
class CYScrape extends Scraper {
  /**
   * Registers the scraper ID, the request block filter, the 'done' handler and
   * the debounced `run` entry point. When NODE_ENV is 'production' the run is
   * started automatically if `_checkLock()` resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('CY');
    this.addToBlockFilters(['recaptcha']);

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            this.run();
          }
        })
        .catch((err) => {
          // Without this handler a failing lock check would be an unhandled rejection.
          logger.error('_checkLock', err);
        });
    }
  }

  /**
   * Navigates to the href of every element matching `selector` so that the
   * browser downloads the linked file into `this.path`.
   *
   * @param {string} selector CSS selector matching the anchor(s) to follow.
   * @returns {Promise<void>}
   */
  async grabLink(selector) {
    const clickableLinks = await this.page.$$(selector);
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    // Read every href before navigating anywhere: if one of the gotos really
    // navigates (instead of aborting into a download) the remaining element
    // handles would no longer be usable.
    const hrefs = [];
    for (const item of clickableLinks) {
      hrefs.push(await this.page.evaluate(el => el.href, item));
    }

    for (const href of hrefs) {
      await this._randomWait(this.page, 3, 5);
      await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
        // log this error but Puppeteer isn't supposed to support this sort of download....
        // mute the ERR_ABORTED error which happens everytime but alert for everything else.
        const message = (err && err.message) || '';
        if (!message.includes('net::ERR_ABORTED')) {
          logger.error('grabLink', err);
        }
      });
    }
  }

  /**
   * Downloads one of the two files linked from the electronic-money page.
   *
   * @param {number} id Index (0 or 1) into the list of known link selectors.
   * @returns {Promise<void>}
   */
  async downloadEmoney(id) {
    const selector = [
      '#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a',
      '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'
    ];
    await this.grabLink(selector[id]);
  }

  /**
   * Downloads the file(s) linked from the workshop article headings on the
   * payment-institutions page.
   *
   * @returns {Promise<void>}
   */
  async downloadExcel() {
    const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';
    await this.grabLink(selector);
  }

  /**
   * Handles the payment-institutions page: screenshot, download the linked
   * file(s), then move on to the electronic-money page.
   *
   * @returns {Promise<void>}
   */
  async handlePaymentInstitutions() {
    const filename = 'licensing-and-supervision-of-payment-institutions';

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);
    await this.downloadExcel();
    await this._randomWait(this.page, 3, 5);
    await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
  }

  /**
   * Handles the electronic-money page: screenshot, download both linked
   * files, then emit 'startProcessingCreditServices' to continue with the
   * credit-institution register.
   *
   * @returns {Promise<void>}
   */
  async handleElectronicMoneyInstitutions() {
    const filename = 'licensing-and-supervision-of-electronic-money-institutions';

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);
    await this.downloadEmoney(0);
    await this._randomWait(this.page, 3, 5);
    await this.downloadEmoney(1);
    await this._randomWait(this.page, 3, 5);
    this.emit('startProcessingCreditServices');
  }

  /**
   * Extracts the numbered list that follows the
   * "LOCAL AUTHORISED CREDIT INSTITUTIONS" paragraph.
   *
   * Each non-empty line of the paragraph after the heading is expected to look
   * like "<number>. <name>"; the name part is collected. Lines that do not
   * follow that pattern are skipped with a warning instead of aborting the
   * whole extraction.
   *
   * @param {string} body HTML fragment to search.
   * @returns {Promise<Array<string>|{}|undefined>} The names; `{}` when the
   *   heading is not present; `undefined` when an error was caught and logged.
   */
  async extractLocalCreditInstitutions(body) {
    try {
      const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
      const sanity = /(\d+\.\s)(.+)/;
      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });

      // If several paragraphs match, the last one wins.
      let nextItem;
      $('p').each((i, elem) => {
        if (matchHeading.test($(elem).text())) {
          nextItem = $(elem).next();
        }
      });

      if (typeof nextItem === 'undefined' || nextItem === null) {
        return {};
      }

      const output = [];
      for (const line of nextItem.text().split('\n')) {
        const cleaned = this._cleanUp(line);
        if (cleaned === '') {
          continue;
        }
        const match = sanity.exec(cleaned);
        if (match === null) {
          logger.warn(`Skipping unnumbered local credit institution line: ${cleaned}`);
          continue;
        }
        output.push(match[2]);
      }

      return output;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the foreign credit institutions that follow the
   * "FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES ..." paragraph.
   *
   * The siblings after the heading are read in this fixed order, twice:
   *   group heading, section heading, list, section heading, list
   * Headings must look like "<label>. <text>"; only <text> is used as the key.
   * The list entries are the text of every <li> inside the list element.
   *
   * Parsing stops quietly when the sibling chain runs out, returning whatever
   * was collected so far. A heading that is present but does not match the
   * expected pattern aborts the extraction (logged, `undefined` returned).
   *
   * @param {string} body HTML fragment to search.
   * @returns {Promise<Object<string, Object<string, Array<string>>>|undefined>}
   *   `{ group: { section: [names] } }`; `{}` when the heading is not present;
   *   `undefined` when an error was caught and logged.
   */
  async extractForeignCreditInstitutions(body) {
    try {
      const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;
      const sanity = /(\w+\.\s+)(.+)/;
      // Layout of the page section this parser is written for.
      const groupCount = 2;
      const sectionsPerGroup = 2;

      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });
      const output = {};

      // `cursor` walks the siblings that follow the heading paragraph.
      // If several paragraphs match, the last one wins.
      let cursor;
      $('p').each((i, elem) => {
        if (matchHeading.test($(elem).text())) {
          cursor = $(elem).next();
        }
      });

      // cheerio's next() yields an empty selection (not undefined/null) at the
      // end of the sibling chain, so the length has to be checked as well.
      const hasNode = () => typeof cursor !== 'undefined' && cursor !== null && cursor.length > 0;

      // Reads the heading under the cursor, advances, and returns its text part.
      const takeHeading = () => {
        const raw = this._cleanUp(cursor.text());
        const match = sanity.exec(raw);
        if (match === null) {
          throw new Error(`Unrecognised credit institution heading: "${raw}"`);
        }
        cursor = cursor.next();
        return match[2];
      };

      // Collects the <li> texts under the cursor and advances.
      const takeList = () => {
        const items = [];
        cursor.find('li').each((i, elem) => {
          items.push($(elem).text());
        });
        cursor = cursor.next();
        return items;
      };

      for (let g = 0; g < groupCount && hasNode(); g++) {
        const group = takeHeading();
        output[group] = {};
        for (let s = 0; s < sectionsPerGroup && hasNode(); s++) {
          const section = takeHeading();
          output[group][section] = takeList();
        }
      }

      return output;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Processes the register-of-credit-institutions page: screenshot, HTML
   * dump, extraction of the local and foreign lists from the
   * `.generic_page-intro` element, JSON dump, then emits 'done'.
   *
   * Errors are logged rather than rethrown; in that case nothing is returned
   * and 'done' is not emitted.
   *
   * @returns {Promise<{local: *, creditInstitutes: *}|undefined>}
   */
  async processCreditInstitute() {
    logger.info('Credit institutes');
    try {
      await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);
      const body = await this.page.content();
      await this._dumpFile(`${this.path}/creditInstitutes.html`, body);

      const $ = cheerio.load(body);
      const introHtml = $('.generic_page-intro').html();
      const local = await this.extractLocalCreditInstitutions(introHtml);
      const creditInstitutes = await this.extractForeignCreditInstitutions(introHtml);

      await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });
      this.emit('done');

      return { local, creditInstitutes };
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Renders the current page to an A4 PDF.
   *
   * @param {string} filePath Destination passed straight to `page.pdf()`.
   *   NOTE(review): the only caller in this class passes a bare file name, so
   *   the PDF lands relative to the working directory, not in `this.path` —
   *   confirm whether that is intended.
   * @returns {Promise<void>}
   */
  async savePDF(filePath) {
    logger.info('Saving the pdf:', filePath);
    await this._randomWait(this.page, 5, 7);
    await this.page.pdf({ 'path': filePath, 'format': 'A4' });
    // TODO: nothing is emitted after the PDF is saved, so no follow-up step is
    // triggered from here ('startProcessingCreditServices' was once considered).
    logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
  }

  /**
   * Dispatches on the path of the page that just loaded.
   *
   * When the path ends in ".pdf" the file name is split off and the remaining
   * directory part is used for the dispatch.
   *
   * @returns {Promise<void>}
   * @throws {Error} When the page is not one this scraper knows about.
   */
  async processNewPage() {
    // Anchored and escaped so only a real ".pdf" suffix counts.
    const checkPDF = /\.pdf$/i;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    let currentPath = pageUrl.pathname;
    let pdfFile;

    if (checkPDF.test(currentPath)) {
      const splitPath = currentPath.split('/');
      pdfFile = splitPath.pop();
      currentPath = splitPath.join('/');
    }

    switch (currentPath) {
      case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
        await this.handlePaymentInstitutions();
        break;

      case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
        await this.handleElectronicMoneyInstitutions();
        break;

      case '/images/media/redirectfile/Electronic%20Money%20Institutions':
        logger.warn('We should only arrive here when in Non-headless mode');
        await this.savePDF(pdfFile);
        break;

      case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':
        await this.processCreditInstitute();
        break;

      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Registers the listener that opens the credit-institution register when
   * 'startProcessingCreditServices' is emitted.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    logger.info('Attaching events');
    this.on('startProcessingCreditServices', async () => {
      try {
        await this._goto(this.credit);
      }
      catch (err) {
        // An async listener's rejection would otherwise go unhandled.
        logger.error('startProcessingCreditServices', err);
      }
    });
  }

  /**
   * Prepares state, output directory and browser page, hooks page loads up to
   * processNewPage(), then opens the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} Any failure during setup; an Error is rethrown as-is so
   *   its stack is kept, anything else is wrapped in an Error.
   */
  async start() {
    try {
      // NOTE(review): not awaited, as in the original — confirm whether the
      // base-class _start() is synchronous.
      super._start();

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done' : false,
        'searchDone' : false
      };

      this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
      this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
      this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';

      this.path = path.resolve(`${__dirname }/../artefacts/CY/CBOC`);
      await this._createDirectory(this.path);

      // A failure here is only logged; the scrape carries on.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): relies on exactly two event names being registered before
      // the custom listener is attached (presumably to attach it only once) —
      // confirm against the listeners the base class registers.
      if (this.eventNames().length === 2) {
        await this.attachEvents();
      }

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);
      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      throw (e instanceof Error ? e : new Error(e));
    }
  }

  /**
   * Entry point behind the debounced `run`: logs and starts the scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.info('Scraping Cyprus...');
    await this.start();
  }
}
// Expose the CYScrape class as this module's sole export.
module.exports = CYScrape;