// Listing metadata: 458 lines, 12 KiB, JavaScript
// Project base class every scraper extends.
const Scraper = require('../helpers/scraper');

// HTML parsing, path handling, JSON output and URL parsing helpers.
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const url = require('url');

// Logger for this scraper, registered under the 'CY' category.
const log4js = require('log4js');
const logger = log4js.getLogger('CY');

// Verbosity comes from LOGGER_LEVEL; 'warn' is used when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
// NOTE: nothing in this module loads an env file; settings are read directly from process.env.
|
|
|
|
class CYScrape extends Scraper {
|
|
|
|
constructor() {
|
|
super();
|
|
this.setID('CY');
|
|
|
|
this.addToBlockFilters(['recaptcha']);
|
|
|
|
this.on('done', () => {
|
|
this._done();
|
|
});
|
|
|
|
this.run = this._debounce(async () => {
|
|
await this.__run();
|
|
}, 5000);
|
|
|
|
if (process.env.NODE_ENV === 'production')
|
|
this._checkLock().then((l) => {
|
|
if(l)
|
|
this.run();
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param selector
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async grabLink(selector) {
|
|
const clickableLinks = await this.page.$$(selector);
|
|
|
|
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
|
|
|
if (clickableLinks.length > 0)
|
|
for (const item of clickableLinks) {
|
|
const href = await this.page.evaluate(el => el.href, item);
|
|
await this._randomWait(this.page, 3, 5);
|
|
await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
|
|
// log this error but Puppeteer isn't supposed to support this sort of download....
|
|
// mute the ERR_ABORTED error which happens everytime but alert for everything else.
|
|
|
|
if (!err.message.includes('net::ERR_ABORTED') )
|
|
logger.error('grabLink', err);
|
|
});
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param id
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async downloadEmoney(id) {
|
|
const selector = ['#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a', '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'];
|
|
|
|
await this.grabLink(selector[id]);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async downloadExcel() {
|
|
const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';
|
|
|
|
await this.grabLink(selector);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handlePaymentInstitutions() {
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const filename = 'licensing-and-supervision-of-payment-institutions';
|
|
|
|
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
await this.downloadExcel();
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleElectronicMoneyInstitutions() {
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const filename = 'licensing-and-supervision-of-electronic-money-institutions';
|
|
|
|
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
await this.downloadEmoney(0);
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
await this.downloadEmoney(1);
|
|
await this._randomWait(this.page, 3, 5);
|
|
this.emit('startProcessingCreditServices');
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param body
|
|
* @returns {Promise<{}|Array>}
|
|
*/
|
|
async extractLocalCreditInstitutions(body) {
|
|
try{
|
|
const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
|
|
const sanity = /(\d+\.\s)(.+)/;
|
|
const $ = cheerio.load(body, {
|
|
'normalizeWhitespace': true
|
|
});
|
|
|
|
let nextItem;
|
|
|
|
$('p').each(function(i, elem) {
|
|
const lineText = $(this).text();
|
|
|
|
const isHeading = matchHeading.test(lineText);
|
|
if (isHeading)
|
|
nextItem = $(this).next();
|
|
});
|
|
|
|
if (typeof nextItem !== 'undefined' && nextItem !== null) {
|
|
const splitText = $(nextItem).text().split('\n');
|
|
|
|
const output = [];
|
|
|
|
splitText.forEach((item) => {
|
|
const newItem = this._cleanUp(item);
|
|
|
|
if ( newItem !== '')
|
|
output.push( sanity.exec(newItem)[2]);
|
|
});
|
|
|
|
return output;
|
|
}
|
|
|
|
return {};
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param body
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async extractForeignCreditInstitutions(body) {
|
|
try{
|
|
const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;
|
|
|
|
const sanity = /(\w+\.\s+)(.+)/;
|
|
|
|
const $ = cheerio.load(body, {
|
|
'normalizeWhitespace': true
|
|
});
|
|
|
|
const output = {};
|
|
|
|
let nextItem;
|
|
|
|
$('p').each(function(i, elem) {
|
|
const lineText = $(this).text();
|
|
const isHeading = matchHeading.test(lineText);
|
|
if (isHeading)
|
|
nextItem = $(this).next();
|
|
});
|
|
|
|
// Rolling this out for ease as it could be changed by hand
|
|
let nextElm;
|
|
|
|
let firstHeadOrig, firstHead;
|
|
|
|
if (typeof nextItem !== 'undefined' && nextItem !== null) {
|
|
firstHeadOrig = this._cleanUp($(nextItem).text());
|
|
firstHead = sanity.exec(firstHeadOrig)[2];
|
|
output[firstHead] = {};
|
|
|
|
nextElm = $(nextItem).next();
|
|
|
|
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
|
const secondHead = sanity.exec(secondHeadOrig)[2];
|
|
|
|
nextElm = $(nextElm).next();
|
|
|
|
const li = $(nextElm).find('li');
|
|
|
|
const arrayA = [];
|
|
$(li).each(function (i, elem) {
|
|
const lineText = $(this).text();
|
|
|
|
arrayA.push(lineText);
|
|
});
|
|
|
|
output[firstHead][secondHead] = arrayA;
|
|
nextElm = $(nextElm).next();
|
|
}
|
|
|
|
if (typeof nextElm !== 'undefined' && nextElm !== null) {
|
|
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
|
const secondHead = sanity.exec(secondHeadOrig)[2];
|
|
|
|
nextElm = $(nextElm).next();
|
|
|
|
const li = $(nextElm).find('li');
|
|
|
|
const arrayA = [];
|
|
$(li).each(function (i, elem) {
|
|
const lineText = $(this).text();
|
|
|
|
arrayA.push(lineText);
|
|
});
|
|
|
|
output[firstHead][secondHead] = arrayA;
|
|
nextElm = $(nextElm).next();
|
|
}
|
|
|
|
if (typeof nextElm !== 'undefined' && nextElm !== null) {
|
|
firstHeadOrig = this._cleanUp($(nextElm).text());
|
|
firstHead = sanity.exec(firstHeadOrig)[2];
|
|
output[firstHead] = {};
|
|
|
|
nextElm = $(nextElm).next();
|
|
|
|
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
|
const secondHead = sanity.exec(secondHeadOrig)[2];
|
|
|
|
nextElm = $(nextElm).next();
|
|
|
|
const li = $(nextElm).find('li');
|
|
|
|
const arrayA = [];
|
|
$(li).each(function (i, elem) {
|
|
const lineText = $(this).text();
|
|
arrayA.push(lineText);
|
|
});
|
|
|
|
output[firstHead][secondHead] = arrayA;
|
|
nextElm = $(nextElm).next();
|
|
}
|
|
|
|
if (typeof nextElm !== 'undefined' && nextElm !== null) {
|
|
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
|
const secondHead = sanity.exec(secondHeadOrig)[2];
|
|
|
|
nextElm = $(nextElm).next();
|
|
|
|
const li = $(nextElm).find('li');
|
|
|
|
const arrayA = [];
|
|
$(li).each(function (i, elem) {
|
|
const lineText = $(this).text();
|
|
arrayA.push(lineText);
|
|
});
|
|
|
|
output[firstHead][secondHead] = arrayA;
|
|
}
|
|
|
|
return output;
|
|
}
|
|
catch(err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<{local: Promise<*|void>}>}
|
|
*/
|
|
async processCreditInstitute() {
|
|
logger.info('Credit institutes');
|
|
try{
|
|
await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);
|
|
|
|
const body = await this.page.content();
|
|
|
|
await this._dumpFile(`${this.path}/creditInstitutes.html`, body);
|
|
const $ = cheerio.load(body);
|
|
|
|
const content = $('.generic_page-intro');
|
|
|
|
const local = await this.extractLocalCreditInstitutions(content.html());
|
|
const creditInstitutes = await this.extractForeignCreditInstitutions(content.html());
|
|
|
|
await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });
|
|
|
|
this.emit('done');
|
|
|
|
return { local, creditInstitutes };
|
|
}
|
|
catch(err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param filePath
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async savePDF(filePath) {
|
|
logger.info('Saving the pdf:', filePath);
|
|
|
|
await this._randomWait(this.page, 5, 7);
|
|
await this.page.pdf({ 'path': filePath, 'format': 'A4' });
|
|
// this.emit('startProcessingCreditServices');
|
|
logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processNewPage() {
|
|
// give the page a few seconds to settle
|
|
const checkPDF = /(.pdf)/g;
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
|
logger.warn('Directed to: chrome-error://chromewebdata/');
|
|
this.emit('recover');
|
|
|
|
return;
|
|
}
|
|
|
|
let currentPath = pageUrl.pathname;
|
|
let pdfFile;
|
|
|
|
if (checkPDF.test(currentPath)) {
|
|
const splitPath = currentPath.split('/');
|
|
|
|
pdfFile = splitPath.pop();
|
|
currentPath = splitPath.join('/');
|
|
}
|
|
|
|
switch (currentPath) {
|
|
|
|
case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
|
|
await this.handlePaymentInstitutions();
|
|
break;
|
|
case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
|
|
await this.handleElectronicMoneyInstitutions();
|
|
break;
|
|
case '/images/media/redirectfile/Electronic%20Money%20Institutions':
|
|
logger.warn('We should only arrive here when in Non-headless mode');
|
|
await this.savePDF(pdfFile);
|
|
break;
|
|
case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':
|
|
|
|
await this.processCreditInstitute();
|
|
break;
|
|
default:
|
|
|
|
await this._uploadError();
|
|
throw new Error(`Unknown page: ${pageUrl.href}`);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async attachEvents() {
|
|
logger.info('Attaching events');
|
|
this.on('startProcessingCreditServices', async function() {
|
|
await this._goto(this.credit);
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async start() {
|
|
try {
|
|
super._start();
|
|
this.creditServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'searchDone' : false
|
|
};
|
|
|
|
this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
|
|
this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
|
|
this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';
|
|
|
|
this.path = path.resolve(`${__dirname }/../artefacts/CY/CBOC`);
|
|
await this._createDirectory(this.path);
|
|
|
|
await this._doNonRepudiation().catch((err) => {
|
|
logger.warn(err);
|
|
});
|
|
|
|
await this._initBrowser(true);
|
|
await this._createBrowserPage();
|
|
|
|
this.page.on('domcontentloaded', this._throttle(async () => {
|
|
this.processNewPage().catch((err) => {
|
|
logger.error('processNewPage fail', err);
|
|
});
|
|
}, 2500));
|
|
|
|
if (this.eventNames().length === 2)
|
|
await this.attachEvents();
|
|
|
|
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
|
|
|
|
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
|
await this._goto(this.startPage);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
}
|
|
catch (e) {
|
|
throw new Error(e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async __run() {
|
|
logger.info('Scraping Cyprus...');
|
|
|
|
await this.start();
|
|
}
|
|
}
|
|
|
|
// Expose the CYScrape class as this module's sole export.
module.exports = CYScrape;
|