// obdfcascrape/ncas/mt.js
//
// 819 lines
// 22 KiB
// JavaScript
// Raw Normal View History
//
// 2019-05-05 19:13:56 +00:00
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('MT');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class MTScrape extends Scraper {
constructor() {
super();
this.id = 'MT';
this.on('done', () => {
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @returns {Promise<{authorization, details}>}
* @constructor
*/
async OLDextractEntity(html) {
const $ = cheerio.load(html);
const details = {};
const authorization = {};
details.name = this._cleanUp($('#lblName').text());
const dlCells = $('div#pnlCommonDetails').children();
const superCells = $('#LHDetails span.fix-width-caption');
// #lblStatus
dlCells.each((index, item) => {
if ($(item).attr('id') === 'pnlRegDate') {
const itemText = this._cleanUp($(item).find('span').text()).split(/\s*:\s*/);
details[itemText[0]] = itemText[1];
}
else {
const current = this._cleanUp($(item).find('p').text()).replace(/\s*:\s*/, '');
details[current] = this._cleanUp($(item).find('span').text());
}
});
superCells.each((index, item) => {
const nextElm = $($(item).next());
const li = $(nextElm).find('li');
const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');
authorization[thisId] = [];
if (li.length > 0)
li.each((index, item) => {
const auth = $(item).html().split(' - ');
auth[1] = this._cleanUp(auth[1]);
authorization[thisId].push(auth);
});
else {
const itemText = this._cleanUp($(nextElm).text());
authorization[thisId].push(itemText);
}
});
return { details, authorization };
}
/**
*
* @param html
* @returns {Promise<{authorization, details}>}
*/
async extractEntityV2(html) {
const trimToColon = /^.*?(?=(:))/;
const $ = cheerio.load(html);
const details = {};
const authorization = {};
const errors = [];
details.name = this._cleanUp($('div#mainTitle > div').text());
const dlCells = $('table#tableLicenceResult tr');
const superCells = $('#LHDetails span.fix-width-caption');
let previousLabel = '';
dlCells.each((index, item) => {
const children = $(item).children();
const rawLabel = $(children).eq(0).text().match(trimToColon);
const itemValue = this._cleanUp($(children).eq(1).text().trim());
if (rawLabel !== null ) {
const itemLabel = this._cleanUp(rawLabel[0]);
details[itemLabel] = itemValue;
previousLabel = itemLabel;
}
else
details[previousLabel] = details[previousLabel].concat([itemValue]);
});
previousLabel = '';
superCells.each((index, item) => {
const nextElm = $($(item).next());
const children = $(nextElm).children();
if ($(children).length <= 1) {
const li = $(nextElm).find('li');
const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');
authorization[thisId] = [];
if (li.length > 0)
li.each((index, item) => {
const auth = $(item).text().split(' - ');
auth[1] = this._cleanUp(auth[1]);
if (auth[1] !== '')
authorization[thisId].push(auth);
});
else {
const itemText = this._cleanUp($(nextElm).text());
authorization[thisId].push(itemText);
}
}
else {
logger.warn('Possible error in the HTML');
logger.warn($(nextElm).html());
errors.push($(nextElm).html());
}
});
const outObj = { details, authorization };
if (errors.length > 0)
outObj.errors = errors;
return outObj;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
* @constructor
*/
async OLDprocessIndex(serviceObject) {
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
const pagingItem = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tfoot > tr > td > table > tbody > tr > td > div.rgWrap.rgInfoPart strong');
const maxPagesText = (pagingItem.length > 0) ? await this.page.evaluate(el => el.innerText, pagingItem[1]) : '0';
const maxPages = parseInt(maxPagesText, 10);
const links = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tbody > tr > td> a');
for (const item of links) {
const id = await this.page.evaluate(el => el.innerText, item);
const href = await this.page.evaluate(el => el.href, item);
const params = this._getParamsFromUrl(href);
serviceObject.links.push({ id, href, 'entId': params.id, 'metaStep': serviceObject.indexMetaStep });
}
if (serviceObject.indexStep < (maxPages - 1) ) {
serviceObject.indexStep++;
await this._findAndClick('input.rgPageNext');
}
else
this.emit('indexdone');
}
async processIndexV2(serviceObject) {
// #tableResult span
const numberRegEx = /\d+/;
logger.debug('+ processIndexV2');
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
const links = await this.page.$$('#tableResult span');
for (const item of links) {
const id = await this.page.evaluate(el => el.innerText, item);
const href = await this.page.evaluate(el => el.getAttribute('onclick'), item);
serviceObject.links.push({ id, 'entId': href.match(numberRegEx)[0], 'metaStep': serviceObject.indexMetaStep });
}
this.emit('indexdone');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
* @constructor
*/
async OLDinitiateIndex(serviceObject) {
logger.debug('initiateIndex');
const matched = { 'left':false, 'right':false };
// first time around.
// need to kick off the index correctly..
await this._findAndClick('#ctl00_cphMain_RadComboBox1');
await this._randomWait(this.page, 2, 3);
const leftOptions = await this.page.$$('#ctl00_cphMain_RadComboBox1_DropDown > div > ul.rcbList li');
const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];
for (const item of leftOptions) {
const text = await this.page.evaluate(el => el.innerText, item);
if (wantedOption.indexOf(text) !== -1) {
await item.click({ 'delay':95 });
matched.left = true;
// this element can take a while to reload..
break;
}
}
await this._randomWait(this.page, 7, 9);
await this._findAndClick('#ctl00_cphMain_RadComboBox2_Input');
await this._randomWait(this.page, 2, 3);
const rightOptions = await this.page.$$('#ctl00_cphMain_RadComboBox2_DropDown > div > ul.rcbList li');
for (const item of rightOptions) {
const text = await this.page.evaluate(el => el.innerText, item);
if (text === wantedOption[1]) {
matched.right = true;
await item.click({ 'delay':95 });
break;
}
}
// Wait for items to setttle
await this._randomWait(this.page, 2, 3);
if (matched.left && matched.right) {
serviceObject.started = true;
await this._findAndClick('#cphMain_btnSearch2');
}
else
logger.error('Not fully matched', matched);
}
/**
* Reworked for site reskin
* @param serviceObject
* @returns {Promise<void>}
*/
async initiateIndexV2(serviceObject) {
logger.debug('initiateIndexV2');
const matched = { 'left':false, 'right':false };
// first time around.
// need to kick off the index correctly..
// select#select1
const leftOptions = await this.page.$$('select#select1 option');
const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];
for (const item of leftOptions) {
const rawText = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
const text = this._cleanUp(rawText);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('select#select1', value);
matched.left = true;
break;
}
}
// Wait for items to setttle
await this._randomWait(this.page, 2, 3);
const rightOptions = await this.page.$$('select#select2 option');
for (const item of rightOptions) {
const rawText = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
const text = this._cleanUp(rawText);
if (text === wantedOption[1]) {
matched.right = true;
await this.page.select('select#select2', value);
break;
}
}
await this._randomWait(this.page, 2, 2);
if (matched.left && matched.right) {
serviceObject.started = true;
await this._findAndClick('button.searchButtonAdv');
this.emit('processIndex');
}
else
logger.error('Not fully matched', matched);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
logger.debug('buildIndex');
if (!serviceObject.started)
await this.initiateIndexV2(serviceObject);
else
await this.processIndexV2(serviceObject);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async nextItem(serviceObject) {
const entId = serviceObject.links[serviceObject.step].entId;
logger.debug('nextItem', entId);
await this.newLoadLicenceHolder(entId);
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
if (!this.processing)
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
else
switch (this.mode) {
case 0:
await this.nextItem(this.paymentServices);
break;
case 1:
await this.nextItem(this.emoneyServices);
break;
case 2:
await this.nextItem(this.creditServices);
break;
}
}
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { id, entId } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step}:${id}`);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_'), `_${entId}`].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const details = await this.extractEntityV2(body);
await jsonfile.writeFile(`${filePath}.json`, { details });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items)
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
else
this.emit('serviceDone');
}
// processIndex
async handleProcessIndex() {
switch (this.mode) {
case 0:
await this.processIndexV2(this.paymentServices);
break;
case 1:
await this.processIndexV2(this.emoneyServices);
break;
case 2:
await this.processIndexV2(this.creditServices);
break;
}
}
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
async processNewPage() {
// give the ajax page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
logger.debug('processNewPage', pageUrl.href);
switch (pageUrl.pathname) {
case '/pages/licenceholders.aspx':
case '/financial-services-register/':
await this.indexRedirector();
break;
case'/pages/licenceholder.aspx':
case '/financial-services-register/result/':
await this.processRedirector();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
this.emit('backoff');
throw new Error(`Unknown page: ${pageUrl.href}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('pathName', pathName);
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
* Replaces the goto
* @param id
* @returns {Promise<void>}
*/
async newLoadLicenceHolder(id) {
// loadLicenceHolder(10966)
const formElm = await this.page.$('form#loadHolder');
logger.debug('loadLicenceHolder', id);
await this.page.evaluate(x => {
x.target = '_self';
}, formElm);
await this._microWait(this.page, 5);
await this.page.evaluate(x => {
return loadLicenceHolder(x);
}, id);
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('processIndex', async () => {
this.handleProcessIndex();
});
//
this.on('pageChanged', this._debounce(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 1000));
this.on('psindexdone', async () => {
this.paymentServices.indexMetaStep++;
if (this.paymentServices.indexMetaStep < this.paymentServices.indexMeta.length) {
logger.info('Resetting for next meta index...');
// next..
this.paymentServices.started = false;
this.paymentServices.indexStep = 0;
await this._goto(this.startPage);
}
else {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
logger.warn('GO THROUGH THE NEW LIST!!!!');
this.processing = true;
await this._randomWait(this.page, 2, 2, 'New page transition');
}
});
this.on('emindexdone', async () => {
this.emoneyServices.indexMetaStep++;
if (this.emoneyServices.indexMetaStep < this.emoneyServices.indexMeta.length) {
logger.info('Resetting for next meta index...');
// next..
this.emoneyServices.started = false;
this.emoneyServices.indexStep = 0;
await this._goto(this.startPage);
}
else {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
logger.warn('GO THROUGH THE NEW LIST!!!!');
this.processing = true;
await this._randomWait(this.page, 2, 2, 'New page transition');
}
});
this.on('ciindexdone', async () => {
this.creditServices.indexMetaStep++;
if (this.creditServices.indexMetaStep < this.creditServices.indexMeta.length) {
logger.info('Resetting for next meta index...');
// next..
this.creditServices.started = false;
this.creditServices.indexStep = 0;
await this._goto(this.startPage);
}
else {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
logger.warn('GO THROUGH THE NEW LIST!!!!');
this.processing = true;
await this._randomWait(this.page, 2, 2, 'New page transition');
}
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('paymentServicesDone', async () => {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.processing = false;
await this._goto(this.emoneyServices.urls[0]);
});
this.on('emoneyServicesDone', async () => {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.processing = false;
await this._goto(this.creditServices.urls[0]);
});
this.on('creditServicesDone', async () => {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.emit('done');
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.processing = false;
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];
this.paymentServices = {
'items': 0,
'links': [],
'step': 46,
'indexStep': 0,
'indexMetaStep':0,
'visited': false,
'done' : false,
'started': false,
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
'indexMeta' : [
['Financial Institutions',
'Financial Institutions licensed to undertake payment services under the 2nd Schedule to the Financial Institutions Act (Payment Institutions)'],
['Financial Institutions',
'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to provide services outside Malta'],
['Financial Institutions',
'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to establish a branch outside Malta']
]
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'indexMetaStep':0,
'visited': false,
'done' : false,
'started': false,
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
'indexMeta' : [
['Financial Institutions',
'Financial Institutions licenced to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions)'],
['Financial Institutions',
'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to provide services outside Malta'],
['Financial Institutions',
'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to establish a branch outside Malta']
]
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'indexMetaStep':0,
'visited': false,
'done' : false,
'started': false,
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
'indexMeta' : [
['Credit Institutions',
'Credit Institutions'],
['Credit Institutions',
'Freedom of Services and Establishments - Exercise of the freedom to provide services outside Malta'],
['Credit Institutions',
'Freedom of Services and Establishments - Exercise of the freedom to set up an establishment outside Malta']
]
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
this.setPath(path.resolve(`${__dirname }/../artefacts/MT/MFSA`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._debounce(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
module.exports = MTScrape;