// obdfcascrape/ncas/lu.js
// Martin Donnelly 534fd67b5d final update
// 2019-08-15 08:48:49 +01:00
//
// 792 lines
// 22 KiB
// JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('(LU)');
const url = require('url');
// Log verbosity is taken from the LOGGER_LEVEL environment variable and
// falls back to 'warn' when that variable is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Wraps `func` so that bursts of calls collapse into a single invocation.
 *
 * Every call to the returned function restarts a `wait` millisecond timer.
 * - With `immediate` falsy, `func` runs once the timer expires, using the
 *   receiver and arguments of the most recent call.
 * - With `immediate` truthy, `func` runs straight away on the first call of a
 *   burst and further calls are ignored until the timer has expired.
 *
 * @param {Function} func function to debounce
 * @param {number} wait quiet period in milliseconds
 * @param {boolean} [immediate] fire on the leading edge instead of the trailing edge
 * @returns {Function} the debounced wrapper
 */
function debounce(func, wait, immediate) {
  let timeout = null;

  // This must be a regular function: an arrow function has no `this` or
  // `arguments` of its own, so it would forward debounce()'s own arguments
  // (func, wait, immediate) to `func` instead of the caller's.
  return function debounced(...args) {
    const context = this;

    const later = () => {
      timeout = null;
      if (!immediate) func.apply(context, args);
    };

    const callNow = immediate && !timeout;

    clearTimeout(timeout);
    timeout = setTimeout(later, wait);

    if (callNow) func.apply(context, args);
  };
}
/**
 * Scraper for the CSSF supervised-entities search site
 * (supervisedentities.apps.cssf.lu).
 *
 * The site is a single Ext-driven page whose state is carried in the URL
 * hash, so the scraper is event driven: every navigation ends in a
 * 'pageChanged' event, processNewPage() inspects the URL, and
 * handleIndexPage() dispatches on the hash to the matching step.
 *
 * Three searches are run one after another, selected by `this.mode`:
 *   0 - payment services  (state in `this.paymentServices`)
 *   1 - e-money services  (state in `this.emoneyServices`)
 *   2 - credit services   (state in `this.creditServices`)
 *
 * Helpers prefixed with `_` that are not defined in this class
 * (`_throttle`, `_goto`, `_randomWait`, `_cleanUp`, ...) and `this.modeTitles`
 * are expected to come from the Scraper base class.
 */
class LUScrape extends Scraper {
  constructor() {
    super();
    this.setID('LU');

    this.on('done', () => {
      this._done();
    });

    // Public entry point, passed through the base class throttle helper.
    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    // In production the scraper starts itself, but only when _checkLock()
    // resolves to a truthy value.
    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l)
          this.run();
      }).catch((err) => {
        // Do not leave the lock check as an unhandled rejection.
        logger.error(err);
      });

    this.debounceHandleIndexPage = debounce(() => {
      // the index page sometimes reloads up to 3 times..
      this.emit('handleIndexPage');
    }, 7500);
  }

  /**
   * Returns the state object of the search selected by `this.mode`.
   * Any mode other than 1 or 2 maps to the payment services state.
   *
   * @returns {Object}
   */
  _activeService() {
    switch (this.mode) {
      case 1:
        return this.emoneyServices;
      case 2:
        return this.creditServices;
      case 0:
      default:
        return this.paymentServices;
    }
  }

  /**
   * Dispatches on the hash of the current page URL to the step that handles
   * that view. An unrecognised hash is only logged.
   *
   * @returns {Promise<void>}
   */
  async handleIndexPage() {
    const thisUrl = await this.page.url();
    const pageUrl = url.parse(thisUrl);

    switch (pageUrl.hash) {
      case '#Home':
      case '#AdvancedSearch':
        await this.indexPageHomeMode();
        break;

      case '#ResultResearch':
        this.emit('handleEntityIndex');
        break;

      case '#DetailEntity':
        this.emit('processEntity');
        break;

      case null:
        // url.parse() yields a null hash when the URL has no fragment.
        this.emit('selectSearchManually');
        break;

      default:
        logger.error('HASH NOT RECOGNISED');
        logger.error(pageUrl);
        break;
    }
  }

  /**
   * Fills in and submits the advanced search form for the current mode, then
   * waits for the result view and emits 'pageChanged'.
   * Any failure is logged and answered with a 'recover' event.
   *
   * @returns {Promise<void>}
   */
  async indexPageHomeMode() {
    try {
      // Indexed by this.mode.
      const searchType = ['6', '7', '1'];
      const bodys = ['#advancedsearch_paymentservicestype-body', '#advancedsearch_electronicmoneytype-body', '#advancedsearch_banktype-body'];
      const bankInputs = ['#advancedsearch_bankgroup1_inputEl', '#advancedsearch_bankgroupA_inputEl', '#advancedsearch_bankgroupB_inputEl',
        '#advancedsearch_bankgroupC_inputEl', '#advancedsearch_bankgroupD_inputEl', '#advancedsearch_bankgroup2_inputEl', '#advancedsearch_bankgroup3_inputEl'];

      // Click with the delay supplied by Scraper.notARobot().
      const humanClick = async (elm) => {
        await elm.click({ 'delay': Scraper.notARobot() });
      };

      // click the advanced search button
      await humanClick(await this.page.waitForSelector('#menu_advanced'));

      // click the search type selector
      await humanClick(await this.page.waitForSelector('#advancedsearch_type-bodyEl'));

      await this._randomWait(this.page, 2, 2);

      // call the EXT function to set the advanced search mode..
      await this.page.evaluate(x => {
        return Ext.getCmp('advancedsearch_type').setValue(x);
      }, searchType[this.mode]);

      // Mode 0 & Mode 1 have a list of options which can be iterated easily
      // Mode 2 requires a handful of different inputs to be clicked on
      await this._microWait(this.page, 7);

      if (this.mode === 0) {
        await humanClick(await this.page.waitForSelector('label#advancedsearch_paymentinstitutionsgroup1-boxLabelEl'));
        await this._randomWait(this.page, 2, 2);
      }

      // This used to test `mode === 0 && mode === 1`, which can never be true,
      // so the checkboxes were never ticked.
      // NOTE(review): in mode 0 this now runs after the group label click
      // above - confirm against the live form that the combination is wanted.
      if (this.mode === 0 || this.mode === 1) {
        const options = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input.x-form-checkbox-default`);

        // click all the elements
        logger.debug('options length', options.length);
        for (const item of options)
          await humanClick(item);
      }

      if (this.mode === 2)
        for (const bI of bankInputs) {
          const input = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input${bI}`);
          await humanClick(input[0]);
        }

      await this._randomWait(this.page, 1, 1);

      // click the search button
      await humanClick(await this.page.waitForSelector('#advancedsearch_searchbutton'));

      // now wait for the results to load..
      await this.page.waitForSelector('#title-1083-textEl');
      logger.debug('Results loaded');
      this.emit('pageChanged');
    }
    catch (err) {
      logger.error(err);
      this.emit('recover');
    }
  }

  /**
   * Reads the paging bar of the result list and, when it matches the expected
   * position (or reports 0 / 0), records the index size on `serviceObject`
   * and marks it as visited.
   *
   * @param serviceObject state object of the search being processed
   * @returns {Promise<void>}
   */
  async entityIndexFirstPass(serviceObject) {
    try {
      const body = await this.page.content();
      const $ = cheerio.load(body);
      const pageDetails = await this.extractBarDetails($);

      // extractBarDetails() resolves to undefined when the bar cannot be parsed.
      if (!pageDetails) {
        logger.error('Paging bar could not be read; index state not initialised');

        return;
      }

      const { currentPageIndex, currentPageMax } = pageDetails;
      const onExpectedRow = (currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1));
      const emptyIndex = currentPageIndex === 0 && currentPageMax === 0;

      if (onExpectedRow || emptyIndex) {
        serviceObject.currentIndexLength = pageDetails.currentIndexLength;
        serviceObject.currentPageMax = currentPageMax;
        serviceObject.visited = true;
        serviceObject.currentIndex = url.parse(await this.page.url());
        serviceObject.currentMetaIndex = 0;
      }
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the numbers shown in the result paging bar.
   *
   * `page` comes from the input in the 5th child of the bar, `maxPages` from
   * the first number in the 6th child, and the remaining three values are the
   * first three numbers found in the text of the last child.
   *
   * @param $ cheerio instance loaded with the page HTML
   * @returns {Promise<{currentIndexLength: number, maxPages: number, currentPageMax: number, page: number, currentPageIndex: number}>}
   *          resolves to undefined (after logging) when the bar cannot be parsed
   */
  async extractBarDetails($) {
    try {
      const numberExtract = /(\d+)/g;
      const pagingBar = $('#resultresearch_paging-targetEl').children();

      const page = parseInt($(pagingBar).eq(4).find('input').val(), 10);

      const workMaxPages = this._cleanUp($(pagingBar).eq(5).text());
      const maxPages = parseInt(workMaxPages.match(numberExtract)[0], 10);

      const rawDisplaying = this._cleanUp($(pagingBar).eq(pagingBar.length - 1).text());
      const [currentPageIndex, currentPageMax, currentIndexLength] = rawDisplaying.match(numberExtract).map((s) => {
        return parseInt(s, 10);
      });

      return { page, maxPages, currentPageIndex, currentPageMax, currentIndexLength };
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Handles the result list: either pages forward when the current page has
   * been exhausted, or captures the next row's summary into
   * `serviceObject.current` and double-clicks it to open the detail view.
   * Both paths end by emitting 'pageChanged'.
   *
   * @param serviceObject state object of the search being processed
   * @returns {Promise<void>}
   */
  async processEntityIndex(serviceObject) {
    try {
      // Cell order of a result row.
      const fields = ['type', 'name', 'address'];

      logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);
      await this._randomWait(this.page, 1, 2);

      if (serviceObject.visited === false) {
        logger.debug('Preparing...');
        serviceObject.restart = false;
        await this.entityIndexFirstPass(serviceObject);
      }

      if (serviceObject.visited === true) {
        // Row position within the current result page.
        serviceObject.currentMetaIndex = serviceObject.step % serviceObject.currentPageMax;
        logger.debug('serviceObject.currentMetaIndex', serviceObject.currentMetaIndex);

        const pageExhausted = (serviceObject.step > 0) && (serviceObject.currentMetaIndex === 0) && (serviceObject.restart === true);

        if (pageExhausted) {
          logger.debug('Maxed out this page..');
          // Cleared so that the reloaded list is handled as rows, not as
          // another page turn.
          serviceObject.restart = false;

          const nextButton = await this.page.waitForSelector('#button-1052');
          logger.debug('Proceeding to next index page..');
          await nextButton.click({ 'delay': Scraper.notARobot() });
          this.emit('pageChanged');
        }
        else {
          logger.debug('dealing...');
          serviceObject.restart = true;

          const rowSelector = `div#ResultResearchGridView table:nth-child(${serviceObject.currentMetaIndex + 1})`;
          logger.debug(rowSelector);

          const wantedRow = await this.page.$$(rowSelector);
          const htmlTable = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);
          const $ = cheerio.load(`<table>${htmlTable}</table>`);
          const cells = $('div.x-grid-cell-inner');

          serviceObject.current = {};
          cells.each((index, item) => {
            serviceObject.current[fields[index]] = this._cleanUp($(item).text());
          });

          if (typeof (serviceObject.current.name) !== 'undefined' && serviceObject.current.name !== '') {
            const fileName = this._makeFileName(serviceObject.current.name);
            serviceObject.current.fileName = fileName;
            // The combined path is capped at 240 characters.
            serviceObject.current.filePath = `${this.path}/${fileName}`.substring(0, 240);
          }

          await this._randomWait(this.page, 3, 5);
          // A double click on the row opens the entity detail view.
          await wantedRow[0].click({ 'delay': 97, 'clickCount': 2 });
          await this._randomWait(this.page, 1, 1);
          this.emit('pageChanged');
        }
      }
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Converts one grid panel of the detail view into an array of row objects.
   *
   * @param $ cheerio instance loaded with the page HTML
   * @param html element (or selection) that contains the grid panel
   * @param divId selector of the grid panel body
   * @param sequence list of `[cellClassSuffix, outputKey]` pairs; each cell is
   *        looked up as `.x-grid-cell-<cellClassSuffix>`
   * @returns {Promise<Array>} one object per table row; undefined (after
   *          logging) on failure
   */
  async extractGridPanel($, html, divId, sequence) {
    try {
      const outObj = [];
      const elms = $(html).find(`${divId} div.x-grid-item-container table`);

      elms.each((index, itm) => {
        const newObj = {};

        for (const seqItem of sequence) {
          const mclass = `.x-grid-cell-${seqItem[0]}`;
          const rowElm = $(itm).find(mclass);
          newObj[seqItem[1]] = this._cleanUp($(rowElm).text());
        }
        outObj.push(newObj);
      });

      return outObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the scalar fields and all grid panels of an entity detail page.
   *
   * @param html full HTML of the detail page
   * @returns {Promise<Object>} the scalar fields keyed by name plus one array
   *          per grid panel keyed by the panel `id`; undefined (after
   *          logging) on failure
   */
  async extractEntityDetails(html) {
    try {
      const details = {};

      // [element id, output key]
      const detailSequence = [['detailEntity_type_inputEl', 'type'],
        ['detailEntity_number_inputEl', 'number'],
        ['detailEntity_name_inputEl', 'name'],
        ['detailEntity_address_inputEl', 'address'],
        ['detailEntity_startdate_inputEl', 'startdate'],
        ['detailEntity_closeddate_inputEl', 'closedate'],
        ['detailEntity_countrycode_inputEl', 'countrycode'],
        ['detailEntity_group_inputEl', 'group'],
        ['detailEntity_subgroup_inputEl', 'subgroup'],
        ['detailEntity_iciOutside_inputEl', 'iciOutside'],
        ['detailEntity_icilinked_inputEl', 'icilinked']
      ];

      // One entry per grid panel; `sequence` is passed to extractGridPanel().
      const gridPanels = [{
        'id': 'autorisedStatus',
        'sequence': [['detailEntity_autorisedStatus', 'autorisedStatus'],
          ['detailEntity_recentChangeautorisedStatus', 'recentChangeautorisedStatus'],
          ['detailEntity_recentChangeautorisedDate', 'recentChangeautorisedDate']],
        'divId': '#detailEntity_autorisedStatusGridPanel-body'
      }, {
        'id': 'agentOrBranch',
        'sequence': [['detailEntity_agentorbranchData', 'agentorbranchData'], ['detailEntity_agentData', 'agentData'],
          ['detailEntity_branchData', 'branchData'], ['detailEntity_agentorbranchCountry', 'agentorbranchCountry'],
          ['detailEntity_agentorbranchAddress', 'agentorbranchAddress'], ['detailEntity_agentorbranchlegalstatus', 'agentorbranchlegalstatus']],
        'divId': '#detailEntity_agentorbranchGridPanel-body'
      }, {
        'id': 'iciOutsideTable',
        'sequence': [['detailEntity_iciOutsideMember', 'iciOutsideMember']],
        'divId': '#detailEntity_iciOutsideGridPanel-body'
      }, {
        'id': 'icilinkedTable',
        'sequence': [['detailEntity_icilinkedname', 'icilinkedname'], ['detailEntity_icilinkedstartingdate', 'icilinkedstartingdate'],
          ['detailEntity_icilinkedendingdate', 'icilinkedendingdate']],
        'divId': '#detailEntity_icilinkedGridPanel-body'
      }, {
        'id': 'othersStatus',
        'sequence': [['detailEntity_otherStatus', 'otherStatus'], ['detailEntity_recentChangeotherStatus', 'recentChangeotherStatus'],
          ['detailEntity_recentChangeotherDate', 'recentChangeotherDate']],
        'divId': '#detailEntity_othersStatusGridPanel-body'
      }, {
        'id': 'services',
        'sequence': [['detailEntity_service', 'service'], ['detailEntity_recentChangeservice', 'recentChangeservice'],
          ['detailEntity_recentChangeserviceDate', 'recentChangeserviceDate']],
        'divId': '#detailEntity_servicesGridPanel-body'
      }, {
        'id': 'ancillaryservices',
        'sequence': [['detailEntity_ancillaryservice', 'ancillaryservice'],
          ['detailEntity_recentChangeancillaryservice', 'recentChangeancillaryservice'],
          ['detailEntity_recentChangeancillaryserviceDate', 'recentChangeancillaryserviceDate']],
        'divId': '#detailEntity_ancillaryservicesGridPanel-body'
      }, {
        'id': 'prestataire',
        'sequence': [['detailEntity_prestatairename', 'prestatairename'], ['detailEntity_prestataireheadoffice', 'prestataireheadoffice'],
          ['detailEntity_prestataireauthorisation', 'prestataireauthorisation']],
        'divId': '#detailEntity_prestataireGridPanel-body'
      }, {
        'id': 'historicName',
        'sequence': [['detailEntity_historicNameName', 'historicNameName'], ['detailEntity_historicNameDate', 'historicNameDate']],
        'divId': '#detailEntity_historicNameGridPanel-body'
      }];

      const $ = cheerio.load(html);
      const mainDiv = $('#promoteDetailEntityPanel-innerCt');

      for (const item of detailSequence) {
        const i = $(mainDiv).find(`#${item[0]}`);
        details[item[1]] = this._cleanUp($(i).text());
      }

      for (const grid of gridPanels)
        details[grid.id] = await this.extractGridPanel($, mainDiv, grid.divId, grid.sequence);

      return details;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Captures the open entity detail page: takes a screenshot, stores the
   * extracted details on `serviceObject.current.details` and emits
   * 'entityComplete'.
   *
   * @param serviceObject state object of the search being processed
   * @returns {Promise<void>}
   */
  async processEntity(serviceObject) {
    try {
      logger.info(`Process ${this.modeTitles[this.mode]} entity:${serviceObject.current.name}`);
      logger.info(`Step ${serviceObject.step} of ${serviceObject.currentIndexLength}`);

      await this._randomWait(this.page, 3, 5);

      const filePath = serviceObject.current.filePath;

      await this._randomWait(this.page, 3, 5);
      await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

      const body = await this.page.content();
      serviceObject.current.details = await this.extractEntityDetails(body);

      this.emit('entityComplete');
      logger.info('Entity complete...');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Persists the current entity as JSON, records it in `serviceObject.links`,
   * advances `serviceObject.step`, and then either returns to the result list
   * (emitting 'pageChanged') or emits 'serviceDone' once every entry of the
   * index has been handled.
   *
   * @param serviceObject state object of the search being processed
   * @returns {Promise<void>}
   */
  async entityCompleter(serviceObject) {
    try {
      const filename = serviceObject.current.fileName;
      const filePath = serviceObject.current.filePath;
      // extractEntityDetails() resolves to undefined when it fails; without
      // this fallback the read of `.number` would throw before `step` is
      // advanced and the run would stall on this entity.
      const details = serviceObject.current.details || {};
      const newObj = {};

      logger.info(`Saving: ${filename}.json`);
      await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
      await this._randomWait(this.page, 3, 5);

      newObj.fileName = `${filename}.json`;
      newObj.name = serviceObject.current.name;
      newObj.number = details.number || '';
      serviceObject.links.push(newObj);

      serviceObject.step++;

      if (serviceObject.step < serviceObject.currentIndexLength) {
        serviceObject.current = {};

        const backLink = await this.page.waitForSelector('a#detailEntity_backtolist');
        await backLink.click({ 'delay': Scraper.notARobot() });
        this.emit('pageChanged');
      }
      else
        this.emit('serviceDone');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Runs processEntity() against the state object of the current mode.
   *
   * @returns {Promise<void>}
   */
  async handleProcessEntity() {
    await this.processEntity(this._activeService());
  }

  /**
   * Runs entityCompleter() against the state object of the current mode.
   *
   * @returns {Promise<void>}
   */
  async handleEntityComplete() {
    await this.entityCompleter(this._activeService());
  }

  /**
   * Entry point for every page change: decides from the current URL whether
   * to recover, ignore, or hand over to the (debounced) index page handler.
   *
   * @returns {Promise<void>}
   * @throws {Error} in production, when the page is not one this scraper knows
   */
  async processNewPage() {
    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    if (pageUrl.href === 'about:blank') return;

    if (pageUrl.pathname === '/index.html') {
      this.debounceHandleIndexPage();

      return;
    }

    if (process.env.NODE_ENV === 'production') {
      await this._uploadError();
      // Interpolate the href: the parsed Url object itself would render as
      // "[object Object]".
      throw new Error(`Unknown page: ${pageUrl.href}`);
    }

    logger.warn('processNewPage Fell through');
    logger.warn('currentPage.location', pageUrl);
  }

  /**
   * Registers the internal event handlers that drive the scrape.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    const logFailure = (err) => {
      logger.error(err);
    };

    // Needed for Angular / EXT based sites
    this.on('pageChanged', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 1000));

    // The handlers below are started without being awaited, so each one gets
    // a rejection handler of its own.
    this.on('entityComplete', () => {
      this.handleEntityComplete().catch(logFailure);
    });

    this.on('handleIndexPage', () => {
      this.handleIndexPage().catch(logFailure);
    });

    this.on('processEntity', () => {
      this.handleProcessEntity().catch(logFailure);
    });

    // Translate the generic completion event into the mode specific one.
    this.on('serviceDone', async () => {
      switch (this.mode) {
        case 0:
          this.emit('paymentServicesDone');
          break;
        case 1:
          this.emit('emoneyServicesDone');
          break;
        case 2:
          this.emit('creditServicesDone');
          break;
      }
    });

    this.on('handleEntityIndex', async () => {
      await this.processEntityIndex(this._activeService());
    });

    // Each "...Done" handler writes the collected links plus a debug dump of
    // the whole state object, then moves on to the next mode.
    this.on('paymentServicesDone', async () => {
      logger.warn('paymentServicesDone');
      try {
        this.paymentServices.done = true;
        jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
        jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

        this.mode++;
        this.inProgress = false;

        await this._goto(this.emoneyServices.urls[0]);
        this.emit('pageChanged');
      }
      catch (e) {
        logger.error(e);
      }
    });

    this.on('emoneyServicesDone', async () => {
      logger.warn('emoneyServicesDone');
      try {
        this.emoneyServices.done = true;
        jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
        jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);

        this.mode++;
        this.inProgress = false;

        await this._goto(this.creditServices.urls[0]);
        this.emit('pageChanged');
      }
      catch (e) {
        logger.error(e);
      }
    });

    this.on('creditServicesDone', async () => {
      logger.warn('creditServicesDone');
      try {
        this.creditServices.done = true;
        jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
        jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);

        this.mode++;
        this.inProgress = false;

        // Last search finished.
        this.emit('done');
      }
      catch (e) {
        logger.error(e);
      }
    });

    // Used when the page URL carries no hash: open the advanced search by
    // hand and continue once its form is visible.
    this.on('selectSearchManually', async () => {
      logger.debug('Locating advanced search button');

      await this.page.waitForSelector('#menu_advanced', { 'visible': true, 'timeout': 7500 }).then(async (elm) => {
        await elm.click({ 'delay': 90 });
      }).catch(() => {
        logger.error('No advanced search button');
      });

      await this.page.waitForSelector('#promoteAdvancedSearchPanel-body', { 'visible': true, 'timeout': 7500 }).then(async () => {
        await this.indexPageHomeMode();
      }).catch(() => {
        logger.error('No advanced search form');
      });
    });
  }

  /**
   * Resets the per-run state, prepares the browser page, wires up the event
   * handlers and navigates to the first search page.
   *
   * @returns {Promise<void>}
   * @throws {Error} when any part of the set-up fails
   */
  async start() {
    // NOTE(review): not awaited - confirm that Scraper#_start is synchronous.
    super._start();

    try {
      this.mode = 0;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
        'sections': [],
        'sectionLinks': []
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
        'sections': [],
        'sectionLinks': []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
        'sections': [],
        'sectionLinks': []
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname}/../artefacts/LU/CSSF`));

      // A failure here is only logged; the run carries on.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 1000));

      // Attach the handlers only while exactly two event names are registered.
      // NOTE(review): presumably this prevents double registration when
      // start() runs again - confirm which two events exist at this point.
      if (this.eventNames().length === 2)
        await this.attachEvents();

      await this._makeResponsive();

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'load' });
      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack and type survive; only wrap
      // values that are not already Error instances.
      throw (e instanceof Error) ? e : new Error(e);
    }
  }

  /**
   * Body of the throttled `run` entry point created in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// Expose the LUScrape class as this module's sole export.
module.exports = LUScrape;