Martin Donnelly be5d3eae07 init
2019-05-05 20:13:56 +01:00

890 lines
26 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('IT');
const url = require('url');
// Log verbosity comes from the LOGGER_LEVEL environment variable; when it is
// unset (or empty) the logger falls back to 'warn'.
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Wraps a lower-level failure in a new Error whose message starts with `label`.
 *
 * `new Error(message, err)` silently discards `err`, so the original message is
 * appended to the label and the original error is attached as `cause`.
 *
 * @param {string} label text identifying the failing step
 * @param {*} err the error (or other thrown value) that triggered the failure
 * @returns {Error}
 */
function wrapError(label, err) {
  const detail = (err && err.message) ? err.message : String(err);
  const wrapped = new Error(`${label}${detail}`);
  wrapped.cause = err;
  return wrapped;
}

/**
 * Shared failure policy used throughout the scraper: when NODE_ENV is set the
 * error is uploaded and rethrown (wrapped), otherwise it is only logged.
 *
 * @param {ITscrape} scraper instance providing `_uploadError`
 * @param {string} label text identifying the failing step
 * @param {*} err the triggering error
 * @returns {Promise<void>} resolves only when NODE_ENV is not set
 * @throws {Error} wrapped error when NODE_ENV is set
 */
async function reportFailure(scraper, label, err) {
  if (process.env.NODE_ENV) {
    await scraper._uploadError();
    throw wrapError(label, err);
  }
  logger.error(label, err);
}

/**
 * Turns every matched grid row into an object keyed by each cell's `col-id`
 * attribute, with the cell text as value (both passed through `_cleanUp`).
 *
 * @param {ITscrape} scraper instance providing `_cleanUp`
 * @param {string} html page markup
 * @param {string} rowSelector selector matching the grid rows
 * @param {string} label noun used in the "N <label> items" log line
 * @returns {Array<Object>}
 */
function collectGridRows(scraper, html, rowSelector, label) {
  const $ = cheerio.load(html);
  const rows = $(rowSelector);
  const collected = [];

  logger.info(`${rows.length} ${label} item${(rows.length !== 1) ? 's' : ''}`);

  rows.each((index, item) => {
    const cells = $(item).find('div');
    const entry = {};

    for (let position = 0; position < cells.length; position++) {
      const cell = cells.eq(position);
      const name = scraper._cleanUp(cell.attr('col-id'));

      entry[name] = scraper._cleanUp(cell.text());
    }

    collected.push(entry);
  });

  return collected;
}

/**
 * Picks the per-register state object matching the scraper's current mode:
 * 1 -> emoneyServices, 2 -> creditServices, anything else -> paymentServices.
 *
 * @param {ITscrape} scraper
 * @returns {Object}
 */
function servicesForMode(scraper) {
  switch (scraper.mode) {
    case 1:
      return scraper.emoneyServices;
    case 2:
      return scraper.creditServices;
    case 0:
    default:
      return scraper.paymentServices;
  }
}

/**
 * Scraper for the registers published on bancaditalia.it. It walks the
 * GIAVAInquiry single-page application once per mode (payment, e-money, credit),
 * opens every row of the result grid and stores the detail data as JSON.
 */
class ITscrape extends Scraper {
  /**
   * Registers the 'done' handler, builds the debounced `run` entry point and,
   * in production only, starts a run when `_checkLock()` resolves truthy.
   */
  constructor() {
    super();
    this.setID('IT');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock().then((l) => {
        if (l) {
          this.run();
        }
      });
    }
  }

  /**
   * Nudges the page and then hovers / focuses the third breadcrumb link.
   *
   * NOTE(review): despite the name this scrolls *down* by one viewport height
   * (`window.scrollBy(0, window.innerHeight)`); confirm that is intended.
   * A failing hover is only logged, a failing focus rejects.
   *
   * @returns {Promise<void>}
   */
  async forceScrollToTop() {
    const breadcrumbLink = '#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a';

    // Force the scroll
    await this.page.evaluate(() => {
      window.scrollBy(0, window.innerHeight);
    });

    // Force the hover
    await this.page.hover(breadcrumbLink).catch((err) => {
      logger.warn(err);
    });

    // Force the focus
    await this.page.focus(breadcrumbLink);
  }

  /**
   * Opens the language dropdown (when present) and clicks its second entry.
   *
   * @returns {Promise<void>}
   */
  async forceEnglish() {
    await this._randomWait(this.page, 2, 2, 'Force English');

    await this.page.waitForSelector('#bs-example-navbar-collapse-1 > ul > li.dropdown > a', { 'visible': true, 'timeout': 7500 }).then(async (elm) => {
      await elm.click({ 'delay': Scraper.notARobot() });
      await this._randomWait(this.page, 2, 2);
    }).catch(() => {
      logger.debug('No Language button');
    });

    await this._findAndClick('#bs-example-navbar-collapse-1 > ul > li.dropdown.open > ul > li:nth-child(2) > a');
  }

  /**
   * Clicks the first link of `ul.linkgroup` on the landing page and keeps
   * re-clicking until `#my-container > div.container > div` becomes visible.
   *
   * NOTE(review): the retry loop has no upper bound.
   *
   * @returns {Promise<void>}
   */
  async handleFrontPage() {
    const registersLink = 'ul.linkgroup a';
    let pageReturned = false;

    await this._randomWait(this.page, 3, 5, 'handleFrontPage');

    await this.page.waitFor(registersLink, { 'visible': true }).then(async (elm) => {
      await elm.click({ 'delay': Scraper.notARobot() });
    }).catch(async (err) => {
      logger.info('handleFrontPage: ul.linkgroup a Not found', err);
    });

    do {
      await this.page.waitFor('#my-container > div.container > div', { 'visible': true, 'timeout': 7500 }).then(() => {
        pageReturned = true;
      }).catch(async () => {
        logger.info('We didnt transition back correctly, forcing another click..\n');
      });

      if (!pageReturned) {
        await this.page.hover(registersLink).catch((err) => {
          logger.debug(err.name);
        });
        await this.page.focus(registersLink).catch((err) => {
          logger.debug(err.name);
        });
        await this.page.waitFor(registersLink, { 'visible': true }).then(async (elm) => {
          await elm.click({ 'delay': Scraper.notARobot() });
        }).catch(async (err) => {
          logger.info('handleFrontPage: ul.linkgroup a still not found', err.name);
        });
      }
    }
    while (!pageReturned);
  }

  /**
   * Waits for the application shell to finish loading, clicks the centre
   * navigation link and emits 'pageChanged' when the location changed.
   * Any unexpected error emits 'recover'.
   *
   * @returns {Promise<void>}
   */
  async handleSecondPage() {
    try {
      // sometimes this page takes a while to load...
      const startUrl = await this.page.evaluate('location.href');

      await this._randomWait(this.page, 10, 13, 'handleSecondPage');

      // `hidden: true` waits for the shroud to disappear; `visible: false` is
      // just the default and would only wait for it to exist in the DOM.
      await this.page.waitForSelector('div.loading', { 'hidden': true, 'timeout': 90000 }).catch(() => {
        logger.warn('Ajax loading shroud not removed after 90 seconds');
      });

      await this.page.waitForSelector('ul.nav.navbar-nav.navbar-center li a', { 'visible': false, 'timeout': 90000 }).then(async (elm) => {
        await elm.click({ 'delay': Scraper.notARobot() });
        await this._randomWait(this.page, 5, 8, 'await transition');
      }).catch(() => {
        logger.warn('Page Navigation navigation links failed to load / display');
      });

      const newUrl = await this.page.evaluate('location.href');

      if (startUrl !== newUrl) {
        logger.debug('The page Has changed!');
        this.emit('pageChanged');
      }
    }
    catch (err) {
      logger.error('Failed to progress past second page', err);
      this.emit('recover');
    }
  }

  /**
   * Reads the two-column rows of `app-details-anagrafica` into a name -> value map.
   *
   * @param {string} html page markup
   * @returns {Promise<Object|undefined>} undefined when extraction failed and NODE_ENV is not set
   * @throws {Error} wrapped error when extraction failed and NODE_ENV is set
   */
  async extractPSRegistry(html) {
    try {
      const registry = {};
      const $ = cheerio.load(html);

      $('app-details-anagrafica > div.row').each((index, item) => {
        // Only rows made of exactly a label and a value are of interest
        if ($(item).children().length === 2) {
          const divs = $(item).find('div');
          const name = this._cleanUp(divs.eq(0).text());

          registry[name] = this._cleanUp(divs.eq(1).text());
        }
      });

      return registry;
    }
    catch (err) {
      await reportFailure(this, 'extractPSRegistry\n', err);
    }
  }

  /**
   * Extracts the rows of the grid inside `app-details-albi`.
   *
   * @param {string} html page markup
   * @returns {Promise<Array|undefined>} undefined when extraction failed and NODE_ENV is not set
   * @throws {Error} wrapped error when extraction failed and NODE_ENV is set
   */
  async extractPSRegisters(html) {
    try {
      return collectGridRows(
        this,
        html,
        'app-details-albi div.ag-bl-center.ag-bl-full-height-center > div > div.ag-body > div.ag-body-viewport-wrapper > div > div div[role="row"]',
        'registers'
      );
    }
    catch (err) {
      await reportFailure(this, 'extractPSRegisters\n', err);
    }
  }

  /**
   * Extracts the rows of the grid inside `app-details-att-autorizzate`.
   *
   * @param {string} html page markup
   * @returns {Promise<Array|undefined>} undefined when extraction failed and NODE_ENV is not set
   * @throws {Error} wrapped error when extraction failed and NODE_ENV is set
   */
  async extractPSAuthority(html) {
    try {
      return collectGridRows(
        this,
        html,
        'app-details-att-autorizzate div.ag-bl-center.ag-bl-full-height-center > div > div.ag-body > div.ag-body-viewport-wrapper > div > div div[role="row"]',
        'authority'
      );
    }
    catch (err) {
      await reportFailure(this, 'extractPSAuthority\n', err);
    }
  }

  /**
   * Selects the register matching the current mode in `#alboElenco`, submits
   * the search, waits for the result grid and then emits 'processAgTable'.
   *
   * @returns {Promise<void>}
   * @throws {Error} when NODE_ENV is set and the search or the transition to
   *   the result grid fails
   */
  async preparePSSearch() {
    try {
      await this._randomWait(this.page, 3, 5, `preparePSSearch - ${this.modeTitles[this.mode]}`);

      // Brute force the selector
      await this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-search > div > div:nth-child(3) > div > input');

      await this.page.waitForFunction(
        'document.querySelector("#alboElenco").options.length > 1'
        , { 'timeout': 7500 }).then(() => {
        logger.debug('Ajax done');
      }).catch(() => {
        throw new Error('Ajax not done');
      });

      // Option labels indexed by mode; the trailing spaces are part of the labels
      const optionList = ['ALBO IP ART.114-SEPTIES TUB ', 'ALBO IMEL ITA EX 114-QUATER ', 'ALBO DELLE BANCHE '];
      const wantedLabel = optionList[this.mode];
      const options = await this.page.$$('#alboElenco option');

      for (const option of options) {
        const text = await this.page.evaluate(el => el.innerText, option);

        if (text === wantedLabel) {
          const value = await this.page.evaluate(el => el.value, option);

          await this.page.select('#alboElenco', value);
          break;
        }
      }

      // wait for loading shroud to go away (`hidden`, not `visible: false`,
      // is the option that waits for an element to disappear)
      await this.page.waitForSelector('div.loading', { 'hidden': true, 'timeout': 25000 }).catch(() => {
        logger.warn('Ajax loading shroud not removed after 25 seconds');
      });

      // Keep clicking the search button until it is no longer visible
      let buttonGone = false;

      do {
        await this.page.waitForSelector('button.btn.btn-success', { 'visible': true, 'timeout': 2500 }).then(async (elm) => {
          await elm.click({ 'delay': Scraper.notARobot() });
        }).catch(() => {
          buttonGone = true;
        });
        await this._randomWait(this.page, 1, 1, 'preparePSSearch btnSuccess');
      }
      while (!buttonGone);
    }
    catch (err) {
      await reportFailure(this, 'preparePSSearch\n', err);

      // Only reached when NODE_ENV is not set: the search never got submitted
      return;
    }

    // Awaited (instead of left floating) so that a failed transition is
    // reported to the caller rather than becoming an unhandled rejection.
    try {
      await this.page.waitFor('app-int-albi-grid-result');
      await this.forceEnglish();
      this.emit('processAgTable');
    }
    catch (err) {
      await reportFailure(this, 'No results transition\n', err);
    }
  }

  /**
   * Collects registry, registers and authority data from the detail view that
   * is currently open.
   *
   * @returns {Promise<{registry, authority, registers}>}
   * @throws {Error} when NODE_ENV is set and the detail component never shows
   */
  async processPSDetail() {
    const tabsBase = '#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul';
    const registersTab = `${tabsBase} > li:nth-child(2) > a`;
    const activityTab = `${tabsBase} > li:nth-child(3) > a`;
    let registry = {}, registers = {}, authority = {};

    await this._randomWait(this.page, 3, 3, 'processPSDetail: AJAX');

    // A missing title is only reported; the remaining steps are still attempted.
    await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div.card.card-title > span > span', { 'visible': true }).catch((err) => {
      logger.warn('AJAX data has failed to load');
      logger.debug(err);
    });

    try {
      await this.page.waitFor('app-int-albi-details');
      await this.forceScrollToTop();
      registry = await this.extractPSRegistry(await this.page.content());
      await this._randomWait(this.page, 2, 2, 'processPSDetail app-int-albi-details');
    }
    catch (err) {
      await reportFailure(this, 'processPSDetail\n', err);
    }

    await this._randomWait(this.page, 1, 1, 'processPSDetail after app-int-albi-details');
    await this.forceScrollToTop();

    // wait for Registers Tab
    try {
      const tab = await this.page.waitFor(registersTab, { 'visible': true, 'timeout': 10000 });

      logger.debug('** Showing Registers Tab');
      await tab.click({ 'delay': 90 });

      try {
        await this.page.waitFor('app-details-albi', { 'visible': true, 'timeout': 10000 });
        registers = await this.extractPSRegisters(await this.page.content());
        await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-albi');
      }
      catch (err) {
        if (process.env.NODE_ENV) {
          throw wrapError('No tab transition\n', err);
        }
        logger.error('No tab transition');
      }

      await this._randomWait(this.page, 1, 1, 'processPSDetail after app-details-albi');
    }
    catch (err) {
      logger.warn('No "registers" Block...');
      logger.debug(err);
    }

    // wait for Activity Tab
    await this.forceScrollToTop();

    try {
      const tab = await this.page.waitFor(activityTab, { 'visible': true, 'timeout': 10000 });

      logger.debug('** Showing Activity Tab');
      await tab.click({ 'delay': 90 });

      // NOTE(review): without NODE_ENV this retries without an upper bound.
      let pageReturned = false;

      do {
        try {
          await this.page.waitFor('app-details-att-autorizzate', { 'visible': true, 'timeout': 10000 });
          pageReturned = true;
          authority = await this.extractPSAuthority(await this.page.content());
          await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-att-autorizzate');
        }
        catch (err) {
          await this.forceScrollToTop();
          await this._findAndClick(activityTab);

          if (process.env.NODE_ENV) {
            throw wrapError('No tab transition\n', err);
          }
          logger.warn('No tab transition');
        }
      }
      while (!pageReturned);
    }
    catch (err) {
      logger.warn('No "Activity" Block...');
      logger.debug(err);
    }

    return { registry, registers, authority };
  }

  /**
   * Clicks the third breadcrumb link until the result grid is shown again.
   * Unexpected errors emit 'recover' (and upload the error when NODE_ENV is set).
   *
   * NOTE(review): the retry loop has no upper bound.
   *
   * @returns {Promise<void>}
   */
  async returnToPSList() {
    const breadcrumbLink = '#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a';

    try {
      let pageReturned = false;

      await this.page.hover(breadcrumbLink).catch((err) => {
        logger.warn(err);
      });
      await this._findAndClick(breadcrumbLink);

      do {
        try {
          await this.page.waitFor('app-int-albi-grid-result');
          pageReturned = true;
        }
        catch (err) {
          logger.warn('We didnt transition back correctly, forcing another click..\n', err);
          await this.forceScrollToTop();
          await this._findAndClick(breadcrumbLink);
        }
      }
      while (!pageReturned);
    }
    catch (err) {
      logger.error('returnToPSList\n', err);
      this.emit('recover');

      if (process.env.NODE_ENV) {
        await this._uploadError();
      }
    }
  }

  /**
   * Reads the first number found in the pagination summary paragraph.
   *
   * @returns {Promise<number>} the parsed number, or -1 when the text holds none
   */
  async psGetMaxRows() {
    const firstNumber = /\d{1,13}(?:,\d{0,2})?/;
    const summary = await this.page.$$('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(1) > p');
    const text = await this.page.evaluate(el => el.innerText, summary[0]);
    const found = firstNumber.exec(text);

    return (found === null) ? -1 : parseInt(found[0], 10);
  }

  /**
   * Maps each cell's `col-id` attribute to its cleaned-up text.
   *
   * @param $ cheerio instance the cells belong to
   * @param divs cheerio selection of cells
   * @returns {Promise<Object>}
   */
  async processDivs($, divs) {
    const entries = {};

    divs.each((index, item) => {
      const cell = $(item);
      const itemText = this._cleanUp(cell.text());

      entries[cell.attr('col-id')] = itemText;
    });

    return entries;
  }

  /**
   * Types `count` into the rows-per-page input of the pagination bar and
   * waits for the grid to refresh.
   *
   * @param {number} count rows per page
   * @returns {Promise<void>}
   */
  async psSetListCount(count) {
    logger.debug('+ psSetListCount ');

    await this.page.focus('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(7) > div > input');

    // Clear whatever is currently in the input
    for (let del = 0; del < 5; del++) {
      await this.page.keyboard.press('Backspace');
    }

    await this.page.keyboard.type(count.toString(), { 'delay': 100 }); // Types slower, like a user
    await this.page.keyboard.press('Enter');
    await this._randomWait(this.page, 10, 10, 'ajax refresh');

    logger.debug('- psSetListCount ');
  }

  /**
   * Walks the result grid page by page. Every row whose `abiCode` is not yet in
   * `serviceObject.workingData` is recorded, screenshotted, opened and handed
   * to `doAlbiDetails`. Emits 'doneProcessingGrid' when finished.
   *
   * @param serviceObject per-register state (`workingData` Map and `links` array)
   * @returns {Promise<void>}
   */
  async processAGTableV3(serviceObject) {
    // this whole thing is ugly but at the moment it works
    await this._randomWait(this.page, 3, 5, 'processAGTableV3');

    const _defaultMaxPerPage = 10;
    const nextButtonSelector = '#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(5) > button';
    let maxPages = 0;

    await this.psSetListCount(_defaultMaxPerPage);

    const maxRows = await this.psGetMaxRows();
    let remainingRows = maxRows;

    logger.info('Max Rows', maxRows);

    if (maxRows > _defaultMaxPerPage) {
      maxPages = Math.floor(maxRows / _defaultMaxPerPage);
      logger.info('Max pages:', maxPages);
    }

    for (let pageStep = 0; pageStep <= maxPages; pageStep++) {
      logger.info('Pagestep', pageStep, (pageStep + 1) * _defaultMaxPerPage);

      const rowsLeft = maxRows - (pageStep * _defaultMaxPerPage);
      const rowsInPass = (maxPages > 0) ? Math.min(_defaultMaxPerPage, rowsLeft) : maxRows;

      logger.info(`Rows in this pass : ${rowsInPass}`);

      for (let step = 0; step < rowsInPass; step++) {
        let item;

        // Hover every row from the first one down to the target row; `item`
        // ends up being the row with row-id === step (or undefined).
        for (let elmStep = 0; elmStep <= step; elmStep++) {
          const matches = await this.page.$$(`div.ag-body-container div.ag-row[row-id="${elmStep}"]`);

          item = matches[0];

          if (typeof item !== 'undefined') {
            await item.hover().catch((err) => {
              logger.warn(err);
              logger.info(item);
            });
          }

          await this._microWait(this.page, 1);
        }

        await this._randomWait(this.page, 2, 2, 'processAGTableV3 after rows');

        if (typeof item !== 'undefined') {
          const html = await this.page.evaluate(el => el.innerHTML, item);
          const clickable = await item.$('div[col-id="name"]');
          const abiCodeElm = await item.$('div[col-id="abiCode"]');
          const uid = await this.page.evaluate(el => el.innerText, abiCodeElm);
          const clickName = await this.page.evaluate(el => el.innerText, clickable);
          const $ = cheerio.load(html);
          const divs = $('div');

          logger.info(`Processing : ${clickName}, ${remainingRows} remain.`);

          if (!serviceObject.workingData.has(uid)) {
            // Extract all the data from the cells
            const newEntry = await this.processDivs($, divs);

            // Insert it in the map
            serviceObject.workingData.set(uid, newEntry);

            await this._randomWait(this.page, 2, 2, `Processing : ${clickName}`);

            const filePath = await this._makeFilePath(clickName);
            const fileName = this._makeFileName(clickName);

            await this._randomWait(this.page, 2, 2, 'processAGTableV3 before ss');
            await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

            serviceObject.links.push({ uid, 'fileName': `${fileName}.json`, 'name': clickName });

            // Go into the detail
            await clickable.click();
            await this._randomWait(this.page, 3, 4, 'processAGTableV3 before next');
            remainingRows--;

            // The detail handler must be passed as a callback; calling it
            // inline would run it before the detail view has been awaited.
            await this.page.waitFor('app-int-albi-details')
              .then(() => this.doAlbiDetails(filePath, newEntry))
              .catch(async (err) => {
                logger.error('No detail transition', err);
                this.emit('recover');

                if (process.env.NODE_ENV) {
                  await this._uploadError();
                }
              });
          }
        }
      }

      if (maxPages > 0) {
        logger.info('Clicking to the next page...');

        const nextButton = await this.page.$$(nextButtonSelector);
        const buttonDisabled = await this.page.evaluate(el => el.disabled, nextButton[0]);

        if (!buttonDisabled) {
          await this._findAndClick(nextButtonSelector);
          await this._randomWait(this.page, 5, 5, 'processAGTableV3 next page click');
        }
      }
    }

    logger.debug('processAGTableV3 DONE');
    this.emit('doneProcessingGrid');
  }

  /**
   * Scrapes the open detail view, writes it to `${filePath}.json` together
   * with the grid row data and navigates back to the result list.
   * Failures emit 'recover' (and upload the error when NODE_ENV is set).
   *
   * @param {string} filePath output path without extension
   * @param {Object} newEntry grid row data stored under `details`
   * @returns {Promise<void>}
   */
  async doAlbiDetails(filePath, newEntry) {
    try {
      // process the page
      const data = await this.processPSDetail();

      data.details = newEntry;

      logger.info(`Saving ${filePath}.json`);
      await jsonfile.writeFile(`${filePath}.json`, data);
      await this._randomWait(this.page, 5, 7, 'doAlbiDetails');

      // Return back to list
      await this.returnToPSList();
      await this._randomWait(this.page, 2, 2, 'doAlbiDetails after returnToPSList');
    }
    catch (err) {
      logger.error('doAlbiDetails\n', err);
      this.emit('recover');

      if (process.env.NODE_ENV) {
        await this._uploadError();
      }
    }
  }

  /**
   * Dispatches to the handler matching the pathname of the current page.
   *
   * @returns {Promise<void>}
   * @throws {Error} when NODE_ENV is set and the pathname is not recognised
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5, 'processNewPage');

    const pageUrl = url.parse(await this.page.url());

    switch (pageUrl.pathname) {
      case '/compiti/vigilanza/albi-elenchi/index.html':
        await this.handleFrontPage();
        break;
      case '/GIAVAInquiry-public/ng/':
        await this.handleSecondPage();
        break;
      case '/GIAVAInquiry-public/ng/int-albi/search':
        await this.preparePSSearch();
        break;
      case '/en/our-registers/company-register/gransoverskridandehandel/':
        // NOTE(review): crossBorderRedirector is not defined in this class;
        // confirm the base class provides it or drop this case.
        await this.crossBorderRedirector();
        break;
      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();

          // `href` is used because interpolating the parsed object itself
          // yields "[object Object]"
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }
        else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }
  }

  /**
   * Wires up the scraper's own events: 'pageChanged', 'recover', 'restart',
   * 'processAgTable' and 'doneProcessingGrid'.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    // Need this for Angular based sites
    // clear out stock recover handler
    this.removeAllListeners('recover');

    this.on('pageChanged', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    this.on('recover', this._debounce(async () => {
      const timeout = (60 * 1000) * 5;

      clearTimeout(this.backOffTimer);
      logger.warn('Backing off for 5 minutes..');

      this.backOffTimer = setTimeout(() => {
        this.emit('restart');
      }, timeout);
    }, 30000));

    this.on('restart', this._debounce(async () => {
      clearTimeout(this.backOffTimer);
      logger.warn('Restarting::');

      // use the Scraper recovery now to ensure crashed browser is resurrected
      await this.__recover(this.startPage);
    }, 15000));

    this.on('processAgTable', async () => {
      await this.processAGTableV3(servicesForMode(this));
    });

    this.on('doneProcessingGrid', async () => {
      const curObj = servicesForMode(this);

      curObj.done = true;
      curObj.items = curObj.links.length;

      // NOTE(review): `modeNames` is not assigned anywhere in this class;
      // presumably the base class provides it - confirm.
      jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links': curObj.links });
      jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, curObj);

      this.mode++;

      if (this.mode < 3) {
        await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
        await this._randomWait(this.page, 3, 5, 'doneProcessingGrid');
      }
      else {
        this.emit('done');
      }
    });
  }

  /**
   * Resets the per-register state, prepares the browser page and navigates to
   * the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} whatever the setup steps throw
   */
  async start() {
    super._start();

    try {
      const registersUrl = 'https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1';

      this.mode = 0;
      this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': [registersUrl],
        'workingData': new Map([]),
        'workingIndex': 0
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': [registersUrl],
        'workingData': new Map([]),
        'workingIndex': 0
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': [registersUrl],
        'workingData': new Map([]),
        'workingIndex': 0
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = '';
      this.credit = '';
      this.backOffTimer = 0;

      this.setPath(path.resolve(`${__dirname }/../artefacts/IT/FSA`));

      await this._doNonRepudiation(false, { 'sslWithPrefix': true }).catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Attach the scraper events only while exactly two event names are
      // registered, i.e. presumably only on the first start - TODO confirm
      // which handlers the base class registers.
      if (this.eventNames().length === 2) {
        await this.attachEvents();
      }

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle2' });
      await this._randomWait(this.page, 3, 5, 'After start');
    }
    catch (e) {
      // Rethrow the original error so its type and stack trace survive
      throw (e instanceof Error) ? e : new Error(e);
    }
  }

  /**
   * Entry point used by the debounced `run`.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// The scraper class is this module's only export.
module.exports = ITscrape;