const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('IT');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Builds an Error for a failed scraping step without losing the underlying failure.
 *
 * `new Error(message, err)` silently discards `err` (the second argument is an
 * options bag), so the original message is appended to the label and the original
 * error object is attached as `cause`.
 *
 * @param {string} context - label of the failing step (used as the message prefix)
 * @param {*} err - the error (or value) that was caught
 * @returns {Error}
 */
function wrapError(context, err) {
  let detail = '';

  if (err !== undefined && err !== null)
    detail = err.message ? err.message : String(err);

  const wrapped = new Error(`${context}${detail}`);

  wrapped.cause = err;

  return wrapped;
}

/**
 * Reads every ag-grid row found below `hostSelector` into a plain object keyed by
 * each cell's `col-id` attribute.
 *
 * @param {string} html - full page markup
 * @param {string} hostSelector - element that hosts the grid
 * @param {string} label - noun used in the "N <label> items" log line
 * @param {function(string): string} cleanUp - text normaliser applied to keys and values
 * @returns {Array<Object>} one object per grid row
 */
function readAgGridRows(html, hostSelector, label, cleanUp) {
  const $ = cheerio.load(html);
  const rows = $(`${hostSelector} div.ag-bl-center.ag-bl-full-height-center > div > div.ag-body > div.ag-body-viewport-wrapper > div > div div[role="row"]`);
  const result = [];

  logger.info(`${rows.length} ${label} item${(rows.length !== 1) ? 's' : ''}`);

  rows.each((index, item) => {
    const divs = $(item).find('div');
    const obj = {};

    for (let counter = 0; counter < divs.length; counter++) {
      const cell = divs.eq(counter);
      const name = cleanUp(cell.attr('col-id'));

      obj[name] = cleanUp(cell.text());
    }

    result.push(obj);
  });

  return result;
}

/**
 * Scraper for the Banca d'Italia public registers ("albi") site.
 *
 * The scraper is event driven: page loads trigger `processNewPage`, which dispatches
 * on the URL path; the result grid is walked by `processAGTableV3`; and one pass is
 * made per register selected by `this.mode` (0, 1, 2).
 */
class ITscrape extends Scraper {
  constructor() {
    super();
    this.setID('IT');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l) this.run();
      }).catch((err) => {
        logger.error('Lock check failed', err);
      });
  }

  /**
   * Nudges the page: scrolls by one viewport height, then hovers and focuses the
   * third breadcrumb link.
   *
   * NOTE(review): despite the name, `scrollBy(0, innerHeight)` scrolls down, not to
   * the top - confirm this is intended.
   *
   * @returns {Promise}
   */
  async forceScrollToTop() {
    const breadcrumb = '#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a';

    // Force the scroll
    await this.page.evaluate(() => {
      window.scrollBy(0, window.innerHeight);
    });

    // Force the hover (best effort)
    await this.page.hover(breadcrumb).catch((err) => {
      logger.warn(err);
    });

    // Force the focus
    await this.page.focus(breadcrumb);
  }

  /**
   * Opens the language drop-down (when present) and clicks its second entry.
   *
   * @returns {Promise}
   */
  async forceEnglish() {
    await this._randomWait(this.page, 2, 2, 'Force English');

    await this.page.waitForSelector('#bs-example-navbar-collapse-1 > ul > li.dropdown > a', { 'visible': true, 'timeout': 7500 }).then(async (elm) => {
      await elm.click({ 'delay': Scraper.notARobot() });
      await this._randomWait(this.page, 2, 2);
    }).catch(() => {
      logger.debug('No Language button');
    });

    await this._findAndClick('#bs-example-navbar-collapse-1 > ul > li.dropdown.open > ul > li:nth-child(2) > a');
  }

  /**
   * Clicks the first `ul.linkgroup a` link on the landing page and keeps re-clicking
   * until the next page's container becomes visible.
   *
   * NOTE(review): the retry loop has no upper bound.
   *
   * @returns {Promise}
   */
  async handleFrontPage() {
    let pageReturned = false;

    await this._randomWait(this.page, 3, 5, 'handleFrontPage');

    await this.page.waitFor('ul.linkgroup a', { 'visible': true }).then(async (elm) => {
      await elm.click({ 'delay': Scraper.notARobot() });
    }).catch(async (err) => {
      logger.info('handleFrontPage: ul.linkgroup a Not found', err);
    });

    do {
      await this.page.waitFor('#my-container > div.container > div', { 'visible': true, 'timeout': 7500 }).then(() => {
        pageReturned = true;
      }).catch(async () => {
        logger.info('We didnt transition back correctly, forcing another click..\n');
      });

      if (!pageReturned) {
        await this.page.hover('ul.linkgroup a').catch((err) => {
          logger.debug(err.name);
        });
        await this.page.focus('ul.linkgroup a').catch((err) => {
          logger.debug(err.name);
        });
        await this.page.waitFor('ul.linkgroup a', { 'visible': true }).then(async (elm) => {
          await elm.click({ 'delay': Scraper.notARobot() });
        }).catch(async (err) => {
          logger.info('handleFrontPage: ul.linkgroup a still not found', err.name);
        });
      }
    } while (!pageReturned);
    // Supervisory registers and lists
  }

  /**
   * Clicks the centre navigation link on the inquiry start page and emits
   * `pageChanged` when the location changed as a result; emits `recover` on failure.
   *
   * @returns {Promise}
   */
  async handleSecondPage() {
    try {
      // sometimes this page takes a while to load...
      const previousUrl = await this.page.evaluate('location.href');

      await this._randomWait(this.page, 10, 13, 'handleSecondPage');

      // NOTE(review): `visible: false` only waits for the element to be present in the
      // DOM; if the aim is to wait for the shroud to disappear, Puppeteer's
      // `hidden: true` is the matching option - confirm before changing.
      await this.page.waitForSelector('div.loading', { 'visible': false, 'timeout': 90000 }).catch(() => {
        logger.warn('Ajax loading shroud not removed after 90 seconds');
      });

      await this.page.waitForSelector('ul.nav.navbar-nav.navbar-center li a', { 'visible': false, 'timeout': 90000 }).then(async (elm) => {
        await elm.click({ 'delay': Scraper.notARobot() });
        await this._randomWait(this.page, 5, 8, 'await transition');
      }).catch((err) => {
        logger.warn('Page Navigation navigation links failed to load / display');
        logger.debug(err);
      });

      const newUrl = await this.page.evaluate('location.href');

      if (previousUrl !== newUrl) {
        logger.debug('The page Has changed!');
        this.emit('pageChanged');
      }
    } catch (err) {
      logger.error('Failed to progress past second page', err);
      this.emit('recover');
    }
  }

  /**
   * Extracts the label/value pairs of the registry ("anagrafica") panel.
   *
   * Only rows with exactly two children are read: the first `div` is the key and
   * the second the value.
   *
   * @param {string} html - full page markup
   * @returns {Promise<Object|undefined>} the pairs; `undefined` when extraction fails
   *   and NODE_ENV is unset
   * @throws {Error} when extraction fails and NODE_ENV is set
   */
  async extractPSRegistry(html) {
    try {
      const registry = {};
      const $ = cheerio.load(html);

      $('app-details-anagrafica > div.row').each((index, item) => {
        const row = $(item);

        if (row.children().length !== 2) return;

        const divs = row.find('div');
        const name = this._cleanUp(divs.eq(0).text());

        registry[name] = this._cleanUp(divs.eq(1).text());
      });

      return registry;
    } catch (err) {
      if (process.env.NODE_ENV) {
        await this._uploadError();
        throw wrapError('extractPSRegistry\n', err);
      }

      logger.error('extractPSRegistry\n', err);

      return undefined;
    }
  }

  /**
   * Extracts the rows of the "registers" grid (`app-details-albi`).
   *
   * @param {string} html - full page markup
   * @returns {Promise<Array<Object>|undefined>} one object per row, keyed by `col-id`;
   *   `undefined` when extraction fails and NODE_ENV is unset
   * @throws {Error} when extraction fails and NODE_ENV is set
   */
  async extractPSRegisters(html) {
    try {
      return readAgGridRows(html, 'app-details-albi', 'registers', (text) => this._cleanUp(text));
    } catch (err) {
      if (process.env.NODE_ENV) {
        await this._uploadError();
        throw wrapError('extractPSRegisters\n', err);
      }

      logger.error('extractPSRegisters\n', err);

      return undefined;
    }
  }

  /**
   * Extracts the rows of the "authorised activities" grid (`app-details-att-autorizzate`).
   *
   * @param {string} html - full page markup
   * @returns {Promise<Array<Object>|undefined>} one object per row, keyed by `col-id`;
   *   `undefined` when extraction fails and NODE_ENV is unset
   * @throws {Error} when extraction fails and NODE_ENV is set
   */
  async extractPSAuthority(html) {
    try {
      return readAgGridRows(html, 'app-details-att-autorizzate', 'authority', (text) => this._cleanUp(text));
    } catch (err) {
      if (process.env.NODE_ENV) {
        await this._uploadError();
        throw wrapError('extractPSAuthority\n', err);
      }

      logger.error('extractPSAuthority\n', err);

      return undefined;
    }
  }

  /**
   * Selects the register for the current `this.mode` on the search form, submits it
   * and emits `processAgTable` once the result grid is present.
   *
   * @returns {Promise}
   * @throws {Error} when any step fails and NODE_ENV is set
   */
  async preparePSSearch() {
    try {
      await this._randomWait(this.page, 3, 5, `preparePSSearch - ${this.modeTitles[this.mode]}`);

      // Brute force the selector
      await this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-search > div > div:nth-child(3) > div > input');

      await this.page.waitForFunction('document.querySelector("#alboElenco").options.length > 1', { 'timeout': 7500 }).then(() => {
        logger.debug('Ajax done');
      }).catch(() => {
        throw new Error('Ajax not done');
      });

      const options = await this.page.$$('#alboElenco option');
      const optionList = ['ALBO IP ART.114-SEPTIES TUB ', 'ALBO IMEL ITA EX 114-QUATER ', 'ALBO DELLE BANCHE '];
      const wantedOption = optionList[this.mode];

      for (const item of options) {
        const text = await this.page.evaluate(el => el.innerText, item);
        const value = await this.page.evaluate(el => el.value, item);

        if (text === wantedOption) {
          await this.page.select('#alboElenco', value);
          break;
        }
      }

      // wait for loading shroud to go away
      // NOTE(review): see handleSecondPage - `visible: false` does not wait for hiding.
      await this.page.waitForSelector('div.loading', { 'visible': false, 'timeout': 25000 });

      // Keep clicking the submit button until it is no longer visible.
      let btnSuccess = false;

      do {
        await this.page.waitForSelector('button.btn.btn-success', { 'visible': true, 'timeout': 2500 }).then(async (elm) => {
          await elm.click({ 'delay': Scraper.notARobot() });
        }).catch(() => {
          btnSuccess = true;
        });
        await this._randomWait(this.page, 1, 1, 'preparePSSearch btnSuccess');
      } while (!btnSuccess);

      // Awaited so that a missing result grid is reported through the handler below
      // instead of surfacing as an unhandled rejection.
      try {
        await this.page.waitFor('app-int-albi-grid-result');
      } catch (err) {
        throw wrapError('No results transition\n', err);
      }

      this.emit('processAgTable');
    } catch (err) {
      if (process.env.NODE_ENV) {
        await this._uploadError();
        throw wrapError('preparePSSearch\n', err);
      }

      logger.error('preparePSSearch\n', err);
    }
  }

  /**
   * Scrapes the detail view of one entity: the registry panel plus the "registers"
   * and "activity" tabs.
   *
   * @returns {Promise<{registry, authority, registers}>} whatever could be collected;
   *   parts that were not reached keep their empty `{}` default
   * @throws {Error} when the details component fails and NODE_ENV is set
   */
  async processPSDetail() {
    const tabList = '#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul';
    const registersTab = `${tabList} > li:nth-child(2) > a`;
    const activityTab = `${tabList} > li:nth-child(3) > a`;
    let registry = {};
    let registers = {};
    let authority = {};
    let titleLoaded = true;

    await this._randomWait(this.page, 3, 3, 'processPSDetail: AJAX');

    await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div.card.card-title > span > span', { 'visible': true }).catch((err) => {
      logger.warn('AJAX data has failed to load');
      logger.debug(err);
      titleLoaded = false;
    });

    // A `return` inside the catch callback above would only leave the callback, so
    // the early exit is performed here.
    if (!titleLoaded) return { registry, registers, authority };

    await this.page.waitFor('app-int-albi-details').then(async () => {
      await this.forceScrollToTop();

      const body = await this.page.content();

      registry = await this.extractPSRegistry(body);
      await this._randomWait(this.page, 2, 2, 'processPSDetail app-int-albi-details');
    }).catch(async (err) => {
      if (process.env.NODE_ENV) {
        await this._uploadError();
        throw wrapError('processPSDetail\n', err);
      }

      logger.error('processPSDetail\n', err);
    });

    await this._randomWait(this.page, 1, 1, 'processPSDetail after app-int-albi-details');

    // wait for Registers Tab
    await this.page.waitFor(registersTab, { 'visible': true, 'timeout': 10000 }).then(async (elm) => {
      logger.debug('** Showing Registers Tab');
      await elm.click({ 'delay': 90 });

      await this.page.waitFor('app-details-albi', { 'visible': true, 'timeout': 10000 }).then(async () => {
        const body = await this.page.content();

        registers = await this.extractPSRegisters(body);
        await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-albi');
      }).catch(async (err) => {
        if (process.env.NODE_ENV) throw wrapError('No tab transition\n', err);

        logger.error('No tab transition');
      });

      await this._randomWait(this.page, 1, 1, 'processPSDetail after app-details-albi');
    }).catch((err) => {
      logger.warn('No "registers" Block...');
      logger.debug(err);
    });

    // wait for Activity Tab
    await this.forceScrollToTop();

    await this.page.waitFor(activityTab, { 'visible': true, 'timeout': 10000 }).then(async (elm) => {
      logger.debug('** Showing Activity Tab');
      await elm.click({ 'delay': 90 });

      let pageReturned = false;

      // NOTE(review): when NODE_ENV is unset this loop retries without an upper bound.
      do {
        await this.page.waitFor('app-details-att-autorizzate', { 'visible': true, 'timeout': 10000 }).then(async () => {
          pageReturned = true;

          const body = await this.page.content();

          authority = await this.extractPSAuthority(body);
          await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-att-autorizzate');
        }).catch(async (err) => {
          await this.forceScrollToTop();
          await this._findAndClick(activityTab);

          if (process.env.NODE_ENV) throw wrapError('No tab transition\n', err);

          logger.warn('No tab transition');
        });
      } while (!pageReturned);
    }).catch((err) => {
      logger.warn('No "Activity" Block...');
      logger.debug(err);
    });

    return { registry, registers, authority };
  }

  /**
   * Goes back from a detail view to the result grid through the breadcrumb link,
   * re-clicking until the grid is present. Emits `recover` on failure.
   *
   * @returns {Promise}
   */
  async returnToPSList() {
    const breadcrumb = '#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a';

    try {
      let pageReturned = false;

      await this.page.hover(breadcrumb).catch((err) => {
        logger.warn(err);
      });
      await this._findAndClick(breadcrumb);

      do {
        await this.page.waitFor('app-int-albi-grid-result').then(() => {
          pageReturned = true;
        }).catch(async (err) => {
          logger.warn('We didnt transition back correctly, forcing another click..\n', err);
          await this.forceScrollToTop();
          await this._findAndClick(breadcrumb);
        });
      } while (!pageReturned);
    } catch (err) {
      logger.error('returnToPSList\n', err);
      this.emit('recover');

      if (process.env.NODE_ENV) await this._uploadError();
    }
  }

  /**
   * Reads the first number shown in the pagination summary paragraph.
   *
   * @returns {Promise<number>} the parsed number, or -1 when the summary element is
   *   missing or contains no number
   */
  async psGetMaxRows() {
    const regExNumbersOnly = /\d{1,13}(?:,\d{0,2})?/;
    const elm = await this.page.$$('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(1) > p');

    // Without a summary element there is nothing to evaluate against.
    if (elm.length === 0) return -1;

    const text = await this.page.evaluate(el => el.innerText, elm[0]);
    const numbers = regExNumbersOnly.exec(text);

    return (numbers !== null) ? parseInt(numbers[0], 10) : -1;
  }

  /**
   * Maps a cheerio selection of grid cells to `{ [col-id]: cleaned text }`.
   *
   * @param $ - cheerio instance the selection belongs to
   * @param divs - cheerio selection of cell elements
   * @returns {Promise<Object>}
   */
  async processDivs($, divs) {
    const entries = {};

    divs.each((index, item) => {
      const cell = $(item);

      entries[cell.attr('col-id')] = this._cleanUp(cell.text());
    });

    return entries;
  }

  /**
   * Types `count` into the "rows per page" input of the grid and confirms with Enter.
   *
   * @param {number} count - rows per page to request
   * @returns {Promise}
   */
  async psSetListCount(count) {
    logger.debug('+ psSetListCount ');

    await this.page.focus('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(7) > div > input');

    // Clear whatever value is currently in the input.
    for (let del = 0; del < 5; del++) await this.page.keyboard.press('Backspace');

    await this.page.keyboard.type(count.toString(), { 'delay': 100 }); // Types slower, like a user
    await this.page.keyboard.press('Enter');
    await this._randomWait(this.page, 10, 10, 'ajax refresh');

    logger.debug('- psSetListCount ');
  }

  /**
   * Walks the result grid page by page; for every row whose `abiCode` has not been
   * seen yet it records the row, takes a screenshot, opens the detail view and
   * delegates to `doAlbiDetails`. Emits `doneProcessingGrid` when finished.
   *
   * @param serviceObject - per-mode state object (uses its `workingData` Map and `links` array)
   * @returns {Promise}
   */
  async processAGTableV3(serviceObject) {
    // this whole thing is ugly but at the moment it works
    const _defaultMaxPerPage = 10;
    const nextPageSelector = '#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(5) > button';
    let item;
    let maxPages = 0;

    await this._randomWait(this.page, 3, 5, 'processAGTableV3');
    await this.psSetListCount(_defaultMaxPerPage);

    const maxRows = await this.psGetMaxRows();
    let remainingRows = maxRows;

    logger.info('Max Rows', maxRows);

    if (maxRows > _defaultMaxPerPage) {
      maxPages = ~~(maxRows / _defaultMaxPerPage);
      logger.info('Max pages:', maxPages);
    }

    for (let pageStep = 0; pageStep <= maxPages; pageStep++) {
      logger.info('Pagestep', pageStep, (pageStep + 1) * _defaultMaxPerPage);

      // A full page, or whatever is left on the last one.
      const rowsInPass = (maxPages > 0)
        ? Math.min(_defaultMaxPerPage, maxRows - (pageStep * _defaultMaxPerPage))
        : maxRows;

      logger.info(`Rows in this pass : ${rowsInPass}`);

      for (let step = 0; step < rowsInPass; step++) {
        // Hover every row from the first up to the current one; `item` ends up being
        // the row with row-id === step (or undefined when it was not found).
        for (let elmStep = 0; elmStep <= step; elmStep++) {
          const workingData = await this.page.$$(`div.ag-body-container div.ag-row[row-id="${elmStep}"]`);

          item = workingData[0];

          if (typeof item !== 'undefined')
            await item.hover().catch((err) => {
              logger.warn(err);
              logger.info(item);
            });

          await this._microWait(this.page, 1);
        }

        await this._randomWait(this.page, 2, 2, 'processAGTableV3 after rows');

        if (typeof item === 'undefined') continue;

        const html = await this.page.evaluate(el => el.innerHTML, item);
        const clickable = await item.$('div[col-id="name"]');
        const abiCodeElm = await item.$('div[col-id="abiCode"]');
        const uid = await this.page.evaluate(el => el.innerText, abiCodeElm);
        const clickName = await this.page.evaluate(el => el.innerText, clickable);
        const $ = cheerio.load(html);
        const divs = $('div');

        logger.info(`Processing : ${clickName}, ${remainingRows} remain.`);

        if (serviceObject.workingData.has(uid)) continue;

        // Extract all the data from the cells
        const newEntry = await this.processDivs($, divs);

        // Insert it in the map
        serviceObject.workingData.set(uid, newEntry);

        await this._randomWait(this.page, 2, 2, `Processing : ${clickName}`);

        const filePath = await this._makeFilePath(clickName);
        const fileName = this._makeFileName(clickName);

        await this._randomWait(this.page, 2, 2, 'processAGTableV3 before ss');
        await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

        serviceObject.links.push({ uid, 'fileName': `${fileName}.json`, 'name': clickName });

        // Go into the detail
        await clickable.click();
        await this._randomWait(this.page, 3, 4, 'processAGTableV3 before next');

        remainingRows--;

        // `.then` needs a callback: passing `await this.doAlbiDetails(...)` would run
        // the detail scrape before the detail view had been waited for.
        await this.page.waitFor('app-int-albi-details').then(
          () => this.doAlbiDetails(filePath, newEntry)
        ).catch(async (err) => {
          logger.error('No detail transition', err);
          this.emit('recover');

          if (process.env.NODE_ENV) await this._uploadError();
        });
      }

      if (maxPages > 0) {
        logger.info('Clicking to the next page...');

        const nextButton = await this.page.$$(nextPageSelector);
        const buttonDisabled = (nextButton.length === 0)
          ? true
          : await this.page.evaluate(el => el.disabled, nextButton[0]);

        if (!buttonDisabled) {
          // Awaited so the click has been issued before the settle wait starts.
          await this._findAndClick(nextPageSelector);
          await this._randomWait(this.page, 5, 5, 'processAGTableV3 next page click');
        }
      }
    }

    logger.debug('processAGTableV3 DONE');
    this.emit('doneProcessingGrid');
  }

  /**
   * Scrapes the currently open detail view, writes it to `<filePath>.json` together
   * with the grid row data, then navigates back to the result list. Emits `recover`
   * on failure.
   *
   * @param {string} filePath - output path without extension
   * @param {Object} newEntry - grid row data stored under `details`
   * @returns {Promise}
   */
  async doAlbiDetails(filePath, newEntry) {
    try {
      // process the page
      const data = await this.processPSDetail();

      data.details = newEntry;

      logger.info(`Saving ${filePath}.json`);
      await jsonfile.writeFile(`${filePath}.json`, data);
      await this._randomWait(this.page, 5, 7, 'doAlbiDetails');

      // Return back to list
      await this.returnToPSList();
      await this._randomWait(this.page, 2, 2, 'doAlbiDetails after returnToPSList');
    } catch (err) {
      logger.error('doAlbiDetails\n', err);
      this.emit('recover');

      if (process.env.NODE_ENV) await this._uploadError();
    }
  }

  /**
   * Dispatches on the path of the current page URL to the matching page handler.
   *
   * @returns {Promise}
   * @throws {Error} for an unrecognised path when NODE_ENV is set
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5, 'processNewPage');

    const pageUrl = url.parse(await this.page.url());

    switch (pageUrl.pathname) {
      case '/compiti/vigilanza/albi-elenchi/index.html':
        await this.handleFrontPage();
        break;
      case '/GIAVAInquiry-public/ng/':
        await this.handleSecondPage();
        break;
      case '/GIAVAInquiry-public/ng/int-albi/search':
        await this.preparePSSearch();
        break;
      // NOTE(review): `crossBorderRedirector` is not defined in this file - presumably
      // inherited from Scraper or a leftover from another scraper; confirm.
      case '/en/our-registers/company-register/gransoverskridandehandel/':
        await this.crossBorderRedirector();
        break;
      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          // Use the URL text; interpolating the parsed object itself is not informative.
          throw new Error(`Unknown page: ${pageUrl.href}`);
        } else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }
  }

  /**
   * Registers the scraper's own event handlers (`pageChanged`, `recover`, `restart`,
   * `processAgTable`, `doneProcessingGrid`), replacing any existing `recover` listeners.
   *
   * @returns {Promise}
   */
  async attachEvents() {
    // Need this for Angular based sites
    // clear out stock recover handler
    this.removeAllListeners('recover');

    this.on('pageChanged', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    this.on('recover', this._debounce(async () => {
      clearTimeout(this.backOffTimer);
      logger.warn('Backing off for 5 minutes..');

      const timeout = (60 * 1000) * 5;

      this.backOffTimer = setTimeout(() => {
        this.emit('restart');
      }, timeout);
    }, 30000));

    this.on('restart', this._debounce(async () => {
      clearTimeout(this.backOffTimer);
      logger.warn('Restarting::');
      // use the Scraper recovery now to ensure crashed browser is resurrected
      await this.__recover(this.startPage);
    }, 15000));

    this.on('processAgTable', async () => {
      switch (this.mode) {
        case 1:
          await this.processAGTableV3(this.emoneyServices);
          break;
        case 2:
          await this.processAGTableV3(this.creditServices);
          break;
        case 0:
        default:
          await this.processAGTableV3(this.paymentServices);
          break;
      }
    });

    this.on('doneProcessingGrid', async () => {
      let curObj;

      switch (this.mode) {
        case 1:
          curObj = this.emoneyServices;
          break;
        case 2:
          curObj = this.creditServices;
          break;
        case 0:
        default:
          curObj = this.paymentServices;
          break;
      }

      curObj.done = true;
      curObj.items = curObj.links.length;

      // NOTE(review): `modeNames` and `debugPath` are not set in this file (only
      // `modeTitles` is) - presumably provided by Scraper; confirm.
      jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links': curObj.links });
      jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, curObj);

      // Move on to the next register, or finish after the last one.
      this.mode++;

      if (this.mode < 3) {
        await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
        await this._randomWait(this.page, 3, 5, 'doneProcessingGrid');
      } else this.emit('done');
    });
  }

  /**
   * Resets the per-mode state, opens the browser page, wires the event handlers and
   * navigates to the start page.
   *
   * @returns {Promise}
   * @throws {Error} wrapping any failure during start-up
   */
  async start() {
    super._start();

    try {
      const startUrl = 'https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1';

      this.mode = 0;
      this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': [startUrl],
        'workingData': new Map([]),
        'workingIndex': 0
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': [startUrl],
        'workingData': new Map([]),
        'workingIndex': 0
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': [startUrl],
        'workingData': new Map([]),
        'workingIndex': 0
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = '';
      this.credit = '';
      this.backOffTimer = 0;

      this.setPath(path.resolve(`${__dirname }/../artefacts/IT/FSA`));

      await this._doNonRepudiation(false, { 'sslWithPrefix': true }).catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Attach the handlers only once: a count of 2 means that only the listeners
      // present before attachEvents() has run are registered.
      if (this.eventNames().length === 2) await this.attachEvents();

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle2' });
      await this._randomWait(this.page, 3, 5, 'After start');
    } catch (e) {
      throw new Error(e);
    }
  }

  /**
   * Entry point used by the debounced `run`; simply starts the scraper.
   *
   * @returns {Promise}
   */
  async __run() {
    await this.start();
  }
}

module.exports = ITscrape;