const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('PL');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

// The paged entity index is walked ten rows at a time: `step % ROWS_PER_PAGE`
// selects the row on the current page and `currentPageIndex * ROWS_PER_PAGE`
// marks the end of that page.
const ROWS_PER_PAGE = 10;

/**
 * Builds a short tag from the current timestamp (milliseconds, base 36).
 * Used to pair the "+"/"-" debug log lines of a single call.
 *
 * @returns {string}
 */
function tag() {
  return Date.now().toString(36);
}

/**
 * Rejection handler for promises started from event listeners, where nothing
 * awaits the result. Logs the error instead of leaving the rejection unhandled.
 *
 * @param {*} err
 */
function logRejection(err) {
  logger.error(err);
}

/**
 * Scraper for the Polish (KNF) registers. It walks three "services" in turn,
 * selected by `this.mode`: 0 = payment services, 1 = e-money services,
 * 2 = credit services.
 *
 * Navigation is event driven: every `domcontentloaded` on the page (and every
 * `pageChanged` event) runs `processNewPage`, which dispatches on the URL path.
 *
 * Helpers such as `_cleanUp`, `_makeFieldName`, `_makeFilePath`, `_goto`,
 * `_randomWait`, `_findAndClick`, `_makeScreenshotV2`, `_uploadError`,
 * `_initBrowser`, and the `modeTitles` / `modeNames` tables are not defined in
 * this file; presumably they are inherited from `Scraper` -- verify there.
 */
class PLScrape extends Scraper {
  constructor() {
    super();
    this.id = 'PL';

    this.on('done', () => {
      this._done();
    });
  }

  /**
   * Turns two-cell table rows into an object: the first cell becomes the
   * field name, the second the value. Rows without exactly two children are
   * skipped.
   *
   * @param rows cheerio selection of rows
   * @returns {Promise<Object|undefined>} the object, or undefined if an error was logged
   */
  async rowReducer(rows) {
    try {
      const newObj = {};

      rows.each((i, elm) => {
        const children = cheerio(elm).children();

        if (children.length === 2) {
          const label = this._makeFieldName(cheerio(children.eq(0)).text());
          newObj[label] = this._cleanUp(cheerio(children.eq(1)).text());
        }
      });

      return newObj;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Collects the cleaned-up text of every element in a selection.
   *
   * @param items cheerio selection (list items)
   * @returns {Promise<Array|undefined>} the texts, or undefined if an error was logged
   */
  async reduceBullets(items) {
    try {
      const newArray = [];

      items.each((i, elm) => {
        newArray.push(this._cleanUp(cheerio(elm).text()));
      });

      return newArray;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the panel heading text of a single-entity page.
   *
   * @param html page markup
   * @returns {Promise<string|undefined>} heading text, '' when the heading is
   *   absent, undefined if an error was logged
   */
  async extractCSHeading(html) {
    try {
      const $ = cheerio.load(html);
      // NOTE(review): 'singleEtity' looks like a typo for 'singleEntity', but it
      // may equally be the id used by the site -- confirm before changing.
      const rawHeading = $('#singleEtity > div > div > div.panel-heading > h2');

      if ($(rawHeading).length === 0) return '';

      return this._cleanUp($(rawHeading).text());
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the leading text of `#singleEntityBody`, but only when the first
   * child node's name is one of the wanted ones ('b' or 'text').
   *
   * @param html page markup
   * @returns {Promise<string|undefined>} the text, '' when there is nothing
   *   suitable, undefined if an error was logged
   */
  async extractCSBodyText(html) {
    try {
      const wanted = ['b', 'text'];
      const $ = cheerio.load(html);
      const rawBody = $('#singleEntityBody');

      if ($(rawBody).length === 0) return '';

      const firstRow = $(rawBody).contents()[0];

      // An empty container has no first node; treat it like a missing body
      // instead of failing on `firstRow.name`.
      if (firstRow === undefined) return '';

      if (wanted.includes(firstRow.name)) return this._cleanUp($(firstRow).text());

      return '';
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reads the table inside `#singleEntityBody` into label/value pairs. The
   * label is the text of the first cell, the value the inner HTML of the
   * second. A repeated label keeps only its last value (pairs are collected in
   * a Map).
   *
   * @param html page markup
   * @returns {Promise<Array|undefined>} array of `[label, value]` pairs (empty
   *   when there is no table), undefined if an error was logged
   */
  async extractCSTable(html) {
    try {
      const outMap = new Map([]);
      const $ = cheerio.load(html);
      const mainTable = $('#singleEntityBody > table');

      if ($(mainTable).children().length === 0) return [...outMap];

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();

      tableRows.each((i, elm) => {
        const cells = $(elm).children();

        if (cells.length > 0) {
          const label = this._cleanUp($(cells).eq(0).text());
          const text = this._cleanUp($(cells).eq(1).html());

          outMap.set(label, text);
        }
      });

      return [...outMap];
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "activity" tab (`div#areatabs1_5`): for each top-level row of
   * the dynamic table, the text of its first nested `tr` (with the leading
   * "Kraj" label removed) becomes the key and the row's list items the value.
   *
   * @param html page markup
   * @returns {Promise<Object|undefined>} key -> list of bullet texts, or
   *   undefined if an error was logged
   */
  async extractEntityActivity(html) {
    try {
      const removeCountry = /(Kraj)\s+/g;
      const newObj = {};
      const $ = cheerio.load(html);
      const mainTable = $('div#areatabs1_5 table.tableDynamic');

      if ($(mainTable).children().length === 0) return newObj;

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();
      const pending = [];

      tableRows.each((i, elm) => {
        const rows = $(elm).find('tr');
        const listItems = $(elm).find('li');
        const rawCountryName = this._cleanUp($($(rows)[0]).text()).replace(removeCountry, '');
        const countryName = this._makeFieldName(rawCountryName);

        pending.push(this.reduceBullets(listItems).then((d) => {
          newObj[countryName] = d;
        }));
      });

      // Wait for every reducer before handing the object back, rather than
      // relying on the helpers happening to settle first.
      await Promise.all(pending);

      return newObj;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "branches" tab (`div#areatabs1_4`): one reduced object (see
   * `rowReducer`) per top-level row of the dynamic table.
   *
   * @param html page markup
   * @returns {Promise<Array|undefined>} reduced rows, or undefined if an error was logged
   */
  async extractEntityBranches(html) {
    try {
      const $ = cheerio.load(html);
      const mainTable = $('div#areatabs1_4 table.tableDynamic');

      if ($(mainTable).children().length === 0) return [];

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();
      const pending = [];

      tableRows.each((i, elm) => {
        pending.push(this.rowReducer($(elm).find('tr')));
      });

      // Promise.all keeps the results in row order.
      return await Promise.all(pending);
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "agents" tab (`div#areatabs1_3`): one reduced object (see
   * `rowReducer`) per top-level row of the dynamic table.
   *
   * @param html page markup
   * @returns {Promise<Array|undefined>} reduced rows, or undefined if an error was logged
   */
  async extractEntityAgents(html) {
    try {
      const $ = cheerio.load(html);
      const mainTable = $('div#areatabs1_3 table.tableDynamic');

      if ($(mainTable).children().length === 0) return [];

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();
      const pending = [];

      tableRows.each((i, elm) => {
        pending.push(this.rowReducer($(elm).find('tr')));
      });

      // Promise.all keeps the results in row order.
      return await Promise.all(pending);
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "services" tab (`#areatabs1_2`): a single field, named after
   * the `.left` cell text, holding the texts of the `.container100` list items.
   *
   * @param html page markup
   * @returns {Promise<Object|undefined>} the object, or undefined if an error was logged
   */
  async extractEntityServices(html) {
    logger.debug('+ extractEntityServices');

    try {
      const newObj = {};
      const $ = cheerio.load(html);
      const rows = $('#areatabs1_2 > table tr');
      const label = this._makeFieldName($(rows).find('.left').text());

      newObj[label] = [];

      const listItems = $(rows).find('.container100 li');

      listItems.each((i, elm) => {
        newObj[label].push(this._cleanUp($(elm).text()));
      });

      logger.debug('- extractEntityServices');

      return newObj;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "details" tab (`div#areatabs1_1`): every two-cell row becomes
   * a field (first cell = name, second cell = value).
   *
   * @param html page markup
   * @returns {Promise<Object|undefined>} the object, or undefined if an error was logged
   */
  async extractEntityDetails(html) {
    logger.debug('+ extractEntityDetails');

    try {
      const newObj = {};
      const $ = cheerio.load(html);
      const rows = $('div#areatabs1_1 tr');

      rows.each((i, elm) => {
        const children = cheerio(elm).children();

        if (children.length === 2) {
          const label = this._makeFieldName($(children.eq(0)).text());
          newObj[label] = this._cleanUp($(children.eq(1)).text());
        }
      });

      logger.debug('- extractEntityDetails');

      return newObj;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * First visit to an index page: reads the pager text (of the form
   * `1/146 (1455)`), stores the current page, page count and total entry count
   * on the service object, marks it visited and takes a screenshot.
   * Errors are logged, not thrown; on error `visited` stays unchanged.
   *
   * @param serviceObject state object of the active service (mutated)
   * @returns {Promise<void>}
   */
  async entityIndexFirstPass(serviceObject) {
    logger.debug('+ entityIndexFirstPass');

    try {
      // breaks up `1/146 (1455)`
      const breaker = /(\d+)/g;
      const body = await this.page.content();
      const $ = cheerio.load(body);
      const subjectsInfo = $($('.infoNavigation').contents()[2]).text();
      const brokenString = subjectsInfo.match(breaker);

      // Without all three numbers the index cannot be walked; fail with a
      // message that says so instead of a TypeError / NaN counters.
      if (brokenString === null || brokenString.length < 3)
        throw new Error(`Unable to read index position from: "${subjectsInfo}"`);

      const currentPageIndex = parseInt(brokenString[0], 10);
      const currentPageMax = parseInt(brokenString[1], 10);
      const currentIndexLength = parseInt(brokenString[2], 10);

      logger.info(`First pass on the ${this.modeTitles[this.mode]} index...`);

      serviceObject.currentIndexLength = currentIndexLength;
      serviceObject.currentPageMax = currentPageMax;
      serviceObject.currentPageIndex = currentPageIndex;
      serviceObject.visited = true;
      serviceObject.currentIndex = url.parse(await this.page.url());
      serviceObject.currentMetaIndex = 0;

      const entityName = `${this.modeNames[this.mode]}_${currentPageIndex}`;
      const filePath = await this._makeFilePath(entityName);

      await this._makeScreenshotV2(this.page, filePath, null);
    } catch (err) {
      logger.error(err);
    }

    logger.debug('- entityIndexFirstPass');
  }

  /**
   * Handles an index page. Either moves to the next page (when every row of
   * the current page has been processed), emits `serviceDone` (when the "next"
   * button is disabled), or reads the row for the current step into
   * `serviceObject.current` and opens its detail view.
   * Errors are logged, not thrown.
   *
   * @param serviceObject state object of the active service (mutated)
   * @returns {Promise<void>}
   */
  async processEntityIndex(serviceObject) {
    logger.debug('+ processEntityIndex');

    try {
      const fields = ['count', 'referenceNumber', 'typeOfEntity', 'name', 'registrationNumber', 'nip', 'date'];
      const mouseDownDuration = Scraper.notARobot();

      logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);

      if (serviceObject.visited === false) {
        logger.debug('Preparing...');
        await this.entityIndexFirstPass(serviceObject);
      }

      if (serviceObject.visited === true) {
        serviceObject.currentMetaIndex = serviceObject.step % ROWS_PER_PAGE;

        if (serviceObject.step >= (serviceObject.currentPageIndex * ROWS_PER_PAGE)) {
          logger.debug('Maxed out this page..');

          const nextSelector = '#j_idt64-tableViewS-recordsGoToNext';
          const nextButton = await this.page.$$(nextSelector);
          const isDisabled = await this.page.$eval(nextSelector, (elm) => elm.disabled);

          if (!isDisabled) {
            // we need a click..
            // The next page has to go through the first pass again.
            serviceObject.visited = false;

            await this._randomWait(this.page, 1, 2);
            // Awaited so a failed click is caught and logged below.
            await nextButton[0].click({ 'delay': mouseDownDuration });
          } else {
            logger.debug('I think we are done here...');
            this.emit('serviceDone');
          }
        } else {
          logger.info(`Dealing with ${serviceObject.step + 1} of ${serviceObject.currentIndexLength}`);

          const elmStr = `table#j_idt64-tableViewS tbody tr:nth-child(${serviceObject.currentMetaIndex + 1})`;

          // Force the focus
          const rowHandle = await this.page.waitForSelector(elmStr);

          await rowHandle.hover().catch((err) => {
            logger.warn(err);
          });
          await rowHandle.focus();

          const wantedRow = await this.page.$$(elmStr);
          const htmlRow = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);

          // A bare <tr> is not valid outside a table: an HTML5 parser discards
          // the <tr>/<td> tags and `$('td')` would match nothing. Give the row
          // its table context back before parsing.
          const $ = cheerio.load(`<table><tbody>${htmlRow}</tbody></table>`);
          const cells = $('td');

          serviceObject.current = {};

          cells.each((index, item) => {
            serviceObject.current[fields[index]] = $(item).text();
          });

          await this._randomWait(this.page, 2, 3);
          await wantedRow[0].click({ 'delay': mouseDownDuration });
          await this._findAndClick('#j_idt112 > input.button');
        }
      }
    } catch (err) {
      logger.error(err);
    }

    logger.debug('- processEntityIndex');
  }

  /**
   * Handles the landing page (`/View/`): waits, then clicks the input in the
   * fourth row of `#allByJS`. On failure the error is uploaded and `stall` is
   * emitted (which triggers a back-off).
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    logger.debug('+ indexRedirector');

    try {
      await this._randomWait(this.page, 3, 5, 'handleIntroPage');

      const button = await this.page.waitForSelector('#allByJS > tbody > tr:nth-child(4) > td > input');

      await button.click({ 'delay': Scraper.notARobot() });
    } catch (err) {
      logger.warn('!!!!!');
      logger.error(err);
      await this._uploadError();
      this.emit('stall');
    }

    logger.debug('- indexRedirector');
  }

  /**
   * Handles an entity detail page: extracts all tabs from the page markup into
   * `serviceObject.current`, screenshots each tab that exists and emits
   * `entityComplete`.
   *
   * A 75 second watchdog is armed first; when it fires it sets `this.stalled`
   * and emits `backoff`. The watchdog is cleared by `entityCompleter` /
   * `backoff`, not here.
   *
   * If the index has not been visited yet (`visited === false`) it emits
   * `handleEntityIndex` and returns immediately.
   * Errors are logged, not thrown.
   *
   * @param serviceObject state object of the active service (mutated)
   * @returns {Promise<void>}
   */
  async processEntityDetail(serviceObject) {
    logger.debug('+ processEntityDetail');

    try {
      const tabs = [
        { 'id': '', 'name': 'details' },
        { 'id': 'div#tabs1_2', 'name': 'services' },
        { 'id': 'div#tabs1_3', 'name': 'agents' },
        { 'id': 'div#tabs1_4', 'name': 'branches' },
        { 'id': 'div#tabs1_5', 'name': 'activity' }
      ];

      if (serviceObject.visited === false) {
        logger.debug('Process the menu correctly');
        this.emit('handleEntityIndex');

        return;
      }

      logger.info(`Process ${this.modeTitles[this.mode]} // ${serviceObject.current.name}`);

      // Copy, because `serviceObject.current` may be replaced before the
      // watchdog fires.
      const stallObj = Object.assign({}, serviceObject.current);

      this.stall = setTimeout(() => {
        logger.warn(`Page stalled. \nBacking off :: ${stallObj.name}`);
        this.stalled = true;
        this.emit('backoff');
      }, 75000);

      const newObj = {};
      const entityName = `${serviceObject.current.name}_${serviceObject.current.nip}`;
      const fileName = this._makeFileName(entityName);
      const filePath = await this._makeFilePath(entityName);

      serviceObject.current.fileName = fileName;

      const body = await this.page.content();

      newObj.details = await this.extractEntityDetails(body);
      newObj.services = await this.extractEntityServices(body);
      newObj.agents = await this.extractEntityAgents(body);
      newObj.branches = await this.extractEntityBranches(body);
      newObj.activity = await this.extractEntityActivity(body);

      serviceObject.current = Object.assign(serviceObject.current, newObj);

      for (const item of tabs) {
        // The first tab has no id: it is the one already showing.
        if (item.id === '') continue;

        const tabExists = await this.page.$$(item.id);

        if (tabExists.length > 0) {
          await this._findAndClick(item.id);
          await this._makeScreenshotV2(this.page, `${filePath}_${item.name}`, null);
        }
      }

      if (this.stalled) throw new Error('Stalled');

      this.emit('entityComplete');
      logger.info('Entity complete...');
    } catch (err) {
      logger.error(err);
    }

    logger.debug('- processEntityDetail');
  }

  /**
   * Finishes the current entity: records a link for it, writes
   * `serviceObject.current` to `<path>/<fileName>.json`, waits (throttle),
   * advances `step`, clears the stall watchdog and then either navigates back
   * to the index or emits `serviceDone` when every entry has been processed.
   * Errors are logged, not thrown.
   *
   * @param serviceObject state object of the active service (mutated)
   * @returns {Promise<void>}
   */
  async entityCompleter(serviceObject) {
    const _tag = tag();

    logger.debug('+ entityCompleter', _tag);

    try {
      const filename = serviceObject.current.fileName;
      // NOTE(review): the path on disk is cut to 240 characters but the link's
      // `fileName` below is not -- the two can disagree for very long names.
      const filePath = `${this.path}/${filename}`.substring(0, 240);

      logger.info(`Saving: ${filename}.json`);

      const newLink = { 'name': serviceObject.current.name, 'fileName': `${filename}.json` };

      if (this.mode === 0) newLink.nip = serviceObject.current.nip;
      if (this.mode === 2) newLink.hash = serviceObject.current.hash;

      serviceObject.links.push(newLink);

      await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
      await this._randomWait(this.page, 10, 15, 'Throttled');

      serviceObject.step++;

      clearTimeout(this.stall);
      this.stall = 0;

      if (serviceObject.step < serviceObject.currentIndexLength) {
        serviceObject.current = {};

        if (this.mode === 0) {
          const backButton = await this.page.waitForSelector('#allByJS > tbody > tr:nth-child(8) > td > span:nth-child(2) > input');

          await backButton.click({ 'delay': Scraper.notARobot() });
        } else {
          const backButton = await this.page.waitForSelector('#previousSearchPage');

          await backButton.click({ 'delay': Scraper.notARobot() });
          this.emit('pageChanged');
        }
      } else this.emit('serviceDone');
    } catch (err) {
      logger.error(err);
    }

    logger.debug('- entityCompleter', _tag);
  }

  /**
   * Runs the index handler for the active mode. Mode 2 uses
   * `processCSEntityIndex`, which is not defined in this file.
   *
   * @returns {Promise<void>}
   */
  async handleEntityIndex() {
    switch (this.mode) {
      case 1:
        await this.processEntityIndex(this.emoneyServices);
        break;
      case 2:
        await this.processCSEntityIndex(this.creditServices);
        break;
      case 0:
      default:
        await this.processEntityIndex(this.paymentServices);
        break;
    }
  }

  /**
   * Runs the detail handler for the active mode. Mode 2 uses
   * `processCSEntityDetail`, which is not defined in this file.
   *
   * @returns {Promise<void>}
   */
  async handleEntityDetail() {
    logger.debug('+ handleEntityDetail');

    switch (this.mode) {
      case 1:
        await this.processEntityDetail(this.emoneyServices);
        break;
      case 2:
        await this.processCSEntityDetail(this.creditServices);
        break;
      case 0:
      default:
        await this.processEntityDetail(this.paymentServices);
        break;
    }

    logger.debug('- handleEntityDetail');
  }

  /**
   * Runs `entityCompleter` on the service object of the active mode.
   *
   * @returns {Promise<void>}
   */
  async handleEntityComplete() {
    logger.debug('+ handleEntityComplete');

    switch (this.mode) {
      case 1:
        await this.entityCompleter(this.emoneyServices);
        break;
      case 2:
        await this.entityCompleter(this.creditServices);
        break;
      case 0:
      default:
        await this.entityCompleter(this.paymentServices);
        break;
    }

    logger.debug('- handleEntityComplete');
  }

  /**
   * Central dispatcher, run whenever a page has loaded: waits a few seconds
   * for the page to settle, strips any `;jsessionid=...` from the URL path and
   * calls the handler registered for that path.
   *
   * Several handlers used here (`handleXLSDownload`, `csIndexHandler`,
   * `processArt70`) are not defined in this file.
   *
   * @returns {Promise<void>}
   * @throws {Error} for path `/` when it is not the Chrome error page, and for
   *   unknown paths when `NODE_ENV` is set (after uploading the error)
   */
  async processNewPage() {
    const _tag = tag();

    logger.debug('+ processNewPage', _tag);

    // give the page a few seconds to settle
    const removeJSession = /(;jsessionid=[0-9a-f]*)/g;

    await this._randomWait(this.page, 3, 5, 'processNewPage');

    const pageUrl = url.parse(await this.page.url());
    const pathName = (pageUrl.pathname || '').replace(removeJSession, '');

    switch (pathName) {
      case '/View/':
        await this.indexRedirector();
        break;

      case '/View/faces/start2OuterView.xhtml':
      case '/View/faces/dataEdit.xhtml':
        await this.handleEntityIndex();
        break;

      case '/View/faces/subjectsList.xhtml':
        await this.handleEntityDetail();
        break;

      case '/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych':
      case '/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych':
        await this.handleXLSDownload();
        break;

      case '/podmioty/wyszukiwarka_podmiotow':
        await this.csIndexHandler();
        break;

      case '/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi':
        await this.processArt70();
        break;

      case '/':
        if (pageUrl.href === 'chrome-error://chromewebdata/')
          this.emit('backoff');
        else
          throw new Error(`Bad page: ${pageUrl.href}`);
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();

          throw new Error(`Unknown page: ${pageUrl.href}`);
        } else {
          logger.warn('processNewPage Fell through');
          logger.warn('pathName', pathName);
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }

    logger.debug('- processNewPage', _tag);
  }

  /**
   * Closes the page and the browser, starts a fresh browser and page, and
   * navigates to the start URL of the active mode (for mode 2, the URL of the
   * current `metastep`).
   *
   * @returns {Promise<void>}
   */
  async restart() {
    logger.warn(`Tryng to restart ${this.modeTitles[this.mode]}`);
    logger.error('KILLING PAGE & BROWSER');

    // A crashed page can fail to close; that must not prevent the restart.
    await this.page.close().catch((err) => {
      logger.error(err);
    });
    await this.browser.close().catch((err) => {
      logger.error(err);
    });

    this.page = null;
    this.browser = null;

    logger.error('RESTARTING');

    await this._initBrowser(true);
    this.page = await this.browser.newPage();

    logger.warn('Restarted');

    switch (this.mode) {
      case 1:
        await this._goto(this.emoneyServices.urls[0]);
        break;
      case 2:
        await this._goto(this.creditServices.urls[this.creditServices.metastep]);
        break;
      case 0:
      default:
        await this._goto(this.paymentServices.urls[0]);
        break;
    }
  }

  /**
   * Schedules a restart after a pause that grows by five minutes per
   * consecutive back-off, capped at `backOffLimit` steps. Also clears the
   * stall watchdog and the `stalled` flag, and uploads the error.
   *
   * @returns {Promise<void>}
   */
  async backoff() {
    this.backOffStep++;

    clearTimeout(this.stall);
    this.stall = 0;
    this.stalled = false;

    if (this.backOffStep > this.backOffLimit) this.backOffStep = this.backOffLimit;

    logger.warn(`Backing off for ${this.backOffStep * 5} minutes..`);

    const timeout = (60 * 1000) * (this.backOffStep * 5);

    // A failed upload must not cancel the restart.
    try {
      await this._uploadError();
    } catch (err) {
      logger.error(err);
    }

    // Several back-off requests can arrive close together (stall + backoff);
    // keep only one pending restart.
    clearTimeout(this.backOffTimer);
    this.backOffTimer = setTimeout(() => {
      this.emit('restart');
    }, timeout);
  }

  /**
   * Initialises the state of the three services, opens the browser, wires up
   * all event listeners and navigates to the first start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} whatever setup error occurred (rethrown)
   */
  async start() {
    super._start();

    try {
      this.mode = 0;
      this.backOffStep = 0;
      this.backOffLimit = 6;
      this.backOffTimer = null;
      this.stall = null;
      this.stalled = false;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://erup.knf.gov.pl/View/'],
        'sections': [],
        'sectionLinks': [],
        'brokenReturn': false
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.knf.gov.pl/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych'],
        'sections': [],
        'sectionLinks': []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': [
          'https://www.knf.gov.pl/podmioty/wyszukiwarka_podmiotow',
          'https://www.knf.gov.pl/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych',
          'https://www.knf.gov.pl/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi'
        ],
        'sections': [],
        'sectionLinks': [],
        'restart': false,
        'metastep': 0
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/PL/KNF`));

      await this._initBrowser(true);
      this.page = await this.browser.newPage();

      // Nothing awaits the promises started by these listeners, so each one
      // gets a rejection handler of its own.
      this.page.on('domcontentloaded', () => {
        this.processNewPage().catch(logRejection);
      });

      this.on('pageChanged', () => {
        this.processNewPage().catch(logRejection);
      });

      this.on('stall', () => {
        this.backoff().catch(logRejection);
      });

      this.on('backoff', () => {
        this.backoff().catch(logRejection);
      });

      this.on('restart', () => {
        this.restart().catch(logRejection);
      });

      this.on('entityComplete', () => {
        this.handleEntityComplete().catch(logRejection);
      });

      this.on('handleEntityIndex', () => {
        this.handleEntityIndex().catch(logRejection);
      });

      this.on('entityDetail', () => {
        this.handleEntityDetail().catch(logRejection);
      });

      this.on('startcs', () => {
        this.handleStartcs();
      });

      // Translate the generic "service done" into the event of the active mode.
      this.on('serviceDone', () => {
        switch (this.mode) {
          case 0:
            this.emit('paymentServicesDone');
            break;
          case 1:
            this.emit('emoneyServicesDone');
            break;
          case 2:
            this.emit('creditServicesDone');
            break;
        }
      });

      this.on('paymentServicesDone', async () => {
        logger.warn('paymentServicesDone');

        try {
          this.paymentServices.done = true;

          jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
          jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

          this.mode++;
          this.inProgress = false;

          await this._goto(this.emoneyServices.urls[0]);
        } catch (e) {
          logger.error(e);
        }
      });

      this.on('emoneyServicesDone', async () => {
        logger.warn('emoneyServicesDone');

        try {
          this.emoneyServices.done = true;
          this.mode++;
          this.inProgress = false;

          await this._goto(this.creditServices.urls[0]);
        } catch (e) {
          logger.error(e);
        }
      });

      // The credit service has three URLs, visited one after another and
      // tracked by `metastep`; `done` is emitted after the last one.
      this.on('creditServicesDone', async () => {
        logger.warn('creditServicesDone');

        try {
          if (this.creditServices.metastep === 0) {
            jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
            jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);

            this.creditServices.metastep++;
            await this._goto(this.creditServices.urls[this.creditServices.metastep]);

            return;
          }

          if (this.creditServices.metastep === 1) {
            this.creditServices.metastep++;
            await this._goto(this.creditServices.urls[this.creditServices.metastep]);

            return;
          }

          if (this.creditServices.metastep === 2) {
            this.creditServices.done = true;
            this.mode++;
            this.inProgress = false;

            this.emit('done');
          }
        } catch (e) {
          logger.error(e);
        }
      });

      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' }).catch((err) => {
        logger.error(err);
      });

      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Rethrow the original error so its stack trace is not lost.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point: starts the scraper.
   *
   * @returns {Promise<void>}
   */
  async run() {
    await this.start();
  }
}

module.exports = PLScrape;