const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('(PL)');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Shared implementation for the agents / branches tabs.
 *
 * Takes the first child of the table matched by `selector`, and for every
 * child of that element runs `scraper.rowReducer` over the `tr` elements
 * found inside it.
 *
 * @param {PLScrape} scraper - instance providing `rowReducer`
 * @param {string} html - page markup
 * @param {string} selector - cheerio selector of the table to reduce
 * @returns {Promise<Array>} one `rowReducer` result per child, in document order;
 *   an empty array when the matched table has no children
 */
async function collectRowGroups(scraper, html, selector) {
  const $ = cheerio.load(html);
  const mainTable = $(selector);

  if ($(mainTable).children().length === 0) return [];

  const mainBody = $(mainTable).children()[0];
  const pending = [];

  $(mainBody).children().each((i, elm) => {
    pending.push(scraper.rowReducer($(elm).find('tr')));
  });

  // Await every reducer explicitly instead of relying on microtask ordering.
  return Promise.all(pending);
}

/**
 * Scraper for the registers published on knf.gov.pl / erup.knf.gov.pl.
 *
 * Work is split into three sequential modes (`this.mode`):
 *   0 - payment services (`this.paymentServices`)
 *   1 - e-money services (`this.emoneyServices`)
 *   2 - credit services  (`this.creditServices`)
 *
 * The flow is event driven: page loads trigger `processNewPage`, which routes
 * on the URL path and emits the events wired up in `attachEvents`.
 */
class PLScrape extends Scraper {

  /**
   * Registers the `done` listener, builds the throttled `run` entry point and,
   * in production, starts a run once `_checkLock()` resolves truthy.
   */
  constructor() {
    super();
    this.setID('PL');
    this.version = '0.0.1-1';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          // Previously an unhandled rejection.
          logger.error(err);
        });
  }

  /**
   * Reduces a set of table rows to an object. Only rows with exactly two
   * children are used: the first child's text (via `_makeFieldName`) becomes
   * the key, the second child's text (via `_cleanUp`) becomes the value.
   *
   * @param rows - cheerio selection of rows
   * @returns {Promise<Object|undefined>} the reduced object; `undefined` if an error was caught and logged
   */
  async rowReducer(rows) {
    try {
      const newObj = {};

      rows.each((i, elm) => {
        const children = cheerio(elm).children();

        if (children.length === 2) {
          // we want this data
          const label = this._makeFieldName(cheerio(children.eq(0)).text());
          newObj[label] = this._cleanUp(cheerio(children.eq(1)).text());
        }
      });

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reduces a set of elements to an array of their cleaned-up text.
   *
   * @param items - cheerio selection (list items)
   * @returns {Promise<Array|undefined>} the texts; `undefined` if an error was caught and logged
   */
  async reduceBullets(items) {
    try {
      const newArray = [];

      items.each((i, elm) => {
        newArray.push(this._cleanUp(cheerio(elm).text()));
      });

      return newArray;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the heading text of a credit-services entity page.
   *
   * @param {string} html - page markup
   * @returns {Promise<string|undefined>} cleaned heading, `''` when the heading
   *   element is absent, `undefined` if an error was caught and logged
   */
  async extractCSHeading(html) {
    try {
      const $ = cheerio.load(html);
      const rawHeading = $('#singleEtity > div > div > div.panel-heading > h2');

      if ($(rawHeading).length === 0) return '';

      return this._cleanUp($(rawHeading).text());
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the leading body text of a credit-services entity page: the text
   * of the first node inside `#singleEntityBody`, when that node's `name` is
   * one of the wanted names.
   *
   * @param {string} html - page markup
   * @returns {Promise<string|undefined>} cleaned text, `''` when nothing
   *   suitable is found, `undefined` if an error was caught and logged
   */
  async extractCSBodyText(html) {
    try {
      // NOTE(review): matching is done on `node.name`; text nodes may only
      // expose `type === 'text'`, in which case 'text' never matches - confirm.
      const wanted = ['b', 'text'];
      const $ = cheerio.load(html);
      const rawBody = $('#singleEntityBody');

      if ($(rawBody).length === 0) return '';

      const firstRow = $(rawBody).contents()[0];

      // An empty body has no first node; previously this threw a TypeError.
      if (firstRow === undefined) return '';

      if (wanted.includes(firstRow.name)) return this._cleanUp($(firstRow).text());

      return '';
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the `#singleEntityBody > table` rows as `[label, html]` pairs.
   * The label is the first cell's text, the value the second cell's inner
   * HTML, both passed through `_cleanUp`. Later duplicate labels overwrite
   * earlier ones (a Map is used while collecting).
   *
   * @param {string} html - page markup
   * @returns {Promise<Array<Array>|undefined>} array of `[label, text]` entries;
   *   `undefined` if an error was caught and logged
   */
  async extractCSTable(html) {
    try {
      const outMap = new Map([]);
      const $ = cheerio.load(html);
      const mainTable = $('#singleEntityBody > table');

      if ($(mainTable).children().length === 0) return [...outMap];

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();

      tableRows.each((i, elm) => {
        const rows = $(elm);
        const cells = $(rows).children();

        if (cells.length > 0) {
          const label = this._cleanUp($(cells).eq(0).text());
          const text = this._cleanUp($(cells).eq(1).html());
          outMap.set(label, text);
        }
      });

      return [...outMap];
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "activity" tab (`div#areatabs1_5`): for each child of the
   * table body, the text of its first `tr` (with the `Kraj` prefix removed)
   * becomes the key and the reduced `li` texts become the value.
   *
   * @param {string} html - page markup
   * @returns {Promise<Object|undefined>} object keyed by field name;
   *   `undefined` if an error was caught and logged
   */
  async extractEntityActivity(html) {
    try {
      const removeCountry = /(Kraj)\s+/g;
      const newObj = {};
      const $ = cheerio.load(html);
      const mainTable = $('div#areatabs1_5 table.tableDynamic');

      if ($(mainTable).children().length === 0) return newObj;

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();
      const pending = [];

      tableRows.each((i, elm) => {
        const rows = $(elm).find('tr');
        const listItems = $(elm).find('li');
        const rawCountryName = this._cleanUp($($(rows)[0]).text()).replace(removeCountry, '');
        const countryName = this._makeFieldName(rawCountryName);

        pending.push(this.reduceBullets(listItems).then((d) => {
          newObj[countryName] = d;
        }));
      });

      // Make sure every entry has been written before the object is returned.
      await Promise.all(pending);

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "branches" tab (`div#areatabs1_4`).
   *
   * @param {string} html - page markup
   * @returns {Promise<Array|undefined>} one reduced object per table-body child;
   *   `undefined` if an error was caught and logged
   */
  async extractEntityBranches(html) {
    try {
      return await collectRowGroups(this, html, 'div#areatabs1_4 table.tableDynamic');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "agents" tab (`div#areatabs1_3`).
   *
   * @param {string} html - page markup
   * @returns {Promise<Array|undefined>} one reduced object per table-body child;
   *   `undefined` if an error was caught and logged
   */
  async extractEntityAgents(html) {
    try {
      return await collectRowGroups(this, html, 'div#areatabs1_3 table.tableDynamic');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "services" tab (`#areatabs1_2`): a single key (from the
   * `.left` cell text) mapped to the list of `.container100 li` texts.
   *
   * @param {string} html - page markup
   * @returns {Promise<Object|undefined>} `{ [label]: string[] }`;
   *   `undefined` if an error was caught and logged
   */
  async extractEntityServices(html) {
    try {
      const newObj = {};
      const $ = cheerio.load(html);
      const rows = $('#areatabs1_2 > table tr');
      const label = this._makeFieldName($(rows).find('.left').text());

      newObj[label] = [];

      const listItems = $(rows).find('.container100 li');

      listItems.each((i, elm) => {
        newObj[label].push(this._cleanUp($(elm).text()));
      });

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the "details" tab (`div#areatabs1_1`) as key/value pairs, using
   * only rows that have exactly two children.
   *
   * @param {string} html - page markup
   * @returns {Promise<Object|undefined>} the reduced object;
   *   `undefined` if an error was caught and logged
   */
  async extractEntityDetails(html) {
    try {
      const newObj = {};
      const $ = cheerio.load(html);
      const rows = $('div#areatabs1_1 tr');

      rows.each((i, elm) => {
        const children = cheerio(elm).children();

        if (children.length === 2) {
          // we want this data
          const label = this._makeFieldName($(children.eq(0)).text());
          newObj[label] = this._cleanUp($(children.eq(1)).text());
        }
      });

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * First visit of an index page: reads the paging summary, stores the
   * current page, page count and total entry count on `serviceObject`, marks
   * it visited and takes a screenshot. Errors are caught and logged.
   *
   * @param {Object} serviceObject - per-mode state object (mutated)
   * @returns {Promise<void>}
   */
  async entityIndexFirstPass(serviceObject) {
    try {
      // breaks up `1/146 (1455)`
      const breaker = /(\d+)/g;
      const body = await this.page.content();
      const $ = cheerio.load(body);
      const subjectsInfo = $($('.infoNavigation').contents()[2]).text();
      const brokenString = subjectsInfo.match(breaker);

      // `match` yields null when there are no digits; fail with a clear message.
      if (brokenString === null) throw new Error(`Could not parse paging info: "${subjectsInfo}"`);

      const currentPageIndex = parseInt(brokenString[0], 10);
      const currentPageMax = parseInt(brokenString[1], 10);
      const currentIndexLength = parseInt(brokenString[2], 10);

      logger.info(`First pass on the ${this.modeTitles[this.mode]} index...`);

      serviceObject.currentIndexLength = currentIndexLength;
      serviceObject.currentPageMax = currentPageMax;
      serviceObject.currentPageIndex = currentPageIndex;
      serviceObject.visited = true;
      serviceObject.currentIndex = url.parse(await this.page.url());
      serviceObject.currentMetaIndex = 0;

      const entityName = `${this.modeNames[this.mode]}_${currentPageIndex}`;
      const filePath = await this._makeFilePath(entityName);

      await this._makeScreenshotV2(this.page, filePath, null);
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Handles the credit-services search result index: either moves to the
   * next result page (every 20th step, once per page) or opens the entry for
   * the current step. Emits `pageChanged` after the click. Errors are caught
   * and logged.
   *
   * @param {Object} serviceObject - per-mode state object (mutated)
   * @returns {Promise<void>}
   */
  async processCSEntityIndex(serviceObject) {
    try {
      const mouseDownDuration = Scraper.notARobot();

      // NOTE(review): 20 looks like the number of results per page - confirm.
      serviceObject.currentMetaIndex = serviceObject.step % 20;

      if ((serviceObject.step > 0) && (serviceObject.currentMetaIndex === 0) && (serviceObject.restart === true)) {
        logger.debug('Maxed out this page..');
        serviceObject.restart = false;

        const nextPage = await this.page.waitForSelector('#nextPage', { 'visible': true, 'timeout': 90000 });

        logger.debug('Proceeding to next index page..');
        await nextPage.click({ 'delay': Scraper.notARobot() });
        this.emit('pageChanged');

        return;
      }

      logger.info(`Dealing with ${serviceObject.step + 1} of ${serviceObject.currentIndexLength}`);
      serviceObject.restart = true;

      const wantedRow = await this.page.$$(`#searchEntitites > div:nth-child(${serviceObject.currentMetaIndex + 1}) a`);
      const text = await this.page.evaluate(el => el.innerText, wantedRow[0]);

      await this._randomWait(this.page, 2, 3);

      // Awaited so that a failed click is caught below rather than left unhandled.
      await wantedRow[0].click({ 'delay': mouseDownDuration });

      serviceObject.current = { 'name': text };
      this.emit('pageChanged');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Handles the entity index table: performs the first pass when needed, then
   * either advances to the next page of the table or selects the row for the
   * current step and opens its detail view. Emits `backoff` when interacting
   * with the page fails and `serviceDone` when the last page is exhausted.
   *
   * @param {Object} serviceObject - per-mode state object (mutated)
   * @returns {Promise<void>}
   */
  async processEntityIndex(serviceObject) {
    // #j_idt64-tableViewS > tbody > tr:nth-child(1) > td.tableViewNoR
    logger.debug('###### processEntityIndex');

    try {
      const fields = ['count', 'referenceNumber', 'typeOfEntity', 'name', 'registrationNumber', 'nip', 'date'];
      const mouseDownDuration = Scraper.notARobot();

      logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);

      if (serviceObject.visited === false) {
        logger.debug('Preparing...');
        await this.entityIndexFirstPass(serviceObject);
      }

      // Rows-per-page, read from the paging dropdown.
      const pageSizeSelect = await this.page.waitForSelector('#j_idt64-tableViewS > tfoot > tr > td > select', { 'visible': true, 'timeout': 90000 });
      const rawValue = await pageSizeSelect.getProperty('value');
      const pageMaxContent = parseInt(await rawValue.jsonValue(), 10);

      logger.debug('pageMaxContent', pageMaxContent);

      if (serviceObject.visited !== true) return;

      serviceObject.currentMetaIndex = serviceObject.step % pageMaxContent;

      if (serviceObject.step >= (serviceObject.currentPageIndex * pageMaxContent)) {
        logger.debug('Maxed out this page..');

        try {
          const nextButton = await this.page.waitForSelector('#j_idt64-tableViewS-recordsGoToNext', { 'visible': true, 'timeout': 90000 });

          // An ElementHandle has no `disabled` property of its own; the DOM
          // property has to be read inside the page.
          const isDisabled = await this.page.evaluate(el => el.disabled === true, nextButton);

          if (!isDisabled) {
            // we need a click..
            serviceObject.visited = false;
            await this._randomWait(this.page, 1, 2);
            await nextButton.click({ 'delay': mouseDownDuration });
          }
          else {
            logger.debug('Check if we should be done:', serviceObject.step, serviceObject.currentIndexLength);

            if (serviceObject.step >= serviceObject.currentIndexLength) {
              logger.debug('processEntityIndex Done here...');
              this.emit('serviceDone');
            }
          }
        }
        catch (err) {
          logger.error(err);
          this.emit('backoff');
        }

        return;
      }

      logger.info(`Dealing with ${serviceObject.step + 1} of ${serviceObject.currentIndexLength}`);

      const elmStr = `table#j_idt64-tableViewS tbody tr:nth-child(${serviceObject.currentMetaIndex + 1})`;

      try {
        const row = await this.page.waitForSelector(elmStr, { 'visible': true, 'timeout': 90000 });

        // Hover failures are tolerated; they are only logged.
        await row.hover().catch((err) => {
          logger.warn(err);
        });
        await row.focus();

        const htmlRow = await this.page.evaluate(el => el.outerHTML, row);
        const $ = cheerio.load(`${htmlRow}\n`);
        const cells = $('td');

        serviceObject.current = {};
        cells.each((index, item) => {
          serviceObject.current[fields[index]] = $(item).text();
        });

        await this._randomWait(this.page, 2, 3);
        await row.click({ 'delay': mouseDownDuration });
        await this._findAndClick('#j_idt112 > input.button');
      }
      catch (err) {
        logger.error(err);
        this.emit('backoff');
      }
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Waits for the "Lista podmiotów" button and clicks it. On failure the
   * error is logged and uploaded, and `stall` is emitted.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    try {
      await this._randomWait(this.page, 3, 5, 'handleIntroPage');

      const button = await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible': true, 'timeout': 90000 });

      await button.click({ 'delay': Scraper.notARobot() });
    }
    catch (err) {
      logger.warn('!!!!!');
      logger.error(err);
      await this._uploadError();
      this.emit('stall');
    }
  }

  /**
   * Routes the credit-services search page: emits `startcs` until the search
   * has been started, then `processCSEntityIndex` when the URL has no hash
   * (index view) or `entityDetail` when it has one.
   *
   * @returns {Promise<void>}
   */
  async csIndexHandler() {
    try {
      const pageUrl = url.parse(await this.page.url());

      if (this.creditServices.started === false) {
        this.emit('startcs');

        return;
      }

      const onIndex = pageUrl.hash === null || pageUrl.hash === '#';

      this.emit(onIndex ? 'processCSEntityIndex' : 'entityDetail');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Scrapes a credit-services entity page (hash, heading, body text, table),
   * merges the result into `serviceObject.current`, takes a screenshot and
   * emits `entityComplete`. Errors are caught and logged.
   *
   * @param {Object} serviceObject - per-mode state object (mutated)
   * @returns {Promise<void>}
   */
  async processCSEntityDetail(serviceObject) {
    try {
      logger.info(`Process ${serviceObject.current.name}`);

      const newObj = {};
      const pageUrl = url.parse(await this.page.url());
      const hash = (pageUrl.hash || '').replace('#', '');
      const entityName = `${serviceObject.current.name}_${hash}`;
      const fileName = this._makeFileName(entityName);
      const filePath = await this._makeFilePath(entityName);

      serviceObject.current.fileName = fileName;

      const body = await this.page.content();

      newObj.hash = hash;
      newObj.heading = await this.extractCSHeading(body);
      newObj.bodytext = await this.extractCSBodyText(body);
      newObj.table = await this.extractCSTable(body);

      serviceObject.current = Object.assign(serviceObject.current, newObj);

      await this._makeScreenshotV2(this.page, `${filePath}`, null);

      this.emit('entityComplete');
      logger.info('Entity complete...');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Scrapes an entity detail page (details, services, agents, branches,
   * activity), merges the result into `serviceObject.current`, screenshots
   * every tab that exists and emits `entityComplete`. If the index has not
   * been visited yet, emits `handleEntityIndex` instead and returns.
   *
   * @param {Object} serviceObject - per-mode state object (mutated)
   * @returns {Promise<void>}
   */
  async processEntityDetail(serviceObject) {
    logger.debug('-----> processEntityDetail');

    try {
      const tabs = [
        { 'id': '', 'name': 'details' },
        { 'id': 'div#tabs1_2', 'name': 'services' },
        { 'id': 'div#tabs1_3', 'name': 'agents' },
        { 'id': 'div#tabs1_4', 'name': 'branches' },
        { 'id': 'div#tabs1_5', 'name': 'activity' }
      ];

      if (serviceObject.visited === false) {
        logger.debug('Process the menu correctly');
        this.emit('handleEntityIndex');

        return;
      }

      logger.debug('====== processEntityDetail ----->');
      logger.info(`Process ${this.modeTitles[this.mode]} // ${serviceObject.current.name}`);

      const newObj = {};
      const entityName = `${serviceObject.current.name}_${serviceObject.current.nip}`;
      const fileName = this._makeFileName(entityName);
      const filePath = await this._makeFilePath(entityName);

      serviceObject.current.fileName = fileName;

      const body = await this.page.content();

      newObj.details = await this.extractEntityDetails(body);
      newObj.services = await this.extractEntityServices(body);
      newObj.agents = await this.extractEntityAgents(body);
      newObj.branches = await this.extractEntityBranches(body);
      newObj.activity = await this.extractEntityActivity(body);

      serviceObject.current = Object.assign(serviceObject.current, newObj);

      for (const item of tabs) {
        // The first tab has no id and is never clicked.
        if (item.id === '') continue;

        const tabExists = await this.page.$$(item.id);

        if (tabExists.length > 0) {
          await this._findAndClick(item.id);
          await this._makeScreenshotV2(this.page, `${filePath}_${item.name}`, null);
          await this._microWait(this.page, 15);
        }
      }

      this.emit('entityComplete');
      logger.info('Entity complete...');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Persists the current entity as JSON, records a link to it, advances
   * `serviceObject.step`, clears the stall timer and either navigates back to
   * the index for the next entity or emits `serviceDone`. Emits `backoff` if
   * anything fails.
   *
   * @param {Object} serviceObject - per-mode state object (mutated)
   * @returns {Promise<void>}
   */
  async entityCompleter(serviceObject) {
    try {
      const filename = serviceObject.current.fileName;
      const filePath = `${this.path}/${filename}`.substring(0, 240);

      logger.info(`Saving: ${filename}.json`);

      const newLink = { 'name': serviceObject.current.name, 'fileName': `${filename}.json` };

      // Payment service
      if (this.mode === 0) newLink.nip = serviceObject.current.nip;

      // Credit Institute
      if (this.mode === 2) newLink.hash = serviceObject.current.hash;

      serviceObject.links.push(newLink);

      await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
      await this._randomWait(this.page, 10, 15, 'Throttled');

      serviceObject.step++;

      clearTimeout(this.stall);
      this.stall = 0;

      logger.debug('>> this.mode:', this.mode);
      logger.debug('>> serviceObject.step:', serviceObject.step);
      logger.debug('>> serviceObject.currentIndexLength:', serviceObject.currentIndexLength);

      if (serviceObject.step >= serviceObject.currentIndexLength) {
        this.emit('serviceDone');

        return;
      }

      serviceObject.current = {};

      // 2019-05-08 :: THIS BIT BROKE TODAY
      if (this.mode === 0)
        await this._findAndClick('#allByJS > tbody > tr:nth-child(8) > td > span:nth-child(2) > input');
      else {
        await this._findAndClick('#previousSearchPage');
        this.emit('pageChanged');
      }
    }
    catch (err) {
      logger.error(err);
      this.emit('backoff');
    }
  }

  /**
   * Handles a page offering a downloadable file: screenshots it, allows
   * downloads into `this.path`, clicks the download element and emits
   * `serviceDone`. Errors are caught and logged.
   *
   * @returns {Promise<void>}
   */
  async handleXLSDownload() {
    try {
      const entityName = `${this.modeNames[this.mode]}_main`;
      const filePath = await this._makeFilePath(entityName);

      await this._makeScreenshotV2(this.page, filePath, null);
      await this._randomWait(this.page, 3, 6);
      await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

      // Deliberately not awaited (as before); the rejection is now logged
      // instead of being left unhandled.
      this._findAndClick('body > section.article-view > div > div > div.col-xs-12.col-lg-9.article-content.pl50-lg > div.row.mb30 > div > div').catch((err) => {
        logger.error(err);
      });

      await this._randomWait(this.page, 3, 6);

      this.emit('serviceDone');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Counts the direct children of `#searchEntitites` on the current page.
   *
   * @returns {Promise<number|undefined>} the count; `undefined` if an error was caught and logged
   */
  async countCSRows() {
    try {
      const body = await this.page.content();
      const $ = cheerio.load(body);
      const searchEntitites = $('#searchEntitites');

      return $(searchEntitites).children().length;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Starts the credit-services search: selects the wanted category, submits
   * the search, dismisses the cookie band if present, then records the
   * result count and emits `pageChanged`. Errors are caught and logged.
   *
   * @returns {Promise<void>}
   */
  async startcs() {
    try {
      const options = await this.page.$$('#selectCategory option');
      const wantedOption = ['Działalność transgraniczna podmiotów krajowych'];

      for (const item of options) {
        const text = await this.page.evaluate(el => el.innerText, item);
        const value = await this.page.evaluate(el => el.value, item);

        if (wantedOption.includes(text)) {
          await this.page.select('#selectCategory', value);
          break;
        }
      }

      await this._randomWait(this.page, 1, 2);
      await this._findAndClick('#searchButton');

      // The cookie band is optional: a timeout (or failed click) is expected.
      try {
        const cookieClose = await this.page.waitForSelector('#band-cookies-close', { 'timeout': 7500 });

        await cookieClose.click({ 'delay': 90 });
      }
      catch (err) {
        logger.info('No cookie band...');
      }

      const counter = await this.page.waitForSelector('#searchresults > div.searchresults-counter.border-top.text-uppercase.text-center > p > span', { 'visible': true, 'timeout': 90000 });
      const count = await this.page.evaluate(el => el.innerText, counter);

      this.creditServices.started = true;
      this.creditServices.currentIndexLength = parseInt(count, 10);
      this.creditServices.currentPageLimit = await this.countCSRows();

      this.emit('pageChanged');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reduce the Article 70 Spans into an array
   *
   * @param {string} html - markup containing `span` elements
   * @returns {Array|undefined} text of every span; `undefined` if an error was caught and logged
   */
  reduceArt70Spans(html) {
    try {
      const output = [];
      const $ = cheerio.load(html);
      const spans = $('span');

      spans.each((i, item) => {
        output.push($(item).text());
      });

      return output;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reduce the Article 70 data: for every table, the second `td` supplies the
   * title and the spans of the fourth `td` supply the values. Tables without
   * a fourth `td` are skipped.
   *
   * @param {string} html - markup of the entity table
   * @returns {Promise<Array<Array>|undefined>} array of `[title, string[]]` entries;
   *   `undefined` if an error was caught and logged
   */
  async reduceArt70(html) {
    try {
      const outMap = new Map([]);
      const $ = cheerio.load(html);
      const tables = $('table');

      // Synchronous on purpose: an async callback here would turn any throw
      // into an unhandled rejection that the surrounding try/catch cannot see.
      tables.each((i, itm) => {
        const rows = $(itm).find('td');
        const title = this._cleanUp($($(rows)[1]).text());
        const spans = $(rows)[3];

        if ($(spans).length > 0) {
          const reducedTable = this.reduceArt70Spans($(spans).html());
          outMap.set(title, reducedTable);
        }
      });

      return [...outMap];
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Scrapes the Article 70 page: reduces `table#entityTable`, screenshots the
   * page, writes the result as JSON and emits `serviceDone`. Errors are
   * caught and logged.
   *
   * @returns {Promise<void>}
   */
  async processArt70() {
    try {
      const table = await this.page.waitForSelector('table#entityTable', { 'visible': true, 'timeout': 90000 });
      const html = await this.page.evaluate(el => el.outerHTML, table);
      const activities = await this.reduceArt70(html);
      const entityName = `${this.modeNames[this.mode]}_article70`;
      const filePath = await this._makeFilePath(entityName);

      await this._makeScreenshotV2(this.page, filePath, null);

      logger.info(`Saving: ${entityName}.json`);
      await jsonfile.writeFile(`${filePath}.json`, activities);

      this.emit('serviceDone');
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Waits for the category selector to be visible, then starts the
   * credit-services search. Rejects if the selector never appears.
   *
   * @returns {Promise<void>}
   */
  async handleStartcs() {
    await this.page.waitForSelector('#selectCategory', { 'visible': true, 'timeout': 90000 });
    await this.startcs();
  }

  /**
   * Prepares the entity index before processing it:
   *  - if the "Lista podmiotów" button is showing, clicks it and stops;
   *  - if the paging dropdown is still at 10, switches it to 200 and stops;
   *  - otherwise processes the index for `this.paymentServices`.
   *
   * @returns {Promise<void>}
   */
  async handleEntityIndex() {
    let doIndex = false;

    try {
      const listButton = await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible': true, 'timeout': 3500 });

      logger.warn('Sent back to the main selector screen');
      await listButton.click({ 'delay': 90 });
      doIndex = false;
    }
    catch (err) {
      // No "show all" button: we are already on the index.
      doIndex = true;
    }

    if (!doIndex) return;

    try {
      const pager = await this.page.waitForSelector('tfoot > tr > td > select', { 'visible': true, 'timeout': 90000 });
      const rawValue = await pager.getProperty('value');
      const value = await rawValue.jsonValue();

      logger.debug('Dropdown value', value);

      if (parseInt(value, 10) === 10) {
        doIndex = false;
        await this.page.select('tfoot > tr > td > select', '200');
        logger.debug('Drop down changed..');
      }
    }
    catch (err) {
      logger.debug('There was no paging drop down??');
    }

    if (doIndex)
      await this.processEntityIndex(this.paymentServices).catch(async (err) => {
        logger.error('processEntityIndex catch: ', err);
        this.emit('restart');
      });
  }

  /**
   * Dispatches detail-page processing according to `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async handleEntityDetail() {
    switch (this.mode) {
      case 1:
        await this.processEntityDetail(this.emoneyServices);
        break;
      case 2:
        await this.processCSEntityDetail(this.creditServices);
        break;
      case 0:
      default:
        await this.processEntityDetail(this.paymentServices);
        break;
    }
  }

  /**
   * Dispatches entity completion according to `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async handleEntityComplete() {
    switch (this.mode) {
      case 1:
        await this.entityCompleter(this.emoneyServices);
        break;
      case 2:
        await this.entityCompleter(this.creditServices);
        break;
      case 0:
      default:
        await this.entityCompleter(this.paymentServices);
        break;
    }
  }

  /**
   * Checks the page for two "blocked" conditions: the service-unavailable
   * message, and an empty body. Each check passes when its `waitForFunction`
   * times out (i.e. the condition was not observed within 2.5s).
   *
   * @returns {Promise<boolean>} `true` when both checks pass
   */
  async bouncerCheck() {
    let canProceed = 0;
    let msg = 'Bouncer: ';

    await this.page.waitForFunction(
      'document.querySelector("body").innerText.includes("Usługa chwilowo niedostępna. Przepraszamy.");'
      , { 'timeout': 2500 }).then(() => {
      msg += '❌';
    }).catch(() => {
      msg += '✔️';
      canProceed++;
    });

    await this.page.waitForFunction(
      'document.querySelector("body").innerText.length===0'
      , { 'timeout': 2500 }).then(() => {
      msg += '❌';
    }).catch(() => {
      msg += '✔️';
      canProceed++;
    });

    if (canProceed === 2)
      logger.debug(msg);
    else
      logger.warn(msg);

    return (canProceed === 2);
  }

  /**
   * Central router, run after a page load / `pageChanged`. Emits `backoff`
   * for Chrome error pages or when the bouncer check fails, otherwise
   * dispatches on the URL path (with any `;jsessionid=` suffix removed).
   *
   * @returns {Promise<void>}
   * @throws {Error} for the bare `/` path, and for unknown paths when NODE_ENV is set
   */
  async processNewPage() {
    // give the page a few seconds to settle
    const removeJSession = /(;jsessionid=[0-9a-f]*)/g;

    await this._randomWait(this.page, 3, 5, 'processNewPage');

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('backoff');

      return;
    }

    const pathName = (pageUrl.pathname || '').replace(removeJSession, '');

    // pre check
    logger.debug('Hit:', pathName);

    const canProceed = await this.bouncerCheck();

    if (!canProceed) {
      logger.warn('We have hit a bouncer.. Back off for a bit');
      this.emit('backoff');

      return;
    }

    switch (pathName) {
      case '/View/':
      case '/View/faces/start2OuterView.xhtml':
      case '/View/faces/dataEdit.xhtml':
        await this.handleEntityIndex();
        break;

      case '/View/faces/subjectsList.xhtml':
        this.emit('entityDetail');
        break;

      case '/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych':
      case '/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych':
        await this.handleXLSDownload();
        break;

      case '/podmioty/wyszukiwarka_podmiotow':
        await this.csIndexHandler();
        break;

      case '/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi':
        await this.processArt70();
        break;

      case '/View/redirect2OuterView.jsp':
        logger.warn('Letting this page transition...');
        // do nothing
        // let the page transition
        break;

      case '/':
        // The chrome-error page has already been handled above.
        throw new Error(`Bad page: ${pageUrl.href}`);

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }

        logger.warn('processNewPage Fell through');
        logger.warn('pathName', pathName);
        logger.warn('currentPage.location', pageUrl);
        break;
    }
  }

  /**
   * Navigates back to the start URL of the current mode. In mode 0 the
   * payment-services `visited` flag and current entity are reset first.
   *
   * @returns {Promise<void>}
   */
  async _restart() {
    logger.warn(`Tryng to restart ${this.modeTitles[this.mode]}`);

    if (this.mode === 0) {
      logger.debug('Clearing current object..');
      this.paymentServices.visited = false;
      this.paymentServices.current = {};
    }

    switch (this.mode) {
      case 1:
        await this._goto(this.emoneyServices.urls[0]);
        break;
      case 2:
        await this._goto(this.creditServices.urls[this.creditServices.metastep]);
        break;
      case 0:
      default:
        await this._goto(this.paymentServices.urls[0]);
        break;
    }
  }

  /**
   * Schedules a `restart` after a fixed five-minute pause. `backOffStep` is
   * still tracked (capped at `backOffLimit`) but does not currently scale the
   * delay.
   *
   * @returns {Promise<void>}
   */
  async backoff() {
    this.backOffStep++;

    clearTimeout(this.stall);
    this.stall = 0;
    this.stalled = false;

    if (this.backOffStep > this.backOffLimit) this.backOffStep = this.backOffLimit;

    // Fixed delay; the stepped variant would be (this.backOffStep * 5) * 60000.
    const timeout = 300000;

    // Report the delay that is actually used.
    logger.warn(`Backing off for ${timeout / 60000} minutes..`);
    logger.warn('timeout', timeout);

    // Never leave two restart timers pending at once.
    clearTimeout(this.backOffTimer);
    this.backOffTimer = setTimeout(() => {
      this.emit('restart');
    }, timeout);
  }

  /**
   * Wires up the internal event flow. Handlers that start asynchronous work
   * log rejections rather than leaving them unhandled.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    const logFailure = (label) => (err) => {
      logger.error(label, err);
    };

    this.on('pageChanged', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    this.on('stall', () => {
      this.emit('backoff');
    });

    this.on('backoff', this._debounce(() => {
      this.backoff();
    }, 10000));

    this.on('restart', () => {
      this._restart().catch(logFailure('restart fail'));
    });

    this.on('entityComplete', () => {
      this.handleEntityComplete().catch(logFailure('handleEntityComplete fail'));
    });

    this.on('handleEntityIndex', () => {
      this.handleEntityIndex().catch(logFailure('handleEntityIndex fail'));
    });

    this.on('entityDetail', () => {
      this.handleEntityDetail().catch(logFailure('handleEntityDetail fail'));
    });

    this.on('startcs', () => {
      this.handleStartcs().catch(logFailure('handleStartcs fail'));
    });

    this.on('processCSEntityIndex', async () => {
      await this.processCSEntityIndex(this.creditServices).catch(() => {
        this.emit('backoff');
      });
    });

    this.on('serviceDone', () => {
      switch (this.mode) {
        case 0:
          this.emit('paymentServicesDone');
          break;
        case 1:
          this.emit('emoneyServicesDone');
          break;
        case 2:
          this.emit('creditServicesDone');
          break;
      }
    });

    this.on('paymentServicesDone', async () => {
      logger.warn('paymentServicesDone');

      try {
        this.paymentServices.done = true;

        jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
        jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

        this.mode++;
        this.inProgress = false;

        await this._goto(this.emoneyServices.urls[0]);
      }
      catch (e) {
        logger.error(e);
      }
    });

    this.on('emoneyServicesDone', async () => {
      logger.warn('emoneyServicesDone');

      try {
        this.emoneyServices.done = true;
        this.mode++;
        this.inProgress = false;

        await this._goto(this.creditServices.urls[0]);
      }
      catch (e) {
        logger.error(e);
      }
    });

    // Credit services walk through three URLs, tracked by `metastep`.
    this.on('creditServicesDone', async () => {
      logger.warn('creditServicesDone');

      try {
        if (this.creditServices.metastep === 0) {
          jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
          jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);

          this.creditServices.metastep++;
          await this._goto(this.creditServices.urls[this.creditServices.metastep]);

          return;
        }

        if (this.creditServices.metastep === 1) {
          this.creditServices.metastep++;
          await this._goto(this.creditServices.urls[this.creditServices.metastep]);

          return;
        }

        if (this.creditServices.metastep === 2) {
          this.creditServices.done = true;
          this.mode++;
          this.inProgress = false;

          this.emit('done');
        }
      }
      catch (e) {
        logger.error(e);
      }
    });
  }

  /**
   * Resets all per-run state, prepares the browser page, attaches the event
   * handlers (once) and navigates to the payment-services start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} any error raised during start-up
   */
  async start() {
    super._start();

    try {
      this.mode = 0;
      this.backOffStep = 0;
      this.backOffLimit = 3;
      this.backOffTimer = null;
      this.stall = null;
      this.stalled = false;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://erup.knf.gov.pl/View/'],
        'sections': [],
        'sectionLinks': [],
        'brokenReturn': false,
        'started': 0
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.knf.gov.pl/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych'],
        'sections': [],
        'sectionLinks': []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': [
          'https://www.knf.gov.pl/podmioty/wyszukiwarka_podmiotow',
          'https://www.knf.gov.pl/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych',
          'https://www.knf.gov.pl/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi'
        ],
        'sections': [],
        'sectionLinks': [],
        'restart': false,
        'metastep': 0
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/PL/KNF`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Only attach the handlers once across repeated starts.
      if (this.eventNames().length === 2) await this.attachEvents();

      const now = new Date();

      this.paymentServices.started = now.getTime();

      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' }).catch((err) => {
        logger.error(err);
      });

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack is preserved.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Run entry point used by the throttled `run` wrapper.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }

}

module.exports = PLScrape;