const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('SK');
const url = require('url');
const camelCase = require('camelcase');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the subject register served from https://subjekty.nbs.sk/.
 *
 * Navigation is page driven: every `domcontentloaded` event ends up in
 * `processNewPage`, which inspects the `aa` query parameter and hands over to
 * the intro / sector menu / entity index / entity detail handler.
 * Three service groups (payment, e-money, credit) are walked one after the
 * other; `this.mode` (0, 1, 2) selects the group currently being processed.
 */
class SKScrape extends Scraper {

  constructor() {
    super();
    this.id = 'SK';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock().then((l) => {
        if (l) this.run();
      }).catch((err) => {
        // Never leave the lock check floating: surface the failure in the log.
        logger.error('Lock check failed', err);
      });
    }
  }

  /**
   * Looks at the language icon in the subject form header and, when its `src`
   * is the English flag icon, clicks the surrounding link.
   *
   * @returns {Promise<boolean>} `true` when the language link was clicked, `false` otherwise
   */
  async checkChangeLanguage() {
    const languageIcon = await this.page.$$('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a > img');

    if (languageIcon.length > 0) {
      const value = await this.page.evaluate(el => el.getAttribute('src'), languageIcon[0]);

      if (value === '/static/icon/ico_en.gif') {
        // this needs a click
        logger.info('Changing language to English..');
        await this._findAndClick('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a ');

        return true;
      }
    }

    return false;
  }

  /**
   * Handles a page without an `aa` parameter: dismisses the cookie bar if it
   * shows up and, when no run is in progress and the URL has no query string,
   * either switches the language or clicks through to the next page.
   *
   * @returns {Promise<void>}
   */
  async handleIntroPage() {
    const pageUrl = url.parse(await this.page.url());

    // Clear cookie bar
    await this.page.waitForSelector('a.btnCookieAccept', { 'visible': true, 'timeout': 7500 }).then(async (elm) => {
      await elm.click({ 'delay': Scraper.notARobot() });
    }).catch(() => {
      logger.info('No cookie bar');
    });

    if (!this.inProgress && pageUrl.query === null) {
      // fix language before going on
      const changedLanguage = await this.checkChangeLanguage();

      if (!changedLanguage) {
        await this._randomWait(this.page, 3, 5, 'handleIntroPage');
        await this._findAndClick(' body > div.container > div:nth-child(5) > div:nth-child(1) > div > div');
      }
    }
  }

  /**
   * Expands the four top-level category rows and clicks the row whose
   * `data-sector` attribute equals `serviceObject.sections[serviceObject.indexStep]`.
   *
   * @param {object} serviceObject - state object of the service group being processed (built in `start`)
   * @returns {Promise<void>}
   */
  async processMainMenu(serviceObject) {
    const wantedItem = serviceObject.sections[serviceObject.indexStep];
    const expandables = [
      '#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl1',
      '#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl2',
      '#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl3',
      '#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl4'
    ];

    for (const item of expandables) {
      await this.page.$eval(item, e => e.click({ 'delay': 90 }));
    }

    await this._randomWait(this.page, 3, 5);

    const wantedRow = `[data-sector="${wantedItem}"]`;
    logger.debug('Looking for', wantedRow);

    await this.page.waitForSelector(wantedRow, { 'visible': true, 'timeout': 7500 }).then(async (elm) => {
      await elm.click({ 'delay': Scraper.notARobot() });
    }).catch(() => {
      logger.warn('processMainMenu did not find what it was looking for!');
    });
  }

  /**
   * Reads the paging summary in `#Subjects_info` and synchronises
   * `serviceObject` with it.
   *
   * When the first number of the summary equals `serviceObject.step + 1` (or
   * the table is empty) the paging values are stored and the index is marked
   * as visited. Otherwise the "previous" pager button is clicked (unless it is
   * disabled) and `entityIndex` is emitted so the index is examined again.
   *
   * @param {object} serviceObject - state object of the service group being processed
   * @returns {Promise<void>}
   */
  async entityIndexFirstPass(serviceObject) {
    // breaks up `Showing 1 to 10 of 12 entries`
    const breaker = /(\d+)/g;

    const body = await this.page.content();
    const $ = cheerio.load(body);

    const subjectsInfo = $('#Subjects_info').text();
    const brokenString = subjectsInfo.match(breaker);

    // `match` yields null when the summary holds no digits at all; bail out
    // with a meaningful message instead of failing on `brokenString[0]`.
    if (brokenString === null || brokenString.length < 3) {
      logger.warn('entityIndexFirstPass could not read paging info from', subjectsInfo);

      return;
    }

    const currentPageIndex = parseInt(brokenString[0], 10);
    const currentPageMax = parseInt(brokenString[1], 10);

    // The site returns the index from the last page when you select a different view.
    // This should be watched and can cause a problem
    logger.debug('subjectsInfo', subjectsInfo);
    logger.debug('Step', serviceObject.step);
    logger.debug('currentPageIndex', currentPageIndex);

    const onExpectedPage = (currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1));
    const emptyTable = currentPageIndex === 0 && currentPageMax === 0;

    if (onExpectedPage || emptyTable) {
      serviceObject.currentIndexLength = parseInt(brokenString[2], 10);
      serviceObject.currentPageMax = currentPageMax;
      serviceObject.visited = true;
      serviceObject.currentIndex = url.parse(await this.page.url());
      serviceObject.currentMetaIndex = 0;
    } else {
      logger.info('Need to click previous');

      const previousButton = await this.page.$$('#Subjects_previous');
      const buttonClasses = await this.page.$eval('#Subjects_previous', e => e.getAttribute('class'));

      if (!buttonClasses.split(' ').includes('disabled')) {
        // we need a click.. (awaited so a failed click is not an unhandled rejection)
        await previousButton[0].click({ 'delay': 90 });
        await this._randomWait(this.page, 3, 5);
        serviceObject.visited = false;
        this.emit('entityIndex');
      }
    }
  }

  /**
   * Processes the entity index table (`#Subjects`).
   *
   * On the first visit of a page the paging state is read via
   * `entityIndexFirstPass`. Afterwards either the next page is requested
   * (emitting `entityIndex`), `serviceDone` is emitted when the "next" button
   * is disabled, or the row for the current step is captured into
   * `serviceObject.current` and clicked.
   *
   * @param {object} serviceObject - state object of the service group being processed
   * @returns {Promise<void>}
   */
  async processEntityIndex(serviceObject) {
    const fields = ['referenceNumber', 'businessName', 'address', 'start', 'end', 'reason'];
    const mouseDownDuration = Scraper.notARobot();

    if (serviceObject.visited === false) {
      logger.debug('Preparing...');

      await this.page.waitForSelector('table#Subjects', { 'visible': true }).then(async () => {
        await this.entityIndexFirstPass(serviceObject);
      }).catch((err) => {
        // Keep the original message but do not throw the cause away.
        logger.error('Table failed to render', err);
      });
    }

    if (serviceObject.visited === true) {
      serviceObject.currentMetaIndex = serviceObject.step % 10;

      if (serviceObject.step >= serviceObject.currentPageMax) {
        const nextButton = await this.page.$$('#Subjects_next');
        const buttonClasses = await this.page.$eval('#Subjects_next', e => e.getAttribute('class'));

        if (!buttonClasses.split(' ').includes('disabled')) {
          // we need a click.. (awaited so a failed click is not an unhandled rejection)
          await nextButton[0].click({ 'delay': mouseDownDuration });
          await this._randomWait(this.page, 3, 5);
          serviceObject.visited = false;
          this.emit('entityIndex');
        } else {
          logger.debug('I think we are done here...');
          this.emit('serviceDone');
        }
      } else {
        await this.page.waitForSelector('#Subjects > tbody');

        const wantedRow = await this.page.$$(`#Subjects > tbody > tr:nth-child(${serviceObject.currentMetaIndex + 1})`);
        const htmlRow = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);

        // A bare <tr> is not valid outside a table: a spec-compliant HTML
        // parser discards the row/cell tags, so give the row a table context.
        const $ = cheerio.load(`<table>${htmlRow}</table>`);
        const cells = $('td');

        serviceObject.current = {};
        cells.each((index, item) => {
          serviceObject.current[fields[index]] = $(item).text();
        });

        await this._randomWait(this.page, 3, 5);
        await wantedRow[0].click({ 'delay': mouseDownDuration });
      }
    }
  }

  /**
   * Turns every `tr` of the loaded fragment into one property: the first
   * cell (colon removed, cleaned, camel-cased) is the key, the cleaned second
   * cell is the value.
   *
   * @param {Function} $ - cheerio instance loaded with the basic-details table
   * @returns {Promise<object>} key/value pairs read from the table rows
   */
  async processEntityDetailBasicDetails($) {
    const newObj = {};
    const rows = $('tr');

    rows.each((index, elm) => {
      const children = $(elm).children();
      const preLabel = $(children).eq(0).text();
      const label = camelCase(this._cleanUp(preLabel.replace(':', '')));

      newObj[label] = this._cleanUp($(children).eq(1).text());
    });

    return newObj;
  }

  /**
   * Decodes the rows of every `table.details` found below `elm`. Only rows
   * whose first cell carries exactly the class `dlabel` are kept.
   *
   * @param {Function} $ - cheerio instance the element belongs to
   * @param {*} elm - element (or cheerio selection) to search in
   * @returns {object} camel-cased label -> cleaned cell text (a leading "Hide"/"View" is stripped)
   */
  decodeTable($, elm) {
    const rows = $(elm).find('table.details tr');
    const obj = {};

    rows.each((index, row) => {
      const children = $(row).children();
      const labelClass = $(children[0]).attr('class');
      const label = camelCase(this._cleanUp($(children[0]).text().replace(':', '').replace(',', '')));
      const contents = this._cleanUp($(children[1]).text().replace(/(Hide|View)\s*/, ''));

      if (labelClass === 'dlabel') obj[label] = contents;
    });

    return obj;
  }

  /**
   * Walks the rows of the first `tbody` of the loaded fragment and rebuilds
   * the nested licence structure from the row classes (`level0`, `level1`,
   * `level2`, `sublicctrl`, `details`).
   *
   * @param {Function} $ - cheerio instance loaded with the `#Licenses` markup
   * @returns {Promise<object[]>} one object per top-level (`level0 sublicctrl`) row
   */
  async processEntityDetailTableV2($) {
    // take the first tbody as this is the main one...
    const fields = ['license', 'start', 'end', 'reason'];
    const outData = [];
    let newObj = {};
    let topLevel = '';
    let midLevel = {};
    let level1ID = '';

    const tbody = $('tbody')[0];
    const children = $(tbody).children();

    children.each((index, item) => {
      // A row without a class attribute yields `undefined`; treat it as "no classes".
      const itemClasses = ($(item).attr('class') || '').split(' ');
      const has = name => itemClasses.includes(name);

      if (has('level0') && has('sublicctrl')) {
        // TOP LEVEL
        const itemChildren = $(item).children();

        if (Object.keys(newObj).length !== 0) {
          // push this object into the list
          outData.push(newObj);
          newObj = {};
        }

        topLevel = camelCase(this._cleanUp($(itemChildren[0]).text().replace(',', '')));
        midLevel = {};

        itemChildren.each((ci, celm) => {
          midLevel[fields[ci]] = this._cleanUp($(celm).text());
        });

        midLevel.detail = [];
        newObj[topLevel] = Object.assign({}, midLevel);
      }

      if (has('level0') && has('details')) {
        // TOP LEVEL - DETAILS
        newObj[topLevel].detail.push(this.decodeTable($, item));
      }

      if (has('level1') && !has('details')) {
        // LEVEL 1
        const itemChildren = $(item).children();

        level1ID = camelCase(this._cleanUp($(itemChildren[0]).text()));
        newObj[topLevel][level1ID] = [];
      }

      if (has('level1') && has('details')) {
        // LEVEL 1 - DETAIL
        const table = this.decodeTable($, item);

        newObj[topLevel][level1ID].push(table);
      }

      if (has('level2') && !has('details')) {
        // LEVEL 2
        const itemChildren = $(item).children();
        const obj = {};

        itemChildren.each((ci, celm) => {
          obj[fields[ci]] = this._cleanUp($(celm).text());
        });

        const nexttable = $(item).next();
        obj.details = this.decodeTable($, nexttable);

        if (level1ID === '') {
          const newID = camelCase(this._cleanUp(obj.license.replace(',', '')));

          newObj[topLevel][newID] = [];
          newObj[topLevel][newID].push(obj);
        } else {
          // NOTE(review): level1ID is not reset when a new top-level row starts,
          // so it may still hold the id seen under a previous top level — confirm intended.
          if (!Object.prototype.hasOwnProperty.call(newObj[topLevel], level1ID)) newObj[topLevel][level1ID] = [];

          newObj[topLevel][level1ID].push(obj);
        }
      }
    });

    // insert final obj
    if (Object.keys(newObj).length !== 0) {
      // push this object into the list
      outData.push(newObj);
      newObj = {};
    }

    return outData;
  }

  /**
   * Processes an entity detail page: expands the collapsed sections, takes a
   * screenshot, extracts the basic details and the licence table into
   * `serviceObject.current` and finally hands over to `entityCompleter`.
   *
   * @param {object} serviceObject - state object of the service group being processed
   * @returns {Promise<void>}
   */
  async processEntityDetail(serviceObject) {
    // level0 sublicctrl sublicctrl1 odd
    // level0 sublicctrl sublicctrl1 odd sublicshow shown

    // expand all accordians
    const rows = await this.page.$$('tr.sublicctrl');

    for (const item of rows) {
      const cls = await this.page.evaluate(el => el.getAttribute('class'), item);

      if (!cls.includes('shown')) await item.click({ 'delay': Scraper.notARobot() });
    }

    await this.page.waitForSelector('#Licenses > tbody > tr.level1.shown.sublichide1.sllhidectrl.sllhidectrl1', { 'timeout': 7500 }).then(async (elm) => {
      await elm.click({ 'delay': Scraper.notARobot() });
    }).catch(() => {
      logger.debug('No License information');
    });

    await this._microWait(this.page, 5);

    // expand all viewable anchors
    const wantedAnchors = await this.page.$$('.row a');

    for (const item of wantedAnchors) {
      const exItem = this._cleanUp(await this.page.evaluate(el => el.text, item));

      if (exItem === 'View') {
        await item.click({ 'delay': Scraper.notARobot() }).catch((e) => {
          logger.debug('View click failed', e);
        });
      }
    }

    const entityName = `${serviceObject.current.businessName}_${serviceObject.current.referenceNumber}`;
    const fileName = this._makeFileName(entityName);
    const filePath = await this._makeFilePath(entityName);

    serviceObject.current.fileName = fileName;

    await this._randomWait(this.page, 2, 2);
    await this.page.focus('h3.page-header');
    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    await this.page.waitForSelector('body > div.container > form.form-horizontal > table', { 'timeout': 7500 }).then(async (elm) => {
      logger.debug('prep for processEntityDetailBasicDetails');

      const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
      const $ = cheerio.load(htmlBlock);

      serviceObject.current.basicDetails = await this.processEntityDetailBasicDetails($);
    });

    await this.page.waitForSelector('#Licenses').then(async (elm) => {
      logger.debug('prep for processEntityDetailTableV2');

      const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
      const $ = cheerio.load(htmlBlock);

      serviceObject.current.entityDetails = await this.processEntityDetailTableV2($);
    });

    // Awaited so that a failed write / navigation rejects this method (and is
    // reported by the caller) instead of becoming an unhandled rejection.
    await this.entityCompleter(serviceObject);
  }

  /**
   * Persists `serviceObject.current` as JSON, records a link entry, advances
   * `serviceObject.step` and then either navigates back to the index or emits
   * `serviceDone` once `step` has reached `currentIndexLength`.
   *
   * @param {object} serviceObject - state object of the service group being processed
   * @returns {Promise<void>}
   */
  async entityCompleter(serviceObject) {
    const filename = serviceObject.current.fileName;
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    logger.info(`Saving: ${filename}.json`);

    const newLink = {
      'referenceNumber': serviceObject.current.referenceNumber,
      'businessName': serviceObject.current.businessName,
      'fileName': `${filename}.json`
    };

    serviceObject.links.push(newLink);

    await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
    await this._randomWait(this.page, 3, 5);

    serviceObject.step++;

    if (serviceObject.step < serviceObject.currentIndexLength) {
      serviceObject.current = {};
      await this.page.goBack({ 'waitUntil': 'networkidle0' });
    } else {
      this.emit('serviceDone');
    }
  }

  /**
   * Picks the service state object that belongs to `this.mode`:
   * 1 -> e-money, 2 -> credit, anything else (including 0) -> payment.
   *
   * @returns {object} the state object for the current mode
   * @private
   */
  _currentServiceObject() {
    switch (this.mode) {
      case 1:
        return this.emoneyServices;
      case 2:
        return this.creditServices;
      case 0:
      default:
        return this.paymentServices;
    }
  }

  /**
   * Runs `processMainMenu` for the service group selected by `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async handleMainIndex() {
    await this.processMainMenu(this._currentServiceObject());
  }

  /**
   * Runs `processEntityIndex` for the service group selected by `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async handleEntityIndex() {
    await this.processEntityIndex(this._currentServiceObject());
  }

  /**
   * Runs `processEntityDetail` for the service group selected by `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async handleEntityDetail() {
    await this.processEntityDetail(this._currentServiceObject());
  }

  /**
   * Entry point for every freshly loaded page: waits briefly, then dispatches
   * on the `aa` query parameter of the current URL. A Chrome error page emits
   * `recover`.
   *
   * @returns {Promise<void>}
   * @throws {Error} for an unknown `aa` value when `NODE_ENV` is set
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    const params = Object.assign({ 'aa': '' }, this._getParamsFromUrl(pageUrl.search));

    switch (params.aa) {
      case '':
        await this.handleIntroPage();
        break;

      case 'select_sector':
        await this.handleMainIndex();
        break;

      case 'select_categ':
        await this.handleEntityIndex();
        break;

      case 'select_subject':
        await this.handleEntityDetail();
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          // Use href: interpolating the parsed Url object itself does not print the address.
          throw new Error(`Unknown page: ${pageUrl.href}`);
        } else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }
  }

  /**
   * Registers the listeners that move the scrape from one section / service
   * group to the next.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    this.on('entityComplete', () => {
      this.handleEntityComplete();
    });

    this.on('serviceDone', async () => {
      switch (this.mode) {
        case 0:
          this.emit('paymentServicesDone');
          break;
        case 1:
          this.emit('emoneyServicesDone');
          break;
        case 2:
          this.emit('creditServicesDone');
          break;
      }
    });

    this.on('entityIndex', async () => {
      // Same guard as the other listeners: an async listener must not reject unobserved.
      try {
        await this.handleEntityIndex();
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('paymentServicesDone', async () => {
      try {
        this.paymentServices.indexStep++;

        if (this.paymentServices.indexStep < this.paymentServices.sections.length) {
          this.paymentServices.visited = false;
          this.paymentServices.step = 0;

          await this._goto(this.paymentServices.urls[1]);
        } else {
          this.paymentServices.done = true;

          jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
          jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);

          this.mode++;
          this.inProgress = false;

          // mode is now 1 (e-money); this URL is the same string the original
          // navigated to via creditServices.urls[0].
          await this._goto(this.emoneyServices.urls[0]);
        }
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('emoneyServicesDone', async () => {
      try {
        this.emoneyServices.indexStep++;

        if (this.emoneyServices.indexStep < this.emoneyServices.sections.length) {
          this.emoneyServices.visited = false;
          this.emoneyServices.step = 0;

          await this._goto(this.emoneyServices.urls[0]);
        } else {
          this.emoneyServices.done = true;

          jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
          jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);

          this.mode++;
          this.inProgress = false;

          // mode is now 2 (credit); this URL is the same string the original
          // navigated to via emoneyServices.urls[0].
          await this._goto(this.creditServices.urls[0]);
        }
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('creditServicesDone', async () => {
      try {
        this.creditServices.indexStep++;

        if (this.creditServices.indexStep < this.creditServices.sections.length) {
          this.creditServices.visited = false;
          this.creditServices.step = 0;

          await this._goto(this.creditServices.urls[0]);
        } else {
          this.creditServices.done = true;

          jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
          jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);

          this.mode++;
          this.inProgress = false;

          this.emit('done');
        }
      } catch (e) {
        logger.error(e);
      }
    });
  }

  /**
   * Initiate the process: resets the per-service state, prepares the browser
   * page, wires up the page / scraper events and opens the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} wrapping whatever went wrong during start-up
   */
  async start() {
    super._start();

    try {
      this.mode = 0;
      this.inProgress = false;

      /*
        Swapping sections from text to data-sector ids.
        document.querySelector('[data-sector="156"]')

        Payment Services:
          Payment Institutions and Branches of Foreign Payment Institutions // 9
          Providing Payment Services in Limited Scope // 11
          Account information service providers // 156

        eMoney Services:
          E-Money Institutions and Branches of Foreign E-Money Institutions // 12
          E-Money Institutions Based in Slovakia // 37

        credit Services:
          Banks Authorised to Provide Investment Services // 5
          Banks Based in Slovakia // 19
      */

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['https://subjekty.nbs.sk/', 'https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
        'sections' : [9, 11, 156],
        'sectionStep': 0,
        'currentIndexLength' : 0,
        'sectionLinks' : [],
        'currentIndex' :'',
        'currentMetaIndex' : 0
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
        'sections' : [12, 37],
        'sectionStep': 0,
        'currentIndexLength' : 0,
        'sectionLinks' : [],
        'currentIndex' :'',
        'currentMetaIndex' : 0
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'searchDone' : false,
        'started': false,
        'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
        'sections' : [5, 19],
        'sectionStep': 0,
        'currentIndexLength' : 0,
        'sectionLinks' : [],
        'currentIndex' :'',
        'currentMetaIndex' : 0
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/SK/NBS`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      if (this.eventNames().length === 2) await this.attachEvents();

      // await this.page.setViewport({ 'width': 1200, 'height': 800 });

      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      throw new Error(e);
    }
  }

  /**
   * Body of the throttled `run` function created in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }

}

module.exports = SKScrape;