Martin Donnelly a5109efabe 2019-05-12
2019-05-12 18:33:09 +01:00

839 lines
22 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('(SK)');
const url = require('url');
const camelCase = require('camelcase');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class SKScrape extends Scraper {
constructor() {
super();
this.setID('SK');
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @returns {Promise<boolean>}
*/
async checkChangeLanguage() {
const languageIcon = await this.page.$$('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a > img');
if (languageIcon.length > 0) {
const value = await this.page.evaluate(el => el.getAttribute('src'), languageIcon[0]);
if (value === '/static/icon/ico_en.gif') {
// this needs a click
logger.info('Changing language to English..');
await this._findAndClick('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a ');
return true;
//
}
}
return false;
}
/**
*
* @returns {Promise<void>}
*/
async handleIntroPage() {
const pageUrl = url.parse(await this.page.url());
// Clear cookie bar
await this.page.waitForSelector('a.btnCookieAccept', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(() => {
logger.info('No cookie bar');
});
if (!this.inProgress && pageUrl.query === null) {
// fix language before going on
const changedLanguage = await this.checkChangeLanguage();
if (!changedLanguage) {
await this._randomWait(this.page, 3, 5, 'handleIntroPage');
await this._findAndClick(' body > div.container > div:nth-child(5) > div:nth-child(1) > div > div');
}
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processMainMenu(serviceObject) {
const wantedItem = serviceObject.sections[serviceObject.indexStep];
const expandables = ['#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl1',
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl2',
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl3',
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl4'
];
for (const item of expandables)
await this.page.$eval(item, e => e.click({ 'delay':90 }));
await this._randomWait(this.page, 3, 5);
const wantedRow = `[data-sector="${wantedItem}"]`;
logger.debug('Looking for', wantedRow);
await this.page.waitForSelector(wantedRow, { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(() => {
logger.warn('processMainMenu did not find what it was looking for!');
});
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async entityIndexFirstPass(serviceObject) {
// breaks up `Showing 1 to 10 of 12 entries`
const breaker = /(\d+)/g;
const body = await this.page.content();
const $ = cheerio.load(body);
const subjectsInfo = $('#Subjects_info').text();
const brokenString = subjectsInfo.match(breaker);
const currentPageIndex = parseInt(brokenString[0], 10);
const currentPageMax = parseInt(brokenString[1], 10);
// The site returns the index from the last page when you select a different view.
// This should be watched and can cause a problem
logger.debug('subjectsInfo', subjectsInfo);
logger.debug('Step', serviceObject.step);
logger.debug('currentPageIndex', currentPageIndex);
if (((currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1))) || (currentPageIndex === 0 && currentPageMax === 0 )) {
serviceObject.currentIndexLength = parseInt(brokenString[2], 10);
serviceObject.currentPageMax = currentPageMax;
serviceObject.visited = true;
serviceObject.currentIndex = url.parse(await this.page.url());
serviceObject.currentMetaIndex = 0;
}
else {
logger.info('Need to click previous');
const nextButton = await this.page.$$('#Subjects_previous');
const buttonClasses = await this.page.$eval('#Subjects_previous', e => e.getAttribute('class'));
if (buttonClasses.split(' ').indexOf('disabled') === -1) {
// we need a click..
nextButton[0].click({ 'delay':90 });
await this._randomWait(this.page, 3, 5);
serviceObject.visited = false;
this.emit('entityIndex');
}
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityIndex(serviceObject) {
const fields = ['referenceNumber', 'businessName', 'address', 'start', 'end', 'reason'];
const mouseDownDuration = Scraper.notARobot();
if (serviceObject.visited === false) {
logger.debug('Preparing...');
await this.page.waitForSelector('table#Subjects', { 'visible':true }).then(async () => {
await this.entityIndexFirstPass(serviceObject);
}).catch(() => {
logger.error('Table failed to render');
});
}
if (serviceObject.visited === true) {
serviceObject.currentMetaIndex = serviceObject.step % 10;
if ((serviceObject.step ) >= serviceObject.currentPageMax) {
const nextButton = await this.page.$$('#Subjects_next');
const buttonClasses = await this.page.$eval('#Subjects_next', e => e.getAttribute('class'));
if (buttonClasses.split(' ').indexOf('disabled') === -1) {
// we need a click..
nextButton[0].click({ 'delay':mouseDownDuration });
await this._randomWait(this.page, 3, 5);
serviceObject.visited = false;
this.emit('entityIndex');
}
else {
logger.debug('I think we are done here...');
this.emit('serviceDone');
}
}
else {
await this.page.waitForSelector('#Subjects > tbody');
const wantedRow = await this.page.$$(`#Subjects > tbody > tr:nth-child(${serviceObject.currentMetaIndex + 1})`);
const htmlRow = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);
const $ = cheerio.load(`<table>${htmlRow}</table>`);
const cells = $('td');
serviceObject.current = {};
cells.each((index, item) => {
serviceObject.current[ fields[index] ] = $(item).text();
});
await this._randomWait(this.page, 3, 5);
await wantedRow[0].click({ 'delay':mouseDownDuration });
}
}
}
/**
*
* @param $
* @returns {Promise<void>}
*/
async processEntityDetailBasicDetails($) {
const newObj = {};
const rows = $('tr');
rows.each((index, elm) => {
const children = $(elm).children();
const preLabel = $(children).eq(0).text();
const label = camelCase(this._cleanUp(preLabel.replace(':', '')));
newObj[label] = this._cleanUp($(children).eq(1).text());
});
return newObj;
}
/**
*
* @param $
* @param elm
*/
decodeTable($, elm) {
const rows = $(elm).find('table.details tr');
const obj = {};
rows.each( (index, elm) => {
const children = $(elm).children();
const labelClass = $(children[0]).attr('class');
const label = camelCase(this._cleanUp($(children[0]).text().replace(':', '').replace(',', '')));
const contents = this._cleanUp($(children[1]).text().replace(/(Hide|View)\s*/, ''));
if (typeof(labelClass) !== 'undefined' && labelClass === 'dlabel')
obj[label] = contents;
});
return obj;
}
/**
*
* @param $
* @returns {Promise<Array>}
*/
async processEntityDetailTableV2($) {
// take the first tbody as this is the main one...
const fields = [ 'license', 'start', 'end', 'reason'];
const outData = [];
let newObj = {};
let topLevel = '';
let midLevel = {};
let level1ID = '';
const tbody = $('tbody')[0];
const children = $(tbody).children();
children.each((index, item) => {
const itemClasses = $(item).attr('class').split(' ');
if ((itemClasses.indexOf('level0') !== -1) && (itemClasses.indexOf('sublicctrl') !== -1)) {
// TOP LEVEL
const itemChildren = $(item).children();
if (Object.keys(newObj).length !== 0) {
// push this object into the list
outData.push(newObj);
newObj = {};
}
topLevel = camelCase(this._cleanUp($(itemChildren[0]).text().replace(',', '')));
midLevel = {};
itemChildren.each((ci, celm) => {
midLevel[fields[ci]] = this._cleanUp($(celm).text());
});
midLevel.detail = [];
newObj[topLevel] = Object.assign({}, midLevel);
}
//
if ((itemClasses.indexOf('level0') !== -1) && (itemClasses.indexOf('details') !== -1))
// TOP LEVEL - DETAILS
newObj[topLevel].detail.push(this.decodeTable($, item));
//
if ((itemClasses.indexOf('level1') !== -1) && (itemClasses.indexOf('details') === -1)) {
// LEVEL 1
const itemChildren = $(item).children();
level1ID = camelCase(this._cleanUp($(itemChildren[0]).text()));
newObj[topLevel][level1ID] = [];
}
//
if ((itemClasses.indexOf('level1') !== -1) && (itemClasses.indexOf('details') !== -1)) {
// LEVEL 1 - DETAIL
const table = this.decodeTable($, item);
newObj[topLevel][level1ID].push(table);
}
//
if ((itemClasses.indexOf('level2') !== -1) && (itemClasses.indexOf('details') === -1)) {
// LEVEL 2
const itemChildren = $(item).children();
const obj = {};
itemChildren.each((ci, celm) => {
obj[fields[ci]] = this._cleanUp($(celm).text());
});
const nexttable = $(item).next();
obj.details = this.decodeTable($, nexttable);
if (level1ID === '') {
const newID = camelCase(this._cleanUp(obj.license.replace(',', '')));
newObj[topLevel][newID] = [];
newObj[topLevel][newID].push(obj);
}
else {
if (!newObj[topLevel].hasOwnProperty(level1ID))
newObj[topLevel][level1ID] = [];
newObj[topLevel][level1ID].push(obj);
}
}
});
// insert final obj
if (Object.keys(newObj).length !== 0) {
// push this object into the list
outData.push(newObj);
newObj = {};
}
return outData;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetail(serviceObject) {
// level0 sublicctrl sublicctrl1 odd
// level0 sublicctrl sublicctrl1 odd sublicshow shown
// expand all accordians
const rows = await this.page.$$('tr.sublicctrl');
for (const item of rows) {
const cls = await this.page.evaluate(el => el.getAttribute('class'), item);
if (!cls.includes('shown'))
await item.click({ 'delay':Scraper.notARobot() });
}
await this.page.waitForSelector('#Licenses > tbody > tr.level1.shown.sublichide1.sllhidectrl.sllhidectrl1', { 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(() => {
logger.debug('No License information');
});
await this._microWait(this.page, 5);
// expand all viewable anchors
const wantedAnchors = await this.page.$$('.row a');
for (const item of wantedAnchors) {
const exItem = this._cleanUp(await this.page.evaluate(el => el.text, item));
if (exItem === 'View') {
await item.hover().catch((e) => {
logger.warn('Hover failed', e.name);
});
await item.click({ 'delay': Scraper.notARobot() }).catch((e) => {
logger.debug('View click failed', e.name);
});
}
}
const entityName = `${serviceObject.current.businessName}_${serviceObject.current.referenceNumber}`;
const fileName = this._makeFileName(entityName);
const filePath = await this._makeFilePath(entityName);
serviceObject.current.fileName = fileName;
await this._randomWait(this.page, 2, 2);
await this.page.focus('h3.page-header');
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
await this.page.waitForSelector('body > div.container > form.form-horizontal > table', { 'timeout':7500 }).then(async (elm) => {
logger.debug('prep for processEntityDetailBasicDetails');
const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
const $ = cheerio.load(htmlBlock);
serviceObject.current.basicDetails = await this.processEntityDetailBasicDetails($);
});
await this.page.waitForSelector('#Licenses').then(async (elm) => {
logger.debug('prep for processEntityDetailTableV2');
const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
const $ = cheerio.load(htmlBlock);
serviceObject.current.entityDetails = await this.processEntityDetailTableV2($);
});
this.entityCompleter(serviceObject);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async entityCompleter(serviceObject) {
const filename = serviceObject.current.fileName;
const filePath = `${this.path}/${filename}`.substring(0, 240);
logger.info(`Saving: ${filename}.json`);
const newLink = { 'referenceNumber':serviceObject.current.referenceNumber, 'businessName':serviceObject.current.businessName, 'fileName':`${filename}.json` };
serviceObject.links.push(newLink);
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
await this._randomWait(this.page, 3, 5);
serviceObject.step++;
if (serviceObject.step < serviceObject.currentIndexLength) {
serviceObject.current = {};
await this.page.goBack({ 'waitUntil':'networkidle0' });
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async handleMainIndex() {
switch (this.mode) {
case 1:
await this.processMainMenu(this.emoneyServices);
break;
case 2:
await this.processMainMenu(this.creditServices);
break;
case 0:
default:
await this.processMainMenu(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async handleEntityIndex() {
switch (this.mode) {
case 1:
await this.processEntityIndex(this.emoneyServices);
break;
case 2:
await this.processEntityIndex(this.creditServices);
break;
case 0:
default:
await this.processEntityIndex(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async handleEntityDetail() {
switch (this.mode) {
case 1:
await this.processEntityDetail(this.emoneyServices);
break;
case 2:
await this.processEntityDetail(this.creditServices);
break;
case 0:
default:
await this.processEntityDetail(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
const params = Object.assign({ 'aa': '' }, this._getParamsFromUrl(pageUrl.search));
switch (params.aa) {
case '':
await this.handleIntroPage();
break;
case 'select_sector':
await this.handleMainIndex();
break;
case 'select_categ':
await this.handleEntityIndex();
break;
case 'select_subject':
await this.handleEntityDetail();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('entityIndex', async () => {
await this.handleEntityIndex();
});
this.on('paymentServicesDone', async () => {
try{
this.paymentServices.indexStep++;
if (this.paymentServices.indexStep < this.paymentServices.sections.length) {
this.paymentServices.visited = false;
this.paymentServices.step = 0;
await this._goto(this.paymentServices.urls[1]);
}
else {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
try{
this.emoneyServices.indexStep++;
if (this.emoneyServices.indexStep < this.emoneyServices.sections.length) {
this.emoneyServices.visited = false;
this.emoneyServices.step = 0;
await this._goto(this.emoneyServices.urls[0]);
}
else {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
try{
this.creditServices.indexStep++;
if (this.creditServices.indexStep < this.creditServices.sections.length) {
this.creditServices.visited = false;
this.creditServices.step = 0;
await this._goto(this.creditServices.urls[0]);
}
else {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
}
catch (e) {
logger.error(e);
}
});
}
/**
* Initite the process
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.inProgress = false;
/*
Swapping sections from text to
data-sector ids.
document.querySelector('[data-sector="156"]')
Payment Services:
Payment Institutions and Branches of Foreign Payment Institutions // 9
Providing Payment Services in Limited Scope // 11
Account information service providers // 156
eMoney Services:
E-Money Institutions and Branches of Foreign E-Money Institutions // 12
E-Money Institutions Based in Slovakia // 37
credit Services:
Banks Authorised to Provide Investment Services // 5
Banks Based in Slovakia // 19
*/
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://subjekty.nbs.sk/', 'https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
'sections' : [9, 11, 156],
'sectionStep': 0,
'currentIndexLength' : 0,
'sectionLinks' : [],
'currentIndex' :'',
'currentMetaIndex' : 0
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
'sections' : [12, 37],
'sectionStep': 0,
'currentIndexLength' : 0,
'sectionLinks' : [],
'currentIndex' :'',
'currentMetaIndex' : 0
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
'sections' : [5, 19],
'sectionStep': 0,
'currentIndexLength' : 0,
'sectionLinks' : [],
'currentIndex' :'',
'currentMetaIndex' : 0
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/SK/NBS`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
module.exports = SKScrape;