obdfcascrape/ncas/cz.js
Martin Donnelly 534fd67b5d final update
2019-08-15 08:48:49 +01:00

1070 lines
31 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('CZ');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class CZScrape extends Scraper {
constructor() {
super();
this.setID('CZ');
this.version = '0.0.1-3';
this.captchas = ['iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAj0lEQVRYR+2XQQ6AIAwE7f8fXeOBBIlll8bYSNZzhemUZMHc3Y/CzwSQMWBm06GtTDU1ghIAtGmkBNmgDZQBZDcejUQmoIEIYKb26Z/XANBMW+cjhABkQAZkQAZkYB8DLe0+i+PVeL3q+yhG8Q0vJBEA+5bZB6DvGN0TUde3tX75MGHnz9TRh5BZLFNTDnACDZUAsJw5oEAAAAAASUVORK5CYII=',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAXElEQVRYR+2XsQ0AMAjDyv9H0wmVganQZjEPYDkREubuvoRjAHQMmJ3sboNsRQCAzEBeHC342gEJQLU036/nBuQA1bWWlTBgAMAABjCAAQxgAAMTL2XrMQFgwsAGQ1axoX3D1WIAAAAASUVORK5CYII=',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAuUlEQVRYR+2WUQ6AMAhD2f0PjfHDxBiRtiPBJfMb17cC1eHubo3P2ACSAyPpGdFUrQUtAJloZEriBu5AG0AkHN2MrM8dIA80sp4HQCf8CRK8twCAGtNlDrAA5TOwFMBXVshDyDjwBlCWhBGIIHo/Kl9DRviqRbPCzOoBCPGTlwNAPkgtAKSoNgPCiiELhLdgctrDWU7/CcloRW7NteC3AOxV5ShGVg+B2QCyA4i9EzV4DkyIfL3aDnAAnuOdod3Qo9wAAAAASUVORK5CYII=',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAArklEQVRYR+2XSw6AIAwF7f0PXcPCxCj9PRqKBtcCk2n7UGJmPgof2gCQASK9aIGqYiUoAbAO1ZwoRvwGygCkg7U699YI79sGlgPwdvgTPM3AdAA0ptMMIACpTbg0gDcbUoKoZ2IDeA00e3AOLN2EElz5GDawtCBybvSS4VwXvw2n3wWR61ibCngKIqMGfJbZJbg2HQFJieJygLteC8bbqG1av/ljgsSzsAYz8CeAE+03waHIRTLTAAAAAElFTkSuQmCC',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAzklEQVRYR+2WYQqAMAiF9Uzdqc5Ud+pMiwULGbopTEew/gQ1pn4+34YppQQTH1wJjCRwIcINABvA+z4V3R3aggPxC56T2KMSqCsPJ1BXXtoQQkAK7q4BCXvB764BTnBhU8BhzyMXNgW0Uio0dwJS5eUocSfQq7D3nzvz1E7Iqb6ec1cCdHPJ410IWDZ1IWDZ1JJs0UNTAy2bpY5Hj2Dpu9S2ZgIau9UEb9nyIqD2Ac3F2SJYlQg1Qema4VNgTWARWASmE7CKNq8f6gO/TOABNoFBsH2myFkAAAAASUVORK5CYII=',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAwklEQVRYR+2WwRHDMAgEoab0FNeU9OSayPgRj8YjdNIRWXngj37WaQ8O1MxMFn6aAhAB3VTkISK7hE571p2GFuhbw5cf4u3FCkgCFwKel2wnD9eA5+U8AUng3wj0hNJIoeIiJHJgpFCxgGASIjFYAEGgtAnZgQU4BFovKwfYNAKtl9UGGD0N6YS7kKOnIS2gMz9gDdACksCvCHg74W1dsHwn/BYhux2Hk/AUQM6EcBLW2hDZgi4t/zktB3rzIwUsJ/ABbkZFsGTnR4wAAAAASUVORK5CYII=',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAA90lEQVRYR+2W0Q2DMAxEk5nYCWaCnZjJFVIjVRHmXoyK+1F+kJBD3p3jg2pmVhKv+geIOLDVrexlL1OZTu+rrbipuAVq0ygMBljq4ir2Nj+ezzZfuiEBosp7KK8tEsBT7ikbrZcAngOeotF6CdArUj3Fx/9dKAF6RSMjRmAkwE86QEeStCvkAB1N0i4JQDf7WhJSu71vg2pD2IG0HGhK05KQHKxj/ml+yBZEc4CukwBUSZ96dJ0EoEp6ALpOAlAljzmQPgUt8R7LgWZtNBFvJ2EDiH4TVG7IQ3j2U6Fg1Kaf7wwBqHYo228DkF8tWhNygL6c1KUDvADp2Oqw5E5+1wAAAABJRU5ErkJggg==',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAvUlEQVRYR+2WwQ3AIAwDw0zsBDPRnZiJikclhEBAbDUf+uGDiHN2KK6UUsTwc1fAikCMTrwXyVmgNYSx00sLnsfBxav4lJQCLgFzAporohU98/47dxnCEwF9YFfF69kUATObZqPXNkURoOmcakFPYKdzqgBTAqOb8lcCCH5oCmb/iJ3Ro0wB2jkcQiR4MAE0eLAAFn51CFn41QLMCLBGT50BZueqMWR6rxJwCZgTOHkf7u6lPMl2i432mQt4AZRbZ7D1wLKYAAAAAElFTkSuQmCC',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAA10lEQVRYR+2X0Q2AIAxEy0zuJDPpTs5U4weGkJYrDUJi8MefRo7X6yGBmZkmPmEJ8BAI50l0XUTbJr75OMxNdbUgxKgu/ojife8vAC2qEUFizAQQdm87oADvzksiGgksQDFczWiSaK0eCygMh3qa3FeK6EbAOmKlZ/5DILl9mAdae/rU5/iRZ6AJXwEgfj/LAY1A7SzIxSwCnxGwTgHKDWhC7Syo7Uw6uFYS/odAmv9hHtCScNgUrCSU/q/RDyoKn/ybMIhEATPuBeabRmOhi0DjGtXy6QJujffgsC3pb7kAAAAASUVORK5CYII=',
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAABA0lEQVRYR+2W0Q3DIAxEyUzsBDPBTsxEhVQkZMW5s5MWqWp/8lETng/fkaP33sPG3/EH8ChQaw2ttRBjPH2WUuhDdR1BzlndfECllJ4HQJtqiiAYWgEku/c4IIC3c6mIpgQE0DrXBs1aDwGkAuhM5/Sx6yCA7Ii1GLsOArCdSN+x6yAA24kEYNdBALaTrykw/b7NBdPfmhu03Hg8B7TksyYinAHN11e34RmEW4F1uNhYHps95oIVgL2QxoCy7qGPgL7g34UfUcAC8TsKWH1trYczYL3frfUQ4G4OoO8HGoC1oAwh9P1AA1jyAG26vssFgBIRyX4bwJIHqNalAHqp5f/tAC88u1y/XYiv9gAAAABJRU5ErkJggg=='
];
this.on('done', async () => {
await this._saveLocalStorage(this.page, `${this.path}/${this.id}_localstorage.json`);
await this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then(async (l) => {
if (l)
await this.run();
});
}
async getBase64Image(img) {
// Create an empty canvas element
var canvas = document.createElement('canvas');
canvas.width = img.width;
canvas.height = img.height;
// Copy the image contents to the canvas
var ctx = canvas.getContext('2d');
ctx.drawImage(img, 0, 0);
// Get the data-URL formatted image
// Firefox supports PNG and JPEG. You could check img.src to
// guess the original format, but be aware the using "image/jpg"
// will re-encode the image.
var dataURL = canvas.toDataURL('image/png');
return dataURL.replace(/^data:image\/(png|jpg);base64,/, '');
}
async getBinOfImg(elm) {
const bin = await this.page.evaluate(el => {
const canvas = document.createElement('canvas');
canvas.width = el.width;
canvas.height = el.height;
// Copy the image contents to the canvas
const ctx = canvas.getContext('2d');
ctx.drawImage(el, 0, 0);
const dataURL = canvas.toDataURL('image/png');
return dataURL.replace(/^data:image\/(png|jpg);base64,/, '');
}, elm[0]);
// logger.debug(bin);
return bin;
}
async captchaTest() {
// #ID_EMAIL_FORM > fieldset > table > tbody > tr > td:nth-child(2) > img
const rawBins = [];
for (let step = 2; step <= 7; step++) {
const elm = await this.page.$$(`#ID_EMAIL_FORM > fieldset > table > tbody > tr > td:nth-child(${step}) > img`);
rawBins.push(this.captchas.indexOf(await this.getBinOfImg(elm)));
}
// #OPIS
await this.page.focus('#OPIS');
await this.page.keyboard.type(rawBins.join(''), { 'delay': 15 }); // Types slower, like a user
logger.info(rawBins);
return rawBins.join('');
}
async handleCaptchaPage() {
logger.debug('+ handleCaptchaPage');
await this._randomWait(this.page, 2, 2, 'handleCaptchaPage');
await this.captchaTest();
await this._microWait(this.page, 5);
await this._findAndClick('input.jerrsButton');
// await this._saveLocalStorage(this.page, `${this.path}/${this.id}_localstorage.json`);
logger.debug('- handleCaptchaPage');
}
async handleBasicListings() {
try{
const options = await this.page.$$('select[name="p_rec_per_page"] option');
const wantedOption = ['no paginate'];
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('select[name="p_rec_per_page"]', value);
break;
}
}
await this._microWait(this.page, 5);
await this._findAndClick('#ID_BL_FORM > fieldset > table > tbody > tr:nth-child(4) > td:nth-child(1) > input');
}
catch(e) {
throw new Error(e);
}
}
/**
*
* @returns {Promise<void>}
*/
async handleIntroPage() {
if (!this.inProgress) {
await this._randomWait(this.page, 3, 5, 'handleIntroPage');
await this._findAndClick('#navigace > li:nth-child(5) > a');
}
else {
logger.warn('Trying to resume..');
logger.warn(this.lastUrl);
this._goto(this.lastUrl);
// this.emit('entityComplete');
}
}
/**
*
* @param html
*/
async extractEntityDetails(html) {
const seq = [
{ 'name' : 'entityType', 'field':'Entity Type' },
{ 'name' : 'companyId', 'field' : 'Company Identification Number' },
{ 'name' : 'instituteName', 'field' : 'Institution Name' },
{ 'name':'registeredAddress', 'field' : 'Registered / permanent residence address' },
{ 'name' : 'contactAddress', 'field' : 'Contact Address' },
{ 'name' : 'phone', 'field' : 'Phone' },
{ 'name' : 'fax', 'field': 'Fax' },
{ 'name' : 'email', 'field': 'E-mail' },
{ 'name' : 'website', 'field': 'Website' },
{ 'name' : 'typeOfAuth', 'field': 'Type of authorization' },
{ 'name' : 'dateOfAuth', 'field': 'Date of authorization' },
{ 'name' : 'dateOfAuthLegalForce', 'field': 'Date the decision came to legal force' },
{ 'name' : 'ownershipStructure', 'field': 'Ownership Structure' },
{ 'name' : 'detailedEntitityType', 'field': 'Detailed Entity Type' },
{ 'name' : 'relatedLegalTies', 'field': 'Related legal ties' },
{ 'name' : 'otherFunctions', 'field': 'Other function(s)' },
{ 'name' : 'numericCode', 'field' : 'Numeric code' },
{ 'name' : 'lei', 'field' : 'LEI' }
];
const crossBorderField = 'Cross-border services';
const $ = cheerio.load(html);
const details = { 'crossBorderLinks' : [], 'authProcess':false, 'cbProcess':false };
for (const item of seq) {
const headCells = $(`#obsah > table > tbody td.tableNadpis:contains("${item.field}")`);
const foundCell = $(headCells).next('td.tableDetail');
details[item.name] = this._cleanUp($(foundCell).text());
}
const href = $('a:contains("Authorized activities")').attr('href');
details['authLink'] = `${this.urlPrefix}${href}`;
const crossBorder = $(`#obsah > table > tbody td.tableNadpis:contains("${crossBorderField}")`).next('td.tableDetail');
const cbElms = $(crossBorder).children();
cbElms.each((index, itm) => {
details.crossBorderLinks.push({ 'name': $(itm).text(), 'href':`${this.urlPrefix}${$(itm).attr('href')}` });
});
details.crossBorderStep = 0;
return details;
}
/**
*
* @param serviceObject
* @returns {Promise<null>}
*/
async entityCompleter(serviceObject) {
let cbFlag = false;
try{
if (serviceObject.current.authLink !== '' && !serviceObject.current.authProcess) {
await this._randomWait(this.page, 3, 5, 'Get Authorisations');
await this._goto(serviceObject.current.authLink, { 'waitUntil':'networkidle0' });
return null;
}
if (typeof serviceObject.current.crossBorderLinks !== 'undefined' && !serviceObject.current.cbProcess && serviceObject.current.crossBorderLinks.length > 0) {
await this._randomWait(this.page, 3, 5, 'Get CBs');
// logger.info(`Crossborder for ${serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].name}`);
await this._goto(serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].href, { 'waitUntil':'networkidle0' });
}
else
cbFlag = true;
}
catch( err) {
logger.error(err);
}
if( cbFlag === true) {
const filename = serviceObject.links[serviceObject.step].fileName;
const filePath = `${this.path}/${filename}`.substring(0, 240);
logger.info(`Saving: ${filename}.json`);
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].fileName = `${filename}.json`;
serviceObject.links[serviceObject.step].params = this._getParamsFromUrl(serviceObject.links[serviceObject.step].href);
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
serviceObject.current = {};
await this._goto(serviceObject.links[serviceObject.step].href, { 'waitUntil':'networkidle0' });
}
else
this.emit('serviceDone');
}
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractEntityAuthority(html) {
const output = [];
let chunk;
let header = '';
const $ = cheerio.load(html);
const rows = $('table tbody tr');
rows.each((i, elm) => {
const children = cheerio(elm).children();
if (children.length === 1) {
if (typeof(chunk) !== 'undefined')
output.push([header, chunk]);
chunk = [];
header = this._cleanUp($(elm).text());
}
else {
const text = this._cleanUp($(children.eq(0)).text());
const cb = $(children.eq(1)).find('input').attr('checked');
if (typeof(cb) !== 'undefined')
chunk.push(text);
}
});
if (typeof(chunk) !== 'undefined')
output.push([header, chunk]);
return output;
}
/**
*
* @param $
* @param row
* @param verts
* @returns {Promise<*[]>}
*/
async reduceMatrixRow($, row, verts) {
const newRow = [];
const title = this._cleanUp($(row).eq(0).text());
const cells = $(row).find('input');
cells.each((i, elm) => {
const input = $(elm).attr('checked');
const checked = (typeof input !== 'undefined');
newRow.push([verts[i], checked]);
});
return [title, newRow];
}
/**
 * Extracts the undertakings table, which mixes plain checkbox rows with
 * matrix rows (one checkbox per column).
 *
 * Column labels are taken from the `alt` attribute of the image in each cell
 * of the row at index 1. Rows from index 2 onwards are classified by their
 * number of cells:
 *  - 1 cell: a heading row (matrix-section heading or plain-section heading,
 *    depending on the cell's class attribute);
 *  - 2 cells: a plain checkbox row, collected into the current plain section
 *    when ticked;
 *  - more cells than there are column labels: a matrix row, reduced with
 *    reduceMatrixRow() and collected into the current matrix section.
 * The three checks are independent `if`s, not an else-chain.
 *
 * NOTE(review): nothing is flushed after the loop, so a plain section or
 * matrix section that is still open when the rows run out does not appear to
 * reach `output` - confirm against the real page layout.
 *
 * @param $ loaded cheerio instance
 * @param rows rows of the table
 * @returns {Promise<Array>} list of `[title, entries]` pairs
 */
async extractCSEAUndetakings($, rows) {
const output = [];
const verts = [];
let mSectionTitle = '', mSection = [];
let section = [];
let sectionTitle = '';
// Cell count of the previously visited row; used to detect that a run of
// matrix rows has just ended.
let lastRowLength = 0;
// Build the list of column labels from the images in the second row.
const vertCols = $(rows).eq(1).find('td');
vertCols.each((i, elm) => {
const alt = $(elm).find('img').attr('alt');
verts.push(alt);
});
// Walk the remaining rows.
for(let index = 2; index < rows.length;index++) {
const row = $(rows).eq(index);
const children = cheerio(row).children();
if (children.length === 1) {
// Heading row. If the previous row was a matrix row, the matrix section
// collected so far is emitted first.
if (lastRowLength > (verts.length))
output.push([mSectionTitle, mSection]);
// Class exactly 'tableDetailLightGrey': start a new matrix section.
if (children.eq(0).attr('class') === 'tableDetailLightGrey') {
mSection = [];
mSectionTitle = this._cleanUp($(row).text());
}
// Class exactly 'tableNadpis': emit the non-empty plain section and start
// a new one.
if (children.eq(0).attr('class') === 'tableNadpis') {
if (section.length > 0)
output.push([sectionTitle, section]);
section = [];
sectionTitle = this._cleanUp($(row).text());
}
}
if(children.length === 2) {
// Plain checkbox row; also closes a matrix section that ended on the
// previous row.
if (lastRowLength > (verts.length))
output.push([mSectionTitle, mSection]);
const text = this._cleanUp($(children.eq(0)).text());
const cb = $(children.eq(1)).find('input').attr('checked');
// Only rows whose input has a `checked` attribute are recorded.
if (typeof(cb) !== 'undefined')
section.push(text);
}
if (children.length > (verts.length)) {
// A matrix row: one [label, checked] pair per input.
const matrixRow = await this.reduceMatrixRow($, row, verts);
mSection.push(matrixRow);
}
lastRowLength = children.length;
}
return output;
}
/**
*
* @param $
* @param rows
* @returns {Promise<*[]>}
*/
async extractCSEAActivity($, rows) {
const activity = [];
let chunk;
let section = [];
let sectionTitle;
let chunkTitle = '';
let lastItemSection = false;
rows.each((i, elm) => {
const children = cheerio(elm).children();
if (children.length === 1)
if (children.eq(0).attr('class') === 'tableNadpis') {
if (typeof(chunk) !== 'undefined')
activity.push(chunk);
chunk = [];
chunkTitle = this._cleanUp($(elm).text());
}
else {
sectionTitle = this._cleanUp($(elm).text());
section = [];
}
else {
const text = this._cleanUp($(children.eq(0)).text());
const cb = $(children.eq(1)).find('input').attr('checked');
const span = $(children.eq(0)).find('span');
if (typeof(cb) !== 'undefined') {
if (span.length > 0)
section.push(text);
else {
if (lastItemSection === true)
chunk.push([sectionTitle, section]);
chunk.push(text);
}
lastItemSection = (span.length > 0);
}
}
});
if (typeof(chunk) !== 'undefined')
activity.push([chunkTitle, chunk]);
return activity;
}
/**
*
* @param html
* @returns {Promise<{activity: Array, undertakings: *}>}
*/
async extractCreditServicesEntityAuthority(html) {
try{
let undertakings = null;
const activity = [];
const $ = cheerio.load(html);
const tables = $('table');
for(let index = 0;index < tables.length; index++) {
const table = $(tables.eq(index));
const matrixTable = $(table).find('td.tableNadpis:contains("Act No. 256/2004 Coll., Capital Market Undertakings Act")');
const rows = $(table).find('tbody tr');
if ($(matrixTable).length === 0) {
const activityTable = await this.extractCSEAActivity($, rows);
activity.push(activityTable);
}
else
undertakings = await this.extractCSEAUndetakings($, rows);
}
return { activity, undertakings };
}
catch (e) {
logge.error(e);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processCBDetails(serviceObject) {
logger.info(`Process ${this.modeTitles[this.mode]} entity crossBorderStep:${serviceObject.links[serviceObject.step].crossBorderStep}`);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processAuthorityDetails(serviceObject) {
// serviceObject
const modeStrings = ['authority', 'crossBorder'];
const authorityMode = (serviceObject.current.authProcess) ? 1 : 0;
if (authorityMode === 1)
if (!serviceObject.current.hasOwnProperty('crossBorder')) {
serviceObject.current['crossBorder'] = {};
}
const crossBorderText = (authorityMode === 1) ? `for ${serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].name}` : '';
logger.info(`Process ${this.modeTitles[this.mode]} entity: ${serviceObject.links[serviceObject.step].name} ${modeStrings[authorityMode]} ${crossBorderText}`);
await this._randomWait(this.page, 3, 5);
const ccFN = (authorityMode === 1) ? `_${serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].name}` : '';
const filename = serviceObject.links[serviceObject.step].fileName;
const outFile = `${filename}_${modeStrings[authorityMode]}${ccFN}`;
const filePath = `${this.path}/${outFile}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}`, null);
const body = await this.page.content();
// const $ = cheerio.load(body);
const details = (this.mode === 2 && authorityMode === 0) ? await this.extractCreditServicesEntityAuthority(body) : await this.extractEntityAuthority(body);
if (authorityMode === 0) {
serviceObject.current['authority'] = (this.mode === 2) ? Object.assign({}, details) : details.slice();
serviceObject.current.authProcess = true;
}
else {
const cbStep = serviceObject.current.crossBorderStep;
const countryCode = serviceObject.current.crossBorderLinks[cbStep].name;
serviceObject.current['crossBorder'][countryCode] = details.slice();
serviceObject.current.crossBorderStep++;
if (serviceObject.current.crossBorderStep >= serviceObject.current.crossBorderLinks.length)
serviceObject.current.cbProcess = true;
}
logger.info(`Completed ${modeStrings[authorityMode]}...`);
this.emit('entityComplete');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
// const noWhiteSpace = /\W/g;
serviceObject.current = {};
logger.info(`Process ${this.modeTitles[this.mode]} entity:${serviceObject.links[serviceObject.step].name}`);
logger.info(`Step ${serviceObject.step} of ${serviceObject.links.length}`);
await this._randomWait(this.page, 3, 5);
const filename = serviceObject.links[serviceObject.step].fileName;
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
// const $ = cheerio.load(body);
const details = await this.extractEntityDetails(body);
serviceObject.current = Object.assign({}, details);
this.emit('entityComplete');
logger.info('Entity complete...');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async populateSectionLinks(serviceObject) {
const pageUrl = url.parse(await this.page.url());
const urlPrefix = `${pageUrl.protocol}//${pageUrl.host}/apljerrsdad/`;
const body = await this.page.content();
const $ = cheerio.load(body);
const links = $('a.textNorm');
for (const items of serviceObject.sections)
for (let index = 0, len = links.length; index < len; index++) {
const item = links[index];
const itemText = this._cleanUp($(item).text());
const itemLink = $(item).attr('href');
if (itemText === items)
serviceObject.sectionLinks.push(`${urlPrefix}${itemLink}`);
}
//
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async useListingPage(serviceObject) {
if (serviceObject.sectionLinks.length === 0) {
logger.debug('SectionLinks empty');
await this.populateSectionLinks(serviceObject);
}
logger.debug(serviceObject.sectionLinks);
await this._randomWait(this.page, 3, 5, 'First sub section');
await this._goto(serviceObject.sectionLinks[serviceObject.indexStep], { 'waitUntil':'networkidle0' });
}
/**
*
* @param $
* @param rows
* @param indexStep
* @returns {Promise<Array>}
*/
async extractDataFromSubList($, rows, indexStep = 0) {
const cellTitles = [
'companyNumber',
'name',
'address',
'city',
'postcode',
'country',
'datefrom'
];
const pageUrl = url.parse(await this.page.url());
const urlPrefix = `${pageUrl.protocol}//${pageUrl.host}/apljerrsdad/`;
const details = [];
rows.each(async (i, elm) => {
const children = $(elm).children();
const newItem = {};
if (children.length === 7) {
for(let cPos = 0;cPos <= 6;cPos++)
newItem[cellTitles[cPos]] = this._cleanUp(children.eq(cPos).text().trim());
newItem['href'] = `${urlPrefix}${children.eq(1).children().eq(0).attr('href')}`;
newItem['fileName'] = this._makeFileName(newItem['name']);
newItem['indexStep'] = indexStep;
details.push(newItem);
}
});
return details;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processSubListPage(serviceObject) {
const body = await this.page.content();
const $ = cheerio.load(body);
// details = details.concat(urlParams);
const rows = $('table tr');
const details = await this.extractDataFromSubList($, rows, serviceObject.indexStep);
serviceObject.links = serviceObject.links.concat(details);
serviceObject.indexStep++;
if (serviceObject.indexStep >= serviceObject.sectionLinks.length) {
this.inProgress = true;
serviceObject.items = serviceObject.links.length;
await this._randomWait(this.page, 3, 5, 'First page');
logger.info('goto', serviceObject.links[serviceObject.step].href);
await this._goto(serviceObject.links[serviceObject.step].href, { 'waitUntil':'networkidle0' });
}
else {
await this._randomWait(this.page, 3, 5, 'Next sub section');
await this._goto(serviceObject.sectionLinks[serviceObject.indexStep], { 'waitUntil':'networkidle0', 'timeout': 5000 });
}
}
/**
*
* @returns {Promise<void>}
*/
async handleSubListPage() {
switch (this.mode) {
case 1:
await this.processSubListPage(this.emoneyServices);
break;
case 2:
await this.processSubListPage(this.creditServices);
break;
case 0:
default:
await this.processSubListPage(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async handleListingsPage() {
switch (this.mode) {
case 1:
await this.useListingPage(this.emoneyServices);
break;
case 2:
await this.useListingPage(this.creditServices);
break;
case 0:
default:
await this.useListingPage(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
case 0:
default:
await this.processEntityDetails(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processAuthority() {
switch (this.mode) {
case 1:
await this.processAuthorityDetails(this.emoneyServices);
break;
case 2:
await this.processAuthorityDetails(this.creditServices);
break;
case 0:
default:
await this.processAuthorityDetails(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async handleEntityComplete() {
switch (this.mode) {
case 1:
await this.entityCompleter(this.emoneyServices);
break;
case 2:
await this.entityCompleter(this.creditServices);
break;
case 0:
default:
await this.entityCompleter(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
const errorPages = ['https://apl.cnb.cz/apljerrsdad/undefined', 'chrome-error://chromewebdata/'];
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (errorPages.indexOf(pageUrl.href) !== -1) {
logger.warn(`Directed to: ${pageUrl.href}`);
this.emit('recover');
return;
}
switch (pageUrl.pathname) {
case '/apljerrsdad/JERRS.WEB07.INTRO_PAGE':
await this.handleIntroPage();
break;
case '/apljerrsdad/JERRS.WEB45.LOGIN_A':
await this.handleCaptchaPage();
break;
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS':
await this.handleBasicListings();
break;
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS_RESPONSE':
await this.handleListingsPage();
break;
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS_RESPONSE_3':
await this.handleSubListPage();
break;
case '/apljerrsdad/JERRS.WEB10.VIZITKA':
await this.processRedirector();
break;
case '/apljerrsdad/JERRS.WEB14.POVOLENE_CINNOSTI':
await this.processAuthority();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
// throw new Error(`Unknown page: ${pageUrl}`);
this.emit('recover');
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('serviceDone', async function() {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('paymentServicesDone', async function() {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices.links);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async function() {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices.links);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async function() {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices.links);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.inProgress = false;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en'],
'sections' : ['Payment institutions and branches of foreign payment institutions', 'Small payment institutions', 'Account information service providers and branches of foreign account information service providers'],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en'],
'sections' : ['Electronic money institutions and branches of foreign electronic money institutions', 'Small e-money issuers'],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en'],
'sections' : ['Banks and branches of foreign banks'],
'sectionLinks' : []
};
this.urlPrefix = 'https://apl.cnb.cz/apljerrsdad/';
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
this.setPath(path.resolve(`${__dirname }/../artefacts/CZ/CNB`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
module.exports = CZScrape;