1070 lines
31 KiB
JavaScript
1070 lines
31 KiB
JavaScript
const Scraper = require('../helpers/scraper');
|
|
const cheerio = require('cheerio');
|
|
const path = require('path');
|
|
const jsonfile = require('jsonfile');
|
|
const logger = require('log4js').getLogger('CZ');
|
|
const url = require('url');
|
|
|
|
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
class CZScrape extends Scraper {
|
|
|
|
constructor() {
|
|
super();
|
|
this.setID('CZ');
|
|
this.version = '0.0.1-3';
|
|
|
|
this.captchas = ['iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAj0lEQVRYR+2XQQ6AIAwE7f8fXeOBBIlll8bYSNZzhemUZMHc3Y/CzwSQMWBm06GtTDU1ghIAtGmkBNmgDZQBZDcejUQmoIEIYKb26Z/XANBMW+cjhABkQAZkQAZkYB8DLe0+i+PVeL3q+yhG8Q0vJBEA+5bZB6DvGN0TUde3tX75MGHnz9TRh5BZLFNTDnACDZUAsJw5oEAAAAAASUVORK5CYII=',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAXElEQVRYR+2XsQ0AMAjDyv9H0wmVganQZjEPYDkREubuvoRjAHQMmJ3sboNsRQCAzEBeHC342gEJQLU036/nBuQA1bWWlTBgAMAABjCAAQxgAAMTL2XrMQFgwsAGQ1axoX3D1WIAAAAASUVORK5CYII=',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAuUlEQVRYR+2WUQ6AMAhD2f0PjfHDxBiRtiPBJfMb17cC1eHubo3P2ACSAyPpGdFUrQUtAJloZEriBu5AG0AkHN2MrM8dIA80sp4HQCf8CRK8twCAGtNlDrAA5TOwFMBXVshDyDjwBlCWhBGIIHo/Kl9DRviqRbPCzOoBCPGTlwNAPkgtAKSoNgPCiiELhLdgctrDWU7/CcloRW7NteC3AOxV5ShGVg+B2QCyA4i9EzV4DkyIfL3aDnAAnuOdod3Qo9wAAAAASUVORK5CYII=',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAArklEQVRYR+2XSw6AIAwF7f0PXcPCxCj9PRqKBtcCk2n7UGJmPgof2gCQASK9aIGqYiUoAbAO1ZwoRvwGygCkg7U699YI79sGlgPwdvgTPM3AdAA0ptMMIACpTbg0gDcbUoKoZ2IDeA00e3AOLN2EElz5GDawtCBybvSS4VwXvw2n3wWR61ibCngKIqMGfJbZJbg2HQFJieJygLteC8bbqG1av/ljgsSzsAYz8CeAE+03waHIRTLTAAAAAElFTkSuQmCC',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAzklEQVRYR+2WYQqAMAiF9Uzdqc5Ud+pMiwULGbopTEew/gQ1pn4+34YppQQTH1wJjCRwIcINABvA+z4V3R3aggPxC56T2KMSqCsPJ1BXXtoQQkAK7q4BCXvB764BTnBhU8BhzyMXNgW0Uio0dwJS5eUocSfQq7D3nzvz1E7Iqb6ec1cCdHPJ410IWDZ1IWDZ1JJs0UNTAy2bpY5Hj2Dpu9S2ZgIau9UEb9nyIqD2Ac3F2SJYlQg1Qema4VNgTWARWASmE7CKNq8f6gO/TOABNoFBsH2myFkAAAAASUVORK5CYII=',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAwklEQVRYR+2WwRHDMAgEoab0FNeU9OSayPgRj8YjdNIRWXngj37WaQ8O1MxMFn6aAhAB3VTkISK7hE571p2GFuhbw5cf4u3FCkgCFwKel2wnD9eA5+U8AUng3wj0hNJIoeIiJHJgpFCxgGASIjFYAEGgtAnZgQU4BFovKwfYNAKtl9UGGD0N6YS7kKOnIS2gMz9gDdACksCvCHg74W1dsHwn/BYhux2Hk/AUQM6EcBLW2hDZgi4t/zktB3rzIwUsJ/ABbkZFsGTnR4wAAAAASUVORK5CYII=',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAA90lEQVRYR+2W0Q2DMAxEk5nYCWaCnZjJFVIjVRHmXoyK+1F+kJBD3p3jg2pmVhKv+geIOLDVrexlL1OZTu+rrbipuAVq0ygMBljq4ir2Nj+ezzZfuiEBosp7KK8tEsBT7ikbrZcAngOeotF6CdArUj3Fx/9dKAF6RSMjRmAkwE86QEeStCvkAB1N0i4JQDf7WhJSu71vg2pD2IG0HGhK05KQHKxj/ml+yBZEc4CukwBUSZ96dJ0EoEp6ALpOAlAljzmQPgUt8R7LgWZtNBFvJ2EDiH4TVG7IQ3j2U6Fg1Kaf7wwBqHYo228DkF8tWhNygL6c1KUDvADp2Oqw5E5+1wAAAABJRU5ErkJggg==',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAvUlEQVRYR+2WwQ3AIAwDw0zsBDPRnZiJikclhEBAbDUf+uGDiHN2KK6UUsTwc1fAikCMTrwXyVmgNYSx00sLnsfBxav4lJQCLgFzAporohU98/47dxnCEwF9YFfF69kUATObZqPXNkURoOmcakFPYKdzqgBTAqOb8lcCCH5oCmb/iJ3Ro0wB2jkcQiR4MAE0eLAAFn51CFn41QLMCLBGT50BZueqMWR6rxJwCZgTOHkf7u6lPMl2i432mQt4AZRbZ7D1wLKYAAAAAElFTkSuQmCC',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAA10lEQVRYR+2X0Q2AIAxEy0zuJDPpTs5U4weGkJYrDUJi8MefRo7X6yGBmZkmPmEJ8BAI50l0XUTbJr75OMxNdbUgxKgu/ojife8vAC2qEUFizAQQdm87oADvzksiGgksQDFczWiSaK0eCygMh3qa3FeK6EbAOmKlZ/5DILl9mAdae/rU5/iRZ6AJXwEgfj/LAY1A7SzIxSwCnxGwTgHKDWhC7Syo7Uw6uFYS/odAmv9hHtCScNgUrCSU/q/RDyoKn/ybMIhEATPuBeabRmOhi0DjGtXy6QJujffgsC3pb7kAAAAASUVORK5CYII=',
|
|
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAABA0lEQVRYR+2W0Q3DIAxEyUzsBDPBTsxEhVQkZMW5s5MWqWp/8lETng/fkaP33sPG3/EH8ChQaw2ttRBjPH2WUuhDdR1BzlndfECllJ4HQJtqiiAYWgEku/c4IIC3c6mIpgQE0DrXBs1aDwGkAuhM5/Sx6yCA7Ii1GLsOArCdSN+x6yAA24kEYNdBALaTrykw/b7NBdPfmhu03Hg8B7TksyYinAHN11e34RmEW4F1uNhYHps95oIVgL2QxoCy7qGPgL7g34UfUcAC8TsKWH1trYczYL3frfUQ4G4OoO8HGoC1oAwh9P1AA1jyAG26vssFgBIRyX4bwJIHqNalAHqp5f/tAC88u1y/XYiv9gAAAABJRU5ErkJggg=='
|
|
];
|
|
|
|
this.on('done', async () => {
|
|
await this._saveLocalStorage(this.page, `${this.path}/${this.id}_localstorage.json`);
|
|
await this._done();
|
|
});
|
|
|
|
this.run = this._throttle(async () => {
|
|
await this.__run();
|
|
}, 5000);
|
|
|
|
if (process.env.NODE_ENV === 'production')
|
|
this._checkLock().then(async (l) => {
|
|
if (l)
|
|
await this.run();
|
|
});
|
|
}
|
|
|
|
async getBase64Image(img) {
|
|
// Create an empty canvas element
|
|
var canvas = document.createElement('canvas');
|
|
canvas.width = img.width;
|
|
canvas.height = img.height;
|
|
|
|
// Copy the image contents to the canvas
|
|
var ctx = canvas.getContext('2d');
|
|
ctx.drawImage(img, 0, 0);
|
|
|
|
// Get the data-URL formatted image
|
|
// Firefox supports PNG and JPEG. You could check img.src to
|
|
// guess the original format, but be aware the using "image/jpg"
|
|
// will re-encode the image.
|
|
var dataURL = canvas.toDataURL('image/png');
|
|
|
|
return dataURL.replace(/^data:image\/(png|jpg);base64,/, '');
|
|
}
|
|
|
|
async getBinOfImg(elm) {
|
|
const bin = await this.page.evaluate(el => {
|
|
const canvas = document.createElement('canvas');
|
|
canvas.width = el.width;
|
|
canvas.height = el.height;
|
|
|
|
// Copy the image contents to the canvas
|
|
const ctx = canvas.getContext('2d');
|
|
ctx.drawImage(el, 0, 0);
|
|
|
|
const dataURL = canvas.toDataURL('image/png');
|
|
|
|
return dataURL.replace(/^data:image\/(png|jpg);base64,/, '');
|
|
}, elm[0]);
|
|
|
|
// logger.debug(bin);
|
|
return bin;
|
|
}
|
|
|
|
async captchaTest() {
|
|
// #ID_EMAIL_FORM > fieldset > table > tbody > tr > td:nth-child(2) > img
|
|
|
|
const rawBins = [];
|
|
for (let step = 2; step <= 7; step++) {
|
|
const elm = await this.page.$$(`#ID_EMAIL_FORM > fieldset > table > tbody > tr > td:nth-child(${step}) > img`);
|
|
|
|
rawBins.push(this.captchas.indexOf(await this.getBinOfImg(elm)));
|
|
}
|
|
|
|
// #OPIS
|
|
|
|
await this.page.focus('#OPIS');
|
|
|
|
await this.page.keyboard.type(rawBins.join(''), { 'delay': 15 }); // Types slower, like a user
|
|
|
|
logger.info(rawBins);
|
|
|
|
return rawBins.join('');
|
|
}
|
|
|
|
async handleCaptchaPage() {
|
|
logger.debug('+ handleCaptchaPage');
|
|
await this._randomWait(this.page, 2, 2, 'handleCaptchaPage');
|
|
|
|
await this.captchaTest();
|
|
|
|
await this._microWait(this.page, 5);
|
|
|
|
await this._findAndClick('input.jerrsButton');
|
|
|
|
// await this._saveLocalStorage(this.page, `${this.path}/${this.id}_localstorage.json`);
|
|
|
|
logger.debug('- handleCaptchaPage');
|
|
}
|
|
|
|
async handleBasicListings() {
|
|
try{
|
|
const options = await this.page.$$('select[name="p_rec_per_page"] option');
|
|
const wantedOption = ['no paginate'];
|
|
for (const item of options) {
|
|
const text = await this.page.evaluate(el => el.innerText, item);
|
|
const value = await this.page.evaluate(el => el.value, item);
|
|
|
|
if (wantedOption.indexOf(text) !== -1) {
|
|
await this.page.select('select[name="p_rec_per_page"]', value);
|
|
break;
|
|
}
|
|
}
|
|
|
|
await this._microWait(this.page, 5);
|
|
|
|
await this._findAndClick('#ID_BL_FORM > fieldset > table > tbody > tr:nth-child(4) > td:nth-child(1) > input');
|
|
}
|
|
catch(e) {
|
|
throw new Error(e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleIntroPage() {
|
|
if (!this.inProgress) {
|
|
await this._randomWait(this.page, 3, 5, 'handleIntroPage');
|
|
|
|
await this._findAndClick('#navigace > li:nth-child(5) > a');
|
|
}
|
|
else {
|
|
logger.warn('Trying to resume..');
|
|
logger.warn(this.lastUrl);
|
|
this._goto(this.lastUrl);
|
|
// this.emit('entityComplete');
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
*/
|
|
async extractEntityDetails(html) {
|
|
const seq = [
|
|
{ 'name' : 'entityType', 'field':'Entity Type' },
|
|
{ 'name' : 'companyId', 'field' : 'Company Identification Number' },
|
|
{ 'name' : 'instituteName', 'field' : 'Institution Name' },
|
|
{ 'name':'registeredAddress', 'field' : 'Registered / permanent residence address' },
|
|
{ 'name' : 'contactAddress', 'field' : 'Contact Address' },
|
|
{ 'name' : 'phone', 'field' : 'Phone' },
|
|
{ 'name' : 'fax', 'field': 'Fax' },
|
|
{ 'name' : 'email', 'field': 'E-mail' },
|
|
{ 'name' : 'website', 'field': 'Website' },
|
|
{ 'name' : 'typeOfAuth', 'field': 'Type of authorization' },
|
|
{ 'name' : 'dateOfAuth', 'field': 'Date of authorization' },
|
|
{ 'name' : 'dateOfAuthLegalForce', 'field': 'Date the decision came to legal force' },
|
|
{ 'name' : 'ownershipStructure', 'field': 'Ownership Structure' },
|
|
{ 'name' : 'detailedEntitityType', 'field': 'Detailed Entity Type' },
|
|
{ 'name' : 'relatedLegalTies', 'field': 'Related legal ties' },
|
|
{ 'name' : 'otherFunctions', 'field': 'Other function(s)' },
|
|
{ 'name' : 'numericCode', 'field' : 'Numeric code' },
|
|
{ 'name' : 'lei', 'field' : 'LEI' }
|
|
];
|
|
|
|
const crossBorderField = 'Cross-border services';
|
|
|
|
const $ = cheerio.load(html);
|
|
const details = { 'crossBorderLinks' : [], 'authProcess':false, 'cbProcess':false };
|
|
|
|
for (const item of seq) {
|
|
const headCells = $(`#obsah > table > tbody td.tableNadpis:contains("${item.field}")`);
|
|
|
|
const foundCell = $(headCells).next('td.tableDetail');
|
|
details[item.name] = this._cleanUp($(foundCell).text());
|
|
}
|
|
|
|
const href = $('a:contains("Authorized activities")').attr('href');
|
|
details['authLink'] = `${this.urlPrefix}${href}`;
|
|
|
|
const crossBorder = $(`#obsah > table > tbody td.tableNadpis:contains("${crossBorderField}")`).next('td.tableDetail');
|
|
|
|
const cbElms = $(crossBorder).children();
|
|
|
|
cbElms.each((index, itm) => {
|
|
details.crossBorderLinks.push({ 'name': $(itm).text(), 'href':`${this.urlPrefix}${$(itm).attr('href')}` });
|
|
});
|
|
|
|
details.crossBorderStep = 0;
|
|
|
|
return details;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<null>}
|
|
*/
|
|
async entityCompleter(serviceObject) {
|
|
let cbFlag = false;
|
|
try{
|
|
if (serviceObject.current.authLink !== '' && !serviceObject.current.authProcess) {
|
|
await this._randomWait(this.page, 3, 5, 'Get Authorisations');
|
|
|
|
await this._goto(serviceObject.current.authLink, { 'waitUntil':'networkidle0' });
|
|
|
|
return null;
|
|
}
|
|
|
|
if (typeof serviceObject.current.crossBorderLinks !== 'undefined' && !serviceObject.current.cbProcess && serviceObject.current.crossBorderLinks.length > 0) {
|
|
await this._randomWait(this.page, 3, 5, 'Get CBs');
|
|
// logger.info(`Crossborder for ${serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].name}`);
|
|
|
|
await this._goto(serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].href, { 'waitUntil':'networkidle0' });
|
|
}
|
|
else
|
|
cbFlag = true;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
|
|
if( cbFlag === true) {
|
|
const filename = serviceObject.links[serviceObject.step].fileName;
|
|
|
|
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
|
|
|
logger.info(`Saving: ${filename}.json`);
|
|
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
serviceObject.links[serviceObject.step].fileName = `${filename}.json`;
|
|
serviceObject.links[serviceObject.step].params = this._getParamsFromUrl(serviceObject.links[serviceObject.step].href);
|
|
|
|
serviceObject.step++;
|
|
|
|
if (serviceObject.step < serviceObject.items) {
|
|
serviceObject.current = {};
|
|
|
|
await this._goto(serviceObject.links[serviceObject.step].href, { 'waitUntil':'networkidle0' });
|
|
}
|
|
else
|
|
this.emit('serviceDone');
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<Array>}
|
|
*/
|
|
async extractEntityAuthority(html) {
|
|
const output = [];
|
|
|
|
let chunk;
|
|
let header = '';
|
|
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $('table tbody tr');
|
|
|
|
rows.each((i, elm) => {
|
|
const children = cheerio(elm).children();
|
|
|
|
if (children.length === 1) {
|
|
if (typeof(chunk) !== 'undefined')
|
|
output.push([header, chunk]);
|
|
|
|
chunk = [];
|
|
header = this._cleanUp($(elm).text());
|
|
}
|
|
else {
|
|
const text = this._cleanUp($(children.eq(0)).text());
|
|
const cb = $(children.eq(1)).find('input').attr('checked');
|
|
|
|
if (typeof(cb) !== 'undefined')
|
|
chunk.push(text);
|
|
}
|
|
});
|
|
|
|
if (typeof(chunk) !== 'undefined')
|
|
output.push([header, chunk]);
|
|
|
|
return output;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param $
|
|
* @param row
|
|
* @param verts
|
|
* @returns {Promise<*[]>}
|
|
*/
|
|
async reduceMatrixRow($, row, verts) {
|
|
const newRow = [];
|
|
|
|
const title = this._cleanUp($(row).eq(0).text());
|
|
|
|
const cells = $(row).find('input');
|
|
|
|
cells.each((i, elm) => {
|
|
const input = $(elm).attr('checked');
|
|
|
|
const checked = (typeof input !== 'undefined');
|
|
|
|
newRow.push([verts[i], checked]);
|
|
});
|
|
|
|
return [title, newRow];
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param $
|
|
* @param rows
|
|
* @returns {Promise<Array>}
|
|
*/
|
|
async extractCSEAUndetakings($, rows) {
|
|
const output = [];
|
|
|
|
const verts = [];
|
|
let mSectionTitle = '', mSection = [];
|
|
let section = [];
|
|
let sectionTitle = '';
|
|
let lastRowLength = 0;
|
|
|
|
// build index of verts
|
|
|
|
const vertCols = $(rows).eq(1).find('td');
|
|
|
|
vertCols.each((i, elm) => {
|
|
const alt = $(elm).find('img').attr('alt');
|
|
|
|
verts.push(alt);
|
|
});
|
|
|
|
// walk the rows.
|
|
|
|
for(let index = 2; index < rows.length;index++) {
|
|
const row = $(rows).eq(index);
|
|
const children = cheerio(row).children();
|
|
|
|
if (children.length === 1) {
|
|
// section change
|
|
|
|
// tableDetailLightGrey
|
|
if (lastRowLength > (verts.length))
|
|
output.push([mSectionTitle, mSection]);
|
|
|
|
if (children.eq(0).attr('class') === 'tableDetailLightGrey') {
|
|
mSection = [];
|
|
mSectionTitle = this._cleanUp($(row).text());
|
|
}
|
|
|
|
if (children.eq(0).attr('class') === 'tableNadpis') {
|
|
if (section.length > 0)
|
|
output.push([sectionTitle, section]);
|
|
|
|
section = [];
|
|
sectionTitle = this._cleanUp($(row).text());
|
|
}
|
|
}
|
|
|
|
if(children.length === 2) {
|
|
if (lastRowLength > (verts.length))
|
|
output.push([mSectionTitle, mSection]);
|
|
|
|
const text = this._cleanUp($(children.eq(0)).text());
|
|
const cb = $(children.eq(1)).find('input').attr('checked');
|
|
|
|
if (typeof(cb) !== 'undefined')
|
|
section.push(text);
|
|
}
|
|
|
|
if (children.length > (verts.length)) {
|
|
// a matrix row
|
|
|
|
const matrixRow = await this.reduceMatrixRow($, row, verts);
|
|
mSection.push(matrixRow);
|
|
}
|
|
|
|
lastRowLength = children.length;
|
|
}
|
|
|
|
return output;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param $
|
|
* @param rows
|
|
* @returns {Promise<*[]>}
|
|
*/
|
|
async extractCSEAActivity($, rows) {
|
|
const activity = [];
|
|
|
|
let chunk;
|
|
let section = [];
|
|
let sectionTitle;
|
|
let chunkTitle = '';
|
|
let lastItemSection = false;
|
|
rows.each((i, elm) => {
|
|
const children = cheerio(elm).children();
|
|
|
|
if (children.length === 1)
|
|
if (children.eq(0).attr('class') === 'tableNadpis') {
|
|
if (typeof(chunk) !== 'undefined')
|
|
activity.push(chunk);
|
|
|
|
chunk = [];
|
|
chunkTitle = this._cleanUp($(elm).text());
|
|
}
|
|
else {
|
|
sectionTitle = this._cleanUp($(elm).text());
|
|
section = [];
|
|
}
|
|
|
|
else {
|
|
const text = this._cleanUp($(children.eq(0)).text());
|
|
const cb = $(children.eq(1)).find('input').attr('checked');
|
|
const span = $(children.eq(0)).find('span');
|
|
|
|
if (typeof(cb) !== 'undefined') {
|
|
if (span.length > 0)
|
|
section.push(text);
|
|
|
|
else {
|
|
if (lastItemSection === true)
|
|
chunk.push([sectionTitle, section]);
|
|
|
|
chunk.push(text);
|
|
}
|
|
|
|
lastItemSection = (span.length > 0);
|
|
}
|
|
}
|
|
});
|
|
|
|
if (typeof(chunk) !== 'undefined')
|
|
activity.push([chunkTitle, chunk]);
|
|
|
|
return activity;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<{activity: Array, undertakings: *}>}
|
|
*/
|
|
async extractCreditServicesEntityAuthority(html) {
|
|
try{
|
|
let undertakings = null;
|
|
|
|
const activity = [];
|
|
const $ = cheerio.load(html);
|
|
|
|
const tables = $('table');
|
|
|
|
for(let index = 0;index < tables.length; index++) {
|
|
const table = $(tables.eq(index));
|
|
const matrixTable = $(table).find('td.tableNadpis:contains("Act No. 256/2004 Coll., Capital Market Undertakings Act")');
|
|
const rows = $(table).find('tbody tr');
|
|
|
|
if ($(matrixTable).length === 0) {
|
|
const activityTable = await this.extractCSEAActivity($, rows);
|
|
activity.push(activityTable);
|
|
}
|
|
else
|
|
undertakings = await this.extractCSEAUndetakings($, rows);
|
|
}
|
|
|
|
return { activity, undertakings };
|
|
}
|
|
catch (e) {
|
|
logge.error(e);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processCBDetails(serviceObject) {
|
|
logger.info(`Process ${this.modeTitles[this.mode]} entity crossBorderStep:${serviceObject.links[serviceObject.step].crossBorderStep}`);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processAuthorityDetails(serviceObject) {
|
|
// serviceObject
|
|
|
|
const modeStrings = ['authority', 'crossBorder'];
|
|
|
|
const authorityMode = (serviceObject.current.authProcess) ? 1 : 0;
|
|
|
|
if (authorityMode === 1)
|
|
if (!serviceObject.current.hasOwnProperty('crossBorder')) {
|
|
serviceObject.current['crossBorder'] = {};
|
|
}
|
|
|
|
const crossBorderText = (authorityMode === 1) ? `for ${serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].name}` : '';
|
|
|
|
logger.info(`Process ${this.modeTitles[this.mode]} entity: ${serviceObject.links[serviceObject.step].name} ${modeStrings[authorityMode]} ${crossBorderText}`);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const ccFN = (authorityMode === 1) ? `_${serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].name}` : '';
|
|
|
|
const filename = serviceObject.links[serviceObject.step].fileName;
|
|
|
|
const outFile = `${filename}_${modeStrings[authorityMode]}${ccFN}`;
|
|
|
|
const filePath = `${this.path}/${outFile}`.substring(0, 240);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
await this._makeScreenshotV2(this.page, `${filePath}`, null);
|
|
|
|
const body = await this.page.content();
|
|
// const $ = cheerio.load(body);
|
|
const details = (this.mode === 2 && authorityMode === 0) ? await this.extractCreditServicesEntityAuthority(body) : await this.extractEntityAuthority(body);
|
|
|
|
if (authorityMode === 0) {
|
|
serviceObject.current['authority'] = (this.mode === 2) ? Object.assign({}, details) : details.slice();
|
|
serviceObject.current.authProcess = true;
|
|
}
|
|
else {
|
|
const cbStep = serviceObject.current.crossBorderStep;
|
|
const countryCode = serviceObject.current.crossBorderLinks[cbStep].name;
|
|
serviceObject.current['crossBorder'][countryCode] = details.slice();
|
|
serviceObject.current.crossBorderStep++;
|
|
|
|
if (serviceObject.current.crossBorderStep >= serviceObject.current.crossBorderLinks.length)
|
|
serviceObject.current.cbProcess = true;
|
|
}
|
|
|
|
logger.info(`Completed ${modeStrings[authorityMode]}...`);
|
|
|
|
this.emit('entityComplete');
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processEntityDetails(serviceObject) {
|
|
// const noWhiteSpace = /\W/g;
|
|
|
|
serviceObject.current = {};
|
|
|
|
logger.info(`Process ${this.modeTitles[this.mode]} entity:${serviceObject.links[serviceObject.step].name}`);
|
|
logger.info(`Step ${serviceObject.step} of ${serviceObject.links.length}`);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const filename = serviceObject.links[serviceObject.step].fileName;
|
|
|
|
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
|
|
|
const body = await this.page.content();
|
|
// const $ = cheerio.load(body);
|
|
const details = await this.extractEntityDetails(body);
|
|
|
|
serviceObject.current = Object.assign({}, details);
|
|
|
|
this.emit('entityComplete');
|
|
|
|
logger.info('Entity complete...');
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async populateSectionLinks(serviceObject) {
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
const urlPrefix = `${pageUrl.protocol}//${pageUrl.host}/apljerrsdad/`;
|
|
|
|
const body = await this.page.content();
|
|
const $ = cheerio.load(body);
|
|
|
|
const links = $('a.textNorm');
|
|
|
|
for (const items of serviceObject.sections)
|
|
for (let index = 0, len = links.length; index < len; index++) {
|
|
const item = links[index];
|
|
const itemText = this._cleanUp($(item).text());
|
|
const itemLink = $(item).attr('href');
|
|
if (itemText === items)
|
|
serviceObject.sectionLinks.push(`${urlPrefix}${itemLink}`);
|
|
}
|
|
//
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async useListingPage(serviceObject) {
|
|
if (serviceObject.sectionLinks.length === 0) {
|
|
logger.debug('SectionLinks empty');
|
|
await this.populateSectionLinks(serviceObject);
|
|
}
|
|
|
|
logger.debug(serviceObject.sectionLinks);
|
|
|
|
await this._randomWait(this.page, 3, 5, 'First sub section');
|
|
|
|
await this._goto(serviceObject.sectionLinks[serviceObject.indexStep], { 'waitUntil':'networkidle0' });
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param $
|
|
* @param rows
|
|
* @param indexStep
|
|
* @returns {Promise<Array>}
|
|
*/
|
|
async extractDataFromSubList($, rows, indexStep = 0) {
|
|
const cellTitles = [
|
|
'companyNumber',
|
|
'name',
|
|
'address',
|
|
'city',
|
|
'postcode',
|
|
'country',
|
|
'datefrom'
|
|
];
|
|
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
const urlPrefix = `${pageUrl.protocol}//${pageUrl.host}/apljerrsdad/`;
|
|
|
|
const details = [];
|
|
|
|
rows.each(async (i, elm) => {
|
|
const children = $(elm).children();
|
|
|
|
const newItem = {};
|
|
|
|
if (children.length === 7) {
|
|
for(let cPos = 0;cPos <= 6;cPos++)
|
|
newItem[cellTitles[cPos]] = this._cleanUp(children.eq(cPos).text().trim());
|
|
|
|
newItem['href'] = `${urlPrefix}${children.eq(1).children().eq(0).attr('href')}`;
|
|
|
|
newItem['fileName'] = this._makeFileName(newItem['name']);
|
|
newItem['indexStep'] = indexStep;
|
|
details.push(newItem);
|
|
}
|
|
});
|
|
|
|
return details;
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processSubListPage(serviceObject) {
|
|
const body = await this.page.content();
|
|
const $ = cheerio.load(body);
|
|
|
|
// details = details.concat(urlParams);
|
|
|
|
const rows = $('table tr');
|
|
|
|
const details = await this.extractDataFromSubList($, rows, serviceObject.indexStep);
|
|
|
|
serviceObject.links = serviceObject.links.concat(details);
|
|
|
|
serviceObject.indexStep++;
|
|
|
|
if (serviceObject.indexStep >= serviceObject.sectionLinks.length) {
|
|
this.inProgress = true;
|
|
serviceObject.items = serviceObject.links.length;
|
|
|
|
await this._randomWait(this.page, 3, 5, 'First page');
|
|
logger.info('goto', serviceObject.links[serviceObject.step].href);
|
|
|
|
await this._goto(serviceObject.links[serviceObject.step].href, { 'waitUntil':'networkidle0' });
|
|
}
|
|
else {
|
|
await this._randomWait(this.page, 3, 5, 'Next sub section');
|
|
|
|
await this._goto(serviceObject.sectionLinks[serviceObject.indexStep], { 'waitUntil':'networkidle0', 'timeout': 5000 });
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleSubListPage() {
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this.processSubListPage(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.processSubListPage(this.creditServices);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this.processSubListPage(this.paymentServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleListingsPage() {
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this.useListingPage(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.useListingPage(this.creditServices);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this.useListingPage(this.paymentServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processRedirector() {
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this.processEntityDetails(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.processEntityDetails(this.creditServices);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this.processEntityDetails(this.paymentServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processAuthority() {
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this.processAuthorityDetails(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.processAuthorityDetails(this.creditServices);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this.processAuthorityDetails(this.paymentServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleEntityComplete() {
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this.entityCompleter(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.entityCompleter(this.creditServices);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this.entityCompleter(this.paymentServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processNewPage() {
|
|
// give the page a few seconds to settle
|
|
const errorPages = ['https://apl.cnb.cz/apljerrsdad/undefined', 'chrome-error://chromewebdata/'];
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
if (errorPages.indexOf(pageUrl.href) !== -1) {
|
|
logger.warn(`Directed to: ${pageUrl.href}`);
|
|
this.emit('recover');
|
|
|
|
return;
|
|
}
|
|
|
|
switch (pageUrl.pathname) {
|
|
|
|
case '/apljerrsdad/JERRS.WEB07.INTRO_PAGE':
|
|
await this.handleIntroPage();
|
|
break;
|
|
|
|
case '/apljerrsdad/JERRS.WEB45.LOGIN_A':
|
|
await this.handleCaptchaPage();
|
|
break;
|
|
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS':
|
|
await this.handleBasicListings();
|
|
break;
|
|
|
|
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS_RESPONSE':
|
|
await this.handleListingsPage();
|
|
break;
|
|
|
|
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS_RESPONSE_3':
|
|
await this.handleSubListPage();
|
|
break;
|
|
|
|
case '/apljerrsdad/JERRS.WEB10.VIZITKA':
|
|
await this.processRedirector();
|
|
break;
|
|
|
|
case '/apljerrsdad/JERRS.WEB14.POVOLENE_CINNOSTI':
|
|
await this.processAuthority();
|
|
break;
|
|
|
|
default:
|
|
if (process.env.NODE_ENV) {
|
|
await this._uploadError();
|
|
// throw new Error(`Unknown page: ${pageUrl}`);
|
|
this.emit('recover');
|
|
}
|
|
else {
|
|
logger.warn('processNewPage Fell through');
|
|
logger.warn('currentPage.location', pageUrl);
|
|
}
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async attachEvents() {
|
|
this.on('entityComplete', () => {
|
|
this.handleEntityComplete();
|
|
});
|
|
|
|
this.on('serviceDone', async function() {
|
|
switch (this.mode) {
|
|
|
|
case 0:
|
|
this.emit('paymentServicesDone');
|
|
break;
|
|
|
|
case 1:
|
|
this.emit('emoneyServicesDone');
|
|
break;
|
|
|
|
case 2:
|
|
this.emit('creditServicesDone');
|
|
break;
|
|
|
|
}
|
|
});
|
|
|
|
this.on('paymentServicesDone', async function() {
|
|
logger.warn('paymentServicesDone');
|
|
try{
|
|
this.paymentServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices.links);
|
|
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
await this._goto(this.emoneyServices.urls[0]);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
|
|
this.on('emoneyServicesDone', async function() {
|
|
logger.warn('emoneyServicesDone');
|
|
try{
|
|
this.emoneyServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices.links);
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
await this._goto(this.creditServices.urls[0]);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
|
|
this.on('creditServicesDone', async function() {
|
|
logger.warn('creditServicesDone');
|
|
try{
|
|
this.creditServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices.links);
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
this.emit('done');
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async start() {
|
|
super._start();
|
|
try {
|
|
this.mode = 0;
|
|
|
|
this.inProgress = false;
|
|
|
|
this.paymentServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'urls': ['https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en'],
|
|
'sections' : ['Payment institutions and branches of foreign payment institutions', 'Small payment institutions', 'Account information service providers and branches of foreign account information service providers'],
|
|
'sectionLinks' : []
|
|
};
|
|
|
|
this.emoneyServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'urls': ['https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en'],
|
|
'sections' : ['Electronic money institutions and branches of foreign electronic money institutions', 'Small e-money issuers'],
|
|
'sectionLinks' : []
|
|
};
|
|
|
|
this.creditServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'searchDone' : false,
|
|
'started': false,
|
|
'urls': ['https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en'],
|
|
'sections' : ['Banks and branches of foreign banks'],
|
|
'sectionLinks' : []
|
|
};
|
|
|
|
this.urlPrefix = 'https://apl.cnb.cz/apljerrsdad/';
|
|
|
|
this.startPage = this.paymentServices.urls[0];
|
|
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
|
|
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
|
|
|
|
this.setPath(path.resolve(`${__dirname }/../artefacts/CZ/CNB`));
|
|
|
|
await this._doNonRepudiation().catch((err) => {
|
|
logger.warn(err);
|
|
});
|
|
|
|
await this._initBrowser();
|
|
await this._createBrowserPage();
|
|
|
|
this.page.on('domcontentloaded', this._throttle(async () => {
|
|
this.processNewPage().catch((err) => {
|
|
logger.error('processNewPage fail', err);
|
|
});
|
|
}, 2500));
|
|
|
|
if (this.eventNames().length === 2)
|
|
await this.attachEvents();
|
|
|
|
//
|
|
|
|
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
|
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
}
|
|
catch(e) {
|
|
throw new Error(e);
|
|
}
|
|
}
|
|
|
|
async __run() {
|
|
await this.start();
|
|
}
|
|
|
|
}
|
|
|
|
module.exports = CZScrape;
|