2019-05-05 19:13:56 +00:00
const Scraper = require ( '../helpers/scraper' ) ;
const cheerio = require ( 'cheerio' ) ;
const path = require ( 'path' ) ;
const jsonfile = require ( 'jsonfile' ) ;
const logger = require ( 'log4js' ) . getLogger ( 'CZ' ) ;
const url = require ( 'url' ) ;
// Verbosity is taken from the LOGGER_LEVEL environment variable; 'warn' is used when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
class CZScrape extends Scraper {
constructor ( ) {
super ( ) ;
2019-08-15 07:48:49 +00:00
this . setID ( 'CZ' ) ;
2019-05-05 19:13:56 +00:00
this . version = '0.0.1-3' ;
this . captchas = [ 'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAj0lEQVRYR+2XQQ6AIAwE7f8fXeOBBIlll8bYSNZzhemUZMHc3Y/CzwSQMWBm06GtTDU1ghIAtGmkBNmgDZQBZDcejUQmoIEIYKb26Z/XANBMW+cjhABkQAZkQAZkYB8DLe0+i+PVeL3q+yhG8Q0vJBEA+5bZB6DvGN0TUde3tX75MGHnz9TRh5BZLFNTDnACDZUAsJw5oEAAAAAASUVORK5CYII=' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAXElEQVRYR+2XsQ0AMAjDyv9H0wmVganQZjEPYDkREubuvoRjAHQMmJ3sboNsRQCAzEBeHC342gEJQLU036/nBuQA1bWWlTBgAMAABjCAAQxgAAMTL2XrMQFgwsAGQ1axoX3D1WIAAAAASUVORK5CYII=' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAuUlEQVRYR+2WUQ6AMAhD2f0PjfHDxBiRtiPBJfMb17cC1eHubo3P2ACSAyPpGdFUrQUtAJloZEriBu5AG0AkHN2MrM8dIA80sp4HQCf8CRK8twCAGtNlDrAA5TOwFMBXVshDyDjwBlCWhBGIIHo/Kl9DRviqRbPCzOoBCPGTlwNAPkgtAKSoNgPCiiELhLdgctrDWU7/CcloRW7NteC3AOxV5ShGVg+B2QCyA4i9EzV4DkyIfL3aDnAAnuOdod3Qo9wAAAAASUVORK5CYII=' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAArklEQVRYR+2XSw6AIAwF7f0PXcPCxCj9PRqKBtcCk2n7UGJmPgof2gCQASK9aIGqYiUoAbAO1ZwoRvwGygCkg7U699YI79sGlgPwdvgTPM3AdAA0ptMMIACpTbg0gDcbUoKoZ2IDeA00e3AOLN2EElz5GDawtCBybvSS4VwXvw2n3wWR61ibCngKIqMGfJbZJbg2HQFJieJygLteC8bbqG1av/ljgsSzsAYz8CeAE+03waHIRTLTAAAAAElFTkSuQmCC' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAzklEQVRYR+2WYQqAMAiF9Uzdqc5Ud+pMiwULGbopTEew/gQ1pn4+34YppQQTH1wJjCRwIcINABvA+z4V3R3aggPxC56T2KMSqCsPJ1BXXtoQQkAK7q4BCXvB764BTnBhU8BhzyMXNgW0Uio0dwJS5eUocSfQq7D3nzvz1E7Iqb6ec1cCdHPJ410IWDZ1IWDZ1JJs0UNTAy2bpY5Hj2Dpu9S2ZgIau9UEb9nyIqD2Ac3F2SJYlQg1Qema4VNgTWARWASmE7CKNq8f6gO/TOABNoFBsH2myFkAAAAASUVORK5CYII=' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAwklEQVRYR+2WwRHDMAgEoab0FNeU9OSayPgRj8YjdNIRWXngj37WaQ8O1MxMFn6aAhAB3VTkISK7hE571p2GFuhbw5cf4u3FCkgCFwKel2wnD9eA5+U8AUng3wj0hNJIoeIiJHJgpFCxgGASIjFYAEGgtAnZgQU4BFovKwfYNAKtl9UGGD0N6YS7kKOnIS2gMz9gDdACksCvCHg74W1dsHwn/BYhux2Hk/AUQM6EcBLW2hDZgi4t/zktB3rzIwUsJ/ABbkZFsGTnR4wAAAAASUVORK5CYII=' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAA90lEQVRYR+2W0Q2DMAxEk5nYCWaCnZjJFVIjVRHmXoyK+1F+kJBD3p3jg2pmVhKv+geIOLDVrexlL1OZTu+rrbipuAVq0ygMBljq4ir2Nj+ezzZfuiEBosp7KK8tEsBT7ikbrZcAngOeotF6CdArUj3Fx/9dKAF6RSMjRmAkwE86QEeStCvkAB1N0i4JQDf7WhJSu71vg2pD2IG0HGhK05KQHKxj/ml+yBZEc4CukwBUSZ96dJ0EoEp6ALpOAlAljzmQPgUt8R7LgWZtNBFvJ2EDiH4TVG7IQ3j2U6Fg1Kaf7wwBqHYo228DkF8tWhNygL6c1KUDvADp2Oqw5E5+1wAAAABJRU5ErkJggg==' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAAvUlEQVRYR+2WwQ3AIAwDw0zsBDPRnZiJikclhEBAbDUf+uGDiHN2KK6UUsTwc1fAikCMTrwXyVmgNYSx00sLnsfBxav4lJQCLgFzAporohU98/47dxnCEwF9YFfF69kUATObZqPXNkURoOmcakFPYKdzqgBTAqOb8lcCCH5oCmb/iJ3Ro0wB2jkcQiR4MAE0eLAAFn51CFn41QLMCLBGT50BZueqMWR6rxJwCZgTOHkf7u6lPMl2i432mQt4AZRbZ7D1wLKYAAAAAElFTkSuQmCC' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAAA10lEQVRYR+2X0Q2AIAxEy0zuJDPpTs5U4weGkJYrDUJi8MefRo7X6yGBmZkmPmEJ8BAI50l0XUTbJr75OMxNdbUgxKgu/ojife8vAC2qEUFizAQQdm87oADvzksiGgksQDFczWiSaK0eCygMh3qa3FeK6EbAOmKlZ/5DILl9mAdae/rU5/iRZ6AJXwEgfj/LAY1A7SzIxSwCnxGwTgHKDWhC7Syo7Uw6uFYS/odAmv9hHtCScNgUrCSU/q/RDyoKn/ybMIhEATPuBeabRmOhi0DjGtXy6QJujffgsC3pb7kAAAAASUVORK5CYII=' ,
'iVBORw0KGgoAAAANSUhEUgAAACAAAAAgCAYAAABzenr0AAABA0lEQVRYR+2W0Q3DIAxEyUzsBDPBTsxEhVQkZMW5s5MWqWp/8lETng/fkaP33sPG3/EH8ChQaw2ttRBjPH2WUuhDdR1BzlndfECllJ4HQJtqiiAYWgEku/c4IIC3c6mIpgQE0DrXBs1aDwGkAuhM5/Sx6yCA7Ii1GLsOArCdSN+x6yAA24kEYNdBALaTrykw/b7NBdPfmhu03Hg8B7TksyYinAHN11e34RmEW4F1uNhYHps95oIVgL2QxoCy7qGPgL7g34UfUcAC8TsKWH1trYczYL3frfUQ4G4OoO8HGoC1oAwh9P1AA1jyAG26vssFgBIRyX4bwJIHqNalAHqp5f/tAC88u1y/XYiv9gAAAABJRU5ErkJggg=='
] ;
this . on ( 'done' , async ( ) => {
await this . _saveLocalStorage ( this . page , ` ${ this . path } / ${ this . id } _localstorage.json ` ) ;
await this . _done ( ) ;
} ) ;
this . run = this . _throttle ( async ( ) => {
await this . _ _run ( ) ;
} , 5000 ) ;
if ( process . env . NODE _ENV === 'production' )
this . _checkLock ( ) . then ( async ( l ) => {
if ( l )
await this . run ( ) ;
} ) ;
}
async getBase64Image ( img ) {
// Create an empty canvas element
var canvas = document . createElement ( 'canvas' ) ;
canvas . width = img . width ;
canvas . height = img . height ;
// Copy the image contents to the canvas
var ctx = canvas . getContext ( '2d' ) ;
ctx . drawImage ( img , 0 , 0 ) ;
// Get the data-URL formatted image
// Firefox supports PNG and JPEG. You could check img.src to
// guess the original format, but be aware the using "image/jpg"
// will re-encode the image.
var dataURL = canvas . toDataURL ( 'image/png' ) ;
return dataURL . replace ( /^data:image\/(png|jpg);base64,/ , '' ) ;
}
async getBinOfImg ( elm ) {
const bin = await this . page . evaluate ( el => {
const canvas = document . createElement ( 'canvas' ) ;
canvas . width = el . width ;
canvas . height = el . height ;
// Copy the image contents to the canvas
const ctx = canvas . getContext ( '2d' ) ;
ctx . drawImage ( el , 0 , 0 ) ;
const dataURL = canvas . toDataURL ( 'image/png' ) ;
return dataURL . replace ( /^data:image\/(png|jpg);base64,/ , '' ) ;
} , elm [ 0 ] ) ;
// logger.debug(bin);
return bin ;
}
async captchaTest ( ) {
// #ID_EMAIL_FORM > fieldset > table > tbody > tr > td:nth-child(2) > img
const rawBins = [ ] ;
for ( let step = 2 ; step <= 7 ; step ++ ) {
const elm = await this . page . $$ ( ` #ID_EMAIL_FORM > fieldset > table > tbody > tr > td:nth-child( ${ step } ) > img ` ) ;
rawBins . push ( this . captchas . indexOf ( await this . getBinOfImg ( elm ) ) ) ;
}
// #OPIS
await this . page . focus ( '#OPIS' ) ;
await this . page . keyboard . type ( rawBins . join ( '' ) , { 'delay' : 15 } ) ; // Types slower, like a user
logger . info ( rawBins ) ;
return rawBins . join ( '' ) ;
}
async handleCaptchaPage ( ) {
logger . debug ( '+ handleCaptchaPage' ) ;
await this . _randomWait ( this . page , 2 , 2 , 'handleCaptchaPage' ) ;
await this . captchaTest ( ) ;
await this . _microWait ( this . page , 5 ) ;
await this . _findAndClick ( 'input.jerrsButton' ) ;
// await this._saveLocalStorage(this.page, `${this.path}/${this.id}_localstorage.json`);
logger . debug ( '- handleCaptchaPage' ) ;
}
async handleBasicListings ( ) {
try {
const options = await this . page . $$ ( 'select[name="p_rec_per_page"] option' ) ;
const wantedOption = [ 'no paginate' ] ;
for ( const item of options ) {
const text = await this . page . evaluate ( el => el . innerText , item ) ;
const value = await this . page . evaluate ( el => el . value , item ) ;
if ( wantedOption . indexOf ( text ) !== - 1 ) {
await this . page . select ( 'select[name="p_rec_per_page"]' , value ) ;
break ;
}
}
await this . _microWait ( this . page , 5 ) ;
await this . _findAndClick ( '#ID_BL_FORM > fieldset > table > tbody > tr:nth-child(4) > td:nth-child(1) > input' ) ;
}
catch ( e ) {
throw new Error ( e ) ;
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async handleIntroPage ( ) {
if ( ! this . inProgress ) {
await this . _randomWait ( this . page , 3 , 5 , 'handleIntroPage' ) ;
await this . _findAndClick ( '#navigace > li:nth-child(5) > a' ) ;
}
else {
logger . warn ( 'Trying to resume..' ) ;
logger . warn ( this . lastUrl ) ;
this . _goto ( this . lastUrl ) ;
// this.emit('entityComplete');
}
}
/ * *
*
* @ param html
* /
async extractEntityDetails ( html ) {
const seq = [
{ 'name' : 'entityType' , 'field' : 'Entity Type' } ,
{ 'name' : 'companyId' , 'field' : 'Company Identification Number' } ,
{ 'name' : 'instituteName' , 'field' : 'Institution Name' } ,
{ 'name' : 'registeredAddress' , 'field' : 'Registered / permanent residence address' } ,
{ 'name' : 'contactAddress' , 'field' : 'Contact Address' } ,
{ 'name' : 'phone' , 'field' : 'Phone' } ,
{ 'name' : 'fax' , 'field' : 'Fax' } ,
{ 'name' : 'email' , 'field' : 'E-mail' } ,
{ 'name' : 'website' , 'field' : 'Website' } ,
{ 'name' : 'typeOfAuth' , 'field' : 'Type of authorization' } ,
{ 'name' : 'dateOfAuth' , 'field' : 'Date of authorization' } ,
{ 'name' : 'dateOfAuthLegalForce' , 'field' : 'Date the decision came to legal force' } ,
{ 'name' : 'ownershipStructure' , 'field' : 'Ownership Structure' } ,
{ 'name' : 'detailedEntitityType' , 'field' : 'Detailed Entity Type' } ,
{ 'name' : 'relatedLegalTies' , 'field' : 'Related legal ties' } ,
{ 'name' : 'otherFunctions' , 'field' : 'Other function(s)' } ,
{ 'name' : 'numericCode' , 'field' : 'Numeric code' } ,
{ 'name' : 'lei' , 'field' : 'LEI' }
] ;
const crossBorderField = 'Cross-border services' ;
const $ = cheerio . load ( html ) ;
const details = { 'crossBorderLinks' : [ ] , 'authProcess' : false , 'cbProcess' : false } ;
for ( const item of seq ) {
const headCells = $ ( ` #obsah > table > tbody td.tableNadpis:contains(" ${ item . field } ") ` ) ;
const foundCell = $ ( headCells ) . next ( 'td.tableDetail' ) ;
details [ item . name ] = this . _cleanUp ( $ ( foundCell ) . text ( ) ) ;
}
const href = $ ( 'a:contains("Authorized activities")' ) . attr ( 'href' ) ;
details [ 'authLink' ] = ` ${ this . urlPrefix } ${ href } ` ;
const crossBorder = $ ( ` #obsah > table > tbody td.tableNadpis:contains(" ${ crossBorderField } ") ` ) . next ( 'td.tableDetail' ) ;
const cbElms = $ ( crossBorder ) . children ( ) ;
cbElms . each ( ( index , itm ) => {
details . crossBorderLinks . push ( { 'name' : $ ( itm ) . text ( ) , 'href' : ` ${ this . urlPrefix } ${ $ ( itm ) . attr ( 'href' ) } ` } ) ;
} ) ;
details . crossBorderStep = 0 ;
return details ;
}
/ * *
*
* @ param serviceObject
* @ returns { Promise < null > }
* /
async entityCompleter ( serviceObject ) {
let cbFlag = false ;
2019-08-15 07:48:49 +00:00
try {
if ( serviceObject . current . authLink !== '' && ! serviceObject . current . authProcess ) {
await this . _randomWait ( this . page , 3 , 5 , 'Get Authorisations' ) ;
2019-05-05 19:13:56 +00:00
2019-08-15 07:48:49 +00:00
await this . _goto ( serviceObject . current . authLink , { 'waitUntil' : 'networkidle0' } ) ;
2019-05-05 19:13:56 +00:00
2019-08-15 07:48:49 +00:00
return null ;
}
2019-05-05 19:13:56 +00:00
2019-08-15 07:48:49 +00:00
if ( typeof serviceObject . current . crossBorderLinks !== 'undefined' && ! serviceObject . current . cbProcess && serviceObject . current . crossBorderLinks . length > 0 ) {
await this . _randomWait ( this . page , 3 , 5 , 'Get CBs' ) ;
// logger.info(`Crossborder for ${serviceObject.current.crossBorderLinks[serviceObject.current.crossBorderStep].name}`);
2019-05-05 19:13:56 +00:00
2019-08-15 07:48:49 +00:00
await this . _goto ( serviceObject . current . crossBorderLinks [ serviceObject . current . crossBorderStep ] . href , { 'waitUntil' : 'networkidle0' } ) ;
}
else
cbFlag = true ;
}
catch ( err ) {
logger . error ( err ) ;
2019-05-05 19:13:56 +00:00
}
if ( cbFlag === true ) {
const filename = serviceObject . links [ serviceObject . step ] . fileName ;
const filePath = ` ${ this . path } / ${ filename } ` . substring ( 0 , 240 ) ;
logger . info ( ` Saving: ${ filename } .json ` ) ;
await jsonfile . writeFile ( ` ${ filePath } .json ` , serviceObject . current ) ;
await this . _randomWait ( this . page , 3 , 5 ) ;
serviceObject . links [ serviceObject . step ] . fileName = ` ${ filename } .json ` ;
serviceObject . links [ serviceObject . step ] . params = this . _getParamsFromUrl ( serviceObject . links [ serviceObject . step ] . href ) ;
serviceObject . step ++ ;
if ( serviceObject . step < serviceObject . items ) {
serviceObject . current = { } ;
2019-08-15 07:48:49 +00:00
await this . _goto ( serviceObject . links [ serviceObject . step ] . href , { 'waitUntil' : 'networkidle0' } ) ;
2019-05-05 19:13:56 +00:00
}
else
this . emit ( 'serviceDone' ) ;
}
}
/ * *
*
* @ param html
* @ returns { Promise < Array > }
* /
async extractEntityAuthority ( html ) {
const output = [ ] ;
let chunk ;
let header = '' ;
const $ = cheerio . load ( html ) ;
const rows = $ ( 'table tbody tr' ) ;
rows . each ( ( i , elm ) => {
const children = cheerio ( elm ) . children ( ) ;
if ( children . length === 1 ) {
if ( typeof ( chunk ) !== 'undefined' )
output . push ( [ header , chunk ] ) ;
chunk = [ ] ;
header = this . _cleanUp ( $ ( elm ) . text ( ) ) ;
}
else {
const text = this . _cleanUp ( $ ( children . eq ( 0 ) ) . text ( ) ) ;
const cb = $ ( children . eq ( 1 ) ) . find ( 'input' ) . attr ( 'checked' ) ;
if ( typeof ( cb ) !== 'undefined' )
chunk . push ( text ) ;
}
} ) ;
if ( typeof ( chunk ) !== 'undefined' )
output . push ( [ header , chunk ] ) ;
return output ;
}
/ * *
*
* @ param $
* @ param row
* @ param verts
* @ returns { Promise < * [ ] > }
* /
async reduceMatrixRow ( $ , row , verts ) {
const newRow = [ ] ;
const title = this . _cleanUp ( $ ( row ) . eq ( 0 ) . text ( ) ) ;
const cells = $ ( row ) . find ( 'input' ) ;
cells . each ( ( i , elm ) => {
const input = $ ( elm ) . attr ( 'checked' ) ;
const checked = ( typeof input !== 'undefined' ) ;
newRow . push ( [ verts [ i ] , checked ] ) ;
} ) ;
return [ title , newRow ] ;
}
/ * *
*
* @ param $
* @ param rows
* @ returns { Promise < Array > }
* /
async extractCSEAUndetakings ( $ , rows ) {
const output = [ ] ;
const verts = [ ] ;
let mSectionTitle = '' , mSection = [ ] ;
let section = [ ] ;
let sectionTitle = '' ;
let lastRowLength = 0 ;
// build index of verts
const vertCols = $ ( rows ) . eq ( 1 ) . find ( 'td' ) ;
vertCols . each ( ( i , elm ) => {
const alt = $ ( elm ) . find ( 'img' ) . attr ( 'alt' ) ;
verts . push ( alt ) ;
} ) ;
// walk the rows.
for ( let index = 2 ; index < rows . length ; index ++ ) {
const row = $ ( rows ) . eq ( index ) ;
const children = cheerio ( row ) . children ( ) ;
if ( children . length === 1 ) {
// section change
// tableDetailLightGrey
if ( lastRowLength > ( verts . length ) )
output . push ( [ mSectionTitle , mSection ] ) ;
if ( children . eq ( 0 ) . attr ( 'class' ) === 'tableDetailLightGrey' ) {
mSection = [ ] ;
mSectionTitle = this . _cleanUp ( $ ( row ) . text ( ) ) ;
}
if ( children . eq ( 0 ) . attr ( 'class' ) === 'tableNadpis' ) {
if ( section . length > 0 )
output . push ( [ sectionTitle , section ] ) ;
section = [ ] ;
sectionTitle = this . _cleanUp ( $ ( row ) . text ( ) ) ;
}
}
if ( children . length === 2 ) {
if ( lastRowLength > ( verts . length ) )
output . push ( [ mSectionTitle , mSection ] ) ;
const text = this . _cleanUp ( $ ( children . eq ( 0 ) ) . text ( ) ) ;
const cb = $ ( children . eq ( 1 ) ) . find ( 'input' ) . attr ( 'checked' ) ;
if ( typeof ( cb ) !== 'undefined' )
section . push ( text ) ;
}
if ( children . length > ( verts . length ) ) {
// a matrix row
const matrixRow = await this . reduceMatrixRow ( $ , row , verts ) ;
mSection . push ( matrixRow ) ;
}
lastRowLength = children . length ;
}
return output ;
}
/ * *
*
* @ param $
* @ param rows
* @ returns { Promise < * [ ] > }
* /
async extractCSEAActivity ( $ , rows ) {
const activity = [ ] ;
let chunk ;
let section = [ ] ;
let sectionTitle ;
let chunkTitle = '' ;
let lastItemSection = false ;
rows . each ( ( i , elm ) => {
const children = cheerio ( elm ) . children ( ) ;
if ( children . length === 1 )
if ( children . eq ( 0 ) . attr ( 'class' ) === 'tableNadpis' ) {
if ( typeof ( chunk ) !== 'undefined' )
activity . push ( chunk ) ;
chunk = [ ] ;
chunkTitle = this . _cleanUp ( $ ( elm ) . text ( ) ) ;
}
else {
sectionTitle = this . _cleanUp ( $ ( elm ) . text ( ) ) ;
section = [ ] ;
}
else {
const text = this . _cleanUp ( $ ( children . eq ( 0 ) ) . text ( ) ) ;
const cb = $ ( children . eq ( 1 ) ) . find ( 'input' ) . attr ( 'checked' ) ;
const span = $ ( children . eq ( 0 ) ) . find ( 'span' ) ;
if ( typeof ( cb ) !== 'undefined' ) {
if ( span . length > 0 )
section . push ( text ) ;
else {
if ( lastItemSection === true )
chunk . push ( [ sectionTitle , section ] ) ;
chunk . push ( text ) ;
}
lastItemSection = ( span . length > 0 ) ;
}
}
} ) ;
if ( typeof ( chunk ) !== 'undefined' )
activity . push ( [ chunkTitle , chunk ] ) ;
return activity ;
}
/ * *
*
* @ param html
* @ returns { Promise < { activity : Array , undertakings : * } > }
* /
async extractCreditServicesEntityAuthority ( html ) {
try {
let undertakings = null ;
const activity = [ ] ;
const $ = cheerio . load ( html ) ;
const tables = $ ( 'table' ) ;
for ( let index = 0 ; index < tables . length ; index ++ ) {
const table = $ ( tables . eq ( index ) ) ;
const matrixTable = $ ( table ) . find ( 'td.tableNadpis:contains("Act No. 256/2004 Coll., Capital Market Undertakings Act")' ) ;
const rows = $ ( table ) . find ( 'tbody tr' ) ;
if ( $ ( matrixTable ) . length === 0 ) {
const activityTable = await this . extractCSEAActivity ( $ , rows ) ;
activity . push ( activityTable ) ;
}
else
undertakings = await this . extractCSEAUndetakings ( $ , rows ) ;
}
return { activity , undertakings } ;
}
catch ( e ) {
logge . error ( e ) ;
}
}
/ * *
*
* @ param serviceObject
* @ returns { Promise < void > }
* /
async processCBDetails ( serviceObject ) {
logger . info ( ` Process ${ this . modeTitles [ this . mode ] } entity crossBorderStep: ${ serviceObject . links [ serviceObject . step ] . crossBorderStep } ` ) ;
}
/ * *
*
* @ param serviceObject
* @ returns { Promise < void > }
* /
async processAuthorityDetails ( serviceObject ) {
// serviceObject
const modeStrings = [ 'authority' , 'crossBorder' ] ;
const authorityMode = ( serviceObject . current . authProcess ) ? 1 : 0 ;
if ( authorityMode === 1 )
if ( ! serviceObject . current . hasOwnProperty ( 'crossBorder' ) ) {
serviceObject . current [ 'crossBorder' ] = { } ;
}
const crossBorderText = ( authorityMode === 1 ) ? ` for ${ serviceObject . current . crossBorderLinks [ serviceObject . current . crossBorderStep ] . name } ` : '' ;
logger . info ( ` Process ${ this . modeTitles [ this . mode ] } entity: ${ serviceObject . links [ serviceObject . step ] . name } ${ modeStrings [ authorityMode ] } ${ crossBorderText } ` ) ;
await this . _randomWait ( this . page , 3 , 5 ) ;
const ccFN = ( authorityMode === 1 ) ? ` _ ${ serviceObject . current . crossBorderLinks [ serviceObject . current . crossBorderStep ] . name } ` : '' ;
const filename = serviceObject . links [ serviceObject . step ] . fileName ;
const outFile = ` ${ filename } _ ${ modeStrings [ authorityMode ] } ${ ccFN } ` ;
const filePath = ` ${ this . path } / ${ outFile } ` . substring ( 0 , 240 ) ;
await this . _randomWait ( this . page , 3 , 5 ) ;
await this . _makeScreenshotV2 ( this . page , ` ${ filePath } ` , null ) ;
const body = await this . page . content ( ) ;
// const $ = cheerio.load(body);
const details = ( this . mode === 2 && authorityMode === 0 ) ? await this . extractCreditServicesEntityAuthority ( body ) : await this . extractEntityAuthority ( body ) ;
if ( authorityMode === 0 ) {
serviceObject . current [ 'authority' ] = ( this . mode === 2 ) ? Object . assign ( { } , details ) : details . slice ( ) ;
serviceObject . current . authProcess = true ;
}
else {
const cbStep = serviceObject . current . crossBorderStep ;
const countryCode = serviceObject . current . crossBorderLinks [ cbStep ] . name ;
serviceObject . current [ 'crossBorder' ] [ countryCode ] = details . slice ( ) ;
serviceObject . current . crossBorderStep ++ ;
if ( serviceObject . current . crossBorderStep >= serviceObject . current . crossBorderLinks . length )
serviceObject . current . cbProcess = true ;
}
logger . info ( ` Completed ${ modeStrings [ authorityMode ] } ... ` ) ;
this . emit ( 'entityComplete' ) ;
}
/ * *
*
* @ param serviceObject
* @ returns { Promise < void > }
* /
async processEntityDetails ( serviceObject ) {
// const noWhiteSpace = /\W/g;
serviceObject . current = { } ;
logger . info ( ` Process ${ this . modeTitles [ this . mode ] } entity: ${ serviceObject . links [ serviceObject . step ] . name } ` ) ;
logger . info ( ` Step ${ serviceObject . step } of ${ serviceObject . links . length } ` ) ;
await this . _randomWait ( this . page , 3 , 5 ) ;
const filename = serviceObject . links [ serviceObject . step ] . fileName ;
const filePath = ` ${ this . path } / ${ filename } ` . substring ( 0 , 240 ) ;
await this . _randomWait ( this . page , 3 , 5 ) ;
await this . _makeScreenshotV2 ( this . page , ` ${ filePath } _main ` , null ) ;
const body = await this . page . content ( ) ;
// const $ = cheerio.load(body);
const details = await this . extractEntityDetails ( body ) ;
serviceObject . current = Object . assign ( { } , details ) ;
this . emit ( 'entityComplete' ) ;
logger . info ( 'Entity complete...' ) ;
}
/ * *
*
* @ param serviceObject
* @ returns { Promise < void > }
* /
async populateSectionLinks ( serviceObject ) {
const pageUrl = url . parse ( await this . page . url ( ) ) ;
const urlPrefix = ` ${ pageUrl . protocol } // ${ pageUrl . host } /apljerrsdad/ ` ;
const body = await this . page . content ( ) ;
const $ = cheerio . load ( body ) ;
const links = $ ( 'a.textNorm' ) ;
for ( const items of serviceObject . sections )
for ( let index = 0 , len = links . length ; index < len ; index ++ ) {
const item = links [ index ] ;
const itemText = this . _cleanUp ( $ ( item ) . text ( ) ) ;
const itemLink = $ ( item ) . attr ( 'href' ) ;
if ( itemText === items )
serviceObject . sectionLinks . push ( ` ${ urlPrefix } ${ itemLink } ` ) ;
}
//
}
/ * *
*
* @ param serviceObject
* @ returns { Promise < void > }
* /
async useListingPage ( serviceObject ) {
if ( serviceObject . sectionLinks . length === 0 ) {
logger . debug ( 'SectionLinks empty' ) ;
await this . populateSectionLinks ( serviceObject ) ;
}
logger . debug ( serviceObject . sectionLinks ) ;
await this . _randomWait ( this . page , 3 , 5 , 'First sub section' ) ;
2019-08-15 07:48:49 +00:00
await this . _goto ( serviceObject . sectionLinks [ serviceObject . indexStep ] , { 'waitUntil' : 'networkidle0' } ) ;
2019-05-05 19:13:56 +00:00
}
/ * *
*
* @ param $
* @ param rows
* @ param indexStep
* @ returns { Promise < Array > }
* /
async extractDataFromSubList ( $ , rows , indexStep = 0 ) {
const cellTitles = [
'companyNumber' ,
'name' ,
'address' ,
'city' ,
'postcode' ,
'country' ,
'datefrom'
] ;
const pageUrl = url . parse ( await this . page . url ( ) ) ;
const urlPrefix = ` ${ pageUrl . protocol } // ${ pageUrl . host } /apljerrsdad/ ` ;
const details = [ ] ;
rows . each ( async ( i , elm ) => {
const children = $ ( elm ) . children ( ) ;
const newItem = { } ;
if ( children . length === 7 ) {
for ( let cPos = 0 ; cPos <= 6 ; cPos ++ )
newItem [ cellTitles [ cPos ] ] = this . _cleanUp ( children . eq ( cPos ) . text ( ) . trim ( ) ) ;
newItem [ 'href' ] = ` ${ urlPrefix } ${ children . eq ( 1 ) . children ( ) . eq ( 0 ) . attr ( 'href' ) } ` ;
newItem [ 'fileName' ] = this . _makeFileName ( newItem [ 'name' ] ) ;
newItem [ 'indexStep' ] = indexStep ;
details . push ( newItem ) ;
}
} ) ;
return details ;
}
/ * *
*
* @ param serviceObject
* @ returns { Promise < void > }
* /
async processSubListPage ( serviceObject ) {
const body = await this . page . content ( ) ;
const $ = cheerio . load ( body ) ;
// details = details.concat(urlParams);
const rows = $ ( 'table tr' ) ;
const details = await this . extractDataFromSubList ( $ , rows , serviceObject . indexStep ) ;
serviceObject . links = serviceObject . links . concat ( details ) ;
serviceObject . indexStep ++ ;
if ( serviceObject . indexStep >= serviceObject . sectionLinks . length ) {
this . inProgress = true ;
serviceObject . items = serviceObject . links . length ;
2019-08-15 07:48:49 +00:00
2019-05-05 19:13:56 +00:00
await this . _randomWait ( this . page , 3 , 5 , 'First page' ) ;
logger . info ( 'goto' , serviceObject . links [ serviceObject . step ] . href ) ;
2019-08-15 07:48:49 +00:00
await this . _goto ( serviceObject . links [ serviceObject . step ] . href , { 'waitUntil' : 'networkidle0' } ) ;
2019-05-05 19:13:56 +00:00
}
else {
await this . _randomWait ( this . page , 3 , 5 , 'Next sub section' ) ;
2019-08-15 07:48:49 +00:00
await this . _goto ( serviceObject . sectionLinks [ serviceObject . indexStep ] , { 'waitUntil' : 'networkidle0' , 'timeout' : 5000 } ) ;
2019-05-05 19:13:56 +00:00
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async handleSubListPage ( ) {
switch ( this . mode ) {
case 1 :
await this . processSubListPage ( this . emoneyServices ) ;
break ;
case 2 :
await this . processSubListPage ( this . creditServices ) ;
break ;
case 0 :
default :
await this . processSubListPage ( this . paymentServices ) ;
break ;
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async handleListingsPage ( ) {
switch ( this . mode ) {
case 1 :
await this . useListingPage ( this . emoneyServices ) ;
break ;
case 2 :
await this . useListingPage ( this . creditServices ) ;
break ;
case 0 :
default :
await this . useListingPage ( this . paymentServices ) ;
break ;
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async processRedirector ( ) {
switch ( this . mode ) {
case 1 :
await this . processEntityDetails ( this . emoneyServices ) ;
break ;
case 2 :
await this . processEntityDetails ( this . creditServices ) ;
break ;
case 0 :
default :
await this . processEntityDetails ( this . paymentServices ) ;
break ;
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async processAuthority ( ) {
switch ( this . mode ) {
case 1 :
await this . processAuthorityDetails ( this . emoneyServices ) ;
break ;
case 2 :
await this . processAuthorityDetails ( this . creditServices ) ;
break ;
case 0 :
default :
await this . processAuthorityDetails ( this . paymentServices ) ;
break ;
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async handleEntityComplete ( ) {
switch ( this . mode ) {
case 1 :
await this . entityCompleter ( this . emoneyServices ) ;
break ;
case 2 :
await this . entityCompleter ( this . creditServices ) ;
break ;
case 0 :
default :
await this . entityCompleter ( this . paymentServices ) ;
break ;
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async processNewPage ( ) {
// give the page a few seconds to settle
2019-08-15 07:48:49 +00:00
const errorPages = [ 'https://apl.cnb.cz/apljerrsdad/undefined' , 'chrome-error://chromewebdata/' ] ;
2019-05-05 19:13:56 +00:00
await this . _randomWait ( this . page , 3 , 5 ) ;
const pageUrl = url . parse ( await this . page . url ( ) ) ;
2019-08-15 07:48:49 +00:00
if ( errorPages . indexOf ( pageUrl . href ) !== - 1 ) {
logger . warn ( ` Directed to: ${ pageUrl . href } ` ) ;
2019-05-05 19:13:56 +00:00
this . emit ( 'recover' ) ;
return ;
}
switch ( pageUrl . pathname ) {
case '/apljerrsdad/JERRS.WEB07.INTRO_PAGE' :
await this . handleIntroPage ( ) ;
break ;
case '/apljerrsdad/JERRS.WEB45.LOGIN_A' :
await this . handleCaptchaPage ( ) ;
break ;
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS' :
await this . handleBasicListings ( ) ;
break ;
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS_RESPONSE' :
await this . handleListingsPage ( ) ;
break ;
case '/apljerrsdad/JERRS.WEB15.BASIC_LISTINGS_RESPONSE_3' :
await this . handleSubListPage ( ) ;
break ;
case '/apljerrsdad/JERRS.WEB10.VIZITKA' :
await this . processRedirector ( ) ;
break ;
case '/apljerrsdad/JERRS.WEB14.POVOLENE_CINNOSTI' :
await this . processAuthority ( ) ;
break ;
default :
if ( process . env . NODE _ENV ) {
await this . _uploadError ( ) ;
2019-08-15 07:48:49 +00:00
// throw new Error(`Unknown page: ${pageUrl}`);
this . emit ( 'recover' ) ;
2019-05-05 19:13:56 +00:00
}
else {
logger . warn ( 'processNewPage Fell through' ) ;
logger . warn ( 'currentPage.location' , pageUrl ) ;
}
break ;
}
}
/ * *
*
* @ returns { Promise < void > }
* /
async attachEvents ( ) {
this . on ( 'entityComplete' , ( ) => {
this . handleEntityComplete ( ) ;
} ) ;
this . on ( 'serviceDone' , async function ( ) {
switch ( this . mode ) {
case 0 :
this . emit ( 'paymentServicesDone' ) ;
break ;
case 1 :
this . emit ( 'emoneyServicesDone' ) ;
break ;
case 2 :
this . emit ( 'creditServicesDone' ) ;
break ;
}
} ) ;
this . on ( 'paymentServicesDone' , async function ( ) {
logger . warn ( 'paymentServicesDone' ) ;
try {
this . paymentServices . done = true ;
jsonfile . writeFileSync ( ` ${ this . path } /paymentServices.json ` , { 'links' : this . paymentServices . links } ) ;
jsonfile . writeFileSync ( ` ${ this . debugPath } /paymentServices.json ` , this . paymentServices . links ) ;
this . mode ++ ;
this . inProgress = false ;
await this . _goto ( this . emoneyServices . urls [ 0 ] ) ;
}
catch ( e ) {
logger . error ( e ) ;
}
} ) ;
this . on ( 'emoneyServicesDone' , async function ( ) {
logger . warn ( 'emoneyServicesDone' ) ;
try {
this . emoneyServices . done = true ;
jsonfile . writeFileSync ( ` ${ this . path } /emoneyServices.json ` , { 'links' : this . emoneyServices . links } ) ;
jsonfile . writeFileSync ( ` ${ this . debugPath } /emoneyServices.json ` , this . emoneyServices . links ) ;
this . mode ++ ;
this . inProgress = false ;
await this . _goto ( this . creditServices . urls [ 0 ] ) ;
}
catch ( e ) {
logger . error ( e ) ;
}
} ) ;
this . on ( 'creditServicesDone' , async function ( ) {
logger . warn ( 'creditServicesDone' ) ;
try {
this . creditServices . done = true ;
jsonfile . writeFileSync ( ` ${ this . path } /creditServices.json ` , { 'links' : this . creditServices . links } ) ;
jsonfile . writeFileSync ( ` ${ this . debugPath } /creditServices.json ` , this . creditServices . links ) ;
this . mode ++ ;
this . inProgress = false ;
this . emit ( 'done' ) ;
}
catch ( e ) {
logger . error ( e ) ;
}
} ) ;
}
/ * *
*
* @ returns { Promise < void > }
* /
async start ( ) {
super . _start ( ) ;
try {
this . mode = 0 ;
this . inProgress = false ;
this . paymentServices = {
'items' : 0 ,
'links' : [ ] ,
'step' : 0 ,
'indexStep' : 0 ,
'visited' : false ,
'done' : false ,
'urls' : [ 'https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en' ] ,
'sections' : [ 'Payment institutions and branches of foreign payment institutions' , 'Small payment institutions' , 'Account information service providers and branches of foreign account information service providers' ] ,
'sectionLinks' : [ ]
} ;
this . emoneyServices = {
'items' : 0 ,
'links' : [ ] ,
'step' : 0 ,
'indexStep' : 0 ,
'visited' : false ,
'done' : false ,
'urls' : [ 'https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en' ] ,
'sections' : [ 'Electronic money institutions and branches of foreign electronic money institutions' , 'Small e-money issuers' ] ,
'sectionLinks' : [ ]
} ;
this . creditServices = {
'items' : 0 ,
'links' : [ ] ,
'step' : 0 ,
'indexStep' : 0 ,
'visited' : false ,
'done' : false ,
'searchDone' : false ,
'started' : false ,
'urls' : [ 'https://apl.cnb.cz/apljerrsdad/JERRS.WEB07.INTRO_PAGE?p_lang=en' ] ,
'sections' : [ 'Banks and branches of foreign banks' ] ,
'sectionLinks' : [ ]
} ;
this . urlPrefix = 'https://apl.cnb.cz/apljerrsdad/' ;
this . startPage = this . paymentServices . urls [ 0 ] ;
this . emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html' ;
this . credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB' ;
this . setPath ( path . resolve ( ` ${ _ _dirname } /../artefacts/CZ/CNB ` ) ) ;
await this . _doNonRepudiation ( ) . catch ( ( err ) => {
logger . warn ( err ) ;
} ) ;
await this . _initBrowser ( ) ;
await this . _createBrowserPage ( ) ;
this . page . on ( 'domcontentloaded' , this . _throttle ( async ( ) => {
this . processNewPage ( ) . catch ( ( err ) => {
logger . error ( 'processNewPage fail' , err ) ;
} ) ;
} , 2500 ) ) ;
if ( this . eventNames ( ) . length === 2 )
await this . attachEvents ( ) ;
//
await this . page . setViewport ( { 'width' : 1200 , 'height' : 800 } ) ;
2019-08-15 07:48:49 +00:00
await this . _goto ( this . startPage , { 'waitUntil' : 'networkidle0' } ) ;
2019-05-05 19:13:56 +00:00
await this . _randomWait ( this . page , 3 , 5 ) ;
}
catch ( e ) {
throw new Error ( e ) ;
}
}
async _ _run ( ) {
await this . start ( ) ;
}
}
module . exports = CZScrape ;