// 2019-05-05 19:13:56 +00:00
const Scraper = require ( '../helpers/scraper' ) ;
const cheerio = require ( 'cheerio' ) ;
const path = require ( 'path' ) ;
const jsonfile = require ( 'jsonfile' ) ;
const url = require ( 'url' ) ;
const logger = require ( 'log4js' ) . getLogger ( 'CY' ) ;
logger . level = process . env . LOGGER _LEVEL || 'warn' ;
// load env variables from file
/**
 * Scraper for the Central Bank of Cyprus (centralbank.cy) registers.
 *
 * The flow is event/navigation driven: `start()` opens the payment-institutions
 * page, every `domcontentloaded` event is routed through `processNewPage()`,
 * and each page handler navigates on to the next page (payment institutions ->
 * electronic money institutions -> register of credit institutions). The last
 * handler emits `done`.
 *
 * Helpers prefixed with `_` (`_randomWait`, `_goto`, `_cleanUp`, ...) are
 * inherited from the project `Scraper` base class and are not defined here.
 */
class CYScrape extends Scraper {
  /**
   * Registers the scraper id, the request block filter and the `done`
   * handler, and exposes a debounced `run()` entry point.
   *
   * When NODE_ENV is `production`, `run()` is triggered automatically once
   * `_checkLock()` resolves with a truthy value.
   */
  constructor() {
    super();
    this.setID('CY');
    this.addToBlockFilters(['recaptcha']);

    this.on('done', () => {
      this._done();
    });

    // NOTE(review): 5000 is presumably a debounce window in milliseconds — confirm against Scraper._debounce.
    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((lock) => {
          if (lock) {
            this.run();
          }
        })
        .catch((err) => {
          // Without a handler a failing lock check surfaces as an unhandled rejection.
          logger.error('checkLock', err);
        });
    }
  }

  /**
   * Navigates to the `href` of every element matching `selector` so that the
   * browser downloads the linked files into `this.path`.
   *
   * @param {string} selector CSS selector matching the download links
   * @returns {Promise<void>}
   */
  async grabLink(selector) {
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    // Resolve every href before navigating anywhere: element handles belong to
    // the current document and can no longer be evaluated once a navigation
    // actually succeeds.
    const clickableLinks = await this.page.$$(selector);
    const hrefs = await Promise.all(
      clickableLinks.map((item) => this.page.evaluate((el) => el.href, item))
    );

    for (const href of hrefs) {
      await this._randomWait(this.page, 3, 5);
      await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
        // Puppeteer isn't supposed to support this sort of download, so the
        // navigation is aborted every time. Mute net::ERR_ABORTED but report
        // everything else.
        if (!err.message.includes('net::ERR_ABORTED')) {
          logger.error('grabLink', err);
        }
      });
    }
  }

  /**
   * Downloads one of the two files linked from the electronic money
   * institutions page.
   *
   * @param {number} id Index of the link to follow (0 or 1)
   * @returns {Promise<void>}
   */
  async downloadEmoney(id) {
    const selector = [
      '#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a',
      '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'
    ];

    await this.grabLink(selector[id]);
  }

  /**
   * Downloads the file(s) linked from the headings of the payment
   * institutions page.
   *
   * @returns {Promise<void>}
   */
  async downloadExcel() {
    const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';

    await this.grabLink(selector);
  }

  /**
   * Handler for the payment institutions page: takes a screenshot, downloads
   * the linked file(s) and then navigates to the electronic money
   * institutions page.
   *
   * @returns {Promise<void>}
   */
  async handlePaymentInstitutions() {
    const filename = 'licensing-and-supervision-of-payment-institutions';

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);
    await this.downloadExcel();
    await this._randomWait(this.page, 3, 5);
    await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
  }

  /**
   * Handler for the electronic money institutions page: takes a screenshot,
   * downloads both linked files and then emits
   * `startProcessingCreditServices`.
   *
   * @returns {Promise<void>}
   */
  async handleElectronicMoneyInstitutions() {
    const filename = 'licensing-and-supervision-of-electronic-money-institutions';

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);
    await this.downloadEmoney(0);
    await this._randomWait(this.page, 3, 5);
    await this.downloadEmoney(1);
    await this._randomWait(this.page, 3, 5);
    this.emit('startProcessingCreditServices');
  }

  /**
   * Extracts the numbered list that follows the
   * "LOCAL AUTHORISED CREDIT INSTITUTIONS" heading.
   *
   * If several paragraphs match the heading, the element after the last one
   * is used.
   *
   * @param {string} body HTML fragment to search
   * @returns {Promise<{}|Array<string>|undefined>} the entries with their
   *   "N. " prefix removed; `{}` when the heading is not found; `undefined`
   *   when parsing throws (the error is logged)
   */
  async extractLocalCreditInstitutions(body) {
    try {
      const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
      const sanity = /(\d+\.\s)(.+)/;
      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });

      let nextItem;
      $('p').each((i, elem) => {
        if (matchHeading.test($(elem).text())) {
          nextItem = $(elem).next();
        }
      });

      if (typeof nextItem === 'undefined' || nextItem === null) {
        return {};
      }

      const output = [];
      for (const item of $(nextItem).text().split('\n')) {
        const newItem = this._cleanUp(item);
        if (newItem === '') {
          continue;
        }

        const numbered = sanity.exec(newItem);
        if (numbered === null) {
          // A line without the "N. " prefix used to throw and discard the
          // whole list; keep the text instead and make the anomaly visible.
          logger.warn('extractLocalCreditInstitutions: unnumbered line kept as-is:', newItem);
          output.push(newItem);
        }
        else {
          output.push(numbered[2]);
        }
      }

      return output;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the two-level structure that follows the "FOREIGN AUTHORISED
   * CREDIT INSTITUTIONS ..." heading.
   *
   * The walk is deliberately fixed so it is easy to adjust by hand: two
   * groups, each made of a group heading followed by two sections, and each
   * section made of a section heading followed by an element containing
   * `<li>` items. Headings have their leading "X. " enumeration removed.
   *
   * @param {string} body HTML fragment to search
   * @returns {Promise<Object<string, Object<string, Array<string>>>|undefined>}
   *   `{ groupHeading: { sectionHeading: [items] } }`; `{}` when the heading
   *   is not found; `undefined` when the page does not follow the expected
   *   layout (the error is logged)
   */
  async extractForeignCreditInstitutions(body) {
    try {
      const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;
      const sanity = /(\w+\.\s+)(.+)/;
      const groupCount = 2;
      const sectionsPerGroup = 2;
      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });
      const output = {};

      // Element currently being read; starts right after the (last) matching heading.
      let cursor;
      $('p').each((i, elem) => {
        if (matchHeading.test($(elem).text())) {
          cursor = $(elem).next();
        }
      });

      if (typeof cursor === 'undefined' || cursor === null) {
        return output;
      }

      // Returns the heading text of `elm` without its leading enumeration.
      const readHeading = (elm) => {
        const original = this._cleanUp($(elm).text());
        const parts = sanity.exec(original);
        if (parts === null) {
          throw new Error(`extractForeignCreditInstitutions: unexpected heading "${original}"`);
        }
        return parts[2];
      };

      // Returns the text of every <li> found inside `elm`.
      const readItems = (elm) => {
        const items = [];
        $(elm).find('li').each((i, li) => {
          items.push($(li).text());
        });
        return items;
      };

      for (let group = 0; group < groupCount; group++) {
        const firstHead = readHeading(cursor);
        output[firstHead] = {};
        cursor = $(cursor).next();

        for (let section = 0; section < sectionsPerGroup; section++) {
          const secondHead = readHeading(cursor);
          cursor = $(cursor).next();
          output[firstHead][secondHead] = readItems(cursor);
          cursor = $(cursor).next();
        }
      }

      return output;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Handler for the register of credit institutions page: stores a
   * screenshot, the raw HTML and the extracted JSON under `this.path`, then
   * emits `done`.
   *
   * Note that `done` is only emitted when every step succeeds.
   *
   * @returns {Promise<{local: *, creditInstitutes: *}|undefined>} the
   *   extracted data, or `undefined` when a step throws (the error is logged)
   */
  async processCreditInstitute() {
    logger.info('Credit institutes');

    try {
      await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);

      const body = await this.page.content();
      await this._dumpFile(`${this.path}/creditInstitutes.html`, body);

      const $ = cheerio.load(body);
      const content = $('.generic_page-intro');
      if (content.length === 0) {
        logger.warn('processCreditInstitute: ".generic_page-intro" was not found on the page');
      }

      // Serialise the container once and feed the same markup to both extractors.
      const introHtml = content.html();
      const local = await this.extractLocalCreditInstitutions(introHtml);
      const creditInstitutes = await this.extractForeignCreditInstitutions(introHtml);

      await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });

      this.emit('done');

      return { local, creditInstitutes };
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Renders the current page to an A4 PDF.
   *
   * @param {string} filePath Destination passed straight to `page.pdf()`.
   *   NOTE(review): `processNewPage()` passes a bare file name taken from the
   *   URL, so the file is not written under `this.path` — confirm intent.
   * @returns {Promise<void>}
   */
  async savePDF(filePath) {
    logger.info('Saving the pdf:', filePath);

    await this._randomWait(this.page, 5, 7);
    await this.page.pdf({ 'path': filePath, 'format': 'A4' });

    // TODO: decide whether 'startProcessingCreditServices' should be emitted here.
    logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
  }

  /**
   * Dispatches the page that has just loaded to its handler, keyed on the
   * URL path. When the path ends in `.pdf`, the file name is split off and
   * the remaining directory path is used as the key.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page is not one of the known pages
   */
  async processNewPage() {
    // Only a trailing ".pdf" extension counts; the dot is literal.
    const checkPDF = /\.pdf$/i;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    let currentPath = pageUrl.pathname;
    let pdfFile;

    if (checkPDF.test(currentPath)) {
      const splitPath = currentPath.split('/');
      pdfFile = splitPath.pop();
      currentPath = splitPath.join('/');
    }

    switch (currentPath) {
      case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
        await this.handlePaymentInstitutions();
        break;

      case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
        await this.handleElectronicMoneyInstitutions();
        break;

      case '/images/media/redirectfile/Electronic%20Money%20Institutions':
        logger.warn('We should only arrive here when in Non-headless mode');
        await this.savePDF(pdfFile);
        break;

      case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':
        await this.processCreditInstitute();
        break;

      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Registers the listener that navigates to the credit institutions
   * register once the e-money page has been handled.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    logger.info('Attaching events');

    this.on('startProcessingCreditServices', async () => {
      try {
        await this._goto(this.credit);
      }
      catch (err) {
        // The emitter ignores the listener's promise, so a rejection has to be handled here.
        logger.error('startProcessingCreditServices', err);
      }
    });
  }

  /**
   * Prepares state, output directory and browser page, wires up the page and
   * scraper events, starts tracing and opens the first page.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure raised during setup
   */
  async start() {
    try {
      // NOTE(review): the result is not awaited, as in the original — confirm Scraper._start() is synchronous.
      super._start();

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done': false,
        'searchDone': false
      };

      this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
      this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
      this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';

      this.path = path.resolve(`${__dirname}/../artefacts/CY/CBOC`);

      await this._createDirectory(this.path);

      // Best effort: a failure here is logged and does not stop the run.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Attach the navigation listener once per instance. Checking the event
      // itself avoids depending on how many other listeners happen to exist.
      if (this.listenerCount('startProcessingCreditServices') === 0) {
        await this.attachEvents();
      }

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });

      await this._goto(this.startPage);

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack and type survive; only wrap non-Error values.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point behind the debounced `run()`: logs and starts the scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.info('Scraping Cyprus...');

    await this.start();
  }
}
module . exports = CYScrape ;