const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('EE');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');

logger.level = process.env.LOGGER_LEVEL || 'warn';

// Lookup tables indexed by `this.mode` (0 = payment, 1 = e-money, 2 = credit).
const SERVICE_KEYS = ['paymentServices', 'emoneyServices', 'creditServices'];
const INDEX_DONE_EVENTS = ['psindexdone', 'emindexdone', 'ciindexdone'];
const SERVICE_DONE_EVENTS = ['paymentServicesDone', 'emoneyServicesDone', 'creditServicesDone'];

// Throttle window applied to page-load handling.
const PAGE_EVENT_THROTTLE_MS = 2500;
// Delay between re-creating the page and re-navigating during recovery.
const RECOVERY_RESTART_DELAY_MS = 90000;

/**
 * Builds the bookkeeping object used for one service category.
 * Key order is kept stable because the object is dumped to JSON for debugging.
 *
 * @param {string[]} urls index pages to crawl for this category
 * @param {Object} [extras] additional flags placed between `done` and `urls`
 * @returns {Object}
 */
function createServiceState(urls, extras = {}) {
  const state = {
    'items': 0,
    'links': [],
    'step': 0,
    'indexStep': 0,
    'visited': false,
    'done': false
  };

  return Object.assign(state, extras, {
    'urls': urls,
    'sections': [],
    'sectionLinks': [],
    'pageCount': 0
  });
}

/**
 * Returns the state object that belongs to the scraper's current mode,
 * or undefined when the mode is outside 0..2.
 *
 * @param {EEScrape} scraper
 * @returns {Object|undefined}
 */
function activeService(scraper) {
  return scraper[SERVICE_KEYS[scraper.mode]];
}

/**
 * Creates a throttled handler that processes the currently loaded page and
 * logs (rather than leaks) any rejection from `processNewPage`.
 *
 * @param {EEScrape} scraper
 * @returns {Function}
 */
function makePageHandler(scraper) {
  return scraper._throttle(async () => {
    scraper.processNewPage().catch((err) => {
      logger.error('processNewPage fail', err);
    });
  }, PAGE_EVENT_THROTTLE_MS);
}

/**
 * Navigates to `target`; a navigation timeout triggers the recovery flow,
 * any other failure is logged instead of being silently dropped.
 *
 * @param {EEScrape} scraper
 * @param {string} target
 * @returns {Promise<void>}
 */
async function gotoOrRecover(scraper, target) {
  try {
    await scraper._goto(target);
  } catch (err) {
    if (err && err.name === 'TimeoutError') scraper.emit('recover');
    else logger.error('Navigation failed', err);
  }
}

/**
 * Scraper for the register pages listed under https://www.fi.ee.
 *
 * It walks three service categories in turn (selected by `this.mode`) and,
 * for each, first builds an index of entity links (`this.workMode === 0`) and
 * then visits every link to extract the details (`this.workMode === 1`).
 * Helpers such as `_cleanUp`, `_makeFieldName`, `_goto` and `_randomWait` are
 * not defined in this file; presumably they come from the `Scraper` base class.
 */
class EEScrape extends Scraper {

  constructor() {
    super();
    this.id = 'EE';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    this.recover = this._debounce(async () => {
      await this.__recover();
    }, 120000);

    // In production the run starts automatically once the lock check passes.
    if (process.env.NODE_ENV === 'production')
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          logger.error('_checkLock failed', err);
        });
  }

  /**
   * Collects every anchor found in an index fragment.
   *
   * @param {string} html markup of the index list
   * @param {Object} [serviceObject] unused; kept for interface compatibility
   * @returns {Promise<Array<{name: string, href: string}>>}
   */
  async extractIndexItems(html, serviceObject) {
    const found = [];
    const $ = cheerio.load(html);

    $('a').each((idx, anchor) => {
      const href = $(anchor).attr('href');

      // An anchor without href would otherwise yield "<rootURI>undefined".
      if (typeof href === 'undefined') return;

      found.push({
        'name': this._cleanUp($(anchor).text()),
        'href': `${this.rootURI}${href}`
      });
    });

    return found;
  }

  /**
   * Reads the page title and the label/value rows of the first details table.
   *
   * @param {string} html full page markup
   * @returns {Promise<Object|undefined>} undefined when parsing throws (the error is logged)
   */
  async extractEntityDetails(html) {
    try {
      const $ = cheerio.load(html);
      const details = {};

      details.title = this._cleanUp($('h1.page-title').text());

      const firstTable = $('article div.table-wrap table').eq(0);

      firstTable.find('tbody > tr').each((idx, row) => {
        const cells = $(row).children();
        const label = this._makeFieldName(cells.eq(0).text());

        details[label] = this._cleanUp(cells.eq(1).text());
      });

      return details;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Collects the service lists from every table after the first one, keyed
   * by the field name derived from each table's caption. Tables sharing a
   * caption are merged into one list.
   *
   * @param {string} html full page markup
   * @returns {Promise<Object|undefined>} undefined when parsing throws (the error is logged)
   */
  async extractEntityServices(html) {
    try {
      const $ = cheerio.load(html);
      const grouped = {};

      $('article div.table-wrap table').each((idx, table) => {
        // The first table holds the entity details, not services.
        if (idx === 0) return;

        const label = this._makeFieldName($(table).find('caption').text());
        const services = $(table).find('div.field__item')
          .map((n, el) => this._cleanUp($(el).text()))
          .get();

        // The label comes from page content, so do not trust the object's own
        // `hasOwnProperty` member.
        const existing = Object.prototype.hasOwnProperty.call(grouped, label) ? grouped[label] : [];

        grouped[label] = existing.concat(services);
      });

      return grouped;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the entries that follow the "Licenses" heading.
   *
   * @param {string} html full page markup
   * @returns {{licenseDescription: string, blockType: string, licenses: Array}|{}|undefined}
   *   an empty object when the heading is absent; undefined when parsing throws
   */
  extractEntityLicense(html) {
    try {
      const blockType = 'Licenses';
      const result = { 'licenseDescription': '', 'blockType': blockType, 'licenses': [] };
      const $ = cheerio.load(html);
      const header = $(`h3:contains("${blockType}")`);

      if (header.length === 0) return {};

      header.next().children().each((idx, item) => {
        const fieldText = (selector) => this._cleanUp($(item).find(selector).text());
        const license = {};

        license.permitNumber = fieldText('div.field--name-field-permit-number div.field__item');
        license.permitEntryDate = fieldText('div.field--name-field-permit-entry-date div.field__item');
        license.restrictions = $(item).find('div.field--name-field-permit-restrictions').find('p')
          .map((n, el) => this._cleanUp($(el).text()))
          .get();

        result.licenses.push(license);
      });

      return result;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the entries that follow the
   * "List of cross-border services provided" heading.
   *
   * @param {string} html full page markup
   * @returns {{crossBorder: Array}|{}|undefined}
   *   an empty object when the heading is absent; undefined when parsing throws
   */
  extractEntityCrossBorder(html) {
    try {
      const blockType = 'List of cross-border services provided';
      const result = { 'crossBorder': [] };
      const $ = cheerio.load(html);
      const header = $(`h3:contains("${blockType}")`);

      if (header.length === 0) return {};

      header.next().children().each((idx, item) => {
        const fieldText = (selector) => this._cleanUp($(item).find(selector).text());
        const cb = {};

        cb.permitNumber = fieldText('div.field--name-field-overborder-permit-number div.field__item');
        cb.permitEntryDate = fieldText('div.field--name-field-overborder-permit-date div.field__item');
        cb.startDate = fieldText('div.field--name-field-overborder-permit-start div.field__item');

        cb.cbServices = $(item).find('div.field--name-field-services-list')
          .find('div.paragraph--type--subject-services-list')
          .map((n, el) => {
            const parts = $(el).children();
            const service = this._cleanUp(parts.eq(0).text());
            const country = this._cleanUp(parts.eq(1).text());

            return { service, country };
          })
          .get();

        result.crossBorder.push(cb);
      });

      return result;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the entries that follow the "Branches" heading, including each
   * branch's contact details, services and permits.
   *
   * @param {string} html full page markup
   * @returns {{branches: Array}|{}|undefined}
   *   an empty object when the heading is absent; undefined when parsing throws
   */
  extractEntityBranches(html) {
    try {
      // [output key, CSS class of the field wrapper]
      const subDetails = [
        ['country', 'field--name-field-country'],
        ['businessName', 'field--name-field-business-name'],
        ['address', 'field--name-field-address'],
        ['phone', 'field--name-field-phone']
      ];
      const blockType = 'Branches';
      const result = { 'branches': [] };
      const $ = cheerio.load(html);
      const header = $(`h3:contains("${blockType}")`);

      if (header.length === 0) return {};

      header.next().children().each((idx, item) => {
        const branch = { 'details': {}, 'branchServices': [], 'licenses': {} };

        branch.name = this._cleanUp($(item).find('header.paragraph-heading h4').text());

        for (const [key, cssClass] of subDetails)
          branch.details[key] = this._cleanUp($(item).find(`div.${cssClass} div.field__item`).text());

        branch.branchServices = $(item).find('div.field--name-field-branch-services')
          .find('div.paragraph--type--subject-services-list-simple div.field__item')
          .map((n, el) => this._cleanUp($(el).text()))
          .get();

        branch.licenses = $(item).find('div.field--name-field-branch-permissions')
          .find('div.paragraph--type--subject-branch-permits')
          .map((n, el) => {
            const parts = $(el).children();
            const permitNumber = this._cleanUp(parts.eq(0).find('div.field__item').text());
            const start = this._cleanUp(parts.eq(1).find('div.field__item').text());

            return { permitNumber, start };
          })
          .get();

        result.branches.push(branch);
      });

      return result;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Scrapes the entity page that is currently loaded, writes the result to
   * `<path>/<filename>.json`, then either navigates to the next indexed link
   * or emits `serviceDone` when every link has been visited.
   *
   * @param {Object} serviceObject state object of the active service category
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {
    const id = serviceObject.links[serviceObject.step].name;

    logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);

    await this._randomWait(this.page, 3, 5);

    const entity = removeAccents.remove(id.trim());
    const filename = this._makeFileName(entity);
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    await this._randomWait(this.page, 3, 5);

    try {
      await this.page.waitForSelector('h1.page-title');
    } catch (e) {
      // NOTE(review): nothing in this method advances the run when the title
      // never appears — confirm that something else restarts it.
      logger.error('processEntityDetails', e);

      return;
    }

    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();

    const details = await this.extractEntityDetails(body);
    const licenses = await this.extractEntityLicense(body);
    const crossBorder = await this.extractEntityCrossBorder(body);
    const services = await this.extractEntityServices(body);
    const branches = await this.extractEntityBranches(body);

    await jsonfile.writeFile(`${filePath}.json`, { details, licenses, crossBorder, services, branches });
    await this._randomWait(this.page, 3, 5);

    serviceObject.links[serviceObject.step].filename = `${filename}.json`;
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items)
      await gotoOrRecover(this, serviceObject.links[serviceObject.step].href);
    else
      this.emit('serviceDone');
  }

  /**
   * Harvests the links from the index page that is currently loaded, appends
   * them to `serviceObject.links`, then either clicks through to the next
   * results page (emitting `pageChanged`) or emits `indexdone`.
   *
   * @param {Object} serviceObject state object of the active service category
   * @returns {Promise<void>}
   */
  async processIndex(serviceObject) {
    let html = '';

    logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
    await this._randomWait(this.page, 3, 5);

    try {
      const listElm = await this.page.waitForSelector('div.view-content', { 'visible':true, 'timeout':7500 });

      html = await this.page.evaluate(el => el.outerHTML, listElm);
    } catch (e) {
      logger.error(e);
      logger.warn('No index list');
    }

    const indexList = await this.extractIndexItems(html);

    logger.debug('serviceObject.indexStep', serviceObject.indexStep);

    // Tag only the freshly harvested links; links gathered from earlier index
    // pages keep the step they were found on.
    for (const item of indexList) item['meta'] = serviceObject.indexStep;

    serviceObject.links = serviceObject.links.concat(indexList);

    const filename = this.modeNames[this.mode];

    await this._randomWait(this.page, 5, 7);

    const subStep = (serviceObject.pageCount > 0) ? `-${serviceObject.pageCount}` : '';

    // Finish the screenshot before paging on so it cannot race the navigation;
    // a failed screenshot must not abort the index run.
    try {
      await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}${subStep}`, null);
    } catch (err) {
      logger.error('processIndex screenshot failed', err);
    }

    try {
      const nextLink = await this.page.waitForSelector('li.next-nav > a.button.next', { 'visible':true, 'timeout':7500 });

      logger.debug('Next page..');
      await nextLink.click({ 'delay':Scraper.notARobot() });
      await this._randomWait(this.page, 5, 7);
      serviceObject.pageCount++;
      this.emit('pageChanged');
    } catch (err) {
      // No usable "next" button (or paging failed): this index is complete.
      serviceObject.pageCount = 0;
      this.emit('indexdone');
    }
  }

  /**
   * Processes the current index page when the index list is present;
   * otherwise (or when processing throws) emits `indexdone`.
   *
   * @param {Object} serviceObject state object of the active service category
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    try {
      await this.page.waitForSelector('div.view-content', { 'visible':true, 'timeout':7500 });
      await this.processIndex(serviceObject);
    } catch (e) {
      logger.debug(e);
      logger.warn('No index list');
      this.emit('indexdone');
    }
  }

  /**
   * Runs entity scraping for the service category selected by `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    const service = activeService(this);

    if (service) await this.processEntityDetails(service);
  }

  /**
   * Runs index building for the service category selected by `this.mode`.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    const service = activeService(this);

    if (service) await this.buildIndex(service);
  }

  /**
   * Entry point for every loaded page: dispatches to indexing or scraping
   * depending on `this.workMode`, or requests recovery when the browser ended
   * up on the site root.
   *
   * @returns {Promise<void>}
   * @throws {Error} for an unknown work mode when NODE_ENV is set
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());
    const pathname = pageUrl.pathname;

    logger.debug('workMode::', ['Indexing', 'Scraping'][this.workMode]);

    if (pathname === '/') {
      logger.error('Invalid path');
      logger.debug(JSON.stringify(pageUrl));
      logger.warn('processNewPage::emit recover');
      this.emit('recover');

      return;
    }

    switch (this.workMode) {
      case 0:
        await this.indexRedirector();
        break;

      case 1:
        await this.processRedirector();
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          // `pageUrl` is a parsed object; interpolate the href so the message is readable.
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }
        else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl.href);
        }
        break;
    }
  }

  /**
   * Navigates back to the last URL recorded in `this.lastUrl`.
   *
   * @returns {Promise<void>} rejects when the navigation fails
   */
  async restart() {
    logger.info(`Restarting ${this.modeTitles[this.mode]}`);
    await this._goto(this.lastUrl);
  }

  /**
   * Tears down the current page (and the browser, if it crashed), creates a
   * fresh page and schedules a restart after a fixed delay.
   *
   * @returns {Promise<void>}
   * @private
   */
  async __recover() {
    logger.warn('*** RECONNECTING PAGE ***');
    logger.info('BrowserCrashed:', this.browserCrashed);

    await this._forcePageClose();

    if (this.browserCrashed) await this._initBrowser();

    await this._createBrowserPage();

    // Same throttled, error-logging handler that start() installs.
    this.page.on('domcontentloaded', makePageHandler(this));

    setTimeout(async () => {
      logger.warn('Attempting recovery..');

      try {
        await this.restart();
      } catch (err) {
        logger.error('Recovery restart failed', err);
      }
    }, RECOVERY_RESTART_DELAY_MS);
  }

  /**
   * Wires up the internal event flow: page changes, recovery, and the
   * index/scrape hand-over for each of the three service categories.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    // Shared by the three `*indexdone` handlers: move to the next index URL,
    // or switch to scraping once every index URL has been crawled.
    const onIndexDone = async (serviceObject, dumpState = false) => {
      serviceObject.items = serviceObject.links.length;
      logger.info(`${serviceObject.items} items indexed`);
      serviceObject.indexStep++;

      if (serviceObject.indexStep < serviceObject.urls.length) {
        await gotoOrRecover(this, serviceObject.urls[serviceObject.indexStep]);

        return;
      }

      this.workMode = 1;

      if (dumpState) logger.debug(JSON.stringify(serviceObject));

      const firstLink = serviceObject.links[serviceObject.step];

      if (!firstLink) {
        // Nothing left to scrape: finish the service the same way
        // processEntityDetails does once step reaches items.
        logger.warn('No links to scrape for this service');
        this.emit('serviceDone');

        return;
      }

      await gotoOrRecover(this, firstLink.href);
    };

    // Shared by the e-money and credit completion handlers: mark the service
    // as done, persist its links and full state, and advance the mode.
    const persistService = (key) => {
      const service = this[key];

      service.done = true;
      jsonfile.writeFileSync(`${this.path}/${key}.json`, { 'links':service.links });
      jsonfile.writeFileSync(`${this.debugPath}/${key}.json`, service);
      this.mode++;
      this.inProgress = false;
    };

    this.on('pageChanged', makePageHandler(this));

    // clear out stock recover handler
    this.removeAllListeners('recover');

    this.on('recover', async () => {
      logger.info('onRecover');
      await this.recover();
    });

    this.on('entityComplete', () => {
      this.handleEntityComplete();
    });

    this.on('serviceDone', async () => {
      const eventName = SERVICE_DONE_EVENTS[this.mode];

      if (eventName) this.emit(eventName);
    });

    this.on('psindexdone', async () => {
      await onIndexDone(this.paymentServices, true);
    });

    this.on('emindexdone', async () => {
      await onIndexDone(this.emoneyServices);
    });

    this.on('ciindexdone', async () => {
      await onIndexDone(this.creditServices);
    });

    this.on('indexdone', async () => {
      const eventName = INDEX_DONE_EVENTS[this.mode];

      if (eventName) this.emit(eventName);
    });

    this.on('paymentServicesDone', async () => {
      this.workMode = 0;

      try {
        await super._paymentServicesDone();
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('emoneyServicesDone', async () => {
      logger.warn('emoneyServicesDone');
      this.workMode = 0;

      try {
        persistService('emoneyServices');
        await this._goto(this.creditServices.urls[0]);
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('creditServicesDone', async () => {
      logger.warn('creditServicesDone');
      this.workMode = 0;

      try {
        persistService('creditServices');
        this.emit('done');
      } catch (e) {
        logger.error(e);
      }
    });
  }

  /**
   * Resets all crawl state, opens the browser page, attaches the event
   * handlers and navigates to the first payment-services index page.
   *
   * @returns {Promise<void>}
   * @throws {Error} wrapping any failure during start-up (original error in `cause`)
   */
  async start() {
    super._start();

    try {
      this.mode = 0;
      this.workMode = 0;
      this.rootURI = 'https://www.fi.ee';

      this.paymentServices = createServiceState([
        'https://www.fi.ee/en/payment-services/payment-institutions/estonian-payment-institutions',
        'https://www.fi.ee/en/payment-services/payment-services/payment-institutions/estonian-payment-institutions-exemption',
        'https://www.fi.ee/en/payment-services/payment-institutions/payment-services/branches-foreign-payment-institutions',
        'https://www.fi.ee/en/payment-services/payment-services/payment-institutions/payment-agents',
        'https://www.fi.ee/en/payment-services/payment-institutions/payment-services/providers-cross-border-payment-sevices',
        'https://www.fi.ee/en/payment-services/payment-institutions/payment-agents-providers-cross-border-payment-services'
      ]);

      this.emoneyServices = createServiceState([
        'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/estonian-e-money-institutions',
        'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/estonian-e-money-institutions-exemption',
        'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/distributors-e-money',
        'https://www.fi.ee/en/payment-services/e-money-institutions/providers-cross-border-e-money-services',
        'https://www.fi.ee/en/distributors-providers-cross-border-e-money-services',
        'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/branches-foreign-e-money-institutions'
      ]);

      this.creditServices = createServiceState([
        'https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/licensed-credit-institutions-estonia',
        'https://www.fi.ee/en/banking-and-credit/credit-institutions/affiliated-branches-foreign-credit-institutions',
        'https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/representative-offices-foreign-credit-institutions',
        'https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/providers-cross-border-banking-services'
      ], { 'searchDone': false, 'started': false });

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/EE/FI`));

      // await this._doNonRepudiation();

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', makePageHandler(this));

      // Attach the handlers only once.
      // NOTE(review): presumably the two pre-existing event names are 'done'
      // (registered in the constructor) and the stock 'recover' handler —
      // confirm against the Scraper base class.
      if (this.eventNames().length === 2) await this.attachEvents();

      // await this.page.setViewport({ 'width': 1200, 'height': 800 });

      await this._goto(this.paymentServices.urls[0], { 'waitUntil':'networkidle0' });
      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Same message as before, but keep the original error (and its stack)
      // reachable for diagnostics.
      const wrapped = new Error(e);

      wrapped.cause = e;
      throw wrapped;
    }
  }

  /**
   * Target of the throttled `run` created in the constructor.
   *
   * @returns {Promise<void>}
   * @private
   */
  async __run() {
    await this.start();
  }

}

module.exports = EEScrape;