From a5109efabe347f82d34074ee88068602ed45727d Mon Sep 17 00:00:00 2001 From: Martin Donnelly Date: Sun, 12 May 2019 18:33:09 +0100 Subject: [PATCH] 2019-05-12 --- at.js | 23 +++ be.js | 23 +++ ecosystem.config.js | 13 +- fi.js | 25 +++ helpers/scraper.js | 48 ++++- ncas/at.js | 318 ++++++++++++++++++++++++++++++++ ncas/be.js | 433 ++++++++++++++++++++++++++++++++++++++++++++ ncas/bg.js | 93 +++++++--- ncas/fi.js | 327 +++++++++++++++++++++++++++++++++ ncas/fr.js | 18 +- ncas/gi.js | 2 +- ncas/lt.js | 4 +- ncas/lu.js | 4 +- ncas/lv.js | 16 +- ncas/nl.js | 34 ++-- ncas/pl.js | 6 +- ncas/pt.js | 4 +- ncas/sk.js | 15 +- package-lock.json | 41 +++-- 19 files changed, 1354 insertions(+), 93 deletions(-) create mode 100644 at.js create mode 100644 be.js create mode 100644 fi.js create mode 100644 ncas/at.js create mode 100644 ncas/be.js create mode 100644 ncas/fi.js diff --git a/at.js b/at.js new file mode 100644 index 0000000..081f2f3 --- /dev/null +++ b/at.js @@ -0,0 +1,23 @@ +#!/usr/bin/env node +const CronJob = require('cron').CronJob; + +// load env variables from file +require('dotenv').config(); + +const Austria = require('./ncas/at'); + +async function run() { + const atScraper = new Austria(); + + if (typeof(process.env.AT_CRON) === 'string' ) + new CronJob(process.env.AT_CRON, async function() { + await atScraper.run(); + }, null, true); + + if (process.env.SCRAPE_START === atScraper.id) + await atScraper.run(); + + console.log('AT Launched'); +} + +run(); diff --git a/be.js b/be.js new file mode 100644 index 0000000..c16e144 --- /dev/null +++ b/be.js @@ -0,0 +1,23 @@ +#!/usr/bin/env node +const CronJob = require('cron').CronJob; + +// load env variables from file +require('dotenv').config(); + +const Belgium = require('./ncas/be'); + +async function run() { + const beScraper = new Belgium(); + + if (typeof(process.env.BE_CRON) === 'string' ) + new CronJob(process.env.BE_CRON, async function() { + await beScraper.run(); + }, null, true); + + if (process.env.SCRAPE_START === beScraper.id) + await beScraper.run(); + + console.log('BE launched'); +} + +run(); diff --git a/ecosystem.config.js b/ecosystem.config.js index a818422..3a8b993 100644 --- a/ecosystem.config.js +++ b/ecosystem.config.js @@ -13,27 +13,30 @@ function buildApps() { const apps = []; const list = [ { 'cron':'IE_CRON', 'start':'IE', 'name':'IE', 'script':'ie.js', 'proxy': 'uk', 'crontime': '0 0 * * *' }, // 00:04:40 - { 'cron':'LU_CRON', 'start':'LU', 'name':'LU', 'script':'lu.js', 'proxy': 'uk', 'crontime': '10 0 * * *' }, // 01:12:53 + { 'cron':'LU_CRON', 'start':'LU', 'name':'LU', 'script':'lu.js', 'proxy': 'uk', 'crontime': '10 0 * * *' }, // "01:09:45.187" { 'cron':'IT_CRON', 'start':'IT', 'name':'IT', 'script':'it.js', 'proxy': 'uk', 'crontime': '10 1 * * *' }, // 04:51:37 - uk free at 6:30 + { 'cron':'CZ_CRON', 'start':'CZ', 'name':'CZ', 'script':'cz.js', 'proxy': 'uk', 'crontime': '20 6 * * *' }, // "00:24:01.696" + { 'cron':'PT_CRON', 'start':'PT', 'name':'PT', 'script':'pt.js', 'proxy': 'uk', 'crontime': '0 7 * * *' }, // "00:53:02.432" { 'cron':'CY_CRON', 'start':'CY', 'name':'CY', 'script':'cy.js', 'proxy': 'fr', 'crontime': '0 0 * * *' }, // 00:01:03 { 'cron':'SE_CRON', 'start':'SE', 'name':'SE', 'script':'se.js', 'proxy': 'fr', 'crontime': '5 0 * * *' }, // 00:43:45 { 'cron':'FR_CRON', 'start':'FR', 'name':'FR', 'script':'fr.js', 'proxy': 'fr', 'crontime': '0 1 * * *' }, // 01:22:29 - { 'cron':'LT_CRON', 'start':'LT', 'name':'LT', 'script':'lt.js', 'proxy': 'fr', 'crontime': '30 2 * * *' }, // 00:53:26 + { 'cron':'LT_CRON', 'start':'LT', 'name':'LT', 'script':'lt.js', 'proxy': 'fr', 'crontime': '30 2 * * *' }, // "00:54:28.134" { 'cron':'SK_CRON', 'start':'SK', 'name':'SK', 'script':'sk.js', 'proxy': 'fr', 'crontime': '30 3 * * *' }, // 00:24:03 - fr free at 4:00 { 'cron':'DE_CRON', 'start':'DE', 'name':'DE', 'script':'de.js', 'proxy': 'de', 'crontime': '0 0 * * *' }, // 03:55:38 - de free at 4:00 { 'cron':'NL_CRON', 'start':'NL', 'name':'NL', 'script':'nl.js', 'proxy': 'nl', 'crontime': '0 0 * * *' }, // 07:23:19 - nl free at 7:30 { 'cron':'PL_CRON', 'start':'PL', 'name':'PL', 'script':'pl.js', 'proxy': 'ch', 'crontime': '0 0 * * *' }, // 17:59:18 - ch free at 18:00 - { 'cron':'CZ_CRON', 'start':'CZ', 'name':'CZ', 'script':'cz.js', 'proxy': 'uk' }, { 'cron':'DK_CRON', 'start':'DK', 'name':'DK', 'script':'dk.js', 'proxy': 'uk' }, { 'cron':'ES_CRON', 'start':'ES', 'name':'ES', 'script':'es.js', 'proxy': 'uk' }, { 'cron':'GI_CRON', 'start':'GI', 'name':'GI', 'script':'gi.js', 'proxy': 'uk' }, { 'cron':'GR_CRON', 'start':'GR', 'name':'GR', 'script':'gr.js', 'proxy': 'uk' }, { 'cron':'MT_CRON', 'start':'MT', 'name':'MT', 'script':'mt.js', 'proxy': 'uk' }, - { 'cron':'PT_CRON', 'start':'PT', 'name':'PT', 'script':'pt.js', 'proxy': 'uk' }, { 'cron':'LV_CRON', 'start':'LV', 'name':'LV', 'script':'lv.js', 'proxy': 'uk' }, { 'cron':'NO_CRON', 'start':'NO', 'name':'NO', 'script':'no.js', 'proxy': 'uk' }, { 'cron':'EE_CRON', 'start':'EE', 'name':'EE', 'script':'ee.js', 'proxy': 'uk' }, - { 'cron':'BG_CRON', 'start':'BG', 'name':'BG', 'script':'bg.js', 'proxy': 'uk' } + { 'cron':'BG_CRON', 'start':'BG', 'name':'BG', 'script':'bg.js', 'proxy': 'uk' }, + { 'cron':'AT_CRON', 'start':'AT', 'name':'AT', 'script':'at.js', 'proxy': 'uk' }, + { 'cron':'FI_CRON', 'start':'FI', 'name':'FI', 'script':'fi.js', 'proxy': 'uk' }, + { 'cron':'BE_CRON', 'start':'BE', 'name':'BE', 'script':'be.js', 'proxy': 'uk' } ]; apps.push({ diff --git a/fi.js b/fi.js new file mode 100644 index 0000000..da0381c --- /dev/null +++ b/fi.js @@ -0,0 +1,25 @@ +#!/usr/bin/env node +const CronJob = require('cron').CronJob; + +// load env variables from file +require('dotenv').config(); + +const argv = require('yargs').argv; + +const Finland = require('./ncas/fi'); + +async function run() { + const fiScraper = new Finland(); + + if (typeof(process.env.FI_CRON) === 'string' ) + new CronJob(process.env.FI_CRON, async () => { + await fiScraper.run(); + }, null, true); + + if (process.env.SCRAPE_START === fiScraper.id) + await fiScraper.run(); + + console.log('FI Launched'); +} + +run(); diff --git a/helpers/scraper.js b/helpers/scraper.js index b40ad17..a7e8eed 100644 --- a/helpers/scraper.js +++ b/helpers/scraper.js @@ -231,7 +231,8 @@ class Scraper extends EventEmitter { '--disable-gpu', '--window-size=1920x1080', '--hide-scrollbars', - '--disable-default-apps' + '--disable-default-apps', + '--remote-debugging-port=9222' ] }).catch((err) => { logger.error('Puppeteer failed to launch'); @@ -416,14 +417,25 @@ class Scraper extends EventEmitter { * @private */ async _makeScreenshotV2(page, destPath, waitFor = null) { - if (waitFor) - await page.waitFor(waitFor); + try{ + if (waitFor) + await page.waitFor(waitFor); - logger.debug('Snapshot', `${destPath}.png`); - await page.setViewport({ 'width': 1200, 'height': 800 }); - await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => { - logger.error('Screenshot', err); - }); + if(!this.page) { + logger.warn('_makeScreenshotV2: No Page -- Not taking screenshot'); + + return; + } + + logger.debug('Snapshot', `${destPath}.png`); + await page.setViewport({ 'width': 1200, 'height': 800 }); + await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => { + logger.error('Screenshot', err); + }); + } + catch( err) { + logger.error('_makeScreenshotV2', err); + } } /** @@ -1500,6 +1512,17 @@ class Scraper extends EventEmitter { await jsonfile.writeFileSync(filePath, json); } + _checkFileExistsSync(filePath) { + try { + fs.accessSync(filePath, fs.F_OK); + + return true; + } + catch (err) { + return false; + } + } + /** * * @param page @@ -1681,6 +1704,15 @@ class Scraper extends EventEmitter { await this._goto(rURL); } + async saveFile(filename, data) { + try{ + fs.writeFileSync(filename, data); + } + catch( err) { + logger.error(err); + } + } + } module.exports = Scraper; diff --git a/ncas/at.js b/ncas/at.js new file mode 100644 index 0000000..e75eb76 --- /dev/null +++ b/ncas/at.js @@ -0,0 +1,318 @@ +const cheerio = require('cheerio'); +const logger = require('log4js').getLogger('AT'); +const path = require('path'); +const jsonfile = require('jsonfile'); +const removeAccents = require('remove-accents-diacritics'); +const url = require('url'); + +const Scraper = require('../helpers/scraper'); + +logger.level = process.env.LOGGER_LEVEL || 'warn'; + +class ATScrape extends Scraper { + + constructor() { + super(); + this.setID('AT'); + + this.on('done', () => { + this._done(); + }); + + this.run = this._throttle(async () => { + await this.__run(); + }, 5000); + + if (process.env.NODE_ENV === 'production') + this._checkLock().then((l) => { + if(l) + this.run(); + }); + } + + getIndexUrl(category) { + return `https://www.fma.gv.at/en/search-company-database/?cname=&place=&bic=&category=${category}&per_page=10&submitted=1&to=1`; + } + + getHtmlNotInMatchingElements(html, selector) { + const $ = cheerio.load(html, { 'decodeEntities': false }); + + $(selector).remove(); + + // cheerio adds ... around the result, so we return the contents of + return $('body').html(); + } + + extractFieldFromLiWithStrongHeader($, headerText, extractTextFromHtml = false) { + const fieldHeader = $(`div.company-details li > strong:contains("${headerText}")`); + if (fieldHeader.length < 1) + return ''; + const fieldLi = fieldHeader.parent().html(); + const fieldValue = this.getHtmlNotInMatchingElements(fieldLi, 'strong'); + + if (extractTextFromHtml) + return $(fieldValue).text().trim(); + else + return fieldValue.trim(); + } + + extractSingleFields($, details) { + details['name'] = this._cleanUp($('h3 > a').text()); + + const addressRaw = this.extractFieldFromLiWithStrongHeader($, 'Address:'); + let address = addressRaw.replace(/\s*\|\s*/g, ', '); // replace pipes with commas + address = address.replace(/\s+/g, ' '); // replace any non-standard spaces with simple spaces + address = address.trim(); + details['address'] = address; + + details['phone'] = this.extractFieldFromLiWithStrongHeader($, 'Phone:'); + details['email'] = this.extractFieldFromLiWithStrongHeader($, 'Email:', true); + details['website'] = this.extractFieldFromLiWithStrongHeader($, 'Web:', true); + details['bankIdentificationNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Bank identification number:'); + details['commercialRegisterNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Commercial register number:'); + } + + extractMultiples($, details) { + details['categories'] = []; + const categoriesLi = $('div.company-details > ul > li > strong:contains("Category:")').parent().html(); + const categoriesRaw = this.getHtmlNotInMatchingElements(categoriesLi, 'strong'); + const categories = categoriesRaw.split(/
/); + for (let i = 0; i < categories.length; i++) { + let cat = categories[i]; + cat = this._cleanUp(cat); + if (cat !== '') + details['categories'].push(cat); + } + } + + extractPermissions($, details) { + details['permissions'] = []; + const permissionsDiv = $('div.modal-body'); + $(permissionsDiv).find('h4').each((i, item) => { + const code = this._cleanUp($(item).text()); + const description = this._cleanUp($(item).next().text()); + details['permissions'].push({ 'code': code, 'description': description }); + }); + } + + extractEntityDetails(html) { + const details = {}; + + const $ = cheerio.load(html, { 'decodeEntities': false }); + + this.extractSingleFields($, details); + + this.extractMultiples($, details); + + this.extractPermissions($, details); + + return details; + } + + currentPageAsString() { + return `${this.modeNames[this.mode]} url ${this.getCurrentMode().urlStep}, page ${this.getCurrentMode().paginationStep}`; + } + + async expandAndScreenshot() { + logger.info(`Expanding content on ${this.currentPageAsString()}`); + + await this.page.addStyleTag({ + 'content': + ` + div.company-details { /* make space for the content */ + position: static; + width: auto; + height: auto; + } + div.document-description { /* make content visible */ + display: block; + position: static; + opacity: 1; + } + div.modal-dialog { /* move the content back down (it's transformed up a bit by default) */ + transform: none !important; + } + div.modal-content { /* remove the drop shadow (might help render faster?) */ + box-shadow: none; + -webkit-box-shadow: none; + } + ` + }); + + logger.info(`Taking screenshot of ${this.currentPageAsString()}`); + const filename = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}_page_${this.getCurrentMode().paginationStep}`; + const path = `${this.path}/${filename}`; + await this._makeScreenshotV2(this.page, path); + } + + serviceDone() { + logger.info(`${this.modeNames[this.mode]} done. Total of ${this.getCurrentMode().links.length} ${this.modeNames[this.mode]} scraped.`); + try{ + jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links': this.getCurrentMode().links }); + jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, this.getCurrentMode()); + } + catch (e) { + logger.error(e); + } + } + + async entityResultsPageProcessor() { + const body = await this.page.content(); + const $ = cheerio.load(body, { 'decodeEntities': false }); + + await this.expandAndScreenshot(); + + const entities = $('div.company-details-wrap'); + + entities.each(async (i, item) => { + const noWhiteSpace = /\W/g; + + const details = this.extractEntityDetails($(item).html()); + const id = this._makeFieldName(details.name); + const entity = removeAccents.remove(id.trim()); + const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join(''); + const filePath = `${this.path}/${filename}`.substring(0, 240); + jsonfile.writeFile(`${filePath}.json`, { details }); + + this.getCurrentMode().links.push({ + 'id': id, + 'href': await this.page.url(), + 'filename': filename + }); + }); + + logger.info(`${entities.length} ${this.modeNames[this.mode]} entities scraped.`); + + const nextLink = await this.page.$('div.paging li.next:not(.disabled) a'); + if (nextLink !== null) { + logger.info('Clicking through to next page.'); + this.getCurrentMode().paginationStep++; + const nextHref = await this.page.evaluate(link => { + return link.href; + }, nextLink); + this._goto(nextHref); + } + else { + this.serviceDone(); + this.getCurrentMode().paginationStep = 1; + const nextUrl = this.getNextUrl(); + if (nextUrl !== null) + this._goto(nextUrl); + else + this.emit('done'); + } + } + + async processNewPage() { + // give the page a few seconds to settle + await this._randomWait(this.page, 3, 5); + + const pageUrl = url.parse(await this.page.url()); + + if (pageUrl.pathname === '/en/search-company-database/') + await this.entityResultsPageProcessor(); + else + logger.error(`Page url not recognised: ${pageUrl.href}`); + + } + + getCurrentMode() { + switch (this.mode) { + + case 0: + return this.paymentServices; + + case 1: + return this.emoneyServices; + + case 2: + return this.creditServices; + + } + } + + getNextUrl() { + if (this.getCurrentMode().urlStep < this.getCurrentMode().urls.length - 1) + this.getCurrentMode().urlStep++; + else { + if (this.mode < this.modeNames.length - 1) + this.mode++; + else + return null; + } + + return this.getCurrentMode().urls[this.getCurrentMode().urlStep]; + } + + /** + * + * @returns {Promise} + */ + async start() { + super._start(); + + await this._initBrowser(); + await this._createBrowserPage(); + + this.page.on('domcontentloaded', this._throttle(async () => { + this.processNewPage().catch((err) => { + logger.error('processNewPage fail', err); + }); + }, 2500)); + + this.setPath(path.resolve(`${__dirname }/../artefacts/AT/FMA`)); + + this.mode = 0; + + this.paymentServices = { + 'items': 0, + 'links': [], + 'step': 0, + 'urlStep': 0, + 'paginationStep': 1, + 'urls': [ + this.getIndexUrl('1977'), // Payment institutions - Payment Institutions licensed in Austria + this.getIndexUrl('2798'), // Payment Institutions - Account information service provider (AISP) + this.getIndexUrl('2799') // Payment Institutions - Payment initiation service provider (PISP) + ] + }; + + this.emoneyServices = { + 'items': 0, + 'links': [], + 'step': 0, + 'urlStep': 0, + 'paginationStep': 1, + 'urls': [this.getIndexUrl('2193')] // E-Money-Institutions - E-Money-Institutions licensed in Austria + }; + + this.creditServices = { + 'items': 0, + 'links': [], + 'step': 0, + 'urlStep': 0, + 'paginationStep': 1, + 'urls': [this.getIndexUrl('165')] // Banks - Banks licensed in Austria + }; + + this.startPage = this.paymentServices.urls[0]; + + await this._doNonRepudiation().catch((err) => { + logger.warn(err); + }); + + try { + await this.page.setViewport({ 'width': 1200, 'height': 800 }); + await this._goto(this.startPage); + } + catch(e) { + throw new Error(e); + } + } + + async __run() { + await this.start(); + } +} + +module.exports = ATScrape; diff --git a/ncas/be.js b/ncas/be.js new file mode 100644 index 0000000..243e369 --- /dev/null +++ b/ncas/be.js @@ -0,0 +1,433 @@ +const cheerio = require('cheerio'); +const jsonfile = require('jsonfile'); +const logger = require('log4js').getLogger('BE'); +const path = require('path'); +const removeAccents = require('remove-accents-diacritics'); +const url = require('url'); + +const Scraper = require('../helpers/scraper'); + +logger.level = process.env.LOGGER_LEVEL || 'warn'; + +class BEScrape extends Scraper { + + constructor() { + super(); + this.setID('BE'); + + this.on('done', () => { + this._done(); + }); + + this.run = this._throttle(async () => { + await this.__run(); + }, 5000); + + if (process.env.NODE_ENV === 'production') + this._checkLock().then((l) => { + if(l) + this.run(); + }); + } + + async downloadFile(docLink) { + logger.info(`Downloading ${docLink}`); + + await this.page.goto(docLink).catch((err) => { + if (err.message.indexOf('net::ERR_ABORTED') !== -1) { + logger.info(`Ignoring expected error upon file download: ${err.message}`); + } + else + throw err; + }); + + const waitMs = 5000; + const parsedUrl = url.parse(docLink); + const fileName = decodeURI(path.basename(parsedUrl.pathname.toLowerCase())); + const downloadFilePath = `${this.path}/${fileName}`; + + let tries; + + for (tries = 1; tries <= 10; tries++) { + logger.info('Waiting...'); + await this.page.waitFor(waitMs); + if (this._checkFileExistsSync(downloadFilePath)) { + logger.info(`${docLink} successfully downloaded.`); + return true; + } + else { + logger.info(`Still waiting for ${docLink} to download after ${tries * waitMs / 1000} seconds...`); + } + } + + // if we reach this point, download has failed + logger.error(`${docLink} failed to download after ${tries * waitMs / 1000} seconds....`); + return false; + } + + normaliseDocLink(docLink) { + if (!docLink.startsWith('http://www.nbb.be/')) { + // attempt to normalise document link + if (docLink.startsWith('file:///L:/PRXNWEBP/')) { + return docLink.replace('file:///L:/PRXNWEBP/', 'http://www.nbb.be/'); + } + else { + logger.warn(`Unable to normalise document link, unknown format, will attempt download as is: ${docLink}`); + } + } + + return docLink; + } + + convertMulitpleSpaceToCommaSpace(value) { + return value.replace(/\s{2,}/g, ', '); + } + + extractMainDetails(detailsContainer) { + const $ = require('cheerio'); + + const details = {}; + + details['name'] = $(detailsContainer).children('strong').text().trim(); + details['companyType'] = $(detailsContainer).children('em').text().trim(); + + const lines = $(detailsContainer).children(); + + details['addressOne'] = this.convertMulitpleSpaceToCommaSpace(lines[3].next.data.trim()); + details['addressTwo'] = this.convertMulitpleSpaceToCommaSpace(lines[4].next.data.trim()); + + // Occasionally line 5 will contain text. If this is the case, line 5 contains + // "addressThree", and every other line moves along by one. + let offset = 0; + if (lines[5].next.data.trim() !== '') { + offset = 1; + details['addressThree'] = this.convertMulitpleSpaceToCommaSpace(lines[5].next.data.trim()); + } + else { + details['addressThree'] = null; + } + + details['uniqueId'] = lines[6 + offset].next.data.split(':').pop().trim(); + + details['dateOfListing'] = (lines[7 + offset] === undefined) ? null : lines[7 + offset].next.data.split(':').pop().trim(); + + const docLink = $(detailsContainer).children('a'); + + if (docLink.length > 0) { + details['docLink'] = docLink.attr('href'); + details['normalisedDocLink'] = this.normaliseDocLink(docLink.attr('href')); + } + else { + details['docLink'] = null; + details['normalisedDocLink'] = null; + } + + return details; + } + + extractAdditionalDetails(tableCells) { + const $ = require('cheerio'); + + const additionalDetails = {}; + + tableCells.toArray().map((td) => { + const thText = $(td).closest('table').find('th').eq($(td).index()).text(); + const fieldName = this._makeFieldName(thText); + additionalDetails[fieldName] = $(td).text().split(' '); // e.g. scrape "1 2 3" as ["1", "2", "3"] + }); + + return additionalDetails; + } + + extractFullDetails(fullDetailsContainer, mode) { + const $ = require('cheerio'); + + switch (mode) { + + case 0: + case 1: + // in modes 0 and 1 the main details are in the first td of the parent container + const mainDetails = this.extractMainDetails($(fullDetailsContainer).children('td').eq(0)); + const additionalDetails = this.extractAdditionalDetails($(fullDetailsContainer).children('td').slice(1)); + return {...mainDetails, ...additionalDetails}; + + case 2: + // in mode 2 (credit institutions) the main details are in the root. + return this.extractMainDetails(fullDetailsContainer); + // no additional details for credit institutions + } + + } + + extractEntitiesFromContainer(entitiesContainer, mode) { + const $ = require('cheerio'); + + const entities = []; + + switch ($(entitiesContainer).prop("tagName")) { + case 'TBODY': + $(entitiesContainer).children('tr').each((index, item) => { + entities.push(this.extractFullDetails(item, mode)); + }); + break; + + case 'UL': + $(entitiesContainer).children('li').each((index, item) => { + entities.push(this.extractFullDetails(item, mode)); + }); + break; + } + + return entities; + } + + extractIndex(indexContainer, mode) { + const $ = require('cheerio'); + + const title = $(indexContainer).find('div.field-name-field-page-intro > p').text().trim(); + + const description = $(indexContainer).find('div.description').html(); + + const legend = $(indexContainer).find('div.legend').html(); // not entirely necessary but good to keep a record + + const entitiesContainer = $(indexContainer).find('ul.List1 tbody, ul.List1 ul.List2 > li > ul').eq(0); + + let entities; + + if (entitiesContainer.length > 0) { + entities = this.extractEntitiesFromContainer(entitiesContainer, mode); + } + else { + entities = []; + } + + const changes = $(indexContainer).find('div.changes-12').html(); // not entirely necessary but good to keep a record + + return { title, description, legend, entities, changes }; + } + + getIdByEntityName(name) { + const noWhiteSpace = /\W/g; + + let id = this._makeFieldName(name).trim(); + id = removeAccents.remove(id); + id = id.replace(noWhiteSpace, '_'); + + return id; + } + + async processIndex() { + const pageUrl = await this.page.url(); + + logger.info(`Processing ${this.modeNames[this.mode]} index url number ${this.getCurrentMode().urlStep}: ${pageUrl}`); + + await this.allowCookies(); + + const body = await this.page.content(); + + const $ = cheerio.load(body, { 'decodeEntities': false, 'encoding': 'utf-8' }); + + logger.info('Extracting index...') + const index = this.extractIndex($('div#PrudentialList'), this.mode); + + logger.info(`Extracted ${index.entities.length} ${this.modeNames[this.mode]}.`); + + logger.info(`Downloading ${this.modeNames[this.mode]} documents.`); + // download all documents from this index page + for (const entity of index.entities) { + if (entity.normalisedDocLink !== null) { + const didDownload = await this.downloadFile(entity.normalisedDocLink); + + if (didDownload) { + // rename the file to match the json file name format + const parsedUrl = url.parse(entity.normalisedDocLink); + const originalFileName = decodeURI(path.basename(parsedUrl.pathname.toLowerCase())); + const originalFilePath = `${this.path}/${originalFileName}`; + const newFileName = [this.modePrefix[this.mode], this.getIdByEntityName(entity.name), path.extname(originalFileName)].join(''); + const newFilePath = `${this.path}/${newFileName}`; + await this._renameFile(originalFilePath, newFilePath); + // save new file name to entity object so it can be found later. + entity['docLocalFilename'] = newFileName; + } + else { + entity['docLocalFilename'] = null; + } + } + } + + logger.info(`Saving metadata for ${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`); + const description = index['description']; + const legend = index['legend']; + const changes = index['changes']; + const metadata = { description, legend, changes }; + const metadataFileName = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}_metadata.json`; + const metadataFilePath = `${this.path}/${metadataFileName}` + jsonfile.writeFile(metadataFilePath, { metadata }); + + for (const entity of index.entities) { + const id = this.getIdByEntityName(entity.name); + + // create json file for each entity + const filename = [this.modePrefix[this.mode], id].join(''); + const filePath = `${this.path}/${filename}`.substring(0, 240); + jsonfile.writeFile(`${filePath}.json`, { 'details': entity , metadataFileName}); + + // add entity details to "links" so that index file can be generated later + this.getCurrentMode().links.push({ + 'id': id, + 'href': await this.page.url(), + 'filename': filename + }); + } + + logger.info(`Taking screenshot of: ${pageUrl}`); + const screenshotFilename = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`; + const screenshotPath = `${this.path}/${screenshotFilename}`; + await this._makeScreenshotV2(this.page, screenshotPath); + + const nextUrl = this.getNextUrl(); + + if (nextUrl !== null) + await this._goto(nextUrl); + else + this.emit('done'); + } + + serviceDone() { + try{ + jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links': this.getCurrentMode().links }); + jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, this.getCurrentMode()); + + logger.info(`${this.modeNames[this.mode]} done.`); + logger.info(`Extracted a total of ${this.getCurrentMode().links.length} ${this.modeNames[this.mode]}`); + } + catch (e) { + logger.error(e); + } + } + + getCurrentMode() { + switch (this.mode) { + + case 0: + return this.paymentServices; + + case 1: + return this.emoneyServices; + + case 2: + return this.creditServices; + + } + } + + getNextUrl() { + if (this.getCurrentMode().urlStep < this.getCurrentMode().urls.length - 1) + this.getCurrentMode().urlStep++; + else { + this.serviceDone(); + if (this.mode < this.modeNames.length - 1) + this.mode++; + else + return null; + } + + return this.getCurrentMode().urls[this.getCurrentMode().urlStep]; + } + + async allowCookies() { + const agreeButton = await this.page.$('button.agree-button'); + if (agreeButton !== null) { + logger.info('Agreeing to cookie policy.') + await agreeButton.click(); + await this._randomWait(this.page, 3, 5); + } + } + + async processNewPage() { + // give the page a few seconds to settle + await this._randomWait(this.page, 3, 5); + + // set download behaviour in case this is a new tab after a recovery + // TODO: this could be set by default in the base class for every new tab in every scraper + await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path }); + + const pageUrl = await this.page.url(); + + if (pageUrl.includes('supervision-financiere/controle-prudentiel/domaines-de-controle')) + await this.processIndex(); + else if (process.env.NODE_ENV) { + await this._uploadError(); + throw new Error(`Unknown page: ${pageUrl}`); + } + else { + logger.warn('processNewPage Fell through'); + logger.warn('currentPage.location', pageUrl); + } + } + + async attachEvents() { + + } + + async start() { + super._start(); + + this.setPath(path.resolve(`${__dirname}/../artefacts/BE/NBB`)); + + this.mode = 0; + + this.paymentServices = { + 'links': [], + 'urlStep': 0, + 'urls': [ + 'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-15', + 'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-14', + 'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-16' + ] + }; + + this.emoneyServices = { + 'links': [], + 'urlStep': 0, + 'urls': [ + 'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-9', + 'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-17' + ] + }; + + this.creditServices = { + 'links': [], + 'urlStep': 0, + 'urls': [ + 'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-credit/listes-7' + ] + }; + + this.startPage = this.paymentServices.urls[0]; + + await this._doNonRepudiation().catch((err) => { + logger.warn(err); + }); + + await this._initBrowser(); + await this._createBrowserPage(); + + this.page.on('domcontentloaded', this._throttle(async () => { + this.processNewPage().catch((err) => { + logger.error('processNewPage fail', err); + }); + }, 2500)); + + if (this.eventNames().length === 2) + await this.attachEvents(); + + await this._goto(this.startPage); + } + + async __run() { + await this.start(); + } +} + +module.exports = BEScrape; diff --git a/ncas/bg.js b/ncas/bg.js index 796ef6f..95c2dfb 100644 --- a/ncas/bg.js +++ b/ncas/bg.js @@ -1,5 +1,6 @@ const logger = require('log4js').getLogger('BG'); const path = require('path'); +const url = require('url'); const Scraper = require('../helpers/scraper'); @@ -7,7 +8,7 @@ class BGScrape extends Scraper { constructor() { super(); - this.id = 'BG'; + this.setID('BG'); this.on('done', () => { this._done(); @@ -32,6 +33,64 @@ class BGScrape extends Scraper { await this._randomWait(this.page, 3, 5); } + async processPaymentServicesPage() { + await this._randomWait(this.page, 3, 5); + this._makeScreenshotV2(this.page, `${this.path}/ps_em_index`); + + await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path }); + + await this.downloadByHrefFilename('ps_po_register_2_en.xls'); // Payment Institutions + await this.downloadByHrefFilename('ps_po_register_3a_en.xls'); // eMoney Institutions + + // wait until networkidle to ensure the above downloads are complete, then go to next page + await this._goto(this.creditInstitutionsPage, { 'waitUntil':'networkidle0' }); + } + + async processCreditInstitutionsPage() { + await this._randomWait(this.page, 3, 5); + this._makeScreenshotV2(this.page, `${this.path}/ci_index`); + + // TODO: come back and scrape the html page version of this word doc, if we have time + await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc'); // banks and foreign banks' branches operating in Bulgaria + await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls'); // permissions list + + // no more pages to go to at this point, so wait a final 10 seconds to allow files to download + // TODO: investigate whether this could be done with: // page.waitForNavigation({ waitUntil: 'networkidle0' }) + await this.page.waitFor(10000); + + this.emit('done'); + } + + async processNewPage() { + // give the page a few seconds to settle + await this._randomWait(this.page, 3, 5); + + // set download behaviour on every processNewPage in case this is a recovery attempt / new tab + await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path }); + + const pageUrl = url.parse(await this.page.url()); + + if (pageUrl.href === 'chrome-error://chromewebdata/') { + logger.warn('Directed to: chrome-error://chromewebdata/'); + this.emit('recover'); + + return; + } + + if (pageUrl.href.includes('/PSPaymentOversightRegisters')) + await this.processPaymentServicesPage(); + else if (pageUrl.href.includes('/RSCIRegisters')) + await this.processCreditInstitutionsPage(); + else if (process.env.NODE_ENV) { + await this._uploadError(); + throw new Error(`Unknown page: ${pageUrl.href}`); + } + else { + logger.warn('processNewPage Fell through'); + logger.warn('currentPage.location', pageUrl.href); + } + } + async start() { super._start(); @@ -45,34 +104,20 @@ class BGScrape extends Scraper { logger.warn(err); }); - await this._initBrowser(); - this.page = await this.browser.newPage(); + await this._initBrowser(false); + await this._createBrowserPage(); + + this.page.on('domcontentloaded', this._throttle(async () => { + this.processNewPage().catch((err) => { + logger.error('processNewPage fail', err); + }); + }, 2500)); + await this.page.setViewport({ 'width': 1200, 'height': 800 }); // set cookie for English language and load start page await this.page.setCookie({ 'name': 'userLanguage', 'value': 'EN', 'domain': 'www.bnb.bg', 'path': '/' }); await this._goto(this.startPage, { 'waitUntil':'networkidle0' }); - - await this._randomWait(this.page, 3, 5); - this._makeScreenshotV2(this.page, `${this.path}/index1`); - - await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path }); - - await this.downloadByHrefFilename('ps_po_register_2_en.xls'); - await this.downloadByHrefFilename('ps_po_register_3a_en.xls'); - - await this._goto(this.creditInstitutionsPage, { 'waitUntil':'networkidle0' }); - await this._randomWait(this.page, 3, 5); - this._makeScreenshotV2(this.page, `${this.path}/index2`); - - // TODO: come back and scrape the html page version of this word doc, if we have time - await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc'); - await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls'); - - // wait until all downloads finished with 'networkidle0' (currently this is only possible with 'page.goto', so we go back to the start page) - await this._goto(this.startPage, { 'waitUntil': 'networkidle0' }); - - this.emit('done'); } async __run() { diff --git a/ncas/fi.js b/ncas/fi.js new file mode 100644 index 0000000..ad0a3d6 --- /dev/null +++ b/ncas/fi.js @@ -0,0 +1,327 @@ +const Scraper = require('../helpers/scraper'); +const path = require('path'); +const logger = require('log4js').getLogger('FI'); +const url = require('url'); + +logger.level = process.env.LOGGER_LEVEL || 'warn'; + +class FIScrape extends Scraper { + + constructor(checkForLock = true) { + super(); + this.id = 'FI'; + + this.addToBlockFilters(['msecnd.net', 'siteimproveanalytics.com', 'newrelic.com', 'visualstudio.com']); + + this.on('done', () => { + this._done(); + }); + + this.run = this._throttle(async () => { + await this.__run(); + }, 5000); + + if (checkForLock) + this._checkLock().then((l) => { + if(l) + this.run(); + }); + + this.on('error', (err) => { + logger.error('Error catcher!!', err); + }); + } + + /** + * + * @returns {Promise} + */ + async movePageToBottom() { + await this.page.evaluate(() => { + window.scrollBy(0, window.innerHeight); + }); + } + + /** + * + * @returns {Promise} + */ + async renameFile() { + try{ + const filename = this.modeNames[this.step]; + + const sourceFile = 'exported.json'; + + const origFile = `${this.path}/${sourceFile}`; + const newFile = `${this.path}/${filename}.json`; + + await this._renameFile(origFile, newFile); + } + catch( err) { + logger.error(err); + } + } + + /** + * + * @returns {Promise} + */ + async clickReturn() { + await this._randomWait(this.page, 5, 7, 'clickReturn'); + this.step++; + + this.emit('next'); + } + + /** + * + * @returns {Promise} + */ + async clickSearch() { + logger.debug('clickSearch'); + + await this.movePageToBottom(); + + await this._randomWait(this.page, 2, 3, 'Move to bottom'); + + await this.page.waitForSelector('#tree-search-button', { 'visible':true, 'timeout':75000 }).then(async (elm) => { + logger.debug('found'); + await elm.focus(); + this._microWait(this.page, 5); + await elm.click({ 'delay':90 }); + }).catch((e) => { + logger.error('Search button missing', e); + }); + + await this._randomWait(this.page, 2, 3, 'after clickSearch click'); + } + + /** + * + * @returns {Promise} + */ + async selectOptions() { + logger.debug(`select ${this.modeNames[this.step]}`); + + const clickablesSource = [ + [ + '#tree > ul > li:nth-child(4) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > ul > li:nth-child(2) > div > span:nth-child(3) > label', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(3) > div > span:nth-child(3) > label', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(4) > div > span:nth-child(3) > label' + ], + [ + '#tree > ul > li:nth-child(4) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > ul > li:nth-child(1) > div > span:nth-child(3) > label', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(1) > div > span:nth-child(3) > label', + '#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(2) > div > span:nth-child(3) > label' + ], + [ + '#tree > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(1) > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander', + '#tree > ul > li:nth-child(1) > ul > li:nth-child(1) > div > span:nth-child(3) > label' + ] + + ]; + + const clickables = clickablesSource[this.step]; + let itemFound; + await this.movePageToBottom(); + + for(let step = 0; step < clickables.length;step++) { + itemFound = false; + do{ + logger.debug('Wait for:', clickables[step]); + await this.page.waitForSelector(clickables[step], { 'timeout':75000 }).then(async (elm) => { + console.log('found'); + itemFound = true; + await elm.hover().catch((err) => { + logger.warn(err); + }); + + this._microWait(this.page, 5); + await elm.focus(); + this._microWait(this.page, 5); + await elm.click({ 'delay':90 }); + this._microWait(this.page, 5); + }).catch((e) => { + logger.error('item missing', e); + // pageLoaded = false; + }); + + await this._randomWait(this.page, 3, 4); + } + while(!itemFound); + } + } + + /** + * + * @returns {Promise} + */ + async motions() { + switch(this.step) { + + case 0: + case 1: + case 2: + await this.selectOptions(); + + await this.clickSearch(); + + await this.renameFile(); + + await this.clickReturn(); + + break; + + default: + // Menu fell through + this.complete = true; + + this.emit('done'); + break; + + } + } + + /** + * + * @returns {Promise} + */ + async waitForPage() { + await this.page.waitForSelector('#tree > ul', { 'visible':true, 'timeout':75000 }).then(async (elm) => { + logger.debug('Option tree visible'); + + await this._randomWait(this.page, 3, 5); + + await this.clearCookieStrap(); + + await this.motions(); + }).catch((e) => { + logger.error('waitForPage', e); + }); + } + + /** + * + * @returns {Promise} + */ + async clearCookieStrap() { + await this.page.waitForSelector('#cookie-consent > div > div > button', { 'visible':true, 'timeout':7500 }).then(async (elm) => { + logger.debug('page'); + + await elm.click({ 'delay':90 }); + await this._randomWait(this.page, 3, 5); + }).catch(() => { + logger.debug('Cookie strap not found'); + }); + } + + /** + * + * @returns {Promise} + */ + + async processNewPage() { + logger.debug('** processNewPage'); + // give the page a few seconds to settle + await this._randomWait(this.page, 3, 5); + + await this.waitForPage(); + } + + /** + * + * @returns {Promise} + */ + async start() { + super._start(); + try { + this.step = 0; + this.complete = false; + + this.startPage = 'http://www.finanssivalvonta.fi/en/About_us/Supervised/Pages/supervisedentities.aspx'; + + this.setPath(path.resolve(`${__dirname }/../artefacts/FI/FCMC`)); + + await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => { + logger.error(err); + }); + + await this._initBrowser(false); + await this._createBrowserPage(); + + await this._makeResponsive(); + + this.page.on('domcontentloaded', this._throttle(async () => { + this.processNewPage().catch((err) => { + logger.error('processNewPage fail', err); + }); + }, 5000)); + + // Check and capture response file + this.page.on('response', async o => { + try{ + const rUrl = await o.url(); + + if (rUrl.includes('supervised-entity-api/v1/all-supervised-entities')) { + logger.debug('satus:', await o.status()); + + o.text().then((data) => { + if (data.length > 0) { + const filename = `${this.path}/exported.json`.substring(0, 240); + logger.debug('>> Intercepting:', rUrl); + + this.saveFile(filename, data); + } + else + logger.debug('Request response is empty'); + }).catch((e) => { + logger.warn(e.message); + }); + } + } + catch( err) { + logger.info('Response.text failed'); + } + }); + + this.on('next', this._throttle(async () => { + await this.page.goto(this.startPage).catch((err) => { + logger.error(err); + this._uploadError(); + }); + }, 5000)); + + await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }); + + await this.page.setViewport({ 'width': 1200, 'height': 800 }); + await this.page.goto(this.startPage).catch((err) => { + logger.error(err); + this._uploadError(); + }); + + await this._randomWait(this.page, 3, 5); + } + catch(e) { + throw Error(e); + } + } + + /** + * + * @returns {Promise} + * @private + */ + async __run() { + await this.start(); + } + +} + +module.exports = FIScrape; diff --git a/ncas/fr.js b/ncas/fr.js index d481fea..824cca5 100644 --- a/ncas/fr.js +++ b/ncas/fr.js @@ -178,9 +178,8 @@ class FRScrape extends Scraper { const statusField = $row.children().length - 1; const status = this._cleanUp($row.children().eq(statusField).text().toLowerCase()); - if(wantedCIStatuses.indexOf(status) !== -1) { + if(wantedCIStatuses.indexOf(status) !== -1) links.push({ link, title }); - } } } @@ -328,14 +327,15 @@ class FRScrape extends Scraper { async searchResultsProcessor($, store) { const $table = $('table.table tr'); - if ($table.length > 1) - // The table contains more than just the heading row + if ($table.length > 1) { + // The table contains more than just the heading row store.indexcount++; logger.debug(`Processing menu: ${this.modeTitles[this.mode]} // ${store.indexcount}`); - await this._makeScreenshotV2(this.page, `${this.path}/${this.modePrefix[this.mode]}_menu_${store.indexcount}`, null); + await this._makeScreenshotV2(this.page, `${this.path}/${this.modePrefix[this.mode]}_menu_${store.indexcount}`, null); store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2))); + } // check that the next button is active @@ -418,7 +418,7 @@ class FRScrape extends Scraper { async start() { await super._start(); try { - this.mode = 2; + this.mode = 0; this.paymentServices = { 'items': 0, @@ -438,7 +438,7 @@ class FRScrape extends Scraper { 'visited': false, 'done' : false, 'searchDone' : false, - 'indexcount' :0 + 'indexcount' :0 }; this.creditServices = { @@ -448,7 +448,7 @@ class FRScrape extends Scraper { 'visited': false, 'done' : false, 'searchDone' : false, - 'indexcount' :0 + 'indexcount' :0 }; this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3'; @@ -471,7 +471,7 @@ class FRScrape extends Scraper { await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }); await this.page.setViewport({ 'width': 1200, 'height': 800 }); - await this._goto(this.creditUrl); + await this._goto(this.startPage); await this._randomWait(this.page, 3, 5); } diff --git a/ncas/gi.js b/ncas/gi.js index cfc0f30..c95969a 100644 --- a/ncas/gi.js +++ b/ncas/gi.js @@ -12,7 +12,7 @@ class GIScrape extends Scraper { constructor() { super(); - this.id = 'GI'; + this.setID('GI'); // treat these elements as block boundaries when scraping permissions this.blockBoundaries = 'div.panel, li'; diff --git a/ncas/lt.js b/ncas/lt.js index f956336..0244cc7 100644 --- a/ncas/lt.js +++ b/ncas/lt.js @@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper'); const cheerio = require('cheerio'); const path = require('path'); const jsonfile = require('jsonfile'); -const logger = require('log4js').getLogger('LT'); +const logger = require('log4js').getLogger('(LT)'); const url = require('url'); logger.level = process.env.LOGGER_LEVEL || 'warn'; @@ -11,7 +11,7 @@ class LTScrape extends Scraper { constructor() { super(); - this.id = 'LT'; + this.setID('LT'); this.addToBlockFilters(['smartlook.com', 'd10lpsik1i8c69', 'mouseflow.com', 'inspectlet.com']); diff --git a/ncas/lu.js b/ncas/lu.js index 1116bc3..78e8204 100644 --- a/ncas/lu.js +++ b/ncas/lu.js @@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper'); const cheerio = require('cheerio'); const path = require('path'); const jsonfile = require('jsonfile'); -const logger = require('log4js').getLogger('LU'); +const logger = require('log4js').getLogger('(LU)'); const url = require('url'); logger.level = process.env.LOGGER_LEVEL || 'warn'; @@ -28,7 +28,7 @@ class LUScrape extends Scraper { constructor() { super(); - this.id = 'LU'; + this.setID('LU'); this.on('done', () => { this._done(); diff --git a/ncas/lv.js b/ncas/lv.js index 933bf5f..5f7aa07 100644 --- a/ncas/lv.js +++ b/ncas/lv.js @@ -373,16 +373,16 @@ class LVScrape extends Scraper { switch (splitUrl[1]) { - case '/en/market/payment-institutions/': - case '/en/market/electronic-money-institutions/': + case '/en/market/payment-service-providers/payment-institutions/': + case '/en/market/payment-service-providers/electronic-money-institutions/': case '/en/market/credit-institutions/': await this.indexRedirector(); break; - case '/en/market/payment-institutions/authorized-payment-institutions/': - case '/en/market/payment-institutions/registered-payment-institutions/': - case '/en/market/electronic-money-institutions/authorized-electronic-money-institutions/': - case '/en/market/electronic-money-institutions/registered-electronic-money-institutions/': + case '/en/market/payment-service-providers/payment-institutions/authorized-payment-institutions/': + case '/en/market/payment-service-providers/payment-institutions/registered-payment-institutions/': + case '/en/market/payment-service-providers/electronic-money-institutions/authorized-electronic-money-institutions/': + case '/en/market/payment-service-providers/electronic-money-institutions/registered-electronic-money-institutions/': case '/en/market/credit-institutions/banks/': await this.processRedirector(); break; @@ -552,7 +552,7 @@ class LVScrape extends Scraper { 'indexStep': 0, 'visited': false, 'done' : false, - 'urls': ['http://www.fktk.lv/en/market/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-institutions/registered-payment-institutions.html'], + 'urls': ['http://www.fktk.lv/en/market/payment-service-providers/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-service-providers/payment-institutions/registered-payment-institutions.html'], 'sections' : [], 'sectionLinks' : [] }; @@ -564,7 +564,7 @@ class LVScrape extends Scraper { 'indexStep': 0, 'visited': false, 'done' : false, - 'urls': ['http://www.fktk.lv/en/market/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/electronic-money-institutions/registered-electronic-money-institutions.html'], + 'urls': ['http://www.fktk.lv/en/market/payment-service-providers/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/payment-service-providers/electronic-money-institutions/registered-electronic-money-institutions.html'], 'sections' : [], 'sectionLinks' : [] }; diff --git a/ncas/nl.js b/ncas/nl.js index fc808e5..6905238 100644 --- a/ncas/nl.js +++ b/ncas/nl.js @@ -216,26 +216,32 @@ class NLScrape extends Scraper { logger.debug('No passporting In tab'); }); - const body = await this.page.content(); - const details = await this.extractDetail(body); - const activity = await this.extractActivity(body); - const passportingOut = await this.extractPassportingOut(body); - const passportingIn = await this.extractPassportingIn(body); + try{ + const body = await this.page.content(); + const details = await this.extractDetail(body); + const activity = await this.extractActivity(body); + const passportingOut = await this.extractPassportingOut(body); + const passportingIn = await this.extractPassportingIn(body); - await jsonfile.writeFile(`${filePath}.json`, { details, activity, passportingOut, passportingIn }); + await jsonfile.writeFile(`${filePath}.json`, { details, activity, passportingOut, passportingIn }); - await this._randomWait(this.page, 3, 5); + await this._randomWait(this.page, 3, 5); - serviceObject.links[serviceObject.step].filename = `${filename}.json`; - serviceObject.step++; + serviceObject.links[serviceObject.step].filename = `${filename}.json`; + serviceObject.step++; - if (serviceObject.step < serviceObject.items) { - const newUrl = `https://www.dnb.nl/en/supervision/public-register/${urlSections[this.mode]}/${serviceObject.links[serviceObject.step].href}`; + if (serviceObject.step < serviceObject.items) { + const newUrl = `https://www.dnb.nl/en/supervision/public-register/${urlSections[this.mode]}/${serviceObject.links[serviceObject.step].href}`; - await this._goto(newUrl); + await this._goto(newUrl); + } + else + this.emit('entityDone'); + } + catch( err) { + logger.error(err); + this.emit('recover'); } - else - this.emit('entityDone'); } /** diff --git a/ncas/pl.js b/ncas/pl.js index 9b555c0..02dd948 100644 --- a/ncas/pl.js +++ b/ncas/pl.js @@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper'); const cheerio = require('cheerio'); const path = require('path'); const jsonfile = require('jsonfile'); -const logger = require('log4js').getLogger('PL'); +const logger = require('log4js').getLogger('(PL)'); const url = require('url'); logger.level = process.env.LOGGER_LEVEL || 'warn'; @@ -11,7 +11,7 @@ class PLScrape extends Scraper { constructor() { super(); - this.id = 'PL'; + this.setID('PL'); this.version = '0.0.1-1'; this.on('done', () => { @@ -690,6 +690,7 @@ class PLScrape extends Scraper { if (serviceObject.step < serviceObject.currentIndexLength) { serviceObject.current = {}; + // 2019-05-08 :: THIS BIT BROKE TODAY if (this.mode === 0) await this._findAndClick('#allByJS > tbody > tr:nth-child(8) > td > span:nth-child(2) > input'); @@ -703,6 +704,7 @@ class PLScrape extends Scraper { } catch( err) { logger.error(err); + this.emit('recover'); } } diff --git a/ncas/pt.js b/ncas/pt.js index c64cc5a..973bc71 100644 --- a/ncas/pt.js +++ b/ncas/pt.js @@ -4,7 +4,7 @@ const cheerio = require('cheerio'); const path = require('path'); const jsonfile = require('jsonfile'); const removeAccents = require('remove-accents-diacritics'); -const logger = require('log4js').getLogger('PT'); +const logger = require('log4js').getLogger('(PT)'); const url = require('url'); logger.level = process.env.LOGGER_LEVEL || 'warn'; @@ -13,7 +13,7 @@ class PTScrape extends Scraper { constructor() { super(); - this.id = 'PT'; + this.setID('PT'); this.on('done', () => { this._done(); diff --git a/ncas/sk.js b/ncas/sk.js index aacf92d..0257769 100644 --- a/ncas/sk.js +++ b/ncas/sk.js @@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper'); const cheerio = require('cheerio'); const path = require('path'); const jsonfile = require('jsonfile'); -const logger = require('log4js').getLogger('SK'); +const logger = require('log4js').getLogger('(SK)'); const url = require('url'); const camelCase = require('camelcase'); @@ -12,7 +12,7 @@ class SKScrape extends Scraper { constructor() { super(); - this.id = 'SK'; + this.setID('SK'); this.on('done', () => { this._done(); @@ -414,10 +414,15 @@ class SKScrape extends Scraper { for (const item of wantedAnchors) { const exItem = this._cleanUp(await this.page.evaluate(el => el.text, item)); - if (exItem === 'View') - await item.click({ 'delay': Scraper.notARobot() }).catch((e) => { - logger.debug('View click failed', e); + if (exItem === 'View') { + await item.hover().catch((e) => { + logger.warn('Hover failed', e.name); }); + + await item.click({ 'delay': Scraper.notARobot() }).catch((e) => { + logger.debug('View click failed', e.name); + }); + } } const entityName = `${serviceObject.current.businessName}_${serviceObject.current.referenceNumber}`; diff --git a/package-lock.json b/package-lock.json index a09f7f1..8f134a0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2300,7 +2300,8 @@ }, "ansi-regex": { "version": "2.1.1", - "bundled": true + "bundled": true, + "optional": true }, "aproba": { "version": "1.2.0", @@ -2318,11 +2319,13 @@ }, "balanced-match": { "version": "1.0.0", - "bundled": true + "bundled": true, + "optional": true }, "brace-expansion": { "version": "1.1.11", "bundled": true, + "optional": true, "requires": { "balanced-match": "^1.0.0", "concat-map": "0.0.1" @@ -2335,15 +2338,18 @@ }, "code-point-at": { "version": "1.1.0", - "bundled": true + "bundled": true, + "optional": true }, "concat-map": { "version": "0.0.1", - "bundled": true + "bundled": true, + "optional": true }, "console-control-strings": { "version": "1.1.0", - "bundled": true + "bundled": true, + "optional": true }, "core-util-is": { "version": "1.0.2", @@ -2446,7 +2452,8 @@ }, "inherits": { "version": "2.0.3", - "bundled": true + "bundled": true, + "optional": true }, "ini": { "version": "1.3.5", @@ -2456,6 +2463,7 @@ "is-fullwidth-code-point": { "version": "1.0.0", "bundled": true, + "optional": true, "requires": { "number-is-nan": "^1.0.0" } @@ -2468,17 +2476,20 @@ "minimatch": { "version": "3.0.4", "bundled": true, + "optional": true, "requires": { "brace-expansion": "^1.1.7" } }, "minimist": { "version": "0.0.8", - "bundled": true + "bundled": true, + "optional": true }, "minipass": { "version": "2.3.5", "bundled": true, + "optional": true, "requires": { "safe-buffer": "^5.1.2", "yallist": "^3.0.0" @@ -2495,6 +2506,7 @@ "mkdirp": { "version": "0.5.1", "bundled": true, + "optional": true, "requires": { "minimist": "0.0.8" } @@ -2567,7 +2579,8 @@ }, "number-is-nan": { "version": "1.0.1", - "bundled": true + "bundled": true, + "optional": true }, "object-assign": { "version": "4.1.1", @@ -2577,6 +2590,7 @@ "once": { "version": "1.4.0", "bundled": true, + "optional": true, "requires": { "wrappy": "1" } @@ -2652,7 +2666,8 @@ }, "safe-buffer": { "version": "5.1.2", - "bundled": true + "bundled": true, + "optional": true }, "safer-buffer": { "version": "2.1.2", @@ -2682,6 +2697,7 @@ "string-width": { "version": "1.0.2", "bundled": true, + "optional": true, "requires": { "code-point-at": "^1.0.0", "is-fullwidth-code-point": "^1.0.0", @@ -2699,6 +2715,7 @@ "strip-ansi": { "version": "3.0.1", "bundled": true, + "optional": true, "requires": { "ansi-regex": "^2.0.0" } @@ -2737,11 +2754,13 @@ }, "wrappy": { "version": "1.0.2", - "bundled": true + "bundled": true, + "optional": true }, "yallist": { "version": "3.0.3", - "bundled": true + "bundled": true, + "optional": true } } },