2019-05-12

This commit is contained in:
Martin Donnelly 2019-05-12 18:33:09 +01:00
parent ff6985b72f
commit a5109efabe
19 changed files with 1354 additions and 93 deletions

23
at.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Austria = require('./ncas/at');
/**
 * Launches the Austrian scraper.
 *
 * When AT_CRON is a string a CronJob is created with it (the trailing `true`
 * is handed to CronJob as its fourth argument). When SCRAPE_START equals the
 * scraper's id, a scrape is also run straight away.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const atScraper = new Austria();
  const cronTime = process.env.AT_CRON;

  if (typeof cronTime === 'string') {
    new CronJob(cronTime, async function() {
      await atScraper.run();
    }, null, true);
  }

  const startNow = process.env.SCRAPE_START === atScraper.id;
  if (startNow) {
    await atScraper.run();
  }

  console.log('AT Launched');
}
run();

23
be.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Belgium = require('./ncas/be');
/**
 * Launches the Belgian scraper.
 *
 * When BE_CRON is a string a CronJob is created with it (the trailing `true`
 * is handed to CronJob as its fourth argument). When SCRAPE_START equals the
 * scraper's id, a scrape is also run straight away.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const beScraper = new Belgium();
  const cronTime = process.env.BE_CRON;

  if (typeof cronTime === 'string') {
    new CronJob(cronTime, async function() {
      await beScraper.run();
    }, null, true);
  }

  const startNow = process.env.SCRAPE_START === beScraper.id;
  if (startNow) {
    await beScraper.run();
  }

  console.log('BE launched');
}
run();

View File

@ -13,27 +13,30 @@ function buildApps() {
const apps = [];
const list = [
{ 'cron':'IE_CRON', 'start':'IE', 'name':'IE', 'script':'ie.js', 'proxy': 'uk', 'crontime': '0 0 * * *' }, // 00:04:40
{ 'cron':'LU_CRON', 'start':'LU', 'name':'LU', 'script':'lu.js', 'proxy': 'uk', 'crontime': '10 0 * * *' }, // 01:12:53
{ 'cron':'LU_CRON', 'start':'LU', 'name':'LU', 'script':'lu.js', 'proxy': 'uk', 'crontime': '10 0 * * *' }, // "01:09:45.187"
{ 'cron':'IT_CRON', 'start':'IT', 'name':'IT', 'script':'it.js', 'proxy': 'uk', 'crontime': '10 1 * * *' }, // 04:51:37 - uk free at 6:30
{ 'cron':'CZ_CRON', 'start':'CZ', 'name':'CZ', 'script':'cz.js', 'proxy': 'uk', 'crontime': '20 6 * * *' }, // "00:24:01.696"
{ 'cron':'PT_CRON', 'start':'PT', 'name':'PT', 'script':'pt.js', 'proxy': 'uk', 'crontime': '0 7 * * *' }, // "00:53:02.432"
{ 'cron':'CY_CRON', 'start':'CY', 'name':'CY', 'script':'cy.js', 'proxy': 'fr', 'crontime': '0 0 * * *' }, // 00:01:03
{ 'cron':'SE_CRON', 'start':'SE', 'name':'SE', 'script':'se.js', 'proxy': 'fr', 'crontime': '5 0 * * *' }, // 00:43:45
{ 'cron':'FR_CRON', 'start':'FR', 'name':'FR', 'script':'fr.js', 'proxy': 'fr', 'crontime': '0 1 * * *' }, // 01:22:29
{ 'cron':'LT_CRON', 'start':'LT', 'name':'LT', 'script':'lt.js', 'proxy': 'fr', 'crontime': '30 2 * * *' }, // 00:53:26
{ 'cron':'LT_CRON', 'start':'LT', 'name':'LT', 'script':'lt.js', 'proxy': 'fr', 'crontime': '30 2 * * *' }, // "00:54:28.134"
{ 'cron':'SK_CRON', 'start':'SK', 'name':'SK', 'script':'sk.js', 'proxy': 'fr', 'crontime': '30 3 * * *' }, // 00:24:03 - fr free at 4:00
{ 'cron':'DE_CRON', 'start':'DE', 'name':'DE', 'script':'de.js', 'proxy': 'de', 'crontime': '0 0 * * *' }, // 03:55:38 - de free at 4:00
{ 'cron':'NL_CRON', 'start':'NL', 'name':'NL', 'script':'nl.js', 'proxy': 'nl', 'crontime': '0 0 * * *' }, // 07:23:19 - nl free at 7:30
{ 'cron':'PL_CRON', 'start':'PL', 'name':'PL', 'script':'pl.js', 'proxy': 'ch', 'crontime': '0 0 * * *' }, // 17:59:18 - ch free at 18:00
{ 'cron':'CZ_CRON', 'start':'CZ', 'name':'CZ', 'script':'cz.js', 'proxy': 'uk' },
{ 'cron':'DK_CRON', 'start':'DK', 'name':'DK', 'script':'dk.js', 'proxy': 'uk' },
{ 'cron':'ES_CRON', 'start':'ES', 'name':'ES', 'script':'es.js', 'proxy': 'uk' },
{ 'cron':'GI_CRON', 'start':'GI', 'name':'GI', 'script':'gi.js', 'proxy': 'uk' },
{ 'cron':'GR_CRON', 'start':'GR', 'name':'GR', 'script':'gr.js', 'proxy': 'uk' },
{ 'cron':'MT_CRON', 'start':'MT', 'name':'MT', 'script':'mt.js', 'proxy': 'uk' },
{ 'cron':'PT_CRON', 'start':'PT', 'name':'PT', 'script':'pt.js', 'proxy': 'uk' },
{ 'cron':'LV_CRON', 'start':'LV', 'name':'LV', 'script':'lv.js', 'proxy': 'uk' },
{ 'cron':'NO_CRON', 'start':'NO', 'name':'NO', 'script':'no.js', 'proxy': 'uk' },
{ 'cron':'EE_CRON', 'start':'EE', 'name':'EE', 'script':'ee.js', 'proxy': 'uk' },
{ 'cron':'BG_CRON', 'start':'BG', 'name':'BG', 'script':'bg.js', 'proxy': 'uk' }
{ 'cron':'BG_CRON', 'start':'BG', 'name':'BG', 'script':'bg.js', 'proxy': 'uk' },
{ 'cron':'AT_CRON', 'start':'AT', 'name':'AT', 'script':'at.js', 'proxy': 'uk' },
{ 'cron':'FI_CRON', 'start':'FI', 'name':'FI', 'script':'fi.js', 'proxy': 'uk' },
{ 'cron':'BE_CRON', 'start':'BE', 'name':'BE', 'script':'be.js', 'proxy': 'uk' }
];
apps.push({

25
fi.js Normal file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Finland = require('./ncas/fi');
/**
 * Launches the Finnish scraper.
 *
 * When FI_CRON is a string a CronJob is created with it (the trailing `true`
 * is handed to CronJob as its fourth argument). When SCRAPE_START equals the
 * scraper's id, a scrape is also run straight away.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const fiScraper = new Finland();
  const cronTime = process.env.FI_CRON;

  if (typeof cronTime === 'string') {
    new CronJob(cronTime, async () => {
      await fiScraper.run();
    }, null, true);
  }

  const startNow = process.env.SCRAPE_START === fiScraper.id;
  if (startNow) {
    await fiScraper.run();
  }

  console.log('FI Launched');
}
run();

View File

@ -231,7 +231,8 @@ class Scraper extends EventEmitter {
'--disable-gpu',
'--window-size=1920x1080',
'--hide-scrollbars',
'--disable-default-apps'
'--disable-default-apps',
'--remote-debugging-port=9222'
]
}).catch((err) => {
logger.error('Puppeteer failed to launch');
@ -416,14 +417,25 @@ class Scraper extends EventEmitter {
* @private
*/
async _makeScreenshotV2(page, destPath, waitFor = null) {
if (waitFor)
await page.waitFor(waitFor);
try{
if (waitFor)
await page.waitFor(waitFor);
logger.debug('Snapshot', `${destPath}.png`);
await page.setViewport({ 'width': 1200, 'height': 800 });
await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => {
logger.error('Screenshot', err);
});
if(!this.page) {
logger.warn('_makeScreenshotV2: No Page -- Not taking screenshot');
return;
}
logger.debug('Snapshot', `${destPath}.png`);
await page.setViewport({ 'width': 1200, 'height': 800 });
await page.screenshot({ 'path': `${destPath}.png`, 'fullPage': true }).catch(err => {
logger.error('Screenshot', err);
});
}
catch( err) {
logger.error('_makeScreenshotV2', err);
}
}
/**
@ -1500,6 +1512,17 @@ class Scraper extends EventEmitter {
await jsonfile.writeFileSync(filePath, json);
}
_checkFileExistsSync(filePath) {
try {
fs.accessSync(filePath, fs.F_OK);
return true;
}
catch (err) {
return false;
}
}
/**
*
* @param page
@ -1681,6 +1704,15 @@ class Scraper extends EventEmitter {
await this._goto(rURL);
}
/**
 * Writes data to a file, logging (not throwing) on failure.
 *
 * The write itself is synchronous even though the method is declared async,
 * so the returned promise always resolves.
 *
 * @param filename destination path handed to fs.writeFileSync
 * @param data content handed to fs.writeFileSync
 * @returns {Promise<void>}
 */
async saveFile(filename, data) {
try{
fs.writeFileSync(filename, data);
}
catch( err) {
// best-effort: the error is only logged
logger.error(err);
}
}
}
module.exports = Scraper;

318
ncas/at.js Normal file
View File

@ -0,0 +1,318 @@
const cheerio = require('cheerio');
const logger = require('log4js').getLogger('AT');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const Scraper = require('../helpers/scraper');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class ATScrape extends Scraper {
constructor() {
super();
this.setID('AT');
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
 * Builds the FMA company-database search URL for one category.
 *
 * @param category value interpolated into the `category` query parameter
 *                 (callers in this file pass numeric strings such as '1977')
 * @returns {string} the search URL
 */
getIndexUrl(category) {
return `https://www.fma.gv.at/en/search-company-database/?cname=&place=&bic=&category=${category}&per_page=10&submitted=1&to=1`;
}
getHtmlNotInMatchingElements(html, selector) {
const $ = cheerio.load(html, { 'decodeEntities': false });
$(selector).remove();
// cheerio adds <html><body>... around the result, so we return the contents of <body>
return $('body').html();
}
extractFieldFromLiWithStrongHeader($, headerText, extractTextFromHtml = false) {
const fieldHeader = $(`div.company-details li > strong:contains("${headerText}")`);
if (fieldHeader.length < 1)
return '';
const fieldLi = fieldHeader.parent().html();
const fieldValue = this.getHtmlNotInMatchingElements(fieldLi, 'strong');
if (extractTextFromHtml)
return $(fieldValue).text().trim();
else
return fieldValue.trim();
}
extractSingleFields($, details) {
details['name'] = this._cleanUp($('h3 > a').text());
const addressRaw = this.extractFieldFromLiWithStrongHeader($, 'Address:');
let address = addressRaw.replace(/\s*\|\s*/g, ', '); // replace pipes with commas
address = address.replace(/\s+/g, ' '); // replace any non-standard spaces with simple spaces
address = address.trim();
details['address'] = address;
details['phone'] = this.extractFieldFromLiWithStrongHeader($, 'Phone:');
details['email'] = this.extractFieldFromLiWithStrongHeader($, 'Email:', true);
details['website'] = this.extractFieldFromLiWithStrongHeader($, 'Web:', true);
details['bankIdentificationNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Bank identification number:');
details['commercialRegisterNumber'] = this.extractFieldFromLiWithStrongHeader($, 'Commercial register number:');
}
extractMultiples($, details) {
details['categories'] = [];
const categoriesLi = $('div.company-details > ul > li > strong:contains("Category:")').parent().html();
const categoriesRaw = this.getHtmlNotInMatchingElements(categoriesLi, 'strong');
const categories = categoriesRaw.split(/<br>/);
for (let i = 0; i < categories.length; i++) {
let cat = categories[i];
cat = this._cleanUp(cat);
if (cat !== '')
details['categories'].push(cat);
}
}
extractPermissions($, details) {
details['permissions'] = [];
const permissionsDiv = $('div.modal-body');
$(permissionsDiv).find('h4').each((i, item) => {
const code = this._cleanUp($(item).text());
const description = this._cleanUp($(item).next().text());
details['permissions'].push({ 'code': code, 'description': description });
});
}
/**
 * Parses the HTML of one entity and collects all of its details.
 *
 * @param html HTML of a single entity block
 * @returns {Object} details populated by extractSingleFields,
 *          extractMultiples and extractPermissions
 */
extractEntityDetails(html) {
const details = {};
const $ = cheerio.load(html, { 'decodeEntities': false });
// each extractor writes its own keys onto the shared details object
this.extractSingleFields($, details);
this.extractMultiples($, details);
this.extractPermissions($, details);
return details;
}
currentPageAsString() {
return `${this.modeNames[this.mode]} url ${this.getCurrentMode().urlStep}, page ${this.getCurrentMode().paginationStep}`;
}
async expandAndScreenshot() {
logger.info(`Expanding content on ${this.currentPageAsString()}`);
await this.page.addStyleTag({
'content':
`
div.company-details { /* make space for the content */
position: static;
width: auto;
height: auto;
}
div.document-description { /* make content visible */
display: block;
position: static;
opacity: 1;
}
div.modal-dialog { /* move the content back down (it's transformed up a bit by default) */
transform: none !important;
}
div.modal-content { /* remove the drop shadow (might help render faster?) */
box-shadow: none;
-webkit-box-shadow: none;
}
`
});
logger.info(`Taking screenshot of ${this.currentPageAsString()}`);
const filename = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}_page_${this.getCurrentMode().paginationStep}`;
const path = `${this.path}/${filename}`;
await this._makeScreenshotV2(this.page, path);
}
serviceDone() {
logger.info(`${this.modeNames[this.mode]} done. Total of ${this.getCurrentMode().links.length} ${this.modeNames[this.mode]} scraped.`);
try{
jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links': this.getCurrentMode().links });
jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, this.getCurrentMode());
}
catch (e) {
logger.error(e);
}
}
async entityResultsPageProcessor() {
const body = await this.page.content();
const $ = cheerio.load(body, { 'decodeEntities': false });
await this.expandAndScreenshot();
const entities = $('div.company-details-wrap');
entities.each(async (i, item) => {
const noWhiteSpace = /\W/g;
const details = this.extractEntityDetails($(item).html());
const id = this._makeFieldName(details.name);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
jsonfile.writeFile(`${filePath}.json`, { details });
this.getCurrentMode().links.push({
'id': id,
'href': await this.page.url(),
'filename': filename
});
});
logger.info(`${entities.length} ${this.modeNames[this.mode]} entities scraped.`);
const nextLink = await this.page.$('div.paging li.next:not(.disabled) a');
if (nextLink !== null) {
logger.info('Clicking through to next page.');
this.getCurrentMode().paginationStep++;
const nextHref = await this.page.evaluate(link => {
return link.href;
}, nextLink);
this._goto(nextHref);
}
else {
this.serviceDone();
this.getCurrentMode().paginationStep = 1;
const nextUrl = this.getNextUrl();
if (nextUrl !== null)
this._goto(nextUrl);
else
this.emit('done');
}
}
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.pathname === '/en/search-company-database/')
await this.entityResultsPageProcessor();
else
logger.error(`Page url not recognised: ${pageUrl.href}`);
}
getCurrentMode() {
switch (this.mode) {
case 0:
return this.paymentServices;
case 1:
return this.emoneyServices;
case 2:
return this.creditServices;
}
}
getNextUrl() {
if (this.getCurrentMode().urlStep < this.getCurrentMode().urls.length - 1)
this.getCurrentMode().urlStep++;
else {
if (this.mode < this.modeNames.length - 1)
this.mode++;
else
return null;
}
return this.getCurrentMode().urls[this.getCurrentMode().urlStep];
}
/**
*
* @returns {Promise<void>}
*/
async start() {
// Sets up the browser page, the per-mode state and then loads the first index url.
super._start();
await this._initBrowser();
await this._createBrowserPage();
// every DOM load is handed to processNewPage; the handler is throttled (2500 passed to _throttle)
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
this.setPath(path.resolve(`${__dirname }/../artefacts/AT/FMA`));
// mode indexes into getCurrentMode(): 0 payment, 1 e-money, 2 credit
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'urlStep': 0,
'paginationStep': 1,
'urls': [
this.getIndexUrl('1977'), // Payment institutions - Payment Institutions licensed in Austria
this.getIndexUrl('2798'), // Payment Institutions - Account information service provider (AISP)
this.getIndexUrl('2799') // Payment Institutions - Payment initiation service provider (PISP)
]
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'urlStep': 0,
'paginationStep': 1,
'urls': [this.getIndexUrl('2193')] // E-Money-Institutions - E-Money-Institutions licensed in Austria
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'urlStep': 0,
'paginationStep': 1,
'urls': [this.getIndexUrl('165')] // Banks - Banks licensed in Austria
};
this.startPage = this.paymentServices.urls[0];
// a failure here is only logged; the scrape continues
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
try {
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage);
}
catch(e) {
// NOTE(review): wrapping in new Error(e) stringifies e and drops its original stack
throw new Error(e);
}
}
/**
 * Body of the throttled `run` created in the constructor; simply starts the scrape.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = ATScrape;

433
ncas/be.js Normal file
View File

@ -0,0 +1,433 @@
const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('BE');
const path = require('path');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const Scraper = require('../helpers/scraper');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class BEScrape extends Scraper {
constructor() {
super();
this.setID('BE');
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
async downloadFile(docLink) {
logger.info(`Downloading ${docLink}`);
await this.page.goto(docLink).catch((err) => {
if (err.message.indexOf('net::ERR_ABORTED') !== -1) {
logger.info(`Ignoring expected error upon file download: ${err.message}`);
}
else
throw err;
});
const waitMs = 5000;
const parsedUrl = url.parse(docLink);
const fileName = decodeURI(path.basename(parsedUrl.pathname.toLowerCase()));
const downloadFilePath = `${this.path}/${fileName}`;
let tries;
for (tries = 1; tries <= 10; tries++) {
logger.info('Waiting...');
await this.page.waitFor(waitMs);
if (this._checkFileExistsSync(downloadFilePath)) {
logger.info(`${docLink} successfully downloaded.`);
return true;
}
else {
logger.info(`Still waiting for ${docLink} to download after ${tries * waitMs / 1000} seconds...`);
}
}
// if we reach this point, download has failed
logger.error(`${docLink} failed to download after ${tries * waitMs / 1000} seconds....`);
return false;
}
normaliseDocLink(docLink) {
if (!docLink.startsWith('http://www.nbb.be/')) {
// attempt to normalise document link
if (docLink.startsWith('file:///L:/PRXNWEBP/')) {
return docLink.replace('file:///L:/PRXNWEBP/', 'http://www.nbb.be/');
}
else {
logger.warn(`Unable to normalise document link, unknown format, will attempt download as is: ${docLink}`);
}
}
return docLink;
}
/**
 * Replaces every run of two or more whitespace characters with ', '.
 *
 * @param {string} value text to convert
 * @returns {string} the converted text
 */
convertMulitpleSpaceToCommaSpace(value) {
return value.replace(/\s{2,}/g, ', ');
}
/**
 * Extracts the main fields of one entity from its details container.
 *
 * The address, id and listing date are read positionally: `lines[n].next.data`
 * is presumably the text node that follows the n-th child element — TODO confirm
 * against the page markup.
 *
 * @param detailsContainer element holding one entity's details
 * @returns {Object} name, companyType, addressOne/Two/Three, uniqueId,
 *          dateOfListing, docLink and normalisedDocLink (null where absent)
 */
extractMainDetails(detailsContainer) {
const $ = require('cheerio');
const details = {};
details['name'] = $(detailsContainer).children('strong').text().trim();
details['companyType'] = $(detailsContainer).children('em').text().trim();
const lines = $(detailsContainer).children();
details['addressOne'] = this.convertMulitpleSpaceToCommaSpace(lines[3].next.data.trim());
details['addressTwo'] = this.convertMulitpleSpaceToCommaSpace(lines[4].next.data.trim());
// Occasionally line 5 will contain text. If this is the case, line 5 contains
// "addressThree", and every other line moves along by one.
let offset = 0;
if (lines[5].next.data.trim() !== '') {
offset = 1;
details['addressThree'] = this.convertMulitpleSpaceToCommaSpace(lines[5].next.data.trim());
}
else {
details['addressThree'] = null;
}
// the value is whatever follows the last ':' on the line
details['uniqueId'] = lines[6 + offset].next.data.split(':').pop().trim();
// the listing-date line may be missing altogether
details['dateOfListing'] = (lines[7 + offset] === undefined) ? null : lines[7 + offset].next.data.split(':').pop().trim();
const docLink = $(detailsContainer).children('a');
if (docLink.length > 0) {
details['docLink'] = docLink.attr('href');
details['normalisedDocLink'] = this.normaliseDocLink(docLink.attr('href'));
}
else {
details['docLink'] = null;
details['normalisedDocLink'] = null;
}
return details;
}
extractAdditionalDetails(tableCells) {
const $ = require('cheerio');
const additionalDetails = {};
tableCells.toArray().map((td) => {
const thText = $(td).closest('table').find('th').eq($(td).index()).text();
const fieldName = this._makeFieldName(thText);
additionalDetails[fieldName] = $(td).text().split(' '); // e.g. scrape "1 2 3" as ["1", "2", "3"]
});
return additionalDetails;
}
extractFullDetails(fullDetailsContainer, mode) {
const $ = require('cheerio');
switch (mode) {
case 0:
case 1:
// in modes 0 and 1 the main details are in the first td of the parent container
const mainDetails = this.extractMainDetails($(fullDetailsContainer).children('td').eq(0));
const additionalDetails = this.extractAdditionalDetails($(fullDetailsContainer).children('td').slice(1));
return {...mainDetails, ...additionalDetails};
case 2:
// in mode 2 (credit institutions) the main details are in the root.
return this.extractMainDetails(fullDetailsContainer);
// no additional details for credit institutions
}
}
extractEntitiesFromContainer(entitiesContainer, mode) {
const $ = require('cheerio');
const entities = [];
switch ($(entitiesContainer).prop("tagName")) {
case 'TBODY':
$(entitiesContainer).children('tr').each((index, item) => {
entities.push(this.extractFullDetails(item, mode));
});
break;
case 'UL':
$(entitiesContainer).children('li').each((index, item) => {
entities.push(this.extractFullDetails(item, mode));
});
break;
}
return entities;
}
extractIndex(indexContainer, mode) {
const $ = require('cheerio');
const title = $(indexContainer).find('div.field-name-field-page-intro > p').text().trim();
const description = $(indexContainer).find('div.description').html();
const legend = $(indexContainer).find('div.legend').html(); // not entirely necessary but good to keep a record
const entitiesContainer = $(indexContainer).find('ul.List1 tbody, ul.List1 ul.List2 > li > ul').eq(0);
let entities;
if (entitiesContainer.length > 0) {
entities = this.extractEntitiesFromContainer(entitiesContainer, mode);
}
else {
entities = [];
}
const changes = $(indexContainer).find('div.changes-12').html(); // not entirely necessary but good to keep a record
return { title, description, legend, entities, changes };
}
getIdByEntityName(name) {
const noWhiteSpace = /\W/g;
let id = this._makeFieldName(name).trim();
id = removeAccents.remove(id);
id = id.replace(noWhiteSpace, '_');
return id;
}
async processIndex() {
const pageUrl = await this.page.url();
logger.info(`Processing ${this.modeNames[this.mode]} index url number ${this.getCurrentMode().urlStep}: ${pageUrl}`);
await this.allowCookies();
const body = await this.page.content();
const $ = cheerio.load(body, { 'decodeEntities': false, 'encoding': 'utf-8' });
logger.info('Extracting index...')
const index = this.extractIndex($('div#PrudentialList'), this.mode);
logger.info(`Extracted ${index.entities.length} ${this.modeNames[this.mode]}.`);
logger.info(`Downloading ${this.modeNames[this.mode]} documents.`);
// download all documents from this index page
for (const entity of index.entities) {
if (entity.normalisedDocLink !== null) {
const didDownload = await this.downloadFile(entity.normalisedDocLink);
if (didDownload) {
// rename the file to match the json file name format
const parsedUrl = url.parse(entity.normalisedDocLink);
const originalFileName = decodeURI(path.basename(parsedUrl.pathname.toLowerCase()));
const originalFilePath = `${this.path}/${originalFileName}`;
const newFileName = [this.modePrefix[this.mode], this.getIdByEntityName(entity.name), path.extname(originalFileName)].join('');
const newFilePath = `${this.path}/${newFileName}`;
await this._renameFile(originalFilePath, newFilePath);
// save new file name to entity object so it can be found later.
entity['docLocalFilename'] = newFileName;
}
else {
entity['docLocalFilename'] = null;
}
}
}
logger.info(`Saving metadata for ${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`);
const description = index['description'];
const legend = index['legend'];
const changes = index['changes'];
const metadata = { description, legend, changes };
const metadataFileName = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}_metadata.json`;
const metadataFilePath = `${this.path}/${metadataFileName}`
jsonfile.writeFile(metadataFilePath, { metadata });
for (const entity of index.entities) {
const id = this.getIdByEntityName(entity.name);
// create json file for each entity
const filename = [this.modePrefix[this.mode], id].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
jsonfile.writeFile(`${filePath}.json`, { 'details': entity , metadataFileName});
// add entity details to "links" so that index file can be generated later
this.getCurrentMode().links.push({
'id': id,
'href': await this.page.url(),
'filename': filename
});
}
logger.info(`Taking screenshot of: ${pageUrl}`);
const screenshotFilename = `${this.modeNames[this.mode]}_url_${this.getCurrentMode().urlStep}`;
const screenshotPath = `${this.path}/${screenshotFilename}`;
await this._makeScreenshotV2(this.page, screenshotPath);
const nextUrl = this.getNextUrl();
if (nextUrl !== null)
await this._goto(nextUrl);
else
this.emit('done');
}
serviceDone() {
try{
jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links': this.getCurrentMode().links });
jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, this.getCurrentMode());
logger.info(`${this.modeNames[this.mode]} done.`);
logger.info(`Extracted a total of ${this.getCurrentMode().links.length} ${this.modeNames[this.mode]}`);
}
catch (e) {
logger.error(e);
}
}
getCurrentMode() {
switch (this.mode) {
case 0:
return this.paymentServices;
case 1:
return this.emoneyServices;
case 2:
return this.creditServices;
}
}
getNextUrl() {
if (this.getCurrentMode().urlStep < this.getCurrentMode().urls.length - 1)
this.getCurrentMode().urlStep++;
else {
this.serviceDone();
if (this.mode < this.modeNames.length - 1)
this.mode++;
else
return null;
}
return this.getCurrentMode().urls[this.getCurrentMode().urlStep];
}
async allowCookies() {
const agreeButton = await this.page.$('button.agree-button');
if (agreeButton !== null) {
logger.info('Agreeing to cookie policy.')
await agreeButton.click();
await this._randomWait(this.page, 3, 5);
}
}
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
// set download behaviour in case this is a new tab after a recovery
// TODO: this could be set by default in the base class for every new tab in every scraper
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
const pageUrl = await this.page.url();
if (pageUrl.includes('supervision-financiere/controle-prudentiel/domaines-de-controle'))
await this.processIndex();
else if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
}
/**
 * Intentionally empty; called from start() when exactly two event names
 * are registered on this emitter.
 *
 * @returns {Promise<void>}
 */
async attachEvents() {
}
/**
 * Prepares the per-mode state, opens the browser page and loads the first
 * index url.
 *
 * @returns {Promise<void>}
 */
async start() {
super._start();
this.setPath(path.resolve(`${__dirname}/../artefacts/BE/NBB`));
// mode indexes into getCurrentMode(): 0 payment, 1 e-money, 2 credit
this.mode = 0;
this.paymentServices = {
'links': [],
'urlStep': 0,
'urls': [
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-15',
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-14',
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-16'
]
};
this.emoneyServices = {
'links': [],
'urlStep': 0,
'urls': [
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-9',
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-paiement-et-17'
]
};
this.creditServices = {
'links': [],
'urlStep': 0,
'urls': [
'https://www.nbb.be/fr/supervision-financiere/controle-prudentiel/domaines-de-controle/etablissements-de-credit/listes-7'
]
};
this.startPage = this.paymentServices.urls[0];
// a failure here is only logged; the scrape continues
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
// every DOM load is handed to processNewPage; the handler is throttled (2500 passed to _throttle)
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
// NOTE(review): depends on the exact number of registered event names — confirm what the base class registers
if (this.eventNames().length === 2)
await this.attachEvents();
await this._goto(this.startPage);
}
/**
 * Body of the throttled `run` created in the constructor; simply starts the scrape.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = BEScrape;

View File

@ -1,5 +1,6 @@
const logger = require('log4js').getLogger('BG');
const path = require('path');
const url = require('url');
const Scraper = require('../helpers/scraper');
@ -7,7 +8,7 @@ class BGScrape extends Scraper {
constructor() {
super();
this.id = 'BG';
this.setID('BG');
this.on('done', () => {
this._done();
@ -32,6 +33,64 @@ class BGScrape extends Scraper {
await this._randomWait(this.page, 3, 5);
}
async processPaymentServicesPage() {
await this._randomWait(this.page, 3, 5);
this._makeScreenshotV2(this.page, `${this.path}/ps_em_index`);
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
await this.downloadByHrefFilename('ps_po_register_2_en.xls'); // Payment Institutions
await this.downloadByHrefFilename('ps_po_register_3a_en.xls'); // eMoney Institutions
// wait until networkidle to ensure the above downloads are complete, then go to next page
await this._goto(this.creditInstitutionsPage, { 'waitUntil':'networkidle0' });
}
async processCreditInstitutionsPage() {
await this._randomWait(this.page, 3, 5);
this._makeScreenshotV2(this.page, `${this.path}/ci_index`);
// TODO: come back and scrape the html page version of this word doc, if we have time
await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc'); // banks and foreign banks' branches operating in Bulgaria
await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls'); // permissions list
// no more pages to go to at this point, so wait a final 10 seconds to allow files to download
// TODO: investigate whether this could be done with: // page.waitForNavigation({ waitUntil: 'networkidle0' })
await this.page.waitFor(10000);
this.emit('done');
}
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
// set download behaviour on every processNewPage in case this is a recovery attempt / new tab
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
if (pageUrl.href.includes('/PSPaymentOversightRegisters'))
await this.processPaymentServicesPage();
else if (pageUrl.href.includes('/RSCIRegisters'))
await this.processCreditInstitutionsPage();
else if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl.href}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl.href);
}
}
async start() {
super._start();
@ -45,34 +104,20 @@ class BGScrape extends Scraper {
logger.warn(err);
});
await this._initBrowser();
this.page = await this.browser.newPage();
await this._initBrowser(false);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
await this.page.setViewport({ 'width': 1200, 'height': 800 });
// set cookie for English language and load start page
await this.page.setCookie({ 'name': 'userLanguage', 'value': 'EN', 'domain': 'www.bnb.bg', 'path': '/' });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
this._makeScreenshotV2(this.page, `${this.path}/index1`);
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
await this.downloadByHrefFilename('ps_po_register_2_en.xls');
await this.downloadByHrefFilename('ps_po_register_3a_en.xls');
await this._goto(this.creditInstitutionsPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
this._makeScreenshotV2(this.page, `${this.path}/index2`);
// TODO: come back and scrape the html page version of this word doc, if we have time
await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc');
await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls');
// wait until all downloads finished with 'networkidle0' (currently this is only possible with 'page.goto', so we go back to the start page)
await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
this.emit('done');
}
async __run() {

327
ncas/fi.js Normal file
View File

@ -0,0 +1,327 @@
const Scraper = require('../helpers/scraper');
const path = require('path');
const logger = require('log4js').getLogger('FI');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class FIScrape extends Scraper {
constructor(checkForLock = true) {
super();
this.id = 'FI';
this.addToBlockFilters(['msecnd.net', 'siteimproveanalytics.com', 'newrelic.com', 'visualstudio.com']);
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (checkForLock)
this._checkLock().then((l) => {
if(l)
this.run();
});
this.on('error', (err) => {
logger.error('Error catcher!!', err);
});
}
/**
*
* @returns {Promise<void>}
*/
async movePageToBottom() {
// Scrolls down by one viewport height (window.innerHeight) inside the page;
// despite the name this is not necessarily the very bottom of the document.
await this.page.evaluate(() => {
window.scrollBy(0, window.innerHeight);
});
}
/**
*
* @returns {Promise<void>}
*/
async renameFile() {
try{
const filename = this.modeNames[this.step];
const sourceFile = 'exported.json';
const origFile = `${this.path}/${sourceFile}`;
const newFile = `${this.path}/${filename}.json`;
await this._renameFile(origFile, newFile);
}
catch( err) {
logger.error(err);
}
}
/**
*
* @returns {Promise<void>}
*/
async clickReturn() {
await this._randomWait(this.page, 5, 7, 'clickReturn');
this.step++;
this.emit('next');
}
/**
*
* @returns {Promise<void>}
*/
async clickSearch() {
logger.debug('clickSearch');
await this.movePageToBottom();
await this._randomWait(this.page, 2, 3, 'Move to bottom');
await this.page.waitForSelector('#tree-search-button', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
logger.debug('found');
await elm.focus();
this._microWait(this.page, 5);
await elm.click({ 'delay':90 });
}).catch((e) => {
logger.error('Search button missing', e);
});
await this._randomWait(this.page, 2, 3, 'after clickSearch click');
}
/**
*
* @returns {Promise<void>}
*/
async selectOptions() {
logger.debug(`select ${this.modeNames[this.step]}`);
const clickablesSource = [
[
'#tree > ul > li:nth-child(4) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > ul > li:nth-child(2) > div > span:nth-child(3) > label',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(3) > div > span:nth-child(3) > label',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(4) > div > span:nth-child(3) > label'
],
[
'#tree > ul > li:nth-child(4) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(1) > ul > li:nth-child(1) > div > span:nth-child(3) > label',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(1) > div > span:nth-child(3) > label',
'#tree > ul > li:nth-child(4) > ul > li:nth-child(2) > ul > li:nth-child(2) > div > span:nth-child(3) > label'
],
[
'#tree > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(1) > ul > li:nth-child(1) > div > span.gj-tree-glyphicons-expander',
'#tree > ul > li:nth-child(1) > ul > li:nth-child(1) > div > span:nth-child(3) > label'
]
];
const clickables = clickablesSource[this.step];
let itemFound;
await this.movePageToBottom();
for(let step = 0; step < clickables.length;step++) {
itemFound = false;
do{
logger.debug('Wait for:', clickables[step]);
await this.page.waitForSelector(clickables[step], { 'timeout':75000 }).then(async (elm) => {
console.log('found');
itemFound = true;
await elm.hover().catch((err) => {
logger.warn(err);
});
this._microWait(this.page, 5);
await elm.focus();
this._microWait(this.page, 5);
await elm.click({ 'delay':90 });
this._microWait(this.page, 5);
}).catch((e) => {
logger.error('item missing', e);
// pageLoaded = false;
});
await this._randomWait(this.page, 3, 4);
}
while(!itemFound);
}
}
/**
*
* @returns {Promise<void>}
*/
async motions() {
switch(this.step) {
case 0:
case 1:
case 2:
await this.selectOptions();
await this.clickSearch();
await this.renameFile();
await this.clickReturn();
break;
default:
// Menu fell through
this.complete = true;
this.emit('done');
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async waitForPage() {
await this.page.waitForSelector('#tree > ul', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
logger.debug('Option tree visible');
await this._randomWait(this.page, 3, 5);
await this.clearCookieStrap();
await this.motions();
}).catch((e) => {
logger.error('waitForPage', e);
});
}
/**
*
* @returns {Promise<void>}
*/
async clearCookieStrap() {
await this.page.waitForSelector('#cookie-consent > div > div > button', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
logger.debug('page');
await elm.click({ 'delay':90 });
await this._randomWait(this.page, 3, 5);
}).catch(() => {
logger.debug('Cookie strap not found');
});
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
logger.debug('** processNewPage');
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
await this.waitForPage();
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.step = 0;
this.complete = false;
this.startPage = 'http://www.finanssivalvonta.fi/en/About_us/Supervised/Pages/supervisedentities.aspx';
this.setPath(path.resolve(`${__dirname }/../artefacts/FI/FCMC`));
await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
logger.error(err);
});
await this._initBrowser(false);
await this._createBrowserPage();
await this._makeResponsive();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 5000));
// Check and capture response file
this.page.on('response', async o => {
try{
const rUrl = await o.url();
if (rUrl.includes('supervised-entity-api/v1/all-supervised-entities')) {
logger.debug('satus:', await o.status());
o.text().then((data) => {
if (data.length > 0) {
const filename = `${this.path}/exported.json`.substring(0, 240);
logger.debug('>> Intercepting:', rUrl);
this.saveFile(filename, data);
}
else
logger.debug('Request response is empty');
}).catch((e) => {
logger.warn(e.message);
});
}
}
catch( err) {
logger.info('Response.text failed');
}
});
this.on('next', this._throttle(async () => {
await this.page.goto(this.startPage).catch((err) => {
logger.error(err);
this._uploadError();
});
}, 5000));
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this.page.goto(this.startPage).catch((err) => {
logger.error(err);
this._uploadError();
});
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw Error(e);
}
}
/**
*
* @returns {Promise<void>}
* @private
*/
async __run() {
await this.start();
}
}
module.exports = FIScrape;

View File

@ -178,9 +178,8 @@ class FRScrape extends Scraper {
const statusField = $row.children().length - 1;
const status = this._cleanUp($row.children().eq(statusField).text().toLowerCase());
if(wantedCIStatuses.indexOf(status) !== -1) {
if(wantedCIStatuses.indexOf(status) !== -1)
links.push({ link, title });
}
}
}
@ -328,14 +327,15 @@ class FRScrape extends Scraper {
async searchResultsProcessor($, store) {
const $table = $('table.table tr');
if ($table.length > 1)
// The table contains more than just the heading row
if ($table.length > 1) {
// The table contains more than just the heading row
store.indexcount++;
logger.debug(`Processing menu: ${this.modeTitles[this.mode]} // ${store.indexcount}`);
await this._makeScreenshotV2(this.page, `${this.path}/${this.modePrefix[this.mode]}_menu_${store.indexcount}`, null);
await this._makeScreenshotV2(this.page, `${this.path}/${this.modePrefix[this.mode]}_menu_${store.indexcount}`, null);
store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2)));
}
// check that the next button is active
@ -418,7 +418,7 @@ class FRScrape extends Scraper {
async start() {
await super._start();
try {
this.mode = 2;
this.mode = 0;
this.paymentServices = {
'items': 0,
@ -438,7 +438,7 @@ class FRScrape extends Scraper {
'visited': false,
'done' : false,
'searchDone' : false,
'indexcount' :0
'indexcount' :0
};
this.creditServices = {
@ -448,7 +448,7 @@ class FRScrape extends Scraper {
'visited': false,
'done' : false,
'searchDone' : false,
'indexcount' :0
'indexcount' :0
};
this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3';
@ -471,7 +471,7 @@ class FRScrape extends Scraper {
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.creditUrl);
await this._goto(this.startPage);
await this._randomWait(this.page, 3, 5);
}

View File

@ -12,7 +12,7 @@ class GIScrape extends Scraper {
constructor() {
super();
this.id = 'GI';
this.setID('GI');
// treat these elements as block boundaries when scraping permissions
this.blockBoundaries = 'div.panel, li';

View File

@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('LT');
const logger = require('log4js').getLogger('(LT)');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
@ -11,7 +11,7 @@ class LTScrape extends Scraper {
constructor() {
super();
this.id = 'LT';
this.setID('LT');
this.addToBlockFilters(['smartlook.com', 'd10lpsik1i8c69', 'mouseflow.com', 'inspectlet.com']);

View File

@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('LU');
const logger = require('log4js').getLogger('(LU)');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
@ -28,7 +28,7 @@ class LUScrape extends Scraper {
constructor() {
super();
this.id = 'LU';
this.setID('LU');
this.on('done', () => {
this._done();

View File

@ -373,16 +373,16 @@ class LVScrape extends Scraper {
switch (splitUrl[1]) {
case '/en/market/payment-institutions/':
case '/en/market/electronic-money-institutions/':
case '/en/market/payment-service-providers/payment-institutions/':
case '/en/market/payment-service-providers/electronic-money-institutions/':
case '/en/market/credit-institutions/':
await this.indexRedirector();
break;
case '/en/market/payment-institutions/authorized-payment-institutions/':
case '/en/market/payment-institutions/registered-payment-institutions/':
case '/en/market/electronic-money-institutions/authorized-electronic-money-institutions/':
case '/en/market/electronic-money-institutions/registered-electronic-money-institutions/':
case '/en/market/payment-service-providers/payment-institutions/authorized-payment-institutions/':
case '/en/market/payment-service-providers/payment-institutions/registered-payment-institutions/':
case '/en/market/payment-service-providers/electronic-money-institutions/authorized-electronic-money-institutions/':
case '/en/market/payment-service-providers/electronic-money-institutions/registered-electronic-money-institutions/':
case '/en/market/credit-institutions/banks/':
await this.processRedirector();
break;
@ -552,7 +552,7 @@ class LVScrape extends Scraper {
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['http://www.fktk.lv/en/market/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-institutions/registered-payment-institutions.html'],
'urls': ['http://www.fktk.lv/en/market/payment-service-providers/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-service-providers/payment-institutions/registered-payment-institutions.html'],
'sections' : [],
'sectionLinks' : []
};
@ -564,7 +564,7 @@ class LVScrape extends Scraper {
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['http://www.fktk.lv/en/market/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/electronic-money-institutions/registered-electronic-money-institutions.html'],
'urls': ['http://www.fktk.lv/en/market/payment-service-providers/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/payment-service-providers/electronic-money-institutions/registered-electronic-money-institutions.html'],
'sections' : [],
'sectionLinks' : []
};

View File

@ -216,26 +216,32 @@ class NLScrape extends Scraper {
logger.debug('No passporting In tab');
});
const body = await this.page.content();
const details = await this.extractDetail(body);
const activity = await this.extractActivity(body);
const passportingOut = await this.extractPassportingOut(body);
const passportingIn = await this.extractPassportingIn(body);
try{
const body = await this.page.content();
const details = await this.extractDetail(body);
const activity = await this.extractActivity(body);
const passportingOut = await this.extractPassportingOut(body);
const passportingIn = await this.extractPassportingIn(body);
await jsonfile.writeFile(`${filePath}.json`, { details, activity, passportingOut, passportingIn });
await jsonfile.writeFile(`${filePath}.json`, { details, activity, passportingOut, passportingIn });
await this._randomWait(this.page, 3, 5);
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = `https://www.dnb.nl/en/supervision/public-register/${urlSections[this.mode]}/${serviceObject.links[serviceObject.step].href}`;
if (serviceObject.step < serviceObject.items) {
const newUrl = `https://www.dnb.nl/en/supervision/public-register/${urlSections[this.mode]}/${serviceObject.links[serviceObject.step].href}`;
await this._goto(newUrl);
await this._goto(newUrl);
}
else
this.emit('entityDone');
}
catch( err) {
logger.error(err);
this.emit('recover');
}
else
this.emit('entityDone');
}
/**

View File

@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('PL');
const logger = require('log4js').getLogger('(PL)');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
@ -11,7 +11,7 @@ class PLScrape extends Scraper {
constructor() {
super();
this.id = 'PL';
this.setID('PL');
this.version = '0.0.1-1';
this.on('done', () => {
@ -690,6 +690,7 @@ class PLScrape extends Scraper {
if (serviceObject.step < serviceObject.currentIndexLength) {
serviceObject.current = {};
// 2019-05-08 :: THIS BIT BROKE TODAY
if (this.mode === 0)
await this._findAndClick('#allByJS > tbody > tr:nth-child(8) > td > span:nth-child(2) > input');
@ -703,6 +704,7 @@ class PLScrape extends Scraper {
}
catch( err) {
logger.error(err);
this.emit('recover');
}
}

View File

@ -4,7 +4,7 @@ const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('PT');
const logger = require('log4js').getLogger('(PT)');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
@ -13,7 +13,7 @@ class PTScrape extends Scraper {
constructor() {
super();
this.id = 'PT';
this.setID('PT');
this.on('done', () => {
this._done();

View File

@ -2,7 +2,7 @@ const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('SK');
const logger = require('log4js').getLogger('(SK)');
const url = require('url');
const camelCase = require('camelcase');
@ -12,7 +12,7 @@ class SKScrape extends Scraper {
constructor() {
super();
this.id = 'SK';
this.setID('SK');
this.on('done', () => {
this._done();
@ -414,10 +414,15 @@ class SKScrape extends Scraper {
for (const item of wantedAnchors) {
const exItem = this._cleanUp(await this.page.evaluate(el => el.text, item));
if (exItem === 'View')
await item.click({ 'delay': Scraper.notARobot() }).catch((e) => {
logger.debug('View click failed', e);
if (exItem === 'View') {
await item.hover().catch((e) => {
logger.warn('Hover failed', e.name);
});
await item.click({ 'delay': Scraper.notARobot() }).catch((e) => {
logger.debug('View click failed', e.name);
});
}
}
const entityName = `${serviceObject.current.businessName}_${serviceObject.current.referenceNumber}`;

41
package-lock.json generated
View File

@ -2300,7 +2300,8 @@
},
"ansi-regex": {
"version": "2.1.1",
"bundled": true
"bundled": true,
"optional": true
},
"aproba": {
"version": "1.2.0",
@ -2318,11 +2319,13 @@
},
"balanced-match": {
"version": "1.0.0",
"bundled": true
"bundled": true,
"optional": true
},
"brace-expansion": {
"version": "1.1.11",
"bundled": true,
"optional": true,
"requires": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
@ -2335,15 +2338,18 @@
},
"code-point-at": {
"version": "1.1.0",
"bundled": true
"bundled": true,
"optional": true
},
"concat-map": {
"version": "0.0.1",
"bundled": true
"bundled": true,
"optional": true
},
"console-control-strings": {
"version": "1.1.0",
"bundled": true
"bundled": true,
"optional": true
},
"core-util-is": {
"version": "1.0.2",
@ -2446,7 +2452,8 @@
},
"inherits": {
"version": "2.0.3",
"bundled": true
"bundled": true,
"optional": true
},
"ini": {
"version": "1.3.5",
@ -2456,6 +2463,7 @@
"is-fullwidth-code-point": {
"version": "1.0.0",
"bundled": true,
"optional": true,
"requires": {
"number-is-nan": "^1.0.0"
}
@ -2468,17 +2476,20 @@
"minimatch": {
"version": "3.0.4",
"bundled": true,
"optional": true,
"requires": {
"brace-expansion": "^1.1.7"
}
},
"minimist": {
"version": "0.0.8",
"bundled": true
"bundled": true,
"optional": true
},
"minipass": {
"version": "2.3.5",
"bundled": true,
"optional": true,
"requires": {
"safe-buffer": "^5.1.2",
"yallist": "^3.0.0"
@ -2495,6 +2506,7 @@
"mkdirp": {
"version": "0.5.1",
"bundled": true,
"optional": true,
"requires": {
"minimist": "0.0.8"
}
@ -2567,7 +2579,8 @@
},
"number-is-nan": {
"version": "1.0.1",
"bundled": true
"bundled": true,
"optional": true
},
"object-assign": {
"version": "4.1.1",
@ -2577,6 +2590,7 @@
"once": {
"version": "1.4.0",
"bundled": true,
"optional": true,
"requires": {
"wrappy": "1"
}
@ -2652,7 +2666,8 @@
},
"safe-buffer": {
"version": "5.1.2",
"bundled": true
"bundled": true,
"optional": true
},
"safer-buffer": {
"version": "2.1.2",
@ -2682,6 +2697,7 @@
"string-width": {
"version": "1.0.2",
"bundled": true,
"optional": true,
"requires": {
"code-point-at": "^1.0.0",
"is-fullwidth-code-point": "^1.0.0",
@ -2699,6 +2715,7 @@
"strip-ansi": {
"version": "3.0.1",
"bundled": true,
"optional": true,
"requires": {
"ansi-regex": "^2.0.0"
}
@ -2737,11 +2754,13 @@
},
"wrappy": {
"version": "1.0.2",
"bundled": true
"bundled": true,
"optional": true
},
"yallist": {
"version": "3.0.3",
"bundled": true
"bundled": true,
"optional": true
}
}
},