This commit is contained in:
Martin Donnelly 2019-05-05 20:13:56 +01:00
commit be5d3eae07
363 changed files with 211512 additions and 0 deletions

55
.eslintrc.json Normal file
View File

@ -0,0 +1,55 @@
{
"parserOptions": {
"ecmaVersion": 2017,
"sourceType": "module",
"ecmaFeatures": {
"jsx": false
}
},
"env": {
"browser": false,
"node": true,
"es6": true
},
"rules": {
"arrow-spacing": "error",
"block-scoped-var": "error",
"block-spacing": "error",
"brace-style": ["error", "stroustrup", {}],
"camelcase": "error",
"comma-dangle": ["error", "never"],
"comma-spacing": ["error", { "before": false, "after": true }],
"comma-style": [1, "last"],
"consistent-this": [1, "_this"],
"curly": [1, "multi"],
"eol-last": 1,
"eqeqeq": 1,
"func-names": 1,
"indent": ["error", 2, { "SwitchCase": 1 }],
"lines-around-comment": ["error", { "beforeBlockComment": true, "allowArrayStart": true }],
"max-len": [1, 180, 2], // 2 spaces per tab, max 180 chars per line
"new-cap": 1,
"newline-before-return": "error",
"no-array-constructor": 1,
"no-inner-declarations": [1, "both"],
"no-mixed-spaces-and-tabs": 1,
"no-multi-spaces": 2,
"no-new-object": 1,
"no-shadow-restricted-names": 1,
"object-curly-spacing": ["error", "always"],
"padded-blocks": ["error", { "blocks": "never", "switches": "always" }],
"prefer-const": "error",
"prefer-template": "error",
"one-var": 0,
"quote-props": ["error", "always"],
"quotes": [1, "single"],
"radix": 1,
"semi": [1, "always"],
"space-before-blocks": [1, "always"],
"space-infix-ops": 1,
"vars-on-top": 1,
"no-multiple-empty-lines": ["error", { "max": 1, "maxEOF": 1 }],
"spaced-comment": ["error", "always", { "markers": ["/"] }]
}
}

161
.gitignore vendored Normal file
View File

@ -0,0 +1,161 @@
# Created by .ignore support plugin (hsz.mobi)
### Node template
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Typescript v1 declaration files
typings/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
### macOS template
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
.idea/
# User-specific stuff:
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/dictionaries
# Sensitive or high-churn files:
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.xml
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
# Gradle:
.idea/**/gradle.xml
.idea/**/libraries
# CMake
cmake-build-debug/
# Mongo Explorer plugin:
.idea/**/mongoSettings.xml
## File-based project format:
*.iws
## Plugin-specific files:
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
artefacts/screenshots/*.png
artefacts/*.txt
artefacts/*.json
artefacts/*.html
artefacts/*
/tests/*.zip
/output/
/dist/
!/tests/data/
/tests/sink/
/debug/
/update.sh
/setup/web/
/backup/
/archive.tar.gz
/user/

38
Dockerfile Normal file
View File

@ -0,0 +1,38 @@
# Application image: Node.js on Debian stretch plus the system packages, Python
# tooling (awscli), pm2 and the scraper sources copied in below.
FROM node:stretch
# VERSION is supplied with --build-arg; when it is unset or empty the
# environment variable falls back to "development".
ARG VERSION
ENV VERSION ${VERSION:-development}
# Put udev and initscripts on hold, then install build tools, git, an X stack
# (xorg + blackbox) and GTK/NSS/ALSA libraries.
# NOTE(review): presumably these libraries are what a headless browser needs - confirm.
# NOTE(review): the commands are chained with ';' rather than '&&', so a failing
# step does not abort this RUN instruction.
RUN echo udev hold | dpkg --set-selections;\
echo initscripts hold | dpkg --set-selections;\
apt-get -yq update;\
DEBIAN_FRONTEND=noninteractive apt-get install -yq -f --no-install-recommends build-essential dnsutils git xorg blackbox libasound2 libnss3-dev libxss1 libatk-bridge2.0-0 libgtk2.0-common libgtk-3-0 ;\
apt-get autoremove -yq ;\
apt-get clean -yq
# Install Python with pip (plus groff and less) and then the AWS CLI through pip.
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y \
python \
python-dev \
python-pip \
python-setuptools \
groff \
less \
&& pip install --upgrade awscli \
&& apt-get clean
WORKDIR /app
# Copy the start script, the package manifest and every top-level .js file,
# followed by the ncas/ and helpers/ directories.
COPY start.sh package.json *.js /app/
COPY ncas/ /app/ncas
COPY helpers/ /app/helpers
# Install pm2 globally and the project dependencies from package.json.
RUN npm install pm2 -g && npm install
# RUN npm install
# COPY start.sh /app/
# Make the start script executable and use it as the container entry point.
RUN chmod +x /app/start.sh
ENTRYPOINT ["/app/start.sh"]

20
Dockerfile.orig Normal file
View File

@ -0,0 +1,20 @@
# NOTE(review): presumably the earlier variant of the image kept for reference
# (jessie base, openbox, sources delivered as a tarball) - confirm it is still needed.
FROM node:jessie
# VERSION is supplied with --build-arg; when it is unset or empty the
# environment variable falls back to "development".
ARG VERSION
ENV VERSION ${VERSION:-development}
# Put udev and initscripts on hold, then install build tools, git, an X stack
# (xorg + openbox) and GTK/NSS/ALSA libraries.
# NOTE(review): the commands are chained with ';' rather than '&&', so a failing
# step does not abort this RUN instruction.
RUN echo udev hold | dpkg --set-selections;\
echo initscripts hold | dpkg --set-selections;\
apt-get -yq update;\
DEBIAN_FRONTEND=noninteractive apt-get install -yq -f --no-install-recommends build-essential dnsutils git xorg openbox libasound2 libnss3-dev libxss1 libatk-bridge2.0-0 libgtk2.0-common libgtk-3-0 ;\
apt-get autoremove -yq ;\
apt-get clean -yq
WORKDIR /app
# ADD unpacks a local tar archive into the destination directory.
ADD setup/web/archive.tar.gz /app
RUN npm install pm2 -g
RUN npm install --unsafe-perm
# Run the apps declared in ecosystem.config.js under pm2-runtime using the
# "production" environment.
CMD ["pm2-runtime", "start", "ecosystem.config.js", "--raw" , "--env", "production"]

6
Jenkinsfile vendored Normal file
View File

@ -0,0 +1,6 @@
@Library('OpenBankingUK/ob-pipeline-library') _
javaMsPipelinev2 {
projectName='obdfcascrape'
cluster='nca'
}

36
Makefile Normal file
View File

@ -0,0 +1,36 @@
# Build, push and run the project's Docker image.
PROJECT = obdfcascrape
# Image tag: abbreviated hash of the current git HEAD.
VERSION = $(shell git rev-parse --short HEAD)
ECR_REGION = eu-west-1
ECR_ACCOUNT_NUMBER = 482681734622
# ECR_REPO = $(ECR_ACCOUNT_NUMBER).dkr.ecr.$(ECR_REGION).amazonaws.com
ECR_REPO = mail.caliban.io:5000
#APP_IMAGE = 482681734622.dkr.ecr.eu-west-1.amazonaws.com/$(PROJECT):$(VERSION)
APP_IMAGE = $(ECR_REPO)/$(PROJECT):$(VERSION)
# Forwarded to `docker build --no-cache=...`; override with `make build NO_CACHE=true`.
NO_CACHE = false

# Build the docker image, passing the git-derived version as a build argument.
# docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
# tar -C ./ -czvf ./archive.tar.gz 'package.json' 'ncas/' 'helpers/' -X *.js
build:
	docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
.PHONY: build

# Push the docker image to the registry (builds it first).
push: build
	docker push $(APP_IMAGE)
.PHONY: push

# Run the docker image locally (builds it first).
run: build
	docker run $(APP_IMAGE)
.PHONY: run

# Print the version the image would be tagged with.
# echo $(VERSION)
ver:
	@echo '$(VERSION)'
.PHONY: ver

# Pack the sources into archive.tar.gz.
# docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
# NOTE(review): tar's -X option takes a file of exclude patterns; after the shell
# expands *.js the first script becomes that file and the rest become archive
# members - confirm this is what is intended.
tar:
	tar -C ./ -czvf ./archive.tar.gz 'package.json' 'ncas/' 'helpers/' -X *.js
# Was `.PHONY: build` (copy-paste slip), which left `tar` undeclared as phony.
.PHONY: tar

23
bg.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Bulgaria = require('./ncas/bg');
/**
 * Launch the Bulgaria (BG) scraper.
 *
 * If BG_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Bulgaria();
  const cronSpec = process.env.BG_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('BG launched');
}
run();

5
config.json Normal file
View File

@ -0,0 +1,5 @@
{
"TopicArn": "arn:aws:sns:eu-west-1:115486161803:obdfcascrape",
"QueueUrl": "https://sqs.eu-west-1.amazonaws.com/115486161803/obdfcascrape",
"QueueArn": "arn:aws:sqs:eu-west-1:115486161803:obdfcascrape"
}

64
consume.js Normal file
View File

@ -0,0 +1,64 @@
var AWS = require('aws-sdk');
var util = require('util');
var config = require('./config.json');
require('dotenv').config({
'path': `${__dirname }/.env`
});
// configure AWS
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });
var sqs = new AWS.SQS();
var receiveMessageParams = {
'QueueUrl': config.QueueUrl,
'MaxNumberOfMessages': 10
};
/**
 * Ask SQS for the next batch of messages; the result is handed to
 * receiveMessageCallback.
 */
function getMessages() {
  sqs.receiveMessage(receiveMessageParams, receiveMessageCallback);
}

/**
 * Handle the result of a receiveMessage call.
 *
 * Each message body is parsed as JSON and, when it carries a `Message` field,
 * that field is parsed as JSON too and dumped to the console. A '.' is written
 * per message and the next batch is requested immediately. When the batch is
 * empty (or the request failed) a '-' is written and polling resumes after one
 * second.
 *
 * @param {Error|null} err error reported by the SDK, if any
 * @param {object} data response holding an optional `Messages` array
 */
function receiveMessageCallback(err, data) {
  // Previously the error was dropped silently; report it and keep polling.
  if (err)
    console.error('receiveMessage failed', err);

  const messages = (data && data.Messages) || [];

  if (messages.length === 0) {
    process.stdout.write('-');
    setTimeout(getMessages, 1000);

    return;
  }

  for (const message of messages) {
    try {
      const body = JSON.parse(message.Body);

      if (body && body.Message)
        console.dir(JSON.parse(body.Message));
    }
    catch (parseErr) {
      // A single malformed body must not take the whole poller down.
      console.error('Skipping message with unparseable body', parseErr);
    }

    process.stdout.write('.');

    // Deleting processed messages is currently switched off, so nothing is
    // removed from the queue by this script. To re-enable:
    // sqs.deleteMessage({ 'QueueUrl': config.QueueUrl, 'ReceiptHandle': message.ReceiptHandle }, deleteMessageCallback);
  }

  getMessages();
}

/**
 * Callback for sqs.deleteMessage; only reports failures.
 *
 * @param {Error|null} err error reported by the SDK, if any
 * @param {object} data response of the delete call (unused)
 */
function deleteMessageCallback(err, data) {
  if (err)
    console.error('deleteMessage failed', err);
}
setTimeout(getMessages, 1000);

35
cy.js Normal file
View File

@ -0,0 +1,35 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
// TODO:
// parse arguments - we should run just 1 FCA per go &
// have option to run selected company from selected NCA
const argv = require('yargs').argv;
// load helper libs etc
// const Fca = require('./ncas/fca');
const Cyprus = require('./ncas/cy');
/**
 * Launch the Cyprus (CY) scraper.
 *
 * If CY_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Cyprus();
  const cronSpec = process.env.CY_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('CY Launched');
}
process.once('uncaughtException', function caught(err) {
console.error('Uncaught', err);
});
run();

29
cz.js Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Czech = require('./ncas/cz');
/**
 * Launch the Czech (CZ) scraper.
 *
 * If CZ_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Czech();
  const cronSpec = process.env.CZ_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('CZ Launched');
}
process.once('uncaughtException', function caught(err) {
console.error('Uncaught', err);
});
run();

29
de.js Normal file
View File

@ -0,0 +1,29 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Germany = require('./ncas/de');
/**
 * Launch the Germany (DE) scraper.
 *
 * If DE_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Germany();
  const cronSpec = process.env.DE_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('DE Launched');
}
process.once('uncaughtException', function caught(err) {
console.error('Uncaught', err);
});
run();

110
debuglogs.js Normal file
View File

@ -0,0 +1,110 @@
// https://github.com/markcallen/snssqs
const AWS = require('aws-sdk');
const util = require('util');
const async = require('async');
const fs = require('fs-extra');
const path = require('path');
const archiver = require('archiver-promise');
const logger = require('log4js').getLogger('DebugUploader');
const dateFormat = require('dateformat');
const { promisify } = require('util');
const readFileAsync = promisify( fs.readFile);
require('dotenv').config({
'path': `${__dirname }/.env`
});
logger.level = process.env.LOGGER_LEVEL || 'debug';
// configure AWS
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });
const s3 = new AWS.S3();
/**
 * Make sure a directory exists, creating it when nothing is found at the path.
 *
 * @param {string} destPath path of the directory to check / create
 * @returns {Promise<string>} resolves with destPath unchanged
 */
async function _createDirectory(destPath = null) {
  const missing = !fs.existsSync(destPath);

  if (missing)
    fs.ensureDirSync(destPath);

  return destPath;
}
/**
 * Write a zip archive (maximum compression) to `filename`.
 *
 * @param {string} destPath source to archive: a directory, or a glob pattern when `glob` is true
 * @param {string} filename path of the archive file to create
 * @param {boolean} glob when true `destPath` is added with archive.glob(), otherwise with archive.directory()
 * @returns {Promise<void>} resolves once the archive has been finalized
 * @throws {Error} 'Missing paths' when destPath or filename is empty; failures
 *   raised while finalizing are propagated as rejections
 */
async function _createArchive(destPath = null, filename = null, glob = false) {
  if (!destPath || !filename) {
    const e = new Error('Missing paths');

    logger.error(e);

    // Stop here: previously execution carried on and built an archiver with the missing values.
    throw e;
  }

  const archive = archiver(filename, {
    'zlib': { 'level': 9 } // Sets the compression level.
  });

  if (glob)
    archive.glob(`${destPath}`);
  else
    archive.directory(`${destPath}/`);

  // Awaiting lets a failed finalize reject the caller instead of leaving the promise pending forever.
  await archive.finalize();
  logger.debug('Archive finished');
}
/**
 * Upload a local file to the S3 bucket named by the S3_BUCKET environment
 * variable, using the file's base name as the object key.
 *
 * Read and upload failures are logged and swallowed (best effort).
 *
 * @param {string} filename path of the file to upload
 * @returns {Promise<object|undefined>} the S3 upload result, or undefined when reading or uploading failed
 */
async function _upload(filename) {
  logger.info('^^^ UPLOADING ^^^');

  const filePath = path.parse(filename);

  try {
    // readFile without an encoding already yields a Buffer, so it is passed on as-is
    // (the former `new Buffer.from(data, 'binary')` only made a copy).
    const data = await readFileAsync(filename);

    // NOTE(review): 'public-read' makes the uploaded archives world-readable - confirm this is intended.
    const result = await s3.upload({
      'Bucket': process.env.S3_BUCKET,
      'Key': filePath.base,
      'Body': data,
      'ACL': 'public-read'
    }).promise();

    logger.info('Successfully uploaded file.');

    return result;
  }
  catch (err) {
    logger.error(err);

    return undefined;
  }
}
/**
 * Zip the debug output and the PL artefacts into `dist/` and upload both archives.
 *
 * Archive names embed the HOSTNAME environment variable and a timestamp. The
 * timestamp mask comes from FILE_DATE_FORMAT (the historical misspelling
 * FILE_DATE_FOTMAT is still honoured) and defaults to 'yyyymmdd'.
 * Any failure is logged and not rethrown.
 *
 * @returns {Promise<void>}
 */
async function _archive() {
  logger.info('>-< ARCHIVING >-<');

  try {
    const now = new Date();

    // Prefer the correctly spelled variable; fall back to the old typo so existing deployments keep working.
    const mask = process.env.FILE_DATE_FORMAT || process.env.FILE_DATE_FOTMAT || 'yyyymmdd';

    await _createDirectory('dist');

    const timestamp = dateFormat(now, mask);
    const filename = `dist/debug-${process.env.HOSTNAME}-${timestamp}.zip`;
    const eFilename = `dist/pl-${process.env.HOSTNAME}-${timestamp}.zip`;

    await _createArchive('debug/', filename);

    // NOTE(review): in glob mode 'artefacts/PL' is used as the pattern itself - confirm it picks up the files inside the directory.
    await _createArchive('artefacts/PL', eFilename, true);

    await _upload(filename);
    await _upload(eFilename);
  }
  catch (e) {
    logger.error(e);
  }
}
logger.info('Debug Archiver');
async.series([_archive]);

28
dk.js Normal file
View File

@ -0,0 +1,28 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Denmark = require('./ncas/dkV2');
/**
 * Launch the Denmark (DK) scraper.
 *
 * If DK_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Denmark();
  const cronSpec = process.env.DK_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('DK Launched');
}
process.once('uncaughtException', function caught(err) {
console.error('Uncaught', err);
});
run();

10
docker-compose.yml Normal file
View File

@ -0,0 +1,10 @@
version: '3'
services:
web:
build: ./setup/web/.
container_name: web
ports:
- 9000:9000

9
docker.sh Executable file
View File

@ -0,0 +1,9 @@
#!/usr/bin/env bash
# Tear down, refresh and restart the docker-compose stack in the background.
# Stop and remove the containers of the current stack.
docker-compose down
# NOTE(review): the gulpfile in this commit only defines the 'bumpNcas' and
# 'bumpWatch' tasks (its 'default' task is commented out) - confirm this step works.
gulp default
# Fetch newer versions of the images referenced by the compose file.
docker-compose pull
# Rebuild the images and start the services detached.
docker-compose up --build -d

94
ecosystem.config.js Normal file
View File

@ -0,0 +1,94 @@
require('dotenv').config();
const dateFormat = require('dateformat');
/**
 * Build the pm2 `apps` list.
 *
 * The list always starts with the `watcher` helper. A scraper entry is added
 * for every item whose `<XX>_CRON` environment variable is defined or whose
 * code equals SCRAPE_START. Each scraper gets NODE_ENV=production and a
 * PROXY_URI derived from its `proxy` field.
 *
 * Cron handling for an item whose `<XX>_CRON` variable is defined:
 *  - `debugCron` set: the value from the environment is passed through;
 *  - otherwise the built-in `crontime` is used, falling back to the environment
 *    value for items that have no built-in schedule;
 *  - `cronBump` set: the schedule is replaced by a daily slot starting five
 *    minutes from now, each subsequent app two minutes later.
 *
 * @returns {Array<object>} pm2 application definitions
 */
function buildApps() {
  // proxies = ['uk', 'fr', 'de', 'nl', 'ch'];
  // NOTE(review): any non-empty string enables these flags, including the text 'false'.
  const debugCron = process.env['debugCron'] || false;
  const cronBump = process.env['cronBump'] || false;
  const baseDate = new Date();
  let startCronMS = baseDate.getTime() + ( 5 * (60 * 1000));

  console.log(`debugCron:${debugCron} // cronBump:${cronBump}`);

  const apps = [];

  // NOTE(review): the trailing hh:mm:ss comments look like observed run durations - confirm.
  const list = [
    { 'cron':'IE_CRON', 'start':'IE', 'name':'IE', 'script':'ie.js', 'proxy': 'uk', 'crontime': '0 0 * * *' }, // 00:04:40
    { 'cron':'LU_CRON', 'start':'LU', 'name':'LU', 'script':'lu.js', 'proxy': 'uk', 'crontime': '10 0 * * *' }, // 01:12:53
    { 'cron':'IT_CRON', 'start':'IT', 'name':'IT', 'script':'it.js', 'proxy': 'uk', 'crontime': '10 1 * * *' }, // 04:51:37 - uk free at 6:30
    { 'cron':'CY_CRON', 'start':'CY', 'name':'CY', 'script':'cy.js', 'proxy': 'fr', 'crontime': '0 0 * * *' }, // 00:01:03
    { 'cron':'SE_CRON', 'start':'SE', 'name':'SE', 'script':'se.js', 'proxy': 'fr', 'crontime': '5 0 * * *' }, // 00:43:45
    { 'cron':'FR_CRON', 'start':'FR', 'name':'FR', 'script':'fr.js', 'proxy': 'fr', 'crontime': '0 1 * * *' }, // 01:22:29
    { 'cron':'LT_CRON', 'start':'LT', 'name':'LT', 'script':'lt.js', 'proxy': 'fr', 'crontime': '30 2 * * *' }, // 00:53:26
    { 'cron':'SK_CRON', 'start':'SK', 'name':'SK', 'script':'sk.js', 'proxy': 'fr', 'crontime': '30 3 * * *' }, // 00:24:03 - fr free at 4:00
    { 'cron':'DE_CRON', 'start':'DE', 'name':'DE', 'script':'de.js', 'proxy': 'de', 'crontime': '0 0 * * *' }, // 03:55:38 - de free at 4:00
    { 'cron':'NL_CRON', 'start':'NL', 'name':'NL', 'script':'nl.js', 'proxy': 'nl', 'crontime': '0 0 * * *' }, // 07:23:19 - nl free at 7:30
    { 'cron':'PL_CRON', 'start':'PL', 'name':'PL', 'script':'pl.js', 'proxy': 'ch', 'crontime': '0 0 * * *' }, // 17:59:18 - ch free at 18:00
    { 'cron':'CZ_CRON', 'start':'CZ', 'name':'CZ', 'script':'cz.js', 'proxy': 'uk' },
    { 'cron':'DK_CRON', 'start':'DK', 'name':'DK', 'script':'dk.js', 'proxy': 'uk' },
    { 'cron':'ES_CRON', 'start':'ES', 'name':'ES', 'script':'es.js', 'proxy': 'uk' },
    { 'cron':'GI_CRON', 'start':'GI', 'name':'GI', 'script':'gi.js', 'proxy': 'uk' },
    { 'cron':'GR_CRON', 'start':'GR', 'name':'GR', 'script':'gr.js', 'proxy': 'uk' },
    { 'cron':'MT_CRON', 'start':'MT', 'name':'MT', 'script':'mt.js', 'proxy': 'uk' },
    { 'cron':'PT_CRON', 'start':'PT', 'name':'PT', 'script':'pt.js', 'proxy': 'uk' },
    { 'cron':'LV_CRON', 'start':'LV', 'name':'LV', 'script':'lv.js', 'proxy': 'uk' },
    { 'cron':'NO_CRON', 'start':'NO', 'name':'NO', 'script':'no.js', 'proxy': 'uk' },
    { 'cron':'EE_CRON', 'start':'EE', 'name':'EE', 'script':'ee.js', 'proxy': 'uk' },
    { 'cron':'BG_CRON', 'start':'BG', 'name':'BG', 'script':'bg.js', 'proxy': 'uk' }
  ];

  apps.push({
    'name' : 'watcher',
    'script' : 'helpers/watcher.js',
    'env': {
      'NODE_ENV': 'production'
    },
    'autorestart' : true,
    'max_restarts': 3,
    'restart_delay': 4000
  });

  for (const item of list) {
    const cronFromEnv = process.env[item.cron];
    const hasCron = typeof cronFromEnv !== 'undefined';

    // Skip scrapers that are neither scheduled nor requested for an immediate start.
    if (!hasCron && process.env.SCRAPE_START !== item.start)
      continue;

    const newItem = {
      'name' : item.name,
      'script' : item.script,
      'env': {
        'NODE_ENV': 'production',
        'PROXY_URI' : `${item.proxy}.proxymesh.com:31280`
      },
      'autorestart' : true,
      'max_restarts': 3,
      'restart_delay': 4000
    };

    if (hasCron) {
      // Items without a built-in crontime used to end up with `undefined` here,
      // discarding the schedule supplied through the environment.
      const useEnvCron = debugCron !== false || typeof item.crontime === 'undefined';

      newItem.env[item.cron] = useEnvCron ? cronFromEnv : item.crontime;

      if (cronBump !== false) {
        // 'M H' = minute and hour without leading zeros, followed by the literal '* * *'.
        newItem.env[item.cron] = dateFormat(startCronMS, 'M H "* * *"');
        startCronMS = startCronMS + ( 2 * (60 * 1000));
      }
    }

    apps.push(newItem);
  }

  const version = process.env.VERSION || 'NO VERSION!';

  console.log('*****************************');
  console.log(`LAUNCHING VERSION: ${version}`);
  console.log('*****************************');
  console.log(JSON.stringify(apps));

  return apps;
}
module.exports = {
'apps' : buildApps()
};

23
ee.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Estonia = require('./ncas/ee');
/**
 * Launch the Estonia (EE) scraper.
 *
 * If EE_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Estonia();
  const cronSpec = process.env.EE_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async function onTick() {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('EE Launched');
}
run();

25
es.js Normal file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Spain = require('./ncas/es');
/**
 * Launch the Spain (ES) scraper.
 *
 * If ES_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Spain();
  const cronSpec = process.env.ES_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('ES Launched');
}
run();

31
fr.js Normal file
View File

@ -0,0 +1,31 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
// load helper libs etc
// const Fca = require('./ncas/fca');
const France = require('./ncas/fr');
/**
 * Launch the France (FR) scraper.
 *
 * If FR_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new France();
  const cronSpec = process.env.FR_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('FR Launched');
}
process.once('uncaughtException', function caught(err) {
console.error('Uncaught', err);
});
run();

25
gi.js Normal file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Gibraltar = require('./ncas/gi');
/**
 * Launch the Gibraltar (GI) scraper.
 *
 * If GI_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Gibraltar();
  const cronSpec = process.env.GI_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('GI Launched');
}
run();

23
gr.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Greece = require('./ncas/gr');
/**
 * Launch the Greece (GR) scraper.
 *
 * If GR_CRON is present in the environment, a CronJob is created with that
 * value as its schedule and each tick awaits one scraper run. Separately, when
 * SCRAPE_START equals the scraper's id, one run is awaited straight away.
 */
async function run() {
  const scraper = new Greece();
  const cronSpec = process.env.GR_CRON;

  // null / true are CronJob's onComplete and start arguments (see the cron package docs).
  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('GR Launched');
}
run();

40
gulpfile.js Normal file
View File

@ -0,0 +1,40 @@
'use strict';
const gulp = require('gulp');
var bump = require('gulp-bump');
var changedInPlace = require('gulp-changed-in-place');
const debug = require('gulp-debug');
const watchFolders = ['ncas/**/*.js', 'helpers/**/*.js'];
// Pipe the scripts under ncas/ through gulp-changed-in-place, list them via
// gulp-debug, bump the prerelease part of their 'version' key and write them
// back in place.
gulp.task('bumpNcas', () => {
  // Return the stream so gulp knows when the task has finished; without it,
  // tasks depending on 'bumpNcas' can start before the files are written.
  return gulp.src('ncas/**/*.js')
    .pipe(changedInPlace({ 'firstPass': true }))
    .pipe(debug({ 'showFiles': true }))
    .pipe(bump({ 'key': 'version', 'type': 'prerelease' }))
    .pipe(gulp.dest('ncas'));
});
/*
gulp.task('styles', function() {
return gulp.src(['node_modules/backbone.modal/backbone.modal.css', 'node_modules/backbone.modal/backbone.modal.theme.css'])
.pipe(autoprefixer('last 2 version', 'safari 5', 'ie 8', 'ie 9', 'opera 12.1', 'ios 6', 'android 4'))
.pipe(concat('style.min.css'))
.pipe(cssnano())
.pipe(gulp.dest('live/css'));
});
gulp.task('default', function () {
return gulp.src('src/**//*/ .{ts,tsx}')
.pipe(changedInPlace())
.pipe(tsfmt())
.pipe(gulp.dest('src'));
});
*/
// Run 'bumpNcas' once as a dependency, then re-run it whenever a file matched
// by watchFolders changes.
const rerunBumpOnChange = () => {
  gulp.watch(watchFolders, ['bumpNcas']);
};

gulp.task('bumpWatch', ['bumpNcas'], rerunBumpOnChange);

53
helpers/csv-data.js Normal file
View File

@ -0,0 +1,53 @@
const AWS = require('aws-sdk');
const { parse, generate } = require('csv');
/**
 * Loads a CSV object from S3 and extracts the FRN / Firm columns of every row.
 *
 * Bucket and key come from OB_SCRAPE_BUCKET and OB_SCRAPE_KEYNAME, with
 * built-in defaults when those variables are unset or empty.
 */
class CsvData {
  constructor() {
    const { OB_SCRAPE_BUCKET, OB_SCRAPE_KEYNAME } = process.env;

    this.s3 = new AWS.S3();

    // defaults for bucket and main CSV name
    this.bucketName = OB_SCRAPE_BUCKET || 'obregstoretest';
    this.keyName = OB_SCRAPE_KEYNAME || 'artefacts/UK/FCA/latest/Firms with PSD Permissions (CSV).csv';
  }

  /**
   * Fetch the configured object from S3.
   *
   * @returns {Promise<object>} the getObject response
   */
  async _getCsvDataFromS3() {
    const request = {
      'Bucket': this.bucketName,
      'Key': this.keyName
    };

    return this.s3.getObject(request).promise();
  }

  /**
   * Parse the body of a getObject response as UTF-8 CSV.
   *
   * @param {object} csvBufferData response whose `Body` holds the CSV bytes
   * @returns {Promise<Array<object>>} one object per row
   */
  async _parseCsvBufferData(csvBufferData) {
    const csvText = csvBufferData.Body.toString('utf-8');
    const parserOptions = {
      // required to create objects (instead of arrays of strings); auto-discovery works fine
      'columns': true,
      // must stay true: the header reportedly carries an extra column saying when the file was generated
      'relax_column_count': true
    };

    return new Promise((resolve, reject) => {
      parse(csvText, parserOptions, (err, rows) => (err ? reject(err) : resolve(rows)));
    });
  }

  /**
   * Reduce parsed rows to their FRN and Firm values.
   *
   * @param {Array<object>} data parsed CSV rows
   * @returns {Promise<Array<{frn: *, firm: *}>>}
   */
  async _getFrnAndName(data) {
    return data.map(({ FRN, Firm }) => ({ 'frn': FRN, 'firm': Firm }));
  }

  /**
   * Download, parse and reduce the CSV in one go.
   *
   * @returns {Promise<Array<{frn: *, firm: *}>>}
   */
  async getOrgIds() {
    const s3Response = await this._getCsvDataFromS3();
    const rows = await this._parseCsvBufferData(s3Response);

    return this._getFrnAndName(rows);
  }
}
module.exports = CsvData;

View File

@ -0,0 +1 @@
[["Akquisitionsgeschäft","Acquisition business"],["Finanztransfergeschäft","Money transmission services"],["Zahlungsauthentifizierungsgeschäft","Payment authentication business"],["Digitalisiertes Zahlungsgeschäft","Digitized payment transaction"],["Ein- oder Auszahlungsgeschäft","Deposit or withdrawal transaction"],["Lastschriftgeschäft mit Kreditgewährung","Direct debit business with lending"],["Lastschriftgeschäft ohne Kreditgewährung","Direct debit business without lending"],["Zahlungskartengeschäft mit Kreditgewährung","Payment card business with credit"],["Zahlungskartengeschäft ohne Kreditgewährung","Payment card business without lending"],["Überweisungsgeschäft mit Kreditgewährung","Credit transfer transaction"],["Überweisungsgeschäft ohne Kreditgewährung","Remittance without credit"],["Auszahlungsgeschäft","Payment business"],["Einzahlungsgeschäft","Deposit business"],["Kontoinformationsdienste","Account information services"],["Zahlungsauslösedienste","Payment release services"],["Abschlußvermittlung","Terminating switch"],["Anlageberatung","Investment advice"],["Anlagevermittlung","Investment brokerage"],["Anlageverwaltung","Investment brokerage"],["Depotgeschäft","Custodian business"],["Diskontgeschäft","Discount store"],["Drittstaateneinlagenvermittlung","Non- EEA deposit broking"],["E-Geld-Geschäft","E-money business"],["Eigengeschäft","Own business"],["Eigenhandel","Proprietary trading"],["Einlagengeschäft","Deposit business"],["Emissionsgeschäft","Underwriting"],["Factoring","Factoring"],["Finanzierungsleasing","Finance lease"],["Finanzkommissionsgeschäft","Broking"],["Finanzportfolioverwaltung","Financial portfolio management"],["Garantiegeschäft","Guarantee business"],["Geldkartengeschäft","Money card business"],["Girogeschäft","Giro business"],["Kreditgeschäft","Lending business"],["Kreditkartengeschäft","Credit card business"],["Netzgeldgeschäft","Network money business"],["Organisiertes Handelssystem (OTF)","Organized trading system 
(OTF)"],["Platzierungsgeschäft","Placement business"],["Revolvinggeschäft, sog.","Revolving business, so-called"],["Scheck- u. Wechseleinzugs- u. Reisescheckgeschäft","Check and Change-in. Traveler's check business"],["Sortengeschäft","Foreign currency dealing"],["Bausparkassengeschäft","Building society business"],["Pfandbriefgeschäft","Pfandbrief business"],["Multilaterales Handelssystem","Multilateral trading system"],["Hypothekenbankengeschäft","Mortgage Banking"],["Entgegennahme von Einlagen (Nr. 1)","Receipt of deposits"],["Tätigkeit als zentrale Gegenpartei","Activity as a central counterparty"],["Tätigkeit als zentrale Gegenpartei nach Art. 14 VO (EU) Nr. 648/2012","Activity as central counterparty according to Art. 14 VO (EU) No. 648/2012"],["Tätigkeit als zentraler Kontrahent","Activity as central counterparty"],["Teilnahme an Versteigerungen für Emissionsberechtigungen","Participation in auctions for emission allowances"]]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,10 @@
[
[
"VP-inst, SI - obligationer",
""
],
[
"VP-inst, SI - depåbevis",
""
]
]

View File

@ -0,0 +1 @@
[["Akquisitionsgeschäft","Acquisition business"],["Finanztransfergeschäft","Money transmission services"],["Zahlungsauthentifizierungsgeschäft","Payment authentication business"],["Digitalisiertes Zahlungsgeschäft","Digitized payment transaction"],["Ein- oder Auszahlungsgeschäft","Deposit or withdrawal transaction"],["Lastschriftgeschäft mit Kreditgewährung","Direct debit business with lending"],["Lastschriftgeschäft ohne Kreditgewährung","Direct debit business without lending"],["Zahlungskartengeschäft mit Kreditgewährung","Payment card business with credit"],["Zahlungskartengeschäft ohne Kreditgewährung","Payment card business without lending"],["Überweisungsgeschäft mit Kreditgewährung","Credit transfer transaction"],["Überweisungsgeschäft ohne Kreditgewährung","Remittance without credit"],["Auszahlungsgeschäft","Payment business"],["Einzahlungsgeschäft","Deposit business"],["Kontoinformationsdienste","Account information services"],["Zahlungsauslösedienste","Payment release services"],["Abschlußvermittlung","Terminating switch"],["Anlageberatung","Investment advice"],["Anlagevermittlung","Investment brokerage"],["Anlageverwaltung","Investment brokerage"],["Depotgeschäft","Custodian business"],["Diskontgeschäft","Discount store"],["Drittstaateneinlagenvermittlung","Non- EEA deposit broking"],["E-Geld-Geschäft","E-money business"],["Eigengeschäft","Own business"],["Eigenhandel","Proprietary trading"],["Einlagengeschäft","Deposit business"],["Emissionsgeschäft","Underwriting"],["Factoring","Factoring"],["Finanzierungsleasing","Finance lease"],["Finanzkommissionsgeschäft","Broking"],["Finanzportfolioverwaltung","Financial portfolio management"],["Garantiegeschäft","Guarantee business"],["Geldkartengeschäft","Money card business"],["Girogeschäft","Giro business"],["Kreditgeschäft","Lending business"],["Kreditkartengeschäft","Credit card business"],["Netzgeldgeschäft","Network money business"],["Organisiertes Handelssystem (OTF)","Organized trading system 
(OTF)"],["Platzierungsgeschäft","Placement business"],["Revolvinggeschäft, sog.","Revolving business, so-called"],["Scheck- u. Wechseleinzugs- u. Reisescheckgeschäft","Check and Change-in. Traveler's check business"],["Sortengeschäft","Foreign currency dealing"],["Bausparkassengeschäft","Building society business"],["Pfandbriefgeschäft","Pfandbrief business"],["Multilaterales Handelssystem","Multilateral trading system"],["Hypothekenbankengeschäft","Mortgage Banking"],["Entgegennahme von Einlagen (Nr. 1)","Receipt of deposits"],["Tätigkeit als zentrale Gegenpartei","Activity as a central counterparty"],["Tätigkeit als zentrale Gegenpartei nach Art. 14 VO (EU) Nr. 648/2012","Activity as central counterparty according to Art. 14 VO (EU) No. 648/2012"],["Tätigkeit als zentraler Kontrahent","Activity as central counterparty"],["Teilnahme an Versteigerungen für Emissionsberechtigungen","Participation in auctions for emission allowances"]]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,558 @@
[
[
"VP-inst, SI - obligationer",
"VP inst, SI bonds"
],
[
"VP-inst, SI - depåbevis",
"VP Inst, SI Depositary Certificate"
],
[
"VP-inst, SI - aktier",
"VP inst, SI shares"
],
[
"Förvaltare av fondandelar",
"Manager of fund units"
],
[
"Mottagande & vidarebefordran av order avs fi instrument",
"Receipt and forwarding of order instruments"
],
[
"Utförande av order avs fi instrument på kunders uppdrag",
"Execution of order instruments on behalf of clients"
],
[
"Handel med finansiella instrument för egen räkning",
"Trade in financial instruments for own account"
],
[
"Diskretionär portföljförvaltning avs finansiella instrument",
"Discretionary portfolio management of financial instruments"
],
[
"Investeringsrådgivning till kund avs finansiella instrument",
"Investment advice to customer financial instruments"
],
[
"Garantigivning & placering av fi instrument m fast åtagande",
"Warranty & placement of fi m instruments m firm commitment"
],
[
"Placering av finansiella instrument utan fast åtagande",
"Placement of financial instruments without a firm commitment"
],
[
"Försäkring i samtliga livförsäkr.klasser (direkt) förs.förm.",
"Insurance in all life insurance classes (direct)."
],
[
"a) Olycksfalls- och sjukförsäkring (direkt)",
"(a) accident and sickness insurance (direct)"
],
[
"Oktroj och stadfästelse av bolagsordning",
"Octroj and confirmation of articles of association"
],
[
"BELGIEN",
"BELGIUM"
],
[
"Mottagande insättningar och andra återbet. medel fr allmänh.",
"Receiving deposits and other repayments. means of public."
],
[
"Portföljförvaltning och -rådgivning",
"Portfolio management and consulting"
],
[
"Förvaring och förvaltning av värdepapper",
"Securities storage and management"
],
[
"Utlåning",
"Loans"
],
[
"Betalningsförmedling",
"Money transfers"
],
[
"Garantiförbindelser och ställande av säkerhet",
"Warranty and Security"
],
[
"Penningmarknadsinstrument (checkar, växlar, depåbevis m.m)",
"Money market instruments (checks, bills, depository receipts, etc.)"
],
[
"Utländsk valuta",
"Foreign currency"
],
[
"Finansiella terminer och optioner",
"Financial futures and options"
],
[
"Valuta- och ränteinstrument",
"Currency and interest rate instruments"
],
[
"Överlåtbara värdepapper",
"Transferable securities"
],
[
"Medverkan i värdepappersemi. och tillhandahåll. av tjänster",
"Participation in securities issues. and provide. of services"
],
[
"Rådgivning till företag ang. kapitalstr, ind. strategi etc.",
"Advice to companies ang. kapitalstr. strategy etc."
],
[
"TYSKLAND",
"GERMANY"
],
[
"DANMARK",
"DENMARK"
],
[
"ESTLAND",
"ESTONIA"
],
[
"SPANIEN",
"SPAIN"
],
[
"FINLAND",
"FINLAND"
],
[
"FRANKRIKE",
"FRANCE"
],
[
"STORBRITANNIEN",
"UK"
],
[
"IRLAND",
"IRELAND"
],
[
"ITALIEN",
"ITALY"
],
[
"Handel för egen eller kunders räkning med ..(se förteckning)",
"Trade on behalf of your own or customers with .. (see list)"
],
[
"LITHUANIA",
"LITHUANIA"
],
[
"LUXEMBURG",
"LUXEMBOURG"
],
[
"LETTLAND",
"LATVIA"
],
[
"NEDERLÄNDERNA",
"NETHERLANDS"
],
[
"NORGE",
"NORWAY"
],
[
"Penningmarknadsmäkling",
"Money broking"
],
[
"Tillstånd att bedriva bankrörelse",
"Permission to conduct banking operations"
],
[
"Ia) Livförsäkring (direkt)",
"Ia) Life insurance (direct)"
],
[
"16. Annan förmögenhetsskada (direkt)",
"16. Other property damage (direct)"
],
[
"1. Olycksfall (direkt)",
"1. Accidents (direct)"
],
[
"2. Sjukdom (direkt)",
"2. Disease (direct)"
],
[
"ÖSTERRIKE",
"AUSTRIA"
],
[
"Kreditupplysningstjänster",
"Credit reference services"
],
[
"Finansiell leasing",
"Financial leasing"
],
[
"POLEN",
"POLAND"
],
[
"Mottagande & vidarebefordran av order beträffande fi instr",
"Receipt and forwarding of orders concerning financial instruments"
],
[
"Utförande av order på kunders uppdrag",
"Execution of orders on customer assignments"
],
[
"Portföljförvaltning",
"Portfolio management"
],
[
"Investeringsrådgivning",
"investment advice"
],
[
"Förvaring av fi instr för kund samt kontanta medel",
"Safekeeping of financial instruments for clients and cash funds"
],
[
"Lämnande av kredit till inv för transaktion i fi instr",
"Granting of credit to investors for transactions in financial instruments"
],
[
"Råd till företag & råd och tjänster vid fusioner & uppköp",
"Advice for companies & services and mergers & acquisitions"
],
[
"Valutatjänster i samband med investeringstjänster",
"Currency services in connection with investment services"
],
[
"GREKLAND",
"GREECE"
],
[
"ISLAND",
"ICELAND"
],
[
"Handel för egen räkning",
"Trade for your own account"
],
[
"PORTUGAL",
"PORTUGAL"
],
[
"Gränsöverskridande verksamhet med penningöverföring",
"Transboundary activity with money transfer"
],
[
"h) Försäkring i samtliga skadeförsäkringsklasser (direkt)",
"h) Insurance in all non-life classes (direct)"
],
[
"Utställande av administration av betalningsmedel",
"Issuing and administering means of payment"
],
[
"Operativ risk, schablonmetod",
"Operational risk, standardized method"
],
[
"Utgivning av elektroniska pengar",
"Electronic money issuance"
],
[
"BULGARIEN",
"BULGARIA"
],
[
"CYPERN",
"CYPRUS"
],
[
"TJECKIEN",
"CZECH REPUBLIC"
],
[
"KROATIEN",
"CROATIA"
],
[
"UNGERN",
"HUNGARY"
],
[
"LIECHTENSTEIN",
"LIECHTENSTEIN"
],
[
"MALTA",
"MALTA"
],
[
"RUMÄNIEN",
"ROMANIA"
],
[
"SLOVENIEN",
"SLOVENIA"
],
[
"SLOVAKIEN",
"SLOVAKIA"
],
[
"Tillstånd till metod för intern riskklassificering",
"Permission for internal risk classification method"
],
[
"Utgivning av säkerställda obligationer",
"Issuance of covered bonds"
],
[
"LITAUEN",
"LITHUANIA"
],
[
"Tillstånd att bedriva pensionssparrörelse (IPS)",
"Permission to pursue pension retirement (IPS)"
],
[
"Ib) Tilläggsförsäkring till livförsäkring (direkt)",
"Ib) Supplementary Insurance to Life Insurance (Direct)"
],
[
"13. Allmän ansvarighet (direkt)",
"13. General Accountability (Direct)"
],
[
"17. Rättsskydd (direkt)",
"17. Legal protection (direct)"
],
[
"18. Assistans (direkt)",
"18. Assistance (direct)"
],
[
"7. Godstransport (direkt)",
"7. Freight transport (direct)"
],
[
"8. Brand och naturkrafter (direkt)",
"8. Fire and natural forces (direct)"
],
[
"9. Annan sakskada (direkt)",
"9. Other property damage (direct)"
],
[
"IV. Lång olycksfall- och sjukförsäkring (direkt)",
"IV. Long accident and health insurance (direct)"
],
[
"Ia) Livförsäkring (indirekt)",
"(Ia) Life insurance (indirect)"
],
[
"Ib) Tilläggsförsäkring till livförsäkring (indirekt)",
"Ib) Supplementary Insurance to Life Insurance (indirect)"
],
[
"III. Försäkring anknuten till värdepappersfonder (direkt)",
"III. Insurance linked to mutual funds (direct)"
],
[
"III. Försäkring anknuten till värdepappersfonder (indirekt)",
"III. Insurance linked to mutual funds (indirect)"
],
[
"Bankfacktjänster",
"Safe custody services"
],
[
"Operativ risk, internmätningsmetod",
"Operational risk, internal measurement method"
],
[
"b) Motorfordonsförsäkring (direkt)",
"b) Motor vehicle insurance (direct)"
],
[
"VP-inst, SI - derivat",
"VP inst, SI derivative"
],
[
"VP-inst, SI - ETC:er",
"VP Inst, SI - ETC"
],
[
"VP-inst, SI - ETN:er",
"VP Inst, SI - ETN"
],
[
"VP-inst, SI - strukturerade finansiella produkter",
"VP inst, SI - structured financial products"
],
[
"VP-inst, SI - utsläppsrätter",
"VP inst, SI emission allowances"
],
[
"VP-inst, SI - värdepapperiserade derivat",
"VP inst, SI - securitized derivatives"
],
[
"Tillstånd till avancerad metod för intern riskklassifiering",
"Permit to advanced method for internal risk classification"
],
[
"Marknadsrisk, VaR-modell",
"Market risk, VaR model"
],
[
"Garantiverksamhet & placering av fi instr med fast åtagande",
"Warranty & Placement of Fixed Commitment"
],
[
"Tjänster i samband med garantigivning",
"Services in connection with warranty"
],
[
"Investeringstj mm för underligg derivat vid investeringstj",
"Investment etc. for underlying derivatives at investment"
],
[
"Investerings- och finansanalys & allm rekom av fi instr",
"Investment and financial analysis & general recommendations on financial instruments"
],
[
"10. Motorfordonsansvar (direkt)",
"10. Motor Vehicle Responsibility (Direct)"
],
[
"3. Landfordon (direkt)",
"3. Land vehicles (direct)"
],
[
"6. Fartyg (direkt)",
"6. Ships (direct)"
],
[
"VP-inst, SI - annat aktieliknande instrument",
"VP inst, SI - other share-like instrument"
],
[
"VP-inst, SI - börshandlade fonder",
"VP inst, SI-traded funds"
],
[
"VP-inst, SI - certifikat",
"VP Inst, SI Certificate"
],
[
"e) Försäkring mot brand och annan skada på egendom (direkt)",
"e) Insurance against fire and other property damage (direct)"
],
[
"Penningöverföring",
"money transfer"
],
[
"Genomföra betaltransaktioner",
"Make payment transactions"
],
[
"Insättning/uttag betalkonto",
"Deposit / withdrawal payment account"
],
[
"Genomföra betalningstransaktioner",
"Make payment transactions"
],
[
"Inlåning",
"deposits"
],
[
"Genomföra betalningstransaktioner genom kreditutrymme",
"Make payment transactions through credit space"
],
[
"Betalningsinitieringstjänster",
"Payment initiation services"
],
[
"Kontoinformationstjänster",
"account Information"
],
[
"Bostadskreditförmedling",
"Housing Finance Agency"
],
[
"Inlösa transaktionsbelopp",
"Redeem transaction amount"
],
[
"Genomförande av betaltransaktioner",
"Implementation of payment transactions"
],
[
"Genomförande av betaltransaktioner gm kreditutrymme",
"Implementation of payment transactions gm credit space"
],
[
"Utfärdande och/eller förvärvande av betalningsinstrument",
"Issuance and / or acquisition of payment instruments"
],
[
"GIBRALTAR",
"GIBRALTAR"
],
[
"Valutaväxling",
"Currency Exchange"
],
[
"Annan finansiell verksamhet",
"Other financial activities"
],
[
"Konsumentkreditförmedling",
"Consumer Credit Agency"
],
[
"Konsumentkreditgivning",
"consumer Credit"
],
[
"Genomföra betalningstransaktioner som systemoperatör",
"Make payment transactions as a system operator"
],
[
"Elpengar",
"E-money"
],
[
"Betalningstransakt via tele-/informationsteknik-/nätoperatör",
"Payment transaction via telecommunications / information technology / network operator"
]
]

50
helpers/dig.js Normal file
View File

@ -0,0 +1,50 @@
const { format, promisify } = require('util');
const exec = promisify(require('child_process').exec);
/* async function parseDig(output) {
console.log('parseDig', output);
const lines = output.split(/\n/);
const result = {
'A': [],
'CNAME': []
};
for (const line of lines) {
if (/^A (.*) from/.test(line)) {
const scn = line.match(/^A (.*). from/)[1];
result.A.push(scn);
}
if (/^CNAME (.*) from/.test(line)) {
const scn = line.match(/^CNAME (.*). from/)[1];
result.CNAME.push(scn);
}
}
return result;
}*/
/**
 * Runs the `dig` command for a name and resolves with its raw output.
 *
 * The output is returned unparsed; splitting it into record types is left to
 * the caller.
 *
 * @param {String} name Name to look up.
 * @returns {Promise<String>} stdout produced by the dig command.
 * @throws {TypeError} (as a rejected promise) when name is not a string.
 */
async function dig(name) {
  // Inside an async function a throw becomes a rejection the caller can handle.
  // The previous `new Promise(async ...)` form left the returned promise
  // pending forever and raised an unhandled rejection instead.
  if (typeof name !== 'string')
    throw new TypeError('name (string) is required');

  // NOTE(review): name is interpolated into a shell command line - confirm that
  // callers never pass untrusted input here.
  const cmd = format('dig %s +time=3 +retry=1', name);
  console.log('CMD', cmd);

  // A failing command makes exec reject; that rejection now reaches the caller.
  const { stdout } = await exec(cmd, { 'maxBuffer': 1024 * 1024 });

  return stdout;
}
module.exports = dig;
module.exports.default = dig;

3
helpers/m.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Starts the obdfcascrape container detached, with the IE, NL, CY, SE and DE
# cron switches enabled.
#
# AWS credentials are taken from the caller's environment instead of being
# written into this file. Any key that was previously committed here should be
# treated as leaked and rotated.
: "${AWS_ACCESS_KEY_ID:?AWS_ACCESS_KEY_ID must be set in the environment}"
: "${AWS_SECRET_ACCESS_KEY:?AWS_SECRET_ACCESS_KEY must be set in the environment}"
export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY

# "--env NAME" without a value forwards the variable from the host environment.
docker run -d \
  --env S3_BUCKET=obregstoretest-mdtest \
  --env AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY \
  --env AWS_REGION=eu-west-1 \
  --env LOGGER_LEVEL=debug \
  --env IE_CRON=1 \
  --env NL_CRON=1 \
  --env CY_CRON=1 \
  --env SE_CRON=1 \
  --env DE_CRON=1 \
  --env cronBump=true \
  mail.caliban.io:5000/obdfcascrape:95a9843

14
helpers/s3tools.js Normal file
View File

@ -0,0 +1,14 @@
// Configures the AWS SDK from environment variables for S3 helper code.
const AWS = require('aws-sdk');

// Load the .env file that sits one directory above this helper.
// (Previously spelled "equire", which threw a ReferenceError on load.)
require('dotenv').config({
  'path': `${__dirname }/../.env`
});

const logger = require('log4js').getLogger('S3Tools 🔧');

AWS.config.update({
  'accessKeyId': process.env.AWS_ACCESS_KEY_ID,
  'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY,
  'region': process.env.AWS_REGION || 'eu-west-1'
});

// NOTE(review): this file used to export `reduceArticle`, which is not defined
// anywhere in it, so requiring the module always threw. Nothing is exported
// until a real helper is implemented here.
module.exports = {};

1686
helpers/scraper.js Normal file

File diff suppressed because it is too large Load Diff

0
helpers/tools.js Normal file
View File

60
helpers/watcher.js Normal file
View File

@ -0,0 +1,60 @@
// UATU
const CronJob = require('cron').CronJob;
const pm2 = require('pm2');
const logger = require('log4js').getLogger('WATCHER');
const nodeFree = require('node-free');
// load env variables from file
require('dotenv').config();
logger.level = 'trace';
/**
 * Formats a byte count as a human readable string using 1024-based units.
 *
 * @param {Number} bytes Byte count to format.
 * @param {Number} [decimals=2] Maximum number of decimals; negative values are treated as 0.
 * @returns {String} For example '1.5 KB'; '0 Bytes' when bytes is 0.
 */
function formatBytes(bytes, decimals = 2) {
  if (bytes === 0) return '0 Bytes';

  const k = 1024;
  const dm = decimals < 0 ? 0 : decimals;
  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'];

  // Use the magnitude so negative inputs still pick a unit, and clamp the index
  // so values below 1 or beyond the largest unit never index outside `sizes`.
  const rawIndex = Math.floor(Math.log(Math.abs(bytes)) / Math.log(k));
  const i = Math.min(Math.max(rawIndex, 0), sizes.length - 1);

  // parseFloat drops the trailing zeros that toFixed adds.
  return `${parseFloat((bytes / Math.pow(k, i)).toFixed(dm)) } ${ sizes[i]}`;
}
/**
 * Logs memory and CPU usage for every process pm2 reports (except the one
 * named 'watcher'), followed by the total/used/free figures from node-free.
 *
 * The pm2 listing is callback based and is not awaited, so the totals line may
 * be logged before the per-process lines.
 */
async function logMemory() {
  const reportProcesses = (err, processDescriptionList) => {
    if (err) {
      logger.error(err);

      return;
    }

    for (const entry of processDescriptionList) {
      const { pid, name, monit } = entry;
      const { memory, cpu } = monit;

      // The watcher does not report on itself.
      if (name === 'watcher')
        continue;

      logger.info(`${name} :: PID:${pid} :: MEMORY:${formatBytes(memory)} :: CPU:${cpu}`);
    }
  };

  pm2.list(reportProcesses);

  const total = formatBytes(nodeFree.total());
  const used = formatBytes(nodeFree.used());
  const free = formatBytes(nodeFree.free());

  logger.info(`Total:${total} :: Used:${used} :: Free:${free}`);
}
/**
 * Connects to pm2, logs usage once the connection callback fires, and then
 * schedules a usage log every five minutes.
 */
async function run() {
  const everyFiveMinutes = '*/5 * * * *';

  pm2.connect(() => {
    logMemory();
  });

  const onTick = async () => {
    await logMemory();
  };

  // The fourth argument starts the job immediately.
  new CronJob(everyFiveMinutes, onTick, null, true);

  logger.info('Watcher Launched');
}

// Log the first uncaught exception rather than letting it go unreported.
process.once('uncaughtException', (err) => {
  logger.error('Uncaught', err);
});

run();

30
ie.js Normal file
View File

@ -0,0 +1,30 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Ireland = require('./ncas/ie');
/**
 * Launches the Ireland scraper.
 *
 * When IE_CRON is set the scraper is scheduled with that cron expression; when
 * SCRAPE_START equals the scraper id it is also run once straight away.
 */
async function run() {
  const scraper = new Ireland();
  const schedule = process.env.IE_CRON;

  if (typeof schedule === 'string') {
    console.log(`${scraper.id} cron set for ${schedule}`);

    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);
  }

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('IE Launched');
}

// Report the first uncaught exception on stderr.
process.once('uncaughtException', (err) => {
  console.error('Uncaught', err);
});

run();

80
index.js Normal file
View File

@ -0,0 +1,80 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
// TODO:
// parse arguments - we should run just 1 FCA per go &
// have option to run selected company from selected NCA
const argv = require('yargs').argv;
// load helper libs etc
// const Fca = require('./ncas/fca');
const Ireland = require('./ncas/ie');
const Denmark = require('./ncas/dk');
const France = require('./ncas/fr');
const Cyprus = require('./ncas/cy');
const Germany = require('./ncas/de');
const Netherlands = require('./ncas/nl');
const Sweden = require('./ncas/se');
const Malta = require('./ncas/mt');
/**
 * Instantiates every country scraper, prints the current date, and schedules
 * the Ireland and France scrapers with the cron expression '05 16 * * *'.
 */
async function run() {
  // Every scraper is constructed, in this order, even though only two are
  // scheduled below; direct one-off runs are currently disabled.
  const ieScraper = new Ireland();
  const dkScraper = new Denmark();
  const frScraper = new France();
  const cyScraper = new Cyprus();
  const deScraper = new Germany();
  const nlScraper = new Netherlands();
  const seScraper = new Sweden();
  const mtScraper = new Malta();

  console.log(new Date());

  const schedule = '05 16 * * *';

  // One independent job per scraper, so both fire on the same tick.
  for (const scraper of [ieScraper, frScraper])
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);
}

run();

28
it.js Normal file
View File

@ -0,0 +1,28 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Italy = require('./ncas/it');
/**
 * Launches the Italy scraper.
 *
 * When IT_CRON is set the scraper is scheduled with that cron expression; when
 * SCRAPE_START equals the scraper id it is also run once straight away.
 */
async function run() {
  const scraper = new Italy();
  const schedule = process.env.IT_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('IT Launched');
}

// Report the first uncaught exception on stderr.
process.once('uncaughtException', (err) => {
  console.error('Uncaught', err);
});

run();

47
launcher.js Normal file
View File

@ -0,0 +1,47 @@
const jsonfile = require('jsonfile');
require('dotenv').config();
/**
 * Builds one app definition per enabled country scraper and prints the list.
 *
 * A scraper is enabled when its <CODE>_CRON environment variable is defined or
 * when SCRAPE_START equals its country code. The definition keys look like a
 * process-manager app config - presumably pm2; confirm against the consumer.
 */
(function() {
  const countryCodes = ['CY', 'CZ', 'DE', 'DK', 'FR', 'GI', 'IE', 'IT', 'LU', 'MT', 'NL', 'PL', 'SE', 'SK', 'PT'];

  const isEnabled = (code) =>
    typeof process.env[`${code}_CRON`] !== 'undefined' || process.env.SCRAPE_START === code;

  // Each scraper script is named after its lower-cased country code.
  const toApp = (code) => ({
    'name': code,
    'script': `${code.toLowerCase()}.js`,
    'env': {
      'NODE_ENV': 'production'
    },
    'env_production': {
      'NODE_ENV': 'production'
    },
    'autorestart': true,
    'max_restarts': 3,
    'restart_delay': 4000
  });

  const apps = countryCodes.filter(isEnabled).map(toApp);

  console.log(apps);
})();

25
lt.js Normal file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Lithuania = require('./ncas/lt');
/**
 * Launches the Lithuania scraper.
 *
 * When LT_CRON is set the scraper is scheduled with that cron expression; when
 * SCRAPE_START equals the scraper id it is also run once straight away.
 */
async function run() {
  const ltScraper = new Lithuania();
  const schedule = process.env.LT_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await ltScraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === ltScraper.id)
    await ltScraper.run();

  console.log('LT Launched');
}

run();

23
lu.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Lux = require('./ncas/lu');
/**
 * Launches the Luxembourg scraper.
 *
 * When LU_CRON is set the scraper is scheduled with that cron expression; when
 * SCRAPE_START equals the scraper id it is also run once straight away.
 */
async function run() {
  const scraper = new Lux();
  const schedule = process.env.LU_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('LU Launched');
}

run();

23
lv.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Latvia = require('./ncas/lv');
/**
 * Launches the Latvia scraper.
 *
 * When LV_CRON is set the scraper is scheduled with that cron expression; when
 * SCRAPE_START equals the scraper id it is also run once straight away.
 */
async function run() {
  const scraper = new Latvia();
  const schedule = process.env.LV_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('LV Launched');
}

run();

28
mt.js Normal file
View File

@ -0,0 +1,28 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Malta = require('./ncas/mt');
/**
 * Launches the Malta scraper.
 *
 * When MT_CRON is set the scraper is scheduled with that cron expression; when
 * SCRAPE_START equals the scraper id it is also run once straight away.
 */
async function run() {
  const scraper = new Malta();
  const schedule = process.env.MT_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('scrapestart', process.env.SCRAPE_START);
  console.log('MT Launched');
}

// Report the first uncaught exception on stderr.
process.once('uncaughtException', (err) => {
  console.error('Uncaught', err);
});

run();

83
ncas/bg.js Normal file
View File

@ -0,0 +1,83 @@
const logger = require('log4js').getLogger('BG');
const path = require('path');
const Scraper = require('../helpers/scraper');
/**
 * Scraper for the Bulgarian National Bank (bnb.bg) register pages.
 *
 * Visits the payment oversight and credit institution register pages, takes a
 * screenshot of each, and downloads the register files linked from them into
 * artefacts/BG/BNB.
 */
class BGScrape extends Scraper {
  constructor() {
    super();

    this.id = 'BG';

    this.on('done', () => {
      this._done();
    });

    // run() goes through the base class _throttle helper (argument 5000,
    // presumably milliseconds - confirm in helpers/scraper).
    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    // In production the scraper starts itself when _checkLock resolves truthy.
    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l)
          this.run();
      });
  }

  /**
   * Clicks the first link on the current page whose href contains filename,
   * then waits a short random interval.
   *
   * @param {String} filename File name fragment to look for in link hrefs.
   * @returns {Promise<void>}
   * @throws {Error} When no matching link exists on the page.
   */
  async downloadByHrefFilename(filename) {
    // page.url is a method; interpolating it uncalled printed its source code.
    const pageUrl = this.page.url();

    logger.info(`Downloading ${filename} from ${pageUrl}`);

    const linkHandles = await this.page.$x(`//a[contains(@href, '${filename}')]`);

    // Fail with a message that names the missing file instead of a TypeError
    // from calling click() on undefined.
    if (linkHandles.length === 0)
      throw new Error(`No link to ${filename} found on ${pageUrl}`);

    await linkHandles[0].click();

    await this._randomWait(this.page, 3, 5);
  }

  /**
   * Runs the scrape and emits 'done' once all downloads were triggered.
   *
   * @returns {Promise<void>}
   */
  async start() {
    super._start();

    this.setPath(path.resolve(`${__dirname }/../artefacts/BG/BNB`));

    this.startPage = 'http://www.bnb.bg/PaymentSystem/PSPaymentOversight/PSPaymentOversightRegisters/index.htm';
    this.creditInstitutionsPage = 'http://www.bnb.bg/RegistersAndServices/RSCIRegisters/index.htm';

    // site only over http, so skip ssl during non-repudiation
    await this._doNonRepudiation(false, { 'skipSsl': true }).catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser();
    this.page = await this.browser.newPage();

    await this.page.setViewport({ 'width': 1200, 'height': 800 });

    // set cookie for English language and load start page
    await this.page.setCookie({ 'name': 'userLanguage', 'value': 'EN', 'domain': 'www.bnb.bg', 'path': '/' });

    await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
    await this._randomWait(this.page, 3, 5);

    // NOTE(review): the screenshot calls are not awaited here, unlike in the
    // Cyprus scraper - confirm whether that is intentional.
    this._makeScreenshotV2(this.page, `${this.path}/index1`);

    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    await this.downloadByHrefFilename('ps_po_register_2_en.xls');
    await this.downloadByHrefFilename('ps_po_register_3a_en.xls');

    await this._goto(this.creditInstitutionsPage, { 'waitUntil':'networkidle0' });
    await this._randomWait(this.page, 3, 5);
    this._makeScreenshotV2(this.page, `${this.path}/index2`);

    // TODO: come back and scrape the html page version of this word doc, if we have time
    await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc');
    await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls');

    // wait until all downloads finished with 'networkidle0' (currently this is only possible with 'page.goto', so we go back to the start page)
    await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });

    this.emit('done');
  }

  /**
   * Entry point invoked through the throttled run().
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
module.exports = BGScrape;

455
ncas/cy.js Normal file
View File

@ -0,0 +1,455 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const url = require('url');
const logger = require('log4js').getLogger('CY');
logger.level = process.env.LOGGER_LEVEL || 'warn';
// load env variables from file
class CYScrape extends Scraper {
constructor() {
super();
this.setID('CY');
this.on('done', () => {
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param selector
* @returns {Promise<void>}
*/
async grabLink(selector) {
const clickableLinks = await this.page.$$(selector);
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
if (clickableLinks.length > 0)
for (const item of clickableLinks) {
const href = await this.page.evaluate(el => el.href, item);
await this._randomWait(this.page, 3, 5);
await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
// log this error but Puppeteer isn't supposed to support this sort of download....
// mute the ERR_ABORTED error which happens everytime but alert for everything else.
if (!err.message.includes('net::ERR_ABORTED') )
logger.error('grabLink', err);
});
}
}
/**
*
* @param id
* @returns {Promise<void>}
*/
async downloadEmoney(id) {
const selector = ['#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a', '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'];
await this.grabLink(selector[id]);
}
/**
*
* @returns {Promise<void>}
*/
async downloadExcel() {
const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';
await this.grabLink(selector);
}
/**
*
* @returns {Promise<void>}
*/
async handlePaymentInstitutions() {
await this._randomWait(this.page, 3, 5);
const filename = 'licensing-and-supervision-of-payment-institutions';
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
await this._randomWait(this.page, 3, 5);
await this.downloadExcel();
await this._randomWait(this.page, 3, 5);
await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
}
/**
*
* @returns {Promise<void>}
*/
async handleElectronicMoneyInstitutions() {
await this._randomWait(this.page, 3, 5);
const filename = 'licensing-and-supervision-of-electronic-money-institutions';
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
await this._randomWait(this.page, 3, 5);
await this.downloadEmoney(0);
await this._randomWait(this.page, 3, 5);
await this.downloadEmoney(1);
await this._randomWait(this.page, 3, 5);
this.emit('startProcessingCreditServices');
}
/**
*
* @param body
* @returns {Promise<{}|Array>}
*/
async extractLocalCreditInstitutions(body) {
try{
const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
const sanity = /(\d+\.\s)(.+)/;
const $ = cheerio.load(body, {
'normalizeWhitespace': true
});
let nextItem;
$('p').each(function(i, elem) {
const lineText = $(this).text();
const isHeading = matchHeading.test(lineText);
if (isHeading)
nextItem = $(this).next();
});
if (typeof nextItem !== 'undefined' && nextItem !== null) {
const splitText = $(nextItem).text().split('\n');
const output = [];
splitText.forEach((item) => {
const newItem = this._cleanUp(item);
if ( newItem !== '')
output.push( sanity.exec(newItem)[2]);
});
return output;
}
return {};
}
catch( err) {
logger.error(err);
}
}
/**
 * Extracts the foreign credit institutions listed after the "FOREIGN
 * AUTHORISED CREDIT INSTITUTIONS ..." heading paragraph.
 *
 * The code walks the heading's following siblings in a fixed order:
 * first-level head, second-level head, list, second-level head, list, then the
 * same sequence once more. Every head is expected to match
 * "<label>. <title>"; only the title is used as a key.
 *
 * @param {String} body HTML fragment to search.
 * @returns {Promise<Object|undefined>} Object keyed by first-level title, each
 * holding an object keyed by second-level title with an array of list item
 * texts. undefined when an error was caught and logged (for example when a
 * head does not match the expected pattern).
 */
async extractForeignCreditInstitutions(body) {
try{
const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;
// Group 2 is the text after the "<label>. " prefix.
const sanity = /(\w+\.\s+)(.+)/;
const $ = cheerio.load(body, {
'normalizeWhitespace': true
});
const output = {};
let nextItem;
// Remember the element following the (last) paragraph that matches the heading.
$('p').each(function(i, elem) {
const lineText = $(this).text();
const isHeading = matchHeading.test(lineText);
if (isHeading)
nextItem = $(this).next();
});
// Rolling this out for ease as it could be changed by hand
let nextElm;
let firstHeadOrig, firstHead;
// Section 1: first-level head, then its first second-level head and list.
if (typeof nextItem !== 'undefined' && nextItem !== null) {
firstHeadOrig = this._cleanUp($(nextItem).text());
// exec() returns null when the text does not match; the [2] access then
// throws and the catch below logs it.
firstHead = sanity.exec(firstHeadOrig)[2];
output[firstHead] = {};
nextElm = $(nextItem).next();
const secondHeadOrig = this._cleanUp($(nextElm).text());
const secondHead = sanity.exec(secondHeadOrig)[2];
nextElm = $(nextElm).next();
const li = $(nextElm).find('li');
const arrayA = [];
$(li).each(function (i, elem) {
const lineText = $(this).text();
arrayA.push(lineText);
});
output[firstHead][secondHead] = arrayA;
nextElm = $(nextElm).next();
}
// Section 1, second second-level head and list (stored under the same firstHead).
if (typeof nextElm !== 'undefined' && nextElm !== null) {
const secondHeadOrig = this._cleanUp($(nextElm).text());
const secondHead = sanity.exec(secondHeadOrig)[2];
nextElm = $(nextElm).next();
const li = $(nextElm).find('li');
const arrayA = [];
$(li).each(function (i, elem) {
const lineText = $(this).text();
arrayA.push(lineText);
});
output[firstHead][secondHead] = arrayA;
nextElm = $(nextElm).next();
}
// Section 2: a new first-level head, then its first second-level head and list.
if (typeof nextElm !== 'undefined' && nextElm !== null) {
firstHeadOrig = this._cleanUp($(nextElm).text());
firstHead = sanity.exec(firstHeadOrig)[2];
output[firstHead] = {};
nextElm = $(nextElm).next();
const secondHeadOrig = this._cleanUp($(nextElm).text());
const secondHead = sanity.exec(secondHeadOrig)[2];
nextElm = $(nextElm).next();
const li = $(nextElm).find('li');
const arrayA = [];
$(li).each(function (i, elem) {
const lineText = $(this).text();
arrayA.push(lineText);
});
output[firstHead][secondHead] = arrayA;
nextElm = $(nextElm).next();
}
// Section 2, second second-level head and list.
if (typeof nextElm !== 'undefined' && nextElm !== null) {
const secondHeadOrig = this._cleanUp($(nextElm).text());
const secondHead = sanity.exec(secondHeadOrig)[2];
nextElm = $(nextElm).next();
const li = $(nextElm).find('li');
const arrayA = [];
$(li).each(function (i, elem) {
const lineText = $(this).text();
arrayA.push(lineText);
});
output[firstHead][secondHead] = arrayA;
}
return output;
}
catch(err) {
// Errors are logged and swallowed, so the method resolves with undefined.
logger.error(err);
}
}
/**
*
* @returns {Promise<{local: Promise<*|void>}>}
*/
async processCreditInstitute() {
logger.info('Credit institutes');
try{
await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);
const body = await this.page.content();
await this._dumpFile(`${this.path}/creditInstitutes.html`, body);
const $ = cheerio.load(body);
const content = $('.generic_page-intro');
const local = await this.extractLocalCreditInstitutions(content.html());
const creditInstitutes = await this.extractForeignCreditInstitutions(content.html());
await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });
this.emit('done');
return { local, creditInstitutes };
}
catch(err) {
logger.error(err);
}
}
/**
*
* @param filePath
* @returns {Promise<void>}
*/
async savePDF(filePath) {
logger.info('Saving the pdf:', filePath);
await this._randomWait(this.page, 5, 7);
await this.page.pdf({ 'path': filePath, 'format': 'A4' });
// this.emit('startProcessingCreditServices');
logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
const checkPDF = /(.pdf)/g;
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
let currentPath = pageUrl.pathname;
let pdfFile;
if (checkPDF.test(currentPath)) {
const splitPath = currentPath.split('/');
pdfFile = splitPath.pop();
currentPath = splitPath.join('/');
}
switch (currentPath) {
case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
await this.handlePaymentInstitutions();
break;
case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
await this.handleElectronicMoneyInstitutions();
break;
case '/images/media/redirectfile/Electronic%20Money%20Institutions':
logger.warn('We should only arrive here when in Non-headless mode');
await this.savePDF(pdfFile);
break;
case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':
await this.processCreditInstitute();
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl.href}`);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
logger.info('Attaching events');
this.on('startProcessingCreditServices', async function() {
await this._goto(this.credit);
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
try {
super._start();
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';
this.path = path.resolve(`${__dirname }/../artefacts/CY/CBOC`);
await this._createDirectory(this.path);
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage);
await this._randomWait(this.page, 3, 5);
}
catch (e) {
throw new Error(e);
}
}
/**
 * Log the start of the Cyprus scrape and run `start()`.
 *
 * @returns {Promise<void>} settles when `start()` settles
 */
async __run() {
logger.info('Scraping Cyprus...');
await this.start();
}
}
module.exports = CYScrape;

1062
ncas/cz.js Normal file

File diff suppressed because it is too large Load Diff

597
ncas/de.js Normal file
View File

@ -0,0 +1,597 @@
// version: 0.0.1-20
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('DE');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class DEScrape extends Scraper {
constructor() {
super();
this.setID('DE');
this.on('done', () => {
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @returns {Promise<void>}
*/
async buildSubIndex() {
logger.info('Building sub-index...');
const currentPage = await this.page.evaluate(() => document);
const search = currentPage.location.search;
const params = this._getParamsFromUrl(search);
const currentPageID = params.nameZahlungsinstitut || '';
await this._makeScreenshotV2(this.page, `${this.path}/menu_${currentPageID}`, null);
await this._randomWait(this.page, 3, 5);
const links = await this.page.$$('#zahlinst > tbody > tr a');
for (const item of links) {
const id = await this.page.evaluate(el => el.innerText, item);
let href = await this.page.evaluate(el => el.href, item);
const params = this._getParamsFromUrl(href);
href = href.concat('&locale=en_GB');
if (id !== 'Found payment institutions:')
this.paymentServices.links.push({ id, href, params });
}
this.index.step++;
if (this.index.step < this.index.items)
this.emit('nextsubindex');
else {
this.subIndex.done = true;
this.paymentServices.items = this.paymentServices.links.length;
this.emit('subindexdone');
}
}
/**
*
* @returns {Promise<void>}
*/
async buildIndex() {
logger.info('Building the index...');
await this._randomWait(this.page, 3, 5);
const links = await this.page.$$('#suchform > div > div:nth-child(2) > div.navigationGruppeBuchstaben a');
for (const item of links) {
const id = await this.page.evaluate(el => el.innerText, item);
let href = await this.page.evaluate(el => el.href, item);
href = href.concat('&locale=en_GB');
this.index.links.push({ id, href });
}
this.index.done = true;
this.index.items = this.index.links.length;
this.emit('indexdone');
}
async initiateCreditIndex() {
// first time around.
// need to kick off the index correctly..
const options = await this.page.$$('#institutKategorie option');
const wantedOption = ['Credit institutions (BA)', 'Kreditinstitute (BA)'];
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#institutKategorie', value);
this.creditServices.started = true;
break;
}
}
if (this.creditServices.started)
this._findAndClick('#sucheButtonInstitut');
else
throw new Error('Unable to initiate CI Search');
}
async processCreditInstIndexPage() {
const noWhiteSpace = /\W/g;
logger.info('Building CI sub-index...');
const wantedRowType = ['CRR-Kreditinstitut'];
const currentPage = await this.page.evaluate(() => document);
const body = await this.page.content();
const $ = cheerio.load(body);
const search = currentPage.location.search;
const params = this._getParamsFromUrl(search);
const currentPageID = params['d-4012550-p'] || '';
await this._makeScreenshotV2(this.page, `${this.path}/credit_instititute_menu_${currentPageID}`, null);
await this._randomWait(this.page, 7, 10);
const rows = $('#institut tr');
rows.each((i, elm) => {
const rowClass = cheerio(elm).attr('class');
if (typeof(rowClass) !== 'undefined') {
const children = cheerio(elm).children();
const rowType = children.eq(1).text();
if (wantedRowType.indexOf(rowType) !== -1) {
const name = this._cleanUp(children.eq(0).text());
const id = this._makeFieldName(name);
let href = cheerio(children.eq(0)).find('a').attr('href');
const params = this._getParamsFromUrl(href);
href = href.concat('&locale=en_GB');
// this is the one we want.
this.creditServices.links.push({ name, id, href, params });
}
}
});
const clicked = await this._findAndClick('.pagelinks a', 'Next');
if (!clicked) {
// come to the end of the index..
this.creditServices.done = true;
this.creditServices.items = this.creditServices.links.length;
this.emit('ciindexdone');
}
}
async processCreditInstPage() {
const noWhiteSpace = /\W/g;
const id = this.creditServices.links[this.creditServices.step].id;
const name = this.creditServices.links[this.creditServices.step].name;
logger.info(`Process Credit Service entity ${this.creditServices.step} of ${this.creditServices.items} // ${name}`);
await this._randomWait(this.page, 3, 5);
const body = await this.page.content();
const details = await this.extractPaymentEntity(body);
const entity = removeAccents.remove(details.description[0].trim());
const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);
logger.debug('filename', filename);
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
jsonfile.writeFileSync(`${filePath}.json`, details);
this.creditServices.links[this.creditServices.step].filename = `${filename}.json`;
this.creditServices.links[this.creditServices.step].filePath = `${filePath}`;
this.creditServices.step++;
if (this.creditServices.step < this.creditServices.items) {
const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
await this._goto(newUrl);
}
else
this.emit('creditinstdone');
}
/**
*
* @returns {Promise<void>}
*/
async processCreditInstIndex() {
logger.info('Building CI Index..');
if (!this.creditServices.started)
await this.initiateCreditIndex();
else
await this.processCreditInstIndexPage();
}
/**
*
* @param html
* @returns {Promise<{description: T[] | jQuery, permissions: {original: Array, translated: Array}}>}
*/
async extractPaymentEntity(html) {
const permissions = { 'original':[], 'translated':[] };
const newLine = /\n/g;
const $ = cheerio.load(html);
let description = $('#content > p').text().split(newLine).filter(line => line.length > 0);
description = description.map((i) => {
return this._cleanUp(i.replace(/\t/g, '')).trim();
});
description = description.filter(item => item.length > 0);
const rows = $('#erlaubnis > tbody tr');
rows.each((index, item) => {
const cells = $(item).find('td');
const service = $(cells.get(0)).text();
const startAuth = $(cells.get(1)).text();
const endAuth = $(cells.get(2)).text();
const reason = (cells.length === 4) ? $(cells.get(3)).text() : '';
const phrasing = service.split(' (§');
const translated = this._translate(phrasing[0]);
phrasing[0] = (translated !== '') ? translated : phrasing[0];
const newObjTrans = {
'service': phrasing.join(' (§'),
startAuth,
endAuth
};
const newObj = {
service,
startAuth,
endAuth
};
if (cells.length === 4) {
newObj.reason = reason;
newObjTrans.reason = reason;
}
permissions.translated.push(newObjTrans);
permissions.original.push(newObj);
});
return { description, permissions };
}
/**
*
* @returns {Promise<void>}
*/
async processEntity() {
const noWhiteSpace = /\W/g;
if (!this.subIndex.done) {
// We should not be here quite yet, so add this to subindex;
const currentPage = await this.page.evaluate(() => document);
const location = currentPage.location;
const id = location.search;
let href = location.href;
href = href.concat('&locale=en_GB');
this.paymentServices.links.push({ id, href });
this.index.step++;
if (this.index.step < this.index.items)
this.emit('nextsubindex');
else {
logger.info('Sub indexing done...');
this.subIndex.done = true;
this.paymentServices.items = this.paymentServices.links.length;
this.emit('subindexdone');
}
}
else {
const id = this.paymentServices.links[this.paymentServices.step].id;
// logger.info('Process entity:', id);
logger.info(`Process entity ${this.paymentServices.step} of ${this.paymentServices.items} // ${id}`);
await this._randomWait(this.page, 3, 5);
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
const details = await this.extractPaymentEntity(body);
const entity = removeAccents.remove(details.description[0].trim());
// const filename = id.indexOf('?id=') === 0 ? `ps_${entity.replace(noWhiteSpace, '_')}` : `ps_${id.replace(noWhiteSpace, '_')}`;
const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);
logger.debug('filename', filename);
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
jsonfile.writeFileSync(`${this.path}/${filename}.json`, details);
this.paymentServices.links[this.paymentServices.step].filename = `${filename}.json`;
this.paymentServices.step++;
if (this.paymentServices.step < this.paymentServices.items)
await this._goto(this.paymentServices.links[this.paymentServices.step].href);
else
this.emit('processdone');
}
}
/**
*
* @param selector
* @returns {Promise<void>}
*/
async grabLink(selector) {
try{
const clickableLinks = await this.page.$$(selector);
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
if (clickableLinks.length > 0)
for (const item of clickableLinks) {
const href = await this.page.evaluate(el => el.href, item);
await this._randomWait(this.page, 3, 5);
await this._goto(href, { 'waitUntil': 'networkidle0' }, true).catch((err) => {
// log this error but Puppeteer isn't supposed to support this sort of download....
logger.warn(err);
// throw(Error(err));
});
}
}
catch (e) {
// this._uploadError();
}
}
/**
*
* @returns {Promise<void>}
*/
async processEMoney() {
logger.info('Process EMoney:');
await this._randomWait(this.page, 3, 5);
const filename = 'e-money_Institutions';
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
await this._findAndClick('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > h3:nth-child(5)');
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_expanded`, null);
await this.grabLink('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > ul:nth-child(6) > li > a');
await this._randomWait(this.page, 3, 5);
this.mode++;
this.emit('startcredit');
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
await this._randomWait(this.page, 3, 5);
switch (pageUrl.pathname) {
case '/database/ZahlInstInfo/':
await this.buildIndex();
break;
case '/database/ZahlInstInfo/suche.do':
await this.buildSubIndex();
break;
case '/database/ZahlInstInfo/zahlinst.do':
await this.processEntity();
break;
case '/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html':
await this.processEMoney();
break;
case '/database/InstInfo/sucheForm.do':
await this.processCreditInstIndex();
// build index of credit institutes.
break;
case '/database/InstInfo/institutDetails.do':
await this.processCreditInstPage();
// build index of credit institutes.
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('startcredit', async function() {
logger.info('Starting Credit Institutes');
await this._goto(this.credit);
});
this.on('processdone', async function() {
logger.warn('Payment Entities done', this.paymentServices.items);
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
await this._randomWait(this.page, 5, 10);
await this._goto(this.emoneyUrl);
});
this.on('subindexdone', async function() {
logger.info('Sub Index done', this.paymentServices.items);
logger.info(this.paymentServices.links[this.paymentServices.step].href);
await this._goto(this.paymentServices.links[this.paymentServices.step].href);
});
this.on('indexdone', async function() {
logger.info('Index done', this.index.items);
logger.info(this.index.links[this.index.step].href);
await this._goto(this.index.links[this.index.step].href);
});
this.on('ciindexdone', async function() {
logger.info('CI Index done', this.creditServices.items);
logger.info(this.creditServices.links[this.creditServices.step].href);
const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
await this._goto(newUrl);
});
this.on('creditinstdone', async function() {
logger.debug('Credit Institutes done', this.paymentServices.items);
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
await this._randomWait(this.page, 5, 10);
this.emit('done');
});
this.on('nextsubindex', async function() {
logger.debug(this.index.links[this.index.step].href);
await this._goto(this.index.links[this.index.step].href);
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
this.mode = 0;
try {
await this._loadDictionary();
this.index = {
'items': 0,
'links': [],
'step': 0,
'started': false,
'done' : false
};
this.subIndex = {
'items': 0,
'links': [],
'step': 0,
'started': false,
'done' : false
};
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false
};
this.startPage = 'https://portal.mvp.bafin.de/database/ZahlInstInfo/?locale=en_GB';
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
this.setPath(path.resolve(`${__dirname }/../artefacts/DE/BAFIN`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }).catch((err) => {
logger.error(err);
});
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5, 'Startup');
}
catch(e) {
throw new Error(e);
}
}
/**
 * Run `start()`; this is the function the constructor wraps as `run`.
 *
 * @returns {Promise<void>} settles when `start()` settles
 */
async __run() {
await this.start();
}
}
module.exports = DEScrape;

413
ncas/dk.js Normal file
View File

@ -0,0 +1,413 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('DK');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class DKScrape extends Scraper {
constructor(checkForLock = true) {
super();
this.id = 'DK';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (checkForLock)
this._checkLock().then((l) => {
if(l)
this.run();
});
this.on('error', (err) => {
logger.error('Error catcher!!', err);
});
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const currentPage = await this.page.evaluate(() => document);
const search = currentPage.location.search;
switch (currentPage.location.pathname) {
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx':
await this.handleStartPage();
break;
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx':
await this.handleSearchResults(search);
break;
case '/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
case '/da/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
// these are all the same page, just in Danish, Danish and English
this.processCoporation();
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${currentPage.location.href}`);
}
}
/**
*
* @returns {Promise<void>}
*/
async handleStartPage() {
if (this.mode === 0)
await this._findAndClick('ul li a', 'Payment institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Payment+institutions');
if (this.mode === 1)
await this._findAndClick('ul li a', 'Electronic money institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Electronic+money+institutions');
if (this.mode === 2) {
logger.info('Processing complete');
this.done();
}
}
/**
*
* @returns {Promise<Error>}
*/
async processCoporation() {
await this._randomWait(this.page, 3, 5);
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
const $ = cheerio.load(body);
const h2 = $('h2').eq(0).text();
// Virksomhedsoplysninger
// Company information
if (h2 === 'Virksomhedsoplysninger') {
logger.warn('Not in English, trying to switch language...');
await this._findAndClick('#mainform > div.header > ul > li.ln > a');
}
else if (h2 === 'Company information') {
const noWhiteSpace = /\W/g;
let ssName;
if (this.mode === 0)
ssName = this.paymentServices.links[this.paymentServices.step].innerText.replace(noWhiteSpace, '_');
else
ssName = this.emoneyServices.links[this.emoneyServices.step].innerText.replace(noWhiteSpace, '_');
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
this._makeScreenshotV2(this.page, `${this.path}/${ssName}`, null);
logger.debug('Processing:', this.paymentServices.links[this.paymentServices.step]);
const fields = await this.extractData(body);
jsonfile.writeFileSync(`${this.path}/${ssName}.json`, fields);
await this.downloadExcel();
if (this.mode === 0)
this.paymentServices.step += 1;
else
this.emoneyServices.step += 1;
await this._randomWait(this.page, 10, 15);
// This should take us back to the search result list
await this._findAndClick('#divContentWidthScr li a', 'To search results');
}
else
return new Error('I do not understand this page...');
}
/**
 * Turn the rows of one data table into `[label, value]` pairs.
 *
 * @param $block table body element(s) to read; loaded into cheerio as-is
 * @returns {Promise<Array>} one `[label, value]` pair per two entries of the
 *   mapped row data, with every non-word character of the label replaced by '_'
 */
async processDataBlock($block) {
const $ = cheerio.load($block);
const noWhiteSpace = /\W/g;
// Per row: the label is whatever follows the first '</div>' in the first
// cell's HTML (first newline removed, trimmed); the value is the text of the
// cell(s) returned by `.next()`. A first cell without '</div>' makes
// `split(...)[1]` undefined and this throws.
const a = $('tr').map((i, el) => {
const head = $(el).find('td').first();
const data = $(el).find('td').next();
return [head.eq(-1).html().split('</div>')[1].replace(/\n/, '').trim(), data.text()];
});
const fields = [];
// NOTE(review): stepping by 2 presumes map() flattened the returned arrays
// into alternating label/value entries — confirm against the cheerio version in use.
for( let step = 0;step < a.length;step = step + 2)
fields.push([a[step].replace(noWhiteSpace, '_'), a[step + 1]]);
return fields;
}
/**
*
* @param body
* @returns {Promise<{companyInformation: *[], presence: Array}>}
*/
async extractData(body) {
const $ = cheerio.load(body);
const vutDataContainer = $('.vut-data-container');
const $basicInfo = vutDataContainer.find('#phmain_0_vut_pnl_basic_info table tbody').get();
const $extendednInfo = vutDataContainer.find('#phmain_0_vut_pnl_extended_info table tbody').get();
const $presenceInfo = vutDataContainer.find('#phmain_0_vut_pnl_tilstedevaerelser table tbody').get();
let companyInformation = await this.processDataBlock($basicInfo);
companyInformation = companyInformation.concat(await this.processDataBlock($extendednInfo));
const presence = await this.processDataBlock($presenceInfo);
return { companyInformation, presence };
}
/**
*
* @returns {Promise<void>}
*/
async downloadExcel() {
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
logger.info('Saving excel into:', this.path);
await this._findAndClick('#phmain_0_vut_link_button_excel');
}
/**
*
* @param search
* @returns {Promise<void>}
*/
async handleSearchResults(search) {
switch (search) {
case '?aid=Payment+services+area&ctid=Payment+institutions':
if (!this.paymentServices.done)
await this.handlePaymentServices();
else
// Are we not done yet?
// Restarting the page
await this.page.goto(this.startPage);
break;
case '?aid=Payment+services+area&ctid=Electronic+money+institutions':
if (!this.emoneyServices.done)
await this.handleEmoneyServices();
else
// Are we not done yet?
// Restarting the page
await this.page.goto(this.startPage);
break;
case '?restoreSearch=1':
if (this.mode === 0)
if (this.paymentServices.items > 0 && !this.paymentServices.done)
await this.handlePaymentServices();
else {
// Are we not done yet?
// Restarting the page
await this.page.goto(this.startPage);
}
if (this.mode === 1)
if (this.emoneyServices.items > 0 && !this.emoneyServices.done)
await this.handleEmoneyServices();
break;
default:
// Menu fell through
break;
}
}
/**
*
* @returns {Promise<Array>}
*/
async extractLinks() {
const returnObj = [];
await this._randomWait(this.page, 3, 5);
const rows = await this.page.$$('.search-further-data tr a');
for (const item of rows) {
const innerText = await this.page.evaluate(el => el.innerText, item);
const href = await this.page.evaluate(el => el.href, item);
const id = await this.page.evaluate(el => el.id, item);
returnObj.push( {
innerText,
href,
id
});
}
return returnObj;
}
/**
*
* @returns {Promise<void>}
*/
async handleEmoneyServices() {
await this._randomWait(this.page, 3, 5);
await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');
if (!this.emoneyServices.visited)
if (this.emoneyServices.items === 0) {
// first visit, Build the list
this.emoneyServices.links = await this.extractLinks();
this.emoneyServices.items = this.emoneyServices.links.length;
this.emoneyServices.visited = true;
}
if (this.emoneyServices.visited)
if (this.emoneyServices.step < this.emoneyServices.items) {
const nextItem = this.emoneyServices.links[this.emoneyServices.step];
// Not using an await here. We want to click and exit this page so we don't get tied up
this._findAndClick(`#${nextItem.id}`, nextItem.innerText);
}
else {
// EMoney services complete, move onto the next service.
this.emoneyServices.done = true;
this.mode = 2;
await this.page.goto(this.startPage);
}
}
/**
*
* @returns {Promise<void>}
*/
async handlePaymentServices() {
await this._randomWait(this.page, 3, 5);
await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');
if (!this.paymentServices.visited)
if (this.paymentServices.items === 0) {
// first visit, Build the list
this.paymentServices.links = await this.extractLinks();
this.paymentServices.items = this.paymentServices.links.length;
this.paymentServices.visited = true;
}
if (this.paymentServices.visited)
if (this.paymentServices.step < this.paymentServices.items) {
const nextItem = this.paymentServices.links[this.paymentServices.step];
// Not using an await here. We want to click and exit this page so we don't get tied up
this._findAndClick(`#${nextItem.id}`, nextItem.innerText);
}
else {
// Payment services complete, move onto the next service.
this.paymentServices.done = true;
this.mode = 1;
await this.page.goto(this.startPage);
}
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
// Financial Supervisory Authority
// Government ministry
// https://en.wikipedia.org/wiki/Financial_Supervisory_Authority_(Denmark)
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': {
},
'step': 0,
'visited': false,
'done' : false
};
this.emoneyServices = {
'items': 0,
'links': {
},
'step': 0,
'visited': false,
'done' : false
};
this.startPage = 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx';
this.setPath(path.resolve(`${__dirname }/../artefacts/DK/FSA`));
await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
logger.error(err);
});
await this._initBrowser();
this.page = await this.browser.newPage();
this.page.on('domcontentloaded', () => {
this.processNewPage().catch((err) => {
logger.error('####', err);
this.emit('done');
});
});
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this.page.goto(this.startPage).catch((err) => {
logger.error(err);
this._uploadError();
});
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw Error(e);
}
}
/**
 * Run `start()`; this is the function the constructor wraps as `run`.
 *
 * @returns {Promise<void>} settles when `start()` settles
 */
async __run() {
await this.start();
}
}
module.exports = DKScrape;

498
ncas/dkV2.js Normal file
View File

@ -0,0 +1,498 @@
const Scraper = require('../helpers/scraper');
const path = require('path');
const logger = require('log4js').getLogger('DK');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class DKScrape extends Scraper {
constructor(checkForLock = true) {
super();
this.id = 'DK';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (checkForLock)
this._checkLock().then((l) => {
if(l)
this.run();
});
this.on('error', (err) => {
logger.error('Error catcher!!', err);
});
}
/**
*
* @returns {Promise<void>}
*/
async movePageToTop() {
await this.page.evaluate(() => {
window.scrollTo(0, 0);
});
}
/**
*
* @returns {Promise<void>}
*/
async movePageToBottom() {
await this.page.evaluate(() => {
window.scrollBy(0, window.innerHeight);
});
}
/**
*
* @returns {Promise<void>}
*/
async renameFile() {
const filename = this.filenames[this.step];
const sourceFile = 'Finanstilsynets virksomhedsregister - SQL.xlsx';
const origFile = `${this.path}/${sourceFile}`;
const newFile = `${this.path}/${filename}.xlsx`;
await this._renameFile(origFile, newFile);
}
/**
*
* @returns {Promise<void>}
*/
async clickReturn() {
logger.debug('clickReturn');
await this.iframe.waitForSelector('#lsAnalysisPage > div > div:nth-child(2)', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
console.log('found');
await elm.click({ 'delay':90 });
}).catch((e) => {
logger.error('iframe missing stuff', e);
// pageLoaded = false;
});
await this._randomWait(this.page, 2, 3, 'after clickReturn click');
this.step++;
}
/**
*
* @returns {Promise<void>}
*/
async scrollContainer() {
await this.page.evaluate(() => {
console.log('window.innerWidth', window.innerWidth);
window.scrollBy(window.innerWidth, window.innerHeight);
});
await this._randomWait(this.page, 2, 2, 'scroll x?');
this.page.mouse.move(1061, 437);
await this._randomWait(this.page, 2, 3, 'bottom right scroll arrow');
for(let count = 0; count < 15; count++) {
this.page.mouse.click(1061, 437, { 'delay':500 });
await this._randomWait(this.page, 1, 2, 'scrolling');
}
await this._randomWait(this.page, 4, 5, 'after scroll');
}
/**
*
* @returns {Promise<void>}
*/
async clickExport() {
logger.debug('clickExport');
await this.movePageToTop();
await this._randomWait(this.page, 2, 2, 'Move to top');
const filename = this.filenames[this.step];
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
await this._randomWait(this.page, 4, 4, 'Screenshot');
this.page.mouse.move(175, 440);
await this._randomWait(this.page, 2, 3, 'Move 175, 440');
this.page.mouse.click(175, 440, { 'button':'right', 'delay':90 });
await this._randomWait(this.page, 2, 3, 'Click 175, 440');
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
await this.iframe.waitForSelector('div.lsDialogContent > div:nth-child(2)', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
console.log('found');
await elm.click({ 'delay':90 });
}).catch((e) => {
logger.error('iframe missing stuff', e);
// pageLoaded = false;
});
await this._randomWait(this.page, 2, 3, 'after clickExport click');
}
/**
*
* @returns {Promise<void>}
*/
async clickSearch() {
logger.debug('clickSearch');
await this.movePageToBottom();
await this._randomWait(this.page, 2, 3, 'Move to bottom');
await this.iframe.waitForSelector('#lsAnalysisPage > div > div:nth-child(11)', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
console.log('found');
await elm.click({ 'delay':90 });
}).catch((e) => {
logger.error('iframe missing stuff', e);
// pageLoaded = false;
});
await this._randomWait(this.page, 2, 3, 'after clickSearch click');
}
/**
*
* @returns {Promise<void>}
*/
async selectPs01() {
logger.debug('selectPs01');
/*
List 1 - betalingstjeneste området = Payment Service Area
List 2 - Betalingsinstitutter = Payment Institutions
*/
this.page.mouse.move(200, 418);
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
this.page.mouse.click(200, 418);
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
this.page.mouse.move(400, 434);
await this._randomWait(this.page, 2, 3, 'Move 400, 434');
this.page.mouse.click(400, 434);
await this._randomWait(this.page, 2, 3, 'Click 400, 434');
}
/**
*
* @returns {Promise<void>}
*/
async selectPs02() {
logger.debug('selectPs02');
/*
List 1 - betalingstjeneste området = Payment Service Area
List 2 - List 2 - Udbyder af betalingstjenester med begraenset tilladelse = Provider of limited payment services
*/
this.page.mouse.move(200, 418);
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
this.page.mouse.click(200, 418);
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
this.page.mouse.move(400, 585);
await this._randomWait(this.page, 2, 3, 'Move 400, 585');
this.page.mouse.click(400, 585);
await this._randomWait(this.page, 2, 3, 'Click 400, 585');
}
/**
*
* @returns {Promise<void>}
*/
async selectEm01() {
logger.debug('selectEm01');
/*
List 1 - betalingstjeneste området = Payment Service Area
List 2 - E-penge-institutter = E money Institutions
*/
this.page.mouse.move(200, 418);
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
this.page.mouse.click(200, 418);
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
this.page.mouse.move(400, 473);
await this._randomWait(this.page, 2, 3, 'Move 400, 473');
this.page.mouse.click(400, 473);
await this._randomWait(this.page, 2, 3, 'Click 400, 473');
}
/**
*
* @returns {Promise<void>}
*/
async selectEm02() {
logger.debug('selectEm01');
/*
List 1 - betalingstjeneste området = Payment Service Area
List 2 - Udbyder af elektroniske penge med begraenset tilladelse = Provider of electronic money with limited permission
*/
this.page.mouse.move(200, 418);
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
this.page.mouse.click(200, 418);
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
this.page.mouse.move(400, 631);
await this._randomWait(this.page, 2, 3, 'Move 400, 631');
this.page.mouse.click(400, 631);
await this._randomWait(this.page, 2, 3, 'Click 400, 631');
}
/**
*
* @returns {Promise<void>}
*/
async selectCs01() {
logger.debug('selectCs01');
/*
List 1 - Kreditinsti Området = Credit Institution
List 2 - pengeinstitutter = Banks
*/
this.page.mouse.move(200, 508);
await this._randomWait(this.page, 2, 3, 'Move 200, 508');
this.page.mouse.click(200, 508);
await this._randomWait(this.page, 2, 3, 'Click 200, 508');
this.page.mouse.move(400, 473);
await this._randomWait(this.page, 2, 3, 'Move 400, 473');
this.page.mouse.click(400, 473);
await this._randomWait(this.page, 2, 3, 'Click 400, 473');
}
/**
*
* @returns {Promise<void>}
*/
async motions() {
do
switch(this.step) {
case 0:
await this.selectPs01();
await this.clickSearch();
await this.scrollContainer();
await this.clickExport();
await this.renameFile();
await this.clickReturn();
break;
case 1:
await this.selectPs02();
await this.clickSearch();
await this.clickExport();
await this.renameFile();
await this.clickReturn();
break;
case 2:
await this.selectEm01();
await this.clickSearch();
await this.scrollContainer();
await this.clickExport();
await this.renameFile();
await this.clickReturn();
break;
case 3:
await this.selectEm02();
await this.clickSearch();
await this.scrollContainer();
await this.clickExport();
await this.renameFile();
await this.clickReturn();
break;
case 4:
await this.selectCs01();
await this.clickSearch();
await this.scrollContainer();
await this.clickExport();
await this.renameFile();
await this.clickReturn();
break;
default:
// Menu fell through
this.complete = true;
this.emit('done');
break;
}
while(!this.complete );
}
/**
*
* @returns {Promise<void>}
*/
async waitForIframe() {
await this.page.waitForSelector('body > div.site > div > div.content > div.content__sections > section > center > iframe', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
logger.debug('iframe');
this.iframe = await this.page.mainFrame().childFrames()[0].childFrames()[0];
await this._randomWait(this.page, 15, 20);
await this.motions();
}).catch((e) => {
logger.error('processEntityDetails', e);
// pageLoaded = false;
});
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
logger.debug('** processNewPage');
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
logger.debug(pageUrl);
await this.waitForIframe();
}
/**
* Bootstraps the Danish FSA scrape: resets the run state, prepares the
* artefact path, opens a browser page, wires the page-load handler, starts
* tracing and navigates to the start page.
*
* Any error thrown inside the sequence is re-thrown wrapped in a new Error.
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
// Financial Supervisory Authority
// Government ministry
// https://en.wikipedia.org/wiki/Financial_Supervisory_Authority_(Denmark)
// Reset the run state consumed by motions().
this.mode = 0;
this.step = 0;
this.complete = false;
// NOTE(review): six names are listed here while motions() only handles steps 0-4 - confirm 'creditServices02' is intentional.
this.filenames = ['paymentServices01', 'paymentServices02', 'eMoney01', 'eMoney02', 'creditServices01', 'creditServices02'];
this.startPage = 'https://virksomhedsregister.finanstilsynet.dk/listeudtr%C3%A6k-en.html';
this.setPath(path.resolve(`${__dirname }/../artefacts/DK/FSA`));
// A failure here is only logged; the scrape continues regardless.
await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
logger.error(err);
});
await this._initBrowser();
await this._createBrowserPage();
await this._makeResponsive();
// Every DOMContentLoaded triggers processNewPage through a 5000 throttle wrapper; its failures are logged, not propagated.
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 5000));
// NOTE(review): tracing is started here but no matching tracing.stop() is visible in this part of the file - verify it is stopped elsewhere.
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
await this.page.setViewport({ 'width': 1200, 'height': 800 });
// Navigation errors are logged and reported via _uploadError(); execution continues afterwards.
await this.page.goto(this.startPage).catch((err) => {
logger.error(err);
this._uploadError();
});
await this._randomWait(this.page, 3, 5);
}
catch(e) {
// NOTE(review): Error(e) stringifies the caught error into the message of a new Error, so the original stack trace is lost.
throw Error(e);
}
}
/**
* Entry point behind the scraper's run wrapper: simply delegates to start().
*
* @returns {Promise<void>}
* @private
*/
async __run() {
await this.start();
}
}
module.exports = DKScrape;

781
ncas/ee.js Normal file
View File

@ -0,0 +1,781 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('EE');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class EEScrape extends Scraper {
constructor() {
super();
this.id = 'EE';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
this.recover = this._debounce(async () => {
await this.__recover();
}, 120000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractIndexItems(html, serviceObject) {
const newArray = [] ;
const $ = cheerio.load(html);
const links = $('a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = this._cleanUp($(item).text());
const newUrl = `${this.rootURI}${href}`;
newArray.push({ 'name':text, 'href':newUrl });
});
return newArray;
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
try {
const newObj = {};
const $ = cheerio.load(html);
const title = $('h1.page-title').text();
newObj.title = this._cleanUp(title);
const tables = $('article div.table-wrap table');
const rows = $(tables).eq(0).find('tbody > tr');
rows.each((i, item) => {
const children = $(item).children();
const curLabel = this._makeFieldName($(children).eq(0).text());
newObj[curLabel] = (this._cleanUp($(children).eq(1).text()));
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityServices(html) {
try {
const newObj = {};
const $ = cheerio.load(html);
const tables = $('article div.table-wrap table');
if (tables.length > 1)
tables.each((i, table) => {
if (i > 0) {
const label = this._makeFieldName($(table).find('caption').text());
const services = $(table).find('div.field__item').map((i, el) => {
return this._cleanUp($(el).text());
}).get();
if (!newObj.hasOwnProperty(label))
newObj[label] = services.slice();
else
newObj[label] = newObj[label].concat(services);
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @param blockType
* @returns {{licenseDescription: string, blockType: string}}
*/
extractEntityLicense(html ) {
try {
const blockType = 'Licenses';
const newObj = { 'licenseDescription':'', 'blockType': blockType, 'licenses' : [] };
const $ = cheerio.load(html);
const header = $(`h3:contains("${blockType}")`);
if ($(header).length === 0) return {};
const fieldContent = $(header).next();
const children = $(fieldContent).children();
children.each((i, item) => {
const newLicense = {};
newLicense.permitNumber = this._cleanUp($(item).find('div.field--name-field-permit-number div.field__item').text()) ;
newLicense.permitEntryDate = this._cleanUp($(item).find('div.field--name-field-permit-entry-date div.field__item').text()) ;
const block = $(item).find('div.field--name-field-permit-restrictions');
newLicense.restrictions = $(block).find('p').map((i, el) => {
return this._cleanUp($(el).text());
}).get();
newObj.licenses.push(newLicense);
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @param blockType
* @returns {{licenseDescription: string, blockType: string}}
*/
extractEntityCrossBorder(html ) {
try {
const blockType = 'List of cross-border services provided';
const newObj = { 'crossBorder' : [] };
const $ = cheerio.load(html);
const header = $(`h3:contains("${blockType}")`);
if ($(header).length === 0) return {};
const fieldContent = $(header).next();
const children = $(fieldContent).children();
children.each((i, item) => {
const cb = {};
cb.permitNumber = this._cleanUp($(item).find('div.field--name-field-overborder-permit-number div.field__item').text()) ;
cb.permitEntryDate = this._cleanUp($(item).find('div.field--name-field-overborder-permit-date div.field__item').text()) ;
cb.startDate = this._cleanUp($(item).find('div.field--name-field-overborder-permit-start div.field__item').text()) ;
// field--name-field-overborder-permit-start
const block = $(item).find('div.field--name-field-services-list');
cb.cbServices = $(block).find('div.paragraph--type--subject-services-list').map((i, el) => {
const service = this._cleanUp($(el).children().eq(0).text());
const country = this._cleanUp($(el).children().eq(1).text());
return { service, country };
}).get();
newObj.crossBorder.push(cb);
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @param blockType
* @returns {{licenseDescription: string, blockType: string}}
*/
extractEntityBranches(html ) {
try {
const subDetails = [['country', 'field--name-field-country'], ['businessName', 'field--name-field-business-name'], ['address', 'field--name-field-address'], ['phone', 'field--name-field-phone']];
const blockType = 'Branches';
const newObj = { 'branches' : [] };
const $ = cheerio.load(html);
const header = $(`h3:contains("${blockType}")`);
if ($(header).length === 0) return {};
const fieldContent = $(header).next();
const children = $(fieldContent).children();
children.each((i, item) => {
const workObj = { 'details' : {}, 'branchServices':[], 'licenses':{} };
workObj.name = this._cleanUp($(item).find('header.paragraph-heading h4').text());
for (const sdItems of subDetails)
workObj.details[sdItems[0]] = this._cleanUp($(item).find(`div.${sdItems[1]} div.field__item`).text()) ;
const branchPermissions = $(item).find('div.field--name-field-branch-permissions');
const branchServices = $(item).find('div.field--name-field-branch-services');
workObj.branchServices = $(branchServices).find('div.paragraph--type--subject-services-list-simple div.field__item').map((i, el) => {
return this._cleanUp($(el).text());
}).get();
workObj.licenses = $(branchPermissions).find('div.paragraph--type--subject-branch-permits').map((i, el) => {
const permitNumber = this._cleanUp($(el).children().eq(0).find('div.field__item').text());
const start = this._cleanUp($(el).children().eq(1).find('div.field__item').text());
return { permitNumber, start };
}).get();
newObj.branches.push(workObj);
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const id = serviceObject.links[serviceObject.step].name;
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
let pageLoaded = true;
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = this._makeFileName(entity);
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('h1.page-title').catch((e) => {
logger.error('processEntityDetails', e);
pageLoaded = false;
});
if (pageLoaded) {
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
// --
const details = await this.extractEntityDetails(body);
const licenses = await this.extractEntityLicense(body);
const crossBorder = await this.extractEntityCrossBorder(body);
const services = await this.extractEntityServices(body);
const branches = await this.extractEntityBranches(body);
// --
await jsonfile.writeFile(`${filePath}.json`, { details, licenses, crossBorder, services, branches });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl).catch((err) => {
if (err.name === 'TimeoutError')
this.emit('recover');
});
}
else
this.emit('serviceDone');
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
let html = '';
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('div.view-content', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
html = await this.page.evaluate(el => el.outerHTML, elm);
}).catch((e) => {
logger.error(e);
logger.warn('No index list');
});
const indexList = await this.extractIndexItems(html);
logger.debug('serviceObject.indexStep', serviceObject.indexStep);
serviceObject.links = serviceObject.links.concat(indexList).map((v) => {
v['meta'] = serviceObject.indexStep;
return v;
});
const filename = this.modeNames[this.mode];
await this._randomWait(this.page, 5, 7);
const subStep = (serviceObject.pageCount > 0) ? `-${serviceObject.pageCount}` : '';
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}${subStep}`, null);
await this.page.waitForSelector('li.next-nav > a.button.next', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
logger.debug('Next page..');
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 5, 7);
serviceObject.pageCount++;
this.emit('pageChanged');
}).catch(() => {
serviceObject.pageCount = 0;
this.emit('indexdone');
});
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
await this.page.waitForSelector('div.view-content', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await this.processIndex(serviceObject);
}).catch((e) => {
// logger.error(e);
logger.warn('No index list');
this.emit('indexdone');
});
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
const pathname = pageUrl.pathname;
logger.debug('workMode::', ['Indexing', 'Scraping'][this.workMode]);
if (pathname === '/') {
logger.error('Invalid path');
logger.debug(JSON.stringify(pageUrl));
logger.warn('processNewPage::emit recover');
this.emit('recover');
return;
}
switch (this.workMode) {
case 0:
await this.indexRedirector();
break;
case 1:
await this.processRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl.href);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async restart() {
logger.info(`Restarting ${this.modeTitles[this.mode]}`);
this._goto(this.lastUrl);
}
/**
*
* @returns {Promise<void>}
* @private
*/
async __recover() {
logger.warn('*** RECONNECTING PAGE ***');
logger.info('BrowserCrashed:', this.browserCrashed);
await this._forcePageClose();
if (this.browserCrashed)
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', () => {
this.processNewPage();
});
const timeout = 90000;
setTimeout(async() => {
logger.warn('Attempting recovery..');
await this.restart();
}, timeout);
}
/**
* Registers the event handlers that drive the scrape as a state machine:
* page changes, recovery, per-service index completion and per-service
* scrape completion.
*
* `this.mode` selects the service (0 = payment, 1 = e-money, 2 = credit);
* `this.workMode` is switched to 1 once a service's index is complete and
* back to 0 when that service has been scraped.
*
* @returns {Promise<void>}
*/
async attachEvents() {
// A paginated index moved to its next page: treat it like a new page load (passed through _throttle with 2500).
this.on('pageChanged', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
// clear out stock recover handler
this.removeAllListeners('recover');
this.on('recover', async () => {
logger.info('onRecover');
await this.recover();
});
// NOTE(review): handleEntityComplete is not defined in this class within view - presumably inherited from Scraper; verify.
this.on('entityComplete', () => {
this.handleEntityComplete();
});
// Translate the generic 'serviceDone' into the event of the active service.
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
// One payment-services index URL is finished: go to the next index URL, or,
// after the last one, switch to scraping and open the first collected link.
// NOTE(review): if no links were collected, links[step] is undefined and reading .href throws inside this async handler.
this.on('psindexdone', async () => {
let newUrl;
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
this.paymentServices.indexStep++;
if (this.paymentServices.indexStep >= this.paymentServices.urls.length) {
this.workMode = 1;
logger.debug(JSON.stringify(this.paymentServices));
newUrl = this.paymentServices.links[this.paymentServices.step].href;
}
else
newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
await this._goto(newUrl);
});
// Same progression for the e-money index.
this.on('emindexdone', async () => {
let newUrl;
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
this.emoneyServices.indexStep++;
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length) {
this.workMode = 1;
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
}
else
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
await this._goto(newUrl);
});
// Same progression for the credit-institutions index.
this.on('ciindexdone', async () => {
let newUrl;
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
this.creditServices.indexStep++;
if (this.creditServices.indexStep >= this.creditServices.urls.length) {
this.workMode = 1;
newUrl = this.creditServices.links[this.creditServices.step].href;
}
else
newUrl = this.creditServices.urls[this.creditServices.indexStep];
await this._goto(newUrl);
});
// Translate the generic 'indexdone' into the event of the active service.
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('creditServicesDone'.length ? 'ciindexdone' : 'ciindexdone');
break;
}
});
// Payment services scraped: back to indexing mode; the rest is delegated to the base class.
this.on('paymentServicesDone', async () => {
this.workMode = 0;
await super._paymentServicesDone();
});
// E-money scraped: persist the link list (and the full debug state), advance the mode and open the first credit index URL.
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
this.workMode = 0;
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
// Credit institutions scraped: persist the results and signal that the whole run is done.
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
this.workMode = 0;
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
* Bootstraps the Estonian (fi.ee) scrape: resets mode and work mode, defines
* the index URLs of the three services, prepares the artefact path, opens a
* browser page, wires the page-load handler and the event handlers, and
* navigates to the first payment-services index.
*
* Any error thrown inside the sequence is re-thrown wrapped in a new Error.
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
// mode: 0 = payment, 1 = e-money, 2 = credit; workMode: 0 = indexing, 1 = scraping.
this.mode = 0;
this.workMode = 0;
// Prefix that extractIndexItems puts in front of the hrefs it finds.
this.rootURI = 'https://www.fi.ee';
// Per-service state: `urls` are the index pages, `links` the entities found on them,
// `indexStep` / `step` the positions within urls / links, `pageCount` the pagination depth.
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.fi.ee/en/payment-services/payment-institutions/estonian-payment-institutions',
'https://www.fi.ee/en/payment-services/payment-services/payment-institutions/estonian-payment-institutions-exemption',
'https://www.fi.ee/en/payment-services/payment-institutions/payment-services/branches-foreign-payment-institutions',
'https://www.fi.ee/en/payment-services/payment-services/payment-institutions/payment-agents',
'https://www.fi.ee/en/payment-services/payment-institutions/payment-services/providers-cross-border-payment-sevices',
'https://www.fi.ee/en/payment-services/payment-institutions/payment-agents-providers-cross-border-payment-services'],
'sections' : [],
'sectionLinks' : [],
'pageCount' : 0
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/estonian-e-money-institutions',
'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/estonian-e-money-institutions-exemption',
'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/distributors-e-money',
'https://www.fi.ee/en/payment-services/e-money-institutions/providers-cross-border-e-money-services',
'https://www.fi.ee/en/distributors-providers-cross-border-e-money-services',
'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/branches-foreign-e-money-institutions'],
'sections' : [],
'sectionLinks' : [],
'pageCount' : 0
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/licensed-credit-institutions-estonia',
'https://www.fi.ee/en/banking-and-credit/credit-institutions/affiliated-branches-foreign-credit-institutions',
'https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/representative-offices-foreign-credit-institutions',
'https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/providers-cross-border-banking-services'],
'sections' : [],
'sectionLinks' : [],
'pageCount' : 0
};
// Entry URLs of each service; emoneyUrl and credit are not read anywhere in this class - presumably used by the base class, verify.
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/EE/FI`));
// await this._doNonRepudiation();
await this._initBrowser();
await this._createBrowserPage();
// Every DOMContentLoaded triggers processNewPage through a 2500 throttle wrapper; its failures are logged, not propagated.
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
// NOTE(review): handlers are attached only while exactly two event names are registered - presumably the
// state before attachEvents has ever run, which would prevent double registration on a re-run; confirm the count.
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.paymentServices.urls[0], { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
// NOTE(review): new Error(e) stringifies the caught error into the message of a new Error, so the original stack trace is lost.
throw new Error(e);
}
}
// Entry point behind the throttled `run` wrapper set up in the constructor: delegates to start().
async __run() {
await this.start();
}
}
module.exports = EEScrape;

574
ncas/es.js Normal file
View File

@ -0,0 +1,574 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const logger = require('log4js').getLogger('ES');
const url = require('url');
const querystring = require('querystring');
const removeAccents = require('remove-accents-diacritics');
const jsonfile = require('jsonfile');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class ESScrape extends Scraper {
constructor() {
super();
this.id = 'ES';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param $
* @returns {Promise<Array>}
*/
async extractPassporting($) {
const passporting = [];
const headerRow = $('td.tdSubtituloSeccion:contains("PAISES EN LOS QUE OPERA")').eq(0).parent().eq(0);
const passportRows = headerRow.nextAll('tr:not([height])'); // ignore the small divider row
passportRows.each(function(i, elem) {
passporting.push(
{
'country': $(elem).find('td').eq(0).text(),
'mode': $(elem).find('td').eq(1).text()
}
);
});
return passporting;
}
/**
*
* @param $
* @returns {Promise<Array>}
*/
async extractActivities($) {
const activities = [];
const headerRow = $('td.tdSubtituloSeccion td.tdSubtituloSeccion:contains("ACTIVIDADES")').eq(0).parent().eq(0);
const activityRows = headerRow.nextAll('tr:not([height])'); // ignore the small divider row
activityRows.each(function(i, elem) {
activities.push($(elem).text());
});
for (let i = 0; i < activities.length; i++)
activities[i] = this._cleanUp(activities[i]);
return activities;
}
/**
*
* @param $
* @param details
* @returns {Promise<void>}
*/
async extractSingleFields($, details) {
const mainDiv = $('div#divSalida > table.tablaParametros > tbody > tr > td> table > tbody');
details.bancoDeEspanaCode = this._cleanUp($(mainDiv).find('input[name=CODIGO]').val());
details.bancoDeEspanaPrevCode = this._cleanUp($(mainDiv).find('input[name=CODIGO_PREVIO]').val());
details.effectiveFrom = this._cleanUp($(mainDiv).find('input[name=FechaAlta1]').val());
details.effectiveTo = this._cleanUp($(mainDiv).find('input[name=FechaBaja]').val());
details.lastUpdated = this._cleanUp($(mainDiv).find('input[name=FechaActualizacion]').val());
details.name = this._cleanUp(
// Can't find accent in "Denominación:" so search for half the word:
$(mainDiv).children('tr:contains("Denominaci")').nextAll().eq(0).find('textarea').text()
);
details.institutionType = this._cleanUp(
$(mainDiv).children('tr:contains("Tipo de entidad:")').nextAll().eq(0).find('textarea').text()
);
details.address = this._cleanUp(
$(mainDiv).children('tr:contains("Domicilio:")').nextAll().eq(0).find('textarea').text()
);
details.legalEntityIdentifierCode = this._cleanUp(
$(mainDiv).find('input[name=CODIGO_PREVIO]').parent().nextAll().eq(3).children('input').val()
);
details.shortName = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("Nombre abreviado:")').nextAll().eq(1).children('input').val()
);
details.nif = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("N.I.F.:")').nextAll().eq(1).find('td.textoCampo input').val()
);
// Can't find "Teléfono", probably due to accent. Search for "fono" instead.
details.telephone = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("fono:")').nextAll().eq(1).find('td.textoCampo input').val()
);
details.fax = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("Fax:")').nextAll().eq(1).find('td.textoCampo input').val()
);
details.website = this._cleanUp(
$(mainDiv).find('td.textoEtiqueta:contains("Dom. / Dir. Internet:")').nextAll().eq(1).find('a').text()
);
details.safeguardOfFunds = this._cleanUp(
$(mainDiv).find('td.tdSubtituloSeccion:contains("SALVAGUARDA DE FONDOS")').parent().nextAll('tr').eq(1).text()
);
details.financialExclusivity = this._cleanUp(
$(mainDiv).find('td.tdSubtituloSeccion:contains("EXCLUSIVIDAD FINANCIERA")').parent().nextAll('tr').eq(1).text()
);
if ($(mainDiv).find('li.textoAvisoResaltado').length > 0)
details.notice = this._cleanUp(
$(mainDiv).find('li.textoAvisoResaltado').text()
);
else
details.notice = '';
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
const details = {};
const $ = cheerio.load(html);
try {
await this.extractSingleFields($, details);
details.activities = await this.extractActivities($);
details.passporting = await this.extractPassporting($);
}
catch (err) {
logger.error(err);
}
return details;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
const noResultsSelector = '//td[@class="textoEtiqueta"][contains(text(), "NO SE HAN ENCONTRADO ENTIDADES SEGUN LOS CRITERIOS DE BUSQUEDA.")]';
const paginationRowSelector = '//table[@class="tablaResultados"]//td[@colspan="4"]';
await this._randomWait(this.page, 3, 5);
// pagination row is the last to load, so wait for that before scraping the links
// Sometimes the row is empty, so look for the surrounding td with `colspan=4`
// also look for the "no results" notice in case the result set is empty
await this.page.waitForXPath(`${noResultsSelector} | ${paginationRowSelector}`);
logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);
const filename = this.modeNames[this.mode];
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}_${serviceObject.paginationStep}`, null);
if (this.page.$x(noResultsSelector).length > 0) {
logger.info(`Results page ${serviceObject.indexStep} for ${this.modeNames[this.mode]} is empty`);
return;
}
// TODO: handle when the table loads, but the entity links are missing (happens occasionally)
const body = await this.page.content();
const $ = cheerio.load(body);
const links = $('table.tablaResultados tr.estilofila a');
links.each((i, item) => {
const href = $(item).attr('href');
// ignore any javascript print links
if (href.startsWith('javascript'))
return;
const text = $(item).text().trim();
const newUrl = `http://app.bde.es${href}`;
const id = this._makeFieldName(text);
serviceObject.links.push({ 'name':text, 'href':newUrl, 'id':id });
});
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
await this._randomWait(this.page, 6, 9);
logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);
await this.processIndex(serviceObject);
const nextButtons = await this.page.$x('//a[contains(text(), \'Siguiente\')]');
if (nextButtons.length > 0) {
serviceObject.paginationStep++;
await nextButtons[0].click();
}
else if (serviceObject.indexStep < serviceObject.urls.length - 1) {
serviceObject.indexStep++;
serviceObject.paginationStep = 0;
const newUrl = serviceObject.urls[serviceObject.indexStep];
await this._goto(newUrl);
}
else
this.emit('indexdone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
logger.debug('>> indexRedirector');
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
* Scrapes the entity page that is currently loaded: takes a screenshot,
* extracts the details, writes them to `<prefix><entity>.json`, records the
* file name on the link entry and moves on to the next link (or emits
* 'serviceDone' after the last one).
*
* Rejects if the content cell does not become visible within 7.5 s; unlike
* the index handling, that timeout is not caught here.
*
* @param serviceObject - state of the service being scraped (links, step, items)
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
// Despite the name, \W matches every non-word character, not only whitespace.
const noWhiteSpace = /\W/g;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this.page.waitForSelector('td.tdContenido', { 'visible':true, 'timeout':7500 }); // Wait for buttons at bottom of table to be visible
await this._randomWait(this.page, 3, 5);
// File name = mode prefix + accent-free id with every non-word character replaced by '_'.
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
// NOTE(review): the path is capped at 240 characters, but the name recorded below uses the uncapped filename - they can differ for very long names.
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const details = await this.extractEntityDetails(body);
await jsonfile.writeFile(`${filePath}.json`, { details });
await this._randomWait(this.page, 3, 5);
// Remember which file holds this entity, then advance to the next link.
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
const qstring = querystring.parse(pageUrl.search);
if ('TIPO' in qstring) // 'type'
await this.indexRedirector();
else if ('CODBE' in qstring) // 'code'
await this.processRedirector();
else {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('serviceDone', async function() {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} paymentServices items indexed`);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} emoneyServices items indexed`);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} creditServices items indexed`);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done' : false,
'urls': [
'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EP&DONDE=11&LEI=&ORDEN=2&RADIO=0', // Payment Entities
'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EPH&DONDE=11&LEI=&ORDEN=2&RADIO=0' // Hybrid Payment Entities
],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done' : false,
'urls': ['http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EDE&DONDE=11&LEI=&ORDEN=2&RADIO=0'], // Electronic Money Entities
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done' : false,
'urls': ['http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=BP&DONDE=11&LEI=&ORDEN=2&RADIO=0'], // Credit institutions
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/ES/BE`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
// Entry point used to kick off a scrape: simply awaits start().
async __run() {
await this.start();
}
}
module.exports = ESScrape;

193
ncas/fca.js Normal file
View File

@ -0,0 +1,193 @@
// load env variables from file
require('dotenv').config({
'path': `${__dirname }/../.env`
});
const version = '0.0.1-1';
// load helper libs etc
const CsvData = require('../helpers/csv-data');
const csv = new CsvData();
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const fs = require('fs');
// range(n) builds the array [1, 2, ..., n]
const range = n => Array.from({ 'length': n }, (value, key) => key + 1);
// Maps a numeric id to the text typed into the register's search box for that organisation
const searchables = new Map([[759676, '759676 Barclays Bank UK PLC'],
[661836, '661836 American Express Services Europe Limited (AESEL)']
]);
const userAgents = ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
'Mozilla/5.0 (Linux; Ubuntu 14.04) AppleWebKit/537.36 Chromium/35.0.1870.2 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/47.0.2526.73 Chrome/47.0.2526.73 Safari/537.36',
'Mozilla/5.0 (Linux; Ubuntu 16.04) AppleWebKit/537.36 Chromium/57.0.2987.110 Safari/537.36',
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36',
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58.0.3029.110 Chrome/58.0.3029.110 Safari/537.36',
'Mozilla/5.0 (X11; Linux armv7l) AppleWebKit/537.42 (KHTML, like Gecko) Chromium/25.0.1349.2 Chrome/25.0.1349.2 Safari/537.42',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
'Mozilla/5.0 (Linux; Ubuntu 14.04 like Android 4.4) AppleWebKit/537.36 Chromium/35.0.1870.2 Mobile Safari/537.36'];
class FCAScrape extends Scraper {
// No scraper-specific state: only delegates to the Scraper base constructor.
constructor() {
super();
}
async _checkPassporting(page, id, waitFor) {
const passportingHeader = await page.$x('//a[contains(text(), \'Passport Out\')]');
if (passportingHeader.length > 0) {
await passportingHeader[0].click(); // click tab to open passporting accordion
await this._makeScreenshot(page, `${id}-passporting`, waitFor); // save general screen
// check how many countries we need to parse
const countryLinks = await page.$$('#PanelShPo_PassportOut .countries li');
const passportingTempArray = range(countryLinks.length);
console.log('>> passportingTempArray', passportingTempArray);
for (const item of passportingTempArray) {
await page.mouse.move(50, 50, 100);
console.log(id, item);
// const cookies = await page.cookies();
// const cookiesNames = cookies.map(el => {
// return {name : el.name};
// });
// console.log(cookiesNames);
await page.deleteCookie(
{ 'name': '_gat' },
{ 'name': '_gid' },
{ 'name': '_ga' },
{ 'name': '__cfduid' },
{ 'name': 'pctrk' }
);
/* const newAgent = userAgents[Math.floor(Math.random() * (userAgents.length - 1))];
console.log('New agent:', newAgent);
await page.setUserAgent(newAgent);*/
await this._processPassportingCountry(page, id, item);
}
}
else
throw new Error('Passporting not found');
}
async _processPassportingCountry(page, orgId, id) {
// Mousedown Duration between 90 - 120ms
const mouseDownDuration = 90 + Math.floor(Math.random() * (30 - 1));
console.log('Mouse duration:', mouseDownDuration);
await page.click(`#PanelShPo_PassportOut .countries li:nth-child(${id}) a`, { 'delay':mouseDownDuration });
await this._randomWait(page, 20, 40);
const innerHtml = await page.evaluate(() => document.body.innerHTML);
await this._makeScreenshot(page, `${orgId}-${id}-passporting`);
await this._saveToFile(`${orgId}-${id}-inner.html`, innerHtml);
const parsedPassportOut = await this._parseHtmlPassportingData(innerHtml);
await this._saveToFile(`${orgId}-${id}-parsed.json`, JSON.stringify(parsedPassportOut));
}
async _getOrgData(id) {
try {
await this._initBrowser();
const page = await this.browser.newPage();
// await page.setUserAgent(userAgents[Math.floor(Math.random() * (userAgents.length - 1))]);
console.log('>> Wanted searchable', searchables.get(id));
await page.goto('https://register.fca.org.uk/ShPo_HomePage');
await page.type('input[type=text].input.form-control.searchbox', searchables.get(id));
await page.keyboard.press(String.fromCharCode(13)); // press Enter (so we do not need to search for submit button by CSS selector)
await page.waitForSelector('div.RecordDetails h1.RecordName');
// make general screenshot
await this._makeScreenshot(page, `${id}-general`);
// check if org has passporting rights and parse if poss
await this._checkPassporting(page, id);
await this.browser.close();
}
catch(e) {
throw new Error(e);
}
}
async _parseHtmlPassportingData(innerHtml) {
const $ = cheerio.load(innerHtml);
// get List of PassportOut countries
const countries = [];
$('li.PassportOutLink a').each((i, el) => {
countries[i] = $(el).text();
});
// get current country data
// lets count tables - how many different directives!
const directives = $('.ShPo_PassportOutTable').map((i, el) => {
const head = $(el).find('table tbody tr').first().find('th');
// table headers
const country = head.eq(0).text().trim();
const directive = head.eq(1).text().trim();
const passportType = head.eq(2).text().trim();
// get actual table data
const data = $(el).find('table tbody tr').find('td').map((i, el) => {
// if element does contain H3 - we need more parsing
if ($(el).find('.InvestmentTypes li').length) {
const name = $(el).find('h3').text().trim();
const investment = $(el).find('.InvestmentTypes li').map((ii, subel) => {
const name = $(subel).text().trim();
let tt = null;
// check if LI contains span == it has tooltips, get data and override null
if ($(subel).find('span').length) {
const $$ = cheerio.load($(subel).find('span').data('content'));
tt = $$('div').text().trim();
}
return { name, tt };
}).get();
return { name, investment };
}
// no lists in HTML, so record just name
else
return {
'name': $(el).text().trim(),
'investment': null
};
}).get();
return { country, directive, passportType, data };
}).get();
return directives;
}
// TODO: get initial list as per ticket
// https://register.fca.org.uk/shpo_searchresultspage?preDefined=AIPISP&TOKEN=3wq1nht7eg7tr
// Placeholder: not implemented yet, `page` is ignored and nothing is returned.
async getInitialList(page) {
return;
}
async run() {
const passporting = await this._getOrgData(661836);
// const passporting = await this._parseCurrentPassporting(1);
}
}
module.exports = FCAScrape;

488
ncas/fr.js Normal file
View File

@ -0,0 +1,488 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('FR');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');
logger.level = process.env.LOGGER_LEVEL || 'warn';
// load env variables from file
class FRScrape extends Scraper {
constructor() {
super(); // must call super for "this" to be defined.
this.setID('FR');
this.on('done', () => {
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param path
* @returns {Promise<void>}
*/
async gotoPage(path = null) {
const newUrl = `${this.parsedUrl.protocol}//${this.parsedUrl.hostname}${path.link}`;
await this._randomWait(this.page, 3, 5);
logger.info('newurl:', newUrl);
await this._goto(newUrl);
}
/**
*
* @param rows
* @returns {Array}
*/
extractDataFromTable(rows) {
const unchecked = /(unchecked)/;
const output = [];
const crossBorder = [];
let currentActivityID ;
rows.each((i, elm) => {
const children = cheerio(elm).children();
let newItem;
if (children.eq(1).text().trim() !== '')
currentActivityID = children.eq(1).text().trim();
if (children.eq(0).html().match(unchecked) === null)
if (children.length === 2) {
crossBorder.push(this._cleanUp(currentActivityID.trim()));
}
else
if (children.length === 3) {
newItem = [currentActivityID, this._cleanUp(children.eq(2).text().trim())];
output.push(newItem);
}
else {
newItem = [`${currentActivityID}${children.eq(2).text().replace(')', '').trim()}`, this._cleanUp(children.eq(3).text().trim())];
output.push(newItem);
}
});
return { output, crossBorder };
}
/**
 * Read the investment-services matrix and the list of authorised items.
 *
 * A cell counts as ticked when its HTML does not contain "unchecked".
 *
 * @param rows cheerio selection of table rows
 * @returns {{investmentServices: Array, authorised: Array}}
 *   investmentServices: [rowName, [financial instrument names ticked in that row]]
 *   authorised: cleaned text of the second cell of every ticked two-cell row
 */
extractDataFromInvestmentServicesTable(rows) {
const unchecked = /(unchecked)/;
const output = [];
const authorised = [];
const financialInstruments = [];
rows.each((i, elm) => {
const finInst = [];
const children = cheerio(elm).children();
if (children.length > 2) {
// A row with exactly 11 cells supplies the column names: the cleaned text of
// every one of its cells is appended to financialInstruments.
if (children.length === 11)
children.each((step, fiElm) => {
financialInstruments.push(this._cleanUp(cheerio(fiElm).text()));
});
if (children.length > 11) {
// `offset` first points at the cell holding the row name; its position depends on
// how many more cells this row has than column names were collected.
// NOTE(review): presumably the extra leading cells come from row-spanning
// cells in the source table — confirm against the live page
let offset = (children.length - 1) - financialInstruments.length;
// distance between a cell index in this row and the index of its column name
const fiOffset = (offset === 0) ? 1 : 2;
const rowName = children.eq(offset).text();
offset++;
// walk the cells after the row name and keep the column name of every ticked cell
while(offset < financialInstruments.length) {
if (children.eq(offset).html().match(unchecked) === null)
finInst.push(financialInstruments[offset - fiOffset]);
offset++;
}
// rows without any ticked cell are left out
if (finInst.length > 0)
output.push([rowName, finInst]);
}
}
// two-cell rows: [tick box, label]
else if (children.length === 2)
if (children.eq(0).html().match(unchecked) === null) {
authorised.push(this._cleanUp(children.eq(1).text()));
}
});
return { 'investmentServices':output, authorised };
}
/**
*
* @param tables
* @returns {Promise<Array>}
*/
async extractEuroData(tables) {
const dataBlock = [];
const findToColon = /^.*?(?=(:))/;
const trimToColon = /^.*?(?=(:)).\s/;
const divs = tables.find('div.zone_succ');
divs.each((i, elm) => {
const p = cheerio(elm).find('p').eq(0).text();
const title = this._cleanUp(p.match(findToColon)[0]).trim();
const country = this._cleanUp(p.split(trimToColon)[2]).trim();
const obj = {};
obj[title] = country;
const rows = cheerio(elm).find('table tr');
const data = this.extractDataFromTable(rows);
obj.paymentServices = data.output;
obj.crossBorder = data.crossBorder;
dataBlock.push(obj);
});
return dataBlock;
}
async extractLinks($table, creditInstFilter = false) {
const wantedCIStatuses = ['legal entity/ company'];
const links = [];
logger.info('Extracting links...');
if ($table.length > 1)
// The table contains more than just the heading row
for (let count = 1;count < $table.length;count++) {
const $row = cheerio($table.get(count)).find('td');
const $item = $row.children().eq(2);
const link = $item.attr('href');
const title = this._cleanUp($item.text());
if (!creditInstFilter)
// Default mode
links.push({ link, title });
else
if ($row.children().length === 6) {
const status = this._cleanUp($row.children().eq(5).text().toLowerCase());
logger.debug(`Status:**${status}** ${title}`);
if(wantedCIStatuses.indexOf(status) !== -1) {
logger.debug(`Matched:**${status}** ${title}`);
links.push({ link, title });
}
}
}
return links;
}
/**
*
* @param $
* @returns {Promise<Array>}
*/
async extractDetails($) {
const findToColon = /^.*?(?=(:))/;
const trimToColon = /^.*?(?=(:)).\s/;
const details = [];
$('div#zone_description ul.nopuce li').each((i, elm) => {
if ($(elm).children().length > 0) {
const matched = $(elm).text().match(findToColon);
if (matched !== null) {
const field = this._cleanUp($(elm).text().match(findToColon)[0]).trim();
const data = this._cleanUp( $(elm).text().split(trimToColon)[2]);
details.push([field, data]);
}
}
});
return details;
}
/**
*
* @returns {Promise<void>}
*/
async processAFPage() {
const noWhiteSpace = /\W/g;
const trimToColon = /^.*?(?=(:)).\s/;
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
const $ = cheerio.load(body);
const modeFilename = ['ps_', 'em_', 'ci_'];
const pageData = { 'description':[], 'frActivities':null, 'EUActivities':[] };
pageData.entity = removeAccents.remove($('p.sttr').eq(0).text().replace(trimToColon, '').trim());
const filename = `${modeFilename[this.mode]}${pageData.entity.replace(noWhiteSpace, '_')}`;
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
pageData.description = await this.extractDetails($);
await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(3) a');
// Process France / French details
this._makeScreenshotV2(this.page, `${this.path}/${filename}_france`, null);
const frenchTbl = $('#zone_en_france > table tr');
if (this.mode < 2)
pageData.frActivities = await this.extractDataFromTable(frenchTbl).output;
else
pageData.creditInstituteActivities = await this.extractDataFromInvestmentServicesTable(frenchTbl);
if (this.mode < 2) {
await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(5) a');
// Process EU Details
this._makeScreenshotV2(this.page, `${this.path}/${filename}_europe`, null);
const euroTbls = $('#zone_en_europe');
pageData.EUActivities = await this.extractEuroData(euroTbls);
}
jsonfile.writeFileSync(`${this.path}/${filename}.json`, pageData);
if (this.mode === 0 ) {
this.paymentServices.links[this.paymentServices.step].filename = `${filename}.json`;
this.paymentServices.step++;
}
else if( this.mode === 1) {
this.emoneyServices.links[this.emoneyServices.step].filename = `${filename}.json`;
this.emoneyServices.step++;
}
else if( this.mode === 2) {
this.creditServices.links[this.creditServices.step].filename = `${filename}.json`;
this.creditServices.step++;
}
this.perf.scraped++;
await this._randomWait(this.page, 5, 7);
if (this.mode === 0)
if (this.paymentServices.step < this.paymentServices.items)
await this.gotoPage(this.paymentServices.links[this.paymentServices.step]);
else {
logger.debug('Payment services complete.');
this.paymentServices.done = true;
this.mode++;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
await this._goto(this.eMoneyUrl);
}
else if (this.mode === 1)
if (this.emoneyServices.step < this.emoneyServices.items)
await this.gotoPage(this.emoneyServices.links[this.emoneyServices.step]);
else {
logger.debug('EMoney services complete.');
this.emoneyServices.done = true;
this.mode++;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
await this._goto(this.creditUrl);
}
else if (this.mode === 2)
if (this.creditServices.step < this.creditServices.items)
await this.gotoPage(this.creditServices.links[this.creditServices.step]);
else {
logger.debug('Credit services complete.');
this.creditServices.done = true;
this.mode++;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.emit('done');
}
}
/**
*
* @param $
* @param store
* @returns {Promise<void>}
*/
async searchResultsProcessor($, store) {
const $table = $('table.table tr');
if ($table.length > 1)
// The table contains more than just the heading row
store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2)));
// check that the next button is active
const nextExists = $('body > div > div.main.main_evol > ul > li:last-child > a');
if (nextExists.length === 1 )
await this._findAndClick('body > div > div.main.main_evol > ul > li:last-child > a', 'Next page >');
else {
// Done gathering search results
logger.info('Completed gathering search results..');
store.searchDone = true;
store.items = store.links.length;
jsonfile.writeFileSync(`${this.path}/${['pi', 'eu', 'ci'][this.mode]}.json`, store);
this.gotoPage(store.links[store.step]);
}
}
/**
* Handle the search result page and uilt the list of links
* @returns {Promise<void>}
*/
async handleSearchResults() {
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
const $ = cheerio.load(body);
if (this.mode === 0 && !this.paymentServices.searchDone)
await this.searchResultsProcessor($, this.paymentServices);
if (this.mode === 1 && !this.emoneyServices.searchDone)
await this.searchResultsProcessor($, this.emoneyServices);
if (this.mode === 2 && !this.creditServices.searchDone)
await this.searchResultsProcessor($, this.creditServices);
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage(dump = false) {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
const search = pageUrl.search;
const params = this._getParamsFromUrl(search);
const pageID = params.page || '';
switch (pageID) {
case 'results':
await this.handleSearchResults( );
break;
case 'af':
await this.processAFPage();
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${currentPage.location}`);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async start() {
await super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3';
this.eMoneyUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=22-TBR07&retrait=0';
this.creditUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0';
this.parsedUrl = url.parse(this.creditUrl);
this.setPath(path.resolve(`${__dirname }/../artefacts/FR/REGAFI`));
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage);
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Entry point wrapped by the debounced `run`: logs the start of the scrape
 * and awaits start().
 *
 * @returns {Promise<void>}
 */
async __run() {
logger.info('Scraping France...');
await this.start();
}
}
module.exports = FRScrape;

773
ncas/gi.js Normal file
View File

@ -0,0 +1,773 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const logger = require('log4js').getLogger('GI');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class GIScrape extends Scraper {
constructor() {
super();
this.id = 'GI';
// treat these elements as block boundaries when scraping permissions
this.blockBoundaries = 'div.panel, li';
// ignore elements matched by these selectors when scraping titles
this._ignoreList = 'button, div.modal-body > h3';
// scrape these top-level permissions headings only
this._headingsToScrape = [
'Financial Services (Banking) Act',
'Financial Services (Investment and Fiduciary Services) Act'
];
// override these values from the base class
this.modePrefix = ['ps_', 'em_', 'ci_', 'ag_'];
this.modeNames = ['paymentServices', 'emoneyServices', 'creditServices', 'agentServices'];
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services', 'Agent'];
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if (l)
this.run();
});
}
async _convertBrToComma(text) {
return text.replace(/<br\s*[\/]?>/gi, ', ');
}
async _reduceWhiteSpace(text) {
return text.replace(/\s+/g, ' ').trim();
}
/**
*
* @param html
* @param selector
* @returns {Promise<void>}
*
* Finds elements in the `html` with the given `selector`, but returns only the uppermost matched elements,
* and not those that are nested within other matched elements.
*/
async getUppermostElementsBySelector(html, selector) {
const $ = cheerio.load(html);
return $(selector).filter(function () {
return $(this).parents(selector).length === 0;
});
}
async getTextNotInMatchingElements(html, selector) {
const $ = cheerio.load(html);
$(selector)
.remove()
.end();
$(this._ignoreList)
.remove()
.end();
return $.text();
}
async extractSingleFields($, details) {
details.slug = $('meta[name="og:url"]').attr('content').replace('http://www.fsc.gi/regulated-entity/', '');
details.name = $('#fvFirmDetails_lblName').text();
details.address = await this._convertBrToComma($('#fvFirmDetails_lblAddress').html());
details.telephone = $('#fvFirmDetails_lblTel').text();
details.fax = $('#fvFirmDetails_lblFax').text();
details.email = $('#fvFirmDetails_Label12').text();
details.website = $('#fvFirmDetails_lblWebsite').text();
details.legalForm = $('#fvFirmDetails_lblLegalForm').text();
details.countryOfIncorporation = $('#fvFirmDetails_lblIncorporationCountry').text();
details.incorporationNumber = $('#fvFirmDetails_lblRegistrationNo').text();
details.incorporationDate = $('#fvFirmDetails_lblDateOfIncorporation').text();
}
async processOtherNameListItem($, elm, names) {
const type = $(elm).children('strong').text();
let name = $(elm).children('strong').get(0).nextSibling.nodeValue;
// trim the preceding ' -'
if (name.startsWith(' -'))
name = name.substr(2);
name = name.trim();
names.push({
'type': type,
'name': name
});
}
async extractOtherNames($) {
const otherNames = [];
const otherNamesList = $('h3:contains("Other names")').next();
$(otherNamesList).find('li').each(
(index, element) => {
this.processOtherNameListItem($, element, otherNames);
}
);
return otherNames;
}
processParentFirm($, elm, firms) {
const href = $(elm).find('a').attr('href');
const slug = href.replace('/regulated-entity/', '');
firms.push(slug);
}
extractAgentOf($) {
const parentFirms = [];
const parentFirmsList = $('h3:contains("Agent of")').next();
$(parentFirmsList).find('li').each(
(index, element) => {
this.processParentFirm($, element, parentFirms);
}
);
return parentFirms;
}
async processAgentLink($, elm, firmAgentList) {
const href = $(elm).attr('href');
const fullUrl = `https://www.fsc.gi${href}`;
const slug = href.replace('/regulated-entity/', '');
const name = await this._cleanUp($(elm).text());
const id = this._makeFieldName(name);
// TODO: refactor this out of this function somehow, it's not unit-testable without a mock for agentServices
if ('agentServices' in this) // i.e. don't do this if we're running a unit test
// Add the href to our list of links to check later (if it's not already added)
if (this.agentServices.links.findIndex(x => x.href === fullUrl) === -1)
this.agentServices.links.push({
'name': name,
'href': fullUrl,
'id': id
});
firmAgentList.push({
'name': name,
'slug': slug
});
}
async extractAgents(html) {
const $ = cheerio.load(html);
const agents = [];
$('li > a').each(
(index, element) => {
this.processAgentLink($, element, agents);
}
);
return agents;
}
async recurseDOM(html, selector, level = 0) {
const currentLevel = level + 1;
const $ = cheerio.load(html);
const result = [];
const blocks = await this.getUppermostElementsBySelector(html, selector);
for (let i = 0; i < blocks.length; i++) {
const block = blocks[i];
const rawName = await this.getTextNotInMatchingElements($(block).html(), selector);
const name = await this._reduceWhiteSpace(rawName);
// Only scrape the top level headings we're interested in
if (currentLevel === 1 && this._headingsToScrape.indexOf(name) === -1)
continue;
const blockHtml = $(block).html();
let data;
if (name === 'Agents')
data = await this.extractAgents(blockHtml);
else
data = await this.recurseDOM(blockHtml, selector, currentLevel);
if (data === null)
result.push(name);
else
result.push({
'name': name,
'data': data
});
}
if (result.length > 0)
return result;
return null;
}
async extractPermissions(html) {
const $ = cheerio.load(html);
const permissionsContainer = $('h3:contains("Permissions")').next();
if (permissionsContainer.length === 0)
return {};
const permissions = await this.recurseDOM(permissionsContainer.html(), this.blockBoundaries);
return permissions;
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
try {
const details = {};
const $ = cheerio.load(html);
await this.extractSingleFields($, details);
details.otherNames = await this.extractOtherNames($);
details.permissions = await this.extractPermissions(html);
details.agentOf = await this.extractAgentOf($);
return details;
}
catch (err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
// Wait for the paragraph at the bottom to have loaded.
await this.page.$x('//a[contains(text(), "* Firms with an asterisk")]');
// open all accordions before taking screenshot
// first, add a class `expand-below` to the container divs we are interested in:
for (const heading of this._headingsToScrape) {
const expandBelowThisDiv = await this.page.$x(`//h4[contains(., "${heading}")]/../..`);
expandBelowThisDiv.forEach(async (elm) => {
await this.page.evaluate(el => {
const currentClass = el.getAttribute('class');
el.setAttribute('class', `${currentClass} expand-below`);
}, elm);
});
}
// then, add a style tag to the <head> to expand the content
await this.page.addStyleTag({
'content':
`
div.expand-below div.collapse {
display: block;
}
div.expand-below div.modal {
display: block;
position: static;
opacity: 1;
overflow: visible;
margin-top: 125px;
}
/* remove drop shadows for faster rendering on large pages */
.modal-content {
-webkit-box-shadow: none;
box-shadow: none;
}
`
});
// temporarily disable GI screenshots
// logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
// await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const $ = cheerio.load(body);
const underConstruction = $('h3:contains("under construction")').length > 0;
if (underConstruction) {
logger.warn(`Page under construction: ${this.page.url()}`);
await jsonfile.writeFile(`${filePath}.json`, { 'underConstruction': true });
}
else {
const details = await this.extractEntityDetails(body);
await jsonfile.writeFile(`${filePath}.json`, { details });
}
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
case 3:
await this.processEntityDetails(this.agentServices);
break;
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
await this._randomWait(this.page, 3, 5);
const body = await this.page.content();
const filename = this.modeNames[this.mode];
// temporarily disable GI screenshots
// logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
// await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
const $ = cheerio.load(body);
let ul;
switch (this.mode) {
case 0:
ul = $('h3:contains("Authorised Payment Institutions")');
break;
case 1:
ul = $('h3:contains("E-money Institutions")');
break;
case 2:
ul = $('h3:contains("Banks")');
break;
case 3:
ul = $('h3:contains("Electronic Money and Payment Institution Agents")');
}
const links = ul.next().find('li > a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = this._cleanUp($(item).text());
const newUrl = `https://www.fsc.gi${href}`;
const id = this._makeFieldName(text);
if (serviceObject.links.findIndex(x => x.href === newUrl) === -1)
serviceObject.links.push({ 'name': text, 'href': newUrl, 'id': id });
});
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
await this._randomWait(this.page, 6, 9);
logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
await this.processIndex(serviceObject);
if (serviceObject.indexStep < serviceObject.urls.length - 1) {
serviceObject.indexStep++;
const newUrl = serviceObject.urls[serviceObject.indexStep];
await this._goto(newUrl);
}
else
this.emit('indexdone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
logger.debug('>> indexRedirector');
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
case 3:
await this.buildIndex(this.agentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
try {
if (
pageUrl.pathname.includes('payment-institutions-20') ||
pageUrl.pathname.includes('e-money-institutions-17') ||
pageUrl.pathname.includes('banks-1') ||
pageUrl.pathname.includes('electronic-money-and-payment-institution-agents-26')
)
await this.indexRedirector();
else if (pageUrl.pathname.includes('regulated-entity'))
await this.processRedirector();
else if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl.href}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl.href);
}
}
catch (err) {
if (err.name === 'TimeoutError') {
logger.error(`Reloading page after timeout: ${err.name}: ${err.message}`);
this.page.reload();
}
else
throw(err);
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
case 3:
this.emit('agentServicesDone');
break;
}
});
this.on('psindexdone', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('agindexdone', async () => {
this.agentServices.items = this.agentServices.links.length;
logger.info(`${this.agentServices.items} items indexed`);
const newUrl = this.agentServices.links[this.agentServices.step].href;
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
case 3:
this.emit('agindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
}
catch (e) {
logger.error(e);
}
await this._goto(this.agentServices.urls[0]);
});
this.on('agentServicesDone', async () => {
logger.warn('agentServicesDone');
try {
this.agentServices.done = true;
jsonfile.writeFileSync(`${this.path}/agentServices.json`, { 'links': this.agentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/agentServices.json`, this.agentServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/payment-institutions-20'],
'sections': [],
'sectionLinks': []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/e-money-institutions-17'],
'sections': [],
'sectionLinks': []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/banks-1'],
'sections': [],
'sectionLinks': []
};
this.agentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/electronic-money-and-payment-institution-agents-26']
};
this.startPage = this.paymentServices.urls[0];
this.setPath(path.resolve(`${__dirname}/../artefacts/GI/FSC`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch (e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
module.exports = GIScrape;

79
ncas/gr.js Normal file
View File

@ -0,0 +1,79 @@
const logger = require('log4js').getLogger('GR');
const path = require('path');
const url = require('url');
const Scraper = require('../helpers/scraper');
/**
 * Scraper for the Bank of Greece supervised institutions page. It takes a
 * screenshot of the start page and downloads the register spreadsheets that
 * are linked from it into the artefacts folder.
 */
class GRScrape extends Scraper {
  constructor() {
    super();
    this.id = 'GR';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    // In production the scraper starts itself when _checkLock resolves truthy.
    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l)
          this.run();
      });
  }

  /**
   * Open the start page, screenshot it, then visit each register link so the
   * browser downloads the file. Emits 'done' when finished.
   *
   * @returns {Promise<void>}
   * @throws {Error} when one of the expected register links is not on the page
   */
  async start() {
    super._start();

    this.setPath(path.resolve(`${__dirname}/../artefacts/GR/BG`));
    this.startPage = 'https://www.bankofgreece.gr/Pages/en/Supervision/SupervisedInstitutions/default.aspx';

    // A failure here is logged and does not stop the run.
    await this._doNonRepudiation(false, { 'sslWithPrefix': true }).catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser();
    await this._createBrowserPage();
    await this.page.setViewport({ 'width': 1200, 'height': 800 });
    await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
    await this._randomWait(this.page, 3, 5);

    // Awaited so the screenshot is finished before the page navigates away.
    await this._makeScreenshotV2(this.page, `${this.path}/index`);

    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
    logger.info('Saving excels into:', this.path);

    const registerLinkTexts = [
      'List of credit institutions operating in Greece',
      'List of credit institutions authorised in Greece with operations abroad through a subsidiary or a branch',
      'List/register of payment institutions',
      'List/register of electronic money institutions'
    ];

    for (const linkText of registerLinkTexts) {
      const links = await this.page.$x(`//a[contains(text(), '${linkText}')]`);

      // Fail with a clear message instead of an obscure error inside page.evaluate.
      if (links.length === 0)
        throw new Error(`Register link not found on ${this.startPage}: ${linkText}`);

      const href = await this.page.evaluate(link => link.getAttribute('href'), links[0]);
      const xlsUrl = url.resolve(await this.page.url(), href);

      await this._goto(xlsUrl, { 'waitUntil': 'networkidle0' });
      await this._randomWait(this.page, 3, 5);
    }

    // wait until all downloads have finished (currently this is only possible
    // with 'page.goto', so we go back to the start page).
    await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
    this.emit('done');
  }

  /**
   * Run the scraper by starting it.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
module.exports = GRScrape;

186
ncas/ie.js Normal file
View File

@ -0,0 +1,186 @@
/**
*
* User: Martin Donnelly
* Date: 2018-09-13
* Time: 12:23
*
*/
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const logger = require('log4js').getLogger('IE');
/**
 * Scraper for the Central Bank of Ireland registers site. It opens the
 * downloads list, expands the wanted sections and clicks every download link
 * so the browser saves the files into the artefacts folder.
 */
class IEScrape extends Scraper {
  constructor() {
    super();
    this.setID('IE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    // In production the scraper starts itself when _checkLock resolves truthy.
    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l)
          this.run();
      });
  }

  /**
   * Prepare the browser page, open the registers home page and click through
   * to the list view.
   *
   * @returns {Promise<void>}
   */
  async start() {
    await super._start();
    try {
      this.startPage = 'http://registers.centralbank.ie/Home.aspx';

      const mouseDownDuration = IEScrape.notARobot();

      this.setPath(path.resolve(`${__dirname}/../artefacts/IE/CBI`));

      // A failure here is logged and does not stop the run.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();
      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);
      await this._randomWait(this.page, 3, 5);
      await this.page.waitForSelector('#ctl00_cphRegistersMasterPage_lblViewList');
      await this.page.click('#ctl00_cphRegistersMasterPage_lblViewList > a', { 'delay': mouseDownDuration });
    }
    catch (e) {
      throw new Error(e);
    }
  }

  /**
   * Locate the download section whose span text equals searchText.
   *
   * @param downloadsection selector of the element containing the section spans
   * @param searchText exact text of the wanted section span
   * @returns {Promise<string|undefined>} id of the matching span, or undefined
   *   when no span matches (the last match wins if several do)
   */
  async findDownloadSection(downloadsection, searchText) {
    let wantedId;

    try {
      await this.page.waitFor(downloadsection);

      const body = await this.page.evaluate(() => document.documentElement.outerHTML);
      const $ = cheerio.load(body);

      $(`${downloadsection} span`).each((i, el) => {
        if ($(el).text() === searchText)
          wantedId = $(el).attr('id');
      });

      return wantedId;
    }
    catch (e) {
      throw new Error(e);
    }
  }

  /**
   * Expand the relevant section by clicking its span.
   *
   * @param elmId id of the section span
   * @returns {Promise<void>}
   */
  async expandArea(elmId) {
    await this.page.click(`span#${elmId}`);
  }

  /**
   * Find the element holding the download links for a section: the next
   * sibling of the section span's parent.
   *
   * @param elmId id of the section span
   * @returns {Promise<string|null>} the id attribute of that element
   */
  async findDownloadsLinksID(elmId) {
    return await this.page.$eval(`span#${elmId}`, e => e.parentElement.nextElementSibling.getAttribute('id'));
  }

  /**
   * Click every download link inside the element with the given id so the
   * browser saves the files into this.path.
   *
   * @param id id of the element that contains the links
   * @returns {Promise<void>}
   */
  async processDownloadLinks(id) {
    try {
      // Each link is duplicated in a P and an Image. We just use the one in the P tag.
      const clickableLinks = await this.page.$$(`[id="${id}"] p a`);
      const mouseDownDuration = IEScrape.notARobot();

      // The download target does not change between links, so it is set once.
      await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

      for (const item of clickableLinks) {
        // A failed click is reported and the remaining links are still tried.
        await item.click({ 'delay': mouseDownDuration }).catch(async (err) => {
          logger.warn(err);
          await this._uploadError();
        });
        await this._randomWait(this.page, 5, 10);
      }
    }
    catch (e) {
      await this._uploadError();
      throw new Error(e);
    }
  }

  /**
   * Expand one download section, screenshot it and download its files.
   *
   * @param dlSectionElm selector of the element containing the section spans
   * @param sectionTitle exact text of the section to grab
   * @returns {Promise<void>}
   */
  async grabSection(dlSectionElm, sectionTitle) {
    try {
      const section = await this.findDownloadSection(dlSectionElm, sectionTitle);

      // Without this guard the code would go on to click 'span#undefined'.
      if (typeof section === 'undefined')
        throw new Error(`Download section not found: ${sectionTitle}`);

      await this.expandArea(section);
      // Awaited so the screenshot is complete before the downloads start.
      await this._makeScreenshotV2(this.page, `${this.path}/${sectionTitle}`, null);

      const sectionID = await this.findDownloadsLinksID(section);

      await this.processDownloadLinks(sectionID);
      await this._randomWait(this.page, 5, 10);
    }
    catch (e) {
      await this._uploadError();
      throw new Error(e);
    }
  }

  /**
   * Grab the PDFs and screenshots for every wanted register section, then
   * emit 'done'.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    try {
      await this.start();
      await this._randomWait(this.page, 5, 10);
      await this._makeScreenshotV2(this.page, `${this.path}/Central Bank of Ireland Registers`, null);

      const sections = ['Registers of Payment Services Firms', 'Registers of E-Money Firms', 'Register of Credit Institutions'];

      for (const section of sections)
        await this.grabSection('#ctl00_cphRegistersMasterPage_downloadsSection', section);

      this.emit('done');
    }
    catch (e) {
      throw new Error(e);
    }
  }
}
module.exports = IEScrape;

889
ncas/it.js Normal file
View File

@ -0,0 +1,889 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('IT');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class ITscrape extends Scraper {
constructor() {
super();
this.setID('IT');
this.on('done', () => {
// this._saveLocalStorage(this.page, `${this.path}/${this.id}_localstorage.json`);
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @returns {Promise<void>}
*/
async forceScrollToTop() {
// Force the scroll
await this.page.evaluate(() => {
window.scrollBy(0, window.innerHeight);
});
// Force the hover
await this.page.hover('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a').catch((err) => {
logger.warn(err);
});
// Force the focus
await this.page.focus('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a');
}
/**
*
* @returns {Promise<void>}
*/
async forceEnglish() {
await this._randomWait(this.page, 2, 2, 'Force English');
await this.page.waitForSelector('#bs-example-navbar-collapse-1 > ul > li.dropdown > a', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 2, 2);
}).catch(() => {
logger.debug('No Language button');
});
await this._findAndClick('#bs-example-navbar-collapse-1 > ul > li.dropdown.open > ul > li:nth-child(2) > a');
}
/**
*
* @returns {Promise<void>}
*/
async handleFrontPage() {
let pageReturned = false;
await this._randomWait(this.page, 3, 5, 'handleFrontPage');
await this.page.waitFor('ul.linkgroup a', { 'visible':true }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(async (err) => {
logger.info('handleFrontPage: ul.linkgroup a Not found', err);
});
do{
await this.page.waitFor('#my-container > div.container > div', { 'visible':true, 'timeout':7500 }).then(() => {
pageReturned = true;
}).catch(async () => {
logger.info('We didnt transition back correctly, forcing another click..\n');
});
if (!pageReturned) {
await this.page.hover('ul.linkgroup a').catch((err) => {
logger.debug(err.name);
});
await this.page.focus('ul.linkgroup a').catch((err) => {
logger.debug(err.name);
});
await this.page.waitFor('ul.linkgroup a', { 'visible':true }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(async (err) => {
logger.info('handleFrontPage: ul.linkgroup a still not found', err.name);
});
}
}
while(!pageReturned);
// Supervisory registers and lists
}
/**
*
* @returns {Promise<void>}
*/
async handleSecondPage() {
try{
// sometimes this page takes a while to load...
const url = await this.page.evaluate('location.href');
await this._randomWait(this.page, 10, 13, 'handleSecondPage');
await this.page.waitForSelector('div.loading', { 'visible':false, 'timeout':90000 }).catch((e) => {
logger.warn('Ajax loading shroud not removed after 90 seconds');
});
await this.page.waitForSelector('ul.nav.navbar-nav.navbar-center li a', { 'visible':false, 'timeout':90000 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 5, 8, 'await transition');
}).catch((e) => {
logger.warn('Page Navigation navigation links failed to load / display');
});
// await this._findAndClick('ul.nav.navbar-nav.navbar-center li a', null, 'https://infostat.bancaditalia.it/GIAVAInquiry-public/ng/int-albi');
const newUrl = await this.page.evaluate('location.href');
if (url !== newUrl) {
logger.debug('The page Has changed!');
this.emit('pageChanged');
}
}
catch( err) {
logger.error('Failed to progress past second page', err);
this.emit('recover');
}
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractPSRegistry(html) {
try{
const registry = {};
const $ = cheerio.load(html);
const rows = $('app-details-anagrafica > div.row');
rows.each((index, item) => {
const divs = $(item).find('div');
if ($(item).children().length === 2) {
const name = this._cleanUp(divs.eq(0).text()) ;
registry[name] = this._cleanUp(divs.eq(1).text());
}
});
return registry;
}
catch (err) {
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error('extractPSRegistry\n', err);
}
else
logger.error('extractPSRegistry\n', err);
}
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractPSRegisters(html) {
try {
const registers = [];
const $ = cheerio.load(html);
const rows = $('app-details-albi div.ag-bl-center.ag-bl-full-height-center > div > div.ag-body > div.ag-body-viewport-wrapper > div > div div[role="row"]');
logger.info(`${rows.length} registers item${(rows.length !== 1) ? 's' : ''}`);
rows.each((index, item) => {
const divs = $(item).find('div');
const obj = {};
for (let counter = 0; counter < divs.length;counter++) {
const name = this._cleanUp(divs.eq(counter).attr('col-id'));
obj[name] = this._cleanUp(divs.eq(counter).text());
}
registers.push(obj);
});
return registers;
}
catch (err) {
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error('extractPSRegisters\n', err);
}
else
logger.error('extractPSRegisters\n', err);
}
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractPSAuthority(html) {
try{
const authority = [];
const $ = cheerio.load(html);
const rows = $('app-details-att-autorizzate div.ag-bl-center.ag-bl-full-height-center > div > div.ag-body > div.ag-body-viewport-wrapper > div > div div[role="row"]');
logger.info(`${rows.length} authority item${(rows.length !== 1) ? 's' : ''}`);
rows.each((index, item) => {
const divs = $(item).find('div');
const obj = {};
for (let counter = 0; counter < divs.length;counter++) {
const name = this._cleanUp(divs.eq(counter).attr('col-id'));
obj[name] = this._cleanUp(divs.eq(counter).text());
}
authority.push(obj);
});
return authority;
}
catch (err) {
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error('extractPSAuthority\n', err);
}
else
logger.error('extractPSAuthority\n', err);
}
}
/**
*
* @returns {Promise<void>}
*/
async preparePSSearch() {
try{
await this._randomWait(this.page, 3, 5, `preparePSSearch - ${this.modeTitles[this.mode]}`);
// Brute force the selector
await this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-search > div > div:nth-child(3) > div > input');
await this.page.waitForFunction(
'document.querySelector("#alboElenco").options.length > 1'
, { 'timeout':7500 }).then(() => {
logger.debug('Ajax done');
}).catch(() => {
throw new Error('Ajax not done');
});
const options = await this.page.$$('#alboElenco option');
const optionList = ['ALBO IP ART.114-SEPTIES TUB ', 'ALBO IMEL ITA EX 114-QUATER ', 'ALBO DELLE BANCHE '];
const wantedOption = [optionList[this.mode]];
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#alboElenco', value);
break;
}
}
// wait for loading shroud to go away
await this.page.waitForSelector('div.loading', { 'visible':false, 'timeout':25000 });
let btnSuccess = false;
do {
await this.page.waitForSelector('button.btn.btn-success', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(() => {
btnSuccess = true;
});
await this._randomWait(this.page, 1, 1, 'preparePSSearch btnSuccess');
}
while(!btnSuccess);
this.page.waitFor('app-int-albi-grid-result').then(async () => {
//
await this.forceEnglish();
await this.emit('processAgTable');
}).catch(async (err) => {
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error('No results transition\n', err);
}
else
logger.error('No results transition\n', err);
});
}
catch (err) {
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error('preparePSSearch\n', err);
}
else
logger.error('preparePSSearch\n', err);
}
}
/**
*
* @returns {Promise<{registry, authority, registers}>}
*/
async processPSDetail() {
let registry = {}, registers = {}, authority = {};
await this._randomWait(this.page, 3, 3, 'processPSDetail: AJAX');
// await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div.card.card-title > span > span', { 'visible': true }).catch((err) => {
logger.warn('AJAX data has failed to load');
logger.debug(err);
return { registry, registers, authority };
});
await this.page.waitFor('app-int-albi-details').then(async () => {
await this.forceScrollToTop();
const body = await this.page.content();
registry = await this.extractPSRegistry(body);
await this._randomWait(this.page, 2, 2, 'processPSDetail app-int-albi-details');
}).catch(async (err) => {
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error('processPSDetail\n', err);
}
else
logger.error('processPSDetail\n', err);
});
await this._randomWait(this.page, 1, 1, 'processPSDetail after app-int-albi-details');
//
await this.forceScrollToTop();
// wait for Registers Tab
await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul > li:nth-child(2) > a', { 'visible': true, 'timeout':10000 }).then(async (elm) => {
logger.debug('** Showing Registers Tab');
await elm.click({ 'delay':90 });
await this.page.waitFor('app-details-albi', { 'visible': true, 'timeout':10000 }).then(async () => {
const body = await this.page.content();
registers = await this.extractPSRegisters(body);
await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-albi');
}).catch(async (err) => {
if (process.env.NODE_ENV)
// await this._uploadError();
throw new Error('No tab transition\n', err);
else
logger.error('No tab transition');
});
await this._randomWait(this.page, 1, 1, 'processPSDetail after app-details-albi');
}).catch((err) => {
logger.warn('No "registers" Block...');
logger.debug(err);
});
// wait for Activity Tab
await this.forceScrollToTop();
await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul > li:nth-child(3) > a', { 'visible': true, 'timeout':10000 }).then(async (elm) => {
logger.debug('** Showing Activity Tab');
await elm.click({ 'delay':90 });
let pageReturned = false;
do
await this.page.waitFor('app-details-att-autorizzate', { 'visible': true, 'timeout':10000 }).then(async () => {
pageReturned = true;
const body = await this.page.content();
authority = await this.extractPSAuthority(body);
await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-att-autorizzate');
}).catch(async (err) => {
await this.forceScrollToTop();
await this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul > li:nth-child(3) > a');
if (process.env.NODE_ENV)
throw new Error('No tab transition\n', err);
else
logger.warn('No tab transition');
});
while(!pageReturned);
}).catch((err) => {
logger.warn('No "Activity" Block...');
logger.debug(err);
});
return { registry, registers, authority };
}
/**
*
* @returns {Promise<void>}
*/
async returnToPSList() {
try{
let pageReturned = false;
await this.page.hover('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a').catch((err) => {
logger.warn(err);
});
await this._findAndClick('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a');
do
await this.page.waitFor('app-int-albi-grid-result').then(() => {
pageReturned = true;
}).catch(async (err) => {
logger.warn('We didnt transition back correctly, forcing another click..\n', err);
await this.forceScrollToTop();
await this._findAndClick('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a');
});
while(!pageReturned);
}
catch (err) {
logger.error('returnToPSList\n', err);
this.emit('recover');
if (process.env.NODE_ENV)
await this._uploadError();
}
}
/**
*
* @returns {Promise<number>}
*/
async psGetMaxRows() {
const regExNumbersOnly = /\d{1,13}(?:,\d{0,2})?/g;
const elm = await this.page.$$('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(1) > p');
const text = await this.page.evaluate(el => el.innerText, elm[0]);
const numbers = regExNumbersOnly.exec(text);
return (numbers !== null) ? parseInt(numbers[0], 10) : -1;
}
async processDivs($, divs) {
const entries = {};
divs.each((index, item) => {
const itemText = this._cleanUp($(item).text());
const itemName = $(item).attr('col-id');
// logger.info(`>> ${index}`, itemName, itemText);
entries[itemName] = itemText;
});
return entries;
}
async psSetListCount(count) {
logger.debug('+ psSetListCount ');
await this.page.focus('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(7) > div > input');
for(let del = 0;del < 5;del++)
await this.page.keyboard.press('Backspace');
await this.page.keyboard.type(count.toString(), { 'delay': 100 }); // Types slower, like a user
await this.page.keyboard.press('Enter');
await this._randomWait(this.page, 10, 10, 'ajax refresh');
logger.debug('- psSetListCount ');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processAGTableV3(serviceObject) {
// this whole thing is ugly but at the moment it works
await this._randomWait(this.page, 3, 5, 'processAGTableV3');
const _defaultMaxPerPage = 10;
let workingData;
let elmStep;
let item;
let maxPages = 0;
let rowsInPass;
await this.psSetListCount(_defaultMaxPerPage);
const maxRows = await this.psGetMaxRows();
let remainingRows = maxRows;
logger.info('Max Rows', maxRows);
if (maxRows > _defaultMaxPerPage) {
maxPages = ~~(maxRows / _defaultMaxPerPage);
logger.info('Max pages:', maxPages);
}
for(let pageStep = 0; pageStep <= maxPages; pageStep++) {
logger.info('Pagestep', pageStep, (pageStep + 1) * _defaultMaxPerPage);
if (maxPages > 0)
if ((maxRows - ((pageStep ) * _defaultMaxPerPage)) > _defaultMaxPerPage)
rowsInPass = _defaultMaxPerPage;
else
rowsInPass = (maxRows - ((pageStep ) * _defaultMaxPerPage));
else
rowsInPass = maxRows;
logger.info(`Rows in this pass : ${rowsInPass}`);
for (let step = 0; step < rowsInPass; step++) {
for ( elmStep = 0; elmStep <= step; elmStep++) {
workingData = await this.page.$$(`div.ag-body-container div.ag-row[row-id="${elmStep}"]`);
item = workingData[0];
if (typeof item !== 'undefined')
await item.hover().catch((err) => {
logger.warn(err);
logger.info(item);
});
await this._microWait(this.page, 1);
}
await this._randomWait(this.page, 2, 2, 'processAGTableV3 after rows');
if (typeof item !== 'undefined') {
const html = await this.page.evaluate(el => el.innerHTML, item);
const clickable = await item.$('div[col-id="name"]');
const abiCodeElm = await item.$('div[col-id="abiCode"]');
const uid = await this.page.evaluate(el => el.innerText, abiCodeElm);
const clickName = await this.page.evaluate(el => el.innerText, clickable);
const $ = cheerio.load(html);
const divs = $('div');
logger.info(`Processing : ${clickName}, ${remainingRows} remain.`);
if (!serviceObject.workingData.has(uid)) {
// Exract all the data from the cells
const newEntry = await this.processDivs($, divs);
// Insert it in the map
serviceObject.workingData.set(uid, newEntry);
await this._randomWait(this.page, 2, 2, `Processing : ${clickName}`);
const filePath = await this._makeFilePath(clickName);
const fileName = this._makeFileName(clickName);
await this._randomWait(this.page, 2, 2, 'processAGTableV3 before ss');
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
serviceObject.links.push({ uid, 'fileName':`${fileName}.json`, 'name':clickName });
// Go into the detail
await clickable.click();
await this._randomWait(this.page, 3, 4, 'processAGTableV3 before next');
remainingRows--;
await this.page.waitFor('app-int-albi-details').then(
await this.doAlbiDetails(filePath, newEntry)
).catch(async (err) => {
logger.error('No detail transition', err);
this.emit('recover');
if (process.env.NODE_ENV)
await this._uploadError();
});
}
}
}
if (maxPages > 0) {
logger.info('Clicking to the next page...');
const nextButton = await this.page.$$('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(5) > button');
const buttonDisabled = await this.page.evaluate(el => el.disabled, nextButton[0]);
if (!buttonDisabled) {
this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(5) > button');
await this._randomWait(this.page, 5, 5, 'processAGTableV3 next page click');
}
}
}
logger.debug('processAGTableV3 DONE');
this.emit('doneProcessingGrid');
}
async doAlbiDetails(filePath, newEntry) {
try{
// process the page
const data = await this.processPSDetail();
data.details = newEntry;
logger.info(`Saving ${filePath}.json`);
await jsonfile.writeFile(`${filePath}.json`, data);
await this._randomWait(this.page, 5, 7, 'doAlbiDetails');
// Retun back to list
await this.returnToPSList();
await this._randomWait(this.page, 2, 2, 'doAlbiDetails after returnToPSList');
// wArray.push([uid, clickName]);
}
catch (err) {
logger.error('doAlbiDetails\n', err);
this.emit('recover');
if (process.env.NODE_ENV)
await this._uploadError();
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5, 'processNewPage');
const pageUrl = url.parse(await this.page.url());
switch (pageUrl.pathname) {
case '/compiti/vigilanza/albi-elenchi/index.html':
await this.handleFrontPage();
break;
case '/GIAVAInquiry-public/ng/':
await this.handleSecondPage();
break;
case '/GIAVAInquiry-public/ng/int-albi/search':
await this.preparePSSearch();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
// Need thiss for Angular based sites
// clear out stock recover handler
this.removeAllListeners('recover');
this.on('pageChanged', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
this.on('recover', this._debounce(async () => {
clearTimeout(this.backOffTimer);
logger.warn('Backing off for 5 minutes..');
const timeout = (60 * 1000) * 5;
this.backOffTimer = setTimeout(() => {
this.emit('restart');
// this.recover();
}, timeout);
}, 30000));
this.on('restart', this._debounce(async() => {
clearTimeout(this.backOffTimer);
logger.warn('Restarting::');
// await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
// use the Scraper recovery now to ensure crashed browser is resurrected
await this.__recover(this.startPage);
}, 15000));
this.on('processAgTable', async () => {
switch (this.mode) {
case 1:
await this.processAGTableV3(this.emoneyServices);
break;
case 2:
await this.processAGTableV3(this.creditServices);
break;
case 0:
default:
await this.processAGTableV3(this.paymentServices);
break;
}
});
this.on('doneProcessingGrid', async () => {
let curObj;
switch (this.mode) {
case 1:
curObj = this.emoneyServices;
break;
case 2:
curObj = this.creditServices;
break;
case 0:
default:
curObj = this.paymentServices;
break;
}
curObj.done = true;
curObj.items = curObj.links.length;
jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links':curObj.links });
jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, curObj);
this.mode++;
if (this.mode < 3) {
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5, 'doneProcessingGrid');
}
else
this.emit('done');
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1'],
'workingData': new Map([]),
'workingIndex': 0
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1'],
'workingData': new Map([]),
'workingIndex': 0
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1'],
'workingData': new Map([]),
'workingIndex': 0
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = '';
this.credit = '';
this.backOffTimer = 0;
this.setPath(path.resolve(`${__dirname }/../artefacts/IT/FSA`));
await this._doNonRepudiation(false, { 'sslWithPrefix':true }).catch((err) => {
logger.warn(err);
});
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5, 'After start');
}
catch(e) {
throw new Error(e);
}
}
/**
 * Entry point for a scraper run; simply awaits start().
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = ITscrape;

666
ncas/lt.js Normal file
View File

@ -0,0 +1,666 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('LT');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class LTScrape extends Scraper {
constructor() {
super();
this.id = 'LT';
this.addToBlockFilters(['smartlook.com', 'd10lpsik1i8c69', 'mouseflow.com', 'inspectlet.com']);
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @param path
* @returns {Promise<void>}
*/
async extractEntityIntermediaries(html, path = 'item-contra-intermediaries') {
try{
const newObj = { } ;
const $ = cheerio.load(html);
const rows = $(`#${path} li div.row`);
rows.each((i, li) => {
const children = $(li).children();
if ($(children).length === 2) {
const label = this._makeFieldName($(children).eq(0).text());
if (!newObj.hasOwnProperty(label))
newObj[label] = [];
newObj[label].push(this._cleanUp($(children).eq(1).text()));
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityList(html) {
try{
const newArray = [] ;
const $ = cheerio.load(html);
const rows = $('#item-lists li');
rows.each((i, li) => {
const children = $(li).children();
if ($(children).length === 1)
newArray.push(this._cleanUp($(children).eq(0).text()));
});
return newArray;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityActivity(html) {
try{
const newArray = [] ;
const $ = cheerio.load(html);
const rows = $('#item-activities tbody tr');
rows.each((i, li) => {
const children = $(li).children();
if ($(children).length === 3) {
const activity = this._cleanUp($(children).eq(0).text());
const from = this._cleanUp($(children).eq(1).text());
const to = this._cleanUp($(children).eq(2).text());
newArray.push({ activity, from, to });
}
});
return newArray;
}
catch( err) {
logger.error(err);
}
}
//
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityFOSContent(html) {
try{
const newObj = {} ;
const $ = cheerio.load(html);
const rows = $('#fos-content div.panel-heading');
rows.each((i, row) => {
const label = this._makeFieldName($(row).find('span.l').text());
if (!newObj.hasOwnProperty(label))
newObj[label] = [];
const sibling = $(row).next();
const tr = $(sibling).find('tbody tr');
tr.each((y, item) => {
const children = $(item).children();
if ($(children).length === 3) {
const activity = this._cleanUp($(children).eq(0).text());
const from = this._cleanUp($(children).eq(1).text());
const to = this._cleanUp($(children).eq(2).text());
newObj[label].push({ activity, from, to });
}
});
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
const spliterRX = /(.+)(?::\s+)(.+)/;
try{
const newObj = { } ;
const $ = cheerio.load(html);
const items = $('div.frd-props.text.row p');
items.each((i, elm) => {
const children = cheerio(elm).children();
if (children.length > 0) {
const propType = $(children.eq(0)).prop('name');
if (propType !== 'a') {
const ws = $(elm).text().match(spliterRX);
const label = this._makeFieldName(ws[1]);
newObj[label] = this._cleanUp(ws[2]);
}
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
async preBuildIndex(serviceObject) {
await this.page.waitForSelector('#cookies_msg > div > a', { 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':90 });
}).catch(() => {
logger.info('No cookie band...');
});
// Ensure that the max number f items is shown
await this.page.waitForSelector('#content > div > div:nth-child(4) > div.totals > form > span > button:nth-child(3)', { 'visible': true, 'timeout':7500 }).then(async (elm) => {
const cls = await this.page.evaluate(el => el.getAttribute('class'), elm);
logger.debug('button class', cls);
if (cls === null)
await elm.click({ 'delay':90 });
else
await this.buildIndex(serviceObject);
});
}
async expandAreas() {
const divs = ['item-activities', 'item-contra-intermediaries', 'item-intermediaries', 'item-lists', 'foe-countries'];
// #content > div > div:nth-child(4) > div > a:nth-child(2)
for (const item of divs)
await this.page.waitForSelector(`div#${item}`, { 'visible': false, 'timeout':2500 }).then(async (elm) => {
await this.page.evaluate(el => {
el.removeAttribute('class');
el.style.display = '';
}, elm);
}).catch(() => {
logger.debug(`No ${item}`);
});
// these needs to load content via ajax
const fosA = await this.page.$$('#content > div > div:nth-child(4) > div > a[href="#fos-countries"]');
if (fosA.length === 1) {
await this.page.waitForSelector('#content > div > div:nth-child(4) > div > a[href="#fos-countries"]', { 'timeout':2500 }).then(async (elm) => {
await elm.click({ 'delay':90 });
}).catch(() => {
logger.debug('No #fos-countries');
});
// #fos-countries > div > div > div.modal-body > div > div > i
await this.page.waitForSelector('#fos-countries > div > div > div.modal-body > div > div > i', { 'visible': false, 'timeout':10000 });
await this.page.waitForSelector('div#fos-countries', { 'visible': true, 'timeout':2500 }).then(async (elm) => {
await this.page.evaluate(async el => {
el.style.display = '';
await el.removeAttribute('class');
}, elm);
}).catch(() => {
logger.debug('No #fos-countries');
});
await this.page.waitForSelector('div.modal-backdrop.in', { 'visible': true, 'timeout':2500 }).then(async (elm) => {
await this.page.evaluate(async el => {
el.style.height = '0px';
el.style.display = 'none';
await el.removeAttribute('class');
}, elm);
}).catch(() => {
logger.debug('No #fos-countries');
});
}
}
async extractIndex(html) {
const links = [];
const slashRgx = /(\/\/)/;
const $ = cheerio.load(html);
const rows = $('table.table tbody tr');
rows.each((index, item) => {
const children = $(item.children);
const title = this._cleanUp($(children).eq(1).text()) ;
const type = this._cleanUp($(children).eq(3).text()) ;
const businessForm = this._cleanUp($(children).eq(5).text()) ;
const rawUrl = $(children).eq(1).find('a').attr('href');
const href = rawUrl.replace(slashRgx, 'https://');
links.push({ 'id': title, 'href': href, 'type': type, 'businessForm':businessForm });
});
return links;
}
async processEntityPage(serviceObject) {
const newObj = {};
const id = serviceObject.links[serviceObject.step].id;
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
await this._randomWait(this.page, 3, 5);
const entityName = serviceObject.links[serviceObject.step].id;
const fileName = this._makeFileName(entityName);
const filePath = await this._makeFilePath(entityName);
await this.expandAreas();
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
newObj.details = await this.extractEntityDetails(body);
newObj.contraIntermediaries = await this.extractEntityIntermediaries(body, 'item-contra-intermediaries');
newObj.intermediaries = await this.extractEntityIntermediaries(body, 'item-intermediaries');
newObj.list = await this.extractEntityList(body);
newObj.activity = await this.extractEntityActivity(body);
newObj.foeCountries = await this.extractEntityIntermediaries(body, 'foe-countries');
newObj.fosContent = await this.extractEntityFOSContent(body);
await jsonfile.writeFile(`${filePath}.json`, newObj);
await this._randomWait(this.page, 3, 5);
// await this._randomWait(this.page, 1000, 1000, 'Throttled');
serviceObject.links[serviceObject.step].filename = `${fileName}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
const body = await this.page.content();
const entityName = `${this.modeNames[this.mode]}`;
const filePath = await this._makeFilePath(entityName);
await this._makeScreenshotV2(this.page, filePath, null);
const links = await this.extractIndex(body);
serviceObject.links = links.slice();
this.emit('indexdone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.preBuildIndex(this.paymentServices);
break;
case 1:
await this.preBuildIndex(this.emoneyServices);
break;
case 2:
await this.preBuildIndex(this.creditServices);
break;
}
}
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityPage(this.paymentServices);
break;
case 1:
await this.processEntityPage(this.emoneyServices);
break;
case 2:
await this.processEntityPage(this.creditServices);
break;
}
}
async processNewPage() {
// give the page a few seconds to settle
const rX = /(\/en\/sfi-financial-market-participants)(\/?)/;
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
const pathName = pageUrl.pathname.match(rX)[0];
logger.debug(pathName);
switch (pathName) {
case '/en/sfi-financial-market-participants':
await this.indexRedirector();
break;
case '/en/sfi-financial-market-participants/':
await this.processRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
// logger.debug(this.paymentServices.links);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
// logger.debug(this.paymentServices.links);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
// logger.debug(this.paymentServices.links);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=6&type%5B%5D=20&business_form%5B%5D=28&business_form%5B%5D=27&business_form%5B%5D=89'],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=7&type%5B%5D=21&business_form%5B%5D=32&business_form%5B%5D=33'],
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=3&type%5B%5D=27&business_form%5B%5D=82&business_form%5B%5D=22&business_form%5B%5D=110'],
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/LT/LB`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
// start the browser
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Entry point for a scraper run; simply awaits start().
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = LTScrape;

790
ncas/lu.js Normal file
View File

@ -0,0 +1,790 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('LU');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Returns a wrapper around `func` that delays its invocation until `wait`
 * milliseconds have passed since the wrapper was last called.
 *
 * With `immediate` truthy, `func` is instead invoked on the leading edge:
 * right away on the first call, then not again until the wrapper has been
 * idle for `wait` milliseconds.
 *
 * `func` receives the `this` value and the arguments of the wrapper call
 * that triggered it.
 *
 * @param {Function} func function to debounce
 * @param {number} wait quiet period in milliseconds
 * @param {boolean} [immediate] invoke on the leading instead of trailing edge
 * @returns {Function} the debounced wrapper
 */
function debounce(func, wait, immediate) {
  let timeout = null;

  // Must be a regular function: an arrow function has no `this` / `arguments`
  // of its own and would forward debounce()'s own (func, wait, immediate).
  return function debounced(...args) {
    const context = this;
    const later = () => {
      timeout = null;
      if (!immediate) func.apply(context, args);
    };
    const callNow = immediate && !timeout;

    // Restart the quiet period on every call
    clearTimeout(timeout);
    timeout = setTimeout(later, wait);
    if (callNow) func.apply(context, args);
  };
}
class LUScrape extends Scraper {
constructor() {
super();
this.id = 'LU';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
this.debounceHandleIndexPage = debounce(() => {
// the index page sometimes reloads up to 3 times..
this.emit('handleIndexPage');
}, 7500);
}
/**
*
* @returns {Promise<void>}
*/
async handleIndexPage() {
const thisUrl = await this.page.url();
const pageUrl = url.parse(thisUrl);
switch (pageUrl.hash) {
case '#Home':
case '#AdvancedSearch':
await this.indexPageHomeMode();
break;
case '#ResultResearch':
this.emit('handleEntityIndex');
break;
case '#DetailEntity':
this.emit('processEntity');
break;
case null:
this.emit('selectSearchManually');
break;
default:
logger.error('HASH NOT RECOGNISED');
logger.error(pageUrl);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async indexPageHomeMode() {
try{
const searchType = ['6', '7', '1'];
const bodys = ['#advancedsearch_paymentservicestype-body', '#advancedsearch_electronicmoneytype-body', '#advancedsearch_banktype-body'];
const bankInputs = ['#advancedsearch_bankgroup1_inputEl', '#advancedsearch_bankgroupA_inputEl', '#advancedsearch_bankgroupB_inputEl',
'#advancedsearch_bankgroupC_inputEl', '#advancedsearch_bankgroupD_inputEl', '#advancedsearch_bankgroup2_inputEl', '#advancedsearch_bankgroup3_inputEl'];
// click the advanced search button
await this.page.waitForSelector('#menu_advanced').then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
});
// click
await this.page.waitForSelector('#advancedsearch_type-bodyEl').then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
});
await this._randomWait(this.page, 2, 2);
// call the EXT function to set the advanced search mode..
await this.page.evaluate(x => {
return Ext.getCmp('advancedsearch_type').setValue(x);
}, searchType[this.mode]);
// Mode 0 & Mode 1 have a list of options which can be iterated easily
// Mode 2 requires a handful of different inputs to be clicked on
await this._microWait(this.page, 7);
if (this.mode === 0) {
await this.page.waitForSelector('label#advancedsearch_paymentinstitutionsgroup1-boxLabelEl').then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
});
await this._randomWait(this.page, 2, 2);
}
if (this.mode === 0 && this.mode === 1) {
const options = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input.x-form-checkbox-default`);
// click all the elements
logger.debug('options length', options.length);
for (const item of options)
await item.click({ 'delay':Scraper.notARobot() });
}
if (this.mode === 2)
for(const bI of bankInputs) {
const input = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input${bI}`);
await input[0].click({ 'delay':Scraper.notARobot() });
}
await this._randomWait(this.page, 1, 1);
// click the button
await this.page.waitForSelector('#advancedsearch_searchbutton').then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
});
// now wait for the results to load..
await this.page.waitForSelector('#title-1083-textEl').then(async () => {
logger.debug('Results loaded');
this.emit('pageChanged');
});
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async entityIndexFirstPass(serviceObject) {
try{
const body = await this.page.content();
const $ = cheerio.load(body);
const pageDetails = await this.extractBarDetails($);
const { currentPageIndex, currentPageMax } = pageDetails;
if (((currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1))) || (currentPageIndex === 0 && currentPageMax === 0 )) {
serviceObject.currentIndexLength = pageDetails.currentIndexLength;
serviceObject.currentPageMax = currentPageMax;
serviceObject.visited = true;
serviceObject.currentIndex = url.parse(await this.page.url());
serviceObject.currentMetaIndex = 0;
}
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param $
* @returns {Promise<{currentIndexLength: number, maxPages: number, currentPageMax: number, page: number, currentPageIndex: number}>}
*/
async extractBarDetails($) {
try{
const numberExtract = /(\d+)/g;
const pagingBar = $('#resultresearch_paging-targetEl').children();
const page = parseInt($(pagingBar).eq(4).find('input').val(), 10);
const workMaxPages = this._cleanUp($(pagingBar).eq(5).text() );
const maxPages = parseInt(workMaxPages.match(numberExtract)[0], 10);
const rawDisplaying = this._cleanUp($(pagingBar).eq(pagingBar.length - 1).text());
const [ currentPageIndex, currentPageMax, currentIndexLength ] = rawDisplaying.match(numberExtract).map((s) => {
return parseInt(s, 10);
});
return { page, maxPages, currentPageIndex, currentPageMax, currentIndexLength };
}
catch( err) {
logger.error(err);
}
}
/**
 * Handles one visit to the result list: on the first visit the paging state
 * is captured, afterwards either the next list page is requested or the row
 * for the current step is read and double-clicked to open its detail view.
 * Both paths emit 'pageChanged'.
 *
 * Errors are caught and logged; nothing is rethrown.
 *
 * @param serviceObject state object of the service being indexed
 * @returns {Promise<void>}
 */
async processEntityIndex(serviceObject) {
try{
// Names given, in order, to the cells of a result row
const fields = ['type', 'name', 'address'];
logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 1, 2);
if (serviceObject.visited === false) {
logger.debug('Preparing...');
serviceObject.restart = false;
await this.entityIndexFirstPass(serviceObject);
}
if (serviceObject.visited === true) {
// Position of the current step within the current list page
serviceObject.currentMetaIndex = serviceObject.step % serviceObject.currentPageMax;
logger.debug('serviceObject.currentMetaIndex', serviceObject.currentMetaIndex);
// `restart` is set once a row of this page has been handled, so a wrapped
// index of 0 with `restart` true means the page has been exhausted
if ((serviceObject.step > 0) && (serviceObject.currentMetaIndex === 0) && (serviceObject.restart === true)) {
logger.debug('Maxed out this page..');
// serviceObject.visited = false;
serviceObject.restart = false;
await this.page.waitForSelector('#button-1052').then(async (elm) => {
logger.debug('Proceeding to next index page..');
await elm.click({ 'delay':Scraper.notARobot() });
this.emit('pageChanged');
});
}
else {
logger.debug('dealing...');
serviceObject.restart = true;
logger.debug(`div#ResultResearchGridView table:nth-child(${serviceObject.currentMetaIndex + 1})`);
const wantedRow = await this.page.$$(`div#ResultResearchGridView table:nth-child(${serviceObject.currentMetaIndex + 1})`);
const htmlTable = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);
const $ = cheerio.load(`<table>${htmlTable}</table>`);
const cells = $('div.x-grid-cell-inner');
serviceObject.current = {};
cells.each((index, item) => {
serviceObject.current[ fields[index] ] = this._cleanUp($(item).text());
});
// File name and path are only derived when the row yielded a name
if (typeof(serviceObject.current.name ) !== 'undefined' && serviceObject.current.name !== '') {
const fileName = this._makeFileName(serviceObject.current.name);
serviceObject.current.fileName = fileName;
// Path is capped at 240 characters
serviceObject.current.filePath = `${this.path}/${fileName}`.substring(0, 240);
}
// logger.debug(serviceObject);
await this._randomWait(this.page, 3, 5);
// Double-click the row
await wantedRow[0].click({ 'delay':97, 'clickCount': 2 });
await this._randomWait(this.page, 1, 1);
this.emit('pageChanged');
}
}
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param $
* @param html
* @param divId
* @param sequence
* @returns {Promise<Array>}
*/
async extractGridPanel($, html, divId, sequence) {
try{
const outObj = [];
const elms = $(html).find(`${divId} div.x-grid-item-container table`);
elms.each((index, itm) => {
const newObj = {};
for(const seqItem of sequence) {
const mclass = `.x-grid-cell-${seqItem[0]}`;
const rowElm = $(itm).find(mclass);
newObj[seqItem[1]] = this._cleanUp($(rowElm).text());
}
outObj.push(newObj);
});
return outObj;
}
catch( err) {
logger.error(err);
}
}
/**
 * Extracts the detail view of an entity: the scalar fields listed in
 * `detailSequence` plus one array per grid panel listed in `gridPanels`.
 *
 * @param html page markup to parse
 * @returns {Promise<Object|undefined>} the details object, or undefined when
 *   an error was caught and logged
 */
async extractEntityDetails(html) {
try{
const details = {};
// [element id, output field name] pairs for the scalar fields
const detailSequence = [['detailEntity_type_inputEl', 'type'],
['detailEntity_number_inputEl', 'number'],
['detailEntity_name_inputEl', 'name'],
['detailEntity_address_inputEl', 'address'],
['detailEntity_startdate_inputEl', 'startdate'],
['detailEntity_closeddate_inputEl', 'closedate'],
['detailEntity_countrycode_inputEl', 'countrycode'],
['detailEntity_group_inputEl', 'group'],
['detailEntity_subgroup_inputEl', 'subgroup'],
['detailEntity_iciOutside_inputEl', 'iciOutside'],
['detailEntity_icilinked_inputEl', 'icilinked']
];
// One entry per grid panel: `id` is the output key, `divId` the panel body
// selector and `sequence` the [cell class suffix, output field name] pairs
// handed to extractGridPanel()
const gridPanels = [{
'id': 'autorisedStatus',
'sequence': [['detailEntity_autorisedStatus', 'autorisedStatus'],
['detailEntity_recentChangeautorisedStatus', 'recentChangeautorisedStatus'],
['detailEntity_recentChangeautorisedDate', 'recentChangeautorisedDate']],
'divId': '#detailEntity_autorisedStatusGridPanel-body'
}, {
'id': 'agentOrBranch',
'sequence': [['detailEntity_agentorbranchData', 'agentorbranchData'], ['detailEntity_agentData', 'agentData'],
['detailEntity_branchData', 'branchData'], ['detailEntity_agentorbranchCountry', 'agentorbranchCountry'],
['detailEntity_agentorbranchAddress', 'agentorbranchAddress'], ['detailEntity_agentorbranchlegalstatus', 'agentorbranchlegalstatus']],
'divId': '#detailEntity_agentorbranchGridPanel-body'
}, {
'id': 'iciOutsideTable',
'sequence': [['detailEntity_iciOutsideMember', 'iciOutsideMember']],
'divId': '#detailEntity_iciOutsideGridPanel-body'
}, {
'id': 'icilinkedTable',
'sequence': [['detailEntity_icilinkedname', 'icilinkedname'], ['detailEntity_icilinkedstartingdate', 'icilinkedstartingdate'],
['detailEntity_icilinkedendingdate', 'icilinkedendingdate']],
'divId': '#detailEntity_icilinkedGridPanel-body'
}, {
'id': 'othersStatus',
'sequence': [['detailEntity_otherStatus', 'otherStatus'], ['detailEntity_recentChangeotherStatus', 'recentChangeotherStatus'],
['detailEntity_recentChangeotherDate', 'recentChangeotherDate']],
'divId': '#detailEntity_othersStatusGridPanel-body'
}, {
'id': 'services',
'sequence': [['detailEntity_service', 'service'], ['detailEntity_recentChangeservice', 'recentChangeservice'],
['detailEntity_recentChangeserviceDate', 'recentChangeserviceDate']],
'divId': '#detailEntity_servicesGridPanel-body'
}, {
'id': 'ancillaryservices',
'sequence': [['detailEntity_ancillaryservice', 'ancillaryservice'],
['detailEntity_recentChangeancillaryservice', 'recentChangeancillaryservice'],
['detailEntity_recentChangeancillaryserviceDate', 'recentChangeancillaryserviceDate']],
'divId': '#detailEntity_ancillaryservicesGridPanel-body'
}, {
'id': 'prestataire',
'sequence': [['detailEntity_prestatairename', 'prestatairename'], ['detailEntity_prestataireheadoffice', 'prestataireheadoffice'],
['detailEntity_prestataireauthorisation', 'prestataireauthorisation']],
'divId': '#detailEntity_prestataireGridPanel-body'
}, {
'id': 'historicName',
'sequence': [['detailEntity_historicNameName', 'historicNameName'], ['detailEntity_historicNameDate', 'historicNameDate']],
'divId': '#detailEntity_historicNameGridPanel-body'
}];
const $ = cheerio.load(html);
// Every lookup below is scoped to the detail panel
const mainDiv = $('#promoteDetailEntityPanel-innerCt');
for(const item of detailSequence) {
const i = $(mainDiv).find(`#${item[0]}`);
details[item[1]] = this._cleanUp($(i).text());
}
for( const grid of gridPanels)
details[grid.id] = await this.extractGridPanel($, mainDiv, grid.divId, grid.sequence);
return details;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntity(serviceObject) {
try{
logger.info(`Process ${this.modeTitles[this.mode]} entity:${serviceObject.current.name}`);
logger.info(`Step ${serviceObject.step} of ${serviceObject.currentIndexLength}`);
await this._randomWait(this.page, 3, 5);
const filePath = serviceObject.current.filePath;
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
serviceObject.current.details = await this.extractEntityDetails(body);
this.emit('entityComplete');
logger.info('Entity complete...');
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<null>}
*/
async entityCompleter(serviceObject) {
try{
const filename = serviceObject.current.fileName;
const filePath = serviceObject.current.filePath;
const newObj = {};
logger.info(`Saving: ${filename}.json`);
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
await this._randomWait(this.page, 3, 5);
newObj.fileName = `${filename}.json`;
newObj.name = serviceObject.current.name;
newObj.number = serviceObject.current.details.number || '';
serviceObject.links.push(newObj);
serviceObject.step++;
if (serviceObject.step < serviceObject.currentIndexLength) {
serviceObject.current = {};
await this.page.waitForSelector('a#detailEntity_backtolist').then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
this.emit('pageChanged');
});
}
else
this.emit('serviceDone');
}
catch( err) {
logger.error(err);
}
}
/**
*
* @returns {Promise<void>}
*/
async handleProcessEntity() {
switch (this.mode) {
case 1:
await this.processEntity(this.emoneyServices);
break;
case 2:
await this.processEntity(this.creditServices);
break;
case 0:
default:
await this.processEntity(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async handleEntityComplete() {
switch (this.mode) {
case 1:
await this.entityCompleter(this.emoneyServices);
break;
case 2:
await this.entityCompleter(this.creditServices);
break;
case 0:
default:
await this.entityCompleter(this.paymentServices);
break;
}
}
async processNewPage() {
// give the page a few seconds to settle
// await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
if (pageUrl.href === 'about:blank') return;
if (pageUrl.pathname === '/index.html')
this.debounceHandleIndexPage();
else
if (process.env.NODE_ENV === 'production') {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
// Need thiss for Angular / EXT based sites
this.on('pageChanged', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 1000));
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('handleIndexPage', () => {
this.handleIndexPage();
});
this.on('processEntity', () => {
this.handleProcessEntity();
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('handleEntityIndex', async () => {
switch (this.mode) {
case 1:
await this.processEntityIndex(this.emoneyServices);
break;
case 2:
await this.processEntityIndex(this.creditServices);
break;
case 0:
default:
await this.processEntityIndex(this.paymentServices);
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
this.emit('pageChanged');
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
this.emit('pageChanged');
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
this.on('selectSearchManually', async () => {
logger.debug('Locating advanced search button');
await this.page.waitForSelector('#menu_advanced', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':90 });
}).catch(() => {
logger.error('No advanced search button');
});
await this.page.waitForSelector('#promoteAdvancedSearchPanel-body', { 'visible':true, 'timeout':7500 }).then(async () => {
await this.indexPageHomeMode();
}).catch(() => {
logger.error('No advanced search form');
});
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/LU/CSSF`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 1000));
if (this.eventNames().length === 2)
await this.attachEvents();
await this._makeResponsive();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'load' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Runs the scraper: awaits `start()` and resolves once it has finished.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = LUScrape;

626
ncas/lv.js Normal file
View File

@ -0,0 +1,626 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('LV');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class LVScrape extends Scraper {
constructor() {
super();
this.id = 'LV';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param serviceObject
* @param html
* @returns {Promise<void>}
*/
async processIndex(serviceObject, html) {
const newArray = [] ;
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
const $ = cheerio.load(html);
const links = $('div.featured-articles-title a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = this._cleanUp($(item).text());
const newUrl = `${this.rootURI}${href}`;
const id = this._makeFieldName(text);
newArray.push({ 'name':text, 'href':newUrl, 'id':id });
});
serviceObject.links = serviceObject.links.concat(newArray);
const filename = this.modeNames[this.mode];
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
this.emit('indexdone');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
// ('div.featured-articles-title')
await this.page.waitForSelector('table#organizcijasList', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
logger.debug('Menu details.');
const elmHtml = await this.page.evaluate(el => el.outerHTML, elm);
await this.processIndex(serviceObject, elmHtml);
}).catch(() => {
logger.info('No show all button');
});
}
/**
*
* @param html
* @param section
* @returns {Promise<void>}
*/
async extractEntitySections(html, section) {
const httpRegEx = /(http|ftp|https):\/\//;
const filenameFromURL = /(?:\/.*\/)(.*)/;
try{
const newObj = { } ;
const $ = cheerio.load(html);
// const wantedItem = $('div#featured-articles-title');
// const wantedItem = $('h2:contains("Sanctions")');
const wantedItem = $(section);
if (wantedItem.length === 0) return newObj;
newObj.name = this._cleanUp($(wantedItem).text());
const sibling = $(wantedItem).next();
const rows = $(sibling).find('tbody tr');
rows.each((i, item) => {
const children = $(item).children();
if ($(children).length === 2) {
const label = this._makeFieldName($(children).eq(0).text());
newObj[label] = this._cleanUp($(children).eq(1).text());
}
if ($(children).length === 1) {
const label = 'notes';
if (!newObj.hasOwnProperty(label))
newObj[label] = [];
newObj[label].push(this._cleanUp($(children).eq(0).text()));
const links = $(item).find('a');
if ($(links).length > 0)
links.each((y, link) => {
const href = $(link).attr('href');
const text = this._cleanUp($(link).text());
if (href.match(httpRegEx) === null) {
const fileName = href.match(filenameFromURL);
if (!newObj.hasOwnProperty('links'))
newObj['links'] = [];
newObj['links'].push({ href, text, 'filename': fileName[1] });
}
});
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @param section
* @returns {Promise<Array>}
*/
async extractEntitySubSections(html, section) {
try{
const newObj = [] ;
const $ = cheerio.load(html);
const wantedItem = $(section);
if (wantedItem.length === 0) return newObj;
const sibling = $(wantedItem).next();
const rows = $(sibling).find('tbody tr');
let newItem = {};
rows.each((i, item) => {
const children = $(item).children();
if (i === 0 || $(children).length === 1) {
if (Object.keys(newItem).length !== 0)
newObj.push(newItem);
newItem = {};
}
if ($(children).length === 2) {
const label = this._makeFieldName($(children).eq(0).text());
newItem[label] = this._cleanUp($(children).eq(1).text());
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractEntityLicenses(html) {
try{
const newObj = [] ;
const $ = cheerio.load(html);
const wantedItem = $('h2:contains("Licenses / Types of activities")');
if (wantedItem.length === 0) return newObj;
const sibling = $(wantedItem).next();
const rows = $(sibling).find('tbody tr');
let newItem = {};
rows.each((i, item) => {
const children = $(item).children();
if (i === 0 || $(children).length === 1) {
if (Object.keys(newItem).length !== 0)
newObj.push(newItem);
newItem = {};
}
if ($(children).length === 2) {
const label = this._makeFieldName($(children).eq(0).text());
newItem[label] = this._cleanUp($(children).eq(1).text());
}
});
logger.debug(JSON.stringify(newObj));
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
// const filenameFromURL = /(?:\/.*\/)(.*)/;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this.page.waitForSelector('#featured-articles-title > h2', { 'visible':true, 'timeout':7500 });
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
// --
const details = await this.extractEntitySections(body, 'div#featured-articles-title');
const marketSegments = await this.extractEntitySubSections(body, 'h2:contains("Market segments")');
const relatedPersons = await this.extractEntitySubSections(body, 'h2:contains("Related persons")');
const licenses = await this.extractEntityLicenses(body);
const sanctions = await this.extractEntitySections(body, 'h2:contains("Sanctions")');
const qualifyHoldings = await this.extractEntitySubSections(body, 'h2:contains("Qualifying holdings")');
// --
await jsonfile.writeFile(`${filePath}.json`, { details, marketSegments, relatedPersons, licenses, sanctions, qualifyHoldings });
await this._randomWait(this.page, 3, 5);
if (details.hasOwnProperty('links')) {
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
for(const items of details.links) {
const href = `${this.rootURI}${items.href}`;
await this.page.goto(href, { 'waitUntil': 'networkidle0' }).catch((err) => {
// log this error but Puppeteer isn't supposed to support this sort of download....
// mute the ERR_ABORTED error which happens everytime but alert for everything else.
if (!err.message.includes('net::ERR_ABORTED') )
logger.error('grabLink', err);
});
}
}
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const urlSplitter = /(\/en\/.*\/)(.*)/;
const pageUrl = url.parse(await this.page.url());
const splitUrl = pageUrl.pathname.match(urlSplitter);
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
if (splitUrl === null) return;
switch (splitUrl[1]) {
case '/en/market/payment-institutions/':
case '/en/market/electronic-money-institutions/':
case '/en/market/credit-institutions/':
await this.indexRedirector();
break;
case '/en/market/payment-institutions/authorized-payment-institutions/':
case '/en/market/payment-institutions/registered-payment-institutions/':
case '/en/market/electronic-money-institutions/authorized-electronic-money-institutions/':
case '/en/market/electronic-money-institutions/registered-electronic-money-institutions/':
case '/en/market/credit-institutions/banks/':
await this.processRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async () => {
let newUrl;
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
this.paymentServices.indexStep++;
if (this.paymentServices.indexStep >= this.paymentServices.urls.length)
newUrl = this.paymentServices.links[this.paymentServices.step].href;
else
newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
let newUrl;
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
this.emoneyServices.indexStep++;
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length)
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
else
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
let newUrl;
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
this.creditServices.indexStep++;
if (this.creditServices.indexStep >= this.creditServices.urls.length)
newUrl = this.creditServices.links[this.creditServices.step].href;
else
newUrl = this.creditServices.urls[this.creditServices.indexStep];
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.rootURI = 'http://www.fktk.lv';
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['http://www.fktk.lv/en/market/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-institutions/registered-payment-institutions.html'],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['http://www.fktk.lv/en/market/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/electronic-money-institutions/registered-electronic-money-institutions.html'],
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['http://www.fktk.lv/en/market/credit-institutions/banks.html'],
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/LV/FCMC`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Runs the scraper: awaits `start()` and resolves once it has finished.
 * Called by the throttled `run` wrapper created in the constructor.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = LVScrape;

818
ncas/mt.js Normal file
View File

@ -0,0 +1,818 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('MT');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class MTScrape extends Scraper {
constructor() {
super();
this.id = 'MT';
this.on('done', () => {
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
 * Legacy extractor (kept under the OLD prefix next to `extractEntityV2`).
 *
 * Reads the entity name from `#lblName`, the label/value pairs from the
 * children of `div#pnlCommonDetails`, and the authorisations listed after each
 * `#LHDetails span.fix-width-caption` caption.
 *
 * @param html page HTML
 * @returns {Promise<{authorization, details}>}
 */
async OLDextractEntity(html) {
const $ = cheerio.load(html);
const details = {};
const authorization = {};
details.name = this._cleanUp($('#lblName').text());
const dlCells = $('div#pnlCommonDetails').children();
const superCells = $('#LHDetails span.fix-width-caption');
// #lblStatus
dlCells.each((index, item) => {
// the registration-date panel holds "label : value" in a single span
if ($(item).attr('id') === 'pnlRegDate') {
const itemText = this._cleanUp($(item).find('span').text()).split(/\s*:\s*/);
details[itemText[0]] = itemText[1];
}
else {
// every other panel: label in the <p> (colon stripped), value in the <span>
const current = this._cleanUp($(item).find('p').text()).replace(/\s*:\s*/, '');
details[current] = this._cleanUp($(item).find('span').text());
}
});
superCells.each((index, item) => {
// the element following the caption holds the values for that caption
const nextElm = $($(item).next());
const li = $(nextElm).find('li');
const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');
authorization[thisId] = [];
if (li.length > 0)
li.each((index, item) => {
// each list item is split on ' - '; only the second part is cleaned
// NOTE(review): auth[1] is undefined when the item has no ' - ' - confirm _cleanUp copes
const auth = $(item).html().split(' - ');
auth[1] = this._cleanUp(auth[1]);
authorization[thisId].push(auth);
});
else {
// no list: keep the element's whole text as the single entry
const itemText = this._cleanUp($(nextElm).text());
authorization[thisId].push(itemText);
}
});
return { details, authorization };
}
/**
*
* @param html
* @returns {Promise<{authorization, details}>}
*/
async extractEntityV2(html) {
const trimToColon = /^.*?(?=(:))/;
const $ = cheerio.load(html);
const details = {};
const authorization = {};
const errors = [];
details.name = this._cleanUp($('div#mainTitle > div').text());
const dlCells = $('table#tableLicenceResult tr');
const superCells = $('#LHDetails span.fix-width-caption');
let previousLabel = '';
dlCells.each((index, item) => {
const children = $(item).children();
const rawLabel = $(children).eq(0).text().match(trimToColon);
const itemValue = this._cleanUp($(children).eq(1).text().trim());
if (rawLabel !== null ) {
const itemLabel = this._cleanUp(rawLabel[0]);
details[itemLabel] = itemValue;
previousLabel = itemLabel;
}
else
details[previousLabel] = details[previousLabel].concat([itemValue]);
});
previousLabel = '';
superCells.each((index, item) => {
const nextElm = $($(item).next());
const children = $(nextElm).children();
if ($(children).length <= 1) {
const li = $(nextElm).find('li');
const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');
authorization[thisId] = [];
if (li.length > 0)
li.each((index, item) => {
const auth = $(item).text().split(' - ');
auth[1] = this._cleanUp(auth[1]);
if (auth[1] !== '')
authorization[thisId].push(auth);
});
else {
const itemText = this._cleanUp($(nextElm).text());
authorization[thisId].push(itemText);
}
}
else {
logger.warn('Possible error in the HTML');
logger.warn($(nextElm).html());
errors.push($(nextElm).html());
}
});
const outObj = { details, authorization };
if (errors.length > 0)
outObj.errors = errors;
return outObj;
}
/**
 * Legacy index reader (kept under the OLD prefix next to `processIndexV2`).
 *
 * Reads the licence-holder links from the current page of the paged grid and
 * appends them to `serviceObject.links`. It then either clicks the "next
 * page" control or, on the last page, emits `indexdone`.
 *
 * @param serviceObject service state; `links` is extended, `indexStep` and `indexMetaStep` are used
 * @returns {Promise<void>}
 */
async OLDprocessIndex(serviceObject) {
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
// the <strong> elements of the grid footer; the second one is read as the page count
const pagingItem = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tfoot > tr > td > table > tbody > tr > td > div.rgWrap.rgInfoPart strong');
// NOTE(review): pagingItem[1] is undefined when exactly one <strong> is found - confirm this cannot happen
const maxPagesText = (pagingItem.length > 0) ? await this.page.evaluate(el => el.innerText, pagingItem[1]) : '0';
const maxPages = parseInt(maxPagesText, 10);
const links = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tbody > tr > td> a');
for (const item of links) {
const id = await this.page.evaluate(el => el.innerText, item);
const href = await this.page.evaluate(el => el.href, item);
// the entity id is taken from the link's `id` query parameter
const params = this._getParamsFromUrl(href);
serviceObject.links.push({ id, href, 'entId': params.id, 'metaStep': serviceObject.indexMetaStep });
}
// indexStep counts pages already read, so stop once it reaches the last page
if (serviceObject.indexStep < (maxPages - 1) ) {
serviceObject.indexStep++;
await this._findAndClick('input.rgPageNext');
}
else
this.emit('indexdone');
}
async processIndexV2(serviceObject) {
// #tableResult span
const numberRegEx = /\d+/;
logger.debug('+ processIndexV2');
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
const links = await this.page.$$('#tableResult span');
for (const item of links) {
const id = await this.page.evaluate(el => el.innerText, item);
const href = await this.page.evaluate(el => el.getAttribute('onclick'), item);
serviceObject.links.push({ id, 'entId': href.match(numberRegEx)[0], 'metaStep': serviceObject.indexMetaStep });
}
this.emit('indexdone');
}
/**
 * Legacy search starter (kept under the OLD prefix next to `initiateIndexV2`).
 *
 * Opens the two combo boxes on the old search form, picks the options named by
 * `serviceObject.indexMeta[serviceObject.indexMetaStep]` and, when both were
 * found, marks the service as started and clicks the search button.
 *
 * @param serviceObject service state; `indexMeta`/`indexMetaStep` are read, `started` is set
 * @returns {Promise<void>}
 */
async OLDinitiateIndex(serviceObject) {
logger.debug('initiateIndex');
const matched = { 'left':false, 'right':false };
// first time around.
// need to kick off the index correctly..
await this._findAndClick('#ctl00_cphMain_RadComboBox1');
await this._randomWait(this.page, 2, 3);
const leftOptions = await this.page.$$('#ctl00_cphMain_RadComboBox1_DropDown > div > ul.rcbList li');
const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];
// left box: first option whose text is contained in wantedOption
for (const item of leftOptions) {
const text = await this.page.evaluate(el => el.innerText, item);
if (wantedOption.indexOf(text) !== -1) {
await item.click({ 'delay':95 });
matched.left = true;
// this element can take a while to reload..
break;
}
}
await this._randomWait(this.page, 7, 9);
await this._findAndClick('#ctl00_cphMain_RadComboBox2_Input');
await this._randomWait(this.page, 2, 3);
const rightOptions = await this.page.$$('#ctl00_cphMain_RadComboBox2_DropDown > div > ul.rcbList li');
// right box: option whose text equals wantedOption[1] exactly
for (const item of rightOptions) {
const text = await this.page.evaluate(el => el.innerText, item);
if (text === wantedOption[1]) {
matched.right = true;
await item.click({ 'delay':95 });
break;
}
}
// Wait for items to settle
await this._randomWait(this.page, 2, 3);
if (matched.left && matched.right) {
serviceObject.started = true;
await this._findAndClick('#cphMain_btnSearch2');
}
else
logger.error('Not fully matched', matched);
}
/**
* Reworked for site reskin
* @param serviceObject
* @returns {Promise<void>}
*/
async initiateIndexV2(serviceObject) {
logger.debug('initiateIndexV2');
const matched = { 'left':false, 'right':false };
// first time around.
// need to kick off the index correctly..
// select#select1
const leftOptions = await this.page.$$('select#select1 option');
const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];
for (const item of leftOptions) {
const rawText = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
const text = this._cleanUp(rawText);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('select#select1', value);
matched.left = true;
break;
}
}
// Wait for items to setttle
await this._randomWait(this.page, 2, 3);
const rightOptions = await this.page.$$('select#select2 option');
for (const item of rightOptions) {
const rawText = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
const text = this._cleanUp(rawText);
if (text === wantedOption[1]) {
matched.right = true;
await this.page.select('select#select2', value);
break;
}
}
await this._randomWait(this.page, 2, 2);
if (matched.left && matched.right) {
serviceObject.started = true;
await this._findAndClick('button.searchButtonAdv');
this.emit('processIndex');
}
else
logger.error('Not fully matched', matched);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
logger.debug('buildIndex');
if (!serviceObject.started)
await this.initiateIndexV2(serviceObject);
else
await this.processIndexV2(serviceObject);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async nextItem(serviceObject) {
const entId = serviceObject.links[serviceObject.step].entId;
logger.debug('nextItem', entId);
await this.newLoadLicenceHolder(entId);
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
if (!this.processing)
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
else
switch (this.mode) {
case 0:
await this.nextItem(this.paymentServices);
break;
case 1:
await this.nextItem(this.emoneyServices);
break;
case 2:
await this.nextItem(this.creditServices);
break;
}
}
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { id, entId } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step}:${id}`);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_'), `_${entId}`].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const details = await this.extractEntityV2(body);
await jsonfile.writeFile(`${filePath}.json`, { details });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items)
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
else
this.emit('serviceDone');
}
// processIndex
async handleProcessIndex() {
switch (this.mode) {
case 0:
await this.processIndexV2(this.paymentServices);
break;
case 1:
await this.processIndexV2(this.emoneyServices);
break;
case 2:
await this.processIndexV2(this.creditServices);
break;
}
}
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
async processNewPage() {
// give the ajax page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
logger.debug('processNewPage', pageUrl.href);
switch (pageUrl.pathname) {
case '/pages/licenceholders.aspx':
case '/financial-services-register/':
await this.indexRedirector();
break;
case'/pages/licenceholder.aspx':
case '/financial-services-register/result/':
await this.processRedirector();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
this.emit('backoff');
throw new Error(`Unknown page: ${pageUrl.href}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('pathName', pathName);
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
* Replaces the goto
* @param id
* @returns {Promise<void>}
*/
async newLoadLicenceHolder(id) {
// loadLicenceHolder(10966)
const formElm = await this.page.$('form#loadHolder');
logger.debug('loadLicenceHolder', id);
await this.page.evaluate(x => {
x.target = '_self';
}, formElm);
await this._microWait(this.page, 5);
await this.page.evaluate(x => {
return loadLicenceHolder(x);
}, id);
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('processIndex', async () => {
this.handleProcessIndex();
});
//
this.on('pageChanged', this._debounce(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 1000));
this.on('psindexdone', async () => {
this.paymentServices.indexMetaStep++;
if (this.paymentServices.indexMetaStep < this.paymentServices.indexMeta.length) {
logger.info('Resetting for next meta index...');
// next..
this.paymentServices.started = false;
this.paymentServices.indexStep = 0;
await this._goto(this.startPage);
}
else {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
logger.warn('GO THROUGH THE NEW LIST!!!!');
this.processing = true;
await this._randomWait(this.page, 2, 2, 'New page transition');
}
});
this.on('emindexdone', async () => {
this.emoneyServices.indexMetaStep++;
if (this.emoneyServices.indexMetaStep < this.emoneyServices.indexMeta.length) {
logger.info('Resetting for next meta index...');
// next..
this.emoneyServices.started = false;
this.emoneyServices.indexStep = 0;
await this._goto(this.startPage);
}
else {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
logger.warn('GO THROUGH THE NEW LIST!!!!');
this.processing = true;
await this._randomWait(this.page, 2, 2, 'New page transition');
}
});
this.on('ciindexdone', async () => {
this.creditServices.indexMetaStep++;
if (this.creditServices.indexMetaStep < this.creditServices.indexMeta.length) {
logger.info('Resetting for next meta index...');
// next..
this.creditServices.started = false;
this.creditServices.indexStep = 0;
await this._goto(this.startPage);
}
else {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
logger.warn('GO THROUGH THE NEW LIST!!!!');
this.processing = true;
await this._randomWait(this.page, 2, 2, 'New page transition');
}
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('paymentServicesDone', async () => {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.processing = false;
await this._goto(this.emoneyServices.urls[0]);
});
this.on('emoneyServicesDone', async () => {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.processing = false;
await this._goto(this.creditServices.urls[0]);
});
this.on('creditServicesDone', async () => {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.emit('done');
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.processing = false;
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];
this.paymentServices = {
'items': 0,
'links': [],
'step': 46,
'indexStep': 0,
'indexMetaStep':0,
'visited': false,
'done' : false,
'started': false,
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
'indexMeta' : [
['Financial Institutions',
'Financial Institutions licensed to undertake payment services under the 2nd Schedule to the Financial Institutions Act (Payment Institutions)'],
['Financial Institutions',
'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to provide services outside Malta'],
['Financial Institutions',
'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to establish a branch outside Malta']
]
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'indexMetaStep':0,
'visited': false,
'done' : false,
'started': false,
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
'indexMeta' : [
['Financial Institutions',
'Financial Institutions licenced to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions)'],
['Financial Institutions',
'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to provide services outside Malta'],
['Financial Institutions',
'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to establish a branch outside Malta']
]
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'indexMetaStep':0,
'visited': false,
'done' : false,
'started': false,
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
'indexMeta' : [
['Credit Institutions',
'Credit Institutions'],
['Credit Institutions',
'Freedom of Services and Establishments - Exercise of the freedom to provide services outside Malta'],
['Credit Institutions',
'Freedom of Services and Establishments - Exercise of the freedom to set up an establishment outside Malta']
]
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
this.setPath(path.resolve(`${__dirname }/../artefacts/MT/MFSA`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._debounce(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Entry point for one scrape run: simply awaits start().
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = MTScrape;

794
ncas/nl.js Normal file
View File

@ -0,0 +1,794 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('NL');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class NLScrape extends Scraper {
constructor() {
super();
this.setID('NL');
this.addToBlockFilters(['cookiebar.js', 'readspeaker']);
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
// Delays the call to 30 seconds after the last time it was called.
// Useful if the page beaks and multiple errors happen at the same time
this.recover = this._debounce(async () => {
await this.__recover();
}, 30000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
async extractDetail(body) {
const description = [];
try{
const $ = cheerio.load(body);
const rows = $('dl.extra > dd > table > tbody > tr');
rows.each((index, item) => {
let cells = $(item).find('th');
const title = this._cleanUp($(cells.get(0)).text()).replace(':', '') || '';
cells = $(item).find('td');
const detail = this._cleanUp($(cells.get(0)).text()) || '';
if (title !== '')
description.push([title, detail]);
});
}
catch( err) {
logger.error(err);
}
return description;
}
async extractActivity(body) {
const details = [];
try{
const $ = cheerio.load(body);
const rows = $('#tab2 > div > div > table > tbody > tr');
let previousFinancialService = '';
rows.each((index, item) => {
const cells = $(item).find('td');
const activity = this._cleanUp($(cells.get(0)).text()) || '';
const startDate = this._cleanUp($(cells.get(1)).text()) || '';
const endDate = this._cleanUp($(cells.get(2)).text()) || '';
const thCell = $(item).find('th');
const financialService = this._cleanUp($(thCell.get(0)).text()) || previousFinancialService;
details.push({ financialService, activity, startDate, endDate });
previousFinancialService = financialService;
});
}
catch( err) {
logger.error(err);
}
return details;
}
/**
* Extract Passporting Out Data from page
* @param body
* @returns {Promise<void>}
*/
async extractPassportingOut(body) {
const details = {};
try{
const $ = cheerio.load(body);
const rows = $('#tab6 > div > div > table > tbody > tr');
let previouseuPassportOut = '';
rows.each((index, item) => {
const cells = $(item).find('td');
const activity = this._cleanUp($(cells.get(0)).text()) || '';
const country = this._cleanUp($(cells.get(1)).text()) || '';
const startDate = this._cleanUp($(cells.get(2)).text()) || '';
const endDate = this._cleanUp($(cells.get(3)).text()) || '';
const thCell = $(item).find('th');
const euPassportOut = this._cleanUp($(thCell.get(0)).text()) || previouseuPassportOut;
if (!details.hasOwnProperty(country))
details[country] = [{ activity, startDate, endDate, euPassportOut }];
else
details[country].push({ activity, startDate, endDate, euPassportOut });
previouseuPassportOut = euPassportOut;
});
}
catch( err) {
logger.error(err);
}
return details;
}
/**
* Extract Passporting In Data from page
* @param body
* @returns {Promise<void>}
*/
async extractPassportingIn(body) {
const details = {};
try{
const $ = cheerio.load(body);
const rows = $('#tab7 > div > div > table > tbody > tr');
let previouseuPassportIn = '';
rows.each((index, item) => {
const cells = $(item).find('td');
const activity = this._cleanUp($(cells.get(0)).text()) || '';
const startDate = this._cleanUp($(cells.get(1)).text()) || '';
const thCell = $(item).find('th');
const euPassportIn = this._cleanUp($(thCell.get(0)).text()) || previouseuPassportIn;
if (!details.hasOwnProperty(euPassportIn))
details[euPassportIn] = [{ activity, startDate }];
else
details[euPassportIn].push({ activity, startDate });
previouseuPassportIn = euPassportIn;
});
}
catch( err) {
logger.error(err);
}
return details;
}
/**
* Process Entity Detail
*
* @returns {Promise<{activity: *, details: *}>}
*/
async processEntityDetail(serviceObject) {
const noWhiteSpace = /\W/g;
const urlSections = ['WFTBI', 'WFTEG', 'WFTKF'];
const id = serviceObject.links[serviceObject.step].id;
logger.info(`Process V2 ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${id}`);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = this._makeFileName(entity);
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this.page.waitForSelector('#contentcolumn > div.interactive-tabs > ol > li:nth-child(2) > a', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
}).catch(() => {
logger.debug('No activity tab');
});
await this.page.waitForSelector('div.interactive-tabs > ol > li a[href*="#tab6"]', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_passportingout`, null);
}).catch(() => {
logger.debug('No passporting Out tab');
});
await this.page.waitForSelector('div.interactive-tabs > ol > li a[href*="#tab7"]', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_passportingin`, null);
}).catch(() => {
logger.debug('No passporting In tab');
});
const body = await this.page.content();
const details = await this.extractDetail(body);
const activity = await this.extractActivity(body);
const passportingOut = await this.extractPassportingOut(body);
const passportingIn = await this.extractPassportingIn(body);
await jsonfile.writeFile(`${filePath}.json`, { details, activity, passportingOut, passportingIn });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = `https://www.dnb.nl/en/supervision/public-register/${urlSections[this.mode]}/${serviceObject.links[serviceObject.step].href}`;
await this._goto(newUrl);
}
else
this.emit('entityDone');
}
/**
 * Process WFTBI / Payment Services Detail.
 * Delegates to processEntityDetail with the payment services state.
 *
 * @returns {Promise<void>}
 */
async processWFTBIDetail() {
await this.processEntityDetail(this.paymentServices);
}
/**
 * Process WFTEG / Emoney services Detail.
 * Delegates to processEntityDetail with the e-money services state.
 *
 * @returns {Promise<void>}
 */
async processWFTEGDetail() {
await this.processEntityDetail(this.emoneyServices);
}
/**
 * Process WFTKF / Credit Services Details.
 * Delegates to processEntityDetail with the credit services state.
 *
 * @returns {Promise<void>}
 */
async processWFTKFDetail() {
await this.processEntityDetail(this.creditServices);
}
/**
* Initiate WFTBI / Payment Services
* @returns {Promise<void>}
*/
async initiateWFTBI() {
try{
// first time around.
// need to kick off the index correctly..
const options = await this.page.$$('#ddfilter option');
const wantedOption = ['2:3c Dutch branch of payment institution (EEA incl. NL)'];
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#ddfilter', value);
break;
}
}
this._findAndClick('#search-main button');
}
catch(e) {
throw new Error(e);
}
}
/**
* Initiaite WFTEG / Emoney services
* @returns {Promise<void>}
*/
async initiateWFTEG() {
try{
// first time around.
// need to kick off the index correctly..
const options = await this.page.$$('#ddfilter option');
const wantedOption = ['2:10b Carrying on the business of an electronic money institution'];
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#ddfilter', value);
break;
}
}
this._findAndClick('#search-main button');
}
catch(e) {
throw new Error(e);
}
}
/**
* Initiate WFTKF / Credit Services
* @returns {Promise<void>}
*/
async initiateWFTKF() {
try{
// first time around.
// need to kick off the index correctly..
const options = await this.page.$$('#ddfilter option');
const selects = ['2:12(1) Carrying on the business of a bank', '2:13(1) Carrying on the business of a bank'];
const wantedOption = [];
wantedOption.push(selects[this.creditServices.step]);
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption.indexOf(text) !== -1) {
await this.page.select('#ddfilter', value);
break;
}
}
this._findAndClick('#search-main button');
}
catch(e) {
throw new Error(e);
}
}
/**
* Process WFTBI / Payment Services
* @returns {Promise<void>}
*/
async processWFTBI() {
const nonWhiteSpace = /\W/g;
logger.info('WFTBI / Payment Services');
await this._randomWait(this.page, 3, 5);
const origUrl = await this.page.url();
const pageUrl = url.parse(origUrl);
if (pageUrl.query === null)
// we need to select the correct item from the dropdown.
this.initiateWFTBI();
else {
// crack query
const body = await this.page.content();
const $ = cheerio.load(body);
const q = this._getParamsFromUrl(origUrl);
const page = q.page || '1';
await this._makeScreenshotV2(this.page, `${this.path}/paymentServices_menu_${page}`, null);
const rows = $('#contentcolumn table tbody tr');
rows.each((i, elm) => {
const children = cheerio(elm).children();
let statutoryName = children.eq(0).text();
let tradeName = children.eq(1).text();
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
let href = cheerio(children.eq(0)).find('a').attr('href');
href = href.concat('&locale=en_GB');
// this is the one we want.
this.paymentServices.links.push({ id, href });
});
const next = $('a.next').attr('href') || '';
if (next !== '')
this._findAndClick('a.next');
else
this.emit('startProcessingPaymentServices');
}
}
/**
* Process WFTEG / Emoney services
* @returns {Promise<void>}
*/
async processWFTEG() {
const nonWhiteSpace = /\W/g;
logger.info('WFTEG / EMoney Services');
await this._randomWait(this.page, 3, 5);
const origUrl = await this.page.url();
const pageUrl = url.parse(origUrl);
if (pageUrl.query === null)
// we need to select the correct item from the dropdown.
this.initiateWFTEG();
else {
// crack query
const body = await this.page.content();
const $ = cheerio.load(body);
const q = this._getParamsFromUrl(origUrl);
const page = q.page || '1';
await this._makeScreenshotV2(this.page, `${this.path}/eMoney_menu_${page}`, null);
const rows = $('#contentcolumn table tbody tr');
rows.each((i, elm) => {
const children = cheerio(elm).children();
let statutoryName = children.eq(0).text();
let tradeName = children.eq(1).text();
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
// const id = `${statutoryName}-${tradeName}`;
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
let href = cheerio(children.eq(0)).find('a').attr('href');
href = href.concat('&locale=en_GB');
// this is the one we want.
this.emoneyServices.links.push({ id, href });
});
const next = $('a.next').attr('href') || '';
if (next !== '')
this._findAndClick('a.next');
else
this.emit('startProcessingEMoneyServices');
}
}
/**
* Process WFTKF / Credit Services
* @returns {Promise<void>}
*/
async processWFTKF() {
try {
// Credit Institute
const nonWhiteSpace = /\W/g;
logger.info('WFTKF / Credit Services');
await this._randomWait(this.page, 3, 5);
const origUrl = await this.page.url();
const pageUrl = url.parse(origUrl);
if (pageUrl.query === null)
// we need to select the correct item from the dropdown.
this.initiateWFTKF();
else {
// crack query
const body = await this.page.content();
const $ = cheerio.load(body);
const q = this._getParamsFromUrl(origUrl);
const page = q.page || '1';
await this._makeScreenshotV2(this.page, `${this.path}/creditServices_menu_${page}`, null);
const rows = $('#contentcolumn table tbody tr');
rows.each((i, elm) => {
const children = cheerio(elm).children();
let statutoryName = children.eq(0).text();
let tradeName = children.eq(1).text();
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
// const id = `${statutoryName}-${tradeName}`;
let href = cheerio(children.eq(0)).find('a').attr('href');
href = href.concat('&locale=en_GB');
// this is the one we want.
logger.debug({ id, href });
this.creditServices.links.push({ id, href });
});
const next = $('a.next').attr('href') || '';
if (next !== '')
this._findAndClick('a.next');
else
if (this.creditServices.step === 0) {
this.creditServices.step = 1;
await this._goto(this.credit);
}
else
this.emit('startProcessingCreditServices');
}
}
catch(e) {
await this._uploadError();
throw new Error(e);
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
const failedUrls = ['chrome-error://chromewebdata/'];
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (failedUrls.indexOf(pageUrl.href) !== -1) {
this.emit('recover');
return;
}
switch (pageUrl.pathname) {
case '/en/supervision/public-register/WFTBI/index.jsp':
await this.processWFTBI();
break;
case '/en/supervision/public-register/WFTBI/detail.jsp':
await this.processWFTBIDetail();
break;
case '/en/supervision/public-register/WFTEG/index.jsp':
await this.processWFTEG();
break;
case '/en/supervision/public-register/WFTEG/detail.jsp':
await this.processWFTEGDetail();
break;
case '/en/supervision/public-register/WFTKF/index.jsp':
await this.processWFTKF();
break;
case '/en/supervision/public-register/WFTKF/detail.jsp':
await this.processWFTKFDetail();
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl.href}`);
}
}
/**
*
* @returns {Promise<void>}
*/
async restart() {
logger.info(`Restarting ${this.modeTitles[this.mode]}`);
switch (this.mode) {
case 2:
this.emit('startProcessingCreditServices');
break;
case 1:
this.emit('startProcessingEMoneyServices');
break;
case 0:
default:
this.emit('startProcessingPaymentServices');
break;
}
}
/**
*
* @returns {Promise<void>}
* @private
*/
async __recover() {
logger.warn('*** RECONNECTING PAGE ***');
if (this.browserCrashed) await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', () => {
this.processNewPage();
});
const timeout = 90000;
setTimeout(async() => {
logger.warn('Attempting recovery..');
await this.restart();
}, timeout);
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('startProcessingPaymentServices', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTBI/${this.paymentServices.links[this.paymentServices.step].href}`;
logger.debug('startProcessingPaymentServices', newUrl);
await this._goto(newUrl);
});
this.on('paymentServicesDone', async () => {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
await this._goto(this.emoneyUrl);
});
this.on('startProcessingEMoneyServices', async () => {
this.mode = 1;
this.emoneyServices.items = this.emoneyServices.links.length;
logger.debug(`${this.emoneyServices.items} EMoney items indexed` );
logger.debug(this.emoneyServices.links[this.emoneyServices.step].href);
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTEG/${this.emoneyServices.links[this.emoneyServices.step].href}`;
logger.debug('startProcessingEMoneyServices', newUrl);
await this._goto(newUrl);
});
this.on('emoneyServicesDone', async () => {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
await this._goto(this.credit);
});
this.on('startProcessingCreditServices', async () => {
this.mode = 2;
this.creditServices.items = this.creditServices.links.length;
logger.debug(`${this.creditServices.items} CI items indexed` );
logger.debug(this.creditServices.links[this.creditServices.step].href);
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTKF/${this.creditServices.links[this.creditServices.step].href}`;
logger.debug('startProcessingCreditServices', newUrl);
await this._goto(newUrl);
});
this.on('creditServicesDone', async () => {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.emit('done');
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
this.mode = 0;
try {
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
};
this.startPage = 'https://www.dnb.nl/en/supervision/public-register/WFTBI/index.jsp';
this.emoneyUrl = 'https://www.dnb.nl/en/supervision/public-register/WFTEG/index.jsp';
this.credit = 'https://www.dnb.nl/en/supervision/public-register/WFTKF/index.jsp';
//
this.setPath(path.resolve(`${__dirname }/../artefacts/NL/DNB`));
await this._doNonRepudiation(false, { 'sslWithPrefix': true }).catch((err) => {
logger.warn(err);
});
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Entry point for one scrape run: simply awaits start().
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = NLScrape;

767
ncas/no.js Normal file
View File

@ -0,0 +1,767 @@
// Version: 0.0.1-3
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('NO');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class NOScrape extends Scraper {
constructor() {
super();
this.id = 'NO';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
try {
const newObj = {};
const $ = cheerio.load(html);
const title = $('h1.common-header-text').text();
newObj.title = this._cleanUp(title);
const detailBox = $('div.side-container.license-side-unit-container');
const children = $(detailBox).children();
let curLabel = '';
children.each((i, item) => {
const tagName = $(item).prop('tagName');
if (tagName === 'H4') {
curLabel = this._makeFieldName($(item).text());
if (!newObj.hasOwnProperty(curLabel))
newObj[curLabel] = [];
}
if (['P', 'SPAN', 'A'].indexOf(tagName) !== -1)
newObj[curLabel].push(this._cleanUp($(item).text()));
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @param blockType
* @returns {{licenseDescription: string, blockType: string}}
*/
extractEntityDetailLicense(html, blockType = 'License') {
try {
const newObj = { 'licenseDescription':'', 'blockType': blockType };
const $ = cheerio.load(html);
const detailBox = $('div.license-container');
const children = $(detailBox).children();
let curLabel = '';
children.each((i, item) => {
const tagName = $(item).prop('tagName');
if (tagName === 'H3') {
curLabel = this._makeFieldName($(item).text());
if (!newObj.hasOwnProperty(curLabel))
newObj[curLabel] = [];
}
if (tagName === 'H2') {
if (!newObj.hasOwnProperty('misc'))
newObj['misc'] = [];
newObj['misc'].push(this._cleanUp($(item).text()));
}
if (['SPAN', 'A', 'P'].indexOf(tagName) !== -1) {
const elmClass = $(item).attr('class');
if (elmClass === 'license-description')
newObj['licenseDescription'] = this._cleanUp($(item).text());
else
newObj[curLabel].push( this._cleanUp($(item).text()));
}
if (tagName === 'UL') {
const liArray = [];
const li = $(item).children('li');
for (let i = 0; i < $(li).length;i++)
liArray.push(this._cleanUp($(li).eq(i).text()));
newObj[curLabel].push(liArray);
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {{description: {}}}
*/
extractCrossBorderDetailsV2(html) {
const newObj = { 'description':{} };
const titleRegEx = /([^]*?)(?:<ul>)/;
const $ = cheerio.load(html);
const top = $('ul');
const parent = $(top).parent();
const title = this._cleanUp($(parent).children().first().text());
const li = $(top).first().children();
li.each(async (i, item) => {
const anotherUL = $(item).find('ul').index();
if (anotherUL !== -1) {
// There are UL's within this LI
const elms = $(item).find('ul').children('li');
if ($(elms).length !== 0) {
const nameArray = $(item).html().match(titleRegEx);
const rawTitle = nameArray[0].replace('<ul>', '');
const title = this._cleanUp(rawTitle) || 'main';
const label = this._makeFieldName(title);
if (!newObj.hasOwnProperty(label)) {
newObj[label] = [];
newObj.description[label] = title;
}
elms.each((i, e) => {
newObj[label].push(this._cleanUp($(e).text()));
});
}
}
else {
const label = this._makeFieldName(title);
if (!newObj.hasOwnProperty(label)) {
newObj[label] = [];
newObj.description[label] = title;
}
newObj[label].push(this._cleanUp($(item).text()));
}
});
return newObj;
}
/**
*
* @param html
* @returns {Promise<void>}
*/
extractEntityDetailCrossBorder(html) {
try {
const newObj = { };
const $ = cheerio.load(html);
const header = $('h3.license-unit-label:contains("Cross-border services/classes")');
const detailBox = $(header).parent();
const children = $(detailBox).children();
let curLabel = '';
children.each(async (i, item) => {
const tagName = $(item).prop('tagName');
if (tagName === 'H3') {
curLabel = this._makeFieldName($(item).text());
if (!newObj.hasOwnProperty(curLabel))
newObj[curLabel] = [];
}
if (['SPAN', 'A', 'P'].indexOf(tagName) !== -1)
newObj[curLabel].push(this._cleanUp($(item).text()));
if(tagName === 'DIV' || tagName === 'UL') {
if (!newObj.hasOwnProperty('data'))
newObj['data'] = [];
const cbData = this.extractCrossBorderDetailsV2($(item).html());
newObj['data'].push(cbData);
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @param elm
* @returns {Promise<void>}
*/
async selectLicenseOption(serviceObject, elm) {
const wantedOption = serviceObject.wanted[serviceObject.indexStep];
const elmSelector = await this.page.evaluate((el) => [el.tagName, el.getAttribute('class')].join('.'), elm);
const options = await elm.$$('option');
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption === text) {
await this.page.select(elmSelector, value);
break;
}
}
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractIndexItems(html) {
const newArray = [] ;
const $ = cheerio.load(html);
const links = $('a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = this._cleanUp($(item).find('.licenseregistry-search-result-item-header').text());
const country = this._cleanUp($(item).find('.licenseregistry-search-result-item-metadata').text());
const type = this._cleanUp($(item).find('.licenseregistry-search-result-item-type').text());
const params = this._getParamsFromUrl(href);
const newUrl = `${this.rootURI}${href}`;
const id = params.id;
newArray.push({ 'name':text, 'href':newUrl, 'id':id, 'country':country, 'type': type });
});
return newArray;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
let html = '';
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
let loadedAll = false;
do
await this.page.waitForSelector('button.search-result-loadmore', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
loadedAll = false;
logger.debug('Expanding index..');
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
}).catch(() => {
loadedAll = true;
});
while( loadedAll === false);
logger.debug('>> All loaded...');
await this.page.waitForSelector('#js-konsregList > div > div', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
html = await this.page.evaluate(el => el.outerHTML, elm);
}).catch((e) => {
logger.error(e);
logger.warn('No index list');
});
const indexList = await this.extractIndexItems(html);
serviceObject.links = serviceObject.links.concat(indexList);
const filename = this.modeNames[this.mode];
await this._randomWait(this.page, 5, 7);
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
this.emit('indexdone');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
await this.page.waitForSelector('select.search-filter', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await this.selectLicenseOption(serviceObject, elm);
}).catch((e) => {
logger.error(e);
logger.warn('No select');
});
// this reload can take a long time
await this.page.waitForSelector('span.search-results-count.highlight', { 'visible':true, 'timeout':75000 }).catch((e) => {
logger.error(e);
logger.warn('Waiting for data timeout');
});
await this.page.waitForSelector('#js-konsregList > div > div', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await this.processIndex(serviceObject);
}).catch((e) => {
logger.error(e);
logger.warn('No index list');
});
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async entityContentSniffer(html) {
const $ = cheerio.load(html);
const output = [];
const contentArray = [
{ 'find':'h2:contains("Bank")', 'blockType':'Bank' },
{ 'find':'h2:contains("Agent of payment institution (company)")', 'blockType':'Agent Payment Institution' },
{ 'find':'h2:contains("Tied Agent")', 'blockType':'Agent' },
{ 'find':'h3.license-unit-label:contains("The entity is a tied agent affiliated to")', 'blockType':'Affiliation' },
{ 'find':'h2:contains("Nominee in Norwegian securities registers")', 'blockType':'Securities register' },
{ 'find':'h2:contains("Branch of foreign credit institution")', 'blockType':'Foreign credit institution' },
{ 'find':'h2:contains("Finance company")', 'blockType':'Finance company' },
{ 'find':'h2:contains("Payment institution")', 'blockType':'Payment institution' },
{ 'find':'h2:contains("Agency debt collection on behalf of others")', 'blockType':'Debt collection' },
{ 'find':'h2:contains("E-money institution")', 'blockType':'E-money institution' },
{ 'find':'h2:contains("Investment firm")', 'blockType':'h2:contains("Investment firm")' },
{ 'find':'h2:contains("Intermediator of loans and guarantees")', 'blockType':'Intermediator of loans and guarantees' }
];
const licenseBlocks = $('div.article-content-container').children('div.license-container');
licenseBlocks.each( (i, item) => {
let license = {};
for(const cItem of contentArray)
if ($(item).find(cItem.find).index() !== -1)
license = this.extractEntityDetailLicense(item, cItem.blockType);
if ($(item).find('h3.license-unit-label:contains("Cross-border services/classes")').index() !== -1)
license.crossBorder = this.extractEntityDetailCrossBorder(item);
output.push(license);
});
return output;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this.page.waitForSelector('h1.common-header-text', { 'visible':true, 'timeout':7500 });
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(name.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_'), `_${id}`].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 5, 7);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
// --
const details = await this.extractEntityDetails(body);
const licenses = await this.entityContentSniffer(body);
// --
await jsonfile.writeFile(`${filePath}.json`, { details, licenses });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
switch (pageUrl.pathname) {
case '/en/finanstilsynets-registry/':
await this.indexRedirector();
break;
case '/en/finanstilsynets-registry/detail/':
await this.processRedirector();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async () => {
let newUrl;
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
this.paymentServices.indexStep++;
if (this.paymentServices.indexStep >= this.paymentServices.wanted.length)
newUrl = this.paymentServices.links[this.paymentServices.step].href;
else
newUrl = this.paymentServices.urls[0];
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
let newUrl;
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
this.emoneyServices.indexStep++;
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length)
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
else
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
let newUrl;
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
this.creditServices.indexStep++;
if (this.creditServices.indexStep >= this.creditServices.urls.length)
newUrl = this.creditServices.links[this.creditServices.step].href;
else
newUrl = this.creditServices.urls[this.creditServices.indexStep];
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.rootURI = 'https://www.finanstilsynet.no';
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
'wanted' : ['Payment institution', 'Agent of payment institution (company)', 'Payment service provider with a limited authorisat'],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
'wanted' : ['E-money institution'],
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
'wanted' : ['Bank', 'Branch of foreign credit institution', 'Credit Institution', 'Savings bank foundation'],
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/NO/FS`));
// await this._doNonRepudiation();
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5, 'Startup');
}
catch(e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
module.exports = NOScrape;

1384
ncas/pl.js Normal file

File diff suppressed because it is too large Load Diff

1022
ncas/plR.js Normal file

File diff suppressed because it is too large Load Diff

513
ncas/pt.js Normal file
View File

@ -0,0 +1,513 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('PT');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class PTScrape extends Scraper {
constructor() {
super();
this.id = 'PT';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
try {
const details = {};
const detailSequence = [
['field-name-field-tipo-ent-aut', 'institutionType'],
['field-name-field-estado-ent', 'state'],
['field-name-field-morada', 'address'],
['field-name-field-localidade', 'firstName'],
['field-name-field-cod-postal', 'postcode'],
['field-name-field-pais', 'country'],
['field-name-field-data-limite', 'beginningOfActivity'],
['field-name-field-capital-subscrito', 'subscribedCapital'],
['field-name-field-capital-realizado', 'paidUpCapital'],
['field-name-field-jel', 'institutionCodeNumber']
];
const $ = cheerio.load(html);
details.name = this._cleanUp($('h1.page-title').text()) ;
const mainDiv = $('div.content');
for(const item of detailSequence) {
const i = $(mainDiv).find(`.${item[0]} div.field-items`);
details[item[1]] = this._cleanUp($(i).text());
}
return details;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
const body = await this.page.content();
const $ = cheerio.load(body);
if ($('div.view-empty').length > 0) {
// We have reached an empty page, so we assume we've scraped all links from this index
this.emit('indexdone');
return;
}
const links = $('div.views-field.views-field-title > span > a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = $(item).text();
const newUrl = `https://www.bportugal.pt${href}`;
const id = this._makeFieldName(text);
serviceObject.links.push({ 'name':text, 'href':newUrl, 'id':id });
});
const filename = this.modeNames[this.mode];
const parsedUrl = url.parse(this.page.url(), true);
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${parsedUrl.query.page}`, null);
parsedUrl.query.page++;
parsedUrl.search = undefined; // Forces parsedUrl to use `query` property, as modified on line above
const nextPage = url.format(parsedUrl);
await this._goto(nextPage);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
// We have stopped using the "view all" button due to it breaking.
// Leaving the code below commented in case it is ever useful in future.
// await this.page.waitForSelector('#block-system-main > div > div > div.view-content-wrapper > ul > li.pager__item.pager__item_all', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
// logger.debug('Extend menu list..');
// await elm.click({ 'delay':90 });
// }).catch(() => {
// logger.info('No show all button');
// });
await this._randomWait(this.page, 6, 9);
await this.processIndex(serviceObject);
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
logger.debug('>> indexRedirector');
let doIndex = false;
await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
logger.warn('Sent back to the main selector screen');
await elm.click({ 'delay':90 });
doIndex = false;
}).catch(() => {
// logger.info('No show all button');
doIndex = true;
});
if (doIndex)
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { name, id } = serviceObject.links[serviceObject.step];
// const id = serviceObject.links[serviceObject.step].id;
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
// 'h1.page-title'
await this.page.waitForSelector('h1.page-title', { 'visible':true, 'timeout':7500 });
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const details = await this.extractEntityDetails(body);
await jsonfile.writeFile(`${filePath}.json`, { details });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
const pathSplitter = /(\/en\/.+?\/)/;
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
const splitPath = pageUrl.pathname.match(pathSplitter);
const pathname = splitPath[0];
switch (pathname) {
case '/en/entidades-autorizadas/':
await this.indexRedirector();
break;
case '/en/entidadeautorizada/':
await this.processRedirector();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
async attachEvents() {
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('serviceDone', async function() {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
//
this.on('psindexdone', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('indexdone', async function() {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('paymentServicesDone', async function() {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async function() {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async function() {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
logger.debug(this.eventNames());
super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/75/all?page=0'],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/72/all?page=0'],
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/67-68-1524-69/all?page=0'],
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/PT/BP`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
await this._makeResponsive();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage();
}, 5000));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
module.exports = PTScrape;

569
ncas/se.js Normal file
View File

@ -0,0 +1,569 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('SE');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class SEScrape extends Scraper {
constructor() {
super();
this.setID('SE');
this.on('done', () => {
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @returns {Promise<{authorization: Array, details}>}
*/
async extractEntity(html) {
const $ = cheerio.load(html);
const details = {};
const authorization = [];
details.name = this._cleanUp($('h2').text());
const dlCells = $('dl.funky').children();
const ulCells = $('ul.tillstand').children();
let current = '';
dlCells.each((index, item) => {
const itemText = this._cleanUp($(item).text());
if (item.name === 'dt') {
details[itemText] = [];
current = itemText;
}
else
details[current].push(itemText);
});
ulCells.each((index, item) => {
const date = this._cleanUp($(item.children).eq(0).text()) ;
const text = this._cleanUp($(item.children).eq(1).text()) ;
authorization.push({ date, text, 'translated':this._translate(text) });
});
return { details, authorization };
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const id = serviceObject.links[serviceObject.step].id;
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('h1').catch((e) => {
throw e;
});
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const $ = cheerio.load(body);
const details = await this.extractEntity(body);
const crossBorderExists = $('div.container a.link');
if (crossBorderExists.length !== 0) {
serviceObject.links[serviceObject.step].data = { details };
await this._findAndClick('div.container a.link', 'View cross border services');
}
else {
await jsonfile.writeFile(`${filePath}.json`, { details });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractCrossBorderServices(html) {
const services = {};
const $ = cheerio.load(html);
const rows = $('div.container table tbody tr');
let current = '';
rows.each((index, item) => {
if ($(item).children().length === 1) {
// this is a heading...
const itemText = this._cleanUp($(item).text());
services[itemText] = { 'authorization': [], 'translated': this._translate(itemText) };
current = itemText;
}
else {
const date = this._cleanUp($(item.children).eq(0).text()) ;
const text = this._cleanUp($(item.children).eq(1).text()) ;
const translated = this._translate(text);
services[current].authorization.push({ date, text, translated });
}
});
return services;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processCrossBorderServicesV2(serviceObject) {
try{
const noWhiteSpace = /\W/g;
const id = serviceObject.links[serviceObject.step].id;
logger.info('Process CBS entity:', id);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('h1').catch((e) => {
throw e;
});
await this._makeScreenshotV2(this.page, `${filePath}_crossborder`, null);
const body = await this.page.content();
const crossBorderServices = await this.extractCrossBorderServices(body);
const details = serviceObject.links[serviceObject.step].data;
serviceObject.links[serviceObject.step].data = null;
await jsonfile.writeFile(`${filePath}.json`, { details, crossBorderServices });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
// await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('#institut', { 'visible':true });
const links = await this.page.$$('#institut > tbody > tr > td > a');
for (const item of links) {
// logger.debug(item);
const id = await this.page.evaluate(el => el.innerText, item);
let href = await this.page.evaluate(el => el.href, item);
href = href.concat('&locale=en_GB');
serviceObject.links.push({ id, href });
}
serviceObject.items = serviceObject.links.length;
serviceObject.indexStep++;
this.emit('indexdone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async crossBorderRedirector() {
switch (this.mode) {
case 0:
await this.processCrossBorderServicesV2(this.paymentServices);
break;
case 1:
await this.processCrossBorderServicesV2(this.emoneyServices);
break;
case 2:
await this.processCrossBorderServicesV2(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
switch (pageUrl.pathname) {
case '/en/our-registers/company-register/':
await this.indexRedirector();
break;
case '/en/our-registers/company-register/details':
await this.processRedirector();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('indexdone', async function() {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('serviceDone', async function() {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async function() {
if (this.paymentServices.indexStep < this.paymentServices.urls.length) {
const newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
await this._goto(newUrl);
}
else
this.emit('startProcessingPaymentServices');
});
this.on('startProcessingPaymentServices', async function() {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
// logger.debug(this.paymentServices.links);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('paymentServicesDone', async function() {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
await this._goto(this.emoneyServices.urls[0]);
});
// emoney Services
this.on('emindexdone', async function() {
if (this.emoneyServices.indexStep < this.emoneyServices.urls.length) {
const newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
await this._goto(newUrl);
}
else
this.emit('startProcessingEMoneyServices');
});
this.on('startProcessingEMoneyServices', async function() {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
// logger.debug(this.emoneyServices.links);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('emoneyServicesDone', async function() {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
await this._goto(this.creditServices.urls[0]);
});
// credit services
this.on('ciindexdone', async function() {
if (this.creditServices.indexStep < this.creditServices.urls.length) {
const newUrl = this.creditServices.urls[this.creditServices.indexStep];
await this._goto(newUrl);
}
else
this.emit('startProcessingcreditServices');
});
this.on('startProcessingcreditServices', async function() {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
// logger.debug(this.creditServices.links);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('creditServicesDone', async function() {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.emit('done');
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
await this._loadDictionary();
this.mode = 0;
this.modeTitles = ['**Payment Service', 'EMoney', 'Credit Services'];
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BET&area=#results'/* ,
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BETREG&area=#results'*/]
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=EINST&area=#results',
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=REGUTG&area=#results']
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=BANK&area=#results',
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=MBANK&area=#results',
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=SPAR&area=#results']
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
this.setPath(path.resolve(`${__dirname }/../artefacts/SE/FI`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }).catch((err) => {
logger.error(err);
});
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Runs one scrape by awaiting start(); any rejection from start() propagates to the caller.
 * NOTE(review): presumably invoked through a throttled `run` wrapper set up in the
 * constructor (not visible from here) - confirm.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = SEScrape;

833
ncas/sk.js Normal file
View File

@ -0,0 +1,833 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('SK');
const url = require('url');
const camelCase = require('camelcase');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class SKScrape extends Scraper {
constructor() {
super();
this.id = 'SK';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @returns {Promise<boolean>}
*/
async checkChangeLanguage() {
const languageIcon = await this.page.$$('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a > img');
if (languageIcon.length > 0) {
const value = await this.page.evaluate(el => el.getAttribute('src'), languageIcon[0]);
if (value === '/static/icon/ico_en.gif') {
// this needs a click
logger.info('Changing language to English..');
await this._findAndClick('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a ');
return true;
//
}
}
return false;
}
/**
*
* @returns {Promise<void>}
*/
async handleIntroPage() {
const pageUrl = url.parse(await this.page.url());
// Clear cookie bar
await this.page.waitForSelector('a.btnCookieAccept', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(() => {
logger.info('No cookie bar');
});
if (!this.inProgress && pageUrl.query === null) {
// fix language before going on
const changedLanguage = await this.checkChangeLanguage();
if (!changedLanguage) {
await this._randomWait(this.page, 3, 5, 'handleIntroPage');
await this._findAndClick(' body > div.container > div:nth-child(5) > div:nth-child(1) > div > div');
}
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processMainMenu(serviceObject) {
const wantedItem = serviceObject.sections[serviceObject.indexStep];
const expandables = ['#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl1',
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl2',
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl3',
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl4'
];
for (const item of expandables)
await this.page.$eval(item, e => e.click({ 'delay':90 }));
await this._randomWait(this.page, 3, 5);
const wantedRow = `[data-sector="${wantedItem}"]`;
logger.debug('Looking for', wantedRow);
await this.page.waitForSelector(wantedRow, { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(() => {
logger.warn('processMainMenu did not find what it was looking for!');
});
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async entityIndexFirstPass(serviceObject) {
// breaks up `Showing 1 to 10 of 12 entries`
const breaker = /(\d+)/g;
const body = await this.page.content();
const $ = cheerio.load(body);
const subjectsInfo = $('#Subjects_info').text();
const brokenString = subjectsInfo.match(breaker);
const currentPageIndex = parseInt(brokenString[0], 10);
const currentPageMax = parseInt(brokenString[1], 10);
// The site returns the index from the last page when you select a different view.
// This should be watched and can cause a problem
logger.debug('subjectsInfo', subjectsInfo);
logger.debug('Step', serviceObject.step);
logger.debug('currentPageIndex', currentPageIndex);
if (((currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1))) || (currentPageIndex === 0 && currentPageMax === 0 )) {
serviceObject.currentIndexLength = parseInt(brokenString[2], 10);
serviceObject.currentPageMax = currentPageMax;
serviceObject.visited = true;
serviceObject.currentIndex = url.parse(await this.page.url());
serviceObject.currentMetaIndex = 0;
}
else {
logger.info('Need to click previous');
const nextButton = await this.page.$$('#Subjects_previous');
const buttonClasses = await this.page.$eval('#Subjects_previous', e => e.getAttribute('class'));
if (buttonClasses.split(' ').indexOf('disabled') === -1) {
// we need a click..
nextButton[0].click({ 'delay':90 });
await this._randomWait(this.page, 3, 5);
serviceObject.visited = false;
this.emit('entityIndex');
}
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityIndex(serviceObject) {
const fields = ['referenceNumber', 'businessName', 'address', 'start', 'end', 'reason'];
const mouseDownDuration = Scraper.notARobot();
if (serviceObject.visited === false) {
logger.debug('Preparing...');
await this.page.waitForSelector('table#Subjects', { 'visible':true }).then(async () => {
await this.entityIndexFirstPass(serviceObject);
}).catch(() => {
logger.error('Table failed to render');
});
}
if (serviceObject.visited === true) {
serviceObject.currentMetaIndex = serviceObject.step % 10;
if ((serviceObject.step ) >= serviceObject.currentPageMax) {
const nextButton = await this.page.$$('#Subjects_next');
const buttonClasses = await this.page.$eval('#Subjects_next', e => e.getAttribute('class'));
if (buttonClasses.split(' ').indexOf('disabled') === -1) {
// we need a click..
nextButton[0].click({ 'delay':mouseDownDuration });
await this._randomWait(this.page, 3, 5);
serviceObject.visited = false;
this.emit('entityIndex');
}
else {
logger.debug('I think we are done here...');
this.emit('serviceDone');
}
}
else {
await this.page.waitForSelector('#Subjects > tbody');
const wantedRow = await this.page.$$(`#Subjects > tbody > tr:nth-child(${serviceObject.currentMetaIndex + 1})`);
const htmlRow = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);
const $ = cheerio.load(`<table>${htmlRow}</table>`);
const cells = $('td');
serviceObject.current = {};
cells.each((index, item) => {
serviceObject.current[ fields[index] ] = $(item).text();
});
await this._randomWait(this.page, 3, 5);
await wantedRow[0].click({ 'delay':mouseDownDuration });
}
}
}
/**
*
* @param $
* @returns {Promise<void>}
*/
async processEntityDetailBasicDetails($) {
const newObj = {};
const rows = $('tr');
rows.each((index, elm) => {
const children = $(elm).children();
const preLabel = $(children).eq(0).text();
const label = camelCase(this._cleanUp(preLabel.replace(':', '')));
newObj[label] = this._cleanUp($(children).eq(1).text());
});
return newObj;
}
/**
*
* @param $
* @param elm
*/
decodeTable($, elm) {
const rows = $(elm).find('table.details tr');
const obj = {};
rows.each( (index, elm) => {
const children = $(elm).children();
const labelClass = $(children[0]).attr('class');
const label = camelCase(this._cleanUp($(children[0]).text().replace(':', '').replace(',', '')));
const contents = this._cleanUp($(children[1]).text().replace(/(Hide|View)\s*/, ''));
if (typeof(labelClass) !== 'undefined' && labelClass === 'dlabel')
obj[label] = contents;
});
return obj;
}
/**
 * Converts the rows of the first <tbody> of the licence table into a list of nested objects.
 *
 * Rows are classified by their CSS classes and processed in document order:
 *  - `level0` + `sublicctrl`: starts a new top-level entry (the previous one is pushed to the result)
 *  - `level0` + `details`:    a details table appended to the current top-level entry's `detail` list
 *  - `level1` (no `details`): starts a named list under the current top-level entry
 *  - `level1` + `details`:    a details table appended to the current level-1 list
 *  - `level2` (no `details`): a licence row (plus the details of the following row) appended to
 *                             the current level-1 list, or to a list named after the licence
 *                             when no level-1 row has been seen
 *
 * The state variables (`newObj`, `topLevel`, `level1ID`) carry over between rows, so the
 * result depends on the row order.
 *
 * @param $ cheerio instance loaded with the licence table markup
 * @returns {Promise<Array>} one object per top-level entry
 */
async processEntityDetailTableV2($) {
// take the first tbody as this is the main one...
// keys for the cells of a licence row, in column order
const fields = [ 'license', 'start', 'end', 'reason'];
const outData = [];
let newObj = {};
let topLevel = '';
let midLevel = {};
// NOTE(review): level1ID is never reset when a new top-level row starts, so a level-2 row
// under a later top-level entry reuses the previous level-1 name - confirm this is intended.
let level1ID = '';
const tbody = $('tbody')[0];
const children = $(tbody).children();
children.each((index, item) => {
// NOTE(review): a row without a class attribute would make attr('class') undefined and this
// line throw - presumably every row in this table carries a class; confirm.
const itemClasses = $(item).attr('class').split(' ');
if ((itemClasses.indexOf('level0') !== -1) && (itemClasses.indexOf('sublicctrl') !== -1)) {
// TOP LEVEL
const itemChildren = $(item).children();
if (Object.keys(newObj).length !== 0) {
// push this object into the list
outData.push(newObj);
newObj = {};
}
// the first cell's text (first comma removed, cleaned, camel-cased) names the entry
topLevel = camelCase(this._cleanUp($(itemChildren[0]).text().replace(',', '')));
midLevel = {};
itemChildren.each((ci, celm) => {
midLevel[fields[ci]] = this._cleanUp($(celm).text());
});
midLevel.detail = [];
newObj[topLevel] = Object.assign({}, midLevel);
}
//
if ((itemClasses.indexOf('level0') !== -1) && (itemClasses.indexOf('details') !== -1))
// TOP LEVEL - DETAILS
newObj[topLevel].detail.push(this.decodeTable($, item));
//
if ((itemClasses.indexOf('level1') !== -1) && (itemClasses.indexOf('details') === -1)) {
// LEVEL 1
const itemChildren = $(item).children();
level1ID = camelCase(this._cleanUp($(itemChildren[0]).text()));
newObj[topLevel][level1ID] = [];
}
//
if ((itemClasses.indexOf('level1') !== -1) && (itemClasses.indexOf('details') !== -1)) {
// LEVEL 1 - DETAIL
const table = this.decodeTable($, item);
newObj[topLevel][level1ID].push(table);
}
//
if ((itemClasses.indexOf('level2') !== -1) && (itemClasses.indexOf('details') === -1)) {
// LEVEL 2
const itemChildren = $(item).children();
const obj = {};
itemChildren.each((ci, celm) => {
obj[fields[ci]] = this._cleanUp($(celm).text());
});
// the row that follows a level-2 row is decoded as its details table
const nexttable = $(item).next();
obj.details = this.decodeTable($, nexttable);
if (level1ID === '') {
// no level-1 row seen yet: file the licence under a list named after the licence itself
const newID = camelCase(this._cleanUp(obj.license.replace(',', '')));
newObj[topLevel][newID] = [];
newObj[topLevel][newID].push(obj);
}
else {
if (!newObj[topLevel].hasOwnProperty(level1ID))
newObj[topLevel][level1ID] = [];
newObj[topLevel][level1ID].push(obj);
}
}
});
// insert final obj
if (Object.keys(newObj).length !== 0) {
// push this object into the list
outData.push(newObj);
newObj = {};
}
return outData;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetail(serviceObject) {
// level0 sublicctrl sublicctrl1 odd
// level0 sublicctrl sublicctrl1 odd sublicshow shown
// expand all accordians
const rows = await this.page.$$('tr.sublicctrl');
for (const item of rows) {
const cls = await this.page.evaluate(el => el.getAttribute('class'), item);
if (!cls.includes('shown'))
await item.click({ 'delay':Scraper.notARobot() });
}
await this.page.waitForSelector('#Licenses > tbody > tr.level1.shown.sublichide1.sllhidectrl.sllhidectrl1', { 'timeout':7500 }).then(async (elm) => {
await elm.click({ 'delay':Scraper.notARobot() });
}).catch(() => {
logger.debug('No License information');
});
await this._microWait(this.page, 5);
// expand all viewable anchors
const wantedAnchors = await this.page.$$('.row a');
for (const item of wantedAnchors) {
const exItem = this._cleanUp(await this.page.evaluate(el => el.text, item));
if (exItem === 'View')
await item.click({ 'delay': Scraper.notARobot() }).catch((e) => {
logger.debug('View click failed', e);
});
}
const entityName = `${serviceObject.current.businessName}_${serviceObject.current.referenceNumber}`;
const fileName = this._makeFileName(entityName);
const filePath = await this._makeFilePath(entityName);
serviceObject.current.fileName = fileName;
await this._randomWait(this.page, 2, 2);
await this.page.focus('h3.page-header');
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
await this.page.waitForSelector('body > div.container > form.form-horizontal > table', { 'timeout':7500 }).then(async (elm) => {
logger.debug('prep for processEntityDetailBasicDetails');
const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
const $ = cheerio.load(htmlBlock);
serviceObject.current.basicDetails = await this.processEntityDetailBasicDetails($);
});
await this.page.waitForSelector('#Licenses').then(async (elm) => {
logger.debug('prep for processEntityDetailTableV2');
const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
const $ = cheerio.load(htmlBlock);
serviceObject.current.entityDetails = await this.processEntityDetailTableV2($);
});
this.entityCompleter(serviceObject);
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async entityCompleter(serviceObject) {
const filename = serviceObject.current.fileName;
const filePath = `${this.path}/${filename}`.substring(0, 240);
logger.info(`Saving: ${filename}.json`);
const newLink = { 'referenceNumber':serviceObject.current.referenceNumber, 'businessName':serviceObject.current.businessName, 'fileName':`${filename}.json` };
serviceObject.links.push(newLink);
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
await this._randomWait(this.page, 3, 5);
serviceObject.step++;
if (serviceObject.step < serviceObject.currentIndexLength) {
serviceObject.current = {};
await this.page.goBack({ 'waitUntil':'networkidle0' });
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async handleMainIndex() {
switch (this.mode) {
case 1:
await this.processMainMenu(this.emoneyServices);
break;
case 2:
await this.processMainMenu(this.creditServices);
break;
case 0:
default:
await this.processMainMenu(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async handleEntityIndex() {
switch (this.mode) {
case 1:
await this.processEntityIndex(this.emoneyServices);
break;
case 2:
await this.processEntityIndex(this.creditServices);
break;
case 0:
default:
await this.processEntityIndex(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async handleEntityDetail() {
switch (this.mode) {
case 1:
await this.processEntityDetail(this.emoneyServices);
break;
case 2:
await this.processEntityDetail(this.creditServices);
break;
case 0:
default:
await this.processEntityDetail(this.paymentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
const params = Object.assign({ 'aa': '' }, this._getParamsFromUrl(pageUrl.search));
switch (params.aa) {
case '':
await this.handleIntroPage();
break;
case 'select_sector':
await this.handleMainIndex();
break;
case 'select_categ':
await this.handleEntityIndex();
break;
case 'select_subject':
await this.handleEntityDetail();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('entityIndex', async () => {
await this.handleEntityIndex();
});
this.on('paymentServicesDone', async () => {
try{
this.paymentServices.indexStep++;
if (this.paymentServices.indexStep < this.paymentServices.sections.length) {
this.paymentServices.visited = false;
this.paymentServices.step = 0;
await this._goto(this.paymentServices.urls[1]);
}
else {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
try{
this.emoneyServices.indexStep++;
if (this.emoneyServices.indexStep < this.emoneyServices.sections.length) {
this.emoneyServices.visited = false;
this.emoneyServices.step = 0;
await this._goto(this.emoneyServices.urls[0]);
}
else {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
try{
this.creditServices.indexStep++;
if (this.creditServices.indexStep < this.creditServices.sections.length) {
this.creditServices.visited = false;
this.creditServices.step = 0;
await this._goto(this.creditServices.urls[0]);
}
else {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
}
catch (e) {
logger.error(e);
}
});
}
/**
* Initite the process
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.inProgress = false;
/*
Swapping sections from text to
data-sector ids.
document.querySelector('[data-sector="156"]')
Payment Services:
Payment Institutions and Branches of Foreign Payment Institutions // 9
Providing Payment Services in Limited Scope // 11
Account information service providers // 156
eMoney Services:
E-Money Institutions and Branches of Foreign E-Money Institutions // 12
E-Money Institutions Based in Slovakia // 37
credit Services:
Banks Authorised to Provide Investment Services // 5
Banks Based in Slovakia // 19
*/
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://subjekty.nbs.sk/', 'https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
'sections' : [9, 11, 156],
'sectionStep': 0,
'currentIndexLength' : 0,
'sectionLinks' : [],
'currentIndex' :'',
'currentMetaIndex' : 0
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
'sections' : [12, 37],
'sectionStep': 0,
'currentIndexLength' : 0,
'sectionLinks' : [],
'currentIndex' :'',
'currentMetaIndex' : 0
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
'sections' : [5, 19],
'sectionStep': 0,
'currentIndexLength' : 0,
'sectionLinks' : [],
'currentIndex' :'',
'currentMetaIndex' : 0
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/SK/NBS`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
 * Runs one scrape by awaiting start(); any rejection from start() propagates to the caller.
 * The constructor exposes this through the throttled `run` wrapper.
 *
 * @returns {Promise<void>}
 */
async __run() {
await this.start();
}
}
module.exports = SKScrape;

36
nl.js Normal file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
// TODO:
// parse arguments - we should run just 1 FCA per go &
// have option to run selected company from selected NCA
const argv = require('yargs').argv;
// load helper libs etc
// const Fca = require('./ncas/fca');
const Netherlands = require('./ncas/nl');
/**
 * Creates the Netherlands scraper, schedules it when NL_CRON holds a cron expression
 * and runs it immediately when SCRAPE_START equals the scraper's id.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const nlScraper = new Netherlands();
  if (typeof(process.env.NL_CRON) === 'string' )
    new CronJob(process.env.NL_CRON, async function() {
      await nlScraper.run();
    }, null, true);
  if (process.env.SCRAPE_START === nlScraper.id)
    await nlScraper.run();
  console.log('NL Launched');
}

// Report an uncaught exception; registered with `once`, so only the first one is intercepted.
// (The former `done = true` here assigned to an undeclared variable that nothing in this file reads.)
process.once('uncaughtException', function caught(err) {
  console.error('Uncaught', err);
});

run();

23
no.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Norway = require('./ncas/no');
/**
 * Creates the Norway scraper, schedules it when NO_CRON holds a cron expression and
 * runs it immediately when SCRAPE_START equals the scraper's id.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Norway();
  const schedule = process.env.NO_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  const startNow = process.env.SCRAPE_START === scraper.id;
  if (startNow)
    await scraper.run();

  console.log('NO Launched');
}

run();

8848
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

68
package.json Normal file
View File

@ -0,0 +1,68 @@
{
"name": "obdfcascrape",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "nyc tape tests/**/*.js",
"testScrapers": "nyc tape tests/**/scrape.*.js",
"testSpecific": "nyc tape tests/scrape.se.js",
"testRep": "nyc tape tests/**/rep.*.js",
"testfr": "nyc tape tests/fr.js",
"cleanup": "rm artefacts/*.{html,json}; rm artefacts/screenshots/*.{jpg,jpeg,png};",
"start": "./start.sh",
"server": "http-server ./public",
"malta": "node mt.js",
"debuglogs": "node debuglogs.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"archiver": "^2.1.1",
"archiver-promise": "^1.0.0",
"aws-sdk": "^2.395.0",
"camelcase": "^5.0.0",
"cheerio": "^1.0.0-rc.2",
"crc": "^3.8.0",
"cron": "^1.6.0",
"csv": "^3.1.0",
"dateformat": "^3.0.3",
"del": "^3.0.0",
"dotenv": "^6.2.0",
"fs-extra": "^7.0.1",
"get-ssl-certificate": "^2.3.1",
"google-translate-api": "^2.3.0",
"hh-mm-ss": "^1.2.0",
"jsonfile": "^5.0.0",
"log4js": "^3.0.6",
"memory": "0.0.3",
"moment": "^2.24.0",
"node-free": "^1.0.0",
"pm2": "^3.5.0",
"puppeteer": "^1.14.0",
"remove-accents-diacritics": "^1.0.2",
"request": "^2.88.0",
"tld-extract": "^1.0.1",
"underscore": "^1.9.1",
"whois": "^2.9.1",
"whois-json": "^2.0.4",
"yargs": "^12.0.5"
},
"devDependencies": {
"deep-diff": "^1.0.2",
"gulp": "^3.9.1",
"gulp-archiver": "^1.0.0",
"gulp-aws-s3": "^1.1.0",
"gulp-bump": "^3.1.3",
"gulp-changed-in-place": "^2.3.0",
"gulp-debug": "^4.0.0",
"gulp-gzip": "^1.4.2",
"gulp-tar": "^2.1.0",
"nyc": "^13.1.0",
"static-server": "^2.2.1",
"tap-summary": "^4.0.0",
"tape": "^4.9.2",
"tape-promise": "^3.0.0",
"translate-google": "^1.3.5"
}
}

63
package.old Normal file
View File

@ -0,0 +1,63 @@
{
"name": "obdfcascrape",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "nyc tape tests/**/*.js",
"testScrapers": "nyc tape tests/**/scrape.*.js",
"testSpecific": "nyc tape tests/scrape.se.js",
"testRep": "nyc tape tests/**/rep.*.js",
"testfr": "nyc tape tests/fr.js",
"cleanup": "rm artefacts/*.{html,json}; rm artefacts/screenshots/*.{jpg,jpeg,png};",
"start": "./start.sh",
"server": "http-server ./public",
"malta": "node mt.js",
"debuglogs": "node debuglogs.js"
},
"author": "",
"license": "ISC",
"dependencies": {
"archiver": "^2.1.1",
"archiver-promise": "^1.0.0",
"aws-sdk": "^2.395.0",
"camelcase": "^5.0.0",
"cheerio": "^1.0.0-rc.2",
"crc": "^3.8.0",
"cron": "^1.6.0",
"csv": "^3.1.0",
"dateformat": "^3.0.3",
"del": "^3.0.0",
"dotenv": "^6.2.0",
"fs-extra": "^7.0.1",
"get-ssl-certificate": "^2.3.1",
"google-translate-api": "^2.3.0",
"hh-mm-ss": "^1.2.0",
"jsonfile": "^5.0.0",
"log4js": "^3.0.6",
"moment": "^2.24.0",
"pm2": "^3.2.9",
"puppeteer": "1.11.0",
"remove-accents-diacritics": "^1.0.2",
"request": "^2.88.0",
"tld-extract": "^1.0.1",
"underscore": "^1.9.1",
"whois": "^2.9.1",
"whois-json": "^2.0.4",
"yargs": "^12.0.5"
},
"devDependencies": {
"gulp": "^3.9.1",
"gulp-archiver": "^1.0.0",
"gulp-aws-s3": "^1.1.0",
"gulp-bump": "^3.1.3",
"gulp-gzip": "^1.4.2",
"gulp-tar": "^2.1.0",
"nyc": "^13.1.0",
"static-server": "^2.2.1",
"tap-summary": "^4.0.0",
"tape": "^4.9.2",
"tape-promise": "^3.0.0",
"translate-google": "^1.3.5"
}
}

23
pl.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Poland = require('./ncas/pl');
/**
 * Creates the Poland scraper, schedules it when PL_CRON holds a cron expression and
 * runs it immediately when SCRAPE_START equals the scraper's id.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Poland();
  const schedule = process.env.PL_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  const startNow = process.env.SCRAPE_START === scraper.id;
  if (startNow)
    await scraper.run();

  console.log('PL Launched');
}

run();

23
pt.js Normal file
View File

@ -0,0 +1,23 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const Portugal = require('./ncas/pt');
/**
 * Creates the Portugal scraper, schedules it when PT_CRON holds a cron expression and
 * runs it immediately when SCRAPE_START equals the scraper's id.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Portugal();
  const schedule = process.env.PT_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  const startNow = process.env.SCRAPE_START === scraper.id;
  if (startNow)
    await scraper.run();

  console.log('PT Launched');
}

run();

28
publish.js Normal file
View File

@ -0,0 +1,28 @@
var AWS = require('aws-sdk');
var util = require('util');
var config = require('./config.json');
require('dotenv').config({
'path': `${__dirname }/.env`
});
// configure AWS (credentials and region come from the environment; region falls back to eu-west-1)
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });

const sns = new AWS.SNS();

/**
 * Publishes one message to the SNS topic named in config.json.
 * Prints a dot for every successful publish and reports failures on stderr.
 *
 * @param {string} mesg message body
 */
function publish(mesg) {
  const publishParams = {
    'TopicArn': config.TopicArn,
    'Message': mesg
  };

  sns.publish(publishParams, function(err) {
    // a failed publish must not be reported as a success dot
    if (err) {
      console.error('SNS publish failed', err);

      return;
    }
    process.stdout.write('.');
  });
}

// fire off 500 numbered test messages
for (let i = 0; i < 500; i++)
  publish(`message: ${ i}`);

31
se.js Normal file
View File

@ -0,0 +1,31 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
// TODO:
// parse arguments - we should run just 1 FCA per go &
// have option to run selected company from selected NCA
const argv = require('yargs').argv;
// load helper libs etc
// const Fca = require('./ncas/fca');
const Sweden = require('./ncas/se');
/**
 * Creates the Sweden scraper, schedules it when SE_CRON holds a cron expression and
 * runs it immediately when SCRAPE_START equals the scraper's id.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Sweden();
  const schedule = process.env.SE_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  const startNow = process.env.SCRAPE_START === scraper.id;
  if (startNow)
    await scraper.run();

  console.log('SE Launched');
}

run();

55
setup/eslintrc.json Normal file
View File

@ -0,0 +1,55 @@
{
"parserOptions": {
"ecmaVersion": 2017,
"sourceType": "module",
"ecmaFeatures": {
"jsx": false
}
},
"env": {
"browser": false,
"node": true,
"es6": true
},
"rules": {
"arrow-spacing": "error",
"block-scoped-var": "error",
"block-spacing": "error",
"brace-style": ["error", "stroustrup", {}],
"camelcase": "error",
"comma-dangle": ["error", "never"],
"comma-spacing": ["error", { "before": false, "after": true }],
"comma-style": [1, "last"],
"consistent-this": [1, "_this"],
"curly": [1, "multi"],
"eol-last": 1,
"eqeqeq": 1,
"func-names": 1,
"indent": ["error", 2, { "SwitchCase": 1 }],
"lines-around-comment": ["error", { "beforeBlockComment": true, "allowArrayStart": true }],
"max-len": [1, 180, 2], // 2 spaces per tab, max 180 chars per line
"new-cap": 1,
"newline-before-return": "error",
"no-array-constructor": 1,
"no-inner-declarations": [1, "both"],
"no-mixed-spaces-and-tabs": 1,
"no-multi-spaces": 2,
"no-new-object": 1,
"no-shadow-restricted-names": 1,
"object-curly-spacing": ["error", "always"],
"padded-blocks": ["error", { "blocks": "never", "switches": "always" }],
"prefer-const": "error",
"prefer-template": "error",
"one-var": 0,
"quote-props": ["error", "always"],
"quotes": [1, "single"],
"radix": 1,
"semi": [1, "always"],
"space-before-blocks": [1, "always"],
"space-infix-ops": 1,
"vars-on-top": 1,
"no-multiple-empty-lines": ["error", { "max": 1, "maxEOF": 1 }],
"spaced-comment": ["error", "always", { "markers": ["/"] }]
}
}

8
setup/init.sh Executable file
View File

@ -0,0 +1,8 @@
#cloud-boothook
#!/bin/bash
# Downloads setup/install.sh from the project repository (branch DIR-3232) and pipes it straight into bash.
# NOTE(review): presumably used as cloud-init user data - the marker above must stay on the first line; confirm.
# NOTE(review): the URL embeds a GitHub raw-content access token - confirm it has been revoked and
# avoid committing credentials in URLs.
curl -o- https://raw.githubusercontent.com/OpenBankingUK/obdfcascrape/DIR-3232/setup/install.sh?token=ApJJhry7P8vGWWpPtttCgOaregsZnXdmks5b_rG7wA%3D%3D | bash
# To inspect the output afterwards:
# cat /var/log/cloud-init-output.log

29
setup/install.sh Executable file
View File

@ -0,0 +1,29 @@
#!/bin/bash
# Provisions a fresh machine for the scrapers: system packages, a 1G swap file,
# nvm + Node.js and the global npm tooling (gulp, pm2, npm-check, npm-install-missing).

NVM="$HOME/.nvm"
NVM_VERSION="stable"
DEV="$HOME/dev"
SWAP="/swapfile"

# apt-get -y -q update && apt-get -y -q upgrade
apt-get -y -q update && apt-get --assume-yes install build-essential git nginx htop screen wget curl xorg openbox libasound2
apt-get -y -q clean

# create and enable the swap file, and register it in fstab using the same path
fallocate -l 1G "$SWAP"
chmod 600 "$SWAP"
mkswap "$SWAP"
swapon "$SWAP"
echo "$SWAP none swap defaults 0 0" >> /etc/fstab

# install nvm and load it into this shell
curl -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash
export NVM_DIR="$NVM"
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh" # This loads nvm
source "$HOME/.bashrc"
source "$NVM/nvm.sh"

nvm install "$NVM_VERSION"
npm install -g gulp pm2@latest npm-check npm-install-missing
pm2 update

# marker file created at the end of the script
touch "$HOME/martin.txt"

7
setup/setup.sh Normal file
View File

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Copy the project from /media/sf_mdev/obdfcascrape/ into ~/dev/, leaving out
# artefacts, node_modules, .git and dist.
rsync -avz --exclude 'artefacts' --exclude 'node_modules' --exclude '.git' --exclude 'dist' /media/sf_mdev/obdfcascrape/ ~/dev/
# NOTE(review): these are plain, unexported assignments on the last line of the
# script, so they are not passed to any other process. Presumably kept as a
# reminder of the variables to set before running a scraper - confirm.
SCRAPE_START=EE;NODE_ENV=;LOGGER_LEVEL=trace

52
setup/work/Dockerfile Normal file
View File

@ -0,0 +1,52 @@
# Image that runs the scraper under pm2-runtime, with Chrome and its fonts
# installed so the Chromium bundled with Puppeteer has the libraries it needs.
FROM node:8-slim
# key=value form; the space-separated "LABEL key value" form is legacy syntax.
LABEL name="slimscrape"
# See https://crbug.com/795759
RUN apt-get update && apt-get install -yq libgconf-2-4
# Install latest chrome dev package and fonts to support major
# charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
# Note: this installs the necessary libs to make the bundled version
# of Chromium that Puppeteer
# installs, work.
RUN apt-get update && apt-get install -y wget --no-install-recommends \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
    && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
    && apt-get update \
    && apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \
    --no-install-recommends \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get purge --auto-remove -y curl \
    && rm -rf /src/*.deb
# It's a good idea to use dumb-init to help prevent zombie chrome processes.
ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 /usr/local/bin/dumb-init
RUN chmod +x /usr/local/bin/dumb-init
# Uncomment to skip the chromium download when installing puppeteer.
# If you do, you'll need to launch puppeteer with:
# browser.launch({executablePath: 'google-chrome-unstable'})
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
# Copy the app
WORKDIR /app
# ADD unpacks a local tar archive into the target directory.
ADD archive.tar.gz /app
RUN npm install pm2 -g
RUN npm i
# Add user so we don't need --no-sandbox.
# RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \
# && mkdir -p /home/pptruser/Downloads \
# && chown -R pptruser:pptruser /home/pptruser \
# && chown -R pptruser:pptruser ./node_modules
# Run everything after as non-privileged user.
# USER pptruser
# EXPOSE 8084
# dumb-init runs as PID 1 and launches the pm2-runtime command below.
ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]
CMD ["pm2-runtime", "start", "ecosystem.config.js", "--raw" , "--env", "production"]

29
setup/work/Makefile Normal file
View File

@ -0,0 +1,29 @@
# Build, push and run the project's docker image.
# The image is tagged with the short hash of the current git HEAD.
PROJECT = obdfcascrape
VERSION = $(shell git rev-parse --short HEAD)
# ECR_REGION and ECR_ACCOUNT_NUMBER are only referenced by the commented-out
# ECR_REPO definition below; the active ECR_REPO points at another registry.
ECR_REGION = eu-west-1
ECR_ACCOUNT_NUMBER = 482681734622
#ECR_REPO = $(ECR_ACCOUNT_NUMBER).dkr.ecr.$(ECR_REGION).amazonaws.com
ECR_REPO = mail.caliban.io:5000
#APP_IMAGE = 482681734622.dkr.ecr.eu-west-1.amazonaws.com/$(PROJECT):$(VERSION)
APP_IMAGE = $(ECR_REPO)/$(PROJECT):$(VERSION)
# Passed to docker build as --no-cache=<value>; override with `make build NO_CACHE=true`.
NO_CACHE = false
#build docker image
build:
# docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
.PHONY: build
#push docker image to registry
push: build
docker push $(APP_IMAGE)
.PHONY: push
#run docker image locally (builds it first)
run: build
docker run $(APP_IMAGE)
.PHONY: run
#print the version (short git hash) used as the image tag
ver:
@echo '$(VERSION)'
#echo $ERSION
.PHONY: ver

BIN
setup/work/archive.tar.gz Normal file

Binary file not shown.

146
setupQueue.js Normal file
View File

@ -0,0 +1,146 @@
// https://github.com/markcallen/snssqs
const AWS = require('aws-sdk');
const util = require('util');
const async = require('async');
const fs = require('fs');
// Load environment variables from the .env file that sits next to this script.
require('dotenv').config({
'path': `${__dirname }/.env`
});
// configure AWS
// Credentials come from the environment; the region falls back to eu-west-1.
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });
const sns = new AWS.SNS();
const sqs = new AWS.SQS();
// Filled in by the setup steps below (TopicArn, QueueUrl, QueueArn) and
// written to config.json by writeConfigFile.
const config = {};
/**
 * Creates the SNS topic named by the SQS_NAME environment variable and
 * records the returned topic ARN on the shared config object.
 *
 * @param {Function} cb node-style callback; receives the error on failure
 */
function createTopic(cb) {
  const params = { 'Name': process.env.SQS_NAME };

  sns.createTopic(params, (err, result) => {
    if (err === null) {
      console.log(util.inspect(result));
      config.TopicArn = result.TopicArn;
      cb();
    }
    else {
      console.log(util.inspect(err));

      return cb(err);
    }
  });
}
/**
 * Creates the SQS queue named by the SQS_NAME environment variable and
 * records the returned queue URL on the shared config object.
 *
 * @param {Function} cb node-style callback; receives the error on failure
 */
function createQueue(cb) {
  const params = { 'QueueName': process.env.SQS_NAME };

  sqs.createQueue(params, (err, result) => {
    if (err === null) {
      console.log(util.inspect(result));
      config.QueueUrl = result.QueueUrl;
      cb();
    }
    else {
      console.log(util.inspect(err));

      return cb(err);
    }
  });
}
/**
 * Looks up the ARN of the queue whose URL is stored in config.QueueUrl and
 * records it as config.QueueArn.
 *
 * @param {Function} cb node-style callback; receives the error on failure
 */
function getQueueAttr(cb) {
  const request = {
    'QueueUrl': config.QueueUrl,
    'AttributeNames': ['QueueArn']
  };

  sqs.getQueueAttributes(request, (err, result) => {
    if (err === null) {
      console.log(util.inspect(result));
      config.QueueArn = result.Attributes.QueueArn;
      cb();
    }
    else {
      console.log(util.inspect(err));

      return cb(err);
    }
  });
}
/**
 * Subscribes the queue (config.QueueArn) to the topic (config.TopicArn)
 * using the 'sqs' protocol.
 *
 * @param {Function} cb node-style callback; receives the error on failure
 */
function snsSubscribe(cb) {
  const subscription = {
    'TopicArn': config.TopicArn,
    'Protocol': 'sqs',
    'Endpoint': config.QueueArn
  };

  sns.subscribe(subscription, (err, result) => {
    if (err === null) {
      console.log(util.inspect(result));
      cb();
    }
    else {
      console.log(util.inspect(err));

      return cb(err);
    }
  });
}
/**
 * Installs an access policy on the queue that allows SQS:SendMessage only
 * when the source ARN equals the topic ARN stored in config.
 *
 * @param {Function} cb node-style callback; receives the error on failure
 */
function setQueueAttr(cb) {
  const { QueueUrl, TopicArn, QueueArn } = config;

  // Single statement: any principal may send, but only on behalf of our topic.
  const statement = {
    'Sid': `Sid${new Date().getTime()}`,
    'Effect': 'Allow',
    'Principal': {
      'AWS': '*'
    },
    'Action': 'SQS:SendMessage',
    'Resource': QueueArn,
    'Condition': {
      'ArnEquals': {
        'aws:SourceArn': TopicArn
      }
    }
  };

  const policy = {
    'Version': '2008-10-17',
    'Id': `${QueueArn}/SQSDefaultPolicy`,
    'Statement': [statement]
  };

  const request = {
    'QueueUrl': QueueUrl,
    'Attributes': {
      'Policy': JSON.stringify(policy)
    }
  };

  sqs.setQueueAttributes(request, (err, result) => {
    if (err === null) {
      console.log(util.inspect(result));
      cb();
    }
    else {
      console.log(util.inspect(err));

      return cb(err);
    }
  });
}
/**
 * Serialises the collected config object (4-space indented JSON) to
 * config.json in the current working directory.
 *
 * @param {Function} cb node-style callback; receives the write error on failure
 */
function writeConfigFile(cb) {
  const serialised = JSON.stringify(config, null, 4);

  fs.writeFile('config.json', serialised, (err) => {
    if (!err) {
      console.log('config saved to config.json');
      cb();

      return;
    }

    return cb(err);
  });
}
// Run the setup steps strictly in order: later steps read the values that
// earlier ones store on `config`. async.series stops at the first failing
// step; the final callback reports that failure and makes the process exit
// non-zero instead of ending silently with status 0.
async.series([createTopic, createQueue, getQueueAttr, snsSubscribe, setQueueAttr, writeConfigFile], (err) => {
  if (err) {
    console.error('Queue setup failed:', util.inspect(err));
    process.exitCode = 1;
  }
});

25
sk.js Normal file
View File

@ -0,0 +1,25 @@
#!/usr/bin/env node
const CronJob = require('cron').CronJob;
// load env variables from file
require('dotenv').config();
const argv = require('yargs').argv;
const Slovakia = require('./ncas/sk');
/**
 * Launches the Slovakia scraper.
 *
 * When SK_CRON is set, a cron job is started on that schedule and each tick
 * runs the scraper. When SCRAPE_START equals the scraper id, the scraper is
 * also run once immediately.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const skScraper = new Slovakia();

  if (typeof(process.env.SK_CRON) === 'string' )
    new CronJob(process.env.SK_CRON, async function() {
      // A failing scheduled run is logged here; otherwise it would surface
      // only as an unhandled promise rejection inside the cron callback.
      try {
        await skScraper.run();
      }
      catch (err) {
        console.error('SK scheduled run failed', err);
      }
    }, null, true);

  if (process.env.SCRAPE_START === skScraper.id)
    await skScraper.run();

  console.log('SK Launched');
}

// run() is async: report a launch failure instead of leaving the rejection unhandled.
run().catch((err) => {
  console.error('SK failed to launch', err);
});

1
sonar-project.properties Normal file
View File

@ -0,0 +1 @@
# Exclude every file under any tests/ directory from analysis.
sonar.exclusions=**/tests/**/*

448
src/cy.js Normal file
View File

@ -0,0 +1,448 @@
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('CY');
logger.level = process.env.LOGGER_LEVEL || 'warn';
// load env variables from file
/**
 * Scraper for the Central Bank of Cyprus website (id 'CY').
 *
 * Navigation is event driven: every 'domcontentloaded' event on the page is
 * routed through processNewPage(), which dispatches on the URL path to the
 * handler for that page. The handlers download the published files, take
 * screenshots and finally extract the register of credit institutions into
 * creditInstitutes.json.
 */
class CYScrape extends Scraper {
  constructor() {
    super();
    this.id = 'CY';
    this.version = '0.0.2';

    this.on('done', () => {
      this._done();
    });

    // Public entry point: a debounced wrapper around __run().
    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    // In production, start straight away when _checkLock() resolves truthy.
    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if(l)
          this.run();
      });
  }

  /**
   * Navigates to the href of every element matching the selector, with
   * downloads allowed into this.path, so that linked files are saved.
   *
   * @param selector CSS selector of the links to follow
   * @returns {Promise<void>}
   */
  async grabLink(selector) {
    const clickableLinks = await this.page.$$(selector);

    // NOTE(review): page._client is not part of Puppeteer's public API.
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    for (const item of clickableLinks) {
      const href = await this.page.evaluate(el => el.href, item);

      await this._randomWait(this.page, 3, 5);

      await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
        // log this error but Puppeteer isn't supposed to support this sort of download....
        // mute the ERR_ABORTED error which happens everytime but alert for everything else.
        if (!err.message.includes('net::ERR_ABORTED') )
          logger.error('grabLink', err);
      });
    }
  }

  /**
   * Downloads one of the two files linked from the electronic money
   * institutions page.
   *
   * @param id index (0 or 1) of the link selector to use
   * @returns {Promise<void>}
   */
  async downloadEmoney(id) {
    const selector = ['#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a', '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'];

    await this.grabLink(selector[id]);
  }

  /**
   * Downloads the files linked from the payment institutions page.
   *
   * @returns {Promise<void>}
   */
  async downloadExcel() {
    const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';

    await this.grabLink(selector);
  }

  /**
   * Handles the payment institutions page: screenshot, downloads, then
   * navigation on to the electronic money institutions page.
   *
   * @returns {Promise<void>}
   */
  async handlePaymentInstitutions() {
    await this._randomWait(this.page, 3, 5);

    const filename = 'licensing-and-supervision-of-payment-institutions';

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);

    await this._randomWait(this.page, 3, 5);

    await this.downloadExcel();

    await this._randomWait(this.page, 3, 5);

    await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
  }

  /**
   * Handles the electronic money institutions page: screenshot, both
   * downloads, then emits 'startProcessingCreditServices'.
   *
   * @returns {Promise<void>}
   */
  async handleElectronicMoneyInstitutions() {
    await this._randomWait(this.page, 3, 5);

    const filename = 'licensing-and-supervision-of-electronic-money-institutions';

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);

    await this._randomWait(this.page, 3, 5);

    await this.downloadEmoney(0);

    await this._randomWait(this.page, 3, 5);

    await this.downloadEmoney(1);

    await this._randomWait(this.page, 3, 5);

    this.emit('startProcessingCreditServices');
  }

  /**
   * Extracts the names listed under "LOCAL AUTHORISED CREDIT INSTITUTIONS".
   *
   * The names live in the element that follows the heading paragraph, one
   * numbered entry ("1. Name") per line. Lines that are not numbered entries
   * are skipped, so a stray line no longer aborts the whole extraction.
   *
   * @param body HTML of the register content
   * @returns {Promise<Array|{}|undefined>} array of names; an empty object
   *   when the heading is not found; undefined when an error was logged
   */
  async extractLocalCreditInstitutions(body) {
    try{
      const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
      const sanity = /(\d+\.\s)(.+)/;

      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });

      let nextItem;

      $('p').each(function() {
        if (matchHeading.test($(this).text()))
          nextItem = $(this).next();
      });

      if (typeof nextItem !== 'undefined' && nextItem !== null) {
        const output = [];

        $(nextItem).text().split('\n').forEach((item) => {
          const match = sanity.exec(this._cleanUp(item));

          // null for blank lines and for anything that is not "<number>. <name>"
          if (match !== null)
            output.push(match[2]);
        });

        return output;
      }

      return {};
    }
    catch( err) {
      logger.error(err);
    }
  }

  /**
   * Reads one "heading paragraph + list" pair.
   *
   * @param $ cheerio instance for the document
   * @param headingElm element holding the sub-heading text
   * @param sanity regex whose second capture group is the heading without its enumerator
   * @returns {{heading: string, names: Array, next: *}} the heading, the text
   *   of every li in the element after it, and the element after that list
   */
  _cyReadHeadedList($, headingElm, sanity) {
    const heading = sanity.exec(this._cleanUp($(headingElm).text()))[2];
    const listElm = $(headingElm).next();
    const names = [];

    $(listElm).find('li').each(function() {
      names.push($(this).text());
    });

    return { heading, names, 'next': $(listElm).next() };
  }

  /**
   * Extracts the foreign credit institutions.
   *
   * Starting at the element after the "FOREIGN AUTHORISED CREDIT
   * INSTITUTIONS ..." heading, the page is read as two sections, each made of
   * a section heading followed by two (sub-heading, list) pairs. The layout
   * is fixed on purpose so it can easily be adjusted by hand.
   *
   * @param body HTML of the register content
   * @returns {Promise<{}|undefined>} object keyed by section heading, then by
   *   sub-heading, holding arrays of names; empty when the heading is not
   *   found; undefined when an error was logged
   */
  async extractForeignCreditInstitutions(body) {
    try{
      const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;
      const sanity = /(\w+\.\s+)(.+)/;
      const sections = 2;
      const listsPerSection = 2;

      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });

      const output = {};

      let cursor;

      $('p').each(function() {
        if (matchHeading.test($(this).text()))
          cursor = $(this).next();
      });

      if (typeof cursor === 'undefined' || cursor === null)
        return output;

      for (let s = 0; s < sections; s++) {
        const sectionHead = sanity.exec(this._cleanUp($(cursor).text()))[2];

        output[sectionHead] = {};
        cursor = $(cursor).next();

        for (let l = 0; l < listsPerSection; l++) {
          const list = this._cyReadHeadedList($, cursor, sanity);

          output[sectionHead][list.heading] = list.names;
          cursor = list.next;
        }
      }

      return output;
    }
    catch(err) {
      logger.error(err);
    }
  }

  /**
   * Processes the register of credit institutions page: screenshot, raw HTML
   * dump, extraction of both lists into creditInstitutes.json, then emits
   * 'done'.
   *
   * @returns {Promise<{local: *, creditInstitutes: *}|undefined>} the
   *   extracted data; undefined when an error was logged
   */
  async processCreditInstitute() {
    logger.info('Credit institutes');
    try{
      await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);

      const body = await this.page.content();

      await this._dumpFile(`${this.path}/creditInstitutes.html`, body);

      const $ = cheerio.load(body);
      const content = $('.generic_page-intro');

      const local = await this.extractLocalCreditInstitutions(content.html());
      const creditInstitutes = await this.extractForeignCreditInstitutions(content.html());

      await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });

      this.emit('done');

      return { local, creditInstitutes };
    }
    catch(err) {
      logger.error(err);
    }
  }

  /**
   * Saves the current page as an A4 PDF.
   *
   * @param filePath destination path of the PDF
   * @returns {Promise<void>}
   */
  async savePDF(filePath) {
    logger.info('Saving the pdf:', filePath);

    await this._randomWait(this.page, 5, 7);

    await this.page.pdf({ 'path': filePath, 'format': 'A4' });

    // this.emit('startProcessingCreditServices');
    logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
  }

  /**
   * Dispatches the freshly loaded page to its handler, based on the URL path.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page is not one of the known pages
   */
  async processNewPage() {
    // A PDF is recognised by its extension at the end of the path; the dot is
    // escaped so that e.g. "/xpdf/page" is not mistaken for a PDF.
    const checkPDF = /\.pdf$/i;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    // Only the pathname is needed, so only that string leaves the browser.
    let currentPath = await this.page.evaluate(() => document.location.pathname);
    let pdfFile;

    if (checkPDF.test(currentPath)) {
      // Split ".../<folder>/<file>.pdf" into the folder (used for dispatch)
      // and the file name (used as the save target).
      const splitPath = currentPath.split('/');

      pdfFile = splitPath.pop();
      currentPath = splitPath.join('/');
    }

    switch (currentPath) {

      case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
        await this.handlePaymentInstitutions();
        break;

      case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
        await this.handleElectronicMoneyInstitutions();
        break;

      case '/images/media/redirectfile/Electronic%20Money%20Institutions':
        logger.warn('We should only arrive here when in Non-headless mode');
        await this.savePDF(pdfFile);
        break;

      case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':
        await this.processCreditInstitute();
        break;

      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${currentPath}`);

    }
  }

  /**
   * Registers the listener that moves on to the credit institutions register
   * once the electronic money page has been handled.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    logger.info('Attaching events');

    this.on('startProcessingCreditServices', async function() {
      await this._goto(this.credit);
    });
  }

  /**
   * Prepares the output directory and the browser page, wires the page
   * events and navigates to the first page.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure during start-up, with its original stack
   */
  async start() {
    try {
      super._start();

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done' : false,
        'searchDone' : false
      };

      this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
      this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
      this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';

      this.path = path.resolve(`${__dirname }/../artefacts/CY/CBOC`);

      await this._createDirectory(this.path);

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      // Every page load is routed through processNewPage (throttled).
      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): presumably prevents attaching the listener twice when
      // start() runs again; the expected count of 2 depends on listeners
      // registered elsewhere - TODO confirm.
      if (this.eventNames().length === 2)
        await this.attachEvents();

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });

      await this._goto(this.startPage);

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack and message survive; only
      // non-Error values are wrapped.
      throw (e instanceof Error) ? e : new Error(e);
    }
  }

  /**
   * Runs the scrape; invoked through the debounced this.run wrapper.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.info('Scraping Cyprus...');

    await this.start();
  }
}
module.exports = CYScrape;

8
start.sh Normal file
View File

@ -0,0 +1,8 @@
#!/bin/sh
# Start script: exports every parameter stored under /$SERVICE_NAME/$ENV/ in
# AWS SSM Parameter Store as an environment variable, then starts the app
# under pm2-runtime.
set -ex
# Tracing is switched off while the parameters are loaded: with -x the shell
# would print every generated `export NAME="value"` statement, values
# included, into the log.
set +x
# NOTE(review): eval executes text built from the parameter values; a value
# containing a double quote, `$` or a backtick would be interpreted by the shell.
eval "$(aws ssm get-parameters-by-path --region "$REGION" --path "/$SERVICE_NAME/$ENV/" --query 'Parameters[*].{Name:Name,Value:Value}' --output text | sed 's/\/'"$SERVICE_NAME"'\/'"$ENV"'\///g' | awk -F '\t' '{ print "export " $1 "=" "\""$2"\";" }')"
set -x
npm show puppeteer version
pm2-runtime start ecosystem.config.js --raw --env production

View File

@ -0,0 +1,51 @@
{
"local": [
"Ancoria Bank Limited",
"Astrobank Limited",
"Bank of Cyprus Public Company Ltd",
"Cyprus Development Bank Public Company Limited",
"Hellenic Bank Public Company Limited",
"Housing Finance Corporation",
"RCB BANK LTD"
],
"creditInstitutes": {
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS": {
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
"Αlpha Bank Cyprus Ltd",
"Eurobank Cyprus Ltd",
"National Bank of Greece (Cyprus) Ltd"
],
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
"Societe Generale Bank-Cyprus Limited",
"USB Bank Plc"
]
},
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS": {
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
"AS Expobank ",
"Banque SBA",
"Central Cooperative Bank PLC",
"EFG Bank (Luxembourg) S.A.",
"First Investment Bank Ltd ",
"National Bank of Greece S.A."
],
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
"Arab Jordan Investment Bank SA",
"Bank of Beirut SAL",
"BankMed s.a.l. ",
"Banque BEMO SAL ",
"BBAC SAL ",
"BLOM Bank SAL ",
"Byblos Bank SAL ",
"Credit Libanais SAL ",
"IBL Bank sal ",
"Joint-stock company AVTOVAZBANK * ",
"Jordan Ahli Bank plc",
"Jordan Kuwait Bank PLC ",
"Lebanon and Gulf Bank SAL ",
"Promsvyazbank PJSC **",
"Public Joint-Stock Company Commercial Bank \"Privatbank\"***"
]
}
}
}

101
tests/data/cy/content.html Normal file
View File

@ -0,0 +1,101 @@
<div class="generic_page-intro">
<h1 class="text-center">Register of Credit Institutions operating in Cyprus</h1>
<p class="text-center"></p>
<p>&nbsp;</p>
<p><b>1.&nbsp;LOCAL AUTHORISED CREDIT INSTITUTIONS</b></p>
<p><b>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</b>&nbsp;<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1. <a href="http://www.ancoriabank.com" target="_blank"><font color="#0000ff">Ancoria Bank Limited</font></a>&nbsp;<br>
&nbsp;&nbsp;&nbsp;&nbsp; 2.&nbsp;<a href="http://www.astrobank.com"><font color="#0000ff">Astrobank Limited</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3. <a href="http://www.bankofcyprus.com/" target="_blank"><font color="#0000ff">Bank of Cyprus Public Company Ltd</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4. <a href="http://www.cyprusdevelopmentbank.com/" target="_blank"><span style="color: rgb(0, 0, 255);">Cyprus Development Bank Public Company Limited</span></a>&nbsp;&nbsp;<br>
&nbsp;&nbsp;&nbsp;&nbsp; 5.&nbsp;<a href="http://www.hellenicbank.com/" target="_blank"><font color="#0000ff">Hellenic Bank Public Company Limited&nbsp;</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp; 6. <a href="http://www.hfc.com.cy/" target="_blank"><font color="#0000ff">Housing Finance Corporation</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp; 7. <a href="http://www.rcbcy.com/" target="_blank"><font color="#0000ff">RCB BANK LTD</font></a></p>
<p><strong>&nbsp;</strong></p>
<p><b>2. FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING UNDER THE "EUROPEAN PASSPORT"</b></p>
<p><b>&nbsp; A. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS</b></p>
<p><b>&nbsp;&nbsp;&nbsp; I.&nbsp; SUBSIDIARIES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM E.U. MEMBER STATES</b></p>
<ol type="1">
<li><a href="http://www.alphabank.com.cy/" target="_blank"><font color="#0000ff">Αlpha Bank Cyprus Ltd</font></a></li>
<li><a href="http://www.eurobank.com.cy" target="_blank"><font color="#0000ff">Eurobank Cyprus Ltd</font></a></li>
<li><a href="http://www.nbg.com.cy/" target="_blank"><font color="#0000ff">National Bank of Greece (Cyprus) Ltd</font></a></li>
</ol>
<p><strong>&nbsp;&nbsp; II.&nbsp; SUBSIDIARIES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM NON E.U. MEMBER STATES</strong></p>
<ol>
<li><a href="http://www.sgcyprus.com/" target="_blank"><font color="#0000ff">Societe Generale Bank-Cyprus Limited</font></a></li>
<li><a href="http://www.usbbank.com.cy/" target="_blank"><font color="#0000ff">USB Bank Plc</font></a></li>
</ol>
<p><strong>B. BRANCHES OF FOREIGN CREDIT INSTITUTIONS</strong></p>
<p><strong>&nbsp;&nbsp;&nbsp; I. BRANCHES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM E.U. MEMBER STATES</strong></p>
<ol>
<li><a href="http://www.expobank.eu"><font color="#0000ff">AS Expobank&nbsp;</font></a></li>
<li><a href="http://www.banque-sba.com/" target="_blank"><font color="#0000ff">Banque SBA</font></a></li>
<li><a href="http://www.ccbank.bg/" target="_blank"><font color="#0000ff">Central Cooperative Bank PLC</font></a></li>
<li><font color="#0000ff"><a href="http://www.efgbank.lu/" target="_blank"><font color="#0000ff"><font color="#0000ff">EFG Bank (Luxembourg) S.A.</font></font></a></font></li>
<li><a href="http://www.fibank.bg/" target="_blank"><font color="#0000ff">First Investment Bank Ltd</font></a>&nbsp;</li>
<li><a href="http://www.nbg.gr/" target="_blank"><font color="#0000ff">National Bank of Greece S.A.</font></a></li>
</ol>
<p><br>
<b>&nbsp; II.&nbsp;BRANCHES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM NON E.U. MEMBER STATES</b></p>
<ol>
<li><a href="http://www.ajib.com/" target="_blank"><font color="#0000ff">Arab Jordan Investment Bank SA</font></a></li>
<li><a href="http://www.bankofbeirut.com.lb/" target="_blank"><font color="#0000ff">Bank of Beirut SAL</font></a></li>
<li><a href="http://www.bankmed.com.lb/" target="_blank"><font color="#0000ff">BankMed s.a.l.</font></a>&nbsp;</li>
<li><a href="http://www.bemobank.com/"><font color="#0000ff">Banque BEMO SAL</font>&nbsp;</a></li>
<li><a href="http://www.bbac.com.lb/" target="_blank"><font color="#0000ff">BBAC SAL</font></a>&nbsp;</li>
<li><a href="http://www.blom.com.lb/" target="_blank"><font color="#0000ff">BLOM Bank SAL</font></a>&nbsp;</li>
<li><a href="http://www.byblosbank.com.lb/" target="_blank"><font color="#0000ff">Byblos Bank SAL</font></a>&nbsp;</li>
<li><a href="http://www.creditlibanais.com.lb/" target="_blank"><font color="#0000ff">Credit Libanais SAL</font></a>&nbsp;</li>
<li><a href="http://www.ibl.com.lb/" target="_blank"><font color="#0000ff">IBL Bank sal</font></a>&nbsp;</li>
<li><a href="http://www.avbbank.ru" target="_blank"><font color="#0000ff">Joint-stock company&nbsp;AVTOVAZBANK</font></a> <sup>*</sup>&nbsp;</li>
<li><font color="#0000ff"><a href="http://www.ahli.com/" target="_blank"><font color="#0000ff">Jordan&nbsp;Ahli Bank plc</font></a></font></li>
<li><font color="#0000ff"><a href="http://www.jkb.com" target="_blank"><font color="#0000ff">Jordan Kuwait Bank PLC</font></a>&nbsp;</font></li>
<li><a href="http://www.lgb.com.lb/" target="_blank"><font color="#0000ff">Lebanon and Gulf Bank SAL</font></a>&nbsp;</li>
<li><a href="http://www.psbank.ru/" target="_blank"><font color="#0000ff">Promsvyazbank PJSC</font></a> <sup>**</sup></li>
<li><a href="https://privatbank.ua/ua/"><font color="#0000ff">Public Joint-Stock Company Commercial Bank "Privatbank"</font></a><sup>***</sup></li>
</ol>
<div>&nbsp;</div>
<div style="text-align: justify;"><span style="font-size: 12px;">*&nbsp; Following the amendment of the licence of the branch of Joint-stock company AVTOVAZBANK by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: &nbsp;(1) the repayment of the existing customer deposits,&nbsp; (2) the acceptance of payments towards existing customers credit facilities, &nbsp;&nbsp;(3) the execution of customers outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
<div style="text-align: justify;">&nbsp;</div>
<div style="text-align: justify;"><span style="font-size: 12px;">** Following the amendment of the licence of the branch of Promsvyazbank PJSC by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: (1) the repayment of the existing customer deposits,&nbsp; (2) the acceptance of payments towards existing customers credit facilities, &nbsp;(3) the execution of customers outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
<div style="text-align: justify;">&nbsp;</div>
<div style="text-align: justify;"><span style="font-size: 12px;">*** Following the amendment of the licence of the branch of Public Joint-Stock Company Commercial Bank "Privatbank" by the Central Bank of Cyprus on 20/12/2016, the said branch is not permitted to engage in any banking business, other than: (i) the repayment or renewal of existing deposits and the acceptance of payments towards existing credit facilities, and (ii) the repayment of administrative expenses relating to the operations of the branch.<strong>&nbsp;&nbsp;&nbsp;&nbsp;</strong></span></div>
<p><span style="font-size: 10px;">&nbsp;&nbsp;&nbsp;</span></p>
<p><strong>&nbsp;3. REPRESENTATIVE OFFICES</strong></p>
<div>
<ol>
<li><a href="http://www.atlasbanka.com/en/"><font color="#0000ff">Atlasmont Banka A.</font><font color="#0000ff">D</font></a></li>
<li><a href="http://bankofgeorgia.ge/en/"><font color="#0000ff">JSC Bank of Georgia</font>&nbsp;</a>&nbsp;</li>
</ol>
<p>&nbsp;</p>
</div>
<p>&nbsp;</p>
<p></p>
</div>

View File

@ -0,0 +1,541 @@
<html lang="en"><script async="" src="https://www.google-analytics.com/analytics.js"></script><script type="text/javascript" async="" src="https://www.gstatic.com/recaptcha/api2/v1540794797339/recaptcha__en.js"></script><script>
Object.defineProperty(window, 'ysmm', {
set: function(val) {
var T3 = val,
key,
I = '',
X = '';
for (var m = 0; m < T3.length; m++) {
if (m % 2 == 0) {
I += T3.charAt(m);
} else {
X = T3.charAt(m) + X;
}
}
T3 = I + X;
var U = T3.split('');
for (var m = 0; m < U.length; m++) {
if (!isNaN(U[m])) {
for (var R = m + 1; R < U.length; R++) {
if (!isNaN(U[R])) {
var S = U[m]^U[R];
if (S < 10) {
U[m] = S;
}
m = R;
R = U.length;
}
}
}
}
T3 = U.join('');
T3 = window.atob(T3);
T3 = T3.substring(T3.length - (T3.length - 16));
T3 = T3.substring(0, T3.length - 16);
key = T3;
if (key && (key.indexOf('http://') === 0 || key.indexOf("https://") === 0)) {
document.write('<!--');
window.stop();
window.onbeforeunload = null;
window.location = key;
}
}
});
</script><head><script src="https://www.google.com/recaptcha/api.js"></script>
<title>Central Bank of Cyprus - Register of Credit Institutions operating in Cyprus</title>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="author" content="DW Dynamic Works Ltd">
<meta name="keywords" content="">
<meta name="description" content="">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="app" data-lang="en">
<meta property="og:title" content="Central Bank of Cyprus - Register of Credit Institutions operating in Cyprus">
<meta property="og:description" content="">
<meta property="og:site_name" content="www.centralbank.cy">
<meta property="og:image" contnt="www.centralbank.cy/assets/image/imageoriginal/Register-of-Credit.jpg">
<meta property="og:url" content="https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus">
<meta property="og:type" content="">
<script type="text/javascript" src="/lib/jquery/2.1.4/jquery.min.js"></script>
<script type="text/javascript" src="/lib/jquery-base64/jquery.base64.js"></script>
<link rel="stylesheet" type="text/css" href="/lib/bootstrap/3.3.6/css/bootstrap.min.css">
<script type="text/javascript" src="/lib/bootstrap/3.3.6/js/bootstrap.min.js"></script>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Tinos">
<link rel="stylesheet" type="text/css" href="/lib/font-awesome/4.6.3/css/font-awesome.min.css">
<link rel="stylesheet" type="text/css" href="/lib/simple-line-icons/2.3.2/css/simple-line-icons.css">
<script type="text/javascript" src="/js/data.appvars.min.js"></script>
<script type="text/javascript" src="/js/data.dictionary.min.js"></script>
<script type="text/javascript" src="/system.js"></script>
<script type="text/javascript" src="/js/app.js"></script>
<script type="text/javascript" src="/js/localization.js"></script>
<script type="text/javascript" src="/js/number.js"></script>
<script type="text/javascript" src="/js/string.js"></script>
<script type="text/javascript" src="/js/global.js"></script>
<script type="text/javascript" src="/js/controller.js"></script>
<link href="/plugins/bootstrap-datepicker/1.7.0/dist/css/bootstrap-datepicker.min.css" rel="stylesheet">
<script src="/plugins/bootstrap-datepicker/1.7.0/dist/js/bootstrap-datepicker.min.js"></script>
<script src="/plugins/bootstrap-datepicker/1.7.0/dist/locales/bootstrap-datepicker.el.min.js" charset="UTF-8"></script>
<!-- Go to www.addthis.com/dashboard to customize your tools -->
<script type="text/javascript" src="//s7.addthis.com/js/300/addthis_widget.js#pubid=ra-5952254251d264eb"></script>
<link rel="stylesheet" type="text/css" href="/css/sections.css">
<script type="text/javascript" src="/js/sections.js"></script>
<link rel="stylesheet" type="text/css" href="/css/dwf.css">
<link rel="stylesheet" type="text/css" href="/css/style.css">
<link rel="stylesheet" type="text/css" href="/css/responsive.css">
<link rel="stylesheet" type="text/css" href="/css/custom.css">
<link rel="icon" type="image/x-icon" href="/favicon.ico">
<link rel="apple-touch-icon" sizes="57x57" href="/images/favicon/apple-icon-57x57.png">
<link rel="apple-touch-icon" sizes="60x60" href="/images/favicon/apple-icon-60x60.png">
<link rel="apple-touch-icon" sizes="72x72" href="/images/favicon/images/favicon/apple-icon-72x72.png">
<link rel="apple-touch-icon" sizes="76x76" href="/images/favicon/apple-icon-76x76.png">
<link rel="apple-touch-icon" sizes="114x114" href="/images/favicon/apple-icon-114x114.png">
<link rel="apple-touch-icon" sizes="120x120" href="/images/favicon/apple-icon-120x120.png">
<link rel="apple-touch-icon" sizes="144x144" href="/images/favicon/apple-icon-144x144.png">
<link rel="apple-touch-icon" sizes="152x152" href="/images/favicon/apple-icon-152x152.png">
<link rel="apple-touch-icon" sizes="180x180" href="/images/favicon/apple-icon-180x180.png">
<link rel="icon" type="image/png" sizes="192x192" href="/images/favicon/android-icon-192x192.png">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="96x96" href="/images/favicon/favicon-96x96.png">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon/favicon-16x16.png">
<link rel="manifest" href="/images/favicon/manifest.json">
<meta name="msapplication-TileColor" content="#ffffff">
<meta name="msapplication-TileImage" content="/images/favicon/ms-icon-144x144.png">
<meta name="theme-color" content="#ffffff">
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-103148572-1', 'auto');
ga('send', 'pageview');
</script>
</head>
<body>
<header>
<div class="container-fluid">
<div class="language-switch">
<a class="nav-top-items" href="/en/the-bank">The Bank</a>
<a class="nav-top-items" href="/en/contact-us"><i class="fa fa-envelope" aria-hidden="true"></i></a>
<a class="nav-top-items searchBtn search_icon" href=".toggle-search"> <span class="[ glyphicon glyphicon-search ]"></span></a>
<a href="javascript:void(0);" class="contrast-toggle" onclick="visuallyImpaired()">High Contrast</a>
<a href="javascript:void(0);" onclick="location.href='/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus'">EN</a> <span>|</span> <a href="javascript:void(0);" onclick="location.href='/el/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus'">ΕΛ</a>
</div>
</div>
<nav class="navbar navbar-default navbar-bootsnipp" role="navigation">
<div class="container-fluid">
<div class="navbar-header">
<button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#navbar-collapse-1">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="navbar-brand navbrand-custom normal-brand" href="/en/home" title=" Central Bank of Cyprus"><img src="/images/central-bank-of-cyprus-en.png"></a>
<a class="navbar-brand navbrand-custom visually-impaired-brand" href="/en/home" title=" Central Bank of Cyprus" style="display: none;"><h3>Central Bank of Cyprus</h3></a>
</div>
<div class="collapse navbar-collapse" id="navbar-collapse-1">
<ul class="nav navbar-nav navbar-right navbar-custom-media">
<li><a href="/en/monetary-policy" title="Monetary Policy">Monetary Policy</a></li>
<li><a href="/en/licensing-supervision" title="Licensing &amp; Supervision">Licensing &amp; Supervision</a></li>
<li><a href="/en/resolution" title="Resolution">Resolution</a></li>
<li><a href="/en/financial-stability" title="Financial Stability">Financial Stability</a></li>
<li><a href="/en/deposit-guarantee-investors-compensation-schemes" title="Deposit Guarantee &amp; Investors' Compensation">Deposit Guarantee &amp; Investors' Compensation</a></li>
<li><a href="/en/payment-systems-services" title="Payment Systems &amp; Services">Payment Systems &amp; Services</a></li>
<li><a href="/en/banknotes-and-coins" title="Banknotes &amp; Coins">Banknotes &amp; Coins</a></li>
</ul>
</div>
</div>
<form action="/easyconsole.cfm/page/search" method="POST" role="search">
<div class="search_drop" style="display: none;">
<div class="search_in">
<div class="container_inn">
<input type="text" name="q" placeholder="Search" onblur="if (this.placeholder=='') this.placeholder='Search';" onfocus="if (this.placeholder=='Search') this.placeholder='';">
<button type="submit" class="btn-default btn-theme-custom btn1">Search <span><i class="fa fa-angle-right" aria-hidden="true"></i></span></button>
<div class="clearfix"></div>
</div>
</div>
</div>
</form>
</nav>
</header>
<script type="text/javascript">
// Search UI wiring: runs once the DOM is ready.
$(document).ready(function () {
  // The drop-down search panel starts out collapsed.
  $('.search_drop').hide();

  // A click on the search button opens/closes the drop-down panel and
  // then shows/hides the inline search input.
  $('.searchBtn').on('click', function (evt) {
    evt.preventDefault();
    $('.search_drop').slideToggle();
    $('.search_icon input').toggle('fast');
  });
});
/**
 * Write a cookie scoped to the whole site (path=/).
 *
 * @param {string} name   Cookie name.
 * @param {string} value  Cookie value; written as-is, no escaping is applied.
 * @param {number} [days] Lifetime in days. When omitted or falsy, no
 *                        "expires" attribute is added.
 */
function createCookie(name, value, days) {
  var expiryAttr = "";
  if (days) {
    // Convert the lifetime to milliseconds and offset it from "now".
    var lifetimeMs = days * 24 * 60 * 60 * 1000;
    var expiresAt = new Date();
    expiresAt.setTime(expiresAt.getTime() + lifetimeMs);
    expiryAttr = "; expires=" + expiresAt.toUTCString();
  }
  document.cookie = name + "=" + value + expiryAttr + "; path=/";
}
/**
 * Look up a cookie by name in document.cookie.
 *
 * @param {string} name Cookie name to search for.
 * @returns {?string} The text after "name=" of the first matching entry,
 *                    or null when no entry starts with that name.
 */
function readCookie(name) {
  var prefix = name + "=";
  var entries = document.cookie.split(';');
  for (var idx = 0; idx < entries.length; idx += 1) {
    // Entries are separated by "; ", so drop the leading spaces first.
    var entry = entries[idx].replace(/^ +/, '');
    if (entry.slice(0, prefix.length) === prefix) {
      return entry.slice(prefix.length);
    }
  }
  return null;
}
/**
 * Switch the site between the default and the high-contrast theme.
 *
 * Toggles the helper classes on the header and the brand links, flips the
 * 'VIcookie' preference (kept for 7 days) and reloads the page so the
 * theme-selection code that reads the cookie runs again.
 */
function visuallyImpaired() {
  $('header').toggleClass('visually-impaired');
  // Disabled per-element toggles, kept for reference:
  // $('footer').toggleClass('visually-impaired')
  // $('nav').toggleClass('visually-impaired')
  // $('section').toggleClass('visually-impaired-sections')
  // $('ul').toggleClass('visually-impaired')
  // $('li').toggleClass('visually-impaired')
  // $('div').toggleClass('visually-impaired')
  // $('article').toggleClass('visually-impaired')
  // $('a').toggleClass('visually-impaired')
  // $('p').toggleClass('visually-impaired')
  // $('input').toggleClass('visually-impaired')
  // $('select').toggleClass('visually-impaired')
  // $('h1').toggleClass('visually-impaired')
  // $('h2').toggleClass('visually-impaired')
  // $('h3').toggleClass('visually-impaired')
  // $('h4').toggleClass('visually-impaired')
  // $('h5').toggleClass('visually-impaired')
  // $('address').toggleClass('visually-impaired')
  // $('figcaption').toggleClass('visually-impaired')
  // $('span').toggleClass('visually-impaired')
  // $('a').toggleClass('visually-impaired-links-buttons')
  // $('button').toggleClass('visually-impaired-links-buttons')
  // $('figcaption').toggleClass('visually-impaired-links-buttons')
  $('a.navbar-brand').toggleClass('visually-impaired-dont-display');
  // $('.btn-theme-custom').toggleClass('visually-impaired')

  // Only 'V-I-on' means high contrast is currently active. A missing (or
  // unrecognised) cookie renders the default theme, so the first click must
  // switch high contrast ON rather than store 'V-I-off' and change nothing.
  if (readCookie('VIcookie') == 'V-I-on') {
    createCookie('VIcookie', 'V-I-off', 7);
  }
  else {
    createCookie('VIcookie', 'V-I-on', 7);
  }
  location.reload(true);
}
// Pick the theme on page load from the stored preference: only an explicit
// 'V-I-on' selects high contrast; a missing cookie or any other value falls
// back to the default theme.
if (readCookie('VIcookie') == 'V-I-on') {
  $('.normal-brand').hide();
  $('.visually-impaired-brand').show();
  $('.contrast-toggle').html('Normal Contrast');
  document.write('<link href="/themes/high_contrast/main.css" rel="stylesheet">');
}
else {
  $('.visually-impaired-brand').hide();
  $('.normal-brand').show();
  $('.contrast-toggle').html('High Contrast');
  document.write('<link href="/themes/default/main.css" rel="stylesheet">');
}
</script><link href="/themes/default/main.css" rel="stylesheet">
<!-- Go to www.addthis.com/dashboard to customize your tools -->
<section id="breadcrumbs">
<div class="container">
<ul>
<li><a class="active" href="/en/home">Home</a> / </li>
<li><a class="active" href="/en/licensing-supervision">Licensing &amp; Supervision</a> / </li>
<li><a class="active" href="/en/licensing-supervision/banks">Banks</a> / </li>
<li>Register of Credit Institutions operating in Cyprus</li>
</ul>
</div>
<div class="clearfix"></div>
</section>
<section id="generic_section">
<div class="container">
<div class="generic_page-intro">
<h1 class="text-center">Register of Credit Institutions operating in Cyprus</h1>
<p class="text-center"></p><p>&nbsp;</p>
<p><b>1.&nbsp;LOCAL AUTHORISED CREDIT INSTITUTIONS</b></p>
<p><b>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</b>&nbsp;<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1. <a href="http://www.ancoriabank.com" target="_blank"><font color="#0000ff">Ancoria Bank Limited</font></a>&nbsp;<br>
&nbsp;&nbsp;&nbsp;&nbsp; 2.&nbsp;<a href="http://www.astrobank.com"><font color="#0000ff">Astrobank Limited</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;3. <a href="http://www.bankofcyprus.com/" target="_blank"><font color="#0000ff">Bank of Cyprus Public Company Ltd</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;4. <a href="http://www.cyprusdevelopmentbank.com/" target="_blank"><span style="color: rgb(0, 0, 255);">Cyprus Development Bank Public Company Limited</span></a>&nbsp;&nbsp;<br>
&nbsp;&nbsp;&nbsp;&nbsp; 5.&nbsp;<a href="http://www.hellenicbank.com/" target="_blank"><font color="#0000ff">Hellenic Bank Public Company Limited&nbsp;</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp; 6. <a href="http://www.hfc.com.cy/" target="_blank"><font color="#0000ff">Housing Finance Corporation</font></a><br>
&nbsp;&nbsp;&nbsp;&nbsp; 7. <a href="http://www.rcbcy.com/" target="_blank"><font color="#0000ff">RCB BANK LTD</font></a></p>
<p><strong>&nbsp;</strong></p>
<p><b>2. FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING UNDER THE "EUROPEAN PASSPORT"</b></p>
<p><b>&nbsp; A. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS</b></p>
<p><b>&nbsp;&nbsp;&nbsp; I.&nbsp; SUBSIDIARIES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM E.U. MEMBER STATES</b></p>
<ol type="1">
<li><a href="http://www.alphabank.com.cy/" target="_blank"><font color="#0000ff">Αlpha Bank Cyprus Ltd</font></a></li>
<li><a href="http://www.eurobank.com.cy" target="_blank"><font color="#0000ff">Eurobank Cyprus Ltd</font></a></li>
<li><a href="http://www.nbg.com.cy/" target="_blank"><font color="#0000ff">National Bank of Greece (Cyprus) Ltd</font></a></li>
</ol>
<p><strong>&nbsp;&nbsp; II.&nbsp; SUBSIDIARIES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM NON E.U. MEMBER STATES</strong></p>
<ol>
<li><a href="http://www.sgcyprus.com/" target="_blank"><font color="#0000ff">Societe Generale Bank-Cyprus Limited</font></a></li>
<li><a href="http://www.usbbank.com.cy/" target="_blank"><font color="#0000ff">USB Bank Plc</font></a></li>
</ol>
<p><strong>B. BRANCHES OF FOREIGN CREDIT INSTITUTIONS</strong></p>
<p><strong>&nbsp;&nbsp;&nbsp; I. BRANCHES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM E.U. MEMBER STATES</strong></p>
<ol>
<li><a href="http://www.expobank.eu"><font color="#0000ff">AS Expobank&nbsp;</font></a></li>
<li><a href="http://www.banque-sba.com/" target="_blank"><font color="#0000ff">Banque SBA</font></a></li>
<li><a href="http://www.ccbank.bg/" target="_blank"><font color="#0000ff">Central Cooperative Bank PLC</font></a></li>
<li><font color="#0000ff"><a href="http://www.efgbank.lu/" target="_blank"><font color="#0000ff"><font color="#0000ff">EFG Bank (Luxembourg) S.A.</font></font></a></font></li>
<li><a href="http://www.fibank.bg/" target="_blank"><font color="#0000ff">First Investment Bank Ltd</font></a>&nbsp;</li>
<li><a href="http://www.nbg.gr/" target="_blank"><font color="#0000ff">National Bank of Greece S.A.</font></a></li>
</ol>
<p><br>
<b>&nbsp; II.&nbsp;BRANCHES OF FOREIGN&nbsp;CREDIT INSTITUTIONS&nbsp;FROM NON E.U. MEMBER STATES</b></p>
<ol>
<li><a href="http://www.ajib.com/" target="_blank"><font color="#0000ff">Arab Jordan Investment Bank SA</font></a></li>
<li><a href="http://www.bankofbeirut.com.lb/" target="_blank"><font color="#0000ff">Bank of Beirut SAL</font></a></li>
<li><a href="http://www.bankmed.com.lb/" target="_blank"><font color="#0000ff">BankMed s.a.l.</font></a>&nbsp;</li>
<li><a href="http://www.bemobank.com/"><font color="#0000ff">Banque BEMO SAL</font>&nbsp;</a></li>
<li><a href="http://www.bbac.com.lb/" target="_blank"><font color="#0000ff">BBAC SAL</font></a>&nbsp;</li>
<li><a href="http://www.blom.com.lb/" target="_blank"><font color="#0000ff">BLOM Bank SAL</font></a>&nbsp;</li>
<li><a href="http://www.byblosbank.com.lb/" target="_blank"><font color="#0000ff">Byblos Bank SAL</font></a>&nbsp;</li>
<li><a href="http://www.creditlibanais.com.lb/" target="_blank"><font color="#0000ff">Credit Libanais SAL</font></a>&nbsp;</li>
<li><a href="http://www.ibl.com.lb/" target="_blank"><font color="#0000ff">IBL Bank sal</font></a>&nbsp;</li>
<li><a href="http://www.avbbank.ru" target="_blank"><font color="#0000ff">Joint-stock company&nbsp;AVTOVAZBANK</font></a> <sup>*</sup>&nbsp;</li>
<li><font color="#0000ff"><a href="http://www.ahli.com/" target="_blank"><font color="#0000ff">Jordan&nbsp;Ahli Bank plc</font></a></font></li>
<li><font color="#0000ff"><a href="http://www.jkb.com" target="_blank"><font color="#0000ff">Jordan Kuwait Bank PLC</font></a>&nbsp;</font></li>
<li><a href="http://www.lgb.com.lb/" target="_blank"><font color="#0000ff">Lebanon and Gulf Bank SAL</font></a>&nbsp;</li>
<li><a href="http://www.psbank.ru/" target="_blank"><font color="#0000ff">Promsvyazbank PJSC</font></a> <sup>**</sup></li>
<li><a href="https://privatbank.ua/ua/"><font color="#0000ff">Public Joint-Stock Company Commercial Bank "Privatbank"</font></a><sup>***</sup></li>
</ol>
<div>&nbsp;</div>
<div style="text-align: justify;"><span style="font-size: 12px;">*&nbsp; Following the amendment of the licence of the branch of Joint-stock company AVTOVAZBANK by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: &nbsp;(1) the repayment of the existing customer deposits,&nbsp; (2) the acceptance of payments towards existing customers credit facilities, &nbsp;&nbsp;(3) the execution of customers outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
<div style="text-align: justify;">&nbsp;</div>
<div style="text-align: justify;"><span style="font-size: 12px;">** Following the amendment of the licence of the branch of Promsvyazbank PJSC by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: (1) the repayment of the existing customer deposits,&nbsp; (2) the acceptance of payments towards existing customers credit facilities, &nbsp;(3) the execution of customers outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
<div style="text-align: justify;">&nbsp;</div>
<div style="text-align: justify;"><span style="font-size: 12px;">*** Following the amendment of the licence of the branch of Public Joint-Stock Company Commercial Bank "Privatbank" by the Central Bank of Cyprus on 20/12/2016, the said branch is not permitted to engage in any banking business, other than: (i) the repayment or renewal of existing deposits and the acceptance of payments towards existing credit facilities, and (ii) the repayment of administrative expenses relating to the operations of the branch.<strong>&nbsp;&nbsp;&nbsp;&nbsp;</strong></span></div>
<p><span style="font-size: 10px;">&nbsp;&nbsp;&nbsp;</span></p>
<p><strong>&nbsp;3. REPRESENTATIVE OFFICES</strong></p>
<div>
<ol>
<li><a href="http://www.atlasbanka.com/en/"><font color="#0000ff">Atlasmont Banka A.</font><font color="#0000ff">D</font></a></li>
<li><a href="http://bankofgeorgia.ge/en/"><font color="#0000ff">JSC Bank of Georgia</font>&nbsp;</a>&nbsp;</li>
</ol>
<p>&nbsp;</p>
</div>
<p>&nbsp;</p>
<p></p>
</div>
</div>
</section>
<!-- footer -->
<footer>
<br clear="all">
<div class="container-fluid">
<div class="row">
<div class="col-lg-4 col-md-6 col-sm-12 footer-stamp">
<a href="/" title="Central Bank of Cyprus"><img src="/images/central-bank-of-cyprus-en.png" title="Central Bank of Cyprus" alt="Central Bank of Cyprus"></a>
<p>
The Central Bank of Cyprus (CBC) was established in 1963, shortly after Cyprus gained its independence, in accordance with the Central Bank of Cyprus Law, 1963 and the relevant articles of the Constitution. Today the CBC is governed by the Central Bank of Cyprus Law, 2002 as amended (hereafter “the CBC Law”).
</p>
</div>
<div class="col-lg-4 col-md-6 col-sm-12">
<div class="footer-links">
<h2>Quick Links</h2>
<nav>
<ul>
<li><a href="/en/home" title="Home" target="_self">Home</a></li>
<li><a href="/en/the-bank" title="The Bank" target="_self">The Bank</a></li>
<li><a href="/en/the-governor" title="The Governor" target="_self">The Governor</a></li>
<li><a href="/en/monetary-policy" title="Monetary Policy" target="_self">Monetary Policy</a></li>
<li><a href="/en/licensing-supervision" title="Licensing &amp; Supervision" target="_self">Licensing &amp; Supervision</a></li>
<li><a href="/en/financial-stability" title="Financial Stability" target="_self">Financial Stability</a></li>
<li><a href="/en/payment-systems-services" title="Payment Systems &amp; Services" target="_self">Payment Systems &amp; Services</a></li>
<li><a href="/en/statistics" title="Statistics" target="_self">Statistics</a></li>
<li><a href="/en/banknotes-and-coins" title="Banknotes &amp; Coins" target="_self">Banknotes &amp; Coins</a></li>
<li><a href="/en/reference-interest-rates" title="Reference Interest Rates" target="_self">Reference Interest Rates</a></li>
<li><a href="/en/independent-commission-on-the-future-of-the-cyprus-banking-sector" title="Independent Commission on the Future of the Cyprus Banking Sector" target="_self">Independent Commission on the Future of the Cyprus Banking Sector</a></li>
</ul>
</nav>
</div>
</div>
<div class="col-lg-4 col-md-6 col-sm-12">
<div class="footer-contact">
<h1>Contact Details</h1>
<address>
TELEPHONE: +357 22 71 41 00<br>
FAX: +357 22 71 49 59<br>
POSTAL ADDRESS:<br>
80, KENNEDY AVENUE, CY-1076 NICOSIA
P.O.BOX 25529, CY-1395 NICOSIA
</address>
<br><br>
<a href="/en/contact-us" class="btn btn-md btn-default">Get in touch</a>
</div>
</div>
</div>
<div class="row copyright">
<div class="col-sm-7">
<span class="text-uppercase">Copyright © 2018 CENTRAL BANK OF CYPRUS</span> <span class="divider">&nbsp;&nbsp;|&nbsp;&nbsp;</span><span class="footer-link"><a href="/en/terms-of-use" target="_blank">Terms of Use</a>
|&nbsp;&nbsp;<a href="/en/data-protection" target="_blank">Privacy Policy</a>
</span>
</div>
<div class="col-sm-5 text-right developed-by">
Developed By: <a href="http://www.dynamicworks.eu" target="_blank" title="DW Dynamic Works Ltd">DW Dynamic Works Ltd</a><span class="divider">&nbsp;&nbsp;|&nbsp;&nbsp;</span>Powered By: <a href="http://www.dynamicworks.eu" target="_blank" title="DWCMS">DWCMS</a>
</div>
</div>
</div>
</footer>
<!-- /.footer -->
<br clear="all">
</body></html>

View File

@ -0,0 +1,40 @@
{
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS": {
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
"Αlpha Bank Cyprus Ltd",
"Eurobank Cyprus Ltd",
"National Bank of Greece (Cyprus) Ltd"
],
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
"Societe Generale Bank-Cyprus Limited",
"USB Bank Plc"
]
},
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS": {
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
"AS Expobank ",
"Banque SBA",
"Central Cooperative Bank PLC",
"EFG Bank (Luxembourg) S.A.",
"First Investment Bank Ltd ",
"National Bank of Greece S.A."
],
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
"Arab Jordan Investment Bank SA",
"Bank of Beirut SAL",
"BankMed s.a.l. ",
"Banque BEMO SAL ",
"BBAC SAL ",
"BLOM Bank SAL ",
"Byblos Bank SAL ",
"Credit Libanais SAL ",
"IBL Bank sal ",
"Joint-stock company AVTOVAZBANK * ",
"Jordan Ahli Bank plc",
"Jordan Kuwait Bank PLC ",
"Lebanon and Gulf Bank SAL ",
"Promsvyazbank PJSC **",
"Public Joint-Stock Company Commercial Bank \"Privatbank\"***"
]
}
}

View File

@ -0,0 +1 @@
[ "Ancoria Bank Limited", "Astrobank Limited", "Bank of Cyprus Public Company Ltd", "Cyprus Development Bank Public Company Limited", "Hellenic Bank Public Company Limited", "Housing Finance Corporation", "RCB BANK LTD" ]

493
tests/data/cy/page.html Normal file

File diff suppressed because one or more lines are too long

BIN
tests/data/cz/0.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 927 B

BIN
tests/data/cz/1.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 894 B

BIN
tests/data/cz/2.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 917 B

BIN
tests/data/cz/3.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 925 B

Some files were not shown because too many files have changed in this diff Show More