init
This commit is contained in:
commit
be5d3eae07
55
.eslintrc.json
Normal file
55
.eslintrc.json
Normal file
@ -0,0 +1,55 @@
|
||||
{
|
||||
"parserOptions": {
|
||||
"ecmaVersion": 2017,
|
||||
"sourceType": "module",
|
||||
"ecmaFeatures": {
|
||||
"jsx": false
|
||||
}
|
||||
},
|
||||
"env": {
|
||||
"browser": false,
|
||||
"node": true,
|
||||
"es6": true
|
||||
},
|
||||
"rules": {
|
||||
"arrow-spacing": "error",
|
||||
"block-scoped-var": "error",
|
||||
"block-spacing": "error",
|
||||
"brace-style": ["error", "stroustrup", {}],
|
||||
"camelcase": "error",
|
||||
"comma-dangle": ["error", "never"],
|
||||
"comma-spacing": ["error", { "before": false, "after": true }],
|
||||
"comma-style": [1, "last"],
|
||||
"consistent-this": [1, "_this"],
|
||||
"curly": [1, "multi"],
|
||||
"eol-last": 1,
|
||||
"eqeqeq": 1,
|
||||
"func-names": 1,
|
||||
"indent": ["error", 2, { "SwitchCase": 1 }],
|
||||
"lines-around-comment": ["error", { "beforeBlockComment": true, "allowArrayStart": true }],
|
||||
"max-len": [1, 180, 2], // 2 spaces per tab, max 180 chars per line
|
||||
"new-cap": 1,
|
||||
"newline-before-return": "error",
|
||||
"no-array-constructor": 1,
|
||||
"no-inner-declarations": [1, "both"],
|
||||
"no-mixed-spaces-and-tabs": 1,
|
||||
"no-multi-spaces": 2,
|
||||
"no-new-object": 1,
|
||||
"no-shadow-restricted-names": 1,
|
||||
"object-curly-spacing": ["error", "always"],
|
||||
"padded-blocks": ["error", { "blocks": "never", "switches": "always" }],
|
||||
"prefer-const": "error",
|
||||
"prefer-template": "error",
|
||||
"one-var": 0,
|
||||
"quote-props": ["error", "always"],
|
||||
"quotes": [1, "single"],
|
||||
"radix": 1,
|
||||
"semi": [1, "always"],
|
||||
"space-before-blocks": [1, "always"],
|
||||
"space-infix-ops": 1,
|
||||
"vars-on-top": 1,
|
||||
"no-multiple-empty-lines": ["error", { "max": 1, "maxEOF": 1 }],
|
||||
"spaced-comment": ["error", "always", { "markers": ["/"] }]
|
||||
}
|
||||
|
||||
}
|
161
.gitignore
vendored
Normal file
161
.gitignore
vendored
Normal file
@ -0,0 +1,161 @@
|
||||
# Created by .ignore support plugin (hsz.mobi)
|
||||
### Node template
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
|
||||
# Runtime data
|
||||
pids
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
coverage
|
||||
|
||||
# nyc test coverage
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
node_modules/
|
||||
jspm_packages/
|
||||
|
||||
# Typescript v1 declaration files
|
||||
typings/
|
||||
|
||||
# Optional npm cache directory
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
.eslintcache
|
||||
|
||||
# Optional REPL history
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variables file
|
||||
.env
|
||||
|
||||
### macOS template
|
||||
# General
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Icon must end with two \r
|
||||
Icon
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear in the root of a volume
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Directories potentially created on remote AFP share
|
||||
.AppleDB
|
||||
.AppleDesktop
|
||||
Network Trash Folder
|
||||
Temporary Items
|
||||
.apdisk
|
||||
### JetBrains template
|
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
.idea/
|
||||
# User-specific stuff:
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
.idea/dictionaries
|
||||
|
||||
# Sensitive or high-churn files:
|
||||
.idea/**/dataSources/
|
||||
.idea/**/dataSources.ids
|
||||
.idea/**/dataSources.xml
|
||||
.idea/**/dataSources.local.xml
|
||||
.idea/**/sqlDataSources.xml
|
||||
.idea/**/dynamic.xml
|
||||
.idea/**/uiDesigner.xml
|
||||
|
||||
# Gradle:
|
||||
.idea/**/gradle.xml
|
||||
.idea/**/libraries
|
||||
|
||||
# CMake
|
||||
cmake-build-debug/
|
||||
|
||||
# Mongo Explorer plugin:
|
||||
.idea/**/mongoSettings.xml
|
||||
|
||||
## File-based project format:
|
||||
*.iws
|
||||
|
||||
## Plugin-specific files:
|
||||
|
||||
# IntelliJ
|
||||
out/
|
||||
|
||||
# mpeltonen/sbt-idea plugin
|
||||
.idea_modules/
|
||||
|
||||
# JIRA plugin
|
||||
atlassian-ide-plugin.xml
|
||||
|
||||
# Cursive Clojure plugin
|
||||
.idea/replstate.xml
|
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||
com_crashlytics_export_strings.xml
|
||||
crashlytics.properties
|
||||
crashlytics-build.properties
|
||||
fabric.properties
|
||||
|
||||
|
||||
|
||||
artefacts/screenshots/*.png
|
||||
artefacts/*.txt
|
||||
artefacts/*.json
|
||||
artefacts/*.html
|
||||
artefacts/*
|
||||
|
||||
/tests/*.zip
|
||||
|
||||
/output/
|
||||
/dist/
|
||||
!/tests/data/
|
||||
/tests/sink/
|
||||
/debug/
|
||||
/update.sh
|
||||
/setup/web/
|
||||
/backup/
|
||||
|
||||
/archive.tar.gz
|
||||
/user/
|
38
Dockerfile
Normal file
38
Dockerfile
Normal file
@ -0,0 +1,38 @@
|
||||
FROM node:stretch
|
||||
ARG VERSION
|
||||
ENV VERSION ${VERSION:-development}
|
||||
|
||||
RUN echo udev hold | dpkg --set-selections;\
|
||||
echo initscripts hold | dpkg --set-selections;\
|
||||
apt-get -yq update;\
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -yq -f --no-install-recommends build-essential dnsutils git xorg blackbox libasound2 libnss3-dev libxss1 libatk-bridge2.0-0 libgtk2.0-common libgtk-3-0 ;\
|
||||
apt-get autoremove -yq ;\
|
||||
apt-get clean -yq
|
||||
|
||||
RUN apt-get update && \
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||
python \
|
||||
python-dev \
|
||||
python-pip \
|
||||
python-setuptools \
|
||||
groff \
|
||||
less \
|
||||
&& pip install --upgrade awscli \
|
||||
&& apt-get clean
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
COPY start.sh package.json *.js /app/
|
||||
COPY ncas/ /app/ncas
|
||||
COPY helpers/ /app/helpers
|
||||
|
||||
RUN npm install pm2 -g && npm install
|
||||
|
||||
# RUN npm install
|
||||
|
||||
# COPY start.sh /app/
|
||||
|
||||
RUN chmod +x /app/start.sh
|
||||
|
||||
ENTRYPOINT ["/app/start.sh"]
|
20
Dockerfile.orig
Normal file
20
Dockerfile.orig
Normal file
@ -0,0 +1,20 @@
|
||||
FROM node:jessie
|
||||
ARG VERSION
|
||||
ENV VERSION ${VERSION:-development}
|
||||
|
||||
RUN echo udev hold | dpkg --set-selections;\
|
||||
echo initscripts hold | dpkg --set-selections;\
|
||||
apt-get -yq update;\
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -yq -f --no-install-recommends build-essential dnsutils git xorg openbox libasound2 libnss3-dev libxss1 libatk-bridge2.0-0 libgtk2.0-common libgtk-3-0 ;\
|
||||
apt-get autoremove -yq ;\
|
||||
apt-get clean -yq
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ADD setup/web/archive.tar.gz /app
|
||||
|
||||
RUN npm install pm2 -g
|
||||
|
||||
RUN npm install --unsafe-perm
|
||||
|
||||
CMD ["pm2-runtime", "start", "ecosystem.config.js", "--raw" , "--env", "production"]
|
6
Jenkinsfile
vendored
Normal file
6
Jenkinsfile
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
@Library('OpenBankingUK/ob-pipeline-library') _
|
||||
|
||||
javaMsPipelinev2 {
|
||||
projectName='obdfcascrape'
|
||||
cluster='nca'
|
||||
}
|
36
Makefile
Normal file
36
Makefile
Normal file
@ -0,0 +1,36 @@
|
||||
PROJECT = obdfcascrape
|
||||
VERSION = $(shell git rev-parse --short HEAD)
|
||||
ECR_REGION = eu-west-1
|
||||
ECR_ACCOUNT_NUMBER = 482681734622
|
||||
# ECR_REPO = $(ECR_ACCOUNT_NUMBER).dkr.ecr.$(ECR_REGION).amazonaws.com
|
||||
ECR_REPO = mail.caliban.io:5000
|
||||
#APP_IMAGE = 482681734622.dkr.ecr.eu-west-1.amazonaws.com/$(PROJECT):$(VERSION)
|
||||
APP_IMAGE = $(ECR_REPO)/$(PROJECT):$(VERSION)
|
||||
NO_CACHE = false
|
||||
|
||||
#build docker image
|
||||
build:
|
||||
# docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
|
||||
# tar -C ./ -czvf ./archive.tar.gz 'package.json' 'ncas/' 'helpers/' -X *.js
|
||||
docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
|
||||
.PHONY: build
|
||||
|
||||
#push docker image to registry
|
||||
push: build
|
||||
docker push $(APP_IMAGE)
|
||||
.PHONY: push
|
||||
|
||||
#build and run docker image locally
|
||||
run: build
|
||||
docker run $(APP_IMAGE)
|
||||
.PHONY: run
|
||||
ver:
|
||||
@echo '$(VERSION)'
|
||||
#echo $ERSION
|
||||
.PHONY: ver
|
||||
|
||||
# Package the application sources into ./archive.tar.gz.
# NOTE(review): `-X` is tar's --exclude-from option, so `-X *.js` makes tar read
# exclude patterns from the first matching .js file instead of adding the .js
# files to the archive - confirm the intent before relying on this target.
tar:
#	docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
	tar -C ./ -czvf ./archive.tar.gz 'package.json' 'ncas/' 'helpers/' -X *.js

# Was `.PHONY: build` (copy-paste slip); `build` is already declared phony above.
.PHONY: tar
|
23
bg.js
Normal file
23
bg.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Bulgaria = require('./ncas/bg');
|
||||
|
||||
/**
 * Entry point for the Bulgaria (BG) scraper process.
 *
 * Registers a cron job when BG_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Bulgaria();
  const cronExpression = process.env.BG_CRON;
  const scrape = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof cronExpression === 'string')
    new CronJob(cronExpression, scrape, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scrape();

  console.log('BG launched');
}
|
||||
|
||||
run();
|
5
config.json
Normal file
5
config.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"TopicArn": "arn:aws:sns:eu-west-1:115486161803:obdfcascrape",
|
||||
"QueueUrl": "https://sqs.eu-west-1.amazonaws.com/115486161803/obdfcascrape",
|
||||
"QueueArn": "arn:aws:sqs:eu-west-1:115486161803:obdfcascrape"
|
||||
}
|
64
consume.js
Normal file
64
consume.js
Normal file
@ -0,0 +1,64 @@
|
||||
var AWS = require('aws-sdk');
|
||||
var util = require('util');
|
||||
var config = require('./config.json');
|
||||
|
||||
require('dotenv').config({
|
||||
'path': `${__dirname }/.env`
|
||||
});
|
||||
|
||||
// configure AWS
|
||||
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });
|
||||
|
||||
var sqs = new AWS.SQS();
|
||||
|
||||
var receiveMessageParams = {
|
||||
'QueueUrl': config.QueueUrl,
|
||||
'MaxNumberOfMessages': 10
|
||||
};
|
||||
|
||||
function getMessages() {
|
||||
sqs.receiveMessage(receiveMessageParams, receiveMessageCallback);
|
||||
}
|
||||
|
||||
/**
 * Callback for `sqs.receiveMessage`.
 *
 * Prints the decoded `Message` payload of every received message, then polls
 * again: immediately when messages were received, after one second otherwise.
 *
 * @param {?Error} err  Error passed by the SDK, if any; it is logged and polling continues.
 * @param {?Object} data  Response object; only `data.Messages` is read.
 */
function receiveMessageCallback(err, data) {
  if (err)
    console.error('receiveMessage failed', err);

  const messages = (data && data.Messages) || [];

  if (messages.length === 0) {
    process.stdout.write('-');
    setTimeout(getMessages, 1000);

    return;
  }

  for (const message of messages) {
    // A single malformed body must not throw out of the SDK callback and stop the polling loop.
    try {
      const body = JSON.parse(message.Body);

      if (body && body.Message)
        console.dir(JSON.parse(body.Message));
    }
    catch (parseErr) {
      console.error('Skipping message with invalid JSON', parseErr);
    }

    process.stdout.write('.');

    // The delete call is commented out in this file, so messages are not removed from the queue:
    // sqs.deleteMessage({ 'QueueUrl': config.QueueUrl, 'ReceiptHandle': message.ReceiptHandle }, deleteMessageCallback);
  }

  getMessages();
}
|
||||
|
||||
function deleteMessageCallback(err, data) {
|
||||
// console.log("deleted message");
|
||||
// console.log(data);
|
||||
}
|
||||
|
||||
setTimeout(getMessages, 1000);
|
||||
|
35
cy.js
Normal file
35
cy.js
Normal file
@ -0,0 +1,35 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
// TODO:
|
||||
// parse arguments - we should run just 1 FCA per go &
|
||||
// have option to run selected company from selected NCA
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
// load helper libs etc
|
||||
// const Fca = require('./ncas/fca');
|
||||
|
||||
const Cyprus = require('./ncas/cy');
|
||||
|
||||
/**
 * Entry point for the Cyprus (CY) scraper process.
 *
 * Registers a cron job when CY_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Cyprus();
  const schedule = process.env.CY_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('CY Launched');
}
|
||||
|
||||
process.once('uncaughtException', function caught(err) {
|
||||
console.error('Uncaught', err);
|
||||
});
|
||||
|
||||
run();
|
29
cz.js
Normal file
29
cz.js
Normal file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Czech = require('./ncas/cz');
|
||||
|
||||
/**
 * Entry point for the Czech (CZ) scraper process.
 *
 * Registers a cron job when CZ_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Czech();
  const schedule = process.env.CZ_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('CZ Launched');
}
|
||||
|
||||
process.once('uncaughtException', function caught(err) {
|
||||
console.error('Uncaught', err);
|
||||
});
|
||||
|
||||
run();
|
29
de.js
Normal file
29
de.js
Normal file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Germany = require('./ncas/de');
|
||||
|
||||
/**
 * Entry point for the Germany (DE) scraper process.
 *
 * Registers a cron job when DE_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Germany();
  const schedule = process.env.DE_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('DE Launched');
}
|
||||
|
||||
process.once('uncaughtException', function caught(err) {
|
||||
console.error('Uncaught', err);
|
||||
});
|
||||
|
||||
run();
|
110
debuglogs.js
Normal file
110
debuglogs.js
Normal file
@ -0,0 +1,110 @@
|
||||
// https://github.com/markcallen/snssqs
|
||||
|
||||
const AWS = require('aws-sdk');
|
||||
const util = require('util');
|
||||
const async = require('async');
|
||||
const fs = require('fs-extra');
|
||||
const path = require('path');
|
||||
const archiver = require('archiver-promise');
|
||||
const logger = require('log4js').getLogger('DebugUploader');
|
||||
const dateFormat = require('dateformat');
|
||||
|
||||
const { promisify } = require('util');
|
||||
|
||||
const readFileAsync = promisify( fs.readFile);
|
||||
|
||||
require('dotenv').config({
|
||||
'path': `${__dirname }/.env`
|
||||
});
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'debug';
|
||||
|
||||
// configure AWS
|
||||
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });
|
||||
|
||||
const s3 = new AWS.S3();
|
||||
|
||||
/**
 * Makes sure `destPath` exists (creating it when it does not) and hands the path back.
 *
 * @param {?string} destPath  Directory to check or create.
 * @returns {Promise<?string>}  The same `destPath` that was passed in.
 */
async function _createDirectory(destPath = null) {
  const alreadyThere = fs.existsSync(destPath);

  if (alreadyThere)
    return destPath;

  fs.ensureDirSync(destPath);

  return destPath;
}
|
||||
|
||||
/**
 * Writes a zip archive named `filename` from `destPath`.
 *
 * @param {?string} destPath  Directory to archive, or a glob pattern when `glob` is true.
 * @param {?string} filename  Path of the archive file to create.
 * @param {boolean} glob  When true `destPath` is passed to `archive.glob`, otherwise to `archive.directory`.
 * @returns {Promise<void>}  Resolves once the archive has been finalized.
 * @throws {Error}  'Missing paths' when `destPath` or `filename` is falsy; also rejects when finalizing fails.
 */
async function _createArchive(destPath = null, filename = null, glob = false) {
  if (!destPath || !filename) {
    const e = new Error('Missing paths');

    logger.error(e);

    // Stop here: previously execution carried on and tried to build an archive from null paths.
    throw e;
  }

  const archive = archiver(filename, {
    'zlib': { 'level': 9 } // Sets the compression level.
  });

  if (glob)
    archive.glob(`${destPath}`);
  else
    archive.directory(`${destPath}/`);

  // Awaiting propagates a finalize() failure to the caller instead of leaving the promise pending forever.
  await archive.finalize();
  logger.debug('Archive finished');
}
|
||||
|
||||
/**
 * Reads `filename` from disk and uploads it to the bucket named by S3_BUCKET,
 * using the file's base name as the object key.
 *
 * Read and upload failures are logged and not rethrown.
 *
 * @param {string} filename  Path of the file to upload.
 * @returns {Promise<Object|undefined>}  The `s3.upload` result on success, `undefined` after a logged failure.
 */
async function _upload(filename) {
  logger.info('^^^ UPLOADING ^^^');
  const filePath = path.parse(filename);

  try {
    // readFile already yields a Buffer, so it is passed to S3 as-is (no `new Buffer.from` copy).
    const fileContents = await readFileAsync(filename);

    const s3Obj = {
      'Bucket': process.env.S3_BUCKET,
      'Key': filePath.base,
      'Body': fileContents,
      // NOTE(review): uploads are world-readable - confirm this is intended for debug archives.
      'ACL': 'public-read'
    };

    const result = await s3.upload(s3Obj).promise();

    logger.info('Successfully uploaded file.');

    return result;
  }
  catch (err) {
    logger.error(err);

    return undefined;
  }
}
|
||||
|
||||
/**
 * Zips the `debug/` directory and the `artefacts/PL` glob into `dist/` and
 * uploads both archives. Any error is logged and not rethrown.
 *
 * The date stamp in the file names uses FILE_DATE_FORMAT when set; the
 * misspelled legacy name FILE_DATE_FOTMAT is still honoured, and the mask
 * defaults to 'yyyymmdd'.
 *
 * @returns {Promise<void>}
 */
async function _archive() {
  logger.info('>-< ARCHIVING >-<');
  try {
    const now = new Date();

    await _createDirectory('dist');

    const dateMask = process.env.FILE_DATE_FORMAT || process.env.FILE_DATE_FOTMAT || 'yyyymmdd';
    const timestamp = dateFormat(now, dateMask);
    const filename = `dist/debug-${process.env.HOSTNAME}-${timestamp}.zip`;
    const eFilename = `dist/pl-${process.env.HOSTNAME}-${timestamp}.zip`;

    await _createArchive('debug/', filename);
    // NOTE(review): the glob pattern 'artefacts/PL' contains no wildcard - confirm it matches the intended files.
    await _createArchive('artefacts/PL', eFilename, true);

    await _upload(filename);
    await _upload(eFilename);
  }
  catch (e) {
    logger.error(e);
  }
}
|
||||
|
||||
logger.info('Debug Archiver');
|
||||
async.series([_archive]);
|
||||
|
28
dk.js
Normal file
28
dk.js
Normal file
@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Denmark = require('./ncas/dkV2');
|
||||
|
||||
/**
 * Entry point for the Denmark (DK) scraper process.
 *
 * Registers a cron job when DK_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Denmark();
  const schedule = process.env.DK_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('DK Launched');
}
|
||||
|
||||
process.once('uncaughtException', function caught(err) {
|
||||
console.error('Uncaught', err);
|
||||
});
|
||||
|
||||
run();
|
10
docker-compose.yml
Normal file
10
docker-compose.yml
Normal file
@ -0,0 +1,10 @@
|
||||
version: '3'
|
||||
|
||||
services:
|
||||
|
||||
web:
|
||||
build: ./setup/web/.
|
||||
container_name: web
|
||||
|
||||
ports:
|
||||
- 9000:9000
|
9
docker.sh
Executable file
9
docker.sh
Executable file
@ -0,0 +1,9 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
docker-compose down
|
||||
|
||||
gulp default
|
||||
docker-compose pull
|
||||
docker-compose up --build -d
|
||||
|
||||
|
94
ecosystem.config.js
Normal file
94
ecosystem.config.js
Normal file
@ -0,0 +1,94 @@
|
||||
require('dotenv').config();
|
||||
const dateFormat = require('dateformat');
|
||||
|
||||
/**
 * Builds the pm2 `apps` array.
 *
 * The `watcher` app is always included. A scraper from the table below is
 * included when its `*_CRON` environment variable is defined or when
 * SCRAPE_START equals its `start` code. Each scraper gets a PROXY_URI derived
 * from its `proxy` prefix.
 *
 * Cron value handed to a scraper whose `*_CRON` variable is defined:
 *  - `cronBump` set: a generated "M H * * *" slot, starting 5 minutes from now
 *    and moving 2 minutes later for each scraper;
 *  - otherwise, `debugCron` set: the value from the environment;
 *  - otherwise: the hard-coded `crontime`, falling back to the environment
 *    value for entries that have no `crontime` (previously this was `undefined`).
 *
 * @returns {Array<Object>}  pm2 application definitions.
 */
function buildApps() {
  // proxies = ['uk', 'fr', 'de', 'nl', 'ch'];

  const debugCron = process.env['debugCron'] || false;
  const cronBump = process.env['cronBump'] || false;
  const baseDate = new Date();
  let startCronMS = baseDate.getTime() + ( 5 * (60 * 1000));

  console.log(`debugCron:${debugCron} // cronBump:${cronBump}`);
  const apps = [];
  const list = [
    { 'cron':'IE_CRON', 'start':'IE', 'name':'IE', 'script':'ie.js', 'proxy': 'uk', 'crontime': '0 0 * * *' }, // 00:04:40
    { 'cron':'LU_CRON', 'start':'LU', 'name':'LU', 'script':'lu.js', 'proxy': 'uk', 'crontime': '10 0 * * *' }, // 01:12:53
    { 'cron':'IT_CRON', 'start':'IT', 'name':'IT', 'script':'it.js', 'proxy': 'uk', 'crontime': '10 1 * * *' }, // 04:51:37 - uk free at 6:30
    { 'cron':'CY_CRON', 'start':'CY', 'name':'CY', 'script':'cy.js', 'proxy': 'fr', 'crontime': '0 0 * * *' }, // 00:01:03
    { 'cron':'SE_CRON', 'start':'SE', 'name':'SE', 'script':'se.js', 'proxy': 'fr', 'crontime': '5 0 * * *' }, // 00:43:45
    { 'cron':'FR_CRON', 'start':'FR', 'name':'FR', 'script':'fr.js', 'proxy': 'fr', 'crontime': '0 1 * * *' }, // 01:22:29
    { 'cron':'LT_CRON', 'start':'LT', 'name':'LT', 'script':'lt.js', 'proxy': 'fr', 'crontime': '30 2 * * *' }, // 00:53:26
    { 'cron':'SK_CRON', 'start':'SK', 'name':'SK', 'script':'sk.js', 'proxy': 'fr', 'crontime': '30 3 * * *' }, // 00:24:03 - fr free at 4:00
    { 'cron':'DE_CRON', 'start':'DE', 'name':'DE', 'script':'de.js', 'proxy': 'de', 'crontime': '0 0 * * *' }, // 03:55:38 - de free at 4:00
    { 'cron':'NL_CRON', 'start':'NL', 'name':'NL', 'script':'nl.js', 'proxy': 'nl', 'crontime': '0 0 * * *' }, // 07:23:19 - nl free at 7:30
    { 'cron':'PL_CRON', 'start':'PL', 'name':'PL', 'script':'pl.js', 'proxy': 'ch', 'crontime': '0 0 * * *' }, // 17:59:18 - ch free at 18:00
    { 'cron':'CZ_CRON', 'start':'CZ', 'name':'CZ', 'script':'cz.js', 'proxy': 'uk' },
    { 'cron':'DK_CRON', 'start':'DK', 'name':'DK', 'script':'dk.js', 'proxy': 'uk' },
    { 'cron':'ES_CRON', 'start':'ES', 'name':'ES', 'script':'es.js', 'proxy': 'uk' },
    { 'cron':'GI_CRON', 'start':'GI', 'name':'GI', 'script':'gi.js', 'proxy': 'uk' },
    { 'cron':'GR_CRON', 'start':'GR', 'name':'GR', 'script':'gr.js', 'proxy': 'uk' },
    { 'cron':'MT_CRON', 'start':'MT', 'name':'MT', 'script':'mt.js', 'proxy': 'uk' },
    { 'cron':'PT_CRON', 'start':'PT', 'name':'PT', 'script':'pt.js', 'proxy': 'uk' },
    { 'cron':'LV_CRON', 'start':'LV', 'name':'LV', 'script':'lv.js', 'proxy': 'uk' },
    { 'cron':'NO_CRON', 'start':'NO', 'name':'NO', 'script':'no.js', 'proxy': 'uk' },
    { 'cron':'EE_CRON', 'start':'EE', 'name':'EE', 'script':'ee.js', 'proxy': 'uk' },
    { 'cron':'BG_CRON', 'start':'BG', 'name':'BG', 'script':'bg.js', 'proxy': 'uk' }
  ];

  apps.push({
    'name' : 'watcher',
    'script' : 'helpers/watcher.js',

    'env': {
      'NODE_ENV': 'production'
    },
    'autorestart' : true,
    'max_restarts': 3,
    'restart_delay': 4000
  });

  for (const item of list) {
    const envCron = process.env[item.cron];
    const hasEnvCron = typeof envCron !== 'undefined';

    if (!hasEnvCron && process.env.SCRAPE_START !== item.start)
      continue;

    const proxyUri = `${item.proxy}.proxymesh.com:31280`;

    const newItem = {
      'name' : item.name,
      'script' : item.script,

      'env': {
        'NODE_ENV': 'production',
        'PROXY_URI' : proxyUri
      },
      'autorestart' : true,
      'max_restarts': 3,
      'restart_delay': 4000
    };

    if (hasEnvCron) {
      // Entries without a hard-coded crontime must keep the environment value,
      // otherwise the scraper would be handed an undefined schedule.
      const useEnvValue = debugCron !== false || typeof item.crontime === 'undefined';

      newItem.env[item.cron] = useEnvValue ? envCron : item.crontime;

      if (cronBump !== false) {
        // Stagger start-up: each bumped scraper gets a slot 2 minutes after the previous one.
        newItem.env[item.cron] = dateFormat(startCronMS, 'M H "* * *"');

        startCronMS = startCronMS + ( 2 * (60 * 1000));
      }
    }

    apps.push(newItem);
  }

  const version = process.env.VERSION || 'NO VERSION!';

  console.log('*****************************');
  console.log(`LAUNCHING VERSION: ${version}`);
  console.log('*****************************');

  console.log(JSON.stringify(apps));

  return apps;
}
|
||||
|
||||
module.exports = {
|
||||
'apps' : buildApps()
|
||||
};
|
23
ee.js
Normal file
23
ee.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Estonia = require('./ncas/ee');
|
||||
|
||||
/**
 * Entry point for the Estonia (EE) scraper process.
 *
 * Registers a cron job when EE_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Estonia();
  const schedule = process.env.EE_CRON;

  async function onTick() {
    await scraper.run();
  }

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('EE Launched');
}
|
||||
|
||||
run();
|
25
es.js
Normal file
25
es.js
Normal file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Spain = require('./ncas/es');
|
||||
|
||||
/**
 * Entry point for the Spain (ES) scraper process.
 *
 * Registers a cron job when ES_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Spain();
  const schedule = process.env.ES_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('ES Launched');
}
|
||||
|
||||
run();
|
31
fr.js
Normal file
31
fr.js
Normal file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
// load helper libs etc
|
||||
// const Fca = require('./ncas/fca');
|
||||
|
||||
const France = require('./ncas/fr');
|
||||
|
||||
/**
 * Entry point for the France (FR) scraper process.
 *
 * Registers a cron job when FR_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new France();
  const schedule = process.env.FR_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('FR Launched');
}
|
||||
|
||||
process.once('uncaughtException', function caught(err) {
|
||||
console.error('Uncaught', err);
|
||||
});
|
||||
|
||||
run();
|
25
gi.js
Normal file
25
gi.js
Normal file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Gibraltar = require('./ncas/gi');
|
||||
|
||||
/**
 * Entry point for the Gibraltar (GI) scraper process.
 *
 * Registers a cron job when GI_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Gibraltar();
  const schedule = process.env.GI_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('GI Launched');
}
|
||||
|
||||
run();
|
23
gr.js
Normal file
23
gr.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Greece = require('./ncas/gr');
|
||||
|
||||
/**
 * Entry point for the Greece (GR) scraper process.
 *
 * Registers a cron job when GR_CRON is set, performs one immediate scrape
 * when SCRAPE_START equals the scraper's id, then logs that it has launched.
 */
async function run() {
  const scraper = new Greece();
  const schedule = process.env.GR_CRON;
  const onTick = async () => {
    await scraper.run();
  };

  // NOTE(review): the trailing `true` is presumably the cron package's "start" flag - confirm.
  if (typeof schedule === 'string')
    new CronJob(schedule, onTick, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await onTick();

  console.log('GR Launched');
}
|
||||
|
||||
run();
|
40
gulpfile.js
Normal file
40
gulpfile.js
Normal file
@ -0,0 +1,40 @@
|
||||
'use strict';
|
||||
|
||||
const gulp = require('gulp');
|
||||
var bump = require('gulp-bump');
|
||||
var changedInPlace = require('gulp-changed-in-place');
|
||||
const debug = require('gulp-debug');
|
||||
|
||||
const watchFolders = ['ncas/**/*.js', 'helpers/**/*.js'];
|
||||
|
||||
// Bumps the prerelease part of the `version` key in every ncas/**/*.js file
// that passes the changed-in-place filter, writing the result back to ncas/.
// The stream is returned so gulp can tell when the task has finished; tasks
// that depend on this one (bumpWatch) would otherwise start before it is done.
gulp.task('bumpNcas', () => {
  return gulp.src('ncas/**/*.js')
    .pipe(changedInPlace({ 'firstPass': true }))
    .pipe(debug({ 'showFiles': true }))
    .pipe(bump({ 'key': 'version', 'type': 'prerelease' }))
    .pipe(gulp.dest('ncas'));
});
|
||||
|
||||
/*
|
||||
gulp.task('styles', function() {
|
||||
return gulp.src(['node_modules/backbone.modal/backbone.modal.css', 'node_modules/backbone.modal/backbone.modal.theme.css'])
|
||||
.pipe(autoprefixer('last 2 version', 'safari 5', 'ie 8', 'ie 9', 'opera 12.1', 'ios 6', 'android 4'))
|
||||
|
||||
.pipe(concat('style.min.css'))
|
||||
.pipe(cssnano())
|
||||
.pipe(gulp.dest('live/css'));
|
||||
});
|
||||
|
||||
gulp.task('default', function () {
|
||||
return gulp.src('src/**//*/ .{ts,tsx}')
|
||||
.pipe(changedInPlace())
|
||||
.pipe(tsfmt())
|
||||
.pipe(gulp.dest('src'));
|
||||
});
|
||||
|
||||
*/
|
||||
|
||||
gulp.task('bumpWatch', ['bumpNcas'], function() {
|
||||
gulp.watch(watchFolders, ['bumpNcas']);
|
||||
});
|
||||
|
53
helpers/csv-data.js
Normal file
53
helpers/csv-data.js
Normal file
@ -0,0 +1,53 @@
|
||||
const AWS = require('aws-sdk');
|
||||
const { parse, generate } = require('csv');
|
||||
|
||||
/**
 * Fetches a CSV object from S3 and reduces it to a list of `{ frn, firm }` records.
 */
class CsvData {
  /**
   * Creates the S3 client and picks the bucket/key, taken from
   * OB_SCRAPE_BUCKET / OB_SCRAPE_KEYNAME with built-in defaults.
   */
  constructor() {
    this.s3 = new AWS.S3();

    const { OB_SCRAPE_BUCKET, OB_SCRAPE_KEYNAME } = process.env;

    this.bucketName = OB_SCRAPE_BUCKET || 'obregstoretest';
    this.keyName = OB_SCRAPE_KEYNAME || 'artefacts/UK/FCA/latest/Firms with PSD Permissions (CSV).csv';
  }

  /**
   * Downloads the configured object.
   * @returns {Promise<Object>}  The `getObject` response.
   */
  async _getCsvDataFromS3() {
    const request = { 'Bucket': this.bucketName, 'Key': this.keyName };

    return this.s3.getObject(request).promise();
  }

  /**
   * Parses the `Body` of a `getObject` response as UTF-8 CSV text.
   * @param {Object} csvBufferData  Response whose `Body` holds the CSV bytes.
   * @returns {Promise<Array<Object>>}  One object per row, keyed by header name.
   */
  async _parseCsvBufferData(csvBufferData) {
    const csvText = csvBufferData.Body.toString('utf-8');
    const options = {
      // build row objects keyed by the header row instead of arrays of strings
      'columns': true,
      // tolerate rows whose column count differs from the header; the original
      // note says the header carries an extra "file generated" info column
      'relax_column_count': true
    };

    return new Promise((resolve, reject) => {
      parse(csvText, options, (err, rows) => (err ? reject(err) : resolve(rows)));
    });
  }

  /**
   * Projects parsed rows onto their FRN and firm name.
   * @param {Array<Object>} data  Rows with `FRN` and `Firm` properties.
   * @returns {Promise<Array<{frn: *, firm: *}>>}
   */
  async _getFrnAndName(data) {
    return data.map(({ FRN, Firm }) => ({ 'frn': FRN, 'firm': Firm }));
  }

  /**
   * Runs the whole pipeline: download, parse, project.
   * @returns {Promise<Array<{frn: *, firm: *}>>}
   */
  async getOrgIds() {
    const s3Response = await this._getCsvDataFromS3();
    const rows = await this._parseCsvBufferData(s3Response);

    return this._getFrnAndName(rows);
  }
}
|
||||
|
||||
module.exports = CsvData;
|
1
helpers/dictionary/DE.json
Normal file
1
helpers/dictionary/DE.json
Normal file
@ -0,0 +1 @@
|
||||
[["Akquisitionsgeschäft","Acquisition business"],["Finanztransfergeschäft","Money transmission services"],["Zahlungsauthentifizierungsgeschäft","Payment authentication business"],["Digitalisiertes Zahlungsgeschäft","Digitized payment transaction"],["Ein- oder Auszahlungsgeschäft","Deposit or withdrawal transaction"],["Lastschriftgeschäft mit Kreditgewährung","Direct debit business with lending"],["Lastschriftgeschäft ohne Kreditgewährung","Direct debit business without lending"],["Zahlungskartengeschäft mit Kreditgewährung","Payment card business with credit"],["Zahlungskartengeschäft ohne Kreditgewährung","Payment card business without lending"],["Überweisungsgeschäft mit Kreditgewährung","Credit transfer transaction"],["Überweisungsgeschäft ohne Kreditgewährung","Remittance without credit"],["Auszahlungsgeschäft","Payment business"],["Einzahlungsgeschäft","Deposit business"],["Kontoinformationsdienste","Account information services"],["Zahlungsauslösedienste","Payment release services"],["Abschlußvermittlung","Terminating switch"],["Anlageberatung","Investment advice"],["Anlagevermittlung","Investment brokerage"],["Anlageverwaltung","Investment brokerage"],["Depotgeschäft","Custodian business"],["Diskontgeschäft","Discount store"],["Drittstaateneinlagenvermittlung","Non- EEA deposit broking"],["E-Geld-Geschäft","E-money business"],["Eigengeschäft","Own business"],["Eigenhandel","Proprietary trading"],["Einlagengeschäft","Deposit business"],["Emissionsgeschäft","Underwriting"],["Factoring","Factoring"],["Finanzierungsleasing","Finance lease"],["Finanzkommissionsgeschäft","Broking"],["Finanzportfolioverwaltung","Financial portfolio management"],["Garantiegeschäft","Guarantee business"],["Geldkartengeschäft","Money card business"],["Girogeschäft","Giro business"],["Kreditgeschäft","Lending business"],["Kreditkartengeschäft","Credit card business"],["Netzgeldgeschäft","Network money business"],["Organisiertes Handelssystem (OTF)","Organized trading system 
(OTF)"],["Platzierungsgeschäft","Placement business"],["Revolvinggeschäft, sog.","Revolving business, so-called"],["Scheck- u. Wechseleinzugs- u. Reisescheckgeschäft","Check and Change-in. Traveler's check business"],["Sortengeschäft","Foreign currency dealing"],["Bausparkassengeschäft","Building society business"],["Pfandbriefgeschäft","Pfandbrief business"],["Multilaterales Handelssystem","Multilateral trading system"],["Hypothekenbankengeschäft","Mortgage Banking"],["Entgegennahme von Einlagen (Nr. 1)","Receipt of deposits"],["Tätigkeit als zentrale Gegenpartei","Activity as a central counterparty"],["Tätigkeit als zentrale Gegenpartei nach Art. 14 VO (EU) Nr. 648/2012","Activity as central counterparty according to Art. 14 VO (EU) No. 648/2012"],["Tätigkeit als zentraler Kontrahent","Activity as central counterparty"],["Teilnahme an Versteigerungen für Emissionsberechtigungen","Participation in auctions for emission allowances"]]
|
1
helpers/dictionary/SE.json
Normal file
1
helpers/dictionary/SE.json
Normal file
File diff suppressed because one or more lines are too long
10
helpers/dictionary/cut.se.json
Normal file
10
helpers/dictionary/cut.se.json
Normal file
@ -0,0 +1,10 @@
|
||||
[
|
||||
[
|
||||
"VP-inst, SI - obligationer",
|
||||
""
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - depåbevis",
|
||||
""
|
||||
]
|
||||
]
|
1
helpers/dictionary/lang.de.json
Normal file
1
helpers/dictionary/lang.de.json
Normal file
@ -0,0 +1 @@
|
||||
[["Akquisitionsgeschäft","Acquisition business"],["Finanztransfergeschäft","Money transmission services"],["Zahlungsauthentifizierungsgeschäft","Payment authentication business"],["Digitalisiertes Zahlungsgeschäft","Digitized payment transaction"],["Ein- oder Auszahlungsgeschäft","Deposit or withdrawal transaction"],["Lastschriftgeschäft mit Kreditgewährung","Direct debit business with lending"],["Lastschriftgeschäft ohne Kreditgewährung","Direct debit business without lending"],["Zahlungskartengeschäft mit Kreditgewährung","Payment card business with credit"],["Zahlungskartengeschäft ohne Kreditgewährung","Payment card business without lending"],["Überweisungsgeschäft mit Kreditgewährung","Credit transfer transaction"],["Überweisungsgeschäft ohne Kreditgewährung","Remittance without credit"],["Auszahlungsgeschäft","Payment business"],["Einzahlungsgeschäft","Deposit business"],["Kontoinformationsdienste","Account information services"],["Zahlungsauslösedienste","Payment release services"],["Abschlußvermittlung","Terminating switch"],["Anlageberatung","Investment advice"],["Anlagevermittlung","Investment brokerage"],["Anlageverwaltung","Investment brokerage"],["Depotgeschäft","Custodian business"],["Diskontgeschäft","Discount store"],["Drittstaateneinlagenvermittlung","Non- EEA deposit broking"],["E-Geld-Geschäft","E-money business"],["Eigengeschäft","Own business"],["Eigenhandel","Proprietary trading"],["Einlagengeschäft","Deposit business"],["Emissionsgeschäft","Underwriting"],["Factoring","Factoring"],["Finanzierungsleasing","Finance lease"],["Finanzkommissionsgeschäft","Broking"],["Finanzportfolioverwaltung","Financial portfolio management"],["Garantiegeschäft","Guarantee business"],["Geldkartengeschäft","Money card business"],["Girogeschäft","Giro business"],["Kreditgeschäft","Lending business"],["Kreditkartengeschäft","Credit card business"],["Netzgeldgeschäft","Network money business"],["Organisiertes Handelssystem (OTF)","Organized trading system 
(OTF)"],["Platzierungsgeschäft","Placement business"],["Revolvinggeschäft, sog.","Revolving business, so-called"],["Scheck- u. Wechseleinzugs- u. Reisescheckgeschäft","Check and Change-in. Traveler's check business"],["Sortengeschäft","Foreign currency dealing"],["Bausparkassengeschäft","Building society business"],["Pfandbriefgeschäft","Pfandbrief business"],["Multilaterales Handelssystem","Multilateral trading system"],["Hypothekenbankengeschäft","Mortgage Banking"],["Entgegennahme von Einlagen (Nr. 1)","Receipt of deposits"],["Tätigkeit als zentrale Gegenpartei","Activity as a central counterparty"],["Tätigkeit als zentrale Gegenpartei nach Art. 14 VO (EU) Nr. 648/2012","Activity as central counterparty according to Art. 14 VO (EU) No. 648/2012"],["Tätigkeit als zentraler Kontrahent","Activity as central counterparty"],["Teilnahme an Versteigerungen für Emissionsberechtigungen","Participation in auctions for emission allowances"]]
|
1
helpers/dictionary/lang.se.json
Normal file
1
helpers/dictionary/lang.se.json
Normal file
File diff suppressed because one or more lines are too long
558
helpers/dictionary/trans.se.json
Normal file
558
helpers/dictionary/trans.se.json
Normal file
@ -0,0 +1,558 @@
|
||||
[
|
||||
[
|
||||
"VP-inst, SI - obligationer",
|
||||
"VP inst, SI bonds"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - depåbevis",
|
||||
"VP Inst, SI Depositary Certificate"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - aktier",
|
||||
"VP inst, SI shares"
|
||||
],
|
||||
[
|
||||
"Förvaltare av fondandelar",
|
||||
"Manager of fund units"
|
||||
],
|
||||
[
|
||||
"Mottagande & vidarebefordran av order avs fi instrument",
|
||||
"Receipt and forwarding of order instruments"
|
||||
],
|
||||
[
|
||||
"Utförande av order avs fi instrument på kunders uppdrag",
|
||||
"Execution of order instruments on behalf of clients"
|
||||
],
|
||||
[
|
||||
"Handel med finansiella instrument för egen räkning",
|
||||
"Trade in financial instruments for own account"
|
||||
],
|
||||
[
|
||||
"Diskretionär portföljförvaltning avs finansiella instrument",
|
||||
"Discretionary portfolio management of financial instruments"
|
||||
],
|
||||
[
|
||||
"Investeringsrådgivning till kund avs finansiella instrument",
|
||||
"Investment advice to customer financial instruments"
|
||||
],
|
||||
[
|
||||
"Garantigivning & placering av fi instrument m fast åtagande",
|
||||
"Warranty & placement of fi m instruments m firm commitment"
|
||||
],
|
||||
[
|
||||
"Placering av finansiella instrument utan fast åtagande",
|
||||
"Placement of financial instruments without a firm commitment"
|
||||
],
|
||||
[
|
||||
"Försäkring i samtliga livförsäkr.klasser (direkt) förs.förm.",
|
||||
"Insurance in all life insurance classes (direct)."
|
||||
],
|
||||
[
|
||||
"a) Olycksfalls- och sjukförsäkring (direkt)",
|
||||
"(a) accident and sickness insurance (direct)"
|
||||
],
|
||||
[
|
||||
"Oktroj och stadfästelse av bolagsordning",
|
||||
"Octroj and confirmation of articles of association"
|
||||
],
|
||||
[
|
||||
"BELGIEN",
|
||||
"BELGIUM"
|
||||
],
|
||||
[
|
||||
"Mottagande insättningar och andra återbet. medel fr allmänh.",
|
||||
"Receiving deposits and other repayments. means of public."
|
||||
],
|
||||
[
|
||||
"Portföljförvaltning och -rådgivning",
|
||||
"Portfolio management and consulting"
|
||||
],
|
||||
[
|
||||
"Förvaring och förvaltning av värdepapper",
|
||||
"Securities storage and management"
|
||||
],
|
||||
[
|
||||
"Utlåning",
|
||||
"Loans"
|
||||
],
|
||||
[
|
||||
"Betalningsförmedling",
|
||||
"Money transfers"
|
||||
],
|
||||
[
|
||||
"Garantiförbindelser och ställande av säkerhet",
|
||||
"Warranty and Security"
|
||||
],
|
||||
[
|
||||
"Penningmarknadsinstrument (checkar, växlar, depåbevis m.m)",
|
||||
"Money market instruments (checks, bills, depository receipts, etc.)"
|
||||
],
|
||||
[
|
||||
"Utländsk valuta",
|
||||
"Foreign currency"
|
||||
],
|
||||
[
|
||||
"Finansiella terminer och optioner",
|
||||
"Financial futures and options"
|
||||
],
|
||||
[
|
||||
"Valuta- och ränteinstrument",
|
||||
"Currency and interest rate instruments"
|
||||
],
|
||||
[
|
||||
"Överlåtbara värdepapper",
|
||||
"Transferable securities"
|
||||
],
|
||||
[
|
||||
"Medverkan i värdepappersemi. och tillhandahåll. av tjänster",
|
||||
"Participation in securities issues. and provide. of services"
|
||||
],
|
||||
[
|
||||
"Rådgivning till företag ang. kapitalstr, ind. strategi etc.",
|
||||
"Advice to companies ang. kapitalstr. strategy etc."
|
||||
],
|
||||
[
|
||||
"TYSKLAND",
|
||||
"GERMANY"
|
||||
],
|
||||
[
|
||||
"DANMARK",
|
||||
"DENMARK"
|
||||
],
|
||||
[
|
||||
"ESTLAND",
|
||||
"ESTONIA"
|
||||
],
|
||||
[
|
||||
"SPANIEN",
|
||||
"SPAIN"
|
||||
],
|
||||
[
|
||||
"FINLAND",
|
||||
"FINLAND"
|
||||
],
|
||||
[
|
||||
"FRANKRIKE",
|
||||
"FRANCE"
|
||||
],
|
||||
[
|
||||
"STORBRITANNIEN",
|
||||
"UK"
|
||||
],
|
||||
[
|
||||
"IRLAND",
|
||||
"IRELAND"
|
||||
],
|
||||
[
|
||||
"ITALIEN",
|
||||
"ITALY"
|
||||
],
|
||||
[
|
||||
"Handel för egen eller kunders räkning med ..(se förteckning)",
|
||||
"Trade on behalf of your own or customers with .. (see list)"
|
||||
],
|
||||
[
|
||||
"LITHUANIA",
|
||||
"LITHUANIA"
|
||||
],
|
||||
[
|
||||
"LUXEMBURG",
|
||||
"LUXEMBOURG"
|
||||
],
|
||||
[
|
||||
"LETTLAND",
|
||||
"LATVIA"
|
||||
],
|
||||
[
|
||||
"NEDERLÄNDERNA",
|
||||
"NETHERLANDS"
|
||||
],
|
||||
[
|
||||
"NORGE",
|
||||
"NORWAY"
|
||||
],
|
||||
[
|
||||
"Penningmarknadsmäkling",
|
||||
"Money broking"
|
||||
],
|
||||
[
|
||||
"Tillstånd att bedriva bankrörelse",
|
||||
"Permission to conduct banking operations"
|
||||
],
|
||||
[
|
||||
"Ia) Livförsäkring (direkt)",
|
||||
"Ia) Life insurance (direct)"
|
||||
],
|
||||
[
|
||||
"16. Annan förmögenhetsskada (direkt)",
|
||||
"16. Other property damage (direct)"
|
||||
],
|
||||
[
|
||||
"1. Olycksfall (direkt)",
|
||||
"1. Accidents (direct)"
|
||||
],
|
||||
[
|
||||
"2. Sjukdom (direkt)",
|
||||
"2. Disease (direct)"
|
||||
],
|
||||
[
|
||||
"ÖSTERRIKE",
|
||||
"AUSTRIA"
|
||||
],
|
||||
[
|
||||
"Kreditupplysningstjänster",
|
||||
"Credit Services"
|
||||
],
|
||||
[
|
||||
"Finansiell leasing",
|
||||
"Financial leasing"
|
||||
],
|
||||
[
|
||||
"POLEN",
|
||||
"POLAND"
|
||||
],
|
||||
[
|
||||
"Mottagande & vidarebefordran av order beträffande fi instr",
|
||||
"Receipt and forwarding of orders concerning fi rst"
|
||||
],
|
||||
[
|
||||
"Utförande av order på kunders uppdrag",
|
||||
"Execution of orders on customer assignments"
|
||||
],
|
||||
[
|
||||
"Portföljförvaltning",
|
||||
"portfolio"
|
||||
],
|
||||
[
|
||||
"Investeringsrådgivning",
|
||||
"investment advice"
|
||||
],
|
||||
[
|
||||
"Förvaring av fi instr för kund samt kontanta medel",
|
||||
"Storage of fixtures for customers as well as cash funds"
|
||||
],
|
||||
[
|
||||
"Lämnande av kredit till inv för transaktion i fi instr",
|
||||
"Transmission of credit to transaction in fi rst"
|
||||
],
|
||||
[
|
||||
"Råd till företag & råd och tjänster vid fusioner & uppköp",
|
||||
"Advice for companies & services and mergers & acquisitions"
|
||||
],
|
||||
[
|
||||
"Valutatjänster i samband med investeringstjänster",
|
||||
"Currency services in connection with investment services"
|
||||
],
|
||||
[
|
||||
"GREKLAND",
|
||||
"GREECE"
|
||||
],
|
||||
[
|
||||
"ISLAND",
|
||||
"ICELAND"
|
||||
],
|
||||
[
|
||||
"Handel för egen räkning",
|
||||
"Trade for your own account"
|
||||
],
|
||||
[
|
||||
"PORTUGAL",
|
||||
"PORTUGAL"
|
||||
],
|
||||
[
|
||||
"Gränsöverskridande verksamhet med penningöverföring",
|
||||
"Transboundary activity with money transfer"
|
||||
],
|
||||
[
|
||||
"h) Försäkring i samtliga skadeförsäkringsklasser (direkt)",
|
||||
"h) Insurance in all non-life classes (direct)"
|
||||
],
|
||||
[
|
||||
"Utställande av administration av betalningsmedel",
|
||||
"Exhibition of payment of funds"
|
||||
],
|
||||
[
|
||||
"Operativ risk, schablonmetod",
|
||||
"Operational risk, standardized method"
|
||||
],
|
||||
[
|
||||
"Utgivning av elektroniska pengar",
|
||||
"Electronic money issuance"
|
||||
],
|
||||
[
|
||||
"BULGARIEN",
|
||||
"BULGARIA"
|
||||
],
|
||||
[
|
||||
"CYPERN",
|
||||
"CYPRUS"
|
||||
],
|
||||
[
|
||||
"TJECKIEN",
|
||||
"CZECH REPUBLIC"
|
||||
],
|
||||
[
|
||||
"KROATIEN",
|
||||
"CROATIA"
|
||||
],
|
||||
[
|
||||
"UNGERN",
|
||||
"HUNGARY"
|
||||
],
|
||||
[
|
||||
"LIECHTENSTEIN",
|
||||
"LIECHTENSTEIN"
|
||||
],
|
||||
[
|
||||
"MALTA",
|
||||
"MALTA"
|
||||
],
|
||||
[
|
||||
"RUMÄNIEN",
|
||||
"ROMANIA"
|
||||
],
|
||||
[
|
||||
"SLOVENIEN",
|
||||
"SLOVENIA"
|
||||
],
|
||||
[
|
||||
"SLOVAKIEN",
|
||||
"SLOVAKIA"
|
||||
],
|
||||
[
|
||||
"Tillstånd till metod för intern riskklassificering",
|
||||
"Permission for internal risk classification method"
|
||||
],
|
||||
[
|
||||
"Utgivning av säkerställda obligationer",
|
||||
"Issuance of covered bonds"
|
||||
],
|
||||
[
|
||||
"LITAUEN",
|
||||
"LITHUANIA"
|
||||
],
|
||||
[
|
||||
"Tillstånd att bedriva pensionssparrörelse (IPS)",
|
||||
"Permission to pursue pension retirement (IPS)"
|
||||
],
|
||||
[
|
||||
"Ib) Tilläggsförsäkring till livförsäkring (direkt)",
|
||||
"Ib) Supplementary Insurance to Life Insurance (Direct)"
|
||||
],
|
||||
[
|
||||
"13. Allmän ansvarighet (direkt)",
|
||||
"13. General Accountability (Direct)"
|
||||
],
|
||||
[
|
||||
"17. Rättsskydd (direkt)",
|
||||
"17. Legal protection (direct)"
|
||||
],
|
||||
[
|
||||
"18. Assistans (direkt)",
|
||||
"18. Assistance (direct)"
|
||||
],
|
||||
[
|
||||
"7. Godstransport (direkt)",
|
||||
"7. Freight transport (direct)"
|
||||
],
|
||||
[
|
||||
"8. Brand och naturkrafter (direkt)",
|
||||
"8. Fire and natural forces (direct)"
|
||||
],
|
||||
[
|
||||
"9. Annan sakskada (direkt)",
|
||||
"9. Other property damage (direct)"
|
||||
],
|
||||
[
|
||||
"IV. Lång olycksfall- och sjukförsäkring (direkt)",
|
||||
"IV. Long accident and health insurance (direct)"
|
||||
],
|
||||
[
|
||||
"Ia) Livförsäkring (indirekt)",
|
||||
"(Ia) Life insurance (indirect)"
|
||||
],
|
||||
[
|
||||
"Ib) Tilläggsförsäkring till livförsäkring (indirekt)",
|
||||
"Ib) Supplementary Insurance to Life Insurance (indirect)"
|
||||
],
|
||||
[
|
||||
"III. Försäkring anknuten till värdepappersfonder (direkt)",
|
||||
"III. Insurance linked to mutual funds (direct)"
|
||||
],
|
||||
[
|
||||
"III. Försäkring anknuten till värdepappersfonder (indirekt)",
|
||||
"III. Insurance linked to mutual funds (indirect)"
|
||||
],
|
||||
[
|
||||
"Bankfacktjänster",
|
||||
"Safe custody services"
|
||||
],
|
||||
[
|
||||
"Operativ risk, internmätningsmetod",
|
||||
"Operational risk, internal measurement method"
|
||||
],
|
||||
[
|
||||
"b) Motorfordonsförsäkring (direkt)",
|
||||
"b) Motor vehicle insurance (direct)"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - derivat",
|
||||
"VP inst, SI derivative"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - ETC:er",
|
||||
"VP Inst, SI - ETC"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - ETN:er",
|
||||
"VP Inst, SI - ETN"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - strukturerade finansiella produkter",
|
||||
"VP inst, SI - structured financial products"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - utsläppsrätter",
|
||||
"VP inst, SI emission allowances"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - värdepapperiserade derivat",
|
||||
"VP inst, SI - securitized derivatives"
|
||||
],
|
||||
[
|
||||
"Tillstånd till avancerad metod för intern riskklassifiering",
|
||||
"Permit to advanced method for internal risk classification"
|
||||
],
|
||||
[
|
||||
"Marknadsrisk, VaR-modell",
|
||||
"Market risk, VaR model"
|
||||
],
|
||||
[
|
||||
"Garantiverksamhet & placering av fi instr med fast åtagande",
|
||||
"Warranty & Placement of Fixed Commitment"
|
||||
],
|
||||
[
|
||||
"Tjänster i samband med garantigivning",
|
||||
"Services in connection with warranty"
|
||||
],
|
||||
[
|
||||
"Investeringstj mm för underligg derivat vid investeringstj",
|
||||
"Investment etc. for underlying derivatives at investment"
|
||||
],
|
||||
[
|
||||
"Investerings- och finansanalys & allm rekom av fi instr",
|
||||
"Investment and financial analysis & publications by fi rst"
|
||||
],
|
||||
[
|
||||
"10. Motorfordonsansvar (direkt)",
|
||||
"10. Motor Vehicle Responsibility (Direct)"
|
||||
],
|
||||
[
|
||||
"3. Landfordon (direkt)",
|
||||
"3. Country vehicles (direct)"
|
||||
],
|
||||
[
|
||||
"6. Fartyg (direkt)",
|
||||
"6. Ships (direct)"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - annat aktieliknande instrument",
|
||||
"VP inst, SI - other share-like instrument"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - börshandlade fonder",
|
||||
"VP inst, SI-traded funds"
|
||||
],
|
||||
[
|
||||
"VP-inst, SI - certifikat",
|
||||
"VP Inst, SI Certificate"
|
||||
],
|
||||
[
|
||||
"e) Försäkring mot brand och annan skada på egendom (direkt)",
|
||||
"e) Insurance against fire and other property damage (direct)"
|
||||
],
|
||||
[
|
||||
"Penningöverföring",
|
||||
"money transfer"
|
||||
],
|
||||
[
|
||||
"Genomföra betaltransaktioner",
|
||||
"Make payment transactions"
|
||||
],
|
||||
[
|
||||
"Insättning/uttag betalkonto",
|
||||
"Deposit / withdrawal payment account"
|
||||
],
|
||||
[
|
||||
"Genomföra betalningstransaktioner",
|
||||
"Make payment transactions"
|
||||
],
|
||||
[
|
||||
"Inlåning",
|
||||
"deposits"
|
||||
],
|
||||
[
|
||||
"Genomföra betalningstransaktioner genom kreditutrymme",
|
||||
"Make payment transactions through credit space"
|
||||
],
|
||||
[
|
||||
"Betalningsinitieringstjänster",
|
||||
"Payment initiation services"
|
||||
],
|
||||
[
|
||||
"Kontoinformationstjänster",
|
||||
"account Information"
|
||||
],
|
||||
[
|
||||
"Bostadskreditförmedling",
|
||||
"Housing Finance Agency"
|
||||
],
|
||||
[
|
||||
"Inlösa transaktionsbelopp",
|
||||
"Redeem transaction amount"
|
||||
],
|
||||
[
|
||||
"Genomförande av betaltransaktioner",
|
||||
"Implementation of payment transactions"
|
||||
],
|
||||
[
|
||||
"Genomförande av betaltransaktioner gm kreditutrymme",
|
||||
"Implementation of payment transactions gm credit space"
|
||||
],
|
||||
[
|
||||
"Utfärdande och/eller förvärvande av betalningsinstrument",
|
||||
"Issuance and / or acquisition of payment instruments"
|
||||
],
|
||||
[
|
||||
"GIBRALTAR",
|
||||
"GIBRALTAR"
|
||||
],
|
||||
[
|
||||
"Valutaväxling",
|
||||
"Currency Exchange"
|
||||
],
|
||||
[
|
||||
"Annan finansiell verksamhet",
|
||||
"Other financial activities"
|
||||
],
|
||||
[
|
||||
"Konsumentkreditförmedling",
|
||||
"Consumer Credit Agency"
|
||||
],
|
||||
[
|
||||
"Konsumentkreditgivning",
|
||||
"consumer Credit"
|
||||
],
|
||||
[
|
||||
"Genomföra betalningstransaktioner som systemoperatör",
|
||||
"Make payment transactions as a system operator"
|
||||
],
|
||||
[
|
||||
"Elpengar",
|
||||
"E-money"
|
||||
],
|
||||
[
|
||||
"Betalningstransakt via tele-/informationsteknik-/nätoperatör",
|
||||
"Payment transaction via telecommunications / information technology / network operator"
|
||||
]
|
||||
]
|
50
helpers/dig.js
Normal file
50
helpers/dig.js
Normal file
@ -0,0 +1,50 @@
|
||||
const { format, promisify } = require('util');
|
||||
const exec = promisify(require('child_process').exec);
|
||||
|
||||
/* async function parseDig(output) {
|
||||
console.log('parseDig', output);
|
||||
const lines = output.split(/\n/);
|
||||
const result = {
|
||||
'A': [],
|
||||
'CNAME': []
|
||||
};
|
||||
for (const line of lines) {
|
||||
if (/^A (.*) from/.test(line)) {
|
||||
const scn = line.match(/^A (.*). from/)[1];
|
||||
result.A.push(scn);
|
||||
}
|
||||
if (/^CNAME (.*) from/.test(line)) {
|
||||
const scn = line.match(/^CNAME (.*). from/)[1];
|
||||
result.CNAME.push(scn);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}*/
|
||||
|
||||
/**
 * Runs the `dig` command line tool for a name and returns its raw output.
 *
 * @param {String} name - the query handed to dig
 * @returns {Promise<String>} resolves with dig's stdout. Rejects with a
 *   TypeError when name is not a string, or with the exec error when the
 *   command cannot be run or exits with a failure.
 */
async function dig(name) {
  // Being an async function, this throw becomes a rejection of the returned
  // promise. The previous `new Promise(async ...)` wrapper lost both this
  // error and exec failures, leaving callers waiting forever.
  if (typeof name !== 'string')
    throw new TypeError('name (string) is required');

  // NOTE(review): name is interpolated into a shell command line. Callers must
  // not pass untrusted input here - confirm where name comes from.
  const cmd = format('dig %s +time=3 +retry=1', name);

  console.log('CMD', cmd);

  const { stdout } = await exec(cmd, { 'maxBuffer': 1024 * 1024 });

  return stdout;
}
|
||||
|
||||
module.exports = dig;
|
||||
module.exports.default = dig;
|
3
helpers/m.sh
Executable file
3
helpers/m.sh
Executable file
@ -0,0 +1,3 @@
|
||||
#!/usr/bin/env bash

# Starts the scraper container detached against the obregstoretest-mdtest bucket.
#
# AWS credentials are no longer hard-coded here: `--env NAME` without a value
# forwards the variable from the caller's environment. The key pair that was
# previously committed in this script must be treated as leaked and rotated.
: "${AWS_ACCESS_KEY_ID:?AWS_ACCESS_KEY_ID must be set in the environment}"
: "${AWS_SECRET_ACCESS_KEY:?AWS_SECRET_ACCESS_KEY must be set in the environment}"

# DE_CRON was previously passed with a single dash (`-env`), which docker does
# not parse as --env.
docker run -d \
  --env S3_BUCKET=obregstoretest-mdtest \
  --env AWS_ACCESS_KEY_ID \
  --env AWS_SECRET_ACCESS_KEY \
  --env AWS_REGION=eu-west-1 \
  --env LOGGER_LEVEL=debug \
  --env IE_CRON=1 \
  --env NL_CRON=1 \
  --env CY_CRON=1 \
  --env SE_CRON=1 \
  --env DE_CRON=1 \
  --env cronBump=true \
  mail.caliban.io:5000/obdfcascrape:95a9843
|
14
helpers/s3tools.js
Normal file
14
helpers/s3tools.js
Normal file
@ -0,0 +1,14 @@
|
||||
const AWS = require('aws-sdk');
|
||||
// Load environment variables from the .env file one directory above this one.
// (The call was previously spelled `equire`, which threw a ReferenceError as
// soon as the module was loaded.)
require('dotenv').config({
  'path': `${__dirname}/../.env`
});

const logger = require('log4js').getLogger('S3Tools 🔧');

// Credentials come from the environment; the region falls back to eu-west-1.
AWS.config.update({
  'accessKeyId': process.env.AWS_ACCESS_KEY_ID,
  'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY,
  'region': process.env.AWS_REGION || 'eu-west-1'
});
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// NOTE(review): this file never defines `reduceArticle`, so exporting it threw
// a ReferenceError whenever the module was required. Export an empty object
// until the S3 helpers are actually implemented here.
module.exports = {};
|
1686
helpers/scraper.js
Normal file
1686
helpers/scraper.js
Normal file
File diff suppressed because it is too large
Load Diff
0
helpers/tools.js
Normal file
0
helpers/tools.js
Normal file
60
helpers/watcher.js
Normal file
60
helpers/watcher.js
Normal file
@ -0,0 +1,60 @@
|
||||
// UATU
|
||||
|
||||
const CronJob = require('cron').CronJob;
|
||||
const pm2 = require('pm2');
|
||||
|
||||
const logger = require('log4js').getLogger('WATCHER');
|
||||
const nodeFree = require('node-free');
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
logger.level = 'trace';
|
||||
|
||||
/**
 * Formats a byte count as a human readable string using 1024-based units.
 *
 * @param {Number} bytes - the amount to format
 * @param {Number} [decimals=2] - maximum number of decimals; negative values count as 0
 * @returns {String} e.g. '1.5 KB'. Zero, and anything that is not a finite
 *   number, is reported as '0 Bytes'.
 */
function formatBytes(bytes, decimals = 2) {
  const value = Number(bytes);

  // undefined / NaN / Infinity used to come out as 'NaN undefined'
  if (!Number.isFinite(value) || value === 0) return '0 Bytes';

  const k = 1024;
  const dm = decimals < 0 ? 0 : decimals;
  const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'];

  // Pick the unit from the magnitude so negative values work, and clamp the
  // index so values below 1 (negative logarithm) or beyond YB still map to a
  // real unit instead of `undefined`.
  const exponent = Math.floor(Math.log(Math.abs(value)) / Math.log(k));
  const i = Math.min(sizes.length - 1, Math.max(0, exponent));

  return `${parseFloat((value / Math.pow(k, i)).toFixed(dm))} ${sizes[i]}`;
}
|
||||
|
||||
/**
 * Logs memory and CPU figures for every pm2 process except the one named
 * 'watcher', followed by the total / used / free figures from node-free.
 */
async function logMemory() {
  pm2.list((err, processDescriptionList) => {
    if (err) {
      logger.error(err);

      return;
    }

    for (const { pid, name, monit } of processDescriptionList) {
      const { memory, cpu } = monit;

      // the watcher does not report on itself
      if (name === 'watcher') continue;

      logger.info(`${name} :: PID:${pid} :: MEMORY:${formatBytes(memory)} :: CPU:${cpu}`);
    }
  });

  // pm2.list only registers a callback, so this line is logged without
  // waiting for the per-process lines above.
  const total = formatBytes(nodeFree.total());
  const used = formatBytes(nodeFree.used());
  const free = formatBytes(nodeFree.free());

  logger.info(`Total:${total} :: Used:${used} :: Free:${free}`);
}
|
||||
|
||||
/**
 * Starts the watcher: connects to pm2, logs memory usage once the connect
 * callback fires and then again on the every-five-minutes cron schedule.
 */
async function run() {
  pm2.connect((err) => {
    // A failed connection used to be ignored silently; report it so the
    // cause is visible in the log. logMemory still runs so the system-wide
    // figures are logged either way.
    if (err)
      logger.error('Unable to connect to pm2', err);

    logMemory();
  });

  new CronJob('*/5 * * * *', async () => {
    await logMemory();
  }, null, true);

  logger.info('Watcher Launched');
}

// Registered with `once`: only the first uncaught exception is logged here.
process.once('uncaughtException', function caught(err) {
  logger.error('Uncaught', err);
});

run();
|
||||
|
30
ie.js
Normal file
30
ie.js
Normal file
@ -0,0 +1,30 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Ireland = require('./ncas/ie');
|
||||
|
||||
/**
 * Entry point for the Ireland scraper process.
 *
 * Schedules the scraper when IE_CRON is set, and additionally runs it
 * straight away when SCRAPE_START equals the scraper's id.
 */
async function run() {
  const ieScraper = new Ireland();
  const cronSpec = process.env.IE_CRON;

  if (typeof cronSpec === 'string') {
    console.log(`${ieScraper.id} cron set for ${cronSpec}`);

    new CronJob(cronSpec, async () => {
      await ieScraper.run();
    }, null, true);
  }

  if (process.env.SCRAPE_START === ieScraper.id)
    await ieScraper.run();

  console.log('IE Launched');
}

/**
 * Logs an uncaught exception; attached with `once`, so it handles the first one only.
 */
function caught(err) {
  console.error('Uncaught', err);
}

process.once('uncaughtException', caught);

run();
|
||||
|
80
index.js
Normal file
80
index.js
Normal file
@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
// TODO:
|
||||
// parse arguments - we should run just 1 FCA per go &
|
||||
// have option to run selected company from selected NCA
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
// load helper libs etc
|
||||
// const Fca = require('./ncas/fca');
|
||||
const Ireland = require('./ncas/ie');
|
||||
const Denmark = require('./ncas/dk');
|
||||
const France = require('./ncas/fr');
|
||||
const Cyprus = require('./ncas/cy');
|
||||
const Germany = require('./ncas/de');
|
||||
const Netherlands = require('./ncas/nl');
|
||||
const Sweden = require('./ncas/se');
|
||||
const Malta = require('./ncas/mt');
|
||||
|
||||
/**
 * Constructs every scraper, prints the current date and schedules the Ireland
 * and France scrapers with the cron expression '05 16 * * *'.
 *
 * No scraper is run directly from here; earlier revisions toggled manual runs
 * (fca, ie, dk, fr, cy, nl, se, de) by commenting `await xxScraper.run()` lines
 * in and out.
 */
async function run() {
  // All scrapers are still instantiated, in the original order, because the
  // constructors may have side effects. Only ie and fr are used below.
  const ieScraper = new Ireland();

  new Denmark();

  const frScraper = new France();

  new Cyprus();
  new Germany();
  new Netherlands();
  new Sweden();
  new Malta();

  console.log(new Date());

  // one independent job per scraper, both on the same schedule
  for (const scraper of [ieScraper, frScraper])
    new CronJob('05 16 * * *', async () => {
      await scraper.run();
    }, null, true);
}

run();
|
28
it.js
Normal file
28
it.js
Normal file
@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Italy = require('./ncas/it');
|
||||
|
||||
/**
 * Entry point for the Italy scraper process.
 *
 * Schedules the scraper when IT_CRON is set, and additionally runs it
 * straight away when SCRAPE_START equals the scraper's id.
 */
async function run() {
  const itScraper = new Italy();
  const cronSpec = process.env.IT_CRON;

  if (typeof cronSpec === 'string') {
    new CronJob(cronSpec, async () => {
      await itScraper.run();
    }, null, true);
  }

  if (process.env.SCRAPE_START === itScraper.id)
    await itScraper.run();

  console.log('IT Launched');
}

/**
 * Logs an uncaught exception; attached with `once`, so it handles the first one only.
 */
function caught(err) {
  console.error('Uncaught', err);
}

process.once('uncaughtException', caught);

run();
|
47
launcher.js
Normal file
47
launcher.js
Normal file
@ -0,0 +1,47 @@
|
||||
const jsonfile = require('jsonfile');
|
||||
|
||||
require('dotenv').config();
|
||||
|
||||
/**
 * Builds one pm2-style app definition per country whose <CODE>_CRON variable
 * is defined or whose code equals SCRAPE_START, then prints the resulting list.
 */
(function() {
  // Order matters: it is the order of the printed list.
  const codes = ['CY', 'CZ', 'DE', 'DK', 'FR', 'GI', 'IE', 'IT', 'LU', 'MT', 'NL', 'PL', 'SE', 'SK', 'PT'];

  const isEnabled = code =>
    typeof process.env[`${code}_CRON`] !== 'undefined' || process.env.SCRAPE_START === code;

  const toApp = code => ({
    'name': code,
    'script': `${code.toLowerCase()}.js`,
    'env': {
      'NODE_ENV': 'production'
    },
    'env_production': {
      'NODE_ENV': 'production'
    },
    'autorestart': true,
    'max_restarts': 3,
    'restart_delay': 4000
  });

  const apps = codes.filter(isEnabled).map(toApp);

  console.log(apps);
})();
|
||||
|
25
lt.js
Normal file
25
lt.js
Normal file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Lithuania = require('./ncas/lt');
|
||||
|
||||
/**
 * Entry point for the Lithuania scraper process.
 *
 * Schedules the scraper when LT_CRON is set, and additionally runs it
 * straight away when SCRAPE_START equals the scraper's id.
 */
async function run() {
  const ltScraper = new Lithuania();
  const cronSpec = process.env.LT_CRON;
  const scrape = async () => {
    await ltScraper.run();
  };

  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, scrape, null, true);

  if (process.env.SCRAPE_START === ltScraper.id)
    await ltScraper.run();

  console.log('LT Launched');
}

run();
|
23
lu.js
Normal file
23
lu.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Lux = require('./ncas/lu');
|
||||
|
||||
/**
 * Entry point for the Luxembourg scraper process.
 *
 * Schedules the scraper when LU_CRON is set, and additionally runs it
 * straight away when SCRAPE_START equals the scraper's id.
 */
async function run() {
  const luScraper = new Lux();
  const cronSpec = process.env.LU_CRON;
  const scrape = async () => {
    await luScraper.run();
  };

  if (typeof cronSpec === 'string')
    new CronJob(cronSpec, scrape, null, true);

  if (process.env.SCRAPE_START === luScraper.id)
    await luScraper.run();

  console.log('LU Launched');
}

run();
|
23
lv.js
Normal file
23
lv.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Latvia = require('./ncas/lv');
|
||||
|
||||
/**
 * Launch the Latvia scraper.
 *
 * When LV_CRON is a string, a CronJob is created with it and each tick runs
 * the scraper. When SCRAPE_START equals the scraper's id, one scrape is run
 * immediately before the launch message is logged.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Latvia();
  const cronPattern = process.env.LV_CRON;

  if (typeof cronPattern === 'string')
    new CronJob(cronPattern, async () => {
      // Nothing in this script handles a rejection from the tick callback,
      // so catch here to keep a failed scrape from becoming an unhandled rejection.
      try {
        await scraper.run();
      }
      catch (err) {
        console.error('LV scheduled run failed', err);
      }
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('LV Launched');
}
|
||||
|
||||
run();
|
28
mt.js
Normal file
28
mt.js
Normal file
@ -0,0 +1,28 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Malta = require('./ncas/mt');
|
||||
|
||||
/**
 * Launch the Malta scraper.
 *
 * When MT_CRON is a string, a CronJob is created with it and each tick runs
 * the scraper. When SCRAPE_START equals the scraper's id, one scrape is run
 * immediately. The SCRAPE_START value and a launch message are then logged.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Malta();
  const cronPattern = process.env.MT_CRON;

  if (typeof cronPattern === 'string')
    new CronJob(cronPattern, async () => {
      // Nothing in this script handles a rejection from the tick callback,
      // so catch here to keep a failed scrape from becoming an unhandled rejection.
      try {
        await scraper.run();
      }
      catch (err) {
        console.error('MT scheduled run failed', err);
      }
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('scrapestart', process.env.SCRAPE_START);
  console.log('MT Launched');
}
|
||||
|
||||
// Log the first uncaught exception. The listener is registered with `once`,
// so it is removed after it has fired a single time.
process.once('uncaughtException', (err) => {
  console.error('Uncaught', err);
});
|
||||
|
||||
run();
|
83
ncas/bg.js
Normal file
83
ncas/bg.js
Normal file
@ -0,0 +1,83 @@
|
||||
const logger = require('log4js').getLogger('BG');
|
||||
const path = require('path');
|
||||
|
||||
const Scraper = require('../helpers/scraper');
|
||||
|
||||
/**
 * Scraper for the Bulgarian National Bank site (www.bnb.bg).
 *
 * Loads the payment-oversight and credit-institution register pages, takes a
 * screenshot of each and clicks the download links for the register files.
 */
class BGScrape extends Scraper {

  /**
   * Registers the 'done' handler, exposes a throttled `run`, and in production
   * starts a run once `_checkLock()` resolves with a truthy value.
   */
  constructor() {
    super();
    this.id = 'BG';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l)
          this.run();
      });
  }

  /**
   * Click the first link on the current page whose href contains `filename`,
   * then wait via `_randomWait`.
   *
   * @param {string} filename  Fragment to look for inside the link's href
   * @returns {Promise<void>}
   * @throws {Error} When no matching link exists on the page
   */
  async downloadByHrefFilename(filename) {
    // page.url is a method; interpolating it uncalled would log the function itself
    logger.info(`Downloading ${filename} from ${this.page.url()}`);

    const linkHandles = await this.page.$x(`//a[contains(@href, '${filename}')]`);

    // Fail with a descriptive message rather than a TypeError on `undefined.click`
    if (linkHandles.length === 0)
      throw new Error(`No link found with href containing ${filename}`);

    await linkHandles[0].click();
    await this._randomWait(this.page, 3, 5);
  }

  /**
   * Run the whole BNB scrape: non-repudiation, browser set-up, both register
   * pages (screenshot + downloads), then emit 'done'.
   *
   * @returns {Promise<void>}
   */
  async start() {
    super._start();

    this.setPath(path.resolve(`${__dirname }/../artefacts/BG/BNB`));

    this.startPage = 'http://www.bnb.bg/PaymentSystem/PSPaymentOversight/PSPaymentOversightRegisters/index.htm';
    this.creditInstitutionsPage = 'http://www.bnb.bg/RegistersAndServices/RSCIRegisters/index.htm';

    // site only over http, so skip ssl during non-repudiation
    await this._doNonRepudiation(false, { 'skipSsl': true }).catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser();
    this.page = await this.browser.newPage();
    await this.page.setViewport({ 'width': 1200, 'height': 800 });

    // set cookie for English language and load start page
    await this.page.setCookie({ 'name': 'userLanguage', 'value': 'EN', 'domain': 'www.bnb.bg', 'path': '/' });
    await this._goto(this.startPage, { 'waitUntil':'networkidle0' });

    await this._randomWait(this.page, 3, 5);

    // Awaited so the capture finishes (and any failure surfaces) before the page is driven further
    await this._makeScreenshotV2(this.page, `${this.path}/index1`);

    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    await this.downloadByHrefFilename('ps_po_register_2_en.xls');
    await this.downloadByHrefFilename('ps_po_register_3a_en.xls');

    await this._goto(this.creditInstitutionsPage, { 'waitUntil':'networkidle0' });
    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/index2`);

    // TODO: come back and scrape the html page version of this word doc, if we have time
    await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc');
    await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls');

    // wait until all downloads finished with 'networkidle0' (currently this is only possible with 'page.goto', so we go back to the start page)
    await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });

    this.emit('done');
  }

  /**
   * Entry point wrapped by the throttled `run`; delegates to `start`.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
|
||||
|
||||
module.exports = BGScrape;
|
455
ncas/cy.js
Normal file
455
ncas/cy.js
Normal file
@ -0,0 +1,455 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const url = require('url');
|
||||
const logger = require('log4js').getLogger('CY');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
// load env variables from file
|
||||
|
||||
class CYScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.setID('CY');
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._debounce(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param selector
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async grabLink(selector) {
|
||||
const clickableLinks = await this.page.$$(selector);
|
||||
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
|
||||
if (clickableLinks.length > 0)
|
||||
for (const item of clickableLinks) {
|
||||
const href = await this.page.evaluate(el => el.href, item);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
|
||||
// log this error but Puppeteer isn't supposed to support this sort of download....
|
||||
// mute the ERR_ABORTED error which happens everytime but alert for everything else.
|
||||
|
||||
if (!err.message.includes('net::ERR_ABORTED') )
|
||||
logger.error('grabLink', err);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param id
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async downloadEmoney(id) {
|
||||
const selector = ['#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a', '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'];
|
||||
|
||||
await this.grabLink(selector[id]);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async downloadExcel() {
|
||||
const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';
|
||||
|
||||
await this.grabLink(selector);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handlePaymentInstitutions() {
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const filename = 'licensing-and-supervision-of-payment-institutions';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.downloadExcel();
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleElectronicMoneyInstitutions() {
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const filename = 'licensing-and-supervision-of-electronic-money-institutions';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.downloadEmoney(0);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.downloadEmoney(1);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
this.emit('startProcessingCreditServices');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param body
|
||||
* @returns {Promise<{}|Array>}
|
||||
*/
|
||||
async extractLocalCreditInstitutions(body) {
|
||||
try{
|
||||
const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
|
||||
const sanity = /(\d+\.\s)(.+)/;
|
||||
const $ = cheerio.load(body, {
|
||||
'normalizeWhitespace': true
|
||||
});
|
||||
|
||||
let nextItem;
|
||||
|
||||
$('p').each(function(i, elem) {
|
||||
const lineText = $(this).text();
|
||||
|
||||
const isHeading = matchHeading.test(lineText);
|
||||
if (isHeading)
|
||||
nextItem = $(this).next();
|
||||
});
|
||||
|
||||
if (typeof nextItem !== 'undefined' && nextItem !== null) {
|
||||
const splitText = $(nextItem).text().split('\n');
|
||||
|
||||
const output = [];
|
||||
|
||||
splitText.forEach((item) => {
|
||||
const newItem = this._cleanUp(item);
|
||||
|
||||
if ( newItem !== '')
|
||||
output.push( sanity.exec(newItem)[2]);
|
||||
});
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param body
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractForeignCreditInstitutions(body) {
|
||||
try{
|
||||
const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;
|
||||
|
||||
const sanity = /(\w+\.\s+)(.+)/;
|
||||
|
||||
const $ = cheerio.load(body, {
|
||||
'normalizeWhitespace': true
|
||||
});
|
||||
|
||||
const output = {};
|
||||
|
||||
let nextItem;
|
||||
|
||||
$('p').each(function(i, elem) {
|
||||
const lineText = $(this).text();
|
||||
const isHeading = matchHeading.test(lineText);
|
||||
if (isHeading)
|
||||
nextItem = $(this).next();
|
||||
});
|
||||
|
||||
// Rolling this out for ease as it could be changed by hand
|
||||
let nextElm;
|
||||
|
||||
let firstHeadOrig, firstHead;
|
||||
|
||||
if (typeof nextItem !== 'undefined' && nextItem !== null) {
|
||||
firstHeadOrig = this._cleanUp($(nextItem).text());
|
||||
firstHead = sanity.exec(firstHeadOrig)[2];
|
||||
output[firstHead] = {};
|
||||
|
||||
nextElm = $(nextItem).next();
|
||||
|
||||
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
||||
const secondHead = sanity.exec(secondHeadOrig)[2];
|
||||
|
||||
nextElm = $(nextElm).next();
|
||||
|
||||
const li = $(nextElm).find('li');
|
||||
|
||||
const arrayA = [];
|
||||
$(li).each(function (i, elem) {
|
||||
const lineText = $(this).text();
|
||||
|
||||
arrayA.push(lineText);
|
||||
});
|
||||
|
||||
output[firstHead][secondHead] = arrayA;
|
||||
nextElm = $(nextElm).next();
|
||||
}
|
||||
|
||||
if (typeof nextElm !== 'undefined' && nextElm !== null) {
|
||||
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
||||
const secondHead = sanity.exec(secondHeadOrig)[2];
|
||||
|
||||
nextElm = $(nextElm).next();
|
||||
|
||||
const li = $(nextElm).find('li');
|
||||
|
||||
const arrayA = [];
|
||||
$(li).each(function (i, elem) {
|
||||
const lineText = $(this).text();
|
||||
|
||||
arrayA.push(lineText);
|
||||
});
|
||||
|
||||
output[firstHead][secondHead] = arrayA;
|
||||
nextElm = $(nextElm).next();
|
||||
}
|
||||
|
||||
if (typeof nextElm !== 'undefined' && nextElm !== null) {
|
||||
firstHeadOrig = this._cleanUp($(nextElm).text());
|
||||
firstHead = sanity.exec(firstHeadOrig)[2];
|
||||
output[firstHead] = {};
|
||||
|
||||
nextElm = $(nextElm).next();
|
||||
|
||||
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
||||
const secondHead = sanity.exec(secondHeadOrig)[2];
|
||||
|
||||
nextElm = $(nextElm).next();
|
||||
|
||||
const li = $(nextElm).find('li');
|
||||
|
||||
const arrayA = [];
|
||||
$(li).each(function (i, elem) {
|
||||
const lineText = $(this).text();
|
||||
arrayA.push(lineText);
|
||||
});
|
||||
|
||||
output[firstHead][secondHead] = arrayA;
|
||||
nextElm = $(nextElm).next();
|
||||
}
|
||||
|
||||
if (typeof nextElm !== 'undefined' && nextElm !== null) {
|
||||
const secondHeadOrig = this._cleanUp($(nextElm).text());
|
||||
const secondHead = sanity.exec(secondHeadOrig)[2];
|
||||
|
||||
nextElm = $(nextElm).next();
|
||||
|
||||
const li = $(nextElm).find('li');
|
||||
|
||||
const arrayA = [];
|
||||
$(li).each(function (i, elem) {
|
||||
const lineText = $(this).text();
|
||||
arrayA.push(lineText);
|
||||
});
|
||||
|
||||
output[firstHead][secondHead] = arrayA;
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
catch(err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<{local: Promise<*|void>}>}
|
||||
*/
|
||||
async processCreditInstitute() {
|
||||
logger.info('Credit institutes');
|
||||
try{
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
await this._dumpFile(`${this.path}/creditInstitutes.html`, body);
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const content = $('.generic_page-intro');
|
||||
|
||||
const local = await this.extractLocalCreditInstitutions(content.html());
|
||||
const creditInstitutes = await this.extractForeignCreditInstitutions(content.html());
|
||||
|
||||
await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });
|
||||
|
||||
this.emit('done');
|
||||
|
||||
return { local, creditInstitutes };
|
||||
}
|
||||
catch(err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param filePath
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async savePDF(filePath) {
|
||||
logger.info('Saving the pdf:', filePath);
|
||||
|
||||
await this._randomWait(this.page, 5, 7);
|
||||
await this.page.pdf({ 'path': filePath, 'format': 'A4' });
|
||||
// this.emit('startProcessingCreditServices');
|
||||
logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
const checkPDF = /(.pdf)/g;
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
let currentPath = pageUrl.pathname;
|
||||
let pdfFile;
|
||||
|
||||
if (checkPDF.test(currentPath)) {
|
||||
const splitPath = currentPath.split('/');
|
||||
|
||||
pdfFile = splitPath.pop();
|
||||
currentPath = splitPath.join('/');
|
||||
}
|
||||
|
||||
switch (currentPath) {
|
||||
|
||||
case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
|
||||
await this.handlePaymentInstitutions();
|
||||
break;
|
||||
case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
|
||||
await this.handleElectronicMoneyInstitutions();
|
||||
break;
|
||||
case '/images/media/redirectfile/Electronic%20Money%20Institutions':
|
||||
logger.warn('We should only arrive here when in Non-headless mode');
|
||||
await this.savePDF(pdfFile);
|
||||
break;
|
||||
case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':
|
||||
|
||||
await this.processCreditInstitute();
|
||||
break;
|
||||
default:
|
||||
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl.href}`);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
logger.info('Attaching events');
|
||||
this.on('startProcessingCreditServices', async function() {
|
||||
await this._goto(this.credit);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
try {
|
||||
super._start();
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false
|
||||
};
|
||||
|
||||
this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
|
||||
this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
|
||||
this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';
|
||||
|
||||
this.path = path.resolve(`${__dirname }/../artefacts/CY/CBOC`);
|
||||
await this._createDirectory(this.path);
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser(true);
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch (e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async __run() {
|
||||
logger.info('Scraping Cyprus...');
|
||||
|
||||
await this.start();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = CYScrape;
|
1062
ncas/cz.js
Normal file
1062
ncas/cz.js
Normal file
File diff suppressed because it is too large
Load Diff
597
ncas/de.js
Normal file
597
ncas/de.js
Normal file
@ -0,0 +1,597 @@
|
||||
// version: 0.0.1-20
|
||||
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
const logger = require('log4js').getLogger('DE');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class DEScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.setID('DE');
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._debounce(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildSubIndex() {
|
||||
logger.info('Building sub-index...');
|
||||
|
||||
const currentPage = await this.page.evaluate(() => document);
|
||||
|
||||
const search = currentPage.location.search;
|
||||
const params = this._getParamsFromUrl(search);
|
||||
|
||||
const currentPageID = params.nameZahlungsinstitut || '';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/menu_${currentPageID}`, null);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const links = await this.page.$$('#zahlinst > tbody > tr a');
|
||||
|
||||
for (const item of links) {
|
||||
const id = await this.page.evaluate(el => el.innerText, item);
|
||||
let href = await this.page.evaluate(el => el.href, item);
|
||||
const params = this._getParamsFromUrl(href);
|
||||
|
||||
href = href.concat('&locale=en_GB');
|
||||
|
||||
if (id !== 'Found payment institutions:')
|
||||
this.paymentServices.links.push({ id, href, params });
|
||||
}
|
||||
|
||||
this.index.step++;
|
||||
|
||||
if (this.index.step < this.index.items)
|
||||
this.emit('nextsubindex');
|
||||
else {
|
||||
this.subIndex.done = true;
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
this.emit('subindexdone');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex() {
|
||||
logger.info('Building the index...');
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const links = await this.page.$$('#suchform > div > div:nth-child(2) > div.navigationGruppeBuchstaben a');
|
||||
|
||||
for (const item of links) {
|
||||
const id = await this.page.evaluate(el => el.innerText, item);
|
||||
let href = await this.page.evaluate(el => el.href, item);
|
||||
|
||||
href = href.concat('&locale=en_GB');
|
||||
|
||||
this.index.links.push({ id, href });
|
||||
}
|
||||
|
||||
this.index.done = true;
|
||||
this.index.items = this.index.links.length;
|
||||
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
async initiateCreditIndex() {
|
||||
// first time around.
|
||||
// need to kick off the index correctly..
|
||||
|
||||
const options = await this.page.$$('#institutKategorie option');
|
||||
const wantedOption = ['Credit institutions (BA)', 'Kreditinstitute (BA)'];
|
||||
for (const item of options) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
if (wantedOption.indexOf(text) !== -1) {
|
||||
await this.page.select('#institutKategorie', value);
|
||||
this.creditServices.started = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (this.creditServices.started)
|
||||
this._findAndClick('#sucheButtonInstitut');
|
||||
else
|
||||
throw new Error('Unable to initiate CI Search');
|
||||
}
|
||||
|
||||
async processCreditInstIndexPage() {
|
||||
const noWhiteSpace = /\W/g;
|
||||
logger.info('Building CI sub-index...');
|
||||
|
||||
const wantedRowType = ['CRR-Kreditinstitut'];
|
||||
const currentPage = await this.page.evaluate(() => document);
|
||||
const body = await this.page.content();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const search = currentPage.location.search;
|
||||
const params = this._getParamsFromUrl(search);
|
||||
|
||||
const currentPageID = params['d-4012550-p'] || '';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/credit_instititute_menu_${currentPageID}`, null);
|
||||
|
||||
await this._randomWait(this.page, 7, 10);
|
||||
|
||||
const rows = $('#institut tr');
|
||||
|
||||
rows.each((i, elm) => {
|
||||
const rowClass = cheerio(elm).attr('class');
|
||||
|
||||
if (typeof(rowClass) !== 'undefined') {
|
||||
const children = cheerio(elm).children();
|
||||
|
||||
const rowType = children.eq(1).text();
|
||||
|
||||
if (wantedRowType.indexOf(rowType) !== -1) {
|
||||
const name = this._cleanUp(children.eq(0).text());
|
||||
const id = this._makeFieldName(name);
|
||||
let href = cheerio(children.eq(0)).find('a').attr('href');
|
||||
const params = this._getParamsFromUrl(href);
|
||||
href = href.concat('&locale=en_GB');
|
||||
|
||||
// this is the one we want.
|
||||
|
||||
this.creditServices.links.push({ name, id, href, params });
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const clicked = await this._findAndClick('.pagelinks a', 'Next');
|
||||
if (!clicked) {
|
||||
// come to the end of the index..
|
||||
|
||||
this.creditServices.done = true;
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
|
||||
this.emit('ciindexdone');
|
||||
}
|
||||
}
|
||||
|
||||
async processCreditInstPage() {
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
const id = this.creditServices.links[this.creditServices.step].id;
|
||||
const name = this.creditServices.links[this.creditServices.step].name;
|
||||
logger.info(`Process Credit Service entity ${this.creditServices.step} of ${this.creditServices.items} // ${name}`);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const details = await this.extractPaymentEntity(body);
|
||||
|
||||
const entity = removeAccents.remove(details.description[0].trim());
|
||||
|
||||
const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);
|
||||
|
||||
logger.debug('filename', filename);
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
jsonfile.writeFileSync(`${filePath}.json`, details);
|
||||
|
||||
this.creditServices.links[this.creditServices.step].filename = `${filename}.json`;
|
||||
this.creditServices.links[this.creditServices.step].filePath = `${filePath}`;
|
||||
this.creditServices.step++;
|
||||
|
||||
if (this.creditServices.step < this.creditServices.items) {
|
||||
const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('creditinstdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processCreditInstIndex() {
|
||||
logger.info('Building CI Index..');
|
||||
|
||||
if (!this.creditServices.started)
|
||||
await this.initiateCreditIndex();
|
||||
else
|
||||
await this.processCreditInstIndexPage();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<{description: T[] | jQuery, permissions: {original: Array, translated: Array}}>}
|
||||
*/
|
||||
async extractPaymentEntity(html) {
|
||||
const permissions = { 'original':[], 'translated':[] };
|
||||
|
||||
const newLine = /\n/g;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
let description = $('#content > p').text().split(newLine).filter(line => line.length > 0);
|
||||
|
||||
description = description.map((i) => {
|
||||
return this._cleanUp(i.replace(/\t/g, '')).trim();
|
||||
});
|
||||
|
||||
description = description.filter(item => item.length > 0);
|
||||
|
||||
const rows = $('#erlaubnis > tbody tr');
|
||||
|
||||
rows.each((index, item) => {
|
||||
const cells = $(item).find('td');
|
||||
|
||||
const service = $(cells.get(0)).text();
|
||||
const startAuth = $(cells.get(1)).text();
|
||||
const endAuth = $(cells.get(2)).text();
|
||||
|
||||
const reason = (cells.length === 4) ? $(cells.get(3)).text() : '';
|
||||
|
||||
const phrasing = service.split(' (§');
|
||||
const translated = this._translate(phrasing[0]);
|
||||
|
||||
phrasing[0] = (translated !== '') ? translated : phrasing[0];
|
||||
|
||||
const newObjTrans = {
|
||||
'service': phrasing.join(' (§'),
|
||||
startAuth,
|
||||
endAuth
|
||||
};
|
||||
|
||||
const newObj = {
|
||||
service,
|
||||
startAuth,
|
||||
endAuth
|
||||
};
|
||||
|
||||
if (cells.length === 4) {
|
||||
newObj.reason = reason;
|
||||
newObjTrans.reason = reason;
|
||||
}
|
||||
|
||||
permissions.translated.push(newObjTrans);
|
||||
|
||||
permissions.original.push(newObj);
|
||||
});
|
||||
|
||||
return { description, permissions };
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntity() {
|
||||
const noWhiteSpace = /\W/g;
|
||||
if (!this.subIndex.done) {
|
||||
// We should not be here quite yet, so add this to subindex;
|
||||
const currentPage = await this.page.evaluate(() => document);
|
||||
|
||||
const location = currentPage.location;
|
||||
const id = location.search;
|
||||
let href = location.href;
|
||||
href = href.concat('&locale=en_GB');
|
||||
|
||||
this.paymentServices.links.push({ id, href });
|
||||
|
||||
this.index.step++;
|
||||
|
||||
if (this.index.step < this.index.items)
|
||||
this.emit('nextsubindex');
|
||||
else {
|
||||
logger.info('Sub indexing done...');
|
||||
this.subIndex.done = true;
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
this.emit('subindexdone');
|
||||
}
|
||||
}
|
||||
else {
|
||||
const id = this.paymentServices.links[this.paymentServices.step].id;
|
||||
// logger.info('Process entity:', id);
|
||||
logger.info(`Process entity ${this.paymentServices.step} of ${this.paymentServices.items} // ${id}`);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
|
||||
|
||||
const details = await this.extractPaymentEntity(body);
|
||||
|
||||
const entity = removeAccents.remove(details.description[0].trim());
|
||||
|
||||
// const filename = id.indexOf('?id=') === 0 ? `ps_${entity.replace(noWhiteSpace, '_')}` : `ps_${id.replace(noWhiteSpace, '_')}`;
|
||||
|
||||
const filename = id.indexOf('?id=') === 0 ? this._makeFileName(entity) : this._makeFileName(id);
|
||||
|
||||
logger.debug('filename', filename);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/${filename}.json`, details);
|
||||
this.paymentServices.links[this.paymentServices.step].filename = `${filename}.json`;
|
||||
|
||||
this.paymentServices.step++;
|
||||
|
||||
if (this.paymentServices.step < this.paymentServices.items)
|
||||
await this._goto(this.paymentServices.links[this.paymentServices.step].href);
|
||||
else
|
||||
this.emit('processdone');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param selector
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async grabLink(selector) {
|
||||
try{
|
||||
const clickableLinks = await this.page.$$(selector);
|
||||
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
|
||||
if (clickableLinks.length > 0)
|
||||
for (const item of clickableLinks) {
|
||||
const href = await this.page.evaluate(el => el.href, item);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
await this._goto(href, { 'waitUntil': 'networkidle0' }, true).catch((err) => {
|
||||
// log this error but Puppeteer isn't supposed to support this sort of download....
|
||||
|
||||
logger.warn(err);
|
||||
// throw(Error(err));
|
||||
});
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
// this._uploadError();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEMoney() {
|
||||
logger.info('Process EMoney:');
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const filename = 'e-money_Institutions';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
||||
|
||||
await this._findAndClick('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > h3:nth-child(5)');
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_expanded`, null);
|
||||
|
||||
await this.grabLink('#content > div.sectionRelated.toggleEntry.gsb-toggle > div > ul:nth-child(6) > li > a');
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
this.mode++;
|
||||
this.emit('startcredit');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
switch (pageUrl.pathname) {
|
||||
|
||||
case '/database/ZahlInstInfo/':
|
||||
await this.buildIndex();
|
||||
break;
|
||||
|
||||
case '/database/ZahlInstInfo/suche.do':
|
||||
await this.buildSubIndex();
|
||||
break;
|
||||
case '/database/ZahlInstInfo/zahlinst.do':
|
||||
await this.processEntity();
|
||||
break;
|
||||
case '/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html':
|
||||
await this.processEMoney();
|
||||
break;
|
||||
case '/database/InstInfo/sucheForm.do':
|
||||
await this.processCreditInstIndex();
|
||||
// build index of credit institutes.
|
||||
break;
|
||||
case '/database/InstInfo/institutDetails.do':
|
||||
await this.processCreditInstPage();
|
||||
// build index of credit institutes.
|
||||
break;
|
||||
default:
|
||||
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('startcredit', async function() {
|
||||
logger.info('Starting Credit Institutes');
|
||||
await this._goto(this.credit);
|
||||
});
|
||||
|
||||
this.on('processdone', async function() {
|
||||
logger.warn('Payment Entities done', this.paymentServices.items);
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
await this._randomWait(this.page, 5, 10);
|
||||
await this._goto(this.emoneyUrl);
|
||||
});
|
||||
|
||||
this.on('subindexdone', async function() {
|
||||
logger.info('Sub Index done', this.paymentServices.items);
|
||||
logger.info(this.paymentServices.links[this.paymentServices.step].href);
|
||||
await this._goto(this.paymentServices.links[this.paymentServices.step].href);
|
||||
});
|
||||
|
||||
this.on('indexdone', async function() {
|
||||
logger.info('Index done', this.index.items);
|
||||
logger.info(this.index.links[this.index.step].href);
|
||||
await this._goto(this.index.links[this.index.step].href);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async function() {
|
||||
logger.info('CI Index done', this.creditServices.items);
|
||||
logger.info(this.creditServices.links[this.creditServices.step].href);
|
||||
|
||||
const newUrl = `https://portal.mvp.bafin.de/database/InstInfo/${this.creditServices.links[this.creditServices.step].href}`;
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('creditinstdone', async function() {
|
||||
logger.debug('Credit Institutes done', this.paymentServices.items);
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
await this._randomWait(this.page, 5, 10);
|
||||
this.emit('done');
|
||||
});
|
||||
|
||||
this.on('nextsubindex', async function() {
|
||||
logger.debug(this.index.links[this.index.step].href);
|
||||
await this._goto(this.index.links[this.index.step].href);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
|
||||
this.mode = 0;
|
||||
|
||||
try {
|
||||
await this._loadDictionary();
|
||||
|
||||
this.index = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'started': false,
|
||||
'done' : false
|
||||
};
|
||||
|
||||
this.subIndex = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'started': false,
|
||||
'done' : false
|
||||
};
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false
|
||||
};
|
||||
|
||||
this.startPage = 'https://portal.mvp.bafin.de/database/ZahlInstInfo/?locale=en_GB';
|
||||
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
|
||||
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/DE/BAFIN`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser(true);
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }).catch((err) => {
|
||||
logger.error(err);
|
||||
});
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5, 'Startup');
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = DEScrape;
|
413
ncas/dk.js
Normal file
413
ncas/dk.js
Normal file
@ -0,0 +1,413 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('DK');
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class DKScrape extends Scraper {
|
||||
|
||||
constructor(checkForLock = true) {
|
||||
super();
|
||||
this.id = 'DK';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (checkForLock)
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
|
||||
this.on('error', (err) => {
|
||||
logger.error('Error catcher!!', err);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const currentPage = await this.page.evaluate(() => document);
|
||||
|
||||
const search = currentPage.location.search;
|
||||
|
||||
switch (currentPage.location.pathname) {
|
||||
|
||||
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx':
|
||||
await this.handleStartPage();
|
||||
break;
|
||||
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx':
|
||||
await this.handleSearchResults(search);
|
||||
break;
|
||||
|
||||
case '/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
|
||||
case '/da/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
|
||||
case '/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-vis-virksomhed.aspx':
|
||||
// these are all the same page, just in Danish, Danish and English
|
||||
this.processCoporation();
|
||||
break;
|
||||
default:
|
||||
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${currentPage.location.href}`);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleStartPage() {
|
||||
if (this.mode === 0)
|
||||
await this._findAndClick('ul li a', 'Payment institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Payment+institutions');
|
||||
|
||||
if (this.mode === 1)
|
||||
await this._findAndClick('ul li a', 'Electronic money institutions', 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-soegning.aspx?aid=Payment+services+area&ctid=Electronic+money+institutions');
|
||||
|
||||
if (this.mode === 2) {
|
||||
logger.info('Processing complete');
|
||||
this.done();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<Error>}
|
||||
*/
|
||||
async processCoporation() {
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const h2 = $('h2').eq(0).text();
|
||||
// Virksomhedsoplysninger
|
||||
// Company information
|
||||
|
||||
if (h2 === 'Virksomhedsoplysninger') {
|
||||
logger.warn('Not in English, trying to switch language...');
|
||||
await this._findAndClick('#mainform > div.header > ul > li.ln > a');
|
||||
}
|
||||
else if (h2 === 'Company information') {
|
||||
const noWhiteSpace = /\W/g;
|
||||
let ssName;
|
||||
|
||||
if (this.mode === 0)
|
||||
ssName = this.paymentServices.links[this.paymentServices.step].innerText.replace(noWhiteSpace, '_');
|
||||
else
|
||||
ssName = this.emoneyServices.links[this.emoneyServices.step].innerText.replace(noWhiteSpace, '_');
|
||||
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${ssName}`, null);
|
||||
|
||||
logger.debug('Processing:', this.paymentServices.links[this.paymentServices.step]);
|
||||
|
||||
const fields = await this.extractData(body);
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/${ssName}.json`, fields);
|
||||
await this.downloadExcel();
|
||||
|
||||
if (this.mode === 0)
|
||||
this.paymentServices.step += 1;
|
||||
else
|
||||
this.emoneyServices.step += 1;
|
||||
|
||||
await this._randomWait(this.page, 10, 15);
|
||||
|
||||
// This should take us back to the search result list
|
||||
|
||||
await this._findAndClick('#divContentWidthScr li a', 'To search results');
|
||||
}
|
||||
else
|
||||
return new Error('I do not understand this page...');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $block
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async processDataBlock($block) {
|
||||
const $ = cheerio.load($block);
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
const a = $('tr').map((i, el) => {
|
||||
const head = $(el).find('td').first();
|
||||
const data = $(el).find('td').next();
|
||||
|
||||
return [head.eq(-1).html().split('</div>')[1].replace(/\n/, '').trim(), data.text()];
|
||||
});
|
||||
|
||||
const fields = [];
|
||||
|
||||
for( let step = 0;step < a.length;step = step + 2)
|
||||
fields.push([a[step].replace(noWhiteSpace, '_'), a[step + 1]]);
|
||||
|
||||
return fields;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param body
|
||||
* @returns {Promise<{companyInformation: *[], presence: Array}>}
|
||||
*/
|
||||
async extractData(body) {
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const vutDataContainer = $('.vut-data-container');
|
||||
|
||||
const $basicInfo = vutDataContainer.find('#phmain_0_vut_pnl_basic_info table tbody').get();
|
||||
const $extendednInfo = vutDataContainer.find('#phmain_0_vut_pnl_extended_info table tbody').get();
|
||||
|
||||
const $presenceInfo = vutDataContainer.find('#phmain_0_vut_pnl_tilstedevaerelser table tbody').get();
|
||||
|
||||
let companyInformation = await this.processDataBlock($basicInfo);
|
||||
companyInformation = companyInformation.concat(await this.processDataBlock($extendednInfo));
|
||||
const presence = await this.processDataBlock($presenceInfo);
|
||||
|
||||
return { companyInformation, presence };
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async downloadExcel() {
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
logger.info('Saving excel into:', this.path);
|
||||
|
||||
await this._findAndClick('#phmain_0_vut_link_button_excel');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param search
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleSearchResults(search) {
|
||||
switch (search) {
|
||||
|
||||
case '?aid=Payment+services+area&ctid=Payment+institutions':
|
||||
if (!this.paymentServices.done)
|
||||
await this.handlePaymentServices();
|
||||
else
|
||||
// Are we not done yet?
|
||||
// Restarting the page
|
||||
await this.page.goto(this.startPage);
|
||||
|
||||
break;
|
||||
case '?aid=Payment+services+area&ctid=Electronic+money+institutions':
|
||||
if (!this.emoneyServices.done)
|
||||
await this.handleEmoneyServices();
|
||||
else
|
||||
// Are we not done yet?
|
||||
// Restarting the page
|
||||
await this.page.goto(this.startPage);
|
||||
|
||||
break;
|
||||
case '?restoreSearch=1':
|
||||
if (this.mode === 0)
|
||||
if (this.paymentServices.items > 0 && !this.paymentServices.done)
|
||||
await this.handlePaymentServices();
|
||||
else {
|
||||
// Are we not done yet?
|
||||
// Restarting the page
|
||||
await this.page.goto(this.startPage);
|
||||
}
|
||||
|
||||
if (this.mode === 1)
|
||||
if (this.emoneyServices.items > 0 && !this.emoneyServices.done)
|
||||
await this.handleEmoneyServices();
|
||||
|
||||
break;
|
||||
default:
|
||||
// Menu fell through
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractLinks() {
|
||||
const returnObj = [];
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
const rows = await this.page.$$('.search-further-data tr a');
|
||||
|
||||
for (const item of rows) {
|
||||
const innerText = await this.page.evaluate(el => el.innerText, item);
|
||||
const href = await this.page.evaluate(el => el.href, item);
|
||||
const id = await this.page.evaluate(el => el.id, item);
|
||||
|
||||
returnObj.push( {
|
||||
innerText,
|
||||
href,
|
||||
id
|
||||
});
|
||||
}
|
||||
|
||||
return returnObj;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleEmoneyServices() {
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');
|
||||
|
||||
if (!this.emoneyServices.visited)
|
||||
if (this.emoneyServices.items === 0) {
|
||||
// first visit, Build the list
|
||||
|
||||
this.emoneyServices.links = await this.extractLinks();
|
||||
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
|
||||
this.emoneyServices.visited = true;
|
||||
}
|
||||
|
||||
if (this.emoneyServices.visited)
|
||||
|
||||
if (this.emoneyServices.step < this.emoneyServices.items) {
|
||||
const nextItem = this.emoneyServices.links[this.emoneyServices.step];
|
||||
|
||||
// Not using an await here. We want to click and exit this page so we don't get tied up
|
||||
|
||||
this._findAndClick(`#${nextItem.id}`, nextItem.innerText);
|
||||
}
|
||||
else {
|
||||
// EMoney services complete, move onto the next service.
|
||||
this.emoneyServices.done = true;
|
||||
this.mode = 2;
|
||||
await this.page.goto(this.startPage);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handlePaymentServices() {
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._findAndClick('#phmain_0_Search1_allBtn', 'SHOW ALL');
|
||||
|
||||
if (!this.paymentServices.visited)
|
||||
if (this.paymentServices.items === 0) {
|
||||
// first visit, Build the list
|
||||
|
||||
this.paymentServices.links = await this.extractLinks();
|
||||
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
|
||||
this.paymentServices.visited = true;
|
||||
}
|
||||
|
||||
if (this.paymentServices.visited)
|
||||
|
||||
if (this.paymentServices.step < this.paymentServices.items) {
|
||||
const nextItem = this.paymentServices.links[this.paymentServices.step];
|
||||
|
||||
// Not using an await here. We want to click and exit this page so we don't get tied up
|
||||
|
||||
this._findAndClick(`#${nextItem.id}`, nextItem.innerText);
|
||||
}
|
||||
else {
|
||||
// Payment services complete, move onto the next service.
|
||||
this.paymentServices.done = true;
|
||||
this.mode = 1;
|
||||
await this.page.goto(this.startPage);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
// Financial Supervisory Authority
|
||||
// Government ministry
|
||||
// https://en.wikipedia.org/wiki/Financial_Supervisory_Authority_(Denmark)
|
||||
|
||||
this.mode = 0;
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': {
|
||||
|
||||
},
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': {
|
||||
|
||||
},
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false
|
||||
};
|
||||
|
||||
this.startPage = 'http://vut.finanstilsynet.dk/en/Tal-og-fakta/Virksomheder-under-tilsyn/VUT-database.aspx';
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/DK/FSA`));
|
||||
|
||||
await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
|
||||
logger.error(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
this.page = await this.browser.newPage();
|
||||
|
||||
this.page.on('domcontentloaded', () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('####', err);
|
||||
this.emit('done');
|
||||
});
|
||||
});
|
||||
|
||||
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this.page.goto(this.startPage).catch((err) => {
|
||||
logger.error(err);
|
||||
this._uploadError();
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = DKScrape;
|
498
ncas/dkV2.js
Normal file
498
ncas/dkV2.js
Normal file
@ -0,0 +1,498 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const path = require('path');
|
||||
const logger = require('log4js').getLogger('DK');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class DKScrape extends Scraper {
|
||||
|
||||
constructor(checkForLock = true) {
|
||||
super();
|
||||
this.id = 'DK';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (checkForLock)
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
|
||||
this.on('error', (err) => {
|
||||
logger.error('Error catcher!!', err);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async movePageToTop() {
|
||||
await this.page.evaluate(() => {
|
||||
window.scrollTo(0, 0);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async movePageToBottom() {
|
||||
await this.page.evaluate(() => {
|
||||
window.scrollBy(0, window.innerHeight);
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async renameFile() {
|
||||
const filename = this.filenames[this.step];
|
||||
|
||||
const sourceFile = 'Finanstilsynets virksomhedsregister - SQL.xlsx';
|
||||
|
||||
const origFile = `${this.path}/${sourceFile}`;
|
||||
const newFile = `${this.path}/${filename}.xlsx`;
|
||||
|
||||
await this._renameFile(origFile, newFile);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async clickReturn() {
|
||||
logger.debug('clickReturn');
|
||||
await this.iframe.waitForSelector('#lsAnalysisPage > div > div:nth-child(2)', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
|
||||
console.log('found');
|
||||
await elm.click({ 'delay':90 });
|
||||
}).catch((e) => {
|
||||
logger.error('iframe missing stuff', e);
|
||||
// pageLoaded = false;
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'after clickReturn click');
|
||||
this.step++;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async scrollContainer() {
|
||||
await this.page.evaluate(() => {
|
||||
console.log('window.innerWidth', window.innerWidth);
|
||||
window.scrollBy(window.innerWidth, window.innerHeight);
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'scroll x?');
|
||||
|
||||
this.page.mouse.move(1061, 437);
|
||||
await this._randomWait(this.page, 2, 3, 'bottom right scroll arrow');
|
||||
|
||||
for(let count = 0; count < 15; count++) {
|
||||
this.page.mouse.click(1061, 437, { 'delay':500 });
|
||||
await this._randomWait(this.page, 1, 2, 'scrolling');
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 4, 5, 'after scroll');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async clickExport() {
|
||||
logger.debug('clickExport');
|
||||
|
||||
await this.movePageToTop();
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'Move to top');
|
||||
|
||||
const filename = this.filenames[this.step];
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
await this._randomWait(this.page, 4, 4, 'Screenshot');
|
||||
|
||||
this.page.mouse.move(175, 440);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 175, 440');
|
||||
|
||||
this.page.mouse.click(175, 440, { 'button':'right', 'delay':90 });
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 175, 440');
|
||||
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
|
||||
await this.iframe.waitForSelector('div.lsDialogContent > div:nth-child(2)', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
|
||||
console.log('found');
|
||||
await elm.click({ 'delay':90 });
|
||||
}).catch((e) => {
|
||||
logger.error('iframe missing stuff', e);
|
||||
// pageLoaded = false;
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'after clickExport click');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async clickSearch() {
|
||||
logger.debug('clickSearch');
|
||||
|
||||
await this.movePageToBottom();
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Move to bottom');
|
||||
|
||||
await this.iframe.waitForSelector('#lsAnalysisPage > div > div:nth-child(11)', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
|
||||
console.log('found');
|
||||
await elm.click({ 'delay':90 });
|
||||
}).catch((e) => {
|
||||
logger.error('iframe missing stuff', e);
|
||||
// pageLoaded = false;
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'after clickSearch click');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async selectPs01() {
|
||||
logger.debug('selectPs01');
|
||||
|
||||
/*
|
||||
List 1 - betalingstjeneste området = Payment Service Area
|
||||
List 2 - Betalingsinstitutter = Payment Institutions
|
||||
*/
|
||||
|
||||
this.page.mouse.move(200, 418);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
|
||||
|
||||
this.page.mouse.click(200, 418);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
|
||||
|
||||
this.page.mouse.move(400, 434);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 400, 434');
|
||||
|
||||
this.page.mouse.click(400, 434);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 400, 434');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async selectPs02() {
|
||||
logger.debug('selectPs02');
|
||||
|
||||
/*
|
||||
List 1 - betalingstjeneste området = Payment Service Area
|
||||
List 2 - List 2 - Udbyder af betalingstjenester med begraenset tilladelse = Provider of limited payment services
|
||||
|
||||
*/
|
||||
|
||||
this.page.mouse.move(200, 418);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
|
||||
|
||||
this.page.mouse.click(200, 418);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
|
||||
|
||||
this.page.mouse.move(400, 585);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 400, 585');
|
||||
|
||||
this.page.mouse.click(400, 585);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 400, 585');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async selectEm01() {
|
||||
logger.debug('selectEm01');
|
||||
|
||||
/*
|
||||
List 1 - betalingstjeneste området = Payment Service Area
|
||||
List 2 - E-penge-institutter = E money Institutions
|
||||
*/
|
||||
|
||||
this.page.mouse.move(200, 418);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
|
||||
|
||||
this.page.mouse.click(200, 418);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
|
||||
|
||||
this.page.mouse.move(400, 473);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 400, 473');
|
||||
|
||||
this.page.mouse.click(400, 473);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 400, 473');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async selectEm02() {
|
||||
logger.debug('selectEm01');
|
||||
|
||||
/*
|
||||
List 1 - betalingstjeneste området = Payment Service Area
|
||||
List 2 - Udbyder af elektroniske penge med begraenset tilladelse = Provider of electronic money with limited permission
|
||||
*/
|
||||
|
||||
this.page.mouse.move(200, 418);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 200, 418');
|
||||
|
||||
this.page.mouse.click(200, 418);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 200, 418');
|
||||
|
||||
this.page.mouse.move(400, 631);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 400, 631');
|
||||
|
||||
this.page.mouse.click(400, 631);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 400, 631');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async selectCs01() {
|
||||
logger.debug('selectCs01');
|
||||
|
||||
/*
|
||||
List 1 - Kreditinsti Området = Credit Institution
|
||||
List 2 - pengeinstitutter = Banks
|
||||
*/
|
||||
|
||||
this.page.mouse.move(200, 508);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 200, 508');
|
||||
|
||||
this.page.mouse.click(200, 508);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 200, 508');
|
||||
|
||||
this.page.mouse.move(400, 473);
|
||||
await this._randomWait(this.page, 2, 3, 'Move 400, 473');
|
||||
|
||||
this.page.mouse.click(400, 473);
|
||||
|
||||
await this._randomWait(this.page, 2, 3, 'Click 400, 473');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async motions() {
|
||||
do
|
||||
switch(this.step) {
|
||||
|
||||
case 0:
|
||||
await this.selectPs01();
|
||||
|
||||
await this.clickSearch();
|
||||
|
||||
await this.scrollContainer();
|
||||
|
||||
await this.clickExport();
|
||||
|
||||
await this.renameFile();
|
||||
|
||||
await this.clickReturn();
|
||||
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.selectPs02();
|
||||
|
||||
await this.clickSearch();
|
||||
|
||||
await this.clickExport();
|
||||
|
||||
await this.renameFile();
|
||||
|
||||
await this.clickReturn();
|
||||
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.selectEm01();
|
||||
|
||||
await this.clickSearch();
|
||||
|
||||
await this.scrollContainer();
|
||||
|
||||
await this.clickExport();
|
||||
|
||||
await this.renameFile();
|
||||
|
||||
await this.clickReturn();
|
||||
|
||||
break;
|
||||
|
||||
case 3:
|
||||
await this.selectEm02();
|
||||
|
||||
await this.clickSearch();
|
||||
|
||||
await this.scrollContainer();
|
||||
|
||||
await this.clickExport();
|
||||
|
||||
await this.renameFile();
|
||||
|
||||
await this.clickReturn();
|
||||
|
||||
break;
|
||||
|
||||
case 4:
|
||||
await this.selectCs01();
|
||||
|
||||
await this.clickSearch();
|
||||
|
||||
await this.scrollContainer();
|
||||
|
||||
await this.clickExport();
|
||||
|
||||
await this.renameFile();
|
||||
|
||||
await this.clickReturn();
|
||||
|
||||
break;
|
||||
|
||||
default:
|
||||
// Menu fell through
|
||||
this.complete = true;
|
||||
|
||||
this.emit('done');
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
while(!this.complete );
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async waitForIframe() {
|
||||
await this.page.waitForSelector('body > div.site > div > div.content > div.content__sections > section > center > iframe', { 'visible':true, 'timeout':75000 }).then(async (elm) => {
|
||||
logger.debug('iframe');
|
||||
|
||||
this.iframe = await this.page.mainFrame().childFrames()[0].childFrames()[0];
|
||||
|
||||
await this._randomWait(this.page, 15, 20);
|
||||
|
||||
await this.motions();
|
||||
}).catch((e) => {
|
||||
logger.error('processEntityDetails', e);
|
||||
// pageLoaded = false;
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
|
||||
async processNewPage() {
|
||||
logger.debug('** processNewPage');
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
logger.debug(pageUrl);
|
||||
|
||||
await this.waitForIframe();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
// Financial Supervisory Authority
|
||||
// Government ministry
|
||||
// https://en.wikipedia.org/wiki/Financial_Supervisory_Authority_(Denmark)
|
||||
|
||||
this.mode = 0;
|
||||
|
||||
this.step = 0;
|
||||
this.complete = false;
|
||||
|
||||
this.filenames = ['paymentServices01', 'paymentServices02', 'eMoney01', 'eMoney02', 'creditServices01', 'creditServices02'];
|
||||
|
||||
this.startPage = 'https://virksomhedsregister.finanstilsynet.dk/listeudtr%C3%A6k-en.html';
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/DK/FSA`));
|
||||
|
||||
await this._doNonRepudiation(false, { 'sslWithPrefix': false }).catch((err) => {
|
||||
logger.error(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
await this._makeResponsive();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 5000));
|
||||
|
||||
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this.page.goto(this.startPage).catch((err) => {
|
||||
logger.error(err);
|
||||
this._uploadError();
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
* @private
|
||||
*/
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = DKScrape;
|
781
ncas/ee.js
Normal file
781
ncas/ee.js
Normal file
@ -0,0 +1,781 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('EE');
|
||||
const url = require('url');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class EEScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'EE';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
this.recover = this._debounce(async () => {
|
||||
await this.__recover();
|
||||
}, 120000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractIndexItems(html, serviceObject) {
|
||||
const newArray = [] ;
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
const links = $('a');
|
||||
|
||||
links.each((i, item) => {
|
||||
const href = $(item).attr('href');
|
||||
const text = this._cleanUp($(item).text());
|
||||
|
||||
const newUrl = `${this.rootURI}${href}`;
|
||||
|
||||
newArray.push({ 'name':text, 'href':newUrl });
|
||||
});
|
||||
|
||||
return newArray;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntityDetails(html) {
|
||||
try {
|
||||
const newObj = {};
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const title = $('h1.page-title').text();
|
||||
|
||||
newObj.title = this._cleanUp(title);
|
||||
|
||||
const tables = $('article div.table-wrap table');
|
||||
|
||||
const rows = $(tables).eq(0).find('tbody > tr');
|
||||
|
||||
rows.each((i, item) => {
|
||||
const children = $(item).children();
|
||||
|
||||
const curLabel = this._makeFieldName($(children).eq(0).text());
|
||||
|
||||
newObj[curLabel] = (this._cleanUp($(children).eq(1).text()));
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntityServices(html) {
|
||||
try {
|
||||
const newObj = {};
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const tables = $('article div.table-wrap table');
|
||||
|
||||
if (tables.length > 1)
|
||||
|
||||
tables.each((i, table) => {
|
||||
if (i > 0) {
|
||||
const label = this._makeFieldName($(table).find('caption').text());
|
||||
|
||||
const services = $(table).find('div.field__item').map((i, el) => {
|
||||
return this._cleanUp($(el).text());
|
||||
}).get();
|
||||
|
||||
if (!newObj.hasOwnProperty(label))
|
||||
newObj[label] = services.slice();
|
||||
else
|
||||
newObj[label] = newObj[label].concat(services);
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param blockType
|
||||
* @returns {{licenseDescription: string, blockType: string}}
|
||||
*/
|
||||
extractEntityLicense(html ) {
|
||||
try {
|
||||
const blockType = 'Licenses';
|
||||
const newObj = { 'licenseDescription':'', 'blockType': blockType, 'licenses' : [] };
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const header = $(`h3:contains("${blockType}")`);
|
||||
|
||||
if ($(header).length === 0) return {};
|
||||
|
||||
const fieldContent = $(header).next();
|
||||
|
||||
const children = $(fieldContent).children();
|
||||
|
||||
children.each((i, item) => {
|
||||
const newLicense = {};
|
||||
newLicense.permitNumber = this._cleanUp($(item).find('div.field--name-field-permit-number div.field__item').text()) ;
|
||||
newLicense.permitEntryDate = this._cleanUp($(item).find('div.field--name-field-permit-entry-date div.field__item').text()) ;
|
||||
|
||||
const block = $(item).find('div.field--name-field-permit-restrictions');
|
||||
|
||||
newLicense.restrictions = $(block).find('p').map((i, el) => {
|
||||
return this._cleanUp($(el).text());
|
||||
}).get();
|
||||
|
||||
newObj.licenses.push(newLicense);
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param blockType
|
||||
* @returns {{licenseDescription: string, blockType: string}}
|
||||
*/
|
||||
extractEntityCrossBorder(html ) {
|
||||
try {
|
||||
const blockType = 'List of cross-border services provided';
|
||||
const newObj = { 'crossBorder' : [] };
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const header = $(`h3:contains("${blockType}")`);
|
||||
|
||||
if ($(header).length === 0) return {};
|
||||
|
||||
const fieldContent = $(header).next();
|
||||
|
||||
const children = $(fieldContent).children();
|
||||
|
||||
children.each((i, item) => {
|
||||
const cb = {};
|
||||
cb.permitNumber = this._cleanUp($(item).find('div.field--name-field-overborder-permit-number div.field__item').text()) ;
|
||||
cb.permitEntryDate = this._cleanUp($(item).find('div.field--name-field-overborder-permit-date div.field__item').text()) ;
|
||||
cb.startDate = this._cleanUp($(item).find('div.field--name-field-overborder-permit-start div.field__item').text()) ;
|
||||
|
||||
// field--name-field-overborder-permit-start
|
||||
const block = $(item).find('div.field--name-field-services-list');
|
||||
|
||||
cb.cbServices = $(block).find('div.paragraph--type--subject-services-list').map((i, el) => {
|
||||
const service = this._cleanUp($(el).children().eq(0).text());
|
||||
const country = this._cleanUp($(el).children().eq(1).text());
|
||||
|
||||
return { service, country };
|
||||
}).get();
|
||||
|
||||
newObj.crossBorder.push(cb);
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param blockType
|
||||
* @returns {{licenseDescription: string, blockType: string}}
|
||||
*/
|
||||
extractEntityBranches(html ) {
|
||||
try {
|
||||
const subDetails = [['country', 'field--name-field-country'], ['businessName', 'field--name-field-business-name'], ['address', 'field--name-field-address'], ['phone', 'field--name-field-phone']];
|
||||
const blockType = 'Branches';
|
||||
const newObj = { 'branches' : [] };
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const header = $(`h3:contains("${blockType}")`);
|
||||
|
||||
if ($(header).length === 0) return {};
|
||||
|
||||
const fieldContent = $(header).next();
|
||||
|
||||
const children = $(fieldContent).children();
|
||||
|
||||
children.each((i, item) => {
|
||||
const workObj = { 'details' : {}, 'branchServices':[], 'licenses':{} };
|
||||
|
||||
workObj.name = this._cleanUp($(item).find('header.paragraph-heading h4').text());
|
||||
|
||||
for (const sdItems of subDetails)
|
||||
workObj.details[sdItems[0]] = this._cleanUp($(item).find(`div.${sdItems[1]} div.field__item`).text()) ;
|
||||
|
||||
const branchPermissions = $(item).find('div.field--name-field-branch-permissions');
|
||||
const branchServices = $(item).find('div.field--name-field-branch-services');
|
||||
|
||||
workObj.branchServices = $(branchServices).find('div.paragraph--type--subject-services-list-simple div.field__item').map((i, el) => {
|
||||
return this._cleanUp($(el).text());
|
||||
}).get();
|
||||
|
||||
workObj.licenses = $(branchPermissions).find('div.paragraph--type--subject-branch-permits').map((i, el) => {
|
||||
const permitNumber = this._cleanUp($(el).children().eq(0).find('div.field__item').text());
|
||||
const start = this._cleanUp($(el).children().eq(1).find('div.field__item').text());
|
||||
|
||||
return { permitNumber, start };
|
||||
}).get();
|
||||
|
||||
newObj.branches.push(workObj);
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetails(serviceObject) {
|
||||
const id = serviceObject.links[serviceObject.step].name;
|
||||
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
|
||||
|
||||
let pageLoaded = true;
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = this._makeFileName(entity);
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.page.waitForSelector('h1.page-title').catch((e) => {
|
||||
logger.error('processEntityDetails', e);
|
||||
pageLoaded = false;
|
||||
});
|
||||
|
||||
if (pageLoaded) {
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
// --
|
||||
|
||||
const details = await this.extractEntityDetails(body);
|
||||
|
||||
const licenses = await this.extractEntityLicense(body);
|
||||
const crossBorder = await this.extractEntityCrossBorder(body);
|
||||
const services = await this.extractEntityServices(body);
|
||||
const branches = await this.extractEntityBranches(body);
|
||||
|
||||
// --
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details, licenses, crossBorder, services, branches });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl).catch((err) => {
|
||||
if (err.name === 'TimeoutError')
|
||||
this.emit('recover');
|
||||
});
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processIndex(serviceObject) {
|
||||
let html = '';
|
||||
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.page.waitForSelector('div.view-content', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
html = await this.page.evaluate(el => el.outerHTML, elm);
|
||||
}).catch((e) => {
|
||||
logger.error(e);
|
||||
logger.warn('No index list');
|
||||
});
|
||||
|
||||
const indexList = await this.extractIndexItems(html);
|
||||
|
||||
logger.debug('serviceObject.indexStep', serviceObject.indexStep);
|
||||
|
||||
serviceObject.links = serviceObject.links.concat(indexList).map((v) => {
|
||||
v['meta'] = serviceObject.indexStep;
|
||||
|
||||
return v;
|
||||
});
|
||||
|
||||
const filename = this.modeNames[this.mode];
|
||||
|
||||
await this._randomWait(this.page, 5, 7);
|
||||
|
||||
const subStep = (serviceObject.pageCount > 0) ? `-${serviceObject.pageCount}` : '';
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}${subStep}`, null);
|
||||
|
||||
await this.page.waitForSelector('li.next-nav > a.button.next', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
logger.debug('Next page..');
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
await this._randomWait(this.page, 5, 7);
|
||||
serviceObject.pageCount++;
|
||||
this.emit('pageChanged');
|
||||
}).catch(() => {
|
||||
serviceObject.pageCount = 0;
|
||||
this.emit('indexdone');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
await this.page.waitForSelector('div.view-content', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await this.processIndex(serviceObject);
|
||||
}).catch((e) => {
|
||||
// logger.error(e);
|
||||
logger.warn('No index list');
|
||||
this.emit('indexdone');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
const pathname = pageUrl.pathname;
|
||||
|
||||
logger.debug('workMode::', ['Indexing', 'Scraping'][this.workMode]);
|
||||
|
||||
if (pathname === '/') {
|
||||
logger.error('Invalid path');
|
||||
logger.debug(JSON.stringify(pageUrl));
|
||||
logger.warn('processNewPage::emit recover');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
switch (this.workMode) {
|
||||
|
||||
case 0:
|
||||
await this.indexRedirector();
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl.href);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async restart() {
|
||||
logger.info(`Restarting ${this.modeTitles[this.mode]}`);
|
||||
|
||||
this._goto(this.lastUrl);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
* @private
|
||||
*/
|
||||
async __recover() {
|
||||
logger.warn('*** RECONNECTING PAGE ***');
|
||||
|
||||
logger.info('BrowserCrashed:', this.browserCrashed);
|
||||
|
||||
await this._forcePageClose();
|
||||
|
||||
if (this.browserCrashed)
|
||||
await this._initBrowser();
|
||||
|
||||
await this._createBrowserPage();
|
||||
this.page.on('domcontentloaded', () => {
|
||||
this.processNewPage();
|
||||
});
|
||||
const timeout = 90000;
|
||||
|
||||
setTimeout(async() => {
|
||||
logger.warn('Attempting recovery..');
|
||||
|
||||
await this.restart();
|
||||
}, timeout);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('pageChanged', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
// clear out stock recover handler
|
||||
|
||||
this.removeAllListeners('recover');
|
||||
|
||||
this.on('recover', async () => {
|
||||
logger.info('onRecover');
|
||||
await this.recover();
|
||||
});
|
||||
|
||||
this.on('entityComplete', () => {
|
||||
this.handleEntityComplete();
|
||||
});
|
||||
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
let newUrl;
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
|
||||
this.paymentServices.indexStep++;
|
||||
|
||||
if (this.paymentServices.indexStep >= this.paymentServices.urls.length) {
|
||||
this.workMode = 1;
|
||||
logger.debug(JSON.stringify(this.paymentServices));
|
||||
newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
}
|
||||
else
|
||||
newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
let newUrl;
|
||||
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
|
||||
this.emoneyServices.indexStep++;
|
||||
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length) {
|
||||
this.workMode = 1;
|
||||
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
}
|
||||
else
|
||||
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
let newUrl;
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
|
||||
this.creditServices.indexStep++;
|
||||
if (this.creditServices.indexStep >= this.creditServices.urls.length) {
|
||||
this.workMode = 1;
|
||||
newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
}
|
||||
else
|
||||
newUrl = this.creditServices.urls[this.creditServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('indexdone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
this.workMode = 0;
|
||||
await super._paymentServicesDone();
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
logger.warn('emoneyServicesDone');
|
||||
this.workMode = 0;
|
||||
try{
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
logger.warn('creditServicesDone');
|
||||
this.workMode = 0;
|
||||
try{
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
this.workMode = 0;
|
||||
|
||||
this.rootURI = 'https://www.fi.ee';
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.fi.ee/en/payment-services/payment-institutions/estonian-payment-institutions',
|
||||
'https://www.fi.ee/en/payment-services/payment-services/payment-institutions/estonian-payment-institutions-exemption',
|
||||
'https://www.fi.ee/en/payment-services/payment-institutions/payment-services/branches-foreign-payment-institutions',
|
||||
'https://www.fi.ee/en/payment-services/payment-services/payment-institutions/payment-agents',
|
||||
'https://www.fi.ee/en/payment-services/payment-institutions/payment-services/providers-cross-border-payment-sevices',
|
||||
'https://www.fi.ee/en/payment-services/payment-institutions/payment-agents-providers-cross-border-payment-services'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : [],
|
||||
'pageCount' : 0
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/estonian-e-money-institutions',
|
||||
'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/estonian-e-money-institutions-exemption',
|
||||
'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/distributors-e-money',
|
||||
'https://www.fi.ee/en/payment-services/e-money-institutions/providers-cross-border-e-money-services',
|
||||
'https://www.fi.ee/en/distributors-providers-cross-border-e-money-services',
|
||||
'https://www.fi.ee/en/payment-services/payment-services/e-money-institutions/branches-foreign-e-money-institutions'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : [],
|
||||
'pageCount' : 0
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/licensed-credit-institutions-estonia',
|
||||
'https://www.fi.ee/en/banking-and-credit/credit-institutions/affiliated-branches-foreign-credit-institutions',
|
||||
'https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/representative-offices-foreign-credit-institutions',
|
||||
'https://www.fi.ee/en/banking-and-credit/banking-and-credit/credit-institutions/providers-cross-border-banking-services'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : [],
|
||||
'pageCount' : 0
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/EE/FI`));
|
||||
|
||||
// await this._doNonRepudiation();
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
//
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.paymentServices.urls[0], { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = EEScrape;
|
574
ncas/es.js
Normal file
574
ncas/es.js
Normal file
@ -0,0 +1,574 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const logger = require('log4js').getLogger('ES');
|
||||
const url = require('url');
|
||||
const querystring = require('querystring');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
const jsonfile = require('jsonfile');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class ESScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'ES';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractPassporting($) {
|
||||
const passporting = [];
|
||||
|
||||
const headerRow = $('td.tdSubtituloSeccion:contains("PAISES EN LOS QUE OPERA")').eq(0).parent().eq(0);
|
||||
|
||||
const passportRows = headerRow.nextAll('tr:not([height])'); // ignore the small divider row
|
||||
|
||||
passportRows.each(function(i, elem) {
|
||||
passporting.push(
|
||||
{
|
||||
'country': $(elem).find('td').eq(0).text(),
|
||||
'mode': $(elem).find('td').eq(1).text()
|
||||
}
|
||||
);
|
||||
});
|
||||
|
||||
return passporting;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractActivities($) {
|
||||
const activities = [];
|
||||
|
||||
const headerRow = $('td.tdSubtituloSeccion td.tdSubtituloSeccion:contains("ACTIVIDADES")').eq(0).parent().eq(0);
|
||||
|
||||
const activityRows = headerRow.nextAll('tr:not([height])'); // ignore the small divider row
|
||||
|
||||
activityRows.each(function(i, elem) {
|
||||
activities.push($(elem).text());
|
||||
});
|
||||
|
||||
for (let i = 0; i < activities.length; i++)
|
||||
activities[i] = this._cleanUp(activities[i]);
|
||||
|
||||
return activities;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @param details
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractSingleFields($, details) {
|
||||
const mainDiv = $('div#divSalida > table.tablaParametros > tbody > tr > td> table > tbody');
|
||||
|
||||
details.bancoDeEspanaCode = this._cleanUp($(mainDiv).find('input[name=CODIGO]').val());
|
||||
details.bancoDeEspanaPrevCode = this._cleanUp($(mainDiv).find('input[name=CODIGO_PREVIO]').val());
|
||||
details.effectiveFrom = this._cleanUp($(mainDiv).find('input[name=FechaAlta1]').val());
|
||||
details.effectiveTo = this._cleanUp($(mainDiv).find('input[name=FechaBaja]').val());
|
||||
details.lastUpdated = this._cleanUp($(mainDiv).find('input[name=FechaActualizacion]').val());
|
||||
|
||||
details.name = this._cleanUp(
|
||||
// Can't find accent in "Denominación:" so search for half the word:
|
||||
$(mainDiv).children('tr:contains("Denominaci")').nextAll().eq(0).find('textarea').text()
|
||||
);
|
||||
|
||||
details.institutionType = this._cleanUp(
|
||||
$(mainDiv).children('tr:contains("Tipo de entidad:")').nextAll().eq(0).find('textarea').text()
|
||||
);
|
||||
|
||||
details.address = this._cleanUp(
|
||||
$(mainDiv).children('tr:contains("Domicilio:")').nextAll().eq(0).find('textarea').text()
|
||||
);
|
||||
|
||||
details.legalEntityIdentifierCode = this._cleanUp(
|
||||
$(mainDiv).find('input[name=CODIGO_PREVIO]').parent().nextAll().eq(3).children('input').val()
|
||||
);
|
||||
|
||||
details.shortName = this._cleanUp(
|
||||
$(mainDiv).find('td.textoEtiqueta:contains("Nombre abreviado:")').nextAll().eq(1).children('input').val()
|
||||
);
|
||||
|
||||
details.nif = this._cleanUp(
|
||||
$(mainDiv).find('td.textoEtiqueta:contains("N.I.F.:")').nextAll().eq(1).find('td.textoCampo input').val()
|
||||
);
|
||||
|
||||
// Can't find "Teléfono", probably due to accent. Search for "fono" instead.
|
||||
details.telephone = this._cleanUp(
|
||||
$(mainDiv).find('td.textoEtiqueta:contains("fono:")').nextAll().eq(1).find('td.textoCampo input').val()
|
||||
);
|
||||
|
||||
details.fax = this._cleanUp(
|
||||
$(mainDiv).find('td.textoEtiqueta:contains("Fax:")').nextAll().eq(1).find('td.textoCampo input').val()
|
||||
);
|
||||
|
||||
details.website = this._cleanUp(
|
||||
$(mainDiv).find('td.textoEtiqueta:contains("Dom. / Dir. Internet:")').nextAll().eq(1).find('a').text()
|
||||
);
|
||||
|
||||
details.safeguardOfFunds = this._cleanUp(
|
||||
$(mainDiv).find('td.tdSubtituloSeccion:contains("SALVAGUARDA DE FONDOS")').parent().nextAll('tr').eq(1).text()
|
||||
);
|
||||
|
||||
details.financialExclusivity = this._cleanUp(
|
||||
$(mainDiv).find('td.tdSubtituloSeccion:contains("EXCLUSIVIDAD FINANCIERA")').parent().nextAll('tr').eq(1).text()
|
||||
);
|
||||
|
||||
if ($(mainDiv).find('li.textoAvisoResaltado').length > 0)
|
||||
details.notice = this._cleanUp(
|
||||
$(mainDiv).find('li.textoAvisoResaltado').text()
|
||||
);
|
||||
else
|
||||
details.notice = '';
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntityDetails(html) {
|
||||
const details = {};
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
try {
|
||||
await this.extractSingleFields($, details);
|
||||
|
||||
details.activities = await this.extractActivities($);
|
||||
|
||||
details.passporting = await this.extractPassporting($);
|
||||
}
|
||||
catch (err) {
|
||||
logger.error(err);
|
||||
}
|
||||
|
||||
return details;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processIndex(serviceObject) {
|
||||
const noResultsSelector = '//td[@class="textoEtiqueta"][contains(text(), "NO SE HAN ENCONTRADO ENTIDADES SEGUN LOS CRITERIOS DE BUSQUEDA.")]';
|
||||
const paginationRowSelector = '//table[@class="tablaResultados"]//td[@colspan="4"]';
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
// pagination row is the last to load, so wait for that before scraping the links
|
||||
// Sometimes the row is empty, so look for the surrounding td with `colspan=4`
|
||||
// also look for the "no results" notice in case the result set is empty
|
||||
await this.page.waitForXPath(`${noResultsSelector} | ${paginationRowSelector}`);
|
||||
|
||||
logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);
|
||||
const filename = this.modeNames[this.mode];
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}_${serviceObject.paginationStep}`, null);
|
||||
|
||||
if (this.page.$x(noResultsSelector).length > 0) {
|
||||
logger.info(`Results page ${serviceObject.indexStep} for ${this.modeNames[this.mode]} is empty`);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO: handle when the table loads, but the entity links are missing (happens occasionally)
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const $ = cheerio.load(body);
|
||||
const links = $('table.tablaResultados tr.estilofila a');
|
||||
|
||||
links.each((i, item) => {
|
||||
const href = $(item).attr('href');
|
||||
|
||||
// ignore any javascript print links
|
||||
if (href.startsWith('javascript'))
|
||||
return;
|
||||
|
||||
const text = $(item).text().trim();
|
||||
|
||||
const newUrl = `http://app.bde.es${href}`;
|
||||
const id = this._makeFieldName(text);
|
||||
|
||||
serviceObject.links.push({ 'name':text, 'href':newUrl, 'id':id });
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
await this._randomWait(this.page, 6, 9);
|
||||
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}, page ${serviceObject.paginationStep}...`);
|
||||
|
||||
await this.processIndex(serviceObject);
|
||||
|
||||
const nextButtons = await this.page.$x('//a[contains(text(), \'Siguiente\')]');
|
||||
if (nextButtons.length > 0) {
|
||||
serviceObject.paginationStep++;
|
||||
await nextButtons[0].click();
|
||||
}
|
||||
else if (serviceObject.indexStep < serviceObject.urls.length - 1) {
|
||||
serviceObject.indexStep++;
|
||||
serviceObject.paginationStep = 0;
|
||||
const newUrl = serviceObject.urls[serviceObject.indexStep];
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
logger.debug('>> indexRedirector');
|
||||
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetails(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
const { name, id } = serviceObject.links[serviceObject.step];
|
||||
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
|
||||
|
||||
await this.page.waitForSelector('td.tdContenido', { 'visible':true, 'timeout':7500 }); // Wait for buttons at bottom of table to be visible
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const details = await this.extractEntityDetails(body);
|
||||
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const qstring = querystring.parse(pageUrl.search);
|
||||
|
||||
if ('TIPO' in qstring) // 'type'
|
||||
await this.indexRedirector();
|
||||
else if ('CODBE' in qstring) // 'code'
|
||||
await this.processRedirector();
|
||||
else {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('serviceDone', async function() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} paymentServices items indexed`);
|
||||
|
||||
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} emoneyServices items indexed`);
|
||||
|
||||
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} creditServices items indexed`);
|
||||
|
||||
const newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('indexdone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
logger.warn('paymentServicesDone');
|
||||
try{
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
logger.warn('emoneyServicesDone');
|
||||
try{
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
logger.warn('creditServicesDone');
|
||||
try{
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'paginationStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': [
|
||||
'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EP&DONDE=11&LEI=&ORDEN=2&RADIO=0', // Payment Entities
|
||||
'http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EPH&DONDE=11&LEI=&ORDEN=2&RADIO=0' // Hybrid Payment Entities
|
||||
],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'paginationStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=EDE&DONDE=11&LEI=&ORDEN=2&RADIO=0'], // Electronic Money Entities
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'paginationStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['http://app.bde.es/ren/app/Search?CFG=ConsultaEntidadesCon.xml&TipoFormato=XSL&Paginate=OPEN&TIPO=BP&DONDE=11&LEI=&ORDEN=2&RADIO=0'], // Credit institutions
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/ES/BE`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = ESScrape;
|
193
ncas/fca.js
Normal file
193
ncas/fca.js
Normal file
@ -0,0 +1,193 @@
|
||||
// load env variables from file
|
||||
require('dotenv').config({
|
||||
'path': `${__dirname }/../.env`
|
||||
});
|
||||
|
||||
const version = '0.0.1-1';
|
||||
// load helper libs etc
|
||||
const CsvData = require('../helpers/csv-data');
|
||||
const csv = new CsvData();
|
||||
|
||||
const Scraper = require('../helpers/scraper');
|
||||
|
||||
const cheerio = require('cheerio');
|
||||
const fs = require('fs');
|
||||
|
||||
// Builds the 1-based sequence [1, 2, ..., n].
const range = n => Array.from({ 'length': n }).map((unused, index) => index + 1);
|
||||
|
||||
// Search-box text to type for each organisation id.
const searchables = new Map()
  .set(759676, '759676 Barclays Bank UK PLC')
  .set(661836, '661836 American Express Services Europe Limited (AESEL)');
|
||||
|
||||
const userAgents = ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
|
||||
'Mozilla/5.0 (Linux; Ubuntu 14.04) AppleWebKit/537.36 Chromium/35.0.1870.2 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/47.0.2526.73 Chrome/47.0.2526.73 Safari/537.36',
|
||||
'Mozilla/5.0 (Linux; Ubuntu 16.04) AppleWebKit/537.36 Chromium/57.0.2987.110 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58.0.3029.110 Chrome/58.0.3029.110 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux armv7l) AppleWebKit/537.42 (KHTML, like Gecko) Chromium/25.0.1349.2 Chrome/25.0.1349.2 Safari/537.42',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
|
||||
'Mozilla/5.0 (Linux; Ubuntu 14.04 like Android 4.4) AppleWebKit/537.36 Chromium/35.0.1870.2 Mobile Safari/537.36'];
|
||||
|
||||
class FCAScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
async _checkPassporting(page, id, waitFor) {
|
||||
const passportingHeader = await page.$x('//a[contains(text(), \'Passport Out\')]');
|
||||
if (passportingHeader.length > 0) {
|
||||
await passportingHeader[0].click(); // click tab to open passporting accordion
|
||||
await this._makeScreenshot(page, `${id}-passporting`, waitFor); // save general screen
|
||||
|
||||
// check how many countries we need to parse
|
||||
const countryLinks = await page.$$('#PanelShPo_PassportOut .countries li');
|
||||
const passportingTempArray = range(countryLinks.length);
|
||||
console.log('>> passportingTempArray', passportingTempArray);
|
||||
|
||||
for (const item of passportingTempArray) {
|
||||
await page.mouse.move(50, 50, 100);
|
||||
|
||||
console.log(id, item);
|
||||
// const cookies = await page.cookies();
|
||||
// const cookiesNames = cookies.map(el => {
|
||||
// return {name : el.name};
|
||||
// });
|
||||
// console.log(cookiesNames);
|
||||
await page.deleteCookie(
|
||||
{ 'name': '_gat' },
|
||||
{ 'name': '_gid' },
|
||||
{ 'name': '_ga' },
|
||||
{ 'name': '__cfduid' },
|
||||
{ 'name': 'pctrk' }
|
||||
);
|
||||
|
||||
/* const newAgent = userAgents[Math.floor(Math.random() * (userAgents.length - 1))];
|
||||
|
||||
console.log('New agent:', newAgent);
|
||||
|
||||
await page.setUserAgent(newAgent);*/
|
||||
await this._processPassportingCountry(page, id, item);
|
||||
}
|
||||
}
|
||||
else
|
||||
throw new Error('Passporting not found');
|
||||
}
|
||||
|
||||
async _processPassportingCountry(page, orgId, id) {
|
||||
// Mousedown Duration between 90 - 120ms
|
||||
const mouseDownDuration = 90 + Math.floor(Math.random() * (30 - 1));
|
||||
|
||||
console.log('Mouse duration:', mouseDownDuration);
|
||||
|
||||
await page.click(`#PanelShPo_PassportOut .countries li:nth-child(${id}) a`, { 'delay':mouseDownDuration });
|
||||
await this._randomWait(page, 20, 40);
|
||||
|
||||
const innerHtml = await page.evaluate(() => document.body.innerHTML);
|
||||
await this._makeScreenshot(page, `${orgId}-${id}-passporting`);
|
||||
await this._saveToFile(`${orgId}-${id}-inner.html`, innerHtml);
|
||||
|
||||
const parsedPassportOut = await this._parseHtmlPassportingData(innerHtml);
|
||||
await this._saveToFile(`${orgId}-${id}-parsed.json`, JSON.stringify(parsedPassportOut));
|
||||
}
|
||||
|
||||
async _getOrgData(id) {
|
||||
try {
|
||||
await this._initBrowser();
|
||||
const page = await this.browser.newPage();
|
||||
|
||||
// await page.setUserAgent(userAgents[Math.floor(Math.random() * (userAgents.length - 1))]);
|
||||
|
||||
console.log('>> Wanted searchable', searchables.get(id));
|
||||
|
||||
await page.goto('https://register.fca.org.uk/ShPo_HomePage');
|
||||
await page.type('input[type=text].input.form-control.searchbox', searchables.get(id));
|
||||
await page.keyboard.press(String.fromCharCode(13)); // press Enter (so we do not need to search for submit button by CSS selector)
|
||||
await page.waitForSelector('div.RecordDetails h1.RecordName');
|
||||
|
||||
// make general screenshot
|
||||
await this._makeScreenshot(page, `${id}-general`);
|
||||
|
||||
// check if org has passporting rights and parse if poss
|
||||
await this._checkPassporting(page, id);
|
||||
|
||||
await this.browser.close();
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async _parseHtmlPassportingData(innerHtml) {
|
||||
const $ = cheerio.load(innerHtml);
|
||||
|
||||
// get List of PassportOut countries
|
||||
const countries = [];
|
||||
$('li.PassportOutLink a').each((i, el) => {
|
||||
countries[i] = $(el).text();
|
||||
});
|
||||
|
||||
// get current country data
|
||||
// lets count tables - how many different directives!
|
||||
const directives = $('.ShPo_PassportOutTable').map((i, el) => {
|
||||
const head = $(el).find('table tbody tr').first().find('th');
|
||||
|
||||
// table headers
|
||||
const country = head.eq(0).text().trim();
|
||||
const directive = head.eq(1).text().trim();
|
||||
const passportType = head.eq(2).text().trim();
|
||||
|
||||
// get actual table data
|
||||
const data = $(el).find('table tbody tr').find('td').map((i, el) => {
|
||||
// if element does contain H3 - we need more parsing
|
||||
if ($(el).find('.InvestmentTypes li').length) {
|
||||
const name = $(el).find('h3').text().trim();
|
||||
const investment = $(el).find('.InvestmentTypes li').map((ii, subel) => {
|
||||
const name = $(subel).text().trim();
|
||||
let tt = null;
|
||||
|
||||
// check if LI contains span == it has tooltips, get data and override null
|
||||
if ($(subel).find('span').length) {
|
||||
const $$ = cheerio.load($(subel).find('span').data('content'));
|
||||
tt = $$('div').text().trim();
|
||||
}
|
||||
|
||||
return { name, tt };
|
||||
}).get();
|
||||
|
||||
return { name, investment };
|
||||
}
|
||||
|
||||
// no lists in HTML, so record just name
|
||||
else
|
||||
return {
|
||||
'name': $(el).text().trim(),
|
||||
'investment': null
|
||||
};
|
||||
}).get();
|
||||
|
||||
return { country, directive, passportType, data };
|
||||
}).get();
|
||||
|
||||
return directives;
|
||||
}
|
||||
|
||||
// TODO: get initial list as per ticket
|
||||
// https://register.fca.org.uk/shpo_searchresultspage?preDefined=AIPISP&TOKEN=3wq1nht7eg7tr
|
||||
async getInitialList(page) {
|
||||
return;
|
||||
}
|
||||
|
||||
async run() {
|
||||
const passporting = await this._getOrgData(661836);
|
||||
|
||||
// const passporting = await this._parseCurrentPassporting(1);
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = FCAScrape;
|
488
ncas/fr.js
Normal file
488
ncas/fr.js
Normal file
@ -0,0 +1,488 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('FR');
|
||||
const url = require('url');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
// load env variables from file
|
||||
|
||||
class FRScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super(); // must call super for "this" to be defined.
|
||||
this.setID('FR');
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._debounce(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param path
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async gotoPage(path = null) {
|
||||
const newUrl = `${this.parsedUrl.protocol}//${this.parsedUrl.hostname}${path.link}`;
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
logger.info('newurl:', newUrl);
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param rows
|
||||
* @returns {Array}
|
||||
*/
|
||||
extractDataFromTable(rows) {
|
||||
const unchecked = /(unchecked)/;
|
||||
const output = [];
|
||||
const crossBorder = [];
|
||||
|
||||
let currentActivityID ;
|
||||
rows.each((i, elm) => {
|
||||
const children = cheerio(elm).children();
|
||||
let newItem;
|
||||
|
||||
if (children.eq(1).text().trim() !== '')
|
||||
currentActivityID = children.eq(1).text().trim();
|
||||
|
||||
if (children.eq(0).html().match(unchecked) === null)
|
||||
if (children.length === 2) {
|
||||
crossBorder.push(this._cleanUp(currentActivityID.trim()));
|
||||
}
|
||||
else
|
||||
if (children.length === 3) {
|
||||
newItem = [currentActivityID, this._cleanUp(children.eq(2).text().trim())];
|
||||
output.push(newItem);
|
||||
}
|
||||
else {
|
||||
newItem = [`${currentActivityID}${children.eq(2).text().replace(')', '').trim()}`, this._cleanUp(children.eq(3).text().trim())];
|
||||
output.push(newItem);
|
||||
}
|
||||
});
|
||||
|
||||
return { output, crossBorder };
|
||||
}
|
||||
|
||||
extractDataFromInvestmentServicesTable(rows) {
|
||||
const unchecked = /(unchecked)/;
|
||||
const output = [];
|
||||
const authorised = [];
|
||||
const financialInstruments = [];
|
||||
|
||||
rows.each((i, elm) => {
|
||||
const finInst = [];
|
||||
const children = cheerio(elm).children();
|
||||
|
||||
if (children.length > 2) {
|
||||
if (children.length === 11)
|
||||
children.each((step, fiElm) => {
|
||||
financialInstruments.push(this._cleanUp(cheerio(fiElm).text()));
|
||||
});
|
||||
|
||||
if (children.length > 11) {
|
||||
let offset = (children.length - 1) - financialInstruments.length;
|
||||
const fiOffset = (offset === 0) ? 1 : 2;
|
||||
|
||||
const rowName = children.eq(offset).text();
|
||||
offset++;
|
||||
while(offset < financialInstruments.length) {
|
||||
if (children.eq(offset).html().match(unchecked) === null)
|
||||
finInst.push(financialInstruments[offset - fiOffset]);
|
||||
|
||||
offset++;
|
||||
}
|
||||
if (finInst.length > 0)
|
||||
output.push([rowName, finInst]);
|
||||
}
|
||||
}
|
||||
else if (children.length === 2)
|
||||
|
||||
if (children.eq(0).html().match(unchecked) === null) {
|
||||
authorised.push(this._cleanUp(children.eq(1).text()));
|
||||
}
|
||||
});
|
||||
|
||||
return { 'investmentServices':output, authorised };
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param tables
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractEuroData(tables) {
|
||||
const dataBlock = [];
|
||||
const findToColon = /^.*?(?=(:))/;
|
||||
const trimToColon = /^.*?(?=(:)).\s/;
|
||||
|
||||
const divs = tables.find('div.zone_succ');
|
||||
|
||||
divs.each((i, elm) => {
|
||||
const p = cheerio(elm).find('p').eq(0).text();
|
||||
|
||||
const title = this._cleanUp(p.match(findToColon)[0]).trim();
|
||||
const country = this._cleanUp(p.split(trimToColon)[2]).trim();
|
||||
|
||||
const obj = {};
|
||||
obj[title] = country;
|
||||
|
||||
const rows = cheerio(elm).find('table tr');
|
||||
|
||||
const data = this.extractDataFromTable(rows);
|
||||
|
||||
obj.paymentServices = data.output;
|
||||
obj.crossBorder = data.crossBorder;
|
||||
|
||||
dataBlock.push(obj);
|
||||
});
|
||||
|
||||
return dataBlock;
|
||||
}
|
||||
|
||||
async extractLinks($table, creditInstFilter = false) {
|
||||
const wantedCIStatuses = ['legal entity/ company'];
|
||||
const links = [];
|
||||
logger.info('Extracting links...');
|
||||
if ($table.length > 1)
|
||||
// The table contains more than just the heading row
|
||||
for (let count = 1;count < $table.length;count++) {
|
||||
const $row = cheerio($table.get(count)).find('td');
|
||||
|
||||
const $item = $row.children().eq(2);
|
||||
|
||||
const link = $item.attr('href');
|
||||
const title = this._cleanUp($item.text());
|
||||
|
||||
if (!creditInstFilter)
|
||||
// Default mode
|
||||
links.push({ link, title });
|
||||
else
|
||||
if ($row.children().length === 6) {
|
||||
const status = this._cleanUp($row.children().eq(5).text().toLowerCase());
|
||||
|
||||
logger.debug(`Status:**${status}** ${title}`);
|
||||
if(wantedCIStatuses.indexOf(status) !== -1) {
|
||||
logger.debug(`Matched:**${status}** ${title}`);
|
||||
links.push({ link, title });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return links;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractDetails($) {
|
||||
const findToColon = /^.*?(?=(:))/;
|
||||
const trimToColon = /^.*?(?=(:)).\s/;
|
||||
const details = [];
|
||||
|
||||
$('div#zone_description ul.nopuce li').each((i, elm) => {
|
||||
if ($(elm).children().length > 0) {
|
||||
const matched = $(elm).text().match(findToColon);
|
||||
|
||||
if (matched !== null) {
|
||||
const field = this._cleanUp($(elm).text().match(findToColon)[0]).trim();
|
||||
const data = this._cleanUp( $(elm).text().split(trimToColon)[2]);
|
||||
|
||||
details.push([field, data]);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return details;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processAFPage() {
|
||||
const noWhiteSpace = /\W/g;
|
||||
const trimToColon = /^.*?(?=(:)).\s/;
|
||||
|
||||
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
|
||||
const $ = cheerio.load(body);
|
||||
const modeFilename = ['ps_', 'em_', 'ci_'];
|
||||
|
||||
const pageData = { 'description':[], 'frActivities':null, 'EUActivities':[] };
|
||||
|
||||
pageData.entity = removeAccents.remove($('p.sttr').eq(0).text().replace(trimToColon, '').trim());
|
||||
|
||||
const filename = `${modeFilename[this.mode]}${pageData.entity.replace(noWhiteSpace, '_')}`;
|
||||
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
|
||||
|
||||
pageData.description = await this.extractDetails($);
|
||||
|
||||
await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(3) a');
|
||||
|
||||
// Process France / French details
|
||||
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${filename}_france`, null);
|
||||
|
||||
const frenchTbl = $('#zone_en_france > table tr');
|
||||
|
||||
if (this.mode < 2)
|
||||
pageData.frActivities = await this.extractDataFromTable(frenchTbl).output;
|
||||
else
|
||||
pageData.creditInstituteActivities = await this.extractDataFromInvestmentServicesTable(frenchTbl);
|
||||
|
||||
if (this.mode < 2) {
|
||||
await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(5) a');
|
||||
|
||||
// Process EU Details
|
||||
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${filename}_europe`, null);
|
||||
|
||||
const euroTbls = $('#zone_en_europe');
|
||||
|
||||
pageData.EUActivities = await this.extractEuroData(euroTbls);
|
||||
}
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/${filename}.json`, pageData);
|
||||
|
||||
if (this.mode === 0 ) {
|
||||
this.paymentServices.links[this.paymentServices.step].filename = `${filename}.json`;
|
||||
this.paymentServices.step++;
|
||||
}
|
||||
else if( this.mode === 1) {
|
||||
this.emoneyServices.links[this.emoneyServices.step].filename = `${filename}.json`;
|
||||
this.emoneyServices.step++;
|
||||
}
|
||||
else if( this.mode === 2) {
|
||||
this.creditServices.links[this.creditServices.step].filename = `${filename}.json`;
|
||||
this.creditServices.step++;
|
||||
}
|
||||
|
||||
this.perf.scraped++;
|
||||
await this._randomWait(this.page, 5, 7);
|
||||
|
||||
if (this.mode === 0)
|
||||
if (this.paymentServices.step < this.paymentServices.items)
|
||||
await this.gotoPage(this.paymentServices.links[this.paymentServices.step]);
|
||||
else {
|
||||
logger.debug('Payment services complete.');
|
||||
this.paymentServices.done = true;
|
||||
|
||||
this.mode++;
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
await this._goto(this.eMoneyUrl);
|
||||
}
|
||||
|
||||
else if (this.mode === 1)
|
||||
if (this.emoneyServices.step < this.emoneyServices.items)
|
||||
await this.gotoPage(this.emoneyServices.links[this.emoneyServices.step]);
|
||||
else {
|
||||
logger.debug('EMoney services complete.');
|
||||
this.emoneyServices.done = true;
|
||||
this.mode++;
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
await this._goto(this.creditUrl);
|
||||
}
|
||||
|
||||
else if (this.mode === 2)
|
||||
if (this.creditServices.step < this.creditServices.items)
|
||||
await this.gotoPage(this.creditServices.links[this.creditServices.step]);
|
||||
else {
|
||||
logger.debug('Credit services complete.');
|
||||
this.creditServices.done = true;
|
||||
this.mode++;
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.emit('done');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @param store
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async searchResultsProcessor($, store) {
|
||||
const $table = $('table.table tr');
|
||||
|
||||
if ($table.length > 1)
|
||||
// The table contains more than just the heading row
|
||||
|
||||
store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2)));
|
||||
|
||||
// check that the next button is active
|
||||
|
||||
const nextExists = $('body > div > div.main.main_evol > ul > li:last-child > a');
|
||||
|
||||
if (nextExists.length === 1 )
|
||||
await this._findAndClick('body > div > div.main.main_evol > ul > li:last-child > a', 'Next page >');
|
||||
else {
|
||||
// Done gathering search results
|
||||
logger.info('Completed gathering search results..');
|
||||
store.searchDone = true;
|
||||
store.items = store.links.length;
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/${['pi', 'eu', 'ci'][this.mode]}.json`, store);
|
||||
|
||||
this.gotoPage(store.links[store.step]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle the search result page and uilt the list of links
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleSearchResults() {
|
||||
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if (this.mode === 0 && !this.paymentServices.searchDone)
|
||||
await this.searchResultsProcessor($, this.paymentServices);
|
||||
|
||||
if (this.mode === 1 && !this.emoneyServices.searchDone)
|
||||
await this.searchResultsProcessor($, this.emoneyServices);
|
||||
|
||||
if (this.mode === 2 && !this.creditServices.searchDone)
|
||||
await this.searchResultsProcessor($, this.creditServices);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage(dump = false) {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const search = pageUrl.search;
|
||||
|
||||
const params = this._getParamsFromUrl(search);
|
||||
const pageID = params.page || '';
|
||||
switch (pageID) {
|
||||
|
||||
case 'results':
|
||||
await this.handleSearchResults( );
|
||||
break;
|
||||
case 'af':
|
||||
await this.processAFPage();
|
||||
break;
|
||||
|
||||
default:
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${currentPage.location}`);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
await super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false
|
||||
};
|
||||
|
||||
this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3';
|
||||
this.eMoneyUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=22-TBR07&retrait=0';
|
||||
this.creditUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0';
|
||||
|
||||
this.parsedUrl = url.parse(this.creditUrl);
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/FR/REGAFI`));
|
||||
|
||||
await this._initBrowser(true);
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async __run() {
|
||||
logger.info('Scraping France...');
|
||||
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = FRScrape;
|
773
ncas/gi.js
Normal file
773
ncas/gi.js
Normal file
@ -0,0 +1,773 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
const url = require('url');
|
||||
const logger = require('log4js').getLogger('GI');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class GIScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'GI';
|
||||
|
||||
// treat these elements as block boundaries when scraping permissions
|
||||
this.blockBoundaries = 'div.panel, li';
|
||||
|
||||
// ignore elements matched by these selectors when scraping titles
|
||||
this._ignoreList = 'button, div.modal-body > h3';
|
||||
|
||||
// scrape these top-level permissions headings only
|
||||
this._headingsToScrape = [
|
||||
'Financial Services (Banking) Act',
|
||||
'Financial Services (Investment and Fiduciary Services) Act'
|
||||
];
|
||||
|
||||
// override these values from the base class
|
||||
this.modePrefix = ['ps_', 'em_', 'ci_', 'ag_'];
|
||||
this.modeNames = ['paymentServices', 'emoneyServices', 'creditServices', 'agentServices'];
|
||||
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services', 'Agent'];
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if (l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
async _convertBrToComma(text) {
|
||||
return text.replace(/<br\s*[\/]?>/gi, ', ');
|
||||
}
|
||||
|
||||
async _reduceWhiteSpace(text) {
|
||||
return text.replace(/\s+/g, ' ').trim();
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param selector
|
||||
* @returns {Promise<void>}
|
||||
*
|
||||
* Finds elements in the `html` with the given `selector`, but returns only the uppermost matched elements,
|
||||
* and not those that are nested within other matched elements.
|
||||
*/
|
||||
async getUppermostElementsBySelector(html, selector) {
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
return $(selector).filter(function () {
|
||||
return $(this).parents(selector).length === 0;
|
||||
});
|
||||
}
|
||||
|
||||
async getTextNotInMatchingElements(html, selector) {
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
$(selector)
|
||||
.remove()
|
||||
.end();
|
||||
|
||||
$(this._ignoreList)
|
||||
.remove()
|
||||
.end();
|
||||
|
||||
return $.text();
|
||||
}
|
||||
|
||||
async extractSingleFields($, details) {
|
||||
details.slug = $('meta[name="og:url"]').attr('content').replace('http://www.fsc.gi/regulated-entity/', '');
|
||||
details.name = $('#fvFirmDetails_lblName').text();
|
||||
details.address = await this._convertBrToComma($('#fvFirmDetails_lblAddress').html());
|
||||
details.telephone = $('#fvFirmDetails_lblTel').text();
|
||||
details.fax = $('#fvFirmDetails_lblFax').text();
|
||||
details.email = $('#fvFirmDetails_Label12').text();
|
||||
details.website = $('#fvFirmDetails_lblWebsite').text();
|
||||
details.legalForm = $('#fvFirmDetails_lblLegalForm').text();
|
||||
details.countryOfIncorporation = $('#fvFirmDetails_lblIncorporationCountry').text();
|
||||
details.incorporationNumber = $('#fvFirmDetails_lblRegistrationNo').text();
|
||||
details.incorporationDate = $('#fvFirmDetails_lblDateOfIncorporation').text();
|
||||
}
|
||||
|
||||
async processOtherNameListItem($, elm, names) {
|
||||
const type = $(elm).children('strong').text();
|
||||
let name = $(elm).children('strong').get(0).nextSibling.nodeValue;
|
||||
|
||||
// trim the preceding ' -'
|
||||
if (name.startsWith(' -'))
|
||||
name = name.substr(2);
|
||||
|
||||
name = name.trim();
|
||||
|
||||
names.push({
|
||||
'type': type,
|
||||
'name': name
|
||||
});
|
||||
}
|
||||
|
||||
async extractOtherNames($) {
|
||||
const otherNames = [];
|
||||
|
||||
const otherNamesList = $('h3:contains("Other names")').next();
|
||||
|
||||
$(otherNamesList).find('li').each(
|
||||
(index, element) => {
|
||||
this.processOtherNameListItem($, element, otherNames);
|
||||
}
|
||||
);
|
||||
|
||||
return otherNames;
|
||||
}
|
||||
|
||||
processParentFirm($, elm, firms) {
|
||||
const href = $(elm).find('a').attr('href');
|
||||
const slug = href.replace('/regulated-entity/', '');
|
||||
|
||||
firms.push(slug);
|
||||
}
|
||||
|
||||
extractAgentOf($) {
|
||||
const parentFirms = [];
|
||||
|
||||
const parentFirmsList = $('h3:contains("Agent of")').next();
|
||||
|
||||
$(parentFirmsList).find('li').each(
|
||||
(index, element) => {
|
||||
this.processParentFirm($, element, parentFirms);
|
||||
}
|
||||
);
|
||||
|
||||
return parentFirms;
|
||||
}
|
||||
|
||||
async processAgentLink($, elm, firmAgentList) {
|
||||
const href = $(elm).attr('href');
|
||||
const fullUrl = `https://www.fsc.gi${href}`;
|
||||
const slug = href.replace('/regulated-entity/', '');
|
||||
const name = await this._cleanUp($(elm).text());
|
||||
const id = this._makeFieldName(name);
|
||||
|
||||
// TODO: refactor this out of this function somehow, it's not unit-testable without a mock for agentServices
|
||||
if ('agentServices' in this) // i.e. don't do this if we're running a unit test
|
||||
// Add the href to our list of links to check later (if it's not already added)
|
||||
if (this.agentServices.links.findIndex(x => x.href === fullUrl) === -1)
|
||||
this.agentServices.links.push({
|
||||
'name': name,
|
||||
'href': fullUrl,
|
||||
'id': id
|
||||
});
|
||||
|
||||
firmAgentList.push({
|
||||
'name': name,
|
||||
'slug': slug
|
||||
});
|
||||
}
|
||||
|
||||
async extractAgents(html) {
|
||||
const $ = cheerio.load(html);
|
||||
const agents = [];
|
||||
|
||||
$('li > a').each(
|
||||
(index, element) => {
|
||||
this.processAgentLink($, element, agents);
|
||||
}
|
||||
);
|
||||
|
||||
return agents;
|
||||
}
|
||||
|
||||
async recurseDOM(html, selector, level = 0) {
|
||||
const currentLevel = level + 1;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const result = [];
|
||||
|
||||
const blocks = await this.getUppermostElementsBySelector(html, selector);
|
||||
|
||||
for (let i = 0; i < blocks.length; i++) {
|
||||
const block = blocks[i];
|
||||
|
||||
const rawName = await this.getTextNotInMatchingElements($(block).html(), selector);
|
||||
const name = await this._reduceWhiteSpace(rawName);
|
||||
|
||||
// Only scrape the top level headings we're interested in
|
||||
if (currentLevel === 1 && this._headingsToScrape.indexOf(name) === -1)
|
||||
continue;
|
||||
|
||||
const blockHtml = $(block).html();
|
||||
|
||||
let data;
|
||||
if (name === 'Agents')
|
||||
data = await this.extractAgents(blockHtml);
|
||||
else
|
||||
data = await this.recurseDOM(blockHtml, selector, currentLevel);
|
||||
|
||||
if (data === null)
|
||||
result.push(name);
|
||||
else
|
||||
result.push({
|
||||
'name': name,
|
||||
'data': data
|
||||
});
|
||||
}
|
||||
|
||||
if (result.length > 0)
|
||||
return result;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
async extractPermissions(html) {
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const permissionsContainer = $('h3:contains("Permissions")').next();
|
||||
|
||||
if (permissionsContainer.length === 0)
|
||||
return {};
|
||||
|
||||
const permissions = await this.recurseDOM(permissionsContainer.html(), this.blockBoundaries);
|
||||
|
||||
return permissions;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntityDetails(html) {
|
||||
try {
|
||||
const details = {};
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
await this.extractSingleFields($, details);
|
||||
|
||||
details.otherNames = await this.extractOtherNames($);
|
||||
|
||||
details.permissions = await this.extractPermissions(html);
|
||||
|
||||
details.agentOf = await this.extractAgentOf($);
|
||||
|
||||
return details;
|
||||
}
|
||||
catch (err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetails(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
const { name, id } = serviceObject.links[serviceObject.step];
|
||||
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
// Wait for the paragraph at the bottom to have loaded.
|
||||
await this.page.$x('//a[contains(text(), "* Firms with an asterisk")]');
|
||||
|
||||
// open all accordions before taking screenshot
|
||||
// first, add a class `expand-below` to the container divs we are interested in:
|
||||
for (const heading of this._headingsToScrape) {
|
||||
const expandBelowThisDiv = await this.page.$x(`//h4[contains(., "${heading}")]/../..`);
|
||||
expandBelowThisDiv.forEach(async (elm) => {
|
||||
await this.page.evaluate(el => {
|
||||
const currentClass = el.getAttribute('class');
|
||||
el.setAttribute('class', `${currentClass} expand-below`);
|
||||
}, elm);
|
||||
});
|
||||
}
|
||||
|
||||
// then, add a style tag to the <head> to expand the content
|
||||
await this.page.addStyleTag({
|
||||
'content':
|
||||
`
|
||||
div.expand-below div.collapse {
|
||||
display: block;
|
||||
}
|
||||
|
||||
div.expand-below div.modal {
|
||||
display: block;
|
||||
position: static;
|
||||
opacity: 1;
|
||||
overflow: visible;
|
||||
margin-top: 125px;
|
||||
}
|
||||
|
||||
/* remove drop shadows for faster rendering on large pages */
|
||||
.modal-content {
|
||||
-webkit-box-shadow: none;
|
||||
box-shadow: none;
|
||||
}
|
||||
`
|
||||
});
|
||||
|
||||
// temporarily disable GI screenshots
|
||||
// logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
|
||||
// await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const $ = cheerio.load(body);
|
||||
const underConstruction = $('h3:contains("under construction")').length > 0;
|
||||
|
||||
if (underConstruction) {
|
||||
logger.warn(`Page under construction: ${this.page.url()}`);
|
||||
await jsonfile.writeFile(`${filePath}.json`, { 'underConstruction': true });
|
||||
}
|
||||
else {
|
||||
const details = await this.extractEntityDetails(body);
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details });
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
case 3:
|
||||
await this.processEntityDetails(this.agentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processIndex(serviceObject) {
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const filename = this.modeNames[this.mode];
|
||||
// temporarily disable GI screenshots
|
||||
// logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
|
||||
// await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
|
||||
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
let ul;
|
||||
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
ul = $('h3:contains("Authorised Payment Institutions")');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
ul = $('h3:contains("E-money Institutions")');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
ul = $('h3:contains("Banks")');
|
||||
break;
|
||||
|
||||
case 3:
|
||||
ul = $('h3:contains("Electronic Money and Payment Institution Agents")');
|
||||
|
||||
}
|
||||
|
||||
const links = ul.next().find('li > a');
|
||||
|
||||
links.each((i, item) => {
|
||||
const href = $(item).attr('href');
|
||||
|
||||
const text = this._cleanUp($(item).text());
|
||||
|
||||
const newUrl = `https://www.fsc.gi${href}`;
|
||||
const id = this._makeFieldName(text);
|
||||
|
||||
if (serviceObject.links.findIndex(x => x.href === newUrl) === -1)
|
||||
serviceObject.links.push({ 'name': text, 'href': newUrl, 'id': id });
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
await this._randomWait(this.page, 6, 9);
|
||||
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
|
||||
|
||||
await this.processIndex(serviceObject);
|
||||
|
||||
if (serviceObject.indexStep < serviceObject.urls.length - 1) {
|
||||
serviceObject.indexStep++;
|
||||
const newUrl = serviceObject.urls[serviceObject.indexStep];
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
logger.debug('>> indexRedirector');
|
||||
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
case 3:
|
||||
await this.buildIndex(this.agentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
if (
|
||||
pageUrl.pathname.includes('payment-institutions-20') ||
|
||||
pageUrl.pathname.includes('e-money-institutions-17') ||
|
||||
pageUrl.pathname.includes('banks-1') ||
|
||||
pageUrl.pathname.includes('electronic-money-and-payment-institution-agents-26')
|
||||
)
|
||||
await this.indexRedirector();
|
||||
else if (pageUrl.pathname.includes('regulated-entity'))
|
||||
await this.processRedirector();
|
||||
else if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl.href}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl.href);
|
||||
}
|
||||
}
|
||||
catch (err) {
|
||||
if (err.name === 'TimeoutError') {
|
||||
logger.error(`Reloading page after timeout: ${err.name}: ${err.message}`);
|
||||
this.page.reload();
|
||||
}
|
||||
else
|
||||
throw(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
case 3:
|
||||
this.emit('agentServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
|
||||
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
|
||||
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
|
||||
const newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('agindexdone', async () => {
|
||||
this.agentServices.items = this.agentServices.links.length;
|
||||
logger.info(`${this.agentServices.items} items indexed`);
|
||||
|
||||
const newUrl = this.agentServices.links[this.agentServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('indexdone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
case 3:
|
||||
this.emit('agindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
logger.warn('paymentServicesDone');
|
||||
try {
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
logger.warn('emoneyServicesDone');
|
||||
try {
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
logger.warn('creditServicesDone');
|
||||
try {
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
|
||||
await this._goto(this.agentServices.urls[0]);
|
||||
});
|
||||
|
||||
this.on('agentServicesDone', async () => {
|
||||
logger.warn('agentServicesDone');
|
||||
try {
|
||||
this.agentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/agentServices.json`, { 'links': this.agentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/agentServices.json`, this.agentServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'paginationStep': 0,
|
||||
'visited': false,
|
||||
'done': false,
|
||||
'urls': ['https://www.fsc.gi/regulated-entities/payment-institutions-20'],
|
||||
'sections': [],
|
||||
'sectionLinks': []
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'paginationStep': 0,
|
||||
'visited': false,
|
||||
'done': false,
|
||||
'urls': ['https://www.fsc.gi/regulated-entities/e-money-institutions-17'],
|
||||
'sections': [],
|
||||
'sectionLinks': []
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'paginationStep': 0,
|
||||
'visited': false,
|
||||
'done': false,
|
||||
'urls': ['https://www.fsc.gi/regulated-entities/banks-1'],
|
||||
'sections': [],
|
||||
'sectionLinks': []
|
||||
};
|
||||
|
||||
this.agentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'done': false,
|
||||
'urls': ['https://www.fsc.gi/regulated-entities/electronic-money-and-payment-institution-agents-26']
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname}/../artefacts/GI/FSC`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch (e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = GIScrape;
|
79
ncas/gr.js
Normal file
79
ncas/gr.js
Normal file
@ -0,0 +1,79 @@
|
||||
const logger = require('log4js').getLogger('GR');
|
||||
const path = require('path');
|
||||
const url = require('url');
|
||||
|
||||
const Scraper = require('../helpers/scraper');
|
||||
|
||||
class GRScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'GR';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
async start() {
|
||||
super._start();
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/GR/BG`));
|
||||
|
||||
this.startPage = 'https://www.bankofgreece.gr/Pages/en/Supervision/SupervisedInstitutions/default.aspx';
|
||||
|
||||
await this._doNonRepudiation(false, { 'sslWithPrefix': true }).catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
this._makeScreenshotV2(this.page, `${this.path}/index`);
|
||||
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
|
||||
logger.info('Saving excels into:', this.path);
|
||||
|
||||
for (const linkText of [
|
||||
'List of credit institutions operating in Greece',
|
||||
'List of credit institutions authorised in Greece with operations abroad through a subsidiary or a branch',
|
||||
'List/register of payment institutions',
|
||||
'List/register of electronic money institutions'
|
||||
]) {
|
||||
const links = await this.page.$x(`//a[contains(text(), \'${linkText}\')]`);
|
||||
const linkElement = links[0];
|
||||
const href = await this.page.evaluate(
|
||||
link => link.getAttribute('href'),
|
||||
linkElement,
|
||||
);
|
||||
const xlsUrl = url.resolve(await this.page.url(), href);
|
||||
await this._goto(xlsUrl, { 'waitUntil':'networkidle0' });
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
|
||||
// wait until all downloads finished (currently this is only possible with 'page.goto', so we go back to the start page.
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = GRScrape;
|
186
ncas/ie.js
Normal file
186
ncas/ie.js
Normal file
@ -0,0 +1,186 @@
|
||||
/**
|
||||
*
|
||||
* User: Martin Donnelly
|
||||
* Date: 2018-09-13
|
||||
* Time: 12:23
|
||||
*
|
||||
*/
|
||||
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const logger = require('log4js').getLogger('IE');
|
||||
|
||||
class IEScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.setID('IE');
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._debounce(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
await super._start();
|
||||
try{
|
||||
this.startPage = 'http://registers.centralbank.ie/Home.aspx';
|
||||
const mouseDownDuration = IEScrape.notARobot();
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/IE/CBI`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser(true);
|
||||
await this._createBrowserPage();
|
||||
|
||||
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
await this.page.waitForSelector('#ctl00_cphRegistersMasterPage_lblViewList');
|
||||
await this.page.click('#ctl00_cphRegistersMasterPage_lblViewList > a', { 'delay':mouseDownDuration });
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* locate the download section associated with the searchText
|
||||
* @param downloadsection
|
||||
* @param searchText
|
||||
* @returns {Promise<*>}
|
||||
*/
|
||||
async findDownloadSection(downloadsection, searchText) {
|
||||
let wantedId;
|
||||
try{
|
||||
await this.page.waitFor(downloadsection);
|
||||
|
||||
const body = await this.page.evaluate(() => document.documentElement.outerHTML);
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
$(`${downloadsection} span`).each((i, el) => {
|
||||
if ($(el).text() === searchText)
|
||||
wantedId = $(el).attr('id');
|
||||
|
||||
return wantedId;
|
||||
});
|
||||
|
||||
return wantedId;
|
||||
}
|
||||
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Expand the relevant section
|
||||
* @param elmId
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async expandArea(elmId) {
|
||||
await this.page.click(`span#${elmId}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the Download Links via section ID
|
||||
* @param elmId
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async findDownloadsLinksID(elmId) {
|
||||
return await this.page.$eval(`span#${elmId}`, e => e.parentElement.nextElementSibling.getAttribute('id'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Process the download links and grab the pdf files
|
||||
* @param id
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processDownloadLinks(id) {
|
||||
try {
|
||||
// Each link is duplicated in a P and an Image. We just use the one in the P tag.
|
||||
const clickableLinks = await this.page.$$(`[id="${id}"] p a`);
|
||||
const mouseDownDuration = IEScrape.notARobot();
|
||||
|
||||
for (const item of clickableLinks) {
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
|
||||
await item.click({ 'delay':mouseDownDuration }).catch((err) => {
|
||||
this._uploadError();
|
||||
});
|
||||
await this._randomWait(this.page, 5, 10);
|
||||
}
|
||||
}
|
||||
catch(e) {
|
||||
await this._uploadError();
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async grabSection(dlSectionElm, sectionTitle) {
|
||||
try {
|
||||
const section = await this.findDownloadSection(dlSectionElm, sectionTitle);
|
||||
|
||||
await this.expandArea(section);
|
||||
|
||||
this._makeScreenshotV2(this.page, `${ this.path}/${sectionTitle}`, null);
|
||||
|
||||
const sectionID = await this.findDownloadsLinksID(section);
|
||||
|
||||
await this.processDownloadLinks(sectionID);
|
||||
|
||||
await this._randomWait(this.page, 5, 10);
|
||||
}
|
||||
catch(e) {
|
||||
await this._uploadError();
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Grab the Pdf's and screenshots
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async __run() {
|
||||
try {
|
||||
await this.start();
|
||||
|
||||
await this._randomWait(this.page, 5, 10);
|
||||
await this._makeScreenshotV2(this.page, `${ this.path}/Central Bank of Ireland Registers`, null);
|
||||
|
||||
const sections = ['Registers of Payment Services Firms', 'Registers of E-Money Firms', 'Register of Credit Institutions'];
|
||||
|
||||
for (const section of sections)
|
||||
await this.grabSection('#ctl00_cphRegistersMasterPage_downloadsSection', section);
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = IEScrape;
|
889
ncas/it.js
Normal file
889
ncas/it.js
Normal file
@ -0,0 +1,889 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('IT');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class ITscrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.setID('IT');
|
||||
|
||||
this.on('done', () => {
|
||||
// this._saveLocalStorage(this.page, `${this.path}/${this.id}_localstorage.json`);
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._debounce(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async forceScrollToTop() {
|
||||
// Force the scroll
|
||||
|
||||
await this.page.evaluate(() => {
|
||||
window.scrollBy(0, window.innerHeight);
|
||||
});
|
||||
|
||||
// Force the hover
|
||||
|
||||
await this.page.hover('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a').catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
// Force the focus
|
||||
|
||||
await this.page.focus('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async forceEnglish() {
|
||||
await this._randomWait(this.page, 2, 2, 'Force English');
|
||||
|
||||
await this.page.waitForSelector('#bs-example-navbar-collapse-1 > ul > li.dropdown > a', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
await this._randomWait(this.page, 2, 2);
|
||||
}).catch(() => {
|
||||
logger.debug('No Language button');
|
||||
});
|
||||
|
||||
await this._findAndClick('#bs-example-navbar-collapse-1 > ul > li.dropdown.open > ul > li:nth-child(2) > a');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleFrontPage() {
|
||||
let pageReturned = false;
|
||||
await this._randomWait(this.page, 3, 5, 'handleFrontPage');
|
||||
|
||||
await this.page.waitFor('ul.linkgroup a', { 'visible':true }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
}).catch(async (err) => {
|
||||
logger.info('handleFrontPage: ul.linkgroup a Not found', err);
|
||||
});
|
||||
|
||||
do{
|
||||
await this.page.waitFor('#my-container > div.container > div', { 'visible':true, 'timeout':7500 }).then(() => {
|
||||
pageReturned = true;
|
||||
}).catch(async () => {
|
||||
logger.info('We didnt transition back correctly, forcing another click..\n');
|
||||
});
|
||||
|
||||
if (!pageReturned) {
|
||||
await this.page.hover('ul.linkgroup a').catch((err) => {
|
||||
logger.debug(err.name);
|
||||
});
|
||||
|
||||
await this.page.focus('ul.linkgroup a').catch((err) => {
|
||||
logger.debug(err.name);
|
||||
});
|
||||
|
||||
await this.page.waitFor('ul.linkgroup a', { 'visible':true }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
}).catch(async (err) => {
|
||||
logger.info('handleFrontPage: ul.linkgroup a still not found', err.name);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
while(!pageReturned);
|
||||
|
||||
// Supervisory registers and lists
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleSecondPage() {
|
||||
try{
|
||||
// sometimes this page takes a while to load...
|
||||
const url = await this.page.evaluate('location.href');
|
||||
|
||||
await this._randomWait(this.page, 10, 13, 'handleSecondPage');
|
||||
|
||||
await this.page.waitForSelector('div.loading', { 'visible':false, 'timeout':90000 }).catch((e) => {
|
||||
logger.warn('Ajax loading shroud not removed after 90 seconds');
|
||||
});
|
||||
|
||||
await this.page.waitForSelector('ul.nav.navbar-nav.navbar-center li a', { 'visible':false, 'timeout':90000 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
await this._randomWait(this.page, 5, 8, 'await transition');
|
||||
}).catch((e) => {
|
||||
logger.warn('Page Navigation navigation links failed to load / display');
|
||||
});
|
||||
|
||||
// await this._findAndClick('ul.nav.navbar-nav.navbar-center li a', null, 'https://infostat.bancaditalia.it/GIAVAInquiry-public/ng/int-albi');
|
||||
|
||||
const newUrl = await this.page.evaluate('location.href');
|
||||
|
||||
if (url !== newUrl) {
|
||||
logger.debug('The page Has changed!');
|
||||
this.emit('pageChanged');
|
||||
}
|
||||
}
|
||||
catch( err) {
|
||||
logger.error('Failed to progress past second page', err);
|
||||
this.emit('recover');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractPSRegistry(html) {
|
||||
try{
|
||||
const registry = {};
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('app-details-anagrafica > div.row');
|
||||
rows.each((index, item) => {
|
||||
const divs = $(item).find('div');
|
||||
|
||||
if ($(item).children().length === 2) {
|
||||
const name = this._cleanUp(divs.eq(0).text()) ;
|
||||
registry[name] = this._cleanUp(divs.eq(1).text());
|
||||
}
|
||||
});
|
||||
|
||||
return registry;
|
||||
}
|
||||
catch (err) {
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error('extractPSRegistry\n', err);
|
||||
}
|
||||
else
|
||||
logger.error('extractPSRegistry\n', err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractPSRegisters(html) {
|
||||
try {
|
||||
const registers = [];
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('app-details-albi div.ag-bl-center.ag-bl-full-height-center > div > div.ag-body > div.ag-body-viewport-wrapper > div > div div[role="row"]');
|
||||
|
||||
logger.info(`${rows.length} registers item${(rows.length !== 1) ? 's' : ''}`);
|
||||
rows.each((index, item) => {
|
||||
const divs = $(item).find('div');
|
||||
const obj = {};
|
||||
|
||||
for (let counter = 0; counter < divs.length;counter++) {
|
||||
const name = this._cleanUp(divs.eq(counter).attr('col-id'));
|
||||
obj[name] = this._cleanUp(divs.eq(counter).text());
|
||||
}
|
||||
registers.push(obj);
|
||||
});
|
||||
|
||||
return registers;
|
||||
}
|
||||
catch (err) {
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error('extractPSRegisters\n', err);
|
||||
}
|
||||
else
|
||||
logger.error('extractPSRegisters\n', err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractPSAuthority(html) {
|
||||
try{
|
||||
const authority = [];
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('app-details-att-autorizzate div.ag-bl-center.ag-bl-full-height-center > div > div.ag-body > div.ag-body-viewport-wrapper > div > div div[role="row"]');
|
||||
|
||||
logger.info(`${rows.length} authority item${(rows.length !== 1) ? 's' : ''}`);
|
||||
rows.each((index, item) => {
|
||||
const divs = $(item).find('div');
|
||||
const obj = {};
|
||||
|
||||
for (let counter = 0; counter < divs.length;counter++) {
|
||||
const name = this._cleanUp(divs.eq(counter).attr('col-id'));
|
||||
obj[name] = this._cleanUp(divs.eq(counter).text());
|
||||
}
|
||||
authority.push(obj);
|
||||
});
|
||||
|
||||
return authority;
|
||||
}
|
||||
catch (err) {
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error('extractPSAuthority\n', err);
|
||||
}
|
||||
else
|
||||
logger.error('extractPSAuthority\n', err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async preparePSSearch() {
|
||||
try{
|
||||
await this._randomWait(this.page, 3, 5, `preparePSSearch - ${this.modeTitles[this.mode]}`);
|
||||
|
||||
// Brute force the selector
|
||||
|
||||
await this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-search > div > div:nth-child(3) > div > input');
|
||||
|
||||
await this.page.waitForFunction(
|
||||
'document.querySelector("#alboElenco").options.length > 1'
|
||||
, { 'timeout':7500 }).then(() => {
|
||||
logger.debug('Ajax done');
|
||||
}).catch(() => {
|
||||
throw new Error('Ajax not done');
|
||||
});
|
||||
|
||||
const options = await this.page.$$('#alboElenco option');
|
||||
const optionList = ['ALBO IP ART.114-SEPTIES TUB ', 'ALBO IMEL ITA EX 114-QUATER ', 'ALBO DELLE BANCHE '];
|
||||
|
||||
const wantedOption = [optionList[this.mode]];
|
||||
|
||||
for (const item of options) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
if (wantedOption.indexOf(text) !== -1) {
|
||||
await this.page.select('#alboElenco', value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// wait for loading shroud to go away
|
||||
await this.page.waitForSelector('div.loading', { 'visible':false, 'timeout':25000 });
|
||||
|
||||
let btnSuccess = false;
|
||||
|
||||
do {
|
||||
await this.page.waitForSelector('button.btn.btn-success', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
}).catch(() => {
|
||||
btnSuccess = true;
|
||||
});
|
||||
await this._randomWait(this.page, 1, 1, 'preparePSSearch btnSuccess');
|
||||
}
|
||||
|
||||
while(!btnSuccess);
|
||||
|
||||
this.page.waitFor('app-int-albi-grid-result').then(async () => {
|
||||
//
|
||||
await this.forceEnglish();
|
||||
await this.emit('processAgTable');
|
||||
}).catch(async (err) => {
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error('No results transition\n', err);
|
||||
}
|
||||
else
|
||||
logger.error('No results transition\n', err);
|
||||
});
|
||||
}
|
||||
catch (err) {
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error('preparePSSearch\n', err);
|
||||
}
|
||||
else
|
||||
logger.error('preparePSSearch\n', err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<{registry, authority, registers}>}
|
||||
*/
|
||||
async processPSDetail() {
|
||||
let registry = {}, registers = {}, authority = {};
|
||||
|
||||
await this._randomWait(this.page, 3, 3, 'processPSDetail: AJAX');
|
||||
|
||||
// await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div.card.card-title > span > span', { 'visible': true }).catch((err) => {
|
||||
logger.warn('AJAX data has failed to load');
|
||||
logger.debug(err);
|
||||
|
||||
return { registry, registers, authority };
|
||||
});
|
||||
|
||||
await this.page.waitFor('app-int-albi-details').then(async () => {
|
||||
await this.forceScrollToTop();
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
registry = await this.extractPSRegistry(body);
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'processPSDetail app-int-albi-details');
|
||||
}).catch(async (err) => {
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error('processPSDetail\n', err);
|
||||
}
|
||||
else
|
||||
logger.error('processPSDetail\n', err);
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 1, 1, 'processPSDetail after app-int-albi-details');
|
||||
|
||||
//
|
||||
|
||||
await this.forceScrollToTop();
|
||||
|
||||
// wait for Registers Tab
|
||||
await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul > li:nth-child(2) > a', { 'visible': true, 'timeout':10000 }).then(async (elm) => {
|
||||
logger.debug('** Showing Registers Tab');
|
||||
await elm.click({ 'delay':90 });
|
||||
|
||||
await this.page.waitFor('app-details-albi', { 'visible': true, 'timeout':10000 }).then(async () => {
|
||||
const body = await this.page.content();
|
||||
|
||||
registers = await this.extractPSRegisters(body);
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-albi');
|
||||
}).catch(async (err) => {
|
||||
if (process.env.NODE_ENV)
|
||||
// await this._uploadError();
|
||||
throw new Error('No tab transition\n', err);
|
||||
|
||||
else
|
||||
logger.error('No tab transition');
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 1, 1, 'processPSDetail after app-details-albi');
|
||||
}).catch((err) => {
|
||||
logger.warn('No "registers" Block...');
|
||||
logger.debug(err);
|
||||
});
|
||||
|
||||
// wait for Activity Tab
|
||||
await this.forceScrollToTop();
|
||||
await this.page.waitFor('#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul > li:nth-child(3) > a', { 'visible': true, 'timeout':10000 }).then(async (elm) => {
|
||||
logger.debug('** Showing Activity Tab');
|
||||
await elm.click({ 'delay':90 });
|
||||
let pageReturned = false;
|
||||
|
||||
do
|
||||
await this.page.waitFor('app-details-att-autorizzate', { 'visible': true, 'timeout':10000 }).then(async () => {
|
||||
pageReturned = true;
|
||||
const body = await this.page.content();
|
||||
|
||||
authority = await this.extractPSAuthority(body);
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'processPSDetail app-details-att-autorizzate');
|
||||
}).catch(async (err) => {
|
||||
await this.forceScrollToTop();
|
||||
await this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-details > div > div:nth-child(2) > ul > li:nth-child(3) > a');
|
||||
|
||||
if (process.env.NODE_ENV)
|
||||
throw new Error('No tab transition\n', err);
|
||||
|
||||
else
|
||||
logger.warn('No tab transition');
|
||||
});
|
||||
while(!pageReturned);
|
||||
}).catch((err) => {
|
||||
logger.warn('No "Activity" Block...');
|
||||
logger.debug(err);
|
||||
});
|
||||
|
||||
return { registry, registers, authority };
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async returnToPSList() {
|
||||
try{
|
||||
let pageReturned = false;
|
||||
await this.page.hover('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a').catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
await this._findAndClick('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a');
|
||||
|
||||
do
|
||||
await this.page.waitFor('app-int-albi-grid-result').then(() => {
|
||||
pageReturned = true;
|
||||
}).catch(async (err) => {
|
||||
logger.warn('We didnt transition back correctly, forcing another click..\n', err);
|
||||
|
||||
await this.forceScrollToTop();
|
||||
|
||||
await this._findAndClick('#sub-navbar > giava-breadcrumb > ol > li:nth-child(3) > a');
|
||||
});
|
||||
|
||||
while(!pageReturned);
|
||||
}
|
||||
catch (err) {
|
||||
logger.error('returnToPSList\n', err);
|
||||
this.emit('recover');
|
||||
if (process.env.NODE_ENV)
|
||||
await this._uploadError();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<number>}
|
||||
*/
|
||||
async psGetMaxRows() {
|
||||
const regExNumbersOnly = /\d{1,13}(?:,\d{0,2})?/g;
|
||||
|
||||
const elm = await this.page.$$('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(1) > p');
|
||||
|
||||
const text = await this.page.evaluate(el => el.innerText, elm[0]);
|
||||
|
||||
const numbers = regExNumbersOnly.exec(text);
|
||||
|
||||
return (numbers !== null) ? parseInt(numbers[0], 10) : -1;
|
||||
}
|
||||
|
||||
async processDivs($, divs) {
|
||||
const entries = {};
|
||||
|
||||
divs.each((index, item) => {
|
||||
const itemText = this._cleanUp($(item).text());
|
||||
const itemName = $(item).attr('col-id');
|
||||
// logger.info(`>> ${index}`, itemName, itemText);
|
||||
entries[itemName] = itemText;
|
||||
});
|
||||
|
||||
return entries;
|
||||
}
|
||||
|
||||
async psSetListCount(count) {
|
||||
logger.debug('+ psSetListCount ');
|
||||
await this.page.focus('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(7) > div > input');
|
||||
|
||||
for(let del = 0;del < 5;del++)
|
||||
await this.page.keyboard.press('Backspace');
|
||||
|
||||
await this.page.keyboard.type(count.toString(), { 'delay': 100 }); // Types slower, like a user
|
||||
|
||||
await this.page.keyboard.press('Enter');
|
||||
|
||||
await this._randomWait(this.page, 10, 10, 'ajax refresh');
|
||||
logger.debug('- psSetListCount ');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processAGTableV3(serviceObject) {
|
||||
// this whole thing is ugly but at the moment it works
|
||||
|
||||
await this._randomWait(this.page, 3, 5, 'processAGTableV3');
|
||||
|
||||
const _defaultMaxPerPage = 10;
|
||||
let workingData;
|
||||
let elmStep;
|
||||
let item;
|
||||
let maxPages = 0;
|
||||
let rowsInPass;
|
||||
|
||||
await this.psSetListCount(_defaultMaxPerPage);
|
||||
const maxRows = await this.psGetMaxRows();
|
||||
let remainingRows = maxRows;
|
||||
|
||||
logger.info('Max Rows', maxRows);
|
||||
|
||||
if (maxRows > _defaultMaxPerPage) {
|
||||
maxPages = ~~(maxRows / _defaultMaxPerPage);
|
||||
|
||||
logger.info('Max pages:', maxPages);
|
||||
}
|
||||
|
||||
for(let pageStep = 0; pageStep <= maxPages; pageStep++) {
|
||||
logger.info('Pagestep', pageStep, (pageStep + 1) * _defaultMaxPerPage);
|
||||
|
||||
if (maxPages > 0)
|
||||
if ((maxRows - ((pageStep ) * _defaultMaxPerPage)) > _defaultMaxPerPage)
|
||||
rowsInPass = _defaultMaxPerPage;
|
||||
else
|
||||
|
||||
rowsInPass = (maxRows - ((pageStep ) * _defaultMaxPerPage));
|
||||
else
|
||||
rowsInPass = maxRows;
|
||||
|
||||
logger.info(`Rows in this pass : ${rowsInPass}`);
|
||||
|
||||
for (let step = 0; step < rowsInPass; step++) {
|
||||
for ( elmStep = 0; elmStep <= step; elmStep++) {
|
||||
workingData = await this.page.$$(`div.ag-body-container div.ag-row[row-id="${elmStep}"]`);
|
||||
item = workingData[0];
|
||||
|
||||
if (typeof item !== 'undefined')
|
||||
await item.hover().catch((err) => {
|
||||
logger.warn(err);
|
||||
logger.info(item);
|
||||
});
|
||||
|
||||
await this._microWait(this.page, 1);
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'processAGTableV3 after rows');
|
||||
|
||||
if (typeof item !== 'undefined') {
|
||||
const html = await this.page.evaluate(el => el.innerHTML, item);
|
||||
const clickable = await item.$('div[col-id="name"]');
|
||||
const abiCodeElm = await item.$('div[col-id="abiCode"]');
|
||||
const uid = await this.page.evaluate(el => el.innerText, abiCodeElm);
|
||||
const clickName = await this.page.evaluate(el => el.innerText, clickable);
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const divs = $('div');
|
||||
|
||||
logger.info(`Processing : ${clickName}, ${remainingRows} remain.`);
|
||||
|
||||
if (!serviceObject.workingData.has(uid)) {
|
||||
// Exract all the data from the cells
|
||||
|
||||
const newEntry = await this.processDivs($, divs);
|
||||
|
||||
// Insert it in the map
|
||||
serviceObject.workingData.set(uid, newEntry);
|
||||
|
||||
await this._randomWait(this.page, 2, 2, `Processing : ${clickName}`);
|
||||
|
||||
const filePath = await this._makeFilePath(clickName);
|
||||
const fileName = this._makeFileName(clickName);
|
||||
await this._randomWait(this.page, 2, 2, 'processAGTableV3 before ss');
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
serviceObject.links.push({ uid, 'fileName':`${fileName}.json`, 'name':clickName });
|
||||
|
||||
// Go into the detail
|
||||
await clickable.click();
|
||||
|
||||
await this._randomWait(this.page, 3, 4, 'processAGTableV3 before next');
|
||||
|
||||
remainingRows--;
|
||||
|
||||
await this.page.waitFor('app-int-albi-details').then(
|
||||
|
||||
await this.doAlbiDetails(filePath, newEntry)
|
||||
|
||||
).catch(async (err) => {
|
||||
logger.error('No detail transition', err);
|
||||
this.emit('recover');
|
||||
|
||||
if (process.env.NODE_ENV)
|
||||
await this._uploadError();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (maxPages > 0) {
|
||||
logger.info('Clicking to the next page...');
|
||||
|
||||
const nextButton = await this.page.$$('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(5) > button');
|
||||
const buttonDisabled = await this.page.evaluate(el => el.disabled, nextButton[0]);
|
||||
if (!buttonDisabled) {
|
||||
this._findAndClick('#sub-navbar > app-int-albi > app-int-albi-grid-result > grid-pagination > div > div > div:nth-child(5) > button');
|
||||
await this._randomWait(this.page, 5, 5, 'processAGTableV3 next page click');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug('processAGTableV3 DONE');
|
||||
|
||||
this.emit('doneProcessingGrid');
|
||||
}
|
||||
|
||||
async doAlbiDetails(filePath, newEntry) {
|
||||
try{
|
||||
// process the page
|
||||
const data = await this.processPSDetail();
|
||||
data.details = newEntry;
|
||||
|
||||
logger.info(`Saving ${filePath}.json`);
|
||||
await jsonfile.writeFile(`${filePath}.json`, data);
|
||||
|
||||
await this._randomWait(this.page, 5, 7, 'doAlbiDetails');
|
||||
|
||||
// Retun back to list
|
||||
|
||||
await this.returnToPSList();
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'doAlbiDetails after returnToPSList');
|
||||
// wArray.push([uid, clickName]);
|
||||
}
|
||||
catch (err) {
|
||||
logger.error('doAlbiDetails\n', err);
|
||||
this.emit('recover');
|
||||
|
||||
if (process.env.NODE_ENV)
|
||||
await this._uploadError();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5, 'processNewPage');
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
switch (pageUrl.pathname) {
|
||||
|
||||
case '/compiti/vigilanza/albi-elenchi/index.html':
|
||||
await this.handleFrontPage();
|
||||
break;
|
||||
|
||||
case '/GIAVAInquiry-public/ng/':
|
||||
await this.handleSecondPage();
|
||||
break;
|
||||
|
||||
case '/GIAVAInquiry-public/ng/int-albi/search':
|
||||
await this.preparePSSearch();
|
||||
break;
|
||||
case '/en/our-registers/company-register/gransoverskridandehandel/':
|
||||
await this.crossBorderRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
// Need thiss for Angular based sites
|
||||
|
||||
// clear out stock recover handler
|
||||
|
||||
this.removeAllListeners('recover');
|
||||
|
||||
this.on('pageChanged', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
this.on('recover', this._debounce(async () => {
|
||||
clearTimeout(this.backOffTimer);
|
||||
|
||||
logger.warn('Backing off for 5 minutes..');
|
||||
const timeout = (60 * 1000) * 5;
|
||||
|
||||
this.backOffTimer = setTimeout(() => {
|
||||
this.emit('restart');
|
||||
// this.recover();
|
||||
}, timeout);
|
||||
}, 30000));
|
||||
|
||||
this.on('restart', this._debounce(async() => {
|
||||
clearTimeout(this.backOffTimer);
|
||||
|
||||
logger.warn('Restarting::');
|
||||
|
||||
// await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
// use the Scraper recovery now to ensure crashed browser is resurrected
|
||||
await this.__recover(this.startPage);
|
||||
}, 15000));
|
||||
|
||||
this.on('processAgTable', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
await this.processAGTableV3(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processAGTableV3(this.creditServices);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
await this.processAGTableV3(this.paymentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('doneProcessingGrid', async () => {
|
||||
let curObj;
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
curObj = this.emoneyServices;
|
||||
break;
|
||||
|
||||
case 2:
|
||||
curObj = this.creditServices;
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
curObj = this.paymentServices;
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
curObj.done = true;
|
||||
curObj.items = curObj.links.length;
|
||||
|
||||
jsonfile.writeFileSync(`${this.path}/${this.modeNames[this.mode]}.json`, { 'links':curObj.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/${this.modeNames[this.mode]}.json`, curObj);
|
||||
|
||||
this.mode++;
|
||||
|
||||
if (this.mode < 3) {
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5, 'doneProcessingGrid');
|
||||
}
|
||||
else
|
||||
this.emit('done');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1'],
|
||||
'workingData': new Map([]),
|
||||
'workingIndex': 0
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1'],
|
||||
'workingData': new Map([]),
|
||||
'workingIndex': 0
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.bancaditalia.it/compiti/vigilanza/albi-elenchi/index.html?com.dotmarketing.htmlpage.language=1'],
|
||||
'workingData': new Map([]),
|
||||
'workingIndex': 0
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = '';
|
||||
this.credit = '';
|
||||
|
||||
this.backOffTimer = 0;
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/IT/FSA`));
|
||||
|
||||
await this._doNonRepudiation(false, { 'sslWithPrefix':true }).catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser(true);
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5, 'After start');
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = ITscrape;
|
666
ncas/lt.js
Normal file
666
ncas/lt.js
Normal file
@ -0,0 +1,666 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('LT');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class LTScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'LT';
|
||||
|
||||
this.addToBlockFilters(['smartlook.com', 'd10lpsik1i8c69', 'mouseflow.com', 'inspectlet.com']);
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param path
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
|
||||
async extractEntityIntermediaries(html, path = 'item-contra-intermediaries') {
|
||||
try{
|
||||
const newObj = { } ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $(`#${path} li div.row`);
|
||||
|
||||
rows.each((i, li) => {
|
||||
const children = $(li).children();
|
||||
|
||||
if ($(children).length === 2) {
|
||||
const label = this._makeFieldName($(children).eq(0).text());
|
||||
|
||||
if (!newObj.hasOwnProperty(label))
|
||||
newObj[label] = [];
|
||||
|
||||
newObj[label].push(this._cleanUp($(children).eq(1).text()));
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
|
||||
async extractEntityList(html) {
|
||||
try{
|
||||
const newArray = [] ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('#item-lists li');
|
||||
|
||||
rows.each((i, li) => {
|
||||
const children = $(li).children();
|
||||
|
||||
if ($(children).length === 1)
|
||||
newArray.push(this._cleanUp($(children).eq(0).text()));
|
||||
});
|
||||
|
||||
return newArray;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
|
||||
async extractEntityActivity(html) {
|
||||
try{
|
||||
const newArray = [] ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('#item-activities tbody tr');
|
||||
|
||||
rows.each((i, li) => {
|
||||
const children = $(li).children();
|
||||
|
||||
if ($(children).length === 3) {
|
||||
const activity = this._cleanUp($(children).eq(0).text());
|
||||
const from = this._cleanUp($(children).eq(1).text());
|
||||
const to = this._cleanUp($(children).eq(2).text());
|
||||
|
||||
newArray.push({ activity, from, to });
|
||||
}
|
||||
});
|
||||
|
||||
return newArray;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
|
||||
async extractEntityFOSContent(html) {
|
||||
try{
|
||||
const newObj = {} ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('#fos-content div.panel-heading');
|
||||
|
||||
rows.each((i, row) => {
|
||||
const label = this._makeFieldName($(row).find('span.l').text());
|
||||
|
||||
if (!newObj.hasOwnProperty(label))
|
||||
newObj[label] = [];
|
||||
|
||||
const sibling = $(row).next();
|
||||
|
||||
const tr = $(sibling).find('tbody tr');
|
||||
|
||||
tr.each((y, item) => {
|
||||
const children = $(item).children();
|
||||
if ($(children).length === 3) {
|
||||
const activity = this._cleanUp($(children).eq(0).text());
|
||||
const from = this._cleanUp($(children).eq(1).text());
|
||||
const to = this._cleanUp($(children).eq(2).text());
|
||||
|
||||
newObj[label].push({ activity, from, to });
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntityDetails(html) {
|
||||
const spliterRX = /(.+)(?::\s+)(.+)/;
|
||||
try{
|
||||
const newObj = { } ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const items = $('div.frd-props.text.row p');
|
||||
|
||||
items.each((i, elm) => {
|
||||
const children = cheerio(elm).children();
|
||||
if (children.length > 0) {
|
||||
const propType = $(children.eq(0)).prop('name');
|
||||
|
||||
if (propType !== 'a') {
|
||||
const ws = $(elm).text().match(spliterRX);
|
||||
|
||||
const label = this._makeFieldName(ws[1]);
|
||||
newObj[label] = this._cleanUp(ws[2]);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
async preBuildIndex(serviceObject) {
|
||||
await this.page.waitForSelector('#cookies_msg > div > a', { 'timeout':7500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':90 });
|
||||
}).catch(() => {
|
||||
logger.info('No cookie band...');
|
||||
});
|
||||
|
||||
// Ensure that the max number f items is shown
|
||||
|
||||
await this.page.waitForSelector('#content > div > div:nth-child(4) > div.totals > form > span > button:nth-child(3)', { 'visible': true, 'timeout':7500 }).then(async (elm) => {
|
||||
const cls = await this.page.evaluate(el => el.getAttribute('class'), elm);
|
||||
|
||||
logger.debug('button class', cls);
|
||||
|
||||
if (cls === null)
|
||||
await elm.click({ 'delay':90 });
|
||||
else
|
||||
await this.buildIndex(serviceObject);
|
||||
});
|
||||
}
|
||||
|
||||
async expandAreas() {
|
||||
const divs = ['item-activities', 'item-contra-intermediaries', 'item-intermediaries', 'item-lists', 'foe-countries'];
|
||||
|
||||
// #content > div > div:nth-child(4) > div > a:nth-child(2)
|
||||
|
||||
for (const item of divs)
|
||||
await this.page.waitForSelector(`div#${item}`, { 'visible': false, 'timeout':2500 }).then(async (elm) => {
|
||||
await this.page.evaluate(el => {
|
||||
el.removeAttribute('class');
|
||||
el.style.display = '';
|
||||
}, elm);
|
||||
}).catch(() => {
|
||||
logger.debug(`No ${item}`);
|
||||
});
|
||||
|
||||
// these needs to load content via ajax
|
||||
const fosA = await this.page.$$('#content > div > div:nth-child(4) > div > a[href="#fos-countries"]');
|
||||
if (fosA.length === 1) {
|
||||
await this.page.waitForSelector('#content > div > div:nth-child(4) > div > a[href="#fos-countries"]', { 'timeout':2500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':90 });
|
||||
}).catch(() => {
|
||||
logger.debug('No #fos-countries');
|
||||
});
|
||||
|
||||
// #fos-countries > div > div > div.modal-body > div > div > i
|
||||
await this.page.waitForSelector('#fos-countries > div > div > div.modal-body > div > div > i', { 'visible': false, 'timeout':10000 });
|
||||
|
||||
await this.page.waitForSelector('div#fos-countries', { 'visible': true, 'timeout':2500 }).then(async (elm) => {
|
||||
await this.page.evaluate(async el => {
|
||||
el.style.display = '';
|
||||
await el.removeAttribute('class');
|
||||
}, elm);
|
||||
}).catch(() => {
|
||||
logger.debug('No #fos-countries');
|
||||
});
|
||||
|
||||
await this.page.waitForSelector('div.modal-backdrop.in', { 'visible': true, 'timeout':2500 }).then(async (elm) => {
|
||||
await this.page.evaluate(async el => {
|
||||
el.style.height = '0px';
|
||||
el.style.display = 'none';
|
||||
await el.removeAttribute('class');
|
||||
}, elm);
|
||||
}).catch(() => {
|
||||
logger.debug('No #fos-countries');
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async extractIndex(html) {
|
||||
const links = [];
|
||||
const slashRgx = /(\/\/)/;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('table.table tbody tr');
|
||||
|
||||
rows.each((index, item) => {
|
||||
const children = $(item.children);
|
||||
|
||||
const title = this._cleanUp($(children).eq(1).text()) ;
|
||||
|
||||
const type = this._cleanUp($(children).eq(3).text()) ;
|
||||
const businessForm = this._cleanUp($(children).eq(5).text()) ;
|
||||
|
||||
const rawUrl = $(children).eq(1).find('a').attr('href');
|
||||
|
||||
const href = rawUrl.replace(slashRgx, 'https://');
|
||||
|
||||
links.push({ 'id': title, 'href': href, 'type': type, 'businessForm':businessForm });
|
||||
});
|
||||
|
||||
return links;
|
||||
}
|
||||
|
||||
async processEntityPage(serviceObject) {
|
||||
const newObj = {};
|
||||
|
||||
const id = serviceObject.links[serviceObject.step].id;
|
||||
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entityName = serviceObject.links[serviceObject.step].id;
|
||||
const fileName = this._makeFileName(entityName);
|
||||
const filePath = await this._makeFilePath(entityName);
|
||||
|
||||
await this.expandAreas();
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
newObj.details = await this.extractEntityDetails(body);
|
||||
newObj.contraIntermediaries = await this.extractEntityIntermediaries(body, 'item-contra-intermediaries');
|
||||
newObj.intermediaries = await this.extractEntityIntermediaries(body, 'item-intermediaries');
|
||||
newObj.list = await this.extractEntityList(body);
|
||||
newObj.activity = await this.extractEntityActivity(body);
|
||||
newObj.foeCountries = await this.extractEntityIntermediaries(body, 'foe-countries');
|
||||
newObj.fosContent = await this.extractEntityFOSContent(body);
|
||||
|
||||
await jsonfile.writeFile(`${filePath}.json`, newObj);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
// await this._randomWait(this.page, 1000, 1000, 'Throttled');
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${fileName}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const entityName = `${this.modeNames[this.mode]}`;
|
||||
|
||||
const filePath = await this._makeFilePath(entityName);
|
||||
|
||||
await this._makeScreenshotV2(this.page, filePath, null);
|
||||
|
||||
const links = await this.extractIndex(body);
|
||||
|
||||
serviceObject.links = links.slice();
|
||||
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.preBuildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.preBuildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.preBuildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityPage(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityPage(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityPage(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
const rX = /(\/en\/sfi-financial-market-participants)(\/?)/;
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const pathName = pageUrl.pathname.match(rX)[0];
|
||||
|
||||
logger.debug(pathName);
|
||||
|
||||
switch (pathName) {
|
||||
|
||||
case '/en/sfi-financial-market-participants':
|
||||
await this.indexRedirector();
|
||||
break;
|
||||
|
||||
case '/en/sfi-financial-market-participants/':
|
||||
await this.processRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('entityComplete', () => {
|
||||
this.handleEntityComplete();
|
||||
});
|
||||
|
||||
this.on('indexdone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
// logger.debug(this.paymentServices.links);
|
||||
|
||||
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
// logger.debug(this.paymentServices.links);
|
||||
|
||||
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
// logger.debug(this.paymentServices.links);
|
||||
|
||||
const newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
logger.warn('paymentServicesDone');
|
||||
try{
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
logger.warn('emoneyServicesDone');
|
||||
try{
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
logger.warn('creditServicesDone');
|
||||
try{
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=6&type%5B%5D=20&business_form%5B%5D=28&business_form%5B%5D=27&business_form%5B%5D=89'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=7&type%5B%5D=21&business_form%5B%5D=32&business_form%5B%5D=33'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=3&type%5B%5D=27&business_form%5B%5D=82&business_form%5B%5D=22&business_form%5B%5D=110'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/LT/LB`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
// start the browser
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
//
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = LTScrape;
|
790
ncas/lu.js
Normal file
790
ncas/lu.js
Normal file
@ -0,0 +1,790 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('LU');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
/**
 * Returns a wrapper that delays calls to `func` until `wait` milliseconds
 * have passed without another call.
 *
 * Fix: the wrapper used to be an arrow function, so `this` and `arguments`
 * were those of `debounce` itself and `func` was invoked with
 * `(func, wait, immediate)`. It is now a regular function that forwards the
 * caller's own context and arguments.
 *
 * @param {Function} func function to debounce
 * @param {number} wait quiet period in milliseconds
 * @param {boolean} [immediate] when truthy, call `func` on the leading edge
 *   (first call of a burst) instead of the trailing edge
 * @returns {Function} debounced wrapper; its return value is always `undefined`
 */
function debounce(func, wait, immediate) {
  let timeout = null;

  return function debounced(...args) {
    const context = this;

    // Leading-edge call only when no timer is pending.
    const callNow = immediate && !timeout;

    const later = () => {
      timeout = null;
      if (!immediate) func.apply(context, args);
    };

    clearTimeout(timeout);
    timeout = setTimeout(later, wait);

    if (callNow) func.apply(context, args);
  };
}
|
||||
|
||||
class LUScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'LU';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
|
||||
this.debounceHandleIndexPage = debounce(() => {
|
||||
// the index page sometimes reloads up to 3 times..
|
||||
this.emit('handleIndexPage');
|
||||
}, 7500);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleIndexPage() {
|
||||
const thisUrl = await this.page.url();
|
||||
const pageUrl = url.parse(thisUrl);
|
||||
|
||||
switch (pageUrl.hash) {
|
||||
|
||||
case '#Home':
|
||||
case '#AdvancedSearch':
|
||||
await this.indexPageHomeMode();
|
||||
break;
|
||||
|
||||
case '#ResultResearch':
|
||||
this.emit('handleEntityIndex');
|
||||
break;
|
||||
|
||||
case '#DetailEntity':
|
||||
|
||||
this.emit('processEntity');
|
||||
break;
|
||||
case null:
|
||||
this.emit('selectSearchManually');
|
||||
break;
|
||||
default:
|
||||
logger.error('HASH NOT RECOGNISED');
|
||||
logger.error(pageUrl);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexPageHomeMode() {
|
||||
try{
|
||||
const searchType = ['6', '7', '1'];
|
||||
|
||||
const bodys = ['#advancedsearch_paymentservicestype-body', '#advancedsearch_electronicmoneytype-body', '#advancedsearch_banktype-body'];
|
||||
|
||||
const bankInputs = ['#advancedsearch_bankgroup1_inputEl', '#advancedsearch_bankgroupA_inputEl', '#advancedsearch_bankgroupB_inputEl',
|
||||
'#advancedsearch_bankgroupC_inputEl', '#advancedsearch_bankgroupD_inputEl', '#advancedsearch_bankgroup2_inputEl', '#advancedsearch_bankgroup3_inputEl'];
|
||||
|
||||
// click the advanced search button
|
||||
await this.page.waitForSelector('#menu_advanced').then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
});
|
||||
|
||||
// click
|
||||
await this.page.waitForSelector('#advancedsearch_type-bodyEl').then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 2, 2);
|
||||
|
||||
// call the EXT function to set the advanced search mode..
|
||||
|
||||
await this.page.evaluate(x => {
|
||||
return Ext.getCmp('advancedsearch_type').setValue(x);
|
||||
}, searchType[this.mode]);
|
||||
|
||||
// Mode 0 & Mode 1 have a list of options which can be iterated easily
|
||||
// Mode 2 requires a handful of different inputs to be clicked on
|
||||
|
||||
await this._microWait(this.page, 7);
|
||||
|
||||
if (this.mode === 0) {
|
||||
await this.page.waitForSelector('label#advancedsearch_paymentinstitutionsgroup1-boxLabelEl').then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 2, 2);
|
||||
}
|
||||
|
||||
if (this.mode === 0 && this.mode === 1) {
|
||||
const options = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input.x-form-checkbox-default`);
|
||||
|
||||
// click all the elements
|
||||
logger.debug('options length', options.length);
|
||||
|
||||
for (const item of options)
|
||||
await item.click({ 'delay':Scraper.notARobot() });
|
||||
}
|
||||
|
||||
if (this.mode === 2)
|
||||
for(const bI of bankInputs) {
|
||||
const input = await this.page.$$(`${bodys[this.mode]} div.x-form-item-body input${bI}`);
|
||||
await input[0].click({ 'delay':Scraper.notARobot() });
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 1, 1);
|
||||
// click the button
|
||||
await this.page.waitForSelector('#advancedsearch_searchbutton').then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
});
|
||||
|
||||
// now wait for the results to load..
|
||||
|
||||
await this.page.waitForSelector('#title-1083-textEl').then(async () => {
|
||||
logger.debug('Results loaded');
|
||||
this.emit('pageChanged');
|
||||
});
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async entityIndexFirstPass(serviceObject) {
|
||||
try{
|
||||
const body = await this.page.content();
|
||||
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const pageDetails = await this.extractBarDetails($);
|
||||
|
||||
const { currentPageIndex, currentPageMax } = pageDetails;
|
||||
|
||||
if (((currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1))) || (currentPageIndex === 0 && currentPageMax === 0 )) {
|
||||
serviceObject.currentIndexLength = pageDetails.currentIndexLength;
|
||||
serviceObject.currentPageMax = currentPageMax;
|
||||
|
||||
serviceObject.visited = true;
|
||||
serviceObject.currentIndex = url.parse(await this.page.url());
|
||||
serviceObject.currentMetaIndex = 0;
|
||||
}
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @returns {Promise<{currentIndexLength: number, maxPages: number, currentPageMax: number, page: number, currentPageIndex: number}>}
|
||||
*/
|
||||
async extractBarDetails($) {
|
||||
try{
|
||||
const numberExtract = /(\d+)/g;
|
||||
|
||||
const pagingBar = $('#resultresearch_paging-targetEl').children();
|
||||
|
||||
const page = parseInt($(pagingBar).eq(4).find('input').val(), 10);
|
||||
|
||||
const workMaxPages = this._cleanUp($(pagingBar).eq(5).text() );
|
||||
const maxPages = parseInt(workMaxPages.match(numberExtract)[0], 10);
|
||||
|
||||
const rawDisplaying = this._cleanUp($(pagingBar).eq(pagingBar.length - 1).text());
|
||||
|
||||
const [ currentPageIndex, currentPageMax, currentIndexLength ] = rawDisplaying.match(numberExtract).map((s) => {
|
||||
return parseInt(s, 10);
|
||||
});
|
||||
|
||||
return { page, maxPages, currentPageIndex, currentPageMax, currentIndexLength };
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityIndex(serviceObject) {
|
||||
try{
|
||||
const fields = ['type', 'name', 'address'];
|
||||
|
||||
logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);
|
||||
|
||||
await this._randomWait(this.page, 1, 2);
|
||||
|
||||
if (serviceObject.visited === false) {
|
||||
logger.debug('Preparing...');
|
||||
serviceObject.restart = false;
|
||||
await this.entityIndexFirstPass(serviceObject);
|
||||
}
|
||||
|
||||
if (serviceObject.visited === true) {
|
||||
serviceObject.currentMetaIndex = serviceObject.step % serviceObject.currentPageMax;
|
||||
|
||||
logger.debug('serviceObject.currentMetaIndex', serviceObject.currentMetaIndex);
|
||||
|
||||
if ((serviceObject.step > 0) && (serviceObject.currentMetaIndex === 0) && (serviceObject.restart === true)) {
|
||||
logger.debug('Maxed out this page..');
|
||||
|
||||
// serviceObject.visited = false;
|
||||
|
||||
serviceObject.restart = false;
|
||||
|
||||
await this.page.waitForSelector('#button-1052').then(async (elm) => {
|
||||
logger.debug('Proceeding to next index page..');
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
this.emit('pageChanged');
|
||||
});
|
||||
}
|
||||
else {
|
||||
logger.debug('dealing...');
|
||||
|
||||
serviceObject.restart = true;
|
||||
|
||||
logger.debug(`div#ResultResearchGridView table:nth-child(${serviceObject.currentMetaIndex + 1})`);
|
||||
const wantedRow = await this.page.$$(`div#ResultResearchGridView table:nth-child(${serviceObject.currentMetaIndex + 1})`);
|
||||
const htmlTable = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);
|
||||
|
||||
const $ = cheerio.load(`<table>${htmlTable}</table>`);
|
||||
|
||||
const cells = $('div.x-grid-cell-inner');
|
||||
|
||||
serviceObject.current = {};
|
||||
|
||||
cells.each((index, item) => {
|
||||
serviceObject.current[ fields[index] ] = this._cleanUp($(item).text());
|
||||
});
|
||||
|
||||
if (typeof(serviceObject.current.name ) !== 'undefined' && serviceObject.current.name !== '') {
|
||||
const fileName = this._makeFileName(serviceObject.current.name);
|
||||
serviceObject.current.fileName = fileName;
|
||||
serviceObject.current.filePath = `${this.path}/${fileName}`.substring(0, 240);
|
||||
}
|
||||
|
||||
// logger.debug(serviceObject);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await wantedRow[0].click({ 'delay':97, 'clickCount': 2 });
|
||||
|
||||
await this._randomWait(this.page, 1, 1);
|
||||
|
||||
this.emit('pageChanged');
|
||||
}
|
||||
}
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @param html
|
||||
* @param divId
|
||||
* @param sequence
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractGridPanel($, html, divId, sequence) {
|
||||
try{
|
||||
const outObj = [];
|
||||
|
||||
const elms = $(html).find(`${divId} div.x-grid-item-container table`);
|
||||
|
||||
elms.each((index, itm) => {
|
||||
const newObj = {};
|
||||
for(const seqItem of sequence) {
|
||||
const mclass = `.x-grid-cell-${seqItem[0]}`;
|
||||
const rowElm = $(itm).find(mclass);
|
||||
newObj[seqItem[1]] = this._cleanUp($(rowElm).text());
|
||||
}
|
||||
|
||||
outObj.push(newObj);
|
||||
});
|
||||
|
||||
return outObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntityDetails(html) {
|
||||
try{
|
||||
const details = {};
|
||||
const detailSequence = [['detailEntity_type_inputEl', 'type'],
|
||||
['detailEntity_number_inputEl', 'number'],
|
||||
['detailEntity_name_inputEl', 'name'],
|
||||
['detailEntity_address_inputEl', 'address'],
|
||||
['detailEntity_startdate_inputEl', 'startdate'],
|
||||
['detailEntity_closeddate_inputEl', 'closedate'],
|
||||
['detailEntity_countrycode_inputEl', 'countrycode'],
|
||||
['detailEntity_group_inputEl', 'group'],
|
||||
['detailEntity_subgroup_inputEl', 'subgroup'],
|
||||
['detailEntity_iciOutside_inputEl', 'iciOutside'],
|
||||
['detailEntity_icilinked_inputEl', 'icilinked']
|
||||
|
||||
];
|
||||
|
||||
const gridPanels = [{
|
||||
'id': 'autorisedStatus',
|
||||
'sequence': [['detailEntity_autorisedStatus', 'autorisedStatus'],
|
||||
['detailEntity_recentChangeautorisedStatus', 'recentChangeautorisedStatus'],
|
||||
['detailEntity_recentChangeautorisedDate', 'recentChangeautorisedDate']],
|
||||
'divId': '#detailEntity_autorisedStatusGridPanel-body'
|
||||
}, {
|
||||
'id': 'agentOrBranch',
|
||||
'sequence': [['detailEntity_agentorbranchData', 'agentorbranchData'], ['detailEntity_agentData', 'agentData'],
|
||||
['detailEntity_branchData', 'branchData'], ['detailEntity_agentorbranchCountry', 'agentorbranchCountry'],
|
||||
['detailEntity_agentorbranchAddress', 'agentorbranchAddress'], ['detailEntity_agentorbranchlegalstatus', 'agentorbranchlegalstatus']],
|
||||
'divId': '#detailEntity_agentorbranchGridPanel-body'
|
||||
}, {
|
||||
'id': 'iciOutsideTable',
|
||||
'sequence': [['detailEntity_iciOutsideMember', 'iciOutsideMember']],
|
||||
'divId': '#detailEntity_iciOutsideGridPanel-body'
|
||||
}, {
|
||||
'id': 'icilinkedTable',
|
||||
'sequence': [['detailEntity_icilinkedname', 'icilinkedname'], ['detailEntity_icilinkedstartingdate', 'icilinkedstartingdate'],
|
||||
['detailEntity_icilinkedendingdate', 'icilinkedendingdate']],
|
||||
'divId': '#detailEntity_icilinkedGridPanel-body'
|
||||
}, {
|
||||
'id': 'othersStatus',
|
||||
'sequence': [['detailEntity_otherStatus', 'otherStatus'], ['detailEntity_recentChangeotherStatus', 'recentChangeotherStatus'],
|
||||
['detailEntity_recentChangeotherDate', 'recentChangeotherDate']],
|
||||
'divId': '#detailEntity_othersStatusGridPanel-body'
|
||||
}, {
|
||||
'id': 'services',
|
||||
'sequence': [['detailEntity_service', 'service'], ['detailEntity_recentChangeservice', 'recentChangeservice'],
|
||||
['detailEntity_recentChangeserviceDate', 'recentChangeserviceDate']],
|
||||
'divId': '#detailEntity_servicesGridPanel-body'
|
||||
}, {
|
||||
'id': 'ancillaryservices',
|
||||
'sequence': [['detailEntity_ancillaryservice', 'ancillaryservice'],
|
||||
['detailEntity_recentChangeancillaryservice', 'recentChangeancillaryservice'],
|
||||
['detailEntity_recentChangeancillaryserviceDate', 'recentChangeancillaryserviceDate']],
|
||||
'divId': '#detailEntity_ancillaryservicesGridPanel-body'
|
||||
}, {
|
||||
'id': 'prestataire',
|
||||
'sequence': [['detailEntity_prestatairename', 'prestatairename'], ['detailEntity_prestataireheadoffice', 'prestataireheadoffice'],
|
||||
['detailEntity_prestataireauthorisation', 'prestataireauthorisation']],
|
||||
'divId': '#detailEntity_prestataireGridPanel-body'
|
||||
}, {
|
||||
'id': 'historicName',
|
||||
'sequence': [['detailEntity_historicNameName', 'historicNameName'], ['detailEntity_historicNameDate', 'historicNameDate']],
|
||||
'divId': '#detailEntity_historicNameGridPanel-body'
|
||||
}];
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const mainDiv = $('#promoteDetailEntityPanel-innerCt');
|
||||
|
||||
for(const item of detailSequence) {
|
||||
const i = $(mainDiv).find(`#${item[0]}`);
|
||||
|
||||
details[item[1]] = this._cleanUp($(i).text());
|
||||
}
|
||||
|
||||
for( const grid of gridPanels)
|
||||
details[grid.id] = await this.extractGridPanel($, mainDiv, grid.divId, grid.sequence);
|
||||
|
||||
return details;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntity(serviceObject) {
|
||||
try{
|
||||
logger.info(`Process ${this.modeTitles[this.mode]} entity:${serviceObject.current.name}`);
|
||||
logger.info(`Step ${serviceObject.step} of ${serviceObject.currentIndexLength}`);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const filePath = serviceObject.current.filePath;
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
serviceObject.current.details = await this.extractEntityDetails(body);
|
||||
|
||||
this.emit('entityComplete');
|
||||
|
||||
logger.info('Entity complete...');
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<null>}
|
||||
*/
|
||||
async entityCompleter(serviceObject) {
|
||||
try{
|
||||
const filename = serviceObject.current.fileName;
|
||||
|
||||
const filePath = serviceObject.current.filePath;
|
||||
const newObj = {};
|
||||
|
||||
logger.info(`Saving: ${filename}.json`);
|
||||
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
newObj.fileName = `${filename}.json`;
|
||||
newObj.name = serviceObject.current.name;
|
||||
newObj.number = serviceObject.current.details.number || '';
|
||||
|
||||
serviceObject.links.push(newObj);
|
||||
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.currentIndexLength) {
|
||||
serviceObject.current = {};
|
||||
await this.page.waitForSelector('a#detailEntity_backtolist').then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
this.emit('pageChanged');
|
||||
});
|
||||
}
|
||||
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleProcessEntity() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
await this.processEntity(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntity(this.creditServices);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
await this.processEntity(this.paymentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleEntityComplete() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
await this.entityCompleter(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.entityCompleter(this.creditServices);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
await this.entityCompleter(this.paymentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
// await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (pageUrl.href === 'about:blank') return;
|
||||
|
||||
if (pageUrl.pathname === '/index.html')
|
||||
this.debounceHandleIndexPage();
|
||||
|
||||
else
|
||||
if (process.env.NODE_ENV === 'production') {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
// Need thiss for Angular / EXT based sites
|
||||
this.on('pageChanged', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 1000));
|
||||
|
||||
this.on('entityComplete', () => {
|
||||
this.handleEntityComplete();
|
||||
});
|
||||
|
||||
this.on('handleIndexPage', () => {
|
||||
this.handleIndexPage();
|
||||
});
|
||||
|
||||
this.on('processEntity', () => {
|
||||
this.handleProcessEntity();
|
||||
});
|
||||
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('handleEntityIndex', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
await this.processEntityIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
await this.processEntityIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
logger.warn('paymentServicesDone');
|
||||
try{
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
this.emit('pageChanged');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
logger.warn('emoneyServicesDone');
|
||||
try{
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
this.emit('pageChanged');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
logger.warn('creditServicesDone');
|
||||
try{
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('selectSearchManually', async () => {
|
||||
logger.debug('Locating advanced search button');
|
||||
|
||||
await this.page.waitForSelector('#menu_advanced', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':90 });
|
||||
}).catch(() => {
|
||||
logger.error('No advanced search button');
|
||||
});
|
||||
|
||||
await this.page.waitForSelector('#promoteAdvancedSearchPanel-body', { 'visible':true, 'timeout':7500 }).then(async () => {
|
||||
await this.indexPageHomeMode();
|
||||
}).catch(() => {
|
||||
logger.error('No advanced search form');
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://supervisedentities.apps.cssf.lu/index.html?language=en#AdvancedSearch'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/LU/CSSF`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 1000));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this._makeResponsive();
|
||||
|
||||
//
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'load' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = LUScrape;
|
||||
|
626
ncas/lv.js
Normal file
626
ncas/lv.js
Normal file
@ -0,0 +1,626 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('LV');
|
||||
const url = require('url');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class LVScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'LV';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processIndex(serviceObject, html) {
|
||||
const newArray = [] ;
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
const links = $('div.featured-articles-title a');
|
||||
|
||||
links.each((i, item) => {
|
||||
const href = $(item).attr('href');
|
||||
const text = this._cleanUp($(item).text());
|
||||
|
||||
const newUrl = `${this.rootURI}${href}`;
|
||||
const id = this._makeFieldName(text);
|
||||
|
||||
newArray.push({ 'name':text, 'href':newUrl, 'id':id });
|
||||
});
|
||||
|
||||
serviceObject.links = serviceObject.links.concat(newArray);
|
||||
|
||||
const filename = this.modeNames[this.mode];
|
||||
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
|
||||
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
// ('div.featured-articles-title')
|
||||
|
||||
await this.page.waitForSelector('table#organizcijasList', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
logger.debug('Menu details.');
|
||||
const elmHtml = await this.page.evaluate(el => el.outerHTML, elm);
|
||||
|
||||
await this.processIndex(serviceObject, elmHtml);
|
||||
}).catch(() => {
|
||||
logger.info('No show all button');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param section
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntitySections(html, section) {
|
||||
const httpRegEx = /(http|ftp|https):\/\//;
|
||||
const filenameFromURL = /(?:\/.*\/)(.*)/;
|
||||
|
||||
try{
|
||||
const newObj = { } ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
// const wantedItem = $('div#featured-articles-title');
|
||||
// const wantedItem = $('h2:contains("Sanctions")');
|
||||
|
||||
const wantedItem = $(section);
|
||||
|
||||
if (wantedItem.length === 0) return newObj;
|
||||
|
||||
newObj.name = this._cleanUp($(wantedItem).text());
|
||||
|
||||
const sibling = $(wantedItem).next();
|
||||
|
||||
const rows = $(sibling).find('tbody tr');
|
||||
|
||||
rows.each((i, item) => {
|
||||
const children = $(item).children();
|
||||
|
||||
if ($(children).length === 2) {
|
||||
const label = this._makeFieldName($(children).eq(0).text());
|
||||
|
||||
newObj[label] = this._cleanUp($(children).eq(1).text());
|
||||
}
|
||||
|
||||
if ($(children).length === 1) {
|
||||
const label = 'notes';
|
||||
|
||||
if (!newObj.hasOwnProperty(label))
|
||||
newObj[label] = [];
|
||||
|
||||
newObj[label].push(this._cleanUp($(children).eq(0).text()));
|
||||
|
||||
const links = $(item).find('a');
|
||||
|
||||
if ($(links).length > 0)
|
||||
|
||||
links.each((y, link) => {
|
||||
const href = $(link).attr('href');
|
||||
const text = this._cleanUp($(link).text());
|
||||
|
||||
if (href.match(httpRegEx) === null) {
|
||||
const fileName = href.match(filenameFromURL);
|
||||
|
||||
if (!newObj.hasOwnProperty('links'))
|
||||
newObj['links'] = [];
|
||||
|
||||
newObj['links'].push({ href, text, 'filename': fileName[1] });
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param section
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractEntitySubSections(html, section) {
|
||||
try{
|
||||
const newObj = [] ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const wantedItem = $(section);
|
||||
|
||||
if (wantedItem.length === 0) return newObj;
|
||||
|
||||
const sibling = $(wantedItem).next();
|
||||
|
||||
const rows = $(sibling).find('tbody tr');
|
||||
|
||||
let newItem = {};
|
||||
rows.each((i, item) => {
|
||||
const children = $(item).children();
|
||||
|
||||
if (i === 0 || $(children).length === 1) {
|
||||
if (Object.keys(newItem).length !== 0)
|
||||
newObj.push(newItem);
|
||||
|
||||
newItem = {};
|
||||
}
|
||||
if ($(children).length === 2) {
|
||||
const label = this._makeFieldName($(children).eq(0).text());
|
||||
|
||||
newItem[label] = this._cleanUp($(children).eq(1).text());
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractEntityLicenses(html) {
|
||||
try{
|
||||
const newObj = [] ;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const wantedItem = $('h2:contains("Licenses / Types of activities")');
|
||||
|
||||
if (wantedItem.length === 0) return newObj;
|
||||
|
||||
const sibling = $(wantedItem).next();
|
||||
|
||||
const rows = $(sibling).find('tbody tr');
|
||||
|
||||
let newItem = {};
|
||||
rows.each((i, item) => {
|
||||
const children = $(item).children();
|
||||
|
||||
if (i === 0 || $(children).length === 1) {
|
||||
if (Object.keys(newItem).length !== 0)
|
||||
newObj.push(newItem);
|
||||
|
||||
newItem = {};
|
||||
}
|
||||
if ($(children).length === 2) {
|
||||
const label = this._makeFieldName($(children).eq(0).text());
|
||||
|
||||
newItem[label] = this._cleanUp($(children).eq(1).text());
|
||||
}
|
||||
});
|
||||
|
||||
logger.debug(JSON.stringify(newObj));
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetails(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
// const filenameFromURL = /(?:\/.*\/)(.*)/;
|
||||
|
||||
const { name, id } = serviceObject.links[serviceObject.step];
|
||||
|
||||
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
|
||||
|
||||
await this.page.waitForSelector('#featured-articles-title > h2', { 'visible':true, 'timeout':7500 });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
// --
|
||||
|
||||
const details = await this.extractEntitySections(body, 'div#featured-articles-title');
|
||||
const marketSegments = await this.extractEntitySubSections(body, 'h2:contains("Market segments")');
|
||||
const relatedPersons = await this.extractEntitySubSections(body, 'h2:contains("Related persons")');
|
||||
const licenses = await this.extractEntityLicenses(body);
|
||||
const sanctions = await this.extractEntitySections(body, 'h2:contains("Sanctions")');
|
||||
const qualifyHoldings = await this.extractEntitySubSections(body, 'h2:contains("Qualifying holdings")');
|
||||
|
||||
// --
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details, marketSegments, relatedPersons, licenses, sanctions, qualifyHoldings });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
if (details.hasOwnProperty('links')) {
|
||||
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
||||
for(const items of details.links) {
|
||||
const href = `${this.rootURI}${items.href}`;
|
||||
await this.page.goto(href, { 'waitUntil': 'networkidle0' }).catch((err) => {
|
||||
// log this error but Puppeteer isn't supposed to support this sort of download....
|
||||
// mute the ERR_ABORTED error which happens everytime but alert for everything else.
|
||||
|
||||
if (!err.message.includes('net::ERR_ABORTED') )
|
||||
logger.error('grabLink', err);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
const urlSplitter = /(\/en\/.*\/)(.*)/;
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
const splitUrl = pageUrl.pathname.match(urlSplitter);
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (splitUrl === null) return;
|
||||
|
||||
switch (splitUrl[1]) {
|
||||
|
||||
case '/en/market/payment-institutions/':
|
||||
case '/en/market/electronic-money-institutions/':
|
||||
case '/en/market/credit-institutions/':
|
||||
await this.indexRedirector();
|
||||
break;
|
||||
|
||||
case '/en/market/payment-institutions/authorized-payment-institutions/':
|
||||
case '/en/market/payment-institutions/registered-payment-institutions/':
|
||||
case '/en/market/electronic-money-institutions/authorized-electronic-money-institutions/':
|
||||
case '/en/market/electronic-money-institutions/registered-electronic-money-institutions/':
|
||||
case '/en/market/credit-institutions/banks/':
|
||||
await this.processRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
let newUrl;
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
|
||||
this.paymentServices.indexStep++;
|
||||
if (this.paymentServices.indexStep >= this.paymentServices.urls.length)
|
||||
newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
else
|
||||
newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
let newUrl;
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
|
||||
this.emoneyServices.indexStep++;
|
||||
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length)
|
||||
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
else
|
||||
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
let newUrl;
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
|
||||
this.creditServices.indexStep++;
|
||||
if (this.creditServices.indexStep >= this.creditServices.urls.length)
|
||||
newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
else
|
||||
newUrl = this.creditServices.urls[this.creditServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('indexdone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
logger.warn('paymentServicesDone');
|
||||
try{
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
logger.warn('emoneyServicesDone');
|
||||
try{
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
logger.warn('creditServicesDone');
|
||||
try{
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.rootURI = 'http://www.fktk.lv';
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['http://www.fktk.lv/en/market/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-institutions/registered-payment-institutions.html'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['http://www.fktk.lv/en/market/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/electronic-money-institutions/registered-electronic-money-institutions.html'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['http://www.fktk.lv/en/market/credit-institutions/banks.html'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/LV/FCMC`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
//
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = LVScrape;
|
818
ncas/mt.js
Normal file
818
ncas/mt.js
Normal file
@ -0,0 +1,818 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
const logger = require('log4js').getLogger('MT');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class MTScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'MT';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._debounce(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<{authorization, details}>}
|
||||
* @constructor
|
||||
*/
|
||||
async OLDextractEntity(html) {
|
||||
const $ = cheerio.load(html);
|
||||
const details = {};
|
||||
const authorization = {};
|
||||
|
||||
details.name = this._cleanUp($('#lblName').text());
|
||||
|
||||
const dlCells = $('div#pnlCommonDetails').children();
|
||||
const superCells = $('#LHDetails span.fix-width-caption');
|
||||
|
||||
// #lblStatus
|
||||
|
||||
dlCells.each((index, item) => {
|
||||
if ($(item).attr('id') === 'pnlRegDate') {
|
||||
const itemText = this._cleanUp($(item).find('span').text()).split(/\s*:\s*/);
|
||||
|
||||
details[itemText[0]] = itemText[1];
|
||||
}
|
||||
else {
|
||||
const current = this._cleanUp($(item).find('p').text()).replace(/\s*:\s*/, '');
|
||||
|
||||
details[current] = this._cleanUp($(item).find('span').text());
|
||||
}
|
||||
});
|
||||
|
||||
superCells.each((index, item) => {
|
||||
const nextElm = $($(item).next());
|
||||
|
||||
const li = $(nextElm).find('li');
|
||||
|
||||
const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');
|
||||
|
||||
authorization[thisId] = [];
|
||||
if (li.length > 0)
|
||||
li.each((index, item) => {
|
||||
const auth = $(item).html().split(' - ');
|
||||
|
||||
auth[1] = this._cleanUp(auth[1]);
|
||||
authorization[thisId].push(auth);
|
||||
});
|
||||
else {
|
||||
const itemText = this._cleanUp($(nextElm).text());
|
||||
authorization[thisId].push(itemText);
|
||||
}
|
||||
});
|
||||
|
||||
return { details, authorization };
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<{authorization, details}>}
|
||||
*/
|
||||
async extractEntityV2(html) {
|
||||
const trimToColon = /^.*?(?=(:))/;
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
const details = {};
|
||||
const authorization = {};
|
||||
const errors = [];
|
||||
|
||||
details.name = this._cleanUp($('div#mainTitle > div').text());
|
||||
|
||||
const dlCells = $('table#tableLicenceResult tr');
|
||||
const superCells = $('#LHDetails span.fix-width-caption');
|
||||
|
||||
let previousLabel = '';
|
||||
dlCells.each((index, item) => {
|
||||
const children = $(item).children();
|
||||
|
||||
const rawLabel = $(children).eq(0).text().match(trimToColon);
|
||||
const itemValue = this._cleanUp($(children).eq(1).text().trim());
|
||||
|
||||
if (rawLabel !== null ) {
|
||||
const itemLabel = this._cleanUp(rawLabel[0]);
|
||||
|
||||
details[itemLabel] = itemValue;
|
||||
previousLabel = itemLabel;
|
||||
}
|
||||
else
|
||||
details[previousLabel] = details[previousLabel].concat([itemValue]);
|
||||
});
|
||||
|
||||
previousLabel = '';
|
||||
superCells.each((index, item) => {
|
||||
const nextElm = $($(item).next());
|
||||
|
||||
const children = $(nextElm).children();
|
||||
|
||||
if ($(children).length <= 1) {
|
||||
const li = $(nextElm).find('li');
|
||||
|
||||
const thisId = this._cleanUp($(item).text()).replace(/\s*:\s*/, '');
|
||||
|
||||
authorization[thisId] = [];
|
||||
if (li.length > 0)
|
||||
li.each((index, item) => {
|
||||
const auth = $(item).text().split(' - ');
|
||||
|
||||
auth[1] = this._cleanUp(auth[1]);
|
||||
|
||||
if (auth[1] !== '')
|
||||
authorization[thisId].push(auth);
|
||||
});
|
||||
else {
|
||||
const itemText = this._cleanUp($(nextElm).text());
|
||||
authorization[thisId].push(itemText);
|
||||
}
|
||||
}
|
||||
else {
|
||||
logger.warn('Possible error in the HTML');
|
||||
logger.warn($(nextElm).html());
|
||||
errors.push($(nextElm).html());
|
||||
}
|
||||
});
|
||||
|
||||
const outObj = { details, authorization };
|
||||
|
||||
if (errors.length > 0)
|
||||
outObj.errors = errors;
|
||||
|
||||
return outObj;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
* @constructor
|
||||
*/
|
||||
async OLDprocessIndex(serviceObject) {
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pagingItem = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tfoot > tr > td > table > tbody > tr > td > div.rgWrap.rgInfoPart strong');
|
||||
|
||||
const maxPagesText = (pagingItem.length > 0) ? await this.page.evaluate(el => el.innerText, pagingItem[1]) : '0';
|
||||
|
||||
const maxPages = parseInt(maxPagesText, 10);
|
||||
|
||||
const links = await this.page.$$('#ctl00_cphMain_rgLicenceHolders_ctl00 > tbody > tr > td> a');
|
||||
|
||||
for (const item of links) {
|
||||
const id = await this.page.evaluate(el => el.innerText, item);
|
||||
const href = await this.page.evaluate(el => el.href, item);
|
||||
|
||||
const params = this._getParamsFromUrl(href);
|
||||
|
||||
serviceObject.links.push({ id, href, 'entId': params.id, 'metaStep': serviceObject.indexMetaStep });
|
||||
}
|
||||
|
||||
if (serviceObject.indexStep < (maxPages - 1) ) {
|
||||
serviceObject.indexStep++;
|
||||
await this._findAndClick('input.rgPageNext');
|
||||
}
|
||||
else
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
async processIndexV2(serviceObject) {
|
||||
// #tableResult span
|
||||
const numberRegEx = /\d+/;
|
||||
|
||||
logger.debug('+ processIndexV2');
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const links = await this.page.$$('#tableResult span');
|
||||
|
||||
for (const item of links) {
|
||||
const id = await this.page.evaluate(el => el.innerText, item);
|
||||
const href = await this.page.evaluate(el => el.getAttribute('onclick'), item);
|
||||
|
||||
serviceObject.links.push({ id, 'entId': href.match(numberRegEx)[0], 'metaStep': serviceObject.indexMetaStep });
|
||||
}
|
||||
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
* @constructor
|
||||
*/
|
||||
async OLDinitiateIndex(serviceObject) {
|
||||
logger.debug('initiateIndex');
|
||||
const matched = { 'left':false, 'right':false };
|
||||
// first time around.
|
||||
// need to kick off the index correctly..
|
||||
|
||||
await this._findAndClick('#ctl00_cphMain_RadComboBox1');
|
||||
|
||||
await this._randomWait(this.page, 2, 3);
|
||||
const leftOptions = await this.page.$$('#ctl00_cphMain_RadComboBox1_DropDown > div > ul.rcbList li');
|
||||
const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];
|
||||
|
||||
for (const item of leftOptions) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
|
||||
if (wantedOption.indexOf(text) !== -1) {
|
||||
await item.click({ 'delay':95 });
|
||||
matched.left = true;
|
||||
|
||||
// this element can take a while to reload..
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 7, 9);
|
||||
await this._findAndClick('#ctl00_cphMain_RadComboBox2_Input');
|
||||
await this._randomWait(this.page, 2, 3);
|
||||
|
||||
const rightOptions = await this.page.$$('#ctl00_cphMain_RadComboBox2_DropDown > div > ul.rcbList li');
|
||||
for (const item of rightOptions) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
if (text === wantedOption[1]) {
|
||||
matched.right = true;
|
||||
await item.click({ 'delay':95 });
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for items to setttle
|
||||
await this._randomWait(this.page, 2, 3);
|
||||
|
||||
if (matched.left && matched.right) {
|
||||
serviceObject.started = true;
|
||||
await this._findAndClick('#cphMain_btnSearch2');
|
||||
}
|
||||
|
||||
else
|
||||
logger.error('Not fully matched', matched);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reworked for site reskin
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async initiateIndexV2(serviceObject) {
|
||||
logger.debug('initiateIndexV2');
|
||||
const matched = { 'left':false, 'right':false };
|
||||
// first time around.
|
||||
// need to kick off the index correctly..
|
||||
|
||||
// select#select1
|
||||
|
||||
const leftOptions = await this.page.$$('select#select1 option');
|
||||
const wantedOption = serviceObject.indexMeta[serviceObject.indexMetaStep];
|
||||
|
||||
for (const item of leftOptions) {
|
||||
const rawText = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
const text = this._cleanUp(rawText);
|
||||
|
||||
if (wantedOption.indexOf(text) !== -1) {
|
||||
await this.page.select('select#select1', value);
|
||||
|
||||
matched.left = true;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for items to setttle
|
||||
await this._randomWait(this.page, 2, 3);
|
||||
|
||||
const rightOptions = await this.page.$$('select#select2 option');
|
||||
for (const item of rightOptions) {
|
||||
const rawText = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
const text = this._cleanUp(rawText);
|
||||
|
||||
if (text === wantedOption[1]) {
|
||||
matched.right = true;
|
||||
await this.page.select('select#select2', value);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
await this._randomWait(this.page, 2, 2);
|
||||
|
||||
if (matched.left && matched.right) {
|
||||
serviceObject.started = true;
|
||||
await this._findAndClick('button.searchButtonAdv');
|
||||
|
||||
this.emit('processIndex');
|
||||
}
|
||||
|
||||
else
|
||||
logger.error('Not fully matched', matched);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
logger.debug('buildIndex');
|
||||
if (!serviceObject.started)
|
||||
await this.initiateIndexV2(serviceObject);
|
||||
else
|
||||
await this.processIndexV2(serviceObject);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async nextItem(serviceObject) {
|
||||
const entId = serviceObject.links[serviceObject.step].entId;
|
||||
logger.debug('nextItem', entId);
|
||||
|
||||
await this.newLoadLicenceHolder(entId);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
if (!this.processing)
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
else
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.nextItem(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.nextItem(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.nextItem(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async processEntityDetails(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
const { id, entId } = serviceObject.links[serviceObject.step];
|
||||
|
||||
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step}:${id}`);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_'), `_${entId}`].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const details = await this.extractEntityV2(body);
|
||||
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items)
|
||||
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
// processIndex
|
||||
|
||||
async handleProcessIndex() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processIndexV2(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processIndexV2(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processIndexV2(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async processNewPage() {
|
||||
// give the ajax page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
logger.debug('processNewPage', pageUrl.href);
|
||||
|
||||
switch (pageUrl.pathname) {
|
||||
|
||||
case '/pages/licenceholders.aspx':
|
||||
case '/financial-services-register/':
|
||||
await this.indexRedirector();
|
||||
break;
|
||||
|
||||
case'/pages/licenceholder.aspx':
|
||||
case '/financial-services-register/result/':
|
||||
await this.processRedirector();
|
||||
break;
|
||||
case '/en/our-registers/company-register/gransoverskridandehandel/':
|
||||
await this.crossBorderRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
this.emit('backoff');
|
||||
throw new Error(`Unknown page: ${pageUrl.href}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('pathName', pathName);
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Replaces the goto
|
||||
* @param id
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async newLoadLicenceHolder(id) {
|
||||
// loadLicenceHolder(10966)
|
||||
const formElm = await this.page.$('form#loadHolder');
|
||||
|
||||
logger.debug('loadLicenceHolder', id);
|
||||
|
||||
await this.page.evaluate(x => {
|
||||
x.target = '_self';
|
||||
}, formElm);
|
||||
|
||||
await this._microWait(this.page, 5);
|
||||
|
||||
await this.page.evaluate(x => {
|
||||
return loadLicenceHolder(x);
|
||||
}, id);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('processIndex', async () => {
|
||||
this.handleProcessIndex();
|
||||
});
|
||||
//
|
||||
|
||||
this.on('pageChanged', this._debounce(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 1000));
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
this.paymentServices.indexMetaStep++;
|
||||
|
||||
if (this.paymentServices.indexMetaStep < this.paymentServices.indexMeta.length) {
|
||||
logger.info('Resetting for next meta index...');
|
||||
// next..
|
||||
this.paymentServices.started = false;
|
||||
this.paymentServices.indexStep = 0;
|
||||
|
||||
await this._goto(this.startPage);
|
||||
}
|
||||
else {
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
logger.warn('GO THROUGH THE NEW LIST!!!!');
|
||||
|
||||
this.processing = true;
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'New page transition');
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
this.emoneyServices.indexMetaStep++;
|
||||
|
||||
if (this.emoneyServices.indexMetaStep < this.emoneyServices.indexMeta.length) {
|
||||
logger.info('Resetting for next meta index...');
|
||||
// next..
|
||||
this.emoneyServices.started = false;
|
||||
this.emoneyServices.indexStep = 0;
|
||||
|
||||
await this._goto(this.startPage);
|
||||
}
|
||||
else {
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
logger.warn('GO THROUGH THE NEW LIST!!!!');
|
||||
|
||||
this.processing = true;
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'New page transition');
|
||||
}
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
this.creditServices.indexMetaStep++;
|
||||
|
||||
if (this.creditServices.indexMetaStep < this.creditServices.indexMeta.length) {
|
||||
logger.info('Resetting for next meta index...');
|
||||
// next..
|
||||
this.creditServices.started = false;
|
||||
this.creditServices.indexStep = 0;
|
||||
|
||||
await this._goto(this.startPage);
|
||||
}
|
||||
else {
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
logger.warn('GO THROUGH THE NEW LIST!!!!');
|
||||
|
||||
this.processing = true;
|
||||
|
||||
await this._randomWait(this.page, 2, 2, 'New page transition');
|
||||
}
|
||||
});
|
||||
|
||||
this.on('indexdone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
this.mode++;
|
||||
this.processing = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
|
||||
this.mode++;
|
||||
this.processing = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
|
||||
this.emit('done');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
this.processing = false;
|
||||
|
||||
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services'];
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 46,
|
||||
'indexStep': 0,
|
||||
'indexMetaStep':0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
|
||||
'indexMeta' : [
|
||||
['Financial Institutions',
|
||||
'Financial Institutions licensed to undertake payment services under the 2nd Schedule to the Financial Institutions Act (Payment Institutions)'],
|
||||
['Financial Institutions',
|
||||
'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to provide services outside Malta'],
|
||||
['Financial Institutions',
|
||||
'Local Financial Institutions licensed to undertake activities under the 2nd Schedule to the Financial Institutions Act (Payment Institutions) exercising the freedom to establish a branch outside Malta']
|
||||
|
||||
]
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'indexMetaStep':0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
|
||||
'indexMeta' : [
|
||||
['Financial Institutions',
|
||||
'Financial Institutions licenced to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions)'],
|
||||
['Financial Institutions',
|
||||
'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to provide services outside Malta'],
|
||||
['Financial Institutions',
|
||||
'Local Financial Institutions licensed to issue electronic money under the 3rd Schedule to the Financial Institutions Act (Electronic Money Institutions) exercising the freedom to establish a branch outside Malta']
|
||||
|
||||
]
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'indexMetaStep':0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.mfsa.com.mt/pages/licenceholders.aspx'],
|
||||
'indexMeta' : [
|
||||
['Credit Institutions',
|
||||
'Credit Institutions'],
|
||||
['Credit Institutions',
|
||||
'Freedom of Services and Establishments - Exercise of the freedom to provide services outside Malta'],
|
||||
['Credit Institutions',
|
||||
'Freedom of Services and Establishments - Exercise of the freedom to set up an establishment outside Malta']
|
||||
|
||||
]
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
|
||||
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/MT/MFSA`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._debounce(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = MTScrape;
|
794
ncas/nl.js
Normal file
794
ncas/nl.js
Normal file
@ -0,0 +1,794 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
const logger = require('log4js').getLogger('NL');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class NLScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.setID('NL');
|
||||
|
||||
this.addToBlockFilters(['cookiebar.js', 'readspeaker']);
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
// Delays the call to 30 seconds after the last time it was called.
|
||||
// Useful if the page beaks and multiple errors happen at the same time
|
||||
this.recover = this._debounce(async () => {
|
||||
await this.__recover();
|
||||
}, 30000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
async extractDetail(body) {
|
||||
const description = [];
|
||||
try{
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const rows = $('dl.extra > dd > table > tbody > tr');
|
||||
|
||||
rows.each((index, item) => {
|
||||
let cells = $(item).find('th');
|
||||
|
||||
const title = this._cleanUp($(cells.get(0)).text()).replace(':', '') || '';
|
||||
|
||||
cells = $(item).find('td');
|
||||
const detail = this._cleanUp($(cells.get(0)).text()) || '';
|
||||
|
||||
if (title !== '')
|
||||
description.push([title, detail]);
|
||||
});
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
|
||||
return description;
|
||||
}
|
||||
|
||||
async extractActivity(body) {
|
||||
const details = [];
|
||||
try{
|
||||
const $ = cheerio.load(body);
|
||||
const rows = $('#tab2 > div > div > table > tbody > tr');
|
||||
let previousFinancialService = '';
|
||||
|
||||
rows.each((index, item) => {
|
||||
const cells = $(item).find('td');
|
||||
|
||||
const activity = this._cleanUp($(cells.get(0)).text()) || '';
|
||||
const startDate = this._cleanUp($(cells.get(1)).text()) || '';
|
||||
const endDate = this._cleanUp($(cells.get(2)).text()) || '';
|
||||
|
||||
const thCell = $(item).find('th');
|
||||
const financialService = this._cleanUp($(thCell.get(0)).text()) || previousFinancialService;
|
||||
|
||||
details.push({ financialService, activity, startDate, endDate });
|
||||
|
||||
previousFinancialService = financialService;
|
||||
});
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
|
||||
return details;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract Passporting Out Data from page
|
||||
* @param body
|
||||
* @returns {Promise<Object>} rows grouped by country
|
||||
*/
|
||||
async extractPassportingOut(body) {
|
||||
const details = {};
|
||||
|
||||
try{
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const rows = $('#tab6 > div > div > table > tbody > tr');
|
||||
let previouseuPassportOut = '';
|
||||
|
||||
rows.each((index, item) => {
|
||||
const cells = $(item).find('td');
|
||||
|
||||
const activity = this._cleanUp($(cells.get(0)).text()) || '';
|
||||
const country = this._cleanUp($(cells.get(1)).text()) || '';
|
||||
const startDate = this._cleanUp($(cells.get(2)).text()) || '';
|
||||
const endDate = this._cleanUp($(cells.get(3)).text()) || '';
|
||||
|
||||
const thCell = $(item).find('th');
|
||||
const euPassportOut = this._cleanUp($(thCell.get(0)).text()) || previouseuPassportOut;
|
||||
|
||||
if (!details.hasOwnProperty(country))
|
||||
details[country] = [{ activity, startDate, endDate, euPassportOut }];
|
||||
else
|
||||
details[country].push({ activity, startDate, endDate, euPassportOut });
|
||||
|
||||
previouseuPassportOut = euPassportOut;
|
||||
});
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
|
||||
return details;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract Passporting In Data from page
|
||||
* @param body
|
||||
* @returns {Promise<Object>} rows grouped by passport-in heading
|
||||
*/
|
||||
async extractPassportingIn(body) {
|
||||
const details = {};
|
||||
|
||||
try{
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const rows = $('#tab7 > div > div > table > tbody > tr');
|
||||
let previouseuPassportIn = '';
|
||||
|
||||
rows.each((index, item) => {
|
||||
const cells = $(item).find('td');
|
||||
|
||||
const activity = this._cleanUp($(cells.get(0)).text()) || '';
|
||||
const startDate = this._cleanUp($(cells.get(1)).text()) || '';
|
||||
|
||||
const thCell = $(item).find('th');
|
||||
const euPassportIn = this._cleanUp($(thCell.get(0)).text()) || previouseuPassportIn;
|
||||
|
||||
if (!details.hasOwnProperty(euPassportIn))
|
||||
details[euPassportIn] = [{ activity, startDate }];
|
||||
else
|
||||
details[euPassportIn].push({ activity, startDate });
|
||||
|
||||
previouseuPassportIn = euPassportIn;
|
||||
});
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
|
||||
return details;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process Entity Detail
|
||||
*
|
||||
* @returns {Promise<{activity: *, details: *}>}
|
||||
*/
|
||||
async processEntityDetail(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
const urlSections = ['WFTBI', 'WFTEG', 'WFTKF'];
|
||||
const id = serviceObject.links[serviceObject.step].id;
|
||||
|
||||
logger.info(`Process V2 ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${id}`);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = this._makeFileName(entity);
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this.page.waitForSelector('#contentcolumn > div.interactive-tabs > ol > li:nth-child(2) > a', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
}).catch(() => {
|
||||
logger.debug('No activity tab');
|
||||
});
|
||||
|
||||
await this.page.waitForSelector('div.interactive-tabs > ol > li a[href*="#tab6"]', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_passportingout`, null);
|
||||
}).catch(() => {
|
||||
logger.debug('No passporting Out tab');
|
||||
});
|
||||
|
||||
await this.page.waitForSelector('div.interactive-tabs > ol > li a[href*="#tab7"]', { 'visible':true, 'timeout':2500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_passportingin`, null);
|
||||
}).catch(() => {
|
||||
logger.debug('No passporting In tab');
|
||||
});
|
||||
|
||||
const body = await this.page.content();
|
||||
const details = await this.extractDetail(body);
|
||||
const activity = await this.extractActivity(body);
|
||||
const passportingOut = await this.extractPassportingOut(body);
|
||||
const passportingIn = await this.extractPassportingIn(body);
|
||||
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details, activity, passportingOut, passportingIn });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = `https://www.dnb.nl/en/supervision/public-register/${urlSections[this.mode]}/${serviceObject.links[serviceObject.step].href}`;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('entityDone');
|
||||
}
|
||||
|
||||
/**
|
||||
* Process WFTBI / Payment Services Detail
|
||||
*
|
||||
* @returns {Promise<{activity: *, details: *}>}
|
||||
*/
|
||||
async processWFTBIDetail() {
|
||||
await this.processEntityDetail(this.paymentServices);
|
||||
}
|
||||
|
||||
/**
|
||||
* Process WFTEG / Emoney services Detail
|
||||
* @returns {Promise<{activity: *, details: *}>}
|
||||
*/
|
||||
async processWFTEGDetail() {
|
||||
await this.processEntityDetail(this.emoneyServices);
|
||||
}
|
||||
|
||||
/**
|
||||
* Process WFTKF / Credit Services Details
|
||||
* @returns {Promise<{activity: *, passportingOut: void, details: *}>}
|
||||
*/
|
||||
async processWFTKFDetail() {
|
||||
await this.processEntityDetail(this.creditServices);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate WFTBI / Payment Services
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async initiateWFTBI() {
|
||||
try{
|
||||
// first time around.
|
||||
// need to kick off the index correctly..
|
||||
|
||||
const options = await this.page.$$('#ddfilter option');
|
||||
const wantedOption = ['2:3c Dutch branch of payment institution (EEA incl. NL)'];
|
||||
for (const item of options) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
if (wantedOption.indexOf(text) !== -1) {
|
||||
await this.page.select('#ddfilter', value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
this._findAndClick('#search-main button');
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate WFTEG / Emoney services
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async initiateWFTEG() {
|
||||
try{
|
||||
// first time around.
|
||||
// need to kick off the index correctly..
|
||||
|
||||
const options = await this.page.$$('#ddfilter option');
|
||||
const wantedOption = ['2:10b Carrying on the business of an electronic money institution'];
|
||||
for (const item of options) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
if (wantedOption.indexOf(text) !== -1) {
|
||||
await this.page.select('#ddfilter', value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
this._findAndClick('#search-main button');
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Initiate WFTKF / Credit Services
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async initiateWFTKF() {
|
||||
try{
|
||||
// first time around.
|
||||
// need to kick off the index correctly..
|
||||
|
||||
const options = await this.page.$$('#ddfilter option');
|
||||
const selects = ['2:12(1) Carrying on the business of a bank', '2:13(1) Carrying on the business of a bank'];
|
||||
const wantedOption = [];
|
||||
wantedOption.push(selects[this.creditServices.step]);
|
||||
for (const item of options) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
if (wantedOption.indexOf(text) !== -1) {
|
||||
await this.page.select('#ddfilter', value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
this._findAndClick('#search-main button');
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process WFTBI / Payment Services
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processWFTBI() {
|
||||
const nonWhiteSpace = /\W/g;
|
||||
logger.info('WFTBI / Payment Services');
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
const origUrl = await this.page.url();
|
||||
const pageUrl = url.parse(origUrl);
|
||||
|
||||
if (pageUrl.query === null)
|
||||
// we need to select the correct item from the dropdown.
|
||||
this.initiateWFTBI();
|
||||
|
||||
else {
|
||||
// crack query
|
||||
|
||||
const body = await this.page.content();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const q = this._getParamsFromUrl(origUrl);
|
||||
|
||||
const page = q.page || '1';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/paymentServices_menu_${page}`, null);
|
||||
|
||||
const rows = $('#contentcolumn table tbody tr');
|
||||
|
||||
rows.each((i, elm) => {
|
||||
const children = cheerio(elm).children();
|
||||
let statutoryName = children.eq(0).text();
|
||||
let tradeName = children.eq(1).text();
|
||||
|
||||
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
|
||||
|
||||
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
|
||||
|
||||
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
|
||||
|
||||
let href = cheerio(children.eq(0)).find('a').attr('href');
|
||||
href = href.concat('&locale=en_GB');
|
||||
// this is the one we want.
|
||||
|
||||
this.paymentServices.links.push({ id, href });
|
||||
});
|
||||
|
||||
const next = $('a.next').attr('href') || '';
|
||||
|
||||
if (next !== '')
|
||||
this._findAndClick('a.next');
|
||||
else
|
||||
this.emit('startProcessingPaymentServices');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process WFTEG / Emoney services
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processWFTEG() {
|
||||
const nonWhiteSpace = /\W/g;
|
||||
logger.info('WFTEG / EMoney Services');
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
const origUrl = await this.page.url();
|
||||
const pageUrl = url.parse(origUrl);
|
||||
|
||||
if (pageUrl.query === null)
|
||||
// we need to select the correct item from the dropdown.
|
||||
this.initiateWFTEG();
|
||||
|
||||
else {
|
||||
// crack query
|
||||
|
||||
const body = await this.page.content();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const q = this._getParamsFromUrl(origUrl);
|
||||
|
||||
const page = q.page || '1';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/eMoney_menu_${page}`, null);
|
||||
|
||||
const rows = $('#contentcolumn table tbody tr');
|
||||
|
||||
rows.each((i, elm) => {
|
||||
const children = cheerio(elm).children();
|
||||
let statutoryName = children.eq(0).text();
|
||||
let tradeName = children.eq(1).text();
|
||||
|
||||
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
|
||||
|
||||
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
|
||||
|
||||
// const id = `${statutoryName}-${tradeName}`;
|
||||
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
|
||||
|
||||
let href = cheerio(children.eq(0)).find('a').attr('href');
|
||||
href = href.concat('&locale=en_GB');
|
||||
// this is the one we want.
|
||||
|
||||
this.emoneyServices.links.push({ id, href });
|
||||
});
|
||||
|
||||
const next = $('a.next').attr('href') || '';
|
||||
|
||||
if (next !== '')
|
||||
this._findAndClick('a.next');
|
||||
else
|
||||
this.emit('startProcessingEMoneyServices');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Process WFTKF / Credit Services
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processWFTKF() {
|
||||
try {
|
||||
// Credit Institute
|
||||
const nonWhiteSpace = /\W/g;
|
||||
logger.info('WFTKF / Credit Services');
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
const origUrl = await this.page.url();
|
||||
const pageUrl = url.parse(origUrl);
|
||||
|
||||
if (pageUrl.query === null)
|
||||
// we need to select the correct item from the dropdown.
|
||||
this.initiateWFTKF();
|
||||
|
||||
else {
|
||||
// crack query
|
||||
|
||||
const body = await this.page.content();
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const q = this._getParamsFromUrl(origUrl);
|
||||
|
||||
const page = q.page || '1';
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/creditServices_menu_${page}`, null);
|
||||
|
||||
const rows = $('#contentcolumn table tbody tr');
|
||||
|
||||
rows.each((i, elm) => {
|
||||
const children = cheerio(elm).children();
|
||||
let statutoryName = children.eq(0).text();
|
||||
let tradeName = children.eq(1).text();
|
||||
|
||||
statutoryName = removeAccents.remove(statutoryName.trim()).replace(nonWhiteSpace, '_');
|
||||
|
||||
tradeName = removeAccents.remove(tradeName.trim()).replace(nonWhiteSpace, '_');
|
||||
|
||||
const id = (statutoryName === tradeName) ? statutoryName : `${statutoryName}-${tradeName}`;
|
||||
|
||||
// const id = `${statutoryName}-${tradeName}`;
|
||||
|
||||
let href = cheerio(children.eq(0)).find('a').attr('href');
|
||||
href = href.concat('&locale=en_GB');
|
||||
// this is the one we want.
|
||||
|
||||
logger.debug({ id, href });
|
||||
|
||||
this.creditServices.links.push({ id, href });
|
||||
});
|
||||
|
||||
const next = $('a.next').attr('href') || '';
|
||||
|
||||
if (next !== '')
|
||||
this._findAndClick('a.next');
|
||||
else
|
||||
if (this.creditServices.step === 0) {
|
||||
this.creditServices.step = 1;
|
||||
await this._goto(this.credit);
|
||||
}
|
||||
else
|
||||
this.emit('startProcessingCreditServices');
|
||||
}
|
||||
}
|
||||
catch(e) {
|
||||
await this._uploadError();
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
|
||||
const failedUrls = ['chrome-error://chromewebdata/'];
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (failedUrls.indexOf(pageUrl.href) !== -1) {
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
switch (pageUrl.pathname) {
|
||||
|
||||
case '/en/supervision/public-register/WFTBI/index.jsp':
|
||||
await this.processWFTBI();
|
||||
break;
|
||||
case '/en/supervision/public-register/WFTBI/detail.jsp':
|
||||
await this.processWFTBIDetail();
|
||||
break;
|
||||
case '/en/supervision/public-register/WFTEG/index.jsp':
|
||||
await this.processWFTEG();
|
||||
break;
|
||||
case '/en/supervision/public-register/WFTEG/detail.jsp':
|
||||
await this.processWFTEGDetail();
|
||||
break;
|
||||
case '/en/supervision/public-register/WFTKF/index.jsp':
|
||||
await this.processWFTKF();
|
||||
break;
|
||||
case '/en/supervision/public-register/WFTKF/detail.jsp':
|
||||
await this.processWFTKFDetail();
|
||||
break;
|
||||
default:
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl.href}`);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async restart() {
|
||||
logger.info(`Restarting ${this.modeTitles[this.mode]}`);
|
||||
|
||||
switch (this.mode) {
|
||||
|
||||
case 2:
|
||||
this.emit('startProcessingCreditServices');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('startProcessingEMoneyServices');
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
this.emit('startProcessingPaymentServices');
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
* @private
|
||||
*/
|
||||
async __recover() {
|
||||
logger.warn('*** RECONNECTING PAGE ***');
|
||||
|
||||
if (this.browserCrashed) await this._initBrowser(true);
|
||||
|
||||
await this._createBrowserPage();
|
||||
this.page.on('domcontentloaded', () => {
|
||||
this.processNewPage();
|
||||
});
|
||||
|
||||
const timeout = 90000;
|
||||
|
||||
setTimeout(async() => {
|
||||
logger.warn('Attempting recovery..');
|
||||
|
||||
await this.restart();
|
||||
}, timeout);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('entityDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('startProcessingPaymentServices', async () => {
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
|
||||
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTBI/${this.paymentServices.links[this.paymentServices.step].href}`;
|
||||
|
||||
logger.debug('startProcessingPaymentServices', newUrl);
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
await this._goto(this.emoneyUrl);
|
||||
});
|
||||
|
||||
this.on('startProcessingEMoneyServices', async () => {
|
||||
this.mode = 1;
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.debug(`${this.emoneyServices.items} EMoney items indexed` );
|
||||
logger.debug(this.emoneyServices.links[this.emoneyServices.step].href);
|
||||
|
||||
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTEG/${this.emoneyServices.links[this.emoneyServices.step].href}`;
|
||||
|
||||
logger.debug('startProcessingEMoneyServices', newUrl);
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
|
||||
await this._goto(this.credit);
|
||||
});
|
||||
|
||||
this.on('startProcessingCreditServices', async () => {
|
||||
this.mode = 2;
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.debug(`${this.creditServices.items} CI items indexed` );
|
||||
logger.debug(this.creditServices.links[this.creditServices.step].href);
|
||||
|
||||
const newUrl = `https://www.dnb.nl/en/supervision/public-register/WFTKF/${this.creditServices.links[this.creditServices.step].href}`;
|
||||
logger.debug('startProcessingCreditServices', newUrl);
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
|
||||
this.emit('done');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
|
||||
this.mode = 0;
|
||||
try {
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false
|
||||
};
|
||||
|
||||
this.startPage = 'https://www.dnb.nl/en/supervision/public-register/WFTBI/index.jsp';
|
||||
this.emoneyUrl = 'https://www.dnb.nl/en/supervision/public-register/WFTEG/index.jsp';
|
||||
this.credit = 'https://www.dnb.nl/en/supervision/public-register/WFTKF/index.jsp';
|
||||
|
||||
//
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/NL/DNB`));
|
||||
|
||||
await this._doNonRepudiation(false, { 'sslWithPrefix': true }).catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser(true);
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = NLScrape;
|
767
ncas/no.js
Normal file
767
ncas/no.js
Normal file
@ -0,0 +1,767 @@
|
||||
// Version: 0.0.1-3
|
||||
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('NO');
|
||||
const url = require('url');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class NOScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'NO';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<Object|undefined>} the extracted details, or undefined when extraction failed
|
||||
*/
|
||||
async extractEntityDetails(html) {
|
||||
try {
|
||||
const newObj = {};
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const title = $('h1.common-header-text').text();
|
||||
|
||||
newObj.title = this._cleanUp(title);
|
||||
|
||||
const detailBox = $('div.side-container.license-side-unit-container');
|
||||
|
||||
const children = $(detailBox).children();
|
||||
|
||||
let curLabel = '';
|
||||
children.each((i, item) => {
|
||||
const tagName = $(item).prop('tagName');
|
||||
|
||||
if (tagName === 'H4') {
|
||||
curLabel = this._makeFieldName($(item).text());
|
||||
if (!newObj.hasOwnProperty(curLabel))
|
||||
newObj[curLabel] = [];
|
||||
}
|
||||
|
||||
if (['P', 'SPAN', 'A'].indexOf(tagName) !== -1)
|
||||
newObj[curLabel].push(this._cleanUp($(item).text()));
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @param blockType
|
||||
* @returns {{licenseDescription: string, blockType: string}}
|
||||
*/
|
||||
extractEntityDetailLicense(html, blockType = 'License') {
|
||||
try {
|
||||
const newObj = { 'licenseDescription':'', 'blockType': blockType };
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const detailBox = $('div.license-container');
|
||||
|
||||
const children = $(detailBox).children();
|
||||
|
||||
let curLabel = '';
|
||||
children.each((i, item) => {
|
||||
const tagName = $(item).prop('tagName');
|
||||
|
||||
if (tagName === 'H3') {
|
||||
curLabel = this._makeFieldName($(item).text());
|
||||
if (!newObj.hasOwnProperty(curLabel))
|
||||
newObj[curLabel] = [];
|
||||
}
|
||||
|
||||
if (tagName === 'H2') {
|
||||
if (!newObj.hasOwnProperty('misc'))
|
||||
newObj['misc'] = [];
|
||||
newObj['misc'].push(this._cleanUp($(item).text()));
|
||||
}
|
||||
|
||||
if (['SPAN', 'A', 'P'].indexOf(tagName) !== -1) {
|
||||
const elmClass = $(item).attr('class');
|
||||
if (elmClass === 'license-description')
|
||||
newObj['licenseDescription'] = this._cleanUp($(item).text());
|
||||
else
|
||||
newObj[curLabel].push( this._cleanUp($(item).text()));
|
||||
}
|
||||
|
||||
if (tagName === 'UL') {
|
||||
const liArray = [];
|
||||
const li = $(item).children('li');
|
||||
for (let i = 0; i < $(li).length;i++)
|
||||
liArray.push(this._cleanUp($(li).eq(i).text()));
|
||||
|
||||
newObj[curLabel].push(liArray);
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {{description: {}}}
|
||||
*/
|
||||
extractCrossBorderDetailsV2(html) {
|
||||
const newObj = { 'description':{} };
|
||||
|
||||
const titleRegEx = /([^]*?)(?:<ul>)/;
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const top = $('ul');
|
||||
|
||||
const parent = $(top).parent();
|
||||
|
||||
const title = this._cleanUp($(parent).children().first().text());
|
||||
|
||||
const li = $(top).first().children();
|
||||
|
||||
li.each(async (i, item) => {
|
||||
const anotherUL = $(item).find('ul').index();
|
||||
|
||||
if (anotherUL !== -1) {
|
||||
// There are UL's within this LI
|
||||
|
||||
const elms = $(item).find('ul').children('li');
|
||||
if ($(elms).length !== 0) {
|
||||
const nameArray = $(item).html().match(titleRegEx);
|
||||
const rawTitle = nameArray[0].replace('<ul>', '');
|
||||
const title = this._cleanUp(rawTitle) || 'main';
|
||||
|
||||
const label = this._makeFieldName(title);
|
||||
if (!newObj.hasOwnProperty(label)) {
|
||||
newObj[label] = [];
|
||||
newObj.description[label] = title;
|
||||
}
|
||||
|
||||
elms.each((i, e) => {
|
||||
newObj[label].push(this._cleanUp($(e).text()));
|
||||
});
|
||||
}
|
||||
}
|
||||
else {
|
||||
const label = this._makeFieldName(title);
|
||||
if (!newObj.hasOwnProperty(label)) {
|
||||
newObj[label] = [];
|
||||
newObj.description[label] = title;
|
||||
}
|
||||
|
||||
newObj[label].push(this._cleanUp($(item).text()));
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Object|undefined} the cross-border details, or undefined when extraction failed
|
||||
*/
|
||||
extractEntityDetailCrossBorder(html) {
|
||||
try {
|
||||
const newObj = { };
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const header = $('h3.license-unit-label:contains("Cross-border services/classes")');
|
||||
|
||||
const detailBox = $(header).parent();
|
||||
|
||||
const children = $(detailBox).children();
|
||||
|
||||
let curLabel = '';
|
||||
children.each(async (i, item) => {
|
||||
const tagName = $(item).prop('tagName');
|
||||
|
||||
if (tagName === 'H3') {
|
||||
curLabel = this._makeFieldName($(item).text());
|
||||
if (!newObj.hasOwnProperty(curLabel))
|
||||
newObj[curLabel] = [];
|
||||
}
|
||||
|
||||
if (['SPAN', 'A', 'P'].indexOf(tagName) !== -1)
|
||||
newObj[curLabel].push(this._cleanUp($(item).text()));
|
||||
|
||||
if(tagName === 'DIV' || tagName === 'UL') {
|
||||
if (!newObj.hasOwnProperty('data'))
|
||||
newObj['data'] = [];
|
||||
|
||||
const cbData = this.extractCrossBorderDetailsV2($(item).html());
|
||||
|
||||
newObj['data'].push(cbData);
|
||||
}
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @param elm
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async selectLicenseOption(serviceObject, elm) {
|
||||
const wantedOption = serviceObject.wanted[serviceObject.indexStep];
|
||||
|
||||
const elmSelector = await this.page.evaluate((el) => [el.tagName, el.getAttribute('class')].join('.'), elm);
|
||||
|
||||
const options = await elm.$$('option');
|
||||
for (const item of options) {
|
||||
const text = await this.page.evaluate(el => el.innerText, item);
|
||||
const value = await this.page.evaluate(el => el.value, item);
|
||||
|
||||
if (wantedOption === text) {
|
||||
await this.page.select(elmSelector, value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async extractIndexItems(html) {
|
||||
const newArray = [] ;
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
const links = $('a');
|
||||
|
||||
links.each((i, item) => {
|
||||
const href = $(item).attr('href');
|
||||
const text = this._cleanUp($(item).find('.licenseregistry-search-result-item-header').text());
|
||||
const country = this._cleanUp($(item).find('.licenseregistry-search-result-item-metadata').text());
|
||||
const type = this._cleanUp($(item).find('.licenseregistry-search-result-item-type').text());
|
||||
const params = this._getParamsFromUrl(href);
|
||||
|
||||
const newUrl = `${this.rootURI}${href}`;
|
||||
const id = params.id;
|
||||
|
||||
newArray.push({ 'name':text, 'href':newUrl, 'id':id, 'country':country, 'type': type });
|
||||
});
|
||||
|
||||
return newArray;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processIndex(serviceObject) {
|
||||
let html = '';
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
let loadedAll = false;
|
||||
|
||||
do
|
||||
await this.page.waitForSelector('button.search-result-loadmore', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
loadedAll = false;
|
||||
logger.debug('Expanding index..');
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}).catch(() => {
|
||||
loadedAll = true;
|
||||
});
|
||||
|
||||
while( loadedAll === false);
|
||||
|
||||
logger.debug('>> All loaded...');
|
||||
|
||||
await this.page.waitForSelector('#js-konsregList > div > div', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
html = await this.page.evaluate(el => el.outerHTML, elm);
|
||||
}).catch((e) => {
|
||||
logger.error(e);
|
||||
logger.warn('No index list');
|
||||
});
|
||||
|
||||
const indexList = await this.extractIndexItems(html);
|
||||
|
||||
serviceObject.links = serviceObject.links.concat(indexList);
|
||||
|
||||
const filename = this.modeNames[this.mode];
|
||||
|
||||
await this._randomWait(this.page, 5, 7);
|
||||
|
||||
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
|
||||
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
await this.page.waitForSelector('select.search-filter', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await this.selectLicenseOption(serviceObject, elm);
|
||||
}).catch((e) => {
|
||||
logger.error(e);
|
||||
logger.warn('No select');
|
||||
});
|
||||
|
||||
// this reload can take a long time
|
||||
await this.page.waitForSelector('span.search-results-count.highlight', { 'visible':true, 'timeout':75000 }).catch((e) => {
|
||||
logger.error(e);
|
||||
logger.warn('Waiting for data timeout');
|
||||
});
|
||||
|
||||
await this.page.waitForSelector('#js-konsregList > div > div', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await this.processIndex(serviceObject);
|
||||
}).catch((e) => {
|
||||
logger.error(e);
|
||||
logger.warn('No index list');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async entityContentSniffer(html) {
|
||||
const $ = cheerio.load(html);
|
||||
const output = [];
|
||||
|
||||
const contentArray = [
|
||||
{ 'find':'h2:contains("Bank")', 'blockType':'Bank' },
|
||||
{ 'find':'h2:contains("Agent of payment institution (company)")', 'blockType':'Agent Payment Institution' },
|
||||
{ 'find':'h2:contains("Tied Agent")', 'blockType':'Agent' },
|
||||
{ 'find':'h3.license-unit-label:contains("The entity is a tied agent affiliated to")', 'blockType':'Affiliation' },
|
||||
{ 'find':'h2:contains("Nominee in Norwegian securities registers")', 'blockType':'Securities register' },
|
||||
{ 'find':'h2:contains("Branch of foreign credit institution")', 'blockType':'Foreign credit institution' },
|
||||
{ 'find':'h2:contains("Finance company")', 'blockType':'Finance company' },
|
||||
{ 'find':'h2:contains("Payment institution")', 'blockType':'Payment institution' },
|
||||
{ 'find':'h2:contains("Agency debt collection on behalf of others")', 'blockType':'Debt collection' },
|
||||
{ 'find':'h2:contains("E-money institution")', 'blockType':'E-money institution' },
|
||||
{ 'find':'h2:contains("Investment firm")', 'blockType':'h2:contains("Investment firm")' },
|
||||
{ 'find':'h2:contains("Intermediator of loans and guarantees")', 'blockType':'Intermediator of loans and guarantees' }
|
||||
|
||||
];
|
||||
|
||||
const licenseBlocks = $('div.article-content-container').children('div.license-container');
|
||||
|
||||
licenseBlocks.each( (i, item) => {
|
||||
let license = {};
|
||||
|
||||
for(const cItem of contentArray)
|
||||
if ($(item).find(cItem.find).index() !== -1)
|
||||
license = this.extractEntityDetailLicense(item, cItem.blockType);
|
||||
|
||||
if ($(item).find('h3.license-unit-label:contains("Cross-border services/classes")').index() !== -1)
|
||||
license.crossBorder = this.extractEntityDetailCrossBorder(item);
|
||||
|
||||
output.push(license);
|
||||
});
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetails(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
const { name, id } = serviceObject.links[serviceObject.step];
|
||||
|
||||
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
|
||||
|
||||
await this.page.waitForSelector('h1.common-header-text', { 'visible':true, 'timeout':7500 });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(name.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_'), `_${id}`].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._randomWait(this.page, 5, 7);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
// --
|
||||
|
||||
const details = await this.extractEntityDetails(body);
|
||||
|
||||
const licenses = await this.entityContentSniffer(body);
|
||||
|
||||
// --
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details, licenses });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
switch (pageUrl.pathname) {
|
||||
|
||||
case '/en/finanstilsynets-registry/':
|
||||
await this.indexRedirector();
|
||||
break;
|
||||
|
||||
case '/en/finanstilsynets-registry/detail/':
|
||||
await this.processRedirector();
|
||||
break;
|
||||
case '/en/our-registers/company-register/gransoverskridandehandel/':
|
||||
await this.crossBorderRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('entityComplete', () => {
|
||||
this.handleEntityComplete();
|
||||
});
|
||||
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
let newUrl;
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
|
||||
this.paymentServices.indexStep++;
|
||||
if (this.paymentServices.indexStep >= this.paymentServices.wanted.length)
|
||||
newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
|
||||
else
|
||||
newUrl = this.paymentServices.urls[0];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
let newUrl;
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
|
||||
this.emoneyServices.indexStep++;
|
||||
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length)
|
||||
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
else
|
||||
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
let newUrl;
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
|
||||
this.creditServices.indexStep++;
|
||||
if (this.creditServices.indexStep >= this.creditServices.urls.length)
|
||||
newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
else
|
||||
newUrl = this.creditServices.urls[this.creditServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('indexdone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
logger.warn('paymentServicesDone');
|
||||
try{
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
logger.warn('emoneyServicesDone');
|
||||
try{
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
logger.warn('creditServicesDone');
|
||||
try{
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.rootURI = 'https://www.finanstilsynet.no';
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
|
||||
'wanted' : ['Payment institution', 'Agent of payment institution (company)', 'Payment service provider with a limited authorisat'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
|
||||
'wanted' : ['E-money institution'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
|
||||
'wanted' : ['Bank', 'Branch of foreign credit institution', 'Credit Institution', 'Savings bank foundation'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/NO/FS`));
|
||||
|
||||
// await this._doNonRepudiation();
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
//
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5, 'Startup');
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = NOScrape;
|
1384
ncas/pl.js
Normal file
1384
ncas/pl.js
Normal file
File diff suppressed because it is too large
Load Diff
1022
ncas/plR.js
Normal file
1022
ncas/plR.js
Normal file
File diff suppressed because it is too large
Load Diff
513
ncas/pt.js
Normal file
513
ncas/pt.js
Normal file
@ -0,0 +1,513 @@
|
||||
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
const logger = require('log4js').getLogger('PT');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class PTScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'PT';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractEntityDetails(html) {
|
||||
try {
|
||||
const details = {};
|
||||
const detailSequence = [
|
||||
|
||||
['field-name-field-tipo-ent-aut', 'institutionType'],
|
||||
|
||||
['field-name-field-estado-ent', 'state'],
|
||||
|
||||
['field-name-field-morada', 'address'],
|
||||
|
||||
['field-name-field-localidade', 'firstName'],
|
||||
|
||||
['field-name-field-cod-postal', 'postcode'],
|
||||
|
||||
['field-name-field-pais', 'country'],
|
||||
|
||||
['field-name-field-data-limite', 'beginningOfActivity'],
|
||||
|
||||
['field-name-field-capital-subscrito', 'subscribedCapital'],
|
||||
|
||||
['field-name-field-capital-realizado', 'paidUpCapital'],
|
||||
|
||||
['field-name-field-jel', 'institutionCodeNumber']
|
||||
|
||||
];
|
||||
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
details.name = this._cleanUp($('h1.page-title').text()) ;
|
||||
|
||||
const mainDiv = $('div.content');
|
||||
|
||||
for(const item of detailSequence) {
|
||||
const i = $(mainDiv).find(`.${item[0]} div.field-items`);
|
||||
|
||||
details[item[1]] = this._cleanUp($(i).text());
|
||||
}
|
||||
|
||||
return details;
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processIndex(serviceObject) {
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
if ($('div.view-empty').length > 0) {
|
||||
// We have reached an empty page, so we assume we've scraped all links from this index
|
||||
this.emit('indexdone');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const links = $('div.views-field.views-field-title > span > a');
|
||||
|
||||
links.each((i, item) => {
|
||||
const href = $(item).attr('href');
|
||||
const text = $(item).text();
|
||||
|
||||
const newUrl = `https://www.bportugal.pt${href}`;
|
||||
const id = this._makeFieldName(text);
|
||||
|
||||
serviceObject.links.push({ 'name':text, 'href':newUrl, 'id':id });
|
||||
});
|
||||
|
||||
const filename = this.modeNames[this.mode];
|
||||
|
||||
const parsedUrl = url.parse(this.page.url(), true);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${parsedUrl.query.page}`, null);
|
||||
|
||||
parsedUrl.query.page++;
|
||||
parsedUrl.search = undefined; // Forces parsedUrl to use `query` property, as modified on line above
|
||||
const nextPage = url.format(parsedUrl);
|
||||
|
||||
await this._goto(nextPage);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
// We have stopped using the "view all" button due to it breaking.
|
||||
// Leaving the code below commented in case it is ever useful in future.
|
||||
|
||||
// await this.page.waitForSelector('#block-system-main > div > div > div.view-content-wrapper > ul > li.pager__item.pager__item_all', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
// logger.debug('Extend menu list..');
|
||||
// await elm.click({ 'delay':90 });
|
||||
// }).catch(() => {
|
||||
// logger.info('No show all button');
|
||||
// });
|
||||
|
||||
await this._randomWait(this.page, 6, 9);
|
||||
|
||||
await this.processIndex(serviceObject);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
logger.debug('>> indexRedirector');
|
||||
let doIndex = false;
|
||||
await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
logger.warn('Sent back to the main selector screen');
|
||||
await elm.click({ 'delay':90 });
|
||||
|
||||
doIndex = false;
|
||||
}).catch(() => {
|
||||
// logger.info('No show all button');
|
||||
doIndex = true;
|
||||
});
|
||||
|
||||
if (doIndex)
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetails(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
|
||||
const { name, id } = serviceObject.links[serviceObject.step];
|
||||
// const id = serviceObject.links[serviceObject.step].id;
|
||||
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
|
||||
|
||||
// 'h1.page-title'
|
||||
|
||||
await this.page.waitForSelector('h1.page-title', { 'visible':true, 'timeout':7500 });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const details = await this.extractEntityDetails(body);
|
||||
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
const pathSplitter = /(\/en\/.+?\/)/;
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const splitPath = pageUrl.pathname.match(pathSplitter);
|
||||
|
||||
const pathname = splitPath[0];
|
||||
|
||||
switch (pathname) {
|
||||
|
||||
case '/en/entidades-autorizadas/':
|
||||
await this.indexRedirector();
|
||||
break;
|
||||
|
||||
case '/en/entidadeautorizada/':
|
||||
await this.processRedirector();
|
||||
break;
|
||||
case '/en/our-registers/company-register/gransoverskridandehandel/':
|
||||
await this.crossBorderRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
async attachEvents() {
|
||||
this.on('entityComplete', () => {
|
||||
this.handleEntityComplete();
|
||||
});
|
||||
|
||||
this.on('serviceDone', async function() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
//
|
||||
|
||||
this.on('psindexdone', async () => {
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
|
||||
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emindexdone', async () => {
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
|
||||
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('ciindexdone', async () => {
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
|
||||
const newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('indexdone', async function() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async function() {
|
||||
logger.warn('paymentServicesDone');
|
||||
try{
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async function() {
|
||||
logger.warn('emoneyServicesDone');
|
||||
try{
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async function() {
|
||||
logger.warn('creditServicesDone');
|
||||
try{
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
logger.debug(this.eventNames());
|
||||
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/75/all?page=0'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/72/all?page=0'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/67-68-1524-69/all?page=0'],
|
||||
'sections' : [],
|
||||
'sectionLinks' : []
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/PT/BP`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
await this._makeResponsive();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage();
|
||||
}, 5000));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
//
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = PTScrape;
|
569
ncas/se.js
Normal file
569
ncas/se.js
Normal file
@ -0,0 +1,569 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const removeAccents = require('remove-accents-diacritics');
|
||||
const logger = require('log4js').getLogger('SE');
|
||||
const url = require('url');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class SEScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.setID('SE');
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._debounce(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<{authorization: Array, details}>}
|
||||
*/
|
||||
async extractEntity(html) {
|
||||
const $ = cheerio.load(html);
|
||||
const details = {};
|
||||
const authorization = [];
|
||||
|
||||
details.name = this._cleanUp($('h2').text());
|
||||
|
||||
const dlCells = $('dl.funky').children();
|
||||
const ulCells = $('ul.tillstand').children();
|
||||
|
||||
let current = '';
|
||||
dlCells.each((index, item) => {
|
||||
const itemText = this._cleanUp($(item).text());
|
||||
if (item.name === 'dt') {
|
||||
details[itemText] = [];
|
||||
current = itemText;
|
||||
}
|
||||
else
|
||||
details[current].push(itemText);
|
||||
});
|
||||
|
||||
ulCells.each((index, item) => {
|
||||
const date = this._cleanUp($(item.children).eq(0).text()) ;
|
||||
const text = this._cleanUp($(item.children).eq(1).text()) ;
|
||||
|
||||
authorization.push({ date, text, 'translated':this._translate(text) });
|
||||
});
|
||||
|
||||
return { details, authorization };
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetails(serviceObject) {
|
||||
const noWhiteSpace = /\W/g;
|
||||
const id = serviceObject.links[serviceObject.step].id;
|
||||
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.page.waitForSelector('h1').catch((e) => {
|
||||
throw e;
|
||||
});
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
const $ = cheerio.load(body);
|
||||
const details = await this.extractEntity(body);
|
||||
|
||||
const crossBorderExists = $('div.container a.link');
|
||||
|
||||
if (crossBorderExists.length !== 0) {
|
||||
serviceObject.links[serviceObject.step].data = { details };
|
||||
await this._findAndClick('div.container a.link', 'View cross border services');
|
||||
}
|
||||
else {
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param html
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async extractCrossBorderServices(html) {
|
||||
const services = {};
|
||||
const $ = cheerio.load(html);
|
||||
|
||||
const rows = $('div.container table tbody tr');
|
||||
|
||||
let current = '';
|
||||
|
||||
rows.each((index, item) => {
|
||||
if ($(item).children().length === 1) {
|
||||
// this is a heading...
|
||||
const itemText = this._cleanUp($(item).text());
|
||||
services[itemText] = { 'authorization': [], 'translated': this._translate(itemText) };
|
||||
current = itemText;
|
||||
}
|
||||
else {
|
||||
const date = this._cleanUp($(item.children).eq(0).text()) ;
|
||||
const text = this._cleanUp($(item.children).eq(1).text()) ;
|
||||
const translated = this._translate(text);
|
||||
|
||||
services[current].authorization.push({ date, text, translated });
|
||||
}
|
||||
});
|
||||
|
||||
return services;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processCrossBorderServicesV2(serviceObject) {
|
||||
try{
|
||||
const noWhiteSpace = /\W/g;
|
||||
const id = serviceObject.links[serviceObject.step].id;
|
||||
logger.info('Process CBS entity:', id);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const entity = removeAccents.remove(id.trim());
|
||||
|
||||
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.page.waitForSelector('h1').catch((e) => {
|
||||
throw e;
|
||||
});
|
||||
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_crossborder`, null);
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const crossBorderServices = await this.extractCrossBorderServices(body);
|
||||
|
||||
const details = serviceObject.links[serviceObject.step].data;
|
||||
|
||||
serviceObject.links[serviceObject.step].data = null;
|
||||
|
||||
await jsonfile.writeFile(`${filePath}.json`, { details, crossBorderServices });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.items) {
|
||||
const newUrl = serviceObject.links[serviceObject.step].href;
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
catch( err) {
|
||||
logger.error(err);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async buildIndex(serviceObject) {
|
||||
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
||||
|
||||
// await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await this.page.waitForSelector('#institut', { 'visible':true });
|
||||
|
||||
const links = await this.page.$$('#institut > tbody > tr > td > a');
|
||||
|
||||
for (const item of links) {
|
||||
// logger.debug(item);
|
||||
const id = await this.page.evaluate(el => el.innerText, item);
|
||||
let href = await this.page.evaluate(el => el.href, item);
|
||||
|
||||
href = href.concat('&locale=en_GB');
|
||||
|
||||
serviceObject.links.push({ id, href });
|
||||
}
|
||||
|
||||
serviceObject.items = serviceObject.links.length;
|
||||
|
||||
serviceObject.indexStep++;
|
||||
this.emit('indexdone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async indexRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.buildIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.buildIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.buildIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processEntityDetails(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetails(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetails(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async crossBorderRedirector() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
await this.processCrossBorderServicesV2(this.paymentServices);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
await this.processCrossBorderServicesV2(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processCrossBorderServicesV2(this.creditServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
switch (pageUrl.pathname) {
|
||||
|
||||
case '/en/our-registers/company-register/':
|
||||
await this.indexRedirector();
|
||||
break;
|
||||
|
||||
case '/en/our-registers/company-register/details':
|
||||
await this.processRedirector();
|
||||
break;
|
||||
case '/en/our-registers/company-register/gransoverskridandehandel/':
|
||||
await this.crossBorderRedirector();
|
||||
break;
|
||||
|
||||
default:
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('indexdone', async function() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('psindexdone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emindexdone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('ciindexdone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('serviceDone', async function() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('psindexdone', async function() {
|
||||
if (this.paymentServices.indexStep < this.paymentServices.urls.length) {
|
||||
const newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('startProcessingPaymentServices');
|
||||
});
|
||||
|
||||
this.on('startProcessingPaymentServices', async function() {
|
||||
this.paymentServices.items = this.paymentServices.links.length;
|
||||
logger.info(`${this.paymentServices.items} items indexed`);
|
||||
// logger.debug(this.paymentServices.links);
|
||||
|
||||
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async function() {
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
});
|
||||
|
||||
// emoney Services
|
||||
|
||||
this.on('emindexdone', async function() {
|
||||
if (this.emoneyServices.indexStep < this.emoneyServices.urls.length) {
|
||||
const newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('startProcessingEMoneyServices');
|
||||
});
|
||||
|
||||
this.on('startProcessingEMoneyServices', async function() {
|
||||
this.emoneyServices.items = this.emoneyServices.links.length;
|
||||
logger.info(`${this.emoneyServices.items} items indexed`);
|
||||
// logger.debug(this.emoneyServices.links);
|
||||
|
||||
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async function() {
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
|
||||
this.mode++;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
});
|
||||
|
||||
// credit services
|
||||
this.on('ciindexdone', async function() {
|
||||
if (this.creditServices.indexStep < this.creditServices.urls.length) {
|
||||
const newUrl = this.creditServices.urls[this.creditServices.indexStep];
|
||||
|
||||
await this._goto(newUrl);
|
||||
}
|
||||
else
|
||||
this.emit('startProcessingcreditServices');
|
||||
});
|
||||
|
||||
this.on('startProcessingcreditServices', async function() {
|
||||
this.creditServices.items = this.creditServices.links.length;
|
||||
logger.info(`${this.creditServices.items} items indexed`);
|
||||
// logger.debug(this.creditServices.links);
|
||||
|
||||
const newUrl = this.creditServices.links[this.creditServices.step].href;
|
||||
|
||||
await this._goto(newUrl);
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async function() {
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
|
||||
this.emit('done');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
await this._loadDictionary();
|
||||
|
||||
this.mode = 0;
|
||||
|
||||
this.modeTitles = ['**Payment Service', 'EMoney', 'Credit Services'];
|
||||
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BET&area=#results'/* ,
|
||||
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BETREG&area=#results'*/]
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=EINST&area=#results',
|
||||
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=REGUTG&area=#results']
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=BANK&area=#results',
|
||||
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=MBANK&area=#results',
|
||||
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=SPAR&area=#results']
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
|
||||
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/SE/FI`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser(true);
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }).catch((err) => {
|
||||
logger.error(err);
|
||||
});
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = SEScrape;
|
833
ncas/sk.js
Normal file
833
ncas/sk.js
Normal file
@ -0,0 +1,833 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('SK');
|
||||
const url = require('url');
|
||||
const camelCase = require('camelcase');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
class SKScrape extends Scraper {
|
||||
|
||||
constructor() {
|
||||
super();
|
||||
this.id = 'SK';
|
||||
|
||||
this.on('done', () => {
|
||||
this._done();
|
||||
});
|
||||
|
||||
this.run = this._throttle(async () => {
|
||||
await this.__run();
|
||||
}, 5000);
|
||||
|
||||
if (process.env.NODE_ENV === 'production')
|
||||
this._checkLock().then((l) => {
|
||||
if(l)
|
||||
this.run();
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
async checkChangeLanguage() {
|
||||
const languageIcon = await this.page.$$('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a > img');
|
||||
|
||||
if (languageIcon.length > 0) {
|
||||
const value = await this.page.evaluate(el => el.getAttribute('src'), languageIcon[0]);
|
||||
|
||||
if (value === '/static/icon/ico_en.gif') {
|
||||
// this needs a click
|
||||
logger.info('Changing language to English..');
|
||||
await this._findAndClick('#SubjectForm > div > div.panel-heading.sufit > table > tbody > tr > td:nth-child(2) > h3 > span > a ');
|
||||
|
||||
return true;
|
||||
//
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleIntroPage() {
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
// Clear cookie bar
|
||||
await this.page.waitForSelector('a.btnCookieAccept', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
}).catch(() => {
|
||||
logger.info('No cookie bar');
|
||||
});
|
||||
|
||||
if (!this.inProgress && pageUrl.query === null) {
|
||||
// fix language before going on
|
||||
|
||||
const changedLanguage = await this.checkChangeLanguage();
|
||||
|
||||
if (!changedLanguage) {
|
||||
await this._randomWait(this.page, 3, 5, 'handleIntroPage');
|
||||
|
||||
await this._findAndClick(' body > div.container > div:nth-child(5) > div:nth-child(1) > div > div');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processMainMenu(serviceObject) {
|
||||
const wantedItem = serviceObject.sections[serviceObject.indexStep];
|
||||
|
||||
const expandables = ['#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl1',
|
||||
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl2',
|
||||
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl3',
|
||||
'#Categories > tbody:nth-child(4) > tr.level0.categctrl.categctrl4'
|
||||
];
|
||||
|
||||
for (const item of expandables)
|
||||
await this.page.$eval(item, e => e.click({ 'delay':90 }));
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const wantedRow = `[data-sector="${wantedItem}"]`;
|
||||
|
||||
logger.debug('Looking for', wantedRow);
|
||||
|
||||
await this.page.waitForSelector(wantedRow, { 'visible':true, 'timeout':7500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
}).catch(() => {
|
||||
logger.warn('processMainMenu did not find what it was looking for!');
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async entityIndexFirstPass(serviceObject) {
|
||||
// breaks up `Showing 1 to 10 of 12 entries`
|
||||
const breaker = /(\d+)/g;
|
||||
|
||||
const body = await this.page.content();
|
||||
|
||||
const $ = cheerio.load(body);
|
||||
|
||||
const subjectsInfo = $('#Subjects_info').text();
|
||||
|
||||
const brokenString = subjectsInfo.match(breaker);
|
||||
|
||||
const currentPageIndex = parseInt(brokenString[0], 10);
|
||||
const currentPageMax = parseInt(brokenString[1], 10);
|
||||
|
||||
// The site returns the index from the last page when you select a different view.
|
||||
// This should be watched and can cause a problem
|
||||
|
||||
logger.debug('subjectsInfo', subjectsInfo);
|
||||
logger.debug('Step', serviceObject.step);
|
||||
logger.debug('currentPageIndex', currentPageIndex);
|
||||
|
||||
if (((currentPageIndex <= currentPageMax) && (currentPageIndex === (serviceObject.step + 1))) || (currentPageIndex === 0 && currentPageMax === 0 )) {
|
||||
serviceObject.currentIndexLength = parseInt(brokenString[2], 10);
|
||||
serviceObject.currentPageMax = currentPageMax;
|
||||
|
||||
serviceObject.visited = true;
|
||||
serviceObject.currentIndex = url.parse(await this.page.url());
|
||||
serviceObject.currentMetaIndex = 0;
|
||||
}
|
||||
else {
|
||||
logger.info('Need to click previous');
|
||||
const nextButton = await this.page.$$('#Subjects_previous');
|
||||
|
||||
const buttonClasses = await this.page.$eval('#Subjects_previous', e => e.getAttribute('class'));
|
||||
|
||||
if (buttonClasses.split(' ').indexOf('disabled') === -1) {
|
||||
// we need a click..
|
||||
nextButton[0].click({ 'delay':90 });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.visited = false;
|
||||
this.emit('entityIndex');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityIndex(serviceObject) {
|
||||
const fields = ['referenceNumber', 'businessName', 'address', 'start', 'end', 'reason'];
|
||||
|
||||
const mouseDownDuration = Scraper.notARobot();
|
||||
if (serviceObject.visited === false) {
|
||||
logger.debug('Preparing...');
|
||||
|
||||
await this.page.waitForSelector('table#Subjects', { 'visible':true }).then(async () => {
|
||||
await this.entityIndexFirstPass(serviceObject);
|
||||
}).catch(() => {
|
||||
logger.error('Table failed to render');
|
||||
});
|
||||
}
|
||||
|
||||
if (serviceObject.visited === true) {
|
||||
serviceObject.currentMetaIndex = serviceObject.step % 10;
|
||||
|
||||
if ((serviceObject.step ) >= serviceObject.currentPageMax) {
|
||||
const nextButton = await this.page.$$('#Subjects_next');
|
||||
|
||||
const buttonClasses = await this.page.$eval('#Subjects_next', e => e.getAttribute('class'));
|
||||
|
||||
if (buttonClasses.split(' ').indexOf('disabled') === -1) {
|
||||
// we need a click..
|
||||
nextButton[0].click({ 'delay':mouseDownDuration });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.visited = false;
|
||||
this.emit('entityIndex');
|
||||
}
|
||||
else {
|
||||
logger.debug('I think we are done here...');
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
}
|
||||
|
||||
else {
|
||||
await this.page.waitForSelector('#Subjects > tbody');
|
||||
|
||||
const wantedRow = await this.page.$$(`#Subjects > tbody > tr:nth-child(${serviceObject.currentMetaIndex + 1})`);
|
||||
const htmlRow = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);
|
||||
|
||||
const $ = cheerio.load(`<table>${htmlRow}</table>`);
|
||||
|
||||
const cells = $('td');
|
||||
|
||||
serviceObject.current = {};
|
||||
|
||||
cells.each((index, item) => {
|
||||
serviceObject.current[ fields[index] ] = $(item).text();
|
||||
});
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
await wantedRow[0].click({ 'delay':mouseDownDuration });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetailBasicDetails($) {
|
||||
const newObj = {};
|
||||
|
||||
const rows = $('tr');
|
||||
|
||||
rows.each((index, elm) => {
|
||||
const children = $(elm).children();
|
||||
|
||||
const preLabel = $(children).eq(0).text();
|
||||
const label = camelCase(this._cleanUp(preLabel.replace(':', '')));
|
||||
|
||||
newObj[label] = this._cleanUp($(children).eq(1).text());
|
||||
});
|
||||
|
||||
return newObj;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @param elm
|
||||
*/
|
||||
decodeTable($, elm) {
|
||||
const rows = $(elm).find('table.details tr');
|
||||
const obj = {};
|
||||
|
||||
rows.each( (index, elm) => {
|
||||
const children = $(elm).children();
|
||||
|
||||
const labelClass = $(children[0]).attr('class');
|
||||
const label = camelCase(this._cleanUp($(children[0]).text().replace(':', '').replace(',', '')));
|
||||
|
||||
const contents = this._cleanUp($(children[1]).text().replace(/(Hide|View)\s*/, ''));
|
||||
|
||||
if (typeof(labelClass) !== 'undefined' && labelClass === 'dlabel')
|
||||
obj[label] = contents;
|
||||
});
|
||||
|
||||
return obj;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param $
|
||||
* @returns {Promise<Array>}
|
||||
*/
|
||||
async processEntityDetailTableV2($) {
|
||||
// take the first tbody as this is the main one...
|
||||
const fields = [ 'license', 'start', 'end', 'reason'];
|
||||
const outData = [];
|
||||
let newObj = {};
|
||||
|
||||
let topLevel = '';
|
||||
let midLevel = {};
|
||||
let level1ID = '';
|
||||
|
||||
const tbody = $('tbody')[0];
|
||||
const children = $(tbody).children();
|
||||
|
||||
children.each((index, item) => {
|
||||
const itemClasses = $(item).attr('class').split(' ');
|
||||
if ((itemClasses.indexOf('level0') !== -1) && (itemClasses.indexOf('sublicctrl') !== -1)) {
|
||||
// TOP LEVEL
|
||||
const itemChildren = $(item).children();
|
||||
|
||||
if (Object.keys(newObj).length !== 0) {
|
||||
// push this object into the list
|
||||
outData.push(newObj);
|
||||
newObj = {};
|
||||
}
|
||||
|
||||
topLevel = camelCase(this._cleanUp($(itemChildren[0]).text().replace(',', '')));
|
||||
midLevel = {};
|
||||
|
||||
itemChildren.each((ci, celm) => {
|
||||
midLevel[fields[ci]] = this._cleanUp($(celm).text());
|
||||
});
|
||||
|
||||
midLevel.detail = [];
|
||||
newObj[topLevel] = Object.assign({}, midLevel);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
if ((itemClasses.indexOf('level0') !== -1) && (itemClasses.indexOf('details') !== -1))
|
||||
// TOP LEVEL - DETAILS
|
||||
newObj[topLevel].detail.push(this.decodeTable($, item));
|
||||
|
||||
//
|
||||
|
||||
if ((itemClasses.indexOf('level1') !== -1) && (itemClasses.indexOf('details') === -1)) {
|
||||
// LEVEL 1
|
||||
const itemChildren = $(item).children();
|
||||
level1ID = camelCase(this._cleanUp($(itemChildren[0]).text()));
|
||||
|
||||
newObj[topLevel][level1ID] = [];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
if ((itemClasses.indexOf('level1') !== -1) && (itemClasses.indexOf('details') !== -1)) {
|
||||
// LEVEL 1 - DETAIL
|
||||
|
||||
const table = this.decodeTable($, item);
|
||||
|
||||
newObj[topLevel][level1ID].push(table);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
if ((itemClasses.indexOf('level2') !== -1) && (itemClasses.indexOf('details') === -1)) {
|
||||
// LEVEL 2
|
||||
const itemChildren = $(item).children();
|
||||
const obj = {};
|
||||
|
||||
itemChildren.each((ci, celm) => {
|
||||
obj[fields[ci]] = this._cleanUp($(celm).text());
|
||||
});
|
||||
|
||||
const nexttable = $(item).next();
|
||||
|
||||
obj.details = this.decodeTable($, nexttable);
|
||||
|
||||
if (level1ID === '') {
|
||||
const newID = camelCase(this._cleanUp(obj.license.replace(',', '')));
|
||||
newObj[topLevel][newID] = [];
|
||||
newObj[topLevel][newID].push(obj);
|
||||
}
|
||||
|
||||
else {
|
||||
if (!newObj[topLevel].hasOwnProperty(level1ID))
|
||||
newObj[topLevel][level1ID] = [];
|
||||
|
||||
newObj[topLevel][level1ID].push(obj);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// insert final obj
|
||||
if (Object.keys(newObj).length !== 0) {
|
||||
// push this object into the list
|
||||
outData.push(newObj);
|
||||
newObj = {};
|
||||
}
|
||||
|
||||
return outData;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processEntityDetail(serviceObject) {
|
||||
// level0 sublicctrl sublicctrl1 odd
|
||||
// level0 sublicctrl sublicctrl1 odd sublicshow shown
|
||||
|
||||
// expand all accordians
|
||||
|
||||
const rows = await this.page.$$('tr.sublicctrl');
|
||||
|
||||
for (const item of rows) {
|
||||
const cls = await this.page.evaluate(el => el.getAttribute('class'), item);
|
||||
if (!cls.includes('shown'))
|
||||
|
||||
await item.click({ 'delay':Scraper.notARobot() });
|
||||
}
|
||||
|
||||
await this.page.waitForSelector('#Licenses > tbody > tr.level1.shown.sublichide1.sllhidectrl.sllhidectrl1', { 'timeout':7500 }).then(async (elm) => {
|
||||
await elm.click({ 'delay':Scraper.notARobot() });
|
||||
}).catch(() => {
|
||||
logger.debug('No License information');
|
||||
});
|
||||
|
||||
await this._microWait(this.page, 5);
|
||||
|
||||
// expand all viewable anchors
|
||||
const wantedAnchors = await this.page.$$('.row a');
|
||||
|
||||
for (const item of wantedAnchors) {
|
||||
const exItem = this._cleanUp(await this.page.evaluate(el => el.text, item));
|
||||
|
||||
if (exItem === 'View')
|
||||
await item.click({ 'delay': Scraper.notARobot() }).catch((e) => {
|
||||
logger.debug('View click failed', e);
|
||||
});
|
||||
}
|
||||
|
||||
const entityName = `${serviceObject.current.businessName}_${serviceObject.current.referenceNumber}`;
|
||||
const fileName = this._makeFileName(entityName);
|
||||
const filePath = await this._makeFilePath(entityName);
|
||||
|
||||
serviceObject.current.fileName = fileName;
|
||||
|
||||
await this._randomWait(this.page, 2, 2);
|
||||
await this.page.focus('h3.page-header');
|
||||
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
||||
|
||||
await this.page.waitForSelector('body > div.container > form.form-horizontal > table', { 'timeout':7500 }).then(async (elm) => {
|
||||
logger.debug('prep for processEntityDetailBasicDetails');
|
||||
|
||||
const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
|
||||
|
||||
const $ = cheerio.load(htmlBlock);
|
||||
|
||||
serviceObject.current.basicDetails = await this.processEntityDetailBasicDetails($);
|
||||
});
|
||||
|
||||
await this.page.waitForSelector('#Licenses').then(async (elm) => {
|
||||
logger.debug('prep for processEntityDetailTableV2');
|
||||
|
||||
const htmlBlock = await this.page.evaluate(el => el.outerHTML, elm);
|
||||
|
||||
const $ = cheerio.load(htmlBlock);
|
||||
|
||||
serviceObject.current.entityDetails = await this.processEntityDetailTableV2($);
|
||||
});
|
||||
|
||||
this.entityCompleter(serviceObject);
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param serviceObject
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async entityCompleter(serviceObject) {
|
||||
const filename = serviceObject.current.fileName;
|
||||
|
||||
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
||||
|
||||
logger.info(`Saving: ${filename}.json`);
|
||||
|
||||
const newLink = { 'referenceNumber':serviceObject.current.referenceNumber, 'businessName':serviceObject.current.businessName, 'fileName':`${filename}.json` };
|
||||
|
||||
serviceObject.links.push(newLink);
|
||||
|
||||
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
serviceObject.step++;
|
||||
|
||||
if (serviceObject.step < serviceObject.currentIndexLength) {
|
||||
serviceObject.current = {};
|
||||
|
||||
await this.page.goBack({ 'waitUntil':'networkidle0' });
|
||||
}
|
||||
else
|
||||
this.emit('serviceDone');
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleMainIndex() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
await this.processMainMenu(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processMainMenu(this.creditServices);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
await this.processMainMenu(this.paymentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleEntityIndex() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
await this.processEntityIndex(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityIndex(this.creditServices);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
await this.processEntityIndex(this.paymentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async handleEntityDetail() {
|
||||
switch (this.mode) {
|
||||
|
||||
case 1:
|
||||
await this.processEntityDetail(this.emoneyServices);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
await this.processEntityDetail(this.creditServices);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
default:
|
||||
await this.processEntityDetail(this.paymentServices);
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async processNewPage() {
|
||||
// give the page a few seconds to settle
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
|
||||
const pageUrl = url.parse(await this.page.url());
|
||||
|
||||
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
||||
logger.warn('Directed to: chrome-error://chromewebdata/');
|
||||
this.emit('recover');
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
const params = Object.assign({ 'aa': '' }, this._getParamsFromUrl(pageUrl.search));
|
||||
|
||||
switch (params.aa) {
|
||||
|
||||
case '':
|
||||
await this.handleIntroPage();
|
||||
break;
|
||||
|
||||
case 'select_sector':
|
||||
await this.handleMainIndex();
|
||||
break;
|
||||
|
||||
case 'select_categ':
|
||||
await this.handleEntityIndex();
|
||||
break;
|
||||
case 'select_subject':
|
||||
await this.handleEntityDetail();
|
||||
break;
|
||||
|
||||
default:
|
||||
if (process.env.NODE_ENV) {
|
||||
await this._uploadError();
|
||||
throw new Error(`Unknown page: ${pageUrl}`);
|
||||
}
|
||||
else {
|
||||
logger.warn('processNewPage Fell through');
|
||||
logger.warn('currentPage.location', pageUrl);
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async attachEvents() {
|
||||
this.on('entityComplete', () => {
|
||||
this.handleEntityComplete();
|
||||
});
|
||||
|
||||
this.on('serviceDone', async () => {
|
||||
switch (this.mode) {
|
||||
|
||||
case 0:
|
||||
this.emit('paymentServicesDone');
|
||||
break;
|
||||
|
||||
case 1:
|
||||
this.emit('emoneyServicesDone');
|
||||
break;
|
||||
|
||||
case 2:
|
||||
this.emit('creditServicesDone');
|
||||
break;
|
||||
|
||||
}
|
||||
});
|
||||
|
||||
this.on('entityIndex', async () => {
|
||||
await this.handleEntityIndex();
|
||||
});
|
||||
|
||||
this.on('paymentServicesDone', async () => {
|
||||
try{
|
||||
this.paymentServices.indexStep++;
|
||||
if (this.paymentServices.indexStep < this.paymentServices.sections.length) {
|
||||
this.paymentServices.visited = false;
|
||||
this.paymentServices.step = 0;
|
||||
await this._goto(this.paymentServices.urls[1]);
|
||||
}
|
||||
else {
|
||||
this.paymentServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('emoneyServicesDone', async () => {
|
||||
try{
|
||||
this.emoneyServices.indexStep++;
|
||||
|
||||
if (this.emoneyServices.indexStep < this.emoneyServices.sections.length) {
|
||||
this.emoneyServices.visited = false;
|
||||
this.emoneyServices.step = 0;
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
else {
|
||||
this.emoneyServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
await this._goto(this.emoneyServices.urls[0]);
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
|
||||
this.on('creditServicesDone', async () => {
|
||||
try{
|
||||
this.creditServices.indexStep++;
|
||||
|
||||
if (this.creditServices.indexStep < this.creditServices.sections.length) {
|
||||
this.creditServices.visited = false;
|
||||
this.creditServices.step = 0;
|
||||
await this._goto(this.creditServices.urls[0]);
|
||||
}
|
||||
else {
|
||||
this.creditServices.done = true;
|
||||
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
|
||||
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
||||
|
||||
this.mode++;
|
||||
this.inProgress = false;
|
||||
|
||||
this.emit('done');
|
||||
}
|
||||
}
|
||||
catch (e) {
|
||||
logger.error(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Initite the process
|
||||
* @returns {Promise<void>}
|
||||
*/
|
||||
async start() {
|
||||
super._start();
|
||||
try {
|
||||
this.mode = 0;
|
||||
|
||||
this.inProgress = false;
|
||||
|
||||
/*
|
||||
|
||||
Swapping sections from text to
|
||||
data-sector ids.
|
||||
document.querySelector('[data-sector="156"]')
|
||||
|
||||
Payment Services:
|
||||
Payment Institutions and Branches of Foreign Payment Institutions // 9
|
||||
Providing Payment Services in Limited Scope // 11
|
||||
Account information service providers // 156
|
||||
|
||||
eMoney Services:
|
||||
E-Money Institutions and Branches of Foreign E-Money Institutions // 12
|
||||
E-Money Institutions Based in Slovakia // 37
|
||||
|
||||
credit Services:
|
||||
Banks Authorised to Provide Investment Services // 5
|
||||
Banks Based in Slovakia // 19
|
||||
|
||||
*/
|
||||
this.paymentServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://subjekty.nbs.sk/', 'https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
|
||||
'sections' : [9, 11, 156],
|
||||
'sectionStep': 0,
|
||||
'currentIndexLength' : 0,
|
||||
'sectionLinks' : [],
|
||||
'currentIndex' :'',
|
||||
'currentMetaIndex' : 0
|
||||
};
|
||||
|
||||
this.emoneyServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
|
||||
'sections' : [12, 37],
|
||||
'sectionStep': 0,
|
||||
'currentIndexLength' : 0,
|
||||
'sectionLinks' : [],
|
||||
'currentIndex' :'',
|
||||
'currentMetaIndex' : 0
|
||||
};
|
||||
|
||||
this.creditServices = {
|
||||
'items': 0,
|
||||
'links': [],
|
||||
'step': 0,
|
||||
'indexStep': 0,
|
||||
'visited': false,
|
||||
'done' : false,
|
||||
'searchDone' : false,
|
||||
'started': false,
|
||||
'urls': ['https://subjekty.nbs.sk/?aa=select_sector&bb=2&cc=&qq='],
|
||||
'sections' : [5, 19],
|
||||
'sectionStep': 0,
|
||||
'currentIndexLength' : 0,
|
||||
'sectionLinks' : [],
|
||||
'currentIndex' :'',
|
||||
'currentMetaIndex' : 0
|
||||
};
|
||||
|
||||
this.startPage = this.paymentServices.urls[0];
|
||||
this.emoneyUrl = this.emoneyServices.urls[0];
|
||||
this.credit = this.creditServices.urls[0];
|
||||
|
||||
this.setPath(path.resolve(`${__dirname }/../artefacts/SK/NBS`));
|
||||
|
||||
await this._doNonRepudiation().catch((err) => {
|
||||
logger.warn(err);
|
||||
});
|
||||
|
||||
await this._initBrowser();
|
||||
await this._createBrowserPage();
|
||||
|
||||
this.page.on('domcontentloaded', this._throttle(async () => {
|
||||
this.processNewPage().catch((err) => {
|
||||
logger.error('processNewPage fail', err);
|
||||
});
|
||||
}, 2500));
|
||||
|
||||
if (this.eventNames().length === 2)
|
||||
await this.attachEvents();
|
||||
|
||||
//
|
||||
|
||||
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
||||
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
||||
|
||||
await this._randomWait(this.page, 3, 5);
|
||||
}
|
||||
catch(e) {
|
||||
throw new Error(e);
|
||||
}
|
||||
}
|
||||
|
||||
async __run() {
|
||||
await this.start();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
module.exports = SKScrape;
|
36
nl.js
Normal file
36
nl.js
Normal file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
// TODO:
|
||||
// parse arguments - we should run just 1 FCA per go &
|
||||
// have option to run selected company from selected NCA
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
// load helper libs etc
|
||||
// const Fca = require('./ncas/fca');
|
||||
|
||||
const Netherlands = require('./ncas/nl');
|
||||
|
||||
/**
 * Create the NL scraper, schedule it when NL_CRON holds a cron expression and
 * run it straight away when SCRAPE_START names this scraper's id.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const nlScraper = new Netherlands();
  const schedule = process.env.NL_CRON;

  // the job is created with its start flag set, so it begins ticking immediately
  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await nlScraper.run();
    }, null, true);

  const startNow = process.env.SCRAPE_START === nlScraper.id;

  if (startNow)
    await nlScraper.run();

  console.log('NL Launched');
}
|
||||
|
||||
// Last-resort handler: report the first uncaught exception and mark the process
// as failed so its exit status reflects the error.
process.once('uncaughtException', function caught(err) {
  console.error('Uncaught', err);
  process.exitCode = 1;
});

// run() is async: report a failed launch instead of leaving the rejection unhandled
run().catch((err) => {
  console.error('NL failed to launch', err);
  process.exitCode = 1;
});
|
23
no.js
Normal file
23
no.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Norway = require('./ncas/no');
|
||||
|
||||
/**
 * Create the NO scraper, schedule it when NO_CRON holds a cron expression and
 * run it straight away when SCRAPE_START names this scraper's id.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const noScraper = new Norway();
  const schedule = process.env.NO_CRON;

  // the job is created with its start flag set, so it begins ticking immediately
  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await noScraper.run();
    }, null, true);

  const startNow = process.env.SCRAPE_START === noScraper.id;

  if (startNow)
    await noScraper.run();

  console.log('NO Launched');
}
|
||||
|
||||
run();
|
8848
package-lock.json
generated
Normal file
8848
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
68
package.json
Normal file
68
package.json
Normal file
@ -0,0 +1,68 @@
|
||||
{
|
||||
"name": "obdfcascrape",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "nyc tape tests/**/*.js",
|
||||
"testScrapers": "nyc tape tests/**/scrape.*.js",
|
||||
"testSpecific": "nyc tape tests/scrape.se.js",
|
||||
"testRep": "nyc tape tests/**/rep.*.js",
|
||||
"testfr": "nyc tape tests/fr.js",
|
||||
"cleanup": "rm artefacts/*.{html,json}; rm artefacts/screenshots/*.{jpg,jpeg,png};",
|
||||
"start": "./start.sh",
|
||||
"server": "http-server ./public",
|
||||
"malta": "node mt.js",
|
||||
"debuglogs": "node debuglogs.js"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"archiver": "^2.1.1",
|
||||
"archiver-promise": "^1.0.0",
|
||||
"aws-sdk": "^2.395.0",
|
||||
"camelcase": "^5.0.0",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
"crc": "^3.8.0",
|
||||
"cron": "^1.6.0",
|
||||
"csv": "^3.1.0",
|
||||
"dateformat": "^3.0.3",
|
||||
"del": "^3.0.0",
|
||||
"dotenv": "^6.2.0",
|
||||
"fs-extra": "^7.0.1",
|
||||
"get-ssl-certificate": "^2.3.1",
|
||||
"google-translate-api": "^2.3.0",
|
||||
"hh-mm-ss": "^1.2.0",
|
||||
"jsonfile": "^5.0.0",
|
||||
"log4js": "^3.0.6",
|
||||
"memory": "0.0.3",
|
||||
"moment": "^2.24.0",
|
||||
"node-free": "^1.0.0",
|
||||
"pm2": "^3.5.0",
|
||||
"puppeteer": "^1.14.0",
|
||||
"remove-accents-diacritics": "^1.0.2",
|
||||
"request": "^2.88.0",
|
||||
"tld-extract": "^1.0.1",
|
||||
"underscore": "^1.9.1",
|
||||
"whois": "^2.9.1",
|
||||
"whois-json": "^2.0.4",
|
||||
"yargs": "^12.0.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"deep-diff": "^1.0.2",
|
||||
"gulp": "^3.9.1",
|
||||
"gulp-archiver": "^1.0.0",
|
||||
"gulp-aws-s3": "^1.1.0",
|
||||
"gulp-bump": "^3.1.3",
|
||||
"gulp-changed-in-place": "^2.3.0",
|
||||
"gulp-debug": "^4.0.0",
|
||||
"gulp-gzip": "^1.4.2",
|
||||
"gulp-tar": "^2.1.0",
|
||||
"nyc": "^13.1.0",
|
||||
"static-server": "^2.2.1",
|
||||
"tap-summary": "^4.0.0",
|
||||
"tape": "^4.9.2",
|
||||
"tape-promise": "^3.0.0",
|
||||
"translate-google": "^1.3.5"
|
||||
}
|
||||
}
|
63
package.old
Normal file
63
package.old
Normal file
@ -0,0 +1,63 @@
|
||||
{
|
||||
"name": "obdfcascrape",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "nyc tape tests/**/*.js",
|
||||
"testScrapers": "nyc tape tests/**/scrape.*.js",
|
||||
"testSpecific": "nyc tape tests/scrape.se.js",
|
||||
"testRep": "nyc tape tests/**/rep.*.js",
|
||||
"testfr": "nyc tape tests/fr.js",
|
||||
"cleanup": "rm artefacts/*.{html,json}; rm artefacts/screenshots/*.{jpg,jpeg,png};",
|
||||
"start": "./start.sh",
|
||||
"server": "http-server ./public",
|
||||
"malta": "node mt.js",
|
||||
"debuglogs": "node debuglogs.js"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"archiver": "^2.1.1",
|
||||
"archiver-promise": "^1.0.0",
|
||||
"aws-sdk": "^2.395.0",
|
||||
"camelcase": "^5.0.0",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
"crc": "^3.8.0",
|
||||
"cron": "^1.6.0",
|
||||
"csv": "^3.1.0",
|
||||
"dateformat": "^3.0.3",
|
||||
"del": "^3.0.0",
|
||||
"dotenv": "^6.2.0",
|
||||
"fs-extra": "^7.0.1",
|
||||
"get-ssl-certificate": "^2.3.1",
|
||||
"google-translate-api": "^2.3.0",
|
||||
"hh-mm-ss": "^1.2.0",
|
||||
"jsonfile": "^5.0.0",
|
||||
"log4js": "^3.0.6",
|
||||
"moment": "^2.24.0",
|
||||
"pm2": "^3.2.9",
|
||||
"puppeteer": "1.11.0",
|
||||
"remove-accents-diacritics": "^1.0.2",
|
||||
"request": "^2.88.0",
|
||||
"tld-extract": "^1.0.1",
|
||||
"underscore": "^1.9.1",
|
||||
"whois": "^2.9.1",
|
||||
"whois-json": "^2.0.4",
|
||||
"yargs": "^12.0.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"gulp": "^3.9.1",
|
||||
"gulp-archiver": "^1.0.0",
|
||||
"gulp-aws-s3": "^1.1.0",
|
||||
"gulp-bump": "^3.1.3",
|
||||
"gulp-gzip": "^1.4.2",
|
||||
"gulp-tar": "^2.1.0",
|
||||
"nyc": "^13.1.0",
|
||||
"static-server": "^2.2.1",
|
||||
"tap-summary": "^4.0.0",
|
||||
"tape": "^4.9.2",
|
||||
"tape-promise": "^3.0.0",
|
||||
"translate-google": "^1.3.5"
|
||||
}
|
||||
}
|
23
pl.js
Normal file
23
pl.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Poland = require('./ncas/pl');
|
||||
|
||||
/**
 * Entry point for the Poland scraper process.
 *
 * If PL_CRON is set to a string, a CronJob is created with it (the final
 * `true` argument asks cron to start the job) and every tick awaits
 * `run()` on the scraper. Independently, if SCRAPE_START equals the
 * scraper's id, one scrape is awaited straight away.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Poland();
  const schedule = process.env.PL_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('PL Launched');
}
|
||||
|
||||
run();
|
23
pt.js
Normal file
23
pt.js
Normal file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const Portugal = require('./ncas/pt');
|
||||
|
||||
/**
 * Entry point for the Portugal scraper process.
 *
 * If PT_CRON is set to a string, a CronJob is created with it (the final
 * `true` argument asks cron to start the job) and every tick awaits
 * `run()` on the scraper. Independently, if SCRAPE_START equals the
 * scraper's id, one scrape is awaited straight away.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Portugal();
  const schedule = process.env.PT_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('PT Launched');
}
|
||||
|
||||
run();
|
28
publish.js
Normal file
28
publish.js
Normal file
@ -0,0 +1,28 @@
|
||||
var AWS = require('aws-sdk');
|
||||
var util = require('util');
|
||||
var config = require('./config.json');
|
||||
|
||||
require('dotenv').config({
|
||||
'path': `${__dirname }/.env`
|
||||
});
|
||||
|
||||
// configure AWS
|
||||
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });
|
||||
|
||||
var sns = new AWS.SNS();
|
||||
|
||||
/**
 * Publishes one message to the SNS topic whose ARN is stored in config.json.
 *
 * The call is fire-and-forget: nothing is returned. A '.' is written to
 * stdout for each message that SNS accepted; a failed publish is reported
 * on stderr instead of being counted as a success.
 *
 * @param {string} mesg - message body to publish
 */
function publish(mesg) {
  var publishParams = {
    'TopicArn': config.TopicArn,
    'Message': mesg
  };

  sns.publish(publishParams, function(err) {
    if (err) {
      // Previously the error was ignored and a success dot was printed anyway.
      console.error(`\nSNS publish failed: ${err.message || err}`);

      return;
    }

    process.stdout.write('.');
  });
}
|
||||
|
||||
// Queue 500 numbered test messages ("message: 0" .. "message: 499").
// publish() does not wait for SNS, so all requests are issued back to back.
var messageCount = 500;
var messageIndex = 0;

while (messageIndex < messageCount) {
  publish(`message: ${messageIndex}`);
  messageIndex += 1;
}
|
||||
|
31
se.js
Normal file
31
se.js
Normal file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
// TODO:
|
||||
// parse arguments - we should run just 1 FCA per go &
|
||||
// have option to run selected company from selected NCA
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
// load helper libs etc
|
||||
// const Fca = require('./ncas/fca');
|
||||
|
||||
const Sweden = require('./ncas/se');
|
||||
|
||||
/**
 * Entry point for the Sweden scraper process.
 *
 * If SE_CRON is set to a string, a CronJob is created with it (the final
 * `true` argument asks cron to start the job) and every tick awaits
 * `run()` on the scraper. Independently, if SCRAPE_START equals the
 * scraper's id, one scrape is awaited straight away.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Sweden();
  const schedule = process.env.SE_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('SE Launched');
}
|
||||
|
||||
run();
|
55
setup/eslintrc.json
Normal file
55
setup/eslintrc.json
Normal file
@ -0,0 +1,55 @@
|
||||
{
|
||||
"parserOptions": {
|
||||
"ecmaVersion": 2017,
|
||||
"sourceType": "module",
|
||||
"ecmaFeatures": {
|
||||
"jsx": false
|
||||
}
|
||||
},
|
||||
"env": {
|
||||
"browser": false,
|
||||
"node": true,
|
||||
"es6": true
|
||||
},
|
||||
"rules": {
|
||||
"arrow-spacing": "error",
|
||||
"block-scoped-var": "error",
|
||||
"block-spacing": "error",
|
||||
"brace-style": ["error", "stroustrup", {}],
|
||||
"camelcase": "error",
|
||||
"comma-dangle": ["error", "never"],
|
||||
"comma-spacing": ["error", { "before": false, "after": true }],
|
||||
"comma-style": [1, "last"],
|
||||
"consistent-this": [1, "_this"],
|
||||
"curly": [1, "multi"],
|
||||
"eol-last": 1,
|
||||
"eqeqeq": 1,
|
||||
"func-names": 1,
|
||||
"indent": ["error", 2, { "SwitchCase": 1 }],
|
||||
"lines-around-comment": ["error", { "beforeBlockComment": true, "allowArrayStart": true }],
|
||||
"max-len": [1, 180, 2], // 2 spaces per tab, max 180 chars per line
|
||||
"new-cap": 1,
|
||||
"newline-before-return": "error",
|
||||
"no-array-constructor": 1,
|
||||
"no-inner-declarations": [1, "both"],
|
||||
"no-mixed-spaces-and-tabs": 1,
|
||||
"no-multi-spaces": 2,
|
||||
"no-new-object": 1,
|
||||
"no-shadow-restricted-names": 1,
|
||||
"object-curly-spacing": ["error", "always"],
|
||||
"padded-blocks": ["error", { "blocks": "never", "switches": "always" }],
|
||||
"prefer-const": "error",
|
||||
"prefer-template": "error",
|
||||
"one-var": 0,
|
||||
"quote-props": ["error", "always"],
|
||||
"quotes": [1, "single"],
|
||||
"radix": 1,
|
||||
"semi": [1, "always"],
|
||||
"space-before-blocks": [1, "always"],
|
||||
"space-infix-ops": 1,
|
||||
"vars-on-top": 1,
|
||||
"no-multiple-empty-lines": ["error", { "max": 1, "maxEOF": 1 }],
|
||||
"spaced-comment": ["error", "always", { "markers": ["/"] }]
|
||||
}
|
||||
|
||||
}
|
8
setup/init.sh
Executable file
8
setup/init.sh
Executable file
@ -0,0 +1,8 @@
|
||||
#cloud-boothook
#!/bin/bash
# Boothook: downloads setup/install.sh from the repository and pipes it to bash.
#
# NOTE(review): the URL below embeds an access token that is committed to
# version control; treat it as leaked and rotate/replace it.
#
# -f makes curl exit on an HTTP error instead of handing the error page to
# bash; the URL is quoted so that `?` is never treated as a shell glob.
curl -f -o- 'https://raw.githubusercontent.com/OpenBankingUK/obdfcascrape/DIR-3232/setup/install.sh?token=ApJJhry7P8vGWWpPtttCgOaregsZnXdmks5b_rG7wA%3D%3D' | bash

# cat /var/log/cloud-init-output.log
|
29
setup/install.sh
Executable file
29
setup/install.sh
Executable file
@ -0,0 +1,29 @@
|
||||
#!/bin/bash
# Provisions an apt-based host for the scraper: installs system packages,
# creates and enables a 1G swap file, installs nvm + Node, then the global
# npm tooling (gulp, pm2, npm-check, npm-install-missing).

NVM="$HOME/.nvm"
NVM_VERSION="stable"
DEV="$HOME/dev"
SWAP="/swapfile"

# apt-get -y -q update && apt-get -y -q upgrade
apt-get -y -q update && apt-get --assume-yes install build-essential git nginx htop screen wget curl xorg openbox libasound2
apt-get -y -q clean

# Create and enable the swap file.
fallocate -l 1G "$SWAP"
chmod 600 "$SWAP"
mkswap "$SWAP"
swapon "$SWAP"

# Persist the swap file across reboots. The entry is derived from $SWAP (it
# used to be hard-coded) and is only appended when not already present, so
# re-running the script does not duplicate it.
grep -qs "^$SWAP " /etc/fstab || echo "$SWAP none swap defaults 0 0" >> /etc/fstab

# Install nvm and load it into this shell.
curl -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash
export NVM_DIR="$NVM"
[ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh" # This loads nvm

source "$HOME/.bashrc"
source "$NVM/nvm.sh"
nvm install "$NVM_VERSION"

npm install -g gulp pm2@latest npm-check npm-install-missing
pm2 update

# Marker file written once the script has reached the end.
touch "$HOME/martin.txt"
|
7
setup/setup.sh
Normal file
7
setup/setup.sh
Normal file
@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env bash

# Mirror the shared-folder checkout into ~/dev/, leaving out build output,
# dependencies, scraped artefacts and git metadata.
rsync -avz \
  --exclude 'artefacts' \
  --exclude 'node_modules' \
  --exclude '.git' \
  --exclude 'dist' \
  /media/sf_mdev/obdfcascrape/ ~/dev/

# Plain (unexported) shell variables; no command runs after them in this script.
SCRAPE_START=EE
NODE_ENV=
LOGGER_LEVEL=trace
|
52
setup/work/Dockerfile
Normal file
52
setup/work/Dockerfile
Normal file
@ -0,0 +1,52 @@
|
||||
# Image for running the scrapers under pm2 with a Chrome install that
# provides the shared libraries Puppeteer's bundled Chromium needs.
FROM node:8-slim
# key=value form; the space-separated legacy LABEL syntax is deprecated.
LABEL name="slimscrape"

# See https://crbug.com/795759
RUN apt-get update && apt-get install -yq libgconf-2-4

# Install latest chrome dev package and fonts to support major
# charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
# Note: this installs the necessary libs to make the bundled version
# of Chromium that Puppeteer
# installs, work.
RUN apt-get update && apt-get install -y wget --no-install-recommends \
    && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
    && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
    && apt-get update \
    && apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \
      --no-install-recommends \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get purge --auto-remove -y curl \
    && rm -rf /src/*.deb

# It's a good idea to use dumb-init to help prevent zombie chrome processes.
ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 /usr/local/bin/dumb-init
RUN chmod +x /usr/local/bin/dumb-init

# Uncomment to skip the chromium download when installing puppeteer.
# If you do, you'll need to launch puppeteer with:
#     browser.launch({executablePath: 'google-chrome-unstable'})
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true

# Copy the app (ADD unpacks the local tarball into /app)
WORKDIR /app

ADD archive.tar.gz /app

RUN npm install pm2 -g

RUN npm i

# Add user so we don't need --no-sandbox.
# RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \
#     && mkdir -p /home/pptruser/Downloads \
#     && chown -R pptruser:pptruser /home/pptruser \
#     && chown -R pptruser:pptruser ./node_modules

# Run everything after as non-privileged user.
# USER pptruser

# EXPOSE 8084
ENTRYPOINT ["/usr/local/bin/dumb-init", "--"]

CMD ["pm2-runtime", "start", "ecosystem.config.js", "--raw" , "--env", "production"]
|
29
setup/work/Makefile
Normal file
29
setup/work/Makefile
Normal file
@ -0,0 +1,29 @@
|
||||
PROJECT = obdfcascrape
|
||||
VERSION = $(shell git rev-parse --short HEAD)
|
||||
ECR_REGION = eu-west-1
|
||||
ECR_ACCOUNT_NUMBER = 482681734622
|
||||
#ECR_REPO = $(ECR_ACCOUNT_NUMBER).dkr.ecr.$(ECR_REGION).amazonaws.com
|
||||
ECR_REPO = mail.caliban.io:5000
|
||||
#APP_IMAGE = 482681734622.dkr.ecr.eu-west-1.amazonaws.com/$(PROJECT):$(VERSION)
|
||||
APP_IMAGE = $(ECR_REPO)/$(PROJECT):$(VERSION)
|
||||
NO_CACHE = false
|
||||
|
||||
#build docker image
|
||||
build:
|
||||
# docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
|
||||
docker build . -t $(APP_IMAGE) --build-arg VERSION=$(VERSION) --no-cache=$(NO_CACHE)
|
||||
.PHONY: build
|
||||
|
||||
#push docker image to registry
|
||||
push: build
|
||||
docker push $(APP_IMAGE)
|
||||
.PHONY: push
|
||||
|
||||
#build and run docker image locally
|
||||
run: build
|
||||
docker run $(APP_IMAGE)
|
||||
.PHONY: run
|
||||
ver:
|
||||
@echo '$(VERSION)'
|
||||
#echo $ERSION
|
||||
.PHONY: ver
|
BIN
setup/work/archive.tar.gz
Normal file
BIN
setup/work/archive.tar.gz
Normal file
Binary file not shown.
146
setupQueue.js
Normal file
146
setupQueue.js
Normal file
@ -0,0 +1,146 @@
|
||||
// https://github.com/markcallen/snssqs
|
||||
|
||||
const AWS = require('aws-sdk');
|
||||
const util = require('util');
|
||||
const async = require('async');
|
||||
const fs = require('fs');
|
||||
|
||||
require('dotenv').config({
|
||||
'path': `${__dirname }/.env`
|
||||
});
|
||||
|
||||
// configure AWS
|
||||
AWS.config.update({ 'accessKeyId': process.env.AWS_ACCESS_KEY_ID, 'secretAccessKey': process.env.AWS_SECRET_ACCESS_KEY, 'region': process.env.AWS_REGION || 'eu-west-1' });
|
||||
|
||||
const sns = new AWS.SNS();
|
||||
const sqs = new AWS.SQS();
|
||||
|
||||
const config = {};
|
||||
|
||||
/**
 * Calls sns.createTopic with the name taken from SQS_NAME and records the
 * returned ARN in config.TopicArn.
 *
 * @param {Function} cb - node-style callback; called with the AWS error on
 *                        failure and with no arguments on success
 */
function createTopic(cb) {
  const params = { 'Name': process.env.SQS_NAME };

  sns.createTopic(params, (err, result) => {
    if (err !== null) {
      console.log(util.inspect(err));

      return cb(err);
    }

    console.log(util.inspect(result));
    config.TopicArn = result.TopicArn;

    cb();
  });
}
|
||||
|
||||
/**
 * Calls sqs.createQueue with the name taken from SQS_NAME and records the
 * returned URL in config.QueueUrl.
 *
 * @param {Function} cb - node-style callback; called with the AWS error on
 *                        failure and with no arguments on success
 */
function createQueue(cb) {
  const params = { 'QueueName': process.env.SQS_NAME };

  sqs.createQueue(params, (err, result) => {
    if (err !== null) {
      console.log(util.inspect(err));

      return cb(err);
    }

    console.log(util.inspect(result));
    config.QueueUrl = result.QueueUrl;

    cb();
  });
}
|
||||
|
||||
/**
 * Looks up the QueueArn attribute of the queue at config.QueueUrl and
 * records it in config.QueueArn.
 *
 * @param {Function} cb - node-style callback; called with the AWS error on
 *                        failure and with no arguments on success
 */
function getQueueAttr(cb) {
  const params = {
    'QueueUrl': config.QueueUrl,
    'AttributeNames': ['QueueArn']
  };

  sqs.getQueueAttributes(params, (err, result) => {
    if (err !== null) {
      console.log(util.inspect(err));

      return cb(err);
    }

    console.log(util.inspect(result));
    config.QueueArn = result.Attributes.QueueArn;

    cb();
  });
}
|
||||
|
||||
/**
 * Subscribes the queue (config.QueueArn) to the topic (config.TopicArn)
 * using the 'sqs' protocol.
 *
 * @param {Function} cb - node-style callback; called with the AWS error on
 *                        failure and with no arguments on success
 */
function snsSubscribe(cb) {
  const params = {
    'TopicArn': config.TopicArn,
    'Protocol': 'sqs',
    'Endpoint': config.QueueArn
  };

  sns.subscribe(params, (err, result) => {
    if (err !== null) {
      console.log(util.inspect(err));

      return cb(err);
    }

    console.log(util.inspect(result));

    cb();
  });
}
|
||||
|
||||
/**
 * Sets the queue's access policy so that SQS:SendMessage on the queue is
 * allowed when the request's aws:SourceArn equals the topic ARN.
 *
 * Reads QueueUrl, TopicArn and QueueArn from the shared config object.
 *
 * @param {Function} cb - node-style callback; called with the AWS error on
 *                        failure and with no arguments on success
 */
function setQueueAttr(cb) {
  const { QueueUrl, TopicArn, QueueArn } = config;

  const policy = {
    'Version': '2008-10-17',
    'Id': `${QueueArn}/SQSDefaultPolicy`,
    'Statement': [{
      // timestamp suffix gives each generated statement a distinct Sid
      'Sid': `Sid${new Date().getTime()}`,
      'Effect': 'Allow',
      'Principal': { 'AWS': '*' },
      'Action': 'SQS:SendMessage',
      'Resource': QueueArn,
      'Condition': {
        'ArnEquals': { 'aws:SourceArn': TopicArn }
      }
    }]
  };

  const params = {
    'QueueUrl': QueueUrl,
    'Attributes': { 'Policy': JSON.stringify(policy) }
  };

  sqs.setQueueAttributes(params, (err, result) => {
    if (err !== null) {
      console.log(util.inspect(err));

      return cb(err);
    }

    console.log(util.inspect(result));

    cb();
  });
}
|
||||
|
||||
/**
 * Serialises the collected config (4-space indented JSON) to 'config.json',
 * resolved against the current working directory.
 *
 * @param {Function} cb - node-style callback; called with the write error on
 *                        failure and with no arguments on success
 */
function writeConfigFile(cb) {
  const serialised = JSON.stringify(config, null, 4);

  fs.writeFile('config.json', serialised, (err) => {
    if (err)
      return cb(err);

    console.log('config saved to config.json');
    cb();
  });
}
|
||||
|
||||
// Run the provisioning steps strictly in order; each step depends on values
// the previous ones stored in `config`. The final callback reports the first
// failure (a writeConfigFile error was otherwise never surfaced) and makes
// the process exit with a non-zero status.
async.series([createTopic, createQueue, getQueueAttr, snsSubscribe, setQueueAttr, writeConfigFile], function (err) {
  if (err) {
    console.error(`Queue setup failed: ${util.inspect(err)}`);
    process.exitCode = 1;
  }
});
|
||||
|
25
sk.js
Normal file
25
sk.js
Normal file
@ -0,0 +1,25 @@
|
||||
#!/usr/bin/env node
|
||||
const CronJob = require('cron').CronJob;
|
||||
|
||||
// load env variables from file
|
||||
require('dotenv').config();
|
||||
|
||||
const argv = require('yargs').argv;
|
||||
|
||||
const Slovakia = require('./ncas/sk');
|
||||
|
||||
/**
 * Entry point for the Slovakia scraper process.
 *
 * If SK_CRON is set to a string, a CronJob is created with it (the final
 * `true` argument asks cron to start the job) and every tick awaits
 * `run()` on the scraper. Independently, if SCRAPE_START equals the
 * scraper's id, one scrape is awaited straight away.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const scraper = new Slovakia();
  const schedule = process.env.SK_CRON;

  if (typeof schedule === 'string')
    new CronJob(schedule, async () => {
      await scraper.run();
    }, null, true);

  if (process.env.SCRAPE_START === scraper.id)
    await scraper.run();

  console.log('SK Launched');
}
|
||||
|
||||
run();
|
1
sonar-project.properties
Normal file
1
sonar-project.properties
Normal file
@ -0,0 +1 @@
|
||||
sonar.exclusions=**/tests/**/*
|
448
src/cy.js
Normal file
448
src/cy.js
Normal file
@ -0,0 +1,448 @@
|
||||
const Scraper = require('../helpers/scraper');
|
||||
const cheerio = require('cheerio');
|
||||
const path = require('path');
|
||||
const jsonfile = require('jsonfile');
|
||||
const logger = require('log4js').getLogger('CY');
|
||||
|
||||
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
||||
|
||||
// load env variables from file
|
||||
|
||||
/**
 * Scraper for the Central Bank of Cyprus registers (id 'CY').
 *
 * The scrape is driven by page loads: start() opens the payment-institutions
 * page, every 'domcontentloaded' event is routed through processNewPage(),
 * and each page handler navigates on to the next page (payment institutions
 * -> electronic money institutions -> credit institutions). 'done' is
 * emitted once the credit-institutions register has been written.
 */
class CYScrape extends Scraper {

  /**
   * Sets id/version, forwards the 'done' event to this._done(), exposes a
   * debounced run() (wait argument 5000) around __run(), and in production
   * triggers run() when _checkLock() resolves to a truthy value.
   */
  constructor() {
    super();
    this.id = 'CY';
    this.version = '0.0.2';

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if(l)
          this.run();
      });
  }

  /**
   * Visits the href of every element matching `selector` so that the browser
   * downloads the linked files into this.path.
   *
   * @param {string} selector - CSS selector of the download links
   * @returns {Promise<void>}
   */
  async grabLink(selector) {
    const clickableLinks = await this.page.$$(selector);

    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    if (clickableLinks.length > 0)
      for (const item of clickableLinks) {
        const href = await this.page.evaluate(el => el.href, item);

        await this._randomWait(this.page, 3, 5);
        await this.page.goto(href, { 'waitUntil': 'networkidle2' }).catch((err) => {
          // log this error but Puppeteer isn't supposed to support this sort of download....
          // mute the ERR_ABORTED error which happens every time but alert for everything else.

          if (!err.message.includes('net::ERR_ABORTED') )
            logger.error('grabLink', err);
        });
      }
  }

  /**
   * Downloads one of the two electronic-money documents.
   *
   * @param {number} id - index (0 or 1) into the list of link selectors
   * @returns {Promise<void>}
   */
  async downloadEmoney(id) {
    const selector = ['#generic_article > div > div.row > div > div > ul > li:nth-child(1) > a', '#generic_article > div > div.row > div > div > ul > li:nth-child(2) > b > b > a'];

    await this.grabLink(selector[id]);
  }

  /**
   * Downloads the files linked from the workshop-article headings.
   *
   * @returns {Promise<void>}
   */
  async downloadExcel() {
    const selector = '#workshops > div > div.workshop-article-container > div > div > div > h3 > a';

    await this.grabLink(selector);
  }

  /**
   * Handles the payment-institutions page: screenshot, download the linked
   * files, then navigate to the electronic-money page.
   *
   * @returns {Promise<void>}
   */
  async handlePaymentInstitutions() {
    await this._randomWait(this.page, 3, 5);

    const filename = 'licensing-and-supervision-of-payment-institutions';

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);

    await this.downloadExcel();
    await this._randomWait(this.page, 3, 5);

    await this.page.goto(this.eMoneyUrl, { 'waitUntil': 'networkidle2' });
  }

  /**
   * Handles the electronic-money-institutions page: screenshot, download both
   * documents, then emit 'startProcessingCreditServices'.
   *
   * @returns {Promise<void>}
   */
  async handleElectronicMoneyInstitutions() {
    await this._randomWait(this.page, 3, 5);

    const filename = 'licensing-and-supervision-of-electronic-money-institutions';

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    await this._randomWait(this.page, 3, 5);

    await this.downloadEmoney(0);
    await this._randomWait(this.page, 3, 5);

    await this.downloadEmoney(1);
    await this._randomWait(this.page, 3, 5);
    this.emit('startProcessingCreditServices');
  }

  /**
   * Extracts the numbered list that follows the
   * "LOCAL AUTHORISED CREDIT INSTITUTIONS" heading.
   *
   * @param {string} body - HTML to search
   * @returns {Promise<{}|Array>} the institution names; `{}` when the heading
   *   is not found; undefined when parsing throws (the error is logged)
   */
  async extractLocalCreditInstitutions(body) {
    try{
      const matchHeading = /LOCAL AUTHORISED CREDIT INSTITUTIONS/;
      // "<number>. <name>" - group 2 is the name
      const sanity = /(\d+\.\s)(.+)/;
      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });

      let nextItem;

      // the element after the (last) matching <p> holds the list
      $('p').each(function() {
        const lineText = $(this).text();

        const isHeading = matchHeading.test(lineText);

        if (isHeading)
          nextItem = $(this).next();
      });

      if (typeof nextItem !== 'undefined' && nextItem !== null) {
        const splitText = $(nextItem).text().split('\n');

        const output = [];

        splitText.forEach((item) => {
          const newItem = this._cleanUp(item);

          if ( newItem !== '')
            output.push( sanity.exec(newItem)[2]);
        });

        return output;
      }

      return {};
    }
    catch( err) {
      logger.error(err);
    }
  }

  /**
   * Extracts the foreign credit institutions that follow the
   * "FOREIGN AUTHORISED CREDIT INSTITUTIONS ..." heading.
   *
   * The page layout is assumed to be: group heading, sub heading, list,
   * sub heading, list - twice. The result is keyed
   * output[groupHeading][subHeading] = [names].
   *
   * @param {string} body - HTML to search
   * @returns {Promise<Object>} the nested result; `{}` when the heading is
   *   not found; undefined when parsing throws (the error is logged)
   */
  async extractForeignCreditInstitutions(body) {
    try{
      const matchHeading = /FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING/;

      // "<label>. <title>" - group 2 is the title
      const sanity = /(\w+\.\s+)(.+)/;

      const $ = cheerio.load(body, {
        'normalizeWhitespace': true
      });

      const output = {};

      let nextItem;

      $('p').each(function() {
        const lineText = $(this).text();
        const isHeading = matchHeading.test(lineText);

        if (isHeading)
          nextItem = $(this).next();
      });

      if (typeof nextItem === 'undefined' || nextItem === null)
        return output;

      // Title of a heading element with its leading "X. " label stripped.
      const readHeading = (elm) => sanity.exec(this._cleanUp($(elm).text()))[2];

      // Text of every <li> below the element, in document order.
      const readList = (elm) => {
        const items = [];

        $(elm).find('li').each(function() {
          items.push($(this).text());
        });

        return items;
      };

      // Rolling this out for ease as it could be changed by hand
      let nextElm = nextItem;
      let firstHead, secondHead;

      // first group heading
      firstHead = readHeading(nextElm);
      output[firstHead] = {};
      nextElm = $(nextElm).next();

      // first group, first sub heading + list
      secondHead = readHeading(nextElm);
      nextElm = $(nextElm).next();
      output[firstHead][secondHead] = readList(nextElm);
      nextElm = $(nextElm).next();

      // first group, second sub heading + list
      secondHead = readHeading(nextElm);
      nextElm = $(nextElm).next();
      output[firstHead][secondHead] = readList(nextElm);
      nextElm = $(nextElm).next();

      // second group heading
      firstHead = readHeading(nextElm);
      output[firstHead] = {};
      nextElm = $(nextElm).next();

      // second group, first sub heading + list
      secondHead = readHeading(nextElm);
      nextElm = $(nextElm).next();
      output[firstHead][secondHead] = readList(nextElm);
      nextElm = $(nextElm).next();

      // second group, second sub heading + list
      secondHead = readHeading(nextElm);
      nextElm = $(nextElm).next();
      output[firstHead][secondHead] = readList(nextElm);

      return output;
    }
    catch(err) {
      logger.error(err);
    }
  }

  /**
   * Handles the credit-institutions register page: screenshot, dump the HTML,
   * extract local and foreign institutions, write creditInstitutes.json and
   * emit 'done'.
   *
   * @returns {Promise<{local: *, creditInstitutes: *}>} undefined when an
   *   error was caught (the error is logged)
   */
  async processCreditInstitute() {
    logger.info('Credit institutes');
    try{
      await this._makeScreenshotV2(this.page, `${this.path}/creditInstitutes`, null);

      const body = await this.page.content();

      await this._dumpFile(`${this.path}/creditInstitutes.html`, body);
      const $ = cheerio.load(body);

      const content = $('.generic_page-intro');

      const local = await this.extractLocalCreditInstitutions(content.html());
      const creditInstitutes = await this.extractForeignCreditInstitutions(content.html());

      await jsonfile.writeFile(`${this.path}/creditInstitutes.json`, { local, creditInstitutes });

      this.emit('done');

      return { local, creditInstitutes };
    }
    catch(err) {
      logger.error(err);
    }
  }

  /**
   * Prints the current page to an A4 PDF.
   *
   * @param {string} filePath - where the PDF is written
   * @returns {Promise<void>}
   */
  async savePDF(filePath) {
    logger.info('Saving the pdf:', filePath);

    await this._randomWait(this.page, 5, 7);
    await this.page.pdf({ 'path': filePath, 'format': 'A4' });
    // this.emit('startProcessingCreditServices');
    logger.debug('!! i SHOULD EMIT SOMETHING HERE !!');
  }

  /**
   * Routes a freshly loaded page to its handler based on the URL path.
   * A path containing ".pdf" has its last segment (the file name) removed
   * before matching. Unknown paths are reported via _uploadError() and then
   * rejected.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the path matches no known page
   */
  async processNewPage() {
    // The dot is escaped so only a literal ".pdf" matches; the previous
    // pattern /(.pdf)/g accepted any character before "pdf".
    const checkPDF = /\.pdf/;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    // NOTE(review): relies on the evaluated `document` coming back with a
    // usable `location` - confirm against the Puppeteer version in use.
    const currentPage = await this.page.evaluate(() => document);

    let currentPath = currentPage.location.pathname;
    let pdfFile;

    if (checkPDF.test(currentPath)) {
      const splitPath = currentPath.split('/');

      pdfFile = splitPath.pop();
      currentPath = splitPath.join('/');
    }

    switch (currentPath) {

      case '/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions':
        await this.handlePaymentInstitutions();
        break;
      case '/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions':
        await this.handleElectronicMoneyInstitutions();
        break;
      case '/images/media/redirectfile/Electronic%20Money%20Institutions':
        logger.warn('We should only arrive here when in Non-headless mode');
        // NOTE(review): pdfFile is a bare file name, so the PDF is written
        // relative to the working directory rather than this.path - confirm
        // this is intended.
        await this.savePDF(pdfFile);
        break;
      case '/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus':

        await this.processCreditInstitute();
        break;
      default:

        await this._uploadError();
        throw new Error(`Unknown page: ${currentPath}`);

    }
  }

  /**
   * Registers the listener that navigates to the credit-institutions page
   * when 'startProcessingCreditServices' is emitted.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    logger.info('Attaching events');
    this.on('startProcessingCreditServices', async function() {
      await this._goto(this.credit);
    });
  }

  /**
   * Prepares state, output directory and browser, wires the page-load router
   * and opens the start page.
   *
   * @returns {Promise<void>}
   * @throws {Error} wrapping whatever failed during start-up
   */
  async start() {
    try {
      super._start();
      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done' : false,
        'searchDone' : false
      };

      this.startPage = 'https://www.centralbank.cy/en/licensing-supervision/payment-institutions/licensing-and-supervision-of-payment-institutions';
      this.eMoneyUrl = 'https://www.centralbank.cy/en/licensing-supervision/electronic-money-institutions/licensing-and-supervision-of-electronic-money-institutions';
      this.credit = 'https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus';

      this.path = path.resolve(`${__dirname }/../artefacts/CY/CBOC`);
      await this._createDirectory(this.path);

      // a non-repudiation failure is logged but does not stop the scrape
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      // every page load is routed through processNewPage (throttle argument 2500)
      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // presumably guards against attaching the listener twice on a re-run;
      // verify against the listeners the base class registers
      if (this.eventNames().length === 2)
        await this.attachEvents();

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      throw new Error(e);
    }
  }

  /**
   * Un-debounced body of run(): logs and starts the scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.info('Scraping Cyprus...');

    await this.start();
  }
}
|
||||
|
||||
module.exports = CYScrape;
|
8
start.sh
Normal file
8
start.sh
Normal file
@ -0,0 +1,8 @@
|
||||
#!/bin/sh
|
||||
set -ex
|
||||
|
||||
eval "$(aws ssm get-parameters-by-path --region $REGION --path "/$SERVICE_NAME/$ENV/" --query 'Parameters[*].{Name:Name,Value:Value}' --output text | sed 's/\/'"$SERVICE_NAME"'\/'"$ENV"'\///g' | awk -F '\t' '{ print "export " $1 "=" "\""$2"\";" }')"
|
||||
|
||||
npm show puppeteer version
|
||||
|
||||
pm2-runtime start ecosystem.config.js --raw --env production
|
51
tests/data/cy/all_credit_001.json
Normal file
51
tests/data/cy/all_credit_001.json
Normal file
@ -0,0 +1,51 @@
|
||||
{
|
||||
"local": [
|
||||
"Ancoria Bank Limited",
|
||||
"Astrobank Limited",
|
||||
"Bank of Cyprus Public Company Ltd",
|
||||
"Cyprus Development Bank Public Company Limited",
|
||||
"Hellenic Bank Public Company Limited",
|
||||
"Housing Finance Corporation",
|
||||
"RCB BANK LTD"
|
||||
],
|
||||
"creditInstitutes": {
|
||||
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS": {
|
||||
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
|
||||
"Αlpha Bank Cyprus Ltd",
|
||||
"Eurobank Cyprus Ltd",
|
||||
"National Bank of Greece (Cyprus) Ltd"
|
||||
],
|
||||
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
|
||||
"Societe Generale Bank-Cyprus Limited",
|
||||
"USB Bank Plc"
|
||||
]
|
||||
},
|
||||
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS": {
|
||||
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
|
||||
"AS Expobank ",
|
||||
"Banque SBA",
|
||||
"Central Cooperative Bank PLC",
|
||||
"EFG Bank (Luxembourg) S.A.",
|
||||
"First Investment Bank Ltd ",
|
||||
"National Bank of Greece S.A."
|
||||
],
|
||||
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
|
||||
"Arab Jordan Investment Bank SA",
|
||||
"Bank of Beirut SAL",
|
||||
"BankMed s.a.l. ",
|
||||
"Banque BEMO SAL ",
|
||||
"BBAC SAL ",
|
||||
"BLOM Bank SAL ",
|
||||
"Byblos Bank SAL ",
|
||||
"Credit Libanais SAL ",
|
||||
"IBL Bank sal ",
|
||||
"Joint-stock company AVTOVAZBANK * ",
|
||||
"Jordan Ahli Bank plc",
|
||||
"Jordan Kuwait Bank PLC ",
|
||||
"Lebanon and Gulf Bank SAL ",
|
||||
"Promsvyazbank PJSC **",
|
||||
"Public Joint-Stock Company Commercial Bank \"Privatbank\"***"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
101
tests/data/cy/content.html
Normal file
101
tests/data/cy/content.html
Normal file
@ -0,0 +1,101 @@
|
||||
<div class="generic_page-intro">
|
||||
<h1 class="text-center">Register of Credit Institutions operating in Cyprus</h1>
|
||||
|
||||
<p class="text-center"></p>
|
||||
<p> </p>
|
||||
|
||||
<p><b>1. LOCAL AUTHORISED CREDIT INSTITUTIONS</b></p>
|
||||
|
||||
<p><b> </b> <br>
|
||||
1. <a href="http://www.ancoriabank.com" target="_blank"><font color="#0000ff">Ancoria Bank Limited</font></a> <br>
|
||||
2. <a href="http://www.astrobank.com"><font color="#0000ff">Astrobank Limited</font></a><br>
|
||||
3. <a href="http://www.bankofcyprus.com/" target="_blank"><font color="#0000ff">Bank of Cyprus Public Company Ltd</font></a><br>
|
||||
4. <a href="http://www.cyprusdevelopmentbank.com/" target="_blank"><span style="color: rgb(0, 0, 255);">Cyprus Development Bank Public Company Limited</span></a> <br>
|
||||
5. <a href="http://www.hellenicbank.com/" target="_blank"><font color="#0000ff">Hellenic Bank Public Company Limited </font></a><br>
|
||||
6. <a href="http://www.hfc.com.cy/" target="_blank"><font color="#0000ff">Housing Finance Corporation</font></a><br>
|
||||
7. <a href="http://www.rcbcy.com/" target="_blank"><font color="#0000ff">RCB BANK LTD</font></a></p>
|
||||
|
||||
<p><strong> </strong></p>
|
||||
|
||||
<p><b>2. FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING UNDER THE "EUROPEAN PASSPORT"</b></p>
|
||||
|
||||
<p><b> A. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS</b></p>
|
||||
|
||||
<p><b> I. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES</b></p>
|
||||
|
||||
<ol type="1">
|
||||
<li><a href="http://www.alphabank.com.cy/" target="_blank"><font color="#0000ff">Αlpha Bank Cyprus Ltd</font></a></li>
|
||||
<li><a href="http://www.eurobank.com.cy" target="_blank"><font color="#0000ff">Eurobank Cyprus Ltd</font></a></li>
|
||||
<li><a href="http://www.nbg.com.cy/" target="_blank"><font color="#0000ff">National Bank of Greece (Cyprus) Ltd</font></a></li>
|
||||
</ol>
|
||||
|
||||
<p><strong> II. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES</strong></p>
|
||||
|
||||
<ol>
|
||||
<li><a href="http://www.sgcyprus.com/" target="_blank"><font color="#0000ff">Societe Generale Bank-Cyprus Limited</font></a></li>
|
||||
<li><a href="http://www.usbbank.com.cy/" target="_blank"><font color="#0000ff">USB Bank Plc</font></a></li>
|
||||
</ol>
|
||||
|
||||
<p><strong>B. BRANCHES OF FOREIGN CREDIT INSTITUTIONS</strong></p>
|
||||
|
||||
<p><strong> I. BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES</strong></p>
|
||||
|
||||
<ol>
|
||||
<li><a href="http://www.expobank.eu"><font color="#0000ff">AS Expobank </font></a></li>
|
||||
<li><a href="http://www.banque-sba.com/" target="_blank"><font color="#0000ff">Banque SBA</font></a></li>
|
||||
<li><a href="http://www.ccbank.bg/" target="_blank"><font color="#0000ff">Central Cooperative Bank PLC</font></a></li>
|
||||
<li><font color="#0000ff"><a href="http://www.efgbank.lu/" target="_blank"><font color="#0000ff"><font color="#0000ff">EFG Bank (Luxembourg) S.A.</font></font></a></font></li>
|
||||
<li><a href="http://www.fibank.bg/" target="_blank"><font color="#0000ff">First Investment Bank Ltd</font></a> </li>
|
||||
<li><a href="http://www.nbg.gr/" target="_blank"><font color="#0000ff">National Bank of Greece S.A.</font></a></li>
|
||||
</ol>
|
||||
|
||||
<p><br>
|
||||
<b> II. BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES</b></p>
|
||||
|
||||
<ol>
|
||||
<li><a href="http://www.ajib.com/" target="_blank"><font color="#0000ff">Arab Jordan Investment Bank SA</font></a></li>
|
||||
<li><a href="http://www.bankofbeirut.com.lb/" target="_blank"><font color="#0000ff">Bank of Beirut SAL</font></a></li>
|
||||
<li><a href="http://www.bankmed.com.lb/" target="_blank"><font color="#0000ff">BankMed s.a.l.</font></a> </li>
|
||||
<li><a href="http://www.bemobank.com/"><font color="#0000ff">Banque BEMO SAL</font> </a></li>
|
||||
<li><a href="http://www.bbac.com.lb/" target="_blank"><font color="#0000ff">BBAC SAL</font></a> </li>
|
||||
<li><a href="http://www.blom.com.lb/" target="_blank"><font color="#0000ff">BLOM Bank SAL</font></a> </li>
|
||||
<li><a href="http://www.byblosbank.com.lb/" target="_blank"><font color="#0000ff">Byblos Bank SAL</font></a> </li>
|
||||
<li><a href="http://www.creditlibanais.com.lb/" target="_blank"><font color="#0000ff">Credit Libanais SAL</font></a> </li>
|
||||
<li><a href="http://www.ibl.com.lb/" target="_blank"><font color="#0000ff">IBL Bank sal</font></a> </li>
|
||||
<li><a href="http://www.avbbank.ru" target="_blank"><font color="#0000ff">Joint-stock company AVTOVAZBANK</font></a> <sup>*</sup> </li>
|
||||
<li><font color="#0000ff"><a href="http://www.ahli.com/" target="_blank"><font color="#0000ff">Jordan Ahli Bank plc</font></a></font></li>
|
||||
<li><font color="#0000ff"><a href="http://www.jkb.com" target="_blank"><font color="#0000ff">Jordan Kuwait Bank PLC</font></a> </font></li>
|
||||
<li><a href="http://www.lgb.com.lb/" target="_blank"><font color="#0000ff">Lebanon and Gulf Bank SAL</font></a> </li>
|
||||
<li><a href="http://www.psbank.ru/" target="_blank"><font color="#0000ff">Promsvyazbank PJSC</font></a> <sup>**</sup></li>
|
||||
<li><a href="https://privatbank.ua/ua/"><font color="#0000ff">Public Joint-Stock Company Commercial Bank "Privatbank"</font></a><sup>***</sup></li>
|
||||
</ol>
|
||||
|
||||
<div> </div>
|
||||
|
||||
<div style="text-align: justify;"><span style="font-size: 12px;">* Following the amendment of the licence of the branch of Joint-stock company AVTOVAZBANK by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: (1) the repayment of the existing customer deposits, (2) the acceptance of payments towards existing customers’ credit facilities, (3) the execution of customers’ outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
|
||||
|
||||
<div style="text-align: justify;"> </div>
|
||||
|
||||
<div style="text-align: justify;"><span style="font-size: 12px;">** Following the amendment of the licence of the branch of Promsvyazbank PJSC by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: (1) the repayment of the existing customer deposits, (2) the acceptance of payments towards existing customers’ credit facilities, (3) the execution of customers’ outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
|
||||
|
||||
<div style="text-align: justify;"> </div>
|
||||
|
||||
<div style="text-align: justify;"><span style="font-size: 12px;">*** Following the amendment of the licence of the branch of Public Joint-Stock Company Commercial Bank "Privatbank" by the Central Bank of Cyprus on 20/12/2016, the said branch is not permitted to engage in any banking business, other than: (i) the repayment or renewal of existing deposits and the acceptance of payments towards existing credit facilities, and (ii) the repayment of administrative expenses relating to the operations of the branch.<strong> </strong></span></div>
|
||||
|
||||
<p><span style="font-size: 10px;"> </span></p>
|
||||
|
||||
<p><strong> 3. REPRESENTATIVE OFFICES</strong></p>
|
||||
|
||||
<div>
|
||||
<ol>
|
||||
<li><a href="http://www.atlasbanka.com/en/"><font color="#0000ff">Atlasmont Banka A.</font><font color="#0000ff">D</font></a></li>
|
||||
<li><a href="http://bankofgeorgia.ge/en/"><font color="#0000ff">JSC Bank of Georgia</font> </a> </li>
|
||||
</ol>
|
||||
|
||||
<p> </p>
|
||||
</div>
|
||||
|
||||
<p> </p>
|
||||
<p></p>
|
||||
|
||||
</div>
|
541
tests/data/cy/credit_institutes.html
Normal file
541
tests/data/cy/credit_institutes.html
Normal file
@ -0,0 +1,541 @@
|
||||
<html lang="en"><script async="" src="https://www.google-analytics.com/analytics.js"></script><script type="text/javascript" async="" src="https://www.gstatic.com/recaptcha/api2/v1540794797339/recaptcha__en.js"></script><script>
|
||||
Object.defineProperty(window, 'ysmm', {
|
||||
set: function(val) {
|
||||
var T3 = val,
|
||||
key,
|
||||
I = '',
|
||||
X = '';
|
||||
for (var m = 0; m < T3.length; m++) {
|
||||
if (m % 2 == 0) {
|
||||
I += T3.charAt(m);
|
||||
} else {
|
||||
X = T3.charAt(m) + X;
|
||||
}
|
||||
}
|
||||
T3 = I + X;
|
||||
var U = T3.split('');
|
||||
for (var m = 0; m < U.length; m++) {
|
||||
if (!isNaN(U[m])) {
|
||||
for (var R = m + 1; R < U.length; R++) {
|
||||
if (!isNaN(U[R])) {
|
||||
var S = U[m]^U[R];
|
||||
if (S < 10) {
|
||||
U[m] = S;
|
||||
}
|
||||
m = R;
|
||||
R = U.length;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
T3 = U.join('');
|
||||
T3 = window.atob(T3);
|
||||
T3 = T3.substring(T3.length - (T3.length - 16));
|
||||
T3 = T3.substring(0, T3.length - 16);
|
||||
key = T3;
|
||||
if (key && (key.indexOf('http://') === 0 || key.indexOf("https://") === 0)) {
|
||||
document.write('<!--');
|
||||
window.stop();
|
||||
|
||||
window.onbeforeunload = null;
|
||||
window.location = key;
|
||||
}
|
||||
}
|
||||
});
|
||||
</script><head><script src="https://www.google.com/recaptcha/api.js"></script>
|
||||
|
||||
|
||||
|
||||
|
||||
<title>Central Bank of Cyprus - Register of Credit Institutions operating in Cyprus</title>
|
||||
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="author" content="DW Dynamic Works Ltd">
|
||||
<meta name="keywords" content="">
|
||||
<meta name="description" content="">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<meta name="app" data-lang="en">
|
||||
|
||||
|
||||
<meta property="og:title" content="Central Bank of Cyprus - Register of Credit Institutions operating in Cyprus">
|
||||
<meta property="og:description" content="">
|
||||
<meta property="og:site_name" content="www.centralbank.cy">
|
||||
<meta property="og:image" contnt="www.centralbank.cy/assets/image/imageoriginal/Register-of-Credit.jpg">
|
||||
|
||||
<meta property="og:url" content="https://www.centralbank.cy/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus">
|
||||
<meta property="og:type" content="">
|
||||
|
||||
|
||||
<script type="text/javascript" src="/lib/jquery/2.1.4/jquery.min.js"></script>
|
||||
<script type="text/javascript" src="/lib/jquery-base64/jquery.base64.js"></script>
|
||||
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="/lib/bootstrap/3.3.6/css/bootstrap.min.css">
|
||||
<script type="text/javascript" src="/lib/bootstrap/3.3.6/js/bootstrap.min.js"></script>
|
||||
|
||||
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Tinos">
|
||||
<link rel="stylesheet" type="text/css" href="/lib/font-awesome/4.6.3/css/font-awesome.min.css">
|
||||
<link rel="stylesheet" type="text/css" href="/lib/simple-line-icons/2.3.2/css/simple-line-icons.css">
|
||||
|
||||
|
||||
<script type="text/javascript" src="/js/data.appvars.min.js"></script>
|
||||
<script type="text/javascript" src="/js/data.dictionary.min.js"></script>
|
||||
|
||||
|
||||
<script type="text/javascript" src="/system.js"></script>
|
||||
<script type="text/javascript" src="/js/app.js"></script>
|
||||
<script type="text/javascript" src="/js/localization.js"></script>
|
||||
<script type="text/javascript" src="/js/number.js"></script>
|
||||
<script type="text/javascript" src="/js/string.js"></script>
|
||||
<script type="text/javascript" src="/js/global.js"></script>
|
||||
<script type="text/javascript" src="/js/controller.js"></script>
|
||||
<link href="/plugins/bootstrap-datepicker/1.7.0/dist/css/bootstrap-datepicker.min.css" rel="stylesheet">
|
||||
<script src="/plugins/bootstrap-datepicker/1.7.0/dist/js/bootstrap-datepicker.min.js"></script>
|
||||
<script src="/plugins/bootstrap-datepicker/1.7.0/dist/locales/bootstrap-datepicker.el.min.js" charset="UTF-8"></script>
|
||||
|
||||
|
||||
<!-- Go to www.addthis.com/dashboard to customize your tools -->
|
||||
<script type="text/javascript" src="//s7.addthis.com/js/300/addthis_widget.js#pubid=ra-5952254251d264eb"></script>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="/css/sections.css">
|
||||
|
||||
<script type="text/javascript" src="/js/sections.js"></script>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" type="text/css" href="/css/dwf.css">
|
||||
<link rel="stylesheet" type="text/css" href="/css/style.css">
|
||||
<link rel="stylesheet" type="text/css" href="/css/responsive.css">
|
||||
<link rel="stylesheet" type="text/css" href="/css/custom.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" type="image/x-icon" href="/favicon.ico">
|
||||
<link rel="apple-touch-icon" sizes="57x57" href="/images/favicon/apple-icon-57x57.png">
|
||||
<link rel="apple-touch-icon" sizes="60x60" href="/images/favicon/apple-icon-60x60.png">
|
||||
<link rel="apple-touch-icon" sizes="72x72" href="/images/favicon/images/favicon/apple-icon-72x72.png">
|
||||
<link rel="apple-touch-icon" sizes="76x76" href="/images/favicon/apple-icon-76x76.png">
|
||||
<link rel="apple-touch-icon" sizes="114x114" href="/images/favicon/apple-icon-114x114.png">
|
||||
<link rel="apple-touch-icon" sizes="120x120" href="/images/favicon/apple-icon-120x120.png">
|
||||
<link rel="apple-touch-icon" sizes="144x144" href="/images/favicon/apple-icon-144x144.png">
|
||||
<link rel="apple-touch-icon" sizes="152x152" href="/images/favicon/apple-icon-152x152.png">
|
||||
<link rel="apple-touch-icon" sizes="180x180" href="/images/favicon/apple-icon-180x180.png">
|
||||
<link rel="icon" type="image/png" sizes="192x192" href="/images/favicon/android-icon-192x192.png">
|
||||
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon/favicon-32x32.png">
|
||||
<link rel="icon" type="image/png" sizes="96x96" href="/images/favicon/favicon-96x96.png">
|
||||
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon/favicon-16x16.png">
|
||||
<link rel="manifest" href="/images/favicon/manifest.json">
|
||||
<meta name="msapplication-TileColor" content="#ffffff">
|
||||
<meta name="msapplication-TileImage" content="/images/favicon/ms-icon-144x144.png">
|
||||
<meta name="theme-color" content="#ffffff">
|
||||
|
||||
|
||||
|
||||
<script>
|
||||
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
|
||||
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
|
||||
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
|
||||
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
|
||||
|
||||
ga('create', 'UA-103148572-1', 'auto');
|
||||
ga('send', 'pageview');
|
||||
</script>
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header>
|
||||
<div class="container-fluid">
|
||||
<div class="language-switch">
|
||||
<a class="nav-top-items" href="/en/the-bank">The Bank</a>
|
||||
<a class="nav-top-items" href="/en/contact-us"><i class="fa fa-envelope" aria-hidden="true"></i></a>
|
||||
<a class="nav-top-items searchBtn search_icon" href=".toggle-search"> <span class="[ glyphicon glyphicon-search ]"></span></a>
|
||||
<a href="javascript:void(0);" class="contrast-toggle" onclick="visuallyImpaired()">High Contrast</a>
|
||||
<a href="javascript:void(0);" onclick="location.href='/en/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus'">EN</a> <span>|</span> <a href="javascript:void(0);" onclick="location.href='/el/licensing-supervision/banks/register-of-credit-institutions-operating-in-cyprus'">ΕΛ</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<nav class="navbar navbar-default navbar-bootsnipp" role="navigation">
|
||||
<div class="container-fluid">
|
||||
<div class="navbar-header">
|
||||
<button type="button" class="navbar-toggle" data-toggle="collapse" data-target="#navbar-collapse-1">
|
||||
<span class="sr-only">Toggle navigation</span>
|
||||
<span class="icon-bar"></span>
|
||||
<span class="icon-bar"></span>
|
||||
<span class="icon-bar"></span>
|
||||
</button>
|
||||
<a class="navbar-brand navbrand-custom normal-brand" href="/en/home" title=" Central Bank of Cyprus"><img src="/images/central-bank-of-cyprus-en.png"></a>
|
||||
<a class="navbar-brand navbrand-custom visually-impaired-brand" href="/en/home" title=" Central Bank of Cyprus" style="display: none;"><h3>Central Bank of Cyprus</h3></a>
|
||||
</div>
|
||||
<div class="collapse navbar-collapse" id="navbar-collapse-1">
|
||||
<ul class="nav navbar-nav navbar-right navbar-custom-media">
|
||||
|
||||
<li><a href="/en/monetary-policy" title="Monetary Policy">Monetary Policy</a></li>
|
||||
|
||||
<li><a href="/en/licensing-supervision" title="Licensing & Supervision">Licensing & Supervision</a></li>
|
||||
|
||||
<li><a href="/en/resolution" title="Resolution">Resolution</a></li>
|
||||
|
||||
<li><a href="/en/financial-stability" title="Financial Stability">Financial Stability</a></li>
|
||||
|
||||
<li><a href="/en/deposit-guarantee-investors-compensation-schemes" title="Deposit Guarantee & Investors' Compensation">Deposit Guarantee & Investors' Compensation</a></li>
|
||||
|
||||
<li><a href="/en/payment-systems-services" title="Payment Systems & Services">Payment Systems & Services</a></li>
|
||||
|
||||
<li><a href="/en/banknotes-and-coins" title="Banknotes & Coins">Banknotes & Coins</a></li>
|
||||
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
<form action="/easyconsole.cfm/page/search" method="POST" role="search">
|
||||
<div class="search_drop" style="display: none;">
|
||||
<div class="search_in">
|
||||
<div class="container_inn">
|
||||
<input type="text" name="q" placeholder="Search" onblur="if (this.placeholder=='') this.placeholder='Search';" onfocus="if (this.placeholder=='Search') this.placeholder='';">
|
||||
<button type="submit" class="btn-default btn-theme-custom btn1">Search <span><i class="fa fa-angle-right" aria-hidden="true"></i></span></button>
|
||||
<div class="clearfix"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
</nav>
|
||||
</header>
|
||||
<script type="text/javascript">
|
||||
$(document).ready(function() {
|
||||
$('.search_drop').hide();
|
||||
// Search
|
||||
$('.searchBtn').on('click', function(event) {
|
||||
event.preventDefault();
|
||||
$('.search_drop').slideToggle();
|
||||
});
|
||||
|
||||
$('.searchBtn').on('click', function(event) {
|
||||
event.preventDefault();
|
||||
$('.search_icon input').toggle('fast');
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
function createCookie(name,value,days) {
|
||||
var expires = "";
|
||||
if (days) {
|
||||
var date = new Date();
|
||||
date.setTime(date.getTime() + (days*24*60*60*1000));
|
||||
expires = "; expires=" + date.toUTCString();
|
||||
}
|
||||
document.cookie = name + "=" + value + expires + "; path=/";
|
||||
}
|
||||
|
||||
function readCookie(name) {
|
||||
var nameEQ = name + "=";
|
||||
var ca = document.cookie.split(';');
|
||||
for(var i=0;i < ca.length;i++) {
|
||||
var c = ca[i];
|
||||
while (c.charAt(0)==' ') c = c.substring(1,c.length);
|
||||
if (c.indexOf(nameEQ) == 0) return c.substring(nameEQ.length,c.length);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function visuallyImpaired(){
|
||||
$('header').toggleClass('visually-impaired');
|
||||
// $('footer').toggleClass('visually-impaired')
|
||||
// $('nav').toggleClass('visually-impaired')
|
||||
// $('section').toggleClass('visually-impaired-sections')
|
||||
// $('ul').toggleClass('visually-impaired')
|
||||
// $('li').toggleClass('visually-impaired')
|
||||
// $('div').toggleClass('visually-impaired')
|
||||
// $('article').toggleClass('visually-impaired')
|
||||
// $('a').toggleClass('visually-impaired')
|
||||
// $('p').toggleClass('visually-impaired')
|
||||
// $('input').toggleClass('visually-impaired')
|
||||
// $('select').toggleClass('visually-impaired')
|
||||
// $('h1').toggleClass('visually-impaired')
|
||||
// $('h2').toggleClass('visually-impaired')
|
||||
// $('h3').toggleClass('visually-impaired')
|
||||
// $('h4').toggleClass('visually-impaired')
|
||||
// $('h5').toggleClass('visually-impaired')
|
||||
// $('address').toggleClass('visually-impaired')
|
||||
// $('figcaption').toggleClass('visually-impaired')
|
||||
// $('span').toggleClass('visually-impaired')
|
||||
// $('a').toggleClass('visually-impaired-links-buttons')
|
||||
// $('button').toggleClass('visually-impaired-links-buttons')
|
||||
// $('figcaption').toggleClass('visually-impaired-links-buttons')
|
||||
$('a.navbar-brand').toggleClass('visually-impaired-dont-display');
|
||||
// $('.btn-theme-custom').toggleClass('visually-impaired')
|
||||
if (readCookie('VIcookie')=='V-I-off'){
|
||||
createCookie('VIcookie','V-I-on',7);
|
||||
}
|
||||
else
|
||||
{
|
||||
createCookie('VIcookie','V-I-off',7);
|
||||
};
|
||||
location.reload(true);
|
||||
};
|
||||
|
||||
|
||||
|
||||
if (readCookie('VIcookie') == null) {
|
||||
$('.visually-impaired-brand').hide();
|
||||
$('.normal-brand').show();
|
||||
$('.contrast-toggle').html('High Contrast');
|
||||
document.write('<link href="/themes/default/main.css" rel="stylesheet">')
|
||||
}
|
||||
else{
|
||||
if (readCookie('VIcookie')=='V-I-on'){
|
||||
$('.visually-impaired-brand').show();
|
||||
$('.normal-brand').hide();
|
||||
$('.contrast-toggle').html('Normal Contrast');
|
||||
document.write('<link href="/themes/high_contrast/main.css" rel="stylesheet">')
|
||||
}
|
||||
else
|
||||
{
|
||||
$('.visually-impaired-brand').hide();
|
||||
$('.normal-brand').show();
|
||||
$('.contrast-toggle').html('High Contrast');
|
||||
document.write('<link href="/themes/default/main.css" rel="stylesheet">')
|
||||
}
|
||||
}
|
||||
|
||||
</script><link href="/themes/default/main.css" rel="stylesheet">
|
||||
|
||||
|
||||
<!-- Go to www.addthis.com/dashboard to customize your tools -->
|
||||
|
||||
<section id="breadcrumbs">
|
||||
<div class="container">
|
||||
<ul>
|
||||
|
||||
<li><a class="active" href="/en/home">Home</a> / </li>
|
||||
|
||||
<li><a class="active" href="/en/licensing-supervision">Licensing & Supervision</a> / </li>
|
||||
|
||||
<li><a class="active" href="/en/licensing-supervision/banks">Banks</a> / </li>
|
||||
|
||||
<li>Register of Credit Institutions operating in Cyprus</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</div>
|
||||
<div class="clearfix"></div>
|
||||
</section>
|
||||
|
||||
<section id="generic_section">
|
||||
<div class="container">
|
||||
<div class="generic_page-intro">
|
||||
<h1 class="text-center">Register of Credit Institutions operating in Cyprus</h1>
|
||||
|
||||
<p class="text-center"></p><p> </p>
|
||||
|
||||
<p><b>1. LOCAL AUTHORISED CREDIT INSTITUTIONS</b></p>
|
||||
|
||||
<p><b> </b> <br>
|
||||
1. <a href="http://www.ancoriabank.com" target="_blank"><font color="#0000ff">Ancoria Bank Limited</font></a> <br>
|
||||
2. <a href="http://www.astrobank.com"><font color="#0000ff">Astrobank Limited</font></a><br>
|
||||
3. <a href="http://www.bankofcyprus.com/" target="_blank"><font color="#0000ff">Bank of Cyprus Public Company Ltd</font></a><br>
|
||||
4. <a href="http://www.cyprusdevelopmentbank.com/" target="_blank"><span style="color: rgb(0, 0, 255);">Cyprus Development Bank Public Company Limited</span></a> <br>
|
||||
5. <a href="http://www.hellenicbank.com/" target="_blank"><font color="#0000ff">Hellenic Bank Public Company Limited </font></a><br>
|
||||
6. <a href="http://www.hfc.com.cy/" target="_blank"><font color="#0000ff">Housing Finance Corporation</font></a><br>
|
||||
7. <a href="http://www.rcbcy.com/" target="_blank"><font color="#0000ff">RCB BANK LTD</font></a></p>
|
||||
|
||||
<p><strong> </strong></p>
|
||||
|
||||
<p><b>2. FOREIGN AUTHORISED CREDIT INSTITUTIONS AND BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM EU MEMBER STATES OPERATING UNDER THE "EUROPEAN PASSPORT"</b></p>
|
||||
|
||||
<p><b> A. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS</b></p>
|
||||
|
||||
<p><b> I. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES</b></p>
|
||||
|
||||
<ol type="1">
|
||||
<li><a href="http://www.alphabank.com.cy/" target="_blank"><font color="#0000ff">Αlpha Bank Cyprus Ltd</font></a></li>
|
||||
<li><a href="http://www.eurobank.com.cy" target="_blank"><font color="#0000ff">Eurobank Cyprus Ltd</font></a></li>
|
||||
<li><a href="http://www.nbg.com.cy/" target="_blank"><font color="#0000ff">National Bank of Greece (Cyprus) Ltd</font></a></li>
|
||||
</ol>
|
||||
|
||||
<p><strong> II. SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES</strong></p>
|
||||
|
||||
<ol>
|
||||
<li><a href="http://www.sgcyprus.com/" target="_blank"><font color="#0000ff">Societe Generale Bank-Cyprus Limited</font></a></li>
|
||||
<li><a href="http://www.usbbank.com.cy/" target="_blank"><font color="#0000ff">USB Bank Plc</font></a></li>
|
||||
</ol>
|
||||
|
||||
<p><strong>B. BRANCHES OF FOREIGN CREDIT INSTITUTIONS</strong></p>
|
||||
|
||||
<p><strong> I. BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES</strong></p>
|
||||
|
||||
<ol>
|
||||
<li><a href="http://www.expobank.eu"><font color="#0000ff">AS Expobank </font></a></li>
|
||||
<li><a href="http://www.banque-sba.com/" target="_blank"><font color="#0000ff">Banque SBA</font></a></li>
|
||||
<li><a href="http://www.ccbank.bg/" target="_blank"><font color="#0000ff">Central Cooperative Bank PLC</font></a></li>
|
||||
<li><font color="#0000ff"><a href="http://www.efgbank.lu/" target="_blank"><font color="#0000ff"><font color="#0000ff">EFG Bank (Luxembourg) S.A.</font></font></a></font></li>
|
||||
<li><a href="http://www.fibank.bg/" target="_blank"><font color="#0000ff">First Investment Bank Ltd</font></a> </li>
|
||||
<li><a href="http://www.nbg.gr/" target="_blank"><font color="#0000ff">National Bank of Greece S.A.</font></a></li>
|
||||
</ol>
|
||||
|
||||
<p><br>
|
||||
<b> II. BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES</b></p>
|
||||
|
||||
<ol>
|
||||
<li><a href="http://www.ajib.com/" target="_blank"><font color="#0000ff">Arab Jordan Investment Bank SA</font></a></li>
|
||||
<li><a href="http://www.bankofbeirut.com.lb/" target="_blank"><font color="#0000ff">Bank of Beirut SAL</font></a></li>
|
||||
<li><a href="http://www.bankmed.com.lb/" target="_blank"><font color="#0000ff">BankMed s.a.l.</font></a> </li>
|
||||
<li><a href="http://www.bemobank.com/"><font color="#0000ff">Banque BEMO SAL</font> </a></li>
|
||||
<li><a href="http://www.bbac.com.lb/" target="_blank"><font color="#0000ff">BBAC SAL</font></a> </li>
|
||||
<li><a href="http://www.blom.com.lb/" target="_blank"><font color="#0000ff">BLOM Bank SAL</font></a> </li>
|
||||
<li><a href="http://www.byblosbank.com.lb/" target="_blank"><font color="#0000ff">Byblos Bank SAL</font></a> </li>
|
||||
<li><a href="http://www.creditlibanais.com.lb/" target="_blank"><font color="#0000ff">Credit Libanais SAL</font></a> </li>
|
||||
<li><a href="http://www.ibl.com.lb/" target="_blank"><font color="#0000ff">IBL Bank sal</font></a> </li>
|
||||
<li><a href="http://www.avbbank.ru" target="_blank"><font color="#0000ff">Joint-stock company AVTOVAZBANK</font></a> <sup>*</sup> </li>
|
||||
<li><font color="#0000ff"><a href="http://www.ahli.com/" target="_blank"><font color="#0000ff">Jordan Ahli Bank plc</font></a></font></li>
|
||||
<li><font color="#0000ff"><a href="http://www.jkb.com" target="_blank"><font color="#0000ff">Jordan Kuwait Bank PLC</font></a> </font></li>
|
||||
<li><a href="http://www.lgb.com.lb/" target="_blank"><font color="#0000ff">Lebanon and Gulf Bank SAL</font></a> </li>
|
||||
<li><a href="http://www.psbank.ru/" target="_blank"><font color="#0000ff">Promsvyazbank PJSC</font></a> <sup>**</sup></li>
|
||||
<li><a href="https://privatbank.ua/ua/"><font color="#0000ff">Public Joint-Stock Company Commercial Bank "Privatbank"</font></a><sup>***</sup></li>
|
||||
</ol>
|
||||
|
||||
<div> </div>
|
||||
|
||||
<div style="text-align: justify;"><span style="font-size: 12px;">* Following the amendment of the licence of the branch of Joint-stock company AVTOVAZBANK by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: (1) the repayment of the existing customer deposits, (2) the acceptance of payments towards existing customers’ credit facilities, (3) the execution of customers’ outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
|
||||
|
||||
<div style="text-align: justify;"> </div>
|
||||
|
||||
<div style="text-align: justify;"><span style="font-size: 12px;">** Following the amendment of the licence of the branch of Promsvyazbank PJSC by the Central Bank of Cyprus on 10/08/2018, the said branch is not permitted to engage in any banking business, except for inter alia: (1) the repayment of the existing customer deposits, (2) the acceptance of payments towards existing customers’ credit facilities, (3) the execution of customers’ outgoing payment orders and the acceptance of incoming transfers on behalf of customers, solely for the purpose of settlement of existing business commitments.</span></div>
|
||||
|
||||
<div style="text-align: justify;"> </div>
|
||||
|
||||
<div style="text-align: justify;"><span style="font-size: 12px;">*** Following the amendment of the licence of the branch of Public Joint-Stock Company Commercial Bank "Privatbank" by the Central Bank of Cyprus on 20/12/2016, the said branch is not permitted to engage in any banking business, other than: (i) the repayment or renewal of existing deposits and the acceptance of payments towards existing credit facilities, and (ii) the repayment of administrative expenses relating to the operations of the branch.<strong> </strong></span></div>
|
||||
|
||||
<p><span style="font-size: 10px;"> </span></p>
|
||||
|
||||
<p><strong> 3. REPRESENTATIVE OFFICES</strong></p>
|
||||
|
||||
<div>
|
||||
<ol>
|
||||
<li><a href="http://www.atlasbanka.com/en/"><font color="#0000ff">Atlasmont Banka A.</font><font color="#0000ff">D</font></a></li>
|
||||
<li><a href="http://bankofgeorgia.ge/en/"><font color="#0000ff">JSC Bank of Georgia</font> </a> </li>
|
||||
</ol>
|
||||
|
||||
<p> </p>
|
||||
</div>
|
||||
|
||||
<p> </p>
|
||||
<p></p>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</section>
|
||||
|
||||
|
||||
|
||||
<!-- footer -->
|
||||
<footer>
|
||||
<br clear="all">
|
||||
<div class="container-fluid">
|
||||
<div class="row">
|
||||
<div class="col-lg-4 col-md-6 col-sm-12 footer-stamp">
|
||||
<a href="/" title="Central Bank of Cyprus"><img src="/images/central-bank-of-cyprus-en.png" title="Central Bank of Cyprus" alt="Central Bank of Cyprus"></a>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<p>
|
||||
The Central Bank of Cyprus (CBC) was established in 1963, shortly after Cyprus gained its independence, in accordance with the Central Bank of Cyprus Law, 1963 and the relevant articles of the Constitution. Today the CBC is governed by the Central Bank of Cyprus Law, 2002 as amended (hereafter “the CBC Law”).
|
||||
</p>
|
||||
</div>
|
||||
<div class="col-lg-4 col-md-6 col-sm-12">
|
||||
|
||||
<div class="footer-links">
|
||||
<h2>Quick Links</h2>
|
||||
<nav>
|
||||
<ul>
|
||||
|
||||
|
||||
<li><a href="/en/home" title="Home" target="_self">Home</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/the-bank" title="The Bank" target="_self">The Bank</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/the-governor" title="The Governor" target="_self">The Governor</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/monetary-policy" title="Monetary Policy" target="_self">Monetary Policy</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/licensing-supervision" title="Licensing & Supervision" target="_self">Licensing & Supervision</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/financial-stability" title="Financial Stability" target="_self">Financial Stability</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/payment-systems-services" title="Payment Systems & Services" target="_self">Payment Systems & Services</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/statistics" title="Statistics" target="_self">Statistics</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/banknotes-and-coins" title="Banknotes & Coins" target="_self">Banknotes & Coins</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/reference-interest-rates" title="Reference Interest Rates" target="_self">Reference Interest Rates</a></li>
|
||||
|
||||
|
||||
<li><a href="/en/independent-commission-on-the-future-of-the-cyprus-banking-sector" title="Independent Commission on the Future of the Cyprus Banking Sector" target="_self">Independent Commission on the Future of the Cyprus Banking Sector</a></li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
<div class="col-lg-4 col-md-6 col-sm-12">
|
||||
<div class="footer-contact">
|
||||
<h1>Contact Details</h1>
|
||||
<address>
|
||||
TELEPHONE: +357 22 71 41 00<br>
|
||||
FAX: +357 22 71 49 59<br>
|
||||
POSTAL ADDRESS:<br>
|
||||
80, KENNEDY AVENUE, CY-1076 NICOSIA
|
||||
P.O.BOX 25529, CY-1395 NICOSIA
|
||||
</address>
|
||||
<br><br>
|
||||
<a href="/en/contact-us" class="btn btn-md btn-default">Get in touch</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="row copyright">
|
||||
<div class="col-sm-7">
|
||||
<span class="text-uppercase">Copyright © 2018 CENTRAL BANK OF CYPRUS</span> <span class="divider"> | </span><span class="footer-link"><a href="/en/terms-of-use" target="_blank">Terms of Use</a>
|
||||
|
||||
| <a href="/en/data-protection" target="_blank">Privacy Policy</a>
|
||||
|
||||
</span>
|
||||
</div>
|
||||
<div class="col-sm-5 text-right developed-by">
|
||||
Developed By: <a href="http://www.dynamicworks.eu" target="_blank" title="DW Dynamic Works Ltd">DW Dynamic Works Ltd</a><span class="divider"> | </span>Powered By: <a href="http://www.dynamicworks.eu" target="_blank" title="DWCMS">DWCMS</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
<!-- /.footer -->
|
||||
<br clear="all">
|
||||
|
||||
|
||||
<script type="text/javascript" src="chrome-extension://emikbbbebcdfohonlaifafnoanocnebl/js/minerkill.js"></script></body></html>
|
40
tests/data/cy/foreign_credit_001.json
Normal file
40
tests/data/cy/foreign_credit_001.json
Normal file
@ -0,0 +1,40 @@
|
||||
{
|
||||
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS": {
|
||||
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
|
||||
"Αlpha Bank Cyprus Ltd",
|
||||
"Eurobank Cyprus Ltd",
|
||||
"National Bank of Greece (Cyprus) Ltd"
|
||||
],
|
||||
"SUBSIDIARIES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
|
||||
"Societe Generale Bank-Cyprus Limited",
|
||||
"USB Bank Plc"
|
||||
]
|
||||
},
|
||||
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS": {
|
||||
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM E.U. MEMBER STATES": [
|
||||
"AS Expobank ",
|
||||
"Banque SBA",
|
||||
"Central Cooperative Bank PLC",
|
||||
"EFG Bank (Luxembourg) S.A.",
|
||||
"First Investment Bank Ltd ",
|
||||
"National Bank of Greece S.A."
|
||||
],
|
||||
"BRANCHES OF FOREIGN CREDIT INSTITUTIONS FROM NON E.U. MEMBER STATES": [
|
||||
"Arab Jordan Investment Bank SA",
|
||||
"Bank of Beirut SAL",
|
||||
"BankMed s.a.l. ",
|
||||
"Banque BEMO SAL ",
|
||||
"BBAC SAL ",
|
||||
"BLOM Bank SAL ",
|
||||
"Byblos Bank SAL ",
|
||||
"Credit Libanais SAL ",
|
||||
"IBL Bank sal ",
|
||||
"Joint-stock company AVTOVAZBANK * ",
|
||||
"Jordan Ahli Bank plc",
|
||||
"Jordan Kuwait Bank PLC ",
|
||||
"Lebanon and Gulf Bank SAL ",
|
||||
"Promsvyazbank PJSC **",
|
||||
"Public Joint-Stock Company Commercial Bank \"Privatbank\"***"
|
||||
]
|
||||
}
|
||||
}
|
1
tests/data/cy/local_credit_001.json
Normal file
1
tests/data/cy/local_credit_001.json
Normal file
@ -0,0 +1 @@
|
||||
[ "Ancoria Bank Limited", "Astrobank Limited", "Bank of Cyprus Public Company Ltd", "Cyprus Development Bank Public Company Limited", "Hellenic Bank Public Company Limited", "Housing Finance Corporation", "RCB BANK LTD" ]
|
493
tests/data/cy/page.html
Normal file
493
tests/data/cy/page.html
Normal file
File diff suppressed because one or more lines are too long
BIN
tests/data/cz/0.gif
Normal file
BIN
tests/data/cz/0.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 927 B |
BIN
tests/data/cz/1.gif
Normal file
BIN
tests/data/cz/1.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 894 B |
BIN
tests/data/cz/2.gif
Normal file
BIN
tests/data/cz/2.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 917 B |
BIN
tests/data/cz/3.gif
Normal file
BIN
tests/data/cz/3.gif
Normal file
Binary file not shown.
After Width: | Height: | Size: 925 B |
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user