// Martin Donnelly be5d3eae07 init
// 2019-05-05 20:13:56 +01:00
//
// 570 lines
// 15 KiB
// JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('SE');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class SEScrape extends Scraper {
constructor() {
super();
this.setID('SE');
this.on('done', () => {
this._done();
});
this.run = this._debounce(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @returns {Promise<{authorization: Array, details}>}
*/
async extractEntity(html) {
const $ = cheerio.load(html);
const details = {};
const authorization = [];
details.name = this._cleanUp($('h2').text());
const dlCells = $('dl.funky').children();
const ulCells = $('ul.tillstand').children();
let current = '';
dlCells.each((index, item) => {
const itemText = this._cleanUp($(item).text());
if (item.name === 'dt') {
details[itemText] = [];
current = itemText;
}
else
details[current].push(itemText);
});
ulCells.each((index, item) => {
const date = this._cleanUp($(item.children).eq(0).text()) ;
const text = this._cleanUp($(item.children).eq(1).text()) ;
authorization.push({ date, text, 'translated':this._translate(text) });
});
return { details, authorization };
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const id = serviceObject.links[serviceObject.step].id;
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('h1').catch((e) => {
throw e;
});
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const $ = cheerio.load(body);
const details = await this.extractEntity(body);
const crossBorderExists = $('div.container a.link');
if (crossBorderExists.length !== 0) {
serviceObject.links[serviceObject.step].data = { details };
await this._findAndClick('div.container a.link', 'View cross border services');
}
else {
await jsonfile.writeFile(`${filePath}.json`, { details });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractCrossBorderServices(html) {
const services = {};
const $ = cheerio.load(html);
const rows = $('div.container table tbody tr');
let current = '';
rows.each((index, item) => {
if ($(item).children().length === 1) {
// this is a heading...
const itemText = this._cleanUp($(item).text());
services[itemText] = { 'authorization': [], 'translated': this._translate(itemText) };
current = itemText;
}
else {
const date = this._cleanUp($(item.children).eq(0).text()) ;
const text = this._cleanUp($(item.children).eq(1).text()) ;
const translated = this._translate(text);
services[current].authorization.push({ date, text, translated });
}
});
return services;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processCrossBorderServicesV2(serviceObject) {
try{
const noWhiteSpace = /\W/g;
const id = serviceObject.links[serviceObject.step].id;
logger.info('Process CBS entity:', id);
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('h1').catch((e) => {
throw e;
});
await this._makeScreenshotV2(this.page, `${filePath}_crossborder`, null);
const body = await this.page.content();
const crossBorderServices = await this.extractCrossBorderServices(body);
const details = serviceObject.links[serviceObject.step].data;
serviceObject.links[serviceObject.step].data = null;
await jsonfile.writeFile(`${filePath}.json`, { details, crossBorderServices });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
// await this._randomWait(this.page, 3, 5);
await this.page.waitForSelector('#institut', { 'visible':true });
const links = await this.page.$$('#institut > tbody > tr > td > a');
for (const item of links) {
// logger.debug(item);
const id = await this.page.evaluate(el => el.innerText, item);
let href = await this.page.evaluate(el => el.href, item);
href = href.concat('&locale=en_GB');
serviceObject.links.push({ id, href });
}
serviceObject.items = serviceObject.links.length;
serviceObject.indexStep++;
this.emit('indexdone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async crossBorderRedirector() {
switch (this.mode) {
case 0:
await this.processCrossBorderServicesV2(this.paymentServices);
break;
case 1:
await this.processCrossBorderServicesV2(this.emoneyServices);
break;
case 2:
await this.processCrossBorderServicesV2(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
switch (pageUrl.pathname) {
case '/en/our-registers/company-register/':
await this.indexRedirector();
break;
case '/en/our-registers/company-register/details':
await this.processRedirector();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('indexdone', async function() {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('serviceDone', async function() {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async function() {
if (this.paymentServices.indexStep < this.paymentServices.urls.length) {
const newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
await this._goto(newUrl);
}
else
this.emit('startProcessingPaymentServices');
});
this.on('startProcessingPaymentServices', async function() {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
// logger.debug(this.paymentServices.links);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('paymentServicesDone', async function() {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
await this._goto(this.emoneyServices.urls[0]);
});
// emoney Services
this.on('emindexdone', async function() {
if (this.emoneyServices.indexStep < this.emoneyServices.urls.length) {
const newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
await this._goto(newUrl);
}
else
this.emit('startProcessingEMoneyServices');
});
this.on('startProcessingEMoneyServices', async function() {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
// logger.debug(this.emoneyServices.links);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('emoneyServicesDone', async function() {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
await this._goto(this.creditServices.urls[0]);
});
// credit services
this.on('ciindexdone', async function() {
if (this.creditServices.indexStep < this.creditServices.urls.length) {
const newUrl = this.creditServices.urls[this.creditServices.indexStep];
await this._goto(newUrl);
}
else
this.emit('startProcessingcreditServices');
});
this.on('startProcessingcreditServices', async function() {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
// logger.debug(this.creditServices.links);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('creditServicesDone', async function() {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.emit('done');
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
await this._loadDictionary();
this.mode = 0;
this.modeTitles = ['**Payment Service', 'EMoney', 'Credit Services'];
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BET&area=#results'/* ,
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Betaltj%C3%A4nstf%C3%B6retag&cat=BETREG&area=#results'*/]
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=EINST&area=#results',
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Utgivare+av+elektroniska+pengar&cat=REGUTG&area=#results']
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=BANK&area=#results',
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=MBANK&area=#results',
'https://www.fi.se/en/our-registers/company-register/?huvudkategori=Bank&cat=SPAR&area=#results']
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = 'https://www.bafin.de/DE/PublikationenDaten/Datenbanken/EGeldInstitute/e-geld-institute_node.html';
this.credit = 'https://portal.mvp.bafin.de/database/InstInfo/sucheForm.do?locale=en_GB';
this.setPath(path.resolve(`${__dirname }/../artefacts/SE/FI`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser(true);
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true }).catch((err) => {
logger.error(err);
});
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle2' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
/**
*
* @returns {Promise<void>}
*/
async __run() {
await this.start();
}
}
module.exports = SEScrape;