// Source: obdfcascrape/ncas/lv.js (JavaScript, 629 lines, 17 KiB)
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
// history-view timestamp (not code): 2019-08-15 07:48:49 +00:00
const logger = require('log4js').getLogger('(LV)');
// history-view timestamp (not code): 2019-05-05 19:13:56 +00:00
const url = require('url');
const removeAccents = require('remove-accents-diacritics');
// Log verbosity is taken from LOGGER_LEVEL; falls back to 'warn' when it is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';
class LVScrape extends Scraper {
constructor() {
super();
2019-08-15 07:48:49 +00:00
this.setID('LV');
2019-05-05 19:13:56 +00:00
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param serviceObject
* @param html
* @returns {Promise<void>}
*/
async processIndex(serviceObject, html) {
const newArray = [] ;
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
const $ = cheerio.load(html);
const links = $('div.featured-articles-title a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = this._cleanUp($(item).text());
const newUrl = `${this.rootURI}${href}`;
const id = this._makeFieldName(text);
newArray.push({ 'name':text, 'href':newUrl, 'id':id });
});
serviceObject.links = serviceObject.links.concat(newArray);
const filename = this.modeNames[this.mode];
2019-08-15 07:48:49 +00:00
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, 1500);
2019-05-05 19:13:56 +00:00
this.emit('indexdone');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
// ('div.featured-articles-title')
await this.page.waitForSelector('table#organizcijasList', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
logger.debug('Menu details.');
const elmHtml = await this.page.evaluate(el => el.outerHTML, elm);
await this.processIndex(serviceObject, elmHtml);
}).catch(() => {
logger.info('No show all button');
});
}
/**
*
* @param html
* @param section
* @returns {Promise<void>}
*/
async extractEntitySections(html, section) {
const httpRegEx = /(http|ftp|https):\/\//;
const filenameFromURL = /(?:\/.*\/)(.*)/;
try{
const newObj = { } ;
const $ = cheerio.load(html);
// const wantedItem = $('div#featured-articles-title');
// const wantedItem = $('h2:contains("Sanctions")');
const wantedItem = $(section);
if (wantedItem.length === 0) return newObj;
newObj.name = this._cleanUp($(wantedItem).text());
const sibling = $(wantedItem).next();
const rows = $(sibling).find('tbody tr');
rows.each((i, item) => {
const children = $(item).children();
if ($(children).length === 2) {
const label = this._makeFieldName($(children).eq(0).text());
newObj[label] = this._cleanUp($(children).eq(1).text());
}
if ($(children).length === 1) {
const label = 'notes';
if (!newObj.hasOwnProperty(label))
newObj[label] = [];
newObj[label].push(this._cleanUp($(children).eq(0).text()));
const links = $(item).find('a');
if ($(links).length > 0)
links.each((y, link) => {
const href = $(link).attr('href');
const text = this._cleanUp($(link).text());
if (href.match(httpRegEx) === null) {
const fileName = href.match(filenameFromURL);
if (!newObj.hasOwnProperty('links'))
newObj['links'] = [];
newObj['links'].push({ href, text, 'filename': fileName[1] });
}
});
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @param section
* @returns {Promise<Array>}
*/
async extractEntitySubSections(html, section) {
try{
const newObj = [] ;
const $ = cheerio.load(html);
const wantedItem = $(section);
if (wantedItem.length === 0) return newObj;
const sibling = $(wantedItem).next();
const rows = $(sibling).find('tbody tr');
let newItem = {};
rows.each((i, item) => {
const children = $(item).children();
if (i === 0 || $(children).length === 1) {
if (Object.keys(newItem).length !== 0)
newObj.push(newItem);
newItem = {};
}
if ($(children).length === 2) {
const label = this._makeFieldName($(children).eq(0).text());
newItem[label] = this._cleanUp($(children).eq(1).text());
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractEntityLicenses(html) {
try{
const newObj = [] ;
const $ = cheerio.load(html);
const wantedItem = $('h2:contains("Licenses / Types of activities")');
if (wantedItem.length === 0) return newObj;
const sibling = $(wantedItem).next();
const rows = $(sibling).find('tbody tr');
let newItem = {};
rows.each((i, item) => {
const children = $(item).children();
if (i === 0 || $(children).length === 1) {
if (Object.keys(newItem).length !== 0)
newObj.push(newItem);
newItem = {};
}
if ($(children).length === 2) {
const label = this._makeFieldName($(children).eq(0).text());
newItem[label] = this._cleanUp($(children).eq(1).text());
}
});
logger.debug(JSON.stringify(newObj));
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
// const filenameFromURL = /(?:\/.*\/)(.*)/;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this.page.waitForSelector('#featured-articles-title > h2', { 'visible':true, 'timeout':7500 });
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 3, 5);
2019-08-15 07:48:49 +00:00
await this._makeScreenshotV2(this.page, `${filePath}_main`, 2000);
2019-05-05 19:13:56 +00:00
const body = await this.page.content();
// --
const details = await this.extractEntitySections(body, 'div#featured-articles-title');
const marketSegments = await this.extractEntitySubSections(body, 'h2:contains("Market segments")');
const relatedPersons = await this.extractEntitySubSections(body, 'h2:contains("Related persons")');
const licenses = await this.extractEntityLicenses(body);
const sanctions = await this.extractEntitySections(body, 'h2:contains("Sanctions")');
const qualifyHoldings = await this.extractEntitySubSections(body, 'h2:contains("Qualifying holdings")');
// --
2019-08-15 07:48:49 +00:00
await jsonfile.writeFile(`${filePath}.json`, { details, licenses, marketSegments, qualifyHoldings, relatedPersons, sanctions });
2019-05-05 19:13:56 +00:00
await this._randomWait(this.page, 3, 5);
2019-08-15 07:48:49 +00:00
2019-05-05 19:13:56 +00:00
if (details.hasOwnProperty('links')) {
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
for(const items of details.links) {
const href = `${this.rootURI}${items.href}`;
await this.page.goto(href, { 'waitUntil': 'networkidle0' }).catch((err) => {
// log this error but Puppeteer isn't supposed to support this sort of download....
// mute the ERR_ABORTED error which happens everytime but alert for everything else.
if (!err.message.includes('net::ERR_ABORTED') )
logger.error('grabLink', err);
});
2019-08-15 07:48:49 +00:00
await this._randomWait(this.page, 2, 3);
2019-05-05 19:13:56 +00:00
}
}
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const urlSplitter = /(\/en\/.*\/)(.*)/;
const pageUrl = url.parse(await this.page.url());
const splitUrl = pageUrl.pathname.match(urlSplitter);
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
if (splitUrl === null) return;
switch (splitUrl[1]) {
2019-05-12 17:33:09 +00:00
case '/en/market/payment-service-providers/payment-institutions/':
case '/en/market/payment-service-providers/electronic-money-institutions/':
2019-05-05 19:13:56 +00:00
case '/en/market/credit-institutions/':
await this.indexRedirector();
break;
2019-05-12 17:33:09 +00:00
case '/en/market/payment-service-providers/payment-institutions/authorized-payment-institutions/':
case '/en/market/payment-service-providers/payment-institutions/registered-payment-institutions/':
case '/en/market/payment-service-providers/electronic-money-institutions/authorized-electronic-money-institutions/':
case '/en/market/payment-service-providers/electronic-money-institutions/registered-electronic-money-institutions/':
2019-05-05 19:13:56 +00:00
case '/en/market/credit-institutions/banks/':
await this.processRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async () => {
let newUrl;
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
this.paymentServices.indexStep++;
if (this.paymentServices.indexStep >= this.paymentServices.urls.length)
newUrl = this.paymentServices.links[this.paymentServices.step].href;
else
newUrl = this.paymentServices.urls[this.paymentServices.indexStep];
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
let newUrl;
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
this.emoneyServices.indexStep++;
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length)
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
else
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
let newUrl;
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
this.creditServices.indexStep++;
if (this.creditServices.indexStep >= this.creditServices.urls.length)
newUrl = this.creditServices.links[this.creditServices.step].href;
else
newUrl = this.creditServices.urls[this.creditServices.indexStep];
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.rootURI = 'http://www.fktk.lv';
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
2019-05-12 17:33:09 +00:00
'urls': ['http://www.fktk.lv/en/market/payment-service-providers/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-service-providers/payment-institutions/registered-payment-institutions.html'],
2019-05-05 19:13:56 +00:00
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
2019-05-12 17:33:09 +00:00
'urls': ['http://www.fktk.lv/en/market/payment-service-providers/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/payment-service-providers/electronic-money-institutions/registered-electronic-money-institutions.html'],
2019-05-05 19:13:56 +00:00
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['http://www.fktk.lv/en/market/credit-institutions/banks.html'],
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/LV/FCMC`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch(e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
module.exports = LVScrape;