// obdfcascrape/ncas/no.js
// Martin Donnelly be5d3eae07 init
// 2019-05-05 20:13:56 +01:00
//
// 768 lines
// 20 KiB
// JavaScript

// Version: 0.0.1-3
// Module dependencies for the NOScrape class defined below.
const Scraper = require('../helpers/scraper'); // base class that NOScrape extends
const cheerio = require('cheerio'); // used to query the HTML handed to the extract* methods
const path = require('path'); // used to resolve the artefacts directory in start()
const jsonfile = require('jsonfile'); // used to write the scraped JSON output
const logger = require('log4js').getLogger('NO');
const url = require('url'); // used to parse the current page URL in processNewPage()
const removeAccents = require('remove-accents-diacritics'); // used when building output file names
// Log level comes from LOGGER_LEVEL when it is set; otherwise it falls back to 'warn'.
logger.level = process.env.LOGGER_LEVEL || 'warn';
class NOScrape extends Scraper {
constructor() {
super();
this.id = 'NO';
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if(l)
this.run();
});
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
try {
const newObj = {};
const $ = cheerio.load(html);
const title = $('h1.common-header-text').text();
newObj.title = this._cleanUp(title);
const detailBox = $('div.side-container.license-side-unit-container');
const children = $(detailBox).children();
let curLabel = '';
children.each((i, item) => {
const tagName = $(item).prop('tagName');
if (tagName === 'H4') {
curLabel = this._makeFieldName($(item).text());
if (!newObj.hasOwnProperty(curLabel))
newObj[curLabel] = [];
}
if (['P', 'SPAN', 'A'].indexOf(tagName) !== -1)
newObj[curLabel].push(this._cleanUp($(item).text()));
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @param blockType
* @returns {{licenseDescription: string, blockType: string}}
*/
extractEntityDetailLicense(html, blockType = 'License') {
try {
const newObj = { 'licenseDescription':'', 'blockType': blockType };
const $ = cheerio.load(html);
const detailBox = $('div.license-container');
const children = $(detailBox).children();
let curLabel = '';
children.each((i, item) => {
const tagName = $(item).prop('tagName');
if (tagName === 'H3') {
curLabel = this._makeFieldName($(item).text());
if (!newObj.hasOwnProperty(curLabel))
newObj[curLabel] = [];
}
if (tagName === 'H2') {
if (!newObj.hasOwnProperty('misc'))
newObj['misc'] = [];
newObj['misc'].push(this._cleanUp($(item).text()));
}
if (['SPAN', 'A', 'P'].indexOf(tagName) !== -1) {
const elmClass = $(item).attr('class');
if (elmClass === 'license-description')
newObj['licenseDescription'] = this._cleanUp($(item).text());
else
newObj[curLabel].push( this._cleanUp($(item).text()));
}
if (tagName === 'UL') {
const liArray = [];
const li = $(item).children('li');
for (let i = 0; i < $(li).length;i++)
liArray.push(this._cleanUp($(li).eq(i).text()));
newObj[curLabel].push(liArray);
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param html
* @returns {{description: {}}}
*/
extractCrossBorderDetailsV2(html) {
const newObj = { 'description':{} };
const titleRegEx = /([^]*?)(?:<ul>)/;
const $ = cheerio.load(html);
const top = $('ul');
const parent = $(top).parent();
const title = this._cleanUp($(parent).children().first().text());
const li = $(top).first().children();
li.each(async (i, item) => {
const anotherUL = $(item).find('ul').index();
if (anotherUL !== -1) {
// There are UL's within this LI
const elms = $(item).find('ul').children('li');
if ($(elms).length !== 0) {
const nameArray = $(item).html().match(titleRegEx);
const rawTitle = nameArray[0].replace('<ul>', '');
const title = this._cleanUp(rawTitle) || 'main';
const label = this._makeFieldName(title);
if (!newObj.hasOwnProperty(label)) {
newObj[label] = [];
newObj.description[label] = title;
}
elms.each((i, e) => {
newObj[label].push(this._cleanUp($(e).text()));
});
}
}
else {
const label = this._makeFieldName(title);
if (!newObj.hasOwnProperty(label)) {
newObj[label] = [];
newObj.description[label] = title;
}
newObj[label].push(this._cleanUp($(item).text()));
}
});
return newObj;
}
/**
*
* @param html
* @returns {Promise<void>}
*/
extractEntityDetailCrossBorder(html) {
try {
const newObj = { };
const $ = cheerio.load(html);
const header = $('h3.license-unit-label:contains("Cross-border services/classes")');
const detailBox = $(header).parent();
const children = $(detailBox).children();
let curLabel = '';
children.each(async (i, item) => {
const tagName = $(item).prop('tagName');
if (tagName === 'H3') {
curLabel = this._makeFieldName($(item).text());
if (!newObj.hasOwnProperty(curLabel))
newObj[curLabel] = [];
}
if (['SPAN', 'A', 'P'].indexOf(tagName) !== -1)
newObj[curLabel].push(this._cleanUp($(item).text()));
if(tagName === 'DIV' || tagName === 'UL') {
if (!newObj.hasOwnProperty('data'))
newObj['data'] = [];
const cbData = this.extractCrossBorderDetailsV2($(item).html());
newObj['data'].push(cbData);
}
});
return newObj;
}
catch( err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @param elm
* @returns {Promise<void>}
*/
async selectLicenseOption(serviceObject, elm) {
const wantedOption = serviceObject.wanted[serviceObject.indexStep];
const elmSelector = await this.page.evaluate((el) => [el.tagName, el.getAttribute('class')].join('.'), elm);
const options = await elm.$$('option');
for (const item of options) {
const text = await this.page.evaluate(el => el.innerText, item);
const value = await this.page.evaluate(el => el.value, item);
if (wantedOption === text) {
await this.page.select(elmSelector, value);
break;
}
}
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async extractIndexItems(html) {
const newArray = [] ;
const $ = cheerio.load(html);
const links = $('a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = this._cleanUp($(item).find('.licenseregistry-search-result-item-header').text());
const country = this._cleanUp($(item).find('.licenseregistry-search-result-item-metadata').text());
const type = this._cleanUp($(item).find('.licenseregistry-search-result-item-type').text());
const params = this._getParamsFromUrl(href);
const newUrl = `${this.rootURI}${href}`;
const id = params.id;
newArray.push({ 'name':text, 'href':newUrl, 'id':id, 'country':country, 'type': type });
});
return newArray;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
let html = '';
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
await this._randomWait(this.page, 3, 5);
let loadedAll = false;
do
await this.page.waitForSelector('button.search-result-loadmore', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
loadedAll = false;
logger.debug('Expanding index..');
await elm.click({ 'delay':Scraper.notARobot() });
await this._randomWait(this.page, 3, 5);
}).catch(() => {
loadedAll = true;
});
while( loadedAll === false);
logger.debug('>> All loaded...');
await this.page.waitForSelector('#js-konsregList > div > div', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
html = await this.page.evaluate(el => el.outerHTML, elm);
}).catch((e) => {
logger.error(e);
logger.warn('No index list');
});
const indexList = await this.extractIndexItems(html);
serviceObject.links = serviceObject.links.concat(indexList);
const filename = this.modeNames[this.mode];
await this._randomWait(this.page, 5, 7);
this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
this.emit('indexdone');
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
await this.page.waitForSelector('select.search-filter', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await this.selectLicenseOption(serviceObject, elm);
}).catch((e) => {
logger.error(e);
logger.warn('No select');
});
// this reload can take a long time
await this.page.waitForSelector('span.search-results-count.highlight', { 'visible':true, 'timeout':75000 }).catch((e) => {
logger.error(e);
logger.warn('Waiting for data timeout');
});
await this.page.waitForSelector('#js-konsregList > div > div', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
await this.processIndex(serviceObject);
}).catch((e) => {
logger.error(e);
logger.warn('No index list');
});
}
/**
*
* @param html
* @returns {Promise<Array>}
*/
async entityContentSniffer(html) {
const $ = cheerio.load(html);
const output = [];
const contentArray = [
{ 'find':'h2:contains("Bank")', 'blockType':'Bank' },
{ 'find':'h2:contains("Agent of payment institution (company)")', 'blockType':'Agent Payment Institution' },
{ 'find':'h2:contains("Tied Agent")', 'blockType':'Agent' },
{ 'find':'h3.license-unit-label:contains("The entity is a tied agent affiliated to")', 'blockType':'Affiliation' },
{ 'find':'h2:contains("Nominee in Norwegian securities registers")', 'blockType':'Securities register' },
{ 'find':'h2:contains("Branch of foreign credit institution")', 'blockType':'Foreign credit institution' },
{ 'find':'h2:contains("Finance company")', 'blockType':'Finance company' },
{ 'find':'h2:contains("Payment institution")', 'blockType':'Payment institution' },
{ 'find':'h2:contains("Agency debt collection on behalf of others")', 'blockType':'Debt collection' },
{ 'find':'h2:contains("E-money institution")', 'blockType':'E-money institution' },
{ 'find':'h2:contains("Investment firm")', 'blockType':'h2:contains("Investment firm")' },
{ 'find':'h2:contains("Intermediator of loans and guarantees")', 'blockType':'Intermediator of loans and guarantees' }
];
const licenseBlocks = $('div.article-content-container').children('div.license-container');
licenseBlocks.each( (i, item) => {
let license = {};
for(const cItem of contentArray)
if ($(item).find(cItem.find).index() !== -1)
license = this.extractEntityDetailLicense(item, cItem.blockType);
if ($(item).find('h3.license-unit-label:contains("Cross-border services/classes")').index() !== -1)
license.crossBorder = this.extractEntityDetailCrossBorder(item);
output.push(license);
});
return output;
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
await this.page.waitForSelector('h1.common-header-text', { 'visible':true, 'timeout':7500 });
await this._randomWait(this.page, 3, 5);
const entity = removeAccents.remove(name.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_'), `_${id}`].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
await this._randomWait(this.page, 5, 7);
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
// --
const details = await this.extractEntityDetails(body);
const licenses = await this.entityContentSniffer(body);
// --
await jsonfile.writeFile(`${filePath}.json`, { details, licenses });
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
switch (pageUrl.pathname) {
case '/en/finanstilsynets-registry/':
await this.indexRedirector();
break;
case '/en/finanstilsynets-registry/detail/':
await this.processRedirector();
break;
case '/en/our-registers/company-register/gransoverskridandehandel/':
await this.crossBorderRedirector();
break;
default:
if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl);
}
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('entityComplete', () => {
this.handleEntityComplete();
});
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
}
});
this.on('psindexdone', async () => {
let newUrl;
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
this.paymentServices.indexStep++;
if (this.paymentServices.indexStep >= this.paymentServices.wanted.length)
newUrl = this.paymentServices.links[this.paymentServices.step].href;
else
newUrl = this.paymentServices.urls[0];
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
let newUrl;
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
this.emoneyServices.indexStep++;
if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length)
newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
else
newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep];
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
let newUrl;
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
this.creditServices.indexStep++;
if (this.creditServices.indexStep >= this.creditServices.urls.length)
newUrl = this.creditServices.links[this.creditServices.step].href;
else
newUrl = this.creditServices.urls[this.creditServices.indexStep];
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try{
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try{
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try{
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.rootURI = 'https://www.finanstilsynet.no';
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
'wanted' : ['Payment institution', 'Agent of payment institution (company)', 'Payment service provider with a limited authorisat'],
'sections' : [],
'sectionLinks' : []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
'wanted' : ['E-money institution'],
'sections' : [],
'sectionLinks' : []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'visited': false,
'done' : false,
'searchDone' : false,
'started': false,
'urls': ['https://www.finanstilsynet.no/en/finanstilsynets-registry/'],
'wanted' : ['Bank', 'Branch of foreign credit institution', 'Credit Institution', 'Savings bank foundation'],
'sections' : [],
'sectionLinks' : []
};
this.startPage = this.paymentServices.urls[0];
this.emoneyUrl = this.emoneyServices.urls[0];
this.credit = this.creditServices.urls[0];
this.setPath(path.resolve(`${__dirname }/../artefacts/NO/FS`));
// await this._doNonRepudiation();
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
//
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
await this._randomWait(this.page, 3, 5, 'Startup');
}
catch(e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
// The module's sole export is the scraper class itself.
module.exports = NOScrape;