obdfcascrape/ncas/gi.js
Martin Donnelly a5109efabe 2019-05-12
2019-05-12 18:33:09 +01:00

774 lines
20 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const logger = require('log4js').getLogger('GI');
logger.level = process.env.LOGGER_LEVEL || 'warn';
class GIScrape extends Scraper {
constructor() {
super();
this.setID('GI');
// treat these elements as block boundaries when scraping permissions
this.blockBoundaries = 'div.panel, li';
// ignore elements matched by these selectors when scraping titles
this._ignoreList = 'button, div.modal-body > h3';
// scrape these top-level permissions headings only
this._headingsToScrape = [
'Financial Services (Banking) Act',
'Financial Services (Investment and Fiduciary Services) Act'
];
// override these values from the base class
this.modePrefix = ['ps_', 'em_', 'ci_', 'ag_'];
this.modeNames = ['paymentServices', 'emoneyServices', 'creditServices', 'agentServices'];
this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services', 'Agent'];
this.on('done', () => {
this._done();
});
this.run = this._throttle(async () => {
await this.__run();
}, 5000);
if (process.env.NODE_ENV === 'production')
this._checkLock().then((l) => {
if (l)
this.run();
});
}
async _convertBrToComma(text) {
return text.replace(/<br\s*[\/]?>/gi, ', ');
}
async _reduceWhiteSpace(text) {
return text.replace(/\s+/g, ' ').trim();
}
/**
*
* @param html
* @param selector
* @returns {Promise<void>}
*
* Finds elements in the `html` with the given `selector`, but returns only the uppermost matched elements,
* and not those that are nested within other matched elements.
*/
async getUppermostElementsBySelector(html, selector) {
const $ = cheerio.load(html);
return $(selector).filter(function () {
return $(this).parents(selector).length === 0;
});
}
async getTextNotInMatchingElements(html, selector) {
const $ = cheerio.load(html);
$(selector)
.remove()
.end();
$(this._ignoreList)
.remove()
.end();
return $.text();
}
async extractSingleFields($, details) {
details.slug = $('meta[name="og:url"]').attr('content').replace('http://www.fsc.gi/regulated-entity/', '');
details.name = $('#fvFirmDetails_lblName').text();
details.address = await this._convertBrToComma($('#fvFirmDetails_lblAddress').html());
details.telephone = $('#fvFirmDetails_lblTel').text();
details.fax = $('#fvFirmDetails_lblFax').text();
details.email = $('#fvFirmDetails_Label12').text();
details.website = $('#fvFirmDetails_lblWebsite').text();
details.legalForm = $('#fvFirmDetails_lblLegalForm').text();
details.countryOfIncorporation = $('#fvFirmDetails_lblIncorporationCountry').text();
details.incorporationNumber = $('#fvFirmDetails_lblRegistrationNo').text();
details.incorporationDate = $('#fvFirmDetails_lblDateOfIncorporation').text();
}
async processOtherNameListItem($, elm, names) {
const type = $(elm).children('strong').text();
let name = $(elm).children('strong').get(0).nextSibling.nodeValue;
// trim the preceding ' -'
if (name.startsWith(' -'))
name = name.substr(2);
name = name.trim();
names.push({
'type': type,
'name': name
});
}
async extractOtherNames($) {
const otherNames = [];
const otherNamesList = $('h3:contains("Other names")').next();
$(otherNamesList).find('li').each(
(index, element) => {
this.processOtherNameListItem($, element, otherNames);
}
);
return otherNames;
}
processParentFirm($, elm, firms) {
const href = $(elm).find('a').attr('href');
const slug = href.replace('/regulated-entity/', '');
firms.push(slug);
}
extractAgentOf($) {
const parentFirms = [];
const parentFirmsList = $('h3:contains("Agent of")').next();
$(parentFirmsList).find('li').each(
(index, element) => {
this.processParentFirm($, element, parentFirms);
}
);
return parentFirms;
}
async processAgentLink($, elm, firmAgentList) {
const href = $(elm).attr('href');
const fullUrl = `https://www.fsc.gi${href}`;
const slug = href.replace('/regulated-entity/', '');
const name = await this._cleanUp($(elm).text());
const id = this._makeFieldName(name);
// TODO: refactor this out of this function somehow, it's not unit-testable without a mock for agentServices
if ('agentServices' in this) // i.e. don't do this if we're running a unit test
// Add the href to our list of links to check later (if it's not already added)
if (this.agentServices.links.findIndex(x => x.href === fullUrl) === -1)
this.agentServices.links.push({
'name': name,
'href': fullUrl,
'id': id
});
firmAgentList.push({
'name': name,
'slug': slug
});
}
async extractAgents(html) {
const $ = cheerio.load(html);
const agents = [];
$('li > a').each(
(index, element) => {
this.processAgentLink($, element, agents);
}
);
return agents;
}
async recurseDOM(html, selector, level = 0) {
const currentLevel = level + 1;
const $ = cheerio.load(html);
const result = [];
const blocks = await this.getUppermostElementsBySelector(html, selector);
for (let i = 0; i < blocks.length; i++) {
const block = blocks[i];
const rawName = await this.getTextNotInMatchingElements($(block).html(), selector);
const name = await this._reduceWhiteSpace(rawName);
// Only scrape the top level headings we're interested in
if (currentLevel === 1 && this._headingsToScrape.indexOf(name) === -1)
continue;
const blockHtml = $(block).html();
let data;
if (name === 'Agents')
data = await this.extractAgents(blockHtml);
else
data = await this.recurseDOM(blockHtml, selector, currentLevel);
if (data === null)
result.push(name);
else
result.push({
'name': name,
'data': data
});
}
if (result.length > 0)
return result;
return null;
}
async extractPermissions(html) {
const $ = cheerio.load(html);
const permissionsContainer = $('h3:contains("Permissions")').next();
if (permissionsContainer.length === 0)
return {};
const permissions = await this.recurseDOM(permissionsContainer.html(), this.blockBoundaries);
return permissions;
}
/**
*
* @param html
* @returns {Promise<void>}
*/
async extractEntityDetails(html) {
try {
const details = {};
const $ = cheerio.load(html);
await this.extractSingleFields($, details);
details.otherNames = await this.extractOtherNames($);
details.permissions = await this.extractPermissions(html);
details.agentOf = await this.extractAgentOf($);
return details;
}
catch (err) {
logger.error(err);
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processEntityDetails(serviceObject) {
const noWhiteSpace = /\W/g;
const { name, id } = serviceObject.links[serviceObject.step];
logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
const entity = removeAccents.remove(id.trim());
const filename = [this.modePrefix[this.mode], entity.replace(noWhiteSpace, '_')].join('');
const filePath = `${this.path}/${filename}`.substring(0, 240);
// Wait for the paragraph at the bottom to have loaded.
await this.page.$x('//a[contains(text(), "* Firms with an asterisk")]');
// open all accordions before taking screenshot
// first, add a class `expand-below` to the container divs we are interested in:
for (const heading of this._headingsToScrape) {
const expandBelowThisDiv = await this.page.$x(`//h4[contains(., "${heading}")]/../..`);
expandBelowThisDiv.forEach(async (elm) => {
await this.page.evaluate(el => {
const currentClass = el.getAttribute('class');
el.setAttribute('class', `${currentClass} expand-below`);
}, elm);
});
}
// then, add a style tag to the <head> to expand the content
await this.page.addStyleTag({
'content':
`
div.expand-below div.collapse {
display: block;
}
div.expand-below div.modal {
display: block;
position: static;
opacity: 1;
overflow: visible;
margin-top: 125px;
}
/* remove drop shadows for faster rendering on large pages */
.modal-content {
-webkit-box-shadow: none;
box-shadow: none;
}
`
});
// temporarily disable GI screenshots
// logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
// await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
const body = await this.page.content();
const $ = cheerio.load(body);
const underConstruction = $('h3:contains("under construction")').length > 0;
if (underConstruction) {
logger.warn(`Page under construction: ${this.page.url()}`);
await jsonfile.writeFile(`${filePath}.json`, { 'underConstruction': true });
}
else {
const details = await this.extractEntityDetails(body);
await jsonfile.writeFile(`${filePath}.json`, { details });
}
await this._randomWait(this.page, 3, 5);
serviceObject.links[serviceObject.step].filename = `${filename}.json`;
serviceObject.step++;
if (serviceObject.step < serviceObject.items) {
const newUrl = serviceObject.links[serviceObject.step].href;
await this._goto(newUrl);
}
else
this.emit('serviceDone');
}
/**
*
* @returns {Promise<void>}
*/
async processRedirector() {
switch (this.mode) {
case 0:
await this.processEntityDetails(this.paymentServices);
break;
case 1:
await this.processEntityDetails(this.emoneyServices);
break;
case 2:
await this.processEntityDetails(this.creditServices);
break;
case 3:
await this.processEntityDetails(this.agentServices);
break;
}
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async processIndex(serviceObject) {
await this._randomWait(this.page, 3, 5);
const body = await this.page.content();
const filename = this.modeNames[this.mode];
// temporarily disable GI screenshots
// logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
// await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);
const $ = cheerio.load(body);
let ul;
switch (this.mode) {
case 0:
ul = $('h3:contains("Authorised Payment Institutions")');
break;
case 1:
ul = $('h3:contains("E-money Institutions")');
break;
case 2:
ul = $('h3:contains("Banks")');
break;
case 3:
ul = $('h3:contains("Electronic Money and Payment Institution Agents")');
}
const links = ul.next().find('li > a');
links.each((i, item) => {
const href = $(item).attr('href');
const text = this._cleanUp($(item).text());
const newUrl = `https://www.fsc.gi${href}`;
const id = this._makeFieldName(text);
if (serviceObject.links.findIndex(x => x.href === newUrl) === -1)
serviceObject.links.push({ 'name': text, 'href': newUrl, 'id': id });
});
}
/**
*
* @param serviceObject
* @returns {Promise<void>}
*/
async buildIndex(serviceObject) {
await this._randomWait(this.page, 6, 9);
logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
await this.processIndex(serviceObject);
if (serviceObject.indexStep < serviceObject.urls.length - 1) {
serviceObject.indexStep++;
const newUrl = serviceObject.urls[serviceObject.indexStep];
await this._goto(newUrl);
}
else
this.emit('indexdone');
}
/**
*
* @returns {Promise<void>}
*/
async indexRedirector() {
logger.debug('>> indexRedirector');
switch (this.mode) {
case 0:
await this.buildIndex(this.paymentServices);
break;
case 1:
await this.buildIndex(this.emoneyServices);
break;
case 2:
await this.buildIndex(this.creditServices);
break;
case 3:
await this.buildIndex(this.agentServices);
break;
}
}
/**
*
* @returns {Promise<void>}
*/
async processNewPage() {
// give the page a few seconds to settle
await this._randomWait(this.page, 3, 5);
const pageUrl = url.parse(await this.page.url());
if (pageUrl.href === 'chrome-error://chromewebdata/') {
logger.warn('Directed to: chrome-error://chromewebdata/');
this.emit('recover');
return;
}
try {
if (
pageUrl.pathname.includes('payment-institutions-20') ||
pageUrl.pathname.includes('e-money-institutions-17') ||
pageUrl.pathname.includes('banks-1') ||
pageUrl.pathname.includes('electronic-money-and-payment-institution-agents-26')
)
await this.indexRedirector();
else if (pageUrl.pathname.includes('regulated-entity'))
await this.processRedirector();
else if (process.env.NODE_ENV) {
await this._uploadError();
throw new Error(`Unknown page: ${pageUrl.href}`);
}
else {
logger.warn('processNewPage Fell through');
logger.warn('currentPage.location', pageUrl.href);
}
}
catch (err) {
if (err.name === 'TimeoutError') {
logger.error(`Reloading page after timeout: ${err.name}: ${err.message}`);
this.page.reload();
}
else
throw(err);
}
}
/**
*
* @returns {Promise<void>}
*/
async attachEvents() {
this.on('serviceDone', async () => {
switch (this.mode) {
case 0:
this.emit('paymentServicesDone');
break;
case 1:
this.emit('emoneyServicesDone');
break;
case 2:
this.emit('creditServicesDone');
break;
case 3:
this.emit('agentServicesDone');
break;
}
});
this.on('psindexdone', async () => {
this.paymentServices.items = this.paymentServices.links.length;
logger.info(`${this.paymentServices.items} items indexed`);
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
await this._goto(newUrl);
});
this.on('emindexdone', async () => {
this.emoneyServices.items = this.emoneyServices.links.length;
logger.info(`${this.emoneyServices.items} items indexed`);
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
await this._goto(newUrl);
});
this.on('ciindexdone', async () => {
this.creditServices.items = this.creditServices.links.length;
logger.info(`${this.creditServices.items} items indexed`);
const newUrl = this.creditServices.links[this.creditServices.step].href;
await this._goto(newUrl);
});
this.on('agindexdone', async () => {
this.agentServices.items = this.agentServices.links.length;
logger.info(`${this.agentServices.items} items indexed`);
const newUrl = this.agentServices.links[this.agentServices.step].href;
await this._goto(newUrl);
});
this.on('indexdone', async () => {
switch (this.mode) {
case 0:
this.emit('psindexdone');
break;
case 1:
this.emit('emindexdone');
break;
case 2:
this.emit('ciindexdone');
break;
case 3:
this.emit('agindexdone');
break;
}
});
this.on('paymentServicesDone', async () => {
logger.warn('paymentServicesDone');
try {
this.paymentServices.done = true;
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
this.mode++;
this.inProgress = false;
await this._goto(this.emoneyServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('emoneyServicesDone', async () => {
logger.warn('emoneyServicesDone');
try {
this.emoneyServices.done = true;
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links': this.emoneyServices.links });
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
this.mode++;
this.inProgress = false;
await this._goto(this.creditServices.urls[0]);
}
catch (e) {
logger.error(e);
}
});
this.on('creditServicesDone', async () => {
logger.warn('creditServicesDone');
try {
this.creditServices.done = true;
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
this.mode++;
this.inProgress = false;
}
catch (e) {
logger.error(e);
}
await this._goto(this.agentServices.urls[0]);
});
this.on('agentServicesDone', async () => {
logger.warn('agentServicesDone');
try {
this.agentServices.done = true;
jsonfile.writeFileSync(`${this.path}/agentServices.json`, { 'links': this.agentServices.links });
jsonfile.writeFileSync(`${this.debugPath}/agentServices.json`, this.agentServices);
this.mode++;
this.inProgress = false;
this.emit('done');
}
catch (e) {
logger.error(e);
}
});
}
/**
*
* @returns {Promise<void>}
*/
async start() {
super._start();
try {
this.mode = 0;
this.paymentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/payment-institutions-20'],
'sections': [],
'sectionLinks': []
};
this.emoneyServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/e-money-institutions-17'],
'sections': [],
'sectionLinks': []
};
this.creditServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'paginationStep': 0,
'visited': false,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/banks-1'],
'sections': [],
'sectionLinks': []
};
this.agentServices = {
'items': 0,
'links': [],
'step': 0,
'indexStep': 0,
'done': false,
'urls': ['https://www.fsc.gi/regulated-entities/electronic-money-and-payment-institution-agents-26']
};
this.startPage = this.paymentServices.urls[0];
this.setPath(path.resolve(`${__dirname}/../artefacts/GI/FSC`));
await this._doNonRepudiation().catch((err) => {
logger.warn(err);
});
await this._initBrowser();
await this._createBrowserPage();
this.page.on('domcontentloaded', this._throttle(async () => {
this.processNewPage().catch((err) => {
logger.error('processNewPage fail', err);
});
}, 2500));
if (this.eventNames().length === 2)
await this.attachEvents();
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
await this._randomWait(this.page, 3, 5);
}
catch (e) {
throw new Error(e);
}
}
async __run() {
await this.start();
}
}
// The scraper class is this module's only export.
module.exports = GIScrape;