2019-05-05 19:13:56 +00:00
|
|
|
const Scraper = require('../helpers/scraper');
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const path = require('path');
|
|
|
|
const jsonfile = require('jsonfile');
|
|
|
|
const removeAccents = require('remove-accents-diacritics');
|
|
|
|
const url = require('url');
|
|
|
|
const logger = require('log4js').getLogger('GI');
|
|
|
|
|
|
|
|
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
|
|
|
|
/**
 * Scraper for the fsc.gi "regulated entities" register.
 *
 * The scrape runs in four consecutive modes (see `modeNames`): for each mode
 * the index page is read to collect entity links, then every entity page is
 * visited and its details are written to a JSON file below `this.path`.
 * Progress is driven by page loads (`processNewPage`) and by the events wired
 * up in `attachEvents`.
 */
class GIScrape extends Scraper {
  constructor() {
    super();

    this.setID('GI');

    // treat these elements as block boundaries when scraping permissions
    this.blockBoundaries = 'div.panel, li';

    // ignore elements matched by these selectors when scraping titles
    this._ignoreList = 'button, div.modal-body > h3';

    // scrape these top-level permissions headings only
    this._headingsToScrape = [
      'Financial Services (Banking) Act',
      'Financial Services (Investment and Fiduciary Services) Act'
    ];

    // override these values from the base class
    this.modePrefix = ['ps_', 'em_', 'ci_', 'ag_'];
    this.modeNames = ['paymentServices', 'emoneyServices', 'creditServices', 'agentServices'];
    this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services', 'Agent'];

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      // Returning the result of run() lets a failed start-up reach the
      // rejection handler instead of becoming an unhandled rejection.
      this._checkLock()
        .then(lockAcquired => (lockAcquired ? this.run() : undefined))
        .catch((err) => {
          logger.error('Automatic start failed', err);
        });
    }
  }

  /**
   * Replaces every `<br>` / `<br/>` / `<br />` tag with a comma and a space.
   *
   * @param {string} text HTML fragment
   * @returns {Promise<string>}
   */
  async _convertBrToComma(text) {
    return text.replace(/<br\s*\/?>/gi, ', ');
  }

  /**
   * Collapses every run of whitespace into a single space and trims the ends.
   *
   * @param {string} text
   * @returns {Promise<string>}
   */
  async _reduceWhiteSpace(text) {
    return text.replace(/\s+/g, ' ').trim();
  }

  /**
   * Finds elements in the `html` with the given `selector`, but returns only
   * the uppermost matched elements, and not those that are nested within other
   * matched elements.
   *
   * @param {string} html
   * @param {string} selector
   * @returns {Promise<Object>} cheerio selection of the outermost matches
   */
  async getUppermostElementsBySelector(html, selector) {
    const $ = cheerio.load(html);

    return $(selector).filter(function () {
      return $(this).parents(selector).length === 0;
    });
  }

  /**
   * Returns the text of `html` after removing every element matched by
   * `selector` and every element matched by `this._ignoreList`.
   *
   * @param {string} html
   * @param {string} selector
   * @returns {Promise<string>}
   */
  async getTextNotInMatchingElements(html, selector) {
    const $ = cheerio.load(html);

    $(selector).remove();
    $(this._ignoreList).remove();

    return $.text();
  }

  /**
   * Copies the single-valued firm fields from the entity page into `details`.
   *
   * @param {Function} $ cheerio instance loaded with the entity page
   * @param {Object} details object that receives the fields (mutated)
   * @returns {Promise<void>}
   * @throws {TypeError} if the `og:url` meta tag or the address label is missing
   */
  async extractSingleFields($, details) {
    // The slug is whatever follows the entity prefix; accept both schemes,
    // since the site is crawled over https.
    details.slug = $('meta[name="og:url"]')
      .attr('content')
      .replace(/https?:\/\/www\.fsc\.gi\/regulated-entity\//, '');
    details.name = $('#fvFirmDetails_lblName').text();
    details.address = await this._convertBrToComma($('#fvFirmDetails_lblAddress').html());
    details.telephone = $('#fvFirmDetails_lblTel').text();
    details.fax = $('#fvFirmDetails_lblFax').text();
    details.email = $('#fvFirmDetails_Label12').text();
    details.website = $('#fvFirmDetails_lblWebsite').text();
    details.legalForm = $('#fvFirmDetails_lblLegalForm').text();
    details.countryOfIncorporation = $('#fvFirmDetails_lblIncorporationCountry').text();
    details.incorporationNumber = $('#fvFirmDetails_lblRegistrationNo').text();
    details.incorporationDate = $('#fvFirmDetails_lblDateOfIncorporation').text();
  }

  /**
   * Parses one "Other names" list item of the form
   * `<strong>type</strong> - name` and appends `{ type, name }` to `names`.
   *
   * @param {Function} $ cheerio instance
   * @param {Object} elm the `<li>` element
   * @param {Array<{type: string, name: string}>} names output list (mutated)
   * @returns {Promise<void>}
   */
  async processOtherNameListItem($, elm, names) {
    const label = $(elm).children('strong');
    const type = label.text();
    let name = label.get(0).nextSibling.nodeValue;

    // trim the preceding ' -'
    if (name.startsWith(' -')) {
      name = name.slice(2);
    }

    names.push({
      'type': type,
      'name': name.trim()
    });
  }

  /**
   * Collects the entries of the list that follows the "Other names" heading.
   * An item that cannot be parsed is logged and skipped.
   *
   * @param {Function} $ cheerio instance loaded with the entity page
   * @returns {Promise<Array<{type: string, name: string}>>}
   */
  async extractOtherNames($) {
    const otherNames = [];
    const otherNamesList = $('h3:contains("Other names")').next();

    // Awaited one by one so failures are handled here rather than surfacing
    // as unhandled rejections from inside a cheerio `.each` callback.
    for (const listItem of $(otherNamesList).find('li').get()) {
      try {
        await this.processOtherNameListItem($, listItem, otherNames);
      }
      catch (err) {
        logger.warn(`Skipping other-name entry that could not be parsed: ${err.message}`);
      }
    }

    return otherNames;
  }

  /**
   * Appends the slug of the firm linked from `elm` to `firms`.
   *
   * @param {Function} $ cheerio instance
   * @param {Object} elm list item containing the link to the parent firm
   * @param {string[]} firms output list (mutated)
   */
  processParentFirm($, elm, firms) {
    const href = $(elm).find('a').attr('href');

    firms.push(href.replace('/regulated-entity/', ''));
  }

  /**
   * Collects the slugs of the firms listed below the "Agent of" heading.
   *
   * @param {Function} $ cheerio instance loaded with the entity page
   * @returns {string[]}
   */
  extractAgentOf($) {
    const parentFirms = [];
    const parentFirmsList = $('h3:contains("Agent of")').next();

    $(parentFirmsList).find('li').each((index, element) => {
      this.processParentFirm($, element, parentFirms);
    });

    return parentFirms;
  }

  /**
   * Records one agent link: appends `{ name, slug }` to `firmAgentList` and,
   * when a scrape is running, queues the agent's page in
   * `this.agentServices.links` unless it is already queued.
   *
   * @param {Function} $ cheerio instance
   * @param {Object} elm the `<a>` element
   * @param {Array<{name: string, slug: string}>} firmAgentList output list (mutated)
   * @returns {Promise<void>}
   */
  async processAgentLink($, elm, firmAgentList) {
    const href = $(elm).attr('href');
    const fullUrl = `https://www.fsc.gi${href}`;
    const slug = href.replace('/regulated-entity/', '');
    const name = await this._cleanUp($(elm).text());
    const id = this._makeFieldName(name);

    // TODO: refactor this out of this function somehow, it's not unit-testable without a mock for agentServices
    if ('agentServices' in this) { // i.e. don't do this if we're running a unit test
      // Add the href to our list of links to check later (if it's not already added)
      const alreadyQueued = this.agentServices.links.some(x => x.href === fullUrl);

      if (!alreadyQueued) {
        this.agentServices.links.push({
          'name': name,
          'href': fullUrl,
          'id': id
        });
      }
    }

    firmAgentList.push({
      'name': name,
      'slug': slug
    });
  }

  /**
   * Extracts every `li > a` agent link found in `html`, in document order.
   * A link that cannot be processed is logged and skipped.
   *
   * @param {string} html
   * @returns {Promise<Array<{name: string, slug: string}>>}
   */
  async extractAgents(html) {
    const $ = cheerio.load(html);
    const agents = [];

    // processAgentLink is async: await each call so the list is complete
    // before it is returned.
    for (const link of $('li > a').get()) {
      try {
        await this.processAgentLink($, link, agents);
      }
      catch (err) {
        logger.warn(`Skipping agent link that could not be processed: ${err.message}`);
      }
    }

    return agents;
  }

  /**
   * Walks the nested blocks matched by `selector` and builds a tree of names.
   *
   * A block without nested blocks is represented by its name (a string); a
   * block with children by `{ name, data }`. A block named "Agents" gets the
   * result of `extractAgents` as its data. At the first level only the
   * headings listed in `this._headingsToScrape` are kept.
   *
   * @param {string} html
   * @param {string} selector block boundary selector
   * @param {number} [level=0] depth of the caller (0 for the initial call)
   * @returns {Promise<Array|null>} the tree, or `null` when no block was found
   */
  async recurseDOM(html, selector, level = 0) {
    const currentLevel = level + 1;
    const $ = cheerio.load(html);
    const result = [];
    const blocks = await this.getUppermostElementsBySelector(html, selector);

    for (let i = 0; i < blocks.length; i++) {
      const blockHtml = $(blocks[i]).html();
      const rawName = await this.getTextNotInMatchingElements(blockHtml, selector);
      const name = await this._reduceWhiteSpace(rawName);

      // Only scrape the top level headings we're interested in
      if (currentLevel === 1 && !this._headingsToScrape.includes(name)) {
        continue;
      }

      const data = name === 'Agents'
        ? await this.extractAgents(blockHtml)
        : await this.recurseDOM(blockHtml, selector, currentLevel);

      if (data === null) {
        result.push(name);
      }
      else {
        result.push({
          'name': name,
          'data': data
        });
      }
    }

    return result.length > 0 ? result : null;
  }

  /**
   * Extracts the permissions tree from the element that follows the
   * "Permissions" heading.
   *
   * @param {string} html full entity page
   * @returns {Promise<Array|null|Object>} the tree built by `recurseDOM`
   *   (`null` when it is empty), or `{}` when the page has no permissions
   *   container
   */
  async extractPermissions(html) {
    const $ = cheerio.load(html);
    const permissionsContainer = $('h3:contains("Permissions")').next();

    if (permissionsContainer.length === 0) {
      return {};
    }

    return this.recurseDOM(permissionsContainer.html(), this.blockBoundaries);
  }

  /**
   * Builds the details object for one entity page.
   *
   * @param {string} html full entity page
   * @returns {Promise<Object|undefined>} the details, or `undefined` when
   *   extraction failed (the error is logged, not rethrown)
   */
  async extractEntityDetails(html) {
    try {
      const details = {};
      const $ = cheerio.load(html);

      await this.extractSingleFields($, details);

      details.otherNames = await this.extractOtherNames($);
      details.permissions = await this.extractPermissions(html);
      details.agentOf = await this.extractAgentOf($);

      return details;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Scrapes the entity page currently loaded in `this.page`, writes the result
   * to `<prefix><id>.json`, then moves on to the next link of `serviceObject`
   * or emits `serviceDone` after the last one.
   *
   * @param {Object} serviceObject crawl state of the current mode
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {
    const nonWordChars = /\W/g;

    const { name, id } = serviceObject.links[serviceObject.step];
    logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);

    const entity = removeAccents.remove(id.trim());
    const filename = [this.modePrefix[this.mode], entity.replace(nonWordChars, '_')].join('');
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    // Look up the paragraph at the bottom of the page.
    // NOTE(review): `$x` only queries the current DOM, it does not wait for the
    // element to appear — confirm whether an explicit wait was intended here.
    await this.page.$x('//a[contains(text(), "* Firms with an asterisk")]');

    // open all accordions before reading the page
    // first, add a class `expand-below` to the container divs we are interested in.
    // Every evaluate() is awaited so the class is in place before the
    // stylesheet is injected and the page content is read.
    for (const heading of this._headingsToScrape) {
      const containers = await this.page.$x(`//h4[contains(., "${heading}")]/../..`);

      for (const container of containers) {
        await this.page.evaluate((el) => {
          el.classList.add('expand-below');
        }, container);
      }
    }

    // then, add a style tag to the <head> to expand the content
    await this.page.addStyleTag({
      'content': `
        div.expand-below div.collapse {
          display: block;
        }

        div.expand-below div.modal {
          display: block;
          position: static;
          opacity: 1;
          overflow: visible;
          margin-top: 125px;
        }

        /* remove drop shadows for faster rendering on large pages */
        .modal-content {
          -webkit-box-shadow: none;
          box-shadow: none;
        }
      `
    });

    // temporarily disable GI screenshots
    // logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
    // await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const $ = cheerio.load(body);
    const underConstruction = $('h3:contains("under construction")').length > 0;

    if (underConstruction) {
      logger.warn(`Page under construction: ${this.page.url()}`);
      await jsonfile.writeFile(`${filePath}.json`, { 'underConstruction': true });
    }
    else {
      const details = await this.extractEntityDetails(body);
      await jsonfile.writeFile(`${filePath}.json`, { details });
    }

    await this._randomWait(this.page, 3, 5);

    serviceObject.links[serviceObject.step].filename = `${filename}.json`;
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items) {
      await this._goto(serviceObject.links[serviceObject.step].href);
    }
    else {
      this.emit('serviceDone');
    }
  }

  /**
   * Processes the loaded entity page with the crawl state of the current mode.
   * Does nothing when `this.mode` is not one of the known modes.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    const serviceName = this.modeNames[this.mode];

    if (serviceName !== undefined) {
      await this.processEntityDetails(this[serviceName]);
    }
  }

  /**
   * Reads the index page loaded in `this.page` and appends every entity link
   * below the heading of the current mode to `serviceObject.links`, skipping
   * URLs that are already present.
   *
   * @param {Object} serviceObject crawl state of the current mode
   * @returns {Promise<void>}
   * @throws {Error} if `this.mode` has no index heading
   */
  async processIndex(serviceObject) {
    const indexHeadings = [
      'Authorised Payment Institutions',
      'E-money Institutions',
      'Banks',
      'Electronic Money and Payment Institution Agents'
    ];

    await this._randomWait(this.page, 3, 5);

    const body = await this.page.content();

    // temporarily disable GI screenshots
    // const filename = this.modeNames[this.mode];
    // logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
    // await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${serviceObject.indexStep}`, null);

    const heading = indexHeadings[this.mode];

    if (heading === undefined) {
      throw new Error(`No index heading known for mode ${this.mode}`);
    }

    const $ = cheerio.load(body);
    const links = $(`h3:contains("${heading}")`).next().find('li > a');
    const knownUrls = new Set(serviceObject.links.map(x => x.href));

    for (const item of links.get()) {
      const href = $(item).attr('href');
      // Awaited for consistency with processAgentLink; a no-op if _cleanUp is synchronous.
      const text = await this._cleanUp($(item).text());
      const newUrl = `https://www.fsc.gi${href}`;

      if (!knownUrls.has(newUrl)) {
        knownUrls.add(newUrl);
        serviceObject.links.push({ 'name': text, 'href': newUrl, 'id': this._makeFieldName(text) });
      }
    }
  }

  /**
   * Indexes the loaded index page, then either navigates to the next index
   * URL of `serviceObject` or emits `indexdone` after the last one.
   *
   * @param {Object} serviceObject crawl state of the current mode
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    await this._randomWait(this.page, 6, 9);

    logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);

    await this.processIndex(serviceObject);

    if (serviceObject.indexStep < serviceObject.urls.length - 1) {
      serviceObject.indexStep++;
      await this._goto(serviceObject.urls[serviceObject.indexStep]);
    }
    else {
      this.emit('indexdone');
    }
  }

  /**
   * Builds the index with the crawl state of the current mode.
   * Does nothing when `this.mode` is not one of the known modes.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    logger.debug('>> indexRedirector');

    const serviceName = this.modeNames[this.mode];

    if (serviceName !== undefined) {
      await this.buildIndex(this[serviceName]);
    }
  }

  /**
   * Dispatches a freshly loaded page: index pages go to `indexRedirector`,
   * entity pages to `processRedirector`. A Chrome error page triggers
   * `recover`. A timeout while processing reloads the page; any other error
   * is rethrown.
   *
   * @returns {Promise<void>}
   * @throws {Error} on an unknown page when `NODE_ENV` is set
   */
  async processNewPage() {
    const indexPathMarkers = [
      'payment-institutions-20',
      'e-money-institutions-17',
      'banks-1',
      'electronic-money-and-payment-institution-agents-26'
    ];

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    try {
      if (indexPathMarkers.some(marker => pageUrl.pathname.includes(marker))) {
        await this.indexRedirector();
      }
      else if (pageUrl.pathname.includes('regulated-entity')) {
        await this.processRedirector();
      }
      else if (process.env.NODE_ENV) {
        await this._uploadError();
        throw new Error(`Unknown page: ${pageUrl.href}`);
      }
      else {
        logger.warn('processNewPage Fell through');
        logger.warn('currentPage.location', pageUrl.href);
      }
    }
    catch (err) {
      if (err.name !== 'TimeoutError') {
        throw err;
      }

      logger.error(`Reloading page after timeout: ${err.name}: ${err.message}`);
      // Awaited so a failing reload is reported to the caller.
      await this.page.reload();
    }
  }

  /**
   * Registers the event handlers that drive the crawl:
   *  - `serviceDone` / `indexdone` fan out to the per-mode events;
   *  - `<prefix>indexdone` records the item count and opens the first entity;
   *  - `<service>Done` writes the link lists, advances `this.mode` and opens
   *    the index of the next service, or emits `done` after the last one.
   *
   * Service objects are looked up when an event fires, because `start()`
   * replaces them on every run.
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    const indexDoneEvents = ['psindexdone', 'emindexdone', 'ciindexdone', 'agindexdone'];

    this.on('serviceDone', () => {
      const serviceName = this.modeNames[this.mode];

      if (serviceName !== undefined) {
        this.emit(`${serviceName}Done`);
      }
    });

    this.on('indexdone', () => {
      const eventName = indexDoneEvents[this.mode];

      if (eventName !== undefined) {
        this.emit(eventName);
      }
    });

    this.modeNames.forEach((serviceName, position) => {
      const nextServiceName = this.modeNames[position + 1];

      this.on(indexDoneEvents[position], async () => {
        try {
          const service = this[serviceName];

          service.items = service.links.length;
          logger.info(`${service.items} items indexed`);

          await this._goto(service.links[service.step].href);
        }
        catch (e) {
          logger.error(e);
        }
      });

      this.on(`${serviceName}Done`, async () => {
        logger.warn(`${serviceName}Done`);

        try {
          const service = this[serviceName];

          service.done = true;
          jsonfile.writeFileSync(`${this.path}/${serviceName}.json`, { 'links': service.links });
          jsonfile.writeFileSync(`${this.debugPath}/${serviceName}.json`, service);

          this.mode++;
          this.inProgress = false;

          // Kept inside the try for every service, so a failed write never
          // advances the crawl and a failed navigation is always logged.
          if (nextServiceName !== undefined) {
            await this._goto(this[nextServiceName].urls[0]);
          }
          else {
            this.emit('done');
          }
        }
        catch (e) {
          logger.error(e);
        }
      });
    });
  }

  /**
   * Resets the crawl state, prepares the browser page and opens the first
   * index page.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the set-up or the first navigation fails
   */
  async start() {
    super._start();

    // Crawl state for a service that has a full index section.
    const newServiceState = indexUrl => ({
      'items': 0,
      'links': [],
      'step': 0,
      'indexStep': 0,
      'paginationStep': 0,
      'visited': false,
      'done': false,
      'urls': [indexUrl],
      'sections': [],
      'sectionLinks': []
    });

    try {
      this.mode = 0;

      this.paymentServices = newServiceState('https://www.fsc.gi/regulated-entities/payment-institutions-20');
      this.emoneyServices = newServiceState('https://www.fsc.gi/regulated-entities/e-money-institutions-17');
      this.creditServices = newServiceState('https://www.fsc.gi/regulated-entities/banks-1');

      this.agentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'done': false,
        'urls': ['https://www.fsc.gi/regulated-entities/electronic-money-and-payment-institution-agents-26']
      };

      this.startPage = this.paymentServices.urls[0];

      this.setPath(path.resolve(`${__dirname}/../artefacts/GI/FSC`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): presumably exactly two event names are registered before
      // attachEvents() has run, so this attaches the handlers only once across
      // repeated starts — confirm against the base class.
      if (this.eventNames().length === 2) {
        await this.attachEvents();
      }

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack trace is preserved.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point used by the throttled `run`.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
|
|
|
|
|
|
|
|
module.exports = GIScrape;
|