const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const url = require('url');
const logger = require('log4js').getLogger('GI');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the regulated-entity register published on www.fsc.gi.
 *
 * The scrape runs through four "modes" in order (payment services, e-money
 * services, credit services, agents). For each mode an index page is read to
 * collect entity links, then every entity page is visited and its details are
 * written to a JSON file under `this.path`.
 *
 * Navigation is event driven: every `domcontentloaded` event on the browser
 * page triggers `processNewPage`, which dispatches on the current URL.
 *
 * NOTE(review): helpers such as `_goto`, `_randomWait`, `_throttle`,
 * `_cleanUp`, `_makeFieldName`, `_checkLock` and `_done` are inherited from
 * `Scraper` and are not visible in this file; their exact semantics should be
 * confirmed against the base class.
 */
class GIScrape extends Scraper {
  constructor() {
    super();
    this.setID('GI');

    // treat these elements as block boundaries when scraping permissions
    this.blockBoundaries = 'div.panel, li';

    // ignore elements matched by these selectors when scraping titles
    this._ignoreList = 'button, div.modal-body > h3';

    // scrape these top-level permissions headings only
    this._headingsToScrape = [
      'Financial Services (Banking) Act',
      'Financial Services (Investment and Fiduciary Services) Act'
    ];

    // override these values from the base class
    this.modePrefix = ['ps_', 'em_', 'ci_', 'ag_'];
    this.modeNames = ['paymentServices', 'emoneyServices', 'creditServices', 'agentServices'];
    this.modeTitles = ['Payment Service', 'EMoney', 'Credit Services', 'Agent'];

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        // a constructor cannot await, so make sure a failed lock check is at least logged
        .catch((err) => {
          logger.error(err);
        });
    }
  }

  /**
   * Replaces HTML line breaks (`<br>`, `<br/>`, `<br />`) with `', '`.
   *
   * @param {?string} text - HTML fragment; may be `null` when cheerio's `.html()` matched nothing
   * @returns {Promise<string>} the fragment with line breaks replaced, or `''` for null/undefined input
   */
  async _convertBrToComma(text) {
    if (text == null) return '';

    return text.replace(/<br\s*\/?>/gi, ', ');
  }

  /**
   * Collapses every run of whitespace into a single space and trims both ends.
   *
   * @param {string} text
   * @returns {Promise<string>}
   */
  async _reduceWhiteSpace(text) {
    return text.replace(/\s+/g, ' ').trim();
  }

  /**
   * Finds elements in the `html` with the given `selector`, but returns only the uppermost matched elements,
   * and not those that are nested within other matched elements.
   *
   * @param {string} html
   * @param {string} selector
   * @returns {Promise} resolves to a cheerio selection of the outermost matches
   */
  async getUppermostElementsBySelector(html, selector) {
    const $ = cheerio.load(html);

    return $(selector).filter(function () {
      return $(this).parents(selector).length === 0;
    });
  }

  /**
   * Returns the text of `html` after removing every element matching
   * `selector` and every element matching `this._ignoreList`.
   *
   * @param {string} html
   * @param {string} selector
   * @returns {Promise<string>}
   */
  async getTextNotInMatchingElements(html, selector) {
    const $ = cheerio.load(html);

    $(selector)
      .remove()
      .end();

    $(this._ignoreList)
      .remove()
      .end();

    return $.text();
  }

  /**
   * Copies the single-valued firm fields from the entity page into `details`.
   *
   * Throws if the `og:url` meta tag is missing (its `content` attribute is
   * dereferenced unconditionally); `extractEntityDetails` catches that.
   *
   * @param $ - cheerio instance loaded with the entity page
   * @param {Object} details - object that receives the extracted fields (mutated in place)
   * @returns {Promise<void>}
   */
  async extractSingleFields($, details) {
    details.slug = $('meta[name="og:url"]').attr('content').replace('http://www.fsc.gi/regulated-entity/', '');
    details.name = $('#fvFirmDetails_lblName').text();
    details.address = await this._convertBrToComma($('#fvFirmDetails_lblAddress').html());
    details.telephone = $('#fvFirmDetails_lblTel').text();
    details.fax = $('#fvFirmDetails_lblFax').text();
    details.email = $('#fvFirmDetails_Label12').text();
    details.website = $('#fvFirmDetails_lblWebsite').text();
    details.legalForm = $('#fvFirmDetails_lblLegalForm').text();
    details.countryOfIncorporation = $('#fvFirmDetails_lblIncorporationCountry').text();
    details.incorporationNumber = $('#fvFirmDetails_lblRegistrationNo').text();
    details.incorporationDate = $('#fvFirmDetails_lblDateOfIncorporation').text();
  }

  /**
   * Parses one "Other names" list item. The `<strong>` child supplies the
   * type and the node that follows it supplies the name.
   *
   * @param $ - cheerio instance
   * @param elm - the `<li>` element
   * @param {Array<{type: string, name: string}>} names - list the parsed entry is appended to
   * @returns {Promise<void>}
   */
  async processOtherNameListItem($, elm, names) {
    const strong = $(elm).children('strong');
    const type = strong.text();

    // the name lives in the node directly after the <strong>; tolerate markup where it is absent
    const sibling = strong.length > 0 ? strong.get(0).nextSibling : null;
    let name = sibling && sibling.nodeValue ? sibling.nodeValue : '';

    // trim the preceding ' -'
    if (name.startsWith(' -')) name = name.slice(2);

    name = name.trim();

    names.push({
      'type': type,
      'name': name
    });
  }

  /**
   * Collects the entries of the list that follows the "Other names" heading.
   *
   * @param $ - cheerio instance loaded with the entity page
   * @returns {Promise<Array<{type: string, name: string}>>}
   */
  async extractOtherNames($) {
    const otherNames = [];
    const otherNamesList = $('h3:contains("Other names")').next();

    // awaited loop (rather than `.each`) so a failure in the async helper propagates to the caller
    for (const element of $(otherNamesList).find('li').toArray()) {
      await this.processOtherNameListItem($, element, otherNames);
    }

    return otherNames;
  }

  /**
   * Appends the slug of the firm linked from `elm` to `firms`.
   * Items without a link are skipped.
   *
   * @param $ - cheerio instance
   * @param elm - the `<li>` element
   * @param {string[]} firms - list the slug is appended to
   */
  processParentFirm($, elm, firms) {
    const href = $(elm).find('a').attr('href');

    if (!href) return;

    const slug = href.replace('/regulated-entity/', '');
    firms.push(slug);
  }

  /**
   * Collects the slugs of the firms listed after the "Agent of" heading.
   *
   * @param $ - cheerio instance loaded with the entity page
   * @returns {string[]}
   */
  extractAgentOf($) {
    const parentFirms = [];
    const parentFirmsList = $('h3:contains("Agent of")').next();

    $(parentFirmsList).find('li').each((index, element) => {
      this.processParentFirm($, element, parentFirms);
    });

    return parentFirms;
  }

  /**
   * Records one agent link: appends `{name, slug}` to `firmAgentList` and,
   * when `this.agentServices` exists, queues the agent's page in
   * `this.agentServices.links` unless that URL is already queued.
   *
   * @param $ - cheerio instance
   * @param elm - the `<a>` element
   * @param {Array<{name: string, slug: string}>} firmAgentList
   * @returns {Promise<void>}
   */
  async processAgentLink($, elm, firmAgentList) {
    const href = $(elm).attr('href');
    const fullUrl = `https://www.fsc.gi${href}`;
    const slug = href.replace('/regulated-entity/', '');
    const name = await this._cleanUp($(elm).text());
    const id = this._makeFieldName(name);

    // TODO: refactor this out of this function somehow, it's not unit-testable without a mock for agentServices
    if ('agentServices' in this) { // i.e. don't do this if we're running a unit test
      // Add the href to our list of links to check later (if it's not already added)
      const alreadyQueued = this.agentServices.links.some(x => x.href === fullUrl);

      if (!alreadyQueued) {
        this.agentServices.links.push({
          'name': name,
          'href': fullUrl,
          'id': id
        });
      }
    }

    firmAgentList.push({
      'name': name,
      'slug': slug
    });
  }

  /**
   * Extracts every `li > a` agent link found in `html`.
   *
   * @param {string} html
   * @returns {Promise<Array<{name: string, slug: string}>>}
   */
  async extractAgents(html) {
    const $ = cheerio.load(html);
    const agents = [];

    // processAgentLink awaits before pushing, so each call must be awaited
    // or `agents` could be returned before it is fully populated
    for (const element of $('li > a').toArray()) {
      await this.processAgentLink($, element, agents);
    }

    return agents;
  }

  /**
   * Walks the nested permission blocks in `html`.
   *
   * Each outermost element matching `selector` becomes one entry: a plain
   * name string when it has no nested data, otherwise `{name, data}` where
   * `data` is the recursive result (or the agent list for a block named
   * "Agents"). At the first level only headings listed in
   * `this._headingsToScrape` are kept.
   *
   * @param {string} html
   * @param {string} selector - selector marking block boundaries
   * @param {number} [level=0] - depth of the caller; the blocks found here are at `level + 1`
   * @returns {Promise<?Array>} the entries, or `null` when none were found
   */
  async recurseDOM(html, selector, level = 0) {
    const currentLevel = level + 1;
    const $ = cheerio.load(html);
    const result = [];
    const blocks = await this.getUppermostElementsBySelector(html, selector);

    for (let i = 0; i < blocks.length; i++) {
      const blockHtml = $(blocks[i]).html();
      const rawName = await this.getTextNotInMatchingElements(blockHtml, selector);
      const name = await this._reduceWhiteSpace(rawName);

      // Only scrape the top level headings we're interested in
      if (currentLevel === 1 && !this._headingsToScrape.includes(name)) continue;

      let data;

      if (name === 'Agents') data = await this.extractAgents(blockHtml);
      else data = await this.recurseDOM(blockHtml, selector, currentLevel);

      if (data === null) {
        result.push(name);
      } else {
        result.push({
          'name': name,
          'data': data
        });
      }
    }

    if (result.length > 0) return result;

    return null;
  }

  /**
   * Extracts the permissions tree from the element that follows the
   * "Permissions" heading.
   *
   * @param {string} html - full entity page
   * @returns {Promise<Object|?Array>} `{}` when the page has no permissions section,
   *   otherwise the result of `recurseDOM` (an array, or `null` when empty)
   */
  async extractPermissions(html) {
    const $ = cheerio.load(html);
    const permissionsContainer = $('h3:contains("Permissions")').next();

    if (permissionsContainer.length === 0) return {};

    const permissions = await this.recurseDOM(permissionsContainer.html(), this.blockBoundaries);

    return permissions;
  }

  /**
   * Builds the details object for one entity page.
   *
   * @param {string} html - full entity page
   * @returns {Promise<Object|undefined>} the details, or `undefined` when extraction
   *   failed (the error is logged, not rethrown)
   */
  async extractEntityDetails(html) {
    try {
      const details = {};
      const $ = cheerio.load(html);

      await this.extractSingleFields($, details);
      details.otherNames = await this.extractOtherNames($);
      details.permissions = await this.extractPermissions(html);
      details.agentOf = await this.extractAgentOf($);

      return details;
    } catch (err) {
      logger.error(err);
    }
  }

  /**
   * Scrapes the entity page currently loaded in `this.page`, writes the
   * result to `<path>/<prefix><id>.json`, then either navigates to the next
   * entity link or emits `serviceDone` when the list is exhausted.
   *
   * @param {Object} serviceObject - state of the current mode (`links`, `step`, `items`)
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {
    const nonWordChars = /\W/g;
    const { name, id } = serviceObject.links[serviceObject.step];

    logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);

    const entity = removeAccents.remove(id.trim());
    const filename = [this.modePrefix[this.mode], entity.replace(nonWordChars, '_')].join('');
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    // Look up the paragraph at the bottom of the page.
    // NOTE(review): `$x` only queries the current DOM and does not wait for the node
    // to appear; if waiting was intended, confirm and switch to a waiting API.
    await this.page.$x('//a[contains(text(), "* Firms with an asterisk")]');

    // open all accordions before taking screenshot
    // first, add a class `expand-below` to the container divs we are interested in:
    for (const heading of this._headingsToScrape) {
      const expandBelowThisDiv = await this.page.$x(`//h4[contains(., "${heading}")]/../..`);

      // `for...of` so every evaluate() has completed before the page content is read
      for (const elm of expandBelowThisDiv) {
        await this.page.evaluate((el) => {
          el.classList.add('expand-below');
        }, elm);
      }
    }

    // then, add a style tag to the page to expand the content
    await this.page.addStyleTag({
      'content': `
        div.expand-below div.collapse {
          display: block;
        }
        div.expand-below div.modal {
          display: block;
          position: static;
          opacity: 1;
          overflow: visible;
          margin-top: 125px;
        }
        /* remove drop shadows for faster rendering on large pages */
        .modal-content {
          -webkit-box-shadow: none;
          box-shadow: none;
        }
      `
    });

    // temporarily disable GI screenshots
    // logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);
    // await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const $ = cheerio.load(body);
    const underConstruction = $('h3:contains("under construction")').length > 0;

    if (underConstruction) {
      logger.warn(`Page under construction: ${this.page.url()}`);
      await jsonfile.writeFile(`${filePath}.json`, { 'underConstruction': true });
    } else {
      const details = await this.extractEntityDetails(body);
      await jsonfile.writeFile(`${filePath}.json`, { details });
    }

    await this._randomWait(this.page, 3, 5);

    serviceObject.links[serviceObject.step].filename = `${filename}.json`;
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items) {
      const newUrl = serviceObject.links[serviceObject.step].href;
      await this._goto(newUrl);
    } else {
      this.emit('serviceDone');
    }
  }

  /**
   * Runs `processEntityDetails` for the service object of the current mode.
   * Does nothing when `this.mode` is outside 0-3.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    switch (this.mode) {
      case 0:
        await this.processEntityDetails(this.paymentServices);
        break;
      case 1:
        await this.processEntityDetails(this.emoneyServices);
        break;
      case 2:
        await this.processEntityDetails(this.creditServices);
        break;
      case 3:
        await this.processEntityDetails(this.agentServices);
        break;
    }
  }

  /**
   * Reads the index page currently loaded in `this.page` and appends every
   * entity link not yet present (compared by `href`) to `serviceObject.links`.
   *
   * @param {Object} serviceObject - state of the current mode; `links` is mutated
   * @returns {Promise<void>}
   */
  async processIndex(serviceObject) {
    await this._randomWait(this.page, 3, 5);

    const body = await this.page.content();

    // temporarily disable GI screenshots
    // logger.info(`Taking screenshot of ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);
    // await this._makeScreenshotV2(this.page, `${this.path}/${this.modeNames[this.mode]}_main_${serviceObject.indexStep}`, null);

    const $ = cheerio.load(body);

    // heading that precedes the list of entities for the current mode
    let ul;

    switch (this.mode) {
      case 0:
        ul = $('h3:contains("Authorised Payment Institutions")');
        break;
      case 1:
        ul = $('h3:contains("E-money Institutions")');
        break;
      case 2:
        ul = $('h3:contains("Banks")');
        break;
      case 3:
        ul = $('h3:contains("Electronic Money and Payment Institution Agents")');
    }

    const links = ul.next().find('li > a');

    for (const item of links.toArray()) {
      const href = $(item).attr('href');
      // awaited for consistency with processAgentLink; harmless if _cleanUp is synchronous
      const text = await this._cleanUp($(item).text());
      const newUrl = `https://www.fsc.gi${href}`;
      const id = this._makeFieldName(text);

      if (!serviceObject.links.some(x => x.href === newUrl)) {
        serviceObject.links.push({
          'name': text,
          'href': newUrl,
          'id': id
        });
      }
    }
  }

  /**
   * Indexes the current index page, then navigates to the next index URL of
   * the service or emits `indexdone` when all index URLs have been read.
   *
   * @param {Object} serviceObject - state of the current mode (`urls`, `indexStep`, `links`)
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    await this._randomWait(this.page, 6, 9);

    logger.info(`Building the ${this.modeTitles[this.mode]} index, url ${serviceObject.indexStep}...`);

    await this.processIndex(serviceObject);

    if (serviceObject.indexStep < serviceObject.urls.length - 1) {
      serviceObject.indexStep++;

      const newUrl = serviceObject.urls[serviceObject.indexStep];
      await this._goto(newUrl);
    } else {
      this.emit('indexdone');
    }
  }

  /**
   * Runs `buildIndex` for the service object of the current mode.
   * Does nothing when `this.mode` is outside 0-3.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    logger.debug('>> indexRedirector');

    switch (this.mode) {
      case 0:
        await this.buildIndex(this.paymentServices);
        break;
      case 1:
        await this.buildIndex(this.emoneyServices);
        break;
      case 2:
        await this.buildIndex(this.creditServices);
        break;
      case 3:
        await this.buildIndex(this.agentServices);
        break;
    }
  }

  /**
   * Dispatches on the URL of the page that has just loaded: index pages go
   * to `indexRedirector`, entity pages to `processRedirector`. A Chrome error
   * page emits `recover`. An unknown page throws when `NODE_ENV` is set and
   * is only logged otherwise. A `TimeoutError` raised while processing
   * triggers a page reload; any other error is rethrown.
   *
   * @returns {Promise<void>}
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    try {
      if (
        pageUrl.pathname.includes('payment-institutions-20') ||
        pageUrl.pathname.includes('e-money-institutions-17') ||
        pageUrl.pathname.includes('banks-1') ||
        pageUrl.pathname.includes('electronic-money-and-payment-institution-agents-26')
      ) {
        await this.indexRedirector();
      } else if (pageUrl.pathname.includes('regulated-entity')) {
        await this.processRedirector();
      } else if (process.env.NODE_ENV) {
        await this._uploadError();
        throw new Error(`Unknown page: ${pageUrl.href}`);
      } else {
        logger.warn('processNewPage Fell through');
        logger.warn('currentPage.location', pageUrl.href);
      }
    } catch (err) {
      if (err.name === 'TimeoutError') {
        logger.error(`Reloading page after timeout: ${err.name}: ${err.message}`);

        // fire-and-forget, as before, but never leave the rejection unhandled
        this.page.reload().catch((reloadErr) => {
          logger.error(reloadErr);
        });
      } else {
        throw err;
      }
    }
  }

  /**
   * Registers the event handlers that chain the modes together:
   * `indexdone` -> `<prefix>indexdone` (open the first entity),
   * `serviceDone` -> `<service>Done` (persist the link list, advance the
   * mode and open the next index, or emit `done` after the last service).
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    // Event names, indexed by mode.
    const doneEvents = ['paymentServicesDone', 'emoneyServicesDone', 'creditServicesDone', 'agentServicesDone'];
    const indexDoneEvents = ['psindexdone', 'emindexdone', 'ciindexdone', 'agindexdone'];

    // Freeze the item count of an indexed service and open its current entity link.
    const openFirstEntity = async (serviceObject) => {
      serviceObject.items = serviceObject.links.length;
      logger.info(`${serviceObject.items} items indexed`);

      const newUrl = serviceObject.links[serviceObject.step].href;
      await this._goto(newUrl);
    };

    // Mark the named service as done, write its link list and debug dump, advance the mode.
    const persistService = (serviceName) => {
      const serviceObject = this[serviceName];

      serviceObject.done = true;
      jsonfile.writeFileSync(`${this.path}/${serviceName}.json`, { 'links': serviceObject.links });
      jsonfile.writeFileSync(`${this.debugPath}/${serviceName}.json`, serviceObject);
      this.mode++;
      this.inProgress = false;
    };

    this.on('serviceDone', async () => {
      if (this.mode >= 0 && this.mode < doneEvents.length) this.emit(doneEvents[this.mode]);
    });

    this.on('psindexdone', async () => {
      await openFirstEntity(this.paymentServices);
    });

    this.on('emindexdone', async () => {
      await openFirstEntity(this.emoneyServices);
    });

    this.on('ciindexdone', async () => {
      await openFirstEntity(this.creditServices);
    });

    this.on('agindexdone', async () => {
      await openFirstEntity(this.agentServices);
    });

    this.on('indexdone', async () => {
      if (this.mode >= 0 && this.mode < indexDoneEvents.length) this.emit(indexDoneEvents[this.mode]);
    });

    this.on('paymentServicesDone', async () => {
      logger.warn('paymentServicesDone');

      try {
        persistService('paymentServices');
        await this._goto(this.emoneyServices.urls[0]);
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('emoneyServicesDone', async () => {
      logger.warn('emoneyServicesDone');

      try {
        persistService('emoneyServices');
        await this._goto(this.creditServices.urls[0]);
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('creditServicesDone', async () => {
      logger.warn('creditServicesDone');

      try {
        persistService('creditServices');
      } catch (e) {
        logger.error(e);
      }

      // Navigation is attempted even when persisting failed (existing behavior);
      // a failed navigation is logged instead of becoming an unhandled rejection.
      try {
        await this._goto(this.agentServices.urls[0]);
      } catch (e) {
        logger.error(e);
      }
    });

    this.on('agentServicesDone', async () => {
      logger.warn('agentServicesDone');

      try {
        persistService('agentServices');
        this.emit('done');
      } catch (e) {
        logger.error(e);
      }
    });
  }

  /**
   * Resets the per-service state, prepares the browser page, attaches the
   * page and scraper event handlers and navigates to the first index page.
   *
   * @returns {Promise<void>}
   * @throws {Error} wraps any error raised during start-up
   */
  async start() {
    super._start();

    try {
      this.mode = 0;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'paginationStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.fsc.gi/regulated-entities/payment-institutions-20'],
        'sections': [],
        'sectionLinks': []
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'paginationStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.fsc.gi/regulated-entities/e-money-institutions-17'],
        'sections': [],
        'sectionLinks': []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'paginationStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.fsc.gi/regulated-entities/banks-1'],
        'sections': [],
        'sectionLinks': []
      };

      this.agentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'done': false,
        'urls': ['https://www.fsc.gi/regulated-entities/electronic-money-and-payment-institution-agents-26']
      };

      this.startPage = this.paymentServices.urls[0];

      this.setPath(path.resolve(`${__dirname}/../artefacts/GI/FSC`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): presumably "2" is the number of listeners registered before
      // attachEvents has run, so handlers are attached only once — confirm against the base class.
      if (this.eventNames().length === 2) await this.attachEvents();

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      throw new Error(e);
    }
  }

  /**
   * Entry point used by the throttled `run` created in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}

module.exports = GIScrape;