const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('FR');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');

// Log level comes from the LOGGER_LEVEL environment variable, defaulting to 'warn'.
logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the French REGAFI register (www.regafi.fr).
 *
 * The scrape walks three search categories in order, tracked by `this.mode`:
 * 0 = payment services, 1 = e-money services, 2 = credit services. For each
 * category it first pages through the search results collecting links, then
 * visits every collected link and writes one JSON file per entity.
 *
 * Helper methods prefixed with `_` (`_goto`, `_cleanUp`, `_randomWait`, ...)
 * are not defined in this file; they are expected to come from `Scraper`.
 */
class FRScrape extends Scraper {
  constructor() {
    super(); // must call super for "this" to be defined.
    this.setID('FR');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    // In production the scrape starts by itself once the lock check passes.
    if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if (l) this.run();
      });
  }

  /**
   * Navigates to the page referenced by a collected link entry.
   *
   * @param path link entry as produced by extractLinks(); its `link` property
   *             is appended to the protocol and hostname of `this.parsedUrl`
   * @returns {Promise}
   * @throws {TypeError} when no entry is supplied
   */
  async gotoPage(path = null) {
    // The original dereferenced `path.link` unconditionally, which produced an
    // opaque TypeError for the default value; fail with a readable message.
    if (path === null || path === undefined)
      throw new TypeError('gotoPage requires a link entry ({ link, title })');

    const newUrl = `${this.parsedUrl.protocol}//${this.parsedUrl.hostname}${path.link}`;

    await this._randomWait(this.page, 3, 5);
    logger.info('newurl:', newUrl);
    await this._goto(newUrl);
  }

  /**
   * Reads the ticked rows of an activities table.
   *
   * A row counts as ticked when the HTML of its first cell does not contain
   * the text "unchecked". The activity ID is taken from the second cell and
   * carried forward to following rows whose second cell is empty.
   *
   * - 2 cells: the current activity ID is added to `crossBorder`
   * - 3 cells: `[activityID, text of 3rd cell]` is added to `output`
   * - otherwise: `[activityID + text of 3rd cell without ')', text of 4th cell]`
   *   is added to `output`
   *
   * @param rows cheerio selection of table rows
   * @returns {{output: Array, crossBorder: Array}}
   */
  extractDataFromTable(rows) {
    const unchecked = /(unchecked)/;
    const output = [];
    const crossBorder = [];
    let currentActivityID;

    rows.each((i, elm) => {
      const children = cheerio(elm).children();

      const idText = children.eq(1).text().trim();
      if (idText !== '') currentActivityID = idText;

      // html() yields null for a row without cells; skip it instead of throwing.
      const marker = children.eq(0).html();
      if (marker === null || unchecked.test(marker)) return;

      if (children.length === 2) {
        crossBorder.push(this._cleanUp(currentActivityID.trim()));
      } else if (children.length === 3) {
        output.push([currentActivityID, this._cleanUp(children.eq(2).text().trim())]);
      } else {
        const subId = children.eq(2).text().replace(')', '').trim();
        output.push([`${currentActivityID}${subId}`, this._cleanUp(children.eq(3).text().trim())]);
      }
    });

    return { output, crossBorder };
  }

  /**
   * Reads the investment services matrix used for credit institutions.
   *
   * - rows with exactly 11 cells contribute the cleaned text of every cell to
   *   the list of financial instrument names (presumably the header row —
   *   confirm against the live page)
   * - rows with more than 11 cells are service rows: the cells after the row
   *   name that are not "unchecked" are mapped back to instrument names
   * - rows with exactly 2 cells are authorisations: the second cell's text is
   *   kept when the first cell is not "unchecked"
   * - all other rows are ignored
   *
   * @param rows cheerio selection of table rows
   * @returns {{investmentServices: Array, authorised: Array}}
   */
  extractDataFromInvestmentServicesTable(rows) {
    const unchecked = /(unchecked)/;
    const output = [];
    const authorised = [];
    const financialInstruments = [];

    rows.each((i, elm) => {
      const finInst = [];
      const children = cheerio(elm).children();

      if (children.length > 2) {
        if (children.length === 11)
          children.each((step, fiElm) => {
            financialInstruments.push(this._cleanUp(cheerio(fiElm).text()));
          });

        if (children.length > 11) {
          // Index of the cell holding the row name; wider rows carry leading cells.
          let offset = (children.length - 1) - financialInstruments.length;
          // Distance between a cell index and its instrument-name index.
          const fiOffset = (offset === 0) ? 1 : 2;
          const rowName = children.eq(offset).text();

          offset++;
          while (offset < financialInstruments.length) {
            if (children.eq(offset).html().match(unchecked) === null)
              finInst.push(financialInstruments[offset - fiOffset]);
            offset++;
          }

          if (finInst.length > 0) output.push([rowName, finInst]);
        }
      } else if (children.length === 2) {
        if (children.eq(0).html().match(unchecked) === null)
          authorised.push(this._cleanUp(children.eq(1).text()));
      }
    });

    return { 'investmentServices': output, authorised };
  }

  /**
   * Extracts one record per `div.zone_succ` block of the European section.
   *
   * The first paragraph of each block is expected to read "label: value"; it
   * becomes a `{ [label]: value }` property on the record, next to the
   * `paymentServices` and `crossBorder` lists read from the block's table.
   * Blocks whose heading contains no colon are skipped with a warning.
   *
   * @param tables cheerio selection containing the `div.zone_succ` blocks
   * @returns {Promise<Array>}
   */
  async extractEuroData(tables) {
    const dataBlock = [];
    const findToColon = /^.*?(?=(:))/;
    const trimToColon = /^.*?(?=(:)).\s/;

    tables.find('div.zone_succ').each((i, elm) => {
      const $block = cheerio(elm);
      const p = $block.find('p').eq(0).text();

      const matched = p.match(findToColon);
      if (matched === null) {
        // Previously this threw on `null[0]` and aborted the whole page.
        logger.warn('Skipping EU block without a "label: value" heading:', p);

        return;
      }

      const title = this._cleanUp(matched[0]).trim();
      // split() with a capture group yields ['', ':', rest]; index 2 is the value.
      const country = this._cleanUp(p.split(trimToColon)[2]).trim();

      const obj = {};
      obj[title] = country;

      const data = this.extractDataFromTable($block.find('table tr'));
      obj.paymentServices = data.output;
      obj.crossBorder = data.crossBorder;

      dataBlock.push(obj);
    });

    return dataBlock;
  }

  /**
   * Collects `{ link, title }` entries from a search result table.
   *
   * @param $table cheerio selection of the table rows; row 0 is the heading
   * @param creditInstFilter when true, only rows with at least 6 elements whose
   *        last element reads "legal entity/ company" are kept
   * @returns {Promise<Array>}
   */
  async extractLinks($table, creditInstFilter = false) {
    const wantedCIStatuses = ['legal entity/ company'];
    const links = [];

    logger.info('Extracting links...');

    // Start at 1 to skip the heading row; a heading-only table yields no links.
    for (let count = 1; count < $table.length; count++) {
      const $cells = cheerio($table.get(count)).find('td').children();
      const $item = $cells.eq(2);
      const link = $item.attr('href');
      const title = this._cleanUp($item.text());

      if (!creditInstFilter) {
        // Default mode
        links.push({ link, title });
      } else if ($cells.length >= 6) {
        const status = this._cleanUp($cells.eq($cells.length - 1).text().toLowerCase());

        if (wantedCIStatuses.includes(status)) links.push({ link, title });
      }
    }

    return links;
  }

  /**
   * Reads the "field: value" list items of the description zone.
   *
   * Only list items that have child elements and contain a colon are used.
   *
   * @param $ cheerio root of the loaded page
   * @returns {Promise<Array>} list of `[field, value]` pairs
   */
  async extractDetails($) {
    const findToColon = /^.*?(?=(:))/;
    const trimToColon = /^.*?(?=(:)).\s/;
    const details = [];

    $('div#zone_description ul.nopuce li').each((i, elm) => {
      const $item = $(elm);
      if ($item.children().length === 0) return;

      const text = $item.text();
      const matched = text.match(findToColon);
      if (matched === null) return;

      const field = this._cleanUp(matched[0]).trim();
      const data = this._cleanUp(text.split(trimToColon)[2]);
      details.push([field, data]);
    });

    return details;
  }

  /**
   * Processes one entity ("af") page: takes screenshots, extracts the
   * description plus the French (and, for modes 0 and 1, European) activity
   * tables, writes the result as JSON and moves on to the next link, the next
   * category, or emits 'done' after the last category.
   *
   * @returns {Promise}
   */
  async processAFPage() {
    const noWhiteSpace = /\W/g;
    const trimToColon = /^.*?(?=(:)).\s/;
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);
    const modeFilename = ['ps_', 'em_', 'ci_'];
    const pageData = { 'description': [], 'frActivities': null, 'EUActivities': [] };

    pageData.entity = removeAccents.remove($('p.sttr').eq(0).text().replace(trimToColon, '').trim());
    const filename = `${modeFilename[this.mode]}${pageData.entity.replace(noWhiteSpace, '_')}`;

    // Screenshots are awaited so each one is taken before the next tab click.
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main`, null);
    pageData.description = await this.extractDetails($);

    await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(3) a');

    // Process France / French details
    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_france`, null);
    const frenchTbl = $('#zone_en_france > table tr');

    if (this.mode < 2) {
      pageData.frActivities = this.extractDataFromTable(frenchTbl).output;

      await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(5) a');

      // Process EU Details
      await this._makeScreenshotV2(this.page, `${this.path}/${filename}_europe`, null);
      const euroTbls = $('#zone_en_europe');
      pageData.EUActivities = await this.extractEuroData(euroTbls);
    } else {
      pageData.creditInstituteActivities = this.extractDataFromInvestmentServicesTable(frenchTbl);
    }

    jsonfile.writeFileSync(`${this.path}/${filename}.json`, pageData);

    // Per-mode bookkeeping; `nextUrl` is only consulted for non-final modes.
    const modes = [
      { 'store': this.paymentServices, 'name': 'paymentServices', 'doneMsg': 'Payment services complete.', 'nextUrl': this.eMoneyUrl },
      { 'store': this.emoneyServices, 'name': 'emoneyServices', 'doneMsg': 'EMoney services complete.', 'nextUrl': this.creditUrl },
      { 'store': this.creditServices, 'name': 'creditServices', 'doneMsg': 'Credit services complete.', 'nextUrl': null }
    ];
    const current = modes[this.mode];

    if (current !== undefined) {
      current.store.links[current.store.step].filename = `${filename}.json`;
      current.store.step++;
    }

    this.perf.scraped++;

    await this._randomWait(this.page, 5, 7);

    if (current === undefined) return;

    const { store, name, doneMsg, nextUrl } = current;

    if (store.step < store.items) {
      await this.gotoPage(store.links[store.step]);

      return;
    }

    const isFinalMode = this.mode === modes.length - 1;

    logger.debug(doneMsg);
    store.done = true;
    this.mode++;

    jsonfile.writeFileSync(`${this.path}/${name}.json`, { 'links': store.links });
    jsonfile.writeFileSync(`${this.debugPath}/${name}.json`, store);

    if (isFinalMode) this.emit('done');
    else await this._goto(nextUrl);
  }

  /**
   * Processes one page of search results for the given category store:
   * collects the links, then either clicks through to the next result page or,
   * when there is none, finalises the store and opens the first entity page.
   *
   * @param $ cheerio root of the loaded page
   * @param store category state object (links, step, items, ...)
   * @returns {Promise}
   */
  async searchResultsProcessor($, store) {
    const nextSelector = 'body > div > div.main.main_evol > ul > li:last-child > a';
    const $table = $('table.table tr');

    if ($table.length > 1) {
      // The table contains more than just the heading row
      store.indexcount++;
      logger.debug(`Processing menu: ${this.modeTitles[this.mode]} // ${store.indexcount}`);

      await this._makeScreenshotV2(this.page, `${this.path}/${this.modePrefix[this.mode]}_menu_${store.indexcount}`, null);

      store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2)));
    }

    // check that the next button is active
    if ($(nextSelector).length === 1) {
      await this._findAndClick(nextSelector, 'Next page >');

      return;
    }

    // Done gathering search results
    logger.info('Completed gathering search results..');
    store.searchDone = true;
    store.items = store.links.length;

    jsonfile.writeFileSync(`${this.path}/${['pi', 'eu', 'ci'][this.mode]}.json`, store);

    // Awaited so a failure (e.g. no links were found) reaches the caller
    // instead of becoming an unhandled rejection.
    await this.gotoPage(store.links[store.step]);
  }

  /**
   * Handle the search result page and build the list of links
   * @returns {Promise}
   */
  async handleSearchResults() {
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);

    const store = [this.paymentServices, this.emoneyServices, this.creditServices][this.mode];

    if (store !== undefined && !store.searchDone) await this.searchResultsProcessor($, store);
  }

  /**
   * Dispatches a freshly loaded page to its handler based on the `page` query
   * parameter of the current URL.
   *
   * @param dump unused; kept for interface compatibility
   * @returns {Promise}
   * @throws {Error} when the page type is not recognised
   */
  async processNewPage(dump = false) {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    const params = this._getParamsFromUrl(pageUrl.search);
    const pageID = params.page || '';

    switch (pageID) {
      case 'results':
        await this.handleSearchResults();
        break;

      case 'af':
        await this.processAFPage();
        break;

      default:
        await this._uploadError();
        // `currentPage` was never defined here; report the URL that was parsed.
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Resets the scrape state, launches the browser page, wires page loads to
   * processNewPage() and opens the first search URL.
   *
   * @returns {Promise}
   * @throws {Error} wrapping any failure raised during set-up
   */
  async start() {
    await super._start();

    const freshStore = () => ({
      'items': 0,
      'links': [],
      'step': 0,
      'visited': false,
      'done': false,
      'searchDone': false,
      'indexcount': 0
    });

    try {
      this.mode = 0;

      this.paymentServices = freshStore();
      this.emoneyServices = freshStore();
      this.creditServices = freshStore();

      this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3';
      this.eMoneyUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=22-TBR07&retrait=0';
      this.creditUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0';

      // Only protocol and hostname are used from this (see gotoPage).
      this.parsedUrl = url.parse(this.creditUrl);

      this.setPath(path.resolve(`${__dirname}/../artefacts/FR/REGAFI`));

      await this._initBrowser(true);
      await this._createBrowserPage();

      // Every page load is handled by processNewPage; failures are logged.
      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
      await this.page.setViewport({ 'width': 1200, 'height': 800 });

      await this._goto(this.startPage);
      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Same message as before; the original error is kept as `cause` on
      // runtimes that support it (older runtimes ignore the second argument).
      throw new Error(e, { 'cause': e });
    }
  }

  /**
   * Entry point invoked through the debounced `run`.
   *
   * @returns {Promise}
   */
  async __run() {
    logger.info('Scraping France...');

    await this.start();
  }
}

module.exports = FRScrape;