obdfcascrape/ncas/fr.js
Martin Donnelly a5109efabe 2019-05-12
2019-05-12 18:33:09 +01:00

496 lines
14 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('FR');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');
logger.level = process.env.LOGGER_LEVEL || 'warn';
// The log level above is taken from the LOGGER_LEVEL environment variable and defaults to 'warn'.
/**
 * Scraper for the French REGAFI register (www.regafi.fr).
 *
 * The scrape walks three categories in order, tracked by `this.mode`:
 *   0 - payment services, 1 - e-money services, 2 - credit services.
 * For each category the search result pages are paged through to collect
 * entity links, then every entity ("af") page is visited and written out as
 * one JSON file. Navigation is event driven: each `domcontentloaded` event
 * ends up in `processNewPage`, which dispatches on the `page` query parameter.
 */
class FRScrape extends Scraper {
  constructor() {
    super(); // must call super for "this" to be defined.
    this.setID('FR');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      // Returning run() lets a failure of either the lock check or the run
      // reach the catch below instead of becoming an unhandled rejection.
      this._checkLock()
        .then((l) => (l ? this.run() : undefined))
        .catch((err) => {
          logger.error('Lock check / run failed', err);
        });
    }
  }

  /**
   * Returns the link store of the category currently being scraped.
   *
   * @returns {Object|undefined} undefined once `this.mode` is past the last category
   */
  _storeForMode() {
    return [this.paymentServices, this.emoneyServices, this.creditServices][this.mode];
  }

  /**
   * Tells whether a checkbox cell is ticked, i.e. its markup does not mention "unchecked".
   *
   * @param cell cheerio selection holding the checkbox cell
   * @returns {boolean} false when the cell does not exist (`html()` is not a string)
   */
  _isTicked(cell) {
    const html = cell.html();

    return typeof html === 'string' && !/unchecked/.test(html);
  }

  /**
   * Splits a "label : value" text at its colon.
   *
   * @param {string} text
   * @returns {{label: string, value: string}|null} null when the first line holds no colon
   */
  _splitAtColon(text) {
    const findToColon = /^.*?(?=(:))/;
    const trimToColon = /^.*?(?=(:)).\s/;
    const matched = text.match(findToColon);

    if (matched === null)
      return null;

    // The regex carries a capture group, so a successful split yields
    // ['', ':', <value>] and the value sits at index 2.
    const parts = text.split(trimToColon);
    // No whitespace after the colon: fall back to everything behind the first colon.
    const value = (parts.length > 2) ? parts[2] : text.slice(matched[0].length + 1);

    return { 'label': this._cleanUp(matched[0]).trim(), value };
  }

  /**
   * Takes a screenshot and waits for it, so it cannot race with the next click.
   * A failed screenshot is logged and otherwise ignored.
   *
   * @param {string} file target path handed to `_makeScreenshotV2`
   * @returns {Promise<void>}
   */
  async _screenshotQuietly(file) {
    try {
      await this._makeScreenshotV2(this.page, file, null);
    }
    catch (err) {
      logger.warn('Screenshot failed:', file, err);
    }
  }

  /**
   * Opens the next unvisited entity of `store`, or - when none is left -
   * closes the current category: writes its link lists, bumps `this.mode`
   * and either loads the next category's search page or emits 'done'.
   *
   * @param store link store belonging to the current `this.mode`
   * @returns {Promise<void>}
   */
  async _advanceOrFinishMode(store) {
    if (store.step < store.items) {
      await this.gotoPage(store.links[store.step]);

      return;
    }

    // Looked up before this.mode is incremented below.
    const finished = [
      { 'label': 'Payment', 'file': 'paymentServices', 'nextUrl': this.eMoneyUrl },
      { 'label': 'EMoney', 'file': 'emoneyServices', 'nextUrl': this.creditUrl },
      { 'label': 'Credit', 'file': 'creditServices', 'nextUrl': null }
    ][this.mode];

    logger.debug(`${finished.label} services complete.`);
    store.done = true;
    this.mode++;
    jsonfile.writeFileSync(`${this.path}/${finished.file}.json`, { 'links': store.links });
    jsonfile.writeFileSync(`${this.debugPath}/${finished.file}.json`, store);

    if (finished.nextUrl === null)
      this.emit('done');
    else
      await this._goto(finished.nextUrl);
  }

  /**
   * Navigates to the entity page described by a link record.
   *
   * @param path link record as produced by `extractLinks` ({ link, title })
   * @returns {Promise<void>}
   * @throws {Error} when no usable link is supplied
   */
  async gotoPage(path = null) {
    if (path === null || typeof path.link !== 'string' || path.link === '')
      throw new Error('gotoPage: no link to navigate to');

    // Resolve against the site origin so root-relative, relative and absolute hrefs all work.
    const origin = `${this.parsedUrl.protocol}//${this.parsedUrl.hostname}`;
    const newUrl = new url.URL(path.link, origin).href;

    await this._randomWait(this.page, 3, 5);
    logger.info('newurl:', newUrl);
    await this._goto(newUrl);
  }

  /**
   * Reads the ticked rows of an activities table.
   *
   * Rows are told apart by their cell count: 2 cells is a cross-border entry,
   * 3 cells an activity, anything else a sub-activity whose id is appended to
   * the current activity id. The second cell, when not blank, becomes the
   * current activity id for the following rows - even if the row is unticked.
   *
   * @param rows cheerio selection of table rows
   * @returns {{output: Array, crossBorder: Array}}
   */
  extractDataFromTable(rows) {
    const output = [];
    const crossBorder = [];
    let currentActivityID = '';

    rows.each((i, elm) => {
      const cells = cheerio(elm).children();
      const label = cells.eq(1).text().trim();

      if (label !== '')
        currentActivityID = label;

      // Unticked rows and rows without any cell contribute nothing.
      if (!this._isTicked(cells.eq(0)))
        return;

      if (cells.length === 2) {
        crossBorder.push(this._cleanUp(currentActivityID.trim()));
      }
      else if (cells.length === 3) {
        output.push([currentActivityID, this._cleanUp(cells.eq(2).text().trim())]);
      }
      else {
        const suffix = cells.eq(2).text().replace(')', '').trim();

        output.push([`${currentActivityID}${suffix}`, this._cleanUp(cells.eq(3).text().trim())]);
      }
    });

    return { output, crossBorder };
  }

  /**
   * Reads the investment services matrix and the list of authorised items.
   *
   * An 11-cell row supplies the column headings, longer rows are matrix rows
   * (row name followed by one checkbox per heading) and 2-cell rows are
   * checkbox + label pairs collected into `authorised`.
   *
   * @param rows cheerio selection of table rows
   * @returns {{investmentServices: Array, authorised: Array}}
   */
  extractDataFromInvestmentServicesTable(rows) {
    const output = [];
    const authorised = [];
    const financialInstruments = [];

    rows.each((i, elm) => {
      const cells = cheerio(elm).children();
      const cellCount = cells.length;

      if (cellCount === 2) {
        if (this._isTicked(cells.eq(0)))
          authorised.push(this._cleanUp(cells.eq(1).text()));

        return;
      }

      if (cellCount === 11)
        cells.each((step, fiElm) => {
          financialInstruments.push(this._cleanUp(cheerio(fiElm).text()));
        });

      if (cellCount > 11) {
        // Extra leading cells shift both the row name and the heading lookup.
        const nameIndex = (cellCount - 1) - financialInstruments.length;
        const fiOffset = (nameIndex === 0) ? 1 : 2;
        const rowName = cells.eq(nameIndex).text();
        const finInst = [];

        for (let col = nameIndex + 1; col < financialInstruments.length; col++)
          if (this._isTicked(cells.eq(col)))
            finInst.push(financialInstruments[col - fiOffset]);

        if (finInst.length > 0)
          output.push([rowName, finInst]);
      }
    });

    return { 'investmentServices': output, authorised };
  }

  /**
   * Reads every `div.zone_succ` block of the European activities zone.
   *
   * Each result object carries one key taken from the block's first
   * paragraph ("title : country") plus `paymentServices` and `crossBorder`.
   *
   * @param tables cheerio selection containing the `div.zone_succ` blocks
   * @returns {Promise<Array>}
   */
  async extractEuroData(tables) {
    const dataBlock = [];

    tables.find('div.zone_succ').each((i, elm) => {
      const $zone = cheerio(elm);
      const heading = this._splitAtColon($zone.find('p').eq(0).text());
      const obj = {};

      if (heading === null)
        // Keep the services even when the heading cannot be parsed.
        logger.warn('EU activity block without a "title : country" heading');
      else
        obj[heading.label] = this._cleanUp(heading.value).trim();

      const data = this.extractDataFromTable($zone.find('table tr'));

      obj.paymentServices = data.output;
      obj.crossBorder = data.crossBorder;
      dataBlock.push(obj);
    });

    return dataBlock;
  }

  /**
   * Collects the entity links of one search result page.
   *
   * @param $table cheerio selection of the result table rows, heading row first
   * @param {boolean} creditInstFilter when true only rows whose last field is a wanted status are kept
   * @returns {Promise<Array>} list of { link, title }
   */
  async extractLinks($table, creditInstFilter = false) {
    const wantedCIStatuses = ['legal entity/ company'];
    const links = [];

    logger.info('Extracting links...');

    // Row 0 is the heading row.
    for (let count = 1; count < $table.length; count++) {
      const $fields = cheerio($table.get(count)).find('td').children();
      const $item = $fields.eq(2);
      const link = $item.attr('href');

      if (typeof link !== 'string' || link === '') {
        // A record without href could never be navigated to.
        logger.warn(`Result row ${count} has no link, skipping`);
        continue;
      }

      if (creditInstFilter) {
        if ($fields.length < 6)
          continue;

        const status = this._cleanUp($fields.eq($fields.length - 1).text().toLowerCase());

        if (!wantedCIStatuses.includes(status))
          continue;
      }

      links.push({ link, 'title': this._cleanUp($item.text()) });
    }

    return links;
  }

  /**
   * Reads the "field : value" list items of the description zone.
   * Items without child elements or without a colon are ignored.
   *
   * @param $ cheerio instance loaded with the entity page
   * @returns {Promise<Array>} list of [field, value] pairs
   */
  async extractDetails($) {
    const details = [];

    $('div#zone_description ul.nopuce li').each((i, elm) => {
      const $item = $(elm);

      if ($item.children().length === 0)
        return;

      const pair = this._splitAtColon($item.text());

      if (pair !== null)
        details.push([pair.label, this._cleanUp(pair.value)]);
    });

    return details;
  }

  /**
   * Scrapes one entity ("af") page, writes its JSON file and moves on to
   * the next entity or category.
   *
   * @returns {Promise<void>}
   */
  async processAFPage() {
    const noWhiteSpace = /\W/g;
    const trimToColon = /^.*?(?=(:)).\s/;
    const modeFilename = ['ps_', 'em_', 'ci_'];
    const tabLink = (column) => `div.main.main_evol > table > tbody > tr > td:nth-child(${column}) a`;

    // NOTE(review): all tables below are read from this one snapshot, taken
    // before the tabs are clicked; presumably every tab is already present in
    // the DOM and the clicks only matter for the screenshots - confirm.
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);

    const pageData = { 'description': [], 'frActivities': null, 'EUActivities': [] };

    pageData.entity = removeAccents.remove($('p.sttr').eq(0).text().replace(trimToColon, '').trim());

    const filename = `${modeFilename[this.mode]}${pageData.entity.replace(noWhiteSpace, '_')}`;

    await this._screenshotQuietly(`${this.path}/${filename}_main`);
    pageData.description = await this.extractDetails($);

    // Process France / French details
    await this._findAndClick(tabLink(3));
    await this._screenshotQuietly(`${this.path}/${filename}_france`);

    const frenchTbl = $('#zone_en_france > table tr');

    if (this.mode < 2) {
      pageData.frActivities = this.extractDataFromTable(frenchTbl).output;

      // Process EU Details
      await this._findAndClick(tabLink(5));
      await this._screenshotQuietly(`${this.path}/${filename}_europe`);
      pageData.EUActivities = await this.extractEuroData($('#zone_en_europe'));
    }
    else {
      pageData.creditInstituteActivities = this.extractDataFromInvestmentServicesTable(frenchTbl);
    }

    jsonfile.writeFileSync(`${this.path}/${filename}.json`, pageData);

    const store = this._storeForMode();

    if (store !== undefined) {
      store.links[store.step].filename = `${filename}.json`;
      store.step++;
    }

    this.perf.scraped++;
    await this._randomWait(this.page, 5, 7);

    if (store !== undefined)
      await this._advanceOrFinishMode(store);
  }

  /**
   * Handles one search result page: collects its links, then either clicks
   * through to the next result page or, on the last page, saves the store
   * and starts visiting the collected entities.
   *
   * @param $ cheerio instance loaded with the result page
   * @param store link store of the current category
   * @returns {Promise<void>}
   */
  async searchResultsProcessor($, store) {
    const nextSelector = 'body > div > div.main.main_evol > ul > li:last-child > a';
    const $table = $('table.table tr');

    if ($table.length > 1) {
      // The table contains more than just the heading row
      store.indexcount++;
      // NOTE(review): modeTitles / modePrefix are not set in this class;
      // presumably the Scraper base class provides them - confirm.
      logger.debug(`Processing menu: ${this.modeTitles[this.mode]} // ${store.indexcount}`);
      await this._makeScreenshotV2(this.page, `${this.path}/${this.modePrefix[this.mode]}_menu_${store.indexcount}`, null);
      store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2)));
    }

    // check that the next button is active
    if ($(nextSelector).length === 1) {
      await this._findAndClick(nextSelector, 'Next page >');

      return;
    }

    // Done gathering search results
    logger.info('Completed gathering search results..');
    store.searchDone = true;
    store.items = store.links.length;
    jsonfile.writeFileSync(`${this.path}/${['pi', 'eu', 'ci'][this.mode]}.json`, store);

    // Awaited so a navigation failure reaches the caller; a category with
    // no links at all is closed straight away instead of crashing.
    await this._advanceOrFinishMode(store);
  }

  /**
   * Handle the search result page and build the list of links
   *
   * @returns {Promise<void>}
   */
  async handleSearchResults() {
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);

    // Looked up once: processing may advance this.mode, and the page held
    // in $ belongs to the category that was current on entry.
    const store = this._storeForMode();

    if (store !== undefined && !store.searchDone)
      await this.searchResultsProcessor($, store);
  }

  /**
   * Entry point for every loaded page; dispatches on the `page` query parameter.
   *
   * @param dump currently unused
   * @returns {Promise<void>}
   * @throws {Error} when the page type is not recognised
   */
  async processNewPage(dump = false) {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    const params = this._getParamsFromUrl(pageUrl.search);
    const pageID = params.page || '';

    switch (pageID) {
      case 'results':
        await this.handleSearchResults();
        break;

      case 'af':
        await this.processAFPage();
        break;

      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Resets the scrape state, opens the browser page, hooks page loads up to
   * `processNewPage` and loads the first search page.
   *
   * @returns {Promise<void>}
   */
  async start() {
    await super._start();

    try {
      const newStore = () => ({
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'indexcount': 0
      });

      this.mode = 0;
      this.paymentServices = newStore();
      this.emoneyServices = newStore();
      this.creditServices = newStore();

      this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3';
      this.eMoneyUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=22-TBR07&retrait=0';
      this.creditUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0';

      // Only protocol and hostname are used later, to build entity URLs.
      this.parsedUrl = url.parse(this.creditUrl);

      this.setPath(path.resolve(`${__dirname }/../artefacts/FR/REGAFI`));

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);
      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its type and stack trace survive.
      logger.error('Failed to start the FR scrape', e);
      throw e;
    }
  }

  /**
   * Debounced through `this.run`; logs and starts the scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.info('Scraping France...');
    await this.start();
  }
}
// Expose the scraper class as this module's only export.
module.exports = FRScrape;