496 lines
14 KiB
JavaScript
496 lines
14 KiB
JavaScript
const Scraper = require('../helpers/scraper');
|
|
const cheerio = require('cheerio');
|
|
const path = require('path');
|
|
const jsonfile = require('jsonfile');
|
|
const logger = require('log4js').getLogger('FR');
|
|
const url = require('url');
|
|
const removeAccents = require('remove-accents-diacritics');
|
|
|
|
// Log verbosity is taken from the LOGGER_LEVEL environment variable;
// when it is unset or empty the logger falls back to 'warn'.
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
// load env variables from file
|
|
|
|
/**
 * Scraper for the regafi.fr register.
 *
 * The scrape walks three search categories in order, tracked by `this.mode`:
 *   0 -> this.paymentServices (start URL `this.startPage`)
 *   1 -> this.emoneyServices  (start URL `this.eMoneyUrl`)
 *   2 -> this.creditServices  (start URL `this.creditUrl`)
 *
 * For each category the paginated search results are harvested into the
 * category store (`links`), then every linked entity page ("af" page) is
 * visited and written out as a JSON file. Navigation is event driven: every
 * `domcontentloaded` event ends up in `processNewPage`, which dispatches on
 * the `page` query parameter of the current URL.
 */
class FRScrape extends Scraper {

  /**
   * Registers the 'done' handler, installs the debounced `run` entry point
   * and, when NODE_ENV is 'production', starts a run once `_checkLock()`
   * resolves to a truthy value.
   */
  constructor() {
    super(); // must call super for "this" to be defined.
    this.setID('FR');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            this.run();
          }
        })
        .catch((err) => {
          // Without this handler a failing lock check is an unhandled rejection.
          logger.error('checkLock fail', err);
        });
    }
  }

  /**
   * Describes one scrape category.
   *
   * @param mode category index (0, 1 or 2)
   * @returns {Object|null} `key` (name of the store property on `this`, also
   *   used as the summary file name), `label` (used in the completion log
   *   line), `filePrefix` (prefix of per-entity JSON files), `searchFile`
   *   (base name of the search-result dump) and `nextUrl` (search URL of the
   *   following category, or null for the last one); null for any other mode.
   */
  __modeInfo(mode) {
    const table = [
      { 'key': 'paymentServices', 'label': 'Payment', 'filePrefix': 'ps_', 'searchFile': 'pi', 'nextUrl': this.eMoneyUrl },
      { 'key': 'emoneyServices', 'label': 'EMoney', 'filePrefix': 'em_', 'searchFile': 'eu', 'nextUrl': this.creditUrl },
      { 'key': 'creditServices', 'label': 'Credit', 'filePrefix': 'ci_', 'searchFile': 'ci', 'nextUrl': null }
    ];

    return table[mode] || null;
  }

  /**
   * Closes the category described by `info`: marks its store done, advances
   * `this.mode`, writes the link summary to `this.path` and the full store to
   * `this.debugPath`, then either loads the next category's search page or,
   * for the last category, emits 'done'.
   *
   * @param info entry returned by `__modeInfo`
   * @returns {Promise<void>}
   */
  async __finishMode(info) {
    const store = this[info.key];

    logger.debug(`${info.label} services complete.`);
    store.done = true;
    this.mode++;

    jsonfile.writeFileSync(`${this.path}/${info.key}.json`, { 'links': store.links });
    jsonfile.writeFileSync(`${this.debugPath}/${info.key}.json`, store);

    if (info.nextUrl) {
      await this._goto(info.nextUrl);
    } else {
      this.emit('done');
    }
  }

  /**
   * Navigates to an entity page on the host of `this.parsedUrl`.
   *
   * Note: the parameter name shadows the `path` module inside this method.
   *
   * @param path link record as produced by `extractLinks`; only `path.link`
   *   (a host-relative URL) is read
   * @returns {Promise<void>}
   * @throws {TypeError} when no link record, or a record without `link`, is given
   */
  async gotoPage(path = null) {
    if (!path || !path.link) {
      throw new TypeError('gotoPage: expected a link object with a "link" property');
    }

    const newUrl = `${this.parsedUrl.protocol}//${this.parsedUrl.hostname}${path.link}`;
    await this._randomWait(this.page, 3, 5);

    logger.info('newurl:', newUrl);
    await this._goto(newUrl);
  }

  /**
   * Collects the ticked entries of an activities table.
   *
   * A row counts as ticked when the markup of its first cell does not contain
   * the text "unchecked". The text of the second cell, when not blank, becomes
   * the current activity ID and is carried over to following rows.
   *   - 2 cells: the current activity ID is recorded in `crossBorder`
   *   - 3 cells: [activityID, text of cell 3] is recorded in `output`
   *   - otherwise: [activityID + text of cell 3 (first ')' removed),
   *     text of cell 4] is recorded in `output`
   *
   * @param rows cheerio selection of table rows
   * @returns {{output: Array, crossBorder: Array}}
   */
  extractDataFromTable(rows) {
    const unchecked = /(unchecked)/;
    const output = [];
    const crossBorder = [];

    let currentActivityID;

    rows.each((i, elm) => {
      const children = cheerio(elm).children();

      const idText = children.eq(1).text().trim();
      if (idText !== '') {
        currentActivityID = idText;
      }

      // .html() yields null for a row without cells; such a row carries no data.
      const marker = children.eq(0).html();
      if (marker === null || marker === undefined) {
        return;
      }
      if (marker.match(unchecked) !== null) {
        return;
      }

      if (children.length === 2) {
        // Nothing to record when no activity text has been seen yet.
        if (currentActivityID !== undefined) {
          crossBorder.push(this._cleanUp(currentActivityID.trim()));
        }
      } else if (children.length === 3) {
        output.push([currentActivityID, this._cleanUp(children.eq(2).text().trim())]);
      } else {
        output.push([
          `${currentActivityID}${children.eq(2).text().replace(')', '').trim()}`,
          this._cleanUp(children.eq(3).text().trim())
        ]);
      }
    });

    return { output, crossBorder };
  }

  /**
   * Reads the investment-services matrix used for credit institutions.
   *
   *   - rows with exactly 11 cells contribute their cell texts to the list of
   *     financial-instrument headings
   *   - rows with more than 11 cells are matrix rows: a row name followed by
   *     tick cells; every cell that is not "unchecked" adds the matching
   *     heading to that row's list
   *   - rows with exactly 2 cells are tick + label pairs collected in
   *     `authorised`
   *
   * NOTE(review): the scan over a matrix row stops at the number of headings
   * rather than at the number of cells, so the trailing cell(s) of a row are
   * never inspected. Kept as in the original; confirm against the live table
   * layout before changing the bound.
   *
   * @param rows cheerio selection of table rows
   * @returns {{investmentServices: Array, authorised: Array}}
   */
  extractDataFromInvestmentServicesTable(rows) {
    const unchecked = /(unchecked)/;
    const output = [];
    const authorised = [];
    const financialInstruments = [];

    rows.each((i, elm) => {
      const children = cheerio(elm).children();

      if (children.length > 2) {
        if (children.length === 11) {
          children.each((step, fiElm) => {
            financialInstruments.push(this._cleanUp(cheerio(fiElm).text()));
          });
        }

        if (children.length > 11) {
          const finInst = [];
          // Matrix rows carry one or two leading cells before the tick cells.
          let offset = (children.length - 1) - financialInstruments.length;
          const fiOffset = (offset === 0) ? 1 : 2;

          const rowName = children.eq(offset).text();
          offset++;

          while (offset < financialInstruments.length) {
            const cellHtml = children.eq(offset).html();

            // A missing cell (null markup) is treated as not ticked.
            if (cellHtml !== null && cellHtml !== undefined && cellHtml.match(unchecked) === null) {
              finInst.push(financialInstruments[offset - fiOffset]);
            }

            offset++;
          }

          if (finInst.length > 0) {
            output.push([rowName, finInst]);
          }
        }
      } else if (children.length === 2) {
        if (children.eq(0).html().match(unchecked) === null) {
          authorised.push(this._cleanUp(children.eq(1).text()));
        }
      }
    });

    return { 'investmentServices': output, authorised };
  }

  /**
   * Extracts one record per `div.zone_succ` block.
   *
   * The first paragraph of a block is expected to read "<label>: <value>";
   * it becomes a `label -> value` property of the record. The block's table
   * is parsed with `extractDataFromTable`. A heading that does not follow the
   * "<label>: <value>" shape is logged and left out, the table data is still
   * recorded.
   *
   * @param tables cheerio selection containing the `div.zone_succ` blocks
   * @returns {Promise<Array>}
   */
  async extractEuroData(tables) {
    const dataBlock = [];
    const findToColon = /^.*?(?=(:))/;
    const trimToColon = /^.*?(?=(:)).\s/;

    tables.find('div.zone_succ').each((i, elm) => {
      const heading = cheerio(elm).find('p').eq(0).text();
      const obj = {};

      const label = heading.match(findToColon);
      // With the capture group in the separator, index 2 holds the text after ": ".
      const value = heading.split(trimToColon)[2];

      if (label !== null && value !== undefined) {
        obj[this._cleanUp(label[0]).trim()] = this._cleanUp(value).trim();
      } else {
        logger.warn('Unable to parse zone heading:', heading);
      }

      const data = this.extractDataFromTable(cheerio(elm).find('table tr'));

      obj.paymentServices = data.output;
      obj.crossBorder = data.crossBorder;

      dataBlock.push(obj);
    });

    return dataBlock;
  }

  /**
   * Builds the list of entity links from a search-result table.
   *
   * Row 0 is the heading row and is skipped. For every other row the third
   * child element found inside the row's cells supplies `href` and title.
   * Rows without an `href` are skipped because they cannot be visited.
   *
   * @param $table cheerio selection of the result table rows
   * @param creditInstFilter when true, only rows with at least 6 child
   *   elements whose last element reads 'legal entity/ company' are kept
   * @returns {Promise<Array>} list of `{ link, title }`
   */
  async extractLinks($table, creditInstFilter = false) {
    const wantedCIStatuses = ['legal entity/ company'];
    const links = [];

    logger.info('Extracting links...');

    for (let count = 1; count < $table.length; count++) {
      const $cells = cheerio($table.get(count)).find('td').children();
      const $item = $cells.eq(2);
      const link = $item.attr('href');

      if (!link) {
        logger.warn(`Search result row ${count} has no link, skipping`);
        continue;
      }

      const title = this._cleanUp($item.text());

      if (!creditInstFilter) {
        // Default mode
        links.push({ link, title });
      } else if ($cells.length >= 6) {
        const status = this._cleanUp($cells.eq($cells.length - 1).text().toLowerCase());

        if (wantedCIStatuses.includes(status)) {
          links.push({ link, title });
        }
      }
    }

    return links;
  }

  /**
   * Reads the "<field>: <value>" items of the description list
   * (`div#zone_description ul.nopuce li`). Items without child elements or
   * without a colon are ignored.
   *
   * @param $ cheerio document
   * @returns {Promise<Array>} list of `[field, value]` pairs
   */
  async extractDetails($) {
    const findToColon = /^.*?(?=(:))/;
    const trimToColon = /^.*?(?=(:)).\s/;
    const details = [];

    $('div#zone_description ul.nopuce li').each((i, elm) => {
      const $li = $(elm);

      if ($li.children().length === 0) {
        return;
      }

      const text = $li.text();
      const matched = text.match(findToColon);

      if (matched === null) {
        return;
      }

      const field = this._cleanUp(matched[0]).trim();
      // No text after the colon (e.g. "Field:") leaves index 2 undefined.
      const rest = text.split(trimToColon)[2];
      const data = this._cleanUp(rest === undefined ? '' : rest);

      details.push([field, data]);
    });

    return details;
  }

  /**
   * Processes one entity ("af") page: extracts the description and the
   * activity tables from a snapshot of the page markup, takes a screenshot of
   * each tab, writes the result as JSON, records the file name on the current
   * link and then moves on to the next link or closes the category.
   *
   * @returns {Promise<void>}
   */
  async processAFPage() {
    const noWhiteSpace = /\W/g;
    const trimToColon = /^.*?(?=(:)).\s/;

    const info = this.__modeInfo(this.mode);

    if (info === null) {
      logger.warn(`Entity page received in unsupported mode ${this.mode}, ignoring`);

      return;
    }

    const store = this[info.key];

    // All tabs are parsed from this single snapshot taken before any click.
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);

    const pageData = { 'description': [], 'frActivities': null, 'EUActivities': [] };

    pageData.entity = removeAccents.remove($('p.sttr').eq(0).text().replace(trimToColon, '').trim());

    const filename = `${info.filePrefix}${pageData.entity.replace(noWhiteSpace, '_')}`;

    // Screenshots are awaited so each one shows the tab that was just opened,
    // but stay best-effort: a failed capture must not abort the scrape.
    const snap = async (suffix) => {
      try {
        await this._makeScreenshotV2(this.page, `${this.path}/${filename}_${suffix}`, null);
      } catch (err) {
        logger.warn(`Screenshot ${filename}_${suffix} failed`, err);
      }
    };

    await snap('main');

    pageData.description = await this.extractDetails($);

    // Process France / French details
    await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(3) a');
    await snap('france');

    const frenchTbl = $('#zone_en_france > table tr');

    if (this.mode < 2) {
      pageData.frActivities = this.extractDataFromTable(frenchTbl).output;

      // Process EU Details
      await this._findAndClick('div.main.main_evol > table > tbody > tr > td:nth-child(5) a');
      await snap('europe');

      pageData.EUActivities = await this.extractEuroData($('#zone_en_europe'));
    } else {
      pageData.creditInstituteActivities = this.extractDataFromInvestmentServicesTable(frenchTbl);
    }

    jsonfile.writeFileSync(`${this.path}/${filename}.json`, pageData);

    store.links[store.step].filename = `${filename}.json`;
    store.step++;

    this.perf.scraped++;
    await this._randomWait(this.page, 5, 7);

    if (store.step < store.items) {
      await this.gotoPage(store.links[store.step]);
    } else {
      await this.__finishMode(info);
    }
  }

  /**
   * Handles one page of search results: harvests its links into `store`,
   * then either clicks through to the next result page or, on the last page,
   * dumps the store and starts visiting the collected links. A category
   * without any link is closed immediately.
   *
   * @param $ cheerio document of the result page
   * @param store category store (`links`, `items`, `step`, `indexcount`, ...)
   * @returns {Promise<void>}
   * @throws {Error} when `this.mode` is not one of the known categories
   */
  async searchResultsProcessor($, store) {
    const nextSelector = 'body > div > div.main.main_evol > ul > li:last-child > a';
    const info = this.__modeInfo(this.mode);

    if (info === null) {
      throw new Error(`searchResultsProcessor: unsupported mode ${this.mode}`);
    }

    const $table = $('table.table tr');

    if ($table.length > 1) {
      // The table contains more than just the heading row
      store.indexcount++;

      // NOTE(review): modeTitles / modePrefix are not assigned anywhere in
      // this class; presumably the base class provides them. Fall back to the
      // local category table when they are absent.
      const title = this.modeTitles ? this.modeTitles[this.mode] : info.label;
      const prefix = this.modePrefix ? this.modePrefix[this.mode] : info.searchFile;

      logger.debug(`Processing menu: ${title} // ${store.indexcount}`);

      await this._makeScreenshotV2(this.page, `${this.path}/${prefix}_menu_${store.indexcount}`, null);

      store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2)));
    }

    // check that the next button is active
    if ($(nextSelector).length === 1) {
      await this._findAndClick(nextSelector, 'Next page >');

      return;
    }

    // Done gathering search results
    logger.info('Completed gathering search results..');
    store.searchDone = true;
    store.items = store.links.length;

    jsonfile.writeFileSync(`${this.path}/${info.searchFile}.json`, store);

    if (store.items === 0) {
      // Nothing to visit: close the category instead of navigating nowhere.
      logger.warn(`No links found for ${info.key}`);
      await this.__finishMode(info);

      return;
    }

    await this.gotoPage(store.links[store.step]);
  }

  /**
   * Handle the search result page and build the list of links for the
   * category selected by `this.mode`. Does nothing once that category's
   * search has been completed.
   *
   * @returns {Promise<void>}
   */
  async handleSearchResults() {
    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);

    const info = this.__modeInfo(this.mode);

    if (info === null) {
      return;
    }

    const store = this[info.key];

    if (!store.searchDone) {
      await this.searchResultsProcessor($, store);
    }
  }

  /**
   * Entry point for every loaded page: dispatches on the `page` query
   * parameter of the current URL ('results' -> search results, 'af' -> entity
   * page). A Chrome error page triggers a 'recover' event instead.
   *
   * @param dump unused
   * @returns {Promise<void>}
   * @throws {Error} for any other page, after `_uploadError()` has been called
   */
  async processNewPage(dump = false) {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    const params = this._getParamsFromUrl(pageUrl.search);
    const pageID = params.page || '';

    switch (pageID) {
      case 'results':
        await this.handleSearchResults();
        break;

      case 'af':
        await this.processAFPage();
        break;

      default:
        await this._uploadError();
        throw new Error(`Unknown page: ${pageUrl.href}`);
    }
  }

  /**
   * Resets the scrape state, prepares the browser page and loads the first
   * search page. Further processing is driven by the page's
   * 'domcontentloaded' event.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure during setup; the original error is rethrown
   */
  async start() {
    await super._start();

    try {
      this.mode = 0;

      const newStore = () => ({
        'items': 0,
        'links': [],
        'step': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'indexcount': 0
      });

      this.paymentServices = newStore();
      this.emoneyServices = newStore();
      this.creditServices = newStore();

      this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3';
      this.eMoneyUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=22-TBR07&retrait=0';
      this.creditUrl = 'https://www.regafi.fr/spip.php?page=results&type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0';

      // Only protocol and hostname are used later, to resolve relative links.
      this.parsedUrl = url.parse(this.creditUrl);

      this.setPath(path.resolve(`${__dirname }/../artefacts/FR/REGAFI`));

      await this._initBrowser(true);
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);

      await this._randomWait(this.page, 3, 5);
    } catch (e) {
      // Rethrow the original error so its stack and type survive; only wrap
      // values that are not Error instances.
      throw (e instanceof Error) ? e : new Error(e);
    }
  }

  /**
   * Body of the debounced `run` installed by the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    logger.info('Scraping France...');

    await this.start();
  }

}
|
|
|
|
// The scraper class is the module's only export.
module.exports = FRScrape;
|