Martin Donnelly a5109efabe 2019-05-12
2019-05-12 18:33:09 +01:00

514 lines
13 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const removeAccents = require('remove-accents-diacritics');
const logger = require('log4js').getLogger('(PT)');
const url = require('url');
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Scraper for the authorised-entity listings published on www.bportugal.pt.
 *
 * The scrape is a small state machine driven by page loads: every
 * `domcontentloaded` event is routed through `processNewPage()`, which decides
 * from the URL path whether the page is an index page or an entity page.
 *
 * `this.mode` selects the register currently being scraped:
 *   0 - payment services, 1 - e-money services, 2 - credit services.
 * For each mode the paginated index is walked first (collecting links), then
 * every collected link is visited and written out as JSON plus a screenshot.
 *
 * `this.modeTitles`, `this.modeNames`, `this.modePrefix`, `this.path`,
 * `this.debugPath` and the underscore-prefixed helpers are not defined in this
 * class; they are presumably supplied by the `Scraper` base class.
 */
class PTScrape extends Scraper {
  /**
   * Registers the 'done' handler, builds the throttled `run` entry point and,
   * in production, starts a run automatically when `_checkLock()` resolves to
   * a truthy value.
   */
  constructor() {
    super();
    this.setID('PT');

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            this.run();
          }
        })
        .catch((err) => {
          // Without a handler a failed lock check would be an unhandled rejection.
          logger.error(err);
        });
    }
  }

  /**
   * Extracts the entity details from the HTML of an entity page.
   *
   * @param {string} html - full HTML of the entity page
   * @returns {Promise<Object|undefined>} object keyed by `name` plus one key per
   *   entry in the field table below; `undefined` if extraction threw (the
   *   error is logged, not rethrown)
   */
  async extractEntityDetails(html) {
    try {
      // [CSS class of the field wrapper on the page, key used in the output object]
      // NOTE(review): 'field-localidade' is stored under 'firstName' and
      // 'field-data-limite' under 'beginningOfActivity'. These look like
      // mismatches, but the keys are part of the emitted JSON so they are left
      // unchanged here - confirm with the consumers of the output.
      const detailSequence = [
        ['field-name-field-tipo-ent-aut', 'institutionType'],
        ['field-name-field-estado-ent', 'state'],
        ['field-name-field-morada', 'address'],
        ['field-name-field-localidade', 'firstName'],
        ['field-name-field-cod-postal', 'postcode'],
        ['field-name-field-pais', 'country'],
        ['field-name-field-data-limite', 'beginningOfActivity'],
        ['field-name-field-capital-subscrito', 'subscribedCapital'],
        ['field-name-field-capital-realizado', 'paidUpCapital'],
        ['field-name-field-jel', 'institutionCodeNumber']
      ];

      const $ = cheerio.load(html);
      const details = { 'name': this._cleanUp($('h1.page-title').text()) };
      const mainDiv = $('div.content');

      for (const [fieldClass, key] of detailSequence) {
        const fieldItems = mainDiv.find(`.${fieldClass} div.field-items`);
        details[key] = this._cleanUp(fieldItems.text());
      }

      return details;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Processes one page of the paginated index: collects the entity links on the
   * page into `serviceObject.links`, takes a screenshot and navigates to the
   * next index page. When the page contains `div.view-empty`, emits 'indexdone'
   * instead and returns.
   *
   * @param {Object} serviceObject - state object of the service being indexed;
   *   its `links` array is appended to
   * @returns {Promise<void>}
   */
  async processIndex(serviceObject) {
    logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
    await this._randomWait(this.page, 3, 5);

    const body = await this.page.content();
    const $ = cheerio.load(body);

    if ($('div.view-empty').length > 0) {
      // We have reached an empty page, so we assume we've scraped all links from this index
      this.emit('indexdone');
      return;
    }

    $('div.views-field.views-field-title > span > a').each((i, item) => {
      const anchor = $(item);
      const text = anchor.text();
      serviceObject.links.push({
        'name': text,
        'href': `https://www.bportugal.pt${anchor.attr('href')}`,
        'id': this._makeFieldName(text)
      });
    });

    const filename = this.modeNames[this.mode];
    const parsedUrl = url.parse(this.page.url(), true);

    // A missing or non-numeric `page` parameter is treated as page 0; otherwise
    // the screenshot would be named "..._undefined" and the next URL "page=NaN".
    const parsedPage = Number.parseInt(parsedUrl.query.page, 10);
    const currentPage = Number.isNaN(parsedPage) ? 0 : parsedPage;

    await this._makeScreenshotV2(this.page, `${this.path}/${filename}_main_${currentPage}`, null);

    parsedUrl.query.page = currentPage + 1;
    parsedUrl.search = undefined; // Forces url.format to rebuild the query string from `query`
    await this._goto(url.format(parsedUrl));
  }

  /**
   * Waits for the page to settle and then processes the current index page.
   *
   * @param {Object} serviceObject - state object of the service being indexed
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    // We have stopped using the "view all" button due to it breaking.
    // Leaving the code below commented in case it is ever useful in future.
    // await this.page.waitForSelector('#block-system-main > div > div > div.view-content-wrapper > ul > li.pager__item.pager__item_all', { 'visible':true, 'timeout':7500 }).then(async (elm) => {
    // logger.debug('Extend menu list..');
    // await elm.click({ 'delay':90 });
    // }).catch(() => {
    // logger.info('No show all button');
    // });
    await this._randomWait(this.page, 6, 9);
    await this.processIndex(serviceObject);
  }

  /**
   * Handles an index page: builds the index for the service selected by
   * `this.mode`, unless the "main selector" button is found, in which case the
   * button is clicked instead.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    logger.debug('>> indexRedirector');

    // NOTE(review): the selector text is Polish and looks inherited from another
    // scraper; on this site the wait presumably always times out after 7.5s and
    // indexing proceeds - confirm before removing, since it also acts as a delay.
    let doIndex;
    try {
      const elm = await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible':true, 'timeout':7500 });
      logger.warn('Sent back to the main selector screen');
      await elm.click({ 'delay':90 });
      doIndex = false;
    }
    catch (err) {
      // Selector not found (or the click failed): carry on with indexing.
      doIndex = true;
    }

    if (doIndex) {
      switch (this.mode) {
        case 0:
          await this.buildIndex(this.paymentServices);
          break;
        case 1:
          await this.buildIndex(this.emoneyServices);
          break;
        case 2:
          await this.buildIndex(this.creditServices);
          break;
      }
    }
  }

  /**
   * Processes the entity page currently loaded: screenshots it, extracts the
   * details, writes them to `<path>/<prefix><id>.json`, records the file name on
   * the link entry and then either navigates to the next link or emits
   * 'serviceDone' when every link has been visited.
   *
   * @param {Object} serviceObject - state object of the service being processed;
   *   `links[step]` is the entity expected on the current page
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {
    const nonWordChars = /\W/g;
    const { name, id } = serviceObject.links[serviceObject.step];

    logger.info(`Process ${this.modeTitles[this.mode]} entity ${serviceObject.step + 1} of ${serviceObject.items} // ${name}`);

    await this.page.waitForSelector('h1.page-title', { 'visible':true, 'timeout':7500 });
    await this._randomWait(this.page, 3, 5);

    const entity = removeAccents.remove(id.trim());
    const filename = [this.modePrefix[this.mode], entity.replace(nonWordChars, '_')].join('');
    // NOTE(review): the path is capped at 240 characters but the `filename`
    // recorded on the link below is not, so the two can disagree for very long
    // names - verify whether that ever happens in practice.
    const filePath = `${this.path}/${filename}`.substring(0, 240);

    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const details = await this.extractEntityDetails(body);
    await jsonfile.writeFile(`${filePath}.json`, { details });

    await this._randomWait(this.page, 3, 5);

    serviceObject.links[serviceObject.step].filename = `${filename}.json`;
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items) {
      await this._goto(serviceObject.links[serviceObject.step].href);
    }
    else {
      this.emit('serviceDone');
    }
  }

  /**
   * Handles an entity page by processing it against the service selected by
   * `this.mode`. Modes other than 0, 1 and 2 are ignored.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    switch (this.mode) {
      case 0:
        await this.processEntityDetails(this.paymentServices);
        break;
      case 1:
        await this.processEntityDetails(this.emoneyServices);
        break;
      case 2:
        await this.processEntityDetails(this.creditServices);
        break;
    }
  }

  /**
   * Routes a freshly loaded page to the right handler based on the first path
   * segment after `/en/`.
   *
   * @returns {Promise<void>}
   * @throws {Error} when the page is not recognised and NODE_ENV is set
   */
  async processNewPage() {
    // Captures "/en/<first-segment>/" only - the match is lazy, so it never
    // spans more than one segment.
    const pathSplitter = /(\/en\/.+?\/)/;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');
      return;
    }

    // `match` returns null for paths outside "/en/<segment>/"; fall back to the
    // raw pathname so such pages reach the `default` branch instead of throwing
    // a TypeError here.
    const splitPath = (pageUrl.pathname || '').match(pathSplitter);
    const pathname = splitPath === null ? pageUrl.pathname : splitPath[0];

    switch (pathname) {
      case '/en/entidades-autorizadas/':
        await this.indexRedirector();
        break;
      case '/en/entidadeautorizada/':
        await this.processRedirector();
        break;
      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }
        else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }
  }

  /**
   * Wires up the internal events that move the scrape from one phase to the
   * next:
   *   'indexdone'   -> '<ps|em|ci>indexdone' -> first entity page of the service
   *   'serviceDone' -> '<service>Done'       -> results written, next service
   *                                             started (or 'done' emitted)
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    // Index finished: record how many links were found and open the first one.
    const beginEntityPass = async (service) => {
      try {
        service.items = service.links.length;
        logger.info(`${service.items} items indexed`);

        const first = service.links[service.step];
        if (first === undefined) {
          // Nothing was indexed; finish this service rather than dereferencing
          // a missing link, which would stall the whole run.
          logger.warn('Index is empty; nothing to process for this service');
          this.emit('serviceDone');
          return;
        }

        await this._goto(first.href);
      }
      catch (e) {
        logger.error(e);
      }
    };

    // Service finished: persist its links, advance the mode, then run `proceed`.
    const finishService = async (key, proceed) => {
      logger.warn(`${key}Done`);
      try {
        const service = this[key];
        service.done = true;
        jsonfile.writeFileSync(`${this.path}/${key}.json`, { 'links': service.links });
        jsonfile.writeFileSync(`${this.debugPath}/${key}.json`, service);
        this.mode++;
        this.inProgress = false;
        await proceed();
      }
      catch (e) {
        logger.error(e);
      }
    };

    this.on('entityComplete', () => {
      this.handleEntityComplete();
    });

    this.on('serviceDone', () => {
      switch (this.mode) {
        case 0:
          this.emit('paymentServicesDone');
          break;
        case 1:
          this.emit('emoneyServicesDone');
          break;
        case 2:
          this.emit('creditServicesDone');
          break;
      }
    });

    this.on('psindexdone', () => beginEntityPass(this.paymentServices));
    this.on('emindexdone', () => beginEntityPass(this.emoneyServices));
    this.on('ciindexdone', () => beginEntityPass(this.creditServices));

    this.on('indexdone', () => {
      switch (this.mode) {
        case 0:
          this.emit('psindexdone');
          break;
        case 1:
          this.emit('emindexdone');
          break;
        case 2:
          this.emit('ciindexdone');
          break;
      }
    });

    this.on('paymentServicesDone', () =>
      finishService('paymentServices', () => this._goto(this.emoneyServices.urls[0])));

    this.on('emoneyServicesDone', () =>
      finishService('emoneyServices', () => this._goto(this.creditServices.urls[0])));

    this.on('creditServicesDone', () =>
      finishService('creditServices', () => this.emit('done')));
  }

  /**
   * Resets the per-service state, prepares the browser page, attaches the
   * page/event handlers and navigates to the first index page.
   *
   * @returns {Promise<void>}
   * @throws {Error} any error raised while setting up or loading the start page
   */
  async start() {
    logger.debug(this.eventNames());
    super._start();

    try {
      this.mode = 0;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/75/all?page=0'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/72/all?page=0'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'searchDone' : false,
        'started': false,
        'urls': ['https://www.bportugal.pt/en/entidades-autorizadas/67-68-1524-69/all?page=0'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/PT/BP`));

      // A failure here is logged but does not abort the run.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();
      await this._makeResponsive();

      this.page.on('domcontentloaded', this._throttle(async () => {
        try {
          await this.processNewPage();
        }
        catch (err) {
          // Page handling runs outside any caller's try/catch, so failures are
          // logged here instead of surfacing as unhandled rejections.
          logger.error(err);
        }
      }, 5000));

      // Only attach the workflow events once. The count of 2 presumably
      // corresponds to the listeners registered before the first start
      // (e.g. 'done' plus one from the base class) - TODO confirm.
      if (this.eventNames().length === 2) {
        await this.attachEvents();
      }

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its type and stack trace are preserved.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point used by the throttled `run` wrapper; simply starts a scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// Expose the scraper class as this module's sole export.
module.exports = PTScrape;