obdfcascrape/ncas/plR.js
Martin Donnelly be5d3eae07 init
2019-05-05 20:13:56 +01:00

1023 lines
25 KiB
JavaScript

const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('PL');
const url = require('url');
// Log verbosity is taken from the LOGGER_LEVEL environment variable; when it is
// unset or empty the logger falls back to 'warn'.
logger.level = process.env.LOGGER_LEVEL || 'warn';
/**
 * Build a short tag from the current time.
 *
 * The tag is the current epoch time in milliseconds rendered in base 36; the
 * callers in this file attach it to paired "+"/"-" debug log lines.
 *
 * @returns {string} base-36 representation of the current millisecond timestamp
 */
function tag() {
  return Date.now().toString(36);
}
/**
 * Scraper (id 'PL') for the KNF registers reachable from the URLs configured in
 * start().
 *
 * The scraper is event driven: every 'domcontentloaded' on the page (and every
 * 'pageChanged' event) is dispatched by processNewPage() according to the URL
 * path. `this.mode` selects which service object is being worked on:
 * 0 = paymentServices, 1 = emoneyServices, 2 = creditServices.
 *
 * Several members used here (`_cleanUp`, `_makeFieldName`, `_makeFilePath`,
 * `_makeFileName`, `_makeScreenshotV2`, `_findAndClick`, `_randomWait`,
 * `_goto`, `_initBrowser`, `_uploadError`, `_done`, `modeTitles`, `modeNames`,
 * `debugPath`, `processCSEntityIndex`, `processCSEntityDetail`,
 * `handleXLSDownload`, `csIndexHandler`, `processArt70`, `handleStartcs`) are
 * not defined in this class; presumably they come from the Scraper base class
 * or another part of the project — TODO confirm.
 */
class PLScrape extends Scraper {
  constructor() {
    super();
    this.id = 'PL';

    // Forward the 'done' event to the _done() hook.
    this.on('done', () => {
      this._done();
    });

    /* if (process.env.NODE_ENV === 'production')
      this._checkLock().then((l) => {
        if(l)
          this.run();
      });*/
  }

  /**
   * Collapse two-cell table rows into a plain object.
   *
   * Rows that do not have exactly two children are ignored. The first cell is
   * turned into the key via `_makeFieldName`, the second into the value via
   * `_cleanUp`.
   *
   * @param rows cheerio selection of rows
   * @returns {Promise<Object|undefined>} the label/value object, or undefined
   *   when an error was caught (the error is logged)
   */
  async rowReducer(rows) {
    try {
      const newObj = {};

      rows.each((i, elm) => {
        const children = cheerio(elm).children();

        if (children.length === 2) {
          // we want this data
          const label = this._makeFieldName(cheerio(children.eq(0)).text());

          newObj[label] = this._cleanUp(cheerio(children.eq(1)).text());
        }
      });

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Convert a selection of elements into an array of their cleaned-up text.
   *
   * @param items cheerio selection (used here for `li` elements)
   * @returns {Promise<Array|undefined>} one entry per element, in document
   *   order; undefined when an error was caught (the error is logged)
   */
  async reduceBullets(items) {
    try {
      const newArray = [];

      items.each((i, elm) => {
        newArray.push(this._cleanUp(cheerio(elm).text()));
      });

      return newArray;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the heading text of a single-entity page.
   *
   * @param html page markup
   * @returns {Promise<string|undefined>} cleaned heading text, '' when the
   *   heading element is absent, undefined when an error was caught
   */
  async extractCSHeading(html) {
    try {
      const $ = cheerio.load(html);
      // NOTE: 'singleEtity' is the id as written in the selector; it is runtime
      // text and is deliberately left untouched.
      const rawHeading = $('#singleEtity > div > div > div.panel-heading > h2');

      if ($(rawHeading).length === 0) return '';

      return this._cleanUp($(rawHeading).text());
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the leading text of `#singleEntityBody`.
   *
   * Only the first child node is inspected, and its text is returned when the
   * node's `name` is one of the wanted names.
   *
   * @param html page markup
   * @returns {Promise<string|undefined>} cleaned text, '' when there is nothing
   *   suitable, undefined when an error was caught
   */
  async extractCSBodyText(html) {
    try {
      const wanted = ['b', 'text'];
      const $ = cheerio.load(html);
      const rawBody = $('#singleEntityBody');

      if ($(rawBody).length === 0) return '';

      const firstRow = $(rawBody).contents()[0];

      // An empty body has no first node; previously this fell into the catch
      // block via a TypeError and yielded undefined instead of ''.
      if (!firstRow) return '';

      // NOTE(review): 'text' is compared against `name`. Text nodes produced by
      // cheerio's parsers appear to expose `type === 'text'` rather than a
      // `name`, so this entry may never match — confirm the intent before
      // changing it.
      if (wanted.indexOf(firstRow.name) !== -1)
        return this._cleanUp($(firstRow).text());

      return '';
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the table under `#singleEntityBody` as label/content pairs.
   *
   * The first cell's text is the label and the second cell's inner HTML is the
   * content. Pairs are collected in a Map, so a repeated label keeps only the
   * last content.
   *
   * @param html page markup
   * @returns {Promise<Array|undefined>} array of `[label, content]` pairs
   *   (empty when there is no table); undefined when an error was caught
   */
  async extractCSTable(html) {
    try {
      const outMap = new Map([]);
      const $ = cheerio.load(html);
      const mainTable = $('#singleEntityBody > table');

      if ($(mainTable).children().length === 0) return [...outMap];

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();

      tableRows.each((i, elm) => {
        const cells = $(elm).children();

        if (cells.length > 0) {
          const label = this._cleanUp($(cells).eq(0).text());
          const text = this._cleanUp($(cells).eq(1).html());

          outMap.set(label, text);
        }
      });

      return [...outMap];
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the activity tab (`div#areatabs1_5`).
   *
   * For each top-level row the text of its first nested `tr` (with the "Kraj"
   * prefix removed) becomes the key and the row's `li` texts become the value.
   *
   * @param html page markup
   * @returns {Promise<Object|undefined>} key -> array of bullet texts; an
   *   empty object when the table is missing; undefined when an error was
   *   caught
   */
  async extractEntityActivity(html) {
    try {
      const removeCountry = /(Kraj)\s+/g;
      const newObj = {};
      const $ = cheerio.load(html);
      const mainTable = $('div#areatabs1_5 table.tableDynamic');

      if ($(mainTable).children().length === 0) return newObj;

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();

      // Await each reduction so the result is complete when it is returned,
      // instead of relying on un-awaited .then() callbacks.
      for (const elm of tableRows.toArray()) {
        const rows = $(elm).find('tr');
        const listItems = $(elm).find('li');
        const rawCountryName = this._cleanUp($($(rows)[0]).text()).replace(removeCountry, '');
        const countryName = this._makeFieldName(rawCountryName);

        newObj[countryName] = await this.reduceBullets(listItems);
      }

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the branches tab (`div#areatabs1_4`).
   *
   * @param html page markup
   * @returns {Promise<Array|undefined>} one rowReducer() result per top-level
   *   row; empty when the table is missing; undefined when an error was caught
   */
  async extractEntityBranches(html) {
    try {
      const newArray = [];
      const $ = cheerio.load(html);
      const mainTable = $('div#areatabs1_4 table.tableDynamic');

      if ($(mainTable).children().length === 0) return newArray;

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();

      // Sequential awaits keep document order and guarantee a complete result.
      for (const elm of tableRows.toArray()) {
        const rows = $(elm).find('tr');

        newArray.push(await this.rowReducer(rows));
      }

      return newArray;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the agents tab (`div#areatabs1_3`).
   *
   * @param html page markup
   * @returns {Promise<Array|undefined>} one rowReducer() result per top-level
   *   row; empty when the table is missing; undefined when an error was caught
   */
  async extractEntityAgents(html) {
    try {
      const newArray = [];
      const $ = cheerio.load(html);
      const mainTable = $('div#areatabs1_3 table.tableDynamic');

      if ($(mainTable).children().length === 0) return newArray;

      const mainBody = $(mainTable).children()[0];
      const tableRows = $(mainBody).children();

      // Sequential awaits keep document order and guarantee a complete result.
      for (const elm of tableRows.toArray()) {
        const rows = $(elm).find('tr');

        newArray.push(await this.rowReducer(rows));
      }

      return newArray;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the services tab (`#areatabs1_2`).
   *
   * The combined text of the `.left` cells becomes the single key; the value is
   * the list of `.container100 li` texts.
   *
   * @param html page markup
   * @returns {Promise<Object|undefined>} object with one key holding the
   *   service list; undefined when an error was caught
   */
  async extractEntityServices(html) {
    logger.debug('+ extractEntityServices');
    try {
      const newObj = {};
      const $ = cheerio.load(html);
      const rows = $('#areatabs1_2 > table tr');
      const label = this._makeFieldName($(rows).find('.left').text());

      newObj[label] = [];

      const listItems = $(rows).find('.container100 li');

      listItems.each((i, elm) => {
        newObj[label].push(this._cleanUp($(elm).text()));
      });

      logger.debug('- extractEntityServices');

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Extract the details tab (`div#areatabs1_1`) as a label/value object.
   *
   * Only rows with exactly two cells contribute.
   *
   * @param html page markup
   * @returns {Promise<Object|undefined>} label/value object; undefined when an
   *   error was caught
   */
  async extractEntityDetails(html) {
    logger.debug('+ extractEntityDetails');
    try {
      const newObj = {};
      const $ = cheerio.load(html);
      const rows = $('div#areatabs1_1 tr');

      rows.each((i, elm) => {
        const children = cheerio(elm).children();

        if (children.length === 2) {
          // we want this data
          const label = this._makeFieldName($(children.eq(0)).text());

          newObj[label] = this._cleanUp($(children.eq(1)).text());
        }
      });

      logger.debug('- extractEntityDetails');

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Read the paging information of a freshly loaded index page into
   * `serviceObject`, mark it visited and take a screenshot.
   *
   * On any failure the error is logged and `serviceObject.visited` is left
   * unchanged.
   *
   * @param serviceObject state object of the service currently being scraped
   * @returns {Promise<void>}
   */
  async entityIndexFirstPass(serviceObject) {
    logger.debug('+ entityIndexFirstPass');
    try {
      // breaks up `1/146 (1455)`
      const breaker = /(\d+)/g;
      const body = await this.page.content();
      const $ = cheerio.load(body);
      const subjectsInfo = $($('.infoNavigation').contents()[2]).text();
      const brokenString = subjectsInfo.match(breaker);

      // match() yields null when there are no digits at all; fail with a clear
      // message rather than a TypeError or NaN counters.
      if (brokenString === null || brokenString.length < 3)
        throw new Error(`Unexpected paging information: "${subjectsInfo}"`);

      const currentPageIndex = parseInt(brokenString[0], 10);
      const currentPageMax = parseInt(brokenString[1], 10);
      const currentIndexLength = parseInt(brokenString[2], 10);

      logger.info(`First pass on the ${this.modeTitles[this.mode]} index...`);

      serviceObject.currentIndexLength = currentIndexLength;
      serviceObject.currentPageMax = currentPageMax;
      serviceObject.currentPageIndex = currentPageIndex;
      serviceObject.visited = true;
      serviceObject.currentIndex = url.parse(await this.page.url());
      serviceObject.currentMetaIndex = 0;

      const entityName = `${this.modeNames[this.mode]}_${currentPageIndex}`;
      const filePath = await this._makeFilePath(entityName);

      await this._makeScreenshotV2(this.page, filePath, null);
    }
    catch (err) {
      logger.error(err);
    }
    logger.debug('- entityIndexFirstPass');
  }

  /**
   * Work on the index table: either open the next entity row or, when the
   * current page is exhausted, move to the next page / finish the service.
   *
   * @param serviceObject state object of the service currently being scraped
   * @returns {Promise<void>}
   */
  async processEntityIndex(serviceObject) {
    logger.debug('+ processEntityIndex');
    try {
      const fields = ['count', 'referenceNumber', 'typeOfEntity', 'name', 'registrationNumber', 'nip', 'date'];
      // The stepping arithmetic below treats every index page as 10 rows.
      const rowsPerPage = 10;
      const nextSelector = '#j_idt64-tableViewS-recordsGoToNext';
      const mouseDownDuration = Scraper.notARobot();

      logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);

      if (serviceObject.visited === false) {
        logger.debug('Preparing...');
        await this.entityIndexFirstPass(serviceObject);
      }

      if (serviceObject.visited === true) {
        serviceObject.currentMetaIndex = serviceObject.step % rowsPerPage;

        if (serviceObject.step >= (serviceObject.currentPageIndex * rowsPerPage)) {
          logger.debug('Maxed out this page..');

          const nextButton = await this.page.$$(nextSelector);
          const isDisabled = await this.page.$eval(nextSelector, (elm) => {
            return elm.disabled;
          });

          if (!isDisabled) {
            // we need a click..
            serviceObject.visited = false;
            await this._randomWait(this.page, 1, 2);
            // Awaited so a failing click is logged by the catch below instead
            // of becoming an unhandled rejection.
            await nextButton[0].click({ 'delay': mouseDownDuration });
          }
          else {
            logger.debug('I think we are done here...');
            this.emit('serviceDone');
          }
        }
        else {
          logger.info(`Dealing with ${serviceObject.step + 1} of ${serviceObject.currentIndexLength}`);

          const elmStr = `table#j_idt64-tableViewS tbody tr:nth-child(${serviceObject.currentMetaIndex + 1})`;
          const rowHandle = await this.page.waitForSelector(elmStr);

          // A failed hover is only worth a warning; focus is still attempted.
          await rowHandle.hover().catch((err) => {
            logger.warn(err);
          });
          // Force the focus
          await rowHandle.focus();

          const wantedRow = await this.page.$$(elmStr);
          const htmlRow = await this.page.evaluate(el => el.outerHTML, wantedRow[0]);
          // Wrap the row so the parser keeps the <tr>/<td> structure.
          const $ = cheerio.load(`<table>${htmlRow}</table>`);
          const cells = $('td');

          serviceObject.current = {};

          cells.each((index, item) => {
            serviceObject.current[fields[index]] = $(item).text();
          });

          await this._randomWait(this.page, 2, 3);
          await wantedRow[0].click({ 'delay': mouseDownDuration });
          await this._findAndClick('#j_idt112 > input.button');
        }
      }
    }
    catch (err) {
      logger.error(err);
    }
    logger.debug('- processEntityIndex');
  }

  /**
   * Handle the intro page: wait, then click the input that leads to the index.
   *
   * On failure the error is uploaded via `_uploadError` and 'stall' is emitted.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    logger.debug('+ indexRedirector');
    try {
      await this._randomWait(this.page, 3, 5, 'handleIntroPage');

      const button = await this.page.waitForSelector('#allByJS > tbody > tr:nth-child(4) > td > input');

      await button.click({ 'delay': Scraper.notARobot() });
    }
    catch (err) {
      logger.warn('!!!!!');
      logger.error(err);
      await this._uploadError();
      this.emit('stall');
    }
    logger.debug('- indexRedirector');
  }

  /**
   * Scrape the entity detail page into `serviceObject.current`, screenshot each
   * available tab and emit 'entityComplete'.
   *
   * A 75 second watchdog is armed while the page is processed; when it fires,
   * the instance is flagged as stalled and 'backoff' is emitted.
   *
   * @param serviceObject state object of the service currently being scraped
   * @returns {Promise<void>}
   */
  async processEntityDetail(serviceObject) {
    logger.debug('+ processEntityDetail');
    try {
      const tabs = [
        { 'id': '', 'name': 'details' },
        { 'id': 'div#tabs1_2', 'name': 'services' },
        { 'id': 'div#tabs1_3', 'name': 'agents' },
        { 'id': 'div#tabs1_4', 'name': 'branches' },
        { 'id': 'div#tabs1_5', 'name': 'activity' }
      ];

      if (serviceObject.visited === false) {
        logger.debug('Process the menu correctly');
        this.emit('handleEntityIndex');

        return;
      }

      logger.info(`Process ${this.modeTitles[this.mode]} // ${serviceObject.current.name}`);

      // Snapshot for the watchdog message; `current` is replaced later on.
      const stallObj = Object.assign({}, serviceObject.current);

      // Drop a watchdog left over from an earlier invocation so that only one
      // timer is ever pending.
      clearTimeout(this.stall);
      this.stall = setTimeout(() => {
        logger.warn(`Page stalled. Backing off :: ${stallObj.name}`);
        this.stalled = true;
        this.emit('backoff');
      }, 75000);

      const newObj = {};
      const entityName = `${serviceObject.current.name}_${serviceObject.current.nip}`;
      const fileName = this._makeFileName(entityName);
      const filePath = await this._makeFilePath(entityName);

      serviceObject.current.fileName = fileName;

      const body = await this.page.content();

      newObj.details = await this.extractEntityDetails(body);
      newObj.services = await this.extractEntityServices(body);
      newObj.agents = await this.extractEntityAgents(body);
      newObj.branches = await this.extractEntityBranches(body);
      newObj.activity = await this.extractEntityActivity(body);

      serviceObject.current = Object.assign(serviceObject.current, newObj);

      for (const item of tabs) {
        // The first tab has no selector and is not clicked or screenshotted.
        if (item.id !== '') {
          const tabExists = await this.page.$$(item.id);

          if (tabExists.length > 0) {
            await this._findAndClick(item.id);
            await this._makeScreenshotV2(this.page, `${filePath}_${item.name}`, null);
          }
        }
      }

      if (!this.stalled) {
        this.emit('entityComplete');
        logger.info('Entity complete...');
      }
      else
        throw new Error('Stalled');
    }
    catch (err) {
      logger.error(err);
    }
    logger.debug('- processEntityDetail');
  }

  /**
   * Persist the current entity as JSON, record its link, advance the step
   * counter and navigate back to the index (or emit 'serviceDone' when the
   * last entity has been handled).
   *
   * @param serviceObject state object of the service currently being scraped
   * @returns {Promise<void>}
   */
  async entityCompleter(serviceObject) {
    const _tag = tag();

    logger.debug('+ entityCompleter', _tag);
    try {
      const filename = serviceObject.current.fileName;
      // The path (without extension) is capped at 240 characters.
      const filePath = `${this.path}/${filename}`.substring(0, 240);

      logger.info(`Saving: ${filename}.json`);

      const newLink = { 'name': serviceObject.current.name, 'fileName': `${filename}.json` };

      if (this.mode === 0)
        newLink.nip = serviceObject.current.nip;

      if (this.mode === 2)
        newLink.hash = serviceObject.current.hash;

      serviceObject.links.push(newLink);

      await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
      await this._randomWait(this.page, 10, 15, 'Throttled');

      serviceObject.step++;

      // The entity is finished, so the stall watchdog is disarmed.
      clearTimeout(this.stall);
      this.stall = 0;

      if (serviceObject.step < serviceObject.currentIndexLength) {
        serviceObject.current = {};

        if (this.mode === 0) {
          // await this._findAndClick('#allByJS > tbody > tr:nth-child(8) > td > span:nth-child(2) > input');
          const backButton = await this.page.waitForSelector('#allByJS > tbody > tr:nth-child(8) > td > span:nth-child(2) > input');

          await backButton.click({ 'delay': Scraper.notARobot() });
        }
        else {
          // await this._findAndClick('#previousSearchPage');
          const previousPage = await this.page.waitForSelector('#previousSearchPage');

          await previousPage.click({ 'delay': Scraper.notARobot() });
          this.emit('pageChanged');
        }
      }
      else
        this.emit('serviceDone');
    }
    catch (err) {
      logger.error(err);
    }
    logger.debug('- entityCompleter', _tag);
  }

  /**
   * Dispatch index handling to the processor matching the current mode.
   * Mode 0 and any unrecognised mode use the payment services.
   *
   * @returns {Promise<void>}
   */
  async handleEntityIndex() {
    switch (this.mode) {
      case 1:
        await this.processEntityIndex(this.emoneyServices);
        break;
      case 2:
        await this.processCSEntityIndex(this.creditServices);
        break;
      case 0:
      default:
        await this.processEntityIndex(this.paymentServices);
        break;
    }
  }

  /**
   * Dispatch detail handling to the processor matching the current mode.
   * Mode 0 and any unrecognised mode use the payment services.
   *
   * @returns {Promise<void>}
   */
  async handleEntityDetail() {
    logger.debug('+ handleEntityDetail');
    switch (this.mode) {
      case 1:
        await this.processEntityDetail(this.emoneyServices);
        break;
      case 2:
        await this.processCSEntityDetail(this.creditServices);
        break;
      case 0:
      default:
        await this.processEntityDetail(this.paymentServices);
        break;
    }
    logger.debug('- handleEntityDetail');
  }

  /**
   * Run entityCompleter() on the service object matching the current mode.
   * Mode 0 and any unrecognised mode use the payment services.
   *
   * @returns {Promise<void>}
   */
  async handleEntityComplete() {
    logger.debug('+ handleEntityComplete');
    switch (this.mode) {
      case 1:
        await this.entityCompleter(this.emoneyServices);
        break;
      case 2:
        await this.entityCompleter(this.creditServices);
        break;
      case 0:
      default:
        await this.entityCompleter(this.paymentServices);
        break;
    }
    logger.debug('- handleEntityComplete');
  }

  /**
   * Route a newly loaded page to its handler based on the URL path (with any
   * `;jsessionid=...` suffix removed).
   *
   * @returns {Promise<void>}
   * @throws {Error} for the '/' path when it is not the Chrome error page, and
   *   for unknown paths when NODE_ENV is set
   */
  async processNewPage() {
    const _tag = tag();

    logger.debug('+ processNewPage', _tag);

    // give the page a few seconds to settle
    const removeJSession = /(;jsessionid=[0-9a-f]*)/g;

    await this._randomWait(this.page, 3, 5, 'processNewPage');

    const pageUrl = url.parse(await this.page.url());
    const pathName = (pageUrl.pathname || '').replace(removeJSession, '');

    // logger.debug('## Page changed', pageUrl);

    switch (pathName) {
      case '/View/':
        await this.indexRedirector();
        break;

      case '/View/faces/start2OuterView.xhtml':
      case '/View/faces/dataEdit.xhtml':
        await this.handleEntityIndex();
        break;

      case '/View/faces/subjectsList.xhtml':
        await this.handleEntityDetail();
        break;

      case '/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych':
        await this.handleXLSDownload();
        break;

      case '/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych':
        await this.handleXLSDownload();
        break;

      case '/podmioty/wyszukiwarka_podmiotow':
        await this.csIndexHandler();
        break;

      case '/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi':
        await this.processArt70();
        break;

      case '/':
        // The Chrome error page is treated as a reason to back off.
        if (pageUrl.href === 'chrome-error://chromewebdata/')
          this.emit('backoff');
        else
          throw new Error(`Bad page: ${pageUrl.href}`);
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          // this.emit('backoff');
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }
        else {
          logger.warn('processNewPage Fell through');
          logger.warn('pathName', pathName);
          logger.warn('currentPage.location', pageUrl);
        }
        break;
    }
    logger.debug('- processNewPage', _tag);
  }

  /**
   * Close the current page and browser, start a fresh browser and reopen the
   * URL belonging to the current mode.
   *
   * Failures while closing are logged and do not prevent the restart.
   *
   * @returns {Promise<void>}
   */
  async restart() {
    logger.warn(`Tryng to restart ${this.modeTitles[this.mode]}`);
    logger.error('KILLING PAGE & BROWSER');

    // A page or browser that is already gone must not abort the restart.
    if (this.page) {
      await this.page.close().catch((err) => {
        logger.error(err);
      });
    }

    if (this.browser) {
      await this.browser.close().catch((err) => {
        logger.error(err);
      });
    }

    this.page = null;
    this.browser = null;

    logger.error('RESTARTING');

    await this._initBrowser(true);
    this.page = await this.browser.newPage();

    logger.warn('Restarted');

    switch (this.mode) {
      case 1:
        await this._goto(this.emoneyServices.urls[0]);
        break;
      case 2:
        await this._goto(this.creditServices.urls[this.creditServices.metastep]);
        break;
      case 0:
      default:
        await this._goto(this.paymentServices.urls[0]);
        break;
    }
  }

  /**
   * Schedule a restart after a growing delay.
   *
   * Each call raises the back-off step (capped at `backOffLimit`); the delay is
   * five minutes per step. The stall watchdog is disarmed and the stalled flag
   * is reset.
   *
   * @returns {Promise<void>}
   */
  async backoff() {
    this.backOffStep++;

    clearTimeout(this.stall);
    this.stall = 0;
    this.stalled = false;

    if (this.backOffStep > this.backOffLimit) this.backOffStep = this.backOffLimit;

    logger.warn(`Backing off for ${this.backOffStep * 5} minutes..`);

    const timeout = (60 * 1000) * (this.backOffStep * 5);

    // A failed error upload must not stop the restart from being scheduled.
    try {
      await this._uploadError();
    }
    catch (err) {
      logger.error(err);
    }

    // Both 'stall' and 'backoff' lead here; keep a single pending restart.
    clearTimeout(this.backOffTimer);
    this.backOffTimer = setTimeout(() => {
      this.emit('restart');
    }, timeout);
  }

  /**
   * Initialise the scraper state, open the browser, wire up all event handlers
   * and load the first URL.
   *
   * @returns {Promise<void>}
   * @throws {Error} when initialisation fails
   */
  async start() {
    // NOTE(review): the result of _start() is not awaited; confirm whether it
    // is synchronous.
    super._start();

    // Async handlers are invoked from event emitters, which ignore the returned
    // promise; log failures here so they do not become unhandled rejections.
    const logFailure = (err) => {
      logger.error(err);
    };

    try {
      this.mode = 0;
      this.backOffStep = 0;
      this.backOffLimit = 6;
      this.backOffTimer = null;
      this.stall = null;
      this.stalled = false;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://erup.knf.gov.pl/View/'],
        'sections': [],
        'sectionLinks': [],
        'brokenReturn': false
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'urls': ['https://www.knf.gov.pl/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych'],
        'sections': [],
        'sectionLinks': []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done': false,
        'searchDone': false,
        'started': false,
        'urls': ['https://www.knf.gov.pl/podmioty/wyszukiwarka_podmiotow', 'https://www.knf.gov.pl/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych', 'https://www.knf.gov.pl/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi'],
        'sections': [],
        'sectionLinks': [],
        'restart': false,
        'metastep': 0
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/PL/KNF`));

      // await this._doNonRepudiation();

      await this._initBrowser(true);
      this.page = await this.browser.newPage();

      // Every page load is routed through processNewPage().
      this.page.on('domcontentloaded', () => {
        this.processNewPage().catch(logFailure);
      });

      this.on('pageChanged', () => {
        this.processNewPage().catch(logFailure);
      });

      this.on('stall', () => {
        this.backoff().catch(logFailure);
      });

      this.on('backoff', () => {
        this.backoff().catch(logFailure);
      });

      this.on('restart', () => {
        this.restart().catch(logFailure);
      });

      this.on('entityComplete', () => {
        this.handleEntityComplete().catch(logFailure);
      });

      this.on('handleEntityIndex', () => {
        this.handleEntityIndex().catch(logFailure);
      });

      this.on('entityDetail', () => {
        this.handleEntityDetail().catch(logFailure);
      });

      this.on('startcs', () => {
        this.handleStartcs();
      });

      // Translate the generic 'serviceDone' into the mode-specific event.
      this.on('serviceDone', () => {
        switch (this.mode) {
          case 0:
            this.emit('paymentServicesDone');
            break;
          case 1:
            this.emit('emoneyServicesDone');
            break;
          case 2:
            this.emit('creditServicesDone');
            break;
        }
      });

      this.on('paymentServicesDone', async () => {
        logger.warn('paymentServicesDone');
        try {
          this.paymentServices.done = true;
          jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
          jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
          this.mode++;
          this.inProgress = false;
          await this._goto(this.emoneyServices.urls[0]);
        }
        catch (e) {
          logger.error(e);
        }
      });

      this.on('emoneyServicesDone', async () => {
        logger.warn('emoneyServicesDone');
        try {
          this.emoneyServices.done = true;
          this.mode++;
          this.inProgress = false;
          await this._goto(this.creditServices.urls[0]);
        }
        catch (e) {
          logger.error(e);
        }
      });

      // The credit services walk through three URLs; `metastep` is the index of
      // the one currently being handled.
      this.on('creditServicesDone', async () => {
        logger.warn('creditServicesDone');
        try {
          if (this.creditServices.metastep === 0) {
            jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links': this.creditServices.links });
            jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
            this.creditServices.metastep++;
            await this._goto(this.creditServices.urls[this.creditServices.metastep]);

            return;
          }

          if (this.creditServices.metastep === 1) {
            this.creditServices.metastep++;
            await this._goto(this.creditServices.urls[this.creditServices.metastep]);

            return;
          }

          if (this.creditServices.metastep === 2) {
            this.creditServices.done = true;
            this.mode++;
            this.inProgress = false;
            this.emit('done');
          }
        }
        catch (e) {
          logger.error(e);
        }
      });

      //
      await this.page.setViewport({ 'width': 1200, 'height': 800 });

      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' }).catch((err) => {
        logger.error(err);
      });

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack survives; anything that is not
      // an Error is still wrapped so callers always receive an Error.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Entry point: runs start().
   *
   * @returns {Promise<void>}
   */
  async run() {
    await this.start();
  }
}
// Expose the PLScrape class as this module's sole export.
module.exports = PLScrape;