1395 lines
34 KiB
JavaScript
1395 lines
34 KiB
JavaScript
const Scraper = require('../helpers/scraper');
|
|
const cheerio = require('cheerio');
|
|
const path = require('path');
|
|
const jsonfile = require('jsonfile');
|
|
const logger = require('log4js').getLogger('(PL)');
|
|
const url = require('url');
|
|
|
|
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
class PLScrape extends Scraper {
|
|
|
|
constructor() {
|
|
super();
|
|
this.setID('PL');
|
|
this.version = '0.0.1-1';
|
|
|
|
this.on('done', () => {
|
|
this._done();
|
|
});
|
|
|
|
this.run = this._throttle(async () => {
|
|
await this.__run();
|
|
}, 5000);
|
|
|
|
if (process.env.NODE_ENV === 'production')
|
|
this._checkLock().then((l) => {
|
|
if(l)
|
|
this.run();
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param rows
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async rowReducer(rows) {
|
|
try{
|
|
const newObj = { } ;
|
|
rows.each((i, elm) => {
|
|
const children = cheerio(elm).children();
|
|
|
|
if (children.length === 2) {
|
|
// we want this data
|
|
const label = this._makeFieldName(cheerio(children.eq(0)).text());
|
|
newObj[label] = this._cleanUp(cheerio(children.eq(1)).text());
|
|
}
|
|
});
|
|
|
|
return newObj;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param items
|
|
* @returns {Promise<Array>}
|
|
*/
|
|
async reduceBullets(items) {
|
|
try{
|
|
const newArray = [] ;
|
|
|
|
items.each((i, elm) => {
|
|
newArray.push(this._cleanUp(cheerio(elm).text()));
|
|
});
|
|
|
|
return newArray;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<string>}
|
|
*/
|
|
async extractCSHeading(html) {
|
|
try{
|
|
const $ = cheerio.load(html);
|
|
|
|
const rawHeading = $('#singleEtity > div > div > div.panel-heading > h2');
|
|
|
|
if ($(rawHeading).length === 0) return '';
|
|
|
|
return this._cleanUp($(rawHeading).text());
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<string>}
|
|
*/
|
|
async extractCSBodyText(html) {
|
|
try{
|
|
const wanted = ['b', 'text'];
|
|
const $ = cheerio.load(html);
|
|
|
|
const rawBody = $('#singleEntityBody');
|
|
|
|
if ($(rawBody).length === 0) return '';
|
|
|
|
const firstRow = $(rawBody).contents()[0];
|
|
|
|
if (wanted.indexOf(firstRow.name) !== -1)
|
|
return this._cleanUp($(firstRow).text());
|
|
|
|
return '';
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<...Map<any, any>[]>}
|
|
*/
|
|
|
|
async extractCSTable(html) {
|
|
try{
|
|
const outMap = new Map([]);
|
|
|
|
const $ = cheerio.load(html);
|
|
|
|
const mainTable = $('#singleEntityBody > table');
|
|
|
|
if ($(mainTable).children().length === 0) return [...outMap];
|
|
|
|
const mainBody = $(mainTable).children()[0];
|
|
const tableRows = $(mainBody).children();
|
|
|
|
tableRows.each((i, elm) => {
|
|
const rows = $(elm);
|
|
const cells = $(rows).children();
|
|
|
|
if (cells.length > 0) {
|
|
const label = this._cleanUp($(cells).eq(0).text());
|
|
const text = this._cleanUp($(cells).eq(1).html());
|
|
|
|
outMap.set(label, text);
|
|
}
|
|
});
|
|
|
|
return [...outMap];
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async extractEntityActivity(html) {
|
|
try{
|
|
const removeCountry = /(Kraj)\s+/g;
|
|
const newObj = {} ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const mainTable = $('div#areatabs1_5 table.tableDynamic');
|
|
|
|
if ($(mainTable).children().length === 0) return newObj;
|
|
|
|
const mainBody = $(mainTable).children()[0];
|
|
const tableRows = $(mainBody).children();
|
|
|
|
tableRows.each((i, elm) => {
|
|
const rows = $(elm).find('tr');
|
|
const listItems = $(elm).find('li');
|
|
|
|
const rawCountryName = this._cleanUp($($(rows)[0]).text()).replace(removeCountry, '');
|
|
|
|
const countryName = this._makeFieldName(rawCountryName);
|
|
|
|
this.reduceBullets(listItems).then((d) => {
|
|
newObj[countryName] = d;
|
|
});
|
|
});
|
|
|
|
return newObj;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<Array>}
|
|
*/
|
|
async extractEntityBranches(html) {
|
|
try{
|
|
const newArray = [] ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const mainTable = $('div#areatabs1_4 table.tableDynamic');
|
|
|
|
if ($(mainTable).children().length === 0) return newArray;
|
|
|
|
const mainBody = $(mainTable).children()[0];
|
|
const tableRows = $(mainBody).children();
|
|
|
|
tableRows.each((i, elm) => {
|
|
const rows = $(elm).find('tr');
|
|
this.rowReducer(rows).then((d) => {
|
|
newArray.push(d);
|
|
});
|
|
});
|
|
|
|
return newArray;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<Array>}
|
|
*/
|
|
async extractEntityAgents(html) {
|
|
try{
|
|
const newArray = [] ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const mainTable = $('div#areatabs1_3 table.tableDynamic');
|
|
|
|
if ($(mainTable).children().length === 0) return newArray;
|
|
|
|
const mainBody = $(mainTable).children()[0];
|
|
const tableRows = $(mainBody).children();
|
|
|
|
tableRows.each((i, elm) => {
|
|
const rows = $(elm).find('tr');
|
|
this.rowReducer(rows).then((d) => {
|
|
newArray.push(d);
|
|
});
|
|
});
|
|
|
|
return newArray;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async extractEntityServices(html) {
|
|
try{
|
|
const newObj = { } ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $('#areatabs1_2 > table tr');
|
|
|
|
const label = this._makeFieldName($(rows).find('.left').text());
|
|
newObj[label] = [];
|
|
|
|
const listItems = $(rows).find('.container100 li');
|
|
|
|
listItems.each((i, elm) => {
|
|
newObj[label].push(this._cleanUp($(elm).text()));
|
|
});
|
|
|
|
return newObj;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async extractEntityDetails(html) {
|
|
try{
|
|
const newObj = { } ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $('div#areatabs1_1 tr');
|
|
|
|
rows.each((i, elm) => {
|
|
const children = cheerio(elm).children();
|
|
|
|
if (children.length === 2) {
|
|
// we want this data
|
|
const label = this._makeFieldName($(children.eq(0)).text());
|
|
newObj[label] = this._cleanUp($(children.eq(1)).text());
|
|
}
|
|
});
|
|
|
|
return newObj;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async entityIndexFirstPass(serviceObject) {
|
|
try{
|
|
// breaks up `1/146 (1455)`
|
|
const breaker = /(\d+)/g;
|
|
|
|
const body = await this.page.content();
|
|
|
|
const $ = cheerio.load(body);
|
|
|
|
const subjectsInfo = $($('.infoNavigation').contents()[2]).text();
|
|
|
|
const brokenString = subjectsInfo.match(breaker);
|
|
|
|
const currentPageIndex = parseInt(brokenString[0], 10);
|
|
const currentPageMax = parseInt(brokenString[1], 10);
|
|
const currentIndexLength = parseInt(brokenString[2], 10);
|
|
|
|
logger.info(`First pass on the ${this.modeTitles[this.mode]} index...`);
|
|
|
|
serviceObject.currentIndexLength = currentIndexLength;
|
|
serviceObject.currentPageMax = currentPageMax;
|
|
serviceObject.currentPageIndex = currentPageIndex;
|
|
|
|
serviceObject.visited = true;
|
|
serviceObject.currentIndex = url.parse(await this.page.url());
|
|
serviceObject.currentMetaIndex = 0;
|
|
|
|
const entityName = `${this.modeNames[this.mode]}_${currentPageIndex}`;
|
|
|
|
const filePath = await this._makeFilePath(entityName);
|
|
|
|
await this._makeScreenshotV2(this.page, filePath, null);
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processCSEntityIndex(serviceObject) {
|
|
try{
|
|
const mouseDownDuration = Scraper.notARobot();
|
|
|
|
serviceObject.currentMetaIndex = serviceObject.step % 20;
|
|
|
|
if ((serviceObject.step > 0) && (serviceObject.currentMetaIndex === 0) && (serviceObject.restart === true)) {
|
|
logger.debug('Maxed out this page..');
|
|
|
|
serviceObject.restart = false;
|
|
|
|
await this.page.waitForSelector('#nextPage', { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
logger.debug('Proceeding to next index page..');
|
|
await elm.click({ 'delay':Scraper.notARobot() });
|
|
this.emit('pageChanged');
|
|
});
|
|
}
|
|
else {
|
|
logger.info(`Dealing with ${serviceObject.step + 1} of ${serviceObject.currentIndexLength}`);
|
|
|
|
serviceObject.restart = true;
|
|
|
|
const wantedRow = await this.page.$$(`#searchEntitites > div:nth-child(${serviceObject.currentMetaIndex + 1}) a`);
|
|
const text = await this.page.evaluate(el => el.innerText, wantedRow[0]);
|
|
|
|
await this._randomWait(this.page, 2, 3);
|
|
|
|
wantedRow[0].click({ 'delay':mouseDownDuration }).then(() => {
|
|
serviceObject.current = { 'name':text };
|
|
|
|
this.emit('pageChanged');
|
|
});
|
|
}
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processEntityIndex(serviceObject) {
|
|
// #j_idt64-tableViewS > tbody > tr:nth-child(1) > td.tableViewNoR
|
|
|
|
logger.debug('###### processEntityIndex');
|
|
try{
|
|
const fields = ['count', 'referenceNumber', 'typeOfEntity', 'name', 'registrationNumber', 'nip', 'date'];
|
|
const mouseDownDuration = Scraper.notARobot();
|
|
|
|
logger.info(`Working on the ${this.modeTitles[this.mode]} index...`);
|
|
|
|
if (serviceObject.visited === false) {
|
|
logger.debug('Preparing...');
|
|
await this.entityIndexFirstPass(serviceObject);
|
|
}
|
|
|
|
let pageMaxContent = 10;
|
|
|
|
await this.page.waitForSelector('#j_idt64-tableViewS > tfoot > tr > td > select', { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
const rawValue = await elm.getProperty('value');
|
|
const value = await rawValue.jsonValue();
|
|
|
|
pageMaxContent = parseInt(value, 10);
|
|
});
|
|
|
|
logger.debug('pageMaxContent', pageMaxContent);
|
|
|
|
if (serviceObject.visited === true) {
|
|
serviceObject.currentMetaIndex = serviceObject.step % pageMaxContent;
|
|
|
|
if ((serviceObject.step ) >= (serviceObject.currentPageIndex * pageMaxContent)) {
|
|
logger.debug('Maxed out this page..');
|
|
|
|
await this.page.waitForSelector('#j_idt64-tableViewS-recordsGoToNext', { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
const isDisabled = elm.disabled;
|
|
|
|
if (!isDisabled) {
|
|
// we need a click..
|
|
serviceObject.visited = false;
|
|
|
|
await this._randomWait(this.page, 1, 2);
|
|
elm.click({ 'delay':mouseDownDuration });
|
|
}
|
|
else {
|
|
logger.debug('Check if we should be done:', serviceObject.step, serviceObject.currentIndexLength);
|
|
if(serviceObject.step >= serviceObject.currentIndexLength) {
|
|
logger.debug('processEntityIndex Done here...');
|
|
this.emit('serviceDone');
|
|
}
|
|
}
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
this.emit('backoff');
|
|
});
|
|
}
|
|
else {
|
|
logger.info(`Dealing with ${serviceObject.step + 1} of ${serviceObject.currentIndexLength}`);
|
|
|
|
const elmStr = `table#j_idt64-tableViewS tbody tr:nth-child(${serviceObject.currentMetaIndex + 1})`;
|
|
|
|
await this.page.waitForSelector(elmStr, { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
await elm.hover().catch((err) => {
|
|
logger.warn(err);
|
|
});
|
|
|
|
await elm.focus();
|
|
|
|
const htmlRow = await this.page.evaluate(el => el.outerHTML, elm);
|
|
|
|
const $ = cheerio.load(`<table>${htmlRow}</table>`);
|
|
const cells = $('td');
|
|
|
|
serviceObject.current = {};
|
|
|
|
cells.each((index, item) => {
|
|
serviceObject.current[ fields[index] ] = $(item).text();
|
|
});
|
|
|
|
await this._randomWait(this.page, 2, 3);
|
|
|
|
await elm.click({ 'delay':mouseDownDuration });
|
|
|
|
await this._findAndClick('#j_idt112 > input.button');
|
|
}).catch((err) => {
|
|
logger.error(err);
|
|
this.emit('backoff');
|
|
});
|
|
}
|
|
}
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async indexRedirector() {
|
|
try{
|
|
await this._randomWait(this.page, 3, 5, 'handleIntroPage');
|
|
|
|
await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
await elm.click({ 'delay':Scraper.notARobot() });
|
|
});
|
|
}
|
|
catch( err) {
|
|
logger.warn('!!!!!');
|
|
logger.error(err);
|
|
await this._uploadError();
|
|
this.emit('stall');
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async csIndexHandler() {
|
|
try{
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
if (this.creditServices.started !== false)
|
|
if (pageUrl.hash === null || pageUrl.hash === '#')
|
|
this.emit('processCSEntityIndex');
|
|
else
|
|
this.emit('entityDetail');
|
|
|
|
else
|
|
this.emit('startcs');
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processCSEntityDetail(serviceObject) {
|
|
try{
|
|
logger.info(`Process ${serviceObject.current.name}`);
|
|
|
|
const newObj = {};
|
|
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
const hash = (pageUrl.hash || '').replace('#', '');
|
|
|
|
const entityName = `${serviceObject.current.name}_${hash}`;
|
|
const fileName = this._makeFileName(entityName);
|
|
const filePath = await this._makeFilePath(entityName);
|
|
|
|
serviceObject.current.fileName = fileName;
|
|
|
|
const body = await this.page.content();
|
|
|
|
newObj.hash = hash;
|
|
newObj.heading = await this.extractCSHeading(body);
|
|
newObj.bodytext = await this.extractCSBodyText(body);
|
|
newObj.table = await this.extractCSTable(body);
|
|
|
|
serviceObject.current = Object.assign(serviceObject.current, newObj);
|
|
|
|
await this._makeScreenshotV2(this.page, `${filePath}`, null);
|
|
|
|
this.emit('entityComplete');
|
|
|
|
logger.info('Entity complete...');
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processEntityDetail(serviceObject) {
|
|
logger.debug('-----> processEntityDetail');
|
|
try{
|
|
const tabs = [
|
|
{ 'id': '', 'name' : 'details' },
|
|
{ 'id': 'div#tabs1_2', 'name' : 'services' },
|
|
{ 'id': 'div#tabs1_3', 'name' : 'agents' },
|
|
{ 'id': 'div#tabs1_4', 'name' : 'branches' },
|
|
{ 'id': 'div#tabs1_5', 'name' : 'activity' }
|
|
];
|
|
|
|
if (serviceObject.visited === false) {
|
|
logger.debug('Process the menu correctly');
|
|
|
|
this.emit('handleEntityIndex');
|
|
|
|
return;
|
|
}
|
|
|
|
logger.debug('====== processEntityDetail ----->');
|
|
logger.info(`Process ${this.modeTitles[this.mode]} // ${serviceObject.current.name}`);
|
|
|
|
const newObj = {};
|
|
|
|
const entityName = `${serviceObject.current.name}_${serviceObject.current.nip}`;
|
|
const fileName = this._makeFileName(entityName);
|
|
const filePath = await this._makeFilePath(entityName);
|
|
|
|
serviceObject.current.fileName = fileName;
|
|
|
|
const body = await this.page.content();
|
|
|
|
newObj.details = await this.extractEntityDetails(body);
|
|
newObj.services = await this.extractEntityServices(body);
|
|
newObj.agents = await this.extractEntityAgents(body);
|
|
newObj.branches = await this.extractEntityBranches(body);
|
|
newObj.activity = await this.extractEntityActivity(body);
|
|
|
|
serviceObject.current = Object.assign(serviceObject.current, newObj);
|
|
|
|
for(const item of tabs)
|
|
if (item.id !== '') {
|
|
const tabExists = await this.page.$$(item.id);
|
|
if (tabExists.length > 0) {
|
|
await this._findAndClick(item.id);
|
|
await this._makeScreenshotV2(this.page, `${filePath}_${item.name}`, null);
|
|
await this._microWait(this.page, 15);
|
|
}
|
|
}
|
|
|
|
this.emit('entityComplete');
|
|
|
|
logger.info('Entity complete...');
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async entityCompleter(serviceObject) {
|
|
try{
|
|
const filename = serviceObject.current.fileName;
|
|
|
|
const filePath = `${this.path}/${filename}`.substring(0, 240);
|
|
|
|
logger.info(`Saving: ${filename}.json`);
|
|
|
|
const newLink = { 'name':serviceObject.current.name, 'fileName':`${filename}.json` };
|
|
|
|
// Payment service
|
|
if (this.mode === 0)
|
|
newLink.nip = serviceObject.current.nip;
|
|
|
|
// Credit Institute
|
|
if (this.mode === 2)
|
|
newLink.hash = serviceObject.current.hash;
|
|
|
|
serviceObject.links.push(newLink);
|
|
|
|
await jsonfile.writeFile(`${filePath}.json`, serviceObject.current);
|
|
|
|
await this._randomWait(this.page, 10, 15, 'Throttled');
|
|
|
|
serviceObject.step++;
|
|
|
|
clearTimeout(this.stall);
|
|
|
|
this.stall = 0;
|
|
|
|
logger.debug('>> this.mode:', this.mode);
|
|
logger.debug('>> serviceObject.step:', serviceObject.step);
|
|
logger.debug('>> serviceObject.currentIndexLength:', serviceObject.currentIndexLength);
|
|
|
|
if (serviceObject.step < serviceObject.currentIndexLength) {
|
|
serviceObject.current = {};
|
|
|
|
// 2019-05-08 :: THIS BIT BROKE TODAY
|
|
if (this.mode === 0)
|
|
await this._findAndClick('#allByJS > tbody > tr:nth-child(8) > td > span:nth-child(2) > input');
|
|
|
|
else {
|
|
await this._findAndClick('#previousSearchPage');
|
|
this.emit('pageChanged');
|
|
}
|
|
}
|
|
else
|
|
this.emit('serviceDone');
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
this.emit('backoff');
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
|
|
async handleXLSDownload() {
|
|
try{
|
|
const entityName = `${this.modeNames[this.mode]}_main`;
|
|
|
|
const filePath = await this._makeFilePath(entityName);
|
|
|
|
await this._makeScreenshotV2(this.page, filePath, null);
|
|
|
|
await this._randomWait(this.page, 3, 6);
|
|
|
|
await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });
|
|
|
|
this._findAndClick('body > section.article-view > div > div > div.col-xs-12.col-lg-9.article-content.pl50-lg > div.row.mb30 > div > div');
|
|
|
|
await this._randomWait(this.page, 3, 6);
|
|
|
|
this.emit('serviceDone');
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<jQuery>}
|
|
*/
|
|
async countCSRows() {
|
|
try{
|
|
const body = await this.page.content();
|
|
|
|
const $ = cheerio.load(body);
|
|
const searchEntitites = $('#searchEntitites');
|
|
|
|
return $(searchEntitites).children().length;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async startcs() {
|
|
try{
|
|
const options = await this.page.$$('#selectCategory option');
|
|
const wantedOption = ['Działalność transgraniczna podmiotów krajowych'];
|
|
for (const item of options) {
|
|
const text = await this.page.evaluate(el => el.innerText, item);
|
|
const value = await this.page.evaluate(el => el.value, item);
|
|
|
|
if (wantedOption.indexOf(text) !== -1) {
|
|
await this.page.select('#selectCategory', value);
|
|
break;
|
|
}
|
|
}
|
|
|
|
await this._randomWait(this.page, 1, 2);
|
|
|
|
await this._findAndClick('#searchButton');
|
|
|
|
await this.page.waitForSelector('#band-cookies-close', { 'timeout':7500 }).then(async (elm) => {
|
|
await elm.click({ 'delay':90 });
|
|
}).catch(() => {
|
|
logger.info('No cookie band...');
|
|
});
|
|
|
|
await this.page.waitForSelector('#searchresults > div.searchresults-counter.border-top.text-uppercase.text-center > p > span', { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
const count = await this.page.evaluate(el => el.innerText, elm);
|
|
this.creditServices.started = true;
|
|
this.creditServices.currentIndexLength = parseInt(count, 10);
|
|
this.creditServices.currentPageLimit = await this.countCSRows();
|
|
|
|
this.emit('pageChanged');
|
|
});
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Reduce the Article 70 Spans into an array
|
|
* @param html
|
|
* @returns {Array}
|
|
*/
|
|
reduceArt70Spans(html) {
|
|
try{
|
|
const output = [];
|
|
|
|
const $ = cheerio.load(html);
|
|
|
|
const spans = $('span');
|
|
|
|
spans.each((i, item) => {
|
|
output.push($(item).text());
|
|
});
|
|
|
|
return output;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Reduce the Article 70 data
|
|
* @param html
|
|
* @returns {Promise<...Map<any, any>[]>}
|
|
*/
|
|
async reduceArt70(html) {
|
|
try{
|
|
const outMap = new Map([]);
|
|
|
|
const $ = cheerio.load(html);
|
|
|
|
const tables = $('table');
|
|
|
|
tables.each(async (i, itm) => {
|
|
const rows = $(itm).find('td');
|
|
|
|
const title = this._cleanUp($($(rows)[1]).text());
|
|
|
|
const spans = $(rows)[3];
|
|
|
|
if($(spans).length > 0) {
|
|
const reducedTable = this.reduceArt70Spans($(spans).html());
|
|
|
|
outMap.set(title, reducedTable);
|
|
}
|
|
});
|
|
|
|
return [...outMap];
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processArt70() {
|
|
try{
|
|
await this.page.waitForSelector('table#entityTable', { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
const html = await this.page.evaluate(el => el.outerHTML, elm);
|
|
|
|
const activities = await this.reduceArt70(html);
|
|
|
|
const entityName = `${this.modeNames[this.mode]}_article70`;
|
|
|
|
const filePath = await this._makeFilePath(entityName);
|
|
|
|
await this._makeScreenshotV2(this.page, filePath, null);
|
|
|
|
logger.info(`Saving: ${entityName}.json`);
|
|
await jsonfile.writeFile(`${filePath}.json`, activities);
|
|
|
|
this.emit('serviceDone');
|
|
});
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleStartcs() {
|
|
await this.page.waitForSelector('#selectCategory', { 'visible': true, 'timeout':90000 }).then(async () => {
|
|
await this.startcs();
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleEntityIndex() {
|
|
let doIndex = false;
|
|
await this.page.waitForSelector('input[value="Lista podmiotów"]', { 'visible':true, 'timeout':3500 }).then(async (elm) => {
|
|
logger.warn('Sent back to the main selector screen');
|
|
await elm.click({ 'delay':90 });
|
|
|
|
doIndex = false;
|
|
}).catch(() => {
|
|
// logger.info('No show all button');
|
|
doIndex = true;
|
|
});
|
|
|
|
if (!doIndex) return;
|
|
|
|
await this.page.waitForSelector('tfoot > tr > td > select', { 'visible': true, 'timeout':90000 }).then(async (elm) => {
|
|
const rawValue = await elm.getProperty('value');
|
|
const value = await rawValue.jsonValue();
|
|
|
|
logger.debug('Dropdown value', value);
|
|
|
|
if (parseInt(value, 10) === 10) {
|
|
doIndex = false;
|
|
await this.page.select('tfoot > tr > td > select', '200');
|
|
logger.debug('Drop down changed..');
|
|
}
|
|
}).catch(() => {
|
|
logger.debug('There was no paging drop down??');
|
|
});
|
|
|
|
if (doIndex)
|
|
await this.processEntityIndex(this.paymentServices).catch(async (err) => {
|
|
logger.error('processEntityIndex catch: ', err);
|
|
this.emit('restart');
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleEntityDetail() {
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this.processEntityDetail(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.processCSEntityDetail(this.creditServices);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this.processEntityDetail(this.paymentServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async handleEntityComplete() {
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this.entityCompleter(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.entityCompleter(this.creditServices);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this.entityCompleter(this.paymentServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<boolean>}
|
|
*/
|
|
async bouncerCheck() {
|
|
let canProceed = 0;
|
|
let msg = 'Bouncer: ';
|
|
await this.page.waitForFunction(
|
|
'document.querySelector("body").innerText.includes("Usługa chwilowo niedostępna. Przepraszamy.");'
|
|
, { 'timeout':2500 }).then(() => {
|
|
msg += '❌';
|
|
}).catch(() => {
|
|
msg += '✔️';
|
|
canProceed++;
|
|
});
|
|
|
|
await this.page.waitForFunction(
|
|
'document.querySelector("body").innerText.length===0'
|
|
, { 'timeout':2500 }).then(() => {
|
|
msg += '❌';
|
|
}).catch(() => {
|
|
msg += '✔️';
|
|
canProceed++;
|
|
});
|
|
|
|
if (canProceed === 2)
|
|
logger.debug(msg);
|
|
else
|
|
logger.warn(msg);
|
|
|
|
return (canProceed === 2);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async processNewPage() {
|
|
// give the page a few seconds to settle
|
|
const removeJSession = /(;jsessionid=[0-9a-f]*)/g;
|
|
await this._randomWait(this.page, 3, 5, 'processNewPage');
|
|
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
|
logger.warn('Directed to: chrome-error://chromewebdata/');
|
|
this.emit('backoff');
|
|
|
|
return;
|
|
}
|
|
|
|
const pathName = (pageUrl.pathname || '').replace(removeJSession, '');
|
|
|
|
// pre check
|
|
logger.debug('Hit:', pathName);
|
|
|
|
const canProceed = await this.bouncerCheck();
|
|
|
|
if (canProceed)
|
|
switch (pathName) {
|
|
|
|
case '/View/':
|
|
case '/View/faces/start2OuterView.xhtml':
|
|
case '/View/faces/dataEdit.xhtml':
|
|
await this.handleEntityIndex();
|
|
break;
|
|
|
|
case '/View/faces/subjectsList.xhtml':
|
|
this.emit('entityDetail');
|
|
break;
|
|
|
|
case '/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych':
|
|
await this.handleXLSDownload();
|
|
break;
|
|
|
|
case '/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych':
|
|
await this.handleXLSDownload();
|
|
break;
|
|
|
|
case '/podmioty/wyszukiwarka_podmiotow':
|
|
await this.csIndexHandler();
|
|
break;
|
|
|
|
case '/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi':
|
|
await this.processArt70();
|
|
break;
|
|
|
|
case '/View/redirect2OuterView.jsp':
|
|
logger.warn('Letting this page transition...');
|
|
// do nothing
|
|
// let the page transition
|
|
break;
|
|
|
|
case '/':
|
|
|
|
if (pageUrl.href === 'chrome-error://chromewebdata/')
|
|
this.emit('backoff');
|
|
else
|
|
throw new Error(`Bad page: ${pageUrl.href}`);
|
|
|
|
break;
|
|
|
|
default:
|
|
if (process.env.NODE_ENV) {
|
|
await this._uploadError();
|
|
// this.emit('backoff');
|
|
throw new Error(`Unknown page: ${pageUrl.href}`);
|
|
}
|
|
else {
|
|
logger.warn('processNewPage Fell through');
|
|
logger.warn('pathName', pathName);
|
|
logger.warn('currentPage.location', pageUrl);
|
|
}
|
|
break;
|
|
|
|
} else {
|
|
logger.warn('We have hit a bouncer.. Back off for a bit');
|
|
this.emit('backoff');
|
|
}
|
|
|
|
// logger.debug('## Page changed', pageUrl);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async _restart() {
|
|
logger.warn(`Tryng to restart ${this.modeTitles[this.mode]}`);
|
|
|
|
if (this.mode === 0) {
|
|
logger.debug('Clearing current object..');
|
|
this.paymentServices.visited = false;
|
|
this.paymentServices.current = {};
|
|
}
|
|
|
|
switch (this.mode) {
|
|
|
|
case 1:
|
|
await this._goto(this.emoneyServices.urls[0]);
|
|
break;
|
|
|
|
case 2:
|
|
await this._goto(this.creditServices.urls[this.creditServices.metastep]);
|
|
break;
|
|
|
|
case 0:
|
|
default:
|
|
await this._goto(this.paymentServices.urls[0]);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async backoff() {
|
|
this.backOffStep++;
|
|
|
|
clearTimeout(this.stall);
|
|
|
|
this.stall = 0;
|
|
|
|
this.stalled = false;
|
|
|
|
if (this.backOffStep > this.backOffLimit) this.backOffStep = this.backOffLimit;
|
|
|
|
logger.warn(`Backing off for ${this.backOffStep * 5} minutes..`);
|
|
const timeout = 300000; // (this.backOffStep * 5) * 60000;
|
|
|
|
logger.warn('timeout', timeout);
|
|
// await this._uploadError();
|
|
|
|
this.backOffTimer = setTimeout(() => {
|
|
this.emit('restart');
|
|
}, timeout);
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async attachEvents() {
|
|
this.on('pageChanged', this._throttle(async () => {
|
|
this.processNewPage().catch((err) => {
|
|
logger.error('processNewPage fail', err);
|
|
});
|
|
}, 2500));
|
|
|
|
this.on('stall', () => {
|
|
this.emit('backoff');
|
|
});
|
|
|
|
this.on('backoff', this._debounce( () => {
|
|
this.backoff();
|
|
}, 10000));
|
|
|
|
/* this.on('backoff', () => {
|
|
this.backoff();
|
|
});*/
|
|
|
|
this.on('restart', async () => {
|
|
await this._restart();
|
|
});
|
|
|
|
this.on('entityComplete', () => {
|
|
this.handleEntityComplete();
|
|
});
|
|
|
|
this.on('handleEntityIndex', () => {
|
|
this.handleEntityIndex();
|
|
});
|
|
|
|
this.on('entityDetail', async () => {
|
|
await this.handleEntityDetail();
|
|
});
|
|
|
|
this.on('startcs', () => {
|
|
this.handleStartcs();
|
|
});
|
|
|
|
this.on('processCSEntityIndex', async () => {
|
|
await this.processCSEntityIndex(this.creditServices).catch(() => {
|
|
this.emit('backoff');
|
|
});
|
|
});
|
|
|
|
this.on('serviceDone', async function() {
|
|
switch (this.mode) {
|
|
|
|
case 0:
|
|
this.emit('paymentServicesDone');
|
|
break;
|
|
|
|
case 1:
|
|
this.emit('emoneyServicesDone');
|
|
break;
|
|
|
|
case 2:
|
|
this.emit('creditServicesDone');
|
|
break;
|
|
|
|
}
|
|
});
|
|
|
|
this.on('paymentServicesDone', async function() {
|
|
logger.warn('paymentServicesDone');
|
|
try{
|
|
this.paymentServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
|
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
await this._goto(this.emoneyServices.urls[0]);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
|
|
this.on('emoneyServicesDone', async function() {
|
|
logger.warn('emoneyServicesDone');
|
|
try{
|
|
this.emoneyServices.done = true;
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
await this._goto(this.creditServices.urls[0]);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
|
|
this.on('creditServicesDone', async function() {
|
|
logger.warn('creditServicesDone');
|
|
try{
|
|
if (this.creditServices.metastep === 0) {
|
|
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
|
this.creditServices.metastep++;
|
|
await this._goto(this.creditServices.urls[this.creditServices.metastep]);
|
|
|
|
return;
|
|
}
|
|
|
|
if (this.creditServices.metastep === 1) {
|
|
this.creditServices.metastep++;
|
|
await this._goto(this.creditServices.urls[this.creditServices.metastep]);
|
|
|
|
return;
|
|
}
|
|
|
|
if (this.creditServices.metastep === 2) {
|
|
this.creditServices.done = true;
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
this.emit('done');
|
|
}
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async start() {
|
|
super._start();
|
|
try {
|
|
this.mode = 0;
|
|
this.backOffStep = 0;
|
|
this.backOffLimit = 3;
|
|
this.backOffTimer = null;
|
|
this.stall = null;
|
|
this.stalled = false;
|
|
|
|
this.paymentServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'urls': ['https://erup.knf.gov.pl/View/'],
|
|
'sections' : [],
|
|
'sectionLinks' : [],
|
|
'brokenReturn' : false,
|
|
'started':0
|
|
};
|
|
|
|
this.emoneyServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'urls': ['https://www.knf.gov.pl/podmioty/podmioty_rynku_uslug_platniczych/Rejestr_malych_instytucji_platniczych'],
|
|
'sections' : [],
|
|
'sectionLinks' : []
|
|
};
|
|
|
|
this.creditServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'searchDone' : false,
|
|
'started': false,
|
|
'urls': ['https://www.knf.gov.pl/podmioty/wyszukiwarka_podmiotow', 'https://www.knf.gov.pl/podmioty/Rejestry_i_Ewidencje/rejestr_instytucji_pozyczkowych', 'https://www.knf.gov.pl/podmioty/Podmioty_sektora_bankowego/Banki_komercyjne_art_70_ust_2_ustawy_o_obrocie_instrumentami_finansowymi'],
|
|
'sections' : [],
|
|
'sectionLinks' : [],
|
|
'restart' : false,
|
|
'metastep' : 0
|
|
};
|
|
|
|
this.startPage = this.paymentServices.urls[0];
|
|
this.emoneyUrl = this.emoneyServices.urls[0];
|
|
this.credit = this.creditServices.urls[0];
|
|
|
|
this.setPath(path.resolve(`${__dirname }/../artefacts/PL/KNF`));
|
|
|
|
await this._doNonRepudiation().catch((err) => {
|
|
logger.warn(err);
|
|
});
|
|
|
|
await this._initBrowser();
|
|
await this._createBrowserPage();
|
|
|
|
this.page.on('domcontentloaded', this._throttle(async () => {
|
|
this.processNewPage().catch((err) => {
|
|
logger.error('processNewPage fail', err);
|
|
});
|
|
}, 2500));
|
|
|
|
if (this.eventNames().length === 2)
|
|
await this.attachEvents();
|
|
|
|
//
|
|
|
|
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
|
|
|
const now = new Date();
|
|
this.paymentServices.started = now.getTime();
|
|
await this._goto(this.startPage, { 'waitUntil':'networkidle0' }).catch((err) => {
|
|
logger.error(err);
|
|
});
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
}
|
|
catch(e) {
|
|
throw new Error(e);
|
|
}
|
|
}
|
|
|
|
async __run() {
|
|
await this.start();
|
|
}
|
|
|
|
}
|
|
|
|
module.exports = PLScrape;
|
|
|