667 lines
17 KiB
JavaScript
667 lines
17 KiB
JavaScript
const Scraper = require('../helpers/scraper');
|
|
const cheerio = require('cheerio');
|
|
const path = require('path');
|
|
const jsonfile = require('jsonfile');
|
|
const logger = require('log4js').getLogger('(LT)');
|
|
const url = require('url');
|
|
|
|
// Log verbosity is taken from the LOGGER_LEVEL environment variable; when it is unset or empty, 'warn' is used.
logger.level = process.env.LOGGER_LEVEL || 'warn';
|
|
|
|
class LTScrape extends Scraper {
|
|
|
|
constructor() {
|
|
super();
|
|
this.setID('LT');
|
|
|
|
this.addToBlockFilters(['smartlook.com', 'd10lpsik1i8c69', 'mouseflow.com', 'inspectlet.com']);
|
|
|
|
this.on('done', () => {
|
|
this._done();
|
|
});
|
|
|
|
this.run = this._throttle(async () => {
|
|
await this.__run();
|
|
}, 5000);
|
|
|
|
if (process.env.NODE_ENV === 'production')
|
|
this._checkLock().then((l) => {
|
|
if(l)
|
|
this.run();
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @param path
|
|
* @returns {Promise<void>}
|
|
*/
|
|
|
|
async extractEntityIntermediaries(html, path = 'item-contra-intermediaries') {
|
|
try{
|
|
const newObj = { } ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $(`#${path} li div.row`);
|
|
|
|
rows.each((i, li) => {
|
|
const children = $(li).children();
|
|
|
|
if ($(children).length === 2) {
|
|
const label = this._makeFieldName($(children).eq(0).text());
|
|
|
|
if (!newObj.hasOwnProperty(label))
|
|
newObj[label] = [];
|
|
|
|
newObj[label].push(this._cleanUp($(children).eq(1).text()));
|
|
}
|
|
});
|
|
|
|
return newObj;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<void>}
|
|
*/
|
|
|
|
async extractEntityList(html) {
|
|
try{
|
|
const newArray = [] ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $('#item-lists li');
|
|
|
|
rows.each((i, li) => {
|
|
const children = $(li).children();
|
|
|
|
if ($(children).length === 1)
|
|
newArray.push(this._cleanUp($(children).eq(0).text()));
|
|
});
|
|
|
|
return newArray;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<void>}
|
|
*/
|
|
|
|
async extractEntityActivity(html) {
|
|
try{
|
|
const newArray = [] ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $('#item-activities tbody tr');
|
|
|
|
rows.each((i, li) => {
|
|
const children = $(li).children();
|
|
|
|
if ($(children).length === 3) {
|
|
const activity = this._cleanUp($(children).eq(0).text());
|
|
const from = this._cleanUp($(children).eq(1).text());
|
|
const to = this._cleanUp($(children).eq(2).text());
|
|
|
|
newArray.push({ activity, from, to });
|
|
}
|
|
});
|
|
|
|
return newArray;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
//
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<void>}
|
|
*/
|
|
|
|
async extractEntityFOSContent(html) {
|
|
try{
|
|
const newObj = {} ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $('#fos-content div.panel-heading');
|
|
|
|
rows.each((i, row) => {
|
|
const label = this._makeFieldName($(row).find('span.l').text());
|
|
|
|
if (!newObj.hasOwnProperty(label))
|
|
newObj[label] = [];
|
|
|
|
const sibling = $(row).next();
|
|
|
|
const tr = $(sibling).find('tbody tr');
|
|
|
|
tr.each((y, item) => {
|
|
const children = $(item).children();
|
|
if ($(children).length === 3) {
|
|
const activity = this._cleanUp($(children).eq(0).text());
|
|
const from = this._cleanUp($(children).eq(1).text());
|
|
const to = this._cleanUp($(children).eq(2).text());
|
|
|
|
newObj[label].push({ activity, from, to });
|
|
}
|
|
});
|
|
});
|
|
|
|
return newObj;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param html
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async extractEntityDetails(html) {
|
|
const spliterRX = /(.+)(?::\s+)(.+)/;
|
|
try{
|
|
const newObj = { } ;
|
|
const $ = cheerio.load(html);
|
|
|
|
const items = $('div.frd-props.text.row p');
|
|
|
|
items.each((i, elm) => {
|
|
const children = cheerio(elm).children();
|
|
if (children.length > 0) {
|
|
const propType = $(children.eq(0)).prop('name');
|
|
|
|
if (propType !== 'a') {
|
|
const ws = $(elm).text().match(spliterRX);
|
|
|
|
const label = this._makeFieldName(ws[1]);
|
|
newObj[label] = this._cleanUp(ws[2]);
|
|
}
|
|
}
|
|
});
|
|
|
|
return newObj;
|
|
}
|
|
catch( err) {
|
|
logger.error(err);
|
|
}
|
|
}
|
|
|
|
async preBuildIndex(serviceObject) {
|
|
await this.page.waitForSelector('#cookies_msg > div > a', { 'timeout':7500 }).then(async (elm) => {
|
|
await elm.click({ 'delay':90 });
|
|
}).catch(() => {
|
|
logger.info('No cookie band...');
|
|
});
|
|
|
|
// Ensure that the max number f items is shown
|
|
|
|
await this.page.waitForSelector('#content > div > div:nth-child(4) > div.totals > form > span > button:nth-child(3)', { 'visible': true, 'timeout':7500 }).then(async (elm) => {
|
|
const cls = await this.page.evaluate(el => el.getAttribute('class'), elm);
|
|
|
|
logger.debug('button class', cls);
|
|
|
|
if (cls === null)
|
|
await elm.click({ 'delay':90 });
|
|
else
|
|
await this.buildIndex(serviceObject);
|
|
});
|
|
}
|
|
|
|
async expandAreas() {
|
|
const divs = ['item-activities', 'item-contra-intermediaries', 'item-intermediaries', 'item-lists', 'foe-countries'];
|
|
|
|
// #content > div > div:nth-child(4) > div > a:nth-child(2)
|
|
|
|
for (const item of divs)
|
|
await this.page.waitForSelector(`div#${item}`, { 'visible': false, 'timeout':2500 }).then(async (elm) => {
|
|
await this.page.evaluate(el => {
|
|
el.removeAttribute('class');
|
|
el.style.display = '';
|
|
}, elm);
|
|
}).catch(() => {
|
|
logger.debug(`No ${item}`);
|
|
});
|
|
|
|
// these needs to load content via ajax
|
|
const fosA = await this.page.$$('#content > div > div:nth-child(4) > div > a[href="#fos-countries"]');
|
|
if (fosA.length === 1) {
|
|
await this.page.waitForSelector('#content > div > div:nth-child(4) > div > a[href="#fos-countries"]', { 'timeout':2500 }).then(async (elm) => {
|
|
await elm.click({ 'delay':90 });
|
|
}).catch(() => {
|
|
logger.debug('No #fos-countries');
|
|
});
|
|
|
|
// #fos-countries > div > div > div.modal-body > div > div > i
|
|
await this.page.waitForSelector('#fos-countries > div > div > div.modal-body > div > div > i', { 'visible': false, 'timeout':10000 });
|
|
|
|
await this.page.waitForSelector('div#fos-countries', { 'visible': true, 'timeout':2500 }).then(async (elm) => {
|
|
await this.page.evaluate(async el => {
|
|
el.style.display = '';
|
|
await el.removeAttribute('class');
|
|
}, elm);
|
|
}).catch(() => {
|
|
logger.debug('No #fos-countries');
|
|
});
|
|
|
|
await this.page.waitForSelector('div.modal-backdrop.in', { 'visible': true, 'timeout':2500 }).then(async (elm) => {
|
|
await this.page.evaluate(async el => {
|
|
el.style.height = '0px';
|
|
el.style.display = 'none';
|
|
await el.removeAttribute('class');
|
|
}, elm);
|
|
}).catch(() => {
|
|
logger.debug('No #fos-countries');
|
|
});
|
|
}
|
|
}
|
|
|
|
async extractIndex(html) {
|
|
const links = [];
|
|
const slashRgx = /(\/\/)/;
|
|
const $ = cheerio.load(html);
|
|
|
|
const rows = $('table.table tbody tr');
|
|
|
|
rows.each((index, item) => {
|
|
const children = $(item.children);
|
|
|
|
const title = this._cleanUp($(children).eq(1).text()) ;
|
|
|
|
const type = this._cleanUp($(children).eq(3).text()) ;
|
|
const businessForm = this._cleanUp($(children).eq(5).text()) ;
|
|
|
|
const rawUrl = $(children).eq(1).find('a').attr('href');
|
|
|
|
const href = rawUrl.replace(slashRgx, 'https://');
|
|
|
|
links.push({ 'id': title, 'href': href, 'type': type, 'businessForm':businessForm });
|
|
});
|
|
|
|
return links;
|
|
}
|
|
|
|
async processEntityPage(serviceObject) {
|
|
const newObj = {};
|
|
|
|
const id = serviceObject.links[serviceObject.step].id;
|
|
logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${id}`);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const entityName = serviceObject.links[serviceObject.step].id;
|
|
const fileName = this._makeFileName(entityName);
|
|
const filePath = await this._makeFilePath(entityName);
|
|
|
|
await this.expandAreas();
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
await this._makeScreenshotV2(this.page, `${filePath}_main`, null);
|
|
|
|
const body = await this.page.content();
|
|
|
|
newObj.details = await this.extractEntityDetails(body);
|
|
newObj.contraIntermediaries = await this.extractEntityIntermediaries(body, 'item-contra-intermediaries');
|
|
newObj.intermediaries = await this.extractEntityIntermediaries(body, 'item-intermediaries');
|
|
newObj.list = await this.extractEntityList(body);
|
|
newObj.activity = await this.extractEntityActivity(body);
|
|
newObj.foeCountries = await this.extractEntityIntermediaries(body, 'foe-countries');
|
|
newObj.fosContent = await this.extractEntityFOSContent(body);
|
|
|
|
await jsonfile.writeFile(`${filePath}.json`, newObj);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
// await this._randomWait(this.page, 1000, 1000, 'Throttled');
|
|
|
|
serviceObject.links[serviceObject.step].filename = `${fileName}.json`;
|
|
serviceObject.step++;
|
|
|
|
if (serviceObject.step < serviceObject.items) {
|
|
const newUrl = serviceObject.links[serviceObject.step].href;
|
|
|
|
await this._goto(newUrl);
|
|
}
|
|
else
|
|
this.emit('serviceDone');
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @param serviceObject
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async buildIndex(serviceObject) {
|
|
logger.info(`Building the ${this.modeTitles[this.mode]} index...`);
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const body = await this.page.content();
|
|
|
|
const entityName = `${this.modeNames[this.mode]}`;
|
|
|
|
const filePath = await this._makeFilePath(entityName);
|
|
|
|
await this._makeScreenshotV2(this.page, filePath, null);
|
|
|
|
const links = await this.extractIndex(body);
|
|
|
|
serviceObject.links = links.slice();
|
|
|
|
this.emit('indexdone');
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async indexRedirector() {
|
|
switch (this.mode) {
|
|
|
|
case 0:
|
|
await this.preBuildIndex(this.paymentServices);
|
|
break;
|
|
|
|
case 1:
|
|
await this.preBuildIndex(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.preBuildIndex(this.creditServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
async processRedirector() {
|
|
switch (this.mode) {
|
|
|
|
case 0:
|
|
await this.processEntityPage(this.paymentServices);
|
|
break;
|
|
|
|
case 1:
|
|
await this.processEntityPage(this.emoneyServices);
|
|
break;
|
|
|
|
case 2:
|
|
await this.processEntityPage(this.creditServices);
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
async processNewPage() {
|
|
// give the page a few seconds to settle
|
|
const rX = /(\/en\/sfi-financial-market-participants)(\/?)/;
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
|
|
const pageUrl = url.parse(await this.page.url());
|
|
|
|
if (pageUrl.href === 'chrome-error://chromewebdata/') {
|
|
logger.warn('Directed to: chrome-error://chromewebdata/');
|
|
this.emit('recover');
|
|
|
|
return;
|
|
}
|
|
|
|
const pathName = pageUrl.pathname.match(rX)[0];
|
|
|
|
logger.debug(pathName);
|
|
|
|
switch (pathName) {
|
|
|
|
case '/en/sfi-financial-market-participants':
|
|
await this.indexRedirector();
|
|
break;
|
|
|
|
case '/en/sfi-financial-market-participants/':
|
|
await this.processRedirector();
|
|
break;
|
|
|
|
default:
|
|
if (process.env.NODE_ENV) {
|
|
await this._uploadError();
|
|
throw new Error(`Unknown page: ${pageUrl}`);
|
|
}
|
|
else {
|
|
logger.warn('processNewPage Fell through');
|
|
logger.warn('currentPage.location', pageUrl);
|
|
}
|
|
break;
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async attachEvents() {
|
|
this.on('entityComplete', () => {
|
|
this.handleEntityComplete();
|
|
});
|
|
|
|
this.on('indexdone', async () => {
|
|
switch (this.mode) {
|
|
|
|
case 0:
|
|
this.emit('psindexdone');
|
|
break;
|
|
|
|
case 1:
|
|
this.emit('emindexdone');
|
|
break;
|
|
|
|
case 2:
|
|
this.emit('ciindexdone');
|
|
break;
|
|
|
|
}
|
|
});
|
|
|
|
this.on('serviceDone', async () => {
|
|
switch (this.mode) {
|
|
|
|
case 0:
|
|
this.emit('paymentServicesDone');
|
|
break;
|
|
|
|
case 1:
|
|
this.emit('emoneyServicesDone');
|
|
break;
|
|
|
|
case 2:
|
|
this.emit('creditServicesDone');
|
|
break;
|
|
|
|
}
|
|
});
|
|
|
|
this.on('psindexdone', async () => {
|
|
this.paymentServices.items = this.paymentServices.links.length;
|
|
logger.info(`${this.paymentServices.items} items indexed`);
|
|
// logger.debug(this.paymentServices.links);
|
|
|
|
const newUrl = this.paymentServices.links[this.paymentServices.step].href;
|
|
|
|
await this._goto(newUrl);
|
|
});
|
|
|
|
this.on('emindexdone', async () => {
|
|
this.emoneyServices.items = this.emoneyServices.links.length;
|
|
logger.info(`${this.emoneyServices.items} items indexed`);
|
|
// logger.debug(this.paymentServices.links);
|
|
|
|
const newUrl = this.emoneyServices.links[this.emoneyServices.step].href;
|
|
|
|
await this._goto(newUrl);
|
|
});
|
|
|
|
this.on('ciindexdone', async () => {
|
|
this.creditServices.items = this.creditServices.links.length;
|
|
logger.info(`${this.creditServices.items} items indexed`);
|
|
// logger.debug(this.paymentServices.links);
|
|
|
|
const newUrl = this.creditServices.links[this.creditServices.step].href;
|
|
|
|
await this._goto(newUrl);
|
|
});
|
|
|
|
this.on('paymentServicesDone', async () => {
|
|
logger.warn('paymentServicesDone');
|
|
try{
|
|
this.paymentServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices);
|
|
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
await this._goto(this.emoneyServices.urls[0]);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
|
|
this.on('emoneyServicesDone', async () => {
|
|
logger.warn('emoneyServicesDone');
|
|
try{
|
|
this.emoneyServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices);
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
await this._goto(this.creditServices.urls[0]);
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
|
|
this.on('creditServicesDone', async () => {
|
|
logger.warn('creditServicesDone');
|
|
try{
|
|
this.creditServices.done = true;
|
|
jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links });
|
|
jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices);
|
|
this.mode++;
|
|
this.inProgress = false;
|
|
|
|
this.emit('done');
|
|
}
|
|
catch (e) {
|
|
logger.error(e);
|
|
}
|
|
});
|
|
}
|
|
|
|
/**
|
|
*
|
|
* @returns {Promise<void>}
|
|
*/
|
|
async start() {
|
|
super._start();
|
|
try {
|
|
this.mode = 0;
|
|
|
|
this.paymentServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=6&type%5B%5D=20&business_form%5B%5D=28&business_form%5B%5D=27&business_form%5B%5D=89'],
|
|
'sections' : [],
|
|
'sectionLinks' : []
|
|
};
|
|
|
|
this.emoneyServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=7&type%5B%5D=21&business_form%5B%5D=32&business_form%5B%5D=33'],
|
|
'sections' : [],
|
|
'sectionLinks' : []
|
|
};
|
|
|
|
this.creditServices = {
|
|
'items': 0,
|
|
'links': [],
|
|
'step': 0,
|
|
'indexStep': 0,
|
|
'visited': false,
|
|
'done' : false,
|
|
'searchDone' : false,
|
|
'started': false,
|
|
'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=3&type%5B%5D=27&business_form%5B%5D=82&business_form%5B%5D=22&business_form%5B%5D=110'],
|
|
'sections' : [],
|
|
'sectionLinks' : []
|
|
};
|
|
|
|
this.startPage = this.paymentServices.urls[0];
|
|
this.emoneyUrl = this.emoneyServices.urls[0];
|
|
this.credit = this.creditServices.urls[0];
|
|
|
|
this.setPath(path.resolve(`${__dirname }/../artefacts/LT/LB`));
|
|
|
|
await this._doNonRepudiation().catch((err) => {
|
|
logger.warn(err);
|
|
});
|
|
|
|
// start the browser
|
|
|
|
await this._initBrowser();
|
|
await this._createBrowserPage();
|
|
|
|
this.page.on('domcontentloaded', this._throttle(async () => {
|
|
this.processNewPage().catch((err) => {
|
|
logger.error('processNewPage fail', err);
|
|
});
|
|
}, 2500));
|
|
|
|
if (this.eventNames().length === 2)
|
|
await this.attachEvents();
|
|
|
|
//
|
|
|
|
await this.page.setViewport({ 'width': 1200, 'height': 800 });
|
|
await this._goto(this.startPage, { 'waitUntil':'networkidle0' });
|
|
|
|
await this._randomWait(this.page, 3, 5);
|
|
}
|
|
catch(e) {
|
|
throw new Error(e);
|
|
}
|
|
}
|
|
|
|
async __run() {
|
|
await this.start();
|
|
}
|
|
|
|
}
|
|
|
|
// The scraper class is this module's only export.
module.exports = LTScrape;
|