const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('(LT)');
const url = require('url');

logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Scraper for the `sfi-financial-market-participants` register on www.lb.lt.
 *
 * The scrape runs as three consecutive passes selected by `this.mode`
 * (0 = paymentServices, 1 = emoneyServices, 2 = creditServices). Each pass
 * loads an index page, collects the entity links from it, then visits every
 * entity page in turn and writes one JSON file per entity.
 *
 * Flow control is event driven: page loads trigger `processNewPage`, which
 * dispatches to the index or entity handlers; those emit `indexdone` /
 * `serviceDone`, which `attachEvents` translates into per-service events.
 */
class LTScrape extends Scraper {

  constructor() {
    super();
    this.setID('LT');
    this.addToBlockFilters(['smartlook.com', 'd10lpsik1i8c69', 'mouseflow.com', 'inspectlet.com']);

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    // In production the scrape starts itself, but only when _checkLock()
    // resolves truthy. A rejection is logged rather than left unhandled.
    if (process.env.NODE_ENV === 'production')
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          logger.error(err);
        });
  }

  /**
   * Collects label/value rows from the list inside the element with id `path`.
   *
   * Only `li div.row` elements with exactly two child elements are used: the
   * first child's text becomes the field name (via `_makeFieldName`), the
   * second child's text (via `_cleanUp`) is appended to that field's array.
   *
   * @param {string} html - full page markup
   * @param {string} path - id of the container element (without the `#`)
   * @returns {Promise<Object<string, string[]>|undefined>} map of field name
   *   to values; `undefined` if parsing threw (the error is logged)
   */
  async extractEntityIntermediaries(html, path = 'item-contra-intermediaries') {
    try {
      const newObj = {};
      const $ = cheerio.load(html);

      $(`#${path} li div.row`).each((i, row) => {
        const cells = $(row).children();

        if (cells.length !== 2) return;

        const label = this._makeFieldName(cells.eq(0).text());

        if (!Object.prototype.hasOwnProperty.call(newObj, label)) newObj[label] = [];

        newObj[label].push(this._cleanUp(cells.eq(1).text()));
      });

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Collects the text of every `#item-lists li` that has exactly one child
   * element.
   *
   * @param {string} html - full page markup
   * @returns {Promise<string[]|undefined>} cleaned-up entries; `undefined` if
   *   parsing threw (the error is logged)
   */
  async extractEntityList(html) {
    try {
      const newArray = [];
      const $ = cheerio.load(html);

      $('#item-lists li').each((i, li) => {
        const kids = $(li).children();

        if (kids.length === 1) newArray.push(this._cleanUp(kids.eq(0).text()));
      });

      return newArray;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reads the `#item-activities` table. Each body row with exactly three
   * cells yields `{ activity, from, to }`.
   *
   * @param {string} html - full page markup
   * @returns {Promise<Array<{activity: string, from: string, to: string}>|undefined>}
   *   one entry per usable row; `undefined` if parsing threw (the error is logged)
   */
  async extractEntityActivity(html) {
    try {
      const newArray = [];
      const $ = cheerio.load(html);

      $('#item-activities tbody tr').each((i, tr) => {
        const cells = $(tr).children();

        if (cells.length !== 3) return;

        const activity = this._cleanUp(cells.eq(0).text());
        const from = this._cleanUp(cells.eq(1).text());
        const to = this._cleanUp(cells.eq(2).text());

        newArray.push({ activity, from, to });
      });

      return newArray;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reads the panels inside `#fos-content`. Every `div.panel-heading`
   * provides a label (text of its `span.l`); the element that immediately
   * follows the heading provides table rows, and each row with exactly three
   * cells is stored under that label as `{ activity, from, to }`.
   *
   * @param {string} html - full page markup
   * @returns {Promise<Object<string, Array<{activity: string, from: string, to: string}>>|undefined>}
   *   rows grouped by label; `undefined` if parsing threw (the error is logged)
   */
  async extractEntityFOSContent(html) {
    try {
      const newObj = {};
      const $ = cheerio.load(html);

      $('#fos-content div.panel-heading').each((i, heading) => {
        const label = this._makeFieldName($(heading).find('span.l').text());

        if (!Object.prototype.hasOwnProperty.call(newObj, label)) newObj[label] = [];

        $(heading).next().find('tbody tr').each((y, tr) => {
          const cells = $(tr).children();

          if (cells.length !== 3) return;

          const activity = this._cleanUp(cells.eq(0).text());
          const from = this._cleanUp(cells.eq(1).text());
          const to = this._cleanUp(cells.eq(2).text());

          newObj[label].push({ activity, from, to });
        });
      });

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Reads the `Label: value` paragraphs in `div.frd-props.text.row`.
   *
   * A paragraph is used only when it has at least one child element and that
   * first child is not an `<a>`. Paragraphs whose text does not contain a
   * `label: value` pair are skipped, so a single odd paragraph no longer
   * discards every detail already collected.
   *
   * @param {string} html - full page markup
   * @returns {Promise<Object<string, string>|undefined>} field name to value;
   *   `undefined` if parsing threw (the error is logged)
   */
  async extractEntityDetails(html) {
    const spliterRX = /(.+)(?::\s+)(.+)/;

    try {
      const newObj = {};
      const $ = cheerio.load(html);

      $('div.frd-props.text.row p').each((i, elm) => {
        // Use the loaded `$` like every other extractor in this class
        const children = $(elm).children();

        if (children.length === 0) return;

        const propType = $(children.eq(0)).prop('name');

        if (propType === 'a') return;

        const ws = $(elm).text().match(spliterRX);

        // No "label: value" pair in this paragraph
        if (ws === null) return;

        newObj[this._makeFieldName(ws[1])] = this._cleanUp(ws[2]);
      });

      return newObj;
    }
    catch (err) {
      logger.error(err);
    }
  }

  /**
   * Prepares an index page before it is scraped: dismisses the cookie banner
   * when one appears, then inspects the third button of the totals form.
   * When that button has no `class` attribute it is clicked; otherwise the
   * index is built straight away via `buildIndex`.
   *
   * @param {Object} serviceObject - state object of the service being indexed
   * @returns {Promise<void>} rejects if the totals button never becomes visible
   */
  async preBuildIndex(serviceObject) {
    try {
      const cookieLink = await this.page.waitForSelector('#cookies_msg > div > a', { 'timeout': 7500 });

      await cookieLink.click({ 'delay': 90 });
    }
    catch (err) {
      logger.info('No cookie band...');
    }

    // Ensure that the max number of items is shown
    const maxButton = await this.page.waitForSelector('#content > div > div:nth-child(4) > div.totals > form > span > button:nth-child(3)', { 'visible': true, 'timeout': 7500 });
    const cls = await this.page.evaluate(el => el.getAttribute('class'), maxButton);

    logger.debug('button class', cls);

    // NOTE(review): presumably a class-less button means "not selected yet"
    // and the click reloads the page with the larger page size — confirm.
    if (cls === null)
      await maxButton.click({ 'delay': 90 });
    else
      await this.buildIndex(serviceObject);
  }

  /**
   * Makes the collapsed sections of an entity page visible so that they show
   * up in the screenshot and in the page content. The static sections have
   * their `class` attribute and inline `display` removed; the `#fos-countries`
   * section is opened by clicking its link first, after which the section and
   * the modal backdrop are adjusted in the same way.
   *
   * @returns {Promise<void>}
   */
  async expandAreas() {
    const divs = ['item-activities', 'item-contra-intermediaries', 'item-intermediaries', 'item-lists', 'foe-countries'];

    for (const item of divs) {
      try {
        const section = await this.page.waitForSelector(`div#${item}`, { 'visible': false, 'timeout': 2500 });

        await this.page.evaluate(el => {
          el.removeAttribute('class');
          el.style.display = '';
        }, section);
      }
      catch (err) {
        logger.debug(`No ${item}`);
      }
    }

    // these needs to load content via ajax
    const fosSelector = '#content > div > div:nth-child(4) > div > a[href="#fos-countries"]';
    const fosA = await this.page.$$(fosSelector);

    if (fosA.length !== 1) return;

    try {
      const fosLink = await this.page.waitForSelector(fosSelector, { 'timeout': 2500 });

      await fosLink.click({ 'delay': 90 });
    }
    catch (err) {
      logger.debug('No #fos-countries');
    }

    // NOTE(review): `visible: false` is kept from the original; if the intent
    // is to wait for the loading icon to disappear, Puppeteer's `hidden: true`
    // may be what is wanted — confirm against the Puppeteer documentation.
    await this.page.waitForSelector('#fos-countries > div > div > div.modal-body > div > div > i', { 'visible': false, 'timeout': 10000 });

    try {
      const fosPanel = await this.page.waitForSelector('div#fos-countries', { 'visible': true, 'timeout': 2500 });

      await this.page.evaluate(el => {
        el.style.display = '';
        el.removeAttribute('class');
      }, fosPanel);
    }
    catch (err) {
      logger.debug('No #fos-countries');
    }

    try {
      const backdrop = await this.page.waitForSelector('div.modal-backdrop.in', { 'visible': true, 'timeout': 2500 });

      await this.page.evaluate(el => {
        el.style.height = '0px';
        el.style.display = 'none';
        el.removeAttribute('class');
      }, backdrop);
    }
    catch (err) {
      logger.debug('No div.modal-backdrop.in');
    }
  }

  /**
   * Extracts the entity links from an index page.
   *
   * Rows are read through `item.children`, i.e. the raw child nodes of each
   * `<tr>`, which is why the title, type and business form are taken from
   * positions 1, 3 and 5 rather than 0, 1 and 2. Rows without a link in the
   * title cell are skipped. Protocol-relative hrefs (`//host/...`) are turned
   * into `https://host/...`; any other href is kept as found.
   *
   * @param {string} html - full page markup
   * @returns {Promise<Array<{id: string, href: string, type: string, businessForm: string}>>}
   */
  async extractIndex(html) {
    const links = [];
    const protocolRelativeRgx = /^\/\//;
    const $ = cheerio.load(html);

    $('table.table tbody tr').each((index, item) => {
      const nodes = $(item.children);
      const rawUrl = nodes.eq(1).find('a').attr('href');

      if (typeof rawUrl !== 'string') {
        logger.warn(`Index row ${index} has no link, skipping`);

        return;
      }

      links.push({
        'id': this._cleanUp(nodes.eq(1).text()),
        'href': rawUrl.replace(protocolRelativeRgx, 'https://'),
        'type': this._cleanUp(nodes.eq(3).text()),
        'businessForm': this._cleanUp(nodes.eq(5).text())
      });
    });

    return links;
  }

  /**
   * Scrapes the entity page that is currently loaded: expands the hidden
   * sections, takes a screenshot, runs every extractor over the page content
   * and writes the combined result to `<filePath>.json`. Afterwards the step
   * counter is advanced and either the next entity is loaded or
   * `serviceDone` is emitted.
   *
   * @param {Object} serviceObject - state object of the service being processed;
   *   `links[step]` is the entity being scraped
   * @returns {Promise<void>}
   */
  async processEntityPage(serviceObject) {
    const current = serviceObject.links[serviceObject.step];
    const entityName = current.id;

    logger.info(`Process ${serviceObject.step} of ${serviceObject.items} // ${this.modeTitles[this.mode]} entity:${entityName}`);

    await this._randomWait(this.page, 3, 5);

    const fileName = this._makeFileName(entityName);
    const filePath = await this._makeFilePath(entityName);

    await this.expandAreas();

    await this._randomWait(this.page, 3, 5);

    await this._makeScreenshotV2(this.page, `${filePath}_main`, null);

    const body = await this.page.content();
    const newObj = {};

    newObj.details = await this.extractEntityDetails(body);
    newObj.contraIntermediaries = await this.extractEntityIntermediaries(body, 'item-contra-intermediaries');
    newObj.intermediaries = await this.extractEntityIntermediaries(body, 'item-intermediaries');
    newObj.list = await this.extractEntityList(body);
    newObj.activity = await this.extractEntityActivity(body);
    newObj.foeCountries = await this.extractEntityIntermediaries(body, 'foe-countries');
    newObj.fosContent = await this.extractEntityFOSContent(body);

    await jsonfile.writeFile(`${filePath}.json`, newObj);

    await this._randomWait(this.page, 3, 5);

    current.filename = `${fileName}.json`;

    serviceObject.step++;

    if (serviceObject.step < serviceObject.items)
      await this._goto(serviceObject.links[serviceObject.step].href);
    else
      this.emit('serviceDone');
  }

  /**
   * Builds the link index from the index page that is currently loaded:
   * screenshots the page, extracts the links into `serviceObject.links` and
   * emits `indexdone`.
   *
   * @param {Object} serviceObject - state object that receives the links
   * @returns {Promise<void>}
   */
  async buildIndex(serviceObject) {
    logger.info(`Building the ${this.modeTitles[this.mode]} index...`);

    await this._randomWait(this.page, 3, 5);

    const body = await this.page.content();
    const entityName = `${this.modeNames[this.mode]}`;
    const filePath = await this._makeFilePath(entityName);

    await this._makeScreenshotV2(this.page, filePath, null);

    const links = await this.extractIndex(body);

    serviceObject.links = links.slice();

    this.emit('indexdone');
  }

  /**
   * Runs `preBuildIndex` for the service selected by `this.mode`. Does
   * nothing when the mode is not 0, 1 or 2.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    const service = [this.paymentServices, this.emoneyServices, this.creditServices][this.mode];

    if (service) await this.preBuildIndex(service);
  }

  /**
   * Runs `processEntityPage` for the service selected by `this.mode`. Does
   * nothing when the mode is not 0, 1 or 2.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    const service = [this.paymentServices, this.emoneyServices, this.creditServices][this.mode];

    if (service) await this.processEntityPage(service);
  }

  /**
   * Entry point for every page load. Decides from the page URL whether the
   * page is the index (`/en/sfi-financial-market-participants`) or an entity
   * page (the same path followed by `/`) and dispatches accordingly.
   *
   * A Chrome error page emits `recover`. Any other path is treated as an
   * unknown page: when `NODE_ENV` is set the error is uploaded and an Error
   * is thrown, otherwise a warning is logged.
   *
   * @returns {Promise<void>}
   * @throws {Error} on an unknown page when `NODE_ENV` is set
   */
  async processNewPage() {
    const rX = /(\/en\/sfi-financial-market-participants)(\/?)/;

    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    // A non-matching path must reach the `default` branch instead of
    // throwing a TypeError on `null[0]`.
    const matched = (pageUrl.pathname || '').match(rX);
    const pathName = matched === null ? null : matched[0];

    logger.debug(pathName);

    switch (pathName) {

      case '/en/sfi-financial-market-participants':
        await this.indexRedirector();
        break;

      case '/en/sfi-financial-market-participants/':
        await this.processRedirector();
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }
        else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl);
        }
        break;

    }
  }

  /**
   * Wires up the event chain that drives the scrape:
   *
   * - `indexdone`   -> `psindexdone` / `emindexdone` / `ciindexdone` (by mode)
   * - `serviceDone` -> `paymentServicesDone` / `emoneyServicesDone` /
   *                    `creditServicesDone` (by mode)
   * - `*indexdone`  -> records the item count and loads the first entity
   * - `*Done`       -> writes the link list and debug dump, advances the mode
   *                    and loads the next index (or emits `done` after the
   *                    last service)
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    const indexEvents = ['psindexdone', 'emindexdone', 'ciindexdone'];
    const doneEvents = ['paymentServicesDone', 'emoneyServicesDone', 'creditServicesDone'];

    // Listener for a per-service "index done" event. `key` names the state
    // object on `this`.
    const onIndexDone = (key) => async () => {
      try {
        const service = this[key];

        service.items = service.links.length;
        logger.info(`${service.items} items indexed`);

        // An empty index has no first entity to load; finish the service
        // instead of failing on `links[step].href`.
        if (service.step >= service.items) {
          logger.warn(`${key}: nothing to process`);
          this.emit('serviceDone');

          return;
        }

        await this._goto(service.links[service.step].href);
      }
      catch (e) {
        logger.error(e);
      }
    };

    // Listener for a per-service "done" event. `next` performs whatever
    // follows the service (load the next index, or finish the scrape).
    const onServiceDone = (key, next) => async () => {
      logger.warn(`${key}Done`);

      try {
        const service = this[key];

        service.done = true;

        jsonfile.writeFileSync(`${this.path}/${key}.json`, { 'links': service.links });
        jsonfile.writeFileSync(`${this.debugPath}/${key}.json`, service);

        this.mode++;
        this.inProgress = false;

        await next();
      }
      catch (e) {
        logger.error(e);
      }
    };

    this.on('entityComplete', () => {
      this.handleEntityComplete();
    });

    this.on('indexdone', async () => {
      const eventName = indexEvents[this.mode];

      if (eventName) this.emit(eventName);
    });

    this.on('serviceDone', async () => {
      const eventName = doneEvents[this.mode];

      if (eventName) this.emit(eventName);
    });

    this.on('psindexdone', onIndexDone('paymentServices'));
    this.on('emindexdone', onIndexDone('emoneyServices'));
    this.on('ciindexdone', onIndexDone('creditServices'));

    this.on('paymentServicesDone', onServiceDone('paymentServices', () => this._goto(this.emoneyServices.urls[0])));
    this.on('emoneyServicesDone', onServiceDone('emoneyServices', () => this._goto(this.creditServices.urls[0])));
    this.on('creditServicesDone', onServiceDone('creditServices', () => {
      this.emit('done');
    }));
  }

  /**
   * Resets the per-service state, starts the browser, hooks page loads up to
   * `processNewPage` and opens the first index page.
   *
   * @returns {Promise<void>}
   * @throws {Error} whatever the start-up sequence threw; non-Error values
   *   are wrapped in an Error
   */
  async start() {
    super._start();

    try {
      this.mode = 0;

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=6&type%5B%5D=20&business_form%5B%5D=28&business_form%5B%5D=27&business_form%5B%5D=89'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=7&type%5B%5D=21&business_form%5B%5D=32&business_form%5B%5D=33'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'searchDone' : false,
        'started': false,
        'urls': ['https://www.lb.lt/en/sfi-financial-market-participants?ff=1&market=1&type%5B%5D=3&type%5B%5D=27&business_form%5B%5D=82&business_form%5B%5D=22&business_form%5B%5D=110'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/LT/LB`));

      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      // start the browser
      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // NOTE(review): presumably "2" means only the listeners registered
      // before start() exist, i.e. attachEvents has not run yet — confirm
      // against the base class.
      if (this.eventNames().length === 2)
        await this.attachEvents();

      await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });

      await this._randomWait(this.page, 3, 5);
    }
    catch (e) {
      // Rethrow the original error so its stack and type survive
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Throttled entry point target; simply starts the scrape.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }

}

module.exports = LTScrape;