Fresh Scraper ======================== ```javascript const Scraper = require('../helpers/scraper'); const cheerio = require('cheerio'); const path = require('path'); const jsonfile = require('jsonfile'); const logger = require('log4js').getLogger('LV'); const url = require('url'); const removeAccents = require('remove-accents-diacritics'); logger.level = process.env.LOGGER_LEVEL || 'warn'; class LVScrape extends Scraper { constructor() { super(); this.id = 'LV'; this.on('done', () => { this._done(); }); this.run = this._throttle(async () => { await this.__run(); }, 5000); if (process.env.NODE_ENV === 'production') this._checkLock().then((l) => { if(l) this.run(); }); } /** * * @param serviceObject * @returns {Promise} */ async processEntityDetails(serviceObject) { await this._randomWait(this.page, 3, 5); serviceObject.links[serviceObject.step].filename = `${filename}.json`; serviceObject.step++; if (serviceObject.step < serviceObject.items) { const newUrl = serviceObject.links[serviceObject.step].href; await this._goto(newUrl); } else this.emit('serviceDone'); } /** * * @returns {Promise} */ async indexRedirector() { switch (this.mode) { case 0: await this.buildIndex(this.paymentServices); break; case 1: await this.buildIndex(this.emoneyServices); break; case 2: await this.buildIndex(this.creditServices); break; } } /** * * @returns {Promise} */ async processRedirector() { switch (this.mode) { case 0: await this.processEntityDetails(this.paymentServices); break; case 1: await this.processEntityDetails(this.emoneyServices); break; case 2: await this.processEntityDetails(this.creditServices); break; } } async processNewPage() { // give the page a few seconds to settle await this._randomWait(this.page, 3, 5); const pageUrl = url.parse(await this.page.url()); if (pageUrl.href === 'chrome-error://chromewebdata/') { logger.warn('Directed to: chrome-error://chromewebdata/'); this.emit('recover'); return; } switch (pageUrl.pathname) { case '/en/our-registers/company-register/': await this.indexRedirector(); break; case '/en/our-registers/company-register/details': await this.processRedirector(); break; case '/en/our-registers/company-register/gransoverskridandehandel/': await this.crossBorderRedirector(); break; default: if (process.env.NODE_ENV) { await this._uploadError(); throw new Error(`Unknown page: ${pageUrl}`); } else { logger.warn('processNewPage Fell through'); logger.warn('currentPage.location', pageUrl); } break; } } /** * * @returns {Promise} */ async attachEvents() { this.on('serviceDone', async () => { switch (this.mode) { case 0: this.emit('paymentServicesDone'); break; case 1: this.emit('emoneyServicesDone'); break; case 2: this.emit('creditServicesDone'); break; } }); this.on('psindexdone', async () => { let newUrl; this.paymentServices.items = this.paymentServices.links.length; logger.info(`${this.paymentServices.items} items indexed`); this.paymentServices.indexStep++; if (this.paymentServices.indexStep >= this.paymentServices.urls.length) newUrl = this.paymentServices.links[this.paymentServices.step].href; else newUrl = this.paymentServices.urls[this.paymentServices.indexStep]; await this._goto(newUrl); }); this.on('emindexdone', async () => { let newUrl; this.emoneyServices.items = this.emoneyServices.links.length; logger.info(`${this.emoneyServices.items} items indexed`); this.emoneyServices.indexStep++; if (this.emoneyServices.indexStep >= this.emoneyServices.urls.length) newUrl = this.emoneyServices.links[this.emoneyServices.step].href; else newUrl = this.emoneyServices.urls[this.emoneyServices.indexStep]; await this._goto(newUrl); }); this.on('ciindexdone', async () => { let newUrl; this.creditServices.items = this.creditServices.links.length; logger.info(`${this.creditServices.items} items indexed`); this.creditServices.indexStep++; if (this.creditServices.indexStep >= this.creditServices.urls.length) newUrl = this.creditServices.links[this.creditServices.step].href; else newUrl = this.creditServices.urls[this.creditServices.indexStep]; await this._goto(newUrl); }); this.on('indexdone', async () => { switch (this.mode) { case 0: this.emit('psindexdone'); break; case 1: this.emit('emindexdone'); break; case 2: this.emit('ciindexdone'); break; } }); this.on('paymentServicesDone', async () => { logger.warn('paymentServicesDone'); try{ this.paymentServices.done = true; jsonfile.writeFileSync(`${this.path}/paymentServices.json`, { 'links': this.paymentServices.links }); jsonfile.writeFileSync(`${this.debugPath}/paymentServices.json`, this.paymentServices); this.mode++; this.inProgress = false; await this._goto(this.emoneyServices.urls[0]); } catch (e) { logger.error(e); } }); this.on('emoneyServicesDone', async () => { logger.warn('emoneyServicesDone'); try{ this.emoneyServices.done = true; jsonfile.writeFileSync(`${this.path}/emoneyServices.json`, { 'links':this.emoneyServices.links }); jsonfile.writeFileSync(`${this.debugPath}/emoneyServices.json`, this.emoneyServices); this.mode++; this.inProgress = false; await this._goto(this.creditServices.urls[0]); } catch (e) { logger.error(e); } }); this.on('creditServicesDone', async () => { logger.warn('creditServicesDone'); try{ this.creditServices.done = true; jsonfile.writeFileSync(`${this.path}/creditServices.json`, { 'links':this.creditServices.links }); jsonfile.writeFileSync(`${this.debugPath}/creditServices.json`, this.creditServices); this.mode++; this.inProgress = false; this.emit('done'); } catch (e) { logger.error(e); } }); } /** * * @returns {Promise} */ async start() { super._start(); try { this.mode = 0; this.rootURI = 'http://www.fktk.lv'; this.paymentServices = { 'items': 0, 'links': [], 'step': 0, 'indexStep': 0, 'visited': false, 'done' : false, 'urls': ['http://www.fktk.lv/en/market/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-institutions/registered-payment-institutions.html'], 'sections' : [], 'sectionLinks' : [] }; this.emoneyServices = { 'items': 0, 'links': [], 'step': 0, 'indexStep': 0, 'visited': false, 'done' : false, 'urls': ['http://www.fktk.lv/en/market/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/electronic-money-institutions/registered-electronic-money-institutions.html'], 'sections' : [], 'sectionLinks' : [] }; this.creditServices = { 'items': 0, 'links': [], 'step': 0, 'indexStep': 0, 'visited': false, 'done' : false, 'searchDone' : false, 'started': false, 'urls': ['http://www.fktk.lv/en/market/credit-institutions/banks.html'], 'sections' : [], 'sectionLinks' : [] }; this.startPage = this.paymentServices.urls[0]; this.emoneyUrl = this.emoneyServices.urls[0]; this.credit = this.creditServices.urls[0]; this.setPath(path.resolve(`${__dirname }/../artefacts/LV/FCMC`)); await this._doNonRepudiation().catch((err) => { logger.warn(err); }); await this._initBrowser(); await this._createBrowserPage(); this.page.on('domcontentloaded', this._throttle(async () => { this.processNewPage().catch((err) => { logger.error('processNewPage fail', err); }); }, 2500)); if (this.eventNames().length === 2) await this.attachEvents(); // await this.page.setViewport({ 'width': 1200, 'height': 800 }); await this._goto(this.startPage, { 'waitUntil':'networkidle0' }); await this._randomWait(this.page, 3, 5); } catch(e) { throw new Error(e); } } async __run() { await this.start(); } } module.exports = LVScrape; ```