2019-03-21 11:11:07 +00:00
|
|
|
Fresh Scraper
|
|
|
|
========================
|
|
|
|
|
|
|
|
|
|
|
|
```javascript
|
|
|
|
|
|
|
|
const Scraper = require('../helpers/scraper');
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const path = require('path');
|
|
|
|
const jsonfile = require('jsonfile');
|
|
|
|
const logger = require('log4js').getLogger('LV');
|
|
|
|
const url = require('url');
|
2019-05-21 15:40:16 +00:00
|
|
|
const removeAccents = require('remove-accents-diacritics');
|
2019-03-21 11:11:07 +00:00
|
|
|
|
|
|
|
// Log threshold: use LOGGER_LEVEL when it is set to a non-empty value, otherwise 'warn'.
logger.level = process.env.LOGGER_LEVEL ? process.env.LOGGER_LEVEL : 'warn';
|
|
|
|
|
|
|
|
/**
 * Scraper with id 'LV' that walks three listings on www.fktk.lv in order:
 * payment institutions (mode 0), electronic-money institutions (mode 1) and
 * banks (mode 2). Progress is driven by events emitted on the instance.
 *
 * Helpers prefixed with an underscore (`_goto`, `_throttle`, `_randomWait`,
 * `_checkLock`, `_uploadError`, …) and `setPath` are not defined in this
 * class; presumably they are inherited from `Scraper` — verify there.
 */
class LVScrape extends Scraper {
    constructor() {
        super();
        this.id = 'LV';

        this.on('done', () => {
            this._done();
        });

        // `run` is the throttled entry point (5000 is passed to `_throttle`).
        this.run = this._throttle(async () => {
            await this.__run();
        }, 5000);

        // In production the scraper starts itself once `_checkLock` resolves truthy.
        if (process.env.NODE_ENV === 'production') {
            this._checkLock().then((l) => {
                if (l) {
                    this.run();
                }
            }).catch((err) => {
                // Without this handler a failing lock check was an unhandled rejection.
                logger.error('_checkLock fail', err);
            });
        }
    }

    /**
     * Builds the bookkeeping object kept for each of the three registers.
     * Key order is significant only in that the object is serialised as-is
     * to the debug JSON file.
     *
     * @param {string[]} urls index pages to visit for this register
     * @param {Object} [extra] additional flags placed after `done`
     * @returns {Object}
     */
    static createServiceState(urls, extra = {}) {
        return Object.assign(
            {
                'items': 0,
                'links': [],
                'step': 0,
                'indexStep': 0,
                'visited': false,
                'done': false
            },
            extra,
            {
                'urls': urls,
                'sections': [],
                'sectionLinks': []
            }
        );
    }

    /**
     * Derives a file-name stem for a link entry from the last segment of its
     * URL path (extension removed, characters outside [A-Za-z0-9._-] replaced
     * by '_'). Falls back to `entity_<index>` when the URL yields nothing.
     *
     * NOTE(review): the original code referenced an undeclared `filename`;
     * this naming scheme is a placeholder — replace it once the real entity
     * extraction step exists.
     *
     * @param {{href: string}} link
     * @param {number} index position of the link, used for the fallback name
     * @returns {string}
     */
    static entityFilename(link, index) {
        const href = link && typeof link.href === 'string' ? link.href : '';
        const pathname = url.parse(href).pathname || '';
        const stem = path.posix
            .basename(pathname, path.posix.extname(pathname))
            .replace(/[^A-Za-z0-9._-]+/g, '_');

        return stem || `entity_${index}`;
    }

    /**
     * Maps the current `mode` onto the name of the matching service property.
     *
     * @returns {?string} 'paymentServices', 'emoneyServices', 'creditServices',
     *                    or null for any other mode
     */
    activeServiceKey() {
        switch (this.mode) {
            case 0:
                return 'paymentServices';
            case 1:
                return 'emoneyServices';
            case 2:
                return 'creditServices';
            default:
                return null;
        }
    }

    /**
     * Records the output file name for the link at the current step, advances
     * the step, then either navigates to the next link or emits 'serviceDone'.
     *
     * @param serviceObject one of the service state objects created in start()
     * @returns {Promise<void>}
     */
    async processEntityDetails(serviceObject) {
        await this._randomWait(this.page, 3, 5);

        const link = serviceObject.links[serviceObject.step];

        if (!link) {
            // Nothing left to process for this register; let the run move on.
            logger.warn(`No link at step ${serviceObject.step}; finishing service`);
            this.emit('serviceDone');

            return;
        }

        const filename = LVScrape.entityFilename(link, serviceObject.step);

        link.filename = `${filename}.json`;
        serviceObject.step++;

        if (serviceObject.step < serviceObject.items) {
            const newUrl = serviceObject.links[serviceObject.step].href;

            await this._goto(newUrl);
        }
        else {
            this.emit('serviceDone');
        }
    }

    /**
     * Calls `buildIndex` with the service object for the current mode.
     * `buildIndex` is not defined in this class (presumably inherited or
     * still to be written — verify).
     *
     * @returns {Promise<void>}
     */
    async indexRedirector() {
        const key = this.activeServiceKey();

        if (key) {
            await this.buildIndex(this[key]);
        }
    }

    /**
     * Calls `processEntityDetails` with the service object for the current mode.
     *
     * @returns {Promise<void>}
     */
    async processRedirector() {
        const key = this.activeServiceKey();

        if (key) {
            await this.processEntityDetails(this[key]);
        }
    }

    /**
     * Routes a freshly loaded page to the matching handler based on its path.
     * start() wires this to the page's 'domcontentloaded' event.
     *
     * @returns {Promise<void>}
     * @throws {Error} for an unrecognised path when NODE_ENV is set
     */
    async processNewPage() {
        // give the page a few seconds to settle
        await this._randomWait(this.page, 3, 5);

        const pageUrl = url.parse(await this.page.url());

        if (pageUrl.href === 'chrome-error://chromewebdata/') {
            logger.warn('Directed to: chrome-error://chromewebdata/');
            this.emit('recover');

            return;
        }

        // NOTE(review): none of these pathnames match the fktk.lv URLs set in
        // start() (all under /en/market/…), and `crossBorderRedirector` is not
        // defined in this class — confirm the routes for this site.
        switch (pageUrl.pathname) {
            case '/en/our-registers/company-register/':
                await this.indexRedirector();
                break;

            case '/en/our-registers/company-register/details':
                await this.processRedirector();
                break;

            case '/en/our-registers/company-register/gransoverskridandehandel/':
                await this.crossBorderRedirector();
                break;

            default:
                if (process.env.NODE_ENV) {
                    await this._uploadError();
                    // Use .href: interpolating the parsed object itself
                    // produced "[object Object]".
                    throw new Error(`Unknown page: ${pageUrl.href}`);
                }
                else {
                    logger.warn('processNewPage Fell through');
                    logger.warn('currentPage.location', pageUrl);
                }
                break;
        }
    }

    /**
     * Handles the end of one index page for a register: refreshes the item
     * count, then navigates to the next index URL or, when all index URLs
     * have been visited, to the first collected link.
     *
     * @param service one of the service state objects created in start()
     * @returns {Promise<void>}
     */
    async advanceIndex(service) {
        service.items = service.links.length;
        logger.info(`${service.items} items indexed`);

        service.indexStep++;

        if (service.indexStep < service.urls.length) {
            await this._goto(service.urls[service.indexStep]);

            return;
        }

        const nextLink = service.links[service.step];

        if (!nextLink) {
            // Indexing produced no link to visit; previously this threw a
            // TypeError inside the listener. Finish the register instead.
            logger.warn('No indexed links to visit; finishing service');
            this.emit('serviceDone');

            return;
        }

        await this._goto(nextLink.href);
    }

    /**
     * Marks a register as done, writes its links and its full state to
     * `<path>/<key>.json` and `<debugPath>/<key>.json`, bumps the mode and
     * runs `next`. Any error is logged rather than propagated.
     *
     * @param {string} key name of the service property, e.g. 'paymentServices'
     * @param {Function} next continuation: navigate to the next register or emit 'done'
     * @returns {Promise<void>}
     */
    async finishService(key, next) {
        logger.warn(`${key}Done`);

        try {
            const service = this[key];

            service.done = true;
            jsonfile.writeFileSync(`${this.path}/${key}.json`, { 'links': service.links });
            jsonfile.writeFileSync(`${this.debugPath}/${key}.json`, service);

            this.mode++;
            this.inProgress = false;

            await next();
        }
        catch (e) {
            logger.error(e);
        }
    }

    /**
     * Registers the listeners that drive the scrape from one register to the next.
     *
     * @returns {Promise<void>}
     */
    async attachEvents() {
        const indexEventFor = {
            'paymentServices': 'psindexdone',
            'emoneyServices': 'emindexdone',
            'creditServices': 'ciindexdone'
        };

        // 'serviceDone' fans out to the "<service>Done" event of the current mode.
        this.on('serviceDone', async () => {
            const key = this.activeServiceKey();

            if (key) {
                this.emit(`${key}Done`);
            }
        });

        Object.keys(indexEventFor).forEach((key) => {
            this.on(indexEventFor[key], async () => {
                try {
                    await this.advanceIndex(this[key]);
                }
                catch (e) {
                    // Same policy as the "<service>Done" handlers: log, do not reject.
                    logger.error(e);
                }
            });
        });

        // 'indexdone' fans out to the per-register index event of the current mode.
        this.on('indexdone', async () => {
            const eventName = indexEventFor[this.activeServiceKey()];

            if (eventName) {
                this.emit(eventName);
            }
        });

        this.on('paymentServicesDone', () => this.finishService(
            'paymentServices',
            () => this._goto(this.emoneyServices.urls[0])
        ));

        this.on('emoneyServicesDone', () => this.finishService(
            'emoneyServices',
            () => this._goto(this.creditServices.urls[0])
        ));

        this.on('creditServicesDone', () => this.finishService(
            'creditServices',
            () => this.emit('done')
        ));
    }

    /**
     * Resets all scrape state, prepares the browser page and navigates to the
     * first payment-institution index URL.
     *
     * @returns {Promise<void>}
     * @throws {Error} the original failure (non-Error values are wrapped)
     */
    async start() {
        super._start();

        try {
            this.mode = 0;
            this.rootURI = 'http://www.fktk.lv';

            this.paymentServices = LVScrape.createServiceState([
                'http://www.fktk.lv/en/market/payment-institutions/authorized-payment-institutions.html',
                'http://www.fktk.lv/en/market/payment-institutions/registered-payment-institutions.html'
            ]);

            this.emoneyServices = LVScrape.createServiceState([
                'http://www.fktk.lv/en/market/electronic-money-institutions/authorized-electronic-money-institutions.html',
                'http://www.fktk.lv/en/market/electronic-money-institutions/registered-electronic-money-institutions.html'
            ]);

            this.creditServices = LVScrape.createServiceState(
                ['http://www.fktk.lv/en/market/credit-institutions/banks.html'],
                { 'searchDone': false, 'started': false }
            );

            this.startPage = this.paymentServices.urls[0];
            this.emoneyUrl = this.emoneyServices.urls[0];
            this.credit = this.creditServices.urls[0];

            this.setPath(path.resolve(`${__dirname }/../artefacts/LV/FCMC`));

            // A failure here is logged and does not stop the run.
            await this._doNonRepudiation().catch((err) => {
                logger.warn(err);
            });

            await this._initBrowser();
            await this._createBrowserPage();

            this.page.on('domcontentloaded', this._throttle(async () => {
                this.processNewPage().catch((err) => {
                    logger.error('processNewPage fail', err);
                });
            }, 2500));

            // Attach the workflow listeners only while exactly two event names
            // are registered, i.e. before attachEvents has run on this instance
            // (one of the two is 'done' from the constructor; the other is
            // presumably registered by Scraper — verify).
            if (this.eventNames().length === 2) {
                await this.attachEvents();
            }

            await this.page.setViewport({ 'width': 1200, 'height': 800 });
            await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });

            await this._randomWait(this.page, 3, 5);
        }
        catch (e) {
            // Rethrow the original error so its stack and type survive;
            // only non-Error values are wrapped.
            throw e instanceof Error ? e : new Error(e);
        }
    }

    /**
     * Body of the throttled `run` entry point created in the constructor.
     *
     * @returns {Promise<void>}
     */
    async __run() {
        await this.start();
    }
}
|
|
|
|
|
|
|
|
// The scraper class is the module's sole export.
module.exports = LVScrape;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```
|