129 lines
4.3 KiB
JavaScript
129 lines
4.3 KiB
JavaScript
const logger = require('log4js').getLogger('BG');
|
|
const path = require('path');
|
|
const url = require('url');
|
|
|
|
const Scraper = require('../helpers/scraper');
|
|
|
|
/**
 * Scraper with ID 'BG' that downloads register files from www.bnb.bg
 * (presumably the Bulgarian National Bank — confirm).
 *
 * Flow as implemented below:
 *  1. start() prepares the artefact path, browser and page, then loads the
 *     payment-services register page.
 *  2. Each 'domcontentloaded' event on the page is routed (throttled) to
 *     processNewPage(), which dispatches on the current URL.
 *  3. processPaymentServicesPage() clicks two download links and navigates to
 *     the credit-institutions page; processCreditInstitutionsPage() clicks two
 *     more, waits, and emits 'done'.
 *
 * Methods prefixed with a single underscore (_throttle, _checkLock, _goto,
 * _randomWait, ...) are not defined in this class; they are expected to come
 * from the Scraper base class in ../helpers/scraper, so their exact semantics
 * are not visible here.
 */
class BGScrape extends Scraper {
  /**
   * Registers the scraper ID, wires the 'done' event to _done(), exposes a
   * throttled `run` entry point and, in production only, starts a run
   * automatically when _checkLock() resolves to a truthy value.
   */
  constructor() {
    super();
    this.setID('BG');

    this.on('done', () => {
      this._done();
    });

    // `run` is __run wrapped by the inherited throttle helper.
    // NOTE(review): 5000 is presumably a millisecond interval — confirm in Scraper._throttle.
    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            // Returned so that a rejected run (if run() yields a promise) reaches the catch below.
            return this.run();
          }
          return undefined;
        })
        .catch((err) => {
          // Without this handler a failed lock check or start-up would surface
          // as an unhandled promise rejection.
          logger.error('BG auto-start failed', err);
        });
    }
  }

  /**
   * Clicks the first anchor on the current page whose href contains
   * `filename`, then pauses via _randomWait.
   *
   * @param {string} filename - Substring to look for in the link's href.
   *   It is embedded in a single-quoted XPath literal, so it must not contain
   *   a single quote.
   * @returns {Promise<void>}
   * @throws {Error} If no matching anchor is found on the page.
   */
  async downloadByHrefFilename(filename) {
    // page.url is a method: it has to be called, otherwise the function's
    // source text is interpolated into the log line instead of the address.
    const currentUrl = this.page.url();
    logger.info(`Downloading ${filename} from ${currentUrl}`);

    const linkHandles = await this.page.$x(`//a[contains(@href, '${filename}')]`);
    const linkElement = linkHandles[0];

    if (!linkElement) {
      // Fail with the filename and page instead of an opaque
      // "cannot read property 'click' of undefined".
      throw new Error(`No link found with href containing '${filename}' on ${currentUrl}`);
    }

    await linkElement.click();
    await this._randomWait(this.page, 3, 5);
  }

  /**
   * Handles the payment-services register page: takes a screenshot, allows
   * downloads into this.path, clicks the two register links and then
   * navigates to the credit-institutions page.
   *
   * @returns {Promise<void>}
   */
  async processPaymentServicesPage() {
    await this._randomWait(this.page, 3, 5);

    // NOTE(review): not awaited in the original either; if _makeScreenshotV2
    // returns a promise, a rejection here is unhandled — confirm intent.
    this._makeScreenshotV2(this.page, `${this.path}/ps_em_index`);

    // Uses Puppeteer's private CDP client handle to direct downloads to this.path.
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    await this.downloadByHrefFilename('ps_po_register_2_en.xls'); // Payment Institutions
    await this.downloadByHrefFilename('ps_po_register_3a_en.xls'); // eMoney Institutions

    // wait until networkidle to ensure the above downloads are complete, then go to next page
    await this._goto(this.creditInstitutionsPage, { 'waitUntil': 'networkidle0' });
  }

  /**
   * Handles the credit-institutions register page: takes a screenshot, clicks
   * the two register links, waits a fixed 10 seconds and emits 'done'.
   *
   * @returns {Promise<void>}
   */
  async processCreditInstitutionsPage() {
    await this._randomWait(this.page, 3, 5);

    // NOTE(review): fire-and-forget, same as in processPaymentServicesPage.
    this._makeScreenshotV2(this.page, `${this.path}/ci_index`);

    // TODO: come back and scrape the html page version of this word doc, if we have time
    await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc'); // banks and foreign banks' branches operating in Bulgaria
    await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls'); // permissions list

    // no more pages to go to at this point, so wait a final 10 seconds to allow files to download
    // TODO: investigate whether this could be done with: page.waitForNavigation({ waitUntil: 'networkidle0' })
    await this.page.waitFor(10000);

    this.emit('done');
  }

  /**
   * Dispatches on the URL of the page that has just loaded.
   *
   * - chrome-error page: logs a warning and emits 'recover'.
   * - URL containing '/PSPaymentOversightRegisters': processPaymentServicesPage().
   * - URL containing '/RSCIRegisters': processCreditInstitutionsPage().
   * - anything else: when NODE_ENV is set to any non-empty value, calls
   *   _uploadError() and throws; otherwise only logs warnings.
   *
   * @returns {Promise<void>}
   * @throws {Error} On an unrecognised page when NODE_ENV is set.
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    // set download behaviour on every processNewPage in case this is a recovery attempt / new tab
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    if (pageUrl.href.includes('/PSPaymentOversightRegisters')) {
      await this.processPaymentServicesPage();
    } else if (pageUrl.href.includes('/RSCIRegisters')) {
      await this.processCreditInstitutionsPage();
    } else if (process.env.NODE_ENV) {
      await this._uploadError();
      throw new Error(`Unknown page: ${pageUrl.href}`);
    } else {
      logger.warn('processNewPage Fell through');
      logger.warn('currentPage.location', pageUrl.href);
    }
  }

  /**
   * Prepares a scrape run: sets the artefact path and the two target URLs,
   * performs the non-repudiation step (failures are logged, not fatal),
   * creates the browser page, hooks page loads to processNewPage() and loads
   * the start page.
   *
   * @returns {Promise<void>}
   */
  async start() {
    // NOTE(review): not awaited in the original; confirm whether Scraper._start is asynchronous.
    super._start();

    this.setPath(path.resolve(`${__dirname }/../artefacts/BG/BNB`));

    this.startPage = 'http://www.bnb.bg/PaymentSystem/PSPaymentOversight/PSPaymentOversightRegisters/index.htm';
    this.creditInstitutionsPage = 'http://www.bnb.bg/RegistersAndServices/RSCIRegisters/index.htm';

    // site only over http, so skip ssl during non-repudiation
    await this._doNonRepudiation(false, { 'skipSsl': true }).catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser(false);
    await this._createBrowserPage();

    // Every page load is handled by processNewPage; its failures are logged
    // here because nothing awaits the event handler.
    this.page.on('domcontentloaded', this._throttle(async () => {
      this.processNewPage().catch((err) => {
        logger.error('processNewPage fail', err);
      });
    }, 2500));

    await this.page.setViewport({ 'width': 1200, 'height': 800 });

    // set cookie for English language and load start page
    await this.page.setCookie({ 'name': 'userLanguage', 'value': 'EN', 'domain': 'www.bnb.bg', 'path': '/' });
    await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
  }

  /**
   * Un-throttled run implementation used by `run`; simply awaits start().
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
|
|
|
|
// Expose the BGScrape class as this module's CommonJS export.
module.exports = BGScrape;
|