Martin Donnelly be5d3eae07 init
2019-05-05 20:13:56 +01:00

84 lines
2.7 KiB
JavaScript

const logger = require('log4js').getLogger('BG');
const path = require('path');
const Scraper = require('../helpers/scraper');
/**
 * Scraper for the register pages published on www.bnb.bg.
 *
 * A run loads the payment-oversight registers page and the credit-institutions
 * registers page, clicks the download links for a fixed set of register files
 * (saved under `artefacts/BG/BNB`), and finally emits `done`.
 */
class BGScrape extends Scraper {
  /**
   * Wires up the `done` handler and the throttled `run` entry point.
   * When NODE_ENV is `production`, a run is started automatically if
   * `_checkLock()` resolves to a truthy value.
   */
  constructor() {
    super();
    this.id = 'BG';

    this.on('done', () => {
      this._done();
    });

    // `run` is the throttled wrapper around __run; 5000 is the value handed to _throttle.
    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            // Returned so that a rejected run (if run() yields a promise) reaches the catch below.
            return this.run();
          }
          return undefined;
        })
        .catch((err) => {
          // A constructor cannot await, so failures are logged here instead of
          // being left as an unhandled promise rejection.
          logger.error(err);
        });
    }
  }

  /**
   * Clicks the first anchor on the current page whose `href` contains
   * `filename`, then pauses via `_randomWait` (arguments 3 and 5).
   *
   * @param {string} filename - Substring to look for in the anchors' `href` attribute.
   * @returns {Promise<void>}
   * @throws {Error} If no anchor on the current page matches `filename`.
   */
  async downloadByHrefFilename(filename) {
    // Page#url is a method; interpolating it uncalled would log the function source.
    const currentUrl = this.page.url();
    logger.info(`Downloading ${filename} from ${currentUrl}`);

    const [linkElement] = await this.page.$x(`//a[contains(@href, '${filename}')]`);
    if (!linkElement) {
      // Fail with a message naming the missing file rather than a bare
      // "cannot read property 'click' of undefined" TypeError.
      throw new Error(`No link with href containing '${filename}' found on ${currentUrl}`);
    }

    await linkElement.click();
    await this._randomWait(this.page, 3, 5);
  }

  /**
   * Performs one complete scrape: non-repudiation step, browser start-up,
   * downloads from both register pages, then emits `done`.
   *
   * @returns {Promise<void>} Rejects if any navigation or download step fails,
   *   in which case `done` is not emitted.
   */
  async start() {
    super._start();

    this.setPath(path.resolve(__dirname, '../artefacts/BG/BNB'));

    this.startPage = 'http://www.bnb.bg/PaymentSystem/PSPaymentOversight/PSPaymentOversightRegisters/index.htm';
    this.creditInstitutionsPage = 'http://www.bnb.bg/RegistersAndServices/RSCIRegisters/index.htm';

    // site only over http, so skip ssl during non-repudiation
    // (a failure here is logged and the scrape carries on)
    await this._doNonRepudiation(false, { 'skipSsl': true }).catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser();
    this.page = await this.browser.newPage();
    await this.page.setViewport({ 'width': 1200, 'height': 800 });

    // set cookie for English language and load start page
    await this.page.setCookie({ 'name': 'userLanguage', 'value': 'EN', 'domain': 'www.bnb.bg', 'path': '/' });

    await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });
    await this._randomWait(this.page, 3, 5);
    // Awaited so the capture cannot overlap the clicks/navigation that follow.
    // NOTE(review): assumes _makeScreenshotV2 returns a promise; awaiting a plain value is harmless.
    await this._makeScreenshotV2(this.page, `${this.path}/index1`);

    // Allow downloads and direct them into this.path.
    // NOTE(review): `_client` is an underscore-prefixed Puppeteer member; confirm it exists in the pinned version.
    await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

    await this.downloadByHrefFilename('ps_po_register_2_en.xls');
    await this.downloadByHrefFilename('ps_po_register_3a_en.xls');

    await this._goto(this.creditInstitutionsPage, { 'waitUntil': 'networkidle0' });
    await this._randomWait(this.page, 3, 5);
    await this._makeScreenshotV2(this.page, `${this.path}/index2`);

    // TODO: come back and scrape the html page version of this word doc, if we have time
    await this.downloadByHrefFilename('bs_ci_reg_bankslist_en.doc');
    await this.downloadByHrefFilename('bs_ci_reg_permissions_bg.xls');

    // wait until all downloads finished with 'networkidle0' (currently this is only possible with 'page.goto', so we go back to the start page)
    await this._goto(this.startPage, { 'waitUntil': 'networkidle0' });

    this.emit('done');
  }

  /**
   * Body of the throttled `run` wrapper; delegates to `start()`.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }
}
// Expose the BGScrape class as this module's export.
module.exports = BGScrape;