/**
 *
 * User: Martin Donnelly
 * Date: 2018-09-13
 * Time: 12:23
 *
 */
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const logger = require('log4js').getLogger('IE');

/**
 * Scraper for the Central Bank of Ireland registers page.
 *
 * Opens the registers start page, expands each configured download section,
 * takes a screenshot of it and clicks every download link found inside it.
 * Browser, locking, screenshot and upload helpers (the `_`-prefixed methods)
 * come from the `Scraper` base class, which is not defined in this file.
 */
class IEScrape extends Scraper {
  /**
   * Registers the scraper under the id 'IE', wires the 'done' event to
   * `_done()` and exposes `run` as a debounced (5000) wrapper around `__run()`.
   * When NODE_ENV is 'production', `_checkLock()` is consulted and a truthy
   * result triggers `run()` immediately.
   */
  constructor() {
    super();
    this.setID('IE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) this.run();
        })
        .catch((err) => {
          // A constructor cannot await, so a failed lock check is logged here
          // rather than being left as an unhandled rejection.
          logger.error(err);
        });
    }
  }

  /**
   * Prepare the browser page and navigate to the list of registers.
   *
   * Sets the artefact path, starts a trace (with screenshots) into
   * `<path>/trace.json`, loads the start page and clicks the "view list" link.
   * A failure of `_doNonRepudiation()` is logged as a warning and does not
   * abort the run; any other failure propagates to the caller unchanged.
   *
   * @returns {Promise<void>}
   */
  async start() {
    await super._start();

    this.startPage = 'http://registers.centralbank.ie/Home.aspx';
    // Used as the mouse-down delay for the click below.
    const mouseDownDuration = IEScrape.notARobot();

    this.setPath(path.resolve(`${__dirname }/../artefacts/IE/CBI`));

    await this._doNonRepudiation().catch((err) => {
      logger.warn(err);
    });

    await this._initBrowser(true);
    await this._createBrowserPage();

    await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots': true });
    await this.page.setViewport({ 'width': 1200, 'height': 800 });

    await this._goto(this.startPage);
    await this._randomWait(this.page, 3, 5);

    await this.page.waitForSelector('#ctl00_cphRegistersMasterPage_lblViewList');
    await this.page.click('#ctl00_cphRegistersMasterPage_lblViewList > a', { 'delay': mouseDownDuration });
  }

  /**
   * Locate the download section whose heading text equals `searchText`.
   *
   * The page HTML is snapshotted and searched with cheerio. Every `span`
   * below `downloadsection` is compared (exact match) against `searchText`.
   *
   * @param {string} downloadsection CSS selector of the container holding the sections
   * @param {string} searchText exact text of the section heading to look for
   * @returns {Promise<string|undefined>} id attribute of the matching span, or
   *   undefined when no span matches (if several match, the last one wins)
   */
  async findDownloadSection(downloadsection, searchText) {
    // `downloadsection` is also used as a cheerio (CSS) selector below, so it
    // is always a CSS selector and waitForSelector is the matching wait call.
    await this.page.waitForSelector(downloadsection);

    const body = await this.page.evaluate(() => document.documentElement.outerHTML);
    const $ = cheerio.load(body);

    let wantedId;
    $(`${downloadsection} span`).each((i, el) => {
      if ($(el).text() === searchText) {
        wantedId = $(el).attr('id');
      }
    });

    return wantedId;
  }

  /**
   * Expand the relevant section by clicking its heading span.
   *
   * @param {string} elmId id of the heading span
   * @returns {Promise<void>}
   */
  async expandArea(elmId) {
    await this.page.click(`span#${elmId}`);
  }

  /**
   * Find the id of the element holding the download links for a section.
   *
   * The links container is taken to be the next element sibling of the
   * heading span's parent.
   *
   * @param {string} elmId id of the heading span
   * @returns {Promise<string|null>} id attribute of the links container
   */
  async findDownloadsLinksID(elmId) {
    return await this.page.$eval(
      `span#${elmId}`,
      e => e.parentElement.nextElementSibling.getAttribute('id')
    );
  }

  /**
   * Process the download links and grab the pdf files.
   *
   * Each link is clicked in turn with downloads directed to `this.path`.
   * A failed click is logged and reported through `_uploadError()` but does
   * not stop the remaining links from being processed.
   *
   * @param {string} id id of the element containing the download links
   * @returns {Promise<void>}
   */
  async processDownloadLinks(id) {
    try {
      // Each link is duplicated in a P and an Image. We just use the one in the P tag.
      const clickableLinks = await this.page.$$(`[id="${id}"] p a`);
      const mouseDownDuration = IEScrape.notARobot();

      for (const item of clickableLinks) {
        await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

        try {
          await item.click({ 'delay': mouseDownDuration });
        }
        catch (err) {
          // Best effort: record the failure and carry on with the next link.
          logger.warn(err);
          await this._uploadError();
        }

        await this._randomWait(this.page, 5, 10);
      }
    }
    catch (e) {
      await this._uploadError();
      // Rethrow the original error so its stack and type are preserved.
      throw e;
    }
  }

  /**
   * Expand one download section, screenshot it and download its files.
   *
   * @param {string} dlSectionElm CSS selector of the container holding the sections
   * @param {string} sectionTitle exact heading text of the section to grab
   * @returns {Promise<void>}
   * @throws {Error} when no section with the given title is found, or when any
   *   page interaction fails (after `_uploadError()` has been called)
   */
  async grabSection(dlSectionElm, sectionTitle) {
    try {
      const section = await this.findDownloadSection(dlSectionElm, sectionTitle);

      if (section === undefined) {
        // Fail with a clear message instead of a 'span#undefined' selector error.
        throw new Error(`Download section "${sectionTitle}" not found in ${dlSectionElm}`);
      }

      await this.expandArea(section);

      // Awaited so the screenshot completes before the page is interacted with
      // again, and so a screenshot failure is handled by the catch below.
      await this._makeScreenshotV2(this.page, `${ this.path}/${sectionTitle}`, null);

      const sectionID = await this.findDownloadsLinksID(section);

      await this.processDownloadLinks(sectionID);
      await this._randomWait(this.page, 5, 10);
    }
    catch (e) {
      await this._uploadError();
      throw e;
    }
  }

  /**
   * Grab the PDFs and screenshots.
   *
   * Runs `start()`, screenshots the registers page, grabs each configured
   * section in order and finally emits 'done'. Any failure propagates to the
   * caller and 'done' is not emitted.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
    await this._randomWait(this.page, 5, 10);

    await this._makeScreenshotV2(this.page, `${ this.path}/Central Bank of Ireland Registers`, null);

    const sections = [
      'Registers of Payment Services Firms',
      'Registers of E-Money Firms',
      'Register of Credit Institutions'
    ];

    for (const section of sections) {
      await this.grabSection('#ctl00_cphRegistersMasterPage_downloadsSection', section);
    }

    this.emit('done');
  }
}

module.exports = IEScrape;