187 lines
4.5 KiB
JavaScript
187 lines
4.5 KiB
JavaScript
/**
 *
 * User: Martin Donnelly
 * Date: 2018-09-13
 * Time: 12:23
 *
 */
|
|
|
|
const Scraper = require('../helpers/scraper');
|
|
const cheerio = require('cheerio');
|
|
const path = require('path');
|
|
const logger = require('log4js').getLogger('IE');
|
|
|
|
/**
 * Normalise a caught value into an Error.
 *
 * An existing Error is returned untouched so its stack trace and type are
 * preserved; anything else is wrapped in a new Error.
 *
 * @param {*} err - the value received by a catch block
 * @returns {Error}
 */
function asError(err) {
  return err instanceof Error ? err : new Error(err);
}

/**
 * Scraper for the Central Bank of Ireland registers site.
 *
 * Opens the registers home page, expands a fixed list of download sections
 * and clicks every download link found in each of them. Browser handling,
 * locking, waiting and screenshot helpers (the `_`-prefixed methods and
 * `notARobot`) are not defined here; they are presumably inherited from
 * Scraper - confirm against ../helpers/scraper.
 */
class IEScrape extends Scraper {

  /**
   * Registers the 'done' handler, builds the debounced `run` entry point and,
   * when NODE_ENV is 'production', triggers a run if `_checkLock()` resolves
   * to a truthy value.
   */
  constructor() {
    super();
    this.setID('IE');

    this.on('done', () => {
      this._done();
    });

    this.run = this._debounce(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock()
        .then((l) => {
          if (l) {
            return this.run();
          }

          return undefined;
        })
        .catch((err) => {
          // A constructor cannot await, so make sure a failed lock check or
          // run is at least logged instead of becoming an unhandled rejection.
          logger.error(err);
        });
    }
  }

  /**
   * Prepare the browser page and navigate to the register list.
   *
   * Sets the start page and artefact path, starts tracing to
   * `<path>/trace.json`, loads the start page and clicks the "view list" link.
   *
   * @returns {Promise<void>}
   * @throws {Error} any failure from the steps after `_start()`
   */
  async start() {
    await super._start();

    try {
      this.startPage = 'http://registers.centralbank.ie/Home.aspx';

      // Passed below as the click `delay` option.
      const mouseDownDuration = IEScrape.notARobot();

      this.setPath(path.resolve(`${__dirname }/../artefacts/IE/CBI`));

      // Best effort: a failure here is logged and does not stop the scrape.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser(true);
      await this._createBrowserPage();

      await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage);

      await this._randomWait(this.page, 3, 5);
      await this.page.waitForSelector('#ctl00_cphRegistersMasterPage_lblViewList');
      await this.page.click('#ctl00_cphRegistersMasterPage_lblViewList > a', { 'delay':mouseDownDuration });
    }
    catch (e) {
      throw asError(e);
    }
  }

  /**
   * Locate the download section associated with the searchText.
   *
   * Waits for `downloadsection` to appear, then looks through its descendant
   * `span` elements for one whose text equals `searchText` exactly. When
   * several spans match, the last one in document order is used.
   *
   * @param downloadsection CSS selector of the downloads container
   * @param searchText exact text of the span to find
   * @returns {Promise<*>} the matching span's `id` attribute, or undefined
   * when no span matches
   * @throws {Error} when waiting for or reading the page fails
   */
  async findDownloadSection(downloadsection, searchText) {
    try {
      await this.page.waitForSelector(downloadsection);

      const body = await this.page.evaluate(() => document.documentElement.outerHTML);
      const $ = cheerio.load(body);

      return $(`${downloadsection} span`)
        .filter((i, el) => $(el).text() === searchText)
        .last()
        .attr('id');
    }
    catch (e) {
      throw asError(e);
    }
  }

  /**
   * Expand the relevant section by clicking its span.
   *
   * @param elmId id of the span to click
   * @returns {Promise<void>}
   */
  async expandArea(elmId) {
    await this.page.click(`span#${elmId}`);
  }

  /**
   * Find the Download Links via section ID.
   *
   * Reads the `id` attribute of the element that follows the section span's
   * parent element.
   *
   * @param elmId id of the section span
   * @returns {Promise<*>} the `id` attribute of that sibling element
   */
  async findDownloadsLinksID(elmId) {
    return await this.page.$eval(`span#${elmId}`, e => e.parentElement.nextElementSibling.getAttribute('id'));
  }

  /**
   * Process the download links and grab the pdf files.
   *
   * Clicks each link in turn with a random wait after each one. A failed
   * click is logged and reported through `_uploadError()`, and the remaining
   * links are still processed.
   *
   * @param id id of the element containing the download links
   * @returns {Promise<void>}
   * @throws {Error} on any other failure, after calling `_uploadError()`
   */
  async processDownloadLinks(id) {
    try {
      // Each link is duplicated in a P and an Image. We just use the one in the P tag.
      const clickableLinks = await this.page.$$(`[id="${id}"] p a`);
      const mouseDownDuration = IEScrape.notARobot();

      // The download target does not change between links, so set it once.
      await this.page._client.send('Page.setDownloadBehavior', { 'behavior': 'allow', 'downloadPath': this.path });

      for (const item of clickableLinks) {
        try {
          await item.click({ 'delay':mouseDownDuration });
        }
        catch (err) {
          logger.warn(err);
          await this._uploadError();
        }

        await this._randomWait(this.page, 5, 10);
      }
    }
    catch (e) {
      await this._uploadError();
      throw asError(e);
    }
  }

  /**
   * Expand one named section, screenshot it and download its files.
   *
   * @param dlSectionElm CSS selector of the downloads container
   * @param sectionTitle exact title of the section; also used as the
   * screenshot file name
   * @returns {Promise<void>}
   * @throws {Error} when the section is not found or any step fails, after
   * calling `_uploadError()`
   */
  async grabSection(dlSectionElm, sectionTitle) {
    try {
      const section = await this.findDownloadSection(dlSectionElm, sectionTitle);

      // Fail with a clear message rather than clicking `span#undefined`.
      if (!section) {
        throw new Error(`Download section "${sectionTitle}" not found in ${dlSectionElm}`);
      }

      await this.expandArea(section);

      // Awaited so a screenshot failure reaches the catch below and the
      // capture completes before the download links are clicked.
      await this._makeScreenshotV2(this.page, `${ this.path}/${sectionTitle}`, null);

      const sectionID = await this.findDownloadsLinksID(section);

      await this.processDownloadLinks(sectionID);

      await this._randomWait(this.page, 5, 10);
    }
    catch (e) {
      await this._uploadError();
      throw asError(e);
    }
  }

  /**
   * Grab the Pdf's and screenshots.
   *
   * Starts the browser session, screenshots the landing page, processes each
   * section one after another and emits 'done' when all have succeeded.
   *
   * @returns {Promise<void>}
   * @throws {Error} the first failure from any step; 'done' is not emitted
   */
  async __run() {
    try {
      await this.start();

      await this._randomWait(this.page, 5, 10);
      await this._makeScreenshotV2(this.page, `${ this.path}/Central Bank of Ireland Registers`, null);

      const sections = ['Registers of Payment Services Firms', 'Registers of E-Money Firms', 'Register of Credit Institutions'];

      for (const section of sections) {
        await this.grabSection('#ctl00_cphRegistersMasterPage_downloadsSection', section);
      }

      this.emit('done');
    }
    catch (e) {
      throw asError(e);
    }
  }

}
|
|
|
|
// Expose the IEScrape class as this module's export (CommonJS).
module.exports = IEScrape;
|