obdfcascrape/ncas/fca.js
Martin Donnelly be5d3eae07 init
2019-05-05 20:13:56 +01:00

194 lines
7.4 KiB
JavaScript

// Load environment variables from the `.env` file that sits one directory
// above this module. This runs as a side effect when the module is required.
require('dotenv').config({
'path': `${__dirname }/../.env`
});
// Version string for this scraper; not referenced in the visible part of this file.
const version = '0.0.1-1';
// Project helpers and third-party libraries used by the scraper below.
const CsvData = require('../helpers/csv-data');
// Instantiated at module load; not referenced in the visible part of this file.
const csv = new CsvData();
// Base class that FCAScrape extends.
const Scraper = require('../helpers/scraper');
// HTML parser used to extract passporting data from saved page markup.
const cheerio = require('cheerio');
// Node file-system module; not referenced in the visible part of this file.
const fs = require('fs');
/**
 * Builds the list of consecutive integers from 1 up to `count` (inclusive).
 *
 * @param {number} count - How many numbers to generate.
 * @returns {number[]} `[1, 2, ..., count]`, or an empty array when `count` is 0.
 */
const range = (count) => Array.from({ 'length': count }).map((unused, index) => index + 1);
// Maps a numeric organisation id to the exact text typed into the register's
// search box for that organisation.
const searchables = new Map()
  .set(759676, '759676 Barclays Bank UK PLC')
  .set(661836, '661836 American Express Services Europe Limited (AESEL)');
// Pool of browser user-agent strings. In the visible code it is only
// referenced from commented-out user-agent rotation logic.
const userAgents = [
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
  'Mozilla/5.0 (Linux; Ubuntu 14.04) AppleWebKit/537.36 Chromium/35.0.1870.2 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/47.0.2526.73 Chrome/47.0.2526.73 Safari/537.36',
  'Mozilla/5.0 (Linux; Ubuntu 16.04) AppleWebKit/537.36 Chromium/57.0.2987.110 Safari/537.36',
  'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36',
  'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58.0.3029.110 Chrome/58.0.3029.110 Safari/537.36',
  'Mozilla/5.0 (X11; Linux armv7l) AppleWebKit/537.42 (KHTML, like Gecko) Chromium/25.0.1349.2 Chrome/25.0.1349.2 Safari/537.42',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
  'Mozilla/5.0 (Linux; Ubuntu 14.04 like Android 4.4) AppleWebKit/537.36 Chromium/35.0.1870.2 Mobile Safari/537.36',
];
/**
 * Scraper for the FCA register (https://register.fca.org.uk).
 *
 * Searches the register for a known organisation, opens its "Passport Out"
 * tab and, for each listed country, stores a screenshot, the raw page HTML and
 * a parsed JSON summary of the passporting tables.
 *
 * Relies on members that are not defined in this class (`browser`,
 * `_initBrowser`, `_makeScreenshot`, `_saveToFile`, `_randomWait`) — presumably
 * provided by the `Scraper` base class; confirm against ../helpers/scraper.
 */
class FCAScrape extends Scraper {
  constructor() {
    super();
  }

  /**
   * Opens the "Passport Out" tab of the current organisation page and processes
   * every country listed there, one after another.
   *
   * @param {object} page - Puppeteer page already showing the organisation record.
   * @param {number} id - Organisation id, used as the prefix of output file names.
   * @param {*} [waitFor] - Passed through unchanged to `_makeScreenshot`.
   * @returns {Promise<void>}
   * @throws {Error} 'Passporting not found' when the page has no "Passport Out" link.
   */
  async _checkPassporting(page, id, waitFor) {
    const passportingHeader = await page.$x('//a[contains(text(), \'Passport Out\')]');
    if (passportingHeader.length === 0) {
      throw new Error('Passporting not found');
    }

    await passportingHeader[0].click(); // click tab to open passporting accordion
    await this._makeScreenshot(page, `${id}-passporting`, waitFor); // save general screen

    // check how many countries we need to parse
    const countryLinks = await page.$$('#PanelShPo_PassportOut .countries li');
    const passportingTempArray = range(countryLinks.length);
    console.log('>> passportingTempArray', passportingTempArray);

    for (const item of passportingTempArray) {
      // Puppeteer's mouse.move takes an options object as its third argument;
      // a bare number is ignored, so the step count must be passed as `steps`.
      await page.mouse.move(50, 50, { 'steps': 100 });
      console.log(id, item);
      // Drop the listed cookies before each country click.
      await page.deleteCookie(
        { 'name': '_gat' },
        { 'name': '_gid' },
        { 'name': '_ga' },
        { 'name': '__cfduid' },
        { 'name': 'pctrk' }
      );
      // NOTE: per-country user-agent rotation (picking a random entry from
      // `userAgents` and calling page.setUserAgent) is currently disabled.
      await this._processPassportingCountry(page, id, item);
    }
  }

  /**
   * Clicks one country in the "Passport Out" list, then stores a screenshot,
   * the page HTML and the parsed passporting data for it.
   *
   * @param {object} page - Puppeteer page with the passporting panel open.
   * @param {number} orgId - Organisation id, used as the prefix of output file names.
   * @param {number} id - 1-based position of the country in the list
   *   (used with the CSS `:nth-child` selector).
   * @returns {Promise<void>}
   */
  async _processPassportingCountry(page, orgId, id) {
    // Mousedown duration between 90 and 120 ms (inclusive).
    const mouseDownDuration = 90 + Math.floor(Math.random() * 31);
    console.log('Mouse duration:', mouseDownDuration);
    await page.click(`#PanelShPo_PassportOut .countries li:nth-child(${id}) a`, { 'delay': mouseDownDuration });
    await this._randomWait(page, 20, 40);

    const innerHtml = await page.evaluate(() => document.body.innerHTML);
    await this._makeScreenshot(page, `${orgId}-${id}-passporting`);
    await this._saveToFile(`${orgId}-${id}-inner.html`, innerHtml);

    const parsedPassportOut = await this._parseHtmlPassportingData(innerHtml);
    await this._saveToFile(`${orgId}-${id}-parsed.json`, JSON.stringify(parsedPassportOut));
  }

  /**
   * Looks up one organisation on the FCA register and scrapes its passporting data.
   *
   * The browser is always closed once it has been started, including when a
   * step fails, and failures propagate as the original error so the stack
   * trace is preserved.
   *
   * @param {number} id - Organisation id; must be a key of `searchables`.
   * @returns {Promise<void>}
   * @throws {Error} When `id` has no entry in `searchables`, or when any
   *   browser/scraping step fails.
   */
  async _getOrgData(id) {
    const searchable = searchables.get(id);
    if (searchable === undefined) {
      // Fail before launching a browser: there is nothing to type into the search box.
      throw new Error(`No searchable configured for organisation id ${id}`);
    }

    await this._initBrowser();
    try {
      const page = await this.browser.newPage();
      console.log('>> Wanted searchable', searchable);
      await page.goto('https://register.fca.org.uk/ShPo_HomePage');
      await page.type('input[type=text].input.form-control.searchbox', searchable);
      // press Enter (so we do not need to search for submit button by CSS selector)
      await page.keyboard.press(String.fromCharCode(13));
      await page.waitForSelector('div.RecordDetails h1.RecordName');
      // make general screenshot
      await this._makeScreenshot(page, `${id}-general`);
      // check if org has passporting rights and parse if possible
      await this._checkPassporting(page, id);
    }
    finally {
      // Release the browser on success and on failure alike.
      await this.browser.close();
    }
  }

  /**
   * Parses the passporting tables out of a page's HTML.
   *
   * Each `.ShPo_PassportOutTable` element yields one entry. The first table
   * row's three header cells give `country`, `directive` and `passportType`;
   * every `td` in the table yields one `data` item.
   *
   * @param {string} innerHtml - HTML markup of the page body.
   * @returns {Promise<Array<{country: string, directive: string, passportType: string,
   *   data: Array<{name: string, investment: (Array<{name: string, tt: (string|null)}>|null)}>}>>}
   *   One entry per directive table. `investment` is null for cells without
   *   an `.InvestmentTypes` list; `tt` is the tooltip text, or null when the
   *   list item has no `span`.
   */
  async _parseHtmlPassportingData(innerHtml) {
    const $ = cheerio.load(innerHtml);

    // one table per directive
    return $('.ShPo_PassportOutTable').map((tableIndex, tableEl) => {
      const rows = $(tableEl).find('table tbody tr');

      // table headers
      const head = rows.first().find('th');
      const country = head.eq(0).text().trim();
      const directive = head.eq(1).text().trim();
      const passportType = head.eq(2).text().trim();

      // actual table data
      const data = rows.find('td').map((cellIndex, cellEl) => {
        const cell = $(cellEl);
        const investmentItems = cell.find('.InvestmentTypes li');

        // no lists in the cell, so record just its text
        if (investmentItems.length === 0) {
          return {
            'name': cell.text().trim(),
            'investment': null
          };
        }

        // the cell holds a heading plus a list of investment types
        const name = cell.find('h3').text().trim();
        const investment = investmentItems.map((itemIndex, itemEl) => {
          const listItem = $(itemEl);
          const tooltip = listItem.find('span');
          let tt = null;
          // a span inside the LI carries tooltip markup in its data-content attribute
          if (tooltip.length) {
            const $$ = cheerio.load(tooltip.data('content'));
            tt = $$('div').text().trim();
          }

          return { 'name': listItem.text().trim(), tt };
        }).get();

        return { name, investment };
      }).get();

      return { country, directive, passportType, data };
    }).get();
  }

  // TODO: get initial list as per ticket
  // https://register.fca.org.uk/shpo_searchresultspage?preDefined=AIPISP&TOKEN=3wq1nht7eg7tr
  /**
   * Placeholder for fetching the initial list of organisations; not implemented.
   *
   * @param {object} page - Currently unused.
   * @returns {Promise<undefined>}
   */
  async getInitialList(page) {
    return;
  }

  /**
   * Entry point: scrapes the organisation with the hard-coded id 661836.
   *
   * @returns {Promise<void>}
   */
  async run() {
    await this._getOrgData(661836);
  }
}
module.exports = FCAScrape;