194 lines
7.4 KiB
JavaScript
194 lines
7.4 KiB
JavaScript
// Load environment variables from the `.env` file located one directory
// above this script's directory.
require('dotenv').config({
  'path': `${__dirname}/../.env`
});

// Version string of this script (not referenced elsewhere in the visible source).
const version = '0.0.1-1';
|
|
// load helper libs etc
|
|
const CsvData = require('../helpers/csv-data');
|
|
const csv = new CsvData();
|
|
|
|
const Scraper = require('../helpers/scraper');
|
|
|
|
const cheerio = require('cheerio');
|
|
const fs = require('fs');
|
|
|
|
// Builds the 1-based integer sequence [1, 2, ..., n]; yields [] when n is 0.
const range = (n) => {
  const slots = Array.from({ 'length': n });

  return slots.map((unused, position) => position + 1);
};
|
|
|
|
// Search-box text keyed by numeric organisation id. Each value starts with the
// same number as its key (presumably the FCA firm reference number -- confirm)
// followed by the organisation name.
const searchables = new Map([
  [759676, '759676 Barclays Bank UK PLC'],
  [661836, '661836 American Express Services Europe Limited (AESEL)']
]);
|
|
|
|
// Candidate User-Agent header values (Chrome / Chromium builds on macOS and Linux).
const userAgents = [
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
  'Mozilla/5.0 (Linux; Ubuntu 14.04) AppleWebKit/537.36 Chromium/35.0.1870.2 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/47.0.2526.73 Chrome/47.0.2526.73 Safari/537.36',
  'Mozilla/5.0 (Linux; Ubuntu 16.04) AppleWebKit/537.36 Chromium/57.0.2987.110 Safari/537.36',
  'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36',
  'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58.0.3029.110 Chrome/58.0.3029.110 Safari/537.36',
  'Mozilla/5.0 (X11; Linux armv7l) AppleWebKit/537.42 (KHTML, like Gecko) Chromium/25.0.1349.2 Chrome/25.0.1349.2 Safari/537.42',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
  'Mozilla/5.0 (Linux; Ubuntu 14.04 like Android 4.4) AppleWebKit/537.36 Chromium/35.0.1870.2 Mobile Safari/537.36'
];
|
|
|
|
/**
 * Scraper for the FCA register (https://register.fca.org.uk).
 *
 * Looks an organisation up through the register's search box, opens its
 * "Passport Out" tab and stores, per listed country, a screenshot, the raw
 * page HTML and the parsed passporting tables.
 *
 * Relies on members that are not defined in this class and are presumably
 * inherited from `Scraper` (TODO confirm): `_initBrowser`, `browser`,
 * `_makeScreenshot`, `_saveToFile` and `_randomWait`.
 */
class FCAScrape extends Scraper {

  // Kept explicit on purpose: it calls the parent constructor with no arguments.
  constructor() {
    super();
  }

  /**
   * Opens the "Passport Out" tab of the current record page and processes
   * every country listed there, one after another.
   *
   * @param {object} page - browser page showing the record (Puppeteer-style API).
   * @param {number} id - organisation id, used as prefix for output names.
   * @param {*} [waitFor] - forwarded unchanged to `_makeScreenshot`.
   * @returns {Promise<void>}
   * @throws {Error} 'Passporting not found' when the page has no "Passport Out" link.
   */
  async _checkPassporting(page, id, waitFor) {
    const passportingHeader = await page.$x('//a[contains(text(), \'Passport Out\')]');

    if (passportingHeader.length === 0) {
      throw new Error('Passporting not found');
    }

    // click the tab to open the passporting accordion, then capture the overview
    await passportingHeader[0].click();
    await this._makeScreenshot(page, `${id}-passporting`, waitFor);

    // one list item per country that needs parsing; positions are 1-based
    // because they are later used in an `:nth-child()` selector
    const countryLinks = await page.$$('#PanelShPo_PassportOut .countries li');
    const passportingTempArray = range(countryLinks.length);
    console.log('>> passportingTempArray', passportingTempArray);

    for (const item of passportingTempArray) {
      // The third argument must be an options object; the bare number that was
      // passed before was not a valid options value and had no effect.
      await page.mouse.move(50, 50, { 'steps': 100 });

      console.log(id, item);

      // drop analytics/tracking cookies before each country request
      await page.deleteCookie(
        { 'name': '_gat' },
        { 'name': '_gid' },
        { 'name': '_ga' },
        { 'name': '__cfduid' },
        { 'name': 'pctrk' }
      );

      // NOTE: per-country user-agent rotation is currently disabled.
      await this._processPassportingCountry(page, id, item);
    }
  }

  /**
   * Clicks one country in the "Passport Out" list and stores a screenshot,
   * the page HTML and the parsed passporting data for it.
   *
   * @param {object} page - browser page with the passporting panel open.
   * @param {number} orgId - organisation id, used as prefix for output names.
   * @param {number} id - 1-based position of the country in the list.
   * @returns {Promise<void>}
   */
  async _processPassportingCountry(page, orgId, id) {
    // Mousedown duration between 90 and 120 ms (both inclusive)
    const mouseDownDuration = 90 + Math.floor(Math.random() * 31);

    console.log('Mouse duration:', mouseDownDuration);

    await page.click(`#PanelShPo_PassportOut .countries li:nth-child(${id}) a`, { 'delay': mouseDownDuration });
    // wait bounds are passed as (20, 40); units are defined by `_randomWait` -- TODO confirm
    await this._randomWait(page, 20, 40);

    const innerHtml = await page.evaluate(() => document.body.innerHTML);
    await this._makeScreenshot(page, `${orgId}-${id}-passporting`);
    await this._saveToFile(`${orgId}-${id}-inner.html`, innerHtml);

    const parsedPassportOut = await this._parseHtmlPassportingData(innerHtml);
    await this._saveToFile(`${orgId}-${id}-parsed.json`, JSON.stringify(parsedPassportOut));
  }

  /**
   * Searches the register for the organisation with the given id, takes a
   * general screenshot and processes its passporting data.
   *
   * The browser is closed whether or not scraping succeeds, and the original
   * error (with its stack) is propagated to the caller.
   *
   * @param {number} id - key into `searchables`.
   * @returns {Promise<void>}
   * @throws {Error} when `id` has no entry in `searchables`, or when any
   *   browser/scraping step fails.
   */
  async _getOrgData(id) {
    const searchable = searchables.get(id);

    // fail before launching a browser instead of typing `undefined` into the page
    if (searchable === undefined) {
      throw new Error(`No searchable configured for id ${id}`);
    }

    try {
      await this._initBrowser();
      const page = await this.browser.newPage();

      console.log('>> Wanted searchable', searchable);

      await page.goto('https://register.fca.org.uk/ShPo_HomePage');
      await page.type('input[type=text].input.form-control.searchbox', searchable);
      // press Enter so we do not need to locate the submit button by CSS selector
      await page.keyboard.press('Enter');
      await page.waitForSelector('div.RecordDetails h1.RecordName');

      // make general screenshot
      await this._makeScreenshot(page, `${id}-general`);

      // check if org has passporting rights and parse if possible
      await this._checkPassporting(page, id);
    }
    finally {
      // release the browser on failure too, otherwise the process keeps it alive
      if (this.browser) {
        await this.browser.close();
      }
    }
  }

  /**
   * Parses every `.ShPo_PassportOutTable` element in the given HTML.
   *
   * @param {string} innerHtml - HTML of the page body.
   * @returns {Promise<Array<{country: string, directive: string,
   *   passportType: string, data: Array<{name: string,
   *   investment: (Array<{name: string, tt: (string|null)}>|null)}>}>>}
   *   one entry per table; `country`, `directive` and `passportType` are the
   *   texts of the first row's first three header cells. `tt` is the tooltip
   *   text, or null when the list item has no tooltip content.
   */
  async _parseHtmlPassportingData(innerHtml) {
    const $ = cheerio.load(innerHtml);

    // one table per directive
    const directives = $('.ShPo_PassportOutTable').map((tableIndex, table) => {
      const rows = $(table).find('table tbody tr');

      // table headers
      const head = rows.first().find('th');
      const country = head.eq(0).text().trim();
      const directive = head.eq(1).text().trim();
      const passportType = head.eq(2).text().trim();

      // actual table data
      const data = rows.find('td').map((cellIndex, cell) => {
        const investmentItems = $(cell).find('.InvestmentTypes li');

        // no lists in the cell, so record just its text
        if (investmentItems.length === 0) {
          return {
            'name': $(cell).text().trim(),
            'investment': null
          };
        }

        // cell carries an investment-type list: its name comes from the H3
        const investment = investmentItems.map((itemIndex, item) => {
          const tooltip = $(item).find('span');
          let tt = null;

          // a span inside the LI means it has a tooltip; its HTML lives in data-content
          if (tooltip.length) {
            const content = tooltip.data('content');

            // guard: a span without string content must not be fed to cheerio.load
            if (typeof content === 'string') {
              tt = cheerio.load(content)('div').text().trim();
            }
          }

          return { 'name': $(item).text().trim(), tt };
        }).get();

        return { 'name': $(cell).find('h3').text().trim(), investment };
      }).get();

      return { country, directive, passportType, data };
    }).get();

    return directives;
  }

  // TODO: get initial list as per ticket
  // https://register.fca.org.uk/shpo_searchresultspage?preDefined=AIPISP&TOKEN=3wq1nht7eg7tr
  /**
   * Placeholder, not implemented yet.
   *
   * @param {object} page - currently unused.
   * @returns {Promise<undefined>}
   */
  async getInitialList(page) {
    return;
  }

  /**
   * Entry point: scrapes the organisation with id 661836.
   *
   * @returns {Promise<void>}
   */
  async run() {
    await this._getOrgData(661836);
  }
}
|
|
|
|
module.exports = FCAScrape;
|