// Load environment variables from the project-level .env file.
require('dotenv').config({ 'path': `${__dirname}/../.env` });

const version = '0.0.1-1';

// Helper libraries.
const CsvData = require('../helpers/csv-data');
const csv = new CsvData();
const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const fs = require('fs');

/**
 * Build the 1-based sequence [1, 2, ..., n].
 *
 * @param {number} n - Number of elements to generate.
 * @returns {number[]} Array of consecutive integers starting at 1.
 */
const range = n => Array.from({ 'length': n }, (value, key) => key + 1);

// Organisation id -> text typed into the register's search box.
const searchables = new Map([
    [759676, '759676 Barclays Bank UK PLC'],
    [661836, '661836 American Express Services Europe Limited (AESEL)']
]);

// Pool of user-agent strings. No live code in this file reads it at the moment
// (user-agent rotation is not wired in); it is kept so rotation can be enabled later.
const userAgents = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36',
    'Mozilla/5.0 (Linux; Ubuntu 14.04) AppleWebKit/537.36 Chromium/35.0.1870.2 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/66.0.3359.181 Chrome/66.0.3359.181 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/47.0.2526.73 Chrome/47.0.2526.73 Safari/537.36',
    'Mozilla/5.0 (Linux; Ubuntu 16.04) AppleWebKit/537.36 Chromium/57.0.2987.110 Safari/537.36',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58.0.3029.110 Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (X11; Linux armv7l) AppleWebKit/537.42 (KHTML, like Gecko) Chromium/25.0.1349.2 Chrome/25.0.1349.2 Safari/537.42',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
    'Mozilla/5.0 (Linux; Ubuntu 14.04 like Android 4.4) AppleWebKit/537.36 Chromium/35.0.1870.2 Mobile Safari/537.36'
];

/**
 * Scraper for "Passport Out" data on the FCA register website.
 *
 * Relies on members that are not defined in this file and are presumably
 * provided by the `Scraper` base class: `_initBrowser`, `browser`,
 * `_makeScreenshot`, `_saveToFile` and `_randomWait` (verify against
 * ../helpers/scraper).
 */
class FCAScrape extends Scraper {
    /**
     * Open the "Passport Out" tab of the currently loaded organisation page
     * and process every country listed there, one after another.
     *
     * @param {object} page - Browser page showing an organisation record.
     * @param {number} id - Organisation id, used to name output artefacts.
     * @param {*} [waitFor] - Forwarded unchanged to `_makeScreenshot`.
     * @returns {Promise<void>}
     * @throws {Error} When the page has no "Passport Out" link.
     */
    async _checkPassporting(page, id, waitFor) {
        const passportingHeader = await page.$x('//a[contains(text(), \'Passport Out\')]');

        if (passportingHeader.length === 0) {
            throw new Error('Passporting not found');
        }

        // Click the tab to open the passporting accordion, then capture the general view.
        await passportingHeader[0].click();
        await this._makeScreenshot(page, `${id}-passporting`, waitFor);

        // One entry per country link; positions are 1-based to match :nth-child().
        const countryLinks = await page.$$('#PanelShPo_PassportOut .countries li');
        const passportingTempArray = range(countryLinks.length);

        console.log('>> passportingTempArray', passportingTempArray);

        for (const position of passportingTempArray) {
            // The third argument of mouse.move is an options object; a bare number is ignored.
            await page.mouse.move(50, 50, { 'steps': 100 });
            console.log(id, position);

            // Drop these cookies before every country request.
            await page.deleteCookie(
                { 'name': '_gat' },
                { 'name': '_gid' },
                { 'name': '_ga' },
                { 'name': '__cfduid' },
                { 'name': 'pctrk' }
            );

            await this._processPassportingCountry(page, id, position);
        }
    }

    /**
     * Click a single country in the "Passport Out" list, then store a
     * screenshot, the raw page HTML and the parsed passporting data.
     *
     * @param {object} page - Browser page with the passporting accordion open.
     * @param {number} orgId - Organisation id, used to name output artefacts.
     * @param {number} id - 1-based position of the country in the list.
     * @returns {Promise<void>}
     */
    async _processPassportingCountry(page, orgId, id) {
        // Mouse-down duration between 90 and 120 ms (inclusive).
        const mouseDownDuration = 90 + Math.floor(Math.random() * 31);

        console.log('Mouse duration:', mouseDownDuration);
        await page.click(
            `#PanelShPo_PassportOut .countries li:nth-child(${id}) a`,
            { 'delay': mouseDownDuration }
        );
        // NOTE(review): units and semantics of 20/40 are defined by _randomWait - confirm.
        await this._randomWait(page, 20, 40);

        const innerHtml = await page.evaluate(() => document.body.innerHTML);

        await this._makeScreenshot(page, `${orgId}-${id}-passporting`);
        await this._saveToFile(`${orgId}-${id}-inner.html`, innerHtml);

        const parsedPassportOut = await this._parseHtmlPassportingData(innerHtml);

        await this._saveToFile(`${orgId}-${id}-parsed.json`, JSON.stringify(parsedPassportOut));
    }

    /**
     * Search the register for one organisation and scrape its passporting data.
     *
     * @param {number} id - Organisation id; must be a key of `searchables`.
     * @returns {Promise<void>}
     * @throws {Error} When `id` has no search phrase, or when any scraping
     *     step fails (the original error is propagated unchanged).
     */
    async _getOrgData(id) {
        const searchable = searchables.get(id);

        // Fail fast instead of launching a browser and typing `undefined`.
        if (searchable === undefined) {
            throw new Error(`No searchable defined for organisation ${id}`);
        }

        try {
            await this._initBrowser();
            const page = await this.browser.newPage();

            console.log('>> Wanted searchable', searchable);
            await page.goto('https://register.fca.org.uk/ShPo_HomePage');
            await page.type('input[type=text].input.form-control.searchbox', searchable);
            // Submit with Enter so the submit button does not need its own selector.
            await page.keyboard.press('Enter');
            await page.waitForSelector('div.RecordDetails h1.RecordName');

            // General screenshot of the organisation record.
            await this._makeScreenshot(page, `${id}-general`);

            // Check whether the organisation has passporting rights and parse them.
            await this._checkPassporting(page, id);
        } finally {
            // Always release the browser, including when a step above throws.
            if (this.browser) {
                await this.browser.close();
            }
        }
    }

    /**
     * Parse the passporting tables out of a page's HTML.
     *
     * @param {string} innerHtml - HTML of the page body.
     * @returns {Promise<Array<{country: string, directive: string,
     *     passportType: string, data: Array<{name: string,
     *     investment: ?Array<{name: string, tt: ?string}>}>}>>}
     *     One entry per `.ShPo_PassportOutTable` element found.
     */
    async _parseHtmlPassportingData(innerHtml) {
        const $ = cheerio.load(innerHtml);

        // One table per directive for the currently selected country.
        return $('.ShPo_PassportOutTable').map((tableIndex, tableEl) => {
            const rows = $(tableEl).find('table tbody tr');

            // The header cells of the first row carry country, directive and passport type.
            const head = rows.first().find('th');
            const country = head.eq(0).text().trim();
            const directive = head.eq(1).text().trim();
            const passportType = head.eq(2).text().trim();

            // Actual table data: one record per <td>.
            const data = rows.find('td').map((cellIndex, cellEl) => {
                const investmentItems = $(cellEl).find('.InvestmentTypes li');

                // No investment list in this cell, so record just its text.
                if (!investmentItems.length) {
                    return { 'name': $(cellEl).text().trim(), 'investment': null };
                }

                const name = $(cellEl).find('h3').text().trim();
                const investment = investmentItems.map((itemIndex, itemEl) => {
                    const tooltip = $(itemEl).find('span');
                    let tt = null;

                    // A <span> inside the item means it has a tooltip whose HTML
                    // is stored in its data-content attribute.
                    if (tooltip.length) {
                        const tooltipHtml = tooltip.data('content');

                        // Skip spans without usable tooltip markup rather than
                        // handing a non-string to cheerio.load.
                        if (typeof tooltipHtml === 'string') {
                            tt = cheerio.load(tooltipHtml)('div').text().trim();
                        }
                    }

                    return { 'name': $(itemEl).text().trim(), tt };
                }).get();

                return { name, investment };
            }).get();

            return { country, directive, passportType, data };
        }).get();
    }

    // TODO: get initial list as per ticket
    // https://register.fca.org.uk/shpo_searchresultspage?preDefined=AIPISP&TOKEN=3wq1nht7eg7tr
    /**
     * Placeholder for fetching the initial organisation list; not implemented yet.
     *
     * @param {object} page - Browser page (currently unused).
     * @returns {Promise<undefined>}
     */
    async getInitialList(page) {
        return;
    }

    /**
     * Entry point: scrape passporting data for one organisation.
     *
     * @param {number} [orgId=661836] - Organisation id; must be a key of `searchables`.
     * @returns {Promise<void>}
     */
    async run(orgId = 661836) {
        await this._getOrgData(orgId);
    }
}

module.exports = FCAScrape;