cc-tracking/tasks/fresh Scraper.md
2019-05-21 16:40:16 +01:00

9.0 KiB

Fresh Scraper


const Scraper = require('../helpers/scraper');
const cheerio = require('cheerio');
const path = require('path');
const jsonfile = require('jsonfile');
const logger = require('log4js').getLogger('LV');
const url = require('url');
const removeAccents = require('remove-accents-diacritics');

// Log threshold is taken from the LOGGER_LEVEL environment variable;
// 'warn' is used when that variable is unset or empty.
logger.level = process.env.LOGGER_LEVEL || 'warn';

/**
 * Per-mode wiring, indexed by `this.mode` (0 = payment, 1 = e-money, 2 = credit).
 *
 * - `service`   : name of the instance property holding that mode's state object
 * - `indexDone` : event emitted when one index page of that mode has been read
 * - `done`      : event emitted when every entity of that mode has been processed
 */
const LV_MODES = Object.freeze([
  Object.freeze({ 'service': 'paymentServices', 'indexDone': 'psindexdone', 'done': 'paymentServicesDone' }),
  Object.freeze({ 'service': 'emoneyServices', 'indexDone': 'emindexdone', 'done': 'emoneyServicesDone' }),
  Object.freeze({ 'service': 'creditServices', 'indexDone': 'ciindexdone', 'done': 'creditServicesDone' })
]);

/**
 * Scraper for the 'LV' registers. The run is event driven: every page load is
 * routed through `processNewPage`, and progress between the three service
 * modes is signalled with the events listed in `LV_MODES`.
 */
class LVScrape extends Scraper {

  /**
   * Registers the 'done' handler, builds the throttled `run` entry point and,
   * in production only, starts a run once `_checkLock()` resolves truthy.
   */
  constructor() {
    super();
    this.id = 'LV';

    this.on('done', () => {
      this._done();
    });

    this.run = this._throttle(async () => {
      await this.__run();
    }, 5000);

    if (process.env.NODE_ENV === 'production') {
      this._checkLock().then((l) => {
        if (l)
          return this.run();

        return undefined;
      }).catch((err) => {
        // Without this handler a failed lock check (or run) is an unhandled rejection.
        logger.error('_checkLock fail', err);
      });
    }
  }


  /**
   * Records the output filename for the current entity, advances to the next
   * one and either navigates to it or emits 'serviceDone' when none are left.
   *
   * @param serviceObject state object of the active mode (reads `links`, `step`, `items`)
   * @returns {Promise<void>}
   */
  async processEntityDetails(serviceObject) {

    await this._randomWait(this.page, 3, 5);

    const entry = serviceObject.links[serviceObject.step];

    if (entry) {
      // NOTE(review): the original referenced an undeclared `filename` here and threw a
      // ReferenceError on every call. This name (scraper id, mode, step) is a deterministic
      // placeholder — confirm the naming scheme the downstream consumers expect.
      const filename = `${this.id}_${this.mode}_${serviceObject.step}`;

      entry.filename = `${filename}.json`;
    }
    serviceObject.step++;

    if (serviceObject.step < serviceObject.items) {
      const newUrl = serviceObject.links[serviceObject.step].href;

      await this._goto(newUrl);
    }
    else
      this.emit('serviceDone');
  }

  /**
   * Builds the index for the service object of the current mode. Does nothing
   * when `this.mode` is outside 0-2.
   *
   * NOTE(review): `buildIndex` is not defined in this class — presumably it is
   * provided by `Scraper`; confirm.
   *
   * @returns {Promise<void>}
   */
  async indexRedirector() {
    const current = LV_MODES[this.mode];

    if (current)
      await this.buildIndex(this[current.service]);
  }

  /**
   * Processes the entity page for the service object of the current mode.
   * Does nothing when `this.mode` is outside 0-2.
   *
   * @returns {Promise<void>}
   */
  async processRedirector() {
    const current = LV_MODES[this.mode];

    if (current)
      await this.processEntityDetails(this[current.service]);
  }

  /**
   * Routes a freshly loaded page to the matching handler based on its pathname.
   *
   * Emits 'recover' when the browser landed on Chrome's error page. For an
   * unrecognised pathname it uploads an error and throws when NODE_ENV is set
   * to anything, otherwise it only logs.
   *
   * NOTE(review): none of the index URLs configured in `start()` have the
   * pathnames matched below, so those pages appear to reach the default branch
   * — confirm the routes for this site. `crossBorderRedirector` is also not
   * defined in this class; presumably inherited — confirm.
   *
   * @returns {Promise<void>}
   */
  async processNewPage() {
    // give the page a few seconds to settle
    await this._randomWait(this.page, 3, 5);

    const pageUrl = url.parse(await this.page.url());

    if (pageUrl.href === 'chrome-error://chromewebdata/') {
      logger.warn('Directed to: chrome-error://chromewebdata/');
      this.emit('recover');

      return;
    }

    switch (pageUrl.pathname) {

      case '/en/our-registers/company-register/':
        await this.indexRedirector();
        break;

      case '/en/our-registers/company-register/details':
        await this.processRedirector();
        break;
      case '/en/our-registers/company-register/gransoverskridandehandel/':
        await this.crossBorderRedirector();
        break;

      default:
        if (process.env.NODE_ENV) {
          await this._uploadError();
          // Interpolate href: the parsed URL object itself stringifies to "[object Object]".
          throw new Error(`Unknown page: ${pageUrl.href}`);
        }
        else {
          logger.warn('processNewPage Fell through');
          logger.warn('currentPage.location', pageUrl);
        }
        break;

    }
  }

  /**
   * Registers the listeners that move the scrape forward:
   *
   * - 'serviceDone' / 'indexdone' are re-emitted as the event of the current mode.
   * - each mode's index-done event moves to the next index URL or, once all
   *   index URLs are read, to the first collected entity link.
   * - each mode's done event writes that mode's JSON artefacts, advances
   *   `this.mode`, then opens the next mode's first index URL (or emits 'done'
   *   after the last mode).
   *
   * @returns {Promise<void>}
   */
  async attachEvents() {
    this.on('serviceDone', async () => {
      const current = LV_MODES[this.mode];

      if (current)
        this.emit(current.done);
    });

    this.on('indexdone', async () => {
      const current = LV_MODES[this.mode];

      if (current)
        this.emit(current.indexDone);
    });

    LV_MODES.forEach((entry, position) => {
      this.on(entry.indexDone, async () => {
        // Errors are caught and logged, as in the done handlers below; an async
        // listener that rejects would otherwise be an unhandled rejection.
        try {
          const service = this[entry.service];

          service.items = service.links.length;
          logger.info(`${service.items} items indexed`);

          service.indexStep++;
          if (service.indexStep < service.urls.length) {
            await this._goto(service.urls[service.indexStep]);

            return;
          }

          const firstEntity = service.links[service.step];

          if (!firstEntity) {
            // Nothing was indexed, so there is no entity page to open; finish this mode.
            logger.warn(`No links indexed for ${entry.service}; skipping to ${entry.done}`);
            this.emit(entry.done);

            return;
          }

          await this._goto(firstEntity.href);
        }
        catch (e) {
          logger.error(e);
        }
      });

      this.on(entry.done, async () => {
        logger.warn(entry.done);
        try {
          const service = this[entry.service];

          service.done = true;
          jsonfile.writeFileSync(`${this.path}/${entry.service}.json`, { 'links': service.links });
          jsonfile.writeFileSync(`${this.debugPath}/${entry.service}.json`, service);

          this.mode++;
          this.inProgress = false;

          const following = LV_MODES[position + 1];

          if (following)
            await this._goto(this[following.service].urls[0]);
          else
            this.emit('done');
        }
        catch (e) {
          logger.error(e);
        }
      });
    });
  }

  /**
   * Resets the run state, prepares the browser page, wires the listeners
   * (once per instance) and navigates to the first payment-services index URL.
   *
   * @returns {Promise<void>}
   * @throws {Error} whatever setup or navigation raised; non-Error values are wrapped
   */
  async start() {
    // NOTE(review): the result of `_start()` is not awaited, as in the original;
    // if it returns a promise its rejection is not caught here — confirm.
    super._start();
    try {
      this.mode = 0;

      this.rootURI = 'http://www.fktk.lv';

      this.paymentServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['http://www.fktk.lv/en/market/payment-institutions/authorized-payment-institutions.html', 'http://www.fktk.lv/en/market/payment-institutions/registered-payment-institutions.html'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.emoneyServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'urls': ['http://www.fktk.lv/en/market/electronic-money-institutions/authorized-electronic-money-institutions.html', 'http://www.fktk.lv/en/market/electronic-money-institutions/registered-electronic-money-institutions.html'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.creditServices = {
        'items': 0,
        'links': [],
        'step': 0,
        'indexStep': 0,
        'visited': false,
        'done' : false,
        'searchDone' : false,
        'started': false,
        'urls': ['http://www.fktk.lv/en/market/credit-institutions/banks.html'],
        'sections' : [],
        'sectionLinks' : []
      };

      this.startPage = this.paymentServices.urls[0];
      this.emoneyUrl = this.emoneyServices.urls[0];
      this.credit = this.creditServices.urls[0];

      this.setPath(path.resolve(`${__dirname }/../artefacts/LV/FCMC`));

      // A non-repudiation failure is logged but does not abort the run.
      await this._doNonRepudiation().catch((err) => {
        logger.warn(err);
      });

      await this._initBrowser();
      await this._createBrowserPage();

      this.page.on('domcontentloaded', this._throttle(async () => {
        this.processNewPage().catch((err) => {
          logger.error('processNewPage fail', err);
        });
      }, 2500));

      // Attach the workflow listeners only once per instance. Checking for one of
      // our own listeners does not depend on how many events the base class registers.
      if (this.listenerCount('serviceDone') === 0)
        await this.attachEvents();

      await this.page.setViewport({ 'width': 1200, 'height': 800 });
      await this._goto(this.startPage, { 'waitUntil':'networkidle0' });

      await this._randomWait(this.page, 3, 5);
    }
    catch(e) {
      // Rethrow real errors untouched so their stack and type survive.
      throw e instanceof Error ? e : new Error(e);
    }
  }

  /**
   * Body of the throttled `run` entry point created in the constructor.
   *
   * @returns {Promise<void>}
   */
  async __run() {
    await this.start();
  }

}

// CommonJS export: the module's value is the LVScrape class itself.
module.exports = LVScrape;