obdfcascrape/tests/scrape.gi.js
Martin Donnelly be5d3eae07 init
2019-05-05 20:13:56 +01:00

167 lines
5.0 KiB
JavaScript

// Test harness: tape, wrapped by tape-promise (see the inline notes on the next two lines).
const tape = require('tape');
const _test = require('tape-promise').default; // <---- notice 'default'
const test = _test(tape); // decorate tape
// cheerio is used by the unit tests to load an HTML fixture and pull fragments out of it.
const cheerio = require('cheerio')
// NOTE(review): `diff` is not referenced in the visible part of this file — confirm before removing.
const diff = require('deep-diff');
// fs reads the HTML fixtures; jsonfile reads the expected-output JSON fixtures.
const fs = require('fs');
const jsonfile = require('jsonfile');
// Scraper under test: a single Gibraltar instance is shared by every test in this file.
const Gibraltar = require('../ncas/gi');
const giScraper = new Gibraltar();
// NOTE(review): `failure` and `empty` are not referenced in the visible part of this file — confirm whether they are still needed.
const failure = { 'fail':true };
const empty = {};
// Unit tests for the scraper's low-level DOM/text helpers.
// NOTE(review): the nested cases are registered through `test.test` rather than on the
// callback's own test object, so they are presumably queued on the top-level harness
// instead of running as sub-tests of 'Unit' — confirm this is intended.
test.test('Unit', async group => {
  test.test('Get uppermost elements by selector', async t => {
    const selector = 'div.foo, li.bar';
    const html = fs.readFileSync('tests/data/gi/unit/get-uppermost-elements-by-selector.html');
    const found = await giScraper.getUppermostElementsBySelector(html, selector);

    // The fixture is expected to yield exactly seven elements...
    t.equal(found.length, 7);

    // ...and none of them may carry a class attribute containing 'nomatch'.
    let idx = 0;
    while (idx < found.length) {
      const className = found[idx].attribs['class'];
      t.false(className.includes('nomatch'));
      idx += 1;
    }

    t.end();
  });

  test.test('Get text not in matching elements', async t => {
    const selector = 'div.foo, li.bar';

    // Each case names a container in the fixture page and the text expected back
    // once whitespace has been reduced.
    const expectations = [
      { selector: '#fixture1', expectedText: 'This text A should match.' },
      { selector: '#fixture2', expectedText: 'This text C should match. This text E should match.' },
      { selector: '#fixture3', expectedText: 'This text F should match. This text G should match. This text I should match.' },
    ];

    const pageSource = fs.readFileSync('tests/data/gi/unit/get-text-not-in-matching-elements.html');
    const page = cheerio.load(pageSource);

    // Cases run one after another; each awaits the scraper before asserting.
    for (const { selector: containerSelector, expectedText } of expectations) {
      const fragment = page(containerSelector).html();
      const rawText = await giScraper.getTextNotInMatchingElements(fragment, selector);
      const squeezed = await giScraper._reduceWhiteSpace(rawText);
      t.equal(squeezed, expectedText);
    }

    t.end();
  });

  group.end();
});
// Entity-page tests: each case feeds a saved HTML page to extractEntityDetails and
// compares the result (minus `permissions`) with a saved JSON fixture.
test.test('Entities', async group => {
  // One row per entity page; the same checks are applied to every row, in this order.
  const entityCases = [
    {
      title: 'Gibraltar:: Process PS Entity 001',
      htmlFile: 'tests/data/gi/ps_001.html',
      jsonFile: 'tests/data/gi/ps_001.json',
    },
    {
      title: 'Gibraltar:: Process EM Entity 001',
      htmlFile: 'tests/data/gi/em_001.html',
      jsonFile: 'tests/data/gi/em_001.json',
    },
    {
      title: 'Gibraltar:: Process Agent Entity 001',
      htmlFile: 'tests/data/gi/agent_001.html',
      jsonFile: 'tests/data/gi/agent_001.json',
    },
  ];

  for (const { title, htmlFile, jsonFile } of entityCases) {
    test.test(title, async entity => {
      entity.test('🇬🇮::Extract entity details', async t => {
        const pageHtml = fs.readFileSync(htmlFile);
        const expectedJSON = jsonfile.readFileSync(jsonFile);
        const details = await giScraper.extractEntityDetails(pageHtml);

        // Permissions are deliberately excluded from the comparison in these tests.
        delete details.permissions;

        t.deepEquals(details, expectedJSON, 'Extracted entity details from Page');
        t.end();
      });

      entity.end();
    });
  }

  group.end();
});
// Fragment-level tests: each case runs one scraper method over a saved HTML fragment
// and compares the output with a saved JSON fixture.
test.test('Permissions, Agents, and other html fragments', async group => {
  // Permission fragments go through recurseDOM using the scraper's own blockBoundaries.
  const parsePermissions = html => giScraper.recurseDOM(html, giScraper.blockBoundaries);
  // Agent fragments go through extractAgents.
  const parseAgents = html => giScraper.extractAgents(html);

  // One row per fragment; rows are registered in this order.
  const fragmentCases = [
    {
      title: 'Gibraltar:: Process permissions 001',
      htmlFile: 'tests/data/gi/perms_001.html',
      jsonFile: 'tests/data/gi/perms_001.json',
      extract: parsePermissions,
      message: 'Extracted permissions from html',
    },
    {
      title: 'Gibraltar:: Process permissions 002',
      htmlFile: 'tests/data/gi/perms_002.html',
      jsonFile: 'tests/data/gi/perms_002.json',
      extract: parsePermissions,
      message: 'Extracted permissions from html',
    },
    {
      title: 'Gibraltar:: Process agents in perms 001',
      htmlFile: 'tests/data/gi/agents_in_perms_001.html',
      jsonFile: 'tests/data/gi/agents_in_perms_001.json',
      extract: parseAgents,
      message: 'Extracted agents from html',
    },
  ];

  for (const { title, htmlFile, jsonFile, extract, message } of fragmentCases) {
    test.test(title, async t => {
      const fragmentHtml = fs.readFileSync(htmlFile);
      const expectedJSON = jsonfile.readFileSync(jsonFile);
      const actual = await extract(fragmentHtml);
      t.deepEquals(actual, expectedJSON, message);
      t.end();
    });
  }

  group.end();
});