Fix for DIR-3741

This commit is contained in:
Martin Donnelly 2019-05-07 09:33:57 +01:00
parent 81ade092b9
commit ff6985b72f
10 changed files with 988 additions and 82 deletions

35
env
View File

@ -1,35 +0,0 @@
# AWS_ACCESS_KEY_ID=AKIAICAGOZCGAQIYCAUA
# AWS_SECRET_ACCESS_KEY=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
# S3_BUCKET=obregstoretest
AWS_ACCESS_KEY_ID=AKIAJWJS75F7WNCGK64A
AWS_SECRET_ACCESS_KEY=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
AWS_REGION=eu-west-1
S3_BUCKET=obregstoretest-mdtest
SQS_ID=https://sqs.eu-west-1.amazonaws.com/115486161803/obdfcascrape
SQS_ARN=arn:aws:sqs:eu-west-1:115486161803:obdfcascrape
SQS_NAME=obdfcascrape
TOPIC_ARN=arn:aws:sns:eu-west-1:115486161803:obdfcascrape
QUEUE_URL=https://sqs.eu-west-1.amazonaws.com/115486161803/obdfcascrape
QUEUE_ARN=arn:aws:sqs:eu-west-1:115486161803:obdfcascrape
#PROXY_URI=us-wa.proxymesh.com:31280
PROXY_URI=uk.proxymesh.com:31280
FILE_DATE_FORMAT=yyyymmdd
LOGGER_LEVEL=debug
#SCRAPE_START=EE
CY_CRON=00 12 * * *
#DE_CRON=15 12 * * *
#DK_CRON=00 12 * * *
#FR_CRON=15 12 * * *
#IE_CRON=00 12 * * *
MT_CRON=15 12 * * *
NL_CRON=00 12 * * *
#SE_CRON=15 12 * * *
#IT_CRON=00 12 * * *
#CZ_CRON=15 12 * * *
#SK_CRON=00 12 * * *

View File

@ -91,7 +91,7 @@ class FRScrape extends Scraper {
const children = cheerio(elm).children();
if (children.length > 2) {
if (children.length === 11)
if (children.length === 11)
children.each((step, fiElm) => {
financialInstruments.push(this._cleanUp(cheerio(fiElm).text()));
});
@ -105,14 +105,14 @@ class FRScrape extends Scraper {
while(offset < financialInstruments.length) {
if (children.eq(offset).html().match(unchecked) === null)
finInst.push(financialInstruments[offset - fiOffset]);
offset++;
}
if (finInst.length > 0)
if (finInst.length > 0)
output.push([rowName, finInst]);
}
}
else if (children.length === 2)
else if (children.length === 2)
if (children.eq(0).html().match(unchecked) === null) {
authorised.push(this._cleanUp(children.eq(1).text()));
@ -173,13 +173,12 @@ class FRScrape extends Scraper {
if (!creditInstFilter)
// Default mode
links.push({ link, title });
else
if ($row.children().length === 6) {
const status = this._cleanUp($row.children().eq(5).text().toLowerCase());
else
if ($row.children().length >= 6) {
const statusField = $row.children().length - 1;
const status = this._cleanUp($row.children().eq(statusField).text().toLowerCase());
logger.debug(`Status:**${status}** ${title}`);
if(wantedCIStatuses.indexOf(status) !== -1) {
logger.debug(`Matched:**${status}** ${title}`);
links.push({ link, title });
}
}
@ -210,7 +209,7 @@ class FRScrape extends Scraper {
}
}
});
return details;
}
@ -244,9 +243,9 @@ class FRScrape extends Scraper {
const frenchTbl = $('#zone_en_france > table tr');
if (this.mode < 2)
if (this.mode < 2)
pageData.frActivities = await this.extractDataFromTable(frenchTbl).output;
else
else
pageData.creditInstituteActivities = await this.extractDataFromInvestmentServicesTable(frenchTbl);
if (this.mode < 2) {
@ -331,6 +330,10 @@ class FRScrape extends Scraper {
if ($table.length > 1)
// The table contains more than just the heading row
store.indexcount++;
logger.debug(`Processing menu: ${this.modeTitles[this.mode]} // ${store.indexcount}`);
await this._makeScreenshotV2(this.page, `${this.path}/${this.modePrefix[this.mode]}_menu_${store.indexcount}`, null);
store.links = store.links.concat(await this.extractLinks($table, (this.mode === 2)));
@ -415,7 +418,7 @@ class FRScrape extends Scraper {
async start() {
await super._start();
try {
this.mode = 0;
this.mode = 2;
this.paymentServices = {
'items': 0,
@ -423,7 +426,9 @@ class FRScrape extends Scraper {
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
'searchDone' : false,
'indexcount' :0
};
this.emoneyServices = {
@ -432,7 +437,8 @@ class FRScrape extends Scraper {
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
'searchDone' : false,
'indexcount' :0
};
this.creditServices = {
@ -441,7 +447,8 @@ class FRScrape extends Scraper {
'step': 0,
'visited': false,
'done' : false,
'searchDone' : false
'searchDone' : false,
'indexcount' :0
};
this.startPage = 'https://www.regafi.fr/spip.php?page=results&type=advanced&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=21-TBR07&retrait=0&lang=en&id_secteur=3';
@ -464,7 +471,7 @@ class FRScrape extends Scraper {
await this.page.tracing.start({ 'path': `${this.path}/trace.json`, 'screenshots':true });
await this.page.setViewport({ 'width': 1200, 'height': 800 });
await this._goto(this.startPage);
await this._goto(this.creditUrl);
await this._randomWait(this.page, 3, 5);
}

41
package-lock.json generated
View File

@ -2300,8 +2300,7 @@
},
"ansi-regex": {
"version": "2.1.1",
"bundled": true,
"optional": true
"bundled": true
},
"aproba": {
"version": "1.2.0",
@ -2319,13 +2318,11 @@
},
"balanced-match": {
"version": "1.0.0",
"bundled": true,
"optional": true
"bundled": true
},
"brace-expansion": {
"version": "1.1.11",
"bundled": true,
"optional": true,
"requires": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
@ -2338,18 +2335,15 @@
},
"code-point-at": {
"version": "1.1.0",
"bundled": true,
"optional": true
"bundled": true
},
"concat-map": {
"version": "0.0.1",
"bundled": true,
"optional": true
"bundled": true
},
"console-control-strings": {
"version": "1.1.0",
"bundled": true,
"optional": true
"bundled": true
},
"core-util-is": {
"version": "1.0.2",
@ -2452,8 +2446,7 @@
},
"inherits": {
"version": "2.0.3",
"bundled": true,
"optional": true
"bundled": true
},
"ini": {
"version": "1.3.5",
@ -2463,7 +2456,6 @@
"is-fullwidth-code-point": {
"version": "1.0.0",
"bundled": true,
"optional": true,
"requires": {
"number-is-nan": "^1.0.0"
}
@ -2476,20 +2468,17 @@
"minimatch": {
"version": "3.0.4",
"bundled": true,
"optional": true,
"requires": {
"brace-expansion": "^1.1.7"
}
},
"minimist": {
"version": "0.0.8",
"bundled": true,
"optional": true
"bundled": true
},
"minipass": {
"version": "2.3.5",
"bundled": true,
"optional": true,
"requires": {
"safe-buffer": "^5.1.2",
"yallist": "^3.0.0"
@ -2506,7 +2495,6 @@
"mkdirp": {
"version": "0.5.1",
"bundled": true,
"optional": true,
"requires": {
"minimist": "0.0.8"
}
@ -2579,8 +2567,7 @@
},
"number-is-nan": {
"version": "1.0.1",
"bundled": true,
"optional": true
"bundled": true
},
"object-assign": {
"version": "4.1.1",
@ -2590,7 +2577,6 @@
"once": {
"version": "1.4.0",
"bundled": true,
"optional": true,
"requires": {
"wrappy": "1"
}
@ -2666,8 +2652,7 @@
},
"safe-buffer": {
"version": "5.1.2",
"bundled": true,
"optional": true
"bundled": true
},
"safer-buffer": {
"version": "2.1.2",
@ -2697,7 +2682,6 @@
"string-width": {
"version": "1.0.2",
"bundled": true,
"optional": true,
"requires": {
"code-point-at": "^1.0.0",
"is-fullwidth-code-point": "^1.0.0",
@ -2715,7 +2699,6 @@
"strip-ansi": {
"version": "3.0.1",
"bundled": true,
"optional": true,
"requires": {
"ansi-regex": "^2.0.0"
}
@ -2754,13 +2737,11 @@
},
"wrappy": {
"version": "1.0.2",
"bundled": true,
"optional": true
"bundled": true
},
"yallist": {
"version": "3.0.3",
"bundled": true,
"optional": true
"bundled": true
}
}
},

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
[{"link":"/spip.php?type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0&pg=2&page=af&id=70","title":"AGENCE FRANCE LOCALE"}]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
[{"link":"/spip.php?type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0&pg=4&page=af&id=85","title":"Amundi"},{"link":"/spip.php?type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0&pg=4&page=af&id=40","title":"Amundi finance"},{"link":"/spip.php?type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0&pg=4&page=af&id=8943","title":"Andbank Monaco S.A.M."}]

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
[{"link":"/spip.php?type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0&pg=11&page=af&id=8972","title":"Bank Audi France"},{"link":"/spip.php?type=advanced&id_secteur=3&lang=en&denomination=&siren=&cib=&bic=&nom=&siren_agent=&num=&cat=01-TBR07&retrait=0&pg=11&page=af&id=8935","title":"Bank Julius Baer (Monaco) S.A.M."}]

View File

@ -54,6 +54,9 @@ const creditData = jsonfile.readFileSync('tests/data/fr/creditInstituteData.json
const breakingCr001 = fs.readFileSync('tests/data/fr/breaking_CI_001.html');
const breakingCr001Data = jsonfile.readFileSync('tests/data/fr/breaking_CI_001.json');
const failure = { 'fail':true };
const empty = {};
test('FRANCE:: Scrape Indexes', async t => {
const frScraper = new France();
t.test('Extract PI Search links', async t => {
@ -218,3 +221,62 @@ test('FRANCE Breaking CI 001', async t => {
t.end();
});
/**
 * Regression suite for DIR-3741.
 *
 * Runs `extractLinks` against three saved result pages (labelled 1/103,
 * 4/103 and 11/103 in the subtest titles) and checks both the number of
 * links returned and their exact content against the stored JSON fixtures.
 */
test('FRANCE:: DIR-3741', async t => {
  // One entry per saved page. Fixtures are read up front, HTML before JSON,
  // so a missing file fails before any subtest is registered.
  const cases = [
    {
      'title': 'FRANCE::Extract Details from Page 1/103',
      'html': fs.readFileSync('tests/data/fr/dir3741_001.html'),
      'expected': jsonfile.readFileSync('tests/data/fr/dir3741_001.json'),
      'count': 1,
      'countMsg': 'Scrapes the correct number of links (1)'
    },
    {
      'title': 'FRANCE::Extract Details from Page 4/103',
      'html': fs.readFileSync('tests/data/fr/dir3741_002.html'),
      'expected': jsonfile.readFileSync('tests/data/fr/dir3741_002.json'),
      'count': 3,
      'countMsg': 'Scrapes the correct number of links (3)'
    },
    {
      'title': 'FRANCE::Extract Details from Page 11/103',
      'html': fs.readFileSync('tests/data/fr/dir3741_003.html'),
      'expected': jsonfile.readFileSync('tests/data/fr/dir3741_003.json'),
      'count': 2,
      'countMsg': 'Scrapes the correct number of links (2)'
    }
  ];

  // A single scraper instance is shared by all subtests.
  const frScraper = new France();

  for (const { title, html, expected, count, countMsg } of cases) {
    t.test(title, async st => {
      const $ = cheerio.load(html);
      const rows = $('table.table tr');

      // Second argument `true` — presumably enables the credit-institute
      // status filter; confirm against extractLinks' signature.
      const found = await frScraper.extractLinks(rows, true);

      st.equal(found.length, count, countMsg);
      st.deepEquals(found, expected, 'Links match the data');

      st.end();
    });
  }

  t.end();
});