Updated euronews reader to get rid of 1's and adverts from news
This commit is contained in:
parent
5ff3fc6a34
commit
fcae12997d
1
.gitignore
vendored
1
.gitignore
vendored
@ -145,3 +145,4 @@ fabric.properties
|
|||||||
/src/bundle.js
|
/src/bundle.js
|
||||||
/src/bundle.js.map
|
/src/bundle.js.map
|
||||||
/live/
|
/live/
|
||||||
|
!/output/
|
||||||
|
32
package-lock.json
generated
32
package-lock.json
generated
@ -54,7 +54,7 @@
|
|||||||
},
|
},
|
||||||
"@sinonjs/formatio": {
|
"@sinonjs/formatio": {
|
||||||
"version": "2.0.0",
|
"version": "2.0.0",
|
||||||
"resolved": "http://registry.npmjs.org/@sinonjs/formatio/-/formatio-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/@sinonjs/formatio/-/formatio-2.0.0.tgz",
|
||||||
"integrity": "sha512-ls6CAMA6/5gG+O/IdsBcblvnd8qcO/l1TYoNeAzp3wcISOxlPXQEus0mLcdwazEkWjaBdaJ3TaxmNgCLWwvWzg==",
|
"integrity": "sha512-ls6CAMA6/5gG+O/IdsBcblvnd8qcO/l1TYoNeAzp3wcISOxlPXQEus0mLcdwazEkWjaBdaJ3TaxmNgCLWwvWzg==",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"requires": {
|
"requires": {
|
||||||
@ -1093,7 +1093,7 @@
|
|||||||
},
|
},
|
||||||
"browserify-rsa": {
|
"browserify-rsa": {
|
||||||
"version": "4.0.1",
|
"version": "4.0.1",
|
||||||
"resolved": "http://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz",
|
||||||
"integrity": "sha1-IeCr+vbyApzy+vsTNWenAdQTVSQ=",
|
"integrity": "sha1-IeCr+vbyApzy+vsTNWenAdQTVSQ=",
|
||||||
"requires": {
|
"requires": {
|
||||||
"bn.js": "^4.1.0",
|
"bn.js": "^4.1.0",
|
||||||
@ -4270,7 +4270,7 @@
|
|||||||
},
|
},
|
||||||
"readable-stream": {
|
"readable-stream": {
|
||||||
"version": "1.0.34",
|
"version": "1.0.34",
|
||||||
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz",
|
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz",
|
||||||
"integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=",
|
"integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"requires": {
|
"requires": {
|
||||||
@ -4282,13 +4282,13 @@
|
|||||||
},
|
},
|
||||||
"string_decoder": {
|
"string_decoder": {
|
||||||
"version": "0.10.31",
|
"version": "0.10.31",
|
||||||
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
|
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
|
||||||
"integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=",
|
"integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
"through2": {
|
"through2": {
|
||||||
"version": "0.6.5",
|
"version": "0.6.5",
|
||||||
"resolved": "http://registry.npmjs.org/through2/-/through2-0.6.5.tgz",
|
"resolved": "https://registry.npmjs.org/through2/-/through2-0.6.5.tgz",
|
||||||
"integrity": "sha1-QaucZ7KdVyCQcUEOHXp6lozTrUg=",
|
"integrity": "sha1-QaucZ7KdVyCQcUEOHXp6lozTrUg=",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"requires": {
|
"requires": {
|
||||||
@ -4393,7 +4393,7 @@
|
|||||||
},
|
},
|
||||||
"lodash": {
|
"lodash": {
|
||||||
"version": "1.0.2",
|
"version": "1.0.2",
|
||||||
"resolved": "http://registry.npmjs.org/lodash/-/lodash-1.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/lodash/-/lodash-1.0.2.tgz",
|
||||||
"integrity": "sha1-j1dWDIO1n8JwvT1WG2kAQ0MOJVE=",
|
"integrity": "sha1-j1dWDIO1n8JwvT1WG2kAQ0MOJVE=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
@ -4524,7 +4524,7 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"semver": {
|
"semver": {
|
||||||
"version": "4.3.6",
|
"version": "4.3.6",
|
||||||
"resolved": "http://registry.npmjs.org/semver/-/semver-4.3.6.tgz",
|
"resolved": "https://registry.npmjs.org/semver/-/semver-4.3.6.tgz",
|
||||||
"integrity": "sha1-MAvG4OhjdPe6YQaLWx7NV/xlMto=",
|
"integrity": "sha1-MAvG4OhjdPe6YQaLWx7NV/xlMto=",
|
||||||
"dev": true
|
"dev": true
|
||||||
}
|
}
|
||||||
@ -5395,7 +5395,7 @@
|
|||||||
},
|
},
|
||||||
"htmlescape": {
|
"htmlescape": {
|
||||||
"version": "1.1.1",
|
"version": "1.1.1",
|
||||||
"resolved": "http://registry.npmjs.org/htmlescape/-/htmlescape-1.1.1.tgz",
|
"resolved": "https://registry.npmjs.org/htmlescape/-/htmlescape-1.1.1.tgz",
|
||||||
"integrity": "sha1-OgPtwiFLyjtmQko+eVk0lQnLA1E="
|
"integrity": "sha1-OgPtwiFLyjtmQko+eVk0lQnLA1E="
|
||||||
},
|
},
|
||||||
"htmlparser2": {
|
"htmlparser2": {
|
||||||
@ -9143,7 +9143,7 @@
|
|||||||
},
|
},
|
||||||
"pretty-hrtime": {
|
"pretty-hrtime": {
|
||||||
"version": "1.0.3",
|
"version": "1.0.3",
|
||||||
"resolved": "http://registry.npmjs.org/pretty-hrtime/-/pretty-hrtime-1.0.3.tgz",
|
"resolved": "https://registry.npmjs.org/pretty-hrtime/-/pretty-hrtime-1.0.3.tgz",
|
||||||
"integrity": "sha1-t+PqQkNaTJsnWdmeDyAesZWALuE=",
|
"integrity": "sha1-t+PqQkNaTJsnWdmeDyAesZWALuE=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
@ -9977,7 +9977,7 @@
|
|||||||
},
|
},
|
||||||
"safe-regex": {
|
"safe-regex": {
|
||||||
"version": "1.1.0",
|
"version": "1.1.0",
|
||||||
"resolved": "http://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz",
|
"resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz",
|
||||||
"integrity": "sha1-QKNmnzsHfR6UPURinhV91IAjvy4=",
|
"integrity": "sha1-QKNmnzsHfR6UPURinhV91IAjvy4=",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"requires": {
|
"requires": {
|
||||||
@ -10147,7 +10147,7 @@
|
|||||||
},
|
},
|
||||||
"shasum": {
|
"shasum": {
|
||||||
"version": "1.0.2",
|
"version": "1.0.2",
|
||||||
"resolved": "http://registry.npmjs.org/shasum/-/shasum-1.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/shasum/-/shasum-1.0.2.tgz",
|
||||||
"integrity": "sha1-5wEjENj0F/TetXEhUOVni4euVl8=",
|
"integrity": "sha1-5wEjENj0F/TetXEhUOVni4euVl8=",
|
||||||
"requires": {
|
"requires": {
|
||||||
"json-stable-stringify": "~0.0.0",
|
"json-stable-stringify": "~0.0.0",
|
||||||
@ -10572,7 +10572,7 @@
|
|||||||
},
|
},
|
||||||
"stream-browserify": {
|
"stream-browserify": {
|
||||||
"version": "2.0.1",
|
"version": "2.0.1",
|
||||||
"resolved": "http://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz",
|
||||||
"integrity": "sha1-ZiZu5fm9uZQKTkUUyvtDu3Hlyds=",
|
"integrity": "sha1-ZiZu5fm9uZQKTkUUyvtDu3Hlyds=",
|
||||||
"requires": {
|
"requires": {
|
||||||
"inherits": "~2.0.1",
|
"inherits": "~2.0.1",
|
||||||
@ -11340,7 +11340,7 @@
|
|||||||
},
|
},
|
||||||
"unique-stream": {
|
"unique-stream": {
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"resolved": "http://registry.npmjs.org/unique-stream/-/unique-stream-1.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/unique-stream/-/unique-stream-1.0.0.tgz",
|
||||||
"integrity": "sha1-1ZpKdUJ0R9mqbJHnAmP40mpLEEs=",
|
"integrity": "sha1-1ZpKdUJ0R9mqbJHnAmP40mpLEEs=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
@ -11574,7 +11574,7 @@
|
|||||||
},
|
},
|
||||||
"readable-stream": {
|
"readable-stream": {
|
||||||
"version": "1.0.34",
|
"version": "1.0.34",
|
||||||
"resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz",
|
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz",
|
||||||
"integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=",
|
"integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"requires": {
|
"requires": {
|
||||||
@ -11586,13 +11586,13 @@
|
|||||||
},
|
},
|
||||||
"string_decoder": {
|
"string_decoder": {
|
||||||
"version": "0.10.31",
|
"version": "0.10.31",
|
||||||
"resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
|
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
|
||||||
"integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=",
|
"integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
"through2": {
|
"through2": {
|
||||||
"version": "0.6.5",
|
"version": "0.6.5",
|
||||||
"resolved": "http://registry.npmjs.org/through2/-/through2-0.6.5.tgz",
|
"resolved": "https://registry.npmjs.org/through2/-/through2-0.6.5.tgz",
|
||||||
"integrity": "sha1-QaucZ7KdVyCQcUEOHXp6lozTrUg=",
|
"integrity": "sha1-QaucZ7KdVyCQcUEOHXp6lozTrUg=",
|
||||||
"dev": true,
|
"dev": true,
|
||||||
"requires": {
|
"requires": {
|
||||||
|
@ -63,7 +63,7 @@
|
|||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"expect.js": "^0.3.1",
|
"expect.js": "^0.3.1",
|
||||||
"gulp": "^3.9.1",
|
"gulp": "3.9.1",
|
||||||
"gulp-google-webfonts": "0.0.14",
|
"gulp-google-webfonts": "0.0.14",
|
||||||
"gulp-rename": "^1.4.0",
|
"gulp-rename": "^1.4.0",
|
||||||
"gulp-sass": "^3.2.1",
|
"gulp-sass": "^3.2.1",
|
||||||
|
@ -20,6 +20,10 @@ const twitterClient = new Twitter({
|
|||||||
|
|
||||||
logger.level = 'debug';
|
logger.level = 'debug';
|
||||||
|
|
||||||
|
// google api key AIzaSyBl7O9LHIthCagcqIaDkQ4um_hghYG5reE
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
function nowTS() {
|
function nowTS() {
|
||||||
const now = new Date();
|
const now = new Date();
|
||||||
|
|
||||||
@ -322,7 +326,7 @@ function doGetMoreDetail(id) {
|
|||||||
.then((d) => {
|
.then((d) => {
|
||||||
logger.info('Final', d.name, d.id);
|
logger.info('Final', d.name, d.id);
|
||||||
jsonfile.writeFileSync(`output/${d.id}-doGetMoreDetail.json`, d);
|
jsonfile.writeFileSync(`output/${d.id}-doGetMoreDetail.json`, d);
|
||||||
|
|
||||||
return resolve(d);
|
return resolve(d);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -3,7 +3,7 @@ const fecha = require('fecha');
|
|||||||
const request = require('request');
|
const request = require('request');
|
||||||
const http = require('http');
|
const http = require('http');
|
||||||
|
|
||||||
const { reduceArticle } = require('./reducers/euronews');
|
const { reduceArticle, reduceArticleV2 } = require('./reducers/euronews');
|
||||||
|
|
||||||
const logger = require('log4js').getLogger('Euronews');
|
const logger = require('log4js').getLogger('Euronews');
|
||||||
logger.level = 'debug';
|
logger.level = 'debug';
|
||||||
@ -40,7 +40,7 @@ class Template {
|
|||||||
|
|
||||||
function doGetEuroNews() {
|
function doGetEuroNews() {
|
||||||
return new Promise((resolve, reject) => {
|
return new Promise((resolve, reject) => {
|
||||||
logger.info('Retrieving Euronews Headlines..');
|
logger.info('doGetEuroNews:Retrieving Euronews Headlines..');
|
||||||
|
|
||||||
// http://feeds.feedburner.com/euronews/en/home/
|
// http://feeds.feedburner.com/euronews/en/home/
|
||||||
// http://feeds.feedburner.com/euronews/en/news/
|
// http://feeds.feedburner.com/euronews/en/news/
|
||||||
@ -114,7 +114,7 @@ function doGetArticle(guid = '') {
|
|||||||
return reject(err);
|
return reject(err);
|
||||||
// Throw err;
|
// Throw err;
|
||||||
|
|
||||||
const output = reduceArticle(body);
|
const output = reduceArticleV2(body);
|
||||||
|
|
||||||
logger.debug(JSON.stringify(output));
|
logger.debug(JSON.stringify(output));
|
||||||
|
|
||||||
|
358
server/lib/readability.js
Normal file
358
server/lib/readability.js
Normal file
@ -0,0 +1,358 @@
|
|||||||
|
// Reader presentation settings. readStyle/readSize/readMargin are applied as
// class names by the bootstrap below.
var readabilityVersion = '2';
var readStyle = 'style-ebook';
var readSize = 'size-medium';
var readMargin = 'margin-wide';

/**
 * Bootstrap: strips the page's own scripts, extracts the article with
 * grabArticle() and replaces the whole body with a #readOverlay > #readInner
 * wrapper holding the result.
 *
 * Returns the overlay's innerHTML (the value is discarded when the file is
 * simply loaded as a script).
 */
(function() {
  // For totally hosed HTML the body node may be missing. Create it first:
  // everything below (and grabArticle) dereferences document.body.
  if (document.body == null) {
    document.body = document.createElement('body');
  }

  // Remove all existing scripts so they don't cause conflicts. The collection
  // is live, so walk it backwards; a forward walk skips the node that slides
  // into the slot just vacated.
  const docscripts = document.getElementsByTagName('script');

  for (let k = docscripts.length - 1; k >= 0; k--) {
    const script = docscripts[k];

    if (script.src != null && !script.src.match(/readability|[Cc]lippability/)) {
      script.parentNode.removeChild(script);
    }
  }

  // Load jQuery; it reaches scripts inside conditional comments that the plain
  // DOM walk above cannot. Handlers are attached before the element is
  // inserted, and https avoids mixed-content blocking on secure pages.
  const gjs = document.createElement('SCRIPT');

  gjs.type = 'text/javascript';
  gjs.src = 'https://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js';
  gjs.onload = gjs.onreadystatechange = function() {
    // onreadystatechange can fire before the library has been evaluated.
    if (typeof $ !== 'function') return;

    $('script').each(function() {
      if (!this.src.match(/readability|[Cc]lippability|jquery\.min\.js$/)) $(this).remove();
    });
  };
  document.getElementsByTagName('head')[0].appendChild(gjs);

  const objOverlay = document.createElement('div');
  const objinnerDiv = document.createElement('div');

  objOverlay.id = 'readOverlay';
  objinnerDiv.id = 'readInner';

  // Apply user-selected styling:
  document.body.className = readStyle;
  objOverlay.className = readStyle;
  objinnerDiv.className = `${readMargin } ${ readSize}`;

  objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner div
  objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay

  // Drop the original page content and insert the new content.
  document.body.innerHTML = '';
  document.body.insertBefore(objOverlay, document.body.firstChild);

  return document.body.firstChild.innerHTML;
})();
|
||||||
|
|
||||||
|
/**
 * Collect the descendants of `node` whose class list contains `classname`
 * as a whole, whitespace-separated token.
 *
 * Matching is done on class tokens rather than with a `\b...\b` regular
 * expression: the regex treated "-" as a word boundary (so "toolbar" also
 * matched "my-toolbar") and interpreted regex metacharacters in `classname`.
 *
 * @param {string} classname - Class token to look for.
 * @param {Object} [node] - Root to search below; defaults to the document's <body>.
 * @returns {Array} Static array of matching elements, in the order returned by
 *   node.getElementsByTagName('*').
 */
function getElementsByClassName(classname, node) {
  const root = node || document.getElementsByTagName('body')[0];
  const found = [];
  const candidates = root.getElementsByTagName('*');

  for (let i = 0, total = candidates.length; i < total; i++) {
    const classAttr = candidates[i].className;

    // className is not a plain string on every element (e.g. SVG); skip those.
    if (typeof classAttr === 'string' && classAttr.split(/\s+/).indexOf(classname) !== -1) {
      found.push(candidates[i]);
    }
  }

  return found;
}
|
||||||
|
|
||||||
|
/**
 * Find the element that most likely holds the article text, strip clutter
 * from it, and return it wrapped in a new DIV headed by an H1 with the page
 * title.
 *
 * Side effects on the live document: rewrites document.body.innerHTML
 * (doubled <br> become paragraph breaks, <font> tags are dropped), attaches a
 * `readability` score object to every paragraph parent, disables foreign
 * stylesheets, empties <style> tags (except on legacy IE), removes syntax
 * highlighter chrome, and installs docOnKeyup as document.onkeyup.
 *
 * @returns {Element} A detached DIV containing the title and the cleaned content.
 */
function grabArticle() {
  const allParagraphs = document.getElementsByTagName('p');
  const articleContent = document.createElement('DIV');
  const articleTitle = document.createElement('H1');

  const negativeHint = /(comment|meta|footer|footnote)/;
  // Written as regex literals with a single backslash: "\\s" inside a literal
  // matches a backslash followed by "s", which never occurs in a class name.
  const positiveClass = /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/;
  const positiveId = /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/;

  // Replace all doubled-up <BR> tags with <P> tags, and remove fonts.
  // (A regex literal is used because "\s" in a string literal is just "s".)
  document.body.innerHTML = document.body.innerHTML
    .replace(/<br\/?>\s*<br\/?>/g, '</p><p>')
    .replace(/<\/?font[^>]*>/g, '');

  // Use the <title> text as the heading. It is inserted as a text node so
  // markup characters in the title are not parsed as HTML.
  articleTitle.appendChild(document.createTextNode(document.title));
  articleContent.appendChild(articleTitle);

  // Score every paragraph's parent. A score is determined by things like:
  // number of <p>'s, commas, special class names and ids.
  for (let j = 0; j < allParagraphs.length; j++) {
    const paragraph = allParagraphs[j];
    const container = paragraph.parentNode;

    // Initialize readability data the first time a container is seen.
    if (typeof container.readability === 'undefined') {
      container.readability = { 'contentScore': 0 };

      // Look for a special classname
      if (container.className.match(negativeHint)) {
        container.readability.contentScore -= 50;
      }
      else if (container.className.match(positiveClass)) {
        container.readability.contentScore += 25;
      }

      // Look for a special ID
      if (container.id.match(negativeHint)) {
        container.readability.contentScore -= 50;
      }
      else if (container.id.match(positiveId)) {
        container.readability.contentScore += 25;
      }
    }

    // Add a point for the paragraph itself when it has some real text.
    if (getInnerText(paragraph).length > 10) {
      container.readability.contentScore++;
    }

    // Add points for the commas within this paragraph.
    container.readability.contentScore += getCharCount(paragraph);
  }

  // Pick the highest-scoring container; the first one in document order wins ties.
  let topDiv = null;
  const everything = document.getElementsByTagName('*');

  for (let idx = 0; idx < everything.length; idx++) {
    const node = everything[idx];

    if (typeof node.readability !== 'undefined' && (topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)) {
      topDiv = node;
    }
  }

  if (topDiv == null) {
    topDiv = document.createElement('div');
    topDiv.innerHTML = 'Sorry, clippable was unable to parse this page for content. If you feel like it should have been able to, please <a href="http://brettterpstra.com/contact">let us know.</a>';
  }

  // Disable every stylesheet that is not readability's own.
  for (let k = 0; k < document.styleSheets.length; k++) {
    const sheet = document.styleSheets[k];

    if (sheet.href != null && sheet.href.lastIndexOf('readability') == -1) {
      sheet.disabled = true;
    }
  }

  // Strip toolbars and line-number gutters from syntax-highlighted code.
  const sh = getElementsByClassName('syntaxhighlighter');

  for (let i = 0; i < sh.length; i++) {
    const bar = getElementsByClassName('toolbar', sh[i]);

    for (let bn = 0; bn < bar.length; bn++) {
      bar[bn].parentNode.removeChild(bar[bn]);
    }

    const numbers = getElementsByClassName('number', sh[i]);

    for (let num = 0; num < numbers.length; num++) {
      numbers[num].parentNode.removeChild(numbers[num]);
    }
  }

  const dp = getElementsByClassName('dp-highlighter');

  for (let d = 0; d < dp.length; d++) {
    dp[d].parentNode.removeChild(dp[d]);
  }

  const sth = getElementsByClassName('standardLighter');

  for (let d = 0; d < sth.length; d++) {
    sth[d].parentNode.removeChild(sth[d]);
  }

  // Empty all style tags (not doing this on IE).
  if (navigator.appName != 'Microsoft Internet Explorer') {
    const styleTags = document.getElementsByTagName('style');

    for (let l = 0; l < styleTags.length; l++) {
      styleTags[l].textContent = '';
    }
  }

  topDiv = killCodeSpans(topDiv); // removes span tags
  cleanStyles(topDiv); // removes all style attributes
  topDiv = killDivs(topDiv); // removes DIVs that have more non-<p> stuff than <p> stuff
  topDiv = killBreaks(topDiv); // collapses consecutive <br />'s into just one <br />

  // Clean out junk from the topDiv just in case:
  topDiv = clean(topDiv, 'form');
  topDiv = clean(topDiv, 'table', 8);
  topDiv = clean(topDiv, 'h1');
  topDiv = clean(topDiv, 'iframe');

  articleContent.appendChild(topDiv);
  document.onkeyup = docOnKeyup;

  return articleContent;
}
|
||||||
|
|
||||||
|
/**
 * Keyboard handler for the reader overlay.
 *
 *   Escape (27)      - reload the page, leaving reader mode
 *   Left arrow (37)  - dark theme
 *   Right arrow (39) - light theme
 *   Delete (46)      - grey theme
 *
 * Any other key is ignored; nothing is written to the overlay's styles.
 *
 * @param {Object} [ev] - Key event; when absent, window.event is used (legacy IE).
 */
function docOnKeyup(ev) {
  const keyEvent = ev || (typeof window !== 'undefined' ? window.event : undefined);

  if (!keyEvent) return;

  const keyID = keyEvent.keyCode;

  if (keyID === 27) { // escape
    document.location.reload(true);

    return;
  }

  const themes = {
    37: { 'bgcolor': '#222', 'fgcolor': '#F3EFCE', 'acolor': '#A19F89' }, // left arrow
    39: { 'bgcolor': '#fff', 'fgcolor': '#333', 'acolor': '#276F78' }, // right arrow
    46: { 'bgcolor': '#eee', 'fgcolor': '#333', 'acolor': 'blue' } // delete ('#blue' is not a valid colour)
  };
  const theme = themes[keyID];

  if (!theme) return;

  const overlay = document.getElementById('readOverlay');

  // Nothing to restyle if the overlay has not been inserted.
  if (!overlay) return;

  overlay.style.backgroundColor = theme.bgcolor;
  overlay.style.color = theme.fgcolor;

  const alinks = overlay.getElementsByTagName('a');

  for (let lc = 0; lc < alinks.length; lc++) {
    alinks[lc].style.color = theme.acolor;
  }
}
|
||||||
|
|
||||||
|
/**
 * Get the text of a node, cross-browser.
 *
 * Feature-detects instead of sniffing navigator.appName: textContent is used
 * when the node has it, innerText otherwise. Always returns a string so
 * callers can use `.length` / `.split()` without checking.
 *
 * @param {Object} e - Node to read.
 * @returns {string} The node's text, or '' when neither property is a string.
 */
function getInnerText(e) {
  if (typeof e.textContent === 'string') return e.textContent;

  if (typeof e.innerText === 'string') return e.innerText;

  return '';
}
|
||||||
|
|
||||||
|
/**
 * Count the pieces a node's text splits into around a separator.
 *
 * Note that this is the number of segments, i.e. occurrences of the
 * separator plus one; text without any separator yields 1.
 *
 * @param {Object} e - Node whose text is measured (read via getInnerText).
 * @param {string} [s] - Separator; any falsy value falls back to ','.
 * @returns {number} Number of segments.
 */
function getCharCount ( e, s ) {
  const separator = s || ',';
  const pieces = getInnerText(e).split(separator);

  return pieces.length;
}
|
||||||
|
|
||||||
|
/**
 * Remove the inline `style` attribute from a node and, recursively, from
 * every element below it.
 *
 * @param {Object} [e] - Root node; defaults to `document`.
 */
function cleanStyles( e ) {
  const root = e || document;

  // Remove any root styles, if we're able (the document itself has no attributes).
  if (typeof root.removeAttribute === 'function') {
    root.removeAttribute('style');
  }

  // Walk the children; only element nodes (nodeType 1) carry attributes.
  // The recursive call strips the child's own style as its "root" step.
  for (let cur = root.firstChild; cur != null; cur = cur.nextSibling) {
    if (cur.nodeType === 1) {
      cleanStyles(cur);
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Remove DIVs below `e` that look like clutter rather than article text.
 *
 * A div is removed when either:
 *  1. its text has few commas (getCharCount < 10), it looks link/image/list
 *     heavy or comment/share related, it holds no pre/code/embed/object/
 *     sphereit content, and it has at least one paragraph and exactly one image; or
 *  2. its id or class matches a stop word (comment, share, footer, or a
 *     leading "ad").
 *
 * The div is captured once per iteration and the iteration ends as soon as
 * it is removed. `divsList` is live, so re-indexing it after a removal would
 * address a different div, or nothing at all.
 *
 * @param {Object} e - Root element to clean in place.
 * @returns {Object} The same root, for chaining.
 */
function killDivs ( e ) {
  const divsList = e.getElementsByTagName( 'div' );
  // NOTE(review): /^ad/ also matches ids/classes such as "address" - confirm this is intended.
  const stopwords = [/comment/, /share/, /footer/, /^ad/];
  const countOf = function(div, tag) {
    return div.getElementsByTagName(tag).length;
  };

  // Traverse backwards so nodes can be removed without affecting the traversal.
  for (let i = divsList.length - 1; i >= 0; i--) {
    const div = divsList[i];
    const divId = div.id;
    const divClass = div.className;

    // If the number of commas is less than 10 (bad sign) ...
    if (getCharCount(div) < 10) {
      const p = countOf(div, 'p');
      const img = countOf(div, 'img');
      const li = countOf(div, 'li');
      const a = countOf(div, 'a');

      // ... and non-paragraph elements outnumber paragraphs, or other ominous signs:
      const looksLikeClutter = img > p || li > p || a > p || p === 0 ||
        divId.indexOf('comment') !== -1 || divClass.indexOf('comment') !== -1 ||
        divId.indexOf('share') !== -1 || divClass.indexOf('share') !== -1;
      const holdsRichContent = countOf(div, 'pre') > 0 || countOf(div, 'code') > 0 ||
        countOf(div, 'embed') > 0 || countOf(div, 'object') > 0 ||
        div.innerHTML.indexOf('<!-- sphereit') !== -1;

      // NOTE(review): the extra "at least one <p> and exactly one <img>" condition
      // is kept from the original (`!p == 0 && img == 1`) - confirm it is intended.
      if (looksLikeClutter && !holdsRichContent && p > 0 && img === 1) {
        div.parentNode.removeChild(div);
        continue;
      }
    }

    const hitsStopword = stopwords.some(function(word) {
      return word.test(divId) || word.test(divClass);
    });

    if (hitsStopword) {
      div.parentNode.removeChild(div);
    }
  }

  return e;
}
|
||||||
|
|
||||||
|
/**
 * Collapse every run of <br> tags in `e` into a single `<br />`.
 *
 * Breaks count as one run when they are separated only by whitespace or by
 * `&nbsp;` entities; innerHTML serializes a non-breaking space as `&nbsp;`,
 * which `\s` alone does not match. A lone <br> is normalized to `<br />` too.
 *
 * @param {Object} e - Element whose innerHTML is rewritten in place.
 * @returns {Object} The same element, for chaining.
 */
function killBreaks ( e ) {
  e.innerHTML = e.innerHTML.replace(/(?:<br\s*\/?>(?:\s|&nbsp;)*)+/g, '<br />');

  return e;
}
|
||||||
|
|
||||||
|
/**
 * Strip all <span> and </span> tags from `e`, keeping the content they wrapped.
 *
 * The tag name must be followed by whitespace, "/" or ">", so elements whose
 * name merely starts with "span" are left alone. Matching is case-insensitive.
 *
 * @param {Object} e - Element whose innerHTML is rewritten in place.
 * @returns {Object} The same element, for chaining.
 */
function killCodeSpans ( e ) {
  e.innerHTML = e.innerHTML.replace(/<\/?\s?span(?=[\s\/>])[^>]*>/gi, '');

  return e;
}
|
||||||
|
|
||||||
|
/**
 * Remove `tags` elements below `e` that carry too little content.
 *
 * - For 'table': a table is removed when it has fewer than `minWords` <td> cells.
 * - For any other tag: an element is removed when its text splits into fewer
 *   than `minWords` space-separated pieces, unless it is a <pre>.
 *
 * With `minWords` omitted (or 0) the threshold is 1000000, i.e. effectively
 * every matching element is removed.
 *
 * The element list is live, so it is walked backwards; a forward walk skips
 * the element that follows each removal.
 *
 * @param {Object} e - Root element to clean in place.
 * @param {string} tags - Tag name to look for.
 * @param {number} [minWords] - Minimum cell / word count needed to keep an element.
 * @returns {Object} The same root, for chaining.
 */
function clean(e, tags, minWords) {
  const threshold = minWords || 1000000;
  const targetList = e.getElementsByTagName( tags );
  const countCells = tags == 'table';

  for (let y = targetList.length - 1; y >= 0; y--) {
    const target = targetList[y];
    let disposable;

    if (countCells) {
      disposable = target.getElementsByTagName('td').length < threshold;
    }
    else {
      // tagName is upper-case in HTML documents, so compare case-insensitively.
      disposable = getCharCount(target, ' ') < threshold && String(target.tagName).toLowerCase() !== 'pre';
    }

    if (disposable) {
      target.parentNode.removeChild(target);
    }
  }

  return e;
}
|
||||||
|
|
||||||
|
/**
 * Replace every `tagId` element in the document with a bold <span> holding
 * the element's former children.
 *
 * Both the element list and each element's childNodes are live collections
 * that shrink as nodes are moved, so they are copied into arrays before being
 * modified; iterating them directly skips every other node.
 *
 * NOTE(review): the search is document-wide and ignores `e`, as in the
 * original - confirm whether it should be scoped to `e`.
 *
 * @param {Object} e - Returned unchanged, for chaining.
 * @param {string} tagId - Tag name of the elements to replace.
 * @returns {Object} `e`.
 */
function convert(e, tagId) {
  const toArray = function(list) {
    return Array.prototype.slice.call(list);
  };

  toArray(document.getElementsByTagName(tagId)).forEach(function(elem) {
    const newNode = document.createElement('span');

    newNode.setAttribute('style', 'font-weight:bold');

    toArray(elem.childNodes).forEach(function(child) {
      newNode.appendChild(child);
    });

    elem.parentNode.replaceChild(newNode, elem);
  });

  return e;
}
|
@ -50,6 +50,8 @@ function reduceArticle(body = '') {
|
|||||||
// const outputHTML = html.join('').replace(htmlTidy, '');
|
// const outputHTML = html.join('').replace(htmlTidy, '');
|
||||||
const outputHTML = html.join('');
|
const outputHTML = html.join('');
|
||||||
|
|
||||||
|
console.log(outputHTML);
|
||||||
|
|
||||||
obj.title = title;
|
obj.title = title;
|
||||||
obj.image = image;
|
obj.image = image;
|
||||||
obj.html = outputHTML;
|
obj.html = outputHTML;
|
||||||
@ -59,5 +61,61 @@ function reduceArticle(body = '') {
|
|||||||
return obj;
|
return obj;
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = { reduceArticle };
|
/**
 * Reduce a Euronews AMP article page to the parts the reader needs.
 *
 * Drops <amp-ad> elements, then serializes each direct child of every
 * `div.article__content`:
 *  - all <amp-img> tags become plain <img> tags (closing tags are dropped);
 *  - <amp-twitter> children are rebuilt with fixed dimensions and their tweet id;
 *  - the first absolute http(s) `src` in a child is routed through the
 *    image.silvrtree.co.uk resizing proxy.
 *
 * @param {string} [body=''] - Raw HTML of the article page.
 * @returns {Object} `{}` for an empty body, otherwise `{ title, image, html }`
 *   where title/image come from the og:title / og:image meta tags.
 */
function reduceArticleV2(body = '') {
  if (body === '') return {};

  const obj = {};
  const $ = cheerio.load(body);

  $('amp-ad').remove();

  const title = $('meta[property="og:title"]').attr('content');
  const image = `https://image.silvrtree.co.uk/640,fit,q80/${ $('meta[property="og:image"]').attr('content')}`;

  const html = [];
  const content = $('div.article__content');

  for (let top = 0, topLen = content.length; top < topLen; top++) {
    const children = $(content[top]).children();

    for (let index = 0, len = children.length; index < len; index++) {
      const child = children[index];
      let line;

      if (child.name === 'amp-twitter') {
        const tweetid = $(child).data('tweetid');

        line = `<amp-twitter width="375"
height="472"
layout="responsive" data-tweetid="${tweetid}" > </amp-twitter>`;
      }
      else {
        // Rewrite every amp-img tag, not just the first occurrence of the text
        // "amp-img"; a plain string pattern replaces only the first match and
        // left later images and the closing tag untouched.
        line = $.html($(child))
          .replace(/<amp-img(?=[\s\/>])/g, '<img')
          .replace(/<\/amp-img\s*>/g, '');

        const symbol = /src=(['"])(http[s]?:\/\/)/.exec(line) || [];

        // Prefix the first absolute src with the resizing proxy, keeping the
        // original quote character (symbol[1]) and scheme (symbol[2]).
        if (symbol.length !== 0) {
          line = line.replace(/src=['"]http[s]?:\/\//, `src=${symbol[1]}https://image.silvrtree.co.uk/640,fit,q80/${symbol[2]}`);
        }
      }

      html.push(line);
    }
  }

  html.push('<div class="endbumper"></div>');

  obj.title = title;
  obj.image = image;
  obj.html = html.join('');

  logger.debug(JSON.stringify(obj));

  return obj;
}
|
||||||
|
|
||||||
|
module.exports = { reduceArticle, reduceArticleV2 };
|
||||||
|
|
||||||
|
@ -4,6 +4,9 @@ const { get, isEmpty } = require('lodash');
|
|||||||
|
|
||||||
// Bearer YlF_b6D149xr_xnrrYudlSnpn1A53b67vALlIK2HnD0ymBXQocRvPW3KjGN8jZNw0KnyAqxGaOzU7CLVPr84_KbnTxutNRXFVR9axmRqGN6ccda1xahoZo58KC2GWnYx'
|
// Bearer YlF_b6D149xr_xnrrYudlSnpn1A53b67vALlIK2HnD0ymBXQocRvPW3KjGN8jZNw0KnyAqxGaOzU7CLVPr84_KbnTxutNRXFVR9axmRqGN6ccda1xahoZo58KC2GWnYx'
|
||||||
|
|
||||||
|
|
||||||
|
// google api key AIzaSyBl7O9LHIthCagcqIaDkQ4um_hghYG5reE
|
||||||
|
|
||||||
logger.level = 'debug';
|
logger.level = 'debug';
|
||||||
|
|
||||||
function reduceExplore(data) {
|
function reduceExplore(data) {
|
||||||
|
@ -52,14 +52,16 @@ const NewsModel = Backbone.Model.extend({
|
|||||||
}, 'initialize': function() {
|
}, 'initialize': function() {
|
||||||
this.newsCollection = newsCollection;
|
this.newsCollection = newsCollection;
|
||||||
this.listenTo(this, 'change:update', this.onChange);
|
this.listenTo(this, 'change:update', this.onChange);
|
||||||
|
this.getNews = _.throttle(this.getNewsReal, 6000);
|
||||||
|
|
||||||
this.getNews();
|
this.getNews();
|
||||||
},
|
},
|
||||||
'onChange': function() {
|
'onChange': function() {
|
||||||
this.getNews();
|
this.getNews();
|
||||||
},
|
},
|
||||||
'getNews': function() {
|
'getNewsReal': function() {
|
||||||
// const ll = this.get('llShort');
|
// const ll = this.get('llShort');
|
||||||
console.info('>> News:request');
|
console.info('>> News:getNewsReal');
|
||||||
request({
|
request({
|
||||||
'url': `${window.loc}/news`,
|
'url': `${window.loc}/news`,
|
||||||
'method': 'GET', 'qs': {
|
'method': 'GET', 'qs': {
|
||||||
|
@ -57,7 +57,7 @@ const NewsListModel = Backbone.Model.extend({
|
|||||||
this.getNews();
|
this.getNews();
|
||||||
},
|
},
|
||||||
'getNews': function() {
|
'getNews': function() {
|
||||||
console.info('>> NewsList:request');
|
console.info('>> NewsList:getNews');
|
||||||
request({
|
request({
|
||||||
'url': `${window.loc}/news`,
|
'url': `${window.loc}/news`,
|
||||||
'method': 'GET', 'qs': {
|
'method': 'GET', 'qs': {
|
||||||
|
@ -102,9 +102,14 @@ const reduceEuronews = function(item) {
|
|||||||
obj.pubdate = fecha.format(pubdateSrc, 'dddd MMMM Do, YYYY HH:mm');
|
obj.pubdate = fecha.format(pubdateSrc, 'dddd MMMM Do, YYYY HH:mm');
|
||||||
obj.description = item.description.replace(/(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/g, '');
|
obj.description = item.description.replace(/(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/g, '');
|
||||||
|
|
||||||
|
if (obj.description === '1')
|
||||||
|
obj.description = '';
|
||||||
|
|
||||||
obj.guid = encodeURI(item.guid.text);
|
obj.guid = encodeURI(item.guid.text);
|
||||||
obj.title = item.title;
|
obj.title = item.title;
|
||||||
|
|
||||||
|
console.log(obj);
|
||||||
|
|
||||||
return obj;
|
return obj;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -158,7 +158,6 @@ const templates = {
|
|||||||
|
|
||||||
module.exports = templates;
|
module.exports = templates;
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
<div data-id="4b926c3af964a52068f833e3" class="itemRow mui--align-middle">
|
<div data-id="4b926c3af964a52068f833e3" class="itemRow mui--align-middle">
|
||||||
<img class="mui--align-middle" src="https://ss3.4sqi.net/img/categories_v2/nightlife/pub_32.png" width="32px" height="32px" style="-webkit-filter: invert(100%);">
|
<img class="mui--align-middle" src="https://ss3.4sqi.net/img/categories_v2/nightlife/pub_32.png" width="32px" height="32px" style="-webkit-filter: invert(100%);">
|
||||||
|
@ -7,11 +7,11 @@ const jsonfile = require('jsonfile');
|
|||||||
|
|
||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
|
|
||||||
const { reduceArticle } = require('../server/reducers/euronews');
|
const { reduceArticle,reduceArticleV2 } = require('../server/reducers/euronews');
|
||||||
|
|
||||||
test('Euronews', async t => {
|
test('Euronews', async t => {
|
||||||
t.test('Reduce a page', async t => {
|
t.skip('Reduce a page', async t => {
|
||||||
const psDetail = fs.readFileSync('test/data/euronews/en002.html');
|
const psDetail = fs.readFileSync('./data/euronews/amp-20200114.html');
|
||||||
const expectedJSON = { 'bob':false }; // jsonfile.readFileSync('tests/data/cz/ps001.json');
|
const expectedJSON = { 'bob':false }; // jsonfile.readFileSync('tests/data/cz/ps001.json');
|
||||||
|
|
||||||
|
|
||||||
@ -22,5 +22,19 @@ test('Euronews', async t => {
|
|||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
t.test('Reduce a page V2', async t => {
|
||||||
|
const psDetail = fs.readFileSync('./data/euronews/amp-20200114.html');
|
||||||
|
const expectedJSON = { 'bob':false }; // jsonfile.readFileSync('tests/data/cz/ps001.json');
|
||||||
|
|
||||||
|
|
||||||
|
const output = await reduceArticleV2(psDetail);
|
||||||
|
|
||||||
|
t.deepEquals(output, expectedJSON, 'Extracted Details from Page');
|
||||||
|
|
||||||
|
t.end();
|
||||||
|
});
|
||||||
|
|
||||||
|
|
||||||
t.end();
|
t.end();
|
||||||
});
|
});
|
||||||
|
0
test/euronewsrss.spec.js
Normal file
0
test/euronewsrss.spec.js
Normal file
@ -522,7 +522,23 @@ const goodMergeResult = {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const goodExplore = { 'latitude': 56.39449115920086, 'longitude': -5.1161516920810755, 'name':'Cruachan Power Station', 'category':'Building', 'icon':'https://ss3.4sqi.net/img/categories_v2/building/default_64.png', 'id':'4c233b549a67a5937ea7dd87', 'provider':'foursquare', 'address':'A85, Dalmally, PA33 1AN, United Kingdom', 'city':'Dalmally', 'state':'', 'postcode':'PA33 1AN', 'twitter':'', 'facebook':'', 'url':'http://www.visitcruachan.co.uk' };
|
const goodExplore = {
|
||||||
|
'address': 'A85, Dalmally, PA33 1AN, United Kingdom',
|
||||||
|
'category': 'Building',
|
||||||
|
'city': 'Dalmally',
|
||||||
|
'description': '',
|
||||||
|
'facebook': '',
|
||||||
|
'icon': 'https://ss3.4sqi.net/img/categories_v2/building/default_64.png',
|
||||||
|
'id': '4c233b549a67a5937ea7dd87',
|
||||||
|
'latitude': 56.39449115920086,
|
||||||
|
'longitude': -5.1161516920810755,
|
||||||
|
'name': 'Cruachan Power Station',
|
||||||
|
'postcode': 'PA33 1AN',
|
||||||
|
'provider': 'foursquare',
|
||||||
|
'state': '',
|
||||||
|
'twitter': '',
|
||||||
|
'url': ''
|
||||||
|
};
|
||||||
|
|
||||||
const goodFS = {
|
const goodFS = {
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user