From fcae12997d963a7cd1ebb5bd75ecbbed9d138a24 Mon Sep 17 00:00:00 2001 From: Martin Donnelly Date: Mon, 20 Jan 2020 19:06:36 +0000 Subject: [PATCH] Updated euronews reader to get rid of 1's and adverts from news --- .gitignore | 1 + package-lock.json | 32 ++-- package.json | 2 +- server/RightByMe.js | 6 +- server/euronews.js | 6 +- server/lib/readability.js | 358 +++++++++++++++++++++++++++++++++++ server/reducers/euronews.js | 60 +++++- server/reducers/rightbyme.js | 3 + src/v1/js/News.js | 6 +- src/v1/js/NewsList.js | 2 +- src/v1/js/libs/reducers.js | 5 + src/v1/js/libs/templates.js | 1 - test/euronews.scrape.js | 20 +- test/euronewsrss.spec.js | 0 test/rightbyme.spec.js | 18 +- 15 files changed, 490 insertions(+), 30 deletions(-) create mode 100644 server/lib/readability.js create mode 100644 test/euronewsrss.spec.js diff --git a/.gitignore b/.gitignore index a02038c..c8f8389 100644 --- a/.gitignore +++ b/.gitignore @@ -145,3 +145,4 @@ fabric.properties /src/bundle.js /src/bundle.js.map /live/ +!/output/ diff --git a/package-lock.json b/package-lock.json index 3da7161..fcca00f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -54,7 +54,7 @@ }, "@sinonjs/formatio": { "version": "2.0.0", - "resolved": "http://registry.npmjs.org/@sinonjs/formatio/-/formatio-2.0.0.tgz", + "resolved": "https://registry.npmjs.org/@sinonjs/formatio/-/formatio-2.0.0.tgz", "integrity": "sha512-ls6CAMA6/5gG+O/IdsBcblvnd8qcO/l1TYoNeAzp3wcISOxlPXQEus0mLcdwazEkWjaBdaJ3TaxmNgCLWwvWzg==", "dev": true, "requires": { @@ -1093,7 +1093,7 @@ }, "browserify-rsa": { "version": "4.0.1", - "resolved": "http://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", + "resolved": "https://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", "integrity": "sha1-IeCr+vbyApzy+vsTNWenAdQTVSQ=", "requires": { "bn.js": "^4.1.0", @@ -4270,7 +4270,7 @@ }, "readable-stream": { "version": "1.0.34", - "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", + 
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", "dev": true, "requires": { @@ -4282,13 +4282,13 @@ }, "string_decoder": { "version": "0.10.31", - "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", "dev": true }, "through2": { "version": "0.6.5", - "resolved": "http://registry.npmjs.org/through2/-/through2-0.6.5.tgz", + "resolved": "https://registry.npmjs.org/through2/-/through2-0.6.5.tgz", "integrity": "sha1-QaucZ7KdVyCQcUEOHXp6lozTrUg=", "dev": true, "requires": { @@ -4393,7 +4393,7 @@ }, "lodash": { "version": "1.0.2", - "resolved": "http://registry.npmjs.org/lodash/-/lodash-1.0.2.tgz", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-1.0.2.tgz", "integrity": "sha1-j1dWDIO1n8JwvT1WG2kAQ0MOJVE=", "dev": true }, @@ -4524,7 +4524,7 @@ "dependencies": { "semver": { "version": "4.3.6", - "resolved": "http://registry.npmjs.org/semver/-/semver-4.3.6.tgz", + "resolved": "https://registry.npmjs.org/semver/-/semver-4.3.6.tgz", "integrity": "sha1-MAvG4OhjdPe6YQaLWx7NV/xlMto=", "dev": true } @@ -5395,7 +5395,7 @@ }, "htmlescape": { "version": "1.1.1", - "resolved": "http://registry.npmjs.org/htmlescape/-/htmlescape-1.1.1.tgz", + "resolved": "https://registry.npmjs.org/htmlescape/-/htmlescape-1.1.1.tgz", "integrity": "sha1-OgPtwiFLyjtmQko+eVk0lQnLA1E=" }, "htmlparser2": { @@ -9143,7 +9143,7 @@ }, "pretty-hrtime": { "version": "1.0.3", - "resolved": "http://registry.npmjs.org/pretty-hrtime/-/pretty-hrtime-1.0.3.tgz", + "resolved": "https://registry.npmjs.org/pretty-hrtime/-/pretty-hrtime-1.0.3.tgz", "integrity": "sha1-t+PqQkNaTJsnWdmeDyAesZWALuE=", "dev": true }, @@ -9977,7 +9977,7 @@ }, "safe-regex": { "version": "1.1.0", - "resolved": "http://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", + "resolved": 
"https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", "integrity": "sha1-QKNmnzsHfR6UPURinhV91IAjvy4=", "dev": true, "requires": { @@ -10147,7 +10147,7 @@ }, "shasum": { "version": "1.0.2", - "resolved": "http://registry.npmjs.org/shasum/-/shasum-1.0.2.tgz", + "resolved": "https://registry.npmjs.org/shasum/-/shasum-1.0.2.tgz", "integrity": "sha1-5wEjENj0F/TetXEhUOVni4euVl8=", "requires": { "json-stable-stringify": "~0.0.0", @@ -10572,7 +10572,7 @@ }, "stream-browserify": { "version": "2.0.1", - "resolved": "http://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz", + "resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz", "integrity": "sha1-ZiZu5fm9uZQKTkUUyvtDu3Hlyds=", "requires": { "inherits": "~2.0.1", @@ -11340,7 +11340,7 @@ }, "unique-stream": { "version": "1.0.0", - "resolved": "http://registry.npmjs.org/unique-stream/-/unique-stream-1.0.0.tgz", + "resolved": "https://registry.npmjs.org/unique-stream/-/unique-stream-1.0.0.tgz", "integrity": "sha1-1ZpKdUJ0R9mqbJHnAmP40mpLEEs=", "dev": true }, @@ -11574,7 +11574,7 @@ }, "readable-stream": { "version": "1.0.34", - "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz", "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=", "dev": true, "requires": { @@ -11586,13 +11586,13 @@ }, "string_decoder": { "version": "0.10.31", - "resolved": "http://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=", "dev": true }, "through2": { "version": "0.6.5", - "resolved": "http://registry.npmjs.org/through2/-/through2-0.6.5.tgz", + "resolved": "https://registry.npmjs.org/through2/-/through2-0.6.5.tgz", "integrity": "sha1-QaucZ7KdVyCQcUEOHXp6lozTrUg=", "dev": true, "requires": { diff --git a/package.json 
b/package.json index 7cf4a87..60b0f41 100644 --- a/package.json +++ b/package.json @@ -63,7 +63,7 @@ }, "devDependencies": { "expect.js": "^0.3.1", - "gulp": "^3.9.1", + "gulp": "3.9.1", "gulp-google-webfonts": "0.0.14", "gulp-rename": "^1.4.0", "gulp-sass": "^3.2.1", diff --git a/server/RightByMe.js b/server/RightByMe.js index 037ca58..84c75a9 100644 --- a/server/RightByMe.js +++ b/server/RightByMe.js @@ -20,6 +20,10 @@ const twitterClient = new Twitter({ logger.level = 'debug'; +// google api key AIzaSyBl7O9LHIthCagcqIaDkQ4um_hghYG5reE + + + function nowTS() { const now = new Date(); @@ -322,7 +326,7 @@ function doGetMoreDetail(id) { .then((d) => { logger.info('Final', d.name, d.id); jsonfile.writeFileSync(`output/${d.id}-doGetMoreDetail.json`, d); - + return resolve(d); }); }); diff --git a/server/euronews.js b/server/euronews.js index 4b89439..975d0dd 100644 --- a/server/euronews.js +++ b/server/euronews.js @@ -3,7 +3,7 @@ const fecha = require('fecha'); const request = require('request'); const http = require('http'); -const { reduceArticle } = require('./reducers/euronews'); +const { reduceArticle, reduceArticleV2 } = require('./reducers/euronews'); const logger = require('log4js').getLogger('Euronews'); logger.level = 'debug'; @@ -40,7 +40,7 @@ class Template { function doGetEuroNews() { return new Promise((resolve, reject) => { - logger.info('Retrieving Euronews Headlines..'); + logger.info('doGetEuroNews:Retrieving Euronews Headlines..'); // http://feeds.feedburner.com/euronews/en/home/ // http://feeds.feedburner.com/euronews/en/news/ @@ -114,7 +114,7 @@ function doGetArticle(guid = '') { return reject(err); // Throw err; - const output = reduceArticle(body); + const output = reduceArticleV2(body); logger.debug(JSON.stringify(output)); diff --git a/server/lib/readability.js b/server/lib/readability.js new file mode 100644 index 0000000..375d22b --- /dev/null +++ b/server/lib/readability.js @@ -0,0 +1,358 @@ +var readabilityVersion = '2'; +var readStyle = 
'style-ebook'; +var readSize = 'size-medium'; +var readMargin = 'margin-wide'; +(function() { + // removing all existing scripts so they don't cause conflicts... + var docscripts = document.getElementsByTagName('script'); + for (k = 0;k < docscripts.length; k++) + if (docscripts[k].src != null && ! docscripts[k].src.match(/readability|[Cc]lippability/)) + docscripts[k].parentNode.removeChild(docscripts[k]); + + // let's just load jQuery and get it over with + var gjs = document.createElement('SCRIPT'); + gjs.type = 'text/javascript'; + gjs.src = 'http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js'; + document.getElementsByTagName('head')[0].appendChild(gjs); + gjs.onload = gjs.onreadystatechange = function() { + $('script').each(function() { + // jQuery gets scripts inside of conditional comments far more easily than I could figure out + if (! this.src.match(/readability|[Cc]lippability|jquery\.min\.js$/)) $(this).remove(); + }); + }; + + var objOverlay = document.createElement('div'); + var objinnerDiv = document.createElement('div'); + + objOverlay.id = 'readOverlay'; + objinnerDiv.id = 'readInner'; + + // Apply user-selected styling: + document.body.className = readStyle; + objOverlay.className = readStyle; + objinnerDiv.className = `${readMargin } ${ readSize}`; + + objinnerDiv.appendChild(grabArticle()); // Get the article and place it inside the inner Div + objOverlay.appendChild(objinnerDiv); // Insert the inner div into the overlay + + // For totally hosed HTML, add body node that can't be found because of bad HTML or something. 
+ if(document.body == null) { + body = document.createElement('body'); + document.body = body; + } + + document.body.innerHTML = ''; + + // Inserts the new content : + + document.body.insertBefore(objOverlay, document.body.firstChild); + var o = document.body.firstChild; + + return o.innerHTML; +})(); + +function getElementsByClassName(classname, node) { + if(!node) node = document.getElementsByTagName('body')[0]; + var a = []; + var re = new RegExp(`\\b${ classname }\\b`); + var els = node.getElementsByTagName('*'); + for(var i = 0, j = els.length; i < j; i++) + if(re.test(els[i].className))a.push(els[i]); + + return a; +} + +function grabArticle() { + var allParagraphs = document.getElementsByTagName('p'); + var topDivCount = 0; + var topDiv = null; + var topDivParas; + + var articleContent = document.createElement('DIV'); + var articleTitle = document.createElement('H1'); + var articleFooter = document.createElement('DIV'); + + // Replace all doubled-up
<br> tags with <p> tags, and remove fonts. + var pattern = new RegExp ('<br/?>[ \r\n\s]*<br/?>', 'g'); + document.body.innerHTML = document.body.innerHTML.replace(pattern, '</p><p>
').replace(/<\/?font[^>]*>/g, ''); + + // Grab the title from the tag and inject it as the title. + articleTitle.innerHTML = document.title; + articleContent.appendChild(articleTitle); + + // Study all the paragraphs and find the chunk that has the best score. + // A score is determined by things like: Number of <p>'s, commas, special classes, etc. + for (var j = 0; j < allParagraphs.length; j++) { + parentNode = allParagraphs[j].parentNode; + + // Initialize readability data + if(typeof parentNode.readability === 'undefined') { + parentNode.readability = { 'contentScore': 0 }; + + // Look for a special classname + if(parentNode.className.match(/(comment|meta|footer|footnote)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.className.match(/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/)) + parentNode.readability.contentScore += 25; + + // Look for a special ID + if(parentNode.id.match(/(comment|meta|footer|footnote)/)) + parentNode.readability.contentScore -= 50; + else if(parentNode.id.match(/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/)) + parentNode.readability.contentScore += 25; + } + + // Add a point for the paragraph found + if(getInnerText(allParagraphs[j]).length > 10) + parentNode.readability.contentScore++; + + // Add points for any commas within this paragraph + parentNode.readability.contentScore += getCharCount(allParagraphs[j]); + } + + // Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 + for(nodeIndex = 0; (node = document.getElementsByTagName('*')[nodeIndex]); nodeIndex++) + if(typeof node.readability !== 'undefined' && (topDiv == null || node.readability.contentScore > topDiv.readability.contentScore)) + topDiv = node; + + if(topDiv == null) { + topDiv = document.createElement('div'); + topDiv.innerHTML = 'Sorry, clippable was unable to parse this page for content. 
If you feel like it should have been able to, please <a href="http://brettterpstra.com/contact">let us know.</a>'; + } + + // REMOVES ALL STYLESHEETS ... + for (var k = 0;k < document.styleSheets.length; k++) + if (document.styleSheets[k].href != null && document.styleSheets[k].href.lastIndexOf('readability') == -1) + document.styleSheets[k].disabled = true; + + var sh = getElementsByClassName('syntaxhighlighter'); + for (var i = 0;i < sh.length;i++) { + var bar = getElementsByClassName('toolbar', sh[i]); + if (bar.length > 0) + for (var bn = 0;bn < bar.length;bn++) + bar[bn].parentNode.removeChild(bar[bn]); + + var numbers = getElementsByClassName('number', sh[i]); + if (numbers.length > 0) + for (var num = 0;num < numbers.length;num++) + numbers[num].parentNode.removeChild(numbers[num]); + } + + var dp = getElementsByClassName('dp-highlighter'); + for (var d = 0;d < dp.length;d++) + dp[d].parentNode.removeChild(dp[d]); + + var sth = getElementsByClassName('standardLighter'); + for (d = 0;d < sth.length;d++) + sth[d].parentNode.removeChild(sth[d]); + + // Remove all style tags in head (not doing this on IE) : + var styleTags = document.getElementsByTagName('style'); + for (var l = 0;l < styleTags.length; l++) + if (navigator.appName != 'Microsoft Internet Explorer') + styleTags[l].textContent = ''; + + topDiv = killCodeSpans(topDiv); // removes span tags + cleanStyles(topDiv); // Removes all style attributes + topDiv = killDivs(topDiv); // Goes in and removes DIV's that have more non <p> stuff than <p> stuff + topDiv = killBreaks(topDiv); // Removes any consecutive <br />'s into just one <br /> + + // Cleans out junk from the topDiv just in case: + topDiv = clean(topDiv, 'form'); + // topDiv = clean(topDiv, "object"); + topDiv = clean(topDiv, 'table', 8); + topDiv = clean(topDiv, 'h1'); + // topDiv = clean(topDiv, "h2"); + topDiv = clean(topDiv, 'iframe'); + + // Add the footer and contents: + articleFooter.id = 'readFooter'; + articleFooter.innerHTML = `\ + <a 
href='http://lab.arc90.com/experiments/readability'><img src='http://lab.arc90.com/experiments/readability/images/footer-readability.png'></a>\ + <a href='http://www.arc90.com'><img src='http://lab.arc90.com/experiments/readability/images/footer-arc90.png'></a>\ + <a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter »</a>\ + <div class='footer-right' >\ + <span class='version'>Readability version ${ readabilityVersion }</span>\ + </div>\ + `; + + articleContent.appendChild(topDiv); + // articleContent.appendChild(articleFooter); + document.onkeyup = docOnKeyup; + + return articleContent; +} + +function docOnKeyup(ev) { + var keyID = null; + if (navigator.appName == 'Microsoft Internet Explorer') + keyID = event.keyCode; + else + keyID = (window.event) ? event.keyCode : ev.keyCode; + + var bgcolor, fgcolor, acolor; + switch (keyID) { + + case 27: // escape + document.location.reload(true); + break; + case 37: // left arrow + bgcolor = '#222'; + fgcolor = '#F3EFCE'; + acolor = '#A19F89'; + break; + case 39: // right arrow + bgcolor = '#fff'; + fgcolor = '#333'; + acolor = '#276F78'; + break; + case 46: // delete + bgcolor = '#eee'; + fgcolor = '#333'; + acolor = '#blue'; + break; + + } + body = document.getElementById('readOverlay'); + // body.className = body.className.replace('/\blightened\b/','') + " darkened"; + body.style.backgroundColor = bgcolor; + body.style.color = fgcolor; + var alinks = body.getElementsByTagName('a'); + for (var lc = 0;lc < alinks.length;lc++) + alinks[lc].style.color = acolor; +} + +// Get the inner text of a node - cross browser compatibly. 
+function getInnerText(e) { + if (navigator.appName == 'Microsoft Internet Explorer') + return e.innerText; + else + return e.textContent; +} + +// Get character count +function getCharCount ( e, s ) { + s = s || ','; + + return getInnerText(e).split(s).length; +} + +function cleanStyles( e ) { + e = e || document; + var cur = e.firstChild; + + // If we had a bad node, there's not much we can do. + if(!e) + return; + + // Remove any root styles, if we're able. + if(typeof e.removeAttribute === 'function') + e.removeAttribute('style'); + + // Go until there are no more child nodes + while ( cur != null ) { + if ( cur.nodeType == 1 ) { + // Remove style attribute(s) : + cur.removeAttribute('style'); + cleanStyles( cur ); + } + cur = cur.nextSibling; + } +} + +function killDivs ( e ) { + var divsList = e.getElementsByTagName( 'div' ); + var curDivLength = divsList.length; + + // Gather counts for other typical elements embedded within. + // Traverse backwards so we can remove nodes at the same time without effecting the traversal. + for (var i = curDivLength - 1; i >= 0; i--) { + var p = divsList[i].getElementsByTagName('p').length; + var img = divsList[i].getElementsByTagName('img').length; + var li = divsList[i].getElementsByTagName('li').length; + var a = divsList[i].getElementsByTagName('a').length; + var embed = divsList[i].getElementsByTagName('embed').length; + var object = divsList[i].getElementsByTagName('object').length; + var pre = divsList[i].getElementsByTagName('pre').length; + var code = divsList[i].getElementsByTagName('code').length; + var divId = divsList[i].id; + var divClass = divsList[i].className; + var sphereit = divsList[i].innerHTML.match('<!-- sphereit') == null ? 0 : 1; + // If the number of commas is less than 10 (bad sign) ... 
+ if ( getCharCount(divsList[i]) < 10 ) + // And the number of non-paragraph elements is more than paragraphs + // or other ominous signs : + if (( img > p || li > p || a > p || p == 0 || divId.match('comment') != null || divClass.match('comment') != null || divId.match('share') != null || divClass.match('share') != null) && ( pre == 0 && code == 0 && embed == 0 && object == 0 && sphereit == 0 )) { + if (!p == 0 && img == 1) divsList[i].parentNode.removeChild(divsList[i]); + } + + var stopwords = ['comment', 'share', 'footer', '^ad']; + for (var sw = 0;sw < stopwords.length;sw++) { + regex = new RegExp(stopwords[sw]); + if (divId.match(regex) != null || divClass.match(regex) != null) { + console.log(`matched ${stopwords[sw]}`); + divsList[i].parentNode.removeChild(divsList[i]); + } + } + // if (divId.match("comment") != null || divClass.match("comment") != null || divId.match("share") != null || divClass.match("share") != null || divClass.match("footer") != null || divId.match("footer") != null || divClass.match(/^ad/) != null || divId.match(/^ad/) != null) { + // divsList[i].parentNode.removeChild(divsList[i]); + // } + } + + return e; +} + +function killBreaks ( e ) { + e.innerHTML = e.innerHTML.replace(/(<br\s*\/?>(\s| ?)*){1,}/g, '<br />'); + + return e; +} + +function killCodeSpans ( e ) { + e.innerHTML = e.innerHTML.replace(/<\/?\s?span(?:[^>]+)?>/g, ''); + + return e; +} + +function clean(e, tags, minWords) { + var targetList; + var y; + if (tags == 'table') { + targetList = e.getElementsByTagName( tags ); + minWords = minWords || 1000000; + for (y = 0; y < targetList.length; y++) { + // If the text content isn't laden with words, remove the child: + cells = targetList[y].getElementsByTagName('td').length; + if (cells < minWords) + targetList[y].parentNode.removeChild(targetList[y]); + } + } + else { + targetList = e.getElementsByTagName( tags ); + minWords = minWords || 1000000; + + for (y = 0; y < targetList.length; y++) + // If the text content isn't 
laden with words, remove the child: + if (getCharCount(targetList[y], ' ') < minWords && targetList[y].tagName != 'pre') + targetList[y].parentNode.removeChild(targetList[y]); + } + + return e; +} + +function convert(e, tagId) { + var children, parent, newNode; + var elems = document.getElementsByTagName(tagId); + for (y = 0; y < elems.length; y++) { + children = elems[y].childNodes; + parent = elems[y].parentNode; + newNode = document.createElement('span'); + newNode.setAttribute('style', 'font-weight:bold'); + for(var i = 0;i < children.length;i++) + newNode.appendChild(children[i]); + + parent.replaceChild(newNode, elems[y]); + } + + return e; +} diff --git a/server/reducers/euronews.js b/server/reducers/euronews.js index db57e31..743f803 100644 --- a/server/reducers/euronews.js +++ b/server/reducers/euronews.js @@ -50,6 +50,8 @@ function reduceArticle(body = '') { // const outputHTML = html.join('').replace(htmlTidy, ''); const outputHTML = html.join(''); + console.log(outputHTML); + obj.title = title; obj.image = image; obj.html = outputHTML; @@ -59,5 +61,61 @@ function reduceArticle(body = '') { return obj; } -module.exports = { reduceArticle }; +function reduceArticleV2(body = '') { + if (body === '') return {}; + + const obj = {}; + const $ = cheerio.load(body); + + $('amp-ad').remove(); + + const title = $('meta[property="og:title"]').attr('content'); + const image = `https://image.silvrtree.co.uk/640,fit,q80/${ $('meta[property="og:image"]').attr('content')}`; + + const stuff = $('[itemprop="articleBody"]'); + + const html = []; + + const content = $('div.article__content'); + + for (let top = 0, topLen = content.length; top < topLen; top++) { + const children = $(content[top]).children(); + + for (let index = 0, len = children.length; index < len; index++) { + let line = $.html($(children[index])).replace('amp-img', 'img'); + const tag = children[index].name; + + const symbol = /src=(['"])(http[s]?:\/\/)/.exec(line) || []; + + if (tag === 'amp-twitter') 
{ + const tweetid = $(children[index]).data('tweetid'); + line = `<amp-twitter width="375" + height="472" + layout="responsive" data-tweetid="${tweetid}" > </amp-twitter>`; + } + // logger.debug(symbol); + + if (symbol.length !== 0) + line = line.replace(/src=['"]http[s]?:\/\//, `src=${symbol[1]}https://image.silvrtree.co.uk/640,fit,q80/${symbol[2]}`); + + html.push(line); + } + } + + html.push('<div class="endbumper"></div>'); + // const outputHTML = html.join('').replace(htmlTidy, ''); + const outputHTML = html.join(''); + + console.log(outputHTML); + + obj.title = title; + obj.image = image; + obj.html = outputHTML; + + logger.debug(JSON.stringify(obj)); + + return obj; +} + +module.exports = { reduceArticle, reduceArticleV2 }; diff --git a/server/reducers/rightbyme.js b/server/reducers/rightbyme.js index 371f45a..1ff8f62 100644 --- a/server/reducers/rightbyme.js +++ b/server/reducers/rightbyme.js @@ -4,6 +4,9 @@ const { get, isEmpty } = require('lodash'); // Bearer YlF_b6D149xr_xnrrYudlSnpn1A53b67vALlIK2HnD0ymBXQocRvPW3KjGN8jZNw0KnyAqxGaOzU7CLVPr84_KbnTxutNRXFVR9axmRqGN6ccda1xahoZo58KC2GWnYx' + +// google api key AIzaSyBl7O9LHIthCagcqIaDkQ4um_hghYG5reE + logger.level = 'debug'; function reduceExplore(data) { diff --git a/src/v1/js/News.js b/src/v1/js/News.js index 2e28f07..dabb576 100644 --- a/src/v1/js/News.js +++ b/src/v1/js/News.js @@ -52,14 +52,16 @@ const NewsModel = Backbone.Model.extend({ }, 'initialize': function() { this.newsCollection = newsCollection; this.listenTo(this, 'change:update', this.onChange); + this.getNews = _.throttle(this.getNewsReal, 6000); + this.getNews(); }, 'onChange': function() { this.getNews(); }, - 'getNews': function() { + 'getNewsReal': function() { // const ll = this.get('llShort'); - console.info('>> News:request'); + console.info('>> News:getNewsReal'); request({ 'url': `${window.loc}/news`, 'method': 'GET', 'qs': { diff --git a/src/v1/js/NewsList.js b/src/v1/js/NewsList.js index 530dcb9..be38a26 100644 --- 
a/src/v1/js/NewsList.js +++ b/src/v1/js/NewsList.js @@ -57,7 +57,7 @@ const NewsListModel = Backbone.Model.extend({ this.getNews(); }, 'getNews': function() { - console.info('>> NewsList:request'); + console.info('>> NewsList:getNews'); request({ 'url': `${window.loc}/news`, 'method': 'GET', 'qs': { diff --git a/src/v1/js/libs/reducers.js b/src/v1/js/libs/reducers.js index a2fedb6..690b92e 100644 --- a/src/v1/js/libs/reducers.js +++ b/src/v1/js/libs/reducers.js @@ -102,9 +102,14 @@ const reduceEuronews = function(item) { obj.pubdate = fecha.format(pubdateSrc, 'dddd MMMM Do, YYYY HH:mm'); obj.description = item.description.replace(/(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)/g, ''); + if (obj.description === '1') + obj.description = ''; + obj.guid = encodeURI(item.guid.text); obj.title = item.title; + console.log(obj); + return obj; }; diff --git a/src/v1/js/libs/templates.js b/src/v1/js/libs/templates.js index 0ed5567..14f4f11 100644 --- a/src/v1/js/libs/templates.js +++ b/src/v1/js/libs/templates.js @@ -158,7 +158,6 @@ const templates = { module.exports = templates; - /* <div data-id="4b926c3af964a52068f833e3" class="itemRow mui--align-middle"> <img class="mui--align-middle" src="https://ss3.4sqi.net/img/categories_v2/nightlife/pub_32.png" width="32px" height="32px" style="-webkit-filter: invert(100%);"> diff --git a/test/euronews.scrape.js b/test/euronews.scrape.js index 32dc9da..239ab2d 100644 --- a/test/euronews.scrape.js +++ b/test/euronews.scrape.js @@ -7,11 +7,11 @@ const jsonfile = require('jsonfile'); const cheerio = require('cheerio'); -const { reduceArticle } = require('../server/reducers/euronews'); +const { reduceArticle,reduceArticleV2 } = require('../server/reducers/euronews'); test('Euronews', async t => { - t.test('Reduce a page', async t => { - const psDetail = fs.readFileSync('test/data/euronews/en002.html'); + t.skip('Reduce a page', async t => { + const psDetail = 
fs.readFileSync('./data/euronews/amp-20200114.html'); const expectedJSON = { 'bob':false }; // jsonfile.readFileSync('tests/data/cz/ps001.json'); @@ -22,5 +22,19 @@ test('Euronews', async t => { t.end(); }); + + t.test('Reduce a page V2', async t => { + const psDetail = fs.readFileSync('./data/euronews/amp-20200114.html'); + const expectedJSON = { 'bob':false }; // jsonfile.readFileSync('tests/data/cz/ps001.json'); + + + const output = await reduceArticleV2(psDetail); + + t.deepEquals(output, expectedJSON, 'Extracted Details from Page'); + + t.end(); + }); + + t.end(); }); diff --git a/test/euronewsrss.spec.js b/test/euronewsrss.spec.js new file mode 100644 index 0000000..e69de29 diff --git a/test/rightbyme.spec.js b/test/rightbyme.spec.js index f45a6a1..915faf4 100644 --- a/test/rightbyme.spec.js +++ b/test/rightbyme.spec.js @@ -522,7 +522,23 @@ const goodMergeResult = { } }; -const goodExplore = { 'latitude': 56.39449115920086, 'longitude': -5.1161516920810755, 'name':'Cruachan Power Station', 'category':'Building', 'icon':'https://ss3.4sqi.net/img/categories_v2/building/default_64.png', 'id':'4c233b549a67a5937ea7dd87', 'provider':'foursquare', 'address':'A85, Dalmally, PA33 1AN, United Kingdom', 'city':'Dalmally', 'state':'', 'postcode':'PA33 1AN', 'twitter':'', 'facebook':'', 'url':'http://www.visitcruachan.co.uk' }; +const goodExplore = { + 'address': 'A85, Dalmally, PA33 1AN, United Kingdom', + 'category': 'Building', + 'city': 'Dalmally', + 'description': '', + 'facebook': '', + 'icon': 'https://ss3.4sqi.net/img/categories_v2/building/default_64.png', + 'id': '4c233b549a67a5937ea7dd87', + 'latitude': 56.39449115920086, + 'longitude': -5.1161516920810755, + 'name': 'Cruachan Power Station', + 'postcode': 'PA33 1AN', + 'provider': 'foursquare', + 'state': '', + 'twitter': '', + 'url': '' +}; const goodFS = {