commit 0b9023f45e9c39592d7097b497b649ad7f260da0 Author: Kip Gebhardt Date: Tue Dec 23 15:39:57 2014 -0800 first commit of new repo diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/index.js b/index.js new file mode 100644 index 0000000..bb77575 --- /dev/null +++ b/index.js @@ -0,0 +1,6 @@ +var RssBraider = require('./lib/RssBraider'); + +exports.createClient = function (options) { + return new RssBraider(options); +}; +exports.RssBraider = RssBraider; diff --git a/lib/RssBraider.js b/lib/RssBraider.js new file mode 100644 index 0000000..cdb750e --- /dev/null +++ b/lib/RssBraider.js @@ -0,0 +1,240 @@ +// process feed-reader item into node-rss item + +var FeedParser = require('feedparser'), + bunyan = require('bunyan'), + _ = require('lodash'), + async = require('async'), + request = require('request'), + RSS = require('rss'); + + +var logger; +var RssBraider = function (options) { + this.config = options.config; + this.feeds = options.feeds || null; // TOOD validate feed configs + this.logger = logger = options.logger || bunyan.createLogger({name: 'rss-braider'}); +}; + +RssBraider.prototype.setConfig = function(config) { + this.config = config; +}; + +RssBraider.prototype.init = function() { + // Validate the configs? +}; + +RssBraider.prototype.feedExists = function (feed_name) { + if (this.feeds && this.feeds[feed_name]) { + return true; + } else { + return false; + } +}; + +RssBraider.prototype.processFeed = function(feed_name, format, callback) +{ + console.time("process"); + + if (!format) { + format = 'json'; + } + var self = this, + feed = this.feeds[feed_name], + feed_articles = []; + + async.each(feed.feed_sources, function(source, callback) { + var count = source.count || feed.default_count || 10, + url = source.url, + source_articles = []; + + // todo: Check if source.file is set and set up a fs stream read + var req = request(url), + feedparser = new FeedParser(); + + logger.info("request to", url); + + req.on('error', function (error) { + logger.error(error); + }); + + req.on('response', function (res) { + var stream = this; + if (res.statusCode !== 200) { + return this.emit('error', new Error('Bad status code')); + } + stream.pipe(feedparser); + }); + + feedparser.on('error', function(error) { + logger.error("feedparser: error", error); + }); + + // Collect the articles from this source + feedparser.on('readable', function() { + // This is where the action is! + var stream = this, + item; + + while ( item = stream.read() ) { + logger.info("item received", item.guid); + var article = self.processItem(item); + if (article) { + source_articles.push(article); + } + } + }); + + feedparser.on("end", function(){ + // sort and de-dupe this feed's articles and push them into array + source_articles = self.dedupe(source_articles); + source_articles = self.date_sort(source_articles); + source_articles = source_articles.slice(0, count); + feed_articles = feed_articles.concat(source_articles); + callback(); + }); + }, + function(err){ + if (err) { + logger.error(err); + } else { + // all done + + logger.info(feed_articles); + + // Sort the stories for the source by date descending + feed_articles = self.dedupe(feed_articles); + feed_articles = self.date_sort(feed_articles); + feed_articles.reverse(); + + // Create new feed with these articles + var options = { + title : feed.data.title, + site_url : "http://www.kqed.org", + description : feed.data.description, + generator : feed.data.generator || 'rss-braider', + feed_url : feed.data.url, + custom_namespaces : self.config.custom_namespaces || [], + no_cdata_fields : feed.no_cdata_fields + }; + + var newfeed = new RSS(options, feed_articles); + + + var ret_string; + switch (format.toLowerCase()) { + case 'json': + ret_string = JSON.stringify(newfeed); + break; + case 'xml': + ret_string = newfeed.xml("\t"); + break; + default: + ret_string = "{}"; + } + console.timeEnd("process"); + console.log(format, ret_string); + + callback(null, ret_string); + // return ret_string; + } + }); + +}; + +// Accepts a feed-parser item and builds a node-rss itemOptions object +RssBraider.prototype.processItem = function (item) { + if (!item) { + logger.error("processItem: no item passed in"); + return null; + } + + // logger.info("item", item); + + // Basics + var itemOptions = { + title : item.title, + description : item.summary, + url : item.link, + guid : item.guid, + permalink : item.permalink, + author : item.author, + date : item.date, + custom_elements : [] + }; + + + ////////////////// + // Custom elements + ////////////////// + + // TODO: Generify the following if possible + + // content:encoded (i.e. description) + if (item["content:encoded"] && item["content:encoded"]["#"]){ + var content_encoded = item["content:encoded"]["#"]; + itemOptions.custom_elements.push( + { "content:encoded": + { + _cdata: content_encoded + } + } + ); + } + + // // wfw + if (item["wfw:commentrss"] && item["wfw:commentrss"]["#"]){ + itemOptions.custom_elements.push({ "wfw:commentRss": item["wfw:commentrss"]["#"]}); + } + + // // // slash comments + if (item["slash:comments"] && item["slash:comments"]["#"]){ + itemOptions.custom_elements.push({ "slash:comments": item["slash:comments"]["#"]}); + } + + // Images + // Take 'media:thumbnail', + // else + // 'media:content'[0]'media:thumbnail' + // else + // 'media:thumbnail' + if (item['media:thumbnail'] && item['media:thumbnail']['#']) { + itemOptions.custom_elements.push({'media:thumbnail' : item['media:thumbnail']['#']}); + } else { + if (item["media:content"]) { + var media_contents; + if (! _.isArray(item['media:content'])) { + media_contents = [item['media:content']]; + } else { + media_contents = item['media:content']; + } + + if ( media_contents[0] && + media_contents[0]['media:thumbnail'] && + media_contents[0]['media:thumbnail']['@'] && + media_contents[0]['media:thumbnail']['@'].url) { + + itemOptions.custom_elements.push({'media:thumbnail' : media_contents[0]['media:thumbnail']['@'].url}); + + } else { + itemOptions.custom_elements.push({'media:thumbnail' : media_contents[0]['@'].url}); + } + } + } + + return itemOptions; +}; + +// Dedupe articles in node-rss itemOptions format +RssBraider.prototype.dedupe = function(articles_arr){ + // TODO: sort by guid, url, etc + return _.uniq(articles_arr); +}; + +RssBraider.prototype.date_sort = function(articles_arr) { + var sorted_articles = _.sortBy(articles_arr, function(article) { + return article.date.getTime(); + }); + return sorted_articles; +}; + +module.exports = RssBraider; \ No newline at end of file diff --git a/package.json b/package.json new file mode 100644 index 0000000..356de10 --- /dev/null +++ b/package.json @@ -0,0 +1,25 @@ +{ + "name": "rss-braider", + "version": "0.0.1", + "description": "Braid or aggregate RSS feeds into a single RSS document", + "main": "index.js", + "scripts": { + "test": "tape test" + }, + "keywords": [ + "rss", + "braider", + "combiner" + ], + "author": "Kip Gebhardt ", + "license": "ISC", + "dependencies": { + "async": "^0.9.0", + "bunyan": "^1.2.3", + "feedparser": "^0.19.2", + "lodash": "^2.4.1", + "request": "^2.51.0", + "rss": "rv-kip/node-rss.git", + "tape": "^3.0.3" + } +}