// rss-braider/lib/RssBraider.js
// process feed-reader item into node-rss item
var FeedParser = require('feedparser'),
    bunyan = require('bunyan'),
    _ = require('lodash'),
    async = require('async'),
    request = require('request'),
    RSS = require('rss'),
    fs = require('fs');
// Module-level logger. It is reassigned every time an RssBraider is
// constructed, so it always refers to the most recently created instance's
// logger.
var logger;

/**
 * Create a braider that merges several source feeds into named output feeds.
 *
 * @param {Object} [options]
 * @param {Object} [options.feeds]  Feed definitions keyed by feed name;
 *                                  stored as null when omitted.
 * @param {Object} [options.logger] Logger to use; a bunyan logger named
 *                                  'rss-braider' is created when omitted.
 * @param {string} [options.indent] Indent string handed to the XML renderer;
 *                                  defaults to " ".
 */
var RssBraider = function (options) {
    // Tolerate `new RssBraider()` being called without an options object.
    options = options || {};

    this.feeds = options.feeds || null; // TODO validate feed configs
    this.logger = logger = options.logger || bunyan.createLogger({name: 'rss-braider'});
    this.indent = options.indent || " ";
};
/**
 * Initialization hook. Currently a placeholder that performs no work and
 * returns undefined.
 */
RssBraider.prototype.init = function() {
    // Validate the feeds?
};
/**
 * Report whether a feed definition is registered under `feed_name`.
 *
 * @param {string} feed_name  Key to look up in this.feeds.
 * @returns {boolean} true when this.feeds is set and holds a truthy entry
 *                    for `feed_name`; false otherwise.
 */
RssBraider.prototype.feedExists = function (feed_name) {
    var known_feeds = this.feeds;
    return Boolean(known_feeds && known_feeds[feed_name]);
};
/**
 * Fetch every source of the feed registered as `feed_name`, merge their
 * articles and render the combined feed.
 *
 * Each source is read from `source.url` (HTTP) or `source.file_path` (local
 * file). A source that cannot be read is logged and skipped so that one bad
 * source does not prevent the rest of the feed from being produced.
 *
 * @param {string} feed_name  Key of the feed definition in this.feeds.
 * @param {string} [format]   'json' (default), 'rss' or 'xml'. Any other
 *                            value is logged and rendered as "{}". May be
 *                            omitted, in which case the callback is passed as
 *                            the second argument.
 * @param {function} callback Called as callback(err, output_string). When the
 *                            feed is not defined or has no sources, `err` is
 *                            a string and no output is passed.
 */
RssBraider.prototype.processFeed = function(feed_name, format, callback)
{
    // Support processFeed(feed_name, callback).
    if (typeof format === 'function' && !callback) {
        callback = format;
        format = null;
    }
    if (!format) {
        format = 'json';
    }

    var self = this,
        log = self.logger || logger,
        feed = self.feeds ? self.feeds[feed_name] : null,
        feed_articles = [];

    if (!feed || !feed.sources || feed.sources.length < 1) {
        return callback("No definition for feed name: " + feed_name);
    }

    async.each(feed.sources, function(source, source_done) {
        var count = source.count || feed.default_count || 10,
            url = source.url || null,
            file_path = source.file_path || null,
            source_articles = [],
            finished = false,
            feedparser = new FeedParser();

        // Fold this source's articles into the feed and signal completion.
        // Guarded so it runs once even if both an input error and the
        // parser's 'end' event are seen for the same source.
        var finish_source = function () {
            if (finished) {
                return;
            }
            finished = true;
            source_articles = self.dedupe(source_articles);
            source_articles = self.date_sort(source_articles);
            // date_sort orders oldest-first, so the newest `count` articles
            // are at the tail of the array.
            source_articles = source_articles.slice(-count);
            feed_articles = feed_articles.concat(source_articles);
            source_done();
        };

        // Parser handlers are attached before any input is wired up so no
        // event can be missed.
        feedparser.on('error', function(error) {
            // Logged only: completion is still driven by the 'end' event.
            log.error("feedparser: error", error);
        });

        // Collect the articles from this source
        feedparser.on('readable', function() {
            var stream = this,
                item;
            while ((item = stream.read())) {
                var article = self.processItem(item);
                if (article) {
                    source_articles.push(article);
                }
            }
        });

        feedparser.on("end", function(){
            finish_source();
        });

        if (url) {
            var req = request(url);

            req.on('error', function (error) {
                log.error(error);
                // Nothing (more) will reach the parser; give up on this source.
                finish_source();
            });

            req.on('response', function (res) {
                var stream = this;
                if (res.statusCode !== 200) {
                    return stream.emit('error', new Error('Bad status code'));
                }
                stream.pipe(feedparser);
            });
        } else if (file_path) {
            var filestream = fs.createReadStream(file_path);
            // Without a listener an unreadable file would throw an unhandled
            // 'error' event.
            filestream.on('error', function (error) {
                log.error(error);
                finish_source();
            });
            filestream.pipe(feedparser);
        } else {
            log.error("url or file_path not defined for feed: " + source.name);
            finish_source();
        }
    },
    function(err){
        if (err) {
            log.error(err);
            return callback(err);
        }

        // Order the merged articles by date, newest first
        feed_articles = self.dedupe(feed_articles);
        feed_articles = self.date_sort(feed_articles);
        feed_articles.reverse();

        // Create new feed with these articles
        var meta = feed.meta || {};
        var options = {
            title             : meta.title,
            site_url          : meta.site_url || "http://www.kqed.org",
            description       : meta.description,
            generator         : meta.generator || 'rss-braider',
            feed_url          : meta.url,
            custom_namespaces : feed.custom_namespaces || [],
            no_cdata_fields   : feed.no_cdata_fields
        };

        var newfeed = new RSS(options, feed_articles);

        var ret_string;
        switch (String(format).toLowerCase()) {
            case 'json':
                ret_string = JSON.stringify(newfeed);
                break;
            case 'rss':
            case 'xml':
                ret_string = newfeed.xml(self.indent);
                break;
            default:
                log.error("Unknown format:", format);
                ret_string = "{}";
        }

        return callback(null, ret_string);
    });
};
/**
 * Accepts a feed-parser item and builds a node-rss itemOptions object.
 *
 * Besides the basic fields, selected namespaced elements are copied into
 * `custom_elements`: content:encoded (as CDATA), wfw:commentRss,
 * slash:comments and a media:thumbnail.
 *
 * @param {Object} item  Parsed feed item.
 * @returns {Object|null} itemOptions for node-rss, or null (after logging an
 *                        error) when no item is passed in.
 */
RssBraider.prototype.processItem = function (item) {
    var log = (this && this.logger) || logger;

    if (!item) {
        log.error("processItem: no item passed in");
        return null;
    }

    // Wrap a thumbnail URL as a media:thumbnail element carrying a url attribute.
    var thumbnail_element = function (thumbnail_url) {
        return {
            'media:thumbnail' : [{
                _attr: {
                    url: thumbnail_url
                }
            }]
        };
    };

    // Basics
    var itemOptions = {
        title           : item.title,
        description     : item.summary,
        url             : item.link,
        guid            : item.guid,
        permalink       : item.permalink,
        author          : item.author,
        date            : item.date,
        custom_elements : []
    };
    var custom_elements = itemOptions.custom_elements;

    //////////////////
    // Custom elements PLUGINS
    //////////////////

    // content:encoded (i.e. description)
    if (item["content:encoded"] && item["content:encoded"]["#"]) {
        custom_elements.push({
            "content:encoded": {
                _cdata: item["content:encoded"]["#"]
            }
        });
    }

    // Elements whose text is copied through unchanged. `from` is the key on
    // the parsed item, `to` is the element name written to the output (they
    // differ in letter case for wfw:commentRss).
    var passthrough = [
        { from: "wfw:commentrss", to: "wfw:commentRss" },
        { from: "slash:comments", to: "slash:comments" }
    ];
    passthrough.forEach(function (mapping) {
        var parsed = item[mapping.from];
        if (parsed && parsed["#"]) {
            var element = {};
            element[mapping.to] = parsed["#"];
            custom_elements.push(element);
        }
    });

    // Images
    // Take the item's own 'media:thumbnail' text,
    // else
    //  the url of 'media:content'[0]'s nested 'media:thumbnail',
    // else
    //  the url of 'media:content'[0] itself.
    if (item['media:thumbnail'] && item['media:thumbnail']['#']) {
        custom_elements.push({
            'media:thumbnail': item['media:thumbnail']['#']
        });
    } else if (item["media:content"]) {
        var media_contents = Array.isArray(item['media:content']) ?
            item['media:content'] :
            [item['media:content']];
        var first_media = media_contents[0];
        var nested_thumbnail = first_media && first_media['media:thumbnail'];
        var thumbnail_url = null;

        if (nested_thumbnail && nested_thumbnail['@'] && nested_thumbnail['@'].url) {
            thumbnail_url = nested_thumbnail['@'].url;
        } else if (first_media && first_media['@'] && first_media['@'].url) {
            thumbnail_url = first_media['@'].url;
        }

        // Only emit a thumbnail when a url was actually found; media:content
        // entries without attributes are skipped instead of throwing.
        if (thumbnail_url) {
            custom_elements.push(thumbnail_element(thumbnail_url));
        }
    }

    return itemOptions;
};
/**
 * Dedupe articles in node-rss itemOptions format.
 *
 * Two articles are considered the same when they share a guid or, for
 * articles without a guid, when they share a url. Articles with neither are
 * only dropped when the very same object appears more than once. The first
 * occurrence is kept and the input order is preserved.
 *
 * @param {Array} articles_arr  Articles to filter; a missing value is treated
 *                              as an empty list.
 * @returns {Array} New array without duplicates.
 */
RssBraider.prototype.dedupe = function(articles_arr){
    // Prototype-less map so that no identifier can clash with Object.prototype.
    var seen_keys = Object.create(null);
    var unique_articles = [];

    (articles_arr || []).forEach(function (article) {
        var key = null;
        if (article && article.guid) {
            key = 'guid:' + article.guid;
        } else if (article && article.url) {
            key = 'url:' + article.url;
        }

        if (key !== null) {
            if (seen_keys[key]) {
                return;
            }
            seen_keys[key] = true;
        } else if (unique_articles.indexOf(article) !== -1) {
            // No identifying field: fall back to object identity.
            return;
        }
        unique_articles.push(article);
    });

    return unique_articles;
};
/**
 * Return a copy of `articles_arr` ordered by article date, oldest first.
 *
 * Articles whose date is missing or cannot be interpreted as a date sort as
 * if dated at the epoch (time 0) instead of throwing.
 *
 * @param {Array} articles_arr  Articles in node-rss itemOptions format.
 * @returns {Array} New array sorted ascending by date.
 */
RssBraider.prototype.date_sort = function(articles_arr) {
    return _.sortBy(articles_arr, function(article) {
        var date = article ? article.date : null;
        if (date === null || date === undefined) {
            return 0;
        }
        // For a Date this equals date.getTime(); other values are parsed.
        var time = new Date(date).getTime();
        return isNaN(time) ? 0 : time;
    });
};
module.exports = RssBraider;