382 lines
13 KiB
JavaScript
382 lines
13 KiB
JavaScript
// rss-braider: converts feedparser items into node-rss items and braids several feed sources into one feed
|
|
var FeedParser = require('feedparser'),
|
|
bunyan = require('bunyan'),
|
|
_ = require('lodash'),
|
|
async = require('async'),
|
|
request = require('request'),
|
|
RSS = require('rss'),
|
|
fs = require('fs'),
|
|
package_json = require('../package.json'),
|
|
logger;
|
|
|
|
/**
 * RssBraider constructor.
 *
 * Copies the recognised settings from `options` onto the instance, falling
 * back to a default for every setting that is missing or falsy, and then
 * loads the plugins.
 *
 * @param {Object} [options]
 * @param {Object} [options.feeds]               stored as `this.feeds` (null when absent)
 * @param {Object} [options.logger]              logger to use; a bunyan logger named after
 *                                               the package is created when absent
 * @param {*}      [options.log_level]           when truthy, passed to `logger.level()`
 * @param {string} [options.indent]              stored as `this.indent`
 * @param {Array}  [options.dedupe_fields]       fields used to identify duplicate articles
 * @param {string} [options.date_sort_order]     defaults to "desc"
 * @param {Array}  [options.plugins_directories] directories handed to loadPlugins()
 */
var RssBraider = function (options) {
    var settings = options ? options : {},
        feeds = settings.feeds,
        logger = settings.logger,
        log_level = settings.log_level,
        indent = settings.indent,
        dedupe_fields = settings.dedupe_fields,
        date_sort_order = settings.date_sort_order,
        plugins_directories = settings.plugins_directories;

    // Bookkeeping containers, all initialised empty
    this.uniques = {};
    this.mdUniques = [];
    this.feedCount = [];

    this.feeds = feeds ? feeds : null;

    // Only build the default logger when the caller did not supply one
    if (!logger) {
        logger = bunyan.createLogger({name: package_json.name});
    }
    this.logger = logger;

    if (log_level) {
        this.logger.level(log_level);
    }

    this.indent = indent ? indent : " ";
    // The fields to use to identify duplicate articles
    this.dedupe_fields = dedupe_fields ? dedupe_fields : [];
    this.date_sort_order = date_sort_order ? date_sort_order : "desc";

    this.plugins_directories = plugins_directories ? plugins_directories : [];
    this.plugins = {};
    this.loadPlugins();
};
|
|
|
|
// loadup self.plugins with the plugin functions
|
|
RssBraider.prototype.loadPlugins = function () {
|
|
var self = this;
|
|
|
|
if (self.plugins_directories.length < 1) {
|
|
self.logger.debug("No plugins_directories specified. No plugins loaded.");
|
|
}
|
|
self.plugins_directories.forEach(function(path){
|
|
// load up each file and assign it to the plugins
|
|
var filenames = fs.readdirSync(path);
|
|
filenames.forEach(function(filename){
|
|
var plugin_name = filename.replace(/.js$/, '');
|
|
if (self.plugins[plugin_name]) {
|
|
self.logger.warn("Duplicate plugin name: ", plugin_name, "Overwriting with newer plugin");
|
|
}
|
|
self.plugins[plugin_name] = require(path + '/' + plugin_name);
|
|
self.logger.debug("plugin registered:", plugin_name);
|
|
});
|
|
});
|
|
};
|
|
|
|
/**
 * Tell whether a feed definition is registered under `feed_name`.
 *
 * @param {string} feed_name key to look up in `this.feeds`
 * @returns {boolean} true only when `this.feeds` has its own, truthy entry for the name
 */
RssBraider.prototype.feedExists = function (feed_name) {
    var feeds = this.feeds;

    if (!feeds) {
        return false;
    }

    // Own-property check: inherited keys such as "constructor" or "toString"
    // must not be reported as configured feeds.
    if (!Object.prototype.hasOwnProperty.call(feeds, feed_name)) {
        return false;
    }

    return Boolean(feeds[feed_name]);
};
|
|
|
|
// Gather data from all feed sources, process each article/item through plugins,
|
|
// trim down to desired count, dedupe and sort
|
|
/**
 * Gather the articles of every source of a feed, run each item through
 * processItem(), then dedupe, date-sort and trim per source, dedupe and sort
 * the combined list again, and render it with node-rss.
 *
 * A source that cannot be fetched or opened is logged and skipped; it does
 * not fail the whole feed and it can no longer leave the call hanging.
 *
 * @param {string}   feed_name key of the feed definition in `self.feeds`
 * @param {string}   [format]  'rss' (default) or 'json'; anything else is logged and yields "{}"
 * @param {Function} callback  called as callback(err) or callback(null, output_string)
 */
RssBraider.prototype.processFeed = function(feed_name, format, callback)
{
    var self = this,
        // self.feeds is null when no feeds were configured: treat as "unknown feed"
        feed = self.feeds ? self.feeds[feed_name] : null,
        feed_articles = [];

    if (!format) {
        format = 'rss';
    }

    if (!feed || !feed.sources || feed.sources.length < 1) {
        return callback("No definition for feed name: " + feed_name);
    }

    // Process each feed source through Feedparser to get articles.
    // Then process each item/article through rss-braider and any plugins
    async.each(feed.sources, function(source, done) {
        var count = source.count || feed.default_count || 10, // Number of articles per source
            url = source.feed_url || null,
            file_path = source.file_path || null,
            source_articles = [],
            finished = false,
            feedparser,
            finish,
            skipSource;

        if (!url && !file_path) {
            self.logger.error("url or file_path not defined for feed: " + source.name);
            return done();
        }

        // Completes this source exactly once. async.each throws when its
        // callback is invoked twice, and both an 'error' and an 'end' may
        // arrive for the same source.
        finish = function () {
            var position;

            if (finished) {
                return;
            }
            finished = true;

            // de-dupe, date sort, and trim this source's articles, then add them to the feed
            source_articles = self.dedupe(source_articles, self.dedupe_fields);
            source_articles = self.date_sort(source_articles);
            source_articles = source_articles.slice(0, count);
            feed_articles = feed_articles.concat(source_articles);

            position = self.feedCount.indexOf(source.feed_url);
            if (position > -1) {
                self.feedCount.splice(position, 1);
            }
            self.logger.debug("source finished:", source.name, "sources in flight:", self.feedCount.length);

            done();
        };

        // The input stream failed, so the parser will never emit 'end' by
        // itself: log the problem and complete the source with what we have.
        skipSource = function (error) {
            self.logger.error(error);
            finish();
        };

        // Sources currently being fetched (removed again in finish())
        self.feedCount.push(source.feed_url);

        feedparser = new FeedParser();

        feedparser.on('error', function(error) {
            // Logged only: completion of the source is driven by 'end' or by
            // a failure of the input stream.
            self.logger.error("feedparser",", source.name:", source.name, ", url:", source.feed_url, (error && error.stack) || error);
        });

        // Collect the articles from this source
        feedparser.on('readable', function() {
            var stream = this,
                item,
                article;

            while ( !!(item = stream.read()) ) {
                if (source.feed_url) {
                    item.source_url = source.feed_url;
                }

                // Process Item/Article
                article = self.processItem(item, source, feed_name);

                // plugins may filter items and return null
                if (article) {
                    source_articles.push(article);
                }
            }
        });

        feedparser.on('end', finish);

        if (url) {
            var req = request(url);

            req.on('error', skipSource);

            req.on('response', function (res) {
                var stream = this;

                if (res.statusCode !== 200) {
                    // Handled by the 'error' listener above
                    return this.emit('error', 'Bad status code: ' + res.statusCode);
                }
                stream.pipe(feedparser);
            });
        } else {
            // open file
            var filestream = fs.createReadStream(file_path);

            // Without a listener a missing/unreadable file is an uncaught exception
            filestream.on('error', skipSource);
            filestream.pipe(feedparser);
        }
    },
    function(err){
        if (err) {
            self.logger.error(err);
            return callback(err);
        }

        // Final dedupe step and re-sort across all sources
        feed_articles = self.dedupe(feed_articles, self.dedupe_fields);
        feed_articles = self.date_sort(feed_articles);

        // A feed definition without a "meta" section still renders
        var meta = feed.meta || {};

        // Create new feed with these articles. Follows node-rss spec
        var options = {
            title : meta.title,
            description : meta.description,
            generator : meta.generator || 'rss-braider',
            site_url : meta.site_url || null,
            feed_url : meta.feed_url || null,
            image_url : meta.image_url || null,
            webMaster : meta.webMaster || null,
            copyright : meta.copyright || null,
            categories : meta.categories || null,
            custom_namespaces : feed.custom_namespaces || [],
            no_cdata_fields : feed.no_cdata_fields
        };

        var newfeed = new RSS(options, feed_articles);

        var ret_string;
        switch (String(format).toLowerCase()) {
            case 'json':
                ret_string = JSON.stringify(newfeed);
                break;
            case 'rss':
                ret_string = newfeed.xml(self.indent);
                break;
            default:
                self.logger.error("Unknown format:", format);
                ret_string = "{}";
        }

        return callback(null, ret_string);
    });
};
|
|
|
|
// Accepts a feed-parser item and builds a node-rss itemOptions object
|
|
/**
 * Turn one parsed feed item into a node-rss itemOptions object and hand it to
 * runPlugins().
 *
 * @param {Object} item      parsed feed item
 * @param {Object} source    source definition the item came from
 * @param {string} feed_name name of the feed being built
 * @returns {*} whatever runPlugins() returns, or null (after logging an error)
 *              when any of the three arguments is missing
 */
RssBraider.prototype.processItem = function (item, source, feed_name) {
    var braider = this,
        hasEverything = Boolean(item) && Boolean(source) && Boolean(feed_name),
        rssItem;

    if (!hasEverything) {
        braider.logger.error("processItem: missing item, source, and/or feed_name");
        return null;
    }

    // Basic fields, copied across one by one
    rssItem = {};
    rssItem.title = item.title;
    rssItem.description = item.summary;
    rssItem.url = item.link;
    rssItem.guid = item.guid;
    rssItem.permalink = item.permalink;
    rssItem.author = item.author;
    rssItem.date = item.date;
    rssItem.categories = item.categories;
    rssItem.custom_elements = [];

    // The feed's plugins build custom elements and apply transforms/filters
    return braider.runPlugins(item, rssItem, source, feed_name);
};
|
|
|
|
/**
 * Run an item through the plugins listed in the feed's "plugins" array, in order.
 *
 * Each registered plugin is called as plugin(item, itemOptions, source):
 *   - returning -1 filters the item out: the loop stops and null is returned;
 *   - returning any other falsy value is logged and ignored (the previous
 *     itemOptions are kept);
 *   - any other return value becomes the itemOptions for the next plugin.
 * A name with no registered plugin is logged as an error and skipped.
 *
 * @param {Object} item        parsed feed item
 * @param {Object} itemOptions node-rss item options built so far
 * @param {Object} source      source definition the item came from
 * @param {string} feed_name   key of the feed in `self.feeds`
 * @returns {Object|null} the resulting itemOptions, or null when a plugin filtered the item
 */
RssBraider.prototype.runPlugins = function (item, itemOptions, source, feed_name) {
    var self = this,
        // self.feeds may be null (no feeds configured); fall back to an empty definition
        feed = (self.feeds && self.feeds[feed_name]) || {},
        plugins_list = feed.plugins || [],
        // "meta" is optional here; only used in log messages
        feed_title = feed.meta ? feed.meta.title : undefined,
        current = itemOptions,
        plugin_name,
        result,
        i;

    // Process the item through the desired feed plugins
    for (i = 0; i < plugins_list.length; i++) {
        plugin_name = plugins_list[i];

        if (!self.plugins[plugin_name]) {
            self.logger.error("A plugin named '" + plugin_name + "' hasn't been registered");
            continue;
        }

        result = self.plugins[plugin_name](item, current, source);

        // A plugin returning -1 means skip this item
        if (result === -1) {
            self.logger.debug("Plugin '" + plugin_name + "' filtered item from feed '" + feed_title + "'", item.guid);
            return null;
        }

        // A plugin that returned null/undefined: keep what we had
        if (!result) {
            self.logger.debug("Plugin '" + plugin_name + "' failed to return itemOptions for feed:'" + feed_title + "'", item.guid);
            continue;
        }

        // Prepare for next plugin.
        current = result;
    }

    return current;
};
|
|
|
|
// Dedupe articles in node-rss itemOptions format
|
|
// Accepts an array of fields to dedupe on, or does a basic uniq
|
|
// operation on the articles array
|
|
// TODO, make this a plugin?
|
|
/**
 * Stricter variant of dedupe(): an article is dropped as soon as ANY of its
 * dedupe field values has been seen before in this call. With no fields, a
 * plain uniq of the array is returned.
 *
 * The seen-values list is local to the call. Keeping it on the instance made
 * every later call treat all articles as already seen and return nothing.
 *
 * NOTE(review): values of all fields share one list, so a value seen in one
 * field also blocks the same value in another field - confirm this is intended.
 *
 * @param {Array}  articles_arr articles in node-rss itemOptions format
 * @param {Array}  fields       names of the fields to compare
 * @returns {Array} articles that passed the filter, in their original order
 */
RssBraider.prototype.mdDedupe = function(articles_arr, fields)
{
    var deduped_articles = [],
        seen = [];

    if ( !fields || fields.length < 1 ) {
        return _.uniq(articles_arr);
    }

    articles_arr.forEach(function(article){
        var matches = 0;

        fields.forEach(function(field){
            if (seen.indexOf(article[field]) !== -1) {
                matches++;
            } else {
                seen.push(article[field]);
            }
        });

        // Unique only when none of its field values was seen before
        if (matches === 0) {
            deduped_articles.push(article);
        }
    });

    return deduped_articles;
};
|
|
|
|
/**
 * Dedupe articles in node-rss itemOptions format.
 *
 * With no fields, a plain uniq of the array is returned. Otherwise an article
 * is dropped when every one of its dedupe field values has already been seen
 * in the same field of an earlier article of this call.
 *
 * The seen-values table is local to the call. Keeping it on the instance made
 * a second pass over the same articles (and every later call) discard
 * everything.
 *
 * NOTE(review): values are tracked per field, not per article, so an article
 * whose fields each match a different earlier article is also dropped.
 *
 * @param {Array} articles_arr articles to filter
 * @param {Array} fields       names of the fields to compare
 * @returns {Array} the articles that were kept, in their original order
 */
RssBraider.prototype.dedupe = function(articles_arr, fields){
    var self = this,
        // No prototype, so field names like "constructor" cannot hit inherited keys
        seen = Object.create(null),
        deduped_articles = [];

    if ( !fields || fields.length < 1 ) {
        return _.uniq(articles_arr);
    }

    articles_arr.forEach(function(article){
        var matches = 0;

        fields.forEach(function(field){
            if (!seen[field]) {
                seen[field] = [];
            }
            if (seen[field].indexOf(article[field]) !== -1) {
                matches++;
            } else {
                seen[field].push(article[field]);
            }
        });

        if (matches !== fields.length) {
            // it's unique
            deduped_articles.push(article);
        } else {
            // Every dedupe field matched an earlier value, so filter it out
            self.logger.debug("skipping duplicate", '"' + article.title + '"', article.guid);
        }
    });

    return deduped_articles;
};
|
|
|
|
// TODO: Could be a plugin
|
|
// Sort articles by date
|
|
/**
 * Sort articles by their `date`, newest first when `this.date_sort_order` is
 * "desc" and oldest first otherwise.
 *
 * Articles without a usable date (missing, null, or not parseable) are ranked
 * as the oldest instead of throwing. Non-Date values are converted with
 * `new Date(value)`.
 *
 * @param {Array} articles_arr articles to sort (not modified)
 * @returns {Array} a new, sorted array
 */
RssBraider.prototype.date_sort = function(articles_arr) {
    var sorted_articles = _.sortBy(articles_arr, function(article) {
        var date = article ? article.date : null,
            time;

        if (date === null || date === undefined) {
            return 0;
        }

        time = new Date(date).getTime();
        return isNaN(time) ? 0 : time;
    });

    if (this.date_sort_order === "desc") {
        sorted_articles.reverse();
    }

    return sorted_articles;
};
|
|
|
|
// Public API: the RssBraider constructor
module.exports = RssBraider;