'use strict';
/**
 * Created by Martin on 22/02/2016.
 *
 * Express router for the bookmark keeper.
 *
 * Pages are fetched with `request`, reduced to their main content with
 * cheerio, converted to markdown and stored in the Cloudant database.
 * Work is dispatched through a module-local EventEmitter (`busEmitter`).
 */
const express = require('express');
const http = require('http');
const request = require('request');
const cheerio = require('cheerio');
const util = require('util');
const jsonfile = require('jsonfile');
const fs = require('fs');
const STRING = require('string');
const converter = require('html-to-markdown');
const zlib = require('zlib');
const log4js = require('log4js');
const URL = require('url');
const EventEmitter = require('events');
const Cloudant = require('cloudant');

const logger = log4js.getLogger();
const router = express.Router();
const busEmitter = new EventEmitter();

const db_name = 'keeper';

/*
 We've moved to cloudant through IBM Bluemix for the database

 https://25f854ee-1b51-49ff-acd9-5b0ff478d944-bluemix.cloudant.com/dashboard.html#usage
 */
// FIXME(security): these credentials are committed to source control. They are
// kept only as a fallback so existing deployments keep working; rotate them and
// supply CLOUDANT_ACCOUNT / CLOUDANT_PASSWORD / CLOUDANT_DATABASE instead.
const credentials = {
  "username": "25f854ee-1b51-49ff-acd9-5b0ff478d944-bluemix",
  "password": "8e417af1b0462ca55726848846cc6b8696fc76defe9d1864cbc334be59549e0c",
  "host": "25f854ee-1b51-49ff-acd9-5b0ff478d944-bluemix.cloudant.com",
  "port": 443,
  "url": "https://25f854ee-1b51-49ff-acd9-5b0ff478d944-bluemix:8e417af1b0462ca55726848846cc6b8696fc76defe9d1864cbc334be59549e0c@25f854ee-1b51-49ff-acd9-5b0ff478d944-bluemix.cloudant.com",
  "database" : "keeper"
};

const cloudant = Cloudant({
  account: process.env.CLOUDANT_ACCOUNT || credentials.username,
  password: process.env.CLOUDANT_PASSWORD || credentials.password
});
const dbCloudant = cloudant.db.use(process.env.CLOUDANT_DATABASE || credentials.database);

const jsonFile = __dirname + '/' + 'output.json';
const bodyfile = __dirname + '/' + 'body.html';
const htmlfile = __dirname + '/' + 'testoutput.html';

// Image proxy prefixes prepended to image URLs.
const IMAGE_PROXY_FULL = 'http://image.silvrtree.co.uk/900,fit/';
const IMAGE_PROXY_THUMB = 'http://image.silvrtree.co.uk/100,fit,q80/';

const JSON_HEADERS = {'Content-Type': 'application/json'};

// Selectors tried in order by genericProcessor to find the main content.
const generics = [
  'ARTICLE',
  'div.content_column',
  'div.post',
  'div.page',
  '#recipe-single',
  'div.content.body'
];

// Selectors removed from the grabbed content by cleaner().
const UNWANTED_SELECTORS = [
  'LINK',
  'META',
  'TITLE',
  'div#disqus_thread',
  'SCRIPT',
  'FOOTER',
  'div.ssba',
  '.shareaholic-canvas',
  '.yarpp-related',
  'div.dfad',
  'div.postFooterShare',
  'div#nextPrevLinks',
  '.post-comments',
  'HEADER',
  '.post-title',
  '#side-menu',
  '.footer-container',
  '#pre-footer',
  '#cakephp-global-navigation',
  '.masthead',
  '.breadcrumb-header',
  '.single-recipe-sidebar',
  '#recipe-related-videos',
  '#tnav',
  '.footer',
  '#tb-wrapper',
  '#comments',
  '#menu'
];

// Host-specific processors; processBody() picks the first whose `url` equals
// the host of the page being grabbed. The functions are hoisted declarations.
const specialHandlers = [
  {url: 'www.reddit.com', fn: doReddit},
  {url: 'developer.android.com', fn: doAndroidDeveloper},
  {url: 'www.engadget.com', fn: doEngadget}
];

/**
 * Writes a JSON response with the given status code.
 *
 * @param {object} res - HTTP response.
 * @param {number} status - HTTP status code.
 * @param {*} payload - Value serialised with JSON.stringify.
 */
function sendJson(res, status, payload) {
  res.writeHead(status, JSON_HEADERS);
  res.end(JSON.stringify(payload));
}

/**
 * Own-property check that tolerates a missing or prototype-less object.
 *
 * @param {*} target - Value to inspect.
 * @param {string} key - Property name.
 * @returns {boolean} True when `target` is an object owning `key`.
 */
function hasOwn(target, key) {
  return target !== null && typeof target === 'object' &&
    Object.prototype.hasOwnProperty.call(target, key);
}

/**
 * Removes every element matching UNWANTED_SELECTORS from a selection.
 *
 * @param {object} b - cheerio selection; modified in place.
 * @returns {object} The same selection.
 */
function cleaner(b) {
  UNWANTED_SELECTORS.forEach(function(selector) {
    b.find(selector).remove();
  });

  return b;
}

/**
 * Inserts a new bookmark document into the database.
 *
 * @param {object} obj - Document to insert.
 */
function insertBookmark(obj) {
  logger.debug('Inserting into couch...');
  logger.info(util.inspect(obj));
  dbCloudant.insert(obj, function(err) {
    if (err) {
      logger.error('Error inserting into couch', err);
      return;
    }
    // Logged here, not after the call, so it reflects the finished insert.
    logger.debug('Insert done..');
  });
}

/**
 * Writes a new revision of an existing bookmark and, on success, triggers a
 * refresh of the tags document via the 'updateTagsDB' event.
 *
 * @param {object} obj - Document body; `_id` and `_rev` are set on it.
 * @param {string} _id - Document id.
 * @param {string} _rev - Current document revision.
 */
function updateBookmark(obj, _id, _rev) {
  logger.debug('Updating couch...');
  // The caller's object is intentionally updated in place: processBody()
  // returns the same object afterwards, as the original code did.
  obj._id = _id;
  obj._rev = _rev;
  dbCloudant.insert(obj, function(err) {
    if (err) {
      logger.error('Error updating into couch', err);
      return;
    }
    logger.info('I think we updated ok...');
    logger.debug('Update done..');
    busEmitter.emit('updateTagsDB');
  });
}

const doInsertBookmark = (obj) => {
  insertBookmark(obj);
};

const doUpdateBookmark = (obj, _id, _rev) => {
  updateBookmark(obj, _id, _rev);
};

const doGetBookmark = (obj) => {
  genericGrab(obj);
};

const doGetBookmarkRedo = (obj) => {
  genericGrab(obj);
};

const doGetBookmarkRes = (url, res) => {
  logger.debug('doGetBookmarkRes');
  genericGrab(url, res);
};

/**
 * Rebuilds the master tag list: collects the values of the 'getAllTags' view,
 * de-duplicates them, and stores the result on every document returned by the
 * 'taglist' view.
 */
const doUpdateTagsDB = () => {
  logger.debug('Update the tags database...');
  dbCloudant.view('getAllTags', 'getAllTags', function(err, body) {
    if (err) {
      logger.error('Unable to read getAllTags view', err);
      return;
    }

    const allTags = body.rows.reduce(function(acc, doc) {
      return acc.concat(doc.value);
    }, []);
    // A Set keeps first-occurrence order, matching the previous indexOf filter.
    const masterList = Array.from(new Set(allTags));

    dbCloudant.view('taglist', 'taglist', function(err, body) {
      if (err) {
        logger.error('NO TAG LIST EXISTS');
        return;
      }
      body.rows.forEach(function(doc) {
        doSaveTagsDB(doc.value, masterList);
      });
    });
  });
};

/**
 * Stores `newList` as the `taglist` of an existing tags document.
 *
 * @param {object} orig - Existing document; modified in place and re-inserted.
 * @param {Array} newList - Replacement tag list.
 */
const doSaveTagsDB = (orig, newList) => {
  logger.debug('doSaveTagsDB');
  orig.taglist = newList;
  dbCloudant.insert(orig, function(err) {
    if (err) {
      logger.error('Error updating into couch', err);
      return;
    }
    logger.info('Updated the tags list...');
  });
};

// Events
busEmitter.on('saveBookmarkData', doInsertBookmark);
busEmitter.on('updateBookmarkData', doUpdateBookmark);
busEmitter.on('getBookmark', doGetBookmark);
busEmitter.on('getBookmarkRes', doGetBookmarkRes);
busEmitter.on('getBookmarkRedo', doGetBookmarkRedo);
busEmitter.on('updateTagsDB', doUpdateTagsDB);
busEmitter.on('saveTagsDB', doSaveTagsDB);

/**
 * Points every image in `container` at the image proxy and records the first
 * image URL found as `obj.thumbnail`.
 *
 * @param {Function} $ - cheerio instance the container belongs to.
 * @param {object} container - cheerio selection holding the content.
 * @param {string} url - Address of the page, used to resolve relative sources.
 * @param {object} obj - Bookmark being built; may gain a `thumbnail` property.
 */
function rewriteImages($, container, url, obj) {
  try {
    container.find('IMG').each(function() {
      const img = $(this);
      let src = img.attr('src');

      logger.debug('!!!!' + src);
      // attr() yields undefined for an <img> without a src attribute; such
      // images are skipped rather than aborting the whole loop.
      if (src === null || typeof src === 'undefined') {
        return;
      }

      if (!STRING(src).startsWith('http')) {
        logger.debug('Stripping:' + src);
        // url.resolve handles root-relative, directory-relative and
        // protocol-relative ("//host/...") sources.
        src = URL.resolve(url, STRING(src).trim().s);
      }

      if (typeof obj.thumbnail === 'undefined') {
        obj.thumbnail = src;
      }

      img.attr('src', IMAGE_PROXY_FULL + src);
    });
  } catch (e) {
    logger.error(e);
  }
}

/**
 * Fills in the stored fields of a bookmark from the processed document.
 *
 * @param {Function} $ - cheerio instance for the whole page.
 * @param {object} container - cheerio selection holding the reduced content.
 * @param {string} url - Address of the page.
 * @param {string} title - Page title text.
 * @param {object} obj - Bookmark object to populate.
 * @returns {object} `obj` with url, html, reduced, nib, title and markdown set.
 */
function buildBookmark($, container, url, title, obj) {
  // html() is null when the selection is empty; fall back to an empty string
  // so the string helpers below do not throw.
  const reducedHtml = container.html() || '';

  obj.url = STRING(url).trim().s;
  obj.html = $.html();
  obj.reduced = STRING(reducedHtml).trim().s;
  obj.nib = STRING(container.text()).collapseWhitespace().trim().left(300).s;
  obj.title = STRING(title).collapseWhitespace().s;
  obj.markdown = converter.convert(obj.reduced);

  return obj;
}

/**
 * Processor for www.engadget.com pages (content in DIV#page_body).
 *
 * @param {string} body - Page HTML.
 * @param {string} url - Page address.
 * @returns {object} Bookmark object.
 */
function doEngadget(body, url) {
  logger.info('GRABBING Engadget');
  const obj = {};
  const $ = cheerio.load(body);
  const title = $('TITLE').text();
  let tdihbody = $('DIV#page_body');

  logger.debug('Length:' , tdihbody.length);
  tdihbody = cleaner(tdihbody);
  logger.debug('Title: ', title);

  rewriteImages($, tdihbody, url, obj);
  return buildBookmark($, tdihbody, url, title, obj);
}

/**
 * Processor for developer.android.com pages (content in DIV.jd-descr).
 *
 * @param {string} body - Page HTML.
 * @param {string} url - Page address.
 * @returns {object} Bookmark object.
 */
function doAndroidDeveloper(body, url) {
  logger.info('GRABBING AndroidDeveloper');
  const obj = {};
  const $ = cheerio.load(body);
  const title = $('TITLE').text();
  let tdihbody = $('DIV.jd-descr');

  logger.debug(tdihbody.length);
  tdihbody = cleaner(tdihbody);
  logger.debug(title);

  rewriteImages($, tdihbody, url, obj);
  return buildBookmark($, tdihbody, url, title, obj);
}

/**
 * Processor for www.reddit.com pages (content in DIV.entry). Images are left
 * untouched, as in the original implementation.
 *
 * @param {string} body - Page HTML.
 * @param {string} url - Page address.
 * @returns {object} Bookmark object.
 */
function doReddit(body, url) {
  logger.info('GRABBING REDDIT');
  const obj = {};
  const $ = cheerio.load(body);
  const title = $('TITLE').text();
  let tdihbody = $('DIV.entry');

  tdihbody.find('A.thumbnail').each(function() {
    logger.warn($(this));
  });

  logger.info('++++++');
  logger.debug(tdihbody.length);
  tdihbody = cleaner(tdihbody);
  logger.debug(title);

  return buildBookmark($, tdihbody, url, title, obj);
}

/**
 * Default processor: uses the first selector in `generics` that matches,
 * otherwise the BODY element, otherwise the document root.
 *
 * @param {string} body - Page HTML.
 * @param {string} url - Page address.
 * @returns {object} Bookmark object.
 */
function genericProcessor(body, url) {
  logger.info('USING DEFAULT PROCESSOR');
  const obj = {};
  const $ = cheerio.load(body);
  const title = $('TITLE').text();
  const index = generics.findIndex(function(selector) {
    return $(selector).length > 0;
  });
  let tdihbody;

  logger.debug(index);

  if (index !== -1) {
    logger.warn('Used a generic');
    tdihbody = $(generics[index]);
  } else {
    logger.warn('Using whole body');
    // Bah. nothing to reduce so just grab the body, tidy it and use that
    tdihbody = $('BODY');
    if (tdihbody.length === 0) {
      tdihbody = $(':root');
    }
  }

  logger.debug(tdihbody.length);
  tdihbody = cleaner(tdihbody);
  logger.debug(title);

  rewriteImages($, tdihbody, url, obj);
  return buildBookmark($, tdihbody, url, title, obj);
}

/**
 * Turns fetched HTML into a bookmark object and emits the event that stores
 * it: 'updateBookmarkData' when an id is supplied, else 'saveBookmarkData'.
 *
 * @param {string} body - Page HTML.
 * @param {string} url - Page address.
 * @param {?string} _id - Existing document id, or null for a new bookmark.
 * @param {?string} _rev - Existing document revision.
 * @returns {object} The bookmark object.
 */
function processBody(body, url, _id, _rev) {
  const urlObj = URL.parse(url);

  logger.debug('host:', urlObj.host);

  const handler = specialHandlers.find(function(candidate) {
    return candidate.url === urlObj.host;
  });
  const obj = handler ? handler.fn(body, url) : genericProcessor(body, url);

  obj.host = urlObj.host;

  if (_id !== null && typeof _id !== 'undefined') {
    busEmitter.emit('updateBookmarkData', obj, _id, _rev);
  } else {
    busEmitter.emit('saveBookmarkData', obj);
  }

  return obj;
}

/**
 * Fetches a page, processes it and (when `res` is given) renders 'grabbed'.
 *
 * @param {(string|object)} obj - A URL, or an object with `url` and optional
 *   `_id` / `_rev` of an existing bookmark to overwrite.
 * @param {object} [res] - Express response; when omitted, failures are only
 *   logged.
 */
function genericGrab(obj, res) {
  let url;
  let _id = null;
  let _ver = null;

  if (typeof obj === 'string') {
    logger.info(obj);
    url = obj;
  } else if (obj !== null && typeof obj === 'object') {
    url = obj.url;
    _id = obj._id || null;
    _ver = obj._rev || null;
  }

  logger.warn(typeof obj);
  logger.info(url);
  logger.info(_id);
  logger.info(_ver);

  if (typeof url !== 'string' || url.length === 0) {
    logger.error('No url to grab');
    if (res != null) {
      res.status(422).end();
    }
    return;
  }

  const options = {
    url: url,
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36'
    },
    jar: true,
    followRedirect: true,
    followAllRedirects: true,
    // Let request decode compressed responses so `body` is always plain text.
    // to test http://chaosinthekitchen.com/2009/07/lime-and-coconut-chicken/
    gzip: true
  };

  request(options, function(err, resp, body) {
    // Throwing from this callback would take the whole process down, so
    // failures are logged and reported to the client instead.
    if (err) {
      logger.error(err);
      if (res != null) {
        res.status(500).end();
      }
      return;
    }

    let b;
    try {
      b = processBody(body, url, _id, _ver);
    } catch (e) {
      logger.error(e);
      if (res != null) {
        res.status(500).end();
      }
      return;
    }

    if (res != null) {
      res.render('grabbed', {data: b});
    }
  });
}

router.get('/pocket', function(req, res) {
  logger.debug('list..');

  dbCloudant.view('pocketList', 'pocketList', function(err, body) {
    if (err) {
      logger.error(err);
      sendJson(res, 500, {});
      return;
    }

    const outJSON = body.rows.map(function(doc) {
      const entry = doc.value;

      // Thumbnails go through the image proxy; entries without one get a
      // placeholder graphic.
      if (typeof entry.tn === 'string') {
        entry.tn = IMAGE_PROXY_THUMB + entry.tn;
      } else {
        entry.tn = 'gfx/fm.png';
      }

      return {id: doc.id, entry: entry};
    });

    logger.debug(util.inspect(body));
    logger.info(util.inspect(outJSON));
    res.render('pocket', {data: outJSON});
  });
});

router.get('/list', function(req, res) {
  logger.debug('list..');

  dbCloudant.view('titles', 'titles', function(err, body) {
    if (err) {
      logger.error(err);
      sendJson(res, 500, {});
      return;
    }

    const outJSON = body.rows.map(function(doc) {
      return {id: doc.id, title: doc.value};
    });

    sendJson(res, 200, {list: outJSON});
  });
});

router.get('/entry/:id', function(req, res) {
  logger.debug('entry..');
  logger.debug(req.params.id);

  dbCloudant.get(req.params.id, function(err, body) {
    if (err) {
      logger.error(err);
      sendJson(res, 500, {});
      return;
    }

    sendJson(res, 200, {
      _id: body._id,
      _rev: body._rev,
      title: body.title,
      reduced: body.reduced,
      url: body.url,
      tags: body.tags || {solid: '', list: []}
    });
  });
});

router.route('/tags')
  .get(function(req, res, next) {
    logger.debug('tag list..');
    logger.debug(req.params.id);

    dbCloudant.view('taglist', 'taglist', function(err, body) {
      if (err) {
        logger.error(err);
        sendJson(res, 500, {});
        return;
      }

      logger.debug(body);
      let outJSON = [];
      body.rows.forEach(function(doc) {
        logger.info(doc.value.taglist);
        // NOTE(review): this path declares no :id parameter, so the loose
        // comparison only passes when doc.value[0] is null/undefined too.
        // Kept as-is to preserve behaviour; confirm the intent.
        if (doc.value[0] == req.params.id) {
          outJSON = doc.value.taglist.sort();
        }
      });

      sendJson(res, 200, {list: outJSON});
    });
  })
  .post(function(req, res, next) {
    const t = req.body;

    logger.debug(t);
    logger.info('regetting:' + req.body._id);

    dbCloudant.get(req.body._id, function(err, body) {
      if (err) {
        logger.error(err);
        sendJson(res, 500, {});
        return;
      }

      const obj = {
        url: body.url,
        html: body.html,
        reduced: body.reduced,
        title: body.title,
        tags: req.body.tags
      };

      logger.info('Updating...');
      busEmitter.emit('updateBookmarkData', obj, body._id, body._rev);

      sendJson(res, 200, {
        _id: body._id,
        _rev: body._rev,
        title: body.title,
        reduced: body.reduced,
        url: body.url,
        tags: req.body.tags
      });
    });
  });

router.get('/tags/:id', function(req, res) {
  logger.debug('entry..');
  logger.debug(req.params.id);

  dbCloudant.view('getTagByKey', 'getTagByKey', function(err, body) {
    if (err) {
      logger.error(err);
      sendJson(res, 500, {});
      return;
    }

    const outJSON = [];
    body.rows.forEach(function(doc) {
      // Loose comparison kept: the route parameter is always a string while
      // the stored key may not be.
      if (doc.value[0] == req.params.id) {
        outJSON.push({id: doc.id, title: doc.value[1]});
      }
    });

    sendJson(res, 200, {list: outJSON});
  });
});

router.post('/add', function(req, res) {
  logger.debug('add entry..');

  const t = req.body;
  let url;

  if (hasOwn(t, 'url')) {
    // NOTE(review): the url is JSON-decoded for the response only, while the
    // undecoded body is what gets grabbed — confirm against the client.
    url = JSON.parse(t.url.toString());
    logger.debug(url);
    busEmitter.emit('getBookmark', t);
  } else {
    logger.error('No data block!');
  }

  sendJson(res, 200, {adding: url});
});

router.post('/redo', function(req, res) {
  logger.debug('redoing entry..');

  const t = req.body;
  let url;

  logger.debug(t);
  if (hasOwn(t, 'url')) {
    url = t.url.toString();
    logger.debug(url);
    busEmitter.emit('getBookmark', t);
  } else {
    logger.error('No data block!');
  }

  sendJson(res, 200, {adding: url});
});

router.route('/new')
  .get(function(req, res, next) {
    logger.debug('Save new');
    busEmitter.emit('getBookmarkRes', req.query.url, res);
  })
  .post(function(req, res, next) {
    logger.debug('Posted Save new');
    logger.info(req.body);
    if (req.body != null && Object.keys(req.body).length !== 0) {
      busEmitter.emit('getBookmarkRes', req.body.url, res);
    } else {
      res.status(422).end();
    }
  });

// Refresh the master tag list once at load time.
busEmitter.emit('updateTagsDB');

module.exports = router;