"use strict";
/**
 * Created by Martin on 22/02/2016.
 */
var express = require('express');
var http = require('http'),
  request = require('request'),
  cheerio = require('cheerio'),
  util = require('util');
var jsonfile = require('jsonfile'),
  fs = require('fs'),
  STRING = require('string');
var converter = require('html-to-markdown');
var zlib = require("zlib");
var log4js = require('log4js');
var logger = log4js.getLogger();
var URL = require('url');
var router = express.Router();
var EventEmitter = require('events');
var nano = require('nano')('http://localhost:5984');

// In-process event bus; the handlers are registered further down.
var busEmitter = new EventEmitter();

var db_name = 'keeper';
var dbCouch = nano.use(db_name);

var jsonFile = __dirname + '/' + 'output.json';
var bodyfile = __dirname + '/' + 'body.html';
var htmlfile = __dirname + '/' + 'testoutput.html';

// Selectors that genericProcessor tries, in order, to locate the main content.
var generics = [
  'ARTICLE',
  'div.content_column',
  'div.post',
  'div.page',
  '#recipe-single',
  'div.content.body'
];

// Host-specific processors. NOTE(review): presumably matched against the
// host of the bookmarked URL by processBody - confirm against its full body.
var specialHandlers = [
  {
    url: 'www.reddit.com',
    fn: function (body, url) {
      return doReddit(body, url);
    }
  },
  {
    url: 'developer.android.com',
    fn: function (body, url) {
      return doAndroidDeveloper(body, url);
    }
  }
];

/**
 * Remove unwanted page furniture from a cheerio selection.
 *
 * Every descendant matching one of the `unwanted` selectors is removed.
 * The selection is modified in place and also returned.
 *
 * @param b cheerio selection to clean
 * @returns the same selection, cleaned
 */
function cleaner(b) {
  var unwanted = [
    'LINK',
    'META',
    'TITLE',
    'div#disqus_thread',
    'SCRIPT',
    'FOOTER',
    'div.ssba',
    '.shareaholic-canvas',
    '.yarpp-related',
    'div.dfad',
    'div.postFooterShare',
    'div#nextPrevLinks',
    '.post-comments',
    'HEADER',
    '.post-title',
    '#side-menu',
    '.footer-container',
    '#pre-footer',
    '#cakephp-global-navigation',
    '.masthead',
    '.breadcrumb-header',
    '.single-recipe-sidebar',
    '#recipe-related-videos',
    '#tnav',
    '.footer',
    '#tb-wrapper'
  ];

  unwanted.forEach(function (selector) {
    b.find(selector).remove();
  });

  return b;
}

/**
 * Insert a new document into the couch database.
 *
 * The insert is asynchronous, so completion is logged from the callback
 * rather than straight after the call was issued.
 *
 * @param obj document to store
 */
function insertBookmark(obj) {
  logger.debug('Inserting into couch...');
  dbCouch.insert(obj, function (err) {
    if (err) {
      logger.error('Error inserting into couch', err);
      return;
    }
    logger.debug('Insert done..');
  });
}

/**
 * Store a new revision of an existing document.
 *
 * `obj` is modified in place: it gains the `_id` and `_rev` properties
 * before being sent to the database. On success an "updateTagsDB" event is
 * emitted on the bus.
 *
 * @param obj document body
 * @param _id id of the document being updated
 * @param _rev revision the update is based on
 */
function updateBookmark(obj, _id, _rev) {
  logger.debug('Updating couch...');
  obj._id = _id;
  obj._rev = _rev;
  dbCouch.insert(obj, function (err) {
    if (err) {
      logger.error('Error updating into couch', err);
      return;
    }
    logger.info('I think we updated ok...');
    logger.debug('Update done..');
    busEmitter.emit("updateTagsDB");
  });
}

// Bus handler: store a freshly grabbed bookmark.
var doInsertBookmark = (obj) => {
  insertBookmark(obj);
};

// Bus handler: store a new revision of an existing bookmark.
var doUpdateBookmark = (obj, _id, _rev) => {
  updateBookmark(obj, _id, _rev);
};

// Bus handler: hands the payload to genericGrab (defined elsewhere in this file).
var doGetBookmark = (obj) => {
  genericGrab(obj);
};

// Bus handler: same delegation as doGetBookmark, registered under its own event.
var doGetBookmarkRedo = (obj) => {
  genericGrab(obj);
};

// Bus handler: like doGetBookmark but also forwards `res` to genericGrab.
var doGetBookmarkRes = (url, res) => {
  logger.debug('doGetBookmarkRes');
  genericGrab(url, res);
};

/**
 * Rebuild the master tag list.
 *
 * Reads the `getAllTags` view, flattens and de-duplicates the row values,
 * then writes that list into every document returned by the `taglist`
 * view (via doSaveTagsDB). Failures of either view are logged.
 */
var doUpdateTagsDB = () => {
  logger.debug('Update the tags database...');
  dbCouch.view('getAllTags', 'getAllTags', function (err, body) {
    if (err) {
      logger.error('Error reading the getAllTags view', err);
      return;
    }

    var allTags = [];
    body.rows.forEach(function (doc) {
      allTags = allTags.concat(doc.value);
    });
    // A Set keeps the first occurrence of each tag, in original order.
    var masterList = Array.from(new Set(allTags));

    dbCouch.view('taglist', 'taglist', function (err, body) {
      if (err) {
        logger.error('NO TAG LIST EXISTS', err);
        return;
      }
      body.rows.forEach(function (doc) {
        doSaveTagsDB(doc.value, masterList);
      });
    });
  });
};

/**
 * Write a new tag list into an existing tag-list document.
 *
 * `orig` is modified in place (its `taglist` property is replaced) and then
 * inserted back into the database.
 *
 * @param orig existing document, as returned by the `taglist` view
 * @param newList replacement value for `orig.taglist`
 */
var doSaveTagsDB = (orig, newList) => {
  logger.debug('doSaveTagsDB');
  orig.taglist = newList;
  dbCouch.insert(orig, function (err) {
    if (err) {
      logger.error('Error updating into couch', err);
      return;
    }
    logger.info('Updated the tags list...');
  });
};

// Events
busEmitter.on('saveBookmarkData', doInsertBookmark);
busEmitter.on('updateBookmarkData', doUpdateBookmark);
busEmitter.on('getBookmark', doGetBookmark);
busEmitter.on('getBookmarkRes', doGetBookmarkRes); busEmitter.on('getBookmarkRedo', doGetBookmarkRedo); busEmitter.on('updateTagsDB', doUpdateTagsDB); busEmitter.on('saveTagsDB', doSaveTagsDB); function doAndroidDeveloper(body, url) { logger.info('GRABBING AndroidDeveloper'); var obj = {}, tdihbody, i, urlObj, urlPrefix; var $ = cheerio.load(body); var title = $('TITLE').text(); tdihbody = $('DIV.jd-descr'); logger.debug(tdihbody.length); tdihbody = cleaner(tdihbody); logger.debug(title); urlObj = URL.parse(url); urlPrefix = urlObj.protocol + '//' + urlObj.host + '/'; try { tdihbody.find('IMG').each(function (i, elem) { let s, src = $(this).attr("src"); if (src !== null) { if (!STRING(src).startsWith('http')) { logger.debug('Stripping:' + src); src = urlPrefix + STRING(src).stripLeft('/').trim().s; } if (typeof obj.thumbnail === 'undefined') { obj.thumbnail = src; } s = 'http://image.silvrtree.co.uk/900,fit/' + src; $(this).attr("src", s); } }); } catch (e) { logger.error(e); } obj.url = STRING(url).trim().s; obj.html = $.html(); obj.reduced = STRING(tdihbody.html()).trim().s; obj.nib = STRING(tdihbody.text()).collapseWhitespace().trim().left(300).s; obj.title = STRING(title).collapseWhitespace().s; obj.markdown = converter.convert(obj.reduced); return obj; } function doReddit(body, url) { logger.info('GRABBING REDDIT'); var obj = {}, tdihbody, i, urlObj, urlPrefix; var $ = cheerio.load(body); var title = $('TITLE').text(); tdihbody = $('DIV.entry'); tdihbody.find('A.thumbnail').each(function (i, elem) { logger.warn($(this)); }); logger.info('++++++'); // logger.debug(tdihbody.html()); logger.debug(tdihbody.length); tdihbody = cleaner(tdihbody); logger.debug(title); obj.url = STRING(url).trim().s; obj.html = $.html(); obj.reduced = STRING(tdihbody.html()).trim().s; obj.nib = STRING(tdihbody.text()).collapseWhitespace().trim().left(300).s; obj.title = STRING(title).collapseWhitespace().s; obj.markdown = converter.convert(obj.reduced); return obj; } function 
genericProcessor(body, url) { logger.info('USING DEFAULT PROCESSOR'); var obj = {}, tdihbody, i, urlObj, urlPrefix; var $ = cheerio.load(body); var title = $('TITLE').text(); i = 0; while (($(generics[i]).length == 0) && (i < generics.length)) { i++; } logger.debug(i); if (i < generics.length) { logger.warn('Used a generic'); tdihbody = $(generics[i]); logger.debug(tdihbody.length); tdihbody = cleaner(tdihbody); logger.debug(title); } else { logger.warn('Using whole body'); // bah. nothing to reduce so just grab the body, tidy it and use that tdihbody = $('BODY'); if (tdihbody.length === 0) { tdihbody = $(":root"); } logger.debug(tdihbody.length); tdihbody = cleaner(tdihbody); logger.debug(title); } // logger.info(util.inspect(tdihbody)); urlObj = URL.parse(url); urlPrefix = urlObj.protocol + '//' + urlObj.host + '/'; try { tdihbody.find('IMG').each(function (i, elem) { let s, src = $(this).attr("src"); if (src !== null) { if (!STRING(src).startsWith('http')) { logger.debug('Stripping:' + src); src = urlPrefix + STRING(src).stripLeft('/').trim().s; } if (typeof obj.thumbnail === 'undefined') { obj.thumbnail = src; } s = 'http://image.silvrtree.co.uk/900,fit/' + src; $(this).attr("src", s); } }); } catch (e) { logger.error(e); } obj.url = STRING(url).trim().s; obj.html = $.html(); obj.reduced = STRING(tdihbody.html()).trim().s; obj.nib = STRING(tdihbody.text()).collapseWhitespace().trim().left(300).s; obj.title = STRING(title).collapseWhitespace().s; obj.markdown = converter.convert(obj.reduced); return obj; } function processBody(body, url, _id, _rev) { var obj = {}, i, urlObj, urlPrefix; // try to find a body to grab urlObj = URL.parse(url); logger.debug('host:', urlObj.host); var flag; for (i=0;i