/**
 *
 * User: Martin Donnelly
 * Date: 2016-07-05
 * Time: 15:01
 *
 */
'use strict';

const cheerio = require('cheerio');
const log4js = require('log4js');
const logger = log4js.getLogger();
const STRING = require('string');
const URL = require('url');
const converter = require('html-to-markdown');

/** Prefix put in front of every image URL so images are served via the resizing proxy. */
const IMAGE_PROXY_PREFIX = 'http://image.silvrtree.co.uk/900,fit/';

/** Maximum number of characters kept in the plain-text `nib` excerpt. */
const NIB_LENGTH = 300;

/** Selectors whose matches are removed from a content container by `cleaner`. */
const UNWANTED_SELECTORS = [
  'LINK',
  'META',
  'TITLE',
  'div#disqus_thread',
  'SCRIPT',
  'FOOTER',
  'div.ssba',
  '.shareaholic-canvas',
  '.yarpp-related',
  'div.dfad',
  'div.postFooterShare',
  'div#nextPrevLinks',
  '.post-comments',
  'HEADER',
  '.post-title',
  '#side-menu',
  '.footer-container',
  '#pre-footer',
  '#cakephp-global-navigation',
  '.masthead',
  '.breadcrumb-header',
  '.single-recipe-sidebar',
  '#recipe-related-videos',
  '#tnav',
  '.footer',
  '#tb-wrapper',
  '#comments',
  '#menu',
  'aside',
  '#ad-mpu-premium-1-mobile',
  '#recipetools',
  '.adsense-ads-separator',
  '.comments',
  '.related-content',
  '.tip-wrapper',
  '#recipe-related-video-mobile',
  '.float-wrapper',
  '.source-jamie',
  '.ad.mobile',
  '.foodity-wrapper',
  '#ad-most-watched-mobile',
  '#sticky',
  '.nutrition-expand',
  '.grid-list-wrapper',
  '#recipe-finder__box',
  '.browser-upgrade-alert-message',
  '.main-menu',
  '.recipe-media',
  '.method-mobile-prompt-wrapper',
  '.sharebox-wrapper',
  '.jumbotron',
  '.slideshow',
  '.top-cat-recipe',
  '.analytic-box',
  '.recipe-reviews',
  '.recipe-tools',
  '.promo-module',
  '.widgettitle',
  '.post_related'
];

/**
 * Removes every descendant of `b` that matches one of UNWANTED_SELECTORS.
 * The selection is modified in place.
 *
 * @param {Object} b - cheerio selection to clean.
 * @returns {Object} The same selection that was passed in.
 */
function cleaner(b) {
  // One find() per selector, exactly as before, so each selector is
  // evaluated independently of the others.
  UNWANTED_SELECTORS.forEach(function(selector) {
    b.find(selector).remove();
  });
  return b;
}

/**
 * Rewrites the `src` of every IMG inside `content` so it points at the image
 * proxy, and reports the first usable (un-proxied, absolute) image URL.
 *
 * Any error raised while rewriting is logged and swallowed, so a page with
 * odd image markup still yields a result.
 *
 * @param {Function} $ - cheerio instance the content belongs to.
 * @param {Object} content - cheerio selection holding the cleaned content.
 * @param {string} pageUrl - URL of the page; relative image URLs are resolved against it.
 * @returns {string|undefined} First absolute image URL found, or undefined if none.
 */
function proxyImages($, content, pageUrl) {
  let thumbnail;

  try {
    content.find('IMG').each((index, elem) => {
      const img = $(elem);
      const rawSrc = img.attr('src');

      // attr() yields undefined (not null) when the attribute is absent, so
      // test for an actual non-blank string. Previously a single IMG without
      // src threw inside the loop and left all later images unprocessed.
      if (typeof rawSrc !== 'string') {
        return;
      }
      let src = rawSrc.trim();
      if (src === '') {
        return;
      }

      if (!src.startsWith('http')) {
        logger.debug(`Stripping:${src}`);
        // Resolve against the page URL so root-relative, path-relative and
        // protocol-relative ("//host/img.png") sources all become absolute.
        src = URL.resolve(pageUrl, src);
      }

      if (typeof thumbnail === 'undefined') {
        thumbnail = src;
      }

      img.attr('src', IMAGE_PROXY_PREFIX + src);
    });
  } catch (e) {
    logger.error(e);
  }

  return thumbnail;
}

/**
 * Assembles the object every grabber returns.
 *
 * NOTE(review): when `content` matched no element, `content.html()` presumably
 * returns null and the STRING(...).trim() call below would throw, as it did
 * before this refactor - confirm whether callers rely on that.
 *
 * @param {Function} $ - cheerio instance for the whole document.
 * @param {Object} content - cheerio selection holding the cleaned content.
 * @param {string} title - Text of the document's TITLE element.
 * @param {string} url - URL the document was fetched from.
 * @param {string} [thumbnail] - First image URL; the key is omitted when undefined.
 * @returns {{thumbnail: (string|undefined), url: string, html: string,
 *   reduced: string, nib: string, title: string, markdown: string}}
 */
function buildResult($, content, title, url, thumbnail) {
  const obj = {};

  // Keep `thumbnail` as the first key, and only when an image was found,
  // so the shape of the result matches what callers received before.
  if (typeof thumbnail !== 'undefined') {
    obj.thumbnail = thumbnail;
  }

  obj.url = STRING(url).trim().s;
  obj.html = $.html();
  obj.reduced = STRING(content.html()).trim().s;
  obj.nib = STRING(content.text()).collapseWhitespace().trim().left(NIB_LENGTH).s;
  obj.title = STRING(title).collapseWhitespace().s;
  obj.markdown = converter.convert(obj.reduced);

  return obj;
}

exports = module.exports = {
  /**
   * Extracts the `DIV#main-content` container from a BBC Good Food page.
   *
   * @param {string} body - HTML of the page.
   * @param {string} url - URL the page was fetched from.
   * @returns {Object} Result built by `buildResult`.
   */
  doBBCGoodFood(body, url) {
    logger.info('GRABBING BBCGoodFood');

    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    const content = $('DIV#main-content');

    logger.debug('Length:', content.length);
    cleaner(content);
    logger.debug('Title: ', title);

    const thumbnail = proxyImages($, content, url);
    return buildResult($, content, title, url, thumbnail);
  },

  /**
   * Extracts the `DIV#page_body` container from an Engadget page.
   *
   * @param {string} body - HTML of the page.
   * @param {string} url - URL the page was fetched from.
   * @returns {Object} Result built by `buildResult`.
   */
  doEngadget(body, url) {
    logger.info('GRABBING Engadget');

    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    const content = $('DIV#page_body');

    logger.debug('Length:', content.length);
    cleaner(content);
    logger.debug('Title: ', title);

    const thumbnail = proxyImages($, content, url);
    return buildResult($, content, title, url, thumbnail);
  },

  /**
   * Extracts the `DIV.jd-descr` container from an Android Developer page.
   *
   * @param {string} body - HTML of the page.
   * @param {string} url - URL the page was fetched from.
   * @returns {Object} Result built by `buildResult`.
   */
  doAndroidDeveloper(body, url) {
    logger.info('GRABBING AndroidDeveloper');

    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    const content = $('DIV.jd-descr');

    logger.debug(content.length);
    cleaner(content);
    logger.debug(title);

    const thumbnail = proxyImages($, content, url);
    return buildResult($, content, title, url, thumbnail);
  },

  /**
   * Extracts the `DIV.entry` elements from a Reddit page. Images are not
   * rewritten here, so the result never carries a `thumbnail`.
   *
   * @param {string} body - HTML of the page.
   * @param {string} url - URL the page was fetched from.
   * @returns {Object} Result built by `buildResult`.
   */
  doReddit(body, url) {
    logger.info('GRABBING REDDIT');

    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    const content = $('DIV.entry');

    // Diagnostic output only: each thumbnail anchor is logged, not modified.
    content.find('A.thumbnail').each((index, elem) => {
      logger.warn($(elem));
    });
    logger.info('++++++');

    logger.debug(content.length);
    cleaner(content);
    logger.debug(title);

    return buildResult($, content, title, url);
  }
};