recipes/server/grabbers.js
2016-07-12 16:26:36 +01:00

273 lines
6.4 KiB
JavaScript

/**
*
* User: Martin Donnelly
* Date: 2016-07-05
* Time: 15:01
*
*/
'use strict';
var cheerio = require('cheerio');
var log4js = require('log4js');
var logger = log4js.getLogger();
var STRING = require('string');
var URL = require('url');
var converter = require('html-to-markdown');
/**
 * Removes known non-content elements from a scraped fragment.
 *
 * For every selector in the list below, `b.find(selector).remove()` is
 * called, in list order. The fragment is modified in place.
 *
 * @param {Object} b - Object exposing `find(selector)` whose result exposes
 *   `remove()`; the callers in this file pass a cheerio selection.
 * @returns {Object} The same `b` that was passed in.
 */
function cleaner(b) {
  // Selectors for page furniture (navigation, adverts, sharing widgets,
  // comment sections, ...) gathered from the sites this module scrapes.
  const selectors = [
    'LINK',
    'META',
    'TITLE',
    'div#disqus_thread',
    'SCRIPT',
    'FOOTER',
    'div.ssba',
    '.shareaholic-canvas',
    '.yarpp-related',
    'div.dfad',
    'div.postFooterShare',
    'div#nextPrevLinks',
    '.post-comments',
    'HEADER',
    '.post-title',
    '#side-menu',
    '.footer-container',
    '#pre-footer',
    '#cakephp-global-navigation',
    '.masthead',
    '.breadcrumb-header',
    '.single-recipe-sidebar',
    '#recipe-related-videos',
    '#tnav',
    '.footer',
    '#tb-wrapper',
    '#comments',
    '#menu',
    'aside',
    '#ad-mpu-premium-1-mobile',
    '#recipetools',
    '.adsense-ads-separator',
    '.comments',
    '.related-content',
    '.tip-wrapper',
    '#recipe-related-video-mobile',
    '.float-wrapper',
    '.source-jamie',
    '.ad.mobile',
    '.foodity-wrapper',
    '#ad-most-watched-mobile',
    '#sticky',
    '.nutrition-expand',
    '.grid-list-wrapper',
    '#recipe-finder__box',
    '.browser-upgrade-alert-message',
    '.main-menu',
    '.recipe-media',
    '.method-mobile-prompt-wrapper',
    '.sharebox-wrapper',
    '.jumbotron',
    '.slideshow',
    '.top-cat-recipe',
    '.analytic-box',
    '.recipe-reviews',
    '.recipe-tools',
    '.promo-module',
    '.widgettitle',
    '.post_related'
  ];

  selectors.forEach(function(selector) {
    b.find(selector).remove();
  });

  return b;
}
/** Prefix of the image proxy that every rewritten IMG src is routed through. */
const IMAGE_PROXY_PREFIX = 'http://image.silvrtree.co.uk/900,fit/';

/**
 * Points every IMG inside `content` at the image proxy and reports the first
 * usable image address.
 *
 * A src that does not start with `http` is resolved against the page address
 * first, so root-relative (`/a.png`), document-relative (`a.png`) and
 * protocol-relative (`//cdn/a.png`) sources all become absolute URLs.
 * Failures are logged and swallowed: image rewriting is best effort and must
 * not prevent the page from being grabbed.
 *
 * @param {Function} $ - cheerio instance the selection belongs to.
 * @param {Object} content - cheerio selection whose IMG descendants are rewritten in place.
 * @param {string} url - Address of the scraped page; base for resolving srcs.
 * @returns {string|undefined} Absolute src (without the proxy prefix) of the
 *   first image that has one, or undefined when there is none.
 */
function proxyImages($, content, url) {
  let thumbnail;
  try {
    content.find('IMG').each(function() {
      const img = $(this);
      let src = img.attr('src');
      // attr() gives undefined (not null) when the attribute is absent, so
      // test the type; a missing or blank src is skipped instead of aborting
      // the rewrite of every remaining image.
      if (typeof src !== 'string' || src.trim() === '') {
        return;
      }
      if (!STRING(src).startsWith('http')) {
        logger.debug('Stripping:' + src);
        src = URL.resolve(url, src.trim());
      }
      if (typeof thumbnail === 'undefined') {
        thumbnail = src;
      }
      img.attr('src', IMAGE_PROXY_PREFIX + src);
    });
  }
  catch (e) {
    logger.error(e);
  }
  return thumbnail;
}

/**
 * Assembles the object every grabber returns.
 *
 * @param {Function} $ - cheerio instance holding the whole parsed document.
 * @param {Object} content - Cleaned cheerio selection with the page's main content.
 * @param {string} title - Raw text of the document's TITLE element.
 * @param {string} url - Address of the scraped page.
 * @param {string} [thumbnail] - Address of the lead image; the `thumbnail`
 *   key is only present on the result when this is provided.
 * @returns {Object} Object with `url`, `html` (whole document as serialised by
 *   `$.html()`), `reduced` (trimmed markup of `content`), `nib` (leading 300
 *   characters of the whitespace-collapsed text), `title`, `markdown`
 *   (`reduced` run through the markdown converter) and optionally `thumbnail`.
 */
function buildResult($, content, title, url, thumbnail) {
  const obj = {};
  if (typeof thumbnail !== 'undefined') {
    obj.thumbnail = thumbnail;
  }
  obj.url = STRING(url).trim().s;
  obj.html = $.html();
  obj.reduced = STRING(content.html()).trim().s;
  obj.nib = STRING(content.text()).collapseWhitespace().trim().left(300).s;
  obj.title = STRING(title).collapseWhitespace().s;
  obj.markdown = converter.convert(obj.reduced);
  return obj;
}

/**
 * Site-specific grabbers. Each takes the raw HTML of a page plus the address
 * it was fetched from, selects that site's main content container, strips
 * unwanted elements with `cleaner` and returns the object described on
 * `buildResult`.
 */
exports = module.exports = {
  /**
   * Grabs the `DIV#main-content` container and proxies its images.
   *
   * @param {string} body - HTML source of the page.
   * @param {string} url - Address the page was fetched from.
   * @returns {Object} Grab result; see `buildResult`.
   */
  doBBCGoodFood: function(body, url) {
    logger.info('GRABBING BBCGoodFood');
    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    let content = $('DIV#main-content');
    logger.debug('Length:' , content.length);
    content = cleaner(content);
    logger.debug('Title: ', title);
    const thumbnail = proxyImages($, content, url);
    return buildResult($, content, title, url, thumbnail);
  },

  /**
   * Grabs the `DIV#page_body` container and proxies its images.
   *
   * @param {string} body - HTML source of the page.
   * @param {string} url - Address the page was fetched from.
   * @returns {Object} Grab result; see `buildResult`.
   */
  doEngadget: function(body, url) {
    logger.info('GRABBING Engadget');
    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    let content = $('DIV#page_body');
    logger.debug('Length:' , content.length);
    content = cleaner(content);
    logger.debug('Title: ', title);
    const thumbnail = proxyImages($, content, url);
    return buildResult($, content, title, url, thumbnail);
  },

  /**
   * Grabs the `DIV.jd-descr` container and proxies its images.
   *
   * @param {string} body - HTML source of the page.
   * @param {string} url - Address the page was fetched from.
   * @returns {Object} Grab result; see `buildResult`.
   */
  doAndroidDeveloper: function(body, url) {
    logger.info('GRABBING AndroidDeveloper');
    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    let content = $('DIV.jd-descr');
    logger.debug(content.length);
    content = cleaner(content);
    logger.debug(title);
    const thumbnail = proxyImages($, content, url);
    return buildResult($, content, title, url, thumbnail);
  },

  /**
   * Grabs the `DIV.entry` container. Images are left untouched and no
   * thumbnail is reported; `A.thumbnail` links are only written to the log.
   *
   * @param {string} body - HTML source of the page.
   * @param {string} url - Address the page was fetched from.
   * @returns {Object} Grab result without a `thumbnail` key; see `buildResult`.
   */
  doReddit: function(body, url) {
    logger.info('GRABBING REDDIT');
    const $ = cheerio.load(body);
    const title = $('TITLE').text();
    let content = $('DIV.entry');
    content.find('A.thumbnail').each(function() {
      logger.warn($(this));
    });
    logger.info('++++++');
    logger.debug(content.length);
    content = cleaner(content);
    logger.debug(title);
    return buildResult($, content, title, url);
  }
};