// Mirror of https://gitlab.silvrtree.co.uk/martind2000/recipes.git
// Synced 2025-01-26 23:26:16 +00:00
// 61 lines, 1.8 KiB, JavaScript
/**
 * Created by Martin on 22/02/2016.
 */
|
|
|
|
var http = require('http'), request = require('request'), cheerio = require('cheerio'), util = require('util');
|
|
var jsonfile = require('jsonfile'), fs = require('fs'), STRING = require('string');
|
|
var log4js = require('log4js');
|
|
var logger = log4js.getLogger();
|
|
|
|
// Output files, both written next to this script:
//   bodyfile - receives the whole parsed document ($.html()).
//   htmlfile - receives the HTML of the matched container after cleaning.
var bodyfile = [__dirname, 'body.html'].join('/');
var htmlfile = [__dirname, 'testoutput.html'].join('/');

// Candidate container selectors, tried in this order; the first one that
// matches anything in the page is used.
var generics = [
    'ARTICLE',
    'div.content_column',
    'div.post'
];
|
|
|
|
/**
 * Remove descendants matching a fixed list of unwanted selectors.
 *
 * For every selector in the list, in order, this calls
 * `b.find(selector).remove()`. The selection is modified in place; no copy
 * is made.
 *
 * @param {Object} b - Object exposing `find(selector)` whose result exposes
 *     `remove()`. In this file it is called with a cheerio selection.
 * @returns {Object} The same object that was passed in as `b`.
 */
function cleaner(b) {
    var unwanted = [
        'div#disqus_thread',
        'SCRIPT',
        'FOOTER',
        'div.ssba',
        '.shareaholic-canvas',
        '.yarpp-related',
        'div.dfad',
        'div.postFooterShare',
        'div#nextPrevLinks',
        '.post-comments'
    ];

    // One find/remove pass per selector, preserving the listed order.
    unwanted.forEach(function (selector) {
        b.find(selector).remove();
    });

    return b;
}
|
|
|
|
module.exports = {
|
|
|
|
generic: function (url) {
|
|
logger.info(url);
|
|
request(url, function (err, resp, body) {
|
|
if (err)
|
|
throw err;
|
|
|
|
$ = cheerio.load(body);
|
|
var title = $('TITLE').text();
|
|
|
|
// try to find a body to grab
|
|
|
|
var i = 0;
|
|
|
|
while (($(generics[i]).length == 0) && (i < generics.length)) {
|
|
|
|
i++;
|
|
}
|
|
logger.debug(i);
|
|
|
|
if (i < generics.length) {
|
|
var tdihbody = $(generics[i]);
|
|
logger.debug(tdihbody.length);
|
|
|
|
tdihbody = cleaner(tdihbody);
|
|
logger.debug(title);
|
|
|
|
fs.writeFileSync(htmlfile, tdihbody.html());
|
|
}
|
|
fs.writeFileSync(bodyfile, $.html());
|
|
});
|
|
}
|
|
};
|
|
|
|
// Manual run: this call executes as soon as the module is loaded.
// An earlier sample call is kept below for reference; no grabMarksDailyApple
// export is visible in this file.
//module.exports.grabMarksDailyApple('http://www.marksdailyapple.com/spiced-pork-and-butternut-squash-with-sage');
module.exports.generic(
    'http://www.health-bent.com/soups/paleo-mediterranean-beef-stew'
);