old-silvrgit/savetodisk.js
2016-02-29 15:05:57 +00:00

74 lines
2.1 KiB
JavaScript

// Example use of simplecrawler, courtesy of @breck7! Thanks mate. :)
/**
 * Converts a URL pathname into the path segments of the file to write.
 *
 * A pathname that is empty or ends in "/" (or "/.") names a directory, so
 * "index.html" is appended. Empty and "." segments are dropped.
 *
 * @param {string} pathname - Pathname component of a fetched URL.
 * @returns {?Array<string>} Segments relative to the output directory, or
 *     null when the pathname contains a ".." segment, which could point
 *     outside the output directory.
 */
function snapshotPathSegments(pathname) {
  var rawSegments = String(pathname || "/").split("/");
  var lastSegment = rawSegments[rawSegments.length - 1];
  var isDirectory = lastSegment === "" || lastSegment === ".";
  var segments = [];
  for (var i = 0; i < rawSegments.length; i++) {
    var segment = rawSegments[i];
    if (segment === "..") {
      // Crawled URLs are untrusted; never let one climb out of the snapshot.
      return null;
    }
    if (segment !== "" && segment !== ".") {
      segments.push(segment);
    }
  }
  if (isDirectory || segments.length === 0) {
    segments.push("index.html");
  }
  return segments;
}

/**
 * Crawls a site with simplecrawler and saves every fetched resource under
 * <directory of this script>/snapshots/<domain>/, mirroring the URL paths.
 *
 * Failures to create a directory or write a file are reported on stderr and
 * do not stop the crawl.
 *
 * NOTE(review): file writes are asynchronous and are not awaited before
 * `callback` fires, so the last writes may still be in flight at that point.
 *
 * @param {string} domain - Domain to download; also used as the folder name.
 * @param {Function} callback - Called with no arguments when the crawler
 *     emits "complete".
 */
var downloadSite = function(domain, callback) {
  var fs = require("node-fs"),
    url = require("url"),
    path = require("path"),
    Crawler = require("simplecrawler").Crawler;

  // Spelled with parseInt because the legacy literal 0755 is a syntax error
  // in strict mode and ES modules.
  var directoryMode = parseInt("755", 8);

  // Single root used for BOTH the directory creation and the file write.
  var outputDirectory = path.join(__dirname, "snapshots", domain);

  var myCrawler = new Crawler(domain);
  myCrawler.interval = 250;
  myCrawler.maxConcurrency = 5;
  myCrawler.maxDepth = 1;

  myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log("I just received %s (%d bytes)", queueItem.url, responseBuffer.length);
    console.log("It was a resource of type %s", response.headers["content-type"]);

    var segments = snapshotPathSegments(url.parse(queueItem.url).pathname);
    if (segments === null) {
      console.error("Skipping %s: path would leave the output directory", queueItem.url);
      return;
    }

    // Path to save file, and the directory that must exist first
    var filepath = path.join.apply(path, [outputDirectory].concat(segments));
    var dirname = path.dirname(filepath);

    // Recursively create the directory using node-fs, then write the file.
    // EEXIST is tolerated: the directory may already be there, e.g. created
    // by a concurrent fetch.
    fs.mkdir(dirname, directoryMode, true, function(mkdirError) {
      if (mkdirError && mkdirError.code !== "EEXIST") {
        console.error("Could not create directory %s: %s", dirname, mkdirError.message);
        return;
      }
      fs.writeFile(filepath, responseBuffer, function(writeError) {
        if (writeError) {
          console.error("Could not write %s: %s", filepath, writeError.message);
        }
      });
    });
  });

  // Fire callback
  myCrawler.on("complete", function() {
    if (typeof callback === "function") {
      callback();
    }
  });

  // Start Crawl
  myCrawler.start();
};
// Command-line entry point: the first argument after the script name is the
// domain to download. Without it, print the usage line and exit with status 1.
if (process.argv.length >= 3) {
  downloadSite(process.argv[2], function() {
    // Invoked by downloadSite once the crawl has completed.
    console.log("Done!");
  });
} else {
  console.log("Usage: node savetodisk.js mysite.com");
  process.exit(1);
}