74 lines
2.1 KiB
JavaScript
74 lines
2.1 KiB
JavaScript
// Example use of simplecrawler, courtesy of @breck7! Thanks mate. :)
|
|
|
|
/**
|
|
* @param String. Domain to download.
|
|
* @Param Function. Callback when crawl is complete.
|
|
*/
|
|
var downloadSite = function(domain, callback) {
|
|
var fs = require("node-fs"),
|
|
url = require("url"),
|
|
path = require("path"),
|
|
Crawler = require("simplecrawler").Crawler;
|
|
|
|
var myCrawler = new Crawler(domain);
|
|
myCrawler.interval = 250;
|
|
myCrawler.maxConcurrency = 5;
|
|
myCrawler.maxDepth = 1;
|
|
|
|
myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
|
|
|
|
// Parse url
|
|
var parsed = url.parse(queueItem.url);
|
|
|
|
// Rename / to index.html
|
|
if (parsed.pathname === "/") {
|
|
parsed.pathname = "/index.html";
|
|
}
|
|
|
|
// Where to save downloaded data
|
|
var outputDirectory = path.join(__dirname, domain);
|
|
|
|
// Get directory name in order to create any nested dirs
|
|
var dirname = outputDirectory + parsed.pathname.replace(/\/[^\/]+$/, "");
|
|
|
|
// Path to save file
|
|
var filepath = 'snapshots/' + outputDirectory + parsed.pathname;
|
|
|
|
// Check if DIR exists
|
|
fs.exists(dirname, function(exists) {
|
|
|
|
// If DIR exists, write file
|
|
if (exists) {
|
|
fs.writeFile(filepath, responseBuffer, function() {});
|
|
} else {
|
|
// Else, recursively create dir using node-fs, then write file
|
|
fs.mkdir(dirname, 0755, true, function() {
|
|
fs.writeFile(filepath, responseBuffer, function() {});
|
|
});
|
|
}
|
|
|
|
});
|
|
|
|
console.log("I just received %s (%d bytes)", queueItem.url, responseBuffer.length);
|
|
console.log("It was a resource of type %s", response.headers["content-type"]);
|
|
|
|
});
|
|
|
|
// Fire callback
|
|
myCrawler.on("complete", function() {
|
|
callback();
|
|
});
|
|
|
|
// Start Crawl
|
|
myCrawler.start();
|
|
|
|
};
|
|
|
|
if (process.argv.length < 3) {
|
|
console.log("Usage: node savetodisk.js mysite.com");
|
|
process.exit(1);
|
|
}
|
|
|
|
downloadSite(process.argv[2], function() {
|
|
console.log("Done!");
|
|
}); |