joseraya
1/21/2014 - 8:15 PM

A node script that crawls a web site and stores snapshots (taken with zombie.js) to the file system. Based on code from this article: http://www.ng-newsletter.com/posts/serious-angular-seo.html
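
To try it locally (assuming the script is saved as crawler.js and the site is already being served, e.g. on localhost:4000 as in the last line below): install the dependency with npm install zombie, run node crawler.js, and the snapshots will appear in a _snapshots directory next to the script.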

var Browser = require('zombie'),
    url     = require('url'),
    fs      = require('fs'),
    path    = require('path'),
    saveDir = __dirname + '/_snapshots';


var scriptTagRegex = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
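// (Matches whole <script>…</script> elements, even when the script body itself contains '<' characters.)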

var stripScriptTags = function(html) {
  return html.replace(scriptTagRegex, '');
};
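
// For example (hypothetical input), stripScriptTags turns
//   '<div>Hi</div><script src="app.js"></script>'
// into
//   '<div>Hi</div>'
// so a served snapshot does not boot the app's JavaScript a second time.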

var mkdirParent = function(dirPath, mode, callback) {
  // Call the standard fs.mkdir
  fs.mkdir(dirPath, mode, function(error) {
    // If a parent directory is missing, create the parents recursively,
    // then retry this directory
    if (error && error.code === 'ENOENT') {
      mkdirParent(path.dirname(dirPath), mode, function() {
        mkdirParent(dirPath, mode, callback);
      });
      return;
    }
    // Otherwise report the result (error may be null, or e.g. EEXIST)
    callback && callback(error);
  });
};
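
// Note: on Node.js 10.12+ the manual recursion above can be replaced with the
// built-in recursive option, e.g.:
//
//   fs.mkdir(dirPath, { recursive: true }, callback);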


var saveSnapshot = function(uri, body) {

  var lastIdx = uri.lastIndexOf('#!/');
  var pagePath;

  if (lastIdx < 0) {
    // html5mode: use the pathname part of the URL
    pagePath = url.parse(uri).pathname;
  } else {
    // hashbang mode: use everything after the '#!'
    pagePath = uri.substring(lastIdx + 2);
  }

  if (pagePath === '/') pagePath = "/index.html";

  if (pagePath.indexOf('.html') === -1)
    pagePath += ".html";

  var filename = saveDir + pagePath;
  console.log("Saving ", uri, " to ", filename);
  var dirname = path.dirname(filename);
  // Make sure the target directory exists before writing; an EEXIST
  // error from mkdirParent just means it was created on an earlier pass
  mkdirParent(dirname, undefined, function() {
    fs.writeFile(filename, body, function(e) {
      if (e) console.error("Could not write ", filename, e);
    });
  });
};
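
// Examples of the resulting mapping (hypothetical URLs, assuming the saveDir above):
//   http://localhost:4000/#!/       -> _snapshots/index.html
//   http://localhost:4000/#!/about  -> _snapshots/about.html
//   http://localhost:4000/contact   -> _snapshots/contact.html   (html5mode)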

var browserOpts = {
  waitFor: "100ms",
  loadCSS: false,
  waitDuration: "100ms"
};

var browser = new Browser(browserOpts);

var crawlPage = function(idx, arr) {
  if (idx < arr.length) {
    var uri = arr[idx];
    console.time("voy");
    browser.visit(uri)
    .then(function() {
      console.timeEnd("voy");
      // Poll until the application marks the page as fully rendered
      // (it must set data-status="ready" on <body>, see the note below)
      var intervalId = setInterval(function() {
        console.log("checking status");
        var status = browser.body.getAttribute('data-status');
        console.log(status);
        if (status === "ready") {
          clearInterval(intervalId);
          // Turn links into absolute links and queue them
          // if we haven't already crawled them
          var links = browser.queryAll('a');
          links.forEach(function(link) {
            var href = link.getAttribute('href');
            if (!href) return; // skip anchors without an href
            var absUrl = url.resolve(uri, href);
            link.setAttribute('href', absUrl);
            if (arr.indexOf(absUrl) < 0) {
              arr.push(absUrl);
            }
          });

          // Save the snapshot without its script tags, so serving it
          // back does not re-run the application
          saveSnapshot(uri, stripScriptTags(browser.html()));
          // Continue with the next URL in the queue
          crawlPage(idx + 1, arr);
        }
      }, 500);
    });
  }
};

crawlPage(0, ["http://localhost:4000/#!/"]);
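
The crawler only moves on once the page sets data-status="ready" on its <body>, so the application itself has to raise that flag when rendering is finished. A minimal sketch of that signal in AngularJS, assuming a module named 'app' and ngRoute's $viewContentLoaded event (an app that loads data asynchronously should set the flag after the data arrives instead):

angular.module('app').run(function ($rootScope, $window) {
  // $viewContentLoaded is emitted by ng-view once a route template has rendered
  $rootScope.$on('$viewContentLoaded', function () {
    $window.document.body.setAttribute('data-status', 'ready');
  });
});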