
Simple indexing script. Run with node indexer > output.file

var Crawler = require("simplecrawler").Crawler;
// var myCrawler = new Crawler("jeradbitner.com");
var myCrawler = new Crawler("www.grammy.com");
// myCrawler.domain = "www.grammy.com";
// Only fetch text responses (HTML, plain text, etc.).
myCrawler.supportedMimeTypes = [
  /^text\//i
];
// Stay on the main domain; treat www and the bare domain as the same site.
myCrawler.scanSubdomains = false;
myCrawler.ignoreWWWDomain = true;
// myCrawler.discoverResources = false;

var items = [];

myCrawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
  // Only want html pages &&
  // only want stuff on this domain. (Seems to pull in other things sometimes
  //   even though simplcrawler claims it should not).
  if (queueItem.stateData.contentType.indexOf("text/html") != -1 &&
    (queueItem.domain  == myCrawler.domain)) {
    items.push(queueItem.path);
    console.log(queueItem.url);
  }
});

myCrawler.on("complete", function() {
  // Dump the collected paths and a count once the crawl finishes.
  console.log(items);
  console.log(items.length);
});

myCrawler.start();
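
If you would rather have the script write its results to a file directly instead of redirecting stdout, a minimal sketch using Node's built-in fs module could look like the following (the output.json filename is just an example, not part of the original script):

var fs = require("fs");

myCrawler.on("complete", function() {
  // Write the collected paths as pretty-printed JSON. Filename is illustrative.
  fs.writeFileSync("output.json", JSON.stringify(items, null, 2));
  console.log("Wrote " + items.length + " paths to output.json");
});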