donfanning
8/15/2018 - 12:55 PM

Simple PhantomJS-based web crawler library

Simple PhantomJS-based web crawler library

// PhantomJS (http://phantomjs.org/) based web crawler. Anton Ivanov <anton.al.ivanov@gmail.com>, 2012.
// UPDATE: This gist has been made into a Node.js module and can now be installed with "npm install js-crawler".
// The Node.js version does not use PhantomJS, but the API available to the client is similar to the present gist.

(function(host) {

    /**
     * Crawler that loads pages and follows the links found on them.
     *
     * @constructor
     */
    function Crawler() {
        // Set of URLs recorded as visited (key: URL, value: true). It is
        // created without a prototype so that keys inherited from
        // Object.prototype ("constructor", "toString", ...) can never be
        // mistaken for an already visited URL.
        this.visitedURLs = Object.create(null);
    }
    
    // PhantomJS "webpage" module, kept on the constructor so that crawl() can
    // create a fresh page object for every URL via Crawler.webpage.create().
    Crawler.webpage = require('webpage');

    /**
     * Loads `url` in a new page, crawls the links found on it with
     * `depth - 1` and reports the outcome through the callbacks.
     *
     * Nothing happens when `depth` is 0 or when this crawler has already
     * requested the URL. Every URL is requested at most once, whether the
     * load succeeds or fails.
     *
     * @param {string} url - Address of the page to load.
     * @param {number} depth - Remaining levels to follow; 0 stops the crawl.
     * @param {function(Object)} onSuccess - Called with `{url, status, content}`
     *     for a loaded page; `content` is the inner HTML of the body, or ""
     *     when the document has no body content.
     * @param {function(Object)} onFailure - Called with `{url, status}` when
     *     the page reports the status 'fail'.
     */
    Crawler.prototype.crawl = function (url, depth, onSuccess, onFailure) {
        // Loose comparison kept on purpose: a depth of "0" stops the crawl too.
        if (0 == depth || this.visitedURLs[url]) {
            return;
        }

        // Mark the URL before the asynchronous load starts. Marking it only
        // once the page had loaded allowed the same URL to be opened again by
        // every link discovered in the meantime, including links from the
        // page to itself.
        this.visitedURLs[url] = true;

        var self = this;
        var page = Crawler.webpage.create();

        page.open(url, function (status) {
            if ('fail' === status) {
                page.close();
                onFailure({
                    url: url,
                    status: status
                });
                return;
            }

            var documentHTML = page.evaluate(function () {
                return document.body && document.body.innerHTML ? document.body.innerHTML : "";
            });
            var links = self.getAllURLs(page) || [];

            // Everything needed has been read from the page; release it so
            // that a crawl does not keep one page object alive per URL.
            page.close();

            self.crawlURLs(links, depth - 1, onSuccess, onFailure);
            onSuccess({
                url: url,
                status: status,
                content: documentHTML
            });
        });
    };

    /**
     * Collects the `href` attribute values of all anchors in a loaded page.
     *
     * The values are returned exactly as written in the markup (relative
     * links are not resolved). Anchors without an `href` attribute are
     * skipped, because getAttribute() yields null for them and callers
     * expect strings.
     *
     * @param {Object} page - Page object; only its evaluate() method is used.
     * @returns {Array.<string>} The href values in document order; an empty
     *     array when there are none or when evaluate() yields no value.
     */
    Crawler.prototype.getAllURLs = function(page) {
        var hrefs = page.evaluate(function () {
            return Array.prototype.slice.call(document.querySelectorAll("a"), 0)
                .map(function (link) {
                    return link.getAttribute("href");
                })
                .filter(function (href) {
                    return typeof href === "string";
                });
        });
        // Always hand an array back so callers can iterate unconditionally.
        return hrefs || [];
    };

    /**
     * Crawls every entry of `urls` that Crawler.isTopLevelURL() accepts.
     *
     * The list is filtered completely first; crawl() is then invoked for
     * each remaining URL, in order, with the arguments given here.
     *
     * @param {Array.<string>} urls - Candidate URLs.
     * @param {number} depth - Depth passed on to crawl().
     * @param {function(Object)} onSuccess - Passed on to crawl().
     * @param {function(Object)} onFailure - Passed on to crawl().
     */
    Crawler.prototype.crawlURLs = function(urls, depth, onSuccess, onFailure) {
        var topLevelURLs = urls.filter(function (candidate) {
            return Crawler.isTopLevelURL(candidate);
        });

        // The second argument of forEach binds `this` inside the callback.
        topLevelURLs.forEach(function (target) {
            this.crawl(target, depth, onSuccess, onFailure);
        }, this);
    };

    /**
     * Tells whether `url` is an absolute http(s) address, i.e. one that can
     * be opened as-is without resolving it against a base document.
     *
     * Non-string values (such as the null produced by an anchor without an
     * href) are rejected instead of raising a TypeError. The scheme is
     * matched case-insensitively and must be followed by "://", so relative
     * links that merely start with "http" (e.g. "http-guide.html") are not
     * mistaken for absolute URLs.
     *
     * @param {*} url - Candidate value, normally an href string.
     * @returns {boolean} true for strings starting with "http://" or "https://".
     */
    Crawler.isTopLevelURL = function(url) {
        return typeof url === "string" && /^https?:\/\//i.test(url);
    };

    host.Crawler = Crawler;
})(phantom);

// Example run: crawl the PhantomJS quick-start page two levels deep and log
// the outcome of every page load to the console.
(function () {
    var startURL = "https://github.com/ariya/phantomjs/wiki/Quick-Start";
    var maxDepth = 2;

    // Reports a page that loaded, together with the size of its content.
    var reportLoaded = function onSuccess(page) {
        console.log("Loaded page. URL = " + page.url + " content length = " + page.content.length + " status = " + page.status);
    };

    // Reports a page that could not be loaded.
    var reportFailed = function onFailure(page) {
        console.log("Could not load page. URL = " +  page.url + " status = " + page.status);
    };

    new phantom.Crawler().crawl(startURL, maxDepth, reportLoaded, reportFailed);
}());