donfanning
8/15/2018 - 12:08 PM

Crawl Facebook feeds in Node.js with node-simplecrawler

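Facebook's Graph API returns JSON rather than HTML, so simplecrawler's default link discovery has nothing useful to find. The snippet below instead overrides discoverResources so that each fetched page queues the paging.next URL from the response, walking the feed page by page. A Graph API feed response carries its pagination roughly in this shape (illustrative only, values elided):

{
  "data": [ ... ],
  "paging": {
    "previous": "https://graph.facebook.com/...",
    "next": "https://graph.facebook.com/..."
  }
}

The node.send() calls assume the code runs inside a Node-RED function node; outside Node-RED, replace them with your own output handling.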

var Crawler = require("simplecrawler");
var Url = require("url");

// Graph API feed endpoint: the access token, requested fields, date range and page size all travel in the query string.
var target = "https://graph.facebook.com/ledzeppelin/feed?access_token=1597581200507009%7Ce749be55ea86249f92ae56b081c37b38&fields=from%2Cmessage%2Ccreated_time%2Ctype%2Clink%2Ccomments.summary(true)%2Clikes.summary(true)%2Cshares&since=2016-07-11&until=2016-07-14&limit=10";
var url = Url.parse(target);

// simplecrawler's older (pre-1.0) constructor takes the host; path, port and
// protocol are configured separately. Newer releases accept a full URL instead.
var crawler = new Crawler(url.host);
crawler.initialPath = url.path;
crawler.initialPort = 443;
crawler.initialProtocol = "https";

crawler.maxConcurrency = 1;        // fetch one page at a time
crawler.maxDepth = 2;
crawler.stripQuerystring = false;  // the access token and paging parameters live in the query string
crawler.decodeResponses = true;    // hand the response body over as a decoded string
crawler.supportedMimeTypes.push(/^application\/json/i); // the Graph API serves JSON, not HTML
 
crawler.on("fetchcomplete", function(queueItem, buffer, response) {
    console.log("Completed fetching resource:", queueItem.url);
    // node.send() assumes a Node-RED function node; see the note above the snippet
    node.send({payload: "Completed fetching resource: " + queueItem.url});
    node.send({payload: buffer});
});

crawler.on("fetcherror", function(queueItem, response) {
    console.log("Error fetching resource:", queueItem.url);
    // forward the failed queue item so the error can be inspected downstream
    node.send({payload: {"error": queueItem}});
});

// Override link discovery: instead of scraping the body for links, queue the
// Graph API's paging.next URL so the crawler follows the feed page by page.
crawler.discoverResources = function(buffer, queueItem) {
    try {
        var data = JSON.parse(buffer);
        if (data.paging && data.paging.next) {
            return [data.paging.next];
        }
    }
    catch (e) {
        console.log(e);
    }
    return [];
};


crawler.start();
console.log("start crawling...");
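
The fetchcomplete handler above forwards the raw JSON buffer downstream. If the individual posts are wanted instead, a minimal sketch could parse the body and pull out fields the target URL already requests (message and created_time below simply mirror that query string):

crawler.on("fetchcomplete", function(queueItem, buffer, response) {
    // decodeResponses = true, so buffer arrives as a decoded string
    var page = JSON.parse(buffer);
    (page.data || []).forEach(function(post) {
        // message and created_time are among the fields requested above
        console.log(post.created_time, "-", post.message);
    });
});

To run it as a standalone script outside Node-RED, install the library with npm install simplecrawler and strip or replace the node.send() calls first.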