nowindxdw
11/2/2017 - 1:34 AM

A Scrapy-style web scraper in Node.js


/**
 * 并发抓取原始网页数据
 * @param trackList
 * @param callback
 */
start: function (trackList, callback) {
    var urls = trackList;
    var ep = new eventproxy();
    ep.after("eventName", urls.length, function (urlResults) {
        callback(null, urlResults);
    });
    urls.forEach(function (url) {
        superagent.get(url)
            .end(function (err, urlRes) {
                ep.emit("eventName", [url, urlRes.text])
            });
    });
},

translateCDNews: function (data, callback) {
    // logger.trace(data);
    var url = data[0];
    logger.trace(url);
    var $ = cheerio.load(data[1]);
    // console.log($)
    //通过jquery方式获取内容链接
    var topnewsEle =$('.top-news');
    var topNews=[];
    topnewsEle.each(function(idx,element){
        var $element = $(element).children('h1').first();
        var title = $element.text();  //标题
        var href = $element.children('a').attr('href');//链接
        topNews.push({
            title:title,
            href:href
        })
    });
    logger.debug(topNews);
    callback(null,topNews)
},

// Pages to fetch. Alternative targets are kept commented out for reference.
var trackList = [
    // 'https://www.lagou.com/jobs/2678740.html',//lagou nodejs
    //bosszhipin
    //neitui
    "https://www.bilibili.com/ranking#!/all/0/0/3/"//bilibili
];

// Fetch every tracked page, then parse the first result. The parse step must
// run inside the fetch callback: start() completes asynchronously, so the
// fetched data does not exist yet when start() returns.
start(trackList, function (err, result) {
    if (err) {
        logger.error(err.stack);
        return;
    }
    logger.debug(result);
    // NOTE(review): testData is not declared in this snippet; presumably it is
    // declared elsewhere. The assignment is kept so code reading it still works.
    testData = result;

    if (!result || result.length === 0) {
        // Nothing was fetched, so there is no page to parse.
        return;
    }
    translateCDNews(result[0], function (parseErr, topNews) {
        if (parseErr) {
            logger.error(parseErr.stack);
            return;
        }
        console.log(topNews);
    });
});