baidut
10/29/2014 - 1:24 PM

web spider

web spider

// example 1
// $sp = new spider;
// echo $sp -> fetch('http://yingzhenqiang.sinaapp.com'); // a public page can be fetched directly

// example 2: log in to the network information center
// $sp = new spider;
// $sp-> login(
// 		'http://tulip.dlut.edu.cn',
// 		's201181086',
// 		md5('987412365'),
// 		array('flag'=> 'save')
// 	) or die("Fail in login."); // return 1
// echo $sp -> fetch('http://tulip.dlut.edu.cn/main.php');

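// example 3: log in to the academic affairs office (NOTE: the snippet below is identical to example 2 -- confirm the intended URL)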
// $sp = new spider;
// $sp-> login(
// 		'http://tulip.dlut.edu.cn',
// 		's201181086',
// 		md5('987412365'),
// 		array('flag'=> 'save')
// 	) or die("Fail in login."); // return 1
// echo $sp -> fetch('http://tulip.dlut.edu.cn/main.php');

// example 4: return Douban book information

// example 3
$sp = new spider;
// echo $sp -> fetch('http://search.china-pub.com/s/?key1=9787530355138&type=&pz=1&t=2'); fetching this URL directly redirects to the home page, so it does not work

// Fetching a Baidu Zhidao page succeeds
// echo $sp -> fetch('http://zhidao.baidu.com/link?url=BYWzkFRbZvyiJsyy8FQQvatZhBNt5FpbhZDe2bqWxuOctX9CUvoGfVUj0QhVYJHzgNLT_i-McxZmOnzqVxuRLsh3MilXKkIHkfgQ7Q8itZm'); 
<?php
/*======================================================================*\
SPIDER - the PHP web spider
Author: Zhenqiang Ying <yingzhenqiang@163.com>
Version: 1.2

The latest version of SPIDER can be obtained from:
https://github.com/baidut/php_web_spider
\*======================================================================*/
class spider{

    private $ch;        // cURL handle shared by every request of this instance
    private $error;     // message of the last failed transfer ('' after a success)

/*======================================================================*\
    Purpose:    load the HTML parser, check for cURL and open the handle
    Notes:      PARSER_FILE and COOKIE_FILE are constants that must be
                defined by the including script before instantiation.
                The script is terminated when the cURL extension is missing.
\*======================================================================*/
    function __construct() { 
        require_once(PARSER_FILE);
        if(!extension_loaded('curl'))
            exit('Fatal error:The system does not extend php_curl.dll.');
        $this-> ch = curl_init();
        $this-> error = '';
        $this-> reset();
    }
/*======================================================================*\
    Purpose:    release the cURL handle
    Notes:      only resource handles (PHP < 8) need an explicit close;
                object handles are released automatically, and skipping
                the call there avoids the curl_close() deprecation notice.
\*======================================================================*/
    function __destruct() { 
        if(is_resource($this-> ch))
            curl_close($this-> ch);
    }
/*======================================================================*\
    Purpose:    reset spider (re-apply the default transfer options)
\*======================================================================*/
    function reset(){
        curl_setopt($this-> ch, CURLOPT_USERAGENT,      "kind spider");
        // COOKIEJAR = where cookies are written when the handle is closed,
        // COOKIEFILE = where stored cookies are read from. CURLOPT_COOKIE must
        // not be used for this: it takes the literal "Cookie:" header value,
        // so passing a file name there sends the path itself as the cookie.
        curl_setopt($this-> ch, CURLOPT_COOKIEJAR,      COOKIE_FILE);
        curl_setopt($this-> ch, CURLOPT_COOKIEFILE,     COOKIE_FILE);
        curl_setopt($this-> ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($this-> ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($this-> ch, CURLOPT_TIMEOUT,        120);
    }
/*======================================================================*\
    Purpose:    return the message of the last failed transfer
    Output:     error text, or '' when the last transfer succeeded
\*======================================================================*/
    function getError(){
        return $this-> error;
    }
/*======================================================================*\
    Purpose:    fetch   a web page by url
    Input:      $url    web page address
    Output:     web page content, or false when the transfer failed
\*======================================================================*/
    function fetch($url){
        // The handle is reused, so a previous post() would otherwise leave it
        // in POST mode and this request would re-send the old form body.
        curl_setopt($this-> ch, CURLOPT_HTTPGET,        true);
        curl_setopt($this-> ch, CURLOPT_URL,            $url);
        curl_setopt($this-> ch, CURLOPT_FOLLOWLOCATION, true);
        return $this-> execute();
    }
/*======================================================================*\
    Purpose:    submit a form
    Input:      $url    web page address
                $fields form content
                    format: $fields["name"] = "value";
                    an array is sent as multipart/form-data, an already
                    encoded string as application/x-www-form-urlencoded
    Output:     content of the page reached after following redirects,
                or false when the transfer failed
\*======================================================================*/
    function post($url,$fields){
        curl_setopt($this-> ch, CURLOPT_POST,           1);
        curl_setopt($this-> ch, CURLOPT_POSTFIELDS,     $fields);
        curl_setopt($this-> ch, CURLOPT_URL,            $url);
        curl_setopt($this-> ch, CURLOPT_FOLLOWLOCATION, true);
        return $this-> execute();
    }
/*======================================================================*\
    Purpose:    login
    Input:      $_url       address of the page that contains the login form
                $_username  value for the form's first text input
                $_password  value for the form's first password input
                $_hidden    optional array of extra fields (name => value)
    Output:     the text output from the post, or false when the page,
                the form or one of its inputs could not be found
\*======================================================================*/
    function login($_url,$_username,$_password,$_hidden=""){
        // Download the page and locate its first form.
        // file_get_html() is provided by the parser loaded from PARSER_FILE.
        $html = file_get_html($_url);
        if(!$html) return false;
        $form = $html-> find('form',0);
        if(!$form) return false;

        $userInput = $form-> find('input[type=text]',0);
        $passInput = $form-> find('input[type=password]',0);
        if(!$userInput || !$passInput) return false;

        // Values are kept raw here and encoded exactly once by
        // http_build_query() below; pre-encoding them with urlencode() and
        // then posting an array would transmit the percent signs literally.
        $fields = array(
            $userInput-> name => $_username,
            $passInput-> name => $_password,
        );
        if(is_array($_hidden) && $_hidden) $fields = array_merge($fields, $_hidden);

        // Submit to the form's own action (falls back to the page address
        // when the form declares none).
        $target = $this-> resolveUrl($_url, $form-> action);
        return $this-> post($target, http_build_query($fields));
    }
/*======================================================================*\
    Purpose:    simulate typing a text into a search form and submitting it
    Input:      $_url       address of the page that contains the form
                $_txt       text to search for
                $_how2find  selector suffix appended to 'form' to pick the form
    Output:     the search result, or false when the page, the form or
                its text input could not be found
\*======================================================================*/
    function search($_url,$_txt,$_how2find){
        // Download the page and locate the requested form.
        $html = file_get_html($_url);
        if(!$html) return false;
        $form = $html->find('form'.$_how2find,0);
        if(!$form) return false;

        // Fill in the search box.
        $text = $form-> find('input[type=text]',0);
        if(!$text) return false;
        $fields = array( $text-> name => $_txt );

        // Carry over every hidden field of the form.
        foreach ($form-> find('input[type=hidden]') as $hidden) {
            $fields[ $hidden-> name ] = $hidden->value;
        }

        // Work out how and where the form is submitted.
        $method = strtolower(trim((string)$form-> method));
        $target = $this-> resolveUrl($_url, $form-> action);
        $query  = http_build_query($fields);

        if($method == 'post')
            return $this-> post($target, $query);

        // GET is the HTML default when the method is missing or unknown.
        // The form data replaces any query string already on the action URL.
        $target = preg_replace('/[?#].*$/s', '', $target);
        return $this-> fetch($target . '?' . $query);
    }
/*======================================================================*\
    Purpose:    run the prepared transfer and remember its error state
    Output:     result of curl_exec()
\*======================================================================*/
    private function execute(){
        $result = curl_exec($this-> ch);
        $this-> error = ($result === false) ? curl_error($this-> ch) : '';
        return $result;
    }
/*======================================================================*\
    Purpose:    turn a form action into an absolute URL
    Input:      $base       address of the page the form was found on
                $action     value of the form's action attribute
    Output:     absolute URL; $base when $action is empty; $action
                unchanged when $base cannot be parsed
    Notes:      dot segments ("../") are not normalised here.
\*======================================================================*/
    private function resolveUrl($base,$action){
        $action = trim((string)$action);
        if($action === '') return $base;
        // Already absolute (scheme://...).
        if(preg_match('#^[a-z][a-z0-9+.\-]*://#i', $action)) return $action;

        $parts = parse_url($base);
        if(!$parts || empty($parts['host'])) return $action;
        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';

        // Protocol-relative: //host/path
        if(substr($action, 0, 2) === '//') return $scheme . ':' . $action;

        $root = $scheme . '://' . $parts['host'];
        if(isset($parts['port'])) $root .= ':' . $parts['port'];

        // Host-relative: /path
        if($action[0] === '/') return $root . $action;

        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        // Query-only: ?a=b keeps the current document path.
        if($action[0] === '?') return $root . $path . $action;

        // Document-relative: resolve against the directory of the base path.
        $slash = strrpos($path, '/');
        $dir   = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
        return $root . $dir . $action;
    }
}
?>
<?php 
/**
 * Fetches a page with Snoopy and prints a debug report: response code and
 * headers, followed by the page's links, forms, plain text and raw content.
 *
 * NOTE(review): the default credentials are hard-coded in source. They are
 * kept only so existing callers behave as before; pass explicit values and
 * consider moving these defaults out of the code base.
 *
 * @param string $adress address of the page to inspect
 * @param string $user   user name assigned to the Snoopy client
 * @param string $pass   password assigned to the Snoopy client
 */
function echoFetchResult($adress, $user = "13591120447", $pass = "wy987412365"){

	require_once("lib/snoopy.php");
	$snoopy = new Snoopy;

	// Credentials used by the client (allows a simulated login).
	$snoopy->user = $user;
	$snoopy->pass = $pass;

	if($snoopy->fetch($adress))
	{
		echo "response code: ".$snoopy->response_code."<br>\n";
		// foreach instead of each(): each() was removed in PHP 8.
		foreach($snoopy->headers as $key => $val)
			echo $key.": ".$val."<br>\n";
	}
	else {
		echo "error fetching document: ".$snoopy->error."\n";
	}
	Title_Disp("Links");
	$snoopy->fetchlinks($adress);
	Var_Disp($snoopy->results);
	
	Title_Disp("Form");
	$snoopy->fetchform($adress);
	Var_Disp($snoopy->results);

	Title_Disp("Text");
	$snoopy->fetchtext($adress);
	// The global flag "isText2Pre" selects <PRE> output for the text section;
	// an undefined flag is treated as false instead of raising a warning.
	$isText2Pre = isset($GLOBALS["isText2Pre"]) ? $GLOBALS["isText2Pre"] : false;
	Var_Disp($snoopy->results,$isText2Pre);
	
	Title_Disp("Content");
	$snoopy->fetch($adress);
	Var_Disp($snoopy->results,true);
}	

/**
 * Prints a value and then clears it.
 *
 * - array: one "key: value<br/>" line per element
 * - otherwise, when $isPre is truthy: the HTML-escaped value inside <PRE>
 * - otherwise: the value as-is
 *
 * @param mixed $var   value to print; passed by reference and set to null afterwards
 * @param bool  $isPre print non-array values escaped inside a <PRE> block
 */
function Var_Disp(&$var,$isPre=false){
	if(is_array($var)) {
		// foreach instead of each(): each() was removed in PHP 8 and also
		// depended on the array's internal pointer being at the start.
		foreach($var as $key => $val){
			echo $key.": ".$val."<br/>\n";
		}
	}
	else if($isPre){
		// The cast keeps htmlspecialchars() from being handed null.
		echo "<PRE>".htmlspecialchars((string)$var)."</PRE>\n";
	}
	else {
		echo $var;
	}
	$var=null;
}
/**
 * Prints a section caption as a level-1 heading followed by a horizontal rule.
 *
 * @param string $var caption text, emitted without escaping
 */
function Title_Disp($var){
	$markup = sprintf('<h1>%s</h1><hr/>', $var);
	echo $markup;
}
?>