web spider
// example 1
// $sp = new spider;
// echo $sp -> fetch('http://yingzhenqiang.sinaapp.com'); 可以直接fetch
// example 2 登录网信中心
// $sp = new spider;
// $sp-> login(
// 'http://tulip.dlut.edu.cn',
// 's201181086',
// md5('987412365'),
// array('flag'=> 'save')
// ) or die("Fail in login."); // return 1
// echo $sp -> fetch('http://tulip.dlut.edu.cn/main.php');
// example 3 登录教务处
// $sp = new spider;
// $sp-> login(
// 'http://tulip.dlut.edu.cn',
// 's201181086',
// md5('987412365'),
// array('flag'=> 'save')
// ) or die("Fail in login."); // return 1
// echo $sp -> fetch('http://tulip.dlut.edu.cn/main.php');
// example 4 返回豆瓣图书信息
// example 3
$sp = new spider;
// echo $sp -> fetch('http://search.china-pub.com/s/?key1=9787530355138&type=&pz=1&t=2'); 直接抓取会跳转到主页,不能实现
// 抓取百度知道页面成功
// echo $sp -> fetch('http://zhidao.baidu.com/link?url=BYWzkFRbZvyiJsyy8FQQvatZhBNt5FpbhZDe2bqWxuOctX9CUvoGfVUj0QhVYJHzgNLT_i-McxZmOnzqVxuRLsh3MilXKkIHkfgQ7Q8itZm');
<?php
/*======================================================================*\
SPIDER - the PHP web spider
Author: Zhenqiang Ying <yingzhenqiang@163.com>
Version: 1.2
The latest version of SPIDER can be obtained from:
https://github.com/baidut/php_web_spider
\*======================================================================*/
class spider{
private $ch; // cURL handle
private $error; // error messages sent here
function __construct() {
require_once(PARSER_FILE);
if(!extension_loaded('curl'))
exit('Fatal error:The system does not extend php_curl.dll.');
$this-> ch = curl_init();
$this-> reset();
}
function __destruct() {
curl_close($this-> ch);
}
/*======================================================================*\
Purpose: reset spider
\*======================================================================*/
function reset(){
curl_setopt($this-> ch, CURLOPT_USERAGENT, "kind spider");
curl_setopt($this-> ch, CURLOPT_COOKIEJAR, COOKIE_FILE);
curl_setopt($this-> ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($this-> ch, CURLOPT_TIMEOUT, 120);
}
/*======================================================================*\
Purpose: fetch a web page by url
Input: $url web page address
Output: web page content
\*======================================================================*/
function fetch($url){
curl_setopt($this-> ch,CURLOPT_URL, $url);
curl_setopt($this-> ch,CURLOPT_COOKIE, COOKIE_FILE);
curl_setopt($this-> ch,CURLOPT_FOLLOWLOCATION, true);
return curl_exec($this-> ch);
}
/*======================================================================*\
Purpose: submit a form
Input: $url web page address
$fields form content
format: $fields["name"] = "value";
Output: web page content
\*======================================================================*/
function post($url,$fields){
curl_setopt($this-> ch,CURLOPT_POST,1);
curl_setopt($this-> ch,CURLOPT_POSTFIELDS,$fields);
curl_setopt($this-> ch,CURLOPT_URL,$url);
curl_setopt($this-> ch,CURLOPT_COOKIE, COOKIE_FILE);
curl_setopt($this-> ch,CURLOPT_FOLLOWLOCATION,true);
// 返回跳转后的页面 如果只提交表单,则返回1表示成功
return curl_exec($this-> ch);
}
/*======================================================================*\
Purpose: login
Input: $_url
$_username
$_password
$_hidden
Output: the text output from the post
\*======================================================================*/
function login($_url,$_username,$_password,$_hidden=""){
// 分析网页,获得表单并分析
$html = file_get_html($_url); // 获取页面成功
$form = $html-> find('form',0); // 定位表单
$fields = array(
$form-> find('input[type=text]',0)-> name => urlencode($_username),
$form-> find('input[type=password]',0)-> name => urlencode($_password),
);
if($_hidden) $fields = array_merge($fields, $_hidden); // 添加hidden
if(! $action = $form-> action) $action = $_url; // 如果action为空的话,如果不为空还要分析出主机
return $this-> post($_url,$fields);
}
/*======================================================================*\
Purpose: 根据搜索的页面地址,以及输入框位置,模拟一次输入文本搜索的操作
Input: $_url
$_txt
$_how2find
Output: the search result
\*======================================================================*/
function search($_url,$_txt,$_how2find){ //
// 分析网页,获得表单并分析,这一步不需要模拟登陆工具
$html = file_get_html($_url); // 获取页面成功
$form = $html->find('form'.$_how2find,0); // 定位表单echo $form;exit(0);
// 填写搜索框
$text = $form-> find('input[type=text]',0);
$fields = array( $text-> name => $_txt );
// 添加hidden域
$hiddens = $form-> find('input[type=hidden]');
foreach ($hiddens as $key => $hidden) {
$fields[ $hidden-> name ] = $hidden->value;
}
// 分析提交动作
$method = $form-> method;
$action = $form-> action; // 假设是绝对路径,没有处理相对路径
// 下面执行模拟搜索
if($action)
$_url = $action;
if($method=='get')
return $data = $this-> fetch( $_url . '?' . http_build_query($fields) );
if($method=='post'){
// print_r($_url);print_r($fields);exit(0);
// echo $this-> fetch($action);
return $this-> post($_url,$fields);
}
}
}
?>
<?php
function echoFetchResult($adress){
require_once("lib/snoopy.php");
$snoopy = new Snoopy;
$snoopy->user = "13591120447";// 可以模拟登陆
$snoopy->pass = "wy987412365";
if($snoopy->fetch($adress))
{
echo "response code: ".$snoopy->response_code."<br>\n";
while(list($key,$val) = each($snoopy->headers))
echo $key.": ".$val."<br>\n";
}
else {
echo "error fetching document: ".$snoopy->error."\n";
}
Title_Disp("Links");
$snoopy->fetchlinks($adress);
Var_Disp($snoopy->results);
Title_Disp("Form");
$snoopy->fetchform($adress);
Var_Disp($snoopy->results);
Title_Disp("Text");
$snoopy->fetchtext($adress);
Var_Disp($snoopy->results,$GLOBALS["isText2Pre"]);// 注意这里 要声明全局变量
Title_Disp("Content");
$snoopy->fetch($adress);
Var_Disp($snoopy->results,true);
}
function Var_Disp(&$var,$isPre=false){
if(is_array($var)) {
while(list($key,$val) = each($var)){
echo $key.": ".$val."<br/>\n";
}
}
else if($isPre){
echo "<PRE>".htmlspecialchars($var)."</PRE>\n";
}
else {
echo $var;
}
$var=null;
}
function Title_Disp($var){
echo "<h1>".$var."</h1><hr/>";
}
?>