LiamBao
6/21/2017 - 6:51 AM

JS

JS

Charles 826179140@qq.com

load("module.js");
load("global_parameter.js");
importClass(net.sf.json.JSONObject);

解析 JSON
// Sample endpoint: the get_comment API of xiaohongshu for one discovery_id.
var url="http://www.xiaohongshu.com/api/discovery/get_comment?&_r=1458289534039&discovery_id=56ea7a48d1d3b91ccea96cfb";
// loadPage is not defined in this file (presumably provided by module.js — confirm).
var xml=loadPage(url);

// getjsonStr (defined below) returns the eval'd value, not a string, despite the variable name.
var jsonstr=getjsonStr(xml);
println(jsonstr);
function getjsonStr(xml){
	var jsonStr = html2text(xml..BODY);
	var startPos = jsonStr.indexOf("({")
	if(startPos > -1) {
		jsonStr = jsonStr.substring(startPos+1);
		var endPos = jsonStr.indexOf("})");
		jsonStr = jsonStr.substring(0, endPos+1);			
	}
	var jsons = eval('(' + jsonStr + ')');4
	return jsons;
}
/**
 * Decodes JSON-style escapes in raw response text.
 *
 * Turns \" into ", \/ into / and \uXXXX sequences into the characters they
 * encode (by rewriting "\u" to "%u" and running unescape).
 *
 * @param str  raw text; coerced to a string first
 * @return the decoded string
 */
function hexToDec(str) {
	// String(...) guards against non-string arguments whose own replace()
	// method has different semantics (e.g. E4X XML values).
	var json = String(str).replace(/\\\"/g,'"').replace(/\\\//g,'/').replace(/\\u/g,"%u");
	return unescape(json);
}




buildPropertyDefine(type, isRequire, isRefindKey, refindKeyIndex, format, defaultValue)

=IF(ISERROR(MATCH(A1,$G:$G,0)),A1,VLOOKUP(A1,$G$1:$H$3,2,FALSE))


	// Snippet: remove every '&' character from str.
	str = str.replace(/[&]/g,"");
	
	
// Alternative patterns for extracting a numeric thread id; the first capture
// group is the id. These come from different site templates: when all three
// lines run in one script, the last threadURLRegex assignment wins.
var threadURLRegex = /bbs\/.*\/(\d+)\.html/;
// Matches row ids such as "normalthread_12345".
var threadRegex = /normalthread\_(\d+)/;
var threadURLRegex = /thread.*\-(\d+)\-\d+\.html/;

// E4X query: every DL whose CLASS matches /list_dl.*/ anywhere below a DIV with CLASS "carea".
var rowNodes = xml..DIV.(@CLASS =="carea")..DL.(/list_dl.*/.test(@CLASS));


// Snippet (belongs inside a function): return the HREF as is when its text
// contains 'http:', otherwise return it prefixed with 'http:'.
return html2text(node.@HREF).indexOf('http:')>=0?node.@HREF:'http:'+node.@HREF;

// Snippet: keep the node's markup from the first '>' (inclusive) up to the first 'BR' (exclusive).
var str=toString(node);
str=str.substring(str.indexOf('>'),str.indexOf('BR'));

// Markup clean-up rules. addInvalidTagReplacePair / addInvalidArrt are not
// defined in this file; presumably each call registers a regex and its
// replacement (or an attribute pattern to drop) applied to the raw HTML before
// it is parsed as XML — confirm against the framework. Keep the order: later
// rules may rely on earlier ones having run.

// Drop inline scripts in either letter case.
addInvalidTagReplacePair("<script.*?/script>","");
addInvalidTagReplacePair("<SCRIPT.*?/SCRIPT>","");
// Rewrite the start of a namespace-prefixed tag (prefix:name) to "<P".
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]", "<P");
// Drop IMG tags in either letter case.
addInvalidTagReplacePair("<IMG [a-zA-Z0-9_].*>", "");
addInvalidTagReplacePair("<img [a-zA-Z0-9_].*>", "");
addInvalidArrt("xml:lang=");
// Tags with two namespace prefixes (a:b:c) become P / closing P.
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]+:[a-zA-Z0-9_]+", "<P");
addInvalidTagReplacePair("</[a-zA-Z0-9_]+:[a-zA-Z0-9_]+:[a-zA-Z0-9_]+>", "</P>");
// Self-closing prefixed tags are removed; remaining prefixed tags become P / closing P.
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]+/>", "");
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]+", "<P");
addInvalidTagReplacePair("</[a-zA-Z0-9_]+:[a-zA-Z0-9_]+>", "</P>");
// "FORM." / "form." becomes "FORM".
addInvalidTagReplacePair("FORM\\.|form\\.", "FORM");
// "?>" becomes "/>".
addInvalidTagReplacePair("\\?>", "/>");
addInvalidTagReplacePair("&nbsp;", "");
addInvalidTagReplacePair("\\?xml:", "xml");
// Drop whole FORM and XMLNS elements and the DOCTYPE declaration.
addInvalidTagReplacePair("<FORM.*?FORM>","");
addInvalidTagReplacePair("<XMLNS.*?XMLNS>","");
// Any attribute of the form prefix:name= is treated as invalid.
addInvalidArrt("[0-9a-zA-Z_]+:[0-9a-zA-Z_]+=");
addInvalidTagReplacePair("<!DOCTYPE .*?>","");
// Leftovers that look like Office-exported markup (empty o:p, x:str) — TODO confirm origin.
addInvalidTagReplacePair("<o:p></o:p>","");
addInvalidTagReplacePair("x:str=\"\"",""); 


登录

function loginPage(username, password) {
	try {
		var url = "http://www.kaixin001.com";
		browser.url = url;
		sleep(20000);
		if (checkLogin()) {
			logInfo("Login Sucessed. Do not need input username and password.");
			return;
		}
		var xml = new XML(browser.document.xmlContent);
		var emailInput = xml..INPUT.(@NAME == "email");
		if (emailInput == null || emailInput.length() == 0) {
			return;
		}
		browser.elementSetValue(emailInput, username);
		browser.elementSetValue(xml..INPUT.(@NAME == "password"), password);
		browser.elementSetChecked(xml..INPUT.(@NAME == "remember"), true);
		browser.elementClick(xml..INPUT.(@ID == "btn_dl"));
		sleep(10000);
		var htmlContent = browser.document.xmlContent;
		if (checkLogin()) {
			logInfo("Login Sucessed. Do not need input code by verifyImage.");
			return;
		}
		var xml = new XML(htmlContent);
		try {
			var verifyImage = xml..IMG.(@ID == "randimg");
			// xml..AIMG.(@ID == "randddimg").length();
			if (verifyImage != null && verifyImage.length() > 0 && verifyImage.@SRC.length() > 0) {
/* var verifyCode = inputVerifyCode(browser.readImageData("randimg"));
		        if (verifyCode != null) {
			       browser.elementSetValue(xml..INPUT.(@ID == "code"), verifyCode);
		        }
	            */
/*saveErrorContent(url+"?user="+username+"&password="+password, htmlContent);
	            dbError("Need input verify code when login.");
                */
			}
		} catch (parseLoginError) {
			hasError = true;
			logError("Error when parse login page", parseLoginError);
		}
	} catch (err) {
		hasError = true;
		logError("Error in login", err);
	}
}

function checkLogin() {
	var xml = new XML(browser.document.xmlContent);
	var loginedLink = xml..A.(@TITLE == "个人资料");
	if (loginedLink.length() > 0 && (browser.url == "http://www.kaixin001.com/home/?l=a" || parseUserId(loginedLink.@HREF) == parseUserId(browser.url))) {
		return true;
	}
	return false;
}



zhihu.com 知乎:postrawNode
///////////////////////////////////////////
/**
 * Splits a raw answer-list response into one text chunk per answer.
 *
 * The response is turned into text, \" and \/ are unescaped, and the result is
 * passed through hexToDec. Each chunk starts at an occurrence of
 * 'zm-item-answer ' and ends before the next '","' (or, when there is none,
 * before the next ']').
 *
 * @param xml  raw response; converted with toString
 * @return array of string chunks (empty when nothing matches)
 */
function getPostRowNodes(xml) {
	var MARKER = 'zm-item-answer ';
	var text = hexToDec(toString(xml).replace(/\\\"/g,'"').replace(/\\\//g,'/'));
	var chunks = [];
	var searchFrom = 0;
	for (var begin = text.indexOf(MARKER, searchFrom); begin >= 0; begin = text.indexOf(MARKER, searchFrom)) {
		var end = text.indexOf('","', begin);
		if (end < 0) {
			// Last element of the list: fall back to the closing bracket.
			end = text.indexOf(']', begin);
		}
		if (end < 0) {
			break;
		}
		chunks.push(text.substring(begin, end));
		searchFrom = end;
	}
	return chunks;
}



/**
 * Extracts the subject from a row given as a string.
 *
 * The subject is the text between the first '">' found at or after '<dt>' and
 * the closing '</dt>', with escaped slashes (\/) turned back into '/'.
 *
 * @param rowNode  row markup as a string
 * @return the subject as a string
 * @throws the string "Can not parse subject" when the markers are missing or
 *         the extracted text is rejected by isInvalidObject
 */
function parseSubject(rowNode) {
	var bpos = rowNode.indexOf('<dt>');
	var epos = rowNode.indexOf('</dt>');
	var start = bpos < 0 ? -1 : rowNode.indexOf('">', bpos);
	// Without these checks substring() would silently return unrelated text.
	if (bpos < 0 || epos < 0 || start < 0 || start + 2 > epos) {
		throw "Can not parse subject";
	}
	var titleNode = rowNode.substring(start + 2, epos).replace(/\\\//g, '/');
	if (isInvalidObject(titleNode)) {
		throw "Can not parse subject";
	}
	return titleNode;
}


///////////////////////
	// Search URL for BBS results on search.dxy.cn. The keyword is a query-string
	// value, so it is encoded with encodeURIComponent: encodeURI would leave
	// '&', '=', '#' and '+' untouched and a keyword containing them would
	// corrupt the query.
	var url='http://search.dxy.cn/?words='+encodeURIComponent(theKeyword)+'&source=BBS&limit=15&o=1&page='+page;
	

/**
 * Loads a page with one retry and a browser-content fallback.
 *
 * loadPage(url) is attempted twice. When both attempts throw, the content
 * returned by getHtmlContent() is used instead, with "<!DOCTYPE html>"
 * removed. Errors from the fallback itself are not caught.
 *
 * @param url  address to load
 * @return whatever loadPage returns, or the cleaned fallback content
 */
function URL(url){
	for (var attempt = 0; attempt < 2; attempt++) {
		try {
			return loadPage(url);
		} catch (loadErr) {
			// Ignored on purpose: retry once, then use the fallback below.
		}
	}
	return replaceAll(getHtmlContent(), "<!DOCTYPE html>", "");
}



//////////////////////

// Random pause between requests: Math.random()*10000 + 1000 truncated by
// parseInt, i.e. a value from 1000 to 10999 (presumably milliseconds — confirm
// against sleep). waitLoadPage is called once the pause is over.
var sleepTime=parseInt(Math.random()*10000 + 1000);
			logInfo("Sleep====="+sleepTime);
			sleep(sleepTime);
			waitLoadPage();
	
			

//
//////
	// Snippet: derive theLastScraptTime from lastScraptTime via calcDate(dt, -1, 'd')
	// (presumably "one day earlier" — confirm calcDate semantics).
	var dt = parseDateTime(lastScraptTime);
	//	var dt = getCurrentDate();
	theLastScraptTime = calcDate(dt,-1,'d');
///////////////////


/**
 * Extracts the text between the first '<LI>' of a node's markup and the first
 * '<A' that follows it, with all whitespace and ':' characters removed.
 *
 * @param node  node whose markup is obtained with toString
 * @return the cleaned text as a string
 */
function parseRefInfo(node) {
	var markup = toString(node);
	var liStart = markup.indexOf('<LI>');
	var anchorStart = markup.indexOf('<A', liStart);
	// +4 skips the '<LI>' tag itself.
	var rawText = markup.substring(liStart + 4, anchorStart);
	return rawText.replace(/\s+|:/g,'');
}


/////////////

	// Snippet (the outer try is closed outside this fragment): load the thread
	// page, and when loadPage throws, rebuild the XML from getXmlContent() after
	// replacing "HTTP:=" with "MDA=".
	try {
		// Load the page
		logInfo("start loadPage: " + theThreadURL);	
		
		try{
			xml = loadPage(theThreadURL);

		}catch(err){
			//println(err);
			var str = getXmlContent();
			// Presumably "HTTP:=" is what makes the XML parser fail — TODO confirm.
			str = replaceAll(str,"HTTP:=","MDA=");
			//println(str);
			xml = new XML(str);
			//println(xml);
		}
		
		
		
		
		
		

/////页面格式问题
扫描下网页的 meta 标签里有没有如 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

	setWebCharset("gbk");     // if the page declares 'gbk' but the scraped data is still garbled, also use the next line
	setXmlCharset("gbk");





//  Quoted-reply extraction (function body only; the function header is not part of this snippet).
//  When the node contains a DIV with CLASS "relyhf", return the text of that DIV
//  wrapped in 『』 followed by the text of the DIV with CLASS "yy_reply_cont";
//  otherwise return the node unchanged.
	if(!isInvalidObject(node..DIV.(@CLASS == "relyhf"))){
		var com=node..DIV.(@CLASS == "relyhf");
		var com2=node..DIV.(@CLASS == "yy_reply_cont");
		var ret ="『"+html2text(com)+"』"+html2text(com2);
		return ret;
		
	}
	return node;
}






// Loop used when the NEXT link has no distinguishing class: scan pageNode for
// the entry whose text is "»" and return it (function body only; the header is
// not part of this snippet). Returns null when no such entry exists or when
// pageNode has at most one entry.
	var nextPageNode = null;
	if (pageNode.length() > 1) {
		for(var i=0;isInvalidObject(nextPageNode) && i<pageNode.length(); ++i){
			if(html2text(pageNode[i]) == "»" ) {
				nextPageNode = pageNode[i];
				return nextPageNode;
			}
		}
	}

	return nextPageNode;
	
	
	
	
	
	
	/////////////////////////////////////////////
	当postdate出现  “几天前”   时用到的脚本
	/////////////////////////////////////////////
	
	
	function parsePostDateOfPost(rowNode) {
		var node = rowNode..DIV.(@CLASS == "authi").EM;
			if (isInvalidObject(node)) {
				throw "Can not parse Dateofpost";
			}
      
	
			if (node.length() > 1) {
			   node = node[1];
				}
			if (toString(node).indexOf("TITLE")>=0){
				var node_time=node.SPAN.@TITLE;
				return parseDateTime(node_time);
			}else{
			
				node = html2text(node);
				node=  node.replace(" ","");
			
				return  parseDateTime(node);
			}
	
}


 //  或者 :

function parseIssueDate(rowNode) {
	var node = rowNode..TD.(@CLASS == "by").EM;
	if(node.length() > 1) {
		node = node[0];
	}
	if (isInvalidObject(node)) {
		throw "Can not parse IssueDate";
	}
	
	if (html2text(node).indexOf("前") >= 0){
		node=html2text(node).replace(/\s+/g,"");
		if(node=="半小时前"){
			node="30分钟前";
		}
		var dateNode=parseDateTime(node);
	}else{
		var dateNode=parseDateTime(html2text(node));
	}
	
	return dateNode;
}





//在传入的行对象中解析首帖的发布时间,
//返回为字符串类型
function parseIssueDate(rowNode) {
	var node = rowNode..TD.(@CLASS == "by")[0]..SPAN;
	if (isInvalidObject(node)) {
		node = rowNode..TD.(@CLASS == "by")[0].EM.A;
		}
	if (isInvalidObject(node)) {
		throw "Can not parse IssueDate";
		}
		
	return parseDateTime(node);
}


	
	// getThreadRowNodes 时当本版置顶贴用到的ID类不同时,用到的脚本
	
	function getThreadRowNodes(xml) {
			var rowNodes = xml..TABLE.(@ID =="threadlisttableid").TBODY.(/normalthread\_(\d+)/.test(@ID) || /stickthread\_(\d+)/.test(@ID));
				 if( isInvalidObject(rowNodes))
				return  rowNodes;

	}
	
	// getThreadID variant for sites whose pages use different id styles in the URL.

	/**
	 * Extracts the numeric thread id from a thread URL.
	 *
	 * URLs containing "bbs/thread" are read as "thread-<id>"; all other URLs
	 * are read as "tid=<id>".
	 *
	 * @param parseUrl  thread URL as a string
	 * @return the id as a string of digits
	 * @throws the string "Can not parse ThreadID." when no id can be extracted
	 */
	function parseThreadId(parseUrl) {
		try {
			// Local variable: the id used to leak into an implicit global.
			var idPattern = parseUrl.indexOf("bbs/thread") >= 0 ? /thread\-(\d+)/ : /tid\=(\d+)/;
			return idPattern.exec(parseUrl)[1];
		} catch (err) {
			// Covers both a null/invalid argument and a URL without an id.
			throw "Can not parse ThreadID.";
		}
	}


/**
 * Pauses for a random time and then waits for the page to finish loading.
 *
 * The pause is Math.random()*10000*6 + 1000*30 truncated to an integer, i.e.
 * 30000 to 89999; it is logged, passed to sleep, and followed by two
 * waitLoadPage calls.
 */
function pagesleep(){
	var pauseLength = Math.floor(Math.random() * 10000 * 6 + 1000 * 30);
	logInfo("Sleep=====" + pauseLength);
	sleep(pauseLength);
	for (var round = 0; round < 2; round++) {
		waitLoadPage();
	}
}




	// Read a script file and execute it line by line, logging before and after
	// each line, then call commit().
	// NOTE(review): eval runs every line of the file as code; only use this
	// with files you fully trust.



	var content = readFile("e:/myac.txt", 'utf-8');
	// NOTE(review): `array` is never used in this snippet.
	var array = new Array();
	var lines = content.split('\n');
	for (var i =0; i <lines.length; ++i) {//lines.length
		logInfo(" ******************************** "+i+" -- start "+lines[i]);
				eval(lines[i]);
		logInfo(" -------------------------------- "+i+" -- end "+lines[i]);
	}
	commit();




	// Snippet: turn a view counter into a number by removing the first "views"
	// and every comma from its text before passing it to getNumber.
	PlayNum = getNumber(html2text(PlayNum).replace("views","").replace(/,/g,""));