JS
Charles 826179140@qq.com
load("module.js");
load("global_parameter.js");
importClass(net.sf.json.JSONObject);
解析 JSON (parse the JSON response)
// Demo: load one hard-coded comment-API URL, run the page through getjsonStr and print the result.
var url="http://www.xiaohongshu.com/api/discovery/get_comment?&_r=1458289534039&discovery_id=56ea7a48d1d3b91ccea96cfb";
var xml=loadPage(url);
// NOTE: despite the variable name, getjsonStr returns the value produced by eval, not a string.
var jsonstr=getjsonStr(xml);
println(jsonstr);
function getjsonStr(xml){
var jsonStr = html2text(xml..BODY);
var startPos = jsonStr.indexOf("({")
if(startPos > -1) {
jsonStr = jsonStr.substring(startPos+1);
var endPos = jsonStr.indexOf("})");
jsonStr = jsonStr.substring(0, endPos+1);
}
var jsons = eval('(' + jsonStr + ')');4
return jsons;
}
/**
 * Decodes JSON-style escapes in a string.
 *
 * \" becomes ", \/ becomes /, and every \uXXXX escape is rewritten to %uXXXX so
 * that unescape turns it into the character. Any %XX / %uXXXX sequences already
 * present in the input are decoded by the same unescape call.
 *
 * The previous body ran these replacements on an unrelated outer variable and
 * threw the result away, so \uXXXX escapes in the argument were never decoded.
 *
 * @param str text to decode; coerced to a string first.
 * @returns the decoded string.
 */
function hexToDec(str) {
    var json = String(str)
        .replace(/\\\"/g, '"')
        .replace(/\\\//g, '/')
        .replace(/\\u/g, "%u");
    return unescape(json);
}
buildPropertyDefine(type, isRequire, isRefindKey, refindKeyIndex, format, defaultValue)
=IF(ISERROR(MATCH(A1,$G:$G,0)),A1,VLOOKUP(A1,$G$1:$H$3,2,FALSE))
str = str.replace(/[&]/g,"");
var threadURLRegex = /bbs\/.*\/(\d+).html/;
var threadRegex = normalthread\_(\d+);
var threadURLRegex = /thread.*\-(\d+)\-\d+\.html/;
var rowNodes = xml..DIV.(@CLASS =="carea")..DL.(/list_dl.*/.test(@CLASS));
return html2text(node.@HREF).indexOf('http:')>=0?node.@HREF:'http:'+node.@HREF;
var str=toString(node);
str=str.substring(str.indexOf('>'),str.indexOf('BR'));
// Markup clean-up rules. Each addInvalidTagReplacePair(pattern, replacement) call passes a
// regex-looking pattern string and the text to put in its place; presumably the framework
// applies these to the raw HTML before parsing it as XML — confirm against the helper's definition.
// Inline script blocks (lower- and upper-case tag names) are removed.
addInvalidTagReplacePair("<script.*?/script>","");
addInvalidTagReplacePair("<SCRIPT.*?/SCRIPT>","");
// Start of a prefixed tag (prefix:name) becomes "<P".
// NOTE(review): this pattern matches a single character after the colon, unlike the
// otherwise identical rule further down that ends in "+" — possibly a typo.
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]", "<P");
// IMG tags are removed. NOTE(review): ".*>" is greedy and may swallow more than one tag.
addInvalidTagReplacePair("<IMG [a-zA-Z0-9_].*>", "");
addInvalidTagReplacePair("<img [a-zA-Z0-9_].*>", "");
// addInvalidArrt presumably drops attributes matching the given pattern — confirm.
addInvalidArrt("xml:lang=");
// Doubly-prefixed tags (a:b:c): opening becomes "<P", closing becomes "</P>".
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]+:[a-zA-Z0-9_]+", "<P");
addInvalidTagReplacePair("</[a-zA-Z0-9_]+:[a-zA-Z0-9_]+:[a-zA-Z0-9_]+>", "</P>");
// Self-closing prefixed tags are removed.
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]+/>", "");
// Singly-prefixed tags (a:b): opening becomes "<P", closing becomes "</P>".
addInvalidTagReplacePair("<[a-zA-Z0-9_]+:[a-zA-Z0-9_]+", "<P");
addInvalidTagReplacePair("</[a-zA-Z0-9_]+:[a-zA-Z0-9_]+>", "</P>");
// "FORM." / "form." becomes "FORM".
addInvalidTagReplacePair("FORM\\.|form\\.", "FORM");
// "?>" becomes "/>".
addInvalidTagReplacePair("\\?>", "/>");
// NOTE(review): the pattern below is a single blank character replaced with nothing —
// possibly a non-breaking space or an entity that was lost in copying; verify.
addInvalidTagReplacePair(" ", "");
// "?xml:" becomes "xml".
addInvalidTagReplacePair("\\?xml:", "xml");
// Whole FORM and XMLNS spans are removed.
addInvalidTagReplacePair("<FORM.*?FORM>","");
addInvalidTagReplacePair("<XMLNS.*?XMLNS>","");
// Attributes with a prefix (prefix:name=).
addInvalidArrt("[0-9a-zA-Z_]+:[0-9a-zA-Z_]+=");
// DOCTYPE declaration is removed.
addInvalidTagReplacePair("<!DOCTYPE .*?>","");
// Empty <o:p> pairs and x:str="" are removed.
addInvalidTagReplacePair("<o:p></o:p>","");
addInvalidTagReplacePair("x:str=\"\"","");
登录
function loginPage(username, password) {
try {
var url = "http://www.kaixin001.com";
browser.url = url;
sleep(20000);
if (checkLogin()) {
logInfo("Login Sucessed. Do not need input username and password.");
return;
}
var xml = new XML(browser.document.xmlContent);
var emailInput = xml..INPUT.(@NAME == "email");
if (emailInput == null || emailInput.length() == 0) {
return;
}
browser.elementSetValue(emailInput, username);
browser.elementSetValue(xml..INPUT.(@NAME == "password"), password);
browser.elementSetChecked(xml..INPUT.(@NAME == "remember"), true);
browser.elementClick(xml..INPUT.(@ID == "btn_dl"));
sleep(10000);
var htmlContent = browser.document.xmlContent;
if (checkLogin()) {
logInfo("Login Sucessed. Do not need input code by verifyImage.");
return;
}
var xml = new XML(htmlContent);
try {
var verifyImage = xml..IMG.(@ID == "randimg");
// xml..AIMG.(@ID == "randddimg").length();
if (verifyImage != null && verifyImage.length() > 0 && verifyImage.@SRC.length() > 0) {
/* var verifyCode = inputVerifyCode(browser.readImageData("randimg"));
if (verifyCode != null) {
browser.elementSetValue(xml..INPUT.(@ID == "code"), verifyCode);
}
*/
/*saveErrorContent(url+"?user="+username+"&password="+password, htmlContent);
dbError("Need input verify code when login.");
*/
}
} catch (parseLoginError) {
hasError = true;
logError("Error when parse login page", parseLoginError);
}
} catch (err) {
hasError = true;
logError("Error in login", err);
}
}
function checkLogin() {
var xml = new XML(browser.document.xmlContent);
var loginedLink = xml..A.(@TITLE == "个人资料");
if (loginedLink.length() > 0 && (browser.url == "http://www.kaixin001.com/home/?l=a" || parseUserId(loginedLink.@HREF) == parseUserId(browser.url))) {
return true;
}
return false;
}
zhihu.com 知乎:postrawNode
///////////////////////////////////////////
/**
 * Cuts the serialized response into one text fragment per answer.
 *
 * The input is serialized with toString, \" and \/ are unescaped, and the text
 * is passed through hexToDec. Each fragment starts at an occurrence of
 * 'zm-item-answer ' and runs up to (not including) the next '","', or the next
 * ']' when no '","' follows. Scanning stops when no further start marker or
 * terminator is found.
 *
 * @param xml page object or text to split.
 * @returns array of string fragments (empty when nothing matches).
 */
function getPostRowNodes(xml) {
    var text = hexToDec(toString(xml).replace(/\\\"/g,'"').replace(/\\\//g,'/'));
    var fragments = [];
    var searchFrom = 0;
    for (;;) {
        var begin = text.indexOf('zm-item-answer ', searchFrom);
        if (begin < 0) {
            break;
        }
        var end = text.indexOf('","', begin);
        if (end < 0) {
            // Last entry of the list: fall back to the closing bracket.
            end = text.indexOf(']', begin);
        }
        if (end < 0) {
            break;
        }
        fragments.push(text.substring(begin, end));
        searchFrom = end;
    }
    return fragments;
}
/**
 * Extracts the subject text from a row fragment given as a string.
 *
 * The subject is the text between the first '">' found at or after '<dt>' and
 * the first '</dt>', with every \/ unescaped to /.
 *
 * @param rowNode row fragment (string).
 * @returns the subject as a string.
 * @throws "Can not parse subject" when the '">' or '</dt>' marker is missing,
 *         when they appear in the wrong order, or when isInvalidObject rejects
 *         the extracted text.
 */
function parseSubject(rowNode) {
    var bpos = rowNode.indexOf('<dt>');
    var epos = rowNode.indexOf('</dt>');
    var start = rowNode.indexOf('">', bpos);
    // Without both markers in order, substring would silently return unrelated text.
    if (epos < 0 || start < 0 || start + 2 > epos) {
        throw "Can not parse subject";
    }
    var titleNode = rowNode.substring(start + 2, epos).replace(/\\\//g,'/');
    if (isInvalidObject(titleNode)) {
        throw "Can not parse subject";
    }
    return titleNode;
}
///////////////////////
// Search URL for the keyword and result page. The keyword is a query-string value, so it is
// encoded with encodeURIComponent: encodeURI would leave "&", "=", "#" and "+" untouched and
// let a keyword containing them corrupt the query.
var url='http://search.dxy.cn/?words='+encodeURIComponent(theKeyword)+'&source=BBS&limit=15&o=1&page='+page;
/**
 * Loads a page, retrying once, with a raw-content fallback.
 *
 * loadPage(url) is tried up to two times. If both attempts throw, the result is
 * getHtmlContent() with "<!DOCTYPE html>" removed through replaceAll.
 *
 * @param url address to load.
 * @returns whatever loadPage returns, or the cleaned fallback content.
 */
function URL(url){
    for (var attempt = 0; attempt < 2; attempt++) {
        try {
            return loadPage(url);
        } catch (loadError) {
            // Deliberately ignored: retry once, then use the fallback below.
        }
    }
    return replaceAll(getHtmlContent(), "<!DOCTYPE html>", "");
}
//////////////////////
// Pause for a random whole number of milliseconds in [1000, 11000), log it,
// then call waitLoadPage().
var sleepTime = Math.floor(Math.random() * 10000 + 1000);
logInfo("Sleep=====" + sleepTime);
sleep(sleepTime);
waitLoadPage();
//
//////
// Derive the scrape cut-off from the stored last-scrape time, shifted with
// calcDate(dt, -1, 'd') — presumably one day earlier; confirm calcDate's semantics.
var dt = parseDateTime(lastScraptTime);
// Alternative starting point: the current date.
// var dt = getCurrentDate();
theLastScraptTime = calcDate(dt,-1,'d');
///////////////////
/**
 * Extracts the label text that sits between '<LI>' and the next '<A' in a node.
 *
 * The node is serialized with toString; the text after the first '<LI>' and
 * before the following '<A' is returned with all whitespace and ':' removed.
 *
 * @param node node (or text) to inspect.
 * @returns the cleaned label string.
 */
function parseRefInfo(node) {
    var markup = toString(node);
    var begin = markup.indexOf('<LI>');
    var end = markup.indexOf('<A', begin);
    var label = markup.substring(begin + 4, end);
    return label.replace(/\s+|:/g,'');
}
/////////////
// Fragment: start of a try block that loads the thread page (the matching catch is not part of this snippet).
try {
// Load the page.
logInfo("start loadPage: " + theThreadURL);
try{
xml = loadPage(theThreadURL);
}catch(err){
// loadPage failed: take the raw content from getXmlContent(), replace every
// "HTTP:=" with "MDA=" and parse the result as XML ourselves.
//println(err);
var str = getXmlContent();
str = replaceAll(str,"HTTP:=","MDA=");
//println(str);
xml = new XML(str);
//println(xml);
}
///// Page-encoding problems
// Scan the page's meta tags for a charset declaration such as <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
// Declare the page charset as gbk.
setWebCharset("gbk"); // If the page is found to be 'gbk' but the scraped data is still garbled, use the next statement as well.
setXmlCharset("gbk");
// Quoted-reply scraping (tail of a function whose header is not part of this snippet).
// When the node contains a DIV with CLASS "relyhf", return its text wrapped in 『 』
// followed by the text of the DIV with CLASS "yy_reply_cont"; otherwise return the node unchanged.
if(!isInvalidObject(node..DIV.(@CLASS == "relyhf"))){
var com=node..DIV.(@CLASS == "relyhf");
var com2=node..DIV.(@CLASS == "yy_reply_cont");
var ret ="『"+html2text(com)+"』"+html2text(com2);
return ret;
}
return node;
}
// Traversal used when the NEXT link carries no distinguishing class
// (function-body fragment: it returns the node or null).
var nextPageNode = null;
// Only scan when there is more than one candidate node.
if (pageNode.length() > 1) {
for(var i=0;isInvalidObject(nextPageNode) && i<pageNode.length(); ++i){
// The next-page link is recognised by its text being exactly "»".
if(html2text(pageNode[i]) == "»" ) {
nextPageNode = pageNode[i];
return nextPageNode;
}
}
}
return nextPageNode;
/////////////////////////////////////////////
// Script used when the post date is shown as a relative time such as “几天前” ("N days ago")
/////////////////////////////////////////////
function parsePostDateOfPost(rowNode) {
var node = rowNode..DIV.(@CLASS == "authi").EM;
if (isInvalidObject(node)) {
throw "Can not parse Dateofpost";
}
if (node.length() > 1) {
node = node[1];
}
if (toString(node).indexOf("TITLE")>=0){
var node_time=node.SPAN.@TITLE;
return parseDateTime(node_time);
}else{
node = html2text(node);
node= node.replace(" ","");
return parseDateTime(node);
}
}
// 或者 :
function parseIssueDate(rowNode) {
var node = rowNode..TD.(@CLASS == "by").EM;
if(node.length() > 1) {
node = node[0];
}
if (isInvalidObject(node)) {
throw "Can not parse IssueDate";
}
if (html2text(node).indexOf("前") >= 0){
node=html2text(node).replace(/\s+/g,"");
if(node=="半小时前"){
node="30分钟前";
}
var dateNode=parseDateTime(node);
}else{
var dateNode=parseDateTime(html2text(node));
}
return dateNode;
}
//在传入的行对象中解析首帖的发布时间,
//返回为字符串类型
function parseIssueDate(rowNode) {
var node = rowNode..TD.(@CLASS == "by")[0]..SPAN;
if (isInvalidObject(node)) {
node = rowNode..TD.(@CLASS == "by")[0].EM.A;
}
if (isInvalidObject(node)) {
throw "Can not parse IssueDate";
}
return parseDateTime(node);
}
// getThreadRowNodes 时当本版置顶贴用到的ID类不同时,用到的脚本
function getThreadRowNodes(xml) {
var rowNodes = xml..TABLE.(@ID =="threadlisttableid").TBODY.(/normalthread\_(\d+)/.test(@ID) || /stickthread\_(\d+)/.test(@ID));
if( isInvalidObject(rowNodes))
return rowNodes;
}
// Thread-id parsing for sites whose pages use more than one URL format.
/**
 * Extracts the numeric thread id from a thread URL.
 *
 * URLs containing "bbs/thread" are matched with thread-<digits>; all others
 * with tid=<digits>.
 *
 * The id is now held in a local variable; it used to be assigned to an
 * undeclared name and leaked into the global scope.
 *
 * @param parseUrl thread URL.
 * @returns the id as a string of digits.
 * @throws "Can not parse ThreadID." when the URL is unusable or has no id.
 */
function parseThreadId(parseUrl) {
    try {
        var pattern = parseUrl.indexOf("bbs/thread") >= 0
            ? /thread\-(\d+)/
            : /tid\=(\d+)/;
        // exec returns null on no match; the [1] access then throws and is
        // reported through the catch below, as before.
        var urlid = pattern.exec(parseUrl)[1];
        return urlid;
    } catch (err) {
        throw "Can not parse ThreadID.";
    }
}
/**
 * Long randomized pause between pages.
 *
 * Sleeps for a random whole number of milliseconds in [30000, 90000), logging
 * the chosen value first, then calls waitLoadPage() twice.
 */
function pagesleep(){
    var pauseMs = Math.floor(Math.random() * 10000 * 6 + 1000 * 30);
    logInfo("Sleep=====" + pauseMs);
    sleep(pauseMs);
    for (var round = 0; round < 2; round++) {
        waitLoadPage();
    }
}
// Read a script file and execute it one line at a time, logging before and after each line.
// SECURITY(review): every line of the file is passed to eval; only point this at trusted files.
var content = readFile("e:/myac.txt", 'utf-8');
// NOTE(review): `array` is not used in this snippet; it stays because the eval'd lines may refer to it.
var array = new Array();
var lines = content.split('\n');
for (var i =0; i <lines.length; ++i) {//lines.length
logInfo(" ******************************** "+i+" -- start "+lines[i]);
eval(lines[i]);
logInfo(" -------------------------------- "+i+" -- end "+lines[i]);
}
// Called once after all lines have run.
commit();
// Normalise the play counter: take the text of PlayNum, drop "views" and all commas, then pass it to getNumber.
PlayNum = getNumber(html2text(PlayNum).replace("views","").replace(/,/g,""));