leodutra
6/5/2014 - 7:33 PM

Simple JavaScript Crawler (js, crawler, javascript)

Simple JavaScript Crawler (js, crawler, javascript)

/*
	JavaScript Link Crawler
	author: Leonardo Dutra (leodutra.br@gmail.com)
	
	Instructions: open browser console, paste, run.
*/


// Crawl budget: run() keeps visiting pages while `success` is below this value.
var limit = 30000;

// Hash of every discovered link, keyed by URI; values are LinkInfo records.
var linkHolder = {};
// Not referenced anywhere in this section; kept in case other code relies on it.
var SCPReferenciado = [];
// Host names whose pending links ran out at least once; getUnvisitedURI() prefers them.
var visitedDomains = [];
// The counters are declared one by one on purpose: the chained form
// `var success = found = executionTime = 0` only declares `success` and leaks the
// other two as implicit globals (a ReferenceError in strict mode).
var success = 0; // pages fetched successfully
var found = 0; // distinct links registered by pushLinks()
var executionTime = 0; // Date.now() timestamp taken by start()
// Host name currently being crawled (set by getUnvisitedURI()).
var currentDomain = null;
// True while the crawler is halted; start() clears it, run() sets it.
var stopped = false;

/**
 * Extracts candidate page links from a chunk of markup or script text.
 *
 * A candidate is either the value of an href/src attribute or any text that
 * follows a quote or '>' and ends in one of the known page extensions
 * (html, php, asp, do, jsp, htm), optionally followed by a query string.
 *
 * @param {string} html   Source text of the fetched page.
 * @param {string} origin URI the text came from; forwarded to relativeToAbsolute().
 * @returns {Array|null} Result of relativeToAbsolute() for the cleaned candidates,
 *                       or null when nothing matched.
 */
function getLinks(html, origin) {
	// Finds href/src attributes and quoted or tag-delimited strings that look like pages.
	var linkPattern = /(?:\b(?:href|src)[^=]*=|["'>])[^"'<>]+?\.(?:html|php|asp|do|jsp|htm)\b[^"'<>\s]*/gim;
	// Whitespace, quotes, '>' and a leading `href=` / `src=` that the pattern above drags along.
	var noisePattern = /["\s'>\r\n]+|^(?:href|src)[^=]*=/gim;

	var candidates = removeComments(html).match(linkPattern);
	if (!candidates) {
		return null;
	}

	for (var k = 0; k < candidates.length; k++) {
		candidates[k] = candidates[k].replace(noisePattern, '');
	}
	return relativeToAbsolute(candidates, origin);
}

/**
 * Strips HTML comment tags and JavaScript comments from a chunk of source text
 * so that links inside comments are not picked up by getLinks().
 *
 * Three kinds of text are removed:
 *  - an HTML comment opener, from `<!-` up to the first `>`;
 *  - a JS block comment, from the opening marker up to the first closing marker;
 *  - a JS line comment, from `//` to the end of the line, unless the `//`
 *    directly follows ':' (so `http://` survives).
 *
 * @param {string} html Source text.
 * @returns {string} The text without the comments described above.
 */
function removeComments(html) {
	return html
		// The block-comment part uses a lazy "anything up to the first closing marker".
		// The previous pattern `(?:[^*]|\*[^\/])*?` consumed '*' characters in pairs and
		// therefore never terminated on comments ending in two or more asterisks
		// before the slash (doc-comment banners, for example).
		.replace(/<!-[^>]*>|\/\*[\s\S]*?\*\//g, '')
		// TODO: "//" inside string literals or protocol-relative URLs is still treated as a comment.
		.replace(/([^:])\/\/.*/g, '$1');
}

/**
 * Builds a detached <a> element pointing at the given URI.
 * Callers read URL parts from the element (this file uses `hostname`),
 * which makes it a stand-in for a `location` object.
 *
 * @param {string} uri URI to assign to the element's href.
 * @returns {Object} The created anchor element.
 */
function getLocationInfo(uri) {
	var anchor = document.createElement('a');
	anchor.href = uri;
	return anchor;
}

/**
 * Extracts the scheme-plus-authority prefix of a URI
 * (everything before the first '/' of the path).
 *
 * @param {string} uri Absolute URI of the form `scheme://host...`.
 * @returns {string} The matched prefix, without a trailing slash.
 * @throws {TypeError} When the URI does not start with `scheme://host`.
 */
function getDomain(uri) {
	var schemeAndHost = /^\w+:\/\/[^\/]+/;
	var hit = uri.match(schemeAndHost);
	return hit[0];
}

/**
 * Turns every entry of `links` into an absolute URI, resolving relative
 * entries against `origin`. The array is modified in place and returned.
 *
 * Resolution is delegated to the standard URL constructor, which handles
 * cases the previous string concatenation got wrong: protocol-relative links
 * (`//host/page.html`), `../` segments and origins whose directories contain
 * dots. If URL is unavailable or rejects the input, the legacy concatenation
 * is used instead.
 *
 * @param {Array} links   Links as found in the page (absolute or relative).
 * @param {string} origin Absolute URI of the page the links were found in.
 * @returns {Array} The same array, now holding absolute URIs.
 */
function relativeToAbsolute(links, origin) {
	// Legacy resolution: prefix with the origin's domain or directory.
	function concatenate(link) {
		if (link.search(/^\w+:\/\//i) !== -1) {
			return link;
		}
		var base = link.charAt(0) === '/' ? getDomain(origin) : getAbsolutePath(origin);
		return base + '/' + link;
	}

	var resolved;
	for (var i = 0; i < links.length; ++i) {
		try {
			resolved = new URL(links[i], origin).href;
		} catch (e) {
			resolved = concatenate(links[i]);
		}
		// Collapse repeated slashes (except the one after the scheme) so that
		// equivalent paths end up under a single key.
		links[i] = resolved.replace(/([^:])\/\/+/g, '$1/');
	}
	return links;
}

// function backTrackURI(uri) {
// 	return uri.replace(/(?:http[s]?:\/)?\/*?[^\/]+?\/?$/im, '');
// }

/**
 * Returns the "directory" part of an absolute URI: scheme, authority and the
 * path up to and including its last '/'. Query string and fragment are ignored.
 *
 * The previous single-regex version stopped at the first path segment that
 * contained a dot (so `/v1.2/page.html` lost its directory) and could pull
 * query-string text containing '/' into the result.
 *
 * @param {string} uri Absolute URI of the form `scheme://host/...`.
 * @returns {string} The URI cut after the last path slash, or just
 *                   `scheme://host` when there is no path.
 * @throws {TypeError} When the URI does not start with `scheme://host`.
 */
function getAbsolutePath(uri) {
	var root = uri.match(/^\w+:\/\/[^\/?#]+/)[0];
	// Drop "?query" and "#fragment" before looking for the last path separator.
	var path = uri.slice(root.length).replace(/[?#][\s\S]*$/, '');
	return root + path.slice(0, path.lastIndexOf('/') + 1);
}

/**
 * Picks the next link to crawl, preferring domains that were already worked on.
 *
 * Order of preference:
 *  1. a pending link containing one of `visitedDomains` (in list order);
 *  2. a pending link containing `currentDomain`; when none is left the
 *     current domain is appended to `visitedDomains`;
 *  3. any pending link, whose host name then becomes `currentDomain`.
 *
 * A link is pending while its status is '_'.
 *
 * @returns {string|null} The chosen link, or null when nothing is pending.
 */
function getUnvisitedURI() {
	// First pending link whose text contains `fragment`, or null.
	function pendingLinkContaining(fragment) {
		for (var uri in linkHolder) {
			if (uri.indexOf(fragment) !== -1 && linkHolder[uri].status === '_') {
				return uri;
			}
		}
		return null;
	}

	var candidate;
	var domainCount = visitedDomains.length;
	for (var d = 0; d < domainCount; d++) {
		candidate = pendingLinkContaining(visitedDomains[d]);
		if (candidate !== null) {
			return candidate;
		}
	}

	if (currentDomain) {
		candidate = pendingLinkContaining(currentDomain);
		if (candidate !== null) {
			return candidate;
		}
		visitedDomains.push(currentDomain);
	}

	for (var anyLink in linkHolder) {
		if (linkHolder[anyLink].status === '_') {
			currentDomain = getLocationInfo(anyLink).hostname;
			return anyLink;
		}
	}
	return null;
}

/**
 * Record kept for every discovered link. Defined as a constructor so that all
 * records share one shape and the default status lives on the prototype.
 *
 * Status codes used in this file: '_' unvisited, 'V' visited, 'X' broken,
 * 'R' redirected.
 *
 * @param {string} [origin] URI of the page the link was found on. When it is
 *        omitted (as for the seed link) the record reports '' instead of
 *        `undefined`, which previously ended up as the text "undefined" in reports.
 */
function LinkInfo(origin) {
	this.origin = origin == null ? '' : origin;
}
LinkInfo.prototype = {
	status: '_',
	origin: ''
};

/**
 * Registers links in the `linkHolder` hash, skipping those already present,
 * and bumps the `found` counter for every new entry.
 * The list is walked from its last element to its first.
 *
 * @param {Array|null} links Links to register; a falsy value is ignored.
 * @param {string} [origin]  URI of the page the links were found on.
 */
function pushLinks(links, origin) {
	if (!links) {
		return;
	}
	for (var idx = links.length - 1; idx >= 0; idx--) {
		var uri = links[idx];
		if (!linkHolder[uri]) {
			linkHolder[uri] = new LinkInfo(origin);
			found++;
		}
	}
}

/**
 * Formats the time elapsed since `executionTime` as `HH:MM:SS.mmm`.
 * The value is the time-of-day part of an ISO timestamp, so it wraps after 24 hours.
 *
 * @returns {string} Elapsed time.
 */
function getExecutionTime() {
	var elapsed = new Date(Date.now() - executionTime);
	var iso = elapsed.toISOString();
	// Keep what sits between the 'T' separator and the trailing 'Z'.
	return iso.slice(iso.lastIndexOf('T') + 1, -1);
}

/**
 * Renders a URI as an HTML anchor that opens in a new tab.
 *
 * The URI comes from crawled pages and is written into the report document,
 * so HTML-significant characters are escaped for both the attribute and the
 * visible text.
 *
 * @param {string} href URI to render.
 * @returns {string} HTML for the anchor.
 */
function toLink(href) {
	var safe = String(href)
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;')
		.replace(/"/g, '&quot;')
		.replace(/'/g, '&#39;');
	return '<a href="' + safe + '" target="_blank">' + safe + '</a>';
}

/**
 * Logs a short summary of the crawl to the console and returns the known
 * links grouped by status.
 *
 * Every returned entry is an HTML row: the status code, the link and, inside
 * a <span>, the page it was found on.
 *
 * @returns {{broken: Array, visited: Array, unvisited: Array, redirected: Array}}
 */
function status() {
	// One list per status code. A lookup table replaces the former `switch`, which
	// assigned to an undeclared `type` variable (implicit global) and re-used the
	// previous link's list whenever a status code was not recognised.
	var lists = {
		'_': [], // unvisited
		'V': [], // visited
		'X': [], // broken
		'R': [] // redirected
	};
	var hasOwn = Object.prototype.hasOwnProperty;
	var info;

	for (var link in linkHolder) {
		info = linkHolder[link];
		if (!hasOwn.call(lists, info.status)) {
			continue; // unknown status code: leave it out of the report
		}
		lists[info.status].push(
			info.status + ' ' + toLink(link) + '<span>&nbsp;' + toLink(info.origin) + '</span>'
		);
	}

	console.log([
		'Execution time: ' + getExecutionTime(),
		found + ' found',
		lists['_'].length + ' unvisited',
		lists.V.length + ' visited',
		lists.R.length + ' redirected',
		lists.X.length + ' broken'
	].join('\n'));

	return {
		broken: lists.X,
		visited: lists.V,
		unvisited: lists['_'],
		redirected: lists.R
	};
}

/**
 * Opens a new window containing a printable report: the summary counters
 * followed by the broken, redirected, visited and unvisited links, each group
 * sorted alphabetically. Falls back to an alert when the popup is blocked.
 *
 * Calls status(), so the summary is also logged to the console.
 */
function showLinks() {

	var data = status();
	var br = '<br/>';
	var logInfo = [

		'Execution time: ' + getExecutionTime(),
		found + ' found',
		data.unvisited.length + ' unvisited',
		data.visited.length + ' visited',
		data.redirected.length + ' redirected',
		data.broken.length + ' broken',
		br,

		'### BROKEN: ' + data.broken.length,
		data.broken.sort().join(br),
		br,

		'### REDIRECTED: ' + data.redirected.length,
		data.redirected.sort().join(br),
		br,

		'### VISITED: ' + data.visited.length,
		data.visited.sort().join(br),
		br,

		'### UNVISITED: ' + data.unvisited.length,
		data.unvisited.sort().join(br)
	];

	// An empty URL opens a blank document; passing null relied on how the
	// browser coerces it to a URL.
	var popup = open('', '_blank');
	if (popup) {
		popup.document.write(
			'<head><style>a {color: #555;text-decoration: none;} span a {color: #bbb;}</style></head>'+
			'<body>'+
			'<div style="white-space:nowrap;font-size: 12px; font-family: Consolas,\'Lucida Console\',\'DejaVu Sans Mono\',monospace;">' +
				logInfo.join(br) +
			// The stray "</pre>" that used to precede "</div>" had no opening tag.
			'</div></body>'
		);
		// Finish the document stream so the new window stops showing as "loading".
		popup.document.close();
	}
	else {
		alert('Popup bloqueado.');
	}
}

/**
 * Fetches one link with jQuery.ajax and continues the crawl when the request settles.
 *
 * On failure the link is marked broken ('X'). On success it is marked
 * visited ('V'), the `success` counter is incremented and, when the response
 * is text, the links found in it are registered with the fetched link as
 * their origin. Either way run() is called afterwards.
 *
 * @param {string|null} link Link to fetch; a falsy value means there is
 *        nothing left to crawl and only logs a message.
 */
function visitLink(link) {
	if (!link) {
		console.log('FINISHED (no more links to crawl)');
		return;
	}

	function onFailure() {
		linkHolder[link].status = 'X';
		run();
	}

	function onSuccess(body) {
		success++;
		linkHolder[link].status = 'V';
		if (typeof body === 'string') {
			pushLinks(getLinks(body, link), link);
		}
		run();
	}

	// Credentials are not forwarded: no xhrFields.withCredentials is set on the request.
	jQuery.ajax({ url: link }).fail(onFailure).done(onSuccess);
}

/**
 * Performs one crawl step: visits the next pending link while the crawler is
 * not stopped and fewer than `limit` pages were fetched successfully.
 *
 * Otherwise the crawler is marked as stopped, the report is shown and `limit`
 * is raised to the next multiple of itself above `success`, so a later
 * start() can crawl another batch.
 */
function run() {
	var mayContinue = !stopped && success < limit;
	if (mayContinue) {
		visitLink(getUnvisitedURI());
		return;
	}

	stopped = true;
	showLinks();
	// `| 0` truncates the quotient to an integer.
	limit = ((success / limit) | 0) * limit + limit;
}

/**
 * Starts (or resumes) the crawl: clears the stop flag, restarts the execution
 * timer and triggers the first step.
 *
 * @param {number} [newLimit] Optional new value for the global `limit`.
 *        The parameter used to be named `limit`, which shadowed the global
 *        and was never read, so `start(500)` silently kept the old limit.
 *        Non-numeric or non-positive values are ignored.
 */
function start(newLimit) {
	if (typeof newLimit === 'number' && newLimit > 0) {
		limit = newLimit;
	}
	stopped = false;
	console.log('RUNNING...\nUse status() and showLinks()');
	executionTime = Date.now();
	run();
}

/**
 * Loads an external script by appending a <script> element to the page.
 *
 * @param {string} src          URL of the script.
 * @param {Function} [onload]   Called once the script has loaded.
 * @param {Function} [onerror]  Called when the script could not be loaded.
 */
function importScript(src, onload, onerror) {
	var el = document.createElement('script');
	el.type = 'text/javascript';
	// Handlers are attached before `src` is set so that no event can be missed.
	if (typeof onload === 'function') el.onload = onload;
	if (typeof onerror === 'function') el.onerror = onerror;
	el.src = src;
	(document.head || document.body).appendChild(el);
}

// START
// The crawl begins from the script's load event rather than after a fixed
// 2 second delay, which raced the download: on a slow connection jQuery was
// not defined yet when the first request was made.
importScript(
	'//ajax.googleapis.com/ajax/libs/jquery/1.8.1/jquery.min.js',
	function () {
		start(); // wrapped so the load event object is not passed on as an argument
	},
	function () {
		console.log('Could not load jQuery; the crawler was not started.');
	}
);
pushLinks([location.href]); // seed the crawl with the current page

//'limit:'+limit;