Jatin-8898
10/2/2018 - 4:15 PM

WebCrawler using Java

//package com.webCrawler;
import java.util.*;
import java.net.*;
import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
public class WebCrawler {

	private Queue<String> queue;
	private List<String> discoveredWebsiteList;

	public WebCrawler() {

		// Instantiate the queue as a LinkedList (FIFO order drives the breadth-first crawl)
		this.queue = new LinkedList<>();
		// And an ArrayList to record every website discovered so far
		this.discoveredWebsiteList = new ArrayList<>();
	}


	public void discoverWeb(String root) {
		this.queue.add(root);			/* seed the queue with the root URL */
		this.discoveredWebsiteList.add(root);	/* and mark it as discovered */


		while (!queue.isEmpty()) {
			String v = this.queue.remove();		/* take the next URL off the queue */
			String rawHtml = readURL(v);		/* fetch its raw HTML */

			String regexp = "https?://(\\w+\\.)*(\\w+)";	/* matches http and https site roots */
			Pattern pattern = Pattern.compile(regexp);
			Matcher matcher = pattern.matcher(rawHtml);	/* scan the page for links */

			while (matcher.find()) {			/* for each link found on the page */

				String actualUrl = matcher.group();	/* the matched URL text */

				if (!discoveredWebsiteList.contains(actualUrl)) {	/* skip URLs we have already seen */

					discoveredWebsiteList.add(actualUrl);		/* record it as discovered */
					System.out.println("Website has been added " + actualUrl);
					queue.add(actualUrl);		/* and queue it for crawling */
				}
			}
		}
	}


	private String readURL(String v) {
		StringBuilder rawHtml = new StringBuilder();

		try {
			URL url = new URL(v);
			// Open a stream on the URL and wrap it in a buffered reader
			BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream()));
			String inputLine;

			// Accumulate the page line by line; StringBuilder avoids the cost of
			// repeated String concatenation inside the loop
			while ((inputLine = in.readLine()) != null) {
				rawHtml.append(inputLine);
			}

			in.close();

		} catch (Exception e) {
			e.printStackTrace();
		}

		return rawHtml.toString();
	}



	public static void main(String[] args) {

		WebCrawler crawler = new WebCrawler();

		// Pass the URL of the website you want to start crawling from
		// (GeeksforGeeks lives at the .org domain, not .com)
		String rootUrl = "https://www.geeksforgeeks.org";
		crawler.discoverWeb(rootUrl);
	}
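
	// As written, discoverWeb keeps going until the queue empties, which on the
	// real web effectively never happens. The method below is a minimal sketch
	// of a bounded variant, not part of the original code; the method name and
	// the maxPages parameter are illustrative assumptions.
	public void discoverWebBounded(String root, int maxPages) {
		this.queue.add(root);
		this.discoveredWebsiteList.add(root);
		int visited = 0;

		while (!queue.isEmpty() && visited < maxPages) {
			String v = this.queue.remove();
			visited++;				/* count this page against the limit */

			Matcher matcher = Pattern.compile("https?://(\\w+\\.)*(\\w+)")
						 .matcher(readURL(v));
			while (matcher.find()) {
				String actualUrl = matcher.group();
				if (!discoveredWebsiteList.contains(actualUrl)) {
					discoveredWebsiteList.add(actualUrl);
					queue.add(actualUrl);
				}
			}
		}
	}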


 }
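
To try it out, save the class as WebCrawler.java (the package declaration is commented out, so no directory structure is needed) and, assuming a JDK is on your PATH, compile and run it from the command line:

javac WebCrawler.java
java WebCrawler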