Web Crawler
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
* Created by Piyush Juneja on 3/31/17.
*/
/**
 * An index entry: a word together with the IDs of the pages it was found on.
 *
 * <p>Two {@code Word} instances are equal when their word text is equal; the
 * postings are not part of the identity.
 */
public class Word implements Serializable {
    String word;
    ArrayList<Integer> postings;
    public static final long serialVersionUID = -3696191086353573895L;
    // Never assigned in this class; kept so the field layout stays unchanged.
    int urlID;

    /**
     * Creates an entry for {@code word} whose first posting is {@code urlID}.
     *
     * @param word  the word text
     * @param urlID ID of the first page the word was seen on
     */
    public Word(String word, int urlID) {
        postings = new ArrayList<>();
        this.word = word;
        this.postings.add(0, urlID);
    }

    /**
     * Appends another page ID to the postings list.
     *
     * @param urlID ID of a page containing this word
     */
    public void addURLID(int urlID) {
        postings.add(urlID);
    }

    /** @return the word text */
    public String getWord() {
        return word;
    }

    /** @return the live postings list (not a copy) */
    public List<Integer> getList() {
        return postings;
    }

    /**
     * Compares by word text only.
     *
     * @param obj the object to compare with; may be {@code null} or of another type
     * @return {@code true} if {@code obj} is a {@code Word} with the same text
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // instanceof also rejects null, so no cast can fail below.
        if (!(obj instanceof Word)) {
            return false;
        }
        String otherText = ((Word) obj).getWord();
        return word == null ? otherText == null : word.equals(otherText);
    }

    /** Hash consistent with {@link #equals(Object)}: derived from the word text only. */
    @Override
    public int hashCode() {
        return word == null ? 0 : word.hashCode();
    }
}
import java.io.Serializable;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
/**
 * Worker that scans one slice of {@code Search.wordList} for the query terms
 * and records every matching page in the shared {@code Search.resultSet}.
 */
public class SearchThread implements Serializable, Runnable {
    int start;
    int finish;
    String[] terms;

    /**
     * @param start  first index (inclusive) of the slice of {@code Search.wordList} to scan
     * @param finish last index (inclusive) of the slice
     * @param terms  the query terms to look for
     */
    public SearchThread(int start, int finish, String[] terms) {
        this.start = start;
        this.finish = finish;
        this.terms = terms;
    }

    /**
     * Looks a term up in the whole word list, ignoring case.
     *
     * @param term the term to find
     * @return the first matching {@code Word}, or {@code null} if there is none
     */
    public Word findTerm(String term) {
        for (Word word : Search.wordList) {
            if (word.getWord().equalsIgnoreCase(term)) {
                return word;
            }
        }
        return null;
    }

    /**
     * Scans the assigned slice once per term. Each posting of a matching word
     * either adds a new result (score 1) or bumps the score of the existing one.
     */
    @Override
    public void run() {
        if (terms == null || Search.wordList == null) {
            return;
        }
        // Clamp to the real list bounds so a stale or oversized range cannot throw.
        int first = Math.max(start, 0);
        int last = Math.min(finish, Search.wordList.size() - 1);

        for (String term : terms) {
            for (int i = first; i <= last; i++) {
                Word candidate = Search.wordList.get(i);
                if (!candidate.getWord().equalsIgnoreCase(term)) {
                    continue;
                }
                for (int id : candidate.getList()) {
                    String url = Search.pageList.get(id).getURL();
                    Result result = new Result(url, id);
                    // Lookup-then-update must be atomic across workers: the
                    // synchronized list only protects individual calls, so two
                    // threads could otherwise both miss and both add.
                    synchronized (Search.resultSet) {
                        int index = Search.resultSet.indexOf(result);
                        if (index >= 0) {
                            Search.resultSet.get(index).incrementScore();
                        } else {
                            Search.resultSet.add(result);
                        }
                    }
                }
            }
        }
    }
}
import java.io.File;
import java.io.Serializable;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
/**
 * Loads the saved page and word tables and answers keyword queries by fanning
 * the word list out over a fixed number of {@link SearchThread} workers.
 *
 * <p>The tables and the result list are held in static fields, so they are
 * shared by every {@code Search} instance.
 */
public class Search {
    /** Number of slices / worker threads the word list is split into. */
    private static final int THREAD_COUNT = 5;

    static List<Page> pageList;
    private String pageListFile;
    static List<Result> resultSet;
    static List<Word> wordList;
    private String wordListFile;
    private FileUtils fl;

    /**
     * @param wordListFile path of the serialized word table
     * @param pageListFile path of the serialized page table
     */
    public Search(String wordListFile, String pageListFile) {
        this.wordListFile = wordListFile;
        this.pageListFile = pageListFile;
        fl = new FileUtils();
        resultSet = Collections.synchronizedList(new ArrayList<Result>()); //code from project page on sync
        setup(wordListFile, pageListFile);
    }

    /**
     * (Re)loads both tables from disk. Failures are printed; the affected
     * static list is then left as whatever {@code FileUtils} returned.
     */
    public synchronized void setup(String wordListFile, String pageListFile) {
        try {
            pageList = fl.getPageList(pageListFile);
            wordList = fl.getWordList(wordListFile);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a query and returns the matching pages, best score first.
     *
     * @param query space-separated search terms
     * @return the shared result list, sorted by descending score; empty when
     *         the query is {@code null} or the tables could not be loaded
     */
    public List<Result> executeQuery(String query) {
        nullCheck();
        // Start from a clean slate so results of an earlier query on this
        // instance do not leak into (and inflate) this one.
        resultSet.clear();
        if (query == null || wordList == null || pageList == null || wordList.isEmpty()) {
            return resultSet;
        }

        String[] keywords = query.split(" ");
        int totalSize = wordList.size();

        // Split [0, totalSize) into THREAD_COUNT contiguous, non-overlapping
        // slices; slices that would be empty (small lists) get no thread.
        List<Thread> workers = new ArrayList<Thread>();
        for (int slice = 0; slice < THREAD_COUNT; slice++) {
            int startPoint = (int) ((long) slice * totalSize / THREAD_COUNT);
            int endPoint = (int) ((long) (slice + 1) * totalSize / THREAD_COUNT) - 1; // inclusive
            if (endPoint < startPoint) {
                continue;
            }
            workers.add(new Thread(new SearchThread(startPoint, endPoint, keywords)));
        }

        for (Thread worker : workers) {
            worker.start();
        }

        // Wait for every worker before sorting: sorting while a worker is
        // still writing would corrupt the result. Interrupts are remembered
        // and restored afterwards instead of being swallowed.
        boolean interrupted = false;
        for (Thread worker : workers) {
            boolean joined = false;
            while (!joined) {
                try {
                    worker.join();
                    joined = true;
                } catch (InterruptedException e) {
                    interrupted = true;
                }
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }

        sort();
        return resultSet;
    }

    /** Reloads the tables if either of them is missing. */
    public void nullCheck() {
        if (pageList == null || wordList == null)
            setup(this.wordListFile, this.pageListFile);
    }

    /** Sorts the shared result list using {@code Result}'s natural ordering. */
    public void sort() {
        Collections.sort(resultSet);
    }
}
import java.io.Serializable;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
public class Result implements Serializable, Comparable<Result> {
public int score;
public static final long serialVersionUID = -938761094876384658L;
public String url;
public int urlID;
public Result(String url, int urlID) {
this.url = url;
this.urlID = urlID;
score = 1;
}
public void updateScore(int score) {this.score += score;} //TODO: FIX: PARAM IS ALWAYS 783
public void incrementScore() {score++;}
public int getScore() {return score;}
public String getURL() {return url;}
public int getURLID() {return urlID;}
@Override
public boolean equals(Object obj) {
if (obj instanceof Result) {
Result result = (Result) obj;
if (this.url.equals(result.url) || this.urlID == result.urlID) {
return true;
}
}
return false;
}
public int compareTo(Result candidate) {
if (this.score > candidate.score)
return -1;
else if (this.score < candidate.score)
return 1;
else
return 0;
}
}
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.print.Doc;
import java.io.IOException;
import java.io.Serializable;
/**
* Created by SahilMPattni on 3/31/2017.
*/
/**
 * Thin wrapper around Jsoup: fetches a page and extracts its links and body text.
 */
public class Parser {
    public Parser() {}

    /**
     * Downloads and parses the page at {@code url} (3 second timeout).
     *
     * @param url the address to fetch
     * @return the parsed document, never {@code null}
     * @throws ParseException if {@code url} is null or empty, or the fetch fails;
     *         for a failed fetch the underlying exception is attached as the cause
     */
    public Document getDocument(String url) throws ParseException {
        if (url == null)
            throw new ParseException("getDocument() failed. String url is null.");
        if (url.equals(""))
            throw new ParseException("getDocument() failed. String url is empty.");

        Document d;
        try {
            System.out.println(url);
            d = Jsoup.connect(url).timeout(3000).get();
        } catch (Exception e) {
            // Keep the original failure reachable for debugging instead of dropping it.
            ParseException failure = new ParseException("getDocument() failed. Connection failed.");
            failure.initCause(e);
            throw failure;
        }
        if (d == null)
            throw new ParseException("getDocument() failed. Docoument is null.");
        return d;
    }

    /**
     * @param doc the document to search
     * @return every anchor element of {@code doc} that has an {@code href} attribute
     * @throws ParseException if {@code doc} is null
     */
    public Elements getLinks(Document doc) throws ParseException {
        if (doc == null)
            throw new ParseException("getLinks() failed. Document parameter is null.");
        return doc.select("a[href]");
    }

    /**
     * @param doc the document to read
     * @return the text of the document body, or an empty string if it has no body
     * @throws ParseException if {@code doc} is null
     */
    public String getBody(Document doc) throws ParseException {
        if (doc == null)
            throw new ParseException("getBody() failed. Document parameter is null.");
        Element content = doc.body();
        return content != null ? content.text() : "";
    }

    /** Manual smoke test: prints the body text of a fixed page. */
    public static void main(String[] args) throws ParseException {
        Parser p = new Parser();
        Document doc = p.getDocument("http://www.cs.purdue.edu");
        // Go through getBody() so a page without a body prints "" instead of throwing.
        String content = p.getBody(doc);
        System.out.println(content);
    }
}
import java.io.Serializable;
/**
* Created by SahilMPattni on 3/31/2017.
*/
/**
 * Checked exception raised when a page cannot be fetched or parsed.
 */
public class ParseException extends Exception {
    /**
     * @param message description of what failed
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * @param message description of what failed
     * @param cause   the underlying exception, kept for diagnostics
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}
import java.io.Serializable;
/**
* Created by piyushjuneja on 3/31/17.
*/
public class Page implements Comparable, Serializable {
public static final long serialVersionUID = -1827677255104766839L;
String url;
private int urlID;
public Page(String url, int urlID) {
this.url = url;
this.urlID = urlID;
}
public String getURL() {
return this.url;
}
public int getURLID() {
return this.urlID;
}
@Override
public int compareTo(Object o) {
Page candidate = (Page) o;
if (candidate.getURLID() < this.getURLID()) return -1; // Less than this
else if (candidate.getURLID() > this.getURLID()) return 1; // Greater than this
else return 0; // equal to this
}
public boolean equals(Object obj) {
Page candidate = (Page) obj;
if (candidate.getURL().equals(this.getURL()) ||
candidate.getURLID() == this.getURLID())
return true;
else
return false;
}
}
import java.io.Serializable;
public class Node {
private Object data;
private Node next;
private Node prev;
public Node(Object obj) {
this.data = obj;
}
public void setNext(Node next) {
this.next = next;
}
public void setPrev(Node prev) {
this.prev = prev;
}
public Node getNext() {
return this.next;
}
public Node getPrev() {
return this.prev;
}
public Object getData() {
return this.data;
}
}
import java.io.Serializable;
/**
 * FIFO queue built from {@link Node}s. The chain always ends in a sentinel
 * node ({@code tail}) whose data is {@code null}; {@code null} elements are
 * never stored.
 *
 * <p>All operations are synchronized on the queue instance.
 */
public class MyQueue {
    int count;
    Node head;
    Node tail;
    // Last node that carries data; null while the queue is empty. Lets add()
    // append in constant time instead of walking the chain on every insert.
    private Node last;

    /** Creates an empty queue. */
    public MyQueue() {
        head = new Node(null);
        tail = new Node(null);
        count = 0;
        head.setNext(tail);
    }

    /**
     * Appends {@code o} to the end of the queue.
     *
     * @param o the element to enqueue; {@code null} is silently ignored
     */
    public synchronized void add(Object o) {
        if (o == null) {
            return;
        }
        Node node = new Node(o);
        node.setNext(tail);
        if (count == 0) {
            head = node;
        } else {
            last.setNext(node);
        }
        last = node;
        count++;
    }

    /**
     * @return the node at the front of the queue without removing it, or
     *         {@code null} if the queue is empty
     */
    public synchronized Node peek() {
        if (isEmpty()) return null;
        return head;
    }

    /**
     * Removes and returns the node at the front of the queue.
     *
     * @return the removed node, or {@code null} if the queue is empty
     */
    public synchronized Node remove() { //CHANGELOG: added synchronized
        if (isEmpty()) {
            return null;
        }
        Node toReturn = head;
        head = head.getNext();
        count--;
        if (count == 0) {
            // head is now the sentinel; there is no data node to append after.
            last = null;
        }
        return toReturn;
    }

    /** @return {@code true} if the queue holds no elements */
    public synchronized boolean isEmpty() {
        return count == 0;
    }

    /** @return the number of elements currently queued */
    public synchronized int size() {
        return count;
    }
}
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* Created by Sahil Pattni on 16-Apr-17.
*/
/**
 * Saves and loads the crawler's word and page tables using Java serialization.
 *
 * <p>NOTE(review): loading uses native Java deserialization, which is only safe
 * for files this program wrote itself; do not point it at untrusted input.
 */
public class FileUtils {

    /**
     * Serializes the word table to {@code filePath}.
     *
     * @return {@code true} on success; {@code false} if an argument is null or writing failed
     */
    public boolean saveWordTable(List<Word> wordTable, String filePath)
    {
        return writeTable(wordTable, filePath);
    }

    /**
     * Serializes the page table to {@code filePath}.
     *
     * @return {@code true} on success; {@code false} if an argument is null or writing failed
     */
    public boolean savePageTable(List<Page> pageTable, String filePath)
    {
        return writeTable(pageTable, filePath);
    }

    /**
     * Loads a page table previously written by {@link #savePageTable}.
     *
     * @return the stored list, or {@code null} if {@code filePath} is null, the
     *         file cannot be read, or it does not contain a list
     */
    @SuppressWarnings("unchecked") // element type cannot be checked at runtime; the file is written by savePageTable
    public List<Page> getPageList(String filePath)
    {
        Object stored = readTable(filePath);
        return (stored instanceof List) ? (List<Page>) stored : null;
    }

    /**
     * Loads a word table previously written by {@link #saveWordTable}.
     *
     * @return the stored list, or {@code null} if {@code filePath} is null, the
     *         file cannot be read, or it does not contain a list
     */
    @SuppressWarnings("unchecked") // element type cannot be checked at runtime; the file is written by saveWordTable
    public List<Word> getWordList(String filePath)
    {
        Object stored = readTable(filePath);
        return (stored instanceof List) ? (List<Word>) stored : null;
    }

    /** Writes one object to a file, always closing the streams; reports success. */
    private boolean writeTable(Object table, String filePath)
    {
        if (table == null || filePath == null)
            return false;
        // Two resources so the file stream is closed even if wrapping it fails.
        try (FileOutputStream fos = new FileOutputStream(filePath);
             ObjectOutputStream oos = new ObjectOutputStream(fos))
        {
            oos.writeObject(table);
            return true;
        }
        catch (Exception e)
        {
            e.printStackTrace();
            return false;
        }
    }

    /** Reads one object from a file, always closing the streams; null on any failure. */
    private Object readTable(String filePath)
    {
        if (filePath == null)
            return null;
        try (FileInputStream fis = new FileInputStream(filePath);
             ObjectInputStream ois = new ObjectInputStream(fis))
        {
            return ois.readObject();
        }
        catch (Exception e)
        {
            e.printStackTrace();
            return null;
        }
    }
}
import sun.swing.FilePane;
import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;
/**
* Created by Sahil Pattni on 17-Apr-17.
*/
/**
 * Command-line entry point: crawls a fixed site, saves the resulting tables,
 * then answers queries typed on standard input until the user declines.
 */
public class Driver {
    /** Where the page table is saved to and loaded from. */
    private static final String PAGE_LOCATION =
            "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\parsed.txt";
    /** Where the word table is saved to and loaded from. */
    private static final String WORD_LOCATION =
            "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\words.txt";

    private FileUtils fu;
    // Currently unused.
    private List<Page> pageFile;
    private List<Word> wordFile;

    /** Crawls the hard-coded seed site with a page limit of 10. */
    public void crawl() {
        Crawler cl = new Crawler("https://www.investing.com/", "www.investing.com", 10);
        cl.crawl();
    }

    public static void main(String[] args) {
        Driver d = new Driver();
        d.crawl();
        d.save();
        Scanner s = new Scanner(System.in);
        boolean doneSearching = false;
        while (!doneSearching) {
            System.out.println("Enter Query");
            String query = s.nextLine();
            d.search(query);
            System.out.println();
            System.out.println("Do you want to continue (yes/no)");
            String answer = s.nextLine();
            if (answer.equalsIgnoreCase("no"))
                doneSearching = true;
        }
    }

    /** Writes the crawler's page and word tables to their fixed locations. */
    public void save() {
        fu = new FileUtils();
        fu.savePageTable(Crawler.parsed, PAGE_LOCATION);
        fu.saveWordTable(Crawler.words, WORD_LOCATION);
    }

    /**
     * Runs {@code query} against the saved tables and prints one numbered line
     * per result, best score first.
     *
     * @param query space-separated search terms
     */
    public void search(String query) {
        Search s = new Search(WORD_LOCATION, PAGE_LOCATION);
        List<Result> results = s.executeQuery(query);
        Collections.sort(results);
        //Output
        int currentResult = 0;
        System.out.println("Query: " + query);
        for (Result result: results) {
            System.out.println("("+currentResult+")" + result.getURL() + " | " +
                    "score: " + result.getScore());
            currentResult++; // advance the index shown in front of each result
        }
    }
}
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.print.Doc;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
* Created by Sahil Pattni on 04-Apr-17.
*/
/**
 * Breadth-first crawler. Starting from a seed URL it fetches pages, queues the
 * links found on them, and builds two tables: the pages parsed ({@code parsed})
 * and an index of words to page IDs ({@code words}).
 *
 * <p>All crawl state is static, so only one crawl can be in progress at a time.
 * NOTE(review): {@code domain} is stored and {@link #isInDomain} exists, but
 * the crawl loop never restricts links to that domain; confirm whether it should.
 */
public class Crawler {
    static String seedURL;
    static int currentID;
    static String domain;
    static int limit;
    MyQueue toParse;
    static Parser parser;
    static int totalURLs;
    static List<Page> parsed = new ArrayList<>();
    static List<String> visited = new ArrayList<>();
    static List<Word> words = new ArrayList<>();

    /**
     * Prepares a fresh crawl.
     *
     * @param seed   first URL to visit
     * @param domain domain the crawl is associated with
     * @param limit  maximum number of pages to parse
     */
    public Crawler(String seed, String domain, int limit) {
        currentID = 0;
        totalURLs = 0;
        // IDs restart at 0, so tables left over from an earlier crawl would
        // collide with the new IDs; start with fresh lists instead.
        parsed = new ArrayList<>();
        visited = new ArrayList<>();
        words = new ArrayList<>();
        Crawler.seedURL = seed;
        Crawler.domain = domain;
        Crawler.limit = limit;
        parser = new Parser();
        toParse = new MyQueue();
        toParse.add(seed);
    }

    /**
     * Processes queued links until the queue is empty or {@code limit} pages
     * have been parsed. Each valid, not-yet-visited link is fetched once; a
     * page only receives an ID if it was parsed successfully.
     */
    public void crawl(){
        while (!toParse.isEmpty() && currentID < limit) {
            if (toParse.peek().getData() == null) {
                break;
            }
            String link = (String) toParse.remove().getData();
            if (!isValidURL(link) || visited.contains(link)) {
                continue;
            }
            // Mark as visited whether or not parsing succeeds, so a failing
            // page is not retried.
            visited.add(link);

            Page p = new Page(link, currentID);
            if (pageExists(p)) {
                continue;
            }
            try {
                Document d = parser.getDocument(link);
                if (parse(d, currentID)) {
                    currentID++;
                    addPageToList(p);
                }
            }
            catch (ParseException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Queues the links of {@code doc} and indexes its text under {@code id}.
     *
     * @return {@code true} if the links could be extracted; {@code false} otherwise
     */
    public boolean parse(Document doc, int id) {
        try {
            parseLinks(doc);
            parseText(doc, id);
            return true;
        } catch (ParseException e) {
            e.printStackTrace();
            return false;
        }
    }

    /** @return {@code true} if an equal page is already in the parsed list */
    public boolean pageExists (Page p) {
        for (Page page : parsed) {
            if (page.equals(p))
                return true;
        }
        return false;
    }

    /** Manual smoke test: parses one fixed page. */
    public static void main(String[] args) throws ParseException {
        Crawler cl = new Crawler("https://www.cs.purdue.edu/homes/bxd/", "www.cs.purdue.edu", 50);
        Parser p = new Parser();
        Document d = p.getDocument("http://www.cs.purdue.edu/homes/cs177");
        cl.parse(d, currentID);
    }

    /**
     * Queues every link of {@code doc} that has not been visited yet.
     *
     * @throws ParseException if {@code doc} is null
     */
    public void parseLinks(Document doc) throws ParseException{
        Elements links = parser.getLinks(doc);
        for (Element element : links) {
            // Compare the resolved address (not the Element object) with the
            // visited URLs; unresolvable links come back as "" and are skipped.
            String href = element.attr("abs:href");
            if (!href.isEmpty() && !visited.contains(href)) {
                toParse.add(href);
            }
        }
    }

    /**
     * Splits the body text of {@code doc} on spaces and records one posting
     * for {@code id} per token, creating index entries as needed.
     */
    public void parseText(Document doc, int id) {
        try {
            String text = parser.getBody(doc);
            if (text.equals("")) {
                return;
            }
            for (String token : text.split(" ")) {
                // Entries are stored lower-cased, so the lookup must use the
                // lower-cased token too or capitalised words never match.
                String normalized = token.toLowerCase();
                Word existing = findWord(normalized);
                if (existing != null) {
                    existing.addURLID(id);
                } else {
                    addWordToList(normalized, id);
                }
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /** Returns the index entry whose text equals {@code text}, or null if there is none. */
    private Word findWord(String text) {
        for (Word word : words) {
            if (word.getWord().equals(text)) {
                return word;
            }
        }
        return null;
    }

    /** Appends a new index entry for {@code word} with {@code id} as its first posting. */
    public void addWordToList(String word, int id) {
        Word wordle = new Word(word, id);
        words.add(wordle);
    }

    /** Queues {@code url} and counts it, unless a parsed page already has that URL. */
    public void addToQueue(String url) {
        for (Page page : parsed) {
            if (page.getURL().equals(url)) {
                return;
            }
        }
        toParse.add(url);
        totalURLs++;
    }

    /** Adds {@code p} to the parsed list unless an equal page is already there. */
    public void addPageToList(Page p) {
        if (!pageExists(p))
            parsed.add(p);
    }

    /** @return {@code true} if {@code url} contains the crawl's domain string */
    public boolean isInDomain(String url) {
        return url.contains(domain);
    }

    /** @return {@code true} if {@code url} is non-null and starts with http:// or https:// */
    public boolean isValidURL(String url) {
        return url != null && (url.startsWith("https://") || url.startsWith("http://"));
    }
}