Web Crawler

4/18/2017 - 9:29 AM

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

/**
 * Created by Piyush Juneja on 3/31/17.
 */
public class Word implements Serializable {
    String word;
    ArrayList<Integer> postings;
    public static final long serialVersionUID = -3696191086353573895L;
    int urlID;

    public Word(String word, int urlID) {
        postings = new ArrayList<>();
        this.word = word;
        this.postings.add(0, urlID);
    }

    public void addURLID(int urlID) {
        postings.add(urlID);
    }

    public String getWord() {
        return word;
    }

    public List<Integer> getList() {
        return postings;
    }

    public boolean equals(Object obj) {
        Word w = (Word) obj;
        return this.word.equals(w.getWord());
    }
}


    /**
     * Appends the given page ID to the {@code postings} list.
     *
     * @param urlID page ID to record
     */
    public void addURLID(int urlID) {
        this.postings.add(Integer.valueOf(urlID));
    }

    /**
     * @return the value of the {@code word} field
     */
    public String getWord() {
        return this.word;
    }

    /**
     * @return the {@code postings} list itself (no copy is made)
     */
    public List<Integer> getList() {
        return this.postings;
    }

    /**
     * Compares by the {@code word} field only.
     *
     * @param obj the object to compare with; may be {@code null} or of another type
     * @return {@code true} if {@code obj} is a {@code Word} with the same text
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // instanceof rejects null and foreign types instead of throwing on the cast.
        if (!(obj instanceof Word)) {
            return false;
        }
        Word w = (Word) obj;
        return this.word == null ? w.getWord() == null : this.word.equals(w.getWord());
    }

    /** Hash derived from {@code word} so it stays consistent with {@code equals}. */
    @Override
    public int hashCode() {
        return this.word == null ? 0 : this.word.hashCode();
    }
}

SearchThread.java

import java.io.Serializable;

/**
 * Worker that scans one slice of {@code Search.wordList} for the query terms
 * and records every matching page in the shared {@code Search.resultSet}.
 *
 * <p>Several workers run concurrently against the same result list, so every
 * "look up, then increment or add" step is done while holding the list's lock.
 *
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class SearchThread implements Serializable, Runnable {
    int start;
    int finish;
    String[] terms;

    /**
     * @param start  index of the first word to inspect (inclusive)
     * @param finish index of the last word to inspect (inclusive)
     * @param terms  the query terms to look for
     */
    public SearchThread(int start, int finish, String[] terms) {
        this.start = start;
        this.finish = finish;
        this.terms = terms;
    }

    /**
     * Linear, case-insensitive lookup over the whole of {@code Search.wordList}.
     *
     * @param term the term to find
     * @return the first matching {@code Word}, or {@code null} if none matches
     */
    public Word findTerm(String term) {
        for (Word word : Search.wordList) {
            if (word.getWord().equalsIgnoreCase(term)) {
                return word;
            }
        }
        return null;
    }

    /**
     * Scans words {@code start..finish} (inclusive) for each term. Every page ID in a
     * matching word's posting list either adds a new {@code Result} to
     * {@code Search.resultSet} or bumps the score of the one already there.
     */
    @Override
    public void run() {
        // Clamp the slice to the list so an empty or short word list cannot
        // cause an IndexOutOfBoundsException.
        int first = Math.max(start, 0);
        int last = Math.min(finish, Search.wordList.size() - 1);

        for (String term : terms) {
            for (int i = first; i <= last; i++) {
                Word candidate = Search.wordList.get(i);
                if (!candidate.getWord().equalsIgnoreCase(term)) {
                    continue;
                }
                for (int id : candidate.getList()) {
                    // The posting ID is used directly as an index into pageList.
                    String url = Search.pageList.get(id).getURL();
                    Result result = new Result(url, id);

                    // The list is a Collections.synchronizedList, which only makes single
                    // calls atomic. The find-then-update sequence must hold the list's
                    // own monitor, otherwise two workers can both miss and both add.
                    synchronized (Search.resultSet) {
                        int index = Search.resultSet.indexOf(result);
                        if (index >= 0) {
                            Search.resultSet.get(index).incrementScore();
                        } else {
                            Search.resultSet.add(result);
                        }
                    }
                }
            }
        }
    }
}

Search.java

import java.io.File;
import java.io.Serializable;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

/**
 * Loads the saved word and page tables and answers keyword queries by fanning the
 * word list out over a fixed number of {@code SearchThread} workers.
 *
 * <p>The tables and the result list are static, so they are shared by every
 * {@code Search} instance and by the worker threads.
 *
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class Search {
    /** Number of worker threads a query is split across. */
    private static final int THREAD_COUNT = 5;

    static List<Page> pageList;
    private String pageListFile;
    static List<Result> resultSet;
    static List<Word> wordList;
    private String wordListFile;
    private FileUtils fl;

    /**
     * @param wordListFile path of the serialized word table
     * @param pageListFile path of the serialized page table
     */
    public Search(String wordListFile, String pageListFile) {
        this.wordListFile = wordListFile;
        this.pageListFile = pageListFile;
        fl = new FileUtils();
        resultSet = Collections.synchronizedList(new ArrayList<Result>()); //code from project page on sync
        setup(wordListFile, pageListFile);
    }

    /**
     * (Re)loads both tables from disk. A failure is printed and leaves the
     * affected table unchanged (possibly {@code null}).
     *
     * @param wordListFile path of the serialized word table
     * @param pageListFile path of the serialized page table
     */
    public synchronized void setup(String wordListFile, String pageListFile) {
        try {
            pageList = fl.getPageList(pageListFile);
            wordList = fl.getWordList(wordListFile);
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs a query and returns the matching pages, highest score first.
     *
     * <p>Each call starts from a fresh result list, so scores from an earlier query
     * on the same instance no longer leak into the next one. A {@code null} or blank
     * query, or tables that cannot be loaded, give an empty list.
     *
     * @param query whitespace-separated search terms
     * @return the ranked results (the shared {@code resultSet} list)
     */
    public List<Result> executeQuery(String query) {
        resultSet = Collections.synchronizedList(new ArrayList<Result>());

        nullCheck(); // retry loading if the constructor's attempt failed
        if (query == null || wordList == null || pageList == null) {
            return resultSet;
        }
        String trimmed = query.trim();
        if (trimmed.isEmpty()) {
            return resultSet;
        }
        // Split on runs of whitespace so repeated spaces do not produce empty terms.
        String[] keywords = trimmed.split("\\s+");

        int totalSize = wordList.size();
        Thread[] threads = new Thread[THREAD_COUNT];
        for (int t = 0; t < THREAD_COUNT; t++) {
            // Slice t covers [t*N/T, (t+1)*N/T - 1]: the slices are contiguous, never
            // overlap, and together cover 0..N-1. A slice with end < start is empty,
            // which the worker's inclusive loop handles by doing nothing.
            int startPoint = (int) ((long) t * totalSize / THREAD_COUNT);
            int endPoint = (int) ((long) (t + 1) * totalSize / THREAD_COUNT) - 1;
            threads[t] = new Thread(new SearchThread(startPoint, endPoint, keywords));
        }
        for (Thread thread : threads) {
            thread.start();
        }

        // Wait for every worker before sorting; remember an interrupt instead of
        // dropping it, and restore the flag once all workers have finished.
        boolean interrupted = false;
        for (Thread thread : threads) {
            boolean joined = false;
            while (!joined) {
                try {
                    thread.join();
                    joined = true;
                } catch (InterruptedException e) {
                    interrupted = true;
                }
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }

        sort();
        return resultSet;
    }

    /** Reloads the tables if either one is missing. */
    public void nullCheck() {
        if (pageList == null || wordList == null)
            setup(this.wordListFile, this.pageListFile);
    }

    /** Sorts the shared result list by {@code Result}'s natural order. */
    public void sort() {
        Collections.sort(resultSet);
    }
}

Result.java

import java.io.Serializable;

/**
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class Result implements Serializable, Comparable<Result> {
    public int score;
    public static final long serialVersionUID = -938761094876384658L;
    public String url;
    public int urlID;

    public Result(String url, int urlID) {
        this.url = url;
        this.urlID = urlID;
        score = 1;
    }

    public void updateScore(int score) {this.score += score;} //TODO: FIX: PARAM IS ALWAYS 783

    public void incrementScore() {score++;}

    public int getScore() {return score;}

    public String getURL() {return url;}

    public int getURLID() {return urlID;}

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof Result) {
            Result result = (Result) obj;
            if (this.url.equals(result.url) || this.urlID == result.urlID) {
                return true;
            }
        }
        return false;
    }

    public int compareTo(Result candidate) {
        if (this.score > candidate.score)
            return -1;
        else if (this.score < candidate.score)
            return 1;
        else
            return 0;
    }


}

Parser.java

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.print.Doc;
import java.io.IOException;
import java.io.Serializable;

/**
 * Thin wrapper around Jsoup: fetches a page and extracts its links and body text,
 * reporting every failure as a {@code ParseException}.
 *
 * Created by SahilMPattni on 3/31/2017.
 */
public class Parser {
    public Parser() {}

    /**
     * Downloads and parses the page at {@code url} (3 second timeout). The URL is
     * echoed to standard output before the request is made.
     *
     * @param url the address to fetch
     * @return the parsed document, never {@code null}
     * @throws ParseException if {@code url} is null or empty, or the fetch fails;
     *                        for a failed fetch the underlying exception is the cause
     */
    public Document getDocument(String url) throws ParseException {
        Document d = null;

        //Custom Exceptions
        if (url == null)
            throw new ParseException("getDocument() failed. String url is null.");
        else if (url.equals(""))
            throw new ParseException("getDocument() failed. String url is empty.");

            //Try-Catch
        else {
            try {
                System.out.println(url);
                d = Jsoup.connect(url).timeout(3000).get();
            } catch (Exception e) {
                // Keep the original failure reachable through getCause(); ParseException
                // only has a message constructor, so the cause is attached afterwards.
                ParseException failure = new ParseException("getDocument() failed. Connection failed.");
                failure.initCause(e);
                throw failure;
            }
            if (d == null)
                throw new ParseException("getDocument() failed. Docoument is null.");
        }
        return d;
    }

    /**
     * @param doc the document to inspect
     * @return every {@code <a>} element that has an {@code href} attribute
     * @throws ParseException if {@code doc} is null
     */
    public Elements getLinks(Document doc) throws ParseException {
        if (doc == null)
            throw new ParseException("getLinks() failed. Document parameter is null.");
        else
            return doc.select("a[href]");
    }

    /**
     * @param doc the document to inspect
     * @return the text of the document body, or an empty string if there is no body
     * @throws ParseException if {@code doc} is null
     */
    public String getBody(Document doc) throws ParseException {
        if (doc == null)
            throw new ParseException("getBody() failed. Document parameter is null.");

        Element content = doc.body();

        if (content != null)
            return content.text();
        else
            return "";
    }

    /** Manual smoke test: prints the body text of a fixed page. */
    public static void main(String[] args) throws ParseException {
        Parser p = new Parser();
        Document doc = p.getDocument("http://www.cs.purdue.edu");
        // Go through getBody() so a page without a body prints "" instead of throwing.
        String content = p.getBody(doc);
        System.out.println(content);
    }
}

ParseException.java

import java.io.Serializable;

/**
 * Checked exception raised when a page cannot be fetched or parsed.
 *
 * Created by SahilMPattni on 3/31/2017.
 */
public class ParseException extends Exception {
    /**
     * @param message description of what failed
     */
    public ParseException(String message) {
        super(message);
    }

    /**
     * Variant that keeps the underlying failure available through {@link #getCause()}.
     *
     * @param message description of what failed
     * @param cause   the exception that triggered this one; may be {@code null}
     */
    public ParseException(String message, Throwable cause) {
        super(message, cause);
    }
}

Page.java

import java.io.Serializable;

/**
 * Created by piyushjuneja on 3/31/17.
 */
public class Page implements Comparable, Serializable {
    public static final long serialVersionUID = -1827677255104766839L;
    String url;
    private int urlID;


    public Page(String url, int urlID) {
        this.url = url;
        this.urlID = urlID;
    }


    public String getURL() {
        return this.url;
    }

    public int getURLID() {
        return this.urlID;
    }

    @Override
    public int compareTo(Object o) {
        Page candidate = (Page) o;
        if (candidate.getURLID() < this.getURLID()) return -1; // Less than this
        else if (candidate.getURLID() > this.getURLID()) return 1; // Greater than this
        else return 0; // equal to this
    }

    public boolean equals(Object obj) {
        Page candidate = (Page) obj;
        if (candidate.getURL().equals(this.getURL()) ||
                candidate.getURLID() == this.getURLID())
            return true;
        else
            return false;
    }
}

Node.java

import java.io.Serializable;

public class Node {

    private Object data;
    private Node next;
    private Node prev;

    public Node(Object obj) {
        this.data = obj;
    }

    public void setNext(Node next) {
        this.next = next;
    }

    public void setPrev(Node prev) {
        this.prev = prev;
    }

    public Node getNext() {
        return this.next;
    }

    public Node getPrev() {
        return this.prev;
    }

    public Object getData() {
        return this.data;
    }
}

MyQueue.java

import java.io.Serializable;

/**
 * FIFO queue built from {@code Node} cells.
 *
 * <p>{@code tail} is a sentinel node with {@code null} data that always terminates
 * the chain. Its {@code prev} link is used to remember the last real element so
 * that {@link #add(Object)} runs in constant time instead of walking the chain.
 * All operations are synchronized on the queue.
 */
public class MyQueue {
    int count;
    Node head;
    Node tail;

    /** Creates an empty queue. */
    public MyQueue() {
        head = new Node(null);
        tail = new Node(null);
        count = 0;
        head.setNext(tail);
    }

    /**
     * Appends {@code o} to the end of the queue. {@code null} is ignored.
     *
     * @param o the element to enqueue
     */
    public synchronized void add(Object o) {
        if (o == null)
            return;
        Node fresh = new Node(o);
        fresh.setNext(tail);
        if (count == 0) {
            // Empty queue: head is a placeholder (initial dummy or the sentinel),
            // so the new node simply becomes the head.
            head = fresh;
        } else {
            // tail.prev is the current last element (set by the previous add).
            tail.getPrev().setNext(fresh);
        }
        tail.setPrev(fresh);
        count++;
    }

    /**
     * @return the first node without removing it, or {@code null} if the queue is empty
     */
    public synchronized Node peek() {
        if (isEmpty()) return null;
        return head;
    }

    /**
     * Removes and returns the first node.
     *
     * @return the removed node, or {@code null} if the queue is empty
     */
    public synchronized Node remove() { //CHANGELOG: added synchronized
        if (isEmpty())
            return null;
        Node toReturn = head;
        head = head.getNext();
        count--;
        return toReturn;
    }

    /** @return {@code true} if the queue holds no elements */
    public synchronized boolean isEmpty() {
        return count == 0;
    }

    /** @return the number of elements currently queued */
    public synchronized int size() {
        return count;
    }

}

FileUtils.java

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Saves and loads the crawler's word and page tables using Java serialization.
 *
 * <p>Every method reports failure through its return value ({@code false} or
 * {@code null}) after printing the stack trace; none of them throws.
 *
 * <p>NOTE(review): loading uses native Java deserialization, which is only safe for
 * files this program wrote itself. Do not point it at untrusted input.
 *
 * Created by Sahil Pattni on 16-Apr-17.
 */
public class FileUtils {

    /**
     * Writes the word table to {@code filePath}, replacing any existing file.
     *
     * @param wordTable the table to save
     * @param filePath  destination path
     * @return {@code true} on success; {@code false} if an argument is null or writing fails
     */
    public boolean saveWordTable(List<Word> wordTable, String filePath)
    {
        return writeTable(wordTable, filePath);
    }

    /**
     * Writes the page table to {@code filePath}, replacing any existing file.
     *
     * @param pageTable the table to save
     * @param filePath  destination path
     * @return {@code true} on success; {@code false} if an argument is null or writing fails
     */
    public boolean savePageTable(List<Page> pageTable, String filePath)
    {
        return writeTable(pageTable, filePath);
    }

    /**
     * Reads a page table previously written by {@link #savePageTable}.
     *
     * @param filePath source path
     * @return the stored list, or {@code null} if the path is null, the file cannot be
     *         read, or it does not contain a list
     */
    @SuppressWarnings("unchecked") // element type cannot be checked at runtime (erasure)
    public List<Page> getPageList(String filePath)
    {
        Object stored = readTable(filePath);
        return stored instanceof List ? (List<Page>) stored : null;
    }

    /**
     * Reads a word table previously written by {@link #saveWordTable}.
     *
     * @param filePath source path
     * @return the stored list, or {@code null} if the path is null, the file cannot be
     *         read, or it does not contain a list
     */
    @SuppressWarnings("unchecked") // element type cannot be checked at runtime (erasure)
    public List<Word> getWordList(String filePath)
    {
        Object stored = readTable(filePath);
        return stored instanceof List ? (List<Word>) stored : null;
    }

    /** Serializes {@code table} to {@code filePath}; both streams are always closed. */
    private boolean writeTable(Object table, String filePath)
    {
        if (table == null || filePath == null)
            return false;
        // Two separate resources so the file stream is closed even when wrapping it
        // in the object stream fails; they are closed in reverse order (oos, then fos).
        try (FileOutputStream fos = new FileOutputStream(filePath);
             ObjectOutputStream oos = new ObjectOutputStream(fos))
        {
            oos.writeObject(table);
            return true;
        }
        catch (IOException e)
        {
            e.printStackTrace();
            return false;
        }
    }

    /** Deserializes the single object stored at {@code filePath}; {@code null} on any failure. */
    private Object readTable(String filePath)
    {
        if (filePath == null)
            return null;
        try (FileInputStream fis = new FileInputStream(filePath);
             ObjectInputStream ois = new ObjectInputStream(fis))
        {
            return ois.readObject();
        }
        catch (IOException | ClassNotFoundException e)
        {
            e.printStackTrace();
            return null;
        }
    }

}

Driver.java

import sun.swing.FilePane;

import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Scanner;

/**
 * Command-line entry point: crawls a site, saves the resulting tables, then answers
 * search queries typed on standard input until the user declines to continue.
 *
 * Created by Sahil Pattni on 17-Apr-17.
 */
public class Driver {

    /** Where the page table is saved and later loaded from. */
    private static final String PAGE_FILE =
            "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\parsed.txt";
    /** Where the word table is saved and later loaded from. */
    private static final String WORD_FILE =
            "C:\\Users\\Sahil Pattni\\Desktop\\IntelliJ\\HW 10\\words.txt";

    private FileUtils fu;
    // Not used anywhere in this class at present.
    private List<Page> pageFile;
    private List<Word> wordFile;

    /** Crawls the configured seed site with a limit of 10. */
    public void crawl() {
        Crawler cl = new Crawler("https://www.investing.com/", "www.investing.com", 10);
        cl.crawl();
    }

    /**
     * Crawls, saves, then runs the interactive query loop. The loop ends when the
     * user answers "no" or standard input is exhausted.
     */
    public static void main(String[] args) {
        Driver d = new Driver();
        d.crawl();
        d.save();

        Scanner s = new Scanner(System.in);
        boolean doneSearching = false;

        while (!doneSearching) {
            System.out.println("Enter Query");
            if (!s.hasNextLine())
                break; // end of input: stop instead of throwing NoSuchElementException
            String query = s.nextLine();
            d.search(query);
            System.out.println();
            System.out.println("Do you want to continue (yes/no)");
            if (!s.hasNextLine() || s.nextLine().trim().equalsIgnoreCase("no"))
                doneSearching = true;
        }
    }

    /** Writes the crawler's page and word tables to disk, reporting any failure. */
    public void save() {
        fu = new FileUtils();
        if (!fu.savePageTable(Crawler.parsed, PAGE_FILE))
            System.err.println("Could not save page table to " + PAGE_FILE);
        if (!fu.saveWordTable(Crawler.words, WORD_FILE))
            System.err.println("Could not save word table to " + WORD_FILE);
    }

    /**
     * Runs one query against the saved tables and prints the ranked results,
     * numbered from 0.
     *
     * @param query the search terms
     */
    public void search(String query) {
        Search s = new Search(WORD_FILE, PAGE_FILE);
        List<Result> results = s.executeQuery(query);
        Collections.sort(results);

        //Output
        int currentResult = 0;
        System.out.println("Query: " + query);
        for (Result result: results) {
            System.out.println("("+currentResult+")" + result.getURL() + " | " +
            "score: " + result.getScore());
            currentResult++; // was never advanced, so every line printed "(0)"
        }
    }

}

Crawler.java

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import javax.print.Doc;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
 * Breadth-first crawler: starting from a seed URL it fetches pages, queues the links
 * they contain, and builds an inverted index of the words found on each page.
 *
 * <p>Most state is static and therefore shared by every {@code Crawler}. The
 * constructor resets the ID counters but does not clear {@code parsed},
 * {@code visited} or {@code words}.
 * NOTE(review): other code indexes the page list by page ID, so creating a second
 * crawler without clearing these lists would likely break that assumption - confirm.
 *
 * Created by Sahil Pattni on 04-Apr-17.
 */
public class Crawler {
    static String seedURL;
    static int currentID;
    static String domain;
    static int limit;
    MyQueue toParse;
    static Parser parser;

    static int totalURLs;
    static List<Page> parsed  = new ArrayList<>();
    static List<String> visited = new ArrayList<>();
    static List<Word> words = new ArrayList<>();

    /**
     * @param seed   URL the crawl starts from
     * @param domain substring a URL must contain to be crawled
     * @param limit  maximum number of pages to parse
     */
    public Crawler(String seed, String domain, int limit) {
        currentID = 0;
        totalURLs = 0;
        Crawler.seedURL = seed;
        Crawler.domain = domain;
        Crawler.limit = limit;

        parser = new Parser();
        toParse = new MyQueue();

        toParse.add(seed);
    }

    /**
     * Processes queued links until the queue is empty or {@code limit} pages have been
     * parsed. A link is fetched only if it is an http(s) URL, lies inside the configured
     * domain, and has not been visited before. Each successfully parsed page receives
     * the next ID and is appended to {@code parsed}.
     */
    public void crawl(){
        while (!toParse.isEmpty() && currentID < limit) {
            Node next = toParse.remove();
            if (next == null || next.getData() == null) {
                break;
            }
            String link = (String) next.getData();

            // The domain passed to the constructor was previously stored but never
            // checked, so the crawl wandered off-site; enforce it here.
            if (!isValidURL(link) || !isInDomain(link) || visited.contains(link)) {
                continue;
            }
            // Mark as visited whether or not parsing succeeds, so a failing page
            // is not retried and links back to it are not re-queued.
            visited.add(link);

            Page p = new Page(link, currentID);
            if (pageExists(p)) {
                continue;
            }
            try {
                Document d = parser.getDocument(link);
                if (parse(d, currentID)) {
                    currentID++;
                    addPageToList(p); //Add page to list of parsed pages
                }
            }
            catch (ParseException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Queues the document's links and indexes its text.
     *
     * @param doc the fetched document
     * @param id  ID to record in the word postings for this page
     * @return {@code true} if both steps ran; {@code false} if link extraction failed
     */
    public boolean parse(Document doc, int id) {
        boolean returner = false;
        try {
            parseLinks(doc);
            parseText(doc, id);
            returner = true;
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return returner;
    }

    /**
     * @param p the page to look for
     * @return {@code true} if an equal page is already in {@code parsed}
     */
    public boolean pageExists (Page p) {
        for (Page page : parsed) {
            if (page.equals(p))
                return true;
        }

        return false;
    }

    /** Manual smoke test: parses a single fixed page. */
    public static void main(String[] args) throws ParseException {
        Crawler cl = new Crawler("https://www.cs.purdue.edu/homes/bxd/", "www.cs.purdue.edu", 50);
        Parser p = new Parser();
        Document d = p.getDocument("http://www.cs.purdue.edu/homes/cs177");
        cl.parse(d, currentID);
    }

    /**
     * Adds every link of {@code doc} that has not been visited yet to the queue.
     *
     * @param doc the document whose links are collected
     * @throws ParseException if {@code doc} is null
     */
    public void parseLinks(Document doc) throws ParseException{
        Elements links = parser.getLinks(doc);
        for (Element element : links) {
            // Compare the resolved URL (not the Element object) against the visited
            // list, and decide per link rather than with a flag shared by all links.
            String href = element.attr("abs:href");
            if (href.isEmpty() || visited.contains(href)) {
                continue;
            }
            toParse.add(href);
        }
    }

    /**
     * Splits the document body on spaces and records each word, lower-cased, against
     * {@code id}. A word already in the index gets {@code id} appended to its
     * postings (once per occurrence); a new word gets a fresh entry.
     *
     * @param doc the document to index
     * @param id  ID of the page being indexed
     */
    public void parseText(Document doc, int id) {
        try {
            String text = parser.getBody(doc);  //Gather text
            if (text.equals("")) { //see getBody() in Parser class
                return;
            }
            for (String token : text.split(" ")) {
                if (token.isEmpty()) {
                    continue;
                }
                // Normalise before the lookup: entries are stored lower-cased, so
                // comparing the raw token made every capitalised word a new entry.
                String term = token.toLowerCase(java.util.Locale.ROOT);

                Word existing = null;
                for (Word word : words) {
                    if (word.getWord().equals(term)) {
                        existing = word;
                        break;
                    }
                }
                if (existing != null) {
                    existing.addURLID(id);
                } else {
                    addWordToList(term, id);
                }
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }

    /**
     * Appends a new index entry.
     *
     * @param word the term text
     * @param id   ID of the first page containing it
     */
    public void addWordToList(String word, int id) {
        Word wordle = new Word(word, id);
        words.add(wordle);
    }

    /**
     * Queues {@code url} unless a page with that URL has already been parsed, and
     * counts it in {@code totalURLs}.
     *
     * @param url the URL to queue
     */
    public void addToQueue(String url) {
        boolean duplicateURL = false;

        for (Page page : parsed) {
            if (page.getURL().equals(url))
                duplicateURL = true;
        }

        if (!duplicateURL) {
            toParse.add(url);
            totalURLs++;
        }

    }

    /**
     * Appends {@code p} to {@code parsed} unless an equal page is already present.
     *
     * @param p the page to add
     */
    public void addPageToList(Page p) {
        if (!pageExists(p))
            parsed.add(p);
    }

    /**
     * @param url the URL to test
     * @return {@code true} if {@code url} contains the configured domain string
     */
    public boolean isInDomain(String url) {
        return url != null && domain != null && url.contains(domain);
    }

    /**
     * @param url the URL to test
     * @return {@code true} if {@code url} starts with {@code http://} or {@code https://}
     */
    public boolean isValidURL(String url) {
        return url != null && (url.startsWith("https://") || url.startsWith("http://"));
    }
}

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Web Crawler