gary
7/5/2015 - 11:14 PM

Python script to harvest links from a url.




import time
import threading
from queue import Queue
import pymysql
from worker import Worker
from sqlworker import sqlWorker





class spider():


    def __init__(self):
        # set vars 
        self.visitedLinks = set()

        self.allExtLinks = Queue()
        self.maxThreads = 10
        self.workers = []
        self.running  = True

        #create db connection
        self.initDb()

        #initial Link
        self.startLink()

        #run the spider
        self.run()


    def initDb(self):
        # keep a reference to the connection and autocommit so inserts are persisted
        self.conn = pymysql.connect(host='', unix_socket='/tmp/mysql.sock', user='', passwd='', db='', autocommit=True)
        self.cur = self.conn.cursor(pymysql.cursors.DictCursor)
        self.cur.execute('USE `0090-scraping`')


    def randomSeed(self):
        self.cur.execute('select url from urls order by rand() limit 1')
        return self.cur.fetchone()['url']


    def startLink(self):
        self.allExtLinks.put({ 
            'url': self.seedUrl()
        })


    def seedUrl(self):
        # only the first return is used - reorder to start from a different seed
        return 'http://www.reddit.com/'
        # return 'http://www.bbc.co.uk'
        # return 'http://shopping.indiatimes.com/lifestyle/bed-linen/8-designer-rajasthani-cotton-double-bed-sheets-with-16-pillow-covers/11574/p_B4661019'


    def createWorker(self, allExtLinks, threadNum, cur, visitedLinks):
        return Worker(allExtLinks, threadNum, cur, visitedLinks)

    def getUniques(self):
        # not implemented yet - placeholder so the class parses
        pass


    def run(self):

        #we have 1 active link
        activeThreads = 1

        #the sql worker
        self.pending  = Queue()

        ## waiting for output
        print ("Spider: Waiting...")

        # create the sql worker thread
        self.sW = sqlWorker( self.pending, self.cur )
        self.sW.start()


        #while we are running
        while self.running :

            #show that the loop is running
            print(' ')
            print(' -------- Ext Links ' + str(self.allExtLinks.qsize()) + ', Threads: ' + str(threading.active_count()) + ' ----------'  )
            print(' ')

            #if thread count < max - start new thread
            if threading.active_count() < self.maxThreads:

                w = self.createWorker( self.allExtLinks, activeThreads, self.cur, self.visitedLinks )
                activeThreads = activeThreads + 1
                self.workers.append(w)
                w.start()
                    
            #end the dead workers (iterate over a copy so removing is safe)
            for w in list(self.workers):

                #if the worker is still running
                if w.is_alive():

                    #get the worker's visited links
                    for i in w.getVisitedLinks():
                        self.visitedLinks.add(i)

                    #add all of the visited links to the worker thread
                    w.setVisitedLinks(self.visitedLinks)

                    #append the waiting data
                    for i in w.getUrlDetails():
                        self.pending.put(i)

                # join the dead threads and count
                else:
                    w.join()
                    activeThreads = activeThreads - 1
                    self.workers.remove(w)

            #sleep 1 second per loop
            time.sleep(1)    

            #end the loop if there are no more links queued
            if self.allExtLinks.empty():
                self.running = False

        #join the remaining worker threads - to end the app
        for w in self.workers:
            w.join()

        #stop the sql worker thread
        self.sW.join()


        ## all done
        print("Spider: Complete...")




if __name__ == '__main__':
    s = spider()
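
A note on the database: the sql worker below writes to three tables - urls, phrazes and url_phraze_pivot - but the schema isn't part of this post. The snippet here is only a guess at a minimal layout the queries would run against (column names are taken from the code, the types and keys are assumptions), in case you want to recreate it:

# assumed schema - column names come from the queries, types and keys are a guess
SCHEMA = [
    '''create table if not exists urls (
           urls_id int unsigned not null auto_increment primary key,
           url varchar(2048) not null
       )''',
    '''create table if not exists phrazes (
           phrazes_id int unsigned not null auto_increment primary key,
           phraze text
       )''',
    '''create table if not exists url_phraze_pivot (
           urls_id int unsigned not null,
           phrazes_id int unsigned not null,
           occurrences int unsigned not null default 0,
           unique key url_phraze (urls_id, phrazes_id)
       )''',
]

def createTables(cur):
    for statement in SCHEMA:
        cur.execute(statement)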



import sys, traceback
from threading import Thread
import logging
from queue import Empty
import time


'''
logging config
'''
logging.basicConfig(level=logging.DEBUG)


class sqlWorker(Thread):


    def __init__( self, queue, cur ):
        '''
        init with the queue and the shared db cursor - this worker is a thread extended class
        '''
        Thread.__init__(self)
        self.cursor = cur      
        self.queue = queue
        self.running = True


    def join(self, timeout=None):
        '''
        when the thread joins send the loop end signal
        '''
        self.running = False
        super(sqlWorker, self).join(timeout)


    def saveLink(self, url, title, description):
        '''
        see if the url exists
        '''
        self.cursor.execute('select * from urls where url = %s', (url,))
        r = self.cursor.fetchone()
        url_id = 0

        if r is None:
            self.cursor.execute('insert into urls (url) values (%s)', (url,))
            url_id = self.cursor.lastrowid
        else:
            url_id = r['urls_id']


        '''
        see if the title exists
        '''
        self.cursor.execute('select * from phrazes where phraze = %s', (title,))
        r = self.cursor.fetchone()

        title_id = 0

        if r is None:
            self.cursor.execute('insert into phrazes (phraze) values (%s)', (title,))
            title_id = self.cursor.lastrowid
        else:
            title_id = r['phrazes_id']


        '''
        add title to pivot
        '''
        self.cursor.execute('select * from url_phraze_pivot where urls_id = %s and phrazes_id = %s', (url_id, title_id))
        r = self.cursor.fetchone()
        if r is None:
            self.cursor.execute('insert into url_phraze_pivot (urls_id, phrazes_id, occurrences) values (%s, %s, 0)', (url_id, title_id))
        else:
            self.cursor.execute('update url_phraze_pivot set occurrences = occurrences + 1 where urls_id = %s and phrazes_id = %s ', (url_id, title_id))


        '''
        see if the description exists
        '''
        self.cursor.execute('select * from phrazes where phraze = %s', (description,))
        r = self.cursor.fetchone()

        description_id = 0

        if r is None:
            self.cursor.execute('insert into phrazes (phraze) values (%s)', (description,))
            description_id = self.cursor.lastrowid
        else:
            description_id = r['phrazes_id']


        '''
        add description to pivot
        '''
        self.cursor.execute('select * from url_phraze_pivot where urls_id = %s and phrazes_id = %s', (url_id, description_id))
        r = self.cursor.fetchone()
        if r is None:
            self.cursor.execute('insert into url_phraze_pivot (urls_id, phrazes_id, occurrences) values (%s, %s, 0)', (url_id, description_id))
        else:
            self.cursor.execute('update url_phraze_pivot set occurrences = occurrences +1 where urls_id = %s and phrazes_id = %s ', (url_id, description_id))


        print(' ')
        print(' -------- save link -------- ')
        print(url, title, description)
        print(' ')  


    def run(self):
        '''
        thread run, save queued url details
        '''
        
        #while the worker is running
        while self.running == True:

            try:

                #get the next item from the queue (time out so the Empty handler can end the loop)
                item = self.queue.get(timeout=30)

                if item is not None:

                    #get the parts
                    url = item['url']
                    title = item['title']
                    description = item['description']

                    #save the parts
                    self.saveLink(url, title, description)

                time.sleep(2)

            except Empty:
                #nothing left to save - stop the worker
                self.running = False
                continue

            except Exception as e:

                print(' ')
                print(' -------- Sql Worker exception: -------- ')
                print(e)
                print(' ')

                print("-"*60)
                traceback.print_exc(file=sys.stdout)
                print("-"*60)

                continue

        self.queue.task_done()
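
If url_phraze_pivot really does have a unique key over (urls_id, phrazes_id) - an assumption, see the schema sketch above - the select-then-insert-or-update dance in saveLink could be collapsed into a single statement. A sketch of the idea:

# sketch only - assumes UNIQUE KEY (urls_id, phrazes_id) on url_phraze_pivot
def upsertPivot(cursor, url_id, phraze_id):
    cursor.execute(
        'insert into url_phraze_pivot (urls_id, phrazes_id, occurrences) '
        'values (%s, %s, 0) '
        'on duplicate key update occurrences = occurrences + 1',
        (url_id, phraze_id)
    )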

                

        



import sys, traceback
from threading import Thread
import urllib.error
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import logging
from queue import Empty
import time


'''
logging config
'''
logging.basicConfig(level=logging.DEBUG)




class Worker(Thread):


    def __init__( self, queue, threadNum, cur, visitedLinks ):

        '''
        init with the queue, the shared db cursor and the visited link set - this worker is a thread extended class
        '''
        Thread.__init__(self)
        self.threadNum = str(threadNum)
        self.cursor = cur
        self.visitedLinks = visitedLinks
        self.queue = queue
        self.urlDetails = []
        self.running = True


    def join(self, timeout=None):
        '''
        when the thread joins send the loop end signal
        '''
        self.running = False
        super(Worker, self).join(timeout)


    def getVisitedLinks(self):
        '''
        return the worker's current visited links
        '''
        return self.visitedLinks


    def setVisitedLinks(self, visitedLinks):
        '''
        add to the worker's visited links
        '''
        for i in visitedLinks:
            self.visitedLinks.add(i)


    def getCurrentDomain(self, page):
        '''
        get the domain we are crawling
        '''
        parsed_uri = urlparse(page)
        return parsed_uri.netloc


    def getMetaTitle(self, html):
        '''
        extract the page meta title
        '''
        if html.title is not None:
            return html.title.string.encode('utf-8')

        return ""


    def getMetaDescription(self, html):
        '''
        extract the page meta description
        '''
        description = html.findAll(attrs={"name":"description"})

        if len(description) and description[0]['content'] is not None:
            return description[0]['content'].encode('utf-8')

        return ""


    def encodeLink(self, link):
        '''
        attempt to fix encoding issues with links
        '''
        raw = bytes(link, "UTF-8")
        link = raw.decode("ascii", "ignore")
        return link


    def fetch(self, uri):
        '''
        url open, check the headers for text/html
        if so return data
        '''
        uri = self.encodeLink(uri)

        self.visitedLinks.add(uri)

        try:
            h = urlopen(uri)
            x =  h.info()
            if 'text/html' in x['Content-Type'].lower():
                return h.read()
            else:
                return None
        except urllib.error.URLError:
            return None

                
    def getUrlDetails(self):
        '''
        get the list of url urlDetails
        '''
        return self.urlDetails


    def getLinks(self, page, url):
        '''
        find all anchor links within the page
        add to either array depending on its http(s) status
        '''

        internalLinks = []
        externalLinks = []

        currentDomain = self.getCurrentDomain(url)
        self.urlparse = urlparse(url)
       

        for link in page.findAll('a', href=True):

            if link.has_attr('href'):
                
                if( link['href'].startswith('#') == False and link['href'] != '/' ):
     
                    #internal link - rel link
                    if currentDomain not in link['href'] and link['href'].startswith('http://') == False and link['href'].startswith('https://') == False :


                        if currentDomain.endswith('/') :
                            currentDomain = currentDomain[:-1]

                        if link['href'].startswith('/') :
                            link['href'] = link['href'][1:]

                        link['href'] = self.urlparse.scheme + '://' + currentDomain + '/' + link['href']

                        link = link['href']

                        internalLinks.append(link)


                    # external link
                    elif currentDomain not in link['href'] and ( link['href'].startswith('http://') or link['href'].startswith('https://') ) :

                        link = link['href']

                        externalLinks.append(link)


                    # internal link non rel
                    elif currentDomain in link['href'] :

                        if link['href'].startswith('http://') == False and link['href'].startswith('https://') == False:
                        
                            if currentDomain.endswith('/') :
                                currentDomain = currentDomain[:-1]

                            if link['href'].startswith('/') :
                                link['href'] = link['href'][1:]

                            link['href'] = self.urlparse.scheme + '://' + currentDomain + '/' + link['href']

                        link = link['href']

                        internalLinks.append(link)


        return internalLinks, externalLinks


    def run(self):
        '''
        thread run, check url
        '''

        #while the worker is running
        while self.running == True:

            try:

                #get the next item from the queue (time out so the Empty handler can end the loop)
                item = self.queue.get(timeout=30)

                #the current url
                url = item['url']

                #make sure we have not yet visited
                if( url not in self.visitedLinks ):

                    #fetch the html
                    data = self.fetch(url)


                    if data is None:

                        #could not fetch the url, or it was not text/html - skip it
                        #logging.info('[-] Thread: ' + self.threadNum + ' - Could not fetch: %s because type != text/html', url)
                        pass

                    else:

                        #log the current url we are scraping
                        logging.info('[+] Thread: ' + self.threadNum + ' - Success fetched: %s', url)

                        #create the beautifulSoup object
                        bsObj = BeautifulSoup(data, 'lxml')

                        #get the internal and external links
                        internalLinks, externalLinks = self.getLinks(bsObj, url)
             
                        #get the meta title
                        metaTitle = self.getMetaTitle(bsObj)

                        #get the meta desciption
                        metaDescription = self.getMetaDescription(bsObj)
                        
                        #add to the save queue
                        self.urlDetails.append({
                            'url': url,  
                            'title': metaTitle,  
                            'description': metaDescription  
                        })

                        #add the internal links to the crawl queue
                        for i in internalLinks:
                            self.queue.put({ 'url' : i })

                        #add the external links to the crawl queue as well
                        for i in externalLinks:
                            self.queue.put({ 'url' : i })

                        #have a quick nap
                        time.sleep(2)

            except Empty:
                #the crawl queue has been empty for a while - stop this worker
                print(' ')
                print(' -------- Thread empty ' + self.threadNum + ': stopping -------- ')
                print(' ')

                self.running = False

            except Exception as e:
                print(' ')
                print(' -------- Thread Running exception ' + self.threadNum + ': -------- ')
                print(e)
                print(' ')

                print("-"*60)
                traceback.print_exc(file=sys.stdout)
                print("-"*60)

                continue

        self.queue.task_done()
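
One thing to watch in getLinks: the manual scheme + domain + path concatenation only really handles hrefs that are absolute or root-relative, so links like 'page.html' found inside a subdirectory get resolved against the domain root and can end up pointing at the wrong place. A small sketch of how the same classification could be done with urllib.parse.urljoin instead (an alternative idea, not what the worker above does):

from urllib.parse import urljoin, urlparse

def classifyLink(baseUrl, href):
    # resolve the href against the page it was found on
    absolute = urljoin(baseUrl, href)
    # same domain means internal, anything else is external
    if urlparse(absolute).netloc == urlparse(baseUrl).netloc:
        return 'internal', absolute
    return 'external', absolute

# example: prints ('internal', 'http://www.reddit.com/r/python')
print(classifyLink('http://www.reddit.com/', '/r/python'))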