jetz
5/21/2013 - 6:54 AM

A simple crawler
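
A small breadth-first crawler built on requests, BeautifulSoup and gevent: it fetches pages up to a fixed depth, collects the links it finds, and stops once a total link count is reached.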

#!/usr/bin/env python
# coding=utf8

import sys
import logging
import requests
from bs4 import BeautifulSoup
from gevent import (queue,
                    monkey,
                    event,
                    pool)


class URL(object):
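    """Wrap a raw link; `key` is hash(link), used to de-duplicate entries in the URL table."""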

    def __init__(self, link):
        self.link = link
        self.key = hash(link)

    def is_valid(self):
        if self.link is not None and len(self.link) > 3 and self.link.startswith('http'):  # noqa
            return True
        return False

    def can_access(self):
        try:
            resp = requests.head(self.link, timeout=15)
        except requests.exceptions.RequestException:
            logging.info("Can't Access [%s]", self.link)
            return False
        if resp.status_code == requests.codes.ok:
            return True
        return False


class URLPair(object):
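    """A URL together with the depth at which it was discovered."""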
    def __init__(self, url, depth):
        self.url = url
        self.depth = depth


class Crawler(object):
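    """Breadth-first crawler: a gevent pool of workers pulls URLPairs off a
    queue, fetches each page, and enqueues newly discovered links until the
    depth or total-link limit is reached."""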

    # import re
    # url_pattern = re.compile(r'(https?://[\w-][\.\w-]+(/[\w/.?%&=-]*)?)')

    def __init__(self, url, depth, total=3000, concurrency=5):
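        # Patch the standard library so the blocking socket calls made by
        # requests cooperate with gevent's greenlets.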
        monkey.patch_all()
        self.url = url
        self.depth = depth
        self.total = total
        self.count = 0
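        # greenlet_finished wakes the dispatch loop whenever a worker exits;
        # stopped marks the end of the crawl.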
        self.greenlet_finished = event.Event()
        self.stopped = event.Event()
        self.pool = pool.Pool(concurrency)
        # (url,depth) - url pair
        self.url_queue = queue.Queue()
        self.url_table = {}

    def _worker(self, url_pair):
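        """Fetch one page, extract links, and enqueue unseen URLs one level deeper.

        Every exit path sets `greenlet_finished` so the dispatch loop wakes up.
        """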
        if url_pair.depth < self.depth:
            try:
                resp = requests.get(url_pair.url.link, timeout=5)
                if not resp.headers.get('content-type', '').startswith('text/'):  # noqa
                    self.greenlet_finished.set()
                    return
            except requests.exceptions.RequestException:
                logging.info('[%s] - GET_URL_FAILED!!', url_pair.url.link)  # noqa
                self.greenlet_finished.set()
                return

            try:
                soup = BeautifulSoup(resp.text, 'html.parser')
            except Exception:
                logging.info('[%s] - PARSE_FAILED!!', url_pair.url.link)  # noqa
                self.greenlet_finished.set()
                return

            tags = {'a': 'href', 'go': 'href', 'form': 'action'}

            for k, v in tags.items():
                for tag in soup.find_all(k):
                    link = tag.get(v)
                    url = URL(link)
                    if url.is_valid() and url.key not in self.url_table:
                        self.count += 1
                        if self.count >= self.total:
                            self.greenlet_finished.set()
                            self._stop()
                            return
                        self.url_table[url.key] = url.link
                        self.url_queue.put(URLPair(url, url_pair.depth + 1))  # noqa

        self.greenlet_finished.set()

    def _stop(self):
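        """Signal the crawl to stop and wait for running workers to finish."""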
        self.stopped.set()
        self.pool.join()
        return

    def _stopped(self):
        return self.stopped.is_set()

    def start(self):
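        """Seed the queue with the start URL and run the dispatch loop.

        The loop discards dead greenlets, pops the next URLPair from the queue
        and spawns a worker for it; when the queue is empty it either stops
        (no workers busy) or waits for `greenlet_finished`. Returns the links
        collected in the URL table.
        """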
        start_url = URL(self.url)
        if start_url.can_access():
            self.url_queue.put(URLPair(start_url, 0))
            self.url_table[start_url.key] = start_url.link

            while not self._stopped():
                for workerlet in list(self.pool):
                    if workerlet.dead:
                        self.pool.discard(workerlet)
                try:
                    url_pair = self.url_queue.get_nowait()
                except queue.Empty:
                    if self.pool.free_count() == self.pool.size:
                        # Queue empty and no workers running: the crawl is done.
                        self._stop()
                        # Skip the spawn below; url_pair would be stale here.
                        continue
                    else:
                        self.greenlet_finished.wait()
                        self.greenlet_finished.clear()
                        continue
                logging.info('Crawling [%s]', url_pair.url.link)
                workerlet = self.pool.spawn(self._worker, url_pair)

        return self.url_table.values()


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO if "-v" in sys.argv else logging.WARN,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('Start grabbing...')
    #urls = Crawler('http://www.baidu.com', 3,).start()
    urls = Crawler('http://www.noexist.com', 3).start()
    logging.info('Grabbing Finished. Total: %d', len(urls))