Simple Crawler
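
The listing below is a small gevent-based crawler: a fixed-size pool of greenlets fetches pages concurrently, extracts links with BeautifulSoup, deduplicates them in a hash table, and stops at a configurable link depth and total page count.
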
#!/usr/bin/env python
# coding=utf8
import sys
import logging

import requests
from bs4 import BeautifulSoup
from gevent import (queue,
                    monkey,
                    event,
                    pool)
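

# URL wraps a single link; hash(link) is cached as the dedup key used in
# Crawler.url_table.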
class URL(object):

    def __init__(self, link):
        self.link = link
        self.key = hash(link)

    def is_valid(self):
        return (self.link is not None and len(self.link) > 3
                and self.link.startswith('http'))
    def can_access(self):
        try:
            resp = requests.head(self.link, timeout=15)
        except requests.exceptions.RequestException:
            logging.info("Can't Access [%s]", self.link)
            return False
        return resp.status_code == requests.codes.ok
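

# A queued work item: the URL plus the depth at which it was discovered.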
class URLPair(object):

    def __init__(self, url, depth):
        self.url = url
        self.depth = depth
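

# A fixed-size pool of greenlets consumes URLPair items from url_queue;
# every worker sets greenlet_finished so the scheduler in start() knows
# when progress has been made.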
class Crawler(object):
    # import re
    # url_pattern = re.compile(r'(https?://[\w-][\.\w-]+(/[\w/.?%&=-]*)?)')

    def __init__(self, url, depth, total=3000, concurrency=5):
        # Best done as early as possible, so the sockets used by requests
        # are already cooperative.
        monkey.patch_all()
        self.url = url
        self.depth = depth
        self.total = total
        self.count = 0
        self.greenlet_finished = event.Event()
        self.stopped = event.Event()
        self.pool = pool.Pool(concurrency)
        # queue of URLPair(url, depth) items waiting to be crawled
        self.url_queue = queue.Queue()
        self.url_table = {}
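
    # Worker contract: every exit path must set greenlet_finished,
    # otherwise the scheduler in start() could block on the event forever.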
    def _worker(self, url_pair):
        if url_pair.depth < self.depth:
            try:
                resp = requests.get(url_pair.url.link, timeout=5)
                # Only parse textual responses; skip binaries, images, etc.
                if not resp.headers.get('content-type', '').startswith('text/'):  # noqa
                    self.greenlet_finished.set()
                    return
            except requests.exceptions.RequestException:
                logging.info('[%s] - GET_URL_FAILED!!', url_pair.url.link)
                self.greenlet_finished.set()
                return
            try:
                soup = BeautifulSoup(resp.text, 'html.parser')
            except Exception:
                logging.info('[%s] - PARSE_FAILED!!', url_pair.url.link)
                self.greenlet_finished.set()
                return
            # Tag/attribute pairs that may carry a link ('go' is a WML tag).
            tags = {'a': 'href', 'go': 'href', 'form': 'action'}
            for k, v in tags.items():
                for tag in soup.find_all(k):
                    link = tag.get(v)
                    url = URL(link)
                    if url.is_valid() and url.key not in self.url_table:
                        self.count += 1
                        if self.count >= self.total:
                            self.greenlet_finished.set()
                            # Calling self._stop() here would deadlock:
                            # pool.join() from inside a pool greenlet waits
                            # on itself. Setting the event is enough; the
                            # main loop will notice and exit.
                            self.stopped.set()
                            return
                        self.url_table[url.key] = url.link
                        self.url_queue.put(URLPair(url, url_pair.depth + 1))
        # Signal the scheduler whether or not any links were found.
        self.greenlet_finished.set()
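
    # Only safe to call from the main greenlet: pool.join() waits for all
    # workers to finish.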
    def _stop(self):
        self.stopped.set()
        self.pool.join()

    def _stopped(self):
        return self.stopped.is_set()
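
    # The scheduler loop: seed the queue, then alternate between reaping
    # dead greenlets, dispatching queued URLs, and waiting on workers when
    # the queue is momentarily empty.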
    def start(self):
        start_url = URL(self.url)
        if start_url.can_access():
            self.url_queue.put(URLPair(start_url, 0))
            self.url_table[start_url.key] = start_url.link
        while not self._stopped():
            # Reap finished greenlets so free_count() stays accurate.
            for workerlet in list(self.pool):
                if workerlet.dead:
                    self.pool.discard(workerlet)
            try:
                url_pair = self.url_queue.get_nowait()
            except queue.Empty:
                # Queue drained: if no worker is running either, we are
                # done; otherwise wait for a worker to signal progress.
                if self.pool.free_count() == self.pool.size:
                    self._stop()
                else:
                    self.greenlet_finished.wait()
                    self.greenlet_finished.clear()
                continue
            logging.info('Crawling [%s]', url_pair.url.link)
            self.pool.spawn(self._worker, url_pair)
        return self.url_table.values()
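

# Run directly; pass -v on the command line for verbose progress logging.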
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO if "-v" in sys.argv else logging.WARN,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('Start grabbing...')
    # urls = Crawler('http://www.baidu.com', 3).start()
    urls = Crawler('http://www.noexist.com', 3).start()
    logging.info('Grabbing Finished. Total: %d', len(urls))
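
A minimal usage sketch (the seed URL and limits below are arbitrary
placeholders, not from the original listing; network access is assumed):

    # Crawl at most 100 pages, following links up to 2 hops from the seed.
    urls = Crawler('http://example.com', 2, total=100, concurrency=10).start()
    for link in urls:
        print(link)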