j-bennet
10/25/2016 - 11:19 PM

Download a list of Common Crawl URLs and write it to the file "urls.txt".

Download a list of Common Crawl URLs and write it to the file "urls.txt".

#!/usr/bin/env python
'''Build up a set of URLs using the common crawl index. See
http://commoncrawl.org/2015/04/announcing-the-common-crawl-index/ for more info.
'''
from __future__ import print_function
import gzip
import logging
import os
import random

import boto3

log = logging.getLogger('urlutils.profiling.get_urls')


def _here(*paths):
    '''Return ``paths`` joined onto the directory that contains this file.

    With no arguments, the absolute path of that directory itself is returned.
    '''
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), *paths)

def get_common_crawl_urls(week='2016-07', max_urls=10000000, num_index_files=3):
    '''Yield URLs read from randomly chosen Common Crawl index files.

    Lists the ``.gz`` objects under
    ``cc-index/collections/CC-MAIN-<week>/indexes/`` in the ``commoncrawl`` S3
    bucket, samples up to ``num_index_files`` of them, downloads each one next
    to this script (unless a file with that name already exists there) and
    yields one URL per index line until ``max_urls`` URLs have been produced.
    Every local index file is deleted after it has been read, including when
    the consumer stops iterating early or a line fails to parse.

    Args:
        week: Crawl identifier substituted into the ``CC-MAIN-<week>`` prefix.
        max_urls: Stop after yielding this many URLs.
        num_index_files: Maximum number of index files to sample. Fewer are
            used when the prefix holds fewer ``.gz`` objects.

    Yields:
        The URL extracted from each index line, as a native ``str``.
    '''
    num_urls = 0
    bucket = boto3.resource('s3').Bucket('commoncrawl')
    prefix = 'cc-index/collections/CC-MAIN-{}/indexes/'.format(week)
    objects = [o for o in bucket.objects.filter(Prefix=prefix)
               if o.key.endswith('.gz')]
    # Common Crawl URLS are alphabetically sorted so we don't want to grab only
    # stuff like http://69.30.227.140/showthread.php?tid=35992
    # Cap the sample size: random.sample raises ValueError when asked for more
    # items than the population holds.
    objects = random.sample(objects, min(num_index_files, len(objects)))
    for object_ in objects:
        filename = _here(os.path.basename(object_.key))
        if not os.path.exists(filename):
            log.info('Downloading common crawl index file %s to %s', object_.key, filename)
            bucket.download_file(object_.key, filename)
            log.info('Downloaded %s to %s', object_.key, filename)
        try:
            with gzip.open(filename) as fp:
                for line in fp:
                    if num_urls == max_urls:
                        break

                    # gzip.open reads in binary mode: on Python 3 each line is
                    # bytes and must be decoded before splitting on a str
                    # separator. On Python 2 the line is already a str.
                    if not isinstance(line, str):
                        line = line.decode('utf-8')

                    # The fourth space-separated field holds the quoted URL
                    # followed by a comma; strip the leading quote and the
                    # trailing quote + comma.
                    yield line.split(' ')[3][1:-2]
                    num_urls += 1
        finally:
            # Runs on normal completion, on parse errors, and when the
            # generator is closed early, so no index file is left behind.
            os.unlink(filename)

        if num_urls == max_urls:
            break

# Script body: configure logging, then stream the sampled Common Crawl URLs
# into "urls.txt" in this file's directory, one URL per line.
logging.basicConfig(level=logging.INFO)
# Raise the boto3/botocore loggers to WARN so their INFO records are dropped.
for _logger_name in ('boto3', 'botocore'):
    logging.getLogger(_logger_name).setLevel(logging.WARN)
filename = _here('urls.txt')
max_urls = 10000000
# Pre-format the count with thousands separators; the emitted message is the
# same as before, without mixing str.format and %-style placeholders.
log.info('Writing %s URLs to %s', '{:,}'.format(max_urls), filename)
with open(filename, 'w') as fp:
    # Pass the limit explicitly so the logged count and the real cap cannot
    # drift apart if either value is edited later.
    for url in get_common_crawl_urls(max_urls=max_urls):
        print(url, file=fp)