xitij2000
3/9/2014 - 8:40 PM

A script to download videos from EdX using aria2, automatically placing them in a clean folder hierarchy.

# A script that can automatically download videos from EdX.
# Currently this is heavily tied to the way my EdX account and my computer are
# set up. It downloads by sending the download URL and download directory
# to aria2 running in RPC mode.
# More info here: http://aria2.sourceforge.net/manual/en/html/aria2c.html#rpc-interface
# You can use http://ziahamza.github.io/webui-aria2/ to see download progress.
# For now, parameters such as username, password, and which course to download
# are provided in the script.
# I intend to make it more flexible.
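# Note: aria2 must already be running in RPC mode on its default port,
# e.g. started with: aria2c --enable-rpc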
from __future__ import print_function

import cPickle as pickle
import xmlrpclib

import requests
from bs4 import BeautifulSoup


class Edx(object):
    BASE_URL = 'https://courses.edx.org'
    LOGIN_URL = BASE_URL + '/login_ajax'
    DASHBOARD_URL = BASE_URL + '/dashboard'

    def __init__(self, email, password):
        self.__session = requests.Session()
        s = self.__session
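        # GET the login page first so the server sets the CSRF cookie,
        # which we echo back below in the X-CSRFToken header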
        token = s.get(self.LOGIN_URL).cookies['csrftoken']
        r = s.post(self.LOGIN_URL, data={
            'email': email,
            'password': password,
            'remember': False,
        }, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24'
                          '.0) Gecko/20100101 Firefox/24.0',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
            'Referer': self.LOGIN_URL,
            'X-Requested-With': 'XMLHttpRequest',
            'X-CSRFToken': token,
        })

        self.soup = BeautifulSoup(s.get(self.DASHBOARD_URL).text, 'html.parser')
        self.courses = {}
        try:
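            # Reuse previously scraped course data if the cache file exists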
            self.courses = pickle.load(open('data.p', 'rb'))
        except IOError:
            print("Cached data not available, downloading from scratch")

    def get_courses(self):
        if self.courses == {}:
            print("Loading Course Data")
            for course in self.soup.find_all('li', class_='course-item'):
                title = course.find('h3').text.strip()
                url = self.BASE_URL + course.article.a['href'].strip()
                self.courses[title] = {'url': url}
            for course in self.courses:
                print("Loading info for Course: %s" % course)
                self.get_course_info(course)
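        # Cache the scraped data so later runs can skip re-downloading it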
        pickle.dump(self.courses, open('data.p', 'wb'))
        return self.courses

    def get_course_info(self, course):
        course_data = self.courses.get(course)
        if 'chapters' in course_data:
            return course_data['chapters']
        s = self.__session
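        # The dashboard links to each course's 'info' page; swapping in
        # 'courseware' gives the page with the chapter list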
        r = s.get(course_data['url'].replace('info', 'courseware'))
        soup = BeautifulSoup(r.text, 'html.parser')
        chapters = {}
        for chapter in soup.find_all('div', class_='chapter'):
            title = chapter.h3.a.text.strip()
            url = self.BASE_URL + chapter.ul.li.a['href']
            chapters[title] = {'url': url, 'sections': {}}
            for section in chapter.find_all('li'):
                section_title = section.a.p.text.strip()
                section_url = self.BASE_URL + section.a['href']
                chapters[title]['sections'][section_title] = {
                    'url': section_url}
        course_data['chapters'] = chapters
        for chapter in chapters:
            self.get_chapter_info(course, chapter)
        return chapters

    def get_chapter_info(self, course, chapter):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        for section in chapter_data['sections']:
            self.get_section_info(course, chapter, section)
        return chapter_data

    def get_section_info(self, course, chapter, section):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        section_data = chapter_data['sections'][section]
        s = self.__session
        r = s.get(section_data['url'])
        soup = BeautifulSoup(r.text, 'html.parser')
        videos = []
        for content in soup.find_all('div', class_='seq_contents'):
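            # Each seq_contents div holds escaped HTML for one unit,
            # so its text has to be parsed a second time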
            resoup = BeautifulSoup(content.text, 'html.parser')
            for video in resoup.find_all('li', class_='video-sources'):
                videos.append(video.a['href'])

        section_data['videos'] = videos
        return section_data

    def download_all_from_section(self, course, chapter, section):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        section_data = chapter_data['sections'][section]
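        # Assumes aria2 is already listening on its default RPC port, 6800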
        s = xmlrpclib.ServerProxy("http://localhost:6800/rpc")
        for i, video in enumerate(section_data['videos']):
            s.aria2.addUri([video], {'dir': '~/Studies/EdX/'
                                            + course + '/'
                                            + chapter + '/'
                                            + section + '/'
                                            + str(i) + '/'})

    def download_all_from_chapter(self, course, chapter):
        course_data = self.courses.get(course)
        chapter_data = course_data['chapters'][chapter]
        for section in chapter_data['sections']:
            self.download_all_from_section(course, chapter, section)

    def download_all_from_course(self, course):
        course_data = self.courses.get(course)
        for chapter in course_data['chapters']:
            self.download_all_from_chapter(course, chapter)

    def download_all(self):
        for course in self.courses:
            self.download_all_from_course(course)


e = Edx('email', 'password')
e.get_courses()

# Use the full course title as it appears on the dashboard,
# e.g. 'Louv1.01x Paradigms of Computer Programming'
e.download_all_from_course('full course name')
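
# Finer-grained downloads are also possible; chapter and section names must
# match the titles scraped from the courseware page:
#   e.download_all_from_chapter('full course name', 'chapter title')
#   e.download_all_from_section('full course name', 'chapter title', 'section title')
# or grab everything on the dashboard at once:
#   e.download_all()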