Initial check in.

asvoboda · Sep 16, 2011 · 85b1baf · 85b1baf
commit 85b1baf
Show file tree

Hide file tree

Showing 4 changed files with 418 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,36 @@
+# Initially from http://help.github.com/ignore-files/
+# Compiled source #
+###################
+*.com
+*.class
+*.dll
+*.exe
+*.o
+*.so
+*.pyc
+
+# Packages #
+############
+# it's better to unpack these files and commit the raw source
+# git has its own built in compression methods
+*.7z
+*.dmg
+*.gz
+*.iso
+*.jar
+*.rar
+*.tar
+*.zip
+
+# Logs and databases #
+######################
+*.log
+*.sql
+*.sqlite
+
+# OS generated files #
+######################
+.DS_Store?
+ehthumbs.db
+Icon?
+Thumbs.db
diff --git a/d2d.py b/d2d.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+d2d.py
+
+Created by Stephen Holiday on 2011-09-15.
+Copyright (c) 2011 Stephen Holiday. All rights reserved.
+
+
+# desire2download #
+[Stephen Holiday](http://stephenholiday.com)
+
+d2d is a tool to download all of the content from the University of Waterloo's
+new learning management system which uses Desire2Learn instead of the old Angel
+based UWACE.
+
+d2d was inspired by Jamie Wong's fabulous [UWAngel-CLI](https://github.com/phleet/UWAngel-CLI)
+written in Ruby.
+
+d2d is somewhat hacky and has not been tested extensively. If you do find a bug,
+please [let me know](mailto:[email protected])
+
+## Usage ##
+Using d2d is easy:
+    ./d2d.py --username scholida     
+    Password: 
+    Logging In...
+    Logged In
+    Finding courses...
+    ECE 224 - Fall 2011
+     + ECE 224 - Fall 2011/Labs/Lab Tools Tutorial.html (1.70K)
+     + ECE 224 - Fall 2011/Labs/Lab 1/lab1_checklist-s2010.pdf (107.65K)
+     
+    ...
+
+
+d2d will not download a file if it has already been saved.
+
+"""
+
+import getopt
+
+from desire2download import Desire2Download
+from getpass import getpass
+import sys
+# Python 2 only: reload(sys) makes sys.setdefaultencoding() available again,
+# which is then used to switch the default string encoding to UTF-8.
+reload(sys) 
+sys.setdefaultencoding("utf-8")
+
+help_message = '''
+Desire2Download
+===============
+
+Download all of the content from the University of Waterloo's
+new learning management system which uses Desire2Learn instead of the old Angel
+based UWACE.
+
+d2d was inspired by Jamie Wong's fabulous [UWAngel-CLI](https://github.com/phleet/UWAngel-CLI)
+written in Ruby.
+
+d2d is somewhat hacky and has not been tested extensively. If you do find a bug,
+please [let me know](mailto:[email protected])
+
+Using d2d is easy:
+
+    ./d2d.py --username scholida     
+    Password: 
+    Logging In...
+    Logged In
+    Finding courses...
+    ECE 224 - Fall 2011
+     + ECE 224 - Fall 2011/Labs/Lab Tools Tutorial.html (1.70K)
+     + ECE 224 - Fall 2011/Labs/Lab 1/lab1_checklist-s2010.pdf (107.65K)
+     
+    ...
+    
+d2d will not download a file if it has been already saved.
+
+
+Other Options:
+    -h                          This help message
+    -u, --username [username]   set your username
+    -p, --password [password]   set your password
+'''
+
+class Usage(Exception):
+    """Raised to abort option parsing and report a message to the user.
+
+    The message is kept on ``msg`` and is also passed to ``Exception`` so
+    that ``str(err)`` and tracebacks show it instead of an empty string.
+    """
+
+    def __init__(self, msg):
+        super(Usage, self).__init__(msg)
+        self.msg = msg
+
+
+def main(argv=None):
+    """Parse the command line, log in and download every course found.
+
+    Args:
+        argv: Argument list including the program name; defaults to
+            ``sys.argv``.
+
+    Returns:
+        2 when option parsing fails or help is requested (the message is
+        written to stderr); None after a completed run.
+    """
+    if argv is None:
+        argv = sys.argv
+    try:
+        try:
+            # "u:" and "p:" each take a value; "h" and "v" are plain flags.
+            opts, _args = getopt.getopt(argv[1:], "hu:p:v", ["help", "username=", "password="])
+        except getopt.error as msg:
+            raise Usage(msg)
+
+        username = None
+        password = None
+
+        # option processing ("-v" is accepted but currently has no effect)
+        for option, value in opts:
+            if option in ("-h", "--help"):
+                raise Usage(help_message)
+            if option in ("-u", "--username"):
+                username = value
+            if option in ("-p", "--password"):
+                password = value
+
+        # Prompt for whatever was not supplied on the command line.
+        if username is None:
+            username = raw_input('Username: ')
+        if password is None:
+            password = getpass()
+
+        # Start the actual work
+        d2d = Desire2Download(username, password)
+
+        d2d.login()
+        links = d2d.get_course_links()
+        for link in links:
+            print(link.text)
+            document_tree = d2d.get_course_documents(link)
+            d2d.download_tree(document_tree, [link.text])
+
+    except Usage as err:
+        sys.stderr.write(sys.argv[0].split("/")[-1] + ": " + str(err.msg) + "\n")
+        sys.stderr.write("\t for help use --help\n")
+        return 2
+
+
+# When run as a script, call main() and use its return value as the exit status.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/desire2download.py b/desire2download.py
@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+desire2download.py
+
+Created by Stephen Holiday on 2011-09-15.
+Copyright (c) 2011 Stephen Holiday. All rights reserved.
+"""
+
+import re
+import os
+import urlparse
+from urllib import urlencode
+import mechanize
+import BeautifulSoup
+
+import sys
+# Python 2 only: reload(sys) makes sys.setdefaultencoding() available again,
+# which is then used to switch the default string encoding to UTF-8.
+reload(sys) 
+sys.setdefaultencoding("utf-8")
+
+class Desire2Download(object):
+    base_url = 'https://learn.uwaterloo.ca/d2l/lp/homepage/home.d2l?ou=6606'
+    cas_login = 'https://cas.uwaterloo.ca/cas/login?service=http%3a%2f%2flearn.uwaterloo.ca%2fd2l%2forgtools%2fCAS%2fDefault.aspx'
+    def __init__(self, username, password):
+        """Store the credentials and set up the mechanize browser.
+
+        Args:
+            username: Login name kept on ``self.username``.
+            password: Password kept on ``self.password``.
+        """
+        user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
+
+        browser = mechanize.Browser(factory=mechanize.RobustFactory())
+        browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
+        browser.addheaders = [('User-agent', user_agent)]
+
+        self.br = browser
+        self.username = username
+        self.password = password
+
+
+    def safe_unicode(self, obj):
+        """Convert ``obj`` to a byte string, encoding as UTF-8 if needed.
+
+        ``str(obj)`` is tried first; when that raises UnicodeEncodeError the
+        value is converted with ``unicode`` and encoded as UTF-8 instead.
+        """
+        try:
+            text = str(obj)
+        except UnicodeEncodeError:
+            # obj is unicode
+            text = unicode(obj).encode('utf-8')
+        return text
+
+    def login(self):
+        """Submit the stored credentials through the first form on the CAS page.
+
+        Opens ``cas_login``, fills the 'username' and 'password' fields and
+        submits. The response is read but not inspected, so 'Logged In' is
+        printed regardless of what the server answered.
+        """
+        print('Logging In...')
+
+        browser = self.br
+        browser.open(self.cas_login)
+        browser.select_form(nr=0)
+        for field in ('username', 'password'):
+            browser[field] = getattr(self, field)
+        browser.submit().read()
+
+        print('Logged In')
+
+    def get_course_links(self):
+        """Collect the links on the current page whose text looks like a course.
+
+        A course title is recognised by the pattern
+        ``<SUBJECT> <3-digit number> - <Term> 20<yy>``, for example
+        ``ECE 224 - Fall 2011``.
+
+        Returns:
+            list of the matching link objects yielded by ``self.br.links()``,
+            in the order they were yielded.
+        """
+        print('Finding courses...')
+        course_title = re.compile('[A-Z]+ [0-9]{3} - [A-Z][a-z]+ 20[0-9]{2}')
+        links = []
+        for link in self.br.links():
+            # A link without text (text is None) cannot be a course, and
+            # passing None to the regex would raise TypeError.
+            if link.text is None:
+                continue
+            if course_title.match(link.text) is not None:
+                links.append(link)
+        return links
+
+    def _nice_regex(self, regex, content, group):
+        """Return one group of the first match of ``regex`` in ``content``.
+
+        Args:
+            regex: Pattern passed to ``re.search``.
+            content: String to search.
+            group: Group index or name handed to ``match.group``.
+
+        Returns:
+            The requested group, or '' when the pattern does not match.
+        """
+        found = re.search(regex, content)
+        if found is None:
+            return ''
+        return found.group(group)
+
+    def convert_bytes(self, bytes):
+        '''
+            Format a byte count as a short human-readable string.
+
+            The value is converted to float and divided by the largest
+            binary unit (T, G, M, K) it reaches, with two decimals; values
+            below 1024 are reported with a 'b' suffix.
+
+            Adapted from http://www.5dollarwhitebox.org/drupal/node/84
+        '''
+        amount = float(bytes)
+        units = (
+            (1099511627776, 'T'),
+            (1073741824, 'G'),
+            (1048576, 'M'),
+            (1024, 'K'),
+        )
+        for threshold, suffix in units:
+            if amount >= threshold:
+                return '%.2f%s' % (amount / threshold, suffix)
+        return '%.2fb' % amount
+
+    def get_course_documents(self, link):
+        """Build the document tree of one course from its print/download page.
+
+        Follows ``link``, then the first link labelled 'Content', then the
+        first link whose URL matches 'print_download.d2l', and parses the
+        table with id 'z_n' on the resulting page.
+
+        Args:
+            link: Link object of the course to open.
+
+        Returns:
+            Nested dict: heading titles map to sub-dicts, document titles map
+            to their preview URL.
+
+        Raises:
+            LookupError: if the 'Content' link, the print/download link or
+                the 'z_n' table cannot be found.
+        """
+        self.br.follow_link(link)
+
+        # Only the first 'Content' link is used.
+        content_link = None
+        for candidate in self.br.links():
+            if candidate.text == 'Content':
+                content_link = candidate
+                break
+        if content_link is None:
+            raise LookupError("No 'Content' link found on the course page")
+        self.br.follow_link(content_link)
+
+        # Take the first print/download link.
+        print_dl_link = None
+        for candidate in self.br.links(url_regex='print_download.d2l'):
+            print_dl_link = candidate
+            break
+        if print_dl_link is None:
+            raise LookupError("No print/download link found on the content page")
+
+        page = self.br.follow_link(print_dl_link).read()
+        soup = BeautifulSoup.BeautifulSoup(page)
+        table = soup.find(id='z_n')
+        if table is None:
+            raise LookupError("No 'z_n' table found on the print/download page")
+
+        document_tree = {}
+        # Headings leading from the top level down to the current row.
+        path_to_root = []
+
+        # The first row is skipped (presumably a header row).
+        for row in table.findAll('tr')[1:]:
+            columns = row.findAll('td')
+            # Nesting depth is derived from the number of cells in the row.
+            depth = len(columns) - 2
+
+            # The title cell is the last one carrying class 'd_gn'.
+            cell = None
+            for column in columns:
+                if column.get('class') == 'd_gn':
+                    cell = column
+            if cell is None:
+                # No title cell in this row; nothing to record.
+                continue
+
+            cell_str = ''.join(str(part) for part in cell.contents)
+
+            if 'href=' in cell_str:
+                # A document: file it under the current chain of headings.
+                anchor = cell.a
+                icon = anchor.find('img')
+                if icon is not None:
+                    # Remove the icon so only the text remains as the title.
+                    icon.extract()
+
+                title = ''.join(str(part) for part in anchor.contents)
+
+                ou = self._nice_regex(r'\?ou=([0-9]+)', anchor['href'], 1)
+                tId = self._nice_regex(r'&tId=([0-9]+)', anchor['href'], 1)
+                link_href = 'https://learn.uwaterloo.ca/d2l/lms/content/preview.d2l?tId=%s&ou=%s' % (tId, ou)
+
+                cur_tree_node = document_tree
+                for cur_path_node in path_to_root:
+                    cur_tree_node = cur_tree_node.setdefault(cur_path_node['title'], {})
+                cur_tree_node[title] = link_href
+            else:
+                # A heading: it replaces everything at its depth or deeper.
+                cell_str = cell_str.replace('&nbsp;', '').strip()
+                cell_str = cell_str.replace('<strong>', '').replace('</strong>', '').strip()
+                path_to_root = path_to_root[:depth]
+                path_to_root.append({'heading': True, 'title': cell_str})
+
+        return document_tree
+
+    def download_tree(self, root, _path=None):
+        """Recursively download every document in ``root`` into local folders.
+
+        Args:
+            root: Nested dict; dict values are treated as sub-folders, any
+                other value as the preview URL of a document.
+            _path: List of folder names leading to ``root``; defaults to an
+                empty path (files are then saved in the current directory).
+
+        Files that already exist on disk are not downloaded again.
+        """
+        if _path is None:
+            _path = []
+
+        for k in root:
+            node = root[k]
+
+            if isinstance(node, dict):
+                self.download_tree(node, _path + [k])
+                continue
+
+            # A '/' inside a folder title must not add directory levels
+            # (escaping it as '\/' still leaves the separator in place).
+            folders = [part.replace('/', '_') for part in _path]
+            path = os.path.join(*folders) if folders else ''
+            if path and not os.path.isdir(path):
+                os.makedirs(path)
+
+            # The document itself is referenced by an iframe on the preview page.
+            page = self.br.open(node).read()
+            soup = BeautifulSoup.BeautifulSoup(page)
+            frame = soup.find('iframe')
+            src = frame.get('src') if frame is not None else None
+            if not src:
+                print(' ! %s (No embedded document found)' % k)
+                continue
+
+            # Drop the query string, then make host-less URLs absolute.
+            url_path = src.split('?')[0]
+            split = urlparse.urlsplit(url_path)
+            if split.netloc == '':
+                url = 'https://learn.uwaterloo.ca%s' % url_path
+            else:
+                url = url_path
+                url_path = split.path
+
+            # URLs under the dialogs path are skipped rather than saved.
+            if 'https://learn.uwaterloo.ca/d2l/common/dialogs/' in url:
+                continue
+
+            file_name = os.path.split(url_path)[1]
+            path_and_filename = os.path.join(path, file_name)
+
+            if os.path.isfile(path_and_filename):
+                print(' - %s (Already Saved)' % path_and_filename)
+                continue
+
+            content = self.br.open(url.replace(' ', '%20')).read()
+
+            # Binary mode so the downloaded bytes are written unmodified.
+            with open(path_and_filename, 'wb') as f:
+                f.write(content)
+
+            print(' + %s (%s)' % (path_and_filename, self.convert_bytes(len(content))))