scripts/get_esys_data.py at master · HWoidt/scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3

from pprint import pprint
from robobrowser import RoboBrowser
import os, os.path
import re
import time
import shutil
from zipfile import ZipFile, BadZipFile
from getpass import getpass

# List of regex strings for matching link urls. Each entry is compiled with
# re.compile() and used as the href filter when collecting links from a page.
# The dot is escaped so that only a literal ".pdf" / ".zip" suffix matches
# (an unescaped "." would also accept e.g. "notapdf").
file_types = [
        r"\.pdf$",
        r"\.zip$",
]

# List of tuples (url, target_dir).
# The page at url is fetched and all matching references are downloaded into
# ./target_dir (-> relative to the current working directory).
source_list = [
        ("https://www.kth.se/social/course/IL2206/subgroup/ht-2014-50315/page/lectures-67/",
                "lectures"),
        # NOTE: this entry used to contain two urls fused together
        # ("...lectures-67://www.kth.se/...tutorials-21/"); only the tutorials
        # page is meant here.
        ("https://www.kth.se/social/course/IL2206/subgroup/ht-2014-50315/page/tutorials-21/",
                "tutorials"),
        ("https://www.kth.se/social/course/IL2206/subgroup/ht-2014-50315/page/laboratories-5/",
                "labs"),
        ("https://www.kth.se/social/course/IL2206/page/exercise-collection-2/",
                "exc"),
]

def credentials():
        """
        Interactively ask for the kth.se login data.

        The user name is read with input(), the password with getpass().

        :returns: tuple (name, password)
        """
        username = input("name?:")
        password = getpass("pw?:")
        return username, password

def do_login(browser, credential_provider=credentials):
        """
        Log the given browser session in at login.kth.se.

        Opens the login page, fills the "username" and "password" fields of
        the form returned by get_form() and submits it.

        :param browser: browser session to log in with
        :param credential_provider: function that returns a tuple of (name,
                                    password) for logging in
        :raises ValueError: if the page shown after submitting contains an
                            <h2> whose text matches "Försök igen"
        """
        # heading that is looked for to detect a rejected login
        retry_heading = re.compile(".*Försök igen.*")

        browser.open("https://login.kth.se")
        login_form = browser.get_form()
        login_form["username"], login_form["password"] = credential_provider()
        browser.submit_form(login_form)

        if browser.find("h2", text=retry_heading) is None:
                return
        raise ValueError("Wrong username or Password!")

def headers_for_file(url, path):
        """
        Build the conditional-request headers for a possibly existing file.

        :param url: url the file is downloaded from (currently unused)
        :param path: local path of the (possibly missing) downloaded file
        :returns: {"if-modified-since": <modification time of path>} if the
                  modification time of path can be read, otherwise {}
        """
        # Function-scope import so that no change to the file's import block
        # is needed.
        from email.utils import formatdate

        try:
                mtime = os.path.getmtime(path)
                # formatdate(usegmt=True) always emits English day/month names
                # ("Sat, 01 Jan 2000 00:00:00 GMT"); strftime("%a ... %b")
                # would follow the active locale.
                return {"if-modified-since": formatdate(mtime, usegmt=True)}
        except OSError:
                # e.g. the file does not exist yet -> unconditional request
                return {}

def extract_zip(path):
        """
        Unpack a zip archive into a directory named after the archive.

        The directory name is path with its last len(".zip") characters cut
        off. An existing directory of that name is removed and recreated
        empty before extracting. A BadZipFile error is printed instead of
        being raised; the freshly created directory is left in place.

        :param path: path to the zip file
        """
        destination = path[:-len(".zip")]

        # start from a fresh, empty directory
        shutil.rmtree(destination, ignore_errors=True)
        os.makedirs(destination, exist_ok=True)

        try:
                with ZipFile(path) as archive:
                        archive.extractall(destination)
        except BadZipFile as err:
                for message in (("ERROR: Could not extract ", path), ("ERROR:", err)):
                        print(*message)


def retrieve_links(browser, links, target_dir):
        """
        Downloads the given links into target_dir

        Each link is requested with the headers from headers_for_file(), so a
        file that already exists locally is sent as a conditional request.
        The local file name is the basename of the link's href.

        - status 304: the local file is kept as is
        - status >= 400: an error is printed and nothing is written, so an
          error page never replaces (or gets extracted as) a real file
        - otherwise: the response body is written to the file; files ending
          in ".zip" are additionally unpacked with extract_zip()

        :param browser: browser session, positioned on the page the links
                        were taken from
        :param links: list of <a href="..."> tag-objects
        :param target_dir: path to local storage
        """
        for a in links:
                url = a["href"]
                path = os.path.join(target_dir, os.path.basename(url))

                browser.follow_link(a, headers=headers_for_file(url, path))
                status = browser.response.status_code

                if status == 304: # "HTTP: Not Modified"
                        print("up to date: ",url)
                elif status >= 400:
                        # client/server error: the body is an error page, not
                        # the requested file
                        print("ERROR: Could not download ", url)
                        print("ERROR: HTTP status", status)
                else:
                        print("downloaded: ",url)
                        try:
                                os.remove(path)
                        except OSError:
                                pass # no such file

                        with open(path, "wb") as f:
                                f.write(browser.response.content)
                        if path.endswith(".zip"):
                                extract_zip(path)

                # return to the page the link was found on, whatever happened
                browser.back()

def get_files(browser, pages, types):
        """
        Fetch every page in "pages" and download all links on it whose href
        matches one of "types".

        For each page the target directory is created if needed, the page is
        opened, the matching links are collected pattern by pattern (in the
        order of "types") and handed to retrieve_links().

        :param browser: browser session used for all requests
        :param pages: list of tuples (url, target_dir)
        :param types: list of regex-strings to match file urls on all pages
        """
        for page_url, destination in pages:
                os.makedirs(destination, exist_ok=True)
                browser.open(page_url)
                matching = [
                        link
                        for pattern in types
                        for link in browser.get_links(href=re.compile(pattern))
                ]
                retrieve_links(browser, matching, destination)

def main(*args):
        """
        Script entry point: create a browser session, log in and download
        everything listed in source_list that matches file_types.

        :param args: command line arguments; accepted but not used
        """
        session = RoboBrowser()
        do_login(session, credential_provider=credentials)
        get_files(session, pages=source_list, types=file_types)

# Only run the download when executed as a script, not when imported.
if __name__ == "__main__":
        import sys
        main(*sys.argv)