-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_esys_data.py
More file actions
executable file
·136 lines (115 loc) · 4.34 KB
/
get_esys_data.py
File metadata and controls
executable file
·136 lines (115 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
from pprint import pprint
from robobrowser import RoboBrowser
import os, os.path
import re
import time
import shutil
from zipfile import ZipFile, BadZipFile
from getpass import getpass
# Regular expressions selecting which link targets are downloaded.
# Each entry is compiled with re.compile() and used as the href filter when
# collecting links from a page. The dot is escaped so that only a literal
# ".pdf" / ".zip" suffix matches (an unescaped "." would match any character).
file_types = [
    r"\.pdf$",
    r"\.zip$",
]
# List of (url, target_dir) tuples.
# The page at each url is fetched and every link on it that matches one of
# the file type patterns is downloaded into ./target_dir (relative to the
# current working directory).
source_list = [
    ("https://www.kth.se/social/course/IL2206/subgroup/ht-2014-50315/page/lectures-67/",
     "lectures"),
    ("https://www.kth.se/social/course/IL2206/subgroup/ht-2014-50315/page/tutorials-21/",
     "tutorials"),
    ("https://www.kth.se/social/course/IL2206/subgroup/ht-2014-50315/page/laboratories-5/",
     "labs"),
    ("https://www.kth.se/social/course/IL2206/page/exercise-collection-2/",
     "exc"),
]
def credentials():
    """
    Interactively ask the user for the kth.se login data.

    The name is read with input() (echoed), the password with getpass().

    :returns: tuple (name, password)
    """
    # Tuple elements are evaluated left to right, so the name prompt is
    # shown before the password prompt.
    return input("name?:"), getpass("pw?:")
def do_login(browser, credential_provider=credentials):
    """
    Log the given browser session in to kth.se.

    Opens https://login.kth.se, fills the "username" and "password" fields
    of the form found there and submits it.

    :param browser: browser session used for the login
    :param credential_provider: function that returns a tuple of (name,
                                password) for logging in
    :raises ValueError: if the page shown after submitting contains an <h2>
                        whose text matches "Försök igen"
    """
    browser.open("https://login.kth.se")
    login_form = browser.get_form()

    # Credentials are requested only after the login page has been loaded.
    username, password = credential_provider()
    login_form["username"] = username
    login_form["password"] = password
    browser.submit_form(login_form)

    retry_heading = browser.find("h2", text=re.compile(".*Försök igen.*"))
    if retry_heading is not None:
        raise ValueError("Wrong username or Password!")
def headers_for_file(url, path):
    """
    Build the request headers for a conditional download of a file.

    :param url: address of the remote file (not used by this function)
    :param path: local file whose modification time is put into the header
    :returns: {"if-modified-since": <modification time as GMT date string>},
              or an empty dict if an OSError occurs (e.g. the file does not
              exist yet)
    """
    date_format = '%a, %d %b %Y %H:%M:%S GMT'
    try:
        modified = time.gmtime(os.path.getmtime(path))
        return {"if-modified-since": time.strftime(date_format, modified)}
    except OSError:
        return {}
def extract_zip(path):
    """
    Extract a zip archive into a sibling directory named after the archive.

    The target directory is path with its last four characters (".zip")
    removed. Any previous content of that directory is discarded before
    extraction, but only after the archive has been opened successfully, so
    that a corrupt download does not destroy an earlier good extraction.

    A BadZipFile error is reported on stdout and not propagated.

    :param path: path to the archive; expected to end in ".zip"
    """
    zipdir = path[:-len(".zip")]
    try:
        with ZipFile(path) as z:
            # The archive is readable: now it is safe to drop stale content.
            shutil.rmtree(zipdir, ignore_errors=True)
            os.makedirs(zipdir, exist_ok=True)
            z.extractall(zipdir)
    except BadZipFile as e:
        print("ERROR: Could not extract ", path)
        print("ERROR:", e)
def retrieve_links(browser, links, target_dir):
    """
    Downloads the given links into target_dir

    Each link is requested with an "if-modified-since" header derived from
    the local copy (if any). A 304 response leaves the local file untouched;
    a successful (2xx) response replaces the local file and, for ".zip"
    files, extracts it. Any other status is reported and nothing is written,
    so that an error page is never stored under the document's file name.

    :param browser: browser session; must currently show the page the links
                    were taken from (it is navigated back after each link)
    :param links: list of <a href="..."> tag-objects
    :param target_dir: path to local storage
    """
    for link in links:
        url = link["href"]
        path = os.path.join(target_dir, os.path.basename(url))
        browser.follow_link(link, headers=headers_for_file(url, path))
        status = browser.response.status_code
        if status == 304:  # "HTTP: Not Modified"
            print("up to date: ", url)
        elif not 200 <= status < 300:
            # e.g. 403/404/500: the body is an error page, not the file.
            print("ERROR: HTTP status", status, "for", url)
        else:
            print("downloaded: ", url)
            try:
                os.remove(path)
            except OSError:
                pass  # no such file
            with open(path, "wb") as f:
                f.write(browser.response.content)
            if path.endswith(".zip"):
                extract_zip(path)
        # Return to the listing page before following the next link.
        browser.back()
def get_files(browser, pages, types):
    """
    Download every file of the given types that is linked from the pages.

    For each page the target directory is created if necessary, the page is
    opened in the browser, and all links whose href matches one of the
    patterns are handed to retrieve_links().

    :param browser: browser session used to open the pages
    :param pages: list of tuples (url, target_dir)
    :param types: list of regex-strings to match file urls on all pages
    """
    for page_url, destination in pages:
        os.makedirs(destination, exist_ok=True)
        browser.open(page_url)
        # Links are collected pattern by pattern, in the order of `types`.
        matching = [
            anchor
            for pattern in types
            for anchor in browser.get_links(href=re.compile(pattern))
        ]
        retrieve_links(browser, matching, destination)
def main(*args):
    """
    Script entry point: log in to kth.se and download all course files.

    :param args: command line arguments; accepted but not used
    """
    session = RoboBrowser()
    do_login(session, credential_provider=credentials)
    get_files(session, pages=source_list, types=file_types)
if __name__ == "__main__":
    import sys

    # The full argv (including the script name) is forwarded to main().
    main(*sys.argv)