-
Notifications
You must be signed in to change notification settings - Fork 0
/
simpledownload.py
executable file
·82 lines (69 loc) · 2.15 KB
/
simpledownload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
__author__ = 'jiayiliu'
import os
import urllib.request as ur
from html.parser import HTMLParser
class WebPage:
    """Hold the HTML source of a page loaded from a URL or a local file."""

    def __init__(self, url, isURL=True):
        """
        Create connection to read html page.

        The page text is stored in ``self.doc``. ``self.web`` holds the
        (already closed) response object for a URL, or None for a local file.

        :param url: URL or file path
        :param isURL: True for URL, False for file
        :return: Webpage class contains html source content
        """
        if isURL:
            # Read the whole body, then let the context manager close the
            # connection instead of leaving it open for the object's lifetime.
            with ur.urlopen(url) as response:
                raw = response.read()
                charset = response.headers.get_content_charset() or 'utf-8'
            self.web = response
            # Decode the bytes; str(bytes) would produce the "b'...'" repr
            # with escaped newlines rather than the actual page text.
            try:
                self.doc = raw.decode(charset, errors='replace')
            except LookupError:
                # The server announced a charset Python does not know.
                self.doc = raw.decode('utf-8', errors='replace')
        else:
            self.web = None
            # Opened with the platform default text encoding.
            with open(url, 'r') as f:
                self.doc = f.read()
class TargetHTMLParser(HTMLParser):
    """HTML parser that collects the href of every <a> tag accepted by a filter."""

    def __init__(self, is_target):
        """
        Create HTML Parser to extract download file list

        :param is_target: function to determine whether to download file;
            called with the href string, its truthiness decides inclusion
        :return:
        """
        super().__init__()
        self.download = []  # accepted links, in document order
        self.is_target = is_target

    def handle_starttag(self, tag, attrs):
        """
        Record the href of an anchor tag when the filter accepts it.

        :param tag: lower-cased tag name supplied by HTMLParser
        :param attrs: list of (name, value) pairs supplied by HTMLParser
        """
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            # A bare ``href`` with no value is reported as None; skip it
            # rather than handing a non-string to the filter.
            if attr_name != 'href' or attr_value is None:
                continue
            # Use the predicate given to the constructor, not a module global.
            if self.is_target(attr_value):
                self.download.append(attr_value)
class downloadFile():
    """Pair a download URL with the local file path it will be saved to."""

    def __init__(self, link, path='./', file=None):
        """
        initiate download file class

        :param link: download url
        :param path: download path, default ./
        :param file: download file default url file name
        """
        if file is None:
            # Default to the last segment of the URL.
            file = link.split('/')[-1]
        # Join instead of concatenating so that a directory given without a
        # trailing separator (e.g. "out") still yields "out/<file>".
        self.file = os.path.join(path, file)
        self.link = link

    def download(self):
        """
        initiate downloading

        Retrieves ``self.link`` and stores it at ``self.file``.
        """
        ur.urlretrieve(self.link, self.file)
def is_target(url):
    """
    simple function to determine whether the given link is download target

    A link qualifies when it starts with ``http`` and ends with ``pdf``.

    :param url: download link to be determined
    :return: True / False
    """
    # Non-string input (e.g. None from a value-less href) is never a target.
    if not isinstance(url, str):
        return False
    # Test the argument itself; the previous body read an unrelated
    # global called ``name``.
    return url.startswith('http') and url.endswith('pdf')
if __name__ == "__main__":
    # Load the locally saved page, collect the links the filter accepts,
    # then fetch each one into the current directory.
    page = WebPage("./temp.html", isURL=False)
    link_parser = TargetHTMLParser(is_target)
    link_parser.feed(page.doc)
    for link in link_parser.download:
        fetcher = downloadFile(link, path='./')
        fetcher.download()