-
Notifications
You must be signed in to change notification settings - Fork 0
/
skua.py
executable file
·102 lines (87 loc) · 3.06 KB
/
skua.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
"""
skua is a tool to download data, such as pictures or videos, from websites.
Usage information is shown by running: ./skua.py
More information is available at https://github.com/THLO/skua.
"""
import urllib2, requests, time, random,sys
import argparse
from collections import OrderedDict
# Version number
version = '0.0.4'
# Default values for certain parameters:
# The prefix is prepended to the original filename, resulting in the filename
# that is used when storing the downloaded file.
# By default, the prefix is the emtpy string.
defaultDestination = ''
# Only files are downloaded with a certain extension.
# By default, the allowed extension is 'jpg'.
defaultExtension = 'jpg'
# A random timeout chosen uniformly at random from the range
# [minimumTimeout,maximumTimeout] occurs between downloads.
#By default, this range is [0.5,5.0] in seconds.
defaultMinimumTimeout = 0.5
defaultMaximumTimeout = 5.0
# Site-specific parameters can be added in the form of key-value pairs.
# By default, there are no such parameters.
defaultParameter = ''
""" Helper function to test if a string is an integer. """
def isInt(string):
try:
int(string)
return True
except ValueError:
return False
"""
An example for a filter:
File containing [int]x[int] are dropped because they (often) refer to
rescaled images.
"""
def filterRescaled(items):
filteredList = []
for item in items:
pos = item.rfind('x')
if not(pos >= 0 and pos < len(item)-1 and isInt(item[pos-1]) and isInt(item[pos+1])):
filteredList.append(item)
return filteredList
"""
A simple usage message is printed by this method.
This will be replaced by argparse.
"""
def printUsage():
print "Usage: skua [URL] {prefix} {extension}"
print "* URL (mandatory): Website from which files should be downloaded."
print "* Prefix (optional): Prefix added to the files that are downloaded."
print " Default: Empty"
print "* Extension (optional): Only files with the given extension are downloaded."
print " Default: jpg"
# The first parameter is the URL
if len(sys.argv) < 2:
printUsage()
else:
url = sys.argv[1]
prefix = defaultDestination if len(sys.argv) < 3 else sys.argv[2]
extension = defaultExtension if len(sys.argv) < 4 else sys.argv[3]
minimumTimeout = defaultMinimumTimeout
maximumTimeout = defaultMaximumTimeout
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
req = urllib2.Request(url, headers=hdr)
response = urllib2.urlopen(req)
html = response.read()
pieces = html.split("\"")
items = [piece for piece in pieces if piece.endswith(extension)]
itemsNoDuplicates = list(OrderedDict.fromkeys(items))
filteredList = filterRescaled(itemsNoDuplicates)
for item in filteredList:
if item.startswith("//"):
item = "https:"+item
print "Downloading "+item+"..."
try:
data = requests.get(item)
itemNameParts = item.split("/")
f = open(prefix+itemNameParts[-1], "wb")
f.write(data.content)
f.close()
time.sleep(random.uniform(minimumTimeout,maximumTimeout))
except:
pass