-
Notifications
You must be signed in to change notification settings - Fork 10
/
domain.py
146 lines (123 loc) · 4.26 KB
/
domain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from __future__ import absolute_import
from __future__ import print_function
import tempfile
import codecs
import os
import six
from ipaddress import ip_address
from functools import wraps
from publicsuffix import PublicSuffixList, fetch
from six.moves import range
from six.moves.urllib.parse import urlparse
# Path of the locally cached copy of the Public Suffix List, kept in the
# system temp directory.
PSL_CACHE_LOC = os.path.join(
    tempfile.gettempdir(),
    'public_suffix_list.dat',
)
def get_psl(location=PSL_CACHE_LOC):
    """
    Returns a PublicSuffixList built from the file cached at `location`.

    If no file exists at `location`, the list is first downloaded with
    `fetch()` and written there as UTF-8. An existing file is used as-is
    and is never refreshed by this function.

    `location` defaults to PSL_CACHE_LOC.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        try:
            # Read the whole download before creating the cache file, so a
            # failed transfer cannot leave an empty file behind that later
            # calls would mistake for a valid cache.
            contents = psl_file.read()
        finally:
            psl_file.close()
        with codecs.open(location, 'w', encoding='utf8') as f:
            f.write(contents)

    # NOTE(review): closing the file right after construction relies on
    # PublicSuffixList consuming its input inside the constructor -- confirm
    # against the installed publicsuffix version.
    with codecs.open(location, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
def load_psl(function):
    """
    Decorator that supplies a `psl` keyword argument to `function`.

    When the caller passes `psl` explicitly, the call is forwarded
    unchanged. Otherwise the list is loaded with `get_psl()` on first use,
    stored on the returned wrapper as its `psl` attribute, and reused for
    every later call.
    """
    @wraps(function)
    def with_psl(*args, **kwargs):
        # An explicitly supplied list always wins over the cached one.
        if 'psl' in kwargs:
            return function(*args, **kwargs)

        # Load lazily so that decorating a function does not trigger any
        # file or network access.
        if with_psl.psl is None:
            with_psl.psl = get_psl()
        return function(*args, psl=with_psl.psl, **kwargs)

    with_psl.psl = None
    return with_psl
def is_ip_address(hostname):
    """
    Tell whether `hostname` is a valid IP address.

    `hostname` is converted to text and handed to `ip_address`; the result
    is True when that succeeds and False when it raises ValueError.
    """
    try:
        # The text conversion stays inside the try block so that any
        # ValueError it raises is treated as "not an IP address" as well.
        ip_address(six.text_type(hostname))
    except ValueError:
        return False
    else:
        return True
@load_psl
def get_ps_plus_1(url, **kwargs):
    """
    Returns the PS+1 (public suffix plus one label) of `url`.

    If the hostname of the url is a valid IP address, the IP address itself
    is returned. If the url has no hostname, None is returned.

    An (optional) PublicSuffixList object can be passed with keyword arg
    'psl', otherwise a version cached in the system temp directory is used.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")

    host = urlparse(url).hostname

    # IP addresses have no public suffix; hand them back untouched.
    if is_ip_address(host):
        return host

    # A missing hostname can mean, among other things, that `url` is:
    #   * malformed
    #   * a relative url
    #   * a `javascript:` or `data:` url
    if host is None:
        return None

    return kwargs['psl'].get_public_suffix(host)
@load_psl
def hostname_subparts(url, include_ps=False, **kwargs):
    """
    Returns a list of slices of a url's hostname down to the PS+1

    If `include_ps` is set, the hostname slices will include the public suffix

    For example: http://a.b.c.d.com/path?query#frag would yield:
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com] if include_ps == False
        [a.b.c.d.com, b.c.d.com, c.d.com, d.com, com] if include_ps == True

    Special cases:
        * if the hostname is an IP address, a single item list holding the
          IP address is returned
        * if the url has no hostname (e.g. a relative url), or its PS+1
          contains no '.', an empty list is returned

    An (optional) PublicSuffixList object can be passed with keyword arg 'psl',
    otherwise a version cached in the system temp directory is used.
    """
    if 'psl' not in kwargs:
        raise ValueError(
            "A PublicSuffixList must be passed as a keyword argument.")

    hostname = urlparse(url).hostname

    # Without a hostname there is nothing to slice. Bail out here rather
    # than passing None on to the public suffix lookup.
    if hostname is None:
        return []

    # If an IP address, just return a single item list with the IP
    if is_ip_address(hostname):
        return [hostname]

    ps_plus_1 = kwargs['psl'].get_public_suffix(hostname)

    # We expect all ps_plus_1s to have at least one '.'
    # If they don't, the url was likely malformed, so we'll just return an
    # empty list
    if '.' not in ps_plus_1:
        return []

    # Everything left of the PS+1 (and of the '.' that precedes it); this is
    # empty when the hostname is exactly the PS+1.
    prefix = hostname[:-(len(ps_plus_1) + 1)]
    subdomains = prefix.split('.') if prefix else []

    # Drop one leading label at a time, longest slice first.
    subparts = ['.'.join(subdomains[start:] + [ps_plus_1])
                for start in range(len(subdomains))]
    subparts.append(ps_plus_1)

    if include_ps:
        # ps_plus_1 is known to contain a '.', so index 1 always exists.
        subparts.append(ps_plus_1.split('.', 1)[1])
    return subparts
def get_stripped_url(url, scheme=False):
    """
    Returns a url stripped to (scheme)?+hostname+path

    Only the hostname and path components reported by `urlparse` are kept,
    so the query string and fragment are dropped. When `scheme` is true the
    result is prefixed with the url's scheme followed by '://'.

    A url without a hostname (e.g. a relative url) contributes an empty
    hostname, so only its path (plus the optional scheme prefix) is
    returned.
    """
    purl = urlparse(url)
    surl = ''
    if scheme:
        surl += purl.scheme + '://'
    # `hostname` is None when the url has no network location; substitute an
    # empty string so that such urls are stripped instead of raising
    # TypeError.
    surl += (purl.hostname or '') + (purl.path or '')
    return surl
def get_stripped_urls(urls, scheme=False):
    """
    Returns a set (or list) of urls stripped to (scheme)?+hostname+path

    Each url in `urls` is passed through `get_stripped_url` with the given
    `scheme` flag. A set is returned when `urls` is a set; any other
    iterable yields a list in iteration order.
    """
    # Collect the stripped urls themselves rather than discarding them.
    stripped = [get_stripped_url(url, scheme) for url in urls]
    if isinstance(urls, set):
        return set(stripped)
    return stripped