forked from andrewf/pcap2har
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pagetracker.py
128 lines (122 loc) · 4.25 KB
/
pagetracker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
class Page(object):
'''
Members:
* pageref
* url = string or None
* root_document = entry or None
* startedDateTime
* user_agent = string, UA of program requesting page
* title = url
* referrers = set([string]), urls that have referred to this page, directly
or indirectly. If anything refers to them, they also belong on this page
* last_entry = entry, the last entry to be added
'''
def __init__(self, pageref, entry, is_root_doc=True):
'''
Creates new page with passed ref and data from entry
'''
# basics
self.pageref = pageref
self.referrers = set()
self.startedDateTime = entry.startedDateTime
self.last_entry = entry
self.user_agent = entry.request.msg.headers.get('user-agent')
# url, title, etc.
if is_root_doc:
self.root_document = entry
self.url = entry.request.url
self.title = self.url
else:
# if this is a hanging referrer
if 'referer' in entry.request.msg.headers:
# save it so other entries w/ the same referrer will come here
self.referrers.add(entry.request.msg.headers['referer'])
self.url = None # can't guarantee it's the referrer
self.title = 'unknown title'
def has_referrer(self, ref):
'''
Returns whether the passed ref might be referring to an url in this page
'''
return ref == self.url or ref in self.referrers
def add(self, entry):
'''
Adds the entry to the page's data, whether it likes it or not
'''
self.last_entry = entry
self.referrers.add(entry.request.url)
def json_repr(self):
return {
'id': self.pageref,
'startedDateTime': self.startedDateTime.isoformat() + 'Z',
'title': self.title,
'pageTimings': default_page_timings
}
default_page_timings = {
'onContentLoad': -1,
'onLoad': -1
}
def is_root_document(entry):
'''
guesses whether the entry is from the root document of a web page
'''
# guess based on media type
mt = entry.response.mediaType
if mt.type == 'text':
if mt.subtype in ['html', 'xhtml', 'xml']:
# probably...
return True
return False
class PageTracker(object):
'''
Groups http entries into pages.
Takes a series of http entries and returns string pagerefs. Divides them
into pages based on http referer headers (and maybe someday by temporal
locality). Basically all it has to do is sort entries into buckets by any
means available.
'''
def __init__(self):
self.page_number = 0 # used for generating pageids
self.pages = [] # [Page]
def getref(self, entry):
'''
takes an Entry and returns a pageref.
Entries must be passed in by order of arrival
'''
# extract interesting information all at once
req = entry.request # all the interesting stuff is in the request
referrer = req.msg.headers.get('referer')
user_agent = req.msg.headers.get('user-agent')
matched_page = None # page we added the request to
# look through pages for matches
for page in self.pages:
# check user agent
if page.user_agent and user_agent:
if page.user_agent != user_agent:
continue
# check referrers
if referrer and page.has_referrer(referrer):
matched_page = page
break
# if we found a page, return it
if matched_page:
matched_page.add(entry)
return matched_page.pageref
else:
# make a new page
return self.new_ref(entry)
def new_ref(self, entry):
'''
Internal. Wraps creating a new pages entry. Returns the new ref
'''
new_page = Page(
self.new_id(),
entry,
is_root_document(entry))
self.pages.append(new_page)
return new_page.pageref
def new_id(self):
result = 'page_%d' % self.page_number
self.page_number += 1
return result
def json_repr(self):
return sorted(self.pages)