'''
Downloads files matching patterns from the Oxford RobotCar Dataset website.
Matt Gadd
Jan 2018
Oxford Robotics Institute, Oxford University.
'''
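# Example invocation (illustrative values only - substitute your own credentials,
# datasets file, runs file and download directory):
#   python scrape_mrgdatashare.py --username your_user --password your_pass \
#       --datasets_file datasets.csv --downloads_dir ~/Downloads \
#       --choice_sensors stereo_centre,gps --choice_runs_file runs.txt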
# imports
import argparse
import datetime
from lxml import html
import os
import requests
import tarfile
import time
from tqdm import tqdm
import math
# urls
login_url = "https://mrgdatashare.robots.ox.ac.uk/"
datasets_url = "https://robotcar-dataset.robots.ox.ac.uk/datasets/"
base_download_url = "http://mrgdatashare.robots.ox.ac.uk:80/download/?filename=datasets/"
# responses
good_status_code = 200
failed_login = "Please try again or email for support"
# filesystem
file_extension = ".tar"
downloads_dir_example = os.path.expanduser("~/Downloads")
# throttle params (seconds) - avoid overloading the server and being blocked for issuing too many requests in a short period
default_period_duration = 10 * 60
default_chunks_per_period = 1000
default_chunk_length = 1 * 1024
# download error handling params (seconds) - re-login/retry delays so transient network errors do not lose data or hammer the server
default_relogin_duration = 10 * 60
default_nb_tries_reconnection = 5
default_reconnection_duration = 10 * 60
default_choice_sensors = 'all'
default_choice_runs_file = 'all'
class Datasets:
"""Reads and provides a list of datasets to scrape via CL, input file.
Attributes:
datasets_file (string): Location of file with list of datasets to scrape.
datasets (list): List of datasets along with file patterns that can be scraped.
"""
def __init__(self, parse_args):
"""Loads dataset queries and list from CL and input file.
Args:
parse_args (list): List of input CL arguments.
"""
# check datasets file
self.datasets_file = Datasets.get_dataset_file(parse_args)
# read datasets file
self.choice_sensors = parse_args.choice_sensors.split(',')
self.choice_runs_file = parse_args.choice_runs_file
self.datasets = Datasets.get_datasets(self.datasets_file, self.choice_runs_file, self.choice_sensors)
@staticmethod
def get_dataset_file(parse_args):
"""Gets input file from CL.
Args:
parse_args (list): List of input CL arguments.
Returns:
string: Input file.
Raises:
IOError: When no input file is provided on the CL.
"""
# check datasets file
if not parse_args.datasets_file:
raise IOError("Please specify option datasets_file.")
return os.path.abspath(
parse_args.datasets_file)
@staticmethod
def get_datasets(datasets_file, choice_runs_file, choice_sensors):
"""Reads known datasets list and file patterns from input file.
Args:
datasets_file (string): Location of file with list of datasets to scrape.
Returns:
list: Known datasets list.
"""
print("reading datasets_file: " + datasets_file)
datasets = []
# choose dataset and sensor type to download
if choice_runs_file == 'all':
choice_runs = 'all'
else:
with open(choice_runs_file, 'r') as f:
choice_runs = f.read().splitlines()
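        # Assumed datasets_file format (inferred from the parsing below): one run per line,
        # comma-separated, with the run name first and its available file patterns after,
        # e.g. "2014-05-19-13-20-57,stereo_centre,vo,gps" (illustrative values).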
with open(datasets_file, "r") as file_handle:
lines = file_handle.readlines()
for line in lines:
line = line.strip("\n").split(",")
                if choice_runs == 'all' or line[0] in choice_runs:  # this run was requested
                    if choice_sensors[0] == 'all':
                        dataset = {"dataset": line[0], "file_patterns": line[1:]}
                    else:  # only a subset of sensors was requested
                        exist_sensors = []  # requested sensors available in this run
                        for exist_sensor in line[1:]:  # sensors available in this run
                            for choice_sensor in choice_sensors:  # sensors requested by the user
                                if choice_sensor in exist_sensor:
                                    exist_sensors.append(exist_sensor)
                        dataset = {"dataset": line[0], "file_patterns": exist_sensors}
                    datasets.append(dataset)
print("got num_datasets: " + str(len(datasets)))
return datasets
class Scraper:
"""Maintains login session and performs file requests.
Attributes:
username (string): RCD login username.
password (string): RCD login password.
session_requests (requests.Session): Persistent login session.
"""
def __init__(self, parse_args):
"""Sets up credentials and starts login session.
Args:
parse_args (list): List of input CL arguments.
"""
# credentials
self.username = Scraper.get_username(parse_args)
self.password = Scraper.get_password(parse_args)
        # persistent login session (a browser-like user-agent is applied to all requests)
        self.session_requests = requests.session()
        self.session_requests.headers.update({'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
# errors handling
self.relogin_duration = parse_args.relogin_duration
@staticmethod
def get_username(parse_args):
"""Retrieves account details from CL.
Args:
parse_args (list): List of input CL arguments.
Returns:
string: Username.
Raises:
IOError: If username is not provided on the CL.
"""
if not parse_args.username:
raise IOError("Please specify option username.")
return parse_args.username
@staticmethod
def get_password(parse_args):
"""Retrieves login password from CL.
Args:
parse_args (list): List of input CL arguments.
Returns:
string: Password.
Raises:
IOError: If password is not provided on the CL.
"""
if not parse_args.password:
raise IOError("Please specify option password.")
return parse_args.password
def login(self):
"""Initialises the login session using credentials.
"""
# get middleware token
csrf_middleware_token = self.get_csrf_middleware_token()
# login dictionary payload
payload = self.get_payload(csrf_middleware_token)
# perform login
self.post(payload)
def get_csrf_middleware_token(self):
"""Retrieve authentication token from login session cookies.
Returns:
string: Authentication token from cookies.
"""
# load cookies
result = self.session_requests.get(login_url)
tree = html.fromstring(result.text)
# get authentication token
csrf_middleware_token = list(
set(tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")))[0]
print("got csrf_middleware_token: " + csrf_middleware_token)
return csrf_middleware_token
def get_payload(self, csrf_middleware_token):
"""Prepare login dictionary payload.
Args:
csrf_middleware_token (string): Authentication token from cookies.
Returns:
            dict: Login credentials and CSRF token.
"""
return {"username": self.username,
"password": self.password,
"csrfmiddlewaretoken": csrf_middleware_token}
def post(self, payload):
"""Sends payload to login session (logs in).
Args:
            payload (dict): Login credentials and CSRF token.
Raises:
ValueError: If login credentials did not work.
"""
# post login data
result = self.session_requests.post(
login_url,
data=payload,
headers=dict(referer=login_url)
)
# get status code
status_code = result.status_code
# check for success
text = result.text
if status_code != good_status_code or failed_login in text:
raise ValueError("Login failed, check username and password.")
print("Logged in!")
def scrape(self, url_handler):
"""Downloads a sensor log from a particular dataset.
Args:
url_handler (URLHandler): Local file path for the sensor log to be downloaded.
"""
# make request
print("requesting file_url: " + url_handler.file_url)
result = self.session_requests.get(url_handler.file_url, stream=True)
if result.status_code != good_status_code:
raise ValueError(
"bad file_url: " +
url_handler.file_url)
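        # an HTML response (rather than a tar stream) is taken to mean the login
        # session has expired, so wait, log in again and retry the request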
while 'html' in result.headers.get('content-type', 'html'):
            print(
                "Got an HTML page instead of the file. Waiting " +
                str(self.relogin_duration) +
                " seconds before logging in again...")
time.sleep(self.relogin_duration)
self.login()
result = self.session_requests.get(
url_handler.file_url, stream=True)
# open local file
print(
"downloading local_file_path: " +
url_handler.local_file_path)
with open(url_handler.local_file_path, 'wb') as file_handle:
# iterate chunks
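            # NOTE: 'throttle' is the module-level Throttle instance created in the
            # __main__ block below, not an attribute of this class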
total_size = int(result.headers.get('content-length', 0))
for chunk in tqdm(result.iter_content(
chunk_size=throttle.chunk_length),
                    total=math.ceil(total_size / throttle.chunk_length),
unit='KB',
unit_scale=True):
# bad url/no match for sensor
if b"File not found." in chunk:
return False
# count recent chunks
throttle.count()
# filter out keep-alive new chunks
if chunk:
file_handle.write(chunk)
return True
class Throttle:
"""Forces downloads to obey a conservative limit.
Attributes:
period_duration (int): Top limit for duration of download window.
chunk_length (int): Top limit for size of chunks in bytes.
chunks_per_period (int): Top limit for chunks downloaded in one period.
num_chunks_in_period (int): Chunks downloaded in this period.
period (datetime.datetime): Timestamp.
"""
def __init__(self, parse_args):
"""Initialises download throttling window.
Args:
parse_args (list): List of input CL arguments.
"""
# length of throttling period
self.period_duration = Throttle.get_period_duration(parse_args)
# data size
self.chunk_length = Throttle.get_chunk_length(parse_args)
# throttle limit
self.chunks_per_period = Throttle.get_chunks_per_period(
parse_args)
# reset counters
self.num_chunks_in_period = 0
self.period = datetime.datetime.now()
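        # e.g. with the default settings this permits chunks_per_period (1000) chunks
        # of chunk_length (1024) bytes, i.e. roughly 1 MB, per period_duration (600 s)
        # window (figures taken from the default_* constants above)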
@staticmethod
def get_period_duration(parse_args):
"""Gets period length from CL.
Args:
parse_args (list): List of input CL arguments.
Returns:
            int: Period duration in seconds.
Raises:
IOError: If period length was not provided on the CL.
"""
if not parse_args.period_duration:
raise IOError("Please specify option period_duration.")
return parse_args.period_duration
@staticmethod
def get_chunk_length(parse_args):
"""Gets chunk size from CL.
Args:
parse_args (list): List of input CL arguments.
Returns:
            int: Chunk size in bytes.
Raises:
IOError: If chunk size was not provided on the CL.
"""
if not parse_args.chunk_length:
raise IOError("Please specify option chunk_length.")
return parse_args.chunk_length
@staticmethod
def get_chunks_per_period(parse_args):
"""Get throttle from CL.
Args:
parse_args (list): List of input CL arguments.
Returns:
            int: Maximum number of chunks per throttle period.
Raises:
IOError: If throttle was not provided on the CL.
"""
if not parse_args.chunks_per_period:
raise IOError("Please specify option chunks_per_period.")
return parse_args.chunks_per_period
def reset(self):
"""Resets throttle counters.
"""
print("resetting period throttle...")
self.num_chunks_in_period = 0
self.period = datetime.datetime.now()
def wait(self):
"""Forces the downloader to obey throttle limit by idling.
"""
# reset period count
period_seconds = self.get_period_seconds()
# needs reset
if period_seconds < 0:
self.reset()
# needs to pause
elif self.num_chunks_in_period > self.chunks_per_period:
Throttle.pause(period_seconds)
def get_period_seconds(self):
"""Computes the time remaining in this throttle window.
Returns:
int: Time remaining in this throttle window.
"""
period_seconds = self.period_duration - \
(datetime.datetime.now() - self.period).seconds
# console
print("num_chunks_in_period: " + str(self.num_chunks_in_period) +
", period_seconds: " + str(period_seconds))
return period_seconds
@staticmethod
def pause(period_seconds):
"""Idles.
"""
print(
"pausing for throttle for period_seconds: " +
str(period_seconds) +
"...")
time.sleep(period_seconds)
def count(self):
"""Increments the number of chunks retrieved in this throttle window.
"""
self.num_chunks_in_period = self.num_chunks_in_period + 1
class DatasetHandler:
"""Manages local file paths for a dataset to be downloaded.
Attributes:
downloads_dir (string): Root download directory.
dataset (string): Dataset to download.
"""
def __init__(self, parse_args, dataset):
"""Initialises local file paths for downloads concerning this dataset.
Args:
parse_args (list): List of input CL arguments.
dataset (string): Dataset to download.
"""
# root download dir
self.downloads_dir = DatasetHandler.get_downloads_dir(parse_args)
# dataset to download
self.dataset = dataset
@staticmethod
def get_downloads_dir(parse_args):
"""Gets the root download directory from the CL.
Args:
parse_args (list): List of input CL arguments.
Returns:
string: Absolute path to root downloads directory.
Raises:
IOError: If root download directory was not provided on the CL.
"""
if not parse_args.downloads_dir:
raise IOError("Please specify option downloads_dir.")
return os.path.abspath(parse_args.downloads_dir)
class Zipper:
"""Performs archiving operations on local file system.
Attributes:
dataset_handler (DatasetHandler): Local file paths for this dataset to be downloaded.
num_successful_unzipped (int): Number of successful archiving operations.
"""
def __init__(self, dataset_handler):
"""Initialises an archiver for this downloaded dataset.
Args:
dataset_handler (DatasetHandler): Local file paths for this dataset to be downloaded.
"""
self.dataset_handler = dataset_handler
self.num_successful_unzipped = 0
# unzip
def unzip(self, url_handler):
"""Extracts archive contents.
Args:
            url_handler (URLHandler): Holds the local file path of the downloaded archive.
"""
print("unzipping local_file_path: " + url_handler.local_file_path)
try:
# open tar
tar = tarfile.open(url_handler.local_file_path, "r:")
# do extraction
tar.extractall(path=self.dataset_handler.downloads_dir)
# close tar file
tar.close()
# keep track of successful archives
self.num_successful_unzipped = self.num_successful_unzipped + 1
except tarfile.ReadError:
print(
"failed when unzipping local_file_path: " +
url_handler.local_file_path)
# clear tar
os.remove(url_handler.local_file_path)
def tidy_up(self):
"""Tidies up dataset's download directory.
"""
# tidy up
print("tidying up dataset: " + self.dataset_handler.dataset)
class URLHandler:
"""Manages local file paths for a file pattern and dataset combination.
Attributes:
file_pattern (string): Sensor type / file stub.
file_url (string): URL target path.
local_file_path (string): Local file system destination path.
"""
def __init__(self, dataset_handler, file_pattern):
"""Initialises the download of one file type for this dataset.
Args:
dataset_handler (DatasetHandler): Local file paths for this dataset to be downloaded.
file_pattern (string): Sensor type to download.
"""
# file pattern
self.file_pattern = file_pattern
# URL
self.file_url = URLHandler.get_file_url(
self.file_pattern, dataset_handler)
# path to save to disk
self.local_file_path = URLHandler.get_local_file_path(
self.file_url, dataset_handler)
@staticmethod
def get_file_url(file_pattern, dataset_handler):
"""Constructs URL for sensor log.
Args:
file_pattern (string): Sensor type to download.
dataset_handler (DatasetHandler): Local file paths for this dataset to be downloaded.
Returns:
string: Internet address for the sensor log in this dataset.
"""
return base_download_url + dataset_handler.dataset + "/" + \
dataset_handler.dataset + "_" + file_pattern + file_extension
@staticmethod
def get_local_file_path(file_url, dataset_handler):
"""Constructs filesystem location for sensor log.
Args:
file_url (string): Internet address for the sensor log in this dataset.
dataset_handler (DatasetHandler): Local file paths for this dataset to be downloaded.
Returns:
string: Destination filesystem location for sensor log.
"""
        # get file name from URL
local_file_name = file_url.split('/')[-1]
# extend dataset dir
local_file_path = os.path.join(
dataset_handler.downloads_dir, local_file_name)
return local_file_path
# main routine
if __name__ == "__main__":
# console
print("ScrapeMRGDatashare is starting...")
# option parsing suite
argument_parser = argparse.ArgumentParser(
description="ScrapeMRGDatashare input parameters")
# specify CL args
argument_parser.add_argument(
"--username",
dest="username",
help="Registered username for " + login_url)
argument_parser.add_argument(
"--password",
dest="password",
help="Registered password for " + login_url)
argument_parser.add_argument(
"--datasets_file",
dest="datasets_file",
help="List of dataset names from " +
datasets_url + " - separated by new lines.")
argument_parser.add_argument(
"--downloads_dir",
dest="downloads_dir",
default=downloads_dir_example,
help="Root download directory e.g. " + downloads_dir_example)
argument_parser.add_argument(
"--period_duration",
dest="period_duration",
type=int,
default=default_period_duration,
help="Length of throttled download period in seconds e.g. " +
str(default_period_duration))
argument_parser.add_argument(
"--chunk_length",
dest="chunk_length",
type=int,
default=default_chunk_length,
help="Length of download chunks in bytes e.g. " +
str(default_chunk_length))
argument_parser.add_argument(
"--chunks_per_period",
dest="chunks_per_period",
type=int,
default=default_chunks_per_period,
help="Maximum number of chunks to download in a throttled download period e.g. " +
str(default_chunks_per_period))
argument_parser.add_argument(
"--relogin_duration",
dest="relogin_duration",
type=int,
default=default_relogin_duration,
help="Number of seconds to wait in case you are disconnected (if you don't use an academic connection) e.g. " +
str(default_relogin_duration))
argument_parser.add_argument(
"--reconnection_duration",
dest="reconnection_duration",
type=int,
default=default_reconnection_duration,
help="Number of seconds to wait if the connection is broken (if you don't use an academic connection) e.g. " +
str(default_reconnection_duration))
argument_parser.add_argument(
"--nb_tries",
dest="nb_tries_reconnection",
type=int,
default=default_nb_tries_reconnection,
help="Number of downloading tries for a file e.g. " +
str(default_nb_tries_reconnection))
argument_parser.add_argument(
"--choice_sensors",
dest="choice_sensors",
type=str,
default=default_choice_sensors,
help="choice of sensors in [tags, stereo_centre, stereo_left, stereo_right, vo, mono_left, mono_right, mono_rear, lms_front, lms_rear, ldmrs, gps, all] to download e.g. " + default_choice_sensors)
argument_parser.add_argument(
"--choice_runs_file",
dest="choice_runs_file",
type=str,
default=default_choice_runs_file,
help="choice of runs recorded in a file to download, if 'all' all runs are downloaded")
# parse CL
args = argument_parser.parse_args()
# set up datasets file
datasets = Datasets(args).datasets
# persistent login
scraper = Scraper(args)
# load cookies
scraper.login()
# start throttle
throttle = Throttle(args)
# iterate datasets
for dataset in datasets:
# apply throttle
throttle.wait()
# dataset handler
dataset_handler = DatasetHandler(args, dataset["dataset"])
# set up zipper
zipper = Zipper(dataset_handler)
# iterate file patterns
for file_pattern in dataset["file_patterns"]:
# set up URL handler
url_handler = URLHandler(dataset_handler, file_pattern)
# perform download
file_was_found = False
for i in range(args.nb_tries_reconnection):
try:
file_was_found = scraper.scrape(url_handler)
if file_was_found:
break
except requests.exceptions.ChunkedEncodingError:
                    print(
                        "Connection broken on try " + str(i + 1) + "/" +
                        str(args.nb_tries_reconnection) + ", waiting " +
                        str(args.reconnection_duration) + " seconds before retrying...")
time.sleep(args.reconnection_duration)
# unzip
if file_was_found:
zipper.unzip(url_handler)
# tidy up
zipper.tidy_up()
# console
print("ScrapeMRGDatashare is finished!")