# string_search.py (forked from OPENDAP/pydmr)
import os.path
import time
import concurrent.futures

import requests
import regex as re

import cmr
import errLog

verbose = False
vVerbose = False
search_string = ""
divider = "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
todo = 0
done = 0


def get_provider_collections(provider):
    """
    Retrieves all the collections for a given provider.
    :param provider: the provider id
    :return: a dict of collection concept ids (CCIDs) mapped to titles
    """
    entries = {}
    try:
        # Get the collections for a given provider - this provides the CCID and title
        entries = cmr.get_provider_collections(provider, opendap=True, pretty=True)
    except cmr.CMRException as e:
        err = "/////////////////////////////////////////////////////\n"
        err += "CMRException : string_search.py::get_provider_collections() - " + e.message + "\n"
        errLog.output_errlog(err)
        print(e)
    except Exception as e:
        print(e)
    # 'entries' is initialized above, so an exception no longer leaves it unbound
    return entries
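

# Usage sketch (illustrative, not part of the original module): the returned
# dict maps collection concept ids to titles, so callers typically iterate it as
#   for ccid, title in get_provider_collections("ORNL_CLOUD").items():
#       print(ccid, title)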


def search(ccid, title):
    """
    Gets the first and last granules for a collection, collects any related urls
    that use https, then requests each url's '.dmrpp' file and searches the
    response text for the global search string.
    :param ccid: collection concept id
    :param title: collection title
    :return: a dict mapping the ccid to a list of (url, True) tuples for matches
    """
    update_progress()
    results = []
    try:
        first_last_dict = cmr.get_collection_granules_umm_first_last(ccid, pretty=True)
    except cmr.CMRException:
        return {ccid: [(title, "error")]}
    for gid, granule_tuple in first_last_dict.items():
        if re.search('https://opendap.earthdata.nasa.gov/collections/', granule_tuple[1]):
            entries = cmr.get_related_urls(ccid, granule_tuple[0], pretty=True)
            url_list = []
            for url in entries:
                if vVerbose:
                    print("entries.url: " + entries[url])
                if re.search('https', entries[url]):
                    url_list.append(entries[url])
                    # write_to_file(entries[url])
            for url_address in url_list:
                if vVerbose:
                    print("\turl_address: " + url_address[0:10])
                if url_address != "":
                    ext = '.dmrpp'
                    full_url = url_address + ext
                    try:
                        r = requests.get(full_url)
                        if re.search(search_string, r.text):
                            if vVerbose:
                                print("\t\tfound: true")
                            results.append((full_url, True))
                    # Ignore the exception; the url is simply not recorded as a match
                    except requests.exceptions.InvalidSchema:
                        pass
                    except requests.exceptions.ConnectionError:
                        err = "/////////////////////////////////////////////////////\n"
                        err += "ConnectionError : string_search.py::search() - " + full_url + "\n"
                        errLog.output_errlog(err)
    return {ccid: results}
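

# Return shape (illustrative; the CCID and url are made up): on success,
#   {"C1234-PROV": [("https://opendap.earthdata.nasa.gov/collections/.../granule.nc4.dmrpp", True)]}
# while a CMR error yields {"C1234-PROV": [(title, "error")]}, which run_search()
# filters out with its rTuple[1] != "error" test.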


def write_to_file(url):
    """
    Appends the provided text to the day's url export file.
    :param url: provided text
    :return:
    """
    # Open in append mode: 'a' creates the file if it does not already exist,
    # so no existence check is needed, and 'with' closes the file automatically
    path = "Exports/" + time.strftime("%m.%d.%y") + "_dmrpp_urls.txt"
    with open(path, 'a') as file:
        file.write(url + "\n")


def run_search(providers, search_str, concurrency, workers, ver, very):
    """
    Entry point for the search functionality.
    :param providers: list of providers to run the string search on
    :param search_str: the string to be searched for
    :param concurrency: flag to use threads or not
    :param workers: number of threads to use if concurrency is true
    :param ver: verbose flag
    :param very: very verbose flag
    :return:
    """
    global verbose, vVerbose, search_string, todo, done
    verbose = ver
    vVerbose = very
    search_string = search_str
    with open('Exports/' + time.strftime("%m.%d.%y") + '_' + search_str + '_search.txt', 'w') as file:
        pros = len(providers)
        pro_done = 1
        for provider in providers:
            # if provider == "ORNL_CLOUD":  # Add me to test a single provider; make sure to TAB all lines below
            print("[ " + str(pro_done) + " / " + str(pros) + " ] searching " + provider
                  + " files for \'" + search_string + "\'")
            file.write(f'{divider}\nProvider: {provider}\n\n')
            collections = get_provider_collections(provider)
            todo = len(collections)
            done = 0
            results = dict()
            if concurrency:
                with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
                    result_list = executor.map(search, collections.keys(), collections.values(), timeout=300)
                    # map() raises TimeoutError while its results are iterated,
                    # not when it is called, so the try must wrap this loop
                    try:
                        for result in result_list:
                            results = cmr.merge_dict(results, result)
                    except concurrent.futures.TimeoutError:
                        print("This took too long...")  # Otherwise it suspends indefinitely.
                    except Exception as exc:
                        print(f'Exception: {exc}')
            else:
                for ccid, title in collections.items():
                    found = search(ccid, title)
                    results = cmr.merge_dict(results, found)
            print('\n')
            for ccid, result in results.items():
                # print(ccid + "\nresults: " + str(len(result)))
                for rTuple in result:
                    # print("tuple: " + rTuple[0] + " : " + str(rTuple[1]))
                    if rTuple[1] != "error":
                        if verbose:
                            print("\t" + str(rTuple[1]) + "\n\t\t" + ccid + "\n\t\t" + rTuple[0])
                        if rTuple[1] is True:
                            file.write(f'\t {ccid}: {rTuple[0]}\n\n')
            # end "if provider == ..." /!\ DO NOT TAB SHIFT PAST THIS LINE /!\
            pro_done += 1
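

# Output sketch (illustrative values): given the f-strings above, each provider
# section of the export file looks roughly like
#   =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#   Provider: ORNL_CLOUD
#
#        C1234-PROV: https://opendap.earthdata.nasa.gov/collections/.../granule.nc4.dmrpp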


def run_url_finder(providers, concurrency, workers, ver, very):
    """
    Entry point for the url finder functionality.
    :param providers: list of providers to run the url finder on
    :param concurrency: flag to use threads or not
    :param workers: number of threads to use if concurrency is true
    :param ver: verbose flag
    :param very: very verbose flag
    :return:
    """
    global verbose, vVerbose, todo, done
    verbose = ver
    vVerbose = very
    pros = len(providers)
    pro_done = 1
    for provider in providers:
        print("[ " + str(pro_done) + " / " + str(pros) + " ] searching " + provider + " for urls")
        collections = get_provider_collections(provider)
        todo = len(collections)
        done = 0
        if concurrency:
            with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
                # The map() results are never consumed; find() writes directly to
                # a file, and the 'with' block waits for the workers to finish
                executor.map(find, collections.keys(), collections.values())
        else:
            for ccid, title in collections.items():
                find(ccid, title)
        print('\n')
        pro_done += 1


def find(ccid, title):
    """
    Retrieves the first and last granules for a collection, then checks the
    granule urls and saves those that match the OPeNDAP collections url to a file.
    :param ccid: collection concept id
    :param title: collection title
    :return:
    """
    update_progress()
    try:
        first_last_dict = cmr.get_collection_granules_umm_first_last(ccid, pretty=True)
    except cmr.CMRException:
        return
    for gid, granule_tuple in first_last_dict.items():
        if re.search('https://opendap.earthdata.nasa.gov/collections/', granule_tuple[1]):
            write_to_file(granule_tuple[1])


def update_progress():
    """
    Updates the progress counter and redraws the progress bar on the terminal.
    :return:
    """
    global done, todo
    # Note: '+=' on a global is not atomic, so with threads the count can drift
    # slightly; that is acceptable for a purely cosmetic progress display
    done += 1
    print_progress(done, todo)


def print_progress(amount, total):
    """
    Outputs the progress bar to the terminal.
    :param amount: number of collections processed so far
    :param total: total number of collections to process
    :return:
    """
    # Guard against a provider with zero collections to avoid dividing by zero
    percent = amount * 100 / total if total else 100
    msg = "\t" + str(round(percent, 2)) + "% [ " + str(amount) + " / " + str(total) + " ] "
    print(msg, end="\r", flush=True)
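

# A minimal sketch of how this module might be driven directly; the provider id
# and search string below are illustrative values, not part of the original code.
if __name__ == "__main__":
    # run_search() and write_to_file() write into Exports/, so make sure it exists
    os.makedirs("Exports", exist_ok=True)
    # Single-threaded, verbose search of one provider's .dmrpp files
    run_search(["ORNL_CLOUD"], "dmrpp", concurrency=False, workers=1, ver=True, very=False)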