# lookups.py
# Functions which get data from other APIs.
# General policy: we don't return complete data structures,
# just the fields we're using, so that creating test versions
# is easier.
import requests
import json
import datetime
import re
import collections
import csv
import io
import app
import elections
import boto.s3.connection
import boto.s3.key
import boto.utils
###################################################################
# General helpers
conn = None
def _get_s3_bucket(config):
global conn
if not conn:
conn = boto.s3.connection.S3Connection(
config.get('S3_ACCESS_KEY_ID'),
config.get('S3_SECRET_ACCESS_KEY')
)
bucket_name = config.get('S3_BUCKET_NAME')
bucket = conn.get_bucket(bucket_name)
return bucket
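# Note: the S3 connection is cached in the module-level `conn` global above, so
# repeated calls reuse a single connection and only the bucket lookup runs each time.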
###################################################################
# Democracy APIs
# Takes a postcode and returns a dict with:
# error - with a user friendly message, if the lookup failed
# id - the mySociety identifier of the constituency
# name - the text name of the constituency
# postcode - the canonicalised postcode
def lookup_postcode(postcode):
canon_postcode = postcode.upper().strip().replace(" ", "")
if canon_postcode in ['ZZ99ZZ']:
return { 'id': "8888888", 'name': "Democracy Club Test Constituency", 'postcode': 'ZZ9 9ZZ' }
headers = {"user-agent": "Democracy Club CVs/1.0"}
try:
data = requests.get("https://elections.democracyclub.org.uk/api/elections/", params={'postcode':canon_postcode}, headers=headers).json()
except json.decoder.JSONDecodeError:
return { "error": "Postcode is not valid." }
# Error response method varies, so we check three different ways
if "error" in data:
return data
if "detail" in data:
if data["detail"] == "Invalid postcode":
return { "error": "Postcode is not valid." }
return { "error": data["detail"] }
if "results" not in data:
return { "error": "Postcode not properly recognised" }
for election in data["results"]:
if election["group"] == elections.current_election:
if election["division"]["division_type"] != "WMC":
return { "error": "Internal error: Unexpectedly not Westminster election" }
constituency_id = election["division"]["official_identifier"]
return {
'id': constituency_id,
'name': election["division"]["name"],
'postcode': canon_postcode
}
return { "error": "Internal error: Election not found" }
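# Illustrative usage (not executed here). The test postcode is handled locally;
# real postcodes go to the Democracy Club elections API:
#   lookup_postcode("zz9 9zz")
#   # => {'id': '8888888', 'name': 'Democracy Club Test Constituency', 'postcode': 'ZZ9 9ZZ'}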
# Returns a pair of hashes of data from Democracy Club Candidates.
# by_candidate_id - maps from person id to dictionary about candidate
# by_constituency_id - maps from constituency id to array of dictionaries about candidate
#
# The fields in the dictionaries about each candidate are:
# id - the mySociety person_id of the candidate
# name - name of the candidate
# email - email address of the candidate (if known)
# twitter - Twitter account name of the candidate (if known)
# linkedin_url - LinkedIn page of the candidate (if known)
# party - political party name of the candidate
# constituency_id - identifier of constituency
# constituency_name - name of constituency
@app.cache.memoize(60 * 60)
def _hashes_of_candidates(config):
print("warming cache _hashes_of_candidates")
by_candidate_id = {}
by_constituency_id = collections.defaultdict(list)
rows = _fetch_candidates(config)
for row in rows:
candidate_id = int(row['id'])
constituency_id = str(row['post_id'])
if row['email'] == '':
row['email'] = None
if row['twitter_username'] == '':
row['twitter_username'] = None
candidate = {
'id': candidate_id,
'name': row['name'],
'email': row['email'],
'twitter': row['twitter_username'],
'linkedin_url': row['linkedin_url'],
'party': row['party_name'],
'constituency_id': constituency_id,
'constituency_name': row['post_label']
}
# XXX reenable this when 546 candidate duplicate fixed
# assert candidate_id not in by_candidate_id, candidate_id
by_candidate_id[candidate_id] = candidate
by_constituency_id[constituency_id].append(candidate)
return by_candidate_id, by_constituency_id
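# Illustrative shape of the two return values (the ids and values here are hypothetical):
#   by_candidate_id[12345] == { 'id': 12345, 'name': ..., 'email': ..., 'twitter': ...,
#                               'linkedin_url': ..., 'party': ...,
#                               'constituency_id': '65808', 'constituency_name': ... }
#   by_constituency_id['65808'] == [ <candidate dict>, <candidate dict>, ... ]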
def candidates_csv_url():
return "https://candidates.democracyclub.org.uk/media/candidates-" + elections.current_election + ".csv"
def _fetch_candidates(config):
bucket = _get_s3_bucket(config)
key_name = "cache/candidates.csv"
r = requests.get(candidates_csv_url())
if r.status_code == 200:
r.encoding = 'utf-8'
text = r.text
# save to bucket
key = boto.s3.key.Key(bucket)
key.key = key_name
key.set_contents_from_string(text)
else:
print("couldn't read from Candidates API; loading candidates from S3")
key = bucket.get_key(key_name)
text = key.get_contents_as_string().decode('utf-8')
return csv.DictReader(io.StringIO(text))
# Takes a constituency identifier and returns a dictionary:
# error - if there was an error
# Or an array of dictionaries with fields as in _hashes_of_candidates.
def lookup_candidates(config, constituency_id):
if constituency_id == "8888888":
return [
{ 'id': 7777777, 'name' : 'Sicnarf Gnivri', 'email': '[email protected]', 'twitter': 'frabcus+sicnarf', 'linkedin_url': 'https://www.linkedin.com/in/FrancisIrving', 'party': 'Bunny Rabbits Rule',
'constituency_id': "8888888", 'constituency_name': "Democracy Club Test Constituency"
},
{ 'id': 7777778, 'name' : 'Notlits Esuom', 'email': '[email protected]', 'twitter': 'frabcus+notlits', 'linkedin_url': 'https://www.linkedin.com/in/FrancisIrving', 'party': 'Mice Rule More',
'constituency_id': "8888888", 'constituency_name': "Democracy Club Test Constituency"
},
{ 'id': 7777779, 'name' : 'Ojom Yeknom', 'email': '[email protected]', 'twitter': None, 'linkedin_url': None, 'party': 'Monkeys Are Best',
'constituency_id': "8888888", 'constituency_name': "Democracy Club Test Constituency"
}
]
_, by_constituency_id = _hashes_of_candidates(config)
if constituency_id not in by_constituency_id:
return { 'error': "Constituency not found: {}".format(constituency_id)}
current_candidate_list = by_constituency_id[constituency_id]
    # Sort by surname (as best we can -- "Duncan Smith" won't work)
    # so the order matches the ballot paper and voters can get used to it.
def surname(candidate):
return candidate['name'].split(" ")[-1]
return sorted(current_candidate_list, key=surname)
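# Illustrative sketch only -- nothing in this module calls it, and the helper name is
# ours rather than part of the original interface. It shows how a caller might chain
# lookup_postcode and lookup_candidates:
def _example_candidates_for_postcode(config, postcode):
    constituency = lookup_postcode(postcode)
    if 'error' in constituency:
        return constituency
    return lookup_candidates(config, constituency['id'])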
# Takes a candidate identifier (mySociety person_id) and returns a dictionary:
# error - if there's an error
# Or fields as in _hashes_of_candidates.
def lookup_candidate(config, person_id):
if person_id == 7777777:
return {
'id': 7777777, 'name' : 'Sicnarf Gnivri', 'email': '[email protected]', 'twitter': 'frabcus+sicnarf', 'linkedin_url': 'https://www.linkedin.com/in/FrancisIrving', 'party': 'Bunny Rabbits Rule',
'constituency_id': "8888888", 'constituency_name': "Democracy Club Test Constituency"
}
if person_id == 7777778:
return {
'id': 7777778, 'name' : 'Notlits Esuom', 'email': '[email protected]', 'twitter': 'frabcus+notlits', 'linkedin_url': 'https://www.linkedin.com/in/FrancisIrving', 'party': 'Mice Rule More',
'constituency_id': "8888888", 'constituency_name': "Democracy Club Test Constituency"
}
if person_id == 7777779:
return {
'id': 7777779, 'name' : 'Ojom Yeknom', 'email': '[email protected]', 'twitter': None, 'linkedin_url': None, 'party': 'Monkeys Are Best',
'constituency_id': "8888888", 'constituency_name': "Democracy Club Test Constituency"
}
by_candidate_id, _ = _hashes_of_candidates(config)
if person_id not in by_candidate_id:
return { 'error': "Candidate not found: {}".format(person_id) }
candidate = by_candidate_id[person_id]
return candidate
# Returns an array of every constituency alphabetically by name.
# Each constituency is an array of candidates, with fields
# from _hashes_of_candidates and from augment_if_has_cv.
def all_constituencies(config):
_, by_constituency_id = _hashes_of_candidates(config)
result = []
for constituency_id, candidates in by_constituency_id.items():
candidates = augment_if_has_cv(config, candidates)
result.append(candidates)
result = sorted(result, key=lambda x: x[0]['constituency_name'])
return result
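# Illustrative usage (not executed): iterate constituencies, then candidates within each:
#   for candidates in all_constituencies(config):
#       print(candidates[0]['constituency_name'], len(candidates))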
###################################################################
# Storing CVs
# Takes the app config (for S3 keys), candidate identifier, file contents and a
# (secured) filename. Saves that new CV in S3. Raises
# an exception if it goes wrong, returns nothing.
def add_cv(config, person_id, contents, filename):
person_id = str(int(person_id))
    assert person_id != "0"
bucket = _get_s3_bucket(config)
when = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S-")
key = boto.s3.key.Key(bucket)
key.key = "cvs/" + str(person_id) + "/" + when + filename
key.set_contents_from_string(contents)
key.set_acl('public-read')
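# For reference, keys written by add_cv look like this (the filename part is hypothetical):
#   cvs/<person_id>/2015-04-01T12:00:00-my_cv.pdf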
# Takes the app config (for S3 keys), a local filename, a
# filename in the bucket and the extension to use in the MIME type. Saves the thumbnail
# in S3. Raises an exception if it goes wrong, returns nothing.
def add_thumb(config, local_filename, remote_filename, extension):
bucket = _get_s3_bucket(config)
key = boto.s3.key.Key(bucket)
key.key = remote_filename
key.set_contents_from_filename(local_filename)
key.set_metadata('Content-Type', "image/" + extension)
key.set_acl('public-read')
# Takes a candidate id, and returns most recent CV. Fields of CV
# are as in _hash_by_prefix.
def get_current_cv(config, person_id):
cv_hash = _hash_by_prefix(config, "cvs/")
if person_id not in cv_hash:
return None
return cv_hash[person_id]
# Takes a candidate id, and returns a thumbnail. Fields of thumbnail
# are as in _hash_by_prefix.
def get_current_thumb(config, person_id):
thumb_hash = _hash_by_prefix(config, "thumbs/")
if person_id not in thumb_hash:
return None
return thumb_hash[person_id]
# Takes an array of candidates of the same form lookup_candidates returns.
# Augments each with a flag saying whether they have a CV, and when it was last updated.
def augment_if_has_cv(config, candidates):
cv_hash = _hash_by_prefix(config, "cvs/")
thumb_hash = _hash_by_prefix(config, "thumbs/")
for candidate in candidates:
if candidate['id'] in cv_hash:
candidate['has_cv'] = True
candidate['cv'] = cv_hash[candidate['id']]
if candidate['id'] in thumb_hash:
candidate['cv']['has_thumb'] = True
candidate['cv']['thumb'] = thumb_hash[candidate['id']]
else:
candidate['cv']['has_thumb'] = False
else:
candidate['has_cv'] = False
return candidates
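# Illustrative shape after augmentation (values are hypothetical):
#   candidate['has_cv'] == True
#   candidate['cv'] == { ..._hash_by_prefix fields..., 'has_thumb': True,
#                        'thumb': { ..._hash_by_prefix fields... } }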
# Takes the app config (for S3), returns a list, ordered by reverse time,
# of all CVs which have thumbnails from any candidate, with the following
# fields:
# all the fields of _hash_by_prefix
# has_thumb - True
# thumb - dictionary of details, including all the fields of _hash_by_prefix
def all_cvs_with_thumbnails(config):
cv_hash = _hash_by_prefix(config, "cvs/")
thumb_hash = _hash_by_prefix(config, "thumbs/")
cvs = []
for person_id, cv in cv_hash.items():
# strip out the test one
if person_id == 7777777:
continue
if 'removed_after_election_by_candidate' in cv['url']:
continue
if cv['person_id'] in thumb_hash:
cv['has_thumb'] = True
cv['thumb'] = thumb_hash[person_id]
cv['candidate'] = lookup_candidate(config, cv['person_id'])
# can have CVs for people who aren't candidates (e.g. withdrew)
if 'error' not in cv['candidate']:
cvs.append(cv)
return cvs
# Takes the app config (for S3), returns a list, ordered by reverse time,
# of all CVs from any candidate which don't have an up to date thumbnail, with
# the following fields:
# all the fields of _hash_by_prefix
# has_thumb - False
def all_cvs_bad_thumbnails(config):
cv_hash = _hash_by_prefix(config, "cvs/")
thumb_hash = _hash_by_prefix(config, "thumbs/")
cvs = []
for person_id, cv in cv_hash.items():
# strip out the test one
if person_id == 7777777:
continue
# no thumb at all
if person_id not in thumb_hash:
cv['has_thumb'] = False
cvs.append(cv)
continue
        # latest thumb doesn't match the name of the CV file we're using
cv_name = cv['name']
thumb_name = thumb_hash[person_id]['name']
if cv_name.replace("cvs/", "thumbs/") + ".jpg" != thumb_name:
cv['has_thumb'] = False
cvs.append(cv)
continue
return cvs
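# For reference, an up to date thumbnail key mirrors its CV key with the prefix swapped
# and ".jpg" appended, e.g. (hypothetical names):
#   cvs/123/2015-04-01T12:00:00-cv.pdf  ->  thumbs/123/2015-04-01T12:00:00-cv.pdf.jpg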
# Given a prefix, returns a hash from integer person_id to
# a dictionary with the following fields:
# name - full name of S3 key
# url - publicly accessible address of the file
# last_modified - when the latest version was uploaded
# created - when the first version was uploaded
# person_id - id of the person the CV is for
# Caches for 10 minutes for speed.
@app.cache.memoize(60 * 10)
def _hash_by_prefix(config, prefix):
print("warming cache _hash_by_prefix", prefix)
bucket = _get_s3_bucket(config)
cvs = bucket.list(prefix)
cvs = reversed(sorted(cvs, key=lambda k: k.last_modified))
# Optionally filter to show what the CVs used to look like on a certain day
#cvs = filter(lambda k: boto.utils.parse_ts(k.last_modified) <= datetime.datetime(2015, 5, 8), cvs) # XXX temp debug
person_ids = []
result = collections.OrderedDict()
for key in cvs:
# we use .jpg thumbnails now (and don't accept images as CVs)
if key.name.endswith(".png"):
continue
key_last_modified = boto.utils.parse_ts(key.last_modified)
person_id = int(re.match(prefix + "([0-9]+)[^0-9]", key.name).group(1))
if person_id not in result:
result[person_id] = {
'name': key.name,
'url': key.generate_url(expires_in=0, query_auth=False),
'last_modified': key_last_modified,
'created': key_last_modified,
'person_id': person_id
}
result[person_id]['created'] = key_last_modified
return result
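# Illustrative call (not executed); keys are grouped by the person id parsed from the
# key name, with one entry per person (the id here is hypothetical):
#   _hash_by_prefix(config, "cvs/")[123]['url']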
###################################################################
# Combinations of things
def split_candidates_by_type(config, all_candidates):
candidates_no_email = [ candidate for candidate in all_candidates if candidate['email'] is None]
candidates_have_cv = [ candidate for candidate in all_candidates if candidate['email'] is not None and candidate['has_cv']]
# sort chronologically by time CV was first uploaded
candidates_have_cv.sort(key=lambda x: x['cv']['created'])
candidates_no_cv = [ candidate for candidate in all_candidates if candidate['email'] is not None and not candidate['has_cv']]
return candidates_no_cv, candidates_no_email, candidates_have_cv
def split_candidates_by_updates(config, all_candidates, since):
candidates_cv_created = [ candidate for candidate in all_candidates if candidate['has_cv'] and candidate['cv']['created'] >= since ]
    candidates_cv_updated = [ candidate for candidate in all_candidates if candidate['has_cv'] and candidate['cv']['last_modified'] >= since and candidate['cv']['created'] < since]
# sort chronologically by time CV was first uploaded
candidates_cv_created.sort(key=lambda x: x['cv']['created'])
candidates_cv_updated.sort(key=lambda x: x['cv']['created'])
return candidates_cv_created, candidates_cv_updated
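# Illustrative use (not executed), matching how slow_updates_list below calls it
# with the subscriber's last mailing date:
#   candidates_cv_created, candidates_cv_updated = split_candidates_by_updates(config, candidates, last_modified)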
###################################################################
# Volunteer mailing list
# Subscribe to updates - we store the postcode in a file named
# after the email address.
def updates_join(config, email, postcode):
email = email.lower().replace("/", "_")
bucket = _get_s3_bucket(config)
key = boto.s3.key.Key(bucket)
key.key = "updates/" + str(email)
key.set_contents_from_string(postcode)
url = key.generate_url(expires_in=0, query_auth=False)
# Is the email already getting updates?
def updates_getting(config, email):
email = email.lower().replace("/", "_")
bucket = _get_s3_bucket(config)
prefix = "updates/" + str(email)
results = bucket.list(prefix)
for result in results:
if result.name == "updates/" + str(email):
return True
return False
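# Illustrative usage (not executed); the address here is hypothetical. The email is
# lower-cased and any "/" replaced with "_" before being used in the S3 key name:
#   updates_join(config, "voter@example.org", "ZZ9 9ZZ")
#   updates_getting(config, "voter@example.org")   # => True once the key exists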
# Used for sending the mailings out; slow. The last modified time of
# the subscription S3 file is the date an update was last sent.
def slow_updates_list(config):
bucket = _get_s3_bucket(config)
prefix = "updates/"
results = bucket.list(prefix)
results = sorted(results, key=lambda k: k.last_modified)
for key in results:
email = re.match("updates/(.*)", key.name).group(1)
postcode = key.get_contents_as_string().strip().decode('ascii')
constituency = lookup_postcode(postcode)
if 'error' in constituency:
print("ERROR looking up postcode", postcode)
continue
last_modified = boto.utils.parse_ts(key.last_modified)
candidates = lookup_candidates(config, constituency['id'])
        if 'error' in candidates:
print("ERROR looking up candidates", postcode)
continue
candidates = augment_if_has_cv(config, candidates)
candidates_no_cv, candidates_no_email, candidates_have_cv = split_candidates_by_type(config, candidates)
candidates_cv_created, candidates_cv_updated = split_candidates_by_updates(config, candidates, last_modified)
subscriber = {
'email': email,
'postcode': postcode,
'constituency': constituency,
'candidates': candidates,
'has_cv_count': len(candidates_have_cv),
'no_cv_count': len(candidates_no_cv),
'no_email_count': len(candidates_no_email),
'candidates_cv_created': candidates_cv_created,
'candidates_cv_updated': candidates_cv_updated,
'last_modified': last_modified
}
yield subscriber
###################################################################
# Last mailed a candidate
def candidate_mail_sent(config, email):
email = email.lower().replace("/", "_")
bucket = _get_s3_bucket(config)
key = boto.s3.key.Key(bucket)
key.key = "candidate_mail/" + str(email)
key.set_contents_from_string("sent")
url = key.generate_url(expires_in=0, query_auth=False)
def candidate_mail_last_sent(config):
bucket = _get_s3_bucket(config)
prefix = "candidate_mail/"
results = bucket.list(prefix)
results = sorted(results, key=lambda k: k.last_modified)
ret = {}
for key in results:
email = re.match("candidate_mail/(.*)", key.name).group(1)
last_modified = boto.utils.parse_ts(key.last_modified)
ret[email] = last_modified
return ret