-
Notifications
You must be signed in to change notification settings - Fork 2
/
populate.py
89 lines (72 loc) · 3.34 KB
/
populate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
import csv
import logging
import re
import sys
from typing import Callable, Iterable, Mapping, Optional
# Root logger (getLogger() is called without a name). The script entry point
# uses it to report a missing filename argument.
logger = logging.getLogger()
# Maps a row's "Q code" to the URL of its primary access copy.
# Filled in by process_urls() and read by get_url().
urls = {}


def get_url(short_code: Optional[str]) -> Optional[str]:
    """Return the URL registered for *short_code*.

    Args:
        short_code: A "Q code" key, or ``None`` for a related item that
            carries no code.

    Returns:
        The URL stored in ``urls`` for that code, or ``None`` when nothing
        is registered for it (which includes *short_code* being ``None``).
    """
    # dict.get does a single lookup and already yields None on a miss.
    return urls.get(short_code)
def process_urls(metadata: Iterable[Mapping[str, str]], source=None) -> None:
    """Register the primary-access URL of every row in the module-level ``urls``.

    For each row the "Primary Access" column selects which URL column is
    stored under the row's "Q code": "IA" selects "IA URL" and "Peel"
    selects "Peel URL". Rows with any other "Primary Access" value are
    not registered.

    Args:
        metadata: Iterable of CSV rows (mappings keyed by column name).
        source: Optional seekable object to rewind once all rows have been
            consumed, so that a reader built on it can be iterated again.
            When omitted, a module-level ``file`` is used if one exists
            (this is how the script entry point has always relied on it).

    Raises:
        KeyError: If a row lacks "Primary Access", "Q code", or the URL
            column selected for it.
    """
    url_column_by_access = {"IA": "IA URL", "Peel": "Peel URL"}
    for row in metadata:
        url_column = url_column_by_access.get(row["Primary Access"])
        if url_column is not None:
            urls[row["Q code"]] = row[url_column]

    # Rewind the underlying file so the caller can read the rows a second
    # time. Looking the legacy global up by name (instead of referencing it
    # directly) keeps this function usable when no such global exists, e.g.
    # when the module is imported rather than run as a script.
    if source is None:
        source = globals().get("file")
    if source is not None:
        source.seek(0)
def get_publication_range(line: Mapping[str, str]) -> str:
    """Build the publication date range text for one CSV row.

    Args:
        line: A CSV row holding the "Pub start" and "Pub end" columns.

    Returns:
        ``"<start> to <end>"`` when both columns are filled, just the start
        value when only "Pub start" is filled, and an empty string when
        "Pub start" is empty.

    Raises:
        KeyError: If "Pub start" is missing, or "Pub end" is missing on a
            row that has a start value.
    """
    start = line["Pub start"]
    if not start:
        # NOTE(review): an end date with no start date is ignored here;
        # confirm that is the intended behaviour.
        return ''
    end = line["Pub end"]
    return f'{start} to {end}' if end else start
def link_related_items(related_items: str,
                       lookup: Callable[[Optional[str]], Optional[str]]) -> str:
    """Rewrite the titles of a "Related items" cell as links.

    The cell is a run of items separated by ``<br>``. An item may carry a
    short code in ``<b>...</b>`` and has its title in ``<i>...</i>``. Every
    ``<b>code</b> ... <i>title</i>`` span is replaced by the linked title
    when *lookup* returns a URL for the code, and by the bare title text
    when it does not. Items without a ``<b>`` tag are left untouched.

    Args:
        related_items: Raw HTML fragment from the "Related items" column.
        lookup: Callable mapping a short code (or ``None`` for an item
            without one) to its URL, or to ``None`` when there is no URL.

    Returns:
        The rewritten HTML fragment.

    Raises:
        ValueError: If the number of titles differs from the number of
            items/short codes, i.e. titles and codes cannot be paired up.
    """
    short_codes = re.findall(r'<b>(.*?)</b>', related_items)
    for index, item in enumerate(related_items.split("<br>")):
        # An item without a <b> tag has no short code: insert a placeholder
        # so that short_codes stays aligned with the titles by position.
        if item and '<b>' not in item:
            short_codes.insert(index, None)

    # Here we link to the matching shortcode inside the bold tag.
    titles = re.findall(r'<i>(.*?)</i>', related_items)
    if len(titles) != len(short_codes):
        raise ValueError("titles and short codes do not pair up")

    linked = related_items
    for title, short_code in zip(titles, short_codes):
        url = lookup(short_code)
        replacement = f'<i><a href="{url}">{title}</a></i>' if url else title
        pattern = r'(<b>.*?<i>)' + re.escape(title) + r'(</i>)'
        # Pass the replacement through a function: a plain string would be
        # parsed as a template, so a backslash in a title or URL would
        # raise re.error or be expanded as a group reference.
        linked = re.sub(pattern, lambda _match, text=replacement: text, linked)
    return linked


if __name__ == '__main__':
    if len(sys.argv) < 2:
        logger.warning("Wrong number of arguments. You need to provide a filename. Example: %s filename.csv", sys.argv[0])
        # A usage error must not look like success to the calling shell.
        sys.exit(1)

    # The CSV path is taken from the first command-line argument.
    # NOTE: `file` is deliberately a module-level name. process_urls() can
    # rewind the CSV through it, so the call must stay inside this `with`.
    with open(sys.argv[1], mode='r', encoding="utf-8") as file:
        # Read every row once up front. The rows are needed twice (URL
        # gathering, then rendering), and a list avoids depending on the
        # file being rewound and on skipping a re-read header row.
        rows = list(csv.DictReader(file))
        # Gather all the urls we will need based on the "Primary Access" column in CSV file.
        process_urls(rows)

    # Line 1 of the file is the header, so data rows are numbered from 2.
    for line_number, line in enumerate(rows, start=2):
        # If titles and short codes cannot be paired up, report the row and
        # leave its "Related items" markup as it was.
        try:
            related_items_field = link_related_items(line["Related items"], get_url)
        except ValueError:
            logging.error("Links on line %s do not match on 'Related items' column", line_number)
            related_items_field = line["Related items"]

        # NOTE(review): publication_range is computed but not used in the
        # card below, which prints "date range of digitization" instead.
        # Confirm whether it was meant to be rendered.
        publication_range = get_publication_range(line)

        print(f'''
<!--{line["Display title"]}-->
<div class="card card-body item {line["Coverage-Province"]} {line["Language"]} {line["Coverage-City1"]}">
<h5><a href="{get_url(line["Q code"])}">{line["Display title"]}</a></h5>
<p class="ml-2 mb-0 small font-weight-bold">{line["number of issues"]} issues digitized from {line["date range of digitization"]}</p>
<div class="small px-2 py-1">
<p>{line["Notes"]}</p>
<hr/>
<p>{related_items_field}</p>
</div>
</div>
''')