-
Notifications
You must be signed in to change notification settings - Fork 2
/
top-y-csv-to-word-cloud.py
executable file
·141 lines (121 loc) · 5.47 KB
/
top-y-csv-to-word-cloud.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python
from csv import DictReader
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
from snli_cooccur import mkdirp_parent
# Default text color, used for every word when no color map is selected.
DEFAULT_COLOR_NAME = '#1f497d'
# Default degree to which score (rather than rank) is used to scale words.
DEFAULT_RELATIVE_SCALING = 1.
# Default width of the output image, in pixels.
DEFAULT_WIDTH = 800
# Default height of the output image, in pixels.
DEFAULT_HEIGHT = 400
# Default number of words to display.
DEFAULT_MAX_WORDS = 50
# Default (lower, upper) portion of the color map from which word colors are
# drawn; the full map.
DEFAULT_COLOR_MAP_RANGE = (0., 1.)
def parse_color_map_range(s):
    """Parse a color map range written as two comma-delimited numbers.

    Used as the argparse ``type=`` callable for ``--color-map-range``.

    Args:
        s: String of the form ``'LOWER,UPPER'``, for example ``'0.2,0.8'``.

    Returns:
        A ``(lower, upper)`` tuple of floats satisfying
        ``0 <= lower <= upper <= 1``.

    Raises:
        ValueError: If ``s`` does not hold exactly two parseable numbers, if
            either number is NaN, if the lower bound is greater than the
            upper bound, or if either bound lies outside [0, 1].
    """
    from math import isnan

    t = tuple(map(float, s.split(',')))
    if len(t) != 2:
        raise ValueError('color map range must be two comma-delimited numbers')
    # NaN compares false against everything, so without this check it would
    # slip past both the ordering and the bounds checks below.
    if isnan(t[0]) or isnan(t[1]):
        raise ValueError('color map range bounds must not be NaN')
    if t[0] > t[1]:
        raise ValueError('lower bound of color map range must be no greater '
                         'than upper bound')
    if t[0] < 0 or t[1] > 1:
        raise ValueError('color map range must be within [0, 1]')
    return t
def top_y_csv_to_word_cloud(input_path, query, x, output_path,
                            mask_path=None,
                            color_name=DEFAULT_COLOR_NAME,
                            color_map_name=None,
                            color_map_range=DEFAULT_COLOR_MAP_RANGE,
                            relative_scaling=DEFAULT_RELATIVE_SCALING,
                            background_color_name=None,
                            max_words=DEFAULT_MAX_WORDS,
                            width=DEFAULT_WIDTH,
                            height=DEFAULT_HEIGHT):
    """Render the top-y scores for one (query, x) pair as a word cloud PNG.

    Reads a CSV file with ``query``, ``x``, ``y`` and ``score`` columns,
    keeps the rows whose ``query`` and ``x`` equal the given values, and
    draws each ``y`` sized according to its ``score``.  If a ``y`` occurs in
    several matching rows, the last row's score is used.

    Args:
        input_path: Path to the input CSV file.
        query: Value of the ``query`` column to select.
        x: Value of the ``x`` column to select.
        output_path: Path of the PNG file to write.
        mask_path: Optional path to an image passed to WordCloud as ``mask``.
        color_name: Color used for every word when ``color_map_name`` is
            None.
        color_map_name: Optional matplotlib color map name; when given, each
            word's color is sampled at random from the map.
        color_map_range: ``(lower, upper)`` bounds of the color map interval
            that colors are sampled from.
        relative_scaling: Passed to WordCloud as ``relative_scaling``.
        background_color_name: Passed to WordCloud as ``background_color``.
        max_words: Passed to WordCloud as ``max_words``.
        width: Passed to WordCloud as ``width``.
        height: Passed to WordCloud as ``height``.

    Raises:
        ValueError: If no row matches ``query`` and ``x``, or if a matching
            row's ``score`` is not a valid float.
    """
    y_scores = dict()
    # newline='' is what the csv module asks for, so that newlines embedded
    # in quoted fields are interpreted correctly.
    with open(input_path, newline='') as f:
        for row in DictReader(f):
            if row['query'] == query and row['x'] == x:
                y_scores[row['y']] = float(row['score'])

    if not y_scores:
        raise ValueError('found no rows matching query %s and x %s' %
                         (query, x))

    if mask_path is None:
        mask = None
    else:
        # Copy the pixels into an array inside the context manager so the
        # underlying image file is closed rather than left open.
        with Image.open(mask_path) as mask_image:
            mask = np.array(mask_image)

    cmap = None if color_map_name is None else plt.get_cmap(color_map_name)

    def color_func(word, font_size, position, orientation, font_path,
                   random_state):
        # Parameter names are kept exactly as written because this callback
        # is handed to WordCloud.
        # NOTE(review): WordCloud is believed to pass these by keyword --
        # confirm against the installed wordcloud version before renaming.
        if cmap is None:
            return color_name
        else:
            # Pick a random point of the color map within the requested
            # range and convert its RGBA floats to 0-255 channel values.
            u = random_state.uniform(*color_map_range)
            (r, g, b, a) = 255 * np.array(cmap(u))
            return 'rgb(%.0f, %.0f, %.0f)' % (r, g, b)

    wordcloud = WordCloud(
        max_words=max_words,
        stopwords=(),
        prefer_horizontal=0.9,
        width=width,
        height=height,
        margin=2,
        relative_scaling=relative_scaling,
        mode='RGBA',
        color_func=color_func,
        background_color=background_color_name,
        mask=mask,
        collocations=False,
        normalize_plurals=False,
        regexp=r'\S+',
    )
    wordcloud.generate_from_frequencies(y_scores)
    image = wordcloud.to_image()

    # presumably creates the parent directory of output_path if missing;
    # verify against snli_cooccur.mkdirp_parent
    mkdirp_parent(output_path)
    with open(output_path, 'wb') as f:
        image.save(f, format='png')
def main():
    """Parse command-line arguments and generate the word cloud image.

    Every parsed option, including ``--max-words``, is forwarded to
    ``top_y_csv_to_word_cloud``.
    """
    from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

    parser = ArgumentParser(
        description='Generate word cloud from CSV top-y results',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_path', help='path to input CSV file')
    parser.add_argument('query',
                        help='query for which top y will be visualized')
    parser.add_argument('x',
                        help='x for which top y will be visualized '
                             '(must appear in specified query)')
    parser.add_argument('output_path', help='path to output PNG file')
    parser.add_argument('--mask-path', help='path to image mask PNG file')
    parser.add_argument('--background-color-name',
                        help='name of background color (default: transparent)')
    parser.add_argument('--color-name', default=DEFAULT_COLOR_NAME,
                        help='name of text color')
    parser.add_argument('--color-map-name',
                        help='name of color map to select word colors from '
                             '(randomly) (default: use color-name for all '
                             'words)')
    parser.add_argument('--color-map-range', type=parse_color_map_range,
                        default=DEFAULT_COLOR_MAP_RANGE,
                        help='range of color map to use (as two '
                             'comma-delimited floats, a lower bound and an '
                             'upper bound)')
    parser.add_argument('--max-words', type=int, default=DEFAULT_MAX_WORDS,
                        help='number of words to display')
    parser.add_argument('--width', type=int, default=DEFAULT_WIDTH,
                        help='width of image, in pixels')
    parser.add_argument('--height', type=int, default=DEFAULT_HEIGHT,
                        help='height of image, in pixels')
    parser.add_argument('--relative-scaling', type=float,
                        default=DEFAULT_RELATIVE_SCALING,
                        help='degree to which score (rather than rank) is '
                             'used to scale words')

    args = parser.parse_args()
    top_y_csv_to_word_cloud(args.input_path, args.query, args.x,
                            args.output_path, mask_path=args.mask_path,
                            background_color_name=args.background_color_name,
                            color_name=args.color_name,
                            color_map_name=args.color_map_name,
                            color_map_range=args.color_map_range,
                            # Previously omitted, which made --max-words a
                            # silent no-op.
                            max_words=args.max_words,
                            width=args.width,
                            height=args.height,
                            relative_scaling=args.relative_scaling)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()