-
-
Notifications
You must be signed in to change notification settings - Fork 17
/
levenshtein_cpp.pyx
255 lines (193 loc) · 8.2 KB
/
levenshtein_cpp.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# distutils: language=c++
# cython: language_level=3
# cython: binding=True
from libc.stdint cimport uint32_t
from libcpp.vector cimport vector
from libcpp cimport bool
from libcpp.utility cimport move
cdef extern from *:
int PyUnicode_4BYTE_KIND
object PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
cdef extern from "<string>" namespace "std" nogil:
cdef cppclass basic_string[T]:
ctypedef size_t size_type
basic_string() except +
void resize(size_type) except +
T& operator[](size_type)
const T* data()
size_type size()
cdef extern from "_levenshtein.hpp":
cdef basic_string[uint32_t] lev_greedy_median(const vector[RF_String]& strings, const vector[double]& weights) except +
cdef basic_string[uint32_t] lev_median_improve(const RF_String& string, const vector[RF_String]& strings, const vector[double]& weights) except +
cdef basic_string[uint32_t] lev_quick_median(const vector[RF_String]& strings, const vector[double]& weights) except +
cdef basic_string[uint32_t] lev_set_median(const vector[RF_String]& strings, const vector[double]& weights) except +
cdef double lev_set_distance(const vector[RF_String]& strings1, const vector[RF_String]& strings2) except +
cdef double lev_edit_seq_distance(const vector[RF_String]& strings1, const vector[RF_String]& strings2) except +
ctypedef struct RF_String:
pass
cdef bool is_valid_string(object)
cdef RF_String convert_string(object)
cdef inline RF_String conv_sequence(seq) except *:
if is_valid_string(seq):
return convert_string(seq)
raise TypeError("Expected string or bytes")
cdef vector[double] extract_weightlist(wlist, size_t n) except *:
cdef size_t i
cdef double weight
cdef vector[double] weights
if wlist is None:
weights.resize(n, 1.0)
else:
weights.resize(n)
for i, w in enumerate(wlist):
weight = w
if w < 0:
raise ValueError(f"weight {weight} is negative")
weights[i] = w
return weights
cdef vector[RF_String] extract_stringlist(strings) except *:
cdef vector[RF_String] strlist
for string in strings:
strlist.push_back(move(conv_sequence(string)))
return move(strlist)
def median(strlist, wlist = None, *):
"""
Find an approximate generalized median string using greedy algorithm.
You can optionally pass a weight for each string as the second
argument. The weights are interpreted as item multiplicities,
although any non-negative real numbers are accepted. Use them to
improve computation speed when strings often appear multiple times
in the sequence.
Examples
--------
>>> median(['SpSm', 'mpamm', 'Spam', 'Spa', 'Sua', 'hSam'])
'Spam'
>>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',
... 'Leveshtei', 'Lenshtein', 'Lvenstein',
... 'Levenhtin', 'evenshtei']
>>> median(fixme)
'Levenshtein'
Hm. Even a computer program can spell Levenshtein better than me.
"""
if wlist is not None and len(strlist) != len(wlist):
raise ValueError("strlist has a different length than wlist")
weights = extract_weightlist(wlist, len(strlist))
strings = extract_stringlist(strlist)
median = lev_greedy_median(strings, weights)
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, median.data(), median.size())
def quickmedian(strlist, wlist = None, *):
"""
Find a very approximate generalized median string, but fast.
See median() for argument description.
This method is somewhere between setmedian() and picking a random
string from the set; both speedwise and quality-wise.
Examples
--------
>>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',
... 'Leveshtei', 'Lenshtein', 'Lvenstein',
... 'Levenhtin', 'evenshtei']
>>> quickmedian(fixme)
'Levnshein'
"""
if wlist is not None and len(strlist) != len(wlist):
raise ValueError("strlist has a different length than wlist")
weights = extract_weightlist(wlist, len(strlist))
strings = extract_stringlist(strlist)
median = lev_quick_median(strings, weights)
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, median.data(), median.size())
def median_improve(string, strlist, wlist = None, *):
"""
Improve an approximate generalized median string by perturbations.
The first argument is the estimated generalized median string you
want to improve, the others are the same as in median(). It returns
a string with total distance less or equal to that of the given string.
Note this is much slower than median(). Also note it performs only
one improvement step, calling median_improve() again on the result
may improve it further, though this is unlikely to happen unless the
given string was not very similar to the actual generalized median.
Examples
--------
>>> fixme = ['Levnhtein', 'Leveshein', 'Leenshten',
... 'Leveshtei', 'Lenshtein', 'Lvenstein',
... 'Levenhtin', 'evenshtei']
>>> median_improve('spam', fixme)
'enhtein'
>>> median_improve(median_improve('spam', fixme), fixme)
'Levenshtein'
It takes some work to change spam to Levenshtein.
"""
if wlist is not None and len(strlist) != len(wlist):
raise ValueError("strlist has a different length than wlist")
weights = extract_weightlist(wlist, len(strlist))
query = conv_sequence(string)
strings = extract_stringlist(strlist)
median = lev_median_improve(query, strings, weights)
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, median.data(), median.size())
def setmedian(strlist, wlist = None, *):
"""
Find set median of a string set (passed as a sequence).
See median() for argument description.
The returned string is always one of the strings in the sequence.
Examples
--------
>>> setmedian(['ehee', 'cceaes', 'chees', 'chreesc',
... 'chees', 'cheesee', 'cseese', 'chetese'])
'chees'
You haven't asked me about Limburger, sir.
"""
if wlist is not None and len(strlist) != len(wlist):
raise ValueError("strlist has a different length than wlist")
weights = extract_weightlist(wlist, len(strlist))
strings = extract_stringlist(strlist)
median = lev_set_median(strings, weights)
return PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, median.data(), median.size())
def setratio(strlist1, strlist2, *):
"""
Compute similarity ratio of two strings sets (passed as sequences).
The best match between any strings in the first set and the second
set (passed as sequences) is attempted. I.e., the order doesn't
matter here.
Examples
--------
>>> setratio(['newspaper', 'litter bin', 'tinny', 'antelope'],
... ['caribou', 'sausage', 'gorn', 'woody'])
0.281845
No, even reordering doesn't help the tinny words to match the
woody ones.
"""
strings1 = extract_stringlist(strlist1)
strings2 = extract_stringlist(strlist2)
lensum = strings1.size() + strings2.size()
if lensum == 0:
return 1.0
if strings1.empty():
dist = <double>strings2.size()
elif strings2.empty():
dist = <double>strings1.size()
else:
dist = lev_set_distance(strings1, strings2)
return (<double>lensum - dist) / <double>lensum
def seqratio(strlist1, strlist2, *):
"""
Compute similarity ratio of two sequences of strings.
This is like ratio(), but for string sequences. A kind of ratio()
is used to to measure the cost of item change operation for the
strings.
Examples
--------
>>> seqratio(['newspaper', 'litter bin', 'tinny', 'antelope'],
... ['caribou', 'sausage', 'gorn', 'woody'])
0.215178
"""
strings1 = extract_stringlist(strlist1)
strings2 = extract_stringlist(strlist2)
lensum = strings1.size() + strings2.size()
if lensum == 0:
return 1.0
if strings1.empty():
dist = <double>strings2.size()
elif strings2.empty():
dist = <double>strings1.size()
else:
dist = lev_edit_seq_distance(strings1, strings2)
return (<double>lensum - dist) / <double>lensum