-- Levenshtein edit-distance helpers added to the string library
-- (common/stringFunctions.lua).
if not string.Levenshtein then
	-- Scratch rows shared by every call so the hot path allocates nothing.
	-- They only ever grow (to the longest `b` seen so far + 1 entries); each call
	-- rewrites every slot it later reads, so stale values from earlier calls are harmless.
	local rowA = {}
	local rowB = {}
	local min = math.min
	local abs = math.abs
	local slen = string.len
	local sbyte = string.byte

	--- Calculates the Levenshtein (edit) distance between two strings.
	--- Compares byte by byte; runs in O(#a * #b) time using two rows of #b + 1 numbers.
	---@param a string
	---@param b string
	---@return number distance minimum number of single-byte insertions, deletions and substitutions turning a into b
	function string.Levenshtein(a, b)
		local lena = slen(a)
		local lenb = slen(b)
		-- prev holds the finished row (r - 1), curr is the row being filled (r).
		-- Column c (0..lenb) is stored at index c + 1.
		local prev, curr = rowA, rowB

		for c = 0, lenb do -- row 0: turning "" into b[1..c] takes c insertions
			prev[c + 1] = c
		end

		for r = 1, lena do
			local byteA = sbyte(a, r) -- invariant across the inner loop
			curr[1] = r -- column 0: turning a[1..r] into "" takes r deletions
			for c = 1, lenb do
				local cost = (byteA == sbyte(b, c)) and 0 or 1
				curr[c + 1] = min(
					prev[c + 1] + 1, -- deletion
					curr[c] + 1,     -- insertion
					prev[c] + cost   -- match / substitution
				)
			end
			prev, curr = curr, prev -- the row just filled becomes the previous row
		end

		-- After the final swap (or with no iterations at all when a == "")
		-- `prev` is the most recent row; the answer sits in column lenb.
		return prev[lenb + 1]
	end

	--- Finds the string in a table that is closest (by Levenshtein distance) to a.
	--- For every entry the value is used if it is a string, otherwise the key if it is a
	--- string, otherwise the empty string. A candidate only wins if its distance is
	--- strictly smaller than the best so far, which starts at #a; if nothing qualifies,
	--- a itself is returned with a score of #a. Ties are resolved by `pairs` order.
	---@param a string
	---@param t table candidates; primarily the values are strings, keys can be strings too
	---@return string bestresult
	---@return number bestscore
	function string.FindClosest(a, t)
		local lena = slen(a)
		local bestscore = lena
		local bestresult = a
		for k, v in pairs(t) do
			local b = (type(v) == 'string' and v) or (type(k) == 'string' and k) or "" -- whichever is string, or empty
			-- Heuristic: the distance is at least the length difference, so skip
			-- candidates that cannot possibly beat the current best.
			if abs(slen(b) - lena) < bestscore then
				local distance = string.Levenshtein(a, b)
				if distance < bestscore then
					bestscore = distance
					bestresult = b
					if bestscore == 0 then
						break -- exact match; no later candidate can pass the heuristic
					end
				end
			end
		end
		return bestresult, bestscore
	end

	-- print(string.Levenshtein(string.rep("asdfasdfasdfasdf", 1000), string.rep("asdfasdfasdfasda", 1000))) -- 5 seconds
	-- print(string.FindClosest("apple", {"pear", "popple","bear","ple"}))
end