internal/lsp/fuzzy: add fuzzy matching library

This change uses a fuzzy matching library to score completion results. Updates golang/go#32754 Change-Id: Ia7771b33534de393a865443e05c0fcbf1e9a969b Reviewed-on: https://go-review.googlesource.com/c/tools/+/184441 Run-TryBot: Rebecca Stambler <[email protected]> TryBot-Result: Gobot Gobot <[email protected]> Reviewed-by: Ian Cottrell <[email protected]>
sauyon · Jul 3, 2019 · 2214986 · 2214986
1 parent 719fbf7
commit 2214986
Show file tree

Hide file tree

Showing 6 changed files with 1,169 additions and 4 deletions.
diff --git a/internal/lsp/completion.go b/internal/lsp/completion.go
@@ -30,7 +30,7 @@ func (s *Server) completion(ctx context.Context, params *protocol.CompletionPara
 	if err != nil {
 		return nil, err
 	}
-	items, surrounding, err := source.Completion(ctx, view, f, rng.Start, source.CompletionOptions{
+	candidates, surrounding, err := source.Completion(ctx, view, f, rng.Start, source.CompletionOptions{
 		DeepComplete: s.useDeepCompletions,
 	})
 	if err != nil {
@@ -58,7 +58,7 @@ func (s *Server) completion(ctx context.Context, params *protocol.CompletionPara
 	}
 	return &protocol.CompletionList{
 		IsIncomplete: false,
-		Items:        toProtocolCompletionItems(items, prefix, insertionRng, s.insertTextFormat, s.usePlaceholders, s.useDeepCompletions),
+		Items:        toProtocolCompletionItems(candidates, prefix, insertionRng, s.insertTextFormat, s.usePlaceholders, s.useDeepCompletions),
 	}, nil
 }
 

diff --git a/internal/lsp/fuzzy/input.go b/internal/lsp/fuzzy/input.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fuzzy
+
+import (
+	"unicode"
+)
+
+// Input specifies the type of the input. This influences how the runes are interpreted with
+// respect to segmenting the input.
+type Input int
+
+const (
+	// Text represents a text input type. Input is not segmented.
+	Text Input = iota
+	// Filename represents a filepath input type with '/' segment delimiters.
+	Filename
+	// Symbol represents a symbol input type with '.' and ':' segment delimiters.
+	Symbol
+)
+
+// RuneRole specifies the role of a rune in the context of an input.
+type RuneRole byte
+
+const (
+	// RNone specifies a rune without any role in the input (e.g., whitespace, '_', or a
+	// punctuation character that is not a separator for the input type). Bytes outside the
+	// ASCII range are not RNone: RuneRoles classifies them like lower-case letters.
+	RNone RuneRole = iota
+	// RSep specifies a rune with the role of segment separator.
+	RSep
+	// RTail specifies a rune which is a lower-case tail in a word in the input.
+	RTail
+	// RUCTail specifies a rune which is an upper-case tail in a word in the input.
+	RUCTail
+	// RHead specifies a rune which is the first character in a word in the input.
+	RHead
+)
+
+// RuneRoles computes the role of every byte of str and returns them as a slice holding exactly
+// one RuneRole per byte. The input type decides which punctuation bytes count as segment
+// separators. The backing array of reuse is recycled when its capacity is at least len(str);
+// otherwise a fresh slice is allocated. Callers must always use the returned slice.
+func RuneRoles(str string, input Input, reuse []RuneRole) []RuneRole {
+	output := reuse[:0]
+	if cap(reuse) < len(str) {
+		output = make([]RuneRole, 0, len(str))
+	}
+
+	// Rune types of the previous byte and of the byte before that.
+	last, beforeLast := rtNone, rtNone
+	for i := 0; i < len(str); i++ {
+		b := str[i]
+
+		// Bytes outside ASCII are not looked up in the table; they count as lower case.
+		kind := rtLower
+		if b <= unicode.MaxASCII {
+			kind = runeType(rt[b] - '0')
+		}
+
+		role := RNone
+		switch kind {
+		case rtLower:
+			// A lower-case byte starts a word only right after a non-letter.
+			role = RTail
+			if last == rtNone || last == rtPunct {
+				role = RHead
+			}
+		case rtUpper:
+			role = RHead
+			if last == rtUpper && i+1 == len(str) {
+				// Final byte of the string following another upper-case byte: it is an
+				// upper-case tail, i.e., the C in aBC / BC / ABC.
+				role = RUCTail
+			}
+		case rtPunct:
+			isSep := (input == Filename && b == '/') ||
+				(input == Symbol && (b == '.' || b == ':'))
+			if isSep {
+				role = RSep
+			}
+		}
+
+		// The two preceding bytes were upper case and this one is not lower case, so the
+		// previous byte cannot have started a new word: demote it from head to upper-case
+		// tail, i.e., the B in ABC / ZABC / AB. followed by punctuation.
+		if kind != rtLower && i > 1 && beforeLast == rtUpper && output[i-1] == RHead &&
+			(output[i-2] == RHead || output[i-2] == RUCTail) {
+			output[i-1] = RUCTail
+		}
+
+		output = append(output, role)
+		beforeLast, last = last, kind
+	}
+	return output
+}
+
+// runeType is the coarse character class of a single ASCII byte, as stored in the rt table.
+type runeType byte
+
+const (
+	// rtNone is the class of bytes that are neither punctuation nor letters/digits.
+	rtNone runeType = iota
+	// rtPunct is the class of punctuation bytes that may act as segment separators.
+	rtPunct
+	// rtLower is the class of lower-case letters; digits are stored with this class as well.
+	rtLower
+	// rtUpper is the class of upper-case letters.
+	rtUpper
+)
+
+// rt is a lookup table indexed by ASCII byte value. The character at index b is the decimal
+// digit of the runeType of byte b, so the class of b is runeType(rt[b] - '0').
+const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
+
+// LastSegment returns the final segment of input, ignoring any trailing separators. roles must
+// hold the RuneRole of every byte of input. The result is empty when input is empty or consists
+// only of separators. Segments exist only for inputs of Symbol or Filename type.
+func LastSegment(input string, roles []RuneRole) string {
+	// hi is the exclusive end of the segment: step back over trailing separators.
+	hi := len(input)
+	for hi > 0 && roles[hi-1] == RSep {
+		hi--
+	}
+	if hi == 0 {
+		return ""
+	}
+
+	// lo is the inclusive start: extend left until a separator or the start of the input.
+	lo := hi - 1
+	for lo > 0 && roles[lo-1] != RSep {
+		lo--
+	}
+
+	return input[lo:hi]
+}
+
+// ToLower returns a lower-cased copy of input as a byte slice of length len(input).
+// Only the ASCII letters 'A'-'Z' are converted; every other byte, including the bytes of
+// multi-byte UTF-8 sequences, is copied unchanged.
+//
+// The backing array of reuse is recycled when its capacity is at least len(input), regardless
+// of its current length; otherwise a new slice is allocated. Callers must always use the
+// returned slice.
+func ToLower(input string, reuse []byte) []byte {
+	var output []byte
+	if cap(reuse) < len(input) {
+		output = make([]byte, len(input))
+	} else {
+		// Re-slice to the needed length: reuse may be shorter than input (even empty) while
+		// still having enough capacity, and indexing past its length would panic.
+		output = reuse[:len(input)]
+	}
+
+	for i := 0; i < len(input); i++ {
+		c := input[i]
+		if 'A' <= c && c <= 'Z' {
+			c += 'a' - 'A'
+		}
+		output[i] = c
+	}
+	return output
+}
+
+// WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an
+// input (start is inclusive, end is exclusive). Words calls it once per word it finds.
+type WordConsumer func(start, end int)
+
+// Words finds the words of an input from the rune roles of its bytes and reports the
+// [start,end) byte offsets of each word, in order, to consume. A word begins at an RHead and
+// extends over the tail roles that follow it; RNone and RSep bytes end the current word and
+// belong to no word.
+func Words(roles []RuneRole, consume WordConsumer) {
+	begin := 0
+	for pos, role := range roles {
+		// Only heads, role-less bytes and separators are word boundaries; tails (and any
+		// other value) simply extend the current word.
+		if role != RHead && role != RNone && role != RSep {
+			continue
+		}
+		if pos != begin {
+			consume(begin, pos)
+		}
+		if role == RHead {
+			// The head itself opens the next word.
+			begin = pos
+		} else {
+			// Skip this character.
+			begin = pos + 1
+		}
+	}
+	// Flush the word still open at the end of the input.
+	if begin != len(roles) {
+		consume(begin, len(roles))
+	}
+}
diff --git a/internal/lsp/fuzzy/input_test.go b/internal/lsp/fuzzy/input_test.go
@@ -0,0 +1,186 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fuzzy_test
+
+import (
+	"bytes"
+	"sort"
+	"testing"
+
+	"golang.org/x/tools/internal/lsp/fuzzy"
+)
+
+// rolesTests lists inputs for RuneRoles together with the expected roles. want holds one
+// character per byte of str, using the legend applied by rolesString:
+// ' ' = RNone, '/' = RSep, 'c' = RTail, 'u' = RUCTail, 'C' = RHead.
+var rolesTests = []struct {
+	str   string
+	input fuzzy.Input
+	want  string
+}{
+	{str: "abc", want: "Ccc", input: fuzzy.Text},
+	{str: ".abc", want: " Ccc", input: fuzzy.Text},
+	{str: "abc def", want: "Ccc Ccc", input: fuzzy.Text},
+	{str: "SWT MyID", want: "Cuu CcCu", input: fuzzy.Text},
+	{str: "ID", want: "Cu", input: fuzzy.Text},
+	{str: "IDD", want: "Cuu", input: fuzzy.Text},
+	{str: " ID ", want: " Cu ", input: fuzzy.Text},
+	{str: "IDSome", want: "CuCccc", input: fuzzy.Text},
+	{str: "0123456789", want: "Cccccccccc", input: fuzzy.Text},
+	{str: "abcdefghigklmnopqrstuvwxyz", want: "Cccccccccccccccccccccccccc", input: fuzzy.Text},
+	{str: "ABCDEFGHIGKLMNOPQRSTUVWXYZ", want: "Cuuuuuuuuuuuuuuuuuuuuuuuuu", input: fuzzy.Text},
+	{str: "こんにちは", want: "Ccccccccccccccc", input: fuzzy.Text}, // We don't parse unicode
+	{str: ":/.", want: "   ", input: fuzzy.Text},
+
+	// Filenames
+	{str: "abc/def", want: "Ccc/Ccc", input: fuzzy.Filename},
+	{str: " abc_def", want: " Ccc Ccc", input: fuzzy.Filename},
+	{str: " abc_DDf", want: " Ccc CCc", input: fuzzy.Filename},
+	{str: ":.", want: "  ", input: fuzzy.Filename},
+
+	// Symbols
+	{str: "abc::def::goo", want: "Ccc//Ccc//Ccc", input: fuzzy.Symbol},
+	{str: "proto::Message", want: "Ccccc//Ccccccc", input: fuzzy.Symbol},
+	{str: "AbstractSWTFactory", want: "CcccccccCuuCcccccc", input: fuzzy.Symbol},
+	{str: "Abs012", want: "Cccccc", input: fuzzy.Symbol},
+	{str: "/", want: " ", input: fuzzy.Symbol},
+	{str: "fOO", want: "CCu", input: fuzzy.Symbol},
+	{str: "fo_oo.o_oo", want: "Cc Cc/C Cc", input: fuzzy.Symbol},
+}
+
+// rolesString renders roles as a compact string with one character per role:
+// ' ' for RNone, '/' for RSep, 'c' for RTail, 'u' for RUCTail and 'C' for RHead.
+func rolesString(roles []fuzzy.RuneRole) string {
+	// legend is indexed by the numeric value of the role.
+	const legend = " /cuC"
+
+	var out bytes.Buffer
+	for i := range roles {
+		out.WriteByte(legend[int(roles[i])])
+	}
+	return out.String()
+}
+
+// TestRoles checks the roles computed by RuneRoles for every entry of rolesTests, comparing
+// their rolesString rendering with the expected encoding.
+func TestRoles(t *testing.T) {
+	for _, tc := range rolesTests {
+		// Offer a buffer for reuse, but read the roles from the returned slice: RuneRoles only
+		// promises that its result is valid, not that it writes into the caller's buffer.
+		buf := make([]fuzzy.RuneRole, len(tc.str))
+		got := rolesString(fuzzy.RuneRoles(tc.str, tc.input, buf))
+		if got != tc.want {
+			t.Errorf("roles(%s) = %v; want %v", tc.str, got, tc.want)
+		}
+	}
+}
+
+// words converts each of the given strings to a byte slice, preserving order. It returns nil
+// when called without arguments.
+func words(strWords ...string) [][]byte {
+	if len(strWords) == 0 {
+		return nil
+	}
+	out := make([][]byte, len(strWords))
+	for i := range strWords {
+		out[i] = []byte(strWords[i])
+	}
+	return out
+}
+
+// wordSplitTests lists inputs for Words together with the words expected to be reported for
+// each of them. TestWordSplit computes the roles with the Symbol input type.
+var wordSplitTests = []struct {
+	input string
+	want  []string
+}{
+	{
+		input: "foo bar baz",
+		want:  []string{"foo", "bar", "baz"},
+	},
+	{
+		input: "fooBarBaz",
+		want:  []string{"foo", "Bar", "Baz"},
+	},
+	{
+		input: "FOOBarBAZ",
+		want:  []string{"FOO", "Bar", "BAZ"},
+	},
+	{
+		input: "foo123_bar2Baz3",
+		want:  []string{"foo123", "bar2", "Baz3"},
+	},
+}
+
+// TestWordSplit checks that Words, fed with the Symbol roles of each input in wordSplitTests,
+// reports the expected words.
+func TestWordSplit(t *testing.T) {
+	for _, tc := range wordSplitTests {
+		var got []string
+		// Collect the substring of every [start,end) range reported by Words.
+		fuzzy.Words(fuzzy.RuneRoles(tc.input, fuzzy.Symbol, nil), func(start, end int) {
+			got = append(got, tc.input[start:end])
+		})
+
+		if !diffStringLists(tc.want, got) {
+			t.Errorf("input %v: (want %v -> got %v)", tc.input, tc.want, got)
+		}
+	}
+}
+
+// diffStringLists reports whether a and b contain the same strings, ignoring order. Despite
+// its name it returns true when the lists match and false when they differ.
+//
+// The comparison works on sorted copies, so neither argument is modified: callers pass shared
+// test-table slices and print both lists in failure messages, which must show the original
+// order.
+func diffStringLists(a, b []string) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	sortedA := append([]string(nil), a...)
+	sortedB := append([]string(nil), b...)
+	sort.Strings(sortedA)
+	sort.Strings(sortedB)
+	for i := range sortedA {
+		if sortedA[i] != sortedB[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// lastSegmentSplitTests lists inputs for LastSegment: str is interpreted using the given input
+// type and want is the last segment expected for it.
+var lastSegmentSplitTests = []struct {
+	str   string
+	input fuzzy.Input
+	want  string
+}{
+	{
+		str:   "identifier",
+		input: fuzzy.Symbol,
+		want:  "identifier",
+	},
+	{
+		str:   "two_words",
+		input: fuzzy.Symbol,
+		want:  "two_words",
+	},
+	{
+		str:   "first::second",
+		input: fuzzy.Symbol,
+		want:  "second",
+	},
+	{
+		str:   "foo.bar.FOOBar_buz123_test",
+		input: fuzzy.Symbol,
+		want:  "FOOBar_buz123_test",
+	},
+	{
+		str:   "golang.org/x/tools/internal/lsp/fuzzy_matcher.go",
+		input: fuzzy.Filename,
+		want:  "fuzzy_matcher.go",
+	},
+	{
+		str:   "golang.org/x/tools/internal/lsp/fuzzy_matcher.go",
+		input: fuzzy.Text,
+		want:  "golang.org/x/tools/internal/lsp/fuzzy_matcher.go",
+	},
+}
+
+// TestLastSegment checks that LastSegment returns the expected final segment for every entry
+// of lastSegmentSplitTests, using roles computed for the entry's input type.
+func TestLastSegment(t *testing.T) {
+	for _, tc := range lastSegmentSplitTests {
+		got := fuzzy.LastSegment(tc.str, fuzzy.RuneRoles(tc.str, tc.input, nil))
+		if got == tc.want {
+			continue
+		}
+		t.Errorf("str %v: want %v; got %v", tc.str, tc.want, got)
+	}
+}
+
+// BenchmarkRoles measures RuneRoles on a mixed-case symbol, handing it a buffer that is large
+// enough to be reused on every iteration.
+func BenchmarkRoles(b *testing.B) {
+	const sample = "AbstractSWTFactory"
+	buf := make([]fuzzy.RuneRole, len(sample))
+
+	for n := 0; n < b.N; n++ {
+		fuzzy.RuneRoles(sample, fuzzy.Symbol, buf)
+	}
+	// Record the bytes processed per iteration so throughput is reported.
+	b.SetBytes(int64(len(sample)))
+}