Skip to content

Commit

Permalink
Calculate the size of Text in UTF-16 code units (yorkie-team#165)
Browse files Browse the repository at this point in the history
External editors often don't consider grapheme clusters, so we calculate the
size of Text in UTF-16 code units, the same way JavaScript measures strings.

For more details:
https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/split
  • Loading branch information
hackerwins authored and jeonjonghyeok committed Aug 4, 2022
1 parent a1d1d8c commit 2b1b791
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 10 deletions.
12 changes: 7 additions & 5 deletions pkg/document/json/rich_text.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package json
import (
"fmt"
"strings"
"unicode/utf8"
"unicode/utf16"

"github.com/yorkie-team/yorkie/pkg/document/time"
"github.com/yorkie-team/yorkie/pkg/log"
Expand Down Expand Up @@ -60,8 +60,10 @@ func (t *RichTextValue) Value() string {
}

// Len returns the length of this value.
// It is calculated in UTF-16 code units.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// body below appears to contain both the removed and the added implementation.
func (t *RichTextValue) Len() int {
// Presumably the removed line: it counted Unicode code points (runes).
return utf8.RuneCountInString(t.value)
// Presumably the added lines: re-encode the value as UTF-16 and count the
// resulting code units, so a rune outside the BMP contributes two.
encoded := utf16.Encode([]rune(t.value))
return len(encoded)
}

// String returns the string representation of this value.
Expand All @@ -78,9 +80,9 @@ func (t *RichTextValue) AnnotatedString() string {
// Split splits this value by the given offset.
// The receiver keeps the part before offset; the returned value holds the
// remainder together with the result of t.attrs.DeepCopy().
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// body below appears to contain both the removed and the added implementation.
func (t *RichTextValue) Split(offset int) RGATreeSplitValue {
value := t.value
// Presumably the removed lines: offset was an index into the rune slice.
r := []rune(value)
t.value = string(r[0:offset])
return NewRichTextValue(t.attrs.DeepCopy(), string(r[offset:]))
// Presumably the added lines: offset is an index into UTF-16 code units.
encoded := utf16.Encode([]rune(value))
// NOTE(review): if offset falls between the two halves of a surrogate pair,
// utf16.Decode is documented to substitute U+FFFD for each lone half, which
// would alter the text — confirm callers never pass such an offset. An offset
// outside [0, len(encoded)] makes the slice expressions panic.
t.value = string(utf16.Decode(encoded[0:offset]))
return NewRichTextValue(t.attrs.DeepCopy(), string(utf16.Decode(encoded[offset:])))
}

// DeepCopy copies itself deeply.
Expand Down
12 changes: 7 additions & 5 deletions pkg/document/json/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ package json

import (
"fmt"
"unicode/utf8"
"unicode/utf16"

"github.com/yorkie-team/yorkie/pkg/document/time"
"github.com/yorkie-team/yorkie/pkg/log"
Expand All @@ -45,8 +45,10 @@ func NewTextValue(value string) *TextValue {
}

// Len returns the length of this value.
// It is calculated in UTF-16 code units.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// body below appears to contain both the removed and the added implementation.
func (t *TextValue) Len() int {
// Presumably the removed line: it counted Unicode code points (runes).
return utf8.RuneCountInString(t.value)
// Presumably the added lines: re-encode the value as UTF-16 and count the
// resulting code units, so a rune outside the BMP contributes two.
encoded := utf16.Encode([]rune(t.value))
return len(encoded)
}

// String returns the string representation of this value.
Expand All @@ -63,9 +65,9 @@ func (t *TextValue) AnnotatedString() string {
// Split splits this value by the given offset.
// The receiver keeps the part before offset; the returned value holds the
// remainder.
//
// NOTE(review): this listing is a rendered diff without +/- markers, so the
// body below appears to contain both the removed and the added implementation.
func (t *TextValue) Split(offset int) RGATreeSplitValue {
value := t.value
// Presumably the removed lines: offset was an index into the rune slice.
r := []rune(value)
t.value = string(r[0:offset])
return NewTextValue(string(r[offset:]))
// Presumably the added lines: offset is an index into UTF-16 code units.
encoded := utf16.Encode([]rune(value))
// NOTE(review): if offset falls between the two halves of a surrogate pair,
// utf16.Decode is documented to substitute U+FFFD for each lone half, which
// would alter the text — confirm callers never pass such an offset. An offset
// outside [0, len(encoded)] makes the slice expressions panic.
t.value = string(utf16.Decode(encoded[0:offset]))
return NewTextValue(string(utf16.Decode(encoded[offset:])))
}

// DeepCopy copies itself deeply.
Expand Down
22 changes: 22 additions & 0 deletions pkg/document/json/text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,26 @@ func TestText(t *testing.T) {
text.Edit(fromPos, toPos, nil, "Yorkie", ctx.IssueTimeTicket())
assert.Equal(t, `"Hello Yorkie"`, text.Marshal())
})

// Checks that Len and Split on both TextValue and RichTextValue measure text
// in UTF-16 code units rather than in runes.
t.Run("UTF-16 code units test", func(t *testing.T) {
// Each case pairs a string with its expected length in UTF-16 code units.
tests := []struct {
length int
value string
}{
{4, "abcd"},
{2, "한글"},
{8, "अनुच्छेद"},
// Six emoji outside the BMP, two code units (one surrogate pair) each.
{12, "🌷🎁💩😜👍🏳"},
{10, "Ĺo͂řȩm̅"},
}
for _, test := range tests {
val := json.NewTextValue(test.value)
assert.Equal(t, test.length, val.Len())
// Split(2) returns the tail, which must be two code units shorter.
// NOTE(review): in the emoji case offset 2 sits on a pair boundary, so no
// case here splits inside a surrogate pair.
assert.Equal(t, test.length-2, val.Split(2).Len())

richVal := json.NewRichTextValue(json.NewRHT(), test.value)
assert.Equal(t, test.length, richVal.Len())
assert.Equal(t, test.length-2, richVal.Split(2).Len())
}
})
}

0 comments on commit 2b1b791

Please sign in to comment.