Skip to content

Commit

Permalink
charset: support utf8mb4_unicode_ci collation (#8420) (#9577)
Browse files Browse the repository at this point in the history
cherry-pick #8420 to release-4.0
You can switch your code base to this Pull Request by using [git-extras](https://github.com/tj/git-extras):
```bash
# In tikv repo:
git pr #9577
```

After applying your modifications, you can push your changes to this PR via:
```bash
git push [email protected]:ti-srebot/tikv.git pr/9577:release-4.0-f456abae9e5c
```

---

<!--
Thank you for contributing to TiKV!

If you haven't already, please read TiKV's [CONTRIBUTING](https://github.com/tikv/tikv/blob/master/CONTRIBUTING.md) document.

If you're unsure about anything, just ask; somebody should be along to answer within a day or two.

PR Title Format:
1. module [, module2, module3]: what's changed
2. *: what's changed
-->
Signed-off-by: jwxiong <[email protected]>

### What problem does this PR solve?

Problem Summary:

support utf8mb4_unicode_ci collation pingcap/tidb#17596

### What is changed and how it works?

add utf8mb4_unicode_ci support in tikv
For details, please see pingcap/tidb#18776

### Check List <!--REMOVE the items that are not applicable-->

Tests <!-- At least one of them must be included. -->

- Unit test
- Integration test
- Manual test (add detailed scripts or steps below)


### Release note <!-- bugfixes or new feature need a release note -->
- add utf8mb4_unicode_ci implementation
  • Loading branch information
ti-srebot authored Jan 28, 2021
1 parent 4492bc4 commit fcd71f3
Show file tree
Hide file tree
Showing 8 changed files with 842 additions and 12 deletions.
2 changes: 2 additions & 0 deletions components/tidb_query/src/aggr_fn/impl_max_min.rs
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,12 @@ mod tests {
(Collation::Binary, true, vec!["B", "a"], "a"),
(Collation::Utf8Mb4Bin, true, vec!["B", "a"], "a"),
(Collation::Utf8Mb4GeneralCi, true, vec!["B", "a"], "B"),
(Collation::Utf8Mb4UnicodeCi, true, vec!["ß", "sr"], "ß"),
(Collation::Utf8Mb4BinNoPadding, true, vec!["B", "a"], "a"),
(Collation::Binary, false, vec!["B", "a"], "B"),
(Collation::Utf8Mb4Bin, false, vec!["B", "a"], "B"),
(Collation::Utf8Mb4GeneralCi, false, vec!["B", "a"], "a"),
(Collation::Utf8Mb4UnicodeCi, false, vec!["ß", "st"], "ß"),
(Collation::Utf8Mb4BinNoPadding, false, vec!["B", "a"], "B"),
];
for (coll, is_max, args, expected) in cases {
Expand Down
2 changes: 2 additions & 0 deletions components/tidb_query/src/codec/collation/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0.

mod latin1_bin;
mod unicode_ci_data;
mod utf8mb4;

pub use self::latin1_bin::*;
Expand All @@ -23,6 +24,7 @@ pub macro match_template_collator($t:tt, $($tail:tt)*) {
Utf8Mb4BinNoPadding => CollatorUtf8Mb4BinNoPadding,
Utf8Mb4GeneralCi => CollatorUtf8Mb4GeneralCi,
Latin1Bin => CollatorLatin1Bin,
Utf8Mb4UnicodeCi => CollatorUtf8Mb4UnicodeCi,
],
$($tail)*
}
Expand Down
582 changes: 582 additions & 0 deletions components/tidb_query/src/codec/collation/unicode_ci_data.rs

Large diffs are not rendered by default.

210 changes: 198 additions & 12 deletions components/tidb_query/src/codec/collation/utf8mb4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::str;
use codec::prelude::*;

use super::*;
use crate::codec::collation::unicode_ci_data::*;
use crate::codec::Result;

pub struct CharsetUtf8mb4;
Expand Down Expand Up @@ -395,6 +396,111 @@ impl Collator for CollatorUtf8Mb4GeneralCi {
}
}

/// Looks up the packed `utf8mb4_unicode_ci` collation weights of a single character.
///
/// Callers in this file consume the returned value 16 bits at a time, starting from the
/// low bits, and treat a value of `0` as "this character contributes nothing".
///
/// * Code points above `0xFFFF` are not looked up; they all yield `0xFFFD`.
/// * A table entry equal to `LONG_RUNE` is a marker: the real weights are fetched from
///   `map_long_rune` instead.
/// * Any other table entry is returned as-is, widened to `u128`.
#[inline]
fn unicode_ci_convert(c: char) -> u128 {
    let code_point = c as usize;
    if code_point > 0xFFFF {
        return 0xFFFD;
    }

    let weight = UNICODE_CI_TABLE[code_point];
    if weight != LONG_RUNE {
        weight as u128
    } else {
        map_long_rune(code_point)
    }
}

/// Collator for `utf8mb4_unicode_ci` collation with padding behavior (trims right spaces).
///
/// Strings are ordered by the collation weights that `unicode_ci_convert` assigns to each
/// character rather than by their raw bytes. Trailing `TRIM_PADDING_SPACE` characters are
/// removed before weights are computed, so they never affect the sort key, the comparison
/// result or the hash.
#[derive(Debug)]
pub struct CollatorUtf8Mb4UnicodeCi;

impl Collator for CollatorUtf8Mb4UnicodeCi {
type Charset = CharsetUtf8mb4;

#[inline]
fn validate(bstr: &[u8]) -> Result<()> {
str::from_utf8(bstr)?;
Ok(())
}

fn write_sort_key<W: BufferWriter>(writer: &mut W, bstr: &[u8]) -> Result<usize> {
let s = str::from_utf8(bstr)?.trim_end_matches(TRIM_PADDING_SPACE);
let mut n = 0;
for ch in s.chars() {
let mut convert = unicode_ci_convert(ch);
while convert != 0 {
writer.write_u16_be((convert & 0xFFFF) as u16)?;
n += 1;
convert >>= 16
}
}
Ok(n * std::mem::size_of::<u16>())
}

fn sort_compare(a: &[u8], b: &[u8]) -> Result<Ordering> {
let mut ca = str::from_utf8(a)?
.trim_end_matches(TRIM_PADDING_SPACE)
.chars();
let mut cb = str::from_utf8(b)?
.trim_end_matches(TRIM_PADDING_SPACE)
.chars();
let mut an = 0;
let mut bn = 0;

loop {
if an == 0 {
for ach in &mut ca {
an = unicode_ci_convert(ach);
if an != 0 {
break;
}
}
}

if bn == 0 {
for bch in &mut cb {
bn = unicode_ci_convert(bch);
if bn != 0 {
break;
}
}
}

if an == 0 || bn == 0 {
return Ok(an.cmp(&bn));
}

if an == bn {
an = 0;
bn = 0;
continue;
}

while an != 0 && bn != 0 {
if (an ^ bn) & 0xFFFF == 0 {
an >>= 16;
bn >>= 16;
} else {
return Ok((an & 0xFFFF).cmp(&(bn & 0xFFFF)));
}
}
}
}

fn sort_hash<H: Hasher>(state: &mut H, bstr: &[u8]) -> Result<()> {
let s = str::from_utf8(bstr)?.trim_end_matches(TRIM_PADDING_SPACE);
for ch in s.chars() {
let mut convert = unicode_ci_convert(ch);
while convert != 0 {
(convert & 0xFFFF).hash(state);
convert >>= 16;
}
}
Ok(())
}
}

/// Collator for utf8mb4_bin collation with padding behavior (trims right spaces).
#[derive(Debug)]
pub struct CollatorUtf8Mb4Bin;
Expand Down Expand Up @@ -480,43 +586,89 @@ mod tests {
(Collation::Utf8Mb4Bin, 0),
(Collation::Utf8Mb4BinNoPadding, 1),
(Collation::Utf8Mb4GeneralCi, 2),
(Collation::Utf8Mb4UnicodeCi, 3),
];
let cases = vec![
// (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi])
// (sa, sb, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi])
(
"a".as_bytes(),
"a".as_bytes(),
[Ordering::Equal, Ordering::Equal, Ordering::Equal],
[
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
Ordering::Equal,
],
),
(
"a".as_bytes(),
"a ".as_bytes(),
[Ordering::Equal, Ordering::Less, Ordering::Equal],
[
Ordering::Equal,
Ordering::Less,
Ordering::Equal,
Ordering::Equal,
],
),
(
"a".as_bytes(),
"A ".as_bytes(),
[Ordering::Greater, Ordering::Greater, Ordering::Equal],
[
Ordering::Greater,
Ordering::Greater,
Ordering::Equal,
Ordering::Equal,
],
),
(
"aa ".as_bytes(),
"a a".as_bytes(),
[Ordering::Greater, Ordering::Greater, Ordering::Greater],
[
Ordering::Greater,
Ordering::Greater,
Ordering::Greater,
Ordering::Greater,
],
),
(
"A".as_bytes(),
"a\t".as_bytes(),
[Ordering::Less, Ordering::Less, Ordering::Less],
[
Ordering::Less,
Ordering::Less,
Ordering::Less,
Ordering::Less,
],
),
(
"cAfe".as_bytes(),
"café".as_bytes(),
[Ordering::Less, Ordering::Less, Ordering::Equal],
[
Ordering::Less,
Ordering::Less,
Ordering::Equal,
Ordering::Equal,
],
),
(
"cAfe ".as_bytes(),
"café".as_bytes(),
[Ordering::Less, Ordering::Less, Ordering::Equal],
[
Ordering::Less,
Ordering::Less,
Ordering::Equal,
Ordering::Equal,
],
),
(
"ß".as_bytes(),
"ss".as_bytes(),
[
Ordering::Greater,
Ordering::Greater,
Ordering::Less,
Ordering::Equal,
],
),
];

Expand Down Expand Up @@ -568,18 +720,34 @@ mod tests {
(Collation::Utf8Mb4Bin, 0),
(Collation::Utf8Mb4BinNoPadding, 1),
(Collation::Utf8Mb4GeneralCi, 2),
(Collation::Utf8Mb4UnicodeCi, 3),
];
let cases = vec![
// (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi])
("a", [vec![0x61], vec![0x61], vec![0x00, 0x41]]),
("A ", [vec![0x41], vec![0x41, 0x20], vec![0x00, 0x41]]),
("A", [vec![0x41], vec![0x41], vec![0x00, 0x41]]),
// (str, [Utf8Mb4Bin, Utf8Mb4BinNoPadding, Utf8Mb4GeneralCi, Utf8Mb4UnicodeCi])
(
"a",
[vec![0x61], vec![0x61], vec![0x00, 0x41], vec![0x0E, 0x33]],
),
(
"A ",
[
vec![0x41],
vec![0x41, 0x20],
vec![0x00, 0x41],
vec![0x0E, 0x33],
],
),
(
"A",
[vec![0x41], vec![0x41], vec![0x00, 0x41], vec![0x0E, 0x33]],
),
(
"😃",
[
vec![0xF0, 0x9F, 0x98, 0x83],
vec![0xF0, 0x9F, 0x98, 0x83],
vec![0xff, 0xfd],
vec![0xff, 0xfd],
],
),
(
Expand All @@ -601,6 +769,24 @@ mod tests {
0x00, 0x42, 0x00, 0x41, 0x00, 0x5a, 0x00, 0x20, 0x26, 0x3, 0x00, 0x20,
0x00, 0x51, 0x00, 0x55, 0x00, 0x58,
],
vec![
0x0E, 0xB9, 0x0F, 0x82, 0x0F, 0x82, 0x02, 0x09, 0x02, 0xC5, 0x02, 0x09,
0x0E, 0x4A, 0x0E, 0x33, 0x0F, 0xC0, 0x02, 0x09, 0xFF, 0xFD, 0x02, 0x09,
0x0E, 0x4A, 0x0E, 0x33, 0x10, 0x6A, 0x02, 0x09, 0x06, 0xFF, 0x02, 0x09,
0x0F, 0xB4, 0x10, 0x1F, 0x10, 0x5A,
],
],
),
(
"ﷻ",
[
vec![0xEF, 0xB7, 0xBB],
vec![0xEF, 0xB7, 0xBB],
vec![0xFD, 0xFB],
vec![
0x13, 0x5E, 0x13, 0xAB, 0x02, 0x09, 0x13, 0x5E, 0x13, 0xAB, 0x13, 0x50,
0x13, 0xAB, 0x13, 0xB7,
],
],
),
];
Expand Down
Loading

0 comments on commit fcd71f3

Please sign in to comment.