Skip to content

Commit

Permalink
fix: gcformat space and continuous sign (#3921)
Browse files Browse the repository at this point in the history
* fix: gcformat space

* fix: gcformat continuous sign use hash

* fix: delete incorrect comments
  • Loading branch information
wyl4pd authored May 14, 2024
1 parent 673ab1d commit 63d3a17
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 39 deletions.
80 changes: 45 additions & 35 deletions cases/query/feature_signature_query.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ cases:
mode: procedure-unsupport
db: db1
sql: |
select gcformat(
select concat("#", gcformat(
discrete(3, -1),
discrete(3, 0),
discrete(3, int("null")),
Expand All @@ -57,31 +57,31 @@ cases:
discrete(-1, 5),
discrete(-2, 5),
discrete(-3, 5),
discrete(-4, 5)) as instance,
discrete(-4, 5))) as instance;
expect:
schema: instance:string
data: |
| 4:628 5:491882390849628 6:0 7:4 8:1 9:3 10:1 11:1 12:0 13:0 14:4
# | 4:628 5:491882390849628 6:0 7:4 8:1 9:3 10:1 11:1 12:0 13:0 14:4
- id: 2
desc: feature signature select GCFormat no label
mode: procedure-unsupport
db: db1
sql: |
select gcformat(
select concat("#", gcformat(
discrete(hash64("x"), 1),
continuous(pow(10, 30)),
continuous(-pow(10, 1000)),
continuous(abs(sqrt(-1)))) as instance;
continuous(abs(sqrt(-1))))) as instance;
expect:
schema: instance:string
data: |
| 1:0 2:0:1000000000000000019884624838656.000000 3:0:-inf 4:0:nan
# | 1:0 2:3353244675891348105:1000000000000000019884624838656.000000 3:7262150054277104024:-inf 4:3255232038643208583:nan
- id: 3
desc: feature signature GCFormat null
mode: procedure-unsupport
db: db1
sql: |
select gcformat(
select concat("#", gcformat(
regression_label(2),
regression_label(int("null")),
continuous(int("null")),
Expand All @@ -98,31 +98,31 @@ cases:
discrete(3, -100),
discrete(3),
continuous(0.0),
continuous(int("null"))) as instance;
continuous(int("null")))) as instance;
expect:
schema: instance:string
data: |
| 3:0:-1 4:0:2681491882390849628 5:28 8:2681491882390849628 9:0:-1 10:28 13:2681491882390849628 14:0:0.000000
# | 3:7262150054277104024:-1 4:3255232038643208583:2681491882390849628 5:28 8:2681491882390849628 9:-7745589761753622095:-1 10:28 13:2681491882390849628 14:398281081943027035:0.000000
- id: 4
desc: feature signature GCFormat no feature
mode: procedure-unsupport
db: db1
sql: |
select gcformat(binary_label(false));
select concat(gcformat(binary_label(false)), "#") as instance;
expect:
schema: gcformat(binary_label(false)):string
schema: instance:string
data: |
0|
0 | #
- id: 5
desc: feature signature GCFormat nothing
mode: procedure-unsupport
db: db1
sql: |
select gcformat();
select concat(concat("#", gcformat()), "#") as instance;
expect:
schema: gcformat():string
schema: instance:string
data: |
|
# | #
- id: 6
desc: feature signature CSV no label
mode: procedure-unsupport
Expand All @@ -136,7 +136,7 @@ cases:
expect:
columns: [instance:string]
rows:
- [",,,628"]
- [ ",,,628" ]
- id: 7
desc: feature signature CSV null
mode: procedure-unsupport
Expand All @@ -163,7 +163,7 @@ cases:
expect:
columns: [ "instance:string "]
rows:
- ["2,,,,-1,2681491882390849628,28,,,2681491882390849628,-1,28,,,2681491882390849628,0.000000,"]
- [ "2,,,,-1,2681491882390849628,28,,,2681491882390849628,-1,28,,,2681491882390849628,0.000000," ]
- id: 8
desc: feature signature CSV no feature
mode: procedure-unsupport
Expand Down Expand Up @@ -263,7 +263,7 @@ cases:
expect:
schema: instance:string
data: |
1| 1:0:0 2:0:1 3:0
1 | 1:5925585971146611297:0 2:3353244675891348105:1 3:0
- id: 15
desc: feature signature select GCFormat from
mode: request-unsupport
Expand All @@ -289,11 +289,11 @@ cases:
schema: instance:string
order: instance
data: |
1| 1:0:0 2:0:1 3:0
2| 1:0:0 2:0:2 3:0
3| 1:0:1 2:0:3 3:0
4| 1:0:1 2:0:4 3:0
5| 1:0:2 2:0:5 3:0
1 | 1:5925585971146611297:0 2:3353244675891348105:1 3:0
2 | 1:5925585971146611297:0 2:3353244675891348105:2 3:0
3 | 1:5925585971146611297:1 2:3353244675891348105:3 3:0
4 | 1:5925585971146611297:1 2:3353244675891348105:4 3:0
5 | 1:5925585971146611297:2 2:3353244675891348105:5 3:0
- id: 16
desc: feature signature select CSV from
mode: request-unsupport
Expand Down Expand Up @@ -360,7 +360,7 @@ cases:
mode: request-unsupport
db: db1
sql: |
SELECT gcformat(regression_label(col1)) as col1,
SELECT gcformat(regression_label(col1), discrete(col1, 1)) as col1,
csv(regression_label(col1)) as col2,
libsvm(regression_label(col1)) as col3
FROM t1;
Expand All @@ -375,14 +375,14 @@ cases:
1, 4, 55, 4.4, 44.4, 2, 4444
2, 5, 55, 5.5, 55.5, 3, aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
expect:
schema: col1:string, col2:string, col3:string
order: col1
data: |
1|, 1, 1
2|, 2, 2
3|, 3, 3
4|, 4, 4
5|, 5, 5
columns: [ "col1:string", "col2:string", "col3:string" ]
order: "col1"
rows:
- [ "1 | 1:0", "1", "1" ]
- [ "2 | 1:0", "2", "2" ]
- [ "3 | 1:0", "3", "3" ]
- [ "4 | 1:0", "4", "4" ]
- [ "5 | 1:0", "5", "5" ]
- id: 19
desc: feature signature select from join
mode: request-unsupport
Expand Down Expand Up @@ -471,15 +471,25 @@ cases:
mode: procedure-unsupport
db: db1
sql: |
select gcformat(
select concat("#", gcformat(
regression_label(2),
continuous(1),
continuous(int("notint")),
continuous(0),
continuous(0.0),
discrete(3),
regression_label(int("notint"))) as instance;
regression_label(int("notint")))) as instance;
expect:
schema: instance:string
data: |
| 1:0:1 3:0:0 4:0:0.000000 5:2681491882390849628
# | 1:5925585971146611297:1 3:7262150054277104024:0 4:3255232038643208583:0.000000 5:2681491882390849628
- id: 23
desc: hash64
mode: procedure-unsupport
db: db1
sql: |
select hash64(3) as col1, hash64(bigint(3)) as col2;
expect:
schema: col1:int64, col2:int64
data: |
2681491882390849628, 7262150054277104024
17 changes: 13 additions & 4 deletions hybridse/src/udf/default_defs/feature_signature_def.cc
Original file line number Diff line number Diff line change
Expand Up @@ -204,14 +204,23 @@ struct GCFormat {
switch (feature_signature) {
case kFeatureSignatureContinuous: {
if (!is_null) {
instance_feature += " " + std::to_string(slot_number) + ":0:" + format_continuous(input);
if (!instance_feature.empty()) {
instance_feature += " ";
}
int64_t hash = FarmFingerprint(CCallDataTypeTrait<int64_t>::to_bytes_ref(&slot_number));
instance_feature += std::to_string(slot_number) + ":";
instance_feature += format_discrete(hash);
instance_feature += ":" + format_continuous(input);
}
++slot_number;
break;
}
case kFeatureSignatureDiscrete: {
if (!is_null) {
instance_feature += " " + std::to_string(slot_number) + ":" + format_discrete(input);
if (!instance_feature.empty()) {
instance_feature += " ";
}
instance_feature += std::to_string(slot_number) + ":" + format_discrete(input);
}
++slot_number;
break;
Expand Down Expand Up @@ -249,7 +258,7 @@ struct GCFormat {
}

std::string Output() {
return instance_label + "|" + instance_feature;
return instance_label + " | " + instance_feature;
}

size_t slot_number = 1;
Expand Down Expand Up @@ -482,7 +491,7 @@ void DefaultUdfLibrary::InitFeatureSignature() {
Example:
@code{.sql}
select gcformat(multiclass_label(6), continuous(1.5), category(3));
-- output 6| 1:0:1.500000 2:2681491882390849628
-- output 6 | 1:5925585971146611297:1.500000 2:2681491882390849628
@endcode
@since 0.9.0
Expand Down

0 comments on commit 63d3a17

Please sign in to comment.