Merge remote-tracking branch 'origin/main' into feat/delete_range

4paradigm · Jul 18, 2023 · 73e16d2 · 73e16d2
2 parents 2c05e00 + 2b6e630
commit 73e16d2
Show file tree

Hide file tree

Showing 138 changed files with 3,317 additions and 1,681 deletions.
diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml
@@ -177,6 +177,9 @@ jobs:
 
       - name: build
         run: |
+          # even though GNU binutils supports `ar -M` scripts, it fails with `ar: BFD (GNU Binutils) 2.40 assertion fail archive.c:1813`
+          # brew install binutils
+          # export PATH="/usr/local/opt/binutils/bin:$PATH"
           make build
           # GitHub runner disk space is limited
           # delete thirdparty build directory($ROOT/.deps/) to save disk space

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
@@ -41,6 +41,7 @@ jobs:
       SQL_JAVASDK_ENABLE: OFF
       NPROC: 2
       BUILD_SHARED_LIBS: ON
+      TESTING_ENABLE_STRIP: ON
     steps:
       - uses: actions/checkout@v3
         with:
@@ -60,36 +61,15 @@ jobs:
 
       - name: coverage configure
         run: |
-          make coverage-configure COVERAGE_NO_DEPS=ON
+          make coverage-configure
 
       - name: start service
         run: |
           ./steps/ut_zookeeper.sh start
       # make coverage-cpp will gen 16G build/ (no run), run test will take 2G more, so split make and test
       - name: Coverage CPP
         run: |
-          # rm some irrelevant files
-          rm -rf .deps/build
-          # only build tests: no build output by ctest, so we use cmake
-          # ctest --build-and-test . build --build-nocmake --build-generator "Unix Makefiles" -j $NPROC
-
-          # split make too, delete irrelavant files in the middle
-          cmake --build build --target help | grep -Eo "\S+_test$" | awk '{if(NR <80) printf(" %s ",$0)}' | xargs cmake --build build -j $NPROC --target
-          # rm some in the middle, leave ~7.4G
-          rm -f build/bin/* # no need anymore
-          find build -name "*.a" -type f -delete
-          du -d 1 -h build
-          df -h
-          cmake --build build --target help | grep -Eo "\S+_test$" | awk '{if(NR >=80) printf(" %s ",$0)}' | xargs cmake --build build -j $NPROC --target
-          # ~11G
-          # rm some irrelevant files, *.dir/Makefile/*.make can't be deleted, test in bin won't test by coverage
-          rm -f build/bin/*
-          find build \( -name "*.a" -o -name "*.o" -o -name "*.o.d" -o -name "cmake*.cmake" \) -delete
-          du -d 1 -h build
-          df -h
-          # Makefile target coverage, not the cmake target(won't build tests again), total 119(120-api_server_test)
-          cd build
-          SQL_CASE_BASE_DIR=$(pwd)/.. YAML_CASE_BASE_DIR=$(pwd)/.. make coverage
+          make coverage-cpp
 
       - name: debug
         if: always()

diff --git a/.github/workflows/sdk.yml b/.github/workflows/sdk.yml
@@ -260,8 +260,7 @@ jobs:
 
       - name: prepare python deps
         run: |
-          python3 -m easy_install pip
-          pip install setuptools wheel twine
+          pip install twine "urllib3>=1.26.0,<2.0.0"
           yum install -y net-tools
 
       - name: test sqlalchemy and generate coverage report

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,28 @@
 # Changelog
 
+## [0.8.1] - 2023-06-28
+
+### Features
+- Support a new SQL statement `ALTER TABLE ... ADD/DROP OFFLINE_PATH ...` (#3286 @aceforeverd, #3323 @tobegit3hub)
+- Support deploying SQLs in which the involved tables have data but without pre-aggregation defined (#3288 @dl239)
+- Support new built-in functions `top_n_value_ratio_cate`, `top_n_key_ratio_cate`, `list_except_by_key` and `list_except_by_value` (#3329 @aceforeverd)
+- Add a new SDK API to merge multiple SQLs for deployment (#3297 @vagetablechicken)
+- Support mapping topic tables in the Kafka connector (#3282 @vagetablechicken)
+- Support deploying the Kafka connector in Docker and Kubernetes (#3276 @tobegit3hub)
+- Support fetching jobs from NameServer (#3293 @dl239)
+- Enhance the diagnostic tool (#3224 #3208 #3285 #3258 #3303 @zhangziheng01233)
+- Enhance the `SELECT INTO ...` statement (#2529 @vagetablechicken)
+- Improve the documents (#3308 @aceforeverd)
+- Other minor features (#3312 #3314 @vagetablechicken, #3298 @aceforeverd)
+
+### Bug Fixes
+- SQL deployment fails in some cases (#3328 @vagetablechicken)
+- Creating UDFs/UDAFs may fail as the `udf` directory does not exist by default. (#3326 @vagetablechicken)
+- Other minor bug fixes (#3281 #3284 @vagetablechicken)
+
+### Code Refactoring
+#3226 @dl239, #3294 @aceforeverd
+
 ## [0.8.0] - 2023-05-12
 
 ### Features
@@ -576,6 +599,7 @@ Removed
 - openmldb-0.2.0-linux.tar.gz targets on x86_64
 - aarch64 artifacts are considered experimental
 
+[0.8.1]: https://github.com/4paradigm/OpenMLDB/compare/v0.8.0...v0.8.1
 [0.8.0]: https://github.com/4paradigm/OpenMLDB/compare/v0.7.3...v0.8.0
 [0.7.3]: https://github.com/4paradigm/OpenMLDB/compare/v0.7.2...v0.7.3
 [0.7.2]: https://github.com/4paradigm/OpenMLDB/compare/v0.7.1...v0.7.2

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -90,6 +90,7 @@ option(TESTING_ENABLE "Enable test" OFF)
 option(TCMALLOC_ENABLE "Enable TCMALLOC" ON)
 option(SQL_PYSDK_ENABLE "Enable sql pysdk" OFF)
 option(SQL_JAVASDK_ENABLE "Enable sql javasdk" OFF)
+option(INSTALL_CXXSDK "Enable sql cxxsdk install" OFF)
 option(MAC_TABLET_ENABLE "Enable Table on Mac OS" ON)
 option(COVERAGE_ENABLE "Enable Coverage" OFF)
 option(COVERAGE_NO_DEPS "Coverage without test deps, should ensure test built" OFF)
@@ -195,7 +196,7 @@ set(LLVM_DIR "${CMAKE_PREFIX_PATH}/lib/cmake/llvm")
 find_package(LLVM REQUIRED CONFIG)
 message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
 message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
-llvm_map_components_to_libnames(LLVM_LIBS support core orcjit nativecodegen)
+llvm_map_components_to_libnames(LLVM_LIBS support core irreader orcjit nativecodegen)
 message(STATUS "Using LLVM components: ${LLVM_LIBS}")
 add_definitions(${LLVM_DEFINITIONS})
 
@@ -222,6 +223,7 @@ list(
         absl::random_random
         absl::strings
         absl::strings_internal
+        absl::str_format
         absl::synchronization
         absl::time
         absl::status
@@ -238,28 +240,17 @@ if (NOT ICU_FOUND)
 endif ()
 message(STATUS "Found ICU Libraries: ${ICU_LIBRARIES}")
 
-list(APPEND file_based_test_driver_LIBS
-        alternations
-        test_case_options
-        test_case_outputs
-        test_case_mode
-        logging
-        path
-        status
-        unified_diff_oss
-        ret_check
-        )
+find_library(zetasql_LIB zetasql)
+find_library(re2_LIB re2)
 set(ZETASQL_LIBS
-        zetasql
-        ${file_based_test_driver_LIBS}
+        ${zetasql_LIB}
         ${ABSL_LIBS}
-        re2
+        ${re2_LIB}
         ${ICU_LIBRARIES}
-        date_proto
-        timeofday_proto
-        latlng_proto
         )
 
+find_library(zookeeper_mt_LIB zookeeper_mt)
+
 if (TCMALLOC_ENABLE)
     set(CMAKE_EXE_LINKER_FLAGS "-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free ${CMAKE_EXE_LINKER_FLAGS}")
 endif()
@@ -328,3 +319,4 @@ install(
   DESTINATION tools
   PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
 )
+install(DIRECTORY DESTINATION udf)
diff --git a/Makefile b/Makefile
@@ -34,6 +34,9 @@ endif
 ifdef SQL_JAVASDK_ENABLE
     OPENMLDB_CMAKE_FLAGS += -DSQL_JAVASDK_ENABLE=$(SQL_JAVASDK_ENABLE)
 endif
+ifdef INSTALL_CXXSDK
+	OPENMLDB_CMAKE_FLAGS += -DINSTALL_CXXSDK=$(INSTALL_CXXSDK)
+endif
 ifdef TESTING_ENABLE
     OPENMLDB_CMAKE_FLAGS += -DTESTING_ENABLE=$(TESTING_ENABLE)
 endif

diff --git a/README.md b/README.md
@@ -4,7 +4,6 @@
 [![docker pulls](https://img.shields.io/docker/pulls/4pdosc/openmldb.svg)](https://hub.docker.com/r/4pdosc/openmldb)
 [![slack](https://img.shields.io/badge/Slack-Join%20Slack-blue)](https://join.slack.com/t/hybridsql-ws/shared_invite/zt-ozu3llie-K~hn9Ss1GZcFW2~K_L5sMg)
 [![discuss](https://img.shields.io/badge/Discuss-Ask%20Questions-blue)](https://github.com/4paradigm/OpenMLDB/discussions)
-[![codecov](https://codecov.io/gh/4paradigm/OpenMLDB/branch/main/graph/badge.svg?token=OMPII8NGN2)](https://codecov.io/gh/4paradigm/OpenMLDB)
 [![release](https://img.shields.io/github/v/release/4paradigm/OpenMLDB?color=lime)](https://github.com/4paradigm/OpenMLDB/releases)
 [![license](https://img.shields.io/github/license/4paradigm/OpenMLDB?color=orange)](https://github.com/4paradigm/OpenMLDB/blob/main/LICENSE)
 [![gitee](https://img.shields.io/badge/Gitee-mirror-lightyellow)](https://gitee.com/paradigm4/OpenMLDB)
@@ -158,7 +157,7 @@ We really appreciate the contribution from our community.
 ## 12. Publications
 
 - [Scalable Online Interval Join on Modern Multicore Processors in OpenMLDB](docs/paper/scale_oij_icde2023.pdf). Hao Zhang, Xianzhi Zeng, Shuhao Zhang, Xinyi Liu, Mian Lu, and Zhao Zheng. In 2023 IEEE 39th International Conference on Data Engineering (ICDE) 2023. [[code]](https://github.com/4paradigm/OpenMLDB/tree/stream)
-- [FEBench: A Benchmark for Real-Time Relational Data Feature Extraction](https://github.com/decis-bench/febench/blob/main/report/febench.pdf). Xuanhe Zhou, Cheng Chen, Kunyi Li, Bingsheng He, Mian Lu, Qiaosheng Liu, Wei Huang, Guoliang Li, Zhao Zheng, Yuqiang Chen. In submission. [[code]](https://github.com/decis-bench/febench).
+- [FEBench: A Benchmark for Real-Time Relational Data Feature Extraction](https://github.com/decis-bench/febench/blob/main/report/febench.pdf). Xuanhe Zhou, Cheng Chen, Kunyi Li, Bingsheng He, Mian Lu, Qiaosheng Liu, Wei Huang, Guoliang Li, Zhao Zheng, Yuqiang Chen. International Conference on Very Large Data Bases (VLDB) 2023. [[code]](https://github.com/decis-bench/febench).
 -  [A System for Time Series Feature Extraction in Federated Learning](https://dl.acm.org/doi/pdf/10.1145/3511808.3557176). Siqi Wang, Jiashu Li, Mian Lu, Zhao Zheng, Yuqiang Chen, and Bingsheng He. 2022. In Proceedings of the 31st ACM International Conference on Information & Knowledge Management (CIKM) 2022. [[code]](https://github.com/4paradigm/tsfe).
 - [Optimizing in-memory database engine for AI-powered on-line decision augmentation using persistent memory](http://vldb.org/pvldb/vol14/p799-chen.pdf). Cheng Chen, Jun Yang, Mian Lu, Taize Wang, Zhao Zheng, Yuqiang Chen, Wenyuan Dai, Bingsheng He, Weng-Fai Wong, Guoan Wu, Yuping Zhao, and Andy Rudoff. International Conference on Very Large Data Bases (VLDB) 2021.
 

diff --git a/README_cn.md b/README_cn.md
@@ -4,7 +4,6 @@
 [![docker pulls](https://img.shields.io/docker/pulls/4pdosc/openmldb.svg)](https://hub.docker.com/r/4pdosc/openmldb)
 [![slack](https://img.shields.io/badge/Slack-Join%20Slack-blue)](https://join.slack.com/t/hybridsql-ws/shared_invite/zt-ozu3llie-K~hn9Ss1GZcFW2~K_L5sMg)
 [![discuss](https://img.shields.io/badge/Discuss-Ask%20Questions-blue)](https://github.com/4paradigm/OpenMLDB/discussions)
-[![codecov](https://codecov.io/gh/4paradigm/OpenMLDB/branch/main/graph/badge.svg?token=OMPII8NGN2)](https://codecov.io/gh/4paradigm/OpenMLDB)
 [![release](https://img.shields.io/github/v/release/4paradigm/OpenMLDB?color=lime)](https://github.com/4paradigm/OpenMLDB/releases)
 [![license](https://img.shields.io/github/license/4paradigm/OpenMLDB?color=orange)](https://github.com/4paradigm/OpenMLDB/blob/main/LICENSE)
 [![gitee](https://img.shields.io/badge/Gitee-mirror-lightyellow)](https://gitee.com/paradigm4/OpenMLDB)
@@ -147,7 +146,7 @@ OpenMLDB 的整体架构设计是为了达到特征平台从开发到部署的
 ## 12. 学术论文
 
 - [Scalable Online Interval Join on Modern Multicore Processors in OpenMLDB](docs/paper/scale_oij_icde2023.pdf). Hao Zhang, Xianzhi Zeng, Shuhao Zhang, Xinyi Liu, Mian Lu, and Zhao Zheng. In 2023 IEEE 39th International Conference on Data Engineering (ICDE) 2023. [[code]](https://github.com/4paradigm/OpenMLDB/tree/stream)
-- [FEBench: A Benchmark for Real-Time Relational Data Feature Extraction](https://github.com/decis-bench/febench/blob/main/report/febench.pdf). Xuanhe Zhou, Cheng Chen, Kunyi Li, Bingsheng He, Mian Lu, Qiaosheng Liu, Wei Huang, Guoliang Li, Zhao Zheng, Yuqiang Chen. In submission. [[code]](https://github.com/decis-bench/febench).
+- [FEBench: A Benchmark for Real-Time Relational Data Feature Extraction](https://github.com/decis-bench/febench/blob/main/report/febench.pdf). Xuanhe Zhou, Cheng Chen, Kunyi Li, Bingsheng He, Mian Lu, Qiaosheng Liu, Wei Huang, Guoliang Li, Zhao Zheng, Yuqiang Chen. International Conference on Very Large Data Bases (VLDB) 2023. [[code]](https://github.com/decis-bench/febench).
 - [A System for Time Series Feature Extraction in Federated Learning](https://dl.acm.org/doi/pdf/10.1145/3511808.3557176). Siqi Wang, Jiashu Li, Mian Lu, Zhao Zheng, Yuqiang Chen, and Bingsheng He. 2022. In Proceedings of the 31st ACM International Conference on Information & Knowledge Management (CIKM) 2022. [[code]](https://github.com/4paradigm/tsfe).
 - [Optimizing in-memory database engine for AI-powered on-line decision augmentation using persistent memory](http://vldb.org/pvldb/vol14/p799-chen.pdf). Cheng Chen, Jun Yang, Mian Lu, Taize Wang, Zhao Zheng, Yuqiang Chen, Wenyuan Dai, Bingsheng He, Weng-Fai Wong, Guoan Wu, Yuping Zhao, and Andy Rudoff. International Conference on Very Large Data Bases (VLDB) 2021.
 

diff --git a/cases/function/function/test_udaf_function.yaml b/cases/function/function/test_udaf_function.yaml
@@ -2719,3 +2719,47 @@ cases:
         200, 1-2, 2,  NULL
         300, 1-2, 2,  2
         400, 1-2, 2,  2
+
+  - id: 66
+    desc: top_n_value_ratio_cate/top_n_key_ratio_cate
+    sql: |
+      select
+        idx,
+        top_n_value_ratio_cate(val, val > 100, cate, 2) over w as ratio_by_value,
+        top_n_key_ratio_cate(val, val > 100, cate, 2) over w as ratio_by_key,
+        top_n_value_ratio_cate(val, val > 100, cate, -1) over w as ratio_by_value_full,
+        top_n_key_ratio_cate(val, val > 100, cate, -2) over w as ratio_by_key_full,
+        top_n_value_ratio_cate(val, val > 100, cate, 0) over w as ratio_by_value_empty,
+        top_n_key_ratio_cate(val, val > 100, cate, 0) over w as ratio_by_key_empty,
+      from t1
+      window w as (
+        partition by gp order by ts
+        rows_range between 10s preceding and 1s preceding)
+    inputs:
+      - name: t1
+        columns: ["idx int", "gp int", "val int", "cate string", "ts timestamp"]
+        indexs: ['idx:gp:ts']
+        data: |
+          0,   1, 200, a, 1000
+          100, 1, 300, b, 2000
+          200, 1, 200, NULL, 3000
+          300, 1, 10,  b, 4000
+          400, 1, 101, c, 5000
+          500, 1, 101, c, 6000
+    expect:
+      columns:
+        - idx int
+        - ratio_by_value string
+        - ratio_by_key string
+        - ratio_by_value_full string
+        - ratio_by_key_full string
+        - ratio_by_value_empty string
+        - ratio_by_key_empty string
+      order: idx
+      rows:
+        - [0, "", "", "", "", "", ""]
+        - [100, "a:1.000000", "a:1.000000", "a:1.000000", "a:1.000000", "", ""]
+        - [200, "b:1.000000,a:1.000000", "b:1.000000,a:1.000000", "b:1.000000,a:1.000000", "b:1.000000,a:1.000000", "", ""]
+        - [300, "b:1.000000,a:1.000000", "b:1.000000,a:1.000000", "b:1.000000,a:1.000000", "b:1.000000,a:1.000000", "", ""]
+        - [400, "a:1.000000,b:0.500000", "b:0.500000,a:1.000000", "a:1.000000,b:0.500000", "b:0.500000,a:1.000000",  "", ""]
+        - [500, "c:1.000000,a:1.000000", "c:1.000000,b:0.500000", "c:1.000000,a:1.000000,b:0.500000", "c:1.000000,b:0.500000,a:1.000000", "", ""]
diff --git a/cases/function/test_feature_zero_function.yaml b/cases/function/test_feature_zero_function.yaml
@@ -201,3 +201,36 @@ cases:
         - [3, 3, 1, 1]
         - [4, 0, 0, 0]
         - [5, 2, 2, 2]
+
+  - id: 7
+    desc: list_except_by_key/list_except_by_value
+    inputs:
+      - name: t1
+        columns: ["idx int", "gp int", "val string", "ts timestamp"]
+        indexs: ['idx:gp:ts']
+        rows:
+          - [100, 1, "a:1,b:2,c:0", 1000]
+          - [101, 1, "abc", 1000]
+          - [102, 1, "a,b,c", 1000]
+    # note
+    # 1. 'abc' as a kv pair is parsed to `abc=`; the value is an empty string
+    # 2. if the list is not kv pairs but simply 'k1,k2,k3', it falls back to (gives the same result as) filtering on the whole kv pair
+    sql: |
+      select idx,
+        `join`(list_except_by_key(split(val, ','), 'a,b'), " ") as keys_filterd,
+        `join`(list_except_by_value(split(val, ','), '1,2'), " ") as values_filterd,
+        `join`(list_except_by_key(split(val, ','), ''), " ") as filter_nothing1,
+        `join`(list_except_by_value(split(val, ','), ''), " ") as filter_nothing2,
+      from t1
+    expect:
+      order: idx
+      columns:
+        - idx int
+        - keys_filterd string
+        - values_filterd string
+        - filter_nothing1 string
+        - filter_nothing2 string
+      rows:
+        - [100, "c:0", "c:0", "a:1 b:2 c:0", "a:1 b:2 c:0"]
+        - [101, "abc", "abc", "abc", ""]
+        - [102, "c", "a b c", "a b c", ""]
diff --git a/cases/plan/cmd.yaml b/cases/plan/cmd.yaml
@@ -139,6 +139,15 @@ cases:
         +-node[CMD]
           +-cmd_type: drop table
           +-args: [db1, t1]
+  - id: 14-3
+    desc: DROP TABLE IF EXISTS
+    sql: DROP TABLE IF EXISTS t1;
+    expect:
+      node_tree_str: |
+        +-node[CMD]
+          +-cmd_type: drop table
+          +-if_exists: true
+          +-args: [t1]
   - id: 15-1
     desc: DROP INDEX
     sql: DROP INDEX t1.index1
@@ -171,6 +180,15 @@ cases:
         +-node[CMD]
           +-cmd_type: drop database
           +-args: [db1]
+  - id: 18
+    desc: DROP DATABASE IF EXISTS
+    sql: DROP DATABASE IF EXISTS db1
+    expect:
+      node_tree_str: |
+        +-node[CMD]
+          +-cmd_type: drop database
+          +-if_exists: true
+          +-args: [db1]
   - id: show_deployments
     desc: show deployments
     sql: SHOW DEPLOYMENTS;

diff --git a/cases/plan/error_unsupport_sql.yaml b/cases/plan/error_unsupport_sql.yaml
@@ -107,3 +107,10 @@ cases:
     expect:
       success: false
       msg: 'array of non-basic type is not supported: ARRAY<ARRAY<INT32>>'
+  - id: 21
+    sql: |
+      insert into t1 values (29223372036854775807)
+    expect:
+      success: false
+      msg: |
+        Invalid integer literal<OUT_OF_RANGE: 29223372036854775807 (overflow)>
diff --git a/demo/Dockerfile b/demo/Dockerfile
@@ -25,7 +25,7 @@ COPY *_dist.yml /work/
 
 ENV LANG=en_US.UTF-8
 ENV SPARK_HOME=/work/openmldb/spark-3.2.1-bin-openmldbspark
-ARG OPENMLDB_VERSION=0.8.0
+ARG OPENMLDB_VERSION=0.8.1
 ENV OPENMLDB_VERSION="${OPENMLDB_VERSION}"
 
 RUN  if [ "${USE_ADD_WHL}" = "true" ] ; then \

diff --git a/demo/README.md b/demo/README.md
@@ -12,7 +12,7 @@ docker-compose -f docker-compose.test.yml -- build
 
 1. quickstart, java_quickstart, python_quickstart
 
-   NOTE: java project can't be built in docker container(no mvn), so you should built it by `cd java_quickstart;mvn package`. If no target jar, the test will be failed. But it won't fail the whole test. The python quickstart won't fail the whole test either.
+   NOTE: the java project can't be built in the docker container (no mvn), so you should build it with `cd java_quickstart;mvn package`. If there is no target jar, the test will fail, but it won't fail the whole test. The python quickstart will be checked.
 2. taxi
 3. talkingdata
 4. oneflow sqls(no train or predict), a bit slow, ~10min.

diff --git a/demo/cxx_quickstart/Makefile b/demo/cxx_quickstart/Makefile
@@ -0,0 +1,7 @@
+all: demo
+
+demo: quickstart.cxx
+	g++ -o demo quickstart.cxx -lstdc++ -std=c++17 -I/work/openmldb/include  -L/work/openmldb/lib -lopenmldbsdk -lpthread -lm -ldl -lstdc++fs
+
+clean:
+	rm -f demo