From b131d7d4dda0b9bf28c54fcf0626bb2059534559 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 13 Jun 2023 19:45:23 +0800 Subject: [PATCH 01/18] feat: use dictionary type to store column --- .github/workflows/ci.yml | 42 ++ Cargo.lock | 314 ++++----- Cargo.toml | 14 +- Dockerfile | 2 +- analytic_engine/src/compaction/mod.rs | 108 ++- analytic_engine/src/compaction/picker.rs | 66 +- analytic_engine/src/compaction/scheduler.rs | 23 +- analytic_engine/src/instance/engine.rs | 16 +- .../src/instance/flush_compaction.rs | 155 +++-- analytic_engine/src/instance/mod.rs | 4 +- analytic_engine/src/instance/open.rs | 315 +++------ .../src/instance/serial_executor.rs | 8 +- analytic_engine/src/instance/wal_replayer.rs | 618 ++++++++++++++++++ analytic_engine/src/lib.rs | 13 + analytic_engine/src/manifest/details.rs | 32 +- analytic_engine/src/memtable/mod.rs | 15 +- .../src/memtable/skiplist/factory.rs | 3 +- analytic_engine/src/memtable/skiplist/mod.rs | 44 +- analytic_engine/src/sampler.rs | 4 +- analytic_engine/src/setup.rs | 48 ++ analytic_engine/src/sst/meta_data/cache.rs | 182 +++++- .../src/sst/parquet/async_reader.rs | 116 +--- analytic_engine/src/sst/parquet/encoding.rs | 15 +- analytic_engine/src/sst/parquet/hybrid.rs | 2 + analytic_engine/src/sst/parquet/writer.rs | 161 ++++- analytic_engine/src/sst/reader.rs | 42 +- analytic_engine/src/table/data.rs | 21 +- analytic_engine/src/table/version.rs | 95 +-- analytic_engine/src/tests/alter_test.rs | 29 +- analytic_engine/src/tests/drop_test.rs | 39 +- analytic_engine/src/tests/read_write_test.rs | 128 ++-- analytic_engine/src/tests/util.rs | 145 +++- common_types/src/column.rs | 331 +++++++++- common_types/src/column_schema.rs | 86 ++- common_types/src/datum.rs | 132 +++- common_types/src/hex.rs | 64 ++ common_types/src/lib.rs | 3 +- common_types/src/record_batch.rs | 41 +- common_types/src/row/mod.rs | 4 + common_types/src/schema.rs | 2 +- common_types/src/tests.rs | 64 +- components/message_queue/Cargo.toml | 4 +- components/message_queue/src/kafka/config.rs | 21 +- .../message_queue/src/kafka/kafka_impl.rs | 10 +- components/parquet_ext/Cargo.toml | 2 + components/parquet_ext/src/lib.rs | 1 + components/parquet_ext/src/meta_data.rs | 25 +- components/parquet_ext/src/reader.rs | 81 +++ components/profile/Cargo.toml | 1 + components/profile/src/lib.rs | 65 +- df_operator/src/udfs/time_bucket.rs | 3 +- integration_tests/Makefile | 3 + .../config/shard-based-recovery.toml | 21 + integration_tests/recovery/check.py | 84 +++ integration_tests/recovery/run.sh | 35 + interpreters/src/describe.rs | 4 + interpreters/src/insert.rs | 6 +- interpreters/src/show_create.rs | 5 + interpreters/src/tests.rs | 21 +- proxy/src/forward.rs | 29 +- proxy/src/grpc/metrics.rs | 9 + proxy/src/grpc/prom_query.rs | 55 +- proxy/src/grpc/sql_query.rs | 39 +- proxy/src/grpc/write.rs | 9 +- proxy/src/http/prom.rs | 2 + proxy/src/http/sql.rs | 1 + proxy/src/influxdb/mod.rs | 1 + proxy/src/influxdb/types.rs | 12 +- proxy/src/lib.rs | 4 + proxy/src/read.rs | 7 +- proxy/src/write.rs | 26 +- query_frontend/src/frontend.rs | 7 + query_frontend/src/parser.rs | 69 ++ query_frontend/src/planner.rs | 186 +++++- rust-toolchain | 1 - rust-toolchain.toml | 3 + server/Cargo.toml | 1 - server/src/grpc/metrics.rs | 8 + server/src/grpc/remote_engine_service/mod.rs | 47 +- server/src/grpc/storage_service/mod.rs | 48 +- server/src/http.rs | 72 +- server/src/mysql/writer.rs | 5 + table_engine/src/memory.rs | 4 +- table_engine/src/table.rs | 1 + tools/src/bin/sst-metadata.rs | 87 ++- 
wal/src/message_queue_impl/log_cleaner.rs | 6 +- wal/src/message_queue_impl/region.rs | 6 +- wal/src/rocks_impl/config.rs | 21 + wal/src/rocks_impl/manager.rs | 90 ++- 89 files changed, 3795 insertions(+), 999 deletions(-) create mode 100644 analytic_engine/src/instance/wal_replayer.rs create mode 100644 common_types/src/hex.rs create mode 100644 components/parquet_ext/src/reader.rs create mode 100644 integration_tests/config/shard-based-recovery.toml create mode 100644 integration_tests/recovery/check.py create mode 100755 integration_tests/recovery/run.sh delete mode 100644 rust-toolchain create mode 100644 rust-toolchain.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a3d825fe1..e682ae5b55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -232,3 +232,45 @@ jobs: name: sdk-test-${{ github.head_ref }}.${{ github.sha }} path: | /tmp/ceresdb-stdout.log + + recovery-test: + name: recovery-test + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: Cache Rust Dependencies + uses: actions/cache@v3 + with: + path: | + ~/.cargo + ./target + key: debug-${{ runner.os }}-${{ hashFiles('rust-toolchain') }}-${{ hashFiles('Cargo.lock') }} + restore-keys: | + debug-${{ runner.os }}-${{ hashFiles('rust-toolchain') }}- + debug-${{ runner.os }}- + debug- + - run: | + rustup set auto-self-update disable + rustup toolchain install ${RUST_VERSION} --profile minimal + - name: Release Disk Quota + run: | + sudo rm -rf /usr/local/lib/android # release about 10 GB + sudo rm -rf /usr/share/dotnet # release about 20GB + - name: Setup Build Environment + run: | + sudo apt update + sudo apt install --yes protobuf-compiler + - name: Run recovery tests + working-directory: integration_tests + run: | + make run-recovery + - name: Upload Logs + if: always() + uses: actions/upload-artifact@v3 + with: + name: recovery-test-${{ github.head_ref }}.${{ github.sha }} + path: | + /tmp/ceresdb-stdout.log diff --git a/Cargo.lock b/Cargo.lock index 116d2dca10..466fe8b216 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,12 +80,16 @@ version = "1.2.2" dependencies = [ "arc-swap 1.6.0", "arena", - "arrow 38.0.0", + "arrow", "async-stream", "async-trait", "base64 0.13.1", "bytes", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "datafusion", @@ -184,31 +188,6 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" -[[package]] -name = "arrow" -version = "23.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fedc767fbaa36ea50f086215f54f1a007d22046fc4754b0448c657bcbe9f8413" -dependencies = [ - "ahash 0.8.3", - "arrow-buffer 23.0.0", - "bitflags", - "chrono", - "csv", - "flatbuffers 2.1.2", - "half 2.2.1", - "hashbrown 0.12.3", - "indexmap", - "lazy_static", - "lexical-core 0.8.5", - "multiversion", - "num", - "regex", - "regex-syntax 0.6.29", - "serde", - "serde_json", -] - [[package]] name = "arrow" version = "38.0.0" @@ -218,7 +197,7 @@ dependencies = [ "ahash 0.8.3", "arrow-arith", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-csv", "arrow-data", @@ -238,7 +217,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ace6aa3d5617c5d03041a05e01c6819428a8ddf49dd0b055df9b40fef9d96094" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "chrono", @@ -253,7 +232,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "104a04520692cc674e6afd7682f213ca41f9b13ff1873f63a5a2857a590b87b3" dependencies = [ "ahash 0.8.3", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "chrono", @@ -263,16 +242,6 @@ dependencies = [ "num", ] -[[package]] -name = "arrow-buffer" -version = "23.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d290050c6e12a81a24ad08525cef2203c4156a6350f75508d49885d677e88ea9" -dependencies = [ - "half 2.2.1", - "num", -] - [[package]] name = "arrow-buffer" version = "38.0.0" @@ -290,7 +259,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6d6e18281636c8fc0b93be59834da6bf9a72bb70fd0c98ddfdaf124da466c28" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", @@ -307,7 +276,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3197dab0963a236ff8e7c82e2272535745955ac1321eb740c29f2f88b353f54e" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", @@ -325,7 +294,7 @@ version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb68113d6ecdbe8bba48b2c4042c151bf9e1c61244e45072a50250a6fc59bafe" dependencies = [ - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-schema", "half 2.2.1", "num", @@ -338,11 +307,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eab4bbf2dd3078facb5ce0a9641316a64f42bfd8cf357e6775c8a5e6708e3a8d" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", - "flatbuffers 23.1.21", + "flatbuffers", ] [[package]] @@ -352,7 +321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c5b650d23746a494665d914a7fa3d21d939153cff9d53bdebe39bffa88f263" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", @@ -372,7 +341,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68c6fce28e5011e30acc7466b5efcb8ed0197c396240bd2b10e167f275a3c208" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", @@ -388,7 +357,7 @@ checksum = "f20a421f19799d8b93eb8edde5217e910fa1e2d6ceb3c529f000e57b6db144c0" dependencies = [ "ahash 0.8.3", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "half 2.2.1", @@ -408,7 +377,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ab6613ce65b61d85a3410241744e84e48fbab0fe06e1251b4429d21b3470fd" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "num", @@ -421,7 +390,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3008641239e884aefba66d8b8532da6af40d14296349fcc85935de4ba67b89e" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", @@ -433,7 +402,7 @@ dependencies = [ name = "arrow_ext" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "serde", "snafu 0.6.10", "zstd 0.12.3+zstd.1.5.2", @@ -442,10 +411,10 @@ 
dependencies = [ [[package]] name = "arrow_util" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "chrono", "comfy-table", "hashbrown 0.13.2", @@ -640,7 +609,7 @@ version = "1.2.2" dependencies = [ "analytic_engine", "arena", - "arrow 38.0.0", + "arrow", "base64 0.13.1", "clap 3.2.23", "common_types", @@ -1087,13 +1056,17 @@ dependencies = [ [[package]] name = "ceresdb-client" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a2c1699cb154e97cfccd3d6a0679f561c6214a33d86b3eacb78685c7479d022" +checksum = "f5f27e14a7a0c030015c0fdb06c59c46cd6f9765e381bd920e02ff316b3be48b" dependencies = [ - "arrow 23.0.0", + "arrow", "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.5", +======= + "ceresdbproto 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", +>>>>>>> 0abc9181 (update pb) "dashmap 5.4.0", "futures 0.3.28", "paste 1.0.12", @@ -1120,8 +1093,24 @@ dependencies = [ [[package]] name = "ceresdbproto" version = "1.0.4" +source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=53f5c74a54d8a08ebb08c41e8b862b2369df4a02#53f5c74a54d8a08ebb08c41e8b862b2369df4a02" +dependencies = [ + "prost", + "protoc-bin-vendored", + "tonic 0.8.3", + "tonic-build", + "walkdir", +] + +[[package]] +name = "ceresdbproto" +version = "1.0.5" +<<<<<<< HEAD source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18d5d1c238f84dee01e671603c6a921868f3663256e0393c64ece88e58ee4869" +checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" +======= +source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" +>>>>>>> 0abc9181 (update pb) dependencies = [ "prost", "protoc-bin-vendored", @@ -1274,7 +1263,11 @@ name = "cluster" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "etcd-client", @@ -1323,11 +1316,15 @@ name = "common_types" version = "1.2.2" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "arrow_ext", "byteorder", "bytes_ext", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "datafusion", "murmur3", @@ -1344,9 +1341,13 @@ dependencies = [ name = "common_util" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "backtrace", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "common_types", "crossbeam-utils 0.8.15", @@ -1794,10 +1795,10 @@ dependencies = [ [[package]] name = "datafusion" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = 
"git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "arrow-array", "arrow-schema", "async-compression", @@ -1843,9 +1844,9 @@ dependencies = [ [[package]] name = "datafusion-common" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow-array", "chrono", "num_cpus", @@ -1857,7 +1858,7 @@ dependencies = [ [[package]] name = "datafusion-execution" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "dashmap 5.4.0", "datafusion-common", @@ -1874,10 +1875,10 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "datafusion-common", "sqlparser", ] @@ -1885,9 +1886,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "chrono", "datafusion-common", @@ -1902,12 +1903,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-schema", "blake2", "blake3", @@ -1934,9 +1935,9 @@ dependencies = [ [[package]] name = "datafusion-proto" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "chrono", "datafusion", "datafusion-common", @@ -1948,9 +1949,9 @@ dependencies = [ [[package]] name = "datafusion-row" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = 
"git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "datafusion-common", "paste 1.0.12", "rand 0.8.5", @@ -1959,9 +1960,9 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow-schema", "datafusion-common", "datafusion-expr", @@ -1972,7 +1973,7 @@ dependencies = [ [[package]] name = "datafusion_util" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "async-trait", "datafusion", @@ -2027,7 +2028,7 @@ dependencies = [ name = "df_operator" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "base64 0.13.1", "bincode", "chrono", @@ -2239,17 +2240,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" -[[package]] -name = "flatbuffers" -version = "2.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b428b715fdbdd1c364b84573b5fdc0f84f8e423661b9f398735278bc7f2b6a" -dependencies = [ - "bitflags", - "smallvec", - "thiserror", -] - [[package]] name = "flatbuffers" version = "23.1.21" @@ -2507,7 +2497,7 @@ checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" [[package]] name = "generated_types" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "pbjson", "pbjson-build", @@ -2900,7 +2890,7 @@ dependencies = [ [[package]] name = "influxdb_influxql_parser" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "chrono", "chrono-tz", @@ -2929,7 +2919,7 @@ name = "interpreters" version = "1.2.2" dependencies = [ "analytic_engine", - "arrow 38.0.0", + "arrow", "async-trait", "catalog", "catalog_impls", @@ -2962,9 +2952,9 @@ dependencies = [ [[package]] name = "iox_query" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_util", "async-trait", "chrono", @@ -2986,9 +2976,9 @@ dependencies = [ [[package]] name = "iox_query_influxql" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - 
"arrow 38.0.0", + "arrow", "chrono", "chrono-tz", "datafusion", @@ -3470,7 +3460,11 @@ name = "meta_client" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -3579,26 +3573,6 @@ dependencies = [ "twoway", ] -[[package]] -name = "multiversion" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" -dependencies = [ - "multiversion-macros", -] - -[[package]] -name = "multiversion-macros" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "murmur2" version = "0.1.0" @@ -3979,7 +3953,11 @@ version = "1.2.2" dependencies = [ "async-trait", "bytes", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "clru", "common_types", @@ -4043,7 +4021,7 @@ dependencies = [ [[package]] name = "observability_deps" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "tracing", ] @@ -4166,7 +4144,7 @@ checksum = "4cbd51311f8d9ff3d2697b1522b18a588782e097d313a1a278b0faf2ccf2d3f6" dependencies = [ "ahash 0.8.3", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-ipc", @@ -4196,13 +4174,15 @@ dependencies = [ name = "parquet_ext" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", "bytes", "common_util", "datafusion", + "futures 0.3.28", "log", + "object_store 1.2.2", "parquet", "tokio", ] @@ -4574,6 +4554,7 @@ dependencies = [ "jemalloc-sys", "jemallocator", "log", + "pprof 0.11.1", ] [[package]] @@ -4769,12 +4750,16 @@ checksum = "9653c3ed92974e34c5a6e0a510864dab979760481714c172e0a34e437cb98804" name = "proxy" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", "bytes", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -4863,7 +4848,7 @@ dependencies = [ name = "query_engine" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "chrono", "common_types", @@ -4884,10 +4869,14 @@ dependencies = [ name = "query_frontend" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -4914,9 +4903,9 @@ dependencies = [ [[package]] name = "query_functions" version = "0.1.0" -source = 
"git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - "arrow 38.0.0", + "arrow", "chrono", "datafusion", "itertools", @@ -5195,7 +5184,11 @@ version = "1.2.2" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -5321,7 +5314,11 @@ name = "router" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -5337,8 +5334,8 @@ dependencies = [ [[package]] name = "rskafka" -version = "0.3.0" -source = "git+https://github.com/influxdata/rskafka.git?rev=00988a564b1db0249d858065fc110476c075efad#00988a564b1db0249d858065fc110476c075efad" +version = "0.4.0" +source = "git+https://github.com/Rachelint/rskafka.git?rev=f0fd8e278d8164cb0cfca5a80476361fc308ecc3#f0fd8e278d8164cb0cfca5a80476361fc308ecc3" dependencies = [ "async-trait", "bytes", @@ -5521,9 +5518,9 @@ dependencies = [ [[package]] name = "schema" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - "arrow 38.0.0", + "arrow", "hashbrown 0.13.2", "indexmap", "itertools", @@ -5671,12 +5668,16 @@ name = "server" version = "1.2.2" dependencies = [ "analytic_engine", - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", "bytes", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -5694,7 +5695,6 @@ dependencies = [ "opensrv-mysql", "partition_table_engine", "paste 1.0.12", - "pprof 0.11.1", "profile", "prom-remote-api", "prometheus 0.12.0", @@ -6213,10 +6213,14 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" name = "system_catalog" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -6232,10 +6236,14 @@ dependencies = [ name = "table_engine" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "datafusion", @@ -6319,7 +6327,7 @@ dependencies = [ [[package]] name = "test_helpers" version = "0.1.0" -source = 
"git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "dotenvy", "observability_deps", @@ -6892,8 +6900,8 @@ version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ - "cfg-if 1.0.0", - "rand 0.8.5", + "cfg-if 0.1.10", + "rand 0.3.23", "static_assertions 1.1.0", ] @@ -7049,7 +7057,11 @@ name = "wal" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "common_types", "common_util", diff --git a/Cargo.toml b/Cargo.toml index 2f2036df2f..b3e3bcb0ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ bytes = "1.1.0" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = "1.0.4" +ceresdbproto = { git = "https://github.com/tanruixiang/ceresdbproto.git", rev = "6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" } chrono = "0.4" clap = "3.0" clru = "0.6.1" @@ -76,8 +76,8 @@ cluster = { path = "cluster" } criterion = "0.3" common_types = { path = "common_types" } common_util = { path = "common_util" } -datafusion = { git = "https://github.com/jiacai2050/arrow-datafusion.git", rev = "13314c37020b90246db9b80f8294370c06e61018" } -datafusion-proto = { git = "https://github.com/jiacai2050/arrow-datafusion.git", rev = "13314c37020b90246db9b80f8294370c06e61018" } +datafusion = { git = "https://github.com/ceresdb/arrow-datafusion.git", rev = "acb5d97a8a8de5296989740f97db3773fe3aa45a" } +datafusion-proto = { git = "https://github.com/ceresdb/arrow-datafusion.git", rev = "acb5d97a8a8de5296989740f97db3773fe3aa45a" } df_operator = { path = "df_operator" } etcd-client = "0.10.3" env_logger = "0.6" @@ -87,10 +87,10 @@ lazy_static = "1.4.0" log = "0.4" logger = { path = "components/logger" } lru = "0.7.6" -influxql-logical-planner = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "iox_query_influxql" } -influxql-parser = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "influxdb_influxql_parser" } -influxql-query = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "iox_query" } -influxql-schema = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "schema" } +influxql-logical-planner = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "iox_query_influxql" } +influxql-parser = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "influxdb_influxql_parser" } +influxql-query = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "iox_query" } +influxql-schema = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "schema" } interpreters = { path = "interpreters" } itertools = "0.10.5" meta_client = { path = "meta_client" } diff --git a/Dockerfile b/Dockerfile index 142f575408..192b877814 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM rust:${RUST_VERSION}-slim-bullseye as build # 
cache mounts below may already exist and owned by root USER root -RUN apt update && apt install --yes gcc g++ libssl-dev pkg-config cmake protobuf-compiler && rm -rf /var/lib/apt/lists/* +RUN apt update && apt install --yes git gcc g++ libssl-dev pkg-config cmake protobuf-compiler && rm -rf /var/lib/apt/lists/* COPY . /ceresdb WORKDIR /ceresdb diff --git a/analytic_engine/src/compaction/mod.rs b/analytic_engine/src/compaction/mod.rs index bcbead4af9..e0485522b1 100644 --- a/analytic_engine/src/compaction/mod.rs +++ b/analytic_engine/src/compaction/mod.rs @@ -318,13 +318,26 @@ pub struct ExpiredFiles { #[derive(Default, Clone)] pub struct CompactionTask { - pub compaction_inputs: Vec, - pub expired: Vec, + inputs: Vec, + expired: Vec, +} + +impl Drop for CompactionTask { + fn drop(&mut self) { + // When a CompactionTask is dropped, it means + // 1. the task finished successfully, or + // 2. the task is cancelled for some reason, like memory limit + // + // In case 2, we need to mark files as not compacted in order for them to be + // scheduled again. In case 1, the files will be moved out of level controller, + // so it doesn't care what the flag is, so it's safe to set false here. + self.mark_files_being_compacted(false); + } } impl CompactionTask { - pub fn mark_files_being_compacted(&self, being_compacted: bool) { - for input in &self.compaction_inputs { + fn mark_files_being_compacted(&self, being_compacted: bool) { + for input in &self.inputs { for file in &input.files { file.set_being_compacted(being_compacted); } @@ -337,9 +350,10 @@ impl CompactionTask { } // Estimate the size of the total input files. + #[inline] pub fn estimated_total_input_file_size(&self) -> usize { let total_input_size: u64 = self - .compaction_inputs + .inputs .iter() .map(|v| v.files.iter().map(|f| f.size()).sum::()) .sum(); @@ -347,19 +361,65 @@ impl CompactionTask { total_input_size as usize } + #[inline] pub fn num_compact_files(&self) -> usize { - self.compaction_inputs.iter().map(|v| v.files.len()).sum() + self.inputs.iter().map(|v| v.files.len()).sum() } - pub fn num_expired_files(&self) -> usize { - self.expired.iter().map(|v| v.files.len()).sum() + #[inline] + pub fn is_empty(&self) -> bool { + self.is_input_empty() && self.expired.is_empty() + } + + #[inline] + pub fn is_input_empty(&self) -> bool { + self.inputs.is_empty() + } + + #[inline] + pub fn expired(&self) -> &[ExpiredFiles] { + &self.expired + } + + #[inline] + pub fn inputs(&self) -> &[CompactionInputFiles] { + &self.inputs + } +} + +pub struct CompactionTaskBuilder { + expired: Vec, + inputs: Vec, +} + +impl CompactionTaskBuilder { + pub fn with_expired(expired: Vec) -> Self { + Self { + expired, + inputs: Vec::new(), + } + } + + pub fn add_inputs(&mut self, files: CompactionInputFiles) { + self.inputs.push(files); + } + + pub fn build(self) -> CompactionTask { + let task = CompactionTask { + expired: self.expired, + inputs: self.inputs, + }; + + task.mark_files_being_compacted(true); + + task } } impl fmt::Debug for CompactionTask { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("CompactionTask") - .field("inputs", &self.compaction_inputs) + .field("inputs", &self.inputs) .field( "expired", &self @@ -380,36 +440,12 @@ impl fmt::Debug for CompactionTask { } } -pub struct PickerManager { - default_picker: CompactionPickerRef, - time_window_picker: CompactionPickerRef, - size_tiered_picker: CompactionPickerRef, -} - -impl Default for PickerManager { - fn default() -> Self { - let size_tiered_picker = 
Arc::new(CommonCompactionPicker::new( - CompactionStrategy::SizeTiered(SizeTieredCompactionOptions::default()), - )); - let time_window_picker = Arc::new(CommonCompactionPicker::new( - CompactionStrategy::TimeWindow(TimeWindowCompactionOptions::default()), - )); - - Self { - default_picker: time_window_picker.clone(), - size_tiered_picker, - time_window_picker, - } - } -} +#[derive(Default)] +pub struct PickerManager; impl PickerManager { pub fn get_picker(&self, strategy: CompactionStrategy) -> CompactionPickerRef { - match strategy { - CompactionStrategy::Default => self.default_picker.clone(), - CompactionStrategy::SizeTiered(_) => self.size_tiered_picker.clone(), - CompactionStrategy::TimeWindow(_) => self.time_window_picker.clone(), - } + Arc::new(CommonCompactionPicker::new(strategy)) } } diff --git a/analytic_engine/src/compaction/picker.rs b/analytic_engine/src/compaction/picker.rs index 96600199f0..e104aca7d2 100644 --- a/analytic_engine/src/compaction/picker.rs +++ b/analytic_engine/src/compaction/picker.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Compaction picker. @@ -15,8 +15,8 @@ use snafu::Snafu; use crate::{ compaction::{ - CompactionInputFiles, CompactionStrategy, CompactionTask, SizeTieredCompactionOptions, - TimeWindowCompactionOptions, + CompactionInputFiles, CompactionStrategy, CompactionTask, CompactionTaskBuilder, + SizeTieredCompactionOptions, TimeWindowCompactionOptions, }, sst::{ file::{FileHandle, Level}, @@ -60,7 +60,7 @@ pub trait CompactionPicker { fn pick_compaction( &self, ctx: PickerContext, - levels_controller: &LevelsController, + levels_controller: &mut LevelsController, ) -> Result; } @@ -86,10 +86,10 @@ pub struct CommonCompactionPicker { impl CommonCompactionPicker { pub fn new(strategy: CompactionStrategy) -> Self { let level_picker: LevelPickerRef = match strategy { - CompactionStrategy::SizeTiered(_) | CompactionStrategy::Default => { - Arc::new(SizeTieredPicker::default()) + CompactionStrategy::SizeTiered(_) => Arc::new(SizeTieredPicker::default()), + CompactionStrategy::TimeWindow(_) | CompactionStrategy::Default => { + Arc::new(TimeWindowPicker::default()) } - CompactionStrategy::TimeWindow(_) => Arc::new(TimeWindowPicker::default()), }; Self { level_picker } } @@ -123,13 +123,11 @@ impl CompactionPicker for CommonCompactionPicker { fn pick_compaction( &self, ctx: PickerContext, - levels_controller: &LevelsController, + levels_controller: &mut LevelsController, ) -> Result { let expire_time = ctx.ttl.map(Timestamp::expire_time); - let mut compaction_task = CompactionTask { - expired: levels_controller.expired_ssts(expire_time), - ..Default::default() - }; + let mut builder = + CompactionTaskBuilder::with_expired(levels_controller.expired_ssts(expire_time)); if let Some(input_files) = self.pick_compact_candidates(&ctx, levels_controller, expire_time) @@ -139,10 +137,10 @@ impl CompactionPicker for CommonCompactionPicker { ctx.strategy, input_files ); - compaction_task.compaction_inputs = vec![input_files]; + builder.add_inputs(input_files); } - Ok(compaction_task) + Ok(builder.build()) } } @@ -734,39 +732,39 @@ mod tests { }; let now = Timestamp::now(); { - let lc = build_old_bucket_case(now.as_i64()); - let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); - assert_eq!(task.compaction_inputs[0].files.len(), 2); - assert_eq!(task.compaction_inputs[0].files[0].id(), 0); - 
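// A minimal standalone sketch, with stand-in enums rather than the engine's
// types, of the picker selection after the PickerManager change above: no
// pickers are cached any more, one is built per call, and the Default
// strategy now resolves to the time-window picker instead of the size-tiered one.
#[allow(dead_code)]
#[derive(Clone, Copy)]
enum CompactionStrategy {
    Default,
    SizeTiered,
    TimeWindow,
}

#[derive(Debug, PartialEq)]
enum PickerKind {
    SizeTiered,
    TimeWindow,
}

fn pick_for(strategy: CompactionStrategy) -> PickerKind {
    // Mirrors CommonCompactionPicker::new: SizeTiered keeps its picker, while
    // TimeWindow and Default both map to the time-window picker.
    match strategy {
        CompactionStrategy::SizeTiered => PickerKind::SizeTiered,
        CompactionStrategy::TimeWindow | CompactionStrategy::Default => PickerKind::TimeWindow,
    }
}

fn main() {
    assert_eq!(pick_for(CompactionStrategy::Default), PickerKind::TimeWindow);
}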
assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + let mut lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &mut lc).unwrap(); + assert_eq!(task.inputs[0].files.len(), 2); + assert_eq!(task.inputs[0].files[0].id(), 0); + assert_eq!(task.inputs[0].files[1].id(), 1); assert_eq!(task.expired[0].files.len(), 1); assert_eq!(task.expired[0].files[0].id(), 3); } { - let lc = build_newest_bucket_case(now.as_i64()); - let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); - assert_eq!(task.compaction_inputs[0].files.len(), 4); - assert_eq!(task.compaction_inputs[0].files[0].id(), 2); - assert_eq!(task.compaction_inputs[0].files[1].id(), 3); - assert_eq!(task.compaction_inputs[0].files[2].id(), 4); - assert_eq!(task.compaction_inputs[0].files[3].id(), 5); + let mut lc = build_newest_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &mut lc).unwrap(); + assert_eq!(task.inputs[0].files.len(), 4); + assert_eq!(task.inputs[0].files[0].id(), 2); + assert_eq!(task.inputs[0].files[1].id(), 3); + assert_eq!(task.inputs[0].files[2].id(), 4); + assert_eq!(task.inputs[0].files[3].id(), 5); } { - let lc = build_newest_bucket_no_match_case(now.as_i64()); - let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); - assert_eq!(task.compaction_inputs.len(), 0); + let mut lc = build_newest_bucket_no_match_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &mut lc).unwrap(); + assert_eq!(task.inputs.len(), 0); } // If ttl is None, then no file is expired. ctx.ttl = None; { - let lc = build_old_bucket_case(now.as_i64()); - let task = twp.pick_compaction(ctx, &lc).unwrap(); - assert_eq!(task.compaction_inputs[0].files.len(), 2); - assert_eq!(task.compaction_inputs[0].files[0].id(), 0); - assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + let mut lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx, &mut lc).unwrap(); + assert_eq!(task.inputs[0].files.len(), 2); + assert_eq!(task.inputs[0].files[0].id(), 0); + assert_eq!(task.inputs[0].files[1].id(), 1); assert!(task.expired[0].files.is_empty()); } } diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs index 30cf277521..620bfa83ed 100644 --- a/analytic_engine/src/compaction/scheduler.rs +++ b/analytic_engine/src/compaction/scheduler.rs @@ -237,7 +237,7 @@ impl OngoingTaskLimit { if dropped > 0 { warn!( - "Too many compaction pending tasks, limit: {}, dropped {} older tasks.", + "Too many compaction pending tasks, limit:{}, dropped:{}.", self.max_pending_compaction_tasks, dropped, ); } @@ -428,12 +428,11 @@ impl ScheduleWorker { let ongoing = self.limit.ongoing_tasks(); match schedule_task { ScheduleTask::Request(compact_req) => { - debug!("Ongoing compaction tasks:{}", ongoing); + debug!("Ongoing compaction tasks:{ongoing}"); if ongoing >= self.max_ongoing_tasks { self.limit.add_request(compact_req); warn!( - "Too many compaction ongoing tasks:{}, max:{}, buf_len:{}", - ongoing, + "Too many compaction ongoing tasks:{ongoing}, max:{}, buf_len:{}", self.max_ongoing_tasks, self.limit.request_buf_len() ); @@ -448,7 +447,13 @@ impl ScheduleWorker { for compact_req in pending { self.handle_table_compaction_request(compact_req).await; } - debug!("Scheduled {} pending compaction tasks.", len); + debug!("Scheduled {len} pending compaction tasks."); + } else { + warn!( + "Too many compaction ongoing tasks:{ongoing}, max:{}, buf_len:{}", + self.max_ongoing_tasks, + self.limit.request_buf_len() + ); } } 
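// A minimal standalone sketch of the file-claiming pattern introduced in
// compaction/mod.rs above; FileHandle here is just an AtomicBool wrapper and
// the CompactionTaskBuilder is collapsed into an associated fn. build() claims
// the input files, and Drop releases them again, so a task cancelled before it
// runs (for example by the memory limiter, as in the scheduler code below)
// automatically returns its files to the schedulable pool without any manual unmarking.
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

#[derive(Clone, Default)]
struct FileHandle(Arc<AtomicBool>);

impl FileHandle {
    fn set_being_compacted(&self, v: bool) {
        self.0.store(v, Ordering::Relaxed);
    }

    fn being_compacted(&self) -> bool {
        self.0.load(Ordering::Relaxed)
    }
}

struct CompactionTask {
    inputs: Vec<FileHandle>,
}

impl CompactionTask {
    // Building a task marks every input file as being compacted.
    fn build(inputs: Vec<FileHandle>) -> Self {
        for file in &inputs {
            file.set_being_compacted(true);
        }
        Self { inputs }
    }
}

// Dropping a task, whether it finished or was cancelled, clears the flag so
// the files can be picked again by a later schedule round.
impl Drop for CompactionTask {
    fn drop(&mut self) {
        for file in &self.inputs {
            file.set_being_compacted(false);
        }
    }
}

fn main() {
    let file = FileHandle::default();
    {
        let _task = CompactionTask::build(vec![file.clone()]);
        assert!(file.being_compacted());
    } // task dropped here, e.g. cancelled before it ever ran
    assert!(!file.being_compacted());
}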
ScheduleTask::Exit => (), @@ -462,10 +467,7 @@ impl ScheduleWorker { waiter_notifier: WaiterNotifier, token: MemoryUsageToken, ) { - // Mark files being in compaction. - compaction_task.mark_files_being_compacted(true); - - let keep_scheduling_compaction = !compaction_task.compaction_inputs.is_empty(); + let keep_scheduling_compaction = !compaction_task.is_input_empty(); let runtime = self.runtime.clone(); let space_store = self.space_store.clone(); @@ -503,9 +505,6 @@ impl ScheduleWorker { .await; if let Err(e) = &res { - // Compaction is failed, we need to unset the compaction mark. - compaction_task.mark_files_being_compacted(false); - error!( "Failed to compact table, table_name:{}, table_id:{}, request_id:{}, err:{}", table_data.name, table_data.id, request_id, e diff --git a/analytic_engine/src/instance/engine.rs b/analytic_engine/src/instance/engine.rs index 00b6ba8745..e32fa064d5 100644 --- a/analytic_engine/src/instance/engine.rs +++ b/analytic_engine/src/instance/engine.rs @@ -218,6 +218,18 @@ pub enum Error { #[snafu(display("Failed to open shard, msg:{}.\nBacktrace:\n{}", msg, backtrace))] OpenTablesOfShard { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to replay wal, msg:{:?}, err:{}", msg, source))] + ReplayWalWithCause { + msg: Option, + source: GenericError, + }, + + #[snafu(display("Failed to replay wal, msg:{:?}.\nBacktrace:\n{}", msg, backtrace))] + ReplayWalNoCause { + msg: Option, + backtrace: Backtrace, + }, } define_result!(Error); @@ -250,7 +262,9 @@ impl From for table_engine::engine::Error { | Error::DoManifestSnapshot { .. } | Error::OpenManifest { .. } | Error::TableNotExist { .. } - | Error::OpenTablesOfShard { .. } => Self::Unexpected { + | Error::OpenTablesOfShard { .. } + | Error::ReplayWalNoCause { .. } + | Error::ReplayWalWithCause { .. 
} => Self::Unexpected { source: Box::new(err), }, } diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs index 0dc4c6208c..0d807349dd 100644 --- a/analytic_engine/src/instance/flush_compaction.rs +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -130,6 +130,18 @@ pub enum Error { #[snafu(display("Other failure, msg:{}.\nBacktrace:\n{:?}", msg, backtrace))] Other { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to run flush job, msg:{:?}, err:{}", msg, source))] + FlushJobWithCause { + msg: Option, + source: GenericError, + }, + + #[snafu(display("Failed to run flush job, msg:{:?}.\nBacktrace:\n{}", msg, backtrace))] + FlushJobNoCause { + msg: Option, + backtrace: Backtrace, + }, } define_result!(Error); @@ -163,6 +175,7 @@ pub struct TableFlushRequest { pub max_sequence: SequenceNumber, } +#[derive(Clone)] pub struct Flusher { pub space_store: SpaceStoreRef, @@ -173,8 +186,6 @@ pub struct Flusher { struct FlushTask { space_store: SpaceStoreRef, table_data: TableDataRef, - max_sequence: SequenceNumber, - runtime: RuntimeRef, write_sst_max_buffer_size: usize, } @@ -192,9 +203,7 @@ impl Flusher { table_data, opts ); - let flush_req = self.preprocess_flush(table_data).await?; - - self.schedule_table_flush(flush_scheduler, flush_req, opts, false) + self.schedule_table_flush(flush_scheduler, table_data.clone(), opts, false) .await } @@ -210,74 +219,20 @@ impl Flusher { table_data, opts ); - let flush_req = self.preprocess_flush(table_data).await?; - - self.schedule_table_flush(flush_scheduler, flush_req, opts, true) + self.schedule_table_flush(flush_scheduler, table_data.clone(), opts, true) .await } - async fn preprocess_flush(&self, table_data: &TableDataRef) -> Result { - let current_version = table_data.current_version(); - let last_sequence = table_data.last_sequence(); - // Switch (freeze) all mutable memtables. And update segment duration if - // suggestion is returned. - if let Some(suggest_segment_duration) = - current_version.switch_memtables_or_suggest_duration() - { - info!( - "Update segment duration, table:{}, table_id:{}, segment_duration:{:?}", - table_data.name, table_data.id, suggest_segment_duration - ); - assert!(!suggest_segment_duration.is_zero()); - - let mut new_table_opts = (*table_data.table_options()).clone(); - new_table_opts.segment_duration = Some(ReadableDuration(suggest_segment_duration)); - - let edit_req = { - let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { - space_id: table_data.space_id, - table_id: table_data.id, - options: new_table_opts.clone(), - }); - MetaEditRequest { - shard_info: table_data.shard_info, - meta_edit: MetaEdit::Update(meta_update), - } - }; - self.space_store - .manifest - .apply_edit(edit_req) - .await - .context(StoreVersionEdit)?; - - // Now the segment duration is applied, we can stop sampling and freeze the - // sampling memtable. 
- current_version.freeze_sampling(); - } - - info!("Try to trigger memtable flush of table, table:{}, table_id:{}, max_memtable_id:{}, last_sequence:{}", - table_data.name, table_data.id, table_data.last_memtable_id(), last_sequence); - - // Try to flush all memtables of current table - Ok(TableFlushRequest { - table_data: table_data.clone(), - max_sequence: last_sequence, - }) - } - /// Schedule table flush request to background workers async fn schedule_table_flush( &self, flush_scheduler: &mut TableFlushScheduler, - flush_req: TableFlushRequest, + table_data: TableDataRef, opts: TableFlushOptions, block_on: bool, ) -> Result<()> { - let table_data = flush_req.table_data.clone(); - let flush_task = FlushTask { table_data: table_data.clone(), - max_sequence: flush_req.max_sequence, space_store: self.space_store.clone(), runtime: self.runtime.clone(), write_sst_max_buffer_size: self.write_sst_max_buffer_size, @@ -295,23 +250,29 @@ impl FlushTask { /// should be ensured by the caller. async fn run(&self) -> Result<()> { let instant = Instant::now(); + let flush_req = self.preprocess_flush(&self.table_data).await?; + let current_version = self.table_data.current_version(); - let mems_to_flush = current_version.pick_memtables_to_flush(self.max_sequence); + let mems_to_flush = current_version.pick_memtables_to_flush(flush_req.max_sequence); if mems_to_flush.is_empty() { return Ok(()); } let request_id = RequestId::next_id(); - info!( - "Instance try to flush memtables, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}", - self.table_data.name, self.table_data.id, request_id, mems_to_flush - ); // Start flush duration timer. let local_metrics = self.table_data.metrics.local_flush_metrics(); let _timer = local_metrics.start_flush_timer(); - self.dump_memtables(request_id, &mems_to_flush).await?; + self.dump_memtables(request_id, &mems_to_flush) + .await + .box_err() + .context(FlushJobWithCause { + msg: Some(format!( + "table:{}, table_id:{}, request_id:{request_id}", + self.table_data.name, self.table_data.id + )), + })?; self.table_data .set_last_flush_time(time::current_time_millis()); @@ -327,6 +288,57 @@ impl FlushTask { Ok(()) } + async fn preprocess_flush(&self, table_data: &TableDataRef) -> Result { + let current_version = table_data.current_version(); + let mut last_sequence = table_data.last_sequence(); + // Switch (freeze) all mutable memtables. And update segment duration if + // suggestion is returned. + if let Some(suggest_segment_duration) = current_version.suggest_duration() { + info!( + "Update segment duration, table:{}, table_id:{}, segment_duration:{:?}", + table_data.name, table_data.id, suggest_segment_duration + ); + assert!(!suggest_segment_duration.is_zero()); + + let mut new_table_opts = (*table_data.table_options()).clone(); + new_table_opts.segment_duration = Some(ReadableDuration(suggest_segment_duration)); + + let edit_req = { + let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: table_data.space_id, + table_id: table_data.id, + options: new_table_opts.clone(), + }); + MetaEditRequest { + shard_info: table_data.shard_info, + meta_edit: MetaEdit::Update(meta_update), + } + }; + self.space_store + .manifest + .apply_edit(edit_req) + .await + .context(StoreVersionEdit)?; + + // Now the segment duration is applied, we can stop sampling and freeze the + // sampling memtable. 
+ if let Some(seq) = current_version.freeze_sampling_memtable() { + last_sequence = seq.max(last_sequence); + } + } else if let Some(seq) = current_version.switch_memtables() { + last_sequence = seq.max(last_sequence); + } + + info!("Try to trigger memtable flush of table, table:{}, table_id:{}, max_memtable_id:{}, last_sequence:{last_sequence}", + table_data.name, table_data.id, table_data.last_memtable_id()); + + // Try to flush all memtables of current table + Ok(TableFlushRequest { + table_data: table_data.clone(), + max_sequence: last_sequence, + }) + } + /// This will write picked memtables [FlushableMemTables] to level 0 sst /// files. Sampling memtable may be dumped into multiple sst file according /// to the sampled segment duration. @@ -648,22 +660,23 @@ impl SpaceStore { "Begin compact table, table_name:{}, id:{}, task:{:?}", table_data.name, table_data.id, task ); + let inputs = task.inputs(); let mut edit_meta = VersionEditMeta { space_id: table_data.space_id, table_id: table_data.id, flushed_sequence: 0, // Use the number of compaction inputs as the estimated number of files to add. - files_to_add: Vec::with_capacity(task.compaction_inputs.len()), + files_to_add: Vec::with_capacity(inputs.len()), files_to_delete: vec![], mems_to_remove: vec![], }; - if task.num_expired_files() == 0 && task.num_compact_files() == 0 { + if task.is_empty() { // Nothing to compact. return Ok(()); } - for files in &task.expired { + for files in task.expired() { self.delete_expired_files(table_data, request_id, files, &mut edit_meta); } @@ -675,7 +688,7 @@ impl SpaceStore { task.num_compact_files(), ); - for input in &task.compaction_inputs { + for input in inputs { self.compact_input_files( request_id, table_data, diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs index 89a71ba2d6..1faf254f08 100644 --- a/analytic_engine/src/instance/mod.rs +++ b/analytic_engine/src/instance/mod.rs @@ -15,6 +15,7 @@ pub(crate) mod mem_collector; pub mod open; mod read; pub(crate) mod serial_executor; +pub mod wal_replayer; pub(crate) mod write; use std::sync::Arc; @@ -44,7 +45,7 @@ use crate::{ meta_data::cache::MetaCacheRef, }, table::data::{TableDataRef, TableShardInfo}, - TableOptions, + RecoverMode, TableOptions, }; #[allow(clippy::enum_variant_names)] @@ -159,6 +160,7 @@ pub struct Instance { /// Options for scanning sst pub(crate) scan_options: ScanOptions, pub(crate) iter_options: Option, + pub(crate) recover_mode: RecoverMode, } impl Instance { diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs index 6c4d178fe2..cf11b05b1d 100644 --- a/analytic_engine/src/instance/open.rs +++ b/analytic_engine/src/instance/open.rs @@ -3,19 +3,16 @@ //! 
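// A minimal sketch, with u64 sequence numbers and plain Options standing in
// for the engine's version-set calls, of how the reworked preprocess_flush
// above derives the flush boundary: freezing the sampling memtable (when a
// segment duration was just suggested) or switching the mutable memtables may
// report the sequence they were frozen at, and the flush request takes the
// larger of that and the table's last recorded sequence.
fn flush_max_sequence(
    table_last_sequence: u64,
    frozen_sampling_seq: Option<u64>, // freeze_sampling_memtable()
    switched_seq: Option<u64>,        // switch_memtables()
    segment_duration_suggested: bool, // suggest_duration().is_some()
) -> u64 {
    let mut last_sequence = table_last_sequence;
    if segment_duration_suggested {
        if let Some(seq) = frozen_sampling_seq {
            last_sequence = seq.max(last_sequence);
        }
    } else if let Some(seq) = switched_seq {
        last_sequence = seq.max(last_sequence);
    }
    last_sequence
}

fn main() {
    // A frozen sampling memtable ahead of the recorded sequence widens the flush range.
    assert_eq!(flush_max_sequence(10, Some(12), None, true), 12);
    // Otherwise the switched-memtable sequence never shrinks the range below last_sequence.
    assert_eq!(flush_max_sequence(10, None, Some(8), false), 10);
}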
Open logic of instance use std::{ - collections::{HashMap, VecDeque}, + collections::HashMap, sync::{Arc, RwLock}, }; -use common_types::{schema::IndexInWriterSchema, table::ShardId}; -use log::{debug, error, info, trace}; +use common_types::table::ShardId; +use log::{error, info}; use object_store::ObjectStoreRef; use snafu::ResultExt; use table_engine::{engine::TableDef, table::TableId}; -use wal::{ - log_batch::LogEntry, - manager::{ReadBoundary, ReadContext, ReadRequest, WalManager, WalManagerRef}, -}; +use wal::manager::WalManagerRef; use super::{engine::OpenTablesOfShard, flush_compaction::Flusher}; use crate::{ @@ -23,16 +20,12 @@ use crate::{ context::OpenContext, engine, instance::{ - self, - engine::{ApplyMemTable, FlushTable, OpenManifest, ReadMetaUpdate, ReadWal, Result}, - flush_compaction::TableFlushOptions, + engine::{OpenManifest, ReadMetaUpdate, Result}, mem_collector::MemUsageCollector, - serial_executor::TableOpSerialExecutor, - write::MemTableWriter, + wal_replayer::{ReplayMode, WalReplayer}, Instance, SpaceStore, }, manifest::{details::ManifestImpl, LoadRequest, Manifest, ManifestRef}, - payload::{ReadPayload, WalDecoder}, row_iter::IterOptions, space::{SpaceAndTable, SpaceRef, Spaces}, sst::{ @@ -41,6 +34,7 @@ use crate::{ }, table::data::TableDataRef, table_meta_set_impl::TableMetaSetImpl, + RecoverMode, }; const MAX_RECORD_BATCHES_IN_FLIGHT_WHEN_COMPACTION_READ: usize = 64; @@ -133,6 +127,7 @@ impl Instance { .map(|v| v.as_byte() as usize), iter_options, scan_options, + recover_mode: ctx.config.recover_mode, }); Ok(instance) @@ -150,6 +145,7 @@ impl Instance { self.replay_batch_size, self.make_flusher(), self.max_retry_flush_limit, + self.recover_mode, )?; shard_opener.open().await @@ -197,10 +193,11 @@ struct ShardOpener { shard_id: ShardId, manifest: ManifestRef, wal_manager: WalManagerRef, - states: HashMap, + stages: HashMap, wal_replay_batch_size: usize, flusher: Flusher, max_retry_flush_limit: usize, + recover_mode: RecoverMode, } impl ShardOpener { @@ -211,8 +208,9 @@ impl ShardOpener { wal_replay_batch_size: usize, flusher: Flusher, max_retry_flush_limit: usize, + recover_mode: RecoverMode, ) -> Result { - let mut states = HashMap::with_capacity(shard_context.table_ctxs.len()); + let mut stages = HashMap::with_capacity(shard_context.table_ctxs.len()); for table_ctx in shard_context.table_ctxs { let space = &table_ctx.space; let table_id = table_ctx.table_def.id; @@ -226,17 +224,18 @@ impl ShardOpener { space: table_ctx.space, }) }; - states.insert(table_id, state); + stages.insert(table_id, state); } Ok(Self { shard_id: shard_context.shard_id, manifest, wal_manager, - states, + stages, wal_replay_batch_size, flusher, max_retry_flush_limit, + recover_mode, }) } @@ -248,9 +247,9 @@ impl ShardOpener { self.recover_table_datas().await?; // Retrieve the table results and return. - let states = std::mem::take(&mut self.states); - let mut table_results = HashMap::with_capacity(states.len()); - for (table_id, state) in states { + let stages = std::mem::take(&mut self.stages); + let mut table_results = HashMap::with_capacity(stages.len()); + for (table_id, state) in stages { match state { TableOpenStage::Failed(e) => { table_results.insert(table_id, Err(e)); @@ -274,7 +273,12 @@ impl ShardOpener { /// Recover table meta data from manifest based on shard. 
async fn recover_table_metas(&mut self) -> Result<()> { - for (table_id, state) in self.states.iter_mut() { + info!( + "ShardOpener recover table metas begin, shard_id:{}", + self.shard_id + ); + + for (table_id, state) in self.stages.iter_mut() { match state { // Only do the meta recovery work in `RecoverTableMeta` state. TableOpenStage::RecoverTableMeta(ctx) => { @@ -289,7 +293,10 @@ impl ShardOpener { let table_data = ctx.space.find_table_by_id(*table_id); Ok(table_data.map(|data| (data, ctx.space.clone()))) } - Err(e) => Err(e), + Err(e) => { + error!("ShardOpener recover single table meta failed, table:{:?}, shard_id:{}, err:{e}", ctx.table_def, self.shard_id); + Err(e) + } }; match result { @@ -314,55 +321,88 @@ impl ShardOpener { } } + info!( + "ShardOpener recover table metas finish, shard_id:{}", + self.shard_id + ); Ok(()) } /// Recover table data based on shard. async fn recover_table_datas(&mut self) -> Result<()> { - for state in self.states.values_mut() { - match state { + info!( + "ShardOpener recover table datas begin, shard_id:{}", + self.shard_id + ); + + // Replay wal logs of tables. + let mut replay_table_datas = Vec::with_capacity(self.stages.len()); + for (table_id, stage) in self.stages.iter_mut() { + match stage { // Only do the wal recovery work in `RecoverTableData` state. TableOpenStage::RecoverTableData(ctx) => { - let table_data = ctx.table_data.clone(); - let read_ctx = ReadContext { - batch_size: self.wal_replay_batch_size, - ..Default::default() - }; - - let result = match Self::recover_single_table_data( - &self.flusher, - self.max_retry_flush_limit, - self.wal_manager.as_ref(), - table_data.clone(), - self.wal_replay_batch_size, - &read_ctx, - ) - .await - { - Ok(()) => Ok((table_data, ctx.space.clone())), - Err(e) => Err(e), - }; - - match result { - Ok((table_data, space)) => { - *state = TableOpenStage::Success(Some(SpaceAndTable::new( - space, table_data, - ))); - } - Err(e) => *state = TableOpenStage::Failed(e), - } + replay_table_datas.push(ctx.table_data.clone()); } // Table was found opened, or failed in meta recovery stage. TableOpenStage::Failed(_) | TableOpenStage::Success(_) => {} TableOpenStage::RecoverTableMeta(_) => { return OpenTablesOfShard { - msg: format!("unexpected table state:{state:?}"), + msg: format!( + "unexpected stage, stage:{stage:?}, table_id:{table_id}, shard_id:{}", + self.shard_id + ), } - .fail() + .fail(); } } } + let replay_mode = match self.recover_mode { + RecoverMode::TableBased => ReplayMode::TableBased, + RecoverMode::ShardBased => ReplayMode::RegionBased, + }; + let mut wal_replayer = WalReplayer::new( + &replay_table_datas, + self.shard_id, + self.wal_manager.clone(), + self.wal_replay_batch_size, + self.flusher.clone(), + self.max_retry_flush_limit, + replay_mode, + ); + let mut table_results = wal_replayer.replay().await?; + + // Process the replay results. + for table_data in replay_table_datas { + let table_id = table_data.id; + // Each `table_data` has its related `stage` in `stages`, impossible to panic + // here. 
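// A minimal sketch, with stand-in enums and simplified payloads, of the
// per-table state machine ShardOpener walks above and of the replay-strategy
// choice: every table moves RecoverTableMeta -> RecoverTableData ->
// Success/Failed, a failure stays scoped to its own table, and the WAL replay
// mode is chosen once per shard from the configured RecoverMode, where
// shard-based recovery replays the whole WAL region in one pass instead of
// one table at a time.
#[allow(dead_code)]
#[derive(Debug)]
enum TableOpenStage {
    RecoverTableMeta,
    RecoverTableData,
    Success,
    Failed(String),
}

#[allow(dead_code)]
#[derive(Clone, Copy)]
enum RecoverMode {
    TableBased,
    ShardBased,
}

#[derive(Debug, PartialEq)]
enum ReplayMode {
    TableBased,
    RegionBased,
}

fn replay_mode_for(mode: RecoverMode) -> ReplayMode {
    match mode {
        RecoverMode::TableBased => ReplayMode::TableBased,
        RecoverMode::ShardBased => ReplayMode::RegionBased,
    }
}

fn main() {
    assert_eq!(replay_mode_for(RecoverMode::ShardBased), ReplayMode::RegionBased);
    let failed = TableOpenStage::Failed("replay error for this table only".to_string());
    println!("a single table can end up as {failed:?} while the rest of the shard opens");
}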
+ let stage = self.stages.get_mut(&table_id).unwrap(); + let failed_table_opt = table_results.remove(&table_id); + + match (&stage, failed_table_opt) { + (TableOpenStage::RecoverTableData(ctx), None) => { + let space_table = SpaceAndTable::new(ctx.space.clone(), ctx.table_data.clone()); + *stage = TableOpenStage::Success(Some(space_table)); + } + + (TableOpenStage::RecoverTableData(_), Some(e)) => { + error!("ShardOpener replay wals of single table failed, table:{}, table_id:{}, shard_id:{}, err:{e}", table_data.name, table_data.id, self.shard_id); + *stage = TableOpenStage::Failed(e); + } + + (other_stage, _) => { + return OpenTablesOfShard { + msg: format!("unexpected stage, stage:{other_stage:?}, table_id:{table_id}, shard_id:{}", self.shard_id), + }.fail(); + } + } + } + + info!( + "ShardOpener recover table datas finish, shard_id:{}", + self.shard_id + ); Ok(()) } @@ -398,171 +438,4 @@ impl ShardOpener { Ok(()) } - - /// Recover table data from wal. - /// - /// Called by write worker - pub(crate) async fn recover_single_table_data( - flusher: &Flusher, - max_retry_flush_limit: usize, - wal_manager: &dyn WalManager, - table_data: TableDataRef, - replay_batch_size: usize, - read_ctx: &ReadContext, - ) -> Result<()> { - debug!( - "Instance recover table from wal, replay batch size:{}, table id:{}, shard info:{:?}", - replay_batch_size, table_data.id, table_data.shard_info - ); - - let table_location = table_data.table_location(); - let wal_location = - instance::create_wal_location(table_location.id, table_location.shard_info); - let read_req = ReadRequest { - location: wal_location, - start: ReadBoundary::Excluded(table_data.current_version().flushed_sequence()), - end: ReadBoundary::Max, - }; - - // Read all wal of current table. - let mut log_iter = wal_manager - .read_batch(read_ctx, &read_req) - .await - .context(ReadWal)?; - - let mut serial_exec = table_data.serial_exec.lock().await; - let mut log_entry_buf = VecDeque::with_capacity(replay_batch_size); - loop { - // fetch entries to log_entry_buf - let decoder = WalDecoder::default(); - log_entry_buf = log_iter - .next_log_entries(decoder, log_entry_buf) - .await - .context(ReadWal)?; - - // Replay all log entries of current table - Self::replay_table_log_entries( - flusher, - max_retry_flush_limit, - &mut serial_exec, - &table_data, - &log_entry_buf, - ) - .await?; - - // No more entries. - if log_entry_buf.is_empty() { - break; - } - } - - Ok(()) - } - - /// Replay all log entries into memtable and flush if necessary. 
- async fn replay_table_log_entries( - flusher: &Flusher, - max_retry_flush_limit: usize, - serial_exec: &mut TableOpSerialExecutor, - table_data: &TableDataRef, - log_entries: &VecDeque>, - ) -> Result<()> { - if log_entries.is_empty() { - info!( - "Instance replay an empty table log entries, table:{}, table_id:{:?}", - table_data.name, table_data.id - ); - - // No data in wal - return Ok(()); - } - - let last_sequence = log_entries.back().unwrap().sequence; - - debug!( - "Instance replay table log entries begin, table:{}, table_id:{:?}, sequence:{}", - table_data.name, table_data.id, last_sequence - ); - - for log_entry in log_entries { - let (sequence, payload) = (log_entry.sequence, &log_entry.payload); - - // Apply to memtable - match payload { - ReadPayload::Write { row_group } => { - trace!( - "Instance replay row_group, table:{}, row_group:{:?}", - table_data.name, - row_group - ); - - let table_schema_version = table_data.schema_version(); - if table_schema_version != row_group.schema().version() { - // Data with old schema should already been flushed, but we avoid panic - // here. - error!( - "Ignore data with mismatch schema version during replaying, \ - table:{}, \ - table_id:{:?}, \ - expect:{}, \ - actual:{}, \ - last_sequence:{}, \ - sequence:{}", - table_data.name, - table_data.id, - table_schema_version, - row_group.schema().version(), - last_sequence, - sequence, - ); - - continue; - } - - let index_in_writer = - IndexInWriterSchema::for_same_schema(row_group.schema().num_columns()); - let memtable_writer = MemTableWriter::new(table_data.clone(), serial_exec); - memtable_writer - .write(sequence, &row_group.into(), index_in_writer) - .context(ApplyMemTable { - space_id: table_data.space_id, - table: &table_data.name, - table_id: table_data.id, - })?; - - // Flush the table if necessary. - if table_data.should_flush_table(serial_exec) { - let opts = TableFlushOptions { - res_sender: None, - max_retry_flush_limit, - }; - let flush_scheduler = serial_exec.flush_scheduler(); - flusher - .schedule_flush(flush_scheduler, table_data, opts) - .await - .context(FlushTable { - space_id: table_data.space_id, - table: &table_data.name, - table_id: table_data.id, - })?; - } - } - ReadPayload::AlterSchema { .. } | ReadPayload::AlterOptions { .. } => { - // Ignore records except Data. - // - // - DDL (AlterSchema and AlterOptions) should be recovered - // from Manifest on start. 
- } - } - } - - debug!( - "Instance replay table log entries end, table:{}, table_id:{:?}, last_sequence:{}", - table_data.name, table_data.id, last_sequence - ); - - table_data.set_last_sequence(last_sequence); - - Ok(()) - } } diff --git a/analytic_engine/src/instance/serial_executor.rs b/analytic_engine/src/instance/serial_executor.rs index 0e48ce5f18..4a84404e2c 100644 --- a/analytic_engine/src/instance/serial_executor.rs +++ b/analytic_engine/src/instance/serial_executor.rs @@ -154,7 +154,11 @@ impl TableFlushScheduler { *flush_state = FlushState::Flushing; break; } - FlushState::Flushing => (), + FlushState::Flushing => { + if !block_on_write_thread { + return Ok(()); + } + } FlushState::Failed { err_msg } => { if self .schedule_sync @@ -223,6 +227,8 @@ fn on_flush_finished(schedule_sync: ScheduleSyncRef, res: &Result<()>) { *flush_state = FlushState::Ready; } Err(e) => { + error!("Failed to run flush task, err:{e}"); + schedule_sync.inc_flush_failure_count(); let err_msg = e.to_string(); *flush_state = FlushState::Failed { err_msg }; diff --git a/analytic_engine/src/instance/wal_replayer.rs b/analytic_engine/src/instance/wal_replayer.rs new file mode 100644 index 0000000000..c1e6fd96b9 --- /dev/null +++ b/analytic_engine/src/instance/wal_replayer.rs @@ -0,0 +1,618 @@ +// Copyright 2023 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Wal replayer + +use std::{ + collections::{HashMap, VecDeque}, + fmt::Display, + ops::Range, +}; + +use async_trait::async_trait; +use common_types::{schema::IndexInWriterSchema, table::ShardId}; +use common_util::error::BoxError; +use lazy_static::lazy_static; +use log::{debug, error, info, trace}; +use prometheus::{exponential_buckets, register_histogram, Histogram}; +use snafu::ResultExt; +use table_engine::table::TableId; +use tokio::sync::MutexGuard; +use wal::{ + log_batch::LogEntry, + manager::{ + ReadBoundary, ReadContext, ReadRequest, RegionId, ScanContext, ScanRequest, WalManagerRef, + }, +}; + +use crate::{ + instance::{ + self, + engine::{Error, ReplayWalWithCause, Result}, + flush_compaction::{Flusher, TableFlushOptions}, + serial_executor::TableOpSerialExecutor, + write::MemTableWriter, + }, + payload::{ReadPayload, WalDecoder}, + table::data::TableDataRef, +}; + +// Metrics of wal replayer +lazy_static! { + static ref PULL_LOGS_DURATION_HISTOGRAM: Histogram = register_histogram!( + "wal_replay_pull_logs_duration", + "Histogram for pull logs duration in wal replay in seconds", + exponential_buckets(0.01, 2.0, 13).unwrap() + ) + .unwrap(); + static ref APPLY_LOGS_DURATION_HISTOGRAM: Histogram = register_histogram!( + "wal_replay_apply_logs_duration", + "Histogram for apply logs duration in wal replay in seconds", + exponential_buckets(0.01, 2.0, 13).unwrap() + ) + .unwrap(); +} + +/// Wal replayer supporting both table based and region based +// TODO: limit the memory usage in `RegionBased` mode. 
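// In short: `TableBased` replay issues one WAL read per table, starting just
// after that table's flushed sequence, while `RegionBased` replay scans the
// shard's WAL region once, splits every batch of entries by table id, and
// therefore has to lock all tables of the shard up front.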
+pub struct WalReplayer<'a> { + context: ReplayContext, + replay: Box, + table_datas: &'a [TableDataRef], +} + +impl<'a> WalReplayer<'a> { + pub fn new( + table_datas: &'a [TableDataRef], + shard_id: ShardId, + wal_manager: WalManagerRef, + wal_replay_batch_size: usize, + flusher: Flusher, + max_retry_flush_limit: usize, + replay_mode: ReplayMode, + ) -> Self { + let context = ReplayContext { + shard_id, + wal_manager, + wal_replay_batch_size, + flusher, + max_retry_flush_limit, + }; + + let replay = Self::build_replay(replay_mode); + + Self { + replay, + context, + table_datas, + } + } + + fn build_replay(mode: ReplayMode) -> Box { + info!("Replay wal in mode:{mode:?}"); + + match mode { + ReplayMode::RegionBased => Box::new(RegionBasedReplay), + ReplayMode::TableBased => Box::new(TableBasedReplay), + } + } + + /// Replay tables and return the failed tables and the causes. + pub async fn replay(&mut self) -> Result { + // Build replay action according to mode. + info!( + "Replay wal logs begin, context:{}, tables:{:?}", + self.context, self.table_datas + ); + let result = self.replay.run(&self.context, self.table_datas).await; + info!( + "Replay wal logs finish, context:{}, tables:{:?}", + self.context, self.table_datas, + ); + + result + } +} + +pub struct ReplayContext { + pub shard_id: ShardId, + pub wal_manager: WalManagerRef, + pub wal_replay_batch_size: usize, + pub flusher: Flusher, + pub max_retry_flush_limit: usize, +} + +impl Display for ReplayContext { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ReplayContext") + .field("shard_id", &self.shard_id) + .field("replay_batch_size", &self.wal_replay_batch_size) + .field("max_retry_flush_limit", &self.max_retry_flush_limit) + .finish() + } +} + +#[derive(Debug, Clone, Copy)] +pub enum ReplayMode { + RegionBased, + TableBased, +} + +pub type FailedTables = HashMap; + +/// Replay action, the abstract of different replay strategies +#[async_trait] +trait Replay: Send + Sync + 'static { + async fn run( + &self, + context: &ReplayContext, + table_datas: &[TableDataRef], + ) -> Result; +} + +/// Table based wal replay +struct TableBasedReplay; + +#[async_trait] +impl Replay for TableBasedReplay { + async fn run( + &self, + context: &ReplayContext, + table_datas: &[TableDataRef], + ) -> Result { + debug!("Replay wal logs on table mode, context:{context}, tables:{table_datas:?}",); + + let mut faileds = HashMap::new(); + let read_ctx = ReadContext { + batch_size: context.wal_replay_batch_size, + ..Default::default() + }; + for table_data in table_datas { + let table_id = table_data.id; + if let Err(e) = Self::recover_table_logs(context, table_data, &read_ctx).await { + faileds.insert(table_id, e); + } + } + + Ok(faileds) + } +} + +impl TableBasedReplay { + async fn recover_table_logs( + context: &ReplayContext, + table_data: &TableDataRef, + read_ctx: &ReadContext, + ) -> Result<()> { + let table_location = table_data.table_location(); + let wal_location = + instance::create_wal_location(table_location.id, table_location.shard_info); + let read_req = ReadRequest { + location: wal_location, + start: ReadBoundary::Excluded(table_data.current_version().flushed_sequence()), + end: ReadBoundary::Max, + }; + + // Read all wal of current table. 
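        // Only entries with sequences strictly greater than the flushed
        // sequence are requested (see `ReadBoundary::Excluded` above), so rows
        // already persisted in SST files are not re-applied to the memtable.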
+ let mut log_iter = context + .wal_manager + .read_batch(read_ctx, &read_req) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + + let mut serial_exec = table_data.serial_exec.lock().await; + let mut log_entry_buf = VecDeque::with_capacity(context.wal_replay_batch_size); + loop { + // fetch entries to log_entry_buf + let _timer = PULL_LOGS_DURATION_HISTOGRAM.start_timer(); + let decoder = WalDecoder::default(); + log_entry_buf = log_iter + .next_log_entries(decoder, log_entry_buf) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + + if log_entry_buf.is_empty() { + break; + } + + // Replay all log entries of current table + let _timer = APPLY_LOGS_DURATION_HISTOGRAM.start_timer(); + replay_table_log_entries( + &context.flusher, + context.max_retry_flush_limit, + &mut serial_exec, + table_data, + log_entry_buf.iter(), + ) + .await?; + } + + Ok(()) + } +} + +/// Region based wal replay +struct RegionBasedReplay; + +#[async_trait] +impl Replay for RegionBasedReplay { + async fn run( + &self, + context: &ReplayContext, + table_datas: &[TableDataRef], + ) -> Result { + debug!("Replay wal logs on region mode, context:{context}, tables:{table_datas:?}",); + + // Init all table results to be oks, and modify to errs when failed to replay. + let mut faileds = FailedTables::new(); + let scan_ctx = ScanContext { + batch_size: context.wal_replay_batch_size, + ..Default::default() + }; + + Self::replay_region_logs(context, table_datas, &scan_ctx, &mut faileds).await?; + + Ok(faileds) + } +} + +impl RegionBasedReplay { + /// Replay logs in same region. + /// + /// Steps: + /// + Scan all logs of region. + /// + Split logs according to table ids. + /// + Replay logs to recover data of tables. + async fn replay_region_logs( + context: &ReplayContext, + table_datas: &[TableDataRef], + scan_ctx: &ScanContext, + faileds: &mut FailedTables, + ) -> Result<()> { + // Scan all wal logs of current shard. + let scan_req = ScanRequest { + region_id: context.shard_id as RegionId, + }; + + let mut log_iter = context + .wal_manager + .scan(scan_ctx, &scan_req) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + let mut log_entry_buf = VecDeque::with_capacity(context.wal_replay_batch_size); + + // Lock all related tables. + let mut serial_exec_ctxs = HashMap::with_capacity(table_datas.len()); + for table_data in table_datas { + let serial_exec = table_data.serial_exec.lock().await; + let serial_exec_ctx = SerialExecContext { + table_data: table_data.clone(), + serial_exec, + }; + serial_exec_ctxs.insert(table_data.id, serial_exec_ctx); + } + + // Split and replay logs. + loop { + let _timer = PULL_LOGS_DURATION_HISTOGRAM.start_timer(); + let decoder = WalDecoder::default(); + log_entry_buf = log_iter + .next_log_entries(decoder, log_entry_buf) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + + if log_entry_buf.is_empty() { + break; + } + + let _timer = APPLY_LOGS_DURATION_HISTOGRAM.start_timer(); + Self::replay_single_batch(context, &log_entry_buf, &mut serial_exec_ctxs, faileds) + .await?; + } + + Ok(()) + } + + async fn replay_single_batch( + context: &ReplayContext, + log_batch: &VecDeque>, + serial_exec_ctxs: &mut HashMap>, + faileds: &mut FailedTables, + ) -> Result<()> { + let mut table_batches = Vec::new(); + // TODO: No `group_by` method in `VecDeque`, so implement it manually here... + Self::split_log_batch_by_table(log_batch, &mut table_batches); + + // TODO: Replay logs of different tables in parallel. 
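        // For now each `TableBatch` is applied serially: tables already
        // recorded in `faileds` are skipped, and entries of tables that no
        // longer have a `SerialExecContext` on this shard (moved or dropped)
        // are ignored.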
+ for table_batch in table_batches { + // Some tables may have failed in previous replay, ignore them. + if faileds.contains_key(&table_batch.table_id) { + continue; + } + + // Replay all log entries of current table. + // Some tables may have been moved to other shards or dropped, ignore such logs. + if let Some(ctx) = serial_exec_ctxs.get_mut(&table_batch.table_id) { + let result = replay_table_log_entries( + &context.flusher, + context.max_retry_flush_limit, + &mut ctx.serial_exec, + &ctx.table_data, + log_batch.range(table_batch.range), + ) + .await; + + // If occur error, mark this table as failed and store the cause. + if let Err(e) = result { + faileds.insert(table_batch.table_id, e); + } + } + } + + Ok(()) + } + + fn split_log_batch_by_table
( + log_batch: &VecDeque>, + table_batches: &mut Vec, + ) { + table_batches.clear(); + + if log_batch.is_empty() { + return; + } + + // Split log batch by table id, for example: + // input batch: + // |1|1|2|2|2|3|3|3|3| + // + // output batches: + // |1|1|, |2|2|2|, |3|3|3|3| + let mut start_log_idx = 0usize; + let mut curr_log_idx = 0usize; + let mut start_table_id = log_batch.get(start_log_idx).unwrap().table_id; + loop { + let time_to_break = curr_log_idx == log_batch.len(); + let found_end_idx = if time_to_break { + true + } else { + let current_table_id = log_batch.get(curr_log_idx).unwrap().table_id; + current_table_id != start_table_id + }; + + if found_end_idx { + table_batches.push(TableBatch { + table_id: TableId::new(start_table_id), + range: start_log_idx..curr_log_idx, + }); + + // Step to next start idx. + start_log_idx = curr_log_idx; + start_table_id = if time_to_break { + // The final round, just set it to max as an invalid flag. + u64::MAX + } else { + log_batch.get(start_log_idx).unwrap().table_id + }; + } + + if time_to_break { + break; + } + curr_log_idx += 1; + } + } +} + +#[derive(Debug, Eq, PartialEq)] +struct TableBatch { + table_id: TableId, + range: Range, +} + +struct SerialExecContext<'a> { + table_data: TableDataRef, + serial_exec: MutexGuard<'a, TableOpSerialExecutor>, +} + +/// Replay all log entries into memtable and flush if necessary +async fn replay_table_log_entries( + flusher: &Flusher, + max_retry_flush_limit: usize, + serial_exec: &mut TableOpSerialExecutor, + table_data: &TableDataRef, + log_entries: impl Iterator>, +) -> Result<()> { + let flushed_sequence = table_data.current_version().flushed_sequence(); + debug!( + "Replay table log entries begin, table:{}, table_id:{:?}, last_sequence:{}, flushed_sequence:{flushed_sequence}", + table_data.name, table_data.id, table_data.last_sequence(), + ); + + for log_entry in log_entries { + let (sequence, payload) = (log_entry.sequence, &log_entry.payload); + + // Ignore too old logs(sequence <= `flushed_sequence`). + if sequence <= flushed_sequence { + continue; + } + + // Apply logs to memtable. + match payload { + ReadPayload::Write { row_group } => { + trace!( + "Instance replay row_group, table:{}, row_group:{:?}", + table_data.name, + row_group + ); + + // TODO: too strict check here, should be modified to like what in + // `ColumnSchema::compatible_for_write`.` + let table_schema_version = table_data.schema_version(); + if table_schema_version != row_group.schema().version() { + // Data with old schema should already been flushed, but we avoid panic + // here. + error!( + "Ignore data with mismatch schema version during replaying, \ + table:{}, \ + table_id:{:?}, \ + expect:{}, \ + actual:{}, \ + last_sequence:{}, \ + sequence:{}", + table_data.name, + table_data.id, + table_schema_version, + row_group.schema().version(), + table_data.last_sequence(), + sequence, + ); + + continue; + } + + let index_in_writer = + IndexInWriterSchema::for_same_schema(row_group.schema().num_columns()); + let memtable_writer = MemTableWriter::new(table_data.clone(), serial_exec); + memtable_writer + .write(sequence, &row_group.into(), index_in_writer) + .box_err() + .context(ReplayWalWithCause { + msg: Some(format!( + "table_id:{}, table_name:{}, space_id:{}", + table_data.space_id, table_data.name, table_data.id + )), + })?; + + // Flush the table if necessary. 
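                // Replay reuses the write-path flush check, so replaying a
                // large WAL region cannot grow memtables without bound: once
                // `should_flush_table` reports the memory limits are hit, a
                // flush is scheduled before further entries are applied.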
+ if table_data.should_flush_table(serial_exec) { + let opts = TableFlushOptions { + res_sender: None, + max_retry_flush_limit, + }; + let flush_scheduler = serial_exec.flush_scheduler(); + flusher + .schedule_flush(flush_scheduler, table_data, opts) + .await + .box_err() + .context(ReplayWalWithCause { + msg: Some(format!( + "table_id:{}, table_name:{}, space_id:{}", + table_data.space_id, table_data.name, table_data.id + )), + })?; + } + } + ReadPayload::AlterSchema { .. } | ReadPayload::AlterOptions { .. } => { + // Ignore records except Data. + // + // - DDL (AlterSchema and AlterOptions) should be recovered from + // Manifest on start. + } + } + + table_data.set_last_sequence(sequence); + } + + debug!( + "Replay table log entries finish, table:{}, table_id:{:?}, last_sequence:{}, flushed_sequence:{}", + table_data.name, table_data.id, table_data.last_sequence(), table_data.current_version().flushed_sequence() + ); + + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::collections::VecDeque; + + use table_engine::table::TableId; + use wal::log_batch::LogEntry; + + use crate::instance::wal_replayer::{RegionBasedReplay, TableBatch}; + + #[test] + fn test_split_log_batch_by_table() { + let test_set = test_set(); + for (test_batch, expected) in test_set { + check_split_result(&test_batch, &expected); + } + } + + fn test_set() -> Vec<(VecDeque>, Vec)> { + let test_log_batch1: VecDeque> = VecDeque::from([ + LogEntry { + table_id: 0, + sequence: 1, + payload: 0, + }, + LogEntry { + table_id: 0, + sequence: 2, + payload: 0, + }, + LogEntry { + table_id: 0, + sequence: 3, + payload: 0, + }, + LogEntry { + table_id: 1, + sequence: 1, + payload: 0, + }, + LogEntry { + table_id: 1, + sequence: 2, + payload: 0, + }, + LogEntry { + table_id: 2, + sequence: 1, + payload: 0, + }, + ]); + let expected1 = vec![ + TableBatch { + table_id: TableId::new(0), + range: 0..3, + }, + TableBatch { + table_id: TableId::new(1), + range: 3..5, + }, + TableBatch { + table_id: TableId::new(2), + range: 5..6, + }, + ]; + + let test_log_batch2: VecDeque> = VecDeque::from([LogEntry { + table_id: 0, + sequence: 1, + payload: 0, + }]); + let expected2 = vec![TableBatch { + table_id: TableId::new(0), + range: 0..1, + }]; + + let test_log_batch3: VecDeque> = VecDeque::default(); + let expected3 = vec![]; + + vec![ + (test_log_batch1, expected1), + (test_log_batch2, expected2), + (test_log_batch3, expected3), + ] + } + + fn check_split_result(batch: &VecDeque>, expected: &[TableBatch]) { + let mut table_batches = Vec::new(); + RegionBasedReplay::split_log_batch_by_table(batch, &mut table_batches); + assert_eq!(&table_batches, expected); + } +} diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs index 9e95d8e97b..025845afbe 100644 --- a/analytic_engine/src/lib.rs +++ b/analytic_engine/src/lib.rs @@ -97,9 +97,21 @@ pub struct Config { /// + Kafka pub wal: WalStorageConfig, + /// Recover mode + /// + /// + TableBased, tables on same shard will be recovered table by table. + /// + ShardBased, tables on same shard will be recovered together. 
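As an illustration only (not code from this patch), a deployment that wants shard based recovery could set the new option roughly as in the sketch below, keeping every other field at its default; with the plain serde derive on `RecoverMode`, the same choice in a TOML config would likely read `recover_mode = "ShardBased"`.

    // Hypothetical caller-side sketch; only `recover_mode` is set explicitly.
    let config = analytic_engine::Config {
        recover_mode: analytic_engine::RecoverMode::ShardBased,
        ..Default::default()
    };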
+ pub recover_mode: RecoverMode, + pub remote_engine_client: remote_engine_client::config::Config, } +#[derive(Debug, Clone, Copy, Deserialize, Serialize)] +pub enum RecoverMode { + TableBased, + ShardBased, +} + impl Default for Config { fn default() -> Self { Self { @@ -127,6 +139,7 @@ impl Default for Config { max_bytes_per_write_batch: None, wal: WalStorageConfig::RocksDB(Box::default()), remote_engine_client: remote_engine_client::config::Config::default(), + recover_mode: RecoverMode::TableBased, } } } diff --git a/analytic_engine/src/manifest/details.rs b/analytic_engine/src/manifest/details.rs index 9e902086b7..5ac94c9b82 100644 --- a/analytic_engine/src/manifest/details.rs +++ b/analytic_engine/src/manifest/details.rs @@ -199,6 +199,8 @@ pub(crate) trait TableMetaSet: fmt::Debug + Send + Sync { // `SnapshotReoverer`. #[derive(Debug, Clone)] struct SnapshotRecoverer { + table_id: TableId, + space_id: SpaceId, log_store: LogStore, snapshot_store: SnapshotStore, } @@ -217,6 +219,11 @@ where } async fn create_latest_snapshot_with_prev(&self, prev_snapshot: Snapshot) -> Result { + debug!( + "Manifest recover with prev snapshot, snapshot:{:?}, table_id:{}, space_id:{}", + prev_snapshot, self.table_id, self.space_id + ); + let log_start_boundary = ReadBoundary::Excluded(prev_snapshot.end_seq); let mut reader = self.log_store.scan(log_start_boundary).await?; @@ -239,6 +246,11 @@ where } async fn create_latest_snapshot_without_prev(&self) -> Result> { + debug!( + "Manifest recover without prev snapshot, table_id:{}, space_id:{}", + self.table_id, self.space_id + ); + let mut reader = self.log_store.scan(ReadBoundary::Min).await?; let mut latest_seq = SequenceNumber::MIN; @@ -258,6 +270,10 @@ where data: manifest_data_builder.build(), })) } else { + debug!( + "Manifest recover nothing, table_id:{}, space_id:{}", + self.table_id, self.space_id + ); Ok(None) } } @@ -474,7 +490,7 @@ impl Manifest for ManifestImpl { } async fn recover(&self, load_req: &LoadRequest) -> GenericResult<()> { - info!("Manifest recover, request:{:?}", load_req); + info!("Manifest recover begin, request:{load_req:?}"); // Load table meta snapshot from storage. let location = WalLocation::new(load_req.shard_id as u64, load_req.table_id.as_u64()); @@ -490,6 +506,8 @@ impl Manifest for ManifestImpl { self.store.clone(), ); let reoverer = SnapshotRecoverer { + table_id: load_req.table_id, + space_id: load_req.space_id, log_store, snapshot_store, }; @@ -505,6 +523,8 @@ impl Manifest for ManifestImpl { self.table_meta_set.apply_edit_to_table(request)?; } + info!("Manifest recover finish, request:{load_req:?}"); + Ok(()) } @@ -1386,7 +1406,8 @@ mod tests { assert_eq!(snapshot.data, expect_table_manifest_data); assert_eq!(snapshot.end_seq, log_store.next_seq() - 1); - let recovered_snapshot = recover_snapshot(&log_store, &snapshot_store).await; + let recovered_snapshot = + recover_snapshot(table_id, 0, &log_store, &snapshot_store).await; assert_eq!(snapshot, recovered_snapshot.unwrap()); } // The logs in the log store should be cleared after snapshot. @@ -1418,7 +1439,8 @@ mod tests { assert_eq!(snapshot.data, expect_table_manifest_data); assert_eq!(snapshot.end_seq, log_store.next_seq() - 1); - let recovered_snapshot = recover_snapshot(&log_store, &snapshot_store).await; + let recovered_snapshot = + recover_snapshot(table_id, 0, &log_store, &snapshot_store).await; assert_eq!(snapshot, recovered_snapshot.unwrap()); } // The logs in the log store should be cleared after snapshot. 
@@ -1446,10 +1468,14 @@ mod tests { } async fn recover_snapshot( + table_id: TableId, + space_id: SpaceId, log_store: &MemLogStore, snapshot_store: &MemSnapshotStore, ) -> Option { let recoverer = SnapshotRecoverer { + table_id, + space_id, log_store: log_store.clone(), snapshot_store: snapshot_store.clone(), }; diff --git a/analytic_engine/src/memtable/mod.rs b/analytic_engine/src/memtable/mod.rs index bc4e4a2743..aeafdd9abe 100644 --- a/analytic_engine/src/memtable/mod.rs +++ b/analytic_engine/src/memtable/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! MemTable @@ -193,6 +193,19 @@ pub trait MemTable { /// /// If the memtable is empty, then the last sequence is 0. fn last_sequence(&self) -> SequenceNumber; + + /// Metrics of inner state. + fn metrics(&self) -> Metrics; +} + +#[derive(Debug)] +pub struct Metrics { + /// Size of original rows. + pub row_raw_size: usize, + /// Size of rows after encoded. + pub row_encoded_size: usize, + /// Row number count. + pub row_count: usize, } /// A reference to memtable diff --git a/analytic_engine/src/memtable/skiplist/factory.rs b/analytic_engine/src/memtable/skiplist/factory.rs index 89dd453587..3c11e6ea3c 100644 --- a/analytic_engine/src/memtable/skiplist/factory.rs +++ b/analytic_engine/src/memtable/skiplist/factory.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Skiplist memtable factory @@ -25,6 +25,7 @@ impl Factory for SkiplistMemTableFactory { schema: opts.schema, skiplist, last_sequence: AtomicU64::new(opts.creation_sequence), + metrics: Default::default(), }); Ok(memtable) diff --git a/analytic_engine/src/memtable/skiplist/mod.rs b/analytic_engine/src/memtable/skiplist/mod.rs index 4e3a1a8e27..ec06a98efd 100644 --- a/analytic_engine/src/memtable/skiplist/mod.rs +++ b/analytic_engine/src/memtable/skiplist/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! MemTable based on skiplist @@ -8,7 +8,7 @@ pub mod iter; use std::{ cmp::Ordering, convert::TryInto, - sync::atomic::{self, AtomicU64}, + sync::atomic::{self, AtomicU64, AtomicUsize}, }; use arena::{Arena, BasicStats}; @@ -26,10 +26,17 @@ use snafu::{ensure, ResultExt}; use crate::memtable::{ key::{ComparableInternalKey, KeySequence}, skiplist::iter::{ColumnarIterImpl, ReversedColumnarIterator}, - ColumnarIterPtr, EncodeInternalKey, InvalidPutSequence, InvalidRow, MemTable, PutContext, - Result, ScanContext, ScanRequest, + ColumnarIterPtr, EncodeInternalKey, InvalidPutSequence, InvalidRow, MemTable, + Metrics as MemtableMetrics, PutContext, Result, ScanContext, ScanRequest, }; +#[derive(Default, Debug)] +struct Metrics { + row_raw_size: AtomicUsize, + row_encoded_size: AtomicUsize, + row_count: AtomicUsize, +} + /// MemTable implementation based on skiplist pub struct SkiplistMemTable + Clone + Sync + Send> { /// Schema of this memtable, is immutable. @@ -38,6 +45,8 @@ pub struct SkiplistMemTable + Clone + Sync + Send> /// The last sequence of the rows in this memtable. Update to this field /// require external synchronization. 
last_sequence: AtomicU64, + + metrics: Metrics, } impl + Clone + Sync + Send + 'static> MemTable @@ -95,9 +104,20 @@ impl + Clone + Sync + Send + 'static> MemTable let row_value = &mut ctx.value_buf; let mut row_writer = ContiguousRowWriter::new(row_value, schema, &ctx.index_in_writer); row_writer.write_row(row).box_err().context(InvalidRow)?; - + let encoded_size = internal_key.len() + row_value.len(); self.skiplist.put(internal_key, row_value); + // Update metrics + self.metrics + .row_raw_size + .fetch_add(row.size(), atomic::Ordering::Relaxed); + self.metrics + .row_count + .fetch_add(1, atomic::Ordering::Relaxed); + self.metrics + .row_encoded_size + .fetch_add(encoded_size, atomic::Ordering::Relaxed); + Ok(()) } @@ -147,6 +167,20 @@ impl + Clone + Sync + Send + 'static> MemTable fn last_sequence(&self) -> SequenceNumber { self.last_sequence.load(atomic::Ordering::Relaxed) } + + fn metrics(&self) -> MemtableMetrics { + let row_raw_size = self.metrics.row_raw_size.load(atomic::Ordering::Relaxed); + let row_encoded_size = self + .metrics + .row_encoded_size + .load(atomic::Ordering::Relaxed); + let row_count = self.metrics.row_count.load(atomic::Ordering::Relaxed); + MemtableMetrics { + row_raw_size, + row_encoded_size, + row_count, + } + } } #[derive(Debug, Clone)] diff --git a/analytic_engine/src/sampler.rs b/analytic_engine/src/sampler.rs index 86729dba2b..9a72011eec 100644 --- a/analytic_engine/src/sampler.rs +++ b/analytic_engine/src/sampler.rs @@ -382,8 +382,8 @@ mod tests { &[(0, 2 * HOUR_MS as i64)], ); - let now_ts = Timestamp::now(); - let now = now_ts.as_i64(); + let now = 1672502400000i64; + let now_ts = Timestamp::new(now); let sec_ms_i64 = SEC_MS as i64; let bucket = TimeRange::bucket_of(now_ts, Duration::from_millis(2 * HOUR_MS)).unwrap(); diff --git a/analytic_engine/src/setup.rs b/analytic_engine/src/setup.rs index be39178bbc..940ce16579 100644 --- a/analytic_engine/src/setup.rs +++ b/analytic_engine/src/setup.rs @@ -173,15 +173,63 @@ impl WalsOpener for RocksDBWalsOpener { let data_path = Path::new(&rocksdb_wal_config.data_dir); let wal_path = data_path.join(WAL_DIR_NAME); let data_wal = RocksWalBuilder::new(wal_path, write_runtime.clone()) + .max_subcompactions(rocksdb_wal_config.data_namespace.max_subcompactions) .max_background_jobs(rocksdb_wal_config.data_namespace.max_background_jobs) .enable_statistics(rocksdb_wal_config.data_namespace.enable_statistics) + .write_buffer_size(rocksdb_wal_config.data_namespace.write_buffer_size.0) + .max_write_buffer_number(rocksdb_wal_config.data_namespace.max_write_buffer_number) + .level_zero_file_num_compaction_trigger( + rocksdb_wal_config + .data_namespace + .level_zero_file_num_compaction_trigger, + ) + .level_zero_slowdown_writes_trigger( + rocksdb_wal_config + .data_namespace + .level_zero_slowdown_writes_trigger, + ) + .level_zero_stop_writes_trigger( + rocksdb_wal_config + .data_namespace + .level_zero_stop_writes_trigger, + ) + .fifo_compaction_max_table_files_size( + rocksdb_wal_config + .data_namespace + .fifo_compaction_max_table_files_size + .0, + ) .build() .context(OpenWal)?; let manifest_path = data_path.join(MANIFEST_DIR_NAME); let manifest_wal = RocksWalBuilder::new(manifest_path, write_runtime) + .max_subcompactions(rocksdb_wal_config.meta_namespace.max_subcompactions) .max_background_jobs(rocksdb_wal_config.meta_namespace.max_background_jobs) .enable_statistics(rocksdb_wal_config.meta_namespace.enable_statistics) + .write_buffer_size(rocksdb_wal_config.meta_namespace.write_buffer_size.0) + 
.max_write_buffer_number(rocksdb_wal_config.meta_namespace.max_write_buffer_number) + .level_zero_file_num_compaction_trigger( + rocksdb_wal_config + .meta_namespace + .level_zero_file_num_compaction_trigger, + ) + .level_zero_slowdown_writes_trigger( + rocksdb_wal_config + .meta_namespace + .level_zero_slowdown_writes_trigger, + ) + .level_zero_stop_writes_trigger( + rocksdb_wal_config + .meta_namespace + .level_zero_stop_writes_trigger, + ) + .fifo_compaction_max_table_files_size( + rocksdb_wal_config + .meta_namespace + .fifo_compaction_max_table_files_size + .0, + ) .build() .context(OpenManifestWal)?; let opened_wals = OpenedWals { diff --git a/analytic_engine/src/sst/meta_data/cache.rs b/analytic_engine/src/sst/meta_data/cache.rs index 296c4e2476..5e2bacdcbd 100644 --- a/analytic_engine/src/sst/meta_data/cache.rs +++ b/analytic_engine/src/sst/meta_data/cache.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. use std::{ fmt::Debug, @@ -7,7 +7,7 @@ use std::{ use lru::LruCache; use parquet::file::metadata::FileMetaData; -use snafu::{OptionExt, ResultExt}; +use snafu::{ensure, OptionExt, ResultExt}; use crate::sst::{ meta_data::{DecodeCustomMetaData, KvMetaDataNotFound, ParquetMetaDataRef, Result}, @@ -39,14 +39,24 @@ impl MetaData { let kv_metas = file_meta_data .key_value_metadata() .context(KvMetaDataNotFound)?; - let kv_meta = kv_metas - .iter() - .find(|kv| kv.key == encoding::META_KEY) - .context(KvMetaDataNotFound)?; + + ensure!(!kv_metas.is_empty(), KvMetaDataNotFound); + let mut other_kv_metas = Vec::with_capacity(kv_metas.len() - 1); + let mut custom_kv_meta = None; + for kv_meta in kv_metas { + // Remove our extended custom meta data from the parquet metadata for small + // memory consumption in the cache. + if kv_meta.key == encoding::META_KEY { + custom_kv_meta = Some(kv_meta); + } else { + other_kv_metas.push(kv_meta.clone()); + } + } let custom = { + let custom_kv_meta = custom_kv_meta.context(KvMetaDataNotFound)?; let mut sst_meta = - encoding::decode_sst_meta_data(kv_meta).context(DecodeCustomMetaData)?; + encoding::decode_sst_meta_data(custom_kv_meta).context(DecodeCustomMetaData)?; if ignore_sst_filter { sst_meta.parquet_filter = None; } @@ -56,13 +66,17 @@ impl MetaData { // let's build a new parquet metadata without the extended key value // metadata. + let other_kv_metas = if other_kv_metas.is_empty() { + None + } else { + Some(other_kv_metas) + }; let parquet = { let thin_file_meta_data = FileMetaData::new( file_meta_data.version(), file_meta_data.num_rows(), file_meta_data.created_by().map(|v| v.to_string()), - // Remove the key value metadata. 
- None, + other_kv_metas, file_meta_data.schema_descr_ptr(), file_meta_data.column_orders().cloned(), ); @@ -111,3 +125,153 @@ impl MetaCache { self.cache.write().unwrap().put(key, value); } } + +#[cfg(test)] +mod tests { + use std::{fs::File, path::Path, sync::Arc}; + + use arrow::{ + array::UInt64Builder, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, + }; + use bytes::Bytes; + use common_types::{ + column_schema::Builder as ColumnSchemaBuilder, + schema::Builder as CustomSchemaBuilder, + time::{TimeRange, Timestamp}, + }; + use parquet::{arrow::ArrowWriter, file::footer}; + use parquet_ext::ParquetMetaData; + + use super::MetaData; + use crate::sst::parquet::{encoding, meta_data::ParquetMetaData as CustomParquetMetaData}; + + fn check_parquet_meta_data(original: &ParquetMetaData, processed: &ParquetMetaData) { + assert_eq!(original.page_indexes(), processed.page_indexes()); + assert_eq!(original.offset_indexes(), processed.offset_indexes()); + assert_eq!(original.num_row_groups(), processed.num_row_groups()); + assert_eq!(original.row_groups(), processed.row_groups()); + + let original_file_md = original.file_metadata(); + let processed_file_md = processed.file_metadata(); + assert_eq!(original_file_md.num_rows(), processed_file_md.num_rows()); + assert_eq!(original_file_md.version(), processed_file_md.version()); + assert_eq!( + original_file_md.created_by(), + processed_file_md.created_by() + ); + assert_eq!(original_file_md.schema(), processed_file_md.schema()); + assert_eq!( + original_file_md.schema_descr(), + processed_file_md.schema_descr() + ); + assert_eq!( + original_file_md.schema_descr_ptr(), + processed_file_md.schema_descr_ptr() + ); + assert_eq!( + original_file_md.column_orders(), + processed_file_md.column_orders() + ); + + if let Some(kv_metas) = original_file_md.key_value_metadata() { + let processed_kv_metas = processed_file_md.key_value_metadata().unwrap(); + assert_eq!(kv_metas.len(), processed_kv_metas.len() + 1); + let mut idx_for_processed = 0; + for kv in kv_metas { + if kv.key == encoding::META_KEY { + continue; + } + assert_eq!(kv, &processed_kv_metas[idx_for_processed]); + idx_for_processed += 1; + } + } else { + assert!(processed_file_md.key_value_metadata().is_none()); + } + } + + fn write_parquet_file_with_metadata( + parquet_file_path: &Path, + custom_meta_data: &CustomParquetMetaData, + ) { + let tsid_array = { + let mut builder = UInt64Builder::new(); + builder.append_value(10); + builder.append_null(); + builder.append_value(11); + builder.finish() + }; + let timestamp_array = { + let mut builder = UInt64Builder::new(); + builder.append_value(1000); + builder.append_null(); + builder.append_value(1001); + builder.finish() + }; + let file = File::create(parquet_file_path).unwrap(); + let schema = Schema::new(vec![ + Field::new("tsid", DataType::UInt64, true), + Field::new("timestamp", DataType::UInt64, true), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(tsid_array), Arc::new(timestamp_array)], + ) + .unwrap(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), None).unwrap(); + + let encoded_meta_data = encoding::encode_sst_meta_data(custom_meta_data.clone()).unwrap(); + writer.append_key_value_metadata(encoded_meta_data); + + writer.write(&batch).unwrap(); + writer.close().unwrap(); + } + + #[test] + fn test_arrow_meta_data() { + let temp_dir = tempfile::tempdir().unwrap(); + let parquet_file_path = temp_dir.path().join("test_arrow_meta_data.par"); + let schema = { + let 
tsid_column_schema = ColumnSchemaBuilder::new( + "tsid".to_string(), + common_types::datum::DatumKind::UInt64, + ) + .build() + .unwrap(); + let timestamp_column_schema = ColumnSchemaBuilder::new( + "timestamp".to_string(), + common_types::datum::DatumKind::Timestamp, + ) + .build() + .unwrap(); + CustomSchemaBuilder::new() + .auto_increment_column_id(true) + .add_key_column(tsid_column_schema) + .unwrap() + .add_key_column(timestamp_column_schema) + .unwrap() + .build() + .unwrap() + }; + let custom_meta_data = CustomParquetMetaData { + min_key: Bytes::from_static(&[0, 1]), + max_key: Bytes::from_static(&[2, 2]), + time_range: TimeRange::new_unchecked(Timestamp::new(0), Timestamp::new(10)), + max_sequence: 1001, + schema, + parquet_filter: None, + collapsible_cols_idx: vec![], + }; + write_parquet_file_with_metadata(parquet_file_path.as_path(), &custom_meta_data); + + let parquet_file = File::open(parquet_file_path.as_path()).unwrap(); + let parquet_meta_data = footer::parse_metadata(&parquet_file).unwrap(); + + let meta_data = MetaData::try_new(&parquet_meta_data, false).unwrap(); + + assert_eq!(**meta_data.custom(), custom_meta_data); + check_parquet_meta_data(&parquet_meta_data, meta_data.parquet()); + } +} diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index dac48bff44..b2181f727b 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Sst reader implementation based on parquet. @@ -30,17 +30,14 @@ use datafusion::{ metrics::ExecutionPlanMetricsSet, }, }; -use futures::{future::BoxFuture, FutureExt, Stream, StreamExt, TryFutureExt}; +use futures::{Stream, StreamExt}; use log::{debug, error}; use object_store::{ObjectStoreRef, Path}; use parquet::{ - arrow::{ - arrow_reader::RowSelection, async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder, - ProjectionMask, - }, + arrow::{arrow_reader::RowSelection, ParquetRecordBatchStreamBuilder, ProjectionMask}, file::metadata::RowGroupMetaData, }; -use parquet_ext::meta_data::ChunkReader; +use parquet_ext::{meta_data::ChunkReader, reader::ObjectStoreReader}; use snafu::ResultExt; use table_engine::predicate::PredicateRef; use tokio::sync::mpsc::{self, Receiver, Sender}; @@ -281,13 +278,23 @@ impl<'a> Reader<'a> { let mut streams = Vec::with_capacity(target_row_group_chunks.len()); for chunk in target_row_group_chunks { - let object_store_reader = - ObjectStoreReader::new(self.store.clone(), self.path.clone(), meta_data.clone()); + let object_store_reader = ObjectStoreReader::new( + self.store.clone(), + self.path.clone(), + parquet_metadata.clone(), + ); let mut builder = ParquetRecordBatchStreamBuilder::new(object_store_reader) .await .with_context(|| ParquetError)?; + let row_selection = self.build_row_selection(arrow_schema.clone(), &chunk, parquet_metadata)?; + + debug!( + "Build row selection for file path:{}, result:{row_selection:?}, page indexes:{}", + self.path, + parquet_metadata.page_indexes().is_some() + ); if let Some(selection) = row_selection { builder = builder.with_row_selection(selection); }; @@ -340,18 +347,32 @@ impl<'a> Reader<'a> { Ok(file_size) } - async fn load_meta_data_from_storage(&self) -> Result { + async fn load_meta_data_from_storage(&self, ignore_sst_filter: bool) -> Result { let file_size = self.load_file_size().await?; 
let chunk_reader_adapter = ChunkReaderAdapter::new(self.path, self.store); - let (meta_data, _) = + let (parquet_meta_data, _) = parquet_ext::meta_data::fetch_parquet_metadata(file_size, &chunk_reader_adapter) .await .with_context(|| FetchAndDecodeSstMeta { file_path: self.path.to_string(), })?; - Ok(Arc::new(meta_data)) + let object_store_reader = parquet_ext::reader::ObjectStoreReader::new( + self.store.clone(), + self.path.clone(), + Arc::new(parquet_meta_data), + ); + + let parquet_meta_data = parquet_ext::meta_data::meta_with_page_indexes(object_store_reader) + .await + .with_context(|| DecodePageIndexes { + file_path: self.path.to_string(), + })?; + + MetaData::try_new(&parquet_meta_data, ignore_sst_filter) + .box_err() + .context(DecodeSstMeta) } fn need_update_cache(&self) -> bool { @@ -375,12 +396,8 @@ impl<'a> Reader<'a> { let empty_predicate = self.predicate.exprs().is_empty(); let meta_data = { - let parquet_meta_data = self.load_meta_data_from_storage().await?; - let ignore_sst_filter = avoid_update_cache && empty_predicate; - MetaData::try_new(&parquet_meta_data, ignore_sst_filter) - .box_err() - .context(DecodeSstMeta)? + self.load_meta_data_from_storage(ignore_sst_filter).await? }; if avoid_update_cache || self.meta_cache.is_none() { @@ -413,71 +430,6 @@ impl<'a> Drop for Reader<'a> { } } -#[derive(Clone)] -struct ObjectStoreReader { - storage: ObjectStoreRef, - path: Path, - meta_data: MetaData, - begin: Instant, -} - -impl ObjectStoreReader { - fn new(storage: ObjectStoreRef, path: Path, meta_data: MetaData) -> Self { - Self { - storage, - path, - meta_data, - begin: Instant::now(), - } - } -} - -impl Drop for ObjectStoreReader { - fn drop(&mut self) { - debug!( - "ObjectStoreReader dropped, path:{}, elapsed:{:?}", - &self.path, - self.begin.elapsed() - ); - } -} - -impl AsyncFileReader for ObjectStoreReader { - fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { - self.storage - .get_range(&self.path, range) - .map_err(|e| { - parquet::errors::ParquetError::General(format!( - "Failed to fetch range from object store, err:{e}" - )) - }) - .boxed() - } - - fn get_byte_ranges( - &mut self, - ranges: Vec>, - ) -> BoxFuture<'_, parquet::errors::Result>> { - async move { - self.storage - .get_ranges(&self.path, &ranges) - .map_err(|e| { - parquet::errors::ParquetError::General(format!( - "Failed to fetch ranges from object store, err:{e}" - )) - }) - .await - } - .boxed() - } - - fn get_metadata( - &mut self, - ) -> BoxFuture<'_, parquet::errors::Result>> { - Box::pin(async move { Ok(self.meta_data.parquet().clone()) }) - } -} - pub struct ChunkReaderAdapter<'a> { path: &'a Path, store: &'a ObjectStoreRef, diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs index 2effc6bc36..1a92338dd4 100644 --- a/analytic_engine/src/sst/parquet/encoding.rs +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -526,7 +526,16 @@ impl HybridRecordDecoder { .iter() .map(|f| { if let DataType::List(nested_field) = f.data_type() { - Arc::new(Field::new(f.name(), nested_field.data_type().clone(), true)) + match f.data_type() { + DataType::Dictionary(_, _) => Arc::new(Field::new_dict( + f.name(), + nested_field.data_type().clone(), + true, + f.dict_id().unwrap(), + f.dict_is_ordered().unwrap(), + )), + _ => Arc::new(Field::new(f.name(), nested_field.data_type().clone(), true)), + } } else { f.clone() } @@ -1030,11 +1039,11 @@ mod tests { ArrowRecordBatch::try_new(schema.to_arrow_schema_ref(), columns).unwrap(); let 
input_record_batch2 = ArrowRecordBatch::try_new(schema.to_arrow_schema_ref(), columns2).unwrap(); - let row_nums = encoder + let num_rows = encoder .encode(vec![input_record_batch, input_record_batch2]) .await .unwrap(); - assert_eq!(2, row_nums); + assert_eq!(2, num_rows); // read encoded records back, and then compare with input records encoder.close().await.unwrap(); diff --git a/analytic_engine/src/sst/parquet/hybrid.rs b/analytic_engine/src/sst/parquet/hybrid.rs index df0b3808af..1cf7481ecf 100644 --- a/analytic_engine/src/sst/parquet/hybrid.rs +++ b/analytic_engine/src/sst/parquet/hybrid.rs @@ -127,6 +127,7 @@ pub fn build_hybrid_arrow_schema(schema: &Schema) -> ArrowSchemaRef { field.data_type().clone(), true, ))); + // TODO is there need to use new_dict? Arc::new(Field::new(field.name(), field_type, true)) } else { field.clone() @@ -418,6 +419,7 @@ impl ListArrayBuilder { let array_len = self.multi_row_arrays.len(); let mut offsets = MutableBuffer::new(array_len * std::mem::size_of::()); let child_data = self.build_child_data(&mut offsets)?; + // TODO is there need to use new_dict? let field = Arc::new(Field::new( LIST_ITEM_NAME, self.datum_kind.to_arrow_data_type(), diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 8bba1b41a2..378854bafa 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -333,7 +333,7 @@ mod tests { use common_types::{ bytes::Bytes, projected_schema::ProjectedSchema, - tests::{build_row, build_schema}, + tests::{build_row, build_row_for_dictionary, build_schema, build_schema_for_dictionary}, time::{TimeRange, Timestamp}, }; use common_util::{ @@ -358,8 +358,163 @@ mod tests { table_options::{self, StorageFormatHint}, }; + fn write_parquet_with_dictionary_encode_and_read_back( + runtime: Arc, + num_rows_per_row_group: usize, + expected_num_rows: Vec, + ) { + runtime.block_on(async { + let sst_factory = FactoryImpl; + let sst_write_options = SstWriteOptions { + storage_format_hint: StorageFormatHint::Auto, + num_rows_per_row_group, + compression: table_options::Compression::Uncompressed, + max_buffer_size: 0, + }; + + let dir = tempdir().unwrap(); + let root = dir.path(); + let store: ObjectStoreRef = Arc::new(LocalFileSystem::new_with_prefix(root).unwrap()); + let store_picker: ObjectStorePickerRef = Arc::new(store); + let sst_file_path = Path::from("test_dictionary.par"); + + let schema = build_schema_for_dictionary(); + let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_meta = MetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), + max_sequence: 200, + schema: schema.clone(), + }; + + let mut counter = 5; + let record_batch_stream = Box::new(stream::poll_fn(move |_| -> Poll> { + if counter == 0 { + return Poll::Ready(None); + } + counter -= 1; + + let ts = 100 + counter; + let rows = vec![ + build_row_for_dictionary(1, ts, Some("tagv1"), "tagv2", 1), + build_row_for_dictionary(2, ts, Some("tagv2"), "tagv2", 2), + build_row_for_dictionary(3, ts, None, "tagv3", 3), + build_row_for_dictionary(4, ts, Some("tagv3"), "tagv2", 2), + ]; + let batch = build_record_batch_with_key(schema.clone(), rows); + Poll::Ready(Some(Ok(batch))) + })); + let mut writer = sst_factory + .create_writer( + &sst_write_options, + &sst_file_path, + &store_picker, + Level::MAX, + ) + .await + .unwrap(); + let sst_info = writer + 
.write(RequestId::next_id(), &sst_meta, record_batch_stream) + .await + .unwrap(); + + assert_eq!(20, sst_info.row_num); + + let scan_options = ScanOptions::default(); + // read sst back to test + let sst_read_options = SstReadOptions { + reverse: false, + frequency: ReadFrequency::Frequent, + num_rows_per_row_group: 5, + projected_schema: reader_projected_schema, + predicate: Arc::new(Predicate::empty()), + meta_cache: None, + scan_options, + runtime: runtime.clone(), + }; + + let mut reader: Box = { + let mut reader = AsyncParquetReader::new( + &sst_file_path, + &sst_read_options, + None, + &store_picker, + None, + ); + let mut sst_meta_readback = reader + .meta_data() + .await + .unwrap() + .as_parquet() + .unwrap() + .as_ref() + .clone(); + // sst filter is built insider sst writer, so overwrite to default for + // comparison. + sst_meta_readback.parquet_filter = Default::default(); + assert_eq!(&sst_meta_readback, &ParquetMetaData::from(sst_meta)); + assert_eq!( + expected_num_rows, + reader + .row_groups() + .await + .iter() + .map(|g| g.num_rows()) + .collect::>() + ); + + Box::new(reader) + }; + let mut stream = reader.read().await.unwrap(); + let mut expect_rows = vec![]; + for counter in &[4, 3, 2, 1, 0] { + expect_rows.push(build_row_for_dictionary( + 1, + 100 + counter, + Some("tagv1"), + "tagv2", + 1, + )); + expect_rows.push(build_row_for_dictionary( + 2, + 100 + counter, + Some("tagv2"), + "tagv2", + 2, + )); + expect_rows.push(build_row_for_dictionary(3, 100 + counter, None, "tagv3", 3)); + expect_rows.push(build_row_for_dictionary( + 4, + 100 + counter, + Some("tagv3"), + "tagv2", + 2, + )); + } + check_stream(&mut stream, expect_rows).await; + }); + } + // TODO(xikai): add test for reverse reader + #[test] + fn test_parquet_use_dictionary() { + init_log_for_test(); + let runtime = Arc::new(runtime::Builder::default().build().unwrap()); + write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 5, vec![5, 5, 5, 5]); + write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 4, vec![4, 4, 4, 4, 4]); + write_parquet_with_dictionary_encode_and_read_back( + runtime.clone(), + 3, + vec![3, 3, 3, 3, 3, 3, 2], + ); + write_parquet_with_dictionary_encode_and_read_back( + runtime, + 2, + vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ); + } #[test] fn test_parquet_build_and_read() { init_log_for_test(); @@ -391,7 +546,7 @@ mod tests { let sst_file_path = Path::from("data.par"); let schema = build_schema(); - let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); let sst_meta = MetaData { min_key: Bytes::from_static(b"100"), max_key: Bytes::from_static(b"200"), @@ -440,7 +595,7 @@ mod tests { reverse: false, frequency: ReadFrequency::Frequent, num_rows_per_row_group: 5, - projected_schema, + projected_schema: reader_projected_schema, predicate: Arc::new(Predicate::empty()), meta_cache: None, scan_options, diff --git a/analytic_engine/src/sst/reader.rs b/analytic_engine/src/sst/reader.rs index 99872d448a..029b0aa34a 100644 --- a/analytic_engine/src/sst/reader.rs +++ b/analytic_engine/src/sst/reader.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Sst reader trait definition. 
@@ -15,20 +15,17 @@ pub mod error { #[derive(Debug, Snafu)] #[snafu(visibility(pub))] pub enum Error { - #[snafu(display("Try to read again, path:{}.\nBacktrace:\n{}", path, backtrace))] + #[snafu(display("Try to read again, path:{path}.\nBacktrace:\n{backtrace}"))] ReadAgain { backtrace: Backtrace, path: String }, - #[snafu(display("Fail to read persisted file, path:{}, err:{}", path, source))] + #[snafu(display("Fail to read persisted file, path:{path}, err:{source}"))] ReadPersist { path: String, source: GenericError }, - #[snafu(display("Failed to decode record batch, err:{}", source))] + #[snafu(display("Failed to decode record batch, err:{source}"))] DecodeRecordBatch { source: GenericError }, #[snafu(display( - "Failed to decode sst meta data, file_path:{}, err:{}.\nBacktrace:\n{:?}", - file_path, - source, - backtrace + "Failed to decode sst meta data, file_path:{file_path}, err:{source}.\nBacktrace:\n{backtrace:?}", ))] FetchAndDecodeSstMeta { file_path: String, @@ -36,43 +33,52 @@ pub mod error { backtrace: Backtrace, }, - #[snafu(display("Failed to decode sst meta data, err:{}", source))] + #[snafu(display( + "Failed to decode page indexes for meta data, file_path:{file_path}, err:{source}.\nBacktrace:\n{backtrace:?}", + ))] + DecodePageIndexes { + file_path: String, + source: parquet::errors::ParquetError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode sst meta data, err:{source}"))] DecodeSstMeta { source: GenericError }, - #[snafu(display("Sst meta data is not found.\nBacktrace:\n{}", backtrace))] + #[snafu(display("Sst meta data is not found.\nBacktrace:\n{backtrace}"))] SstMetaNotFound { backtrace: Backtrace }, - #[snafu(display("Fail to projection, err:{}", source))] + #[snafu(display("Fail to projection, err:{source}"))] Projection { source: GenericError }, - #[snafu(display("Sst meta data is empty.\nBacktrace:\n{}", backtrace))] + #[snafu(display("Sst meta data is empty.\nBacktrace:\n{backtrace}"))] EmptySstMeta { backtrace: Backtrace }, - #[snafu(display("Invalid schema, err:{}", source))] + #[snafu(display("Invalid schema, err:{source}"))] InvalidSchema { source: common_types::schema::Error }, - #[snafu(display("Meet a datafusion error, err:{}\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Meet a datafusion error, err:{source}\nBacktrace:\n{backtrace}"))] DataFusionError { source: datafusion::error::DataFusionError, backtrace: Backtrace, }, - #[snafu(display("Meet a object store error, err:{}\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Meet a object store error, err:{source}\nBacktrace:\n{backtrace}"))] ObjectStoreError { source: object_store::ObjectStoreError, backtrace: Backtrace, }, - #[snafu(display("Meet a parquet error, err:{}\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Meet a parquet error, err:{source}\nBacktrace:\n{backtrace}"))] ParquetError { source: parquet::errors::ParquetError, backtrace: Backtrace, }, - #[snafu(display("Other kind of error:{}", source))] + #[snafu(display("Other kind of error:{source}"))] Other { source: GenericError }, - #[snafu(display("Other kind of error, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + #[snafu(display("Other kind of error, msg:{msg}.\nBacktrace:\n{backtrace}"))] OtherNoCause { msg: String, backtrace: Backtrace }, } diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs index 49d9d6cae8..6af8c5fad4 100644 --- a/analytic_engine/src/table/data.rs +++ b/analytic_engine/src/table/data.rs @@ -358,6 +358,12 @@ impl TableData { 
self.current_version.total_memory_usage() } + /// Returns mutable memtable memory usage in bytes. + #[inline] + pub fn mutable_memory_usage(&self) -> usize { + self.current_version.mutable_memory_usage() + } + /// Find memtable for given timestamp to insert, create if not exists /// /// If the memtable schema is outdated, switch all memtables and create the @@ -443,12 +449,11 @@ impl TableData { let mutable_usage = self.current_version.mutable_memory_usage(); let total_usage = self.current_version.total_memory_usage(); - let in_flush = serial_exec.flush_scheduler().is_in_flush(); // Inspired by https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94 if mutable_usage > mutable_limit && !in_flush { info!( - "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + "TableData should flush by mutable limit, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size ); return true; @@ -467,7 +472,7 @@ impl TableData { if should_flush { info!( - "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + "TableData should flush by total usage, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size ); } @@ -592,6 +597,14 @@ impl TableDataSet { .cloned() } + pub fn find_maximum_mutable_memory_usage_table(&self) -> Option { + // TODO: Possible performance issue here when there are too many tables. + self.table_datas + .values() + .max_by_key(|t| t.mutable_memory_usage()) + .cloned() + } + /// List all tables to `tables` pub fn list_all_tables(&self, tables: &mut Vec) { for table_data in self.table_datas.values().cloned() { @@ -766,7 +779,7 @@ pub mod tests { Some(ReadableDuration(table_options::DEFAULT_SEGMENT_DURATION)); table_data.set_table_options(table_opts); // Freeze sampling memtable. - current_version.freeze_sampling(); + current_version.freeze_sampling_memtable(); // A new mutable memtable should be created. let mutable = table_data.find_or_create_mutable(now_ts, &schema).unwrap(); diff --git a/analytic_engine/src/table/version.rs b/analytic_engine/src/table/version.rs index 329c09677a..9f64403294 100644 --- a/analytic_engine/src/table/version.rs +++ b/analytic_engine/src/table/version.rs @@ -130,6 +130,8 @@ impl fmt::Debug for MemTableState { f.debug_struct("MemTableState") .field("time_range", &self.time_range) .field("id", &self.id) + .field("mem", &self.mem.approximate_memory_usage()) + .field("metrics", &self.mem.metrics()) .field("last_sequence", &self.mem.last_sequence()) .finish() } @@ -278,16 +280,19 @@ impl MemTableView { mutable_usage + immutable_usage } - /// Switch all memtables or just sample the segment duration. + /// Instead of replace the old memtable by a new memtable, we just move the + /// old memtable to immutable memtables and left mutable memtables + /// empty. New mutable memtable will be constructed via put request. + fn switch_memtables(&mut self) -> Option { + self.mutables.move_to_inmem(&mut self.immutables) + } + + /// Sample the segment duration. 
/// /// If the sampling memtable is still active, return the suggested segment /// duration or move all mutable memtables into immutable memtables if /// the sampling memtable is freezed and returns None. - /// - /// Instead of replace the old memtable by a new memtable, we just move the - /// old memtable to immutable memtables and left mutable memtables - /// empty. New mutable memtable will be constructed via put request. - fn switch_memtables_or_suggest_duration(&mut self) -> Option { + fn suggest_duration(&mut self) -> Option { if let Some(v) = &mut self.sampling_mem { if !v.freezed { // Other memtable should be empty during sampling phase. @@ -304,15 +309,15 @@ impl MemTableView { } } - self.mutables.move_to_inmem(&mut self.immutables); - None } - fn freeze_sampling_memtable(&mut self) { + fn freeze_sampling_memtable(&mut self) -> Option { if let Some(v) = &mut self.sampling_mem { v.freezed = true; + return Some(v.mem.last_sequence()); } + None } /// Returns memtables need to be flushed. Only sampling memtable and @@ -414,13 +419,20 @@ impl MutableMemTableSet { } /// Move all mutable memtables to immutable memtables. - fn move_to_inmem(&mut self, immem: &mut ImmutableMemTableSet) { - for m in self.0.values() { - let state = m.clone(); + fn move_to_inmem(&mut self, immem: &mut ImmutableMemTableSet) -> Option { + let last_seq = self + .0 + .values() + .map(|m| { + let last_sequence = m.mem.last_sequence(); + immem.0.insert(m.id, m.clone()); + + last_sequence + }) + .max(); - immem.0.insert(m.id, state); - } self.0.clear(); + last_seq } fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) { @@ -568,29 +580,29 @@ impl TableVersion { .total_memory_usage() } - /// Switch all mutable memtables or just return the suggested segment - /// duration if sampling memtable is still active. - /// - /// Returns a duration if a sampled segment duration needs to be persisted. + /// Return the suggested segment duration if sampling memtable is still + /// active. + pub fn suggest_duration(&self) -> Option { + self.inner.write().unwrap().memtable_view.suggest_duration() + } + + /// Switch all mutable memtables /// - /// REQUIRE: Do in write worker - pub fn switch_memtables_or_suggest_duration(&self) -> Option { - self.inner - .write() - .unwrap() - .memtable_view - .switch_memtables_or_suggest_duration() + /// Returns the maxium `SequenceNumber` in the mutable memtables needs to be + /// freezed. + pub fn switch_memtables(&self) -> Option { + self.inner.write().unwrap().memtable_view.switch_memtables() } /// Stop timestamp sampling and freezed the sampling memtable. /// /// REQUIRE: Do in write worker - pub fn freeze_sampling(&self) { + pub fn freeze_sampling_memtable(&self) -> Option { self.inner .write() .unwrap() .memtable_view - .freeze_sampling_memtable(); + .freeze_sampling_memtable() } /// See [MemTableView::pick_memtables_to_flush] @@ -727,9 +739,9 @@ impl TableVersion { picker_ctx: PickerContext, picker: &CompactionPickerRef, ) -> picker::Result { - let inner = self.inner.read().unwrap(); + let mut inner = self.inner.write().unwrap(); - picker.pick_compaction(picker_ctx, &inner.levels_controller) + picker.pick_compaction(picker_ctx, &mut inner.levels_controller) } pub fn has_expired_sst(&self, expire_time: Option) -> bool { @@ -870,7 +882,8 @@ mod tests { assert!(mutable.is_none()); // Nothing to switch. 
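`MutableMemTableSet::move_to_inmem` above now returns the largest `last_sequence` among the memtables it moves into the immutable set, which is what lets `switch_memtables` report how far the WAL can be considered frozen. A stand-alone sketch of that pattern with stub types (the real memtable types carry much more state than these placeholders):

```rust
use std::collections::BTreeMap;

type MemTableId = u64;
type SequenceNumber = u64;

#[derive(Clone)]
struct MemTable {
    last_sequence: SequenceNumber,
}

/// Move all mutable memtables into the immutable set and return the largest
/// sequence number among them, mirroring the `map(..).max()` shape used above.
fn move_to_inmem(
    mutables: &mut BTreeMap<MemTableId, MemTable>,
    immutables: &mut BTreeMap<MemTableId, MemTable>,
) -> Option<SequenceNumber> {
    let last_seq = mutables
        .iter()
        .map(|(id, m)| {
            immutables.insert(*id, m.clone());
            m.last_sequence
        })
        .max();

    mutables.clear();
    last_seq
}

fn main() {
    let mut mutables = BTreeMap::from([
        (1, MemTable { last_sequence: 7 }),
        (2, MemTable { last_sequence: 9 }),
    ]);
    let mut immutables = BTreeMap::new();

    assert_eq!(move_to_inmem(&mut mutables, &mut immutables), Some(9));
    assert!(mutables.is_empty());
    assert_eq!(immutables.len(), 2);
}
```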
- assert!(version.switch_memtables_or_suggest_duration().is_none()); + assert!(version.suggest_duration().is_none()); + assert!(version.switch_memtables().is_none()); } fn check_flushable_mem_with_sampling( @@ -936,8 +949,9 @@ mod tests { version.set_sampling(sampling_mem); - let duration = version.switch_memtables_or_suggest_duration().unwrap(); + let duration = version.suggest_duration().unwrap(); assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + assert!(version.switch_memtables().is_none()); // Flushable memtables only contains sampling memtable. let flushable_mems = version.pick_memtables_to_flush(last_sequence); @@ -954,9 +968,11 @@ mod tests { assert_eq!(memtable_id, actual_memtable.id); // Switch still return duration before freezed. - let duration = version.switch_memtables_or_suggest_duration().unwrap(); + let duration = version.suggest_duration().unwrap(); assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + assert!(version.switch_memtables().is_none()); + version.switch_memtables(); // Flushable memtables only contains sampling memtable before sampling // memtable is freezed. let flushable_mems = version.pick_memtables_to_flush(last_sequence); @@ -977,11 +993,11 @@ mod tests { version.set_sampling(sampling_mem); assert_eq!( table_options::DEFAULT_SEGMENT_DURATION, - version.switch_memtables_or_suggest_duration().unwrap() + version.suggest_duration().unwrap() ); - + assert!(version.switch_memtables().is_none()); // Freeze the sampling memtable. - version.freeze_sampling(); + version.freeze_sampling_memtable(); // No memtable after switch and freezed. let now = Timestamp::now(); @@ -1029,7 +1045,8 @@ mod tests { assert_eq!(memtable_id2, read_view.memtables[0].id); // Switch mutable memtable. - assert!(version.switch_memtables_or_suggest_duration().is_none()); + assert!(version.suggest_duration().is_none()); + assert!(version.switch_memtables().is_some()); // No memtable after switch. let now = Timestamp::now(); assert!(version @@ -1055,7 +1072,7 @@ mod tests { // Prepare sampling memtable. version.set_sampling(sampling_mem); - version.freeze_sampling(); + version.freeze_sampling_memtable(); let now = Timestamp::now(); let time_range = @@ -1073,8 +1090,8 @@ mod tests { version.insert_mutable(mem_state); // Switch memtable. 
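Stepping back to the `should_flush_table` change in `analytic_engine/src/table/data.rs` earlier in this patch: the two log messages now name the trigger that fired. A simplified sketch of that decision is below; the mutable-limit branch follows the hunk, while the exact total-usage condition is not visible there, so the second branch is an assumption:

```rust
/// Illustrative flush trigger: flush when the mutable memtables alone exceed
/// their limit (unless a flush is already running), or when the total memtable
/// memory reaches the max write buffer size.
fn should_flush(
    mutable_usage: usize,
    total_usage: usize,
    mutable_limit: usize,
    max_write_buffer_size: usize,
    in_flush: bool,
) -> bool {
    if mutable_usage > mutable_limit && !in_flush {
        // "should flush by mutable limit"
        return true;
    }

    // "should flush by total usage" (assumed threshold check).
    max_write_buffer_size > 0 && total_usage >= max_write_buffer_size
}

fn main() {
    let mb = 1 << 20;
    assert!(should_flush(64 * mb, 80 * mb, 32 * mb, 256 * mb, false));
    assert!(!should_flush(64 * mb, 80 * mb, 32 * mb, 256 * mb, true));
    assert!(should_flush(16 * mb, 300 * mb, 32 * mb, 256 * mb, true));
}
```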
- assert!(version.switch_memtables_or_suggest_duration().is_none()); - + assert!(version.suggest_duration().is_none()); + assert!(version.switch_memtables().is_some()); let max_sequence = 120; let file_id = 13; let add_file = AddFileMocker::new(file_id) diff --git a/analytic_engine/src/tests/alter_test.rs b/analytic_engine/src/tests/alter_test.rs index 614cab7541..c6f4b08eec 100644 --- a/analytic_engine/src/tests/alter_test.rs +++ b/analytic_engine/src/tests/alter_test.rs @@ -20,24 +20,25 @@ use crate::{ tests::{ row_util, table::{self, FixedSchemaTable}, - util::{ - EngineBuildContext, MemoryEngineBuildContext, Null, RocksDBEngineBuildContext, - TestContext, TestEnv, - }, + util::{memory_ctxs, rocksdb_ctxs, EngineBuildContext, Null, TestContext, TestEnv}, }, }; #[test] fn test_alter_table_add_column_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_table_add_column(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_alter_table_add_column(ctx); + } } #[ignore = "Enable this test when manifest use another snapshot implementation"] #[test] fn test_alter_table_add_column_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_table_add_column(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_table_add_column(ctx); + } } fn test_alter_table_add_column(engine_context: T) { @@ -370,15 +371,19 @@ async fn check_read_row_group( #[test] fn test_alter_table_options_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_table_options(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_alter_table_options(ctx); + } } #[ignore = "Enable this test when manifest use another snapshot implementation"] #[test] fn test_alter_table_options_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_table_options(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_table_options(ctx); + } } fn test_alter_table_options(engine_context: T) { diff --git a/analytic_engine/src/tests/drop_test.rs b/analytic_engine/src/tests/drop_test.rs index 5dd0be033a..c915ae1482 100644 --- a/analytic_engine/src/tests/drop_test.rs +++ b/analytic_engine/src/tests/drop_test.rs @@ -10,7 +10,8 @@ use table_engine::table::AlterSchemaRequest; use crate::tests::{ table::FixedSchemaTable, util::{ - self, EngineBuildContext, MemoryEngineBuildContext, RocksDBEngineBuildContext, TestEnv, + self, memory_ctxs, rocksdb_ctxs, EngineBuildContext, MemoryEngineBuildContext, + RocksDBEngineBuildContext, TestEnv, }, }; @@ -209,14 +210,18 @@ fn test_drop_create_same_table_case(flush: bool, engine_c #[test] fn test_drop_create_same_table_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_drop_create_same_table(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_drop_create_same_table(ctx); + } } #[test] fn test_drop_create_same_table_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_drop_create_same_table(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_drop_create_same_table(ctx); + } } fn test_drop_create_same_table(engine_context: T) { @@ -227,14 +232,18 @@ fn test_drop_create_same_table(engine_context: T) { #[test] fn test_alter_schema_drop_create_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_schema_drop_create(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); 
+ for ctx in rocksdb_ctxs { + test_alter_schema_drop_create(ctx); + } } #[test] fn test_alter_schema_drop_create_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_schema_drop_create(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_schema_drop_create(ctx); + } } fn test_alter_schema_drop_create(engine_context: T) { @@ -284,14 +293,18 @@ fn test_alter_schema_drop_create(engine_context: T) { #[test] fn test_alter_options_drop_create_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_options_drop_create(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_alter_options_drop_create(ctx); + } } #[test] fn test_alter_options_drop_create_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_options_drop_create(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_options_drop_create(ctx); + } } fn test_alter_options_drop_create(engine_context: T) { diff --git a/analytic_engine/src/tests/read_write_test.rs b/analytic_engine/src/tests/read_write_test.rs index 783f46aa42..30a4f3a980 100644 --- a/analytic_engine/src/tests/read_write_test.rs +++ b/analytic_engine/src/tests/read_write_test.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Read write test. @@ -11,22 +11,23 @@ use table_engine::table::ReadOrder; use crate::{ setup::WalsOpener, table_options, - tests::util::{ - self, EngineBuildContext, MemoryEngineBuildContext, RocksDBEngineBuildContext, TestContext, - TestEnv, - }, + tests::util::{self, memory_ctxs, rocksdb_ctxs, EngineBuildContext, TestContext, TestEnv}, }; #[test] fn test_multi_table_read_write_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_multi_table_read_write(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_multi_table_read_write(ctx); + } } #[test] fn test_multi_table_read_write_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_multi_table_read_write(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_multi_table_read_write(ctx); + } } fn test_multi_table_read_write(engine_context: T) { @@ -171,14 +172,18 @@ fn test_multi_table_read_write(engine_context: T) { #[test] fn test_table_write_read_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_read(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_read(ctx); + } } #[test] fn test_table_write_read_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_read(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_read(ctx); + } } fn test_table_write_read(engine_context: T) { @@ -192,7 +197,7 @@ fn test_table_write_read(engine_context: T) { let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; let start_ms = test_ctx.start_ms(); - let rows = [ + let rows: [(&str, Timestamp, &str, f64, f64, &str); 3] = [ ( "key1", Timestamp::new(start_ms), @@ -250,14 +255,18 @@ fn test_table_write_read(engine_context: T) { #[test] fn test_table_write_get_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_get(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_get(ctx); + 
} } #[test] fn test_table_write_get_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_get(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_get(ctx); + } } fn test_table_write_get(engine_context: T) { @@ -327,22 +336,28 @@ fn test_table_write_get(engine_context: T) { #[test] fn test_table_write_get_override_rocks() { - test_table_write_get_override::(); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_get_override(ctx); + } } #[test] fn test_table_write_get_override_mem_wal() { - test_table_write_get_override::(); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_get_override(ctx); + } } -fn test_table_write_get_override() { - test_table_write_get_override_case::(FlushPoint::NoFlush, T::default()); +fn test_table_write_get_override(engine_context: T) { + test_table_write_get_override_case::(FlushPoint::NoFlush, engine_context.clone()); - test_table_write_get_override_case::(FlushPoint::AfterFirstWrite, T::default()); + test_table_write_get_override_case::(FlushPoint::AfterFirstWrite, engine_context.clone()); - test_table_write_get_override_case::(FlushPoint::AfterOverwrite, T::default()); + test_table_write_get_override_case::(FlushPoint::AfterOverwrite, engine_context.clone()); - test_table_write_get_override_case::(FlushPoint::FirstAndOverwrite, T::default()); + test_table_write_get_override_case::(FlushPoint::FirstAndOverwrite, engine_context); } #[derive(Debug)] @@ -506,16 +521,20 @@ fn test_table_write_get_override_case( #[test] fn test_db_write_buffer_size_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_db_write_buffer_size("test_db_write_buffer_size_rocks", rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + // Use different table name to avoid metrics collision. + test_db_write_buffer_size("test_db_write_buffer_size_rocks", ctx); + } } #[test] fn test_db_write_buffer_size_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_db_write_buffer_size("test_db_write_buffer_size_mem_wal", memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + // Use different table name to avoid metrics collision. + test_db_write_buffer_size("test_db_write_buffer_size_mem_wal", ctx); + } } fn test_db_write_buffer_size(table_name: &str, engine_context: T) { @@ -527,16 +546,20 @@ fn test_db_write_buffer_size(table_name: &str, engine_con #[test] fn test_space_write_buffer_size_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_space_write_buffer_size("test_space_write_buffer_size_rocks", rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + // Use different table name to avoid metrics collision. + test_space_write_buffer_size("test_space_write_buffer_size_rocks", ctx); + } } #[test] fn test_space_write_buffer_size_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_space_write_buffer_size("test_space_write_buffer_size_mem_wal", memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + // Use different table name to avoid metrics collision. 
+ test_space_write_buffer_size("test_space_write_buffer_size_mem_wal", ctx); + } } fn test_space_write_buffer_size(table_name: &str, engine_context: T) { @@ -623,9 +646,6 @@ fn test_write_buffer_size_overflow( rows.extend_from_slice(&rows1); rows.extend_from_slice(&rows2); - // TODO(boyan) a better way to wait table flushing finishes. - thread::sleep(time::Duration::from_millis(500)); - // Read with different opts. util::check_read( &test_ctx, @@ -636,9 +656,13 @@ fn test_write_buffer_size_overflow( ) .await; + // TODO(lee) a better way to wait table flushing finishes. + thread::sleep(time::Duration::from_millis(500)); + let stats = table.stats(); assert_eq!(old_stats.num_read + 5, stats.num_read); assert_eq!(old_stats.num_write + 2, stats.num_write); + // Flush when reaches (db/space) write_buffer size limitation. assert_eq!(old_stats.num_flush + 1, stats.num_flush); @@ -660,14 +684,18 @@ fn test_write_buffer_size_overflow( #[test] fn test_table_write_read_reverse_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_read_reverse(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_read_reverse(ctx); + } } #[test] fn test_table_write_read_reverse_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_read_reverse(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_read_reverse(ctx); + } } fn test_table_write_read_reverse(engine_context: T) { @@ -746,15 +774,19 @@ fn test_table_write_read_reverse(engine_context: T) { #[test] #[ignore = "https://github.com/CeresDB/ceresdb/issues/313"] fn test_table_write_read_reverse_after_flush_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_read_reverse_after_flush(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_read_reverse_after_flush(ctx); + } } #[test] #[ignore = "https://github.com/CeresDB/ceresdb/issues/313"] fn test_table_write_read_reverse_after_flush_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_read_reverse_after_flush(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_read_reverse_after_flush(ctx); + } } fn test_table_write_read_reverse_after_flush(engine_context: T) { diff --git a/analytic_engine/src/tests/util.rs b/analytic_engine/src/tests/util.rs index 0e2c897ecc..0cc8fb94e3 100644 --- a/analytic_engine/src/tests/util.rs +++ b/analytic_engine/src/tests/util.rs @@ -8,7 +8,7 @@ use common_types::{ datum::Datum, record_batch::RecordBatch, row::{Row, RowGroup}, - table::DEFAULT_SHARD_ID, + table::{ShardId, DEFAULT_SHARD_ID}, time::Timestamp, }; use common_util::{ @@ -20,8 +20,8 @@ use log::info; use object_store::config::{LocalOptions, ObjectStoreOptions, StorageOptions}; use table_engine::{ engine::{ - CreateTableRequest, DropTableRequest, EngineRuntimes, OpenTableRequest, - Result as EngineResult, TableEngineRef, + CreateTableRequest, DropTableRequest, EngineRuntimes, OpenShardRequest, OpenTableRequest, + Result as EngineResult, TableDef, TableEngineRef, }, table::{ AlterSchemaRequest, FlushRequest, GetRequest, ReadOrder, ReadRequest, Result, SchemaId, @@ -33,7 +33,7 @@ use tempfile::TempDir; use crate::{ setup::{EngineBuilder, MemWalsOpener, OpenedWals, RocksDBWalsOpener, WalsOpener}, tests::table::{self, FixedSchemaTable, RowTuple}, - Config, RocksDBConfig, WalStorageConfig, + Config, RecoverMode, RocksDBConfig, 
WalStorageConfig, }; const DAY_MS: i64 = 24 * 60 * 60 * 1000; @@ -113,6 +113,7 @@ pub struct TestContext { opened_wals: Option, schema_id: SchemaId, last_table_seq: u32, + open_method: OpenTablesMethod, name_to_tables: HashMap, } @@ -169,8 +170,69 @@ impl TestContext { self.open().await; - for (id, name) in table_infos { - self.open_table(id, name).await; + match self.open_method { + OpenTablesMethod::WithOpenTable => { + for (id, name) in table_infos { + self.open_table(id, name).await; + } + } + OpenTablesMethod::WithOpenShard => { + self.open_tables_of_shard(table_infos, DEFAULT_SHARD_ID) + .await; + } + } + } + + pub async fn reopen_with_tables_of_shard(&mut self, tables: &[&str], shard_id: ShardId) { + let table_infos: Vec<_> = tables + .iter() + .map(|name| { + let table_id = self.name_to_tables.get(*name).unwrap().id(); + (table_id, *name) + }) + .collect(); + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. + let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + + self.open_tables_of_shard(table_infos, shard_id).await + } + + async fn open_tables_of_shard(&mut self, table_infos: Vec<(TableId, &str)>, shard_id: ShardId) { + let table_defs = table_infos + .into_iter() + .map(|table| TableDef { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + schema_id: self.schema_id, + id: table.0, + name: table.1.to_string(), + }) + .collect(); + + let open_shard_request = OpenShardRequest { + shard_id, + table_defs, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }; + + let tables = self + .engine() + .open_shard(open_shard_request) + .await + .unwrap() + .into_values() + .map(|result| result.unwrap().unwrap()); + + for table in tables { + self.name_to_tables.insert(table.name().to_string(), table); } } @@ -368,6 +430,12 @@ impl TestContext { } } +#[derive(Clone, Copy)] +pub enum OpenTablesMethod { + WithOpenTable, + WithOpenShard, +} + impl TestContext { pub fn config_mut(&mut self) -> &mut Config { &mut self.config @@ -405,6 +473,7 @@ impl TestEnv { schema_id: SchemaId::from_u32(100), last_table_seq: 1, name_to_tables: HashMap::new(), + open_method: build_context.open_method(), } } @@ -474,10 +543,22 @@ pub trait EngineBuildContext: Clone + Default { fn wals_opener(&self) -> Self::WalsOpener; fn config(&self) -> Config; + fn open_method(&self) -> OpenTablesMethod; } pub struct RocksDBEngineBuildContext { config: Config, + open_method: OpenTablesMethod, +} + +impl RocksDBEngineBuildContext { + pub fn new(mode: RecoverMode, open_method: OpenTablesMethod) -> Self { + let mut context = Self::default(); + context.config.recover_mode = mode; + context.open_method = open_method; + + context + } } impl Default for RocksDBEngineBuildContext { @@ -504,7 +585,10 @@ impl Default for RocksDBEngineBuildContext { ..Default::default() }; - Self { config } + Self { + config, + open_method: OpenTablesMethod::WithOpenTable, + } } } @@ -531,7 +615,10 @@ impl Clone for RocksDBEngineBuildContext { ..Default::default() })); - Self { config } + Self { + config, + open_method: self.open_method, + } } } @@ -545,11 +632,26 @@ impl EngineBuildContext for RocksDBEngineBuildContext { fn config(&self) -> Config { self.config.clone() } + + fn open_method(&self) -> OpenTablesMethod { + self.open_method + } } #[derive(Clone)] pub struct MemoryEngineBuildContext { config: Config, + open_method: OpenTablesMethod, +} + +impl MemoryEngineBuildContext { + pub fn new(mode: RecoverMode, open_method: OpenTablesMethod) 
-> Self { + let mut context = Self::default(); + context.config.recover_mode = mode; + context.open_method = open_method; + + context + } } impl Default for MemoryEngineBuildContext { @@ -572,7 +674,10 @@ impl Default for MemoryEngineBuildContext { ..Default::default() }; - Self { config } + Self { + config, + open_method: OpenTablesMethod::WithOpenTable, + } } } @@ -586,4 +691,26 @@ impl EngineBuildContext for MemoryEngineBuildContext { fn config(&self) -> Config { self.config.clone() } + + fn open_method(&self) -> OpenTablesMethod { + self.open_method + } +} + +pub fn rocksdb_ctxs() -> Vec { + vec![ + RocksDBEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenTable), + RocksDBEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenTable), + RocksDBEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenShard), + RocksDBEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenShard), + ] +} + +pub fn memory_ctxs() -> Vec { + vec![ + MemoryEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenTable), + MemoryEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenTable), + MemoryEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenShard), + MemoryEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenShard), + ] } diff --git a/common_types/src/column.rs b/common_types/src/column.rs index 4c09a84644..a2580a47c2 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -5,16 +5,17 @@ use std::sync::Arc; use arrow::{ array::{ - Array, ArrayBuilder, ArrayRef, BinaryArray, BinaryBuilder, BooleanArray, BooleanBuilder, - Date32Array as DateArray, Date32Builder as DateBuilder, Float32Array as FloatArray, - Float32Builder as FloatBuilder, Float64Array as DoubleArray, + Array, ArrayAccessor, ArrayBuilder, ArrayRef, BinaryArray, BinaryBuilder, BooleanArray, + BooleanBuilder, Date32Array as DateArray, Date32Builder as DateBuilder, DictionaryArray, + Float32Array as FloatArray, Float32Builder as FloatBuilder, Float64Array as DoubleArray, Float64Builder as DoubleBuilder, Int16Array, Int16Builder, Int32Array, Int32Builder, Int64Array, Int64Builder, Int8Array, Int8Builder, NullArray, StringArray, StringBuilder, - Time64NanosecondArray as TimeArray, Time64NanosecondBuilder as TimeBuilder, - TimestampMillisecondArray, TimestampMillisecondBuilder, UInt16Array, UInt16Builder, - UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, UInt8Array, UInt8Builder, + StringDictionaryBuilder, Time64NanosecondArray as TimeArray, + Time64NanosecondBuilder as TimeBuilder, TimestampMillisecondArray, + TimestampMillisecondBuilder, UInt16Array, UInt16Builder, UInt32Array, UInt32Builder, + UInt64Array, UInt64Builder, UInt8Array, UInt8Builder, }, - datatypes::{DataType, TimeUnit}, + datatypes::{DataType, Int32Type, TimeUnit}, error::ArrowError, }; use datafusion::physical_plan::{ @@ -142,6 +143,9 @@ pub struct VarbinaryColumn(BinaryArray); #[derive(Debug)] pub struct StringColumn(StringArray); +#[derive(Debug)] +pub struct StringDictionaryColumn(DictionaryArray); + #[derive(Debug)] pub struct DateColumn(DateArray); @@ -287,6 +291,55 @@ impl_column!( ); impl_column!(StringColumn, get_string_datum, get_string_datum_view); +impl StringDictionaryColumn { + #[doc = " Get datum by index."] + pub fn datum_opt(&self, index: usize) -> Option { + if index >= self.0.len() { + return None; + } + Some(self.datum(index)) + } + + pub fn datum_view_opt(&self, 
index: usize) -> Option { + if index >= self.0.len() { + return None; + } + Some(self.datum_view(index)) + } + + pub fn datum_view(&self, index: usize) -> DatumView { + if self.0.is_null(index) { + return DatumView::Null; + } + // TODO : Is this the efficient way? + DatumView::String(self.0.downcast_dict::().unwrap().value(index)) + } + + pub fn datum(&self, index: usize) -> Datum { + if self.0.is_null(index) { + return Datum::Null; + } + // TODO : Is this the efficient way? + Datum::String( + self.0 + .downcast_dict::() + .unwrap() + .value(index) + .into(), + ) + } + + #[inline] + pub fn num_rows(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } +} + macro_rules! impl_dedup { ($Column: ident) => { impl $Column { @@ -321,6 +374,30 @@ impl_dedup!(TimestampColumn); impl_dedup!(VarbinaryColumn); impl_dedup!(StringColumn); +impl StringDictionaryColumn { + #[doc = " If datum i is not equal to previous datum i - 1, mark `selected[i]` to"] + #[doc = " true."] + #[doc = ""] + #[doc = " The first datum is marked to true."] + #[doc = ""] + #[doc = " The size of selected must equal to the size of this column and"] + #[doc = " initialized to false."] + #[allow(clippy::float_cmp)] + pub fn dedup(&self, selected: &mut [bool]) { + if self.0.is_empty() { + return; + } + selected[0] = true; + for (i, v) in selected.iter_mut().enumerate().take(self.0.len()).skip(1) { + let current = self.0.key(i); + let prev = self.0.key(i - 1); + if current != prev { + *v = true; + } + } + } +} + macro_rules! impl_new_null { ($Column: ident, $Builder: ident) => { impl $Column { @@ -389,6 +466,36 @@ impl_from_array_and_slice!(TimestampColumn, TimestampMillisecondArray); impl_from_array_and_slice!(VarbinaryColumn, BinaryArray); impl_from_array_and_slice!(StringColumn, StringArray); +impl From> for StringDictionaryColumn { + fn from(array: DictionaryArray) -> Self { + Self(array) + } +} +impl From<&DictionaryArray> for StringDictionaryColumn { + fn from(array_ref: &DictionaryArray) -> Self { + let array_data = array_ref.into_data(); + let array = DictionaryArray::::from(array_data); + Self(array) + } +} +impl StringDictionaryColumn { + fn to_arrow_array(&self) -> DictionaryArray { + let array_data = self.0.clone().into_data(); + DictionaryArray::::from(array_data) + } + + #[doc = " Returns a zero-copy slice of this array with the indicated offset and"] + #[doc = " length."] + #[doc = ""] + #[doc = " Panics if offset with length is greater than column length."] + fn slice(&self, offset: usize, length: usize) -> Self { + let array_slice = self.0.slice(offset, length); + let array_data = array_slice.into_data(); + let array = DictionaryArray::::from(array_data); + Self(array) + } +} + macro_rules! impl_iter { ($Column: ident, $Value: ident) => { impl $Column { @@ -438,6 +545,19 @@ impl StringColumn { } } +impl StringDictionaryColumn { + /// Create a column that all values are null. + fn new_null(num_rows: usize) -> Self { + let mut builder = StringDictionaryBuilder::::new(); + for _ in 0..num_rows { + builder.append_null(); + } + let array = builder.finish(); + + Self(array) + } +} + macro_rules! impl_numeric_column { ($(($Kind: ident, $type: ty)), *) => { $( @@ -543,18 +663,21 @@ macro_rules! 
impl_column_block { impl ColumnBlock { pub fn datum_kind(&self) -> DatumKind { match self { + ColumnBlock::StringDictionary(_) => DatumKind::String, $(ColumnBlock::$Kind(_) => DatumKind::$Kind,)* } } pub fn datum_opt(&self, index: usize) -> Option { match self { + ColumnBlock::StringDictionary(col) => col.datum_opt(index), $(ColumnBlock::$Kind(col) => col.datum_opt(index),)* } } pub fn datum_view_opt(&self, index: usize) -> Option { match self { + ColumnBlock::StringDictionary(col) => col.datum_view_opt(index), $(ColumnBlock::$Kind(col) => col.datum_view_opt(index),)* } } @@ -562,6 +685,7 @@ macro_rules! impl_column_block { /// Panic if index is out fo bound. pub fn datum_view(&self, index: usize) -> DatumView { match self { + ColumnBlock::StringDictionary(col) => col.datum_view(index), $(ColumnBlock::$Kind(col) => col.datum_view(index),)* } } @@ -569,18 +693,21 @@ macro_rules! impl_column_block { /// Panic if index is out fo bound. pub fn datum(&self, index: usize) -> Datum { match self { + ColumnBlock::StringDictionary(col) => col.datum(index), $(ColumnBlock::$Kind(col) => col.datum(index),)* } } pub fn num_rows(&self) -> usize { match self { + ColumnBlock::StringDictionary(col) => col.num_rows(), $(ColumnBlock::$Kind(col) => col.num_rows(),)* } } pub fn to_arrow_array_ref(&self) -> ArrayRef { match self { + ColumnBlock::StringDictionary(col) => Arc::new(col.to_arrow_array()), $(ColumnBlock::$Kind(col) => Arc::new(col.to_arrow_array()),)* } } @@ -590,6 +717,7 @@ macro_rules! impl_column_block { /// The first datum is not marked to true. pub fn dedup(&self, selected: &mut [bool]) { match self { + ColumnBlock::StringDictionary(col) => col.dedup(selected), $(ColumnBlock::$Kind(col) => col.dedup(selected),)* } } @@ -600,6 +728,7 @@ macro_rules! impl_column_block { #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { match self { + ColumnBlock::StringDictionary(col) => ColumnBlock::StringDictionary(col.slice(offset, length)), $(ColumnBlock::$Kind(col) => ColumnBlock::$Kind(col.slice(offset, length)),)* } } @@ -612,6 +741,12 @@ macro_rules! impl_column_block { } } })* + + impl From for ColumnBlock { + fn from(column: StringDictionaryColumn) -> Self { + Self::StringDictionary(column) + } + } }; } @@ -628,6 +763,8 @@ macro_rules! define_column_block { #[derive(Debug)] pub enum ColumnBlock { Null(NullColumn), + StringDictionary(StringDictionaryColumn), + String(StringColumn), $( $Kind([<$Kind Column>]), )* @@ -635,8 +772,36 @@ macro_rules! define_column_block { impl ColumnBlock { pub fn try_from_arrow_array_ref(datum_kind: &DatumKind, array: &ArrayRef) -> Result { + let is_dictionary : bool = if let DataType::Dictionary(..) = array.data_type() { + true + } else { + false + }; let column = match datum_kind { DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(array.len())), + DatumKind::String => { + if !is_dictionary { + let mills_array; + let cast_column = match array.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + mills_array = cast_nanosecond_to_mills(array)?; + cast_array(datum_kind, &mills_array)? + } + _ => cast_array(datum_kind, array)?, + }; + ColumnBlock::String(StringColumn::from(cast_column)) + } else { + let mills_array; + let cast_column = match array.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + mills_array = cast_nanosecond_to_mills(array)?; + cast_array(datum_kind, &mills_array)? 
+ } + _ => cast_array(datum_kind, array)?, + }; + ColumnBlock::StringDictionary(StringDictionaryColumn::from(cast_column)) + } + }, $( DatumKind::$Kind => { let mills_array; @@ -657,9 +822,16 @@ macro_rules! define_column_block { Ok(column) } - pub fn new_null_with_type(kind: &DatumKind, rows: usize) -> Result { + pub fn new_null_with_type(kind: &DatumKind, rows: usize, is_dictionary: bool) -> Result { let block = match kind { DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(rows)), + DatumKind::String => { + if is_dictionary { + ColumnBlock::StringDictionary(StringDictionaryColumn::new_null(rows)) + }else { + ColumnBlock::String(StringColumn::new_null(rows)) + } + }, $( DatumKind::$Kind => ColumnBlock::$Kind([<$Kind Column>]::new_null(rows)), )* @@ -674,8 +846,8 @@ macro_rules! define_column_block { // Define column blocks, Null is defined explicitly in macro. define_column_block!( - Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, - Int16, Int8, Boolean, Date, Time + Timestamp, Double, Float, Varbinary, UInt64, UInt32, UInt16, UInt8, Int64, Int32, Int16, Int8, + Boolean, Date, Time ); impl ColumnBlock { @@ -796,7 +968,6 @@ macro_rules! append_block { macro_rules! define_column_block_builder { ($(($Kind: ident, $Builder: ident)), *) => { paste! { - #[derive(Debug)] pub enum ColumnBlockBuilder { Null { rows: usize }, Timestamp(TimestampMillisecondBuilder), @@ -804,6 +975,7 @@ macro_rules! define_column_block_builder { String(StringBuilder), Date(DateBuilder), Time(TimeBuilder), + Dictionary(StringDictionaryBuilder::), $( $Kind($Builder), )* @@ -811,13 +983,19 @@ macro_rules! define_column_block_builder { impl ColumnBlockBuilder { /// Create by data type with initial capacity - pub fn with_capacity(data_type: &DatumKind, item_capacity: usize) -> Self { + pub fn with_capacity(data_type: &DatumKind, item_capacity: usize, is_dictionary : bool) -> Self { match data_type { DatumKind::Null => Self::Null { rows: 0 }, DatumKind::Timestamp => Self::Timestamp(TimestampMillisecondBuilder::with_capacity(item_capacity)), // The data_capacity is set as 1024, because the item is variable-size type. DatumKind::Varbinary => Self::Varbinary(BinaryBuilder::with_capacity(item_capacity, 1024)), - DatumKind::String => Self::String(StringBuilder::with_capacity(item_capacity, 1024)), + DatumKind::String =>{ + if !is_dictionary{ + Self::String(StringBuilder::with_capacity(item_capacity, 1024)) + }else { + Self::Dictionary(StringDictionaryBuilder::::new()) + } + } DatumKind::Date => Self::Date(DateBuilder::with_capacity(item_capacity)), DatumKind::Time => Self::Time(TimeBuilder::with_capacity(item_capacity)), $( @@ -847,6 +1025,17 @@ macro_rules! define_column_block_builder { Self::String(builder) => append_datum!(String, builder, Datum, datum), Self::Date(builder) => append_datum!(Date, builder, Datum, datum), Self::Time(builder) => append_datum!(Time, builder, Datum, datum), + Self::Dictionary(builder) => { + match datum { + Datum::Null => Ok(builder.append_null()), + Datum::String(v) => Ok(builder.append_value(v)), + _ => ConflictType { + expect: DatumKind::String, + given: datum.kind(), + } + .fail() + } + }, $( Self::$Kind(builder) => append_datum!($Kind, builder, Datum, datum), )* @@ -874,6 +1063,17 @@ macro_rules! 
define_column_block_builder { Self::String(builder) => append_datum!(String, builder, DatumView, datum), Self::Date(builder) => append_datum!(Date, builder, DatumView, datum), Self::Time(builder) => append_datum!(Time, builder, DatumView, datum), + Self::Dictionary(builder) => { + match datum { + DatumView::Null => Ok(builder.append_null()), + DatumView::String(v) => Ok(builder.append_value(v)), + _ => ConflictType { + expect: DatumKind::String, + given: datum.kind(), + } + .fail() + } + }, $( Self::$Kind(builder) => append_datum!($Kind, builder, DatumView, datum), )* @@ -898,6 +1098,34 @@ macro_rules! define_column_block_builder { Self::String(builder) => append_block!(String, builder, ColumnBlock, block, start, len), Self::Date(builder) => append_block!(Date, builder, ColumnBlock, block, start, len), Self::Time(builder) => append_block!(Time, builder, ColumnBlock, block, start, len), + Self::Dictionary(builder) => { + match block { + ColumnBlock::Null(v) => { + let end = std::cmp::min(start + len, v.num_rows()); + for _ in start..end { + builder.append_null(); + } + Ok(()) + } + ColumnBlock::StringDictionary(v) => { + let end = std::cmp::min(start + len, v.num_rows()); + for i in start..end { + if v.0.is_null(i) { + builder.append_null(); + } else { + let value = v.datum(i); + builder.append_value(value.as_str().unwrap()); + } + } + Ok(()) + } + _ => ConflictType { + expect: DatumKind::String, + given: block.datum_kind(), + } + .fail(), + } + }, $( Self::$Kind(builder) => append_block!($Kind, builder, ColumnBlock, block, start, len), )* @@ -912,6 +1140,7 @@ macro_rules! define_column_block_builder { Self::String(builder) => builder.len(), Self::Date(builder) => builder.len(), Self::Time(builder) => builder.len(), + Self::Dictionary(builder) => builder.len(), $( Self::$Kind(builder) => builder.len(), )* @@ -931,6 +1160,9 @@ macro_rules! 
define_column_block_builder { Self::String(builder) => StringColumn::from(builder.finish()).into(), Self::Date(builder) => DateColumn::from(builder.finish()).into(), Self::Time(builder) => TimeColumn::from(builder.finish()).into(), + Self::Dictionary(builder) => { + StringDictionaryColumn::from(builder.finish()).into() + }, $( Self::$Kind(builder) => [<$Kind Column>]::from(builder.finish()).into(), )* @@ -959,8 +1191,8 @@ define_column_block_builder!( impl ColumnBlockBuilder { /// Create by data type - pub fn new(data_type: &DatumKind) -> Self { - Self::with_capacity(data_type, 0) + pub fn new(data_type: &DatumKind, is_dictionry: bool) -> Self { + Self::with_capacity(data_type, 0, is_dictionry) } pub fn is_empty(&self) -> bool { @@ -976,7 +1208,9 @@ impl ColumnBlockBuilder { #[cfg(test)] mod tests { use super::*; - use crate::tests::{build_rows, build_schema}; + use crate::tests::{ + build_row_for_dictionary, build_rows, build_schema, build_schema_for_dictionary, + }; #[test] fn test_column_block_builder() { @@ -984,7 +1218,7 @@ mod tests { let rows = build_rows(); // DatumKind::Varbinary let column = schema.column(0); - let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2, false); // append builder.append(rows[0][0].clone()).unwrap(); @@ -998,7 +1232,7 @@ mod tests { let column_block = builder.build(); assert_eq!(column_block.num_rows(), 2); - let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2, false); // append_block_range builder.append_block_range(&column_block, 0, 1).unwrap(); @@ -1015,4 +1249,65 @@ mod tests { Datum::Varbinary(Bytes::copy_from_slice(b"binary key1")) ); } + + #[test] + fn test_column_block_string_dictionary_builder() { + let schema = build_schema_for_dictionary(); + let rows = vec![ + build_row_for_dictionary(1, 1, Some("tag1_1"), "tag2_1", 1), + build_row_for_dictionary(2, 2, Some("tag1_2"), "tag2_2", 2), + build_row_for_dictionary(3, 3, Some("tag1_3"), "tag2_3", 3), + build_row_for_dictionary(4, 4, Some("tag1_1"), "tag2_4", 3), + build_row_for_dictionary(5, 5, Some("tag1_3"), "tag2_4", 4), + build_row_for_dictionary(6, 6, None, "tag2_4", 4), + ]; + // DatumKind::String , is_dictionary = true + let column = schema.column(2); + println!("{column:?}"); + let mut builder = + ColumnBlockBuilder::with_capacity(&column.data_type, 0, column.is_dictionary); + // append + (0..rows.len()).for_each(|i| builder.append(rows[i][2].clone()).unwrap()); + + let ret = builder.append(rows[0][0].clone()); + assert!(ret.is_err()); + + // append_view + builder.append_view(rows[5][2].as_view()).unwrap(); + let ret = builder.append_view(rows[1][0].as_view()); + + assert!(ret.is_err()); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 7); + let mut builder = + ColumnBlockBuilder::with_capacity(&column.data_type, 2, column.is_dictionary); + + // append_block_range + (0..rows.len()).for_each(|i| builder.append_block_range(&column_block, i, 1).unwrap()); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 6); + assert_eq!( + column_block.datum(0), + Datum::String(StringBytes::from("tag1_1")) + ); + assert_eq!( + column_block.datum(1), + Datum::String(StringBytes::from("tag1_2")) + ); + assert_eq!( + column_block.datum(2), + Datum::String(StringBytes::from("tag1_3")) + ); + assert_eq!( + column_block.datum(3), + 
Datum::String(StringBytes::from("tag1_1")) + ); + assert_eq!( + column_block.datum(4), + Datum::String(StringBytes::from("tag1_3")) + ); + assert_eq!(column_block.datum(5), Datum::Null); + } } diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 2f1a48cbbd..6deaefa5c4 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -29,6 +29,12 @@ pub enum Error { backtrace: Backtrace, }, + #[snafu(display("Invalid dictionary type:{}.\nBacktrace:\n{}", data_type, backtrace))] + InvalidDictionaryType { + data_type: DataType, + backtrace: Backtrace, + }, + #[snafu(display( "Arrow field meta data is missing, field name:{}.\nBacktrace:\n{}", field_name, @@ -119,6 +125,7 @@ pub enum ReadOp { struct ArrowFieldMeta { id: u32, is_tag: bool, + is_dictionary: bool, comment: String, } @@ -126,6 +133,7 @@ struct ArrowFieldMeta { pub enum ArrowFieldMetaKey { Id, IsTag, + IsDictionary, Comment, } @@ -134,6 +142,7 @@ impl ArrowFieldMetaKey { match self { ArrowFieldMetaKey::Id => "field::id", ArrowFieldMetaKey::IsTag => "field::is_tag", + ArrowFieldMetaKey::IsDictionary => "field::is_dictionary", ArrowFieldMetaKey::Comment => "field::comment", } } @@ -159,6 +168,8 @@ pub struct ColumnSchema { /// Is tag, tag is just a hint for a column, there is no restriction that a /// tag column must be a part of primary key pub is_tag: bool, + // Whether to use dictionary types for parquet store + pub is_dictionary: bool, /// Comment of the column pub comment: String, /// Column name in response @@ -191,6 +202,11 @@ impl ColumnSchema { } } + /// Check whether a type is valid dictionary type. + pub fn is_valid_dictionary_type(typ: DatumKind) -> bool { + matches!(typ, DatumKind::String) + } + /// Convert `self` to [`arrow::datatypes::Field`] pub fn to_arrow_field(&self) -> Field { From::from(self) @@ -273,6 +289,7 @@ impl TryFrom for ColumnSchema { data_type: DatumKind::from(data_type), is_nullable: column_schema.is_nullable, is_tag: column_schema.is_tag, + is_dictionary: column_schema.is_dictionary, comment: column_schema.comment, escaped_name, default_value, @@ -287,6 +304,7 @@ impl TryFrom<&Arc> for ColumnSchema { let ArrowFieldMeta { id, is_tag, + is_dictionary, comment, } = decode_arrow_field_meta_data(field.metadata())?; Ok(Self { @@ -299,6 +317,7 @@ impl TryFrom<&Arc> for ColumnSchema { )?, is_nullable: field.is_nullable(), is_tag, + is_dictionary, comment, escaped_name: field.name().escape_debug().to_string(), default_value: None, @@ -309,11 +328,24 @@ impl TryFrom<&Arc> for ColumnSchema { impl From<&ColumnSchema> for Field { fn from(col_schema: &ColumnSchema) -> Self { let metadata = encode_arrow_field_meta_data(col_schema); - let mut field = Field::new( - &col_schema.name, - col_schema.data_type.into(), - col_schema.is_nullable, - ); + // If the column sholud use dictionary, create correspond dictionary type. 
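The new `StringDictionaryColumn` wraps an arrow `DictionaryArray<Int32Type>`: values are appended through `StringDictionaryBuilder` and read back through `downcast_dict::<StringArray>()`, which is exactly what the `datum`/`datum_view` methods above do. A small stand-alone round trip using the same arrow APIs this patch relies on (only the literal tag values are made up):

```rust
use arrow::{
    array::{Array, ArrayAccessor, DictionaryArray, StringArray, StringDictionaryBuilder},
    datatypes::Int32Type,
};

fn main() {
    // Build a dictionary-encoded string column: repeated tag values share one
    // dictionary entry and each row only stores an Int32 key.
    let mut builder = StringDictionaryBuilder::<Int32Type>::new();
    builder.append_value("tag1_1");
    builder.append_value("tag1_2");
    builder.append_value("tag1_1"); // reuses the key of the first value
    builder.append_null();

    let array: DictionaryArray<Int32Type> = builder.finish();
    assert_eq!(array.len(), 4);
    // Only two distinct strings end up in the dictionary values.
    assert_eq!(array.values().len(), 2);

    // Read back: view the keys through the typed values array, as
    // `StringDictionaryColumn::datum` does.
    let typed = array.downcast_dict::<StringArray>().unwrap();
    assert_eq!(typed.value(0), "tag1_1");
    assert_eq!(typed.value(2), "tag1_1");
    assert!(array.is_null(3));
}
```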
+ let mut field = if col_schema.is_dictionary { + Field::new_dict( + &col_schema.name, + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + col_schema.is_nullable, + col_schema.id.into(), + false, + // Todo how to use dict_is_ordered + ) + } else { + Field::new( + &col_schema.name, + col_schema.data_type.into(), + col_schema.is_nullable, + ) + }; + field.set_metadata(metadata); field @@ -343,6 +375,7 @@ fn decode_arrow_field_meta_data(meta: &HashMap) -> Result HashMap, } @@ -385,6 +423,7 @@ impl Builder { data_type, is_nullable: false, is_tag: false, + is_dictionary: false, comment: String::new(), default_value: None, } @@ -407,6 +446,12 @@ impl Builder { self } + /// Set this column is dictionary, default is false (not a dictionary). + pub fn is_dictionary(mut self, is_dictionary: bool) -> Self { + self.is_dictionary = is_dictionary; + self + } + pub fn comment(mut self, comment: String) -> Self { self.comment = comment; self @@ -427,6 +472,15 @@ impl Builder { ); } + if self.is_dictionary { + ensure!( + ColumnSchema::is_valid_dictionary_type(self.data_type), + InvalidDictionaryType { + data_type: self.data_type + } + ); + } + Ok(()) } @@ -439,6 +493,7 @@ impl Builder { data_type: self.data_type, is_nullable: self.is_nullable, is_tag: self.is_tag, + is_dictionary: self.is_dictionary, comment: self.comment, escaped_name, default_value: self.default_value, @@ -460,6 +515,7 @@ impl From for schema_pb::ColumnSchema { is_nullable: src.is_nullable, id: src.id, is_tag: src.is_tag, + is_dictionary: src.is_dictionary, comment: src.comment, default_value, } @@ -475,10 +531,11 @@ mod tests { /// Create a column schema for test, each field is filled with non-default /// value fn new_test_column_schema() -> ColumnSchema { - Builder::new("test_column_schema".to_string(), DatumKind::Boolean) + Builder::new("test_column_schema".to_string(), DatumKind::String) .id(18) .is_nullable(true) .is_tag(true) + .is_dictionary(true) .comment("Comment of this column".to_string()) .default_value(Some(Expr::Value(Value::Boolean(true)))) .build() @@ -491,9 +548,10 @@ mod tests { let rhs = ColumnSchema { id: 18, name: "test_column_schema".to_string(), - data_type: DatumKind::Boolean, + data_type: DatumKind::String, is_nullable: true, is_tag: true, + is_dictionary: true, comment: "Comment of this column".to_string(), escaped_name: "test_column_schema".escape_debug().to_string(), default_value: Some(Expr::Value(Value::Boolean(true))), @@ -508,6 +566,8 @@ mod tests { let pb_schema = schema_pb::ColumnSchema::from(column_schema.clone()); // Check pb specific fields assert!(pb_schema.is_tag); + assert!(pb_schema.is_dictionary); + assert!(pb_schema.is_nullable); let schema_from_pb = ColumnSchema::try_from(pb_schema).unwrap(); assert_eq!(&schema_from_pb, &column_schema); @@ -524,4 +584,16 @@ mod tests { ); } } + + #[test] + fn test_valid_dictionary_type() { + let valid_dictionary_types = vec![DatumKind::String]; + + for v in &DatumKind::VALUES { + assert_eq!( + ColumnSchema::is_valid_dictionary_type(*v), + valid_dictionary_types.contains(v) + ); + } + } } diff --git a/common_types/src/datum.rs b/common_types/src/datum.rs index 6f01baa8d8..d0618a1880 100644 --- a/common_types/src/datum.rs +++ b/common_types/src/datum.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! 
Datum holds different kind of data @@ -8,78 +8,78 @@ use arrow::temporal_conversions::{EPOCH_DAYS_FROM_CE, NANOSECONDS}; use ceresdbproto::schema::DataType as DataTypePb; use chrono::{Datelike, Local, NaiveDate, NaiveTime, TimeZone, Timelike}; use serde::ser::{Serialize, Serializer}; -use snafu::{Backtrace, ResultExt, Snafu}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; use sqlparser::ast::{DataType as SqlDataType, Value}; -use crate::{bytes::Bytes, hash::hash64, string::StringBytes, time::Timestamp}; +use crate::{bytes::Bytes, hash::hash64, hex, string::StringBytes, time::Timestamp}; const DATE_FORMAT: &str = "%Y-%m-%d"; const TIME_FORMAT: &str = "%H:%M:%S%.3f"; #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display( - "Unsupported SQL data type, type:{}.\nBacktrace:\n{}", - sql_type, - backtrace - ))] + #[snafu(display("Unsupported SQL data type, type:{sql_type}.\nBacktrace:\n{backtrace}"))] UnsupportedDataType { sql_type: SqlDataType, backtrace: Backtrace, }, - #[snafu(display("Invalid double or float, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid double or float, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidDouble { source: std::num::ParseFloatError, backtrace: Backtrace, }, #[snafu(display( - "Invalid insert value, kind:{}, value:{:?}.\nBacktrace:\n{}", - kind, - value, - backtrace + "Invalid insert value, kind:{kind}, value:{value:?}.\nBacktrace:\n{backtrace}" ))] InvalidValueType { kind: DatumKind, value: Value, backtrace: Backtrace, }, - #[snafu(display("Invalid timestamp, err:{}.\nBacktrace:\n{}", source, backtrace))] + + #[snafu(display("Invalid timestamp, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidTimestamp { source: std::num::ParseIntError, backtrace: Backtrace, }, - #[snafu(display("Invalid date, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid date, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidDate { source: chrono::ParseError, backtrace: Backtrace, }, - #[snafu(display("Invalid time, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid time, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidTimeCause { source: chrono::ParseError, backtrace: Backtrace, }, - #[snafu(display("Invalid time, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid time, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidTimeHourFormat { source: std::num::ParseIntError, backtrace: Backtrace, }, - #[snafu(display("Invalid time, err:{}", msg))] - InvalidTimeNoCause { msg: String }, + #[snafu(display("Invalid time, err:{msg}.\nBacktrace:\n{backtrace}"))] + InvalidTimeNoCause { msg: String, backtrace: Backtrace }, - #[snafu(display("Invalid integer, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid integer, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidInt { source: std::num::ParseIntError, backtrace: Backtrace, }, - #[snafu(display("Invalid datum byte, byte:{}.\nBacktrace:\n{}", value, backtrace))] + #[snafu(display("Invalid datum byte, byte:{value}.\nBacktrace:\n{backtrace}"))] InvalidDatumByte { value: u8, backtrace: Backtrace }, + + #[snafu(display("Invalid hex value, hex_val:{hex_val}.\nBacktrace:\n{backtrace}"))] + InvalidHexValue { + hex_val: String, + backtrace: Backtrace, + }, } pub type Result = std::result::Result; @@ -171,6 +171,11 @@ impl DatumKind { ) } + /// Can column of this datum kind used as dictionary encode column + pub fn is_dictionary_kind(&self) -> bool { + matches!(self, DatumKind::String) + } + pub fn unsign_kind(&self) -> Option 
{ match self { Self::Int64 | Self::UInt64 => Some(Self::UInt64), @@ -225,8 +230,8 @@ impl DatumKind { DatumKind::UInt8 => 1, DatumKind::Int64 => 8, DatumKind::Int32 => 4, - DatumKind::Int16 => 8, - DatumKind::Int8 => 8, + DatumKind::Int16 => 2, + DatumKind::Int8 => 1, DatumKind::Boolean => 1, DatumKind::Date => 4, DatumKind::Time => 8, @@ -749,6 +754,10 @@ impl Datum { (DatumKind::Varbinary, Value::DoubleQuotedString(s)) => { Ok(Datum::Varbinary(Bytes::from(s))) } + (DatumKind::Varbinary, Value::HexStringLiteral(s)) => { + let bytes = hex::try_decode(&s).context(InvalidHexValue { hex_val: s })?; + Ok(Datum::Varbinary(Bytes::from(bytes))) + } (DatumKind::String, Value::DoubleQuotedString(s)) => { Ok(Datum::String(StringBytes::from(s))) } @@ -847,6 +856,28 @@ impl Datum { Ok(Datum::Date(days)) } + pub fn size(&self) -> usize { + match self { + Datum::Null => 1, + Datum::Timestamp(_) => 8, + Datum::Double(_) => 8, + Datum::Float(_) => 4, + Datum::Varbinary(v) => v.len(), + Datum::String(v) => v.len(), + Datum::UInt64(_) => 8, + Datum::UInt32(_) => 4, + Datum::UInt16(_) => 2, + Datum::UInt8(_) => 1, + Datum::Int64(_) => 8, + Datum::Int32(_) => 4, + Datum::Int16(_) => 2, + Datum::Int8(_) => 1, + Datum::Boolean(_) => 1, + Datum::Date(_) => 4, + Datum::Time(_) => 8, + } + } + #[cfg(test)] pub fn as_view(&self) -> DatumView { match self { @@ -1112,6 +1143,7 @@ pub mod arrow_convert { DataType::Boolean => Some(Self::Boolean), DataType::Date32 => Some(Self::Date), DataType::Time64(TimeUnit::Nanosecond) => Some(Self::Time), + DataType::Dictionary(_, _) => Some(Self::String), DataType::Float16 | DataType::LargeUtf8 | DataType::LargeBinary @@ -1127,7 +1159,6 @@ pub mod arrow_convert { | DataType::Date64 | DataType::Interval(_) | DataType::Duration(_) - | DataType::Dictionary(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) | DataType::RunEndEncoded(_, _) @@ -1209,6 +1240,7 @@ pub mod arrow_convert { } ScalarValue::Date32(v) => v.map(Datum::Date), ScalarValue::Time64Nanosecond(v) => v.map(Datum::Time), + ScalarValue::Dictionary(_, literal) => Datum::from_scalar_value(literal), ScalarValue::List(_, _) | ScalarValue::Date64(_) | ScalarValue::Time32Second(_) @@ -1222,8 +1254,7 @@ pub mod arrow_convert { | ScalarValue::Struct(_, _) | ScalarValue::Decimal128(_, _, _) | ScalarValue::Null - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Dictionary(_, _) => None, + | ScalarValue::IntervalMonthDayNano(_) => None, } } } @@ -1255,6 +1286,7 @@ pub mod arrow_convert { ScalarValue::TimestampMillisecond(v, _) => { v.map(|v| DatumView::Timestamp(Timestamp::new(v))) } + ScalarValue::Dictionary(_, literal) => DatumView::from_scalar_value(literal), ScalarValue::List(_, _) | ScalarValue::Date64(_) | ScalarValue::Time32Second(_) @@ -1268,8 +1300,7 @@ pub mod arrow_convert { | ScalarValue::Struct(_, _) | ScalarValue::Decimal128(_, _, _) | ScalarValue::Null - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Dictionary(_, _) => None, + | ScalarValue::IntervalMonthDayNano(_) => None, } } } @@ -1478,4 +1509,49 @@ mod tests { assert!(Datum::parse_datum_time_from_str(source).is_err()); } } + + #[test] + fn test_convert_from_sql_value() { + let cases = vec![ + ( + Value::Boolean(false), + DatumKind::Boolean, + true, + Some(Datum::Boolean(false)), + ), + ( + Value::Number("100.1".to_string(), false), + DatumKind::Float, + true, + Some(Datum::Float(100.1)), + ), + ( + Value::SingleQuotedString("string_literal".to_string()), + DatumKind::String, + true, + 
Some(Datum::String(StringBytes::from_static("string_literal"))), + ), + ( + Value::HexStringLiteral("c70a0b".to_string()), + DatumKind::Varbinary, + true, + Some(Datum::Varbinary(Bytes::from(vec![199, 10, 11]))), + ), + ( + Value::EscapedStringLiteral("string_literal".to_string()), + DatumKind::String, + false, + None, + ), + ]; + + for (input, kind, succeed, expect) in cases { + let res = Datum::try_from_sql_value(&kind, input); + if succeed { + assert_eq!(res.unwrap(), expect.unwrap()); + } else { + assert!(res.is_err()); + } + } + } } diff --git a/common_types/src/hex.rs b/common_types/src/hex.rs new file mode 100644 index 0000000000..8ae595c562 --- /dev/null +++ b/common_types/src/hex.rs @@ -0,0 +1,64 @@ +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. + +// TODO: move this module to common_util package after remove the common_types +// from the dependencies of the common_util. + +/// Try to decode bytes from hex literal string. +/// +/// None will be returned if the input literal is hex-invalid. +pub fn try_decode(s: &str) -> Option> { + let hex_bytes = s.as_bytes(); + + let mut decoded_bytes = Vec::with_capacity((hex_bytes.len() + 1) / 2); + + let start_idx = hex_bytes.len() % 2; + if start_idx > 0 { + // The first byte is formed of only one char. + decoded_bytes.push(try_decode_hex_char(hex_bytes[0])?); + } + + for i in (start_idx..hex_bytes.len()).step_by(2) { + let high = try_decode_hex_char(hex_bytes[i])?; + let low = try_decode_hex_char(hex_bytes[i + 1])?; + decoded_bytes.push(high << 4 | low); + } + + Some(decoded_bytes) +} + +/// Try to decode a byte from a hex char. +/// +/// None will be returned if the input char is hex-invalid. +const fn try_decode_hex_char(c: u8) -> Option { + match c { + b'A'..=b'F' => Some(c - b'A' + 10), + b'a'..=b'f' => Some(c - b'a' + 10), + b'0'..=b'9' => Some(c - b'0'), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_decode_hex_literal() { + let cases = [ + ("", Some(vec![])), + ("FF00", Some(vec![255, 0])), + ("a00a", Some(vec![160, 10])), + ("FF0", Some(vec![15, 240])), + ("f", Some(vec![15])), + ("FF0X", None), + ("X0", None), + ("XX", None), + ("x", None), + ]; + + for (input, expect) in cases { + let output = try_decode(input); + assert_eq!(output, expect); + } + } +} diff --git a/common_types/src/lib.rs b/common_types/src/lib.rs index a796fb207c..0179b72613 100644 --- a/common_types/src/lib.rs +++ b/common_types/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! 
Contains common types @@ -9,6 +9,7 @@ pub mod column; pub mod column_schema; pub mod datum; pub mod hash; +pub mod hex; #[cfg(feature = "arrow")] pub mod projected_schema; #[cfg(feature = "arrow")] diff --git a/common_types/src/record_batch.rs b/common_types/src/record_batch.rs index a7a73c9381..fbfacd902b 100644 --- a/common_types/src/record_batch.rs +++ b/common_types/src/record_batch.rs @@ -318,7 +318,23 @@ fn cast_arrow_record_batch(source: ArrowRecordBatch) -> Result DataType::Timestamp(TimeUnit::Millisecond, None), field.is_nullable(), ), - _ => Field::new(field.name(), field.data_type().clone(), field.is_nullable()), + _ => { + let (dict_id, dict_is_ordered) = { + match field.data_type() { + DataType::Dictionary(_, _) => { + (field.dict_id().unwrap(), field.dict_is_ordered().unwrap()) + } + _ => (0, false), + } + }; + Field::new_dict( + field.name(), + field.data_type().clone(), + field.is_nullable(), + dict_id, + dict_is_ordered, + ) + } }; f.set_metadata(field.metadata().clone()); f @@ -477,7 +493,13 @@ impl RecordBatchWithKeyBuilder { let builders = schema_with_key .columns() .iter() - .map(|column_schema| ColumnBlockBuilder::with_capacity(&column_schema.data_type, 0)) + .map(|column_schema| { + ColumnBlockBuilder::with_capacity( + &column_schema.data_type, + 0, + column_schema.is_dictionary, + ) + }) .collect(); Self { schema_with_key, @@ -490,7 +512,11 @@ impl RecordBatchWithKeyBuilder { .columns() .iter() .map(|column_schema| { - ColumnBlockBuilder::with_capacity(&column_schema.data_type, capacity) + ColumnBlockBuilder::with_capacity( + &column_schema.data_type, + capacity, + column_schema.is_dictionary, + ) }) .collect(); Self { @@ -660,9 +686,12 @@ impl ArrowRecordBatchProjector { } None => { // Need to push row with specific type. - let null_block = - ColumnBlock::new_null_with_type(&column_schema.data_type, num_rows) - .context(CreateColumnBlock)?; + let null_block = ColumnBlock::new_null_with_type( + &column_schema.data_type, + num_rows, + column_schema.is_dictionary, + ) + .context(CreateColumnBlock)?; column_blocks.push(null_block); } } diff --git a/common_types/src/row/mod.rs b/common_types/src/row/mod.rs index 9fb93e56c3..b420275871 100644 --- a/common_types/src/row/mod.rs +++ b/common_types/src/row/mod.rs @@ -128,6 +128,10 @@ impl Row { self.cols[timestamp_index].as_timestamp() } + + pub fn size(&self) -> usize { + self.cols.iter().map(|col| col.size()).sum() + } } #[derive(Debug)] diff --git a/common_types/src/schema.rs b/common_types/src/schema.rs index c5db4d0636..d94f10e52a 100644 --- a/common_types/src/schema.rs +++ b/common_types/src/schema.rs @@ -690,7 +690,7 @@ impl Schema { self.column_schemas.num_columns() } - /// Returns true if idx is primary key idnex + /// Returns true if idx is primary key index pub fn is_primary_key_index(&self, idx: &usize) -> bool { self.primary_key_indexes.contains(idx) } diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index 0703d39d1e..fd81255da2 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -129,7 +129,8 @@ fn default_value_schema_builder() -> schema::Builder { } /// Build a schema for testing: -/// (key1(varbinary), key2(timestamp), field1(double), field2(string)) +/// (key1(varbinary), key2(timestamp), field1(double), field2(string), +/// field3(date), field4(time)) pub fn build_schema() -> Schema { base_schema_builder().build().unwrap() } @@ -144,6 +145,50 @@ pub fn build_schema() -> Schema { pub fn build_default_value_schema() -> Schema { 
default_value_schema_builder().build().unwrap() } +/// Build a schema for testing: +/// (tsid(uint64), key2(timestamp), tag1(string), tag2(string), value(int8), +/// field2(float)) +pub fn build_schema_for_dictionary() -> Schema { + let builder = schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("time".to_string(), DatumKind::Timestamp) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("tag1".to_string(), DatumKind::String) + .is_tag(true) + .is_dictionary(true) + .is_nullable(true) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("tag2".to_string(), DatumKind::String) + .is_tag(true) + .is_dictionary(true) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("value".to_string(), DatumKind::Int8) + .build() + .unwrap(), + ) + .unwrap(); + + builder.build().unwrap() +} /// Build a schema for testing: /// (tsid(uint64), key2(timestamp), tag1(string), tag2(string), value(int8), @@ -193,6 +238,23 @@ pub fn build_schema_for_cpu() -> Schema { builder.build().unwrap() } +pub fn build_row_for_dictionary( + key1: u64, + key2: i64, + tag1: Option<&str>, + tag2: &str, + value: i8, +) -> Row { + let datums = vec![ + Datum::UInt64(key1), + Datum::Timestamp(Timestamp::new(key2)), + tag1.map(|v| Datum::String(StringBytes::from(v))) + .unwrap_or(Datum::Null), + Datum::String(StringBytes::from(tag2)), + Datum::Int8(value), + ]; + Row::from_datums(datums) +} pub fn build_projected_schema() -> ProjectedSchema { let schema = build_schema(); assert!(schema.num_columns() > 1); diff --git a/components/message_queue/Cargo.toml b/components/message_queue/Cargo.toml index e820d49bca..4766330828 100644 --- a/components/message_queue/Cargo.toml +++ b/components/message_queue/Cargo.toml @@ -16,8 +16,8 @@ snafu = { workspace = true } tokio = { workspace = true } [dependencies.rskafka] -git = "https://github.com/influxdata/rskafka.git" -rev = "00988a564b1db0249d858065fc110476c075efad" +git = "https://github.com/Rachelint/rskafka.git" +rev = "f0fd8e278d8164cb0cfca5a80476361fc308ecc3" default-features = false features = ["compression-gzip", "compression-lz4", "compression-snappy"] diff --git a/components/message_queue/src/kafka/config.rs b/components/message_queue/src/kafka/config.rs index 880b7f4f94..24629442ed 100644 --- a/components/message_queue/src/kafka/config.rs +++ b/components/message_queue/src/kafka/config.rs @@ -2,20 +2,39 @@ //! Kafka implementation's config +use common_util::config::ReadableDuration; use serde::{Deserialize, Serialize}; /// Generic client config that is used for consumers, producers as well as admin /// operations (like "create topic"). -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(default)] pub struct Config { pub client: ClientConfig, pub topic_management: TopicManagementConfig, pub consumer: ConsumerConfig, + pub retry_interval_factor: f64, + pub init_retry_interval: ReadableDuration, + pub max_retry_interval: ReadableDuration, + pub max_retry: usize, // TODO: may need some config options for producer, // but it seems nothing needed now. 
} +impl Default for Config { + fn default() -> Self { + Self { + client: Default::default(), + topic_management: Default::default(), + consumer: Default::default(), + retry_interval_factor: 2.0, + init_retry_interval: ReadableDuration::secs(1), + max_retry_interval: ReadableDuration::secs(10), + max_retry: 10, + } + } +} + #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(default)] pub struct ClientConfig { diff --git a/components/message_queue/src/kafka/kafka_impl.rs b/components/message_queue/src/kafka/kafka_impl.rs index 4937d2efee..0d69d5e382 100644 --- a/components/message_queue/src/kafka/kafka_impl.rs +++ b/components/message_queue/src/kafka/kafka_impl.rs @@ -21,6 +21,7 @@ use rskafka::{ Client, ClientBuilder, }, record::{Record, RecordAndOffset}, + BackoffConfig, }; use snafu::{Backtrace, ResultExt, Snafu}; use tokio::sync::RwLock; @@ -141,7 +142,14 @@ impl KafkaImplInner { panic!("The boost broker must be set"); } - let mut client_builder = ClientBuilder::new(config.client.boost_brokers.clone().unwrap()); + let backoff_config = BackoffConfig { + init_backoff: config.init_retry_interval.0, + max_backoff: config.max_retry_interval.0, + base: config.retry_interval_factor, + max_retry: config.max_retry, + }; + let mut client_builder = ClientBuilder::new(config.client.boost_brokers.clone().unwrap()) + .backoff_config(backoff_config); if let Some(max_message_size) = config.client.max_message_size { client_builder = client_builder.max_message_size(max_message_size); } diff --git a/components/parquet_ext/Cargo.toml b/components/parquet_ext/Cargo.toml index 1b4b4b23c6..ba31703d18 100644 --- a/components/parquet_ext/Cargo.toml +++ b/components/parquet_ext/Cargo.toml @@ -17,6 +17,8 @@ async-trait = { workspace = true } bytes = { workspace = true } common_util = { workspace = true } datafusion = { workspace = true } +futures = { workspace = true } log = { workspace = true } +object_store = { workspace = true } parquet = { workspace = true } tokio = { workspace = true } diff --git a/components/parquet_ext/src/lib.rs b/components/parquet_ext/src/lib.rs index cd413c0afc..7264b38dd6 100644 --- a/components/parquet_ext/src/lib.rs +++ b/components/parquet_ext/src/lib.rs @@ -2,6 +2,7 @@ pub mod meta_data; pub mod prune; +pub mod reader; pub mod reverse_reader; #[cfg(test)] pub mod tests; diff --git a/components/parquet_ext/src/meta_data.rs b/components/parquet_ext/src/meta_data.rs index e796244c16..56bf777ff8 100644 --- a/components/parquet_ext/src/meta_data.rs +++ b/components/parquet_ext/src/meta_data.rs @@ -1,15 +1,18 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. -use std::ops::Range; +use std::{ops::Range, sync::Arc}; use async_trait::async_trait; use bytes::Bytes; use common_util::error::GenericResult; use parquet::{ + arrow::{arrow_reader::ArrowReaderOptions, ParquetRecordBatchStreamBuilder}, errors::{ParquetError, Result}, file::{footer, metadata::ParquetMetaData}, }; + +use crate::reader::ObjectStoreReader; + #[async_trait] pub trait ChunkReader: Sync + Send { async fn get_bytes(&self, range: Range<usize>) -> GenericResult<Bytes>; @@ -65,3 +68,21 @@ pub async fn fetch_parquet_metadata( footer::decode_metadata(&metadata_bytes).map(|v| (v, metadata_len)) } + +/// Build page indexes for meta data +/// +/// TODO: Currently there is no method to build page indexes for meta data in +/// `parquet`, maybe we can file an issue in `arrow-rs`.
+pub async fn meta_with_page_indexes( + object_store_reader: ObjectStoreReader, +) -> Result<Arc<ParquetMetaData>> { + let read_options = ArrowReaderOptions::new().with_page_index(true); + let builder = + ParquetRecordBatchStreamBuilder::new_with_options(object_store_reader, read_options) + .await + .map_err(|e| { + let err_msg = format!("failed to build page indexes in metadata, err:{e}"); + ParquetError::General(err_msg) + })?; + Ok(builder.metadata().clone()) +} diff --git a/components/parquet_ext/src/reader.rs b/components/parquet_ext/src/reader.rs new file mode 100644 index 0000000000..3a5cd5f170 --- /dev/null +++ b/components/parquet_ext/src/reader.rs @@ -0,0 +1,81 @@ +// Copyright 2023 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ops::Range, sync::Arc, time::Instant}; + +use bytes::Bytes; +use futures::{ + future::{BoxFuture, FutureExt}, + TryFutureExt, +}; +use log::debug; +use object_store::{ObjectStoreRef, Path}; +use parquet::{arrow::async_reader::AsyncFileReader, file::metadata::ParquetMetaData}; + +/// Implementation of AsyncFileReader based on `ObjectStore` +/// +/// TODO: Perhaps we should avoid importing `object_store` in `parquet_ext` to +/// keep the crate `parquet_ext` more pure. +#[derive(Clone)] +pub struct ObjectStoreReader { + storage: ObjectStoreRef, + path: Path, + meta_data: Arc<ParquetMetaData>, + begin: Instant, +} + +impl ObjectStoreReader { + pub fn new(storage: ObjectStoreRef, path: Path, meta_data: Arc<ParquetMetaData>) -> Self { + Self { + storage, + path, + meta_data, + begin: Instant::now(), + } + } +} + +impl Drop for ObjectStoreReader { + fn drop(&mut self) { + debug!( + "ObjectStoreReader dropped, path:{}, elapsed:{:?}", + &self.path, + self.begin.elapsed() + ); + } +} + +impl AsyncFileReader for ObjectStoreReader { + fn get_bytes(&mut self, range: Range<usize>) -> BoxFuture<'_, parquet::errors::Result<Bytes>> { + self.storage + .get_range(&self.path, range) + .map_err(|e| { + parquet::errors::ParquetError::General(format!( + "Failed to fetch range from object store, err:{e}" + )) + }) + .boxed() + } + + fn get_byte_ranges( + &mut self, + ranges: Vec<Range<usize>>, + ) -> BoxFuture<'_, parquet::errors::Result<Vec<Bytes>>> { + async move { + self.storage + .get_ranges(&self.path, &ranges) + .map_err(|e| { + parquet::errors::ParquetError::General(format!( + "Failed to fetch ranges from object store, err:{e}" + )) + }) + .await + } + .boxed() + } + + fn get_metadata( + &mut self, + ) -> BoxFuture<'_, parquet::errors::Result<Arc<ParquetMetaData>>> { + Box::pin(async move { Ok(self.meta_data.clone()) }) + } +} diff --git a/components/profile/Cargo.toml b/components/profile/Cargo.toml index abd15dd1ed..72eb060de3 100644 --- a/components/profile/Cargo.toml +++ b/components/profile/Cargo.toml @@ -18,3 +18,4 @@ features = ["stats", "profiling", "unprefixed_malloc_on_supported_platforms"] jemalloc-ctl = "0.3.2" jemallocator = "0.3.2" log = { workspace = true } +pprof = { version = "0.11.1", features = ["flamegraph"] } diff --git a/components/profile/src/lib.rs b/components/profile/src/lib.rs index 524e97b7f2..82f5319ca6 100644 --- a/components/profile/src/lib.rs +++ b/components/profile/src/lib.rs @@ -1,6 +1,6 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. -//! Memory profiler for running application based on jemalloc features. +//! Profiler for running application.
use std::{ fmt::Formatter, @@ -9,6 +9,7 @@ use std::{ io::Read, sync::{Mutex, MutexGuard}, thread, time, + time::Duration, }; use jemalloc_ctl::{Access, AsName}; @@ -36,8 +37,9 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; const PROF_ACTIVE: &[u8] = b"prof.active\0"; const PROF_DUMP: &[u8] = b"prof.dump\0"; -const PROFILE_OUTPUT_FILE_OS_PATH: &[u8] = b"/tmp/profile.out\0"; -const PROFILE_OUTPUT_FILE_PATH: &str = "/tmp/profile.out"; +const PROFILE_HEAP_OUTPUT_FILE_OS_PATH: &[u8] = b"/tmp/profile_heap.out\0"; +const PROFILE_HEAP_OUTPUT_FILE_PATH: &str = "/tmp/profile_heap.out"; +const PROFILE_CPU_OUTPUT_FILE_PATH: &str = "/tmp/flamegraph_cpu.svg"; fn set_prof_active(active: bool) -> Result<()> { let name = PROF_ACTIVE.name(); @@ -46,15 +48,15 @@ fn set_prof_active(active: bool) -> Result<()> { fn dump_profile() -> Result<()> { let name = PROF_DUMP.name(); - name.write(PROFILE_OUTPUT_FILE_OS_PATH) + name.write(PROFILE_HEAP_OUTPUT_FILE_OS_PATH) .map_err(Error::Jemalloc) } struct ProfLockGuard<'a>(MutexGuard<'a, ()>); /// ProfLockGuard hold the profile lock and take responsibilities for -/// (de)activating mem profiling. NOTE: Keeping mem profiling on may cause some -/// extra runtime cost so we choose to activating it dynamically. +/// (de)activating heap profiling. NOTE: Keeping heap profiling on may cause +/// some extra runtime cost so we choose to activate it dynamically. impl<'a> ProfLockGuard<'a> { pub fn new(guard: MutexGuard<'a, ()>) -> Result<Self> { set_prof_active(true)?; @@ -71,7 +73,7 @@ impl<'a> Drop for ProfLockGuard<'a> { } pub struct Profiler { - mem_prof_lock: Mutex<()>, + heap_prof_lock: Mutex<()>, } impl Default for Profiler { @@ -83,19 +85,22 @@ impl Default for Profiler { impl Profiler { pub fn new() -> Self { Self { - mem_prof_lock: Mutex::new(()), + heap_prof_lock: Mutex::new(()), } } - // dump_mem_prof collects mem profiling data in `seconds`. + // dump_heap_prof collects heap profiling data in `seconds`. // TODO(xikai): limit the profiling duration - pub fn dump_mem_prof(&self, seconds: u64) -> Result<Vec<u8>> { + pub fn dump_heap_prof(&self, seconds: u64) -> Result<Vec<u8>> { // concurrent profiling is disabled.
- let lock_guard = self.mem_prof_lock.try_lock().map_err(|e| Error::Internal { - msg: format!("failed to acquire mem_prof_lock, err:{e}"), - })?; + let lock_guard = self + .heap_prof_lock + .try_lock() + .map_err(|e| Error::Internal { + msg: format!("failed to acquire heap_prof_lock, err:{e}"), + })?; info!( - "Profiler::dump_mem_prof start memory profiling {} seconds", + "Profiler::dump_heap_prof start heap profiling {} seconds", seconds ); @@ -109,7 +114,7 @@ impl Profiler { .create(true) .write(true) .truncate(true) - .open(PROFILE_OUTPUT_FILE_PATH) + .open(PROFILE_HEAP_OUTPUT_FILE_PATH) .map_err(|e| { error!("Failed to open prof data file, err:{}", e); Error::IO(e) @@ -119,13 +124,13 @@ impl Profiler { dump_profile().map_err(|e| { error!( "Failed to dump prof to {}, err:{}", - PROFILE_OUTPUT_FILE_PATH, e + PROFILE_HEAP_OUTPUT_FILE_PATH, e ); e })?; // read the profile results into buffer - let mut f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + let mut f = File::open(PROFILE_HEAP_OUTPUT_FILE_PATH).map_err(|e| { error!("Failed to open prof data file, err:{}", e); Error::IO(e) })?; @@ -138,4 +143,28 @@ impl Profiler { Ok(buffer) } + + pub fn dump_cpu_prof(&self, seconds: u64) -> Result<()> { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(100) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build() + .map_err(|e| Error::Internal { + msg: format!("Profiler guard, err:{e}"), + })?; + + thread::sleep(Duration::from_secs(seconds)); + + let report = guard.report().build().map_err(|e| Error::Internal { + msg: format!("Report build, err:{e}"), + })?; + let file = File::create(PROFILE_CPU_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to create cpu profile svg file, err:{}", e); + Error::IO(e) + })?; + report.flamegraph(file).map_err(|e| Error::Internal { + msg: format!("Flamegraph output, err:{e}"), + })?; + Ok(()) + } } diff --git a/df_operator/src/udfs/time_bucket.rs b/df_operator/src/udfs/time_bucket.rs index 1ea693d954..bb4c6b29bb 100644 --- a/df_operator/src/udfs/time_bucket.rs +++ b/df_operator/src/udfs/time_bucket.rs @@ -141,8 +141,9 @@ impl<'a> TimeBucket<'a> { } fn call(&self) -> Result { + // TODO mising is_dictionary params let mut out_column_builder = - ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows()); + ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows(), false); for ts_opt in self.column.iter() { match ts_opt { Some(ts) => { diff --git a/integration_tests/Makefile b/integration_tests/Makefile index 49ff4ba9c6..84bb3454e1 100644 --- a/integration_tests/Makefile +++ b/integration_tests/Makefile @@ -69,3 +69,6 @@ run-mysql: run-prom: cd prom && ./run-tests.sh + +run-recovery: clean build-ceresdb kill-old-process + cd recovery && ./run.sh && ./run.sh shard_based diff --git a/integration_tests/config/shard-based-recovery.toml b/integration_tests/config/shard-based-recovery.toml new file mode 100644 index 0000000000..3ad980df0b --- /dev/null +++ b/integration_tests/config/shard-based-recovery.toml @@ -0,0 +1,21 @@ +[server] +bind_addr = "0.0.0.0" +http_port = 5440 +grpc_port = 8831 + +[logger] +level = "info" + +[tracing] +dir = "/tmp/ceresdb" + +[analytic] +recover_mode = "ShardBased" + +[analytic.storage.object_store] +type = "Local" +data_dir = "/tmp/ceresdb" + +[analytic.wal] +type = "RocksDB" +data_dir = "/tmp/ceresdb" diff --git a/integration_tests/recovery/check.py b/integration_tests/recovery/check.py new file mode 100644 index 0000000000..73b7495b14 --- /dev/null +++ 
b/integration_tests/recovery/check.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# coding: utf-8 + +import requests +import argparse + +api_root = 'http://localhost:5440' +headers = { + 'Content-Type': 'application/json' +} + +def get_test_tables(ts): + table = 'sql_test' + str(ts) + table2 = 'SQL_TEST' + str(ts) + return [table, table2] + +def get_args(): + parser = argparse.ArgumentParser(description='cmd args') + parser.add_argument('--timestamp', '-ts', type=int, help='timestamp') + parser.add_argument('--init_before_check', '-i', help='init_before_check', action="store_true") + args = vars(parser.parse_args()) + return args + + +def execute_sql(sql): + r = requests.post('{}/sql'.format(api_root), json={'query': sql}, headers=headers) + assert r.status_code == 200, r.text + return r.json() + +def prepare_data(ts, tables): + for t in tables: + execute_sql(""" +CREATE TABLE if not exists `{}` ( + `t` timestamp NOT NULL, + `tag1` string TAG, + `tag2` string TAG, + `value` double NOT NULL, + `VALUE2` double NOT NULL, + timestamp KEY (t) +); + """.format(t)) + + execute_sql(""" +insert into {}(t, tag1, tag2, value, VALUE2) +values +({}, "v1", "v2", 1, 2), +({}, "v1", "v2", 11, 22) + ; + """.format(tables[0], ts-5000, ts)) + + execute_sql(""" +insert into {}(t, tag1, tag2, value, VALUE2) +values +({}, "v1", "v2", 10, 20), +({}, "v1", "v2", 110, 220) + ; + """.format(tables[1], ts-5000, ts)) + +def query_and_check(ts, tables): + expected = {'rows': [{'tsid': 7518337278486593135, 't': ts - 5000, 'tag1': 'v1', 'tag2': 'v2', 'value': 1.0, 'VALUE2': 2.0},\ + {'tsid': 7518337278486593135, 't': ts, 'tag1': 'v1', 'tag2': 'v2', 'value': 11.0, 'VALUE2': 22.0}]} + expected2 = {'rows': [{'tsid': 7518337278486593135, 't': ts - 5000, 'tag1': 'v1', 'tag2': 'v2', 'value': 10.0, 'VALUE2': 20.0},\ + {'tsid': 7518337278486593135, 't': ts, 'tag1': 'v1', 'tag2': 'v2', 'value': 110.0, 'VALUE2': 220.0}]} + expecteds = [expected, expected2] + + for idx, t in enumerate(tables): + r = execute_sql("select * from {}".format(t)) + assert r == expecteds[idx] + + print('Restart test pass...') + +def main(): + args = get_args() + init_before_check = args['init_before_check'] + ts = args['timestamp'] + test_tables = get_test_tables(args['timestamp']) + + if init_before_check: + print("Init before check") + prepare_data(ts, test_tables) + query_and_check(ts, test_tables) + +if __name__ == '__main__': + main() diff --git a/integration_tests/recovery/run.sh b/integration_tests/recovery/run.sh new file mode 100755 index 0000000000..83295244cd --- /dev/null +++ b/integration_tests/recovery/run.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -e + +ROOT=`pwd` +# For compatibility in macos, so convert to milliseconds by adding 3 zeros. +NOW=`date +%s000` +BINARY_PATH=${ROOT}/../../target/debug/ceresdb-server +SERVER_HTTP_ENDPOINT=127.0.0.1:5440 + +CONFIG_FILE=${ROOT}/../../docs/minimal.toml +if [ ${1} == 'shard_based' ]; then + CONFIG_FILE=${ROOT}/../config/shard-based-recovery.toml +fi + +echo "Run with config: ${CONFIG_FILE}" +echo "First check..." +nohup ${BINARY_PATH} --config ${CONFIG_FILE} & +sleep 10 +python3 ./check.py -ts ${NOW} -i + +echo "Restart and check..." +killall ceresdb-server | true +nohup ${BINARY_PATH} --config ${CONFIG_FILE} & +sleep 10 +python3 ./check.py -ts ${NOW} + +echo "Flush, restart and check..." +curl -XPOST ${SERVER_HTTP_ENDPOINT}/debug/flush_memtable +echo "\nFlush finish..." 
+killall ceresdb-server | true +nohup ${BINARY_PATH} --config ${CONFIG_FILE} & +sleep 10 +python3 ./check.py -ts ${NOW} +echo "All finish..." diff --git a/interpreters/src/describe.rs b/interpreters/src/describe.rs index c0944ed81b..cf415cc6e0 100644 --- a/interpreters/src/describe.rs +++ b/interpreters/src/describe.rs @@ -46,12 +46,14 @@ impl DescribeInterpreter { let mut is_primary_keys = Vec::with_capacity(num_columns); let mut is_nullables = Vec::with_capacity(num_columns); let mut is_tags = Vec::with_capacity(num_columns); + let mut is_dictionarys = Vec::with_capacity(num_columns); for (idx, col) in table_schema.columns().iter().enumerate() { names.push(col.name.to_string()); types.push(col.data_type.to_string()); is_primary_keys.push(table_schema.is_primary_key_index(&idx)); is_nullables.push(col.is_nullable); is_tags.push(col.is_tag); + is_dictionarys.push(col.is_dictionary); } let schema = Schema::new(vec![ @@ -60,6 +62,7 @@ impl DescribeInterpreter { Field::new("is_primary", DataType::Boolean, false), Field::new("is_nullable", DataType::Boolean, false), Field::new("is_tag", DataType::Boolean, false), + Field::new("is_dictionary", DataType::Boolean, false), ]); let arrow_record_batch = RecordBatch::try_new( @@ -70,6 +73,7 @@ impl DescribeInterpreter { Arc::new(BooleanArray::from(is_primary_keys)), Arc::new(BooleanArray::from(is_nullables)), Arc::new(BooleanArray::from(is_tags)), + Arc::new(BooleanArray::from(is_dictionarys)), ], ) .unwrap(); diff --git a/interpreters/src/insert.rs b/interpreters/src/insert.rs index 95bc47a0cb..782b07b6f3 100644 --- a/interpreters/src/insert.rs +++ b/interpreters/src/insert.rs @@ -341,7 +341,11 @@ fn get_or_extract_column_from_row_groups( .unwrap_or_else(|| { let data_type = row_groups.schema().column(column_idx).data_type; let iter = row_groups.iter_column(column_idx); - let mut builder = ColumnBlockBuilder::with_capacity(&data_type, iter.size_hint().0); + let mut builder = ColumnBlockBuilder::with_capacity( + &data_type, + iter.size_hint().0, + row_groups.schema().column(column_idx).is_dictionary, + ); for datum in iter { builder.append(datum.clone()).context(BuildColumnBlock)?; diff --git a/interpreters/src/show_create.rs b/interpreters/src/show_create.rs index 29b6048355..a6e4b3dcdf 100644 --- a/interpreters/src/show_create.rs +++ b/interpreters/src/show_create.rs @@ -86,6 +86,11 @@ impl ShowCreateInterpreter { if col.is_tag { res += " TAG"; } + + if col.is_dictionary { + res += " DICTIONARY"; + } + if !col.is_nullable { res += " NOT NULL"; } diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs index 2b2f93b184..6a4473eee6 100644 --- a/interpreters/src/tests.rs +++ b/interpreters/src/tests.rs @@ -170,17 +170,18 @@ where let sql = "desc table test_table"; let output = self.sql_to_output(sql).await.unwrap(); let records = output.try_into().unwrap(); + // todo this maybe need to change let expected = vec![ - "+--------+-----------+------------+-------------+--------+", - "| name | type | is_primary | is_nullable | is_tag |", - "+--------+-----------+------------+-------------+--------+", - "| key1 | varbinary | true | false | false |", - "| key2 | timestamp | true | false | false |", - "| field1 | double | false | true | false |", - "| field2 | string | false | true | false |", - "| field3 | date | false | true | false |", - "| field4 | time | false | true | false |", - "+--------+-----------+------------+-------------+--------+", + "+--------+-----------+------------+-------------+--------+---------------+", + "| name | type | 
is_primary | is_nullable | is_tag | is_dictionary |", + "+--------+-----------+------------+-------------+--------+---------------+", + "| key1 | varbinary | true | false | false | false |", + "| key2 | timestamp | true | false | false | false |", + "| field1 | double | false | true | false | false |", + "| field2 | string | false | true | false | false |", + "| field3 | date | false | true | false | false |", + "| field4 | time | false | true | false | false |", + "+--------+-----------+------------+-------------+--------+---------------+", ]; common_util::record_batch::assert_record_batches_eq(&expected, records); } diff --git a/proxy/src/forward.rs b/proxy/src/forward.rs index 9603dceee9..e1765dbfda 100644 --- a/proxy/src/forward.rs +++ b/proxy/src/forward.rs @@ -21,6 +21,8 @@ use tonic::{ transport::{self, Channel}, }; +use crate::FORWARDED_FROM; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display( @@ -68,6 +70,9 @@ pub enum Error { source: tonic::transport::Error, backtrace: Backtrace, }, + + #[snafu(display("Request should not be forwarded twice, forward from:{}", endpoint))] + ForwardedErr { endpoint: String }, } define_result!(Error); @@ -184,6 +189,7 @@ pub struct ForwardRequest { pub schema: String, pub table: String, pub req: tonic::Request, + pub forwarded_from: Option, } impl Forwarder { @@ -256,7 +262,12 @@ impl Forwarder { F: ForwarderRpc, Req: std::fmt::Debug + Clone, { - let ForwardRequest { schema, table, req } = forward_req; + let ForwardRequest { + schema, + table, + req, + forwarded_from, + } = forward_req; let route_req = RouteRequest { context: Some(RequestContext { database: schema }), @@ -281,13 +292,15 @@ impl Forwarder { } }; - self.forward_with_endpoint(endpoint, req, do_rpc).await + self.forward_with_endpoint(endpoint, req, forwarded_from, do_rpc) + .await } pub async fn forward_with_endpoint( &self, endpoint: Endpoint, mut req: tonic::Request, + forwarded_from: Option, do_rpc: F, ) -> Result> where @@ -310,6 +323,17 @@ impl Forwarder { "Try to forward request to {:?}, request:{:?}", endpoint, req, ); + + if let Some(endpoint) = forwarded_from { + return ForwardedErr { endpoint }.fail(); + } + + // mark forwarded + req.metadata_mut().insert( + FORWARDED_FROM, + self.local_endpoint.to_string().parse().unwrap(), + ); + let client = self.get_or_create_client(&endpoint).await?; match do_rpc(client, req, &endpoint).await { Err(e) => { @@ -461,6 +485,7 @@ mod tests { schema: DEFAULT_SCHEMA.to_string(), table: table.to_string(), req: query_request.into_request(), + forwarded_from: None, } }; diff --git a/proxy/src/grpc/metrics.rs b/proxy/src/grpc/metrics.rs index 850f001683..57bb707a04 100644 --- a/proxy/src/grpc/metrics.rs +++ b/proxy/src/grpc/metrics.rs @@ -8,7 +8,16 @@ use prometheus_static_metric::{auto_flush_from, make_auto_flush_static_metric}; make_auto_flush_static_metric! 
{ pub label_enum GrpcTypeKind { + write_succeeded, write_failed, + query_succeeded, + query_failed, + stream_query_succeeded, + stream_query_failed, + write_succeeded_row, + write_failed_row, + query_succeeded_row, + query_affected_row, } pub struct GrpcHandlerCounterVec: LocalIntCounter { diff --git a/proxy/src/grpc/prom_query.rs b/proxy/src/grpc/prom_query.rs index 4ef9ebeecf..8368921fc0 100644 --- a/proxy/src/grpc/prom_query.rs +++ b/proxy/src/grpc/prom_query.rs @@ -373,54 +373,77 @@ mod tests { .unwrap(), ) .unwrap() + .add_normal_column( + column_schema::Builder::new("tag_dictionary".to_string(), DatumKind::String) + .is_tag(true) + .is_dictionary(true) + .is_nullable(true) + .build() + .unwrap(), + ) + .unwrap() .build() .unwrap() } fn build_column_block() -> Vec { - let build_row = |ts: i64, tsid: u64, field1: f64, field2: &str| -> Row { + let build_row = |ts: i64, tsid: u64, field1: f64, field2: &str, dic: Option<&str>| -> Row { let datums = vec![ Datum::Timestamp(Timestamp::new(ts)), Datum::UInt64(tsid), Datum::Double(field1), Datum::String(StringBytes::from(field2)), + dic.map(|v| Datum::String(StringBytes::from(v))) + .unwrap_or(Datum::Null), ]; Row::from_datums(datums) }; let rows = vec![ - build_row(1000001, 1, 10.0, "v5"), - build_row(1000002, 1, 11.0, "v5"), - build_row(1000000, 2, 10.0, "v4"), - build_row(1000000, 3, 10.0, "v3"), + build_row(1000001, 1, 10.0, "v5", Some("d1")), + build_row(1000002, 1, 11.0, "v5", None), + build_row(1000000, 2, 10.0, "v4", Some("d2")), + build_row(1000000, 3, 10.0, "v3", None), ]; - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 2, false); for row in &rows { builder.append(row[0].clone()).unwrap(); } let timestamp_block = builder.build(); - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2, false); for row in &rows { builder.append(row[1].clone()).unwrap(); } let tsid_block = builder.build(); - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Double, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Double, 2, false); for row in &rows { builder.append(row[2].clone()).unwrap(); } let field_block = builder.build(); - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2, false); for row in &rows { builder.append(row[3].clone()).unwrap(); } let tag_block = builder.build(); - vec![timestamp_block, tsid_block, field_block, tag_block] + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2, true); + for row in &rows { + builder.append(row[4].clone()).unwrap(); + } + let dictionary_block = builder.build(); + + vec![ + timestamp_block, + tsid_block, + field_block, + tag_block, + dictionary_block, + ] } fn make_sample(timestamp: i64, value: f64) -> Sample { @@ -440,7 +463,7 @@ mod tests { let column_name = ColumnNames { timestamp: "timestamp".to_string(), - tag_keys: vec!["tag1".to_string()], + tag_keys: vec!["tag1".to_string(), "tag_dictionary".to_string()], field: "field1".to_string(), }; let converter = RecordConverter::try_new(&column_name, &schema.to_record_schema()).unwrap(); @@ -461,11 +484,17 @@ mod tests { ); assert_eq!( tsid_to_tags.get(&1).unwrap().clone(), - make_tags(vec![("tag1".to_string(), "v5".to_string())]) + make_tags(vec![ + ("tag1".to_string(), "v5".to_string()), + 
("tag_dictionary".to_string(), "d1".to_string()) + ]) ); assert_eq!( tsid_to_tags.get(&2).unwrap().clone(), - make_tags(vec![("tag1".to_string(), "v4".to_string())]) + make_tags(vec![ + ("tag1".to_string(), "v4".to_string()), + ("tag_dictionary".to_string(), "d2".to_string()) + ]) ); assert_eq!( tsid_to_tags.get(&3).unwrap().clone(), diff --git a/proxy/src/grpc/sql_query.rs b/proxy/src/grpc/sql_query.rs index 4b9a75210b..6c756be12c 100644 --- a/proxy/src/grpc/sql_query.rs +++ b/proxy/src/grpc/sql_query.rs @@ -28,6 +28,7 @@ use tonic::{transport::Channel, IntoRequest}; use crate::{ error::{self, ErrNoCause, ErrWithCause, Error, Result}, forward::{ForwardRequest, ForwardResult}, + grpc::metrics::GRPC_HANDLER_COUNTER_VEC, read::SqlResponse, Context, Proxy, }; @@ -40,12 +41,16 @@ impl Proxy { match self.handle_sql_query_internal(ctx, req).await { Err(e) => { error!("Failed to handle sql query, err:{e}"); + GRPC_HANDLER_COUNTER_VEC.query_failed.inc(); SqlQueryResponse { header: Some(error::build_err_header(e)), ..Default::default() } } - Ok(v) => v, + Ok(v) => { + GRPC_HANDLER_COUNTER_VEC.query_succeeded.inc(); + v + } } } @@ -79,13 +84,17 @@ impl Proxy { match self.clone().handle_stream_query_internal(ctx, req).await { Err(e) => stream::once(async { error!("Failed to handle stream sql query, err:{e}"); + GRPC_HANDLER_COUNTER_VEC.stream_query_failed.inc(); SqlQueryResponse { header: Some(error::build_err_header(e)), ..Default::default() } }) .boxed(), - Ok(v) => v, + Ok(v) => { + GRPC_HANDLER_COUNTER_VEC.stream_query_succeeded.inc(); + v + } } } @@ -104,7 +113,11 @@ impl Proxy { let req_context = req.context.as_ref().unwrap(); let schema = req_context.database.clone(); - let req = match self.clone().maybe_forward_stream_sql_query(&req).await { + let req = match self + .clone() + .maybe_forward_stream_sql_query(ctx.clone(), &req) + .await + { Some(resp) => match resp { ForwardResult::Forwarded(resp) => return resp, ForwardResult::Local => req, @@ -127,8 +140,12 @@ impl Proxy { if tx.send(resp).await.is_err() { error!("Failed to send affected rows resp in stream sql query"); } + GRPC_HANDLER_COUNTER_VEC + .query_affected_row + .inc_by(rows as u64); } Output::Records(batches) => { + let mut num_rows = 0; for batch in &batches { let resp = { let mut writer = QueryResponseWriter::new(resp_compress_min_length); @@ -140,7 +157,11 @@ impl Proxy { error!("Failed to send record batches resp in stream sql query"); break; } + num_rows += batch.num_rows(); } + GRPC_HANDLER_COUNTER_VEC + .query_succeeded_row + .inc_by(num_rows as u64); } } Ok::<(), Error>(()) @@ -150,6 +171,7 @@ impl Proxy { async fn maybe_forward_stream_sql_query( self: Arc, + ctx: Context, req: &SqlQueryRequest, ) -> Option, Error>> { if req.tables.len() != 1 { @@ -163,6 +185,7 @@ impl Proxy { schema: req_ctx.database.clone(), table: req.tables[0].clone(), req: req.clone().into_request(), + forwarded_from: ctx.forwarded_from, }; let do_query = |mut client: StorageServiceClient, request: tonic::Request, @@ -219,9 +242,19 @@ pub fn convert_output( Output::Records(batches) => { let mut writer = QueryResponseWriter::new(resp_compress_min_length); writer.write_batches(batches)?; + let mut num_rows = 0; + for batch in batches { + num_rows += batch.num_rows(); + } + GRPC_HANDLER_COUNTER_VEC + .query_succeeded_row + .inc_by(num_rows as u64); writer.finish() } Output::AffectedRows(rows) => { + GRPC_HANDLER_COUNTER_VEC + .query_affected_row + .inc_by(*rows as u64); Ok(QueryResponseBuilder::with_ok_header().build_with_affected_rows(*rows)) } } diff 
--git a/proxy/src/grpc/write.rs b/proxy/src/grpc/write.rs index ca4c7f2f55..ecf0e1dac3 100644 --- a/proxy/src/grpc/write.rs +++ b/proxy/src/grpc/write.rs @@ -21,8 +21,9 @@ impl Proxy { match self.handle_write_internal(ctx, req).await { Err(e) => { error!("Failed to handle write, err:{e}"); + GRPC_HANDLER_COUNTER_VEC.write_failed.inc(); GRPC_HANDLER_COUNTER_VEC - .write_failed + .write_failed_row .inc_by(num_rows as u64); WriteResponse { header: Some(error::build_err_header(e)), @@ -30,9 +31,13 @@ impl Proxy { } } Ok(v) => { + GRPC_HANDLER_COUNTER_VEC.write_succeeded.inc(); GRPC_HANDLER_COUNTER_VEC - .write_failed + .write_failed_row .inc_by(v.failed as u64); + GRPC_HANDLER_COUNTER_VEC + .write_succeeded_row + .inc_by(v.success as u64); WriteResponse { header: Some(build_ok_header()), success: v.success, diff --git a/proxy/src/http/prom.rs b/proxy/src/http/prom.rs index d414f7e0db..87fd26e76b 100644 --- a/proxy/src/http/prom.rs +++ b/proxy/src/http/prom.rs @@ -62,6 +62,7 @@ impl Proxy { runtime: self.engine_runtimes.write_runtime.clone(), timeout: ctx.timeout, enable_partition_table_access: false, + forwarded_from: None, }; let result = self.handle_write_internal(ctx, table_request).await?; @@ -241,6 +242,7 @@ impl Converter { let value_idx = schema.index_of(field_col_name).context(InternalNoCause { msg: "Value column is missing in query response", })?; + // Todo is there need add is_dictionary check? let tags = schema .columns() .iter() diff --git a/proxy/src/http/sql.rs b/proxy/src/http/sql.rs index 127732c24e..67ec394042 100644 --- a/proxy/src/http/sql.rs +++ b/proxy/src/http/sql.rs @@ -37,6 +37,7 @@ impl Proxy { timeout: ctx.timeout, runtime: self.engine_runtimes.read_runtime.clone(), enable_partition_table_access: true, + forwarded_from: None, }; match self.handle_sql(context, &ctx.schema, &req.query).await? 
{ diff --git a/proxy/src/influxdb/mod.rs b/proxy/src/influxdb/mod.rs index e028c59ff3..c4a2fee151 100644 --- a/proxy/src/influxdb/mod.rs +++ b/proxy/src/influxdb/mod.rs @@ -58,6 +58,7 @@ impl Proxy { timeout: ctx.timeout, runtime: self.engine_runtimes.write_runtime.clone(), enable_partition_table_access: false, + forwarded_from: None, }; let result = self .handle_write_internal(proxy_context, table_request) diff --git a/proxy/src/influxdb/types.rs b/proxy/src/influxdb/types.rs index 58cba675ab..85681d95c5 100644 --- a/proxy/src/influxdb/types.rs +++ b/proxy/src/influxdb/types.rs @@ -811,11 +811,13 @@ mod tests { } fn build_test_column_blocks() -> Vec { - let mut measurement_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3); - let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3); - let mut time_builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 3); - let mut field_builder1 = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3); - let mut field_builder2 = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 3); + // TODO missing is_dictionary paramms + let mut measurement_builder = + ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); + let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); + let mut time_builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 3, false); + let mut field_builder1 = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); + let mut field_builder2 = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 3, false); // Data in measurement1 let measurement1 = Datum::String(StringBytes::copy_from_str("m1")); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8ab85fd147..c54a5acfc6 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -22,6 +22,8 @@ pub mod schema_config_provider; mod util; mod write; +pub const FORWARDED_FROM: &str = "forwarded-from"; + use std::{ sync::Arc, time::{Duration, Instant}, @@ -131,6 +133,7 @@ impl Proxy { schema: req_ctx.database.clone(), table: metric, req: req.into_request(), + forwarded_from: None, }; let do_query = |mut client: StorageServiceClient, request: tonic::Request, @@ -452,4 +455,5 @@ pub struct Context { pub timeout: Option, pub runtime: Arc, pub enable_partition_table_access: bool, + pub forwarded_from: Option, } diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 9131cf54d0..a47b9454be 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -41,7 +41,10 @@ impl Proxy { schema: &str, sql: &str, ) -> Result { - if let Some(resp) = self.maybe_forward_sql_query(schema, sql).await? { + if let Some(resp) = self + .maybe_forward_sql_query(ctx.clone(), schema, sql) + .await? + { match resp { ForwardResult::Forwarded(resp) => return Ok(SqlResponse::Forwarded(resp?)), ForwardResult::Local => (), @@ -149,6 +152,7 @@ impl Proxy { async fn maybe_forward_sql_query( &self, + ctx: Context, schema: &str, sql: &str, ) -> Result>> { @@ -174,6 +178,7 @@ impl Proxy { schema: schema.to_string(), table: table_name.unwrap(), req: sql_request.into_request(), + forwarded_from: ctx.forwarded_from, }; let do_query = |mut client: StorageServiceClient, request: tonic::Request, diff --git a/proxy/src/write.rs b/proxy/src/write.rs index a371e3cd01..008489bfdc 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -108,7 +108,7 @@ impl Proxy { let mut futures = Vec::with_capacity(write_requests_to_forward.len() + 1); // Write to remote. 
- self.collect_write_to_remote_future(&mut futures, write_requests_to_forward) + self.collect_write_to_remote_future(&mut futures, ctx.clone(), write_requests_to_forward) .await; // Write to local. @@ -139,7 +139,7 @@ impl Proxy { let mut futures = Vec::with_capacity(write_requests_to_forward.len() + 1); // Write to remote. - self.collect_write_to_remote_future(&mut futures, write_requests_to_forward) + self.collect_write_to_remote_future(&mut futures, ctx.clone(), write_requests_to_forward) .await; // Create table. @@ -358,12 +358,14 @@ impl Proxy { async fn collect_write_to_remote_future( &self, futures: &mut WriteResponseFutures<'_>, + ctx: Context, write_request: HashMap, ) { for (endpoint, table_write_request) in write_request { let forwarder = self.forwarder.clone(); + let ctx = ctx.clone(); let write_handle = self.engine_runtimes.io_runtime.spawn(async move { - Self::write_to_remote(forwarder, endpoint, table_write_request).await + Self::write_to_remote(ctx, forwarder, endpoint, table_write_request).await }); futures.push(write_handle.boxed()); @@ -408,6 +410,7 @@ impl Proxy { } async fn write_to_remote( + ctx: Context, forwarder: ForwarderRef, endpoint: Endpoint, table_write_request: WriteRequest, @@ -432,7 +435,12 @@ impl Proxy { }; let forward_result = forwarder - .forward_with_endpoint(endpoint, tonic::Request::new(table_write_request), do_write) + .forward_with_endpoint( + endpoint, + tonic::Request::new(table_write_request), + ctx.forwarded_from, + do_write, + ) .await; let forward_res = forward_result .map_err(|e| { @@ -676,8 +684,8 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - - build_column(&mut columns, schema, tag_name, &tag.value, true)?; + // todo is_dictionary set true or false ? + build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; } // Parse fields. @@ -693,7 +701,8 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - build_column(&mut columns, schema, field_name, &field.value, false)?; + // todo is_dictionary set true or false ? + build_column(&mut columns, schema, field_name, &field.value, false, false)?; } } } @@ -707,6 +716,7 @@ fn build_column<'a>( name: &'a str, value: &Option, is_tag: bool, + is_dictionary: bool, ) -> Result<()> { // Skip adding columns, the following cases: // 1. Column already exists. @@ -732,7 +742,7 @@ fn build_column<'a>( msg: "Failed to get data type", })?; - let column_schema = build_column_schema(name, data_type, is_tag) + let column_schema = build_column_schema(name, data_type, is_tag, is_dictionary) .box_err() .context(Internal { msg: "Failed to build column schema", diff --git a/query_frontend/src/frontend.rs b/query_frontend/src/frontend.rs index b82f58d693..ecdf8e17f2 100644 --- a/query_frontend/src/frontend.rs +++ b/query_frontend/src/frontend.rs @@ -177,6 +177,10 @@ impl Frontend

{ } pub fn parse_table_name(statements: &StatementVec) -> Option { + // maybe have empty sql + if statements.is_empty() { + return None; + } match &statements[0] { Statement::Standard(s) => match *s.clone() { SqlStatement::Insert { table_name, .. } => { @@ -269,5 +273,8 @@ mod tests { Some(table.to_string()) ); } + assert!(frontend::parse_table_name_with_sql("-- just comment") + .unwrap() + .is_none()); } } diff --git a/query_frontend/src/parser.rs b/query_frontend/src/parser.rs index 883e8e22ab..813344a888 100644 --- a/query_frontend/src/parser.rs +++ b/query_frontend/src/parser.rs @@ -37,6 +37,7 @@ macro_rules! parser_err { const TS_KEY: &str = "__ts_key"; const TAG: &str = "TAG"; +const DICTIONARY: &str = "DICTIONARY"; const UNSIGN: &str = "UNSIGN"; const MODIFY: &str = "MODIFY"; const SETTING: &str = "SETTING"; @@ -62,6 +63,7 @@ macro_rules! is_custom_column { } is_custom_column!(TAG); +is_custom_column!(DICTIONARY); is_custom_column!(UNSIGN); /// Get the comment from the [`ColumnOption`] if it is a comment option. @@ -326,6 +328,22 @@ impl<'a> Parser<'a> { // WITH ... let options = self.parser.parse_options(Keyword::WITH)?; + // Only String Column Can Be Dictionary Encoded + for c in columns.iter() { + let mut is_dictionary = false; + for op in c.options.iter() { + if is_dictionary_column(&op.option) { + is_dictionary = true; + } + } + if c.data_type != DataType::String && is_dictionary { + return parser_err!(format!( + "Only string column can be dictionary encoded: {:?}", + c.to_string() + )); + } + } + Ok(Statement::Create(Box::new(CreateTable { if_not_exists, table_name, @@ -513,6 +531,11 @@ impl<'a> Parser<'a> { Ok(Some(ColumnOption::DialectSpecific(vec![ Token::make_keyword(TAG), ]))) + } else if self.consume_token(DICTIONARY) { + // Support DICTIONARY for ceresdb + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(DICTIONARY), + ]))) } else if self.consume_token(UNSIGN) { // Support unsign for ceresdb Ok(Some(ColumnOption::DialectSpecific(vec![ @@ -973,6 +996,52 @@ mod tests { } } + #[test] + fn test_dictionary_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag dictionary, c2 float dictionary, c3 bigint unsign)"; + assert!(Parser::parse_sql(sql).is_err()); + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag dictionary, c2 string dictionary, c3 bigint unsign)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c1" { + assert_eq!(2, c.options.len()); + let opt = &c.options[0]; + assert!(is_tag_column(&opt.option)); + let opt = &c.options[1]; + assert!(is_dictionary_column(&opt.option)); + } else if c.name.value == "c2" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_dictionary_column(&opt.option)); + } else if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_unsign_column(&opt.option)); + } else { + panic!("failed"); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_dictionary_use_unstring_column() { + let sql = + "CREATE TABLE IF NOT EXISTS t(c1 string tag, c2 float dictionary, c3 bigint unsign)"; + assert!(Parser::parse_sql(sql).is_err()); + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag dictionary, c2 float dictionary, c3 bigint unsign)"; + assert!(Parser::parse_sql(sql).is_err()); + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag, c2 float 
dictionary, c3 bigint unsign dictionary)"; + assert!(Parser::parse_sql(sql).is_err()); + } + #[test] fn test_comment_column() { let sql = "CREATE TABLE IF NOT EXISTS t(c1 string, c2 float, c3 bigint comment 'id')"; diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 694bc0ee0e..2baf01be10 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -367,10 +367,12 @@ pub fn build_column_schema( column_name: &str, data_type: DatumKind, is_tag: bool, + is_dictionary: bool, ) -> Result { let builder = column_schema::Builder::new(column_name.to_string(), data_type) .is_nullable(true) - .is_tag(is_tag); + .is_tag(is_tag) + .is_dictionary(is_dictionary); builder.build().with_context(|| InvalidColumnSchema { column_name: column_name.to_string(), @@ -429,9 +431,19 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - ensure_data_type_compatible(table, tag_name, true, data_type, column_schema)?; + // Todo is_dictionary set true or false ? Do we need modify the pb ? + ensure_data_type_compatible( + table, + tag_name, + true, + false, + data_type, + column_schema, + )?; } - let column_schema = build_column_schema(tag_name, data_type, true)?; + + // Todo is_dictionary set true or false ? Do we need modify the pb ? + let column_schema = build_column_schema(tag_name, data_type, true, false)?; name_column_map.insert(tag_name, column_schema); } @@ -457,16 +469,18 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { + // todo is_dictionary set true or false ? ensure_data_type_compatible( table, field_name, false, + false, data_type, column_schema, )?; } - - let column_schema = build_column_schema(field_name, data_type, false)?; + // todo is_dictionary set true or false ? + let column_schema = build_column_schema(field_name, data_type, false, false)?; name_column_map.insert(field_name, column_schema); } } @@ -512,9 +526,11 @@ fn ensure_data_type_compatible( table_name: &str, column_name: &str, is_tag: bool, + _is_dictionary: bool, data_type: DatumKind, column_schema: &ColumnSchema, ) -> Result<()> { + // Todo how to check is_dictionary ? ensure!( column_schema.is_tag == is_tag, InvalidWriteEntry { @@ -1234,6 +1250,7 @@ fn parse_column(col: &ColumnDef) -> Result { // Process column options let mut is_nullable = true; // A column is nullable by default. 
let mut is_tag = false; + let mut is_dictionary = false; let mut is_unsign = false; let mut comment = String::new(); let mut default_value = None; @@ -1242,6 +1259,8 @@ fn parse_column(col: &ColumnDef) -> Result { is_nullable = false; } else if parser::is_tag_column(&option_def.option) { is_tag = true; + } else if parser::is_dictionary_column(&option_def.option) { + is_dictionary = true; } else if parser::is_unsign_column(&option_def.option) { is_unsign = true; } else if let Some(default_value_expr) = parser::get_default_value(&option_def.option) { @@ -1260,6 +1279,7 @@ fn parse_column(col: &ColumnDef) -> Result { let builder = column_schema::Builder::new(col.name.value.clone(), data_type) .is_nullable(is_nullable) .is_tag(is_tag) + .is_dictionary(is_dictionary) .comment(comment) .default_value(default_value); @@ -1441,6 +1461,7 @@ mod tests { data_type: String, is_nullable: false, is_tag: true, + is_dictionary: false, comment: "", escaped_name: "c1", default_value: None, @@ -1451,6 +1472,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "ts", default_value: None, @@ -1461,6 +1483,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c3", default_value: None, @@ -1471,6 +1494,7 @@ mod tests { data_type: UInt32, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c4", default_value: Some( @@ -1488,6 +1512,7 @@ mod tests { data_type: UInt32, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c5", default_value: Some( @@ -1514,6 +1539,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c6", default_value: Some( @@ -1612,6 +1638,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1622,6 +1649,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1632,6 +1660,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1642,6 +1671,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1652,6 +1682,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1662,6 +1693,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -1687,6 +1719,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1697,6 +1730,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1707,6 +1741,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1717,6 +1752,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1727,6 +1763,7 @@ mod tests { data_type: 
Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1737,6 +1774,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -1851,6 +1889,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1861,6 +1900,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1871,6 +1911,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1881,6 +1922,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1891,6 +1933,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1901,6 +1944,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -1920,6 +1964,119 @@ mod tests { .unwrap(); } + #[test] + fn test_alter_column_with_dictionary_encode() { + let sql = "ALTER TABLE test_table ADD column dic string dictionary;"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId( + 100, + ), + schema: Schema { + timestamp_index: 1, + tsid_index: None, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "key1", + default_value: None, + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "key2", + default_value: None, + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field1", + default_value: None, + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field2", + default_value: None, + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: Date, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field3", + default_value: None, + }, + ColumnSchema { + id: 6, + name: "field4", + data_type: Time, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field4", + default_value: None, + }, + ], + }, + version: 1, + primary_key_indexes: [ + 0, + 1, + ], + }, + }, + operations: AddColumn( + [ + ColumnSchema { + id: 0, + name: "dic", + data_type: String, + is_nullable: true, + is_tag: false, + is_dictionary: true, + comment: "", + escaped_name: "dic", + default_value: None, + }, + ], + ), + }, +)"#, + ) + .unwrap(); + } + #[test] fn test_alter_column_statement_to_plan() { let sql = "ALTER TABLE test_tablex ADD column add_col string;"; @@ -1946,6 +2103,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1956,6 +2114,7 @@ mod tests { 
data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1966,6 +2125,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1976,6 +2136,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1986,6 +2147,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1996,6 +2158,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -2017,6 +2180,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "add_col", default_value: None, @@ -2055,6 +2219,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -2065,6 +2230,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -2075,6 +2241,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -2085,6 +2252,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -2095,6 +2263,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -2105,6 +2274,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -2156,6 +2326,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -2166,6 +2337,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -2176,6 +2348,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -2186,6 +2359,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -2196,6 +2370,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -2206,6 +2381,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, diff --git a/rust-toolchain b/rust-toolchain deleted file mode 100644 index 3f36906f0f..0000000000 --- a/rust-toolchain +++ /dev/null @@ -1 +0,0 @@ -nightly-2023-02-02 diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000000..1680342afe --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "nightly-2023-02-02" +components = [ "rustfmt", "clippy" ] diff --git a/server/Cargo.toml b/server/Cargo.toml index 36f89ee2d1..eb5c523af5 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml 
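The planner now recognizes a `dictionary` option on column definitions, parsed the same way as `TAG` and carried through `column_schema::Builder::is_dictionary`. A rough sketch of the resulting SQL surface, based on the `ALTER TABLE ... ADD column dic string dictionary` statement exercised in the tests above; the table and column names here are illustrative only:

    CREATE TABLE `dict_demo` (
        t timestamp NOT NULL,
        region string DICTIONARY,
        value double,
        TIMESTAMP KEY (t)
    ) ENGINE = Analytic;

    ALTER TABLE `dict_demo` ADD COLUMN zone string DICTIONARY;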
@@ -35,7 +35,6 @@ meta_client = { workspace = true } opensrv-mysql = "0.1.0" partition_table_engine = { workspace = true } paste = { workspace = true } -pprof = { version = "0.11.1", features = ["flamegraph"] } profile = { workspace = true } prom-remote-api = { workspace = true, features = ["warp"] } prometheus = { workspace = true } diff --git a/server/src/grpc/metrics.rs b/server/src/grpc/metrics.rs index 94a1213e03..d86109766f 100644 --- a/server/src/grpc/metrics.rs +++ b/server/src/grpc/metrics.rs @@ -36,7 +36,15 @@ make_auto_flush_static_metric! { } pub label_enum RemoteEngineGrpcTypeKind { + write_succeeded, write_failed, + query_succeeded, + query_failed, + stream_query_succeeded, + stream_query_failed, + write_succeeded_row, + write_failed_row, + query_succeeded_row, } pub struct RemoteEngineGrpcHandlerCounterVec: LocalIntCounter { diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index e1b88eabf5..bd2401628c 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -74,12 +74,19 @@ impl RemoteEngineServiceImpl { }); let tx = tx.clone(); self.runtimes.read_runtime.spawn(async move { + let mut num_rows = 0; while let Some(batch) = stream.next().await { + if let Ok(record_batch) = &batch { + num_rows += record_batch.num_rows(); + } if let Err(e) = tx.send(batch).await { error!("Failed to send handler result, err:{}.", e); break; } } + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .query_succeeded_row + .inc_by(num_rows as u64); }); } @@ -331,21 +338,32 @@ async fn handle_stream_read( let begin = Instant::now(); let table = find_table_by_identifier(&ctx, &table_ident)?; - let streams = table + let res = table .partitioned_read(read_request) .await .box_err() .with_context(|| ErrWithCause { code: StatusCode::Internal, msg: format!("fail to read table, table:{table_ident:?}"), - })?; - - info!( + }); + match res { + Ok(streams) => { + info!( "Handle stream read success, request_id:{request_id}, table:{table_ident:?}, cost:{:?}", begin.elapsed(), ); - - Ok(streams) + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .stream_query_succeeded + .inc(); + Ok(streams) + } + Err(e) => { + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .stream_query_failed + .inc(); + Err(e) + } + } } async fn handle_write(ctx: HandlerContext, request: WriteRequest) -> Result { @@ -367,13 +385,20 @@ async fn handle_write(ctx: HandlerContext, request: WriteRequest) -> Result Ok(WriteResponse { - header: None, - affected_rows: affected_rows as u64, - }), + Ok(affected_rows) => { + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC.write_succeeded.inc(); + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .write_succeeded_row + .inc_by(affected_rows as u64); + Ok(WriteResponse { + header: None, + affected_rows: affected_rows as u64, + }) + } Err(e) => { + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC.write_failed.inc(); REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC - .write_failed + .write_failed_row .inc_by(num_rows as u64); Err(e) } diff --git a/server/src/grpc/storage_service/mod.rs b/server/src/grpc/storage_service/mod.rs index 7487b4fda8..ca758146ca 100644 --- a/server/src/grpc/storage_service/mod.rs +++ b/server/src/grpc/storage_service/mod.rs @@ -21,7 +21,7 @@ use ceresdbproto::{ use common_util::time::InstantExt; use futures::{stream, stream::BoxStream, StreamExt}; use http::StatusCode; -use proxy::{Context, Proxy}; +use proxy::{Context, Proxy, FORWARDED_FROM}; use query_engine::executor::Executor as QueryExecutor; use 
table_engine::engine::EngineRuntimes; @@ -138,6 +138,10 @@ impl StorageService for StorageServiceImpl { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; let stream = Self::stream_sql_query_internal(ctx, proxy, req).await; @@ -155,13 +159,17 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); let join_handle = self .runtimes @@ -186,13 +194,17 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.write_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); let join_handle = self.runtimes.write_runtime.spawn(async move { if req.context.is_none() { @@ -226,13 +238,18 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); + let join_handle = self .runtimes .read_runtime @@ -289,13 +306,18 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); + let join_handle = self.runtimes.read_runtime.spawn(async move { if req.context.is_none() { return PrometheusQueryResponse { @@ -329,13 +351,17 @@ impl StorageServiceImpl { ) -> Result, tonic::Status> { let mut total_success = 0; - let mut stream = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.write_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let mut stream = req.into_inner(); + let proxy = self.proxy.clone(); let join_handle = self.runtimes.write_runtime.spawn(async move { let mut resp = WriteResponse::default(); diff --git a/server/src/http.rs b/server/src/http.rs index 9e229fc539..15214fc454 100644 --- a/server/src/http.rs +++ b/server/src/http.rs @@ -3,8 +3,8 @@ //! 
Http service use std::{ - collections::HashMap, convert::Infallible, error::Error as StdError, fs::File, net::IpAddr, - sync::Arc, thread, time::Duration, + collections::HashMap, convert::Infallible, error::Error as StdError, net::IpAddr, sync::Arc, + time::Duration, }; use analytic_engine::setup::OpenedWals; @@ -80,6 +80,12 @@ pub enum Error { backtrace: Backtrace, }, + #[snafu(display("Fail to do cpu profiling, err:{}.\nBacktrace:\n{}", source, backtrace))] + ProfileCPU { + source: profile::Error, + backtrace: Backtrace, + }, + #[snafu(display("Fail to join async task, err:{}.", source))] JoinAsyncTask { source: common_util::runtime::Error }, @@ -184,8 +190,8 @@ impl Service { // debug APIs .or(self.flush_memtable()) .or(self.update_log_level()) - .or(self.heap_profile()) - .or(self.cpu_profile()) + .or(self.profile_cpu()) + .or(self.profile_heap()) .or(self.server_config()) .or(self.stats()) .with(warp::log("http_requests")) @@ -393,62 +399,51 @@ impl Service { warp::path!("metrics").and(warp::get()).map(metrics::dump) } - // GET /debug/heap_profile/{seconds} - fn heap_profile( + // GET /debug/profile/cpu/{seconds} + fn profile_cpu( &self, ) -> impl Filter + Clone { - warp::path!("debug" / "heap_profile" / ..) + warp::path!("debug" / "profile" / "cpu" / ..) .and(warp::path::param::()) .and(warp::get()) .and(self.with_profiler()) .and(self.with_runtime()) .and_then( |duration_sec: u64, profiler: Arc, runtime: Arc| async move { - let handle = runtime.spawn_blocking(move || { - profiler.dump_mem_prof(duration_sec).context(ProfileHeap) + let handle = runtime.spawn_blocking(move || -> Result<()> { + profiler.dump_cpu_prof(duration_sec).context(ProfileCPU) }); let result = handle.await.context(JoinAsyncTask); match result { - Ok(Ok(prof_data)) => Ok(prof_data.into_response()), - Ok(Err(e)) => Err(reject::custom(e)), + Ok(_) => Ok("ok"), Err(e) => Err(reject::custom(e)), } }, ) } - // GET /debug/cpu_profile/{seconds} - fn cpu_profile( + // GET /debug/profile/heap/{seconds} + fn profile_heap( &self, ) -> impl Filter + Clone { - warp::path!("debug" / "cpu_profile" / ..) + warp::path!("debug" / "profile" / "heap" / ..) .and(warp::path::param::()) .and(warp::get()) + .and(self.with_profiler()) .and(self.with_runtime()) - .and_then(|duration_sec: u64, runtime: Arc| async move { - let handle = runtime.spawn_blocking(move || -> Result<()> { - let guard = pprof::ProfilerGuardBuilder::default() - .frequency(100) - .blocklist(&["libc", "libgcc", "pthread", "vdso"]) - .build() - .box_err() - .context(Internal)?; - - thread::sleep(Duration::from_secs(duration_sec)); - - let report = guard.report().build().box_err().context(Internal)?; - let file = File::create("/tmp/flamegraph.svg") - .box_err() - .context(Internal)?; - report.flamegraph(file).box_err().context(Internal)?; - Ok(()) - }); - let result = handle.await.context(JoinAsyncTask); - match result { - Ok(_) => Ok("ok"), - Err(e) => Err(reject::custom(e)), - } - }) + .and_then( + |duration_sec: u64, profiler: Arc, runtime: Arc| async move { + let handle = runtime.spawn_blocking(move || { + profiler.dump_heap_prof(duration_sec).context(ProfileHeap) + }); + let result = handle.await.context(JoinAsyncTask); + match result { + Ok(Ok(prof_data)) => Ok(prof_data.into_response()), + Ok(Err(e)) => Err(reject::custom(e)), + Err(e) => Err(reject::custom(e)), + } + }, + ) } // GET /debug/config @@ -695,6 +690,7 @@ fn error_to_status_code(err: &Error) -> StatusCode { | Error::MissingProxy { .. } | Error::ParseIpAddr { .. } | Error::ProfileHeap { .. 
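Both profilers are now reached under `/debug/profile/` with the sampling duration in seconds as the trailing path segment. A hedged usage sketch; the host and port are assumptions (use whatever HTTP address the server is configured with), the CPU endpoint replies with a plain `ok` once `dump_cpu_prof` finishes, and the heap endpoint returns the profile data in the response body:

    curl 'http://127.0.0.1:5440/debug/profile/cpu/30'
    curl -o heap.prof 'http://127.0.0.1:5440/debug/profile/heap/30'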
} + | Error::ProfileCPU { .. } | Error::Internal { .. } | Error::JoinAsyncTask { .. } | Error::AlreadyStarted { .. } diff --git a/server/src/mysql/writer.rs b/server/src/mysql/writer.rs index e2af7880e7..af01fa09ec 100644 --- a/server/src/mysql/writer.rs +++ b/server/src/mysql/writer.rs @@ -150,6 +150,7 @@ mod tests { name: "id".to_string(), data_type: DatumKind::Int32, is_nullable: false, + is_dictionary: false, is_tag: false, comment: "".to_string(), escaped_name: "id".to_string(), @@ -163,6 +164,7 @@ mod tests { name: "name".to_string(), data_type: DatumKind::String, is_nullable: true, + is_dictionary: false, is_tag: true, comment: "".to_string(), escaped_name: "name".to_string(), @@ -177,6 +179,7 @@ mod tests { data_type: DatumKind::Timestamp, is_nullable: true, is_tag: true, + is_dictionary: false, comment: "".to_string(), escaped_name: "birthday".to_string(), default_value: None, @@ -190,6 +193,7 @@ mod tests { data_type: DatumKind::Boolean, is_nullable: true, is_tag: true, + is_dictionary: false, comment: "".to_string(), escaped_name: "is_show".to_string(), default_value: None, @@ -203,6 +207,7 @@ mod tests { data_type: DatumKind::Double, is_nullable: true, is_tag: true, + is_dictionary: false, comment: "".to_string(), escaped_name: "money".to_string(), default_value: None, diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 4e79a291fb..5e94d20d1b 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,7 +244,9 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0); + // TODO ensure there don't use is_dictionary and the datum.clone() is necessary + // ? + let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { builder .append(datum.clone()) diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs index 8d7638b685..821bdb6195 100644 --- a/table_engine/src/table.rs +++ b/table_engine/src/table.rs @@ -249,6 +249,7 @@ impl From for TableSeq { pub struct TableId(u64); impl TableId { + pub const MAX: TableId = TableId(u64::MAX); /// Min table id. pub const MIN: TableId = TableId(0); diff --git a/tools/src/bin/sst-metadata.rs b/tools/src/bin/sst-metadata.rs index 9eb81422bb..a089ad2da5 100644 --- a/tools/src/bin/sst-metadata.rs +++ b/tools/src/bin/sst-metadata.rs @@ -2,7 +2,7 @@ //! 
A cli to query sst meta data -use std::sync::Arc; +use std::{collections::HashMap, sync::Arc}; use analytic_engine::sst::{meta_data::cache::MetaData, parquet::async_reader::ChunkReaderAdapter}; use anyhow::{Context, Result}; @@ -13,7 +13,7 @@ use common_util::{ }; use futures::StreamExt; use object_store::{LocalFileSystem, ObjectMeta, ObjectStoreRef, Path}; -use parquet_ext::meta_data::fetch_parquet_metadata; +use parquet_ext::{meta_data::fetch_parquet_metadata, reader::ObjectStoreReader}; use tokio::{runtime::Handle, task::JoinSet}; #[derive(Parser, Debug)] @@ -30,6 +30,38 @@ struct Args { /// Thread num, 0 means cpu num #[clap(short, long, default_value_t = 0)] threads: usize, + + /// Print page indexes + #[clap(short, long, required(false))] + page_indexes: bool, +} + +#[derive(Default, Debug)] +struct FileStatistics { + file_count: u64, + size: usize, + metadata_size: usize, + kv_size: usize, + filter_size: usize, + row_num: i64, +} + +impl ToString for FileStatistics { + fn to_string(&self) -> String { + format!("FileStatistics {{\n\tfile_count: {},\n\tsize: {:.2},\n\tmetadata_size: {:.2}, \n\tkv_size: {:.2},\n\tfilter_size: {:.2},\n\trow_num: {},\n}}", + self.file_count, + as_mb(self.size), + as_mb(self.metadata_size), + as_mb(self.kv_size), + as_mb(self.filter_size), + self.row_num) + } +} + +#[derive(Default, Debug)] +struct FieldStatistics { + compressed_size: i64, + uncompressed_size: i64, } fn new_runtime(thread_num: usize) -> Runtime { @@ -64,6 +96,7 @@ async fn run(args: Args) -> Result<()> { let mut join_set = JoinSet::new(); let mut ssts = storage.list(None).await?; let verbose = args.verbose; + let page_indexes = args.page_indexes; while let Some(object_meta) = ssts.next().await { let object_meta = object_meta?; let storage = storage.clone(); @@ -71,7 +104,8 @@ async fn run(args: Args) -> Result<()> { join_set.spawn_on( async move { let (metadata, metadata_size, kv_size) = - parse_metadata(storage, location, object_meta.size, verbose).await?; + parse_metadata(storage, location, object_meta.size, verbose, page_indexes) + .await?; Ok::<_, anyhow::Error>((object_meta, metadata, metadata_size, kv_size)) }, &handle, @@ -93,6 +127,8 @@ async fn run(args: Args) -> Result<()> { .cmp(&b.1.custom().time_range.inclusive_start()) }); + let mut file_stats = FileStatistics::default(); + let mut field_stats_map = HashMap::new(); for (object_meta, sst_metadata, metadata_size, kv_size) in metas { let ObjectMeta { location, size, .. 
} = &object_meta; let custom_meta = sst_metadata.custom(); @@ -108,6 +144,27 @@ async fn run(args: Args) -> Result<()> { .unwrap_or(0); let file_metadata = parquet_meta.file_metadata(); let row_num = file_metadata.num_rows(); + + file_stats.file_count += 1; + file_stats.size += object_meta.size; + file_stats.metadata_size += metadata_size; + file_stats.kv_size += kv_size; + file_stats.filter_size += filter_size; + file_stats.row_num += row_num; + + let fields = file_metadata.schema().get_fields(); + for (_, row_group) in parquet_meta.row_groups().iter().enumerate() { + for i in 0..fields.len() { + let column_meta = row_group.column(i); + let field_name = fields.get(i).unwrap().get_basic_info().name().to_string(); + let mut field_stats = field_stats_map + .entry(field_name) + .or_insert(FieldStatistics::default()); + field_stats.compressed_size += column_meta.compressed_size(); + field_stats.uncompressed_size += column_meta.uncompressed_size(); + } + } + if verbose { println!("object_meta:{object_meta:?}, parquet_meta:{parquet_meta:?}, custom_meta:{custom_meta:?}"); } else { @@ -121,6 +178,17 @@ async fn run(args: Args) -> Result<()> { } } + println!("{}", file_stats.to_string()); + println!("FieldStatistics: "); + for (k, v) in field_stats_map.iter() { + println!( + "{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}", + k, + as_mb(v.compressed_size as usize), + as_mb(v.uncompressed_size as usize), + v.uncompressed_size as f64 / v.compressed_size as f64 + ); + } Ok(()) } @@ -133,9 +201,11 @@ async fn parse_metadata( path: Path, size: usize, verbose: bool, + page_indexes: bool, ) -> Result<(MetaData, usize, usize)> { let reader = ChunkReaderAdapter::new(&path, &storage); let (parquet_metadata, metadata_size) = fetch_parquet_metadata(size, &reader).await?; + let kv_metadata = parquet_metadata.file_metadata().key_value_metadata(); let kv_size = kv_metadata .map(|kvs| { @@ -155,6 +225,15 @@ async fn parse_metadata( }) .unwrap_or(0); - let md = MetaData::try_new(&parquet_metadata, false)?; + let md = if page_indexes { + let object_store_reader = + ObjectStoreReader::new(storage, path.clone(), Arc::new(parquet_metadata)); + let parquet_metadata = + parquet_ext::meta_data::meta_with_page_indexes(object_store_reader).await?; + MetaData::try_new(&parquet_metadata, false)? + } else { + MetaData::try_new(&parquet_metadata, false)? + }; + Ok((md, metadata_size, kv_size)) } diff --git a/wal/src/message_queue_impl/log_cleaner.rs b/wal/src/message_queue_impl/log_cleaner.rs index b1c4f05a85..074a01b29a 100644 --- a/wal/src/message_queue_impl/log_cleaner.rs +++ b/wal/src/message_queue_impl/log_cleaner.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! 
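The sst-metadata tool gains a `--page-indexes` switch (clap also derives a short form) and, after the per-file listing, prints the aggregated `FileStatistics` plus per-field compressed/uncompressed sizes. A hedged invocation sketch; the storage-location argument keeps whatever flag the tool already defines (it is not part of this hunk), so it appears as a placeholder:

    cargo run --release --bin sst-metadata -- <storage args> --threads 4 --page-indexes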
Log cleaner @@ -68,7 +68,7 @@ impl LogCleaner { pub async fn maybe_clean_logs(&mut self, safe_delete_offset: Offset) -> Result<()> { info!( - "Begin to check and clean logs, region id:{}, topic:{}, safe delete offset:{:?}", + "Region clean logs begin, region id:{}, topic:{}, safe delete offset:{:?}", self.region_id, self.log_topic, safe_delete_offset ); @@ -102,7 +102,7 @@ impl LogCleaner { } info!( - "Finished to check and clean logs, do clean:{}, region id:{}, topic:{}, prepare delete to offset:{:?}", + "Region clean logs finish, do clean:{}, region id:{}, topic:{}, prepare delete to offset:{:?}", do_clean, self.region_id, self.log_topic, safe_delete_offset ); diff --git a/wal/src/message_queue_impl/region.rs b/wal/src/message_queue_impl/region.rs index 49f331dfac..37beea0358 100644 --- a/wal/src/message_queue_impl/region.rs +++ b/wal/src/message_queue_impl/region.rs @@ -579,14 +579,14 @@ impl Region { let (snapshot, synchronizer) = { let inner = self.inner.write().await; - debug!( + info!( "Mark deleted entries to sequence num:{}, region id:{}, table id:{}", sequence_num, inner.region_context.region_id(), table_id ); - inner.mark_delete_to(table_id, sequence_num).await.unwrap(); + inner.mark_delete_to(table_id, sequence_num).await?; ( inner.make_meta_snapshot().await, @@ -618,6 +618,8 @@ impl Region { }; let safe_delete_offset = snapshot.safe_delete_offset(); + info!("Region clean logs, snapshot:{snapshot:?}, safe_delete_offset:{safe_delete_offset}"); + // Sync snapshot first. synchronizer .sync(snapshot) diff --git a/wal/src/rocks_impl/config.rs b/wal/src/rocks_impl/config.rs index 439313c144..966720f094 100644 --- a/wal/src/rocks_impl/config.rs +++ b/wal/src/rocks_impl/config.rs @@ -2,13 +2,26 @@ //! RocksDB Config +use common_util::config::ReadableSize; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct Config { + pub max_subcompactions: u32, pub max_background_jobs: i32, pub enable_statistics: bool, + pub write_buffer_size: ReadableSize, + pub max_write_buffer_number: i32, + // Number of files to trigger level-0 compaction. A value <0 means that level-0 compaction will + // not be triggered by number of files at all. + pub level_zero_file_num_compaction_trigger: i32, + // Soft limit on number of level-0 files. We start slowing down writes at this point. A value + // <0 means that no writing slow down will be triggered by number of files in level-0. + pub level_zero_slowdown_writes_trigger: i32, + // Maximum number of level-0 files. We stop writes at this point. 
+ pub level_zero_stop_writes_trigger: i32, + pub fifo_compaction_max_table_files_size: ReadableSize, } impl Default for Config { @@ -16,8 +29,16 @@ impl Default for Config { Self { // Same with rocksdb // https://github.com/facebook/rocksdb/blob/v6.4.6/include/rocksdb/options.h#L537 + max_subcompactions: 1, max_background_jobs: 2, enable_statistics: false, + write_buffer_size: ReadableSize::mb(64), + max_write_buffer_number: 2, + level_zero_file_num_compaction_trigger: 4, + level_zero_slowdown_writes_trigger: 20, + level_zero_stop_writes_trigger: 36, + // default is 1G, use 0 to disable fifo + fifo_compaction_max_table_files_size: ReadableSize::gb(0), } } } diff --git a/wal/src/rocks_impl/manager.rs b/wal/src/rocks_impl/manager.rs index b8a97b8e7a..2e4e70cf00 100644 --- a/wal/src/rocks_impl/manager.rs +++ b/wal/src/rocks_impl/manager.rs @@ -19,7 +19,10 @@ use common_types::{ }; use common_util::{error::BoxError, runtime::Runtime}; use log::{debug, info, warn}; -use rocksdb::{DBIterator, DBOptions, ReadOptions, SeekKey, Statistics, Writable, WriteBatch, DB}; +use rocksdb::{ + rocksdb_options::ColumnFamilyDescriptor, ColumnFamilyOptions, DBCompactionStyle, DBIterator, + DBOptions, FifoCompactionOptions, ReadOptions, SeekKey, Statistics, Writable, WriteBatch, DB, +}; use snafu::ResultExt; use tokio::sync::Mutex; @@ -525,8 +528,15 @@ impl RocksImpl { pub struct Builder { wal_path: String, runtime: Arc, + max_subcompactions: Option, max_background_jobs: Option, enable_statistics: Option, + write_buffer_size: Option, + max_write_buffer_number: Option, + level_zero_file_num_compaction_trigger: Option, + level_zero_slowdown_writes_trigger: Option, + level_zero_stop_writes_trigger: Option, + fifo_compaction_max_table_files_size: Option, } impl Builder { @@ -535,11 +545,23 @@ impl Builder { Self { wal_path: wal_path.to_str().unwrap().to_owned(), runtime, + max_subcompactions: None, max_background_jobs: None, enable_statistics: None, + write_buffer_size: None, + max_write_buffer_number: None, + level_zero_file_num_compaction_trigger: None, + level_zero_slowdown_writes_trigger: None, + level_zero_stop_writes_trigger: None, + fifo_compaction_max_table_files_size: None, } } + pub fn max_subcompactions(mut self, v: u32) -> Self { + self.max_subcompactions = Some(v); + self + } + pub fn max_background_jobs(mut self, v: i32) -> Self { self.max_background_jobs = Some(v); self @@ -550,10 +572,43 @@ impl Builder { self } + pub fn write_buffer_size(mut self, v: u64) -> Self { + self.write_buffer_size = Some(v); + self + } + + pub fn max_write_buffer_number(mut self, v: i32) -> Self { + self.max_write_buffer_number = Some(v); + self + } + + pub fn level_zero_file_num_compaction_trigger(mut self, v: i32) -> Self { + self.level_zero_file_num_compaction_trigger = Some(v); + self + } + + pub fn level_zero_slowdown_writes_trigger(mut self, v: i32) -> Self { + self.level_zero_slowdown_writes_trigger = Some(v); + self + } + + pub fn level_zero_stop_writes_trigger(mut self, v: i32) -> Self { + self.level_zero_stop_writes_trigger = Some(v); + self + } + + pub fn fifo_compaction_max_table_files_size(mut self, v: u64) -> Self { + self.fifo_compaction_max_table_files_size = Some(v); + self + } + pub fn build(self) -> Result { let mut rocksdb_config = DBOptions::default(); rocksdb_config.create_if_missing(true); + if let Some(v) = self.max_subcompactions { + rocksdb_config.set_max_subcompactions(v); + } if let Some(v) = self.max_background_jobs { rocksdb_config.set_max_background_jobs(v); } @@ -566,7 +621,38 @@ impl Builder 
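All of the new RocksDB knobs are plain serde fields with defaults, so they can be overridden from the WAL section of the TOML config. A hedged sketch with illustrative values; the section name is an assumption (the exact config path is not shown in this patch), and the `ReadableSize` fields take human-readable sizes:

    # section name assumed; place these under the existing RocksDB WAL config
    [analytic.wal]
    max_subcompactions = 2
    max_background_jobs = 4
    write_buffer_size = "128MB"
    max_write_buffer_number = 4
    level_zero_file_num_compaction_trigger = 4
    level_zero_slowdown_writes_trigger = 20
    level_zero_stop_writes_trigger = 36
    # any non-zero size switches the column family to FIFO compaction
    fifo_compaction_max_table_files_size = "2GB"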
{ None }; - let db = DB::open(rocksdb_config, &self.wal_path) + let mut cf_opts = ColumnFamilyOptions::new(); + if let Some(v) = self.write_buffer_size { + cf_opts.set_write_buffer_size(v); + } + if let Some(v) = self.max_write_buffer_number { + cf_opts.set_max_write_buffer_number(v); + } + if let Some(v) = self.level_zero_file_num_compaction_trigger { + cf_opts.set_level_zero_file_num_compaction_trigger(v); + } + if let Some(v) = self.level_zero_slowdown_writes_trigger { + cf_opts.set_level_zero_slowdown_writes_trigger(v); + } + if let Some(v) = self.level_zero_stop_writes_trigger { + cf_opts.set_level_zero_stop_writes_trigger(v); + } + + // FIFO compaction strategy let rocksdb looks like a message queue. + if let Some(v) = self.fifo_compaction_max_table_files_size { + if v > 0 { + let mut fifo_opts = FifoCompactionOptions::new(); + fifo_opts.set_max_table_files_size(v); + cf_opts.set_fifo_compaction_options(fifo_opts); + cf_opts.set_compaction_style(DBCompactionStyle::Fifo); + } + } + + let default_cfd = ColumnFamilyDescriptor { + options: cf_opts, + ..ColumnFamilyDescriptor::default() + }; + let db = DB::open_cf(rocksdb_config, &self.wal_path, vec![default_cfd]) .map_err(|e| e.into()) .context(Open { wal_path: self.wal_path.clone(), From 305df31d170a81e20e035d7422708ba60051f308 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 15:45:26 +0800 Subject: [PATCH 02/18] update --- Cargo.lock | 70 +++--------------------------------------------------- 1 file changed, 3 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 466fe8b216..5c45b2db7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -85,11 +85,7 @@ dependencies = [ "async-trait", "base64 0.13.1", "bytes", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "datafusion", @@ -1062,11 +1058,7 @@ checksum = "f5f27e14a7a0c030015c0fdb06c59c46cd6f9765e381bd920e02ff316b3be48b" dependencies = [ "arrow", "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.5", -======= "ceresdbproto 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", ->>>>>>> 0abc9181 (update pb) "dashmap 5.4.0", "futures 0.3.28", "paste 1.0.12", @@ -1092,8 +1084,9 @@ dependencies = [ [[package]] name = "ceresdbproto" -version = "1.0.4" -source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=53f5c74a54d8a08ebb08c41e8b862b2369df4a02#53f5c74a54d8a08ebb08c41e8b862b2369df4a02" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" dependencies = [ "prost", "protoc-bin-vendored", @@ -1105,12 +1098,7 @@ dependencies = [ [[package]] name = "ceresdbproto" version = "1.0.5" -<<<<<<< HEAD -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" -======= source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" ->>>>>>> 0abc9181 (update pb) dependencies = [ "prost", "protoc-bin-vendored", @@ -1263,11 +1251,7 @@ name = "cluster" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) 
"common_types", "common_util", "etcd-client", @@ -1320,11 +1304,7 @@ dependencies = [ "arrow_ext", "byteorder", "bytes_ext", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "datafusion", "murmur3", @@ -1343,11 +1323,7 @@ version = "1.2.2" dependencies = [ "arrow", "backtrace", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "common_types", "crossbeam-utils 0.8.15", @@ -3460,11 +3436,7 @@ name = "meta_client" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -3953,11 +3925,7 @@ version = "1.2.2" dependencies = [ "async-trait", "bytes", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "clru", "common_types", @@ -4755,11 +4723,7 @@ dependencies = [ "async-trait", "bytes", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -4872,11 +4836,7 @@ dependencies = [ "arrow", "async-trait", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -5184,11 +5144,7 @@ version = "1.2.2" dependencies = [ "arrow_ext", "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -5314,11 +5270,7 @@ name = "router" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -5673,11 +5625,7 @@ dependencies = [ "async-trait", "bytes", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -6216,11 +6164,7 @@ dependencies = [ "arrow", "async-trait", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -6239,11 +6183,7 @@ dependencies = [ "arrow", "arrow_ext", "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", 
"common_util", "datafusion", @@ -7057,11 +6997,7 @@ name = "wal" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "common_types", "common_util", From d885790c4b28416c27029aa58a3ea7d117a45b0b Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 15:50:50 +0800 Subject: [PATCH 03/18] remove comment --- Cargo.lock | 1 - common_types/src/column.rs | 7 ------- 2 files changed, 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 662e846f41..5c45b2db7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1099,7 +1099,6 @@ dependencies = [ name = "ceresdbproto" version = "1.0.5" source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" - dependencies = [ "prost", "protoc-bin-vendored", diff --git a/common_types/src/column.rs b/common_types/src/column.rs index a2580a47c2..9e9a390fd7 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -375,13 +375,6 @@ impl_dedup!(VarbinaryColumn); impl_dedup!(StringColumn); impl StringDictionaryColumn { - #[doc = " If datum i is not equal to previous datum i - 1, mark `selected[i]` to"] - #[doc = " true."] - #[doc = ""] - #[doc = " The first datum is marked to true."] - #[doc = ""] - #[doc = " The size of selected must equal to the size of this column and"] - #[doc = " initialized to false."] #[allow(clippy::float_cmp)] pub fn dedup(&self, selected: &mut [bool]) { if self.0.is_empty() { From 1e1437e58d522d35efabf7436b891f36f20ab5fd Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 16:26:27 +0800 Subject: [PATCH 04/18] remove sdk related code --- interpreters/src/describe.rs | 4 ---- interpreters/src/show_create.rs | 5 ----- interpreters/src/tests.rs | 21 ++++++++++----------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/interpreters/src/describe.rs b/interpreters/src/describe.rs index cf415cc6e0..c0944ed81b 100644 --- a/interpreters/src/describe.rs +++ b/interpreters/src/describe.rs @@ -46,14 +46,12 @@ impl DescribeInterpreter { let mut is_primary_keys = Vec::with_capacity(num_columns); let mut is_nullables = Vec::with_capacity(num_columns); let mut is_tags = Vec::with_capacity(num_columns); - let mut is_dictionarys = Vec::with_capacity(num_columns); for (idx, col) in table_schema.columns().iter().enumerate() { names.push(col.name.to_string()); types.push(col.data_type.to_string()); is_primary_keys.push(table_schema.is_primary_key_index(&idx)); is_nullables.push(col.is_nullable); is_tags.push(col.is_tag); - is_dictionarys.push(col.is_dictionary); } let schema = Schema::new(vec![ @@ -62,7 +60,6 @@ impl DescribeInterpreter { Field::new("is_primary", DataType::Boolean, false), Field::new("is_nullable", DataType::Boolean, false), Field::new("is_tag", DataType::Boolean, false), - Field::new("is_dictionary", DataType::Boolean, false), ]); let arrow_record_batch = RecordBatch::try_new( @@ -73,7 +70,6 @@ impl DescribeInterpreter { Arc::new(BooleanArray::from(is_primary_keys)), Arc::new(BooleanArray::from(is_nullables)), Arc::new(BooleanArray::from(is_tags)), - Arc::new(BooleanArray::from(is_dictionarys)), ], ) .unwrap(); diff --git a/interpreters/src/show_create.rs b/interpreters/src/show_create.rs index a6e4b3dcdf..29b6048355 100644 --- a/interpreters/src/show_create.rs +++ b/interpreters/src/show_create.rs @@ 
-86,11 +86,6 @@ impl ShowCreateInterpreter { if col.is_tag { res += " TAG"; } - - if col.is_dictionary { - res += " DICTIONARY"; - } - if !col.is_nullable { res += " NOT NULL"; } diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs index 6a4473eee6..2b2f93b184 100644 --- a/interpreters/src/tests.rs +++ b/interpreters/src/tests.rs @@ -170,18 +170,17 @@ where let sql = "desc table test_table"; let output = self.sql_to_output(sql).await.unwrap(); let records = output.try_into().unwrap(); - // todo this maybe need to change let expected = vec![ - "+--------+-----------+------------+-------------+--------+---------------+", - "| name | type | is_primary | is_nullable | is_tag | is_dictionary |", - "+--------+-----------+------------+-------------+--------+---------------+", - "| key1 | varbinary | true | false | false | false |", - "| key2 | timestamp | true | false | false | false |", - "| field1 | double | false | true | false | false |", - "| field2 | string | false | true | false | false |", - "| field3 | date | false | true | false | false |", - "| field4 | time | false | true | false | false |", - "+--------+-----------+------------+-------------+--------+---------------+", + "+--------+-----------+------------+-------------+--------+", + "| name | type | is_primary | is_nullable | is_tag |", + "+--------+-----------+------------+-------------+--------+", + "| key1 | varbinary | true | false | false |", + "| key2 | timestamp | true | false | false |", + "| field1 | double | false | true | false |", + "| field2 | string | false | true | false |", + "| field3 | date | false | true | false |", + "| field4 | time | false | true | false |", + "+--------+-----------+------------+-------------+--------+", ]; common_util::record_batch::assert_record_batches_eq(&expected, records); } From 6f9d38f387fd620732123070527cc7d7855ab60f Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 16:44:29 +0800 Subject: [PATCH 05/18] update result --- integration_tests/cases/env/local/ddl/create_tables.result | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/cases/env/local/ddl/create_tables.result b/integration_tests/cases/env/local/ddl/create_tables.result index ede67e9c1a..87779ac52b 100644 --- a/integration_tests/cases/env/local/ddl/create_tables.result +++ b/integration_tests/cases/env/local/ddl/create_tables.result @@ -48,7 +48,7 @@ affected_rows: 0 CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. 
Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"c1\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"c1\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t." }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"c1\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"c1\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t." }) create table `05_create_tables_t2`(a int, b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic with (enable_ttl='false'); @@ -67,11 +67,11 @@ Int32(4), create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. 
Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." }) create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. 
Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." 
}) create table `05_create_tables_t3`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; From 72170a363c94229784d28c9272a59f35e3a0dab2 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 16:56:33 +0800 Subject: [PATCH 06/18] update ceresdb proto version --- Cargo.lock | 46 +++++++++++++++++----------------------------- Cargo.toml | 3 +-- 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c45b2db7d..e1430f278e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -85,7 +85,7 @@ dependencies = [ "async-trait", "base64 0.13.1", "bytes", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "datafusion", @@ -1058,7 +1058,7 @@ checksum = "f5f27e14a7a0c030015c0fdb06c59c46cd6f9765e381bd920e02ff316b3be48b" dependencies = [ "arrow", "async-trait", - "ceresdbproto 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "ceresdbproto", "dashmap 5.4.0", "futures 0.3.28", "paste 1.0.12", @@ -1084,21 +1084,9 @@ dependencies = [ [[package]] name = "ceresdbproto" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" -dependencies = [ - "prost", - "protoc-bin-vendored", - "tonic 0.8.3", - "tonic-build", - "walkdir", -] - -[[package]] -name = "ceresdbproto" -version = "1.0.5" -source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" +checksum = "81229e82e9afa8318e7f765cc01cd15f7380786699f4c7beceec7540e0488d7e" dependencies = [ "prost", "protoc-bin-vendored", @@ -1251,7 +1239,7 @@ name = "cluster" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "etcd-client", @@ -1304,7 +1292,7 @@ dependencies = [ "arrow_ext", "byteorder", "bytes_ext", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "datafusion", "murmur3", @@ -1323,7 +1311,7 @@ version = "1.2.2" dependencies = [ "arrow", "backtrace", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "common_types", "crossbeam-utils 0.8.15", @@ -3436,7 +3424,7 @@ name = "meta_client" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "futures 0.3.28", @@ -3925,7 +3913,7 @@ version = "1.2.2" dependencies = [ "async-trait", "bytes", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "clru", "common_types", @@ -4723,7 +4711,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "clru", "cluster", "common_types", @@ -4836,7 +4824,7 @@ dependencies = [ "arrow", "async-trait", "catalog", - "ceresdbproto 1.0.5 
(git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "cluster", "common_types", "common_util", @@ -5144,7 +5132,7 @@ version = "1.2.2" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "futures 0.3.28", @@ -5270,7 +5258,7 @@ name = "router" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "cluster", "common_types", "common_util", @@ -5625,7 +5613,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "clru", "cluster", "common_types", @@ -6164,7 +6152,7 @@ dependencies = [ "arrow", "async-trait", "catalog", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "futures 0.3.28", @@ -6183,7 +6171,7 @@ dependencies = [ "arrow", "arrow_ext", "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "datafusion", @@ -6997,7 +6985,7 @@ name = "wal" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "common_types", "common_util", diff --git a/Cargo.toml b/Cargo.toml index cfb2a37dbd..d94dd002f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,8 +68,7 @@ bytes = "1.1.0" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = { git = "https://github.com/tanruixiang/ceresdbproto.git", rev = "6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" } - +ceresdbproto = "1.0" chrono = "0.4" clap = "3.0" clru = "0.6.1" From 41b39704883fd71b72424e74f10add1ac07ea29a Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 10:52:25 +0800 Subject: [PATCH 07/18] adjust testcase and comment --- analytic_engine/src/sst/parquet/hybrid.rs | 2 +- analytic_engine/src/sst/parquet/writer.rs | 239 +++++++--------------- common_types/src/column.rs | 7 +- common_types/src/column_schema.rs | 2 +- common_types/src/tests.rs | 42 ++-- df_operator/src/udfs/time_bucket.rs | 2 +- proxy/src/write.rs | 4 +- query_frontend/src/planner.rs | 12 +- table_engine/src/memory.rs | 2 +- 9 files changed, 105 insertions(+), 207 deletions(-) diff --git a/analytic_engine/src/sst/parquet/hybrid.rs b/analytic_engine/src/sst/parquet/hybrid.rs index 1cf7481ecf..ee86496f48 100644 --- a/analytic_engine/src/sst/parquet/hybrid.rs +++ b/analytic_engine/src/sst/parquet/hybrid.rs @@ -127,7 +127,7 @@ pub fn build_hybrid_arrow_schema(schema: &Schema) -> ArrowSchemaRef { field.data_type().clone(), true, ))); - // TODO is there need to use new_dict? + // TODO(tanruixiang) is there need to use new_dict? 
Arc::new(Field::new(field.name(), field_type, true)) } else { field.clone() diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 378854bafa..3d5e896c06 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -333,7 +333,7 @@ mod tests { use common_types::{ bytes::Bytes, projected_schema::ProjectedSchema, - tests::{build_row, build_row_for_dictionary, build_schema, build_schema_for_dictionary}, + tests::{build_row, build_row_for_dictionary, build_schema, build_schema_with_dictionary}, time::{TimeRange, Timestamp}, }; use common_util::{ @@ -358,7 +358,18 @@ mod tests { table_options::{self, StorageFormatHint}, }; - fn write_parquet_with_dictionary_encode_and_read_back( + #[test] + fn test_parquet_build_and_read() { + init_log_for_test(); + + let runtime = Arc::new(runtime::Builder::default().build().unwrap()); + parquet_write_and_then_read_back(runtime.clone(), 2, vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2]); + parquet_write_and_then_read_back(runtime.clone(), 3, vec![3, 3, 3, 3, 3, 3, 2]); + parquet_write_and_then_read_back(runtime.clone(), 4, vec![4, 4, 4, 4, 4]); + parquet_write_and_then_read_back(runtime, 5, vec![5, 5, 5, 5]); + } + + fn parquet_write_and_then_read_back( runtime: Arc, num_rows_per_row_group: usize, expected_num_rows: Vec, @@ -376,9 +387,9 @@ mod tests { let root = dir.path(); let store: ObjectStoreRef = Arc::new(LocalFileSystem::new_with_prefix(root).unwrap()); let store_picker: ObjectStorePickerRef = Arc::new(store); - let sst_file_path = Path::from("test_dictionary.par"); + let sst_file_path = Path::from("data.par"); - let schema = build_schema_for_dictionary(); + let schema = build_schema_with_dictionary(); let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); let sst_meta = MetaData { min_key: Bytes::from_static(b"100"), @@ -395,16 +406,45 @@ mod tests { } counter -= 1; + // reach here when counter is 9 7 5 3 1 let ts = 100 + counter; let rows = vec![ - build_row_for_dictionary(1, ts, Some("tagv1"), "tagv2", 1), - build_row_for_dictionary(2, ts, Some("tagv2"), "tagv2", 2), - build_row_for_dictionary(3, ts, None, "tagv3", 3), - build_row_for_dictionary(4, ts, Some("tagv3"), "tagv2", 2), + build_row_for_dictionary( + b"a", + ts, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tagv1"), + "tagv2", + ), + build_row_for_dictionary( + b"b", + ts, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tagv2"), + "tagv4", + ), + build_row_for_dictionary(b"c", ts, 10.0, "v4", 1000, 1_000_000, None, "tagv2"), + build_row_for_dictionary( + b"d", + ts, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tagv3"), + "tagv2", + ), ]; let batch = build_record_batch_with_key(schema.clone(), rows); Poll::Ready(Some(Ok(batch))) })); + let mut writer = sst_factory .create_writer( &sst_write_options, @@ -466,186 +506,55 @@ mod tests { Box::new(reader) }; + let mut stream = reader.read().await.unwrap(); let mut expect_rows = vec![]; for counter in &[4, 3, 2, 1, 0] { expect_rows.push(build_row_for_dictionary( - 1, + b"a", 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, Some("tagv1"), "tagv2", - 1, )); expect_rows.push(build_row_for_dictionary( - 2, + b"b", 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, Some("tagv2"), + "tagv4", + )); + expect_rows.push(build_row_for_dictionary( + b"c", + 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, + None, "tagv2", - 2, )); - expect_rows.push(build_row_for_dictionary(3, 100 + counter, None, "tagv3", 3)); expect_rows.push(build_row_for_dictionary( 
- 4, + b"d", 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, Some("tagv3"), "tagv2", - 2, )); } check_stream(&mut stream, expect_rows).await; }); } - // TODO(xikai): add test for reverse reader - #[test] - fn test_parquet_use_dictionary() { - init_log_for_test(); - - let runtime = Arc::new(runtime::Builder::default().build().unwrap()); - write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 5, vec![5, 5, 5, 5]); - write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 4, vec![4, 4, 4, 4, 4]); - write_parquet_with_dictionary_encode_and_read_back( - runtime.clone(), - 3, - vec![3, 3, 3, 3, 3, 3, 2], - ); - write_parquet_with_dictionary_encode_and_read_back( - runtime, - 2, - vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2], - ); - } - #[test] - fn test_parquet_build_and_read() { - init_log_for_test(); - - let runtime = Arc::new(runtime::Builder::default().build().unwrap()); - parquet_write_and_then_read_back(runtime.clone(), 3, vec![3, 3, 3, 3, 3]); - parquet_write_and_then_read_back(runtime.clone(), 4, vec![4, 4, 4, 3]); - parquet_write_and_then_read_back(runtime, 5, vec![5, 5, 5]); - } - - fn parquet_write_and_then_read_back( - runtime: Arc, - num_rows_per_row_group: usize, - expected_num_rows: Vec, - ) { - runtime.block_on(async { - let sst_factory = FactoryImpl; - let sst_write_options = SstWriteOptions { - storage_format_hint: StorageFormatHint::Auto, - num_rows_per_row_group, - compression: table_options::Compression::Uncompressed, - max_buffer_size: 0, - }; - - let dir = tempdir().unwrap(); - let root = dir.path(); - let store: ObjectStoreRef = Arc::new(LocalFileSystem::new_with_prefix(root).unwrap()); - let store_picker: ObjectStorePickerRef = Arc::new(store); - let sst_file_path = Path::from("data.par"); - - let schema = build_schema(); - let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); - let sst_meta = MetaData { - min_key: Bytes::from_static(b"100"), - max_key: Bytes::from_static(b"200"), - time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), - max_sequence: 200, - schema: schema.clone(), - }; - - let mut counter = 5; - let record_batch_stream = Box::new(stream::poll_fn(move |_| -> Poll> { - if counter == 0 { - return Poll::Ready(None); - } - counter -= 1; - - // reach here when counter is 9 7 5 3 1 - let ts = 100 + counter; - let rows = vec![ - build_row(b"a", ts, 10.0, "v4", 1000, 1_000_000), - build_row(b"b", ts, 10.0, "v4", 1000, 1_000_000), - build_row(b"c", ts, 10.0, "v4", 1000, 1_000_000), - ]; - let batch = build_record_batch_with_key(schema.clone(), rows); - Poll::Ready(Some(Ok(batch))) - })); - - let mut writer = sst_factory - .create_writer( - &sst_write_options, - &sst_file_path, - &store_picker, - Level::MAX, - ) - .await - .unwrap(); - let sst_info = writer - .write(RequestId::next_id(), &sst_meta, record_batch_stream) - .await - .unwrap(); - - assert_eq!(15, sst_info.row_num); - - let scan_options = ScanOptions::default(); - // read sst back to test - let sst_read_options = SstReadOptions { - reverse: false, - frequency: ReadFrequency::Frequent, - num_rows_per_row_group: 5, - projected_schema: reader_projected_schema, - predicate: Arc::new(Predicate::empty()), - meta_cache: None, - scan_options, - runtime: runtime.clone(), - }; - - let mut reader: Box = { - let mut reader = AsyncParquetReader::new( - &sst_file_path, - &sst_read_options, - None, - &store_picker, - None, - ); - let mut sst_meta_readback = reader - .meta_data() - .await - .unwrap() - .as_parquet() - .unwrap() - .as_ref() - .clone(); - 
// sst filter is built insider sst writer, so overwrite to default for - // comparison. - sst_meta_readback.parquet_filter = Default::default(); - assert_eq!(&sst_meta_readback, &ParquetMetaData::from(sst_meta)); - assert_eq!( - expected_num_rows, - reader - .row_groups() - .await - .iter() - .map(|g| g.num_rows()) - .collect::>() - ); - - Box::new(reader) - }; - - let mut stream = reader.read().await.unwrap(); - let mut expect_rows = vec![]; - for counter in &[4, 3, 2, 1, 0] { - expect_rows.push(build_row(b"a", 100 + counter, 10.0, "v4", 1000, 1_000_000)); - expect_rows.push(build_row(b"b", 100 + counter, 10.0, "v4", 1000, 1_000_000)); - expect_rows.push(build_row(b"c", 100 + counter, 10.0, "v4", 1000, 1_000_000)); - } - check_stream(&mut stream, expect_rows).await; - }); - } - #[tokio::test] async fn test_fetch_row_group() { // rows per group: 10 diff --git a/common_types/src/column.rs b/common_types/src/column.rs index 9e9a390fd7..b1656e2cfc 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -292,7 +292,7 @@ impl_column!( impl_column!(StringColumn, get_string_datum, get_string_datum_view); impl StringDictionaryColumn { - #[doc = " Get datum by index."] + /// Get datum by index pub fn datum_opt(&self, index: usize) -> Option { if index >= self.0.len() { return None; @@ -311,7 +311,7 @@ impl StringDictionaryColumn { if self.0.is_null(index) { return DatumView::Null; } - // TODO : Is this the efficient way? + // TODO(tanruixiang): Is this the efficient way? DatumView::String(self.0.downcast_dict::().unwrap().value(index)) } @@ -319,7 +319,7 @@ impl StringDictionaryColumn { if self.0.is_null(index) { return Datum::Null; } - // TODO : Is this the efficient way? + // TODO(tanruixiang): Is this the efficient way? Datum::String( self.0 .downcast_dict::() @@ -375,7 +375,6 @@ impl_dedup!(VarbinaryColumn); impl_dedup!(StringColumn); impl StringDictionaryColumn { - #[allow(clippy::float_cmp)] pub fn dedup(&self, selected: &mut [bool]) { if self.0.is_empty() { return; diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 6deaefa5c4..606d04eaf5 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -336,7 +336,7 @@ impl From<&ColumnSchema> for Field { col_schema.is_nullable, col_schema.id.into(), false, - // Todo how to use dict_is_ordered + // TODO(tanruixiang) how to use dict_is_ordered ) } else { Field::new( diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index fd81255da2..0839c6c121 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -145,24 +145,12 @@ pub fn build_schema() -> Schema { pub fn build_default_value_schema() -> Schema { default_value_schema_builder().build().unwrap() } + /// Build a schema for testing: -/// (tsid(uint64), key2(timestamp), tag1(string), tag2(string), value(int8), -/// field2(float)) -pub fn build_schema_for_dictionary() -> Schema { - let builder = schema::Builder::new() - .auto_increment_column_id(true) - .add_key_column( - column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) - .build() - .unwrap(), - ) - .unwrap() - .add_key_column( - column_schema::Builder::new("time".to_string(), DatumKind::Timestamp) - .build() - .unwrap(), - ) - .unwrap() +/// (key1(varbinary), key2(timestamp), field1(double), field2(string), +/// field3(date), field4(time)) tag1(string dictionary), tag2(string dictionary) +pub fn build_schema_with_dictionary() -> Schema { + let builder = base_schema_builder() .add_normal_column( 
column_schema::Builder::new("tag1".to_string(), DatumKind::String) .is_tag(true) @@ -179,12 +167,6 @@ pub fn build_schema_for_dictionary() -> Schema { .build() .unwrap(), ) - .unwrap() - .add_normal_column( - column_schema::Builder::new("value".to_string(), DatumKind::Int8) - .build() - .unwrap(), - ) .unwrap(); builder.build().unwrap() @@ -239,19 +221,25 @@ pub fn build_schema_for_cpu() -> Schema { } pub fn build_row_for_dictionary( - key1: u64, + key1: &[u8], key2: i64, + field1: f64, + field2: &str, + field3: i32, + field4: i64, tag1: Option<&str>, tag2: &str, - value: i8, ) -> Row { let datums = vec![ - Datum::UInt64(key1), + Datum::Varbinary(Bytes::copy_from_slice(key1)), Datum::Timestamp(Timestamp::new(key2)), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + Datum::Date(field3), + Datum::Time(field4), tag1.map(|v| Datum::String(StringBytes::from(v))) .unwrap_or(Datum::Null), Datum::String(StringBytes::from(tag2)), - Datum::Int8(value), ]; Row::from_datums(datums) } diff --git a/df_operator/src/udfs/time_bucket.rs b/df_operator/src/udfs/time_bucket.rs index bb4c6b29bb..29d2932aff 100644 --- a/df_operator/src/udfs/time_bucket.rs +++ b/df_operator/src/udfs/time_bucket.rs @@ -141,7 +141,7 @@ impl<'a> TimeBucket<'a> { } fn call(&self) -> Result { - // TODO mising is_dictionary params + // TODO(tanruixiang) : mising is_dictionary params let mut out_column_builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows(), false); for ts_opt in self.column.iter() { diff --git a/proxy/src/write.rs b/proxy/src/write.rs index 008489bfdc..0cae62fbb7 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -684,7 +684,7 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; } @@ -701,7 +701,7 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? build_column(&mut columns, schema, field_name, &field.value, false, false)?; } } diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 2baf01be10..30f1ad6e74 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -431,7 +431,8 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - // Todo is_dictionary set true or false ? Do we need modify the pb ? + // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb + // ? ensure_data_type_compatible( table, tag_name, @@ -442,7 +443,8 @@ pub fn build_schema_from_write_table_request( )?; } - // Todo is_dictionary set true or false ? Do we need modify the pb ? + // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb + // ? let column_schema = build_column_schema(tag_name, data_type, true, false)?; name_column_map.insert(tag_name, column_schema); } @@ -469,7 +471,7 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? 
ensure_data_type_compatible( table, field_name, @@ -479,7 +481,7 @@ pub fn build_schema_from_write_table_request( column_schema, )?; } - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? let column_schema = build_column_schema(field_name, data_type, false, false)?; name_column_map.insert(field_name, column_schema); } @@ -530,7 +532,7 @@ fn ensure_data_type_compatible( data_type: DatumKind, column_schema: &ColumnSchema, ) -> Result<()> { - // Todo how to check is_dictionary ? + // TODO(tanruixiang) : how to check is_dictionary ? ensure!( column_schema.is_tag == is_tag, InvalidWriteEntry { diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 5e94d20d1b..846a6e18b3 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,7 +244,7 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - // TODO ensure there don't use is_dictionary and the datum.clone() is necessary + // TODO(tanruixiang) : ensure there don't use is_dictionary and the datum.clone() is necessary // ? let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { From 8f705c83cf95491ace35bab7de35b2bafd53728d Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 11:01:30 +0800 Subject: [PATCH 08/18] update --- analytic_engine/src/sst/parquet/writer.rs | 2 ++ table_engine/src/memory.rs | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 3d5e896c06..3040746efc 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -358,6 +358,8 @@ mod tests { table_options::{self, StorageFormatHint}, }; + // TODO(xikai): add test for reverse reader + #[test] fn test_parquet_build_and_read() { init_log_for_test(); diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 846a6e18b3..0755552398 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,8 +244,8 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - // TODO(tanruixiang) : ensure there don't use is_dictionary and the datum.clone() is necessary - // ? + // TODO(tanruixiang) : ensure there don't use is_dictionary and the + // datum.clone() is necessary ? 
let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { builder From 736867ba4a03a102a7aefbf4eee843adcc498f55 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 11:15:53 +0800 Subject: [PATCH 09/18] fmt --- analytic_engine/src/sst/parquet/writer.rs | 2 +- common_types/src/column.rs | 68 +++++++++++++++++++---- 2 files changed, 57 insertions(+), 13 deletions(-) diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 3040746efc..0e1b5f3658 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -359,7 +359,7 @@ mod tests { }; // TODO(xikai): add test for reverse reader - + #[test] fn test_parquet_build_and_read() { init_log_for_test(); diff --git a/common_types/src/column.rs b/common_types/src/column.rs index b1656e2cfc..f10b1aa35a 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -1201,7 +1201,7 @@ impl ColumnBlockBuilder { mod tests { use super::*; use crate::tests::{ - build_row_for_dictionary, build_rows, build_schema, build_schema_for_dictionary, + build_row_for_dictionary, build_rows, build_schema, build_schema_with_dictionary, }; #[test] @@ -1244,28 +1244,72 @@ mod tests { #[test] fn test_column_block_string_dictionary_builder() { - let schema = build_schema_for_dictionary(); + let schema = build_schema_with_dictionary(); let rows = vec![ - build_row_for_dictionary(1, 1, Some("tag1_1"), "tag2_1", 1), - build_row_for_dictionary(2, 2, Some("tag1_2"), "tag2_2", 2), - build_row_for_dictionary(3, 3, Some("tag1_3"), "tag2_3", 3), - build_row_for_dictionary(4, 4, Some("tag1_1"), "tag2_4", 3), - build_row_for_dictionary(5, 5, Some("tag1_3"), "tag2_4", 4), - build_row_for_dictionary(6, 6, None, "tag2_4", 4), + build_row_for_dictionary( + b"a", + 1, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_1"), + "tag2_1", + ), + build_row_for_dictionary( + b"b", + 2, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_2"), + "tag2_2", + ), + build_row_for_dictionary( + b"c", + 3, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_3"), + "tag2_3", + ), + build_row_for_dictionary( + b"d", + 4, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_1"), + "tag2_4", + ), + build_row_for_dictionary( + b"e", + 5, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_3"), + "tag2_4", + ), + build_row_for_dictionary(b"f", 6, 10.0, "v4", 1000, 1_000_000, None, "tag2_4"), ]; // DatumKind::String , is_dictionary = true - let column = schema.column(2); - println!("{column:?}"); + let column = schema.column(6); let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 0, column.is_dictionary); // append - (0..rows.len()).for_each(|i| builder.append(rows[i][2].clone()).unwrap()); + (0..rows.len()).for_each(|i| builder.append(rows[i][6].clone()).unwrap()); let ret = builder.append(rows[0][0].clone()); assert!(ret.is_err()); // append_view - builder.append_view(rows[5][2].as_view()).unwrap(); + builder.append_view(rows[5][6].as_view()).unwrap(); let ret = builder.append_view(rows[1][0].as_view()); assert!(ret.is_err()); From 4e54955b6c978cf4ee140ce0faa98ad9624e14aa Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 11:23:24 +0800 Subject: [PATCH 10/18] adjust cilppy --- common_types/src/tests.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index 0839c6c121..b43269427f 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -220,6 +220,7 
@@ pub fn build_schema_for_cpu() -> Schema { builder.build().unwrap() } +#[allow(clippy::too_many_arguments)] pub fn build_row_for_dictionary( key1: &[u8], key2: i64, From 688437cb4cf0d209d1a2df3f89087760bdaee818 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 14:43:39 +0800 Subject: [PATCH 11/18] adjust todo comment --- analytic_engine/src/sst/parquet/hybrid.rs | 4 ++-- common_types/src/column.rs | 4 ---- common_types/src/column_schema.rs | 2 +- proxy/src/http/prom.rs | 2 +- proxy/src/influxdb/types.rs | 2 +- proxy/src/write.rs | 4 ++-- query_frontend/src/planner.rs | 12 ++++++------ table_engine/src/memory.rs | 2 +- 8 files changed, 14 insertions(+), 18 deletions(-) diff --git a/analytic_engine/src/sst/parquet/hybrid.rs b/analytic_engine/src/sst/parquet/hybrid.rs index ee86496f48..3d155bc01b 100644 --- a/analytic_engine/src/sst/parquet/hybrid.rs +++ b/analytic_engine/src/sst/parquet/hybrid.rs @@ -127,7 +127,7 @@ pub fn build_hybrid_arrow_schema(schema: &Schema) -> ArrowSchemaRef { field.data_type().clone(), true, ))); - // TODO(tanruixiang) is there need to use new_dict? + // TODO(tanruixiang): is there need to use new_dict? Arc::new(Field::new(field.name(), field_type, true)) } else { field.clone() @@ -419,7 +419,7 @@ impl ListArrayBuilder { let array_len = self.multi_row_arrays.len(); let mut offsets = MutableBuffer::new(array_len * std::mem::size_of::()); let child_data = self.build_child_data(&mut offsets)?; - // TODO is there need to use new_dict? + // TODO(tanruixiang): is there need to use new_dict? let field = Arc::new(Field::new( LIST_ITEM_NAME, self.datum_kind.to_arrow_data_type(), diff --git a/common_types/src/column.rs b/common_types/src/column.rs index f10b1aa35a..b11bdbfd3e 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -476,10 +476,6 @@ impl StringDictionaryColumn { DictionaryArray::::from(array_data) } - #[doc = " Returns a zero-copy slice of this array with the indicated offset and"] - #[doc = " length."] - #[doc = ""] - #[doc = " Panics if offset with length is greater than column length."] fn slice(&self, offset: usize, length: usize) -> Self { let array_slice = self.0.slice(offset, length); let array_data = array_slice.into_data(); diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 606d04eaf5..1691ab8f90 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -336,7 +336,7 @@ impl From<&ColumnSchema> for Field { col_schema.is_nullable, col_schema.id.into(), false, - // TODO(tanruixiang) how to use dict_is_ordered + // TODO(tanruixiang): how to use dict_is_ordered ) } else { Field::new( diff --git a/proxy/src/http/prom.rs b/proxy/src/http/prom.rs index 87fd26e76b..d82e669847 100644 --- a/proxy/src/http/prom.rs +++ b/proxy/src/http/prom.rs @@ -242,7 +242,7 @@ impl Converter { let value_idx = schema.index_of(field_col_name).context(InternalNoCause { msg: "Value column is missing in query response", })?; - // Todo is there need add is_dictionary check? + // TODO(tanruixiang): is there need add is_dictionary check? 
let tags = schema .columns() .iter() diff --git a/proxy/src/influxdb/types.rs b/proxy/src/influxdb/types.rs index 85681d95c5..2b311cc9f1 100644 --- a/proxy/src/influxdb/types.rs +++ b/proxy/src/influxdb/types.rs @@ -811,7 +811,7 @@ mod tests { } fn build_test_column_blocks() -> Vec { - // TODO missing is_dictionary paramms + // TODO(tanruixiang): missing is_dictionary paramms let mut measurement_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); diff --git a/proxy/src/write.rs b/proxy/src/write.rs index 0cae62fbb7..43b39eb6c1 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -684,7 +684,7 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; } @@ -701,7 +701,7 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? build_column(&mut columns, schema, field_name, &field.value, false, false)?; } } diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 30f1ad6e74..785b75fa09 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -431,8 +431,8 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb - // ? + // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the + // pb? ensure_data_type_compatible( table, tag_name, @@ -443,8 +443,8 @@ pub fn build_schema_from_write_table_request( )?; } - // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb - // ? + // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the + // pb? let column_schema = build_column_schema(tag_name, data_type, true, false)?; name_column_map.insert(tag_name, column_schema); } @@ -471,7 +471,7 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? ensure_data_type_compatible( table, field_name, @@ -481,7 +481,7 @@ pub fn build_schema_from_write_table_request( column_schema, )?; } - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? let column_schema = build_column_schema(field_name, data_type, false, false)?; name_column_map.insert(field_name, column_schema); } diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 0755552398..8d39a40823 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,7 +244,7 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - // TODO(tanruixiang) : ensure there don't use is_dictionary and the + // TODO(tanruixiang): ensure there don't use is_dictionary and the // datum.clone() is necessary ? 
let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { From 1567c0b2de62901d7da8d7276afc600875f23008 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:04:18 +0800 Subject: [PATCH 12/18] update by review's comment --- common_types/src/column.rs | 28 +++++++++------------------- common_types/src/column_schema.rs | 4 ++-- proxy/src/http/prom.rs | 1 - proxy/src/influxdb/types.rs | 1 - proxy/src/read.rs | 8 ++++++++ proxy/src/write.rs | 9 +++------ query_frontend/src/parser.rs | 1 - query_frontend/src/planner.rs | 26 ++++---------------------- 8 files changed, 26 insertions(+), 52 deletions(-) diff --git a/common_types/src/column.rs b/common_types/src/column.rs index b11bdbfd3e..346d3fd80e 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -463,6 +463,7 @@ impl From> for StringDictionaryColumn { Self(array) } } + impl From<&DictionaryArray> for StringDictionaryColumn { fn from(array_ref: &DictionaryArray) -> Self { let array_data = array_ref.into_data(); @@ -470,6 +471,7 @@ impl From<&DictionaryArray> for StringDictionaryColumn { Self(array) } } + impl StringDictionaryColumn { fn to_arrow_array(&self) -> DictionaryArray { let array_data = self.0.clone().into_data(); @@ -533,6 +535,7 @@ impl StringColumn { } } +/// dictionary encode type is difference from other types impl StringDictionaryColumn { /// Create a column that all values are null. fn new_null(num_rows: usize) -> Self { @@ -768,26 +771,13 @@ macro_rules! define_column_block { let column = match datum_kind { DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(array.len())), DatumKind::String => { - if !is_dictionary { - let mills_array; - let cast_column = match array.data_type() { - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - mills_array = cast_nanosecond_to_mills(array)?; - cast_array(datum_kind, &mills_array)? - } - _ => cast_array(datum_kind, array)?, - }; - ColumnBlock::String(StringColumn::from(cast_column)) - } else { - let mills_array; - let cast_column = match array.data_type() { - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - mills_array = cast_nanosecond_to_mills(array)?; - cast_array(datum_kind, &mills_array)? 
- } - _ => cast_array(datum_kind, array)?, - }; + if is_dictionary { + let cast_column = cast_array(datum_kind, array)?; ColumnBlock::StringDictionary(StringDictionaryColumn::from(cast_column)) + + } else { + let cast_column = cast_array(datum_kind, array)?; + ColumnBlock::String(StringColumn::from(cast_column)) } }, $( diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 1691ab8f90..0678336f01 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -168,7 +168,7 @@ pub struct ColumnSchema { /// Is tag, tag is just a hint for a column, there is no restriction that a /// tag column must be a part of primary key pub is_tag: bool, - // Whether to use dictionary types for parquet store + // Whether to use dictionary types for encoding column pub is_dictionary: bool, /// Comment of the column pub comment: String, @@ -335,8 +335,8 @@ impl From<&ColumnSchema> for Field { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), col_schema.is_nullable, col_schema.id.into(), - false, // TODO(tanruixiang): how to use dict_is_ordered + false, ) } else { Field::new( diff --git a/proxy/src/http/prom.rs b/proxy/src/http/prom.rs index d82e669847..dab16707ab 100644 --- a/proxy/src/http/prom.rs +++ b/proxy/src/http/prom.rs @@ -242,7 +242,6 @@ impl Converter { let value_idx = schema.index_of(field_col_name).context(InternalNoCause { msg: "Value column is missing in query response", })?; - // TODO(tanruixiang): is there need add is_dictionary check? let tags = schema .columns() .iter() diff --git a/proxy/src/influxdb/types.rs b/proxy/src/influxdb/types.rs index 2b311cc9f1..d090842b07 100644 --- a/proxy/src/influxdb/types.rs +++ b/proxy/src/influxdb/types.rs @@ -811,7 +811,6 @@ mod tests { } fn build_test_column_blocks() -> Vec { - // TODO(tanruixiang): missing is_dictionary paramms let mut measurement_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); diff --git a/proxy/src/read.rs b/proxy/src/read.rs index a47b9454be..0f70b99869 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -9,6 +9,7 @@ use ceresdbproto::storage::{ }; use common_types::request_id::RequestId; use common_util::{error::BoxError, time::InstantExt}; +use datafusion::common::tree_node::TreeNode; use futures::FutureExt; use http::StatusCode; use interpreters::interpreter::Output; @@ -17,6 +18,7 @@ use query_engine::executor::Executor as QueryExecutor; use query_frontend::{ frontend, frontend::{Context as SqlContext, Frontend}, + plan::Plan, provider::CatalogMetaProvider, }; use router::endpoint::Endpoint; @@ -122,6 +124,7 @@ impl Proxy { // Create logical plan // Note: Remember to store sql in error when creating logical plan + println!("stmts: {:?}", stmts); let plan = frontend // TODO(yingwen): Check error, some error may indicate that the sql is invalid. 
Now we // return internal server error in those cases @@ -132,6 +135,11 @@ impl Proxy { msg: format!("Failed to create plan, query:{sql}"), })?; + if let Plan::Query(tmp) = &plan { + println!("{:?}", stmts); + println!("{:?}", tmp.df_plan.all_out_ref_exprs()); + }; + let output = if ctx.enable_partition_table_access { self.execute_plan_involving_partition_table(request_id, catalog, schema, plan, deadline) .await diff --git a/proxy/src/write.rs b/proxy/src/write.rs index 43b39eb6c1..fc2f88d4d9 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -684,8 +684,7 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - // TODO(tanruixiang): is_dictionary set true or false ? - build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; + build_column(&mut columns, schema, tag_name, &tag.value, true)?; } // Parse fields. @@ -701,8 +700,7 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - // TODO(tanruixiang): is_dictionary set true or false ? - build_column(&mut columns, schema, field_name, &field.value, false, false)?; + build_column(&mut columns, schema, field_name, &field.value, false)?; } } } @@ -716,7 +714,6 @@ fn build_column<'a>( name: &'a str, value: &Option, is_tag: bool, - is_dictionary: bool, ) -> Result<()> { // Skip adding columns, the following cases: // 1. Column already exists. @@ -742,7 +739,7 @@ fn build_column<'a>( msg: "Failed to get data type", })?; - let column_schema = build_column_schema(name, data_type, is_tag, is_dictionary) + let column_schema = build_column_schema(name, data_type, is_tag) .box_err() .context(Internal { msg: "Failed to build column schema", diff --git a/query_frontend/src/parser.rs b/query_frontend/src/parser.rs index 813344a888..8304416b32 100644 --- a/query_frontend/src/parser.rs +++ b/query_frontend/src/parser.rs @@ -532,7 +532,6 @@ impl<'a> Parser<'a> { Token::make_keyword(TAG), ]))) } else if self.consume_token(DICTIONARY) { - // Support DICTIONARY for ceresdb Ok(Some(ColumnOption::DialectSpecific(vec![ Token::make_keyword(DICTIONARY), ]))) diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 785b75fa09..de8327da70 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -367,12 +367,10 @@ pub fn build_column_schema( column_name: &str, data_type: DatumKind, is_tag: bool, - is_dictionary: bool, ) -> Result { let builder = column_schema::Builder::new(column_name.to_string(), data_type) .is_nullable(true) - .is_tag(is_tag) - .is_dictionary(is_dictionary); + .is_tag(is_tag); builder.build().with_context(|| InvalidColumnSchema { column_name: column_name.to_string(), @@ -431,21 +429,10 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the - // pb? - ensure_data_type_compatible( - table, - tag_name, - true, - false, - data_type, - column_schema, - )?; + ensure_data_type_compatible(table, tag_name, true, data_type, column_schema)?; } - // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the - // pb? 
- let column_schema = build_column_schema(tag_name, data_type, true, false)?; + let column_schema = build_column_schema(tag_name, data_type, true)?; name_column_map.insert(tag_name, column_schema); } @@ -471,18 +458,15 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { - // TODO(tanruixiang): is_dictionary set true or false ? ensure_data_type_compatible( table, field_name, false, - false, data_type, column_schema, )?; } - // TODO(tanruixiang): is_dictionary set true or false ? - let column_schema = build_column_schema(field_name, data_type, false, false)?; + let column_schema = build_column_schema(field_name, data_type, false)?; name_column_map.insert(field_name, column_schema); } } @@ -528,11 +512,9 @@ fn ensure_data_type_compatible( table_name: &str, column_name: &str, is_tag: bool, - _is_dictionary: bool, data_type: DatumKind, column_schema: &ColumnSchema, ) -> Result<()> { - // TODO(tanruixiang) : how to check is_dictionary ? ensure!( column_schema.is_tag == is_tag, InvalidWriteEntry { From df47927dc267ce13b4b861e728db7c71681f6097 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:12:55 +0800 Subject: [PATCH 13/18] remove debug message --- proxy/src/read.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 0f70b99869..bcb9f653a0 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -124,7 +124,6 @@ impl Proxy { // Create logical plan // Note: Remember to store sql in error when creating logical plan - println!("stmts: {:?}", stmts); let plan = frontend // TODO(yingwen): Check error, some error may indicate that the sql is invalid. Now we // return internal server error in those cases @@ -135,11 +134,6 @@ impl Proxy { msg: format!("Failed to create plan, query:{sql}"), })?; - if let Plan::Query(tmp) = &plan { - println!("{:?}", stmts); - println!("{:?}", tmp.df_plan.all_out_ref_exprs()); - }; - let output = if ctx.enable_partition_table_access { self.execute_plan_involving_partition_table(request_id, catalog, schema, plan, deadline) .await From db350e323d9d07dec8473a2868d06e81afe8bafc Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:16:11 +0800 Subject: [PATCH 14/18] clippy --- proxy/src/read.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/proxy/src/read.rs b/proxy/src/read.rs index bcb9f653a0..86a06b1ee0 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -9,7 +9,7 @@ use ceresdbproto::storage::{ }; use common_types::request_id::RequestId; use common_util::{error::BoxError, time::InstantExt}; -use datafusion::common::tree_node::TreeNode; + use futures::FutureExt; use http::StatusCode; use interpreters::interpreter::Output; @@ -18,7 +18,6 @@ use query_engine::executor::Executor as QueryExecutor; use query_frontend::{ frontend, frontend::{Context as SqlContext, Frontend}, - plan::Plan, provider::CatalogMetaProvider, }; use router::endpoint::Endpoint; From bf81853fc2d269cf89dd879535a3aaf988808ce5 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:21:28 +0800 Subject: [PATCH 15/18] fmt --- proxy/src/read.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 86a06b1ee0..a47b9454be 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -9,7 +9,6 @@ use ceresdbproto::storage::{ }; use common_types::request_id::RequestId; use common_util::{error::BoxError, time::InstantExt}; - use 
futures::FutureExt; use http::StatusCode; use interpreters::interpreter::Output; From 51a7ddf83b11a205825c76274d8abddeb761c82f Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 5 Jul 2023 11:33:28 +0800 Subject: [PATCH 16/18] fix --- common_types/src/column.rs | 9 +++++---- common_types/src/tests.rs | 1 + table_engine/src/memory.rs | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/common_types/src/column.rs b/common_types/src/column.rs index 346d3fd80e..d1cf85fefc 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -143,6 +143,8 @@ pub struct VarbinaryColumn(BinaryArray); #[derive(Debug)] pub struct StringColumn(StringArray); +/// dictionary encode type is difference from other types, need implement +/// without macro #[derive(Debug)] pub struct StringDictionaryColumn(DictionaryArray); @@ -535,7 +537,6 @@ impl StringColumn { } } -/// dictionary encode type is difference from other types impl StringDictionaryColumn { /// Create a column that all values are null. fn new_null(num_rows: usize) -> Self { @@ -968,10 +969,10 @@ macro_rules! define_column_block_builder { // The data_capacity is set as 1024, because the item is variable-size type. DatumKind::Varbinary => Self::Varbinary(BinaryBuilder::with_capacity(item_capacity, 1024)), DatumKind::String =>{ - if !is_dictionary{ - Self::String(StringBuilder::with_capacity(item_capacity, 1024)) - }else { + if is_dictionary { Self::Dictionary(StringDictionaryBuilder::::new()) + }else { + Self::String(StringBuilder::with_capacity(item_capacity, 1024)) } } DatumKind::Date => Self::Date(DateBuilder::with_capacity(item_capacity)), diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index b43269427f..9cfb86a0fb 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -242,6 +242,7 @@ pub fn build_row_for_dictionary( .unwrap_or(Datum::Null), Datum::String(StringBytes::from(tag2)), ]; + Row::from_datums(datums) } pub fn build_projected_schema() -> ProjectedSchema { diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 8d39a40823..7e390ee221 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -229,7 +229,7 @@ fn row_group_to_record_batch( ), })?; let cols = rows.iter_column(col_index); - let column_block = build_column_block(&column.data_type, cols)?; + let column_block = build_column_block(&column.data_type, cols, column.is_dictionary)?; column_blocks.push(column_block); } @@ -243,10 +243,12 @@ fn row_group_to_record_batch( fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, + is_dictionary: bool, ) -> stream::Result { // TODO(tanruixiang): ensure there don't use is_dictionary and the // datum.clone() is necessary ? 
- let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); + let mut builder = + ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, is_dictionary); for datum in iter { builder .append(datum.clone()) From 3f36476c541ac77a95ce9962f6becfd02f4397f8 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 5 Jul 2023 14:33:39 +0800 Subject: [PATCH 17/18] fmt --- query_frontend/src/parser.rs | 4 ++-- table_engine/src/memory.rs | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/query_frontend/src/parser.rs b/query_frontend/src/parser.rs index 8304416b32..77f4c35056 100644 --- a/query_frontend/src/parser.rs +++ b/query_frontend/src/parser.rs @@ -329,9 +329,9 @@ impl<'a> Parser<'a> { let options = self.parser.parse_options(Keyword::WITH)?; // Only String Column Can Be Dictionary Encoded - for c in columns.iter() { + for c in &columns { let mut is_dictionary = false; - for op in c.options.iter() { + for op in &c.options { if is_dictionary_column(&op.option) { is_dictionary = true; } diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 7e390ee221..80d092609c 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -245,8 +245,6 @@ fn build_column_block<'a, I: Iterator>( iter: I, is_dictionary: bool, ) -> stream::Result { - // TODO(tanruixiang): ensure there don't use is_dictionary and the - // datum.clone() is necessary ? let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, is_dictionary); for datum in iter { From 0523f446d55a3ca9b9058b498e5d8955698b1499 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 5 Jul 2023 14:53:16 +0800 Subject: [PATCH 18/18] use assert --- analytic_engine/src/sst/parquet/encoding.rs | 23 ++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs index 1a92338dd4..47d3b8c2dd 100644 --- a/analytic_engine/src/sst/parquet/encoding.rs +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -527,13 +527,22 @@ impl HybridRecordDecoder { .map(|f| { if let DataType::List(nested_field) = f.data_type() { match f.data_type() { - DataType::Dictionary(_, _) => Arc::new(Field::new_dict( - f.name(), - nested_field.data_type().clone(), - true, - f.dict_id().unwrap(), - f.dict_is_ordered().unwrap(), - )), + DataType::Dictionary(_, _) => { + assert!(f.dict_id().is_some(), "Dictionary must have dict_id"); + assert!( + f.dict_is_ordered().is_some(), + "Dictionary must have dict_is_ordered" + ); + let dict_id = f.dict_id().unwrap(); + let dict_is_ordered = f.dict_is_ordered().unwrap(); + Arc::new(Field::new_dict( + f.name(), + nested_field.data_type().clone(), + true, + dict_id, + dict_is_ordered, + )) + } _ => Arc::new(Field::new(f.name(), nested_field.data_type().clone(), true)), } } else {