From b131d7d4dda0b9bf28c54fcf0626bb2059534559 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 13 Jun 2023 19:45:23 +0800 Subject: [PATCH 01/18] feat: use dictionary type to store column --- .github/workflows/ci.yml | 42 ++ Cargo.lock | 314 ++++----- Cargo.toml | 14 +- Dockerfile | 2 +- analytic_engine/src/compaction/mod.rs | 108 ++- analytic_engine/src/compaction/picker.rs | 66 +- analytic_engine/src/compaction/scheduler.rs | 23 +- analytic_engine/src/instance/engine.rs | 16 +- .../src/instance/flush_compaction.rs | 155 +++-- analytic_engine/src/instance/mod.rs | 4 +- analytic_engine/src/instance/open.rs | 315 +++------ .../src/instance/serial_executor.rs | 8 +- analytic_engine/src/instance/wal_replayer.rs | 618 ++++++++++++++++++ analytic_engine/src/lib.rs | 13 + analytic_engine/src/manifest/details.rs | 32 +- analytic_engine/src/memtable/mod.rs | 15 +- .../src/memtable/skiplist/factory.rs | 3 +- analytic_engine/src/memtable/skiplist/mod.rs | 44 +- analytic_engine/src/sampler.rs | 4 +- analytic_engine/src/setup.rs | 48 ++ analytic_engine/src/sst/meta_data/cache.rs | 182 +++++- .../src/sst/parquet/async_reader.rs | 116 +--- analytic_engine/src/sst/parquet/encoding.rs | 15 +- analytic_engine/src/sst/parquet/hybrid.rs | 2 + analytic_engine/src/sst/parquet/writer.rs | 161 ++++- analytic_engine/src/sst/reader.rs | 42 +- analytic_engine/src/table/data.rs | 21 +- analytic_engine/src/table/version.rs | 95 +-- analytic_engine/src/tests/alter_test.rs | 29 +- analytic_engine/src/tests/drop_test.rs | 39 +- analytic_engine/src/tests/read_write_test.rs | 128 ++-- analytic_engine/src/tests/util.rs | 145 +++- common_types/src/column.rs | 331 +++++++++- common_types/src/column_schema.rs | 86 ++- common_types/src/datum.rs | 132 +++- common_types/src/hex.rs | 64 ++ common_types/src/lib.rs | 3 +- common_types/src/record_batch.rs | 41 +- common_types/src/row/mod.rs | 4 + common_types/src/schema.rs | 2 +- common_types/src/tests.rs | 64 +- components/message_queue/Cargo.toml | 4 +- components/message_queue/src/kafka/config.rs | 21 +- .../message_queue/src/kafka/kafka_impl.rs | 10 +- components/parquet_ext/Cargo.toml | 2 + components/parquet_ext/src/lib.rs | 1 + components/parquet_ext/src/meta_data.rs | 25 +- components/parquet_ext/src/reader.rs | 81 +++ components/profile/Cargo.toml | 1 + components/profile/src/lib.rs | 65 +- df_operator/src/udfs/time_bucket.rs | 3 +- integration_tests/Makefile | 3 + .../config/shard-based-recovery.toml | 21 + integration_tests/recovery/check.py | 84 +++ integration_tests/recovery/run.sh | 35 + interpreters/src/describe.rs | 4 + interpreters/src/insert.rs | 6 +- interpreters/src/show_create.rs | 5 + interpreters/src/tests.rs | 21 +- proxy/src/forward.rs | 29 +- proxy/src/grpc/metrics.rs | 9 + proxy/src/grpc/prom_query.rs | 55 +- proxy/src/grpc/sql_query.rs | 39 +- proxy/src/grpc/write.rs | 9 +- proxy/src/http/prom.rs | 2 + proxy/src/http/sql.rs | 1 + proxy/src/influxdb/mod.rs | 1 + proxy/src/influxdb/types.rs | 12 +- proxy/src/lib.rs | 4 + proxy/src/read.rs | 7 +- proxy/src/write.rs | 26 +- query_frontend/src/frontend.rs | 7 + query_frontend/src/parser.rs | 69 ++ query_frontend/src/planner.rs | 186 +++++- rust-toolchain | 1 - rust-toolchain.toml | 3 + server/Cargo.toml | 1 - server/src/grpc/metrics.rs | 8 + server/src/grpc/remote_engine_service/mod.rs | 47 +- server/src/grpc/storage_service/mod.rs | 48 +- server/src/http.rs | 72 +- server/src/mysql/writer.rs | 5 + table_engine/src/memory.rs | 4 +- table_engine/src/table.rs | 1 + tools/src/bin/sst-metadata.rs | 87 ++- 
wal/src/message_queue_impl/log_cleaner.rs | 6 +- wal/src/message_queue_impl/region.rs | 6 +- wal/src/rocks_impl/config.rs | 21 + wal/src/rocks_impl/manager.rs | 90 ++- 89 files changed, 3795 insertions(+), 999 deletions(-) create mode 100644 analytic_engine/src/instance/wal_replayer.rs create mode 100644 common_types/src/hex.rs create mode 100644 components/parquet_ext/src/reader.rs create mode 100644 integration_tests/config/shard-based-recovery.toml create mode 100644 integration_tests/recovery/check.py create mode 100755 integration_tests/recovery/run.sh delete mode 100644 rust-toolchain create mode 100644 rust-toolchain.toml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a3d825fe1..e682ae5b55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -232,3 +232,45 @@ jobs: name: sdk-test-${{ github.head_ref }}.${{ github.sha }} path: | /tmp/ceresdb-stdout.log + + recovery-test: + name: recovery-test + runs-on: ubuntu-latest + timeout-minutes: 60 + steps: + - uses: actions/checkout@v3 + with: + submodules: true + - name: Cache Rust Dependencies + uses: actions/cache@v3 + with: + path: | + ~/.cargo + ./target + key: debug-${{ runner.os }}-${{ hashFiles('rust-toolchain') }}-${{ hashFiles('Cargo.lock') }} + restore-keys: | + debug-${{ runner.os }}-${{ hashFiles('rust-toolchain') }}- + debug-${{ runner.os }}- + debug- + - run: | + rustup set auto-self-update disable + rustup toolchain install ${RUST_VERSION} --profile minimal + - name: Release Disk Quota + run: | + sudo rm -rf /usr/local/lib/android # release about 10 GB + sudo rm -rf /usr/share/dotnet # release about 20GB + - name: Setup Build Environment + run: | + sudo apt update + sudo apt install --yes protobuf-compiler + - name: Run recovery tests + working-directory: integration_tests + run: | + make run-recovery + - name: Upload Logs + if: always() + uses: actions/upload-artifact@v3 + with: + name: recovery-test-${{ github.head_ref }}.${{ github.sha }} + path: | + /tmp/ceresdb-stdout.log diff --git a/Cargo.lock b/Cargo.lock index 116d2dca10..466fe8b216 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,12 +80,16 @@ version = "1.2.2" dependencies = [ "arc-swap 1.6.0", "arena", - "arrow 38.0.0", + "arrow", "async-stream", "async-trait", "base64 0.13.1", "bytes", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "datafusion", @@ -184,31 +188,6 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" -[[package]] -name = "arrow" -version = "23.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fedc767fbaa36ea50f086215f54f1a007d22046fc4754b0448c657bcbe9f8413" -dependencies = [ - "ahash 0.8.3", - "arrow-buffer 23.0.0", - "bitflags", - "chrono", - "csv", - "flatbuffers 2.1.2", - "half 2.2.1", - "hashbrown 0.12.3", - "indexmap", - "lazy_static", - "lexical-core 0.8.5", - "multiversion", - "num", - "regex", - "regex-syntax 0.6.29", - "serde", - "serde_json", -] - [[package]] name = "arrow" version = "38.0.0" @@ -218,7 +197,7 @@ dependencies = [ "ahash 0.8.3", "arrow-arith", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-csv", "arrow-data", @@ -238,7 +217,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ace6aa3d5617c5d03041a05e01c6819428a8ddf49dd0b055df9b40fef9d96094" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "chrono", @@ -253,7 +232,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "104a04520692cc674e6afd7682f213ca41f9b13ff1873f63a5a2857a590b87b3" dependencies = [ "ahash 0.8.3", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "chrono", @@ -263,16 +242,6 @@ dependencies = [ "num", ] -[[package]] -name = "arrow-buffer" -version = "23.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d290050c6e12a81a24ad08525cef2203c4156a6350f75508d49885d677e88ea9" -dependencies = [ - "half 2.2.1", - "num", -] - [[package]] name = "arrow-buffer" version = "38.0.0" @@ -290,7 +259,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6d6e18281636c8fc0b93be59834da6bf9a72bb70fd0c98ddfdaf124da466c28" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", @@ -307,7 +276,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3197dab0963a236ff8e7c82e2272535745955ac1321eb740c29f2f88b353f54e" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", @@ -325,7 +294,7 @@ version = "38.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb68113d6ecdbe8bba48b2c4042c151bf9e1c61244e45072a50250a6fc59bafe" dependencies = [ - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-schema", "half 2.2.1", "num", @@ -338,11 +307,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eab4bbf2dd3078facb5ce0a9641316a64f42bfd8cf357e6775c8a5e6708e3a8d" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", - "flatbuffers 23.1.21", + "flatbuffers", ] [[package]] @@ -352,7 +321,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c5b650d23746a494665d914a7fa3d21d939153cff9d53bdebe39bffa88f263" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-schema", @@ -372,7 +341,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68c6fce28e5011e30acc7466b5efcb8ed0197c396240bd2b10e167f275a3c208" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", @@ -388,7 +357,7 @@ checksum = "f20a421f19799d8b93eb8edde5217e910fa1e2d6ceb3c529f000e57b6db144c0" dependencies = [ "ahash 0.8.3", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "half 2.2.1", @@ -408,7 +377,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ab6613ce65b61d85a3410241744e84e48fbab0fe06e1251b4429d21b3470fd" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "num", @@ -421,7 +390,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3008641239e884aefba66d8b8532da6af40d14296349fcc85935de4ba67b89e" dependencies = [ "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", @@ -433,7 +402,7 @@ dependencies = [ name = "arrow_ext" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "serde", "snafu 0.6.10", "zstd 0.12.3+zstd.1.5.2", @@ -442,10 +411,10 @@ 
dependencies = [ [[package]] name = "arrow_util" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "chrono", "comfy-table", "hashbrown 0.13.2", @@ -640,7 +609,7 @@ version = "1.2.2" dependencies = [ "analytic_engine", "arena", - "arrow 38.0.0", + "arrow", "base64 0.13.1", "clap 3.2.23", "common_types", @@ -1087,13 +1056,17 @@ dependencies = [ [[package]] name = "ceresdb-client" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a2c1699cb154e97cfccd3d6a0679f561c6214a33d86b3eacb78685c7479d022" +checksum = "f5f27e14a7a0c030015c0fdb06c59c46cd6f9765e381bd920e02ff316b3be48b" dependencies = [ - "arrow 23.0.0", + "arrow", "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.5", +======= + "ceresdbproto 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", +>>>>>>> 0abc9181 (update pb) "dashmap 5.4.0", "futures 0.3.28", "paste 1.0.12", @@ -1120,8 +1093,24 @@ dependencies = [ [[package]] name = "ceresdbproto" version = "1.0.4" +source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=53f5c74a54d8a08ebb08c41e8b862b2369df4a02#53f5c74a54d8a08ebb08c41e8b862b2369df4a02" +dependencies = [ + "prost", + "protoc-bin-vendored", + "tonic 0.8.3", + "tonic-build", + "walkdir", +] + +[[package]] +name = "ceresdbproto" +version = "1.0.5" +<<<<<<< HEAD source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18d5d1c238f84dee01e671603c6a921868f3663256e0393c64ece88e58ee4869" +checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" +======= +source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" +>>>>>>> 0abc9181 (update pb) dependencies = [ "prost", "protoc-bin-vendored", @@ -1274,7 +1263,11 @@ name = "cluster" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "etcd-client", @@ -1323,11 +1316,15 @@ name = "common_types" version = "1.2.2" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "arrow_ext", "byteorder", "bytes_ext", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "datafusion", "murmur3", @@ -1344,9 +1341,13 @@ dependencies = [ name = "common_util" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "backtrace", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "common_types", "crossbeam-utils 0.8.15", @@ -1794,10 +1795,10 @@ dependencies = [ [[package]] name = "datafusion" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = 
"git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "arrow-array", "arrow-schema", "async-compression", @@ -1843,9 +1844,9 @@ dependencies = [ [[package]] name = "datafusion-common" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow-array", "chrono", "num_cpus", @@ -1857,7 +1858,7 @@ dependencies = [ [[package]] name = "datafusion-execution" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "dashmap 5.4.0", "datafusion-common", @@ -1874,10 +1875,10 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "datafusion-common", "sqlparser", ] @@ -1885,9 +1886,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "chrono", "datafusion-common", @@ -1902,12 +1903,12 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ "ahash 0.8.3", - "arrow 38.0.0", + "arrow", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-schema", "blake2", "blake3", @@ -1934,9 +1935,9 @@ dependencies = [ [[package]] name = "datafusion-proto" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "chrono", "datafusion", "datafusion-common", @@ -1948,9 +1949,9 @@ dependencies = [ [[package]] name = "datafusion-row" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = 
"git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "datafusion-common", "paste 1.0.12", "rand 0.8.5", @@ -1959,9 +1960,9 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "23.0.0" -source = "git+https://github.com/jiacai2050/arrow-datafusion.git?rev=13314c37020b90246db9b80f8294370c06e61018#13314c37020b90246db9b80f8294370c06e61018" +source = "git+https://github.com/ceresdb/arrow-datafusion.git?rev=acb5d97a8a8de5296989740f97db3773fe3aa45a#acb5d97a8a8de5296989740f97db3773fe3aa45a" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow-schema", "datafusion-common", "datafusion-expr", @@ -1972,7 +1973,7 @@ dependencies = [ [[package]] name = "datafusion_util" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "async-trait", "datafusion", @@ -2027,7 +2028,7 @@ dependencies = [ name = "df_operator" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "base64 0.13.1", "bincode", "chrono", @@ -2239,17 +2240,6 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" -[[package]] -name = "flatbuffers" -version = "2.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b428b715fdbdd1c364b84573b5fdc0f84f8e423661b9f398735278bc7f2b6a" -dependencies = [ - "bitflags", - "smallvec", - "thiserror", -] - [[package]] name = "flatbuffers" version = "23.1.21" @@ -2507,7 +2497,7 @@ checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" [[package]] name = "generated_types" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "pbjson", "pbjson-build", @@ -2900,7 +2890,7 @@ dependencies = [ [[package]] name = "influxdb_influxql_parser" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "chrono", "chrono-tz", @@ -2929,7 +2919,7 @@ name = "interpreters" version = "1.2.2" dependencies = [ "analytic_engine", - "arrow 38.0.0", + "arrow", "async-trait", "catalog", "catalog_impls", @@ -2962,9 +2952,9 @@ dependencies = [ [[package]] name = "iox_query" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_util", "async-trait", "chrono", @@ -2986,9 +2976,9 @@ dependencies = [ [[package]] name = "iox_query_influxql" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - 
"arrow 38.0.0", + "arrow", "chrono", "chrono-tz", "datafusion", @@ -3470,7 +3460,11 @@ name = "meta_client" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -3579,26 +3573,6 @@ dependencies = [ "twoway", ] -[[package]] -name = "multiversion" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" -dependencies = [ - "multiversion-macros", -] - -[[package]] -name = "multiversion-macros" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "murmur2" version = "0.1.0" @@ -3979,7 +3953,11 @@ version = "1.2.2" dependencies = [ "async-trait", "bytes", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "clru", "common_types", @@ -4043,7 +4021,7 @@ dependencies = [ [[package]] name = "observability_deps" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "tracing", ] @@ -4166,7 +4144,7 @@ checksum = "4cbd51311f8d9ff3d2697b1522b18a588782e097d313a1a278b0faf2ccf2d3f6" dependencies = [ "ahash 0.8.3", "arrow-array", - "arrow-buffer 38.0.0", + "arrow-buffer", "arrow-cast", "arrow-data", "arrow-ipc", @@ -4196,13 +4174,15 @@ dependencies = [ name = "parquet_ext" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", "bytes", "common_util", "datafusion", + "futures 0.3.28", "log", + "object_store 1.2.2", "parquet", "tokio", ] @@ -4574,6 +4554,7 @@ dependencies = [ "jemalloc-sys", "jemallocator", "log", + "pprof 0.11.1", ] [[package]] @@ -4769,12 +4750,16 @@ checksum = "9653c3ed92974e34c5a6e0a510864dab979760481714c172e0a34e437cb98804" name = "proxy" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", "bytes", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -4863,7 +4848,7 @@ dependencies = [ name = "query_engine" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "chrono", "common_types", @@ -4884,10 +4869,14 @@ dependencies = [ name = "query_frontend" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -4914,9 +4903,9 @@ dependencies = [ [[package]] name = "query_functions" version = "0.1.0" -source = 
"git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - "arrow 38.0.0", + "arrow", "chrono", "datafusion", "itertools", @@ -5195,7 +5184,11 @@ version = "1.2.2" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -5321,7 +5314,11 @@ name = "router" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -5337,8 +5334,8 @@ dependencies = [ [[package]] name = "rskafka" -version = "0.3.0" -source = "git+https://github.com/influxdata/rskafka.git?rev=00988a564b1db0249d858065fc110476c075efad#00988a564b1db0249d858065fc110476c075efad" +version = "0.4.0" +source = "git+https://github.com/Rachelint/rskafka.git?rev=f0fd8e278d8164cb0cfca5a80476361fc308ecc3#f0fd8e278d8164cb0cfca5a80476361fc308ecc3" dependencies = [ "async-trait", "bytes", @@ -5521,9 +5518,9 @@ dependencies = [ [[package]] name = "schema" version = "0.1.0" -source = "git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ - "arrow 38.0.0", + "arrow", "hashbrown 0.13.2", "indexmap", "itertools", @@ -5671,12 +5668,16 @@ name = "server" version = "1.2.2" dependencies = [ "analytic_engine", - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", "bytes", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -5694,7 +5695,6 @@ dependencies = [ "opensrv-mysql", "partition_table_engine", "paste 1.0.12", - "pprof 0.11.1", "profile", "prom-remote-api", "prometheus 0.12.0", @@ -6213,10 +6213,14 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" name = "system_catalog" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "async-trait", "catalog", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -6232,10 +6236,14 @@ dependencies = [ name = "table_engine" version = "1.2.2" dependencies = [ - "arrow 38.0.0", + "arrow", "arrow_ext", "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "common_types", "common_util", "datafusion", @@ -6319,7 +6327,7 @@ dependencies = [ [[package]] name = "test_helpers" version = "0.1.0" -source = 
"git+https://github.com/CeresDB/influxql?rev=efbc589#efbc589fb4e884e4ce057f066e63183f02a99c51" +source = "git+https://github.com/CeresDB/influxql?rev=935e037a5ad6eb142a93f3e9eb321ee72e28cbad#935e037a5ad6eb142a93f3e9eb321ee72e28cbad" dependencies = [ "dotenvy", "observability_deps", @@ -6892,8 +6900,8 @@ version = "1.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ - "cfg-if 1.0.0", - "rand 0.8.5", + "cfg-if 0.1.10", + "rand 0.3.23", "static_assertions 1.1.0", ] @@ -7049,7 +7057,11 @@ name = "wal" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto", +<<<<<<< HEAD + "ceresdbproto 1.0.4", +======= + "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", +>>>>>>> 0abc9181 (update pb) "chrono", "common_types", "common_util", diff --git a/Cargo.toml b/Cargo.toml index 2f2036df2f..b3e3bcb0ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,7 @@ bytes = "1.1.0" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = "1.0.4" +ceresdbproto = { git = "https://github.com/tanruixiang/ceresdbproto.git", rev = "6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" } chrono = "0.4" clap = "3.0" clru = "0.6.1" @@ -76,8 +76,8 @@ cluster = { path = "cluster" } criterion = "0.3" common_types = { path = "common_types" } common_util = { path = "common_util" } -datafusion = { git = "https://github.com/jiacai2050/arrow-datafusion.git", rev = "13314c37020b90246db9b80f8294370c06e61018" } -datafusion-proto = { git = "https://github.com/jiacai2050/arrow-datafusion.git", rev = "13314c37020b90246db9b80f8294370c06e61018" } +datafusion = { git = "https://github.com/ceresdb/arrow-datafusion.git", rev = "acb5d97a8a8de5296989740f97db3773fe3aa45a" } +datafusion-proto = { git = "https://github.com/ceresdb/arrow-datafusion.git", rev = "acb5d97a8a8de5296989740f97db3773fe3aa45a" } df_operator = { path = "df_operator" } etcd-client = "0.10.3" env_logger = "0.6" @@ -87,10 +87,10 @@ lazy_static = "1.4.0" log = "0.4" logger = { path = "components/logger" } lru = "0.7.6" -influxql-logical-planner = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "iox_query_influxql" } -influxql-parser = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "influxdb_influxql_parser" } -influxql-query = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "iox_query" } -influxql-schema = { git = "https://github.com/CeresDB/influxql", rev = "efbc589", package = "schema" } +influxql-logical-planner = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "iox_query_influxql" } +influxql-parser = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "influxdb_influxql_parser" } +influxql-query = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "iox_query" } +influxql-schema = { git = "https://github.com/CeresDB/influxql", rev = "935e037a5ad6eb142a93f3e9eb321ee72e28cbad", package = "schema" } interpreters = { path = "interpreters" } itertools = "0.10.5" meta_client = { path = "meta_client" } diff --git a/Dockerfile b/Dockerfile index 142f575408..192b877814 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ FROM rust:${RUST_VERSION}-slim-bullseye as build # 
cache mounts below may already exist and owned by root USER root -RUN apt update && apt install --yes gcc g++ libssl-dev pkg-config cmake protobuf-compiler && rm -rf /var/lib/apt/lists/* +RUN apt update && apt install --yes git gcc g++ libssl-dev pkg-config cmake protobuf-compiler && rm -rf /var/lib/apt/lists/* COPY . /ceresdb WORKDIR /ceresdb diff --git a/analytic_engine/src/compaction/mod.rs b/analytic_engine/src/compaction/mod.rs index bcbead4af9..e0485522b1 100644 --- a/analytic_engine/src/compaction/mod.rs +++ b/analytic_engine/src/compaction/mod.rs @@ -318,13 +318,26 @@ pub struct ExpiredFiles { #[derive(Default, Clone)] pub struct CompactionTask { - pub compaction_inputs: Vec, - pub expired: Vec, + inputs: Vec, + expired: Vec, +} + +impl Drop for CompactionTask { + fn drop(&mut self) { + // When a CompactionTask is dropped, it means + // 1. the task finished successfully, or + // 2. the task is cancelled for some reason, like memory limit + // + // In case 2, we need to mark files as not compacted in order for them to be + // scheduled again. In case 1, the files will be moved out of level controller, + // so it doesn't care what the flag is, so it's safe to set false here. + self.mark_files_being_compacted(false); + } } impl CompactionTask { - pub fn mark_files_being_compacted(&self, being_compacted: bool) { - for input in &self.compaction_inputs { + fn mark_files_being_compacted(&self, being_compacted: bool) { + for input in &self.inputs { for file in &input.files { file.set_being_compacted(being_compacted); } @@ -337,9 +350,10 @@ impl CompactionTask { } // Estimate the size of the total input files. + #[inline] pub fn estimated_total_input_file_size(&self) -> usize { let total_input_size: u64 = self - .compaction_inputs + .inputs .iter() .map(|v| v.files.iter().map(|f| f.size()).sum::()) .sum(); @@ -347,19 +361,65 @@ impl CompactionTask { total_input_size as usize } + #[inline] pub fn num_compact_files(&self) -> usize { - self.compaction_inputs.iter().map(|v| v.files.len()).sum() + self.inputs.iter().map(|v| v.files.len()).sum() } - pub fn num_expired_files(&self) -> usize { - self.expired.iter().map(|v| v.files.len()).sum() + #[inline] + pub fn is_empty(&self) -> bool { + self.is_input_empty() && self.expired.is_empty() + } + + #[inline] + pub fn is_input_empty(&self) -> bool { + self.inputs.is_empty() + } + + #[inline] + pub fn expired(&self) -> &[ExpiredFiles] { + &self.expired + } + + #[inline] + pub fn inputs(&self) -> &[CompactionInputFiles] { + &self.inputs + } +} + +pub struct CompactionTaskBuilder { + expired: Vec, + inputs: Vec, +} + +impl CompactionTaskBuilder { + pub fn with_expired(expired: Vec) -> Self { + Self { + expired, + inputs: Vec::new(), + } + } + + pub fn add_inputs(&mut self, files: CompactionInputFiles) { + self.inputs.push(files); + } + + pub fn build(self) -> CompactionTask { + let task = CompactionTask { + expired: self.expired, + inputs: self.inputs, + }; + + task.mark_files_being_compacted(true); + + task } } impl fmt::Debug for CompactionTask { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("CompactionTask") - .field("inputs", &self.compaction_inputs) + .field("inputs", &self.inputs) .field( "expired", &self @@ -380,36 +440,12 @@ impl fmt::Debug for CompactionTask { } } -pub struct PickerManager { - default_picker: CompactionPickerRef, - time_window_picker: CompactionPickerRef, - size_tiered_picker: CompactionPickerRef, -} - -impl Default for PickerManager { - fn default() -> Self { - let size_tiered_picker = 
Arc::new(CommonCompactionPicker::new( - CompactionStrategy::SizeTiered(SizeTieredCompactionOptions::default()), - )); - let time_window_picker = Arc::new(CommonCompactionPicker::new( - CompactionStrategy::TimeWindow(TimeWindowCompactionOptions::default()), - )); - - Self { - default_picker: time_window_picker.clone(), - size_tiered_picker, - time_window_picker, - } - } -} +#[derive(Default)] +pub struct PickerManager; impl PickerManager { pub fn get_picker(&self, strategy: CompactionStrategy) -> CompactionPickerRef { - match strategy { - CompactionStrategy::Default => self.default_picker.clone(), - CompactionStrategy::SizeTiered(_) => self.size_tiered_picker.clone(), - CompactionStrategy::TimeWindow(_) => self.time_window_picker.clone(), - } + Arc::new(CommonCompactionPicker::new(strategy)) } } diff --git a/analytic_engine/src/compaction/picker.rs b/analytic_engine/src/compaction/picker.rs index 96600199f0..e104aca7d2 100644 --- a/analytic_engine/src/compaction/picker.rs +++ b/analytic_engine/src/compaction/picker.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Compaction picker. @@ -15,8 +15,8 @@ use snafu::Snafu; use crate::{ compaction::{ - CompactionInputFiles, CompactionStrategy, CompactionTask, SizeTieredCompactionOptions, - TimeWindowCompactionOptions, + CompactionInputFiles, CompactionStrategy, CompactionTask, CompactionTaskBuilder, + SizeTieredCompactionOptions, TimeWindowCompactionOptions, }, sst::{ file::{FileHandle, Level}, @@ -60,7 +60,7 @@ pub trait CompactionPicker { fn pick_compaction( &self, ctx: PickerContext, - levels_controller: &LevelsController, + levels_controller: &mut LevelsController, ) -> Result; } @@ -86,10 +86,10 @@ pub struct CommonCompactionPicker { impl CommonCompactionPicker { pub fn new(strategy: CompactionStrategy) -> Self { let level_picker: LevelPickerRef = match strategy { - CompactionStrategy::SizeTiered(_) | CompactionStrategy::Default => { - Arc::new(SizeTieredPicker::default()) + CompactionStrategy::SizeTiered(_) => Arc::new(SizeTieredPicker::default()), + CompactionStrategy::TimeWindow(_) | CompactionStrategy::Default => { + Arc::new(TimeWindowPicker::default()) } - CompactionStrategy::TimeWindow(_) => Arc::new(TimeWindowPicker::default()), }; Self { level_picker } } @@ -123,13 +123,11 @@ impl CompactionPicker for CommonCompactionPicker { fn pick_compaction( &self, ctx: PickerContext, - levels_controller: &LevelsController, + levels_controller: &mut LevelsController, ) -> Result { let expire_time = ctx.ttl.map(Timestamp::expire_time); - let mut compaction_task = CompactionTask { - expired: levels_controller.expired_ssts(expire_time), - ..Default::default() - }; + let mut builder = + CompactionTaskBuilder::with_expired(levels_controller.expired_ssts(expire_time)); if let Some(input_files) = self.pick_compact_candidates(&ctx, levels_controller, expire_time) @@ -139,10 +137,10 @@ impl CompactionPicker for CommonCompactionPicker { ctx.strategy, input_files ); - compaction_task.compaction_inputs = vec![input_files]; + builder.add_inputs(input_files); } - Ok(compaction_task) + Ok(builder.build()) } } @@ -734,39 +732,39 @@ mod tests { }; let now = Timestamp::now(); { - let lc = build_old_bucket_case(now.as_i64()); - let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); - assert_eq!(task.compaction_inputs[0].files.len(), 2); - assert_eq!(task.compaction_inputs[0].files[0].id(), 0); - 
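// A minimal standalone sketch, with stand-in enums rather than the engine's
// types, of the picker selection after the PickerManager change above: no
// pickers are cached any more, one is built per call, and the Default
// strategy now resolves to the time-window picker instead of the size-tiered one.
#[allow(dead_code)]
#[derive(Clone, Copy)]
enum CompactionStrategy {
    Default,
    SizeTiered,
    TimeWindow,
}

#[derive(Debug, PartialEq)]
enum PickerKind {
    SizeTiered,
    TimeWindow,
}

fn pick_for(strategy: CompactionStrategy) -> PickerKind {
    // Mirrors CommonCompactionPicker::new: SizeTiered keeps its picker, while
    // TimeWindow and Default both map to the time-window picker.
    match strategy {
        CompactionStrategy::SizeTiered => PickerKind::SizeTiered,
        CompactionStrategy::TimeWindow | CompactionStrategy::Default => PickerKind::TimeWindow,
    }
}

fn main() {
    assert_eq!(pick_for(CompactionStrategy::Default), PickerKind::TimeWindow);
}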
assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + let mut lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &mut lc).unwrap(); + assert_eq!(task.inputs[0].files.len(), 2); + assert_eq!(task.inputs[0].files[0].id(), 0); + assert_eq!(task.inputs[0].files[1].id(), 1); assert_eq!(task.expired[0].files.len(), 1); assert_eq!(task.expired[0].files[0].id(), 3); } { - let lc = build_newest_bucket_case(now.as_i64()); - let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); - assert_eq!(task.compaction_inputs[0].files.len(), 4); - assert_eq!(task.compaction_inputs[0].files[0].id(), 2); - assert_eq!(task.compaction_inputs[0].files[1].id(), 3); - assert_eq!(task.compaction_inputs[0].files[2].id(), 4); - assert_eq!(task.compaction_inputs[0].files[3].id(), 5); + let mut lc = build_newest_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &mut lc).unwrap(); + assert_eq!(task.inputs[0].files.len(), 4); + assert_eq!(task.inputs[0].files[0].id(), 2); + assert_eq!(task.inputs[0].files[1].id(), 3); + assert_eq!(task.inputs[0].files[2].id(), 4); + assert_eq!(task.inputs[0].files[3].id(), 5); } { - let lc = build_newest_bucket_no_match_case(now.as_i64()); - let task = twp.pick_compaction(ctx.clone(), &lc).unwrap(); - assert_eq!(task.compaction_inputs.len(), 0); + let mut lc = build_newest_bucket_no_match_case(now.as_i64()); + let task = twp.pick_compaction(ctx.clone(), &mut lc).unwrap(); + assert_eq!(task.inputs.len(), 0); } // If ttl is None, then no file is expired. ctx.ttl = None; { - let lc = build_old_bucket_case(now.as_i64()); - let task = twp.pick_compaction(ctx, &lc).unwrap(); - assert_eq!(task.compaction_inputs[0].files.len(), 2); - assert_eq!(task.compaction_inputs[0].files[0].id(), 0); - assert_eq!(task.compaction_inputs[0].files[1].id(), 1); + let mut lc = build_old_bucket_case(now.as_i64()); + let task = twp.pick_compaction(ctx, &mut lc).unwrap(); + assert_eq!(task.inputs[0].files.len(), 2); + assert_eq!(task.inputs[0].files[0].id(), 0); + assert_eq!(task.inputs[0].files[1].id(), 1); assert!(task.expired[0].files.is_empty()); } } diff --git a/analytic_engine/src/compaction/scheduler.rs b/analytic_engine/src/compaction/scheduler.rs index 30cf277521..620bfa83ed 100644 --- a/analytic_engine/src/compaction/scheduler.rs +++ b/analytic_engine/src/compaction/scheduler.rs @@ -237,7 +237,7 @@ impl OngoingTaskLimit { if dropped > 0 { warn!( - "Too many compaction pending tasks, limit: {}, dropped {} older tasks.", + "Too many compaction pending tasks, limit:{}, dropped:{}.", self.max_pending_compaction_tasks, dropped, ); } @@ -428,12 +428,11 @@ impl ScheduleWorker { let ongoing = self.limit.ongoing_tasks(); match schedule_task { ScheduleTask::Request(compact_req) => { - debug!("Ongoing compaction tasks:{}", ongoing); + debug!("Ongoing compaction tasks:{ongoing}"); if ongoing >= self.max_ongoing_tasks { self.limit.add_request(compact_req); warn!( - "Too many compaction ongoing tasks:{}, max:{}, buf_len:{}", - ongoing, + "Too many compaction ongoing tasks:{ongoing}, max:{}, buf_len:{}", self.max_ongoing_tasks, self.limit.request_buf_len() ); @@ -448,7 +447,13 @@ impl ScheduleWorker { for compact_req in pending { self.handle_table_compaction_request(compact_req).await; } - debug!("Scheduled {} pending compaction tasks.", len); + debug!("Scheduled {len} pending compaction tasks."); + } else { + warn!( + "Too many compaction ongoing tasks:{ongoing}, max:{}, buf_len:{}", + self.max_ongoing_tasks, + self.limit.request_buf_len() + ); } } 
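// A minimal standalone sketch of the file-claiming pattern introduced in
// compaction/mod.rs above; FileHandle here is just an AtomicBool wrapper and
// the CompactionTaskBuilder is collapsed into an associated fn. build() claims
// the input files, and Drop releases them again, so a task cancelled before it
// runs (for example by the memory limiter, as in the scheduler code below)
// automatically returns its files to the schedulable pool without any manual unmarking.
use std::sync::{
    atomic::{AtomicBool, Ordering},
    Arc,
};

#[derive(Clone, Default)]
struct FileHandle(Arc<AtomicBool>);

impl FileHandle {
    fn set_being_compacted(&self, v: bool) {
        self.0.store(v, Ordering::Relaxed);
    }

    fn being_compacted(&self) -> bool {
        self.0.load(Ordering::Relaxed)
    }
}

struct CompactionTask {
    inputs: Vec<FileHandle>,
}

impl CompactionTask {
    // Building a task marks every input file as being compacted.
    fn build(inputs: Vec<FileHandle>) -> Self {
        for file in &inputs {
            file.set_being_compacted(true);
        }
        Self { inputs }
    }
}

// Dropping a task, whether it finished or was cancelled, clears the flag so
// the files can be picked again by a later schedule round.
impl Drop for CompactionTask {
    fn drop(&mut self) {
        for file in &self.inputs {
            file.set_being_compacted(false);
        }
    }
}

fn main() {
    let file = FileHandle::default();
    {
        let _task = CompactionTask::build(vec![file.clone()]);
        assert!(file.being_compacted());
    } // task dropped here, e.g. cancelled before it ever ran
    assert!(!file.being_compacted());
}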
ScheduleTask::Exit => (), @@ -462,10 +467,7 @@ impl ScheduleWorker { waiter_notifier: WaiterNotifier, token: MemoryUsageToken, ) { - // Mark files being in compaction. - compaction_task.mark_files_being_compacted(true); - - let keep_scheduling_compaction = !compaction_task.compaction_inputs.is_empty(); + let keep_scheduling_compaction = !compaction_task.is_input_empty(); let runtime = self.runtime.clone(); let space_store = self.space_store.clone(); @@ -503,9 +505,6 @@ impl ScheduleWorker { .await; if let Err(e) = &res { - // Compaction is failed, we need to unset the compaction mark. - compaction_task.mark_files_being_compacted(false); - error!( "Failed to compact table, table_name:{}, table_id:{}, request_id:{}, err:{}", table_data.name, table_data.id, request_id, e diff --git a/analytic_engine/src/instance/engine.rs b/analytic_engine/src/instance/engine.rs index 00b6ba8745..e32fa064d5 100644 --- a/analytic_engine/src/instance/engine.rs +++ b/analytic_engine/src/instance/engine.rs @@ -218,6 +218,18 @@ pub enum Error { #[snafu(display("Failed to open shard, msg:{}.\nBacktrace:\n{}", msg, backtrace))] OpenTablesOfShard { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to replay wal, msg:{:?}, err:{}", msg, source))] + ReplayWalWithCause { + msg: Option, + source: GenericError, + }, + + #[snafu(display("Failed to replay wal, msg:{:?}.\nBacktrace:\n{}", msg, backtrace))] + ReplayWalNoCause { + msg: Option, + backtrace: Backtrace, + }, } define_result!(Error); @@ -250,7 +262,9 @@ impl From for table_engine::engine::Error { | Error::DoManifestSnapshot { .. } | Error::OpenManifest { .. } | Error::TableNotExist { .. } - | Error::OpenTablesOfShard { .. } => Self::Unexpected { + | Error::OpenTablesOfShard { .. } + | Error::ReplayWalNoCause { .. } + | Error::ReplayWalWithCause { .. 
} => Self::Unexpected { source: Box::new(err), }, } diff --git a/analytic_engine/src/instance/flush_compaction.rs b/analytic_engine/src/instance/flush_compaction.rs index 0dc4c6208c..0d807349dd 100644 --- a/analytic_engine/src/instance/flush_compaction.rs +++ b/analytic_engine/src/instance/flush_compaction.rs @@ -130,6 +130,18 @@ pub enum Error { #[snafu(display("Other failure, msg:{}.\nBacktrace:\n{:?}", msg, backtrace))] Other { msg: String, backtrace: Backtrace }, + + #[snafu(display("Failed to run flush job, msg:{:?}, err:{}", msg, source))] + FlushJobWithCause { + msg: Option, + source: GenericError, + }, + + #[snafu(display("Failed to run flush job, msg:{:?}.\nBacktrace:\n{}", msg, backtrace))] + FlushJobNoCause { + msg: Option, + backtrace: Backtrace, + }, } define_result!(Error); @@ -163,6 +175,7 @@ pub struct TableFlushRequest { pub max_sequence: SequenceNumber, } +#[derive(Clone)] pub struct Flusher { pub space_store: SpaceStoreRef, @@ -173,8 +186,6 @@ pub struct Flusher { struct FlushTask { space_store: SpaceStoreRef, table_data: TableDataRef, - max_sequence: SequenceNumber, - runtime: RuntimeRef, write_sst_max_buffer_size: usize, } @@ -192,9 +203,7 @@ impl Flusher { table_data, opts ); - let flush_req = self.preprocess_flush(table_data).await?; - - self.schedule_table_flush(flush_scheduler, flush_req, opts, false) + self.schedule_table_flush(flush_scheduler, table_data.clone(), opts, false) .await } @@ -210,74 +219,20 @@ impl Flusher { table_data, opts ); - let flush_req = self.preprocess_flush(table_data).await?; - - self.schedule_table_flush(flush_scheduler, flush_req, opts, true) + self.schedule_table_flush(flush_scheduler, table_data.clone(), opts, true) .await } - async fn preprocess_flush(&self, table_data: &TableDataRef) -> Result { - let current_version = table_data.current_version(); - let last_sequence = table_data.last_sequence(); - // Switch (freeze) all mutable memtables. And update segment duration if - // suggestion is returned. - if let Some(suggest_segment_duration) = - current_version.switch_memtables_or_suggest_duration() - { - info!( - "Update segment duration, table:{}, table_id:{}, segment_duration:{:?}", - table_data.name, table_data.id, suggest_segment_duration - ); - assert!(!suggest_segment_duration.is_zero()); - - let mut new_table_opts = (*table_data.table_options()).clone(); - new_table_opts.segment_duration = Some(ReadableDuration(suggest_segment_duration)); - - let edit_req = { - let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { - space_id: table_data.space_id, - table_id: table_data.id, - options: new_table_opts.clone(), - }); - MetaEditRequest { - shard_info: table_data.shard_info, - meta_edit: MetaEdit::Update(meta_update), - } - }; - self.space_store - .manifest - .apply_edit(edit_req) - .await - .context(StoreVersionEdit)?; - - // Now the segment duration is applied, we can stop sampling and freeze the - // sampling memtable. 
- current_version.freeze_sampling(); - } - - info!("Try to trigger memtable flush of table, table:{}, table_id:{}, max_memtable_id:{}, last_sequence:{}", - table_data.name, table_data.id, table_data.last_memtable_id(), last_sequence); - - // Try to flush all memtables of current table - Ok(TableFlushRequest { - table_data: table_data.clone(), - max_sequence: last_sequence, - }) - } - /// Schedule table flush request to background workers async fn schedule_table_flush( &self, flush_scheduler: &mut TableFlushScheduler, - flush_req: TableFlushRequest, + table_data: TableDataRef, opts: TableFlushOptions, block_on: bool, ) -> Result<()> { - let table_data = flush_req.table_data.clone(); - let flush_task = FlushTask { table_data: table_data.clone(), - max_sequence: flush_req.max_sequence, space_store: self.space_store.clone(), runtime: self.runtime.clone(), write_sst_max_buffer_size: self.write_sst_max_buffer_size, @@ -295,23 +250,29 @@ impl FlushTask { /// should be ensured by the caller. async fn run(&self) -> Result<()> { let instant = Instant::now(); + let flush_req = self.preprocess_flush(&self.table_data).await?; + let current_version = self.table_data.current_version(); - let mems_to_flush = current_version.pick_memtables_to_flush(self.max_sequence); + let mems_to_flush = current_version.pick_memtables_to_flush(flush_req.max_sequence); if mems_to_flush.is_empty() { return Ok(()); } let request_id = RequestId::next_id(); - info!( - "Instance try to flush memtables, table:{}, table_id:{}, request_id:{}, mems_to_flush:{:?}", - self.table_data.name, self.table_data.id, request_id, mems_to_flush - ); // Start flush duration timer. let local_metrics = self.table_data.metrics.local_flush_metrics(); let _timer = local_metrics.start_flush_timer(); - self.dump_memtables(request_id, &mems_to_flush).await?; + self.dump_memtables(request_id, &mems_to_flush) + .await + .box_err() + .context(FlushJobWithCause { + msg: Some(format!( + "table:{}, table_id:{}, request_id:{request_id}", + self.table_data.name, self.table_data.id + )), + })?; self.table_data .set_last_flush_time(time::current_time_millis()); @@ -327,6 +288,57 @@ impl FlushTask { Ok(()) } + async fn preprocess_flush(&self, table_data: &TableDataRef) -> Result { + let current_version = table_data.current_version(); + let mut last_sequence = table_data.last_sequence(); + // Switch (freeze) all mutable memtables. And update segment duration if + // suggestion is returned. + if let Some(suggest_segment_duration) = current_version.suggest_duration() { + info!( + "Update segment duration, table:{}, table_id:{}, segment_duration:{:?}", + table_data.name, table_data.id, suggest_segment_duration + ); + assert!(!suggest_segment_duration.is_zero()); + + let mut new_table_opts = (*table_data.table_options()).clone(); + new_table_opts.segment_duration = Some(ReadableDuration(suggest_segment_duration)); + + let edit_req = { + let meta_update = MetaUpdate::AlterOptions(AlterOptionsMeta { + space_id: table_data.space_id, + table_id: table_data.id, + options: new_table_opts.clone(), + }); + MetaEditRequest { + shard_info: table_data.shard_info, + meta_edit: MetaEdit::Update(meta_update), + } + }; + self.space_store + .manifest + .apply_edit(edit_req) + .await + .context(StoreVersionEdit)?; + + // Now the segment duration is applied, we can stop sampling and freeze the + // sampling memtable. 
+ if let Some(seq) = current_version.freeze_sampling_memtable() { + last_sequence = seq.max(last_sequence); + } + } else if let Some(seq) = current_version.switch_memtables() { + last_sequence = seq.max(last_sequence); + } + + info!("Try to trigger memtable flush of table, table:{}, table_id:{}, max_memtable_id:{}, last_sequence:{last_sequence}", + table_data.name, table_data.id, table_data.last_memtable_id()); + + // Try to flush all memtables of current table + Ok(TableFlushRequest { + table_data: table_data.clone(), + max_sequence: last_sequence, + }) + } + /// This will write picked memtables [FlushableMemTables] to level 0 sst /// files. Sampling memtable may be dumped into multiple sst file according /// to the sampled segment duration. @@ -648,22 +660,23 @@ impl SpaceStore { "Begin compact table, table_name:{}, id:{}, task:{:?}", table_data.name, table_data.id, task ); + let inputs = task.inputs(); let mut edit_meta = VersionEditMeta { space_id: table_data.space_id, table_id: table_data.id, flushed_sequence: 0, // Use the number of compaction inputs as the estimated number of files to add. - files_to_add: Vec::with_capacity(task.compaction_inputs.len()), + files_to_add: Vec::with_capacity(inputs.len()), files_to_delete: vec![], mems_to_remove: vec![], }; - if task.num_expired_files() == 0 && task.num_compact_files() == 0 { + if task.is_empty() { // Nothing to compact. return Ok(()); } - for files in &task.expired { + for files in task.expired() { self.delete_expired_files(table_data, request_id, files, &mut edit_meta); } @@ -675,7 +688,7 @@ impl SpaceStore { task.num_compact_files(), ); - for input in &task.compaction_inputs { + for input in inputs { self.compact_input_files( request_id, table_data, diff --git a/analytic_engine/src/instance/mod.rs b/analytic_engine/src/instance/mod.rs index 89a71ba2d6..1faf254f08 100644 --- a/analytic_engine/src/instance/mod.rs +++ b/analytic_engine/src/instance/mod.rs @@ -15,6 +15,7 @@ pub(crate) mod mem_collector; pub mod open; mod read; pub(crate) mod serial_executor; +pub mod wal_replayer; pub(crate) mod write; use std::sync::Arc; @@ -44,7 +45,7 @@ use crate::{ meta_data::cache::MetaCacheRef, }, table::data::{TableDataRef, TableShardInfo}, - TableOptions, + RecoverMode, TableOptions, }; #[allow(clippy::enum_variant_names)] @@ -159,6 +160,7 @@ pub struct Instance { /// Options for scanning sst pub(crate) scan_options: ScanOptions, pub(crate) iter_options: Option, + pub(crate) recover_mode: RecoverMode, } impl Instance { diff --git a/analytic_engine/src/instance/open.rs b/analytic_engine/src/instance/open.rs index 6c4d178fe2..cf11b05b1d 100644 --- a/analytic_engine/src/instance/open.rs +++ b/analytic_engine/src/instance/open.rs @@ -3,19 +3,16 @@ //! 
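// A minimal sketch, with u64 sequence numbers and plain Options standing in
// for the engine's version-set calls, of how the reworked preprocess_flush
// above derives the flush boundary: freezing the sampling memtable (when a
// segment duration was just suggested) or switching the mutable memtables may
// report the sequence they were frozen at, and the flush request takes the
// larger of that and the table's last recorded sequence.
fn flush_max_sequence(
    table_last_sequence: u64,
    frozen_sampling_seq: Option<u64>, // freeze_sampling_memtable()
    switched_seq: Option<u64>,        // switch_memtables()
    segment_duration_suggested: bool, // suggest_duration().is_some()
) -> u64 {
    let mut last_sequence = table_last_sequence;
    if segment_duration_suggested {
        if let Some(seq) = frozen_sampling_seq {
            last_sequence = seq.max(last_sequence);
        }
    } else if let Some(seq) = switched_seq {
        last_sequence = seq.max(last_sequence);
    }
    last_sequence
}

fn main() {
    // A frozen sampling memtable ahead of the recorded sequence widens the flush range.
    assert_eq!(flush_max_sequence(10, Some(12), None, true), 12);
    // Otherwise the switched-memtable sequence never shrinks the range below last_sequence.
    assert_eq!(flush_max_sequence(10, None, Some(8), false), 10);
}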
Open logic of instance use std::{ - collections::{HashMap, VecDeque}, + collections::HashMap, sync::{Arc, RwLock}, }; -use common_types::{schema::IndexInWriterSchema, table::ShardId}; -use log::{debug, error, info, trace}; +use common_types::table::ShardId; +use log::{error, info}; use object_store::ObjectStoreRef; use snafu::ResultExt; use table_engine::{engine::TableDef, table::TableId}; -use wal::{ - log_batch::LogEntry, - manager::{ReadBoundary, ReadContext, ReadRequest, WalManager, WalManagerRef}, -}; +use wal::manager::WalManagerRef; use super::{engine::OpenTablesOfShard, flush_compaction::Flusher}; use crate::{ @@ -23,16 +20,12 @@ use crate::{ context::OpenContext, engine, instance::{ - self, - engine::{ApplyMemTable, FlushTable, OpenManifest, ReadMetaUpdate, ReadWal, Result}, - flush_compaction::TableFlushOptions, + engine::{OpenManifest, ReadMetaUpdate, Result}, mem_collector::MemUsageCollector, - serial_executor::TableOpSerialExecutor, - write::MemTableWriter, + wal_replayer::{ReplayMode, WalReplayer}, Instance, SpaceStore, }, manifest::{details::ManifestImpl, LoadRequest, Manifest, ManifestRef}, - payload::{ReadPayload, WalDecoder}, row_iter::IterOptions, space::{SpaceAndTable, SpaceRef, Spaces}, sst::{ @@ -41,6 +34,7 @@ use crate::{ }, table::data::TableDataRef, table_meta_set_impl::TableMetaSetImpl, + RecoverMode, }; const MAX_RECORD_BATCHES_IN_FLIGHT_WHEN_COMPACTION_READ: usize = 64; @@ -133,6 +127,7 @@ impl Instance { .map(|v| v.as_byte() as usize), iter_options, scan_options, + recover_mode: ctx.config.recover_mode, }); Ok(instance) @@ -150,6 +145,7 @@ impl Instance { self.replay_batch_size, self.make_flusher(), self.max_retry_flush_limit, + self.recover_mode, )?; shard_opener.open().await @@ -197,10 +193,11 @@ struct ShardOpener { shard_id: ShardId, manifest: ManifestRef, wal_manager: WalManagerRef, - states: HashMap, + stages: HashMap, wal_replay_batch_size: usize, flusher: Flusher, max_retry_flush_limit: usize, + recover_mode: RecoverMode, } impl ShardOpener { @@ -211,8 +208,9 @@ impl ShardOpener { wal_replay_batch_size: usize, flusher: Flusher, max_retry_flush_limit: usize, + recover_mode: RecoverMode, ) -> Result { - let mut states = HashMap::with_capacity(shard_context.table_ctxs.len()); + let mut stages = HashMap::with_capacity(shard_context.table_ctxs.len()); for table_ctx in shard_context.table_ctxs { let space = &table_ctx.space; let table_id = table_ctx.table_def.id; @@ -226,17 +224,18 @@ impl ShardOpener { space: table_ctx.space, }) }; - states.insert(table_id, state); + stages.insert(table_id, state); } Ok(Self { shard_id: shard_context.shard_id, manifest, wal_manager, - states, + stages, wal_replay_batch_size, flusher, max_retry_flush_limit, + recover_mode, }) } @@ -248,9 +247,9 @@ impl ShardOpener { self.recover_table_datas().await?; // Retrieve the table results and return. - let states = std::mem::take(&mut self.states); - let mut table_results = HashMap::with_capacity(states.len()); - for (table_id, state) in states { + let stages = std::mem::take(&mut self.stages); + let mut table_results = HashMap::with_capacity(stages.len()); + for (table_id, state) in stages { match state { TableOpenStage::Failed(e) => { table_results.insert(table_id, Err(e)); @@ -274,7 +273,12 @@ impl ShardOpener { /// Recover table meta data from manifest based on shard. 
async fn recover_table_metas(&mut self) -> Result<()> { - for (table_id, state) in self.states.iter_mut() { + info!( + "ShardOpener recover table metas begin, shard_id:{}", + self.shard_id + ); + + for (table_id, state) in self.stages.iter_mut() { match state { // Only do the meta recovery work in `RecoverTableMeta` state. TableOpenStage::RecoverTableMeta(ctx) => { @@ -289,7 +293,10 @@ impl ShardOpener { let table_data = ctx.space.find_table_by_id(*table_id); Ok(table_data.map(|data| (data, ctx.space.clone()))) } - Err(e) => Err(e), + Err(e) => { + error!("ShardOpener recover single table meta failed, table:{:?}, shard_id:{}, err:{e}", ctx.table_def, self.shard_id); + Err(e) + } }; match result { @@ -314,55 +321,88 @@ impl ShardOpener { } } + info!( + "ShardOpener recover table metas finish, shard_id:{}", + self.shard_id + ); Ok(()) } /// Recover table data based on shard. async fn recover_table_datas(&mut self) -> Result<()> { - for state in self.states.values_mut() { - match state { + info!( + "ShardOpener recover table datas begin, shard_id:{}", + self.shard_id + ); + + // Replay wal logs of tables. + let mut replay_table_datas = Vec::with_capacity(self.stages.len()); + for (table_id, stage) in self.stages.iter_mut() { + match stage { // Only do the wal recovery work in `RecoverTableData` state. TableOpenStage::RecoverTableData(ctx) => { - let table_data = ctx.table_data.clone(); - let read_ctx = ReadContext { - batch_size: self.wal_replay_batch_size, - ..Default::default() - }; - - let result = match Self::recover_single_table_data( - &self.flusher, - self.max_retry_flush_limit, - self.wal_manager.as_ref(), - table_data.clone(), - self.wal_replay_batch_size, - &read_ctx, - ) - .await - { - Ok(()) => Ok((table_data, ctx.space.clone())), - Err(e) => Err(e), - }; - - match result { - Ok((table_data, space)) => { - *state = TableOpenStage::Success(Some(SpaceAndTable::new( - space, table_data, - ))); - } - Err(e) => *state = TableOpenStage::Failed(e), - } + replay_table_datas.push(ctx.table_data.clone()); } // Table was found opened, or failed in meta recovery stage. TableOpenStage::Failed(_) | TableOpenStage::Success(_) => {} TableOpenStage::RecoverTableMeta(_) => { return OpenTablesOfShard { - msg: format!("unexpected table state:{state:?}"), + msg: format!( + "unexpected stage, stage:{stage:?}, table_id:{table_id}, shard_id:{}", + self.shard_id + ), } - .fail() + .fail(); } } } + let replay_mode = match self.recover_mode { + RecoverMode::TableBased => ReplayMode::TableBased, + RecoverMode::ShardBased => ReplayMode::RegionBased, + }; + let mut wal_replayer = WalReplayer::new( + &replay_table_datas, + self.shard_id, + self.wal_manager.clone(), + self.wal_replay_batch_size, + self.flusher.clone(), + self.max_retry_flush_limit, + replay_mode, + ); + let mut table_results = wal_replayer.replay().await?; + + // Process the replay results. + for table_data in replay_table_datas { + let table_id = table_data.id; + // Each `table_data` has its related `stage` in `stages`, impossible to panic + // here. 
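// A minimal sketch, with stand-in enums and simplified payloads, of the
// per-table state machine ShardOpener walks above and of the replay-strategy
// choice: every table moves RecoverTableMeta -> RecoverTableData ->
// Success/Failed, a failure stays scoped to its own table, and the WAL replay
// mode is chosen once per shard from the configured RecoverMode, where
// shard-based recovery replays the whole WAL region in one pass instead of
// one table at a time.
#[allow(dead_code)]
#[derive(Debug)]
enum TableOpenStage {
    RecoverTableMeta,
    RecoverTableData,
    Success,
    Failed(String),
}

#[allow(dead_code)]
#[derive(Clone, Copy)]
enum RecoverMode {
    TableBased,
    ShardBased,
}

#[derive(Debug, PartialEq)]
enum ReplayMode {
    TableBased,
    RegionBased,
}

fn replay_mode_for(mode: RecoverMode) -> ReplayMode {
    match mode {
        RecoverMode::TableBased => ReplayMode::TableBased,
        RecoverMode::ShardBased => ReplayMode::RegionBased,
    }
}

fn main() {
    assert_eq!(replay_mode_for(RecoverMode::ShardBased), ReplayMode::RegionBased);
    let failed = TableOpenStage::Failed("replay error for this table only".to_string());
    println!("a single table can end up as {failed:?} while the rest of the shard opens");
}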
+ let stage = self.stages.get_mut(&table_id).unwrap(); + let failed_table_opt = table_results.remove(&table_id); + + match (&stage, failed_table_opt) { + (TableOpenStage::RecoverTableData(ctx), None) => { + let space_table = SpaceAndTable::new(ctx.space.clone(), ctx.table_data.clone()); + *stage = TableOpenStage::Success(Some(space_table)); + } + + (TableOpenStage::RecoverTableData(_), Some(e)) => { + error!("ShardOpener replay wals of single table failed, table:{}, table_id:{}, shard_id:{}, err:{e}", table_data.name, table_data.id, self.shard_id); + *stage = TableOpenStage::Failed(e); + } + + (other_stage, _) => { + return OpenTablesOfShard { + msg: format!("unexpected stage, stage:{other_stage:?}, table_id:{table_id}, shard_id:{}", self.shard_id), + }.fail(); + } + } + } + + info!( + "ShardOpener recover table datas finish, shard_id:{}", + self.shard_id + ); Ok(()) } @@ -398,171 +438,4 @@ impl ShardOpener { Ok(()) } - - /// Recover table data from wal. - /// - /// Called by write worker - pub(crate) async fn recover_single_table_data( - flusher: &Flusher, - max_retry_flush_limit: usize, - wal_manager: &dyn WalManager, - table_data: TableDataRef, - replay_batch_size: usize, - read_ctx: &ReadContext, - ) -> Result<()> { - debug!( - "Instance recover table from wal, replay batch size:{}, table id:{}, shard info:{:?}", - replay_batch_size, table_data.id, table_data.shard_info - ); - - let table_location = table_data.table_location(); - let wal_location = - instance::create_wal_location(table_location.id, table_location.shard_info); - let read_req = ReadRequest { - location: wal_location, - start: ReadBoundary::Excluded(table_data.current_version().flushed_sequence()), - end: ReadBoundary::Max, - }; - - // Read all wal of current table. - let mut log_iter = wal_manager - .read_batch(read_ctx, &read_req) - .await - .context(ReadWal)?; - - let mut serial_exec = table_data.serial_exec.lock().await; - let mut log_entry_buf = VecDeque::with_capacity(replay_batch_size); - loop { - // fetch entries to log_entry_buf - let decoder = WalDecoder::default(); - log_entry_buf = log_iter - .next_log_entries(decoder, log_entry_buf) - .await - .context(ReadWal)?; - - // Replay all log entries of current table - Self::replay_table_log_entries( - flusher, - max_retry_flush_limit, - &mut serial_exec, - &table_data, - &log_entry_buf, - ) - .await?; - - // No more entries. - if log_entry_buf.is_empty() { - break; - } - } - - Ok(()) - } - - /// Replay all log entries into memtable and flush if necessary. 
- async fn replay_table_log_entries( - flusher: &Flusher, - max_retry_flush_limit: usize, - serial_exec: &mut TableOpSerialExecutor, - table_data: &TableDataRef, - log_entries: &VecDeque>, - ) -> Result<()> { - if log_entries.is_empty() { - info!( - "Instance replay an empty table log entries, table:{}, table_id:{:?}", - table_data.name, table_data.id - ); - - // No data in wal - return Ok(()); - } - - let last_sequence = log_entries.back().unwrap().sequence; - - debug!( - "Instance replay table log entries begin, table:{}, table_id:{:?}, sequence:{}", - table_data.name, table_data.id, last_sequence - ); - - for log_entry in log_entries { - let (sequence, payload) = (log_entry.sequence, &log_entry.payload); - - // Apply to memtable - match payload { - ReadPayload::Write { row_group } => { - trace!( - "Instance replay row_group, table:{}, row_group:{:?}", - table_data.name, - row_group - ); - - let table_schema_version = table_data.schema_version(); - if table_schema_version != row_group.schema().version() { - // Data with old schema should already been flushed, but we avoid panic - // here. - error!( - "Ignore data with mismatch schema version during replaying, \ - table:{}, \ - table_id:{:?}, \ - expect:{}, \ - actual:{}, \ - last_sequence:{}, \ - sequence:{}", - table_data.name, - table_data.id, - table_schema_version, - row_group.schema().version(), - last_sequence, - sequence, - ); - - continue; - } - - let index_in_writer = - IndexInWriterSchema::for_same_schema(row_group.schema().num_columns()); - let memtable_writer = MemTableWriter::new(table_data.clone(), serial_exec); - memtable_writer - .write(sequence, &row_group.into(), index_in_writer) - .context(ApplyMemTable { - space_id: table_data.space_id, - table: &table_data.name, - table_id: table_data.id, - })?; - - // Flush the table if necessary. - if table_data.should_flush_table(serial_exec) { - let opts = TableFlushOptions { - res_sender: None, - max_retry_flush_limit, - }; - let flush_scheduler = serial_exec.flush_scheduler(); - flusher - .schedule_flush(flush_scheduler, table_data, opts) - .await - .context(FlushTable { - space_id: table_data.space_id, - table: &table_data.name, - table_id: table_data.id, - })?; - } - } - ReadPayload::AlterSchema { .. } | ReadPayload::AlterOptions { .. } => { - // Ignore records except Data. - // - // - DDL (AlterSchema and AlterOptions) should be recovered - // from Manifest on start. 
- } - } - } - - debug!( - "Instance replay table log entries end, table:{}, table_id:{:?}, last_sequence:{}", - table_data.name, table_data.id, last_sequence - ); - - table_data.set_last_sequence(last_sequence); - - Ok(()) - } } diff --git a/analytic_engine/src/instance/serial_executor.rs b/analytic_engine/src/instance/serial_executor.rs index 0e48ce5f18..4a84404e2c 100644 --- a/analytic_engine/src/instance/serial_executor.rs +++ b/analytic_engine/src/instance/serial_executor.rs @@ -154,7 +154,11 @@ impl TableFlushScheduler { *flush_state = FlushState::Flushing; break; } - FlushState::Flushing => (), + FlushState::Flushing => { + if !block_on_write_thread { + return Ok(()); + } + } FlushState::Failed { err_msg } => { if self .schedule_sync @@ -223,6 +227,8 @@ fn on_flush_finished(schedule_sync: ScheduleSyncRef, res: &Result<()>) { *flush_state = FlushState::Ready; } Err(e) => { + error!("Failed to run flush task, err:{e}"); + schedule_sync.inc_flush_failure_count(); let err_msg = e.to_string(); *flush_state = FlushState::Failed { err_msg }; diff --git a/analytic_engine/src/instance/wal_replayer.rs b/analytic_engine/src/instance/wal_replayer.rs new file mode 100644 index 0000000000..c1e6fd96b9 --- /dev/null +++ b/analytic_engine/src/instance/wal_replayer.rs @@ -0,0 +1,618 @@ +// Copyright 2023 CeresDB Project Authors. Licensed under Apache-2.0. + +//! Wal replayer + +use std::{ + collections::{HashMap, VecDeque}, + fmt::Display, + ops::Range, +}; + +use async_trait::async_trait; +use common_types::{schema::IndexInWriterSchema, table::ShardId}; +use common_util::error::BoxError; +use lazy_static::lazy_static; +use log::{debug, error, info, trace}; +use prometheus::{exponential_buckets, register_histogram, Histogram}; +use snafu::ResultExt; +use table_engine::table::TableId; +use tokio::sync::MutexGuard; +use wal::{ + log_batch::LogEntry, + manager::{ + ReadBoundary, ReadContext, ReadRequest, RegionId, ScanContext, ScanRequest, WalManagerRef, + }, +}; + +use crate::{ + instance::{ + self, + engine::{Error, ReplayWalWithCause, Result}, + flush_compaction::{Flusher, TableFlushOptions}, + serial_executor::TableOpSerialExecutor, + write::MemTableWriter, + }, + payload::{ReadPayload, WalDecoder}, + table::data::TableDataRef, +}; + +// Metrics of wal replayer +lazy_static! { + static ref PULL_LOGS_DURATION_HISTOGRAM: Histogram = register_histogram!( + "wal_replay_pull_logs_duration", + "Histogram for pull logs duration in wal replay in seconds", + exponential_buckets(0.01, 2.0, 13).unwrap() + ) + .unwrap(); + static ref APPLY_LOGS_DURATION_HISTOGRAM: Histogram = register_histogram!( + "wal_replay_apply_logs_duration", + "Histogram for apply logs duration in wal replay in seconds", + exponential_buckets(0.01, 2.0, 13).unwrap() + ) + .unwrap(); +} + +/// Wal replayer supporting both table based and region based +// TODO: limit the memory usage in `RegionBased` mode. 
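// In short: `TableBased` replay issues one WAL read per table, starting just
// after that table's flushed sequence, while `RegionBased` replay scans the
// shard's WAL region once, splits every batch of entries by table id, and
// therefore has to lock all tables of the shard up front.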
+pub struct WalReplayer<'a> { + context: ReplayContext, + replay: Box, + table_datas: &'a [TableDataRef], +} + +impl<'a> WalReplayer<'a> { + pub fn new( + table_datas: &'a [TableDataRef], + shard_id: ShardId, + wal_manager: WalManagerRef, + wal_replay_batch_size: usize, + flusher: Flusher, + max_retry_flush_limit: usize, + replay_mode: ReplayMode, + ) -> Self { + let context = ReplayContext { + shard_id, + wal_manager, + wal_replay_batch_size, + flusher, + max_retry_flush_limit, + }; + + let replay = Self::build_replay(replay_mode); + + Self { + replay, + context, + table_datas, + } + } + + fn build_replay(mode: ReplayMode) -> Box { + info!("Replay wal in mode:{mode:?}"); + + match mode { + ReplayMode::RegionBased => Box::new(RegionBasedReplay), + ReplayMode::TableBased => Box::new(TableBasedReplay), + } + } + + /// Replay tables and return the failed tables and the causes. + pub async fn replay(&mut self) -> Result { + // Build replay action according to mode. + info!( + "Replay wal logs begin, context:{}, tables:{:?}", + self.context, self.table_datas + ); + let result = self.replay.run(&self.context, self.table_datas).await; + info!( + "Replay wal logs finish, context:{}, tables:{:?}", + self.context, self.table_datas, + ); + + result + } +} + +pub struct ReplayContext { + pub shard_id: ShardId, + pub wal_manager: WalManagerRef, + pub wal_replay_batch_size: usize, + pub flusher: Flusher, + pub max_retry_flush_limit: usize, +} + +impl Display for ReplayContext { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ReplayContext") + .field("shard_id", &self.shard_id) + .field("replay_batch_size", &self.wal_replay_batch_size) + .field("max_retry_flush_limit", &self.max_retry_flush_limit) + .finish() + } +} + +#[derive(Debug, Clone, Copy)] +pub enum ReplayMode { + RegionBased, + TableBased, +} + +pub type FailedTables = HashMap; + +/// Replay action, the abstract of different replay strategies +#[async_trait] +trait Replay: Send + Sync + 'static { + async fn run( + &self, + context: &ReplayContext, + table_datas: &[TableDataRef], + ) -> Result; +} + +/// Table based wal replay +struct TableBasedReplay; + +#[async_trait] +impl Replay for TableBasedReplay { + async fn run( + &self, + context: &ReplayContext, + table_datas: &[TableDataRef], + ) -> Result { + debug!("Replay wal logs on table mode, context:{context}, tables:{table_datas:?}",); + + let mut faileds = HashMap::new(); + let read_ctx = ReadContext { + batch_size: context.wal_replay_batch_size, + ..Default::default() + }; + for table_data in table_datas { + let table_id = table_data.id; + if let Err(e) = Self::recover_table_logs(context, table_data, &read_ctx).await { + faileds.insert(table_id, e); + } + } + + Ok(faileds) + } +} + +impl TableBasedReplay { + async fn recover_table_logs( + context: &ReplayContext, + table_data: &TableDataRef, + read_ctx: &ReadContext, + ) -> Result<()> { + let table_location = table_data.table_location(); + let wal_location = + instance::create_wal_location(table_location.id, table_location.shard_info); + let read_req = ReadRequest { + location: wal_location, + start: ReadBoundary::Excluded(table_data.current_version().flushed_sequence()), + end: ReadBoundary::Max, + }; + + // Read all wal of current table. 
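        // Only entries with sequences strictly greater than the flushed
        // sequence are requested (see `ReadBoundary::Excluded` above), so rows
        // already persisted in SST files are not re-applied to the memtable.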
+ let mut log_iter = context + .wal_manager + .read_batch(read_ctx, &read_req) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + + let mut serial_exec = table_data.serial_exec.lock().await; + let mut log_entry_buf = VecDeque::with_capacity(context.wal_replay_batch_size); + loop { + // fetch entries to log_entry_buf + let _timer = PULL_LOGS_DURATION_HISTOGRAM.start_timer(); + let decoder = WalDecoder::default(); + log_entry_buf = log_iter + .next_log_entries(decoder, log_entry_buf) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + + if log_entry_buf.is_empty() { + break; + } + + // Replay all log entries of current table + let _timer = APPLY_LOGS_DURATION_HISTOGRAM.start_timer(); + replay_table_log_entries( + &context.flusher, + context.max_retry_flush_limit, + &mut serial_exec, + table_data, + log_entry_buf.iter(), + ) + .await?; + } + + Ok(()) + } +} + +/// Region based wal replay +struct RegionBasedReplay; + +#[async_trait] +impl Replay for RegionBasedReplay { + async fn run( + &self, + context: &ReplayContext, + table_datas: &[TableDataRef], + ) -> Result { + debug!("Replay wal logs on region mode, context:{context}, tables:{table_datas:?}",); + + // Init all table results to be oks, and modify to errs when failed to replay. + let mut faileds = FailedTables::new(); + let scan_ctx = ScanContext { + batch_size: context.wal_replay_batch_size, + ..Default::default() + }; + + Self::replay_region_logs(context, table_datas, &scan_ctx, &mut faileds).await?; + + Ok(faileds) + } +} + +impl RegionBasedReplay { + /// Replay logs in same region. + /// + /// Steps: + /// + Scan all logs of region. + /// + Split logs according to table ids. + /// + Replay logs to recover data of tables. + async fn replay_region_logs( + context: &ReplayContext, + table_datas: &[TableDataRef], + scan_ctx: &ScanContext, + faileds: &mut FailedTables, + ) -> Result<()> { + // Scan all wal logs of current shard. + let scan_req = ScanRequest { + region_id: context.shard_id as RegionId, + }; + + let mut log_iter = context + .wal_manager + .scan(scan_ctx, &scan_req) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + let mut log_entry_buf = VecDeque::with_capacity(context.wal_replay_batch_size); + + // Lock all related tables. + let mut serial_exec_ctxs = HashMap::with_capacity(table_datas.len()); + for table_data in table_datas { + let serial_exec = table_data.serial_exec.lock().await; + let serial_exec_ctx = SerialExecContext { + table_data: table_data.clone(), + serial_exec, + }; + serial_exec_ctxs.insert(table_data.id, serial_exec_ctx); + } + + // Split and replay logs. + loop { + let _timer = PULL_LOGS_DURATION_HISTOGRAM.start_timer(); + let decoder = WalDecoder::default(); + log_entry_buf = log_iter + .next_log_entries(decoder, log_entry_buf) + .await + .box_err() + .context(ReplayWalWithCause { msg: None })?; + + if log_entry_buf.is_empty() { + break; + } + + let _timer = APPLY_LOGS_DURATION_HISTOGRAM.start_timer(); + Self::replay_single_batch(context, &log_entry_buf, &mut serial_exec_ctxs, faileds) + .await?; + } + + Ok(()) + } + + async fn replay_single_batch( + context: &ReplayContext, + log_batch: &VecDeque>, + serial_exec_ctxs: &mut HashMap>, + faileds: &mut FailedTables, + ) -> Result<()> { + let mut table_batches = Vec::new(); + // TODO: No `group_by` method in `VecDeque`, so implement it manually here... + Self::split_log_batch_by_table(log_batch, &mut table_batches); + + // TODO: Replay logs of different tables in parallel. 
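        // For now each `TableBatch` is applied serially: tables already
        // recorded in `faileds` are skipped, and entries of tables that no
        // longer have a `SerialExecContext` on this shard (moved or dropped)
        // are ignored.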
+ for table_batch in table_batches { + // Some tables may have failed in previous replay, ignore them. + if faileds.contains_key(&table_batch.table_id) { + continue; + } + + // Replay all log entries of current table. + // Some tables may have been moved to other shards or dropped, ignore such logs. + if let Some(ctx) = serial_exec_ctxs.get_mut(&table_batch.table_id) { + let result = replay_table_log_entries( + &context.flusher, + context.max_retry_flush_limit, + &mut ctx.serial_exec, + &ctx.table_data, + log_batch.range(table_batch.range), + ) + .await; + + // If occur error, mark this table as failed and store the cause. + if let Err(e) = result { + faileds.insert(table_batch.table_id, e); + } + } + } + + Ok(()) + } + + fn split_log_batch_by_table
( + log_batch: &VecDeque>, + table_batches: &mut Vec, + ) { + table_batches.clear(); + + if log_batch.is_empty() { + return; + } + + // Split log batch by table id, for example: + // input batch: + // |1|1|2|2|2|3|3|3|3| + // + // output batches: + // |1|1|, |2|2|2|, |3|3|3|3| + let mut start_log_idx = 0usize; + let mut curr_log_idx = 0usize; + let mut start_table_id = log_batch.get(start_log_idx).unwrap().table_id; + loop { + let time_to_break = curr_log_idx == log_batch.len(); + let found_end_idx = if time_to_break { + true + } else { + let current_table_id = log_batch.get(curr_log_idx).unwrap().table_id; + current_table_id != start_table_id + }; + + if found_end_idx { + table_batches.push(TableBatch { + table_id: TableId::new(start_table_id), + range: start_log_idx..curr_log_idx, + }); + + // Step to next start idx. + start_log_idx = curr_log_idx; + start_table_id = if time_to_break { + // The final round, just set it to max as an invalid flag. + u64::MAX + } else { + log_batch.get(start_log_idx).unwrap().table_id + }; + } + + if time_to_break { + break; + } + curr_log_idx += 1; + } + } +} + +#[derive(Debug, Eq, PartialEq)] +struct TableBatch { + table_id: TableId, + range: Range, +} + +struct SerialExecContext<'a> { + table_data: TableDataRef, + serial_exec: MutexGuard<'a, TableOpSerialExecutor>, +} + +/// Replay all log entries into memtable and flush if necessary +async fn replay_table_log_entries( + flusher: &Flusher, + max_retry_flush_limit: usize, + serial_exec: &mut TableOpSerialExecutor, + table_data: &TableDataRef, + log_entries: impl Iterator>, +) -> Result<()> { + let flushed_sequence = table_data.current_version().flushed_sequence(); + debug!( + "Replay table log entries begin, table:{}, table_id:{:?}, last_sequence:{}, flushed_sequence:{flushed_sequence}", + table_data.name, table_data.id, table_data.last_sequence(), + ); + + for log_entry in log_entries { + let (sequence, payload) = (log_entry.sequence, &log_entry.payload); + + // Ignore too old logs(sequence <= `flushed_sequence`). + if sequence <= flushed_sequence { + continue; + } + + // Apply logs to memtable. + match payload { + ReadPayload::Write { row_group } => { + trace!( + "Instance replay row_group, table:{}, row_group:{:?}", + table_data.name, + row_group + ); + + // TODO: too strict check here, should be modified to like what in + // `ColumnSchema::compatible_for_write`.` + let table_schema_version = table_data.schema_version(); + if table_schema_version != row_group.schema().version() { + // Data with old schema should already been flushed, but we avoid panic + // here. + error!( + "Ignore data with mismatch schema version during replaying, \ + table:{}, \ + table_id:{:?}, \ + expect:{}, \ + actual:{}, \ + last_sequence:{}, \ + sequence:{}", + table_data.name, + table_data.id, + table_schema_version, + row_group.schema().version(), + table_data.last_sequence(), + sequence, + ); + + continue; + } + + let index_in_writer = + IndexInWriterSchema::for_same_schema(row_group.schema().num_columns()); + let memtable_writer = MemTableWriter::new(table_data.clone(), serial_exec); + memtable_writer + .write(sequence, &row_group.into(), index_in_writer) + .box_err() + .context(ReplayWalWithCause { + msg: Some(format!( + "table_id:{}, table_name:{}, space_id:{}", + table_data.space_id, table_data.name, table_data.id + )), + })?; + + // Flush the table if necessary. 
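                // Replay reuses the write-path flush check, so replaying a
                // large WAL region cannot grow memtables without bound: once
                // `should_flush_table` reports the memory limits are hit, a
                // flush is scheduled before further entries are applied.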
+ if table_data.should_flush_table(serial_exec) { + let opts = TableFlushOptions { + res_sender: None, + max_retry_flush_limit, + }; + let flush_scheduler = serial_exec.flush_scheduler(); + flusher + .schedule_flush(flush_scheduler, table_data, opts) + .await + .box_err() + .context(ReplayWalWithCause { + msg: Some(format!( + "table_id:{}, table_name:{}, space_id:{}", + table_data.space_id, table_data.name, table_data.id + )), + })?; + } + } + ReadPayload::AlterSchema { .. } | ReadPayload::AlterOptions { .. } => { + // Ignore records except Data. + // + // - DDL (AlterSchema and AlterOptions) should be recovered from + // Manifest on start. + } + } + + table_data.set_last_sequence(sequence); + } + + debug!( + "Replay table log entries finish, table:{}, table_id:{:?}, last_sequence:{}, flushed_sequence:{}", + table_data.name, table_data.id, table_data.last_sequence(), table_data.current_version().flushed_sequence() + ); + + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::collections::VecDeque; + + use table_engine::table::TableId; + use wal::log_batch::LogEntry; + + use crate::instance::wal_replayer::{RegionBasedReplay, TableBatch}; + + #[test] + fn test_split_log_batch_by_table() { + let test_set = test_set(); + for (test_batch, expected) in test_set { + check_split_result(&test_batch, &expected); + } + } + + fn test_set() -> Vec<(VecDeque>, Vec)> { + let test_log_batch1: VecDeque> = VecDeque::from([ + LogEntry { + table_id: 0, + sequence: 1, + payload: 0, + }, + LogEntry { + table_id: 0, + sequence: 2, + payload: 0, + }, + LogEntry { + table_id: 0, + sequence: 3, + payload: 0, + }, + LogEntry { + table_id: 1, + sequence: 1, + payload: 0, + }, + LogEntry { + table_id: 1, + sequence: 2, + payload: 0, + }, + LogEntry { + table_id: 2, + sequence: 1, + payload: 0, + }, + ]); + let expected1 = vec![ + TableBatch { + table_id: TableId::new(0), + range: 0..3, + }, + TableBatch { + table_id: TableId::new(1), + range: 3..5, + }, + TableBatch { + table_id: TableId::new(2), + range: 5..6, + }, + ]; + + let test_log_batch2: VecDeque> = VecDeque::from([LogEntry { + table_id: 0, + sequence: 1, + payload: 0, + }]); + let expected2 = vec![TableBatch { + table_id: TableId::new(0), + range: 0..1, + }]; + + let test_log_batch3: VecDeque> = VecDeque::default(); + let expected3 = vec![]; + + vec![ + (test_log_batch1, expected1), + (test_log_batch2, expected2), + (test_log_batch3, expected3), + ] + } + + fn check_split_result(batch: &VecDeque>, expected: &[TableBatch]) { + let mut table_batches = Vec::new(); + RegionBasedReplay::split_log_batch_by_table(batch, &mut table_batches); + assert_eq!(&table_batches, expected); + } +} diff --git a/analytic_engine/src/lib.rs b/analytic_engine/src/lib.rs index 9e95d8e97b..025845afbe 100644 --- a/analytic_engine/src/lib.rs +++ b/analytic_engine/src/lib.rs @@ -97,9 +97,21 @@ pub struct Config { /// + Kafka pub wal: WalStorageConfig, + /// Recover mode + /// + /// + TableBased, tables on same shard will be recovered table by table. + /// + ShardBased, tables on same shard will be recovered together. 
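As an illustration only (not code from this patch), a deployment that wants shard based recovery could set the new option roughly as in the sketch below, keeping every other field at its default; with the plain serde derive on `RecoverMode`, the same choice in a TOML config would likely read `recover_mode = "ShardBased"`.

    // Hypothetical caller-side sketch; only `recover_mode` is set explicitly.
    let config = analytic_engine::Config {
        recover_mode: analytic_engine::RecoverMode::ShardBased,
        ..Default::default()
    };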
+ pub recover_mode: RecoverMode, + pub remote_engine_client: remote_engine_client::config::Config, } +#[derive(Debug, Clone, Copy, Deserialize, Serialize)] +pub enum RecoverMode { + TableBased, + ShardBased, +} + impl Default for Config { fn default() -> Self { Self { @@ -127,6 +139,7 @@ impl Default for Config { max_bytes_per_write_batch: None, wal: WalStorageConfig::RocksDB(Box::default()), remote_engine_client: remote_engine_client::config::Config::default(), + recover_mode: RecoverMode::TableBased, } } } diff --git a/analytic_engine/src/manifest/details.rs b/analytic_engine/src/manifest/details.rs index 9e902086b7..5ac94c9b82 100644 --- a/analytic_engine/src/manifest/details.rs +++ b/analytic_engine/src/manifest/details.rs @@ -199,6 +199,8 @@ pub(crate) trait TableMetaSet: fmt::Debug + Send + Sync { // `SnapshotReoverer`. #[derive(Debug, Clone)] struct SnapshotRecoverer { + table_id: TableId, + space_id: SpaceId, log_store: LogStore, snapshot_store: SnapshotStore, } @@ -217,6 +219,11 @@ where } async fn create_latest_snapshot_with_prev(&self, prev_snapshot: Snapshot) -> Result { + debug!( + "Manifest recover with prev snapshot, snapshot:{:?}, table_id:{}, space_id:{}", + prev_snapshot, self.table_id, self.space_id + ); + let log_start_boundary = ReadBoundary::Excluded(prev_snapshot.end_seq); let mut reader = self.log_store.scan(log_start_boundary).await?; @@ -239,6 +246,11 @@ where } async fn create_latest_snapshot_without_prev(&self) -> Result> { + debug!( + "Manifest recover without prev snapshot, table_id:{}, space_id:{}", + self.table_id, self.space_id + ); + let mut reader = self.log_store.scan(ReadBoundary::Min).await?; let mut latest_seq = SequenceNumber::MIN; @@ -258,6 +270,10 @@ where data: manifest_data_builder.build(), })) } else { + debug!( + "Manifest recover nothing, table_id:{}, space_id:{}", + self.table_id, self.space_id + ); Ok(None) } } @@ -474,7 +490,7 @@ impl Manifest for ManifestImpl { } async fn recover(&self, load_req: &LoadRequest) -> GenericResult<()> { - info!("Manifest recover, request:{:?}", load_req); + info!("Manifest recover begin, request:{load_req:?}"); // Load table meta snapshot from storage. let location = WalLocation::new(load_req.shard_id as u64, load_req.table_id.as_u64()); @@ -490,6 +506,8 @@ impl Manifest for ManifestImpl { self.store.clone(), ); let reoverer = SnapshotRecoverer { + table_id: load_req.table_id, + space_id: load_req.space_id, log_store, snapshot_store, }; @@ -505,6 +523,8 @@ impl Manifest for ManifestImpl { self.table_meta_set.apply_edit_to_table(request)?; } + info!("Manifest recover finish, request:{load_req:?}"); + Ok(()) } @@ -1386,7 +1406,8 @@ mod tests { assert_eq!(snapshot.data, expect_table_manifest_data); assert_eq!(snapshot.end_seq, log_store.next_seq() - 1); - let recovered_snapshot = recover_snapshot(&log_store, &snapshot_store).await; + let recovered_snapshot = + recover_snapshot(table_id, 0, &log_store, &snapshot_store).await; assert_eq!(snapshot, recovered_snapshot.unwrap()); } // The logs in the log store should be cleared after snapshot. @@ -1418,7 +1439,8 @@ mod tests { assert_eq!(snapshot.data, expect_table_manifest_data); assert_eq!(snapshot.end_seq, log_store.next_seq() - 1); - let recovered_snapshot = recover_snapshot(&log_store, &snapshot_store).await; + let recovered_snapshot = + recover_snapshot(table_id, 0, &log_store, &snapshot_store).await; assert_eq!(snapshot, recovered_snapshot.unwrap()); } // The logs in the log store should be cleared after snapshot. 
@@ -1446,10 +1468,14 @@ mod tests { } async fn recover_snapshot( + table_id: TableId, + space_id: SpaceId, log_store: &MemLogStore, snapshot_store: &MemSnapshotStore, ) -> Option { let recoverer = SnapshotRecoverer { + table_id, + space_id, log_store: log_store.clone(), snapshot_store: snapshot_store.clone(), }; diff --git a/analytic_engine/src/memtable/mod.rs b/analytic_engine/src/memtable/mod.rs index bc4e4a2743..aeafdd9abe 100644 --- a/analytic_engine/src/memtable/mod.rs +++ b/analytic_engine/src/memtable/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! MemTable @@ -193,6 +193,19 @@ pub trait MemTable { /// /// If the memtable is empty, then the last sequence is 0. fn last_sequence(&self) -> SequenceNumber; + + /// Metrics of inner state. + fn metrics(&self) -> Metrics; +} + +#[derive(Debug)] +pub struct Metrics { + /// Size of original rows. + pub row_raw_size: usize, + /// Size of rows after encoded. + pub row_encoded_size: usize, + /// Row number count. + pub row_count: usize, } /// A reference to memtable diff --git a/analytic_engine/src/memtable/skiplist/factory.rs b/analytic_engine/src/memtable/skiplist/factory.rs index 89dd453587..3c11e6ea3c 100644 --- a/analytic_engine/src/memtable/skiplist/factory.rs +++ b/analytic_engine/src/memtable/skiplist/factory.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Skiplist memtable factory @@ -25,6 +25,7 @@ impl Factory for SkiplistMemTableFactory { schema: opts.schema, skiplist, last_sequence: AtomicU64::new(opts.creation_sequence), + metrics: Default::default(), }); Ok(memtable) diff --git a/analytic_engine/src/memtable/skiplist/mod.rs b/analytic_engine/src/memtable/skiplist/mod.rs index 4e3a1a8e27..ec06a98efd 100644 --- a/analytic_engine/src/memtable/skiplist/mod.rs +++ b/analytic_engine/src/memtable/skiplist/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! MemTable based on skiplist @@ -8,7 +8,7 @@ pub mod iter; use std::{ cmp::Ordering, convert::TryInto, - sync::atomic::{self, AtomicU64}, + sync::atomic::{self, AtomicU64, AtomicUsize}, }; use arena::{Arena, BasicStats}; @@ -26,10 +26,17 @@ use snafu::{ensure, ResultExt}; use crate::memtable::{ key::{ComparableInternalKey, KeySequence}, skiplist::iter::{ColumnarIterImpl, ReversedColumnarIterator}, - ColumnarIterPtr, EncodeInternalKey, InvalidPutSequence, InvalidRow, MemTable, PutContext, - Result, ScanContext, ScanRequest, + ColumnarIterPtr, EncodeInternalKey, InvalidPutSequence, InvalidRow, MemTable, + Metrics as MemtableMetrics, PutContext, Result, ScanContext, ScanRequest, }; +#[derive(Default, Debug)] +struct Metrics { + row_raw_size: AtomicUsize, + row_encoded_size: AtomicUsize, + row_count: AtomicUsize, +} + /// MemTable implementation based on skiplist pub struct SkiplistMemTable + Clone + Sync + Send> { /// Schema of this memtable, is immutable. @@ -38,6 +45,8 @@ pub struct SkiplistMemTable + Clone + Sync + Send> /// The last sequence of the rows in this memtable. Update to this field /// require external synchronization. 
last_sequence: AtomicU64, + + metrics: Metrics, } impl + Clone + Sync + Send + 'static> MemTable @@ -95,9 +104,20 @@ impl + Clone + Sync + Send + 'static> MemTable let row_value = &mut ctx.value_buf; let mut row_writer = ContiguousRowWriter::new(row_value, schema, &ctx.index_in_writer); row_writer.write_row(row).box_err().context(InvalidRow)?; - + let encoded_size = internal_key.len() + row_value.len(); self.skiplist.put(internal_key, row_value); + // Update metrics + self.metrics + .row_raw_size + .fetch_add(row.size(), atomic::Ordering::Relaxed); + self.metrics + .row_count + .fetch_add(1, atomic::Ordering::Relaxed); + self.metrics + .row_encoded_size + .fetch_add(encoded_size, atomic::Ordering::Relaxed); + Ok(()) } @@ -147,6 +167,20 @@ impl + Clone + Sync + Send + 'static> MemTable fn last_sequence(&self) -> SequenceNumber { self.last_sequence.load(atomic::Ordering::Relaxed) } + + fn metrics(&self) -> MemtableMetrics { + let row_raw_size = self.metrics.row_raw_size.load(atomic::Ordering::Relaxed); + let row_encoded_size = self + .metrics + .row_encoded_size + .load(atomic::Ordering::Relaxed); + let row_count = self.metrics.row_count.load(atomic::Ordering::Relaxed); + MemtableMetrics { + row_raw_size, + row_encoded_size, + row_count, + } + } } #[derive(Debug, Clone)] diff --git a/analytic_engine/src/sampler.rs b/analytic_engine/src/sampler.rs index 86729dba2b..9a72011eec 100644 --- a/analytic_engine/src/sampler.rs +++ b/analytic_engine/src/sampler.rs @@ -382,8 +382,8 @@ mod tests { &[(0, 2 * HOUR_MS as i64)], ); - let now_ts = Timestamp::now(); - let now = now_ts.as_i64(); + let now = 1672502400000i64; + let now_ts = Timestamp::new(now); let sec_ms_i64 = SEC_MS as i64; let bucket = TimeRange::bucket_of(now_ts, Duration::from_millis(2 * HOUR_MS)).unwrap(); diff --git a/analytic_engine/src/setup.rs b/analytic_engine/src/setup.rs index be39178bbc..940ce16579 100644 --- a/analytic_engine/src/setup.rs +++ b/analytic_engine/src/setup.rs @@ -173,15 +173,63 @@ impl WalsOpener for RocksDBWalsOpener { let data_path = Path::new(&rocksdb_wal_config.data_dir); let wal_path = data_path.join(WAL_DIR_NAME); let data_wal = RocksWalBuilder::new(wal_path, write_runtime.clone()) + .max_subcompactions(rocksdb_wal_config.data_namespace.max_subcompactions) .max_background_jobs(rocksdb_wal_config.data_namespace.max_background_jobs) .enable_statistics(rocksdb_wal_config.data_namespace.enable_statistics) + .write_buffer_size(rocksdb_wal_config.data_namespace.write_buffer_size.0) + .max_write_buffer_number(rocksdb_wal_config.data_namespace.max_write_buffer_number) + .level_zero_file_num_compaction_trigger( + rocksdb_wal_config + .data_namespace + .level_zero_file_num_compaction_trigger, + ) + .level_zero_slowdown_writes_trigger( + rocksdb_wal_config + .data_namespace + .level_zero_slowdown_writes_trigger, + ) + .level_zero_stop_writes_trigger( + rocksdb_wal_config + .data_namespace + .level_zero_stop_writes_trigger, + ) + .fifo_compaction_max_table_files_size( + rocksdb_wal_config + .data_namespace + .fifo_compaction_max_table_files_size + .0, + ) .build() .context(OpenWal)?; let manifest_path = data_path.join(MANIFEST_DIR_NAME); let manifest_wal = RocksWalBuilder::new(manifest_path, write_runtime) + .max_subcompactions(rocksdb_wal_config.meta_namespace.max_subcompactions) .max_background_jobs(rocksdb_wal_config.meta_namespace.max_background_jobs) .enable_statistics(rocksdb_wal_config.meta_namespace.enable_statistics) + .write_buffer_size(rocksdb_wal_config.meta_namespace.write_buffer_size.0) + 
.max_write_buffer_number(rocksdb_wal_config.meta_namespace.max_write_buffer_number) + .level_zero_file_num_compaction_trigger( + rocksdb_wal_config + .meta_namespace + .level_zero_file_num_compaction_trigger, + ) + .level_zero_slowdown_writes_trigger( + rocksdb_wal_config + .meta_namespace + .level_zero_slowdown_writes_trigger, + ) + .level_zero_stop_writes_trigger( + rocksdb_wal_config + .meta_namespace + .level_zero_stop_writes_trigger, + ) + .fifo_compaction_max_table_files_size( + rocksdb_wal_config + .meta_namespace + .fifo_compaction_max_table_files_size + .0, + ) .build() .context(OpenManifestWal)?; let opened_wals = OpenedWals { diff --git a/analytic_engine/src/sst/meta_data/cache.rs b/analytic_engine/src/sst/meta_data/cache.rs index 296c4e2476..5e2bacdcbd 100644 --- a/analytic_engine/src/sst/meta_data/cache.rs +++ b/analytic_engine/src/sst/meta_data/cache.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. use std::{ fmt::Debug, @@ -7,7 +7,7 @@ use std::{ use lru::LruCache; use parquet::file::metadata::FileMetaData; -use snafu::{OptionExt, ResultExt}; +use snafu::{ensure, OptionExt, ResultExt}; use crate::sst::{ meta_data::{DecodeCustomMetaData, KvMetaDataNotFound, ParquetMetaDataRef, Result}, @@ -39,14 +39,24 @@ impl MetaData { let kv_metas = file_meta_data .key_value_metadata() .context(KvMetaDataNotFound)?; - let kv_meta = kv_metas - .iter() - .find(|kv| kv.key == encoding::META_KEY) - .context(KvMetaDataNotFound)?; + + ensure!(!kv_metas.is_empty(), KvMetaDataNotFound); + let mut other_kv_metas = Vec::with_capacity(kv_metas.len() - 1); + let mut custom_kv_meta = None; + for kv_meta in kv_metas { + // Remove our extended custom meta data from the parquet metadata for small + // memory consumption in the cache. + if kv_meta.key == encoding::META_KEY { + custom_kv_meta = Some(kv_meta); + } else { + other_kv_metas.push(kv_meta.clone()); + } + } let custom = { + let custom_kv_meta = custom_kv_meta.context(KvMetaDataNotFound)?; let mut sst_meta = - encoding::decode_sst_meta_data(kv_meta).context(DecodeCustomMetaData)?; + encoding::decode_sst_meta_data(custom_kv_meta).context(DecodeCustomMetaData)?; if ignore_sst_filter { sst_meta.parquet_filter = None; } @@ -56,13 +66,17 @@ impl MetaData { // let's build a new parquet metadata without the extended key value // metadata. + let other_kv_metas = if other_kv_metas.is_empty() { + None + } else { + Some(other_kv_metas) + }; let parquet = { let thin_file_meta_data = FileMetaData::new( file_meta_data.version(), file_meta_data.num_rows(), file_meta_data.created_by().map(|v| v.to_string()), - // Remove the key value metadata. 
- None, + other_kv_metas, file_meta_data.schema_descr_ptr(), file_meta_data.column_orders().cloned(), ); @@ -111,3 +125,153 @@ impl MetaCache { self.cache.write().unwrap().put(key, value); } } + +#[cfg(test)] +mod tests { + use std::{fs::File, path::Path, sync::Arc}; + + use arrow::{ + array::UInt64Builder, + datatypes::{DataType, Field, Schema}, + record_batch::RecordBatch, + }; + use bytes::Bytes; + use common_types::{ + column_schema::Builder as ColumnSchemaBuilder, + schema::Builder as CustomSchemaBuilder, + time::{TimeRange, Timestamp}, + }; + use parquet::{arrow::ArrowWriter, file::footer}; + use parquet_ext::ParquetMetaData; + + use super::MetaData; + use crate::sst::parquet::{encoding, meta_data::ParquetMetaData as CustomParquetMetaData}; + + fn check_parquet_meta_data(original: &ParquetMetaData, processed: &ParquetMetaData) { + assert_eq!(original.page_indexes(), processed.page_indexes()); + assert_eq!(original.offset_indexes(), processed.offset_indexes()); + assert_eq!(original.num_row_groups(), processed.num_row_groups()); + assert_eq!(original.row_groups(), processed.row_groups()); + + let original_file_md = original.file_metadata(); + let processed_file_md = processed.file_metadata(); + assert_eq!(original_file_md.num_rows(), processed_file_md.num_rows()); + assert_eq!(original_file_md.version(), processed_file_md.version()); + assert_eq!( + original_file_md.created_by(), + processed_file_md.created_by() + ); + assert_eq!(original_file_md.schema(), processed_file_md.schema()); + assert_eq!( + original_file_md.schema_descr(), + processed_file_md.schema_descr() + ); + assert_eq!( + original_file_md.schema_descr_ptr(), + processed_file_md.schema_descr_ptr() + ); + assert_eq!( + original_file_md.column_orders(), + processed_file_md.column_orders() + ); + + if let Some(kv_metas) = original_file_md.key_value_metadata() { + let processed_kv_metas = processed_file_md.key_value_metadata().unwrap(); + assert_eq!(kv_metas.len(), processed_kv_metas.len() + 1); + let mut idx_for_processed = 0; + for kv in kv_metas { + if kv.key == encoding::META_KEY { + continue; + } + assert_eq!(kv, &processed_kv_metas[idx_for_processed]); + idx_for_processed += 1; + } + } else { + assert!(processed_file_md.key_value_metadata().is_none()); + } + } + + fn write_parquet_file_with_metadata( + parquet_file_path: &Path, + custom_meta_data: &CustomParquetMetaData, + ) { + let tsid_array = { + let mut builder = UInt64Builder::new(); + builder.append_value(10); + builder.append_null(); + builder.append_value(11); + builder.finish() + }; + let timestamp_array = { + let mut builder = UInt64Builder::new(); + builder.append_value(1000); + builder.append_null(); + builder.append_value(1001); + builder.finish() + }; + let file = File::create(parquet_file_path).unwrap(); + let schema = Schema::new(vec![ + Field::new("tsid", DataType::UInt64, true), + Field::new("timestamp", DataType::UInt64, true), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(tsid_array), Arc::new(timestamp_array)], + ) + .unwrap(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), None).unwrap(); + + let encoded_meta_data = encoding::encode_sst_meta_data(custom_meta_data.clone()).unwrap(); + writer.append_key_value_metadata(encoded_meta_data); + + writer.write(&batch).unwrap(); + writer.close().unwrap(); + } + + #[test] + fn test_arrow_meta_data() { + let temp_dir = tempfile::tempdir().unwrap(); + let parquet_file_path = temp_dir.path().join("test_arrow_meta_data.par"); + let schema = { + let 
tsid_column_schema = ColumnSchemaBuilder::new( + "tsid".to_string(), + common_types::datum::DatumKind::UInt64, + ) + .build() + .unwrap(); + let timestamp_column_schema = ColumnSchemaBuilder::new( + "timestamp".to_string(), + common_types::datum::DatumKind::Timestamp, + ) + .build() + .unwrap(); + CustomSchemaBuilder::new() + .auto_increment_column_id(true) + .add_key_column(tsid_column_schema) + .unwrap() + .add_key_column(timestamp_column_schema) + .unwrap() + .build() + .unwrap() + }; + let custom_meta_data = CustomParquetMetaData { + min_key: Bytes::from_static(&[0, 1]), + max_key: Bytes::from_static(&[2, 2]), + time_range: TimeRange::new_unchecked(Timestamp::new(0), Timestamp::new(10)), + max_sequence: 1001, + schema, + parquet_filter: None, + collapsible_cols_idx: vec![], + }; + write_parquet_file_with_metadata(parquet_file_path.as_path(), &custom_meta_data); + + let parquet_file = File::open(parquet_file_path.as_path()).unwrap(); + let parquet_meta_data = footer::parse_metadata(&parquet_file).unwrap(); + + let meta_data = MetaData::try_new(&parquet_meta_data, false).unwrap(); + + assert_eq!(**meta_data.custom(), custom_meta_data); + check_parquet_meta_data(&parquet_meta_data, meta_data.parquet()); + } +} diff --git a/analytic_engine/src/sst/parquet/async_reader.rs b/analytic_engine/src/sst/parquet/async_reader.rs index dac48bff44..b2181f727b 100644 --- a/analytic_engine/src/sst/parquet/async_reader.rs +++ b/analytic_engine/src/sst/parquet/async_reader.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Sst reader implementation based on parquet. @@ -30,17 +30,14 @@ use datafusion::{ metrics::ExecutionPlanMetricsSet, }, }; -use futures::{future::BoxFuture, FutureExt, Stream, StreamExt, TryFutureExt}; +use futures::{Stream, StreamExt}; use log::{debug, error}; use object_store::{ObjectStoreRef, Path}; use parquet::{ - arrow::{ - arrow_reader::RowSelection, async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder, - ProjectionMask, - }, + arrow::{arrow_reader::RowSelection, ParquetRecordBatchStreamBuilder, ProjectionMask}, file::metadata::RowGroupMetaData, }; -use parquet_ext::meta_data::ChunkReader; +use parquet_ext::{meta_data::ChunkReader, reader::ObjectStoreReader}; use snafu::ResultExt; use table_engine::predicate::PredicateRef; use tokio::sync::mpsc::{self, Receiver, Sender}; @@ -281,13 +278,23 @@ impl<'a> Reader<'a> { let mut streams = Vec::with_capacity(target_row_group_chunks.len()); for chunk in target_row_group_chunks { - let object_store_reader = - ObjectStoreReader::new(self.store.clone(), self.path.clone(), meta_data.clone()); + let object_store_reader = ObjectStoreReader::new( + self.store.clone(), + self.path.clone(), + parquet_metadata.clone(), + ); let mut builder = ParquetRecordBatchStreamBuilder::new(object_store_reader) .await .with_context(|| ParquetError)?; + let row_selection = self.build_row_selection(arrow_schema.clone(), &chunk, parquet_metadata)?; + + debug!( + "Build row selection for file path:{}, result:{row_selection:?}, page indexes:{}", + self.path, + parquet_metadata.page_indexes().is_some() + ); if let Some(selection) = row_selection { builder = builder.with_row_selection(selection); }; @@ -340,18 +347,32 @@ impl<'a> Reader<'a> { Ok(file_size) } - async fn load_meta_data_from_storage(&self) -> Result { + async fn load_meta_data_from_storage(&self, ignore_sst_filter: bool) -> Result { let file_size = self.load_file_size().await?; 
let chunk_reader_adapter = ChunkReaderAdapter::new(self.path, self.store); - let (meta_data, _) = + let (parquet_meta_data, _) = parquet_ext::meta_data::fetch_parquet_metadata(file_size, &chunk_reader_adapter) .await .with_context(|| FetchAndDecodeSstMeta { file_path: self.path.to_string(), })?; - Ok(Arc::new(meta_data)) + let object_store_reader = parquet_ext::reader::ObjectStoreReader::new( + self.store.clone(), + self.path.clone(), + Arc::new(parquet_meta_data), + ); + + let parquet_meta_data = parquet_ext::meta_data::meta_with_page_indexes(object_store_reader) + .await + .with_context(|| DecodePageIndexes { + file_path: self.path.to_string(), + })?; + + MetaData::try_new(&parquet_meta_data, ignore_sst_filter) + .box_err() + .context(DecodeSstMeta) } fn need_update_cache(&self) -> bool { @@ -375,12 +396,8 @@ impl<'a> Reader<'a> { let empty_predicate = self.predicate.exprs().is_empty(); let meta_data = { - let parquet_meta_data = self.load_meta_data_from_storage().await?; - let ignore_sst_filter = avoid_update_cache && empty_predicate; - MetaData::try_new(&parquet_meta_data, ignore_sst_filter) - .box_err() - .context(DecodeSstMeta)? + self.load_meta_data_from_storage(ignore_sst_filter).await? }; if avoid_update_cache || self.meta_cache.is_none() { @@ -413,71 +430,6 @@ impl<'a> Drop for Reader<'a> { } } -#[derive(Clone)] -struct ObjectStoreReader { - storage: ObjectStoreRef, - path: Path, - meta_data: MetaData, - begin: Instant, -} - -impl ObjectStoreReader { - fn new(storage: ObjectStoreRef, path: Path, meta_data: MetaData) -> Self { - Self { - storage, - path, - meta_data, - begin: Instant::now(), - } - } -} - -impl Drop for ObjectStoreReader { - fn drop(&mut self) { - debug!( - "ObjectStoreReader dropped, path:{}, elapsed:{:?}", - &self.path, - self.begin.elapsed() - ); - } -} - -impl AsyncFileReader for ObjectStoreReader { - fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { - self.storage - .get_range(&self.path, range) - .map_err(|e| { - parquet::errors::ParquetError::General(format!( - "Failed to fetch range from object store, err:{e}" - )) - }) - .boxed() - } - - fn get_byte_ranges( - &mut self, - ranges: Vec>, - ) -> BoxFuture<'_, parquet::errors::Result>> { - async move { - self.storage - .get_ranges(&self.path, &ranges) - .map_err(|e| { - parquet::errors::ParquetError::General(format!( - "Failed to fetch ranges from object store, err:{e}" - )) - }) - .await - } - .boxed() - } - - fn get_metadata( - &mut self, - ) -> BoxFuture<'_, parquet::errors::Result>> { - Box::pin(async move { Ok(self.meta_data.parquet().clone()) }) - } -} - pub struct ChunkReaderAdapter<'a> { path: &'a Path, store: &'a ObjectStoreRef, diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs index 2effc6bc36..1a92338dd4 100644 --- a/analytic_engine/src/sst/parquet/encoding.rs +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -526,7 +526,16 @@ impl HybridRecordDecoder { .iter() .map(|f| { if let DataType::List(nested_field) = f.data_type() { - Arc::new(Field::new(f.name(), nested_field.data_type().clone(), true)) + match f.data_type() { + DataType::Dictionary(_, _) => Arc::new(Field::new_dict( + f.name(), + nested_field.data_type().clone(), + true, + f.dict_id().unwrap(), + f.dict_is_ordered().unwrap(), + )), + _ => Arc::new(Field::new(f.name(), nested_field.data_type().clone(), true)), + } } else { f.clone() } @@ -1030,11 +1039,11 @@ mod tests { ArrowRecordBatch::try_new(schema.to_arrow_schema_ref(), columns).unwrap(); let 
input_record_batch2 = ArrowRecordBatch::try_new(schema.to_arrow_schema_ref(), columns2).unwrap(); - let row_nums = encoder + let num_rows = encoder .encode(vec![input_record_batch, input_record_batch2]) .await .unwrap(); - assert_eq!(2, row_nums); + assert_eq!(2, num_rows); // read encoded records back, and then compare with input records encoder.close().await.unwrap(); diff --git a/analytic_engine/src/sst/parquet/hybrid.rs b/analytic_engine/src/sst/parquet/hybrid.rs index df0b3808af..1cf7481ecf 100644 --- a/analytic_engine/src/sst/parquet/hybrid.rs +++ b/analytic_engine/src/sst/parquet/hybrid.rs @@ -127,6 +127,7 @@ pub fn build_hybrid_arrow_schema(schema: &Schema) -> ArrowSchemaRef { field.data_type().clone(), true, ))); + // TODO is there need to use new_dict? Arc::new(Field::new(field.name(), field_type, true)) } else { field.clone() @@ -418,6 +419,7 @@ impl ListArrayBuilder { let array_len = self.multi_row_arrays.len(); let mut offsets = MutableBuffer::new(array_len * std::mem::size_of::()); let child_data = self.build_child_data(&mut offsets)?; + // TODO is there need to use new_dict? let field = Arc::new(Field::new( LIST_ITEM_NAME, self.datum_kind.to_arrow_data_type(), diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 8bba1b41a2..378854bafa 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -333,7 +333,7 @@ mod tests { use common_types::{ bytes::Bytes, projected_schema::ProjectedSchema, - tests::{build_row, build_schema}, + tests::{build_row, build_row_for_dictionary, build_schema, build_schema_for_dictionary}, time::{TimeRange, Timestamp}, }; use common_util::{ @@ -358,8 +358,163 @@ mod tests { table_options::{self, StorageFormatHint}, }; + fn write_parquet_with_dictionary_encode_and_read_back( + runtime: Arc, + num_rows_per_row_group: usize, + expected_num_rows: Vec, + ) { + runtime.block_on(async { + let sst_factory = FactoryImpl; + let sst_write_options = SstWriteOptions { + storage_format_hint: StorageFormatHint::Auto, + num_rows_per_row_group, + compression: table_options::Compression::Uncompressed, + max_buffer_size: 0, + }; + + let dir = tempdir().unwrap(); + let root = dir.path(); + let store: ObjectStoreRef = Arc::new(LocalFileSystem::new_with_prefix(root).unwrap()); + let store_picker: ObjectStorePickerRef = Arc::new(store); + let sst_file_path = Path::from("test_dictionary.par"); + + let schema = build_schema_for_dictionary(); + let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); + let sst_meta = MetaData { + min_key: Bytes::from_static(b"100"), + max_key: Bytes::from_static(b"200"), + time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), + max_sequence: 200, + schema: schema.clone(), + }; + + let mut counter = 5; + let record_batch_stream = Box::new(stream::poll_fn(move |_| -> Poll> { + if counter == 0 { + return Poll::Ready(None); + } + counter -= 1; + + let ts = 100 + counter; + let rows = vec![ + build_row_for_dictionary(1, ts, Some("tagv1"), "tagv2", 1), + build_row_for_dictionary(2, ts, Some("tagv2"), "tagv2", 2), + build_row_for_dictionary(3, ts, None, "tagv3", 3), + build_row_for_dictionary(4, ts, Some("tagv3"), "tagv2", 2), + ]; + let batch = build_record_batch_with_key(schema.clone(), rows); + Poll::Ready(Some(Ok(batch))) + })); + let mut writer = sst_factory + .create_writer( + &sst_write_options, + &sst_file_path, + &store_picker, + Level::MAX, + ) + .await + .unwrap(); + let sst_info = writer + 
.write(RequestId::next_id(), &sst_meta, record_batch_stream) + .await + .unwrap(); + + assert_eq!(20, sst_info.row_num); + + let scan_options = ScanOptions::default(); + // read sst back to test + let sst_read_options = SstReadOptions { + reverse: false, + frequency: ReadFrequency::Frequent, + num_rows_per_row_group: 5, + projected_schema: reader_projected_schema, + predicate: Arc::new(Predicate::empty()), + meta_cache: None, + scan_options, + runtime: runtime.clone(), + }; + + let mut reader: Box = { + let mut reader = AsyncParquetReader::new( + &sst_file_path, + &sst_read_options, + None, + &store_picker, + None, + ); + let mut sst_meta_readback = reader + .meta_data() + .await + .unwrap() + .as_parquet() + .unwrap() + .as_ref() + .clone(); + // sst filter is built insider sst writer, so overwrite to default for + // comparison. + sst_meta_readback.parquet_filter = Default::default(); + assert_eq!(&sst_meta_readback, &ParquetMetaData::from(sst_meta)); + assert_eq!( + expected_num_rows, + reader + .row_groups() + .await + .iter() + .map(|g| g.num_rows()) + .collect::>() + ); + + Box::new(reader) + }; + let mut stream = reader.read().await.unwrap(); + let mut expect_rows = vec![]; + for counter in &[4, 3, 2, 1, 0] { + expect_rows.push(build_row_for_dictionary( + 1, + 100 + counter, + Some("tagv1"), + "tagv2", + 1, + )); + expect_rows.push(build_row_for_dictionary( + 2, + 100 + counter, + Some("tagv2"), + "tagv2", + 2, + )); + expect_rows.push(build_row_for_dictionary(3, 100 + counter, None, "tagv3", 3)); + expect_rows.push(build_row_for_dictionary( + 4, + 100 + counter, + Some("tagv3"), + "tagv2", + 2, + )); + } + check_stream(&mut stream, expect_rows).await; + }); + } + // TODO(xikai): add test for reverse reader + #[test] + fn test_parquet_use_dictionary() { + init_log_for_test(); + let runtime = Arc::new(runtime::Builder::default().build().unwrap()); + write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 5, vec![5, 5, 5, 5]); + write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 4, vec![4, 4, 4, 4, 4]); + write_parquet_with_dictionary_encode_and_read_back( + runtime.clone(), + 3, + vec![3, 3, 3, 3, 3, 3, 2], + ); + write_parquet_with_dictionary_encode_and_read_back( + runtime, + 2, + vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + ); + } #[test] fn test_parquet_build_and_read() { init_log_for_test(); @@ -391,7 +546,7 @@ mod tests { let sst_file_path = Path::from("data.par"); let schema = build_schema(); - let projected_schema = ProjectedSchema::no_projection(schema.clone()); + let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); let sst_meta = MetaData { min_key: Bytes::from_static(b"100"), max_key: Bytes::from_static(b"200"), @@ -440,7 +595,7 @@ mod tests { reverse: false, frequency: ReadFrequency::Frequent, num_rows_per_row_group: 5, - projected_schema, + projected_schema: reader_projected_schema, predicate: Arc::new(Predicate::empty()), meta_cache: None, scan_options, diff --git a/analytic_engine/src/sst/reader.rs b/analytic_engine/src/sst/reader.rs index 99872d448a..029b0aa34a 100644 --- a/analytic_engine/src/sst/reader.rs +++ b/analytic_engine/src/sst/reader.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Sst reader trait definition. 
@@ -15,20 +15,17 @@ pub mod error { #[derive(Debug, Snafu)] #[snafu(visibility(pub))] pub enum Error { - #[snafu(display("Try to read again, path:{}.\nBacktrace:\n{}", path, backtrace))] + #[snafu(display("Try to read again, path:{path}.\nBacktrace:\n{backtrace}"))] ReadAgain { backtrace: Backtrace, path: String }, - #[snafu(display("Fail to read persisted file, path:{}, err:{}", path, source))] + #[snafu(display("Fail to read persisted file, path:{path}, err:{source}"))] ReadPersist { path: String, source: GenericError }, - #[snafu(display("Failed to decode record batch, err:{}", source))] + #[snafu(display("Failed to decode record batch, err:{source}"))] DecodeRecordBatch { source: GenericError }, #[snafu(display( - "Failed to decode sst meta data, file_path:{}, err:{}.\nBacktrace:\n{:?}", - file_path, - source, - backtrace + "Failed to decode sst meta data, file_path:{file_path}, err:{source}.\nBacktrace:\n{backtrace:?}", ))] FetchAndDecodeSstMeta { file_path: String, @@ -36,43 +33,52 @@ pub mod error { backtrace: Backtrace, }, - #[snafu(display("Failed to decode sst meta data, err:{}", source))] + #[snafu(display( + "Failed to decode page indexes for meta data, file_path:{file_path}, err:{source}.\nBacktrace:\n{backtrace:?}", + ))] + DecodePageIndexes { + file_path: String, + source: parquet::errors::ParquetError, + backtrace: Backtrace, + }, + + #[snafu(display("Failed to decode sst meta data, err:{source}"))] DecodeSstMeta { source: GenericError }, - #[snafu(display("Sst meta data is not found.\nBacktrace:\n{}", backtrace))] + #[snafu(display("Sst meta data is not found.\nBacktrace:\n{backtrace}"))] SstMetaNotFound { backtrace: Backtrace }, - #[snafu(display("Fail to projection, err:{}", source))] + #[snafu(display("Fail to projection, err:{source}"))] Projection { source: GenericError }, - #[snafu(display("Sst meta data is empty.\nBacktrace:\n{}", backtrace))] + #[snafu(display("Sst meta data is empty.\nBacktrace:\n{backtrace}"))] EmptySstMeta { backtrace: Backtrace }, - #[snafu(display("Invalid schema, err:{}", source))] + #[snafu(display("Invalid schema, err:{source}"))] InvalidSchema { source: common_types::schema::Error }, - #[snafu(display("Meet a datafusion error, err:{}\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Meet a datafusion error, err:{source}\nBacktrace:\n{backtrace}"))] DataFusionError { source: datafusion::error::DataFusionError, backtrace: Backtrace, }, - #[snafu(display("Meet a object store error, err:{}\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Meet a object store error, err:{source}\nBacktrace:\n{backtrace}"))] ObjectStoreError { source: object_store::ObjectStoreError, backtrace: Backtrace, }, - #[snafu(display("Meet a parquet error, err:{}\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Meet a parquet error, err:{source}\nBacktrace:\n{backtrace}"))] ParquetError { source: parquet::errors::ParquetError, backtrace: Backtrace, }, - #[snafu(display("Other kind of error:{}", source))] + #[snafu(display("Other kind of error:{source}"))] Other { source: GenericError }, - #[snafu(display("Other kind of error, msg:{}.\nBacktrace:\n{}", msg, backtrace))] + #[snafu(display("Other kind of error, msg:{msg}.\nBacktrace:\n{backtrace}"))] OtherNoCause { msg: String, backtrace: Backtrace }, } diff --git a/analytic_engine/src/table/data.rs b/analytic_engine/src/table/data.rs index 49d9d6cae8..6af8c5fad4 100644 --- a/analytic_engine/src/table/data.rs +++ b/analytic_engine/src/table/data.rs @@ -358,6 +358,12 @@ impl TableData { 
self.current_version.total_memory_usage() } + /// Returns mutable memtable memory usage in bytes. + #[inline] + pub fn mutable_memory_usage(&self) -> usize { + self.current_version.mutable_memory_usage() + } + /// Find memtable for given timestamp to insert, create if not exists /// /// If the memtable schema is outdated, switch all memtables and create the @@ -443,12 +449,11 @@ impl TableData { let mutable_usage = self.current_version.mutable_memory_usage(); let total_usage = self.current_version.total_memory_usage(); - let in_flush = serial_exec.flush_scheduler().is_in_flush(); // Inspired by https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L94 if mutable_usage > mutable_limit && !in_flush { info!( - "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + "TableData should flush by mutable limit, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size ); return true; @@ -467,7 +472,7 @@ impl TableData { if should_flush { info!( - "TableData should flush, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", + "TableData should flush by total usage, table:{}, table_id:{}, mutable_usage:{}, mutable_limit: {}, total_usage:{}, max_write_buffer_size:{}", self.name, self.id, mutable_usage, mutable_limit, total_usage, max_write_buffer_size ); } @@ -592,6 +597,14 @@ impl TableDataSet { .cloned() } + pub fn find_maximum_mutable_memory_usage_table(&self) -> Option { + // TODO: Possible performance issue here when there are too many tables. + self.table_datas + .values() + .max_by_key(|t| t.mutable_memory_usage()) + .cloned() + } + /// List all tables to `tables` pub fn list_all_tables(&self, tables: &mut Vec) { for table_data in self.table_datas.values().cloned() { @@ -766,7 +779,7 @@ pub mod tests { Some(ReadableDuration(table_options::DEFAULT_SEGMENT_DURATION)); table_data.set_table_options(table_opts); // Freeze sampling memtable. - current_version.freeze_sampling(); + current_version.freeze_sampling_memtable(); // A new mutable memtable should be created. let mutable = table_data.find_or_create_mutable(now_ts, &schema).unwrap(); diff --git a/analytic_engine/src/table/version.rs b/analytic_engine/src/table/version.rs index 329c09677a..9f64403294 100644 --- a/analytic_engine/src/table/version.rs +++ b/analytic_engine/src/table/version.rs @@ -130,6 +130,8 @@ impl fmt::Debug for MemTableState { f.debug_struct("MemTableState") .field("time_range", &self.time_range) .field("id", &self.id) + .field("mem", &self.mem.approximate_memory_usage()) + .field("metrics", &self.mem.metrics()) .field("last_sequence", &self.mem.last_sequence()) .finish() } @@ -278,16 +280,19 @@ impl MemTableView { mutable_usage + immutable_usage } - /// Switch all memtables or just sample the segment duration. + /// Instead of replace the old memtable by a new memtable, we just move the + /// old memtable to immutable memtables and left mutable memtables + /// empty. New mutable memtable will be constructed via put request. + fn switch_memtables(&mut self) -> Option { + self.mutables.move_to_inmem(&mut self.immutables) + } + + /// Sample the segment duration. 
/// /// If the sampling memtable is still active, return the suggested segment /// duration or move all mutable memtables into immutable memtables if /// the sampling memtable is freezed and returns None. - /// - /// Instead of replace the old memtable by a new memtable, we just move the - /// old memtable to immutable memtables and left mutable memtables - /// empty. New mutable memtable will be constructed via put request. - fn switch_memtables_or_suggest_duration(&mut self) -> Option { + fn suggest_duration(&mut self) -> Option { if let Some(v) = &mut self.sampling_mem { if !v.freezed { // Other memtable should be empty during sampling phase. @@ -304,15 +309,15 @@ impl MemTableView { } } - self.mutables.move_to_inmem(&mut self.immutables); - None } - fn freeze_sampling_memtable(&mut self) { + fn freeze_sampling_memtable(&mut self) -> Option { if let Some(v) = &mut self.sampling_mem { v.freezed = true; + return Some(v.mem.last_sequence()); } + None } /// Returns memtables need to be flushed. Only sampling memtable and @@ -414,13 +419,20 @@ impl MutableMemTableSet { } /// Move all mutable memtables to immutable memtables. - fn move_to_inmem(&mut self, immem: &mut ImmutableMemTableSet) { - for m in self.0.values() { - let state = m.clone(); + fn move_to_inmem(&mut self, immem: &mut ImmutableMemTableSet) -> Option { + let last_seq = self + .0 + .values() + .map(|m| { + let last_sequence = m.mem.last_sequence(); + immem.0.insert(m.id, m.clone()); + + last_sequence + }) + .max(); - immem.0.insert(m.id, state); - } self.0.clear(); + last_seq } fn memtables_for_read(&self, time_range: TimeRange, mems: &mut MemTableVec) { @@ -568,29 +580,29 @@ impl TableVersion { .total_memory_usage() } - /// Switch all mutable memtables or just return the suggested segment - /// duration if sampling memtable is still active. - /// - /// Returns a duration if a sampled segment duration needs to be persisted. + /// Return the suggested segment duration if sampling memtable is still + /// active. + pub fn suggest_duration(&self) -> Option { + self.inner.write().unwrap().memtable_view.suggest_duration() + } + + /// Switch all mutable memtables /// - /// REQUIRE: Do in write worker - pub fn switch_memtables_or_suggest_duration(&self) -> Option { - self.inner - .write() - .unwrap() - .memtable_view - .switch_memtables_or_suggest_duration() + /// Returns the maxium `SequenceNumber` in the mutable memtables needs to be + /// freezed. + pub fn switch_memtables(&self) -> Option { + self.inner.write().unwrap().memtable_view.switch_memtables() } /// Stop timestamp sampling and freezed the sampling memtable. /// /// REQUIRE: Do in write worker - pub fn freeze_sampling(&self) { + pub fn freeze_sampling_memtable(&self) -> Option { self.inner .write() .unwrap() .memtable_view - .freeze_sampling_memtable(); + .freeze_sampling_memtable() } /// See [MemTableView::pick_memtables_to_flush] @@ -727,9 +739,9 @@ impl TableVersion { picker_ctx: PickerContext, picker: &CompactionPickerRef, ) -> picker::Result { - let inner = self.inner.read().unwrap(); + let mut inner = self.inner.write().unwrap(); - picker.pick_compaction(picker_ctx, &inner.levels_controller) + picker.pick_compaction(picker_ctx, &mut inner.levels_controller) } pub fn has_expired_sst(&self, expire_time: Option) -> bool { @@ -870,7 +882,8 @@ mod tests { assert!(mutable.is_none()); // Nothing to switch. 
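`MutableMemTableSet::move_to_inmem` above now returns the largest `last_sequence` among the memtables it moves into the immutable set, which is what lets `switch_memtables` report how far the WAL can be considered frozen. A stand-alone sketch of that pattern with stub types (the real memtable types carry much more state than these placeholders):

```rust
use std::collections::BTreeMap;

type MemTableId = u64;
type SequenceNumber = u64;

#[derive(Clone)]
struct MemTable {
    last_sequence: SequenceNumber,
}

/// Move all mutable memtables into the immutable set and return the largest
/// sequence number among them, mirroring the `map(..).max()` shape used above.
fn move_to_inmem(
    mutables: &mut BTreeMap<MemTableId, MemTable>,
    immutables: &mut BTreeMap<MemTableId, MemTable>,
) -> Option<SequenceNumber> {
    let last_seq = mutables
        .iter()
        .map(|(id, m)| {
            immutables.insert(*id, m.clone());
            m.last_sequence
        })
        .max();

    mutables.clear();
    last_seq
}

fn main() {
    let mut mutables = BTreeMap::from([
        (1, MemTable { last_sequence: 7 }),
        (2, MemTable { last_sequence: 9 }),
    ]);
    let mut immutables = BTreeMap::new();

    assert_eq!(move_to_inmem(&mut mutables, &mut immutables), Some(9));
    assert!(mutables.is_empty());
    assert_eq!(immutables.len(), 2);
}
```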
- assert!(version.switch_memtables_or_suggest_duration().is_none()); + assert!(version.suggest_duration().is_none()); + assert!(version.switch_memtables().is_none()); } fn check_flushable_mem_with_sampling( @@ -936,8 +949,9 @@ mod tests { version.set_sampling(sampling_mem); - let duration = version.switch_memtables_or_suggest_duration().unwrap(); + let duration = version.suggest_duration().unwrap(); assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + assert!(version.switch_memtables().is_none()); // Flushable memtables only contains sampling memtable. let flushable_mems = version.pick_memtables_to_flush(last_sequence); @@ -954,9 +968,11 @@ mod tests { assert_eq!(memtable_id, actual_memtable.id); // Switch still return duration before freezed. - let duration = version.switch_memtables_or_suggest_duration().unwrap(); + let duration = version.suggest_duration().unwrap(); assert_eq!(table_options::DEFAULT_SEGMENT_DURATION, duration); + assert!(version.switch_memtables().is_none()); + version.switch_memtables(); // Flushable memtables only contains sampling memtable before sampling // memtable is freezed. let flushable_mems = version.pick_memtables_to_flush(last_sequence); @@ -977,11 +993,11 @@ mod tests { version.set_sampling(sampling_mem); assert_eq!( table_options::DEFAULT_SEGMENT_DURATION, - version.switch_memtables_or_suggest_duration().unwrap() + version.suggest_duration().unwrap() ); - + assert!(version.switch_memtables().is_none()); // Freeze the sampling memtable. - version.freeze_sampling(); + version.freeze_sampling_memtable(); // No memtable after switch and freezed. let now = Timestamp::now(); @@ -1029,7 +1045,8 @@ mod tests { assert_eq!(memtable_id2, read_view.memtables[0].id); // Switch mutable memtable. - assert!(version.switch_memtables_or_suggest_duration().is_none()); + assert!(version.suggest_duration().is_none()); + assert!(version.switch_memtables().is_some()); // No memtable after switch. let now = Timestamp::now(); assert!(version @@ -1055,7 +1072,7 @@ mod tests { // Prepare sampling memtable. version.set_sampling(sampling_mem); - version.freeze_sampling(); + version.freeze_sampling_memtable(); let now = Timestamp::now(); let time_range = @@ -1073,8 +1090,8 @@ mod tests { version.insert_mutable(mem_state); // Switch memtable. 
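Stepping back to the `should_flush_table` change in `analytic_engine/src/table/data.rs` earlier in this patch: the two log messages now name the trigger that fired. A simplified sketch of that decision is below; the mutable-limit branch follows the hunk, while the exact total-usage condition is not visible there, so the second branch is an assumption:

```rust
/// Illustrative flush trigger: flush when the mutable memtables alone exceed
/// their limit (unless a flush is already running), or when the total memtable
/// memory reaches the max write buffer size.
fn should_flush(
    mutable_usage: usize,
    total_usage: usize,
    mutable_limit: usize,
    max_write_buffer_size: usize,
    in_flush: bool,
) -> bool {
    if mutable_usage > mutable_limit && !in_flush {
        // "should flush by mutable limit"
        return true;
    }

    // "should flush by total usage" (assumed threshold check).
    max_write_buffer_size > 0 && total_usage >= max_write_buffer_size
}

fn main() {
    let mb = 1 << 20;
    assert!(should_flush(64 * mb, 80 * mb, 32 * mb, 256 * mb, false));
    assert!(!should_flush(64 * mb, 80 * mb, 32 * mb, 256 * mb, true));
    assert!(should_flush(16 * mb, 300 * mb, 32 * mb, 256 * mb, true));
}
```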
- assert!(version.switch_memtables_or_suggest_duration().is_none()); - + assert!(version.suggest_duration().is_none()); + assert!(version.switch_memtables().is_some()); let max_sequence = 120; let file_id = 13; let add_file = AddFileMocker::new(file_id) diff --git a/analytic_engine/src/tests/alter_test.rs b/analytic_engine/src/tests/alter_test.rs index 614cab7541..c6f4b08eec 100644 --- a/analytic_engine/src/tests/alter_test.rs +++ b/analytic_engine/src/tests/alter_test.rs @@ -20,24 +20,25 @@ use crate::{ tests::{ row_util, table::{self, FixedSchemaTable}, - util::{ - EngineBuildContext, MemoryEngineBuildContext, Null, RocksDBEngineBuildContext, - TestContext, TestEnv, - }, + util::{memory_ctxs, rocksdb_ctxs, EngineBuildContext, Null, TestContext, TestEnv}, }, }; #[test] fn test_alter_table_add_column_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_table_add_column(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_alter_table_add_column(ctx); + } } #[ignore = "Enable this test when manifest use another snapshot implementation"] #[test] fn test_alter_table_add_column_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_table_add_column(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_table_add_column(ctx); + } } fn test_alter_table_add_column(engine_context: T) { @@ -370,15 +371,19 @@ async fn check_read_row_group( #[test] fn test_alter_table_options_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_table_options(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_alter_table_options(ctx); + } } #[ignore = "Enable this test when manifest use another snapshot implementation"] #[test] fn test_alter_table_options_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_table_options(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_table_options(ctx); + } } fn test_alter_table_options(engine_context: T) { diff --git a/analytic_engine/src/tests/drop_test.rs b/analytic_engine/src/tests/drop_test.rs index 5dd0be033a..c915ae1482 100644 --- a/analytic_engine/src/tests/drop_test.rs +++ b/analytic_engine/src/tests/drop_test.rs @@ -10,7 +10,8 @@ use table_engine::table::AlterSchemaRequest; use crate::tests::{ table::FixedSchemaTable, util::{ - self, EngineBuildContext, MemoryEngineBuildContext, RocksDBEngineBuildContext, TestEnv, + self, memory_ctxs, rocksdb_ctxs, EngineBuildContext, MemoryEngineBuildContext, + RocksDBEngineBuildContext, TestEnv, }, }; @@ -209,14 +210,18 @@ fn test_drop_create_same_table_case(flush: bool, engine_c #[test] fn test_drop_create_same_table_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_drop_create_same_table(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_drop_create_same_table(ctx); + } } #[test] fn test_drop_create_same_table_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_drop_create_same_table(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_drop_create_same_table(ctx); + } } fn test_drop_create_same_table(engine_context: T) { @@ -227,14 +232,18 @@ fn test_drop_create_same_table(engine_context: T) { #[test] fn test_alter_schema_drop_create_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_schema_drop_create(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); 
+ for ctx in rocksdb_ctxs { + test_alter_schema_drop_create(ctx); + } } #[test] fn test_alter_schema_drop_create_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_schema_drop_create(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_schema_drop_create(ctx); + } } fn test_alter_schema_drop_create(engine_context: T) { @@ -284,14 +293,18 @@ fn test_alter_schema_drop_create(engine_context: T) { #[test] fn test_alter_options_drop_create_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_alter_options_drop_create(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_alter_options_drop_create(ctx); + } } #[test] fn test_alter_options_drop_create_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_alter_options_drop_create(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_alter_options_drop_create(ctx); + } } fn test_alter_options_drop_create(engine_context: T) { diff --git a/analytic_engine/src/tests/read_write_test.rs b/analytic_engine/src/tests/read_write_test.rs index 783f46aa42..30a4f3a980 100644 --- a/analytic_engine/src/tests/read_write_test.rs +++ b/analytic_engine/src/tests/read_write_test.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! Read write test. @@ -11,22 +11,23 @@ use table_engine::table::ReadOrder; use crate::{ setup::WalsOpener, table_options, - tests::util::{ - self, EngineBuildContext, MemoryEngineBuildContext, RocksDBEngineBuildContext, TestContext, - TestEnv, - }, + tests::util::{self, memory_ctxs, rocksdb_ctxs, EngineBuildContext, TestContext, TestEnv}, }; #[test] fn test_multi_table_read_write_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_multi_table_read_write(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_multi_table_read_write(ctx); + } } #[test] fn test_multi_table_read_write_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_multi_table_read_write(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_multi_table_read_write(ctx); + } } fn test_multi_table_read_write(engine_context: T) { @@ -171,14 +172,18 @@ fn test_multi_table_read_write(engine_context: T) { #[test] fn test_table_write_read_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_read(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_read(ctx); + } } #[test] fn test_table_write_read_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_read(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_read(ctx); + } } fn test_table_write_read(engine_context: T) { @@ -192,7 +197,7 @@ fn test_table_write_read(engine_context: T) { let fixed_schema_table = test_ctx.create_fixed_schema_table(test_table1).await; let start_ms = test_ctx.start_ms(); - let rows = [ + let rows: [(&str, Timestamp, &str, f64, f64, &str); 3] = [ ( "key1", Timestamp::new(start_ms), @@ -250,14 +255,18 @@ fn test_table_write_read(engine_context: T) { #[test] fn test_table_write_get_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_get(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_get(ctx); + 
} } #[test] fn test_table_write_get_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_get(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_get(ctx); + } } fn test_table_write_get(engine_context: T) { @@ -327,22 +336,28 @@ fn test_table_write_get(engine_context: T) { #[test] fn test_table_write_get_override_rocks() { - test_table_write_get_override::(); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_get_override(ctx); + } } #[test] fn test_table_write_get_override_mem_wal() { - test_table_write_get_override::(); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_get_override(ctx); + } } -fn test_table_write_get_override() { - test_table_write_get_override_case::(FlushPoint::NoFlush, T::default()); +fn test_table_write_get_override(engine_context: T) { + test_table_write_get_override_case::(FlushPoint::NoFlush, engine_context.clone()); - test_table_write_get_override_case::(FlushPoint::AfterFirstWrite, T::default()); + test_table_write_get_override_case::(FlushPoint::AfterFirstWrite, engine_context.clone()); - test_table_write_get_override_case::(FlushPoint::AfterOverwrite, T::default()); + test_table_write_get_override_case::(FlushPoint::AfterOverwrite, engine_context.clone()); - test_table_write_get_override_case::(FlushPoint::FirstAndOverwrite, T::default()); + test_table_write_get_override_case::(FlushPoint::FirstAndOverwrite, engine_context); } #[derive(Debug)] @@ -506,16 +521,20 @@ fn test_table_write_get_override_case( #[test] fn test_db_write_buffer_size_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_db_write_buffer_size("test_db_write_buffer_size_rocks", rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + // Use different table name to avoid metrics collision. + test_db_write_buffer_size("test_db_write_buffer_size_rocks", ctx); + } } #[test] fn test_db_write_buffer_size_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_db_write_buffer_size("test_db_write_buffer_size_mem_wal", memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + // Use different table name to avoid metrics collision. + test_db_write_buffer_size("test_db_write_buffer_size_mem_wal", ctx); + } } fn test_db_write_buffer_size(table_name: &str, engine_context: T) { @@ -527,16 +546,20 @@ fn test_db_write_buffer_size(table_name: &str, engine_con #[test] fn test_space_write_buffer_size_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_space_write_buffer_size("test_space_write_buffer_size_rocks", rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + // Use different table name to avoid metrics collision. + test_space_write_buffer_size("test_space_write_buffer_size_rocks", ctx); + } } #[test] fn test_space_write_buffer_size_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - // Use different table name to avoid metrics collision. - test_space_write_buffer_size("test_space_write_buffer_size_mem_wal", memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + // Use different table name to avoid metrics collision. 
+ test_space_write_buffer_size("test_space_write_buffer_size_mem_wal", ctx); + } } fn test_space_write_buffer_size(table_name: &str, engine_context: T) { @@ -623,9 +646,6 @@ fn test_write_buffer_size_overflow( rows.extend_from_slice(&rows1); rows.extend_from_slice(&rows2); - // TODO(boyan) a better way to wait table flushing finishes. - thread::sleep(time::Duration::from_millis(500)); - // Read with different opts. util::check_read( &test_ctx, @@ -636,9 +656,13 @@ fn test_write_buffer_size_overflow( ) .await; + // TODO(lee) a better way to wait table flushing finishes. + thread::sleep(time::Duration::from_millis(500)); + let stats = table.stats(); assert_eq!(old_stats.num_read + 5, stats.num_read); assert_eq!(old_stats.num_write + 2, stats.num_write); + // Flush when reaches (db/space) write_buffer size limitation. assert_eq!(old_stats.num_flush + 1, stats.num_flush); @@ -660,14 +684,18 @@ fn test_write_buffer_size_overflow( #[test] fn test_table_write_read_reverse_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_read_reverse(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_read_reverse(ctx); + } } #[test] fn test_table_write_read_reverse_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_read_reverse(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_read_reverse(ctx); + } } fn test_table_write_read_reverse(engine_context: T) { @@ -746,15 +774,19 @@ fn test_table_write_read_reverse(engine_context: T) { #[test] #[ignore = "https://github.com/CeresDB/ceresdb/issues/313"] fn test_table_write_read_reverse_after_flush_rocks() { - let rocksdb_ctx = RocksDBEngineBuildContext::default(); - test_table_write_read_reverse_after_flush(rocksdb_ctx); + let rocksdb_ctxs = rocksdb_ctxs(); + for ctx in rocksdb_ctxs { + test_table_write_read_reverse_after_flush(ctx); + } } #[test] #[ignore = "https://github.com/CeresDB/ceresdb/issues/313"] fn test_table_write_read_reverse_after_flush_mem_wal() { - let memory_ctx = MemoryEngineBuildContext::default(); - test_table_write_read_reverse_after_flush(memory_ctx); + let memory_ctxs = memory_ctxs(); + for ctx in memory_ctxs { + test_table_write_read_reverse_after_flush(ctx); + } } fn test_table_write_read_reverse_after_flush(engine_context: T) { diff --git a/analytic_engine/src/tests/util.rs b/analytic_engine/src/tests/util.rs index 0e2c897ecc..0cc8fb94e3 100644 --- a/analytic_engine/src/tests/util.rs +++ b/analytic_engine/src/tests/util.rs @@ -8,7 +8,7 @@ use common_types::{ datum::Datum, record_batch::RecordBatch, row::{Row, RowGroup}, - table::DEFAULT_SHARD_ID, + table::{ShardId, DEFAULT_SHARD_ID}, time::Timestamp, }; use common_util::{ @@ -20,8 +20,8 @@ use log::info; use object_store::config::{LocalOptions, ObjectStoreOptions, StorageOptions}; use table_engine::{ engine::{ - CreateTableRequest, DropTableRequest, EngineRuntimes, OpenTableRequest, - Result as EngineResult, TableEngineRef, + CreateTableRequest, DropTableRequest, EngineRuntimes, OpenShardRequest, OpenTableRequest, + Result as EngineResult, TableDef, TableEngineRef, }, table::{ AlterSchemaRequest, FlushRequest, GetRequest, ReadOrder, ReadRequest, Result, SchemaId, @@ -33,7 +33,7 @@ use tempfile::TempDir; use crate::{ setup::{EngineBuilder, MemWalsOpener, OpenedWals, RocksDBWalsOpener, WalsOpener}, tests::table::{self, FixedSchemaTable, RowTuple}, - Config, RocksDBConfig, WalStorageConfig, + Config, RecoverMode, RocksDBConfig, 
WalStorageConfig, }; const DAY_MS: i64 = 24 * 60 * 60 * 1000; @@ -113,6 +113,7 @@ pub struct TestContext { opened_wals: Option, schema_id: SchemaId, last_table_seq: u32, + open_method: OpenTablesMethod, name_to_tables: HashMap, } @@ -169,8 +170,69 @@ impl TestContext { self.open().await; - for (id, name) in table_infos { - self.open_table(id, name).await; + match self.open_method { + OpenTablesMethod::WithOpenTable => { + for (id, name) in table_infos { + self.open_table(id, name).await; + } + } + OpenTablesMethod::WithOpenShard => { + self.open_tables_of_shard(table_infos, DEFAULT_SHARD_ID) + .await; + } + } + } + + pub async fn reopen_with_tables_of_shard(&mut self, tables: &[&str], shard_id: ShardId) { + let table_infos: Vec<_> = tables + .iter() + .map(|name| { + let table_id = self.name_to_tables.get(*name).unwrap().id(); + (table_id, *name) + }) + .collect(); + { + // Close all tables. + self.name_to_tables.clear(); + + // Close engine. + let engine = self.engine.take().unwrap(); + engine.close().await.unwrap(); + } + + self.open().await; + + self.open_tables_of_shard(table_infos, shard_id).await + } + + async fn open_tables_of_shard(&mut self, table_infos: Vec<(TableId, &str)>, shard_id: ShardId) { + let table_defs = table_infos + .into_iter() + .map(|table| TableDef { + catalog_name: "ceresdb".to_string(), + schema_name: "public".to_string(), + schema_id: self.schema_id, + id: table.0, + name: table.1.to_string(), + }) + .collect(); + + let open_shard_request = OpenShardRequest { + shard_id, + table_defs, + engine: table_engine::ANALYTIC_ENGINE_TYPE.to_string(), + }; + + let tables = self + .engine() + .open_shard(open_shard_request) + .await + .unwrap() + .into_values() + .map(|result| result.unwrap().unwrap()); + + for table in tables { + self.name_to_tables.insert(table.name().to_string(), table); } } @@ -368,6 +430,12 @@ impl TestContext { } } +#[derive(Clone, Copy)] +pub enum OpenTablesMethod { + WithOpenTable, + WithOpenShard, +} + impl TestContext { pub fn config_mut(&mut self) -> &mut Config { &mut self.config @@ -405,6 +473,7 @@ impl TestEnv { schema_id: SchemaId::from_u32(100), last_table_seq: 1, name_to_tables: HashMap::new(), + open_method: build_context.open_method(), } } @@ -474,10 +543,22 @@ pub trait EngineBuildContext: Clone + Default { fn wals_opener(&self) -> Self::WalsOpener; fn config(&self) -> Config; + fn open_method(&self) -> OpenTablesMethod; } pub struct RocksDBEngineBuildContext { config: Config, + open_method: OpenTablesMethod, +} + +impl RocksDBEngineBuildContext { + pub fn new(mode: RecoverMode, open_method: OpenTablesMethod) -> Self { + let mut context = Self::default(); + context.config.recover_mode = mode; + context.open_method = open_method; + + context + } } impl Default for RocksDBEngineBuildContext { @@ -504,7 +585,10 @@ impl Default for RocksDBEngineBuildContext { ..Default::default() }; - Self { config } + Self { + config, + open_method: OpenTablesMethod::WithOpenTable, + } } } @@ -531,7 +615,10 @@ impl Clone for RocksDBEngineBuildContext { ..Default::default() })); - Self { config } + Self { + config, + open_method: self.open_method, + } } } @@ -545,11 +632,26 @@ impl EngineBuildContext for RocksDBEngineBuildContext { fn config(&self) -> Config { self.config.clone() } + + fn open_method(&self) -> OpenTablesMethod { + self.open_method + } } #[derive(Clone)] pub struct MemoryEngineBuildContext { config: Config, + open_method: OpenTablesMethod, +} + +impl MemoryEngineBuildContext { + pub fn new(mode: RecoverMode, open_method: OpenTablesMethod) 
-> Self { + let mut context = Self::default(); + context.config.recover_mode = mode; + context.open_method = open_method; + + context + } } impl Default for MemoryEngineBuildContext { @@ -572,7 +674,10 @@ impl Default for MemoryEngineBuildContext { ..Default::default() }; - Self { config } + Self { + config, + open_method: OpenTablesMethod::WithOpenTable, + } } } @@ -586,4 +691,26 @@ impl EngineBuildContext for MemoryEngineBuildContext { fn config(&self) -> Config { self.config.clone() } + + fn open_method(&self) -> OpenTablesMethod { + self.open_method + } +} + +pub fn rocksdb_ctxs() -> Vec { + vec![ + RocksDBEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenTable), + RocksDBEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenTable), + RocksDBEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenShard), + RocksDBEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenShard), + ] +} + +pub fn memory_ctxs() -> Vec { + vec![ + MemoryEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenTable), + MemoryEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenTable), + MemoryEngineBuildContext::new(RecoverMode::TableBased, OpenTablesMethod::WithOpenShard), + MemoryEngineBuildContext::new(RecoverMode::ShardBased, OpenTablesMethod::WithOpenShard), + ] } diff --git a/common_types/src/column.rs b/common_types/src/column.rs index 4c09a84644..a2580a47c2 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -5,16 +5,17 @@ use std::sync::Arc; use arrow::{ array::{ - Array, ArrayBuilder, ArrayRef, BinaryArray, BinaryBuilder, BooleanArray, BooleanBuilder, - Date32Array as DateArray, Date32Builder as DateBuilder, Float32Array as FloatArray, - Float32Builder as FloatBuilder, Float64Array as DoubleArray, + Array, ArrayAccessor, ArrayBuilder, ArrayRef, BinaryArray, BinaryBuilder, BooleanArray, + BooleanBuilder, Date32Array as DateArray, Date32Builder as DateBuilder, DictionaryArray, + Float32Array as FloatArray, Float32Builder as FloatBuilder, Float64Array as DoubleArray, Float64Builder as DoubleBuilder, Int16Array, Int16Builder, Int32Array, Int32Builder, Int64Array, Int64Builder, Int8Array, Int8Builder, NullArray, StringArray, StringBuilder, - Time64NanosecondArray as TimeArray, Time64NanosecondBuilder as TimeBuilder, - TimestampMillisecondArray, TimestampMillisecondBuilder, UInt16Array, UInt16Builder, - UInt32Array, UInt32Builder, UInt64Array, UInt64Builder, UInt8Array, UInt8Builder, + StringDictionaryBuilder, Time64NanosecondArray as TimeArray, + Time64NanosecondBuilder as TimeBuilder, TimestampMillisecondArray, + TimestampMillisecondBuilder, UInt16Array, UInt16Builder, UInt32Array, UInt32Builder, + UInt64Array, UInt64Builder, UInt8Array, UInt8Builder, }, - datatypes::{DataType, TimeUnit}, + datatypes::{DataType, Int32Type, TimeUnit}, error::ArrowError, }; use datafusion::physical_plan::{ @@ -142,6 +143,9 @@ pub struct VarbinaryColumn(BinaryArray); #[derive(Debug)] pub struct StringColumn(StringArray); +#[derive(Debug)] +pub struct StringDictionaryColumn(DictionaryArray); + #[derive(Debug)] pub struct DateColumn(DateArray); @@ -287,6 +291,55 @@ impl_column!( ); impl_column!(StringColumn, get_string_datum, get_string_datum_view); +impl StringDictionaryColumn { + #[doc = " Get datum by index."] + pub fn datum_opt(&self, index: usize) -> Option { + if index >= self.0.len() { + return None; + } + Some(self.datum(index)) + } + + pub fn datum_view_opt(&self, 
index: usize) -> Option { + if index >= self.0.len() { + return None; + } + Some(self.datum_view(index)) + } + + pub fn datum_view(&self, index: usize) -> DatumView { + if self.0.is_null(index) { + return DatumView::Null; + } + // TODO : Is this the efficient way? + DatumView::String(self.0.downcast_dict::().unwrap().value(index)) + } + + pub fn datum(&self, index: usize) -> Datum { + if self.0.is_null(index) { + return Datum::Null; + } + // TODO : Is this the efficient way? + Datum::String( + self.0 + .downcast_dict::() + .unwrap() + .value(index) + .into(), + ) + } + + #[inline] + pub fn num_rows(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.num_rows() == 0 + } +} + macro_rules! impl_dedup { ($Column: ident) => { impl $Column { @@ -321,6 +374,30 @@ impl_dedup!(TimestampColumn); impl_dedup!(VarbinaryColumn); impl_dedup!(StringColumn); +impl StringDictionaryColumn { + #[doc = " If datum i is not equal to previous datum i - 1, mark `selected[i]` to"] + #[doc = " true."] + #[doc = ""] + #[doc = " The first datum is marked to true."] + #[doc = ""] + #[doc = " The size of selected must equal to the size of this column and"] + #[doc = " initialized to false."] + #[allow(clippy::float_cmp)] + pub fn dedup(&self, selected: &mut [bool]) { + if self.0.is_empty() { + return; + } + selected[0] = true; + for (i, v) in selected.iter_mut().enumerate().take(self.0.len()).skip(1) { + let current = self.0.key(i); + let prev = self.0.key(i - 1); + if current != prev { + *v = true; + } + } + } +} + macro_rules! impl_new_null { ($Column: ident, $Builder: ident) => { impl $Column { @@ -389,6 +466,36 @@ impl_from_array_and_slice!(TimestampColumn, TimestampMillisecondArray); impl_from_array_and_slice!(VarbinaryColumn, BinaryArray); impl_from_array_and_slice!(StringColumn, StringArray); +impl From> for StringDictionaryColumn { + fn from(array: DictionaryArray) -> Self { + Self(array) + } +} +impl From<&DictionaryArray> for StringDictionaryColumn { + fn from(array_ref: &DictionaryArray) -> Self { + let array_data = array_ref.into_data(); + let array = DictionaryArray::::from(array_data); + Self(array) + } +} +impl StringDictionaryColumn { + fn to_arrow_array(&self) -> DictionaryArray { + let array_data = self.0.clone().into_data(); + DictionaryArray::::from(array_data) + } + + #[doc = " Returns a zero-copy slice of this array with the indicated offset and"] + #[doc = " length."] + #[doc = ""] + #[doc = " Panics if offset with length is greater than column length."] + fn slice(&self, offset: usize, length: usize) -> Self { + let array_slice = self.0.slice(offset, length); + let array_data = array_slice.into_data(); + let array = DictionaryArray::::from(array_data); + Self(array) + } +} + macro_rules! impl_iter { ($Column: ident, $Value: ident) => { impl $Column { @@ -438,6 +545,19 @@ impl StringColumn { } } +impl StringDictionaryColumn { + /// Create a column that all values are null. + fn new_null(num_rows: usize) -> Self { + let mut builder = StringDictionaryBuilder::::new(); + for _ in 0..num_rows { + builder.append_null(); + } + let array = builder.finish(); + + Self(array) + } +} + macro_rules! impl_numeric_column { ($(($Kind: ident, $type: ty)), *) => { $( @@ -543,18 +663,21 @@ macro_rules! 
impl_column_block { impl ColumnBlock { pub fn datum_kind(&self) -> DatumKind { match self { + ColumnBlock::StringDictionary(_) => DatumKind::String, $(ColumnBlock::$Kind(_) => DatumKind::$Kind,)* } } pub fn datum_opt(&self, index: usize) -> Option { match self { + ColumnBlock::StringDictionary(col) => col.datum_opt(index), $(ColumnBlock::$Kind(col) => col.datum_opt(index),)* } } pub fn datum_view_opt(&self, index: usize) -> Option { match self { + ColumnBlock::StringDictionary(col) => col.datum_view_opt(index), $(ColumnBlock::$Kind(col) => col.datum_view_opt(index),)* } } @@ -562,6 +685,7 @@ macro_rules! impl_column_block { /// Panic if index is out fo bound. pub fn datum_view(&self, index: usize) -> DatumView { match self { + ColumnBlock::StringDictionary(col) => col.datum_view(index), $(ColumnBlock::$Kind(col) => col.datum_view(index),)* } } @@ -569,18 +693,21 @@ macro_rules! impl_column_block { /// Panic if index is out fo bound. pub fn datum(&self, index: usize) -> Datum { match self { + ColumnBlock::StringDictionary(col) => col.datum(index), $(ColumnBlock::$Kind(col) => col.datum(index),)* } } pub fn num_rows(&self) -> usize { match self { + ColumnBlock::StringDictionary(col) => col.num_rows(), $(ColumnBlock::$Kind(col) => col.num_rows(),)* } } pub fn to_arrow_array_ref(&self) -> ArrayRef { match self { + ColumnBlock::StringDictionary(col) => Arc::new(col.to_arrow_array()), $(ColumnBlock::$Kind(col) => Arc::new(col.to_arrow_array()),)* } } @@ -590,6 +717,7 @@ macro_rules! impl_column_block { /// The first datum is not marked to true. pub fn dedup(&self, selected: &mut [bool]) { match self { + ColumnBlock::StringDictionary(col) => col.dedup(selected), $(ColumnBlock::$Kind(col) => col.dedup(selected),)* } } @@ -600,6 +728,7 @@ macro_rules! impl_column_block { #[must_use] pub fn slice(&self, offset: usize, length: usize) -> Self { match self { + ColumnBlock::StringDictionary(col) => ColumnBlock::StringDictionary(col.slice(offset, length)), $(ColumnBlock::$Kind(col) => ColumnBlock::$Kind(col.slice(offset, length)),)* } } @@ -612,6 +741,12 @@ macro_rules! impl_column_block { } } })* + + impl From for ColumnBlock { + fn from(column: StringDictionaryColumn) -> Self { + Self::StringDictionary(column) + } + } }; } @@ -628,6 +763,8 @@ macro_rules! define_column_block { #[derive(Debug)] pub enum ColumnBlock { Null(NullColumn), + StringDictionary(StringDictionaryColumn), + String(StringColumn), $( $Kind([<$Kind Column>]), )* @@ -635,8 +772,36 @@ macro_rules! define_column_block { impl ColumnBlock { pub fn try_from_arrow_array_ref(datum_kind: &DatumKind, array: &ArrayRef) -> Result { + let is_dictionary : bool = if let DataType::Dictionary(..) = array.data_type() { + true + } else { + false + }; let column = match datum_kind { DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(array.len())), + DatumKind::String => { + if !is_dictionary { + let mills_array; + let cast_column = match array.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + mills_array = cast_nanosecond_to_mills(array)?; + cast_array(datum_kind, &mills_array)? + } + _ => cast_array(datum_kind, array)?, + }; + ColumnBlock::String(StringColumn::from(cast_column)) + } else { + let mills_array; + let cast_column = match array.data_type() { + DataType::Timestamp(TimeUnit::Nanosecond, None) => { + mills_array = cast_nanosecond_to_mills(array)?; + cast_array(datum_kind, &mills_array)? 
+ } + _ => cast_array(datum_kind, array)?, + }; + ColumnBlock::StringDictionary(StringDictionaryColumn::from(cast_column)) + } + }, $( DatumKind::$Kind => { let mills_array; @@ -657,9 +822,16 @@ macro_rules! define_column_block { Ok(column) } - pub fn new_null_with_type(kind: &DatumKind, rows: usize) -> Result { + pub fn new_null_with_type(kind: &DatumKind, rows: usize, is_dictionary: bool) -> Result { let block = match kind { DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(rows)), + DatumKind::String => { + if is_dictionary { + ColumnBlock::StringDictionary(StringDictionaryColumn::new_null(rows)) + }else { + ColumnBlock::String(StringColumn::new_null(rows)) + } + }, $( DatumKind::$Kind => ColumnBlock::$Kind([<$Kind Column>]::new_null(rows)), )* @@ -674,8 +846,8 @@ macro_rules! define_column_block { // Define column blocks, Null is defined explicitly in macro. define_column_block!( - Timestamp, Double, Float, Varbinary, String, UInt64, UInt32, UInt16, UInt8, Int64, Int32, - Int16, Int8, Boolean, Date, Time + Timestamp, Double, Float, Varbinary, UInt64, UInt32, UInt16, UInt8, Int64, Int32, Int16, Int8, + Boolean, Date, Time ); impl ColumnBlock { @@ -796,7 +968,6 @@ macro_rules! append_block { macro_rules! define_column_block_builder { ($(($Kind: ident, $Builder: ident)), *) => { paste! { - #[derive(Debug)] pub enum ColumnBlockBuilder { Null { rows: usize }, Timestamp(TimestampMillisecondBuilder), @@ -804,6 +975,7 @@ macro_rules! define_column_block_builder { String(StringBuilder), Date(DateBuilder), Time(TimeBuilder), + Dictionary(StringDictionaryBuilder::), $( $Kind($Builder), )* @@ -811,13 +983,19 @@ macro_rules! define_column_block_builder { impl ColumnBlockBuilder { /// Create by data type with initial capacity - pub fn with_capacity(data_type: &DatumKind, item_capacity: usize) -> Self { + pub fn with_capacity(data_type: &DatumKind, item_capacity: usize, is_dictionary : bool) -> Self { match data_type { DatumKind::Null => Self::Null { rows: 0 }, DatumKind::Timestamp => Self::Timestamp(TimestampMillisecondBuilder::with_capacity(item_capacity)), // The data_capacity is set as 1024, because the item is variable-size type. DatumKind::Varbinary => Self::Varbinary(BinaryBuilder::with_capacity(item_capacity, 1024)), - DatumKind::String => Self::String(StringBuilder::with_capacity(item_capacity, 1024)), + DatumKind::String =>{ + if !is_dictionary{ + Self::String(StringBuilder::with_capacity(item_capacity, 1024)) + }else { + Self::Dictionary(StringDictionaryBuilder::::new()) + } + } DatumKind::Date => Self::Date(DateBuilder::with_capacity(item_capacity)), DatumKind::Time => Self::Time(TimeBuilder::with_capacity(item_capacity)), $( @@ -847,6 +1025,17 @@ macro_rules! define_column_block_builder { Self::String(builder) => append_datum!(String, builder, Datum, datum), Self::Date(builder) => append_datum!(Date, builder, Datum, datum), Self::Time(builder) => append_datum!(Time, builder, Datum, datum), + Self::Dictionary(builder) => { + match datum { + Datum::Null => Ok(builder.append_null()), + Datum::String(v) => Ok(builder.append_value(v)), + _ => ConflictType { + expect: DatumKind::String, + given: datum.kind(), + } + .fail() + } + }, $( Self::$Kind(builder) => append_datum!($Kind, builder, Datum, datum), )* @@ -874,6 +1063,17 @@ macro_rules! 
define_column_block_builder { Self::String(builder) => append_datum!(String, builder, DatumView, datum), Self::Date(builder) => append_datum!(Date, builder, DatumView, datum), Self::Time(builder) => append_datum!(Time, builder, DatumView, datum), + Self::Dictionary(builder) => { + match datum { + DatumView::Null => Ok(builder.append_null()), + DatumView::String(v) => Ok(builder.append_value(v)), + _ => ConflictType { + expect: DatumKind::String, + given: datum.kind(), + } + .fail() + } + }, $( Self::$Kind(builder) => append_datum!($Kind, builder, DatumView, datum), )* @@ -898,6 +1098,34 @@ macro_rules! define_column_block_builder { Self::String(builder) => append_block!(String, builder, ColumnBlock, block, start, len), Self::Date(builder) => append_block!(Date, builder, ColumnBlock, block, start, len), Self::Time(builder) => append_block!(Time, builder, ColumnBlock, block, start, len), + Self::Dictionary(builder) => { + match block { + ColumnBlock::Null(v) => { + let end = std::cmp::min(start + len, v.num_rows()); + for _ in start..end { + builder.append_null(); + } + Ok(()) + } + ColumnBlock::StringDictionary(v) => { + let end = std::cmp::min(start + len, v.num_rows()); + for i in start..end { + if v.0.is_null(i) { + builder.append_null(); + } else { + let value = v.datum(i); + builder.append_value(value.as_str().unwrap()); + } + } + Ok(()) + } + _ => ConflictType { + expect: DatumKind::String, + given: block.datum_kind(), + } + .fail(), + } + }, $( Self::$Kind(builder) => append_block!($Kind, builder, ColumnBlock, block, start, len), )* @@ -912,6 +1140,7 @@ macro_rules! define_column_block_builder { Self::String(builder) => builder.len(), Self::Date(builder) => builder.len(), Self::Time(builder) => builder.len(), + Self::Dictionary(builder) => builder.len(), $( Self::$Kind(builder) => builder.len(), )* @@ -931,6 +1160,9 @@ macro_rules! 
define_column_block_builder { Self::String(builder) => StringColumn::from(builder.finish()).into(), Self::Date(builder) => DateColumn::from(builder.finish()).into(), Self::Time(builder) => TimeColumn::from(builder.finish()).into(), + Self::Dictionary(builder) => { + StringDictionaryColumn::from(builder.finish()).into() + }, $( Self::$Kind(builder) => [<$Kind Column>]::from(builder.finish()).into(), )* @@ -959,8 +1191,8 @@ define_column_block_builder!( impl ColumnBlockBuilder { /// Create by data type - pub fn new(data_type: &DatumKind) -> Self { - Self::with_capacity(data_type, 0) + pub fn new(data_type: &DatumKind, is_dictionry: bool) -> Self { + Self::with_capacity(data_type, 0, is_dictionry) } pub fn is_empty(&self) -> bool { @@ -976,7 +1208,9 @@ impl ColumnBlockBuilder { #[cfg(test)] mod tests { use super::*; - use crate::tests::{build_rows, build_schema}; + use crate::tests::{ + build_row_for_dictionary, build_rows, build_schema, build_schema_for_dictionary, + }; #[test] fn test_column_block_builder() { @@ -984,7 +1218,7 @@ mod tests { let rows = build_rows(); // DatumKind::Varbinary let column = schema.column(0); - let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2, false); // append builder.append(rows[0][0].clone()).unwrap(); @@ -998,7 +1232,7 @@ mod tests { let column_block = builder.build(); assert_eq!(column_block.num_rows(), 2); - let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 2, false); // append_block_range builder.append_block_range(&column_block, 0, 1).unwrap(); @@ -1015,4 +1249,65 @@ mod tests { Datum::Varbinary(Bytes::copy_from_slice(b"binary key1")) ); } + + #[test] + fn test_column_block_string_dictionary_builder() { + let schema = build_schema_for_dictionary(); + let rows = vec![ + build_row_for_dictionary(1, 1, Some("tag1_1"), "tag2_1", 1), + build_row_for_dictionary(2, 2, Some("tag1_2"), "tag2_2", 2), + build_row_for_dictionary(3, 3, Some("tag1_3"), "tag2_3", 3), + build_row_for_dictionary(4, 4, Some("tag1_1"), "tag2_4", 3), + build_row_for_dictionary(5, 5, Some("tag1_3"), "tag2_4", 4), + build_row_for_dictionary(6, 6, None, "tag2_4", 4), + ]; + // DatumKind::String , is_dictionary = true + let column = schema.column(2); + println!("{column:?}"); + let mut builder = + ColumnBlockBuilder::with_capacity(&column.data_type, 0, column.is_dictionary); + // append + (0..rows.len()).for_each(|i| builder.append(rows[i][2].clone()).unwrap()); + + let ret = builder.append(rows[0][0].clone()); + assert!(ret.is_err()); + + // append_view + builder.append_view(rows[5][2].as_view()).unwrap(); + let ret = builder.append_view(rows[1][0].as_view()); + + assert!(ret.is_err()); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 7); + let mut builder = + ColumnBlockBuilder::with_capacity(&column.data_type, 2, column.is_dictionary); + + // append_block_range + (0..rows.len()).for_each(|i| builder.append_block_range(&column_block, i, 1).unwrap()); + + let column_block = builder.build(); + assert_eq!(column_block.num_rows(), 6); + assert_eq!( + column_block.datum(0), + Datum::String(StringBytes::from("tag1_1")) + ); + assert_eq!( + column_block.datum(1), + Datum::String(StringBytes::from("tag1_2")) + ); + assert_eq!( + column_block.datum(2), + Datum::String(StringBytes::from("tag1_3")) + ); + assert_eq!( + column_block.datum(3), + 
Datum::String(StringBytes::from("tag1_1")) + ); + assert_eq!( + column_block.datum(4), + Datum::String(StringBytes::from("tag1_3")) + ); + assert_eq!(column_block.datum(5), Datum::Null); + } } diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 2f1a48cbbd..6deaefa5c4 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -29,6 +29,12 @@ pub enum Error { backtrace: Backtrace, }, + #[snafu(display("Invalid dictionary type:{}.\nBacktrace:\n{}", data_type, backtrace))] + InvalidDictionaryType { + data_type: DataType, + backtrace: Backtrace, + }, + #[snafu(display( "Arrow field meta data is missing, field name:{}.\nBacktrace:\n{}", field_name, @@ -119,6 +125,7 @@ pub enum ReadOp { struct ArrowFieldMeta { id: u32, is_tag: bool, + is_dictionary: bool, comment: String, } @@ -126,6 +133,7 @@ struct ArrowFieldMeta { pub enum ArrowFieldMetaKey { Id, IsTag, + IsDictionary, Comment, } @@ -134,6 +142,7 @@ impl ArrowFieldMetaKey { match self { ArrowFieldMetaKey::Id => "field::id", ArrowFieldMetaKey::IsTag => "field::is_tag", + ArrowFieldMetaKey::IsDictionary => "field::is_dictionary", ArrowFieldMetaKey::Comment => "field::comment", } } @@ -159,6 +168,8 @@ pub struct ColumnSchema { /// Is tag, tag is just a hint for a column, there is no restriction that a /// tag column must be a part of primary key pub is_tag: bool, + // Whether to use dictionary types for parquet store + pub is_dictionary: bool, /// Comment of the column pub comment: String, /// Column name in response @@ -191,6 +202,11 @@ impl ColumnSchema { } } + /// Check whether a type is valid dictionary type. + pub fn is_valid_dictionary_type(typ: DatumKind) -> bool { + matches!(typ, DatumKind::String) + } + /// Convert `self` to [`arrow::datatypes::Field`] pub fn to_arrow_field(&self) -> Field { From::from(self) @@ -273,6 +289,7 @@ impl TryFrom for ColumnSchema { data_type: DatumKind::from(data_type), is_nullable: column_schema.is_nullable, is_tag: column_schema.is_tag, + is_dictionary: column_schema.is_dictionary, comment: column_schema.comment, escaped_name, default_value, @@ -287,6 +304,7 @@ impl TryFrom<&Arc> for ColumnSchema { let ArrowFieldMeta { id, is_tag, + is_dictionary, comment, } = decode_arrow_field_meta_data(field.metadata())?; Ok(Self { @@ -299,6 +317,7 @@ impl TryFrom<&Arc> for ColumnSchema { )?, is_nullable: field.is_nullable(), is_tag, + is_dictionary, comment, escaped_name: field.name().escape_debug().to_string(), default_value: None, @@ -309,11 +328,24 @@ impl TryFrom<&Arc> for ColumnSchema { impl From<&ColumnSchema> for Field { fn from(col_schema: &ColumnSchema) -> Self { let metadata = encode_arrow_field_meta_data(col_schema); - let mut field = Field::new( - &col_schema.name, - col_schema.data_type.into(), - col_schema.is_nullable, - ); + // If the column sholud use dictionary, create correspond dictionary type. 
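The new `StringDictionaryColumn` wraps an arrow `DictionaryArray<Int32Type>`: values are appended through `StringDictionaryBuilder` and read back through `downcast_dict::<StringArray>()`, which is exactly what the `datum`/`datum_view` methods above do. A small stand-alone round trip using the same arrow APIs this patch relies on (only the literal tag values are made up):

```rust
use arrow::{
    array::{Array, ArrayAccessor, DictionaryArray, StringArray, StringDictionaryBuilder},
    datatypes::Int32Type,
};

fn main() {
    // Build a dictionary-encoded string column: repeated tag values share one
    // dictionary entry and each row only stores an Int32 key.
    let mut builder = StringDictionaryBuilder::<Int32Type>::new();
    builder.append_value("tag1_1");
    builder.append_value("tag1_2");
    builder.append_value("tag1_1"); // reuses the key of the first value
    builder.append_null();

    let array: DictionaryArray<Int32Type> = builder.finish();
    assert_eq!(array.len(), 4);
    // Only two distinct strings end up in the dictionary values.
    assert_eq!(array.values().len(), 2);

    // Read back: view the keys through the typed values array, as
    // `StringDictionaryColumn::datum` does.
    let typed = array.downcast_dict::<StringArray>().unwrap();
    assert_eq!(typed.value(0), "tag1_1");
    assert_eq!(typed.value(2), "tag1_1");
    assert!(array.is_null(3));
}
```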
+ let mut field = if col_schema.is_dictionary { + Field::new_dict( + &col_schema.name, + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + col_schema.is_nullable, + col_schema.id.into(), + false, + // Todo how to use dict_is_ordered + ) + } else { + Field::new( + &col_schema.name, + col_schema.data_type.into(), + col_schema.is_nullable, + ) + }; + field.set_metadata(metadata); field @@ -343,6 +375,7 @@ fn decode_arrow_field_meta_data(meta: &HashMap) -> Result HashMap, } @@ -385,6 +423,7 @@ impl Builder { data_type, is_nullable: false, is_tag: false, + is_dictionary: false, comment: String::new(), default_value: None, } @@ -407,6 +446,12 @@ impl Builder { self } + /// Set this column is dictionary, default is false (not a dictionary). + pub fn is_dictionary(mut self, is_dictionary: bool) -> Self { + self.is_dictionary = is_dictionary; + self + } + pub fn comment(mut self, comment: String) -> Self { self.comment = comment; self @@ -427,6 +472,15 @@ impl Builder { ); } + if self.is_dictionary { + ensure!( + ColumnSchema::is_valid_dictionary_type(self.data_type), + InvalidDictionaryType { + data_type: self.data_type + } + ); + } + Ok(()) } @@ -439,6 +493,7 @@ impl Builder { data_type: self.data_type, is_nullable: self.is_nullable, is_tag: self.is_tag, + is_dictionary: self.is_dictionary, comment: self.comment, escaped_name, default_value: self.default_value, @@ -460,6 +515,7 @@ impl From for schema_pb::ColumnSchema { is_nullable: src.is_nullable, id: src.id, is_tag: src.is_tag, + is_dictionary: src.is_dictionary, comment: src.comment, default_value, } @@ -475,10 +531,11 @@ mod tests { /// Create a column schema for test, each field is filled with non-default /// value fn new_test_column_schema() -> ColumnSchema { - Builder::new("test_column_schema".to_string(), DatumKind::Boolean) + Builder::new("test_column_schema".to_string(), DatumKind::String) .id(18) .is_nullable(true) .is_tag(true) + .is_dictionary(true) .comment("Comment of this column".to_string()) .default_value(Some(Expr::Value(Value::Boolean(true)))) .build() @@ -491,9 +548,10 @@ mod tests { let rhs = ColumnSchema { id: 18, name: "test_column_schema".to_string(), - data_type: DatumKind::Boolean, + data_type: DatumKind::String, is_nullable: true, is_tag: true, + is_dictionary: true, comment: "Comment of this column".to_string(), escaped_name: "test_column_schema".escape_debug().to_string(), default_value: Some(Expr::Value(Value::Boolean(true))), @@ -508,6 +566,8 @@ mod tests { let pb_schema = schema_pb::ColumnSchema::from(column_schema.clone()); // Check pb specific fields assert!(pb_schema.is_tag); + assert!(pb_schema.is_dictionary); + assert!(pb_schema.is_nullable); let schema_from_pb = ColumnSchema::try_from(pb_schema).unwrap(); assert_eq!(&schema_from_pb, &column_schema); @@ -524,4 +584,16 @@ mod tests { ); } } + + #[test] + fn test_valid_dictionary_type() { + let valid_dictionary_types = vec![DatumKind::String]; + + for v in &DatumKind::VALUES { + assert_eq!( + ColumnSchema::is_valid_dictionary_type(*v), + valid_dictionary_types.contains(v) + ); + } + } } diff --git a/common_types/src/datum.rs b/common_types/src/datum.rs index 6f01baa8d8..d0618a1880 100644 --- a/common_types/src/datum.rs +++ b/common_types/src/datum.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! 
Datum holds different kind of data @@ -8,78 +8,78 @@ use arrow::temporal_conversions::{EPOCH_DAYS_FROM_CE, NANOSECONDS}; use ceresdbproto::schema::DataType as DataTypePb; use chrono::{Datelike, Local, NaiveDate, NaiveTime, TimeZone, Timelike}; use serde::ser::{Serialize, Serializer}; -use snafu::{Backtrace, ResultExt, Snafu}; +use snafu::{Backtrace, OptionExt, ResultExt, Snafu}; use sqlparser::ast::{DataType as SqlDataType, Value}; -use crate::{bytes::Bytes, hash::hash64, string::StringBytes, time::Timestamp}; +use crate::{bytes::Bytes, hash::hash64, hex, string::StringBytes, time::Timestamp}; const DATE_FORMAT: &str = "%Y-%m-%d"; const TIME_FORMAT: &str = "%H:%M:%S%.3f"; #[derive(Debug, Snafu)] pub enum Error { - #[snafu(display( - "Unsupported SQL data type, type:{}.\nBacktrace:\n{}", - sql_type, - backtrace - ))] + #[snafu(display("Unsupported SQL data type, type:{sql_type}.\nBacktrace:\n{backtrace}"))] UnsupportedDataType { sql_type: SqlDataType, backtrace: Backtrace, }, - #[snafu(display("Invalid double or float, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid double or float, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidDouble { source: std::num::ParseFloatError, backtrace: Backtrace, }, #[snafu(display( - "Invalid insert value, kind:{}, value:{:?}.\nBacktrace:\n{}", - kind, - value, - backtrace + "Invalid insert value, kind:{kind}, value:{value:?}.\nBacktrace:\n{backtrace}" ))] InvalidValueType { kind: DatumKind, value: Value, backtrace: Backtrace, }, - #[snafu(display("Invalid timestamp, err:{}.\nBacktrace:\n{}", source, backtrace))] + + #[snafu(display("Invalid timestamp, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidTimestamp { source: std::num::ParseIntError, backtrace: Backtrace, }, - #[snafu(display("Invalid date, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid date, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidDate { source: chrono::ParseError, backtrace: Backtrace, }, - #[snafu(display("Invalid time, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid time, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidTimeCause { source: chrono::ParseError, backtrace: Backtrace, }, - #[snafu(display("Invalid time, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid time, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidTimeHourFormat { source: std::num::ParseIntError, backtrace: Backtrace, }, - #[snafu(display("Invalid time, err:{}", msg))] - InvalidTimeNoCause { msg: String }, + #[snafu(display("Invalid time, err:{msg}.\nBacktrace:\n{backtrace}"))] + InvalidTimeNoCause { msg: String, backtrace: Backtrace }, - #[snafu(display("Invalid integer, err:{}.\nBacktrace:\n{}", source, backtrace))] + #[snafu(display("Invalid integer, err:{source}.\nBacktrace:\n{backtrace}"))] InvalidInt { source: std::num::ParseIntError, backtrace: Backtrace, }, - #[snafu(display("Invalid datum byte, byte:{}.\nBacktrace:\n{}", value, backtrace))] + #[snafu(display("Invalid datum byte, byte:{value}.\nBacktrace:\n{backtrace}"))] InvalidDatumByte { value: u8, backtrace: Backtrace }, + + #[snafu(display("Invalid hex value, hex_val:{hex_val}.\nBacktrace:\n{backtrace}"))] + InvalidHexValue { + hex_val: String, + backtrace: Backtrace, + }, } pub type Result = std::result::Result; @@ -171,6 +171,11 @@ impl DatumKind { ) } + /// Can column of this datum kind used as dictionary encode column + pub fn is_dictionary_kind(&self) -> bool { + matches!(self, DatumKind::String) + } + pub fn unsign_kind(&self) -> Option 
{ match self { Self::Int64 | Self::UInt64 => Some(Self::UInt64), @@ -225,8 +230,8 @@ impl DatumKind { DatumKind::UInt8 => 1, DatumKind::Int64 => 8, DatumKind::Int32 => 4, - DatumKind::Int16 => 8, - DatumKind::Int8 => 8, + DatumKind::Int16 => 2, + DatumKind::Int8 => 1, DatumKind::Boolean => 1, DatumKind::Date => 4, DatumKind::Time => 8, @@ -749,6 +754,10 @@ impl Datum { (DatumKind::Varbinary, Value::DoubleQuotedString(s)) => { Ok(Datum::Varbinary(Bytes::from(s))) } + (DatumKind::Varbinary, Value::HexStringLiteral(s)) => { + let bytes = hex::try_decode(&s).context(InvalidHexValue { hex_val: s })?; + Ok(Datum::Varbinary(Bytes::from(bytes))) + } (DatumKind::String, Value::DoubleQuotedString(s)) => { Ok(Datum::String(StringBytes::from(s))) } @@ -847,6 +856,28 @@ impl Datum { Ok(Datum::Date(days)) } + pub fn size(&self) -> usize { + match self { + Datum::Null => 1, + Datum::Timestamp(_) => 8, + Datum::Double(_) => 8, + Datum::Float(_) => 4, + Datum::Varbinary(v) => v.len(), + Datum::String(v) => v.len(), + Datum::UInt64(_) => 8, + Datum::UInt32(_) => 4, + Datum::UInt16(_) => 2, + Datum::UInt8(_) => 1, + Datum::Int64(_) => 8, + Datum::Int32(_) => 4, + Datum::Int16(_) => 2, + Datum::Int8(_) => 1, + Datum::Boolean(_) => 1, + Datum::Date(_) => 4, + Datum::Time(_) => 8, + } + } + #[cfg(test)] pub fn as_view(&self) -> DatumView { match self { @@ -1112,6 +1143,7 @@ pub mod arrow_convert { DataType::Boolean => Some(Self::Boolean), DataType::Date32 => Some(Self::Date), DataType::Time64(TimeUnit::Nanosecond) => Some(Self::Time), + DataType::Dictionary(_, _) => Some(Self::String), DataType::Float16 | DataType::LargeUtf8 | DataType::LargeBinary @@ -1127,7 +1159,6 @@ pub mod arrow_convert { | DataType::Date64 | DataType::Interval(_) | DataType::Duration(_) - | DataType::Dictionary(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) | DataType::RunEndEncoded(_, _) @@ -1209,6 +1240,7 @@ pub mod arrow_convert { } ScalarValue::Date32(v) => v.map(Datum::Date), ScalarValue::Time64Nanosecond(v) => v.map(Datum::Time), + ScalarValue::Dictionary(_, literal) => Datum::from_scalar_value(literal), ScalarValue::List(_, _) | ScalarValue::Date64(_) | ScalarValue::Time32Second(_) @@ -1222,8 +1254,7 @@ pub mod arrow_convert { | ScalarValue::Struct(_, _) | ScalarValue::Decimal128(_, _, _) | ScalarValue::Null - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Dictionary(_, _) => None, + | ScalarValue::IntervalMonthDayNano(_) => None, } } } @@ -1255,6 +1286,7 @@ pub mod arrow_convert { ScalarValue::TimestampMillisecond(v, _) => { v.map(|v| DatumView::Timestamp(Timestamp::new(v))) } + ScalarValue::Dictionary(_, literal) => DatumView::from_scalar_value(literal), ScalarValue::List(_, _) | ScalarValue::Date64(_) | ScalarValue::Time32Second(_) @@ -1268,8 +1300,7 @@ pub mod arrow_convert { | ScalarValue::Struct(_, _) | ScalarValue::Decimal128(_, _, _) | ScalarValue::Null - | ScalarValue::IntervalMonthDayNano(_) - | ScalarValue::Dictionary(_, _) => None, + | ScalarValue::IntervalMonthDayNano(_) => None, } } } @@ -1478,4 +1509,49 @@ mod tests { assert!(Datum::parse_datum_time_from_str(source).is_err()); } } + + #[test] + fn test_convert_from_sql_value() { + let cases = vec![ + ( + Value::Boolean(false), + DatumKind::Boolean, + true, + Some(Datum::Boolean(false)), + ), + ( + Value::Number("100.1".to_string(), false), + DatumKind::Float, + true, + Some(Datum::Float(100.1)), + ), + ( + Value::SingleQuotedString("string_literal".to_string()), + DatumKind::String, + true, + 
Some(Datum::String(StringBytes::from_static("string_literal"))), + ), + ( + Value::HexStringLiteral("c70a0b".to_string()), + DatumKind::Varbinary, + true, + Some(Datum::Varbinary(Bytes::from(vec![199, 10, 11]))), + ), + ( + Value::EscapedStringLiteral("string_literal".to_string()), + DatumKind::String, + false, + None, + ), + ]; + + for (input, kind, succeed, expect) in cases { + let res = Datum::try_from_sql_value(&kind, input); + if succeed { + assert_eq!(res.unwrap(), expect.unwrap()); + } else { + assert!(res.is_err()); + } + } + } } diff --git a/common_types/src/hex.rs b/common_types/src/hex.rs new file mode 100644 index 0000000000..8ae595c562 --- /dev/null +++ b/common_types/src/hex.rs @@ -0,0 +1,64 @@ +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. + +// TODO: move this module to common_util package after remove the common_types +// from the dependencies of the common_util. + +/// Try to decode bytes from hex literal string. +/// +/// None will be returned if the input literal is hex-invalid. +pub fn try_decode(s: &str) -> Option> { + let hex_bytes = s.as_bytes(); + + let mut decoded_bytes = Vec::with_capacity((hex_bytes.len() + 1) / 2); + + let start_idx = hex_bytes.len() % 2; + if start_idx > 0 { + // The first byte is formed of only one char. + decoded_bytes.push(try_decode_hex_char(hex_bytes[0])?); + } + + for i in (start_idx..hex_bytes.len()).step_by(2) { + let high = try_decode_hex_char(hex_bytes[i])?; + let low = try_decode_hex_char(hex_bytes[i + 1])?; + decoded_bytes.push(high << 4 | low); + } + + Some(decoded_bytes) +} + +/// Try to decode a byte from a hex char. +/// +/// None will be returned if the input char is hex-invalid. +const fn try_decode_hex_char(c: u8) -> Option { + match c { + b'A'..=b'F' => Some(c - b'A' + 10), + b'a'..=b'f' => Some(c - b'a' + 10), + b'0'..=b'9' => Some(c - b'0'), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_decode_hex_literal() { + let cases = [ + ("", Some(vec![])), + ("FF00", Some(vec![255, 0])), + ("a00a", Some(vec![160, 10])), + ("FF0", Some(vec![15, 240])), + ("f", Some(vec![15])), + ("FF0X", None), + ("X0", None), + ("XX", None), + ("x", None), + ]; + + for (input, expect) in cases { + let output = try_decode(input); + assert_eq!(output, expect); + } + } +} diff --git a/common_types/src/lib.rs b/common_types/src/lib.rs index a796fb207c..0179b72613 100644 --- a/common_types/src/lib.rs +++ b/common_types/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! 
Contains common types @@ -9,6 +9,7 @@ pub mod column; pub mod column_schema; pub mod datum; pub mod hash; +pub mod hex; #[cfg(feature = "arrow")] pub mod projected_schema; #[cfg(feature = "arrow")] diff --git a/common_types/src/record_batch.rs b/common_types/src/record_batch.rs index a7a73c9381..fbfacd902b 100644 --- a/common_types/src/record_batch.rs +++ b/common_types/src/record_batch.rs @@ -318,7 +318,23 @@ fn cast_arrow_record_batch(source: ArrowRecordBatch) -> Result DataType::Timestamp(TimeUnit::Millisecond, None), field.is_nullable(), ), - _ => Field::new(field.name(), field.data_type().clone(), field.is_nullable()), + _ => { + let (dict_id, dict_is_ordered) = { + match field.data_type() { + DataType::Dictionary(_, _) => { + (field.dict_id().unwrap(), field.dict_is_ordered().unwrap()) + } + _ => (0, false), + } + }; + Field::new_dict( + field.name(), + field.data_type().clone(), + field.is_nullable(), + dict_id, + dict_is_ordered, + ) + } }; f.set_metadata(field.metadata().clone()); f @@ -477,7 +493,13 @@ impl RecordBatchWithKeyBuilder { let builders = schema_with_key .columns() .iter() - .map(|column_schema| ColumnBlockBuilder::with_capacity(&column_schema.data_type, 0)) + .map(|column_schema| { + ColumnBlockBuilder::with_capacity( + &column_schema.data_type, + 0, + column_schema.is_dictionary, + ) + }) .collect(); Self { schema_with_key, @@ -490,7 +512,11 @@ impl RecordBatchWithKeyBuilder { .columns() .iter() .map(|column_schema| { - ColumnBlockBuilder::with_capacity(&column_schema.data_type, capacity) + ColumnBlockBuilder::with_capacity( + &column_schema.data_type, + capacity, + column_schema.is_dictionary, + ) }) .collect(); Self { @@ -660,9 +686,12 @@ impl ArrowRecordBatchProjector { } None => { // Need to push row with specific type. - let null_block = - ColumnBlock::new_null_with_type(&column_schema.data_type, num_rows) - .context(CreateColumnBlock)?; + let null_block = ColumnBlock::new_null_with_type( + &column_schema.data_type, + num_rows, + column_schema.is_dictionary, + ) + .context(CreateColumnBlock)?; column_blocks.push(null_block); } } diff --git a/common_types/src/row/mod.rs b/common_types/src/row/mod.rs index 9fb93e56c3..b420275871 100644 --- a/common_types/src/row/mod.rs +++ b/common_types/src/row/mod.rs @@ -128,6 +128,10 @@ impl Row { self.cols[timestamp_index].as_timestamp() } + + pub fn size(&self) -> usize { + self.cols.iter().map(|col| col.size()).sum() + } } #[derive(Debug)] diff --git a/common_types/src/schema.rs b/common_types/src/schema.rs index c5db4d0636..d94f10e52a 100644 --- a/common_types/src/schema.rs +++ b/common_types/src/schema.rs @@ -690,7 +690,7 @@ impl Schema { self.column_schemas.num_columns() } - /// Returns true if idx is primary key idnex + /// Returns true if idx is primary key index pub fn is_primary_key_index(&self, idx: &usize) -> bool { self.primary_key_indexes.contains(idx) } diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index 0703d39d1e..fd81255da2 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -129,7 +129,8 @@ fn default_value_schema_builder() -> schema::Builder { } /// Build a schema for testing: -/// (key1(varbinary), key2(timestamp), field1(double), field2(string)) +/// (key1(varbinary), key2(timestamp), field1(double), field2(string), +/// field3(date), field4(time)) pub fn build_schema() -> Schema { base_schema_builder().build().unwrap() } @@ -144,6 +145,50 @@ pub fn build_schema() -> Schema { pub fn build_default_value_schema() -> Schema { 
default_value_schema_builder().build().unwrap() } +/// Build a schema for testing: +/// (tsid(uint64), key2(timestamp), tag1(string), tag2(string), value(int8), +/// field2(float)) +pub fn build_schema_for_dictionary() -> Schema { + let builder = schema::Builder::new() + .auto_increment_column_id(true) + .add_key_column( + column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) + .build() + .unwrap(), + ) + .unwrap() + .add_key_column( + column_schema::Builder::new("time".to_string(), DatumKind::Timestamp) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("tag1".to_string(), DatumKind::String) + .is_tag(true) + .is_dictionary(true) + .is_nullable(true) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("tag2".to_string(), DatumKind::String) + .is_tag(true) + .is_dictionary(true) + .build() + .unwrap(), + ) + .unwrap() + .add_normal_column( + column_schema::Builder::new("value".to_string(), DatumKind::Int8) + .build() + .unwrap(), + ) + .unwrap(); + + builder.build().unwrap() +} /// Build a schema for testing: /// (tsid(uint64), key2(timestamp), tag1(string), tag2(string), value(int8), @@ -193,6 +238,23 @@ pub fn build_schema_for_cpu() -> Schema { builder.build().unwrap() } +pub fn build_row_for_dictionary( + key1: u64, + key2: i64, + tag1: Option<&str>, + tag2: &str, + value: i8, +) -> Row { + let datums = vec![ + Datum::UInt64(key1), + Datum::Timestamp(Timestamp::new(key2)), + tag1.map(|v| Datum::String(StringBytes::from(v))) + .unwrap_or(Datum::Null), + Datum::String(StringBytes::from(tag2)), + Datum::Int8(value), + ]; + Row::from_datums(datums) +} pub fn build_projected_schema() -> ProjectedSchema { let schema = build_schema(); assert!(schema.num_columns() > 1); diff --git a/components/message_queue/Cargo.toml b/components/message_queue/Cargo.toml index e820d49bca..4766330828 100644 --- a/components/message_queue/Cargo.toml +++ b/components/message_queue/Cargo.toml @@ -16,8 +16,8 @@ snafu = { workspace = true } tokio = { workspace = true } [dependencies.rskafka] -git = "https://github.com/influxdata/rskafka.git" -rev = "00988a564b1db0249d858065fc110476c075efad" +git = "https://github.com/Rachelint/rskafka.git" +rev = "f0fd8e278d8164cb0cfca5a80476361fc308ecc3" default-features = false features = ["compression-gzip", "compression-lz4", "compression-snappy"] diff --git a/components/message_queue/src/kafka/config.rs b/components/message_queue/src/kafka/config.rs index 880b7f4f94..24629442ed 100644 --- a/components/message_queue/src/kafka/config.rs +++ b/components/message_queue/src/kafka/config.rs @@ -2,20 +2,39 @@ //! Kafka implementation's config +use common_util::config::ReadableDuration; use serde::{Deserialize, Serialize}; /// Generic client config that is used for consumers, producers as well as admin /// operations (like "create topic"). -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] #[serde(default)] pub struct Config { pub client: ClientConfig, pub topic_management: TopicManagementConfig, pub consumer: ConsumerConfig, + pub retry_interval_factor: f64, + pub init_retry_interval: ReadableDuration, + pub max_retry_interval: ReadableDuration, + pub max_retry: usize, // TODO: may need some config options for producer, // but it seems nothing needed now. 
} +impl Default for Config { + fn default() -> Self { + Self { + client: Default::default(), + topic_management: Default::default(), + consumer: Default::default(), + retry_interval_factor: 2.0, + init_retry_interval: ReadableDuration::secs(1), + max_retry_interval: ReadableDuration::secs(10), + max_retry: 10, + } + } +} + #[derive(Clone, Default, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(default)] pub struct ClientConfig { diff --git a/components/message_queue/src/kafka/kafka_impl.rs b/components/message_queue/src/kafka/kafka_impl.rs index 4937d2efee..0d69d5e382 100644 --- a/components/message_queue/src/kafka/kafka_impl.rs +++ b/components/message_queue/src/kafka/kafka_impl.rs @@ -21,6 +21,7 @@ use rskafka::{ Client, ClientBuilder, }, record::{Record, RecordAndOffset}, + BackoffConfig, }; use snafu::{Backtrace, ResultExt, Snafu}; use tokio::sync::RwLock; @@ -141,7 +142,14 @@ impl KafkaImplInner { panic!("The boost broker must be set"); } - let mut client_builder = ClientBuilder::new(config.client.boost_brokers.clone().unwrap()); + let backoff_config = BackoffConfig { + init_backoff: config.init_retry_interval.0, + max_backoff: config.max_retry_interval.0, + base: config.retry_interval_factor, + max_retry: config.max_retry, + }; + let mut client_builder = ClientBuilder::new(config.client.boost_brokers.clone().unwrap()) + .backoff_config(backoff_config); if let Some(max_message_size) = config.client.max_message_size { client_builder = client_builder.max_message_size(max_message_size); } diff --git a/components/parquet_ext/Cargo.toml b/components/parquet_ext/Cargo.toml index 1b4b4b23c6..ba31703d18 100644 --- a/components/parquet_ext/Cargo.toml +++ b/components/parquet_ext/Cargo.toml @@ -17,6 +17,8 @@ async-trait = { workspace = true } bytes = { workspace = true } common_util = { workspace = true } datafusion = { workspace = true } +futures = { workspace = true } log = { workspace = true } +object_store = { workspace = true } parquet = { workspace = true } tokio = { workspace = true } diff --git a/components/parquet_ext/src/lib.rs b/components/parquet_ext/src/lib.rs index cd413c0afc..7264b38dd6 100644 --- a/components/parquet_ext/src/lib.rs +++ b/components/parquet_ext/src/lib.rs @@ -2,6 +2,7 @@ pub mod meta_data; pub mod prune; +pub mod reader; pub mod reverse_reader; #[cfg(test)] pub mod tests; diff --git a/components/parquet_ext/src/meta_data.rs b/components/parquet_ext/src/meta_data.rs index e796244c16..56bf777ff8 100644 --- a/components/parquet_ext/src/meta_data.rs +++ b/components/parquet_ext/src/meta_data.rs @@ -1,15 +1,18 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. -use std::ops::Range; +use std::{ops::Range, sync::Arc}; use async_trait::async_trait; use bytes::Bytes; use common_util::error::GenericResult; use parquet::{ + arrow::{arrow_reader::ArrowReaderOptions, ParquetRecordBatchStreamBuilder}, errors::{ParquetError, Result}, file::{footer, metadata::ParquetMetaData}, }; + +use crate::reader::ObjectStoreReader; + #[async_trait] pub trait ChunkReader: Sync + Send { async fn get_bytes(&self, range: Range<usize>) -> GenericResult<Bytes>; @@ -65,3 +68,21 @@ pub async fn fetch_parquet_metadata( footer::decode_metadata(&metadata_bytes).map(|v| (v, metadata_len)) } + +/// Build page indexes for meta data +/// +/// TODO: Currently there is no method to build page indexes for meta data in +/// `parquet`, maybe we can file an issue in `arrow-rs`.
+pub async fn meta_with_page_indexes( + object_store_reader: ObjectStoreReader, +) -> Result<Arc<ParquetMetaData>> { + let read_options = ArrowReaderOptions::new().with_page_index(true); + let builder = + ParquetRecordBatchStreamBuilder::new_with_options(object_store_reader, read_options) + .await + .map_err(|e| { + let err_msg = format!("failed to build page indexes in metadata, err:{e}"); + ParquetError::General(err_msg) + })?; + Ok(builder.metadata().clone()) +} diff --git a/components/parquet_ext/src/reader.rs b/components/parquet_ext/src/reader.rs new file mode 100644 index 0000000000..3a5cd5f170 --- /dev/null +++ b/components/parquet_ext/src/reader.rs @@ -0,0 +1,81 @@ +// Copyright 2023 CeresDB Project Authors. Licensed under Apache-2.0. + +use std::{ops::Range, sync::Arc, time::Instant}; + +use bytes::Bytes; +use futures::{ + future::{BoxFuture, FutureExt}, + TryFutureExt, +}; +use log::debug; +use object_store::{ObjectStoreRef, Path}; +use parquet::{arrow::async_reader::AsyncFileReader, file::metadata::ParquetMetaData}; + +/// Implementation of AsyncFileReader based on `ObjectStore` +/// +/// TODO: Perhaps we should avoid importing `object_store` in `parquet_ext` to +/// keep the crate `parquet_ext` more pure. +#[derive(Clone)] +pub struct ObjectStoreReader { + storage: ObjectStoreRef, + path: Path, + meta_data: Arc<ParquetMetaData>, + begin: Instant, +} + +impl ObjectStoreReader { + pub fn new(storage: ObjectStoreRef, path: Path, meta_data: Arc<ParquetMetaData>) -> Self { + Self { + storage, + path, + meta_data, + begin: Instant::now(), + } + } +} + +impl Drop for ObjectStoreReader { + fn drop(&mut self) { + debug!( + "ObjectStoreReader dropped, path:{}, elapsed:{:?}", + &self.path, + self.begin.elapsed() + ); + } +} + +impl AsyncFileReader for ObjectStoreReader { + fn get_bytes(&mut self, range: Range<usize>) -> BoxFuture<'_, parquet::errors::Result<Bytes>> { + self.storage + .get_range(&self.path, range) + .map_err(|e| { + parquet::errors::ParquetError::General(format!( + "Failed to fetch range from object store, err:{e}" + )) + }) + .boxed() + } + + fn get_byte_ranges( + &mut self, + ranges: Vec<Range<usize>>, + ) -> BoxFuture<'_, parquet::errors::Result<Vec<Bytes>>> { + async move { + self.storage + .get_ranges(&self.path, &ranges) + .map_err(|e| { + parquet::errors::ParquetError::General(format!( + "Failed to fetch ranges from object store, err:{e}" + )) + }) + .await + } + .boxed() + } + + fn get_metadata( + &mut self, + ) -> BoxFuture<'_, parquet::errors::Result<Arc<ParquetMetaData>>> { + Box::pin(async move { Ok(self.meta_data.clone()) }) + } +} diff --git a/components/profile/Cargo.toml b/components/profile/Cargo.toml index abd15dd1ed..72eb060de3 100644 --- a/components/profile/Cargo.toml +++ b/components/profile/Cargo.toml @@ -18,3 +18,4 @@ features = ["stats", "profiling", "unprefixed_malloc_on_supported_platforms"] jemalloc-ctl = "0.3.2" jemallocator = "0.3.2" log = { workspace = true } +pprof = { version = "0.11.1", features = ["flamegraph"] } diff --git a/components/profile/src/lib.rs b/components/profile/src/lib.rs index 524e97b7f2..82f5319ca6 100644 --- a/components/profile/src/lib.rs +++ b/components/profile/src/lib.rs @@ -1,6 +1,6 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. -//! Memory profiler for running application based on jemalloc features. +//! Profiler for running application.
use std::{ fmt::Formatter, @@ -9,6 +9,7 @@ use std::{ io::Read, sync::{Mutex, MutexGuard}, thread, time, + time::Duration, }; use jemalloc_ctl::{Access, AsName}; @@ -36,8 +37,9 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; const PROF_ACTIVE: &[u8] = b"prof.active\0"; const PROF_DUMP: &[u8] = b"prof.dump\0"; -const PROFILE_OUTPUT_FILE_OS_PATH: &[u8] = b"/tmp/profile.out\0"; -const PROFILE_OUTPUT_FILE_PATH: &str = "/tmp/profile.out"; +const PROFILE_HEAP_OUTPUT_FILE_OS_PATH: &[u8] = b"/tmp/profile_heap.out\0"; +const PROFILE_HEAP_OUTPUT_FILE_PATH: &str = "/tmp/profile_heap.out"; +const PROFILE_CPU_OUTPUT_FILE_PATH: &str = "/tmp/flamegraph_cpu.svg"; fn set_prof_active(active: bool) -> Result<()> { let name = PROF_ACTIVE.name(); @@ -46,15 +48,15 @@ fn set_prof_active(active: bool) -> Result<()> { fn dump_profile() -> Result<()> { let name = PROF_DUMP.name(); - name.write(PROFILE_OUTPUT_FILE_OS_PATH) + name.write(PROFILE_HEAP_OUTPUT_FILE_OS_PATH) .map_err(Error::Jemalloc) } struct ProfLockGuard<'a>(MutexGuard<'a, ()>); /// ProfLockGuard hold the profile lock and take responsibilities for -/// (de)activating mem profiling. NOTE: Keeping mem profiling on may cause some -/// extra runtime cost so we choose to activating it dynamically. +/// (de)activating heap profiling. NOTE: Keeping heap profiling on may cause +/// some extra runtime cost so we choose to activate it dynamically. impl<'a> ProfLockGuard<'a> { pub fn new(guard: MutexGuard<'a, ()>) -> Result<Self> { set_prof_active(true)?; @@ -71,7 +73,7 @@ impl<'a> Drop for ProfLockGuard<'a> { } pub struct Profiler { - mem_prof_lock: Mutex<()>, + heap_prof_lock: Mutex<()>, } impl Default for Profiler { @@ -83,19 +85,22 @@ impl Default for Profiler { impl Profiler { pub fn new() -> Self { Self { - mem_prof_lock: Mutex::new(()), + heap_prof_lock: Mutex::new(()), } } - // dump_mem_prof collects mem profiling data in `seconds`. + // dump_heap_prof collects heap profiling data in `seconds`. // TODO(xikai): limit the profiling duration - pub fn dump_mem_prof(&self, seconds: u64) -> Result<Vec<u8>> { + pub fn dump_heap_prof(&self, seconds: u64) -> Result<Vec<u8>> { // concurrent profiling is disabled.
- let lock_guard = self.mem_prof_lock.try_lock().map_err(|e| Error::Internal { - msg: format!("failed to acquire mem_prof_lock, err:{e}"), - })?; + let lock_guard = self + .heap_prof_lock + .try_lock() + .map_err(|e| Error::Internal { + msg: format!("failed to acquire heap_prof_lock, err:{e}"), + })?; info!( - "Profiler::dump_mem_prof start memory profiling {} seconds", + "Profiler::dump_heap_prof start heap profiling {} seconds", seconds ); @@ -109,7 +114,7 @@ impl Profiler { .create(true) .write(true) .truncate(true) - .open(PROFILE_OUTPUT_FILE_PATH) + .open(PROFILE_HEAP_OUTPUT_FILE_PATH) .map_err(|e| { error!("Failed to open prof data file, err:{}", e); Error::IO(e) @@ -119,13 +124,13 @@ impl Profiler { dump_profile().map_err(|e| { error!( "Failed to dump prof to {}, err:{}", - PROFILE_OUTPUT_FILE_PATH, e + PROFILE_HEAP_OUTPUT_FILE_PATH, e ); e })?; // read the profile results into buffer - let mut f = File::open(PROFILE_OUTPUT_FILE_PATH).map_err(|e| { + let mut f = File::open(PROFILE_HEAP_OUTPUT_FILE_PATH).map_err(|e| { error!("Failed to open prof data file, err:{}", e); Error::IO(e) })?; @@ -138,4 +143,28 @@ impl Profiler { Ok(buffer) } + + pub fn dump_cpu_prof(&self, seconds: u64) -> Result<()> { + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(100) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build() + .map_err(|e| Error::Internal { + msg: format!("Profiler guard, err:{e}"), + })?; + + thread::sleep(Duration::from_secs(seconds)); + + let report = guard.report().build().map_err(|e| Error::Internal { + msg: format!("Report build, err:{e}"), + })?; + let file = File::create(PROFILE_CPU_OUTPUT_FILE_PATH).map_err(|e| { + error!("Failed to create cpu profile svg file, err:{}", e); + Error::IO(e) + })?; + report.flamegraph(file).map_err(|e| Error::Internal { + msg: format!("Flamegraph output, err:{e}"), + })?; + Ok(()) + } } diff --git a/df_operator/src/udfs/time_bucket.rs b/df_operator/src/udfs/time_bucket.rs index 1ea693d954..bb4c6b29bb 100644 --- a/df_operator/src/udfs/time_bucket.rs +++ b/df_operator/src/udfs/time_bucket.rs @@ -141,8 +141,9 @@ impl<'a> TimeBucket<'a> { } fn call(&self) -> Result { + // TODO mising is_dictionary params let mut out_column_builder = - ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows()); + ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows(), false); for ts_opt in self.column.iter() { match ts_opt { Some(ts) => { diff --git a/integration_tests/Makefile b/integration_tests/Makefile index 49ff4ba9c6..84bb3454e1 100644 --- a/integration_tests/Makefile +++ b/integration_tests/Makefile @@ -69,3 +69,6 @@ run-mysql: run-prom: cd prom && ./run-tests.sh + +run-recovery: clean build-ceresdb kill-old-process + cd recovery && ./run.sh && ./run.sh shard_based diff --git a/integration_tests/config/shard-based-recovery.toml b/integration_tests/config/shard-based-recovery.toml new file mode 100644 index 0000000000..3ad980df0b --- /dev/null +++ b/integration_tests/config/shard-based-recovery.toml @@ -0,0 +1,21 @@ +[server] +bind_addr = "0.0.0.0" +http_port = 5440 +grpc_port = 8831 + +[logger] +level = "info" + +[tracing] +dir = "/tmp/ceresdb" + +[analytic] +recover_mode = "ShardBased" + +[analytic.storage.object_store] +type = "Local" +data_dir = "/tmp/ceresdb" + +[analytic.wal] +type = "RocksDB" +data_dir = "/tmp/ceresdb" diff --git a/integration_tests/recovery/check.py b/integration_tests/recovery/check.py new file mode 100644 index 0000000000..73b7495b14 --- /dev/null +++ 
b/integration_tests/recovery/check.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# coding: utf-8 + +import requests +import argparse + +api_root = 'http://localhost:5440' +headers = { + 'Content-Type': 'application/json' +} + +def get_test_tables(ts): + table = 'sql_test' + str(ts) + table2 = 'SQL_TEST' + str(ts) + return [table, table2] + +def get_args(): + parser = argparse.ArgumentParser(description='cmd args') + parser.add_argument('--timestamp', '-ts', type=int, help='timestamp') + parser.add_argument('--init_before_check', '-i', help='init_before_check', action="store_true") + args = vars(parser.parse_args()) + return args + + +def execute_sql(sql): + r = requests.post('{}/sql'.format(api_root), json={'query': sql}, headers=headers) + assert r.status_code == 200, r.text + return r.json() + +def prepare_data(ts, tables): + for t in tables: + execute_sql(""" +CREATE TABLE if not exists `{}` ( + `t` timestamp NOT NULL, + `tag1` string TAG, + `tag2` string TAG, + `value` double NOT NULL, + `VALUE2` double NOT NULL, + timestamp KEY (t) +); + """.format(t)) + + execute_sql(""" +insert into {}(t, tag1, tag2, value, VALUE2) +values +({}, "v1", "v2", 1, 2), +({}, "v1", "v2", 11, 22) + ; + """.format(tables[0], ts-5000, ts)) + + execute_sql(""" +insert into {}(t, tag1, tag2, value, VALUE2) +values +({}, "v1", "v2", 10, 20), +({}, "v1", "v2", 110, 220) + ; + """.format(tables[1], ts-5000, ts)) + +def query_and_check(ts, tables): + expected = {'rows': [{'tsid': 7518337278486593135, 't': ts - 5000, 'tag1': 'v1', 'tag2': 'v2', 'value': 1.0, 'VALUE2': 2.0},\ + {'tsid': 7518337278486593135, 't': ts, 'tag1': 'v1', 'tag2': 'v2', 'value': 11.0, 'VALUE2': 22.0}]} + expected2 = {'rows': [{'tsid': 7518337278486593135, 't': ts - 5000, 'tag1': 'v1', 'tag2': 'v2', 'value': 10.0, 'VALUE2': 20.0},\ + {'tsid': 7518337278486593135, 't': ts, 'tag1': 'v1', 'tag2': 'v2', 'value': 110.0, 'VALUE2': 220.0}]} + expecteds = [expected, expected2] + + for idx, t in enumerate(tables): + r = execute_sql("select * from {}".format(t)) + assert r == expecteds[idx] + + print('Restart test pass...') + +def main(): + args = get_args() + init_before_check = args['init_before_check'] + ts = args['timestamp'] + test_tables = get_test_tables(args['timestamp']) + + if init_before_check: + print("Init before check") + prepare_data(ts, test_tables) + query_and_check(ts, test_tables) + +if __name__ == '__main__': + main() diff --git a/integration_tests/recovery/run.sh b/integration_tests/recovery/run.sh new file mode 100755 index 0000000000..83295244cd --- /dev/null +++ b/integration_tests/recovery/run.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -e + +ROOT=`pwd` +# For compatibility in macos, so convert to milliseconds by adding 3 zeros. +NOW=`date +%s000` +BINARY_PATH=${ROOT}/../../target/debug/ceresdb-server +SERVER_HTTP_ENDPOINT=127.0.0.1:5440 + +CONFIG_FILE=${ROOT}/../../docs/minimal.toml +if [ ${1} == 'shard_based' ]; then + CONFIG_FILE=${ROOT}/../config/shard-based-recovery.toml +fi + +echo "Run with config: ${CONFIG_FILE}" +echo "First check..." +nohup ${BINARY_PATH} --config ${CONFIG_FILE} & +sleep 10 +python3 ./check.py -ts ${NOW} -i + +echo "Restart and check..." +killall ceresdb-server | true +nohup ${BINARY_PATH} --config ${CONFIG_FILE} & +sleep 10 +python3 ./check.py -ts ${NOW} + +echo "Flush, restart and check..." +curl -XPOST ${SERVER_HTTP_ENDPOINT}/debug/flush_memtable +echo "\nFlush finish..." 
+killall ceresdb-server | true +nohup ${BINARY_PATH} --config ${CONFIG_FILE} & +sleep 10 +python3 ./check.py -ts ${NOW} +echo "All finish..." diff --git a/interpreters/src/describe.rs b/interpreters/src/describe.rs index c0944ed81b..cf415cc6e0 100644 --- a/interpreters/src/describe.rs +++ b/interpreters/src/describe.rs @@ -46,12 +46,14 @@ impl DescribeInterpreter { let mut is_primary_keys = Vec::with_capacity(num_columns); let mut is_nullables = Vec::with_capacity(num_columns); let mut is_tags = Vec::with_capacity(num_columns); + let mut is_dictionarys = Vec::with_capacity(num_columns); for (idx, col) in table_schema.columns().iter().enumerate() { names.push(col.name.to_string()); types.push(col.data_type.to_string()); is_primary_keys.push(table_schema.is_primary_key_index(&idx)); is_nullables.push(col.is_nullable); is_tags.push(col.is_tag); + is_dictionarys.push(col.is_dictionary); } let schema = Schema::new(vec![ @@ -60,6 +62,7 @@ impl DescribeInterpreter { Field::new("is_primary", DataType::Boolean, false), Field::new("is_nullable", DataType::Boolean, false), Field::new("is_tag", DataType::Boolean, false), + Field::new("is_dictionary", DataType::Boolean, false), ]); let arrow_record_batch = RecordBatch::try_new( @@ -70,6 +73,7 @@ impl DescribeInterpreter { Arc::new(BooleanArray::from(is_primary_keys)), Arc::new(BooleanArray::from(is_nullables)), Arc::new(BooleanArray::from(is_tags)), + Arc::new(BooleanArray::from(is_dictionarys)), ], ) .unwrap(); diff --git a/interpreters/src/insert.rs b/interpreters/src/insert.rs index 95bc47a0cb..782b07b6f3 100644 --- a/interpreters/src/insert.rs +++ b/interpreters/src/insert.rs @@ -341,7 +341,11 @@ fn get_or_extract_column_from_row_groups( .unwrap_or_else(|| { let data_type = row_groups.schema().column(column_idx).data_type; let iter = row_groups.iter_column(column_idx); - let mut builder = ColumnBlockBuilder::with_capacity(&data_type, iter.size_hint().0); + let mut builder = ColumnBlockBuilder::with_capacity( + &data_type, + iter.size_hint().0, + row_groups.schema().column(column_idx).is_dictionary, + ); for datum in iter { builder.append(datum.clone()).context(BuildColumnBlock)?; diff --git a/interpreters/src/show_create.rs b/interpreters/src/show_create.rs index 29b6048355..a6e4b3dcdf 100644 --- a/interpreters/src/show_create.rs +++ b/interpreters/src/show_create.rs @@ -86,6 +86,11 @@ impl ShowCreateInterpreter { if col.is_tag { res += " TAG"; } + + if col.is_dictionary { + res += " DICTIONARY"; + } + if !col.is_nullable { res += " NOT NULL"; } diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs index 2b2f93b184..6a4473eee6 100644 --- a/interpreters/src/tests.rs +++ b/interpreters/src/tests.rs @@ -170,17 +170,18 @@ where let sql = "desc table test_table"; let output = self.sql_to_output(sql).await.unwrap(); let records = output.try_into().unwrap(); + // todo this maybe need to change let expected = vec![ - "+--------+-----------+------------+-------------+--------+", - "| name | type | is_primary | is_nullable | is_tag |", - "+--------+-----------+------------+-------------+--------+", - "| key1 | varbinary | true | false | false |", - "| key2 | timestamp | true | false | false |", - "| field1 | double | false | true | false |", - "| field2 | string | false | true | false |", - "| field3 | date | false | true | false |", - "| field4 | time | false | true | false |", - "+--------+-----------+------------+-------------+--------+", + "+--------+-----------+------------+-------------+--------+---------------+", + "| name | type | 
is_primary | is_nullable | is_tag | is_dictionary |", + "+--------+-----------+------------+-------------+--------+---------------+", + "| key1 | varbinary | true | false | false | false |", + "| key2 | timestamp | true | false | false | false |", + "| field1 | double | false | true | false | false |", + "| field2 | string | false | true | false | false |", + "| field3 | date | false | true | false | false |", + "| field4 | time | false | true | false | false |", + "+--------+-----------+------------+-------------+--------+---------------+", ]; common_util::record_batch::assert_record_batches_eq(&expected, records); } diff --git a/proxy/src/forward.rs b/proxy/src/forward.rs index 9603dceee9..e1765dbfda 100644 --- a/proxy/src/forward.rs +++ b/proxy/src/forward.rs @@ -21,6 +21,8 @@ use tonic::{ transport::{self, Channel}, }; +use crate::FORWARDED_FROM; + #[derive(Debug, Snafu)] pub enum Error { #[snafu(display( @@ -68,6 +70,9 @@ pub enum Error { source: tonic::transport::Error, backtrace: Backtrace, }, + + #[snafu(display("Request should not be forwarded twice, forward from:{}", endpoint))] + ForwardedErr { endpoint: String }, } define_result!(Error); @@ -184,6 +189,7 @@ pub struct ForwardRequest { pub schema: String, pub table: String, pub req: tonic::Request, + pub forwarded_from: Option, } impl Forwarder { @@ -256,7 +262,12 @@ impl Forwarder { F: ForwarderRpc, Req: std::fmt::Debug + Clone, { - let ForwardRequest { schema, table, req } = forward_req; + let ForwardRequest { + schema, + table, + req, + forwarded_from, + } = forward_req; let route_req = RouteRequest { context: Some(RequestContext { database: schema }), @@ -281,13 +292,15 @@ impl Forwarder { } }; - self.forward_with_endpoint(endpoint, req, do_rpc).await + self.forward_with_endpoint(endpoint, req, forwarded_from, do_rpc) + .await } pub async fn forward_with_endpoint( &self, endpoint: Endpoint, mut req: tonic::Request, + forwarded_from: Option, do_rpc: F, ) -> Result> where @@ -310,6 +323,17 @@ impl Forwarder { "Try to forward request to {:?}, request:{:?}", endpoint, req, ); + + if let Some(endpoint) = forwarded_from { + return ForwardedErr { endpoint }.fail(); + } + + // mark forwarded + req.metadata_mut().insert( + FORWARDED_FROM, + self.local_endpoint.to_string().parse().unwrap(), + ); + let client = self.get_or_create_client(&endpoint).await?; match do_rpc(client, req, &endpoint).await { Err(e) => { @@ -461,6 +485,7 @@ mod tests { schema: DEFAULT_SCHEMA.to_string(), table: table.to_string(), req: query_request.into_request(), + forwarded_from: None, } }; diff --git a/proxy/src/grpc/metrics.rs b/proxy/src/grpc/metrics.rs index 850f001683..57bb707a04 100644 --- a/proxy/src/grpc/metrics.rs +++ b/proxy/src/grpc/metrics.rs @@ -8,7 +8,16 @@ use prometheus_static_metric::{auto_flush_from, make_auto_flush_static_metric}; make_auto_flush_static_metric! 
{ pub label_enum GrpcTypeKind { + write_succeeded, write_failed, + query_succeeded, + query_failed, + stream_query_succeeded, + stream_query_failed, + write_succeeded_row, + write_failed_row, + query_succeeded_row, + query_affected_row, } pub struct GrpcHandlerCounterVec: LocalIntCounter { diff --git a/proxy/src/grpc/prom_query.rs b/proxy/src/grpc/prom_query.rs index 4ef9ebeecf..8368921fc0 100644 --- a/proxy/src/grpc/prom_query.rs +++ b/proxy/src/grpc/prom_query.rs @@ -373,54 +373,77 @@ mod tests { .unwrap(), ) .unwrap() + .add_normal_column( + column_schema::Builder::new("tag_dictionary".to_string(), DatumKind::String) + .is_tag(true) + .is_dictionary(true) + .is_nullable(true) + .build() + .unwrap(), + ) + .unwrap() .build() .unwrap() } fn build_column_block() -> Vec { - let build_row = |ts: i64, tsid: u64, field1: f64, field2: &str| -> Row { + let build_row = |ts: i64, tsid: u64, field1: f64, field2: &str, dic: Option<&str>| -> Row { let datums = vec![ Datum::Timestamp(Timestamp::new(ts)), Datum::UInt64(tsid), Datum::Double(field1), Datum::String(StringBytes::from(field2)), + dic.map(|v| Datum::String(StringBytes::from(v))) + .unwrap_or(Datum::Null), ]; Row::from_datums(datums) }; let rows = vec![ - build_row(1000001, 1, 10.0, "v5"), - build_row(1000002, 1, 11.0, "v5"), - build_row(1000000, 2, 10.0, "v4"), - build_row(1000000, 3, 10.0, "v3"), + build_row(1000001, 1, 10.0, "v5", Some("d1")), + build_row(1000002, 1, 11.0, "v5", None), + build_row(1000000, 2, 10.0, "v4", Some("d2")), + build_row(1000000, 3, 10.0, "v3", None), ]; - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 2, false); for row in &rows { builder.append(row[0].clone()).unwrap(); } let timestamp_block = builder.build(); - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 2, false); for row in &rows { builder.append(row[1].clone()).unwrap(); } let tsid_block = builder.build(); - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Double, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::Double, 2, false); for row in &rows { builder.append(row[2].clone()).unwrap(); } let field_block = builder.build(); - let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2); + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2, false); for row in &rows { builder.append(row[3].clone()).unwrap(); } let tag_block = builder.build(); - vec![timestamp_block, tsid_block, field_block, tag_block] + let mut builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 2, true); + for row in &rows { + builder.append(row[4].clone()).unwrap(); + } + let dictionary_block = builder.build(); + + vec![ + timestamp_block, + tsid_block, + field_block, + tag_block, + dictionary_block, + ] } fn make_sample(timestamp: i64, value: f64) -> Sample { @@ -440,7 +463,7 @@ mod tests { let column_name = ColumnNames { timestamp: "timestamp".to_string(), - tag_keys: vec!["tag1".to_string()], + tag_keys: vec!["tag1".to_string(), "tag_dictionary".to_string()], field: "field1".to_string(), }; let converter = RecordConverter::try_new(&column_name, &schema.to_record_schema()).unwrap(); @@ -461,11 +484,17 @@ mod tests { ); assert_eq!( tsid_to_tags.get(&1).unwrap().clone(), - make_tags(vec![("tag1".to_string(), "v5".to_string())]) + make_tags(vec![ + ("tag1".to_string(), "v5".to_string()), + 
("tag_dictionary".to_string(), "d1".to_string()) + ]) ); assert_eq!( tsid_to_tags.get(&2).unwrap().clone(), - make_tags(vec![("tag1".to_string(), "v4".to_string())]) + make_tags(vec![ + ("tag1".to_string(), "v4".to_string()), + ("tag_dictionary".to_string(), "d2".to_string()) + ]) ); assert_eq!( tsid_to_tags.get(&3).unwrap().clone(), diff --git a/proxy/src/grpc/sql_query.rs b/proxy/src/grpc/sql_query.rs index 4b9a75210b..6c756be12c 100644 --- a/proxy/src/grpc/sql_query.rs +++ b/proxy/src/grpc/sql_query.rs @@ -28,6 +28,7 @@ use tonic::{transport::Channel, IntoRequest}; use crate::{ error::{self, ErrNoCause, ErrWithCause, Error, Result}, forward::{ForwardRequest, ForwardResult}, + grpc::metrics::GRPC_HANDLER_COUNTER_VEC, read::SqlResponse, Context, Proxy, }; @@ -40,12 +41,16 @@ impl Proxy { match self.handle_sql_query_internal(ctx, req).await { Err(e) => { error!("Failed to handle sql query, err:{e}"); + GRPC_HANDLER_COUNTER_VEC.query_failed.inc(); SqlQueryResponse { header: Some(error::build_err_header(e)), ..Default::default() } } - Ok(v) => v, + Ok(v) => { + GRPC_HANDLER_COUNTER_VEC.query_succeeded.inc(); + v + } } } @@ -79,13 +84,17 @@ impl Proxy { match self.clone().handle_stream_query_internal(ctx, req).await { Err(e) => stream::once(async { error!("Failed to handle stream sql query, err:{e}"); + GRPC_HANDLER_COUNTER_VEC.stream_query_failed.inc(); SqlQueryResponse { header: Some(error::build_err_header(e)), ..Default::default() } }) .boxed(), - Ok(v) => v, + Ok(v) => { + GRPC_HANDLER_COUNTER_VEC.stream_query_succeeded.inc(); + v + } } } @@ -104,7 +113,11 @@ impl Proxy { let req_context = req.context.as_ref().unwrap(); let schema = req_context.database.clone(); - let req = match self.clone().maybe_forward_stream_sql_query(&req).await { + let req = match self + .clone() + .maybe_forward_stream_sql_query(ctx.clone(), &req) + .await + { Some(resp) => match resp { ForwardResult::Forwarded(resp) => return resp, ForwardResult::Local => req, @@ -127,8 +140,12 @@ impl Proxy { if tx.send(resp).await.is_err() { error!("Failed to send affected rows resp in stream sql query"); } + GRPC_HANDLER_COUNTER_VEC + .query_affected_row + .inc_by(rows as u64); } Output::Records(batches) => { + let mut num_rows = 0; for batch in &batches { let resp = { let mut writer = QueryResponseWriter::new(resp_compress_min_length); @@ -140,7 +157,11 @@ impl Proxy { error!("Failed to send record batches resp in stream sql query"); break; } + num_rows += batch.num_rows(); } + GRPC_HANDLER_COUNTER_VEC + .query_succeeded_row + .inc_by(num_rows as u64); } } Ok::<(), Error>(()) @@ -150,6 +171,7 @@ impl Proxy { async fn maybe_forward_stream_sql_query( self: Arc, + ctx: Context, req: &SqlQueryRequest, ) -> Option, Error>> { if req.tables.len() != 1 { @@ -163,6 +185,7 @@ impl Proxy { schema: req_ctx.database.clone(), table: req.tables[0].clone(), req: req.clone().into_request(), + forwarded_from: ctx.forwarded_from, }; let do_query = |mut client: StorageServiceClient, request: tonic::Request, @@ -219,9 +242,19 @@ pub fn convert_output( Output::Records(batches) => { let mut writer = QueryResponseWriter::new(resp_compress_min_length); writer.write_batches(batches)?; + let mut num_rows = 0; + for batch in batches { + num_rows += batch.num_rows(); + } + GRPC_HANDLER_COUNTER_VEC + .query_succeeded_row + .inc_by(num_rows as u64); writer.finish() } Output::AffectedRows(rows) => { + GRPC_HANDLER_COUNTER_VEC + .query_affected_row + .inc_by(*rows as u64); Ok(QueryResponseBuilder::with_ok_header().build_with_affected_rows(*rows)) } } diff 
--git a/proxy/src/grpc/write.rs b/proxy/src/grpc/write.rs index ca4c7f2f55..ecf0e1dac3 100644 --- a/proxy/src/grpc/write.rs +++ b/proxy/src/grpc/write.rs @@ -21,8 +21,9 @@ impl Proxy { match self.handle_write_internal(ctx, req).await { Err(e) => { error!("Failed to handle write, err:{e}"); + GRPC_HANDLER_COUNTER_VEC.write_failed.inc(); GRPC_HANDLER_COUNTER_VEC - .write_failed + .write_failed_row .inc_by(num_rows as u64); WriteResponse { header: Some(error::build_err_header(e)), @@ -30,9 +31,13 @@ impl Proxy { } } Ok(v) => { + GRPC_HANDLER_COUNTER_VEC.write_succeeded.inc(); GRPC_HANDLER_COUNTER_VEC - .write_failed + .write_failed_row .inc_by(v.failed as u64); + GRPC_HANDLER_COUNTER_VEC + .write_succeeded_row + .inc_by(v.success as u64); WriteResponse { header: Some(build_ok_header()), success: v.success, diff --git a/proxy/src/http/prom.rs b/proxy/src/http/prom.rs index d414f7e0db..87fd26e76b 100644 --- a/proxy/src/http/prom.rs +++ b/proxy/src/http/prom.rs @@ -62,6 +62,7 @@ impl Proxy { runtime: self.engine_runtimes.write_runtime.clone(), timeout: ctx.timeout, enable_partition_table_access: false, + forwarded_from: None, }; let result = self.handle_write_internal(ctx, table_request).await?; @@ -241,6 +242,7 @@ impl Converter { let value_idx = schema.index_of(field_col_name).context(InternalNoCause { msg: "Value column is missing in query response", })?; + // Todo is there need add is_dictionary check? let tags = schema .columns() .iter() diff --git a/proxy/src/http/sql.rs b/proxy/src/http/sql.rs index 127732c24e..67ec394042 100644 --- a/proxy/src/http/sql.rs +++ b/proxy/src/http/sql.rs @@ -37,6 +37,7 @@ impl Proxy { timeout: ctx.timeout, runtime: self.engine_runtimes.read_runtime.clone(), enable_partition_table_access: true, + forwarded_from: None, }; match self.handle_sql(context, &ctx.schema, &req.query).await? 
{ diff --git a/proxy/src/influxdb/mod.rs b/proxy/src/influxdb/mod.rs index e028c59ff3..c4a2fee151 100644 --- a/proxy/src/influxdb/mod.rs +++ b/proxy/src/influxdb/mod.rs @@ -58,6 +58,7 @@ impl Proxy { timeout: ctx.timeout, runtime: self.engine_runtimes.write_runtime.clone(), enable_partition_table_access: false, + forwarded_from: None, }; let result = self .handle_write_internal(proxy_context, table_request) diff --git a/proxy/src/influxdb/types.rs b/proxy/src/influxdb/types.rs index 58cba675ab..85681d95c5 100644 --- a/proxy/src/influxdb/types.rs +++ b/proxy/src/influxdb/types.rs @@ -811,11 +811,13 @@ mod tests { } fn build_test_column_blocks() -> Vec { - let mut measurement_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3); - let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3); - let mut time_builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 3); - let mut field_builder1 = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3); - let mut field_builder2 = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 3); + // TODO missing is_dictionary paramms + let mut measurement_builder = + ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); + let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); + let mut time_builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, 3, false); + let mut field_builder1 = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); + let mut field_builder2 = ColumnBlockBuilder::with_capacity(&DatumKind::UInt64, 3, false); // Data in measurement1 let measurement1 = Datum::String(StringBytes::copy_from_str("m1")); diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8ab85fd147..c54a5acfc6 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -22,6 +22,8 @@ pub mod schema_config_provider; mod util; mod write; +pub const FORWARDED_FROM: &str = "forwarded-from"; + use std::{ sync::Arc, time::{Duration, Instant}, @@ -131,6 +133,7 @@ impl Proxy { schema: req_ctx.database.clone(), table: metric, req: req.into_request(), + forwarded_from: None, }; let do_query = |mut client: StorageServiceClient, request: tonic::Request, @@ -452,4 +455,5 @@ pub struct Context { pub timeout: Option, pub runtime: Arc, pub enable_partition_table_access: bool, + pub forwarded_from: Option, } diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 9131cf54d0..a47b9454be 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -41,7 +41,10 @@ impl Proxy { schema: &str, sql: &str, ) -> Result { - if let Some(resp) = self.maybe_forward_sql_query(schema, sql).await? { + if let Some(resp) = self + .maybe_forward_sql_query(ctx.clone(), schema, sql) + .await? + { match resp { ForwardResult::Forwarded(resp) => return Ok(SqlResponse::Forwarded(resp?)), ForwardResult::Local => (), @@ -149,6 +152,7 @@ impl Proxy { async fn maybe_forward_sql_query( &self, + ctx: Context, schema: &str, sql: &str, ) -> Result>> { @@ -174,6 +178,7 @@ impl Proxy { schema: schema.to_string(), table: table_name.unwrap(), req: sql_request.into_request(), + forwarded_from: ctx.forwarded_from, }; let do_query = |mut client: StorageServiceClient, request: tonic::Request, diff --git a/proxy/src/write.rs b/proxy/src/write.rs index a371e3cd01..008489bfdc 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -108,7 +108,7 @@ impl Proxy { let mut futures = Vec::with_capacity(write_requests_to_forward.len() + 1); // Write to remote. 
- self.collect_write_to_remote_future(&mut futures, write_requests_to_forward) + self.collect_write_to_remote_future(&mut futures, ctx.clone(), write_requests_to_forward) .await; // Write to local. @@ -139,7 +139,7 @@ impl Proxy { let mut futures = Vec::with_capacity(write_requests_to_forward.len() + 1); // Write to remote. - self.collect_write_to_remote_future(&mut futures, write_requests_to_forward) + self.collect_write_to_remote_future(&mut futures, ctx.clone(), write_requests_to_forward) .await; // Create table. @@ -358,12 +358,14 @@ impl Proxy { async fn collect_write_to_remote_future( &self, futures: &mut WriteResponseFutures<'_>, + ctx: Context, write_request: HashMap, ) { for (endpoint, table_write_request) in write_request { let forwarder = self.forwarder.clone(); + let ctx = ctx.clone(); let write_handle = self.engine_runtimes.io_runtime.spawn(async move { - Self::write_to_remote(forwarder, endpoint, table_write_request).await + Self::write_to_remote(ctx, forwarder, endpoint, table_write_request).await }); futures.push(write_handle.boxed()); @@ -408,6 +410,7 @@ impl Proxy { } async fn write_to_remote( + ctx: Context, forwarder: ForwarderRef, endpoint: Endpoint, table_write_request: WriteRequest, @@ -432,7 +435,12 @@ impl Proxy { }; let forward_result = forwarder - .forward_with_endpoint(endpoint, tonic::Request::new(table_write_request), do_write) + .forward_with_endpoint( + endpoint, + tonic::Request::new(table_write_request), + ctx.forwarded_from, + do_write, + ) .await; let forward_res = forward_result .map_err(|e| { @@ -676,8 +684,8 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - - build_column(&mut columns, schema, tag_name, &tag.value, true)?; + // todo is_dictionary set true or false ? + build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; } // Parse fields. @@ -693,7 +701,8 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - build_column(&mut columns, schema, field_name, &field.value, false)?; + // todo is_dictionary set true or false ? + build_column(&mut columns, schema, field_name, &field.value, false, false)?; } } } @@ -707,6 +716,7 @@ fn build_column<'a>( name: &'a str, value: &Option, is_tag: bool, + is_dictionary: bool, ) -> Result<()> { // Skip adding columns, the following cases: // 1. Column already exists. @@ -732,7 +742,7 @@ fn build_column<'a>( msg: "Failed to get data type", })?; - let column_schema = build_column_schema(name, data_type, is_tag) + let column_schema = build_column_schema(name, data_type, is_tag, is_dictionary) .box_err() .context(Internal { msg: "Failed to build column schema", diff --git a/query_frontend/src/frontend.rs b/query_frontend/src/frontend.rs index b82f58d693..ecdf8e17f2 100644 --- a/query_frontend/src/frontend.rs +++ b/query_frontend/src/frontend.rs @@ -177,6 +177,10 @@ impl Frontend

{ } pub fn parse_table_name(statements: &StatementVec) -> Option { + // maybe have empty sql + if statements.is_empty() { + return None; + } match &statements[0] { Statement::Standard(s) => match *s.clone() { SqlStatement::Insert { table_name, .. } => { @@ -269,5 +273,8 @@ mod tests { Some(table.to_string()) ); } + assert!(frontend::parse_table_name_with_sql("-- just comment") + .unwrap() + .is_none()); } } diff --git a/query_frontend/src/parser.rs b/query_frontend/src/parser.rs index 883e8e22ab..813344a888 100644 --- a/query_frontend/src/parser.rs +++ b/query_frontend/src/parser.rs @@ -37,6 +37,7 @@ macro_rules! parser_err { const TS_KEY: &str = "__ts_key"; const TAG: &str = "TAG"; +const DICTIONARY: &str = "DICTIONARY"; const UNSIGN: &str = "UNSIGN"; const MODIFY: &str = "MODIFY"; const SETTING: &str = "SETTING"; @@ -62,6 +63,7 @@ macro_rules! is_custom_column { } is_custom_column!(TAG); +is_custom_column!(DICTIONARY); is_custom_column!(UNSIGN); /// Get the comment from the [`ColumnOption`] if it is a comment option. @@ -326,6 +328,22 @@ impl<'a> Parser<'a> { // WITH ... let options = self.parser.parse_options(Keyword::WITH)?; + // Only String Column Can Be Dictionary Encoded + for c in columns.iter() { + let mut is_dictionary = false; + for op in c.options.iter() { + if is_dictionary_column(&op.option) { + is_dictionary = true; + } + } + if c.data_type != DataType::String && is_dictionary { + return parser_err!(format!( + "Only string column can be dictionary encoded: {:?}", + c.to_string() + )); + } + } + Ok(Statement::Create(Box::new(CreateTable { if_not_exists, table_name, @@ -513,6 +531,11 @@ impl<'a> Parser<'a> { Ok(Some(ColumnOption::DialectSpecific(vec![ Token::make_keyword(TAG), ]))) + } else if self.consume_token(DICTIONARY) { + // Support DICTIONARY for ceresdb + Ok(Some(ColumnOption::DialectSpecific(vec![ + Token::make_keyword(DICTIONARY), + ]))) } else if self.consume_token(UNSIGN) { // Support unsign for ceresdb Ok(Some(ColumnOption::DialectSpecific(vec![ @@ -973,6 +996,52 @@ mod tests { } } + #[test] + fn test_dictionary_column() { + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag dictionary, c2 float dictionary, c3 bigint unsign)"; + assert!(Parser::parse_sql(sql).is_err()); + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag dictionary, c2 string dictionary, c3 bigint unsign)"; + let statements = Parser::parse_sql(sql).unwrap(); + assert_eq!(statements.len(), 1); + match &statements[0] { + Statement::Create(v) => { + let columns = &v.columns; + assert_eq!(3, columns.len()); + for c in columns { + if c.name.value == "c1" { + assert_eq!(2, c.options.len()); + let opt = &c.options[0]; + assert!(is_tag_column(&opt.option)); + let opt = &c.options[1]; + assert!(is_dictionary_column(&opt.option)); + } else if c.name.value == "c2" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_dictionary_column(&opt.option)); + } else if c.name.value == "c3" { + assert_eq!(1, c.options.len()); + let opt = &c.options[0]; + assert!(is_unsign_column(&opt.option)); + } else { + panic!("failed"); + } + } + } + _ => panic!("failed"), + } + } + + #[test] + fn test_dictionary_use_unstring_column() { + let sql = + "CREATE TABLE IF NOT EXISTS t(c1 string tag, c2 float dictionary, c3 bigint unsign)"; + assert!(Parser::parse_sql(sql).is_err()); + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag dictionary, c2 float dictionary, c3 bigint unsign)"; + assert!(Parser::parse_sql(sql).is_err()); + let sql = "CREATE TABLE IF NOT EXISTS t(c1 string tag, c2 float 
dictionary, c3 bigint unsign dictionary)"; + assert!(Parser::parse_sql(sql).is_err()); + } + #[test] fn test_comment_column() { let sql = "CREATE TABLE IF NOT EXISTS t(c1 string, c2 float, c3 bigint comment 'id')"; diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 694bc0ee0e..2baf01be10 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -367,10 +367,12 @@ pub fn build_column_schema( column_name: &str, data_type: DatumKind, is_tag: bool, + is_dictionary: bool, ) -> Result { let builder = column_schema::Builder::new(column_name.to_string(), data_type) .is_nullable(true) - .is_tag(is_tag); + .is_tag(is_tag) + .is_dictionary(is_dictionary); builder.build().with_context(|| InvalidColumnSchema { column_name: column_name.to_string(), @@ -429,9 +431,19 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - ensure_data_type_compatible(table, tag_name, true, data_type, column_schema)?; + // Todo is_dictionary set true or false ? Do we need modify the pb ? + ensure_data_type_compatible( + table, + tag_name, + true, + false, + data_type, + column_schema, + )?; } - let column_schema = build_column_schema(tag_name, data_type, true)?; + + // Todo is_dictionary set true or false ? Do we need modify the pb ? + let column_schema = build_column_schema(tag_name, data_type, true, false)?; name_column_map.insert(tag_name, column_schema); } @@ -457,16 +469,18 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { + // todo is_dictionary set true or false ? ensure_data_type_compatible( table, field_name, false, + false, data_type, column_schema, )?; } - - let column_schema = build_column_schema(field_name, data_type, false)?; + // todo is_dictionary set true or false ? + let column_schema = build_column_schema(field_name, data_type, false, false)?; name_column_map.insert(field_name, column_schema); } } @@ -512,9 +526,11 @@ fn ensure_data_type_compatible( table_name: &str, column_name: &str, is_tag: bool, + _is_dictionary: bool, data_type: DatumKind, column_schema: &ColumnSchema, ) -> Result<()> { + // Todo how to check is_dictionary ? ensure!( column_schema.is_tag == is_tag, InvalidWriteEntry { @@ -1234,6 +1250,7 @@ fn parse_column(col: &ColumnDef) -> Result { // Process column options let mut is_nullable = true; // A column is nullable by default. 
let mut is_tag = false; + let mut is_dictionary = false; let mut is_unsign = false; let mut comment = String::new(); let mut default_value = None; @@ -1242,6 +1259,8 @@ fn parse_column(col: &ColumnDef) -> Result { is_nullable = false; } else if parser::is_tag_column(&option_def.option) { is_tag = true; + } else if parser::is_dictionary_column(&option_def.option) { + is_dictionary = true; } else if parser::is_unsign_column(&option_def.option) { is_unsign = true; } else if let Some(default_value_expr) = parser::get_default_value(&option_def.option) { @@ -1260,6 +1279,7 @@ fn parse_column(col: &ColumnDef) -> Result { let builder = column_schema::Builder::new(col.name.value.clone(), data_type) .is_nullable(is_nullable) .is_tag(is_tag) + .is_dictionary(is_dictionary) .comment(comment) .default_value(default_value); @@ -1441,6 +1461,7 @@ mod tests { data_type: String, is_nullable: false, is_tag: true, + is_dictionary: false, comment: "", escaped_name: "c1", default_value: None, @@ -1451,6 +1472,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "ts", default_value: None, @@ -1461,6 +1483,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c3", default_value: None, @@ -1471,6 +1494,7 @@ mod tests { data_type: UInt32, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c4", default_value: Some( @@ -1488,6 +1512,7 @@ mod tests { data_type: UInt32, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c5", default_value: Some( @@ -1514,6 +1539,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "c6", default_value: Some( @@ -1612,6 +1638,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1622,6 +1649,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1632,6 +1660,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1642,6 +1671,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1652,6 +1682,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1662,6 +1693,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -1687,6 +1719,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1697,6 +1730,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1707,6 +1741,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1717,6 +1752,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1727,6 +1763,7 @@ mod tests { data_type: 
Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1737,6 +1774,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -1851,6 +1889,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1861,6 +1900,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1871,6 +1911,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1881,6 +1922,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1891,6 +1933,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1901,6 +1944,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -1920,6 +1964,119 @@ mod tests { .unwrap(); } + #[test] + fn test_alter_column_with_dictionary_encode() { + let sql = "ALTER TABLE test_table ADD column dic string dictionary;"; + quick_test( + sql, + r#"AlterTable( + AlterTablePlan { + table: MemoryTable { + name: "test_table", + id: TableId( + 100, + ), + schema: Schema { + timestamp_index: 1, + tsid_index: None, + column_schemas: ColumnSchemas { + columns: [ + ColumnSchema { + id: 1, + name: "key1", + data_type: Varbinary, + is_nullable: false, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "key1", + default_value: None, + }, + ColumnSchema { + id: 2, + name: "key2", + data_type: Timestamp, + is_nullable: false, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "key2", + default_value: None, + }, + ColumnSchema { + id: 3, + name: "field1", + data_type: Double, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field1", + default_value: None, + }, + ColumnSchema { + id: 4, + name: "field2", + data_type: String, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field2", + default_value: None, + }, + ColumnSchema { + id: 5, + name: "field3", + data_type: Date, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field3", + default_value: None, + }, + ColumnSchema { + id: 6, + name: "field4", + data_type: Time, + is_nullable: true, + is_tag: false, + is_dictionary: false, + comment: "", + escaped_name: "field4", + default_value: None, + }, + ], + }, + version: 1, + primary_key_indexes: [ + 0, + 1, + ], + }, + }, + operations: AddColumn( + [ + ColumnSchema { + id: 0, + name: "dic", + data_type: String, + is_nullable: true, + is_tag: false, + is_dictionary: true, + comment: "", + escaped_name: "dic", + default_value: None, + }, + ], + ), + }, +)"#, + ) + .unwrap(); + } + #[test] fn test_alter_column_statement_to_plan() { let sql = "ALTER TABLE test_tablex ADD column add_col string;"; @@ -1946,6 +2103,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -1956,6 +2114,7 @@ mod tests { 
data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -1966,6 +2125,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -1976,6 +2136,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -1986,6 +2147,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -1996,6 +2158,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -2017,6 +2180,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "add_col", default_value: None, @@ -2055,6 +2219,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -2065,6 +2230,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -2075,6 +2241,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -2085,6 +2252,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -2095,6 +2263,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -2105,6 +2274,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, @@ -2156,6 +2326,7 @@ mod tests { data_type: Varbinary, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key1", default_value: None, @@ -2166,6 +2337,7 @@ mod tests { data_type: Timestamp, is_nullable: false, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "key2", default_value: None, @@ -2176,6 +2348,7 @@ mod tests { data_type: Double, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field1", default_value: None, @@ -2186,6 +2359,7 @@ mod tests { data_type: String, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field2", default_value: None, @@ -2196,6 +2370,7 @@ mod tests { data_type: Date, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field3", default_value: None, @@ -2206,6 +2381,7 @@ mod tests { data_type: Time, is_nullable: true, is_tag: false, + is_dictionary: false, comment: "", escaped_name: "field4", default_value: None, diff --git a/rust-toolchain b/rust-toolchain deleted file mode 100644 index 3f36906f0f..0000000000 --- a/rust-toolchain +++ /dev/null @@ -1 +0,0 @@ -nightly-2023-02-02 diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000000..1680342afe --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "nightly-2023-02-02" +components = [ "rustfmt", "clippy" ] diff --git a/server/Cargo.toml b/server/Cargo.toml index 36f89ee2d1..eb5c523af5 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml 
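The planner now recognizes a `dictionary` option on column definitions, parsed the same way as `TAG` and carried through `column_schema::Builder::is_dictionary`. A rough sketch of the resulting SQL surface, based on the `ALTER TABLE ... ADD column dic string dictionary` statement exercised in the tests above; the table and column names here are illustrative only:

    CREATE TABLE `dict_demo` (
        t timestamp NOT NULL,
        region string DICTIONARY,
        value double,
        TIMESTAMP KEY (t)
    ) ENGINE = Analytic;

    ALTER TABLE `dict_demo` ADD COLUMN zone string DICTIONARY;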
@@ -35,7 +35,6 @@ meta_client = { workspace = true } opensrv-mysql = "0.1.0" partition_table_engine = { workspace = true } paste = { workspace = true } -pprof = { version = "0.11.1", features = ["flamegraph"] } profile = { workspace = true } prom-remote-api = { workspace = true, features = ["warp"] } prometheus = { workspace = true } diff --git a/server/src/grpc/metrics.rs b/server/src/grpc/metrics.rs index 94a1213e03..d86109766f 100644 --- a/server/src/grpc/metrics.rs +++ b/server/src/grpc/metrics.rs @@ -36,7 +36,15 @@ make_auto_flush_static_metric! { } pub label_enum RemoteEngineGrpcTypeKind { + write_succeeded, write_failed, + query_succeeded, + query_failed, + stream_query_succeeded, + stream_query_failed, + write_succeeded_row, + write_failed_row, + query_succeeded_row, } pub struct RemoteEngineGrpcHandlerCounterVec: LocalIntCounter { diff --git a/server/src/grpc/remote_engine_service/mod.rs b/server/src/grpc/remote_engine_service/mod.rs index e1b88eabf5..bd2401628c 100644 --- a/server/src/grpc/remote_engine_service/mod.rs +++ b/server/src/grpc/remote_engine_service/mod.rs @@ -74,12 +74,19 @@ impl RemoteEngineServiceImpl { }); let tx = tx.clone(); self.runtimes.read_runtime.spawn(async move { + let mut num_rows = 0; while let Some(batch) = stream.next().await { + if let Ok(record_batch) = &batch { + num_rows += record_batch.num_rows(); + } if let Err(e) = tx.send(batch).await { error!("Failed to send handler result, err:{}.", e); break; } } + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .query_succeeded_row + .inc_by(num_rows as u64); }); } @@ -331,21 +338,32 @@ async fn handle_stream_read( let begin = Instant::now(); let table = find_table_by_identifier(&ctx, &table_ident)?; - let streams = table + let res = table .partitioned_read(read_request) .await .box_err() .with_context(|| ErrWithCause { code: StatusCode::Internal, msg: format!("fail to read table, table:{table_ident:?}"), - })?; - - info!( + }); + match res { + Ok(streams) => { + info!( "Handle stream read success, request_id:{request_id}, table:{table_ident:?}, cost:{:?}", begin.elapsed(), ); - - Ok(streams) + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .stream_query_succeeded + .inc(); + Ok(streams) + } + Err(e) => { + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .stream_query_failed + .inc(); + Err(e) + } + } } async fn handle_write(ctx: HandlerContext, request: WriteRequest) -> Result { @@ -367,13 +385,20 @@ async fn handle_write(ctx: HandlerContext, request: WriteRequest) -> Result Ok(WriteResponse { - header: None, - affected_rows: affected_rows as u64, - }), + Ok(affected_rows) => { + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC.write_succeeded.inc(); + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC + .write_succeeded_row + .inc_by(affected_rows as u64); + Ok(WriteResponse { + header: None, + affected_rows: affected_rows as u64, + }) + } Err(e) => { + REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC.write_failed.inc(); REMOTE_ENGINE_GRPC_HANDLER_COUNTER_VEC - .write_failed + .write_failed_row .inc_by(num_rows as u64); Err(e) } diff --git a/server/src/grpc/storage_service/mod.rs b/server/src/grpc/storage_service/mod.rs index 7487b4fda8..ca758146ca 100644 --- a/server/src/grpc/storage_service/mod.rs +++ b/server/src/grpc/storage_service/mod.rs @@ -21,7 +21,7 @@ use ceresdbproto::{ use common_util::time::InstantExt; use futures::{stream, stream::BoxStream, StreamExt}; use http::StatusCode; -use proxy::{Context, Proxy}; +use proxy::{Context, Proxy, FORWARDED_FROM}; use query_engine::executor::Executor as QueryExecutor; use 
table_engine::engine::EngineRuntimes; @@ -138,6 +138,10 @@ impl StorageService for StorageServiceImpl { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; let stream = Self::stream_sql_query_internal(ctx, proxy, req).await; @@ -155,13 +159,17 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); let join_handle = self .runtimes @@ -186,13 +194,17 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.write_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); let join_handle = self.runtimes.write_runtime.spawn(async move { if req.context.is_none() { @@ -226,13 +238,18 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); + let join_handle = self .runtimes .read_runtime @@ -289,13 +306,18 @@ impl StorageServiceImpl { &self, req: tonic::Request, ) -> Result, tonic::Status> { - let req = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.read_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let req = req.into_inner(); + let proxy = self.proxy.clone(); + let join_handle = self.runtimes.read_runtime.spawn(async move { if req.context.is_none() { return PrometheusQueryResponse { @@ -329,13 +351,17 @@ impl StorageServiceImpl { ) -> Result, tonic::Status> { let mut total_success = 0; - let mut stream = req.into_inner(); - let proxy = self.proxy.clone(); let ctx = Context { runtime: self.runtimes.write_runtime.clone(), timeout: self.timeout, enable_partition_table_access: false, + forwarded_from: req + .metadata() + .get(FORWARDED_FROM) + .map(|value| value.to_str().unwrap().to_string()), }; + let mut stream = req.into_inner(); + let proxy = self.proxy.clone(); let join_handle = self.runtimes.write_runtime.spawn(async move { let mut resp = WriteResponse::default(); diff --git a/server/src/http.rs b/server/src/http.rs index 9e229fc539..15214fc454 100644 --- a/server/src/http.rs +++ b/server/src/http.rs @@ -3,8 +3,8 @@ //! 
Http service use std::{ - collections::HashMap, convert::Infallible, error::Error as StdError, fs::File, net::IpAddr, - sync::Arc, thread, time::Duration, + collections::HashMap, convert::Infallible, error::Error as StdError, net::IpAddr, sync::Arc, + time::Duration, }; use analytic_engine::setup::OpenedWals; @@ -80,6 +80,12 @@ pub enum Error { backtrace: Backtrace, }, + #[snafu(display("Fail to do cpu profiling, err:{}.\nBacktrace:\n{}", source, backtrace))] + ProfileCPU { + source: profile::Error, + backtrace: Backtrace, + }, + #[snafu(display("Fail to join async task, err:{}.", source))] JoinAsyncTask { source: common_util::runtime::Error }, @@ -184,8 +190,8 @@ impl Service { // debug APIs .or(self.flush_memtable()) .or(self.update_log_level()) - .or(self.heap_profile()) - .or(self.cpu_profile()) + .or(self.profile_cpu()) + .or(self.profile_heap()) .or(self.server_config()) .or(self.stats()) .with(warp::log("http_requests")) @@ -393,62 +399,51 @@ impl Service { warp::path!("metrics").and(warp::get()).map(metrics::dump) } - // GET /debug/heap_profile/{seconds} - fn heap_profile( + // GET /debug/profile/cpu/{seconds} + fn profile_cpu( &self, ) -> impl Filter + Clone { - warp::path!("debug" / "heap_profile" / ..) + warp::path!("debug" / "profile" / "cpu" / ..) .and(warp::path::param::()) .and(warp::get()) .and(self.with_profiler()) .and(self.with_runtime()) .and_then( |duration_sec: u64, profiler: Arc, runtime: Arc| async move { - let handle = runtime.spawn_blocking(move || { - profiler.dump_mem_prof(duration_sec).context(ProfileHeap) + let handle = runtime.spawn_blocking(move || -> Result<()> { + profiler.dump_cpu_prof(duration_sec).context(ProfileCPU) }); let result = handle.await.context(JoinAsyncTask); match result { - Ok(Ok(prof_data)) => Ok(prof_data.into_response()), - Ok(Err(e)) => Err(reject::custom(e)), + Ok(_) => Ok("ok"), Err(e) => Err(reject::custom(e)), } }, ) } - // GET /debug/cpu_profile/{seconds} - fn cpu_profile( + // GET /debug/profile/heap/{seconds} + fn profile_heap( &self, ) -> impl Filter + Clone { - warp::path!("debug" / "cpu_profile" / ..) + warp::path!("debug" / "profile" / "heap" / ..) .and(warp::path::param::()) .and(warp::get()) + .and(self.with_profiler()) .and(self.with_runtime()) - .and_then(|duration_sec: u64, runtime: Arc| async move { - let handle = runtime.spawn_blocking(move || -> Result<()> { - let guard = pprof::ProfilerGuardBuilder::default() - .frequency(100) - .blocklist(&["libc", "libgcc", "pthread", "vdso"]) - .build() - .box_err() - .context(Internal)?; - - thread::sleep(Duration::from_secs(duration_sec)); - - let report = guard.report().build().box_err().context(Internal)?; - let file = File::create("/tmp/flamegraph.svg") - .box_err() - .context(Internal)?; - report.flamegraph(file).box_err().context(Internal)?; - Ok(()) - }); - let result = handle.await.context(JoinAsyncTask); - match result { - Ok(_) => Ok("ok"), - Err(e) => Err(reject::custom(e)), - } - }) + .and_then( + |duration_sec: u64, profiler: Arc, runtime: Arc| async move { + let handle = runtime.spawn_blocking(move || { + profiler.dump_heap_prof(duration_sec).context(ProfileHeap) + }); + let result = handle.await.context(JoinAsyncTask); + match result { + Ok(Ok(prof_data)) => Ok(prof_data.into_response()), + Ok(Err(e)) => Err(reject::custom(e)), + Err(e) => Err(reject::custom(e)), + } + }, + ) } // GET /debug/config @@ -695,6 +690,7 @@ fn error_to_status_code(err: &Error) -> StatusCode { | Error::MissingProxy { .. } | Error::ParseIpAddr { .. } | Error::ProfileHeap { .. 
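Both profilers are now reached under `/debug/profile/` with the sampling duration in seconds as the trailing path segment. A hedged usage sketch; the host and port are assumptions (use whatever HTTP address the server is configured with), the CPU endpoint replies with a plain `ok` once `dump_cpu_prof` finishes, and the heap endpoint returns the profile data in the response body:

    curl 'http://127.0.0.1:5440/debug/profile/cpu/30'
    curl -o heap.prof 'http://127.0.0.1:5440/debug/profile/heap/30'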
} + | Error::ProfileCPU { .. } | Error::Internal { .. } | Error::JoinAsyncTask { .. } | Error::AlreadyStarted { .. } diff --git a/server/src/mysql/writer.rs b/server/src/mysql/writer.rs index e2af7880e7..af01fa09ec 100644 --- a/server/src/mysql/writer.rs +++ b/server/src/mysql/writer.rs @@ -150,6 +150,7 @@ mod tests { name: "id".to_string(), data_type: DatumKind::Int32, is_nullable: false, + is_dictionary: false, is_tag: false, comment: "".to_string(), escaped_name: "id".to_string(), @@ -163,6 +164,7 @@ mod tests { name: "name".to_string(), data_type: DatumKind::String, is_nullable: true, + is_dictionary: false, is_tag: true, comment: "".to_string(), escaped_name: "name".to_string(), @@ -177,6 +179,7 @@ mod tests { data_type: DatumKind::Timestamp, is_nullable: true, is_tag: true, + is_dictionary: false, comment: "".to_string(), escaped_name: "birthday".to_string(), default_value: None, @@ -190,6 +193,7 @@ mod tests { data_type: DatumKind::Boolean, is_nullable: true, is_tag: true, + is_dictionary: false, comment: "".to_string(), escaped_name: "is_show".to_string(), default_value: None, @@ -203,6 +207,7 @@ mod tests { data_type: DatumKind::Double, is_nullable: true, is_tag: true, + is_dictionary: false, comment: "".to_string(), escaped_name: "money".to_string(), default_value: None, diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 4e79a291fb..5e94d20d1b 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,7 +244,9 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0); + // TODO ensure there don't use is_dictionary and the datum.clone() is necessary + // ? + let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { builder .append(datum.clone()) diff --git a/table_engine/src/table.rs b/table_engine/src/table.rs index 8d7638b685..821bdb6195 100644 --- a/table_engine/src/table.rs +++ b/table_engine/src/table.rs @@ -249,6 +249,7 @@ impl From for TableSeq { pub struct TableId(u64); impl TableId { + pub const MAX: TableId = TableId(u64::MAX); /// Min table id. pub const MIN: TableId = TableId(0); diff --git a/tools/src/bin/sst-metadata.rs b/tools/src/bin/sst-metadata.rs index 9eb81422bb..a089ad2da5 100644 --- a/tools/src/bin/sst-metadata.rs +++ b/tools/src/bin/sst-metadata.rs @@ -2,7 +2,7 @@ //! 
A cli to query sst meta data -use std::sync::Arc; +use std::{collections::HashMap, sync::Arc}; use analytic_engine::sst::{meta_data::cache::MetaData, parquet::async_reader::ChunkReaderAdapter}; use anyhow::{Context, Result}; @@ -13,7 +13,7 @@ use common_util::{ }; use futures::StreamExt; use object_store::{LocalFileSystem, ObjectMeta, ObjectStoreRef, Path}; -use parquet_ext::meta_data::fetch_parquet_metadata; +use parquet_ext::{meta_data::fetch_parquet_metadata, reader::ObjectStoreReader}; use tokio::{runtime::Handle, task::JoinSet}; #[derive(Parser, Debug)] @@ -30,6 +30,38 @@ struct Args { /// Thread num, 0 means cpu num #[clap(short, long, default_value_t = 0)] threads: usize, + + /// Print page indexes + #[clap(short, long, required(false))] + page_indexes: bool, +} + +#[derive(Default, Debug)] +struct FileStatistics { + file_count: u64, + size: usize, + metadata_size: usize, + kv_size: usize, + filter_size: usize, + row_num: i64, +} + +impl ToString for FileStatistics { + fn to_string(&self) -> String { + format!("FileStatistics {{\n\tfile_count: {},\n\tsize: {:.2},\n\tmetadata_size: {:.2}, \n\tkv_size: {:.2},\n\tfilter_size: {:.2},\n\trow_num: {},\n}}", + self.file_count, + as_mb(self.size), + as_mb(self.metadata_size), + as_mb(self.kv_size), + as_mb(self.filter_size), + self.row_num) + } +} + +#[derive(Default, Debug)] +struct FieldStatistics { + compressed_size: i64, + uncompressed_size: i64, } fn new_runtime(thread_num: usize) -> Runtime { @@ -64,6 +96,7 @@ async fn run(args: Args) -> Result<()> { let mut join_set = JoinSet::new(); let mut ssts = storage.list(None).await?; let verbose = args.verbose; + let page_indexes = args.page_indexes; while let Some(object_meta) = ssts.next().await { let object_meta = object_meta?; let storage = storage.clone(); @@ -71,7 +104,8 @@ async fn run(args: Args) -> Result<()> { join_set.spawn_on( async move { let (metadata, metadata_size, kv_size) = - parse_metadata(storage, location, object_meta.size, verbose).await?; + parse_metadata(storage, location, object_meta.size, verbose, page_indexes) + .await?; Ok::<_, anyhow::Error>((object_meta, metadata, metadata_size, kv_size)) }, &handle, @@ -93,6 +127,8 @@ async fn run(args: Args) -> Result<()> { .cmp(&b.1.custom().time_range.inclusive_start()) }); + let mut file_stats = FileStatistics::default(); + let mut field_stats_map = HashMap::new(); for (object_meta, sst_metadata, metadata_size, kv_size) in metas { let ObjectMeta { location, size, .. 
} = &object_meta; let custom_meta = sst_metadata.custom(); @@ -108,6 +144,27 @@ async fn run(args: Args) -> Result<()> { .unwrap_or(0); let file_metadata = parquet_meta.file_metadata(); let row_num = file_metadata.num_rows(); + + file_stats.file_count += 1; + file_stats.size += object_meta.size; + file_stats.metadata_size += metadata_size; + file_stats.kv_size += kv_size; + file_stats.filter_size += filter_size; + file_stats.row_num += row_num; + + let fields = file_metadata.schema().get_fields(); + for (_, row_group) in parquet_meta.row_groups().iter().enumerate() { + for i in 0..fields.len() { + let column_meta = row_group.column(i); + let field_name = fields.get(i).unwrap().get_basic_info().name().to_string(); + let mut field_stats = field_stats_map + .entry(field_name) + .or_insert(FieldStatistics::default()); + field_stats.compressed_size += column_meta.compressed_size(); + field_stats.uncompressed_size += column_meta.uncompressed_size(); + } + } + if verbose { println!("object_meta:{object_meta:?}, parquet_meta:{parquet_meta:?}, custom_meta:{custom_meta:?}"); } else { @@ -121,6 +178,17 @@ async fn run(args: Args) -> Result<()> { } } + println!("{}", file_stats.to_string()); + println!("FieldStatistics: "); + for (k, v) in field_stats_map.iter() { + println!( + "{},\t compressed_size: {:.2}mb,\t uncompressed_size: {:.2}mb,\t compress_ratio: {:.2}", + k, + as_mb(v.compressed_size as usize), + as_mb(v.uncompressed_size as usize), + v.uncompressed_size as f64 / v.compressed_size as f64 + ); + } Ok(()) } @@ -133,9 +201,11 @@ async fn parse_metadata( path: Path, size: usize, verbose: bool, + page_indexes: bool, ) -> Result<(MetaData, usize, usize)> { let reader = ChunkReaderAdapter::new(&path, &storage); let (parquet_metadata, metadata_size) = fetch_parquet_metadata(size, &reader).await?; + let kv_metadata = parquet_metadata.file_metadata().key_value_metadata(); let kv_size = kv_metadata .map(|kvs| { @@ -155,6 +225,15 @@ async fn parse_metadata( }) .unwrap_or(0); - let md = MetaData::try_new(&parquet_metadata, false)?; + let md = if page_indexes { + let object_store_reader = + ObjectStoreReader::new(storage, path.clone(), Arc::new(parquet_metadata)); + let parquet_metadata = + parquet_ext::meta_data::meta_with_page_indexes(object_store_reader).await?; + MetaData::try_new(&parquet_metadata, false)? + } else { + MetaData::try_new(&parquet_metadata, false)? + }; + Ok((md, metadata_size, kv_size)) } diff --git a/wal/src/message_queue_impl/log_cleaner.rs b/wal/src/message_queue_impl/log_cleaner.rs index b1c4f05a85..074a01b29a 100644 --- a/wal/src/message_queue_impl/log_cleaner.rs +++ b/wal/src/message_queue_impl/log_cleaner.rs @@ -1,4 +1,4 @@ -// Copyright 2022 CeresDB Project Authors. Licensed under Apache-2.0. +// Copyright 2022-2023 CeresDB Project Authors. Licensed under Apache-2.0. //! 
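The sst-metadata tool gains a `--page-indexes` switch (clap also derives a short form) and, after the per-file listing, prints the aggregated `FileStatistics` plus per-field compressed/uncompressed sizes. A hedged invocation sketch; the storage-location argument keeps whatever flag the tool already defines (it is not part of this hunk), so it appears as a placeholder:

    cargo run --release --bin sst-metadata -- <storage args> --threads 4 --page-indexes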
Log cleaner @@ -68,7 +68,7 @@ impl LogCleaner { pub async fn maybe_clean_logs(&mut self, safe_delete_offset: Offset) -> Result<()> { info!( - "Begin to check and clean logs, region id:{}, topic:{}, safe delete offset:{:?}", + "Region clean logs begin, region id:{}, topic:{}, safe delete offset:{:?}", self.region_id, self.log_topic, safe_delete_offset ); @@ -102,7 +102,7 @@ impl LogCleaner { } info!( - "Finished to check and clean logs, do clean:{}, region id:{}, topic:{}, prepare delete to offset:{:?}", + "Region clean logs finish, do clean:{}, region id:{}, topic:{}, prepare delete to offset:{:?}", do_clean, self.region_id, self.log_topic, safe_delete_offset ); diff --git a/wal/src/message_queue_impl/region.rs b/wal/src/message_queue_impl/region.rs index 49f331dfac..37beea0358 100644 --- a/wal/src/message_queue_impl/region.rs +++ b/wal/src/message_queue_impl/region.rs @@ -579,14 +579,14 @@ impl Region { let (snapshot, synchronizer) = { let inner = self.inner.write().await; - debug!( + info!( "Mark deleted entries to sequence num:{}, region id:{}, table id:{}", sequence_num, inner.region_context.region_id(), table_id ); - inner.mark_delete_to(table_id, sequence_num).await.unwrap(); + inner.mark_delete_to(table_id, sequence_num).await?; ( inner.make_meta_snapshot().await, @@ -618,6 +618,8 @@ impl Region { }; let safe_delete_offset = snapshot.safe_delete_offset(); + info!("Region clean logs, snapshot:{snapshot:?}, safe_delete_offset:{safe_delete_offset}"); + // Sync snapshot first. synchronizer .sync(snapshot) diff --git a/wal/src/rocks_impl/config.rs b/wal/src/rocks_impl/config.rs index 439313c144..966720f094 100644 --- a/wal/src/rocks_impl/config.rs +++ b/wal/src/rocks_impl/config.rs @@ -2,13 +2,26 @@ //! RocksDB Config +use common_util::config::ReadableSize; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(default)] pub struct Config { + pub max_subcompactions: u32, pub max_background_jobs: i32, pub enable_statistics: bool, + pub write_buffer_size: ReadableSize, + pub max_write_buffer_number: i32, + // Number of files to trigger level-0 compaction. A value <0 means that level-0 compaction will + // not be triggered by number of files at all. + pub level_zero_file_num_compaction_trigger: i32, + // Soft limit on number of level-0 files. We start slowing down writes at this point. A value + // <0 means that no writing slow down will be triggered by number of files in level-0. + pub level_zero_slowdown_writes_trigger: i32, + // Maximum number of level-0 files. We stop writes at this point. 
+ pub level_zero_stop_writes_trigger: i32, + pub fifo_compaction_max_table_files_size: ReadableSize, } impl Default for Config { @@ -16,8 +29,16 @@ impl Default for Config { Self { // Same with rocksdb // https://github.com/facebook/rocksdb/blob/v6.4.6/include/rocksdb/options.h#L537 + max_subcompactions: 1, max_background_jobs: 2, enable_statistics: false, + write_buffer_size: ReadableSize::mb(64), + max_write_buffer_number: 2, + level_zero_file_num_compaction_trigger: 4, + level_zero_slowdown_writes_trigger: 20, + level_zero_stop_writes_trigger: 36, + // default is 1G, use 0 to disable fifo + fifo_compaction_max_table_files_size: ReadableSize::gb(0), } } } diff --git a/wal/src/rocks_impl/manager.rs b/wal/src/rocks_impl/manager.rs index b8a97b8e7a..2e4e70cf00 100644 --- a/wal/src/rocks_impl/manager.rs +++ b/wal/src/rocks_impl/manager.rs @@ -19,7 +19,10 @@ use common_types::{ }; use common_util::{error::BoxError, runtime::Runtime}; use log::{debug, info, warn}; -use rocksdb::{DBIterator, DBOptions, ReadOptions, SeekKey, Statistics, Writable, WriteBatch, DB}; +use rocksdb::{ + rocksdb_options::ColumnFamilyDescriptor, ColumnFamilyOptions, DBCompactionStyle, DBIterator, + DBOptions, FifoCompactionOptions, ReadOptions, SeekKey, Statistics, Writable, WriteBatch, DB, +}; use snafu::ResultExt; use tokio::sync::Mutex; @@ -525,8 +528,15 @@ impl RocksImpl { pub struct Builder { wal_path: String, runtime: Arc, + max_subcompactions: Option, max_background_jobs: Option, enable_statistics: Option, + write_buffer_size: Option, + max_write_buffer_number: Option, + level_zero_file_num_compaction_trigger: Option, + level_zero_slowdown_writes_trigger: Option, + level_zero_stop_writes_trigger: Option, + fifo_compaction_max_table_files_size: Option, } impl Builder { @@ -535,11 +545,23 @@ impl Builder { Self { wal_path: wal_path.to_str().unwrap().to_owned(), runtime, + max_subcompactions: None, max_background_jobs: None, enable_statistics: None, + write_buffer_size: None, + max_write_buffer_number: None, + level_zero_file_num_compaction_trigger: None, + level_zero_slowdown_writes_trigger: None, + level_zero_stop_writes_trigger: None, + fifo_compaction_max_table_files_size: None, } } + pub fn max_subcompactions(mut self, v: u32) -> Self { + self.max_subcompactions = Some(v); + self + } + pub fn max_background_jobs(mut self, v: i32) -> Self { self.max_background_jobs = Some(v); self @@ -550,10 +572,43 @@ impl Builder { self } + pub fn write_buffer_size(mut self, v: u64) -> Self { + self.write_buffer_size = Some(v); + self + } + + pub fn max_write_buffer_number(mut self, v: i32) -> Self { + self.max_write_buffer_number = Some(v); + self + } + + pub fn level_zero_file_num_compaction_trigger(mut self, v: i32) -> Self { + self.level_zero_file_num_compaction_trigger = Some(v); + self + } + + pub fn level_zero_slowdown_writes_trigger(mut self, v: i32) -> Self { + self.level_zero_slowdown_writes_trigger = Some(v); + self + } + + pub fn level_zero_stop_writes_trigger(mut self, v: i32) -> Self { + self.level_zero_stop_writes_trigger = Some(v); + self + } + + pub fn fifo_compaction_max_table_files_size(mut self, v: u64) -> Self { + self.fifo_compaction_max_table_files_size = Some(v); + self + } + pub fn build(self) -> Result { let mut rocksdb_config = DBOptions::default(); rocksdb_config.create_if_missing(true); + if let Some(v) = self.max_subcompactions { + rocksdb_config.set_max_subcompactions(v); + } if let Some(v) = self.max_background_jobs { rocksdb_config.set_max_background_jobs(v); } @@ -566,7 +621,38 @@ impl Builder 
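All of the new RocksDB knobs are plain serde fields with defaults, so they can be overridden from the WAL section of the TOML config. A hedged sketch with illustrative values; the section name is an assumption (the exact config path is not shown in this patch), and the `ReadableSize` fields take human-readable sizes:

    # section name assumed; place these under the existing RocksDB WAL config
    [analytic.wal]
    max_subcompactions = 2
    max_background_jobs = 4
    write_buffer_size = "128MB"
    max_write_buffer_number = 4
    level_zero_file_num_compaction_trigger = 4
    level_zero_slowdown_writes_trigger = 20
    level_zero_stop_writes_trigger = 36
    # any non-zero size switches the column family to FIFO compaction
    fifo_compaction_max_table_files_size = "2GB"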
{ None }; - let db = DB::open(rocksdb_config, &self.wal_path) + let mut cf_opts = ColumnFamilyOptions::new(); + if let Some(v) = self.write_buffer_size { + cf_opts.set_write_buffer_size(v); + } + if let Some(v) = self.max_write_buffer_number { + cf_opts.set_max_write_buffer_number(v); + } + if let Some(v) = self.level_zero_file_num_compaction_trigger { + cf_opts.set_level_zero_file_num_compaction_trigger(v); + } + if let Some(v) = self.level_zero_slowdown_writes_trigger { + cf_opts.set_level_zero_slowdown_writes_trigger(v); + } + if let Some(v) = self.level_zero_stop_writes_trigger { + cf_opts.set_level_zero_stop_writes_trigger(v); + } + + // FIFO compaction strategy let rocksdb looks like a message queue. + if let Some(v) = self.fifo_compaction_max_table_files_size { + if v > 0 { + let mut fifo_opts = FifoCompactionOptions::new(); + fifo_opts.set_max_table_files_size(v); + cf_opts.set_fifo_compaction_options(fifo_opts); + cf_opts.set_compaction_style(DBCompactionStyle::Fifo); + } + } + + let default_cfd = ColumnFamilyDescriptor { + options: cf_opts, + ..ColumnFamilyDescriptor::default() + }; + let db = DB::open_cf(rocksdb_config, &self.wal_path, vec![default_cfd]) .map_err(|e| e.into()) .context(Open { wal_path: self.wal_path.clone(), From 305df31d170a81e20e035d7422708ba60051f308 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 15:45:26 +0800 Subject: [PATCH 02/18] update --- Cargo.lock | 70 +++--------------------------------------------------- 1 file changed, 3 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 466fe8b216..5c45b2db7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -85,11 +85,7 @@ dependencies = [ "async-trait", "base64 0.13.1", "bytes", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "datafusion", @@ -1062,11 +1058,7 @@ checksum = "f5f27e14a7a0c030015c0fdb06c59c46cd6f9765e381bd920e02ff316b3be48b" dependencies = [ "arrow", "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.5", -======= "ceresdbproto 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", ->>>>>>> 0abc9181 (update pb) "dashmap 5.4.0", "futures 0.3.28", "paste 1.0.12", @@ -1092,8 +1084,9 @@ dependencies = [ [[package]] name = "ceresdbproto" -version = "1.0.4" -source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=53f5c74a54d8a08ebb08c41e8b862b2369df4a02#53f5c74a54d8a08ebb08c41e8b862b2369df4a02" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" dependencies = [ "prost", "protoc-bin-vendored", @@ -1105,12 +1098,7 @@ dependencies = [ [[package]] name = "ceresdbproto" version = "1.0.5" -<<<<<<< HEAD -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" -======= source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" ->>>>>>> 0abc9181 (update pb) dependencies = [ "prost", "protoc-bin-vendored", @@ -1263,11 +1251,7 @@ name = "cluster" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) 
"common_types", "common_util", "etcd-client", @@ -1320,11 +1304,7 @@ dependencies = [ "arrow_ext", "byteorder", "bytes_ext", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "datafusion", "murmur3", @@ -1343,11 +1323,7 @@ version = "1.2.2" dependencies = [ "arrow", "backtrace", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "common_types", "crossbeam-utils 0.8.15", @@ -3460,11 +3436,7 @@ name = "meta_client" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -3953,11 +3925,7 @@ version = "1.2.2" dependencies = [ "async-trait", "bytes", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "clru", "common_types", @@ -4755,11 +4723,7 @@ dependencies = [ "async-trait", "bytes", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -4872,11 +4836,7 @@ dependencies = [ "arrow", "async-trait", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -5184,11 +5144,7 @@ version = "1.2.2" dependencies = [ "arrow_ext", "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -5314,11 +5270,7 @@ name = "router" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "cluster", "common_types", "common_util", @@ -5673,11 +5625,7 @@ dependencies = [ "async-trait", "bytes", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "clru", "cluster", "common_types", @@ -6216,11 +6164,7 @@ dependencies = [ "arrow", "async-trait", "catalog", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", "common_util", "futures 0.3.28", @@ -6239,11 +6183,7 @@ dependencies = [ "arrow", "arrow_ext", "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "common_types", 
"common_util", "datafusion", @@ -7057,11 +6997,7 @@ name = "wal" version = "1.2.2" dependencies = [ "async-trait", -<<<<<<< HEAD - "ceresdbproto 1.0.4", -======= "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", ->>>>>>> 0abc9181 (update pb) "chrono", "common_types", "common_util", From d885790c4b28416c27029aa58a3ea7d117a45b0b Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 15:50:50 +0800 Subject: [PATCH 03/18] remove comment --- Cargo.lock | 1 - common_types/src/column.rs | 7 ------- 2 files changed, 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 662e846f41..5c45b2db7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1099,7 +1099,6 @@ dependencies = [ name = "ceresdbproto" version = "1.0.5" source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" - dependencies = [ "prost", "protoc-bin-vendored", diff --git a/common_types/src/column.rs b/common_types/src/column.rs index a2580a47c2..9e9a390fd7 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -375,13 +375,6 @@ impl_dedup!(VarbinaryColumn); impl_dedup!(StringColumn); impl StringDictionaryColumn { - #[doc = " If datum i is not equal to previous datum i - 1, mark `selected[i]` to"] - #[doc = " true."] - #[doc = ""] - #[doc = " The first datum is marked to true."] - #[doc = ""] - #[doc = " The size of selected must equal to the size of this column and"] - #[doc = " initialized to false."] #[allow(clippy::float_cmp)] pub fn dedup(&self, selected: &mut [bool]) { if self.0.is_empty() { From 1e1437e58d522d35efabf7436b891f36f20ab5fd Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 16:26:27 +0800 Subject: [PATCH 04/18] remove sdk related code --- interpreters/src/describe.rs | 4 ---- interpreters/src/show_create.rs | 5 ----- interpreters/src/tests.rs | 21 ++++++++++----------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/interpreters/src/describe.rs b/interpreters/src/describe.rs index cf415cc6e0..c0944ed81b 100644 --- a/interpreters/src/describe.rs +++ b/interpreters/src/describe.rs @@ -46,14 +46,12 @@ impl DescribeInterpreter { let mut is_primary_keys = Vec::with_capacity(num_columns); let mut is_nullables = Vec::with_capacity(num_columns); let mut is_tags = Vec::with_capacity(num_columns); - let mut is_dictionarys = Vec::with_capacity(num_columns); for (idx, col) in table_schema.columns().iter().enumerate() { names.push(col.name.to_string()); types.push(col.data_type.to_string()); is_primary_keys.push(table_schema.is_primary_key_index(&idx)); is_nullables.push(col.is_nullable); is_tags.push(col.is_tag); - is_dictionarys.push(col.is_dictionary); } let schema = Schema::new(vec![ @@ -62,7 +60,6 @@ impl DescribeInterpreter { Field::new("is_primary", DataType::Boolean, false), Field::new("is_nullable", DataType::Boolean, false), Field::new("is_tag", DataType::Boolean, false), - Field::new("is_dictionary", DataType::Boolean, false), ]); let arrow_record_batch = RecordBatch::try_new( @@ -73,7 +70,6 @@ impl DescribeInterpreter { Arc::new(BooleanArray::from(is_primary_keys)), Arc::new(BooleanArray::from(is_nullables)), Arc::new(BooleanArray::from(is_tags)), - Arc::new(BooleanArray::from(is_dictionarys)), ], ) .unwrap(); diff --git a/interpreters/src/show_create.rs b/interpreters/src/show_create.rs index a6e4b3dcdf..29b6048355 100644 --- a/interpreters/src/show_create.rs +++ b/interpreters/src/show_create.rs @@ 
-86,11 +86,6 @@ impl ShowCreateInterpreter { if col.is_tag { res += " TAG"; } - - if col.is_dictionary { - res += " DICTIONARY"; - } - if !col.is_nullable { res += " NOT NULL"; } diff --git a/interpreters/src/tests.rs b/interpreters/src/tests.rs index 6a4473eee6..2b2f93b184 100644 --- a/interpreters/src/tests.rs +++ b/interpreters/src/tests.rs @@ -170,18 +170,17 @@ where let sql = "desc table test_table"; let output = self.sql_to_output(sql).await.unwrap(); let records = output.try_into().unwrap(); - // todo this maybe need to change let expected = vec![ - "+--------+-----------+------------+-------------+--------+---------------+", - "| name | type | is_primary | is_nullable | is_tag | is_dictionary |", - "+--------+-----------+------------+-------------+--------+---------------+", - "| key1 | varbinary | true | false | false | false |", - "| key2 | timestamp | true | false | false | false |", - "| field1 | double | false | true | false | false |", - "| field2 | string | false | true | false | false |", - "| field3 | date | false | true | false | false |", - "| field4 | time | false | true | false | false |", - "+--------+-----------+------------+-------------+--------+---------------+", + "+--------+-----------+------------+-------------+--------+", + "| name | type | is_primary | is_nullable | is_tag |", + "+--------+-----------+------------+-------------+--------+", + "| key1 | varbinary | true | false | false |", + "| key2 | timestamp | true | false | false |", + "| field1 | double | false | true | false |", + "| field2 | string | false | true | false |", + "| field3 | date | false | true | false |", + "| field4 | time | false | true | false |", + "+--------+-----------+------------+-------------+--------+", ]; common_util::record_batch::assert_record_batches_eq(&expected, records); } From 6f9d38f387fd620732123070527cc7d7855ab60f Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 16:44:29 +0800 Subject: [PATCH 05/18] update result --- integration_tests/cases/env/local/ddl/create_tables.result | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/integration_tests/cases/env/local/ddl/create_tables.result b/integration_tests/cases/env/local/ddl/create_tables.result index ede67e9c1a..87779ac52b 100644 --- a/integration_tests/cases/env/local/ddl/create_tables.result +++ b/integration_tests/cases/env/local/ddl/create_tables.result @@ -48,7 +48,7 @@ affected_rows: 0 CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. 
Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"c1\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"c1\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t." }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: CREATE TABLE `05_create_tables_t`(c1 int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"c1\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"c1\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t." }) create table `05_create_tables_t2`(a int, b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic with (enable_ttl='false'); @@ -67,11 +67,11 @@ Int32(4), create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. 
Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." }) create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; -Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. 
Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." }) +Failed to execute query, err: Server(ServerError { code: 500, msg: "Failed to execute plan, sql: create table `05_create_tables_t2`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic;. Caused by: Internal error, msg:Failed to execute interpreter, err:Failed to execute create table, err:Failed to create table by table manipulator, err:Failed to operate table, err:Failed to operate table, msg:Some(\"failed to create table on shard, request:CreateTableRequest { catalog_name: \\\"ceresdb\\\", schema_name: \\\"public\\\", table_name: \\\"05_create_tables_t2\\\", table_id: None, table_schema: Schema { timestamp_index: 1, tsid_index: Some(0), column_schemas: ColumnSchemas { columns: [ColumnSchema { id: 1, name: \\\"tsid\\\", data_type: UInt64, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"tsid\\\", default_value: None }, ColumnSchema { id: 2, name: \\\"t\\\", data_type: Timestamp, is_nullable: false, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"t\\\", default_value: None }, ColumnSchema { id: 3, name: \\\"a\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"a\\\", default_value: None }, ColumnSchema { id: 4, name: \\\"b\\\", data_type: Int32, is_nullable: true, is_tag: false, is_dictionary: false, comment: \\\"\\\", escaped_name: \\\"b\\\", default_value: None }] }, version: 1, primary_key_indexes: [0, 1] }, engine: \\\"Analytic\\\", options: {}, state: Stable, shard_id: 0, partition_info: None }\"), err:Failed to create table, table already exists, table:05_create_tables_t2." 
}) create table `05_create_tables_t3`(a int,b int, t timestamp NOT NULL, TIMESTAMP KEY(t)) ENGINE = Analytic; From 72170a363c94229784d28c9272a59f35e3a0dab2 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 28 Jun 2023 16:56:33 +0800 Subject: [PATCH 06/18] update ceresdb proto version --- Cargo.lock | 46 +++++++++++++++++----------------------------- Cargo.toml | 3 +-- 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c45b2db7d..e1430f278e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -85,7 +85,7 @@ dependencies = [ "async-trait", "base64 0.13.1", "bytes", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "datafusion", @@ -1058,7 +1058,7 @@ checksum = "f5f27e14a7a0c030015c0fdb06c59c46cd6f9765e381bd920e02ff316b3be48b" dependencies = [ "arrow", "async-trait", - "ceresdbproto 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)", + "ceresdbproto", "dashmap 5.4.0", "futures 0.3.28", "paste 1.0.12", @@ -1084,21 +1084,9 @@ dependencies = [ [[package]] name = "ceresdbproto" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbfdcd9746d2b027e2880ef80bb6c5735ea45ad590f21b2cd2168eb11ba66f7a" -dependencies = [ - "prost", - "protoc-bin-vendored", - "tonic 0.8.3", - "tonic-build", - "walkdir", -] - -[[package]] -name = "ceresdbproto" -version = "1.0.5" -source = "git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39#6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" +checksum = "81229e82e9afa8318e7f765cc01cd15f7380786699f4c7beceec7540e0488d7e" dependencies = [ "prost", "protoc-bin-vendored", @@ -1251,7 +1239,7 @@ name = "cluster" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "etcd-client", @@ -1304,7 +1292,7 @@ dependencies = [ "arrow_ext", "byteorder", "bytes_ext", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "datafusion", "murmur3", @@ -1323,7 +1311,7 @@ version = "1.2.2" dependencies = [ "arrow", "backtrace", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "common_types", "crossbeam-utils 0.8.15", @@ -3436,7 +3424,7 @@ name = "meta_client" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "futures 0.3.28", @@ -3925,7 +3913,7 @@ version = "1.2.2" dependencies = [ "async-trait", "bytes", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "clru", "common_types", @@ -4723,7 +4711,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "clru", "cluster", "common_types", @@ -4836,7 +4824,7 @@ dependencies = [ "arrow", "async-trait", "catalog", - "ceresdbproto 1.0.5 
(git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "cluster", "common_types", "common_util", @@ -5144,7 +5132,7 @@ version = "1.2.2" dependencies = [ "arrow_ext", "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "futures 0.3.28", @@ -5270,7 +5258,7 @@ name = "router" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "cluster", "common_types", "common_util", @@ -5625,7 +5613,7 @@ dependencies = [ "async-trait", "bytes", "catalog", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "clru", "cluster", "common_types", @@ -6164,7 +6152,7 @@ dependencies = [ "arrow", "async-trait", "catalog", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "futures 0.3.28", @@ -6183,7 +6171,7 @@ dependencies = [ "arrow", "arrow_ext", "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "common_types", "common_util", "datafusion", @@ -6997,7 +6985,7 @@ name = "wal" version = "1.2.2" dependencies = [ "async-trait", - "ceresdbproto 1.0.5 (git+https://github.com/tanruixiang/ceresdbproto.git?rev=6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39)", + "ceresdbproto", "chrono", "common_types", "common_util", diff --git a/Cargo.toml b/Cargo.toml index cfb2a37dbd..d94dd002f4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,8 +68,7 @@ bytes = "1.1.0" bytes_ext = { path = "components/bytes_ext" } catalog = { path = "catalog" } catalog_impls = { path = "catalog_impls" } -ceresdbproto = { git = "https://github.com/tanruixiang/ceresdbproto.git", rev = "6a03a3fe5a3de32a96ad4ede87a7d5350e7a3a39" } - +ceresdbproto = "1.0" chrono = "0.4" clap = "3.0" clru = "0.6.1" From 41b39704883fd71b72424e74f10add1ac07ea29a Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 10:52:25 +0800 Subject: [PATCH 07/18] adjust testcase and comment --- analytic_engine/src/sst/parquet/hybrid.rs | 2 +- analytic_engine/src/sst/parquet/writer.rs | 239 +++++++--------------- common_types/src/column.rs | 7 +- common_types/src/column_schema.rs | 2 +- common_types/src/tests.rs | 42 ++-- df_operator/src/udfs/time_bucket.rs | 2 +- proxy/src/write.rs | 4 +- query_frontend/src/planner.rs | 12 +- table_engine/src/memory.rs | 2 +- 9 files changed, 105 insertions(+), 207 deletions(-) diff --git a/analytic_engine/src/sst/parquet/hybrid.rs b/analytic_engine/src/sst/parquet/hybrid.rs index 1cf7481ecf..ee86496f48 100644 --- a/analytic_engine/src/sst/parquet/hybrid.rs +++ b/analytic_engine/src/sst/parquet/hybrid.rs @@ -127,7 +127,7 @@ pub fn build_hybrid_arrow_schema(schema: &Schema) -> ArrowSchemaRef { field.data_type().clone(), true, ))); - // TODO is there need to use new_dict? + // TODO(tanruixiang) is there need to use new_dict? 
Arc::new(Field::new(field.name(), field_type, true)) } else { field.clone() diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 378854bafa..3d5e896c06 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -333,7 +333,7 @@ mod tests { use common_types::{ bytes::Bytes, projected_schema::ProjectedSchema, - tests::{build_row, build_row_for_dictionary, build_schema, build_schema_for_dictionary}, + tests::{build_row, build_row_for_dictionary, build_schema, build_schema_with_dictionary}, time::{TimeRange, Timestamp}, }; use common_util::{ @@ -358,7 +358,18 @@ mod tests { table_options::{self, StorageFormatHint}, }; - fn write_parquet_with_dictionary_encode_and_read_back( + #[test] + fn test_parquet_build_and_read() { + init_log_for_test(); + + let runtime = Arc::new(runtime::Builder::default().build().unwrap()); + parquet_write_and_then_read_back(runtime.clone(), 2, vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2]); + parquet_write_and_then_read_back(runtime.clone(), 3, vec![3, 3, 3, 3, 3, 3, 2]); + parquet_write_and_then_read_back(runtime.clone(), 4, vec![4, 4, 4, 4, 4]); + parquet_write_and_then_read_back(runtime, 5, vec![5, 5, 5, 5]); + } + + fn parquet_write_and_then_read_back( runtime: Arc, num_rows_per_row_group: usize, expected_num_rows: Vec, @@ -376,9 +387,9 @@ mod tests { let root = dir.path(); let store: ObjectStoreRef = Arc::new(LocalFileSystem::new_with_prefix(root).unwrap()); let store_picker: ObjectStorePickerRef = Arc::new(store); - let sst_file_path = Path::from("test_dictionary.par"); + let sst_file_path = Path::from("data.par"); - let schema = build_schema_for_dictionary(); + let schema = build_schema_with_dictionary(); let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); let sst_meta = MetaData { min_key: Bytes::from_static(b"100"), @@ -395,16 +406,45 @@ mod tests { } counter -= 1; + // reach here when counter is 9 7 5 3 1 let ts = 100 + counter; let rows = vec![ - build_row_for_dictionary(1, ts, Some("tagv1"), "tagv2", 1), - build_row_for_dictionary(2, ts, Some("tagv2"), "tagv2", 2), - build_row_for_dictionary(3, ts, None, "tagv3", 3), - build_row_for_dictionary(4, ts, Some("tagv3"), "tagv2", 2), + build_row_for_dictionary( + b"a", + ts, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tagv1"), + "tagv2", + ), + build_row_for_dictionary( + b"b", + ts, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tagv2"), + "tagv4", + ), + build_row_for_dictionary(b"c", ts, 10.0, "v4", 1000, 1_000_000, None, "tagv2"), + build_row_for_dictionary( + b"d", + ts, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tagv3"), + "tagv2", + ), ]; let batch = build_record_batch_with_key(schema.clone(), rows); Poll::Ready(Some(Ok(batch))) })); + let mut writer = sst_factory .create_writer( &sst_write_options, @@ -466,186 +506,55 @@ mod tests { Box::new(reader) }; + let mut stream = reader.read().await.unwrap(); let mut expect_rows = vec![]; for counter in &[4, 3, 2, 1, 0] { expect_rows.push(build_row_for_dictionary( - 1, + b"a", 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, Some("tagv1"), "tagv2", - 1, )); expect_rows.push(build_row_for_dictionary( - 2, + b"b", 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, Some("tagv2"), + "tagv4", + )); + expect_rows.push(build_row_for_dictionary( + b"c", + 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, + None, "tagv2", - 2, )); - expect_rows.push(build_row_for_dictionary(3, 100 + counter, None, "tagv3", 3)); expect_rows.push(build_row_for_dictionary( 
- 4, + b"d", 100 + counter, + 10.0, + "v4", + 1000, + 1_000_000, Some("tagv3"), "tagv2", - 2, )); } check_stream(&mut stream, expect_rows).await; }); } - // TODO(xikai): add test for reverse reader - #[test] - fn test_parquet_use_dictionary() { - init_log_for_test(); - - let runtime = Arc::new(runtime::Builder::default().build().unwrap()); - write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 5, vec![5, 5, 5, 5]); - write_parquet_with_dictionary_encode_and_read_back(runtime.clone(), 4, vec![4, 4, 4, 4, 4]); - write_parquet_with_dictionary_encode_and_read_back( - runtime.clone(), - 3, - vec![3, 3, 3, 3, 3, 3, 2], - ); - write_parquet_with_dictionary_encode_and_read_back( - runtime, - 2, - vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2], - ); - } - #[test] - fn test_parquet_build_and_read() { - init_log_for_test(); - - let runtime = Arc::new(runtime::Builder::default().build().unwrap()); - parquet_write_and_then_read_back(runtime.clone(), 3, vec![3, 3, 3, 3, 3]); - parquet_write_and_then_read_back(runtime.clone(), 4, vec![4, 4, 4, 3]); - parquet_write_and_then_read_back(runtime, 5, vec![5, 5, 5]); - } - - fn parquet_write_and_then_read_back( - runtime: Arc, - num_rows_per_row_group: usize, - expected_num_rows: Vec, - ) { - runtime.block_on(async { - let sst_factory = FactoryImpl; - let sst_write_options = SstWriteOptions { - storage_format_hint: StorageFormatHint::Auto, - num_rows_per_row_group, - compression: table_options::Compression::Uncompressed, - max_buffer_size: 0, - }; - - let dir = tempdir().unwrap(); - let root = dir.path(); - let store: ObjectStoreRef = Arc::new(LocalFileSystem::new_with_prefix(root).unwrap()); - let store_picker: ObjectStorePickerRef = Arc::new(store); - let sst_file_path = Path::from("data.par"); - - let schema = build_schema(); - let reader_projected_schema = ProjectedSchema::no_projection(schema.clone()); - let sst_meta = MetaData { - min_key: Bytes::from_static(b"100"), - max_key: Bytes::from_static(b"200"), - time_range: TimeRange::new_unchecked(Timestamp::new(1), Timestamp::new(2)), - max_sequence: 200, - schema: schema.clone(), - }; - - let mut counter = 5; - let record_batch_stream = Box::new(stream::poll_fn(move |_| -> Poll> { - if counter == 0 { - return Poll::Ready(None); - } - counter -= 1; - - // reach here when counter is 9 7 5 3 1 - let ts = 100 + counter; - let rows = vec![ - build_row(b"a", ts, 10.0, "v4", 1000, 1_000_000), - build_row(b"b", ts, 10.0, "v4", 1000, 1_000_000), - build_row(b"c", ts, 10.0, "v4", 1000, 1_000_000), - ]; - let batch = build_record_batch_with_key(schema.clone(), rows); - Poll::Ready(Some(Ok(batch))) - })); - - let mut writer = sst_factory - .create_writer( - &sst_write_options, - &sst_file_path, - &store_picker, - Level::MAX, - ) - .await - .unwrap(); - let sst_info = writer - .write(RequestId::next_id(), &sst_meta, record_batch_stream) - .await - .unwrap(); - - assert_eq!(15, sst_info.row_num); - - let scan_options = ScanOptions::default(); - // read sst back to test - let sst_read_options = SstReadOptions { - reverse: false, - frequency: ReadFrequency::Frequent, - num_rows_per_row_group: 5, - projected_schema: reader_projected_schema, - predicate: Arc::new(Predicate::empty()), - meta_cache: None, - scan_options, - runtime: runtime.clone(), - }; - - let mut reader: Box = { - let mut reader = AsyncParquetReader::new( - &sst_file_path, - &sst_read_options, - None, - &store_picker, - None, - ); - let mut sst_meta_readback = reader - .meta_data() - .await - .unwrap() - .as_parquet() - .unwrap() - .as_ref() - .clone(); - 
// sst filter is built insider sst writer, so overwrite to default for - // comparison. - sst_meta_readback.parquet_filter = Default::default(); - assert_eq!(&sst_meta_readback, &ParquetMetaData::from(sst_meta)); - assert_eq!( - expected_num_rows, - reader - .row_groups() - .await - .iter() - .map(|g| g.num_rows()) - .collect::>() - ); - - Box::new(reader) - }; - - let mut stream = reader.read().await.unwrap(); - let mut expect_rows = vec![]; - for counter in &[4, 3, 2, 1, 0] { - expect_rows.push(build_row(b"a", 100 + counter, 10.0, "v4", 1000, 1_000_000)); - expect_rows.push(build_row(b"b", 100 + counter, 10.0, "v4", 1000, 1_000_000)); - expect_rows.push(build_row(b"c", 100 + counter, 10.0, "v4", 1000, 1_000_000)); - } - check_stream(&mut stream, expect_rows).await; - }); - } - #[tokio::test] async fn test_fetch_row_group() { // rows per group: 10 diff --git a/common_types/src/column.rs b/common_types/src/column.rs index 9e9a390fd7..b1656e2cfc 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -292,7 +292,7 @@ impl_column!( impl_column!(StringColumn, get_string_datum, get_string_datum_view); impl StringDictionaryColumn { - #[doc = " Get datum by index."] + /// Get datum by index pub fn datum_opt(&self, index: usize) -> Option { if index >= self.0.len() { return None; @@ -311,7 +311,7 @@ impl StringDictionaryColumn { if self.0.is_null(index) { return DatumView::Null; } - // TODO : Is this the efficient way? + // TODO(tanruixiang): Is this the efficient way? DatumView::String(self.0.downcast_dict::().unwrap().value(index)) } @@ -319,7 +319,7 @@ impl StringDictionaryColumn { if self.0.is_null(index) { return Datum::Null; } - // TODO : Is this the efficient way? + // TODO(tanruixiang): Is this the efficient way? Datum::String( self.0 .downcast_dict::() @@ -375,7 +375,6 @@ impl_dedup!(VarbinaryColumn); impl_dedup!(StringColumn); impl StringDictionaryColumn { - #[allow(clippy::float_cmp)] pub fn dedup(&self, selected: &mut [bool]) { if self.0.is_empty() { return; diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 6deaefa5c4..606d04eaf5 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -336,7 +336,7 @@ impl From<&ColumnSchema> for Field { col_schema.is_nullable, col_schema.id.into(), false, - // Todo how to use dict_is_ordered + // TODO(tanruixiang) how to use dict_is_ordered ) } else { Field::new( diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index fd81255da2..0839c6c121 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -145,24 +145,12 @@ pub fn build_schema() -> Schema { pub fn build_default_value_schema() -> Schema { default_value_schema_builder().build().unwrap() } + /// Build a schema for testing: -/// (tsid(uint64), key2(timestamp), tag1(string), tag2(string), value(int8), -/// field2(float)) -pub fn build_schema_for_dictionary() -> Schema { - let builder = schema::Builder::new() - .auto_increment_column_id(true) - .add_key_column( - column_schema::Builder::new(TSID_COLUMN.to_string(), DatumKind::UInt64) - .build() - .unwrap(), - ) - .unwrap() - .add_key_column( - column_schema::Builder::new("time".to_string(), DatumKind::Timestamp) - .build() - .unwrap(), - ) - .unwrap() +/// (key1(varbinary), key2(timestamp), field1(double), field2(string), +/// field3(date), field4(time)) tag1(string dictionary), tag2(string dictionary) +pub fn build_schema_with_dictionary() -> Schema { + let builder = base_schema_builder() .add_normal_column( 
column_schema::Builder::new("tag1".to_string(), DatumKind::String) .is_tag(true) @@ -179,12 +167,6 @@ pub fn build_schema_for_dictionary() -> Schema { .build() .unwrap(), ) - .unwrap() - .add_normal_column( - column_schema::Builder::new("value".to_string(), DatumKind::Int8) - .build() - .unwrap(), - ) .unwrap(); builder.build().unwrap() @@ -239,19 +221,25 @@ pub fn build_schema_for_cpu() -> Schema { } pub fn build_row_for_dictionary( - key1: u64, + key1: &[u8], key2: i64, + field1: f64, + field2: &str, + field3: i32, + field4: i64, tag1: Option<&str>, tag2: &str, - value: i8, ) -> Row { let datums = vec![ - Datum::UInt64(key1), + Datum::Varbinary(Bytes::copy_from_slice(key1)), Datum::Timestamp(Timestamp::new(key2)), + Datum::Double(field1), + Datum::String(StringBytes::from(field2)), + Datum::Date(field3), + Datum::Time(field4), tag1.map(|v| Datum::String(StringBytes::from(v))) .unwrap_or(Datum::Null), Datum::String(StringBytes::from(tag2)), - Datum::Int8(value), ]; Row::from_datums(datums) } diff --git a/df_operator/src/udfs/time_bucket.rs b/df_operator/src/udfs/time_bucket.rs index bb4c6b29bb..29d2932aff 100644 --- a/df_operator/src/udfs/time_bucket.rs +++ b/df_operator/src/udfs/time_bucket.rs @@ -141,7 +141,7 @@ impl<'a> TimeBucket<'a> { } fn call(&self) -> Result { - // TODO mising is_dictionary params + // TODO(tanruixiang) : mising is_dictionary params let mut out_column_builder = ColumnBlockBuilder::with_capacity(&DatumKind::Timestamp, self.column.num_rows(), false); for ts_opt in self.column.iter() { diff --git a/proxy/src/write.rs b/proxy/src/write.rs index 008489bfdc..0cae62fbb7 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -684,7 +684,7 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; } @@ -701,7 +701,7 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? build_column(&mut columns, schema, field_name, &field.value, false, false)?; } } diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 2baf01be10..30f1ad6e74 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -431,7 +431,8 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - // Todo is_dictionary set true or false ? Do we need modify the pb ? + // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb + // ? ensure_data_type_compatible( table, tag_name, @@ -442,7 +443,8 @@ pub fn build_schema_from_write_table_request( )?; } - // Todo is_dictionary set true or false ? Do we need modify the pb ? + // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb + // ? let column_schema = build_column_schema(tag_name, data_type, true, false)?; name_column_map.insert(tag_name, column_schema); } @@ -469,7 +471,7 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? 
ensure_data_type_compatible( table, field_name, @@ -479,7 +481,7 @@ pub fn build_schema_from_write_table_request( column_schema, )?; } - // todo is_dictionary set true or false ? + // TODO(tanruixiang) : is_dictionary set true or false ? let column_schema = build_column_schema(field_name, data_type, false, false)?; name_column_map.insert(field_name, column_schema); } @@ -530,7 +532,7 @@ fn ensure_data_type_compatible( data_type: DatumKind, column_schema: &ColumnSchema, ) -> Result<()> { - // Todo how to check is_dictionary ? + // TODO(tanruixiang) : how to check is_dictionary ? ensure!( column_schema.is_tag == is_tag, InvalidWriteEntry { diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 5e94d20d1b..846a6e18b3 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,7 +244,7 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - // TODO ensure there don't use is_dictionary and the datum.clone() is necessary + // TODO(tanruixiang) : ensure there don't use is_dictionary and the datum.clone() is necessary // ? let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { From 8f705c83cf95491ace35bab7de35b2bafd53728d Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 11:01:30 +0800 Subject: [PATCH 08/18] update --- analytic_engine/src/sst/parquet/writer.rs | 2 ++ table_engine/src/memory.rs | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 3d5e896c06..3040746efc 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -358,6 +358,8 @@ mod tests { table_options::{self, StorageFormatHint}, }; + // TODO(xikai): add test for reverse reader + #[test] fn test_parquet_build_and_read() { init_log_for_test(); diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 846a6e18b3..0755552398 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,8 +244,8 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - // TODO(tanruixiang) : ensure there don't use is_dictionary and the datum.clone() is necessary - // ? + // TODO(tanruixiang) : ensure there don't use is_dictionary and the + // datum.clone() is necessary ? 
let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { builder From 736867ba4a03a102a7aefbf4eee843adcc498f55 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 11:15:53 +0800 Subject: [PATCH 09/18] fmt --- analytic_engine/src/sst/parquet/writer.rs | 2 +- common_types/src/column.rs | 68 +++++++++++++++++++---- 2 files changed, 57 insertions(+), 13 deletions(-) diff --git a/analytic_engine/src/sst/parquet/writer.rs b/analytic_engine/src/sst/parquet/writer.rs index 3040746efc..0e1b5f3658 100644 --- a/analytic_engine/src/sst/parquet/writer.rs +++ b/analytic_engine/src/sst/parquet/writer.rs @@ -359,7 +359,7 @@ mod tests { }; // TODO(xikai): add test for reverse reader - + #[test] fn test_parquet_build_and_read() { init_log_for_test(); diff --git a/common_types/src/column.rs b/common_types/src/column.rs index b1656e2cfc..f10b1aa35a 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -1201,7 +1201,7 @@ impl ColumnBlockBuilder { mod tests { use super::*; use crate::tests::{ - build_row_for_dictionary, build_rows, build_schema, build_schema_for_dictionary, + build_row_for_dictionary, build_rows, build_schema, build_schema_with_dictionary, }; #[test] @@ -1244,28 +1244,72 @@ mod tests { #[test] fn test_column_block_string_dictionary_builder() { - let schema = build_schema_for_dictionary(); + let schema = build_schema_with_dictionary(); let rows = vec![ - build_row_for_dictionary(1, 1, Some("tag1_1"), "tag2_1", 1), - build_row_for_dictionary(2, 2, Some("tag1_2"), "tag2_2", 2), - build_row_for_dictionary(3, 3, Some("tag1_3"), "tag2_3", 3), - build_row_for_dictionary(4, 4, Some("tag1_1"), "tag2_4", 3), - build_row_for_dictionary(5, 5, Some("tag1_3"), "tag2_4", 4), - build_row_for_dictionary(6, 6, None, "tag2_4", 4), + build_row_for_dictionary( + b"a", + 1, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_1"), + "tag2_1", + ), + build_row_for_dictionary( + b"b", + 2, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_2"), + "tag2_2", + ), + build_row_for_dictionary( + b"c", + 3, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_3"), + "tag2_3", + ), + build_row_for_dictionary( + b"d", + 4, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_1"), + "tag2_4", + ), + build_row_for_dictionary( + b"e", + 5, + 10.0, + "v4", + 1000, + 1_000_000, + Some("tag1_3"), + "tag2_4", + ), + build_row_for_dictionary(b"f", 6, 10.0, "v4", 1000, 1_000_000, None, "tag2_4"), ]; // DatumKind::String , is_dictionary = true - let column = schema.column(2); - println!("{column:?}"); + let column = schema.column(6); let mut builder = ColumnBlockBuilder::with_capacity(&column.data_type, 0, column.is_dictionary); // append - (0..rows.len()).for_each(|i| builder.append(rows[i][2].clone()).unwrap()); + (0..rows.len()).for_each(|i| builder.append(rows[i][6].clone()).unwrap()); let ret = builder.append(rows[0][0].clone()); assert!(ret.is_err()); // append_view - builder.append_view(rows[5][2].as_view()).unwrap(); + builder.append_view(rows[5][6].as_view()).unwrap(); let ret = builder.append_view(rows[1][0].as_view()); assert!(ret.is_err()); From 4e54955b6c978cf4ee140ce0faa98ad9624e14aa Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 11:23:24 +0800 Subject: [PATCH 10/18] adjust cilppy --- common_types/src/tests.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index 0839c6c121..b43269427f 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -220,6 +220,7 
@@ pub fn build_schema_for_cpu() -> Schema { builder.build().unwrap() } +#[allow(clippy::too_many_arguments)] pub fn build_row_for_dictionary( key1: &[u8], key2: i64, From 688437cb4cf0d209d1a2df3f89087760bdaee818 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 14:43:39 +0800 Subject: [PATCH 11/18] adjust todo comment --- analytic_engine/src/sst/parquet/hybrid.rs | 4 ++-- common_types/src/column.rs | 4 ---- common_types/src/column_schema.rs | 2 +- proxy/src/http/prom.rs | 2 +- proxy/src/influxdb/types.rs | 2 +- proxy/src/write.rs | 4 ++-- query_frontend/src/planner.rs | 12 ++++++------ table_engine/src/memory.rs | 2 +- 8 files changed, 14 insertions(+), 18 deletions(-) diff --git a/analytic_engine/src/sst/parquet/hybrid.rs b/analytic_engine/src/sst/parquet/hybrid.rs index ee86496f48..3d155bc01b 100644 --- a/analytic_engine/src/sst/parquet/hybrid.rs +++ b/analytic_engine/src/sst/parquet/hybrid.rs @@ -127,7 +127,7 @@ pub fn build_hybrid_arrow_schema(schema: &Schema) -> ArrowSchemaRef { field.data_type().clone(), true, ))); - // TODO(tanruixiang) is there need to use new_dict? + // TODO(tanruixiang): is there need to use new_dict? Arc::new(Field::new(field.name(), field_type, true)) } else { field.clone() @@ -419,7 +419,7 @@ impl ListArrayBuilder { let array_len = self.multi_row_arrays.len(); let mut offsets = MutableBuffer::new(array_len * std::mem::size_of::()); let child_data = self.build_child_data(&mut offsets)?; - // TODO is there need to use new_dict? + // TODO(tanruixiang): is there need to use new_dict? let field = Arc::new(Field::new( LIST_ITEM_NAME, self.datum_kind.to_arrow_data_type(), diff --git a/common_types/src/column.rs b/common_types/src/column.rs index f10b1aa35a..b11bdbfd3e 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -476,10 +476,6 @@ impl StringDictionaryColumn { DictionaryArray::::from(array_data) } - #[doc = " Returns a zero-copy slice of this array with the indicated offset and"] - #[doc = " length."] - #[doc = ""] - #[doc = " Panics if offset with length is greater than column length."] fn slice(&self, offset: usize, length: usize) -> Self { let array_slice = self.0.slice(offset, length); let array_data = array_slice.into_data(); diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 606d04eaf5..1691ab8f90 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -336,7 +336,7 @@ impl From<&ColumnSchema> for Field { col_schema.is_nullable, col_schema.id.into(), false, - // TODO(tanruixiang) how to use dict_is_ordered + // TODO(tanruixiang): how to use dict_is_ordered ) } else { Field::new( diff --git a/proxy/src/http/prom.rs b/proxy/src/http/prom.rs index 87fd26e76b..d82e669847 100644 --- a/proxy/src/http/prom.rs +++ b/proxy/src/http/prom.rs @@ -242,7 +242,7 @@ impl Converter { let value_idx = schema.index_of(field_col_name).context(InternalNoCause { msg: "Value column is missing in query response", })?; - // Todo is there need add is_dictionary check? + // TODO(tanruixiang): is there need add is_dictionary check? 
let tags = schema .columns() .iter() diff --git a/proxy/src/influxdb/types.rs b/proxy/src/influxdb/types.rs index 85681d95c5..2b311cc9f1 100644 --- a/proxy/src/influxdb/types.rs +++ b/proxy/src/influxdb/types.rs @@ -811,7 +811,7 @@ mod tests { } fn build_test_column_blocks() -> Vec { - // TODO missing is_dictionary paramms + // TODO(tanruixiang): missing is_dictionary paramms let mut measurement_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); diff --git a/proxy/src/write.rs b/proxy/src/write.rs index 0cae62fbb7..43b39eb6c1 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -684,7 +684,7 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; } @@ -701,7 +701,7 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? build_column(&mut columns, schema, field_name, &field.value, false, false)?; } } diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 30f1ad6e74..785b75fa09 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -431,8 +431,8 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb - // ? + // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the + // pb? ensure_data_type_compatible( table, tag_name, @@ -443,8 +443,8 @@ pub fn build_schema_from_write_table_request( )?; } - // TODO(tanruixiang) is_dictionary set true or false ? Do we need modify the pb - // ? + // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the + // pb? let column_schema = build_column_schema(tag_name, data_type, true, false)?; name_column_map.insert(tag_name, column_schema); } @@ -471,7 +471,7 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? ensure_data_type_compatible( table, field_name, @@ -481,7 +481,7 @@ pub fn build_schema_from_write_table_request( column_schema, )?; } - // TODO(tanruixiang) : is_dictionary set true or false ? + // TODO(tanruixiang): is_dictionary set true or false ? let column_schema = build_column_schema(field_name, data_type, false, false)?; name_column_map.insert(field_name, column_schema); } diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 0755552398..8d39a40823 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -244,7 +244,7 @@ fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, ) -> stream::Result { - // TODO(tanruixiang) : ensure there don't use is_dictionary and the + // TODO(tanruixiang): ensure there don't use is_dictionary and the // datum.clone() is necessary ? 
let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); for datum in iter { From 1567c0b2de62901d7da8d7276afc600875f23008 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:04:18 +0800 Subject: [PATCH 12/18] update by review's comment --- common_types/src/column.rs | 28 +++++++++------------------- common_types/src/column_schema.rs | 4 ++-- proxy/src/http/prom.rs | 1 - proxy/src/influxdb/types.rs | 1 - proxy/src/read.rs | 8 ++++++++ proxy/src/write.rs | 9 +++------ query_frontend/src/parser.rs | 1 - query_frontend/src/planner.rs | 26 ++++---------------------- 8 files changed, 26 insertions(+), 52 deletions(-) diff --git a/common_types/src/column.rs b/common_types/src/column.rs index b11bdbfd3e..346d3fd80e 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -463,6 +463,7 @@ impl From> for StringDictionaryColumn { Self(array) } } + impl From<&DictionaryArray> for StringDictionaryColumn { fn from(array_ref: &DictionaryArray) -> Self { let array_data = array_ref.into_data(); @@ -470,6 +471,7 @@ impl From<&DictionaryArray> for StringDictionaryColumn { Self(array) } } + impl StringDictionaryColumn { fn to_arrow_array(&self) -> DictionaryArray { let array_data = self.0.clone().into_data(); @@ -533,6 +535,7 @@ impl StringColumn { } } +/// dictionary encode type is difference from other types impl StringDictionaryColumn { /// Create a column that all values are null. fn new_null(num_rows: usize) -> Self { @@ -768,26 +771,13 @@ macro_rules! define_column_block { let column = match datum_kind { DatumKind::Null => ColumnBlock::Null(NullColumn::new_null(array.len())), DatumKind::String => { - if !is_dictionary { - let mills_array; - let cast_column = match array.data_type() { - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - mills_array = cast_nanosecond_to_mills(array)?; - cast_array(datum_kind, &mills_array)? - } - _ => cast_array(datum_kind, array)?, - }; - ColumnBlock::String(StringColumn::from(cast_column)) - } else { - let mills_array; - let cast_column = match array.data_type() { - DataType::Timestamp(TimeUnit::Nanosecond, None) => { - mills_array = cast_nanosecond_to_mills(array)?; - cast_array(datum_kind, &mills_array)? 
- } - _ => cast_array(datum_kind, array)?, - }; + if is_dictionary { + let cast_column = cast_array(datum_kind, array)?; ColumnBlock::StringDictionary(StringDictionaryColumn::from(cast_column)) + + } else { + let cast_column = cast_array(datum_kind, array)?; + ColumnBlock::String(StringColumn::from(cast_column)) } }, $( diff --git a/common_types/src/column_schema.rs b/common_types/src/column_schema.rs index 1691ab8f90..0678336f01 100644 --- a/common_types/src/column_schema.rs +++ b/common_types/src/column_schema.rs @@ -168,7 +168,7 @@ pub struct ColumnSchema { /// Is tag, tag is just a hint for a column, there is no restriction that a /// tag column must be a part of primary key pub is_tag: bool, - // Whether to use dictionary types for parquet store + // Whether to use dictionary types for encoding column pub is_dictionary: bool, /// Comment of the column pub comment: String, @@ -335,8 +335,8 @@ impl From<&ColumnSchema> for Field { DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), col_schema.is_nullable, col_schema.id.into(), - false, // TODO(tanruixiang): how to use dict_is_ordered + false, ) } else { Field::new( diff --git a/proxy/src/http/prom.rs b/proxy/src/http/prom.rs index d82e669847..dab16707ab 100644 --- a/proxy/src/http/prom.rs +++ b/proxy/src/http/prom.rs @@ -242,7 +242,6 @@ impl Converter { let value_idx = schema.index_of(field_col_name).context(InternalNoCause { msg: "Value column is missing in query response", })?; - // TODO(tanruixiang): is there need add is_dictionary check? let tags = schema .columns() .iter() diff --git a/proxy/src/influxdb/types.rs b/proxy/src/influxdb/types.rs index 2b311cc9f1..d090842b07 100644 --- a/proxy/src/influxdb/types.rs +++ b/proxy/src/influxdb/types.rs @@ -811,7 +811,6 @@ mod tests { } fn build_test_column_blocks() -> Vec { - // TODO(tanruixiang): missing is_dictionary paramms let mut measurement_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); let mut tag_builder = ColumnBlockBuilder::with_capacity(&DatumKind::String, 3, false); diff --git a/proxy/src/read.rs b/proxy/src/read.rs index a47b9454be..0f70b99869 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -9,6 +9,7 @@ use ceresdbproto::storage::{ }; use common_types::request_id::RequestId; use common_util::{error::BoxError, time::InstantExt}; +use datafusion::common::tree_node::TreeNode; use futures::FutureExt; use http::StatusCode; use interpreters::interpreter::Output; @@ -17,6 +18,7 @@ use query_engine::executor::Executor as QueryExecutor; use query_frontend::{ frontend, frontend::{Context as SqlContext, Frontend}, + plan::Plan, provider::CatalogMetaProvider, }; use router::endpoint::Endpoint; @@ -122,6 +124,7 @@ impl Proxy { // Create logical plan // Note: Remember to store sql in error when creating logical plan + println!("stmts: {:?}", stmts); let plan = frontend // TODO(yingwen): Check error, some error may indicate that the sql is invalid. 
Now we // return internal server error in those cases @@ -132,6 +135,11 @@ impl Proxy { msg: format!("Failed to create plan, query:{sql}"), })?; + if let Plan::Query(tmp) = &plan { + println!("{:?}", stmts); + println!("{:?}", tmp.df_plan.all_out_ref_exprs()); + }; + let output = if ctx.enable_partition_table_access { self.execute_plan_involving_partition_table(request_id, catalog, schema, plan, deadline) .await diff --git a/proxy/src/write.rs b/proxy/src/write.rs index 43b39eb6c1..fc2f88d4d9 100644 --- a/proxy/src/write.rs +++ b/proxy/src/write.rs @@ -684,8 +684,7 @@ fn find_new_columns( ); let tag_name = &tag_names[name_index]; - // TODO(tanruixiang): is_dictionary set true or false ? - build_column(&mut columns, schema, tag_name, &tag.value, true, false)?; + build_column(&mut columns, schema, tag_name, &tag.value, true)?; } // Parse fields. @@ -701,8 +700,7 @@ fn find_new_columns( } ); let field_name = &field_names[field.name_index as usize]; - // TODO(tanruixiang): is_dictionary set true or false ? - build_column(&mut columns, schema, field_name, &field.value, false, false)?; + build_column(&mut columns, schema, field_name, &field.value, false)?; } } } @@ -716,7 +714,6 @@ fn build_column<'a>( name: &'a str, value: &Option, is_tag: bool, - is_dictionary: bool, ) -> Result<()> { // Skip adding columns, the following cases: // 1. Column already exists. @@ -742,7 +739,7 @@ fn build_column<'a>( msg: "Failed to get data type", })?; - let column_schema = build_column_schema(name, data_type, is_tag, is_dictionary) + let column_schema = build_column_schema(name, data_type, is_tag) .box_err() .context(Internal { msg: "Failed to build column schema", diff --git a/query_frontend/src/parser.rs b/query_frontend/src/parser.rs index 813344a888..8304416b32 100644 --- a/query_frontend/src/parser.rs +++ b/query_frontend/src/parser.rs @@ -532,7 +532,6 @@ impl<'a> Parser<'a> { Token::make_keyword(TAG), ]))) } else if self.consume_token(DICTIONARY) { - // Support DICTIONARY for ceresdb Ok(Some(ColumnOption::DialectSpecific(vec![ Token::make_keyword(DICTIONARY), ]))) diff --git a/query_frontend/src/planner.rs b/query_frontend/src/planner.rs index 785b75fa09..de8327da70 100644 --- a/query_frontend/src/planner.rs +++ b/query_frontend/src/planner.rs @@ -367,12 +367,10 @@ pub fn build_column_schema( column_name: &str, data_type: DatumKind, is_tag: bool, - is_dictionary: bool, ) -> Result { let builder = column_schema::Builder::new(column_name.to_string(), data_type) .is_nullable(true) - .is_tag(is_tag) - .is_dictionary(is_dictionary); + .is_tag(is_tag); builder.build().with_context(|| InvalidColumnSchema { column_name: column_name.to_string(), @@ -431,21 +429,10 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(tag_value)?; if let Some(column_schema) = name_column_map.get(tag_name) { - // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the - // pb? - ensure_data_type_compatible( - table, - tag_name, - true, - false, - data_type, - column_schema, - )?; + ensure_data_type_compatible(table, tag_name, true, data_type, column_schema)?; } - // TODO(tanruixiang): is_dictionary set true or false ? Do we need modify the - // pb? 
- let column_schema = build_column_schema(tag_name, data_type, true, false)?; + let column_schema = build_column_schema(tag_name, data_type, true)?; name_column_map.insert(tag_name, column_schema); } @@ -471,18 +458,15 @@ pub fn build_schema_from_write_table_request( let data_type = try_get_data_type_from_value(field_value)?; if let Some(column_schema) = name_column_map.get(field_name) { - // TODO(tanruixiang): is_dictionary set true or false ? ensure_data_type_compatible( table, field_name, false, - false, data_type, column_schema, )?; } - // TODO(tanruixiang): is_dictionary set true or false ? - let column_schema = build_column_schema(field_name, data_type, false, false)?; + let column_schema = build_column_schema(field_name, data_type, false)?; name_column_map.insert(field_name, column_schema); } } @@ -528,11 +512,9 @@ fn ensure_data_type_compatible( table_name: &str, column_name: &str, is_tag: bool, - _is_dictionary: bool, data_type: DatumKind, column_schema: &ColumnSchema, ) -> Result<()> { - // TODO(tanruixiang) : how to check is_dictionary ? ensure!( column_schema.is_tag == is_tag, InvalidWriteEntry { From df47927dc267ce13b4b861e728db7c71681f6097 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:12:55 +0800 Subject: [PATCH 13/18] remove debug message --- proxy/src/read.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 0f70b99869..bcb9f653a0 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -124,7 +124,6 @@ impl Proxy { // Create logical plan // Note: Remember to store sql in error when creating logical plan - println!("stmts: {:?}", stmts); let plan = frontend // TODO(yingwen): Check error, some error may indicate that the sql is invalid. Now we // return internal server error in those cases @@ -135,11 +134,6 @@ impl Proxy { msg: format!("Failed to create plan, query:{sql}"), })?; - if let Plan::Query(tmp) = &plan { - println!("{:?}", stmts); - println!("{:?}", tmp.df_plan.all_out_ref_exprs()); - }; - let output = if ctx.enable_partition_table_access { self.execute_plan_involving_partition_table(request_id, catalog, schema, plan, deadline) .await From db350e323d9d07dec8473a2868d06e81afe8bafc Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:16:11 +0800 Subject: [PATCH 14/18] clippy --- proxy/src/read.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/proxy/src/read.rs b/proxy/src/read.rs index bcb9f653a0..86a06b1ee0 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -9,7 +9,7 @@ use ceresdbproto::storage::{ }; use common_types::request_id::RequestId; use common_util::{error::BoxError, time::InstantExt}; -use datafusion::common::tree_node::TreeNode; + use futures::FutureExt; use http::StatusCode; use interpreters::interpreter::Output; @@ -18,7 +18,6 @@ use query_engine::executor::Executor as QueryExecutor; use query_frontend::{ frontend, frontend::{Context as SqlContext, Frontend}, - plan::Plan, provider::CatalogMetaProvider, }; use router::endpoint::Endpoint; From bf81853fc2d269cf89dd879535a3aaf988808ce5 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Tue, 4 Jul 2023 17:21:28 +0800 Subject: [PATCH 15/18] fmt --- proxy/src/read.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/proxy/src/read.rs b/proxy/src/read.rs index 86a06b1ee0..a47b9454be 100644 --- a/proxy/src/read.rs +++ b/proxy/src/read.rs @@ -9,7 +9,6 @@ use ceresdbproto::storage::{ }; use common_types::request_id::RequestId; use common_util::{error::BoxError, time::InstantExt}; - use 
futures::FutureExt; use http::StatusCode; use interpreters::interpreter::Output; From 51a7ddf83b11a205825c76274d8abddeb761c82f Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 5 Jul 2023 11:33:28 +0800 Subject: [PATCH 16/18] fix --- common_types/src/column.rs | 9 +++++---- common_types/src/tests.rs | 1 + table_engine/src/memory.rs | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/common_types/src/column.rs b/common_types/src/column.rs index 346d3fd80e..d1cf85fefc 100644 --- a/common_types/src/column.rs +++ b/common_types/src/column.rs @@ -143,6 +143,8 @@ pub struct VarbinaryColumn(BinaryArray); #[derive(Debug)] pub struct StringColumn(StringArray); +/// dictionary encode type is difference from other types, need implement +/// without macro #[derive(Debug)] pub struct StringDictionaryColumn(DictionaryArray); @@ -535,7 +537,6 @@ impl StringColumn { } } -/// dictionary encode type is difference from other types impl StringDictionaryColumn { /// Create a column that all values are null. fn new_null(num_rows: usize) -> Self { @@ -968,10 +969,10 @@ macro_rules! define_column_block_builder { // The data_capacity is set as 1024, because the item is variable-size type. DatumKind::Varbinary => Self::Varbinary(BinaryBuilder::with_capacity(item_capacity, 1024)), DatumKind::String =>{ - if !is_dictionary{ - Self::String(StringBuilder::with_capacity(item_capacity, 1024)) - }else { + if is_dictionary { Self::Dictionary(StringDictionaryBuilder::::new()) + }else { + Self::String(StringBuilder::with_capacity(item_capacity, 1024)) } } DatumKind::Date => Self::Date(DateBuilder::with_capacity(item_capacity)), diff --git a/common_types/src/tests.rs b/common_types/src/tests.rs index b43269427f..9cfb86a0fb 100644 --- a/common_types/src/tests.rs +++ b/common_types/src/tests.rs @@ -242,6 +242,7 @@ pub fn build_row_for_dictionary( .unwrap_or(Datum::Null), Datum::String(StringBytes::from(tag2)), ]; + Row::from_datums(datums) } pub fn build_projected_schema() -> ProjectedSchema { diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 8d39a40823..7e390ee221 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -229,7 +229,7 @@ fn row_group_to_record_batch( ), })?; let cols = rows.iter_column(col_index); - let column_block = build_column_block(&column.data_type, cols)?; + let column_block = build_column_block(&column.data_type, cols, column.is_dictionary)?; column_blocks.push(column_block); } @@ -243,10 +243,12 @@ fn row_group_to_record_batch( fn build_column_block<'a, I: Iterator>( data_type: &DatumKind, iter: I, + is_dictionary: bool, ) -> stream::Result { // TODO(tanruixiang): ensure there don't use is_dictionary and the // datum.clone() is necessary ? 
- let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, false); + let mut builder = + ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, is_dictionary); for datum in iter { builder .append(datum.clone()) From 3f36476c541ac77a95ce9962f6becfd02f4397f8 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 5 Jul 2023 14:33:39 +0800 Subject: [PATCH 17/18] fmt --- query_frontend/src/parser.rs | 4 ++-- table_engine/src/memory.rs | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/query_frontend/src/parser.rs b/query_frontend/src/parser.rs index 8304416b32..77f4c35056 100644 --- a/query_frontend/src/parser.rs +++ b/query_frontend/src/parser.rs @@ -329,9 +329,9 @@ impl<'a> Parser<'a> { let options = self.parser.parse_options(Keyword::WITH)?; // Only String Column Can Be Dictionary Encoded - for c in columns.iter() { + for c in &columns { let mut is_dictionary = false; - for op in c.options.iter() { + for op in &c.options { if is_dictionary_column(&op.option) { is_dictionary = true; } diff --git a/table_engine/src/memory.rs b/table_engine/src/memory.rs index 7e390ee221..80d092609c 100644 --- a/table_engine/src/memory.rs +++ b/table_engine/src/memory.rs @@ -245,8 +245,6 @@ fn build_column_block<'a, I: Iterator>( iter: I, is_dictionary: bool, ) -> stream::Result { - // TODO(tanruixiang): ensure there don't use is_dictionary and the - // datum.clone() is necessary ? let mut builder = ColumnBlockBuilder::with_capacity(data_type, iter.size_hint().0, is_dictionary); for datum in iter { From 0523f446d55a3ca9b9058b498e5d8955698b1499 Mon Sep 17 00:00:00 2001 From: tanruixiang Date: Wed, 5 Jul 2023 14:53:16 +0800 Subject: [PATCH 18/18] use assert --- analytic_engine/src/sst/parquet/encoding.rs | 23 ++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/analytic_engine/src/sst/parquet/encoding.rs b/analytic_engine/src/sst/parquet/encoding.rs index 1a92338dd4..47d3b8c2dd 100644 --- a/analytic_engine/src/sst/parquet/encoding.rs +++ b/analytic_engine/src/sst/parquet/encoding.rs @@ -527,13 +527,22 @@ impl HybridRecordDecoder { .map(|f| { if let DataType::List(nested_field) = f.data_type() { match f.data_type() { - DataType::Dictionary(_, _) => Arc::new(Field::new_dict( - f.name(), - nested_field.data_type().clone(), - true, - f.dict_id().unwrap(), - f.dict_is_ordered().unwrap(), - )), + DataType::Dictionary(_, _) => { + assert!(f.dict_id().is_some(), "Dictionary must have dict_id"); + assert!( + f.dict_is_ordered().is_some(), + "Dictionary must have dict_is_ordered" + ); + let dict_id = f.dict_id().unwrap(); + let dict_is_ordered = f.dict_is_ordered().unwrap(); + Arc::new(Field::new_dict( + f.name(), + nested_field.data_type().clone(), + true, + dict_id, + dict_is_ordered, + )) + } _ => Arc::new(Field::new(f.name(), nested_field.data_type().clone(), true)), } } else {