diff --git a/Cargo.lock b/Cargo.lock
index 4533257660ad..7aebe2e719eb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2289,6 +2289,7 @@ dependencies = [
"common-error",
"console-subscriber",
"greptime-proto",
+ "humantime-serde",
"lazy_static",
"once_cell",
"opentelemetry 0.21.0",
diff --git a/config/config.md b/config/config.md
index 641eee4b5837..d413936ff3cd 100644
--- a/config/config.md
+++ b/config/config.md
@@ -163,6 +163,10 @@
| `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
+| `logging.slow_query` | -- | -- | The slow query log options. |
+| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. |
+| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. |
+| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. |
| `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | whether enable export metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
@@ -249,6 +253,10 @@
| `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
+| `logging.slow_query` | -- | -- | The slow query log options. |
+| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. |
+| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. |
+| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. |
| `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | whether enable export metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
@@ -314,6 +322,10 @@
| `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
+| `logging.slow_query` | -- | -- | The slow query log options. |
+| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. |
+| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. |
+| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. |
| `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | whether enable export metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
@@ -466,6 +478,10 @@
| `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
+| `logging.slow_query` | -- | -- | The slow query log options. |
+| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. |
+| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. |
+| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. |
| `export_metrics` | -- | -- | The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape. |
| `export_metrics.enable` | Bool | `false` | whether enable export metrics. |
| `export_metrics.write_interval` | String | `30s` | The interval of export metrics. |
@@ -512,5 +528,9 @@
| `logging.log_format` | String | `text` | The log format. Can be `text`/`json`. |
| `logging.tracing_sample_ratio` | -- | -- | The percentage of tracing will be sampled and exported.
Valid range `[0, 1]`, 1 means all traces are sampled, 0 means all traces are not sampled, the default value is 1.
ratio > 1 are treated as 1. Fractions < 0 are treated as 0 |
| `logging.tracing_sample_ratio.default_ratio` | Float | `1.0` | -- |
+| `logging.slow_query` | -- | -- | The slow query log options. |
+| `logging.slow_query.enable` | Bool | `false` | Whether to enable slow query log. |
+| `logging.slow_query.threshold` | String | Unset | The threshold of slow query. |
+| `logging.slow_query.sample_ratio` | Float | Unset | The sampling ratio of slow query log. The value should be in the range of (0, 1]. |
| `tracing` | -- | -- | The tracing options. Only effect when compiled with `tokio-console` feature. |
| `tracing.tokio_console_addr` | String | Unset | The tokio console address. |
diff --git a/config/datanode.example.toml b/config/datanode.example.toml
index e4a3dca6d328..06a59ebd6fcd 100644
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -586,6 +586,19 @@ log_format = "text"
[logging.tracing_sample_ratio]
default_ratio = 1.0
+## The slow query log options.
+[logging.slow_query]
+## Whether to enable slow query log.
+enable = false
+
+## The threshold of slow query.
+## @toml2docs:none-default
+threshold = "10s"
+
+## The sampling ratio of slow query log. The value should be in the range of (0, 1].
+## @toml2docs:none-default
+sample_ratio = 1.0
+
## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
[export_metrics]
diff --git a/config/flownode.example.toml b/config/flownode.example.toml
index a3a414fb987d..9d6d2fe5aa68 100644
--- a/config/flownode.example.toml
+++ b/config/flownode.example.toml
@@ -84,6 +84,19 @@ log_format = "text"
[logging.tracing_sample_ratio]
default_ratio = 1.0
+## The slow query log options.
+[logging.slow_query]
+## Whether to enable slow query log.
+enable = false
+
+## The threshold of slow query.
+## @toml2docs:none-default
+threshold = "10s"
+
+## The sampling ratio of slow query log. The value should be in the range of (0, 1].
+## @toml2docs:none-default
+sample_ratio = 1.0
+
## The tracing options. Only effect when compiled with `tokio-console` feature.
[tracing]
## The tokio console address.
diff --git a/config/frontend.example.toml b/config/frontend.example.toml
index eae001a2db0d..cc9698f61e88 100644
--- a/config/frontend.example.toml
+++ b/config/frontend.example.toml
@@ -191,6 +191,19 @@ log_format = "text"
[logging.tracing_sample_ratio]
default_ratio = 1.0
+## The slow query log options.
+[logging.slow_query]
+## Whether to enable slow query log.
+enable = false
+
+## The threshold of slow query.
+## @toml2docs:none-default
+threshold = "10s"
+
+## The sampling ratio of slow query log. The value should be in the range of (0, 1].
+## @toml2docs:none-default
+sample_ratio = 1.0
+
## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
[export_metrics]
diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml
index e95a9fa7f201..8431940b45aa 100644
--- a/config/metasrv.example.toml
+++ b/config/metasrv.example.toml
@@ -178,6 +178,19 @@ log_format = "text"
[logging.tracing_sample_ratio]
default_ratio = 1.0
+## The slow query log options.
+[logging.slow_query]
+## Whether to enable slow query log.
+enable = false
+
+## The threshold of slow query.
+## @toml2docs:none-default
+threshold = "10s"
+
+## The sampling ratio of slow query log. The value should be in the range of (0, 1].
+## @toml2docs:none-default
+sample_ratio = 1.0
+
## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
[export_metrics]
diff --git a/config/standalone.example.toml b/config/standalone.example.toml
index 1cd75e641421..2d14b6550f3c 100644
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -630,6 +630,19 @@ log_format = "text"
[logging.tracing_sample_ratio]
default_ratio = 1.0
+## The slow query log options.
+[logging.slow_query]
+## Whether to enable slow query log.
+enable = false
+
+## The threshold of slow query.
+## @toml2docs:none-default
+threshold = "10s"
+
+## The sampling ratio of slow query log. The value should be in the range of (0, 1].
+## @toml2docs:none-default
+sample_ratio = 1.0
+
## The datanode can export its metrics and send to Prometheus compatible service (e.g. send to `greptimedb` itself) from remote-write API.
## This is only used for `greptimedb` to export its own metrics internally. It's different from prometheus scrape.
[export_metrics]
diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs
index 320dc49c1979..7678e90c884a 100644
--- a/src/cmd/src/frontend.rs
+++ b/src/cmd/src/frontend.rs
@@ -36,6 +36,7 @@ use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{FrontendInstance, Instance as FeInstance};
use frontend::server::Services;
use meta_client::{MetaClientOptions, MetaClientType};
+use query::stats::StatementStatistics;
use servers::tls::{TlsMode, TlsOption};
use servers::Mode;
use snafu::{OptionExt, ResultExt};
@@ -352,6 +353,7 @@ impl StartCommand {
catalog_manager,
Arc::new(client),
meta_client,
+ StatementStatistics::new(opts.logging.slow_query.clone()),
)
.with_plugin(plugins.clone())
.with_local_cache_invalidator(layered_cache_registry)
diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs
index 80b38ebaa749..4335bd5447e5 100644
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -55,6 +55,7 @@ use frontend::service_config::{
};
use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ};
use mito2::config::MitoConfig;
+use query::stats::StatementStatistics;
use serde::{Deserialize, Serialize};
use servers::export_metrics::ExportMetricsOption;
use servers::grpc::GrpcOptions;
@@ -557,6 +558,7 @@ impl StartCommand {
catalog_manager.clone(),
node_manager.clone(),
ddl_task_executor.clone(),
+ StatementStatistics::new(opts.logging.slow_query.clone()),
)
.with_plugin(plugins.clone())
.try_build()
diff --git a/src/common/telemetry/Cargo.toml b/src/common/telemetry/Cargo.toml
index 20fc52a763f7..2b4023cf7cdd 100644
--- a/src/common/telemetry/Cargo.toml
+++ b/src/common/telemetry/Cargo.toml
@@ -17,6 +17,7 @@ backtrace = "0.3"
common-error.workspace = true
console-subscriber = { version = "0.1", optional = true }
greptime-proto.workspace = true
+humantime-serde.workspace = true
lazy_static.workspace = true
once_cell.workspace = true
opentelemetry = { version = "0.21.0", default-features = false, features = [
diff --git a/src/common/telemetry/src/logging.rs b/src/common/telemetry/src/logging.rs
index de018aa4b6f3..c21766f04dc9 100644
--- a/src/common/telemetry/src/logging.rs
+++ b/src/common/telemetry/src/logging.rs
@@ -15,6 +15,7 @@
//! logging stuffs, inspired by databend
use std::env;
use std::sync::{Arc, Mutex, Once};
+use std::time::Duration;
use once_cell::sync::{Lazy, OnceCell};
use opentelemetry::{global, KeyValue};
@@ -26,7 +27,7 @@ use serde::{Deserialize, Serialize};
use tracing_appender::non_blocking::WorkerGuard;
use tracing_appender::rolling::{RollingFileAppender, Rotation};
use tracing_log::LogTracer;
-use tracing_subscriber::filter::Targets;
+use tracing_subscriber::filter::{FilterFn, Targets};
use tracing_subscriber::fmt::Layer;
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::prelude::*;
@@ -64,6 +65,24 @@ pub struct LoggingOptions {
/// The tracing sample ratio.
pub tracing_sample_ratio: Option,
+
+ /// The logging options of slow query.
+ pub slow_query: SlowQueryOptions,
+}
+
+/// The options of slow query.
+#[derive(Clone, Debug, Serialize, Deserialize, Default)]
+#[serde(default)]
+pub struct SlowQueryOptions {
+ /// Whether to enable slow query log.
+ pub enable: bool,
+
+ /// The threshold of slow queries.
+ #[serde(with = "humantime_serde")]
+ pub threshold: Option,
+
+ /// The sample ratio of slow queries.
+ pub sample_ratio: Option,
}
#[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -96,6 +115,7 @@ impl Default for LoggingOptions {
otlp_endpoint: None,
tracing_sample_ratio: None,
append_stdout: true,
+ slow_query: SlowQueryOptions::default(),
}
}
}
@@ -235,6 +255,42 @@ pub fn init_global_logging(
None
};
+ let slow_query_logging_layer = if !opts.dir.is_empty() && opts.slow_query.enable {
+ let rolling_appender =
+ RollingFileAppender::new(Rotation::HOURLY, &opts.dir, "greptimedb-slow-queries");
+ let (writer, guard) = tracing_appender::non_blocking(rolling_appender);
+ guards.push(guard);
+
+ // Only logs if the field contains "slow".
+ let slow_query_filter = FilterFn::new(|metadata| {
+ metadata
+ .fields()
+ .iter()
+ .any(|field| field.name().contains("slow"))
+ });
+
+ if opts.log_format == LogFormat::Json {
+ Some(
+ Layer::new()
+ .json()
+ .with_writer(writer)
+ .with_ansi(false)
+ .with_filter(slow_query_filter)
+ .boxed(),
+ )
+ } else {
+ Some(
+ Layer::new()
+ .with_writer(writer)
+ .with_ansi(false)
+ .with_filter(slow_query_filter)
+ .boxed(),
+ )
+ }
+ } else {
+ None
+ };
+
// resolve log level settings from:
// - options from command line or config files
// - environment variable: RUST_LOG
@@ -279,6 +335,7 @@ pub fn init_global_logging(
.with(stdout_logging_layer)
.with(file_logging_layer)
.with(err_file_logging_layer)
+ .with(slow_query_logging_layer)
};
// consume the `tracing_opts` to avoid "unused" warnings.
@@ -289,7 +346,8 @@ pub fn init_global_logging(
.with(dyn_filter)
.with(stdout_logging_layer)
.with(file_logging_layer)
- .with(err_file_logging_layer);
+ .with(err_file_logging_layer)
+ .with(slow_query_logging_layer);
if opts.enable_otlp_tracing {
global::set_text_map_propagator(TraceContextPropagator::new());
diff --git a/src/common/telemetry/src/macros.rs b/src/common/telemetry/src/macros.rs
index cb838db6fef6..2846bd5d2051 100644
--- a/src/common/telemetry/src/macros.rs
+++ b/src/common/telemetry/src/macros.rs
@@ -152,6 +152,17 @@ macro_rules! trace {
};
}
+#[macro_export]
+macro_rules! slow {
+ (target: $target:expr, $($arg:tt)+) => {
+ $crate::log!(target: $target, slow = true, $crate::tracing::Level::INFO, $($arg)+)
+ };
+
+ ($($arg:tt)+) => {
+ $crate::log!($crate::tracing::Level::INFO, slow = true, $($arg)+)
+ };
+}
+
#[cfg(test)]
mod tests {
use common_error::mock::MockError;
diff --git a/src/flow/src/server.rs b/src/flow/src/server.rs
index d78f9219cb0c..4381dd06a03b 100644
--- a/src/flow/src/server.rs
+++ b/src/flow/src/server.rs
@@ -37,6 +37,7 @@ use operator::delete::Deleter;
use operator::insert::Inserter;
use operator::statement::StatementExecutor;
use partition::manager::PartitionRuleManager;
+use query::stats::StatementStatistics;
use query::{QueryEngine, QueryEngineFactory};
use servers::error::{AlreadyStartedSnafu, StartGrpcSnafu, TcpBindSnafu, TcpIncomingSnafu};
use servers::server::Server;
@@ -475,6 +476,7 @@ impl FrontendInvoker {
layered_cache_registry.clone(),
inserter.clone(),
table_route_cache,
+ StatementStatistics::default(),
));
let invoker = FrontendInvoker::new(inserter, deleter, statement_executor);
diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs
index 5450e55ce25e..a9513121d85a 100644
--- a/src/frontend/src/instance/builder.rs
+++ b/src/frontend/src/instance/builder.rs
@@ -33,6 +33,7 @@ use operator::statement::{StatementExecutor, StatementExecutorRef};
use operator::table::TableMutationOperator;
use partition::manager::PartitionRuleManager;
use pipeline::pipeline_operator::PipelineOperator;
+use query::stats::StatementStatistics;
use query::QueryEngineFactory;
use servers::server::ServerHandlers;
use snafu::OptionExt;
@@ -55,6 +56,7 @@ pub struct FrontendBuilder {
plugins: Option,
procedure_executor: ProcedureExecutorRef,
heartbeat_task: Option,
+ stats: StatementStatistics,
}
impl FrontendBuilder {
@@ -65,6 +67,7 @@ impl FrontendBuilder {
catalog_manager: CatalogManagerRef,
node_manager: NodeManagerRef,
procedure_executor: ProcedureExecutorRef,
+ stats: StatementStatistics,
) -> Self {
Self {
options,
@@ -76,6 +79,7 @@ impl FrontendBuilder {
plugins: None,
procedure_executor,
heartbeat_task: None,
+ stats,
}
}
@@ -181,6 +185,7 @@ impl FrontendBuilder {
local_cache_invalidator,
inserter.clone(),
table_route_cache,
+ self.stats,
));
let pipeline_operator = Arc::new(PipelineOperator::new(
diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs
index 7c76d0dcfffc..033bd14e9c40 100644
--- a/src/operator/src/statement.rs
+++ b/src/operator/src/statement.rs
@@ -45,6 +45,7 @@ use common_time::Timestamp;
use datafusion_expr::LogicalPlan;
use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef};
use query::parser::QueryStatement;
+use query::stats::StatementStatistics;
use query::QueryEngineRef;
use session::context::{Channel, QueryContextRef};
use session::table_name::table_idents_to_full_name;
@@ -80,11 +81,13 @@ pub struct StatementExecutor {
partition_manager: PartitionRuleManagerRef,
cache_invalidator: CacheInvalidatorRef,
inserter: InserterRef,
+ stats: StatementStatistics,
}
pub type StatementExecutorRef = Arc;
impl StatementExecutor {
+ #[allow(clippy::too_many_arguments)]
pub fn new(
catalog_manager: CatalogManagerRef,
query_engine: QueryEngineRef,
@@ -93,6 +96,7 @@ impl StatementExecutor {
cache_invalidator: CacheInvalidatorRef,
inserter: InserterRef,
table_route_cache: TableRouteCacheRef,
+ stats: StatementStatistics,
) -> Self {
Self {
catalog_manager,
@@ -104,6 +108,7 @@ impl StatementExecutor {
partition_manager: Arc::new(PartitionRuleManager::new(kv_backend, table_route_cache)),
cache_invalidator,
inserter,
+ stats,
}
}
@@ -113,6 +118,7 @@ impl StatementExecutor {
stmt: QueryStatement,
query_ctx: QueryContextRef,
) -> Result