Sync with upstream (postgresml#26)

* Prevent clients from sticking to old pools after config update (postgresml#113) * Re-acquire pool at the beginning of Protocol loop * Fix query router + add tests for recycling behavior * create a prometheus exporter on a standard http port (postgresml#107) * create a hyper server and add option to enable it in config * move prometheus stuff to its own file; update format * create metric type and help lookup table * finish the metric help type map * switch to a boolean and a standard port * dont emit unimplemented metrics * fail if curl returns a non 200 * resolve conflicts * move log out of config.show and into main * terminating new line * upgrade curl * include unimplemented stats * Validates pgcat is closed after shutdown python tests (postgresml#116) * Validates pgcat is closed after shutdown python tests * Fix pgrep logic * Moves sigterm step to after cleanup to decouple * Replace subprocess with os.system for running pgcat * fix docker compose port allocation for local dev (postgresml#117) change docker compose port to right prometheus port * Update CONTRIBUTING.md * Health check delay (postgresml#118) * initial commit of server check delay implementation * fmt * spelling * Update name to last_healthcheck and some comments * Moved server tested stat to after require_healthcheck check * Make health check delay configurable * Rename to last_activity * Fix typo * Add debug log for healthcheck * Add address to debug log * Speed up CI a bit (postgresml#119) * Sleep for 1s * use premade image * quicker * revert shutdown timeout * Fix debug log (postgresml#120) * Make prometheus port configurable (postgresml#121) * Make prometheus port configurable * Update circleci config * Statement timeout + replica imbalance fix (postgresml#122) * Statement timeout * send error message too * Correct error messages * Fix replica inbalance * disable stmt timeout by default * Redundant mark_bad * revert healthcheck delay * tests * set it to 0 * reload config again * pgcat toml 
Co-authored-by: Nicholas Dujay <[email protected]> Co-authored-by: zainkabani <[email protected]> Co-authored-by: Lev Kokotov <[email protected]> Co-authored-by: Pradeep Chhetri <[email protected]>
jmeagher · Aug 15, 2022 · 0846a17 · 0846a17
1 parent dfe0e05
commit 0846a17
Show file tree

Hide file tree

Showing 17 changed files with 310 additions and 81 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -9,7 +9,7 @@ jobs:
     # Specify the execution environment. You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub.
     # See: https://circleci.com/docs/2.0/configuration-reference/#docker-machine-macos-windows-executor
     docker:
-      - image: cimg/rust:1.62.0
+      - image: levkk/pgcat-ci:latest
         environment:
           RUST_LOG: info
           RUSTFLAGS: "-C instrument-coverage"

diff --git a/.circleci/pgcat.toml b/.circleci/pgcat.toml
@@ -11,15 +11,21 @@ host = "0.0.0.0"
 # Port to run on, same as PgBouncer used in this example.
 port = 6432
 
-# enable prometheus exporter on port 9930
+# Whether to enable prometheus exporter or not.
 enable_prometheus_exporter = true
 
+# Port on which the prometheus exporter listens.
+prometheus_exporter_port = 9930
+
 # How long to wait before aborting a server connection (ms).
 connect_timeout = 100
 
 # How much time to give the health check query to return with a result (ms).
 healthcheck_timeout = 100
 
+# How long to keep a connection available for immediate re-use without running a healthcheck query on it (ms).
+healthcheck_delay = 30000
+
 # How much time to give clients during shutdown before forcibly killing client connections (ms).
 shutdown_timeout = 5000
 
@@ -85,11 +91,13 @@ password = "sharding_user"
 # The maximum number of connection from a single Pgcat process to any database in the cluster
 # is the sum of pool_size across all users.
 pool_size = 9
+statement_timeout = 0
 
 [pools.sharded_db.users.1]
 username = "other_user"
 password = "other_user"
 pool_size = 21
+statement_timeout = 30000
 
 # Shard 0
 [pools.sharded_db.shards.0]
@@ -127,6 +135,7 @@ sharding_function = "pg_bigint_hash"
 username = "simple_user"
 password = "simple_user"
 pool_size = 5
+statement_timeout = 30000
 
 [pools.simple_db.shards.0]
 servers = [

diff --git a/.circleci/run_tests.sh b/.circleci/run_tests.sh
@@ -66,6 +66,18 @@ psql -U sharding_user -e -h 127.0.0.1 -p 6432 -f tests/sharding/query_routing_te
 # Replica/primary selection & more sharding tests
 psql -U sharding_user -e -h 127.0.0.1 -p 6432 -f tests/sharding/query_routing_test_primary_replica.sql > /dev/null
 
+# Statement timeout tests
+sed -i 's/statement_timeout = 0/statement_timeout = 100/' .circleci/pgcat.toml
+kill -SIGHUP $(pgrep pgcat) # Reload config
+sleep 0.2
+
+# This should timeout
+(! psql -U sharding_user -e -h 127.0.0.1 -p 6432 -c 'select pg_sleep(0.5)')
+
+# Disable statement timeout
+sed -i 's/statement_timeout = 100/statement_timeout = 0/' .circleci/pgcat.toml
+kill -SIGHUP $(pgrep pgcat) # Reload config again
+
 #
 # ActiveRecord tests
 #
@@ -122,6 +134,8 @@ sed -i 's/pool_mode = "transaction"/pool_mode = "session"/' .circleci/pgcat.toml
 # Reload config test
 kill -SIGHUP $(pgrep pgcat)
 
+sleep 1
+
 # Prepared statements that will only work in session mode
 pgbench -U sharding_user -h 127.0.0.1 -p 6432 -t 500 -c 2 --protocol prepared
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -10,10 +10,4 @@ Happy hacking!
 
 ## TODOs
 
-A non-exhaustive list of things that would be useful to implement:
-
-#### Client authentication
-MD5 is probably sufficient, but maybe others too.
-
-#### Admin
-Admin database for stats collection and pooler administration. PgBouncer gives us a nice example on how to do that, specifically how to implement `RowDescription` and `DataRow` messages, [example here](https://github.com/pgbouncer/pgbouncer/blob/4f9ced8e63d317a6ff45c8b0efa876b32161f6db/src/admin.c#L813).
+See [Issues](https://github.com/levkk/pgcat/issues).
diff --git a/Dockerfile.ci b/Dockerfile.ci
@@ -0,0 +1,8 @@
+FROM cimg/rust:1.62.0
+RUN sudo apt-get update && \
+	sudo apt-get install -y psmisc postgresql-contrib-12 postgresql-client-12 ruby ruby-dev libpq-dev python3 python3-pip lcov llvm-11 && \
+	sudo apt-get upgrade curl
+RUN cargo install cargo-binutils rustfilt && \
+	rustup component add llvm-tools-preview
+RUN pip3 install psycopg2 && \
+	sudo gem install bundler
diff --git a/README.md b/README.md
@@ -47,29 +47,34 @@ psql -h 127.0.0.1 -p 6432 -c 'SELECT 1'
 
 ### Config
 
-| **Name**                | **Description**                                                                                                                            | **Examples**                     |
-|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|
-| **`general`**           |                                                                                                                                            |                                  |
-| `host`                  | The pooler will run on this host, 0.0.0.0 means accessible from everywhere.                                                                | `0.0.0.0`                        |
-| `port`                  | The pooler will run on this port.                                                                                                          | `6432`                           |
-| `pool_size`             | Maximum allowed server connections per pool. Pools are separated for each user/shard/server role. The connections are allocated as needed. | `15`                             |
-| `pool_mode`             | The pool mode to use, i.e. `session` or `transaction`.                                                                                     | `transaction`                    |
-| `connect_timeout`       | Maximum time to establish a connection to a server (milliseconds). If reached, the server is banned and the next target is attempted.      | `5000`                           |
-| `healthcheck_timeout`   | Maximum time to pass a health check (`SELECT 1`, milliseconds). If reached, the server is banned and the next target is attempted.         | `1000`                           |
-| `shutdown_timeout`   | Maximum time to give clients during shutdown before forcibly killing client connections (ms).      | `60000`                           |
-| `ban_time`              | Ban time for a server (seconds). It won't be allowed to serve transactions until the ban expires; failover targets will be used instead.   | `60`                             |
-|                         |                                                                                                                                            |                                  |
-| **`user`**              |                                                                                                                                            |                                  |
-| `name`                  | The user name.                                                                                                                             | `sharding_user`                  |
-| `password`              | The user password in plaintext.                                                                                                            | `hunter2`                        |
-|                         |                                                                                                                                            |                                  |
-| **`shards`**            | Shards are numerically numbered starting from 0; the order in the config is preserved by the pooler to route queries accordingly.          | `[shards.0]`                     |
-| `servers`               | List of servers to connect to and their roles. A server is: `[host, port, role]`, where `role` is either `primary` or `replica`.           | `["127.0.0.1", 5432, "primary"]` |
-| `database`              | The name of the database to connect to. This is the same on all servers that are part of one shard.                                        |                                  |
-| **`query_router`**      |                                                                                                                                            |                                  |
-| `default_role`          | Traffic is routed to this role by default (round-robin), unless the client specifies otherwise. Default is `any`, for any role available.  | `any`, `primary`, `replica`      |
-| `query_parser_enabled`  | Enable the query parser which will inspect incoming queries and route them to a primary or replicas.                                       | `false`                          |
-| `primary_reads_enabled` | Enable this to allow read queries on the primary; otherwise read queries are routed to the replicas.                                       | `true`                           |
+| **Name**                     | **Description**                                                                                                                            | **Examples**                     |
+|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|
+| **`general`**                |                                                                                                                                            |                                  |
+| `host`                       | The pooler will run on this host, 0.0.0.0 means accessible from everywhere.                                                                | `0.0.0.0`                        |
+| `port`                       | The pooler will run on this port.                                                                                                          | `6432`                           |
+| `enable_prometheus_exporter` | Enable prometheus exporter which will export metrics in prometheus exposition format.                                                      | `true`                           |
+| `prometheus_exporter_port`   | Port at which prometheus exporter listens on.                                                                                              | `9930`                           |
+| `pool_size`                  | Maximum allowed server connections per pool. Pools are separated for each user/shard/server role. The connections are allocated as needed. | `15`                             |
+| `pool_mode`                  | The pool mode to use, i.e. `session` or `transaction`.                                                                                     | `transaction`                    |
+| `connect_timeout`            | Maximum time to establish a connection to a server (milliseconds). If reached, the server is banned and the next target is attempted.      | `5000`                           |
+| `healthcheck_timeout`        | Maximum time to pass a health check (`SELECT 1`, milliseconds). If reached, the server is banned and the next target is attempted.         | `1000`                           |
+| `shutdown_timeout`           | Maximum time to give clients during shutdown before forcibly killing client connections (ms).                                              | `60000`                          |
+| `healthcheck_delay`          | How long to keep connection available for immediate re-use, without running a healthcheck query on it                                      | `30000`                          |
+| `ban_time`                   | Ban time for a server (seconds). It won't be allowed to serve transactions until the ban expires; failover targets will be used instead.   | `60`                             |
+| `autoreload`                 | Enable auto-reload of config after fixed time-interval.                                                                                    | `false`                          |
+|                              |                                                                                                                                            |                                  |
+| **`user`**                   |                                                                                                                                            |                                  |
+| `name`                       | The user name.                                                                                                                             | `sharding_user`                  |
+| `password`                   | The user password in plaintext.                                                                                                            | `hunter2`                        |
+|                              |                                                                                                                                            |                                  |
+| **`shards`**                 | Shards are numerically numbered starting from 0; the order in the config is preserved by the pooler to route queries accordingly.          | `[shards.0]`                     |
+| `servers`                    | List of servers to connect to and their roles. A server is: `[host, port, role]`, where `role` is either `primary` or `replica`.           | `["127.0.0.1", 5432, "primary"]` |
+| `database`                   | The name of the database to connect to. This is the same on all servers that are part of one shard.                                        |                                  |
+|                              |                                                                                                                                            |                                  |
+| **`query_router`**           |                                                                                                                                            |                                  |
+| `default_role`               | Traffic is routed to this role by default (round-robin), unless the client specifies otherwise. Default is `any`, for any role available.  | `any`, `primary`, `replica`      |
+| `query_parser_enabled`       | Enable the query parser which will inspect incoming queries and route them to a primary or replicas.                                       | `false`                          |
+| `primary_reads_enabled`      | Enable this to allow read queries on the primary; otherwise read queries are routed to the replicas.                                       | `true`                           |
 
 ## Local development
 
@@ -261,6 +266,7 @@ The config can be reloaded by sending a `kill -s SIGHUP` to the process or by qu
 | `connect_timeout`       | yes                  |
 | `healthcheck_timeout`   | no                   |
 | `shutdown_timeout`      | no                   |
+| `healthcheck_delay`     | no                   |
 | `ban_time`              | no                   |
 | `user`                  | yes                  |
 | `shards`                | yes                  |

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1,7 +1,7 @@
 version: "3"
 services:
   postgres:
-    image: postgres:13
+    image: postgres:14
     environment:
       POSTGRES_PASSWORD: postgres
       POSTGRES_HOST_AUTH_METHOD: md5

diff --git a/examples/docker/pgcat.toml b/examples/docker/pgcat.toml
@@ -11,15 +11,21 @@ host = "0.0.0.0"
 # Port to run on, same as PgBouncer used in this example.
 port = 6432
 
-# enable prometheus exporter on port 9930
+# Whether to enable prometheus exporter or not.
 enable_prometheus_exporter = true
 
+# Port on which the prometheus exporter listens.
+prometheus_exporter_port = 9930
+
 # How long to wait before aborting a server connection (ms).
 connect_timeout = 5000
 
 # How much time to give `SELECT 1` health check query to return with a result (ms).
 healthcheck_timeout = 1000
 
+# How long to keep a connection available for immediate re-use without running a healthcheck query on it (ms).
+healthcheck_delay = 30000
+
 # How much time to give clients during shutdown before forcibly killing client connections (ms).
 shutdown_timeout = 60000
 

diff --git a/pgcat.toml b/pgcat.toml
@@ -11,15 +11,21 @@ host = "0.0.0.0"
 # Port to run on, same as PgBouncer used in this example.
 port = 6432
 
-# enable prometheus exporter on port 9930
+# Whether to enable prometheus exporter or not.
 enable_prometheus_exporter = true
 
+# Port on which the prometheus exporter listens.
+prometheus_exporter_port = 9930
+
 # How long to wait before aborting a server connection (ms).
 connect_timeout = 5000
 
 # How much time to give the health check query to return with a result (ms).
 healthcheck_timeout = 1000
 
+# How long to keep a connection available for immediate re-use without running a healthcheck query on it (ms).
+healthcheck_delay = 30000
+
 # How much time to give clients during shutdown before forcibly killing client connections (ms).
 shutdown_timeout = 60000
 
@@ -86,10 +92,14 @@ password = "sharding_user"
 # is the sum of pool_size across all users.
 pool_size = 9
 
+# Maximum query duration. Dangerous, but protects against DBs that died in a non-obvious way.
+statement_timeout = 0
+
 [pools.sharded_db.users.1]
 username = "other_user"
 password = "other_user"
 pool_size = 21
+statement_timeout = 15000
 
 # Shard 0
 [pools.sharded_db.shards.0]
@@ -127,6 +137,7 @@ sharding_function = "pg_bigint_hash"
 username = "simple_user"
 password = "simple_user"
 pool_size = 5
+statement_timeout = 0
 
 [pools.simple_db.shards.0]
 servers = [