Skip to content

Commit

Permalink
Sync with upstream (postgresml#26)
Browse files Browse the repository at this point in the history
* Prevent clients from sticking to old pools after config update (postgresml#113)

* Re-acquire pool at the beginning of Protocol loop

* Fix query router + add tests for recycling behavior

* create a prometheus exporter on a standard http port (postgresml#107)

* create a hyper server and add option to enable it in config

* move prometheus stuff to its own file; update format

* create metric type and help lookup table

* finish the metric help type map

* switch to a boolean and a standard port

* dont emit unimplemented metrics

* fail if curl returns a non 200

* resolve conflicts

* move log out of config.show and into main

* terminating new line

* upgrade curl

* include unimplemented stats

* Validates pgcat is closed after shutdown python tests (postgresml#116)

* Validates pgcat is closed after shutdown python tests

* Fix pgrep logic

* Moves sigterm step to after cleanup to decouple

* Replace subprocess with os.system for running pgcat

* fix docker compose port allocation for local dev (postgresml#117)

change docker compose port to right prometheus port

* Update CONTRIBUTING.md

* Health check delay (postgresml#118)

* initial commit of server check delay implementation

* fmt

* spelling

* Update name to last_healthcheck and some comments

* Moved server tested stat to after require_healthcheck check

* Make health check delay configurable

* Rename to last_activity

* Fix typo

* Add debug log for healthcheck

* Add address to debug log

* Speed up CI a bit (postgresml#119)

* Sleep for 1s

* use premade image

* quicker

* revert shutdown timeout

* Fix debug log (postgresml#120)

* Make prometheus port configurable (postgresml#121)

* Make prometheus port configurable

* Update circleci config

* Statement timeout + replica imbalance fix (postgresml#122)

* Statement timeout

* send error message too

* Correct error messages

* Fix replica inbalance

* disable stmt timeout by default

* Redundant mark_bad

* revert healthcheck delay

* tests

* set it to 0

* reload config again

* pgcat toml

Co-authored-by: Nicholas Dujay <[email protected]>
Co-authored-by: zainkabani <[email protected]>
Co-authored-by: Lev Kokotov <[email protected]>
Co-authored-by: Pradeep Chhetri <[email protected]>
  • Loading branch information
5 people authored Aug 15, 2022
1 parent dfe0e05 commit 0846a17
Show file tree
Hide file tree
Showing 17 changed files with 310 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
# Specify the execution environment. You can specify an image from Dockerhub or use one of our Convenience Images from CircleCI's Developer Hub.
# See: https://circleci.com/docs/2.0/configuration-reference/#docker-machine-macos-windows-executor
docker:
- image: cimg/rust:1.62.0
- image: levkk/pgcat-ci:latest
environment:
RUST_LOG: info
RUSTFLAGS: "-C instrument-coverage"
Expand Down
11 changes: 10 additions & 1 deletion .circleci/pgcat.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,21 @@ host = "0.0.0.0"
# Port to run on, same as PgBouncer used in this example.
port = 6432

# enable prometheus exporter on port 9930
# Whether to enable prometheus exporter or not.
enable_prometheus_exporter = true

# Port at which prometheus exporter listens on.
prometheus_exporter_port = 9930

# How long to wait before aborting a server connection (ms).
connect_timeout = 100

# How much time to give the health check query to return with a result (ms).
healthcheck_timeout = 100

# How long to keep connection available for immediate re-use, without running a healthcheck query on it
healthcheck_delay = 30000

# How much time to give clients during shutdown before forcibly killing client connections (ms).
shutdown_timeout = 5000

Expand Down Expand Up @@ -85,11 +91,13 @@ password = "sharding_user"
# The maximum number of connection from a single Pgcat process to any database in the cluster
# is the sum of pool_size across all users.
pool_size = 9
statement_timeout = 0

[pools.sharded_db.users.1]
username = "other_user"
password = "other_user"
pool_size = 21
statement_timeout = 30000

# Shard 0
[pools.sharded_db.shards.0]
Expand Down Expand Up @@ -127,6 +135,7 @@ sharding_function = "pg_bigint_hash"
username = "simple_user"
password = "simple_user"
pool_size = 5
statement_timeout = 30000

[pools.simple_db.shards.0]
servers = [
Expand Down
14 changes: 14 additions & 0 deletions .circleci/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,18 @@ psql -U sharding_user -e -h 127.0.0.1 -p 6432 -f tests/sharding/query_routing_te
# Replica/primary selection & more sharding tests
psql -U sharding_user -e -h 127.0.0.1 -p 6432 -f tests/sharding/query_routing_test_primary_replica.sql > /dev/null

# Statement timeout tests
sed -i 's/statement_timeout = 0/statement_timeout = 100/' .circleci/pgcat.toml
kill -SIGHUP $(pgrep pgcat) # Reload config
sleep 0.2

# This should timeout
(! psql -U sharding_user -e -h 127.0.0.1 -p 6432 -c 'select pg_sleep(0.5)')

# Disable statement timeout
sed -i 's/statement_timeout = 100/statement_timeout = 0/' .circleci/pgcat.toml
kill -SIGHUP $(pgrep pgcat) # Reload config again

#
# ActiveRecord tests
#
Expand Down Expand Up @@ -122,6 +134,8 @@ sed -i 's/pool_mode = "transaction"/pool_mode = "session"/' .circleci/pgcat.toml
# Reload config test
kill -SIGHUP $(pgrep pgcat)

sleep 1

# Prepared statements that will only work in session mode
pgbench -U sharding_user -h 127.0.0.1 -p 6432 -t 500 -c 2 --protocol prepared

Expand Down
8 changes: 1 addition & 7 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,4 @@ Happy hacking!

## TODOs

A non-exhaustive list of things that would be useful to implement:

#### Client authentication
MD5 is probably sufficient, but maybe others too.

#### Admin
Admin database for stats collection and pooler administration. PgBouncer gives us a nice example on how to do that, specifically how to implement `RowDescription` and `DataRow` messages, [example here](https://github.com/pgbouncer/pgbouncer/blob/4f9ced8e63d317a6ff45c8b0efa876b32161f6db/src/admin.c#L813).
See [Issues]([url](https://github.com/levkk/pgcat/issues)).
8 changes: 8 additions & 0 deletions Dockerfile.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
FROM cimg/rust:1.62.0
RUN sudo apt-get update && \
sudo apt-get install -y psmisc postgresql-contrib-12 postgresql-client-12 ruby ruby-dev libpq-dev python3 python3-pip lcov llvm-11 && \
sudo apt-get upgrade curl
RUN cargo install cargo-binutils rustfilt && \
rustup component add llvm-tools-preview
RUN pip3 install psycopg2 && \
sudo gem install bundler
52 changes: 29 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,29 +47,34 @@ psql -h 127.0.0.1 -p 6432 -c 'SELECT 1'

### Config

| **Name** | **Description** | **Examples** |
|-------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|
| **`general`** | | |
| `host` | The pooler will run on this host, 0.0.0.0 means accessible from everywhere. | `0.0.0.0` |
| `port` | The pooler will run on this port. | `6432` |
| `pool_size` | Maximum allowed server connections per pool. Pools are separated for each user/shard/server role. The connections are allocated as needed. | `15` |
| `pool_mode` | The pool mode to use, i.e. `session` or `transaction`. | `transaction` |
| `connect_timeout` | Maximum time to establish a connection to a server (milliseconds). If reached, the server is banned and the next target is attempted. | `5000` |
| `healthcheck_timeout` | Maximum time to pass a health check (`SELECT 1`, milliseconds). If reached, the server is banned and the next target is attempted. | `1000` |
| `shutdown_timeout` | Maximum time to give clients during shutdown before forcibly killing client connections (ms). | `60000` |
| `ban_time` | Ban time for a server (seconds). It won't be allowed to serve transactions until the ban expires; failover targets will be used instead. | `60` |
| | | |
| **`user`** | | |
| `name` | The user name. | `sharding_user` |
| `password` | The user password in plaintext. | `hunter2` |
| | | |
| **`shards`** | Shards are numerically numbered starting from 0; the order in the config is preserved by the pooler to route queries accordingly. | `[shards.0]` |
| `servers` | List of servers to connect to and their roles. A server is: `[host, port, role]`, where `role` is either `primary` or `replica`. | `["127.0.0.1", 5432, "primary"]` |
| `database` | The name of the database to connect to. This is the same on all servers that are part of one shard. | |
| **`query_router`** | | |
| `default_role` | Traffic is routed to this role by default (round-robin), unless the client specifies otherwise. Default is `any`, for any role available. | `any`, `primary`, `replica` |
| `query_parser_enabled` | Enable the query parser which will inspect incoming queries and route them to a primary or replicas. | `false` |
| `primary_reads_enabled` | Enable this to allow read queries on the primary; otherwise read queries are routed to the replicas. | `true` |
| **Name** | **Description** | **Examples** |
|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------|
| **`general`** | | |
| `host` | The pooler will run on this host, 0.0.0.0 means accessible from everywhere. | `0.0.0.0` |
| `port` | The pooler will run on this port. | `6432` |
| `enable_prometheus_exporter` | Enable prometheus exporter which will export metrics in prometheus exposition format. | `true` |
| `prometheus_exporter_port` | Port at which prometheus exporter listens on. | `9930` |
| `pool_size` | Maximum allowed server connections per pool. Pools are separated for each user/shard/server role. The connections are allocated as needed. | `15` |
| `pool_mode` | The pool mode to use, i.e. `session` or `transaction`. | `transaction` |
| `connect_timeout` | Maximum time to establish a connection to a server (milliseconds). If reached, the server is banned and the next target is attempted. | `5000` |
| `healthcheck_timeout` | Maximum time to pass a health check (`SELECT 1`, milliseconds). If reached, the server is banned and the next target is attempted. | `1000` |
| `shutdown_timeout` | Maximum time to give clients during shutdown before forcibly killing client connections (ms). | `60000` |
| `healthcheck_delay` | How long to keep connection available for immediate re-use, without running a healthcheck query on it | `30000` |
| `ban_time` | Ban time for a server (seconds). It won't be allowed to serve transactions until the ban expires; failover targets will be used instead. | `60` |
| `autoreload` | Enable auto-reload of config after fixed time-interval. | `false` |
| | | |
| **`user`** | | |
| `name` | The user name. | `sharding_user` |
| `password` | The user password in plaintext. | `hunter2` |
| | | |
| **`shards`** | Shards are numerically numbered starting from 0; the order in the config is preserved by the pooler to route queries accordingly. | `[shards.0]` |
| `servers` | List of servers to connect to and their roles. A server is: `[host, port, role]`, where `role` is either `primary` or `replica`. | `["127.0.0.1", 5432, "primary"]` |
| `database` | The name of the database to connect to. This is the same on all servers that are part of one shard. | |
| | | |
| **`query_router`** | | |
| `default_role` | Traffic is routed to this role by default (round-robin), unless the client specifies otherwise. Default is `any`, for any role available. | `any`, `primary`, `replica` |
| `query_parser_enabled` | Enable the query parser which will inspect incoming queries and route them to a primary or replicas. | `false` |
| `primary_reads_enabled` | Enable this to allow read queries on the primary; otherwise read queries are routed to the replicas. | `true` |

## Local development

Expand Down Expand Up @@ -261,6 +266,7 @@ The config can be reloaded by sending a `kill -s SIGHUP` to the process or by qu
| `connect_timeout` | yes |
| `healthcheck_timeout` | no |
| `shutdown_timeout` | no |
| `healthcheck_delay` | no |
| `ban_time` | no |
| `user` | yes |
| `shards` | yes |
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
version: "3"
services:
postgres:
image: postgres:13
image: postgres:14
environment:
POSTGRES_PASSWORD: postgres
POSTGRES_HOST_AUTH_METHOD: md5
Expand Down
8 changes: 7 additions & 1 deletion examples/docker/pgcat.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,21 @@ host = "0.0.0.0"
# Port to run on, same as PgBouncer used in this example.
port = 6432

# enable prometheus exporter on port 9930
# Whether to enable prometheus exporter or not.
enable_prometheus_exporter = true

# Port at which prometheus exporter listens on.
prometheus_exporter_port = 9930

# How long to wait before aborting a server connection (ms).
connect_timeout = 5000

# How much time to give `SELECT 1` health check query to return with a result (ms).
healthcheck_timeout = 1000

# How long to keep connection available for immediate re-use, without running a healthcheck query on it
healthcheck_delay = 30000

# How much time to give clients during shutdown before forcibly killing client connections (ms).
shutdown_timeout = 60000

Expand Down
13 changes: 12 additions & 1 deletion pgcat.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,21 @@ host = "0.0.0.0"
# Port to run on, same as PgBouncer used in this example.
port = 6432

# enable prometheus exporter on port 9930
# Whether to enable prometheus exporter or not.
enable_prometheus_exporter = true

# Port at which prometheus exporter listens on.
prometheus_exporter_port = 9930

# How long to wait before aborting a server connection (ms).
connect_timeout = 5000

# How much time to give the health check query to return with a result (ms).
healthcheck_timeout = 1000

# How long to keep connection available for immediate re-use, without running a healthcheck query on it
healthcheck_delay = 30000

# How much time to give clients during shutdown before forcibly killing client connections (ms).
shutdown_timeout = 60000

Expand Down Expand Up @@ -86,10 +92,14 @@ password = "sharding_user"
# is the sum of pool_size across all users.
pool_size = 9

# Maximum query duration. Dangerous, but protetcts against DBs that died and a non-obvious way.
statement_timeout = 0

[pools.sharded_db.users.1]
username = "other_user"
password = "other_user"
pool_size = 21
statement_timeout = 15000

# Shard 0
[pools.sharded_db.shards.0]
Expand Down Expand Up @@ -127,6 +137,7 @@ sharding_function = "pg_bigint_hash"
username = "simple_user"
password = "simple_user"
pool_size = 5
statement_timeout = 0

[pools.simple_db.shards.0]
servers = [
Expand Down
Loading

0 comments on commit 0846a17

Please sign in to comment.