diff --git a/.github/workflows/check-rust.yml b/.github/workflows/check-rust.yml
index 0c9c3879f..0842d06df 100644
--- a/.github/workflows/check-rust.yml
+++ b/.github/workflows/check-rust.yml
@@ -33,7 +33,7 @@ jobs:
- name: Rust install
uses: dtolnay/rust-toolchain@master
with:
- toolchain: 1.73.0
+ toolchain: 1.75.0
components: clippy, rustfmt
- name: Install Linux requirements
# TODO: When ubuntu-latest gets updated to >= 23.04 replace the wget+unzip with just protobuf-compiler in apt
diff --git a/.github/workflows/run-tarpaulin.yml b/.github/workflows/run-tarpaulin.yml
index 6d902bd47..3e10c3532 100644
--- a/.github/workflows/run-tarpaulin.yml
+++ b/.github/workflows/run-tarpaulin.yml
@@ -16,7 +16,7 @@ on:
env:
CARGO_TERM_COLOR: always
- CARGO_VERSION: 1.73.0
+ CARGO_VERSION: 1.75.0
jobs:
build:
diff --git a/Cargo.lock b/Cargo.lock
index aeb7aeb6d..3a5c301b7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -41,7 +41,7 @@ dependencies = [
"flate2",
"futures-core",
"h2",
- "http",
+ "http 0.2.11",
"httparse",
"httpdate",
"itoa",
@@ -76,7 +76,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d22475596539443685426b6bdadb926ad0ecaefdfc5fb05e5e3441f15463c511"
dependencies = [
"bytestring",
- "http",
+ "http 0.2.11",
"regex",
"serde",
"tracing",
@@ -134,8 +134,8 @@ dependencies = [
"impl-more",
"openssl",
"pin-project-lite",
- "rustls",
- "rustls-webpki",
+ "rustls 0.21.10",
+ "rustls-webpki 0.101.7",
"tokio",
"tokio-openssl",
"tokio-util",
@@ -234,19 +234,19 @@ dependencies = [
"async-stream",
"async-trait",
"blake2",
- "env_logger",
+ "env_logger 0.10.1",
"futures",
- "hyper",
+ "hyper 0.14.28",
"itertools 0.12.1",
- "k8s-openapi 0.20.0",
- "kube 0.87.2",
+ "k8s-openapi 0.22.0",
+ "kube 0.91.0",
"kube-runtime",
"lazy_static",
"log",
"mock_instant",
"mockall",
"mockall_double",
- "prometheus",
+ "prometheus 0.12.0",
"prost",
"serde",
"serde_derive",
@@ -333,9 +333,9 @@ dependencies = [
"base64 0.13.1",
"bytes",
"chrono",
- "env_logger",
+ "env_logger 0.10.1",
"futures-util",
- "hyper",
+ "hyper 0.14.28",
"log",
"mockall",
"serde",
@@ -378,12 +378,12 @@ dependencies = [
"anyhow",
"async-trait",
"either",
- "env_logger",
- "k8s-openapi 0.20.0",
- "kube 0.87.2",
+ "env_logger 0.10.1",
+ "k8s-openapi 0.22.0",
+ "kube 0.91.0",
"log",
"mockall",
- "prometheus",
+ "prometheus 0.12.0",
"rand",
"schemars",
"serde",
@@ -403,7 +403,7 @@ dependencies = [
"akri-discovery-utils",
"anyhow",
"async-trait",
- "env_logger",
+ "env_logger 0.10.1",
"log",
"mockall",
"pest",
@@ -456,23 +456,24 @@ dependencies = [
[[package]]
name = "anstream"
-version = "0.6.5"
+version = "0.6.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6"
+checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
+ "is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
-version = "1.0.4"
+version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87"
+checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
[[package]]
name = "anstyle-parse"
@@ -508,6 +509,18 @@ version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"
+[[package]]
+name = "async-broadcast"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20cd0e2e25ea8e5f7e9df04578dc6cf5c83577fd09b1a46aaf5c85e1c33f2a7e"
+dependencies = [
+ "event-listener 5.3.1",
+ "event-listener-strategy 0.5.2",
+ "futures-core",
+ "pin-project-lite",
+]
+
[[package]]
name = "async-channel"
version = "1.9.0"
@@ -527,7 +540,7 @@ checksum = "1ca33f4bc4ed1babef42cad36cc1f51fa88be00420404e5b1e80ab1b18f7678c"
dependencies = [
"concurrent-queue",
"event-listener 4.0.3",
- "event-listener-strategy",
+ "event-listener-strategy 0.4.0",
"futures-core",
"pin-project-lite",
]
@@ -616,7 +629,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d034b430882f8381900d3fe6f0aaa3ad94f2cb4ac519b429692a1bc2dda4ae7b"
dependencies = [
"event-listener 4.0.3",
- "event-listener-strategy",
+ "event-listener-strategy 0.4.0",
"pin-project-lite",
]
@@ -708,9 +721,9 @@ dependencies = [
"bitflags 1.3.2",
"bytes",
"futures-util",
- "http",
- "http-body",
- "hyper",
+ "http 0.2.11",
+ "http-body 0.4.6",
+ "hyper 0.14.28",
"itoa",
"matchit",
"memchr",
@@ -734,8 +747,8 @@ dependencies = [
"async-trait",
"bytes",
"futures-util",
- "http",
- "http-body",
+ "http 0.2.11",
+ "http-body 0.4.6",
"mime",
"rustversion",
"tower-layer",
@@ -786,6 +799,12 @@ version = "0.21.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c79fed4cdb43e993fcdadc7e58a09fd0e3e649c4436fa11da71c9f1f3ee7feb9"
+[[package]]
+name = "base64"
+version = "0.22.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
+
[[package]]
name = "bitflags"
version = "1.3.2"
@@ -900,9 +919,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
-version = "0.4.31"
+version = "0.4.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
+checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401"
dependencies = [
"android-tzdata",
"iana-time-zone",
@@ -910,7 +929,7 @@ dependencies = [
"num-traits",
"serde",
"wasm-bindgen",
- "windows-targets 0.48.5",
+ "windows-targets 0.52.0",
]
[[package]]
@@ -963,16 +982,17 @@ dependencies = [
"anyhow",
"async-std",
"chrono",
- "env_logger",
+ "either",
+ "env_logger 0.11.5",
"futures",
- "k8s-openapi 0.20.0",
- "kube 0.87.2",
- "kube-runtime",
+ "k8s-openapi 0.22.0",
+ "kube 0.91.0",
"lazy_static",
"log",
"mockall",
- "prometheus",
+ "prometheus 0.13.4",
"serde_json",
+ "thiserror",
"tokio",
]
@@ -1135,7 +1155,7 @@ version = "0.13.1"
dependencies = [
"akri-debug-echo",
"akri-discovery-utils",
- "env_logger",
+ "env_logger 0.10.1",
"log",
"tokio",
]
@@ -1227,9 +1247,9 @@ checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d"
[[package]]
name = "either"
-version = "1.9.0"
+version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
+checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "encoding_rs"
@@ -1240,6 +1260,16 @@ dependencies = [
"cfg-if",
]
+[[package]]
+name = "env_filter"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab"
+dependencies = [
+ "log",
+ "regex",
+]
+
[[package]]
name = "env_logger"
version = "0.10.1"
@@ -1253,6 +1283,19 @@ dependencies = [
"termcolor",
]
+[[package]]
+name = "env_logger"
+version = "0.11.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "env_filter",
+ "humantime",
+ "log",
+]
+
[[package]]
name = "equivalent"
version = "1.0.1"
@@ -1286,6 +1329,17 @@ dependencies = [
"pin-project-lite",
]
+[[package]]
+name = "event-listener"
+version = "5.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba"
+dependencies = [
+ "concurrent-queue",
+ "parking",
+ "pin-project-lite",
+]
+
[[package]]
name = "event-listener-strategy"
version = "0.4.0"
@@ -1296,6 +1350,16 @@ dependencies = [
"pin-project-lite",
]
+[[package]]
+name = "event-listener-strategy"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1"
+dependencies = [
+ "event-listener 5.3.1",
+ "pin-project-lite",
+]
+
[[package]]
name = "fastrand"
version = "1.9.0"
@@ -1539,7 +1603,7 @@ dependencies = [
"futures-core",
"futures-sink",
"futures-util",
- "http",
+ "http 0.2.11",
"indexmap 2.1.0",
"slab",
"tokio",
@@ -1572,7 +1636,7 @@ dependencies = [
"base64 0.21.6",
"bytes",
"headers-core",
- "http",
+ "http 0.2.11",
"httpdate",
"mime",
"sha1 0.10.6",
@@ -1584,7 +1648,7 @@ version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429"
dependencies = [
- "http",
+ "http 0.2.11",
]
[[package]]
@@ -1634,6 +1698,17 @@ dependencies = [
"itoa",
]
+[[package]]
+name = "http"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258"
+dependencies = [
+ "bytes",
+ "fnv",
+ "itoa",
+]
+
[[package]]
name = "http-body"
version = "0.4.6"
@@ -1641,7 +1716,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2"
dependencies = [
"bytes",
- "http",
+ "http 0.2.11",
+ "pin-project-lite",
+]
+
+[[package]]
+name = "http-body"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
+dependencies = [
+ "bytes",
+ "http 1.1.0",
+]
+
+[[package]]
+name = "http-body-util"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f"
+dependencies = [
+ "bytes",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.1",
"pin-project-lite",
]
@@ -1680,8 +1778,8 @@ dependencies = [
"futures-core",
"futures-util",
"h2",
- "http",
- "http-body",
+ "http 0.2.11",
+ "http-body 0.4.6",
"httparse",
"httpdate",
"itoa",
@@ -1693,14 +1791,33 @@ dependencies = [
"want",
]
+[[package]]
+name = "hyper"
+version = "1.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05"
+dependencies = [
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.1",
+ "httparse",
+ "itoa",
+ "pin-project-lite",
+ "smallvec",
+ "tokio",
+ "want",
+]
+
[[package]]
name = "hyper-openssl"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6ee5d7a8f718585d1c3c61dfde28ef5b0bb14734b4db13f5ada856cdc6c612b"
dependencies = [
- "http",
- "hyper",
+ "http 0.2.11",
+ "hyper 0.14.28",
"linked_hash_set",
"once_cell",
"openssl",
@@ -1713,18 +1830,21 @@ dependencies = [
[[package]]
name = "hyper-rustls"
-version = "0.24.2"
+version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590"
+checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155"
dependencies = [
"futures-util",
- "http",
- "hyper",
+ "http 1.1.0",
+ "hyper 1.4.1",
+ "hyper-util",
"log",
- "rustls",
+ "rustls 0.23.12",
"rustls-native-certs",
+ "rustls-pki-types",
"tokio",
- "tokio-rustls",
+ "tokio-rustls 0.26.0",
+ "tower-service",
]
[[package]]
@@ -1733,12 +1853,45 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [
- "hyper",
+ "hyper 0.14.28",
"pin-project-lite",
"tokio",
"tokio-io-timeout",
]
+[[package]]
+name = "hyper-timeout"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793"
+dependencies = [
+ "hyper 1.4.1",
+ "hyper-util",
+ "pin-project-lite",
+ "tokio",
+ "tower-service",
+]
+
+[[package]]
+name = "hyper-util"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956"
+dependencies = [
+ "bytes",
+ "futures-channel",
+ "futures-util",
+ "http 1.1.0",
+ "http-body 1.0.1",
+ "hyper 1.4.1",
+ "pin-project-lite",
+ "socket2 0.5.5",
+ "tokio",
+ "tower",
+ "tower-service",
+ "tracing",
+]
+
[[package]]
name = "iana-time-zone"
version = "0.1.59"
@@ -1852,6 +2005,12 @@ dependencies = [
"windows-sys 0.52.0",
]
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
[[package]]
name = "itertools"
version = "0.11.0"
@@ -1908,10 +2067,12 @@ dependencies = [
[[package]]
name = "jsonpath-rust"
-version = "0.3.5"
+version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829"
+checksum = "19d8fe85bd70ff715f31ce8c739194b423d79811a19602115d611a3ec85d6200"
dependencies = [
+ "lazy_static",
+ "once_cell",
"pest",
"pest_derive",
"regex",
@@ -1947,12 +2108,11 @@ dependencies = [
[[package]]
name = "k8s-openapi"
-version = "0.20.0"
+version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6"
+checksum = "19501afb943ae5806548bc3ebd7f3374153ca057a38f480ef30adfde5ef09755"
dependencies = [
- "base64 0.21.6",
- "bytes",
+ "base64 0.22.1",
"chrono",
"schemars",
"serde",
@@ -1974,14 +2134,15 @@ dependencies = [
[[package]]
name = "kube"
-version = "0.87.2"
+version = "0.91.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3499c8d60c763246c7a213f51caac1e9033f46026904cb89bc8951ae8601f26e"
+checksum = "264461a7ebf4fb0fcf23e4c7e4f9387c5696ee61d003de207d9b5a895ff37bfa"
dependencies = [
- "k8s-openapi 0.20.0",
- "kube-client 0.87.2",
- "kube-core 0.87.2",
- "kube-derive 0.87.2",
+ "k8s-openapi 0.22.0",
+ "kube-client 0.91.0",
+ "kube-core 0.91.0",
+ "kube-derive 0.91.0",
+ "kube-runtime",
]
[[package]]
@@ -1996,11 +2157,11 @@ dependencies = [
"dirs-next",
"either",
"futures",
- "http",
- "http-body",
- "hyper",
+ "http 0.2.11",
+ "http-body 0.4.6",
+ "hyper 0.14.28",
"hyper-openssl",
- "hyper-timeout",
+ "hyper-timeout 0.4.1",
"jsonpath_lib",
"k8s-openapi 0.17.0",
"kube-core 0.80.0",
@@ -2021,28 +2182,29 @@ dependencies = [
[[package]]
name = "kube-client"
-version = "0.87.2"
+version = "0.91.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "033450dfa0762130565890dadf2f8835faedf749376ca13345bcd8ecd6b5f29f"
+checksum = "47164ad6c47398ee4bdf90509c7b44026229721cb1377eb4623a1ec2a00a85e9"
dependencies = [
- "base64 0.21.6",
+ "base64 0.22.1",
"bytes",
"chrono",
"either",
"futures",
"home",
- "http",
- "http-body",
- "hyper",
+ "http 1.1.0",
+ "http-body 1.0.1",
+ "http-body-util",
+ "hyper 1.4.1",
"hyper-rustls",
- "hyper-timeout",
+ "hyper-timeout 0.5.1",
+ "hyper-util",
"jsonpath-rust",
- "k8s-openapi 0.20.0",
- "kube-core 0.87.2",
+ "k8s-openapi 0.22.0",
+ "kube-core 0.91.0",
"pem 3.0.3",
- "pin-project",
- "rustls",
- "rustls-pemfile",
+ "rustls 0.23.12",
+ "rustls-pemfile 2.1.2",
"secrecy",
"serde",
"serde_json",
@@ -2051,7 +2213,7 @@ dependencies = [
"tokio",
"tokio-util",
"tower",
- "tower-http 0.4.4",
+ "tower-http 0.5.2",
"tracing",
]
@@ -2063,7 +2225,7 @@ checksum = "98331c6f1354893f7c50da069e43a3fd1c84e55bbedc7765d9db22ec3291d07d"
dependencies = [
"chrono",
"form_urlencoded",
- "http",
+ "http 0.2.11",
"k8s-openapi 0.17.0",
"once_cell",
"schemars",
@@ -2074,16 +2236,15 @@ dependencies = [
[[package]]
name = "kube-core"
-version = "0.87.2"
+version = "0.91.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae"
+checksum = "2797d3044a238825432129cd9537e12c2a6dacbbb5352381af5ea55e1505ed4f"
dependencies = [
"chrono",
"form_urlencoded",
- "http",
+ "http 1.1.0",
"json-patch",
- "k8s-openapi 0.20.0",
- "once_cell",
+ "k8s-openapi 0.22.0",
"schemars",
"serde",
"serde_json",
@@ -2105,9 +2266,9 @@ dependencies = [
[[package]]
name = "kube-derive"
-version = "0.87.2"
+version = "0.91.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91e98dd5e5767c7b894c1f0e41fd628b145f808e981feb8b08ed66455d47f1a4"
+checksum = "fcf837edaa0c478f85e9a3cddb17fa80d58a57c1afa722b3a9e55753ea162f41"
dependencies = [
"darling 0.20.5",
"proc-macro2",
@@ -2118,19 +2279,21 @@ dependencies = [
[[package]]
name = "kube-runtime"
-version = "0.87.2"
+version = "0.91.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d8893eb18fbf6bb6c80ef6ee7dd11ec32b1dc3c034c988ac1b3a84d46a230ae"
+checksum = "e463e89a1fb222c65a5469b568803153d1bf13d084a8dd42b659e6cca66edc6e"
dependencies = [
"ahash",
+ "async-broadcast",
+ "async-stream",
"async-trait",
"backoff",
"derivative",
"futures",
"hashbrown 0.14.3",
"json-patch",
- "k8s-openapi 0.20.0",
- "kube-client 0.87.2",
+ "k8s-openapi 0.22.0",
+ "kube-client 0.91.0",
"parking_lot 0.12.1",
"pin-project",
"serde",
@@ -2246,9 +2409,9 @@ dependencies = [
[[package]]
name = "log"
-version = "0.4.20"
+version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
+checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
dependencies = [
"value-bag",
]
@@ -2365,7 +2528,7 @@ dependencies = [
"bytes",
"encoding_rs",
"futures-util",
- "http",
+ "http 0.2.11",
"httparse",
"log",
"memchr",
@@ -2420,7 +2583,7 @@ version = "0.13.1"
dependencies = [
"akri-discovery-utils",
"akri-onvif",
- "env_logger",
+ "env_logger 0.10.1",
"log",
"tokio",
]
@@ -2469,7 +2632,7 @@ version = "0.13.1"
dependencies = [
"akri-discovery-utils",
"akri-opcua",
- "env_logger",
+ "env_logger 0.10.1",
"log",
"tokio",
]
@@ -2842,6 +3005,29 @@ dependencies = [
"libc",
]
+[[package]]
+name = "procfs"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4"
+dependencies = [
+ "bitflags 2.4.1",
+ "hex",
+ "lazy_static",
+ "procfs-core",
+ "rustix 0.38.28",
+]
+
+[[package]]
+name = "procfs-core"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29"
+dependencies = [
+ "bitflags 2.4.1",
+ "hex",
+]
+
[[package]]
name = "prometheus"
version = "0.12.0"
@@ -2854,7 +3040,24 @@ dependencies = [
"libc",
"memchr",
"parking_lot 0.11.2",
- "procfs",
+ "procfs 0.9.1",
+ "protobuf",
+ "thiserror",
+]
+
+[[package]]
+name = "prometheus"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+dependencies = [
+ "cfg-if",
+ "fnv",
+ "lazy_static",
+ "libc",
+ "memchr",
+ "parking_lot 0.12.1",
+ "procfs 0.16.0",
"protobuf",
"thiserror",
]
@@ -3034,9 +3237,9 @@ dependencies = [
"futures-core",
"futures-util",
"h2",
- "http",
- "http-body",
- "hyper",
+ "http 0.2.11",
+ "http-body 0.4.6",
+ "hyper 0.14.28",
"ipnet",
"js-sys",
"log",
@@ -3131,18 +3334,34 @@ checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba"
dependencies = [
"log",
"ring",
- "rustls-webpki",
+ "rustls-webpki 0.101.7",
"sct",
]
+[[package]]
+name = "rustls"
+version = "0.23.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044"
+dependencies = [
+ "log",
+ "once_cell",
+ "ring",
+ "rustls-pki-types",
+ "rustls-webpki 0.102.6",
+ "subtle",
+ "zeroize",
+]
+
[[package]]
name = "rustls-native-certs"
-version = "0.6.3"
+version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00"
+checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba"
dependencies = [
"openssl-probe",
- "rustls-pemfile",
+ "rustls-pemfile 2.1.2",
+ "rustls-pki-types",
"schannel",
"security-framework",
]
@@ -3156,6 +3375,22 @@ dependencies = [
"base64 0.21.6",
]
+[[package]]
+name = "rustls-pemfile"
+version = "2.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d"
+dependencies = [
+ "base64 0.22.1",
+ "rustls-pki-types",
+]
+
+[[package]]
+name = "rustls-pki-types"
+version = "1.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d"
+
[[package]]
name = "rustls-webpki"
version = "0.101.7"
@@ -3166,6 +3401,17 @@ dependencies = [
"untrusted",
]
+[[package]]
+name = "rustls-webpki"
+version = "0.102.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e"
+dependencies = [
+ "ring",
+ "rustls-pki-types",
+ "untrusted",
+]
+
[[package]]
name = "rustversion"
version = "1.0.14"
@@ -3419,9 +3665,9 @@ dependencies = [
[[package]]
name = "smallvec"
-version = "1.11.2"
+version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970"
+checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "socket2"
@@ -3681,7 +3927,18 @@ version = "0.24.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081"
dependencies = [
- "rustls",
+ "rustls 0.21.10",
+ "tokio",
+]
+
+[[package]]
+name = "tokio-rustls"
+version = "0.26.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
+dependencies = [
+ "rustls 0.23.12",
+ "rustls-pki-types",
"tokio",
]
@@ -3736,17 +3993,17 @@ dependencies = [
"base64 0.21.6",
"bytes",
"h2",
- "http",
- "http-body",
- "hyper",
- "hyper-timeout",
+ "http 0.2.11",
+ "http-body 0.4.6",
+ "hyper 0.14.28",
+ "hyper-timeout 0.4.1",
"percent-encoding 2.3.1",
"pin-project",
"prost",
- "rustls",
- "rustls-pemfile",
+ "rustls 0.21.10",
+ "rustls-pemfile 1.0.4",
"tokio",
- "tokio-rustls",
+ "tokio-rustls 0.24.1",
"tokio-stream",
"tower",
"tower-layer",
@@ -3798,8 +4055,8 @@ dependencies = [
"bytes",
"futures-core",
"futures-util",
- "http",
- "http-body",
+ "http 0.2.11",
+ "http-body 0.4.6",
"http-range-header",
"pin-project-lite",
"tower-layer",
@@ -3809,18 +4066,16 @@ dependencies = [
[[package]]
name = "tower-http"
-version = "0.4.4"
+version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140"
+checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5"
dependencies = [
"base64 0.21.6",
"bitflags 2.4.1",
"bytes",
- "futures-core",
- "futures-util",
- "http",
- "http-body",
- "http-range-header",
+ "http 1.1.0",
+ "http-body 1.0.1",
+ "http-body-util",
"mime",
"pin-project-lite",
"tower-layer",
@@ -3896,7 +4151,7 @@ dependencies = [
"byteorder",
"bytes",
"data-encoding",
- "http",
+ "http 0.2.11",
"httparse",
"log",
"rand",
@@ -3940,7 +4195,7 @@ version = "0.13.1"
dependencies = [
"akri-discovery-utils",
"akri-udev",
- "env_logger",
+ "env_logger 0.10.1",
"log",
"tokio",
]
@@ -3950,10 +4205,10 @@ name = "udev-video-broker"
version = "0.13.1"
dependencies = [
"akri-shared",
- "env_logger",
+ "env_logger 0.10.1",
"lazy_static",
"log",
- "prometheus",
+ "prometheus 0.12.0",
"prost",
"regex",
"rscam",
@@ -4064,9 +4319,9 @@ dependencies = [
[[package]]
name = "value-bag"
-version = "1.6.0"
+version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7cdbaf5e132e593e9fc1de6a15bbec912395b11fb9719e061cf64f804524c503"
+checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101"
[[package]]
name = "vcpkg"
@@ -4105,15 +4360,15 @@ dependencies = [
"futures-channel",
"futures-util",
"headers",
- "http",
- "hyper",
+ "http 0.2.11",
+ "hyper 0.14.28",
"log",
"mime",
"mime_guess",
"multer",
"percent-encoding 2.3.1",
"pin-project",
- "rustls-pemfile",
+ "rustls-pemfile 1.0.4",
"scoped-tls",
"serde",
"serde_json",
diff --git a/README.md b/README.md
index 80990c109..f7516ebb3 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
[![Slack channel #akri](https://img.shields.io/badge/slack-akri-blueviolet.svg?logo=slack)](https://kubernetes.slack.com/messages/akri)
-[![Rust Version](https://img.shields.io/badge/rustc-1.73.0-blue.svg)](https://blog.rust-lang.org/2023/03/31/Rust-1.73.0.html)
+[![Rust Version](https://img.shields.io/badge/rustc-1.75.0-blue.svg)](https://blog.rust-lang.org/2023/12/28/Rust-1.75.0.html)
[![Kubernetes Version](https://img.shields.io/badge/kubernetes-≥%201.16-blue.svg)](https://kubernetes.io/)
[![codecov](https://codecov.io/gh/project-akri/akri/branch/main/graph/badge.svg?token=V468HO7CDE)](https://codecov.io/gh/project-akri/akri)
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/5339/badge)](https://bestpractices.coreinfrastructure.org/projects/5339)
diff --git a/agent/Cargo.toml b/agent/Cargo.toml
index ad389ad13..ec7cd42fa 100644
--- a/agent/Cargo.toml
+++ b/agent/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.13.1"
license = "Apache-2.0"
authors = ["Kate Goldenring ", ""]
edition = "2021"
-rust-version = "1.73.0"
+rust-version = "1.75.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -23,9 +23,9 @@ env_logger = "0.10.0"
futures = { version = "0.3.1", package = "futures" }
hyper = "0.14.2"
itertools = "0.12.0"
-k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] }
-kube = { version = "0.87.1", features = ["derive"] }
-kube-runtime = { version = "0.87.1", features = ["unstable-runtime-reconcile-on"] }
+k8s-openapi = { version = "0.22.0", default-features = false, features = ["schemars", "v1_25"] }
+kube = { version = "0.91.0", features = [ "derive", "runtime"] }
+kube-runtime = { version = "0.91.0", features = ["unstable-runtime-reconcile-on"] }
lazy_static = "1.4"
log = "0.4"
mockall_double = "0.3.1"
diff --git a/agent/src/discovery_handler_manager/discovery_handler_registry.rs b/agent/src/discovery_handler_manager/discovery_handler_registry.rs
index c8c558cda..755898d67 100644
--- a/agent/src/discovery_handler_manager/discovery_handler_registry.rs
+++ b/agent/src/discovery_handler_manager/discovery_handler_registry.rs
@@ -15,7 +15,7 @@ use futures::future::try_join_all;
use futures::FutureExt;
use itertools::Itertools;
use kube::core::ObjectMeta;
-use kube_runtime::reflector::ObjectRef;
+use kube::runtime::reflector::ObjectRef;
use tokio::select;
use tokio::sync::mpsc;
use tokio::sync::watch;
diff --git a/agent/src/discovery_handler_manager/mod.rs b/agent/src/discovery_handler_manager/mod.rs
index 6a4c1434d..e4317774e 100644
--- a/agent/src/discovery_handler_manager/mod.rs
+++ b/agent/src/discovery_handler_manager/mod.rs
@@ -9,7 +9,7 @@ use std::{collections::HashMap, sync::Arc};
use akri_shared::{akri::configuration::Configuration, k8s::api::IntoApi};
use k8s_openapi::api::core::v1::{ConfigMap, Secret};
-use kube_runtime::reflector::ObjectRef;
+use kube::runtime::reflector::ObjectRef;
use thiserror::Error;
use tokio::sync::{mpsc, watch};
diff --git a/agent/src/util/discovery_configuration_controller.rs b/agent/src/util/discovery_configuration_controller.rs
index 496a783fa..8998e97cc 100644
--- a/agent/src/util/discovery_configuration_controller.rs
+++ b/agent/src/util/discovery_configuration_controller.rs
@@ -18,12 +18,12 @@ use crate::discovery_handler_manager::{
discovery_handler_registry::DiscoveryHandlerRegistry, DiscoveryError,
};
-use kube::{Resource, ResourceExt};
-use kube_runtime::{
+use kube::runtime::{
controller::Action,
reflector::{ObjectRef, Store},
Controller,
};
+use kube::{Resource, ResourceExt};
use thiserror::Error;
#[derive(Debug, Error)]
diff --git a/build/containers/Dockerfile.rust b/build/containers/Dockerfile.rust
index 440d98943..55dda3780 100644
--- a/build/containers/Dockerfile.rust
+++ b/build/containers/Dockerfile.rust
@@ -1,6 +1,6 @@
FROM --platform=$BUILDPLATFORM tonistiigi/xx:master AS xx
-FROM --platform=$BUILDPLATFORM rust:1.74-slim-bookworm AS build
+FROM --platform=$BUILDPLATFORM rust:1.75-slim-bookworm AS build
RUN rustup component add rustfmt
RUN apt-get update && apt-get install -y clang lld protobuf-compiler pkg-config mmdebstrap wget
COPY --from=xx / /
diff --git a/build/setup.sh b/build/setup.sh
index 9c16b080a..7cb501657 100755
--- a/build/setup.sh
+++ b/build/setup.sh
@@ -23,10 +23,10 @@ then
if [ -x "$(command -v sudo)" ];
then
echo "Install rustup"
- sudo curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=1.73.0
+ sudo curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=1.75.0
else
echo "Install rustup"
- curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=1.73.0
+ curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=1.75.0
fi
else
echo "Found rustup"
diff --git a/controller/Cargo.toml b/controller/Cargo.toml
index 797bb7608..13c551325 100644
--- a/controller/Cargo.toml
+++ b/controller/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.13.1"
license = "Apache-2.0"
authors = ["", ""]
edition = "2021"
-rust-version = "1.73.0"
+rust-version = "1.75.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -13,14 +13,17 @@ akri-shared = { path = "../shared" }
anyhow = "1.0.38"
async-std = "1.5.0"
chrono = "0.4.10"
-env_logger = "0.10.0"
+either = "1.13"
+env_logger = "0.11.5"
futures = "0.3.1"
-k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] }
-kube = { version = "0.87.1", features = ["derive"] }
-kube-runtime = "0.87.1"
+k8s-openapi = { version = "0.22.0", default-features = false, features = ["schemars", "v1_25"] }
+kube = { version = "0.91.0", features = ["runtime", "client", "derive" ] }
lazy_static = "1.4"
log = "0.4"
-prometheus = { version = "0.12.0", features = ["process"] }
+prometheus = { version = "0.13.4", features = ["process"] }
+# Used for patch API
+serde_json = "1.0.45"
+thiserror = "1"
tokio = { version = "1.0.2", features = ["full"] }
[dev-dependencies]
diff --git a/controller/src/main.rs b/controller/src/main.rs
index 82d6d0c35..31aab21be 100644
--- a/controller/src/main.rs
+++ b/controller/src/main.rs
@@ -2,11 +2,15 @@
extern crate lazy_static;
mod util;
-use akri_shared::akri::{metrics::run_metrics_server, API_NAMESPACE};
-use async_std::sync::Mutex;
+use akri_shared::{
+ akri::{metrics::run_metrics_server, API_NAMESPACE},
+ k8s::AKRI_CONFIGURATION_LABEL_NAME,
+};
+use futures::StreamExt;
+use kube::runtime::{watcher::Config, Controller};
use prometheus::IntGaugeVec;
use std::sync::Arc;
-use util::{instance_action, node_watcher, pod_watcher};
+use util::{controller_ctx::ControllerContext, instance_action, node_watcher, pod_watcher};
/// Length of time to sleep between controller system validation checks
pub const SYSTEM_CHECK_DELAY_SECS: u64 = 30;
@@ -33,45 +33,37 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>> {
log::info!("{} Controller logging started", API_NAMESPACE);
- let synchronization = Arc::new(Mutex::new(()));
- let instance_watch_synchronization = synchronization.clone();
- let mut tasks = Vec::new();
-
// Start server for prometheus metrics
- tasks.push(tokio::spawn(async move {
- run_metrics_server().await.unwrap();
- }));
+ tokio::spawn(run_metrics_server());
+ let client = Arc::new(kube::Client::try_default().await?);
+ let controller_ctx = Arc::new(ControllerContext::new(client.clone()));
+ let node_watcher_ctx = controller_ctx.clone();
+ let pod_watcher_ctx = controller_ctx.clone();
+
+ node_watcher::check(client.clone()).await?;
+ let node_controller = Controller::new(
+ node_watcher_ctx.client.all().as_inner(),
+ Config::default().any_semantic(),
+ )
+ .shutdown_on_signal()
+ .run(
+ node_watcher::reconcile,
+ node_watcher::error_policy,
+ node_watcher_ctx,
+ )
+ .filter_map(|x| async move { std::result::Result::ok(x) })
+ .for_each(|_| futures::future::ready(()));
- // Handle existing instances
- tasks.push(tokio::spawn({
- async move {
- instance_action::handle_existing_instances().await.unwrap();
- }
- }));
- // Handle instance changes
- tasks.push(tokio::spawn({
- async move {
- instance_action::do_instance_watch(instance_watch_synchronization)
- .await
- .unwrap();
- }
- }));
- // Watch for node disappearance
- tasks.push(tokio::spawn({
- async move {
- let mut node_watcher = node_watcher::NodeWatcher::new();
- node_watcher.watch().await.unwrap();
- }
- }));
- // Watch for broker Pod state changes
- tasks.push(tokio::spawn({
- async move {
- let mut broker_pod_watcher = pod_watcher::BrokerPodWatcher::new();
- broker_pod_watcher.watch().await.unwrap();
- }
- }));
+ pod_watcher::check(client.clone()).await?;
+ let pod_controller = Controller::new(
+ pod_watcher_ctx.client.all().as_inner(),
+ Config::default().labels(AKRI_CONFIGURATION_LABEL_NAME),
+ )
+ .shutdown_on_signal()
+ .run(
+ pod_watcher::reconcile,
+ pod_watcher::error_policy,
+ pod_watcher_ctx,
+ )
+ .filter_map(|x| async move { std::result::Result::ok(x) })
+ .for_each(|_| futures::future::ready(()));
- futures::future::try_join_all(tasks).await?;
+ tokio::select! {
+ _ = futures::future::join(node_controller, pod_controller) => {},
+ _ = instance_action::run(client) => {}
+ }
log::info!("{} Controller end", API_NAMESPACE);
Ok(())
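
The main.rs rewrite above replaces Akri's hand-rolled watch tasks with kube's controller runtime: each watcher becomes a `Controller` driving a `reconcile` function plus an `error_policy`. A minimal, self-contained sketch of that pattern, assuming kube 0.91 with the `runtime` feature and using a placeholder `MyCtx` context rather than Akri's types, looks like this:

```rust
// Minimal sketch of the kube controller pattern adopted above; `MyCtx` and
// the no-op reconciler are illustrative stand-ins, not part of this PR.
use std::{sync::Arc, time::Duration};

use futures::StreamExt;
use k8s_openapi::api::core::v1::Node;
use kube::{
    runtime::{controller::Action, watcher::Config, Controller},
    Api, Client,
};

struct MyCtx;

async fn reconcile(_node: Arc<Node>, _ctx: Arc<MyCtx>) -> Result<Action, kube::Error> {
    // Revisit this object periodically even if nothing changes.
    Ok(Action::requeue(Duration::from_secs(300)))
}

fn error_policy(_node: Arc<Node>, _err: &kube::Error, _ctx: Arc<MyCtx>) -> Action {
    // Back off briefly after a failed reconcile, then retry.
    Action::requeue(Duration::from_secs(5))
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let nodes: Api<Node> = Api::all(Client::try_default().await?);
    Controller::new(nodes, Config::default())
        .shutdown_on_signal()
        .run(reconcile, error_policy, Arc::new(MyCtx))
        // Drop per-object results; a real controller would log errors here.
        .filter_map(|res| async move { res.ok() })
        .for_each(|_| futures::future::ready(()))
        .await;
    Ok(())
}
```

The `filter_map`/`for_each` drain at the end mirrors how main.rs runs `node_controller` and `pod_controller` to completion.
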
diff --git a/controller/src/util/controller_ctx.rs b/controller/src/util/controller_ctx.rs
new file mode 100644
index 000000000..8534688c1
--- /dev/null
+++ b/controller/src/util/controller_ctx.rs
@@ -0,0 +1,101 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use akri_shared::akri::configuration::Configuration;
+use akri_shared::akri::instance::Instance;
+use akri_shared::k8s::api::IntoApi;
+
+use k8s_openapi::api::batch::v1::Job;
+use k8s_openapi::api::core::v1::{Node, Pod, Service};
+
+use tokio::sync::RwLock;
+
+/// Pod states that BrokerPodWatcher is interested in
+///
+/// PodState describes the various states that the controller can
+/// react to for Pods.
+#[derive(Clone, Debug, PartialEq)]
+pub enum PodState {
+ /// Pod is in Pending state and no action is needed.
+ Pending,
+ /// Pod is in Running state and needs to ensure that
+ /// instance and configuration services are running
+ Running,
+ /// Pod is in Failed/Completed/Succeeded state and
+ /// needs to remove any instance and configuration
+ /// services that are not supported by other Running
+ /// Pods. Also, at this point, if an Instance still
+ /// exists, instance_action::handle_instance_change
+ /// needs to be called to ensure that Pods are
+ /// restarted
+ Ended,
+ /// Pod is in Deleted state and needs to remove any
+ /// instance and configuration services that are not
+ /// supported by other Running Pods. Also, at this
+ /// point, if an Instance still exists, and the Pod is
+ /// owned by the Instance,
+ /// instance_action::handle_instance_change needs to be
+ /// called to ensure that Pods are restarted. Akri
+ /// places an Instance OwnerReference on all the Pods it
+ /// deploys. This declares that the Instance owns that
+ /// Pod and Akri's Controller explicitly manages its
+ /// deployment. However, if the Pod is not owned by the
+ /// Instance, Akri should not assume retry logic and
+ /// should cease action. The owning object (i.e. Job) will
+ /// handle retries as necessary.
+ Deleted,
+}
+
+/// Node states that NodeWatcher is interested in
+///
+/// NodeState describes the various states that the controller can
+/// react to for Nodes.
+#[derive(Clone, Debug, PartialEq)]
+pub enum NodeState {
+ /// Node has been seen, but not Running yet
+ Known,
+ /// Node has been seen Running
+ Running,
+ /// A previously Running Node has been seen as not Running
+ /// and the Instances have been cleaned of references to that
+ /// vanished Node
+ InstancesCleaned,
+}
+
+pub trait ControllerKubeClient:
+ IntoApi<Instance>
+ + IntoApi<Configuration>
+ + IntoApi<Pod>
+ + IntoApi<Job>
+ + IntoApi<Service>
+ + IntoApi<Node>
+{
+}
+
+impl<
+ T: IntoApi<Instance>
+ + IntoApi<Configuration>
+ + IntoApi<Pod>
+ + IntoApi<Job>
+ + IntoApi<Service>
+ + IntoApi<Node>,
+ > ControllerKubeClient for T
+{
+}
+
+pub struct ControllerContext {
+ /// Kubernetes client
+ pub client: Arc<dyn ControllerKubeClient>,
+ pub known_pods: Arc<RwLock<HashMap<String, PodState>>>,
+ pub known_nodes: Arc<RwLock<HashMap<String, NodeState>>>,
+}
+
+impl ControllerContext {
+ pub fn new(client: Arc<dyn ControllerKubeClient>) -> Self {
+ ControllerContext {
+ client,
+ known_pods: Arc::new(RwLock::new(HashMap::new())),
+ known_nodes: Arc::new(RwLock::new(HashMap::new())),
+ }
+ }
+}
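
Because `ControllerContext` holds the client as `Arc<dyn ControllerKubeClient>` rather than a concrete `kube::Client`, reconcilers only name the trait, so tests can inject any mock that implements it. A hedged sketch of a reconciler consuming this context (illustrative only, assuming it sits alongside the types above; this is not the PR's actual pod_watcher logic):

```rust
// Illustrative only: a reconciler updating the shared state map. Real logic
// would inspect `pod.status` before recording the Pod as Running.
use std::sync::Arc;

use k8s_openapi::api::core::v1::Pod;
use kube::{runtime::controller::Action, ResourceExt};

async fn reconcile(pod: Arc<Pod>, ctx: Arc<ControllerContext>) -> kube::Result<Action> {
    ctx.known_pods
        .write()
        .await
        .insert(pod.name_unchecked(), PodState::Running);
    Ok(Action::await_change())
}
```
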
diff --git a/controller/src/util/instance_action.rs b/controller/src/util/instance_action.rs
index 412769c9a..29d38b735 100644
--- a/controller/src/util/instance_action.rs
+++ b/controller/src/util/instance_action.rs
@@ -1,158 +1,62 @@
-use super::super::BROKER_POD_COUNT_METRIC;
-use super::{pod_action::PodAction, pod_action::PodActionInfo};
+use crate::util::controller_ctx::ControllerKubeClient;
+use crate::util::{pod_action::PodAction, pod_action::PodActionInfo};
+use crate::BROKER_POD_COUNT_METRIC;
+use akri_shared::akri::configuration::Configuration;
+use akri_shared::k8s::api::Api;
use akri_shared::{
akri::{configuration::BrokerSpec, instance::Instance, AKRI_PREFIX},
k8s::{
- self, job, pod,
- pod::{AKRI_INSTANCE_LABEL_NAME, AKRI_TARGET_NODE_LABEL_NAME},
- KubeInterface, OwnershipInfo, OwnershipType,
+ job, pod, OwnershipInfo, OwnershipType, AKRI_INSTANCE_LABEL_NAME,
+ AKRI_TARGET_NODE_LABEL_NAME,
},
};
-use async_std::sync::Mutex;
-use futures::{StreamExt, TryStreamExt};
-use k8s_openapi::api::batch::v1::JobSpec;
+use anyhow::Context;
+use futures::TryStreamExt;
+use k8s_openapi::api::batch::v1::{Job, JobSpec};
use k8s_openapi::api::core::v1::{Pod, PodSpec};
-use kube::api::Api;
-use kube_runtime::watcher::{watcher, Config, Event};
-use kube_runtime::WatchStreamExt;
-use log::{error, info, trace};
+
+use kube::{
+ api::{ListParams, ResourceExt},
+ runtime::{controller::Action, watcher::watcher, watcher::Config, WatchStreamExt},
+};
+use log::{error, trace};
use std::collections::HashMap;
use std::sync::Arc;
+
/// Length of time a Pod can be pending before we give up and retry
pub const PENDING_POD_GRACE_PERIOD_MINUTES: i64 = 5;
/// Length of time a Pod can be in an error state before we retry
pub const FAILED_POD_GRACE_PERIOD_MINUTES: i64 = 0;
-
-/// Instance action types
-///
-/// Instance actions describe the types of actions the Controller can
-/// react to for Instances.
-///
-/// This will determine what broker management actions to take (if any)
-///
-/// | --> InstanceAction::Add
-/// | --> No broker => Do nothing
-/// | --> <BrokerSpec::BrokerJobSpec> => Deploy a Job
-/// | --> <BrokerSpec::BrokerPodSpec> => Deploy Pod to each Node on Instance's `nodes` list (up to `capacity` total)
-/// | --> InstanceAction::Remove
-/// | --> No broker => Do nothing
-/// | --> <BrokerSpec::BrokerJobSpec> => Delete all Jobs labeled with the Instance name
-/// | --> <BrokerSpec::BrokerPodSpec> => Delete all Pods labeled with the Instance name
-/// | --> InstanceAction::Update
-/// | --> No broker => Do nothing
-/// | --> <BrokerSpec::BrokerJobSpec> => No nothing
-/// | --> <BrokerSpec::BrokerPodSpec> => Ensure that each Node on Instance's `nodes` list (up to `capacity` total) have a Pod
-///
-#[derive(Clone, Debug, PartialEq)]
-pub enum InstanceAction {
- /// An Instance is added
- Add,
- /// An Instance is removed
- Remove,
- /// An Instance is updated
- Update,
-}
-
-/// This invokes an internal method that watches for Instance events
-pub async fn handle_existing_instances(
-) -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>> {
- internal_handle_existing_instances(&k8s::KubeImpl::new().await?).await
-}
-
-/// This invokes an internal method that watches for Instance events
-pub async fn do_instance_watch(
- synchronization: Arc<Mutex<()>>,
-) -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>> {
- // Watch for instance changes
- internal_do_instance_watch(&synchronization, &k8s::KubeImpl::new().await?).await
-}
-
-/// This invokes an internal method that watches for Instance events
-async fn internal_handle_existing_instances(
- kube_interface: &impl KubeInterface,
-) -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>> {
- let mut tasks = Vec::new();
-
- // Handle existing instances
- let pre_existing_instances = kube_interface.get_instances().await?;
- for instance in pre_existing_instances {
- tasks.push(tokio::spawn(async move {
- let inner_kube_interface = k8s::KubeImpl::new().await.unwrap();
- handle_instance_change(&instance, &InstanceAction::Update, &inner_kube_interface)
- .await
- .unwrap();
- }));
- }
- futures::future::try_join_all(tasks).await?;
- Ok(())
-}
-
-async fn internal_do_instance_watch(
- synchronization: &Arc<Mutex<()>>,
- kube_interface: &impl KubeInterface,
-) -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>> {
- trace!("internal_do_instance_watch - enter");
- let resource = Api::<Instance>::all(kube_interface.get_kube_client());
- let watcher = watcher(resource, Config::default()).default_backoff();
- let mut informer = watcher.boxed();
- let mut first_event = true;
- // Currently, this does not handle None except to break the loop.
- loop {
- let event = match informer.try_next().await {
- Err(e) => {
- error!("Error during watch: {}", e);
- continue;
- }
- Ok(None) => break,
- Ok(Some(event)) => event,
- };
- // Aquire lock to ensure cleanup_instance_and_configuration_svcs and the
- // inner loop handle_instance call in internal_do_instance_watch
- // cannot execute at the same time.
- let _lock = synchronization.lock().await;
- trace!("internal_do_instance_watch - aquired sync lock");
- handle_instance(event, kube_interface, &mut first_event).await?;
- }
- Ok(())
-}
-
-/// This takes an event off the Instance stream and delegates it to the
-/// correct function based on the event type.
-async fn handle_instance(
- event: Event<Instance>,
- kube_interface: &impl KubeInterface,
- first_event: &mut bool,
-) -> anyhow::Result<()> {
- trace!("handle_instance - enter");
- match event {
- Event::Applied(instance) => {
- info!(
- "handle_instance - added or modified Akri Instance {:?}: {:?}",
- instance.metadata.name, instance.spec
- );
- // TODO: consider renaming `InstanceAction::Add` to `InstanceAction::AddOrUpdate`
- // to reflect that this could also be an Update event. Or as we do more specific
- // inspection in future, delineation may be useful.
- handle_instance_change(&instance, &InstanceAction::Add, kube_interface).await?;
- }
- Event::Deleted(instance) => {
- info!(
- "handle_instance - deleted Akri Instance {:?}: {:?}",
- instance.metadata.name, instance.spec
- );
- handle_instance_change(&instance, &InstanceAction::Remove, kube_interface).await?;
- }
- Event::Restarted(_instances) => {
- if *first_event {
- info!("handle_instance - watcher started");
- } else {
- return Err(anyhow::anyhow!(
- "Instance watcher restarted - throwing error to restart controller"
- ));
+// Identifier for the controller to be set as the field manager for server-side apply
+pub const CONTROLLER_FIELD_MANAGER_ID: &str = "akri.sh/controller";
+
+/// This function is the main reconcile function for Instance resources.
+/// It gets called every time an Instance is added or changed.
+pub async fn run(client: Arc<dyn ControllerKubeClient>) -> anyhow::Result<()> {
+ let api: Box<dyn Api<Instance>> = client.all();
+ if let Err(e) = api.list(&ListParams::default().limit(1)).await {
+ error!("Instance CRD is not queryable; {e:?}. Is the CRD installed?");
+ std::process::exit(1);
+ }
+
+ // First handle existing instances
+ let instances = api.list(&ListParams::default()).await?;
+ for instance in instances {
+ handle_instance_change(instance, client.clone()).await?;
+ }
+
+ watcher(api.as_inner(), Config::default())
+ .applied_objects()
+ .try_for_each(move |instance| {
+ let client = client.clone();
+ async move {
+ handle_instance_change(instance, client)
+ .await
+ .map_err(kube::runtime::watcher::Error::WatchFailed)?;
+ Ok(())
}
- }
- }
- *first_event = false;
+ })
+ .await?;
Ok(())
}
@@ -163,7 +67,6 @@ async fn handle_instance(
/// specific Node's protocol broker Pod.
///
/// * the node is described by node_name
-/// * the protocol (or capability) is described by instance_name and namespace
/// * what to do with the broker Pod is described by action
// TODO: add Pod name so does not need to be
// generated on deletes and remove Option wrappers.
@@ -175,24 +78,21 @@ pub(crate) struct PodContext {
}
pub(crate) fn create_pod_context(k8s_pod: &Pod, action: PodAction) -> anyhow::Result<PodContext> {
- let pod_name = k8s_pod.metadata.name.as_ref().unwrap();
- let labels = &k8s_pod
- .metadata
- .labels
- .as_ref()
- .ok_or_else(|| anyhow::anyhow!("no labels found for Pod {:?}", pod_name))?;
// Early exits above ensure unwrap will not panic
- let node_to_run_pod_on = labels.get(AKRI_TARGET_NODE_LABEL_NAME).ok_or_else(|| {
- anyhow::anyhow!(
- "no {} label found for {:?}",
- AKRI_TARGET_NODE_LABEL_NAME,
- pod_name
- )
- })?;
+ let node_to_run_pod_on = &k8s_pod
+ .labels()
+ .get(AKRI_TARGET_NODE_LABEL_NAME)
+ .ok_or_else(|| {
+ anyhow::anyhow!(
+ "no {} label found for {:?}",
+ AKRI_TARGET_NODE_LABEL_NAME,
+ k8s_pod.name_unchecked()
+ )
+ })?;
Ok(PodContext {
node_name: Some(node_to_run_pod_on.to_string()),
- namespace: k8s_pod.metadata.namespace.clone(),
+ namespace: k8s_pod.namespace(),
action,
})
}
@@ -202,10 +102,9 @@ pub(crate) fn create_pod_context(k8s_pod: &Pod, action: PodAction) -> anyhow::Result<PodContext> {
/// it will update the nodes_to_act_on map with the required action.
fn determine_action_for_pod(
k8s_pod: &Pod,
- action: &InstanceAction,
nodes_to_act_on: &mut HashMap<String, PodContext>,
) -> anyhow::Result<()> {
- let pod_name = k8s_pod.metadata.name.as_ref().unwrap();
+ let pod_name = k8s_pod.name_unchecked();
let pod_phase = k8s_pod
.status
.as_ref()
@@ -214,36 +113,34 @@ fn determine_action_for_pod(
.as_ref()
.ok_or_else(|| anyhow::anyhow!("No pod phase found for Pod {:?}", pod_name))?;
- let mut update_pod_context = create_pod_context(k8s_pod, PodAction::NoAction)?;
- let node_to_run_pod_on = update_pod_context.node_name.as_ref().unwrap();
+ let mut ctx = create_pod_context(k8s_pod, PodAction::NoAction)?;
// Early exits above ensure unwrap will not panic
let pod_start_time = k8s_pod.status.as_ref().unwrap().start_time.clone();
+ let node_to_run_pod_on = ctx.node_name.as_ref().unwrap();
let pod_action_info = PodActionInfo {
pending_grace_time_in_minutes: PENDING_POD_GRACE_PERIOD_MINUTES,
ended_grace_time_in_minutes: FAILED_POD_GRACE_PERIOD_MINUTES,
phase: pod_phase.to_string(),
- instance_action: action.clone(),
status_start_time: pod_start_time,
unknown_node: !nodes_to_act_on.contains_key(node_to_run_pod_on),
- trace_node_name: k8s_pod.metadata.name.clone().unwrap(),
+ trace_pod_name: k8s_pod.name_unchecked(),
};
- update_pod_context.action = pod_action_info.select_pod_action()?;
- nodes_to_act_on.insert(node_to_run_pod_on.to_string(), update_pod_context);
+ ctx.action = pod_action_info.select_pod_action()?;
+ nodes_to_act_on.insert(node_to_run_pod_on.to_string(), ctx);
Ok(())
}
-/// This handles Instance deletion event by deleting the
-/// broker Pod, the broker Service (if there are no remaining broker Pods),
-/// and the capability Service (if there are no remaining capability Pods).
+/// This deliberately deletes the broker Pod, the broker Service (if there are no remaining broker Pods), and the configuration service (if there are no remaining capability Pods).
+/// This is done before recreating the broker Pod and Services.
async fn handle_deletion_work(
instance_name: &str,
configuration_name: &str,
instance_shared: bool,
node_to_delete_pod: &str,
context: &PodContext,
- kube_interface: &impl KubeInterface,
+ api: &dyn Api<Pod>,
) -> anyhow::Result<()> {
let context_node_name = context.node_name.as_ref().ok_or_else(|| {
anyhow::anyhow!(
@@ -252,18 +149,11 @@ async fn handle_deletion_work(
context
)
})?;
- let context_namespace = context.namespace.as_ref().ok_or_else(|| {
- anyhow::anyhow!(
- "handle_deletion_work - Context namespace is missing for {}: {:?}",
- node_to_delete_pod,
- context
- )
- })?;
trace!(
"handle_deletion_work - pod::create_broker_app_name({:?}, {:?}, {:?}, {:?})",
&instance_name,
- context_node_name,
+ context.node_name,
instance_shared,
"pod"
);
@@ -276,111 +166,34 @@ async fn handle_deletion_work(
trace!(
"handle_deletion_work - pod::remove_pod name={:?}, namespace={:?}",
&pod_app_name,
- &context_namespace
+ &context.namespace
);
- kube_interface
- .remove_pod(&pod_app_name, context_namespace)
- .await?;
- trace!("handle_deletion_work - pod::remove_pod succeeded",);
+ api.delete(&pod_app_name).await?;
+ trace!("handle_deletion_work - pod::remove_pod succeeded");
BROKER_POD_COUNT_METRIC
- .with_label_values(&[configuration_name, context_node_name])
+ .with_label_values(&[configuration_name, &context_node_name])
.dec();
Ok(())
}
-#[cfg(test)]
-mod handle_deletion_work_tests {
- use super::*;
- use akri_shared::k8s::MockKubeInterface;
-
- #[tokio::test]
- async fn test_handle_deletion_work_with_no_node_name() {
- let _ = env_logger::builder().is_test(true).try_init();
-
- let context = PodContext {
- node_name: None,
- namespace: Some("namespace".into()),
- action: PodAction::NoAction,
- };
-
- assert!(handle_deletion_work(
- "instance_name",
- "configuration_name",
- true,
- "node_to_delete_pod",
- &context,
- &MockKubeInterface::new(),
- )
- .await
- .is_err());
- }
-
- #[tokio::test]
- async fn test_handle_deletion_work_with_no_namespace() {
- let _ = env_logger::builder().is_test(true).try_init();
-
- let context = PodContext {
- node_name: Some("node-a".into()),
- namespace: None,
- action: PodAction::NoAction,
- };
-
- assert!(handle_deletion_work(
- "instance_name",
- "configuration_name",
- true,
- "node_to_delete_pod",
- &context,
- &MockKubeInterface::new(),
- )
- .await
- .is_err());
- }
-}
-
/// This handles Instance addition event by creating the
-/// broker Pod, the broker Service, and the capability Service.
-/// TODO: reduce parameters by passing Instance object instead of
-/// individual fields
-#[allow(clippy::too_many_arguments)]
+/// broker Pod.
async fn handle_addition_work(
- instance_name: &str,
- instance_uid: &str,
- instance_namespace: &str,
- instance_class_name: &str,
- instance_shared: bool,
+ api: &dyn Api<Pod>,
+ pod: Pod,
+ configuration_name: &str,
new_node: &str,
- podspec: &PodSpec,
- kube_interface: &impl KubeInterface,
) -> anyhow::Result<()> {
trace!(
"handle_addition_work - Create new Pod for Node={:?}",
new_node
);
- let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name);
- let new_pod = pod::create_new_pod_from_spec(
- instance_namespace,
- instance_name,
- instance_class_name,
- OwnershipInfo::new(
- OwnershipType::Instance,
- instance_name.to_string(),
- instance_uid.to_string(),
- ),
- &capability_id,
- new_node,
- instance_shared,
- podspec,
- )?;
- trace!("handle_addition_work - New pod spec={:?}", new_pod);
-
- kube_interface
- .create_pod(&new_pod, instance_namespace)
- .await?;
+ trace!("handle_addition_work - New pod spec={:?}", pod);
+ api.apply(pod, CONTROLLER_FIELD_MANAGER_ID).await?;
trace!("handle_addition_work - pod::create_pod succeeded",);
BROKER_POD_COUNT_METRIC
- .with_label_values(&[instance_class_name, new_node])
+ .with_label_values(&[configuration_name, new_node])
.inc();
Ok(())
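
`handle_addition_work` now hands the broker Pod to `api.apply(pod, CONTROLLER_FIELD_MANAGER_ID)`. In plain kube terms, a server-side apply with a field manager is expected to look roughly like the sketch below; this is an assumption about what Akri's `Api` wrapper boils down to, not a quote of it:

```rust
// Hedged sketch of server-side apply with a field manager in plain kube;
// the Pod construction is elided.
use k8s_openapi::api::core::v1::Pod;
use kube::{
    api::{Api, Patch, PatchParams},
    Client, ResourceExt,
};

async fn apply_pod(client: Client, pod: Pod) -> kube::Result<Pod> {
    let api: Api<Pod> = Api::namespaced(client, &pod.namespace().unwrap());
    // The API server merges fields and records "akri.sh/controller" as their
    // owner; controllers commonly add `.force()` to claim conflicting fields.
    let params = PatchParams::apply("akri.sh/controller");
    api.patch(&pod.name_unchecked(), &params, &Patch::Apply(&pod))
        .await
}
```
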
@@ -388,153 +201,103 @@ async fn handle_addition_work(
/// Handle Instance change by
/// 1) checking to make sure the Instance's Configuration exists
-/// 2) calling the appropriate handler depending on the broker type (Pod or Job) if any
+/// 2) taking the appropriate action depending on the broker type (Pod or Job) if any:
+/// | --> No broker => Do nothing
+/// | --> <BrokerSpec::BrokerJobSpec> => Deploy a Job if one does not exist
+/// | --> <BrokerSpec::BrokerPodSpec> => Ensure that each Node on Instance's `nodes` list (up to `capacity` total) has a Pod.
+/// Deploy Pods as necessary
pub async fn handle_instance_change(
- instance: &Instance,
- action: &InstanceAction,
- kube_interface: &impl KubeInterface,
-) -> anyhow::Result<()> {
- trace!("handle_instance_change - enter {:?}", action);
- let instance_name = instance.metadata.name.clone().unwrap();
- let instance_namespace =
- instance.metadata.namespace.as_ref().ok_or_else(|| {
- anyhow::anyhow!("Namespace not found for instance: {}", &instance_name)
- })?;
- let configuration = match kube_interface
- .find_configuration(&instance.spec.configuration_name, instance_namespace)
- .await
- {
- Ok(config) => config,
- _ => {
- if action != &InstanceAction::Remove {
- // In this scenario, a configuration has been deleted without the Akri Agent deleting the associated Instances.
- // Furthermore, Akri Agent is still modifying the Instances. This should not happen beacuse Agent
- // is designed to shutdown when it's Configuration watcher fails.
- error!(
- "handle_instance_change - no configuration found for {:?} yet instance {:?} exists - check that device plugin is running properly",
- &instance.spec.configuration_name, &instance.metadata.name
+ instance: Instance,
+ client: Arc<dyn ControllerKubeClient>,
+) -> kube::Result<Action> {
+ trace!("handle_instance_change - enter");
+ let instance_namespace = instance.namespace().unwrap();
+ let api: Box<dyn Api<Configuration>> = client.namespaced(&instance_namespace);
+ let Ok(Some(configuration)) = api.get(&instance.spec.configuration_name).await else {
+ // In this scenario, a configuration has been deleted without the Akri Agent deleting the associated Instances.
+ // Furthermore, Akri Agent is still modifying the Instances. This should not happen because Agent
+ // is designed to shut down when its Configuration watcher fails.
+ error!("handle_instance_change - no configuration found for {:?} yet instance {:?} exists - check that device plugin is running properly",
+ &instance.spec.configuration_name, &instance.name_unchecked()
);
- }
- return Ok(());
- }
+
+ return Ok(default_requeue_action());
};
- if let Some(broker_spec) = &configuration.spec.broker_spec {
- let instance_change_result = match broker_spec {
- BrokerSpec::BrokerPodSpec(p) => {
- handle_instance_change_pod(instance, p, action, kube_interface).await
- }
- BrokerSpec::BrokerJobSpec(j) => {
- handle_instance_change_job(
- instance,
- *configuration.metadata.generation.as_ref().unwrap(),
- j,
- action,
- kube_interface,
- )
- .await
- }
- };
- if let Err(e) = instance_change_result {
- error!("Unable to handle Broker action: {:?}", e);
+ let Some(broker_spec) = &configuration.spec.broker_spec else {
+ return Ok(default_requeue_action());
+ };
+ let res = match broker_spec {
+ BrokerSpec::BrokerPodSpec(p) => handle_instance_change_pod(&instance, p, client).await,
+ BrokerSpec::BrokerJobSpec(j) => {
+ handle_instance_change_job(
+ &instance,
+ *configuration.metadata.generation.as_ref().unwrap(),
+ j,
+ client.clone(),
+ )
+ .await
}
+ };
+ if let Err(e) = res {
+ error!("Unable to handle Broker action: {:?}", e);
}
- Ok(())
+ Ok(default_requeue_action())
}
/// Called when an Instance has changed that requires a Job broker. Action determined by InstanceAction.
-/// InstanceAction::Add => Deploy a Job with JobSpec from Configuration. Label with Instance name.
-/// InstanceAction::Remove => Delete all Jobs labeled with the Instance name
-/// InstanceAction::Update => No nothing
+/// First check if a job with the instance name exists. If it does, do nothing. Otherwise, deploy a Job
+/// with JobSpec from Configuration and label with Instance name.
pub async fn handle_instance_change_job(
instance: &Instance,
config_generation: i64,
job_spec: &JobSpec,
- action: &InstanceAction,
- kube_interface: &impl KubeInterface,
+ client: Arc<dyn ControllerKubeClient>,
) -> anyhow::Result<()> {
- trace!("handle_instance_change_job - enter {:?}", action);
+ trace!("handle_instance_change_job - enter");
+ let api: Box<dyn Api<Job>> = client.namespaced(&instance.namespace().unwrap());
+ if api.get(&instance.name_unchecked()).await?.is_some() {
+ // Job already exists, do nothing
+ return Ok(());
+ }
+ let instance_name = instance.name_unchecked();
// Create name for Job. Includes Configuration generation in the suffix
// to track what version of the Configuration the Job is associated with.
let job_name = pod::create_broker_app_name(
- instance.metadata.name.as_ref().unwrap(),
+ &instance_name,
None,
instance.spec.shared,
&format!("{}-job", config_generation),
);
- let instance_name = instance.metadata.name.as_ref().unwrap();
- let instance_namespace = instance.metadata.namespace.as_ref().unwrap();
- let instance_uid = instance
- .metadata
- .uid
- .as_ref()
- .ok_or_else(|| anyhow::anyhow!("UID not found for instance: {}", &instance_name))?;
- match action {
- InstanceAction::Add => {
- trace!("handle_instance_change_job - instance added");
- let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name);
- let new_job = job::create_new_job_from_spec(
- instance,
- OwnershipInfo::new(
- OwnershipType::Instance,
- instance_name.to_string(),
- instance_uid.to_string(),
- ),
- &capability_id,
- job_spec,
- &job_name,
- )?;
- kube_interface
- .create_job(&new_job, instance_namespace)
- .await?;
- }
- InstanceAction::Remove => {
- trace!("handle_instance_change_job - instance removed");
- // Find all jobs with the label
- let instance_jobs = kube_interface
- .find_jobs_with_label(&format!("{}={}", AKRI_INSTANCE_LABEL_NAME, instance_name))
- .await?;
- let delete_tasks = instance_jobs.into_iter().map(|j| async move {
- kube_interface
- .remove_job(
- j.metadata.name.as_ref().unwrap(),
- j.metadata.namespace.as_ref().unwrap(),
- )
- .await
- });
-
- futures::future::try_join_all(delete_tasks).await?;
- }
- InstanceAction::Update => {
- trace!("handle_instance_change_job - instance updated");
- // TODO: Broker could have encountered unexpected admission error and need to be removed and added
- }
- }
+ trace!("handle_instance_change_job - instance added");
+ let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name);
+ let new_job = job::create_new_job_from_spec(
+ instance,
+ OwnershipInfo::new(
+ OwnershipType::Instance,
+ instance_name,
+ instance.uid().unwrap(),
+ ),
+ &capability_id,
+ job_spec,
+ &job_name,
+ )?;
+ let api: Box<dyn Api<Job>> = client.namespaced(&instance.namespace().unwrap());
+ // TODO: Consider using server side apply instead of create
+ api.create(&new_job).await?;
Ok(())
}
/// Called when an Instance has changed that requires a Pod broker.
-/// Action determined by InstanceAction and changes to the Instance's `nodes` list.
-/// Starts broker Pods that are missing and stops Pods that are no longer needed.
-/// InstanceAction::Add => Deploy Pod to each Node on Instance's `nodes` list (up to `capacity` total)
-/// InstanceAction::Remove => Delete all Pods labeled with the Instance name
-/// InstanceAction::Update => Ensure that each Node on Instance's `nodes` list (up to `capacity` total) have a Pod
+/// Ensures that each Node on Instance's `nodes` list (up to `capacity` total) has a running Pod
pub async fn handle_instance_change_pod(
instance: &Instance,
podspec: &PodSpec,
- action: &InstanceAction,
- kube_interface: &impl KubeInterface,
+ client: Arc<dyn ControllerKubeClient>,
) -> anyhow::Result<()> {
- trace!("handle_instance_change_pod - enter {:?}", action);
-
- let instance_name = instance.metadata.name.clone().unwrap();
-
- // If InstanceAction::Remove, assume all nodes require PodAction::NoAction (reflect that there is no running Pod unless we find one)
- // Otherwise, assume all nodes require PodAction::Add (reflect that there is no running Pod, unless we find one)
- let default_action = match action {
- InstanceAction::Remove => PodAction::NoAction,
- _ => PodAction::Add,
- };
+ trace!("handle_instance_change_pod - enter");
+ // Assume all nodes require PodAction::Add (reflect that there is no running Pod, unless we find one)
+ let default_action = PodAction::Add;
let mut nodes_to_act_on: HashMap<String, PodContext> = instance
.spec
.nodes
@@ -550,38 +313,31 @@ pub async fn handle_instance_change_pod(
)
})
.collect();
- trace!(
- "handle_instance_change - nodes tracked from instance={:?}",
- nodes_to_act_on
- );
- trace!(
- "handle_instance_change - find all pods that have {}={}",
+ let lp = ListParams::default().labels(&format!(
+ "{}={}",
AKRI_INSTANCE_LABEL_NAME,
- instance_name
- );
- let instance_pods = kube_interface
- .find_pods_with_label(&format!("{}={}", AKRI_INSTANCE_LABEL_NAME, instance_name))
- .await?;
+ instance.name_unchecked()
+ ));
+ let api = client.namespaced(&instance.namespace().context("no namespace")?);
+ let instance_pods = api.list(&lp).await?;
trace!(
"handle_instance_change - found {} pods",
instance_pods.items.len()
);
-
- trace!("handle_instance_change - update actions based on the existing pods");
// By default, assume any pod tracked by the instance need to be added.
// Query the existing pods to see if some of these are already added, or
// need to be removed
instance_pods
.items
.iter()
- .try_for_each(|x| determine_action_for_pod(x, action, &mut nodes_to_act_on))?;
+ .try_for_each(|x| determine_action_for_pod(x, &mut nodes_to_act_on))?;
trace!(
"handle_instance_change - nodes tracked after querying existing pods={:?}",
nodes_to_act_on
);
- do_pod_action_for_nodes(nodes_to_act_on, instance, podspec, kube_interface).await?;
+ do_pod_action_for_nodes(nodes_to_act_on, instance, podspec, api).await?;
trace!("handle_instance_change - exit");
Ok(())
@@ -591,7 +347,7 @@ pub(crate) async fn do_pod_action_for_nodes(
nodes_to_act_on: HashMap<String, PodContext>,
instance: &Instance,
podspec: &PodSpec,
- kube_interface: &impl KubeInterface,
+ api: Box<dyn Api<Pod>>,
) -> anyhow::Result<()> {
trace!("do_pod_action_for_nodes - enter");
// Iterate over nodes_to_act_on where value == (PodAction::Remove | PodAction::RemoveAndAdd)
@@ -599,12 +355,12 @@ pub(crate) async fn do_pod_action_for_nodes(
((v.action) == PodAction::Remove) | ((v.action) == PodAction::RemoveAndAdd)
}) {
handle_deletion_work(
- instance.metadata.name.as_ref().unwrap(),
+ &instance.name_unchecked(),
&instance.spec.configuration_name,
instance.spec.shared,
node_to_delete_pod,
context,
- kube_interface,
+ api.as_ref(),
)
.await?
}
@@ -622,118 +378,54 @@ pub(crate) async fn do_pod_action_for_nodes(
.collect::<Vec<String>>();
// Iterate over nodes_to_act_on where value == (PodAction::Add | PodAction::RemoveAndAdd)
+ let instance_name = instance.name_unchecked();
+ let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name);
for new_node in nodes_to_add {
- handle_addition_work(
- instance.metadata.name.as_ref().unwrap(),
- instance.metadata.uid.as_ref().unwrap(),
- instance.metadata.namespace.as_ref().unwrap(),
+ let new_pod = pod::create_new_pod_from_spec(
+ &instance.namespace().unwrap(),
+ &instance_name,
&instance.spec.configuration_name,
- instance.spec.shared,
+ OwnershipInfo::new(
+ OwnershipType::Instance,
+ instance_name.clone(),
+ instance.uid().unwrap(),
+ ),
+ &capability_id,
&new_node,
+ instance.spec.shared,
podspec,
- kube_interface,
+ )?;
+ handle_addition_work(
+ api.as_ref(),
+ new_pod,
+ &instance.spec.configuration_name,
+ &new_node,
)
.await?;
}
Ok(())
}
+// Default requeue action for the instance controller's reconcilers
+fn default_requeue_action() -> Action {
+ Action::await_change()
+}
+
#[cfg(test)]
mod handle_instance_tests {
- use super::super::shared_test_utils::config_for_tests;
- use super::super::shared_test_utils::config_for_tests::PodList;
+ use crate::util::shared_test_utils::mock_client::MockControllerKubeClient;
+
+ use super::super::shared_test_utils::config_for_tests::*;
use super::*;
use akri_shared::{
akri::instance::Instance,
- k8s::{pod::AKRI_INSTANCE_LABEL_NAME, MockKubeInterface},
+ k8s::{api::MockApi, pod::AKRI_INSTANCE_LABEL_NAME},
os::file,
};
use chrono::prelude::*;
use chrono::Utc;
use mockall::predicate::*;
- fn configure_find_pods_with_phase(
- mock: &mut MockKubeInterface,
- pod_selector: &'static str,
- result_file: &'static str,
- specified_phase: &'static str,
- ) {
- trace!(
- "mock.expect_find_pods_with_label pod_selector:{}",
- pod_selector
- );
- mock.expect_find_pods_with_label()
- .times(1)
- .withf(move |selector| selector == pod_selector)
- .returning(move |_| {
- let pods_json = file::read_file_to_string(result_file);
- let phase_adjusted_json = pods_json.replace(
- "\"phase\": \"Running\"",
- &format!("\"phase\": \"{}\"", specified_phase),
- );
- let pods: PodList = serde_json::from_str(&phase_adjusted_json).unwrap();
- Ok(pods)
- });
- }
-
- fn configure_find_pods_with_phase_and_start_time(
- mock: &mut MockKubeInterface,
- pod_selector: &'static str,
- result_file: &'static str,
- specified_phase: &'static str,
- start_time: DateTime,
- ) {
- trace!(
- "mock.expect_find_pods_with_label pod_selector:{}",
- pod_selector
- );
- mock.expect_find_pods_with_label()
- .times(1)
- .withf(move |selector| selector == pod_selector)
- .returning(move |_| {
- let pods_json = file::read_file_to_string(result_file);
- let phase_adjusted_json = pods_json.replace(
- "\"phase\": \"Running\"",
- &format!("\"phase\": \"{}\"", specified_phase),
- );
- let start_time_adjusted_json = phase_adjusted_json.replace(
- "\"startTime\": \"2020-02-25T20:48:03Z\"",
- &format!(
- "\"startTime\": \"{}\"",
- start_time.format("%Y-%m-%dT%H:%M:%SZ")
- ),
- );
- let pods: PodList = serde_json::from_str(&start_time_adjusted_json).unwrap();
- Ok(pods)
- });
- }
-
- fn configure_find_pods_with_phase_and_no_start_time(
- mock: &mut MockKubeInterface,
- pod_selector: &'static str,
- result_file: &'static str,
- specified_phase: &'static str,
- ) {
- trace!(
- "mock.expect_find_pods_with_label pod_selector:{}",
- pod_selector
- );
- mock.expect_find_pods_with_label()
- .times(1)
- .withf(move |selector| selector == pod_selector)
- .returning(move |_| {
- let pods_json = file::read_file_to_string(result_file);
- let phase_adjusted_json = pods_json.replace(
- "\"phase\": \"Running\"",
- &format!("\"phase\": \"{}\"", specified_phase),
- );
- let start_time_adjusted_json =
- phase_adjusted_json.replace("\"startTime\": \"2020-02-25T20:48:03Z\",", "");
- let pods: PodList = serde_json::from_str(&start_time_adjusted_json).unwrap();
- Ok(pods)
- });
- }
-
#[derive(Clone)]
struct HandleInstanceWork {
find_pods_selector: &'static str,
@@ -754,10 +446,11 @@ mod handle_instance_tests {
}
fn configure_for_handle_instance_change(
- mock: &mut MockKubeInterface,
+ mock: &mut MockControllerKubeClient,
work: &HandleInstanceWork,
) {
- config_for_tests::configure_find_config(
+ let mut mock_pod_api: MockApi<Pod> = MockApi::new();
+ configure_find_config(
mock,
work.config_work.find_config_name,
work.config_work.find_config_namespace,
@@ -767,7 +460,7 @@ mod handle_instance_tests {
if let Some(phase) = work.find_pods_phase {
if let Some(start_time) = work.find_pods_start_time {
configure_find_pods_with_phase_and_start_time(
- mock,
+ &mut mock_pod_api,
work.find_pods_selector,
work.find_pods_result,
phase,
@@ -775,35 +468,39 @@ mod handle_instance_tests {
);
} else if work.find_pods_delete_start_time {
configure_find_pods_with_phase_and_no_start_time(
- mock,
+ &mut mock_pod_api,
work.find_pods_selector,
work.find_pods_result,
phase,
);
} else {
configure_find_pods_with_phase(
- mock,
+ &mut mock_pod_api,
work.find_pods_selector,
work.find_pods_result,
phase,
);
}
} else {
- config_for_tests::configure_find_pods(
- mock,
+ configure_find_pods(
+ &mut mock_pod_api,
work.find_pods_selector,
+ work.config_work.find_config_namespace,
work.find_pods_result,
false,
);
}
if let Some(deletion_work) = &work.deletion_work {
- configure_for_handle_deletion_work(mock, deletion_work);
+ configure_for_handle_deletion_work(&mut mock_pod_api, deletion_work);
}
if let Some(addition_work) = &work.addition_work {
- configure_for_handle_addition_work(mock, addition_work);
+ configure_for_handle_addition_work(&mut mock_pod_api, addition_work);
}
+ mock.pod
+ .expect_namespaced()
+ .return_once(move |_| Box::new(mock_pod_api));
}
#[derive(Clone)]
@@ -821,20 +518,12 @@ mod handle_instance_tests {
}
}
- fn configure_deletion_work_for_config_a_b494b6() -> HandleDeletionWork {
- HandleDeletionWork {
- broker_pod_names: vec!["config-a-b494b6-pod"],
- // instance_svc_names: vec!["config-a-b494b6-svc"],
- cleanup_namespaces: vec!["config-a-namespace"],
- }
- }
-
- fn configure_for_handle_deletion_work(mock: &mut MockKubeInterface, work: &HandleDeletionWork) {
+ fn configure_for_handle_deletion_work(mock: &mut MockApi<Pod>, work: &HandleDeletionWork) {
for i in 0..work.broker_pod_names.len() {
let broker_pod_name = work.broker_pod_names[i];
let cleanup_namespace = work.cleanup_namespaces[i];
- config_for_tests::configure_remove_pod(mock, broker_pod_name, cleanup_namespace);
+ configure_remove_pod(mock, broker_pod_name, cleanup_namespace);
}
}
@@ -870,10 +559,10 @@ mod handle_instance_tests {
}
}
- fn configure_for_handle_addition_work(mock: &mut MockKubeInterface, work: &HandleAdditionWork) {
+ fn configure_for_handle_addition_work(mock_api: &mut MockApi<Pod>, work: &HandleAdditionWork) {
for i in 0..work.new_pod_names.len() {
- config_for_tests::configure_add_pod(
- mock,
+ configure_add_pod(
+ mock_api,
work.new_pod_names[i],
work.new_pod_namespaces[i],
AKRI_INSTANCE_LABEL_NAME,
@@ -884,62 +573,21 @@ mod handle_instance_tests {
}
async fn run_handle_instance_change_test(
- mock: &mut MockKubeInterface,
+ client: Arc<MockControllerKubeClient>,
instance_file: &'static str,
- action: &'static InstanceAction,
) {
trace!("run_handle_instance_change_test enter");
- let instance_json = file::read_file_to_string(instance_file);
+ let instance_json: String = file::read_file_to_string(instance_file);
let instance: Instance = serde_json::from_str(&instance_json).unwrap();
- handle_instance(
- match action {
- InstanceAction::Add | InstanceAction::Update => Event::Applied(instance),
- InstanceAction::Remove => Event::Deleted(instance),
- },
- mock,
- &mut false,
- )
- .await
- .unwrap();
+ handle_instance_change(instance, client).await.unwrap();
trace!("run_handle_instance_change_test exit");
}
- // Test that watcher errors on restarts unless it is the first restart (aka initial startup)
- #[tokio::test]
- async fn test_handle_watcher_restart() {
- let _ = env_logger::builder().is_test(true).try_init();
- let mut first_event = true;
- assert!(handle_instance(
- Event::Restarted(Vec::new()),
- &MockKubeInterface::new(),
- &mut first_event
- )
- .await
- .is_ok());
- first_event = false;
- assert!(handle_instance(
- Event::Restarted(Vec::new()),
- &MockKubeInterface::new(),
- &mut first_event
- )
- .await
- .is_err());
- }
-
- #[tokio::test]
- async fn test_internal_handle_existing_instances_no_instances() {
- let _ = env_logger::builder().is_test(true).try_init();
-
- let mut mock = MockKubeInterface::new();
- config_for_tests::configure_get_instances(&mut mock, "../test/json/empty-list.json", false);
- internal_handle_existing_instances(&mock).await.unwrap();
- }
-
#[tokio::test]
async fn test_handle_instance_change_for_add_new_local_instance() {
let _ = env_logger::builder().is_test(true).try_init();
- let mut mock = MockKubeInterface::new();
+ let mut mock = MockControllerKubeClient::default();
configure_for_handle_instance_change(
&mut mock,
&HandleInstanceWork {
@@ -953,19 +601,14 @@ mod handle_instance_tests {
addition_work: Some(configure_add_local_config_a_b494b6(false)),
},
);
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/local-instance.json",
- &InstanceAction::Add,
- )
- .await;
+ run_handle_instance_change_test(Arc::new(mock), "../test/json/local-instance.json").await;
}
#[tokio::test]
async fn test_handle_instance_change_for_add_new_local_instance_error() {
let _ = env_logger::builder().is_test(true).try_init();
- let mut mock = MockKubeInterface::new();
+ let mut mock = MockControllerKubeClient::default();
configure_for_handle_instance_change(
&mut mock,
&HandleInstanceWork {
@@ -979,45 +622,14 @@ mod handle_instance_tests {
addition_work: Some(configure_add_local_config_a_b494b6(true)),
},
);
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/local-instance.json",
- &InstanceAction::Add,
- )
- .await;
- }
-
- #[tokio::test]
- async fn test_handle_instance_change_for_remove_running_local_instance() {
- let _ = env_logger::builder().is_test(true).try_init();
-
- let mut mock = MockKubeInterface::new();
- configure_for_handle_instance_change(
- &mut mock,
- &HandleInstanceWork {
- find_pods_selector: "akri.sh/instance=config-a-b494b6",
- find_pods_result: "../test/json/running-pod-list-for-config-a-local.json",
- find_pods_phase: None,
- find_pods_start_time: None,
- find_pods_delete_start_time: false,
- config_work: get_config_work(),
- deletion_work: Some(configure_deletion_work_for_config_a_b494b6()),
- addition_work: None,
- },
- );
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/local-instance.json",
- &InstanceAction::Remove,
- )
- .await;
+ run_handle_instance_change_test(Arc::new(mock), "../test/json/local-instance.json").await;
}
#[tokio::test]
async fn test_handle_instance_change_for_add_new_shared_instance() {
let _ = env_logger::builder().is_test(true).try_init();
- let mut mock = MockKubeInterface::new();
+ let mut mock = MockControllerKubeClient::default();
configure_for_handle_instance_change(
&mut mock,
&HandleInstanceWork {
@@ -1033,45 +645,14 @@ mod handle_instance_tests {
)),
},
);
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/shared-instance.json",
- &InstanceAction::Add,
- )
- .await;
- }
-
- #[tokio::test]
- async fn test_handle_instance_change_for_remove_running_shared_instance() {
- let _ = env_logger::builder().is_test(true).try_init();
-
- let mut mock = MockKubeInterface::new();
- configure_for_handle_instance_change(
- &mut mock,
- &HandleInstanceWork {
- find_pods_selector: "akri.sh/instance=config-a-359973",
- find_pods_result: "../test/json/running-pod-list-for-config-a-shared.json",
- find_pods_phase: None,
- find_pods_start_time: None,
- find_pods_delete_start_time: false,
- config_work: get_config_work(),
- deletion_work: Some(configure_deletion_work_for_config_a_359973()),
- addition_work: None,
- },
- );
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/shared-instance.json",
- &InstanceAction::Remove,
- )
- .await;
+ run_handle_instance_change_test(Arc::new(mock), "../test/json/shared-instance.json").await;
}
#[tokio::test]
async fn test_handle_instance_change_for_update_active_shared_instance() {
let _ = env_logger::builder().is_test(true).try_init();
- let mut mock = MockKubeInterface::new();
+ let mut mock = MockControllerKubeClient::default();
configure_for_handle_instance_change(
&mut mock,
&HandleInstanceWork {
@@ -1087,12 +668,8 @@ mod handle_instance_tests {
)),
},
);
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/shared-instance-update.json",
- &InstanceAction::Update,
- )
- .await;
+ run_handle_instance_change_test(Arc::new(mock), "../test/json/shared-instance-update.json")
+ .await;
}
#[tokio::test]
@@ -1127,7 +704,7 @@ mod handle_instance_tests {
})
.collect::>();
- let mut mock = MockKubeInterface::new();
+ let mut mock = MockControllerKubeClient::default();
configure_for_handle_instance_change(
&mut mock,
&HandleInstanceWork {
@@ -1143,12 +720,11 @@ mod handle_instance_tests {
)),
},
);
- run_handle_instance_change_test(&mut mock, instance_file, &InstanceAction::Update).await;
+ run_handle_instance_change_test(Arc::new(mock), instance_file).await;
}
/// Checks that the BROKER_POD_COUNT_METRIC is appropriately incremented
- /// and decremented when an instance is added and deleted (and pods are
- /// created and deleted). Cannot be run in parallel with other tests
+ /// when an instance is added. Cannot be run in parallel with other tests
/// due to the metric being a global variable and modified unpredictably by
/// other tests.
/// Run with: cargo test -- test_broker_pod_count_metric --ignored
@@ -1160,7 +736,7 @@ mod handle_instance_tests {
.with_label_values(&["config-a", "node-a"])
.set(0);
- let mut mock = MockKubeInterface::new();
+ let mut mock = MockControllerKubeClient::default();
configure_for_handle_instance_change(
&mut mock,
&HandleInstanceWork {
@@ -1174,47 +750,6 @@ mod handle_instance_tests {
addition_work: Some(configure_add_local_config_a_b494b6(false)),
},
);
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/local-instance.json",
- &InstanceAction::Add,
- )
- .await;
-
- // Check that broker pod count metric has been incremented to include new pod for this instance
- assert_eq!(
- BROKER_POD_COUNT_METRIC
- .with_label_values(&["config-a", "node-a"])
- .get(),
- 1
- );
-
- configure_for_handle_instance_change(
- &mut mock,
- &HandleInstanceWork {
- find_pods_selector: "akri.sh/instance=config-a-b494b6",
- find_pods_result: "../test/json/running-pod-list-for-config-a-local.json",
- find_pods_phase: None,
- find_pods_start_time: None,
- find_pods_delete_start_time: false,
- config_work: get_config_work(),
- deletion_work: Some(configure_deletion_work_for_config_a_b494b6()),
- addition_work: None,
- },
- );
- run_handle_instance_change_test(
- &mut mock,
- "../test/json/local-instance.json",
- &InstanceAction::Remove,
- )
- .await;
-
- // Check that broker pod count metric has been decremented to reflect deleted instance and pod
- assert_eq!(
- BROKER_POD_COUNT_METRIC
- .with_label_values(&["config-a", "node-a"])
- .get(),
- 0
- );
+ run_handle_instance_change_test(Arc::new(mock), "../test/json/local-instance.json").await;
}
}
diff --git a/controller/src/util/mod.rs b/controller/src/util/mod.rs
index 4c6953c2d..95f322b63 100644
--- a/controller/src/util/mod.rs
+++ b/controller/src/util/mod.rs
@@ -1,5 +1,27 @@
+pub(crate) mod controller_ctx;
pub mod instance_action;
pub mod node_watcher;
mod pod_action;
pub mod pod_watcher;
mod shared_test_utils;
+
+use thiserror::Error;
+
+#[derive(Error, Debug)]
+pub enum ControllerError {
+ #[error(transparent)]
+ KubeError(#[from] kube::Error),
+
+ #[error("Finalizer Error: {0}")]
+ // NB: awkward type because finalizer::Error embeds the reconciler error (which is this)
+ // so boxing this error to break cycles
+ FinalizerError(#[source] Box<kube::runtime::finalizer::Error<ControllerError>>),
+
+ #[error("Watcher Error: {0}")]
+ WatcherError(#[from] kube::runtime::watcher::Error),
+
+ #[error(transparent)]
+ Other(#[from] anyhow::Error),
+}
+
+pub type Result<T, E = ControllerError> = std::result::Result<T, E>;
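The boxed finalizer error follows the usual kube-rs controller pattern: `finalizer::Error` embeds the reconciler's own error type, so that error has to be boxed to break the type cycle. For context, a minimal sketch of how these types compose when driving a reconciler, assuming the `reconcile`/`error_policy` pair defined in node_watcher.rs below; the `run_node_controller` entry point is an illustrative name, not part of this patch:

```rust
use std::sync::Arc;

use futures::StreamExt;
use k8s_openapi::api::core::v1::Node;
use kube::{
    runtime::{watcher::Config, Controller},
    Api, Client,
};

use crate::util::controller_ctx::ControllerContext;

// Assumes `reconcile` and `error_policy` from node_watcher.rs are in scope.
async fn run_node_controller(ctx: Arc<ControllerContext>) -> anyhow::Result<()> {
    let client = Client::try_default().await?;
    let nodes: Api<Node> = Api::all(client);
    Controller::new(nodes, Config::default())
        // `reconcile` returns `Result<Action, ControllerError>`; on error
        // the runtime asks `error_policy` for the requeue interval.
        .run(reconcile, error_policy, ctx)
        .for_each(|res| async move {
            match res {
                Ok(obj) => log::trace!("reconciled {:?}", obj),
                Err(e) => log::warn!("reconcile failed: {:?}", e),
            }
        })
        .await;
    Ok(())
}
```
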
diff --git a/controller/src/util/node_watcher.rs b/controller/src/util/node_watcher.rs
index 93f53baa5..474e863b2 100644
--- a/controller/src/util/node_watcher.rs
+++ b/controller/src/util/node_watcher.rs
@@ -1,679 +1,472 @@
-use akri_shared::{
- akri::{
- instance::device_usage::NodeUsage,
- instance::{Instance, InstanceSpec},
- retry::{random_delay, MAX_INSTANCE_UPDATE_TRIES},
- },
- k8s,
- k8s::KubeInterface,
+//! This is used to handle Nodes disappearing.
+//!
+//! When a Node disappears, make sure that any Instance that
+//! references the Node is cleaned up. This means that the
+//! Instance.nodes property no longer contains the node and
+//! that the Instance.deviceUsage property no longer contains
+//! slots that are occupied by the node.
+use crate::util::{
+ controller_ctx::{ControllerContext, NodeState},
+ ControllerError, Result,
};
-use futures::{StreamExt, TryStreamExt};
+use akri_shared::akri::instance::{device_usage::NodeUsage, Instance};
+use akri_shared::k8s::api::Api;
+use anyhow::Context;
use k8s_openapi::api::core::v1::{Node, NodeStatus};
-use kube::api::Api;
-use kube_runtime::watcher::{watcher, Config, Event};
-use kube_runtime::WatchStreamExt;
-use log::{error, info, trace};
-use std::collections::HashMap;
+use kube::{
+ api::{
+ ListParams, NotUsed, Object, ObjectList, ObjectMeta, Patch, PatchParams, ResourceExt,
+ TypeMeta,
+ },
+ runtime::{
+ controller::Action,
+ finalizer::{finalizer, Event},
+ reflector::Lookup,
+ },
+};
+use log::{info, trace};
use std::str::FromStr;
+use std::{collections::HashMap, sync::Arc};
-/// Node states that NodeWatcher is interested in
-///
-/// NodeState describes the various states that the controller can
-/// react to for Nodes.
-#[derive(Clone, Debug, PartialEq)]
-enum NodeState {
- /// Node has been seen, but not Running yet
- Known,
- /// Node has been seen Running
- Running,
- /// A previously Running Node has been seen as not Running
- /// and the Instances have been cleaned of references to that
- /// vanished Node
- InstancesCleaned,
-}
+use super::controller_ctx::ControllerKubeClient;
-/// This is used to handle Nodes disappearing.
-///
-/// When a Node disapears, make sure that any Instance that
-/// references the Node is cleaned. This means that the
-/// Instance.nodes property no longer contains the node and
-/// that the Instance.deviceUsage property no longer contains
-/// slots that are occupied by the node.
-pub struct NodeWatcher {
- known_nodes: HashMap<String, NodeState>,
-}
+pub static NODE_FINALIZER: &str = "akri-node-watcher.kube.rs";
-impl NodeWatcher {
- /// Create new instance of BrokerPodWatcher
- pub fn new() -> Self {
- NodeWatcher {
- known_nodes: HashMap::new(),
- }
+pub async fn check(client: Arc<dyn ControllerKubeClient>) -> anyhow::Result<()> {
+ let api: Box<dyn Api<Node>> = client.all();
+ if let Err(e) = api.list(&ListParams::default().limit(1)).await {
+ anyhow::bail!("Nodes are not queryable; {e:?}")
}
+ Ok(())
+}
- /// This watches for Node events
- pub async fn watch(
- &mut self,
- ) -> Result<(), Box<dyn std::error::Error + Send + Sync + 'static>> {
- trace!("watch - enter");
- let kube_interface = k8s::KubeImpl::new().await?;
- let resource = Api::<Node>::all(kube_interface.get_kube_client());
- let watcher = watcher(resource, Config::default()).default_backoff();
- let mut informer = watcher.boxed();
- let mut first_event = true;
-
- // Currently, this does not handle None except to break the loop.
- loop {
- let event = match informer.try_next().await {
- Err(e) => {
- error!("Error during watch: {}", e);
- continue;
- }
- Ok(None) => break,
- Ok(Some(event)) => event,
- };
- self.handle_node(event, &kube_interface, &mut first_event)
- .await?;
- }
+pub fn error_policy(
+ _node: Arc<Node>,
+ error: &ControllerError,
+ _ctx: Arc<ControllerContext>,
+) -> Action {
+ log::warn!("reconcile failed: {:?}", error);
+ Action::requeue(std::time::Duration::from_secs(5 * 60))
+}
- Ok(())
- }
+/// This function is the main reconcile function for Node resources.
+/// It is called every time a Node is added, deleted, or changed, and it is also called for every existing Node on startup.
+///
+/// Nodes are constantly updated. Cleanup work for our services only
+/// needs to be called once.
+///
+/// To achieve this, store each Node's state as one of Known (Node has
+/// been seen, but not Running), Running (Node has been seen as Running),
+/// or InstancesCleaned (a previously Running Node has been seen as not
+/// Running).
+///
+/// When a Node is in the Known state, it is not Running. If it has
+/// never been seen as Running, it is likely being created and there is
+/// no need to clean any Instance.
+///
+/// Once a Node moves through the Running state into a non-Running
+/// state, it becomes important to clean Instances referencing the
+/// non-Running Node.
+pub async fn reconcile(node: Arc<Node>, ctx: Arc<ControllerContext>) -> Result<Action> {
+ trace!("Reconciling node {}", node.name_any());
+ finalizer(
+ &ctx.client.clone().all().as_inner(),
+ NODE_FINALIZER,
+ node,
+ |event| reconcile_inner(event, ctx.clone()),
+ )
+ .await
+ .map_err(|e| ControllerError::FinalizerError(Box::new(e)))
+}
- /// This takes an event off the Node stream and if a Node is no longer
- /// available, it calls handle_node_disappearance.
- ///
- /// Nodes are constantly updated. Cleanup work for our services only
- /// needs to be called once.
- ///
- /// To achieve this, store each Node's state as either Known (Node has
- /// been seen, but not Running), Running (Node has been seen as Running),
- /// and InstanceCleaned (previously Running Node has been seen as not
- /// Running).
- ///
- /// When a Node is in the Known state, it is not Running. If it has
- /// never been seen as Running, it is likely being created and there is
- /// no need to clean any Instance.
- ///
- /// Once a Node moves through the Running state into a non Running
- /// state, it becomes important to clean Instances referencing the
- /// non-Running Node.
- async fn handle_node(
- &mut self,
- event: Event,
- kube_interface: &impl KubeInterface,
- first_event: &mut bool,
- ) -> anyhow::Result<()> {
- trace!("handle_node - enter");
- match event {
- Event::Applied(node) => {
- let node_name = node.metadata.name.clone().unwrap();
- info!("handle_node - Added or modified: {}", node_name);
- if self.is_node_ready(&node) {
- self.known_nodes.insert(node_name, NodeState::Running);
- } else if let std::collections::hash_map::Entry::Vacant(e) =
- self.known_nodes.entry(node_name)
- {
+async fn reconcile_inner(event: Event<Node>, ctx: Arc<ControllerContext>) -> Result<Action> {
+ match event {
+ Event::Apply(node) => {
+ let node_name = node.name_unchecked();
+ info!("handle_node - Added or modified: {}", node_name);
+ if is_node_ready(&node) {
+ ctx.known_nodes
+ .write()
+ .await
+ .insert(node_name, NodeState::Running);
+ } else {
+ let mut guard = ctx.known_nodes.write().await;
+ if let std::collections::hash_map::Entry::Vacant(e) = guard.entry(node_name) {
e.insert(NodeState::Known);
} else {
// Node Modified
- self.call_handle_node_disappearance_if_needed(&node, kube_interface)
- .await?;
- }
- }
- Event::Deleted(node) => {
- info!("handle_node - Deleted: {:?}", &node.metadata.name);
- self.call_handle_node_disappearance_if_needed(&node, kube_interface)
- .await?;
- }
- Event::Restarted(_nodes) => {
- if *first_event {
- info!("handle_node - watcher started");
- } else {
- return Err(anyhow::anyhow!(
- "Node watcher restarted - throwing error to restart controller"
- ));
+ drop(guard);
+ handle_node_disappearance(&node, ctx.clone()).await?;
}
}
- };
- *first_event = false;
- Ok(())
- }
-
- /// This should be called for Nodes that are either !Ready or Deleted.
- /// This function ensures that handle_node_disappearance is called
- /// only once for any Node as it disappears.
- async fn call_handle_node_disappearance_if_needed(
- &mut self,
- node: &Node,
- kube_interface: &impl KubeInterface,
- ) -> anyhow::Result<()> {
- let node_name = node.metadata.name.clone().unwrap();
- trace!(
- "call_handle_node_disappearance_if_needed - enter: {:?}",
- &node.metadata.name
- );
- let last_known_state = self
- .known_nodes
- .get(&node_name)
- .unwrap_or(&NodeState::Running);
- trace!(
- "call_handle_node_disappearance_if_needed - last_known_state: {:?}",
- &last_known_state
- );
- // Nodes are updated roughly once a minute ... try to only call
- // handle_node_disappearance once for a node that disappears.
- //
- // Also, there is no need to call handle_node_disappearance if a
- // Node has never been in the Running state.
- if last_known_state == &NodeState::Running {
- trace!(
- "call_handle_node_disappearance_if_needed - call handle_node_disappearance: {:?}",
- &node.metadata.name
- );
- self.handle_node_disappearance(&node_name, kube_interface)
- .await?;
- self.known_nodes
- .insert(node_name, NodeState::InstancesCleaned);
+ Ok(Action::await_change())
+ }
+ Event::Cleanup(node) => {
+ info!("handle_node - Deleted: {:?}", &node.name_unchecked());
+ handle_node_disappearance(&node, ctx.clone()).await?;
+ ctx.known_nodes.write().await.remove(&node.name_unchecked());
+ Ok(Action::await_change())
}
- Ok(())
- }
-
- /// This determines if a node is in the Ready state.
- fn is_node_ready(&self, k8s_node: &Node) -> bool {
- trace!("is_node_ready - for node {:?}", k8s_node.metadata.name);
- k8s_node
- .status
- .as_ref()
- .unwrap_or(&NodeStatus::default())
- .conditions
- .as_ref()
- .unwrap_or(&Vec::new())
- .iter()
- .filter_map(|condition| {
- if condition.type_ == "Ready" {
- Some(condition.status == "True")
- } else {
- None
- }
- })
- .collect::<Vec<bool>>()
- .last()
- .unwrap_or(&false)
- == &true
}
+}
- /// This handles when a node disappears by clearing nodes from
- /// the nodes list and deviceUsage map and then trying 5 times to
- /// update the Instance.
- async fn handle_node_disappearance(
- &self,
- vanished_node_name: &str,
- kube_interface: &impl KubeInterface,
- ) -> anyhow::Result<()> {
- trace!(
- "handle_node_disappearance - enter vanished_node_name={:?}",
- vanished_node_name,
- );
-
- let instances = kube_interface.get_instances().await?;
+/// This should be called for Nodes that are either !Ready or Deleted.
+/// This function will clean up any Instances that reference a Node that
+/// was previously Running.
+async fn handle_node_disappearance(node: &Node, ctx: Arc<ControllerContext>) -> anyhow::Result<()> {
+ let node_name = node.name_unchecked();
+ trace!(
+ "handle_node_disappearance - enter: {:?}",
+ &node.metadata.name
+ );
+ let last_known_state = ctx
+ .known_nodes
+ .read()
+ .await
+ .get(&node_name)
+ .unwrap_or(&NodeState::Running)
+ .clone();
+ trace!(
+ "handle_node_disappearance - last_known_state: {:?}",
+ &last_known_state
+ );
+
+ // If the node was running and no longer is, clear the node from
+ // each instance's nodes list and deviceUsage map.
+ if last_known_state == NodeState::Running {
+ let api = ctx.client.all();
+ let instances: ObjectList<Instance> = api.list(&ListParams::default()).await?;
trace!(
"handle_node_disappearance - found {:?} instances",
instances.items.len()
);
for instance in instances.items {
- let instance_name = instance.metadata.name.clone().unwrap();
- let instance_namespace = instance.metadata.namespace.as_ref().ok_or_else(|| {
- anyhow::anyhow!("Namespace not found for instance: {}", instance_name)
- })?;
-
- trace!(
- "handle_node_disappearance - make sure node is not referenced here: {:?}",
- &instance_name
- );
-
- // Try up to MAX_INSTANCE_UPDATE_TRIES times to update/create/get instance
- for x in 0..MAX_INSTANCE_UPDATE_TRIES {
- match if x == 0 {
- self.try_remove_nodes_from_instance(
- vanished_node_name,
- &instance_name,
- instance_namespace,
- &instance,
- kube_interface,
- )
- .await
- } else {
- let retry_instance = kube_interface
- .find_instance(&instance_name, instance_namespace)
- .await?;
- self.try_remove_nodes_from_instance(
- vanished_node_name,
- &instance_name,
- instance_namespace,
- &retry_instance,
- kube_interface,
- )
- .await
- } {
- Ok(_) => break,
- Err(e) => {
- if x == (MAX_INSTANCE_UPDATE_TRIES - 1) {
- return Err(e);
- }
- random_delay().await;
- }
- }
- }
+ let instance_name = instance.name_unchecked();
+ try_remove_nodes_from_instance(&node_name, &instance_name, &instance, api.as_ref())
+ .await?;
+ api.remove_finalizer(&instance, &node_name).await?;
}
-
- trace!("handle_node_disappearance - exit");
- Ok(())
+ ctx.known_nodes
+ .write()
+ .await
+ .insert(node_name.to_string(), NodeState::InstancesCleaned);
}
+ Ok(())
+}
- /// This attempts to remove nodes from the nodes list and deviceUsage
- /// map in an Instance. An attempt is made to update
- /// the instance in etcd, any failure is returned.
- async fn try_remove_nodes_from_instance(
- &self,
- vanished_node_name: &str,
- instance_name: &str,
- instance_namespace: &str,
- instance: &Instance,
- kube_interface: &impl KubeInterface,
- ) -> Result<(), anyhow::Error> {
- trace!(
- "try_remove_nodes_from_instance - vanished_node_name: {:?}",
- &vanished_node_name
- );
- let modified_nodes = instance
- .spec
- .nodes
- .iter()
- .filter(|node| &vanished_node_name != node)
- .map(|node| node.into())
- .collect::<Vec<String>>();
- // Remove nodes from instance.deviceusage
- let modified_device_usage = instance
- .spec
- .device_usage
- .iter()
- .map(|(slot, usage)| {
- let same_node_name = match NodeUsage::from_str(usage) {
- Ok(node_usage) => node_usage.is_same_node(vanished_node_name),
- Err(_) => false,
- };
-
- (
- slot.to_string(),
- if same_node_name {
- NodeUsage::default().to_string()
- } else {
- usage.into()
- },
- )
- })
- .collect::<HashMap<String, String>>();
-
- // Save the instance
- let modified_instance = InstanceSpec {
- cdi_name: instance.spec.cdi_name.clone(),
- capacity: instance.spec.capacity,
- configuration_name: instance.spec.configuration_name.clone(),
- broker_properties: instance.spec.broker_properties.clone(),
- shared: instance.spec.shared,
- device_usage: modified_device_usage,
- nodes: modified_nodes,
- };
-
- trace!(
- "handle_node_disappearance - kube_interface.update_instance name: {}, namespace: {}, {:?}",
- &instance_name,
- &instance_namespace,
- &modified_instance
- );
+/// This determines if a node is in the Ready state.
+fn is_node_ready(k8s_node: &Node) -> bool {
+ trace!("is_node_ready - for node {:?}", k8s_node.metadata.name);
+ k8s_node
+ .status
+ .as_ref()
+ .unwrap_or(&NodeStatus::default())
+ .conditions
+ .as_ref()
+ .unwrap_or(&Vec::new())
+ .iter()
+ .filter(|condition| condition.type_ == "Ready")
+ .last()
+ .map_or(false, |condition| condition.status == "True")
+}
- kube_interface
- .update_instance(&modified_instance, instance_name, instance_namespace)
- .await
- }
+/// This attempts to remove nodes from the nodes list and deviceUsage
+/// map in an Instance. An attempt is made to update
+/// the instance in etcd, any failure is returned.
+async fn try_remove_nodes_from_instance(
+ vanished_node_name: &str,
+ instance_name: &str,
+ instance: &Instance,
+ api: &dyn Api<Instance>,
+) -> Result<(), anyhow::Error> {
+ trace!(
+ "try_remove_nodes_from_instance - vanished_node_name: {:?}",
+ &vanished_node_name
+ );
+ let modified_nodes = instance
+ .spec
+ .nodes
+ .iter()
+ .filter(|node| &vanished_node_name != node)
+ .map(|node| node.into())
+ .collect::<Vec<String>>();
+ // Remove nodes from instance.deviceusage
+ let modified_device_usage = instance
+ .spec
+ .device_usage
+ .iter()
+ .map(|(slot, usage)| match NodeUsage::from_str(usage) {
+ Ok(node_usage) if node_usage.is_same_node(vanished_node_name) => {
+ (slot.to_owned(), NodeUsage::default().to_string())
+ }
+ Ok(_) => (slot.to_owned(), usage.into()),
+ Err(_) => (slot.to_owned(), usage.into()),
+ })
+ .collect::<HashMap<String, String>>();
+ let mut modified_spec = instance.spec.clone();
+ modified_spec.nodes = modified_nodes;
+ modified_spec.device_usage = modified_device_usage;
+ let patch = Patch::Merge(
+ serde_json::to_value(Object {
+ types: Some(TypeMeta {
+ api_version: Instance::api_version(&()).to_string(),
+ kind: Instance::kind(&()).to_string(),
+ }),
+ status: None::<NotUsed>,
+ spec: modified_spec,
+ metadata: ObjectMeta {
+ name: Some(instance_name.to_string()),
+ ..Default::default()
+ },
+ })
+ .context("Could not create instance patch")?,
+ );
+ api.raw_patch(instance_name, &patch, &PatchParams::default())
+ .await?;
+ Ok(())
}
#[cfg(test)]
mod tests {
- use super::super::shared_test_utils::config_for_tests;
+ use super::super::shared_test_utils::mock_client::MockControllerKubeClient;
use super::*;
- use akri_shared::{akri::instance::InstanceList, k8s::MockKubeInterface, os::file};
-
- #[derive(Clone)]
- struct UpdateInstance {
- instance_to_update: InstanceSpec,
- instance_name: &'static str,
- instance_namespace: &'static str,
- }
-
- #[derive(Clone)]
- struct HandleNodeDisappearance {
- get_instances_result_file: &'static str,
- get_instances_result_listify: bool,
- update_instance: Option,
- }
-
- fn configure_for_handle_node_disappearance(
- mock: &mut MockKubeInterface,
- work: &HandleNodeDisappearance,
- ) {
- config_for_tests::configure_get_instances(
- mock,
- work.get_instances_result_file,
- work.get_instances_result_listify,
- );
-
- if let Some(update_instance) = &work.update_instance {
- config_for_tests::configure_update_instance(
- mock,
- update_instance.instance_to_update.clone(),
- update_instance.instance_name,
- update_instance.instance_namespace,
- false,
- );
- }
- }
-
- // Test that watcher errors on restarts unless it is the first restart (aka initial startup)
- #[tokio::test]
- async fn test_handle_watcher_restart() {
- let _ = env_logger::builder().is_test(true).try_init();
- let mut pod_watcher = NodeWatcher::new();
- let mut first_event = true;
- assert!(pod_watcher
- .handle_node(
- Event::Restarted(Vec::new()),
- &MockKubeInterface::new(),
- &mut first_event
- )
- .await
- .is_ok());
- first_event = false;
- assert!(pod_watcher
- .handle_node(
- Event::Restarted(Vec::new()),
- &MockKubeInterface::new(),
- &mut first_event
- )
- .await
- .is_err());
- }
-
- #[tokio::test]
- async fn test_handle_node_added_unready() {
- let _ = env_logger::builder().is_test(true).try_init();
- let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json");
- let node: Node = serde_json::from_str(&node_json).unwrap();
- let mut node_watcher = NodeWatcher::new();
- node_watcher
- .handle_node(Event::Applied(node), &MockKubeInterface::new(), &mut false)
- .await
- .unwrap();
-
- assert_eq!(1, node_watcher.known_nodes.len());
+ use akri_shared::{akri::instance::InstanceSpec, k8s::api::MockApi, os::file};
- assert_eq!(
- &NodeState::Known,
- node_watcher.known_nodes.get(&"node-a".to_string()).unwrap()
- )
+ fn instances_list(
+ instance_name: &str,
+ instance_namespace: &str,
+ ) -> kube::Result<ObjectList<Instance>> {
+ let list = serde_json::json!({
+ "apiVersion": "v1",
+ "kind": "List",
+ "metadata": {
+ "resourceVersion": "",
+ "selfLink": ""
+ },
+ "items": [
+ {
+ "apiVersion": "akri.sh/v0",
+ "kind": "Instance",
+ "metadata": {
+ "name": instance_name,
+ "namespace": instance_namespace,
+ "uid": "abcdegfh-ijkl-mnop-qrst-uvwxyz012345"
+ },
+ "spec": {
+ "configurationName": "config-a",
+ "capacity": 5,
+ "cdiName": "akri.sh/config-a=359973",
+ "deviceUsage": {
+ format!("{instance_name}-0"): "node-b",
+ format!("{instance_name}-1"): "node-a",
+ format!("{instance_name}-2"): "node-b",
+ format!("{instance_name}-3"): "node-a",
+ format!("{instance_name}-4"): "node-c",
+ format!("{instance_name}-5"): ""
+ },
+ "nodes": [ "node-a", "node-b", "node-c" ],
+ "shared": true
+ }
+ }
+ ]
+ });
+ Ok(serde_json::from_value(list).unwrap())
}
#[tokio::test]
- async fn test_handle_node_added_ready() {
+ async fn test_reconcile_node_apply_ready() {
let _ = env_logger::builder().is_test(true).try_init();
-
let node_json = file::read_file_to_string("../test/json/node-a.json");
let node: Node = serde_json::from_str(&node_json).unwrap();
- let mut node_watcher = NodeWatcher::new();
- node_watcher
- .handle_node(Event::Applied(node), &MockKubeInterface::new(), &mut false)
+ let node_name = node.name_unchecked();
+ let mut mock = MockControllerKubeClient::default();
+ mock.node
+ .expect_all()
+ .return_once(|| Box::new(MockApi::new()));
+ let ctx = Arc::new(ControllerContext::new(Arc::new(mock)));
+ reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone())
.await
.unwrap();
- assert_eq!(1, node_watcher.known_nodes.len());
-
assert_eq!(
&NodeState::Running,
- node_watcher.known_nodes.get(&"node-a".to_string()).unwrap()
- )
+ ctx.known_nodes.read().await.get(&node_name).unwrap()
+ );
}
#[tokio::test]
- async fn test_handle_node_modified_unready_unknown() {
+ async fn test_reconcile_node_apply_unready_unknown() {
let _ = env_logger::builder().is_test(true).try_init();
-
- let node_json = file::read_file_to_string("../test/json/node-b-not-ready.json");
+ let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json");
let node: Node = serde_json::from_str(&node_json).unwrap();
- let mut node_watcher = NodeWatcher::new();
- let instance_file = "../test/json/shared-instance-update.json";
- let instance_json = file::read_file_to_string(instance_file);
- let kube_object_instance: Instance = serde_json::from_str(&instance_json).unwrap();
- let mut instance = kube_object_instance.spec;
- instance.nodes.clear();
- instance
- .device_usage
- .insert("config-a-359973-2".to_string(), "".to_string());
-
- let mut mock = MockKubeInterface::new();
- configure_for_handle_node_disappearance(
- &mut mock,
- &HandleNodeDisappearance {
- get_instances_result_file: "../test/json/shared-instance-update.json",
- get_instances_result_listify: true,
- update_instance: Some(UpdateInstance {
- instance_to_update: instance,
- instance_name: "config-a-359973",
- instance_namespace: "config-a-namespace",
- }),
- },
- );
- // Insert node into list of known_nodes to mock being previously applied
- node_watcher
- .known_nodes
- .insert(node.metadata.name.clone().unwrap(), NodeState::Running);
- node_watcher
- .handle_node(Event::Applied(node), &mock, &mut false)
+ let node_name = node.name_unchecked();
+ let mut mock = MockControllerKubeClient::default();
+ mock.node
+ .expect_all()
+ .return_once(|| Box::new(MockApi::new()));
+ let ctx = Arc::new(ControllerContext::new(Arc::new(mock)));
+ reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone())
.await
.unwrap();
- assert_eq!(1, node_watcher.known_nodes.len());
-
assert_eq!(
- &NodeState::InstancesCleaned,
- node_watcher.known_nodes.get(&"node-b".to_string()).unwrap()
- )
+ &NodeState::Known,
+ ctx.known_nodes.read().await.get(&node_name).unwrap()
+ );
}
-
+ // If a known node is modified and is still not ready, it should remain in the known state
#[tokio::test]
- async fn test_handle_node_modified_ready_unknown() {
+ async fn test_reconcile_node_apply_unready_known() {
let _ = env_logger::builder().is_test(true).try_init();
-
- let node_json = file::read_file_to_string("../test/json/node-b.json");
+ let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json");
let node: Node = serde_json::from_str(&node_json).unwrap();
- let mut node_watcher = NodeWatcher::new();
-
- let mock = MockKubeInterface::new();
- node_watcher
- .handle_node(Event::Applied(node), &mock, &mut false)
+ let node_name = node.name_unchecked();
+ let mut mock = MockControllerKubeClient::default();
+ mock.node
+ .expect_all()
+ .return_once(|| Box::new(MockApi::new()));
+ let ctx = Arc::new(ControllerContext::new(Arc::new(mock)));
+ ctx.known_nodes
+ .write()
+ .await
+ .insert(node_name.clone(), NodeState::Known);
+ reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone())
.await
.unwrap();
- assert_eq!(1, node_watcher.known_nodes.len());
-
assert_eq!(
- &NodeState::Running,
- node_watcher.known_nodes.get(&"node-b".to_string()).unwrap()
- )
+ &NodeState::Known,
+ ctx.known_nodes.read().await.get(&node_name).unwrap()
+ );
}
+ // If previously running node is modified and is not ready, it should remove the node from the instances' node lists
#[tokio::test]
- async fn test_handle_node_deleted_unready_unknown() {
+ async fn test_reconcile_node_apply_unready_previously_running() {
let _ = env_logger::builder().is_test(true).try_init();
-
- let node_json = file::read_file_to_string("../test/json/node-b-not-ready.json");
+ let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json");
let node: Node = serde_json::from_str(&node_json).unwrap();
- let mut node_watcher = NodeWatcher::new();
-
- let instance_file = "../test/json/shared-instance-update.json";
- let instance_json = file::read_file_to_string(instance_file);
- let kube_object_instance: Instance = serde_json::from_str(&instance_json).unwrap();
- let mut instance = kube_object_instance.spec;
- instance.nodes.clear();
- instance
- .device_usage
- .insert("config-a-359973-2".to_string(), "".to_string());
-
- let mut mock = MockKubeInterface::new();
- configure_for_handle_node_disappearance(
- &mut mock,
- &HandleNodeDisappearance {
- get_instances_result_file: "../test/json/shared-instance-update.json",
- get_instances_result_listify: true,
- update_instance: Some(UpdateInstance {
- instance_to_update: instance,
- instance_name: "config-a-359973",
- instance_namespace: "config-a-namespace",
- }),
- },
- );
-
- node_watcher
- .handle_node(Event::Deleted(node), &mock, &mut false)
+ let node_name = node.name_unchecked();
+ let mut mock = MockControllerKubeClient::default();
+ mock.node
+ .expect_all()
+ .return_once(|| Box::new(MockApi::new()));
+ let mut instance_api_mock: MockApi<Instance> = MockApi::new();
+ let instance_name = "config-a-359973";
+ instance_api_mock
+ .expect_list()
+ .return_once(|_| instances_list(instance_name, "unused"));
+ instance_api_mock
+ .expect_raw_patch()
+ .return_once(|_, _, _| Ok(Instance::new("unused", InstanceSpec::default())))
+ .withf(|_, patch, _| match patch {
+ Patch::Merge(v) => {
+ let instance: Instance = serde_json::from_value(v.clone()).unwrap();
+ !instance.spec.nodes.contains(&"node-a".to_owned())
+ }
+ _ => false,
+ });
+ instance_api_mock
+ .expect_remove_finalizer()
+ .returning(|_, _| Ok(()));
+ mock.instance
+ .expect_all()
+ .return_once(move || Box::new(instance_api_mock));
+ let ctx = Arc::new(ControllerContext::new(Arc::new(mock)));
+ ctx.known_nodes
+ .write()
+ .await
+ .insert(node_name.clone(), NodeState::Running);
+ reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone())
.await
.unwrap();
-
- assert_eq!(1, node_watcher.known_nodes.len());
-
assert_eq!(
&NodeState::InstancesCleaned,
- node_watcher.known_nodes.get(&"node-b".to_string()).unwrap()
- )
- }
-
- const LIST_PREFIX: &str = r#"
-{
- "apiVersion": "v1",
- "items": ["#;
- const LIST_SUFFIX: &str = r#"
- ],
- "kind": "List",
- "metadata": {
- "resourceVersion": "",
- "selfLink": ""
- }
-}"#;
- fn listify_node(node_json: &str) -> String {
- format!("{}\n{}\n{}", LIST_PREFIX, node_json, LIST_SUFFIX)
+ ctx.known_nodes.read().await.get(&node_name).unwrap()
+ );
}
+ // If previously running node enters the cleanup state, it should remove the node from the instances' node lists
+ // and ensure that the node is removed from the known_nodes
#[tokio::test]
- async fn test_handle_node_disappearance_update_failure_retries() {
+ async fn test_reconcile_node_cleanup() {
let _ = env_logger::builder().is_test(true).try_init();
-
- let mut mock = MockKubeInterface::new();
- mock.expect_get_instances().times(1).returning(move || {
- let instance_file = "../test/json/shared-instance-update.json";
- let instance_json = file::read_file_to_string(instance_file);
- let instance_list_json = listify_node(&instance_json);
- let list: InstanceList = serde_json::from_str(&instance_list_json).unwrap();
- Ok(list)
- });
- mock.expect_update_instance()
- .times(MAX_INSTANCE_UPDATE_TRIES as usize)
- .withf(move |_instance, n, ns| n == "config-a-359973" && ns == "config-a-namespace")
- .returning(move |_, _, _| Err(None.ok_or_else(|| anyhow::anyhow!("failure"))?));
- mock.expect_find_instance()
- .times((MAX_INSTANCE_UPDATE_TRIES - 1) as usize)
- .withf(move |n, ns| n == "config-a-359973" && ns == "config-a-namespace")
- .returning(move |_, _| {
- let instance_file = "../test/json/shared-instance-update.json";
- let instance_json = file::read_file_to_string(instance_file);
- let instance: Instance = serde_json::from_str(&instance_json).unwrap();
- Ok(instance)
+ let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json");
+ let node: Node = serde_json::from_str(&node_json).unwrap();
+ let node_name = node.name_unchecked();
+ let mut mock = MockControllerKubeClient::default();
+ mock.node
+ .expect_all()
+ .return_once(|| Box::new(MockApi::new()));
+ let mut instance_api_mock: MockApi<Instance> = MockApi::new();
+ let instance_name = "config-a-359973";
+ instance_api_mock
+ .expect_list()
+ .return_once(|_| instances_list(instance_name, "unused"));
+ instance_api_mock
+ .expect_raw_patch()
+ .return_once(|_, _, _| Ok(Instance::new("unused", InstanceSpec::default())))
+ .withf(|_, patch, _| match patch {
+ Patch::Merge(v) => {
+ let instance: Instance = serde_json::from_value(v.clone()).unwrap();
+ !instance.spec.nodes.contains(&"node-a".to_owned())
+ }
+ _ => false,
});
-
- let node_watcher = NodeWatcher::new();
- assert!(node_watcher
- .handle_node_disappearance("foo-a", &mock)
+ instance_api_mock
+ .expect_remove_finalizer()
+ .returning(|_, _| Ok(()));
+ mock.instance
+ .expect_all()
+ .return_once(move || Box::new(instance_api_mock));
+ let ctx = Arc::new(ControllerContext::new(Arc::new(mock)));
+ ctx.known_nodes
+ .write()
+ .await
+ .insert(node_name.clone(), NodeState::Running);
+ reconcile_inner(Event::Cleanup(Arc::new(node)), ctx.clone())
.await
- .is_err());
+ .unwrap();
+ assert!(ctx.known_nodes.read().await.get(&node_name).is_none());
}
+ // If unknown node is deleted, it should remove the node from the instances' node lists
#[tokio::test]
- async fn test_try_remove_nodes_from_instance() {
+ async fn test_reconcile_node_cleanup_unknown() {
let _ = env_logger::builder().is_test(true).try_init();
-
- let instance_file = "../test/json/shared-instance-update.json";
- let instance_json = file::read_file_to_string(instance_file);
- let kube_object_instance: Instance = serde_json::from_str(&instance_json).unwrap();
-
- let mut mock = MockKubeInterface::new();
- mock.expect_update_instance()
- .times(1)
- .withf(move |ins, n, ns| {
- n == "config-a"
- && ns == "config-a-namespace"
- && !ins.nodes.contains(&"node-b".to_string())
- && ins
- .device_usage
- .iter()
- .filter_map(|(_slot, value)| {
- if value == &"node-b".to_string() {
- Some(value.to_string())
- } else {
- None
- }
- })
- .collect::<Vec<String>>()
- .first()
- .is_none()
- })
- .returning(move |_, _, _| Ok(()));
-
- let node_watcher = NodeWatcher::new();
- assert!(node_watcher
- .try_remove_nodes_from_instance(
- "node-b",
- "config-a",
- "config-a-namespace",
- &kube_object_instance,
- &mock,
- )
+ let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json");
+ let node: Node = serde_json::from_str(&node_json).unwrap();
+ let node_name = node.name_unchecked();
+ let mut mock = MockControllerKubeClient::default();
+ mock.node
+ .expect_all()
+ .return_once(|| Box::new(MockApi::new()));
+ let mut instance_api_mock: MockApi<Instance> = MockApi::new();
+ let instance_name = "config-a-359973";
+ instance_api_mock
+ .expect_list()
+ .return_once(|_| instances_list(instance_name, "unused"));
+ instance_api_mock
+ .expect_raw_patch()
+ .return_once(|_, _, _| Ok(Instance::new("unused", InstanceSpec::default())))
+ .withf(|_, patch, _| match patch {
+ Patch::Merge(v) => {
+ let instance: Instance = serde_json::from_value(v.clone()).unwrap();
+ !instance.spec.nodes.contains(&"node-a".to_owned())
+ }
+ _ => false,
+ });
+ instance_api_mock
+ .expect_remove_finalizer()
+ .returning(|_, _| Ok(()));
+ mock.instance
+ .expect_all()
+ .return_once(move || Box::new(instance_api_mock));
+ let ctx = Arc::new(ControllerContext::new(Arc::new(mock)));
+ reconcile_inner(Event::Cleanup(Arc::new(node)), ctx.clone())
.await
- .is_ok());
- }
-
- #[test]
- fn test_is_node_ready_ready() {
- let _ = env_logger::builder().is_test(true).try_init();
-
- let tests = [
- ("../test/json/node-a.json", true),
- ("../test/json/node-a-not-ready.json", false),
- ("../test/json/node-a-no-conditions.json", false),
- ("../test/json/node-a-no-ready-condition.json", false),
- ];
-
- for (node_file, result) in tests.iter() {
- trace!(
- "Testing {} should reflect node is ready={}",
- node_file,
- result
- );
-
- let node_json = file::read_file_to_string(node_file);
- let kube_object_node: Node = serde_json::from_str(&node_json).unwrap();
-
- let node_watcher = NodeWatcher::new();
- assert_eq!(
- result.clone(),
- node_watcher.is_node_ready(&kube_object_node)
- );
- }
+ .unwrap();
+ assert!(ctx.known_nodes.read().await.get(&node_name).is_none());
}
}
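`is_node_ready` now gates the entire Known/Running/InstancesCleaned transition, so its condition handling is worth pinning down in isolation. A sketch of an inline check, assuming it lives in the same module as the function; the repository's own tests exercise this through JSON fixtures instead:

```rust
#[cfg(test)]
mod readiness_sketch {
    use k8s_openapi::api::core::v1::{Node, NodeCondition, NodeStatus};

    fn node_with_conditions(conditions: Vec<NodeCondition>) -> Node {
        Node {
            status: Some(NodeStatus {
                conditions: Some(conditions),
                ..Default::default()
            }),
            ..Default::default()
        }
    }

    #[test]
    fn readiness_follows_the_ready_condition() {
        let pressure = NodeCondition {
            type_: "MemoryPressure".to_string(),
            status: "False".to_string(),
            ..Default::default()
        };
        let ready = NodeCondition {
            type_: "Ready".to_string(),
            status: "True".to_string(),
            ..Default::default()
        };
        // Only the "Ready" condition decides readiness, regardless of
        // which other conditions are present.
        assert!(super::is_node_ready(&node_with_conditions(vec![
            pressure.clone(),
            ready
        ])));
        assert!(!super::is_node_ready(&node_with_conditions(vec![pressure])));
    }
}
```
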
diff --git a/controller/src/util/pod_action.rs b/controller/src/util/pod_action.rs
index 71b5172b6..ad2777622 100644
--- a/controller/src/util/pod_action.rs
+++ b/controller/src/util/pod_action.rs
@@ -1,4 +1,3 @@
-use super::instance_action::InstanceAction;
use chrono::Utc;
use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time;
@@ -23,7 +22,6 @@ pub enum PodAction {
/// a broker Pod.
///
/// The action to take is based on several factors:
-/// 1. what the InstanceAction is (Add, Delete, Modify)
/// 1. what phase the Pod is in (Running, Pending, etc)
/// 1. when the Pod started
/// 1. the relevant grace time
@@ -32,10 +30,9 @@ pub struct PodActionInfo {
pub pending_grace_time_in_minutes: i64,
pub ended_grace_time_in_minutes: i64,
pub phase: String,
- pub instance_action: InstanceAction,
+ pub status_start_time: Option<Time>,