From 7482c9fd19e56847cd01d85d17daea079b2b5644 Mon Sep 17 00:00:00 2001 From: Kate Goldenring Date: Tue, 24 Sep 2024 22:03:55 -0700 Subject: [PATCH 1/8] Update controller and agent to kube-rs 0.91.0 Signed-off-by: Kate Goldenring --- Cargo.lock | 402 ++- agent/Cargo.toml | 6 +- .../discovery_handler_registry.rs | 2 +- agent/src/discovery_handler_manager/mod.rs | 2 +- agent/src/plugin_manager/v1.rs | 131 +- agent/src/plugin_manager/v1beta1.rs | 361 ++- .../discovery_configuration_controller.rs | 2 +- controller/Cargo.toml | 13 +- controller/src/main.rs | 49 +- controller/src/util/controller_ctx.rs | 106 + controller/src/util/instance_action.rs | 564 ++-- controller/src/util/mod.rs | 19 + controller/src/util/node_watcher.rs | 990 +++---- controller/src/util/pod_watcher.rs | 2284 ++++++----------- controller/src/util/shared_test_utils.rs | 444 ++-- shared/Cargo.toml | 7 +- shared/src/akri/instance.rs | 2 +- shared/src/k8s/api.rs | 54 +- shared/src/k8s/job.rs | 180 +- shared/src/k8s/mod.rs | 623 +---- shared/src/k8s/node.rs | 28 - shared/src/k8s/pod.rs | 57 +- shared/src/k8s/service.rs | 789 ------ 23 files changed, 2492 insertions(+), 4623 deletions(-) create mode 100644 controller/src/util/controller_ctx.rs delete mode 100644 shared/src/k8s/node.rs delete mode 100644 shared/src/k8s/service.rs diff --git a/Cargo.lock b/Cargo.lock index ba116e178..dade00d52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -236,19 +236,19 @@ dependencies = [ "async-stream", "async-trait", "blake2", - "env_logger", + "env_logger 0.10.2", "futures", - "hyper", + "hyper 0.14.30", "itertools", - "k8s-openapi 0.20.0", - "kube 0.87.2", + "k8s-openapi 0.22.0", + "kube 0.91.0", "kube-runtime", "lazy_static", "log", "mock_instant", "mockall", "mockall_double", - "prometheus", + "prometheus 0.12.0", "prost", "serde", "serde_derive", @@ -336,9 +336,9 @@ dependencies = [ "base64 0.13.1", "bytes", "chrono", - "env_logger", + "env_logger 0.10.2", "futures-util", - "hyper", + "hyper 0.14.30", "log", 
"mockall", "serde", @@ -381,12 +381,12 @@ dependencies = [ "anyhow", "async-trait", "either", - "env_logger", - "k8s-openapi 0.20.0", - "kube 0.87.2", + "env_logger 0.10.2", + "k8s-openapi 0.22.0", + "kube 0.91.0", "log", "mockall", - "prometheus", + "prometheus 0.12.0", "rand", "schemars", "serde", @@ -406,7 +406,7 @@ dependencies = [ "akri-discovery-utils", "anyhow", "async-trait", - "env_logger", + "env_logger 0.10.2", "log", "mockall", "pest", @@ -512,6 +512,18 @@ version = "1.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" +[[package]] +name = "async-broadcast" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cd0e2e25ea8e5f7e9df04578dc6cf5c83577fd09b1a46aaf5c85e1c33f2a7e" +dependencies = [ + "event-listener 5.3.1", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-channel" version = "1.9.0" @@ -682,8 +694,8 @@ dependencies = [ "bytes", "futures-util", "http 0.2.12", - "http-body", - "hyper", + "http-body 0.4.6", + "hyper 0.14.30", "itoa", "matchit", "memchr", @@ -708,7 +720,7 @@ dependencies = [ "bytes", "futures-util", "http 0.2.12", - "http-body", + "http-body 0.4.6", "mime", "rustversion", "tower-layer", @@ -940,16 +952,17 @@ dependencies = [ "anyhow", "async-std", "chrono", - "env_logger", + "either", + "env_logger 0.11.5", "futures", - "k8s-openapi 0.20.0", - "kube 0.87.2", - "kube-runtime", + "k8s-openapi 0.22.0", + "kube 0.91.0", "lazy_static", "log", "mockall", - "prometheus", + "prometheus 0.13.4", "serde_json", + "thiserror", "tokio", ] @@ -1112,7 +1125,7 @@ version = "0.13.3" dependencies = [ "akri-debug-echo", "akri-discovery-utils", - "env_logger", + "env_logger 0.10.2", "log", "tokio", ] @@ -1217,6 +1230,16 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "env_filter" +version = "0.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab" +dependencies = [ + "log", + "regex", +] + [[package]] name = "env_logger" version = "0.10.2" @@ -1230,6 +1253,19 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d" +dependencies = [ + "anstream", + "anstyle", + "env_filter", + "humantime", + "log", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -1621,6 +1657,29 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.1", + "pin-project-lite", +] + [[package]] name = "http-range-header" version = "0.3.1" @@ -1657,7 +1716,7 @@ dependencies = [ "futures-util", "h2", "http 0.2.12", - "http-body", + "http-body 0.4.6", "httparse", "httpdate", "itoa", @@ -1669,6 +1728,25 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.1", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-openssl" version = "0.9.2" @@ -1676,7 +1754,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d6ee5d7a8f718585d1c3c61dfde28ef5b0bb14734b4db13f5ada856cdc6c612b" dependencies = [ "http 0.2.12", - "hyper", + "hyper 0.14.30", "linked_hash_set", "once_cell", "openssl", @@ -1689,18 +1767,21 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.24.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", - "http 0.2.12", - "hyper", + "http 1.1.0", + "hyper 1.4.1", + "hyper-util", "log", - "rustls", + "rustls 0.23.14", "rustls-native-certs", + "rustls-pki-types", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.0", + "tower-service", ] [[package]] @@ -1709,12 +1790,44 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper", + "hyper 0.14.30", "pin-project-lite", "tokio", "tokio-io-timeout", ] +[[package]] +name = "hyper-timeout" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793" +dependencies = [ + "hyper 1.4.1", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.1", + "hyper 1.4.1", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + [[package]] name = "iana-time-zone" version = "0.1.61" @@ -1869,10 +1982,12 @@ dependencies = [ [[package]] name = "jsonpath-rust" -version = 
"0.3.5" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829" +checksum = "19d8fe85bd70ff715f31ce8c739194b423d79811a19602115d611a3ec85d6200" dependencies = [ + "lazy_static", + "once_cell", "pest", "pest_derive", "regex", @@ -1908,12 +2023,11 @@ dependencies = [ [[package]] name = "k8s-openapi" -version = "0.20.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6" +checksum = "19501afb943ae5806548bc3ebd7f3374153ca057a38f480ef30adfde5ef09755" dependencies = [ - "base64 0.21.7", - "bytes", + "base64 0.22.1", "chrono", "schemars", "serde", @@ -1935,14 +2049,15 @@ dependencies = [ [[package]] name = "kube" -version = "0.87.2" +version = "0.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3499c8d60c763246c7a213f51caac1e9033f46026904cb89bc8951ae8601f26e" +checksum = "264461a7ebf4fb0fcf23e4c7e4f9387c5696ee61d003de207d9b5a895ff37bfa" dependencies = [ - "k8s-openapi 0.20.0", - "kube-client 0.87.2", - "kube-core 0.87.2", - "kube-derive 0.87.2", + "k8s-openapi 0.22.0", + "kube-client 0.91.0", + "kube-core 0.91.0", + "kube-derive 0.91.0", + "kube-runtime", ] [[package]] @@ -1958,10 +2073,10 @@ dependencies = [ "either", "futures", "http 0.2.12", - "http-body", - "hyper", + "http-body 0.4.6", + "hyper 0.14.30", "hyper-openssl", - "hyper-timeout", + "hyper-timeout 0.4.1", "jsonpath_lib", "k8s-openapi 0.17.0", "kube-core 0.80.0", @@ -1982,28 +2097,29 @@ dependencies = [ [[package]] name = "kube-client" -version = "0.87.2" +version = "0.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "033450dfa0762130565890dadf2f8835faedf749376ca13345bcd8ecd6b5f29f" +checksum = "47164ad6c47398ee4bdf90509c7b44026229721cb1377eb4623a1ec2a00a85e9" dependencies = [ - "base64 0.21.7", + "base64 0.22.1", "bytes", 
"chrono", "either", "futures", "home", - "http 0.2.12", - "http-body", - "hyper", + "http 1.1.0", + "http-body 1.0.1", + "http-body-util", + "hyper 1.4.1", "hyper-rustls", - "hyper-timeout", + "hyper-timeout 0.5.1", + "hyper-util", "jsonpath-rust", - "k8s-openapi 0.20.0", - "kube-core 0.87.2", + "k8s-openapi 0.22.0", + "kube-core 0.91.0", "pem 3.0.4", - "pin-project", - "rustls", - "rustls-pemfile", + "rustls 0.23.14", + "rustls-pemfile 2.2.0", "secrecy", "serde", "serde_json", @@ -2012,7 +2128,7 @@ dependencies = [ "tokio", "tokio-util", "tower", - "tower-http 0.4.4", + "tower-http 0.5.2", "tracing", ] @@ -2035,16 +2151,15 @@ dependencies = [ [[package]] name = "kube-core" -version = "0.87.2" +version = "0.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae" +checksum = "2797d3044a238825432129cd9537e12c2a6dacbbb5352381af5ea55e1505ed4f" dependencies = [ "chrono", "form_urlencoded", - "http 0.2.12", + "http 1.1.0", "json-patch", - "k8s-openapi 0.20.0", - "once_cell", + "k8s-openapi 0.22.0", "schemars", "serde", "serde_json", @@ -2066,9 +2181,9 @@ dependencies = [ [[package]] name = "kube-derive" -version = "0.87.2" +version = "0.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e98dd5e5767c7b894c1f0e41fd628b145f808e981feb8b08ed66455d47f1a4" +checksum = "fcf837edaa0c478f85e9a3cddb17fa80d58a57c1afa722b3a9e55753ea162f41" dependencies = [ "darling 0.20.10", "proc-macro2", @@ -2079,19 +2194,21 @@ dependencies = [ [[package]] name = "kube-runtime" -version = "0.87.2" +version = "0.91.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d8893eb18fbf6bb6c80ef6ee7dd11ec32b1dc3c034c988ac1b3a84d46a230ae" +checksum = "e463e89a1fb222c65a5469b568803153d1bf13d084a8dd42b659e6cca66edc6e" dependencies = [ "ahash", + "async-broadcast", + "async-stream", "async-trait", "backoff", "derivative", "futures", "hashbrown 0.14.5", 
"json-patch", - "k8s-openapi 0.20.0", - "kube-client 0.87.2", + "k8s-openapi 0.22.0", + "kube-client 0.91.0", "parking_lot 0.12.3", "pin-project", "serde", @@ -2374,7 +2491,7 @@ version = "0.13.3" dependencies = [ "akri-discovery-utils", "akri-onvif", - "env_logger", + "env_logger 0.10.2", "log", "tokio", ] @@ -2423,7 +2540,7 @@ version = "0.13.3" dependencies = [ "akri-discovery-utils", "akri-opcua", - "env_logger", + "env_logger 0.10.2", "log", "tokio", ] @@ -2789,6 +2906,29 @@ dependencies = [ "libc", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.6.0", + "hex", + "lazy_static", + "procfs-core", + "rustix", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.6.0", + "hex", +] + [[package]] name = "prometheus" version = "0.12.0" @@ -2801,7 +2941,24 @@ dependencies = [ "libc", "memchr", "parking_lot 0.11.2", - "procfs", + "procfs 0.9.1", + "protobuf", + "thiserror", +] + +[[package]] +name = "prometheus" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "libc", + "memchr", + "parking_lot 0.12.3", + "procfs 0.16.0", "protobuf", "thiserror", ] @@ -2987,8 +3144,8 @@ dependencies = [ "futures-util", "h2", "http 0.2.12", - "http-body", - "hyper", + "http-body 0.4.6", + "hyper 0.14.30", "ipnet", "js-sys", "log", @@ -3071,18 +3228,34 @@ checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", - "rustls-webpki", + "rustls-webpki 0.101.7", "sct", ] +[[package]] +name = "rustls" +version = 
"0.23.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki 0.102.8", + "subtle", + "zeroize", +] + [[package]] name = "rustls-native-certs" -version = "0.6.3" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" dependencies = [ "openssl-probe", - "rustls-pemfile", + "rustls-pemfile 2.2.0", + "rustls-pki-types", "schannel", "security-framework", ] @@ -3096,6 +3269,21 @@ dependencies = [ "base64 0.21.7", ] +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e696e35370c65c9c541198af4543ccd580cf17fc25d8e05c5a242b202488c55" + [[package]] name = "rustls-webpki" version = "0.101.7" @@ -3106,6 +3294,17 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustls-webpki" +version = "0.102.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.17" @@ -3630,7 +3829,18 @@ version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls", + "rustls 0.21.12", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +dependencies = [ + "rustls 0.23.14", + "rustls-pki-types", "tokio", ] @@ -3685,16 +3895,16 @@ dependencies = [ "bytes", "h2", "http 0.2.12", - "http-body", - "hyper", - "hyper-timeout", + "http-body 0.4.6", + "hyper 0.14.30", + "hyper-timeout 0.4.1", "percent-encoding 2.3.1", "pin-project", "prost", - "rustls", - "rustls-pemfile", + "rustls 0.21.12", + "rustls-pemfile 1.0.4", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.1", "tokio-stream", "tower", "tower-layer", @@ -3747,7 +3957,7 @@ dependencies = [ "futures-core", "futures-util", "http 0.2.12", - "http-body", + "http-body 0.4.6", "http-range-header", "pin-project-lite", "tower-layer", @@ -3757,18 +3967,16 @@ dependencies = [ [[package]] name = "tower-http" -version = "0.4.4" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" +checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ "base64 0.21.7", "bitflags 2.6.0", "bytes", - "futures-core", - "futures-util", - "http 0.2.12", - "http-body", - "http-range-header", + "http 1.1.0", + "http-body 1.0.1", + "http-body-util", "mime", "pin-project-lite", "tower-layer", @@ -3879,7 +4087,7 @@ version = "0.13.3" dependencies = [ "akri-discovery-utils", "akri-udev", - "env_logger", + "env_logger 0.10.2", "log", "tokio", ] @@ -3889,10 +4097,10 @@ name = "udev-video-broker" version = "0.13.3" dependencies = [ "akri-shared", - "env_logger", + "env_logger 0.10.2", "lazy_static", "log", - "prometheus", + "prometheus 0.12.0", "prost", "regex", "rscam", @@ -4039,7 +4247,7 @@ dependencies = [ "futures-util", "headers", "http 0.2.12", - "hyper", + "hyper 0.14.30", "log", "mime", "mime_guess", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 90d22ad28..6ea6c1e51 100644 --- a/agent/Cargo.toml 
+++ b/agent/Cargo.toml @@ -25,9 +25,9 @@ env_logger = "0.10.0" futures = { version = "0.3.1", package = "futures" } hyper = "0.14.2" itertools = "0.12.0" -k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] } -kube = { version = "0.87.1", features = ["derive"] } -kube-runtime = { version = "0.87.1", features = ["unstable-runtime-reconcile-on"] } +k8s-openapi = { version = "0.22.0", default-features = false, features = ["schemars", "v1_25"] } +kube = { version = "0.91.0", features = [ "derive", "runtime"] } +kube-runtime = { version = "0.91.0", features = ["unstable-runtime-reconcile-on"] } lazy_static = "1.4" log = "0.4" mockall_double = "0.3.1" diff --git a/agent/src/discovery_handler_manager/discovery_handler_registry.rs b/agent/src/discovery_handler_manager/discovery_handler_registry.rs index 27c34466e..cc4e9706a 100644 --- a/agent/src/discovery_handler_manager/discovery_handler_registry.rs +++ b/agent/src/discovery_handler_manager/discovery_handler_registry.rs @@ -34,7 +34,7 @@ use futures::future::try_join_all; use futures::FutureExt; use itertools::Itertools; use kube::core::ObjectMeta; -use kube_runtime::reflector::ObjectRef; +use kube::runtime::reflector::ObjectRef; use tokio::select; use tokio::sync::mpsc; use tokio::sync::watch; diff --git a/agent/src/discovery_handler_manager/mod.rs b/agent/src/discovery_handler_manager/mod.rs index 6a4c1434d..e4317774e 100644 --- a/agent/src/discovery_handler_manager/mod.rs +++ b/agent/src/discovery_handler_manager/mod.rs @@ -9,7 +9,7 @@ use std::{collections::HashMap, sync::Arc}; use akri_shared::{akri::configuration::Configuration, k8s::api::IntoApi}; use k8s_openapi::api::core::v1::{ConfigMap, Secret}; -use kube_runtime::reflector::ObjectRef; +use kube::runtime::reflector::ObjectRef; use thiserror::Error; use tokio::sync::{mpsc, watch}; diff --git a/agent/src/plugin_manager/v1.rs b/agent/src/plugin_manager/v1.rs index 7f3183f3f..f016fb20b 100644 --- 
a/agent/src/plugin_manager/v1.rs +++ b/agent/src/plugin_manager/v1.rs @@ -87,8 +87,8 @@ pub struct NumaNode { /// Generated client implementations. pub mod pod_resources_lister_client { #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::http::Uri; use tonic::codegen::*; + use tonic::codegen::http::Uri; /// PodResourcesLister is a service provided by the kubelet that provides information about the /// node resources consumed by pods and containers on the node #[derive(Debug, Clone)] @@ -134,8 +134,9 @@ pub mod pod_resources_lister_client { >::ResponseBody, >, >, - >>::Error: - Into + Send + Sync, + , + >>::Error: Into + Send + Sync, { PodResourcesListerClient::new(InterceptedService::new(inner, interceptor)) } @@ -173,16 +174,23 @@ pub mod pod_resources_lister_client { pub async fn list( &mut self, request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static("/v1.PodResourcesLister/List"); + let path = http::uri::PathAndQuery::from_static( + "/v1.PodResourcesLister/List", + ); let mut req = request.into_request(); req.extensions_mut() .insert(GrpcMethod::new("v1.PodResourcesLister", "List")); @@ -191,23 +199,28 @@ pub mod pod_resources_lister_client { pub async fn get_allocatable_resources( &mut self, request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; + ) -> 
std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/v1.PodResourcesLister/GetAllocatableResources", ); let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "v1.PodResourcesLister", - "GetAllocatableResources", - )); + req.extensions_mut() + .insert( + GrpcMethod::new("v1.PodResourcesLister", "GetAllocatableResources"), + ); self.inner.unary(req, path, codec).await } } @@ -222,11 +235,17 @@ pub mod pod_resources_lister_server { async fn list( &self, request: tonic::Request, - ) -> std::result::Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; async fn get_allocatable_resources( &self, request: tonic::Request, - ) -> std::result::Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; } /// PodResourcesLister is a service provided by the kubelet that provides information about the /// node resources consumed by pods and containers on the node @@ -253,7 +272,10 @@ pub mod pod_resources_lister_server { max_encoding_message_size: None, } } - pub fn with_interceptor(inner: T, interceptor: F) -> InterceptedService + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService where F: tonic::service::Interceptor, { @@ -309,11 +331,15 @@ pub mod pod_resources_lister_server { "/v1.PodResourcesLister/List" => { #[allow(non_camel_case_types)] struct ListSvc(pub Arc); - impl - tonic::server::UnaryService for ListSvc - { + impl< + T: PodResourcesLister, + > tonic::server::UnaryService + for ListSvc { type Response = super::ListPodResourcesResponse; - type Future = BoxFuture, tonic::Status>; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; fn call( &mut self, 
request: tonic::Request, @@ -351,12 +377,15 @@ pub mod pod_resources_lister_server { "/v1.PodResourcesLister/GetAllocatableResources" => { #[allow(non_camel_case_types)] struct GetAllocatableResourcesSvc(pub Arc); - impl - tonic::server::UnaryService - for GetAllocatableResourcesSvc - { + impl< + T: PodResourcesLister, + > tonic::server::UnaryService + for GetAllocatableResourcesSvc { type Response = super::AllocatableResourcesResponse; - type Future = BoxFuture, tonic::Status>; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; fn call( &mut self, request: tonic::Request, @@ -364,9 +393,10 @@ pub mod pod_resources_lister_server { let inner = Arc::clone(&self.0); let fut = async move { ::get_allocatable_resources( - &inner, request, - ) - .await + &inner, + request, + ) + .await }; Box::pin(fut) } @@ -394,14 +424,18 @@ pub mod pod_resources_lister_server { }; Box::pin(fut) } - _ => Box::pin(async move { - Ok(http::Response::builder() - .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") - .body(empty_body()) - .unwrap()) - }), + _ => { + Box::pin(async move { + Ok( + http::Response::builder() + .status(200) + .header("grpc-status", "12") + .header("content-type", "application/grpc") + .body(empty_body()) + .unwrap(), + ) + }) + } } } } @@ -427,7 +461,8 @@ pub mod pod_resources_lister_server { write!(f, "{:?}", self.0) } } - impl tonic::server::NamedService for PodResourcesListerServer { + impl tonic::server::NamedService + for PodResourcesListerServer { const NAME: &'static str = "v1.PodResourcesLister"; } } diff --git a/agent/src/plugin_manager/v1beta1.rs b/agent/src/plugin_manager/v1beta1.rs index a88570f4d..7b068ecab 100644 --- a/agent/src/plugin_manager/v1beta1.rs +++ b/agent/src/plugin_manager/v1beta1.rs @@ -96,7 +96,9 @@ pub struct PreStartContainerResponse {} #[derive(Clone, PartialEq, ::prost::Message)] pub struct PreferredAllocationRequest { #[prost(message, repeated, tag = "1")] - pub 
container_requests: ::prost::alloc::vec::Vec, + pub container_requests: ::prost::alloc::vec::Vec< + ContainerPreferredAllocationRequest, + >, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -106,7 +108,9 @@ pub struct ContainerPreferredAllocationRequest { pub available_device_i_ds: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, /// List of deviceIDs that must be included in the preferred allocation #[prost(string, repeated, tag = "2")] - pub must_include_device_i_ds: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + pub must_include_device_i_ds: ::prost::alloc::vec::Vec< + ::prost::alloc::string::String, + >, /// Number of devices to include in the preferred allocation #[prost(int32, tag = "3")] pub allocation_size: i32, @@ -117,7 +121,9 @@ pub struct ContainerPreferredAllocationRequest { #[derive(Clone, PartialEq, ::prost::Message)] pub struct PreferredAllocationResponse { #[prost(message, repeated, tag = "1")] - pub container_responses: ::prost::alloc::vec::Vec, + pub container_responses: ::prost::alloc::vec::Vec< + ContainerPreferredAllocationResponse, + >, } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -162,8 +168,10 @@ pub struct AllocateResponse { pub struct ContainerAllocateResponse { /// List of environment variable to be set in the container to access one of more devices. #[prost(map = "string, string", tag = "1")] - pub envs: - ::std::collections::HashMap<::prost::alloc::string::String, ::prost::alloc::string::String>, + pub envs: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, /// Mounts for the container. 
#[prost(message, repeated, tag = "2")] pub mounts: ::prost::alloc::vec::Vec, @@ -172,8 +180,10 @@ pub struct ContainerAllocateResponse { pub devices: ::prost::alloc::vec::Vec, /// Container annotations to pass to the container runtime #[prost(map = "string, string", tag = "4")] - pub annotations: - ::std::collections::HashMap<::prost::alloc::string::String, ::prost::alloc::string::String>, + pub annotations: ::std::collections::HashMap< + ::prost::alloc::string::String, + ::prost::alloc::string::String, + >, } /// Mount specifies a host volume to mount into a container. /// where device library or tools are installed on host and container @@ -210,8 +220,8 @@ pub struct DeviceSpec { /// Generated client implementations. pub mod registration_client { #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::http::Uri; use tonic::codegen::*; + use tonic::codegen::http::Uri; /// Registration is the service advertised by the Kubelet /// Only when Kubelet answers with a success code to a Register Request /// may Device Plugins start their service @@ -261,8 +271,9 @@ pub mod registration_client { >::ResponseBody, >, >, - >>::Error: - Into + Send + Sync, + , + >>::Error: Into + Send + Sync, { RegistrationClient::new(InterceptedService::new(inner, interceptor)) } @@ -301,14 +312,19 @@ pub mod registration_client { &mut self, request: impl tonic::IntoRequest, ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static("/v1beta1.Registration/Register"); + let path = http::uri::PathAndQuery::from_static( + "/v1beta1.Registration/Register", + ); let mut req 
= request.into_request(); req.extensions_mut() .insert(GrpcMethod::new("v1beta1.Registration", "Register")); @@ -319,8 +335,8 @@ pub mod registration_client { /// Generated client implementations. pub mod device_plugin_client { #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] - use tonic::codegen::http::Uri; use tonic::codegen::*; + use tonic::codegen::http::Uri; /// DevicePlugin is the service advertised by Device Plugins #[derive(Debug, Clone)] pub struct DevicePluginClient { @@ -365,8 +381,9 @@ pub mod device_plugin_client { >::ResponseBody, >, >, - >>::Error: - Into + Send + Sync, + , + >>::Error: Into + Send + Sync, { DevicePluginClient::new(InterceptedService::new(inner, interceptor)) } @@ -406,23 +423,28 @@ pub mod device_plugin_client { pub async fn get_device_plugin_options( &mut self, request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/v1beta1.DevicePlugin/GetDevicePluginOptions", ); let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "v1beta1.DevicePlugin", - "GetDevicePluginOptions", - )); + req.extensions_mut() + .insert( + GrpcMethod::new("v1beta1.DevicePlugin", "GetDevicePluginOptions"), + ); self.inner.unary(req, path, codec).await } /// ListAndWatch returns a stream of List of Devices @@ -435,14 +457,19 @@ pub mod device_plugin_client { tonic::Response>, tonic::Status, > { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: 
{}", e.into()), - ) - })?; + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static("/v1beta1.DevicePlugin/ListAndWatch"); + let path = http::uri::PathAndQuery::from_static( + "/v1beta1.DevicePlugin/ListAndWatch", + ); let mut req = request.into_request(); req.extensions_mut() .insert(GrpcMethod::new("v1beta1.DevicePlugin", "ListAndWatch")); @@ -456,23 +483,28 @@ pub mod device_plugin_client { pub async fn get_preferred_allocation( &mut self, request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); let path = http::uri::PathAndQuery::from_static( "/v1beta1.DevicePlugin/GetPreferredAllocation", ); let mut req = request.into_request(); - req.extensions_mut().insert(GrpcMethod::new( - "v1beta1.DevicePlugin", - "GetPreferredAllocation", - )); + req.extensions_mut() + .insert( + GrpcMethod::new("v1beta1.DevicePlugin", "GetPreferredAllocation"), + ); self.inner.unary(req, path, codec).await } /// Allocate is called during container creation so that the Device @@ -481,15 +513,23 @@ pub mod device_plugin_client { pub async fn allocate( &mut self, request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { 
+ self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); - let path = http::uri::PathAndQuery::from_static("/v1beta1.DevicePlugin/Allocate"); + let path = http::uri::PathAndQuery::from_static( + "/v1beta1.DevicePlugin/Allocate", + ); let mut req = request.into_request(); req.extensions_mut() .insert(GrpcMethod::new("v1beta1.DevicePlugin", "Allocate")); @@ -501,17 +541,23 @@ pub mod device_plugin_client { pub async fn pre_start_container( &mut self, request: impl tonic::IntoRequest, - ) -> std::result::Result, tonic::Status> - { - self.inner.ready().await.map_err(|e| { - tonic::Status::new( - tonic::Code::Unknown, - format!("Service was not ready: {}", e.into()), - ) - })?; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::new( + tonic::Code::Unknown, + format!("Service was not ready: {}", e.into()), + ) + })?; let codec = tonic::codec::ProstCodec::default(); - let path = - http::uri::PathAndQuery::from_static("/v1beta1.DevicePlugin/PreStartContainer"); + let path = http::uri::PathAndQuery::from_static( + "/v1beta1.DevicePlugin/PreStartContainer", + ); let mut req = request.into_request(); req.extensions_mut() .insert(GrpcMethod::new("v1beta1.DevicePlugin", "PreStartContainer")); @@ -560,7 +606,10 @@ pub mod registration_server { max_encoding_message_size: None, } } - pub fn with_interceptor(inner: T, interceptor: F) -> InterceptedService + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService where F: tonic::service::Interceptor, { @@ -616,16 +665,23 @@ pub mod registration_server { "/v1beta1.Registration/Register" => { #[allow(non_camel_case_types)] struct RegisterSvc(pub Arc); - impl tonic::server::UnaryService for RegisterSvc { + impl< + T: Registration, + > tonic::server::UnaryService + for RegisterSvc 
{ type Response = super::Empty; - type Future = BoxFuture, tonic::Status>; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; fn call( &mut self, request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = - async move { ::register(&inner, request).await }; + let fut = async move { + ::register(&inner, request).await + }; Box::pin(fut) } } @@ -652,14 +708,18 @@ pub mod registration_server { }; Box::pin(fut) } - _ => Box::pin(async move { - Ok(http::Response::builder() - .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") - .body(empty_body()) - .unwrap()) - }), + _ => { + Box::pin(async move { + Ok( + http::Response::builder() + .status(200) + .header("grpc-status", "12") + .header("content-type", "application/grpc") + .body(empty_body()) + .unwrap(), + ) + }) + } } } } @@ -701,11 +761,15 @@ pub mod device_plugin_server { async fn get_device_plugin_options( &self, request: tonic::Request, - ) -> std::result::Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// Server streaming response type for the ListAndWatch method. type ListAndWatchStream: tonic::codegen::tokio_stream::Stream< Item = std::result::Result, - > + Send + > + + Send + 'static; /// ListAndWatch returns a stream of List of Devices /// Whenever a Device state change or a Device disappears, ListAndWatch @@ -713,7 +777,10 @@ pub mod device_plugin_server { async fn list_and_watch( &self, request: tonic::Request, - ) -> std::result::Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// GetPreferredAllocation returns a preferred set of devices to allocate /// from a list of available ones. 
The resulting preferred allocation is not /// guaranteed to be the allocation ultimately performed by the @@ -722,21 +789,30 @@ pub mod device_plugin_server { async fn get_preferred_allocation( &self, request: tonic::Request, - ) -> std::result::Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// Allocate is called during container creation so that the Device /// Plugin can run device specific operations and instruct Kubelet /// of the steps to make the Device available in the container async fn allocate( &self, request: tonic::Request, - ) -> std::result::Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// PreStartContainer is called, if indicated by Device Plugin during registeration phase, /// before each container start. Device plugin can run device specific operations /// such as resetting the device before making devices available to the container async fn pre_start_container( &self, request: tonic::Request, - ) -> std::result::Result, tonic::Status>; + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; } /// DevicePlugin is the service advertised by Device Plugins #[derive(Debug)] @@ -762,7 +838,10 @@ pub mod device_plugin_server { max_encoding_message_size: None, } } - pub fn with_interceptor(inner: T, interceptor: F) -> InterceptedService + pub fn with_interceptor( + inner: T, + interceptor: F, + ) -> InterceptedService where F: tonic::service::Interceptor, { @@ -818,13 +897,23 @@ pub mod device_plugin_server { "/v1beta1.DevicePlugin/GetDevicePluginOptions" => { #[allow(non_camel_case_types)] struct GetDevicePluginOptionsSvc(pub Arc); - impl tonic::server::UnaryService for GetDevicePluginOptionsSvc { + impl tonic::server::UnaryService + for GetDevicePluginOptionsSvc { type Response = super::DevicePluginOptions; - type Future = BoxFuture, tonic::Status>; - fn call(&mut self, request: tonic::Request) -> Self::Future { + type Future = BoxFuture< + 
tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - ::get_device_plugin_options(&inner, request) + ::get_device_plugin_options( + &inner, + request, + ) .await }; Box::pin(fut) @@ -856,12 +945,20 @@ pub mod device_plugin_server { "/v1beta1.DevicePlugin/ListAndWatch" => { #[allow(non_camel_case_types)] struct ListAndWatchSvc(pub Arc); - impl tonic::server::ServerStreamingService for ListAndWatchSvc { + impl< + T: DevicePlugin, + > tonic::server::ServerStreamingService + for ListAndWatchSvc { type Response = super::ListAndWatchResponse; type ResponseStream = T::ListAndWatchStream; - type Future = - BoxFuture, tonic::Status>; - fn call(&mut self, request: tonic::Request) -> Self::Future { + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { ::list_and_watch(&inner, request).await @@ -895,19 +992,26 @@ pub mod device_plugin_server { "/v1beta1.DevicePlugin/GetPreferredAllocation" => { #[allow(non_camel_case_types)] struct GetPreferredAllocationSvc(pub Arc); - impl - tonic::server::UnaryService - for GetPreferredAllocationSvc - { + impl< + T: DevicePlugin, + > tonic::server::UnaryService + for GetPreferredAllocationSvc { type Response = super::PreferredAllocationResponse; - type Future = BoxFuture, tonic::Status>; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; fn call( &mut self, request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - ::get_preferred_allocation(&inner, request).await + ::get_preferred_allocation( + &inner, + request, + ) + .await }; Box::pin(fut) } @@ -938,16 +1042,23 @@ pub mod device_plugin_server { "/v1beta1.DevicePlugin/Allocate" => { #[allow(non_camel_case_types)] struct AllocateSvc(pub Arc); - impl 
tonic::server::UnaryService for AllocateSvc { + impl< + T: DevicePlugin, + > tonic::server::UnaryService + for AllocateSvc { type Response = super::AllocateResponse; - type Future = BoxFuture, tonic::Status>; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; fn call( &mut self, request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); - let fut = - async move { ::allocate(&inner, request).await }; + let fut = async move { + ::allocate(&inner, request).await + }; Box::pin(fut) } } @@ -977,19 +1088,23 @@ pub mod device_plugin_server { "/v1beta1.DevicePlugin/PreStartContainer" => { #[allow(non_camel_case_types)] struct PreStartContainerSvc(pub Arc); - impl - tonic::server::UnaryService - for PreStartContainerSvc - { + impl< + T: DevicePlugin, + > tonic::server::UnaryService + for PreStartContainerSvc { type Response = super::PreStartContainerResponse; - type Future = BoxFuture, tonic::Status>; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; fn call( &mut self, request: tonic::Request, ) -> Self::Future { let inner = Arc::clone(&self.0); let fut = async move { - ::pre_start_container(&inner, request).await + ::pre_start_container(&inner, request) + .await }; Box::pin(fut) } @@ -1017,14 +1132,18 @@ pub mod device_plugin_server { }; Box::pin(fut) } - _ => Box::pin(async move { - Ok(http::Response::builder() - .status(200) - .header("grpc-status", "12") - .header("content-type", "application/grpc") - .body(empty_body()) - .unwrap()) - }), + _ => { + Box::pin(async move { + Ok( + http::Response::builder() + .status(200) + .header("grpc-status", "12") + .header("content-type", "application/grpc") + .body(empty_body()) + .unwrap(), + ) + }) + } } } } diff --git a/agent/src/util/discovery_configuration_controller.rs b/agent/src/util/discovery_configuration_controller.rs index 496a783fa..29364a691 100644 --- a/agent/src/util/discovery_configuration_controller.rs +++ b/agent/src/util/discovery_configuration_controller.rs 
@@ -19,7 +19,7 @@ use crate::discovery_handler_manager::{ }; use kube::{Resource, ResourceExt}; -use kube_runtime::{ +use kube::runtime::{ controller::Action, reflector::{ObjectRef, Store}, Controller, diff --git a/controller/Cargo.toml b/controller/Cargo.toml index 1bee29183..fc063ac76 100644 --- a/controller/Cargo.toml +++ b/controller/Cargo.toml @@ -13,14 +13,17 @@ akri-shared = { path = "../shared" } anyhow = "1.0.38" async-std = "1.5.0" chrono = "0.4.10" -env_logger = "0.10.0" +either = "1.13" +env_logger = "0.11.5" futures = "0.3.1" -k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] } -kube = { version = "0.87.1", features = ["derive"] } -kube-runtime = "0.87.1" +k8s-openapi = { version = "0.22.0", default-features = false, features = ["schemars", "v1_25"] } +kube = { version = "0.91.0", features = ["runtime", "client", "derive" ] } lazy_static = "1.4" log = "0.4" -prometheus = { version = "0.12.0", features = ["process"] } +prometheus = { version = "0.13.4", features = ["process"] } +# Used for patch API +serde_json = "1.0.45" +thiserror = "1" tokio = { version = "1.0.2", features = ["full"] } [dev-dependencies] diff --git a/controller/src/main.rs b/controller/src/main.rs index 82d6d0c35..4886ce07b 100644 --- a/controller/src/main.rs +++ b/controller/src/main.rs @@ -3,10 +3,12 @@ extern crate lazy_static; mod util; use akri_shared::akri::{metrics::run_metrics_server, API_NAMESPACE}; -use async_std::sync::Mutex; use prometheus::IntGaugeVec; use std::sync::Arc; -use util::{instance_action, node_watcher, pod_watcher}; +use util::{ + controller_ctx::{ControllerContext, CONTROLLER_FIELD_MANAGER_ID}, + instance_action, node_watcher, pod_watcher, +}; /// Length of time to sleep between controller system validation checks pub const SYSTEM_CHECK_DELAY_SECS: u64 = 30; @@ -32,43 +34,30 @@ async fn main() -> Result<(), Box ); log::info!("{} Controller logging started", API_NAMESPACE); - - let synchronization = 
Arc::new(Mutex::new(())); - let instance_watch_synchronization = synchronization.clone(); let mut tasks = Vec::new(); // Start server for prometheus metrics - tasks.push(tokio::spawn(async move { - run_metrics_server().await.unwrap(); - })); + tokio::spawn(run_metrics_server()); + + let controller_ctx = Arc::new(ControllerContext::new( + Arc::new(kube::Client::try_default().await?), + CONTROLLER_FIELD_MANAGER_ID, + )); + let instance_water_ctx = controller_ctx.clone(); + let node_watcher_ctx = controller_ctx.clone(); + let pod_watcher_ctx = controller_ctx.clone(); - // Handle existing instances - tasks.push(tokio::spawn({ - async move { - instance_action::handle_existing_instances().await.unwrap(); - } - })); // Handle instance changes - tasks.push(tokio::spawn({ - async move { - instance_action::do_instance_watch(instance_watch_synchronization) - .await - .unwrap(); - } + tasks.push(tokio::spawn(async { + instance_action::run(instance_water_ctx).await; })); // Watch for node disappearance - tasks.push(tokio::spawn({ - async move { - let mut node_watcher = node_watcher::NodeWatcher::new(); - node_watcher.watch().await.unwrap(); - } + tasks.push(tokio::spawn(async { + node_watcher::run(node_watcher_ctx).await; })); // Watch for broker Pod state changes - tasks.push(tokio::spawn({ - async move { - let mut broker_pod_watcher = pod_watcher::BrokerPodWatcher::new(); - broker_pod_watcher.watch().await.unwrap(); - } + tasks.push(tokio::spawn(async { + pod_watcher::run(pod_watcher_ctx).await; })); futures::future::try_join_all(tasks).await?; diff --git a/controller/src/util/controller_ctx.rs b/controller/src/util/controller_ctx.rs new file mode 100644 index 000000000..717855c5e --- /dev/null +++ b/controller/src/util/controller_ctx.rs @@ -0,0 +1,106 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use akri_shared::akri::configuration::Configuration; +use akri_shared::akri::instance::Instance; +use akri_shared::k8s::api::IntoApi; + +use 
k8s_openapi::api::batch::v1::Job; +use k8s_openapi::api::core::v1::{Node, Pod, Service}; + +use tokio::sync::RwLock; + +// Identifier for the controller to be set as the field manager for server-side apply +pub const CONTROLLER_FIELD_MANAGER_ID: &str = "akri.sh/controller"; + +/// Pod states that BrokerPodWatcher is interested in +/// +/// PodState describes the various states that the controller can +/// react to for Pods. +#[derive(Clone, Debug, PartialEq)] +pub enum PodState { + /// Pod is in Pending state and no action is needed. + Pending, + /// Pod is in Running state and needs to ensure that + /// instance and configuration services are running + Running, + /// Pod is in Failed/Completed/Succeeded state and + /// needs to remove any instance and configuration + /// services that are not supported by other Running + /// Pods. Also, at this point, if an Instance still + /// exists, instance_action::handle_instance_change + /// needs to be called to ensure that Pods are + /// restarted + Ended, + /// Pod is in Deleted state and needs to remove any + /// instance and configuration services that are not + /// supported by other Running Pods. Also, at this + /// point, if an Instance still exists, and the Pod is + /// owned by the Instance, + /// instance_action::handle_instance_change needs to be + /// called to ensure that Pods are restarted. Akri + /// places an Instance OwnerReference on all the Pods it + /// deploys. This declares that the Instance owns that + /// Pod and Akri's Controller explicitly manages its + /// deployment. However, if the Pod is not owned by the + /// Instance, Akri should not assume retry logic and + /// should cease action. The owning object (ie Job) will + /// handle retries as necessary. + Deleted, +} + +/// Node states that NodeWatcher is interested in +/// +/// NodeState describes the various states that the controller can +/// react to for Nodes. 
+#[derive(Clone, Debug, PartialEq)] +pub enum NodeState { + /// Node has been seen, but not Running yet + Known, + /// Node has been seen Running + Running, + /// A previously Running Node has been seen as not Running + /// and the Instances have been cleaned of references to that + /// vanished Node + InstancesCleaned, +} + +pub trait ControllerKubeClient: + IntoApi + + IntoApi + + IntoApi + + IntoApi + + IntoApi + + IntoApi +{ +} + +impl< + T: IntoApi + + IntoApi + + IntoApi + + IntoApi + + IntoApi + + IntoApi, + > ControllerKubeClient for T +{ +} + +pub struct ControllerContext { + /// Kubernetes client + pub client: Arc, + pub known_pods: Arc>>, + pub known_nodes: Arc>>, + pub identifier: String, +} + +impl ControllerContext { + pub fn new(client: Arc, identifier: &str) -> Self { + ControllerContext { + client, + known_pods: Arc::new(RwLock::new(HashMap::new())), + known_nodes: Arc::new(RwLock::new(HashMap::new())), + identifier: identifier.to_string(), + } + } +} diff --git a/controller/src/util/instance_action.rs b/controller/src/util/instance_action.rs index 412769c9a..048dc2afb 100644 --- a/controller/src/util/instance_action.rs +++ b/controller/src/util/instance_action.rs @@ -1,28 +1,66 @@ use super::super::BROKER_POD_COUNT_METRIC; use super::{pod_action::PodAction, pod_action::PodActionInfo}; +use crate::util::controller_ctx::{ControllerContext, ControllerKubeClient}; +use crate::util::{ControllerError, Result}; +use akri_shared::akri::configuration::Configuration; +use akri_shared::k8s::api::Api; use akri_shared::{ akri::{configuration::BrokerSpec, instance::Instance, AKRI_PREFIX}, k8s::{ - self, job, pod, - pod::{AKRI_INSTANCE_LABEL_NAME, AKRI_TARGET_NODE_LABEL_NAME}, - KubeInterface, OwnershipInfo, OwnershipType, + job, pod, OwnershipInfo, OwnershipType, AKRI_INSTANCE_LABEL_NAME, + AKRI_TARGET_NODE_LABEL_NAME, }, }; -use async_std::sync::Mutex; -use futures::{StreamExt, TryStreamExt}; -use k8s_openapi::api::batch::v1::JobSpec; +use anyhow::Context; 
+use futures::StreamExt; +use k8s_openapi::api::batch::v1::{Job, JobSpec}; use k8s_openapi::api::core::v1::{Pod, PodSpec}; -use kube::api::Api; -use kube_runtime::watcher::{watcher, Config, Event}; -use kube_runtime::WatchStreamExt; -use log::{error, info, trace}; + +use kube::{ + api::{ListParams, ResourceExt}, + runtime::{ + controller::{Action, Controller}, + finalizer::{finalizer, Event}, + watcher::Config, + }, +}; +use log::{error, trace}; use std::collections::HashMap; use std::sync::Arc; + /// Length of time a Pod can be pending before we give up and retry pub const PENDING_POD_GRACE_PERIOD_MINUTES: i64 = 5; /// Length of time a Pod can be in an error state before we retry pub const FAILED_POD_GRACE_PERIOD_MINUTES: i64 = 0; +pub static INSTANCE_FINALIZER: &str = "instances.kube.rs"; + +/// Initialize the instance controller +/// TODO: consider passing state that is shared among controllers such as a metrics exporter +pub async fn run(ctx: Arc) { + let api = ctx.client.all().as_inner(); + if let Err(e) = api.list(&ListParams::default().limit(1)).await { + error!("Instance CRD is not queryable; {e:?}. Is the CRD installed?"); + std::process::exit(1); + } + Controller::new(api, Config::default().any_semantic()) + .shutdown_on_signal() + .run(reconcile, error_policy, ctx) + // TODO: needs update for tokio? 
+ .filter_map(|x| async move { std::result::Result::ok(x) }) + .for_each(|_| futures::future::ready(())) + .await; +} + +fn error_policy( + _instance: Arc, + error: &ControllerError, + _ctx: Arc, +) -> Action { + log::warn!("reconcile failed: {:?}", error); + Action::requeue(std::time::Duration::from_secs(5 * 60)) +} + /// Instance action types /// /// Instance actions describe the types of actions the Controller can @@ -53,107 +91,30 @@ pub enum InstanceAction { Update, } -/// This invokes an internal method that watches for Instance events -pub async fn handle_existing_instances( -) -> Result<(), Box> { - internal_handle_existing_instances(&k8s::KubeImpl::new().await?).await +/// This function is the main Reconcile function for Instance resources +/// This will get called every time an Instance gets added or is changed, it will also be called for every existing instance on startup. +pub async fn reconcile(instance: Arc, ctx: Arc) -> Result { + let ns = instance.namespace().unwrap(); // instance has namespace scope + trace!("Reconciling {} in {}", instance.name_any(), ns); + finalizer( + &ctx.client.clone().all().as_inner(), + INSTANCE_FINALIZER, + instance, + |event| reconcile_inner(event, ctx), + ) + .await + .map_err(|e| ControllerError::FinalizerError(Box::new(e))) } -/// This invokes an internal method that watches for Instance events -pub async fn do_instance_watch( - synchronization: Arc>, -) -> Result<(), Box> { - // Watch for instance changes - internal_do_instance_watch(&synchronization, &k8s::KubeImpl::new().await?).await -} - -/// This invokes an internal method that watches for Instance events -async fn internal_handle_existing_instances( - kube_interface: &impl KubeInterface, -) -> Result<(), Box> { - let mut tasks = Vec::new(); - - // Handle existing instances - let pre_existing_instances = kube_interface.get_instances().await?; - for instance in pre_existing_instances { - tasks.push(tokio::spawn(async move { - let inner_kube_interface = 
k8s::KubeImpl::new().await.unwrap(); - handle_instance_change(&instance, &InstanceAction::Update, &inner_kube_interface) - .await - .unwrap(); - })); - } - futures::future::try_join_all(tasks).await?; - Ok(()) -} - -async fn internal_do_instance_watch( - synchronization: &Arc>, - kube_interface: &impl KubeInterface, -) -> Result<(), Box> { - trace!("internal_do_instance_watch - enter"); - let resource = Api::::all(kube_interface.get_kube_client()); - let watcher = watcher(resource, Config::default()).default_backoff(); - let mut informer = watcher.boxed(); - let mut first_event = true; - // Currently, this does not handle None except to break the loop. - loop { - let event = match informer.try_next().await { - Err(e) => { - error!("Error during watch: {}", e); - continue; - } - Ok(None) => break, - Ok(Some(event)) => event, - }; - // Aquire lock to ensure cleanup_instance_and_configuration_svcs and the - // inner loop handle_instance call in internal_do_instance_watch - // cannot execute at the same time. - let _lock = synchronization.lock().await; - trace!("internal_do_instance_watch - aquired sync lock"); - handle_instance(event, kube_interface, &mut first_event).await?; - } - Ok(()) -} - -/// This takes an event off the Instance stream and delegates it to the -/// correct function based on the event type. -async fn handle_instance( - event: Event, - kube_interface: &impl KubeInterface, - first_event: &mut bool, -) -> anyhow::Result<()> { - trace!("handle_instance - enter"); +async fn reconcile_inner(event: Event, ctx: Arc) -> Result { match event { - Event::Applied(instance) => { - info!( - "handle_instance - added or modified Akri Instance {:?}: {:?}", - instance.metadata.name, instance.spec - ); - // TODO: consider renaming `InstanceAction::Add` to `InstanceAction::AddOrUpdate` - // to reflect that this could also be an Update event. Or as we do more specific - // inspection in future, delineation may be useful. 
- handle_instance_change(&instance, &InstanceAction::Add, kube_interface).await?; + Event::Apply(instance) => { + handle_instance_change(&instance, &InstanceAction::Add, ctx.clone()).await } - Event::Deleted(instance) => { - info!( - "handle_instance - deleted Akri Instance {:?}: {:?}", - instance.metadata.name, instance.spec - ); - handle_instance_change(&instance, &InstanceAction::Remove, kube_interface).await?; - } - Event::Restarted(_instances) => { - if *first_event { - info!("handle_instance - watcher started"); - } else { - return Err(anyhow::anyhow!( - "Instance watcher restarted - throwing error to restart controller" - )); - } + Event::Cleanup(instance) => { + handle_instance_change(&instance, &InstanceAction::Remove, ctx.clone()).await } } - *first_event = false; - Ok(()) } /// PodContext stores a set of details required to track/create/delete broker @@ -243,7 +204,7 @@ async fn handle_deletion_work( instance_shared: bool, node_to_delete_pod: &str, context: &PodContext, - kube_interface: &impl KubeInterface, + api: &dyn Api, ) -> anyhow::Result<()> { let context_node_name = context.node_name.as_ref().ok_or_else(|| { anyhow::anyhow!( @@ -278,10 +239,8 @@ async fn handle_deletion_work( &pod_app_name, &context_namespace ); - kube_interface - .remove_pod(&pod_app_name, context_namespace) - .await?; - trace!("handle_deletion_work - pod::remove_pod succeeded",); + api.delete(&pod_app_name).await?; + trace!("handle_deletion_work - pod::remove_pod succeeded"); BROKER_POD_COUNT_METRIC .with_label_values(&[configuration_name, context_node_name]) .dec(); @@ -290,9 +249,9 @@ async fn handle_deletion_work( #[cfg(test)] mod handle_deletion_work_tests { - use super::*; - use akri_shared::k8s::MockKubeInterface; + use akri_shared::k8s::api::MockApi; + use super::*; #[tokio::test] async fn test_handle_deletion_work_with_no_node_name() { let _ = env_logger::builder().is_test(true).try_init(); @@ -302,14 +261,14 @@ mod handle_deletion_work_tests { namespace: 
Some("namespace".into()), action: PodAction::NoAction, }; - + let api: Box> = Box::new(MockApi::new()); assert!(handle_deletion_work( "instance_name", "configuration_name", true, "node_to_delete_pod", &context, - &MockKubeInterface::new(), + api.as_ref(), ) .await .is_err()); @@ -324,14 +283,14 @@ mod handle_deletion_work_tests { namespace: None, action: PodAction::NoAction, }; - + let api: Box> = Box::new(MockApi::new()); assert!(handle_deletion_work( "instance_name", "configuration_name", true, "node_to_delete_pod", &context, - &MockKubeInterface::new(), + api.as_ref(), ) .await .is_err()); @@ -339,48 +298,24 @@ mod handle_deletion_work_tests { } /// This handles Instance addition event by creating the -/// broker Pod, the broker Service, and the capability Service. -/// TODO: reduce parameters by passing Instance object instead of -/// individual fields -#[allow(clippy::too_many_arguments)] +/// broker Pod. async fn handle_addition_work( - instance_name: &str, - instance_uid: &str, - instance_namespace: &str, - instance_class_name: &str, - instance_shared: bool, + api: &dyn Api, + pod: Pod, + configuration_name: &str, new_node: &str, - podspec: &PodSpec, - kube_interface: &impl KubeInterface, + field_manager: &str, ) -> anyhow::Result<()> { trace!( "handle_addition_work - Create new Pod for Node={:?}", new_node ); - let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name); - let new_pod = pod::create_new_pod_from_spec( - instance_namespace, - instance_name, - instance_class_name, - OwnershipInfo::new( - OwnershipType::Instance, - instance_name.to_string(), - instance_uid.to_string(), - ), - &capability_id, - new_node, - instance_shared, - podspec, - )?; - - trace!("handle_addition_work - New pod spec={:?}", new_pod); - kube_interface - .create_pod(&new_pod, instance_namespace) - .await?; + trace!("handle_addition_work - New pod spec={:?}", pod); + api.apply(pod, field_manager).await?; trace!("handle_addition_work - pod::create_pod succeeded",); 
BROKER_POD_COUNT_METRIC - .with_label_values(&[instance_class_name, new_node]) + .with_label_values(&[configuration_name, new_node]) .inc(); Ok(()) @@ -392,36 +327,32 @@ async fn handle_addition_work( pub async fn handle_instance_change( instance: &Instance, action: &InstanceAction, - kube_interface: &impl KubeInterface, -) -> anyhow::Result<()> { + ctx: Arc, +) -> Result { trace!("handle_instance_change - enter {:?}", action); - let instance_name = instance.metadata.name.clone().unwrap(); - let instance_namespace = - instance.metadata.namespace.as_ref().ok_or_else(|| { - anyhow::anyhow!("Namespace not found for instance: {}", &instance_name) - })?; - let configuration = match kube_interface - .find_configuration(&instance.spec.configuration_name, instance_namespace) - .await - { - Ok(config) => config, - _ => { - if action != &InstanceAction::Remove { - // In this scenario, a configuration has been deleted without the Akri Agent deleting the associated Instances. - // Furthermore, Akri Agent is still modifying the Instances. This should not happen beacuse Agent - // is designed to shutdown when it's Configuration watcher fails. - error!( + let instance_namespace = instance + .metadata + .namespace + .as_ref() + .context("no namespace")?; + let api: Box> = ctx.client.namespaced(instance_namespace); + let Ok(Some(configuration)) = api.get(&instance.spec.configuration_name).await else { + if action != &InstanceAction::Remove { + // In this scenario, a configuration has been deleted without the Akri Agent deleting the associated Instances. + // Furthermore, Akri Agent is still modifying the Instances. This should not happen beacuse Agent + // is designed to shutdown when it's Configuration watcher fails. 
+ error!( "handle_instance_change - no configuration found for {:?} yet instance {:?} exists - check that device plugin is running properly", &instance.spec.configuration_name, &instance.metadata.name ); - } - return Ok(()); } + + return Ok(default_requeue_action()); }; if let Some(broker_spec) = &configuration.spec.broker_spec { let instance_change_result = match broker_spec { BrokerSpec::BrokerPodSpec(p) => { - handle_instance_change_pod(instance, p, action, kube_interface).await + handle_instance_change_pod(instance, p, action, ctx).await } BrokerSpec::BrokerJobSpec(j) => { handle_instance_change_job( @@ -429,7 +360,7 @@ pub async fn handle_instance_change( *configuration.metadata.generation.as_ref().unwrap(), j, action, - kube_interface, + ctx.client.clone(), ) .await } @@ -438,7 +369,7 @@ pub async fn handle_instance_change( error!("Unable to handle Broker action: {:?}", e); } } - Ok(()) + Ok(default_requeue_action()) } /// Called when an Instance has changed that requires a Job broker. Action determined by InstanceAction. @@ -450,7 +381,7 @@ pub async fn handle_instance_change_job( config_generation: i64, job_spec: &JobSpec, action: &InstanceAction, - kube_interface: &impl KubeInterface, + client: Arc, ) -> anyhow::Result<()> { trace!("handle_instance_change_job - enter {:?}", action); // Create name for Job. 
Includes Configuration generation in the suffix @@ -484,26 +415,27 @@ pub async fn handle_instance_change_job( job_spec, &job_name, )?; - kube_interface - .create_job(&new_job, instance_namespace) - .await?; + let api: Box> = client.namespaced(instance_namespace); + api.create(&new_job).await?; } InstanceAction::Remove => { trace!("handle_instance_change_job - instance removed"); // Find all jobs with the label - let instance_jobs = kube_interface - .find_jobs_with_label(&format!("{}={}", AKRI_INSTANCE_LABEL_NAME, instance_name)) - .await?; - let delete_tasks = instance_jobs.into_iter().map(|j| async move { - kube_interface - .remove_job( - j.metadata.name.as_ref().unwrap(), - j.metadata.namespace.as_ref().unwrap(), - ) - .await - }); - - futures::future::try_join_all(delete_tasks).await?; + let api: Box> = client.namespaced(instance_namespace); + let lp = ListParams::default() + .labels(&format!("{}={}", AKRI_INSTANCE_LABEL_NAME, instance_name)); + match api.delete_collection(&lp).await? 
{ + either::Either::Left(list) => { + let names: Vec<_> = list.iter().map(ResourceExt::name_any).collect(); + trace!("handle_instance_change_job - deleting jobs: {:?}", names); + } + either::Either::Right(status) => { + println!( + "handle_instance_change_job - deleted jobs: status={:?}", + status + ); + } + } } InstanceAction::Update => { trace!("handle_instance_change_job - instance updated"); @@ -523,7 +455,7 @@ pub async fn handle_instance_change_pod( instance: &Instance, podspec: &PodSpec, action: &InstanceAction, - kube_interface: &impl KubeInterface, + ctx: Arc, ) -> anyhow::Result<()> { trace!("handle_instance_change_pod - enter {:?}", action); @@ -555,14 +487,12 @@ pub async fn handle_instance_change_pod( nodes_to_act_on ); - trace!( - "handle_instance_change - find all pods that have {}={}", - AKRI_INSTANCE_LABEL_NAME, - instance_name - ); - let instance_pods = kube_interface - .find_pods_with_label(&format!("{}={}", AKRI_INSTANCE_LABEL_NAME, instance_name)) - .await?; + let lp = + ListParams::default().labels(&format!("{}={}", AKRI_INSTANCE_LABEL_NAME, instance_name)); + let api = ctx + .client + .namespaced(&instance.namespace().context("no namespace")?); + let instance_pods = api.list(&lp).await?; trace!( "handle_instance_change - found {} pods", instance_pods.items.len() @@ -581,7 +511,7 @@ pub async fn handle_instance_change_pod( "handle_instance_change - nodes tracked after querying existing pods={:?}", nodes_to_act_on ); - do_pod_action_for_nodes(nodes_to_act_on, instance, podspec, kube_interface).await?; + do_pod_action_for_nodes(nodes_to_act_on, instance, podspec, api, &ctx.identifier).await?; trace!("handle_instance_change - exit"); Ok(()) @@ -591,7 +521,8 @@ pub(crate) async fn do_pod_action_for_nodes( nodes_to_act_on: HashMap, instance: &Instance, podspec: &PodSpec, - kube_interface: &impl KubeInterface, + api: Box>, + field_manager: &str, ) -> anyhow::Result<()> { trace!("do_pod_action_for_nodes - enter"); // Iterate over nodes_to_act_on 
where value == (PodAction::Remove | PodAction::RemoveAndAdd) @@ -604,7 +535,7 @@ pub(crate) async fn do_pod_action_for_nodes( instance.spec.shared, node_to_delete_pod, context, - kube_interface, + api.as_ref(), ) .await? } @@ -622,118 +553,55 @@ pub(crate) async fn do_pod_action_for_nodes( .collect::>(); // Iterate over nodes_to_act_on where value == (PodAction::Add | PodAction::RemoveAndAdd) + let instance_name = instance.metadata.name.clone().unwrap(); + let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name); for new_node in nodes_to_add { - handle_addition_work( - instance.metadata.name.as_ref().unwrap(), - instance.metadata.uid.as_ref().unwrap(), + let new_pod = pod::create_new_pod_from_spec( instance.metadata.namespace.as_ref().unwrap(), + &instance_name, &instance.spec.configuration_name, - instance.spec.shared, + OwnershipInfo::new( + OwnershipType::Instance, + instance_name.clone(), + instance.metadata.uid.clone().unwrap(), + ), + &capability_id, &new_node, + instance.spec.shared, podspec, - kube_interface, + )?; + handle_addition_work( + api.as_ref(), + new_pod, + &instance.spec.configuration_name, + &new_node, + field_manager, ) .await?; } Ok(()) } +// Default action for finalizers for the instance controller +fn default_requeue_action() -> Action { + Action::await_change() +} + #[cfg(test)] mod handle_instance_tests { - use super::super::shared_test_utils::config_for_tests; - use super::super::shared_test_utils::config_for_tests::PodList; + use crate::util::shared_test_utils::mock_client::MockControllerKubeClient; + + use super::super::shared_test_utils::config_for_tests::*; use super::*; use akri_shared::{ akri::instance::Instance, - k8s::{pod::AKRI_INSTANCE_LABEL_NAME, MockKubeInterface}, + k8s::{api::MockApi, pod::AKRI_INSTANCE_LABEL_NAME}, os::file, }; use chrono::prelude::*; use chrono::Utc; use mockall::predicate::*; - fn configure_find_pods_with_phase( - mock: &mut MockKubeInterface, - pod_selector: &'static str, - result_file: &'static 
str, - specified_phase: &'static str, - ) { - trace!( - "mock.expect_find_pods_with_label pod_selector:{}", - pod_selector - ); - mock.expect_find_pods_with_label() - .times(1) - .withf(move |selector| selector == pod_selector) - .returning(move |_| { - let pods_json = file::read_file_to_string(result_file); - let phase_adjusted_json = pods_json.replace( - "\"phase\": \"Running\"", - &format!("\"phase\": \"{}\"", specified_phase), - ); - let pods: PodList = serde_json::from_str(&phase_adjusted_json).unwrap(); - Ok(pods) - }); - } - - fn configure_find_pods_with_phase_and_start_time( - mock: &mut MockKubeInterface, - pod_selector: &'static str, - result_file: &'static str, - specified_phase: &'static str, - start_time: DateTime, - ) { - trace!( - "mock.expect_find_pods_with_label pod_selector:{}", - pod_selector - ); - mock.expect_find_pods_with_label() - .times(1) - .withf(move |selector| selector == pod_selector) - .returning(move |_| { - let pods_json = file::read_file_to_string(result_file); - let phase_adjusted_json = pods_json.replace( - "\"phase\": \"Running\"", - &format!("\"phase\": \"{}\"", specified_phase), - ); - let start_time_adjusted_json = phase_adjusted_json.replace( - "\"startTime\": \"2020-02-25T20:48:03Z\"", - &format!( - "\"startTime\": \"{}\"", - start_time.format("%Y-%m-%dT%H:%M:%SZ") - ), - ); - let pods: PodList = serde_json::from_str(&start_time_adjusted_json).unwrap(); - Ok(pods) - }); - } - - fn configure_find_pods_with_phase_and_no_start_time( - mock: &mut MockKubeInterface, - pod_selector: &'static str, - result_file: &'static str, - specified_phase: &'static str, - ) { - trace!( - "mock.expect_find_pods_with_label pod_selector:{}", - pod_selector - ); - mock.expect_find_pods_with_label() - .times(1) - .withf(move |selector| selector == pod_selector) - .returning(move |_| { - let pods_json = file::read_file_to_string(result_file); - let phase_adjusted_json = pods_json.replace( - "\"phase\": \"Running\"", - &format!("\"phase\": \"{}\"", 
specified_phase), - ); - let start_time_adjusted_json = - phase_adjusted_json.replace("\"startTime\": \"2020-02-25T20:48:03Z\",", ""); - let pods: PodList = serde_json::from_str(&start_time_adjusted_json).unwrap(); - Ok(pods) - }); - } - #[derive(Clone)] struct HandleInstanceWork { find_pods_selector: &'static str, @@ -754,10 +622,11 @@ mod handle_instance_tests { } fn configure_for_handle_instance_change( - mock: &mut MockKubeInterface, + mock: &mut MockControllerKubeClient, work: &HandleInstanceWork, ) { - config_for_tests::configure_find_config( + let mut mock_pod_api: MockApi = MockApi::new(); + configure_find_config( mock, work.config_work.find_config_name, work.config_work.find_config_namespace, @@ -767,7 +636,7 @@ mod handle_instance_tests { if let Some(phase) = work.find_pods_phase { if let Some(start_time) = work.find_pods_start_time { configure_find_pods_with_phase_and_start_time( - mock, + &mut mock_pod_api, work.find_pods_selector, work.find_pods_result, phase, @@ -775,35 +644,39 @@ mod handle_instance_tests { ); } else if work.find_pods_delete_start_time { configure_find_pods_with_phase_and_no_start_time( - mock, + &mut mock_pod_api, work.find_pods_selector, work.find_pods_result, phase, ); } else { configure_find_pods_with_phase( - mock, + &mut mock_pod_api, work.find_pods_selector, work.find_pods_result, phase, ); } } else { - config_for_tests::configure_find_pods( - mock, + configure_find_pods( + &mut mock_pod_api, work.find_pods_selector, + work.config_work.find_config_namespace, work.find_pods_result, false, ); } if let Some(deletion_work) = &work.deletion_work { - configure_for_handle_deletion_work(mock, deletion_work); + configure_for_handle_deletion_work(&mut mock_pod_api, deletion_work); } if let Some(addition_work) = &work.addition_work { - configure_for_handle_addition_work(mock, addition_work); + configure_for_handle_addition_work(&mut mock_pod_api, addition_work); } + mock.pod + .expect_namespaced() + .return_once(move |_| 
Box::new(mock_pod_api)); } #[derive(Clone)] @@ -829,12 +702,12 @@ mod handle_instance_tests { } } - fn configure_for_handle_deletion_work(mock: &mut MockKubeInterface, work: &HandleDeletionWork) { + fn configure_for_handle_deletion_work(mock: &mut MockApi, work: &HandleDeletionWork) { for i in 0..work.broker_pod_names.len() { let broker_pod_name = work.broker_pod_names[i]; let cleanup_namespace = work.cleanup_namespaces[i]; - config_for_tests::configure_remove_pod(mock, broker_pod_name, cleanup_namespace); + configure_remove_pod(mock, broker_pod_name, cleanup_namespace); } } @@ -870,10 +743,10 @@ mod handle_instance_tests { } } - fn configure_for_handle_addition_work(mock: &mut MockKubeInterface, work: &HandleAdditionWork) { + fn configure_for_handle_addition_work(mock_api: &mut MockApi, work: &HandleAdditionWork) { for i in 0..work.new_pod_names.len() { - config_for_tests::configure_add_pod( - mock, + configure_add_pod( + mock_api, work.new_pod_names[i], work.new_pod_namespaces[i], AKRI_INSTANCE_LABEL_NAME, @@ -884,62 +757,30 @@ mod handle_instance_tests { } async fn run_handle_instance_change_test( - mock: &mut MockKubeInterface, + ctx: Arc, instance_file: &'static str, action: &'static InstanceAction, ) { trace!("run_handle_instance_change_test enter"); let instance_json = file::read_file_to_string(instance_file); let instance: Instance = serde_json::from_str(&instance_json).unwrap(); - handle_instance( + reconcile_inner( match action { - InstanceAction::Add | InstanceAction::Update => Event::Applied(instance), - InstanceAction::Remove => Event::Deleted(instance), + InstanceAction::Add | InstanceAction::Update => Event::Apply(Arc::new(instance)), + InstanceAction::Remove => Event::Cleanup(Arc::new(instance)), }, - mock, - &mut false, + ctx, ) .await .unwrap(); trace!("run_handle_instance_change_test exit"); } - // Test that watcher errors on restarts unless it is the first restart (aka initial startup) - #[tokio::test] - async fn test_handle_watcher_restart() { 
- let _ = env_logger::builder().is_test(true).try_init(); - let mut first_event = true; - assert!(handle_instance( - Event::Restarted(Vec::new()), - &MockKubeInterface::new(), - &mut first_event - ) - .await - .is_ok()); - first_event = false; - assert!(handle_instance( - Event::Restarted(Vec::new()), - &MockKubeInterface::new(), - &mut first_event - ) - .await - .is_err()); - } - - #[tokio::test] - async fn test_internal_handle_existing_instances_no_instances() { - let _ = env_logger::builder().is_test(true).try_init(); - - let mut mock = MockKubeInterface::new(); - config_for_tests::configure_get_instances(&mut mock, "../test/json/empty-list.json", false); - internal_handle_existing_instances(&mock).await.unwrap(); - } - #[tokio::test] async fn test_handle_instance_change_for_add_new_local_instance() { let _ = env_logger::builder().is_test(true).try_init(); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -954,7 +795,7 @@ mod handle_instance_tests { }, ); run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/local-instance.json", &InstanceAction::Add, ) @@ -965,7 +806,7 @@ mod handle_instance_tests { async fn test_handle_instance_change_for_add_new_local_instance_error() { let _ = env_logger::builder().is_test(true).try_init(); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -980,7 +821,7 @@ mod handle_instance_tests { }, ); run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/local-instance.json", &InstanceAction::Add, ) @@ -991,7 +832,7 @@ mod handle_instance_tests { async fn test_handle_instance_change_for_remove_running_local_instance() { let _ = 
env_logger::builder().is_test(true).try_init(); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -1006,7 +847,7 @@ mod handle_instance_tests { }, ); run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/local-instance.json", &InstanceAction::Remove, ) @@ -1017,7 +858,7 @@ mod handle_instance_tests { async fn test_handle_instance_change_for_add_new_shared_instance() { let _ = env_logger::builder().is_test(true).try_init(); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -1034,7 +875,7 @@ mod handle_instance_tests { }, ); run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/shared-instance.json", &InstanceAction::Add, ) @@ -1045,7 +886,7 @@ mod handle_instance_tests { async fn test_handle_instance_change_for_remove_running_shared_instance() { let _ = env_logger::builder().is_test(true).try_init(); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -1060,7 +901,7 @@ mod handle_instance_tests { }, ); run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/shared-instance.json", &InstanceAction::Remove, ) @@ -1071,7 +912,7 @@ mod handle_instance_tests { async fn test_handle_instance_change_for_update_active_shared_instance() { let _ = env_logger::builder().is_test(true).try_init(); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -1088,7 +929,7 @@ mod handle_instance_tests { }, ); 
run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/shared-instance-update.json", &InstanceAction::Update, ) @@ -1127,7 +968,7 @@ mod handle_instance_tests { }) .collect::>(); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -1143,7 +984,12 @@ mod handle_instance_tests { )), }, ); - run_handle_instance_change_test(&mut mock, instance_file, &InstanceAction::Update).await; + run_handle_instance_change_test( + Arc::new(ControllerContext::new(Arc::new(mock), "test")), + instance_file, + &InstanceAction::Update, + ) + .await; } /// Checks that the BROKER_POD_COUNT_METRIC is appropriately incremented @@ -1160,7 +1006,7 @@ mod handle_instance_tests { .with_label_values(&["config-a", "node-a"]) .set(0); - let mut mock = MockKubeInterface::new(); + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -1175,7 +1021,7 @@ mod handle_instance_tests { }, ); run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/local-instance.json", &InstanceAction::Add, ) @@ -1188,7 +1034,7 @@ mod handle_instance_tests { .get(), 1 ); - + let mut mock = MockControllerKubeClient::default(); configure_for_handle_instance_change( &mut mock, &HandleInstanceWork { @@ -1203,7 +1049,7 @@ mod handle_instance_tests { }, ); run_handle_instance_change_test( - &mut mock, + Arc::new(ControllerContext::new(Arc::new(mock), "test")), "../test/json/local-instance.json", &InstanceAction::Remove, ) diff --git a/controller/src/util/mod.rs b/controller/src/util/mod.rs index 4c6953c2d..a3ee3ab2f 100644 --- a/controller/src/util/mod.rs +++ b/controller/src/util/mod.rs @@ -1,5 +1,24 @@ +pub(crate) mod controller_ctx; pub mod instance_action; pub mod node_watcher; mod pod_action; pub mod 
pod_watcher; mod shared_test_utils; + +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ControllerError { + #[error(transparent)] + KubeError(#[from] kube::Error), + + #[error("Finalizer Error: {0}")] + // NB: awkward type because finalizer::Error embeds the reconciler error (which is this) + // so boxing this error to break cycles + FinalizerError(#[source] Box>), + + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +pub type Result = std::result::Result; diff --git a/controller/src/util/node_watcher.rs b/controller/src/util/node_watcher.rs index 2b94b3d54..20cd1aa6b 100644 --- a/controller/src/util/node_watcher.rs +++ b/controller/src/util/node_watcher.rs @@ -1,679 +1,477 @@ -use akri_shared::{ - akri::{ - instance::device_usage::NodeUsage, - instance::{Instance, InstanceSpec}, - retry::{random_delay, MAX_INSTANCE_UPDATE_TRIES}, - }, - k8s, - k8s::KubeInterface, +//! This is used to handle Nodes disappearing. +//! +//! When a Node disapears, make sure that any Instance that +//! references the Node is cleaned. This means that the +//! Instance.nodes property no longer contains the node and +//! that the Instance.deviceUsage property no longer contains +//! slots that are occupied by the node. 
+use crate::util::{ + controller_ctx::{ControllerContext, NodeState}, + ControllerError, Result, }; -use futures::{StreamExt, TryStreamExt}; +use akri_shared::k8s::api::Api; + +use akri_shared::akri::instance::{device_usage::NodeUsage, Instance}; +use futures::StreamExt; use k8s_openapi::api::core::v1::{Node, NodeStatus}; -use kube::api::Api; -use kube_runtime::watcher::{watcher, Config, Event}; -use kube_runtime::WatchStreamExt; +use kube::{ + api::{ListParams, ObjectList, ResourceExt}, + runtime::{ + controller::{Action, Controller}, + finalizer::finalizer, + finalizer::Event, + watcher::Config, + }, +}; use log::{error, info, trace}; -use std::collections::HashMap; use std::str::FromStr; +use std::{collections::HashMap, sync::Arc}; -/// Node states that NodeWatcher is interested in -/// -/// NodeState describes the various states that the controller can -/// react to for Nodes. -#[derive(Clone, Debug, PartialEq)] -enum NodeState { - /// Node has been seen, but not Running yet - Known, - /// Node has been seen Running - Running, - /// A previously Running Node has been seen as not Running - /// and the Instances have been cleaned of references to that - /// vanished Node - InstancesCleaned, -} +pub static NODE_FINALIZER: &str = "nodes.kube.rs"; -/// This is used to handle Nodes disappearing. -/// -/// When a Node disapears, make sure that any Instance that -/// references the Node is cleaned. This means that the -/// Instance.nodes property no longer contains the node and -/// that the Instance.deviceUsage property no longer contains -/// slots that are occupied by the node. 
-pub struct NodeWatcher { - known_nodes: HashMap, -} - -impl NodeWatcher { - /// Create new instance of BrokerPodWatcher - pub fn new() -> Self { - NodeWatcher { - known_nodes: HashMap::new(), - } +/// Initialize the instance controller +/// TODO: consider passing state that is shared among controllers such as a metrics exporter +pub async fn run(ctx: Arc) { + let api = ctx.client.all().as_inner(); + if let Err(e) = api.list(&ListParams::default().limit(1)).await { + error!("Nodes are not queryable; {e:?}"); + std::process::exit(1); } + Controller::new(api, Config::default().any_semantic()) + .shutdown_on_signal() + .run(reconcile, error_policy, ctx) + // TODO: needs update for tokio? + .filter_map(|x| async move { std::result::Result::ok(x) }) + .for_each(|_| futures::future::ready(())) + .await; +} - /// This watches for Node events - pub async fn watch( - &mut self, - ) -> Result<(), Box> { - trace!("watch - enter"); - let kube_interface = k8s::KubeImpl::new().await?; - let resource = Api::::all(kube_interface.get_kube_client()); - let watcher = watcher(resource, Config::default()).default_backoff(); - let mut informer = watcher.boxed(); - let mut first_event = true; - - // Currently, this does not handle None except to break the loop. - loop { - let event = match informer.try_next().await { - Err(e) => { - error!("Error during watch: {}", e); - continue; - } - Ok(None) => break, - Ok(Some(event)) => event, - }; - self.handle_node(event, &kube_interface, &mut first_event) - .await?; - } +fn error_policy(_node: Arc, error: &ControllerError, _ctx: Arc) -> Action { + log::warn!("reconcile failed: {:?}", error); + Action::requeue(std::time::Duration::from_secs(5 * 60)) +} - Ok(()) - } +/// This function is the main Reconcile function for Node resources +/// This will get called every time an Node is added, deleted, or changed, it will also be called for every existing Node on startup. +/// +/// Nodes are constantly updated. 
Cleanup work for our services only +/// needs to be called once. +/// +/// To achieve this, store each Node's state as either Known (Node has +/// been seen, but not Running), Running (Node has been seen as Running), +/// and InstanceCleaned (previously Running Node has been seen as not +/// Running). +/// +/// When a Node is in the Known state, it is not Running. If it has +/// never been seen as Running, it is likely being created and there is +/// no need to clean any Instance. +/// +/// Once a Node moves through the Running state into a non Running +/// state, it becomes important to clean Instances referencing the +/// non-Running Node. +pub async fn reconcile(node: Arc, ctx: Arc) -> Result { + trace!("Reconciling node {}", node.name_any()); + finalizer( + &ctx.client.clone().all().as_inner(), + NODE_FINALIZER, + node, + |event| reconcile_inner(event, ctx.clone()), + ) + .await + // .map_err(|_e| anyhow!("todo")) + .map_err(|e| ControllerError::FinalizerError(Box::new(e))) +} - /// This takes an event off the Node stream and if a Node is no longer - /// available, it calls handle_node_disappearance. - /// - /// Nodes are constantly updated. Cleanup work for our services only - /// needs to be called once. - /// - /// To achieve this, store each Node's state as either Known (Node has - /// been seen, but not Running), Running (Node has been seen as Running), - /// and InstanceCleaned (previously Running Node has been seen as not - /// Running). - /// - /// When a Node is in the Known state, it is not Running. If it has - /// never been seen as Running, it is likely being created and there is - /// no need to clean any Instance. - /// - /// Once a Node moves through the Running state into a non Running - /// state, it becomes important to clean Instances referencing the - /// non-Running Node. 
- async fn handle_node( - &mut self, - event: Event, - kube_interface: &impl KubeInterface, - first_event: &mut bool, - ) -> anyhow::Result<()> { - trace!("handle_node - enter"); - match event { - Event::Applied(node) => { - let node_name = node.metadata.name.clone().unwrap(); - info!("handle_node - Added or modified: {}", node_name); - if self.is_node_ready(&node) { - self.known_nodes.insert(node_name, NodeState::Running); - } else if let std::collections::hash_map::Entry::Vacant(e) = - self.known_nodes.entry(node_name) - { +async fn reconcile_inner(event: Event, ctx: Arc) -> Result { + match event { + Event::Apply(node) => { + let node_name = node.metadata.name.clone().unwrap(); + info!("handle_node - Added or modified: {}", node_name); + if is_node_ready(&node) { + ctx.known_nodes + .write() + .await + .insert(node_name, NodeState::Running); + } else { + let mut guard = ctx.known_nodes.write().await; + if let std::collections::hash_map::Entry::Vacant(e) = guard.entry(node_name) { e.insert(NodeState::Known); } else { // Node Modified - self.call_handle_node_disappearance_if_needed(&node, kube_interface) - .await?; - } - } - Event::Deleted(node) => { - info!("handle_node - Deleted: {:?}", &node.metadata.name); - self.call_handle_node_disappearance_if_needed(&node, kube_interface) - .await?; - } - Event::Restarted(_nodes) => { - if *first_event { - info!("handle_node - watcher started"); - } else { - return Err(anyhow::anyhow!( - "Node watcher restarted - throwing error to restart controller" - )); + drop(guard); + call_handle_node_disappearance_if_needed(&node, ctx.clone()).await?; } } - }; - *first_event = false; - Ok(()) + Ok(Action::await_change()) + } + Event::Cleanup(node) => { + info!("handle_node - Deleted: {:?}", &node.metadata.name); + call_handle_node_disappearance_if_needed(&node, ctx.clone()).await?; + ctx.known_nodes + .write() + .await + .remove(&node.metadata.name.as_deref().unwrap().to_string()); + Ok(Action::await_change()) + } } - - /// This 
should be called for Nodes that are either !Ready or Deleted. - /// This function ensures that handle_node_disappearance is called - /// only once for any Node as it disappears. - async fn call_handle_node_disappearance_if_needed( - &mut self, - node: &Node, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - let node_name = node.metadata.name.clone().unwrap(); +} +/// This should be called for Nodes that are either !Ready or Deleted. +/// This function ensures that handle_node_disappearance is called +/// only once for any Node as it disappears. +async fn call_handle_node_disappearance_if_needed( + node: &Node, + ctx: Arc, +) -> anyhow::Result<()> { + let node_name = node.metadata.name.as_deref().unwrap(); + trace!( + "call_handle_node_disappearance_if_needed - enter: {:?}", + &node.metadata.name + ); + let last_known_state = ctx + .known_nodes + .read() + .await + .get(node_name) + .unwrap_or(&NodeState::Running) + .clone(); + trace!( + "call_handle_node_disappearance_if_needed - last_known_state: {:?}", + &last_known_state + ); + // Nodes are updated roughly once a minute ... try to only call + // handle_node_disappearance once for a node that disappears. + // + // Also, there is no need to call handle_node_disappearance if a + // Node has never been in the Running state. + if last_known_state == NodeState::Running { trace!( - "call_handle_node_disappearance_if_needed - enter: {:?}", + "call_handle_node_disappearance_if_needed - call handle_node_disappearance: {:?}", &node.metadata.name ); - let last_known_state = self - .known_nodes - .get(&node_name) - .unwrap_or(&NodeState::Running); - trace!( - "call_handle_node_disappearance_if_needed - last_known_state: {:?}", - &last_known_state - ); - // Nodes are updated roughly once a minute ... try to only call - // handle_node_disappearance once for a node that disappears. - // - // Also, there is no need to call handle_node_disappearance if a - // Node has never been in the Running state. 
- if last_known_state == &NodeState::Running { - trace!( - "call_handle_node_disappearance_if_needed - call handle_node_disappearance: {:?}", - &node.metadata.name - ); - self.handle_node_disappearance(&node_name, kube_interface) - .await?; - self.known_nodes - .insert(node_name, NodeState::InstancesCleaned); - } - Ok(()) - } - - /// This determines if a node is in the Ready state. - fn is_node_ready(&self, k8s_node: &Node) -> bool { - trace!("is_node_ready - for node {:?}", k8s_node.metadata.name); - k8s_node - .status - .as_ref() - .unwrap_or(&NodeStatus::default()) - .conditions - .as_ref() - .unwrap_or(&Vec::new()) - .iter() - .filter_map(|condition| { - if condition.type_ == "Ready" { - Some(condition.status == "True") - } else { - None - } - }) - .collect::>() - .last() - .unwrap_or(&false) - == &true + handle_node_disappearance(node_name, ctx.clone()).await?; + ctx.known_nodes + .write() + .await + .insert(node_name.to_string(), NodeState::InstancesCleaned); } + Ok(()) +} - /// This handles when a node disappears by clearing nodes from - /// the nodes list and deviceUsage map and then trying 5 times to - /// update the Instance. 
- async fn handle_node_disappearance( - &self, - vanished_node_name: &str, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!( - "handle_node_disappearance - enter vanished_node_name={:?}", - vanished_node_name, - ); - - let instances = kube_interface.get_instances().await?; - trace!( - "handle_node_disappearance - found {:?} instances", - instances.items.len() - ); - for instance in instances.items { - let instance_name = instance.metadata.name.clone().unwrap(); - let instance_namespace = instance.metadata.namespace.as_ref().ok_or_else(|| { - anyhow::anyhow!("Namespace not found for instance: {}", instance_name) - })?; - - trace!( - "handle_node_disappearance - make sure node is not referenced here: {:?}", - &instance_name - ); - - // Try up to MAX_INSTANCE_UPDATE_TRIES times to update/create/get instance - for x in 0..MAX_INSTANCE_UPDATE_TRIES { - match if x == 0 { - self.try_remove_nodes_from_instance( - vanished_node_name, - &instance_name, - instance_namespace, - &instance, - kube_interface, - ) - .await - } else { - let retry_instance = kube_interface - .find_instance(&instance_name, instance_namespace) - .await?; - self.try_remove_nodes_from_instance( - vanished_node_name, - &instance_name, - instance_namespace, - &retry_instance, - kube_interface, - ) - .await - } { - Ok(_) => break, - Err(e) => { - if x == (MAX_INSTANCE_UPDATE_TRIES - 1) { - return Err(e); - } - random_delay().await; - } - } +/// This determines if a node is in the Ready state. 
+fn is_node_ready(k8s_node: &Node) -> bool { + trace!("is_node_ready - for node {:?}", k8s_node.metadata.name); + k8s_node + .status + .as_ref() + .unwrap_or(&NodeStatus::default()) + .conditions + .as_ref() + .unwrap_or(&Vec::new()) + .iter() + .filter_map(|condition| { + if condition.type_ == "Ready" { + Some(condition.status == "True") + } else { + None } - } + }) + .collect::>() + .last() + .unwrap_or(&false) + == &true +} - trace!("handle_node_disappearance - exit"); - Ok(()) - } +/// This handles when a node disappears by clearing nodes from +/// the nodes list and deviceUsage map. +async fn handle_node_disappearance( + vanished_node_name: &str, + ctx: Arc, +) -> anyhow::Result<()> { + trace!( + "handle_node_disappearance - enter vanished_node_name={:?}", + vanished_node_name, + ); + let api = ctx.client.all(); + let instances: ObjectList = api.list(&ListParams::default()).await?; + trace!( + "handle_node_disappearance - found {:?} instances", + instances.items.len() + ); + for instance in instances.items { + let instance_name = instance.metadata.name.clone().unwrap(); - /// This attempts to remove nodes from the nodes list and deviceUsage - /// map in an Instance. An attempt is made to update - /// the instance in etcd, any failure is returned. 
- async fn try_remove_nodes_from_instance( - &self, - vanished_node_name: &str, - instance_name: &str, - instance_namespace: &str, - instance: &Instance, - kube_interface: &impl KubeInterface, - ) -> Result<(), anyhow::Error> { trace!( - "try_remove_nodes_from_instance - vanished_node_name: {:?}", - &vanished_node_name + "handle_node_disappearance - make sure node is not referenced here: {:?}", + &instance_name ); - let modified_nodes = instance - .spec - .nodes - .iter() - .filter(|node| &vanished_node_name != node) - .map(|node| node.into()) - .collect::>(); - // Remove nodes from instance.deviceusage - let modified_device_usage = instance - .spec - .device_usage - .iter() - .map(|(slot, usage)| { - let same_node_name = match NodeUsage::from_str(usage) { - Ok(node_usage) => node_usage.is_same_node(vanished_node_name), - Err(_) => false, - }; - - ( - slot.to_string(), - if same_node_name { - NodeUsage::default().to_string() - } else { - usage.into() - }, - ) - }) - .collect::>(); - - // Save the instance - let modified_instance = InstanceSpec { - cdi_name: instance.spec.cdi_name.clone(), - capacity: instance.spec.capacity, - configuration_name: instance.spec.configuration_name.clone(), - broker_properties: instance.spec.broker_properties.clone(), - shared: instance.spec.shared, - device_usage: modified_device_usage, - nodes: modified_nodes, - }; - - trace!( - "handle_node_disappearance - kube_interface.update_instance name: {}, namespace: {}, {:?}", + try_remove_nodes_from_instance( + vanished_node_name, &instance_name, - &instance_namespace, - &modified_instance - ); - - kube_interface - .update_instance(&modified_instance, instance_name, instance_namespace) - .await + &instance, + api.as_ref(), + &ctx.identifier, + ) + .await?; } + + trace!("handle_node_disappearance - exit"); + Ok(()) +} + +/// This attempts to remove nodes from the nodes list and deviceUsage +/// map in an Instance. 
An attempt is made to update +/// the instance in etcd, any failure is returned. +async fn try_remove_nodes_from_instance( + vanished_node_name: &str, + _instance_name: &str, + instance: &Instance, + api: &dyn Api, + field_manager: &str, +) -> Result<(), anyhow::Error> { + trace!( + "try_remove_nodes_from_instance - vanished_node_name: {:?}", + &vanished_node_name + ); + let modified_nodes = instance + .spec + .nodes + .iter() + .filter(|node| &vanished_node_name != node) + .map(|node| node.into()) + .collect::>(); + // Remove nodes from instance.deviceusage + let modified_device_usage = instance + .spec + .device_usage + .iter() + .map(|(slot, usage)| match NodeUsage::from_str(usage) { + Ok(node_usage) if node_usage.is_same_node(vanished_node_name) => { + (slot.to_owned(), NodeUsage::default().to_string()) + } + Ok(_) => (slot.to_owned(), usage.into()), + Err(_) => (slot.to_owned(), usage.into()), + }) + .collect::>(); + let mut patched = instance.clone(); + patched.spec.device_usage = modified_device_usage; + patched.spec.nodes = modified_nodes; + api.apply(patched, field_manager).await?; + Ok(()) } #[cfg(test)] mod tests { - use super::super::shared_test_utils::config_for_tests; + use super::super::shared_test_utils::mock_client::MockControllerKubeClient; use super::*; - use akri_shared::{akri::instance::InstanceList, k8s::MockKubeInterface, os::file}; - - #[derive(Clone)] - struct UpdateInstance { - instance_to_update: InstanceSpec, - instance_name: &'static str, - instance_namespace: &'static str, - } - - #[derive(Clone)] - struct HandleNodeDisappearance { - get_instances_result_file: &'static str, - get_instances_result_listify: bool, - update_instance: Option, - } + use akri_shared::{akri::instance::InstanceSpec, k8s::api::MockApi, os::file}; - fn configure_for_handle_node_disappearance( - mock: &mut MockKubeInterface, - work: &HandleNodeDisappearance, - ) { - config_for_tests::configure_get_instances( - mock, - work.get_instances_result_file, - 
work.get_instances_result_listify, - ); - - if let Some(update_instance) = &work.update_instance { - config_for_tests::configure_update_instance( - mock, - update_instance.instance_to_update.clone(), - update_instance.instance_name, - update_instance.instance_namespace, - false, - ); - } - } - - // Test that watcher errors on restarts unless it is the first restart (aka initial startup) - #[tokio::test] - async fn test_handle_watcher_restart() { - let _ = env_logger::builder().is_test(true).try_init(); - let mut pod_watcher = NodeWatcher::new(); - let mut first_event = true; - assert!(pod_watcher - .handle_node( - Event::Restarted(Vec::new()), - &MockKubeInterface::new(), - &mut first_event - ) - .await - .is_ok()); - first_event = false; - assert!(pod_watcher - .handle_node( - Event::Restarted(Vec::new()), - &MockKubeInterface::new(), - &mut first_event - ) - .await - .is_err()); - } - - #[tokio::test] - async fn test_handle_node_added_unready() { - let _ = env_logger::builder().is_test(true).try_init(); - let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json"); - let node: Node = serde_json::from_str(&node_json).unwrap(); - let mut node_watcher = NodeWatcher::new(); - node_watcher - .handle_node(Event::Applied(node), &MockKubeInterface::new(), &mut false) - .await - .unwrap(); - - assert_eq!(1, node_watcher.known_nodes.len()); - - assert_eq!( - &NodeState::Known, - node_watcher.known_nodes.get("node-a").unwrap() - ) + fn instances_list( + instance_name: &str, + instance_namespace: &str, + ) -> kube::Result> { + let list = serde_json::json!({ + "apiVersion": "v1", + "kind": "List", + "metadata": { + "resourceVersion": "", + "selfLink": "" + }, + "items": [ + { + "apiVersion": "akri.sh/v0", + "kind": "Instance", + "metadata": { + "name": instance_name, + "namespace": instance_namespace, + "uid": "abcdegfh-ijkl-mnop-qrst-uvwxyz012345" + }, + "spec": { + "configurationName": "config-a", + "capacity": 5, + "cdiName": "akri.sh/config-a=359973", 
+ "deviceUsage": { + format!("{instance_name}-0"): "node-b", + format!("{instance_name}-1"): "node-a", + format!("{instance_name}-2"): "node-b", + format!("{instance_name}-3"): "node-a", + format!("{instance_name}-4"): "node-c", + format!("{instance_name}-5"): "" + }, + "nodes": [ "node-a", "node-b", "node-c" ], + "shared": true + } + } + ] + }); + Ok(serde_json::from_value(list).unwrap()) } #[tokio::test] - async fn test_handle_node_added_ready() { + async fn test_reconcile_node_apply_ready() { let _ = env_logger::builder().is_test(true).try_init(); - let node_json = file::read_file_to_string("../test/json/node-a.json"); let node: Node = serde_json::from_str(&node_json).unwrap(); - let mut node_watcher = NodeWatcher::new(); - node_watcher - .handle_node(Event::Applied(node), &MockKubeInterface::new(), &mut false) + let node_name = node.metadata.name.clone().unwrap(); + let mut mock = MockControllerKubeClient::default(); + mock.node + .expect_all() + .return_once(|| Box::new(MockApi::new())); + let ctx = Arc::new(ControllerContext::new(Arc::new(mock), "test")); + reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone()) .await .unwrap(); - assert_eq!(1, node_watcher.known_nodes.len()); - assert_eq!( &NodeState::Running, - node_watcher.known_nodes.get("node-a").unwrap() - ) + ctx.known_nodes.read().await.get(&node_name).unwrap() + ); } #[tokio::test] - async fn test_handle_node_modified_unready_unknown() { + async fn test_reconcile_node_apply_unready_unknown() { let _ = env_logger::builder().is_test(true).try_init(); - - let node_json = file::read_file_to_string("../test/json/node-b-not-ready.json"); + let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json"); let node: Node = serde_json::from_str(&node_json).unwrap(); - let mut node_watcher = NodeWatcher::new(); - let instance_file = "../test/json/shared-instance-update.json"; - let instance_json = file::read_file_to_string(instance_file); - let kube_object_instance: Instance = 
serde_json::from_str(&instance_json).unwrap(); - let mut instance = kube_object_instance.spec; - instance.nodes.clear(); - instance - .device_usage - .insert("config-a-359973-2".to_string(), "".to_string()); - - let mut mock = MockKubeInterface::new(); - configure_for_handle_node_disappearance( - &mut mock, - &HandleNodeDisappearance { - get_instances_result_file: "../test/json/shared-instance-update.json", - get_instances_result_listify: true, - update_instance: Some(UpdateInstance { - instance_to_update: instance, - instance_name: "config-a-359973", - instance_namespace: "config-a-namespace", - }), - }, - ); - // Insert node into list of known_nodes to mock being previously applied - node_watcher - .known_nodes - .insert(node.metadata.name.clone().unwrap(), NodeState::Running); - node_watcher - .handle_node(Event::Applied(node), &mock, &mut false) + let node_name = node.metadata.name.clone().unwrap(); + let mut mock = MockControllerKubeClient::default(); + mock.node + .expect_all() + .return_once(|| Box::new(MockApi::new())); + let ctx = Arc::new(ControllerContext::new(Arc::new(mock), "test")); + reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone()) .await .unwrap(); - assert_eq!(1, node_watcher.known_nodes.len()); - assert_eq!( - &NodeState::InstancesCleaned, - node_watcher.known_nodes.get("node-b").unwrap() - ) + &NodeState::Known, + ctx.known_nodes.read().await.get(&node_name).unwrap() + ); } - + // If a known node is modified and is still not ready, it should remain in the known state #[tokio::test] - async fn test_handle_node_modified_ready_unknown() { + async fn test_reconcile_node_apply_unready_known() { let _ = env_logger::builder().is_test(true).try_init(); - - let node_json = file::read_file_to_string("../test/json/node-b.json"); + let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json"); let node: Node = serde_json::from_str(&node_json).unwrap(); - let mut node_watcher = NodeWatcher::new(); - - let mock = 
MockKubeInterface::new(); - node_watcher - .handle_node(Event::Applied(node), &mock, &mut false) + let node_name = node.metadata.name.clone().unwrap(); + let mut mock = MockControllerKubeClient::default(); + mock.node + .expect_all() + .return_once(|| Box::new(MockApi::new())); + let ctx = Arc::new(ControllerContext::new(Arc::new(mock), "test")); + ctx.known_nodes + .write() + .await + .insert(node_name.clone(), NodeState::Known); + reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone()) .await .unwrap(); - assert_eq!(1, node_watcher.known_nodes.len()); - assert_eq!( - &NodeState::Running, - node_watcher.known_nodes.get("node-b").unwrap() - ) + &NodeState::Known, + ctx.known_nodes.read().await.get(&node_name).unwrap() + ); } + // If previously running node is modified and is not ready, it should remove the node from the instances' node lists #[tokio::test] - async fn test_handle_node_deleted_unready_unknown() { + async fn test_reconcile_node_apply_unready_previously_running() { let _ = env_logger::builder().is_test(true).try_init(); - - let node_json = file::read_file_to_string("../test/json/node-b-not-ready.json"); + let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json"); let node: Node = serde_json::from_str(&node_json).unwrap(); - let mut node_watcher = NodeWatcher::new(); - - let instance_file = "../test/json/shared-instance-update.json"; - let instance_json = file::read_file_to_string(instance_file); - let kube_object_instance: Instance = serde_json::from_str(&instance_json).unwrap(); - let mut instance = kube_object_instance.spec; - instance.nodes.clear(); - instance - .device_usage - .insert("config-a-359973-2".to_string(), "".to_string()); - - let mut mock = MockKubeInterface::new(); - configure_for_handle_node_disappearance( - &mut mock, - &HandleNodeDisappearance { - get_instances_result_file: "../test/json/shared-instance-update.json", - get_instances_result_listify: true, - update_instance: Some(UpdateInstance { - 
instance_to_update: instance, - instance_name: "config-a-359973", - instance_namespace: "config-a-namespace", - }), - }, - ); - - node_watcher - .handle_node(Event::Deleted(node), &mock, &mut false) + let node_name = node.metadata.name.clone().unwrap(); + let mut mock = MockControllerKubeClient::default(); + mock.node + .expect_all() + .return_once(|| Box::new(MockApi::new())); + let mut instance_api_mock: MockApi = MockApi::new(); + let instance_name = "config-a-359973"; + instance_api_mock + .expect_list() + .return_once(|_| instances_list(instance_name, "unused")); + instance_api_mock + .expect_apply() + .return_once(|_, _| Ok(Instance::new("unused", InstanceSpec::default()))) + .withf(|i, _| !i.spec.nodes.contains(&"node-a".to_owned())); + mock.instance + .expect_all() + .return_once(move || Box::new(instance_api_mock)); + let ctx = Arc::new(ControllerContext::new(Arc::new(mock), "test")); + ctx.known_nodes + .write() + .await + .insert(node_name.clone(), NodeState::Running); + reconcile_inner(Event::Apply(Arc::new(node)), ctx.clone()) .await .unwrap(); - - assert_eq!(1, node_watcher.known_nodes.len()); - assert_eq!( &NodeState::InstancesCleaned, - node_watcher.known_nodes.get("node-b").unwrap() - ) - } - - const LIST_PREFIX: &str = r#" -{ - "apiVersion": "v1", - "items": ["#; - const LIST_SUFFIX: &str = r#" - ], - "kind": "List", - "metadata": { - "resourceVersion": "", - "selfLink": "" - } -}"#; - fn listify_node(node_json: &str) -> String { - format!("{}\n{}\n{}", LIST_PREFIX, node_json, LIST_SUFFIX) + ctx.known_nodes.read().await.get(&node_name).unwrap() + ); } + // If previously running node enters the cleanup state, it should remove the node from the instances' node lists + // and ensure that the node is removed from the known_nodes #[tokio::test] - async fn test_handle_node_disappearance_update_failure_retries() { + async fn test_reconcile_node_cleanup() { let _ = env_logger::builder().is_test(true).try_init(); - - let mut mock = 
MockKubeInterface::new(); - mock.expect_get_instances().times(1).returning(move || { - let instance_file = "../test/json/shared-instance-update.json"; - let instance_json = file::read_file_to_string(instance_file); - let instance_list_json = listify_node(&instance_json); - let list: InstanceList = serde_json::from_str(&instance_list_json).unwrap(); - Ok(list) - }); - mock.expect_update_instance() - .times(MAX_INSTANCE_UPDATE_TRIES as usize) - .withf(move |_instance, n, ns| n == "config-a-359973" && ns == "config-a-namespace") - .returning(move |_, _, _| Err(None.ok_or_else(|| anyhow::anyhow!("failure"))?)); - mock.expect_find_instance() - .times((MAX_INSTANCE_UPDATE_TRIES - 1) as usize) - .withf(move |n, ns| n == "config-a-359973" && ns == "config-a-namespace") - .returning(move |_, _| { - let instance_file = "../test/json/shared-instance-update.json"; - let instance_json = file::read_file_to_string(instance_file); - let instance: Instance = serde_json::from_str(&instance_json).unwrap(); - Ok(instance) - }); - - let node_watcher = NodeWatcher::new(); - assert!(node_watcher - .handle_node_disappearance("foo-a", &mock) + let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json"); + let node: Node = serde_json::from_str(&node_json).unwrap(); + let node_name = node.metadata.name.clone().unwrap(); + let mut mock = MockControllerKubeClient::default(); + mock.node + .expect_all() + .return_once(|| Box::new(MockApi::new())); + let mut instance_api_mock: MockApi = MockApi::new(); + let instance_name = "config-a-359973"; + instance_api_mock + .expect_list() + .return_once(|_| instances_list(instance_name, "unused")); + instance_api_mock + .expect_apply() + .return_once(|_, _| Ok(Instance::new("unused", InstanceSpec::default()))) + .withf(|i, _| !i.spec.nodes.contains(&"node-a".to_owned())); + mock.instance + .expect_all() + .return_once(move || Box::new(instance_api_mock)); + let ctx = Arc::new(ControllerContext::new(Arc::new(mock), "test")); + 
ctx.known_nodes + .write() + .await + .insert(node_name.clone(), NodeState::Running); + reconcile_inner(Event::Cleanup(Arc::new(node)), ctx.clone()) .await - .is_err()); + .unwrap(); + assert!(ctx.known_nodes.read().await.get(&node_name).is_none()); } + // If unknown node is deleted, it should remove the node from the instances' node lists #[tokio::test] - async fn test_try_remove_nodes_from_instance() { + async fn test_reconcile_node_cleanup_unknown() { let _ = env_logger::builder().is_test(true).try_init(); - - let instance_file = "../test/json/shared-instance-update.json"; - let instance_json = file::read_file_to_string(instance_file); - let kube_object_instance: Instance = serde_json::from_str(&instance_json).unwrap(); - - let mut mock = MockKubeInterface::new(); - mock.expect_update_instance() - .times(1) - .withf(move |ins, n, ns| { - n == "config-a" - && ns == "config-a-namespace" - && !ins.nodes.contains(&"node-b".to_string()) - && ins - .device_usage - .iter() - .filter_map(|(_slot, value)| { - if value == &"node-b".to_string() { - Some(value.to_string()) - } else { - None - } - }) - .collect::>() - .first() - .is_none() - }) - .returning(move |_, _, _| Ok(())); - - let node_watcher = NodeWatcher::new(); - assert!(node_watcher - .try_remove_nodes_from_instance( - "node-b", - "config-a", - "config-a-namespace", - &kube_object_instance, - &mock, - ) + let node_json = file::read_file_to_string("../test/json/node-a-not-ready.json"); + let node: Node = serde_json::from_str(&node_json).unwrap(); + let node_name = node.metadata.name.clone().unwrap(); + let mut mock = MockControllerKubeClient::default(); + mock.node + .expect_all() + .return_once(|| Box::new(MockApi::new())); + let mut instance_api_mock: MockApi = MockApi::new(); + let instance_name = "config-a-359973"; + instance_api_mock + .expect_list() + .return_once(|_| instances_list(instance_name, "unused")); + instance_api_mock + .expect_apply() + .return_once(|_, _| Ok(Instance::new("unused", 
InstanceSpec::default()))) + .withf(|i, _| !i.spec.nodes.contains(&"node-a".to_owned())); + mock.instance + .expect_all() + .return_once(move || Box::new(instance_api_mock)); + let ctx = Arc::new(ControllerContext::new(Arc::new(mock), "test")); + reconcile_inner(Event::Cleanup(Arc::new(node)), ctx.clone()) .await - .is_ok()); - } - - #[test] - fn test_is_node_ready_ready() { - let _ = env_logger::builder().is_test(true).try_init(); - - let tests = [ - ("../test/json/node-a.json", true), - ("../test/json/node-a-not-ready.json", false), - ("../test/json/node-a-no-conditions.json", false), - ("../test/json/node-a-no-ready-condition.json", false), - ]; - - for (node_file, result) in tests.iter() { - trace!( - "Testing {} should reflect node is ready={}", - node_file, - result - ); - - let node_json = file::read_file_to_string(node_file); - let kube_object_node: Node = serde_json::from_str(&node_json).unwrap(); - - let node_watcher = NodeWatcher::new(); - assert_eq!( - result.clone(), - node_watcher.is_node_ready(&kube_object_node) - ); - } + .unwrap(); + assert!(ctx.known_nodes.read().await.get(&node_name).is_none()); } } diff --git a/controller/src/util/pod_watcher.rs b/controller/src/util/pod_watcher.rs index 9ebf4a241..38b1fc699 100644 --- a/controller/src/util/pod_watcher.rs +++ b/controller/src/util/pod_watcher.rs @@ -1,60 +1,35 @@ +use crate::util::controller_ctx::{ControllerContext, PodState}; +use crate::util::{ControllerError, Result}; use akri_shared::{ - akri::{ - configuration::Configuration, - retry::{random_delay, MAX_INSTANCE_UPDATE_TRIES}, - }, - k8s, + akri::{configuration::Configuration, instance::Instance, API_NAMESPACE}, k8s::{ - pod::{AKRI_CONFIGURATION_LABEL_NAME, AKRI_INSTANCE_LABEL_NAME}, - service, KubeInterface, OwnershipInfo, OwnershipType, + api::Api, OwnershipInfo, OwnershipType, AKRI_CONFIGURATION_LABEL_NAME, + AKRI_INSTANCE_LABEL_NAME, APP_LABEL_ID, CONTROLLER_LABEL_ID, }, }; -use async_std::sync::Mutex; -use futures::{StreamExt, 
TryStreamExt}; -use k8s_openapi::api::core::v1::{Pod, ServiceSpec}; -use kube::api::Api; -use kube_runtime::watcher::{watcher, Config, Event}; -use kube_runtime::WatchStreamExt; -use log::{error, info, trace}; -use std::{collections::HashMap, sync::Arc}; -type PodSlice = [Pod]; +use anyhow::Context; +use futures::StreamExt; -/// Pod states that BrokerPodWatcher is interested in -/// -/// PodState describes the various states that the controller can -/// react to for Pods. -#[derive(Clone, Debug, PartialEq)] -enum PodState { - /// Pod is in Pending state and no action is needed. - Pending, - /// Pod is in Running state and needs to ensure that - /// instance and configuration services are running - Running, - /// Pod is in Failed/Completed/Succeeded state and - /// needs to remove any instance and configuration - /// services that are not supported by other Running - /// Pods. Also, at this point, if an Instance still - /// exists, instance_action::handle_instance_change - /// needs to be called to ensure that Pods are - /// restarted - Ended, - /// Pod is in Deleted state and needs to remove any - /// instance and configuration services that are not - /// supported by other Running Pods. Also, at this - /// point, if an Instance still exists, and the Pod is - /// owned by the Instance, - /// instance_action::handle_instance_change needs to be - /// called to ensure that Pods are restarted. Akri - /// places an Instance OwnerReference on all the Pods it - /// deploys. This declares that the Instance owns that - /// Pod and Akri's Controller explicitly manages its - /// deployment. However, if the Pod is not owned by the - /// Instance, Akri should not assume retry logic and - /// should cease action. The owning object (ie Job) will - /// handle retries as necessary. 
- Deleted, -} +use k8s_openapi::api::core::v1::Pod; +use k8s_openapi::{ + api::core::v1::{Service, ServiceSpec}, + apimachinery::pkg::apis::meta::v1::OwnerReference, +}; +use kube::api::ObjectList; + +use kube::{ + api::{ListParams, ObjectMeta, ResourceExt}, + runtime::{ + controller::{Action, Controller}, + finalizer::{finalizer, Event}, + watcher::Config, + }, +}; +use log::{error, info, trace}; +use std::{collections::BTreeMap, sync::Arc}; + +pub static POD_FINALIZER: &str = "pods.kube.rs"; /// The `kind` of a broker Pod's controlling OwnerReference /// @@ -97,6 +72,26 @@ fn get_broker_pod_owner_kind(pod: &Pod) -> BrokerPodOwnerKind { } } +/// Run the Pod reconciler +pub async fn run(ctx: Arc) { + let api = ctx.client.all().as_inner(); + if let Err(e) = api.list(&ListParams::default().limit(1)).await { + error!("Pods are not queryable; {e:?}"); + std::process::exit(1); + } + Controller::new(api, Config::default().labels(AKRI_CONFIGURATION_LABEL_NAME)) + .shutdown_on_signal() + .run(reconcile, error_policy, ctx) + .filter_map(|x| async move { std::result::Result::ok(x) }) + .for_each(|_| futures::future::ready(())) + .await; +} + +fn error_policy(_node: Arc, error: &ControllerError, _ctx: Arc) -> Action { + log::warn!("reconcile failed: {:?}", error); + Action::requeue(std::time::Duration::from_secs(5 * 60)) +} + /// This is used to handle broker Pods entering and leaving /// the Running state. /// @@ -109,593 +104,571 @@ fn get_broker_pod_owner_kind(pod: &Pod) -> BrokerPodOwnerKind { /// still have other broker Pods supporting them. If there /// are no other supporting broker Pods, delete one or both /// of the services. 
-#[derive(Debug)] -pub struct BrokerPodWatcher { - known_pods: HashMap, +pub async fn reconcile(pod: Arc, ctx: Arc) -> Result { + trace!("Reconciling broker pod {}", pod.name_any()); + finalizer( + &ctx.client.clone().all().as_inner(), + POD_FINALIZER, + pod, + |event| reconcile_inner(event, ctx.clone()), + ) + .await + .map_err(|e| ControllerError::FinalizerError(Box::new(e))) } -impl BrokerPodWatcher { - /// Create new instance of BrokerPodWatcher - pub fn new() -> Self { - BrokerPodWatcher { - known_pods: HashMap::new(), - } - } - - /// This watches for broker Pod events - pub async fn watch(&mut self) -> anyhow::Result<()> { - trace!("watch - enter"); - let kube_interface = k8s::KubeImpl::new().await?; - let resource = Api::::all(kube_interface.get_kube_client()); - let watcher = watcher( - resource, - Config::default().labels(AKRI_CONFIGURATION_LABEL_NAME), - ) - .default_backoff(); - let mut informer = watcher.boxed(); - let synchronization = Arc::new(Mutex::new(())); - let mut first_event = true; - - loop { - let event = match informer.try_next().await { - Err(e) => { - error!("Error during watch: {}", e); - continue; +async fn reconcile_inner(event: Event, ctx: Arc) -> Result { + match event { + Event::Apply(pod) => { + let phase = get_pod_phase(&pod); + info!( + "reconcile - pod {:?} applied with phase {phase}", + &pod.metadata.name + ); + match phase.as_str() { + "Unknown" | "Pending" => { + ctx.known_pods.write().await.insert( + pod.metadata.name.clone().context("Pod name is None")?, + PodState::Pending, + ); } - Ok(None) => return Err(anyhow::anyhow!("Watch stream ended")), - Ok(Some(event)) => event, - }; - let _lock = synchronization.lock().await; - self.handle_pod(event, &kube_interface, &mut first_event) - .await?; - } - } - - /// Gets Pods phase and returns "Unknown" if no phase exists - fn get_pod_phase(&mut self, pod: &Pod) -> String { - if pod.status.is_some() { - pod.status - .as_ref() - .unwrap() - .phase - .as_ref() - 
.unwrap_or(&"Unknown".to_string()) - .to_string() - } else { - "Unknown".to_string() - } - } - - /// This takes an event off the Pod stream. If a Pod is newly Running, ensure that - /// the instance and configuration services are running. If a Pod is no longer Running, - /// ensure that the instance and configuration services are removed as needed. - async fn handle_pod( - &mut self, - event: Event, - kube_interface: &impl KubeInterface, - first_event: &mut bool, - ) -> anyhow::Result<()> { - trace!("handle_pod - enter [event: {:?}]", event); - match event { - Event::Applied(pod) => { - info!( - "handle_pod - pod {:?} added or modified", - &pod.metadata.name - ); - let phase = self.get_pod_phase(&pod); - trace!("handle_pod - pod phase {:?}", &phase); - match phase.as_str() { - "Unknown" | "Pending" => { - self.known_pods.insert( - pod.metadata.name.clone().ok_or_else(|| { - anyhow::format_err!("Pod {:?} does not have name", pod) - })?, - PodState::Pending, - ); - } - "Running" => { - self.handle_running_pod_if_needed(&pod, kube_interface) - .await?; - } - "Succeeded" | "Failed" => { - self.handle_ended_pod_if_needed(&pod, kube_interface) - .await?; - } - _ => { - trace!("handle_pod - Unknown phase: {:?}", &phase); - } + "Running" => { + handle_running_pod_if_needed(&pod, ctx).await?; } - } - Event::Deleted(pod) => { - info!("handle_pod - Deleted: {:?}", &pod.metadata.name); - self.handle_deleted_pod_if_needed(&pod, kube_interface) - .await?; - } - Event::Restarted(pods) => { - if *first_event { - info!( - "handle_pod - pod watcher [re]started. 
Pods are : {:?}", - pods - ); - } else { - return Err(anyhow::anyhow!( - "Pod watcher restarted - throwing error to restart controller" - )); + "Succeeded" | "Failed" => { + handle_ended_pod_if_needed(&pod, ctx).await?; + } + _ => { + trace!("handle_pod - Unknown phase: {:?}", &phase); } } - }; - *first_event = false; - Ok(()) + Ok(Action::await_change()) + } + Event::Cleanup(pod) => { + info!("handle_pod - Deleted: {:?}", &pod.metadata.name); + handle_deleted_pod_if_needed(&pod, ctx).await?; + Ok(Action::await_change()) + } } +} - /// This ensures that handle_running_pod is called only once for - /// any Pod as it exits the Running phase. - async fn handle_running_pod_if_needed( - &mut self, - pod: &Pod, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!("handle_running_pod_if_needed - enter"); - let pod_name = pod - .metadata - .name - .clone() - .ok_or_else(|| anyhow::format_err!("Pod {:?} does not have name", pod))?; - let last_known_state = self.known_pods.get(&pod_name).unwrap_or(&PodState::Pending); - trace!( - "handle_running_pod_if_needed - last_known_state: {:?}", - &last_known_state - ); - // Ensure that, for each pod, handle_running_pod is called once - // per transition into the Running state - if last_known_state != &PodState::Running { - trace!("handle_running_pod_if_needed - call handle_running_pod"); - self.handle_running_pod(pod, kube_interface).await?; - self.known_pods.insert(pod_name, PodState::Running); - } - Ok(()) +/// Gets Pods phase and returns "Unknown" if no phase exists +fn get_pod_phase(pod: &Pod) -> String { + if pod.status.is_some() { + pod.status + .as_ref() + .unwrap() + .phase + .as_ref() + .unwrap_or(&"Unknown".to_string()) + .to_string() + } else { + "Unknown".to_string() } +} - /// This ensures that handle_non_running_pod is called only once for - /// any Pod as it enters the Ended phase. 
Note that handle_non_running_pod - /// will likely be called twice as a Pod leaves the Running phase, that is - /// expected and accepted. - async fn handle_ended_pod_if_needed( - &mut self, - pod: &Pod, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!("handle_ended_pod_if_needed - enter"); - let pod_name = pod - .metadata - .name - .clone() - .ok_or_else(|| anyhow::format_err!("Pod {:?} does not have name", pod))?; - let last_known_state = self.known_pods.get(&pod_name).unwrap_or(&PodState::Pending); - trace!( - "handle_ended_pod_if_needed - last_known_state: {:?}", - &last_known_state - ); - // Ensure that, for each pod, handle_non_running_pod is called once - // per transition into the Ended state - if last_known_state != &PodState::Ended { - trace!("handle_ended_pod_if_needed - call handle_non_running_pod"); - self.handle_non_running_pod(pod, kube_interface).await?; - self.known_pods.insert(pod_name, PodState::Ended); - } - Ok(()) +/// This ensures that handle_running_pod is called only once for +/// any Pod as it exits the Running phase. 
+async fn handle_running_pod_if_needed( + pod: &Pod, + ctx: Arc, +) -> anyhow::Result<()> { + trace!("handle_running_pod_if_needed - enter"); + let pod_name = pod.metadata.name.as_deref().context("Pod name is None")?; + let last_known_state = ctx + .known_pods + .read() + .await + .get(pod_name) + .unwrap_or(&PodState::Pending) + .clone(); + trace!( + "handle_running_pod_if_needed - last_known_state: {:?}", + &last_known_state + ); + // Ensure that, for each pod, handle_running_pod is called once + // per transition into the Running state + if last_known_state != PodState::Running { + trace!("handle_running_pod_if_needed - call handle_running_pod"); + handle_running_pod(pod, ctx.clone()).await?; + ctx.known_pods + .write() + .await + .insert(pod_name.to_string(), PodState::Running); } + Ok(()) +} - /// This ensures that handle_non_running_pod is called only once for - /// any Pod as it enters the Ended phase. Note that handle_non_running_pod - /// will likely be called twice as a Pod leaves the Running phase, that is - /// expected and accepted. 
- async fn handle_deleted_pod_if_needed( - &mut self, - pod: &Pod, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!("handle_deleted_pod_if_needed - enter"); - let pod_name = pod - .metadata - .name - .clone() - .ok_or_else(|| anyhow::format_err!("Pod {:?} does not have name", pod))?; - let last_known_state = self.known_pods.get(&pod_name).unwrap_or(&PodState::Pending); - trace!( - "handle_deleted_pod_if_needed - last_known_state: {:?}", - &last_known_state - ); - // Ensure that, for each pod, handle_non_running_pod is called once - // per transition into the Deleted state - if last_known_state != &PodState::Deleted { - trace!("handle_deleted_pod_if_needed - call handle_non_running_pod"); - self.handle_non_running_pod(pod, kube_interface).await?; - self.known_pods.insert(pod_name, PodState::Deleted); - } - Ok(()) +/// This ensures that handle_non_running_pod is called only once for +/// any Pod as it enters the Ended phase. Note that handle_non_running_pod +/// will likely be called twice as a Pod leaves the Running phase, that is +/// expected and accepted. 
+async fn handle_ended_pod_if_needed(pod: &Pod, ctx: Arc) -> anyhow::Result<()> { + trace!("handle_ended_pod_if_needed - enter"); + let pod_name = pod + .metadata + .name + .clone() + .context("Pod does not have name")?; + let last_known_state = ctx + .known_pods + .read() + .await + .get(&pod_name) + .unwrap_or(&PodState::Pending) + .clone(); + trace!( + "handle_ended_pod_if_needed - last_known_state: {:?}", + &last_known_state + ); + // Ensure that, for each pod, handle_non_running_pod is called once + // per transition into the Ended state + if last_known_state != PodState::Ended { + trace!("handle_ended_pod_if_needed - call handle_non_running_pod"); + handle_non_running_pod(pod, ctx.clone()).await?; + ctx.known_pods + .write() + .await + .insert(pod_name, PodState::Ended); } + Ok(()) +} - /// Get instance id and configuration name from Pod annotations, return - /// error if the annotations are not found. - fn get_instance_and_configuration_from_pod( - &self, - pod: &Pod, - ) -> anyhow::Result<(String, String)> { - trace!("get_instance_and_configuration_from_pod - enter"); - let labels = pod - .metadata - .labels - .as_ref() - .ok_or_else(|| anyhow::anyhow!("Pod doesn't have labels"))?; - let instance_id = labels - .get(AKRI_INSTANCE_LABEL_NAME) - .ok_or_else(|| anyhow::anyhow!("No configuration name found."))?; - let config_name = labels - .get(AKRI_CONFIGURATION_LABEL_NAME) - .ok_or_else(|| anyhow::anyhow!("No instance id found."))?; - Ok((instance_id.to_string(), config_name.to_string())) +/// This ensures that handle_non_running_pod is called only once for +/// any Pod as it enters the Ended phase. Note that handle_non_running_pod +/// will likely be called twice as a Pod leaves the Running phase, that is +/// expected and accepted. 
+async fn handle_deleted_pod_if_needed( + pod: &Pod, + ctx: Arc, +) -> anyhow::Result<()> { + trace!("handle_deleted_pod_if_needed - enter"); + let pod_name = pod + .metadata + .name + .clone() + .ok_or_else(|| anyhow::format_err!("Pod {:?} does not have name", pod))?; + // Ensure that, for each pod, handle_non_running_pod is called once + // per transition into the Deleted state + if ctx + .known_pods + .read() + .await + .get(&pod_name) + .unwrap_or(&PodState::Pending) + != &PodState::Deleted + { + trace!("handle_deleted_pod_if_needed - call handle_non_running_pod"); + handle_non_running_pod(pod, ctx.clone()).await?; + ctx.known_pods + .write() + .await + .insert(pod_name, PodState::Deleted); } + Ok(()) +} - /// This is called when a broker Pod exits the Running phase and ensures - /// that instance and configuration services are only running when - /// supported by Running broker Pods. - async fn handle_non_running_pod( - &self, - pod: &Pod, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!("handle_non_running_pod - enter"); - let namespace = pod.metadata.namespace.as_ref().ok_or_else(|| { - anyhow::anyhow!("Namespace not found for pod: {:?}", &pod.metadata.name) - })?; - let (instance_id, config_name) = self.get_instance_and_configuration_from_pod(pod)?; - self.find_pods_and_cleanup_svc_if_unsupported( - &instance_id, - &config_name, - namespace, - true, - kube_interface, - ) - .await?; - self.find_pods_and_cleanup_svc_if_unsupported( - &instance_id, - &config_name, - namespace, - false, - kube_interface, - ) - .await?; +/// Get instance id and configuration name from Pod annotations, return +/// error if the annotations are not found. 
+fn get_instance_and_configuration_from_pod(pod: &Pod) -> anyhow::Result<(String, String)> { + let labels = pod + .metadata + .labels + .as_ref() + .ok_or_else(|| anyhow::anyhow!("Pod doesn't have labels"))?; + let instance_id = labels + .get(AKRI_INSTANCE_LABEL_NAME) + .ok_or_else(|| anyhow::anyhow!("No configuration name found."))?; + let config_name = labels + .get(AKRI_CONFIGURATION_LABEL_NAME) + .ok_or_else(|| anyhow::anyhow!("No instance id found."))?; + Ok((instance_id.to_string(), config_name.to_string())) +} - // Only redeploy Pods that are managed by the Akri Controller (controlled by an Instance OwnerReference) - if get_broker_pod_owner_kind(pod) == BrokerPodOwnerKind::Instance { - // Make sure instance has required Pods - if let Ok(instance) = kube_interface.find_instance(&instance_id, namespace).await { - super::instance_action::handle_instance_change( - &instance, - &super::instance_action::InstanceAction::Update, - kube_interface, - ) - .await?; +/// This is called when a broker Pod exits the Running phase and ensures +/// that instance and configuration services are only running when +/// supported by Running broker Pods. 
+async fn handle_non_running_pod(pod: &Pod, ctx: Arc) -> anyhow::Result<()> { + trace!("handle_non_running_pod - enter"); + let namespace = pod + .metadata + .namespace + .as_ref() + .context("Pod has no namespace")?; + let (instance_id, config_name) = get_instance_and_configuration_from_pod(pod)?; + let selector = format!("{}={}", AKRI_CONFIGURATION_LABEL_NAME, config_name); + let broker_pods: ObjectList = ctx + .client + .all() + .list(&ListParams { + label_selector: Some(selector), + ..Default::default() + }) + .await?; + // Clean up instance services so long as all pods are terminated or terminating + let svc_api = ctx.client.namespaced(namespace); + cleanup_svc_if_unsupported( + &broker_pods.items, + &create_service_app_name(&config_name), + namespace, + svc_api.as_ref(), + ) + .await?; + let instance_pods: Vec = broker_pods + .items + .into_iter() + .filter(|x| { + match x + .metadata + .labels + .as_ref() + .unwrap_or(&BTreeMap::new()) + .get(AKRI_INSTANCE_LABEL_NAME) + { + Some(name) => name == &instance_id, + None => false, } + }) + .collect(); + cleanup_svc_if_unsupported( + &instance_pods, + &create_service_app_name(&instance_id), + namespace, + svc_api.as_ref(), + ) + .await?; + + // Only redeploy Pods that are managed by the Akri Controller (controlled by an Instance OwnerReference) + if get_broker_pod_owner_kind(pod) == BrokerPodOwnerKind::Instance { + if let Ok(Some(instance)) = ctx.client.namespaced(namespace).get(&instance_id).await { + super::instance_action::handle_instance_change( + &instance, + &super::instance_action::InstanceAction::Update, + ctx, + ) + .await?; } - Ok(()) } + Ok(()) +} - /// This searches existing Pods to determine if there are - /// Services that need to be removed because they lack supporting - /// Pods. If any are found, the Service is removed. 
- async fn find_pods_and_cleanup_svc_if_unsupported( - &self, - instance_id: &str, - configuration_name: &str, - namespace: &str, - handle_instance_svc: bool, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!("find_pods_and_cleanup_svc_if_unsupported - enter"); - let (label, value) = if handle_instance_svc { - (AKRI_INSTANCE_LABEL_NAME, instance_id) - } else { - (AKRI_CONFIGURATION_LABEL_NAME, configuration_name) - }; +/// This determines if there are Services that need to be removed because +/// they lack supporting Pods. If any are found, the Service is removed. +async fn cleanup_svc_if_unsupported( + pods: &[Pod], + svc_name: &str, + namespace: &str, + svc_api: &dyn Api, +) -> anyhow::Result<()> { + // Find the number of non-Terminating pods, if there aren't any (the only pods that exist are Terminating), we should remove the associated services + let num_non_terminating_pods = pods.iter().filter(|&x| + match &x.status { + Some(status) => { + match &status.phase { + Some(phase) => { + trace!("cleanup_svc_if_unsupported - finding num_non_terminating_pods: pod:{:?} phase:{:?}", &x.metadata.name, &phase); + phase != "Terminating" && phase != "Failed" && phase != "Succeeded" + }, + _ => true, + } + }, + _ => true, + }).count(); + if num_non_terminating_pods == 0 { + trace!( + "cleanup_svc_if_unsupported - deleting service name={:?}, namespace={:?}", + &svc_name, + &namespace + ); + svc_api.delete(svc_name).await?; + } + Ok(()) +} - // Clean up instance service if there are no pods anymore - let selector = format!("{}={}", label, value); +/// This is called when a Pod enters the Running phase and ensures +/// that instance and configuration services are running as specified +/// by the configuration. 
+async fn handle_running_pod(pod: &Pod, ctx: Arc) -> anyhow::Result<()> { + trace!("handle_running_pod - enter"); + let namespace = pod + .metadata + .namespace + .as_ref() + .context("Namespace not found for pod")?; + let (instance_name, configuration_name) = get_instance_and_configuration_from_pod(pod)?; + let Some(configuration) = ctx + .client + .namespaced(namespace) + .get(&configuration_name) + .await? + else { + // In this scenario, a configuration has likely been deleted in the middle of handle_running_pod. + // There is no need to propogate the error and bring down the Controller. trace!( - "find_pods_and_cleanup_svc_if_unsupported - find_pods_with_label({})", - selector + "handle_running_pod - no configuration found for {}", + &configuration_name ); - let pods = kube_interface.find_pods_with_label(&selector).await?; + return Ok(()); + }; + let Some(instance): Option = + ctx.client.namespaced(namespace).get(&instance_name).await? + else { + // In this scenario, a instance has likely been deleted in the middle of handle_running_pod. trace!( - "find_pods_and_cleanup_svc_if_unsupported - found {} pods", - pods.items.len() + "handle_running_pod - no instance found for {}", + &instance_name ); + return Ok(()); + }; + let instance_uid = instance + .metadata + .uid + .as_ref() + .context("UID not found for instance")?; + add_instance_and_configuration_services( + &instance_name, + instance_uid, + namespace, + &configuration_name, + &configuration, + ctx, + ) + .await?; + + Ok(()) +} - let svc_name = service::create_service_app_name( - configuration_name, - instance_id, - "svc", - handle_instance_svc, +/// This creates the broker Service and the capability Service. 
+async fn add_instance_and_configuration_services( + instance_name: &str, + instance_uid: &str, + namespace: &str, + configuration_name: &str, + configuration: &Configuration, + ctx: Arc, +) -> anyhow::Result<()> { + trace!( + "add_instance_and_configuration_services - instance={:?}", + instance_name + ); + let api = ctx.client.namespaced(namespace); + if let Some(instance_service_spec) = &configuration.spec.instance_service_spec { + let ownership = OwnershipInfo::new( + OwnershipType::Instance, + instance_name.to_string(), + instance_uid.to_string(), ); - - self.cleanup_svc_if_unsupported(&pods.items, &svc_name, namespace, kube_interface) - .await - } - - /// This determines if there are Services that need to be removed because - /// they lack supporting Pods. If any are found, the Service is removed. - async fn cleanup_svc_if_unsupported( - &self, - pods: &PodSlice, - svc_name: &str, - svc_namespace: &str, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - // Find the number of non-Terminating pods, if there aren't any (the only pods that exist are Terminating), we should remove the device capability service - let num_non_terminating_pods = pods.iter().filter(|&x| - match &x.status { - Some(status) => { - match &status.phase { - Some(phase) => { - trace!("cleanup_svc_if_unsupported - finding num_non_terminating_pods: pod:{:?} phase:{:?}", &x.metadata.name, &phase); - phase != "Terminating" && phase != "Failed" && phase != "Succeeded" - }, - _ => true, - } - }, - _ => true, - }).count(); - trace!( - "cleanup_svc_if_unsupported - num_non_terminating_pods: {}", - num_non_terminating_pods + let mut labels: BTreeMap = BTreeMap::new(); + labels.insert( + AKRI_INSTANCE_LABEL_NAME.to_string(), + instance_name.to_string(), ); - - if num_non_terminating_pods == 0 { - trace!( - "cleanup_svc_if_unsupported - service::remove_service app_name={:?}, namespace={:?}", - &svc_name, &svc_namespace - ); - kube_interface - .remove_service(svc_name, svc_namespace) - 
.await?; - trace!("cleanup_svc_if_unsupported - service::remove_service succeeded"); - } - Ok(()) + let app_name = create_service_app_name(instance_name); + let instance_svc = create_new_service_from_spec( + &app_name, + namespace, + ownership.clone(), + instance_service_spec, + labels, + )?; + api.apply(instance_svc, &ctx.identifier).await?; } - - /// This is called when a Pod enters the Running phase and ensures - /// that isntance and configuration services are running as specified - /// by the configuration. - async fn handle_running_pod( - &self, - pod: &Pod, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!("handle_running_pod - enter"); - let namespace = pod.metadata.namespace.as_ref().ok_or_else(|| { - anyhow::anyhow!("Namespace not found for pod: {:?}", &pod.metadata.name) - })?; - let (instance_name, configuration_name) = - self.get_instance_and_configuration_from_pod(pod)?; - let configuration = match kube_interface - .find_configuration(&configuration_name, namespace) - .await - { - Ok(config) => config, - _ => { - // In this scenario, a configuration has likely been deleted in the middle of handle_running_pod. - // There is no need to propogate the error and bring down the Controller. - trace!( - "handle_running_pod - no configuration found for {}", - &configuration_name - ); - return Ok(()); - } - }; - let instance = match kube_interface - .find_instance(&instance_name, namespace) - .await - { - Ok(instance) => instance, - _ => { - // In this scenario, a instance has likely been deleted in the middle of handle_running_pod. - // There is no need to propogate the error and bring down the Controller. 
- trace!( - "handle_running_pod - no instance found for {}", - &instance_name - ); - return Ok(()); - } - }; - let instance_uid = instance + if let Some(configuration_service_spec) = &configuration.spec.configuration_service_spec { + let configuration_uid = configuration .metadata .uid .as_ref() - .ok_or_else(|| anyhow::anyhow!("UID not found for instance: {}", instance_name))?; - self.add_instance_and_configuration_services( - &instance_name, - instance_uid, + .context("UID not found for configuration")?; + let ownership = OwnershipInfo::new( + OwnershipType::Configuration, + configuration_name.to_string(), + configuration_uid.clone(), + ); + let mut labels: BTreeMap = BTreeMap::new(); + labels.insert( + AKRI_CONFIGURATION_LABEL_NAME.to_string(), + configuration_name.to_string(), + ); + let app_name = create_service_app_name(configuration_name); + let config_svc = create_new_service_from_spec( + &app_name, namespace, - &configuration_name, - &configuration, - kube_interface, - ) - .await?; - - Ok(()) + ownership.clone(), + configuration_service_spec, + labels, + )?; + // TODO: handle already exists error + api.apply(config_svc, &ctx.identifier).await?; } + Ok(()) +} - /// This creates new service or updates existing service with ownership. 
- #[allow(clippy::too_many_arguments)] - async fn create_or_update_service( - &self, - instance_name: &str, - configuration_name: &str, - namespace: &str, - label_name: &str, - label_value: &str, - ownership: OwnershipInfo, - service_spec: &ServiceSpec, - is_instance_service: bool, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!( - "create_or_update_service - instance={:?} with ownership:{:?}", - instance_name, - &ownership - ); +pub fn create_new_service_from_spec( + app_name: &str, + svc_namespace: &str, + ownership: OwnershipInfo, + svc_spec: &ServiceSpec, + mut labels: BTreeMap, +) -> anyhow::Result { + labels.insert(APP_LABEL_ID.to_string(), app_name.to_owned()); + labels.insert(CONTROLLER_LABEL_ID.to_string(), API_NAMESPACE.to_string()); + let owner_references: Vec = vec![OwnerReference { + api_version: ownership.get_api_version(), + kind: ownership.get_kind(), + controller: ownership.get_controller(), + block_owner_deletion: ownership.get_block_owner_deletion(), + name: ownership.get_name(), + uid: ownership.get_uid(), + }]; + + let mut spec = svc_spec.clone(); + let mut modified_selector: BTreeMap = spec.selector.unwrap_or_default(); + modified_selector.insert(CONTROLLER_LABEL_ID.to_string(), API_NAMESPACE.to_string()); + spec.selector = Some(modified_selector); + + let new_svc = Service { + spec: Some(spec), + metadata: ObjectMeta { + name: Some(app_name.to_owned()), + namespace: Some(svc_namespace.to_string()), + labels: Some(labels), + owner_references: Some(owner_references), + ..Default::default() + }, + ..Default::default() + }; - let mut create_new_service = true; - if let Ok(existing_svcs) = kube_interface - .find_services(&format!("{}={}", label_name, label_value)) - .await - { - for existing_svc in existing_svcs { - let mut existing_svc = existing_svc.clone(); - let svc_name = existing_svc.metadata.name.clone().ok_or_else(|| { - anyhow::format_err!("Service {:?} does not have name", existing_svc) - })?; - let 
svc_namespace = existing_svc.metadata.namespace.as_ref().unwrap().clone(); - trace!( - "create_or_update_service - Update existing svc={:?}", - &svc_name - ); - service::update_ownership(&mut existing_svc, ownership.clone(), true)?; - trace!("create_or_update_service - calling service::update_service name:{} namespace: {}", &svc_name, &svc_namespace); - kube_interface - .update_service(&existing_svc, &svc_name, &svc_namespace) - .await?; - trace!("create_or_update_service - service::update_service succeeded"); - create_new_service = false; - } - } + Ok(new_svc) +} - if create_new_service { - let new_instance_svc = service::create_new_service_from_spec( - namespace, - instance_name, - configuration_name, - ownership.clone(), - service_spec, - is_instance_service, - )?; - trace!( - "create_or_update_service - New instance svc spec={:?}", - new_instance_svc - ); +pub fn create_service_app_name(resource_name: &str) -> String { + let normalized = resource_name.replace('.', "-"); + format!("{}-{}", normalized, "svc") +} - kube_interface - .create_service(&new_instance_svc, namespace) - .await?; - trace!("create_or_update_service - service::create_service succeeded"); +#[cfg(test)] +mod tests { + use crate::util::shared_test_utils::mock_client::MockControllerKubeClient; + + use akri_shared::k8s::api::MockApi; + use kube::api::{ObjectList, TypeMeta}; + use mockall::Sequence; + + // use super::super::shared_test_utils::config_for_tests; + // use super::super::shared_test_utils::config_for_tests::PodList; + use super::*; + use k8s_openapi::api::core::v1::{Pod, PodSpec, PodStatus}; + use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; + + fn make_obj_list(items: Vec) -> ObjectList { + ObjectList { + types: TypeMeta { + api_version: "v1".to_string(), + kind: "List".to_string(), + }, + metadata: Default::default(), + items, } - Ok(()) } - /// This creates the broker Service and the capability Service. 
- async fn add_instance_and_configuration_services( - &self, - instance_name: &str, - instance_uid: &str, + fn make_configuration( + name: &str, namespace: &str, - configuration_name: &str, - configuration: &Configuration, - kube_interface: &impl KubeInterface, - ) -> anyhow::Result<()> { - trace!( - "add_instance_and_configuration_services - instance={:?}", - instance_name - ); - if let Some(instance_service_spec) = &configuration.spec.instance_service_spec { - let ownership = OwnershipInfo::new( - OwnershipType::Instance, - instance_name.to_string(), - instance_uid.to_string(), - ); - // Try up to MAX_INSTANCE_UPDATE_TRIES times to update/create/get instance - for x in 0..MAX_INSTANCE_UPDATE_TRIES { - match self - .create_or_update_service( - instance_name, - configuration_name, - namespace, - AKRI_INSTANCE_LABEL_NAME, - instance_name, - ownership.clone(), - instance_service_spec, - true, - kube_interface, - ) - .await - { - Ok(_) => break, - Err(e) => { - if x == (MAX_INSTANCE_UPDATE_TRIES - 1) { - return Err(e); - } - random_delay().await; + create_instance_svc: bool, + create_config_svc: bool, + ) -> Configuration { + let config = serde_json::json!({ + "apiVersion": "akri.sh/v0", + "kind": "Configuration", + "metadata": { + "name": name, + "namespace": namespace, + "uid": "e9fbe880-99da-47c1-bea3-5398f21ee747" + }, + "spec": { + "brokerSpec": { + "brokerPodSpec": { + "containers": [ + { + "image": "nginx:latest", + "name": "broker" + } + ] } - } + }, + "discoveryHandler": { + "name": "debugEcho", + "discoveryDetails": "{\"debugEcho\": {\"descriptions\":[\"filter1\", \"filter2\"]}}" + }, + "capacity": 5, + "brokerProperties": {} } } - - if let Some(configuration_service_spec) = &configuration.spec.configuration_service_spec { - let configuration_uid = configuration.metadata.uid.as_ref().ok_or_else(|| { - anyhow::anyhow!("UID not found for configuration: {}", configuration_name) - })?; - let ownership = OwnershipInfo::new( - OwnershipType::Configuration, - 
configuration_name.to_string(), - configuration_uid.clone(), - ); - // Try up to MAX_INSTANCE_UPDATE_TRIES times to update/create/get instance - for x in 0..MAX_INSTANCE_UPDATE_TRIES { - match self - .create_or_update_service( - instance_name, - configuration_name, - namespace, - AKRI_CONFIGURATION_LABEL_NAME, - configuration_name, - ownership.clone(), - configuration_service_spec, - false, - kube_interface, - ) - .await - { - Ok(_) => break, - Err(e) => { - if x == (MAX_INSTANCE_UPDATE_TRIES - 1) { - return Err(e); - } - random_delay().await; + ); + let mut config: Configuration = serde_json::from_value(config).unwrap(); + if create_config_svc { + config.spec.configuration_service_spec = serde_json::from_value(serde_json::json!({ + "ports": [ + { + "name": "http", + "port": 6052, + "protocol": "TCP", + "targetPort": 6052 } - } - } + ], + "type": "ClusterIP" + })) + .ok(); + } + if create_instance_svc { + config.spec.instance_service_spec = serde_json::from_value(serde_json::json!({ + "ports": [ + { + "name": "http", + "port": 6052, + "protocol": "TCP", + "targetPort": 6052 + } + ], + "type": "ClusterIP" + })) + .ok(); } - Ok(()) + config } -} -#[cfg(test)] -mod tests { - use super::super::shared_test_utils::config_for_tests; - use super::super::shared_test_utils::config_for_tests::PodList; - use super::*; - use akri_shared::{k8s::MockKubeInterface, os::file}; - use k8s_openapi::api::core::v1::PodSpec; - use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; + fn make_instance(name: &str, namespace: &str, config: &str) -> Instance { + let instance = serde_json::json!({ + "apiVersion": "akri.sh/v0", + "kind": "Instance", + "metadata": { + "name": name, + "namespace": namespace, + "uid": "e9fbe880-99da-47c1-bea3-5398f21ee747" + }, + "spec": { + "configurationName": config, + "capacity": 5, + "cdiName": "akri.sh/config-a=12345", + "nodes": [ "node-a" ], + "shared": true + } + }); + serde_json::from_value(instance).unwrap() + } - fn 
create_pods_with_phase(result_file: &'static str, specified_phase: &'static str) -> PodList { - let pods_json = file::read_file_to_string(result_file); - let phase_adjusted_json = pods_json.replace( - "\"phase\": \"Running\"", - &format!("\"phase\": \"{}\"", specified_phase), - ); - let pods: PodList = serde_json::from_str(&phase_adjusted_json).unwrap(); - pods + fn make_pod_with_owners_and_phase( + instance: &str, + config: &str, + phase: &str, + kind: &str, + ) -> Pod { + let owner_references: Vec = vec![OwnerReference { + kind: kind.to_string(), + controller: Some(true), + name: instance.to_string(), + ..Default::default() + }]; + make_pod_with_owner_references_and_phase(owner_references, instance, config, phase) } fn make_pod_with_owner_references(owner_references: Vec) -> Pod { @@ -709,6 +682,35 @@ mod tests { } } + fn make_pod_with_owner_references_and_phase( + owner_references: Vec, + instance: &str, + config: &str, + phase: &str, + ) -> Pod { + let pod_status = PodStatus { + phase: Some(phase.to_string()), + ..Default::default() + }; + let mut labels = BTreeMap::new(); + labels.insert( + AKRI_CONFIGURATION_LABEL_NAME.to_string(), + config.to_string(), + ); + labels.insert(AKRI_INSTANCE_LABEL_NAME.to_string(), instance.to_string()); + Pod { + spec: Some(PodSpec::default()), + metadata: ObjectMeta { + owner_references: Some(owner_references), + name: Some("test-pod".to_string()), + namespace: Some("test-ns".to_string()), + labels: Some(labels), + ..Default::default() + }, + status: Some(pod_status), + } + } + #[test] fn test_get_broker_pod_owner_kind_instance() { let owner_references: Vec = vec![OwnerReference { @@ -783,927 +785,319 @@ mod tests { ); } - // Test that watcher errors on restarts unless it is the first restart (aka initial startup) - #[tokio::test] - async fn test_handle_watcher_restart() { - let _ = env_logger::builder().is_test(true).try_init(); - let mut pod_watcher = BrokerPodWatcher::new(); - let mut first_event = true; - 
assert!(pod_watcher - .handle_pod( - Event::Restarted(Vec::new()), - &MockKubeInterface::new(), - &mut first_event - ) - .await - .is_ok()); - first_event = false; - assert!(pod_watcher - .handle_pod( - Event::Restarted(Vec::new()), - &MockKubeInterface::new(), - &mut first_event - ) - .await - .is_err()); - } - - #[tokio::test] - async fn test_handle_pod_added_unready() { - let _ = env_logger::builder().is_test(true).try_init(); - - for phase in &["Unknown", "Pending"] { - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - phase, - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - trace!( - "test_handle_pod_added_unready phase:{}, Event::Applied", - &phase - ); - pod_watcher - .handle_pod(Event::Applied(pod), &MockKubeInterface::new(), &mut false) - .await - .unwrap(); - trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(1, pod_watcher.known_pods.len()); - assert_eq!( - &PodState::Pending, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) - } + fn valid_instance_svc(instance_svc: &Service, instance_name: &str, namespace: &str) -> bool { + instance_svc.metadata.name.as_ref().unwrap() == &format!("{}-svc", instance_name) + && instance_svc.metadata.namespace.as_ref().unwrap() == namespace + && instance_svc + .metadata + .owner_references + .as_ref() + .unwrap() + .len() + == 1 + && instance_svc.metadata.owner_references.as_ref().unwrap()[0].kind == "Instance" + && instance_svc.metadata.owner_references.as_ref().unwrap()[0].name == instance_name + && instance_svc + .metadata + .labels + .as_ref() + .unwrap() + .get(AKRI_INSTANCE_LABEL_NAME) + .unwrap() + == instance_name } - #[tokio::test] - async fn test_handle_pod_modified_unready() { - let _ = env_logger::builder().is_test(true).try_init(); - - for phase in &["Unknown", "Pending"] { - let pod_list = create_pods_with_phase( - 
"../test/json/running-pod-list-for-config-a-local.json", - phase, - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - trace!( - "test_handle_pod_modified_unready phase:{}, Event::Applied", - &phase - ); - pod_watcher - .handle_pod(Event::Applied(pod), &MockKubeInterface::new(), &mut false) - .await - .unwrap(); - trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(1, pod_watcher.known_pods.len()); - assert_eq!( - &PodState::Pending, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) - } + fn valid_config_svc(config_svc: &Service, config_name: &str, namespace: &str) -> bool { + config_svc.metadata.name.as_ref().unwrap() == &format!("{}-svc", config_name) + && config_svc.metadata.namespace.as_ref().unwrap() == namespace + && config_svc.metadata.owner_references.as_ref().unwrap().len() == 1 + && config_svc.metadata.owner_references.as_ref().unwrap()[0].kind == "Configuration" + && config_svc.metadata.owner_references.as_ref().unwrap()[0].name == config_name + && config_svc + .metadata + .labels + .as_ref() + .unwrap() + .get(AKRI_CONFIGURATION_LABEL_NAME) + .unwrap() + == config_name } #[tokio::test] - async fn test_handle_pod_modified_ready() { + async fn test_reconcile_applied_unknown_phase() { let _ = env_logger::builder().is_test(true).try_init(); - - let pods_json = - file::read_file_to_string("../test/json/running-pod-list-for-config-a-local.json"); - let pod_list: PodList = serde_json::from_str(&pods_json).unwrap(); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - configure_for_handle_pod( - &mut mock, - &HandlePod { - running: Some(HandlePodRunning { - find_config_name: "config-a", - find_config_namespace: "config-a-namespace", - find_config_result: "../test/json/config-a.json", - find_config_error: false, - - find_instance_name: "config-a-b494b6", - 
find_instance_result: "../test/json/local-instance.json", - - find_instance_service: FindServices { - find_services_selector: "akri.sh/instance=config-a-b494b6", - find_services_result: "../test/json/empty-list.json", - find_services_error: false, - }, - new_instance_svc_name: "config-a-b494b6-svc", - - find_configuration_service: FindServices { - find_services_selector: "akri.sh/configuration=config-a", - find_services_result: "../test/json/empty-list.json", - find_services_error: false, - }, - new_configuration_svc_name: "config-a-svc", - }), - ended: None, - }, - ); - - pod_watcher - .handle_pod(Event::Applied(pod), &mock, &mut false) + let pod = + make_pod_with_owners_and_phase("instance_name", "copnfig_name", "Unknown", "Instance"); + let pod_name = pod.metadata.name.clone().unwrap(); + let ctx = Arc::new(ControllerContext::new( + Arc::new(MockControllerKubeClient::default()), + "test", + )); + reconcile_inner(Event::Apply(Arc::new(pod)), ctx.clone()) .await .unwrap(); - trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(1, pod_watcher.known_pods.len()); assert_eq!( - &PodState::Running, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) - } - - #[tokio::test] - async fn test_handle_pod_modified_ready_no_config() { - let _ = env_logger::builder().is_test(true).try_init(); - - let pods_json = - file::read_file_to_string("../test/json/running-pod-list-for-config-a-local.json"); - let pod_list: PodList = serde_json::from_str(&pods_json).unwrap(); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - configure_for_handle_pod( - &mut mock, - &HandlePod { - running: Some(HandlePodRunning { - find_config_name: "config-a", - find_config_namespace: "config-a-namespace", - find_config_result: "../test/json/config-a.json", - find_config_error: true, - - find_instance_name: "config-a-b494b6", - find_instance_result: 
"../test/json/local-instance.json", - - find_instance_service: FindServices { - find_services_selector: "akri.sh/instance=config-a-b494b6", - find_services_result: "../test/json/empty-list.json", - find_services_error: false, - }, - new_instance_svc_name: "config-a-b494b6-svc", - - find_configuration_service: FindServices { - find_services_selector: "akri.sh/configuration=config-a", - find_services_result: "../test/json/empty-list.json", - find_services_error: false, - }, - new_configuration_svc_name: "config-a-svc", - }), - ended: None, - }, + ctx.known_pods.read().await.get(&pod_name).unwrap(), + &PodState::Pending ); - - pod_watcher - .handle_pod(Event::Applied(pod), &mock, &mut false) - .await - .unwrap(); - trace!( - "test_handle_pod_modified_ready_no_config pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(1, pod_watcher.known_pods.len()); - assert_eq!( - &PodState::Running, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) } #[tokio::test] - async fn test_handle_pod_modified_failed() { + async fn test_reconcile_applied_pending_phase() { let _ = env_logger::builder().is_test(true).try_init(); - - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - "Failed", - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - configure_for_handle_pod( - &mut mock, - &HandlePod { - running: None, - ended: Some(CleanupServices { - find_svc_selector: "controller=akri.sh", - find_svc_result: "../test/json/running-svc-list-for-config-a-local.json", - cleanup_services: vec![ - CleanupService { - find_pod_selector: "akri.sh/configuration=config-a", - find_pod_result: "../test/json/empty-list.json", - remove_service: Some(RemoveService { - remove_service_name: "config-a-svc", - remove_service_namespace: "config-a-namespace", - }), - }, - CleanupService { - find_pod_selector: "akri.sh/instance=config-a-b494b6", - 
find_pod_result: "../test/json/empty-list.json", - remove_service: Some(RemoveService { - remove_service_name: "config-a-b494b6-svc", - remove_service_namespace: "config-a-namespace", - }), - }, - ], - find_instance_id: "config-a-b494b6", - find_instance_namespace: "config-a-namespace", - find_instance_result: "", - find_instance_result_error: true, - }), - }, - ); - - pod_watcher - .handle_pod(Event::Applied(pod), &mock, &mut false) + let pod = + make_pod_with_owners_and_phase("instance_name", "config_name", "Pending", "Instance"); + let pod_name = pod.metadata.name.clone().unwrap(); + let ctx = Arc::new(ControllerContext::new( + Arc::new(MockControllerKubeClient::default()), + "test", + )); + reconcile_inner(Event::Apply(Arc::new(pod)), ctx.clone()) .await .unwrap(); - trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(1, pod_watcher.known_pods.len()); assert_eq!( - &PodState::Ended, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) + ctx.known_pods.read().await.get(&pod_name).unwrap(), + &PodState::Pending + ); } + // If the pod is in a running state and was previously running, do nothing #[tokio::test] - async fn test_handle_pod_deleted() { + async fn test_reconcile_applied_running_phase_previously_known() { let _ = env_logger::builder().is_test(true).try_init(); - - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - "Failed", - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - configure_for_handle_pod( - &mut mock, - &HandlePod { - running: None, - ended: Some(CleanupServices { - find_svc_selector: "controller=akri.sh", - find_svc_result: "../test/json/running-svc-list-for-config-a-local.json", - cleanup_services: vec![ - CleanupService { - find_pod_selector: "akri.sh/configuration=config-a", - find_pod_result: "../test/json/empty-list.json", - 
remove_service: Some(RemoveService { - remove_service_name: "config-a-svc", - remove_service_namespace: "config-a-namespace", - }), - }, - CleanupService { - find_pod_selector: "akri.sh/instance=config-a-b494b6", - find_pod_result: "../test/json/empty-list.json", - remove_service: Some(RemoveService { - remove_service_name: "config-a-b494b6-svc", - remove_service_namespace: "config-a-namespace", - }), - }, - ], - find_instance_id: "config-a-b494b6", - find_instance_namespace: "config-a-namespace", - find_instance_result: "", - find_instance_result_error: true, - }), - }, - ); - - pod_watcher - .handle_pod(Event::Deleted(pod), &mock, &mut false) + let pod = + make_pod_with_owners_and_phase("instance_name", "config_name", "Running", "Instance"); + let pod_name = pod.metadata.name.clone().unwrap(); + let ctx = Arc::new(ControllerContext::new( + Arc::new(MockControllerKubeClient::default()), + "test", + )); + ctx.known_pods + .write() + .await + .insert(pod_name.clone(), PodState::Running); + reconcile_inner(Event::Apply(Arc::new(pod)), ctx.clone()) .await .unwrap(); - trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(1, pod_watcher.known_pods.len()); assert_eq!( - &PodState::Deleted, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) + ctx.known_pods.read().await.get(&pod_name).unwrap(), + &PodState::Running + ); } + // If the pod is in a running state and is not in known nodes, ensure services + // are created #[tokio::test] - async fn test_handle_pod_succeeded() { + async fn test_reconcile_applied_running_phase_unknown() { let _ = env_logger::builder().is_test(true).try_init(); + let pod = + make_pod_with_owners_and_phase("instance_name", "config_name", "Running", "Instance"); + let pod_name = pod.metadata.name.clone().unwrap(); + let mut mock = MockControllerKubeClient::default(); + let mut mock_config_api: MockApi = MockApi::new(); + mock_config_api.expect_get().return_once(|_| { + 
Ok(Some(make_configuration( + "config_name", + "test-ns", + true, + true, + ))) + }); + mock.config + .expect_namespaced() + .return_once(|_| Box::new(mock_config_api)) + .with(mockall::predicate::eq("test-ns")); + let mut mock_instance_api: MockApi = MockApi::new(); + mock_instance_api.expect_get().return_once(|_| { + Ok(Some(make_instance( + "instance_name", + "test-ns", + "config_name", + ))) + }); + mock.instance + .expect_namespaced() + .return_once(|_| Box::new(mock_instance_api)) + .with(mockall::predicate::eq("test-ns")); + + let mut mock_svc_api: MockApi = MockApi::new(); + let mut seq = Sequence::new(); + mock_svc_api + .expect_apply() + .times(1) + .in_sequence(&mut seq) + .return_once(|_, _| Ok(Service::default())) + .withf_st(move |x, _| valid_instance_svc(x, "instance_name", "test-ns")); + mock_svc_api + .expect_apply() + .times(1) + .in_sequence(&mut seq) + .return_once(|_, _| Ok(Service::default())) + .withf_st(move |x, _| valid_config_svc(x, "config_name", "test-ns")); + mock.service + .expect_namespaced() + .return_once(|_| Box::new(mock_svc_api)) + .with(mockall::predicate::eq("test-ns")); - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - "Succeeded", - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - configure_for_handle_pod( - &mut mock, - &HandlePod { - running: None, - ended: Some(CleanupServices { - find_svc_selector: "controller=akri.sh", - find_svc_result: "../test/json/running-svc-list-for-config-a-local.json", - cleanup_services: vec![ - CleanupService { - find_pod_selector: "akri.sh/configuration=config-a", - find_pod_result: "../test/json/empty-list.json", - remove_service: Some(RemoveService { - remove_service_name: "config-a-svc", - remove_service_namespace: "config-a-namespace", - }), - }, - CleanupService { - find_pod_selector: "akri.sh/instance=config-a-b494b6", - find_pod_result: 
"../test/json/empty-list.json", - remove_service: Some(RemoveService { - remove_service_name: "config-a-b494b6-svc", - remove_service_namespace: "config-a-namespace", - }), - }, - ], - find_instance_id: "config-a-b494b6", - find_instance_namespace: "config-a-namespace", - find_instance_result: "", - find_instance_result_error: true, - }), - }, - ); + let ctx = Arc::new(ControllerContext::new(Arc::new(mock), "test")); - pod_watcher - .handle_pod(Event::Applied(pod), &mock, &mut false) + reconcile_inner(Event::Apply(Arc::new(pod)), ctx.clone()) .await .unwrap(); - trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(1, pod_watcher.known_pods.len()); assert_eq!( - &PodState::Ended, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) + ctx.known_pods.read().await.get(&pod_name).unwrap(), + &PodState::Running + ); } - #[tokio::test] - async fn test_handle_pod_add_or_modify_unknown_phase() { - let _ = env_logger::builder().is_test(true).try_init(); - - let phase = "Foo"; - { - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - phase, - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - trace!( - "test_handle_pod_added_unready phase:{}, Event::Applied", - &phase - ); - pod_watcher - .handle_pod(Event::Applied(pod), &MockKubeInterface::new(), &mut false) - .await - .unwrap(); - trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(0, pod_watcher.known_pods.len()); - } - { - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - phase, - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - trace!( - "test_handle_pod_added_unready phase:{}, Event::Applied", - &phase - ); - pod_watcher - .handle_pod(Event::Applied(pod), &MockKubeInterface::new(), &mut false) - .await - .unwrap(); - 
trace!( - "test_handle_pod_added_unready pod_watcher:{:?}", - &pod_watcher - ); - assert_eq!(0, pod_watcher.known_pods.len()); + fn controller_ctx_for_handle_ended_pod_if_needed( + pod_list: ObjectList, + delete_config_svc: bool, + ) -> ControllerContext { + let mut mock = MockControllerKubeClient::default(); + let mut mock_config_api: MockApi = MockApi::new(); + mock_config_api.expect_get().return_once(|_| { + Ok(Some(make_configuration( + "config_name", + "test-ns", + true, + true, + ))) + }); + mock.config + .expect_namespaced() + .return_once(|_| Box::new(mock_config_api)) + .with(mockall::predicate::eq("test-ns")); + let mut mock_instance_api: MockApi = MockApi::new(); + mock_instance_api.expect_get().return_once(|_| { + Ok(Some(make_instance( + "instance_name", + "test-ns", + "config_name", + ))) + }); + mock.instance + .expect_namespaced() + .return_once(|_| Box::new(mock_instance_api)) + .with(mockall::predicate::eq("test-ns")); + let mut mock_pod_api: MockApi = MockApi::new(); + mock_pod_api.expect_list().return_once(|_| Ok(pod_list)); + mock.pod.expect_all().return_once(|| Box::new(mock_pod_api)); + + let mut mock_svc_api: MockApi = MockApi::new(); + let mut seq = Sequence::new(); + if delete_config_svc { + mock_svc_api + .expect_delete() + .times(1) + .in_sequence(&mut seq) + .return_once(|_| Ok(either::Left(Service::default()))) + .withf_st(move |x| x == create_service_app_name("config_name")); } + mock_svc_api + .expect_delete() + .times(1) + .in_sequence(&mut seq) + .return_once(|_| Ok(either::Left(Service::default()))) + .withf_st(move |x| x == create_service_app_name("instance_name")); + mock.service + .expect_namespaced() + .return_once(|_| Box::new(mock_svc_api)) + .with(mockall::predicate::eq("test-ns")); + ControllerContext::new(Arc::new(mock), "test") } - #[tokio::test] - async fn test_handle_running_pod_if_needed_do_nothing() { + async fn test_reconcile_applied_terminated_phases(phase: &str) { let _ = 
env_logger::builder().is_test(true).try_init(); - - let pods_json = - file::read_file_to_string("../test/json/running-pod-list-for-config-a-local.json"); - let pod_list: PodList = serde_json::from_str(&pods_json).unwrap(); - let pod = pod_list.items.first().unwrap(); - - let mut pod_watcher = BrokerPodWatcher::new(); - pod_watcher - .known_pods - .insert("config-a-b494b6-pod".to_string(), PodState::Running); - pod_watcher - .handle_running_pod_if_needed(pod, &MockKubeInterface::new()) + // NOTE: setting Job kind for the Pod owner to ensure `handle_instance_change` is not called + let pod1 = make_pod_with_owners_and_phase("instance_name", "config_name", phase, "Job"); + let pod_name = pod1.metadata.name.clone().unwrap(); + // Unrelated pod that should be filtered out + let pod2 = make_pod_with_owners_and_phase("foo", "config_name", phase, "Job"); + let pod_list = make_obj_list(vec![pod1.clone(), pod2]); + let ctx = Arc::new(controller_ctx_for_handle_ended_pod_if_needed( + pod_list, true, + )); + // Configure the pod as previously running + ctx.known_pods + .write() .await - .unwrap(); - assert_eq!(1, pod_watcher.known_pods.len()); - assert_eq!( - &PodState::Running, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) - } - - #[tokio::test] - async fn test_handle_ended_pod_if_needed_do_nothing() { - let _ = env_logger::builder().is_test(true).try_init(); - - let pods_json = - file::read_file_to_string("../test/json/running-pod-list-for-config-a-local.json"); - let pod_list: PodList = serde_json::from_str(&pods_json).unwrap(); - let pod = pod_list.items.first().unwrap(); - - let mut pod_watcher = BrokerPodWatcher::new(); - pod_watcher - .known_pods - .insert("config-a-b494b6-pod".to_string(), PodState::Ended); - pod_watcher - .handle_ended_pod_if_needed(pod, &MockKubeInterface::new()) + .insert(pod_name.clone(), PodState::Running); + reconcile_inner(Event::Apply(Arc::new(pod1)), ctx.clone()) .await .unwrap(); - assert_eq!(1, pod_watcher.known_pods.len()); 
assert_eq!( - &PodState::Ended, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) + ctx.known_pods.read().await.get(&pod_name).unwrap(), + &PodState::Ended + ); } + // If the pod is in a succeeded state and was previously known, ensure services + // are deleted #[tokio::test] - async fn test_handle_deleted_pod_if_needed_do_nothing() { - let _ = env_logger::builder().is_test(true).try_init(); - - let pods_json = - file::read_file_to_string("../test/json/running-pod-list-for-config-a-local.json"); - let pod_list: PodList = serde_json::from_str(&pods_json).unwrap(); - let pod = pod_list.items.first().unwrap(); - - let mut pod_watcher = BrokerPodWatcher::new(); - pod_watcher - .known_pods - .insert("config-a-b494b6-pod".to_string(), PodState::Deleted); - pod_watcher - .handle_deleted_pod_if_needed(pod, &MockKubeInterface::new()) - .await - .unwrap(); - assert_eq!(1, pod_watcher.known_pods.len()); - assert_eq!( - &PodState::Deleted, - pod_watcher.known_pods.get("config-a-b494b6-pod").unwrap() - ) - } - - #[test] - fn test_get_pod_phase() { - let _ = env_logger::builder().is_test(true).try_init(); - - for phase in &[ - "Unknown", - "Pending", - "Running", - "Ended", - "Failed", - "Succeeded", - "Foo", - "", - ] { - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - phase, - ); - let pod = pod_list.items.first().unwrap().clone(); - let mut pod_watcher = BrokerPodWatcher::new(); - - assert_eq!(phase.to_string(), pod_watcher.get_pod_phase(&pod)); - } - - { - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - "Foo", - ); - let mut pod = pod_list.items.first().unwrap().clone(); - pod.status = None; - - let mut pod_watcher = BrokerPodWatcher::new(); - - assert_eq!("Unknown", pod_watcher.get_pod_phase(&pod)); - } + async fn test_reconcile_applied_succeeded_phase() { + test_reconcile_applied_terminated_phases("Succeeded").await; } + // If the pod is in a failed state 
and was previously known, ensure services + // are deleted #[tokio::test] - async fn test_cleanup_svc_if_unsupported() { - let _ = env_logger::builder().is_test(true).try_init(); - let watcher = BrokerPodWatcher::new(); - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - "Succeeded", - ); - let pod = pod_list.items.first().unwrap().to_owned(); - let svc_name = "some-service"; - let svc_namespace = "default"; - let mut mock = MockKubeInterface::new(); - mock.expect_remove_service() - .times(1) - .returning(|_, _| Ok(())); - watcher - .cleanup_svc_if_unsupported(&[pod], svc_name, svc_namespace, &mock) - .await - .unwrap(); - } - - #[test] - fn test_get_instance_and_configuration_from_pod() { - let _ = env_logger::builder().is_test(true).try_init(); - - let pod_list = create_pods_with_phase( - "../test/json/running-pod-list-for-config-a-local.json", - "Foo", - ); - let orig_pod = pod_list.items.first().unwrap(); - - let pod_watcher = BrokerPodWatcher::new(); - assert!(pod_watcher - .get_instance_and_configuration_from_pod(orig_pod) - .is_ok()); - - let mut instanceless_pod = orig_pod.clone(); - instanceless_pod - .metadata - .labels - .as_mut() - .unwrap() - .remove(AKRI_INSTANCE_LABEL_NAME); - assert!(pod_watcher - .get_instance_and_configuration_from_pod(&instanceless_pod) - .is_err()); - - let mut configurationless_pod = orig_pod.clone(); - configurationless_pod - .metadata - .labels - .as_mut() - .unwrap() - .remove(AKRI_CONFIGURATION_LABEL_NAME); - assert!(pod_watcher - .get_instance_and_configuration_from_pod(&configurationless_pod) - .is_err()); + async fn test_reconcile_applied_failed_phase() { + test_reconcile_applied_terminated_phases("Failed").await; } #[tokio::test] - async fn test_create_or_update_service_successful_update() { + async fn test_reconcile_applied_failed_phase_pods_with_pods_not_terminating() { let _ = env_logger::builder().is_test(true).try_init(); - - let config_json = 
file::read_file_to_string("../test/json/config-a.json"); - let config: Configuration = serde_json::from_str(&config_json).unwrap(); - - let pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - config_for_tests::configure_find_services( - &mut mock, - "akri.sh/instance=config-a-b494b6", - "../test/json/running-instance-svc-list-for-config-a-local.json", - false, - ); - config_for_tests::configure_update_service( - &mut mock, - "node-a-config-a-b494b6-svc", - "config-a-namespace", - false, - ); - let ownership = OwnershipInfo::new( - OwnershipType::Instance, - "object".to_string(), - "object_uid".to_string(), - ); - pod_watcher - .create_or_update_service( - "config-a-b494b6", - "config-a", - "config-a-namespace", - AKRI_INSTANCE_LABEL_NAME, - "config-a-b494b6", - ownership, - &config.spec.instance_service_spec.unwrap().clone(), - true, - &mock, - ) + // NOTE: setting Job kind for the Pod owner to ensure `handle_instance_change` is not called + let pod1 = make_pod_with_owners_and_phase("instance_name", "config_name", "Failed", "Job"); + let pod_name = pod1.metadata.name.clone().unwrap(); + // Have one pod of the config still running to ensure that the config service is not deleted + let pod2 = make_pod_with_owners_and_phase("foo", "config_name", "Running", "Job"); + let pod_list = make_obj_list(vec![pod1.clone(), pod2]); + let ctx = Arc::new(controller_ctx_for_handle_ended_pod_if_needed( + pod_list, false, + )); + // Configure the pod as previously running + ctx.known_pods + .write() + .await + .insert(pod_name.clone(), PodState::Running); + reconcile_inner(Event::Apply(Arc::new(pod1)), ctx.clone()) .await .unwrap(); - } - - #[tokio::test] - async fn test_create_or_update_service_failed_update() { - let _ = env_logger::builder().is_test(true).try_init(); - - let config_json = file::read_file_to_string("../test/json/config-a.json"); - let config: Configuration = serde_json::from_str(&config_json).unwrap(); - - let pod_watcher = 
BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - config_for_tests::configure_find_services( - &mut mock, - "akri.sh/instance=config-a-b494b6", - "../test/json/running-instance-svc-list-for-config-a-local.json", - false, - ); - config_for_tests::configure_update_service( - &mut mock, - "node-a-config-a-b494b6-svc", - "config-a-namespace", - true, - ); - let ownership = OwnershipInfo::new( - OwnershipType::Instance, - "object".to_string(), - "object_uid".to_string(), + assert_eq!( + ctx.known_pods.read().await.get(&pod_name).unwrap(), + &PodState::Ended ); - - assert!(pod_watcher - .create_or_update_service( - "config-a-b494b6", - "config-a", - "config-a-namespace", - AKRI_INSTANCE_LABEL_NAME, - "config-a-b494b6", - ownership, - &config.spec.instance_service_spec.unwrap().clone(), - true, - &mock - ) - .await - .is_err()); } #[tokio::test] - async fn test_create_or_update_service_successful_create() { + async fn test_reconcile_cleanup() { let _ = env_logger::builder().is_test(true).try_init(); - - let config_json = file::read_file_to_string("../test/json/config-a.json"); - let config: Configuration = serde_json::from_str(&config_json).unwrap(); - - let pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - config_for_tests::configure_find_services( - &mut mock, - "akri.sh/instance=config-a-b494b6", - "../test/json/empty-list.json", - false, - ); - config_for_tests::configure_add_service( - &mut mock, - "config-a-b494b6-svc", - "config-a-namespace", - AKRI_INSTANCE_LABEL_NAME, - "config-a-b494b6", - ); - let ownership = OwnershipInfo::new( - OwnershipType::Instance, - "object".to_string(), - "object_uid".to_string(), - ); - - pod_watcher - .create_or_update_service( - "config-a-b494b6", - "config-a", - "config-a-namespace", - AKRI_INSTANCE_LABEL_NAME, - "config-a-b494b6", - ownership, - &config.spec.instance_service_spec.unwrap().clone(), - true, - &mock, - ) + let phase = "Succeeded"; + // NOTE: setting Job kind for 
the Pod owner to ensure `handle_instance_change` is not called + let pod1 = make_pod_with_owners_and_phase("instance_name", "config_name", phase, "Job"); + let pod_name = pod1.metadata.name.clone().unwrap(); + // Unrelated pod that should be filtered out + let pod2 = make_pod_with_owners_and_phase("foo", "config_name", phase, "Job"); + let pod_list = make_obj_list(vec![pod1.clone(), pod2]); + let ctx = Arc::new(controller_ctx_for_handle_ended_pod_if_needed( + pod_list, true, + )); + // Configure the pod as previously running + ctx.known_pods + .write() .await - .unwrap(); - } - - #[tokio::test] - async fn test_create_or_update_service_failed_create() { - let _ = env_logger::builder().is_test(true).try_init(); - - let config_json = file::read_file_to_string("../test/json/config-a.json"); - let config: Configuration = serde_json::from_str(&config_json).unwrap(); - - let pod_watcher = BrokerPodWatcher::new(); - let mut mock = MockKubeInterface::new(); - config_for_tests::configure_find_services( - &mut mock, - "akri.sh/instance=config-a-b494b6", - "../test/json/empty-list.json", - false, - ); - mock.expect_create_service() - .returning(move |_, _| Err(anyhow::anyhow!("Failure"))); - - let ownership = OwnershipInfo::new( - OwnershipType::Instance, - "object".to_string(), - "object_uid".to_string(), - ); - - assert!(pod_watcher - .create_or_update_service( - "config-a-b494b6", - "config-a", - "config-a-namespace", - AKRI_INSTANCE_LABEL_NAME, - "config-a-b494b6", - ownership, - &config.spec.instance_service_spec.unwrap().clone(), - true, - &mock - ) + .insert(pod_name.clone(), PodState::Running); + reconcile_inner(Event::Cleanup(Arc::new(pod1)), ctx.clone()) .await - .is_err()); - } - - #[derive(Clone)] - struct RemoveService { - remove_service_name: &'static str, - remove_service_namespace: &'static str, - } - - #[derive(Clone)] - struct CleanupService { - find_pod_selector: &'static str, - find_pod_result: &'static str, - remove_service: Option, - } - - 
#[derive(Clone)] - struct CleanupServices { - // This field is used for testing - #[allow(dead_code)] - find_svc_selector: &'static str, - // This field is used for testing - #[allow(dead_code)] - find_svc_result: &'static str, - cleanup_services: Vec, - find_instance_id: &'static str, - find_instance_namespace: &'static str, - find_instance_result: &'static str, - find_instance_result_error: bool, - } - - fn configure_for_cleanup_broker_and_configuration_svcs( - mock: &mut MockKubeInterface, - work: &CleanupServices, - ) { - for i in 0..work.cleanup_services.len() { - let cleanup_service = &work.cleanup_services[i]; - config_for_tests::configure_find_pods( - mock, - cleanup_service.find_pod_selector, - cleanup_service.find_pod_result, - false, - ); - if let Some(remove_service) = &cleanup_service.remove_service { - config_for_tests::configure_remove_service( - mock, - remove_service.remove_service_name, - remove_service.remove_service_namespace, - ); - } - } - - config_for_tests::configure_find_instance( - mock, - work.find_instance_id, - work.find_instance_namespace, - work.find_instance_result, - work.find_instance_result_error, - ); - } - - #[derive(Clone)] - struct FindServices { - find_services_selector: &'static str, - find_services_result: &'static str, - find_services_error: bool, - } - - #[derive(Clone)] - struct HandlePodRunning { - find_config_name: &'static str, - find_config_namespace: &'static str, - find_config_result: &'static str, - find_config_error: bool, - - find_instance_name: &'static str, - find_instance_result: &'static str, - - find_instance_service: FindServices, - new_instance_svc_name: &'static str, - - find_configuration_service: FindServices, - new_configuration_svc_name: &'static str, - } - - fn configure_for_running_pod_work(mock: &mut MockKubeInterface, work: &HandlePodRunning) { - config_for_tests::configure_find_config( - mock, - work.find_config_name, - work.find_config_namespace, - work.find_config_result, - 
work.find_config_error, + .unwrap(); + assert_eq!( + ctx.known_pods.read().await.get(&pod_name).unwrap(), + &PodState::Deleted ); - if !work.find_config_error { - config_for_tests::configure_find_instance( - mock, - work.find_instance_name, - work.find_config_namespace, - work.find_instance_result, - false, - ); - - config_for_tests::configure_find_services( - mock, - work.find_instance_service.find_services_selector, - work.find_instance_service.find_services_result, - work.find_instance_service.find_services_error, - ); - if work.find_instance_service.find_services_error { - config_for_tests::configure_update_service( - mock, - work.new_instance_svc_name, - work.find_config_namespace, - false, - ); - } else { - config_for_tests::configure_add_service( - mock, - work.new_instance_svc_name, - work.find_config_namespace, - AKRI_INSTANCE_LABEL_NAME, - work.find_instance_name, - ); - } - - config_for_tests::configure_find_services( - mock, - work.find_configuration_service.find_services_selector, - work.find_configuration_service.find_services_result, - work.find_configuration_service.find_services_error, - ); - if work.find_configuration_service.find_services_error { - config_for_tests::configure_update_service( - mock, - work.new_configuration_svc_name, - work.find_config_namespace, - false, - ); - } else { - config_for_tests::configure_add_service( - mock, - work.new_configuration_svc_name, - work.find_config_namespace, - AKRI_CONFIGURATION_LABEL_NAME, - work.find_config_name, - ); - } - } } - #[derive(Clone)] - struct HandlePod { - running: Option, - ended: Option, - } - - fn configure_for_handle_pod(mock: &mut MockKubeInterface, handle_pod: &HandlePod) { - if let Some(running) = &handle_pod.running { - configure_for_running_pod_work(mock, running); - } - - if let Some(ended) = &handle_pod.ended { - configure_for_cleanup_broker_and_configuration_svcs(mock, ended); - } - } + // TODO: directly test cleanup_svc_if_unsupported } diff --git 
a/controller/src/util/shared_test_utils.rs b/controller/src/util/shared_test_utils.rs index 09ac39735..8b29bc7d3 100644 --- a/controller/src/util/shared_test_utils.rs +++ b/controller/src/util/shared_test_utils.rs @@ -1,264 +1,177 @@ #[cfg(test)] pub mod config_for_tests { - use akri_shared::{ - akri::{ - configuration::Configuration, - instance::{Instance, InstanceList, InstanceSpec}, - }, - k8s::MockKubeInterface, - os::file, - }; - use k8s_openapi::api::core::v1::{Pod, Service}; + use super::mock_client::MockControllerKubeClient; + use akri_shared::{akri::configuration::Configuration, k8s::api::MockApi, os::file}; + use chrono::DateTime; + use k8s_openapi::api::core::v1::Pod; use kube::api::ObjectList; use log::trace; pub type PodList = ObjectList; - pub type ServiceList = ObjectList; - - pub fn configure_find_instance( - mock: &mut MockKubeInterface, - instance_name: &'static str, - instance_namespace: &'static str, - result_file: &'static str, - result_error: bool, - ) { - trace!("mock.expect_find_instance instance_name:{}", instance_name); - mock.expect_find_instance() - .times(1) - .withf(move |name, namespace| name == instance_name && namespace == instance_namespace) - .returning(move |_, _| { - if result_error { - // Return error that instance could not be found - Err(anyhow::anyhow!(kube::Error::Api( - kube::error::ErrorResponse { - status: "Failure".to_string(), - message: "instances.akri.sh \"akri-blah-901a7b\" not found".to_string(), - reason: "NotFound".to_string(), - code: akri_shared::k8s::ERROR_NOT_FOUND, - } - ))) - } else { - let instance_json = file::read_file_to_string(result_file); - let instance: Instance = serde_json::from_str(&instance_json).unwrap(); - Ok(instance) - } - }); - } - - const LIST_PREFIX: &str = r#" -{ - "apiVersion": "v1", - "items": ["#; - const LIST_SUFFIX: &str = r#" - ], - "kind": "List", - "metadata": { - "resourceVersion": "", - "selfLink": "" - } -}"#; - fn listify_kube_object(node_json: &str) -> String { - 
format!("{}\n{}\n{}", LIST_PREFIX, node_json, LIST_SUFFIX) - } - - pub fn configure_get_instances( - mock: &mut MockKubeInterface, - result_file: &'static str, - listify_result: bool, - ) { - trace!("mock.expect_get_instances namespace:None"); - mock.expect_get_instances().times(1).returning(move || { - let json = file::read_file_to_string(result_file); - let instance_list_json = if listify_result { - listify_kube_object(&json) - } else { - json - }; - let list: InstanceList = serde_json::from_str(&instance_list_json).unwrap(); - Ok(list) - }); - } - - pub fn configure_update_instance( - mock: &mut MockKubeInterface, - instance_to_update: InstanceSpec, - instance_name: &'static str, - instance_namespace: &'static str, - result_error: bool, - ) { - trace!( - "mock.expect_update_instance name:{} namespace:{} error:{}", - instance_name, - instance_namespace, - result_error - ); - mock.expect_update_instance() - .times(1) - .withf(move |instance, name, namespace| { - name == instance_name - && namespace == instance_namespace - && instance.nodes == instance_to_update.nodes - && instance.device_usage == instance_to_update.device_usage - }) - .returning(move |_, _, _| { - if result_error { - Err(None.ok_or_else(|| anyhow::anyhow!("failure"))?) - } else { - Ok(()) - } - }); - } pub fn configure_find_config( - mock: &mut MockKubeInterface, + mock: &mut MockControllerKubeClient, config_name: &'static str, config_namespace: &'static str, result_file: &'static str, result_error: bool, ) { trace!("mock.expect_find_configuration config_name:{}", config_name); - mock.expect_find_configuration() - .times(1) - .withf(move |name, namespace| name == config_name && namespace == config_namespace) - .returning(move |_, _| { - if result_error { - Err(None.ok_or_else(|| anyhow::anyhow!("failure"))?) 
- } else { - let config_json = file::read_file_to_string(result_file); - let config: Configuration = serde_json::from_str(&config_json).unwrap(); - Ok(config) - } - }); + mock.config + .expect_namespaced() + .return_once(move |_| { + let mut mock_api: MockApi = MockApi::new(); + mock_api + .expect_get() + .times(1) + .withf(move |name| name == config_name) + .returning(move |_| { + if result_error { + Err(kube::Error::Api(kube::error::ErrorResponse { + status: "Failure".to_string(), + message: format!("configurations.akri.sh {config_name} not found"), + reason: "NotFound".to_string(), + code: akri_shared::k8s::ERROR_NOT_FOUND, + })) + } else { + let config_json = file::read_file_to_string(result_file); + let config: Configuration = serde_json::from_str(&config_json).unwrap(); + Ok(Some(config)) + } + }); + Box::new(mock_api) + }) + .withf(move |namespace| namespace == config_namespace); } - pub fn configure_find_services( - mock: &mut MockKubeInterface, - svc_selector: &'static str, + pub fn configure_find_pods( + mock_api: &mut MockApi, + pod_selector: &'static str, + _namespace: &'static str, result_file: &'static str, result_error: bool, ) { - trace!("mock.expect_find_services svc_selector:{}", svc_selector); - mock.expect_find_services() + trace!( + "mock.expect_find_pods_with_label pod_selector:{}", + pod_selector + ); + mock_api + .expect_list() .times(1) - .withf(move |selector| selector == svc_selector) + .withf(move |lp| lp.label_selector.as_ref().unwrap_or(&String::new()) == pod_selector) .returning(move |_| { if result_error { - Err(None.ok_or_else(|| anyhow::anyhow!("failure"))?) 
+ Err(kube::Error::Api(kube::error::ErrorResponse { + status: "Failure".to_string(), + message: format!("pods {pod_selector} not found"), + reason: "NotFound".to_string(), + code: akri_shared::k8s::ERROR_NOT_FOUND, + })) } else { - let svcs_json = file::read_file_to_string(result_file); - let svcs: ServiceList = serde_json::from_str(&svcs_json).unwrap(); - Ok(svcs) + let pods_json = file::read_file_to_string(result_file); + let pods: PodList = serde_json::from_str(&pods_json).unwrap(); + Ok(pods) } }); } - pub fn configure_add_service( - mock: &mut MockKubeInterface, - svc_name: &'static str, - namespace: &'static str, - label_id: &'static str, - label_value: &'static str, - ) { - trace!( - "mock.expect_create_service name:{}, namespace:{}, [{}={}]", - &svc_name, - &namespace, - &label_id, - &label_value - ); - mock.expect_create_service() - .withf(move |svc_to_create, ns| { - svc_to_create.metadata.name.as_ref().unwrap() == svc_name - && svc_to_create - .metadata - .labels - .as_ref() - .unwrap() - .get(label_id) - .unwrap() - == label_value - && ns == namespace - }) - .returning(move |_, _| Ok(())); - } - pub fn configure_remove_service( - mock: &mut MockKubeInterface, - svc_name: &'static str, - svc_namespace: &'static str, + pub fn configure_find_pods_with_phase( + mock_api: &mut MockApi, + pod_selector: &'static str, + result_file: &'static str, + specified_phase: &'static str, ) { trace!( - "mock.expect_remove_service svc_name:{}, svc_namespace={}", - svc_name, - svc_namespace + "mock.expect_find_pods_with_label pod_selector:{}", + pod_selector ); - mock.expect_remove_service() + mock_api + .expect_list() .times(1) - .withf(move |svc_to_remove, namespace| { - svc_to_remove == svc_name && namespace == svc_namespace - }) - .returning(move |_, _| Ok(())); + .withf(move |lp| lp.label_selector.as_ref().unwrap_or(&String::new()) == pod_selector) + .returning(move |_| { + let pods_json = file::read_file_to_string(result_file); + let phase_adjusted_json = 
pods_json.replace( + "\"phase\": \"Running\"", + &format!("\"phase\": \"{}\"", specified_phase), + ); + let pods: PodList = serde_json::from_str(&phase_adjusted_json).unwrap(); + Ok(pods) + }); } - pub fn configure_update_service( - mock: &mut MockKubeInterface, - svc_name: &'static str, - svc_namespace: &'static str, - result_error: bool, + pub fn configure_find_pods_with_phase_and_start_time( + mock_api: &mut MockApi, + pod_selector: &'static str, + result_file: &'static str, + specified_phase: &'static str, + start_time: DateTime, ) { trace!( - "mock.expect_update_service name:{} namespace:{} error:{}", - svc_name, - svc_namespace, - result_error, + "mock.expect_find_pods_with_label pod_selector:{}", + pod_selector ); - mock.expect_update_service() + mock_api + .expect_list() .times(1) - .withf(move |_svc, name, namespace| name == svc_name && namespace == svc_namespace) - .returning(move |_, _, _| { - if result_error { - Err(None.ok_or_else(|| anyhow::anyhow!("failure"))?) - } else { - Ok(()) - } + .withf(move |lp| lp.label_selector.as_ref().unwrap_or(&String::new()) == pod_selector) + .returning(move |_| { + let pods_json = file::read_file_to_string(result_file); + let phase_adjusted_json = pods_json.replace( + "\"phase\": \"Running\"", + &format!("\"phase\": \"{}\"", specified_phase), + ); + let start_time_adjusted_json = phase_adjusted_json.replace( + "\"startTime\": \"2020-02-25T20:48:03Z\"", + &format!( + "\"startTime\": \"{}\"", + start_time.format("%Y-%m-%dT%H:%M:%SZ") + ), + ); + let pods: PodList = serde_json::from_str(&start_time_adjusted_json).unwrap(); + Ok(pods) }); } - pub fn configure_find_pods( - mock: &mut MockKubeInterface, + pub fn configure_find_pods_with_phase_and_no_start_time( + mock_api: &mut MockApi, pod_selector: &'static str, result_file: &'static str, - result_error: bool, + specified_phase: &'static str, ) { trace!( "mock.expect_find_pods_with_label pod_selector:{}", pod_selector ); - mock.expect_find_pods_with_label() + mock_api + 
.expect_list() .times(1) - .withf(move |selector| selector == pod_selector) + .withf(move |lp| lp.label_selector.as_ref().unwrap_or(&String::new()) == pod_selector) .returning(move |_| { - if result_error { - Err(None.ok_or_else(|| anyhow::anyhow!("failure"))?) - } else { - let pods_json = file::read_file_to_string(result_file); - let pods: PodList = serde_json::from_str(&pods_json).unwrap(); - Ok(pods) - } + let pods_json = file::read_file_to_string(result_file); + let phase_adjusted_json = pods_json.replace( + "\"phase\": \"Running\"", + &format!("\"phase\": \"{}\"", specified_phase), + ); + let start_time_adjusted_json = + phase_adjusted_json.replace("\"startTime\": \"2020-02-25T20:48:03Z\",", ""); + let pods: PodList = serde_json::from_str(&start_time_adjusted_json).unwrap(); + Ok(pods) }); } pub fn configure_add_pod( - mock: &mut MockKubeInterface, + mock_api: &mut MockApi, pod_name: &'static str, - pod_namespace: &'static str, + _pod_namespace: &'static str, label_id: &'static str, label_value: &'static str, error: bool, ) { trace!("mock.expect_create_pod pod_name:{}", pod_name); - mock.expect_create_pod() - .times(1) - .withf(move |pod_to_create, namespace| { + mock_api + .expect_apply() + .withf(move |pod_to_create, _| { pod_to_create.metadata.name.as_ref().unwrap() == pod_name && pod_to_create .metadata @@ -268,16 +181,20 @@ pub mod config_for_tests { .get(label_id) .unwrap() == label_value - && namespace == pod_namespace }) - .returning(move |_, _| match error { - false => Ok(()), - true => Err(anyhow::format_err!("create pod error")), + .returning(move |pod, _| match error { + false => Ok(pod), + true => Err(kube::Error::Api(kube::error::ErrorResponse { + status: "Failure".to_string(), + message: format!("pods {pod_name} not created"), + reason: "NotFound".to_string(), + code: akri_shared::k8s::ERROR_NOT_FOUND, + })), }); } pub fn configure_remove_pod( - mock: &mut MockKubeInterface, + mock_api: &mut MockApi, pod_name: &'static str, pod_namespace: 
&'static str, ) { @@ -286,11 +203,114 @@ pub mod config_for_tests { pod_name, pod_namespace ); - mock.expect_remove_pod() + mock_api + .expect_delete() .times(1) - .withf(move |pod_to_remove, namespace| { - pod_to_remove == pod_name && namespace == pod_namespace - }) - .returning(move |_, _| Ok(())); + .withf(move |pod_to_remove| pod_to_remove == pod_name) + .returning(move |_| Ok(either::Left(Pod::default()))); + } +} + +#[cfg(test)] +pub mod mock_client { + use akri_shared::akri::{configuration::Configuration, instance::Instance}; + use akri_shared::k8s::api::{Api, IntoApi, MockIntoApi}; + use k8s_openapi::api::batch::v1::Job; + use k8s_openapi::api::core::v1::{Node, Pod, Service}; + + #[derive(Default)] + pub struct MockControllerKubeClient { + pub instance: MockIntoApi, + pub config: MockIntoApi, + pub job: MockIntoApi, + pub pod: MockIntoApi, + pub service: MockIntoApi, + pub node: MockIntoApi, + } + + impl IntoApi for MockControllerKubeClient { + fn all(&self) -> Box> { + self.instance.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.instance.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.instance.default_namespaced() + } + } + + impl IntoApi for MockControllerKubeClient { + fn all(&self) -> Box> { + self.config.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.config.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.config.default_namespaced() + } + } + + impl IntoApi for MockControllerKubeClient { + fn all(&self) -> Box> { + self.job.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.job.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.job.default_namespaced() + } + } + + impl IntoApi for MockControllerKubeClient { + fn all(&self) -> Box> { + self.pod.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.pod.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + 
self.pod.default_namespaced() + } + } + + impl IntoApi for MockControllerKubeClient { + fn all(&self) -> Box> { + self.service.all() + } + + fn namespaced(&self, namespace: &str) -> Box> { + self.service.namespaced(namespace) + } + + fn default_namespaced(&self) -> Box> { + self.service.default_namespaced() + } + } + + impl IntoApi for MockControllerKubeClient { + fn all(&self) -> Box> { + self.node.all() + } + + fn namespaced(&self, _namespace: &str) -> Box> { + // TODO: handle error here -- no namespaced scope for Node + self.node.all() + } + + fn default_namespaced(&self) -> Box> { + // TODO: handle error here -- no namespaced scope for Node + self.node.all() + } } } diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 2ffe521d0..f50642708 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -11,9 +11,10 @@ rust-version.workspace = true [dependencies] anyhow = "1.0.38" async-trait = "0.1.0" -either = '*' -k8s-openapi = { version = "0.20.0", default-features = false, features = ["schemars", "v1_23"] } -kube = { version = "0.87.1", features = ["derive"] } +either = "1.13" +k8s-openapi = { version = "0.22.0", default-features = false, features = ["schemars", "v1_25"] } +# k8s-openapi = { version = "0.22.0", features = ["schemars"]} +kube = { version = "0.91.0", features = ["derive" ] } log = "0.4" mockall = "0.12" prometheus = { version = "0.12.0", features = ["process"] } diff --git a/shared/src/akri/instance.rs b/shared/src/akri/instance.rs index 22a1eb485..a05075bf6 100644 --- a/shared/src/akri/instance.rs +++ b/shared/src/akri/instance.rs @@ -16,7 +16,7 @@ pub type InstanceList = ObjectList; /// a Configuration. For example, a Configuration /// may describe many cameras, each camera will be represented by a /// Instance. 
-#[derive(CustomResource, Deserialize, Serialize, Clone, Debug, JsonSchema, PartialEq)] +#[derive(CustomResource, Deserialize, Serialize, Clone, Debug, Default, JsonSchema, PartialEq)] #[serde(rename_all = "camelCase")] // group = API_NAMESPACE and version = API_VERSION #[kube( diff --git a/shared/src/k8s/api.rs b/shared/src/k8s/api.rs index 88ec79a0f..52914af6b 100644 --- a/shared/src/k8s/api.rs +++ b/shared/src/k8s/api.rs @@ -3,7 +3,7 @@ use std::fmt::Debug; use async_trait::async_trait; use either::Either; use kube::{ - api::{Patch, PatchParams}, + api::{ListParams, Patch, PatchParams}, core::{ObjectList, ObjectMeta, PartialObjectMetaExt, Status}, Error, Resource, ResourceExt, }; @@ -11,8 +11,6 @@ use mockall::automock; use serde::de::DeserializeOwned; use serde_json::Value; -use super::KubeImpl; - #[automock] #[async_trait] pub trait Api: Send + Sync { @@ -24,9 +22,14 @@ pub trait Api: Send + Sync { patch: &Patch, pp: &PatchParams, ) -> Result; + async fn create(&self, obj: &T) -> Result; async fn delete(&self, name: &str) -> Result, Error>; + async fn delete_collection( + &self, + lp: &ListParams, + ) -> Result, Status>, Error>; async fn get(&self, name: &str) -> Result, Error>; - async fn list(&self) -> Result, Error>; + async fn list(&self, lp: &ListParams) -> Result, Error>; async fn add_finalizer(&self, obj: &T, finalizer: &str) -> Result<(), Error> { self.set_finalizers( &obj.name_any(), @@ -75,14 +78,23 @@ where ) -> Result { self.patch(name, pp, patch).await } + async fn create(&self, obj: &T) -> Result { + self.create(&Default::default(), obj).await + } async fn delete(&self, name: &str) -> Result, Error> { self.delete(name, &Default::default()).await } + async fn delete_collection( + &self, + lp: &ListParams, + ) -> Result, Status>, Error> { + self.delete_collection(&Default::default(), lp).await + } async fn get(&self, name: &str) -> Result, Error> { self.get_opt(name).await } - async fn list(&self) -> Result, Error> { - 
self.list(&Default::default()).await + async fn list(&self, lp: &ListParams) -> Result, Error> { + self.list(lp).await } async fn set_finalizers( &self, @@ -117,36 +129,6 @@ pub trait IntoApi: Send + Sync { T: Resource; } -impl IntoApi for KubeImpl -where - T: Resource - + Clone - + DeserializeOwned - + Debug - + serde::Serialize - + Send - + Sync - + 'static, -{ - fn all(&self) -> Box> { - Box::new(kube::Api::all(self.client.clone())) - } - - fn namespaced(&self, namespace: &str) -> Box> - where - T: Resource, - { - Box::new(kube::Api::namespaced(self.client.clone(), namespace)) - } - - fn default_namespaced(&self) -> Box> - where - T: Resource, - { - Box::new(kube::Api::default_namespaced(self.client.clone())) - } -} - impl IntoApi for kube::Client where T: Resource diff --git a/shared/src/k8s/job.rs b/shared/src/k8s/job.rs index bcf5c238a..c01d0f19b 100644 --- a/shared/src/k8s/job.rs +++ b/shared/src/k8s/job.rs @@ -4,71 +4,14 @@ use super::{ pod::{ AKRI_CONFIGURATION_LABEL_NAME, AKRI_INSTANCE_LABEL_NAME, APP_LABEL_ID, CONTROLLER_LABEL_ID, }, - OwnershipInfo, ERROR_CONFLICT, ERROR_NOT_FOUND, + OwnershipInfo, }; -use either::Either; + use k8s_openapi::api::batch::v1::{Job, JobSpec}; use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; -use kube::{ - api::{Api, DeleteParams, ListParams, ObjectList, PostParams, PropagationPolicy}, - client::Client, -}; -use log::{error, info, trace}; -use std::collections::BTreeMap; -/// Find Kubernetes Jobs with a given label or field selector -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::job; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let label_selector = Some("environment=production,app=nginx".to_string()); -/// let api_client = Client::try_default().await.unwrap(); -/// for job in job::find_jobs_with_selector(label_selector, None, api_client).await.unwrap() { -/// println!("found job: {}", 
job.metadata.name.unwrap()) -/// } -/// # } -/// ``` -/// -/// ```no_run -/// use akri_shared::k8s::job; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let field_selector = Some("spec.nodeName=node-a".to_string()); -/// let api_client = Client::try_default().await.unwrap(); -/// for job in job::find_jobs_with_selector(None, field_selector, api_client).await.unwrap() { -/// println!("found job: {}", job.metadata.name.unwrap()) -/// } -/// # } -/// ``` -pub async fn find_jobs_with_selector( - label_selector: Option, - field_selector: Option, - kube_client: Client, -) -> Result, anyhow::Error> { - trace!( - "find_jobs_with_selector with label_selector={:?} field_selector={:?}", - &label_selector, - &field_selector - ); - let jobs: Api = Api::all(kube_client); - let job_list_params = ListParams { - label_selector, - field_selector, - ..Default::default() - }; - let result = jobs.list(&job_list_params).await; - trace!("find_jobs_with_selector return"); - Ok(result?) 
-} +use log::trace; +use std::collections::BTreeMap; /// Create Kubernetes Job with given Instance and OwnershipInfo /// @@ -171,121 +114,6 @@ pub fn create_new_job_from_spec( Ok(result) } -/// Create Kubernetes Job -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::job; -/// use kube::client::Client; -/// use kube::config; -/// use k8s_openapi::api::batch::v1::Job; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let api_client = Client::try_default().await.unwrap(); -/// job::create_job(&Job::default(), "job_namespace", api_client).await.unwrap(); -/// # } -/// ``` -pub async fn create_job( - job_to_create: &Job, - namespace: &str, - kube_client: Client, -) -> Result<(), anyhow::Error> { - trace!("create_job enter"); - let jobs: Api = Api::namespaced(kube_client, namespace); - match jobs.create(&PostParams::default(), job_to_create).await { - Ok(created_job) => { - info!( - "create_job jobs.create return: {:?}", - created_job.metadata.name - ); - Ok(()) - } - Err(kube::Error::Api(ae)) => { - if ae.code == ERROR_CONFLICT { - trace!("create_job - job already exists"); - Ok(()) - } else { - error!( - "create_job jobs.create [{:?}] returned kube error: {:?}", - serde_json::to_string(&job_to_create), - ae - ); - Err(anyhow::anyhow!(ae)) - } - } - Err(e) => { - error!( - "create_job jobs.create [{:?}] error: {:?}", - serde_json::to_string(&job_to_create), - e - ); - Err(anyhow::anyhow!(e)) - } - } -} - -/// Remove Kubernetes Job -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::job; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let api_client = Client::try_default().await.unwrap(); -/// job::remove_job("job_to_remove", "job_namespace", api_client).await.unwrap(); -/// # } -/// ``` -pub async fn remove_job( - job_to_remove: &str, - namespace: &str, - kube_client: Client, -) -> Result<(), anyhow::Error> { - trace!("remove_job enter"); - let jobs: Api = 
Api::namespaced(kube_client, namespace); - let dps = DeleteParams { - dry_run: false, - propagation_policy: Some(PropagationPolicy::Background), - ..Default::default() - }; - match jobs.delete(job_to_remove, &dps).await { - Ok(deleted_job) => match deleted_job { - Either::Left(spec) => { - info!("remove_job jobs.delete return: {:?}", &spec.metadata.name); - Ok(()) - } - Either::Right(status) => { - info!("remove_job jobs.delete return: {:?}", &status.status); - Ok(()) - } - }, - Err(kube::Error::Api(ae)) => { - if ae.code == ERROR_NOT_FOUND { - trace!("remove_job - job already removed"); - Ok(()) - } else { - error!( - "remove_job jobs.delete [{:?}] returned kube error: {:?}", - &job_to_remove, ae - ); - Err(anyhow::anyhow!(ae)) - } - } - Err(e) => { - error!( - "remove_job jobs.delete [{:?}] error: {:?}", - &job_to_remove, e - ); - Err(anyhow::anyhow!(e)) - } - } -} - #[cfg(test)] mod broker_jobspec_tests { use super::super::super::{akri::API_VERSION, os::file}; diff --git a/shared/src/k8s/mod.rs b/shared/src/k8s/mod.rs index e004a44c3..4d9209c11 100644 --- a/shared/src/k8s/mod.rs +++ b/shared/src/k8s/mod.rs @@ -1,28 +1,21 @@ -use super::akri::{ - configuration, - configuration::{Configuration, ConfigurationList}, - instance, - instance::{Instance, InstanceList, InstanceSpec}, - retry::{random_delay, MAX_INSTANCE_UPDATE_TRIES}, - API_NAMESPACE, API_VERSION, -}; -use async_trait::async_trait; -use k8s_openapi::api::batch::v1::Job; -use k8s_openapi::api::core::v1::{Node, Pod, Service}; -use kube::{api::ObjectList, client::Client}; -use mockall::{automock, predicate::*}; +use super::akri::{API_NAMESPACE, API_VERSION}; + +use mockall::predicate::*; pub mod api; pub mod job; -pub mod node; pub mod pod; -pub mod service; pub const NODE_SELECTOR_OP_IN: &str = "In"; pub const OBJECT_NAME_FIELD: &str = "metadata.name"; pub const RESOURCE_REQUIREMENTS_KEY: &str = "{{PLACEHOLDER}}"; pub const ERROR_NOT_FOUND: u16 = 404; pub const ERROR_CONFLICT: u16 = 409; +pub const 
APP_LABEL_ID: &str = "app"; +pub const CONTROLLER_LABEL_ID: &str = "controller"; +pub const AKRI_CONFIGURATION_LABEL_NAME: &str = "akri.sh/configuration"; +pub const AKRI_INSTANCE_LABEL_NAME: &str = "akri.sh/instance"; +pub const AKRI_TARGET_NODE_LABEL_NAME: &str = "akri.sh/target-node"; /// OwnershipType defines what type of Kubernetes object /// an object is dependent on @@ -88,610 +81,10 @@ impl OwnershipInfo { } } -#[automock] -#[async_trait] -pub trait KubeInterface: Send + Sync { - fn get_kube_client(&self) -> Client; - - async fn find_node(&self, name: &str) -> Result; - - async fn find_pods_with_label(&self, selector: &str) -> Result, anyhow::Error>; - async fn find_pods_with_field(&self, selector: &str) -> Result, anyhow::Error>; - async fn create_pod(&self, pod_to_create: &Pod, namespace: &str) -> Result<(), anyhow::Error>; - async fn remove_pod(&self, pod_to_remove: &str, namespace: &str) -> Result<(), anyhow::Error>; - - async fn find_jobs_with_label(&self, selector: &str) -> Result, anyhow::Error>; - async fn find_jobs_with_field(&self, selector: &str) -> Result, anyhow::Error>; - async fn create_job(&self, job_to_create: &Job, namespace: &str) -> Result<(), anyhow::Error>; - async fn remove_job(&self, job_to_remove: &str, namespace: &str) -> Result<(), anyhow::Error>; - - async fn find_services(&self, selector: &str) -> Result, anyhow::Error>; - async fn create_service( - &self, - svc_to_create: &Service, - namespace: &str, - ) -> Result<(), anyhow::Error>; - async fn remove_service( - &self, - svc_to_remove: &str, - namespace: &str, - ) -> Result<(), anyhow::Error>; - async fn update_service( - &self, - svc_to_update: &Service, - name: &str, - namespace: &str, - ) -> Result<(), anyhow::Error>; - - async fn find_configuration( - &self, - name: &str, - namespace: &str, - ) -> Result; - async fn get_configurations(&self) -> Result; - - async fn find_instance(&self, name: &str, namespace: &str) -> Result; - async fn get_instances(&self) -> Result; - 
async fn create_instance( - &self, - instance_to_create: &InstanceSpec, - name: &str, - namespace: &str, - owner_config_name: &str, - owner_config_uid: &str, - ) -> Result<(), anyhow::Error>; - async fn delete_instance(&self, name: &str, namespace: &str) -> Result<(), anyhow::Error>; - async fn update_instance( - &self, - instance_to_update: &InstanceSpec, - name: &str, - namespace: &str, - ) -> Result<(), anyhow::Error>; -} - -#[derive(Clone)] -pub struct KubeImpl { - client: kube::Client, -} - -impl KubeImpl { - /// Create new instance of KubeImpl - pub async fn new() -> Result { - Ok(KubeImpl { - client: Client::try_default().await?, - }) - } -} - -#[async_trait] -impl KubeInterface for KubeImpl { - /// Return of clone of KubeImpl's client - fn get_kube_client(&self) -> Client { - self.client.clone() - } - - /// Get Kuberenetes node for specified name - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let node = kube.find_node("node-a").await.unwrap(); - /// # } - /// ``` - async fn find_node(&self, name: &str) -> Result { - node::find_node(name, self.get_kube_client()).await - } - - /// Get Kuberenetes pods with specified label selector - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let interesting_pods = kube.find_pods_with_label("label=interesting").await.unwrap(); - /// # } - /// ``` - async fn find_pods_with_label(&self, selector: &str) -> Result, anyhow::Error> { - pod::find_pods_with_selector(Some(selector.to_string()), None, self.get_kube_client()).await - } - /// Get Kuberenetes pods with specified field selector - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use 
akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let pods_on_node_a = kube.find_pods_with_field("spec.nodeName=node-a").await.unwrap(); - /// # } - /// ``` - async fn find_pods_with_field(&self, selector: &str) -> Result, anyhow::Error> { - pod::find_pods_with_selector(None, Some(selector.to_string()), self.get_kube_client()).await - } - /// Create Kuberenetes pod - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// use k8s_openapi::api::core::v1::Pod; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.create_pod(&Pod::default(), "pod_namespace").await.unwrap(); - /// # } - /// ``` - async fn create_pod(&self, pod_to_create: &Pod, namespace: &str) -> Result<(), anyhow::Error> { - pod::create_pod(pod_to_create, namespace, self.get_kube_client()).await - } - /// Remove Kubernetes pod - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.remove_pod("pod_to_remove", "pod_namespace").await.unwrap(); - /// # } - /// ``` - async fn remove_pod(&self, pod_to_remove: &str, namespace: &str) -> Result<(), anyhow::Error> { - pod::remove_pod(pod_to_remove, namespace, self.get_kube_client()).await - } - - /// Find Kuberenetes Jobs with specified label selector - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let interesting_jobs = kube.find_jobs_with_label("label=interesting").await.unwrap(); - /// # } - /// ``` - async fn find_jobs_with_label(&self, selector: &str) -> Result, anyhow::Error> { - 
job::find_jobs_with_selector(Some(selector.to_string()), None, self.get_kube_client()).await - } - /// Find Kuberenetes Jobs with specified field selector - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let jobs_on_node_a = kube.find_jobs_with_field("spec.nodeName=node-a").await.unwrap(); - /// # } - /// ``` - async fn find_jobs_with_field(&self, selector: &str) -> Result, anyhow::Error> { - job::find_jobs_with_selector(None, Some(selector.to_string()), self.get_kube_client()).await - } - - /// Create Kuberenetes job - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// use k8s_openapi::api::batch::v1::Job; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.create_job(&Job::default(), "job_namespace").await.unwrap(); - /// # } - /// ``` - async fn create_job(&self, job_to_create: &Job, namespace: &str) -> Result<(), anyhow::Error> { - job::create_job(job_to_create, namespace, self.get_kube_client()).await - } - /// Remove Kubernetes job - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.remove_job("job_to_remove", "job_namespace").await.unwrap(); - /// # } - /// ``` - async fn remove_job(&self, job_to_remove: &str, namespace: &str) -> Result<(), anyhow::Error> { - job::remove_job(job_to_remove, namespace, self.get_kube_client()).await - } - - /// Get Kuberenetes services with specified label selector - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let 
kube = k8s::KubeImpl::new().await.unwrap(); - /// let interesting_services = kube.find_services("label=interesting").await.unwrap(); - /// # } - /// ``` - async fn find_services(&self, selector: &str) -> Result, anyhow::Error> { - service::find_services_with_selector(selector, self.get_kube_client()).await - } - /// Create Kubernetes service - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// use k8s_openapi::api::core::v1::Service; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.create_service(&Service::default(), "service_namespace").await.unwrap(); - /// # } - /// ``` - async fn create_service( - &self, - svc_to_create: &Service, - namespace: &str, - ) -> Result<(), anyhow::Error> { - service::create_service(svc_to_create, namespace, self.get_kube_client()).await - } - /// Remove Kubernetes service - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.remove_service("service_to_remove", "service_namespace").await.unwrap(); - /// # } - /// ``` - async fn remove_service( - &self, - svc_to_remove: &str, - namespace: &str, - ) -> Result<(), anyhow::Error> { - service::remove_service(svc_to_remove, namespace, self.get_kube_client()).await - } - /// Update Kubernetes service - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// use k8s_openapi::api::core::v1::Service; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let selector = "environment=production,app=nginx"; - /// for svc in kube.find_services(&selector).await.unwrap() { - /// let svc_name = &svc.metadata.name.clone().unwrap(); - /// let svc_namespace = 
&svc.metadata.namespace.as_ref().unwrap().clone(); - /// let updated_svc = kube.update_service( - /// &svc, - /// &svc_name, - /// &svc_namespace).await.unwrap(); - /// } - /// # } - /// ``` - async fn update_service( - &self, - svc_to_update: &Service, - name: &str, - namespace: &str, - ) -> Result<(), anyhow::Error> { - service::update_service(svc_to_update, name, namespace, self.get_kube_client()).await - } - - // Get Akri Configuration with given name and namespace - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let config = kube.find_configuration("config-1", "config-namespace").await.unwrap(); - /// # } - /// ``` - async fn find_configuration( - &self, - name: &str, - namespace: &str, - ) -> Result { - configuration::find_configuration(name, namespace, &self.get_kube_client()).await - } - // Get Akri Configurations with given namespace - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let configs = kube.get_configurations().await.unwrap(); - /// # } - /// ``` - async fn get_configurations(&self) -> Result { - configuration::get_configurations(&self.get_kube_client()).await - } - - // Get Akri Instance with given name and namespace - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let instance = kube.find_instance("instance-1", "instance-namespace").await.unwrap(); - /// # } - /// ``` - async fn find_instance(&self, name: &str, namespace: &str) -> Result { - instance::find_instance(name, namespace, &self.get_kube_client()).await - 
} - // Get Akri Instances with given namespace - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// let instances = kube.get_instances().await.unwrap(); - /// # } - /// ``` - async fn get_instances(&self) -> Result { - instance::get_instances(&self.get_kube_client()).await - } - - /// Create Akri Instance - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// use akri_shared::akri::instance::InstanceSpec; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.create_instance( - /// &InstanceSpec{ - /// configuration_name: "capability_configuration_name".to_string(), - /// cdi_name: "akri.sh/config-1=instance-1".to_string(), - /// capacity: 1, - /// shared: true, - /// nodes: Vec::new(), - /// device_usage: std::collections::HashMap::new(), - /// broker_properties: std::collections::HashMap::new(), - /// }, - /// "instance-1", - /// "instance-namespace", - /// "config-1", - /// "abcdefgh-ijkl-mnop-qrst-uvwxyz012345" - /// ).await.unwrap(); - /// # } - /// ``` - async fn create_instance( - &self, - instance_to_create: &InstanceSpec, - name: &str, - namespace: &str, - owner_config_name: &str, - owner_config_uid: &str, - ) -> Result<(), anyhow::Error> { - instance::create_instance( - instance_to_create, - name, - namespace, - owner_config_name, - owner_config_uid, - &self.get_kube_client(), - ) - .await - } - // Delete Akri Instance - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.delete_instance( - /// "instance-1", - /// "instance-namespace" - /// ).await.unwrap(); - /// # } - /// ``` 
- async fn delete_instance(&self, name: &str, namespace: &str) -> Result<(), anyhow::Error> { - instance::delete_instance(name, namespace, &self.get_kube_client()).await - } - /// Update Akri Instance - /// - /// Example: - /// - /// ```no_run - /// use akri_shared::k8s; - /// use akri_shared::k8s::KubeInterface; - /// use akri_shared::akri::instance::InstanceSpec; - /// - /// # #[tokio::main] - /// # async fn main() { - /// let kube = k8s::KubeImpl::new().await.unwrap(); - /// kube.update_instance( - /// &InstanceSpec{ - /// configuration_name: "capability_configuration_name".to_string(), - /// cdi_name: "akri.sh/capability_configuration_name=instance-1".to_string(), - /// capacity: 1, - /// shared: true, - /// nodes: Vec::new(), - /// device_usage: std::collections::HashMap::new(), - /// broker_properties: std::collections::HashMap::new(), - /// }, - /// "instance-1", - /// "instance-namespace" - /// ).await.unwrap(); - /// # } - /// ``` - async fn update_instance( - &self, - instance_to_update: &InstanceSpec, - name: &str, - namespace: &str, - ) -> Result<(), anyhow::Error> { - instance::update_instance(instance_to_update, name, namespace, &self.get_kube_client()) - .await - } -} - -/// This deletes an Instance unless it has already been deleted by another node -/// or fails after multiple retries. 
-pub async fn try_delete_instance( - kube_interface: &dyn KubeInterface, - instance_name: &str, - instance_namespace: &str, -) -> Result<(), anyhow::Error> { - for x in 0..MAX_INSTANCE_UPDATE_TRIES { - match kube_interface - .delete_instance(instance_name, instance_namespace) - .await - { - Ok(()) => { - log::trace!("try_delete_instance - deleted Instance {}", instance_name); - break; - } - Err(e) => { - if let Some(ae) = e.downcast_ref::() { - if ae.code == ERROR_NOT_FOUND { - log::trace!( - "try_delete_instance - discovered Instance {} already deleted", - instance_name - ); - break; - } - } - log::error!( - "try_delete_instance - tried to delete Instance {} but still exists. {} retries left.", - instance_name, MAX_INSTANCE_UPDATE_TRIES - x - 1 - ); - if x == MAX_INSTANCE_UPDATE_TRIES - 1 { - return Err(e); - } - } - } - random_delay().await; - } - Ok(()) -} - #[cfg(test)] pub mod test_ownership { use super::*; - #[tokio::test] - async fn test_try_delete_instance() { - let mut mock_kube_interface = MockKubeInterface::new(); - mock_kube_interface - .expect_delete_instance() - .times(1) - .returning(move |_, _| { - let error_response = kube::error::ErrorResponse { - status: "random".to_string(), - message: "blah".to_string(), - reason: "NotFound".to_string(), - code: 404, - }; - Err(error_response.into()) - }); - try_delete_instance(&mock_kube_interface, "instance_name", "instance_namespace") - .await - .unwrap(); - } - - // Test that succeeds on second try - #[tokio::test] - async fn test_try_delete_instance_sequence() { - let mut seq = mockall::Sequence::new(); - let mut mock_kube_interface = MockKubeInterface::new(); - mock_kube_interface - .expect_delete_instance() - .times(1) - .returning(move |_, _| { - let error_response = kube::error::ErrorResponse { - status: "random".to_string(), - message: "blah".to_string(), - reason: "SomeError".to_string(), - code: 401, - }; - Err(error_response.into()) - }) - .in_sequence(&mut seq); - mock_kube_interface - 
.expect_delete_instance() - .times(1) - .returning(move |_, _| Ok(())) - .in_sequence(&mut seq); - try_delete_instance(&mock_kube_interface, "instance_name", "instance_namespace") - .await - .unwrap(); - } - #[tokio::test] async fn test_ownership_from_config() { let name = "asdf"; diff --git a/shared/src/k8s/node.rs b/shared/src/k8s/node.rs deleted file mode 100644 index 4f7f518fb..000000000 --- a/shared/src/k8s/node.rs +++ /dev/null @@ -1,28 +0,0 @@ -use k8s_openapi::api::core::v1::Node; -use kube::{api::Api, client::Client}; -use log::trace; - -/// Get Kubernetes Node with a given name -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::node; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let label_selector = Some("environment=production,app=nginx".to_string()); -/// let api_client = Client::try_default().await.unwrap(); -/// let node = node::find_node("node-a", api_client).await.unwrap(); -/// # } -/// ``` -pub async fn find_node(name: &str, kube_client: Client) -> Result { - trace!("find_node with name={}", name); - let nodes: Api = Api::all(kube_client); - trace!("find_node PRE nodes.get(...).await?"); - let result = nodes.get(name).await; - trace!("find_node return"); - Ok(result?) 
-} diff --git a/shared/src/k8s/pod.rs b/shared/src/k8s/pod.rs index 7123701a1..67123abf2 100644 --- a/shared/src/k8s/pod.rs +++ b/shared/src/k8s/pod.rs @@ -10,7 +10,7 @@ use k8s_openapi::api::core::v1::{ use k8s_openapi::apimachinery::pkg::api::resource::Quantity; use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; use kube::{ - api::{Api, DeleteParams, ListParams, ObjectList, PostParams}, + api::{Api, DeleteParams, PostParams}, client::Client, }; use log::{error, info, trace}; @@ -22,61 +22,6 @@ pub const AKRI_CONFIGURATION_LABEL_NAME: &str = "akri.sh/configuration"; pub const AKRI_INSTANCE_LABEL_NAME: &str = "akri.sh/instance"; pub const AKRI_TARGET_NODE_LABEL_NAME: &str = "akri.sh/target-node"; -/// Get Kubernetes Pods with a given label or field selector -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::pod; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let label_selector = Some("environment=production,app=nginx".to_string()); -/// let api_client = Client::try_default().await.unwrap(); -/// for pod in pod::find_pods_with_selector(label_selector, None, api_client).await.unwrap() { -/// println!("found pod: {}", pod.metadata.name.unwrap()) -/// } -/// # } -/// ``` -/// -/// ```no_run -/// use akri_shared::k8s::pod; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let field_selector = Some("spec.nodeName=node-a".to_string()); -/// let api_client = Client::try_default().await.unwrap(); -/// for pod in pod::find_pods_with_selector(None, field_selector, api_client).await.unwrap() { -/// println!("found pod: {}", pod.metadata.name.unwrap()) -/// } -/// # } -/// ``` -pub async fn find_pods_with_selector( - label_selector: Option, - field_selector: Option, - kube_client: Client, -) -> Result, anyhow::Error> { - trace!( - "find_pods_with_selector with label_selector={:?} field_selector={:?}", - 
&label_selector, - &field_selector - ); - let pods: Api = Api::all(kube_client); - let pod_list_params = ListParams { - label_selector, - field_selector, - ..Default::default() - }; - trace!("find_pods_with_selector PRE pods.list(...).await?"); - let result = pods.list(&pod_list_params).await; - trace!("find_pods_with_selector return"); - Ok(result?) -} - /// Create name for Kubernetes Pod. /// /// Example: diff --git a/shared/src/k8s/service.rs b/shared/src/k8s/service.rs deleted file mode 100644 index 90f9849a2..000000000 --- a/shared/src/k8s/service.rs +++ /dev/null @@ -1,789 +0,0 @@ -use super::{ - super::akri::API_NAMESPACE, - pod::{ - AKRI_CONFIGURATION_LABEL_NAME, AKRI_INSTANCE_LABEL_NAME, APP_LABEL_ID, CONTROLLER_LABEL_ID, - }, - OwnershipInfo, ERROR_NOT_FOUND, -}; -use either::Either; -use k8s_openapi::api::core::v1::{Service, ServiceSpec}; -use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; -use kube::{ - api::{Api, DeleteParams, ListParams, ObjectList, Patch, PatchParams, PostParams}, - client::Client, -}; -use log::{error, info, trace}; -use std::collections::BTreeMap; - -/// Get Kubernetes Services with a given selector -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::service; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let selector = "environment=production,app=nginx"; -/// let api_client = Client::try_default().await.unwrap(); -/// for svc in service::find_services_with_selector(&selector, api_client).await.unwrap() { -/// println!("found svc: {}", svc.metadata.name.unwrap()) -/// } -/// # } -/// ``` -pub async fn find_services_with_selector( - selector: &str, - kube_client: Client, -) -> Result, anyhow::Error> { - trace!("find_services_with_selector with selector={:?}", &selector); - let svc_client: Api = Api::all(kube_client); - let svc_list_params = ListParams { - label_selector: Some(selector.to_string()), - ..Default::default() - }; 
- trace!("find_services_with_selector PRE svcs.list(...).await?"); - let result = svc_client.list(&svc_list_params).await; - trace!("find_services_with_selector return"); - Ok(result?) -} - -/// Create name for Kubernetes Service. -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::service; -/// -/// let svc_name = service::create_service_app_name( -/// "capability_config", -/// "capability_instance", -/// "svc", -/// true); -/// ``` -pub fn create_service_app_name( - configuration_name: &str, - instance_name: &str, - svc_suffix: &str, - node_specific_svc: bool, -) -> String { - let normalized_instance_name = instance_name.replace('.', "-"); - if node_specific_svc { - // If this is the node specific service, use the insrtance name which - // contains node-specific content. - format!("{}-{}", normalized_instance_name, svc_suffix) - } else { - // If this is NOT the node specific service, use the capability name. - format!("{}-{}", configuration_name, svc_suffix) - } -} - -/// Create Kubernetes Service based on Device Capabililty Instance & Config. 
-/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::{ -/// OwnershipInfo, -/// OwnershipType, -/// service -/// }; -/// use kube::client::Client; -/// use kube::config; -/// use k8s_openapi::api::core::v1::ServiceSpec; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let api_client = Client::try_default().await.unwrap(); -/// let svc = service::create_new_service_from_spec( -/// "svc_namespace", -/// "capability_instance", -/// "capability_config", -/// OwnershipInfo::new( -/// OwnershipType::Instance, -/// "capability_instance".to_string(), -/// "instance_uid".to_string() -/// ), -/// &ServiceSpec::default(), -/// true).unwrap(); -/// # } -/// ``` -pub fn create_new_service_from_spec( - svc_namespace: &str, - instance_name: &str, - configuration_name: &str, - ownership: OwnershipInfo, - svc_spec: &ServiceSpec, - node_specific_svc: bool, -) -> anyhow::Result { - let app_name = - create_service_app_name(configuration_name, instance_name, "svc", node_specific_svc); - let mut labels: BTreeMap = BTreeMap::new(); - labels.insert(APP_LABEL_ID.to_string(), app_name.clone()); - labels.insert(CONTROLLER_LABEL_ID.to_string(), API_NAMESPACE.to_string()); - if node_specific_svc { - labels.insert( - AKRI_INSTANCE_LABEL_NAME.to_string(), - instance_name.to_string(), - ); - } else { - labels.insert( - AKRI_CONFIGURATION_LABEL_NAME.to_string(), - configuration_name.to_string(), - ); - } - - let owner_references: Vec = vec![OwnerReference { - api_version: ownership.get_api_version(), - kind: ownership.get_kind(), - controller: ownership.get_controller(), - block_owner_deletion: ownership.get_block_owner_deletion(), - name: ownership.get_name(), - uid: ownership.get_uid(), - }]; - - let mut spec = svc_spec.clone(); - let mut modified_selector: BTreeMap = spec.selector.unwrap_or_default(); - modified_selector.insert(CONTROLLER_LABEL_ID.to_string(), API_NAMESPACE.to_string()); - if node_specific_svc { - modified_selector.insert( - 
AKRI_INSTANCE_LABEL_NAME.to_string(), - instance_name.to_string(), - ); - } else { - modified_selector.insert( - AKRI_CONFIGURATION_LABEL_NAME.to_string(), - configuration_name.to_string(), - ); - } - spec.selector = Some(modified_selector); - - let new_svc = Service { - spec: Some(spec), - metadata: ObjectMeta { - name: Some(app_name), - namespace: Some(svc_namespace.to_string()), - labels: Some(labels), - owner_references: Some(owner_references), - ..Default::default() - }, - ..Default::default() - }; - - Ok(new_svc) -} - -/// Update Kubernetes Service ownership references. -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::{ -/// OwnershipInfo, -/// OwnershipType, -/// service -/// }; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let selector = "environment=production,app=nginx"; -/// let api_client = Client::try_default().await.unwrap(); -/// for svc in service::find_services_with_selector(&selector, api_client).await.unwrap() { -/// let mut svc = svc; -/// service::update_ownership( -/// &mut svc, -/// OwnershipInfo::new( -/// OwnershipType::Pod, -/// "pod_name".to_string(), -/// "pod_uid".to_string(), -/// ), -/// true -/// ).unwrap(); -/// } -/// # } -/// ``` -pub fn update_ownership( - svc_to_update: &mut Service, - ownership: OwnershipInfo, - replace_references: bool, -) -> anyhow::Result<()> { - let ownership_ref = OwnerReference { - api_version: ownership.get_api_version(), - kind: ownership.get_kind(), - controller: ownership.get_controller(), - block_owner_deletion: ownership.get_block_owner_deletion(), - name: ownership.get_name(), - uid: ownership.get_uid(), - }; - if replace_references || svc_to_update.metadata.owner_references.is_none() { - // Replace all existing ownerReferences with specified ownership - svc_to_update.metadata.owner_references = Some(vec![ownership_ref]); - } else { - // Add ownership to list IFF the UID doesn't already exist - if !svc_to_update - 
.metadata - .owner_references - .as_ref() - .unwrap() - .iter() - .any(|x| x.uid == ownership.get_uid()) - { - svc_to_update - .metadata - .owner_references - .as_mut() - .unwrap() - .push(ownership_ref); - } - } - Ok(()) -} - -/// Create Kubernetes Service -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::service; -/// use kube::client::Client; -/// use kube::config; -/// use k8s_openapi::api::core::v1::Service; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let api_client = Client::try_default().await.unwrap(); -/// service::create_service(&Service::default(), "svc_namespace", api_client).await.unwrap(); -/// # } -/// ``` -pub async fn create_service( - svc_to_create: &Service, - namespace: &str, - kube_client: Client, -) -> Result<(), anyhow::Error> { - trace!("create_service enter"); - let services: Api = Api::namespaced(kube_client, namespace); - info!("create_service svcs.create(...).await?:"); - match services.create(&PostParams::default(), svc_to_create).await { - Ok(created_svc) => { - info!( - "create_service services.create return: {:?}", - created_svc.metadata.name - ); - Ok(()) - } - Err(kube::Error::Api(ae)) => { - error!( - "create_service services.create [{:?}] returned kube error: {:?}", - serde_json::to_string(&svc_to_create), - ae - ); - Ok(()) - } - Err(e) => { - error!( - "create_service services.create [{:?}] error: {:?}", - serde_json::to_string(&svc_to_create), - e - ); - Err(anyhow::anyhow!(e)) - } - } -} - -/// Remove Kubernetes Service -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::service; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let api_client = Client::try_default().await.unwrap(); -/// service::remove_service("svc_to_remove", "svc_namespace", api_client).await.unwrap(); -/// # } -/// ``` -pub async fn remove_service( - svc_to_remove: &str, - namespace: &str, - kube_client: Client, -) -> Result<(), anyhow::Error> { - 
trace!("remove_service enter"); - let svcs: Api = Api::namespaced(kube_client, namespace); - info!("remove_service svcs.create(...).await?:"); - match svcs.delete(svc_to_remove, &DeleteParams::default()).await { - Ok(deleted_svc) => match deleted_svc { - Either::Left(spec) => { - info!( - "remove_service svcs.delete return: {:?}", - &spec.metadata.name - ); - Ok(()) - } - Either::Right(status) => { - info!("remove_service svcs.delete return: {:?}", &status.status); - Ok(()) - } - }, - Err(kube::Error::Api(ae)) => { - if ae.code == ERROR_NOT_FOUND { - trace!("remove_service - service already deleted"); - Ok(()) - } else { - error!( - "remove_service svcs.delete [{:?}] returned kube error: {:?}", - &svc_to_remove, ae - ); - Err(anyhow::anyhow!(ae)) - } - } - Err(e) => { - error!( - "remove_service svcs.delete [{:?}] error: {:?}", - &svc_to_remove, e - ); - Err(anyhow::anyhow!(e)) - } - } -} - -/// Update Kubernetes Service -/// -/// Example: -/// -/// ```no_run -/// use akri_shared::k8s::service; -/// use kube::client::Client; -/// use kube::config; -/// -/// # #[tokio::main] -/// # async fn main() { -/// let selector = "environment=production,app=nginx"; -/// let api_client = Client::try_default().await.unwrap(); -/// for svc in service::find_services_with_selector(&selector, api_client).await.unwrap() { -/// let svc_name = &svc.metadata.name.clone().unwrap(); -/// let svc_namespace = &svc.metadata.namespace.as_ref().unwrap().clone(); -/// let loop_api_client = Client::try_default().await.unwrap(); -/// let updated_svc = service::update_service( -/// &svc, -/// &svc_name, -/// &svc_namespace, -/// loop_api_client).await.unwrap(); -/// } -/// # } -/// ``` -pub async fn update_service( - svc_to_update: &Service, - name: &str, - namespace: &str, - kube_client: Client, -) -> Result<(), anyhow::Error> { - trace!( - "update_service enter name:{} namespace: {}", - &name, - &namespace - ); - let svcs: Api = Api::namespaced(kube_client, namespace); - - info!("remove_service 
svcs.patch(...).await?:"); - match svcs - .patch(name, &PatchParams::default(), &Patch::Merge(&svc_to_update)) - .await - { - Ok(_service_modified) => { - log::trace!("update_service return"); - Ok(()) - } - Err(kube::Error::Api(ae)) => { - log::trace!( - "update_service kube_client.request returned kube error: {:?}", - ae - ); - Err(anyhow::anyhow!(ae)) - } - Err(e) => { - log::trace!("update_service kube_client.request error: {:?}", e); - Err(anyhow::anyhow!(e)) - } - } -} - -#[cfg(test)] -mod svcspec_tests { - use super::super::OwnershipType; - use super::*; - use env_logger; - - use k8s_openapi::api::core::v1::ServiceStatus; - use kube::api::ObjectMeta; - - #[test] - fn test_create_service_app_name() { - let _ = env_logger::builder().is_test(true).try_init(); - - assert_eq!( - "node-a-suffix", - create_service_app_name("foo", "node.a", "suffix", true) - ); - assert_eq!( - "foo-suffix", - create_service_app_name("foo", "node.a", "suffix", false) - ); - - assert_eq!( - "node-a-suffix", - create_service_app_name("foo", "node-a", "suffix", true) - ); - assert_eq!( - "foo-suffix", - create_service_app_name("foo", "node-a", "suffix", false) - ); - } - - #[test] - fn test_update_ownership_replace() { - let _ = env_logger::builder().is_test(true).try_init(); - - let mut svc = Service { - metadata: ObjectMeta::default(), - spec: Some(ServiceSpec::default()), - status: Some(ServiceStatus::default()), - }; - - assert!(svc.metadata.owner_references.is_none()); - update_ownership( - &mut svc, - OwnershipInfo { - object_type: OwnershipType::Pod, - object_name: "object1".to_string(), - object_uid: "uid1".to_string(), - }, - true, - ) - .unwrap(); - assert_eq!(1, svc.metadata.owner_references.as_ref().unwrap().len()); - assert_eq!( - "object1", - &svc.metadata.owner_references.as_ref().unwrap()[0].name - ); - assert_eq!( - "uid1", - &svc.metadata.owner_references.as_ref().unwrap()[0].uid - ); - - update_ownership( - &mut svc, - OwnershipInfo { - object_type: 
OwnershipType::Pod, - object_name: "object2".to_string(), - object_uid: "uid2".to_string(), - }, - true, - ) - .unwrap(); - assert_eq!(1, svc.metadata.owner_references.as_ref().unwrap().len()); - assert_eq!( - "object2", - &svc.metadata.owner_references.as_ref().unwrap()[0].name - ); - assert_eq!( - "uid2", - &svc.metadata.owner_references.as_ref().unwrap()[0].uid - ); - } - - #[test] - fn test_update_ownership_append() { - let _ = env_logger::builder().is_test(true).try_init(); - - let svc = Service { - metadata: ObjectMeta::default(), - spec: Some(ServiceSpec::default()), - status: Some(ServiceStatus::default()), - }; - - assert!(svc.metadata.owner_references.is_none()); - let mut svc = svc; - update_ownership( - &mut svc, - OwnershipInfo { - object_type: OwnershipType::Pod, - object_name: "object1".to_string(), - object_uid: "uid1".to_string(), - }, - false, - ) - .unwrap(); - assert_eq!(1, svc.metadata.owner_references.as_ref().unwrap().len()); - assert_eq!( - "object1", - &svc.metadata.owner_references.as_ref().unwrap()[0].name - ); - assert_eq!( - "uid1", - &svc.metadata.owner_references.as_ref().unwrap()[0].uid - ); - - update_ownership( - &mut svc, - OwnershipInfo { - object_type: OwnershipType::Pod, - object_name: "object2".to_string(), - object_uid: "uid2".to_string(), - }, - false, - ) - .unwrap(); - assert_eq!(2, svc.metadata.owner_references.as_ref().unwrap().len()); - assert_eq!( - "object1", - &svc.metadata.owner_references.as_ref().unwrap()[0].name - ); - assert_eq!( - "uid1", - &svc.metadata.owner_references.as_ref().unwrap()[0].uid - ); - assert_eq!( - "object2", - &svc.metadata.owner_references.as_ref().unwrap()[1].name - ); - assert_eq!( - "uid2", - &svc.metadata.owner_references.as_ref().unwrap()[1].uid - ); - - // Test that trying to add the same UID doesn't result in - // duplicate - update_ownership( - &mut svc, - OwnershipInfo { - object_type: OwnershipType::Pod, - object_name: "object2".to_string(), - object_uid: "uid2".to_string(), - }, - 
false, - ) - .unwrap(); - assert_eq!(2, svc.metadata.owner_references.as_ref().unwrap().len()); - assert_eq!( - "object1", - &svc.metadata.owner_references.as_ref().unwrap()[0].name - ); - assert_eq!( - "uid1", - &svc.metadata.owner_references.as_ref().unwrap()[0].uid - ); - assert_eq!( - "object2", - &svc.metadata.owner_references.as_ref().unwrap()[1].name - ); - assert_eq!( - "uid2", - &svc.metadata.owner_references.as_ref().unwrap()[1].uid - ); - } - - #[test] - fn test_svc_spec_creation() { - let _ = env_logger::builder().is_test(true).try_init(); - - let svc_namespace = "svc_namespace".to_string(); - let instance_name = "instance_name".to_string(); - let configuration_name = "configuration_name".to_string(); - - let object_name = "owner_object".to_string(); - let object_uid = "owner_uid".to_string(); - - for node_specific_svc in &[true, false] { - let mut preexisting_selector = BTreeMap::new(); - preexisting_selector.insert( - "do-not-change".to_string(), - "this-node-selector".to_string(), - ); - let svc_spec = ServiceSpec { - selector: Some(preexisting_selector), - ..Default::default() - }; - - let svc = create_new_service_from_spec( - &svc_namespace, - &instance_name, - &configuration_name, - OwnershipInfo::new(OwnershipType::Pod, object_name.clone(), object_uid.clone()), - &svc_spec, - *node_specific_svc, - ) - .unwrap(); - - let app_name = create_service_app_name( - &configuration_name, - &instance_name, - "svc", - *node_specific_svc, - ); - - // Validate the metadata name/namesapce - assert_eq!(&app_name, &svc.metadata.clone().name.unwrap()); - assert_eq!(&svc_namespace, &svc.metadata.clone().namespace.unwrap()); - - // Validate the labels added - assert_eq!( - &&app_name, - &svc.metadata - .clone() - .labels - .unwrap() - .get(APP_LABEL_ID) - .unwrap() - ); - assert_eq!( - &&API_NAMESPACE.to_string(), - &svc.metadata - .clone() - .labels - .unwrap() - .get(CONTROLLER_LABEL_ID) - .unwrap() - ); - if *node_specific_svc { - assert_eq!( - &&instance_name, - 
&svc.metadata - .clone() - .labels - .unwrap() - .get(AKRI_INSTANCE_LABEL_NAME) - .unwrap() - ); - } else { - assert_eq!( - &&configuration_name, - &svc.metadata - .clone() - .labels - .unwrap() - .get(AKRI_CONFIGURATION_LABEL_NAME) - .unwrap() - ); - } - - // Validate ownerReference - assert_eq!( - object_name, - svc.metadata - .clone() - .owner_references - .as_ref() - .unwrap() - .first() - .unwrap() - .name - ); - assert_eq!( - object_uid, - svc.metadata - .clone() - .owner_references - .as_ref() - .unwrap() - .first() - .unwrap() - .uid - ); - assert_eq!( - "Pod", - &svc.metadata - .clone() - .owner_references - .as_ref() - .unwrap() - .first() - .unwrap() - .kind - ); - assert_eq!( - "core/v1", - &svc.metadata - .clone() - .owner_references - .unwrap() - .first() - .unwrap() - .api_version - ); - assert!(svc - .metadata - .clone() - .owner_references - .unwrap() - .first() - .unwrap() - .controller - .unwrap()); - assert!(svc - .metadata - .clone() - .owner_references - .unwrap() - .first() - .unwrap() - .block_owner_deletion - .unwrap()); - - // Validate the existing selector unchanged - assert_eq!( - &&"this-node-selector", - &svc.spec - .as_ref() - .unwrap() - .selector - .as_ref() - .unwrap() - .get("do-not-change") - .unwrap() - ); - // Validate the selector added - assert_eq!( - &&API_NAMESPACE.to_string(), - &svc.spec - .as_ref() - .unwrap() - .selector - .as_ref() - .unwrap() - .get(CONTROLLER_LABEL_ID) - .unwrap() - ); - if *node_specific_svc { - assert_eq!( - &&instance_name, - &svc.spec - .as_ref() - .unwrap() - .selector - .as_ref() - .unwrap() - .get(AKRI_INSTANCE_LABEL_NAME) - .unwrap() - ); - } else { - assert_eq!( - &&configuration_name, - &svc.spec - .as_ref() - .unwrap() - .selector - .as_ref() - .unwrap() - .get(AKRI_CONFIGURATION_LABEL_NAME) - .unwrap() - ); - } - } - } -} From 2c3722a36db3e8623aa086ac922f9b4785276bf9 Mon Sep 17 00:00:00 2001 From: Kate Goldenring Date: Wed, 25 Sep 2024 16:38:09 -0700 Subject: [PATCH 2/8] Remove Pod 
handling for instance deletion and controller cleanups Signed-off-by: Kate Goldenring --- .../discovery_configuration_controller.rs | 2 +- controller/src/main.rs | 4 +- controller/src/util/instance_action.rs | 215 ++++---------- controller/src/util/node_watcher.rs | 97 +++--- controller/src/util/pod_action.rs | 281 +++++------------- controller/src/util/pod_watcher.rs | 19 +- shared/Cargo.toml | 1 - 7 files changed, 197 insertions(+), 422 deletions(-) diff --git a/agent/src/util/discovery_configuration_controller.rs b/agent/src/util/discovery_configuration_controller.rs index 29364a691..8998e97cc 100644 --- a/agent/src/util/discovery_configuration_controller.rs +++ b/agent/src/util/discovery_configuration_controller.rs @@ -18,12 +18,12 @@ use crate::discovery_handler_manager::{ discovery_handler_registry::DiscoveryHandlerRegistry, DiscoveryError, }; -use kube::{Resource, ResourceExt}; use kube::runtime::{ controller::Action, reflector::{ObjectRef, Store}, Controller, }; +use kube::{Resource, ResourceExt}; use thiserror::Error; #[derive(Debug, Error)] diff --git a/controller/src/main.rs b/controller/src/main.rs index 4886ce07b..ff8d76f55 100644 --- a/controller/src/main.rs +++ b/controller/src/main.rs @@ -43,13 +43,13 @@ async fn main() -> Result<(), Box Arc::new(kube::Client::try_default().await?), CONTROLLER_FIELD_MANAGER_ID, )); - let instance_water_ctx = controller_ctx.clone(); + let instance_watcher_ctx = controller_ctx.clone(); let node_watcher_ctx = controller_ctx.clone(); let pod_watcher_ctx = controller_ctx.clone(); // Handle instance changes tasks.push(tokio::spawn(async { - instance_action::run(instance_water_ctx).await; + instance_action::run(instance_watcher_ctx).await; })); // Watch for node disappearance tasks.push(tokio::spawn(async { diff --git a/controller/src/util/instance_action.rs b/controller/src/util/instance_action.rs index 048dc2afb..f7a535e12 100644 --- a/controller/src/util/instance_action.rs +++ b/controller/src/util/instance_action.rs @@ 
-61,35 +61,18 @@ fn error_policy( Action::requeue(std::time::Duration::from_secs(5 * 60)) } -/// Instance action types +/// Instance event types /// /// Instance actions describe the types of actions the Controller can /// react to for Instances. /// /// This will determine what broker management actions to take (if any) /// -/// | --> InstanceAction::Add +/// | --> Instance Applied /// | --> No broker => Do nothing -/// | --> => Deploy a Job -/// | --> => Deploy Pod to each Node on Instance's `nodes` list (up to `capacity` total) -/// | --> InstanceAction::Remove -/// | --> No broker => Do nothing -/// | --> => Delete all Jobs labeled with the Instance name -/// | --> => Delete all Pods labeled with the Instance name -/// | --> InstanceAction::Update -/// | --> No broker => Do nothing -/// | --> => No nothing -/// | --> => Ensure that each Node on Instance's `nodes` list (up to `capacity` total) have a Pod -/// -#[derive(Clone, Debug, PartialEq)] -pub enum InstanceAction { - /// An Instance is added - Add, - /// An Instance is removed - Remove, - /// An Instance is updated - Update, -} +/// | --> => Deploy a Job if one does not exist +/// | --> => Ensure that each Node on Instance's `nodes` list (up to `capacity` total) have a Pod. +/// Deploy Pods as necessary /// This function is the main Reconcile function for Instance resources /// This will get called every time an Instance gets added or is changed, it will also be called for every existing instance on startup. @@ -108,11 +91,10 @@ pub async fn reconcile(instance: Arc, ctx: Arc) -> async fn reconcile_inner(event: Event, ctx: Arc) -> Result { match event { - Event::Apply(instance) => { - handle_instance_change(&instance, &InstanceAction::Add, ctx.clone()).await - } - Event::Cleanup(instance) => { - handle_instance_change(&instance, &InstanceAction::Remove, ctx.clone()).await + Event::Apply(instance) => handle_instance_change(&instance, ctx.clone()).await, + Event::Cleanup(_) => { + // Do nothing. 
OwnerReferences are attached to Jobs and Pods to automate cleanup + Ok(default_requeue_action()) } } } @@ -137,11 +119,7 @@ pub(crate) struct PodContext { pub(crate) fn create_pod_context(k8s_pod: &Pod, action: PodAction) -> anyhow::Result { let pod_name = k8s_pod.metadata.name.as_ref().unwrap(); - let labels = &k8s_pod - .metadata - .labels - .as_ref() - .ok_or_else(|| anyhow::anyhow!("no labels found for Pod {:?}", pod_name))?; + let labels = &k8s_pod.labels(); // Early exits above ensure unwrap will not panic let node_to_run_pod_on = labels.get(AKRI_TARGET_NODE_LABEL_NAME).ok_or_else(|| { anyhow::anyhow!( @@ -163,7 +141,6 @@ pub(crate) fn create_pod_context(k8s_pod: &Pod, action: PodAction) -> anyhow::Re /// it will update the nodes_to_act_on map with the required action. fn determine_action_for_pod( k8s_pod: &Pod, - action: &InstanceAction, nodes_to_act_on: &mut HashMap, ) -> anyhow::Result<()> { let pod_name = k8s_pod.metadata.name.as_ref().unwrap(); @@ -185,7 +162,6 @@ fn determine_action_for_pod( pending_grace_time_in_minutes: PENDING_POD_GRACE_PERIOD_MINUTES, ended_grace_time_in_minutes: FAILED_POD_GRACE_PERIOD_MINUTES, phase: pod_phase.to_string(), - instance_action: action.clone(), status_start_time: pod_start_time, unknown_node: !nodes_to_act_on.contains_key(node_to_run_pod_on), trace_node_name: k8s_pod.metadata.name.clone().unwrap(), @@ -326,10 +302,9 @@ async fn handle_addition_work( /// 2) calling the appropriate handler depending on the broker type (Pod or Job) if any pub async fn handle_instance_change( instance: &Instance, - action: &InstanceAction, ctx: Arc, ) -> Result { - trace!("handle_instance_change - enter {:?}", action); + trace!("handle_instance_change - enter"); let instance_namespace = instance .metadata .namespace @@ -337,29 +312,24 @@ pub async fn handle_instance_change( .context("no namespace")?; let api: Box> = ctx.client.namespaced(instance_namespace); let Ok(Some(configuration)) = api.get(&instance.spec.configuration_name).await 
else { - if action != &InstanceAction::Remove { - // In this scenario, a configuration has been deleted without the Akri Agent deleting the associated Instances. - // Furthermore, Akri Agent is still modifying the Instances. This should not happen beacuse Agent - // is designed to shutdown when it's Configuration watcher fails. - error!( + // In this scenario, a configuration has been deleted without the Akri Agent deleting the associated Instances. + // Furthermore, Akri Agent is still modifying the Instances. This should not happen because Agent + // is designed to shutdown when its Configuration watcher fails. + error!( "handle_instance_change - no configuration found for {:?} yet instance {:?} exists - check that device plugin is running properly", &instance.spec.configuration_name, &instance.metadata.name ); - } return Ok(default_requeue_action()); }; if let Some(broker_spec) = &configuration.spec.broker_spec { let instance_change_result = match broker_spec { - BrokerSpec::BrokerPodSpec(p) => { - handle_instance_change_pod(instance, p, action, ctx).await - } + BrokerSpec::BrokerPodSpec(p) => handle_instance_change_pod(instance, p, ctx).await, BrokerSpec::BrokerJobSpec(j) => { handle_instance_change_job( instance, *configuration.metadata.generation.as_ref().unwrap(), j, - action, ctx.client.clone(), ) .await @@ -373,17 +343,24 @@ pub async fn handle_instance_change( } /// Called when an Instance has changed that requires a Job broker. Action determined by InstanceAction. -/// InstanceAction::Add => Deploy a Job with JobSpec from Configuration. Label with Instance name. -/// InstanceAction::Remove => Delete all Jobs labeled with the Instance name -/// InstanceAction::Update => No nothing +/// First check if a job with the instance name exists. If it does, do nothing. Otherwise, deploy a Job +/// with JobSpec from Configuration and label with Instance name. 
pub async fn handle_instance_change_job( instance: &Instance, config_generation: i64, job_spec: &JobSpec, - action: &InstanceAction, client: Arc, ) -> anyhow::Result<()> { - trace!("handle_instance_change_job - enter {:?}", action); + trace!("handle_instance_change_job - enter"); + let api: Box> = client.namespaced(instance.metadata.namespace.as_ref().unwrap()); + if api + .get(instance.metadata.name.as_ref().unwrap()) + .await? + .is_some() + { + // Job already exists, do nothing + return Ok(()); + } // Create name for Job. Includes Configuration generation in the suffix // to track what version of the Configuration the Job is associated with. let job_name = pod::create_broker_app_name( @@ -395,53 +372,23 @@ pub async fn handle_instance_change_job( let instance_name = instance.metadata.name.as_ref().unwrap(); let instance_namespace = instance.metadata.namespace.as_ref().unwrap(); - let instance_uid = instance - .metadata - .uid - .as_ref() - .ok_or_else(|| anyhow::anyhow!("UID not found for instance: {}", &instance_name))?; - match action { - InstanceAction::Add => { - trace!("handle_instance_change_job - instance added"); - let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name); - let new_job = job::create_new_job_from_spec( - instance, - OwnershipInfo::new( - OwnershipType::Instance, - instance_name.to_string(), - instance_uid.to_string(), - ), - &capability_id, - job_spec, - &job_name, - )?; - let api: Box> = client.namespaced(instance_namespace); - api.create(&new_job).await?; - } - InstanceAction::Remove => { - trace!("handle_instance_change_job - instance removed"); - // Find all jobs with the label - let api: Box> = client.namespaced(instance_namespace); - let lp = ListParams::default() - .labels(&format!("{}={}", AKRI_INSTANCE_LABEL_NAME, instance_name)); - match api.delete_collection(&lp).await? 
{ - either::Either::Left(list) => { - let names: Vec<_> = list.iter().map(ResourceExt::name_any).collect(); - trace!("handle_instance_change_job - deleting jobs: {:?}", names); - } - either::Either::Right(status) => { - println!( - "handle_instance_change_job - deleted jobs: status={:?}", - status - ); - } - } - } - InstanceAction::Update => { - trace!("handle_instance_change_job - instance updated"); - // TODO: Broker could have encountered unexpected admission error and need to be removed and added - } - } + let instance_uid = instance.metadata.uid.as_ref().unwrap(); + trace!("handle_instance_change_job - instance added"); + let capability_id = format!("{}/{}", AKRI_PREFIX, instance_name); + let new_job = job::create_new_job_from_spec( + instance, + OwnershipInfo::new( + OwnershipType::Instance, + instance_name.to_string(), + instance_uid.to_string(), + ), + &capability_id, + job_spec, + &job_name, + )?; + let api: Box> = client.namespaced(instance_namespace); + // TODO: Consider using server side apply instead of create + api.create(&new_job).await?; Ok(()) } @@ -454,19 +401,15 @@ pub async fn handle_instance_change_job( pub async fn handle_instance_change_pod( instance: &Instance, podspec: &PodSpec, - action: &InstanceAction, ctx: Arc, ) -> anyhow::Result<()> { - trace!("handle_instance_change_pod - enter {:?}", action); + trace!("handle_instance_change_pod - enter"); let instance_name = instance.metadata.name.clone().unwrap(); // If InstanceAction::Remove, assume all nodes require PodAction::NoAction (reflect that there is no running Pod unless we find one) // Otherwise, assume all nodes require PodAction::Add (reflect that there is no running Pod, unless we find one) - let default_action = match action { - InstanceAction::Remove => PodAction::NoAction, - _ => PodAction::Add, - }; + let default_action = PodAction::Add; let mut nodes_to_act_on: HashMap = instance .spec .nodes @@ -505,7 +448,7 @@ pub async fn handle_instance_change_pod( instance_pods .items 
.iter() - .try_for_each(|x| determine_action_for_pod(x, action, &mut nodes_to_act_on))?; + .try_for_each(|x| determine_action_for_pod(x, &mut nodes_to_act_on))?; trace!( "handle_instance_change - nodes tracked after querying existing pods={:?}", @@ -602,6 +545,16 @@ mod handle_instance_tests { use chrono::Utc; use mockall::predicate::*; + #[derive(Clone, Debug, PartialEq)] + pub enum InstanceAction { + /// An Instance is added + Add, + /// An Instance is removed + Remove, + /// An Instance is updated + Update, + } + #[derive(Clone)] struct HandleInstanceWork { find_pods_selector: &'static str, @@ -828,32 +781,6 @@ mod handle_instance_tests { .await; } - #[tokio::test] - async fn test_handle_instance_change_for_remove_running_local_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - - let mut mock = MockControllerKubeClient::default(); - configure_for_handle_instance_change( - &mut mock, - &HandleInstanceWork { - find_pods_selector: "akri.sh/instance=config-a-b494b6", - find_pods_result: "../test/json/running-pod-list-for-config-a-local.json", - find_pods_phase: None, - find_pods_start_time: None, - find_pods_delete_start_time: false, - config_work: get_config_work(), - deletion_work: Some(configure_deletion_work_for_config_a_b494b6()), - addition_work: None, - }, - ); - run_handle_instance_change_test( - Arc::new(ControllerContext::new(Arc::new(mock), "test")), - "../test/json/local-instance.json", - &InstanceAction::Remove, - ) - .await; - } - #[tokio::test] async fn test_handle_instance_change_for_add_new_shared_instance() { let _ = env_logger::builder().is_test(true).try_init(); @@ -882,32 +809,6 @@ mod handle_instance_tests { .await; } - #[tokio::test] - async fn test_handle_instance_change_for_remove_running_shared_instance() { - let _ = env_logger::builder().is_test(true).try_init(); - - let mut mock = MockControllerKubeClient::default(); - configure_for_handle_instance_change( - &mut mock, - &HandleInstanceWork { - find_pods_selector: 
"akri.sh/instance=config-a-359973", - find_pods_result: "../test/json/running-pod-list-for-config-a-shared.json", - find_pods_phase: None, - find_pods_start_time: None, - find_pods_delete_start_time: false, - config_work: get_config_work(), - deletion_work: Some(configure_deletion_work_for_config_a_359973()), - addition_work: None, - }, - ); - run_handle_instance_change_test( - Arc::new(ControllerContext::new(Arc::new(mock), "test")), - "../test/json/shared-instance.json", - &InstanceAction::Remove, - ) - .await; - } - #[tokio::test] async fn test_handle_instance_change_for_update_active_shared_instance() { let _ = env_logger::builder().is_test(true).try_init(); diff --git a/controller/src/util/node_watcher.rs b/controller/src/util/node_watcher.rs index 20cd1aa6b..a188215c4 100644 --- a/controller/src/util/node_watcher.rs +++ b/controller/src/util/node_watcher.rs @@ -12,14 +12,18 @@ use crate::util::{ use akri_shared::k8s::api::Api; use akri_shared::akri::instance::{device_usage::NodeUsage, Instance}; +use anyhow::Context; use futures::StreamExt; use k8s_openapi::api::core::v1::{Node, NodeStatus}; use kube::{ - api::{ListParams, ObjectList, ResourceExt}, + api::{ + ListParams, NotUsed, Object, ObjectList, ObjectMeta, Patch, PatchParams, ResourceExt, + TypeMeta, + }, runtime::{ controller::{Action, Controller}, - finalizer::finalizer, - finalizer::Event, + finalizer::{finalizer, Event}, + reflector::Lookup, watcher::Config, }, }; @@ -167,18 +171,10 @@ fn is_node_ready(k8s_node: &Node) -> bool { .conditions .as_ref() .unwrap_or(&Vec::new()) - .iter() - .filter_map(|condition| { - if condition.type_ == "Ready" { - Some(condition.status == "True") - } else { - None - } - }) - .collect::>() .last() - .unwrap_or(&false) - == &true + .map_or(false, |condition| { + condition.type_ == "Ready" && condition.status == "True" + }) } /// This handles when a node disappears by clearing nodes from @@ -204,14 +200,8 @@ async fn handle_node_disappearance( "handle_node_disappearance 
- make sure node is not referenced here: {:?}", &instance_name ); - try_remove_nodes_from_instance( - vanished_node_name, - &instance_name, - &instance, - api.as_ref(), - &ctx.identifier, - ) - .await?; + try_remove_nodes_from_instance(vanished_node_name, &instance_name, &instance, api.as_ref()) + .await?; } trace!("handle_node_disappearance - exit"); @@ -223,10 +213,9 @@ async fn handle_node_disappearance( /// the instance in etcd, any failure is returned. async fn try_remove_nodes_from_instance( vanished_node_name: &str, - _instance_name: &str, + instance_name: &str, instance: &Instance, api: &dyn Api, - field_manager: &str, ) -> Result<(), anyhow::Error> { trace!( "try_remove_nodes_from_instance - vanished_node_name: {:?}", @@ -252,10 +241,26 @@ async fn try_remove_nodes_from_instance( Err(_) => (slot.to_owned(), usage.into()), }) .collect::>(); - let mut patched = instance.clone(); - patched.spec.device_usage = modified_device_usage; - patched.spec.nodes = modified_nodes; - api.apply(patched, field_manager).await?; + let mut modified_spec = instance.spec.clone(); + modified_spec.nodes = modified_nodes; + modified_spec.device_usage = modified_device_usage; + let patch = Patch::Merge( + serde_json::to_value(Object { + types: Some(TypeMeta { + api_version: Instance::api_version(&()).to_string(), + kind: Instance::kind(&()).to_string(), + }), + status: None::, + spec: modified_spec, + metadata: ObjectMeta { + name: Some(instance_name.to_string()), + ..Default::default() + }, + }) + .context("Could not create instance patch")?, + ); + api.raw_patch(instance_name, &patch, &PatchParams::default()) + .await?; Ok(()) } @@ -390,9 +395,15 @@ mod tests { .expect_list() .return_once(|_| instances_list(instance_name, "unused")); instance_api_mock - .expect_apply() - .return_once(|_, _| Ok(Instance::new("unused", InstanceSpec::default()))) - .withf(|i, _| !i.spec.nodes.contains(&"node-a".to_owned())); + .expect_raw_patch() + .return_once(|_, _, _| Ok(Instance::new("unused", 
InstanceSpec::default()))) + .withf(|_, patch, _| match patch { + Patch::Merge(v) => { + let instance: Instance = serde_json::from_value(v.clone()).unwrap(); + !instance.spec.nodes.contains(&"node-a".to_owned()) + } + _ => false, + }); mock.instance .expect_all() .return_once(move || Box::new(instance_api_mock)); @@ -428,9 +439,15 @@ mod tests { .expect_list() .return_once(|_| instances_list(instance_name, "unused")); instance_api_mock - .expect_apply() - .return_once(|_, _| Ok(Instance::new("unused", InstanceSpec::default()))) - .withf(|i, _| !i.spec.nodes.contains(&"node-a".to_owned())); + .expect_raw_patch() + .return_once(|_, _, _| Ok(Instance::new("unused", InstanceSpec::default()))) + .withf(|_, patch, _| match patch { + Patch::Merge(v) => { + let instance: Instance = serde_json::from_value(v.clone()).unwrap(); + !instance.spec.nodes.contains(&"node-a".to_owned()) + } + _ => false, + }); mock.instance .expect_all() .return_once(move || Box::new(instance_api_mock)); @@ -462,9 +479,15 @@ mod tests { .expect_list() .return_once(|_| instances_list(instance_name, "unused")); instance_api_mock - .expect_apply() - .return_once(|_, _| Ok(Instance::new("unused", InstanceSpec::default()))) - .withf(|i, _| !i.spec.nodes.contains(&"node-a".to_owned())); + .expect_raw_patch() + .return_once(|_, _, _| Ok(Instance::new("unused", InstanceSpec::default()))) + .withf(|_, patch, _| match patch { + Patch::Merge(v) => { + let instance: Instance = serde_json::from_value(v.clone()).unwrap(); + !instance.spec.nodes.contains(&"node-a".to_owned()) + } + _ => false, + }); mock.instance .expect_all() .return_once(move || Box::new(instance_api_mock)); diff --git a/controller/src/util/pod_action.rs b/controller/src/util/pod_action.rs index 71b5172b6..ecfb6b2b3 100644 --- a/controller/src/util/pod_action.rs +++ b/controller/src/util/pod_action.rs @@ -1,4 +1,3 @@ -use super::instance_action::InstanceAction; use chrono::Utc; use k8s_openapi::apimachinery::pkg::apis::meta::v1::Time; @@ -23,7 
+22,6 @@ pub enum PodAction { /// a broker Pod. /// /// The action to take is based on several factors: -/// 1. what the InstanceAction is (Add, Delete, Modify) /// 1. what phase the Pod is in (Running, Pending, etc) /// 1. when the Pod started /// 1. the relevant grace time @@ -32,7 +30,6 @@ pub struct PodActionInfo { pub pending_grace_time_in_minutes: i64, pub ended_grace_time_in_minutes: i64, pub phase: String, - pub instance_action: InstanceAction, pub status_start_time: Option