diff --git a/.gitignore b/.gitignore index c2a6e3b91..b338dc773 100644 --- a/.gitignore +++ b/.gitignore @@ -27,5 +27,8 @@ cuebot/.project /pycue/opencue/compiled_proto/ /rqd/rqd/compiled_proto/ docker-compose-local.yml +/sandbox/kafka* +/sandbox/zookeeper* docs/_site/ docs/bin/ + diff --git a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java index af6c54b99..23a527643 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java +++ b/cuebot/src/main/java/com/imageworks/spcue/dispatcher/DispatchSupportService.java @@ -19,6 +19,13 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.springframework.dao.DataAccessException; +import org.springframework.dao.EmptyResultDataAccessException; +import org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; + import com.imageworks.spcue.AllocationInterface; import com.imageworks.spcue.DispatchFrame; import com.imageworks.spcue.DispatchHost; @@ -35,13 +42,6 @@ import com.imageworks.spcue.ShowInterface; import com.imageworks.spcue.StrandedCores; import com.imageworks.spcue.VirtualProc; -import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.LogManager; -import org.springframework.dao.EmptyResultDataAccessException; -import org.springframework.dao.DataAccessException; -import org.springframework.transaction.annotation.Propagation; -import org.springframework.transaction.annotation.Transactional; - import com.imageworks.spcue.dao.BookingDao; import com.imageworks.spcue.dao.DispatcherDao; import com.imageworks.spcue.dao.FrameDao; diff --git a/rust/.gitignore b/rust/.gitignore index 1b699df37..eedddf488 100644 --- a/rust/.gitignore +++ b/rust/.gitignore @@ -3,7 +3,7 @@ # TODO: Remove once these crates are stable and ready for public use /crates/cuebot-config /crates/dist-lock -/crates/scheduler .DS_Store config/rqd.local_docker.yaml +/sandbox/kafka* diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 2431ea2c2..5cc475f2e 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -80,6 +80,17 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d902e3d592a523def97af8f317b08ce16b7ab854c1985a0c671e6f15cebc236" +[[package]] +name = "async-lock" +version = "3.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd03604047cee9b6ce9de9f70c6cd540a0520c813cbd49bae61f33ab80ed1dc" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + [[package]] name = "async-stream" version = "0.1.2" @@ -112,6 +123,15 @@ dependencies = [ "syn 2.0.104", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -216,6 +236,12 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64ct" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" + [[package]] name = 
"bincode" version = "1.3.3" @@ -322,9 +348,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.29" +version = "1.2.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c1599538de2394445747c8cf7935946e3cc27e9625f889d979bfb2aaf569362" +checksum = "2352e5597e9c544d5e6d9c95190d5d27738ade584fa8db0a16e130e5c2b5296e" dependencies = [ "shlex", ] @@ -371,6 +397,15 @@ dependencies = [ "vec_map", ] +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "config" version = "0.14.1" @@ -390,6 +425,12 @@ dependencies = [ "yaml-rust2", ] +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "const-random" version = "0.1.18" @@ -444,6 +485,21 @@ dependencies = [ "libc", ] +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -472,6 +528,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -507,6 +572,17 @@ dependencies = [ "parking_lot_core", ] +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + [[package]] name = "deranged" version = "0.4.0" @@ -538,6 +614,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", + "const-oid", "crypto-common", "subtle", ] @@ -562,6 +639,12 @@ dependencies = [ "const-random", ] +[[package]] +name = "dotenvy" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + [[package]] name = "dummy-cuebot" version = "0.1.0" @@ -589,15 +672,18 @@ dependencies = [ [[package]] name = "dyn-clone" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c7a8fb8a9fbf66c1f703fe16184d10ca0ee9d23be5b4436400408ba54a95005" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +dependencies = [ + "serde", +] [[package]] name = "encoding_rs" @@ -624,6 +710,38 @@ dependencies = [ "windows-sys 0.60.2", ] +[[package]] +name = "etcetera" +version = "0.8.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "fallible-iterator" version = "0.2.0" @@ -642,12 +760,29 @@ version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" +[[package]] +name = "flume" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +dependencies = [ + "futures-core", + "futures-sink", + "spin", +] + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -705,6 +840,17 @@ dependencies = [ "futures-util", ] +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + [[package]] name = "futures-io" version = "0.3.31" @@ -752,6 +898,20 @@ dependencies = [ "slab", ] +[[package]] +name = "generator" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d18470a76cb7f8ff746cf1f7470914f900252ec36bbc40b569d74b1258446827" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows 0.61.3", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -793,9 +953,9 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "h2" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17da50a276f1e01e0ba6c029e47b7100754904ee8a278f886546e98575380785" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" dependencies = [ "atomic-waker", "bytes", @@ -828,9 +988,14 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.4" +version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] [[package]] name = "hashlink" @@ -841,6 +1006,15 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "hashlink" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" +dependencies = [ + "hashbrown 0.15.5", +] + [[package]] name = "heck" version = "0.3.3" @@ -871,6 +1045,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + [[package]] name = "hmac" version = "0.12.1" @@ -880,6 +1063,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "http" version = "1.3.1" @@ -993,9 +1185,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f66d5bd4c6f02bf0542fad85d626775bab9258cf795a4256dcaf3161114d1df" +checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" dependencies = [ "bytes", "futures-channel", @@ -1006,7 +1198,7 @@ dependencies = [ "hyper", "libc", "pin-project-lite", - "socket2", + "socket2 0.6.0", "tokio", "tower-service", "tracing", @@ -1176,15 +1368,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" dependencies = [ "equivalent", - "hashbrown 0.15.4", + "hashbrown 0.15.5", "serde", ] [[package]] name = "io-uring" -version = "0.7.8" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b86e202f00093dcba4275d4636b93ef9dd75d025ae560d2521b45ea28ab49013" +checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" dependencies = [ "bitflags 2.9.1", "cfg-if", @@ -1247,6 +1439,9 @@ name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin", +] [[package]] name = "libc" @@ -1254,6 +1449,34 @@ version = "0.2.174" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" +[[package]] +name = "libm" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" + +[[package]] +name = "libsqlite3-sys" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e99fb7a497b1e3339bc746195567ed8d3e24945ecd636e3619d20b9de9e9149" +dependencies = [ + "pkg-config", + "vcpkg", +] + +[[package]] +name = "libz-sys" +version = "1.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b70e7a7df205e92a1a4cd9aaae7898dac0aa555503cc0a649494d0d60e7651d" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "linux-raw-sys" version = "0.9.4" @@ -1282,6 +1505,19 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" +[[package]] +name = "loom" +version = "0.7.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + [[package]] name = "macos-accessibility-client" version = "0.0.1" @@ -1292,6 +1528,15 @@ dependencies = [ "core-foundation-sys", ] +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + [[package]] name = "matchit" version = "0.8.4" @@ -1376,6 +1621,28 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "moka" +version = "0.12.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9321642ca94a4282428e6ea4af8cc2ca4eac48ac7a6a4ea8f33f76d0ce70926" +dependencies = [ + "async-lock", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "event-listener", + "futures-util", + "loom", + "parking_lot", + "portable-atomic", + "rustc_version", + "smallvec", + "tagptr", + "thiserror 1.0.69", + "uuid", +] + [[package]] name = "multimap" version = "0.10.1" @@ -1423,12 +1690,49 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + [[package]] name = "num-conv" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1436,6 +1740,29 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", +] + +[[package]] +name = "num_enum" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a" +dependencies = [ + "num_enum_derive", + "rustversion", +] + +[[package]] +name = "num_enum_derive" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn 2.0.104", ] [[package]] @@ -1492,6 +1819,12 @@ version = "4.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48dd4f4a2c8405440fd0462561f0e5806bd0f77e86f51c761481bdd4018b545e" +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] 
name = "parking_lot" version = "0.12.4" @@ -1527,6 +1860,15 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df94ce210e5bc13cb6651479fa48d14f601d9858cfe0467f43ae157023b938d3" +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1637,12 +1979,39 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "portable-atomic" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" + [[package]] name = "postgres-protocol" version = "0.6.8" @@ -1656,7 +2025,7 @@ dependencies = [ "hmac", "md-5", "memchr", - "rand 0.9.1", + "rand 0.9.2", "sha2", "stringprep", ] @@ -1698,14 +2067,23 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.35" +version = "0.2.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061c1221631e079b26479d25bbf2275bfe5917ae8419cd7e34f13bfc2aa7539a" +checksum = "ff24dfcda44452b9816fff4cd4227e1bb73ff5a2f1bc1105aa92fb8565ce44d2" dependencies = [ "proc-macro2", "syn 2.0.104", ] +[[package]] +name = "proc-macro-crate" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -1819,9 +2197,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", @@ -1886,22 +2264,52 @@ dependencies = [ ] [[package]] -name = "readkey" -version = "0.2.2" +name = "rdkafka" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a36870cefdfcff57edbc0fa62165f42dfd4e5a0d8965117c1ea84c5700e4450" +checksum = "5f1856d72dbbbea0d2a5b2eaf6af7fb3847ef2746e883b11781446a51dbc85c0" +dependencies = [ + "futures-channel", + "futures-util", + "libc", + "log", + "rdkafka-sys", + "serde", + "serde_derive", + "serde_json", + "slab", + "tokio", +] [[package]] -name = "readmouse" +name = "rdkafka-sys" +version = "4.9.0+2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5230dca48bc354d718269f3e4353280e188b610f7af7e2fcf54b7a79d5802872" +dependencies = [ + "libc", + "libz-sys", + "num_enum", + "pkg-config", +] + +[[package]] +name = "readkey" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a36870cefdfcff57edbc0fa62165f42dfd4e5a0d8965117c1ea84c5700e4450" + +[[package]] +name = "readmouse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be105c72a1e6a5a1198acee3d5b506a15676b74a02ecd78060042a447f408d94" [[package]] name = "redox_syscall" -version = "0.5.13" +version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ "bitflags 2.9.1", ] @@ -1934,8 +2342,17 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", - "regex-automata", - "regex-syntax", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", ] [[package]] @@ -1946,9 +2363,15 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.5", ] +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.5" @@ -2017,7 +2440,7 @@ dependencies = [ "opencue-proto", "pin-project-lite", "prost", - "rand 0.9.1", + "rand 0.9.2", "regex", "serde", "serde_derive", @@ -2038,6 +2461,26 @@ dependencies = [ "uuid", ] +[[package]] +name = "rsa" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78928ac1ed176a5ca1d17e578a1825f3d81ca54cf41053a592584b020cfd691b" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core 0.6.4", + "signature", + "spki", + "subtle", + "zeroize", +] + [[package]] name = "rust-ini" version = "0.20.0" @@ -2050,21 +2493,30 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.25" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" + +[[package]] +name = "rustc_version" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] [[package]] name = "rustix" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" dependencies = [ "bitflags 2.9.1", "errno", "libc", "linux-raw-sys", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -2079,6 +2531,44 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "scheduler" +version = "0.1.0" +dependencies = [ + "async-stream", + "async-trait", + "bytesize", + "chrono", + "config", + "dashmap", + "futures", + "humantime", + "humantime-serde", + "indexmap 2.10.0", + "itertools 0.13.0", + "lazy_static", + "miette", + "moka", + "once_cell", + "opencue-proto", + "prost", + "rdkafka", + "regex", + "serde", + "serde_derive", + "serde_json", + "sqlx", + "structopt", + "thiserror 1.0.69", + "tokio", + "tonic", + "tracing", + "tracing-appender", + "tracing-rolling-file", + "tracing-subscriber", + "uuid", +] + [[package]] name = "schemars" version = "0.9.0" @@ -2103,12 +2593,24 @@ dependencies = [ "serde_json", ] +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + [[package]] name = "scopeguard" version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.219" @@ -2131,9 +2633,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.142" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7" dependencies = [ "itoa", "memchr", @@ -2192,6 +2694,17 @@ dependencies = [ "time", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sha2" version = "0.10.9" @@ -2220,13 +2733,23 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook-registry" -version = "1.4.5" +version = "1.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + [[package]] name = "siphasher" version = "1.0.1" @@ -2235,15 +2758,18 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04dc19736151f35336d325007ac991178d504a119863a2fcb3758cdb5e52c50d" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "smallvec" version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +dependencies = [ + "serde", +] [[package]] name = "socket2" @@ -2255,6 +2781,223 @@ dependencies = [ "windows-sys 0.52.0", 
] +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlx" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fefb893899429669dcdd979aff487bd78f4064e5e7907e4269081e0ef7d97dc" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee6798b1838b6a0f69c007c133b8df5866302197e404e8b6ee8ed3e3a5e68dc6" +dependencies = [ + "base64 0.22.1", + "bytes", + "crc", + "crossbeam-queue", + "either", + "event-listener", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashbrown 0.15.5", + "hashlink 0.10.0", + "indexmap 2.10.0", + "log", + "memchr", + "once_cell", + "percent-encoding", + "serde", + "serde_json", + "sha2", + "smallvec", + "thiserror 2.0.12", + "tokio", + "tokio-stream", + "tracing", + "url", +] + +[[package]] +name = "sqlx-macros" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2d452988ccaacfbf5e0bdbc348fb91d7c8af5bee192173ac3636b5fb6e6715d" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 2.0.104", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19a9c1841124ac5a61741f96e1d9e2ec77424bf323962dd894bdb93f37d5219b" +dependencies = [ + "dotenvy", + "either", + "heck 0.5.0", + "hex", + "once_cell", + "proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 2.0.104", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa003f0038df784eb8fecbbac13affe3da23b45194bd57dba231c8f48199c526" +dependencies = [ + "atoi", + "base64 0.22.1", + "bitflags 2.9.1", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand 0.8.5", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror 2.0.12", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db58fcd5a53cf07c184b154801ff91347e4c30d17a3562a635ff028ad5deda46" +dependencies = [ + "atoi", + "base64 0.22.1", + "bitflags 2.9.1", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + 
"memchr", + "once_cell", + "rand 0.8.5", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror 2.0.12", + "tracing", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2d12fe70b2c1b4401038055f90f151b78208de1f9f89a7dbfd41587a10c3eea" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "serde_urlencoded", + "sqlx-core", + "thiserror 2.0.12", + "tracing", + "url", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -2382,6 +3125,12 @@ dependencies = [ "windows 0.57.0", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + [[package]] name = "tempfile" version = "3.20.0" @@ -2540,9 +3289,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.46.1" +version = "1.47.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cc3a2344dafbe23a245241fe8b09735b521110d30fcefbbd5feb1797ca35d17" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" dependencies = [ "backtrace", "bytes", @@ -2553,9 +3302,9 @@ dependencies = [ "pin-project-lite", "signal-hook-registry", "slab", - "socket2", + "socket2 0.6.0", "tokio-macros", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2588,8 +3337,8 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "postgres-types", - "rand 0.9.1", - "socket2", + "rand 0.9.2", + "socket2 0.5.10", "tokio", "tokio-util", "whoami", @@ -2608,9 +3357,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.15" +version = "0.7.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" +checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" dependencies = [ "bytes", "futures-core", @@ -2680,7 +3429,7 @@ dependencies = [ "percent-encoding", "pin-project", "prost", - "socket2", + "socket2 0.5.10", "tokio", "tokio-stream", "tower", @@ -2740,6 +3489,7 @@ version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ + "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -2804,10 +3554,14 @@ version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", ] @@ -2926,6 +3680,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "vec_map" version = "0.8.2" @@ -3088,6 +3848,28 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows" +version = "0.61.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9babd3a767a4c1aef6900409f85f5d53ce2544ccdfaa86dad48c91782c6d6893" +dependencies = [ + "windows-collections", + "windows-core 0.61.2", + "windows-future", + "windows-link", + "windows-numerics", +] + +[[package]] +name = "windows-collections" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3beeceb5e5cfd9eb1d76b381630e82c4241ccd0d27f1a39ed41b2760b255c5e8" +dependencies = [ + "windows-core 0.61.2", +] + [[package]] name = "windows-core" version = "0.57.0" @@ -3113,6 +3895,17 @@ dependencies = [ "windows-strings", ] +[[package]] +name = "windows-future" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc6a41e98427b19fe4b73c550f060b59fa592d7d686537eebf9385621bfbad8e" +dependencies = [ + "windows-core 0.61.2", + "windows-link", + "windows-threading", +] + [[package]] name = "windows-implement" version = "0.57.0" @@ -3163,6 +3956,16 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-numerics" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9150af68066c4c5c07ddc0ce30421554771e528bde427614c61038bc2c92c2b1" +dependencies = [ + "windows-core 0.61.2", + "windows-link", +] + [[package]] name = "windows-result" version = "0.1.2" @@ -3190,6 +3993,15 @@ dependencies = [ "windows-link", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -3214,7 +4026,7 @@ version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets 0.53.2", + "windows-targets 0.53.3", ] [[package]] @@ -3250,10 +4062,11 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.2" +version = "0.53.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" dependencies = [ + "windows-link", "windows_aarch64_gnullvm 0.53.0", "windows_aarch64_msvc 0.53.0", "windows_i686_gnu 0.53.0", @@ -3264,6 +4077,15 @@ dependencies = [ "windows_x86_64_msvc 0.53.0", ] +[[package]] +name = "windows-threading" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66463ad2e0ea3bbf808b7f1d371311c80e115c0b71d60efc142cafbcfb057a6" +dependencies = [ + "windows-link", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.5" @@ -3404,9 +4226,9 @@ checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" [[package]] name = "winnow" -version = "0.7.11" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74c7b26e3480b707944fc872477815d29a8e429d2f93a1ce000f5fa84a15cbcd" +checksum = "f3edebf492c8125044983378ecb5766203ad3b4c2f7a922bd7dd207f6d443e95" dependencies = [ "memchr", ] @@ -3444,7 +4266,7 @@ checksum = "8902160c4e6f2fb145dbe9d6760a75e3c9522d8bf796ed7047c85919ac7115f8" dependencies = [ "arraydeque", "encoding_rs", - 
"hashlink", + "hashlink 0.8.4", ] [[package]] @@ -3512,6 +4334,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + [[package]] name = "zerotrie" version = "0.2.2" @@ -3525,9 +4353,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.2" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" dependencies = [ "yoke", "zerofrom", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index f9ab1947f..6dc253397 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -1,5 +1,10 @@ [workspace] -members = ["crates/opencue-proto", "crates/rqd", "crates/dummy-cuebot"] +members = [ + "crates/opencue-proto", + "crates/rqd", + "crates/dummy-cuebot", + "crates/scheduler", +] [workspace.package] authors = ["Diego Tavares "] @@ -28,3 +33,4 @@ tracing = "0.1.40" tracing-appender = "0.2.3" tracing-rolling-file = "0.1.2" tracing-subscriber = "0.3.18" +structopt = "0.3.26" diff --git a/rust/config/rqd.fake_linux.yaml b/rust/config/rqd.dummy.cuebot.yaml similarity index 89% rename from rust/config/rqd.fake_linux.yaml rename to rust/config/rqd.dummy.cuebot.yaml index 4083bcc92..a4e2544c0 100644 --- a/rust/config/rqd.fake_linux.yaml +++ b/rust/config/rqd.dummy.cuebot.yaml @@ -2,7 +2,8 @@ logging: level: info grpc: rqd_port: 8444 - cuebot_endpoints: ["0.0.0.0:4343", "0.0.0.0:4343"] + # cuebot_endpoints: ["0.0.0.0:4343", "0.0.0.0:4343"] + cuebot_endpoints: ["localhost:8443"] connection_expires_after: 15m machine: # nimby_mode: true @@ -14,7 +15,7 @@ machine: distro_release_path: "/Users/dtavares/dev/rust_opencue/crates/rqd/resources/distro-release/rocky" proc_stat_path: "/Users/dtavares/dev/rust_opencue/crates/rqd/resources/proc/stat" proc_loadavg_path: "/Users/dtavares/dev/rust_opencue/crates/rqd/resources/proc/loadavg" - temp_path: "/tmp" + temp_path: "/Users/dtavares/tmp" use_session_id_for_proc_lineage: true runner: snapshots_path: $HOME/.rqd/snapshots diff --git a/rust/config/rqd.local.cuebot.yaml b/rust/config/rqd.local.cuebot.yaml new file mode 100644 index 000000000..a4e2544c0 --- /dev/null +++ b/rust/config/rqd.local.cuebot.yaml @@ -0,0 +1,41 @@ +logging: + level: info +grpc: + rqd_port: 8444 + # cuebot_endpoints: ["0.0.0.0:4343", "0.0.0.0:4343"] + cuebot_endpoints: ["localhost:8443"] + connection_expires_after: 15m +machine: + # nimby_mode: true + # nimby_idle_threshold: 60s + worker_threads: 8 + facility: test + monitor_interval: 3s + cpuinfo_path: "/Users/dtavares/dev/rust_opencue/crates/rqd/resources/cpuinfo/cpuinfo_srdsvr09_48-12-4" + distro_release_path: "/Users/dtavares/dev/rust_opencue/crates/rqd/resources/distro-release/rocky" + proc_stat_path: "/Users/dtavares/dev/rust_opencue/crates/rqd/resources/proc/stat" + proc_loadavg_path: "/Users/dtavares/dev/rust_opencue/crates/rqd/resources/proc/loadavg" + temp_path: "/Users/dtavares/tmp" + use_session_id_for_proc_lineage: true +runner: + snapshots_path: $HOME/.rqd/snapshots + kill_monitor_interval: 10s + kill_monitor_timeout: 60s + force_kill_after_timeout: true + docker.mounts: + - target: "" + source: "" + typ: "" + bind-propagation: "" + docker.images: + rhel7: "centos7:latest" + rocky9: "rockyimage:latest" +# monitor_interval_seconds: 3 +# use_ip_as_hostname: false +# 
nimby_mode: false +# override_real_values: +# cores: 4 +# procs: 8 +# memory: "2Gb" +# desktop_mode: true +# hostname: "some_host_name" diff --git a/rust/config/scheduler.yaml b/rust/config/scheduler.yaml new file mode 100644 index 000000000..cc6d9f1c4 --- /dev/null +++ b/rust/config/scheduler.yaml @@ -0,0 +1,21 @@ +# ============================================================================= +# LOGGING CONFIGURATION +# ============================================================================= +logging: + # Logging level for scheduler output + # Options: debug, info, warning, error + # Default: debug + level: debug + +database: + connection_url: "postgresql://cuebot:cuebot_password@localhost:5432/cuebot" + +kafka: + bootstrap_servers: "localhost:9092" + general_jobs_topic: + num_partitions: 12 + replication_factor: 1 + retention: 5m + +rqd: + dry_run_mode: true diff --git a/rust/crates/dummy-cuebot/Cargo.toml b/rust/crates/dummy-cuebot/Cargo.toml index 467f958a1..5be2a7f41 100644 --- a/rust/crates/dummy-cuebot/Cargo.toml +++ b/rust/crates/dummy-cuebot/Cargo.toml @@ -29,7 +29,7 @@ tokio = { workspace = true } tokio-postgres = "0.7.12" tonic = { workspace = true } users = "0.11" -structopt = "0.3.26" +structopt = { workspace = true } [dev-dependencies] tempfile = "3.14.0" diff --git a/rust/crates/scheduler/Cargo.toml b/rust/crates/scheduler/Cargo.toml new file mode 100644 index 000000000..fe2283f38 --- /dev/null +++ b/rust/crates/scheduler/Cargo.toml @@ -0,0 +1,48 @@ +[package] +name = "scheduler" +authors = { workspace = true } +edition = { workspace = true } +version = { workspace = true } +license = "Apache-2.0" +description = "OpenCue Server Side Job Queueing Service" + +[[bin]] +path = "src/main.rs" +name = "cue-scheduler" + +[dependencies] +# Internal Dependencies +opencue-proto = { path = "../opencue-proto" } + +# External Dependencies +chrono = "0.4.38" +futures = { workspace = true } +dashmap = { workspace = true } +serde = { version = "1.0", features = ["derive"] } +serde_derive = "1.0" +serde_json = "1.0" +async-trait = { workspace = true } +async-stream = { workspace = true } +config = { workspace = true } +thiserror = { workspace = true } +miette = { workspace = true } +tracing = { workspace = true } +tracing-appender = { workspace = true } +tracing-rolling-file = { workspace = true } +tracing-subscriber = { workspace = true } +uuid = { workspace = true, features = ["serde"] } +prost = { workspace = true } +tokio = { workspace = true } +tonic = { workspace = true } +itertools = "0.13.0" +humantime = "2.2.0" +humantime-serde = "1.1.1" +sqlx = { version = "0.8", features = ["runtime-tokio", "postgres"] } +rdkafka = "0.38.0" +structopt = { workspace = true } +once_cell = "1.13" +bytesize = { version = "1.2.0", features = ["serde"] } +regex = "1.0" +indexmap = "2.0" +lazy_static = "1.5" +moka = { version = "0.12.10", features = ["future"] } diff --git a/rust/crates/scheduler/src/config/error.rs b/rust/crates/scheduler/src/config/error.rs new file mode 100644 index 000000000..f9f57c03c --- /dev/null +++ b/rust/crates/scheduler/src/config/error.rs @@ -0,0 +1,32 @@ +use miette::Diagnostic; +use thiserror::Error; +use tonic::Status; + +//===Scheduler Config Error=== +#[derive(Debug, Error, Diagnostic)] +pub enum JobQueueConfigError { + #[error("Failed to load config file")] + LoadConfigError(String), + + #[error("Failed to start application via config file")] + StartFromConfigError(String), + + #[error("Invalid Path configuration")] + InvalidPath(String), +} + +impl From<JobQueueConfigError> for Status {
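+    // Map config failures onto gRPC statuses: load and path problems surface as INVALID_ARGUMENT, startup failures as INTERNAL.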
fn from(e: JobQueueConfigError) -> Self { + match e { + JobQueueConfigError::LoadConfigError(msg) => { + Status::invalid_argument(format!("Failed to load config: {}", msg)) + } + JobQueueConfigError::StartFromConfigError(msg) => { + Status::internal(format!("Failed to start: {}", msg)) + } + JobQueueConfigError::InvalidPath(msg) => { + Status::invalid_argument(format!("Invalid path: {}", msg)) + } + } + } +} diff --git a/rust/crates/scheduler/src/config/mod.rs b/rust/crates/scheduler/src/config/mod.rs new file mode 100644 index 000000000..beef539e1 --- /dev/null +++ b/rust/crates/scheduler/src/config/mod.rs @@ -0,0 +1,219 @@ +pub mod error; + +use crate::config::error::JobQueueConfigError; +use bytesize::ByteSize; +use config::{Config as ConfigBase, Environment, File}; +use lazy_static::lazy_static; +use serde::Deserialize; +use std::{env, time::Duration}; + +static DEFAULT_CONFIG_FILE: &str = "~/.local/share/rqd.yaml"; + +lazy_static! { + pub static ref CONFIG: Config = Config::load().expect("Failed to load config file"); +} + +//===Config Types=== + +#[derive(Debug, Deserialize, Default, Clone)] +#[serde(default)] +pub struct Config { + pub logging: LoggingConfig, + pub queue: QueueConfig, + pub database: DatabaseConfig, + pub kafka: KafkaConfig, + pub rqd: RqdConfig, +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct LoggingConfig { + // Logging level: debug|info|warning|error + pub level: String, + // Path to the log file if `file_appender` is enabled + pub path: String, + // Log to stdout if file_appender is False + pub file_appender: bool, +} + +impl Default for LoggingConfig { + fn default() -> Self { + Self { + level: "debug".to_string(), + path: "/opt/rqd/logs/scheduler.log".to_string(), + file_appender: false, + } + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct QueueConfig { + #[serde(with = "humantime_serde")] + pub monitor_interval: Duration, + pub worker_threads: usize, + pub dispatch_frames_per_layer_limit: usize, + pub core_multiplier: u32, + pub memory_stranded_threshold: ByteSize, + #[serde(with = "humantime_serde")] + pub job_back_off_duration: Duration, +} + +impl Default for QueueConfig { + fn default() -> QueueConfig { + QueueConfig { + monitor_interval: Duration::from_secs(5), + worker_threads: 4, + dispatch_frames_per_layer_limit: 20, + core_multiplier: 100, + memory_stranded_threshold: ByteSize::gib(2), + job_back_off_duration: Duration::from_secs(300), + } + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct DatabaseConfig { + pub pool_size: u32, + pub connection_url: String, + pub core_multiplier: u32, +} + +impl Default for DatabaseConfig { + fn default() -> DatabaseConfig { + DatabaseConfig { + pool_size: 10, + connection_url: "postgres://postgres:password@localhost/test".to_string(), + core_multiplier: 100, + } + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct KafkaConfig { + pub bootstrap_servers: String, + #[serde(with = "humantime_serde")] + pub timeout: Duration, + pub general_jobs_topic: TopicConfig, +} + +impl Default for KafkaConfig { + fn default() -> KafkaConfig { + KafkaConfig { + bootstrap_servers: "localhost:9092".to_string(), + timeout: Duration::from_secs(5), + general_jobs_topic: TopicConfig::default(), + } + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct TopicConfig { + pub topic_name: String, + pub num_partitions: i32, + pub replication_factor: i32, + #[serde(with = "humantime_serde")] + pub retention: Duration, 
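+    // Note: `retention` (and the other durations above) deserialize via humantime_serde, so YAML values like "5m" in config/scheduler.yaml parse directly into a std Duration.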
+} + +impl Default for TopicConfig { + fn default() -> TopicConfig { + TopicConfig { + topic_name: "general_job_queue".to_string(), + num_partitions: 12, + replication_factor: 3, + retention: Duration::from_secs(300), + } + } +} + +#[derive(Debug, Deserialize, Clone)] +#[serde(default)] +pub struct RqdConfig { + pub grpc_port: u32, + pub dry_run_mode: bool, +} + +impl Default for RqdConfig { + fn default() -> RqdConfig { + RqdConfig { + grpc_port: 8444, + dry_run_mode: false, + } + } +} +//===Config Loader=== + +impl Config { + // load the current config from the system config and environment variables + pub fn load() -> Result<Self, JobQueueConfigError> { + let mut required = false; + let config_file = match env::var("OPENCUE_JOB_QUEUE_CONFIG") { + Ok(v) => { + required = true; + v + } + Err(_) => DEFAULT_CONFIG_FILE.to_string(), + }; + + println!(" INFO Config::load: using config file: {:?}", config_file); + + let config = ConfigBase::builder() + .add_source(File::with_name(&config_file).required(required)) + .add_source(Environment::with_prefix("OPENRQD").separator("_")) + .build() + .map_err(|err| { + JobQueueConfigError::LoadConfigError(format!( + "{:?} config could not be loaded. {}", + &config_file, err + )) + })?; + + let deserialized_config = Config::deserialize(config).map_err(|err| { + JobQueueConfigError::LoadConfigError(format!( + "{:?} config could not be deserialized. {}", + &config_file, err + )) + })?; + + Ok(deserialized_config) + } + + #[allow(dead_code)] + pub fn load_file_and_env<P: AsRef<str>>(path: P) -> Result<Self, JobQueueConfigError> { + let config = ConfigBase::builder() + .add_source(File::with_name(path.as_ref())) + .add_source(Environment::with_prefix("RQD").separator("_")) + .build(); + + config + .map(|c| Config::deserialize(c).unwrap()) + .map_err(|err| { + JobQueueConfigError::LoadConfigError(format!( + "{:?} config could not be loaded. {}", + path.as_ref(), + err + )) + }) + } + + #[allow(dead_code)] + pub fn load_file<P: AsRef<str>>(path: P) -> Result<Self, JobQueueConfigError> { + let config = ConfigBase::builder() + .add_source(File::with_name(path.as_ref())) + .build(); + + config + .map(|c| Config::deserialize(c).unwrap()) + .map_err(|err| { + JobQueueConfigError::LoadConfigError(format!( + "{:?} config could not be loaded. {}", + path.as_ref(), + err + )) + }) + } +} diff --git a/rust/crates/scheduler/src/dao/frame_dao.rs b/rust/crates/scheduler/src/dao/frame_dao.rs new file mode 100644 index 000000000..1b119b4c4 --- /dev/null +++ b/rust/crates/scheduler/src/dao/frame_dao.rs @@ -0,0 +1,211 @@ +use std::sync::Arc; + +use futures::Stream; +use miette::Result; +use serde::{Deserialize, Serialize}; +use sqlx::{Pool, Postgres}; +use uuid::Uuid; + +use crate::{ + config::DatabaseConfig, + models::{CoreSize, DispatchFrame, DispatchLayer}, + pgpool::connection_pool, +}; + +/// Data Access Object for frame operations in the job dispatch system. +/// +/// Handles database queries related to frames, particularly for finding +/// dispatchable frames within layers that meet resource constraints. +pub struct FrameDao { + connection_pool: Arc<Pool<Postgres>>, +} + +/// Database model representing a frame ready for dispatch. +/// +/// Contains all the necessary information to dispatch a frame to a host, +/// including resource requirements, job metadata, and execution parameters. +/// This model maps directly to the database query results and is converted +/// to `DispatchFrame` for business logic processing.
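+/// Column and field names follow Cuebot's schema conventions (`pk_*` keys, `str_`/`int_`/`b_` typed prefixes), which lets the `sqlx::FromRow` derive map rows without per-field attributes.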
+#[derive(sqlx::FromRow, Serialize, Deserialize)] +pub struct DispatchFrameModel { + // Entity fields + pub pk_frame: String, + pub str_frame_name: String, + + // LayerEntity fields + pub pk_show: String, + pub pk_facility: String, + pub pk_job: String, + + // FrameEntity fields + pub pk_layer: String, + + // DispatchFrame specific fields + pub str_cmd: String, + pub str_range: String, + pub int_chunk_size: i64, + pub str_show: String, + pub str_shot: String, + pub str_user: String, + pub int_uid: Option<i64>, + pub str_log_dir: String, + pub str_layer_name: String, + pub str_job_name: String, + pub int_min_cores: i32, + pub int_mem_min: i64, + pub b_threadable: bool, + pub int_gpus_min: i64, + pub int_gpu_mem_min: i64, + // On Cuebot these fields come from constants, maybe replicate these constants here + // pub int_soft_memory_limit: i64, + // pub int_hard_memory_limit: i64, + pub str_services: Option<String>, + pub str_os: Option<String>, + pub int_layer_cores_max: i32, +} + +impl From<DispatchFrameModel> for DispatchFrame { + fn from(val: DispatchFrameModel) -> Self { + DispatchFrame { + // id: Uuid::parse_str(&val.pk_host).unwrap_or_default(), + id: Uuid::parse_str(&val.pk_frame).unwrap_or_default(), + frame_name: val.str_frame_name, + show_id: Uuid::parse_str(&val.pk_show).unwrap_or_default(), + facility_id: Uuid::parse_str(&val.pk_facility).unwrap_or_default(), + job_id: Uuid::parse_str(&val.pk_job).unwrap_or_default(), + layer_id: Uuid::parse_str(&val.pk_layer).unwrap_or_default(), + command: val.str_cmd, + range: val.str_range, + chunk_size: val + .int_chunk_size + .try_into() + .expect("int_chunk_size should fit in an i32"), + show_name: val.str_show, + shot: val.str_shot, + user: val.str_user, + uid: val + .int_uid + .map(|uid| uid.try_into().expect("int_uid should fit in an i32")), + log_dir: val.str_log_dir, + layer_name: val.str_layer_name, + job_name: val.str_job_name, + min_cores: CoreSize::from_multiplied(val.int_min_cores), + threadable: val.b_threadable, + min_gpus: val + .int_gpus_min + .try_into() + .expect("int_gpus_min should fit in an i32"), + min_gpu_memory: val.int_gpu_mem_min as u64, + min_memory: val.int_mem_min as u64, + services: val.str_services, + os: val.str_os, + // TODO: fill up from config, or update database schema + loki_url: None, + layer_cores_limit: (val.int_layer_cores_max > 0) + .then(|| CoreSize::from_multiplied(val.int_layer_cores_max)), + // TODO: Implement a better solution for handling selfish services + has_selfish_service: false, + } + } +} + +static QUERY_FRAME: &str = r#" +WITH dispatch_frames AS ( + SELECT + f.pk_frame, + f.str_name as str_frame_name, + j.pk_show, + j.pk_facility, + j.pk_job, + l.pk_layer, + l.str_cmd, + l.str_range, + l.int_chunk_size, + j.str_show, + j.str_shot, + j.str_user, + j.int_uid, + j.str_log_dir, + l.str_name as str_layer_name, + j.str_name as str_job_name, + j.int_min_cores, + l.int_mem_min, + l.b_threadable, + l.int_gpus_min, + l.int_gpu_mem_min, + l.str_services, + j.str_os, + l.int_cores_max as int_layer_cores_max, + f.int_dispatch_order, + f.int_layer_order, + -- Accumulate the number of cores that would be consumed + SUM(l.int_cores_min) OVER ( + ORDER BY f.int_dispatch_order, f.int_layer_order + ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW + ) AS aggr_job_cores, + jr.int_max_cores as job_resource_core_limit, + jr.int_cores as job_resource_consumed_cores + FROM job j + INNER JOIN layer l ON j.pk_job = l.pk_job + INNER JOIN frame f ON l.pk_layer = f.pk_layer + INNER JOIN job_resource jr ON l.pk_job = jr.pk_job + WHERE l.pk_layer = $1 + -- Avoid
booking DEPEND frames. This status is maintained by a database trigger + AND f.str_state = 'WAITING' +) SELECT * from dispatch_frames + -- Limit the query to a number of frames that would not overflow the job_resource limit + -- limit <= 0 means there's no limit + WHERE job_resource_core_limit <= 0 OR (aggr_job_cores + job_resource_consumed_cores <= job_resource_core_limit) +ORDER BY + int_dispatch_order, + int_layer_order +LIMIT $2 +"#; +// TODO: Take table limit_record into consideration + +impl FrameDao { + /// Creates a new FrameDao from database configuration. + /// + /// Establishes a connection pool to the PostgreSQL database using the + /// provided configuration settings. + /// + /// # Arguments + /// * `config` - Database configuration containing connection parameters + /// + /// # Returns + /// * `Ok(FrameDao)` - Configured DAO ready for database operations + /// * `Err(miette::Error)` - If database connection fails + pub async fn from_config(config: &DatabaseConfig) -> Result<Self> { + let pool = connection_pool(config).await?; + Ok(FrameDao { + connection_pool: pool, + }) + } + + /// Queries frames ready for dispatch within a specific layer. + /// + /// Returns a stream of frames that are: + /// - In WAITING state (not DEPEND or already running) + /// - Respecting job resource core limits + /// - Ordered by dispatch and layer priority + /// + /// The query includes a complex resource limit check that ensures the + /// cumulative core usage doesn't exceed job resource limits. + /// + /// # Arguments + /// * `layer` - The layer to find dispatchable frames for + /// * `limit` - Maximum number of frames to return + /// + /// # Returns + /// A stream of `DispatchFrameModel` results from the database query + pub fn query_dispatch_frames( + &self, + layer: &DispatchLayer, + limit: i32, + ) -> impl Stream<Item = Result<DispatchFrameModel, sqlx::Error>> + '_ { + sqlx::query_as::<_, DispatchFrameModel>(QUERY_FRAME) + .bind(format!("{:x}", layer.id)) + .bind(limit) + .fetch(&*self.connection_pool) + } +} diff --git a/rust/crates/scheduler/src/dao/host_dao.rs b/rust/crates/scheduler/src/dao/host_dao.rs new file mode 100644 index 000000000..9be691182 --- /dev/null +++ b/rust/crates/scheduler/src/dao/host_dao.rs @@ -0,0 +1,272 @@ +use std::sync::Arc; + +use futures::Stream; +use miette::{Context, IntoDiagnostic, Result}; +use opencue_proto::host::ThreadMode; +use serde::{Deserialize, Serialize}; +use sqlx::{Pool, Postgres}; +use tracing::trace; +use uuid::Uuid; + +use crate::{ + config::DatabaseConfig, + models::{CoreSize, DispatchLayer, Host}, + pgpool::connection_pool, +}; + +/// Data Access Object for host operations in the job dispatch system. +/// +/// Manages database operations related to render hosts, including: +/// - Finding suitable hosts for layer dispatch +/// - Host resource locking and unlocking +/// - Updating host resource availability after dispatch +pub struct HostDao { + connection_pool: Arc<Pool<Postgres>>, +} + +/// Database model representing a host with its current resource availability. +/// +/// Contains host metadata, resource information, and allocation details +/// needed for dispatch matching. This model is converted to the business +/// logic `Host` type for processing.
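+/// `int_alloc_available_cores` is computed in `QUERY_DISPATCH_HOST` as the subscription's burst minus its consumed cores (`s.int_burst - s.int_cores`).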
diff --git a/rust/crates/scheduler/src/dao/host_dao.rs b/rust/crates/scheduler/src/dao/host_dao.rs
new file mode 100644
index 000000000..9be691182
--- /dev/null
+++ b/rust/crates/scheduler/src/dao/host_dao.rs
@@ -0,0 +1,272 @@
+use std::sync::Arc;
+
+use futures::Stream;
+use miette::{Context, IntoDiagnostic, Result};
+use opencue_proto::host::ThreadMode;
+use serde::{Deserialize, Serialize};
+use sqlx::{Pool, Postgres};
+use tracing::trace;
+use uuid::Uuid;
+
+use crate::{
+    config::DatabaseConfig,
+    models::{CoreSize, DispatchLayer, Host},
+    pgpool::connection_pool,
+};
+
+/// Data Access Object for host operations in the job dispatch system.
+///
+/// Manages database operations related to render hosts, including:
+/// - Finding suitable hosts for layer dispatch
+/// - Host resource locking and unlocking
+/// - Updating host resource availability after dispatch
+pub struct HostDao {
+    connection_pool: Arc<Pool<Postgres>>,
+}
+
+/// Database model representing a host with its current resource availability.
+///
+/// Contains host metadata, resource information, and allocation details
+/// needed for dispatch matching. This model is converted to the business
+/// logic `Host` type for processing.
+#[derive(sqlx::FromRow, Serialize, Deserialize)]
+pub(crate) struct HostModel {
+    pk_host: String,
+    str_name: String,
+    str_os: Option<String>,
+    int_cores_idle: i64,
+    int_mem_idle: i64,
+    int_gpus_idle: i64,
+    int_gpu_mem_idle: i64,
+    int_cores: i64,
+    int_mem: i64,
+    int_thread_mode: i32,
+    // Name of the allocation the host is subscribed to for a given show
+    str_alloc_name: String,
+    // Number of cores available at the subscription of the show this host has been queried on
+    int_alloc_available_cores: i64,
+}
+
+impl From<HostModel> for Host {
+    fn from(val: HostModel) -> Self {
+        Host {
+            id: Uuid::parse_str(&val.pk_host).unwrap_or_default(),
+            name: val.str_name,
+            str_os: val.str_os,
+            idle_cores: CoreSize::from_multiplied(
+                val.int_cores_idle
+                    .try_into()
+                    .expect("int_cores_min/multiplier should fit in an i32"),
+            ),
+            idle_memory: val.int_mem_idle as u64,
+            idle_gpus: val
+                .int_gpus_idle
+                .try_into()
+                .expect("int_gpus should fit in an i32"),
+            idle_gpu_memory: val.int_gpu_mem_idle as u64,
+            total_cores: CoreSize::from_multiplied(
+                val.int_cores
+                    .try_into()
+                    .expect("total_cores should fit in an i32"),
+            ),
+            total_memory: val.int_mem as u64,
+            thread_mode: ThreadMode::try_from(val.int_thread_mode).unwrap_or_default(),
+            alloc_available_cores: CoreSize::from_multiplied(
+                val.int_alloc_available_cores
+                    .try_into()
+                    .expect("alloc_available_cores should fit in an i32"),
+            ),
+            allocation_name: val.str_alloc_name,
+        }
+    }
+}
+
+static QUERY_DISPATCH_HOST: &str = r#"
+SELECT
+    h.pk_host,
+    h.str_name,
+    hs.str_os,
+    h.int_cores_idle,
+    h.int_mem_idle,
+    h.int_gpus_idle,
+    h.int_gpu_mem_idle,
+    h.int_cores,
+    h.int_mem,
+    h.int_thread_mode,
+    s.int_burst - s.int_cores as int_alloc_available_cores,
+    a.str_name as str_alloc_name
+FROM host h
+    INNER JOIN host_stat hs ON h.pk_host = hs.pk_host
+    INNER JOIN alloc a ON h.pk_alloc = a.pk_alloc
+    INNER JOIN subscription s ON s.pk_alloc = a.pk_alloc AND s.pk_show = $1
+WHERE a.pk_facility = $2
+    AND (hs.str_os ILIKE $3 OR hs.str_os = '' AND $4 = '') -- review
+    AND h.str_lock_state = 'OPEN'
+    --AND hs.str_state = 'UP'
+    AND h.int_cores_idle >= $5
+    AND h.int_mem_idle >= $6
+    AND string_to_array($7, ' | ') && string_to_array(h.str_tags, ' ')
+    AND h.int_gpus_idle >= $8
+    AND h.int_gpu_mem_idle >= $9
+ORDER BY
+    -- Hosts with least resources available come first in an attempt to fully book them
+    h.int_cores_idle::float / h.int_cores,
+    h.int_mem_idle::float / h.int_mem
+LIMIT $10
+"#;
+
+impl HostDao {
+    /// Creates a new HostDao from database configuration.
+    ///
+    /// Establishes a connection pool to the PostgreSQL database for
+    /// host-related operations.
+    ///
+    /// # Arguments
+    /// * `config` - Database configuration containing connection parameters
+    ///
+    /// # Returns
+    /// * `Ok(HostDao)` - Configured DAO ready for host operations
+    /// * `Err(miette::Error)` - If database connection fails
+    pub async fn from_config(config: &DatabaseConfig) -> Result<Self> {
+        let pool = connection_pool(config).await?;
+        Ok(HostDao {
+            connection_pool: pool,
+        })
+    }
+
+    /// Finds hosts capable of executing frames from a specific layer.
+    ///
+    /// The query filters hosts based on:
+    /// - OS compatibility (using ILIKE pattern matching)
+    /// - Available resources (cores, memory, GPUs)
+    /// - Host state (OPEN lock state)
+    /// - Service tags compatibility
+    /// - Allocation and subscription constraints
+    ///
+    /// Results are ordered to prioritize hosts with fewer available resources
+    /// to encourage full host utilization.
+    ///
+    /// # Arguments
+    /// * `layer` - The layer requiring host resources
+    /// * `limit` - Maximum number of hosts to return
+    ///
+    /// # Returns
+    /// A stream of `HostModel` results ordered by resource utilization
+    pub fn find_host_for_layer(
+        &self,
+        layer: &DispatchLayer,
+        limit: usize,
+    ) -> impl Stream<Item = Result<HostModel, sqlx::Error>> + '_ {
+        let str_os_like = format!(
+            "%{}%",
+            layer.str_os.clone().unwrap_or("EMPTY_HOST_OS".to_string())
+        );
+        trace!(
+            "find_host_for_layer: $1={}, $2={}, $3={}, $4={}, $5={}, $6={}, $7={}, $8={}, $9={}",
+            format!("{:X}", layer.show_id),
+            format!("{:X}", layer.facility_id),
+            str_os_like,
+            layer.str_os.clone().unwrap_or_default(),
+            layer.cores_min.with_multiplier().value(),
+            layer.mem_min,
+            layer.tags.clone(),
+            layer.gpus_min,
+            layer.gpu_mem_min
+        );
+        sqlx::query_as::<_, HostModel>(QUERY_DISPATCH_HOST)
+            .bind(format!("{:X}", layer.show_id))
+            .bind(format!("{:X}", layer.facility_id))
+            .bind(str_os_like)
+            .bind(layer.str_os.clone().unwrap_or_default())
+            .bind(layer.cores_min.with_multiplier().value())
+            .bind(layer.mem_min)
+            .bind(layer.tags.clone())
+            .bind(layer.gpus_min)
+            .bind(layer.gpu_mem_min)
+            .bind(limit as i32)
+            .fetch(&*self.connection_pool)
+    }
+
+    /// Acquires an advisory lock on a host to prevent concurrent dispatch.
+    ///
+    /// Uses PostgreSQL's advisory lock mechanism to ensure only one dispatcher
+    /// can modify a host's resources at a time. The lock is based on a hash
+    /// of the host ID string.
+    ///
+    /// # Arguments
+    /// * `host_id` - The UUID of the host to lock
+    ///
+    /// # Returns
+    /// * `Ok(true)` - Lock successfully acquired
+    /// * `Ok(false)` - Lock already held by another process
+    /// * `Err(miette::Error)` - Database operation failed
+    pub async fn lock(&self, host_id: &Uuid) -> Result<bool> {
+        let host_id_str = host_id.to_string();
+        sqlx::query_scalar::<_, bool>("SELECT pg_try_advisory_lock(hashtext($1))")
+            .bind(&host_id_str)
+            .fetch_one(&*self.connection_pool)
+            .await
+            .into_diagnostic()
+            .wrap_err("Failed to acquire advisory lock")
+        // Ok(true)
+    }
+
+    /// Releases an advisory lock on a host after dispatch completion.
+    ///
+    /// Releases the PostgreSQL advisory lock that was acquired during
+    /// the dispatch process, allowing other dispatchers to access the host.
+    ///
+    /// # Arguments
+    /// * `host_id` - The UUID of the host to unlock
+    ///
+    /// # Returns
+    /// * `Ok(true)` - Lock successfully released
+    /// * `Ok(false)` - Lock was not held by this process
+    /// * `Err(miette::Error)` - Database operation failed
+    pub async fn unlock(&self, host_id: &Uuid) -> Result<bool> {
+        let host_id_str = host_id.to_string();
+        sqlx::query_scalar::<_, bool>("SELECT pg_advisory_unlock(hashtext($1))")
+            .bind(&host_id_str)
+            .fetch_one(&*self.connection_pool)
+            .await
+            .into_diagnostic()
+            .wrap_err("Failed to release advisory lock")
+        // Ok(true)
+    }
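For illustration, the acquire/work/release contract around `lock`/`unlock` might look like the sketch below; `do_dispatch_work` is a hypothetical placeholder, and the real flow (with panic safety) lives in the dispatcher further down.

```rust
use uuid::Uuid;

// Hypothetical wrapper showing the intended lock discipline. A `false` from
// pg_try_advisory_lock means another dispatcher holds the host, so we skip it.
async fn with_host_lock(host_dao: &HostDao, host_id: &Uuid) -> miette::Result<()> {
    if !host_dao.lock(host_id).await? {
        return Ok(()); // contended: move on to another host candidate
    }
    let result = do_dispatch_work().await; // placeholder for the real dispatch
    host_dao.unlock(host_id).await?; // always release, even on failure
    result
}

async fn do_dispatch_work() -> miette::Result<()> {
    Ok(())
}
```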
+
+    /// Updates a host's available resource counts after frame dispatch.
+    ///
+    /// Modifies the host's idle resource counters in the database to reflect
+    /// resources consumed by dispatched frames. This ensures accurate resource
+    /// tracking for subsequent dispatch decisions.
+    ///
+    /// # Arguments
+    /// * `updated_host` - Host with updated idle resource values
+    ///
+    /// # Returns
+    /// * `Ok(())` - Resources successfully updated
+    /// * `Err(miette::Error)` - Database update failed
+    pub async fn update_resources(&self, updated_host: &Host) -> Result<()> {
+        sqlx::query(
+            r#"
+            UPDATE host
+            SET int_cores_idle = $1,
+                int_mem_idle = $2,
+                int_gpus_idle = $3,
+                int_gpu_mem_idle = $4
+            WHERE pk_host = $5
+            "#,
+        )
+        .bind(updated_host.idle_cores.with_multiplier().value())
+        .bind(updated_host.idle_memory as i64)
+        .bind(updated_host.idle_gpus as i32)
+        .bind(updated_host.idle_gpu_memory as i64)
+        .bind(updated_host.id.to_string())
+        .execute(&*self.connection_pool)
+        .await
+        .into_diagnostic()
+        .wrap_err("Failed to update host resources")?;
+
+        Ok(())
+    }
+}
diff --git a/rust/crates/scheduler/src/dao/job_dao.rs b/rust/crates/scheduler/src/dao/job_dao.rs
new file mode 100644
index 000000000..8f8080906
--- /dev/null
+++ b/rust/crates/scheduler/src/dao/job_dao.rs
@@ -0,0 +1,121 @@
+use std::sync::Arc;
+
+use futures::Stream;
+use miette::Result;
+use serde::{Deserialize, Serialize};
+use sqlx::{Pool, Postgres};
+use uuid::Uuid;
+
+use crate::{
+    config::{CONFIG, DatabaseConfig},
+    models::DispatchJob,
+    pgpool::connection_pool,
+};
+
+/// Data Access Object for job operations in the job dispatch system.
+///
+/// Handles database queries related to jobs, specifically finding jobs
+/// that are ready for dispatch processing based on show subscriptions,
+/// resource limits, and job states.
+pub struct JobDao {
+    connection_pool: Arc<Pool<Postgres>>,
+}
+
+/// Database model representing a job ready for dispatch.
+///
+/// Contains the essential job metadata needed for dispatch prioritization
+/// and processing. This model is converted to `DispatchJob` for business
+/// logic operations.
+#[derive(sqlx::FromRow, Serialize, Deserialize)]
+pub struct JobModel {
+    pub pk_job: String,
+    pub int_priority: i32,
+    pub age_days: i32,
+}
+
+impl From<JobModel> for DispatchJob {
+    fn from(val: JobModel) -> Self {
+        DispatchJob {
+            id: Uuid::parse_str(&val.pk_job).unwrap_or_default(),
+            int_priority: val.int_priority,
+            age_days: val.age_days,
+        }
+    }
+}
+
+static QUERY_PENDING_JOBS: &str = r#"
+WITH bookable_shows AS (
+    SELECT
+        w.pk_show,
+        s.float_tier,
+        s.int_burst
+    FROM subscription s
+        INNER JOIN vs_waiting w ON s.pk_show = w.pk_show
+    WHERE s.int_burst > 0
+        AND s.int_burst - s.int_cores >= $1
+        AND s.int_cores < s.int_burst
+),
+filtered_jobs AS (
+    SELECT
+        j.pk_job,
+        jr.int_priority,
+        CAST(EXTRACT(EPOCH FROM (NOW() - j.ts_updated)) / 86400 AS INTEGER) AS age_days
+    FROM job j
+        INNER JOIN bookable_shows ON j.pk_show = bookable_shows.pk_show
+        INNER JOIN job_resource jr ON j.pk_job = jr.pk_job
+        INNER JOIN folder f ON j.pk_folder = f.pk_folder
+        INNER JOIN folder_resource fr ON f.pk_folder = fr.pk_folder
+        INNER JOIN point p ON f.pk_dept = p.pk_dept AND f.pk_show = p.pk_show
+    WHERE j.str_state = 'PENDING'
+        AND j.b_paused = false
+        AND (fr.int_max_cores = -1 OR fr.int_cores < fr.int_max_cores)
+        AND (fr.int_max_gpus = -1 OR fr.int_gpus < fr.int_max_gpus)
+)
+SELECT DISTINCT
+    fj.pk_job,
+    fj.int_priority,
+    fj.age_days
+FROM filtered_jobs fj
+INNER JOIN layer_stat ls ON fj.pk_job = ls.pk_job
+WHERE ls.int_waiting_count > 0
+"#;
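A brief, hypothetical sketch of consuming `query_active_jobs` (defined just below) and applying the `From<JobModel>` conversion; the helper name is an assumption, not part of the diff.

```rust
use futures::StreamExt;

// Hypothetical consumer: stream pending jobs and convert each row.
async fn log_pending_jobs(job_dao: &JobDao) {
    let mut jobs = job_dao.query_active_jobs();
    while let Some(row) = jobs.next().await {
        match row {
            Ok(model) => {
                let job: DispatchJob = model.into(); // From<JobModel> above
                println!("pending: {}", job);
            }
            Err(err) => eprintln!("job query failed: {}", err),
        }
    }
}
```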
+
+impl JobDao {
+    /// Creates a new JobDao from database configuration.
+    ///
+    /// Establishes a connection pool to the PostgreSQL database for
+    /// job-related queries.
+    ///
+    /// # Arguments
+    /// * `config` - Database configuration containing connection parameters
+    ///
+    /// # Returns
+    /// * `Ok(JobDao)` - Configured DAO ready for job operations
+    /// * `Err(miette::Error)` - If database connection fails
+    pub async fn from_config(config: &DatabaseConfig) -> Result<Self> {
+        let pool = connection_pool(config).await?;
+
+        Ok(JobDao {
+            connection_pool: pool,
+        })
+    }
+
+    /// Queries for active jobs that are ready for dispatch processing.
+    ///
+    /// Returns jobs that meet the following criteria:
+    /// - Belong to shows with available subscription burst capacity
+    /// - Are in PENDING state and not paused
+    /// - Have folder resources within limits (cores and GPUs)
+    /// - Have at least one layer with waiting frames
+    ///
+    /// The query considers show subscription tiers, resource limits, and
+    /// current resource usage to ensure only dispatchable jobs are returned.
+    ///
+    /// # Returns
+    /// A stream of `JobModel` results representing jobs ready for processing
+    pub fn query_active_jobs(&self) -> impl Stream<Item = Result<JobModel, sqlx::Error>> + '_ {
+        sqlx::query_as::<_, JobModel>(QUERY_PENDING_JOBS)
+            .bind(CONFIG.queue.core_multiplier as i32)
+            .fetch(&*self.connection_pool)
+    }
+}
diff --git a/rust/crates/scheduler/src/dao/layer_dao.rs b/rust/crates/scheduler/src/dao/layer_dao.rs
new file mode 100644
index 000000000..a15b67d58
--- /dev/null
+++ b/rust/crates/scheduler/src/dao/layer_dao.rs
@@ -0,0 +1,139 @@
+use std::sync::Arc;
+
+use futures::Stream;
+use miette::Result;
+use serde::{Deserialize, Serialize};
+use sqlx::{Pool, Postgres};
+use uuid::Uuid;
+
+use crate::{
+    config::DatabaseConfig,
+    models::{CoreSize, DispatchLayer},
+    pgpool::connection_pool,
+};
+
+/// Data Access Object for layer operations in the job dispatch system.
+///
+/// Handles database queries related to layers within jobs, specifically
+/// finding layers that have waiting frames and are ready for dispatch.
+pub struct LayerDao {
+    connection_pool: Arc<Pool<Postgres>>,
+}
+
+/// Database model representing a layer ready for dispatch.
+///
+/// Contains layer metadata, resource requirements, and job context needed
+/// for host matching and frame dispatch. This model is converted to
+/// `DispatchLayer` for business logic processing.
+#[derive(sqlx::FromRow, Serialize, Deserialize)]
+pub struct DispatchLayerModel {
+    pub pk_layer: String,
+    pub pk_job: String,
+    pub pk_facility: String,
+    pub pk_show: String,
+    pub str_name: String,
+    pub str_job_name: String,
+    pub str_os: Option<String>,
+    pub int_cores_min: i64,
+    pub int_mem_min: i64,
+    pub b_threadable: bool,
+    pub int_gpus_min: i64,
+    pub int_gpu_mem_min: i64,
+    pub str_tags: String,
+}
+
+impl From<DispatchLayerModel> for DispatchLayer {
+    fn from(val: DispatchLayerModel) -> Self {
+        DispatchLayer {
+            id: Uuid::parse_str(&val.pk_layer).unwrap_or_default(),
+            job_id: Uuid::parse_str(&val.pk_job).unwrap_or_default(),
+            facility_id: Uuid::parse_str(&val.pk_facility).unwrap_or_default(),
+            show_id: Uuid::parse_str(&val.pk_show).unwrap_or_default(),
+            job_name: val.str_job_name,
+            layer_name: val.str_name,
+            str_os: val.str_os,
+            cores_min: CoreSize::from_multiplied(
+                val.int_cores_min
+                    .try_into()
+                    .expect("int_cores_min should fit in an i32"),
+            ),
+            mem_min: val.int_mem_min,
+            threadable: val.b_threadable,
+            gpus_min: val
+                .int_gpus_min
+                .try_into()
+                .expect("gpus_min should fit in an i32"),
+            gpu_mem_min: val.int_gpu_mem_min,
+            tags: val.str_tags,
+        }
+    }
+}
+
+static QUERY_LAYER: &str = r#"
+SELECT
+    l.pk_layer,
+    j.pk_job,
+    j.pk_facility,
+    j.pk_show,
+    l.str_name,
+    j.str_name as str_job_name,
+    j.str_os,
+    l.int_cores_min,
+    l.int_mem_min,
+    l.b_threadable,
+    l.int_gpus_min,
+    l.int_gpu_mem_min,
+    l.str_tags
+FROM job j
+    INNER JOIN layer l ON j.pk_job = l.pk_job
+    INNER JOIN layer_stat ls ON l.pk_layer = ls.pk_layer
+WHERE j.pk_job = $1
+    AND ls.int_waiting_count > 0
+ORDER BY
+    l.int_dispatch_order
+"#;
+// TODO: Take table limit_record into consideration
+
+impl LayerDao {
+    /// Creates a new LayerDao from database configuration.
+    ///
+    /// Establishes a connection pool to the PostgreSQL database for
+    /// layer-related queries.
+    ///
+    /// # Arguments
+    /// * `config` - Database configuration containing connection parameters
+    ///
+    /// # Returns
+    /// * `Ok(LayerDao)` - Configured DAO ready for layer operations
+    /// * `Err(miette::Error)` - If database connection fails
+    pub async fn from_config(config: &DatabaseConfig) -> Result<Self> {
+        let pool = connection_pool(config).await?;
+        Ok(LayerDao {
+            connection_pool: pool,
+        })
+    }
+
+    /// Queries layers within a specific job that have waiting frames.
+    ///
+    /// Returns layers that:
+    /// - Belong to the specified job
+    /// - Have at least one frame in waiting state
+    /// - Are ordered by dispatch priority (int_dispatch_order)
+    ///
+    /// This query is used to find layers within a job that are ready
+    /// for frame dispatch processing.
+    ///
+    /// # Arguments
+    /// * `pk_job` - The UUID of the job to find layers for
+    ///
+    /// # Returns
+    /// A stream of `DispatchLayerModel` results ordered by dispatch priority
+    pub fn query_layers(
+        &self,
+        pk_job: Uuid,
+    ) -> impl Stream<Item = Result<DispatchLayerModel, sqlx::Error>> + '_ {
+        sqlx::query_as::<_, DispatchLayerModel>(QUERY_LAYER)
+            .bind(format!("{:x}", pk_job))
+            .fetch(&*self.connection_pool)
+    }
+}
diff --git a/rust/crates/scheduler/src/dao/mod.rs b/rust/crates/scheduler/src/dao/mod.rs
new file mode 100644
index 000000000..ad3cdf2f9
--- /dev/null
+++ b/rust/crates/scheduler/src/dao/mod.rs
@@ -0,0 +1,9 @@
+mod frame_dao;
+mod host_dao;
+mod job_dao;
+mod layer_dao;
+
+pub use frame_dao::FrameDao;
+pub use host_dao::HostDao;
+pub use job_dao::JobDao;
+pub use layer_dao::LayerDao;
diff --git a/rust/crates/scheduler/src/job_dispatcher/dispatcher.rs b/rust/crates/scheduler/src/job_dispatcher/dispatcher.rs
new file mode 100644
index 000000000..867379bf8
--- /dev/null
+++ b/rust/crates/scheduler/src/job_dispatcher/dispatcher.rs
@@ -0,0 +1,600 @@
+use crate::{
+    dao::{FrameDao, HostDao},
+    job_dispatcher::{DispatchError, VirtualProcError, frame_set::FrameSet},
+    models::{CoreSize, DispatchFrame, DispatchLayer, Host, VirtualProc},
+};
+use bytesize::MIB;
+use futures::{FutureExt, StreamExt};
+use miette::{Context, IntoDiagnostic, Result, miette};
+use opencue_proto::{
+    host::ThreadMode,
+    rqd::{RqdStaticLaunchFrameRequest, RunFrame, rqd_interface_client::RqdInterfaceClient},
+};
+use std::{collections::HashMap, sync::Arc};
+use tonic::transport::Channel;
+use tracing::{debug, error, info};
+use uuid::Uuid;
+
+/// RQD dispatcher responsible for dispatching frames to render hosts.
+///
+/// The dispatcher handles:
+/// - Frame-to-host matching and resource allocation
+/// - gRPC communication with RQD instances
+/// - Resource consumption tracking and validation
+/// - Frame command preparation and execution setup
+pub struct RqdDispatcher {
+    frame_dao: FrameDao,
+    host_dao: Arc<HostDao>,
+    dispatch_frames_per_layer_limit: usize,
+    grpc_port: u32,
+    memory_stranded_threshold: u64,
+    dry_run_mode: bool,
+}
+
+impl RqdDispatcher {
+    /// Creates a new RQD dispatcher with the specified configuration.
+    ///
+    /// # Arguments
+    /// * `frame_dao` - Database access for frame operations
+    /// * `host_dao` - Database access for host operations and locking
+    /// * `grpc_port` - Port number for RQD gRPC connections
+    /// * `dispatch_frames_per_layer_limit` - Maximum frames to dispatch per layer
+    /// * `memory_stranded_threshold` - Memory threshold for stranded frame detection
+    /// * `dry_run_mode` - If true, logs dispatch actions without executing them
+    pub fn new(
+        frame_dao: FrameDao,
+        host_dao: Arc<HostDao>,
+        grpc_port: u32,
+        dispatch_frames_per_layer_limit: usize,
+        memory_stranded_threshold: u64,
+        dry_run_mode: bool,
+    ) -> Self {
+        Self {
+            frame_dao,
+            host_dao,
+            grpc_port,
+            dispatch_frames_per_layer_limit,
+            memory_stranded_threshold,
+            dry_run_mode,
+        }
+    }
+
+    /// Dispatches a layer to a specific host with proper locking and error handling.
+    ///
+    /// The dispatch process:
+    /// 1. Acquires an exclusive lock on the target host
+    /// 2. Performs the actual dispatch operation
+    /// 3. Ensures the host lock is always released, even on panic or failure
+    ///
+    /// # Arguments
+    /// * `layer` - The layer containing frames to dispatch
+    /// * `host` - The target host for frame execution
+    ///
+    /// # Returns
+    /// * `Ok(())` on successful dispatch
+    /// * `Err(DispatchError)` on various failure conditions
+    pub async fn dispatch(&self, layer: &DispatchLayer, host: &Host) -> Result<(), DispatchError> {
+        // Acquire lock first
+        if !self
+            .host_dao
+            .lock(&host.id)
+            .await
+            .map_err(DispatchError::Failure)?
+        {
+            return Err(DispatchError::HostLock(host.name.clone()));
+        }
+
+        // Ensure unlock is always called, even if dispatch_inner panics or fails
+        let result = std::panic::AssertUnwindSafe(self.dispatch_inner(layer, host))
+            .catch_unwind()
+            .await;
+
+        // Always unlock, regardless of outcome
+        if let Err(unlock_err) = self.host_dao.unlock(&host.id).await {
+            error!("Failed to unlock host {}: {}", host.id, unlock_err);
+        }
+
+        // Handle the result from dispatch_inner
+        match result {
+            Ok(result) => {
+                if result.is_ok() {
+                    info!("Successfully dispatched layer {} on {}.", layer, host);
+                }
+                result
+            }
+            Err(_panic) => Err(DispatchError::Failure(miette!(
+                "Dispatch operation panicked for layer {} on host {}",
+                layer,
+                host
+            ))),
+        }
+    }
+
+    async fn dispatch_inner(
+        &self,
+        layer: &DispatchLayer,
+        host: &Host,
+    ) -> Result<(), DispatchError> {
+        let rqd_client = if self.dry_run_mode {
+            None
+        } else {
+            Some(
+                Self::connect_to_rqd(&host.name, self.grpc_port)
+                    .await
+                    .map_err(DispatchError::Failure)?,
+            )
+        };
+
+        let mut stream = self
+            .frame_dao
+            .query_dispatch_frames(layer, self.dispatch_frames_per_layer_limit as i32);
+        let mut current_host = host.clone();
+
+        // A host should not book frames if its allocation is at or above its limit,
+        // but checking the limit before each frame is too costly. The tradeoff is
+        // to check the allocation state before entering the frame booking loop;
+        // with this approach there's a risk the allocation will go above burst, but not by
+        // a great margin as each loop only runs for a limited number of frames
+        // (see config queue.dispatch_frames_per_layer_limit)
+        let mut allocation_capacity = host.alloc_available_cores;
+
+        let mut dispatched_procs: Vec<String> = Vec::new();
+
+        while let Some(frame) = stream.next().await {
+            match frame {
+                Ok(frame_model) => {
+                    let frame: DispatchFrame = frame_model.into();
+                    debug!("found frame {}", frame);
+
+                    match Self::consume_host_virtual_resources(
+                        frame,
+                        current_host.clone(),
+                        self.memory_stranded_threshold,
+                    )
+                    .await
+                    {
+                        Ok((virtual_proc, updated_host)) => {
+                            debug!("Built virtual proc {}", virtual_proc);
+                            // Update host for the next iteration
+                            current_host = updated_host;
+
+                            // Check allocation capacity
+                            let cores_reserved_without_multiplier: CoreSize =
+                                virtual_proc.cores_reserved.into();
+                            if cores_reserved_without_multiplier > allocation_capacity {
+                                Err(DispatchError::AllocationOverBurst(
+                                    host.allocation_name.clone(),
+                                ))?;
+                            };
+                            allocation_capacity =
+                                allocation_capacity - virtual_proc.cores_reserved.into();
+
+                            let run_frame = self
+                                .prepare_rqd_run_frame(&virtual_proc)
+                                .map_err(DispatchError::Failure)?;
+                            debug!("Prepared run_frame for {}", virtual_proc);
+                            let request = RqdStaticLaunchFrameRequest {
+                                run_frame: Some(run_frame),
+                            };
+
+                            // When running on dry_run_mode, just log the outcome
+                            let msg = format!("Dispatching {} on {}", virtual_proc, &current_host);
+                            if self.dry_run_mode {
+                                info!("(DRY_RUN) {}", msg);
+                            } else {
+                                debug!(msg);
+                                // Get a ref to the mutable grpc client
+                                let mut rqd_client_ref = rqd_client
+                                    .as_ref()
+                                    .expect("Should be Some if dry_run is false")
+                                    .clone();
+
+                                // Launch frame on rqd
+                                rqd_client_ref
+                                    .launch_frame(request)
+                                    .await
+                                    .into_diagnostic()
+                                    .map_err(DispatchError::Failure)?;
+
+                                // Update database resources
+                                self.host_dao
+                                    .update_resources(&current_host)
+                                    .await
+                                    .map_err(DispatchError::FailureAfterDispatch)?;
+                            }
+                            dispatched_procs.push(virtual_proc.to_string());
+                        }
+                        Err(err) => match err {
+                            VirtualProcError::HostResourcesExtinguished(msg) => {
+                                debug!("Host resources extinguished for {}. {}", host, msg);
+                                Err(DispatchError::HostResourcesExtinguished)?;
+                            }
+                        },
+                    }
+                }
+                Err(err) => {
+                    Err(DispatchError::Failure(miette!(
+                        "Failed to consume dispatch stream. {}",
+                        err
+                    )))?;
+                }
+            }
+        }
+        if dispatched_procs.is_empty() {
+            info!("Found no frames on {} to dispatch to {}", layer, host);
+        } else {
+            debug!("Dispatched {} frames: ", dispatched_procs.len());
+            for proc in dispatched_procs {
+                debug!("{}", proc);
+            }
+        }
+        Ok(())
+    }
+
+    /// Calculates the actual number of cores requested based on frame requirements.
+    ///
+    /// Handles special core request semantics:
+    /// - Negative values: Reserve all cores except the specified amount
+    /// - Zero: Reserve all cores on the host
+    /// - Positive values: Reserve the exact amount requested
+    ///
+    /// # Arguments
+    /// * `cores_requested` - The raw core request from the frame
+    /// * `total_cores` - Total cores available on the host
+    ///
+    /// # Returns
+    /// The calculated number of cores to actually request
+    fn calculate_cores_requested(cores_requested: CoreSize, total_cores: CoreSize) -> CoreSize {
+        // Requesting NEGATIVE cores is actually reserving ALL but the number of cores requested
+        if cores_requested.value() < 0 {
+            total_cores + cores_requested
+        // Requesting ZERO cores is actually reserving ALL cores on the host
+        } else if cores_requested.value() == 0 {
+            total_cores
+        // Requesting POSITIVE cores
+        } else {
+            cores_requested
+        }
+    }
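The request semantics above can be pinned down with a unit-test-style sketch. This assumes `CoreSize` is a tuple struct over `i32` with `Add` and a `value()` accessor, as used elsewhere in this file; since the function is private, such a test would live in a `#[cfg(test)]` module of the same file.

```rust
// Unit-test-style sketch of calculate_cores_requested semantics.
#[test]
fn cores_requested_semantics() {
    // Negative request: "all but N" -> 16 + (-2) = 14 cores
    assert_eq!(
        RqdDispatcher::calculate_cores_requested(CoreSize(-2), CoreSize(16)).value(),
        14
    );
    // Zero request: reserve the whole host
    assert_eq!(
        RqdDispatcher::calculate_cores_requested(CoreSize(0), CoreSize(16)).value(),
        16
    );
    // Positive request: taken literally
    assert_eq!(
        RqdDispatcher::calculate_cores_requested(CoreSize(4), CoreSize(16)).value(),
        4
    );
}
```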
+
+    /// Calculates the number of cores to reserve for a frame on a specific host.
+    ///
+    /// Takes into account:
+    /// - Host thread mode (All, Variable, Auto)
+    /// - Frame threadability
+    /// - Memory requirements and stranded thresholds
+    /// - Selfish services and resource availability
+    ///
+    /// # Arguments
+    /// * `host` - The target host with available resources
+    /// * `frame` - The frame requiring resources
+    /// * `memory_stranded_threshold` - Threshold for memory-stranded frame detection
+    ///
+    /// # Returns
+    /// * `Ok(CoreSize)` - Number of cores to reserve
+    /// * `Err(VirtualProcError)` - If insufficient resources available
+    fn calculate_core_reservation(
+        host: &Host,
+        frame: &DispatchFrame,
+        memory_stranded_threshold: u64,
+    ) -> Result<CoreSize, VirtualProcError> {
+        let cores_requested = Self::calculate_cores_requested(frame.min_cores, host.total_cores);
+
+        let cores_reserved = match (host.thread_mode, frame.threadable) {
+            (ThreadMode::All, _) => host.idle_cores,
+            (ThreadMode::Variable, true) if cores_requested.value() <= 2 => CoreSize(2),
+            (ThreadMode::Auto, true) | (ThreadMode::Variable, true) => {
+                // Book whatever is left for hosts with selfish services or memory stranded
+                if frame.has_selfish_service
+                    || host.idle_memory - frame.min_memory <= memory_stranded_threshold
+                {
+                    host.idle_cores
+                // Limit Variable booking to at least 2 cores
+                } else {
+                    Self::calculate_memory_balanced_core_count(host, frame, cores_requested)
+                }
+            }
+            _ => cores_requested,
+        };
+
+        // Sanity check
+        if cores_reserved > host.total_cores || cores_reserved > host.idle_cores {
+            Err(VirtualProcError::HostResourcesExtinguished(format!(
+                "Not enough cores: {} < {}",
+                host.idle_cores, cores_reserved
+            )))
+        } else {
+            Ok(cores_reserved)
+        }
+    }
+
+    /// Consumes a host(HostModel) and returns an updated version accounting for consumed resources
+    /// eg.
+    /// HostModel(2 cores, 20GB) + frame(1 core, 10GB)
+    ///     -> VirtualProc(1 core, 10GB) + HostModel(1 core, 10GB)
+    async fn consume_host_virtual_resources(
+        frame: DispatchFrame,
+        original_host: Host,
+        memory_stranded_threshold: u64,
+    ) -> Result<(VirtualProc, Host), VirtualProcError> {
+        let mut host = original_host;
+
+        let cores_reserved =
+            Self::calculate_core_reservation(&host, &frame, memory_stranded_threshold)?;
+
+        if host.idle_memory < frame.min_memory {
+            Err(VirtualProcError::HostResourcesExtinguished(format!(
+                "Not enough memory: {}mb < {}mb",
+                host.idle_memory / MIB,
+                frame.min_memory / MIB
+            )))?
+        }
+
+        if host.idle_gpus < frame.min_gpus {
+            Err(VirtualProcError::HostResourcesExtinguished(format!(
+                "Not enough GPU cores: {} < {}",
+                host.idle_gpus, frame.min_gpus
+            )))?
+        }
+
+        if host.idle_gpu_memory < frame.min_gpu_memory {
+            Err(VirtualProcError::HostResourcesExtinguished(format!(
+                "Not enough GPU memory: {}mb < {}mb",
+                host.idle_gpu_memory / MIB,
+                frame.min_gpu_memory / MIB
+            )))?
+        }
+
+        let memory_reserved = frame.min_memory;
+        let gpus_reserved = frame.min_gpus;
+        let gpu_memory_reserved = frame.min_gpu_memory;
+
+        // Update host resources
+        host.idle_cores = host.idle_cores - cores_reserved;
+        host.idle_memory -= memory_reserved;
+        host.idle_gpus -= gpus_reserved;
+        host.idle_gpu_memory -= gpu_memory_reserved;
+
+        Ok((
+            VirtualProc {
+                proc_id: Uuid::new_v4(),
+                host_id: host.id,
+                cores_reserved: cores_reserved.into(),
+                memory_reserved,
+                gpus_reserved,
+                gpu_memory_reserved,
+                os: host.str_os.clone().unwrap_or_default(),
+                is_local_dispatch: false,
+                frame,
+            },
+            host,
+        ))
+    }
+
+    /// Calculates a memory-balanced core count to prevent resource imbalance.
+    ///
+    /// Ensures that core allocation is proportional to memory requirements
+    /// to avoid situations where memory or cores become stranded.
+    ///
+    /// # Arguments
+    /// * `host` - The host with available resources
+    /// * `frame` - The frame with memory and core requirements
+    /// * `cores_requested` - The number of cores originally requested
+    ///
+    /// # Returns
+    /// The balanced number of cores to allocate
+    fn calculate_memory_balanced_core_count(
+        host: &Host,
+        frame: &DispatchFrame,
+        cores_requested: CoreSize,
+    ) -> CoreSize {
+        let total_cores = host.total_cores.value() as f64;
+        let total_memory = host.total_memory as f64;
+        let frame_min_memory = frame.min_memory as f64;
+
+        // Memory per core if evenly distributed
+        let memory_per_core = total_memory / total_cores;
+
+        // How many cores worth of memory the frame needs
+        let mut cores_worth_of_memory = (frame_min_memory / memory_per_core).round() as i32;
+
+        // If frame requested more than the memory-balanced core count, use frame's request
+        if cores_worth_of_memory < cores_requested.value() {
+            cores_worth_of_memory = cores_requested.value();
+        }
+        // Don't book above max_core limit
+        if let Some(layer_cores_limit) = frame.layer_cores_limit {
+            if layer_cores_limit.value() > 0 && cores_worth_of_memory > layer_cores_limit.value() {
+                cores_worth_of_memory = layer_cores_limit.value();
+            }
+        }
+
+        CoreSize(cores_worth_of_memory)
+    }
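A worked example of the balancing rule, with illustrative numbers (GB used in place of raw bytes for readability):

```rust
// Worked example of the memory-balancing rule (values hypothetical).
fn main() {
    let total_cores = 32.0_f64;
    let total_memory = 128.0_f64; // GB
    let frame_min_memory = 16.0_f64; // GB

    let memory_per_core = total_memory / total_cores; // 4 GB of memory per core
    let cores_worth = (frame_min_memory / memory_per_core).round() as i32;
    assert_eq!(cores_worth, 4); // a 16 GB frame is "worth" 4 cores on this host
    // If the frame had requested 6 cores, the larger value (6) would win;
    // a positive layer_cores_limit would then cap the result.
}
```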
+
+    /// Calculate a new frame spec from an original frame_range and a chunk definition
+    ///
+    /// # Arguments
+    ///
+    /// * `initial_frame_number` - The starting frame number to begin the chunk from
+    /// * `frame_range` - A string representation of the frame range (e.g., "1-100")
+    /// * `chunk_size` - The number of frames to include in the chunk
+    ///
+    /// # Returns
+    ///
+    /// Returns a `Result` containing a tuple of:
+    /// * `String` - The frame specification string for the chunk
+    /// * `i32` - The last frame number in the chunk
+    ///
+    /// # Errors
+    ///
+    /// This function will return an error if:
+    /// * The frame range string is invalid
+    /// * The initial frame number is not within the specified range
+    /// * The chunk cannot be generated from the given parameters
+    /// * The chunk frame set is empty or invalid
+    fn prepare_frame_spec(
+        initial_frame_number: i32,
+        frame_range: &str,
+        chunk_size: usize,
+    ) -> Result<(String, i32)> {
+        let frame_set = FrameSet::new(frame_range)?;
+        let start_index = frame_set.index(initial_frame_number).ok_or(miette!(
+            "Invalid frame number {}. Out of range {}",
+            initial_frame_number,
+            frame_range
+        ))?;
+        let frame_spec = frame_set
+            .get_chunk(start_index, chunk_size)
+            .wrap_err("Invalid Chunk")?;
+        let chunk_frame_set = FrameSet::new(&frame_spec)?;
+        let chunk_end_frame = chunk_frame_set.last().ok_or(miette!(
+            "Could not find last frame of the chunk {}",
+            frame_spec
+        ))?;
+
+        Ok((frame_spec, chunk_end_frame))
+    }
+
+    /// Prepares a RunFrame message for RQD execution.
+    ///
+    /// Converts a VirtualProc into the protobuf RunFrame format required by RQD,
+    /// including:
+    /// - Environment variable setup (CUE_*, frame metadata)
+    /// - Command token replacement (#FRAME#, #LAYER#, etc.)
+    /// - Resource allocation specifications
+    /// - Frame timing and execution context
+    ///
+    /// # Arguments
+    /// * `proc` - The virtual proc containing frame and resource information
+    ///
+    /// # Returns
+    /// * `Ok(RunFrame)` - The prepared RQD RunFrame message
+    /// * `Err(miette::Error)` - If frame preparation fails
+    fn prepare_rqd_run_frame(&self, proc: &VirtualProc) -> Result<RunFrame> {
+        // Calculate threads from cores reserved
+        let proc_cores_reserved: CoreSize = proc.cores_reserved.into();
+        let threads = std::cmp::max(CoreSize(1), proc_cores_reserved);
+        let frame = &proc.frame;
+
+        // Extract frame number from frame name (assumes format "frameNumber-...")
+        let frame_number = frame
+            .frame_name
+            .split('-')
+            .next()
+            .and_then(|s| s.parse::<i32>().ok())
+            .ok_or(miette!("Invalid Frame Number"))?;
+
+        let z_frame_number = format!("{:04}", frame_number);
+
+        let (frame_spec, chunk_end_frame) =
+            Self::prepare_frame_spec(frame_number, &frame.range, frame.chunk_size as usize)?;
+
+        // Build environment variables
+        let mut environment = HashMap::new();
+        environment.insert("CUE3".to_string(), "1".to_string());
+        environment.insert("CUE_THREADS".to_string(), threads.to_string());
+        environment.insert("CUE_MEMORY".to_string(), proc.memory_reserved.to_string());
+        environment.insert("CUE_GPUS".to_string(), proc.gpus_reserved.to_string());
+        environment.insert(
+            "CUE_GPU_MEMORY".to_string(),
+            proc.gpu_memory_reserved.to_string(),
+        );
+        environment.insert("CUE_LOG_PATH".to_string(), frame.log_dir.clone());
+        environment.insert("CUE_RANGE".to_string(), frame.range.clone());
+        environment.insert("CUE_CHUNK".to_string(), frame.chunk_size.to_string());
+        environment.insert("CUE_IFRAME".to_string(), frame_number.to_string());
+        environment.insert("CUE_LAYER".to_string(), frame.layer_name.clone());
+        environment.insert("CUE_JOB".to_string(), frame.job_name.clone());
+        environment.insert("CUE_FRAME".to_string(), frame.frame_name.clone());
+        environment.insert("CUE_SHOW".to_string(), frame.show_name.clone());
+        environment.insert("CUE_SHOT".to_string(), frame.shot.clone());
+        environment.insert("CUE_USER".to_string(), frame.user.clone());
+        environment.insert("CUE_JOB_ID".to_string(), frame.job_id.to_string());
+        environment.insert("CUE_LAYER_ID".to_string(), frame.layer_id.to_string());
+        environment.insert("CUE_FRAME_ID".to_string(), frame.id.to_string());
+        environment.insert(
+            "CUE_THREADABLE".to_string(),
+            if frame.threadable { "1" } else { "0" }.to_string(),
+        );
+
+        // Process command with token replacements
+        let processed_command = frame
+            .command
+            .replace("#ZFRAME#", &z_frame_number)
+            .replace("#IFRAME#", &frame_number.to_string())
+            .replace("#FRAME_START#", &frame_number.to_string())
+            .replace("#FRAME_END#", &chunk_end_frame.to_string())
+            .replace("#FRAME_CHUNK#", &frame.chunk_size.to_string())
+            .replace("#LAYER#", &frame.layer_name)
+            .replace("#JOB#", &frame.job_name)
+            .replace("#FRAMESPEC#", &frame_spec)
+            .replace("#FRAME#", &frame.frame_name);
+
+        // Build RunFrame
+        let run_frame = RunFrame {
+            shot: frame.shot.clone(),
+            show: frame.show_name.clone(),
+            user_name: frame.user.clone(),
+            log_dir: frame.log_dir.clone(),
+            job_id: frame.job_id.to_string(),
+            job_name: frame.job_name.clone(),
+            frame_id: frame.id.to_string(),
+            frame_name: frame.frame_name.clone(),
+            layer_id: frame.layer_id.to_string(),
+            resource_id: proc.proc_id.to_string(),
+            num_cores: proc.cores_reserved.value(),
+            num_gpus: proc.gpus_reserved as i32,
+            start_time: std::time::SystemTime::now()
+                .duration_since(std::time::UNIX_EPOCH)
+                .map(|d| d.as_millis() as i64)
+                .unwrap_or(0),
+            ignore_nimby: proc.is_local_dispatch,
+            os: proc.os.clone(),
+            // TODO: Get soft/hard limits from config
+            soft_memory_limit: -1,
+            hard_memory_limit: -1,
+            loki_url: frame.loki_url.as_ref().unwrap_or(&String::new()).clone(),
+            environment,
+            command: processed_command,
+            uid_optional: frame
+                .uid
+                .map(opencue_proto::rqd::run_frame::UidOptional::Uid),
+            frame_temp_dir: String::new(), // Will be set by RQD
+            gid: 0,                        // Will be set by RQD based on user
+            attributes: HashMap::new(),
+            children: None,
+            pid: 0, // Will be set by RQD
+
+            // Deprecated fields
+            #[allow(deprecated)]
+            job_temp_dir: "deprecated".to_string(),
+            #[allow(deprecated)]
+            log_file: "deprecated".to_string(),
+            #[allow(deprecated)]
+            log_dir_file: "deprecated".to_string(),
+        };
+
+        Ok(run_frame)
+    }
+
+    /// Establishes a gRPC connection to an RQD instance.
+    ///
+    /// # Arguments
+    /// * `hostname` - The hostname or IP address of the RQD instance
+    /// * `port` - The gRPC port number for the RQD service
+    ///
+    /// # Returns
+    /// * `Ok(RqdInterfaceClient)` - Connected gRPC client
+    /// * `Err(miette::Error)` - If connection fails
+    async fn connect_to_rqd(hostname: &str, port: u32) -> Result<RqdInterfaceClient<Channel>> {
+        let client = RqdInterfaceClient::connect(format!("http://{}:{}", hostname, port))
+            .await
+            .into_diagnostic()
+            .wrap_err(format!(
+                "Failed to connect to Rqd Server: {}:{}",
+                hostname, port
+            ))?;
+        Ok(client)
+    }
+}
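To make the `#TOKEN#` substitution concrete, here is a self-contained illustration with hypothetical values (frame 12, chunk of 5 ending at frame 16):

```rust
// Self-contained illustration of the #TOKEN# expansion (values hypothetical).
fn main() {
    let command = "render -s #FRAME_START# -e #FRAME_END# -chunk #FRAME_CHUNK# out.#ZFRAME#.exr";
    let expanded = command
        .replace("#ZFRAME#", "0012") // zero-padded frame number
        .replace("#FRAME_START#", "12")
        .replace("#FRAME_END#", "16") // last frame of the chunk
        .replace("#FRAME_CHUNK#", "5");
    assert_eq!(expanded, "render -s 12 -e 16 -chunk 5 out.0012.exr");
}
```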
diff --git a/rust/crates/scheduler/src/job_dispatcher/error.rs b/rust/crates/scheduler/src/job_dispatcher/error.rs
new file mode 100644
index 000000000..e8e3b6def
--- /dev/null
+++ b/rust/crates/scheduler/src/job_dispatcher/error.rs
@@ -0,0 +1,26 @@
+use miette::{Diagnostic, Error};
+use thiserror::Error;
+
+#[derive(Debug, Error, Diagnostic)]
+pub enum VirtualProcError {
+    #[error("Failed to create Virtual Proc. Host resources extinguished.")]
+    HostResourcesExtinguished(String),
+}
+
+#[derive(Debug, Error, Diagnostic)]
+pub enum DispatchError {
+    #[error("DispatchError: Failed to acquire lock")]
+    HostLock(String),
+
+    #[error("DispatchError: Unexpected Failure")]
+    Failure(Error),
+
+    #[error("DispatchError: Allocation over burst")]
+    AllocationOverBurst(String),
+
+    #[error("DispatchError: Dispatch happened but something failed after that")]
+    FailureAfterDispatch(Error),
+
+    #[error("DispatchError: Host resources extinguished")]
+    HostResourcesExtinguished,
+}
diff --git a/rust/crates/scheduler/src/job_dispatcher/event_handler.rs b/rust/crates/scheduler/src/job_dispatcher/event_handler.rs
new file mode 100644
index 000000000..fb2b3ff18
--- /dev/null
+++ b/rust/crates/scheduler/src/job_dispatcher/event_handler.rs
@@ -0,0 +1,167 @@
+use std::sync::Arc;
+
+use crate::{
+    config::CONFIG,
+    dao::{FrameDao, HostDao, LayerDao},
+    job_dispatcher::{DispatchError, dispatcher::RqdDispatcher},
+    models::{DispatchJob, DispatchLayer, Host},
+};
+use futures::StreamExt;
+use miette::Result;
+use tracing::{debug, error, info};
+
+/// Event handler for booking jobs to available hosts.
+///
+/// This handler orchestrates the job dispatch process by:
+/// - Processing incoming dispatch jobs
+/// - Finding eligible layers within each job
+/// - Matching layers to available host candidates
+/// - Dispatching frames to selected hosts via the RQD dispatcher
+pub struct BookJobEventHandler {
+    host_dao: Arc<HostDao>,
+    job_dao: LayerDao,
+    dispatcher: RqdDispatcher,
+}
+
+impl BookJobEventHandler {
+    /// Creates a new BookJobEventHandler with configured DAOs and dispatcher.
+    ///
+    /// Initializes the handler with:
+    /// - Host DAO for finding available hosts
+    /// - Layer DAO for querying job layers
+    /// - RQD dispatcher for frame execution
+    pub async fn new() -> Result<Self> {
+        let host_dao = Arc::new(HostDao::from_config(&CONFIG.database).await?);
+        let layer_dao = LayerDao::from_config(&CONFIG.database).await?;
+        let frame_dao = FrameDao::from_config(&CONFIG.database).await?;
+
+        let dispatcher = RqdDispatcher::new(
+            frame_dao,
+            host_dao.clone(),
+            CONFIG.rqd.grpc_port,
+            CONFIG.queue.dispatch_frames_per_layer_limit,
+            CONFIG.queue.memory_stranded_threshold.as_u64(),
+            CONFIG.rqd.dry_run_mode,
+        );
+        Ok(BookJobEventHandler {
+            host_dao,
+            job_dao: layer_dao,
+            dispatcher,
+        })
+    }
+
+    /// Processes a dispatch job by finding and dispatching its eligible layers.
+    ///
+    /// For each layer in the job:
+    /// - Queries eligible layers from the database
+    /// - Attempts to find suitable host candidates
+    /// - Dispatches frames to available hosts, layer by layer
+    ///
+    /// # Arguments
+    /// * `job` - The dispatch job containing layers to process
+    pub async fn process(&self, job: DispatchJob) {
+        let mut stream = self.job_dao.query_layers(job.id);
+        let mut pending_layers = 0;
+        // Stream eligible layers from this job and dispatch one by one
+        while let Some(layer_model) = stream.next().await {
+            let layer: Result<DispatchLayer, sqlx::Error> = layer_model.map(|l| l.into());
+            match layer {
+                Ok(dispatch_layer) => {
+                    pending_layers += 1;
+                    // Give up on this
+                    self.process_layer(dispatch_layer).await
+                }
+                Err(err) => {
+                    error!("Failed to query layers. {}", err);
+                }
+            }
+        }
+
+        if pending_layers == 0 {
+            info!("Found no pending layers for {}", job);
+        }
+    }
+
+    /// Processes a single layer by finding host candidates and attempting dispatch.
+    ///
+    /// The process:
+    /// 1. Queries host candidates suitable for the layer
+    /// 2. Attempts dispatch on each candidate until successful
+    /// 3. Handles various dispatch errors (resource exhaustion, allocation limits, etc.)
+    ///
+    /// # Arguments
+    /// * `dispatch_layer` - The layer to dispatch to a host
+    async fn process_layer(&self, dispatch_layer: DispatchLayer) {
+        let limit = 10;
+        let mut host_candidates_stream = self.host_dao.find_host_for_layer(&dispatch_layer, limit);
+        let mut candidates_count = 0;
+        let mut chosen_host = None;
+
+        // Attempt to dispatch host candidates and exit on the first successful attempt
+        while let Some(host_candidate) = host_candidates_stream.next().await {
+            match host_candidate {
+                Ok(host_model) => {
+                    candidates_count += 1;
+
+                    let host: Host = host_model.into();
+                    debug!(
+                        "Attempting host candidate {} for job {}",
+                        host, dispatch_layer
+                    );
+                    match self.dispatcher.dispatch(&dispatch_layer, &host).await {
+                        // Stop on the first successful attempt
+                        // Attempt next candidate in any failure case
+                        Ok(_) => {
+                            chosen_host.replace(host);
+                            break;
+                        }
+                        Err(DispatchError::HostLock(host_name)) => {
+                            info!("Failed to acquire lock for host {}", host_name)
+                        }
+                        Err(DispatchError::Failure(report)) => {
+                            error!(
+                                "Failed to dispatch {} on {}. {}",
+                                dispatch_layer,
+                                host,
+                                report.to_string()
+                            );
+                        }
+                        Err(DispatchError::AllocationOverBurst(allocation_name)) => {
+                            let msg = format!(
+                                "Skipping host in this selection for {}. Allocation {} is over burst.",
+                                dispatch_layer.job_id, allocation_name
+                            );
+                            info!(msg);
+                        }
+                        Err(DispatchError::FailureAfterDispatch(report)) => {
+                            // TODO: Implement a recovery logic for when a frame got dispatched
+                            // but its status hasn't been updated on the database
+                            let msg = format!(
+                                "Failed after dispatch {} on {}. {}",
+                                dispatch_layer, host, report
+                            );
+                            error!(msg);
+                        }
+                        Err(DispatchError::HostResourcesExtinguished) => {
+                            debug!(
+                                "Host resources for {} extinguished, skipping to the next candidate",
+                                host
+                            );
+                        }
+                    };
+                }
+                Err(err) => {
+                    error!("Failed to query host to dispatch. {}", err);
+                }
+            }
+        }
+        if candidates_count == 0 {
+            info!("Found no candidate for dispatching {}", dispatch_layer);
+        } else if chosen_host.is_none() {
+            info!(
+                "Attempted {} candidates and found no match for {}",
+                limit, dispatch_layer
+            );
+        }
+    }
+}
diff --git a/rust/crates/scheduler/src/job_dispatcher/frame_set.rs b/rust/crates/scheduler/src/job_dispatcher/frame_set.rs
new file mode 100644
index 000000000..5be71563c
--- /dev/null
+++ b/rust/crates/scheduler/src/job_dispatcher/frame_set.rs
@@ -0,0 +1,970 @@
+//! Frame range parsing and manipulation for OpenCue job queue.
+//!
+//! This module provides functionality for parsing and manipulating frame ranges
+//! commonly used in render farm job specifications. It supports various frame
+//! range syntaxes including simple ranges, stepped ranges, inverse steps, and
+//! interleaved patterns.
+//!
+//! # Frame Range Syntax
+//!
+//! The following syntax patterns are supported:
+//!
+//! - **Single frame**: `"5"` → `[5]`
+//! - **Simple range**: `"1-10"` → `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`
+//! - **Stepped range**: `"1-10x2"` → `[1, 3, 5, 7, 9]`
+//! - **Inverse stepped**: `"1-10y3"` → `[2, 3, 5, 6, 8, 9]` (excludes every 3rd frame)
+//! - **Negative step**: `"10-1x-2"` → `[10, 8, 6, 4, 2]`
+//! - **Interleaved**: `"1-10:5"` → `[1, 6, 3, 5, 7, 9, 2, 4, 8, 10]`
+//!
+//! # Frame Set Syntax
+//!
+//! Multiple frame ranges can be combined with commas:
+//! - `"1-5,10-15"` → `[1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15]`
+//! - `"1-10x2,20,25-30"` → `[1, 3, 5, 7, 9, 20, 25, 26, 27, 28, 29, 30]`
+//!
+//! # Examples
+//!
+//! ```rust
+//! use frame_set::{FrameRange, FrameSet};
+//!
+//! // Parse a simple frame range
+//! let range = FrameRange::new("1-10x2")?;
+//! assert_eq!(range.get_all(), &[1, 3, 5, 7, 9]);
+//!
+//! // Parse a complex frame set
+//! let frame_set = FrameSet::new("1-5,10-15x2")?;
+//! assert_eq!(frame_set.get_all(), &[1, 2, 3, 4, 5, 10, 12, 14]);
+//!
+//! // Get a chunk for job distribution
+//! let chunk = frame_set.get_chunk(2, 3)?; // Starting at index 2, size 3
+//! assert_eq!(chunk, "3-5");
+//! ```
+
+use indexmap::IndexSet;
+use miette::{Context, IntoDiagnostic, Result, miette};
+use regex::Regex;
+
+/// Represents a sequence of image frames parsed from a frame range specification.
+///
+/// A `FrameRange` represents a single contiguous or patterned sequence of frame numbers.
+/// It supports various syntaxes including simple ranges, stepped ranges, inverse steps,
+/// and interleaved patterns.
+///
+/// This is a direct port of the Java `FrameRange` class from OpenCue's Cuebot.
+///
+/// # Supported Syntax
+///
+/// - **Single frame**: `"42"` produces `[42]`
+/// - **Simple range**: `"1-10"` produces `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`
+/// - **Stepped range (x)**: `"1-10x3"` produces `[1, 4, 7, 10]` (every 3rd frame)
+/// - **Inverse stepped (y)**: `"1-10y3"` produces `[2, 3, 5, 6, 8, 9]` (all except every 3rd)
+/// - **Negative step**: `"10-1x-2"` produces `[10, 8, 6, 4, 2]` (backwards with step)
+/// - **Interleaved (:)**: `"1-10:5"` produces interleaved pattern for render optimization
+///
+/// # Validation Rules
+///
+/// - Step size cannot be zero
+/// - For positive steps, end frame must be >= start frame
+/// - For negative steps, end frame must be <= start frame
+/// - Step size and interleave size cannot be combined
+///
+/// # Examples
+///
+/// ```rust
+/// // Basic usage
+/// let range = FrameRange::new("1-10x2")?;
+/// assert_eq!(range.size(), 5);
+/// assert_eq!(range.get(0), Some(1));
+/// assert_eq!(range.get_all(), &[1, 3, 5, 7, 9]);
+///
+/// // Inverse stepping
+/// let inverse = FrameRange::new("1-10y3")?;
+/// assert_eq!(inverse.get_all(), &[2, 3, 5, 6, 8, 9]);
+/// ```
+#[derive(Debug, Clone, PartialEq)]
+#[allow(dead_code)]
+pub struct FrameRange {
+    frame_list: Vec<i32>,
+}
+
+impl FrameRange {
+    /// Constructs a new `FrameRange` by parsing a frame range specification.
+    ///
+    /// # Arguments
+    ///
+    /// * `frame_range` - A string specification following the frame range syntax
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(FrameRange)` - Successfully parsed frame range
+    /// * `Err(miette::Error)` - Parse error with description
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let range = FrameRange::new("1-10x2")?;
+    /// let single = FrameRange::new("42")?;
+    /// let inverse = FrameRange::new("1-10y3")?;
+    /// ```
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - The syntax is invalid or unrecognized
+    /// - Step size is zero
+    /// - Step direction conflicts with range direction
+    /// - Frame numbers cannot be parsed as integers
+    #[allow(dead_code)]
+    pub fn new(frame_range: &str) -> Result<Self> {
+        let frame_list = Self::parse_frame_range(frame_range)?;
+        Ok(FrameRange { frame_list })
+    }
+
+    /// Gets the number of frames contained in this sequence.
+    ///
+    /// # Returns
+    ///
+    /// The total count of frames in the range.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// let range = FrameRange::new("1-10x2")?;
+    /// assert_eq!(range.size(), 5); // [1, 3, 5, 7, 9]
+    /// ```
+    #[allow(dead_code)]
+    pub fn size(&self) -> usize {
+        self.frame_list.len()
+    }
+
+    /// Gets an individual frame number by its position in the sequence.
+    ///
+    /// # Arguments
+    ///
+    /// * `idx` - Zero-based index into the frame sequence
+    ///
+    /// # Returns
+    ///
+    /// * `Some(frame_number)` - If the index is valid
+    /// * `None` - If the index is out of bounds
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// let range = FrameRange::new("1-10x2")?;
+    /// assert_eq!(range.get(0), Some(1));
+    /// assert_eq!(range.get(2), Some(5));
+    /// assert_eq!(range.get(10), None);
+    /// ```
+    #[allow(dead_code)]
+    pub fn get(&self, idx: usize) -> Option<i32> {
+        self.frame_list.get(idx).copied()
+    }
+
+    /// Finds the index of a specific frame number in the sequence.
+    ///
+    /// # Arguments
+    ///
+    /// * `frame` - The frame number to search for
+    ///
+    /// # Returns
+    ///
+    /// * `Some(index)` - Zero-based index if the frame is found
+    /// * `None` - If the frame is not in the sequence
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// let range = FrameRange::new("1-10x2")?;
+    /// assert_eq!(range.index(5), Some(2));
+    /// assert_eq!(range.index(4), None); // 4 is not in [1,3,5,7,9]
+    /// ```
+    #[allow(dead_code)]
+    pub fn index(&self, frame: i32) -> Option<usize> {
+        self.frame_list.iter().position(|&x| x == frame)
+    }
+
+    /// Gets a reference to the complete frame sequence as a slice.
+    ///
+    /// # Returns
+    ///
+    /// A slice containing all frame numbers in order.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// let range = FrameRange::new("1-5")?;
+    /// assert_eq!(range.get_all(), &[1, 2, 3, 4, 5]);
+    /// ```
+    #[allow(dead_code)]
+    pub fn get_all(&self) -> &[i32] {
+        &self.frame_list
+    }
+
+    /// Parses a frame range specification string into a vector of frame numbers.
+    ///
+    /// This is the core parsing logic that handles all supported syntax patterns.
+    /// It uses regex patterns to identify and parse different frame range formats.
+    fn parse_frame_range(frame_range: &str) -> Result<Vec<i32>> {
+        let single_frame_pattern = Regex::new(r"^(-?\d+)$").unwrap();
+        let simple_range_pattern = Regex::new(r"^(?P<sf>-?\d+)-(?P<ef>-?\d+)$").unwrap();
+        let step_pattern =
+            Regex::new(r"^(?P<sf>-?\d+)-(?P<ef>-?\d+)(?P<stepSep>[xy])(?P<step>-?\d+)$").unwrap();
+        let interleave_pattern =
+            Regex::new(r"^(?P<sf>-?\d+)-(?P<ef>-?\d+):(?P<step>-?\d+)$").unwrap();
+
+        if let Some(caps) = single_frame_pattern.captures(frame_range) {
+            let frame: i32 = caps
+                .get(1)
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err(format!("Invalid frame number: {}", frame_range))?;
+            return Ok(vec![frame]);
+        }
+
+        if let Some(caps) = simple_range_pattern.captures(frame_range) {
+            let start_frame: i32 = caps
+                .name("sf")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid start frame".to_string())?;
+            let end_frame: i32 = caps
+                .name("ef")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid end frame".to_string())?;
+            let step = if end_frame >= start_frame { 1 } else { -1 };
+            return Self::get_int_range(start_frame, end_frame, step);
+        }
+
+        if let Some(caps) = step_pattern.captures(frame_range) {
+            let start_frame: i32 = caps
+                .name("sf")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid start frame".to_string())?;
+            let end_frame: i32 = caps
+                .name("ef")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid end frame".to_string())?;
+            let step: i32 = caps
+                .name("step")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid step".to_string())?;
+            let step_sep = caps.name("stepSep").unwrap().as_str();
+            let inverse_step = step_sep == "y";
+            return Self::get_stepped_range(start_frame, end_frame, step, inverse_step);
+        }
+
+        if let Some(caps) = interleave_pattern.captures(frame_range) {
+            let start_frame: i32 = caps
+                .name("sf")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid start frame".to_string())?;
+            let end_frame: i32 = caps
+                .name("ef")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid end frame".to_string())?;
+            let step: i32 = caps
+                .name("step")
+                .unwrap()
+                .as_str()
+                .parse()
+                .into_diagnostic()
+                .wrap_err("Invalid step".to_string())?;
+            return Self::get_interleaved_range(start_frame, end_frame, step);
+        }
+
+        Err(miette!("Unrecognized frame range syntax: {}", frame_range))
+    }
+
+    /// Generates an integer range with the specified start, end, and step values.
+    ///
+    /// This method handles the core logic for generating frame sequences, including
+    /// support for negative steps and proper filtering based on step intervals.
+    fn get_int_range(start: i32, end: i32, step: i32) -> Result<Vec<i32>> {
+        let (stream_start, stream_end) = if step < 0 { (end, start) } else { (start, end) };
+        let stream_step = step.abs();
+
+        let mut result = Vec::new();
+        let mut current = stream_start;
+
+        while current <= stream_end {
+            if (current - start) % stream_step == 0 {
+                result.push(current);
+            }
+            current += 1;
+        }
+
+        if step < 0 {
+            result.reverse();
+        }
+
+        Ok(result)
+    }
+
+    /// Generates a stepped range, optionally with inverse stepping.
+    ///
+    /// For normal stepping (x syntax), returns frames at the specified intervals.
+    /// For inverse stepping (y syntax), returns all frames EXCEPT those at the intervals.
+    ///
+    /// # Arguments
+    /// * `start` - Starting frame number
+    /// * `end` - Ending frame number
+    /// * `step` - Step interval
+    /// * `inverse_step` - If true, excludes stepped frames instead of including them
+    fn get_stepped_range(start: i32, end: i32, step: i32, inverse_step: bool) -> Result<Vec<i32>> {
+        Self::validate_step_sign(start, end, step)?;
+        let stepped_range = Self::get_int_range(start, end, step)?;
+
+        if inverse_step {
+            let full_range = Self::get_int_range(start, end, if step < 0 { -1 } else { 1 })?;
+            let stepped_set: std::collections::HashSet<_> = stepped_range.into_iter().collect();
+            let result: Vec<i32> = full_range
+                .into_iter()
+                .filter(|x| !stepped_set.contains(x))
+                .collect();
+            Ok(result)
+        } else {
+            Ok(stepped_range)
+        }
+    }
+
+    /// Generates an interleaved frame sequence for render optimization.
+    ///
+    /// The interleaved pattern distributes frames across the range to provide
+    /// better early feedback during rendering. The algorithm progressively
+    /// halves the step size to fill in gaps.
+    ///
+    /// Example: "1-10:5" produces [1, 6, 3, 5, 7, 9, 2, 4, 8, 10]
+    fn get_interleaved_range(start: i32, end: i32, mut step: i32) -> Result<Vec<i32>> {
+        Self::validate_step_sign(start, end, step)?;
+        let mut interleaved_frames = IndexSet::new();
+
+        while step.abs() > 0 {
+            let range = Self::get_int_range(start, end, step)?;
+            for frame in range {
+                interleaved_frames.insert(frame);
+            }
+            step /= 2;
+        }
+
+        Ok(interleaved_frames.into_iter().collect())
+    }
+
+    /// Validates that the step direction is compatible with the range direction.
+    ///
+    /// Ensures that positive steps are only used with ascending ranges and
+    /// negative steps are only used with descending ranges. Step size zero is invalid.
+    fn validate_step_sign(start: i32, end: i32, step: i32) -> Result<()> {
+        if step > 1 {
+            if end < start {
+                Err(miette!(
+                    "End frame may not be less than start frame when using a positive step"
+                ))
+            } else {
+                Ok(())
+            }
+        } else if step == 0 {
+            Err(miette!("Step cannot be zero"))
+        } else if step < 0 && end >= start {
+            Err(miette!(
+                "End frame may not be greater than start frame when using a negative step"
+            ))
+        } else {
+            Ok(())
+        }
+    }
+}
+
+/// Represents an ordered sequence of FrameRanges combined into a single frame list.
+///
+/// A `FrameSet` allows combining multiple frame range specifications using comma-separated
+/// syntax. Each section is parsed as a `FrameRange` and the results are concatenated.
+///
+/// This is a direct port of the Java `FrameSet` class from OpenCue's Cuebot.
+///
+/// # Syntax
+///
+/// Frame sets use comma-separated frame range specifications:
+/// - `"1-10"` - Simple range: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+/// - `"1-5,10-15"` - Multiple ranges: [1, 2, 3, 4, 5, 10, 11, 12, 13, 14, 15]
+/// - `"1-10x2,20,25-30"` - Mixed syntax: [1, 3, 5, 7, 9, 20, 25, 26, 27, 28, 29, 30]
+/// - `"1-5x2, 10-15, 20"` - Whitespace is trimmed automatically
+///
+/// # Job Distribution
+///
+/// FrameSet provides chunking functionality for distributing frames across render nodes:
+///
+/// ```rust
+/// let frame_set = FrameSet::new("1-100")?;
+/// let chunk1 = frame_set.get_chunk(0, 10)?; // "1-10"
+/// let chunk2 = frame_set.get_chunk(10, 10)?; // "11-20"
+/// ```
+///
+/// Chunks are returned as compact string representations that can be parsed by render nodes.
+///
+/// # Examples
+///
+/// ```rust
+/// // Basic frame set
+/// let frames = FrameSet::new("1-5,10-12")?;
+/// assert_eq!(frames.get_all(), &[1, 2, 3, 4, 5, 10, 11, 12]);
+/// assert_eq!(frames.size(), 8);
+///
+/// // Complex frame set with different syntaxes
+/// let complex = FrameSet::new("1-10x2,15,20-25")?;
+/// assert_eq!(complex.get_all(), &[1, 3, 5, 7, 9, 15, 20, 21, 22, 23, 24, 25]);
+///
+/// // Chunking for job distribution
+/// let chunk = complex.get_chunk(0, 3)?; // First 3 frames
+/// // Returns compact representation like "1-5x2,15"
+/// ```
+#[derive(Debug, Clone, PartialEq)]
+#[allow(dead_code)]
+pub struct FrameSet {
+    frame_list: Vec<i32>,
+}
+
+impl FrameSet {
+    /// Constructs a new `FrameSet` by parsing a comma-separated frame range specification.
+    ///
+    /// # Arguments
+    ///
+    /// * `frame_range` - Comma-separated frame range specifications
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(FrameSet)` - Successfully parsed frame set
+    /// * `Err(miette::Error)` - Parse error with description
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let simple = FrameSet::new("1-10")?;
+    /// let multi = FrameSet::new("1-5,10-15")?;
+    /// let complex = FrameSet::new("1-10x2, 20, 25-30")?;
+    /// ```
+    pub fn new(frame_range: &str) -> Result<Self> {
+        let frame_list = Self::parse_frame_range(frame_range)?;
+        Ok(FrameSet { frame_list })
+    }
+
+    /// Gets the total number of frames in this frame set.
+    ///
+    /// # Returns
+    ///
+    /// The total count of frames across all ranges.
+    #[allow(dead_code)]
+    pub fn size(&self) -> usize {
+        self.frame_list.len()
+    }
+
+    /// Gets an individual frame number by its position in the sequence.
+    ///
+    /// # Arguments
+    ///
+    /// * `idx` - Zero-based index into the frame sequence
+    ///
+    /// # Returns
+    ///
+    /// * `Some(frame_number)` - If the index is valid
+    /// * `None` - If the index is out of bounds
+    #[allow(dead_code)]
+    pub fn get(&self, idx: usize) -> Option<i32> {
+        self.frame_list.get(idx).copied()
+    }
+
+    /// Gets the last individual frame number.
+    ///
+    /// # Returns
+    ///
+    /// * `Some(frame_number)` - If the set is not empty
+    /// * `None` - Otherwise
+    pub fn last(&self) -> Option<i32> {
+        self.frame_list.last().cloned()
+    }
+
+    /// Finds the index of a specific frame number in the sequence.
+    ///
+    /// # Arguments
+    ///
+    /// * `frame` - The frame number to search for
+    ///
+    /// # Returns
+    ///
+    /// * `Some(index)` - Zero-based index if found
+    /// * `None` - If the frame is not in the set
+    pub fn index(&self, frame: i32) -> Option<usize> {
+        self.frame_list.iter().position(|&x| x == frame)
+    }
+
+    /// Gets a reference to the complete frame sequence as a slice.
+    ///
+    /// # Returns
+    ///
+    /// A slice containing all frame numbers in the order they were specified.
+    #[allow(dead_code)]
+    pub fn get_all(&self) -> &[i32] {
+        &self.frame_list
+    }
+
+    /// Returns a sub-FrameSet as a compact string representation for job distribution.
+    ///
+    /// This method is used to divide frame sets into smaller chunks for distribution
+    /// across render nodes. The returned string uses the most compact frame range
+    /// representation possible.
+
+impl FrameSet {
+    /// Constructs a new `FrameSet` by parsing a comma-separated frame range specification.
+    ///
+    /// # Arguments
+    ///
+    /// * `frame_range` - Comma-separated frame range specifications
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(FrameSet)` - Successfully parsed frame set
+    /// * `Err(miette::Error)` - Parse error with description
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let simple = FrameSet::new("1-10")?;
+    /// let multi = FrameSet::new("1-5,10-15")?;
+    /// let complex = FrameSet::new("1-10x2, 20, 25-30")?;
+    /// ```
+    pub fn new(frame_range: &str) -> Result<FrameSet> {
+        let frame_list = Self::parse_frame_range(frame_range)?;
+        Ok(FrameSet { frame_list })
+    }
+
+    /// Gets the total number of frames in this frame set.
+    ///
+    /// # Returns
+    ///
+    /// The total count of frames across all ranges.
+    #[allow(dead_code)]
+    pub fn size(&self) -> usize {
+        self.frame_list.len()
+    }
+
+    /// Gets an individual frame number by its position in the sequence.
+    ///
+    /// # Arguments
+    ///
+    /// * `idx` - Zero-based index into the frame sequence
+    ///
+    /// # Returns
+    ///
+    /// * `Some(frame_number)` - If the index is valid
+    /// * `None` - If the index is out of bounds
+    #[allow(dead_code)]
+    pub fn get(&self, idx: usize) -> Option<i32> {
+        self.frame_list.get(idx).copied()
+    }
+
+    /// Gets the last individual frame number.
+    ///
+    /// # Returns
+    ///
+    /// * `Some(frame_number)` - If the set is not empty
+    /// * `None` - Otherwise
+    pub fn last(&self) -> Option<i32> {
+        self.frame_list.last().cloned()
+    }
+
+    /// Finds the index of a specific frame number in the sequence.
+    ///
+    /// # Arguments
+    ///
+    /// * `frame` - The frame number to search for
+    ///
+    /// # Returns
+    ///
+    /// * `Some(index)` - Zero-based index if found
+    /// * `None` - If the frame is not in the set
+    pub fn index(&self, frame: i32) -> Option<usize> {
+        self.frame_list.iter().position(|&x| x == frame)
+    }
+
+    /// Gets a reference to the complete frame sequence as a slice.
+    ///
+    /// # Returns
+    ///
+    /// A slice containing all frame numbers in the order they were specified.
+    #[allow(dead_code)]
+    pub fn get_all(&self) -> &[i32] {
+        &self.frame_list
+    }
+
+    /// Returns a sub-FrameSet as a compact string representation for job distribution.
+    ///
+    /// This method is used to divide frame sets into smaller chunks for distribution
+    /// across render nodes. The returned string uses the most compact frame range
+    /// representation possible.
+    ///
+    /// # Arguments
+    ///
+    /// * `start_frame_index` - Zero-based index of the first frame to include
+    /// * `chunk_size` - Maximum number of frames to include in the chunk
+    ///
+    /// # Returns
+    ///
+    /// * `Ok(String)` - Compact frame range representation (e.g., "1-10", "1,3,5", "10-20x2")
+    /// * `Err(miette::Error)` - If start_frame_index is out of bounds
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// let frames = FrameSet::new("1-20")?;
+    /// assert_eq!(frames.get_chunk(0, 5)?, "1-5");
+    /// assert_eq!(frames.get_chunk(5, 5)?, "6-10");
+    ///
+    /// let stepped = FrameSet::new("1-10x2")?; // [1, 3, 5, 7, 9]
+    /// assert_eq!(stepped.get_chunk(1, 3)?, "3-7x2"); // [3, 5, 7]
+    /// ```
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if `start_frame_index` is greater than or equal to the
+    /// total number of frames in the set.
+    pub fn get_chunk(&self, start_frame_index: usize, chunk_size: usize) -> Result<String> {
+        if self.frame_list.len() <= start_frame_index {
+            return Err(miette!(
+                "startFrameIndex {} is not in range 0-{}",
+                start_frame_index,
+                self.frame_list.len() - 1
+            ));
+        }
+
+        if chunk_size == 1 {
+            return Ok(self.frame_list[start_frame_index].to_string());
+        }
+
+        let final_frame_index = self.frame_list.len() - 1;
+        let mut end_frame_index = start_frame_index + chunk_size - 1;
+        if end_frame_index > final_frame_index {
+            end_frame_index = final_frame_index;
+        }
+
+        let subset = &self.frame_list[start_frame_index..=end_frame_index];
+        Ok(Self::frames_to_frame_ranges(subset))
+    }
+
+    /// Parses a comma-separated frame range specification into a vector of frame numbers.
+    ///
+    /// Each comma-separated section is parsed as an individual FrameRange and the
+    /// results are concatenated in order.
+    fn parse_frame_range(frame_range: &str) -> Result<Vec<i32>> {
+        let mut result = Vec::new();
+        for frame_range_section in frame_range.split(',') {
+            let section_frames = FrameRange::parse_frame_range(frame_range_section.trim())?;
+            result.extend(section_frames);
+        }
+        Ok(result)
+    }
+
+    /// Builds a compact string representation for a frame range part.
+    ///
+    /// Returns the most compact representation:
+    /// - Single frame: "5"
+    /// - Consecutive frames: "1-10"
+    /// - Stepped frames: "1-10x2"
+    fn build_frame_part(start_frame: i32, end_frame: i32, step: i32) -> String {
+        if start_frame == end_frame {
+            start_frame.to_string()
+        } else if step == 1 {
+            format!("{}-{}", start_frame, end_frame)
+        } else {
+            format!("{}-{}x{}", start_frame, end_frame, step)
+        }
+    }
+
+    /// Converts a list of frame numbers back to the most compact frame range representation.
+    ///
+    /// This method analyzes the frame sequence to detect patterns and produces
+    /// the most compact string representation possible. It's adapted from the
+    /// Python Fileseq library approach used in the original Java implementation.
+    ///
+    /// # Arguments
+    ///
+    /// * `frames` - Slice of frame numbers in ascending order
+    ///
+    /// # Returns
+    ///
+    /// Compact frame range string (e.g., "1-10", "1-10x2", "1,3,5,10-15")
+    fn frames_to_frame_ranges(frames: &[i32]) -> String {
+        let l = frames.len();
+        if l == 0 {
+            return String::new();
+        } else if l == 1 {
+            return frames[0].to_string();
+        }
+
+        let mut result_parts = Vec::new();
+        let mut curr_count = 1;
+        let mut curr_step = 0;
+        let mut curr_start = frames[0];
+        let mut last_frame = frames[0];
+
+        for &curr_frame in frames.iter().skip(1) {
+            if curr_step == 0 {
+                curr_step = curr_frame - curr_start;
+            }
+            let new_step = curr_frame - last_frame;
+
+            if curr_step == new_step {
+                last_frame = curr_frame;
+                curr_count += 1;
+            } else if curr_count == 2 && curr_step != 1 {
+                result_parts.push(curr_start.to_string());
+                curr_step = 0;
+                curr_start = last_frame;
+                last_frame = curr_frame;
+            } else {
+                result_parts.push(Self::build_frame_part(curr_start, last_frame, curr_step));
+                curr_step = 0;
+                curr_start = curr_frame;
+                last_frame = curr_frame;
+                curr_count = 1;
+            }
+        }
+
+        if curr_count == 2 && curr_step != 1 {
+            result_parts.push(curr_start.to_string());
+            result_parts.push(frames[frames.len() - 1].to_string());
+        } else {
+            result_parts.push(Self::build_frame_part(
+                curr_start,
+                frames[frames.len() - 1],
+                curr_step,
+            ));
+        }
+
+        result_parts.join(",")
+    }
+}
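One subtlety of `frames_to_frame_ranges`: the `curr_count == 2 && curr_step != 1` branches emit a pair of gapped frames as two singles, since a two-element range such as `1-4x3` is no shorter than `1,4`. A test in the style of the suite below would pin that behavior down (hypothetical test name; not part of the suite):

```rust
#[test]
fn test_frames_to_frame_ranges_pair_with_gap() {
    // Two stepped frames collapse to singles, not to "1-4x3".
    assert_eq!(FrameSet::frames_to_frame_ranges(&[1, 4]), "1,4");
}
```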
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // Basic functionality tests
+    #[test]
+    fn test_single_frame() {
+        let frame_range = FrameRange::new("5").unwrap();
+        assert_eq!(frame_range.get_all(), &[5]);
+    }
+
+    #[test]
+    fn test_single_frame_negative() {
+        let frame_range = FrameRange::new("-5").unwrap();
+        assert_eq!(frame_range.get_all(), &[-5]);
+    }
+
+    #[test]
+    fn test_simple_range() {
+        let frame_range = FrameRange::new("1-5").unwrap();
+        assert_eq!(frame_range.get_all(), &[1, 2, 3, 4, 5]);
+    }
+
+    #[test]
+    fn test_simple_range_negative() {
+        let frame_range = FrameRange::new("-5--1").unwrap();
+        assert_eq!(frame_range.get_all(), &[-5, -4, -3, -2, -1]);
+    }
+
+    // Stepped range tests (x syntax)
+    #[test]
+    fn test_stepped_range_basic() {
+        let frame_range = FrameRange::new("1-10x2").unwrap();
+        assert_eq!(frame_range.get_all(), &[1, 3, 5, 7, 9]);
+    }
+
+    #[test]
+    fn test_stepped_range_documented_example() {
+        let frame_range = FrameRange::new("1-10x3").unwrap();
+        assert_eq!(frame_range.get_all(), &[1, 4, 7, 10]);
+    }
+
+    #[test]
+    fn test_stepped_range_step_of_one() {
+        let frame_range = FrameRange::new("1-5x1").unwrap();
+        assert_eq!(frame_range.get_all(), &[1, 2, 3, 4, 5]);
+    }
+
+    #[test]
+    fn test_stepped_range_large_step() {
+        let frame_range = FrameRange::new("1-10x5").unwrap();
+        assert_eq!(frame_range.get_all(), &[1, 6]);
+    }
+
+    // Negative stepped range tests
+    #[test]
+    fn test_negative_stepped_range() {
+        let frame_range = FrameRange::new("10-1x-1").unwrap();
+        assert_eq!(frame_range.get_all(), &[10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
+    }
+
+    #[test]
+    fn test_negative_stepped_range_with_step() {
+        let frame_range = FrameRange::new("10-1x-2").unwrap();
+        assert_eq!(frame_range.get_all(), &[10, 8, 6, 4, 2]);
+    }
+
+    // Inverse stepped range tests (y syntax)
+    #[test]
+    fn test_inverse_stepped_range_documented_example() {
+        let frame_range = FrameRange::new("1-10y3").unwrap();
+        assert_eq!(frame_range.get_all(), &[2, 3, 5, 6, 8, 9]);
+    }
+
+    #[test]
+    fn test_inverse_stepped_range_step_2() {
+        let frame_range = FrameRange::new("1-10y2").unwrap();
+        assert_eq!(frame_range.get_all(), &[2, 4, 6, 8, 10]);
+    }
+
+    #[test]
+    fn test_inverse_stepped_range_step_1() {
+        let frame_range = FrameRange::new("1-5y1").unwrap();
+        assert_eq!(frame_range.get_all(), &[] as &[i32]);
+    }
+
+    // Interleaved range tests (: syntax)
+    #[test]
+    fn test_interleaved_range_documented_example() {
+        let frame_range = FrameRange::new("1-10:5").unwrap();
+        // Actual output from our implementation
+        assert_eq!(frame_range.get_all(), &[1, 6, 3, 5, 7, 9, 2, 4, 8, 10]);
+    }
+
+    #[test]
+    fn test_interleaved_range_step_2() {
+        let frame_range = FrameRange::new("1-8:2").unwrap();
+        assert_eq!(frame_range.get_all(), &[1, 3, 5, 7, 2, 4, 6, 8]);
+    }
+
+    #[test]
+    fn test_interleaved_range_step_4() {
+        let frame_range = FrameRange::new("1-8:4").unwrap();
+        // Actual output from our implementation
+        assert_eq!(frame_range.get_all(), &[1, 5, 3, 7, 2, 4, 6, 8]);
+    }
+
+    // Error cases and validation
+    #[test]
+    fn test_step_zero_error() {
+        let result = FrameRange::new("1-10x0");
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Step cannot be zero")
+        );
+    }
+
+    #[test]
+    fn test_positive_step_with_descending_range_error() {
+        let result = FrameRange::new("10-1x2");
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("End frame may not be less than start frame when using a positive step")
+        );
+    }
+
+    #[test]
+    fn test_negative_step_with_ascending_range_error() {
+        let result = FrameRange::new("1-10x-2");
+        assert!(result.is_err());
+        assert!(
+            result.unwrap_err().to_string().contains(
+                "End frame may not be greater than start frame when using a negative step"
+            )
+        );
+    }
+
+    #[test]
+    fn test_invalid_syntax_error() {
+        let result = FrameRange::new("1-10z2");
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("Unrecognized frame range syntax")
+        );
+    }
+
+    #[test]
+    fn test_malformed_range_error() {
+        let result = FrameRange::new("abc");
+        assert!(result.is_err());
+    }
+
+    // FrameSet tests
+    #[test]
+    fn test_frame_set_simple() {
+        let frame_set = FrameSet::new("1-3,5-7").unwrap();
+        assert_eq!(frame_set.get_all(), &[1, 2, 3, 5, 6, 7]);
+    }
+
+    #[test]
+    fn test_frame_set_mixed_syntax() {
+        let frame_set = FrameSet::new("1-5x2,10,15-20").unwrap();
+        assert_eq!(frame_set.get_all(), &[1, 3, 5, 10, 15, 16, 17, 18, 19, 20]);
+    }
+
+    #[test]
+    fn test_frame_set_with_spaces() {
+        let frame_set = FrameSet::new("1-3, 5-7, 10").unwrap();
+        assert_eq!(frame_set.get_all(), &[1, 2, 3, 5, 6, 7, 10]);
+    }
+
+    #[test]
+    fn test_frame_set_single_frame() {
+        let frame_set = FrameSet::new("42").unwrap();
+        assert_eq!(frame_set.get_all(), &[42]);
+    }
+
+    // Chunk tests
+    #[test]
+    fn test_frame_set_get_chunk() {
+        let frame_set = FrameSet::new("1-10").unwrap();
+        let chunk = frame_set.get_chunk(0, 3).unwrap();
+        assert_eq!(chunk, "1-3");
+    }
+
+    #[test]
+    fn test_frame_set_get_chunk_single() {
+        let frame_set = FrameSet::new("1-10").unwrap();
+        let chunk = frame_set.get_chunk(2, 1).unwrap();
+        assert_eq!(chunk, "3");
+    }
+
+    #[test]
+    fn test_frame_set_get_chunk_end_of_range() {
+        let frame_set = FrameSet::new("1-10").unwrap();
+        let chunk = frame_set.get_chunk(7, 5).unwrap(); // Should only get frames 8,9,10
+        assert_eq!(chunk, "8-10");
+    }
+
+    #[test]
+    fn test_frame_set_get_chunk_out_of_bounds() {
+        let frame_set = FrameSet::new("1-5").unwrap();
+        let result = frame_set.get_chunk(10, 3);
+        assert!(result.is_err());
+        assert!(
+            result
+                .unwrap_err()
+                .to_string()
+                .contains("startFrameIndex 10 is not in range 0-4")
+        );
+    }
range 0-4") + ); + } + + #[test] + fn test_frame_set_get_chunk_stepped_frames() { + let frame_set = FrameSet::new("1-10x2").unwrap(); // [1, 3, 5, 7, 9] + let chunk = frame_set.get_chunk(1, 3).unwrap(); // Should get [3, 5, 7] + assert_eq!(chunk, "3-7x2"); + } + + // Frame range reconstruction tests + #[test] + fn test_frames_to_frame_ranges_simple() { + let frames = &[1, 2, 3, 5, 6, 7]; + let result = FrameSet::frames_to_frame_ranges(frames); + assert_eq!(result, "1-3,5-7"); + } + + #[test] + fn test_frames_to_frame_ranges_stepped() { + let frames = &[1, 3, 5, 7, 9]; + let result = FrameSet::frames_to_frame_ranges(frames); + assert_eq!(result, "1-9x2"); + } + + #[test] + fn test_frames_to_frame_ranges_single_frame() { + let frames = &[42]; + let result = FrameSet::frames_to_frame_ranges(frames); + assert_eq!(result, "42"); + } + + #[test] + fn test_frames_to_frame_ranges_empty() { + let frames = &[]; + let result = FrameSet::frames_to_frame_ranges(frames); + assert_eq!(result, ""); + } + + #[test] + fn test_frames_to_frame_ranges_mixed() { + let frames = &[1, 3, 5, 10, 11, 12]; + let result = FrameSet::frames_to_frame_ranges(frames); + assert_eq!(result, "1-5x2,10-12"); + } + + #[test] + fn test_frames_to_frame_ranges_single_gaps() { + let frames = &[1, 3, 5]; + let result = FrameSet::frames_to_frame_ranges(frames); + assert_eq!(result, "1-5x2"); + } + + // Edge cases + #[test] + fn test_frame_range_single_element_range() { + let frame_range = FrameRange::new("5-5").unwrap(); + assert_eq!(frame_range.get_all(), &[5]); + } + + #[test] + fn test_frame_range_backwards_single_step() { + let frame_range = FrameRange::new("5-1").unwrap(); + assert_eq!(frame_range.get_all(), &[5, 4, 3, 2, 1]); + } + + #[test] + fn test_complex_frame_set() { + let frame_set = FrameSet::new("1-5x2,10-15,20-30x3,50").unwrap(); + let expected = [1, 3, 5, 10, 11, 12, 13, 14, 15, 20, 23, 26, 29, 50]; + assert_eq!(frame_set.get_all(), &expected); + } +} diff --git a/rust/crates/scheduler/src/job_dispatcher/job_consumer.rs b/rust/crates/scheduler/src/job_dispatcher/job_consumer.rs new file mode 100644 index 000000000..af7ccd588 --- /dev/null +++ b/rust/crates/scheduler/src/job_dispatcher/job_consumer.rs @@ -0,0 +1,181 @@ +use crate::{ + config::CONFIG, job_dispatcher::event_handler::BookJobEventHandler, models::DispatchJob, +}; +use futures::StreamExt; +use miette::{Context, IntoDiagnostic, Result}; +use moka::future::Cache; +use rdkafka::{ + ClientConfig, Message, + consumer::{Consumer, StreamConsumer}, +}; +use tracing::{error, info, warn}; +use uuid::Uuid; + +/// General job dispatcher that coordinates Kafka message consumption with job processing. +/// +/// This is the main entry point for the job dispatch system, wrapping a Kafka consumer +/// that processes dispatch job messages from the queue. +pub struct GeneralJobDispatcher { + consumer: KafkaJobConsumer, +} + +impl GeneralJobDispatcher { + /// Creates a new general job dispatcher with Kafka consumer and event handler. + /// + /// Sets up the complete dispatch pipeline including database connections, + /// Kafka consumer configuration, and job event handling. 
+    ///
+    /// # Returns
+    /// * `Ok(GeneralJobDispatcher)` - Configured dispatcher ready to process jobs
+    /// * `Err(miette::Error)` - If initialization fails (typically database connection issues)
+    pub async fn new() -> Result<GeneralJobDispatcher> {
+        let job_dispatcher = BookJobEventHandler::new()
+            .await
+            .wrap_err("Failed to start JobMessageExecutor, possibly a database connection error")?;
+
+        let consumer = KafkaJobConsumer::new(
+            &CONFIG.kafka.bootstrap_servers,
+            CONFIG.kafka.general_jobs_topic.topic_name.clone(),
+            job_dispatcher,
+        )?;
+
+        Ok(GeneralJobDispatcher { consumer })
+    }
+}
+
+/// Kafka consumer for processing job dispatch messages.
+///
+/// Handles:
+/// - Kafka message consumption and deserialization
+/// - Job back-off caching to prevent repeated processing
+/// - Message acknowledgment and error handling
+/// - Integration with job event handler for dispatch processing
+pub struct KafkaJobConsumer {
+    id: Uuid,
+    topic_name: String,
+    consumer: StreamConsumer,
+    event_handler: BookJobEventHandler,
+    back_off_cache: Cache<Uuid, ()>,
+}
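The back-off cache above is a `moka` TTL cache keyed by job id. A minimal self-contained sketch of its semantics, with an assumed 30-second window (the consumer reads the real value from `CONFIG.queue.job_back_off_duration`):

```rust
use std::time::Duration;

use moka::future::Cache;
use uuid::Uuid;

#[tokio::main]
async fn main() {
    // Entries expire after the back-off window; until then the job is skipped.
    let back_off: Cache<Uuid, ()> = Cache::builder()
        .time_to_live(Duration::from_secs(30)) // assumed window for illustration
        .build();

    let job_id = Uuid::new_v4();
    assert!(!back_off.contains_key(&job_id)); // first sighting: process the job
    back_off.insert(job_id, ()).await;
    assert!(back_off.contains_key(&job_id)); // seen within the window: skip it
}
```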
+
+impl KafkaJobConsumer {
+    /// Creates a new Kafka job consumer with the specified configuration.
+    ///
+    /// Configures the Kafka consumer with:
+    /// - Bootstrap servers for cluster connection
+    /// - Consumer group membership for load balancing
+    /// - Manual commit mode for reliable message processing
+    /// - Back-off cache to prevent duplicate job processing
+    ///
+    /// # Arguments
+    /// * `bootstrap_servers` - Comma-separated list of Kafka broker addresses
+    /// * `topic_name` - Name of the Kafka topic to consume from
+    /// * `dispatcher` - Event handler for processing consumed jobs
+    ///
+    /// # Returns
+    /// * `Ok(KafkaJobConsumer)` - Configured consumer ready to start processing
+    /// * `Err(miette::Error)` - If Kafka client creation fails
+    pub fn new(
+        bootstrap_servers: &str,
+        topic_name: String,
+        dispatcher: BookJobEventHandler,
+    ) -> Result<Self> {
+        let id = Uuid::new_v4();
+        let consumer = ClientConfig::new()
+            .set("bootstrap.servers", bootstrap_servers)
+            .set("group.id", "opencue-job-dispatchers")
+            .set("client.id", id.to_string())
+            .set("enable.auto.commit", "false") // Manual commit for better control
+            .set("auto.offset.reset", "earliest")
+            .set("max.poll.interval.ms", "300000") // 5 minutes
+            .set("session.timeout.ms", "30000")
+            .set("heartbeat.interval.ms", "3000")
+            .create()
+            .into_diagnostic()
+            .wrap_err("Failed to start Kafka consumer client")?;
+
+        let back_off_cache = Cache::builder()
+            .time_to_live(CONFIG.queue.job_back_off_duration)
+            .build();
+
+        Ok(KafkaJobConsumer {
+            id,
+            topic_name,
+            consumer,
+            event_handler: dispatcher,
+            back_off_cache,
+        })
+    }
+
+    /// Starts the Kafka consumer to process job dispatch messages.
+    ///
+    /// This is the main processing loop that:
+    /// 1. Subscribes to the configured Kafka topic
+    /// 2. Continuously consumes messages from the stream
+    /// 3. Deserializes job messages and processes them via the event handler
+    /// 4. Implements back-off caching to prevent duplicate processing
+    /// 5. Commits messages after successful processing
+    ///
+    /// The loop runs indefinitely until an unrecoverable error occurs.
+    ///
+    /// # Returns
+    /// * `Ok(())` - Never returned in normal operation (the loop runs indefinitely)
+    /// * `Err(miette::Error)` - If subscription fails or another critical error occurs
+    pub async fn start(&self) -> Result<()> {
+        self.consumer
+            .subscribe(&[&self.topic_name])
+            .into_diagnostic()
+            .wrap_err("Failed to subscribe to topic")?;
+
+        info!(
+            "Job Dispatcher {} started consuming {}",
+            self.id, self.topic_name
+        );
+
+        let mut message_stream = self.consumer.stream();
+        while let Some(message) = message_stream.next().await {
+            info!("Got message");
+            match message {
+                Ok(msg) => {
+                    if let Some(payload) = msg.payload() {
+                        let serialized_job = String::from_utf8_lossy(payload);
+                        match serde_json::from_str::<DispatchJob>(&serialized_job) {
+                            Ok(job) => {
+                                if self.back_off_cache.contains_key(&job.id) {
+                                    info!("Skipping job {}", job);
+                                } else {
+                                    info!("Consumed job {}", job);
+                                    self.back_off_cache.insert(job.id, ()).await;
+                                    self.event_handler.process(job).await;
+                                }
+                            }
+                            Err(err) => {
+                                warn!("Failed to deserialize job: {}", err);
+                                // TODO: push failed message to a different queue
+                            }
+                        }
+                    }
+
+                    if let Err(err) = self
+                        .consumer
+                        .commit_message(&msg, rdkafka::consumer::CommitMode::Async)
+                    {
+                        warn!("Failed to commit message. {}", err);
+                    }
+                }
+                Err(err) => {
+                    error!("Error receiving kafka message: {}", err);
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+impl std::ops::Deref for GeneralJobDispatcher {
+    type Target = KafkaJobConsumer;
+
+    fn deref(&self) -> &Self::Target {
+        &self.consumer
+    }
+}
diff --git a/rust/crates/scheduler/src/job_dispatcher/mod.rs b/rust/crates/scheduler/src/job_dispatcher/mod.rs
new file mode 100644
index 000000000..bd909371a
--- /dev/null
+++ b/rust/crates/scheduler/src/job_dispatcher/mod.rs
@@ -0,0 +1,14 @@
+mod dispatcher;
+mod error;
+mod event_handler;
+mod frame_set;
+mod job_consumer;
+
+pub use error::{DispatchError, VirtualProcError};
+use job_consumer::GeneralJobDispatcher;
+
+pub async fn run() -> miette::Result<()> {
+    let job_dispatcher = GeneralJobDispatcher::new().await?;
+
+    job_dispatcher.start().await
+}
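The producer below and the consumer above agree on a JSON wire format. A sketch of that round trip with a stand-in struct: `WireJob` is hypothetical, and the real `DispatchJob` (in models/job.rs further down) routes its id through custom serde helpers, which for JSON yields the same string form, assuming the `uuid` crate's `serde` feature:

```rust
use serde::{Deserialize, Serialize};
use uuid::Uuid;

// Stand-in mirroring the DispatchJob field layout.
#[derive(Serialize, Deserialize)]
struct WireJob {
    id: Uuid,
    int_priority: i32,
    age_days: i32,
}

fn main() {
    let job = WireJob { id: Uuid::new_v4(), int_priority: 100, age_days: 2 };
    let payload = serde_json::to_string(&job).unwrap(); // what the producer publishes
    let parsed: WireJob = serde_json::from_str(&payload).unwrap(); // what start() decodes
    assert_eq!(parsed.int_priority, 100);
}
```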
diff --git a/rust/crates/scheduler/src/job_fetcher/job_producer.rs b/rust/crates/scheduler/src/job_fetcher/job_producer.rs
new file mode 100644
index 000000000..90dfc7154
--- /dev/null
+++ b/rust/crates/scheduler/src/job_fetcher/job_producer.rs
@@ -0,0 +1,147 @@
+use std::{marker::PhantomData, time::Duration};
+
+use miette::{Context, IntoDiagnostic, Result};
+use rdkafka::{
+    ClientConfig,
+    admin::{AdminClient, AdminOptions, NewTopic, TopicReplication},
+    client::DefaultClientContext,
+    producer::{FutureProducer, FutureRecord},
+    types::RDKafkaErrorCode,
+    util::Timeout,
+};
+use serde::Serialize;
+use tracing::{error, info, trace};
+
+use crate::{
+    config::{KafkaConfig, TopicConfig},
+    models::{DispatchJob, Partitionable},
+};
+
+pub struct GeneralJobQueue {
+    producer: KafkaTopicProducer<DispatchJob>,
+}
+
+impl GeneralJobQueue {
+    pub fn from_config(config: &KafkaConfig) -> Result<Self> {
+        let producer: KafkaTopicProducer<DispatchJob> =
+            KafkaTopicProducer::new(config, &config.general_jobs_topic)?;
+
+        Ok(Self { producer })
+    }
+}
+
+pub struct KafkaTopicProducer<T> {
+    topic_name: String,
+    num_partitions: i32,
+    replication_factor: i32,
+    retention: Duration,
+    producer: FutureProducer,
+    config: KafkaConfig,
+    _phantom: PhantomData<T>,
+}
+
+impl<T: Serialize + Partitionable> KafkaTopicProducer<T> {
+    pub fn new(config: &KafkaConfig, topic_config: &TopicConfig) -> Result<Self> {
+        let producer: FutureProducer = ClientConfig::new()
+            .set("bootstrap.servers", &config.bootstrap_servers)
+            .create()
+            .into_diagnostic()?;
+
+        Ok(KafkaTopicProducer {
+            topic_name: topic_config.topic_name.clone(),
+            num_partitions: topic_config.num_partitions,
+            replication_factor: topic_config.replication_factor,
+            retention: topic_config.retention,
+            producer,
+            config: config.clone(),
+            _phantom: PhantomData,
+        })
+    }
+
+    pub async fn send(&self, payload: &T) -> Result<()> {
+        let serialized_payload = serde_json::to_string(payload)
+            .into_diagnostic()
+            .wrap_err("Failed to serialize payload")?;
+        let key = payload.partition_key();
+        let record = FutureRecord::to(&self.topic_name)
+            .payload(&serialized_payload)
+            .key(&key);
+
+        match self
+            .producer
+            .send(record, Timeout::After(self.config.timeout))
+            .await
+        {
+            Ok(delivery) => {
+                trace!(
+                    "Message sent with key {} to {} at partition {}",
+                    key, self.topic_name, delivery.partition
+                );
+            }
+            Err((kafka_error, _)) => {
+                error!(
+                    "Failed to deliver message with key {} to {}. {}",
+                    key, self.topic_name, kafka_error
+                )
+            }
+        }
+        Ok(())
+    }
+
+    pub async fn create_topic(&self) -> Result<()> {
+        let admin_client: AdminClient<DefaultClientContext> = ClientConfig::new()
+            .set("bootstrap.servers", &self.config.bootstrap_servers)
+            .create()
+            .into_diagnostic()
+            .wrap_err("Failed to connect AdminClient")?;
+
+        let retention = self.retention.as_millis().to_string();
+
+        info!("Replication = {}", self.replication_factor);
+        let new_topic = NewTopic::new(
+            &self.topic_name,
+            self.num_partitions,
+            TopicReplication::Fixed(self.replication_factor),
+        )
+        // How long messages are retained in the topic before being deleted
+        .set("retention.ms", &retention)
+        // Use log compaction to keep only the latest value for each key
+        .set("cleanup.policy", "compact")
+        // Minimum ratio of dirty (uncompacted) records to total records before compaction triggers
+        .set("min.cleanable.dirty.ratio", "0.1")
+        // Maximum time a segment is kept open before being closed and made available for compaction
+        .set("segment.ms", "60000") // 1 minute
+        // How long to retain delete tombstone markers for compacted topics
+        .set("delete.retention.ms", "60000") // 1 minute
+        // Maximum size of a single log segment file before rolling to a new segment
+        .set("segment.bytes", "5242880"); // 5 MB
+        let options = AdminOptions::new().operation_timeout(Some(Duration::from_secs(30)));
+
+        let results = admin_client
+            .create_topics(&[new_topic], &options)
+            .await
+            .into_diagnostic()
+            .wrap_err("Failed to create topic")?;
+
+        for result in results {
+            match result {
+                Ok(topic) => info!("Topic '{}' created successfully", topic),
+                Err((topic, RDKafkaErrorCode::TopicAlreadyExists)) => {
+                    info!("Topic '{}' already exists.", topic)
+                }
+                Err((_topic, error)) => Err(error)
+                    .into_diagnostic()
+                    .wrap_err("Failed to create topic")?,
+            }
+        }
+        Ok(())
+    }
+}
+
+impl std::ops::Deref for GeneralJobQueue {
+    type Target = KafkaTopicProducer<DispatchJob>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.producer
+    }
+}
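One timing subtlety in the fetch loop below: `tokio::time::interval` completes its first tick immediately, so the first round of job publishing happens at startup rather than one full interval later. A minimal sketch of that behavior:

```rust
use std::time::Duration;
use tokio::time;

#[tokio::main]
async fn main() {
    let mut interval = time::interval(Duration::from_secs(5));
    interval.tick().await; // resolves immediately
    interval.tick().await; // resolves roughly 5 seconds later
    println!("two ticks done");
}
```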
diff --git a/rust/crates/scheduler/src/job_fetcher/mod.rs b/rust/crates/scheduler/src/job_fetcher/mod.rs
new file mode 100644
index 000000000..9191e294a
--- /dev/null
+++ b/rust/crates/scheduler/src/job_fetcher/mod.rs
@@ -0,0 +1,40 @@
+mod job_producer;
+
+use std::time::Duration;
+
+use futures::StreamExt;
+use tokio::time;
+use tracing::{error, info, warn};
+
+use crate::config::CONFIG;
+use crate::dao::JobDao;
+use crate::models::DispatchJob;
+
+pub async fn run(monitor_interval: Option<Duration>) -> miette::Result<()> {
+    let job_fetcher = JobDao::from_config(&CONFIG.database).await?;
+    let queue_manager = job_producer::GeneralJobQueue::from_config(&CONFIG.kafka)?;
+    queue_manager.create_topic().await?;
+
+    let mut interval = time::interval(monitor_interval.unwrap_or(CONFIG.queue.monitor_interval));
+    loop {
+        interval.tick().await;
+
+        let mut stream = job_fetcher.query_active_jobs();
+
+        while let Some(job) = stream.next().await {
+            match job {
+                Ok(job_model) => {
+                    let job: DispatchJob = job_model.into();
+                    info!("Found job: {}", job);
+                    if let Err(err) = queue_manager.send(&job).await {
+                        warn!("Failed to send job to kafka: {}", err)
+                    }
+                }
+                Err(err) => {
+                    error!("Failed to fetch job: {}", err);
+                }
+            }
+        }
+        info!("Finished streaming all jobs for this round")
+    }
+}
diff --git a/rust/crates/scheduler/src/main.rs b/rust/crates/scheduler/src/main.rs
new file mode 100644
index 000000000..f7869691f
--- /dev/null
+++ b/rust/crates/scheduler/src/main.rs
@@ -0,0 +1,89 @@
+use std::{str::FromStr, time::Duration};
+
+use miette::IntoDiagnostic;
+use structopt::StructOpt;
+use tracing_rolling_file::{RollingConditionBase, RollingFileAppenderBase};
+
+use crate::config::CONFIG;
+
+mod config;
+mod dao;
+mod job_dispatcher;
+mod job_fetcher;
+mod models;
+mod pgpool;
+
+#[derive(StructOpt, Debug)]
+pub struct JobQueueCli {
+    #[structopt(subcommand)]
+    subcommands: SubCommands,
+}
+
+#[derive(StructOpt, Debug)]
+enum SubCommands {
+    JobProducer(JobProducerCmd),
+    JobDispatcher(JobDispatcherCmd),
+}
+
+#[derive(StructOpt, Debug)]
+struct JobDispatcherCmd {}
+
+#[derive(StructOpt, Debug)]
+struct JobProducerCmd {
+    #[structopt(
+        long,
+        short = "i",
+        long_help = "Interval the consumer loop should query and publish job updates to the queue"
+    )]
+    monitor_interval_seconds: Option<u64>,
+}
+
+impl JobQueueCli {
+    async fn run(&self) -> miette::Result<()> {
+        match &self.subcommands {
+            SubCommands::JobProducer(job_producer_cmd) => {
+                job_fetcher::run(
+                    job_producer_cmd
+                        .monitor_interval_seconds
+                        .map(Duration::from_secs),
+                )
+                .await
+            }
+            SubCommands::JobDispatcher(_) => job_dispatcher::run().await,
+        }
+    }
+}
+
+fn main() -> miette::Result<()> {
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(CONFIG.queue.worker_threads)
+        .enable_all()
+        .build()
+        .into_diagnostic()?;
+
+    runtime.block_on(async_main())
+}
+
+async fn async_main() -> miette::Result<()> {
+    let log_level =
+        tracing::Level::from_str(CONFIG.logging.level.as_str()).expect("Invalid log level");
+    let log_builder = tracing_subscriber::fmt()
+        .with_timer(tracing_subscriber::fmt::time::SystemTime)
+        .pretty()
+        .with_max_level(log_level);
+    if CONFIG.logging.file_appender {
+        let file_appender = RollingFileAppenderBase::new(
+            CONFIG.logging.path.clone(),
+            RollingConditionBase::new().max_size(1024 * 1024),
+            7,
+        )
+        .expect("Failed to create appender");
+        let (non_blocking, _guard) = tracing_appender::non_blocking(file_appender);
+        log_builder.with_writer(non_blocking).init();
+    } else {
+        log_builder.init();
+    }
+
+    let opts = JobQueueCli::from_args();
+    opts.run().await
+}
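The module below encodes a unit-safety idea worth seeing in miniature: two newtypes over the same integer that convert explicitly but never mix silently. The names and multiplier in this sketch are illustrative only:

```rust
// Sketch of the pattern: distinct newtypes make mixing units a compile error.
#[derive(Clone, Copy, Debug, PartialEq)]
struct Cores(i32);
#[derive(Clone, Copy, Debug, PartialEq)]
struct CoresX100(i32);

const MULTIPLIER: i32 = 100; // assumed default; configurable in Cuebot

impl From<Cores> for CoresX100 {
    fn from(c: Cores) -> Self {
        Self(c.0 * MULTIPLIER)
    }
}

fn main() {
    let db_value = CoresX100(250); // as stored in the database
    let whole = Cores(db_value.0 / MULTIPLIER); // integer division: 2 whole cores
    assert_eq!(CoresX100::from(whole), CoresX100(200));
    // `whole + db_value` would not compile: the units don't mix by accident.
}
```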
diff --git a/rust/crates/scheduler/src/models/core_size.rs b/rust/crates/scheduler/src/models/core_size.rs
new file mode 100644
index 000000000..ebd07e55c
--- /dev/null
+++ b/rust/crates/scheduler/src/models/core_size.rs
@@ -0,0 +1,114 @@
+/// A module to handle two different units used to represent cores:
+/// `CoreSize` and `CoreSizeWithMultiplier`.
+///
+/// In OpenCue's database, core counts are stored with a multiplier (typically 100,
+/// configurable in the CueBot config file). For example, 1 core might be stored as 100.
+///
+/// To simplify booking calculations, this multiplier is often ignored to avoid partial
+/// bookings (fractions of a single core). However, mixing values with and without the
+/// multiplier can lead to bugs in calculations.
+///
+/// This module provides two distinct types that can be converted between each other
+/// but cannot be directly used together in operations, preventing accidental mixing
+/// of multiplied and non-multiplied values.
+///
+use core::fmt;
+use std::{
+    cmp,
+    fmt::Display,
+    ops::{Add, Sub},
+};
+
+use serde::{Deserialize, Serialize};
+
+use crate::config::CONFIG;
+
+/// Size of a processing unit (# cores without multiplier)
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct CoreSize(pub i32);
+
+/// Size of a processing unit with a multiplier (# cores with multiplier)
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct CoreSizeWithMultiplier(pub i32);
+
+impl CoreSize {
+    pub fn value(self) -> i32 {
+        self.0
+    }
+
+    pub fn with_multiplier(self) -> CoreSizeWithMultiplier {
+        self.into()
+    }
+
+    pub fn from_multiplied(size_with_multiplier: i32) -> CoreSize {
+        Self(size_with_multiplier / CONFIG.queue.core_multiplier as i32)
+    }
+}
+
+impl CoreSizeWithMultiplier {
+    pub fn value(self) -> i32 {
+        self.0
+    }
+}
+
+impl From<CoreSize> for CoreSizeWithMultiplier {
+    fn from(value: CoreSize) -> Self {
+        CoreSizeWithMultiplier(value.value() * CONFIG.queue.core_multiplier as i32)
+    }
+}
+
+impl From<CoreSizeWithMultiplier> for CoreSize {
+    fn from(value: CoreSizeWithMultiplier) -> Self {
+        CoreSize(value.value() / CONFIG.queue.core_multiplier as i32)
+    }
+}
+
+impl Add for CoreSize {
+    type Output = CoreSize;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Self(rhs.value() + self.value())
+    }
+}
+
+impl Add for CoreSizeWithMultiplier {
+    type Output = CoreSizeWithMultiplier;
+
+    fn add(self, rhs: Self) -> Self::Output {
+        Self(rhs.value() + self.value())
+    }
+}
+
+impl Sub for CoreSize {
+    type Output = CoreSize;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self(self.value() - rhs.value())
+    }
+}
+
+impl Sub for CoreSizeWithMultiplier {
+    type Output = CoreSizeWithMultiplier;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        Self(self.value() - rhs.value())
+    }
+}
+
+impl cmp::Ord for CoreSize {
+    fn cmp(&self, other: &Self) -> cmp::Ordering {
+        self.0.cmp(&other.0)
+    }
+}
+
+impl cmp::PartialOrd for CoreSize {
+    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Display for CoreSize {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
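`DispatchFrame` below notes that `min_cores` may be negative, standing for `machine_total_cores - min_cores`. One plausible reading of that convention, as a hypothetical helper that is not crate API:

```rust
// Plausible resolution of the negative min_cores convention:
// a negative value means "all of the host's cores except N".
fn resolve_min_cores(min_cores: i32, machine_total_cores: i32) -> i32 {
    if min_cores < 0 {
        machine_total_cores + min_cores // -2 on a 16-core host -> 14 cores
    } else {
        min_cores
    }
}

fn main() {
    assert_eq!(resolve_min_cores(-2, 16), 14);
    assert_eq!(resolve_min_cores(4, 16), 4);
}
```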
diff --git a/rust/crates/scheduler/src/models/frame.rs b/rust/crates/scheduler/src/models/frame.rs
new file mode 100644
index 000000000..ba19f53e4
--- /dev/null
+++ b/rust/crates/scheduler/src/models/frame.rs
@@ -0,0 +1,60 @@
+use std::fmt::Display;
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use crate::models::{core_size::CoreSize, fmt_uuid};
+
+#[derive(Serialize, Deserialize)]
+pub struct DispatchFrame {
+    // Entity fields
+    pub id: Uuid,
+    pub frame_name: String,
+
+    // LayerEntity fields
+    pub show_id: Uuid,
+    pub facility_id: Uuid,
+    pub job_id: Uuid,
+
+    // FrameEntity fields
+    pub layer_id: Uuid,
+
+    // DispatchFrame specific fields
+    pub command: String,
+    pub range: String,
+    pub chunk_size: i32,
+    pub show_name: String,
+    pub shot: String,
+    pub user: String,
+    pub uid: Option<u32>,
+    pub log_dir: String,
+    pub layer_name: String,
+    pub job_name: String,
+    // Min cores can be negative, representing `machine_total_cores - min_cores`
+    pub min_cores: CoreSize,
+    pub layer_cores_limit: Option<CoreSize>,
+    pub threadable: bool,
+    pub has_selfish_service: bool,
+    pub min_gpus: u32,
+    pub min_gpu_memory: u64,
+    pub min_memory: u64,
+    // On Cuebot these fields come from constants, maybe replicate these constants here
+    // pub int_soft_memory_limit: i64,
+    // pub int_hard_memory_limit: i64,
+    pub services: Option<String>,
+    pub os: Option<String>,
+    pub loki_url: Option<String>,
+}
+
+impl Display for DispatchFrame {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{}.{}.{}({})",
+            self.job_name,
+            self.layer_name,
+            self.frame_name,
+            fmt_uuid(&self.id)
+        )
+    }
+}
diff --git a/rust/crates/scheduler/src/models/host.rs b/rust/crates/scheduler/src/models/host.rs
new file mode 100644
index 000000000..e412c3884
--- /dev/null
+++ b/rust/crates/scheduler/src/models/host.rs
@@ -0,0 +1,28 @@
+use std::fmt::Display;
+
+use opencue_proto::host::ThreadMode;
+use uuid::Uuid;
+
+use crate::models::{core_size::CoreSize, fmt_uuid};
+
+#[derive(Clone)]
+pub struct Host {
+    pub(crate) id: Uuid,
+    pub(crate) name: String,
+    pub(crate) str_os: Option<String>,
+    pub(crate) total_cores: CoreSize,
+    pub(crate) total_memory: u64,
+    pub(crate) idle_cores: CoreSize,
+    pub(crate) idle_memory: u64,
+    pub(crate) idle_gpus: u32,
+    pub(crate) idle_gpu_memory: u64,
+    pub(crate) thread_mode: ThreadMode,
+    pub(crate) alloc_available_cores: CoreSize,
+    pub(crate) allocation_name: String,
+}
+
+impl Display for Host {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}({})", self.name, fmt_uuid(&self.id))
+    }
+}
diff --git a/rust/crates/scheduler/src/models/job.rs b/rust/crates/scheduler/src/models/job.rs
new file mode 100644
index 000000000..d24fa4996
--- /dev/null
+++ b/rust/crates/scheduler/src/models/job.rs
@@ -0,0 +1,45 @@
+use core::fmt;
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use crate::models::{Partitionable, fmt_uuid};
+
+/// Basic information used to collect a job from the database for dispatching
+#[derive(Serialize, Deserialize, Clone)]
+pub struct DispatchJob {
+    #[serde(
+        serialize_with = "serialize_uuid",
+        deserialize_with = "deserialize_uuid"
+    )]
+    pub id: Uuid,
+    pub int_priority: i32,
+    pub age_days: i32,
+}
+
+fn serialize_uuid<S>(uuid: &Uuid, serializer: S) -> Result<S::Ok, S::Error>
+where
+    S: serde::Serializer,
+{
+    serializer.serialize_str(&uuid.to_string())
+}
+
+fn deserialize_uuid<'de, D>(deserializer: D) -> Result<Uuid, D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    let s = String::deserialize(deserializer)?;
+    Uuid::parse_str(&s).map_err(serde::de::Error::custom)
+}
+
+impl fmt::Display for DispatchJob {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{}", fmt_uuid(&self.id))
+    }
+}
+
+impl Partitionable for DispatchJob {
+    fn partition_key(&self) -> String {
+        self.id.to_string()
+    }
+}
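`Partitionable` (defined in models/mod.rs below) is what ties these models to the Kafka producer: records with equal keys land on the same partition, so per-job ordering is preserved. A sketch with a local mirror of the trait and a hypothetical extra payload type:

```rust
use uuid::Uuid;

// Local mirror of the Partitionable trait, for a self-contained example.
trait Partitionable {
    fn partition_key(&self) -> String;
}

// Hypothetical payload: keying by layer id would keep every message for one
// layer on the same Kafka partition, preserving per-layer ordering.
struct LayerEvent {
    layer_id: Uuid,
}

impl Partitionable for LayerEvent {
    fn partition_key(&self) -> String {
        self.layer_id.to_string()
    }
}

fn main() {
    let event = LayerEvent { layer_id: Uuid::new_v4() };
    assert!(!event.partition_key().is_empty());
}
```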
diff --git a/rust/crates/scheduler/src/models/layer.rs b/rust/crates/scheduler/src/models/layer.rs
new file mode 100644
index 000000000..98e2642da
--- /dev/null
+++ b/rust/crates/scheduler/src/models/layer.rs
@@ -0,0 +1,35 @@
+use core::fmt;
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use crate::models::{core_size::CoreSize, fmt_uuid};
+
+#[derive(Serialize, Deserialize)]
+pub struct DispatchLayer {
+    pub id: Uuid,
+    pub job_id: Uuid,
+    pub facility_id: Uuid,
+    pub show_id: Uuid,
+    pub job_name: String,
+    pub layer_name: String,
+    pub str_os: Option<String>,
+    pub cores_min: CoreSize,
+    pub mem_min: i64,
+    pub threadable: bool,
+    pub gpus_min: i32,
+    pub gpu_mem_min: i64,
+    pub tags: String,
+}
+
+impl fmt::Display for DispatchLayer {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{}.{}({})",
+            self.job_name,
+            self.layer_name,
+            fmt_uuid(&self.id)
+        )
+    }
+}
diff --git a/rust/crates/scheduler/src/models/mod.rs b/rust/crates/scheduler/src/models/mod.rs
new file mode 100644
index 000000000..ece7f7949
--- /dev/null
+++ b/rust/crates/scheduler/src/models/mod.rs
@@ -0,0 +1,27 @@
+mod core_size;
+mod frame;
+mod host;
+mod job;
+mod layer;
+mod virtual_proc;
+
+use uuid::Uuid;
+
+pub use core_size::{CoreSize, CoreSizeWithMultiplier};
+pub use frame::DispatchFrame;
+pub use host::Host;
+pub use job::DispatchJob;
+pub use layer::DispatchLayer;
+pub use virtual_proc::VirtualProc;
+
+pub fn fmt_uuid(id: &Uuid) -> String {
+    id.to_string()
+        .split_once("-")
+        .unwrap_or((&id.to_string(), ""))
+        .0
+        .to_string()
+}
+
+pub trait Partitionable {
+    fn partition_key(&self) -> String;
+}
diff --git a/rust/crates/scheduler/src/models/virtual_proc.rs b/rust/crates/scheduler/src/models/virtual_proc.rs
new file mode 100644
index 000000000..aaf56e551
--- /dev/null
+++ b/rust/crates/scheduler/src/models/virtual_proc.rs
@@ -0,0 +1,31 @@
+use std::fmt::Display;
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use crate::models::{CoreSizeWithMultiplier, DispatchFrame, fmt_uuid};
+
+#[derive(Serialize, Deserialize)]
+pub struct VirtualProc {
+    pub proc_id: Uuid,
+    pub host_id: Uuid,
+    pub cores_reserved: CoreSizeWithMultiplier,
+    pub memory_reserved: u64, // in bytes
+    pub gpus_reserved: u32,
+    pub gpu_memory_reserved: u64, // in bytes
+    pub os: String,
+    pub is_local_dispatch: bool,
+    pub frame: DispatchFrame,
+}
+
+impl Display for VirtualProc {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "(proc_id={}) {}->host={}",
+            fmt_uuid(&self.proc_id),
+            self.frame,
+            fmt_uuid(&self.host_id),
+        )
+    }
+}
diff --git a/rust/crates/scheduler/src/pgpool.rs b/rust/crates/scheduler/src/pgpool.rs
new file mode 100644
index 000000000..b60fa6001
--- /dev/null
+++ b/rust/crates/scheduler/src/pgpool.rs
@@ -0,0 +1,23 @@
+use std::sync::Arc;
+
+use miette::{IntoDiagnostic, Result};
+use sqlx::{Pool, Postgres, postgres::PgPoolOptions};
+use tokio::sync::OnceCell;
+
+use crate::config::DatabaseConfig;
+
+static CONNECTION_POOL: OnceCell<Arc<Pool<Postgres>>> = OnceCell::const_new();
+
+pub async fn connection_pool(config: &DatabaseConfig) -> Result<Arc<Pool<Postgres>>> {
+    CONNECTION_POOL
+        .get_or_try_init(|| async {
+            let pool = PgPoolOptions::new()
+                .max_connections(config.pool_size)
+                .connect(&config.connection_url)
+                .await
+                .into_diagnostic()?;
+            Ok(Arc::new(pool))
+        })
+        .await
+        .map(Arc::clone)
+}
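A usage sketch for the pool singleton above: every caller receives a clone of the same `Arc`, and the pool is built once on first use. This assumes the `connection_pool` function and `DatabaseConfig` from pgpool.rs; the query, table, and column names are illustrative only:

```rust
use miette::{IntoDiagnostic, Result};

// Hypothetical caller; the SQL is illustrative, not the scheduler's real query.
async fn count_pending_jobs(config: &DatabaseConfig) -> Result<i64> {
    let pool = connection_pool(config).await?; // same Arc<Pool> on every call
    let row: (i64,) = sqlx::query_as("SELECT count(*) FROM job WHERE str_state = 'PENDING'")
        .fetch_one(pool.as_ref())
        .await
        .into_diagnostic()?;
    Ok(row.0)
}
```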