use crate::uniffi::{
    error::{OrcaError, Result, selector},
    model::{PodJob, PodResult},
    orchestrator::agent::{Agent, AgentClient},
    store::ModelID,
};
use chrono::Utc;
use futures_util::future::FutureExt as _;
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt as _, ResultExt as _};
use std::{
    collections::HashMap,
    path::PathBuf,
    sync::{Arc, LazyLock},
};
use tokio::{
    sync::mpsc::{self, error::SendError},
    task::JoinSet,
};
use tokio_util::task::TaskTracker;

#[expect(clippy::expect_used, reason = "Valid static regex")]
static RE_PODJOB_ACTION: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?x)
        ^
        group/(?<group>[a-z_]+)/
        (?<action>request|reservation|success|failure)/
        pod_job/(?<pod_job_hash>[0-9a-f]+)/
        host/(?<host>[a-z_]+)/
        timestamp/(?<timestamp>.*?)
        $
        ",
    )
    .expect("Invalid PodJob action regex.")
});
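
// A minimal sketch of the key expressions this regex is meant to parse. The
// sample values (`orca`, `worker_one`, the hash, the timestamp) are
// illustrative only; the layout mirrors what `AgentClient::publish` below
// produces for pod job topics.
#[cfg(test)]
mod re_podjob_action_tests {
    use super::RE_PODJOB_ACTION;

    #[test]
    fn captures_all_named_groups() {
        let key_expr = "group/orca/request/pod_job/0a1b2c3d/host/worker_one/\
                        timestamp/2024-01-01T00:00:00+00:00";
        let captures = RE_PODJOB_ACTION.captures(key_expr);
        assert!(captures.is_some(), "sample key expression should match");
        if let Some(captures) = captures {
            assert_eq!(&captures["group"], "orca");
            assert_eq!(&captures["action"], "request");
            assert_eq!(&captures["pod_job_hash"], "0a1b2c3d");
            assert_eq!(&captures["host"], "worker_one");
            assert_eq!(&captures["timestamp"], "2024-01-01T00:00:00+00:00");
        }
    }
}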

#[expect(
    dead_code,
    reason = "Need to be able to initialize to pass metadata as input."
)]
#[derive(Debug, Clone)]
pub struct EventMetadata {
    group: String,
    host: String,
    subgroup: String,
}

#[expect(
    dead_code,
    reason = "Need to be able to initialize to pass metadata as input."
)]
#[derive(Debug, Clone)]
pub enum EventPayload {
    Request(PodJob),
    Reservation(ModelID),
    Success(PodResult),
    Failure(PodResult),
}

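/// An event on the agent network: `metadata` is parsed out of a sample's key
/// expression via `RE_PODJOB_ACTION`, while `payload` carries the
/// deserialized body.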
#[expect(
    dead_code,
    reason = "Need to be able to initialize to pass metadata as input."
)]
#[derive(Debug, Clone)]
pub struct Event {
    metadata: EventMetadata,
    payload: EventPayload,
}

impl AgentClient {
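    /// Serialize `payload` as JSON and put it on the agent network under
    /// `group/{group}/{topic}/host/{host}/timestamp/{rfc3339}`.
    ///
    /// # Errors
    ///
    /// Will fail if serialization fails or if there is an issue sending the
    /// message.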
    pub(crate) async fn publish<T>(&self, topic: &str, payload: &T) -> Result<()>
    where
        T: Serialize + Sync + ?Sized,
    {
        Ok(self
            .session
            .put(
                format!(
                    "group/{}/{}/host/{}/timestamp/{}",
                    self.group,
                    topic,
                    self.host,
                    Utc::now().to_rfc3339()
                ),
                &serde_json::to_vec(payload)?,
            )
            .await
            .context(selector::AgentCommunicationFailure {})?)
    }

    /// Send a log message to the agent network.
    ///
    /// # Errors
    ///
    /// Will fail if there is an issue sending the message.
    pub(crate) async fn log(&self, message: &str) -> Result<()> {
        println!("{message}");
        self.publish("log", message).await
    }
}

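// A hypothetical call site, assuming an `AgentClient` named `client` that has
// already joined a session (the names and values are illustrative only):
//
//     client.publish("reservation/pod_job/0a1b2c3d", &model_id).await?;
//     client.log("Reserved pod job 0a1b2c3d.").await?;
//
// Both calls put a JSON payload on a key expression of the form
// `group/{group}/{topic}/host/{host}/timestamp/{rfc3339}`, which is the shape
// `RE_PODJOB_ACTION` parses back out for pod job topics.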
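/// Run a generic request/response service on top of the agent network.
///
/// Spawns two tasks: a subscriber loop that deserializes each matching
/// sample, extracts its `EventMetadata` from the key expression, and hands it
/// to `request_task`; and a response loop that forwards each completed result
/// to `response_task` over an mpsc channel. Returns when either task exits.
///
/// # Errors
///
/// Will fail if the subscriber cannot be declared, if a service task panics,
/// or if either task returns an error.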
#[expect(
    clippy::excessive_nesting,
    clippy::let_underscore_must_use,
    reason = "`result::Result<(), SendError<_>>` is the only uncaptured result since it would mean we can't transmit results over mpsc."
)]
pub async fn start_service<
    EventClassifierF, // function to classify the event payload, e.g. EventPayload::{Request | Reservation | ..}
    RequestF,         // function to run on requests
    RequestI,         // input of the request function
    RequestR,         // output of the request function
    ResponseF,        // function to run on completing a request, i.e. the response
    ResponseI,        // input of the response function
    ResponseR,        // output of the response function
>(
    agent: Arc<Agent>,
    request_key_expr: String,
    namespace_lookup: HashMap<String, PathBuf>,
    event_classifier: EventClassifierF,
    request_task: RequestF,
    response_task: ResponseF,
) -> Result<()>
where
    EventClassifierF: Fn(&RequestI) -> EventPayload + Send + 'static,
    RequestI: for<'de> Deserialize<'de>,
    RequestF: Fn(Arc<Agent>, HashMap<String, PathBuf>, EventMetadata, RequestI) -> RequestR
        + Send
        + 'static,
    RequestR: Future<Output = Result<ResponseI>> + Send + 'static,
    ResponseI: Send + 'static,
    ResponseF: Fn(Arc<AgentClient>, ResponseI) -> ResponseR + Send + 'static,
    ResponseR: Future<Output = Result<()>> + Send + 'static,
{
    agent
        .client
        .log(&format!("Started `{request_key_expr}` service."))
        .await?;
    let (response_tx, mut response_rx) = mpsc::channel(100);

    let mut services = JoinSet::new();
    services.spawn({
        let inner_agent = Arc::clone(&agent);
        async move {
            let tasks = TaskTracker::new();
            let subscriber = inner_agent
                .client
                .session
                .declare_subscriber(format!(
                    "group/{}/{}",
                    inner_agent.client.group, request_key_expr
                ))
                .await
                .context(selector::AgentCommunicationFailure {})?;
            while let Ok(sample) = subscriber.recv_async().await {
                if let (Ok(input), Some(metadata)) = (
                    serde_json::from_slice::<RequestI>(&sample.payload().to_bytes()),
                    RE_PODJOB_ACTION.captures(sample.key_expr().as_str()),
                ) {
                    let inner_response_tx = response_tx.clone();
                    let event_metadata = EventMetadata {
                        group: metadata["group"].to_string(),
                        host: metadata["host"].to_string(),
                        subgroup: metadata["pod_job_hash"].to_string(),
                    };
                    // Classify the event; the classification is currently unused.
                    let _event_payload = event_classifier(&input);
                    tasks.spawn(
                        request_task(
                            Arc::clone(&inner_agent),
                            namespace_lookup.clone(),
                            event_metadata,
                            input,
                        )
                        .then(move |response| async move {
                            // Ignore the send result: a failure only means the
                            // response loop has already shut down.
                            let _: Result<(), SendError<Result<ResponseI>>> =
                                inner_response_tx.send(response).await;
                            Ok::<_, OrcaError>(())
                        }),
                    );
                }
            }
            Ok(())
        }
    });
    services.spawn(async move {
        while let Some(content) = response_rx.recv().await {
            response_task(Arc::clone(&agent.client), content?).await?;
        }
        Ok(())
    });

    services
        .join_next()
        .await
        .context(selector::NoRemainingServices {})??
}
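
// A minimal wiring sketch, assuming hypothetical `run_pod_job` and
// `report_result` helpers and a key expression with zenoh wildcards; the
// names and the key expression are illustrative only, not part of this
// module:
//
//     start_service(
//         agent,
//         "request/pod_job/*/host/*/timestamp/*".to_owned(),
//         namespace_lookup,
//         |pod_job: &PodJob| EventPayload::Request(pod_job.clone()),
//         run_pod_job,   // runs the job, yielding a `PodResult`
//         report_result, // publishes the `PodResult` back to the network
//     )
//     .await?;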