From e08389ce9aef61ab6b63b55ba3cee4ecd1c1d3e1 Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 9 Jul 2025 12:25:14 +0000 Subject: [PATCH 01/29] Save progress --- .clippy.toml | 2 +- Cargo.toml | 2 + src/core/error.rs | 27 +- src/uniffi/error.rs | 8 + src/uniffi/mod.rs | 7 +- src/uniffi/pipeline.rs | 16 +- src/uniffi/pipeline_runner/docker.rs | 458 +++++++++++++++++++++++++++ src/uniffi/pipeline_runner/mod.rs | 55 ++++ 8 files changed, 565 insertions(+), 10 deletions(-) create mode 100644 src/uniffi/pipeline_runner/docker.rs create mode 100644 src/uniffi/pipeline_runner/mod.rs diff --git a/.clippy.toml b/.clippy.toml index 8987fce2..5821063e 100644 --- a/.clippy.toml +++ b/.clippy.toml @@ -1,3 +1,3 @@ -excessive-nesting-threshold = 4 +excessive-nesting-threshold = 5 too-many-arguments-threshold = 10 allowed-idents-below-min-chars = ["..", "k", "v", "f", "re", "id", "Ok", "'_"] diff --git a/Cargo.toml b/Cargo.toml index d9b2b6f0..7486a41c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ glob = "0.3.1" heck = "0.5.0" # hashmaps that preserve insertion order indexmap = { version = "2.9.0", features = ["serde"] } +itertools = "0.14.0" layout-rs = "0.1.3" # random name generator names = "0.14.0" @@ -66,6 +67,7 @@ sha2 = "0.10.8" snafu = { version = "0.8.5", features = ["futures"] } # a runtime for async applications tokio = { version = "1.41.0", features = ["full"] } +tokio-stream = "0.1.17" # utilities for async calls tokio-util = "0.7.13" # automated CFFI + bindings in other languages diff --git a/src/core/error.rs b/src/core/error.rs index 710e22f8..a43fa08d 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -1,4 +1,7 @@ -use crate::uniffi::error::{Kind, OrcaError}; +use crate::uniffi::{ + error::{Kind, OrcaError}, + pipeline_runner::docker::Message, +}; use bollard::errors::Error as BollardError; use glob; use serde_json; @@ -9,6 +12,7 @@ use std::{ io, path::{self}, }; +use tokio::{sync::broadcast::error::SendError, task::JoinError}; impl From for OrcaError { fn from(error: BollardError) -> Self { @@ -70,6 +74,26 @@ impl From for OrcaError { } } } +impl From for OrcaError { + fn from(error: JoinError) -> Self { + Self { + kind: Kind::IoError { + source: error.into(), + backtrace: Some(Backtrace::capture()), + }, + } + } +} +impl From> for OrcaError { + fn from(error: SendError) -> Self { + Self { + kind: Kind::SendError { + source: error, + backtrace: Some(Backtrace::capture()), + }, + } + } +} impl From for OrcaError { fn from(error: Kind) -> Self { Self { kind: error } @@ -105,6 +129,7 @@ impl fmt::Debug for OrcaError { | Kind::GlobPatternError { backtrace, .. } | Kind::IoError { backtrace, .. } | Kind::PathPrefixError { backtrace, .. } + | Kind::SendError { backtrace, .. } | Kind::SerdeJsonError { backtrace, .. } | Kind::SerdeYamlError { backtrace, .. } => { write!(f, "{}{}", self.kind, format_stack(backtrace.as_ref())) diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index 0ed5a4d6..661ec775 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -14,8 +14,11 @@ use std::{ path::{self, PathBuf}, result, }; +use tokio::sync::broadcast::error::SendError; use uniffi; +use crate::uniffi::pipeline_runner::docker::Message; + /// Shorthand for a Result that returns an `OrcaError`. pub type Result = result::Result; /// Possible errors you may encounter. @@ -107,6 +110,11 @@ pub(crate) enum Kind { backtrace: Option, }, #[snafu(transparent)] + SendError { + source: SendError, + backtrace: Option, + }, + #[snafu(transparent)] SerdeJsonError { source: serde_json::Error, backtrace: Option, diff --git a/src/uniffi/mod.rs b/src/uniffi/mod.rs index dadf9d5e..f0a20412 100644 --- a/src/uniffi/mod.rs +++ b/src/uniffi/mod.rs @@ -4,8 +4,9 @@ pub mod error; pub mod model; /// Interface into container orchestration engine. pub mod orchestrator; -/// Data persistence provided by a store backend. -pub mod store; - /// Pipeline management and execution. pub mod pipeline; +/// Pipeline runner interface. +pub mod pipeline_runner; +/// Data persistence provided by a store backend. +pub mod store; diff --git a/src/uniffi/pipeline.rs b/src/uniffi/pipeline.rs index fdb1c316..97923ff0 100644 --- a/src/uniffi/pipeline.rs +++ b/src/uniffi/pipeline.rs @@ -6,7 +6,7 @@ use crate::{ }, uniffi::{ error::{Kind, OrcaError, Result}, - model::{Annotation, PathSet, Pod}, + model::{Annotation, PathSet, Pod, URI}, }, }; use derive_more::Display; @@ -126,8 +126,8 @@ impl From for Kernel { #[uniffi::export(Display)] /// Struct to represent a node in the pipeline graph pub struct Node { - /// This is name for now till hashing feature get merged - pub name: String, + /// This is id for now till hashing feature get merged + pub id: String, /// Hash of the kernel to use in `kernel_lut` pub kernel_hash: String, } @@ -136,7 +136,7 @@ impl Node { /// Creates a new `Node` instance and computes its hash based on the kernel hash and parent hashes. pub fn new(kernel_hash: &str, parent_hashes: Vec<&str>) -> Self { Self { - name: Self::compute_hash(kernel_hash, parent_hashes), + id: Self::compute_hash(kernel_hash, parent_hashes), kernel_hash: kernel_hash.to_owned(), } } @@ -222,7 +222,7 @@ impl Pipeline { // Create the node, insert into graph and store the idx for node_name in node_names { let node = Node { - name: (*node_name).clone(), + id: (*node_name).clone(), kernel_hash: kernel.get_hash(), }; let node_idx = graph.add_node(node); @@ -349,6 +349,7 @@ pub struct PipelineJob { pub pipeline: Pipeline, /// Mapping of outside input to keys to be match with the pipeline `input_map` pub input_map: HashMap, + pub output_dir: URI, /// Annotation for the pipeline job pub annotation: Option, } @@ -360,6 +361,7 @@ impl PipelineJob { pub fn new( pipeline: Pipeline, input_packet: HashMap, + output_dir: URI, annotation: Option, ) -> Result { // Check if input_map has all the requires keys @@ -390,6 +392,10 @@ impl PipelineJob { pipeline, input_map: input_packet, annotation, + output_dir, }) } } +pub struct PipelineResult { + pub pipeline_job: PipelineJob, +} diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs new file mode 100644 index 00000000..fb760c49 --- /dev/null +++ b/src/uniffi/pipeline_runner/docker.rs @@ -0,0 +1,458 @@ +use super::PipelineRun; +use crate::{ + core::{ + crypto::{hash_buffer, hash_stream}, + model::serialize_hashmap, + util::get, + }, + uniffi::{ + error::{OrcaError, Result, selector}, + model::{PathSet, Pod, PodJob, URI}, + pipeline::{Kernel, Node, PipelineJob, PipelineResult}, + }, +}; +use futures_util::stream::FuturesUnordered; +use itertools::Itertools; +use serde_yaml::Serializer; +use snafu::OptionExt as _; +use std::{ + clone, + collections::HashMap, + mem, + path::{Path, PathBuf}, + sync::Arc, +}; +use tokio::{ + sync::broadcast::{self, Receiver, Sender}, + task::JoinSet, +}; +use tokio_stream::StreamExt as _; + +#[derive(Clone, Debug)] +pub(crate) enum Message { + NodeOutput(String, HashMap), // String is the parent_node_name, while HashMap is output of the parent node + Stop, // Message to halt all operations +} + +struct PipelineRunInfo { + node_task_join_set: JoinSet>, // Join set to track the tasks for this pipeline run + job_manager_ch_tx: Sender, + node_tx: HashMap>, + outputs: HashMap>, // String is the node key, while hash + namespace_lookup: HashMap, // Namespace to operate as storage +} + +/// Docker based pipeline runner meant to execute on a single machine +#[derive(Default)] +pub struct DockerPipelineRunner { + pipeline_runs: HashMap, // For each pipeline run, we have a join set to track the tasks and wait on them +} + +impl DockerPipelineRunner { + /// Create a new Docker pipeline runner + pub fn new() -> Self { + Self::default() + } + + /// Start the `pipeline_job` returning `pipeline_run`un + /// + /// # Errors + /// Will error out if the pipeline job fails to start + pub fn start( + &mut self, + pipeline_job: PipelineJob, + namespace_lookup: HashMap, + ) -> Result { + // Create a new pipeline run + let pipeline_run = PipelineRun { pipeline_job }; + let pipeline_run_arc = Arc::new(pipeline_run.clone()); + + // Insert into the list of pipeline runs + self.pipeline_runs.insert( + (*pipeline_run_arc).clone(), + PipelineRunInfo { + job_manager_ch_tx: broadcast::channel::(1).0, + node_tx: HashMap::new(), + node_task_join_set: JoinSet::new(), + outputs: HashMap::new(), + namespace_lookup, + }, + ); + + // Create the source channel for the pipeline + // This channel will be used to send inputs to the pipeline + let (source_tx, _) = broadcast::channel::(1); + + // Get reference to the pipeline + let pipeline = &pipeline_run_arc.pipeline_job.pipeline; + + // Get all the leaf nodes and call the create_task_for_node function for each leaf node + // This will recursively create all the tasks and channels for the pipeline + pipeline.get_leaf_nodes().try_for_each(|node| { + self.create_task_for_node(node, &pipeline_run_arc, &source_tx)?; + Ok::<(), OrcaError>(()) + })?; + + for node_key in pipeline.get_leaf_nodes() { + self.create_task_for_node(node_key, &pipeline_run_arc, &source_tx)?; + } + + // Create a task to handle outputs of output nodes in pipeline + // for node_key in pipeline.output_nodes {} + + Ok(pipeline_run) + } + + /// Given a pipeline run, wait for all its tasks to complete and return the `PipelineResult` + /// + /// # Errors + /// Will error out if any of the pipeline tasks failed to join + pub async fn get_result(&mut self, pipeline_run: &PipelineRun) -> Result { + // Call join on the join set for the pipeline run + let pipeline_run_info = + self.pipeline_runs + .get_mut(pipeline_run) + .context(selector::KeyMissing { + key: pipeline_run.to_string(), + })?; + + // Wait for all the tasks to complete + while let Some(result) = pipeline_run_info.node_task_join_set.join_next().await { + match result { + Ok(Ok(())) => {} // Task completed successfully + Ok(Err(err)) => { + eprintln!("Task failed: {err}"); + return Err(err); + } + Err(err) => { + eprintln!("Join set error: {err}"); + return Err(err.into()); + } + } + } + + Ok(PipelineResult { + pipeline_job: pipeline_run.pipeline_job.clone(), + }) + } + + fn create_task_for_node( + &mut self, + node: &Node, + pipeline_run: &Arc, + source_tx: &Sender, + ) -> Result> { + // Get the input channels for this node which should be it's parents + let mut input_ch_rxs = pipeline_run + .pipeline_job + .pipeline + .get_parents_for_node(node) + .map(|parent_node| { + // Check if it exists in the pipeline_runs hashmap + match get(&self.pipeline_runs, pipeline_run)? + .node_tx + .get(&parent_node.id) + { + Some(rx) => Ok(rx.subscribe()), + None => { + // Missing parent node, thus call create_task for the parent node parent node first + Ok(self + .create_task_for_node(parent_node, pipeline_run, source_tx)? + .subscribe()) + } + } + }) + .collect::>>()?; + + // Check if input_ch_rxs is empty, meaning this node has no parents and is a root node + // In this case, we will use the source channel as the input channel + // TODO: This will be replaced by input_node logic once that is merged + if input_ch_rxs.is_empty() { + // No parents, thus this is root node + // The parent rx will be the source channel rx + input_ch_rxs.push(source_tx.subscribe()); + } + + // Get the job manager ch and subscribe to it (mainly for receiving shutdown signal) + let job_manager_ch_rx = get(&self.pipeline_runs, pipeline_run)? + .job_manager_ch_tx + .subscribe(); + + // Create the output_channel for this node + let (tx, _) = broadcast::channel::(128); + + // Spawn the node_manager for this node + self.pipeline_runs + .get_mut(pipeline_run) + .context(selector::KeyMissing { + key: pipeline_run.to_string(), + })? + .node_task_join_set + .spawn(Self::start_node_manager( + node.clone(), + Arc::clone(pipeline_run), + input_ch_rxs, + job_manager_ch_rx, + tx.clone(), + )); + + // Insert it into the the tx into the pipeline_runs hashmap + self.pipeline_runs + .get_mut(pipeline_run) + .context(selector::KeyMissing { + key: pipeline_run.to_string(), + })? + .node_tx + .insert(node.id.clone(), tx.clone()); + + // Return tx + Ok(tx) + } + + /// For tx: Sender, we only want to send successfully completed results to the next node + async fn start_node_manager( + node: Node, + pipeline_run: Arc, + parent_channel_rxs: Vec>, + mut job_manager_channel: Receiver, + tx: Sender, + namespace_lookup: &HashMap, + ) -> Result<()> { + // Create a futures unordered set to dynamically listen to N number of receivers + let mut futures = FuturesUnordered::new(); + + // Add all the parent channel receivers to the futures unordered set + for mut rx in parent_channel_rxs { + futures.push(tokio::spawn(async move { rx.recv().await })); + } + + // Add the job manager channel to the futures unordered set + futures.push(tokio::spawn( + async move { job_manager_channel.recv().await }, + )); + + // Get the kernel for this node + let kernel = get( + &pipeline_run.pipeline_job.pipeline.kernel_lut, + &node.kernel_hash, + )?; + + // Set up a join_set to track the tasks () + let mut task_join_set = JoinSet::new(); + + // Listen to the MPSC channel and handle messages + while let Some(result) = futures.next().await { + let rx_result = match result { + Ok(rx_result) => rx_result, + Err(err) => { + // Record into pipeilne_error log + if err.is_panic() { + eprintln!("Task panicked: {err}"); + } else { + eprintln!("Error receiving message: {err}"); + } + continue; + } + }; + + let Ok(msg) = rx_result else { + eprintln!("Failed to receive message from parent channel"); + continue; + }; + + match msg { + Message::NodeOutput(_, input_packet) => { + // Inputs from parents are ready, thus we need to process them if they are already computed and cached + // NOTE: Cache is TODO + match kernel { + Kernel::Pod(pod) => { + Self::process_packet_pod( + &node, + pod.clone(), + tx.clone(), + tx.clone(), + input_packet, + Arc::clone(&pipeline_run), + namespace_lookup, + )?; + } + Kernel::Mapper(mapper) => { + // For mapper, we just apply it directly + let output_map = mapper + .mapping + .iter() + .map(|(input_key, output_key)| { + let input = get(&input_packet, input_key)?.clone(); + Ok((output_key.to_owned(), input)) + }) + .collect::>>()?; + + // Send the output via the channel + tx.send(Message::NodeOutput(node_key.clone(), output_map))?; + } + Kernel::Joiner(joiner) => todo!(), + } + } + Message::Stop => { + // Stop all pod_job tasks abruptly + task_join_set.shutdown().await; + break; + } + } + } + + Ok(()) + } + + fn process_packet_pod( + node: &Node, + pod: Arc, + success_ch_tx: Sender, + failure_ch_tx: Sender, + input_packet: HashMap, + pipeline_run: Arc, + namespace_lookup: &HashMap, + ) -> Result<()> { + // Output directory is pod_runs/pod_run_id/node_id/hash_of_input_packet + + // Compute the hash of the input_packet + let mut buf = Vec::new(); + let mut serializer = Serializer::new(&mut buf); + serialize_hashmap(&input_packet, &mut serializer)?; + let input_packet_hash = hash_buffer(buf); + let output_dir = URI { + namespace: pipeline_run.pipeline_job.output_dir.namespace.clone(), + path: PathBuf::from(format!("pod_runs/{}/{}", pod.hash, input_packet_hash)), + }; + + let cpu_limit = pod.recommended_cpus; + let memory_limit = pod.recommended_memory; + + // Create the pod job + let pod_job = PodJob::new( + None, + pod, + input_packet.clone(), + output_dir, + cpu_limit, + memory_limit, + None, + namespace_lookup, + )?; + + // Simulate pod execution by just printing out pod_job_hash and pod hash + // This will be replaced by sending the pod_job to the orchestrator via the agent + println!( + "Executing pod job: {} with pod hash: {}", + pod_job.hash, pod_job.pod.hash + ); + + // For now we will just send the input_packet to the success channel + success_ch_tx.send(Message::NodeOutput(node.id.clone(), input_packet.clone()))?; + + Ok(()) + } +} + +trait ProcessPacket { + fn process_packet( + &mut self, + sender_node_id: String, + packet: HashMap, + success_ch_tx: Sender, + failure_ch_tx: Sender, + ) -> Result<()>; +} + +struct PodNodeProcessor {} + +struct MapperProcessor {} + +struct JoinNodeProcessor { + /// Cache for all packets received by the node + input_packet_cache: HashMap>>, +} + +impl JoinNodeProcessor { + fn new(self, parents_node_id: Vec) -> Self { + let input_packet_cache = parents_node_id + .into_iter() + .map(|id| (id, Vec::new())) + .collect(); + Self { input_packet_cache } + } + + fn compute_new_packet_combination( + &self, + sender_node_id: String, + new_packet: &HashMap, + ) -> Result>> { + // Combine the new packet with the existing packets in the cache + // Get all the cached packets from other parents + let other_parent_ids = self + .input_packet_cache + .keys() + .filter(|key| *key != &sender_node_id); + let mut factors = other_parent_ids + .map(|id| get(&self.input_packet_cache, id)) + .collect::>>()?; + + // Add the new incoming packet as a factor + let incoming_packet = vec![new_packet.clone()]; + factors.push(&incoming_packet); + + let result = factors + .into_iter() + .multi_cartesian_product() + .map(|packets_to_combined| { + packets_to_combined + .into_iter() + .fold(HashMap::new(), |mut acc, packet| { + acc.extend(packet.clone()); + acc + }) + }) + .collect::>(); + + Ok(result) + } +} + +impl ProcessPacket for JoinNodeProcessor { + fn process_packet( + &mut self, + sender_node_id: String, + packet: HashMap, + success_ch_tx: Sender, + failure_ch_tx: Sender, + ) -> Result<()> { + match { + get(&self.input_packet_cache, &sender_node_id)?.push(packet); + + // Compute the new packet combination based on the sender node id and the packet + let new_packets_to_send = + self.compute_new_packet_combination(sender_node_id, &packet)?; + + Ok::>, OrcaError>(new_packets_to_send) + } { + Ok(output_packets) => { + // Send the output packets to the success channel + for output_packet in output_packets { + success_ch_tx + .send(Message::NodeOutput(sender_node_id.clone(), output_packet))?; + } + } + Err(err) => { + // Send the error to the failure channel + failure_ch_tx.send(Message::NodeOutput( + sender_node_id.clone(), + HashMap::new(), // Empty packet on failure + ))?; + return Err(err); + } + } + // Add the new packet into the cache + + Ok(()) + } +} diff --git a/src/uniffi/pipeline_runner/mod.rs b/src/uniffi/pipeline_runner/mod.rs new file mode 100644 index 00000000..67fa0663 --- /dev/null +++ b/src/uniffi/pipeline_runner/mod.rs @@ -0,0 +1,55 @@ +use crate::uniffi::error::Result; + +use super::pipeline::PipelineJob; +use std::fmt; +use std::hash::{Hash, Hasher}; + +/// # Errors: +/// Error out if fail to start the pipeline job +pub trait PipelineRunner { + /// Starts the given pipeline job. + /// + /// # Errors + /// Returns an error if the pipeline job fails to start. + fn start(&self, pipeline_job: PipelineJob) -> Result<()>; +} + +#[derive(Debug, Clone)] +/// Struct to store the active pipeline run. +pub struct PipelineRun { + pipeline_job: PipelineJob, +} + +impl fmt::Display for PipelineRun { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "PipelineRun {{ pipeline_job: {} }}", + self.pipeline_job.hash + ) + } +} + +impl PipelineRun { + /// New function to initialize the pipeline run + pub const fn new(pipeline_job: PipelineJob) -> Self { + Self { pipeline_job } + } +} + +impl PartialEq for PipelineRun { + fn eq(&self, other: &Self) -> bool { + self.pipeline_job.hash == other.pipeline_job.hash + } +} + +impl Eq for PipelineRun {} + +impl Hash for PipelineRun { + fn hash(&self, state: &mut H) { + self.pipeline_job.hash.hash(state); + } +} + +/// Docker pipeline runner +pub mod docker; From 6b7913d293ccce137708a7b3dc919d7b3fb7b2be Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 9 Jul 2025 14:16:28 +0000 Subject: [PATCH 02/29] Save progress --- src/core/orchestrator/docker.rs | 2 +- src/uniffi/pipeline.rs | 16 +- src/uniffi/pipeline_runner/docker.rs | 240 ++++++++++++++++++--------- 3 files changed, 178 insertions(+), 80 deletions(-) diff --git a/src/core/orchestrator/docker.rs b/src/core/orchestrator/docker.rs index 999f01a5..b6d22f9d 100644 --- a/src/core/orchestrator/docker.rs +++ b/src/core/orchestrator/docker.rs @@ -38,7 +38,7 @@ impl LocalDockerOrchestrator { fn prepare_mount_binds( namespace_lookup: &HashMap, pod_job: &PodJob, - ) -> Result<(Vec, [String; 1])> { + ) -> Result<(Vec)> { // all host mounted paths need to be absolute let host_output_directory = path::absolute( namespace_lookup[&pod_job.output_dir.namespace].join(&pod_job.output_dir.path), diff --git a/src/uniffi/pipeline.rs b/src/uniffi/pipeline.rs index 97923ff0..d90fac87 100644 --- a/src/uniffi/pipeline.rs +++ b/src/uniffi/pipeline.rs @@ -339,13 +339,16 @@ impl Pipeline { } } -#[derive(uniffi::Object, Display, Debug, Clone)] +#[derive(uniffi::Object, Display, Debug, Clone, Serialize)] #[display("{self:#?}")] #[uniffi::export(Display)] /// `PipelineJob` struct /// This struct is used to store the pipeline and the input map pub struct PipelineJob { - /// Pipeline struct + /// Used to unique identify the pipeline job + pub hash: String, + /// Pipeline struct (Note: Due to the removal of the hash system to be deferred, this has no guarantee of being unique) + #[serde(skip)] pub pipeline: Pipeline, /// Mapping of outside input to keys to be match with the pipeline `input_map` pub input_map: HashMap, @@ -388,11 +391,18 @@ impl PipelineJob { }); } - Ok(Self { + // Create the job without_hash + let no_hash = Self { + hash: String::new(), pipeline, input_map: input_packet, annotation, output_dir, + }; + + Ok(Self { + hash: hash_buffer(to_yaml(&no_hash)?), + ..no_hash }) } } diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs index fb760c49..7ebad3e7 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/docker.rs @@ -1,27 +1,17 @@ use super::PipelineRun; use crate::{ - core::{ - crypto::{hash_buffer, hash_stream}, - model::serialize_hashmap, - util::get, - }, + core::{crypto::hash_buffer, model::serialize_hashmap, util::get}, uniffi::{ error::{OrcaError, Result, selector}, model::{PathSet, Pod, PodJob, URI}, - pipeline::{Kernel, Node, PipelineJob, PipelineResult}, + pipeline::{Kernel, Mapper, Node, PipelineJob, PipelineResult}, }, }; use futures_util::stream::FuturesUnordered; use itertools::Itertools; use serde_yaml::Serializer; use snafu::OptionExt as _; -use std::{ - clone, - collections::HashMap, - mem, - path::{Path, PathBuf}, - sync::Arc, -}; +use std::{collections::HashMap, path::PathBuf, sync::Arc}; use tokio::{ sync::broadcast::{self, Receiver, Sender}, task::JoinSet, @@ -30,8 +20,9 @@ use tokio_stream::StreamExt as _; #[derive(Clone, Debug)] pub(crate) enum Message { - NodeOutput(String, HashMap), // String is the parent_node_name, while HashMap is output of the parent node - Stop, // Message to halt all operations + NodeOutput(String, HashMap), // String is the parent_node_id, while HashMap is output of the parent node + ProcessingFailed(String, Arc), // String is the node_id, while OrcaError is the error that occurred + Stop, // Message to halt all operations } struct PipelineRunInfo { @@ -39,7 +30,7 @@ struct PipelineRunInfo { job_manager_ch_tx: Sender, node_tx: HashMap>, outputs: HashMap>, // String is the node key, while hash - namespace_lookup: HashMap, // Namespace to operate as storage + namespace_lookup: HashMap, } /// Docker based pipeline runner meant to execute on a single machine @@ -61,7 +52,7 @@ impl DockerPipelineRunner { pub fn start( &mut self, pipeline_job: PipelineJob, - namespace_lookup: HashMap, + namespace_lookup: HashMap, ) -> Result { // Create a new pipeline run let pipeline_run = PipelineRun { pipeline_job }; @@ -75,7 +66,7 @@ impl DockerPipelineRunner { node_tx: HashMap::new(), node_task_join_set: JoinSet::new(), outputs: HashMap::new(), - namespace_lookup, + namespace_lookup: namespace_lookup.clone(), }, ); @@ -89,14 +80,10 @@ impl DockerPipelineRunner { // Get all the leaf nodes and call the create_task_for_node function for each leaf node // This will recursively create all the tasks and channels for the pipeline pipeline.get_leaf_nodes().try_for_each(|node| { - self.create_task_for_node(node, &pipeline_run_arc, &source_tx)?; + self.create_task_for_node(node, &pipeline_run_arc, &source_tx, &namespace_lookup)?; Ok::<(), OrcaError>(()) })?; - for node_key in pipeline.get_leaf_nodes() { - self.create_task_for_node(node_key, &pipeline_run_arc, &source_tx)?; - } - // Create a task to handle outputs of output nodes in pipeline // for node_key in pipeline.output_nodes {} @@ -141,6 +128,7 @@ impl DockerPipelineRunner { node: &Node, pipeline_run: &Arc, source_tx: &Sender, + namespace_lookup: &HashMap, ) -> Result> { // Get the input channels for this node which should be it's parents let mut input_ch_rxs = pipeline_run @@ -157,7 +145,12 @@ impl DockerPipelineRunner { None => { // Missing parent node, thus call create_task for the parent node parent node first Ok(self - .create_task_for_node(parent_node, pipeline_run, source_tx)? + .create_task_for_node( + parent_node, + pipeline_run, + source_tx, + namespace_lookup, + )? .subscribe()) } } @@ -194,6 +187,7 @@ impl DockerPipelineRunner { input_ch_rxs, job_manager_ch_rx, tx.clone(), + namespace_lookup.clone(), )); // Insert it into the the tx into the pipeline_runs hashmap @@ -216,7 +210,7 @@ impl DockerPipelineRunner { parent_channel_rxs: Vec>, mut job_manager_channel: Receiver, tx: Sender, - namespace_lookup: &HashMap, + namespace_lookup: HashMap, ) -> Result<()> { // Create a futures unordered set to dynamically listen to N number of receivers let mut futures = FuturesUnordered::new(); @@ -231,14 +225,37 @@ impl DockerPipelineRunner { async move { job_manager_channel.recv().await }, )); - // Get the kernel for this node - let kernel = get( + // Get the kernel for this node and build the correct processor + let mut processor: Box = match get( &pipeline_run.pipeline_job.pipeline.kernel_lut, &node.kernel_hash, - )?; - - // Set up a join_set to track the tasks () - let mut task_join_set = JoinSet::new(); + )? { + Kernel::Pod(pod) => { + // Create a processor for the pod node + Box::new(PodNodeProcessor::new( + pod.clone(), + pipeline_run.pipeline_job.output_dir.namespace.clone(), + namespace_lookup.clone(), + )) + } + Kernel::Mapper(mapper) => { + // Create a processor for the mapper node + Box::new(MapperProcessor { + mapper: mapper.clone(), + }) + } + Kernel::Joiner => { + // Get the parents of the join node + let parent_nodes_id = pipeline_run + .pipeline_job + .pipeline + .get_parents_for_node(&node) + .map(|parent_node| parent_node.id.clone()) + .collect::>(); + // Create a processor for the join node + Box::new(JoinerNodeProcessor::new(parent_nodes_id)) + } + }; // Listen to the MPSC channel and handle messages while let Some(result) = futures.next().await { @@ -261,43 +278,13 @@ impl DockerPipelineRunner { }; match msg { - Message::NodeOutput(_, input_packet) => { + Message::NodeOutput(sender_node_id, input_packet) => { // Inputs from parents are ready, thus we need to process them if they are already computed and cached - // NOTE: Cache is TODO - match kernel { - Kernel::Pod(pod) => { - Self::process_packet_pod( - &node, - pod.clone(), - tx.clone(), - tx.clone(), - input_packet, - Arc::clone(&pipeline_run), - namespace_lookup, - )?; - } - Kernel::Mapper(mapper) => { - // For mapper, we just apply it directly - let output_map = mapper - .mapping - .iter() - .map(|(input_key, output_key)| { - let input = get(&input_packet, input_key)?.clone(); - Ok((output_key.to_owned(), input)) - }) - .collect::>>()?; - - // Send the output via the channel - tx.send(Message::NodeOutput(node_key.clone(), output_map))?; - } - Kernel::Joiner(joiner) => todo!(), - } } Message::Stop => { - // Stop all pod_job tasks abruptly - task_join_set.shutdown().await; break; } + Message::ProcessingFailed(_, orca_error) => todo!(), } } @@ -354,27 +341,121 @@ impl DockerPipelineRunner { } } -trait ProcessPacket { +trait NodeProcessor: Send { fn process_packet( &mut self, sender_node_id: String, + current_node_id: String, packet: HashMap, success_ch_tx: Sender, failure_ch_tx: Sender, ) -> Result<()>; } -struct PodNodeProcessor {} +struct PodNodeProcessor { + pod: Arc, + namespace: String, + namespace_lookup: HashMap, // Copy of the look up table + processing_tasks: JoinSet<()>, +} + +impl PodNodeProcessor { + fn new(pod: Arc, namespace: String, namespace_lookup: HashMap) -> Self { + Self { + pod, + namespace, + namespace_lookup, + processing_tasks: JoinSet::new(), + } + } +} + +impl NodeProcessor for PodNodeProcessor { + fn process_packet( + &mut self, + sender_node_id: String, + current_node_id: String, + packet: HashMap, + success_ch_tx: Sender, + failure_ch_tx: Sender, + ) -> Result<()> { + // Process the packet using the pod -struct MapperProcessor {} + // Create the pod_job + let mut buf = Vec::new(); + let mut serializer = Serializer::new(&mut buf); + serialize_hashmap(&packet, &mut serializer)?; + let input_packet_hash = hash_buffer(buf); + let output_dir = URI { + namespace: self.namespace.clone(), + path: PathBuf::from(format!("pod_runs/{}/{}", self.pod.hash, input_packet_hash)), + }; + + let cpu_limit = self.pod.recommended_cpus; + let memory_limit = self.pod.recommended_memory; -struct JoinNodeProcessor { + // Create the pod job + let pod_job = PodJob::new( + None, + self.pod.clone(), + packet.clone(), + output_dir, + cpu_limit, + memory_limit, + None, + &self.namespace_lookup, + )?; + + // Simulate pod execution by just printing out pod_job_hash and pod hash + // This will be replaced by sending the pod_job to the orchestrator via the agent + println!( + "Executing pod job: {} with pod hash: {}", + pod_job.hash, pod_job.pod.hash + ); + + // For now we will just send the input_packet to the success channel + success_ch_tx.send(Message::NodeOutput(current_node_id, packet.clone()))?; + + Ok(()) + } +} +struct MapperProcessor { + mapper: Arc, +} + +impl NodeProcessor for MapperProcessor { + fn process_packet( + &mut self, + sender_node_id: String, + current_node_id: String, + packet: HashMap, + success_ch_tx: Sender, + _failure_ch_tx: Sender, + ) -> Result<()> { + // Apply the mapping to the input packet + let output_map = self + .mapper + .mapping + .iter() + .map(|(input_key, output_key)| { + let input = get(&packet, input_key)?.clone(); + Ok((output_key.to_owned(), input)) + }) + .collect::>>()?; + + // Send the output via the channel + success_ch_tx.send(Message::NodeOutput(sender_node_id, output_map))?; + Ok(()) + } +} + +struct JoinerNodeProcessor { /// Cache for all packets received by the node input_packet_cache: HashMap>>, } -impl JoinNodeProcessor { - fn new(self, parents_node_id: Vec) -> Self { +impl JoinerNodeProcessor { + fn new(parents_node_id: Vec) -> Self { let input_packet_cache = parents_node_id .into_iter() .map(|id| (id, Vec::new())) @@ -384,7 +465,7 @@ impl JoinNodeProcessor { fn compute_new_packet_combination( &self, - sender_node_id: String, + sender_node_id: &str, new_packet: &HashMap, ) -> Result>> { // Combine the new packet with the existing packets in the cache @@ -392,7 +473,7 @@ impl JoinNodeProcessor { let other_parent_ids = self .input_packet_cache .keys() - .filter(|key| *key != &sender_node_id); + .filter(|key| *key != sender_node_id); let mut factors = other_parent_ids .map(|id| get(&self.input_packet_cache, id)) .collect::>>()?; @@ -418,20 +499,27 @@ impl JoinNodeProcessor { } } -impl ProcessPacket for JoinNodeProcessor { +impl NodeProcessor for JoinerNodeProcessor { fn process_packet( &mut self, sender_node_id: String, + current_node_id: String, packet: HashMap, success_ch_tx: Sender, failure_ch_tx: Sender, ) -> Result<()> { match { - get(&self.input_packet_cache, &sender_node_id)?.push(packet); - // Compute the new packet combination based on the sender node id and the packet let new_packets_to_send = - self.compute_new_packet_combination(sender_node_id, &packet)?; + self.compute_new_packet_combination(&sender_node_id, &packet)?; + + // Record the packet into the cache + self.input_packet_cache + .get_mut(&sender_node_id) + .context(selector::KeyMissing { + key: sender_node_id.clone(), + })? + .push(packet); Ok::>, OrcaError>(new_packets_to_send) } { @@ -439,7 +527,7 @@ impl ProcessPacket for JoinNodeProcessor { // Send the output packets to the success channel for output_packet in output_packets { success_ch_tx - .send(Message::NodeOutput(sender_node_id.clone(), output_packet))?; + .send(Message::NodeOutput(current_node_id.clone(), output_packet))?; } } Err(err) => { From 5cd16d2ebf7510e6613de69b9c076de275c5ba86 Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 9 Jul 2025 14:28:36 +0000 Subject: [PATCH 03/29] Save progress --- cspell.json | 3 ++- src/core/error.rs | 2 +- src/core/orchestrator/docker.rs | 2 +- src/uniffi/error.rs | 13 +++++------ src/uniffi/pipeline_runner/docker.rs | 32 +++++++++++++++------------- tests/fixture/mod.rs | 4 ++++ tests/pipeline.rs | 4 ++++ 7 files changed, 34 insertions(+), 26 deletions(-) diff --git a/cspell.json b/cspell.json index c32f7e17..e3315aad 100644 --- a/cspell.json +++ b/cspell.json @@ -75,7 +75,8 @@ "getrandom", "wasi", "petgraph", - "rfind" + "rfind", + "itertools" ], "useGitignore": false, "ignorePaths": [ diff --git a/src/core/error.rs b/src/core/error.rs index a43fa08d..af0eb7aa 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -88,7 +88,7 @@ impl From> for OrcaError { fn from(error: SendError) -> Self { Self { kind: Kind::SendError { - source: error, + reason: error.to_string(), backtrace: Some(Backtrace::capture()), }, } diff --git a/src/core/orchestrator/docker.rs b/src/core/orchestrator/docker.rs index b6d22f9d..999f01a5 100644 --- a/src/core/orchestrator/docker.rs +++ b/src/core/orchestrator/docker.rs @@ -38,7 +38,7 @@ impl LocalDockerOrchestrator { fn prepare_mount_binds( namespace_lookup: &HashMap, pod_job: &PodJob, - ) -> Result<(Vec)> { + ) -> Result<(Vec, [String; 1])> { // all host mounted paths need to be absolute let host_output_directory = path::absolute( namespace_lookup[&pod_job.output_dir.namespace].join(&pod_job.output_dir.path), diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index 661ec775..85eb7103 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -14,11 +14,8 @@ use std::{ path::{self, PathBuf}, result, }; -use tokio::sync::broadcast::error::SendError; use uniffi; -use crate::uniffi::pipeline_runner::docker::Message; - /// Shorthand for a Result that returns an `OrcaError`. pub type Result = result::Result; /// Possible errors you may encounter. @@ -89,6 +86,11 @@ pub(crate) enum Kind { missing_keys: Vec, backtrace: Option, }, + #[snafu(display("Failed to send message because: {reason}"))] + SendError { + reason: String, + backtrace: Option, + }, #[snafu(transparent)] BollardError { source: BollardError, @@ -110,11 +112,6 @@ pub(crate) enum Kind { backtrace: Option, }, #[snafu(transparent)] - SendError { - source: SendError, - backtrace: Option, - }, - #[snafu(transparent)] SerdeJsonError { source: serde_json::Error, backtrace: Option, diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs index 7ebad3e7..a8c6782b 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/docker.rs @@ -8,7 +8,7 @@ use crate::{ }, }; use futures_util::stream::FuturesUnordered; -use itertools::Itertools; +use itertools::Itertools as _; use serde_yaml::Serializer; use snafu::OptionExt as _; use std::{collections::HashMap, path::PathBuf, sync::Arc}; @@ -52,7 +52,7 @@ impl DockerPipelineRunner { pub fn start( &mut self, pipeline_job: PipelineJob, - namespace_lookup: HashMap, + namespace_lookup: &HashMap, ) -> Result { // Create a new pipeline run let pipeline_run = PipelineRun { pipeline_job }; @@ -80,7 +80,7 @@ impl DockerPipelineRunner { // Get all the leaf nodes and call the create_task_for_node function for each leaf node // This will recursively create all the tasks and channels for the pipeline pipeline.get_leaf_nodes().try_for_each(|node| { - self.create_task_for_node(node, &pipeline_run_arc, &source_tx, &namespace_lookup)?; + self.create_task_for_node(node, &pipeline_run_arc, &source_tx, namespace_lookup)?; Ok::<(), OrcaError>(()) })?; @@ -209,7 +209,7 @@ impl DockerPipelineRunner { pipeline_run: Arc, parent_channel_rxs: Vec>, mut job_manager_channel: Receiver, - tx: Sender, + success_ch_tx: Sender, namespace_lookup: HashMap, ) -> Result<()> { // Create a futures unordered set to dynamically listen to N number of receivers @@ -233,7 +233,7 @@ impl DockerPipelineRunner { Kernel::Pod(pod) => { // Create a processor for the pod node Box::new(PodNodeProcessor::new( - pod.clone(), + Arc::clone(pod), pipeline_run.pipeline_job.output_dir.namespace.clone(), namespace_lookup.clone(), )) @@ -241,7 +241,7 @@ impl DockerPipelineRunner { Kernel::Mapper(mapper) => { // Create a processor for the mapper node Box::new(MapperProcessor { - mapper: mapper.clone(), + mapper: Arc::clone(mapper), }) } Kernel::Joiner => { @@ -262,7 +262,7 @@ impl DockerPipelineRunner { let rx_result = match result { Ok(rx_result) => rx_result, Err(err) => { - // Record into pipeilne_error log + // Record into pipeline_error log if err.is_panic() { eprintln!("Task panicked: {err}"); } else { @@ -294,10 +294,10 @@ impl DockerPipelineRunner { fn process_packet_pod( node: &Node, pod: Arc, - success_ch_tx: Sender, - failure_ch_tx: Sender, - input_packet: HashMap, - pipeline_run: Arc, + success_ch_tx: &Sender, + failure_ch_tx: &Sender, + input_packet: &HashMap, + pipeline_run: &Arc, namespace_lookup: &HashMap, ) -> Result<()> { // Output directory is pod_runs/pod_run_id/node_id/hash_of_input_packet @@ -305,7 +305,7 @@ impl DockerPipelineRunner { // Compute the hash of the input_packet let mut buf = Vec::new(); let mut serializer = Serializer::new(&mut buf); - serialize_hashmap(&input_packet, &mut serializer)?; + serialize_hashmap(input_packet, &mut serializer)?; let input_packet_hash = hash_buffer(buf); let output_dir = URI { namespace: pipeline_run.pipeline_job.output_dir.namespace.clone(), @@ -397,7 +397,7 @@ impl NodeProcessor for PodNodeProcessor { // Create the pod job let pod_job = PodJob::new( None, - self.pod.clone(), + Arc::clone(&self.pod), packet.clone(), output_dir, cpu_limit, @@ -508,7 +508,7 @@ impl NodeProcessor for JoinerNodeProcessor { success_ch_tx: Sender, failure_ch_tx: Sender, ) -> Result<()> { - match { + let process_result = { // Compute the new packet combination based on the sender node id and the packet let new_packets_to_send = self.compute_new_packet_combination(&sender_node_id, &packet)?; @@ -522,7 +522,9 @@ impl NodeProcessor for JoinerNodeProcessor { .push(packet); Ok::>, OrcaError>(new_packets_to_send) - } { + }; + + match process_result { Ok(output_packets) => { // Send the output packets to the success channel for output_packet in output_packets { diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index f3dc49ea..c787dc93 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -292,6 +292,10 @@ pub fn pipeline_job() -> Result { ..Default::default() }), )]), + URI { + namespace: "default".to_owned(), + path: PathBuf::from("output"), + }, Some(Annotation { name: "Example Pipeline Job".to_owned(), description: "This is an example pipeline job.".to_owned(), diff --git a/tests/pipeline.rs b/tests/pipeline.rs index 25e03296..71a0f069 100644 --- a/tests/pipeline.rs +++ b/tests/pipeline.rs @@ -117,6 +117,10 @@ fn incorrect_input_packet() -> Result<()> { ..Default::default() }), )]), + URI { + namespace: "default".to_owned(), + path: PathBuf::from("output"), + }, None ) .is_err() From 81b8a49f8f2a9ef06cafab9d2d1455b5aa58098e Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 9 Jul 2025 18:10:50 +0000 Subject: [PATCH 04/29] Save redesign --- cspell.json | 3 +- src/uniffi/pipeline_runner/docker.rs | 522 ++++++++++++++------------- tests/pipeline_runner.rs | 33 ++ 3 files changed, 304 insertions(+), 254 deletions(-) create mode 100644 tests/pipeline_runner.rs diff --git a/cspell.json b/cspell.json index e3315aad..6f799bdc 100644 --- a/cspell.json +++ b/cspell.json @@ -76,7 +76,8 @@ "wasi", "petgraph", "rfind", - "itertools" + "itertools", + "oneshot" ], "useGitignore": false, "ignorePaths": [ diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs index a8c6782b..523d5120 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/docker.rs @@ -13,14 +13,20 @@ use serde_yaml::Serializer; use snafu::OptionExt as _; use std::{collections::HashMap, path::PathBuf, sync::Arc}; use tokio::{ - sync::broadcast::{self, Receiver, Sender}, - task::JoinSet, + sync::{ + broadcast::{self, Receiver, Sender, error::RecvError}, + oneshot, + }, + task::{JoinHandle, JoinSet}, }; use tokio_stream::StreamExt as _; #[derive(Clone, Debug)] pub(crate) enum Message { - NodeOutput(String, HashMap), // String is the parent_node_id, while HashMap is output of the parent node + /// String is the `parent_node_id`, while `HashMap` is output of the parent node + NodeOutput(String, HashMap), + /// String is the `node_id` that has completed processing + NodeProcessingComplete(String), ProcessingFailed(String, Arc), // String is the node_id, while OrcaError is the error that occurred Stop, // Message to halt all operations } @@ -130,6 +136,7 @@ impl DockerPipelineRunner { source_tx: &Sender, namespace_lookup: &HashMap, ) -> Result> { + println!("Creating task for node: {}", node.id); // Get the input channels for this node which should be it's parents let mut input_ch_rxs = pipeline_run .pipeline_job @@ -212,53 +219,127 @@ impl DockerPipelineRunner { success_ch_tx: Sender, namespace_lookup: HashMap, ) -> Result<()> { + // Create a channel to for waiting when the node processing is complete + let (node_complete_tx, node_complete_rx) = oneshot::channel::<()>(); + // Create a futures unordered set to dynamically listen to N number of receivers - let mut futures = FuturesUnordered::new(); + let chs_to_listen_to = FuturesUnordered::new(); // Add all the parent channel receivers to the futures unordered set for mut rx in parent_channel_rxs { - futures.push(tokio::spawn(async move { rx.recv().await })); + chs_to_listen_to.push(tokio::spawn(async move { rx.recv().await })); } // Add the job manager channel to the futures unordered set - futures.push(tokio::spawn( + chs_to_listen_to.push(tokio::spawn( async move { job_manager_channel.recv().await }, )); // Get the kernel for this node and build the correct processor - let mut processor: Box = match get( + match get( &pipeline_run.pipeline_job.pipeline.kernel_lut, &node.kernel_hash, )? { - Kernel::Pod(pod) => { - // Create a processor for the pod node - Box::new(PodNodeProcessor::new( - Arc::clone(pod), - pipeline_run.pipeline_job.output_dir.namespace.clone(), - namespace_lookup.clone(), - )) - } + Kernel::Pod(pod) => PodNodeProcessor::new( + Arc::clone(pod), + node.id.clone(), + chs_to_listen_to, + success_ch_tx, + pipeline_run.pipeline_job.output_dir.namespace.clone(), + namespace_lookup, + node_complete_tx, + ), Kernel::Mapper(mapper) => { - // Create a processor for the mapper node - Box::new(MapperProcessor { - mapper: Arc::clone(mapper), - }) + todo!() } Kernel::Joiner => { - // Get the parents of the join node - let parent_nodes_id = pipeline_run - .pipeline_job - .pipeline - .get_parents_for_node(&node) - .map(|parent_node| parent_node.id.clone()) - .collect::>(); - // Create a processor for the join node - Box::new(JoinerNodeProcessor::new(parent_nodes_id)) + todo!() } }; + node_complete_rx.await; + + // // Listen to the MPSC channel and handle messages + // while let Some(result) = chs_to_listen_to.next().await { + // let rx_result = match result { + // Ok(rx_result) => rx_result, + // Err(err) => { + // // Record into pipeline_error log + // if err.is_panic() { + // eprintln!("Task panicked: {err}"); + // } else { + // eprintln!("Error receiving message: {err}"); + // } + // continue; + // } + // }; + + // let Ok(msg) = rx_result else { + // eprintln!("Failed to receive message from parent channel"); + // continue; + // }; + + // match msg { + // Message::NodeOutput(sender_node_id, packet) => { + // // Inputs from parents are ready, thus we need to process them if they are already computed and cached + // processor.process_packet( + // &sender_node_id, + // &node.id, + // packet, + // success_ch_tx.clone(), + // failure_ch_tx.clone(), + // )?; + // } + // Message::Stop => { + // todo!() + // } + // Message::ProcessingFailed(_, orca_error) => todo!(), + // Message::NodeProcessingComplete(node_id) => , + // } + // } + + Ok(()) + } +} + +struct PodNodeProcessor { + pod: Arc, + node_id: String, + ch_to_listen_to: FuturesUnordered>>, + success_ch_tx: Sender, // Channel to send successful outputs to the next node + namespace: String, + namespace_lookup: HashMap, // Copy of the look up table + node_complete_tx: oneshot::Sender<()>, + processing_tasks: JoinSet>, +} + +impl PodNodeProcessor { + fn new( + pod: Arc, + node_id: String, + ch_to_listen_to: FuturesUnordered>>, + success_ch_tx: Sender, + namespace: String, + namespace_lookup: HashMap, + node_complete_tx: oneshot::Sender<()>, + ) -> Self { + Self { + pod, + node_id, + ch_to_listen_to, + success_ch_tx, + namespace, + namespace_lookup, + node_complete_tx, + processing_tasks: JoinSet::new(), + } + } + + async fn start(&mut self) { + // Start to listen to the channels // Listen to the MPSC channel and handle messages - while let Some(result) = futures.next().await { + + while let Some(result) = self.ch_to_listen_to.next().await { let rx_result = match result { Ok(rx_result) => rx_result, Err(err) => { @@ -278,37 +359,50 @@ impl DockerPipelineRunner { }; match msg { - Message::NodeOutput(sender_node_id, input_packet) => { - // Inputs from parents are ready, thus we need to process them if they are already computed and cached + Message::NodeOutput(sender_node_id, packet) => { + let pod_ref = Arc::clone(&self.pod); + let node_id = self.node_id.clone(); + let namespace = self.namespace.clone(); + let namespace_lookup = self.namespace_lookup.clone(); + let success_ch_tx = self.success_ch_tx.clone(); + // Forward it into a processing task + self.processing_tasks.spawn(async move { + Self::process_packet( + &node_id, + &pod_ref, + &namespace, + &namespace_lookup, + &packet, + &success_ch_tx, + ) + }); } Message::Stop => { - break; + todo!() } Message::ProcessingFailed(_, orca_error) => todo!(), + Message::NodeProcessingComplete(node_id) => todo!(), } } - - Ok(()) } - fn process_packet_pod( - node: &Node, - pod: Arc, - success_ch_tx: &Sender, - failure_ch_tx: &Sender, - input_packet: &HashMap, - pipeline_run: &Arc, + fn process_packet( + node_id: &str, + pod: &Arc, + namespace: &str, namespace_lookup: &HashMap, + packet: &HashMap, + success_ch_tx: &Sender, ) -> Result<()> { - // Output directory is pod_runs/pod_run_id/node_id/hash_of_input_packet + // Process the packet using the pod - // Compute the hash of the input_packet + // Create the pod_job let mut buf = Vec::new(); let mut serializer = Serializer::new(&mut buf); - serialize_hashmap(input_packet, &mut serializer)?; + serialize_hashmap(packet, &mut serializer)?; let input_packet_hash = hash_buffer(buf); let output_dir = URI { - namespace: pipeline_run.pipeline_job.output_dir.namespace.clone(), + namespace: namespace.to_owned(), path: PathBuf::from(format!("pod_runs/{}/{}", pod.hash, input_packet_hash)), }; @@ -318,92 +412,13 @@ impl DockerPipelineRunner { // Create the pod job let pod_job = PodJob::new( None, - pod, - input_packet.clone(), - output_dir, - cpu_limit, - memory_limit, - None, - namespace_lookup, - )?; - - // Simulate pod execution by just printing out pod_job_hash and pod hash - // This will be replaced by sending the pod_job to the orchestrator via the agent - println!( - "Executing pod job: {} with pod hash: {}", - pod_job.hash, pod_job.pod.hash - ); - - // For now we will just send the input_packet to the success channel - success_ch_tx.send(Message::NodeOutput(node.id.clone(), input_packet.clone()))?; - - Ok(()) - } -} - -trait NodeProcessor: Send { - fn process_packet( - &mut self, - sender_node_id: String, - current_node_id: String, - packet: HashMap, - success_ch_tx: Sender, - failure_ch_tx: Sender, - ) -> Result<()>; -} - -struct PodNodeProcessor { - pod: Arc, - namespace: String, - namespace_lookup: HashMap, // Copy of the look up table - processing_tasks: JoinSet<()>, -} - -impl PodNodeProcessor { - fn new(pod: Arc, namespace: String, namespace_lookup: HashMap) -> Self { - Self { - pod, - namespace, - namespace_lookup, - processing_tasks: JoinSet::new(), - } - } -} - -impl NodeProcessor for PodNodeProcessor { - fn process_packet( - &mut self, - sender_node_id: String, - current_node_id: String, - packet: HashMap, - success_ch_tx: Sender, - failure_ch_tx: Sender, - ) -> Result<()> { - // Process the packet using the pod - - // Create the pod_job - let mut buf = Vec::new(); - let mut serializer = Serializer::new(&mut buf); - serialize_hashmap(&packet, &mut serializer)?; - let input_packet_hash = hash_buffer(buf); - let output_dir = URI { - namespace: self.namespace.clone(), - path: PathBuf::from(format!("pod_runs/{}/{}", self.pod.hash, input_packet_hash)), - }; - - let cpu_limit = self.pod.recommended_cpus; - let memory_limit = self.pod.recommended_memory; - - // Create the pod job - let pod_job = PodJob::new( - None, - Arc::clone(&self.pod), + Arc::clone(pod), packet.clone(), output_dir, cpu_limit, memory_limit, None, - &self.namespace_lookup, + namespace_lookup, )?; // Simulate pod execution by just printing out pod_job_hash and pod hash @@ -414,135 +429,136 @@ impl NodeProcessor for PodNodeProcessor { ); // For now we will just send the input_packet to the success channel - success_ch_tx.send(Message::NodeOutput(current_node_id, packet.clone()))?; + success_ch_tx.send(Message::NodeOutput(node_id.to_owned(), packet.clone()))?; Ok(()) } } -struct MapperProcessor { - mapper: Arc, -} -impl NodeProcessor for MapperProcessor { - fn process_packet( - &mut self, - sender_node_id: String, - current_node_id: String, - packet: HashMap, - success_ch_tx: Sender, - _failure_ch_tx: Sender, - ) -> Result<()> { - // Apply the mapping to the input packet - let output_map = self - .mapper - .mapping - .iter() - .map(|(input_key, output_key)| { - let input = get(&packet, input_key)?.clone(); - Ok((output_key.to_owned(), input)) - }) - .collect::>>()?; - - // Send the output via the channel - success_ch_tx.send(Message::NodeOutput(sender_node_id, output_map))?; - Ok(()) - } -} - -struct JoinerNodeProcessor { - /// Cache for all packets received by the node - input_packet_cache: HashMap>>, -} - -impl JoinerNodeProcessor { - fn new(parents_node_id: Vec) -> Self { - let input_packet_cache = parents_node_id - .into_iter() - .map(|id| (id, Vec::new())) - .collect(); - Self { input_packet_cache } - } - - fn compute_new_packet_combination( - &self, - sender_node_id: &str, - new_packet: &HashMap, - ) -> Result>> { - // Combine the new packet with the existing packets in the cache - // Get all the cached packets from other parents - let other_parent_ids = self - .input_packet_cache - .keys() - .filter(|key| *key != sender_node_id); - let mut factors = other_parent_ids - .map(|id| get(&self.input_packet_cache, id)) - .collect::>>()?; - - // Add the new incoming packet as a factor - let incoming_packet = vec![new_packet.clone()]; - factors.push(&incoming_packet); - - let result = factors - .into_iter() - .multi_cartesian_product() - .map(|packets_to_combined| { - packets_to_combined - .into_iter() - .fold(HashMap::new(), |mut acc, packet| { - acc.extend(packet.clone()); - acc - }) - }) - .collect::>(); - - Ok(result) - } -} - -impl NodeProcessor for JoinerNodeProcessor { - fn process_packet( - &mut self, - sender_node_id: String, - current_node_id: String, - packet: HashMap, - success_ch_tx: Sender, - failure_ch_tx: Sender, - ) -> Result<()> { - let process_result = { - // Compute the new packet combination based on the sender node id and the packet - let new_packets_to_send = - self.compute_new_packet_combination(&sender_node_id, &packet)?; - - // Record the packet into the cache - self.input_packet_cache - .get_mut(&sender_node_id) - .context(selector::KeyMissing { - key: sender_node_id.clone(), - })? - .push(packet); - - Ok::>, OrcaError>(new_packets_to_send) - }; - - match process_result { - Ok(output_packets) => { - // Send the output packets to the success channel - for output_packet in output_packets { - success_ch_tx - .send(Message::NodeOutput(current_node_id.clone(), output_packet))?; - } - } - Err(err) => { - // Send the error to the failure channel - failure_ch_tx.send(Message::NodeOutput( - sender_node_id.clone(), - HashMap::new(), // Empty packet on failure - ))?; - return Err(err); - } - } - // Add the new packet into the cache - - Ok(()) - } -} +// struct MapperProcessor { +// mapper: Arc, +// } + +// impl NodeProcessor for MapperProcessor { +// fn process_packet( +// &mut self, +// sender_node_id: String, +// current_node_id: String, +// packet: HashMap, +// success_ch_tx: Sender, +// _failure_ch_tx: Sender, +// ) -> Result<()> { +// // Apply the mapping to the input packet +// let output_map = self +// .mapper +// .mapping +// .iter() +// .map(|(input_key, output_key)| { +// let input = get(&packet, input_key)?.clone(); +// Ok((output_key.to_owned(), input)) +// }) +// .collect::>>()?; + +// // Send the output via the channel +// success_ch_tx.send(Message::NodeOutput(sender_node_id, output_map))?; +// Ok(()) +// } +// } + +// struct JoinerNodeProcessor { +// /// Cache for all packets received by the node +// input_packet_cache: HashMap>>, +// } + +// impl JoinerNodeProcessor { +// fn new(parents_node_id: Vec) -> Self { +// let input_packet_cache = parents_node_id +// .into_iter() +// .map(|id| (id, Vec::new())) +// .collect(); +// Self { input_packet_cache } +// } + +// fn compute_new_packet_combination( +// &self, +// sender_node_id: &str, +// new_packet: &HashMap, +// ) -> Result>> { +// // Combine the new packet with the existing packets in the cache +// // Get all the cached packets from other parents +// let other_parent_ids = self +// .input_packet_cache +// .keys() +// .filter(|key| *key != sender_node_id); +// let mut factors = other_parent_ids +// .map(|id| get(&self.input_packet_cache, id)) +// .collect::>>()?; + +// // Add the new incoming packet as a factor +// let incoming_packet = vec![new_packet.clone()]; +// factors.push(&incoming_packet); + +// let result = factors +// .into_iter() +// .multi_cartesian_product() +// .map(|packets_to_combined| { +// packets_to_combined +// .into_iter() +// .fold(HashMap::new(), |mut acc, packet| { +// acc.extend(packet.clone()); +// acc +// }) +// }) +// .collect::>(); + +// Ok(result) +// } +// } + +// impl NodeProcessor for JoinerNodeProcessor { +// fn process_packet( +// &mut self, +// sender_node_id: String, +// current_node_id: String, +// packet: HashMap, +// success_ch_tx: Sender, +// failure_ch_tx: Sender, +// ) -> Result<()> { +// let process_result = { +// // Compute the new packet combination based on the sender node id and the packet +// let new_packets_to_send = +// self.compute_new_packet_combination(&sender_node_id, &packet)?; + +// // Record the packet into the cache +// self.input_packet_cache +// .get_mut(&sender_node_id) +// .context(selector::KeyMissing { +// key: sender_node_id.clone(), +// })? +// .push(packet); + +// Ok::>, OrcaError>(new_packets_to_send) +// }; + +// match process_result { +// Ok(output_packets) => { +// // Send the output packets to the success channel +// for output_packet in output_packets { +// success_ch_tx +// .send(Message::NodeOutput(current_node_id.clone(), output_packet))?; +// } +// } +// Err(err) => { +// // Send the error to the failure channel +// failure_ch_tx.send(Message::NodeOutput( +// sender_node_id.clone(), +// HashMap::new(), // Empty packet on failure +// ))?; +// return Err(err); +// } +// } +// // Add the new packet into the cache + +// Ok(()) +// } +// } diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs new file mode 100644 index 00000000..7959088b --- /dev/null +++ b/tests/pipeline_runner.rs @@ -0,0 +1,33 @@ +#![expect(missing_docs, reason = "OK in tests.")] +// If 'fixture' is a local module, ensure there is a 'mod fixture;' statement or a 'fixture.rs' file in the same directory or in 'tests/'. +// If 'fixture' is an external crate, add it to Cargo.toml and import as shown below. +// use fixture::pipeline_job; +pub mod fixture; + +// Example for a local module: +use std::collections::HashMap; + +use orcapod::uniffi::{error::Result, pipeline_runner::docker::DockerPipelineRunner}; + +use crate::fixture::TestDirs; +use fixture::pipeline_job; + +#[tokio::test] +async fn basic_run() -> Result<()> { + let pipeline_job = pipeline_job()?; + + // Create the runner + let mut runner = DockerPipelineRunner::new(); + + let test_dirs = TestDirs::new(&HashMap::from([( + "default".to_owned(), + Some("./tests/extra/data/"), + )]))?; + let namespace_lookup = test_dirs.namespace_lookup(); + + let pipeline_run = runner.start(pipeline_job, &namespace_lookup)?; + + // Wait for the pipeline run to complete + let result = runner.get_result(&pipeline_run).await?; + Ok(()) +} From 54d38be74664a3975a558d44cc34182b27b9d492 Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 10 Jul 2025 10:57:57 +0000 Subject: [PATCH 05/29] Fix bugs and implemented missing parts --- .clippy.toml | 2 +- src/core/error.rs | 20 +- src/core/util.rs | 20 +- src/uniffi/error.rs | 15 +- src/uniffi/pipeline.rs | 39 +- src/uniffi/pipeline_runner/docker.rs | 665 +++++++++++++++++---------- tests/fixture/mod.rs | 4 +- tests/pipeline.rs | 35 +- tests/pipeline_runner.rs | 1 + 9 files changed, 463 insertions(+), 338 deletions(-) diff --git a/.clippy.toml b/.clippy.toml index 5821063e..6b3b5fee 100644 --- a/.clippy.toml +++ b/.clippy.toml @@ -1,3 +1,3 @@ -excessive-nesting-threshold = 5 +excessive-nesting-threshold = 6 too-many-arguments-threshold = 10 allowed-idents-below-min-chars = ["..", "k", "v", "f", "re", "id", "Ok", "'_"] diff --git a/src/core/error.rs b/src/core/error.rs index af0eb7aa..06e3adef 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -12,7 +12,10 @@ use std::{ io, path::{self}, }; -use tokio::{sync::broadcast::error::SendError, task::JoinError}; +use tokio::{ + sync::{broadcast::error::SendError, oneshot}, + task::JoinError, +}; impl From for OrcaError { fn from(error: BollardError) -> Self { @@ -24,6 +27,16 @@ impl From for OrcaError { } } } +impl From for OrcaError { + fn from(error: oneshot::error::RecvError) -> Self { + Self { + kind: Kind::ChannelReceiveError { + source: error, + backtrace: Some(Backtrace::capture()), + }, + } + } +} impl From for OrcaError { fn from(error: glob::PatternError) -> Self { Self { @@ -113,7 +126,8 @@ fn format_stack(backtrace: Option<&Backtrace>) -> String { impl fmt::Debug for OrcaError { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match &self.kind { - Kind::EmptyResponseWhenLoadingContainerAltImage { backtrace, .. } + Kind::ReceiverDroppedBeforeSender { backtrace, .. } + | Kind::EmptyResponseWhenLoadingContainerAltImage { backtrace, .. } | Kind::FailedToParseDot { backtrace, .. } | Kind::GeneratedNamesOverflow { backtrace, .. } | Kind::InvalidFilepath { backtrace, .. } @@ -124,8 +138,8 @@ impl fmt::Debug for OrcaError { | Kind::NoFileName { backtrace, .. } | Kind::NoMatchingPodRun { backtrace, .. } | Kind::NoTagFoundInContainerAltImage { backtrace, .. } - | Kind::MissingInputSpecKey { backtrace, .. } | Kind::BollardError { backtrace, .. } + | Kind::ChannelReceiveError { backtrace, .. } | Kind::GlobPatternError { backtrace, .. } | Kind::IoError { backtrace, .. } | Kind::PathPrefixError { backtrace, .. } diff --git a/src/core/util.rs b/src/core/util.rs index d41ed4ac..2b7bf6b4 100644 --- a/src/core/util.rs +++ b/src/core/util.rs @@ -1,7 +1,4 @@ -use crate::uniffi::{ - error::{Result, selector}, - model::PathSet, -}; +use crate::uniffi::error::{Result, selector}; use snafu::OptionExt as _; use std::{ any::type_name, @@ -43,18 +40,3 @@ where })?; Ok(temp) } - -pub fn find_missing_keys<'a>( - input_map: &HashMap, - keys_to_check: impl Iterator, -) -> Vec { - keys_to_check - .filter_map(|key| { - if input_map.contains_key(key) { - None - } else { - Some(key.clone()) - } - }) - .collect() -} diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index 85eb7103..72cae77f 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -14,6 +14,7 @@ use std::{ path::{self, PathBuf}, result, }; +use tokio::sync::oneshot; use uniffi; /// Shorthand for a Result that returns an `OrcaError`. @@ -23,6 +24,10 @@ pub type Result = result::Result; #[snafu(module(selector), visibility(pub(crate)), context(suffix(false)))] #[uniffi(flat_error)] pub(crate) enum Kind { + #[snafu(display( + "Receiver was dropped before sender could send a message for oneshot channel" + ))] + ReceiverDroppedBeforeSender { backtrace: Option }, #[snafu(display( "Received an empty response when attempting to load the alternate container image file: {path:?}." ))] @@ -81,11 +86,6 @@ pub(crate) enum Kind { path: PathBuf, backtrace: Option, }, - #[snafu(display("Input map missing required packet keys: {missing_keys:?}"))] - MissingInputSpecKey { - missing_keys: Vec, - backtrace: Option, - }, #[snafu(display("Failed to send message because: {reason}"))] SendError { reason: String, @@ -97,6 +97,11 @@ pub(crate) enum Kind { backtrace: Option, }, #[snafu(transparent)] + ChannelReceiveError { + source: oneshot::error::RecvError, + backtrace: Option, + }, + #[snafu(transparent)] GlobPatternError { source: glob::PatternError, backtrace: Option, diff --git a/src/uniffi/pipeline.rs b/src/uniffi/pipeline.rs index d90fac87..27d84dbd 100644 --- a/src/uniffi/pipeline.rs +++ b/src/uniffi/pipeline.rs @@ -1,9 +1,5 @@ use crate::{ - core::{ - crypto::hash_buffer, - model::to_yaml, - util::{find_missing_keys, get}, - }, + core::{crypto::hash_buffer, model::to_yaml, util::get}, uniffi::{ error::{Kind, OrcaError, Result}, model::{Annotation, PathSet, Pod, URI}, @@ -351,7 +347,8 @@ pub struct PipelineJob { #[serde(skip)] pub pipeline: Pipeline, /// Mapping of outside input to keys to be match with the pipeline `input_map` - pub input_map: HashMap, + pub input_packets: Vec>, + /// Directory where to store the outputs of the pipeline pub output_dir: URI, /// Annotation for the pipeline job pub annotation: Option, @@ -363,39 +360,15 @@ impl PipelineJob { /// Error out if there are missing keys or failed to convert to yaml pub fn new( pipeline: Pipeline, - input_packet: HashMap, + input_packets: Vec>, output_dir: URI, annotation: Option, ) -> Result { - // Check if input_map has all the requires keys - let missing_keys = pipeline - .get_root_nodes() - .map(|node| match pipeline.get_kernel(&node.kernel_hash)? { - Kernel::Pod(pod) => Ok(find_missing_keys(&input_packet, pod.input_spec.keys())), - Kernel::Mapper(mapper) => { - Ok(find_missing_keys(&input_packet, mapper.mapping.keys())) - } - Kernel::Joiner => Ok(Vec::::new()), // Should probably error out because joiner should not be a root node - }) - .collect::>>>()? - .into_iter() - .flatten() - .collect::>(); - - if !missing_keys.is_empty() { - return Err(OrcaError { - kind: Kind::MissingInputSpecKey { - missing_keys, - backtrace: Some(Backtrace::capture()), - }, - }); - } - // Create the job without_hash let no_hash = Self { hash: String::new(), pipeline, - input_map: input_packet, + input_packets, annotation, output_dir, }; @@ -406,6 +379,8 @@ impl PipelineJob { }) } } + +#[derive(uniffi::Object, Display, Debug, Clone, Serialize)] pub struct PipelineResult { pub pipeline_job: PipelineJob, } diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs index 523d5120..1b279bb0 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/docker.rs @@ -2,7 +2,7 @@ use super::PipelineRun; use crate::{ core::{crypto::hash_buffer, model::serialize_hashmap, util::get}, uniffi::{ - error::{OrcaError, Result, selector}, + error::{Kind, OrcaError, Result, selector}, model::{PathSet, Pod, PodJob, URI}, pipeline::{Kernel, Mapper, Node, PipelineJob, PipelineResult}, }, @@ -11,9 +11,10 @@ use futures_util::stream::FuturesUnordered; use itertools::Itertools as _; use serde_yaml::Serializer; use snafu::OptionExt as _; -use std::{collections::HashMap, path::PathBuf, sync::Arc}; +use std::{backtrace::Backtrace, collections::HashMap, path::PathBuf, sync::Arc}; use tokio::{ sync::{ + RwLock, broadcast::{self, Receiver, Sender, error::RecvError}, oneshot, }, @@ -27,16 +28,18 @@ pub(crate) enum Message { NodeOutput(String, HashMap), /// String is the `node_id` that has completed processing NodeProcessingComplete(String), - ProcessingFailed(String, Arc), // String is the node_id, while OrcaError is the error that occurred - Stop, // Message to halt all operations + Stop, // Message to halt all operations } +#[expect( + clippy::type_complexity, + reason = "too complex, but necessary for async handling" +)] struct PipelineRunInfo { node_task_join_set: JoinSet>, // Join set to track the tasks for this pipeline run job_manager_ch_tx: Sender, node_tx: HashMap>, - outputs: HashMap>, // String is the node key, while hash - namespace_lookup: HashMap, + outputs: Arc>>>>, // String is the node key, while hash } /// Docker based pipeline runner meant to execute on a single machine @@ -71,8 +74,7 @@ impl DockerPipelineRunner { job_manager_ch_tx: broadcast::channel::(1).0, node_tx: HashMap::new(), node_task_join_set: JoinSet::new(), - outputs: HashMap::new(), - namespace_lookup: namespace_lookup.clone(), + outputs: Arc::new(RwLock::new(HashMap::new())), }, ); @@ -87,11 +89,25 @@ impl DockerPipelineRunner { // This will recursively create all the tasks and channels for the pipeline pipeline.get_leaf_nodes().try_for_each(|node| { self.create_task_for_node(node, &pipeline_run_arc, &source_tx, namespace_lookup)?; + + // Since we don't have output nodes implemented, and currently it is set as leaf nodes, + // we can do the output handling logic here too + Ok::<(), OrcaError>(()) })?; - // Create a task to handle outputs of output nodes in pipeline - // for node_key in pipeline.output_nodes {} + // All pipeline tasks have been created, now we need to feed the inputs to the pipeline + pipeline_run + .pipeline_job + .input_packets + .iter() + .try_for_each(|input_map| { + source_tx.send(Message::NodeOutput("input".to_owned(), input_map.clone()))?; + Ok::<(), OrcaError>(()) + })?; + + // Send a message that all job inputs have been sent + source_tx.send(Message::NodeProcessingComplete("input".to_owned()))?; Ok(pipeline_run) } @@ -210,6 +226,75 @@ impl DockerPipelineRunner { Ok(tx) } + fn create_task_to_capture_output_of_node( + &mut self, + node: &Node, + pipeline_run: &Arc, + ) -> Result<()> { + let pipeline_run_info = + self.pipeline_runs + .get_mut(pipeline_run) + .context(selector::KeyMissing { + key: pipeline_run.to_string(), + })?; + // Get the output ch rx for the node + let node_rx = get(&pipeline_run_info.node_tx, &node.id)?.subscribe(); + // Create a new ref copy of pipeline_run_output + let outputs_ref = Arc::clone(&pipeline_run_info.outputs); + // Create a task to listen to it and record the outputs + pipeline_run_info + .node_task_join_set + .spawn(Self::capture_node_output(node_rx, outputs_ref)); + + Ok(()) + } + + #[expect( + clippy::type_complexity, + reason = "too complex, but necessary for async handling" + )] + async fn capture_node_output( + mut node_rx: Receiver, + outputs_ref: Arc>>>>, + ) -> Result<()> { + loop { + let message = match node_rx.recv().await { + Ok(message) => message, + Err(err) => { + match err { + RecvError::Closed => { + // No more message will be received, thus we can exit the loop + // Only case where this will occur is when the channel is closed due to abort + break; + } + RecvError::Lagged(_) => { + print!("Warning: Channel lagged, skipping message"); + } + } + continue; + } + }; + match message { + Message::NodeOutput(node_id, hash_map) => { + // Record the output + + outputs_ref + .write() + .await + .entry(node_id) + .or_default() + .push(hash_map); + } + Message::NodeProcessingComplete(_) | Message::Stop => { + // Node processing is complete, we can stop listening to this channel + break; + } + } + } + + Ok(()) + } + /// For tx: Sender, we only want to send successfully completed results to the next node async fn start_node_manager( node: Node, @@ -220,7 +305,7 @@ impl DockerPipelineRunner { namespace_lookup: HashMap, ) -> Result<()> { // Create a channel to for waiting when the node processing is complete - let (node_complete_tx, node_complete_rx) = oneshot::channel::<()>(); + let (processing_complete_ch_tx, processing_complete_ch_rx) = oneshot::channel::<()>(); // Create a futures unordered set to dynamically listen to N number of receivers let chs_to_listen_to = FuturesUnordered::new(); @@ -235,111 +320,66 @@ impl DockerPipelineRunner { async move { job_manager_channel.recv().await }, )); + // Create a metadata struct for this node + let node_metadata = NodeMetaData { + node_id: node.id.clone(), + ch_to_listen_to: chs_to_listen_to, + success_ch_tx: success_ch_tx.clone(), + namespace: pipeline_run.pipeline_job.output_dir.namespace.clone(), + namespace_lookup: namespace_lookup.clone(), + }; + // Get the kernel for this node and build the correct processor match get( &pipeline_run.pipeline_job.pipeline.kernel_lut, &node.kernel_hash, )? { - Kernel::Pod(pod) => PodNodeProcessor::new( - Arc::clone(pod), - node.id.clone(), - chs_to_listen_to, - success_ch_tx, - pipeline_run.pipeline_job.output_dir.namespace.clone(), - namespace_lookup, - node_complete_tx, - ), + Kernel::Pod(pod) => { + let mut processor = PodProcessor::new(Arc::clone(pod), node_metadata); + processor.start(processing_complete_ch_tx).await?; + processing_complete_ch_rx.await?; + } Kernel::Mapper(mapper) => { - todo!() + let mut processor = MapperProcessor::new(Arc::clone(mapper), node_metadata); + processor.start(processing_complete_ch_tx).await?; + processing_complete_ch_rx.await?; } Kernel::Joiner => { - todo!() + let parent_nodes_id = pipeline_run + .pipeline_job + .pipeline + .get_parents_for_node(&node) + .map(|parent_node| parent_node.id.clone()) + .collect::>(); + let mut processor = JoinerProcessor::new(parent_nodes_id, node_metadata); + processor.start(processing_complete_ch_tx).await?; + processing_complete_ch_rx.await?; } - }; - - node_complete_rx.await; - - // // Listen to the MPSC channel and handle messages - // while let Some(result) = chs_to_listen_to.next().await { - // let rx_result = match result { - // Ok(rx_result) => rx_result, - // Err(err) => { - // // Record into pipeline_error log - // if err.is_panic() { - // eprintln!("Task panicked: {err}"); - // } else { - // eprintln!("Error receiving message: {err}"); - // } - // continue; - // } - // }; - - // let Ok(msg) = rx_result else { - // eprintln!("Failed to receive message from parent channel"); - // continue; - // }; - - // match msg { - // Message::NodeOutput(sender_node_id, packet) => { - // // Inputs from parents are ready, thus we need to process them if they are already computed and cached - // processor.process_packet( - // &sender_node_id, - // &node.id, - // packet, - // success_ch_tx.clone(), - // failure_ch_tx.clone(), - // )?; - // } - // Message::Stop => { - // todo!() - // } - // Message::ProcessingFailed(_, orca_error) => todo!(), - // Message::NodeProcessingComplete(node_id) => , - // } - // } + } Ok(()) } } -struct PodNodeProcessor { - pod: Arc, +struct NodeMetaData { node_id: String, ch_to_listen_to: FuturesUnordered>>, success_ch_tx: Sender, // Channel to send successful outputs to the next node namespace: String, namespace_lookup: HashMap, // Copy of the look up table - node_complete_tx: oneshot::Sender<()>, - processing_tasks: JoinSet>, } -impl PodNodeProcessor { - fn new( - pod: Arc, - node_id: String, - ch_to_listen_to: FuturesUnordered>>, - success_ch_tx: Sender, - namespace: String, - namespace_lookup: HashMap, - node_complete_tx: oneshot::Sender<()>, - ) -> Self { - Self { - pod, - node_id, - ch_to_listen_to, - success_ch_tx, - namespace, - namespace_lookup, - node_complete_tx, - processing_tasks: JoinSet::new(), - } - } +trait NodeProcessor { + fn get_ch_to_listen_to( + &mut self, + ) -> &mut FuturesUnordered>>; - async fn start(&mut self) { + async fn wait_for_node_task_completion(&mut self); + + async fn start(&mut self, process_complete_ch_tx: oneshot::Sender<()>) -> Result<()> { // Start to listen to the channels // Listen to the MPSC channel and handle messages - - while let Some(result) = self.ch_to_listen_to.next().await { + while let Some(result) = self.get_ch_to_listen_to().next().await { let rx_result = match result { Ok(rx_result) => rx_result, Err(err) => { @@ -358,32 +398,42 @@ impl PodNodeProcessor { continue; }; - match msg { - Message::NodeOutput(sender_node_id, packet) => { - let pod_ref = Arc::clone(&self.pod); - let node_id = self.node_id.clone(); - let namespace = self.namespace.clone(); - let namespace_lookup = self.namespace_lookup.clone(); - let success_ch_tx = self.success_ch_tx.clone(); - // Forward it into a processing task - self.processing_tasks.spawn(async move { - Self::process_packet( - &node_id, - &pod_ref, - &namespace, - &namespace_lookup, - &packet, - &success_ch_tx, - ) - }); - } - Message::Stop => { - todo!() - } - Message::ProcessingFailed(_, orca_error) => todo!(), - Message::NodeProcessingComplete(node_id) => todo!(), + // Process the message + if self.process_msg(msg).await? { + // If the message indicates that processing is complete, we can exit the loop + // Wait for all processing tasks to complete before sending the completion message + + self.wait_for_node_task_completion().await; + + // Send the node processing complete message + process_complete_ch_tx.send(()).map_err(|()| OrcaError { + kind: Kind::ReceiverDroppedBeforeSender { + backtrace: Some(Backtrace::capture()), + }, + })?; + break; } } + + Ok(()) + } + + async fn process_msg(&mut self, msg: Message) -> Result; +} + +struct PodProcessor { + pod: Arc, + node_metadata: NodeMetaData, + processing_tasks: JoinSet>, +} + +impl PodProcessor { + fn new(pod: Arc, node_metadata: NodeMetaData) -> Self { + Self { + pod, + node_metadata, + processing_tasks: JoinSet::new(), + } } fn process_packet( @@ -435,130 +485,259 @@ impl PodNodeProcessor { } } -// struct MapperProcessor { -// mapper: Arc, -// } - -// impl NodeProcessor for MapperProcessor { -// fn process_packet( -// &mut self, -// sender_node_id: String, -// current_node_id: String, -// packet: HashMap, -// success_ch_tx: Sender, -// _failure_ch_tx: Sender, -// ) -> Result<()> { -// // Apply the mapping to the input packet -// let output_map = self -// .mapper -// .mapping -// .iter() -// .map(|(input_key, output_key)| { -// let input = get(&packet, input_key)?.clone(); -// Ok((output_key.to_owned(), input)) -// }) -// .collect::>>()?; - -// // Send the output via the channel -// success_ch_tx.send(Message::NodeOutput(sender_node_id, output_map))?; -// Ok(()) -// } -// } - -// struct JoinerNodeProcessor { -// /// Cache for all packets received by the node -// input_packet_cache: HashMap>>, -// } - -// impl JoinerNodeProcessor { -// fn new(parents_node_id: Vec) -> Self { -// let input_packet_cache = parents_node_id -// .into_iter() -// .map(|id| (id, Vec::new())) -// .collect(); -// Self { input_packet_cache } -// } - -// fn compute_new_packet_combination( -// &self, -// sender_node_id: &str, -// new_packet: &HashMap, -// ) -> Result>> { -// // Combine the new packet with the existing packets in the cache -// // Get all the cached packets from other parents -// let other_parent_ids = self -// .input_packet_cache -// .keys() -// .filter(|key| *key != sender_node_id); -// let mut factors = other_parent_ids -// .map(|id| get(&self.input_packet_cache, id)) -// .collect::>>()?; - -// // Add the new incoming packet as a factor -// let incoming_packet = vec![new_packet.clone()]; -// factors.push(&incoming_packet); - -// let result = factors -// .into_iter() -// .multi_cartesian_product() -// .map(|packets_to_combined| { -// packets_to_combined -// .into_iter() -// .fold(HashMap::new(), |mut acc, packet| { -// acc.extend(packet.clone()); -// acc -// }) -// }) -// .collect::>(); - -// Ok(result) -// } -// } - -// impl NodeProcessor for JoinerNodeProcessor { -// fn process_packet( -// &mut self, -// sender_node_id: String, -// current_node_id: String, -// packet: HashMap, -// success_ch_tx: Sender, -// failure_ch_tx: Sender, -// ) -> Result<()> { -// let process_result = { -// // Compute the new packet combination based on the sender node id and the packet -// let new_packets_to_send = -// self.compute_new_packet_combination(&sender_node_id, &packet)?; - -// // Record the packet into the cache -// self.input_packet_cache -// .get_mut(&sender_node_id) -// .context(selector::KeyMissing { -// key: sender_node_id.clone(), -// })? -// .push(packet); - -// Ok::>, OrcaError>(new_packets_to_send) -// }; - -// match process_result { -// Ok(output_packets) => { -// // Send the output packets to the success channel -// for output_packet in output_packets { -// success_ch_tx -// .send(Message::NodeOutput(current_node_id.clone(), output_packet))?; -// } -// } -// Err(err) => { -// // Send the error to the failure channel -// failure_ch_tx.send(Message::NodeOutput( -// sender_node_id.clone(), -// HashMap::new(), // Empty packet on failure -// ))?; -// return Err(err); -// } -// } -// // Add the new packet into the cache - -// Ok(()) -// } -// } +impl NodeProcessor for PodProcessor { + async fn process_msg(&mut self, msg: Message) -> Result { + match msg { + Message::NodeOutput(sender_node_id, packet) => { + let pod_ref = Arc::clone(&self.pod); + let node_id = self.node_metadata.node_id.clone(); + let namespace = self.node_metadata.namespace.clone(); + let namespace_lookup = self.node_metadata.namespace_lookup.clone(); + let success_ch_tx = self.node_metadata.success_ch_tx.clone(); + // Forward it into a processing task + self.processing_tasks.spawn(async move { + Self::process_packet( + &node_id, + &pod_ref, + &namespace, + &namespace_lookup, + &packet, + &success_ch_tx, + ) + }); + } + Message::Stop => { + // Stop message received, we will stop processing + self.processing_tasks.abort_all(); + return Ok(true); + } + Message::NodeProcessingComplete(_) => { + // Since pod only have one parent, we can expect that there will be no more incoming packet + // thus, we need to wait for everything to finish processing and send completion message + // Return true to notify caller that processing is complete + self.wait_for_node_task_completion().await; + return Ok(true); + } + } + Ok(false) + } + + fn get_ch_to_listen_to( + &mut self, + ) -> &mut FuturesUnordered>> { + &mut self.node_metadata.ch_to_listen_to + } + + async fn wait_for_node_task_completion(&mut self) { + while self.processing_tasks.join_next().await.is_some() { + // Wait for all processing tasks to complete + } + } +} + +struct MapperProcessor { + mapper: Arc, + node_metadata: NodeMetaData, +} + +impl MapperProcessor { + const fn new(mapper: Arc, node_metadata: NodeMetaData) -> Self { + Self { + mapper, + node_metadata, + } + } + + fn process_packet(&self, packet: &HashMap) -> Result<()> { + // Apply the mapping to the input packet + let output_map = self + .mapper + .mapping + .iter() + .map(|(input_key, output_key)| { + let input = get(packet, input_key)?.clone(); + Ok((output_key.to_owned(), input)) + }) + .collect::>>()?; + + // Send the output via the channel + self.node_metadata.success_ch_tx.send(Message::NodeOutput( + self.node_metadata.node_id.clone(), + output_map, + ))?; + Ok(()) + } +} + +impl NodeProcessor for MapperProcessor { + fn get_ch_to_listen_to( + &mut self, + ) -> &mut FuturesUnordered>> { + &mut self.node_metadata.ch_to_listen_to + } + + async fn wait_for_node_task_completion(&mut self) { + // Mapper doesn't spawn additional tasks, so this is a no-op + } + + async fn process_msg(&mut self, msg: Message) -> Result { + match msg { + Message::NodeOutput(_, hash_map) => { + let output_map = self + .mapper + .mapping + .iter() + .map(|(input_key, output_key)| { + let input = get(&hash_map, input_key)?.clone(); + Ok((output_key.to_owned(), input)) + }) + .collect::>>()?; + + // For now we will just send the input_packet to the success channel + self.node_metadata.success_ch_tx.send(Message::NodeOutput( + self.node_metadata.node_id.clone(), + output_map, + ))?; + } + Message::NodeProcessingComplete(_) => return Ok(true), + Message::Stop => todo!(), + } + + Ok(false) + } +} + +struct JoinerProcessor { + /// Cache for all packets received by the node + input_packet_cache: HashMap>>, + completed_parents: Vec, + node_metadata: NodeMetaData, +} + +impl JoinerProcessor { + fn new(parents_node_id: Vec, node_metadata: NodeMetaData) -> Self { + let input_packet_cache = parents_node_id + .into_iter() + .map(|id| (id, Vec::new())) + .collect(); + Self { + input_packet_cache, + node_metadata, + completed_parents: Vec::new(), + } + } + + fn compute_new_packet_combination( + &self, + sender_node_id: &str, + new_packet: &HashMap, + ) -> Result>> { + // Combine the new packet with the existing packets in the cache + // Get all the cached packets from other parents + let other_parent_ids = self + .input_packet_cache + .keys() + .filter(|key| *key != sender_node_id); + let mut factors = other_parent_ids + .map(|id| get(&self.input_packet_cache, id)) + .collect::>>()?; + + // Add the new incoming packet as a factor + let incoming_packet = vec![new_packet.clone()]; + factors.push(&incoming_packet); + + let result = factors + .into_iter() + .multi_cartesian_product() + .map(|packets_to_combined| { + packets_to_combined + .into_iter() + .fold(HashMap::new(), |mut acc, packet| { + acc.extend(packet.clone()); + acc + }) + }) + .collect::>(); + + Ok(result) + } + + fn process_packet( + &mut self, + sender_node_id: &str, + packet: HashMap, + ) -> Result<()> { + let process_result = { + // Compute the new packet combination based on the sender node id and the packet + let new_packets_to_send = + self.compute_new_packet_combination(sender_node_id, &packet)?; + + // Record the packet into the cache + self.input_packet_cache + .get_mut(sender_node_id) + .context(selector::KeyMissing { + key: sender_node_id.to_owned(), + })? + .push(packet); + + Ok::>, OrcaError>(new_packets_to_send) + }; + + match process_result { + Ok(output_packets) => { + // Send the output packets to the success channel + for output_packet in output_packets { + self.node_metadata.success_ch_tx.send(Message::NodeOutput( + self.node_metadata.node_id.clone(), + output_packet, + ))?; + } + } + Err(err) => { + // Send the error to the failure channel + todo!(); + } + } + // Add the new packet into the cache + + Ok(()) + } +} + +impl NodeProcessor for JoinerProcessor { + fn get_ch_to_listen_to( + &mut self, + ) -> &mut FuturesUnordered>> { + &mut self.node_metadata.ch_to_listen_to + } + + async fn wait_for_node_task_completion(&mut self) { + // Joiner doesn't spawn additional tasks, so this is a no-op + } + + async fn process_msg(&mut self, msg: Message) -> Result { + match msg { + Message::NodeOutput(sender_node_id, packet) => { + // Process the packet and send the output to the success channel + self.process_packet(&sender_node_id, packet)?; + } + Message::NodeProcessingComplete(sender_node_id) => { + // Record that this parent node has completed processing + self.completed_parents.push(sender_node_id); + + // Check if all parents have completed processing + if self.completed_parents.len() == self.input_packet_cache.len() { + // All parents have completed processing, we can send the output + // Wait for all packets to be processed and send the output + return Ok(true); + } + } + Message::Stop => { + // We don't have anything to clean up, so we can just return + return Ok(true); + } + } + + Ok(false) + } +} diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index c787dc93..150babaf 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -281,7 +281,7 @@ pub fn pipeline_job() -> Result { // Create a simple pipeline_job PipelineJob::new( pipeline()?, - HashMap::from([( + vec![HashMap::from([( "input_text".to_owned(), PathSet::Unary(Blob { kind: BlobKind::File, @@ -291,7 +291,7 @@ pub fn pipeline_job() -> Result { }, ..Default::default() }), - )]), + )])], URI { namespace: "default".to_owned(), path: PathBuf::from("output"), diff --git a/tests/pipeline.rs b/tests/pipeline.rs index 71a0f069..4150deeb 100644 --- a/tests/pipeline.rs +++ b/tests/pipeline.rs @@ -6,14 +6,10 @@ //! process completes successfully and outputs the expected results. pub mod fixture; -use std::{collections::HashMap, path::PathBuf, vec}; +use std::vec; use fixture::pipeline; -use orcapod::uniffi::{ - error::Result, - model::{Annotation, Blob, BlobKind, PathSet, URI}, - pipeline::PipelineJob, -}; +use orcapod::uniffi::{error::Result, model::Annotation}; use crate::fixture::pipeline_job; @@ -100,30 +96,3 @@ fn pipeline_job_creation() -> Result<()> { Ok(()) } - -#[test] -fn incorrect_input_packet() -> Result<()> { - assert!( - PipelineJob::new( - pipeline()?, - HashMap::from([( - "wrong_key".to_owned(), - PathSet::Unary(Blob { - kind: BlobKind::File, - location: URI { - namespace: "default".to_owned(), - path: PathBuf::from("data/input.txt"), - }, - ..Default::default() - }), - )]), - URI { - namespace: "default".to_owned(), - path: PathBuf::from("output"), - }, - None - ) - .is_err() - ); - Ok(()) -} diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 7959088b..a2889480 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -29,5 +29,6 @@ async fn basic_run() -> Result<()> { // Wait for the pipeline run to complete let result = runner.get_result(&pipeline_run).await?; + println!("Pipeline run result: {result:?}"); Ok(()) } From 3058eb75667a1831b514082bcc237791f6a01674 Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 10 Jul 2025 11:36:22 +0000 Subject: [PATCH 06/29] Add basic design struct --- src/uniffi/pipeline_runner/docker.rs | 75 +++++++++++++++++++++++----- tests/pipeline_runner.rs | 2 +- 2 files changed, 64 insertions(+), 13 deletions(-) diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs index 1b279bb0..2d815726 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/docker.rs @@ -11,7 +11,10 @@ use futures_util::stream::FuturesUnordered; use itertools::Itertools as _; use serde_yaml::Serializer; use snafu::OptionExt as _; -use std::{backtrace::Backtrace, collections::HashMap, path::PathBuf, sync::Arc}; +use std::{ + backtrace::Backtrace, collections::HashMap, path::PathBuf, sync::Arc, thread::sleep, + time::Duration, +}; use tokio::{ sync::{ RwLock, @@ -80,7 +83,7 @@ impl DockerPipelineRunner { // Create the source channel for the pipeline // This channel will be used to send inputs to the pipeline - let (source_tx, _) = broadcast::channel::(1); + let (source_tx, _) = broadcast::channel::(128); // Get reference to the pipeline let pipeline = &pipeline_run_arc.pipeline_job.pipeline; @@ -109,6 +112,8 @@ impl DockerPipelineRunner { // Send a message that all job inputs have been sent source_tx.send(Message::NodeProcessingComplete("input".to_owned()))?; + sleep(Duration::from_secs(1)); // Give some time for the tasks to start + panic!(); Ok(pipeline_run) } @@ -304,6 +309,7 @@ impl DockerPipelineRunner { success_ch_tx: Sender, namespace_lookup: HashMap, ) -> Result<()> { + println!("Starting node manager for node: {}", node.id); // Create a channel to for waiting when the node processing is complete let (processing_complete_ch_tx, processing_complete_ch_rx) = oneshot::channel::<()>(); @@ -329,6 +335,8 @@ impl DockerPipelineRunner { namespace_lookup: namespace_lookup.clone(), }; + println!("Setting up node processor for node: {}", node.id); + // Get the kernel for this node and build the correct processor match get( &pipeline_run.pipeline_job.pipeline.kernel_lut, @@ -380,7 +388,7 @@ trait NodeProcessor { // Start to listen to the channels // Listen to the MPSC channel and handle messages while let Some(result) = self.get_ch_to_listen_to().next().await { - let rx_result = match result { + let repeater_result = match result { Ok(rx_result) => rx_result, Err(err) => { // Record into pipeline_error log @@ -393,9 +401,18 @@ trait NodeProcessor { } }; - let Ok(msg) = rx_result else { - eprintln!("Failed to receive message from parent channel"); - continue; + let msg = match repeater_result { + Ok(msg) => msg, + Err(RecvError::Closed) => { + // Channel is closed, we can exit the loop + eprintln!("Channel closed, exiting node processor"); + break; + } + Err(RecvError::Lagged(_)) => { + // Channel lagged, skip this message + eprintln!("Channel lagged, skipping message"); + continue; + } }; // Process the message @@ -445,11 +462,22 @@ impl PodProcessor { success_ch_tx: &Sender, ) -> Result<()> { // Process the packet using the pod + println!( + "Processing packet in pod: {} with node_id: {}", + pod.hash, node_id + ); // Create the pod_job let mut buf = Vec::new(); let mut serializer = Serializer::new(&mut buf); - serialize_hashmap(packet, &mut serializer)?; + match serialize_hashmap(packet, &mut serializer) { + Ok(_) => {} + Err(err) => { + println!("Failed to serialize packet: {err}"); + } + } + + println!("managed to serialize packet: {:?}", buf); let input_packet_hash = hash_buffer(buf); let output_dir = URI { namespace: namespace.to_owned(), @@ -460,7 +488,7 @@ impl PodProcessor { let memory_limit = pod.recommended_memory; // Create the pod job - let pod_job = PodJob::new( + let pod_job = match PodJob::new( None, Arc::clone(pod), packet.clone(), @@ -469,7 +497,13 @@ impl PodProcessor { memory_limit, None, namespace_lookup, - )?; + ) { + Ok(job) => job, + Err(err) => { + println!("Failed to create pod job: {err}"); + panic!("Failed to create pod job: {err}"); + } + }; // Simulate pod execution by just printing out pod_job_hash and pod hash // This will be replaced by sending the pod_job to the orchestrator via the agent @@ -479,7 +513,12 @@ impl PodProcessor { ); // For now we will just send the input_packet to the success channel - success_ch_tx.send(Message::NodeOutput(node_id.to_owned(), packet.clone()))?; + match success_ch_tx.send(Message::NodeOutput(node_id.to_owned(), packet.clone())) { + Ok(_) => {} + Err(err) => { + println!("Failed to send message to success channel: {err}"); + } + } Ok(()) } @@ -489,6 +528,10 @@ impl NodeProcessor for PodProcessor { async fn process_msg(&mut self, msg: Message) -> Result { match msg { Message::NodeOutput(sender_node_id, packet) => { + println!( + "Node {} received packet: {:?} from {}", + self.node_metadata.node_id, packet, sender_node_id + ); let pod_ref = Arc::clone(&self.pod); let node_id = self.node_metadata.node_id.clone(); let namespace = self.node_metadata.namespace.clone(); @@ -582,13 +625,17 @@ impl NodeProcessor for MapperProcessor { async fn process_msg(&mut self, msg: Message) -> Result { match msg { - Message::NodeOutput(_, hash_map) => { + Message::NodeOutput(sender_node_id, packet) => { + println!( + "Node {} received packet: {:?} from {}", + self.node_metadata.node_id, packet, sender_node_id + ); let output_map = self .mapper .mapping .iter() .map(|(input_key, output_key)| { - let input = get(&hash_map, input_key)?.clone(); + let input = get(&packet, input_key)?.clone(); Ok((output_key.to_owned(), input)) }) .collect::>>()?; @@ -718,6 +765,10 @@ impl NodeProcessor for JoinerProcessor { async fn process_msg(&mut self, msg: Message) -> Result { match msg { Message::NodeOutput(sender_node_id, packet) => { + println!( + "Node {} received packet: {:?} from {}", + self.node_metadata.node_id, packet, sender_node_id + ); // Process the packet and send the output to the success channel self.process_packet(&sender_node_id, packet)?; } diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index a2889480..a349eb24 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -12,7 +12,7 @@ use orcapod::uniffi::{error::Result, pipeline_runner::docker::DockerPipelineRunn use crate::fixture::TestDirs; use fixture::pipeline_job; -#[tokio::test] +#[tokio::test(flavor = "multi_thread", worker_threads = 16)] async fn basic_run() -> Result<()> { let pipeline_job = pipeline_job()?; From c45df99cf4f18440dd38458a3837980946a17673 Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 10 Jul 2025 16:21:21 +0000 Subject: [PATCH 07/29] Replace broadcast arch with MPSC --- src/core/error.rs | 2 +- src/uniffi/pipeline.rs | 13 + src/uniffi/pipeline_runner/docker.rs | 495 ++++++++++----------------- 3 files changed, 186 insertions(+), 324 deletions(-) diff --git a/src/core/error.rs b/src/core/error.rs index 06e3adef..d2404663 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -13,7 +13,7 @@ use std::{ path::{self}, }; use tokio::{ - sync::{broadcast::error::SendError, oneshot}, + sync::{mpsc::error::SendError, oneshot}, task::JoinError, }; diff --git a/src/uniffi/pipeline.rs b/src/uniffi/pipeline.rs index 27d84dbd..2bb205a7 100644 --- a/src/uniffi/pipeline.rs +++ b/src/uniffi/pipeline.rs @@ -333,6 +333,19 @@ impl Pipeline { .map(move |parent_idx| &self.graph[parent_idx]) }) } + + pub fn get_children_for_node(&self, node: &Node) -> impl Iterator { + // Find the NodeIndex for the given node_key + let node_index = self + .graph + .node_indices() + .find(|&idx| self.graph[idx] == *node); + node_index.into_iter().flat_map(move |idx| { + self.graph + .neighbors_directed(idx, Outgoing) + .map(move |child_idx| &self.graph[child_idx]) + }) + } } #[derive(uniffi::Object, Display, Debug, Clone, Serialize)] diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs index 2d815726..461edc8c 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/docker.rs @@ -2,28 +2,20 @@ use super::PipelineRun; use crate::{ core::{crypto::hash_buffer, model::serialize_hashmap, util::get}, uniffi::{ - error::{Kind, OrcaError, Result, selector}, + error::{OrcaError, Result, selector}, model::{PathSet, Pod, PodJob, URI}, pipeline::{Kernel, Mapper, Node, PipelineJob, PipelineResult}, }, }; -use futures_util::stream::FuturesUnordered; +use futures_util::future::try_join_all; use itertools::Itertools as _; use serde_yaml::Serializer; use snafu::OptionExt as _; -use std::{ - backtrace::Backtrace, collections::HashMap, path::PathBuf, sync::Arc, thread::sleep, - time::Duration, -}; +use std::{collections::HashMap, path::PathBuf, sync::Arc, thread::sleep, time::Duration}; use tokio::{ - sync::{ - RwLock, - broadcast::{self, Receiver, Sender, error::RecvError}, - oneshot, - }, - task::{JoinHandle, JoinSet}, + sync::{RwLock, mpsc}, + task::{JoinSet, spawn_blocking}, }; -use tokio_stream::StreamExt as _; #[derive(Clone, Debug)] pub(crate) enum Message { @@ -40,8 +32,7 @@ pub(crate) enum Message { )] struct PipelineRunInfo { node_task_join_set: JoinSet>, // Join set to track the tasks for this pipeline run - job_manager_ch_tx: Sender, - node_tx: HashMap>, + node_tx: HashMap>, outputs: Arc>>>>, // String is the node key, while hash } @@ -61,7 +52,7 @@ impl DockerPipelineRunner { /// /// # Errors /// Will error out if the pipeline job fails to start - pub fn start( + pub async fn start( &mut self, pipeline_job: PipelineJob, namespace_lookup: &HashMap, @@ -74,45 +65,40 @@ impl DockerPipelineRunner { self.pipeline_runs.insert( (*pipeline_run_arc).clone(), PipelineRunInfo { - job_manager_ch_tx: broadcast::channel::(1).0, node_tx: HashMap::new(), node_task_join_set: JoinSet::new(), outputs: Arc::new(RwLock::new(HashMap::new())), }, ); - // Create the source channel for the pipeline - // This channel will be used to send inputs to the pipeline - let (source_tx, _) = broadcast::channel::(128); - // Get reference to the pipeline let pipeline = &pipeline_run_arc.pipeline_job.pipeline; - // Get all the leaf nodes and call the create_task_for_node function for each leaf node + // Get all the root nodes and call the create_task_for_node function for each root node // This will recursively create all the tasks and channels for the pipeline - pipeline.get_leaf_nodes().try_for_each(|node| { - self.create_task_for_node(node, &pipeline_run_arc, &source_tx, namespace_lookup)?; - - // Since we don't have output nodes implemented, and currently it is set as leaf nodes, - // we can do the output handling logic here too - - Ok::<(), OrcaError>(()) - })?; + let root_nodes_tx = pipeline + .get_root_nodes() + .map(|node| self.create_task_for_node(node, &pipeline_run_arc, namespace_lookup)) + .collect::>>()?; // All pipeline tasks have been created, now we need to feed the inputs to the pipeline - pipeline_run - .pipeline_job - .input_packets - .iter() - .try_for_each(|input_map| { - source_tx.send(Message::NodeOutput("input".to_owned(), input_map.clone()))?; - Ok::<(), OrcaError>(()) - })?; + for tx in &root_nodes_tx { + for input_packet in &pipeline_run.pipeline_job.input_packets { + tx.send(Message::NodeOutput( + "input".to_owned(), + input_packet.clone(), + )) + .await?; + } + } // Send a message that all job inputs have been sent - source_tx.send(Message::NodeProcessingComplete("input".to_owned()))?; + for tx in &root_nodes_tx { + tx.send(Message::NodeProcessingComplete("input".to_owned())) + .await?; + } - sleep(Duration::from_secs(1)); // Give some time for the tasks to start + sleep(Duration::from_secs(5)); // Give some time for the tasks to start panic!(); Ok(pipeline_run) } @@ -154,55 +140,40 @@ impl DockerPipelineRunner { &mut self, node: &Node, pipeline_run: &Arc, - source_tx: &Sender, namespace_lookup: &HashMap, - ) -> Result> { + ) -> Result> { println!("Creating task for node: {}", node.id); - // Get the input channels for this node which should be it's parents - let mut input_ch_rxs = pipeline_run - .pipeline_job - .pipeline - .get_parents_for_node(node) - .map(|parent_node| { - // Check if it exists in the pipeline_runs hashmap - match get(&self.pipeline_runs, pipeline_run)? - .node_tx - .get(&parent_node.id) - { - Some(rx) => Ok(rx.subscribe()), - None => { - // Missing parent node, thus call create_task for the parent node parent node first - Ok(self - .create_task_for_node( - parent_node, - pipeline_run, - source_tx, - namespace_lookup, - )? - .subscribe()) - } - } - }) - .collect::>>()?; + // Create a channel for the node + // This channel will be used to send messages to the node processor + let (tx, rx) = mpsc::channel::(128); + + // Use closer to limit the scope of the borrow + { + let pipeline_info = + self.pipeline_runs + .get_mut(pipeline_run) + .context(selector::KeyMissing { + key: pipeline_run.to_string(), + })?; + // Check if the node is already inside the node_tx + if pipeline_info.node_tx.contains_key(&node.id) { + // Node already exists, thus we can return the existing tx + return Ok(pipeline_info.node_tx.get(&node.id).unwrap().clone()); + } - // Check if input_ch_rxs is empty, meaning this node has no parents and is a root node - // In this case, we will use the source channel as the input channel - // TODO: This will be replaced by input_node logic once that is merged - if input_ch_rxs.is_empty() { - // No parents, thus this is root node - // The parent rx will be the source channel rx - input_ch_rxs.push(source_tx.subscribe()); + // Record the tx into the pipeline_info tx_hashmap + pipeline_info.node_tx.insert(node.id.clone(), tx.clone()); } - // Get the job manager ch and subscribe to it (mainly for receiving shutdown signal) - let job_manager_ch_rx = get(&self.pipeline_runs, pipeline_run)? - .job_manager_ch_tx - .subscribe(); - - // Create the output_channel for this node - let (tx, _) = broadcast::channel::(128); + // Call this function for each of the child nodes to get their Sender_tx + let children_node_tx = pipeline_run + .pipeline_job + .pipeline + .get_children_for_node(node) + .map(|child_node| self.create_task_for_node(child_node, pipeline_run, namespace_lookup)) + .collect::>>()?; - // Spawn the node_manager for this node + // Start the task_manager self.pipeline_runs .get_mut(pipeline_run) .context(selector::KeyMissing { @@ -212,77 +183,26 @@ impl DockerPipelineRunner { .spawn(Self::start_node_manager( node.clone(), Arc::clone(pipeline_run), - input_ch_rxs, - job_manager_ch_rx, - tx.clone(), + rx, + children_node_tx, namespace_lookup.clone(), )); - // Insert it into the the tx into the pipeline_runs hashmap - self.pipeline_runs - .get_mut(pipeline_run) - .context(selector::KeyMissing { - key: pipeline_run.to_string(), - })? - .node_tx - .insert(node.id.clone(), tx.clone()); - - // Return tx Ok(tx) } - fn create_task_to_capture_output_of_node( - &mut self, - node: &Node, - pipeline_run: &Arc, - ) -> Result<()> { - let pipeline_run_info = - self.pipeline_runs - .get_mut(pipeline_run) - .context(selector::KeyMissing { - key: pipeline_run.to_string(), - })?; - // Get the output ch rx for the node - let node_rx = get(&pipeline_run_info.node_tx, &node.id)?.subscribe(); - // Create a new ref copy of pipeline_run_output - let outputs_ref = Arc::clone(&pipeline_run_info.outputs); - // Create a task to listen to it and record the outputs - pipeline_run_info - .node_task_join_set - .spawn(Self::capture_node_output(node_rx, outputs_ref)); - - Ok(()) - } - #[expect( clippy::type_complexity, reason = "too complex, but necessary for async handling" )] async fn capture_node_output( - mut node_rx: Receiver, + mut output_rx: mpsc::Receiver, outputs_ref: Arc>>>>, ) -> Result<()> { - loop { - let message = match node_rx.recv().await { - Ok(message) => message, - Err(err) => { - match err { - RecvError::Closed => { - // No more message will be received, thus we can exit the loop - // Only case where this will occur is when the channel is closed due to abort - break; - } - RecvError::Lagged(_) => { - print!("Warning: Channel lagged, skipping message"); - } - } - continue; - } - }; - match message { + while let Some(msg) = output_rx.recv().await { + match msg { Message::NodeOutput(node_id, hash_map) => { // Record the output - outputs_ref .write() .await @@ -296,7 +216,6 @@ impl DockerPipelineRunner { } } } - Ok(()) } @@ -304,33 +223,16 @@ impl DockerPipelineRunner { async fn start_node_manager( node: Node, pipeline_run: Arc, - parent_channel_rxs: Vec>, - mut job_manager_channel: Receiver, - success_ch_tx: Sender, + node_rx: mpsc::Receiver, + success_chs_tx: Vec>, namespace_lookup: HashMap, ) -> Result<()> { println!("Starting node manager for node: {}", node.id); - // Create a channel to for waiting when the node processing is complete - let (processing_complete_ch_tx, processing_complete_ch_rx) = oneshot::channel::<()>(); - - // Create a futures unordered set to dynamically listen to N number of receivers - let chs_to_listen_to = FuturesUnordered::new(); - - // Add all the parent channel receivers to the futures unordered set - for mut rx in parent_channel_rxs { - chs_to_listen_to.push(tokio::spawn(async move { rx.recv().await })); - } - - // Add the job manager channel to the futures unordered set - chs_to_listen_to.push(tokio::spawn( - async move { job_manager_channel.recv().await }, - )); - // Create a metadata struct for this node let node_metadata = NodeMetaData { node_id: node.id.clone(), - ch_to_listen_to: chs_to_listen_to, - success_ch_tx: success_ch_tx.clone(), + node_rx, + child_nodes_txs: success_chs_tx.clone(), namespace: pipeline_run.pipeline_job.output_dir.namespace.clone(), namespace_lookup: namespace_lookup.clone(), }; @@ -344,13 +246,11 @@ impl DockerPipelineRunner { )? { Kernel::Pod(pod) => { let mut processor = PodProcessor::new(Arc::clone(pod), node_metadata); - processor.start(processing_complete_ch_tx).await?; - processing_complete_ch_rx.await?; + processor.start().await; } Kernel::Mapper(mapper) => { let mut processor = MapperProcessor::new(Arc::clone(mapper), node_metadata); - processor.start(processing_complete_ch_tx).await?; - processing_complete_ch_rx.await?; + processor.start().await; } Kernel::Joiner => { let parent_nodes_id = pipeline_run @@ -360,82 +260,49 @@ impl DockerPipelineRunner { .map(|parent_node| parent_node.id.clone()) .collect::>(); let mut processor = JoinerProcessor::new(parent_nodes_id, node_metadata); - processor.start(processing_complete_ch_tx).await?; - processing_complete_ch_rx.await?; + processor.start().await; } } + // Notify that node is finish processing + println!("Node {} processing complete", node.id); + for success_ch_tx in &success_chs_tx { + success_ch_tx + .send(Message::NodeProcessingComplete(node.id.clone())) + .await?; + } + Ok(()) } } struct NodeMetaData { node_id: String, - ch_to_listen_to: FuturesUnordered>>, - success_ch_tx: Sender, // Channel to send successful outputs to the next node + node_rx: mpsc::Receiver, // Channel to listen to messages from parent nodes + child_nodes_txs: Vec>, // Channel to send successful outputs to the next node namespace: String, namespace_lookup: HashMap, // Copy of the look up table } trait NodeProcessor { - fn get_ch_to_listen_to( - &mut self, - ) -> &mut FuturesUnordered>>; - - async fn wait_for_node_task_completion(&mut self); + fn get_node_rx(&mut self) -> &mut mpsc::Receiver; - async fn start(&mut self, process_complete_ch_tx: oneshot::Sender<()>) -> Result<()> { + async fn start(&mut self) { // Start to listen to the channels // Listen to the MPSC channel and handle messages - while let Some(result) = self.get_ch_to_listen_to().next().await { - let repeater_result = match result { - Ok(rx_result) => rx_result, - Err(err) => { - // Record into pipeline_error log - if err.is_panic() { - eprintln!("Task panicked: {err}"); - } else { - eprintln!("Error receiving message: {err}"); - } - continue; - } - }; - - let msg = match repeater_result { - Ok(msg) => msg, - Err(RecvError::Closed) => { - // Channel is closed, we can exit the loop - eprintln!("Channel closed, exiting node processor"); - break; - } - Err(RecvError::Lagged(_)) => { - // Channel lagged, skip this message - eprintln!("Channel lagged, skipping message"); - continue; - } - }; - - // Process the message - if self.process_msg(msg).await? { + while let Some(msg) = self.get_node_rx().recv().await { + if self.process_msg(msg).await { // If the message indicates that processing is complete, we can exit the loop - // Wait for all processing tasks to complete before sending the completion message - + // Wait for all processing tasks to complete before returning self.wait_for_node_task_completion().await; - - // Send the node processing complete message - process_complete_ch_tx.send(()).map_err(|()| OrcaError { - kind: Kind::ReceiverDroppedBeforeSender { - backtrace: Some(Backtrace::capture()), - }, - })?; break; } } - - Ok(()) } - async fn process_msg(&mut self, msg: Message) -> Result; + async fn process_msg(&mut self, msg: Message) -> bool; + + async fn wait_for_node_task_completion(&mut self); } struct PodProcessor { @@ -453,32 +320,28 @@ impl PodProcessor { } } - fn process_packet( - node_id: &str, - pod: &Arc, - namespace: &str, - namespace_lookup: &HashMap, - packet: &HashMap, - success_ch_tx: &Sender, + async fn process_packet( + node_id: String, + pod: Arc, + namespace: String, + namespace_lookup: HashMap, + packet: HashMap, + success_chs_tx: Vec>, ) -> Result<()> { // Process the packet using the pod - println!( - "Processing packet in pod: {} with node_id: {}", - pod.hash, node_id - ); - // Create the pod_job - let mut buf = Vec::new(); - let mut serializer = Serializer::new(&mut buf); - match serialize_hashmap(packet, &mut serializer) { - Ok(_) => {} - Err(err) => { - println!("Failed to serialize packet: {err}"); - } - } - println!("managed to serialize packet: {:?}", buf); - let input_packet_hash = hash_buffer(buf); + // We need a unique hash for this given input packet process by the node + // therefore we need to generate a hash that has the pod_id + input_packet + let node_id_bytes = node_id.as_bytes().to_vec(); + let packet_copy = packet.clone(); + let input_packet_hash = spawn_blocking(move || { + let mut buf = node_id_bytes; + let mut serializer = Serializer::new(&mut buf); + serialize_hashmap(&packet_copy, &mut serializer)?; + Ok::<_, OrcaError>(hash_buffer(buf)) + }) + .await??; let output_dir = URI { namespace: namespace.to_owned(), path: PathBuf::from(format!("pod_runs/{}/{}", pod.hash, input_packet_hash)), @@ -488,22 +351,16 @@ impl PodProcessor { let memory_limit = pod.recommended_memory; // Create the pod job - let pod_job = match PodJob::new( + let pod_job = PodJob::new( None, - Arc::clone(pod), + Arc::clone(&pod), packet.clone(), output_dir, cpu_limit, memory_limit, None, - namespace_lookup, - ) { - Ok(job) => job, - Err(err) => { - println!("Failed to create pod job: {err}"); - panic!("Failed to create pod job: {err}"); - } - }; + &namespace_lookup, + )?; // Simulate pod execution by just printing out pod_job_hash and pod hash // This will be replaced by sending the pod_job to the orchestrator via the agent @@ -513,19 +370,21 @@ impl PodProcessor { ); // For now we will just send the input_packet to the success channel - match success_ch_tx.send(Message::NodeOutput(node_id.to_owned(), packet.clone())) { - Ok(_) => {} - Err(err) => { - println!("Failed to send message to success channel: {err}"); - } - } + try_join_all(success_chs_tx.iter().map(|success_ch_tx| { + success_ch_tx.send(Message::NodeOutput(node_id.to_owned(), packet.clone())) + })) + .await?; Ok(()) } } impl NodeProcessor for PodProcessor { - async fn process_msg(&mut self, msg: Message) -> Result { + fn get_node_rx(&mut self) -> &mut mpsc::Receiver { + &mut self.node_metadata.node_rx + } + + async fn process_msg(&mut self, msg: Message) -> bool { match msg { Message::NodeOutput(sender_node_id, packet) => { println!( @@ -536,39 +395,33 @@ impl NodeProcessor for PodProcessor { let node_id = self.node_metadata.node_id.clone(); let namespace = self.node_metadata.namespace.clone(); let namespace_lookup = self.node_metadata.namespace_lookup.clone(); - let success_ch_tx = self.node_metadata.success_ch_tx.clone(); + let child_nodes_txs = self.node_metadata.child_nodes_txs.clone(); // Forward it into a processing task - self.processing_tasks.spawn(async move { - Self::process_packet( - &node_id, - &pod_ref, - &namespace, - &namespace_lookup, - &packet, - &success_ch_tx, - ) - }); + self.processing_tasks.spawn(Self::process_packet( + node_id, + pod_ref, + namespace, + namespace_lookup, + packet, + child_nodes_txs, + )); } Message::Stop => { // Stop message received, we will stop processing self.processing_tasks.abort_all(); - return Ok(true); + return true; } Message::NodeProcessingComplete(_) => { + println!("Node processing complete"); // Since pod only have one parent, we can expect that there will be no more incoming packet // thus, we need to wait for everything to finish processing and send completion message // Return true to notify caller that processing is complete self.wait_for_node_task_completion().await; - return Ok(true); + return true; } } - Ok(false) - } - - fn get_ch_to_listen_to( - &mut self, - ) -> &mut FuturesUnordered>> { - &mut self.node_metadata.ch_to_listen_to + println!("returning false"); + false } async fn wait_for_node_task_completion(&mut self) { @@ -591,7 +444,7 @@ impl MapperProcessor { } } - fn process_packet(&self, packet: &HashMap) -> Result<()> { + async fn process_packet(&self, packet: &HashMap) -> Result<()> { // Apply the mapping to the input packet let output_map = self .mapper @@ -604,53 +457,42 @@ impl MapperProcessor { .collect::>>()?; // Send the output via the channel - self.node_metadata.success_ch_tx.send(Message::NodeOutput( - self.node_metadata.node_id.clone(), - output_map, - ))?; + try_join_all(self.node_metadata.child_nodes_txs.iter().map(|ch| { + ch.send(Message::NodeOutput( + self.node_metadata.node_id.clone(), + output_map.clone(), + )) + })) + .await?; Ok(()) } } impl NodeProcessor for MapperProcessor { - fn get_ch_to_listen_to( - &mut self, - ) -> &mut FuturesUnordered>> { - &mut self.node_metadata.ch_to_listen_to + fn get_node_rx(&mut self) -> &mut mpsc::Receiver { + &mut self.node_metadata.node_rx } async fn wait_for_node_task_completion(&mut self) { // Mapper doesn't spawn additional tasks, so this is a no-op } - async fn process_msg(&mut self, msg: Message) -> Result { + async fn process_msg(&mut self, msg: Message) -> bool { match msg { - Message::NodeOutput(sender_node_id, packet) => { - println!( - "Node {} received packet: {:?} from {}", - self.node_metadata.node_id, packet, sender_node_id - ); - let output_map = self - .mapper - .mapping - .iter() - .map(|(input_key, output_key)| { - let input = get(&packet, input_key)?.clone(); - Ok((output_key.to_owned(), input)) - }) - .collect::>>()?; - - // For now we will just send the input_packet to the success channel - self.node_metadata.success_ch_tx.send(Message::NodeOutput( - self.node_metadata.node_id.clone(), - output_map, - ))?; + Message::NodeOutput(_, packet) => { + match self.process_packet(&packet).await { + Ok(_) => {} + Err(err) => { + // Send the error to the failure channel + // For now just print it out + println!("Failed to process packet with error: {}", err); + } + } } - Message::NodeProcessingComplete(_) => return Ok(true), - Message::Stop => todo!(), + Message::NodeProcessingComplete(_) | Message::Stop => return true, } - Ok(false) + false } } @@ -709,7 +551,7 @@ impl JoinerProcessor { Ok(result) } - fn process_packet( + async fn process_packet( &mut self, sender_node_id: &str, packet: HashMap, @@ -734,10 +576,13 @@ impl JoinerProcessor { Ok(output_packets) => { // Send the output packets to the success channel for output_packet in output_packets { - self.node_metadata.success_ch_tx.send(Message::NodeOutput( - self.node_metadata.node_id.clone(), - output_packet, - ))?; + try_join_all(self.node_metadata.child_nodes_txs.iter().map(|ch| { + ch.send(Message::NodeOutput( + self.node_metadata.node_id.clone(), + output_packet.clone(), + )) + })) + .await?; } } Err(err) => { @@ -752,17 +597,15 @@ impl JoinerProcessor { } impl NodeProcessor for JoinerProcessor { - fn get_ch_to_listen_to( - &mut self, - ) -> &mut FuturesUnordered>> { - &mut self.node_metadata.ch_to_listen_to + fn get_node_rx(&mut self) -> &mut mpsc::Receiver { + &mut self.node_metadata.node_rx } async fn wait_for_node_task_completion(&mut self) { // Joiner doesn't spawn additional tasks, so this is a no-op } - async fn process_msg(&mut self, msg: Message) -> Result { + async fn process_msg(&mut self, msg: Message) -> bool { match msg { Message::NodeOutput(sender_node_id, packet) => { println!( @@ -770,7 +613,13 @@ impl NodeProcessor for JoinerProcessor { self.node_metadata.node_id, packet, sender_node_id ); // Process the packet and send the output to the success channel - self.process_packet(&sender_node_id, packet)?; + match self.process_packet(&sender_node_id, packet).await { + Ok(_) => {} + Err(err) => { + // Send the error to the failure channel + println!("Failed to process packet with error: {}", err); + } + } } Message::NodeProcessingComplete(sender_node_id) => { // Record that this parent node has completed processing @@ -780,15 +629,15 @@ impl NodeProcessor for JoinerProcessor { if self.completed_parents.len() == self.input_packet_cache.len() { // All parents have completed processing, we can send the output // Wait for all packets to be processed and send the output - return Ok(true); + return true; } } Message::Stop => { // We don't have anything to clean up, so we can just return - return Ok(true); + return true; } } - Ok(false) + false } } From 415eb054a800b7772cd535e7a738a63f983241f1 Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 10 Jul 2025 16:46:01 +0000 Subject: [PATCH 08/29] Added output handling --- src/uniffi/pipeline_runner/docker.rs | 75 ++++++++++++++++++++++++---- tests/pipeline_runner.rs | 2 +- 2 files changed, 66 insertions(+), 11 deletions(-) diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/docker.rs index 461edc8c..2f6cbf3f 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/docker.rs @@ -11,8 +11,15 @@ use futures_util::future::try_join_all; use itertools::Itertools as _; use serde_yaml::Serializer; use snafu::OptionExt as _; -use std::{collections::HashMap, path::PathBuf, sync::Arc, thread::sleep, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + path::PathBuf, + sync::Arc, + thread::sleep, + time::Duration, +}; use tokio::{ + net::unix::pipe, sync::{RwLock, mpsc}, task::{JoinSet, spawn_blocking}, }; @@ -74,6 +81,55 @@ impl DockerPipelineRunner { // Get reference to the pipeline let pipeline = &pipeline_run_arc.pipeline_job.pipeline; + // Create the output channel to capture the outputs of the outputs nodes (Currently only leaf nodes) + let (output_tx, mut output_rx) = mpsc::channel::(128); // Channel to capture outputs from nodes + + // Get the output_nodes (leaf nodes for now) so the output task can keep track when parents are done + let output_nodes_ids = pipeline + .get_leaf_nodes() + .map(|node| node.id.clone()) + .collect::>(); + let outputs = get(&self.pipeline_runs, &pipeline_run_arc)?.outputs.clone(); + + // Create the task that captures the output from the nodes and stores them in the outputs map + self.pipeline_runs + .get_mut(&pipeline_run_arc) + .context(selector::KeyMissing { + key: pipeline_run_arc.to_string(), + })? + .node_task_join_set + .spawn(async move { + let mut complete_parent_nodes = HashSet::new(); + while let Some(message) = output_rx.recv().await { + match message { + Message::NodeOutput(sender_node_id, hash_map) => { + // Store the output in the outputs map + outputs + .write() + .await + .entry(sender_node_id) + .or_default() + .push(hash_map); + } + Message::NodeProcessingComplete(sender_node_id) => { + // Add the sender node id to the complete parent nodes + complete_parent_nodes.insert(sender_node_id.clone()); + + // Check if all parent nodes are complete + if complete_parent_nodes.is_superset(&output_nodes_ids) { + // All parents are complete, we can exit this task + println!( + "All parent nodes are complete, stopping output capture task." + ); + return Ok(()); + } + } + Message::Stop => todo!(), + } + } + Ok(()) + }); + // Get all the root nodes and call the create_task_for_node function for each root node // This will recursively create all the tasks and channels for the pipeline let root_nodes_tx = pipeline @@ -158,12 +214,12 @@ impl DockerPipelineRunner { // Check if the node is already inside the node_tx if pipeline_info.node_tx.contains_key(&node.id) { // Node already exists, thus we can return the existing tx - return Ok(pipeline_info.node_tx.get(&node.id).unwrap().clone()); + return Ok(get(&pipeline_info.node_tx, &node.id)?.clone()); } // Record the tx into the pipeline_info tx_hashmap - pipeline_info.node_tx.insert(node.id.clone(), tx.clone()); - } + pipeline_info.node_tx.insert(node.id.clone(), tx.clone()) + }; // Call this function for each of the child nodes to get their Sender_tx let children_node_tx = pipeline_run @@ -264,8 +320,7 @@ impl DockerPipelineRunner { } } - // Notify that node is finish processing - println!("Node {} processing complete", node.id); + // Since all inputs are sent, we can send a message that the "input node" processing is complete for success_ch_tx in &success_chs_tx { success_ch_tx .send(Message::NodeProcessingComplete(node.id.clone())) @@ -343,7 +398,7 @@ impl PodProcessor { }) .await??; let output_dir = URI { - namespace: namespace.to_owned(), + namespace: namespace.clone(), path: PathBuf::from(format!("pod_runs/{}/{}", pod.hash, input_packet_hash)), }; @@ -371,7 +426,7 @@ impl PodProcessor { // For now we will just send the input_packet to the success channel try_join_all(success_chs_tx.iter().map(|success_ch_tx| { - success_ch_tx.send(Message::NodeOutput(node_id.to_owned(), packet.clone())) + success_ch_tx.send(Message::NodeOutput(node_id.clone(), packet.clone())) })) .await?; @@ -481,7 +536,7 @@ impl NodeProcessor for MapperProcessor { match msg { Message::NodeOutput(_, packet) => { match self.process_packet(&packet).await { - Ok(_) => {} + Ok(()) => {} Err(err) => { // Send the error to the failure channel // For now just print it out @@ -614,7 +669,7 @@ impl NodeProcessor for JoinerProcessor { ); // Process the packet and send the output to the success channel match self.process_packet(&sender_node_id, packet).await { - Ok(_) => {} + Ok(()) => {} Err(err) => { // Send the error to the failure channel println!("Failed to process packet with error: {}", err); diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index a349eb24..920e95da 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -25,7 +25,7 @@ async fn basic_run() -> Result<()> { )]))?; let namespace_lookup = test_dirs.namespace_lookup(); - let pipeline_run = runner.start(pipeline_job, &namespace_lookup)?; + let pipeline_run = runner.start(pipeline_job, &namespace_lookup).await?; // Wait for the pipeline run to complete let result = runner.get_result(&pipeline_run).await?; From fb8ace1dec0f03a6f1d31a2f961a0adcd97066ac Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 10 Jul 2025 17:01:58 +0000 Subject: [PATCH 09/29] Fix bugs and clippy suggestions --- src/core/error.rs | 5 +- src/uniffi/error.rs | 4 - src/uniffi/pipeline.rs | 8 +- src/uniffi/pipeline_runner/mod.rs | 2 +- .../pipeline_runner/{docker.rs => runner.rs} | 75 +++++++------------ tests/pipeline_runner.rs | 4 +- 6 files changed, 37 insertions(+), 61 deletions(-) rename src/uniffi/pipeline_runner/{docker.rs => runner.rs} (92%) diff --git a/src/core/error.rs b/src/core/error.rs index d2404663..614cd593 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -1,6 +1,6 @@ use crate::uniffi::{ error::{Kind, OrcaError}, - pipeline_runner::docker::Message, + pipeline_runner::runner::Message, }; use bollard::errors::Error as BollardError; use glob; @@ -126,8 +126,7 @@ fn format_stack(backtrace: Option<&Backtrace>) -> String { impl fmt::Debug for OrcaError { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match &self.kind { - Kind::ReceiverDroppedBeforeSender { backtrace, .. } - | Kind::EmptyResponseWhenLoadingContainerAltImage { backtrace, .. } + Kind::EmptyResponseWhenLoadingContainerAltImage { backtrace, .. } | Kind::FailedToParseDot { backtrace, .. } | Kind::GeneratedNamesOverflow { backtrace, .. } | Kind::InvalidFilepath { backtrace, .. } diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index 72cae77f..c52da3b7 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -24,10 +24,6 @@ pub type Result = result::Result; #[snafu(module(selector), visibility(pub(crate)), context(suffix(false)))] #[uniffi(flat_error)] pub(crate) enum Kind { - #[snafu(display( - "Receiver was dropped before sender could send a message for oneshot channel" - ))] - ReceiverDroppedBeforeSender { backtrace: Option }, #[snafu(display( "Received an empty response when attempting to load the alternate container image file: {path:?}." ))] diff --git a/src/uniffi/pipeline.rs b/src/uniffi/pipeline.rs index 2bb205a7..e634ae36 100644 --- a/src/uniffi/pipeline.rs +++ b/src/uniffi/pipeline.rs @@ -334,6 +334,7 @@ impl Pipeline { }) } + /// Function to get the children of a node pub fn get_children_for_node(&self, node: &Node) -> impl Iterator { // Find the NodeIndex for the given node_key let node_index = self @@ -393,7 +394,12 @@ impl PipelineJob { } } -#[derive(uniffi::Object, Display, Debug, Clone, Serialize)] +#[derive(uniffi::Object, Debug, Clone, Serialize)] +/// `PipelineResult` struct +/// This struct is used to return the result of a pipeline job pub struct PipelineResult { + /// Ref to the pipeline job that was executed pub pipeline_job: PipelineJob, + /// Output packets produced by the pipeline job + pub output_packets: HashMap>>, } diff --git a/src/uniffi/pipeline_runner/mod.rs b/src/uniffi/pipeline_runner/mod.rs index 67fa0663..a0d4812b 100644 --- a/src/uniffi/pipeline_runner/mod.rs +++ b/src/uniffi/pipeline_runner/mod.rs @@ -52,4 +52,4 @@ impl Hash for PipelineRun { } /// Docker pipeline runner -pub mod docker; +pub mod runner; diff --git a/src/uniffi/pipeline_runner/docker.rs b/src/uniffi/pipeline_runner/runner.rs similarity index 92% rename from src/uniffi/pipeline_runner/docker.rs rename to src/uniffi/pipeline_runner/runner.rs index 2f6cbf3f..74f440fa 100644 --- a/src/uniffi/pipeline_runner/docker.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -15,11 +15,8 @@ use std::{ collections::{HashMap, HashSet}, path::PathBuf, sync::Arc, - thread::sleep, - time::Duration, }; use tokio::{ - net::unix::pipe, sync::{RwLock, mpsc}, task::{JoinSet, spawn_blocking}, }; @@ -89,7 +86,7 @@ impl DockerPipelineRunner { .get_leaf_nodes() .map(|node| node.id.clone()) .collect::>(); - let outputs = get(&self.pipeline_runs, &pipeline_run_arc)?.outputs.clone(); + let outputs = Arc::clone(&get(&self.pipeline_runs, &pipeline_run_arc)?.outputs); // Create the task that captures the output from the nodes and stores them in the outputs map self.pipeline_runs @@ -134,7 +131,9 @@ impl DockerPipelineRunner { // This will recursively create all the tasks and channels for the pipeline let root_nodes_tx = pipeline .get_root_nodes() - .map(|node| self.create_task_for_node(node, &pipeline_run_arc, namespace_lookup)) + .map(|node| { + self.create_task_for_node(node, &pipeline_run_arc, &output_tx, namespace_lookup) + }) .collect::>>()?; // All pipeline tasks have been created, now we need to feed the inputs to the pipeline @@ -154,8 +153,6 @@ impl DockerPipelineRunner { .await?; } - sleep(Duration::from_secs(5)); // Give some time for the tasks to start - panic!(); Ok(pipeline_run) } @@ -189,6 +186,7 @@ impl DockerPipelineRunner { Ok(PipelineResult { pipeline_job: pipeline_run.pipeline_job.clone(), + output_packets: pipeline_run_info.outputs.read().await.clone(), }) } @@ -196,6 +194,7 @@ impl DockerPipelineRunner { &mut self, node: &Node, pipeline_run: &Arc, + output_tx: &mpsc::Sender, namespace_lookup: &HashMap, ) -> Result> { println!("Creating task for node: {}", node.id); @@ -222,13 +221,22 @@ impl DockerPipelineRunner { }; // Call this function for each of the child nodes to get their Sender_tx - let children_node_tx = pipeline_run + let mut children_node_tx = pipeline_run .pipeline_job .pipeline .get_children_for_node(node) - .map(|child_node| self.create_task_for_node(child_node, pipeline_run, namespace_lookup)) + .map(|child_node| { + self.create_task_for_node(child_node, pipeline_run, output_tx, namespace_lookup) + }) .collect::>>()?; + // Check if children_node_tx is empty, if so, this is a leaf node thus we need to attach the output_tx + if children_node_tx.is_empty() { + // This is a leaf node, thus we need to attach the output_tx to the tx + // This will allow the node to send its output to the output channel + children_node_tx.push(output_tx.clone()); + } + // Start the task_manager self.pipeline_runs .get_mut(pipeline_run) @@ -247,34 +255,6 @@ impl DockerPipelineRunner { Ok(tx) } - #[expect( - clippy::type_complexity, - reason = "too complex, but necessary for async handling" - )] - async fn capture_node_output( - mut output_rx: mpsc::Receiver, - outputs_ref: Arc>>>>, - ) -> Result<()> { - while let Some(msg) = output_rx.recv().await { - match msg { - Message::NodeOutput(node_id, hash_map) => { - // Record the output - outputs_ref - .write() - .await - .entry(node_id) - .or_default() - .push(hash_map); - } - Message::NodeProcessingComplete(_) | Message::Stop => { - // Node processing is complete, we can stop listening to this channel - break; - } - } - } - Ok(()) - } - /// For tx: Sender, we only want to send successfully completed results to the next node async fn start_node_manager( node: Node, @@ -327,6 +307,8 @@ impl DockerPipelineRunner { .await?; } + println!("Node manager for node: {} has completed", node.id); + Ok(()) } } @@ -441,11 +423,7 @@ impl NodeProcessor for PodProcessor { async fn process_msg(&mut self, msg: Message) -> bool { match msg { - Message::NodeOutput(sender_node_id, packet) => { - println!( - "Node {} received packet: {:?} from {}", - self.node_metadata.node_id, packet, sender_node_id - ); + Message::NodeOutput(_, packet) => { let pod_ref = Arc::clone(&self.pod); let node_id = self.node_metadata.node_id.clone(); let namespace = self.node_metadata.namespace.clone(); @@ -467,7 +445,6 @@ impl NodeProcessor for PodProcessor { return true; } Message::NodeProcessingComplete(_) => { - println!("Node processing complete"); // Since pod only have one parent, we can expect that there will be no more incoming packet // thus, we need to wait for everything to finish processing and send completion message // Return true to notify caller that processing is complete @@ -540,7 +517,7 @@ impl NodeProcessor for MapperProcessor { Err(err) => { // Send the error to the failure channel // For now just print it out - println!("Failed to process packet with error: {}", err); + println!("Failed to process packet with error: {err}"); } } } @@ -642,7 +619,9 @@ impl JoinerProcessor { } Err(err) => { // Send the error to the failure channel - todo!(); + println!( + "Failed to process packet from {sender_node_id} for joiner node with error: {err}" + ); } } // Add the new packet into the cache @@ -663,16 +642,12 @@ impl NodeProcessor for JoinerProcessor { async fn process_msg(&mut self, msg: Message) -> bool { match msg { Message::NodeOutput(sender_node_id, packet) => { - println!( - "Node {} received packet: {:?} from {}", - self.node_metadata.node_id, packet, sender_node_id - ); // Process the packet and send the output to the success channel match self.process_packet(&sender_node_id, packet).await { Ok(()) => {} Err(err) => { // Send the error to the failure channel - println!("Failed to process packet with error: {}", err); + println!("Failed to process packet with error: {err}"); } } } diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 920e95da..afd58eb2 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -7,7 +7,7 @@ pub mod fixture; // Example for a local module: use std::collections::HashMap; -use orcapod::uniffi::{error::Result, pipeline_runner::docker::DockerPipelineRunner}; +use orcapod::uniffi::{error::Result, pipeline_runner::runner::DockerPipelineRunner}; use crate::fixture::TestDirs; use fixture::pipeline_job; @@ -29,6 +29,6 @@ async fn basic_run() -> Result<()> { // Wait for the pipeline run to complete let result = runner.get_result(&pipeline_run).await?; - println!("Pipeline run result: {result:?}"); + println!("Pipeline run result: {:?}", result.output_packets); Ok(()) } From 08453b3bc94fd503dd074026ddbf756c1e32f324 Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 10 Jul 2025 18:02:54 +0000 Subject: [PATCH 10/29] Added a lot of docs and fix input.txt issue for test --- src/uniffi/pipeline_runner/runner.rs | 205 ++++++++++++++++++++++----- tests/extra/data/input.txt | 0 tests/fixture/mod.rs | 2 +- tests/pipeline_runner.rs | 26 +++- 4 files changed, 194 insertions(+), 39 deletions(-) create mode 100644 tests/extra/data/input.txt diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 74f440fa..af72e5d1 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -40,7 +40,16 @@ struct PipelineRunInfo { outputs: Arc>>>>, // String is the node key, while hash } -/// Docker based pipeline runner meant to execute on a single machine +/** + * Runner for pipelines + * + * General Algorithm: + * 1. All nodes receive inputs via a MPSC channel, where parents nodes will send their output packets + * 2. There are two "functional nodes processor" in the pipeline, + * which is the `input_node` and `output_node` + * 3. Each node will process the inputs its receives and will only send it children input channels + * if they are successfully processed. Failures are just printed for now (Will be replaced by logging) + */ #[derive(Default)] pub struct DockerPipelineRunner { pipeline_runs: HashMap, // For each pipeline run, we have a join set to track the tasks and wait on them @@ -52,10 +61,28 @@ impl DockerPipelineRunner { Self::default() } - /// Start the `pipeline_job` returning `pipeline_run`un - /// - /// # Errors - /// Will error out if the pipeline job fails to start + /** + Start the `pipeline_job` returning `pipeline_run`un + + Algorithm: + 1. Create a new `PipelineRun` from the `pipeline_job` + 2. Insert the `PipelineRun` into the `pipeline_runs` map + 3. Create an output channel to capture the outputs of the nodes + (This will be given to the output capture task) + 4. Create a task that captures the outputs form nodes and stores them in the `outputs` map + This is done via listening the channel and acting like a final node in the pipeline + 5. Get the root nodes of the pipeline and call `create_task_for_node` for each root node + This will recursively BFS through the pipeline and create tasks for each node + (More detail in that function) + 6. Using the `root_nodes` txs, we will send all inputs to that channel. + This will start the pipeline execution + 7. Upon sending all the inputs, we will send node complete message + signifying that the `input_node` is done + 8. Return the `PipelineRun` which can be used to get the results later + + # Errors + Will error out if the pipeline job fails to start + */ pub async fn start( &mut self, pipeline_job: PipelineJob, @@ -81,6 +108,15 @@ impl DockerPipelineRunner { // Create the output channel to capture the outputs of the outputs nodes (Currently only leaf nodes) let (output_tx, mut output_rx) = mpsc::channel::(128); // Channel to capture outputs from nodes + // Insert the output channel into the pipeline run info + self.pipeline_runs + .get_mut(&pipeline_run_arc) + .context(selector::KeyMissing { + key: pipeline_run_arc.to_string(), + })? + .node_tx + .insert("output".to_owned(), output_tx.clone()); + // Get the output_nodes (leaf nodes for now) so the output task can keep track when parents are done let output_nodes_ids = pipeline .get_leaf_nodes() @@ -115,13 +151,13 @@ impl DockerPipelineRunner { // Check if all parent nodes are complete if complete_parent_nodes.is_superset(&output_nodes_ids) { // All parents are complete, we can exit this task - println!( - "All parent nodes are complete, stopping output capture task." - ); return Ok(()); } } - Message::Stop => todo!(), + Message::Stop => { + // No clear action needed, just exit the task + return Ok(()); + } } } Ok(()) @@ -190,6 +226,50 @@ impl DockerPipelineRunner { }) } + /// Stop the pipeline run and all its tasks + /// # Errors + /// Will error out if the pipeline run is not found or if any of the tasks fail to stop correctly + pub async fn stop(&mut self, pipeline_run: &PipelineRun) -> Result<()> { + // Get the pipeline run info + let pipeline_run_info = + self.pipeline_runs + .get_mut(pipeline_run) + .context(selector::KeyMissing { + key: pipeline_run.to_string(), + })?; + + // Send a stop message to all the node txs + for tx in pipeline_run_info.node_tx.values() { + tx.send(Message::Stop).await?; + } + + // Wait for all tasks to complete + while let Some(result) = pipeline_run_info.node_task_join_set.join_next().await { + match result { + Ok(Ok(())) => {} // Task completed successfully + Ok(Err(err)) => { + eprintln!("Task failed: {err}"); + return Err(err); + } + Err(err) => { + eprintln!("Join set error: {err}"); + return Err(err.into()); + } + } + } + + // Remove the pipeline run from the list of pipeline runs + self.pipeline_runs.remove(pipeline_run); + + Ok(()) + } + + /// Helper function to create a task for each node, while recursively BFS through the pipeline + /// Summary: + /// 1. Check if their is already a channel created for the node, if not create one and insert it + /// 2. Call this function for each of the child nodes to get their `Sender_tx` + /// 3. If the node is a leaf node, attach the `output_tx` to the tx (Will be replaced by `output_nodes`) + /// 4. Start the task manager for the node, which will act as the node's processor fn create_task_for_node( &mut self, node: &Node, @@ -199,11 +279,9 @@ impl DockerPipelineRunner { ) -> Result> { println!("Creating task for node: {}", node.id); // Create a channel for the node - // This channel will be used to send messages to the node processor - let (tx, rx) = mpsc::channel::(128); // Use closer to limit the scope of the borrow - { + let (tx, rx) = { let pipeline_info = self.pipeline_runs .get_mut(pipeline_run) @@ -216,8 +294,12 @@ impl DockerPipelineRunner { return Ok(get(&pipeline_info.node_tx, &node.id)?.clone()); } + // This channel will be used to send messages to the node processor + let (tx, rx) = mpsc::channel::(128); + // Record the tx into the pipeline_info tx_hashmap - pipeline_info.node_tx.insert(node.id.clone(), tx.clone()) + pipeline_info.node_tx.insert(node.id.clone(), tx.clone()); + (tx, rx) }; // Call this function for each of the child nodes to get their Sender_tx @@ -255,7 +337,14 @@ impl DockerPipelineRunner { Ok(tx) } - /// For tx: Sender, we only want to send successfully completed results to the next node + /// Act as the processor of the node by: + /// 1. Creating a metadata struct for the node to be passed to the appropriate processor + /// 2. Get the kernel for the node and build the correct processor for this node + /// 3. Start the processor and wait till it completes + /// 4. Send a message that the node processing is complete + /// + /// # Errors + /// Will error out if the kernel for the node is not found or if the async fn start_node_manager( node: Node, pipeline_run: Arc, @@ -263,7 +352,6 @@ impl DockerPipelineRunner { success_chs_tx: Vec>, namespace_lookup: HashMap, ) -> Result<()> { - println!("Starting node manager for node: {}", node.id); // Create a metadata struct for this node let node_metadata = NodeMetaData { node_id: node.id.clone(), @@ -273,8 +361,6 @@ impl DockerPipelineRunner { namespace_lookup: namespace_lookup.clone(), }; - println!("Setting up node processor for node: {}", node.id); - // Get the kernel for this node and build the correct processor match get( &pipeline_run.pipeline_job.pipeline.kernel_lut, @@ -302,17 +388,31 @@ impl DockerPipelineRunner { // Since all inputs are sent, we can send a message that the "input node" processing is complete for success_ch_tx in &success_chs_tx { - success_ch_tx + match success_ch_tx .send(Message::NodeProcessingComplete(node.id.clone())) - .await?; + .await + { + Ok(()) => {} + Err(err) => { + match err { + mpsc::error::SendError(Message::NodeProcessingComplete(_)) => { + // The channel is closed, we can ignore this error, this happens when stop it called + eprintln!("Failed to send processing complete message, channel closed"); + } + _ => { + eprintln!("Failed to send processing complete message: {err}"); + } + } + } + } } - println!("Node manager for node: {} has completed", node.id); - Ok(()) } } +/// Metadata for the node processor +/// Contains fields that is normally needed to process incoming packets struct NodeMetaData { node_id: String, node_rx: mpsc::Receiver, // Channel to listen to messages from parent nodes @@ -321,6 +421,11 @@ struct NodeMetaData { namespace_lookup: HashMap, // Copy of the look up table } +/// Unify the interface for node processors and provide a common way to handle processing of incoming messages +/// This trait defines the methods that all node processors should implement +/// +/// Main purpose was to reduce the amount of code duplication between different node processors +/// As a result, each processor only needs to worry about writing their own function to process the msg. trait NodeProcessor { fn get_node_rx(&mut self) -> &mut mpsc::Receiver; @@ -342,6 +447,8 @@ trait NodeProcessor { async fn wait_for_node_task_completion(&mut self); } +/// Processor for Pods +/// Currently missing implementation to call agents for actual pod processing struct PodProcessor { pod: Arc, node_metadata: NodeMetaData, @@ -357,6 +464,8 @@ impl PodProcessor { } } + /// Actual logic of processing a packet using the pod + /// At the moment it does a simulation of pod execution async fn process_packet( node_id: String, pod: Arc, @@ -402,13 +511,24 @@ impl PodProcessor { // Simulate pod execution by just printing out pod_job_hash and pod hash // This will be replaced by sending the pod_job to the orchestrator via the agent println!( - "Executing pod job: {} with pod hash: {}", + "Simulating Executing pod job: {} with pod hash: {}", pod_job.hash, pod_job.pod.hash ); + #[expect( + clippy::unwrap_used, + reason = "Hard code for now, will be replaced by agent" + )] + // Build the output_packet + let output_packet = pod + .output_spec + .keys() + .map(|output_key| (output_key.clone(), packet.values().next().cloned().unwrap())) + .collect::>(); + // For now we will just send the input_packet to the success channel try_join_all(success_chs_tx.iter().map(|success_ch_tx| { - success_ch_tx.send(Message::NodeOutput(node_id.clone(), packet.clone())) + success_ch_tx.send(Message::NodeOutput(node_id.clone(), output_packet.clone())) })) .await?; @@ -430,14 +550,25 @@ impl NodeProcessor for PodProcessor { let namespace_lookup = self.node_metadata.namespace_lookup.clone(); let child_nodes_txs = self.node_metadata.child_nodes_txs.clone(); // Forward it into a processing task - self.processing_tasks.spawn(Self::process_packet( - node_id, - pod_ref, - namespace, - namespace_lookup, - packet, - child_nodes_txs, - )); + self.processing_tasks.spawn(async move { + // Process the packet using the pod + // This will execute the pod and send the output to the next node + if let Err(err) = Self::process_packet( + node_id, + pod_ref, + namespace, + namespace_lookup, + packet, + child_nodes_txs, + ) + .await + { + // Send the error to the failure channel + // For now just print it out + eprintln!("Failed to process packet with error: {err}"); + } + Ok(()) + }); } Message::Stop => { // Stop message received, we will stop processing @@ -452,7 +583,6 @@ impl NodeProcessor for PodProcessor { return true; } } - println!("returning false"); false } @@ -463,6 +593,8 @@ impl NodeProcessor for PodProcessor { } } +/// Processor for Mapper nodes +/// This processor renames the `input_keys` from the input packet to the `output_keys` defined by the map struct MapperProcessor { mapper: Arc, node_metadata: NodeMetaData, @@ -517,7 +649,7 @@ impl NodeProcessor for MapperProcessor { Err(err) => { // Send the error to the failure channel // For now just print it out - println!("Failed to process packet with error: {err}"); + eprintln!("Failed to process packet with error: {err}"); } } } @@ -528,6 +660,9 @@ impl NodeProcessor for MapperProcessor { } } +/// Processor for Joiner nodes +/// This processor combines packets from multiple parent nodes into a single output packet +/// It uses a cartesian product to combine packets from different parents struct JoinerProcessor { /// Cache for all packets received by the node input_packet_cache: HashMap>>, @@ -619,7 +754,7 @@ impl JoinerProcessor { } Err(err) => { // Send the error to the failure channel - println!( + eprintln!( "Failed to process packet from {sender_node_id} for joiner node with error: {err}" ); } @@ -647,7 +782,7 @@ impl NodeProcessor for JoinerProcessor { Ok(()) => {} Err(err) => { // Send the error to the failure channel - println!("Failed to process packet with error: {err}"); + eprintln!("Failed to process packet with error: {err}"); } } } diff --git a/tests/extra/data/input.txt b/tests/extra/data/input.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index 150babaf..0e577e4f 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -287,7 +287,7 @@ pub fn pipeline_job() -> Result { kind: BlobKind::File, location: URI { namespace: "default".to_owned(), - path: PathBuf::from("data/input.txt"), + path: PathBuf::from("input.txt"), }, ..Default::default() }), diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index afd58eb2..37fcdecf 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -12,7 +12,7 @@ use orcapod::uniffi::{error::Result, pipeline_runner::runner::DockerPipelineRunn use crate::fixture::TestDirs; use fixture::pipeline_job; -#[tokio::test(flavor = "multi_thread", worker_threads = 16)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn basic_run() -> Result<()> { let pipeline_job = pipeline_job()?; @@ -28,7 +28,27 @@ async fn basic_run() -> Result<()> { let pipeline_run = runner.start(pipeline_job, &namespace_lookup).await?; // Wait for the pipeline run to complete - let result = runner.get_result(&pipeline_run).await?; - println!("Pipeline run result: {:?}", result.output_packets); + runner.get_result(&pipeline_run).await?; + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn stop() -> Result<()> { + let pipeline_job = pipeline_job()?; + + // Create the runner + let mut runner = DockerPipelineRunner::new(); + + let test_dirs = TestDirs::new(&HashMap::from([( + "default".to_owned(), + Some("./tests/extra/data/"), + )]))?; + let namespace_lookup = test_dirs.namespace_lookup(); + + let pipeline_run = runner.start(pipeline_job, &namespace_lookup).await?; + + // Abort the pipeline run + runner.stop(&pipeline_run).await?; + Ok(()) } From 713e4710ee4af95cd417bab0c617658eabac931b Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 10 Jul 2025 18:19:25 +0000 Subject: [PATCH 11/29] fix mistake in docs --- src/uniffi/pipeline_runner/runner.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index af72e5d1..87aaa649 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -62,7 +62,7 @@ impl DockerPipelineRunner { } /** - Start the `pipeline_job` returning `pipeline_run`un + Start the `pipeline_job` returning `pipeline_run` Algorithm: 1. Create a new `PipelineRun` from the `pipeline_job` From d3701c92900e9644cc8d52defe30c2dd750a3095 Mon Sep 17 00:00:00 2001 From: synicix Date: Tue, 15 Jul 2025 05:57:27 +0000 Subject: [PATCH 12/29] Added joiner processor unit test and fix bug for case where we don't have all the parents yet --- src/core/pipeline.rs | 0 src/uniffi/pipeline.rs | 9 +- src/uniffi/pipeline_runner/runner.rs | 132 ++++++++++++++++++++++++++- tests/fixture/mod.rs | 53 ++++++++--- tests/pipeline.rs | 15 ++- tests/pipeline_runner.rs | 1 + 6 files changed, 186 insertions(+), 24 deletions(-) create mode 100644 src/core/pipeline.rs diff --git a/src/core/pipeline.rs b/src/core/pipeline.rs new file mode 100644 index 00000000..e69de29b diff --git a/src/uniffi/pipeline.rs b/src/uniffi/pipeline.rs index e634ae36..3476ce07 100644 --- a/src/uniffi/pipeline.rs +++ b/src/uniffi/pipeline.rs @@ -12,7 +12,7 @@ use petgraph::{graph::DiGraph, prelude::NodeIndex}; use serde::Serialize; use std::{ backtrace::Backtrace, - collections::HashMap, + collections::{HashMap, HashSet}, hash::{Hash, Hasher}, string::String, sync::Arc, @@ -243,8 +243,9 @@ impl Pipeline { }) } - /// Returns the input specification for the pipeline. - /// This is currently a combination of all the root nodes' input specifications. + /// Returns the input specification for the pipeline, where the specification is a list of unique + /// keys that are required as input to the pipeline. + /// /// # Errors /// Will error out if it fails to get the kernel from the kernel lookup table pub fn get_input_spec(&self) -> Result> { @@ -254,7 +255,7 @@ impl Pipeline { Ok(self .get_root_nodes() .map(|node| Ok(get(&self.kernel_lut, &node.kernel_hash)?.get_input_keys())) - .collect::>>()? + .collect::>>()? .into_iter() .flatten() .collect()) diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 87aaa649..4085ffac 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -426,7 +426,7 @@ struct NodeMetaData { /// /// Main purpose was to reduce the amount of code duplication between different node processors /// As a result, each processor only needs to worry about writing their own function to process the msg. -trait NodeProcessor { +pub(crate) trait NodeProcessor { fn get_node_rx(&mut self) -> &mut mpsc::Receiver; async fn start(&mut self) { @@ -668,6 +668,7 @@ struct JoinerProcessor { input_packet_cache: HashMap>>, completed_parents: Vec, node_metadata: NodeMetaData, + initial_computation_completed: bool, } impl JoinerProcessor { @@ -680,11 +681,12 @@ impl JoinerProcessor { input_packet_cache, node_metadata, completed_parents: Vec::new(), + initial_computation_completed: false, } } fn compute_new_packet_combination( - &self, + &mut self, sender_node_id: &str, new_packet: &HashMap, ) -> Result>> { @@ -694,12 +696,36 @@ impl JoinerProcessor { .input_packet_cache .keys() .filter(|key| *key != sender_node_id); + + // Create a vector to hold the incoming packet + // This will be used to compute the cartesian product and will be modified if the initial computation is not completed + let mut incoming_packet = vec![new_packet.clone()]; + + // Determine if the initial computation has been computed + if !self.initial_computation_completed { + // Check if we at least have one cached packet for each of the other parents + for parent_id in other_parent_ids.clone() { + if get(&self.input_packet_cache, parent_id)?.is_empty() { + // We are still missing other parents, so we can't compute the new packet combination yet + return Ok(Vec::new()); + } + } + + // We have at least one packet for each of the other parents, thus we can compute the cartesian product + // For the initial computation, we will add all of the add all previous packets for this sender + get(&self.input_packet_cache, &sender_node_id.to_owned())? + .iter() + .for_each(|packet| incoming_packet.push(packet.clone())); + + self.initial_computation_completed = true; + } + let mut factors = other_parent_ids .map(|id| get(&self.input_packet_cache, id)) .collect::>>()?; // Add the new incoming packet as a factor - let incoming_packet = vec![new_packet.clone()]; + factors.push(&incoming_packet); let result = factors @@ -806,3 +832,103 @@ impl NodeProcessor for JoinerProcessor { false } } + +#[cfg(test)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[expect(clippy::panic_in_result_fn, reason = "Unit test")] +async fn joiner() -> Result<()> { + // Create a fake mpsc channel for the node + let (_, node_rx) = mpsc::channel::(128); + + // Create a child mpsc + let (child_tx, mut child_rx) = mpsc::channel::(128); + + let node_metadata = NodeMetaData { + node_id: "joiner_node".to_owned(), + node_rx, + child_nodes_txs: vec![child_tx], + namespace: "test".to_owned(), + namespace_lookup: HashMap::new(), + }; + + let mut joiner_process = JoinerProcessor::new( + vec!["0".to_owned(), "1".to_owned(), "2".to_owned()], + node_metadata, + ); + + // Make each parent has 1 packet + for idx in 0..2 { + joiner_process + .process_packet( + &format!("{idx}"), + make_test_packet("data_1.txt".to_owned().into()), + ) + .await?; + } + + // Confirm that there should be no output yet + + // Now we send the missing parent package + // This will yield one unique combination + joiner_process + .process_packet("2", make_test_packet("data_1.txt".to_owned().into())) + .await?; + + // Confirm that the output is sent to the child channel + assert!( + child_rx.len() == 1, + "Should have only one message in the channel", + ); + assert!( + child_rx.recv().await.is_some(), + "Should have received a message" + ); + + // Insert another one + joiner_process + .process_packet("2", make_test_packet("data_2.txt".to_owned().into())) + .await?; + + // The joiner node should send another one + assert!( + child_rx.len() == 1, + "Should have only one message in the channel", + ); + assert!( + child_rx.recv().await.is_some(), + "Should have received a message" + ); + + // Now insert to packet for parent 0, which should yield 2 packets in total + // This is because of the cartesian product + joiner_process + .process_packet("0", make_test_packet("data_2.txt".to_owned().into())) + .await?; + + assert!( + child_rx.len() == 2, + "Should have only two messages in the channel", + ); + assert!( + child_rx.recv().await.is_some(), + "Should have received a message" + ); + + Ok(()) +} + +#[cfg(test)] +fn make_test_packet(path: PathBuf) -> HashMap { + use crate::uniffi::model::{Blob, BlobKind}; + + let path_set = PathSet::Unary(Blob { + kind: BlobKind::File, + location: URI { + namespace: "test".to_owned(), + path, + }, + checksum: String::new(), + }); + + HashMap::from([("key".to_owned(), path_set)]) +} diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index 0e577e4f..cf252ffc 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -190,7 +190,7 @@ pub fn container_image_style(binary_location: impl AsRef) -> Result Result { +pub fn append_name_pod(pod_name: &str) -> Result { Pod::new( Some(Annotation { name: pod_name.to_owned(), @@ -225,21 +225,30 @@ pub fn pod_append_name(pod_name: &str) -> Result { pub fn pipeline() -> Result { // Create a simple pipeline where the functions job is to add append their name into the input file - // Structure: A -> B -> C + // Structure: A -> Mapper -> Joiner -> B -> Mapper -> C, D -> Mapper -> Joiner // Create the components of the pipeline - let pod_a = pod_append_name("A")?; - let pod_b = pod_append_name("B")?; - let pod_c = pod_append_name("C")?; + let pod_a = append_name_pod("A")?; + let pod_b = append_name_pod("B")?; + let pod_c = append_name_pod("C")?; + let pod_d = append_name_pod("D")?; + // Create the file mapper that will be used to map the output of one pod to the input of another let file_mapper = Mapper::new(HashMap::from([( "output_text".to_owned(), "input_text".to_owned(), )]))?; + + // Create the file mapper that will be used to map the output of one pod to the input of another + let file_mapper_for_pod_d = Mapper::new(HashMap::from([( + "output_text".to_owned(), + "input2_text".to_owned(), + )]))?; + let mut kernel_to_node_name = HashMap::>::new(); // Insert the pods into the kernel_to_node_name mapping - for pod in [&pod_a, &pod_b, &pod_c] { + for pod in [&pod_a, &pod_b, &pod_c, &pod_d] { kernel_to_node_name .entry(pod.clone().into()) .or_default() @@ -252,18 +261,34 @@ pub fn pipeline() -> Result { ); } - // Insert the mapping next - for idx in 0..2 { - kernel_to_node_name - .entry(file_mapper.clone().into()) - .or_default() - .push("file_mapper_".to_owned() + &idx.to_string()); - } + // Add mapper to end of pod_a and pod_b + kernel_to_node_name + .entry(file_mapper.clone().into()) + .or_default() + .push("pod_a_mapper".to_owned()); + + kernel_to_node_name + .entry(file_mapper.into()) + .or_default() + .push("pod_b_mapper".to_owned()); + + // Insert mapper for pod_d + kernel_to_node_name + .entry(file_mapper_for_pod_d.into()) + .or_default() + .push("pod_d_mapper".to_owned()); + + // Add the joiner + kernel_to_node_name + .entry(Kernel::Joiner) + .or_default() + .push("pod_b_joiner".to_owned()); // Write all the edges in DOT format let dot = " digraph { - A -> file_mapper_0 -> B -> file_mapper_1 -> C; + A -> pod_a_mapper -> pod_b_joiner -> B -> pod_b_mapper -> C; + D -> pod_d_mapper -> pod_b_joiner; } "; diff --git a/tests/pipeline.rs b/tests/pipeline.rs index 4150deeb..70de9487 100644 --- a/tests/pipeline.rs +++ b/tests/pipeline.rs @@ -33,8 +33,8 @@ fn creation() -> Result<()> { // so graph of 5, and 4 kernels due to the mapping being repeated assert_eq!( pipeline.kernel_lut.len(), - 4, - "Kernel LUT should have exactly 4 entries." + 7, + "Kernel LUT should have exactly 7 entries." ); Ok(()) @@ -44,7 +44,7 @@ fn creation() -> Result<()> { fn root_nodes() -> Result<()> { let pipeline = pipeline()?; - assert_eq!(pipeline.get_root_nodes().count(), 1); + assert_eq!(pipeline.get_root_nodes().count(), 2); Ok(()) } @@ -65,6 +65,15 @@ fn get_parents_key_for_node() -> Result<()> { Ok(()) } +#[test] +fn get_childen_for_node() -> Result<()> { + let pipeline = pipeline()?; + let node_key = pipeline.get_root_nodes().next().unwrap(); + + assert_eq!(pipeline.get_children_for_node(node_key).count(), 1); + Ok(()) +} + #[test] fn get_input_spec() -> Result<()> { let pipeline = pipeline()?; diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 37fcdecf..d3b816af 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -1,4 +1,5 @@ #![expect(missing_docs, reason = "OK in tests.")] + // If 'fixture' is a local module, ensure there is a 'mod fixture;' statement or a 'fixture.rs' file in the same directory or in 'tests/'. // If 'fixture' is an external crate, add it to Cargo.toml and import as shown below. // use fixture::pipeline_job; From b911f7972ea4a0fb0031e7bc31f6e77bd5808e34 Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 18 Jul 2025 03:47:16 +0000 Subject: [PATCH 13/29] Save progress --- Cargo.toml | 1 + src/core/orchestrator/agent.rs | 1 + src/uniffi/orchestrator/agent.rs | 1 - src/uniffi/pipeline_runner/mod.rs | 40 --- src/uniffi/pipeline_runner/runner.rs | 382 ++++++++------------------- tests/agent.rs | 1 - tests/pipeline_runner.rs | 2 +- 7 files changed, 106 insertions(+), 322 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1340c5d7..4b1ff057 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ crate-type = ["rlib", "cdylib"] [dependencies] # make async fn in traits work with dyn traits async-trait = "0.1.88" +bitcode = "0.6.6" # docker API in orchestrator bollard = "0.17.1" # datetime utilities diff --git a/src/core/orchestrator/agent.rs b/src/core/orchestrator/agent.rs index a739b90f..bd93077c 100644 --- a/src/core/orchestrator/agent.rs +++ b/src/core/orchestrator/agent.rs @@ -102,6 +102,7 @@ impl AgentClient { #[expect( clippy::let_underscore_must_use, + clippy::excessive_nesting, reason = "`result::Result<(), SendError<_>>` is the only uncaptured result since it would mean we can't transmit results over mpsc." )] pub async fn start_service< diff --git a/src/uniffi/orchestrator/agent.rs b/src/uniffi/orchestrator/agent.rs index 8a55f795..bcf48fcc 100644 --- a/src/uniffi/orchestrator/agent.rs +++ b/src/uniffi/orchestrator/agent.rs @@ -140,7 +140,6 @@ impl Agent { /// # Errors /// /// Will stop and return an error if encounters an error while processing any pod job request. - #[expect(clippy::excessive_nesting, reason = "Nesting manageable.")] pub async fn start( &self, namespace_lookup: &HashMap, diff --git a/src/uniffi/pipeline_runner/mod.rs b/src/uniffi/pipeline_runner/mod.rs index a0d4812b..c4a8e880 100644 --- a/src/uniffi/pipeline_runner/mod.rs +++ b/src/uniffi/pipeline_runner/mod.rs @@ -1,8 +1,6 @@ use crate::uniffi::error::Result; use super::pipeline::PipelineJob; -use std::fmt; -use std::hash::{Hash, Hasher}; /// # Errors: /// Error out if fail to start the pipeline job @@ -13,43 +11,5 @@ pub trait PipelineRunner { /// Returns an error if the pipeline job fails to start. fn start(&self, pipeline_job: PipelineJob) -> Result<()>; } - -#[derive(Debug, Clone)] -/// Struct to store the active pipeline run. -pub struct PipelineRun { - pipeline_job: PipelineJob, -} - -impl fmt::Display for PipelineRun { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "PipelineRun {{ pipeline_job: {} }}", - self.pipeline_job.hash - ) - } -} - -impl PipelineRun { - /// New function to initialize the pipeline run - pub const fn new(pipeline_job: PipelineJob) -> Self { - Self { pipeline_job } - } -} - -impl PartialEq for PipelineRun { - fn eq(&self, other: &Self) -> bool { - self.pipeline_job.hash == other.pipeline_job.hash - } -} - -impl Eq for PipelineRun {} - -impl Hash for PipelineRun { - fn hash(&self, state: &mut H) { - self.pipeline_job.hash.hash(state); - } -} - /// Docker pipeline runner pub mod runner; diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 4085ffac..8a1475be 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -1,4 +1,3 @@ -use super::PipelineRun; use crate::{ core::{crypto::hash_buffer, model::serialize_hashmap, util::get}, uniffi::{ @@ -7,12 +6,14 @@ use crate::{ pipeline::{Kernel, Mapper, Node, PipelineJob, PipelineResult}, }, }; +use bitcode::{Decode, Encode}; use futures_util::future::try_join_all; use itertools::Itertools as _; use serde_yaml::Serializer; use snafu::OptionExt as _; use std::{ collections::{HashMap, HashSet}, + hash::{Hash, Hasher}, path::PathBuf, sync::Arc, }; @@ -21,7 +22,7 @@ use tokio::{ task::{JoinSet, spawn_blocking}, }; -#[derive(Clone, Debug)] +#[derive(Encode, Decode, Clone, Debug)] pub(crate) enum Message { /// String is the `parent_node_id`, while `HashMap` is output of the parent node NodeOutput(String, HashMap), @@ -34,12 +35,27 @@ pub(crate) enum Message { clippy::type_complexity, reason = "too complex, but necessary for async handling" )] -struct PipelineRunInfo { - node_task_join_set: JoinSet>, // Join set to track the tasks for this pipeline run - node_tx: HashMap>, +#[derive(Debug, Clone)] +pub struct PipelineRun { + /// PipelineJob that this run is associated with + pub pipeline_job: PipelineJob, // The pipeline job that this run is associated with outputs: Arc>>>>, // String is the node key, while hash } +impl PartialEq for PipelineRun { + fn eq(&self, other: &Self) -> bool { + self.pipeline_job.hash == other.pipeline_job.hash + } +} + +impl Eq for PipelineRun {} + +impl Hash for PipelineRun { + fn hash(&self, state: &mut H) { + self.pipeline_job.hash.hash(state); + } +} + /** * Runner for pipelines * @@ -52,9 +68,17 @@ struct PipelineRunInfo { */ #[derive(Default)] pub struct DockerPipelineRunner { - pipeline_runs: HashMap, // For each pipeline run, we have a join set to track the tasks and wait on them + pipeline_runs: HashSet>, } +/** + * This is an implementation of a pipeline runner that uses Zenoh to communicate between the tasks + * The runtime is tokio + * + * These are the key expressions of the components of the pipeline: + * - Input Node: pipeline_job_hash/input_node/outputs (This is where the pipeline_job packets get fed to) + * - Nodes: pipeline_job_hash/node_id/outputs/(success|failure) (This is where the node outputs are sent to) +*/ impl DockerPipelineRunner { /// Create a new Docker pipeline runner pub fn new() -> Self { @@ -87,90 +111,17 @@ impl DockerPipelineRunner { &mut self, pipeline_job: PipelineJob, namespace_lookup: &HashMap, - ) -> Result { + ) -> Result<&PipelineRun> { // Create a new pipeline run - let pipeline_run = PipelineRun { pipeline_job }; - let pipeline_run_arc = Arc::new(pipeline_run.clone()); - - // Insert into the list of pipeline runs - self.pipeline_runs.insert( - (*pipeline_run_arc).clone(), - PipelineRunInfo { - node_tx: HashMap::new(), - node_task_join_set: JoinSet::new(), - outputs: Arc::new(RwLock::new(HashMap::new())), - }, - ); + let pipeline_run = Arc::new(PipelineRun { + pipeline_job, + outputs: Arc::new(RwLock::new(HashMap::new())), + }); // Get reference to the pipeline - let pipeline = &pipeline_run_arc.pipeline_job.pipeline; - - // Create the output channel to capture the outputs of the outputs nodes (Currently only leaf nodes) - let (output_tx, mut output_rx) = mpsc::channel::(128); // Channel to capture outputs from nodes + let pipeline = &pipeline_run.pipeline_job.pipeline; - // Insert the output channel into the pipeline run info - self.pipeline_runs - .get_mut(&pipeline_run_arc) - .context(selector::KeyMissing { - key: pipeline_run_arc.to_string(), - })? - .node_tx - .insert("output".to_owned(), output_tx.clone()); - - // Get the output_nodes (leaf nodes for now) so the output task can keep track when parents are done - let output_nodes_ids = pipeline - .get_leaf_nodes() - .map(|node| node.id.clone()) - .collect::>(); - let outputs = Arc::clone(&get(&self.pipeline_runs, &pipeline_run_arc)?.outputs); - - // Create the task that captures the output from the nodes and stores them in the outputs map - self.pipeline_runs - .get_mut(&pipeline_run_arc) - .context(selector::KeyMissing { - key: pipeline_run_arc.to_string(), - })? - .node_task_join_set - .spawn(async move { - let mut complete_parent_nodes = HashSet::new(); - while let Some(message) = output_rx.recv().await { - match message { - Message::NodeOutput(sender_node_id, hash_map) => { - // Store the output in the outputs map - outputs - .write() - .await - .entry(sender_node_id) - .or_default() - .push(hash_map); - } - Message::NodeProcessingComplete(sender_node_id) => { - // Add the sender node id to the complete parent nodes - complete_parent_nodes.insert(sender_node_id.clone()); - - // Check if all parent nodes are complete - if complete_parent_nodes.is_superset(&output_nodes_ids) { - // All parents are complete, we can exit this task - return Ok(()); - } - } - Message::Stop => { - // No clear action needed, just exit the task - return Ok(()); - } - } - } - Ok(()) - }); - - // Get all the root nodes and call the create_task_for_node function for each root node - // This will recursively create all the tasks and channels for the pipeline - let root_nodes_tx = pipeline - .get_root_nodes() - .map(|node| { - self.create_task_for_node(node, &pipeline_run_arc, &output_tx, namespace_lookup) - }) - .collect::>>()?; + // Create a task for each node // All pipeline tasks have been created, now we need to feed the inputs to the pipeline for tx in &root_nodes_tx { @@ -189,7 +140,15 @@ impl DockerPipelineRunner { .await?; } - Ok(pipeline_run) + // Insert into the list of pipeline runs + self.pipeline_runs.insert(pipeline_run); + + Ok(self + .pipeline_runs + .get(&pipeline_run_arc) + .context(selector::KeyMissing { + key: pipeline_run.to_string(), + })?) } /// Given a pipeline run, wait for all its tasks to complete and return the `PipelineResult` @@ -264,79 +223,6 @@ impl DockerPipelineRunner { Ok(()) } - /// Helper function to create a task for each node, while recursively BFS through the pipeline - /// Summary: - /// 1. Check if their is already a channel created for the node, if not create one and insert it - /// 2. Call this function for each of the child nodes to get their `Sender_tx` - /// 3. If the node is a leaf node, attach the `output_tx` to the tx (Will be replaced by `output_nodes`) - /// 4. Start the task manager for the node, which will act as the node's processor - fn create_task_for_node( - &mut self, - node: &Node, - pipeline_run: &Arc, - output_tx: &mpsc::Sender, - namespace_lookup: &HashMap, - ) -> Result> { - println!("Creating task for node: {}", node.id); - // Create a channel for the node - - // Use closer to limit the scope of the borrow - let (tx, rx) = { - let pipeline_info = - self.pipeline_runs - .get_mut(pipeline_run) - .context(selector::KeyMissing { - key: pipeline_run.to_string(), - })?; - // Check if the node is already inside the node_tx - if pipeline_info.node_tx.contains_key(&node.id) { - // Node already exists, thus we can return the existing tx - return Ok(get(&pipeline_info.node_tx, &node.id)?.clone()); - } - - // This channel will be used to send messages to the node processor - let (tx, rx) = mpsc::channel::(128); - - // Record the tx into the pipeline_info tx_hashmap - pipeline_info.node_tx.insert(node.id.clone(), tx.clone()); - (tx, rx) - }; - - // Call this function for each of the child nodes to get their Sender_tx - let mut children_node_tx = pipeline_run - .pipeline_job - .pipeline - .get_children_for_node(node) - .map(|child_node| { - self.create_task_for_node(child_node, pipeline_run, output_tx, namespace_lookup) - }) - .collect::>>()?; - - // Check if children_node_tx is empty, if so, this is a leaf node thus we need to attach the output_tx - if children_node_tx.is_empty() { - // This is a leaf node, thus we need to attach the output_tx to the tx - // This will allow the node to send its output to the output channel - children_node_tx.push(output_tx.clone()); - } - - // Start the task_manager - self.pipeline_runs - .get_mut(pipeline_run) - .context(selector::KeyMissing { - key: pipeline_run.to_string(), - })? - .node_task_join_set - .spawn(Self::start_node_manager( - node.clone(), - Arc::clone(pipeline_run), - rx, - children_node_tx, - namespace_lookup.clone(), - )); - - Ok(tx) - } - /// Act as the processor of the node by: /// 1. Creating a metadata struct for the node to be passed to the appropriate processor /// 2. Get the kernel for the node and build the correct processor for this node @@ -345,27 +231,13 @@ impl DockerPipelineRunner { /// /// # Errors /// Will error out if the kernel for the node is not found or if the - async fn start_node_manager( - node: Node, - pipeline_run: Arc, - node_rx: mpsc::Receiver, - success_chs_tx: Vec>, - namespace_lookup: HashMap, + async fn start_node_task( + kernel: Kernel, + output_key_expression: String, + namespace_path: PathBuf, ) -> Result<()> { - // Create a metadata struct for this node - let node_metadata = NodeMetaData { - node_id: node.id.clone(), - node_rx, - child_nodes_txs: success_chs_tx.clone(), - namespace: pipeline_run.pipeline_job.output_dir.namespace.clone(), - namespace_lookup: namespace_lookup.clone(), - }; - // Get the kernel for this node and build the correct processor - match get( - &pipeline_run.pipeline_job.pipeline.kernel_lut, - &node.kernel_hash, - )? { + match kernel { Kernel::Pod(pod) => { let mut processor = PodProcessor::new(Arc::clone(pod), node_metadata); processor.start().await; @@ -411,55 +283,37 @@ impl DockerPipelineRunner { } } -/// Metadata for the node processor -/// Contains fields that is normally needed to process incoming packets -struct NodeMetaData { - node_id: String, - node_rx: mpsc::Receiver, // Channel to listen to messages from parent nodes - child_nodes_txs: Vec>, // Channel to send successful outputs to the next node - namespace: String, - namespace_lookup: HashMap, // Copy of the look up table -} - /// Unify the interface for node processors and provide a common way to handle processing of incoming messages /// This trait defines the methods that all node processors should implement /// /// Main purpose was to reduce the amount of code duplication between different node processors /// As a result, each processor only needs to worry about writing their own function to process the msg. pub(crate) trait NodeProcessor { - fn get_node_rx(&mut self) -> &mut mpsc::Receiver; - - async fn start(&mut self) { - // Start to listen to the channels - // Listen to the MPSC channel and handle messages - while let Some(msg) = self.get_node_rx().recv().await { - if self.process_msg(msg).await { - // If the message indicates that processing is complete, we can exit the loop - // Wait for all processing tasks to complete before returning - self.wait_for_node_task_completion().await; - break; - } - } - } + async fn process_packet( + &mut self, + packet: HashMap, + session: Arc, + output_key_exp: &str, + ) -> Result<()>; - async fn process_msg(&mut self, msg: Message) -> bool; + async fn wait_for_node_task_completion(&mut self) -> Result<()>; - async fn wait_for_node_task_completion(&mut self); + fn stop(&mut self) -> Result<()>; } /// Processor for Pods /// Currently missing implementation to call agents for actual pod processing struct PodProcessor { + session: zenoh::Session, pod: Arc, - node_metadata: NodeMetaData, processing_tasks: JoinSet>, } impl PodProcessor { - fn new(pod: Arc, node_metadata: NodeMetaData) -> Self { + fn new(pod: Arc) -> Self { Self { + session: zenoh::Session::default(), pod, - node_metadata, processing_tasks: JoinSet::new(), } } @@ -537,59 +391,19 @@ impl PodProcessor { } impl NodeProcessor for PodProcessor { - fn get_node_rx(&mut self) -> &mut mpsc::Receiver { - &mut self.node_metadata.node_rx - } + async fn process_packet( + &mut self, + packet: HashMap, + session: Arc, + output_key_exp: &str, + ) -> Result<()>; - async fn process_msg(&mut self, msg: Message) -> bool { - match msg { - Message::NodeOutput(_, packet) => { - let pod_ref = Arc::clone(&self.pod); - let node_id = self.node_metadata.node_id.clone(); - let namespace = self.node_metadata.namespace.clone(); - let namespace_lookup = self.node_metadata.namespace_lookup.clone(); - let child_nodes_txs = self.node_metadata.child_nodes_txs.clone(); - // Forward it into a processing task - self.processing_tasks.spawn(async move { - // Process the packet using the pod - // This will execute the pod and send the output to the next node - if let Err(err) = Self::process_packet( - node_id, - pod_ref, - namespace, - namespace_lookup, - packet, - child_nodes_txs, - ) - .await - { - // Send the error to the failure channel - // For now just print it out - eprintln!("Failed to process packet with error: {err}"); - } - Ok(()) - }); - } - Message::Stop => { - // Stop message received, we will stop processing - self.processing_tasks.abort_all(); - return true; - } - Message::NodeProcessingComplete(_) => { - // Since pod only have one parent, we can expect that there will be no more incoming packet - // thus, we need to wait for everything to finish processing and send completion message - // Return true to notify caller that processing is complete - self.wait_for_node_task_completion().await; - return true; - } - } - false + async fn wait_for_node_task_completion(&mut self) -> Result<()> { + todo!() } - async fn wait_for_node_task_completion(&mut self) { - while self.processing_tasks.join_next().await.is_some() { - // Wait for all processing tasks to complete - } + fn stop(&mut self) -> Result<()> { + todo!() } } @@ -597,7 +411,6 @@ impl NodeProcessor for PodProcessor { /// This processor renames the `input_keys` from the input packet to the `output_keys` defined by the map struct MapperProcessor { mapper: Arc, - node_metadata: NodeMetaData, } impl MapperProcessor { @@ -633,30 +446,41 @@ impl MapperProcessor { } impl NodeProcessor for MapperProcessor { - fn get_node_rx(&mut self) -> &mut mpsc::Receiver { - &mut self.node_metadata.node_rx - } + async fn process_packet( + &mut self, + packet: HashMap, + session: Arc, + output_key_exp: &str, + ) -> Result<()> { + // Apply the mapping to the input packet + let output_map = self + .mapper + .mapping + .iter() + .map(|(input_key, output_key)| { + let input = get(&packet, input_key)?.clone(); + Ok((output_key.to_owned(), input)) + }) + .collect::>>()?; - async fn wait_for_node_task_completion(&mut self) { - // Mapper doesn't spawn additional tasks, so this is a no-op + // Send the packet outwards + session + .put( + output_key_exp, + bitcode::encode(&Message::NodeOutput((), ())), + ) + .await + .unwrap(); + + Ok(()) } - async fn process_msg(&mut self, msg: Message) -> bool { - match msg { - Message::NodeOutput(_, packet) => { - match self.process_packet(&packet).await { - Ok(()) => {} - Err(err) => { - // Send the error to the failure channel - // For now just print it out - eprintln!("Failed to process packet with error: {err}"); - } - } - } - Message::NodeProcessingComplete(_) | Message::Stop => return true, - } + async fn wait_for_node_task_completion(&mut self) -> Result<()> { + todo!() + } - false + fn stop(&mut self) -> Result<()> { + todo!() } } diff --git a/tests/agent.rs b/tests/agent.rs index 781c8086..2935e213 100644 --- a/tests/agent.rs +++ b/tests/agent.rs @@ -39,7 +39,6 @@ fn simple() -> Result<()> { Ok(()) } -#[expect(clippy::excessive_nesting, reason = "Nesting is manageable")] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn parallel_four_cores() -> Result<()> { let test_dirs = TestDirs::new(&HashMap::from([("default".to_owned(), None::)]))?; diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index d3b816af..054a8f2e 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -33,7 +33,7 @@ async fn basic_run() -> Result<()> { Ok(()) } -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn stop() -> Result<()> { let pipeline_job = pipeline_job()?; From b77d3f27a26d895ec87a7cdf5123e17e8fdc753a Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 18 Jul 2025 17:48:47 +0000 Subject: [PATCH 14/29] dsave progres --- .devcontainer/gpu/devcontainer.json | 1 - Cargo.toml | 2 +- cspell.json | 3 +- src/uniffi/pipeline_runner/runner.rs | 284 ++++++++++++++++----------- 4 files changed, 170 insertions(+), 120 deletions(-) diff --git a/.devcontainer/gpu/devcontainer.json b/.devcontainer/gpu/devcontainer.json index ebbaba51..f663fc73 100644 --- a/.devcontainer/gpu/devcontainer.json +++ b/.devcontainer/gpu/devcontainer.json @@ -21,7 +21,6 @@ }, "runArgs": [ "--name=${localWorkspaceFolderBasename}_devcontainer", - "--gpus=all", "--privileged", "--cgroupns=host" ], diff --git a/Cargo.toml b/Cargo.toml index 4b1ff057..d70b1059 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ crate-type = ["rlib", "cdylib"] [dependencies] # make async fn in traits work with dyn traits async-trait = "0.1.88" -bitcode = "0.6.6" +bincode = { version = "2.0.1", features = ["serde"] } # docker API in orchestrator bollard = "0.17.1" # datetime utilities diff --git a/cspell.json b/cspell.json index 4211ef6e..e1526ca7 100644 --- a/cspell.json +++ b/cspell.json @@ -82,7 +82,8 @@ "itertools", "oneshot", "patchelf", - "colinianking" + "colinianking", + "bitcode" ], "useGitignore": false, "ignorePaths": [ diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 8a1475be..94abcb4e 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -6,11 +6,12 @@ use crate::{ pipeline::{Kernel, Mapper, Node, PipelineJob, PipelineResult}, }, }; -use bitcode::{Decode, Encode}; +use bincode::{Decode, Encode}; use futures_util::future::try_join_all; use itertools::Itertools as _; +use serde::{Deserialize, Serialize}; use serde_yaml::Serializer; -use snafu::OptionExt as _; +use snafu::{OptionExt as _, ResultExt}; use std::{ collections::{HashMap, HashSet}, hash::{Hash, Hasher}, @@ -22,10 +23,14 @@ use tokio::{ task::{JoinSet, spawn_blocking}, }; -#[derive(Encode, Decode, Clone, Debug)] +static SUCCESS_KEY_EXP: &str = "/success"; +static FAILURE_KEY_EXP: &str = "/failure"; + +#[derive(Serialize, Deserialize, Clone, Debug)] pub(crate) enum Message { /// String is the `parent_node_id`, while `HashMap` is output of the parent node NodeOutput(String, HashMap), + NodeProcessingFailure(String, String), // String is the `node_id` that has failed processing /// String is the `node_id` that has completed processing NodeProcessingComplete(String), Stop, // Message to halt all operations @@ -288,12 +293,16 @@ impl DockerPipelineRunner { /// /// Main purpose was to reduce the amount of code duplication between different node processors /// As a result, each processor only needs to worry about writing their own function to process the msg. -pub(crate) trait NodeProcessor { +trait NodeProcessor { async fn process_packet( &mut self, - packet: HashMap, + sender_node_id: &str, + node_id: &str, + packet: &HashMap, session: Arc, output_key_exp: &str, + namespace: &str, + namespace_lookup: &HashMap, ) -> Result<()>; async fn wait_for_node_task_completion(&mut self) -> Result<()>; @@ -321,6 +330,7 @@ impl PodProcessor { /// Actual logic of processing a packet using the pod /// At the moment it does a simulation of pod execution async fn process_packet( + _sender_node_id: &str, node_id: String, pod: Arc, namespace: String, @@ -393,10 +403,80 @@ impl PodProcessor { impl NodeProcessor for PodProcessor { async fn process_packet( &mut self, - packet: HashMap, + _sender_node_id: &str, + node_id: &str, + packet: &HashMap, session: Arc, output_key_exp: &str, - ) -> Result<()>; + namespace: &str, + namespace_lookup: &HashMap, + ) -> Result<()> { + // Process the packet using the pod + // Create the pod_job + + // We need a unique hash for this given input packet process by the node + // therefore we need to generate a hash that has the pod_id + input_packet + let node_id_bytes = node_id.as_bytes().to_vec(); + let packet_copy = packet.clone(); + let input_packet_hash = { + let mut buf = node_id_bytes; + let mut serializer = Serializer::new(&mut buf); + serialize_hashmap(&packet_copy, &mut serializer)?; + hash_buffer(buf) + }; + let output_dir = URI { + namespace: namespace.to_owned(), + path: PathBuf::from(format!("pod_runs/{}/{}", self.pod.hash, input_packet_hash)), + }; + + let cpu_limit = self.pod.recommended_cpus; + let memory_limit = self.pod.recommended_memory; + + // Create the pod job + let pod_job = PodJob::new( + None, + Arc::clone(&self.pod), + packet.clone(), + output_dir, + cpu_limit, + memory_limit, + None, + &namespace_lookup, + )?; + + // Simulate pod execution by just printing out pod_job_hash and pod hash + // This will be replaced by sending the pod_job to the orchestrator via the agent + self.processing_tasks.spawn(async move { + println!( + "Simulating Executing pod job: {} with pod hash: {}", + pod_job.hash, pod_job.pod.hash + ); + Ok(()) + }); + + #[expect( + clippy::unwrap_used, + reason = "Hard code for now, will be replaced by agent" + )] + // Build the output_packet, in reality, this will be extracted from the pod_result + let output_packet = self + .pod + .output_spec + .keys() + .map(|output_key| (output_key.clone(), packet.values().next().cloned().unwrap())) + .collect::>(); + + // For now we will just send the input_packet to the success channel + session + .put( + output_key_exp, + bitcode::encode(&Message::NodeOutput(node_id.to_owned(), output_packet)), + ) + .await + .context(selector::AgentCommunicationFailure {})?; + + Ok(()) + } async fn wait_for_node_task_completion(&mut self) -> Result<()> { todo!() @@ -414,43 +494,21 @@ struct MapperProcessor { } impl MapperProcessor { - const fn new(mapper: Arc, node_metadata: NodeMetaData) -> Self { - Self { - mapper, - node_metadata, - } - } - - async fn process_packet(&self, packet: &HashMap) -> Result<()> { - // Apply the mapping to the input packet - let output_map = self - .mapper - .mapping - .iter() - .map(|(input_key, output_key)| { - let input = get(packet, input_key)?.clone(); - Ok((output_key.to_owned(), input)) - }) - .collect::>>()?; - - // Send the output via the channel - try_join_all(self.node_metadata.child_nodes_txs.iter().map(|ch| { - ch.send(Message::NodeOutput( - self.node_metadata.node_id.clone(), - output_map.clone(), - )) - })) - .await?; - Ok(()) + const fn new(mapper: Arc) -> Self { + Self { mapper } } } impl NodeProcessor for MapperProcessor { async fn process_packet( &mut self, - packet: HashMap, + _sender_node_id: &str, + node_id: &str, + packet: &HashMap, session: Arc, output_key_exp: &str, + _namespace: &str, + _namespace_lookup: &HashMap, ) -> Result<()> { // Apply the mapping to the input packet let output_map = self @@ -467,7 +525,7 @@ impl NodeProcessor for MapperProcessor { session .put( output_key_exp, - bitcode::encode(&Message::NodeOutput((), ())), + bitcode::encode(&Message::NodeOutput(node_id.to_owned(), output_map)), ) .await .unwrap(); @@ -491,35 +549,32 @@ struct JoinerProcessor { /// Cache for all packets received by the node input_packet_cache: HashMap>>, completed_parents: Vec, - node_metadata: NodeMetaData, initial_computation_completed: bool, + processing_tasks: JoinSet>, } impl JoinerProcessor { - fn new(parents_node_id: Vec, node_metadata: NodeMetaData) -> Self { + fn new(parents_node_id: Vec) -> Self { let input_packet_cache = parents_node_id .into_iter() .map(|id| (id, Vec::new())) .collect(); Self { input_packet_cache, - node_metadata, completed_parents: Vec::new(), initial_computation_completed: false, + processing_tasks: JoinSet::new(), } } fn compute_new_packet_combination( - &mut self, sender_node_id: &str, new_packet: &HashMap, + packet_cache: HashMap>>, ) -> Result>> { // Combine the new packet with the existing packets in the cache // Get all the cached packets from other parents - let other_parent_ids = self - .input_packet_cache - .keys() - .filter(|key| *key != sender_node_id); + let other_parent_ids = packet_cache.keys().filter(|key| *key != sender_node_id); // Create a vector to hold the incoming packet // This will be used to compute the cartesian product and will be modified if the initial computation is not completed @@ -567,94 +622,89 @@ impl JoinerProcessor { Ok(result) } +} +impl NodeProcessor for JoinerProcessor { async fn process_packet( &mut self, sender_node_id: &str, - packet: HashMap, + node_id: &str, + packet: &HashMap, + session: Arc, + output_key_exp: &str, + namespace: &str, + namespace_lookup: &HashMap, ) -> Result<()> { - let process_result = { - // Compute the new packet combination based on the sender node id and the packet - let new_packets_to_send = - self.compute_new_packet_combination(sender_node_id, &packet)?; - - // Record the packet into the cache - self.input_packet_cache - .get_mut(sender_node_id) - .context(selector::KeyMissing { - key: sender_node_id.to_owned(), - })? - .push(packet); - - Ok::>, OrcaError>(new_packets_to_send) - }; + self.input_packet_cache + .get_mut(sender_node_id) + .context(selector::KeyMissing { + key: sender_node_id.to_owned(), + })? + .push(packet.clone()); + + self.processing_tasks.spawn(async move { + let process_result = { + for packet in self.compute_new_packet_combination(sender_node_id, &packet)? { + session + .put( + output_key_exp.to_owned() + SUCCESS_KEY_EXP, + bincode::serde::encode_to_vec( + &Message::NodeOutput(node_id.to_owned(), packet), + bincode::config::standard(), + ) + .unwrap(), + ) + .await + .context(selector::AgentCommunicationFailure {})?; + } + Ok::<(), OrcaError>(()) + }; - match process_result { - Ok(output_packets) => { - // Send the output packets to the success channel - for output_packet in output_packets { - try_join_all(self.node_metadata.child_nodes_txs.iter().map(|ch| { - ch.send(Message::NodeOutput( - self.node_metadata.node_id.clone(), - output_packet.clone(), - )) - })) - .await?; + match process_result { + Ok(_) => {} + Err(err) => { + // Something failed thus we should output to the failed channel + session + .put( + output_key_exp.to_owned() + FAILURE_KEY_EXP, + bincode::serde::encode_to_vec( + &Message::NodeProcessingFailure( + node_id.to_owned(), + err.to_string(), + ), + bincode::config::standard(), + ) + .unwrap(), + ) + .await + .context(selector::AgentCommunicationFailure {})?; } } - Err(err) => { - // Send the error to the failure channel - eprintln!( - "Failed to process packet from {sender_node_id} for joiner node with error: {err}" - ); - } - } - // Add the new packet into the cache + // For each new packet, we + Ok(()) + }); Ok(()) } -} -impl NodeProcessor for JoinerProcessor { - fn get_node_rx(&mut self) -> &mut mpsc::Receiver { - &mut self.node_metadata.node_rx + async fn wait_for_node_task_completion(&mut self) -> Result<()> { + todo!() } - async fn wait_for_node_task_completion(&mut self) { - // Joiner doesn't spawn additional tasks, so this is a no-op + fn stop(&mut self) -> Result<()> { + todo!() } +} - async fn process_msg(&mut self, msg: Message) -> bool { - match msg { - Message::NodeOutput(sender_node_id, packet) => { - // Process the packet and send the output to the success channel - match self.process_packet(&sender_node_id, packet).await { - Ok(()) => {} - Err(err) => { - // Send the error to the failure channel - eprintln!("Failed to process packet with error: {err}"); - } - } - } - Message::NodeProcessingComplete(sender_node_id) => { - // Record that this parent node has completed processing - self.completed_parents.push(sender_node_id); - - // Check if all parents have completed processing - if self.completed_parents.len() == self.input_packet_cache.len() { - // All parents have completed processing, we can send the output - // Wait for all packets to be processed and send the output - return true; - } - } - Message::Stop => { - // We don't have anything to clean up, so we can just return - return true; - } - } - - false - } +// Utils functions +fn get_node_id(output_key_exp: &str) -> String { + // Extract the node id from the output key expression + // The output key expression is in the format of "pipeline_job_hash/node_id/outputs" + output_key_exp + .split('/') + .nth(1) + .map(|s| s.to_owned()) + .unwrap_or_else(|| "unknown_node".to_owned()) } #[cfg(test)] From 75d1e79465223a3890c35a9ec51601079466a804 Mon Sep 17 00:00:00 2001 From: synicix Date: Fri, 18 Jul 2025 21:15:55 +0000 Subject: [PATCH 15/29] Save progress --- src/core/error.rs | 10 +++ src/uniffi/error.rs | 5 ++ src/uniffi/pipeline_runner/runner.rs | 111 ++++++++++++++++----------- 3 files changed, 80 insertions(+), 46 deletions(-) diff --git a/src/core/error.rs b/src/core/error.rs index f4263009..12918fe9 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -25,6 +25,16 @@ impl From for OrcaError { } } } +impl From for OrcaError { + fn from(error: EncodingError) -> Self { + Self { + kind: Kind::FailedToParseDot { + source: error, + backtrace: Some(Backtrace::capture()), + }, + } + } +} impl From for OrcaError { fn from(error: oneshot::error::RecvError) -> Self { Self { diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index 89ac49ea..95f1d1e1 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -107,6 +107,11 @@ pub(crate) enum Kind { backtrace: Option, }, #[snafu(transparent)] + EncodingError { + source: EncodingError, + backtrace: Option, + }, + #[snafu(transparent)] GlobPatternError { source: glob::PatternError, backtrace: Option, diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 94abcb4e..d8bcc3a0 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -6,7 +6,7 @@ use crate::{ pipeline::{Kernel, Mapper, Node, PipelineJob, PipelineResult}, }, }; -use bincode::{Decode, Encode}; +use bincode::{Decode, Encode, config, serde::encode_to_vec}; use futures_util::future::try_join_all; use itertools::Itertools as _; use serde::{Deserialize, Serialize}; @@ -525,7 +525,11 @@ impl NodeProcessor for MapperProcessor { session .put( output_key_exp, - bitcode::encode(&Message::NodeOutput(node_id.to_owned(), output_map)), + bincode::serde::encode_to_vec( + &Message::NodeOutput(node_id.to_owned(), output_map), + bincode::config::standard(), + ) + .unwrap(), ) .await .unwrap(); @@ -567,47 +571,10 @@ impl JoinerProcessor { } } - fn compute_new_packet_combination( - sender_node_id: &str, - new_packet: &HashMap, - packet_cache: HashMap>>, - ) -> Result>> { - // Combine the new packet with the existing packets in the cache - // Get all the cached packets from other parents - let other_parent_ids = packet_cache.keys().filter(|key| *key != sender_node_id); - - // Create a vector to hold the incoming packet - // This will be used to compute the cartesian product and will be modified if the initial computation is not completed - let mut incoming_packet = vec![new_packet.clone()]; - - // Determine if the initial computation has been computed - if !self.initial_computation_completed { - // Check if we at least have one cached packet for each of the other parents - for parent_id in other_parent_ids.clone() { - if get(&self.input_packet_cache, parent_id)?.is_empty() { - // We are still missing other parents, so we can't compute the new packet combination yet - return Ok(Vec::new()); - } - } - - // We have at least one packet for each of the other parents, thus we can compute the cartesian product - // For the initial computation, we will add all of the add all previous packets for this sender - get(&self.input_packet_cache, &sender_node_id.to_owned())? - .iter() - .for_each(|packet| incoming_packet.push(packet.clone())); - - self.initial_computation_completed = true; - } - - let mut factors = other_parent_ids - .map(|id| get(&self.input_packet_cache, id)) - .collect::>>()?; - - // Add the new incoming packet as a factor - - factors.push(&incoming_packet); - - let result = factors + fn compute_cartesian_product( + factors: &Vec<&Vec>>, + ) -> Vec> { + factors .into_iter() .multi_cartesian_product() .map(|packets_to_combined| { @@ -618,9 +585,7 @@ impl JoinerProcessor { acc }) }) - .collect::>(); - - Ok(result) + .collect::>() } } @@ -642,6 +607,60 @@ impl NodeProcessor for JoinerProcessor { })? .push(packet.clone()); + // Check if we have all the other parents needed to compute the cartesian product + if self.input_packet_cache.values().all(|v| !v.is_empty()) { + // Get all the cached packets from other parents + let other_parent_ids = self + .input_packet_cache + .keys() + .filter(|key| *key != sender_node_id); + + // Build the factors of the product + let mut factors = other_parent_ids + .map(|id| get(&self.input_packet_cache, id)) + .collect::>>()?; + + // Add the new packet as a factor + factors.push(&vec![packet.clone()]); + + // Compute the cartesian product of the factors + self.processing_tasks.spawn(async move { + let cartesian_product = Self::compute_cartesian_product(&factors); + + // Post all products to the output channel + for output_packet in cartesian_product { + let result = session.put( + output_key_exp.to_owned() + SUCCESS_KEY_EXP, + encode_to_vec( + &Message::NodeOutput(node_id.to_owned(), output_packet), + config::standard(), + )?, + ); + } + + Ok(()) + }); + } + + // Check if this packet is the first packet from this parent node + if get(&self.input_packet_cache, sender_node_id)?.len() == 1 {} + + // Determine if the initial computation is completed + if !self.initial_computation_completed { + // Check if we have at least one packet for each parent node + if self.input_packet_cache.values().all(|v| !v.is_empty()) { + self.initial_computation_completed = true; + } else { + // If not, we cannot compute the new packet combination yet + return Ok(()); + } + } + if self.input_packet_cache.values().all(|v| !v.is_empty()) + | self.input_packet_cache.values().any(|v| v.len() == 1) + { + // Initial case where we first have at least one packet for each parent node met + } + self.processing_tasks.spawn(async move { let process_result = { for packet in self.compute_new_packet_combination(sender_node_id, &packet)? { From 99b2d1ee0f346c69449eb11c465858be811cd4ce Mon Sep 17 00:00:00 2001 From: synicix Date: Sat, 19 Jul 2025 00:33:32 +0000 Subject: [PATCH 16/29] Save progress --- src/core/error.rs | 8 +- src/uniffi/error.rs | 3 +- src/uniffi/pipeline_runner/runner.rs | 208 ++++++++++++--------------- 3 files changed, 96 insertions(+), 123 deletions(-) diff --git a/src/core/error.rs b/src/core/error.rs index 12918fe9..3ca897c7 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -2,6 +2,7 @@ use crate::uniffi::{ error::{Kind, OrcaError}, pipeline_runner::runner::Message, }; +use bincode::error::EncodeError; use bollard::errors::Error as BollardError; use glob; use serde_json; @@ -25,10 +26,10 @@ impl From for OrcaError { } } } -impl From for OrcaError { - fn from(error: EncodingError) -> Self { +impl From for OrcaError { + fn from(error: EncodeError) -> Self { Self { - kind: Kind::FailedToParseDot { + kind: Kind::EncodingError { source: error, backtrace: Some(Backtrace::capture()), }, @@ -150,6 +151,7 @@ impl fmt::Debug for OrcaError { | Kind::NoTagFoundInContainerAltImage { backtrace, .. } | Kind::BollardError { backtrace, .. } | Kind::ChannelReceiveError { backtrace, .. } + | Kind::EncodingError { backtrace, .. } | Kind::GlobPatternError { backtrace, .. } | Kind::IoError { backtrace, .. } | Kind::PathPrefixError { backtrace, .. } diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index 95f1d1e1..4a8e9c0d 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -3,6 +3,7 @@ reason = "Needed since SNAFU dynamically generating selectors." )] +use bincode::error::EncodeError; use bollard::errors::Error as BollardError; use glob; use serde_json; @@ -108,7 +109,7 @@ pub(crate) enum Kind { }, #[snafu(transparent)] EncodingError { - source: EncodingError, + source: EncodeError, backtrace: Option, }, #[snafu(transparent)] diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index d8bcc3a0..63f12b4e 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -228,14 +228,16 @@ impl DockerPipelineRunner { Ok(()) } - /// Act as the processor of the node by: - /// 1. Creating a metadata struct for the node to be passed to the appropriate processor - /// 2. Get the kernel for the node and build the correct processor for this node - /// 3. Start the processor and wait till it completes - /// 4. Send a message that the node processing is complete - /// - /// # Errors - /// Will error out if the kernel for the node is not found or if the + /** + * Act as the processor of the node by: + * 1. Creating a metadata struct for the node to be passed to the appropriate processor + * 2. Get the kernel for the node and build the correct processor for this node + * 3. Start the processor and wait till it completes + * 4. Send a message that the node processing is complete + * + * # Errors + * Will error out if the kernel for the node is not found or if the + */ async fn start_node_task( kernel: Kernel, output_key_expression: String, @@ -446,19 +448,12 @@ impl NodeProcessor for PodProcessor { // Simulate pod execution by just printing out pod_job_hash and pod hash // This will be replaced by sending the pod_job to the orchestrator via the agent - self.processing_tasks.spawn(async move { - println!( - "Simulating Executing pod job: {} with pod hash: {}", - pod_job.hash, pod_job.pod.hash - ); - Ok(()) - }); + // Build the output_packet, in reality, this will be extracted from the pod_result #[expect( clippy::unwrap_used, reason = "Hard code for now, will be replaced by agent" )] - // Build the output_packet, in reality, this will be extracted from the pod_result let output_packet = self .pod .output_spec @@ -466,24 +461,40 @@ impl NodeProcessor for PodProcessor { .map(|output_key| (output_key.clone(), packet.values().next().cloned().unwrap())) .collect::>(); - // For now we will just send the input_packet to the success channel - session - .put( - output_key_exp, - bitcode::encode(&Message::NodeOutput(node_id.to_owned(), output_packet)), - ) - .await - .context(selector::AgentCommunicationFailure {})?; + let node_id_clone = node_id.to_owned(); + let output_key_exp_clone = output_key_exp.to_owned(); + self.processing_tasks.spawn(async move { + println!( + "Simulating Executing pod job: {} with pod hash: {}", + pod_job.hash, pod_job.pod.hash + ); + + // For now we will just send the input_packet to the success channel + session + .put( + output_key_exp_clone + SUCCESS_KEY_EXP, + bincode::serde::encode_to_vec( + &Message::NodeOutput(node_id_clone, output_packet), + bincode::config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + + Ok(()) + }); Ok(()) } async fn wait_for_node_task_completion(&mut self) -> Result<()> { - todo!() + while self.processing_tasks.join_next().await.is_some() {} + Ok(()) } fn stop(&mut self) -> Result<()> { - todo!() + self.processing_tasks.abort_all(); + Ok(()) } } @@ -528,21 +539,22 @@ impl NodeProcessor for MapperProcessor { bincode::serde::encode_to_vec( &Message::NodeOutput(node_id.to_owned(), output_map), bincode::config::standard(), - ) - .unwrap(), + )?, ) .await - .unwrap(); + .context(selector::AgentCommunicationFailure {})?; Ok(()) } async fn wait_for_node_task_completion(&mut self) -> Result<()> { - todo!() + // All mappers tasks are synchronous, so we don't need to wait for anything + Ok(()) } fn stop(&mut self) -> Result<()> { - todo!() + // Mappers do not have any state to stop, so we can just return Ok + Ok(()) } } @@ -572,7 +584,7 @@ impl JoinerProcessor { } fn compute_cartesian_product( - factors: &Vec<&Vec>>, + factors: &Vec>>, ) -> Vec> { factors .into_iter() @@ -597,8 +609,8 @@ impl NodeProcessor for JoinerProcessor { packet: &HashMap, session: Arc, output_key_exp: &str, - namespace: &str, - namespace_lookup: &HashMap, + _namespace: &str, + _namespace_lookup: &HashMap, ) -> Result<()> { self.input_packet_cache .get_mut(sender_node_id) @@ -615,117 +627,75 @@ impl NodeProcessor for JoinerProcessor { .keys() .filter(|key| *key != sender_node_id); - // Build the factors of the product + // Build the factors of the product as owned values to avoid lifetime issues let mut factors = other_parent_ids - .map(|id| get(&self.input_packet_cache, id)) + .map(|id| get(&self.input_packet_cache, id).map(|v| v.clone())) .collect::>>()?; // Add the new packet as a factor - factors.push(&vec![packet.clone()]); + factors.push(vec![packet.clone()]); // Compute the cartesian product of the factors + let node_id_clone = node_id.to_owned(); + let output_key_exp_clone = output_key_exp.to_owned(); + self.processing_tasks.spawn(async move { + // Convert Vec>> to Vec<&Vec>> for compute_cartesian_product let cartesian_product = Self::compute_cartesian_product(&factors); // Post all products to the output channel for output_packet in cartesian_product { - let result = session.put( - output_key_exp.to_owned() + SUCCESS_KEY_EXP, - encode_to_vec( - &Message::NodeOutput(node_id.to_owned(), output_packet), - config::standard(), - )?, - ); + let result = { + session + .put( + output_key_exp_clone.clone() + SUCCESS_KEY_EXP, + encode_to_vec( + &Message::NodeOutput(node_id_clone.clone(), output_packet), + config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + Ok::<(), OrcaError>(()) + }; + + // If the result is an error, we will just send it to the error channel + if let Err(err) = result { + session + .put( + output_key_exp_clone.clone() + FAILURE_KEY_EXP, + encode_to_vec( + &Message::NodeProcessingFailure( + node_id_clone.clone(), + err.to_string(), + ), + config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + } } Ok(()) }); } - - // Check if this packet is the first packet from this parent node - if get(&self.input_packet_cache, sender_node_id)?.len() == 1 {} - - // Determine if the initial computation is completed - if !self.initial_computation_completed { - // Check if we have at least one packet for each parent node - if self.input_packet_cache.values().all(|v| !v.is_empty()) { - self.initial_computation_completed = true; - } else { - // If not, we cannot compute the new packet combination yet - return Ok(()); - } - } - if self.input_packet_cache.values().all(|v| !v.is_empty()) - | self.input_packet_cache.values().any(|v| v.len() == 1) - { - // Initial case where we first have at least one packet for each parent node met - } - - self.processing_tasks.spawn(async move { - let process_result = { - for packet in self.compute_new_packet_combination(sender_node_id, &packet)? { - session - .put( - output_key_exp.to_owned() + SUCCESS_KEY_EXP, - bincode::serde::encode_to_vec( - &Message::NodeOutput(node_id.to_owned(), packet), - bincode::config::standard(), - ) - .unwrap(), - ) - .await - .context(selector::AgentCommunicationFailure {})?; - } - Ok::<(), OrcaError>(()) - }; - - match process_result { - Ok(_) => {} - Err(err) => { - // Something failed thus we should output to the failed channel - session - .put( - output_key_exp.to_owned() + FAILURE_KEY_EXP, - bincode::serde::encode_to_vec( - &Message::NodeProcessingFailure( - node_id.to_owned(), - err.to_string(), - ), - bincode::config::standard(), - ) - .unwrap(), - ) - .await - .context(selector::AgentCommunicationFailure {})?; - } - } - // For each new packet, we - Ok(()) - }); - Ok(()) } async fn wait_for_node_task_completion(&mut self) -> Result<()> { - todo!() + // We must wait for all joiner processing task to complete + while self.processing_tasks.join_next().await.is_some() {} + Ok(()) } fn stop(&mut self) -> Result<()> { - todo!() + // We want to abort any computation + self.processing_tasks.abort_all(); + Ok(()) } } -// Utils functions -fn get_node_id(output_key_exp: &str) -> String { - // Extract the node id from the output key expression - // The output key expression is in the format of "pipeline_job_hash/node_id/outputs" - output_key_exp - .split('/') - .nth(1) - .map(|s| s.to_owned()) - .unwrap_or_else(|| "unknown_node".to_owned()) -} - #[cfg(test)] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] #[expect(clippy::panic_in_result_fn, reason = "Unit test")] From 630e27ed4fff054f020969ce9af9d2a9c96067d0 Mon Sep 17 00:00:00 2001 From: synicix Date: Sun, 20 Jul 2025 09:15:33 +0000 Subject: [PATCH 17/29] Save progress --- src/core/error.rs | 31 +- src/uniffi/error.rs | 7 +- src/uniffi/pipeline_runner/runner.rs | 884 ++++++++++++++++----------- tests/pipeline_runner.rs | 72 ++- 4 files changed, 615 insertions(+), 379 deletions(-) diff --git a/src/core/error.rs b/src/core/error.rs index 3ca897c7..ada0b0e2 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -1,8 +1,5 @@ -use crate::uniffi::{ - error::{Kind, OrcaError}, - pipeline_runner::runner::Message, -}; -use bincode::error::EncodeError; +use crate::uniffi::error::{Kind, OrcaError}; +use bincode::error::{DecodeError, EncodeError}; use bollard::errors::Error as BollardError; use glob; use serde_json; @@ -13,7 +10,7 @@ use std::{ io, path::{self}, }; -use tokio::sync::{mpsc::error::SendError, oneshot}; +use tokio::sync::oneshot; use tokio::task; impl From for OrcaError { @@ -26,6 +23,16 @@ impl From for OrcaError { } } } +impl From for OrcaError { + fn from(error: DecodeError) -> Self { + Self { + kind: Kind::DecodeError { + source: error, + backtrace: Some(Backtrace::capture()), + }, + } + } +} impl From for OrcaError { fn from(error: EncodeError) -> Self { Self { @@ -106,17 +113,6 @@ impl From for OrcaError { } } } - -impl From> for OrcaError { - fn from(error: SendError) -> Self { - Self { - kind: Kind::SendError { - reason: error.to_string(), - backtrace: Some(Backtrace::capture()), - }, - } - } -} impl From for OrcaError { fn from(error: Kind) -> Self { Self { kind: error } @@ -151,6 +147,7 @@ impl fmt::Debug for OrcaError { | Kind::NoTagFoundInContainerAltImage { backtrace, .. } | Kind::BollardError { backtrace, .. } | Kind::ChannelReceiveError { backtrace, .. } + | Kind::DecodeError { backtrace, .. } | Kind::EncodingError { backtrace, .. } | Kind::GlobPatternError { backtrace, .. } | Kind::IoError { backtrace, .. } diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index 4a8e9c0d..c932570e 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -3,7 +3,7 @@ reason = "Needed since SNAFU dynamically generating selectors." )] -use bincode::error::EncodeError; +use bincode::error::{DecodeError, EncodeError}; use bollard::errors::Error as BollardError; use glob; use serde_json; @@ -108,6 +108,11 @@ pub(crate) enum Kind { backtrace: Option, }, #[snafu(transparent)] + DecodeError { + source: DecodeError, + backtrace: Option, + }, + #[snafu(transparent)] EncodingError { source: EncodeError, backtrace: Option, diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 63f12b4e..a1ddd3f9 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -3,47 +3,56 @@ use crate::{ uniffi::{ error::{OrcaError, Result, selector}, model::{PathSet, Pod, PodJob, URI}, - pipeline::{Kernel, Mapper, Node, PipelineJob, PipelineResult}, + pipeline::{Kernel, Mapper, Node, Pipeline, PipelineJob, PipelineResult}, }, }; -use bincode::{Decode, Encode, config, serde::encode_to_vec}; -use futures_util::future::try_join_all; +use async_trait::async_trait; +use bincode::{ + config, + serde::{decode_from_slice, encode_to_vec}, +}; use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use serde_yaml::Serializer; -use snafu::{OptionExt as _, ResultExt}; +use snafu::{OptionExt as _, ResultExt as _}; use std::{ - collections::{HashMap, HashSet}, + collections::HashMap, + fmt::{Display, Formatter, Result as FmtResult}, hash::{Hash, Hasher}, path::PathBuf, sync::Arc, }; use tokio::{ - sync::{RwLock, mpsc}, - task::{JoinSet, spawn_blocking}, + sync::{Mutex, RwLock}, + task::JoinSet, }; +use zenoh::{handlers::FifoChannelHandler, pubsub::Subscriber, sample::Sample}; + +static SUCCESS_KEY_EXP: &str = "success"; +static FAILURE_KEY_EXP: &str = "failure"; +static INPUT_KEY_EXP: &str = "input_node/outputs"; -static SUCCESS_KEY_EXP: &str = "/success"; -static FAILURE_KEY_EXP: &str = "/failure"; +#[derive(Serialize, Deserialize, Clone, Debug)] +enum NodeOutput { + Packet(String, HashMap), + ProcessingCompleted(String), +} #[derive(Serialize, Deserialize, Clone, Debug)] -pub(crate) enum Message { - /// String is the `parent_node_id`, while `HashMap` is output of the parent node - NodeOutput(String, HashMap), - NodeProcessingFailure(String, String), // String is the `node_id` that has failed processing - /// String is the `node_id` that has completed processing - NodeProcessingComplete(String), - Stop, // Message to halt all operations +struct ProcessingFailure { + node_id: String, + error: String, } #[expect( clippy::type_complexity, reason = "too complex, but necessary for async handling" )] -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct PipelineRun { - /// PipelineJob that this run is associated with + /// `PipelineJob` that this run is associated with pub pipeline_job: PipelineJob, // The pipeline job that this run is associated with + node_tasks: JoinSet>, // JoinSet of tasks for each node in the pipeline outputs: Arc>>>>, // String is the node key, while hash } @@ -61,6 +70,11 @@ impl Hash for PipelineRun { } } +impl Display for PipelineRun { + fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { + write!(f, "PipelineRun({})", self.pipeline_job.hash) + } +} /** * Runner for pipelines * @@ -73,7 +87,7 @@ impl Hash for PipelineRun { */ #[derive(Default)] pub struct DockerPipelineRunner { - pipeline_runs: HashSet>, + pipeline_runs: HashMap, } /** @@ -81,8 +95,8 @@ pub struct DockerPipelineRunner { * The runtime is tokio * * These are the key expressions of the components of the pipeline: - * - Input Node: pipeline_job_hash/input_node/outputs (This is where the pipeline_job packets get fed to) - * - Nodes: pipeline_job_hash/node_id/outputs/(success|failure) (This is where the node outputs are sent to) + * - Input Node: `pipeline_job_hash/input_node/outputs` (This is where the `pipeline_job` packets get fed to) + * - Nodes: `pipeline_job_hash/node_id/outputs/(success|failure)` (This is where the node outputs are sent to) */ impl DockerPipelineRunner { /// Create a new Docker pipeline runner @@ -115,66 +129,119 @@ impl DockerPipelineRunner { pub async fn start( &mut self, pipeline_job: PipelineJob, + namespace: &str, // Name space to save pod_results to namespace_lookup: &HashMap, - ) -> Result<&PipelineRun> { + ) -> Result { // Create a new pipeline run - let pipeline_run = Arc::new(PipelineRun { + let mut pipeline_run = PipelineRun { pipeline_job, outputs: Arc::new(RwLock::new(HashMap::new())), - }); + node_tasks: JoinSet::new(), + }; - // Get reference to the pipeline - let pipeline = &pipeline_run.pipeline_job.pipeline; + // Get the pipeline_job_hash which will be use to identify the pipeline run + let pipeline_job_hash = pipeline_run.pipeline_job.hash.clone(); - // Create a task for each node + let graph = &pipeline_run.pipeline_job.pipeline.graph; - // All pipeline tasks have been created, now we need to feed the inputs to the pipeline - for tx in &root_nodes_tx { - for input_packet in &pipeline_run.pipeline_job.input_packets { - tx.send(Message::NodeOutput( - "input".to_owned(), - input_packet.clone(), - )) - .await?; - } + // Create the subscriber to listen to node ready status before sending inputs + let session = zenoh::open(zenoh::Config::default()) + .await + .context(selector::AgentCommunicationFailure {})?; + + let subscriber = session + .declare_subscriber(format!("{pipeline_job_hash}/*/status/ready")) + .await + .context(selector::AgentCommunicationFailure {})?; + + // For each node, we will create call create_node_processing_task + for node_idx in graph.node_indices() { + let node = &graph[node_idx]; + + // Spawn the task + pipeline_run + .node_tasks + .spawn(Self::create_node_processing_task( + node.clone(), + pipeline_run.pipeline_job.pipeline.clone(), + pipeline_job_hash.clone(), + namespace.to_owned(), + namespace_lookup.clone(), + )); + } + + // Spawn the task that captures the outputs from the output_nodes + // For now the output nodes are hardcoded to be the leaf nodes of the pipeline + for node in pipeline_run.pipeline_job.pipeline.get_leaf_nodes() { + pipeline_run + .node_tasks + .spawn(Self::create_capture_task_for_node( + node.id.clone(), + pipeline_run.pipeline_job.hash.clone(), + Arc::clone(&pipeline_run.outputs), + )); } - // Send a message that all job inputs have been sent - for tx in &root_nodes_tx { - tx.send(Message::NodeProcessingComplete("input".to_owned())) - .await?; + let num_of_nodes = graph.node_count(); + println!("Waiting for {num_of_nodes} nodes to be ready"); + let mut ready_nodes = 0; + + // Wait for all nodes to be ready before sending inputs + while (subscriber.recv_async().await).is_ok() { + // Message is empty, just increment the counter + ready_nodes += 1; + println!("number of ready nodes: {ready_nodes}"); + + if ready_nodes == num_of_nodes { + break; // All nodes are ready, we can start sending inputs + } } + println!( + "All nodes are ready, starting pipeline run: {}", + pipeline_job_hash + ); + + // // Submit the input_packets to the correct key_exp + // for packet in &pipeline_run.pipeline_job.input_packets { + // println!("Sending packet"); + // // Send the packet to the input node key_exp + // session + // .put( + // format!("{pipeline_job_hash}/{INPUT_KEY_EXP}"), + // encode_to_vec(packet, config::standard())?, + // ) + // .await + // .context(selector::AgentCommunicationFailure {})?; + // } + // Insert into the list of pipeline runs - self.pipeline_runs.insert(pipeline_run); + self.pipeline_runs + .insert(pipeline_job_hash.clone(), pipeline_run); - Ok(self - .pipeline_runs - .get(&pipeline_run_arc) - .context(selector::KeyMissing { - key: pipeline_run.to_string(), - })?) + Ok(pipeline_job_hash) } /// Given a pipeline run, wait for all its tasks to complete and return the `PipelineResult` /// /// # Errors /// Will error out if any of the pipeline tasks failed to join - pub async fn get_result(&mut self, pipeline_run: &PipelineRun) -> Result { - // Call join on the join set for the pipeline run - let pipeline_run_info = + pub async fn get_result(&mut self, pipeline_run_id: &str) -> Result { + // To get the result, the pipeline execution must be complete, so we need to await on the tasks + + let pipeline_run = self.pipeline_runs - .get_mut(pipeline_run) + .get_mut(pipeline_run_id) .context(selector::KeyMissing { - key: pipeline_run.to_string(), + key: pipeline_run_id.to_owned(), })?; // Wait for all the tasks to complete - while let Some(result) = pipeline_run_info.node_task_join_set.join_next().await { + while let Some(result) = pipeline_run.node_tasks.join_next().await { match result { Ok(Ok(())) => {} // Task completed successfully Ok(Err(err)) => { - eprintln!("Task failed: {err}"); + eprintln!("Task failed with err: {err}"); return Err(err); } Err(err) => { @@ -186,107 +253,266 @@ impl DockerPipelineRunner { Ok(PipelineResult { pipeline_job: pipeline_run.pipeline_job.clone(), - output_packets: pipeline_run_info.outputs.read().await.clone(), + output_packets: pipeline_run.outputs.read().await.clone(), }) } /// Stop the pipeline run and all its tasks /// # Errors /// Will error out if the pipeline run is not found or if any of the tasks fail to stop correctly - pub async fn stop(&mut self, pipeline_run: &PipelineRun) -> Result<()> { - // Get the pipeline run info - let pipeline_run_info = + pub async fn stop(&mut self, pipeline_run_id: &str) -> Result<()> { + // To stop the pipeline run, we need to send a stop message to all the tasks + + // Get the pipeline run first + let pipeline_run = self.pipeline_runs - .get_mut(pipeline_run) + .get_mut(pipeline_run_id) .context(selector::KeyMissing { - key: pipeline_run.to_string(), + key: pipeline_run_id.to_owned(), })?; - // Send a stop message to all the node txs - for tx in pipeline_run_info.node_tx.values() { - tx.send(Message::Stop).await?; - } + let session = zenoh::open(zenoh::Config::default()) + .await + .context(selector::AgentCommunicationFailure {})?; - // Wait for all tasks to complete - while let Some(result) = pipeline_run_info.node_task_join_set.join_next().await { - match result { - Ok(Ok(())) => {} // Task completed successfully - Ok(Err(err)) => { - eprintln!("Task failed: {err}"); - return Err(err); + // Send the stop message into the stop key_exp, the msg is just an empty vector + session + .put( + format!("{}/stop", pipeline_run.pipeline_job.hash), + Vec::new(), + ) + .await + .context(selector::AgentCommunicationFailure {})?; + + while pipeline_run.node_tasks.join_next().await.is_some() {} + Ok(()) + } + + #[expect(clippy::type_complexity, reason = "Needed for async")] + async fn create_capture_task_for_node( + node_id: String, + pipeline_run_id: String, + outputs: Arc>>>>, + ) -> Result<()> { + // Create a zenoh session + let session = zenoh::open(zenoh::Config::default()) + .await + .context(selector::AgentCommunicationFailure {})?; + let subscriber = session + .declare_subscriber(format!( + "{pipeline_run_id}/{node_id}/outputs/{SUCCESS_KEY_EXP}" + )) + .await + .context(selector::AgentCommunicationFailure {})?; + + while let Ok(payload) = subscriber.recv_async().await { + // Extract the message from the payload + let (msg, _): (NodeOutput, usize) = + decode_from_slice(&payload.payload().to_bytes(), config::standard())?; + + match msg { + NodeOutput::Packet(sender_id, hash_map) => { + // Optionally, you can log or print the output packet + println!("Captured output from node {}: {:?}", sender_id, hash_map); + + // Store the output packet in the outputs map + let mut outputs_lock = outputs.write().await; + outputs_lock + .entry(node_id.clone()) + .or_default() + .push(hash_map); } - Err(err) => { - eprintln!("Join set error: {err}"); - return Err(err.into()); + NodeOutput::ProcessingCompleted(_) => { + // Handle processing completed message if needed } } } - // Remove the pipeline run from the list of pipeline runs - self.pipeline_runs.remove(pipeline_run); - Ok(()) } - /** - * Act as the processor of the node by: - * 1. Creating a metadata struct for the node to be passed to the appropriate processor - * 2. Get the kernel for the node and build the correct processor for this node - * 3. Start the processor and wait till it completes - * 4. Send a message that the node processing is complete - * - * # Errors - * Will error out if the kernel for the node is not found or if the - */ - async fn start_node_task( - kernel: Kernel, - output_key_expression: String, - namespace_path: PathBuf, + /// Function to start tasks associated with the node + /// Steps: + /// - Create the node processor based on the kernel type + /// - Create the zenoh session + /// - Create a join set to spawn and handle incoming messages tasks + /// - Create a subscriber for each of the parent nodes (Should only be 1, unless it is a joiner node) + /// - For each subscriber, handle the incoming message appropriately + /// + /// # Errors + /// Will error out if the kernel for the node is not found or if the + async fn create_node_processing_task( + node: Node, + pipeline: Pipeline, + pipeline_job_id: String, + namespace: String, + namespace_lookup: HashMap, ) -> Result<()> { - // Get the kernel for this node and build the correct processor - match kernel { - Kernel::Pod(pod) => { - let mut processor = PodProcessor::new(Arc::clone(pod), node_metadata); - processor.start().await; - } - Kernel::Mapper(mapper) => { - let mut processor = MapperProcessor::new(Arc::clone(mapper), node_metadata); - processor.start().await; - } - Kernel::Joiner => { - let parent_nodes_id = pipeline_run - .pipeline_job - .pipeline - .get_parents_for_node(&node) - .map(|parent_node| parent_node.id.clone()) - .collect::>(); - let mut processor = JoinerProcessor::new(parent_nodes_id, node_metadata); - processor.start().await; - } + // Create the correct processor for the node based on the kernel type + let node_processor: Arc>> = Arc::new(Mutex::new( + match get(&pipeline.kernel_lut, &node.kernel_hash)? { + Kernel::Pod(pod) => Box::new(PodProcessor::new(Arc::clone(pod))), + Kernel::Mapper(mapper) => Box::new(MapperProcessor::new(Arc::clone(mapper))), + Kernel::Joiner => { + // Need to get the parent node id for this joiner node + let parent_nodes_id = pipeline + .get_parents_for_node(&node) + .map(|parent_node| parent_node.id.clone()) + .collect::>(); + Box::new(JoinerProcessor::new(parent_nodes_id)) + } + }, + )); + + // Create the zenoh session + let session = Arc::new( + zenoh::open(zenoh::Config::default()) + .await + .context(selector::AgentCommunicationFailure {})?, + ); + + // Create a joinset to spawn and handle incoming messages tasks + let mut listener_tasks = JoinSet::new(); + + // Create the list of key_expressions to subscribe to + let mut key_exps_to_subscribe_to = pipeline + .get_parents_for_node(&node) + .map(|parent_node| { + format!( + "{pipeline_job_id}/{}/outputs/{SUCCESS_KEY_EXP}", + parent_node.id + ) + }) + .collect::>(); + + // If there was no parent node, then this is root node, therefore we need to subscribe to the input node + if key_exps_to_subscribe_to.is_empty() { + key_exps_to_subscribe_to.push(format!("{pipeline_job_id}/{INPUT_KEY_EXP}")); } - // Since all inputs are sent, we can send a message that the "input node" processing is complete - for success_ch_tx in &success_chs_tx { - match success_ch_tx - .send(Message::NodeProcessingComplete(node.id.clone())) + // Create a subscriber for each of the parent nodes (Should only be 1, unless it is a joiner node) + for key_exp in key_exps_to_subscribe_to { + let subscriber = session + .declare_subscriber(key_exp) .await - { - Ok(()) => {} - Err(err) => { - match err { - mpsc::error::SendError(Message::NodeProcessingComplete(_)) => { - // The channel is closed, we can ignore this error, this happens when stop it called - eprintln!("Failed to send processing complete message, channel closed"); - } - _ => { - eprintln!("Failed to send processing complete message: {err}"); - } + .context(selector::AgentCommunicationFailure {})?; + + listener_tasks.spawn(Self::start_async_processor_task( + subscriber, + Arc::clone(&node_processor), + node.id.clone(), + pipeline_job_id.clone(), + namespace.clone(), + namespace_lookup.clone(), + Arc::clone(&session), + )); + } + + // Create the task to handle stop request + listener_tasks.spawn(Self::start_stop_request_task( + Arc::clone(&node_processor), + pipeline_job_id.clone(), + Arc::clone(&session), + )); + + // Wait for all task to complete + listener_tasks.join_all().await; + + Ok(()) + } + + async fn start_async_processor_task( + subscriber: Subscriber>, + node_processor: Arc>>, + node_id: String, + pipeline_job_id: String, + namespace: String, + namespace_lookup: HashMap, + session: Arc, + ) -> Result<()> { + // Send a ready message so the pipeline knows when to start sending inputs + let result = session + .put( + format!("{pipeline_job_id}/{node_id}/status/ready"), + &node_id, + ) + .await + .context(selector::AgentCommunicationFailure {}); + + // Print out if the ready message was sent successfully + if let Err(err) = result { + eprintln!("Failed to send ready message for node {}: {}", node_id, err); + } else { + println!("Ready message sent for node {}", node_id); + } + + while let Ok(payload) = subscriber.recv_async().await { + // Extract the message from the payload + + let (msg, _): (NodeOutput, usize) = + decode_from_slice(&payload.payload().to_bytes(), config::standard())?; + println!("Received message for node {}: {:?}", node_id, msg); + match msg { + NodeOutput::Packet(sender_id, hash_map) => { + println!( + "Received packet from {} for node {}: {:?}", + sender_id, node_id, hash_map + ); + // Process the packet using the node processor + node_processor.lock().await.process_packet( + &sender_id, + &node_id, + &hash_map, + Arc::clone(&session), + &format!("{}/{}/outputs", pipeline_job_id, node_id.clone()), + &namespace, + &namespace_lookup, + )?; + } + NodeOutput::ProcessingCompleted(sender_id) => { + // Notify the processor that the parent node has completed processing + if node_processor + .lock() + .await + .mark_parent_as_complete(&sender_id) + .await + { + // This was the last parent, thus we need to send the processing complete message + let output_key_exp = + format!("{pipeline_job_id}/{node_id}/outputs/{SUCCESS_KEY_EXP}"); + session + .put( + output_key_exp, + encode_to_vec( + NodeOutput::ProcessingCompleted(node_id.clone()), + config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; } } } + + // Process the message based on its type } + Ok::<(), OrcaError>(()) + } - Ok(()) + async fn start_stop_request_task( + node_processor: Arc>>, + pipeline_run_id: String, + session: Arc, + ) -> Result<()> { + let subscriber = session + .declare_subscriber(pipeline_run_id.clone() + "/stop") + .await + .context(selector::AgentCommunicationFailure {})?; + while subscriber.recv_async().await.is_ok() { + // Received a requst to stop, therefore we need to tell the node_processor to shutdown + node_processor.lock().await.stop(); + } + Ok::<(), OrcaError>(()) } } @@ -294,9 +520,10 @@ impl DockerPipelineRunner { /// This trait defines the methods that all node processors should implement /// /// Main purpose was to reduce the amount of code duplication between different node processors -/// As a result, each processor only needs to worry about writing their own function to process the msg. -trait NodeProcessor { - async fn process_packet( +/// As a result, each processor only needs to worry about writing their own function to process the msg +#[async_trait] +trait NodeProcessor: Send + Sync { + fn process_packet( &mut self, sender_node_id: &str, node_id: &str, @@ -307,15 +534,25 @@ trait NodeProcessor { namespace_lookup: &HashMap, ) -> Result<()>; - async fn wait_for_node_task_completion(&mut self) -> Result<()>; + /// Notifies the processor that the parent node has completed processing + /// If the parent node was the last one to complete, this function will wait till all task are done + /// and send the node processing complete message then return. + /// + /// Otherwise it will return immediately + /// + /// # Returns + /// true if the parent node was the last one to complete processing, user send + /// the processing completion message to the output + /// + /// false if there are still other parent nodes that need to complete processing + async fn mark_parent_as_complete(&mut self, parent_node_id: &str) -> bool; - fn stop(&mut self) -> Result<()>; + fn stop(&mut self); } /// Processor for Pods /// Currently missing implementation to call agents for actual pod processing struct PodProcessor { - session: zenoh::Session, pod: Arc, processing_tasks: JoinSet>, } @@ -323,87 +560,20 @@ struct PodProcessor { impl PodProcessor { fn new(pod: Arc) -> Self { Self { - session: zenoh::Session::default(), pod, processing_tasks: JoinSet::new(), } } - - /// Actual logic of processing a packet using the pod - /// At the moment it does a simulation of pod execution - async fn process_packet( - _sender_node_id: &str, - node_id: String, - pod: Arc, - namespace: String, - namespace_lookup: HashMap, - packet: HashMap, - success_chs_tx: Vec>, - ) -> Result<()> { - // Process the packet using the pod - // Create the pod_job - - // We need a unique hash for this given input packet process by the node - // therefore we need to generate a hash that has the pod_id + input_packet - let node_id_bytes = node_id.as_bytes().to_vec(); - let packet_copy = packet.clone(); - let input_packet_hash = spawn_blocking(move || { - let mut buf = node_id_bytes; - let mut serializer = Serializer::new(&mut buf); - serialize_hashmap(&packet_copy, &mut serializer)?; - Ok::<_, OrcaError>(hash_buffer(buf)) - }) - .await??; - let output_dir = URI { - namespace: namespace.clone(), - path: PathBuf::from(format!("pod_runs/{}/{}", pod.hash, input_packet_hash)), - }; - - let cpu_limit = pod.recommended_cpus; - let memory_limit = pod.recommended_memory; - - // Create the pod job - let pod_job = PodJob::new( - None, - Arc::clone(&pod), - packet.clone(), - output_dir, - cpu_limit, - memory_limit, - None, - &namespace_lookup, - )?; - - // Simulate pod execution by just printing out pod_job_hash and pod hash - // This will be replaced by sending the pod_job to the orchestrator via the agent - println!( - "Simulating Executing pod job: {} with pod hash: {}", - pod_job.hash, pod_job.pod.hash - ); - - #[expect( - clippy::unwrap_used, - reason = "Hard code for now, will be replaced by agent" - )] - // Build the output_packet - let output_packet = pod - .output_spec - .keys() - .map(|output_key| (output_key.clone(), packet.values().next().cloned().unwrap())) - .collect::>(); - - // For now we will just send the input_packet to the success channel - try_join_all(success_chs_tx.iter().map(|success_ch_tx| { - success_ch_tx.send(Message::NodeOutput(node_id.clone(), output_packet.clone())) - })) - .await?; - - Ok(()) - } } +#[async_trait] impl NodeProcessor for PodProcessor { - async fn process_packet( + #[expect( + clippy::unwrap_used, + clippy::unwrap_in_result, + reason = "Hard code for now, will be replaced by agent" + )] + fn process_packet( &mut self, _sender_node_id: &str, node_id: &str, @@ -443,17 +613,14 @@ impl NodeProcessor for PodProcessor { cpu_limit, memory_limit, None, - &namespace_lookup, + namespace_lookup, )?; // Simulate pod execution by just printing out pod_job_hash and pod hash // This will be replaced by sending the pod_job to the orchestrator via the agent // Build the output_packet, in reality, this will be extracted from the pod_result - #[expect( - clippy::unwrap_used, - reason = "Hard code for now, will be replaced by agent" - )] + let output_packet = self .pod .output_spec @@ -473,9 +640,9 @@ impl NodeProcessor for PodProcessor { session .put( output_key_exp_clone + SUCCESS_KEY_EXP, - bincode::serde::encode_to_vec( - &Message::NodeOutput(node_id_clone, output_packet), - bincode::config::standard(), + encode_to_vec( + NodeOutput::Packet(node_id_clone, output_packet), + config::standard(), )?, ) .await @@ -484,17 +651,21 @@ impl NodeProcessor for PodProcessor { Ok(()) }); + println!("Successfully started processor for node: {}", node_id); Ok(()) } - async fn wait_for_node_task_completion(&mut self) -> Result<()> { - while self.processing_tasks.join_next().await.is_some() {} - Ok(()) + async fn mark_parent_as_complete(&mut self, _parent_node_id: &str) -> bool { + // For pod we only have one parent, thus execute the exit case + while (self.processing_tasks.join_next().await).is_some() { + // Wait for all tasks to complete + } + + true } - fn stop(&mut self) -> Result<()> { + fn stop(&mut self) { self.processing_tasks.abort_all(); - Ok(()) } } @@ -502,16 +673,21 @@ impl NodeProcessor for PodProcessor { /// This processor renames the `input_keys` from the input packet to the `output_keys` defined by the map struct MapperProcessor { mapper: Arc, + processing_tasks: JoinSet>, } impl MapperProcessor { - const fn new(mapper: Arc) -> Self { - Self { mapper } + fn new(mapper: Arc) -> Self { + Self { + mapper, + processing_tasks: JoinSet::new(), + } } } +#[async_trait] impl NodeProcessor for MapperProcessor { - async fn process_packet( + fn process_packet( &mut self, _sender_node_id: &str, node_id: &str, @@ -521,40 +697,71 @@ impl NodeProcessor for MapperProcessor { _namespace: &str, _namespace_lookup: &HashMap, ) -> Result<()> { - // Apply the mapping to the input packet - let output_map = self - .mapper - .mapping - .iter() - .map(|(input_key, output_key)| { - let input = get(&packet, input_key)?.clone(); - Ok((output_key.to_owned(), input)) - }) - .collect::>>()?; + let mapping = self.mapper.mapping.clone(); + let packet_clone = packet.clone(); + let node_id_clone = node_id.to_owned(); + let output_key_exp_clone = output_key_exp.to_owned(); - // Send the packet outwards - session - .put( - output_key_exp, - bincode::serde::encode_to_vec( - &Message::NodeOutput(node_id.to_owned(), output_map), - bincode::config::standard(), - )?, - ) - .await - .context(selector::AgentCommunicationFailure {})?; + self.processing_tasks.spawn(async move { + let result = { + // Apply the mapping to the input packet + let output_map = mapping + .iter() + .map(|(input_key, output_key)| { + let input = get(&packet_clone, input_key)?.clone(); + Ok((output_key.to_owned(), input)) + }) + .collect::>>()?; + + // Send the packet outwards + session + .put( + output_key_exp_clone.clone() + SUCCESS_KEY_EXP, + encode_to_vec( + NodeOutput::Packet(node_id_clone.clone(), output_map), + config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + Ok::<(), OrcaError>(()) + }; + + if let Err(err) = result { + // If there was an error, we send it to the failure channel + session + .put( + output_key_exp_clone + FAILURE_KEY_EXP, + encode_to_vec( + &ProcessingFailure { + node_id: node_id_clone, + error: err.to_string(), + }, + config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + } + + Ok(()) + }); + println!("Successfully started processor for node: {}", node_id); Ok(()) } - async fn wait_for_node_task_completion(&mut self) -> Result<()> { - // All mappers tasks are synchronous, so we don't need to wait for anything - Ok(()) + async fn mark_parent_as_complete(&mut self, _parent_node_id: &str) -> bool { + // For mapper we only have one parent, thus execute the exit case + while (self.processing_tasks.join_next().await).is_some() { + // Wait for all tasks to complete + } + + true } - fn stop(&mut self) -> Result<()> { - // Mappers do not have any state to stop, so we can just return Ok - Ok(()) + fn stop(&mut self) { + self.processing_tasks.abort_all(); } } @@ -565,7 +772,6 @@ struct JoinerProcessor { /// Cache for all packets received by the node input_packet_cache: HashMap>>, completed_parents: Vec, - initial_computation_completed: bool, processing_tasks: JoinSet>, } @@ -578,16 +784,15 @@ impl JoinerProcessor { Self { input_packet_cache, completed_parents: Vec::new(), - initial_computation_completed: false, processing_tasks: JoinSet::new(), } } fn compute_cartesian_product( - factors: &Vec>>, + factors: &[Vec>], ) -> Vec> { factors - .into_iter() + .iter() .multi_cartesian_product() .map(|packets_to_combined| { packets_to_combined @@ -601,8 +806,9 @@ impl JoinerProcessor { } } +#[async_trait] impl NodeProcessor for JoinerProcessor { - async fn process_packet( + fn process_packet( &mut self, sender_node_id: &str, node_id: &str, @@ -629,7 +835,7 @@ impl NodeProcessor for JoinerProcessor { // Build the factors of the product as owned values to avoid lifetime issues let mut factors = other_parent_ids - .map(|id| get(&self.input_packet_cache, id).map(|v| v.clone())) + .map(|id| get(&self.input_packet_cache, id).cloned()) .collect::>>()?; // Add the new packet as a factor @@ -650,7 +856,7 @@ impl NodeProcessor for JoinerProcessor { .put( output_key_exp_clone.clone() + SUCCESS_KEY_EXP, encode_to_vec( - &Message::NodeOutput(node_id_clone.clone(), output_packet), + NodeOutput::Packet(node_id_clone.clone(), output_packet), config::standard(), )?, ) @@ -665,10 +871,10 @@ impl NodeProcessor for JoinerProcessor { .put( output_key_exp_clone.clone() + FAILURE_KEY_EXP, encode_to_vec( - &Message::NodeProcessingFailure( - node_id_clone.clone(), - err.to_string(), - ), + &ProcessingFailure { + node_id: node_id_clone.clone(), + error: err.to_string(), + }, config::standard(), )?, ) @@ -680,19 +886,30 @@ impl NodeProcessor for JoinerProcessor { Ok(()) }); } + println!("Successfully started processor for node: {}", node_id); Ok(()) } - async fn wait_for_node_task_completion(&mut self) -> Result<()> { - // We must wait for all joiner processing task to complete - while self.processing_tasks.join_next().await.is_some() {} - Ok(()) + async fn mark_parent_as_complete(&mut self, _parent_node_id: &str) -> bool { + // For Joiner, we need to determine if all parents are complete, if so then wait for task to complete + // before returning true + self.completed_parents.push(_parent_node_id.to_owned()); + + // If we have all parents completed, we can wait for the tasks to complete + if self.completed_parents.len() == self.input_packet_cache.len() { + while (self.processing_tasks.join_next().await).is_some() { + // Wait for all tasks to complete + } + return true; + } + + // If not all parents are completed, we return false + false } - fn stop(&mut self) -> Result<()> { + fn stop(&mut self) { // We want to abort any computation self.processing_tasks.abort_all(); - Ok(()) } } @@ -700,82 +917,63 @@ impl NodeProcessor for JoinerProcessor { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] #[expect(clippy::panic_in_result_fn, reason = "Unit test")] async fn joiner() -> Result<()> { - // Create a fake mpsc channel for the node - let (_, node_rx) = mpsc::channel::(128); - - // Create a child mpsc - let (child_tx, mut child_rx) = mpsc::channel::(128); - - let node_metadata = NodeMetaData { - node_id: "joiner_node".to_owned(), - node_rx, - child_nodes_txs: vec![child_tx], - namespace: "test".to_owned(), - namespace_lookup: HashMap::new(), - }; - - let mut joiner_process = JoinerProcessor::new( - vec!["0".to_owned(), "1".to_owned(), "2".to_owned()], - node_metadata, - ); - - // Make each parent has 1 packet - for idx in 0..2 { - joiner_process - .process_packet( - &format!("{idx}"), - make_test_packet("data_1.txt".to_owned().into()), - ) - .await?; - } - - // Confirm that there should be no output yet - - // Now we send the missing parent package - // This will yield one unique combination - joiner_process - .process_packet("2", make_test_packet("data_1.txt".to_owned().into())) - .await?; - - // Confirm that the output is sent to the child channel - assert!( - child_rx.len() == 1, - "Should have only one message in the channel", - ); - assert!( - child_rx.recv().await.is_some(), - "Should have received a message" - ); - - // Insert another one - joiner_process - .process_packet("2", make_test_packet("data_2.txt".to_owned().into())) - .await?; - - // The joiner node should send another one - assert!( - child_rx.len() == 1, - "Should have only one message in the channel", - ); - assert!( - child_rx.recv().await.is_some(), - "Should have received a message" - ); - - // Now insert to packet for parent 0, which should yield 2 packets in total - // This is because of the cartesian product - joiner_process - .process_packet("0", make_test_packet("data_2.txt".to_owned().into())) - .await?; - - assert!( - child_rx.len() == 2, - "Should have only two messages in the channel", - ); - assert!( - child_rx.recv().await.is_some(), - "Should have received a message" - ); + // let parent_ids = vec!["0".to_owned(), "1".to_owned(), "2".to_owned()]; + + // let mut joiner_process = JoinerProcessor::new(parent_ids); + + // // Make each parent has 1 packet + // for idx in 0..2 { + // let packet = make_test_packet(format!("data_{idx}.txt").into()); + // joiner_process.process_packet(idx, "joiner", packet, session, output_key_exp, namespace, namespace_lookup); + // } + + // // Confirm that there should be no output yet + + // // Now we send the missing parent package + // // This will yield one unique combination + // joiner_process + // .process_packet("2", make_test_packet("data_1.txt".to_owned().into())) + // .await?; + + // // Confirm that the output is sent to the child channel + // assert!( + // child_rx.len() == 1, + // "Should have only one message in the channel", + // ); + // assert!( + // child_rx.recv().await.is_some(), + // "Should have received a message" + // ); + + // // Insert another one + // joiner_process + // .process_packet("2", make_test_packet("data_2.txt".to_owned().into())) + // .await?; + + // // The joiner node should send another one + // assert!( + // child_rx.len() == 1, + // "Should have only one message in the channel", + // ); + // assert!( + // child_rx.recv().await.is_some(), + // "Should have received a message" + // ); + + // // Now insert to packet for parent 0, which should yield 2 packets in total + // // This is because of the cartesian product + // joiner_process + // .process_packet("0", make_test_packet("data_2.txt".to_owned().into())) + // .await?; + + // assert!( + // child_rx.len() == 2, + // "Should have only two messages in the channel", + // ); + // assert!( + // child_rx.recv().await.is_some(), + // "Should have received a message" + // ); Ok(()) } diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 054a8f2e..7354a542 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -9,14 +9,42 @@ pub mod fixture; use std::collections::HashMap; use orcapod::uniffi::{error::Result, pipeline_runner::runner::DockerPipelineRunner}; +use snafu::ResultExt; +use tokio::time::sleep; use crate::fixture::TestDirs; use fixture::pipeline_job; -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 32)] async fn basic_run() -> Result<()> { let pipeline_job = pipeline_job()?; + // Create zenoh to monitor the node ready message + let zenoh = zenoh::open(zenoh::Config::default()).await.unwrap(); // Replace with the correct error variant if needed + let sub = zenoh.declare_subscriber("*/*/status/ready").await.unwrap(); + + tokio::spawn({ + async move { + // Receive loop ready, publish ready message + zenoh.put("ready", vec![]).await.unwrap(); + println!("Listening for messages..."); + loop { + match sub.recv_async().await { + Ok(msg) => { + println!( + "Received message: {:?}", + msg.payload().try_to_string().unwrap() + ); + } + Err(_) => todo!(), + } + } + } + }); + + // Wait for the zenoh subscriber to be ready + sleep(std::time::Duration::from_secs(5)).await; + // Create the runner let mut runner = DockerPipelineRunner::new(); @@ -26,30 +54,38 @@ async fn basic_run() -> Result<()> { )]))?; let namespace_lookup = test_dirs.namespace_lookup(); - let pipeline_run = runner.start(pipeline_job, &namespace_lookup).await?; + let pipeline_run = runner + .start(pipeline_job, "default", &namespace_lookup) + .await?; + sleep(std::time::Duration::from_secs(5)).await; + panic!(); // Wait for the pipeline run to complete - runner.get_result(&pipeline_run).await?; + let pipeline_result = runner.get_result(&pipeline_run).await?; + println!("{:?}", pipeline_result.output_packets); + Ok(()) } -#[tokio::test(flavor = "multi_thread", worker_threads = 4)] -async fn stop() -> Result<()> { - let pipeline_job = pipeline_job()?; +// #[tokio::test(flavor = "multi_thread", worker_threads = 4)] +// async fn stop() -> Result<()> { +// let pipeline_job = pipeline_job()?; - // Create the runner - let mut runner = DockerPipelineRunner::new(); +// // Create the runner +// let mut runner = DockerPipelineRunner::new(); - let test_dirs = TestDirs::new(&HashMap::from([( - "default".to_owned(), - Some("./tests/extra/data/"), - )]))?; - let namespace_lookup = test_dirs.namespace_lookup(); +// let test_dirs = TestDirs::new(&HashMap::from([( +// "default".to_owned(), +// Some("./tests/extra/data/"), +// )]))?; +// let namespace_lookup = test_dirs.namespace_lookup(); - let pipeline_run = runner.start(pipeline_job, &namespace_lookup).await?; +// let pipeline_run = runner +// .start(pipeline_job, "default", &namespace_lookup) +// .await?; - // Abort the pipeline run - runner.stop(&pipeline_run).await?; +// // Abort the pipeline run +// runner.stop(&pipeline_run).await?; - Ok(()) -} +// Ok(()) +// } From 31f925e9ab8851448db9851b97c3f8cb17854b0d Mon Sep 17 00:00:00 2001 From: synicix Date: Sun, 20 Jul 2025 16:27:42 +0000 Subject: [PATCH 18/29] Save progress --- src/uniffi/pipeline_runner/runner.rs | 12 +++++- tests/pipeline_runner.rs | 57 +++++++++++++++++----------- 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index a1ddd3f9..a73abdeb 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -155,6 +155,7 @@ impl DockerPipelineRunner { .context(selector::AgentCommunicationFailure {})?; // For each node, we will create call create_node_processing_task + println!("Num of node indices: {}", graph.node_count()); for node_idx in graph.node_indices() { let node = &graph[node_idx]; @@ -348,6 +349,8 @@ impl DockerPipelineRunner { namespace: String, namespace_lookup: HashMap, ) -> Result<()> { + // Print out node id for debugging + println!("Creating processing task for node: {}", node.id); // Create the correct processor for the node based on the kernel type let node_processor: Arc>> = Arc::new(Mutex::new( match get(&pipeline.kernel_lut, &node.kernel_hash)? { @@ -378,6 +381,7 @@ impl DockerPipelineRunner { let mut key_exps_to_subscribe_to = pipeline .get_parents_for_node(&node) .map(|parent_node| { + println!("Setting up listener for parent node: {}", parent_node.id); format!( "{pipeline_job_id}/{}/outputs/{SUCCESS_KEY_EXP}", parent_node.id @@ -443,9 +447,15 @@ impl DockerPipelineRunner { if let Err(err) = result { eprintln!("Failed to send ready message for node {}: {}", node_id, err); } else { - println!("Ready message sent for node {}", node_id); + println!( + "Ready message sent for node {}, with key exp {}", + node_id, + format!("{pipeline_job_id}/{node_id}/status/ready") + ); } + println!("Listening for messages on node: {}", node_id); + while let Ok(payload) = subscriber.recv_async().await { // Extract the message from the payload diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 7354a542..a9f0238c 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -21,12 +21,11 @@ async fn basic_run() -> Result<()> { // Create zenoh to monitor the node ready message let zenoh = zenoh::open(zenoh::Config::default()).await.unwrap(); // Replace with the correct error variant if needed - let sub = zenoh.declare_subscriber("*/*/status/ready").await.unwrap(); + let sub = zenoh.declare_subscriber("**").await.unwrap(); tokio::spawn({ async move { // Receive loop ready, publish ready message - zenoh.put("ready", vec![]).await.unwrap(); println!("Listening for messages..."); loop { match sub.recv_async().await { @@ -36,33 +35,45 @@ async fn basic_run() -> Result<()> { msg.payload().try_to_string().unwrap() ); } - Err(_) => todo!(), + Err(err) => println!("Error receiving message: {}", err), } } } }); - // Wait for the zenoh subscriber to be ready - sleep(std::time::Duration::from_secs(5)).await; - - // Create the runner - let mut runner = DockerPipelineRunner::new(); - - let test_dirs = TestDirs::new(&HashMap::from([( - "default".to_owned(), - Some("./tests/extra/data/"), - )]))?; - let namespace_lookup = test_dirs.namespace_lookup(); - - let pipeline_run = runner - .start(pipeline_job, "default", &namespace_lookup) - .await?; + let zenoh2 = zenoh::open(zenoh::Config::default()).await.unwrap(); + + let joiner = tokio::spawn(async move { + sleep(tokio::time::Duration::from_secs(2)).await; + // Send a bunch of messsage to the channel + for i in 0..10 { + zenoh2 + .put(format!("test/{}", i), format!("message {}", i).as_bytes()) + .await + .unwrap(); + println!("Sent message {}", i); + } + }); - sleep(std::time::Duration::from_secs(5)).await; - panic!(); - // Wait for the pipeline run to complete - let pipeline_result = runner.get_result(&pipeline_run).await?; - println!("{:?}", pipeline_result.output_packets); + joiner.await.unwrap(); + // // Create the runner + // let mut runner = DockerPipelineRunner::new(); + + // let test_dirs = TestDirs::new(&HashMap::from([( + // "default".to_owned(), + // Some("./tests/extra/data/"), + // )]))?; + // let namespace_lookup = test_dirs.namespace_lookup(); + + // let pipeline_run = runner + // .start(pipeline_job, "default", &namespace_lookup) + // .await?; + + // sleep(std::time::Duration::from_secs(5)).await; + // panic!(); + // // Wait for the pipeline run to complete + // let pipeline_result = runner.get_result(&pipeline_run).await?; + // println!("{:?}", pipeline_result.output_packets); Ok(()) } From b369549fdd2d35bf25622224c2880c88e656a953 Mon Sep 17 00:00:00 2001 From: Synicix Date: Mon, 21 Jul 2025 01:56:49 +0000 Subject: [PATCH 19/29] Save progress --- src/uniffi/pipeline_runner/runner.rs | 223 ++++++++++++++++----------- tests/pipeline_runner.rs | 58 +++---- 2 files changed, 157 insertions(+), 124 deletions(-) diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index a73abdeb..c43a5799 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -2,13 +2,13 @@ use crate::{ core::{crypto::hash_buffer, model::serialize_hashmap, util::get}, uniffi::{ error::{OrcaError, Result, selector}, - model::{PathSet, Pod, PodJob, URI}, + model::{Blob, BlobKind, PathSet, Pod, PodJob, URI}, pipeline::{Kernel, Mapper, Node, Pipeline, PipelineJob, PipelineResult}, }, }; use async_trait::async_trait; use bincode::{ - config, + config, de, serde::{decode_from_slice, encode_to_vec}, }; use itertools::Itertools as _; @@ -49,9 +49,9 @@ struct ProcessingFailure { reason = "too complex, but necessary for async handling" )] #[derive(Debug)] -pub struct PipelineRun { +struct PipelineRun { /// `PipelineJob` that this run is associated with - pub pipeline_job: PipelineJob, // The pipeline job that this run is associated with + pipeline_job: PipelineJob, // The pipeline job that this run is associated with node_tasks: JoinSet>, // JoinSet of tasks for each node in the pipeline outputs: Arc>>>>, // String is the node key, while hash } @@ -104,28 +104,8 @@ impl DockerPipelineRunner { Self::default() } - /** - Start the `pipeline_job` returning `pipeline_run` - - Algorithm: - 1. Create a new `PipelineRun` from the `pipeline_job` - 2. Insert the `PipelineRun` into the `pipeline_runs` map - 3. Create an output channel to capture the outputs of the nodes - (This will be given to the output capture task) - 4. Create a task that captures the outputs form nodes and stores them in the `outputs` map - This is done via listening the channel and acting like a final node in the pipeline - 5. Get the root nodes of the pipeline and call `create_task_for_node` for each root node - This will recursively BFS through the pipeline and create tasks for each node - (More detail in that function) - 6. Using the `root_nodes` txs, we will send all inputs to that channel. - This will start the pipeline execution - 7. Upon sending all the inputs, we will send node complete message - signifying that the `input_node` is done - 8. Return the `PipelineRun` which can be used to get the results later - - # Errors - Will error out if the pipeline job fails to start - */ + /// # Errors + /// Will error out if the pipeline job fails to start pub async fn start( &mut self, pipeline_job: PipelineJob, @@ -145,9 +125,11 @@ impl DockerPipelineRunner { let graph = &pipeline_run.pipeline_job.pipeline.graph; // Create the subscriber to listen to node ready status before sending inputs - let session = zenoh::open(zenoh::Config::default()) - .await - .context(selector::AgentCommunicationFailure {})?; + let session = Arc::new( + zenoh::open(zenoh::Config::default()) + .await + .context(selector::AgentCommunicationFailure {})?, + ); let subscriber = session .declare_subscriber(format!("{pipeline_job_hash}/*/status/ready")) @@ -155,19 +137,19 @@ impl DockerPipelineRunner { .context(selector::AgentCommunicationFailure {})?; // For each node, we will create call create_node_processing_task - println!("Num of node indices: {}", graph.node_count()); for node_idx in graph.node_indices() { let node = &graph[node_idx]; // Spawn the task pipeline_run .node_tasks - .spawn(Self::create_node_processing_task( + .spawn(Self::spawn_node_processing_task( node.clone(), pipeline_run.pipeline_job.pipeline.clone(), pipeline_job_hash.clone(), namespace.to_owned(), namespace_lookup.clone(), + Arc::clone(&session), )); } @@ -180,46 +162,82 @@ impl DockerPipelineRunner { node.id.clone(), pipeline_run.pipeline_job.hash.clone(), Arc::clone(&pipeline_run.outputs), + Arc::clone(&session), )); } let num_of_nodes = graph.node_count(); - println!("Waiting for {num_of_nodes} nodes to be ready"); let mut ready_nodes = 0; // Wait for all nodes to be ready before sending inputs while (subscriber.recv_async().await).is_ok() { // Message is empty, just increment the counter ready_nodes += 1; - println!("number of ready nodes: {ready_nodes}"); if ready_nodes == num_of_nodes { break; // All nodes are ready, we can start sending inputs } } - println!( - "All nodes are ready, starting pipeline run: {}", - pipeline_job_hash - ); + // Submit the input_packets to the correct key_exp + let input_node_key_exp = format!("{pipeline_job_hash}/{INPUT_KEY_EXP}"); + for packet in &pipeline_run.pipeline_job.input_packets { + // Send the packet to the input node key_exp + let payload_encoded = encode_to_vec( + PathSet::Unary(Blob { + kind: BlobKind::File, + location: URI { + namespace: "asdfasdf".to_owned(), + path: "asdfasdf".into(), + }, + checksum: "".to_owned(), + }), + config::standard(), + )?; + + let (decoded_packet, _): (PathSet, usize) = + decode_from_slice(&payload_encoded, config::standard())?; + println!("decoded packet: {:?}", decoded_packet); + println!( + "Payload bytes: {:?}", + encode_to_vec( + NodeOutput::Packet("input_node".to_owned(), packet.clone()), + config::standard(), + )? + ); + session + .put( + &input_node_key_exp, + encode_to_vec( + NodeOutput::Packet("input_node".to_owned(), packet.clone()), + config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + } - // // Submit the input_packets to the correct key_exp - // for packet in &pipeline_run.pipeline_job.input_packets { - // println!("Sending packet"); - // // Send the packet to the input node key_exp - // session - // .put( - // format!("{pipeline_job_hash}/{INPUT_KEY_EXP}"), - // encode_to_vec(packet, config::standard())?, - // ) - // .await - // .context(selector::AgentCommunicationFailure {})?; - // } + // Send the complete processing message for the input node + session + .put( + input_node_key_exp, + encode_to_vec( + NodeOutput::ProcessingCompleted("input_node".to_owned()), + config::standard(), + )?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; // Insert into the list of pipeline runs self.pipeline_runs .insert(pipeline_job_hash.clone(), pipeline_run); + println!( + "Pipeline run started with id: {} and hash: {}", + pipeline_job_hash, pipeline_job_hash + ); + Ok(pipeline_job_hash) } @@ -237,8 +255,12 @@ impl DockerPipelineRunner { key: pipeline_run_id.to_owned(), })?; + println!("len of node_tasks: {}", pipeline_run.node_tasks.len()); + println!("Join set {:?}", pipeline_run.node_tasks); // Wait for all the tasks to complete while let Some(result) = pipeline_run.node_tasks.join_next().await { + println!("Join set {:?}", pipeline_run.node_tasks); + println!("Task completed, result: {:?}", result); match result { Ok(Ok(())) => {} // Task completed successfully Ok(Err(err)) => { @@ -250,6 +272,8 @@ impl DockerPipelineRunner { return Err(err.into()); } } + pipeline_run.node_tasks.abort_all(); + panic!(); } Ok(PipelineResult { @@ -294,11 +318,9 @@ impl DockerPipelineRunner { node_id: String, pipeline_run_id: String, outputs: Arc>>>>, + session: Arc, ) -> Result<()> { // Create a zenoh session - let session = zenoh::open(zenoh::Config::default()) - .await - .context(selector::AgentCommunicationFailure {})?; let subscriber = session .declare_subscriber(format!( "{pipeline_run_id}/{node_id}/outputs/{SUCCESS_KEY_EXP}" @@ -324,11 +346,15 @@ impl DockerPipelineRunner { .push(hash_map); } NodeOutput::ProcessingCompleted(_) => { - // Handle processing completed message if needed + // Processing is completed, thus we can exit this task + break; } } } + // Print exit message + println!("Capture task for node {} completed.", node_id); + Ok(()) } @@ -342,15 +368,14 @@ impl DockerPipelineRunner { /// /// # Errors /// Will error out if the kernel for the node is not found or if the - async fn create_node_processing_task( + async fn spawn_node_processing_task( node: Node, pipeline: Pipeline, pipeline_job_id: String, namespace: String, namespace_lookup: HashMap, + session: Arc, ) -> Result<()> { - // Print out node id for debugging - println!("Creating processing task for node: {}", node.id); // Create the correct processor for the node based on the kernel type let node_processor: Arc>> = Arc::new(Mutex::new( match get(&pipeline.kernel_lut, &node.kernel_hash)? { @@ -367,13 +392,6 @@ impl DockerPipelineRunner { }, )); - // Create the zenoh session - let session = Arc::new( - zenoh::open(zenoh::Config::default()) - .await - .context(selector::AgentCommunicationFailure {})?, - ); - // Create a joinset to spawn and handle incoming messages tasks let mut listener_tasks = JoinSet::new(); @@ -381,7 +399,6 @@ impl DockerPipelineRunner { let mut key_exps_to_subscribe_to = pipeline .get_parents_for_node(&node) .map(|parent_node| { - println!("Setting up listener for parent node: {}", parent_node.id); format!( "{pipeline_job_id}/{}/outputs/{SUCCESS_KEY_EXP}", parent_node.id @@ -395,7 +412,7 @@ impl DockerPipelineRunner { } // Create a subscriber for each of the parent nodes (Should only be 1, unless it is a joiner node) - for key_exp in key_exps_to_subscribe_to { + for key_exp in &key_exps_to_subscribe_to { let subscriber = session .declare_subscriber(key_exp) .await @@ -419,6 +436,36 @@ impl DockerPipelineRunner { Arc::clone(&session), )); + // Wait for all tasks to be spawned and reply with ready message + // This is to ensure that the pipeline run knows when all tasks are ready to receive inputs + + let mut num_of_ready_subcribers: usize = 0; + // Build the subscriber + let status_subscriber = session + .declare_subscriber(format!( + "{pipeline_job_id}/{}/subscriber/status/ready", + node.id + )) + .await + .context(selector::AgentCommunicationFailure {})?; + + while status_subscriber.recv_async().await.is_ok() { + num_of_ready_subcribers += 1; + if num_of_ready_subcribers == key_exps_to_subscribe_to.len() { + // +1 for the stop request task + break; // All tasks are ready, we can start sending inputs + } + } + + // Send a ready message so the pipeline knows when to start sending inputs + session + .put( + format!("{pipeline_job_id}/{}/status/ready", node.id), + &node.id, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + // Wait for all task to complete listener_tasks.join_all().await; @@ -434,34 +481,33 @@ impl DockerPipelineRunner { namespace_lookup: HashMap, session: Arc, ) -> Result<()> { - // Send a ready message so the pipeline knows when to start sending inputs - let result = session + // We do not know when tokio will start executing this task, therefore we need to send a ready message + // back to our spawner task + session .put( - format!("{pipeline_job_id}/{node_id}/status/ready"), + format!("{pipeline_job_id}/{node_id}/subscriber/status/ready"), &node_id, ) .await - .context(selector::AgentCommunicationFailure {}); + .context(selector::AgentCommunicationFailure {})?; - // Print out if the ready message was sent successfully - if let Err(err) = result { - eprintln!("Failed to send ready message for node {}: {}", node_id, err); - } else { + while let Ok(payload) = subscriber.recv_async().await { + // Extract the message from the payload println!( - "Ready message sent for node {}, with key exp {}", + "Received message for node {}: {:?}", node_id, - format!("{pipeline_job_id}/{node_id}/status/ready") + payload.payload().to_bytes() ); - } - - println!("Listening for messages on node: {}", node_id); - - while let Ok(payload) = subscriber.recv_async().await { - // Extract the message from the payload let (msg, _): (NodeOutput, usize) = - decode_from_slice(&payload.payload().to_bytes(), config::standard())?; - println!("Received message for node {}: {:?}", node_id, msg); + match decode_from_slice(&payload.payload().to_bytes(), config::standard()) { + Ok(msg) => msg, + Err(err) => { + eprintln!("Failed to decode message: {err}"); + panic!("Failed to decode message: {err}"); + } + }; + match msg { NodeOutput::Packet(sender_id, hash_map) => { println!( @@ -481,6 +527,10 @@ impl DockerPipelineRunner { } NodeOutput::ProcessingCompleted(sender_id) => { // Notify the processor that the parent node has completed processing + println!( + "Received processing completed message for node {}", + sender_id + ); if node_processor .lock() .await @@ -500,12 +550,12 @@ impl DockerPipelineRunner { ) .await .context(selector::AgentCommunicationFailure {})?; + break; } } } - - // Process the message based on its type } + Ok::<(), OrcaError>(()) } @@ -756,8 +806,6 @@ impl NodeProcessor for MapperProcessor { Ok(()) }); - - println!("Successfully started processor for node: {}", node_id); Ok(()) } @@ -896,7 +944,6 @@ impl NodeProcessor for JoinerProcessor { Ok(()) }); } - println!("Successfully started processor for node: {}", node_id); Ok(()) } diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index a9f0238c..5492c13a 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -10,21 +10,21 @@ use std::collections::HashMap; use orcapod::uniffi::{error::Result, pipeline_runner::runner::DockerPipelineRunner}; use snafu::ResultExt; -use tokio::time::sleep; +use tokio::{task::JoinSet, time::sleep}; use crate::fixture::TestDirs; use fixture::pipeline_job; -#[tokio::test(flavor = "multi_thread", worker_threads = 32)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn basic_run() -> Result<()> { let pipeline_job = pipeline_job()?; // Create zenoh to monitor the node ready message let zenoh = zenoh::open(zenoh::Config::default()).await.unwrap(); // Replace with the correct error variant if needed - let sub = zenoh.declare_subscriber("**").await.unwrap(); tokio::spawn({ async move { + let sub = zenoh.declare_subscriber("**").await.unwrap(); // Receive loop ready, publish ready message println!("Listening for messages..."); loop { @@ -35,45 +35,31 @@ async fn basic_run() -> Result<()> { msg.payload().try_to_string().unwrap() ); } - Err(err) => println!("Error receiving message: {}", err), + Err(err) => { + println!("Error receiving message: {}", err); + break; + } } } } }); - let zenoh2 = zenoh::open(zenoh::Config::default()).await.unwrap(); - - let joiner = tokio::spawn(async move { - sleep(tokio::time::Duration::from_secs(2)).await; - // Send a bunch of messsage to the channel - for i in 0..10 { - zenoh2 - .put(format!("test/{}", i), format!("message {}", i).as_bytes()) - .await - .unwrap(); - println!("Sent message {}", i); - } - }); + // Create the runner + let mut runner = DockerPipelineRunner::new(); + + let test_dirs = TestDirs::new(&HashMap::from([( + "default".to_owned(), + Some("./tests/extra/data/"), + )]))?; + let namespace_lookup = test_dirs.namespace_lookup(); + + let pipeline_run = runner + .start(pipeline_job, "default", &namespace_lookup) + .await?; - joiner.await.unwrap(); - // // Create the runner - // let mut runner = DockerPipelineRunner::new(); - - // let test_dirs = TestDirs::new(&HashMap::from([( - // "default".to_owned(), - // Some("./tests/extra/data/"), - // )]))?; - // let namespace_lookup = test_dirs.namespace_lookup(); - - // let pipeline_run = runner - // .start(pipeline_job, "default", &namespace_lookup) - // .await?; - - // sleep(std::time::Duration::from_secs(5)).await; - // panic!(); - // // Wait for the pipeline run to complete - // let pipeline_result = runner.get_result(&pipeline_run).await?; - // println!("{:?}", pipeline_result.output_packets); + // Wait for the pipeline run to complete + let pipeline_result = runner.get_result(&pipeline_run).await?; + println!("{:?}", pipeline_result.output_packets); Ok(()) } From dc3be339175fc286953e7e87d116ca53550fea00 Mon Sep 17 00:00:00 2001 From: Synicix Date: Mon, 21 Jul 2025 23:49:26 +0000 Subject: [PATCH 20/29] Remove bincode and switch to json. Fix a few joining error --- Cargo.toml | 1 - src/uniffi/pipeline_runner/runner.rs | 180 ++++++++------------------- tests/pipeline_runner.rs | 36 +++--- 3 files changed, 67 insertions(+), 150 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d70b1059..1340c5d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,6 @@ crate-type = ["rlib", "cdylib"] [dependencies] # make async fn in traits work with dyn traits async-trait = "0.1.88" -bincode = { version = "2.0.1", features = ["serde"] } # docker API in orchestrator bollard = "0.17.1" # datetime utilities diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index c43a5799..627c860d 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -7,10 +7,6 @@ use crate::{ }, }; use async_trait::async_trait; -use bincode::{ - config, de, - serde::{decode_from_slice, encode_to_vec}, -}; use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use serde_yaml::Serializer; @@ -20,6 +16,7 @@ use std::{ fmt::{Display, Formatter, Result as FmtResult}, hash::{Hash, Hasher}, path::PathBuf, + result, sync::Arc, }; use tokio::{ @@ -183,35 +180,13 @@ impl DockerPipelineRunner { let input_node_key_exp = format!("{pipeline_job_hash}/{INPUT_KEY_EXP}"); for packet in &pipeline_run.pipeline_job.input_packets { // Send the packet to the input node key_exp - let payload_encoded = encode_to_vec( - PathSet::Unary(Blob { - kind: BlobKind::File, - location: URI { - namespace: "asdfasdf".to_owned(), - path: "asdfasdf".into(), - }, - checksum: "".to_owned(), - }), - config::standard(), - )?; - - let (decoded_packet, _): (PathSet, usize) = - decode_from_slice(&payload_encoded, config::standard())?; - println!("decoded packet: {:?}", decoded_packet); - println!( - "Payload bytes: {:?}", - encode_to_vec( - NodeOutput::Packet("input_node".to_owned(), packet.clone()), - config::standard(), - )? - ); session .put( &input_node_key_exp, - encode_to_vec( - NodeOutput::Packet("input_node".to_owned(), packet.clone()), - config::standard(), - )?, + serde_json::to_string(&NodeOutput::Packet( + "input_node".to_owned(), + packet.clone(), + ))?, ) .await .context(selector::AgentCommunicationFailure {})?; @@ -221,10 +196,7 @@ impl DockerPipelineRunner { session .put( input_node_key_exp, - encode_to_vec( - NodeOutput::ProcessingCompleted("input_node".to_owned()), - config::standard(), - )?, + serde_json::to_string(&NodeOutput::ProcessingCompleted("input_node".to_owned()))?, ) .await .context(selector::AgentCommunicationFailure {})?; @@ -233,11 +205,6 @@ impl DockerPipelineRunner { self.pipeline_runs .insert(pipeline_job_hash.clone(), pipeline_run); - println!( - "Pipeline run started with id: {} and hash: {}", - pipeline_job_hash, pipeline_job_hash - ); - Ok(pipeline_job_hash) } @@ -255,12 +222,8 @@ impl DockerPipelineRunner { key: pipeline_run_id.to_owned(), })?; - println!("len of node_tasks: {}", pipeline_run.node_tasks.len()); - println!("Join set {:?}", pipeline_run.node_tasks); // Wait for all the tasks to complete while let Some(result) = pipeline_run.node_tasks.join_next().await { - println!("Join set {:?}", pipeline_run.node_tasks); - println!("Task completed, result: {:?}", result); match result { Ok(Ok(())) => {} // Task completed successfully Ok(Err(err)) => { @@ -272,8 +235,6 @@ impl DockerPipelineRunner { return Err(err.into()); } } - pipeline_run.node_tasks.abort_all(); - panic!(); } Ok(PipelineResult { @@ -287,7 +248,6 @@ impl DockerPipelineRunner { /// Will error out if the pipeline run is not found or if any of the tasks fail to stop correctly pub async fn stop(&mut self, pipeline_run_id: &str) -> Result<()> { // To stop the pipeline run, we need to send a stop message to all the tasks - // Get the pipeline run first let pipeline_run = self.pipeline_runs @@ -330,14 +290,10 @@ impl DockerPipelineRunner { while let Ok(payload) = subscriber.recv_async().await { // Extract the message from the payload - let (msg, _): (NodeOutput, usize) = - decode_from_slice(&payload.payload().to_bytes(), config::standard())?; + let msg: NodeOutput = serde_json::from_slice(&payload.payload().to_bytes())?; match msg { NodeOutput::Packet(sender_id, hash_map) => { - // Optionally, you can log or print the output packet - println!("Captured output from node {}: {:?}", sender_id, hash_map); - // Store the output packet in the outputs map let mut outputs_lock = outputs.write().await; outputs_lock @@ -351,10 +307,6 @@ impl DockerPipelineRunner { } } } - - // Print exit message - println!("Capture task for node {} completed.", node_id); - Ok(()) } @@ -392,7 +344,7 @@ impl DockerPipelineRunner { }, )); - // Create a joinset to spawn and handle incoming messages tasks + // Create a join set to spawn and handle incoming messages tasks let mut listener_tasks = JoinSet::new(); // Create the list of key_expressions to subscribe to @@ -429,10 +381,12 @@ impl DockerPipelineRunner { )); } - // Create the task to handle stop request - listener_tasks.spawn(Self::start_stop_request_task( + // Create the listener task for the stop request + let mut stop_listener_task = JoinSet::new(); + + stop_listener_task.spawn(Self::start_stop_request_task( Arc::clone(&node_processor), - pipeline_job_id.clone(), + format!("{pipeline_job_id}/{}/stop", node.id), Arc::clone(&session), )); @@ -469,6 +423,9 @@ impl DockerPipelineRunner { // Wait for all task to complete listener_tasks.join_all().await; + // Abort the stop listener task since we don't need it anymore + stop_listener_task.abort_all(); + Ok(()) } @@ -493,27 +450,8 @@ impl DockerPipelineRunner { while let Ok(payload) = subscriber.recv_async().await { // Extract the message from the payload - println!( - "Received message for node {}: {:?}", - node_id, - payload.payload().to_bytes() - ); - - let (msg, _): (NodeOutput, usize) = - match decode_from_slice(&payload.payload().to_bytes(), config::standard()) { - Ok(msg) => msg, - Err(err) => { - eprintln!("Failed to decode message: {err}"); - panic!("Failed to decode message: {err}"); - } - }; - - match msg { + match serde_json::from_slice(&payload.payload().to_bytes())? { NodeOutput::Packet(sender_id, hash_map) => { - println!( - "Received packet from {} for node {}: {:?}", - sender_id, node_id, hash_map - ); // Process the packet using the node processor node_processor.lock().await.process_packet( &sender_id, @@ -527,10 +465,6 @@ impl DockerPipelineRunner { } NodeOutput::ProcessingCompleted(sender_id) => { // Notify the processor that the parent node has completed processing - println!( - "Received processing completed message for node {}", - sender_id - ); if node_processor .lock() .await @@ -543,15 +477,14 @@ impl DockerPipelineRunner { session .put( output_key_exp, - encode_to_vec( - NodeOutput::ProcessingCompleted(node_id.clone()), - config::standard(), - )?, + serde_json::to_string(&NodeOutput::ProcessingCompleted( + node_id.clone(), + ))?, ) .await .context(selector::AgentCommunicationFailure {})?; - break; } + break; } } } @@ -691,36 +624,29 @@ impl NodeProcessor for PodProcessor { let node_id_clone = node_id.to_owned(); let output_key_exp_clone = output_key_exp.to_owned(); self.processing_tasks.spawn(async move { - println!( - "Simulating Executing pod job: {} with pod hash: {}", - pod_job.hash, pod_job.pod.hash - ); - // For now we will just send the input_packet to the success channel session .put( - output_key_exp_clone + SUCCESS_KEY_EXP, - encode_to_vec( - NodeOutput::Packet(node_id_clone, output_packet), - config::standard(), - )?, + output_key_exp_clone + "/" + SUCCESS_KEY_EXP, + serde_json::to_string(&NodeOutput::Packet(node_id_clone, output_packet))?, ) .await .context(selector::AgentCommunicationFailure {})?; Ok(()) }); - - println!("Successfully started processor for node: {}", node_id); Ok(()) } async fn mark_parent_as_complete(&mut self, _parent_node_id: &str) -> bool { // For pod we only have one parent, thus execute the exit case - while (self.processing_tasks.join_next().await).is_some() { - // Wait for all tasks to complete + while let Some(result) = self.processing_tasks.join_next().await { + match result { + Ok(Ok(())) => {} + Ok(Err(err)) => {} + Err(err) => {} + } } - true } @@ -776,11 +702,11 @@ impl NodeProcessor for MapperProcessor { // Send the packet outwards session .put( - output_key_exp_clone.clone() + SUCCESS_KEY_EXP, - encode_to_vec( - NodeOutput::Packet(node_id_clone.clone(), output_map), - config::standard(), - )?, + format!("{}/{}", output_key_exp_clone, SUCCESS_KEY_EXP), + &serde_json::to_string(&NodeOutput::Packet( + node_id_clone.clone(), + output_map, + ))?, ) .await .context(selector::AgentCommunicationFailure {})?; @@ -791,19 +717,15 @@ impl NodeProcessor for MapperProcessor { // If there was an error, we send it to the failure channel session .put( - output_key_exp_clone + FAILURE_KEY_EXP, - encode_to_vec( - &ProcessingFailure { - node_id: node_id_clone, - error: err.to_string(), - }, - config::standard(), - )?, + format!("{}/{}", output_key_exp_clone, FAILURE_KEY_EXP), + serde_json::to_string(&ProcessingFailure { + node_id: node_id_clone.clone(), + error: err.to_string(), + })?, ) .await .context(selector::AgentCommunicationFailure {})?; } - Ok(()) }); Ok(()) @@ -906,17 +828,16 @@ impl NodeProcessor for JoinerProcessor { self.processing_tasks.spawn(async move { // Convert Vec>> to Vec<&Vec>> for compute_cartesian_product let cartesian_product = Self::compute_cartesian_product(&factors); - // Post all products to the output channel for output_packet in cartesian_product { let result = { session .put( - output_key_exp_clone.clone() + SUCCESS_KEY_EXP, - encode_to_vec( - NodeOutput::Packet(node_id_clone.clone(), output_packet), - config::standard(), - )?, + format!("{}/{}", output_key_exp_clone, SUCCESS_KEY_EXP), + serde_json::to_string(&NodeOutput::Packet( + node_id_clone.clone(), + output_packet, + ))?, ) .await .context(selector::AgentCommunicationFailure {})?; @@ -927,14 +848,11 @@ impl NodeProcessor for JoinerProcessor { if let Err(err) = result { session .put( - output_key_exp_clone.clone() + FAILURE_KEY_EXP, - encode_to_vec( - &ProcessingFailure { - node_id: node_id_clone.clone(), - error: err.to_string(), - }, - config::standard(), - )?, + format!("{}/{}", output_key_exp_clone, FAILURE_KEY_EXP), + serde_json::to_string(&ProcessingFailure { + node_id: node_id_clone.clone(), + error: err.to_string(), + })?, ) .await .context(selector::AgentCommunicationFailure {})?; diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 5492c13a..37cb7c0e 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -24,7 +24,7 @@ async fn basic_run() -> Result<()> { tokio::spawn({ async move { - let sub = zenoh.declare_subscriber("**").await.unwrap(); + let sub = zenoh.declare_subscriber("**/failure").await.unwrap(); // Receive loop ready, publish ready message println!("Listening for messages..."); loop { @@ -64,25 +64,25 @@ async fn basic_run() -> Result<()> { Ok(()) } -// #[tokio::test(flavor = "multi_thread", worker_threads = 4)] -// async fn stop() -> Result<()> { -// let pipeline_job = pipeline_job()?; +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn stop() -> Result<()> { + let pipeline_job = pipeline_job()?; -// // Create the runner -// let mut runner = DockerPipelineRunner::new(); + // Create the runner + let mut runner = DockerPipelineRunner::new(); -// let test_dirs = TestDirs::new(&HashMap::from([( -// "default".to_owned(), -// Some("./tests/extra/data/"), -// )]))?; -// let namespace_lookup = test_dirs.namespace_lookup(); + let test_dirs = TestDirs::new(&HashMap::from([( + "default".to_owned(), + Some("./tests/extra/data/"), + )]))?; + let namespace_lookup = test_dirs.namespace_lookup(); -// let pipeline_run = runner -// .start(pipeline_job, "default", &namespace_lookup) -// .await?; + let pipeline_run = runner + .start(pipeline_job, "default", &namespace_lookup) + .await?; -// // Abort the pipeline run -// runner.stop(&pipeline_run).await?; + // Abort the pipeline run + runner.stop(&pipeline_run).await?; -// Ok(()) -// } + Ok(()) +} From 83c9eca79e373b6ae16c0b805b48dc5b0fda420d Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 23 Jul 2025 02:41:16 +0000 Subject: [PATCH 21/29] Add group and host name --- Cargo.toml | 1 + src/core/error.rs | 23 ------ src/uniffi/error.rs | 12 --- src/uniffi/pipeline_runner/runner.rs | 117 +++++++++++++++------------ tests/pipeline_runner.rs | 48 ++++------- 5 files changed, 83 insertions(+), 118 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1340c5d7..171236d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ getset = { version = "0.1.5", git = "https://github.com/guzman-raphael/getset.gi glob = "0.3.1" # strings to snake_case heck = "0.5.0" +hostname = "0.4.1" # hashmaps that preserve insertion order indexmap = { version = "2.9.0", features = ["serde"] } itertools = "0.14.0" diff --git a/src/core/error.rs b/src/core/error.rs index ada0b0e2..cffb538c 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -1,5 +1,4 @@ use crate::uniffi::error::{Kind, OrcaError}; -use bincode::error::{DecodeError, EncodeError}; use bollard::errors::Error as BollardError; use glob; use serde_json; @@ -23,26 +22,6 @@ impl From for OrcaError { } } } -impl From for OrcaError { - fn from(error: DecodeError) -> Self { - Self { - kind: Kind::DecodeError { - source: error, - backtrace: Some(Backtrace::capture()), - }, - } - } -} -impl From for OrcaError { - fn from(error: EncodeError) -> Self { - Self { - kind: Kind::EncodingError { - source: error, - backtrace: Some(Backtrace::capture()), - }, - } - } -} impl From for OrcaError { fn from(error: oneshot::error::RecvError) -> Self { Self { @@ -147,8 +126,6 @@ impl fmt::Debug for OrcaError { | Kind::NoTagFoundInContainerAltImage { backtrace, .. } | Kind::BollardError { backtrace, .. } | Kind::ChannelReceiveError { backtrace, .. } - | Kind::DecodeError { backtrace, .. } - | Kind::EncodingError { backtrace, .. } | Kind::GlobPatternError { backtrace, .. } | Kind::IoError { backtrace, .. } | Kind::PathPrefixError { backtrace, .. } diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index c932570e..e94159f4 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -2,8 +2,6 @@ clippy::field_scoped_visibility_modifiers, reason = "Needed since SNAFU dynamically generating selectors." )] - -use bincode::error::{DecodeError, EncodeError}; use bollard::errors::Error as BollardError; use glob; use serde_json; @@ -108,16 +106,6 @@ pub(crate) enum Kind { backtrace: Option, }, #[snafu(transparent)] - DecodeError { - source: DecodeError, - backtrace: Option, - }, - #[snafu(transparent)] - EncodingError { - source: EncodeError, - backtrace: Option, - }, - #[snafu(transparent)] GlobPatternError { source: glob::PatternError, backtrace: Option, diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 627c860d..b4452242 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -2,8 +2,8 @@ use crate::{ core::{crypto::hash_buffer, model::serialize_hashmap, util::get}, uniffi::{ error::{OrcaError, Result, selector}, - model::{Blob, BlobKind, PathSet, Pod, PodJob, URI}, - pipeline::{Kernel, Mapper, Node, Pipeline, PipelineJob, PipelineResult}, + model::{PathSet, Pod, PodJob, URI}, + pipeline::{self, Kernel, Mapper, Node, Pipeline, PipelineJob, PipelineResult}, }, }; use async_trait::async_trait; @@ -16,7 +16,6 @@ use std::{ fmt::{Display, Formatter, Result as FmtResult}, hash::{Hash, Hasher}, path::PathBuf, - result, sync::Arc, }; use tokio::{ @@ -84,6 +83,10 @@ impl Display for PipelineRun { */ #[derive(Default)] pub struct DockerPipelineRunner { + /// User label on which group of agents this runner is associated with + pub group: String, + /// The host name of the runner + pub host: String, pipeline_runs: HashMap, } @@ -97,8 +100,14 @@ pub struct DockerPipelineRunner { */ impl DockerPipelineRunner { /// Create a new Docker pipeline runner - pub fn new() -> Self { - Self::default() + /// # Errors + /// Will error out if the environment variable `HOSTNAME` is not set + pub fn new(group: String) -> Result { + Ok(Self { + group, + host: hostname::get()?.to_string_lossy().to_string(), + pipeline_runs: HashMap::new(), + }) } /// # Errors @@ -116,8 +125,8 @@ impl DockerPipelineRunner { node_tasks: JoinSet::new(), }; - // Get the pipeline_job_hash which will be use to identify the pipeline run - let pipeline_job_hash = pipeline_run.pipeline_job.hash.clone(); + // The id for the pipeline_run is the pipeline_job hash + let pipeline_run_id = pipeline_run.pipeline_job.hash.clone(); let graph = &pipeline_run.pipeline_job.pipeline.graph; @@ -129,7 +138,7 @@ impl DockerPipelineRunner { ); let subscriber = session - .declare_subscriber(format!("{pipeline_job_hash}/*/status/ready")) + .declare_subscriber(self.get_base_key_exp(&pipeline_run_id) + "/*/status/ready") .await .context(selector::AgentCommunicationFailure {})?; @@ -143,7 +152,7 @@ impl DockerPipelineRunner { .spawn(Self::spawn_node_processing_task( node.clone(), pipeline_run.pipeline_job.pipeline.clone(), - pipeline_job_hash.clone(), + self.get_base_key_exp(&pipeline_run_id), namespace.to_owned(), namespace_lookup.clone(), Arc::clone(&session), @@ -157,9 +166,14 @@ impl DockerPipelineRunner { .node_tasks .spawn(Self::create_capture_task_for_node( node.id.clone(), - pipeline_run.pipeline_job.hash.clone(), Arc::clone(&pipeline_run.outputs), Arc::clone(&session), + format!( + "{}/{}/outputs/{}", + self.get_base_key_exp(&pipeline_run_id), + node.id, + SUCCESS_KEY_EXP, + ), )); } @@ -177,7 +191,11 @@ impl DockerPipelineRunner { } // Submit the input_packets to the correct key_exp - let input_node_key_exp = format!("{pipeline_job_hash}/{INPUT_KEY_EXP}"); + let input_node_key_exp = format!( + "{}/{}", + self.get_base_key_exp(&pipeline_run_id), + INPUT_KEY_EXP, + ); for packet in &pipeline_run.pipeline_job.input_packets { // Send the packet to the input node key_exp session @@ -203,9 +221,9 @@ impl DockerPipelineRunner { // Insert into the list of pipeline runs self.pipeline_runs - .insert(pipeline_job_hash.clone(), pipeline_run); + .insert(pipeline_run_id.clone(), pipeline_run); - Ok(pipeline_job_hash) + Ok(pipeline_run_id) } /// Given a pipeline run, wait for all its tasks to complete and return the `PipelineResult` @@ -247,6 +265,11 @@ impl DockerPipelineRunner { /// # Errors /// Will error out if the pipeline run is not found or if any of the tasks fail to stop correctly pub async fn stop(&mut self, pipeline_run_id: &str) -> Result<()> { + let stop_key_exp = format!( + "{}/{}/stop", + self.get_base_key_exp(pipeline_run_id), + pipeline_run_id + ); // To stop the pipeline run, we need to send a stop message to all the tasks // Get the pipeline run first let pipeline_run = @@ -262,10 +285,7 @@ impl DockerPipelineRunner { // Send the stop message into the stop key_exp, the msg is just an empty vector session - .put( - format!("{}/stop", pipeline_run.pipeline_job.hash), - Vec::new(), - ) + .put(stop_key_exp, Vec::new()) .await .context(selector::AgentCommunicationFailure {})?; @@ -276,15 +296,13 @@ impl DockerPipelineRunner { #[expect(clippy::type_complexity, reason = "Needed for async")] async fn create_capture_task_for_node( node_id: String, - pipeline_run_id: String, outputs: Arc>>>>, session: Arc, + key_exp_to_sub: String, ) -> Result<()> { // Create a zenoh session let subscriber = session - .declare_subscriber(format!( - "{pipeline_run_id}/{node_id}/outputs/{SUCCESS_KEY_EXP}" - )) + .declare_subscriber(key_exp_to_sub) .await .context(selector::AgentCommunicationFailure {})?; @@ -293,7 +311,7 @@ impl DockerPipelineRunner { let msg: NodeOutput = serde_json::from_slice(&payload.payload().to_bytes())?; match msg { - NodeOutput::Packet(sender_id, hash_map) => { + NodeOutput::Packet(_, hash_map) => { // Store the output packet in the outputs map let mut outputs_lock = outputs.write().await; outputs_lock @@ -323,7 +341,7 @@ impl DockerPipelineRunner { async fn spawn_node_processing_task( node: Node, pipeline: Pipeline, - pipeline_job_id: String, + base_key_exp: String, namespace: String, namespace_lookup: HashMap, session: Arc, @@ -352,7 +370,7 @@ impl DockerPipelineRunner { .get_parents_for_node(&node) .map(|parent_node| { format!( - "{pipeline_job_id}/{}/outputs/{SUCCESS_KEY_EXP}", + "{base_key_exp}/{}/outputs/{SUCCESS_KEY_EXP}", parent_node.id ) }) @@ -360,7 +378,7 @@ impl DockerPipelineRunner { // If there was no parent node, then this is root node, therefore we need to subscribe to the input node if key_exps_to_subscribe_to.is_empty() { - key_exps_to_subscribe_to.push(format!("{pipeline_job_id}/{INPUT_KEY_EXP}")); + key_exps_to_subscribe_to.push(format!("{base_key_exp}/{INPUT_KEY_EXP}")); } // Create a subscriber for each of the parent nodes (Should only be 1, unless it is a joiner node) @@ -374,7 +392,7 @@ impl DockerPipelineRunner { subscriber, Arc::clone(&node_processor), node.id.clone(), - pipeline_job_id.clone(), + base_key_exp.clone(), namespace.clone(), namespace_lookup.clone(), Arc::clone(&session), @@ -386,26 +404,26 @@ impl DockerPipelineRunner { stop_listener_task.spawn(Self::start_stop_request_task( Arc::clone(&node_processor), - format!("{pipeline_job_id}/{}/stop", node.id), + format!("{base_key_exp}/{}/stop", node.id), Arc::clone(&session), )); // Wait for all tasks to be spawned and reply with ready message // This is to ensure that the pipeline run knows when all tasks are ready to receive inputs - let mut num_of_ready_subcribers: usize = 0; + let mut num_of_ready_subscribers: usize = 0; // Build the subscriber let status_subscriber = session .declare_subscriber(format!( - "{pipeline_job_id}/{}/subscriber/status/ready", + "{base_key_exp}/{}/subscriber/status/ready", node.id )) .await .context(selector::AgentCommunicationFailure {})?; while status_subscriber.recv_async().await.is_ok() { - num_of_ready_subcribers += 1; - if num_of_ready_subcribers == key_exps_to_subscribe_to.len() { + num_of_ready_subscribers += 1; + if num_of_ready_subscribers == key_exps_to_subscribe_to.len() { // +1 for the stop request task break; // All tasks are ready, we can start sending inputs } @@ -413,10 +431,7 @@ impl DockerPipelineRunner { // Send a ready message so the pipeline knows when to start sending inputs session - .put( - format!("{pipeline_job_id}/{}/status/ready", node.id), - &node.id, - ) + .put(format!("{base_key_exp}/{}/status/ready", node.id), &node.id) .await .context(selector::AgentCommunicationFailure {})?; @@ -433,7 +448,7 @@ impl DockerPipelineRunner { subscriber: Subscriber>, node_processor: Arc>>, node_id: String, - pipeline_job_id: String, + base_key_exp: String, namespace: String, namespace_lookup: HashMap, session: Arc, @@ -442,7 +457,7 @@ impl DockerPipelineRunner { // back to our spawner task session .put( - format!("{pipeline_job_id}/{node_id}/subscriber/status/ready"), + format!("{base_key_exp}/{node_id}/subscriber/status/ready"), &node_id, ) .await @@ -458,7 +473,7 @@ impl DockerPipelineRunner { &node_id, &hash_map, Arc::clone(&session), - &format!("{}/{}/outputs", pipeline_job_id, node_id.clone()), + &format!("{base_key_exp}/{}/outputs", node_id.clone()), &namespace, &namespace_lookup, )?; @@ -473,7 +488,7 @@ impl DockerPipelineRunner { { // This was the last parent, thus we need to send the processing complete message let output_key_exp = - format!("{pipeline_job_id}/{node_id}/outputs/{SUCCESS_KEY_EXP}"); + format!("{base_key_exp}/{node_id}/outputs/{SUCCESS_KEY_EXP}"); session .put( output_key_exp, @@ -494,11 +509,11 @@ impl DockerPipelineRunner { async fn start_stop_request_task( node_processor: Arc>>, - pipeline_run_id: String, + base_key_exp: String, session: Arc, ) -> Result<()> { let subscriber = session - .declare_subscriber(pipeline_run_id.clone() + "/stop") + .declare_subscriber(format!("{base_key_exp}/stop")) .await .context(selector::AgentCommunicationFailure {})?; while subscriber.recv_async().await.is_ok() { @@ -507,6 +522,10 @@ impl DockerPipelineRunner { } Ok::<(), OrcaError>(()) } + + fn get_base_key_exp(&self, pipeline_run_id: &str) -> String { + format!("{}/{}/{}", self.group, self.host, pipeline_run_id) + } } /// Unify the interface for node processors and provide a common way to handle processing of incoming messages @@ -640,13 +659,7 @@ impl NodeProcessor for PodProcessor { async fn mark_parent_as_complete(&mut self, _parent_node_id: &str) -> bool { // For pod we only have one parent, thus execute the exit case - while let Some(result) = self.processing_tasks.join_next().await { - match result { - Ok(Ok(())) => {} - Ok(Err(err)) => {} - Err(err) => {} - } - } + while let Some(result) = self.processing_tasks.join_next().await {} true } @@ -702,7 +715,7 @@ impl NodeProcessor for MapperProcessor { // Send the packet outwards session .put( - format!("{}/{}", output_key_exp_clone, SUCCESS_KEY_EXP), + format!("{output_key_exp_clone}/{SUCCESS_KEY_EXP}"), &serde_json::to_string(&NodeOutput::Packet( node_id_clone.clone(), output_map, @@ -717,7 +730,7 @@ impl NodeProcessor for MapperProcessor { // If there was an error, we send it to the failure channel session .put( - format!("{}/{}", output_key_exp_clone, FAILURE_KEY_EXP), + format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), serde_json::to_string(&ProcessingFailure { node_id: node_id_clone.clone(), error: err.to_string(), @@ -833,7 +846,7 @@ impl NodeProcessor for JoinerProcessor { let result = { session .put( - format!("{}/{}", output_key_exp_clone, SUCCESS_KEY_EXP), + format!("{output_key_exp_clone}/{SUCCESS_KEY_EXP}"), serde_json::to_string(&NodeOutput::Packet( node_id_clone.clone(), output_packet, @@ -848,7 +861,7 @@ impl NodeProcessor for JoinerProcessor { if let Err(err) = result { session .put( - format!("{}/{}", output_key_exp_clone, FAILURE_KEY_EXP), + format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), serde_json::to_string(&ProcessingFailure { node_id: node_id_clone.clone(), error: err.to_string(), diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 37cb7c0e..1dd90f58 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -1,4 +1,4 @@ -#![expect(missing_docs, reason = "OK in tests.")] +#![expect(missing_docs, clippy::panic_in_result_fn, reason = "OK in tests.")] // If 'fixture' is a local module, ensure there is a 'mod fixture;' statement or a 'fixture.rs' file in the same directory or in 'tests/'. // If 'fixture' is an external crate, add it to Cargo.toml and import as shown below. @@ -9,8 +9,6 @@ pub mod fixture; use std::collections::HashMap; use orcapod::uniffi::{error::Result, pipeline_runner::runner::DockerPipelineRunner}; -use snafu::ResultExt; -use tokio::{task::JoinSet, time::sleep}; use crate::fixture::TestDirs; use fixture::pipeline_job; @@ -19,33 +17,8 @@ use fixture::pipeline_job; async fn basic_run() -> Result<()> { let pipeline_job = pipeline_job()?; - // Create zenoh to monitor the node ready message - let zenoh = zenoh::open(zenoh::Config::default()).await.unwrap(); // Replace with the correct error variant if needed - - tokio::spawn({ - async move { - let sub = zenoh.declare_subscriber("**/failure").await.unwrap(); - // Receive loop ready, publish ready message - println!("Listening for messages..."); - loop { - match sub.recv_async().await { - Ok(msg) => { - println!( - "Received message: {:?}", - msg.payload().try_to_string().unwrap() - ); - } - Err(err) => { - println!("Error receiving message: {}", err); - break; - } - } - } - } - }); - // Create the runner - let mut runner = DockerPipelineRunner::new(); + let mut runner = DockerPipelineRunner::new("test".to_owned())?; let test_dirs = TestDirs::new(&HashMap::from([( "default".to_owned(), @@ -59,17 +32,22 @@ async fn basic_run() -> Result<()> { // Wait for the pipeline run to complete let pipeline_result = runner.get_result(&pipeline_run).await?; + + assert!( + pipeline_result.output_packets.len() == 1, + "Expected exactly one output packet." + ); println!("{:?}", pipeline_result.output_packets); Ok(()) } -#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn stop() -> Result<()> { let pipeline_job = pipeline_job()?; // Create the runner - let mut runner = DockerPipelineRunner::new(); + let mut runner = DockerPipelineRunner::new("test".to_owned())?; let test_dirs = TestDirs::new(&HashMap::from([( "default".to_owned(), @@ -86,3 +64,11 @@ async fn stop() -> Result<()> { Ok(()) } + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn check_group_and_host() -> Result<()> { + let runner = DockerPipelineRunner::new("test".to_owned())?; + assert_eq!(runner.group, "test"); + assert!(!runner.host.is_empty(), "Host should not be empty"); + Ok(()) +} From 1df8295e1d012ac2fdb1c30f225dbdd2a2785ffe Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 23 Jul 2025 10:14:02 +0000 Subject: [PATCH 22/29] fix unit test --- src/uniffi/pipeline_runner/runner.rs | 346 ++++++++++++++++++--------- 1 file changed, 229 insertions(+), 117 deletions(-) diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index b4452242..37122c4a 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -3,12 +3,13 @@ use crate::{ uniffi::{ error::{OrcaError, Result, selector}, model::{PathSet, Pod, PodJob, URI}, - pipeline::{self, Kernel, Mapper, Node, Pipeline, PipelineJob, PipelineResult}, + pipeline::{Kernel, Mapper, Node, Pipeline, PipelineJob, PipelineResult}, }, }; use async_trait::async_trait; +use derive_more::derive; use itertools::Itertools as _; -use serde::{Deserialize, Serialize}; +use serde::{Deserialize, Serialize, de::value}; use serde_yaml::Serializer; use snafu::{OptionExt as _, ResultExt as _}; use std::{ @@ -463,20 +464,31 @@ impl DockerPipelineRunner { .await .context(selector::AgentCommunicationFailure {})?; + let node_base_output_key_exp = format!("{base_key_exp}/{node_id}/outputs"); while let Ok(payload) = subscriber.recv_async().await { // Extract the message from the payload match serde_json::from_slice(&payload.payload().to_bytes())? { NodeOutput::Packet(sender_id, hash_map) => { // Process the packet using the node processor - node_processor.lock().await.process_packet( + let result = node_processor.lock().await.process_packet( &sender_id, &node_id, &hash_map, Arc::clone(&session), - &format!("{base_key_exp}/{}/outputs", node_id.clone()), + &node_base_output_key_exp, &namespace, &namespace_lookup, - )?; + ); + + if let Err(err) = result { + try_to_forward_err_msg( + Arc::clone(&session), + err, + &node_base_output_key_exp, + &node_id, + ) + .await; + } } NodeOutput::ProcessingCompleted(sender_id) => { // Notify the processor that the parent node has completed processing @@ -517,7 +529,7 @@ impl DockerPipelineRunner { .await .context(selector::AgentCommunicationFailure {})?; while subscriber.recv_async().await.is_ok() { - // Received a requst to stop, therefore we need to tell the node_processor to shutdown + // Received a request to stop, therefore we need to tell the node_processor to shutdown node_processor.lock().await.stop(); } Ok::<(), OrcaError>(()) @@ -541,7 +553,7 @@ trait NodeProcessor: Send + Sync { node_id: &str, packet: &HashMap, session: Arc, - output_key_exp: &str, + base_output_key_exp: &str, namespace: &str, namespace_lookup: &HashMap, ) -> Result<()>; @@ -562,11 +574,40 @@ trait NodeProcessor: Send + Sync { fn stop(&mut self); } +/// Util function to handle forwarding error messages to the failure channel +async fn try_to_forward_err_msg( + session: Arc, + err: OrcaError, + node_base_output_key_exp: &str, + node_id: &str, +) { + match async { + session + .put( + format!("{node_base_output_key_exp}/{FAILURE_KEY_EXP}"), + serde_json::to_string(&ProcessingFailure { + node_id: node_id.to_owned(), + error: err.to_string(), + })?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + Ok::<(), OrcaError>(()) + } + .await + { + Ok(()) => {} + Err(send_err) => { + eprintln!("Failed to send failure message: {send_err}"); + } + } +} + /// Processor for Pods /// Currently missing implementation to call agents for actual pod processing struct PodProcessor { pod: Arc, - processing_tasks: JoinSet>, + processing_tasks: JoinSet<()>, } impl PodProcessor { @@ -591,13 +632,10 @@ impl NodeProcessor for PodProcessor { node_id: &str, packet: &HashMap, session: Arc, - output_key_exp: &str, + base_output_key_exp: &str, namespace: &str, namespace_lookup: &HashMap, ) -> Result<()> { - // Process the packet using the pod - // Create the pod_job - // We need a unique hash for this given input packet process by the node // therefore we need to generate a hash that has the pod_id + input_packet let node_id_bytes = node_id.as_bytes().to_vec(); @@ -641,25 +679,42 @@ impl NodeProcessor for PodProcessor { .collect::>(); let node_id_clone = node_id.to_owned(); - let output_key_exp_clone = output_key_exp.to_owned(); + let output_key_exp_clone = base_output_key_exp.to_owned(); self.processing_tasks.spawn(async move { // For now we will just send the input_packet to the success channel - session - .put( - output_key_exp_clone + "/" + SUCCESS_KEY_EXP, - serde_json::to_string(&NodeOutput::Packet(node_id_clone, output_packet))?, - ) - .await - .context(selector::AgentCommunicationFailure {})?; + let results = async { + session + .put( + output_key_exp_clone.clone() + "/" + SUCCESS_KEY_EXP, + serde_json::to_string(&NodeOutput::Packet( + node_id_clone.clone(), + output_packet, + ))?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + Ok::<(), OrcaError>(()) + }; - Ok(()) + match results.await { + Ok(()) => {} + Err(err) => { + try_to_forward_err_msg( + session, + err, + &format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), + &node_id_clone, + ) + .await; + } + } }); Ok(()) } async fn mark_parent_as_complete(&mut self, _parent_node_id: &str) -> bool { // For pod we only have one parent, thus execute the exit case - while let Some(result) = self.processing_tasks.join_next().await {} + while self.processing_tasks.join_next().await.is_some() {} true } @@ -672,7 +727,7 @@ impl NodeProcessor for PodProcessor { /// This processor renames the `input_keys` from the input packet to the `output_keys` defined by the map struct MapperProcessor { mapper: Arc, - processing_tasks: JoinSet>, + processing_tasks: JoinSet<()>, } impl MapperProcessor { @@ -692,17 +747,17 @@ impl NodeProcessor for MapperProcessor { node_id: &str, packet: &HashMap, session: Arc, - output_key_exp: &str, + base_output_key_exp: &str, _namespace: &str, _namespace_lookup: &HashMap, ) -> Result<()> { let mapping = self.mapper.mapping.clone(); let packet_clone = packet.clone(); let node_id_clone = node_id.to_owned(); - let output_key_exp_clone = output_key_exp.to_owned(); + let output_key_exp_clone = base_output_key_exp.to_owned(); self.processing_tasks.spawn(async move { - let result = { + let result = async { // Apply the mapping to the input packet let output_map = mapping .iter() @@ -724,22 +779,18 @@ impl NodeProcessor for MapperProcessor { .await .context(selector::AgentCommunicationFailure {})?; Ok::<(), OrcaError>(()) - }; + } + .await; if let Err(err) = result { - // If there was an error, we send it to the failure channel - session - .put( - format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), - serde_json::to_string(&ProcessingFailure { - node_id: node_id_clone.clone(), - error: err.to_string(), - })?, - ) - .await - .context(selector::AgentCommunicationFailure {})?; + try_to_forward_err_msg( + session, + err, + &format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), + &node_id_clone, + ) + .await; } - Ok(()) }); Ok(()) } @@ -747,7 +798,7 @@ impl NodeProcessor for MapperProcessor { async fn mark_parent_as_complete(&mut self, _parent_node_id: &str) -> bool { // For mapper we only have one parent, thus execute the exit case while (self.processing_tasks.join_next().await).is_some() { - // Wait for all tasks to complete + // The only error that should be forwarded here is the failure to send the output packet } true @@ -761,11 +812,12 @@ impl NodeProcessor for MapperProcessor { /// Processor for Joiner nodes /// This processor combines packets from multiple parent nodes into a single output packet /// It uses a cartesian product to combine packets from different parents +#[derive(Debug)] struct JoinerProcessor { /// Cache for all packets received by the node input_packet_cache: HashMap>>, completed_parents: Vec, - processing_tasks: JoinSet>, + processing_tasks: JoinSet<()>, } impl JoinerProcessor { @@ -807,7 +859,7 @@ impl NodeProcessor for JoinerProcessor { node_id: &str, packet: &HashMap, session: Arc, - output_key_exp: &str, + base_output_key_exp: &str, _namespace: &str, _namespace_lookup: &HashMap, ) -> Result<()> { @@ -820,6 +872,7 @@ impl NodeProcessor for JoinerProcessor { // Check if we have all the other parents needed to compute the cartesian product if self.input_packet_cache.values().all(|v| !v.is_empty()) { + // Print we have all the parents // Get all the cached packets from other parents let other_parent_ids = self .input_packet_cache @@ -836,15 +889,16 @@ impl NodeProcessor for JoinerProcessor { // Compute the cartesian product of the factors let node_id_clone = node_id.to_owned(); - let output_key_exp_clone = output_key_exp.to_owned(); + let output_key_exp_clone = base_output_key_exp.to_owned(); self.processing_tasks.spawn(async move { // Convert Vec>> to Vec<&Vec>> for compute_cartesian_product let cartesian_product = Self::compute_cartesian_product(&factors); // Post all products to the output channel + let session_clone = Arc::clone(&session); for output_packet in cartesian_product { - let result = { - session + let result = async { + session_clone .put( format!("{output_key_exp_clone}/{SUCCESS_KEY_EXP}"), serde_json::to_string(&NodeOutput::Packet( @@ -855,24 +909,20 @@ impl NodeProcessor for JoinerProcessor { .await .context(selector::AgentCommunicationFailure {})?; Ok::<(), OrcaError>(()) - }; + } + .await; // If the result is an error, we will just send it to the error channel if let Err(err) = result { - session - .put( - format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), - serde_json::to_string(&ProcessingFailure { - node_id: node_id_clone.clone(), - error: err.to_string(), - })?, - ) - .await - .context(selector::AgentCommunicationFailure {})?; + try_to_forward_err_msg( + Arc::clone(&session_clone), + err, + &output_key_exp_clone, + &node_id_clone, + ) + .await; } } - - Ok(()) }); } Ok(()) @@ -904,70 +954,132 @@ impl NodeProcessor for JoinerProcessor { #[cfg(test)] #[tokio::test(flavor = "multi_thread", worker_threads = 2)] #[expect(clippy::panic_in_result_fn, reason = "Unit test")] +/// This test 3 cases for the joiner node: +/// The notation is as follows: (parent_id: data_file) +/// 1. Insufficient parents: It should not output anything until all parents has produce a packet (0: [A] 1: [A] 2: []) -> No output +/// 2. Sufficient parents: It should output a single packet with the cartesian product of the parents (0: [A] 1: [A] 2: [A]) -> Output: (0: A, 1: A, 2: A) +/// 3. Additional packet after initial condition is met: It should output a new packet with the cartesian product of the parents (0: [A] 1: [A] 2: [A, B]) -> Output: (0: A, 1: A, 2: B) +/// 4. Add an additional packet where more than 1 packet will be generated: (0: [A, B] 1: [A] 2: [A, B]) -> Output: (0: B, 1: A, 2: A), (0: B, 1: A, 2: B), async fn joiner() -> Result<()> { - // let parent_ids = vec!["0".to_owned(), "1".to_owned(), "2".to_owned()]; - - // let mut joiner_process = JoinerProcessor::new(parent_ids); - - // // Make each parent has 1 packet - // for idx in 0..2 { - // let packet = make_test_packet(format!("data_{idx}.txt").into()); - // joiner_process.process_packet(idx, "joiner", packet, session, output_key_exp, namespace, namespace_lookup); - // } - - // // Confirm that there should be no output yet - - // // Now we send the missing parent package - // // This will yield one unique combination - // joiner_process - // .process_packet("2", make_test_packet("data_1.txt".to_owned().into())) - // .await?; - - // // Confirm that the output is sent to the child channel - // assert!( - // child_rx.len() == 1, - // "Should have only one message in the channel", - // ); - // assert!( - // child_rx.recv().await.is_some(), - // "Should have received a message" - // ); - - // // Insert another one - // joiner_process - // .process_packet("2", make_test_packet("data_2.txt".to_owned().into())) - // .await?; - - // // The joiner node should send another one - // assert!( - // child_rx.len() == 1, - // "Should have only one message in the channel", - // ); - // assert!( - // child_rx.recv().await.is_some(), - // "Should have received a message" - // ); - - // // Now insert to packet for parent 0, which should yield 2 packets in total - // // This is because of the cartesian product - // joiner_process - // .process_packet("0", make_test_packet("data_2.txt".to_owned().into())) - // .await?; - - // assert!( - // child_rx.len() == 2, - // "Should have only two messages in the channel", - // ); - // assert!( - // child_rx.recv().await.is_some(), - // "Should have received a message" - // ); + use std::{thread::sleep, time::Duration}; + + let parent_ids = vec!["0".to_owned(), "1".to_owned(), "2".to_owned()]; + + let mut joiner_processor = JoinerProcessor::new(parent_ids); + let session = Arc::new( + zenoh::open(zenoh::Config::default()) + .await + .context(selector::AgentCommunicationFailure {})?, + ); + + let base_output_key_exp = "joiner_unit_test".to_owned(); + + // Create a buffer and a listener for the output channel + let success_msg = Arc::new(Mutex::new(Vec::new())); + let success_sub = session + .declare_subscriber(format!("{base_output_key_exp}/{SUCCESS_KEY_EXP}")) + .await + .context(selector::AgentCommunicationFailure {})?; + + // Create the async test to receive messages from the output channel + let mut listener_task = JoinSet::new(); + let success_msg_clone = Arc::clone(&success_msg); + listener_task.spawn(async move { + while let Ok(msg) = success_sub.recv() { + success_msg_clone.lock().await.push(msg); + } + }); + + // Make each parent has 1 packet + for idx in 0..2 { + let packet = make_test_packet(format!("key_{idx}"), "data_A.txt".to_string().into()); + joiner_processor.process_packet( + &format!("{idx}"), + &idx.to_string(), + &packet, + Arc::clone(&session), + &base_output_key_exp, + "", + &HashMap::new(), + )?; + } + + // Confirm that there should be no output yet + assert!( + success_msg.lock().await.is_empty(), + "Should have no messages in the channel", + ); + + // Now we send the missing parent package + // This will yield one unique combination + let packet_2_a = make_test_packet("key_2".to_owned(), "data_A.txt".to_owned().into()); + joiner_processor.process_packet( + "2", + "2", + &packet_2_a, + Arc::clone(&session), + &base_output_key_exp, + "", + &HashMap::new(), + )?; + + // Wait for the joiner to process and the listener to process the message + sleep(Duration::from_millis(100)); + + // Confirm that the output is sent to the child channel + assert_eq!( + success_msg.lock().await.len(), + 1, + "Should have only one message in the channel", + ); + + let packet_2_b = make_test_packet("key_2".to_owned(), "data_B.txt".to_owned().into()); + joiner_processor.process_packet( + "2", + "2", + &packet_2_b, + Arc::clone(&session), + &base_output_key_exp, + "", + &HashMap::new(), + )?; + + // Wait for the joiner to process and the listener to process the message + sleep(Duration::from_millis(100)); + + // The joiner node should send another one + assert_eq!( + success_msg.lock().await.len(), + 2, + "Should have only two messages in the channel", + ); + + let packet_0_b = make_test_packet("key_0".to_owned(), "data_B.txt".to_owned().into()); + joiner_processor.process_packet( + "0", + "0", + &packet_0_b, + Arc::clone(&session), + &base_output_key_exp, + "", + &HashMap::new(), + )?; + + // Wait for the joiner to process and the listener to process the message + sleep(Duration::from_millis(100)); + + // Should be a total of 6 messages in the channel + assert_eq!( + success_msg.lock().await.len(), + 4, + "Should have 4 messages in the channel", + ); Ok(()) } #[cfg(test)] -fn make_test_packet(path: PathBuf) -> HashMap { +fn make_test_packet(key: String, path: PathBuf) -> HashMap { use crate::uniffi::model::{Blob, BlobKind}; let path_set = PathSet::Unary(Blob { @@ -979,5 +1091,5 @@ fn make_test_packet(path: PathBuf) -> HashMap { checksum: String::new(), }); - HashMap::from([("key".to_owned(), path_set)]) + HashMap::from([(key, path_set)]) } From 4ed76a8810c2c08193d75c146332de9eb717d6d5 Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 23 Jul 2025 21:46:19 +0000 Subject: [PATCH 23/29] Readd gpu --- .devcontainer/devcontainer.json | 1 + 1 file changed, 1 insertion(+) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 95a1a1ae..2fe4edcb 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -21,6 +21,7 @@ }, "runArgs": [ "--name=${localWorkspaceFolderBasename}_devcontainer", + "--gpus=all", "--privileged", "--cgroupns=host" ], From f6bd697ff3a23c57128e42482ac06fa6d715a590 Mon Sep 17 00:00:00 2001 From: Synicix Date: Wed, 23 Jul 2025 22:07:06 +0000 Subject: [PATCH 24/29] Update comments --- src/uniffi/pipeline_runner/runner.rs | 51 ++++++++++++++++------------ tests/pipeline_runner.rs | 1 - 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 37122c4a..535b54d6 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -7,9 +7,8 @@ use crate::{ }, }; use async_trait::async_trait; -use derive_more::derive; use itertools::Itertools as _; -use serde::{Deserialize, Serialize, de::value}; +use serde::{Deserialize, Serialize}; use serde_yaml::Serializer; use snafu::{OptionExt as _, ResultExt as _}; use std::{ @@ -41,6 +40,8 @@ struct ProcessingFailure { error: String, } +/// Internal representation of a pipeline run, this should not be made public due to the fact that it contains +/// internal states and tasks #[expect( clippy::type_complexity, reason = "too complex, but necessary for async handling" @@ -72,16 +73,8 @@ impl Display for PipelineRun { write!(f, "PipelineRun({})", self.pipeline_job.hash) } } -/** - * Runner for pipelines - * - * General Algorithm: - * 1. All nodes receive inputs via a MPSC channel, where parents nodes will send their output packets - * 2. There are two "functional nodes processor" in the pipeline, - * which is the `input_node` and `output_node` - * 3. Each node will process the inputs its receives and will only send it children input channels - * if they are successfully processed. Failures are just printed for now (Will be replaced by logging) - */ + +/// Runner that uses a docker agent to run pipelines #[derive(Default)] pub struct DockerPipelineRunner { /// User label on which group of agents this runner is associated with @@ -91,14 +84,13 @@ pub struct DockerPipelineRunner { pipeline_runs: HashMap, } -/** - * This is an implementation of a pipeline runner that uses Zenoh to communicate between the tasks - * The runtime is tokio - * - * These are the key expressions of the components of the pipeline: - * - Input Node: `pipeline_job_hash/input_node/outputs` (This is where the `pipeline_job` packets get fed to) - * - Nodes: `pipeline_job_hash/node_id/outputs/(success|failure)` (This is where the node outputs are sent to) -*/ +/// This is an implementation of a pipeline runner that uses Zenoh to communicate between the tasks +/// The runtime is tokio +/// +/// These are the key expressions of the components of the pipeline: +/// Input Node: `pipeline_job_hash/input_node/outputs` (This is where the `pipeline_job` packets get fed to) +/// Nodes: `pipeline_job_hash/node_id/outputs/(success|failure)` (This is where the node outputs are sent to) +/// impl DockerPipelineRunner { /// Create a new Docker pipeline runner /// # Errors @@ -111,6 +103,12 @@ impl DockerPipelineRunner { }) } + /// Will start a new pipeline run with the given `PipelineJob` + /// This will start the async tasks for each node in the pipeline + /// including the one that captures the outputs from the leaf nodes + /// + /// Upon receiving the ready message from all the nodes, it will send the input packets to the input node + /// /// # Errors /// Will error out if the pipeline job fails to start pub async fn start( @@ -178,10 +176,10 @@ impl DockerPipelineRunner { )); } + // Wait for all nodes to be ready before sending inputs let num_of_nodes = graph.node_count(); let mut ready_nodes = 0; - // Wait for all nodes to be ready before sending inputs while (subscriber.recv_async().await).is_ok() { // Message is empty, just increment the counter ready_nodes += 1; @@ -224,6 +222,7 @@ impl DockerPipelineRunner { self.pipeline_runs .insert(pipeline_run_id.clone(), pipeline_run); + // Return the pipeline run id Ok(pipeline_run_id) } @@ -263,6 +262,9 @@ impl DockerPipelineRunner { } /// Stop the pipeline run and all its tasks + /// This will send a stop message to a channel that all node manager task are subscribed to. + /// Upon receiving the stop message, each node manager will force abort all of its task and exit. + /// /// # Errors /// Will error out if the pipeline run is not found or if any of the tasks fail to stop correctly pub async fn stop(&mut self, pipeline_run_id: &str) -> Result<()> { @@ -294,6 +296,7 @@ impl DockerPipelineRunner { Ok(()) } + /// This will capture the outputs of the given nodes and store it in the `outputs` map #[expect(clippy::type_complexity, reason = "Needed for async")] async fn create_capture_task_for_node( node_id: String, @@ -335,6 +338,7 @@ impl DockerPipelineRunner { /// - Create the zenoh session /// - Create a join set to spawn and handle incoming messages tasks /// - Create a subscriber for each of the parent nodes (Should only be 1, unless it is a joiner node) + /// - Create an abort listener task that will listen for stop requests /// - For each subscriber, handle the incoming message appropriately /// /// # Errors @@ -445,6 +449,7 @@ impl DockerPipelineRunner { Ok(()) } + /// This is the actual handler for incoming messages for the node async fn start_async_processor_task( subscriber: Subscriber>, node_processor: Arc>>, @@ -519,6 +524,7 @@ impl DockerPipelineRunner { Ok::<(), OrcaError>(()) } + /// This task will listen for stop requests on the given key expression async fn start_stop_request_task( node_processor: Arc>>, base_key_exp: String, @@ -992,7 +998,7 @@ async fn joiner() -> Result<()> { // Make each parent has 1 packet for idx in 0..2 { - let packet = make_test_packet(format!("key_{idx}"), "data_A.txt".to_string().into()); + let packet = make_test_packet(format!("key_{idx}"), "data_A.txt".to_owned().into()); joiner_processor.process_packet( &format!("{idx}"), &idx.to_string(), @@ -1078,6 +1084,7 @@ async fn joiner() -> Result<()> { Ok(()) } +/// Helper function to create a test packet with a given key and path #[cfg(test)] fn make_test_packet(key: String, path: PathBuf) -> HashMap { use crate::uniffi::model::{Blob, BlobKind}; diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 1dd90f58..2c0ee628 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -37,7 +37,6 @@ async fn basic_run() -> Result<()> { pipeline_result.output_packets.len() == 1, "Expected exactly one output packet." ); - println!("{:?}", pipeline_result.output_packets); Ok(()) } From 16e0cbbf8ca5f5923b85f9c52ef26f6da48df564 Mon Sep 17 00:00:00 2001 From: Synicix Date: Sat, 26 Jul 2025 07:06:15 +0000 Subject: [PATCH 25/29] Save progress --- src/core/error.rs | 5 + src/uniffi/error.rs | 31 ++++ src/uniffi/model.rs | 89 ++++++++++- src/uniffi/orchestrator/agent.rs | 28 +++- src/uniffi/orchestrator/docker.rs | 2 +- src/uniffi/pipeline_runner/runner.rs | 231 ++++++++++++++++++++------- 6 files changed, 312 insertions(+), 74 deletions(-) diff --git a/src/core/error.rs b/src/core/error.rs index cffb538c..9be7405b 100644 --- a/src/core/error.rs +++ b/src/core/error.rs @@ -116,6 +116,7 @@ impl fmt::Debug for OrcaError { | Kind::FailedToParseDot { backtrace, .. } | Kind::GeneratedNamesOverflow { backtrace, .. } | Kind::InvalidFilepath { backtrace, .. } + | Kind::InvalidIndex { backtrace, .. } | Kind::InvalidPodResultTerminatedDatetime { backtrace, .. } | Kind::KeyMissing { backtrace, .. } | Kind::NoAnnotationFound { backtrace, .. } @@ -124,6 +125,10 @@ impl fmt::Debug for OrcaError { | Kind::NoMatchingPodRun { backtrace, .. } | Kind::NoRemainingServices { backtrace, .. } | Kind::NoTagFoundInContainerAltImage { backtrace, .. } + | Kind::PodJobSubmissionFailed { backtrace, .. } + | Kind::PodJobProcessingError { backtrace, .. } + | Kind::StatusConversionFailure { backtrace, .. } + | Kind::UnsupportedPathType { backtrace, .. } | Kind::BollardError { backtrace, .. } | Kind::ChannelReceiveError { backtrace, .. } | Kind::GlobPatternError { backtrace, .. } diff --git a/src/uniffi/error.rs b/src/uniffi/error.rs index e94159f4..355e723d 100644 --- a/src/uniffi/error.rs +++ b/src/uniffi/error.rs @@ -18,6 +18,8 @@ use tokio::sync::oneshot; use tokio::task; use uniffi; +use crate::uniffi::orchestrator::Status; + /// Shorthand for a Result that returns an `OrcaError`. pub type Result = result::Result; /// Possible errors you may encounter. @@ -51,6 +53,11 @@ pub(crate) enum Kind { source: io::Error, backtrace: Option, }, + #[snafu(display("Failed to get items at idx {idx}."))] + InvalidIndex { + idx: usize, + backtrace: Option, + }, #[snafu(display( "An invalid datetime was set for pod result for pod job (hash: {pod_job_hash})." ))] @@ -90,11 +97,35 @@ pub(crate) enum Kind { path: PathBuf, backtrace: Option, }, + #[snafu(display("Pod job {hash} failed to process with reason: {reason}."))] + PodJobProcessingError { + hash: String, + reason: String, + backtrace: Option, + }, + #[snafu(display( + "Failed to convert status {status:?} to PodResultStatus with reason: {reason}." + ))] + StatusConversionFailure { + status: Status, + reason: String, + backtrace: Option, + }, + #[snafu(display("Unsupported path type: {path:?}."))] + UnsupportedPathType { + path: PathBuf, + backtrace: Option, + }, #[snafu(display("Failed to send message because: {reason}"))] SendError { reason: String, backtrace: Option, }, + #[snafu(display("Pod job submission failed with reason: {reason}."))] + PodJobSubmissionFailed { + reason: String, + backtrace: Option, + }, #[snafu(transparent)] BollardError { source: BollardError, diff --git a/src/uniffi/model.rs b/src/uniffi/model.rs index b2c7dca7..1a8ad1a6 100644 --- a/src/uniffi/model.rs +++ b/src/uniffi/model.rs @@ -1,12 +1,16 @@ use crate::{ core::{ - crypto::{hash_blob, hash_buffer}, + crypto::{hash_blob, hash_buffer, hash_dir, hash_file}, model::{ deserialize_pod, deserialize_pod_job, serialize_hashmap, serialize_hashmap_option, to_yaml, }, + util::get, + }, + uniffi::{ + error::{Kind, OrcaError, Result}, + orchestrator::Status, }, - uniffi::{error::Result, orchestrator::Status}, }; use derive_more::Display; use getset::CloneGetters; @@ -183,8 +187,85 @@ impl PodJob { ..pod_job_no_hash }) } + + /// Util function to get the `output_packet` from a given `pod_job`, assuming it results already computed + /// # Errors + /// Will return `Err` if the output packet cannot be constructed, e.g. if the pod job has not been run yet or the output directory is not set. + pub fn get_output_packet( + &self, + namespace_lookup: &HashMap, + ) -> Result> { + self.pod + .output_spec + .iter() + .map(|(key, value)| { + // Construct the full path and figure out if it is a file or directory + let namespace_path = get(namespace_lookup, &self.output_dir.namespace)?; + let rel_path = self.output_dir.path.join(&value.path); + let abs_path = namespace_path.join(&rel_path); + + // Check if if it is a file or directory + let path_set = if abs_path.is_file() { + PathSet::Unary(Blob { + kind: BlobKind::File, + location: URI { + namespace: self.output_dir.namespace.clone(), + path: rel_path, + }, + checksum: hash_file(&abs_path)?, + }) + } else if abs_path.is_dir() { + PathSet::Unary(Blob { + kind: BlobKind::Directory, + location: URI { + namespace: self.output_dir.namespace.clone(), + path: rel_path, + }, + checksum: hash_dir(&abs_path)?, + }) + } else { + return Err(OrcaError { + kind: Kind::UnsupportedPathType { + path: abs_path, + backtrace: Some(snafu::Backtrace::capture()), + }, + }); + }; + Ok((key.clone(), path_set)) + }) + .collect::>() + } } +#[derive(uniffi::Enum, Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +/// Status of a pod result. +pub enum PodResultStatus { + /// Pod Job completed successfully. + Completed, + /// Pod Job failed with an exit code. + Failed(i16), + /// Mainly used for default values, not a valid status. + #[default] + Unset, +} + +impl TryFrom for PodResultStatus { + type Error = OrcaError; + + fn try_from(status: Status) -> Result { + match status { + Status::Completed => Ok(Self::Completed), + Status::Failed(code) => Ok(Self::Failed(code)), + Status::Running | Status::Unset => Err(OrcaError { + kind: Kind::StatusConversionFailure { + status, + reason: "Cannot convert Running or Unset status to PodResultStatus".to_owned(), + backtrace: Some(snafu::Backtrace::capture()), + }, + }), + } + } +} /// Result from a compute job run. #[derive(uniffi::Record, Serialize, Deserialize, Debug, Clone, PartialEq, Default)] pub struct PodResult { @@ -199,7 +280,7 @@ pub struct PodResult { /// Name given by orchestrator. pub assigned_name: String, /// Status of compute run when terminated. - pub status: Status, + pub status: PodResultStatus, /// Time in epoch when created in seconds. pub created: u64, /// Time in epoch when terminated in seconds. @@ -216,7 +297,7 @@ impl PodResult { annotation: Option, pod_job: Arc, assigned_name: String, - status: Status, + status: PodResultStatus, created: u64, terminated: u64, ) -> Result { diff --git a/src/uniffi/orchestrator/agent.rs b/src/uniffi/orchestrator/agent.rs index bcf48fcc..b99ddea5 100644 --- a/src/uniffi/orchestrator/agent.rs +++ b/src/uniffi/orchestrator/agent.rs @@ -1,9 +1,9 @@ use crate::{ core::orchestrator::agent::{EventPayload, start_service}, uniffi::{ - error::{OrcaError, Result, selector}, - model::{PodJob, PodResult}, - orchestrator::{Orchestrator, Status, docker::LocalDockerOrchestrator}, + error::{Kind, OrcaError, Result, selector}, + model::{PodJob, PodResult, PodResultStatus}, + orchestrator::{Orchestrator, docker::LocalDockerOrchestrator}, store::{Store as _, filestore::LocalFileStore}, }, }; @@ -13,7 +13,7 @@ use futures_util::future::join_all; use getset::CloneGetters; use serde_json::Value; use snafu::{OptionExt as _, ResultExt as _}; -use std::{collections::HashMap, path::PathBuf, sync::Arc}; +use std::{backtrace::Backtrace, collections::HashMap, path::PathBuf, sync::Arc}; use tokio::task::JoinSet; use uniffi; use zenoh; @@ -152,20 +152,33 @@ impl Agent { namespace_lookup.clone(), |pod_job: &PodJob| EventPayload::Request(pod_job.clone()), async |agent, inner_namespace_lookup, _, pod_job| { + println!("Processing pod job: {}", pod_job.hash); let pod_run = agent .orchestrator .start(&inner_namespace_lookup, &pod_job) - .await?; + .await + .unwrap(); let pod_result = agent.orchestrator.get_result(&pod_run).await?; agent.orchestrator.delete(&pod_run).await?; Ok(pod_result) }, async |client, pod_result| { let response_topic = match &pod_result.status { - Status::Completed => &format!("success/pod_job/{}", pod_result.pod_job.hash), - Status::Running | Status::Failed(_) | Status::Unset => { + PodResultStatus::Completed => { + &format!("success/pod_job/{}", pod_result.pod_job.hash) + } + PodResultStatus::Failed(_) => { &format!("failure/pod_job/{}", pod_result.pod_job.hash) } + PodResultStatus::Unset => { + return Err(OrcaError { + kind: Kind::PodJobProcessingError { + hash: pod_result.pod_job.hash.clone(), + reason: "PodResultStatus should not be unset".to_owned(), + backtrace: Some(Backtrace::capture()), + }, + }); + } }; client.publish(response_topic, &pod_result).await }, @@ -197,6 +210,7 @@ impl Agent { async |_, ()| Ok(()), )); } + services .join_next() .await diff --git a/src/uniffi/orchestrator/docker.rs b/src/uniffi/orchestrator/docker.rs index ff3790ad..38a8e248 100644 --- a/src/uniffi/orchestrator/docker.rs +++ b/src/uniffi/orchestrator/docker.rs @@ -223,7 +223,7 @@ impl Orchestrator for LocalDockerOrchestrator { None, Arc::clone(&pod_run.pod_job), pod_run.assigned_name.clone(), - result_info.status, + result_info.status.try_into()?, result_info.created, result_info .terminated diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 535b54d6..6cba4389 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -1,8 +1,12 @@ use crate::{ core::{crypto::hash_buffer, model::serialize_hashmap, util::get}, uniffi::{ - error::{OrcaError, Result, selector}, - model::{PathSet, Pod, PodJob, URI}, + error::{Kind, OrcaError, Result, selector}, + model::{PathSet, Pod, PodJob, PodResult, PodResultStatus, URI}, + orchestrator::{ + agent::{Agent, AgentClient, Response}, + docker::LocalDockerOrchestrator, + }, pipeline::{Kernel, Mapper, Node, Pipeline, PipelineJob, PipelineResult}, }, }; @@ -52,6 +56,8 @@ struct PipelineRun { pipeline_job: PipelineJob, // The pipeline job that this run is associated with node_tasks: JoinSet>, // JoinSet of tasks for each node in the pipeline outputs: Arc>>>>, // String is the node key, while hash + orchestrator_agent: Arc, // This is placed in pipeline due to the current design requiring a namespace to operate on + orchestrator_agent_task: JoinSet>, // JoinSet of tasks for the orchestrator agent } impl PartialEq for PipelineRun { @@ -75,7 +81,6 @@ impl Display for PipelineRun { } /// Runner that uses a docker agent to run pipelines -#[derive(Default)] pub struct DockerPipelineRunner { /// User label on which group of agents this runner is associated with pub group: String, @@ -96,9 +101,10 @@ impl DockerPipelineRunner { /// # Errors /// Will error out if the environment variable `HOSTNAME` is not set pub fn new(group: String) -> Result { + let host = hostname::get()?.to_string_lossy().to_string(); Ok(Self { group, - host: hostname::get()?.to_string_lossy().to_string(), + host, pipeline_runs: HashMap::new(), }) } @@ -117,13 +123,31 @@ impl DockerPipelineRunner { namespace: &str, // Name space to save pod_results to namespace_lookup: &HashMap, ) -> Result { + // Create the orchestrator + let orchestrator_agent = Agent::new( + self.group.clone(), + self.host.clone(), + LocalDockerOrchestrator::new()?.into(), + )?; + // Create a new pipeline run let mut pipeline_run = PipelineRun { pipeline_job, outputs: Arc::new(RwLock::new(HashMap::new())), node_tasks: JoinSet::new(), + orchestrator_agent: orchestrator_agent.into(), + orchestrator_agent_task: JoinSet::new(), }; + let orchestrator_agent_clone = Arc::clone(&pipeline_run.orchestrator_agent); + let namespace_lookup_clone = namespace_lookup.clone(); + // Start the orchestrator agent service + pipeline_run.orchestrator_agent_task.spawn(async move { + orchestrator_agent_clone + .start(&namespace_lookup_clone, None) + .await + }); + // The id for the pipeline_run is the pipeline_job hash let pipeline_run_id = pipeline_run.pipeline_job.hash.clone(); @@ -155,6 +179,7 @@ impl DockerPipelineRunner { namespace.to_owned(), namespace_lookup.clone(), Arc::clone(&session), + Arc::clone(&pipeline_run.orchestrator_agent.client), )); } @@ -163,7 +188,7 @@ impl DockerPipelineRunner { for node in pipeline_run.pipeline_job.pipeline.get_leaf_nodes() { pipeline_run .node_tasks - .spawn(Self::create_capture_task_for_node( + .spawn(Self::create_output_capture_task_for_node( node.id.clone(), Arc::clone(&pipeline_run.outputs), Arc::clone(&session), @@ -298,7 +323,7 @@ impl DockerPipelineRunner { /// This will capture the outputs of the given nodes and store it in the `outputs` map #[expect(clippy::type_complexity, reason = "Needed for async")] - async fn create_capture_task_for_node( + async fn create_output_capture_task_for_node( node_id: String, outputs: Arc>>>>, session: Arc, @@ -350,11 +375,12 @@ impl DockerPipelineRunner { namespace: String, namespace_lookup: HashMap, session: Arc, + client: Arc, ) -> Result<()> { // Create the correct processor for the node based on the kernel type let node_processor: Arc>> = Arc::new(Mutex::new( match get(&pipeline.kernel_lut, &node.kernel_hash)? { - Kernel::Pod(pod) => Box::new(PodProcessor::new(Arc::clone(pod))), + Kernel::Pod(pod) => Box::new(PodProcessor::new(Arc::clone(pod), client)), Kernel::Mapper(mapper) => Box::new(MapperProcessor::new(Arc::clone(mapper))), Kernel::Joiner => { // Need to get the parent node id for this joiner node @@ -614,57 +640,50 @@ async fn try_to_forward_err_msg( struct PodProcessor { pod: Arc, processing_tasks: JoinSet<()>, + client: Arc, } impl PodProcessor { - fn new(pod: Arc) -> Self { + fn new(pod: Arc, client: Arc) -> Self { Self { pod, processing_tasks: JoinSet::new(), + client, } } -} -#[async_trait] -impl NodeProcessor for PodProcessor { - #[expect( - clippy::unwrap_used, - clippy::unwrap_in_result, - reason = "Hard code for now, will be replaced by agent" - )] - fn process_packet( - &mut self, - _sender_node_id: &str, - node_id: &str, - packet: &HashMap, + /// Will handle the creation of the pod job, submission to the agent, listening for completion, and extracting the `output_packet` if successful + async fn start_pod_job_task( + node_id: String, + pod: Arc, + packet: HashMap, + client: Arc, session: Arc, - base_output_key_exp: &str, - namespace: &str, + base_output_key_exp: String, + namespace: String, namespace_lookup: &HashMap, ) -> Result<()> { - // We need a unique hash for this given input packet process by the node - // therefore we need to generate a hash that has the pod_id + input_packet + // For now we will just send the input_packet to the success channel let node_id_bytes = node_id.as_bytes().to_vec(); - let packet_copy = packet.clone(); let input_packet_hash = { let mut buf = node_id_bytes; let mut serializer = Serializer::new(&mut buf); - serialize_hashmap(&packet_copy, &mut serializer)?; + serialize_hashmap(&packet, &mut serializer)?; hash_buffer(buf) }; let output_dir = URI { - namespace: namespace.to_owned(), - path: PathBuf::from(format!("pod_runs/{}/{}", self.pod.hash, input_packet_hash)), + namespace: namespace.clone(), + path: PathBuf::from(format!("pod_runs/{node_id}/{input_packet_hash}")), }; - let cpu_limit = self.pod.recommended_cpus; - let memory_limit = self.pod.recommended_memory; + let cpu_limit = pod.recommended_cpus; + let memory_limit = pod.recommended_memory; // Create the pod job let pod_job = PodJob::new( None, - Arc::clone(&self.pod), - packet.clone(), + Arc::clone(&pod), + packet, output_dir, cpu_limit, memory_limit, @@ -672,44 +691,133 @@ impl NodeProcessor for PodProcessor { namespace_lookup, )?; - // Simulate pod execution by just printing out pod_job_hash and pod hash - // This will be replaced by sending the pod_job to the orchestrator via the agent + // Create listener for pod_job + let target_key_exp = format!( + "group/{}/{}/*/pod_job/{}", + client.group, client.host, pod_job.hash + ); - // Build the output_packet, in reality, this will be extracted from the pod_result + // Create the subscriber + let pod_job_subscriber = session + .declare_subscriber(target_key_exp) + .await + .context(selector::AgentCommunicationFailure {})?; - let output_packet = self - .pod - .output_spec - .keys() - .map(|output_key| (output_key.clone(), packet.values().next().cloned().unwrap())) - .collect::>(); + // Create the async task to listen for the pod job completion + let pod_job_listener_task = tokio::spawn(async move { + // Wait for the pod job to complete and extract the result + + let sample = pod_job_subscriber + .recv_async() + .await + .context(selector::AgentCommunicationFailure {})?; + // Extract the pod_result from the payload + let pod_result: PodResult = serde_json::from_slice(&sample.payload().to_bytes())?; + Ok::<_, OrcaError>(pod_result) + }); + + // Submit it to the client and get the response to make sure it was successful + let responses = client.submit_pod_jobs(vec![pod_job.into()]).await; + let response = responses + .first() + .context(selector::InvalidIndex { idx: 0_usize })?; + + match response { + Response::Ok => (), + Response::Err(err) => { + return Err(OrcaError { + kind: Kind::PodJobSubmissionFailed { + reason: err.clone(), + backtrace: Some(snafu::Backtrace::capture()), + }, + }); + } + } + + // Get the pod result from the listener task + let pod_result = pod_job_listener_task.await??; + // Get the output packet for the pod result + let output_packet = match pod_result.status { + PodResultStatus::Completed => { + // Get the output packet + pod_result.pod_job.get_output_packet(namespace_lookup)? + } + PodResultStatus::Failed(exit_code) => { + // Processing failed, thus return the error + return Err(OrcaError { + kind: Kind::PodJobProcessingError { + hash: pod_result.pod_job.hash.clone(), + reason: format!("Pod processing failed with exit code {exit_code}"), + backtrace: Some(snafu::Backtrace::capture()), + }, + }); + } + PodResultStatus::Unset => { + // This should not happen, but if it does, we will return an error + return Err(OrcaError { + kind: Kind::PodJobProcessingError { + hash: pod_result.pod_job.hash.clone(), + reason: "Pod processing status is unset".to_owned(), + backtrace: Some(snafu::Backtrace::capture()), + }, + }); + } + }; + + session + .put( + base_output_key_exp.clone() + "/" + SUCCESS_KEY_EXP, + serde_json::to_string(&NodeOutput::Packet(node_id.clone(), output_packet))?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + Ok::<(), OrcaError>(()) + } +} + +#[async_trait] +impl NodeProcessor for PodProcessor { + fn process_packet( + &mut self, + _sender_node_id: &str, + node_id: &str, + packet: &HashMap, + session: Arc, + base_output_key_exp: &str, + namespace: &str, + namespace_lookup: &HashMap, + ) -> Result<()> { + // We need a unique hash for this given input packet process by the node + // therefore we need to generate a hash that has the pod_id + input_packet + let pod_clone = Arc::clone(&self.pod); + let client_clone = Arc::clone(&self.client); + let node_id_owned = node_id.to_owned(); + let packet_owned = packet.clone(); + let base_output_key_exp_owned = base_output_key_exp.to_owned(); + let namespace_owned = namespace.to_owned(); + let namespace_lookup_owned = namespace_lookup.clone(); - let node_id_clone = node_id.to_owned(); - let output_key_exp_clone = base_output_key_exp.to_owned(); self.processing_tasks.spawn(async move { - // For now we will just send the input_packet to the success channel - let results = async { - session - .put( - output_key_exp_clone.clone() + "/" + SUCCESS_KEY_EXP, - serde_json::to_string(&NodeOutput::Packet( - node_id_clone.clone(), - output_packet, - ))?, - ) - .await - .context(selector::AgentCommunicationFailure {})?; - Ok::<(), OrcaError>(()) - }; + let results = Self::start_pod_job_task( + node_id_owned.clone(), + pod_clone, + packet_owned, + client_clone, + Arc::clone(&session), + base_output_key_exp_owned.clone(), + namespace_owned.clone(), + &namespace_lookup_owned, + ) + .await; - match results.await { + match results { Ok(()) => {} Err(err) => { try_to_forward_err_msg( session, err, - &format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), - &node_id_clone, + &format!("{base_output_key_exp_owned}/{FAILURE_KEY_EXP}"), + &node_id_owned, ) .await; } @@ -806,7 +914,6 @@ impl NodeProcessor for MapperProcessor { while (self.processing_tasks.join_next().await).is_some() { // The only error that should be forwarded here is the failure to send the output packet } - true } From 65cbbdd497cab96f0a4b71c3f2b8296b481193d9 Mon Sep 17 00:00:00 2001 From: Synicix Date: Mon, 28 Jul 2025 19:56:52 +0000 Subject: [PATCH 26/29] Save progress --- src/core/orchestrator/agent.rs | 13 ++++++++++++- src/uniffi/orchestrator/agent.rs | 15 ++++++++++----- src/uniffi/pipeline_runner/runner.rs | 22 +++++++++------------- tests/fixture/mod.rs | 7 ++++--- tests/orchestrator.rs | 6 +++--- tests/pipeline_runner.rs | 24 +++++++++++++++++++++++- 6 files changed, 61 insertions(+), 26 deletions(-) diff --git a/src/core/orchestrator/agent.rs b/src/core/orchestrator/agent.rs index bd93077c..e5f39066 100644 --- a/src/core/orchestrator/agent.rs +++ b/src/core/orchestrator/agent.rs @@ -28,7 +28,7 @@ static RE_PODJOB_ACTION: LazyLock = LazyLock::new(|| { group\/(?[a-z_]+)\/ (?request|reservation|success|failure)\/ pod_job\/(?[0-9a-f]+)\/ - host\/(?[a-z_]+)\/ + host\/(?[0-9a-z_]+)\/ timestamp\/(?.*?) $ ", @@ -154,6 +154,16 @@ where .await .context(selector::AgentCommunicationFailure {})?; while let Ok(sample) = subscriber.recv_async().await { + println!( + "Received message on key expression: {}", + sample.key_expr().as_str(), + ); + + println!( + "Received payload: {:?}", + RE_PODJOB_ACTION.captures(sample.key_expr().as_str()) + ); + if let (Ok(input), Some(metadata)) = ( serde_json::from_slice::(&sample.payload().to_bytes()), RE_PODJOB_ACTION.captures(sample.key_expr().as_str()), @@ -165,6 +175,7 @@ where subgroup: metadata["pod_job_hash"].to_string(), }; let _event_payload = event_classifier(&input); + println!("Sending it to request task."); tasks.spawn({ let inner_request_task = request_task.clone(); let inner_inner_agent = Arc::clone(&inner_agent); diff --git a/src/uniffi/orchestrator/agent.rs b/src/uniffi/orchestrator/agent.rs index b99ddea5..7029bd2e 100644 --- a/src/uniffi/orchestrator/agent.rs +++ b/src/uniffi/orchestrator/agent.rs @@ -46,7 +46,7 @@ pub struct AgentClient { /// Connecting agent's assigned name used for reference. pub host: String, #[getset(skip)] - pub(crate) session: zenoh::Session, + pub(crate) session: Arc, } #[uniffi::export] @@ -67,7 +67,8 @@ impl AgentClient { .await .context(selector::AgentCommunicationFailure {})?, ) - })?, + })? + .into(), }) } /// Submit many pod jobs to be processed in parallel. @@ -156,13 +157,13 @@ impl Agent { let pod_run = agent .orchestrator .start(&inner_namespace_lookup, &pod_job) - .await - .unwrap(); + .await?; let pod_result = agent.orchestrator.get_result(&pod_run).await?; - agent.orchestrator.delete(&pod_run).await?; + //agent.orchestrator.delete(&pod_run).await?; Ok(pod_result) }, async |client, pod_result| { + println!("Finished processing pod job: {}", pod_result.pod_job.hash); let response_topic = match &pod_result.status { PodResultStatus::Completed => { &format!("success/pod_job/{}", pod_result.pod_job.hash) @@ -210,6 +211,10 @@ impl Agent { async |_, ()| Ok(()), )); } + // Create a service that responds to pod_job_worker availability requests. + services.spawn(start_service( + + )) services .join_next() diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 6cba4389..a93d9efa 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -139,6 +139,9 @@ impl DockerPipelineRunner { orchestrator_agent_task: JoinSet::new(), }; + // Get the preexisting zenoh session from agent + let session = Arc::clone(&pipeline_run.orchestrator_agent.client.session); + let orchestrator_agent_clone = Arc::clone(&pipeline_run.orchestrator_agent); let namespace_lookup_clone = namespace_lookup.clone(); // Start the orchestrator agent service @@ -153,13 +156,6 @@ impl DockerPipelineRunner { let graph = &pipeline_run.pipeline_job.pipeline.graph; - // Create the subscriber to listen to node ready status before sending inputs - let session = Arc::new( - zenoh::open(zenoh::Config::default()) - .await - .context(selector::AgentCommunicationFailure {})?, - ); - let subscriber = session .declare_subscriber(self.get_base_key_exp(&pipeline_run_id) + "/*/status/ready") .await @@ -692,10 +688,7 @@ impl PodProcessor { )?; // Create listener for pod_job - let target_key_exp = format!( - "group/{}/{}/*/pod_job/{}", - client.group, client.host, pod_job.hash - ); + let target_key_exp = format!("group/{}/*/pod_job/{}/**", client.group, pod_job.hash); // Create the subscriber let pod_job_subscriber = session @@ -706,7 +699,6 @@ impl PodProcessor { // Create the async task to listen for the pod job completion let pod_job_listener_task = tokio::spawn(async move { // Wait for the pod job to complete and extract the result - let sample = pod_job_subscriber .recv_async() .await @@ -735,7 +727,11 @@ impl PodProcessor { } // Get the pod result from the listener task - let pod_result = pod_job_listener_task.await??; + println!("Trying to get pod job result..."); + let temp = pod_job_listener_task.await?; + println!("Waiting for pod job to complete... {:?}", temp); + let pod_result = temp?; + // Get the output packet for the pod result let output_packet = match pod_result.status { PodResultStatus::Completed => { diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index df74e440..64800499 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -10,8 +10,9 @@ use names::{Generator, Name}; use orcapod::uniffi::{ error::Result, - model::{Annotation, Blob, BlobKind, PathInfo, PathSet, Pod, PodJob, PodResult, URI}, - orchestrator::Status, + model::{ + Annotation, Blob, BlobKind, PathInfo, PathSet, Pod, PodJob, PodResult, PodResultStatus, URI, + }, pipeline::{Kernel, Mapper, Pipeline, PipelineJob}, store::{ModelID, ModelInfo, Store}, }; @@ -137,7 +138,7 @@ pub fn pod_result_style( }), pod_job_style(namespace_lookup)?.into(), "simple-endeavour".to_owned(), - Status::Completed, + PodResultStatus::Completed, 1_737_922_307, 1_737_925_907, ) diff --git a/tests/orchestrator.rs b/tests/orchestrator.rs index 7ab494e3..11775a5f 100644 --- a/tests/orchestrator.rs +++ b/tests/orchestrator.rs @@ -8,7 +8,7 @@ use fixture::{ use futures_util::future::join_all; use orcapod::uniffi::{ error::{OrcaError, Result}, - model::URI, + model::{PodResultStatus, URI}, orchestrator::{ImageKind, Orchestrator as _, PodRun, Status, docker::LocalDockerOrchestrator}, }; use std::{collections::HashMap, path::PathBuf}; @@ -131,7 +131,7 @@ async fn remote_container_image_failed() -> Result<()> { orch.delete(&pod_run).await?; assert!( - matches!(pod_result.status, Status::Failed(1)), + matches!(pod_result.status, PodResultStatus::Failed(1)), "Expected to fail but did not." ); Ok(()) @@ -161,7 +161,7 @@ async fn verify_pod_result_not_running() -> Result<()> { let statuses = results .into_iter() .map(|result| Ok(result?.status)) - .filter(|status| !matches!(status, Ok(Status::Completed))) + .filter(|status| !matches!(status, Ok(PodResultStatus::Completed))) .collect::>>()?; println!("statuses: {statuses:?}"); diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 2c0ee628..a6e6f19c 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -13,8 +13,30 @@ use orcapod::uniffi::{error::Result, pipeline_runner::runner::DockerPipelineRunn use crate::fixture::TestDirs; use fixture::pipeline_job; -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] async fn basic_run() -> Result<()> { + // create a zenoh session to print out all communication message + let session = zenoh::open(zenoh::Config::default()) + .await + .expect("Failed to open zenoh session"); + + tokio::spawn(async move { + // Subscribe to all messages in the 'test' group + let sub = session + .declare_subscriber("**") + .await + .expect("Failed to declare subscriber"); + + while let Ok(sample) = sub.recv_async().await { + // Print the key expression and payload of each message + println!( + "Received message: {}: {:?}", + sample.key_expr().as_str(), + sample.payload(); + ); + } + }); + let pipeline_job = pipeline_job()?; // Create the runner From 58864a005ee1014f6dd77fa3878c543626c7d808 Mon Sep 17 00:00:00 2001 From: Synicix Date: Thu, 31 Jul 2025 22:15:34 +0000 Subject: [PATCH 27/29] Fix majority of merge errors --- output.txt | 3 + src/core/mod.rs | 3 +- src/core/model/mod.rs | 8 +- src/uniffi/model/packet.rs | 22 +++ src/uniffi/model/pipeline.rs | 35 +++-- src/uniffi/pipeline_runner/runner.rs | 4 - tests/extra/data/input.txt | 0 tests/extra/data/input1.txt | 1 + tests/extra/data/input2.txt | 1 + tests/fixture/mod.rs | 191 +++++++++++++++------------ tests/model.rs | 3 +- tests/orchestrator.rs | 2 +- tests/pipeline.rs | 38 +++--- tests/pipeline_runner.rs | 36 ++--- 14 files changed, 205 insertions(+), 142 deletions(-) create mode 100644 output.txt delete mode 100644 tests/extra/data/input.txt create mode 100644 tests/extra/data/input1.txt create mode 100644 tests/extra/data/input2.txt diff --git a/output.txt b/output.txt new file mode 100644 index 00000000..2ca3cd52 --- /dev/null +++ b/output.txt @@ -0,0 +1,3 @@ +1 +2 +2 diff --git a/src/core/mod.rs b/src/core/mod.rs index 52a6d376..6e3cc1ca 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -2,7 +2,8 @@ pub mod crypto; pub(crate) mod error; pub(crate) mod graph; -pub(crate) mod model; +/// Model definitions and utilities. +pub mod model; pub(crate) mod orchestrator; pub(crate) mod store; pub(crate) mod util; diff --git a/src/core/model/mod.rs b/src/core/model/mod.rs index e74cd7e6..50194881 100644 --- a/src/core/model/mod.rs +++ b/src/core/model/mod.rs @@ -32,7 +32,7 @@ pub fn to_yaml(instance: &T) -> Result { Ok(yaml) } -pub fn serialize_hashmap( +pub(crate) fn serialize_hashmap( map: &HashMap, serializer: S, ) -> result::Result @@ -44,7 +44,7 @@ where } #[expect(clippy::ref_option, reason = "Serde requires this signature.")] -pub fn serialize_hashmap_option( +pub(crate) fn serialize_hashmap_option( map_option: &Option>, serializer: S, ) -> result::Result @@ -57,5 +57,5 @@ where sorted.serialize(serializer) } -pub mod pipeline; -pub mod pod; +pub(crate) mod pipeline; +pub(crate) mod pod; diff --git a/src/uniffi/model/packet.rs b/src/uniffi/model/packet.rs index b33b3e17..79e554e6 100644 --- a/src/uniffi/model/packet.rs +++ b/src/uniffi/model/packet.rs @@ -31,6 +31,15 @@ pub struct URI { pub path: PathBuf, } +#[uniffi::export] +impl URI { + #[uniffi::constructor] + /// Create a new URI with the given namespace and path. + pub const fn new(namespace: String, path: PathBuf) -> Self { + Self { namespace, path } + } +} + /// BLOB with metadata. #[derive(uniffi::Record, Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] pub struct Blob { @@ -42,6 +51,19 @@ pub struct Blob { pub checksum: String, } +#[uniffi::export] +impl Blob { + #[uniffi::constructor] + /// Create a new BLOB with the given kind, location, and checksum. + pub const fn new(kind: BlobKind, location: URI) -> Self { + Self { + kind, + location, + checksum: String::new(), + } + } +} + /// A single BLOB or a collection of BLOBs. #[derive(uniffi::Enum, Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] #[serde(untagged)] diff --git a/src/uniffi/model/pipeline.rs b/src/uniffi/model/pipeline.rs index c2b1cb17..72e9ab72 100644 --- a/src/uniffi/model/pipeline.rs +++ b/src/uniffi/model/pipeline.rs @@ -8,6 +8,7 @@ use crate::{ uniffi::{ error::Result, model::{ + Annotation, packet::{PathSet, URI}, pod::Pod, }, @@ -31,9 +32,12 @@ pub struct Pipeline { #[getset(skip)] pub graph: DiGraph, /// Exposed, internal input specification. Each input may be fed into more than one node/key if desired. - pub input_spec: HashMap>, + pub input_spec: HashMap>, /// Exposed, internal output specification. Each output is associated with only one node/key. - pub output_spec: HashMap, + pub output_spec: HashMap, + /// Optional annotation for the pipeline. + #[getset(skip)] + pub annotation: Option, } #[uniffi::export] @@ -46,15 +50,17 @@ impl Pipeline { #[uniffi::constructor] pub fn new( graph_dot: &str, - metadata: HashMap, - input_spec: &HashMap>, - output_spec: &HashMap, + kernel_map: HashMap, + input_spec: HashMap>, + output_spec: HashMap, + annotation: Option, ) -> Result { - let graph = make_graph(graph_dot, metadata)?; + let graph = make_graph(graph_dot, kernel_map)?; Ok(Self { graph, - input_spec: input_spec.clone(), - output_spec: output_spec.clone(), + input_spec, + output_spec, + annotation, }) } } @@ -78,6 +84,9 @@ pub struct PipelineJob { pub input_packet: HashMap>, /// Attached, external output directory. pub output_dir: URI, + /// Optional annotation for the pipeline job. + #[getset(skip)] + pub annotation: Option, } #[uniffi::export] @@ -91,7 +100,8 @@ impl PipelineJob { pub fn new( pipeline: Arc, input_packet: &HashMap>, - output_dir: &URI, + output_dir: URI, + annotation: Option, namespace_lookup: &HashMap, ) -> Result { validate_packet("input".into(), &pipeline.input_spec, input_packet)?; @@ -112,7 +122,8 @@ impl PipelineJob { hash: make_random_hash(), pipeline, input_packet: input_packet_with_checksum, - output_dir: output_dir.clone(), + output_dir, + annotation, }) } } @@ -137,6 +148,7 @@ impl PipelineJob { } } +/// Struct to hold the result of a pipeline execution. pub struct PipelineResult { /// The pipeline job that was executed. pub pipeline_job: Arc, @@ -175,6 +187,7 @@ impl From for Kernel { } } +/// Mapper struct to store mapping information between input and output stream keys. #[derive(uniffi::Object, Display, Serialize, Deserialize, Debug, PartialEq, Eq, Clone)] #[display("{self:#?}")] #[uniffi::export(Display)] @@ -206,7 +219,7 @@ impl Mapper { /// Index from pipeline node into pod specification. #[derive(uniffi::Record, Debug, Clone, Deserialize, Serialize, PartialEq, Eq)] -pub struct SpecURI { +pub struct NodeURI { /// Node reference name in pipeline. pub node_name: String, /// Specification key. diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 405fb51a..44ad3d57 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -53,10 +53,6 @@ struct ProcessingFailure { /// Internal representation of a pipeline run, this should not be made public due to the fact that it contains /// internal states and tasks -#[expect( - clippy::type_complexity, - reason = "too complex, but necessary for async handling" -)] #[derive(Debug)] struct PipelineRun { /// `PipelineJob` that this run is associated with diff --git a/tests/extra/data/input.txt b/tests/extra/data/input.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/extra/data/input1.txt b/tests/extra/data/input1.txt new file mode 100644 index 00000000..6ccbe584 --- /dev/null +++ b/tests/extra/data/input1.txt @@ -0,0 +1 @@ +input1_data diff --git a/tests/extra/data/input2.txt b/tests/extra/data/input2.txt new file mode 100644 index 00000000..901c1e11 --- /dev/null +++ b/tests/extra/data/input2.txt @@ -0,0 +1 @@ +input2_data diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index 23f7d217..12b73d53 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -13,9 +13,9 @@ use orcapod::uniffi::{ model::{ Annotation, packet::{Blob, BlobKind, PathInfo, PathSet, URI}, - pod::{Pod, PodJob, PodResult}, + pipeline::{Kernel, Mapper, NodeURI, Pipeline, PipelineJob}, + pod::{Pod, PodJob, PodResult, PodResultStatus}, }, - orchestrator::Status, store::{ModelID, ModelInfo, Store}, }; use std::{ @@ -156,7 +156,7 @@ pub fn pod_custom( image_reference.into(), command.into(), input_spec, - PathBuf::from("/tmp/output"), + PathBuf::from("/output"), HashMap::new(), "https://github.com/place/holder".to_owned(), 0.1, // 100 millicores as frac cores @@ -271,20 +271,29 @@ pub fn append_name_pod(pod_name: &str) -> Result { }), "alpine:3.14".to_owned(), format!( - "cp /input/input.txt /output/input.txt && echo \"Touch by Pod: {pod_name}\" >> /output/input.txt" + "cat input/input1.txt input/input2.txt > /output/output.txt && echo \"Processed by {pod_name}\" >> /output/output.txt" ), - HashMap::from([( - "input_text".to_owned(), - PathInfo { - path: PathBuf::from("/input/input.txt"), - match_pattern: r".*\.txt".to_owned(), - }, - )]), + HashMap::from([ + ( + "input1".to_owned(), + PathInfo { + path: PathBuf::from("/input/input1.txt"), + match_pattern: r".*\.txt".to_owned(), + }, + ), + ( + "input2".into(), + PathInfo { + path: PathBuf::from("/input/input2.txt"), + match_pattern: r".*\.txt".to_owned(), + }, + ), + ]), PathBuf::from("/output"), HashMap::from([( - "output_text".to_owned(), + "output".to_owned(), PathInfo { - path: PathBuf::from("/output/input.txt"), + path: PathBuf::from("/output/output.txt"), match_pattern: r".*\.txt".to_owned(), }, )]), @@ -299,62 +308,29 @@ pub fn pipeline() -> Result { // Create a simple pipeline where the functions job is to add append their name into the input file // Structure: A -> Mapper -> Joiner -> B -> Mapper -> C, D -> Mapper -> Joiner - // Create the components of the pipeline - let pod_a = append_name_pod("A")?; - let pod_b = append_name_pod("B")?; - let pod_c = append_name_pod("C")?; - let pod_d = append_name_pod("D")?; + // Create the kernel map + let mut kernel_map = HashMap::new(); - // Create the file mapper that will be used to map the output of one pod to the input of another - let file_mapper = Mapper::new(HashMap::from([( - "output_text".to_owned(), - "input_text".to_owned(), - )]))?; + // Insert the pod into the kernel map + for pod_name in ["A", "B", "C", "D"] { + kernel_map.insert(pod_name.into(), append_name_pod(pod_name)?.into()); + } // Create the file mapper that will be used to map the output of one pod to the input of another - let file_mapper_for_pod_d = Mapper::new(HashMap::from([( - "output_text".to_owned(), - "input2_text".to_owned(), - )]))?; - - let mut kernel_to_node_name = HashMap::>::new(); - - // Insert the pods into the kernel_to_node_name mapping - for pod in [&pod_a, &pod_b, &pod_c, &pod_d] { - kernel_to_node_name - .entry(pod.clone().into()) - .or_default() - .push( - pod.annotation - .as_ref() - .expect("Annotation missing.") - .name - .clone(), - ); - } + let mapper_kernel: Kernel = + Mapper::new(HashMap::from([("output".to_owned(), "input".to_owned())]))?.into(); + // Add the mappers + kernel_map.insert("pod_a_mapper".into(), mapper_kernel.clone()); + kernel_map.insert("pod_b_mapper".into(), mapper_kernel); + + // Create the file mapper for d which needs to be different + kernel_map.insert( + "pod_d_mapper".into(), + Mapper::new(HashMap::from([("output".to_owned(), "input2".to_owned())]))?.into(), + ); - // Add mapper to end of pod_a and pod_b - kernel_to_node_name - .entry(file_mapper.clone().into()) - .or_default() - .push("pod_a_mapper".to_owned()); - - kernel_to_node_name - .entry(file_mapper.into()) - .or_default() - .push("pod_b_mapper".to_owned()); - - // Insert mapper for pod_d - kernel_to_node_name - .entry(file_mapper_for_pod_d.into()) - .or_default() - .push("pod_d_mapper".to_owned()); - - // Add the joiner - kernel_to_node_name - .entry(Kernel::Joiner) - .or_default() - .push("pod_b_joiner".to_owned()); + // Add the joiner node + kernel_map.insert("pod_b_joiner".into(), Kernel::Joiner); // Write all the edges in DOT format let dot = " @@ -364,31 +340,73 @@ pub fn pipeline() -> Result { } "; - // Create pipeline with annotation - let annotation = Some(Annotation { - name: "Example Pipeline".to_owned(), - description: "This is an example pipeline. of A -> B -> C".to_owned(), - version: "1.0.0".to_owned(), - }); - - Pipeline::from_dot(&kernel_to_node_name, dot, annotation) + Pipeline::new( + dot, + kernel_map, + HashMap::from([ + ( + "input".into(), + vec![ + NodeURI { + node_name: "A".into(), + key: "input".into(), + }, + NodeURI { + node_name: "D".into(), + key: "input".into(), + }, + ], + ), + ( + "input2".into(), + vec![ + NodeURI { + node_name: "A".into(), + key: "input2".into(), + }, + NodeURI { + node_name: "D".into(), + key: "input2".into(), + }, + ], + ), + ]), + HashMap::from([( + "output".to_owned(), + NodeURI { + node_name: "C".into(), + key: "output".into(), + }, + )]), + Some(Annotation { + name: "Example Pipeline".to_owned(), + description: "This is an example pipeline. of A -> B -> C".to_owned(), + version: "1.0.0".to_owned(), + }), + ) } -pub fn pipeline_job() -> Result { +#[expect(clippy::implicit_hasher, reason = "Could be a false positive?")] +pub fn pipeline_job(namespace_lookup: &HashMap) -> Result { // Create a simple pipeline_job PipelineJob::new( - pipeline()?, - vec![HashMap::from([( - "input_text".to_owned(), - PathSet::Unary(Blob { - kind: BlobKind::File, - location: URI { - namespace: "default".to_owned(), - path: PathBuf::from("input.txt"), - }, - ..Default::default() - }), - )])], + pipeline()?.into(), + &HashMap::from([ + ( + "input1".into(), + vec![PathSet::Unary(Blob::new( + BlobKind::File, + URI::new("default".into(), "input.txt".into()), + ))], + ), + ( + "input2".into(), + vec![PathSet::Unary(Blob::new( + BlobKind::File, + URI::new("default".into(), "input2.txt".into()), + ))], + ), + ]), URI { namespace: "default".to_owned(), path: PathBuf::from("output"), @@ -398,6 +416,7 @@ pub fn pipeline_job() -> Result { description: "This is an example pipeline job.".to_owned(), version: "1.0.0".to_owned(), }), + namespace_lookup, ) } diff --git a/tests/model.rs b/tests/model.rs index 4180caf0..6a6d3e29 100644 --- a/tests/model.rs +++ b/tests/model.rs @@ -3,7 +3,8 @@ pub mod fixture; use fixture::{NAMESPACE_LOOKUP_READ_ONLY, pod_job_style, pod_result_style, pod_style}; use indoc::indoc; -use orcapod::{core::model::to_yaml, uniffi::error::Result}; +use orcapod::core::model::to_yaml; +use orcapod::uniffi::error::Result; #[test] fn hash_pod() -> Result<()> { diff --git a/tests/orchestrator.rs b/tests/orchestrator.rs index 86e32128..f9e31989 100644 --- a/tests/orchestrator.rs +++ b/tests/orchestrator.rs @@ -8,7 +8,7 @@ use fixture::{ use futures_util::future::join_all; use orcapod::uniffi::{ error::{OrcaError, Result}, - model::packet::{PodResultStatus, URI}, + model::{packet::URI, pod::PodResultStatus}, orchestrator::{ImageKind, Orchestrator as _, PodRun, Status, docker::LocalDockerOrchestrator}, }; use std::{collections::HashMap, path::PathBuf}; diff --git a/tests/pipeline.rs b/tests/pipeline.rs index 1e68dab5..a0783325 100644 --- a/tests/pipeline.rs +++ b/tests/pipeline.rs @@ -13,7 +13,7 @@ use orcapod::uniffi::{ error::Result, model::{ packet::{Blob, BlobKind, PathInfo, PathSet, URI}, - pipeline::{Kernel, Pipeline, PipelineJob, SpecURI}, + pipeline::{NodeURI, Pipeline, PipelineJob}, }, }; use std::collections::HashMap; @@ -30,29 +30,28 @@ fn input_packet_checksum() -> Result<()> { "}, HashMap::from([( "A".into(), - Kernel::Pod { - r#ref: pod_custom( - "alpine:3.14", - "echo", - HashMap::from([( - "node_key_1".into(), - PathInfo { - path: "/tmp/input/subject.jpeg".into(), - match_pattern: r".*\.jpeg".into(), - }, - )]), - )? - .into(), - }, + pod_custom( + "alpine:3.14", + "echo", + HashMap::from([( + "node_key_1".into(), + PathInfo { + path: "/tmp/input/subject.jpeg".into(), + match_pattern: r".*\.jpeg".into(), + }, + )]), + )? + .into(), )]), - &HashMap::from([( + HashMap::from([( "pipeline_key_1".into(), - vec![SpecURI { + vec![NodeURI { node_name: "A".into(), key: "node_key_1".into(), }], )]), - &HashMap::new(), + HashMap::new(), + None, )?; let pipeline_job = PipelineJob::new( @@ -68,10 +67,11 @@ fn input_packet_checksum() -> Result<()> { checksum: String::new(), })], )]), - &URI { + URI { namespace: "default".into(), path: "output/pipeline".into(), }, + None, &NAMESPACE_LOOKUP_READ_ONLY, )?; diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index a6e6f19c..99535a6e 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -13,8 +13,15 @@ use orcapod::uniffi::{error::Result, pipeline_runner::runner::DockerPipelineRunn use crate::fixture::TestDirs; use fixture::pipeline_job; -#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn basic_run() -> Result<()> { + // Create the test_dir and get the namespace lookup + let test_dirs = TestDirs::new(&HashMap::from([( + "default".to_owned(), + Some("./tests/extra/data/"), + )]))?; + let namespace_lookup = test_dirs.namespace_lookup(); + // create a zenoh session to print out all communication message let session = zenoh::open(zenoh::Config::default()) .await @@ -32,22 +39,16 @@ async fn basic_run() -> Result<()> { println!( "Received message: {}: {:?}", sample.key_expr().as_str(), - sample.payload(); + sample.payload() ); } }); - let pipeline_job = pipeline_job()?; + let pipeline_job = pipeline_job(&namespace_lookup)?; // Create the runner let mut runner = DockerPipelineRunner::new("test".to_owned())?; - let test_dirs = TestDirs::new(&HashMap::from([( - "default".to_owned(), - Some("./tests/extra/data/"), - )]))?; - let namespace_lookup = test_dirs.namespace_lookup(); - let pipeline_run = runner .start(pipeline_job, "default", &namespace_lookup) .await?; @@ -65,17 +66,22 @@ async fn basic_run() -> Result<()> { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn stop() -> Result<()> { - let pipeline_job = pipeline_job()?; - - // Create the runner - let mut runner = DockerPipelineRunner::new("test".to_owned())?; - + // Create the test_dir and get the namespace lookup let test_dirs = TestDirs::new(&HashMap::from([( "default".to_owned(), - Some("./tests/extra/data/"), + Some( + "./tests/extra + /data/", + ), )]))?; + let namespace_lookup = test_dirs.namespace_lookup(); + let pipeline_job = pipeline_job(&namespace_lookup)?; + + // Create the runner + let mut runner = DockerPipelineRunner::new("test".to_owned())?; + let pipeline_run = runner .start(pipeline_job, "default", &namespace_lookup) .await?; From 0fca0941a7543b4999d2c9ebc0fa1d5a428c088d Mon Sep 17 00:00:00 2001 From: Synicix Date: Fri, 1 Aug 2025 10:13:26 +0000 Subject: [PATCH 28/29] Add pipeline util func to handle new pipeline input_spec behavior (Runner shas no support yet) --- .vscode/settings.json | 2 +- src/core/orchestrator/agent.rs | 10 -- src/uniffi/model/packet.rs | 14 ++- src/uniffi/model/pipeline.rs | 79 ++++++++++--- src/uniffi/pipeline_runner/runner.rs | 147 ++++++++++++++++--------- tests/extra/data/input1.txt | 1 - tests/extra/data/input2.txt | 1 - tests/extra/data/input_txt/Where.txt | 1 + tests/extra/data/input_txt/black.txt | 1 + tests/extra/data/input_txt/cat.txt | 1 + tests/extra/data/input_txt/hiding.txt | 1 + tests/extra/data/input_txt/is_the.txt | 1 + tests/extra/data/input_txt/playing.txt | 1 + tests/extra/data/input_txt/tabby.txt | 1 + tests/fixture/mod.rs | 143 ++++++++++++++---------- tests/pipeline.rs | 90 ++++++++++++++- tests/pipeline_runner.rs | 13 ++- 17 files changed, 363 insertions(+), 144 deletions(-) delete mode 100644 tests/extra/data/input1.txt delete mode 100644 tests/extra/data/input2.txt create mode 100644 tests/extra/data/input_txt/Where.txt create mode 100644 tests/extra/data/input_txt/black.txt create mode 100644 tests/extra/data/input_txt/cat.txt create mode 100644 tests/extra/data/input_txt/hiding.txt create mode 100644 tests/extra/data/input_txt/is_the.txt create mode 100644 tests/extra/data/input_txt/playing.txt create mode 100644 tests/extra/data/input_txt/tabby.txt diff --git a/.vscode/settings.json b/.vscode/settings.json index 48efb42c..f95b3e1a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -21,4 +21,4 @@ "python.terminal.activateEnvironment": false, "notebook.formatOnSave.enabled": true, "notebook.output.scrolling": true -} +} \ No newline at end of file diff --git a/src/core/orchestrator/agent.rs b/src/core/orchestrator/agent.rs index 1566484b..1e518d18 100644 --- a/src/core/orchestrator/agent.rs +++ b/src/core/orchestrator/agent.rs @@ -154,16 +154,6 @@ where .await .context(selector::AgentCommunicationFailure {})?; while let Ok(sample) = subscriber.recv_async().await { - println!( - "Received message on key expression: {}", - sample.key_expr().as_str(), - ); - - println!( - "Received payload: {:?}", - RE_PODJOB_ACTION.captures(sample.key_expr().as_str()) - ); - if let (Ok(input), Some(metadata)) = ( serde_json::from_slice::(&sample.payload().to_bytes()), RE_PODJOB_ACTION.captures(sample.key_expr().as_str()), diff --git a/src/uniffi/model/packet.rs b/src/uniffi/model/packet.rs index 79e554e6..fddac353 100644 --- a/src/uniffi/model/packet.rs +++ b/src/uniffi/model/packet.rs @@ -12,6 +12,18 @@ pub struct PathInfo { pub match_pattern: String, } +#[uniffi::export] +impl PathInfo { + #[uniffi::constructor] + /// Create a new `PathInfo` with the given path and match pattern. + pub const fn new(path: PathBuf, match_pattern: String) -> Self { + Self { + path, + match_pattern, + } + } +} + /// File or directory options for BLOBs. #[derive(uniffi::Enum, Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] pub enum BlobKind { @@ -53,8 +65,8 @@ pub struct Blob { #[uniffi::export] impl Blob { - #[uniffi::constructor] /// Create a new BLOB with the given kind, location, and checksum. + #[uniffi::constructor] pub const fn new(kind: BlobKind, location: URI) -> Self { Self { kind, diff --git a/src/uniffi/model/pipeline.rs b/src/uniffi/model/pipeline.rs index 72e9ab72..5dca259d 100644 --- a/src/uniffi/model/pipeline.rs +++ b/src/uniffi/model/pipeline.rs @@ -6,7 +6,7 @@ use crate::{ validation::validate_packet, }, uniffi::{ - error::Result, + error::{Kind, OrcaError, Result}, model::{ Annotation, packet::{PathSet, URI}, @@ -19,7 +19,7 @@ use getset::CloneGetters; use itertools::Itertools as _; use petgraph::graph::DiGraph; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, path::PathBuf, sync::Arc}; +use std::{backtrace::Backtrace, collections::HashMap, path::PathBuf, sync::Arc}; use uniffi; /// Computational dependencies as a [DAG](https://en.wikipedia.org/wiki/Directed_acyclic_graph). @@ -129,22 +129,64 @@ impl PipelineJob { } impl PipelineJob { - pub(crate) fn get_input_packets(&self) -> impl Iterator> { - let (keys, values) = self - .input_packet - .iter() - .map(|(key, value)| (key.clone(), value)) - .collect::<(Vec<_>, Vec<_>)>(); + /// Helpful function to get the input packet for input nodes of the pipeline based on the `pipeline_job` an`pipeline_spec`ec + /// # Errors + /// Will return `Err` if there is an issue getting the input packet per node. + /// # Returns + /// A `HashMap` where the key is the node name and the value is a vector of `HashMap` representing the input packets for that node. + pub fn get_input_packet_per_node( + &self, + ) -> Result>>> { + // For each node in the input specification, we will iterate over its mapping and + let mut node_input_spec = HashMap::new(); + for (input_key, node_uris) in &self.pipeline.input_spec { + for node_uri in node_uris { + let input_path_sets = self.input_packet.get(input_key).ok_or(OrcaError { + kind: Kind::KeyMissing { + key: input_key.clone(), + backtrace: Some(Backtrace::capture()), + }, + })?; + // There shouldn't be a duplicate key in the input packet + let node_input_path_sets_ref = node_input_spec + .entry(&node_uri.node_name) + .or_insert_with(HashMap::new); + + // Check if the node_uri.key already exists, if it does this is an error as there can't be two input_packet that map to the same key + if node_input_path_sets_ref.contains_key(&node_uri.key) { + todo!() + } else { + // Insert all the input_path_sets that map to this specific key for the node + node_input_path_sets_ref.insert(&node_uri.key, input_path_sets); + } + } + } - values + // For each node, compute the cartesian product of the path_sets for each unique combination of keys + let node_input_packets = node_input_spec .into_iter() - .multi_cartesian_product() - .map(move |combo| { - keys.clone() + .map(|(node_id, input_node_keys)| { + // We need to pull them out at the same time to ensure the key order is preserve to match the cartesian product + let (keys, values): (Vec<_>, Vec<_>) = input_node_keys.into_iter().unzip(); + + // Covert each combo into a packet + let packets = values .into_iter() - .zip(combo.into_iter().cloned()) - .collect::>() + .multi_cartesian_product() + .map(|combo| { + keys.iter() + .copied() + .zip(combo) + .map(|(key, pathset)| (key.to_owned(), pathset.to_owned())) + .collect::>() + }) + .collect::>>(); + + (node_id.to_owned(), packets) }) + .collect::>(); + + Ok(node_input_packets) } } @@ -225,3 +267,12 @@ pub struct NodeURI { /// Specification key. pub key: String, } + +#[uniffi::export] +impl NodeURI { + /// Create a new `NodeURI` instance. + #[uniffi::constructor] + pub const fn new(node_name: String, key: String) -> Self { + Self { node_name, key } + } +} diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 8e4b836b..481aabdf 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -61,6 +61,8 @@ struct PipelineRun { outputs: Arc>>>, // String is the node key, while hash orchestrator_agent: Arc, // This is placed in pipeline due to the current design requiring a namespace to operate on orchestrator_agent_task: JoinSet>, // JoinSet of tasks for the orchestrator agent + failure_logs: Arc>>, // Logs of processing failures + failure_logging_task: JoinSet>, // JoinSet of tasks for logging failures } impl PartialEq for PipelineRun { @@ -140,11 +142,14 @@ impl DockerPipelineRunner { node_tasks: JoinSet::new(), orchestrator_agent: orchestrator_agent.into(), orchestrator_agent_task: JoinSet::new(), + failure_logs: Arc::new(RwLock::new(Vec::new())), + failure_logging_task: JoinSet::new(), }; // Get the preexisting zenoh session from agent let session = Arc::clone(&pipeline_run.orchestrator_agent.client.session); + // Spawn task for each of the processing node let orchestrator_agent_clone = Arc::clone(&pipeline_run.orchestrator_agent); let namespace_lookup_clone = namespace_lookup.clone(); // Start the orchestrator agent service @@ -154,17 +159,27 @@ impl DockerPipelineRunner { .await }); + // Create failure logging task + pipeline_run + .failure_logging_task + .spawn(Self::failure_capture_task( + Arc::clone(&session), + Arc::clone(&pipeline_run.failure_logs), + )); + + // Create the processor task for each node // The id for the pipeline_run is the pipeline_job hash let pipeline_run_id = pipeline_run.pipeline_job.hash.clone(); let graph = &pipeline_run.pipeline_job.pipeline.graph; + // Create the subscriber that listen for ready messages let subscriber = session .declare_subscriber(self.get_base_key_exp(&pipeline_run_id) + "/*/status/ready") .await .context(selector::AgentCommunicationFailure {})?; - // For each node, we will create call create_node_processing_task + // Iterate through each node in the graph and spawn a task for each for node_idx in graph.node_indices() { let node = &graph[node_idx]; @@ -182,24 +197,35 @@ impl DockerPipelineRunner { )); } - // Spawn the task that captures the outputs from the output_nodes - // For now the output nodes are hardcoded to be the leaf nodes of the pipeline - - // for node in pipeline_run.pipeline_job.pipeline.get_leaf_nodes() { - // pipeline_run - // .node_tasks - // .spawn(Self::create_output_capture_task_for_node( - // node.id.clone(), - // Arc::clone(&pipeline_run.outputs), - // Arc::clone(&session), - // format!( - // "{}/{}/outputs/{}", - // self.get_base_key_exp(&pipeline_run_id), - // node.id, - // SUCCESS_KEY_EXP, - // ), - // )); - // } + // Spawn the task that captures the outputs based on the output_spec + let mut node_output_spec = HashMap::new(); + // Group the output spec by node + for (output_key, node_uri) in &pipeline_run.pipeline_job.pipeline.output_spec { + node_output_spec + .entry(node_uri.node_name.clone()) + .or_insert_with(HashMap::new) + .insert(output_key.clone(), node_uri.key.clone()); + } + + for (node_id, key_mapping) in node_output_spec { + // Create the key expression to subscribe to + let key_exp_to_sub = format!( + "{}/{}/outputs/{}", + self.get_base_key_exp(&pipeline_run_id), + node_id, + SUCCESS_KEY_EXP, + ); + + // Spawn the task that captures the outputs + pipeline_run + .node_tasks + .spawn(Self::create_output_capture_task_for_node( + key_mapping, + Arc::clone(&pipeline_run.outputs), + Arc::clone(&session), + key_exp_to_sub, + )); + } // Wait for all nodes to be ready before sending inputs let num_of_nodes = graph.node_count(); @@ -220,19 +246,19 @@ impl DockerPipelineRunner { self.get_base_key_exp(&pipeline_run_id), INPUT_KEY_EXP, ); - for packet in pipeline_run.pipeline_job.get_input_packets() { - // Send the packet to the input node key_exp - session - .put( - &input_node_key_exp, - serde_json::to_string(&NodeOutput::Packet( - "input_node".to_owned(), - packet.clone(), - ))?, - ) - .await - .context(selector::AgentCommunicationFailure {})?; - } + // for packet in pipeline_run.pipeline_job.get_input_packet_per_node() { + // // Send the packet to the input node key_exp + // session + // .put( + // &input_node_key_exp, + // serde_json::to_string(&NodeOutput::Packet( + // "input_node".to_owned(), + // packet.clone(), + // ))?, + // ) + // .await + // .context(selector::AgentCommunicationFailure {})?; + // } // Send the complete processing message for the input node session @@ -323,13 +349,15 @@ impl DockerPipelineRunner { } /// This will capture the outputs of the given nodes and store it in the `outputs` map - #[expect(clippy::type_complexity, reason = "Needed for async")] async fn create_output_capture_task_for_node( - node_id: String, - outputs: Arc>>>>, + // + key_mapping: HashMap, + outputs: Arc>>>, session: Arc, key_exp_to_sub: String, ) -> Result<()> { + // Determine which keys we are interested in for the given node_id + // Create a zenoh session let subscriber = session .declare_subscriber(key_exp_to_sub) @@ -341,13 +369,16 @@ impl DockerPipelineRunner { let msg: NodeOutput = serde_json::from_slice(&payload.payload().to_bytes())?; match msg { - NodeOutput::Packet(_, hash_map) => { + NodeOutput::Packet(_, packet) => { + // Figure out which keys // Store the output packet in the outputs map let mut outputs_lock = outputs.write().await; - outputs_lock - .entry(node_id.clone()) - .or_default() - .push(hash_map); + for (output_key, node_key) in &key_mapping { + outputs_lock + .entry(output_key.to_owned()) + .or_default() + .push(get(&packet, node_key.as_str())?.clone()); + } } NodeOutput::ProcessingCompleted(_) => { // Processing is completed, thus we can exit this task @@ -358,6 +389,28 @@ impl DockerPipelineRunner { Ok(()) } + async fn failure_capture_task( + session: Arc, + failure_logs: Arc>>, + ) -> Result<()> { + let sub = session + .declare_subscriber(format!("**/outputs/{FAILURE_KEY_EXP}")) + .await + .context(selector::AgentCommunicationFailure {})?; + + // Listen to any failure messages and write it the logs + while let Ok(payload) = sub.recv_async().await { + // Extract the message from the payload + let msg: ProcessingFailure = serde_json::from_slice(&payload.payload().to_bytes())?; + // Store the failure message in the logs + failure_logs.write().await.push(msg.clone()); + // Print the failure message to stderr + eprintln!("Processing failure for node {}: {}", msg.node_id, msg.error); + } + + Ok(()) + } + /// Function to start tasks associated with the node /// Steps: /// - Create the node processor based on the kernel type @@ -694,6 +747,7 @@ impl PodProcessor { None, namespace_lookup, )?; + // Print out the packet // Create listener for pod_job let target_key_exp = format!("group/{}/*/pod_job/{}/**", client.group, pod_job.hash); @@ -735,9 +789,8 @@ impl PodProcessor { } // Get the pod result from the listener task - println!("Trying to get pod job result..."); let temp = pod_job_listener_task.await?; - println!("Waiting for pod job to complete... {temp:?}"); + let pod_result = temp?; // Get the output packet for the pod result @@ -820,7 +873,7 @@ impl NodeProcessor for PodProcessor { try_to_forward_err_msg( session, err, - &format!("{base_output_key_exp_owned}/{FAILURE_KEY_EXP}"), + &base_output_key_exp_owned, &node_id_owned, ) .await; @@ -901,13 +954,7 @@ impl NodeProcessor for MapperProcessor { .await; if let Err(err) = result { - try_to_forward_err_msg( - session, - err, - &format!("{output_key_exp_clone}/{FAILURE_KEY_EXP}"), - &node_id_clone, - ) - .await; + try_to_forward_err_msg(session, err, &output_key_exp_clone, &node_id_clone).await; } }); Ok(()) diff --git a/tests/extra/data/input1.txt b/tests/extra/data/input1.txt deleted file mode 100644 index 6ccbe584..00000000 --- a/tests/extra/data/input1.txt +++ /dev/null @@ -1 +0,0 @@ -input1_data diff --git a/tests/extra/data/input2.txt b/tests/extra/data/input2.txt deleted file mode 100644 index 901c1e11..00000000 --- a/tests/extra/data/input2.txt +++ /dev/null @@ -1 +0,0 @@ -input2_data diff --git a/tests/extra/data/input_txt/Where.txt b/tests/extra/data/input_txt/Where.txt new file mode 100644 index 00000000..2891a132 --- /dev/null +++ b/tests/extra/data/input_txt/Where.txt @@ -0,0 +1 @@ +Where diff --git a/tests/extra/data/input_txt/black.txt b/tests/extra/data/input_txt/black.txt new file mode 100644 index 00000000..7e66a17d --- /dev/null +++ b/tests/extra/data/input_txt/black.txt @@ -0,0 +1 @@ +black diff --git a/tests/extra/data/input_txt/cat.txt b/tests/extra/data/input_txt/cat.txt new file mode 100644 index 00000000..ef07ddcd --- /dev/null +++ b/tests/extra/data/input_txt/cat.txt @@ -0,0 +1 @@ +cat diff --git a/tests/extra/data/input_txt/hiding.txt b/tests/extra/data/input_txt/hiding.txt new file mode 100644 index 00000000..56e64f05 --- /dev/null +++ b/tests/extra/data/input_txt/hiding.txt @@ -0,0 +1 @@ +hiding diff --git a/tests/extra/data/input_txt/is_the.txt b/tests/extra/data/input_txt/is_the.txt new file mode 100644 index 00000000..863d01a3 --- /dev/null +++ b/tests/extra/data/input_txt/is_the.txt @@ -0,0 +1 @@ +is the diff --git a/tests/extra/data/input_txt/playing.txt b/tests/extra/data/input_txt/playing.txt new file mode 100644 index 00000000..0395b790 --- /dev/null +++ b/tests/extra/data/input_txt/playing.txt @@ -0,0 +1 @@ +playing diff --git a/tests/extra/data/input_txt/tabby.txt b/tests/extra/data/input_txt/tabby.txt new file mode 100644 index 00000000..3de6015d --- /dev/null +++ b/tests/extra/data/input_txt/tabby.txt @@ -0,0 +1 @@ +tabby diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index 9e291c03..e42454d8 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -18,7 +18,7 @@ use orcapod::uniffi::{ }, store::{ModelID, ModelInfo, Store}, }; -use std::borrow::ToOwned; +use std::borrow::ToOwned as _; use std::{ collections::HashMap, fs::{self, File}, @@ -263,7 +263,7 @@ pub fn container_image_style(binary_location: impl AsRef) -> Result Result { +pub fn combine_txt_pod(pod_name: &str) -> Result { Pod::new( Some(Annotation { name: pod_name.to_owned(), @@ -272,22 +272,16 @@ pub fn append_name_pod(pod_name: &str) -> Result { }), "alpine:3.14".to_owned(), str_to_vec(&format!( - "cat input/input1.txt input/input2.txt > /output/output.txt && echo \"Processed by {pod_name}\" >> /output/output.txt" + "cat input/input_1.txt input/input_2.txt > /output/output.txt && echo \"Processed by {pod_name}\" >> /output/output.txt" )), HashMap::from([ ( "input1".to_owned(), - PathInfo { - path: PathBuf::from("/input/input1.txt"), - match_pattern: r".*\.txt".to_owned(), - }, + PathInfo::new("/input/input_1.txt".into(), r".*\.txt".into()), ), ( "input2".into(), - PathInfo { - path: PathBuf::from("/input/input2.txt"), - match_pattern: r".*\.txt".to_owned(), - }, + PathInfo::new("/input/input2.txt".into(), r".*\.txt".into()), ), ]), PathBuf::from("/output"), @@ -299,8 +293,8 @@ pub fn append_name_pod(pod_name: &str) -> Result { }, )]), "N/A".to_owned(), - 0.25, // 250 millicores as frac cores - 1_u64 << 30, // 1GiB in bytes + 0.25, // 250 millicores as frac cores + 128_u64 << 20, // 128MB in bytes None, ) } @@ -314,30 +308,32 @@ pub fn pipeline() -> Result { // Insert the pod into the kernel map for pod_name in ["A", "B", "C", "D"] { - kernel_map.insert(pod_name.into(), append_name_pod(pod_name)?.into()); + kernel_map.insert(pod_name.into(), combine_txt_pod(pod_name)?.into()); } - // Create the file mapper that will be used to map the output of one pod to the input of another - let mapper_kernel: Kernel = - Mapper::new(HashMap::from([("output".to_owned(), "input".to_owned())]))?.into(); - // Add the mappers - kernel_map.insert("pod_a_mapper".into(), mapper_kernel.clone()); - kernel_map.insert("pod_b_mapper".into(), mapper_kernel); - - // Create the file mapper for d which needs to be different + // Create a mapper for A, B, and C + kernel_map.insert( + "pod_a_mapper".into(), + Mapper::new(HashMap::from([("output".to_owned(), "input_1".to_owned())]))?.into(), + ); + kernel_map.insert( + "pod_b_mapper".into(), + Mapper::new(HashMap::from([("output".to_owned(), "input_2".to_owned())]))?.into(), + ); kernel_map.insert( - "pod_d_mapper".into(), - Mapper::new(HashMap::from([("output".to_owned(), "input2".to_owned())]))?.into(), + "pod_c_mapper".into(), + Mapper::new(HashMap::from([("output".to_owned(), "input_1".to_owned())]))?.into(), ); // Add the joiner node - kernel_map.insert("pod_b_joiner".into(), Kernel::Joiner); + kernel_map.insert("pod_c_joiner".into(), Kernel::Joiner); // Write all the edges in DOT format let dot = " digraph { - A -> pod_a_mapper -> pod_b_joiner -> B -> pod_b_mapper -> C; - D -> pod_d_mapper -> pod_b_joiner; + A -> pod_a_mapper -> pod_c_joiner; + B -> pod_b_mapper -> pod_c_joiner; + pod_c_joiner -> C -> D; } "; @@ -346,42 +342,36 @@ pub fn pipeline() -> Result { kernel_map, HashMap::from([ ( - "input".into(), - vec![ - NodeURI { - node_name: "A".into(), - key: "input".into(), - }, - NodeURI { - node_name: "D".into(), - key: "input".into(), - }, - ], + "where".into(), + vec![NodeURI::new("A".into(), "input_1".into())], ), ( - "input2".into(), - vec![ - NodeURI { - node_name: "A".into(), - key: "input2".into(), - }, - NodeURI { - node_name: "D".into(), - key: "input2".into(), - }, - ], + "is_the".into(), + vec![NodeURI::new("A".into(), "input_2".into())], + ), + ( + "cat_color".into(), + vec![NodeURI::new("B".into(), "input_1".into())], + ), + ( + "cat".into(), + vec![NodeURI::new("B".into(), "input_2".into())], + ), + ( + "action".into(), + vec![NodeURI::new("D".into(), "input_2".into())], ), ]), HashMap::from([( "output".to_owned(), NodeURI { - node_name: "C".into(), + node_name: "D".into(), key: "output".into(), }, )]), Some(Annotation { - name: "Example Pipeline".to_owned(), - description: "This is an example pipeline. of A -> B -> C".to_owned(), + name: "Sentence making pipeline".to_owned(), + description: "Parse txt files with txt and to form sentences".to_owned(), version: "1.0.0".to_owned(), }), ) @@ -394,27 +384,62 @@ pub fn pipeline_job(namespace_lookup: &HashMap) -> Result Result<()> { @@ -87,3 +90,88 @@ fn input_packet_checksum() -> Result<()> { ); Ok(()) } + +#[test] +fn creation() -> Result<()> { + // This test checks if the pipeline can be created successfully. + let pipeline = pipeline()?; + + assert_eq!( + pipeline.annotation, + Some(Annotation { + name: "Sentence making pipeline".to_owned(), + description: "Parse txt files with txt and to form sentences".to_owned(), + version: "1.0.0".to_owned(), + }), + "Pipeline annotation does not match expected values." + ); + + assert_eq!( + pipeline.graph.node_count(), + 6, + "Pipeline graph should have 6 nodes." + ); + assert_eq!( + pipeline.graph.edge_count(), + 5, + "Pipeline graph should have 5 edges." + ); + + Ok(()) +} + +/// Verify that the utility function that computes the input packets to feed into each input node works as expected. +#[test] +fn get_input_packet_per_node() -> Result<()> { + let pipeline_job = pipeline_job(&NAMESPACE_LOOKUP_READ_ONLY)?; + + let input_packets_per_node = pipeline_job.get_input_packet_per_node()?; + + // Given the pipeline definition used in pipeline_job, we expect the following input packets per node: + // The full sentence to be constructed is "Where is the black/tabby cat hiding/playing" + // Node A: 1 packets, with keys input "input_1" and "input_2" Due to only Where.txt and is_the.txt being route to this node + // Node B: 2 packets, with keys "input_1" and "input_2" Due to black.txt / tabby.txt and cat.txt being routed to this node + // Node C should not receive any input, as it is an internal node + // Node D: 2 packets, with keys "input_2" only, due to input_1 being received by the joiner node, and input_2 being hiding.txt + + // Check A + let input_packet_node_a = input_packets_per_node.get("A").unwrap(); + assert_num_of_packets(input_packet_node_a.len(), 1); + assert_contains_keys(&input_packet_node_a[0], &["input_1", "input_2"]); + + // Check B + let input_packet_node_b = input_packets_per_node.get("B").unwrap(); + assert_num_of_packets(input_packet_node_b.len(), 2); + assert_contains_keys(&input_packet_node_b[0], &["input_1", "input_2"]); + assert_contains_keys(&input_packet_node_b[1], &["input_1", "input_2"]); + + // Check C + assert!( + !input_packets_per_node.contains_key("C"), + "Node C should not have any input packets.", + ); + + // Check D + let input_packet_node_d = input_packets_per_node.get("D").unwrap(); + assert_num_of_packets(input_packet_node_d.len(), 2); + assert_contains_keys(&input_packet_node_d[0], &["input_2"]); + assert_contains_keys(&input_packet_node_d[1], &["input_2"]); + + Ok(()) +} + +fn assert_num_of_packets(num_of_packets: usize, expected: usize) { + assert!( + num_of_packets == expected, + "Expected {expected} packets, but got {num_of_packets}." + ); +} + +fn assert_contains_keys(input_packet: &HashMap, keys: &[&str]) { + for key in keys { + assert!( + input_packet.contains_key(*key), + "Input packet should contain key '{key}'." + ); + } +} diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 99535a6e..5d70e78c 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -1,4 +1,9 @@ -#![expect(missing_docs, clippy::panic_in_result_fn, reason = "OK in tests.")] +#![expect( + missing_docs, + clippy::panic_in_result_fn, + clippy::expect_used, + reason = "OK in tests." +)] // If 'fixture' is a local module, ensure there is a 'mod fixture;' statement or a 'fixture.rs' file in the same directory or in 'tests/'. // If 'fixture' is an external crate, add it to Cargo.toml and import as shown below. @@ -36,11 +41,7 @@ async fn basic_run() -> Result<()> { while let Ok(sample) = sub.recv_async().await { // Print the key expression and payload of each message - println!( - "Received message: {}: {:?}", - sample.key_expr().as_str(), - sample.payload() - ); + println!("Received message: {}:", sample.key_expr().as_str(),); } }); From 0c59a9dd1896df76c531134855c9899d8a980d8f Mon Sep 17 00:00:00 2001 From: Synicix Date: Sat, 2 Aug 2025 03:48:21 +0000 Subject: [PATCH 29/29] Save progress --- src/core/model/pipeline.rs | 15 +++++ src/uniffi/orchestrator/agent.rs | 2 +- src/uniffi/pipeline_runner/runner.rs | 99 +++++++++++++++++----------- tests/fixture/mod.rs | 13 ++-- tests/pipeline.rs | 21 ++++-- tests/pipeline_runner.rs | 7 +- 6 files changed, 104 insertions(+), 53 deletions(-) diff --git a/src/core/model/pipeline.rs b/src/core/model/pipeline.rs index 572b8a75..1c5af674 100644 --- a/src/core/model/pipeline.rs +++ b/src/core/model/pipeline.rs @@ -1,3 +1,5 @@ +use std::collections::HashSet; + use crate::uniffi::model::pipeline::{Kernel, Pipeline}; use petgraph::Direction::Incoming; use serde::{Deserialize, Serialize}; @@ -25,4 +27,17 @@ impl Pipeline { .map(move |parent_idx| &self.graph[parent_idx]) }) } + + /// Return a vec of `node_names` that takes in inputs based on the `input_spec`ec + pub(crate) fn get_input_nodes(&self) -> HashSet<&String> { + let mut input_nodes = HashSet::new(); + + self.input_spec.iter().for_each(|(_, node_uris)| { + for node_uri in node_uris { + input_nodes.insert(&node_uri.node_name); + } + }); + + input_nodes + } } diff --git a/src/uniffi/orchestrator/agent.rs b/src/uniffi/orchestrator/agent.rs index 4a559f37..0bfd7598 100644 --- a/src/uniffi/orchestrator/agent.rs +++ b/src/uniffi/orchestrator/agent.rs @@ -154,11 +154,11 @@ impl Agent { namespace_lookup.clone(), |pod_job: &PodJob| EventPayload::Request(pod_job.clone()), async |agent, inner_namespace_lookup, _, pod_job| { - println!("Processing pod job: {}", pod_job.hash); let pod_run = agent .orchestrator .start(&inner_namespace_lookup, &pod_job) .await?; + let pod_result = agent.orchestrator.get_result(&pod_run).await?; //agent.orchestrator.delete(&pod_run).await?; Ok(pod_result) diff --git a/src/uniffi/pipeline_runner/runner.rs b/src/uniffi/pipeline_runner/runner.rs index 481aabdf..3026f8a1 100644 --- a/src/uniffi/pipeline_runner/runner.rs +++ b/src/uniffi/pipeline_runner/runner.rs @@ -179,6 +179,9 @@ impl DockerPipelineRunner { .await .context(selector::AgentCommunicationFailure {})?; + // Get the set of input_nodes + let input_nodes = pipeline_run.pipeline_job.pipeline.get_input_nodes(); + // Iterate through each node in the graph and spawn a task for each for node_idx in graph.node_indices() { let node = &graph[node_idx]; @@ -189,6 +192,7 @@ impl DockerPipelineRunner { .spawn(Self::spawn_node_processing_task( node.clone(), Arc::clone(&pipeline_run.pipeline_job.pipeline), + input_nodes.contains(&node.name), self.get_base_key_exp(&pipeline_run_id), namespace.to_owned(), namespace_lookup.clone(), @@ -241,33 +245,40 @@ impl DockerPipelineRunner { } // Submit the input_packets to the correct key_exp - let input_node_key_exp = format!( + let base_input_node_key_exp = format!( "{}/{}", self.get_base_key_exp(&pipeline_run_id), INPUT_KEY_EXP, ); - // for packet in pipeline_run.pipeline_job.get_input_packet_per_node() { - // // Send the packet to the input node key_exp - // session - // .put( - // &input_node_key_exp, - // serde_json::to_string(&NodeOutput::Packet( - // "input_node".to_owned(), - // packet.clone(), - // ))?, - // ) - // .await - // .context(selector::AgentCommunicationFailure {})?; - // } - - // Send the complete processing message for the input node - session - .put( - input_node_key_exp, - serde_json::to_string(&NodeOutput::ProcessingCompleted("input_node".to_owned()))?, - ) - .await - .context(selector::AgentCommunicationFailure {})?; + + // For each node send all the packets associate with it + for (node_name, input_packets) in pipeline_run.pipeline_job.get_input_packet_per_node()? { + for packet in input_packets { + // Send the packet to the input node key_exp + let output_key_exp = format!("{base_input_node_key_exp}/{node_name}"); + session + .put( + &output_key_exp, + serde_json::to_string(&NodeOutput::Packet( + "input_node".to_owned(), + packet.clone(), + ))?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + + // All packets associate with node are sent, we can send processing complete msg now + session + .put( + &output_key_exp, + serde_json::to_string(&NodeOutput::ProcessingCompleted( + "input_node".to_owned(), + ))?, + ) + .await + .context(selector::AgentCommunicationFailure {})?; + } + } // Insert into the list of pipeline runs self.pipeline_runs @@ -401,11 +412,16 @@ impl DockerPipelineRunner { // Listen to any failure messages and write it the logs while let Ok(payload) = sub.recv_async().await { // Extract the message from the payload - let msg: ProcessingFailure = serde_json::from_slice(&payload.payload().to_bytes())?; + let process_failure: ProcessingFailure = + serde_json::from_slice(&payload.payload().to_bytes())?; // Store the failure message in the logs - failure_logs.write().await.push(msg.clone()); - // Print the failure message to stderr - eprintln!("Processing failure for node {}: {}", msg.node_id, msg.error); + failure_logs.write().await.push(process_failure.clone()); + if let Some(first_line) = process_failure.error.lines().next() { + println!( + "Node {} processing failed with error: {}", + process_failure.node_id, first_line + ); + } } Ok(()) @@ -425,6 +441,7 @@ impl DockerPipelineRunner { async fn spawn_node_processing_task( node: PipelineNode, pipeline: Arc, + is_input_node: bool, base_key_exp: String, namespace: String, namespace_lookup: HashMap, @@ -438,12 +455,17 @@ impl DockerPipelineRunner { Kernel::Mapper { mapper } => Box::new(MapperProcessor::new(Arc::clone(mapper))), Kernel::Joiner => { // Need to get the parent node id for this joiner node - Box::new(JoinerProcessor::new( - pipeline - .get_node_parents(&node) - .map(|parent_node| parent_node.name.clone()) - .collect::>(), - )) + let mut parent_nodes = pipeline + .get_node_parents(&node) + .map(|parent_node| parent_node.name.clone()) + .collect::>(); + + // Check if it this node takes input from input_nodes, if so we need ot add it to parent_node + if is_input_node { + parent_nodes.push("input_node".to_owned()); + } + + Box::new(JoinerProcessor::new(parent_nodes)) } })); @@ -461,9 +483,10 @@ impl DockerPipelineRunner { }) .collect::>(); - // If there was no parent node, then this is root node, therefore we need to subscribe to the input node - if key_exps_to_subscribe_to.is_empty() { - key_exps_to_subscribe_to.push(format!("{base_key_exp}/{INPUT_KEY_EXP}")); + // Check if node is an input_node, if so we need to add the input node key expression + if is_input_node { + key_exps_to_subscribe_to + .push(format!("{base_key_exp}/input_node/outputs/{}", node.name)); } // Create a subscriber for each of the parent nodes (Should only be 1, unless it is a joiner node) @@ -747,10 +770,9 @@ impl PodProcessor { None, namespace_lookup, )?; - // Print out the packet // Create listener for pod_job - let target_key_exp = format!("group/{}/*/pod_job/{}/**", client.group, pod_job.hash); + let target_key_exp = format!("group/{}/success/pod_job/{}/**", client.group, pod_job.hash); // Create the subscriber let pod_job_subscriber = session @@ -1036,7 +1058,6 @@ impl NodeProcessor for JoinerProcessor { // Check if we have all the other parents needed to compute the cartesian product if self.input_packet_cache.values().all(|v| !v.is_empty()) { - // Print we have all the parents // Get all the cached packets from other parents let other_parent_ids = self .input_packet_cache diff --git a/tests/fixture/mod.rs b/tests/fixture/mod.rs index e42454d8..a5b4fc28 100644 --- a/tests/fixture/mod.rs +++ b/tests/fixture/mod.rs @@ -272,15 +272,15 @@ pub fn combine_txt_pod(pod_name: &str) -> Result { }), "alpine:3.14".to_owned(), str_to_vec(&format!( - "cat input/input_1.txt input/input_2.txt > /output/output.txt && echo \"Processed by {pod_name}\" >> /output/output.txt" + "sh -c cat input/input_1.txt input/input_2.txt > /output/output.txt && echo \"Processed by {pod_name}\" >> /output/output.txt" )), HashMap::from([ ( - "input1".to_owned(), + "input_1".to_owned(), PathInfo::new("/input/input_1.txt".into(), r".*\.txt".into()), ), ( - "input2".into(), + "input_2".into(), PathInfo::new("/input/input2.txt".into(), r".*\.txt".into()), ), ]), @@ -328,12 +328,15 @@ pub fn pipeline() -> Result { // Add the joiner node kernel_map.insert("pod_c_joiner".into(), Kernel::Joiner); + // Add joiner node for D + kernel_map.insert("pod_d_joiner".into(), Kernel::Joiner); + // Write all the edges in DOT format let dot = " digraph { A -> pod_a_mapper -> pod_c_joiner; B -> pod_b_mapper -> pod_c_joiner; - pod_c_joiner -> C -> D; + pod_c_joiner -> C -> pod_d_joiner -> D; } "; @@ -359,7 +362,7 @@ pub fn pipeline() -> Result { ), ( "action".into(), - vec![NodeURI::new("D".into(), "input_2".into())], + vec![NodeURI::new("pod_d_joiner".into(), "input_2".into())], ), ]), HashMap::from([( diff --git a/tests/pipeline.rs b/tests/pipeline.rs index 6ddabb9f..53797c73 100644 --- a/tests/pipeline.rs +++ b/tests/pipeline.rs @@ -108,13 +108,13 @@ fn creation() -> Result<()> { assert_eq!( pipeline.graph.node_count(), - 6, - "Pipeline graph should have 6 nodes." + 8, + "Pipeline graph should have 8 nodes." ); assert_eq!( pipeline.graph.edge_count(), - 5, - "Pipeline graph should have 5 edges." + 7, + "Pipeline graph should have 7 edges." ); Ok(()) @@ -132,7 +132,7 @@ fn get_input_packet_per_node() -> Result<()> { // Node A: 1 packets, with keys input "input_1" and "input_2" Due to only Where.txt and is_the.txt being route to this node // Node B: 2 packets, with keys "input_1" and "input_2" Due to black.txt / tabby.txt and cat.txt being routed to this node // Node C should not receive any input, as it is an internal node - // Node D: 2 packets, with keys "input_2" only, due to input_1 being received by the joiner node, and input_2 being hiding.txt + // Node pod_d_joiner: 2 packets, with keys "input_2" only, due to input_1 being received by the joiner node, and input_2 being hiding.txt // Check A let input_packet_node_a = input_packets_per_node.get("A").unwrap(); @@ -151,12 +151,19 @@ fn get_input_packet_per_node() -> Result<()> { "Node C should not have any input packets.", ); - // Check D - let input_packet_node_d = input_packets_per_node.get("D").unwrap(); + // Check pod_d_joiner + // Node node_d_joiner: 2 packets, with keys "input_2" only, due to input_1 being received by the joiner node, and input_2 being hiding.txt + let input_packet_node_d = input_packets_per_node.get("pod_d_joiner").unwrap(); assert_num_of_packets(input_packet_node_d.len(), 2); assert_contains_keys(&input_packet_node_d[0], &["input_2"]); assert_contains_keys(&input_packet_node_d[1], &["input_2"]); + // Check D + assert!( + !input_packets_per_node.contains_key("D"), + "Node D should not have any input packets.", + ); + Ok(()) } diff --git a/tests/pipeline_runner.rs b/tests/pipeline_runner.rs index 5d70e78c..56598496 100644 --- a/tests/pipeline_runner.rs +++ b/tests/pipeline_runner.rs @@ -35,7 +35,7 @@ async fn basic_run() -> Result<()> { tokio::spawn(async move { // Subscribe to all messages in the 'test' group let sub = session - .declare_subscriber("**") + .declare_subscriber("**/failure/**") .await .expect("Failed to declare subscriber"); @@ -57,6 +57,11 @@ async fn basic_run() -> Result<()> { // Wait for the pipeline run to complete let pipeline_result = runner.get_result(&pipeline_run).await?; + println!( + "Pipeline run completed: {:?}", + pipeline_result.output_packets + ); + assert!( pipeline_result.output_packets.len() == 1, "Expected exactly one output packet."