
Commit fb0af69

chore(release): merge branch 'hotfix-v1.0.1'
Creates patch release v1.0.1
2 parents 62f82ea + 7029312


53 files changed: +1236 -504 lines changed


.github/workflows/release_artifacts.yml

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ on:
     branches:
       - master
       - 'release/**'
+      - 'hotfix-v**'
 
 jobs:
   kubectl-plugin:

.pre-commit-config.yaml

Lines changed: 0 additions & 7 deletions
@@ -53,10 +53,3 @@ repos:
         args: [ "--changes" ]
         pass_filenames: false
         language: system
-      - id: helm-develop-deploy
-        name: Helm Generator
-        description: Ensures the deploy is updated with the develop yamls
-        entry: ./scripts/generate-deploy-yamls.sh
-        args: [ "-c", "develop" ]
-        pass_filenames: false
-        language: system

chart/templates/csi-deployment.yaml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ spec:
         imagePullPolicy: {{ .Values.mayastorCP.pullPolicy }}
         args:
         - "--csi-socket=/var/lib/csi/sockets/pluginproxy/csi.sock"
-        - "--rest-endpoint=http://$(REST_SERVICE_HOST):8081"{{ if .Values.base.jaeger.enabled }}
+        - "--rest-endpoint=http://rest:8081"{{ if .Values.base.jaeger.enabled }}
         - "--jaeger={{ .Values.base.jaeger.agent.name }}:{{ .Values.base.jaeger.agent.port }}"{{ end }}
         env:
         - name: RUST_LOG

chart/templates/msp-deployment.yaml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ spec:
         image: {{ .Values.mayastorCP.registry }}mayadata/mcp-msp-operator:{{ .Values.mayastorCP.tag }}
         imagePullPolicy: {{ .Values.mayastorCP.pullPolicy }}
         args:
-        - "-e http://$(REST_SERVICE_HOST):8081"
+        - "-e http://rest:8081"
         - "--interval={{ .Values.base.cache_poll_period }}"{{ if .Values.base.jaeger.enabled }}
         - "--jaeger={{ .Values.base.jaeger.agent.name }}:{{ .Values.base.jaeger.agent.port }}"{{ end }}
         env:

common/src/opentelemetry.rs

Lines changed: 15 additions & 0 deletions
@@ -19,3 +19,18 @@ pub fn default_tracing_tags(git_commit: &str, cargo_version: &str) -> Vec<KeyVal
         KeyValue::new("crate.version", cargo_version.to_string()),
     ]
 }
+
+/// Name of the OTEL_BSP_MAX_EXPORT_BATCH_SIZE variable
+pub const OTEL_BSP_MAX_EXPORT_BATCH_SIZE_NAME: &str = "OTEL_BSP_MAX_EXPORT_BATCH_SIZE";
+/// The value of OTEL_BSP_MAX_EXPORT_BATCH_SIZE to be used with JAEGER
+pub const OTEL_BSP_MAX_EXPORT_BATCH_SIZE_JAEGER: &str = "64";
+/// Set the OTEL variables for a jaeger configuration
+pub fn set_jaeger_env() {
+    // if not set, default it to our jaeger value
+    if std::env::var(OTEL_BSP_MAX_EXPORT_BATCH_SIZE_NAME).is_err() {
+        std::env::set_var(
+            OTEL_BSP_MAX_EXPORT_BATCH_SIZE_NAME,
+            OTEL_BSP_MAX_EXPORT_BATCH_SIZE_JAEGER,
+        );
+    }
+}
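
The helper defaults OTEL_BSP_MAX_EXPORT_BATCH_SIZE to 64, presumably to keep each exported span batch small enough for the Jaeger agent, while leaving any operator-provided value untouched. A self-contained sketch of the only-default-when-unset behaviour (mirroring the function above, runnable on its own):

    const OTEL_BSP_MAX_EXPORT_BATCH_SIZE_NAME: &str = "OTEL_BSP_MAX_EXPORT_BATCH_SIZE";
    const OTEL_BSP_MAX_EXPORT_BATCH_SIZE_JAEGER: &str = "64";

    fn set_jaeger_env() {
        // std::env::var returns Err when the variable is unset (or not valid
        // unicode); only then do we fall back to the Jaeger-friendly value.
        if std::env::var(OTEL_BSP_MAX_EXPORT_BATCH_SIZE_NAME).is_err() {
            std::env::set_var(
                OTEL_BSP_MAX_EXPORT_BATCH_SIZE_NAME,
                OTEL_BSP_MAX_EXPORT_BATCH_SIZE_JAEGER,
            );
        }
    }

    fn main() {
        // Call before initialising the tracer so the batch span processor
        // reads the value at start-up.
        set_jaeger_env();
        assert_eq!(
            std::env::var(OTEL_BSP_MAX_EXPORT_BATCH_SIZE_NAME).unwrap(),
            "64"
        );
    }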

common/src/store/etcd_keep_alive.rs

Lines changed: 1 addition & 1 deletion
@@ -507,7 +507,7 @@ impl LeaseLockKeeperClocking<Locked> for EtcdSingletonLock {
 
 #[async_trait::async_trait]
 impl LeaseLockKeeperClocking<KeepAlive> for EtcdSingletonLock {
-    #[tracing::instrument(skip(self, state), err)]
+    #[tracing::instrument(level = "trace", skip(self, state), err)]
     async fn clock(&mut self, mut state: KeepAlive) -> LockStatesResult {
         state
             .keeper
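
#[tracing::instrument] records spans at INFO by default; adding level = "trace" demotes the per-tick keep-alive span so it only appears when trace-level logging is enabled for this module. A small stand-alone illustration (the tick function and filter are hypothetical, not from this commit):

    use tracing::Level;

    // With level = "trace", this span is only recorded when the subscriber's
    // filter enables TRACE for this target; at an INFO filter it is skipped.
    #[tracing::instrument(level = "trace")]
    fn tick(count: u32) {
        tracing::trace!("keep-alive tick {}", count);
    }

    fn main() {
        tracing_subscriber::fmt()
            .with_max_level(Level::INFO) // INFO filter: `tick` spans suppressed
            .init();
        tick(1); // emits nothing at this filter level
    }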

common/src/types/v0/message_bus/replica.rs

Lines changed: 1 addition & 0 deletions
@@ -158,6 +158,7 @@ pub struct CreateReplica {
 #[derive(Serialize, Deserialize, Default, Debug, Clone, PartialEq)]
 pub struct ReplicaOwners {
     volume: Option<VolumeId>,
+    #[serde(skip)]
     nexuses: Vec<NexusId>,
 }
 impl ReplicaOwners {
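
With #[serde(skip)] the nexuses field is excluded from both serialization and deserialization: it never reaches the wire, and it is rebuilt as Vec::default() (empty) when the struct is read back. A self-contained illustration, with placeholder String ids standing in for VolumeId/NexusId:

    use serde::{Deserialize, Serialize};

    #[derive(Serialize, Deserialize, Default, Debug, Clone, PartialEq)]
    struct ReplicaOwners {
        volume: Option<String>,
        #[serde(skip)]
        nexuses: Vec<String>,
    }

    fn main() {
        let owners = ReplicaOwners {
            volume: Some("vol-1".into()),
            nexuses: vec!["nexus-1".into()],
        };
        let json = serde_json::to_string(&owners).unwrap();
        // The skipped field never reaches the wire:
        assert_eq!(json, r#"{"volume":"vol-1"}"#);
        // ...and comes back as its Default (an empty Vec) on deserialize:
        let back: ReplicaOwners = serde_json::from_str(&json).unwrap();
        assert!(back.nexuses.is_empty());
    }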

common/src/types/v0/store/nexus_persistence.rs

Lines changed: 4 additions & 0 deletions
@@ -26,6 +26,10 @@ impl NexusInfo {
             None => false,
         }
     }
+    /// Check if no replica is healthy
+    pub fn no_healthy_replicas(&self) -> bool {
+        self.children.iter().all(|c| !c.healthy) || self.children.is_empty()
+    }
 }
 
 /// Definition of the child information that gets saved in the persistent
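
Note that all() on an empty iterator is vacuously true, so the is_empty() clause is already covered by the first condition and reads as belt-and-braces. A stand-alone sketch of the predicate's behaviour, with a minimal stand-in for the persisted child type:

    struct ChildInfo {
        healthy: bool,
    }

    struct NexusInfo {
        children: Vec<ChildInfo>,
    }

    impl NexusInfo {
        fn no_healthy_replicas(&self) -> bool {
            // Mirrors the commit: vacuously true for an empty child list.
            self.children.iter().all(|c| !c.healthy) || self.children.is_empty()
        }
    }

    fn main() {
        let none = NexusInfo { children: vec![] };
        let sick = NexusInfo { children: vec![ChildInfo { healthy: false }] };
        let ok = NexusInfo { children: vec![ChildInfo { healthy: true }] };
        assert!(none.no_healthy_replicas());
        assert!(sick.no_healthy_replicas());
        assert!(!ok.no_healthy_replicas());
    }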

control-plane/agents/common/src/errors.rs

Lines changed: 8 additions & 0 deletions
@@ -186,6 +186,8 @@ pub enum SvcError {
     ReplicaCreateNumber { id: String },
     #[snafu(display("No online replicas are available for Volume '{}'", id))]
     NoOnlineReplicas { id: String },
+    #[snafu(display("No healthy replicas are available for Volume '{}'", id))]
+    NoHealthyReplicas { id: String },
     #[snafu(display("Entry with key '{}' not found in the persistent store.", key))]
     StoreMissingEntry { key: String },
     #[snafu(display("The uuid '{}' for kind '{}' is not valid.", uuid, kind.to_string()))]
@@ -514,6 +516,12 @@ impl From<SvcError> for ReplyError {
                 source: desc.to_string(),
                 extra: error.full_string(),
             },
+            SvcError::NoHealthyReplicas { .. } => ReplyError {
+                kind: ReplyErrorKind::VolumeNoReplicas,
+                resource: ResourceKind::Volume,
+                source: desc.to_string(),
+                extra: error.full_string(),
+            },
             SvcError::ReplicaCreateNumber { .. } => ReplyError {
                 kind: ReplyErrorKind::ReplicaCreateNumber,
                 resource: ResourceKind::Volume,
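
The new variant distinguishes "no healthy replicas" from "no online replicas" while mapping to the same ReplyErrorKind::VolumeNoReplicas on the wire. A minimal sketch of where a caller might return it (the helper function is hypothetical; only the variant comes from this commit):

    #[derive(Debug)]
    enum SvcError {
        NoHealthyReplicas { id: String },
    }

    // Hypothetical selection step: fail with the typed error when the
    // volume has no healthy replicas left to pick from.
    fn pick_replica(volume_id: &str, healthy: &[&str]) -> Result<String, SvcError> {
        healthy
            .first()
            .map(|r| r.to_string())
            .ok_or_else(|| SvcError::NoHealthyReplicas {
                id: volume_id.to_string(),
            })
    }

    fn main() {
        let err = pick_replica("vol-1", &[]).unwrap_err();
        // The From<SvcError> impl above would map this to VolumeNoReplicas.
        println!("{:?}", err);
    }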

control-plane/agents/core/src/core/grpc.rs

Lines changed: 48 additions & 9 deletions
@@ -56,13 +56,27 @@ impl GrpcContext {
             comms_timeouts: comms_timeouts.clone(),
         })
     }
-    pub(crate) async fn lock(&self) -> tokio::sync::OwnedMutexGuard<()> {
+    /// Override the timeout config in the context for the given request
+    fn override_timeout<R: MessageIdTimeout>(&mut self, request: Option<R>) {
+        let timeout = request
+            .map(|r| r.timeout(self.comms_timeouts.request(), &bus()))
+            .unwrap_or_else(|| self.comms_timeouts.request());
+
+        self.endpoint = self
+            .endpoint
+            .clone()
+            .connect_timeout(self.comms_timeouts.connect() + Duration::from_millis(500))
+            .timeout(timeout);
+    }
+    pub(crate) async fn lock(&self) -> GrpcLockGuard {
         self.lock.clone().lock_owned().await
     }
     pub(crate) async fn connect(&self) -> Result<GrpcClient, SvcError> {
         GrpcClient::new(self).await
     }
-    pub(crate) async fn connect_locked(&self) -> Result<GrpcClientLocked, SvcError> {
+    pub(crate) async fn connect_locked(
+        &self,
+    ) -> Result<GrpcClientLocked, (GrpcLockGuard, SvcError)> {
         GrpcClientLocked::new(self).await
     }
 }
@@ -72,7 +86,7 @@ impl GrpcContext {
 pub(crate) struct GrpcClient {
     context: GrpcContext,
     /// gRPC Mayastor Client
-    pub(crate) client: MayaClient,
+    pub(crate) mayastor: MayaClient,
 }
 pub(crate) type MayaClient = MayastorClient<Channel>;
 impl GrpcClient {
@@ -96,23 +110,48 @@ impl GrpcClient {
 
         Ok(Self {
             context: context.clone(),
-            client,
+            mayastor: client,
         })
     }
 }
 
-/// Wrapper over all gRPC Clients types with implicit locking for serialization
+/// Async Lock guard for gRPC operations.
+/// It's used by the GrpcClientLocked to ensure there's only one operation in progress
+/// at a time while still allowing for multiple gRPC clients.
+type GrpcLockGuard = tokio::sync::OwnedMutexGuard<()>;
+
+/// Wrapper over all gRPC Clients types with implicit locking for serialization.
 pub(crate) struct GrpcClientLocked {
     /// gRPC auto CRUD guard lock
-    _lock: tokio::sync::OwnedMutexGuard<()>,
+    _lock: GrpcLockGuard,
     client: GrpcClient,
 }
 impl GrpcClientLocked {
-    pub(crate) async fn new(context: &GrpcContext) -> Result<Self, SvcError> {
-        let client = GrpcClient::new(context).await?;
+    /// Create new locked client from the given context
+    /// A connection is established with the timeouts specified from the context.
+    /// Only one `Self` is allowed at a time by making use of a lock guard.
+    pub(crate) async fn new(context: &GrpcContext) -> Result<Self, (GrpcLockGuard, SvcError)> {
+        let _lock = context.lock().await;
+
+        let client = match GrpcClient::new(context).await {
+            Ok(client) => client,
+            Err(error) => return Err((_lock, error)),
+        };
+
+        Ok(Self { _lock, client })
+    }
+    /// Reconnect the client to use for the given request
+    /// This is useful when we want to issue the next gRPC using a different timeout
+    /// todo: tower should allow us to handle this better by keeping the same "backend" client
+    /// but modifying the timeout layer?
+    pub(crate) async fn reconnect<R: MessageIdTimeout>(self, request: R) -> Result<Self, SvcError> {
+        let mut context = self.context.clone();
+        context.override_timeout(Some(request));
+
+        let client = GrpcClient::new(&context).await?;
 
         Ok(Self {
-            _lock: context.lock().await,
+            _lock: self._lock,
             client,
         })
     }
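
The reworked GrpcClientLocked::new acquires the node lock before dialing and hands the guard back alongside the error on failure, so the caller keeps exclusivity and can decide how to recover. A stripped-down sketch of that pattern (all names here are simplified stand-ins, not the real agent API):

    use std::sync::Arc;
    use tokio::sync::{Mutex, OwnedMutexGuard};

    type LockGuard = OwnedMutexGuard<()>;

    struct Client; // stand-in for the connected gRPC client

    // Acquire the per-node lock first, then connect; on failure return the
    // guard together with the error so exclusivity is not silently dropped.
    async fn connect_locked(
        lock: Arc<Mutex<()>>,
    ) -> Result<(LockGuard, Client), (LockGuard, std::io::Error)> {
        let guard = lock.lock_owned().await;
        match connect().await {
            Ok(client) => Ok((guard, client)),
            Err(error) => Err((guard, error)),
        }
    }

    async fn connect() -> Result<Client, std::io::Error> {
        Ok(Client) // pretend the dial succeeded
    }

    #[tokio::main]
    async fn main() {
        let lock = Arc::new(Mutex::new(()));
        let (_guard, _client) = connect_locked(lock).await.unwrap();
        // _guard lives as long as the client, serialising gRPC operations.
    }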
