Implement initial provisioning failure reporting with wireserver and a KVP fallback

peytonr18 · peytonr18 · commit f8be2fe291bd · 2025-03-04T18:35:45.000-08:00
diff --git a/libazureinit/src/goalstate.rs b/libazureinit/src/goalstate.rs
@@ -252,10 +252,85 @@ fn build_report_health_file(goalstate: Goalstate) -> String {
     )
 }
 
+/// Report a provisioning failure to the wireserver health endpoint.
+///
+/// The report body is built from `goalstate` and `description` and POSTed to
+/// `url`, or to `DEFAULT_HEALTH_URL` when `url` is `None`. An error from
+/// `http::post` is returned to the caller; its success value is discarded.
+#[instrument(err, skip_all)]
+pub async fn report_failure(
+    client: &Client,
+    goalstate: Goalstate,
+    description: &str,
+    retry_interval: Duration,
+    total_timeout: Duration,
+    url: Option<&str>,
+) -> Result<(), Error> {
+    // Headers attached to the failure report request.
+    let mut headers = HeaderMap::new();
+    headers.insert("x-ms-agent-name", HeaderValue::from_static("azure-init"));
+    headers.insert("x-ms-version", HeaderValue::from_static("2012-11-30"));
+    headers.insert(
+        "Content-Type",
+        HeaderValue::from_static("text/xml;charset=utf-8"),
+    );
+
+    _ = http::post(
+        client,
+        headers,
+        build_report_failure_file(goalstate, description),
+        Duration::from_secs(http::WIRESERVER_HTTP_TIMEOUT_SEC),
+        retry_interval,
+        total_timeout,
+        url.unwrap_or(DEFAULT_HEALTH_URL),
+    )
+    .await?;
+
+    Ok(())
+}
+
+/// Build the wireserver health XML that reports `ProvisioningFailed`.
+/// `description` is XML-escaped so it cannot corrupt the document.
+fn build_report_failure_file(
+    goalstate: Goalstate,
+    description: &str,
+) -> String {
+    let template = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n\
+    <Health xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\">\n\
+        <GoalStateIncarnation>$GOAL_STATE_INCARNATION</GoalStateIncarnation>\n\
+        <Container>\n\
+            <ContainerId>$CONTAINER_ID</ContainerId>\n\
+            <RoleInstanceList>\n\
+                <Role>\n\
+                    <InstanceId>$INSTANCE_ID</InstanceId>\n\
+                    <Health>\n\
+                        <State>NotReady</State>\n\
+                        <Substatus>ProvisioningFailed</Substatus>\n\
+                        <Description>$DESCRIPTION</Description>\n\
+                    </Health>\n\
+                </Role>\n\
+            </RoleInstanceList>\n\
+        </Container>\n\
+    </Health>";
+    // Escape `&` first so the entities added for `<` and `>` stay intact.
+    let description = description
+        .replace('&', "&amp;")
+        .replace('<', "&lt;")
+        .replace('>', "&gt;");
+    let instance_id =
+        &goalstate.container.role_instance_list.role_instance.instance_id;
+    template
+        .replace("$GOAL_STATE_INCARNATION", &goalstate.incarnation)
+        .replace("$CONTAINER_ID", &goalstate.container.container_id)
+        .replace("$INSTANCE_ID", instance_id)
+        .replace("$DESCRIPTION", &description)
+}
+
 #[cfg(test)]
 mod tests {
     use super::{
-        build_report_health_file, get_goalstate, report_health, Goalstate,
+        build_report_failure_file, build_report_health_file, get_goalstate,
+        report_health, Goalstate,
     };
 
     use reqwest::{header, Client, StatusCode};
@@ -425,6 +500,22 @@ mod tests {
         }
     }
 
+    // The failure document must carry the NotReady/ProvisioningFailed
+    // status pair and include the supplied description text.
+    #[tokio::test]
+    async fn test_build_report_failure_file() {
+        let description = "Provisioning failed due to test error.";
+        let parsed: Goalstate = serde_xml_rs::from_str(GOALSTATE_STR)
+            .expect("Failed to parse the goalstate XML.");
+        let report = build_report_failure_file(parsed, description);
+
+        let description_tag =
+            format!("<Description>{}</Description>", description);
+        assert!(report.contains("<State>NotReady</State>"));
+        assert!(report.contains("<Substatus>ProvisioningFailed</Substatus>"));
+        assert!(report.contains(&description_tag));
+    }
+
     // Assert malformed responses are retried.
     //
     // In this case the server doesn't return XML at all.
diff --git a/src/main.rs b/src/main.rs
@@ -164,61 +164,102 @@ async fn provision(config: Config, opts: Cli) -> Result<(), anyhow::Error> {
     let imds_http_timeout_sec: u64 = 5 * 60;
     let imds_http_retry_interval_sec: u64 = 2;
 
-    // Username can be obtained either via fetching instance metadata from IMDS
-    // or mounting a local device for OVF environment file. It should not fail
-    // immediately in a single failure, instead it should fall back to the other
-    // mechanism. So it is not a good idea to use `?` for query() or
-    // get_environment().
-    let instance_metadata = imds::query(
-        &client,
-        Duration::from_secs(imds_http_retry_interval_sec),
-        Duration::from_secs(imds_http_timeout_sec),
-        None, // default IMDS URL
-    )
-    .await
-    .ok();
-
-    let environment = get_environment().ok();
-
-    let username =
-        get_username(instance_metadata.as_ref(), environment.as_ref())?;
-
-    // It is necessary to get the actual instance metadata after getting username,
-    // as it is not desirable to immediately return error before get_username.
-    let im = instance_metadata
-        .clone()
-        .ok_or::<LibError>(LibError::InstanceMetadataFailure)?;
-
-    let user =
-        User::new(username, im.compute.public_keys).with_groups(opts.groups);
-
-    Provision::new(im.compute.os_profile.computer_name, user, config)
-        .provision()?;
-
-    let vm_goalstate = goalstate::get_goalstate(
-        &client,
-        Duration::from_secs(imds_http_retry_interval_sec),
-        Duration::from_secs(imds_http_timeout_sec),
-        None, // default wireserver goalstate URL
-    )
-    .await
-    .with_context(|| {
-        tracing::error!("Failed to get the desired goalstate.");
-        "Failed to get desired goalstate."
-    })?;
-
-    goalstate::report_health(
-        &client,
-        vm_goalstate,
-        Duration::from_secs(imds_http_retry_interval_sec),
-        Duration::from_secs(imds_http_timeout_sec),
-        None, // default wireserver health URL
-    )
-    .await
-    .with_context(|| {
-        tracing::error!("Failed to report VM health.");
-        "Failed to report VM health."
-    })?;
-
-    Ok(())
+    // Wrap the entire provisioning process in an async block to capture errors.
+    // If an error happens, capture the goalstate and call report_failure()
+    let provisioning_result: Result<(), anyhow::Error> = async {
+        // Username can be obtained either via fetching instance metadata from IMDS
+        // or mounting a local device for OVF environment file. It should not fail
+        // immediately in a single failure, instead it should fall back to the other
+        // mechanism. So it is not a good idea to use `?` for query() or
+        // get_environment().
+        let instance_metadata = imds::query(
+            &client,
+            Duration::from_secs(imds_http_retry_interval_sec),
+            Duration::from_secs(imds_http_timeout_sec),
+            None, // default IMDS URL
+        )
+        .await
+        .ok();
+
+        let environment = get_environment().ok();
+
+        let username =
+            get_username(instance_metadata.as_ref(), environment.as_ref())?;
+
+        // It is necessary to get the actual instance metadata after getting username,
+        // as it is not desirable to immediately return error before get_username.
+        let im = instance_metadata
+            .clone()
+            .ok_or::<LibError>(LibError::InstanceMetadataFailure)?;
+
+        let user = User::new(username, im.compute.public_keys)
+            .with_groups(opts.groups);
+
+        Provision::new(im.compute.os_profile.computer_name, user, config)
+            .provision()?;
+
+        let vm_goalstate = goalstate::get_goalstate(
+            &client,
+            Duration::from_secs(imds_http_retry_interval_sec),
+            Duration::from_secs(imds_http_timeout_sec),
+            None, // default wireserver goalstate URL
+        )
+        .await
+        .with_context(|| {
+            tracing::error!("Failed to get the desired goalstate.");
+            "Failed to get desired goalstate."
+        })?;
+
+        goalstate::report_health(
+            &client,
+            vm_goalstate,
+            Duration::from_secs(imds_http_retry_interval_sec),
+            Duration::from_secs(imds_http_timeout_sec),
+            None, // default wireserver health URL
+        )
+        .await
+        .with_context(|| {
+            tracing::error!("Failed to report VM health.");
+            "Failed to report VM health."
+        })?;
+
+        Ok(())
+    }
+    .await;
+
+    if let Err(ref e) = provisioning_result {
+        tracing::error!("Provisioning failed with error: {:?}", e);
+
+        // Report the provisioning failure via wireserver.
+        // If this fails, fall back to KVP reporting by logging the error.
+        if let Ok(vm_goalstate) = goalstate::get_goalstate(
+            &client,
+            Duration::from_secs(imds_http_retry_interval_sec),
+            Duration::from_secs(imds_http_timeout_sec),
+            None, // default wireserver goalstate URL
+        )
+        .await
+        {
+            let failure_description = format!("Provisioning error: {:?}", e);
+            if let Err(report_err) = goalstate::report_failure(
+                &client,
+                vm_goalstate,
+                &failure_description,
+                Duration::from_secs(imds_http_retry_interval_sec),
+                Duration::from_secs(imds_http_timeout_sec),
+                None, // default wireserver health URL
+            )
+            .await
+            {
+                tracing::error!(
+                    "Failed to report provisioning failure: {:?}",
+                    report_err
+                );
+            }
+        } else {
+            tracing::error!("Could not fetch goalstate for failure reporting");
+        }
+    }
+
+    provisioning_result
 }