Skip to content

Commit f8be2fe

Browse files
committed
Implement initial provisioning failure reporting with wireserver and a KVP fallback
1 parent 623fa65 commit f8be2fe

File tree

2 files changed

+190
-58
lines changed

2 files changed

+190
-58
lines changed

libazureinit/src/goalstate.rs

+92-1
Original file line numberDiff line numberDiff line change
@@ -252,10 +252,85 @@ fn build_report_health_file(goalstate: Goalstate) -> String {
252252
)
253253
}
254254

255+
#[instrument(err, skip_all)]
256+
pub async fn report_failure(
257+
client: &Client,
258+
goalstate: Goalstate,
259+
description: &str,
260+
retry_interval: Duration,
261+
total_timeout: Duration,
262+
url: Option<&str>,
263+
) -> Result<(), Error> {
264+
let mut headers = HeaderMap::new();
265+
headers.insert("x-ms-agent-name", HeaderValue::from_static("azure-init"));
266+
headers.insert("x-ms-version", HeaderValue::from_static("2012-11-30"));
267+
headers.insert(
268+
"Content-Type",
269+
HeaderValue::from_static("text/xml;charset=utf-8"),
270+
);
271+
272+
let request_timeout =
273+
Duration::from_secs(http::WIRESERVER_HTTP_TIMEOUT_SEC);
274+
let url = url.unwrap_or(DEFAULT_HEALTH_URL);
275+
276+
let post_request = build_report_failure_file(goalstate, description);
277+
278+
_ = http::post(
279+
client,
280+
headers,
281+
post_request,
282+
request_timeout,
283+
retry_interval,
284+
total_timeout,
285+
url,
286+
)
287+
.await?;
288+
289+
Ok(())
290+
}
291+
292+
fn build_report_failure_file(
293+
goalstate: Goalstate,
294+
description: &str,
295+
) -> String {
296+
let post_request = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n\
297+
<Health xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\">\n\
298+
<GoalStateIncarnation>$GOAL_STATE_INCARNATION</GoalStateIncarnation>\n\
299+
<Container>\n\
300+
<ContainerId>$CONTAINER_ID</ContainerId>\n\
301+
<RoleInstanceList>\n\
302+
<Role>\n\
303+
<InstanceId>$INSTANCE_ID</InstanceId>\n\
304+
<Health>\n\
305+
<State>NotReady</State>\n\
306+
<Substatus>ProvisioningFailed</Substatus>\n\
307+
<Description>$DESCRIPTION</Description>\n\
308+
</Health>\n\
309+
</Role>\n\
310+
</RoleInstanceList>\n\
311+
</Container>\n\
312+
</Health>";
313+
314+
let post_request =
315+
post_request.replace("$GOAL_STATE_INCARNATION", &goalstate.incarnation);
316+
let post_request = post_request
317+
.replace("$CONTAINER_ID", &goalstate.container.container_id);
318+
let post_request = post_request.replace(
319+
"$INSTANCE_ID",
320+
&goalstate
321+
.container
322+
.role_instance_list
323+
.role_instance
324+
.instance_id,
325+
);
326+
post_request.replace("$DESCRIPTION", description)
327+
}
328+
255329
#[cfg(test)]
256330
mod tests {
257331
use super::{
258-
build_report_health_file, get_goalstate, report_health, Goalstate,
332+
build_report_failure_file, build_report_health_file, get_goalstate,
333+
report_health, Goalstate,
259334
};
260335

261336
use reqwest::{header, Client, StatusCode};
@@ -425,6 +500,22 @@ mod tests {
425500
}
426501
}
427502

503+
#[tokio::test]
504+
async fn test_build_report_failure_file() {
505+
let goalstate: Goalstate = serde_xml_rs::from_str(GOALSTATE_STR)
506+
.expect("Failed to parse the goalstate XML.");
507+
let failure_message = "Provisioning failed due to test error.";
508+
509+
let result = build_report_failure_file(goalstate, failure_message);
510+
511+
assert!(result.contains("<State>NotReady</State>"));
512+
assert!(result.contains("<Substatus>ProvisioningFailed</Substatus>"));
513+
assert!(result.contains(&format!(
514+
"<Description>{}</Description>",
515+
failure_message
516+
)));
517+
}
518+
428519
// Assert malformed responses are retried.
429520
//
430521
// In this case the server doesn't return XML at all.

src/main.rs

+98-57
Original file line numberDiff line numberDiff line change
@@ -164,61 +164,102 @@ async fn provision(config: Config, opts: Cli) -> Result<(), anyhow::Error> {
164164
let imds_http_timeout_sec: u64 = 5 * 60;
165165
let imds_http_retry_interval_sec: u64 = 2;
166166

167-
// Username can be obtained either via fetching instance metadata from IMDS
168-
// or mounting a local device for OVF environment file. It should not fail
169-
// immediately in a single failure, instead it should fall back to the other
170-
// mechanism. So it is not a good idea to use `?` for query() or
171-
// get_environment().
172-
let instance_metadata = imds::query(
173-
&client,
174-
Duration::from_secs(imds_http_retry_interval_sec),
175-
Duration::from_secs(imds_http_timeout_sec),
176-
None, // default IMDS URL
177-
)
178-
.await
179-
.ok();
180-
181-
let environment = get_environment().ok();
182-
183-
let username =
184-
get_username(instance_metadata.as_ref(), environment.as_ref())?;
185-
186-
// It is necessary to get the actual instance metadata after getting username,
187-
// as it is not desirable to immediately return error before get_username.
188-
let im = instance_metadata
189-
.clone()
190-
.ok_or::<LibError>(LibError::InstanceMetadataFailure)?;
191-
192-
let user =
193-
User::new(username, im.compute.public_keys).with_groups(opts.groups);
194-
195-
Provision::new(im.compute.os_profile.computer_name, user, config)
196-
.provision()?;
197-
198-
let vm_goalstate = goalstate::get_goalstate(
199-
&client,
200-
Duration::from_secs(imds_http_retry_interval_sec),
201-
Duration::from_secs(imds_http_timeout_sec),
202-
None, // default wireserver goalstate URL
203-
)
204-
.await
205-
.with_context(|| {
206-
tracing::error!("Failed to get the desired goalstate.");
207-
"Failed to get desired goalstate."
208-
})?;
209-
210-
goalstate::report_health(
211-
&client,
212-
vm_goalstate,
213-
Duration::from_secs(imds_http_retry_interval_sec),
214-
Duration::from_secs(imds_http_timeout_sec),
215-
None, // default wireserver health URL
216-
)
217-
.await
218-
.with_context(|| {
219-
tracing::error!("Failed to report VM health.");
220-
"Failed to report VM health."
221-
})?;
222-
223-
Ok(())
167+
// Wrap the entire provisioning process in an async block to capture errors.
168+
// If an error happens, capture the goalstate and call report_failure()
169+
let provisioning_result: Result<(), anyhow::Error> = async {
170+
// Username can be obtained either via fetching instance metadata from IMDS
171+
// or mounting a local device for OVF environment file. It should not fail
172+
// immediately in a single failure, instead it should fall back to the other
173+
// mechanism. So it is not a good idea to use `?` for query() or
174+
// get_environment().
175+
let instance_metadata = imds::query(
176+
&client,
177+
Duration::from_secs(imds_http_retry_interval_sec),
178+
Duration::from_secs(imds_http_timeout_sec),
179+
None, // default IMDS URL
180+
)
181+
.await
182+
.ok();
183+
184+
let environment = get_environment().ok();
185+
186+
let username =
187+
get_username(instance_metadata.as_ref(), environment.as_ref())?;
188+
189+
// It is necessary to get the actual instance metadata after getting username,
190+
// as it is not desirable to immediately return error before get_username.
191+
let im = instance_metadata
192+
.clone()
193+
.ok_or::<LibError>(LibError::InstanceMetadataFailure)?;
194+
195+
let user = User::new(username, im.compute.public_keys)
196+
.with_groups(opts.groups);
197+
198+
Provision::new(im.compute.os_profile.computer_name, user, config)
199+
.provision()?;
200+
201+
let vm_goalstate = goalstate::get_goalstate(
202+
&client,
203+
Duration::from_secs(imds_http_retry_interval_sec),
204+
Duration::from_secs(imds_http_timeout_sec),
205+
None, // default wireserver goalstate URL
206+
)
207+
.await
208+
.with_context(|| {
209+
tracing::error!("Failed to get the desired goalstate.");
210+
"Failed to get desired goalstate."
211+
})?;
212+
213+
goalstate::report_health(
214+
&client,
215+
vm_goalstate,
216+
Duration::from_secs(imds_http_retry_interval_sec),
217+
Duration::from_secs(imds_http_timeout_sec),
218+
None, // default wireserver health URL
219+
)
220+
.await
221+
.with_context(|| {
222+
tracing::error!("Failed to report VM health.");
223+
"Failed to report VM health."
224+
})?;
225+
226+
Ok(())
227+
}
228+
.await;
229+
230+
if let Err(ref e) = provisioning_result {
231+
tracing::error!("Provisioning failed with error: {:?}", e);
232+
233+
// Report the provisioning failure via wireserver.
234+
// If this fails, fall back to KVP reporting by logging the error.
235+
if let Ok(vm_goalstate) = goalstate::get_goalstate(
236+
&client,
237+
Duration::from_secs(imds_http_retry_interval_sec),
238+
Duration::from_secs(imds_http_timeout_sec),
239+
None, // default wireserver goalstate URL
240+
)
241+
.await
242+
{
243+
let failure_description = format!("Provisioning error: {:?}", e);
244+
if let Err(report_err) = goalstate::report_failure(
245+
&client,
246+
vm_goalstate,
247+
&failure_description,
248+
Duration::from_secs(imds_http_retry_interval_sec),
249+
Duration::from_secs(imds_http_timeout_sec),
250+
None, // default wireserver health URL
251+
)
252+
.await
253+
{
254+
tracing::error!(
255+
"Failed to report provisioning failure: {:?}",
256+
report_err
257+
);
258+
}
259+
} else {
260+
tracing::error!("Could not fetch goalstate for failure reporting");
261+
}
262+
}
263+
264+
provisioning_result
224265
}

0 commit comments

Comments
 (0)