[antlir2_vm] dump eth0 traffic when running in test mode

chantra · facebook-github-bot · commit c7c8d04ef043 · 2024-09-30T15:37:39.000-07:00
Summary: This diff provides a mechanism to dump eth0 traffic when running an antlir VM. Essentially, this adds a new option to `antlir2_vm run` called `--eth0-output-file`. When it is set, qemu will be configured to dump the traffic of the `net0` interface to that file. By default, nothing is dumped. To plug this into the testing framework, when `antlir2_vm test` is called with `--dump-eth0-traffic`, it creates a blob artifact that will be uploaded to TPX and passes the resulting file to `antlir2_vm run` via `--eth0-output-file`. Test Plan: ``` $ buck2 run fbcode//antlir/antlir2/antlir2_vm:antlir2_vm -- run -h Buck UI: https://www.internalfb.com/buck2/64d2ce42-afe3-42cf-9970-76387fcbb5a8 Network: Up: 0B Down: 0B Jobs completed: 6. Time elapsed: 0.0s. BUILD SUCCEEDED Run the VM. Must be executed inside container Usage: antlir2_vm run [OPTIONS] --machine-spec <MACHINE_SPEC> [COMMAND]... Arguments: [COMMAND]... Execute command through ssh inside VM Options: --machine-spec <MACHINE_SPEC> Json-encoded file for VM machine configuration --expect-failure Expects the VM to timeout or terminate early --postmortem The command should be run after VM termination. Console log will be available at env $CONSOLE_OUTPUT --timeout-secs <TIMEOUT_SECS> Timeout in seconds before VM will be terminated. None disables the timeout, which should only be used for interactive shells for development --console-output-file <CONSOLE_OUTPUT_FILE> Redirect console output to file. By default it's suppressed --output-dirs <OUTPUT_DIRS> Output directories that need to be available inside VM --command-envs <COMMAND_ENVS> Environment variables for the command --first-boot-command <FIRST_BOOT_COMMAND> Command requires first boot --eth0-output-file <ETH0_OUTPUT_FILE> Dump network traffic on eth0 to output to file. By default it is not dumped --console Drop into console prompt. 
This also enables console output on screen, unless `--console-output-file` is specified --container Drop into container shell outside VM -h, --help Print help ``` and ran: ``` buck2 test $(kerctl vmtest-config -e everstore:GICWmAACj33BEzsFAElJ4glD5YhXbuYfAAAf) fbcode//kernel/vmtest/uname_test:uname_test-6.9-local ``` https://www.internalfb.com/intern/testinfra/testconsole/testrun/6755399674977058/ leads to https://www.internalfb.com/intern/testinfra/diagnostics/6755399674977058.562950104331417.1727383743/ which has an "Artifacts" section with eth0.pcap Reviewed By: wujj123456 Differential Revision: D63487026 fbshipit-source-id: f6e63d2202a08d7b324364f7b492818de3224230
diff --git a/antlir/antlir2/antlir2_vm/src/main.rs b/antlir/antlir2/antlir2_vm/src/main.rs
@@ -44,6 +44,7 @@ use crate::share::VirtiofsShare;
 use crate::types::MachineOpts;
 use crate::types::MountPlatformDecision;
 use crate::types::VMArgs;
+use crate::utils::create_tpx_blobs;
 use crate::utils::create_tpx_logs;
 use crate::utils::env_names_to_kvpairs;
 use crate::utils::log_command;
@@ -97,6 +98,9 @@ struct IsolateCmdArgs {
     /// Extra RW bind-mount into the VM for debugging purpose
     #[arg(long)]
     scratch_dir: Option<PathBuf>,
+    /// Whether or not to dump the VM's eth0 traffic to a file. When running the test command, this will set eth0_output_file to a file that will be uploaded to tpx.
+    #[arg(long, default_value_t = false)]
+    dump_eth0_traffic: bool,
     /// Args for run command
     #[clap(flatten)]
     run_cmd_args: RunCmdArgs,
@@ -260,7 +264,11 @@ fn record_envs(_envs: &[KvPair]) -> Result<()> {
 
 /// Further validate `VMArgs` parsed by clap and generate a new `VMArgs` with
 /// content specific to test execution.
-fn get_test_vm_args(orig_args: &VMArgs, cli_envs: Vec<String>) -> Result<ValidatedVMArgs> {
+fn get_test_vm_args(
+    orig_args: &VMArgs,
+    cli_envs: Vec<String>,
+    dump_eth0_traffic: bool,
+) -> Result<ValidatedVMArgs> {
     if orig_args.timeout_secs.is_none() {
         return Err(anyhow!("Test command must specify --timeout-secs."));
     }
@@ -302,6 +310,9 @@ fn get_test_vm_args(orig_args: &VMArgs, cli_envs: Vec<String>) -> Result<Validat
     vm_args.mode.command = Some(test_args.test.into_inner_cmd());
     vm_args.command_envs = envs;
     vm_args.console_output_file = create_tpx_logs("console.txt", "console logs")?;
+    if dump_eth0_traffic {
+        vm_args.eth0_output_file = create_tpx_blobs("eth0.pcap", "eth0 traffic")?;
+    }
     Ok(ValidatedVMArgs {
         inner: vm_args,
         is_list,
@@ -381,7 +392,11 @@ fn test(args: &IsolateCmdArgs) -> Result<()> {
     // It may then decide whether to use host's platform for the actual test.
     Platform::set(&MountPlatformDecision(true))?;
 
-    let validated_args = get_test_vm_args(&args.run_cmd_args.vm_args, args.passenv.clone())?;
+    let validated_args = get_test_vm_args(
+        &args.run_cmd_args.vm_args,
+        args.passenv.clone(),
+        args.dump_eth0_traffic,
+    )?;
     antlir2_rootless::unshare_new_userns()?;
     antlir2_isolate::unshare_and_privatize_mount_ns().context("while isolating mount ns")?;
     let mut command = if validated_args.is_list {
@@ -441,27 +456,28 @@ mod test {
         };
         let mut expected = valid.clone();
         expected.mode.command = Some(vec![OsString::from("whatever")]);
-        let parsed = get_test_vm_args(&valid, vec![]).expect("Parsing should succeed");
+        let parsed = get_test_vm_args(&valid, vec![], false).expect("Parsing should succeed");
         assert_eq!(parsed.inner.mode, expected.mode);
         assert!(!parsed.is_list);
 
         let mut timeout = valid.clone();
         timeout.timeout_secs = None;
-        assert!(get_test_vm_args(&timeout, vec![]).is_err());
+        assert!(get_test_vm_args(&timeout, vec![], false).is_err());
 
         let mut output_dirs = valid.clone();
         output_dirs.output_dirs = vec![PathBuf::from("/some")];
-        assert!(get_test_vm_args(&output_dirs, vec![]).is_err());
+        assert!(get_test_vm_args(&output_dirs, vec![], false).is_err());
 
         let mut command = valid.clone();
         command.mode.command = None;
-        assert!(get_test_vm_args(&command, vec![]).is_err());
+        assert!(get_test_vm_args(&command, vec![], false).is_err());
         command.mode.command = Some(vec![OsString::from("invalid")]);
-        assert!(get_test_vm_args(&command, vec![]).is_err());
+        assert!(get_test_vm_args(&command, vec![], false).is_err());
 
         let env_var_test = valid;
         std::env::set_var("TEST_PILOT_A", "A");
-        let parsed = get_test_vm_args(&env_var_test, vec![]).expect("Parsing should succeed");
+        let parsed =
+            get_test_vm_args(&env_var_test, vec![], false).expect("Parsing should succeed");
         assert!(
             parsed
                 .inner
diff --git a/antlir/antlir2/antlir2_vm/src/types.rs b/antlir/antlir2/antlir2_vm/src/types.rs
@@ -96,6 +96,9 @@ pub(crate) struct VMArgs {
     /// Command requires first boot
     #[clap(long)]
     pub(crate) first_boot_command: Option<String>,
+    /// Dump network traffic on eth0 to output to file. By default it is not dumped.
+    #[clap(long)]
+    pub(crate) eth0_output_file: Option<PathBuf>,
     /// Operation for VM to carry out
     #[clap(flatten)]
     pub(crate) mode: VMModeArgs,
@@ -131,6 +134,10 @@ impl VMArgs {
             args.push("--console-output-file".into());
             args.push(path.into());
         }
+        if let Some(path) = &self.eth0_output_file {
+            args.push("--eth0-output-file".into());
+            args.push(path.into());
+        }
         self.command_envs.iter().for_each(|pair| {
             args.push("--command-envs".into());
             let mut kv_str = OsString::new();
@@ -176,6 +183,14 @@ impl VMArgs {
                 outputs.insert(env::current_dir().expect("current dir must be valid"));
             }
         }
+        // eth0 output needs to be accessible for debugging and uploading
+        if let Some(file_path) = &self.eth0_output_file {
+            if let Some(parent) = file_path.parent() {
+                outputs.insert(parent.to_path_buf());
+            } else {
+                outputs.insert(env::current_dir().expect("current dir must be valid"));
+            }
+        }
         outputs
     }
 }
diff --git a/antlir/antlir2/antlir2_vm/src/vm.rs b/antlir/antlir2/antlir2_vm/src/vm.rs
@@ -13,6 +13,7 @@ use std::io::BufRead;
 use std::io::BufReader;
 use std::io::ErrorKind;
 use std::io::Read;
+use std::io::Write;
 use std::net::Shutdown;
 use std::os::unix::net::UnixStream;
 use std::path::Path;
@@ -31,6 +32,7 @@ use thiserror::Error;
 use tracing::debug;
 use tracing::error;
 use tracing::info;
+use tracing::warn;
 use uuid::Uuid;
 
 use crate::disk::QCow2DiskError;
@@ -131,7 +133,20 @@ impl<S: Share> VM<S> {
             &state_dir,
             machine.mem_mib,
         )?;
-        let nics = VirtualNICs::new(machine.num_nics, machine.max_combined_channels)?;
+        let mut nics = VirtualNICs::new(machine.num_nics, machine.max_combined_channels)?;
+        if nics.len() > 0 {
+            if let Err(e) = nics[0].try_dump_file(args.eth0_output_file.clone()) {
+                let err = format!("Failed to set eth0 dump file: {:?}", e);
+                warn!(err);
+                // Leave a hint that we could not set the dump file by writing a textual error in the .pcap file.
+                // This will generate a corrupted .pcap file that an operator can look into to debug and understand what went wrong.
+                if let Some(filename) = args.eth0_output_file.as_ref() {
+                    // If any part of this fails, we don't want to fail the VM creation.
+                    let _ =
+                        fs::File::create(filename).and_then(|mut f| f.write_all(err.as_bytes()));
+                }
+            }
+        }
         let tpm = match machine.use_tpm {
             true => Some(TPMDevice::new(&state_dir)?),
             false => None,
@@ -391,6 +406,7 @@ impl<S: Share> VM<S> {
         if let Some(tpm) = &self.tpm {
             args.extend(tpm.qemu_args());
         }
+
         let mut command = Command::new(match self.machine.arch {
             CpuIsa::AARCH64 => "qemu-system-aarch64",
             CpuIsa::X86_64 => "qemu-system-x86_64",