diff --git a/.gitignore b/.gitignore
index 27e2f7832c..18d44ceb8a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,9 @@
 *.so
 *.dylib
 
+# Avoid checking in keys
+*.pem
+
 # Test binary, built with `go test -c`
 *.test
diff --git a/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml b/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml
index af4b4e91ea..f61e8bfb44 100644
--- a/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml
+++ b/deploy/legacy/manifests/controller/helm/retina/templates/daemonset.yaml
@@ -214,6 +214,15 @@ spec:
             fieldRef:
               apiVersion: v1
               fieldPath: status.hostIP
+        livenessProbe:
+          httpGet:
+            path: /metrics
+            port: {{ .Values.retinaPort }}
+          initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds | default "30" }}
+          periodSeconds: {{ .Values.livenessProbe.periodSeconds | default "30" }}
+          timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds | default "1" }}
+          failureThreshold: {{ .Values.livenessProbe.failureThreshold | default "3" }}
+          successThreshold: {{ .Values.livenessProbe.successThreshold | default "1" }}
        securityContext:
          capabilities:
            add:
diff --git a/test/e2e/README.md b/test/e2e/README.md
index 06a4581b36..b16ef854fb 100644
--- a/test/e2e/README.md
+++ b/test/e2e/README.md
@@ -17,3 +17,19 @@ For reference, see the `test-all` recipe in the root [Makefile](../../Makefile).
 
 For sample test, please check out:
 [the Retina E2E.](./scenarios/retina/drop/scenario.go)
+
+## Sample VSCode `settings.json` for running with an existing cluster
+
+```json
+"go.testFlags": [
+    "-v",
+    "-timeout=40m",
+    "-tags=e2e",
+    "-args",
+    "-create-infra=false",
+    "-delete-infra=false",
+    "-image-namespace=retistrynamespace",
+    "-image-registry=yourregistry",
+    "-image-tag=yourtesttag",
+],
+```
diff --git a/test/e2e/framework/kubernetes/create-kapinger-deployment.go b/test/e2e/framework/kubernetes/create-kapinger-deployment.go
index 60d082b7c3..087ae25451 100644
--- a/test/e2e/framework/kubernetes/create-kapinger-deployment.go
+++ b/test/e2e/framework/kubernetes/create-kapinger-deployment.go
@@ -107,6 +107,9 @@ func (c *CreateKapingerDeployment) GetKapingerDeployment() *appsv1.Deployment {
 				},
 				Spec: v1.PodSpec{
+					NodeSelector: map[string]string{
+						"kubernetes.io/os": "linux",
+					},
 					Affinity: &v1.Affinity{
 						PodAntiAffinity: &v1.PodAntiAffinity{
 							// prefer an even spread across the cluster to avoid scheduling on the same node
diff --git a/test/e2e/framework/kubernetes/exec-pod.go b/test/e2e/framework/kubernetes/exec-pod.go
index 2991a90aa2..03728ddbda 100644
--- a/test/e2e/framework/kubernetes/exec-pod.go
+++ b/test/e2e/framework/kubernetes/exec-pod.go
@@ -1,6 +1,7 @@
 package kubernetes
 
 import (
+	"bytes"
 	"context"
 	"fmt"
 	"log"
@@ -9,6 +10,7 @@ import (
 
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
 	"k8s.io/client-go/tools/clientcmd"
 	"k8s.io/client-go/tools/remotecommand"
 	"k8s.io/kubectl/pkg/scheme"
@@ -27,7 +29,17 @@ func (e *ExecInPod) Run() error {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
-	err := ExecPod(ctx, e.KubeConfigFilePath, e.PodNamespace, e.PodName, e.Command)
+	config, err := clientcmd.BuildConfigFromFlags("", e.KubeConfigFilePath)
+	if err != nil {
+		return fmt.Errorf("error building kubeconfig: %w", err)
+	}
+
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return fmt.Errorf("error creating Kubernetes client: %w", err)
+	}
+
+	_, err = ExecPod(ctx, clientset, config, e.PodNamespace, e.PodName, e.Command)
 	if err != nil {
 		return fmt.Errorf("error executing command [%s]: %w", e.Command, err)
 	}
@@ -43,17 +55,8 @@ func (e *ExecInPod) Stop() error {
 	return nil
 }
 
-func ExecPod(ctx context.Context, kubeConfigFilePath, namespace, podName, command string) error {
-	config, err := clientcmd.BuildConfigFromFlags("", kubeConfigFilePath)
-	if err != nil {
-		return fmt.Errorf("error building kubeconfig: %w", err)
-	}
-
-	clientset, err := kubernetes.NewForConfig(config)
-	if err != nil {
-		return fmt.Errorf("error creating Kubernetes client: %w", err)
-	}
-
+func ExecPod(ctx context.Context, clientset *kubernetes.Clientset, config *rest.Config, namespace, podName, command string) ([]byte, error) {
+	log.Printf("executing command \"%s\" on pod \"%s\" in namespace \"%s\"...", command, podName, namespace)
 	req := clientset.CoreV1().RESTClient().Post().Resource("pods").Name(podName).
 		Namespace(namespace).SubResource(ExecSubResources)
 	option := &v1.PodExecOptions{
@@ -69,20 +72,21 @@ func ExecPod(ctx context.Context, kubeConfigFilePath, namespace, podName, comman
 		scheme.ParameterCodec,
 	)
 
+	var buf bytes.Buffer
 	exec, err := remotecommand.NewSPDYExecutor(config, "POST", req.URL())
 	if err != nil {
-		return fmt.Errorf("error creating executor: %w", err)
+		return buf.Bytes(), fmt.Errorf("error creating executor: %w", err)
 	}
 
-	log.Printf("executing command \"%s\" on pod \"%s\" in namespace \"%s\"...", command, podName, namespace)
 	err = exec.StreamWithContext(ctx, remotecommand.StreamOptions{
 		Stdin:  os.Stdin,
-		Stdout: os.Stdout,
-		Stderr: os.Stderr,
+		Stdout: &buf,
+		Stderr: &buf,
 	})
 	if err != nil {
-		return fmt.Errorf("error executing command: %w", err)
+		return buf.Bytes(), fmt.Errorf("error executing command: %w", err)
 	}
 
-	return nil
+	res := buf.Bytes()
+	return res, nil
 }
diff --git a/test/e2e/framework/kubernetes/get-logs.go b/test/e2e/framework/kubernetes/get-logs.go
index 7cd728af85..a57680ce84 100644
--- a/test/e2e/framework/kubernetes/get-logs.go
+++ b/test/e2e/framework/kubernetes/get-logs.go
@@ -12,9 +12,16 @@ import (
 	"k8s.io/client-go/tools/clientcmd"
 )
 
-func PrintPodLogs(kubeconfigpath, namespace, labelSelector string) {
+type GetPodLogs struct {
+	KubeConfigFilePath string
+	Namespace          string
+	LabelSelector      string
+}
+
+func (p *GetPodLogs) Run() error {
+	fmt.Printf("printing pod logs for namespace: %s, labelselector: %s\n", p.Namespace, p.LabelSelector)
 	// Load the kubeconfig file to get the configuration to access the cluster
-	config, err := clientcmd.BuildConfigFromFlags("", kubeconfigpath)
+	config, err := clientcmd.BuildConfigFromFlags("", p.KubeConfigFilePath)
 	if err != nil {
 		log.Printf("error building kubeconfig: %s\n", err)
 	}
@@ -25,8 +32,14 @@ func PrintPodLogs(kubeconfigpath, namespace, labelSelector string) {
 		log.Printf("error creating clientset: %s\n", err)
 	}
 
+	PrintPodLogs(context.Background(), clientset, p.Namespace, p.LabelSelector)
+
+	return nil
+}
+
+func PrintPodLogs(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) {
 	// List all the pods in the namespace
-	pods, err := clientset.CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{
+	pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
 		LabelSelector: labelSelector,
 	})
 	if err != nil {
@@ -55,5 +68,6 @@ func PrintPodLogs(kubeconfigpath, namespace, labelSelector string) {
 
 		// Print the logs
 		log.Println(string(buf))
+		fmt.Printf("#######################################################\n")
 	}
 }
diff --git a/test/e2e/framework/kubernetes/install-retina-helm.go b/test/e2e/framework/kubernetes/install-retina-helm.go
index 82f10745de..e5507b7ef9 100644
--- a/test/e2e/framework/kubernetes/install-retina-helm.go
+++ b/test/e2e/framework/kubernetes/install-retina-helm.go
@@ -1,6 +1,7 @@
 package kubernetes
 
 import (
+	"context"
 	"fmt"
 	"log"
 	"os"
@@ -11,10 +12,12 @@ import (
 	"helm.sh/helm/v3/pkg/action"
 	"helm.sh/helm/v3/pkg/chart/loader"
 	"helm.sh/helm/v3/pkg/cli"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/clientcmd"
 )
 
 const (
-	createTimeout = 240 * time.Second // windpws is slow
+	createTimeout = 20 * time.Minute // windows is slow
 	deleteTimeout = 60 * time.Second
 )
 
@@ -32,6 +35,8 @@ type InstallHelmChart struct {
 }
 
 func (i *InstallHelmChart) Run() error {
+	ctx, cancel := context.WithTimeout(context.Background(), createTimeout)
+	defer cancel()
 	settings := cli.New()
 	settings.KubeConfig = i.KubeConfigFilePath
 	actionConfig := new(action.Configuration)
@@ -97,7 +102,7 @@ func (i *InstallHelmChart) Run() error {
 	client.WaitForJobs = true
 
 	// install the chart here
-	rel, err := client.Run(chart, chart.Values)
+	rel, err := client.RunWithContext(ctx, chart, chart.Values)
 	if err != nil {
 		return fmt.Errorf("failed to install chart: %w", err)
 	}
@@ -106,6 +111,23 @@ func (i *InstallHelmChart) Run() error {
 
 	// this will confirm the values set during installation
 	log.Printf("chart values: %v\n", rel.Config)
 
+	// ensure all pods are running, since helm doesn't care about windows
+	config, err := clientcmd.BuildConfigFromFlags("", i.KubeConfigFilePath)
+	if err != nil {
+		return fmt.Errorf("error building kubeconfig: %w", err)
+	}
+
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return fmt.Errorf("error creating Kubernetes client: %w", err)
+	}
+
+	labelSelector := "k8s-app=retina"
+	err = WaitForPodReady(ctx, clientset, "kube-system", labelSelector)
+	if err != nil {
+		return fmt.Errorf("error waiting for retina pods to be ready: %w", err)
+	}
+
 	return nil
 }
diff --git a/test/e2e/framework/kubernetes/no-crashes.go b/test/e2e/framework/kubernetes/no-crashes.go
new file mode 100644
index 0000000000..a5d5ec03b7
--- /dev/null
+++ b/test/e2e/framework/kubernetes/no-crashes.go
@@ -0,0 +1,43 @@
+package kubernetes
+
+import (
+	"context"
+	"fmt"
+
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/clientcmd"
+)
+
+var ErrPodCrashed = fmt.Errorf("pod has crashed")
+
+type EnsureStableCluster struct {
+	LabelSelector      string
+	PodNamespace       string
+	KubeConfigFilePath string
+}
+
+func (n *EnsureStableCluster) Run() error {
+	config, err := clientcmd.BuildConfigFromFlags("", n.KubeConfigFilePath)
+	if err != nil {
+		return fmt.Errorf("error building kubeconfig: %w", err)
+	}
+
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return fmt.Errorf("error creating Kubernetes client: %w", err)
+	}
+
+	err = WaitForPodReady(context.TODO(), clientset, n.PodNamespace, n.LabelSelector)
+	if err != nil {
+		return fmt.Errorf("error waiting for retina pods to be ready: %w", err)
+	}
+	return nil
+}
+
+func (n *EnsureStableCluster) Prevalidate() error {
+	return nil
+}
+
+func (n *EnsureStableCluster) Stop() error {
+	return nil
+}
diff --git a/test/e2e/framework/kubernetes/port-forward.go b/test/e2e/framework/kubernetes/port-forward.go
index 5237109867..531fc3953b 100644
--- a/test/e2e/framework/kubernetes/port-forward.go
+++ b/test/e2e/framework/kubernetes/port-forward.go
@@ -11,6 +11,7 @@ import (
 	"time"
 
 	retry "github.com/microsoft/retina/test/retry"
+	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/clientcmd"
@@ -120,7 +121,7 @@ func (p *PortForward) Run() error {
 }
 
 func (p *PortForward) findPodsWithAffinity(ctx context.Context, clientset *kubernetes.Clientset) (string, error) {
-	targetPods, errAffinity := clientset.CoreV1().Pods(p.Namespace).List(ctx, metav1.ListOptions{
+	targetPodsAll, errAffinity := clientset.CoreV1().Pods(p.Namespace).List(ctx, metav1.ListOptions{
 		LabelSelector: p.LabelSelector,
 		FieldSelector: "status.phase=Running",
 	})
@@ -128,6 +129,15 @@ func (p *PortForward) findPodsWithAffinity(ctx context.Context, clientset *kuber
 		return "", fmt.Errorf("could not list pods in %q with label %q: %w", p.Namespace, p.LabelSelector, errAffinity)
 	}
 
+	// omit windows pods because we can't port-forward to them
+	targetPodsLinux := make([]v1.Pod, 0)
+	for i := range targetPodsAll.Items {
+		if targetPodsAll.Items[i].Spec.NodeSelector["kubernetes.io/os"] != "windows" {
+			targetPodsLinux = append(targetPodsLinux, targetPodsAll.Items[i])
+		}
+	}
+
+	// get all pods with optional label affinity
 	affinityPods, errAffinity := clientset.CoreV1().Pods(p.Namespace).List(ctx, metav1.ListOptions{
 		LabelSelector: p.OptionalLabelAffinity,
 		FieldSelector: "status.phase=Running",
 	})
@@ -143,10 +153,10 @@ func (p *PortForward) findPodsWithAffinity(ctx context.Context, clientset *kuber
 	}
 
 	// if a pod is found on the same node as an affinity pod, use it
-	for i := range targetPods.Items {
-		if affinityNodes[targetPods.Items[i].Spec.NodeName] {
+	for i := range targetPodsLinux {
+		if affinityNodes[targetPodsLinux[i].Spec.NodeName] {
 			// found a pod with the specified label, on a node with the optional label affinity
-			return targetPods.Items[i].Name, nil
+			return targetPodsLinux[i].Name, nil
 		}
 	}
diff --git a/test/e2e/framework/kubernetes/wait-pod-ready.go b/test/e2e/framework/kubernetes/wait-pod-ready.go
index 208fe140d1..4ec81ae91a 100644
--- a/test/e2e/framework/kubernetes/wait-pod-ready.go
+++ b/test/e2e/framework/kubernetes/wait-pod-ready.go
@@ -15,12 +15,18 @@ import (
 const (
 	RetryTimeoutPodsReady  = 5 * time.Minute
 	RetryIntervalPodsReady = 5 * time.Second
+
+	printInterval = 5 // print to stdout every 5 iterations
 )
 
 func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, namespace, labelSelector string) error {
 	podReadyMap := make(map[string]bool)
 
+	printIterator := 0
 	conditionFunc := wait.ConditionWithContextFunc(func(context.Context) (bool, error) {
+		defer func() {
+			printIterator++
+		}()
 		var podList *corev1.PodList
 		podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
 		if err != nil {
@@ -40,11 +46,21 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names
 				return false, fmt.Errorf("error getting Pod: %w", err)
 			}
 
+			for istatus := range pod.Status.ContainerStatuses {
+				status := &pod.Status.ContainerStatuses[istatus]
+				if status.RestartCount > 0 {
+					return false, fmt.Errorf("pod %s has %d restarts: status: %+v: %w", pod.Name, status.RestartCount, status, ErrPodCrashed)
+				}
+			}
+
 			// Check the Pod phase
 			if pod.Status.Phase != corev1.PodRunning {
-				log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", pod.Name)
+				if printIterator%printInterval == 0 {
+					log.Printf("pod \"%s\" is not in Running state yet. Waiting...\n", pod.Name)
+				}
 				return false, nil
 			}
+
 			if !podReadyMap[pod.Name] {
 				log.Printf("pod \"%s\" is in Running state\n", pod.Name)
 				podReadyMap[pod.Name] = true
@@ -56,6 +72,7 @@ func WaitForPodReady(ctx context.Context, clientset *kubernetes.Clientset, names
 
 	err := wait.PollUntilContextCancel(ctx, RetryIntervalPodsReady, true, conditionFunc)
 	if err != nil {
+		PrintPodLogs(ctx, clientset, namespace, labelSelector)
 		return fmt.Errorf("error waiting for pods in namespace \"%s\" with label \"%s\" to be in Running state: %w", namespace, labelSelector, err)
 	}
 	return nil
diff --git a/test/e2e/framework/prometheus/prometheus.go b/test/e2e/framework/prometheus/prometheus.go
index 4cdaab1614..0014e2d368 100644
--- a/test/e2e/framework/prometheus/prometheus.go
+++ b/test/e2e/framework/prometheus/prometheus.go
@@ -7,6 +7,7 @@ import (
 	"log"
 	"net/http"
 	"reflect"
+	"strings"
 	"time"
 
 	"github.com/microsoft/retina/test/retry"
@@ -34,7 +35,7 @@ func CheckMetric(promAddress, metricName string, validMetric map[string]string)
 		var err error
 
 		// obtain a full dump of all metrics on the endpoint
-		metrics, err = getAllPrometheusMetrics(promAddress)
+		metrics, err = getAllPrometheusMetricsFromURL(promAddress)
 		if err != nil {
 			return fmt.Errorf("could not start port forward within %ds: %w ", defaultTimeout, err)
 		}
@@ -57,6 +58,21 @@ func CheckMetric(promAddress, metricName string, validMetric map[string]string)
 	return nil
 }
 
+func CheckMetricFromBuffer(prometheusMetricData []byte, metricName string, validMetric map[string]string) error {
+	metrics, err := getAllPrometheusMetricsFromBuffer(prometheusMetricData)
+	if err != nil {
+		return fmt.Errorf("failed to parse prometheus metrics: %w", err)
+	}
+
+	err = verifyValidMetricPresent(metricName, metrics, validMetric)
+	if err != nil {
+		log.Printf("failed to find metric matching %s: %+v\n", metricName, validMetric)
+		return ErrNoMetricFound
+	}
+
+	return nil
+}
+
 func verifyValidMetricPresent(metricName string, data map[string]*promclient.MetricFamily, validMetric map[string]string) error {
 	for _, metric := range data {
 		if metric.GetName() == metricName {
@@ -77,7 +93,7 @@ func verifyValidMetricPresent(metricName string, data map[string]*promclient.Met
 	return fmt.Errorf("failed to find metric matching: %+v: %w", validMetric, ErrNoMetricFound)
 }
 
-func getAllPrometheusMetrics(url string) (map[string]*promclient.MetricFamily, error) {
+func getAllPrometheusMetricsFromURL(url string) (map[string]*promclient.MetricFamily, error) {
 	client := http.Client{}
 	resp, err := client.Get(url) //nolint
 	if err != nil {
@@ -89,7 +105,7 @@ func getAllPrometheusMetrics(url string) (map[string]*promclient.MetricFamily, e
 		return nil, fmt.Errorf("HTTP request failed with status: %v", resp.Status) //nolint:goerr113,gocritic
 	}
 
-	metrics, err := parseReaderPrometheusMetrics(resp.Body)
+	metrics, err := ParseReaderPrometheusMetrics(resp.Body)
 	if err != nil {
 		return nil, err
 	}
@@ -97,7 +113,25 @@ func getAllPrometheusMetrics(url string) (map[string]*promclient.MetricFamily, e
 	return metrics, nil
 }
 
-func parseReaderPrometheusMetrics(input io.Reader) (map[string]*promclient.MetricFamily, error) {
+func getAllPrometheusMetricsFromBuffer(buf []byte) (map[string]*promclient.MetricFamily, error) {
+	var parser expfmt.TextParser
+	reader := strings.NewReader(string(buf))
+	return parser.TextToMetricFamilies(reader) //nolint
+}
+
+func ParseReaderPrometheusMetrics(input io.Reader) (map[string]*promclient.MetricFamily, error) {
 	var parser expfmt.TextParser
 	return parser.TextToMetricFamilies(input) //nolint
 }
+
+// When capturing prometheus output via curl and exec, there's a lot
+// of garbage at the front
+func stripExecGarbage(s string) string {
+	index := strings.Index(s, "#")
+	if index == -1 {
+		// If there's no `#`, return the original string
+		return s
+	}
+	// Slice the string from the first `#` onward, dropping the leading garbage
+	return s[index:]
+}
diff --git a/test/e2e/jobs/jobs.go b/test/e2e/jobs/jobs.go
index eee000c415..8c7c98e04d 100644
--- a/test/e2e/jobs/jobs.go
+++ b/test/e2e/jobs/jobs.go
@@ -9,37 +9,49 @@ import (
 	"github.com/microsoft/retina/test/e2e/scenarios/drop"
 	"github.com/microsoft/retina/test/e2e/scenarios/latency"
 	tcp "github.com/microsoft/retina/test/e2e/scenarios/tcp"
+	"github.com/microsoft/retina/test/e2e/scenarios/windows"
 )
 
-func CreateTestInfra(subID, clusterName, location, kubeConfigFilePath string) *types.Job {
+func CreateTestInfra(subID, clusterName, location, kubeConfigFilePath string, createInfra bool) *types.Job {
 	job := types.NewJob("Create e2e test infrastructure")
 
-	job.AddStep(&azure.CreateResourceGroup{
-		SubscriptionID:    subID,
-		ResourceGroupName: clusterName,
-		Location:          location,
-	}, nil)
-
-	job.AddStep(&azure.CreateVNet{
-		VnetName:         "testvnet",
-		VnetAddressSpace: "10.0.0.0/9",
-	}, nil)
-
-	job.AddStep(&azure.CreateSubnet{
-		SubnetName:         "testsubnet",
-		SubnetAddressSpace: "10.0.0.0/12",
-	}, nil)
-
-	job.AddStep(&azure.CreateNPMCluster{
-		ClusterName:  clusterName,
-		PodCidr:      "10.128.0.0/9",
-		DNSServiceIP: "192.168.0.10",
-		ServiceCidr:  "192.168.0.0/28",
-	}, nil)
-
-	job.AddStep(&azure.GetAKSKubeConfig{
-		KubeConfigFilePath: kubeConfigFilePath,
-	}, nil)
+	if createInfra {
+		job.AddStep(&azure.CreateResourceGroup{
+			SubscriptionID:    subID,
+			ResourceGroupName: clusterName,
+			Location:          location,
+		}, nil)
+
+		job.AddStep(&azure.CreateVNet{
+			VnetName:         "testvnet",
+			VnetAddressSpace: "10.0.0.0/9",
+		}, nil)
+
+		job.AddStep(&azure.CreateSubnet{
+			SubnetName:         "testsubnet",
+			SubnetAddressSpace: "10.0.0.0/12",
+		}, nil)
+
+		job.AddStep(&azure.CreateNPMCluster{
+			ClusterName:  clusterName,
+			PodCidr:      "10.128.0.0/9",
+			DNSServiceIP: "192.168.0.10",
+			ServiceCidr:  "192.168.0.0/28",
+		}, nil)
+
+		job.AddStep(&azure.GetAKSKubeConfig{
+			KubeConfigFilePath: kubeConfigFilePath,
+		}, nil)
+
+	} else {
+		job.AddStep(&azure.GetAKSKubeConfig{
+			KubeConfigFilePath: kubeConfigFilePath,
+			ClusterName:        clusterName,
+			SubscriptionID:     subID,
+			ResourceGroupName:  clusterName,
+			Location:           location,
+		}, nil)
+	}
 
 	job.AddStep(&generic.LoadFlags{
 		TagEnv: generic.DefaultTagEnv,
@@ -77,6 +89,8 @@ func InstallAndTestRetinaBasicMetrics(kubeConfigFilePath, chartPath string) *typ
 
 	job.AddScenario(tcp.ValidateTCPMetrics())
 
+	job.AddScenario(windows.ValidateWindowsBasicMetric())
+
 	dnsScenarios := []struct {
 		name string
 		req  *dns.RequestValidationParams
@@ -122,6 +136,11 @@ func InstallAndTestRetinaBasicMetrics(kubeConfigFilePath, chartPath string) *typ
 		job.AddScenario(dns.ValidateBasicDNSMetrics(scenario.name, scenario.req, scenario.resp))
 	}
 
+	job.AddStep(&kubernetes.EnsureStableCluster{
+		PodNamespace:  "kube-system",
+		LabelSelector: "k8s-app=retina",
+	}, nil)
+
 	return job
 }
 
@@ -184,5 +203,10 @@ func UpgradeAndTestRetinaAdvancedMetrics(kubeConfigFilePath, chartPath, valuesFi
 
 	job.AddScenario(latency.ValidateLatencyMetric())
 
+	job.AddStep(&kubernetes.EnsureStableCluster{
+		PodNamespace:  "kube-system",
+		LabelSelector: "k8s-app=retina",
+	}, nil)
+
 	return job
 }
diff --git a/test/e2e/retina_e2e_test.go b/test/e2e/retina_e2e_test.go
index f804eef495..e69dab3e61 100644
--- a/test/e2e/retina_e2e_test.go
+++ b/test/e2e/retina_e2e_test.go
@@ -2,6 +2,7 @@ package retina
 
 import (
 	"crypto/rand"
+	"flag"
 	"math/big"
 	"os"
 	"os/user"
@@ -16,14 +17,23 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-var locations = []string{"eastus2", "centralus", "southcentralus", "uksouth", "centralindia", "westus2"}
+var (
+	locations   = []string{"eastus2", "centralus", "southcentralus", "uksouth", "centralindia", "westus2"}
+	createInfra = flag.Bool("create-infra", true, "create a Resource group, vNET and AKS cluster for testing")
+	deleteInfra = flag.Bool("delete-infra", true, "delete a Resource group, vNET and AKS cluster for testing")
+)
 
 // TestE2ERetina tests all e2e scenarios for retina
 func TestE2ERetina(t *testing.T) {
 	curuser, err := user.Current()
 	require.NoError(t, err)
+	flag.Parse()
 
-	clusterName := curuser.Username + common.NetObsRGtag + strconv.FormatInt(time.Now().Unix(), 10)
+	clusterName := os.Getenv("CLUSTER_NAME")
+	if clusterName == "" {
+		clusterName = curuser.Username + common.NetObsRGtag + strconv.FormatInt(time.Now().Unix(), 10)
+		t.Logf("CLUSTER_NAME is not set, generating a random cluster name: %s", clusterName)
+	}
 
 	subID := os.Getenv("AZURE_SUBSCRIPTION_ID")
 	require.NotEmpty(t, subID)
@@ -49,7 +59,7 @@ func TestE2ERetina(t *testing.T) {
 	kubeConfigFilePath := filepath.Join(rootDir, "test", "e2e", "test.pem")
 
 	// CreateTestInfra
-	createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, clusterName, location, kubeConfigFilePath))
+	createTestInfra := types.NewRunner(t, jobs.CreateTestInfra(subID, clusterName, location, kubeConfigFilePath, *createInfra))
 	createTestInfra.Run()
 
 	// Hacky way to ensure that the test infra is deleted even if the test panics
@@ -57,7 +67,9 @@ func TestE2ERetina(t *testing.T) {
 		if r := recover(); r != nil {
 			t.Logf("Recovered in TestE2ERetina, %v", r)
 		}
-		_ = jobs.DeleteTestInfra(subID, clusterName, location).Run()
+		if *deleteInfra {
+			_ = jobs.DeleteTestInfra(subID, clusterName, location).Run()
+		}
 	}()
 
 	// Install and test Retina basic metrics
diff --git a/test/e2e/scenarios/windows/scenario.go b/test/e2e/scenarios/windows/scenario.go
new file mode 100644
index 0000000000..12d82f74be
--- /dev/null
+++ b/test/e2e/scenarios/windows/scenario.go
@@ -0,0 +1,19 @@
+package windows
+
+import (
+	"github.com/microsoft/retina/test/e2e/framework/types"
+)
+
+func ValidateWindowsBasicMetric() *types.Scenario {
+	name := "Windows Metrics"
+	steps := []*types.StepWrapper{
+		{
+			Step: &ValidateHNSMetric{
+				KubeConfigFilePath:       "./test.pem",
+				RetinaDaemonSetNamespace: "kube-system",
+				RetinaDaemonSetName:      "retina-agent-win",
+			},
+		},
+	}
+	return types.NewScenario(name, steps...)
+}
diff --git a/test/e2e/scenarios/windows/validate-hns-metrics.go b/test/e2e/scenarios/windows/validate-hns-metrics.go
new file mode 100644
index 0000000000..d4bcd0de1a
--- /dev/null
+++ b/test/e2e/scenarios/windows/validate-hns-metrics.go
@@ -0,0 +1,106 @@
+package windows
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+	"time"
+
+	"github.com/microsoft/retina/test/e2e/common"
+	k8s "github.com/microsoft/retina/test/e2e/framework/kubernetes"
+	prom "github.com/microsoft/retina/test/e2e/framework/prometheus"
+	"github.com/microsoft/retina/test/retry"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kubernetes "k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/clientcmd"
+)
+
+const (
+	defaultRetryDelay        = 5 * time.Second
+	defaultRetryAttempts     = 5
+	defaultHTTPClientTimeout = 2 * time.Second
+)
+
+var (
+	ErrorNoWindowsPod = errors.New("no windows retina pod found")
+	ErrNoMetricFound  = fmt.Errorf("no metric found")
+
+	hnsMetricName  = "networkobservability_windows_hns_stats"
+	defaultRetrier = retry.Retrier{Attempts: defaultRetryAttempts, Delay: defaultRetryDelay}
+)
+
+type ValidateHNSMetric struct {
+	KubeConfigFilePath       string
+	RetinaDaemonSetNamespace string
+	RetinaDaemonSetName      string
+}
+
+func (v *ValidateHNSMetric) Run() error {
+	config, err := clientcmd.BuildConfigFromFlags("", v.KubeConfigFilePath)
+	if err != nil {
+		return fmt.Errorf("error building kubeconfig: %w", err)
+	}
+
+	clientset, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return fmt.Errorf("error creating Kubernetes client: %w", err)
+	}
+
+	pods, err := clientset.CoreV1().Pods(v.RetinaDaemonSetNamespace).List(context.TODO(), metav1.ListOptions{
+		LabelSelector: "k8s-app=retina",
+	})
+	if err != nil {
+		return fmt.Errorf("error listing retina pods: %w", err)
+	}
+
+	var windowsRetinaPod *v1.Pod
+	for pod := range pods.Items {
+		if pods.Items[pod].Spec.NodeSelector["kubernetes.io/os"] == "windows" {
+			windowsRetinaPod = &pods.Items[pod]
+		}
+	}
+	if windowsRetinaPod == nil {
+		return ErrorNoWindowsPod
+	}
+
+	labels := map[string]string{
+		"direction": "win_packets_sent_count",
+	}
+
+	log.Printf("checking for metric %s with labels %+v\n", hnsMetricName, labels)
+
+	// wrap this in a retrier because windows is slow
+	var output []byte
+	err = defaultRetrier.Do(context.TODO(), func() error {
+		output, err = k8s.ExecPod(context.TODO(), clientset, config, windowsRetinaPod.Namespace, windowsRetinaPod.Name, fmt.Sprintf("curl -s http://localhost:%d/metrics", common.RetinaPort))
+		if err != nil {
+			return fmt.Errorf("error executing command in windows retina pod: %w", err)
+		}
+		if len(output) == 0 {
+			return ErrNoMetricFound
+		}
+
+		err = prom.CheckMetricFromBuffer(output, hnsMetricName, labels)
+		if err != nil {
+			return fmt.Errorf("failed to verify prometheus metrics: %w", err)
+		}
+
+		return nil
+	})
+	if err != nil {
+		return fmt.Errorf("failed to find metric %s with labels %+v: %w", hnsMetricName, labels, err)
+	}
+
+	log.Printf("found metric matching %+v: with labels %+v\n", hnsMetricName, labels)
+	return nil
+}
+
+func (v *ValidateHNSMetric) Prevalidate() error {
+	return nil
+}
+
+func (v *ValidateHNSMetric) Stop() error {
+	return nil
+}