🐛 refactored cleanup for clusterctl upgrade e2e #12775
@@ -23,7 +23,9 @@ import (
    "io"
    "net/http"
    "os"
    "os/exec"
    "path/filepath"
    "regexp"
    "runtime"
    "strings"
    "time"
@@ -58,6 +60,18 @@ import (
    "sigs.k8s.io/cluster-api/util/version"
)

// isValidClusterName validates that a cluster name is safe to use in shell commands.
// It only allows alphanumeric characters, hyphens, and underscores.
func isValidClusterName(name string) bool {
    if name == "" {
        return false
    }
    // Allow alphanumeric characters, hyphens, and underscores
    // This matches typical Kubernetes resource naming conventions
    validNameRegex := regexp.MustCompile(`^[a-zA-Z0-9-_]+$`)
    return validNameRegex.MatchString(name)
}

// ClusterctlUpgradeSpecInput is the input for ClusterctlUpgradeSpec.
type ClusterctlUpgradeSpecInput struct {
    E2EConfig *clusterctl.E2EConfig
@@ -260,6 +274,10 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg
})

It("Should create a management cluster and then upgrade all the providers", func() {
    // Clean up any leftover clusters from previous test runs at the very beginning
    By("Cleaning up any leftover clusters from previous runs")
    cleanupRemainingKindClusters(ctx, specName)

    infrastructureProvider := clusterctl.DefaultInfrastructureProvider
    if input.InfrastructureProvider != nil {
        infrastructureProvider = *input.InfrastructureProvider
@@ -282,12 +300,36 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg
})
Expect(managementClusterProvider).ToNot(BeNil(), "Failed to create a kind cluster")

// Set up cleanup for kind management cluster
DeferCleanup(func() {
    if !input.SkipCleanup && managementClusterProvider != nil {
        By("Disposing kind management cluster provider")
        managementClusterProvider.Dispose(ctx)
    }
})

kubeconfigPath := managementClusterProvider.GetKubeconfigPath()
Expect(kubeconfigPath).To(BeAnExistingFile(), "Failed to get the kubeconfig file for the kind cluster")

managementClusterProxy = input.KindManagementClusterNewClusterProxyFunc(managementClusterName, kubeconfigPath)
Expect(managementClusterProxy).ToNot(BeNil(), "Failed to get a kind cluster proxy")

// Set up cleanup for kind management cluster proxy
DeferCleanup(func() {
    if !input.SkipCleanup && managementClusterProxy != nil {
        By("Disposing kind management cluster proxy")
        managementClusterProxy.Dispose(ctx)
    }
})

// Set up cleanup for kind cluster logs
DeferCleanup(func() {
    if managementClusterProxy != nil {
        By("Dumping kind cluster logs")
        dumpKindClusterLogs(ctx, input.ArtifactFolder, managementClusterProxy)
    }
})

managementClusterResources.Cluster = &clusterv1.Cluster{
    ObjectMeta: metav1.ObjectMeta{
        Name: managementClusterName,
@@ -322,6 +364,14 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg
    WaitForMachineDeployments: input.E2EConfig.GetIntervals(specName, "wait-worker-nodes"),
}, managementClusterResources)

// Set up cleanup for non-kind management cluster
DeferCleanup(func() {
    if managementClusterNamespace != nil && managementClusterCancelWatches != nil && managementClusterResources.Cluster != nil {
        By("Cleaning up non-kind management cluster resources")
        framework.DumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ClusterctlConfigPath, input.ArtifactFolder, managementClusterNamespace, managementClusterCancelWatches, managementClusterResources.Cluster, input.E2EConfig.GetIntervals, input.SkipCleanup)
    }
})

// If the cluster is a DockerCluster, we should load controller images into the nodes.
// Nb. this can be achieved also by changing the DockerMachine spec, but for the time being we are using
// this approach because this allows to have a single source of truth for images, the e2e config
@@ -408,6 +458,67 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg
    LogFolder: filepath.Join(input.ArtifactFolder, "clusters", "bootstrap"),
})

// Set up cleanup for test namespace and workload cluster
DeferCleanup(func() {
    if testNamespace != nil {
        By("Cleaning up test namespace and workload cluster")

        // Dump all the logs from the workload cluster before deleting them.
        if workloadClusterName != "" {
            framework.DumpAllResourcesAndLogs(ctx, managementClusterProxy, input.ClusterctlConfigPath, input.ArtifactFolder, testNamespace, &clusterv1.Cluster{
                // DumpAllResourcesAndLogs only uses Namespace + Name from the Cluster object.
                ObjectMeta: metav1.ObjectMeta{
                    Namespace: testNamespace.Name,
                    Name: workloadClusterName,
                },
            })
        }

        if !input.SkipCleanup {
            // Delete all clusters in the test namespace
            Byf("Deleting all clusters in namespace %s in management cluster %s", testNamespace.Name, managementClusterName)
            deleteAllClustersAndWait(ctx, deleteAllClustersAndWaitInput{
                Client: managementClusterProxy.GetClient(),
                Namespace: testNamespace.Name,
            }, input.E2EConfig.GetIntervals(specName, "wait-delete-cluster")...)

            // Cleanup any remaining Kind clusters that might not have been cleaned up by the above
            By("Cleaning up any remaining Kind clusters")
            cleanupRemainingKindClusters(ctx, specName)

            // Delete the test namespace
            Byf("Deleting namespace %s used for hosting the %q test", testNamespace.Name, specName)
            framework.DeleteNamespace(ctx, framework.DeleteNamespaceInput{
                Deleter: managementClusterProxy.GetClient(),
                Name: testNamespace.Name,
            })

            // Delete providers
            if managementClusterResources.Cluster != nil {
                Byf("Deleting providers")
                clusterctl.Delete(ctx, clusterctl.DeleteInput{
                    LogFolder: filepath.Join(input.ArtifactFolder, "clusters", managementClusterResources.Cluster.Name),
                    ClusterctlConfigPath: input.ClusterctlConfigPath,
                    KubeconfigPath: managementClusterProxy.GetKubeconfigPath(),
                })
            }
        }

        // Cancel watches
        if testCancelWatches != nil {
            testCancelWatches()
        }
    }
})

// Set up cleanup for pre-cleanup management cluster hook
DeferCleanup(func() {
    if input.PreCleanupManagementCluster != nil {
        By("Running PreCleanupManagementCluster steps against the management cluster")
        input.PreCleanupManagementCluster(managementClusterProxy)
    }
})

if input.PostNamespaceCreated != nil {
    log.Logf("Calling postNamespaceCreated for namespace %s", testNamespace.Name)
    input.PostNamespaceCreated(managementClusterProxy, testNamespace.Name)

@@ -421,6 +532,15 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg

workloadClusterName = fmt.Sprintf("%s-workload-%s", specName, util.RandomString(6))
workloadClusterNamespace := testNamespace.Name

// Register cleanup for the workload cluster immediately after naming it
// This ensures cleanup happens even if the test fails during cluster creation
DeferCleanup(func() {
    By("Cleaning up workload cluster (early cleanup)")
    if workloadClusterName != "" {
        cleanupRemainingKindClusters(ctx, workloadClusterName)
    }
})
kubernetesVersion := input.WorkloadKubernetesVersion
if kubernetesVersion == "" {
    kubernetesVersion = input.E2EConfig.MustGetVariable(KubernetesVersion)
@@ -775,58 +895,6 @@ func ClusterctlUpgradeSpec(ctx context.Context, inputGetter func() ClusterctlUpg

    By("PASSED!")
})

AfterEach(func() {
    if testNamespace != nil {
        // Dump all the logs from the workload cluster before deleting them.
        framework.DumpAllResourcesAndLogs(ctx, managementClusterProxy, input.ClusterctlConfigPath, input.ArtifactFolder, testNamespace, &clusterv1.Cluster{
            // DumpAllResourcesAndLogs only uses Namespace + Name from the Cluster object.
            ObjectMeta: metav1.ObjectMeta{
                Namespace: testNamespace.Name,
                Name: workloadClusterName,
            },
        })

        if !input.SkipCleanup {
            Byf("Deleting all clusters in namespace %s in management cluster %s", testNamespace.Name, managementClusterName)
            deleteAllClustersAndWait(ctx, deleteAllClustersAndWaitInput{
                Client: managementClusterProxy.GetClient(),
                Namespace: testNamespace.Name,
            }, input.E2EConfig.GetIntervals(specName, "wait-delete-cluster")...)

            Byf("Deleting namespace %s used for hosting the %q test", testNamespace.Name, specName)
            framework.DeleteNamespace(ctx, framework.DeleteNamespaceInput{
                Deleter: managementClusterProxy.GetClient(),
                Name: testNamespace.Name,
            })

            Byf("Deleting providers")
            clusterctl.Delete(ctx, clusterctl.DeleteInput{
                LogFolder: filepath.Join(input.ArtifactFolder, "clusters", managementClusterResources.Cluster.Name),
                ClusterctlConfigPath: input.ClusterctlConfigPath,
                KubeconfigPath: managementClusterProxy.GetKubeconfigPath(),
            })
        }
        testCancelWatches()
    }

    if input.PreCleanupManagementCluster != nil {
        By("Running PreCleanupManagementCluster steps against the management cluster")
        input.PreCleanupManagementCluster(managementClusterProxy)
    }

    // Dumps all the resources in the spec namespace, then cleanups the cluster object and the spec namespace itself.
    if input.UseKindForManagementCluster {
        dumpKindClusterLogs(ctx, input.ArtifactFolder, managementClusterProxy)

        if !input.SkipCleanup {
            managementClusterProxy.Dispose(ctx)
            managementClusterProvider.Dispose(ctx)
Comment on lines -823 to -824:

Do we know why this doesn't work in some cases? I would have expected it to work in cases where all tests pass (e.g. https://prow.k8s.io/view/gs/kubernetes-ci-logs/logs/periodic-cluster-api-e2e-mink8s-release-1-9/1951726318206849024).

Also, do we have a recent occurrence of this issue? I couldn't find one, but maybe my search is wrong: https://storage.googleapis.com/k8s-triage/index.html?text=Found%20unexpected%20running%20containers&job=.*cluster-api.*e2e.*main&xjob=.*-provider-.*

Hey @sbueringer - I tried a couple of times on a VM this time and I wasn't able to reproduce the failure. @arshadd-b - Have you seen the flake recently?
            }
        } else {
            framework.DumpSpecResourcesAndCleanup(ctx, specName, input.BootstrapClusterProxy, input.ClusterctlConfigPath, input.ArtifactFolder, managementClusterNamespace, managementClusterCancelWatches, managementClusterResources.Cluster, input.E2EConfig.GetIntervals, input.SkipCleanup)
        }
    })
}
|
||
func setupClusterctl(ctx context.Context, clusterctlBinaryURL, clusterctlConfigPath string) (string, string) { | ||
|
@@ -1211,3 +1279,50 @@ func getValueOrFallback(value []string, fallback []string) []string {
    }
    return fallback
}

// cleanupRemainingKindClusters cleans up any Kind clusters that might be left behind after the test.
// This is a safety net to ensure proper cleanup in case the normal cluster deletion doesn't clean up Kind clusters.
func cleanupRemainingKindClusters(ctx context.Context, testPrefix string) {
    if testPrefix == "" {
        return
    }

    // Get list of all Kind clusters
    cmd := exec.CommandContext(ctx, "kind", "get", "clusters")
    output, err := cmd.Output()
    if err != nil {
        log.Logf("Failed to get kind clusters: %v", err)
        return
    }

    clusters := strings.Split(strings.TrimSpace(string(output)), "\n")
    for _, cluster := range clusters {
        cluster = strings.TrimSpace(cluster)
        if cluster == "" {
            continue
        }

        // Check if this cluster is related to our test (either management or workload cluster)
        // We check for both the exact prefix and common test cluster patterns
        shouldDelete := strings.Contains(cluster, testPrefix) ||
            strings.HasPrefix(cluster, "clusterctl-upgrade-") ||
            strings.Contains(cluster, "-workload-") ||
            strings.Contains(cluster, "-management-")

        if shouldDelete {
            // Validate cluster name to prevent command injection
            if !isValidClusterName(cluster) {
                log.Logf("Skipping cluster %s due to invalid name format", cluster)
                continue
            }

            log.Logf("Found remaining kind cluster %s, cleaning it up", cluster)
            deleteCmd := exec.CommandContext(ctx, "kind", "delete", "cluster", "--name", cluster) // #nosec G204 - cluster name is validated
            if err := deleteCmd.Run(); err != nil {
                log.Logf("Failed to delete kind cluster %s: %v", cluster, err)
            } else {
                log.Logf("Successfully deleted kind cluster %s", cluster)
            }
        }
    }
}
I would prefer if we could avoid a huge refactoring like this. It took us a while to get to the current state, and there is a lot of nuance in all of this (e.g. different apiVersions being available in different test cases).
Can we try to fix the issue by only improving AfterEach?
I think it's also easier to reason through one AfterEach block than to figure out how all of this adds up in the various success and error cases.
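To make the single-AfterEach suggestion concrete, here is a minimal, hypothetical Ginkgo sketch (not Cluster API code; fakeResource, provider, and proxy are made-up names): all cleanup lives in one AfterEach, and every step is nil-guarded so the block stays safe no matter where the It stopped.

// Minimal sketch of one consolidated AfterEach with nil guards.
package cleanup_sketch_test

import (
    "testing"

    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
)

type fakeResource struct{ disposed bool }

func (f *fakeResource) Dispose() { f.disposed = true }

func TestCleanupSketch(t *testing.T) {
    RegisterFailHandler(Fail)
    RunSpecs(t, "cleanup sketch")
}

var _ = Describe("single AfterEach cleanup", func() {
    var provider, proxy *fakeResource

    It("creates resources step by step", func() {
        provider = &fakeResource{}
        // If the spec failed here, proxy would still be nil.
        proxy = &fakeResource{}
        Expect(proxy).NotTo(BeNil())
    })

    AfterEach(func() {
        // Clean up in reverse order of creation; every step is guarded so the
        // block can run regardless of how far the It got.
        if proxy != nil {
            By("disposing proxy")
            proxy.Dispose()
        }
        if provider != nil {
            By("disposing provider")
            provider.Dispose()
        }
    })
})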
I can give it another go for sure - I did observe that AfterEach wasn't being executed and thought it might be because of nested AfterEach.
I think a couple more days working on this may ring a bell.
If not, I'll keep the changes minimal.
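For contrast, here is a small hypothetical sketch (placeholder names and paths, not the PR's code) of the DeferCleanup pattern the refactoring leans on: each cleanup closure is registered right after the resource it owns is acquired, and Ginkgo runs the registered closures in reverse (LIFO) order once the spec ends, including when the spec fails partway through.

// Sketch of DeferCleanup registered next to each acquisition.
package defercleanup_sketch_test

import (
    "testing"

    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
)

func TestDeferCleanupSketch(t *testing.T) {
    RegisterFailHandler(Fail)
    RunSpecs(t, "defercleanup sketch")
}

var _ = Describe("DeferCleanup ordering", func() {
    It("registers cleanup next to each acquisition", func() {
        cluster := "sketch-management" // placeholder name
        DeferCleanup(func() {
            By("deleting cluster " + cluster) // runs second (LIFO)
        })

        kubeconfig := "/tmp/sketch-kubeconfig" // placeholder path
        DeferCleanup(func() {
            By("removing kubeconfig " + kubeconfig) // runs first (LIFO)
        })

        Expect(kubeconfig).NotTo(BeEmpty())
    })
})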
I think that should only happen if the test case is not started (we run a bunch of code before entering the test case; see cluster-api/test/e2e/e2e_suite_test.go, line 135 at d30d3e3).
Looking at #12578 (comment), I think your test is stuck in SynchronizedBeforeSuite. But it definitely looks like the clusterctl upgrade test is not started
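To illustrate the point, a hypothetical sketch (not the actual e2e_suite_test.go) of how SynchronizedBeforeSuite relates to per-spec cleanup: if the process-1 setup function hangs or fails, Ginkgo never starts the specs, so neither the It nor its AfterEach ever run.

// Sketch: suite-level setup gating all specs.
package suite_sketch_test

import (
    "testing"

    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
)

func TestSuiteSketch(t *testing.T) {
    RegisterFailHandler(Fail)
    RunSpecs(t, "suite sketch")
}

var _ = SynchronizedBeforeSuite(func() []byte {
    // Runs once on process 1 (think: creating the bootstrap cluster).
    // If this hangs or fails, no spec - and therefore no AfterEach - runs.
    return []byte("bootstrap-kubeconfig-path") // placeholder payload
}, func(data []byte) {
    // Runs on every parallel process with the payload from process 1.
    Expect(data).NotTo(BeEmpty())
})

var _ = Describe("clusterctl upgrade", func() {
    AfterEach(func() {
        By("cleanup only happens if the spec actually started")
    })

    It("upgrades providers", func() {
        Expect(true).To(BeTrue())
    })
})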