Skip to content

Commit 995dcc0

Browse files
1: run DMF in parallel
1 parent c4aeaaa commit 995dcc0

File tree

5 files changed

+255
-21
lines changed

5 files changed

+255
-21
lines changed

solution/deps.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ sorbet:
113113
policy: sorbet/sorbet-policies
114114
dashboard: sorbet/sorbet-dashboards
115115
image: sorbet
116-
tag: v1.2.0-preview.1
116+
tag: 931169543e23666c99a20e1c679e7661ec1c50eb
117117
envsubst: SORBET_TAG
118118
stern: # tail any pod logs with pattern matching
119119
tag: 1.30.0

tests/ctst/HOW_TO_WRITE_TESTS.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,23 @@ possible. Solutions exist:
5959
relative checks.
6060
- As a last resort, we might have a dedicated test suite.
6161

62+
### Cold Storage Tests and Parallel Execution
63+
64+
Previously, `@ColdStorage` tests were forced to run sequentially due to shared
65+
DMF volume access. This has been resolved with the following improvements:
66+
67+
- **Bucket-specific file isolation**: The sorbet mock backend now uses S3 alias
68+
naming (`/cold-data/data/s3-aliases/{bucket}-{key}-{versionId}/`) which provides
69+
perfect isolation between parallel test runs.
70+
- **Intelligent file counting**: DMF volume checks now scan only for files
71+
belonging to the specific test's bucket name.
72+
- **Per-test cleanup**: Each test cleans up only its own files, preventing
73+
interference with parallel tests.
74+
75+
This means `@ColdStorage` tests can now run with full parallelization,
76+
significantly reducing test execution time. Please follow the rules for parallel
77+
execution if you are using `@ColdStorage` tests.
78+
6279
## 5. Focus on validating features.
6380

6481
We only want to assert against externally visible state, as given in the

tests/ctst/common/hooks.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ export const replicationLockTags = [
2727
const noParallelRun = atMostOnePicklePerTag([
2828
'@AfterAll',
2929
'@PRA',
30-
'@ColdStorage',
3130
...replicationLockTags
3231
]);
3332

tests/ctst/steps/dmf.ts

Lines changed: 111 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,110 @@
11
import { Then, Given, After } from '@cucumber/cucumber';
22
import assert from 'assert';
3-
import { execShellCommand } from 'common/utils';
43
import Zenko from 'world/Zenko';
4+
import { execInCluster } from './utils/kubernetes';
5+
import { Utils } from 'cli-testing';
56

6-
async function cleanDmfVolume() {
7-
await execShellCommand('rm -rf /cold-data/*');
7+
/**
8+
* Clean up S3 alias files for this specific bucket
9+
* @param world - The Zenko world object
10+
* @param bucketName - The name of the bucket to clean up
11+
* @returns void
12+
*/
13+
async function cleanDmfVolumeForBucket(world: Zenko, bucketName: string) {
14+
if (!bucketName) {
15+
return;
16+
}
17+
18+
const commands = [
19+
`find /cold-data/data/s3-aliases -name "${bucketName}-*" -type f -delete 2>/dev/null || true`,
20+
`find /cold-data/data/s3-aliases -name "${bucketName}-*" -type d -empty -delete 2>/dev/null || true`
21+
];
22+
23+
for (const command of commands) {
24+
await execInCluster(world, command);
25+
}
826
}
927

28+
/**
29+
* Check if the DMF volume contains the expected number of objects.
30+
* This requires sorbet mock backend with UseS3Naming=true.
31+
* Files are stored as: /cold-data/data/s3-aliases/{bucket}-{key}-{versionId}/content
32+
* This enables parallel test execution by providing bucket-level isolation
33+
* @param this - The Zenko world object
34+
* @param objectCount - The expected number of objects
35+
* @returns void
36+
*/
1037
Then('dmf volume should contain {int} objects',
11-
{ timeout: 2 * 60 * 1000 }, async (objectCount: number) => {
38+
{ timeout: 2 * 60 * 1000 }, async function (this: Zenko, objectCount: number) {
39+
const bucketName = this.getSaved<string>('bucketName');
40+
if (!bucketName) {
41+
throw new Error('bucketName not found in test context. Ensure bucket is created before this step.');
42+
}
43+
1244
let conditionOk = false;
13-
while (!conditionOk) {
14-
// Getting the number of objects inside the volume used
15-
// by the mock dmf to store transitioned objects
16-
const outStr = await execShellCommand('find /cold-data -type f | wc -l');
17-
// we store two files per object (content and manifest.json)
18-
conditionOk = Number(outStr) === objectCount * 2;
45+
let attempts = 0;
46+
const maxAttempts = 60;
47+
48+
while (!conditionOk && attempts < maxAttempts) {
49+
try {
50+
const outStr = await execInCluster(
51+
this,
52+
`find /cold-data/data/s3-aliases -name "${bucketName}-*" -type f | wc -l`
53+
);
54+
const fileCount = Number(outStr.trim());
55+
56+
// We expect 2 files per object (content + manifest.json)
57+
const expectedFileCount = objectCount * 2;
58+
conditionOk = fileCount === expectedFileCount;
59+
60+
if (!conditionOk) {
61+
this.logger.debug(`DMF volume check for bucket ${bucketName}`, {
62+
expected: expectedFileCount,
63+
found: fileCount,
64+
attempt: attempts + 1,
65+
maxAttempts
66+
});
67+
68+
if (attempts % 10 === 0) {
69+
const filesFound = await execInCluster(
70+
this,
71+
`find /cold-data/data/s3-aliases -name "${bucketName}-*" -type f 2>/dev/null`
72+
);
73+
this.logger.debug(`Files found for bucket ${bucketName}:`, { files: filesFound });
74+
}
75+
76+
await Utils.sleep(2000);
77+
attempts++;
78+
}
79+
} catch (error) {
80+
this.logger.error('Error checking DMF volume', { error, bucket: bucketName });
81+
throw error;
82+
}
83+
}
84+
85+
if (!conditionOk) {
86+
const finalCount = await execInCluster(
87+
this,
88+
`find /cold-data/data/s3-aliases -name "${bucketName}-*" -type f | wc -l`
89+
);
90+
const actualFiles = await execInCluster(
91+
this,
92+
`find /cold-data/data/s3-aliases -name "${bucketName}-*" -type f 2>/dev/null`
93+
);
94+
95+
assert.fail(
96+
`DMF volume should contain ${objectCount * 2} files for bucket ${bucketName}, ` +
97+
`but found ${finalCount.trim()} after ${attempts} attempts. ` +
98+
`Files found: ${actualFiles}`
99+
);
19100
}
20-
assert(conditionOk);
101+
102+
this.logger.debug(`DMF volume check passed for bucket ${bucketName}`, {
103+
expectedObjects: objectCount,
104+
foundFiles: objectCount * 2,
105+
attempts,
106+
maxAttempts,
107+
});
21108
});
22109

23110
Given('a flaky backend that will require {int} retries for {string}',
@@ -29,6 +116,17 @@ Given('a flaky backend that will require {int} retries for {string}',
29116
this.addToSaved('backendFlakiness', op);
30117
});
31118

32-
After({ tags: '@Dmf' }, async () => {
33-
await cleanDmfVolume();
119+
After({ tags: '@Dmf' }, async function (this: Zenko, results) {
120+
const bucketName = this.getSaved<string>('bucketName');
121+
122+
if (results.result?.status === 'FAILED') {
123+
this.logger.warn('DMF volume was not cleaned for failed test', {
124+
bucket: bucketName,
125+
reason: 'test failed - keeping files for debugging'
126+
});
127+
return;
128+
}
129+
130+
await cleanDmfVolumeForBucket(this, bucketName);
131+
this.logger.debug(`Cleaned DMF volume for bucket: ${bucketName}`);
34132
});

tests/ctst/steps/utils/kubernetes.ts

Lines changed: 126 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ export async function createJobAndWaitForCompletion(
174174
);
175175
});
176176
} catch (err: unknown) {
177-
world.logger.error('Error creating or waiting for job completion', {
177+
world.logger.debug('Error creating or waiting for job completion', {
178178
jobName,
179179
err,
180180
});
@@ -219,7 +219,7 @@ export async function createAndRunPod(
219219
resolve();
220220
} else if (phase === 'Failed') {
221221
clearTimeout(timeoutId);
222-
world.logger.error('Pod failed', {
222+
world.logger.debug('Pod failed', {
223223
podName,
224224
status: watchObj.object?.status
225225
});
@@ -248,7 +248,7 @@ export async function createAndRunPod(
248248

249249
return response.body;
250250
} catch (err: unknown) {
251-
world.logger.error('Failed to create and run pod:', { err });
251+
world.logger.debug('Failed to create and run pod:', { err });
252252
throw new Error(`Failed to create and run pod: ${err}`);
253253
}
254254
}
@@ -295,7 +295,7 @@ export async function waitForZenkoToStabilize(
295295
'zenkos',
296296
'end2end',
297297
).catch(err => {
298-
world.logger.error('Error getting Zenko CR', {
298+
world.logger.debug('Error getting Zenko CR', {
299299
err: err as unknown,
300300
});
301301
return null;
@@ -433,7 +433,7 @@ export async function displayCRStatus(world: Zenko, namespace = 'default') {
433433
'zenkos',
434434
'end2end',
435435
).catch(err => {
436-
world.logger.error('Error getting Zenko CR', {
436+
world.logger.debug('Error getting Zenko CR', {
437437
err: err as unknown,
438438
});
439439
return null;
@@ -524,7 +524,7 @@ export async function createSecret(
524524
const response = await coreClient.createNamespacedSecret(namespace, secret);
525525
return response;
526526
} catch (err) {
527-
world.logger.error('Error creating secret', {
527+
world.logger.debug('Error creating secret', {
528528
namespace,
529529
secret,
530530
err,
@@ -615,3 +615,123 @@ export async function getZenkoVersion(
615615
}
616616
}
617617

618+
/**
619+
* Execute a shell command in a pod with host volume access
620+
* Simplified to only support host path mounting for system volumes
621+
* @param world - The Zenko world object
622+
* @param command - The command to execute
623+
* @param options - The options for the command execution
624+
* @returns The output of the command
625+
*/
626+
export async function execCommandWithVolumeAccess(
627+
world: Zenko,
628+
command: string,
629+
options: {
630+
volumeMountPath?: string;
631+
hostPath?: string;
632+
image?: string;
633+
namespace?: string;
634+
timeout?: number;
635+
cleanup?: boolean;
636+
} = {}
637+
): Promise<string> {
638+
const {
639+
volumeMountPath = '/cold-data',
640+
hostPath = '/cold-data',
641+
image = 'alpine:3.22',
642+
namespace = 'default',
643+
timeout = 30000,
644+
cleanup = true,
645+
} = options;
646+
647+
// Generate unique pod name to prevent conflicts between concurrent tests
648+
const timestamp = Date.now();
649+
const randomId = Math.random().toString(36).substring(2, 8);
650+
const testContext = world.getSaved?.('bucketName') || 'test';
651+
const podName = `ctst-exec-${testContext}-${timestamp}-${randomId}`.toLowerCase();
652+
653+
const podManifest: V1Pod = {
654+
apiVersion: 'v1',
655+
kind: 'Pod',
656+
metadata: {
657+
name: podName,
658+
namespace,
659+
labels: {
660+
'app.kubernetes.io/name': 'ctst-command-executor',
661+
'app.kubernetes.io/component': 'test-utility',
662+
'ctst.test/execution-id': `${timestamp}-${randomId}`
663+
}
664+
},
665+
spec: {
666+
restartPolicy: 'Never',
667+
securityContext: {
668+
runAsNonRoot: false,
669+
fsGroup: 0
670+
},
671+
containers: [{
672+
name: 'executor',
673+
image,
674+
command: ['/bin/sh', '-c', command],
675+
securityContext: {
676+
runAsUser: 0,
677+
allowPrivilegeEscalation: false,
678+
readOnlyRootFilesystem: false,
679+
capabilities: {
680+
drop: ['ALL']
681+
}
682+
},
683+
volumeMounts: [{
684+
name: 'host-volume',
685+
mountPath: volumeMountPath
686+
}]
687+
}],
688+
volumes: [{
689+
name: 'host-volume',
690+
hostPath: {
691+
path: hostPath,
692+
type: 'DirectoryOrCreate'
693+
}
694+
}]
695+
}
696+
};
697+
698+
try {
699+
await createAndRunPod(world, podManifest, true, cleanup, timeout);
700+
701+
const coreClient = createKubeCoreClient(world);
702+
const logs = await coreClient.readNamespacedPodLog(podName, namespace);
703+
704+
return logs.body.trim();
705+
} catch (error) {
706+
world.logger.debug('Command execution failed', {
707+
command,
708+
podName,
709+
error: error instanceof Error ? error.message : String(error)
710+
});
711+
throw error;
712+
}
713+
}
714+
715+
/**
716+
* Execute command in Kubernetes cluster with host volume access
717+
* Designed for concurrent test execution without conflicts
718+
* Uses unique pod names and labels for isolation
719+
*/
720+
export async function execInCluster(
721+
world: Zenko,
722+
command: string,
723+
volumeOptions?: Parameters<typeof execCommandWithVolumeAccess>[2]
724+
): Promise<string> {
725+
world.logger.debug('Executing command in cluster', { command });
726+
727+
try {
728+
return await execCommandWithVolumeAccess(world, command, volumeOptions);
729+
} catch (error) {
730+
world.logger.debug('Kubernetes command execution failed', {
731+
command,
732+
error,
733+
});
734+
throw error;
735+
}
736+
}
737+

0 commit comments

Comments
 (0)