Add integration test for Firecracker with clippy-tracing instrumentation

demoncoder-crypto · demoncoder-crypto · commit 0595ad0c5f92 · 2025-06-09T20:49:03.000+05:30
diff --git a/tests/integration_tests/functional/test_firecracker_tracing.py b/tests/integration_tests/functional/test_firecracker_tracing.py
@@ -0,0 +1,305 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Test that Firecracker works correctly when instrumented with tracing and trace level logs are enabled.
+
+This test addresses the GitHub issue requesting a test that checks whether simple integration
+tests work when Firecracker is instrumented with src/clippy-tracing and trace-level logs are enabled.
+
+The existing test_log_instrument.py only tests the log-instrument crate with example programs.
+This test verifies that the log-instrument crate can successfully instrument the actual Firecracker
+binary and that the instrumented Firecracker works correctly with trace-level logging.
+
+Test Coverage:
+1. Uses clippy-tracing tool to add instrumentation to key Firecracker source files
+2. Builds Firecracker with --features tracing to enable the log-instrument functionality  
+3. Spawns instrumented Firecracker with trace-level logging enabled
+4. Performs basic API operations (describe_instance, machine_config, logger config)
+5. Verifies that:
+   - Firecracker starts and responds to API calls correctly
+   - Trace-level logs are generated
+   - log-instrument traces (function entry/exit) appear in logs with expected format
+   - Meaningful functions from main, API server, and VMM are being traced
+   - Runtime log level filtering works correctly
+   - Performance impact is reasonable (basic smoke test)
+
+The test ensures that instrumentation doesn't break Firecracker's core functionality while
+providing the tracing capabilities needed for debugging deadlocks and performance issues.
+"""
+
+import os
+import re
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from framework import utils
+from framework.microvm import MicroVMFactory
+from framework.properties import global_props
+from host_tools.cargo_build import cargo, get_binary
+
+
+def build_instrumented_firecracker():
+    """Instrument selected sources with clippy-tracing and build Firecracker.
+
+    Only the main entry point, the API server and two VMM files are
+    instrumented; limiting the set is meant to avoid performance issues.
+
+    Returns the result of ``get_binary("firecracker")``.
+    """
+    tool = get_binary("clippy-tracing")
+    targets = (
+        "./src/firecracker/src/main.rs",
+        "./src/firecracker/src/api_server",
+        "./src/vmm/src/lib.rs",
+        "./src/vmm/src/builder.rs",
+    )
+    path_flags = " ".join(f"--path {target}" for target in targets)
+    # Step 1: let clippy-tracing add the instrumentation to those paths.
+    utils.check_output(f"{tool} --action fix {path_flags}")
+    # Step 2: build with the feature that enables log-instrument.
+    cargo("build", "--features tracing --bin firecracker")
+    return get_binary("firecracker")
+
+
+def cleanup_instrumentation():
+    """Strip the clippy-tracing instrumentation from the instrumented sources.
+
+    Runs the tool's ``strip`` action over the same four paths that
+    ``build_instrumented_firecracker`` passes to the ``fix`` action.
+    """
+    tool = get_binary("clippy-tracing")
+    targets = (
+        "./src/firecracker/src/main.rs",
+        "./src/firecracker/src/api_server",
+        "./src/vmm/src/lib.rs",
+        "./src/vmm/src/builder.rs",
+    )
+    utils.check_output(f"{tool} --action strip " + " ".join(f"--path {t}" for t in targets))
+
+
+@pytest.fixture(scope="module")
+def instrumented_firecracker_binary():
+    """Module-scoped fixture yielding the instrumented Firecracker binary."""
+    try:
+        yield build_instrumented_firecracker()
+    finally:
+        # Runs at teardown and also when the build raised, so sources get stripped.
+        cleanup_instrumentation()
+
+
+def test_firecracker_tracing_basic_functionality(instrumented_firecracker_binary):
+    """Check that instrumented Firecracker serves API calls and logs traces.
+
+    The microVM is configured and spawned (never booted) with trace-level
+    logging, a handful of API requests are issued, and the log is then
+    searched for log-instrument function entry (``>>``) and exit (``<<``)
+    records coming from the instrumented code.
+
+    Args:
+        instrumented_firecracker_binary: module fixture providing the
+            Firecracker binary built with the ``tracing`` feature.
+    """
+    factory = MicroVMFactory(instrumented_firecracker_binary)
+    vm = factory.build()
+
+    # NOTE(review): basic_config() runs before spawn() here - confirm the
+    # framework supports configuring a microVM that is not spawned yet.
+    vm.basic_config(vcpu_count=1, mem_size_mib=128)
+
+    # Request level and origin in the log (presumably needed for "TRACE").
+    vm.spawn(log_level="Trace", log_show_level=True, log_show_origin=True)
+
+    try:
+        # NOTE(review): private framework helper; presumably blocks until
+        # the API socket exists - confirm spawn() does not already do so.
+        vm._wait_for_api_socket()
+
+        # Read-only requests issued before the logger is reconfigured.
+        response = vm.api.describe_instance.get()
+        assert response.status_code == 200
+
+        response = vm.api.machine_config.get()
+        assert response.status_code == 200
+
+        # Explicitly (re)select trace level through the logger endpoint.
+        response = vm.api.logger.put(level="Trace")
+        assert response.status_code == 204
+
+        # One more request so that something is traced after the change.
+        response = vm.api.describe_instance.get()
+        assert response.status_code == 200
+
+        # The microVM was never started, so its state must not have moved.
+        assert vm.state == "Not started"
+    finally:
+        # Always terminate the process, even when an assertion failed.
+        vm.kill()
+
+    log_data = vm.log_data
+
+    assert "TRACE" in log_data, "Expected TRACE level logs in output"
+
+    # log-instrument records look like ThreadId(X)>>function_name on entry
+    # and ThreadId(X)<<function_name on exit.
+    trace_pattern = r"ThreadId\(\d+\)(?:::[^>]*)?(?:>>|<<)\w+"
+    trace_matches = re.findall(trace_pattern, log_data)
+
+    assert trace_matches, (
+        "Expected to find log-instrument traces in logs, but found none. "
+        f"Log data: {log_data[:1000]}..."
+    )
+
+    # Both directions must show up: functions are entered and left again.
+    entry_traces = [match for match in trace_matches if ">>" in match]
+    exit_traces = [match for match in trace_matches if "<<" in match]
+
+    assert entry_traces, "Expected to find function entry traces (>>)"
+    assert exit_traces, "Expected to find function exit traces (<<)"
+
+    # At least one trace has to mention main, API handling or the VMM;
+    # the comparison is case-insensitive on the trace text.
+    meaningful_functions = ["main", "api", "vmm", "request", "response"]
+    found_meaningful = any(
+        func in trace.lower()
+        for trace in trace_matches
+        for func in meaningful_functions
+    )
+
+    assert found_meaningful, (
+        "Expected to find traces from meaningful functions, "
+        f"but traces were: {trace_matches[:10]}"
+    )
+
+
+def test_firecracker_tracing_with_vm_lifecycle(instrumented_firecracker_binary):
+    """Exercise several API endpoints on an instrumented, unbooted microVM.
+
+    The guest is never started; the test only drives the logger, metrics
+    and machine-config endpoints and then checks that instrumentation
+    traces ended up in the log.
+
+    Args:
+        instrumented_firecracker_binary: module fixture providing the
+            Firecracker binary built with the ``tracing`` feature.
+    """
+    # Bail out early on hosts whose kernel is older than 4.14.
+    if global_props.host_linux_version_tup < (4, 14):
+        pytest.skip("Requires Linux kernel >= 4.14")
+
+    vm = MicroVMFactory(instrumented_firecracker_binary).build()
+
+    # Minimal resources and no root block device.
+    # NOTE(review): basic_config() precedes spawn() - confirm the framework
+    # allows configuring before the process exists.
+    vm.basic_config(vcpu_count=1, mem_size_mib=128, add_root_device=False)
+
+    # Trace level is requested right from process start.
+    vm.spawn(log_level="Trace")
+
+    try:
+        # Select trace level again through the logger endpoint.
+        status = vm.api.logger.put(level="Trace").status_code
+        assert status == 204
+
+        # Fetching metrics exercises the metrics system.
+        assert vm.get_all_metrics() is not None
+
+        # Re-apply the machine configuration through the API.
+        status = vm.api.machine_config.put(
+            vcpu_count=1,
+            mem_size_mib=128,
+            smt=False,
+        ).status_code
+        assert status == 204
+    finally:
+        vm.kill()
+
+    # The log is read only after the process has been killed.
+    log_data = vm.log_data
+
+    # Trace-level output must have been produced at some point.
+    assert "TRACE" in log_data
+
+    # Entry/exit records emitted by the instrumentation must be present.
+    trace_pattern = r"ThreadId\(\d+\)(?:::[^>]*)?(?:>>|<<)\w+"
+    trace_matches = re.findall(trace_pattern, log_data)
+
+    assert trace_matches, "Expected instrumentation traces during VM lifecycle"
+
+
+def test_firecracker_tracing_performance_impact():
+    """Smoke-test that instrumented Firecracker starts and answers promptly.
+
+    This is not a benchmark: it only catches hangs or gross slowdowns by
+    bounding the time needed to configure, spawn and query the microVM.
+    The binary is built here and the sources are stripped again afterwards.
+    """
+    import time
+
+    instrumented_binary = build_instrumented_firecracker()
+
+    try:
+        vm = MicroVMFactory(instrumented_binary).build()
+
+        # monotonic() cannot jump when the wall clock is adjusted.
+        start_time = time.monotonic()
+
+        vm.basic_config(vcpu_count=1, mem_size_mib=128, add_root_device=False)
+        vm.spawn(log_level="Trace")
+
+        try:
+            for _ in range(5):
+                response = vm.api.describe_instance.get()
+                assert response.status_code == 200
+
+            elapsed = time.monotonic() - start_time
+
+            # 30 seconds is deliberately generous; see the docstring.
+            assert elapsed < 30, f"Instrumented Firecracker took too long to start and handle API calls: {elapsed}s"
+        finally:
+            # Kill the process even when one of the assertions above fails.
+            vm.kill()
+    finally:
+        # Leave the source tree free of instrumentation in every case.
+        cleanup_instrumentation()
+
+
+def test_trace_log_filtering():
+    """Check that trace records appear only after the level is raised.
+
+    Firecracker is spawned at ``Info`` level, switched to ``Trace`` through
+    the logger endpoint, and the log written before and after the switch
+    is compared. The binary is built here and stripped again afterwards.
+    """
+    instrumented_binary = build_instrumented_firecracker()
+
+    try:
+        vm = MicroVMFactory(instrumented_binary).build()
+
+        vm.basic_config(vcpu_count=1, mem_size_mib=128, add_root_device=False)
+        vm.spawn(log_level="Info")
+
+        try:
+            # Snapshot of everything logged while still at Info level.
+            initial_log_data = vm.log_data
+            assert "TRACE" not in initial_log_data, "Unexpected TRACE logs at Info level"
+
+            # Raise the level at runtime through the logger endpoint.
+            response = vm.api.logger.put(level="Trace")
+            assert response.status_code == 204
+
+            # Generate traffic that the instrumented API path can trace.
+            for _ in range(3):
+                response = vm.api.describe_instance.get()
+                assert response.status_code == 200
+
+            # Only inspect what was appended after the snapshot.
+            new_log_data = vm.log_data[len(initial_log_data):]
+            assert "TRACE" in new_log_data, "Expected TRACE logs after setting log level to Trace"
+        finally:
+            vm.kill()
+    finally:
+        # Leave the source tree free of instrumentation in every case.
+        cleanup_instrumentation()