Commit dbfd7ee

Keep the extracted library, delete the downloaded wheel file, and restructure the test
1 parent 9cb3cab commit dbfd7ee

4 files changed: +136 -159 lines

py/torch_tensorrt/dynamo/utils.py

Lines changed: 79 additions & 65 deletions
@@ -8,15 +8,13 @@
 import tempfile
 import urllib.request
 import warnings
-from contextlib import contextmanager
 from dataclasses import fields, replace
 from enum import Enum
 from pathlib import Path
 from typing import (
     Any,
     Callable,
     Dict,
-    Iterator,
     List,
     Optional,
     Sequence,
@@ -864,40 +862,52 @@ def is_platform_supported_for_trtllm(platform: str) -> bool:
     return True


-@contextmanager
-def download_plugin_lib_path(platform: str) -> Iterator[str]:
-    """
-    Downloads (if needed) and extracts the TensorRT-LLM plugin wheel for the specified platform,
-    then yields the path to the extracted shared library (.so or .dll).
+def _cache_root() -> Path:
+    username = getpass.getuser()
+    return Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}"

-    The wheel file is cached in a user-specific temporary directory to avoid repeated downloads.
-    Extraction happens in a temporary directory that is cleaned up after use.

-    Args:
-        platform (str): The platform identifier string (e.g., 'linux_x86_64') to select the correct wheel.
+def _extracted_dir_trtllm(platform: str) -> Path:
+    return _cache_root() / "trtllm" / f"{__tensorrt_llm_version__}_{platform}"

-    Yields:
-        str: The full path to the extracted TensorRT-LLM shared library file.

-    Raises:
-        ImportError: If the 'zipfile' module is not available.
-        RuntimeError: If the wheel file is missing, corrupted, or extraction fails.
+def download_and_get_plugin_lib_path(platform: str) -> Optional[str]:
     """
-    plugin_lib_path = None
-    username = getpass.getuser()
-    torchtrt_cache_dir = Path(tempfile.gettempdir()) / f"torch_tensorrt_{username}"
-    torchtrt_cache_dir.mkdir(parents=True, exist_ok=True)
-    file_name = f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-{_WHL_CPYTHON_VERSION}-{platform}.whl"
-    torchtrt_cache_trtllm_whl = torchtrt_cache_dir / file_name
-    downloaded_file_path = torchtrt_cache_trtllm_whl
-
-    if not torchtrt_cache_trtllm_whl.exists():
-        # Downloading TRT-LLM lib
+    Returns the path to the TensorRT-LLM shared library, downloading and extracting it if necessary.
+
+    Args:
+        platform (str): Platform identifier (e.g., 'linux_x86_64')
+
+    Returns:
+        Optional[str]: Path to the shared library, or None if the operation fails.
+    """
+    wheel_filename = (
+        f"tensorrt_llm-{__tensorrt_llm_version__}-{_WHL_CPYTHON_VERSION}-"
+        f"{_WHL_CPYTHON_VERSION}-{platform}.whl"
+    )
+    wheel_path = _cache_root() / wheel_filename
+    extract_dir = _extracted_dir_trtllm(platform)
+    # Note: the .dll branch below is currently never taken, since only Linux platforms are supported
+    lib_filename = (
+        "libnvinfer_plugin_tensorrt_llm.so"
+        if "linux" in platform
+        else "libnvinfer_plugin_tensorrt_llm.dll"
+    )
+    # e.g. /tmp/torch_tensorrt_<username>/trtllm/0.17.0.post1_linux_x86_64/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so
+    plugin_lib_path = extract_dir / "tensorrt_llm" / "libs" / lib_filename
+
+    if plugin_lib_path.exists():
+        return str(plugin_lib_path)
+
+    wheel_path.parent.mkdir(parents=True, exist_ok=True)
+    extract_dir.mkdir(parents=True, exist_ok=True)
+
+    if not wheel_path.exists():
         base_url = "https://pypi.nvidia.com/tensorrt-llm/"
-        download_url = base_url + file_name
+        download_url = base_url + wheel_filename
         try:
-            logger.debug(f"Downloading {download_url} ...")
-            urllib.request.urlretrieve(download_url, downloaded_file_path)
+            logger.debug("Downloading %s ...", download_url)
+            urllib.request.urlretrieve(download_url, wheel_path)
             logger.debug("Download succeeded and TRT-LLM wheel is now present")
         except urllib.error.HTTPError as e:
             logger.error(
@@ -910,41 +920,45 @@ def download_plugin_lib_path(platform: str) -> Iterator[str]:
         except OSError as e:
             logger.error(f"Local file write error: {e}")

-    # Proceeding with the unzip of the wheel file in tmpdir
-    if "linux" in platform:
-        lib_filename = "libnvinfer_plugin_tensorrt_llm.so"
-    else:
-        # This condition is never met though
-        lib_filename = "libnvinfer_plugin_tensorrt_llm.dll"
+    try:
+        import zipfile
+    except ImportError as e:
+        raise ImportError(
+            "zipfile module is required but not found. Please install zipfile"
+        )
+    try:
+        with zipfile.ZipFile(wheel_path) as zip_ref:
+            zip_ref.extractall(extract_dir)
+        logger.debug(f"Extracted wheel to {extract_dir}")
+    except FileNotFoundError as e:
+        # This should capture the errors in the download failure above
+        logger.error(f"Wheel file not found at {wheel_path}: {e}")
+        raise RuntimeError(
+            f"Failed to find downloaded wheel file at {wheel_path}"
+        ) from e
+    except zipfile.BadZipFile as e:
+        logger.error(f"Invalid or corrupted wheel file: {e}")
+        raise RuntimeError(
+            "Downloaded wheel file is corrupted or not a valid zip archive"
+        ) from e
+    except Exception as e:
+        logger.error(f"Unexpected error while extracting wheel: {e}")
+        raise RuntimeError(
+            "Unexpected error during extraction of TensorRT-LLM wheel"
+        ) from e

-    with tempfile.TemporaryDirectory() as tmpdir:
-        try:
-            import zipfile
-        except ImportError:
-            raise ImportError(
-                "zipfile module is required but not found. Please install zipfile"
-            )
-        try:
-            with zipfile.ZipFile(downloaded_file_path, "r") as zip_ref:
-                zip_ref.extractall(tmpdir)  # Extract to a folder named 'tensorrt_llm'
-        except FileNotFoundError as e:
-            # This should capture the errors in the download failure above
-            logger.error(f"Wheel file not found at {downloaded_file_path}: {e}")
-            raise RuntimeError(
-                f"Failed to find downloaded wheel file at {downloaded_file_path}"
-            ) from e
-        except zipfile.BadZipFile as e:
-            logger.error(f"Invalid or corrupted wheel file: {e}")
-            raise RuntimeError(
-                "Downloaded wheel file is corrupted or not a valid zip archive"
-            ) from e
-        except Exception as e:
-            logger.error(f"Unexpected error while extracting wheel: {e}")
-            raise RuntimeError(
-                "Unexpected error during extraction of TensorRT-LLM wheel"
-            ) from e
-        plugin_lib_path = os.path.join(tmpdir, "tensorrt_llm/libs", lib_filename)
-        yield plugin_lib_path
+    try:
+        wheel_path.unlink(missing_ok=True)
+        logger.debug(f"Deleted wheel file: {wheel_path}")
+    except Exception as e:
+        logger.warning(f"Could not delete wheel file {wheel_path}: {e}")
+    if not plugin_lib_path.exists():
+        logger.error(
+            f"Plugin library not found at expected location: {plugin_lib_path}"
+        )
+        return None
+
+    return str(plugin_lib_path)


 def load_and_initialize_trtllm_plugin(plugin_lib_path: str) -> bool:
@@ -1034,6 +1048,6 @@ def load_tensorrt_llm_for_nccl() -> bool:
         )
         return False

-    with download_plugin_lib_path(platform) as plugin_lib_path:
-        return load_and_initialize_trtllm_plugin(plugin_lib_path)
+    plugin_lib_path = download_and_get_plugin_lib_path(platform)
+    return load_and_initialize_trtllm_plugin(plugin_lib_path)  # type: ignore[arg-type]
     return False
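
Note: a minimal usage sketch of the new caching behavior (not part of the diff; the platform string and paths are illustrative):

    from pathlib import Path

    from torch_tensorrt.dynamo.utils import download_and_get_plugin_lib_path

    # First call: downloads the wheel into /tmp/torch_tensorrt_<user>/, extracts it
    # under /tmp/torch_tensorrt_<user>/trtllm/<version>_linux_x86_64/, deletes the
    # wheel, and returns the extracted .so path (or None on failure).
    lib = download_and_get_plugin_lib_path("linux_x86_64")

    # Second call: the extracted library already exists, so the function returns
    # early without re-downloading or re-extracting anything.
    assert lib is None or Path(lib).exists()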

tests/py/dynamo/distributed/distributed_utils.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@ def set_environment_variables_pytest():
     os.environ["RANK"] = str(0)
     os.environ["MASTER_ADDR"] = "127.0.0.1"
     os.environ["MASTER_PORT"] = str(29500)
-    os.environ["USE_TRTLLM_PLUGINS"] = "1"


 def initialize_logger(rank, logger_file_name):

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 56 additions & 47 deletions
@@ -4,18 +4,42 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from conversion.harness import DispatchTestCase
 from distributed_utils import set_environment_variables_pytest
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
 from torch_tensorrt._enums import Platform

-set_environment_variables_pytest()
-dist.init_process_group(backend="nccl", init_method="env://")
-group = dist.new_group(ranks=[0])
-group_name = group.group_name
-world_size = 1

-from conversion.harness import DispatchTestCase
+class DistributedGatherModel(nn.Module):
+    def __init__(self, input_dim, world_size, group_name):
+        super().__init__()
+        self.fc = nn.Linear(input_dim, input_dim)
+        self.world_size = world_size
+        self.group_name = group_name
+
+    def forward(self, x):
+        x = self.fc(x)
+        gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor(
+            x, self.world_size, self.group_name
+        )
+        return torch.ops._c10d_functional.wait_tensor(gathered_tensor)
+
+
+class DistributedReduceScatterModel(nn.Module):
+    def __init__(self, input_dim, world_size, group_name):
+        super().__init__()
+        self.fc = nn.Linear(input_dim, input_dim)
+        self.world_size = world_size
+        self.group_name = group_name
+
+    def forward(self, x):
+        x = self.fc(x)
+        out = torch.ops._c10d_functional.reduce_scatter_tensor(
+            x, "sum", self.world_size, self.group_name
+        )
+        return torch.ops._c10d_functional.wait_tensor(out)
+

 platform_str = str(Platform.current_platform()).lower()

@@ -25,64 +49,49 @@ class TestGatherNcclOpsConverter(DispatchTestCase):
         "win" or "aarch64" in platform_str,
         "Skipped on Windows and Jetson: NCCL backend is not supported.",
     )
+    @classmethod
+    def setUpClass(cls):
+        set_environment_variables_pytest()
+        print("USE_TRTLLM_PLUGINS =", os.environ.get("USE_TRTLLM_PLUGINS"))
+        cls.world_size = 1
+        if not dist.is_initialized():
+            dist.init_process_group(
+                backend="nccl",
+                init_method="env://",
+                world_size=cls.world_size,
+                rank=0,  # or read from env
+            )
+        cls.group = dist.new_group(ranks=[0])
+        cls.group_name = cls.group.group_name
+
+    @classmethod
+    def tearDownClass(cls):
+        if dist.is_initialized():
+            dist.destroy_process_group()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
-        class DistributedGatherModel(nn.Module):
-            def __init__(self, input_dim):
-                super().__init__()
-                self.fc = torch.nn.Linear(input_dim, input_dim)
-
-            def forward(self, x):
-                x = self.fc(x)
-                gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor(
-                    x, world_size, group_name
-                )
-                gathered_tensor = torch.ops._c10d_functional.wait_tensor(
-                    gathered_tensor
-                )
-                return gathered_tensor
-
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
         self.run_test(
-            DistributedGatherModel(linear_layer_dim).cuda(),
+            DistributedGatherModel(
+                linear_layer_dim, self.world_size, self.group_name
+            ).cuda(),
             inputs,
             use_dynamo_tracer=True,
             enable_passes=True,
         )

-    @unittest.skipIf(
-        "win" or "aarch64" in platform_str,
-        "Skipped on Windows and Jetson: NCCL backend is not supported.",
-    )
     @parameterized.expand([8])
     def test_nccl_ops_scatter(self, linear_layer_dim):
-
-        class DistributedReduceScatterModel(nn.Module):
-            def __init__(self, input_dim):
-                super().__init__()
-                self.fc = torch.nn.Linear(input_dim, input_dim)
-
-            def forward(self, x):
-                x = self.fc(x)
-                scatter_reduce_tensor = (
-                    torch.ops._c10d_functional.reduce_scatter_tensor(
-                        x, "sum", world_size, group_name
-                    )
-                )
-                scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor(
-                    scatter_reduce_tensor
-                )
-                return scatter_reduce_tensor
-
         inputs = [torch.zeros(1, linear_layer_dim).to("cuda")]
-
         self.run_test(
-            DistributedReduceScatterModel(linear_layer_dim).cuda(),
+            DistributedReduceScatterModel(
+                linear_layer_dim, self.world_size, self.group_name
+            ).cuda(),
             inputs,
             use_dynamo_tracer=True,
             enable_passes=True,
         )
-        dist.destroy_process_group()


 if __name__ == "__main__":
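
Note: with the process group now managed in setUpClass/tearDownClass, the models are plain module-level classes. A hedged usage sketch (single rank; assumes RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT are set, a CUDA device and an NCCL build of PyTorch are available; the import path is illustrative):

    import torch
    import torch.distributed as dist

    from test_nccl_ops import DistributedGatherModel

    if not dist.is_initialized():
        dist.init_process_group(backend="nccl", init_method="env://", world_size=1, rank=0)
    group = dist.new_group(ranks=[0])

    # Group metadata is passed in explicitly instead of read from module globals.
    model = DistributedGatherModel(8, world_size=1, group_name=group.group_name).cuda()
    out = model(torch.randn(1, 8, device="cuda"))  # all-gather over a 1-rank group
    assert out.shape == (1, 8)

    dist.destroy_process_group()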

tests/py/dynamo/distributed/test_nccl_ops.sh

Lines changed: 1 addition & 46 deletions
@@ -70,51 +70,6 @@ ensure_pytest_installed(){
 
 echo "Setting up the environment"
 
-OS="$(uname -s)"
-ARCH="$(uname -m)"
-
-
-#getting the file name for TensorRT-LLM download
-if [[ "$OS" == "Linux" && "$ARCH" == "x86_64"]]; then
-    FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_x86_64.whl"
-elif [[ "$OS" == "Linux" && "$ARCH" == "aarch64"]]; then
-    FILE="tensorrt_llm-0.17.0.post1-cp312-cp312-linux_aarch64.whl"
-else:
-    echo "Unsupported platform: OS=$OS ARCH=$ARCH
-    exit 1
-fi
-
-# Download the selected file
-URL="https://pypi.nvidia.com/tensorrt-llm/$FILE"
-echo "Downloading $FILE from $URL..."
-
-#Installing wget
-ensure_installed wget
-
-#Downloading the file
-filename=$(basename "$URL")
-if [ -f "$filename" ]; then
-    echo "File already exists: $filename"
-else
-    wget "$URL"
-fi
-echo "Download complete: $FILE"
-
-UNZIP_DIR="tensorrt_llm_unzip"
-if [[ ! -d "$UNZIP_DIR" ]]; then
-    echo "Creating directory: $UNZIP_DIR"
-    mkdir -p "$UNZIP_DIR"
-    echo "extracting $FILE to $UNZIP_DIR ..."
-    #Installing unzip
-    ensure_installed unzip
-    #unzip the TensorRT-LLM package
-    unzip -q "$FILE" -d "$UNZIP_DIR"
-    echo "Unzip complete"
-fi
-
-
-export TRTLLM_PLUGINS_PATH="$(pwd)/${UNZIP_DIR}/tensorrt_llm/libs/libnvinfer_plugin_tensorrt_llm.so"
-echo ${TRTLLM_PLUGINS_PATH}
 
 ensure_mpi_installed libmpich-dev
 ensure_mpi_installed libopenmpi-dev
@@ -123,7 +78,7 @@ run_tests() {
     cd ..
     export PYTHONPATH=$(pwd)
     echo "Running pytest on distributed/test_nccl_ops.py..."
-    pytest distributed/test_nccl_ops.py
+    USE_TRTLLM_PLUGINS=1 pytest distributed/test_nccl_ops.py
 }

 run_mpi_tests(){
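
Note: the same opt-in can be expressed from Python, mirroring the USE_TRTLLM_PLUGINS=1 pytest line above (a sketch; the working directory is assumed to match the script's):

    import os
    import subprocess

    env = dict(os.environ, USE_TRTLLM_PLUGINS="1")  # opt in only for this run
    subprocess.run(
        ["pytest", "distributed/test_nccl_ops.py"],
        env=env,
        check=True,  # propagate a non-zero pytest exit code
    )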
