 import torch_tensorrt
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from contextlib import nullcontext
-from utils import export_llm, generate, recordStats, time_generate, generate_with_kv_cache
+from utils import export_llm, generate, recordStats, time_generate, generate_with_kv_cache, get_zeroed_kv_cache_inputs


 DEVICE = torch.device("cuda:0")
@@ -43,7 +43,7 @@ def get_model(args):
             args.model,
             use_cache=False,
             attn_implementation="sdpa",
-            # num_hidden_layers=1
+            num_hidden_layers=1
         )
         .eval()
         .cuda()
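Cutting the model down to a single decoder layer (the previously commented-out num_hidden_layers override) keeps the exported graph small while the KV-cache lowering is being debugged. A minimal sketch of the same idea, assuming an arbitrary Hugging Face causal-LM checkpoint; num_hidden_layers is forwarded by from_pretrained as a config override:

from transformers import AutoModelForCausalLM

# Load a truncated, single-layer variant of the checkpoint for fast iteration.
# The checkpoint name below is only a placeholder; any causal LM works.
model = (
    AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        use_cache=False,
        attn_implementation="sdpa",
        num_hidden_layers=1,
    )
    .eval()
    .cuda()
)
print(model.config.num_hidden_layers)  # -> 1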
@@ -194,9 +194,10 @@ def measure_perf(trt_model, input_signature, backend_name):
        help="Enable pytorch run (default: False)"
    )
    arg_parser.add_argument(
-        "--kv_cache",
-        action="store_true",
-        help="Enable kv_cache (default: False)"
+        "--cache",
+        type=str,
+        default="static",
+        help="Type of KV cache to use",
    )
    arg_parser.add_argument(
        "--cudagraph",
@@ -220,9 +221,9 @@ def measure_perf(trt_model, input_signature, backend_name):
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)

    prompt = "What is parallel programming ?"
+    # prompt = "What is the capital of France ?"
    model_inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = model_inputs["input_ids"].to(DEVICE)
-
    # Prepare input prompt
    # word = "What"
    # word_ids = tokenizer(word, return_tensors="pt").input_ids[0]  # Get the first (and only) sequence
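The string-valued --cache option added above replaces the old boolean --kv_cache flag so that more than one cache implementation can be selected from the command line; the branch on its value appears in the next hunk. A minimal sketch of how the option is consumed, mirroring that dispatch (the static_cache2 and dynamic_cache modules are the ones imported there and are assumed to be on the path; registering the lowering passes is a side effect of the import):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--cache",
    type=str,
    default="static",
    help="Type of KV cache to use",
)
args = parser.parse_args()

if args.cache == "static":
    # Importing registers the static KV-cache rewrite as a lowering pass.
    from static_cache2 import *
elif args.cache == "dynamic":
    # Importing registers the dynamic KV-cache rewrite as a lowering pass.
    from dynamic_cache import *
else:
    # Any other value compiles the model without a KV cache.
    pass

With this, the script is invoked with --cache static (the default) or --cache dynamic instead of the old --kv_cache switch.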
@@ -252,18 +253,67 @@ def measure_perf(trt_model, input_signature, backend_name):
    )

    # TRT
+    pyt_logits_tok1 = model.cuda()(input_ids)
+    next_tokens = torch.argmax(pyt_logits_tok1.logits[:, -1, :], dim=-1)
+    input_seq = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+    pyt_logits_tok2 = model.cuda()(input_seq)
    from lower_sdpa import *
-    if args.kv_cache:
-        # This import is required to register static/dynamic KV cache transformations as lowering passes
-        from static_cache import *
+    if args.cache == "static":
+        # This import is required to register static KV cache transformations as lowering passes
+        from static_cache2 import *
+        trt_model = compile_torchtrt(model, input_ids, args)
+        kv_cache = get_zeroed_kv_cache_inputs(trt_model)
+
+        # First token generation
+        pyt_keys = torch.load("key.pt"); pyt_values = torch.load("value.pt")
+        trt_logits, key_cache, value_cache, trt_keys_1, trt_values_1 = trt_model(input_ids.clone(), True, *kv_cache, 0, input_ids.shape[1])
+        print(f"Diff between pyt and trt logits: {torch.mean(torch.abs(pyt_logits_tok1.logits - trt_logits))}")
+        print(f"Diff between pyt and trt keys: {torch.mean(torch.abs(pyt_keys - trt_keys_1))}")
+        print(f"Diff between pyt and trt keys in cache: {torch.mean(torch.abs(pyt_keys - key_cache[:, :, :-2, :]))}")
+        print(f"Diff between pyt and trt values: {torch.mean(torch.abs(pyt_values - trt_values_1))}")
+        print(f"Diff between pyt and trt values in cache: {torch.mean(torch.abs(pyt_values - value_cache[:, :, :-2, :]))}")
+        next_tokens = torch.argmax(trt_logits[:, -1, :], dim=-1)
+
+        # Second token generation
+        trt_logits_2, key_cache2, value_cache2, trt_keys_2, trt_values_2 = trt_model(next_tokens[:, None], False, key_cache.clone(), value_cache.clone(), input_ids.shape[1], input_ids.shape[1] + 1)
+        pyt_keys2 = torch.load("key2.pt"); pyt_values2 = torch.load("value2.pt")
+        print(f"Diff between pyt and trt logits: {torch.mean(torch.abs(pyt_logits_tok2.logits[:, -1:, :] - trt_logits_2))}")
+        print(f"Diff between pyt and trt keys: {torch.mean(torch.abs(pyt_keys2[:, :, -2:-1, :] - trt_keys_2))}")
+        print(f"Diff between pyt and trt keys in cache: {torch.mean(torch.abs(pyt_keys2 - key_cache2[:, :, :-1, :]))}")
+        print(f"Diff between pyt and trt values: {torch.mean(torch.abs(pyt_values2[:, :, -2:-1, :] - trt_values_2))}")
+        print(f"Diff between pyt and trt values in cache: {torch.mean(torch.abs(pyt_values2 - value_cache2[:, :, :-1, :]))}")
+        breakpoint()
+    elif args.cache == "dynamic":
+        from dynamic_cache import *
        trt_model = compile_torchtrt(model, input_ids, args)
+        breakpoint()
+        kv_cache = get_zeroed_kv_cache_inputs(trt_model)
    else:
        # pyt_logits = model.cuda()(input_ids.clone())
        trt_model = compile_torchtrt(model, input_ids, args)
        # trt_logits = trt_model(input_ids.clone(), True)
        # print(f"Diff between pyt and trt: {torch.mean(torch.abs(pyt_logits - trt_logits))}")
        # print(f"Diff between pyt and trt logits: {torch.mean(torch.abs(pyt_logits.logits - trt_logits.logits))}")
-    if args.kv_cache:
+    if args.cache == "static":
+        if args.cudagraph:
+            # Run a decoding loop with prefill and generate phases so that the CUDAGraph is recorded for both of these phases.
+            # trt_input_signature = (input_ids.clone(),) + get_zeroed_kv_cache_inputs(trt_model)
+            torch_tensorrt.runtime.set_cudagraphs_mode(True)
+
+        trt_gen_tokens = generate_with_kv_cache(
+            trt_model, input_ids.clone(), MAX_OUTPUT_SEQ_LENGTH, tokenizer.eos_token_id,
+        )
+
+        if args.benchmark:
+            trt_timings = time_generate(
+                generate_with_kv_cache,
+                trt_model,
+                input_ids.clone(),
+                MAX_OUTPUT_SEQ_LENGTH,
+                tokenizer.eos_token_id,
+                iterations=args.iterations,
+            )
+    elif args.cache == "dynamic":
        if args.cudagraph:
            # Run a decoding loop with prefill and generate phases so that the CUDAGraph is recorded for both of these phases.
            # trt_input_signature = (input_ids.clone(),) + get_zeroed_kv_cache_inputs(trt_model)
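The checks above exercise the compiled model's positional signature directly: a prefill call (input_ids, True, *kv_cache, 0, seq_len) followed by a one-token step (next_token, False, key_cache, value_cache, seq_len, seq_len + 1), each returning logits plus the updated caches. A sketch of the greedy decode loop this convention implies follows; it is an assumption based only on those two calls, not the actual generate_with_kv_cache implementation from utils, and it presumes the single-layer debug configuration (one key and one value cache tensor):

import torch

def greedy_decode_with_kv_cache(trt_model, input_ids, max_output_seq_length, eos_token_id, kv_cache):
    # Prefill: run the whole prompt once, starting from zeroed caches.
    start, end = 0, input_ids.shape[1]
    logits, key_cache, value_cache, *_ = trt_model(input_ids, True, *kv_cache, start, end)
    next_token = torch.argmax(logits[:, -1, :], dim=-1)
    generated = torch.cat([input_ids, next_token[:, None]], dim=-1)

    # Decode: one token at a time, feeding the updated caches back in.
    while generated.shape[1] < max_output_seq_length and (next_token != eos_token_id).all():
        start, end = end, end + 1
        logits, key_cache, value_cache, *_ = trt_model(
            next_token[:, None], False, key_cache, value_cache, start, end
        )
        next_token = torch.argmax(logits[:, -1, :], dim=-1)
        generated = torch.cat([generated, next_token[:, None]], dim=-1)
    return generated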