Commit 352be40

FindHao authored and facebook-github-bot committed
Add internal jagged_dense_dense_sum op
Summary: This diff adds the internal jagged_dense_dense_elementwise_add_jagged_output_forward operator. It uses the internal input shapes, which is the first step toward utilizing durin data in tritonbench.

Scuba table for this op: https://fburl.com/scuba/gpu_kernel_stats/huxds0z5

Reviewed By: xuzhao9

Differential Revision: D71073294

fbshipit-source-id: 3ea4a9eca1d4cb4c8c43664b9d2d505b202def14
1 parent 73c9b75 commit 352be40
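The kernel and the production input metadata live in internal fb modules, so only the tritonbench wiring appears in the diff below. As rough orientation, here is a minimal pure-PyTorch sketch of the semantics the operator name suggests (jagged values plus the matching rows of two dense tensors, added elementwise, with a jagged output). The function name, the offsets layout, and the shapes are assumptions for illustration, not the internal implementation.

import torch

def jagged_dense_dense_add_reference(jagged_values, offsets, dense_0, dense_1):
    # Assumed layout, for illustration only: rows offsets[b]:offsets[b+1] of
    # jagged_values belong to batch b and line up with the first
    # (offsets[b+1] - offsets[b]) rows of dense_0[b] and dense_1[b].
    out = jagged_values.clone()
    for b in range(offsets.numel() - 1):
        start, end = offsets[b].item(), offsets[b + 1].item()
        out[start:end] += dense_0[b, : end - start] + dense_1[b, : end - start]
    return out

# Hypothetical shapes: B=2, max_seq_len=3, M=4, sequence lengths [2, 3].
offsets = torch.tensor([0, 2, 5])
jagged_values = torch.randn(5, 4)
dense_0 = torch.randn(2, 3, 4)
dense_1 = torch.randn(2, 3, 4)
out = jagged_dense_dense_add_reference(jagged_values, offsets, dense_0, dense_1)  # shape (5, 4)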

File tree

6 files changed: +144 −26 lines

tritonbench/data/__init__.py

Whitespace-only changes.

tritonbench/operators/jagged_softmax/operator.py

+43 −13

@@ -18,6 +18,7 @@
     get_styles,
     get_tensor_bytes_limit,
     GIGABYTES_PER_BYTE,
+    jagged_to_nested_tensor,
     RANDOM_CHOICE_MARGIN,
     RELATIVE_TOLERANCE,
 )
@@ -178,20 +179,49 @@ def get_input_iter(self) -> Generator:
         """
         Generate random nested tensors of shape (B, *, M), where * is the ragged dimension
         """
+        if not self.prod_shapes:
+            B_vals, M_vals, seqlen_vals, sparsity_vals = self.get_x_vals()
+
+            for nt, B, M, max_seqlen, sparsity in generate_random_nested_tensors(
+                B_vals,
+                M_vals,
+                seqlen_vals,
+                sparsity_vals,
+                device=self.device,
+                dtype=self.dtype,
+                TENSOR_BYTES_LIMIT=self.tensor_bytes_limit,
+                RANDOM_CHOICE_MARGIN=RANDOM_CHOICE_MARGIN,
+            ):
+                yield (nt, B, M, max_seqlen, sparsity)
+        else:
+            from tritonbench.data.fb.jagged_dense_dense import (
+                generate_input_vals_fb,
+                get_prod_input_metadata,
+            )
 
-        B_vals, M_vals, seqlen_vals, sparsity_vals = self.get_x_vals()
-
-        for nt, B, M, max_seqlen, sparsity in generate_random_nested_tensors(
-            B_vals,
-            M_vals,
-            seqlen_vals,
-            sparsity_vals,
-            device=self.device,
-            dtype=self.dtype,
-            TENSOR_BYTES_LIMIT=self.tensor_bytes_limit,
-            RANDOM_CHOICE_MARGIN=RANDOM_CHOICE_MARGIN,
-        ):
-            yield (nt, B, M, max_seqlen, sparsity)
+            input_data = get_prod_input_metadata()
+            for (
+                jagged_values_shape,
+                dense_0_shape,
+                dense_1_shape,
+                jagged_values_dtype,
+                dense_0_dtype,
+                dense_1_dtype,
+            ) in input_data:
+                jagged_values, jagged_offsets, _, _ = generate_input_vals_fb(
+                    jagged_values_shape,
+                    dense_0_shape=dense_0_shape,
+                    dense_1_shape=dense_1_shape,
+                    jagged_values_dtype=jagged_values_dtype,
+                    dense_0_dtype=dense_0_dtype,
+                    dense_1_dtype=dense_1_dtype,
+                )
+                nested_tensor = jagged_to_nested_tensor(jagged_values, jagged_offsets)
+                # Yueming: in the future, if we integrate more input shapes for other jagged operators,
+                # the dense_0 may be None. In that case, we should use another way to obtain the batch size
+                # and max seq len.
+                batch_size, max_seq_len, _ = dense_0_shape
+                yield (nested_tensor, batch_size, 1, max_seq_len, 0.0)
 
     def _get_accuracy(self, fn: Callable, baseline_fn: Callable) -> bool:
         output = fn()
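For context on the production branch above: get_prod_input_metadata and generate_input_vals_fb live in the internal tritonbench.data.fb package, so the actual records are not shown here. A hypothetical metadata entry, following the tuple layout the loop destructures, might look like the sketch below (all numbers and dtypes are made up):

import torch

# Hypothetical entry of get_prod_input_metadata(); values are illustrative only.
entry = (
    (12345, 256),    # jagged_values_shape: (total_rows, M)
    (64, 200, 256),  # dense_0_shape: (batch_size, max_seq_len, M)
    (64, 200, 256),  # dense_1_shape
    torch.bfloat16,  # jagged_values_dtype
    torch.bfloat16,  # dense_0_dtype
    torch.bfloat16,  # dense_1_dtype
)
# For such an entry, get_input_iter would yield
# (nested_tensor, batch_size=64, 1, max_seq_len=200, 0.0).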

tritonbench/operators/jagged_sum/operator.py

+43 −13

@@ -18,6 +18,7 @@
     get_styles,
     get_tensor_bytes_limit,
     GIGABYTES_PER_BYTE,
+    jagged_to_nested_tensor,
     RANDOM_CHOICE_MARGIN,
     RELATIVE_TOLERANCE,
 )
@@ -192,20 +193,49 @@ def get_input_iter(self) -> Generator:
         """
         Generate random nested tensors of shape (B, *, M), where * is the ragged dimension
         """
+        if not self.prod_shapes:
+            B_vals, M_vals, seqlen_vals, sparsity_vals = self.get_x_vals()
+
+            for nt, B, M, max_seqlen, sparsity in generate_random_nested_tensors(
+                B_vals,
+                M_vals,
+                seqlen_vals,
+                sparsity_vals,
+                device=self.device,
+                dtype=self.dtype,
+                TENSOR_BYTES_LIMIT=self.tensor_bytes_limit,
+                RANDOM_CHOICE_MARGIN=RANDOM_CHOICE_MARGIN,
+            ):
+                yield (nt, B, M, max_seqlen, sparsity)
+        else:
+            from tritonbench.data.fb.jagged_dense_dense import (
+                generate_input_vals_fb,
+                get_prod_input_metadata,
+            )
 
-        B_vals, M_vals, seqlen_vals, sparsity_vals = self.get_x_vals()
-
-        for nt, B, M, max_seqlen, sparsity in generate_random_nested_tensors(
-            B_vals,
-            M_vals,
-            seqlen_vals,
-            sparsity_vals,
-            device=self.device,
-            dtype=self.dtype,
-            TENSOR_BYTES_LIMIT=self.tensor_bytes_limit,
-            RANDOM_CHOICE_MARGIN=RANDOM_CHOICE_MARGIN,
-        ):
-            yield (nt, B, M, max_seqlen, sparsity)
+            input_data = get_prod_input_metadata()
+            for (
+                jagged_values_shape,
+                dense_0_shape,
+                dense_1_shape,
+                jagged_values_dtype,
+                dense_0_dtype,
+                dense_1_dtype,
+            ) in input_data:
+                jagged_values, jagged_offsets, _, _ = generate_input_vals_fb(
+                    jagged_values_shape,
+                    dense_0_shape=dense_0_shape,
+                    dense_1_shape=dense_1_shape,
+                    jagged_values_dtype=jagged_values_dtype,
+                    dense_0_dtype=dense_0_dtype,
+                    dense_1_dtype=dense_1_dtype,
+                )
+                nested_tensor = jagged_to_nested_tensor(jagged_values, jagged_offsets)
+                # Yueming: in the future, if we integrate more input shapes for other jagged operators,
+                # the dense_0 may be None. In that case, we should use another way to obtain the batch size
+                # and max seq len.
+                batch_size, max_seq_len, _ = dense_0_shape
+                yield (nested_tensor, batch_size, 1, max_seq_len, 0.0)
 
     def _get_accuracy(self, fn: Callable, baseline_fn: Callable) -> bool:
         output = fn()

tritonbench/utils/jagged_utils.py

+51

@@ -133,6 +133,57 @@ def generate_input_vals(B, M, max_seqlen, sparsity, sizes):
     return B_vals, M_vals, seqlen_vals, sparsity_vals
 
 
+def jagged_to_nested_tensor(values: torch.Tensor, offsets: list[torch.Tensor]):
+    """
+    Convert jagged tensor (values + offsets) to torch.nested.nested_tensor
+
+    Args:
+        values: Compressed values tensor
+        offsets: List of offset tensors, indicating the starting position of each sequence
+
+    Returns:
+        Tensor in torch.nested.nested_tensor format
+    """
+    # Calculate the length of each sequence
+    lengths = []
+    for i in range(len(offsets)):
+        if i == 0:
+            # For the first layer, calculate the length of each batch
+            batch_size = offsets[i].size(0) - 1
+            batch_lengths = []
+            for b in range(batch_size):
+                batch_lengths.append(offsets[i][b + 1] - offsets[i][b])
+            lengths.append(batch_lengths)
+        else:
+            # For deeper levels of nesting
+            prev_lengths = lengths[i - 1]
+            curr_lengths = []
+            offset_idx = 0
+            for prev_len in prev_lengths:
+                seq_lengths = []
+                for _ in range(prev_len):
+                    seq_lengths.append(
+                        offsets[i][offset_idx + 1] - offsets[i][offset_idx]
+                    )
+                    offset_idx += 1
+                curr_lengths.append(seq_lengths)
+            lengths.append(curr_lengths)
+
+    # Build tensor list based on lengths and values
+    tensor_list = []
+    start_idx = 0
+    for b in range(len(lengths[0])):
+        length = lengths[0][b]
+        end_idx = start_idx + length
+        tensor_list.append(values[start_idx:end_idx])
+        start_idx = end_idx
+
+    # Create nested tensor
+    return torch.nested.nested_tensor(
+        tensor_list, layout=torch.jagged, device=values.device, dtype=values.dtype
+    )
+
+
 def get_size_in_bytes(shape, dtype) -> int:
     num_elements = math.prod(shape)
     element_size = dtype.itemsize
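A quick usage sketch of the new helper (the values and offsets below are made up; a single offsets tensor wrapped in a list matches how the operators above call it):

import torch

from tritonbench.utils.jagged_utils import jagged_to_nested_tensor

# Two sequences of lengths 2 and 3, feature width 4 (made-up data).
values = torch.randn(5, 4)
offsets = [torch.tensor([0, 2, 5])]

nt = jagged_to_nested_tensor(values, offsets)
# nt is a jagged-layout nested tensor holding values[0:2] and values[2:5].
print([t.shape for t in nt.unbind()])  # [torch.Size([2, 4]), torch.Size([3, 4])]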

tritonbench/utils/parser.py

+6

@@ -193,6 +193,12 @@ def get_parser(args=None):
         help="The directory to store input or output.",
     )
 
+    parser.add_argument(
+        "--prod-shapes",
+        action="store_true",
+        help="Only run with pre-defined production shapes.",
+    )
+
     if IS_FBCODE:
         parser.add_argument("--log-scuba", action="store_true", help="Log to scuba.")
         parser.add_argument(
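With the flag in place, the production-shape path is opt-in from the command line, e.g. something along the lines of python run.py --op jagged_sum --prod-shapes (entry point assumed). A minimal sketch of how the flag is parsed, assuming get_parser() returns the top-level argparse parser as the signature above suggests:

from tritonbench.utils.parser import get_parser

# --op is tritonbench's usual operator selector; --prod-shapes is the new flag.
args, _ = get_parser().parse_known_args(["--op", "jagged_sum", "--prod-shapes"])
print(args.prod_shapes)  # True -> exposed to operators as self.prod_shapes (see triton_op.py below)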

tritonbench/utils/triton_op.py

+1

@@ -724,6 +724,7 @@ def __init__(
         self._skip = _split_params_by_comma(self.tb_args.skip)
         self._input_id = self.tb_args.input_id
         self._num_inputs = self.tb_args.num_inputs
+        self.prod_shapes = self.tb_args.prod_shapes
 
     # Run the post initialization
     def __post__init__(self):
