Update llama CI to run every day, set timeout for large llama tests CI, update 8b fp8 test and weight paths

Signed-off-by: aviator19941 <avinash.sharma@amd.com>
aviator19941 committed Feb 26, 2025
1 parent 46ef7df commit 9836afe
Showing 2 changed files with 52 additions and 49 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci-llama-large-tests.yaml
@@ -11,7 +11,7 @@ on:
pull_request:
schedule:
# Weekdays at 11:00 AM UTC = 03:00 AM PST / 04:00 AM PDT
- cron: "0 11 * * 1-5"
- cron: "0 11 * * 1-7"

concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
@@ -24,6 +24,7 @@ concurrency:
jobs:
test_llama_large:
if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+ timeout-minutes: 240
name: "Llama Benchmarking Tests"
strategy:
matrix:
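Note on the two workflow changes above: widening the day-of-week field from `1-5` to `1-7` switches the schedule from weekdays-only to every day (in most cron dialects both 0 and 7 mean Sunday), and `timeout-minutes: 240` caps the large-test job at four hours; the inline "Weekdays at 11:00 AM UTC" comment is left untouched by the commit. A minimal sketch of the new cadence, assuming the third-party `croniter` package (not part of this repo):

```python
from datetime import datetime

from croniter import croniter  # assumed third-party dependency, illustration only

# The updated schedule from ci-llama-large-tests.yaml: 11:00 UTC, days 1-7 (Mon-Sun).
schedule = croniter("0 11 * * 1-7", datetime(2025, 2, 24))  # start on a Monday

# Expect exactly one firing per calendar day at 11:00 UTC.
for _ in range(7):
    print(schedule.get_next(datetime))
```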
98 changes: 50 additions & 48 deletions sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -163,8 +163,8 @@ def setUp(self):
"--benchmark_repetitions=3",
]

- def testBenchmark8B_f16_Non_Decomposed_Input_Len_128(self):
- output_file_name = self.dir_path_8b / "f16_torch_128"
+ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
+ output_file_name = self.dir_path_8b / "f16_torch_128_tp1"
output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -203,8 +203,8 @@ def testBenchmark8B_f16_Non_Decomposed_Input_Len_128(self):
)

@skipif_run_quick_llama_test
- def testBenchmark8B_f16_Non_Decomposed_Input_Len_2048(self):
- output_file_name = self.dir_path_8b / "f16_torch_2048"
+ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
+ output_file_name = self.dir_path_8b / "f16_torch_2048_tp1"
output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -242,9 +242,8 @@ def testBenchmark8B_f16_Non_Decomposed_Input_Len_2048(self):
cwd=self.repo_root,
)

- @skipif_run_quick_llama_test
- def testBenchmark8B_fp8_Non_Decomposed(self):
- output_file_name = self.dir_path_8b / "fp8_torch"
+ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
+ output_file_name = self.dir_path_8b / "fp8_torch_tp1"
output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -323,6 +322,9 @@ def setUp(self):
attention_kernel="torch",
tensor_parallelism_size=1,
block_seq_stride=32,
activation_dtype="bfloat16",
attention_dtype="bfloat16",
kv_cache_dtype="float8_e4m3fnuz",
)
self.prefill_args_bs4_128_stride_32_tp1_f16 = (
self.artifacts_dir / "prefill_args_bs4_128_stride_32"
@@ -465,7 +467,7 @@ def setUp(self):
]

def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
output_file_name = self.dir_path_70b / "f16_torch_128"
output_file_name = self.dir_path_70b / "f16_torch_128_tp1"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -504,7 +506,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
)

def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
output_file_name = self.dir_path_70b / "f16_torch_2048"
output_file_name = self.dir_path_70b / "f16_torch_2048_tp1"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -546,43 +548,45 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
- output_file_name = self.dir_path_70b / "f16_torch_128"
- output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_file_name = self.dir_path_70b / "f16_torch_128_tp8"
+ output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".mlir", prefix=output_file_name
)
- output_json = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_json = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".json", prefix=output_file_name
)
- output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_vmfb = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".vmfb", prefix=output_file_name
)
output_shard_file_name = (
- self.artifacts_dir
- / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
+ self.weights_dir
+ / f"tp8/llama3_70b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
- self.llama70b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
- export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.irpa_path = (
+ output_shard_file_name
+ )
+ export_return_code = self.llama70b_f16_torch_sdpa_artifacts_tp8.export_to_mlir(
mlir_path=output_mlir,
json_path=output_json,
)
- self.llama70b_f16_torch_sdpa_artifacts.compile_to_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
hal_dump_path=output_file_name,
cwd=self.repo_root,
args=self.compile_args,
)
# benchmark prefill
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_nondecomposed_args_128_tp8_fp16,
cwd=self.repo_root,
)
# benchmark decode
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
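Both 70B TP8 tests in this hunk and the next remain expected failures: the partially visible decorator (with `reason="Benchmarking Error"`, `strict=True`, `raises=IreeBenchmarkException`) is presumably `pytest.mark.xfail`, so the run stays green while the benchmark error persists and flips to a failure if the test unexpectedly passes. A self-contained illustration of that pattern; the exception class here is a stand-in, not the sharktank one:

```python
import pytest


class IreeBenchmarkException(RuntimeError):
    """Stand-in for the benchmark error type raised by the real test harness."""


@pytest.mark.xfail(
    reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def test_tp8_benchmark_placeholder():
    # strict=True means an unexpected pass is reported as XPASS(strict) and
    # fails the suite, which is how a fixed benchmark gets noticed.
    raise IreeBenchmarkException("simulated benchmark failure")
```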
@@ -594,43 +598,45 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
output_file_name = self.dir_path_70b / "f16_torch_2048"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file(
output_file_name = self.dir_path_70b / "f16_torch_2048_tp8"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".mlir", prefix=output_file_name
)
- output_json = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_json = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".json", prefix=output_file_name
)
- output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_vmfb = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".vmfb", prefix=output_file_name
)
output_shard_file_name = (
- self.artifacts_dir
- / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
+ self.weights_dir
+ / f"tp8/llama3_70b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
- self.llama70b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
- export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.irpa_path = (
+ output_shard_file_name
+ )
+ export_return_code = self.llama70b_f16_torch_sdpa_artifacts_tp8.export_to_mlir(
mlir_path=output_mlir,
json_path=output_json,
)
- self.llama70b_f16_torch_sdpa_artifacts.compile_to_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
hal_dump_path=output_file_name,
cwd=self.repo_root,
args=self.compile_args,
)
# benchmark prefill
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_nondecomposed_args_2048_tp8_fp16,
cwd=self.repo_root,
)
# benchmark decode
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
@@ -642,7 +648,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
reason="70b fp8 irpa does not exist", strict=True, raises=ExportMlirException
)
def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_torch"
output_file_name = self.dir_path_70b / "fp8_torch_tp1"
output_mlir = self.llama70b_fp8_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -687,9 +693,11 @@ class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
self.artifacts_dir = Path("/shark-dev/data/llama3.1/weights/405b")
self.artifacts_dir_2048 = Path("/shark-dev/405b")
self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa"
self.artifacts_dir = Path("/shark-dev/405b")
self.weights_dir = self.artifacts_dir / "instruct/weights"
self.irpa_path = Path(
"/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa"
)
self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_405b_fp8.irpa"
self.tensor_parallelism_size = 8
self.dir_path_405b = self.dir_path / "llama-405b"
@@ -704,15 +712,6 @@ def setUp(self):
tensor_parallelism_size=self.tensor_parallelism_size,
block_seq_stride=32,
)
- self.llama405b_fp8_decomposed_artifacts = ExportArtifacts(
- irpa_path=str(self.irpa_path_fp8),
- batch_size=4,
- iree_hip_target="gfx942",
- iree_hal_target_device="hip",
- attention_kernel="decomposed",
- tensor_parallelism_size=self.tensor_parallelism_size,
- block_seq_stride=32,
- )
self.llama405b_fp8_torch_sdpa_artifacts = ExportArtifacts(
irpa_path=str(self.irpa_path_fp8),
batch_size=4,
@@ -721,6 +720,9 @@ def setUp(self):
attention_kernel="torch",
tensor_parallelism_size=self.tensor_parallelism_size,
block_seq_stride=32,
activation_dtype="bfloat16",
attention_dtype="bfloat16",
kv_cache_dtype="float8_e4m3fnuz",
)
self.prefill_args_bs4_128_stride_32_tp8_f16 = (
self.artifacts_dir / "prefill_args_bs4_128_stride_32_tp8"
@@ -729,10 +731,10 @@ def setUp(self):
self.artifacts_dir / "decode_args_bs4_128_stride_32_tp8"
)
self.prefill_args_bs4_2048_stride_32_tp8_f16 = (
self.artifacts_dir_2048 / "prefill_args_bs4_2048_stride_32_tp8"
self.artifacts_dir / "prefill_args_bs4_2048_stride_32_tp8"
)
self.decode_args_bs4_2048_stride_32_tp8_f16 = (
self.artifacts_dir_2048 / "decode_args_bs4_2048_stride_32_tp8"
self.artifacts_dir / "decode_args_bs4_2048_stride_32_tp8"
)
self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
@@ -832,7 +834,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
)
output_shard_file_name = (
self.artifacts_dir
/ f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
/ f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
@@ -873,7 +875,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
)
output_shard_file_name = (
self.artifacts_dir
/ f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
/ f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
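To make the 405B path rewiring concrete: with the updated `setUp()` values (`artifacts_dir = /shark-dev/405b`, `tensor_parallelism_size = 8`), the sharded fp16 weight file the TP8 tests now look for resolves as below. This is only an illustration of the path arithmetic; whether the file exists on the CI runner is assumed, and the tests guard for that with `exists()`:

```python
from pathlib import Path

# Values taken from the updated BenchmarkLlama3_1_405B.setUp() in this diff.
artifacts_dir = Path("/shark-dev/405b")
tensor_parallelism_size = 8

# New sharded-weight location used by the 405B TP8 tests.
output_shard_file_name = (
    artifacts_dir / f"tp8/llama3_405b_instruct_fp16_tp{tensor_parallelism_size}.irpa"
)
print(output_shard_file_name)
# /shark-dev/405b/tp8/llama3_405b_instruct_fp16_tp8.irpa

# The tests only swap in the sharded irpa when it is actually present:
if output_shard_file_name.exists():
    print("using sharded weights")
```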
