Update llama CI to run every day, set timeout for large llama tests CI, update 8b fp8 test and weight paths

Signed-off-by: aviator19941 <avinash.sharma@amd.com>
aviator19941 committed Feb 26, 2025
1 parent 46ef7df commit 9836afe
Showing 2 changed files with 52 additions and 49 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/ci-llama-large-tests.yaml
@@ -11,7 +11,7 @@ on:
pull_request:
schedule:
# Weekdays at 11:00 AM UTC = 03:00 AM PST / 04:00 AM PDT
- cron: "0 11 * * 1-5"
- cron: "0 11 * * 1-7"

concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
@@ -24,6 +24,7 @@ concurrency:
jobs:
test_llama_large:
if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
+ timeout-minutes: 240
name: "Llama Benchmarking Tests"
strategy:
matrix:
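Note on the two workflow changes above: widening the day-of-week field from `1-5` to `1-7` switches the schedule from weekdays-only to every day (in most cron dialects both 0 and 7 mean Sunday), and `timeout-minutes: 240` caps the large-test job at four hours; the inline "Weekdays at 11:00 AM UTC" comment is left untouched by the commit. A minimal sketch of the new cadence, assuming the third-party `croniter` package (not part of this repo):

```python
from datetime import datetime

from croniter import croniter  # assumed third-party dependency, illustration only

# The updated schedule from ci-llama-large-tests.yaml: 11:00 UTC, days 1-7 (Mon-Sun).
schedule = croniter("0 11 * * 1-7", datetime(2025, 2, 24))  # start on a Monday

# Expect exactly one firing per calendar day at 11:00 UTC.
for _ in range(7):
    print(schedule.get_next(datetime))
```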
98 changes: 50 additions & 48 deletions sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -163,8 +163,8 @@ def setUp(self):
"--benchmark_repetitions=3",
]

- def testBenchmark8B_f16_Non_Decomposed_Input_Len_128(self):
- output_file_name = self.dir_path_8b / "f16_torch_128"
+ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_128(self):
+ output_file_name = self.dir_path_8b / "f16_torch_128_tp1"
output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -203,8 +203,8 @@ def testBenchmark8B_f16_Non_Decomposed_Input_Len_128(self):
)

@skipif_run_quick_llama_test
- def testBenchmark8B_f16_Non_Decomposed_Input_Len_2048(self):
- output_file_name = self.dir_path_8b / "f16_torch_2048"
+ def testBenchmark8B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
+ output_file_name = self.dir_path_8b / "f16_torch_2048_tp1"
output_mlir = self.llama8b_f16_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -242,9 +242,8 @@ def testBenchmark8B_f16_Non_Decomposed_Input_Len_2048(self):
cwd=self.repo_root,
)

- @skipif_run_quick_llama_test
- def testBenchmark8B_fp8_Non_Decomposed(self):
- output_file_name = self.dir_path_8b / "fp8_torch"
+ def testBenchmark8B_fp8_TP1_Non_Decomposed(self):
+ output_file_name = self.dir_path_8b / "fp8_torch_tp1"
output_mlir = self.llama8b_fp8_torch_sdpa_artifacts.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -323,6 +322,9 @@ def setUp(self):
attention_kernel="torch",
tensor_parallelism_size=1,
block_seq_stride=32,
activation_dtype="bfloat16",
attention_dtype="bfloat16",
kv_cache_dtype="float8_e4m3fnuz",
)
self.prefill_args_bs4_128_stride_32_tp1_f16 = (
self.artifacts_dir / "prefill_args_bs4_128_stride_32"
@@ -465,7 +467,7 @@ def setUp(self):
]

def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
output_file_name = self.dir_path_70b / "f16_torch_128"
output_file_name = self.dir_path_70b / "f16_torch_128_tp1"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -504,7 +506,7 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_128(self):
)

def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
output_file_name = self.dir_path_70b / "f16_torch_2048"
output_file_name = self.dir_path_70b / "f16_torch_2048_tp1"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -546,43 +548,45 @@ def testBenchmark70B_f16_TP1_Non_Decomposed_Input_Len_2048(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
- output_file_name = self.dir_path_70b / "f16_torch_128"
- output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_file_name = self.dir_path_70b / "f16_torch_128_tp8"
+ output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".mlir", prefix=output_file_name
)
- output_json = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_json = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".json", prefix=output_file_name
)
- output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_vmfb = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".vmfb", prefix=output_file_name
)
output_shard_file_name = (
- self.artifacts_dir
- / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
+ self.weights_dir
+ / f"tp8/llama3_70b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
- self.llama70b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
- export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.irpa_path = (
+ output_shard_file_name
+ )
+ export_return_code = self.llama70b_f16_torch_sdpa_artifacts_tp8.export_to_mlir(
mlir_path=output_mlir,
json_path=output_json,
)
- self.llama70b_f16_torch_sdpa_artifacts.compile_to_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
hal_dump_path=output_file_name,
cwd=self.repo_root,
args=self.compile_args,
)
# benchmark prefill
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_nondecomposed_args_128_tp8_fp16,
cwd=self.repo_root,
)
# benchmark decode
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
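Both 70B TP8 tests in this hunk and the next remain expected failures: the partially visible decorator (with `reason="Benchmarking Error"`, `strict=True`, `raises=IreeBenchmarkException`) is presumably `pytest.mark.xfail`, so the run stays green while the benchmark error persists and flips to a failure if the test unexpectedly passes. A self-contained illustration of that pattern; the exception class here is a stand-in, not the sharktank one:

```python
import pytest


class IreeBenchmarkException(RuntimeError):
    """Stand-in for the benchmark error type raised by the real test harness."""


@pytest.mark.xfail(
    reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def test_tp8_benchmark_placeholder():
    # strict=True means an unexpected pass is reported as XPASS(strict) and
    # fails the suite, which is how a fixed benchmark gets noticed.
    raise IreeBenchmarkException("simulated benchmark failure")
```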
@@ -594,43 +598,45 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_128(self):
reason="Benchmarking Error", strict=True, raises=IreeBenchmarkException
)
def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
output_file_name = self.dir_path_70b / "f16_torch_2048"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts.create_file(
output_file_name = self.dir_path_70b / "f16_torch_2048_tp8"
output_mlir = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".mlir", prefix=output_file_name
)
- output_json = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_json = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".json", prefix=output_file_name
)
- output_vmfb = self.llama70b_f16_torch_sdpa_artifacts.create_file(
+ output_vmfb = self.llama70b_f16_torch_sdpa_artifacts_tp8.create_file(
suffix=".vmfb", prefix=output_file_name
)
output_shard_file_name = (
- self.artifacts_dir
- / f"fp16/tp8/llama3.1_70b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
+ self.weights_dir
+ / f"tp8/llama3_70b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
- self.llama70b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
- export_return_code = self.llama70b_f16_torch_sdpa_artifacts.export_to_mlir(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.irpa_path = (
+ output_shard_file_name
+ )
+ export_return_code = self.llama70b_f16_torch_sdpa_artifacts_tp8.export_to_mlir(
mlir_path=output_mlir,
json_path=output_json,
)
- self.llama70b_f16_torch_sdpa_artifacts.compile_to_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.compile_to_vmfb(
mlir_path=str(output_mlir),
vmfb_path=output_vmfb,
hal_dump_path=output_file_name,
cwd=self.repo_root,
args=self.compile_args,
)
# benchmark prefill
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
args=self.iree_run_prefill_nondecomposed_args_2048_tp8_fp16,
cwd=self.repo_root,
)
# benchmark decode
- self.llama70b_f16_torch_sdpa_artifacts.iree_benchmark_vmfb(
+ self.llama70b_f16_torch_sdpa_artifacts_tp8.iree_benchmark_vmfb(
hip_device_id=self.iree_device,
vmfb_name=output_vmfb,
irpa_path=self.irpa_path,
@@ -642,7 +648,7 @@ def testBenchmark70B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
reason="70b fp8 irpa does not exist", strict=True, raises=ExportMlirException
)
def testBenchmark70B_fp8_TP1_Non_Decomposed(self):
output_file_name = self.dir_path_70b / "fp8_torch"
output_file_name = self.dir_path_70b / "fp8_torch_tp1"
output_mlir = self.llama70b_fp8_torch_sdpa_artifacts_tp1.create_file(
suffix=".mlir", prefix=output_file_name
)
@@ -687,9 +693,11 @@ class BenchmarkLlama3_1_405B(BaseBenchmarkTest):
def setUp(self):
super().setUp()
# TODO: add numpy files to Azure and download from it
self.artifacts_dir = Path("/shark-dev/data/llama3.1/weights/405b")
self.artifacts_dir_2048 = Path("/shark-dev/405b")
self.irpa_path = self.artifacts_dir / "fp16/llama3.1_405b_fp16.irpa"
self.artifacts_dir = Path("/shark-dev/405b")
self.weights_dir = self.artifacts_dir / "instruct/weights"
self.irpa_path = Path(
"/shark-dev/data/llama3.1/weights/405b/fp16/llama3.1_405b_fp16.irpa"
)
self.irpa_path_fp8 = self.artifacts_dir / "f8/llama3.1_405b_fp8.irpa"
self.tensor_parallelism_size = 8
self.dir_path_405b = self.dir_path / "llama-405b"
@@ -704,15 +712,6 @@ def setUp(self):
tensor_parallelism_size=self.tensor_parallelism_size,
block_seq_stride=32,
)
- self.llama405b_fp8_decomposed_artifacts = ExportArtifacts(
- irpa_path=str(self.irpa_path_fp8),
- batch_size=4,
- iree_hip_target="gfx942",
- iree_hal_target_device="hip",
- attention_kernel="decomposed",
- tensor_parallelism_size=self.tensor_parallelism_size,
- block_seq_stride=32,
- )
self.llama405b_fp8_torch_sdpa_artifacts = ExportArtifacts(
irpa_path=str(self.irpa_path_fp8),
batch_size=4,
@@ -721,6 +720,9 @@ def setUp(self):
attention_kernel="torch",
tensor_parallelism_size=self.tensor_parallelism_size,
block_seq_stride=32,
activation_dtype="bfloat16",
attention_dtype="bfloat16",
kv_cache_dtype="float8_e4m3fnuz",
)
self.prefill_args_bs4_128_stride_32_tp8_f16 = (
self.artifacts_dir / "prefill_args_bs4_128_stride_32_tp8"
@@ -729,10 +731,10 @@ def setUp(self):
self.artifacts_dir / "decode_args_bs4_128_stride_32_tp8"
)
self.prefill_args_bs4_2048_stride_32_tp8_f16 = (
self.artifacts_dir_2048 / "prefill_args_bs4_2048_stride_32_tp8"
self.artifacts_dir / "prefill_args_bs4_2048_stride_32_tp8"
)
self.decode_args_bs4_2048_stride_32_tp8_f16 = (
self.artifacts_dir_2048 / "decode_args_bs4_2048_stride_32_tp8"
self.artifacts_dir / "decode_args_bs4_2048_stride_32_tp8"
)
self.prefill_args_fp8 = self.artifacts_dir / "prefill_args_fp8"
self.decode_args_fp8 = self.artifacts_dir / "decode_args_fp8"
@@ -832,7 +834,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_128(self):
)
output_shard_file_name = (
self.artifacts_dir
/ f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
/ f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
@@ -873,7 +875,7 @@ def testBenchmark405B_f16_TP8_Non_Decomposed_Input_Len_2048(self):
)
output_shard_file_name = (
self.artifacts_dir
/ f"fp16/tp8/llama3.1_405b_fp16_tp{self.tensor_parallelism_size}_parameters.irpa"
/ f"tp8/llama3_405b_instruct_fp16_tp{self.tensor_parallelism_size}.irpa"
)
if output_shard_file_name.exists():
self.llama405b_f16_torch_sdpa_artifacts.irpa_path = output_shard_file_name
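To make the 405B path rewiring concrete: with the updated `setUp()` values (`artifacts_dir = /shark-dev/405b`, `tensor_parallelism_size = 8`), the sharded fp16 weight file the TP8 tests now look for resolves as below. This is only an illustration of the path arithmetic; whether the file exists on the CI runner is assumed, and the tests guard for that with `exists()`:

```python
from pathlib import Path

# Values taken from the updated BenchmarkLlama3_1_405B.setUp() in this diff.
artifacts_dir = Path("/shark-dev/405b")
tensor_parallelism_size = 8

# New sharded-weight location used by the 405B TP8 tests.
output_shard_file_name = (
    artifacts_dir / f"tp8/llama3_405b_instruct_fp16_tp{tensor_parallelism_size}.irpa"
)
print(output_shard_file_name)
# /shark-dev/405b/tp8/llama3_405b_instruct_fp16_tp8.irpa

# The tests only swap in the sharded irpa when it is actually present:
if output_shard_file_name.exists():
    print("using sharded weights")
```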
