[sharktank] Update perplexity README and enable torch attention-kernel (#1002)

- Update perplexity instructions in README
- Remove decomposed tests and enable non-decomposed perplexity CIs
- Allow passing custom mlir/vmfb to perplexity script
archana-ramalingam authored Feb 26, 2025
1 parent 06f5b2a commit 3273c83
Showing 10 changed files with 269 additions and 330 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci_eval.yaml
@@ -121,7 +121,7 @@ jobs:
- name: Run perplexity test with Torch
run: |
source ${VENV_DIR}/bin/activate
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/shark-dev/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16_instruct.irpa --llama3-8b-tokenizer-path=/shark-dev/data/llama3.1/weights/8b/fp16/tokenizer_config.json --html=out/llm/llama/perplexity/torch_perplexity/index.html
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --run-nightly-llama-tests --llama3-8b-f16-model-path=/shark-dev/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16_instruct.irpa --llama3-8b-tokenizer-path=/shark-dev/data/llama3.1/weights/8b/fp16/tokenizer_config.json --html=out/llm/llama/perplexity/torch_perplexity/index.html
- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
13 changes: 1 addition & 12 deletions docs/model_cookbook.md
@@ -256,18 +256,7 @@ iree-run-module \

## Evaluation pipeline

Run perplexity test:

```bash
pytest sharktank/tests/evaluate/perplexity_test.py --longrun
```

Run perplexity for a new model:
```bash
python -m sharktank.evaluate.perplexity \
--gguf-file=llama8b_f16.gguf \
--tokenizer-config-json=tokenizer_config.json
```
See the [instructions](../sharktank/sharktank/evaluate/README.md) to run the perplexity test.

## Generating data for llama models

10 changes: 1 addition & 9 deletions sharktank/conftest.py
@@ -64,20 +64,12 @@ def pytest_addoption(parser):
help="Load cached results if present instead of recomputing.",
)

parser.addoption(
"--longrun",
action="store_true",
dest="longrun",
default=False,
help="Enable long tests",
)

parser.addoption(
"--run-quick-llama-test",
action="store_true",
dest="run-quick-llama-test",
default=False,
help="Enable llama 8b f16 decomposed benchmarking test",
help="Run large llama tests if passed",
)

parser.addoption(
70 changes: 58 additions & 12 deletions sharktank/sharktank/evaluate/README.md
@@ -13,28 +13,74 @@ Perplexity score measures the ability of a language model to predict the next to

In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as baseline). We use 100 prompts randomly selected from the Wikitext-2 test set and calculate the mean perplexities shown below. These numbers are neither comparable between models with different tokenizers nor with other projects due to varying implementations.
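
For reference, perplexity here is the exponential of the mean negative log-likelihood of each token given its preceding context (a standard formulation; the script's exact averaging across prompts may differ):

$$
\mathrm{PPL}(x_1, \ldots, x_N) = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N} \log p_\theta\!\left(x_i \mid x_{<i}\right)\right)
$$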

* Test perplexity for Llama3.1 8B (FP16) model:
The perplexity script takes a given `--irpa-file` or `--gguf-file`, then exports and compiles the model to calculate perplexity. A custom `--mlir-path` or `--vmfb-path` can also be passed to reuse pre-built artifacts.

#### Run perplexity
For the Llama3.1 8B (FP16) model on an MI300 server:
##### Torch mode
```bash
pytest sharktank/tests/evaluate/perplexity_test.py --longrun
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py -k test_llama3_8B_f16 \
--llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
--llama3-8b-tokenizer-path=tokenizer_config.json \
--bs=4 \
--run-nightly-llama-tests
```

* Calculate perplexity for a new model:
##### IREE mode
```bash
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py -k test_llama3_8B_f16 \
--llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
--llama3-8b-tokenizer-path=tokenizer_config.json \
--bs=4 \
--iree-device=hip://1 \
--iree-hip-target=gfx942 \
--iree-hal-target-device=hip
```

For a new model:

Replace the `--irpa-file` flag with `--gguf-file` if needed (e.g. `--gguf-file=llama3_70b_instruct_fp16.gguf`).

##### Torch mode
```bash
python -m sharktank.evaluate.perplexity_torch \
--irpa-file=llama3_70b_instruct_fp16.irpa \
--tokenizer-config-json=tokenizer_config.json \
--num-prompts=4
```

##### IREE mode

To run on MI300:
```bash
python -m sharktank.evaluate.perplexity_iree \
--irpa-file=llama3_70b_instruct_fp16.irpa \
--tokenizer-config-json=tokenizer_config.json \
--num-prompts=4 \
--iree-device='hip://0' \
--iree-hal-target-device=hip \
--iree-hip-target=gfx942
```

To run on CPU, replace the above `--iree-*` flags with:
```bash
--iree-device='local-task' --iree-hal-target-device=llvm-cpu
```
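
To reuse pre-built artifacts instead of exporting and compiling from scratch, the `--mlir-path`, `--json-path`, and `--vmfb-path` flags added in this change can be passed. A sketch with illustrative file names; `--mlir-path` and `--json-path` must be passed together, while `--vmfb-path` skips compilation entirely:
```bash
python -m sharktank.evaluate.perplexity_iree \
  --irpa-file=llama3_70b_instruct_fp16.irpa \
  --tokenizer-config-json=tokenizer_config.json \
  --num-prompts=4 \
  --mlir-path=llama3_70b_instruct_fp16.mlir \
  --json-path=llama3_70b_instruct_fp16.json \
  --iree-device='hip://0' \
  --iree-hal-target-device=hip \
  --iree-hip-target=gfx942
```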

For additional options:
```bash
python -m sharktank.evaluate.perplexity \
--gguf-file=llama3_70b_f16.gguf \
--tokenizer-config-json=tokenizer_config.json
python -m sharktank.evaluate.perplexity_torch -h
python -m sharktank.evaluate.perplexity_iree -h
```

### Perplexity Scoreboard

| CPU | GPU |
|:-------------: |:----------:|
| AMD EPYC 9554 | MI300X |
| CPU | GPU | Num of prompts |
|:-------------: |:----------:|:----------------:|
| AMD EPYC 9554 | MI300X | 100 |

#### LLaMA 3.1

|Models |Model size (GB) |Torch score |IREE score |
|:----------------------|:---------------|:-------------|:-------------|
|8B FP16 TP1 decomposed |16.07 |14.930181 |14.991893 |
|Models |Torch score |IREE score | Model size (GB) |
|:-------------------------------|:-------------|:-------------|:----------------|
|8B FP16 Instruct TP1 |20.303255 |19.786807 |16.07 |
67 changes: 49 additions & 18 deletions sharktank/sharktank/evaluate/perplexity_iree.py
@@ -128,26 +128,31 @@ def print_token_comparison(self, i):
logger.debug(f"{expected_token_id}")

@timeit
def compile_model(self, weight_path_str):
def compile_model(self, weight_path_str, mlir_path, json_path, vmfb_path):
self.weight_path_str = weight_path_str

logger.info(f" Compiling: {self.weight_path_str}")
logger.info(f" Model: {self.weight_path_str}")

export_artifacts = ExportArtifacts(
irpa_path=self.weight_path_str,
batch_size=self.bs,
iree_hip_target=self.iree_hip_target,
iree_hal_target_device=self.iree_hal_target_device,
attention_kernel=self.attention_kernel,
tensor_parallelism_size=self.tensor_parallelism_size,
block_seq_stride=self.block_seq_stride,
use_attention_mask=self.use_attention_mask,
)
vmfb_path = export_artifacts.get_artifacts()
return vmfb_path
if vmfb_path:
self.vmfb_path = vmfb_path
logger.info(f" Using pre-compiled vmfb: {self.vmfb_path}")
else:
export_artifacts = ExportArtifacts(
irpa_path=self.weight_path_str,
batch_size=self.bs,
iree_hip_target=self.iree_hip_target,
iree_hal_target_device=self.iree_hal_target_device,
attention_kernel=self.attention_kernel,
tensor_parallelism_size=self.tensor_parallelism_size,
block_seq_stride=self.block_seq_stride,
use_attention_mask=self.use_attention_mask,
mlir_path=mlir_path,
json_path=json_path,
)
self.vmfb_path = export_artifacts.get_artifacts()

@timeit
def load_model(self, weight_path, tokenizer, vmfb_path):
def load_model(self, weight_path, tokenizer):

self.config = LlamaModelConfig(
hp=configs.LlamaHParams.from_gguf_props(weight_path.properties),
@@ -175,7 +180,7 @@ def load_model(self, weight_path, tokenizer, vmfb_path):

self.runner = vmfbRunner(
device=self.iree_device,
vmfb_path=vmfb_path,
vmfb_path=self.vmfb_path,
external_weight_path=self.weight_path_str,
)

@@ -400,6 +405,9 @@ def run_perplexity(
num_prompts,
block_seq_stride,
use_attention_mask,
mlir_path,
json_path,
vmfb_path,
):
start = time.time()
perplexity = Perplexity(
@@ -415,8 +423,8 @@

perplexity.get_prompts(num_prompts=num_prompts)

vmfb_path = perplexity.compile_model(weight_path_str)
perplexity.load_model(weight_path, tokenizer, vmfb_path)
perplexity.compile_model(weight_path_str, mlir_path, json_path, vmfb_path)
perplexity.load_model(weight_path, tokenizer)
ppl = perplexity.get_perplexity()

end = time.time()
@@ -451,6 +459,21 @@ def main(argv):
default=100,
help="Number of prompts for perplexity test (1 to 100)",
)
parser.add_argument(
"--mlir-path",
type=str,
help="Path to exported mlir file",
)
parser.add_argument(
"--json-path",
type=str,
help="Path to exported config json file",
)
parser.add_argument(
"--vmfb-path",
type=str,
help="Path to compiled vmfb file",
)

cli.add_model_options(parser)
cli.add_tokenizer_options(parser)
@@ -463,6 +486,11 @@

use_attention_mask = True

if args.mlir_path or args.json_path:
assert (
args.json_path is not None and args.mlir_path is not None
), "If using pre-exported mlir, both --mlir-path and --json-path must be passed"

# Override flag if dataset disagrees
tensor_parallelism_size = (
weight_path.properties["tensor_parallelism_size"]
@@ -483,6 +511,9 @@
num_prompts=args.num_prompts,
block_seq_stride=args.block_seq_stride,
use_attention_mask=use_attention_mask,
mlir_path=args.mlir_path,
json_path=args.json_path,
vmfb_path=args.vmfb_path,
)

logger.info(f"\n{json.dumps(ppl, indent=2)}")
4 changes: 2 additions & 2 deletions sharktank/sharktank/utils/cli.py
@@ -84,7 +84,7 @@ def add_model_options(parser: argparse.ArgumentParser):
)
parser.add_argument(
"--skip-decode",
help="Skips export decode",
help="Skips exporting decode",
action="store_true",
)
parser.add_argument(
@@ -99,7 +99,7 @@
)
parser.add_argument(
"--attention-dtype",
help="DType to use for activations in the model",
help="DType to use for attention in the model",
default="float16",
)
parser.add_argument(
49 changes: 31 additions & 18 deletions sharktank/sharktank/utils/export_artifacts.py
@@ -96,11 +96,15 @@ def __init__(
activation_dtype: str = "float16",
attention_dtype: str = "float16",
kv_cache_dtype: Optional[str] = None,
mlir_path: Optional[str] = None,
json_path: Optional[str] = None,
):
self.sharktank_dir = str(
Path(os.path.dirname(os.path.abspath(__file__))).parent.parent.parent
)
self.irpa_path = irpa_path
self.mlir_path = mlir_path
self.json_path = json_path
self.batch_size = batch_size
self.iree_hip_target = iree_hip_target
self.iree_hal_target_device = iree_hal_target_device
@@ -318,6 +322,11 @@ def create_file(self, *, suffix, prefix):

def get_artifacts(self):

assert self.attention_kernel in [
"decomposed",
"torch",
], "Only torch or decomposed attention_kernel types are supported"

self.dir_path = self.sharktank_dir + "/" + "perplexity_ci_artifacts/"
temp_dir = Path(self.dir_path)
temp_dir.mkdir(parents=True, exist_ok=True)
@@ -327,27 +336,31 @@ def get_artifacts(self):
+ "_"
+ self.attention_kernel
)
mlir_path = str(
self.create_file(suffix=".mlir", prefix=self.dir_path + model_name)
)
json_path = str(
self.create_file(suffix=".json", prefix=self.dir_path + model_name)
)

if self.mlir_path is None:
self.mlir_path = str(
self.create_file(suffix=".mlir", prefix=self.dir_path + model_name)
)
self.json_path = str(
self.create_file(suffix=".json", prefix=self.dir_path + model_name)
)

self.export_to_mlir(
mlir_path=self.mlir_path,
json_path=self.json_path,
)
else:
logger.info(f" Using pre-exported mlir: {self.mlir_path}")
logger.info(f" Using pre-exported config json: {self.json_path}")

vmfb_path = str(
self.create_file(suffix=".vmfb", prefix=self.dir_path + model_name)
)

if self.attention_kernel == "decomposed":
returncode = self.export_to_mlir(
mlir_path=mlir_path,
json_path=json_path,
)

if returncode == 0:
self.compile_to_vmfb(
mlir_path=mlir_path,
vmfb_path=vmfb_path,
cwd=self.sharktank_dir,
)
self.compile_to_vmfb(
mlir_path=self.mlir_path,
vmfb_path=vmfb_path,
cwd=self.sharktank_dir,
)

return vmfb_path
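
For reference, a minimal sketch of driving `ExportArtifacts` with a pre-exported MLIR file, based on the keyword arguments visible in this diff; all paths and values below are illustrative:

```python
from sharktank.utils.export_artifacts import ExportArtifacts

# With mlir_path/json_path supplied, get_artifacts() skips export_to_mlir()
# and goes straight to compile_to_vmfb(); values here are placeholders.
artifacts = ExportArtifacts(
    irpa_path="llama3.1_8b_instruct_fp16.irpa",
    batch_size=4,
    iree_hip_target="gfx942",
    iree_hal_target_device="hip",
    attention_kernel="torch",
    tensor_parallelism_size=1,
    block_seq_stride=32,
    use_attention_mask=True,
    mlir_path="llama3_8b_fp16_torch.mlir",
    json_path="llama3_8b_fp16_torch.json",
)
vmfb_path = artifacts.get_artifacts()
```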