
Commit

Merge branch 'main' into integrates/iree
renxida authored Feb 27, 2025
2 parents afcb250 + 3273c83 commit b4bfb08
Showing 28 changed files with 813 additions and 368 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci_eval.yaml
@@ -121,7 +121,7 @@ jobs:
- name: Run perplexity test with Torch
run: |
source ${VENV_DIR}/bin/activate
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/shark-dev/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16_instruct.irpa --llama3-8b-tokenizer-path=/shark-dev/data/llama3.1/weights/8b/fp16/tokenizer_config.json --html=out/llm/llama/perplexity/torch_perplexity/index.html
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --run-nightly-llama-tests --llama3-8b-f16-model-path=/shark-dev/data/llama3.1/weights/8b/fp16/llama3.1_8b_fp16_instruct.irpa --llama3-8b-tokenizer-path=/shark-dev/data/llama3.1/weights/8b/fp16/tokenizer_config.json --html=out/llm/llama/perplexity/torch_perplexity/index.html
- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
2 changes: 1 addition & 1 deletion .github/workflows/ci_linux_x64_nogil-libshortfin.yml
@@ -76,6 +76,6 @@ jobs:
- name: Run shortfin Python tests (full)
working-directory: ${{ env.LIBSHORTFIN_DIR }}
run: |
pytest -s --ignore=tests/examples/fastapi_test.py --ignore=tests/apps/sd
pytest -s --ignore=tests/examples/fastapi_test.py --ignore=tests/apps/sd --ignore=tests/apps/flux
# TODO: Enable further tests and switch to
# pytest -s
48 changes: 48 additions & 0 deletions .github/workflows/pkgci_shark_ai.yml
@@ -66,6 +66,54 @@ jobs:
name: smoke-test-${{ matrix.name }}
path: smoke-test-${{ matrix.name }}.xml


direct_to_batcher_test:
name: "Direct to Batcher Test (${{ matrix.name }})"
runs-on: ${{ matrix.runs-on }}
strategy:
fail-fast: false
matrix:
include:
- name: cpu
runs-on: azure-cpubuilder-linux-scale
test_device: cpu
python-version: 3.11
- name: amdgpu_rocm_mi300_gfx942
runs-on: linux-mi300-1gpu-ossci
test_device: gfx942
python-version: 3.11
defaults:
run:
shell: bash
env:
PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
VENV_DIR: ${{ github.workspace }}/.venv
steps:
- name: Run rocminfo
if: contains(matrix.test_device, 'gfx')
run: rocminfo
- name: "Checkout Code"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: "Set up environment and install PkgCI Artifacts"
uses: ./.github/actions/pkgci-setup
with:
python-version: ${{matrix.python-version}}
artifact-run-id: ${{ inputs.artifact_run_id }}
- name: Run Direct-to-batcher Test
run: |
source ${VENV_DIR}/bin/activate
pytest -v -s --test_device=${{ matrix.test_device }} \
--junitxml=direct-to-batcher-test-${{ matrix.name }}.xml \
app_tests/integration_tests/llm/shortfin/direct_to_batcher_test.py \
--log-cli-level=INFO
- name: Upload Test Results
if: always()
uses: actions/upload-artifact@v4
with:
name: direct-to-batcher-test-${{ matrix.name }}
path: direct-to-batcher-test-${{ matrix.name }}.xml


integration_test:
name: "Integration Test (${{ matrix.name }})"
runs-on: ${{ matrix.runs-on }}
47 changes: 38 additions & 9 deletions app_tests/integration_tests/llm/server_management.py
@@ -10,6 +10,8 @@

from .device_settings import DeviceSettings
from .model_management import ModelArtifacts
from shortfin_apps.llm.components.service import GenerateService
from contextlib import contextmanager


@dataclass
@@ -58,6 +60,41 @@ def find_available_port() -> int:
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
return s.getsockname()[1]

def get_server_args(self) -> list[str]:
"""Returns the command line arguments to start the server."""
argv = [
f"--tokenizer_json={self.config.artifacts.tokenizer_path}",
f"--model_config={self.config.artifacts.config_path}",
f"--vmfb={self.config.artifacts.vmfb_path}",
f"--parameters={self.config.artifacts.weights_path}",
f"--port={self.port}",
f"--prefix_sharing_algorithm={self.config.prefix_sharing_algorithm}",
]
argv.extend(self.config.device_settings.server_flags)
return argv

@contextmanager
def start_service_only(self) -> GenerateService:
"""Starts a server with only the shortfin_apps.llm.components.serivce.GenerateService."""

argv = self.get_server_args()
from shortfin_apps.llm.server import parse_args

args = parse_args(argv)
if args.tokenizer_config_json is None:
# this is only used for the EOS token
inferred_tokenizer_config_path = args.tokenizer_json.with_name(
args.tokenizer_json.stem + "_config.json"
)
args.tokenizer_config_json = inferred_tokenizer_config_path

from shortfin_apps.llm.components.lifecycle import ShortfinLlmLifecycleManager

lifecycle_manager = ShortfinLlmLifecycleManager(args)

with lifecycle_manager:
yield lifecycle_manager.services["default"]

def start(self) -> None:
"""Starts the server process."""
if self.process is not None:
@@ -69,15 +106,7 @@ def start(self) -> None:
sys.executable,
"-m",
"shortfin_apps.llm.server",
f"--tokenizer_json={self.config.artifacts.tokenizer_path}",
f"--model_config={self.config.artifacts.config_path}",
f"--vmfb={self.config.artifacts.vmfb_path}",
f"--parameters={self.config.artifacts.weights_path}",
f"--port={self.port}",
f"--prefix_sharing_algorithm={self.config.prefix_sharing_algorithm}",
]
cmd.extend(self.config.device_settings.server_flags)

] + self.get_server_args()
self.process = subprocess.Popen(cmd)
self.wait_for_ready()

17 changes: 17 additions & 0 deletions app_tests/integration_tests/llm/shortfin/conftest.py
@@ -78,6 +78,23 @@ def server(model_artifacts, request):
process.wait()


@pytest.fixture(scope="module")
def generate_service(model_artifacts, request):
"""Starts and manages the test server."""
model_config = model_artifacts.model_config

server_config = ServerConfig(
artifacts=model_artifacts,
device_settings=model_config.device_settings,
prefix_sharing_algorithm=request.param.get("prefix_sharing", "none"),
)

server_instance = ServerInstance(server_config)
server_instance.port = 0
with server_instance.start_service_only() as gs:
yield gs


@pytest.fixture(scope="module")
def encoded_prompt(model_artifacts: ModelArtifacts, request) -> list[int]:
tokenizer = Tokenizer.from_file(str(model_artifacts.tokenizer_path))
101 changes: 101 additions & 0 deletions app_tests/integration_tests/llm/shortfin/direct_to_batcher_test.py
@@ -0,0 +1,101 @@
import pytest
import numpy as np
import asyncio
import shortfin as sf

from app_tests.integration_tests.llm.server_management import (
ServerInstance,
ServerConfig,
)
from app_tests.integration_tests.llm.model_management import TEST_MODELS, ModelProcessor
from app_tests.integration_tests.llm.device_settings import CPU
from shortfin_apps.llm.components.messages import InferencePhase, InferenceExecRequest


pytestmark = pytest.mark.parametrize(
"model_artifacts,generate_service",
[
["tinystories_llama2_25m", {"prefix_sharing": "none"}],
],
indirect=True,
)


class BatchConsistencyTestProcess(sf.Process):
"""Process to test consistency of results across different batch sizes.
This is necessary because InferenceExecRequest uses shortfin.VoidFuture
which can only be created on a process (which belongs to a fiber that a worker works through).
"""

def __init__(self, service, input_tokens, batch_sizes, max_response_length):
super().__init__(fiber=service.main_fiber)
self.service = service
self.input_tokens = input_tokens
self.batch_sizes = batch_sizes
self.max_response_length = max_response_length
self.results = {} # Store results for each batch size
# TODO: modify the batcher to guarantee the batch we send isn't split by strobe messages

async def run(self):
for batch_size in self.batch_sizes:
batch_results = []
for _ in range(batch_size):
prefill_req = InferenceExecRequest(
phase=InferencePhase.PREFILL,
input_token_ids=self.input_tokens,
rid=f"test-{batch_size}",
)
prefill_req.return_host_array = True
self.service.batcher.submit(prefill_req)
await prefill_req.done
first_token = np.argmax(prefill_req.result_logits.items)
result_sequence = [first_token]

decode_req = prefill_req
for _ in range(self.max_response_length - 1):
decode_req.reset(InferencePhase.DECODE)
decode_req.input_token_ids.append(first_token)
decode_req.start_position += 1
self.service.batcher.submit(decode_req)
await decode_req.done
next_token = np.argmax(decode_req.result_logits.items)
result_sequence.append(next_token)
first_token = next_token

batch_results.append(result_sequence)
decode_req.free_cache_pages()

self.results[batch_size] = batch_results

first_result = batch_results[0]
for result in batch_results[1:]:
assert np.array_equal(
first_result, result
), f"Inconsistent results within batch size {batch_size}"

first_batch_result = self.results[self.batch_sizes[0]][0]
for batch_size in self.batch_sizes[1:]:
assert np.array_equal(
first_batch_result, self.results[batch_size][0]
), f"Inconsistent results between batch sizes {self.batch_sizes[0]} and {batch_size}"


def test_batch_and_nobatch_consistency(model_artifacts, generate_service):
"""
Test that requests produce identical results regardless of batch size.
If this test fails, it means that changing the batch size changes the generation results.
Look for kvcache corruption due to
- improper seq_len / current_position handling in service.py
- improper masking in sharktank
"""
# Create and run the test process
test_process = BatchConsistencyTestProcess(
generate_service,
input_tokens=[1, 2, 3, 4],
batch_sizes=[1, 2, 3, 4],
max_response_length=3,
)
test_process.launch()
13 changes: 1 addition & 12 deletions docs/model_cookbook.md
@@ -256,18 +256,7 @@ iree-run-module \

## Evaluation pipeline

Run perplexity test:

```bash
pytest sharktank/tests/evaluate/perplexity_test.py --longrun
```

Run perplexity for a new model:
```bash
python -m sharktank.evaluate.perplexity \
--gguf-file=llama8b_f16.gguf \
--tokenizer-config-json=tokenizer_config.json
```
See the [instructions](../sharktank/sharktank/evaluate/README.md) for running the perplexity tests.

## Generating data for llama models

10 changes: 1 addition & 9 deletions sharktank/conftest.py
@@ -64,20 +64,12 @@ def pytest_addoption(parser):
help="Load cached results if present instead of recomputing.",
)

parser.addoption(
"--longrun",
action="store_true",
dest="longrun",
default=False,
help="Enable long tests",
)

parser.addoption(
"--run-quick-llama-test",
action="store_true",
dest="run-quick-llama-test",
default=False,
help="Enable llama 8b f16 decomposed benchmarking test",
help="Run large llama tests if passed",
)

parser.addoption(
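A flag like `--run-nightly-llama-tests`, which replaces `--longrun` in the ci_eval.yaml change above, is typically wired up as an opt-in command-line option plus a skip gate applied at collection time. A minimal sketch of that pattern, with a hypothetical `expensive` marker name (not the repository's actual wiring):

```python
# Sketch of gating long-running tests behind --run-nightly-llama-tests.
# Illustrative only: the "expensive" marker name is hypothetical.
import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--run-nightly-llama-tests",
        action="store_true",
        default=False,
        help="Enable nightly llama tests",
    )


def pytest_collection_modifyitems(config, items):
    if config.getoption("--run-nightly-llama-tests"):
        return  # flag given: run everything that was collected
    skip_nightly = pytest.mark.skip(reason="needs --run-nightly-llama-tests")
    for item in items:
        if "expensive" in item.keywords:
            item.add_marker(skip_nightly)
```
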
70 changes: 58 additions & 12 deletions sharktank/sharktank/evaluate/README.md
@@ -13,28 +13,74 @@ Perplexity score measures the ability of a language model to predict the next to

In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as baseline). We use 100 prompts randomly selected from the Wikitext-2 test set and calculate the mean perplexities shown below. These numbers are neither comparable between models with different tokenizers nor with other projects due to varying implementations.
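
For reference, the perplexity reported here is the standard definition: the exponential of the mean negative log-likelihood of the ground-truth tokens. A minimal NumPy sketch of that computation (illustrative only; the array names and shapes are assumptions, not the sharktank implementation):

```python
import numpy as np


def perplexity(logits: np.ndarray, target_ids: np.ndarray) -> float:
    """exp(mean negative log-likelihood) of the ground-truth tokens.

    logits:     [num_tokens, vocab_size] raw model outputs per position
    target_ids: [num_tokens] ids of the tokens the model should have predicted
    """
    # Numerically stable log-softmax over the vocabulary.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    # Log-probability the model assigned to each ground-truth token.
    nll = -log_probs[np.arange(len(target_ids)), target_ids]
    return float(np.exp(nll.mean()))


# Tiny example: a 3-token sequence over a 5-token vocabulary.
rng = np.random.default_rng(0)
print(perplexity(rng.normal(size=(3, 5)), np.array([1, 4, 2])))
```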

* Test perplexity for Llama3.1 8B (FP16) model:
The perplexity script takes a given `--irpa-file` or `--gguf-file`, then exports and compiles the model in order to calculate perplexity. There are also options to pass a custom `--mlir-path` or `--vmfb-path`.

#### Run perplexity
For the Llama3.1 8B (FP16) model on an MI300 server:
##### Torch mode
```bash
pytest sharktank/tests/evaluate/perplexity_test.py --longrun
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py -k test_llama3_8B_f16 \
--llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
--llama3-8b-tokenizer-path=tokenizer_config.json \
--bs=4 \
--run-nightly-llama-tests
```

* Calculate perplexity for a new model:
##### IREE mode
```bash
pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py -k test_llama3_8B_f16 \
--llama3-8b-f16-model-path=llama3.1_8b_instruct_fp16.irpa \
--llama3-8b-tokenizer-path=tokenizer_config.json \
--bs=4 \
--iree-device=hip://1 \
--iree-hip-target=gfx942 \
--iree-hal-target-device=hip
```

For a new model:

Replace `--irpa-file` with the `--gguf-file` flag if necessary (e.g. `--gguf-file=llama3_70b_instruct_fp16.gguf`).

##### Torch mode
```bash
python -m sharktank.evaluate.perplexity_torch \
--irpa-file=llama3_70b_instruct_fp16.irpa \
--tokenizer-config-json=tokenizer_config.json \
--num-prompts=4
```

##### IREE mode

To run on MI300:
```bash
python -m sharktank.evaluate.perplexity_iree \
--irpa-file=llama3_70b_instruct_fp16.irpa \
--tokenizer-config-json=tokenizer_config.json \
--num-prompts=4 \
--iree-device='hip://0' \
--iree-hal-target-device=hip \
--iree-hip-target=gfx942
```

To run on CPU, replace the above --iree-* flags with:
```bash
--iree-device='local-task' --iree-hal-target-device=llvm-cpu
```

For additional options:
```bash
python -m sharktank.evaluate.perplexity \
--gguf-file=llama3_70b_f16.gguf \
--tokenizer-config-json=tokenizer_config.json
python -m sharktank.evaluate.perplexity_torch -h
python -m sharktank.evaluate.perplexity_iree -h
```

### Perplexity Scoreboard

| CPU | GPU |
|:-------------: |:----------:|
| AMD EPYC 9554 | MI300X |
| CPU | GPU | Num of prompts |
|:-------------: |:----------:|:----------------:|
| AMD EPYC 9554 | MI300X | 100 |

#### LLaMA 3.1

|Models |Model size (GB) |Torch score |IREE score |
|:----------------------|:---------------|:-------------|:-------------|
|8B FP16 TP1 decomposed |16.07 |14.930181 |14.991893 |
|Models |Torch score |IREE score | Model size (GB) |
|:-------------------------------|:-------------|:-------------|:----------------|
|8B FP16 Instruct TP1 |20.303255 |19.786807 |16.07 |