@@ -13,11 +13,40 @@
 import pytest
 
 from tests.common import TUNE_PATH
-from tests.recipes.utils import llama2_test_config, write_hf_ckpt_config
-from tests.test_utils import CKPT_MODEL_PATHS
+from tests.recipes.utils import (
+    llama2_test_config,
+    llama3_2_vision_test_config,
+    write_hf_ckpt_config,
+    write_hf_vision_ckpt_config,
+)
+from tests.test_utils import CKPT_MODEL_PATHS, gpu_test
 
 
 class TestEleutherEval:
+    @pytest.fixture
+    def hide_correct_version_number(self, monkeypatch):
+        import importlib.metadata
+
+        import_orig = importlib.metadata.version
+
+        def mocked_import(name, *args, **kwargs):
+            if name == "lm-eval":
+                return "0.4.4"  # Hardcode wrong version number
+            return import_orig(name, *args, **kwargs)
+
+        monkeypatch.setattr(importlib.metadata, "version", mocked_import)
+
+    @pytest.fixture
+    def expected_vision_acc(self):
+        return {
+            "Science": 0.35,
+            "Biology": 0.25,
+            "Chemistry": 0.25,
+            "Geography": 0.5,
+            "Math": 0.0,
+            "Physics": 0.75,
+        }
+
     @pytest.mark.parametrize(
         "eval_name, expected_acc, bsz",
         [
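The hide_correct_version_number fixture above works by patching importlib.metadata.version so the recipe's compatibility check sees an incompatible lm-eval version, while every other package lookup still resolves normally. A minimal standalone sketch of the same pattern outside of pytest (the names here are illustrative only):

import importlib.metadata

real_version = importlib.metadata.version

def fake_version(name, *args, **kwargs):
    # Report a deliberately wrong version for the lm-eval distribution only;
    # all other lookups pass through to the real implementation.
    if name == "lm-eval":
        return "0.4.4"
    return real_version(name, *args, **kwargs)

importlib.metadata.version = fake_version
assert importlib.metadata.version("lm-eval") == "0.4.4"
importlib.metadata.version = real_version  # restore; monkeypatch undoes this automatically

Using monkeypatch.setattr, as the fixture does, guarantees the original function is restored after the test even if it fails.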
@@ -74,22 +103,9 @@ def test_torchtune_checkpoint_eval_results(
         acc_result = float(search_results.group(1))
         assert math.isclose(acc_result, expected_acc, abs_tol=0.05)
 
-    @pytest.fixture
-    def hide_correct_version_number(self, monkeypatch):
-        import importlib.metadata
-
-        import_orig = importlib.metadata.version
-
-        def mocked_import(name, *args, **kwargs):
-            if name == "lm-eval":
-                return "0.4.4"  # Hardcode wrong version number
-            return import_orig(name, *args, **kwargs)
-
-        monkeypatch.setattr(importlib.metadata, "version", mocked_import)
-
     @pytest.mark.integration_test
     @pytest.mark.usefixtures("hide_correct_version_number")
-    def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):
+    def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir):
         ckpt = "llama2_tune"
         ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
         ckpt_dir = ckpt_path.parent
@@ -123,7 +139,7 @@ def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir):
 
     @pytest.mark.integration_test
     def test_eval_recipe_errors_with_quantization_hf_checkpointer(
-        self, capsys, monkeypatch, tmpdir
+        self, monkeypatch, tmpdir
     ):
         ckpt = "llama2_hf"
         ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
@@ -162,7 +178,7 @@ def test_eval_recipe_errors_with_quantization_hf_checkpointer(
         runpy.run_path(TUNE_PATH, run_name="__main__")
 
     @pytest.mark.integration_test
-    def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir):
+    def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir):
         ckpt = "llama2_tune"
         ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
         ckpt_dir = ckpt_path.parent
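Dropping capsys from the three error-path tests above is safe because they only assert that an exception is raised and never inspect captured stdout. The two new vision tests below instead read the recipe's output through pytest's caplog fixture, which collects log records rather than stdout. A minimal sketch of that pattern, independent of torchtune:

import logging

def run_fake_eval():
    # Stand-in for the recipe, which logs its results table.
    logging.getLogger("eval").info("results: acc 0.35")

def test_reads_results_from_logs(caplog):
    with caplog.at_level(logging.INFO):
        run_fake_eval()
    assert "acc 0.35" in caplog.text  # caplog.text joins all captured records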
@@ -194,3 +210,86 @@ def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir
             match="QAT quantizers should only be used during quantization aware training",
         ):
             runpy.run_path(TUNE_PATH, run_name="__main__")
+
+    @pytest.mark.integration_test
+    @gpu_test(gpu_count=1)
+    def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
+        ckpt = "llama3_2_vision_meta"
+        ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
+        ckpt_dir = ckpt_path.parent
+
+        cmd = f"""
+        tune run eleuther_eval \
+            --config llama3_2_vision/11B_evaluation \
+            output_dir={tmpdir} \
+            checkpointer=torchtune.training.FullModelMetaCheckpointer \
+            checkpointer.checkpoint_dir='{ckpt_dir}' \
+            checkpointer.checkpoint_files=[{ckpt_path}] \
+            ~checkpointer.checkpoint_files.filename_format \
+            ~checkpointer.checkpoint_files.max_filename \
+            checkpointer.output_dir={tmpdir} \
+            checkpointer.model_type=LLAMA3_VISION \
+            tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
+            tokenizer.prompt_template=null \
+            limit=4 \
+            dtype=bf16 \
+            device=cuda \
+        """.split()
+
+        model_config = llama3_2_vision_test_config()
+        cmd = cmd + model_config
+
+        monkeypatch.setattr(sys, "argv", cmd)
+        with pytest.raises(SystemExit, match=""):
+            runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        out = caplog.text
+
+        pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"
+
+        matches = re.findall(pattern, out, re.MULTILINE)
+        for task_name, _, accuracy in matches:
+            assert math.isclose(float(accuracy), expected_vision_acc[task_name])
+
+    @pytest.mark.integration_test
+    @gpu_test(gpu_count=1)
+    def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc):
+        ckpt = "llama3_2_vision_hf"
+        ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])
+        ckpt_dir = ckpt_path.parent
+
+        # Config file needed for model conversion.
+        write_hf_vision_ckpt_config(ckpt_dir)
+
+        cmd = f"""
+        tune run eleuther_eval \
+            --config llama3_2_vision/11B_evaluation \
+            output_dir={tmpdir} \
+            checkpointer=torchtune.training.FullModelHFCheckpointer \
+            checkpointer.checkpoint_dir='{ckpt_dir}' \
+            checkpointer.checkpoint_files=[{ckpt_path}] \
+            ~checkpointer.checkpoint_files.filename_format \
+            ~checkpointer.checkpoint_files.max_filename \
+            checkpointer.output_dir={tmpdir} \
+            checkpointer.model_type=LLAMA3_VISION \
+            tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \
+            tokenizer.prompt_template=null \
+            limit=4 \
+            dtype=bf16 \
+            device=cuda \
+        """.split()
+
+        model_config = llama3_2_vision_test_config()
+        cmd = cmd + model_config
+
+        monkeypatch.setattr(sys, "argv", cmd)
+        with pytest.raises(SystemExit, match=""):
+            runpy.run_path(TUNE_PATH, run_name="__main__")
+
+        out = caplog.text
+
+        pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"
+
+        matches = re.findall(pattern, out, re.MULTILINE)
+        for task_name, _, accuracy in matches:
+            assert math.isclose(float(accuracy), expected_vision_acc[task_name])
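Both vision tests parse the results table that the Eleuther harness logs at the end of a run. A sketch of what the shared regex extracts, applied to a hypothetical two-row table (the exact column layout varies across lm-eval versions, so treat the sample rows as an assumption):

import re

out = """
|Science   |      1|none  |     0|acc   |↑  |0.35|±  |0.1094|
| - Biology|      1|none  |     0|acc   |↑  |0.25|±  |0.0993|
"""

pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)"
# Group 1: task name (the optional leading "- " on sub-tasks sits outside the
# capture, so it is stripped), group 2: version column, group 3: the acc value.
for task_name, _, accuracy in re.findall(pattern, out, re.MULTILINE):
    print(task_name, float(accuracy))  # Science 0.35, then Biology 0.25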
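Assuming the integration_test marker and the gpu_test helper are wired up the same way as in the rest of the torchtune test suite, a run of just this file would look something like:

pytest tests/recipes/test_eleuther_eval.py -m integration_test

The vision tests additionally need a CUDA device; the gpu_test(gpu_count=1) decorator presumably skips them when no GPU is available.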