
Commit 7e084d9

[6/7] SFTDataset: revamp instruct/chat (pytorch#1286)
1 parent 3e29e6b commit 7e084d9

36 files changed, +596 -652 lines changed

docs/source/api_ref_data.rst (-4)

@@ -25,11 +25,7 @@ and models.
     PromptTemplate
     PromptTemplateInterface
     ChatMLTemplate
-
     ChatFormat
-    ChatMLFormat
-    Llama2ChatFormat
-    MistralChatFormat
 
 Types
 -----

tests/assets/chat_tiny.json (+26)

@@ -0,0 +1,26 @@
+[
+    {
+        "conversations": [
+            {
+                "from": "system",
+                "value": "You are an AI assistant."
+            },
+            {
+                "from": "human",
+                "value": "What is the meaning of life?"
+            },
+            {
+                "from": "gpt",
+                "value": "The meaning of life is 42."
+            },
+            {
+                "from": "human",
+                "value": "That's ridiculous."
+            },
+            {
+                "from": "gpt",
+                "value": "I agree."
+            }
+        ]
+    }
+]
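As exercised by the updated tests/torchtune/datasets/test_chat_dataset.py further down, this ShareGPT-style asset is loaded through the chat_dataset builder. A minimal usage sketch mirroring that test (the DummyTokenizer stand-in is only for illustration; in practice you would pass a real model tokenizer):

    from tests.test_utils import DummyTokenizer
    from torchtune.datasets import chat_dataset

    # Load the JSON asset via the Hugging Face "json" loader and convert each
    # row's "conversations" column from ShareGPT format into messages.
    ds = chat_dataset(
        tokenizer=DummyTokenizer(),  # stand-in; use a real model tokenizer
        source="json",
        data_files="tests/assets/chat_tiny.json",
        conversation_column="conversations",
        conversation_style="sharegpt",
        train_on_input=False,
        packed=False,
        split="train",
    )
    tokens, labels = ds[0]["tokens"], ds[0]["labels"]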

tests/assets/instruct_tiny.json (+10)

@@ -0,0 +1,10 @@
+[
+    {
+        "instruction": "What time is it in London?",
+        "response": "It is 10:00 AM in London"
+    },
+    {
+        "instruction": "Is it Istanbul or Constantinople?",
+        "response": "Istanbul was Constantinople. Now it's Istanbul, not Constantinople."
+    }
+]
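No test for this asset appears in the hunks shown here, but it presumably feeds the revamped instruct path the same way. A hypothetical sketch only: the instruct_dataset builder and its column_map argument (mapping the builder's default "input"/"output" columns to this asset's "instruction"/"response" keys) are assumptions not confirmed by this diff:

    from tests.test_utils import DummyTokenizer
    from torchtune.datasets import instruct_dataset

    # Hypothetical usage; parameter names are assumptions, not taken from this diff.
    ds = instruct_dataset(
        tokenizer=DummyTokenizer(),
        source="json",
        data_files="tests/assets/instruct_tiny.json",
        column_map={"input": "instruction", "output": "response"},
        train_on_input=False,
        split="train",
    )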

tests/common.py (+3)

@@ -3,5 +3,8 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+from pathlib import Path
 
 TUNE_PATH = "torchtune/_cli/tune.py"
+
+ASSETS = Path(__file__).parent / "assets"

tests/test_utils.py (+2, -2)

@@ -19,7 +19,7 @@
 
 import torch
 from torch import nn
-from torchtune.data import ChatFormat, Message, PromptTemplate, truncate
+from torchtune.data import Message, PromptTemplate, truncate
 from torchtune.modules.tokenizers import ModelTokenizer
 from torchtune.modules.transforms import Transform
 
@@ -164,7 +164,7 @@ def image_id(self):
         return -2
 
 
-class DummyChatFormat(ChatFormat):
+class DummyChatFormat:
 
     B_SYS, E_SYS = "System:\n", "\n"
     B_INST, E_INST = "User:\n", "\nAssistant:\n"

tests/torchtune/_cli/test_validate.py (+1, -5)

@@ -7,12 +7,8 @@
 import runpy
 import sys
 
-from pathlib import Path
-
 import pytest
-from tests.common import TUNE_PATH
-
-ASSETS = Path(__file__).parent.parent.parent / "assets"
+from tests.common import ASSETS, TUNE_PATH
 
 
 class TestTuneValidateCommand:

tests/torchtune/data/test_chat_formats.py (-99)

This file was deleted.

tests/torchtune/datasets/test_chat_dataset.py (+60, -50)

@@ -4,63 +4,20 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from unittest import mock
-
 import pytest
+from tests.common import ASSETS
 from tests.test_utils import DummyChatFormat, DummyTokenizer
-from torchtune.data import Message
+from torchtune.data import get_sharegpt_messages
 from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
-from torchtune.datasets import ChatDataset
+from torchtune.datasets import chat_dataset, ChatDataset
 
 
 class TestChatDataset:
     @pytest.fixture
     def chat_format(self):
         return DummyChatFormat
 
-    @pytest.fixture
-    def dialogue(self):
-        return [
-            {
-                "dialogue": [
-                    Message.from_dict(
-                        {
-                            "role": "system",
-                            "content": "You are an AI assistant.",
-                            "masked": True,
-                        }
-                    ),
-                    Message.from_dict(
-                        {
-                            "role": "user",
-                            "content": "What is the meaning of life?",
-                            "masked": True,
-                        }
-                    ),
-                    Message.from_dict(
-                        {
-                            "role": "assistant",
-                            "content": "The meaning of life is 42.",
-                            "masked": False,
-                        }
-                    ),
-                    Message.from_dict(
-                        {
-                            "role": "user",
-                            "content": "That's ridiculous.",
-                            "masked": True,
-                        }
-                    ),
-                    Message.from_dict(
-                        {"role": "assistant", "content": "I agree.", "masked": False}
-                    ),
-                ],
-            },
-        ]
-
-    @mock.patch("torchtune.datasets._chat.load_dataset")
-    def test_get_item(self, mock_load_dataset, chat_format, dialogue):
-        mock_load_dataset.return_value = dialogue
+    def test_get_item(self, chat_format):
         expected_tokenized_prompts = [
             [
                 0,
@@ -104,15 +61,68 @@ def test_get_item(self, mock_load_dataset, chat_format, dialogue):
         ]
         ds = ChatDataset(
             tokenizer=DummyTokenizer(),
-            source="iam/agoofy/goober",
-            convert_to_messages=lambda x, y: x["dialogue"],
+            source="json",
+            convert_to_messages=get_sharegpt_messages,
             chat_format=chat_format,
             max_seq_len=100,
             train_on_input=False,
+            data_files=str(ASSETS / "chat_tiny.json"),
+            split="train",
         )
         assert len(ds) == 1
-        mock_load_dataset.assert_called_once()
+        prompt, label = ds[0]["tokens"], ds[0]["labels"]
+        assert prompt == expected_tokenized_prompts[0]
+        assert label == expected_labels[0]
+
+        expected_tokenized_prompts = [
+            [
+                0,
+                3,
+                3,
+                2,
+                2,
+                10,
+                4,
+                2,
+                3,
+                7,
+                2,
+                5,
+                3,
+                7,
+                2,
+                4,
+                2,
+                3,
+                -1,
+                0,
+                6,
+                11,
+                1,
+                6,
+                -1,
+            ]
+        ]
+        prompt_lengths = (12, 3)
+        expected_labels = [
+            [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[0]
+            + [3, 7, 2, 4, 2, 3, -1]
+            + [CROSS_ENTROPY_IGNORE_IDX] * prompt_lengths[1]
+            + [1, 6, -1]
+        ]
+
+        ds = chat_dataset(
+            tokenizer=DummyTokenizer(),
+            source="json",
+            data_files=str(ASSETS / "chat_tiny.json"),
+            conversation_column="conversations",
+            conversation_style="sharegpt",
+            train_on_input=False,
+            packed=False,
+            split="train",
+        )
 
+        assert len(ds) == 1
         prompt, label = ds[0]["tokens"], ds[0]["labels"]
         assert prompt == expected_tokenized_prompts[0]
         assert label == expected_labels[0]
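For reference, a minimal sketch of what the ShareGPT conversion used above does: get_sharegpt_messages turns each "from"/"value" entry in the "conversations" column into a torchtune Message, with non-assistant turns masked when train_on_input=False. The sample dict mirrors tests/assets/chat_tiny.json; the printed fields are an assumption based on the Message fixture removed above, not something shown in this diff:

    from torchtune.data import get_sharegpt_messages

    # One row in the same format as tests/assets/chat_tiny.json.
    sample = {
        "conversations": [
            {"from": "system", "value": "You are an AI assistant."},
            {"from": "human", "value": "What is the meaning of life?"},
            {"from": "gpt", "value": "The meaning of life is 42."},
        ]
    }

    # "system"/"human" turns become masked system/user messages;
    # "gpt" turns become unmasked assistant messages.
    messages = get_sharegpt_messages(sample, train_on_input=False)
    for message in messages:
        print(message.role, message.masked)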

tests/torchtune/datasets/test_grammar_dataset.py (+2, -2)

@@ -36,7 +36,7 @@ def test_label_no_masking(self, load_dataset, tokenizer):
             ]
         )
 
-        grammar_ds = grammar_dataset(model_transform=tokenizer, train_on_input=True)
+        grammar_ds = grammar_dataset(tokenizer=tokenizer, train_on_input=True)
         input, labels = grammar_ds[0]["tokens"], grammar_ds[0]["labels"]
 
         assert input == [0, 7, 2, 3, 6, 4, 8, 5, 8, 5, 7, 4, 3, 6, 4, 8, 9, 2, 9, -1]
@@ -58,7 +58,7 @@ def test_label_masking(self, load_dataset, tokenizer):
             ]
         )
 
-        grammar_ds = grammar_dataset(model_transform=tokenizer)
+        grammar_ds = grammar_dataset(tokenizer=tokenizer)
 
         # Generate the input and labels
         input, labels = grammar_ds[0]["tokens"], grammar_ds[0]["labels"]
