Skip to content

Commit 87d2da8

Browse files
authored
Merge pull request #554 from instructlab/mergify/bp/release-v0.7/pr-551
fix: ensures the system prompt is set when mixing datasets from SDG (backport #551)
2 parents 7fff001 + 19a9732 commit 87d2da8

File tree

2 files changed

+10
-3
lines changed

2 files changed

+10
-3
lines changed

docs/examples/mix_datasets/example_mixing.py

+6 -2
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,14 @@
99
output_dir = Path(__file__).parent.joinpath("output")
1010
output_dir.mkdir(exist_ok=True)
1111

12+
system_prompt = "You are a helpful assistant."
13+
1214
concatenate_recipe_yaml = Path(__file__).parent.joinpath("concatenate_recipe.yaml")
1315
concatenated_output_jsonl = output_dir.joinpath("concatenated.jsonl")
14-
mix_datasets(concatenate_recipe_yaml, concatenated_output_jsonl)
16+
mix_datasets(
17+
concatenate_recipe_yaml, concatenated_output_jsonl, system_prompt=system_prompt
18+
)
1519

1620
weighted_recipe_yaml = Path(__file__).parent.joinpath("weighted_recipe.yaml")
1721
weighted_output_jsonl = output_dir.joinpath("weighted.jsonl")
18-
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl)
22+
mix_datasets(weighted_recipe_yaml, weighted_output_jsonl, system_prompt=system_prompt)

src/instructlab/sdg/generate_data.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -603,8 +603,9 @@ def mix_datasets(
603603
recipe_file: str,
604604
output_file: str,
605605
num_proc: Optional[int] = 8,
606+
system_prompt: Optional[str] = None,
606607
):
607-
recipe = Recipe(recipe_file)
608+
recipe = Recipe(recipe_file, system_prompt)
608609
if recipe.datasets:
609610
recipe.save_mixed_dataset(output_file, num_proc)
610611
else:
@@ -719,10 +720,12 @@ def generate_data(
719720
mix_datasets(
720721
recipe_file=f"{output_dir}/skills_recipe_{date_suffix}.yaml",
721722
output_file=f"{output_dir}/skills_train_msgs_{date_suffix}.jsonl",
723+
system_prompt=system_prompt,
722724
)
723725
mix_datasets(
724726
recipe_file=f"{output_dir}/knowledge_recipe_{date_suffix}.yaml",
725727
output_file=f"{output_dir}/knowledge_train_msgs_{date_suffix}.jsonl",
728+
system_prompt=system_prompt,
726729
)
727730

728731
generate_duration = time.time() - generate_start

0 commit comments

Comments
 (0)