Skip to content

Commit b6f07a8

Browse files
authored
Merge pull request #366 from aakankshaduggal/data-mix-fix
Data mix fix
2 parents eaaccca + 3040657 commit b6f07a8

File tree

1 file changed

+14
-5
lines changed

1 file changed

+14
-5
lines changed

src/instructlab/sdg/datamixing.py

+14 -5
Original file line number | Diff line number | Diff line change
@@ -445,7 +445,9 @@ def __create_auxiliary_ds(rec):
445445

446446

447447
def _create_phase10_ds(
448-
generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]]
448+
generated_dataset: Dataset,
449+
auxiliary_inst: Optional[Dict[str, List[str]]],
450+
use_legacy_pretraining_format: bool,
449451
):
450452
"""
451453
Create a dataset for Phase 1.0 of downstream training.
@@ -457,13 +459,20 @@ def _create_phase10_ds(
457459
knowledge_ds = _generate_knowledge_qa_dataset(
458460
generated_dataset, keep_context_separate=True
459461
)
460-
knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4)
462+
raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4)
463+
# Include phase07
464+
pretraining_knowledge_ds = _generate_knowledge_qa_dataset(
465+
generated_dataset, keep_context_separate=False
466+
).map(lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format))
461467

462468
auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst)
469+
463470
if auxiliary_dataset is not None:
464-
phase10 = concatenate_datasets([knowledge_ds, auxiliary_dataset])
471+
phase10 = concatenate_datasets(
472+
[raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset]
473+
)
465474
else:
466-
phase10 = knowledge_ds
475+
phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds])
467476
return phase10
468477

469478

@@ -601,7 +610,7 @@ def collect(
601610
)
602611

603612
skills_phase_data = _create_phase10_ds(
604-
new_generated_data, self.auxiliary_inst
613+
new_generated_data, self.auxiliary_inst, use_legacy_pretraining_format
605614
)
606615
output_file_leaf_skills = (
607616
f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"

0 commit comments

Comments
 (0)