@@ -445,7 +445,9 @@ def __create_auxiliary_ds(rec):
445
445
446
446
447
447
def _create_phase10_ds (
448
- generated_dataset : Dataset , auxiliary_inst : Optional [Dict [str , List [str ]]]
448
+ generated_dataset : Dataset ,
449
+ auxiliary_inst : Optional [Dict [str , List [str ]]],
450
+ use_legacy_pretraining_format : bool ,
449
451
):
450
452
"""
451
453
Create a dataset for Phase 1.0 of downstream training.
@@ -457,13 +459,20 @@ def _create_phase10_ds(
457
459
knowledge_ds = _generate_knowledge_qa_dataset (
458
460
generated_dataset , keep_context_separate = True
459
461
)
460
- knowledge_ds = _add_extra_contexts_to_samples (knowledge_ds , p = 0.4 )
462
+ raft_knowledge_ds = _add_extra_contexts_to_samples (knowledge_ds , p = 0.4 )
463
+ # Include phase07
464
+ pretraining_knowledge_ds = _generate_knowledge_qa_dataset (
465
+ generated_dataset , keep_context_separate = False
466
+ ).map (lambda rec : _conv_pretrain (rec , use_legacy_pretraining_format ))
461
467
462
468
auxiliary_dataset = _create_auxiliary_dataset (generated_dataset , auxiliary_inst )
469
+
463
470
if auxiliary_dataset is not None :
464
- phase10 = concatenate_datasets ([knowledge_ds , auxiliary_dataset ])
471
+ phase10 = concatenate_datasets (
472
+ [raft_knowledge_ds , pretraining_knowledge_ds , auxiliary_dataset ]
473
+ )
465
474
else :
466
- phase10 = knowledge_ds
475
+ phase10 = concatenate_datasets ([ raft_knowledge_ds , pretraining_knowledge_ds ])
467
476
return phase10
468
477
469
478
@@ -601,7 +610,7 @@ def collect(
601
610
)
602
611
603
612
skills_phase_data = _create_phase10_ds (
604
- new_generated_data , self .auxiliary_inst
613
+ new_generated_data , self .auxiliary_inst , use_legacy_pretraining_format
605
614
)
606
615
output_file_leaf_skills = (
607
616
f"node_datasets_{ self .date_suffix } /{ leaf_node_path } _p10.jsonl"
0 commit comments