
Commit 6554f61

update loading pretrain dataset
1 parent: e4786fa

2 files changed, +4 -6 lines


train.py (+2, -2)
@@ -103,7 +103,7 @@ def group_texts(examples):
         try:
             processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
             logger.info(f'Finished loading datasets-{file_name} from cache')
-        except:
+        except Exception:
             tmp_cache_path = join(cache_path, 'tmp')  # temporary cache directory, will be deleted automatically
             logger.info(f'There is no cache of file {file_name}, start preprocessing...')
             raw_dataset = load_dataset("json", data_files=file, cache_dir=tmp_cache_path, keep_in_memory=False)
@@ -129,7 +129,7 @@ def group_texts(examples):
             processed_dataset = grouped_datasets
             processed_dataset.save_to_disk(cache_path)
             # delete the temporary directory
-            shutil.rmtree(tmp_cache_path)
+            # shutil.rmtree(tmp_cache_path)
 
         logger.info(f"Training number of {file_name}: {len(processed_dataset['train'])}")
         if idx == 0:
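
The two hunks above narrow the error handling and stop deleting the temporary cache. A bare except: also swallows KeyboardInterrupt and SystemExit, so except Exception: only catches the expected "no cache yet" failure; the shutil.rmtree call is left commented out, presumably because the Arrow files under the temporary directory may still be memory-mapped by the loaded dataset. Below is a minimal, self-contained sketch of the load-from-cache-or-preprocess pattern, assuming the Hugging Face datasets library; load_or_preprocess is a hypothetical helper name, and the tokenize/group_texts steps are elided.

import logging
import shutil
from os.path import join

import datasets
from datasets import load_dataset

logger = logging.getLogger(__name__)

def load_or_preprocess(file, cache_path, file_name):
    """Return the processed dataset from cache, or build and cache it (hypothetical helper)."""
    try:
        processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
        logger.info(f'Finished loading datasets-{file_name} from cache')
    except Exception:  # narrow: does not swallow KeyboardInterrupt/SystemExit
        tmp_cache_path = join(cache_path, 'tmp')  # temporary cache directory
        logger.info(f'There is no cache of file {file_name}, start preprocessing...')
        raw_dataset = load_dataset("json", data_files=file,
                                   cache_dir=tmp_cache_path, keep_in_memory=False)
        processed_dataset = raw_dataset  # tokenize + group_texts omitted in this sketch
        processed_dataset.save_to_disk(cache_path)
        # shutil.rmtree(tmp_cache_path)  # kept commented out, as in the commit;
        # the Arrow files under tmp may still be memory-mapped by raw_dataset
    return processed_dataset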

train_args/pretrain/full/qwen-7b-pretrain-full.json (+2, -4)
@@ -4,7 +4,6 @@
     "deepspeed": "./train_args/ds_z3_config.json",
     "train_file": "./data/pretrain",
     "num_train_epochs": 1,
-    "max_steps": 1000000,
     "tokenize_num_workers": 10,
     "per_device_train_batch_size": 4,
     "gradient_accumulation_steps": 4,
@@ -13,10 +12,9 @@
     "task_type": "pretrain",
 
     "logging_steps": 500,
-    "save_steps": 10000,
-    "save_total_limit": 1,
+    "save_steps": 500,
     "lr_scheduler_type": "cosine",
-    "warmup_steps": 1000,
+    "warmup_ratio": 0.01,
 
     "gradient_checkpointing": true,
     "logging_first_step": false,
