
Commit 6554f61

update loading pretrain dataset
1 parent: e4786fa

2 files changed, +4 -6 lines


train.py (+2, -2)
@@ -103,7 +103,7 @@ def group_texts(examples):
         try:
             processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
             logger.info(f'Finished loading datasets-{file_name} from cache')
-        except:
+        except Exception:
             tmp_cache_path = join(cache_path, 'tmp')  # temporary cache directory, will be deleted automatically
             logger.info(f'There is no cache of file {file_name}, start preprocessing...')
             raw_dataset = load_dataset("json", data_files=file, cache_dir=tmp_cache_path, keep_in_memory=False)
@@ -129,7 +129,7 @@ def group_texts(examples):
             processed_dataset = grouped_datasets
             processed_dataset.save_to_disk(cache_path)
             # delete the temporary directory
-            shutil.rmtree(tmp_cache_path)
+            # shutil.rmtree(tmp_cache_path)
 
         logger.info(f"Training number of {file_name}: {len(processed_dataset['train'])}")
         if idx == 0:
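
The two hunks above narrow the error handling and stop deleting the temporary cache. A bare except: also swallows KeyboardInterrupt and SystemExit, so except Exception: only catches the expected "no cache yet" failure; the shutil.rmtree call is left commented out, presumably because the Arrow files under the temporary directory may still be memory-mapped by the loaded dataset. Below is a minimal, self-contained sketch of the load-from-cache-or-preprocess pattern, assuming the Hugging Face datasets library; load_or_preprocess is a hypothetical helper name, and the tokenize/group_texts steps are elided.

import logging
import shutil
from os.path import join

import datasets
from datasets import load_dataset

logger = logging.getLogger(__name__)

def load_or_preprocess(file, cache_path, file_name):
    """Return the processed dataset from cache, or build and cache it (hypothetical helper)."""
    try:
        processed_dataset = datasets.load_from_disk(cache_path, keep_in_memory=False)
        logger.info(f'Finished loading datasets-{file_name} from cache')
    except Exception:  # narrow: does not swallow KeyboardInterrupt/SystemExit
        tmp_cache_path = join(cache_path, 'tmp')  # temporary cache directory
        logger.info(f'There is no cache of file {file_name}, start preprocessing...')
        raw_dataset = load_dataset("json", data_files=file,
                                   cache_dir=tmp_cache_path, keep_in_memory=False)
        processed_dataset = raw_dataset  # tokenize + group_texts omitted in this sketch
        processed_dataset.save_to_disk(cache_path)
        # shutil.rmtree(tmp_cache_path)  # kept commented out, as in the commit;
        # the Arrow files under tmp may still be memory-mapped by raw_dataset
    return processed_dataset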

train_args/pretrain/full/qwen-7b-pretrain-full.json (+2, -4)
@@ -4,7 +4,6 @@
     "deepspeed": "./train_args/ds_z3_config.json",
     "train_file": "./data/pretrain",
     "num_train_epochs": 1,
-    "max_steps": 1000000,
     "tokenize_num_workers": 10,
     "per_device_train_batch_size": 4,
     "gradient_accumulation_steps": 4,
@@ -13,10 +12,9 @@
     "task_type": "pretrain",
 
     "logging_steps": 500,
-    "save_steps": 10000,
-    "save_total_limit": 1,
+    "save_steps": 500,
     "lr_scheduler_type": "cosine",
-    "warmup_steps": 1000,
+    "warmup_ratio": 0.01,
 
     "gradient_checkpointing": true,
     "logging_first_step": false,
