diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index 3435ea2ecdbd..c7976ef350e1 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -200,7 +200,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -227,7 +227,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -485,7 +485,10 @@ def group_texts(examples):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("clm_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("clm_no_trainer", experiment_config)

     # Train!
     total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -571,12 +574,7 @@ def group_texts(examples):
         if args.with_tracking:
             accelerator.log(
-                {
-                    "perplexity": perplexity,
-                    "train_loss": total_loss,
-                    "epoch": epoch,
-                },
-                step=completed_steps,
+                {"perplexity": perplexity, "train_loss": total_loss, "epoch": epoch, "step": completed_steps},
             )

         if args.push_to_hub and epoch < args.num_train_epochs - 1:
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 2720e76d02e9..6ff58ddc7f32 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -209,7 +209,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -238,7 +238,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -531,7 +531,10 @@ def group_texts(examples):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("clm_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("mlm_no_trainer", experiment_config)

     # Train!
     total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -618,12 +621,7 @@ def group_texts(examples):
         if args.with_tracking:
             accelerator.log(
-                {
-                    "perplexity": perplexity,
-                    "train_loss": total_loss,
-                    "epoch": epoch,
-                },
-                step=completed_steps,
+                {"perplexity": perplexity, "train_loss": total_loss, "epoch": epoch, "step": completed_steps},
             )

         if args.push_to_hub and epoch < args.num_train_epochs - 1:
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index f845cd43e2c8..63b799c0929b 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -192,7 +192,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -265,7 +265,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -485,7 +485,10 @@ def preprocess_function(examples):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("clm_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("swag_no_trainer", experiment_config)

     # Metrics
     metric = load_metric("accuracy")
@@ -570,12 +573,7 @@ def preprocess_function(examples):
         if args.with_tracking:
             accelerator.log(
-                {
-                    "accuracy": eval_metric,
-                    "train_loss": total_loss,
-                    "epoch": epoch,
-                },
-                step=completed_steps,
+                {"accuracy": eval_metric, "train_loss": total_loss, "epoch": epoch, "step": completed_steps},
             )

         if args.push_to_hub and epoch < args.num_train_epochs - 1:
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index bdfa1e189322..a53a3cf54822 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -224,7 +224,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -259,7 +259,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -723,7 +723,10 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("clm_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("qa_beam_search_no_trainer", experiment_config)

     # Train!
     total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -916,11 +919,12 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
             "squad_v2" if args.version_2_with_negative else "squad": eval_metric,
             "train_loss": total_loss,
             "epoch": epoch,
+            "step": completed_steps,
         }
         if args.do_predict:
             log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric

-        accelerator.log(log, step=completed_steps)
+        accelerator.log(log)

     if args.checkpointing_steps == "epoch":
         accelerator.save_state(f"epoch_{epoch}")
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 925c31b70642..7c7b54004ff5 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -254,7 +254,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -289,7 +289,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -730,7 +730,10 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("clm_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("qa_no_trainer", experiment_config)

     # Train!
     total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
@@ -889,11 +892,12 @@ def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
             "squad_v2" if args.version_2_with_negative else "squad": eval_metric,
             "train_loss": total_loss,
             "epoch": epoch,
+            "step": completed_steps,
         }
        if args.do_predict:
             log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric

-        accelerator.log(log, step=completed_steps)
+        accelerator.log(log)

     if args.output_dir is not None:
         accelerator.wait_for_everyone()
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 1c93b064a6d1..48c0aada34b4 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -277,7 +277,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -315,7 +315,7 @@ def main():
     )
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -548,7 +548,10 @@ def postprocess_text(preds, labels):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("summarization_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("summarization_no_trainer", experiment_config)

     # Metric
     metric = load_metric("rouge")
@@ -666,7 +669,8 @@ def postprocess_text(preds, labels):
         if args.with_tracking:
             result["train_loss"] = total_loss
             result["epoch"] = epoch
-            accelerator.log(result, step=completed_steps)
+            result["step"] = completed_steps
+            accelerator.log(result)

         if args.push_to_hub and epoch < args.num_train_epochs - 1:
             accelerator.wait_for_everyone()
diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py
index f4fb8d0d52a6..720a6f707914 100644
--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -104,7 +104,8 @@ def test_run_glue_no_trainer(self):
             --learning_rate=1e-4
             --seed=42
             --checkpointing_steps epoch
-        """.split()
+            --with_tracking
+        """.split()

         if is_cuda_and_apex_available():
             testargs.append("--fp16")
@@ -114,6 +115,7 @@ def test_run_glue_no_trainer(self):
             result = get_results(tmp_dir)
             self.assertGreaterEqual(result["eval_accuracy"], 0.75)
             self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "glue_no_trainer")))

     def test_run_clm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
@@ -128,7 +130,8 @@ def test_run_clm_no_trainer(self):
             --num_train_epochs 2
             --output_dir {tmp_dir}
             --checkpointing_steps epoch
-        """.split()
+            --with_tracking
+        """.split()

         if torch.cuda.device_count() > 1:
             # Skipping because there are not enough batches to train the model + would need a drop_last to work.
@@ -139,6 +142,7 @@ def test_run_clm_no_trainer(self):
             result = get_results(tmp_dir)
             self.assertLess(result["perplexity"], 100)
             self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "clm_no_trainer")))

     def test_run_mlm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
@@ -150,6 +154,7 @@ def test_run_mlm_no_trainer(self):
             --output_dir {tmp_dir}
             --num_train_epochs=1
             --checkpointing_steps epoch
+            --with_tracking
         """.split()

         with patch.object(sys, "argv", testargs):
@@ -157,6 +162,7 @@ def test_run_mlm_no_trainer(self):
             result = get_results(tmp_dir)
             self.assertLess(result["perplexity"], 42)
             self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "mlm_no_trainer")))

     def test_run_ner_no_trainer(self):
         # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
@@ -175,6 +181,7 @@ def test_run_ner_no_trainer(self):
             --num_train_epochs={epochs}
             --seed 7
             --checkpointing_steps epoch
+            --with_tracking
         """.split()

         with patch.object(sys, "argv", testargs):
@@ -183,6 +190,7 @@ def test_run_ner_no_trainer(self):
             self.assertGreaterEqual(result["eval_accuracy"], 0.75)
             self.assertLess(result["train_loss"], 0.5)
             self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "ner_no_trainer")))

     def test_run_squad_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
@@ -199,6 +207,7 @@ def test_run_squad_no_trainer(self):
             --per_device_train_batch_size=2
             --per_device_eval_batch_size=1
             --checkpointing_steps epoch
+            --with_tracking
         """.split()

         with patch.object(sys, "argv", testargs):
@@ -207,6 +216,7 @@ def test_run_squad_no_trainer(self):
             self.assertGreaterEqual(result["eval_f1"], 30)
             self.assertGreaterEqual(result["eval_exact"], 30)
             self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "qa_no_trainer")))

     def test_run_swag_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
@@ -221,12 +231,14 @@ def test_run_swag_no_trainer(self):
             --learning_rate=2e-4
             --per_device_train_batch_size=2
             --per_device_eval_batch_size=1
+            --with_tracking
         """.split()

         with patch.object(sys, "argv", testargs):
             run_swag_no_trainer.main()
             result = get_results(tmp_dir)
             self.assertGreaterEqual(result["eval_accuracy"], 0.8)
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "swag_no_trainer")))

     @slow
     def test_run_summarization_no_trainer(self):
@@ -243,6 +255,7 @@ def test_run_summarization_no_trainer(self):
             --per_device_train_batch_size=2
             --per_device_eval_batch_size=1
             --checkpointing_steps epoch
+            --with_tracking
         """.split()

         with patch.object(sys, "argv", testargs):
@@ -253,6 +266,7 @@ def test_run_summarization_no_trainer(self):
             self.assertGreaterEqual(result["eval_rougeL"], 7)
             self.assertGreaterEqual(result["eval_rougeLsum"], 7)
             self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "summarization_no_trainer")))

     @slow
     def test_run_translation_no_trainer(self):
@@ -273,6 +287,7 @@ def test_run_translation_no_trainer(self):
             --source_lang en_XX
             --target_lang ro_RO
             --checkpointing_steps epoch
+            --with_tracking
         """.split()

         with patch.object(sys, "argv", testargs):
@@ -280,3 +295,4 @@ def test_run_translation_no_trainer(self):
             result = get_results(tmp_dir)
             self.assertGreaterEqual(result["eval_bleu"], 30)
             self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
+            self.assertTrue(os.path.exists(os.path.join(tmp_dir, "translation_no_trainer")))
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index 9730c2f34568..0842b462f9cb 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -165,7 +165,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -192,7 +192,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -430,7 +430,10 @@ def preprocess_function(examples):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("glue_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("glue_no_trainer", experiment_config)

     # Get the metric function
     if args.task_name is not None:
@@ -520,8 +523,8 @@ def preprocess_function(examples):
                     "accuracy" if args.task_name is not None else "glue": eval_metric,
                     "train_loss": total_loss,
                     "epoch": epoch,
+                    "step": completed_steps,
                 },
-                step=completed_steps,
             )

         if args.push_to_hub and epoch < args.num_train_epochs - 1:
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 26f1ff414017..6351d2625617 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -219,7 +219,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -246,7 +246,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
         format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -545,7 +545,10 @@ def tokenize_and_align_labels(examples):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("clm_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("ner_no_trainer", experiment_config)

     # Metrics
     metric = load_metric("seqeval")
@@ -676,12 +679,7 @@ def compute_metrics():
         accelerator.print(f"epoch {epoch}:", eval_metric)
         if args.with_tracking:
             accelerator.log(
-                {
-                    "seqeval": eval_metric,
-                    "train_loss": total_loss,
-                    "epoch": epoch,
-                },
-                step=completed_steps,
+                {"seqeval": eval_metric, "train_loss": total_loss, "epoch": epoch, "step": completed_steps},
             )

         if args.push_to_hub and epoch < args.num_train_epochs - 1:
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index 024d70519466..706d7637fd97 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -258,7 +258,7 @@ def parse_args():
     )
     parser.add_argument(
         "--with_tracking",
-        required=False,
+        action="store_true",
         help="Whether to load in all available experiment trackers from the environment and use them for logging.",
     )
     args = parser.parse_args()
@@ -287,7 +287,7 @@ def main():
     # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
     # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
-    accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
+    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
@@ -526,7 +526,10 @@ def preprocess_function(examples):
     # We need to initialize the trackers we use, and also store our configuration
     if args.with_tracking:
-        accelerator.init_trackers("translation_no_trainer", args)
+        experiment_config = vars(args)
+        # TensorBoard cannot log Enums, need the raw value
+        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
+        accelerator.init_trackers("translation_no_trainer", experiment_config)

     metric = load_metric("sacrebleu")
@@ -644,12 +647,7 @@ def postprocess_text(preds, labels):
         if args.with_tracking:
             accelerator.log(
-                {
-                    "blue": eval_metric["score"],
-                    "train_loss": total_loss,
-                    "epoch": epoch,
-                },
-                step=completed_steps,
+                {"blue": eval_metric["score"], "train_loss": total_loss, "epoch": epoch, "step": completed_steps},
             )

         if args.push_to_hub and epoch < args.num_train_epochs - 1:
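
Note: all ten scripts in this patch converge on the same tracking flow. The listing below is a minimal, self-contained sketch of that flow for reference, assuming only the accelerate APIs already exercised in the diff (Accelerator(log_with=..., logging_dir=...), init_trackers, log) plus accelerate's end_training; the project name "my_experiment" and the toy loop are hypothetical stand-ins, not code from any of these files.

import argparse

from accelerate import Accelerator
from transformers import SchedulerType


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default=None)
    # argparse runs string defaults through `type`, so the default also arrives as a SchedulerType enum
    parser.add_argument("--lr_scheduler_type", type=SchedulerType, default="linear")
    # `action="store_true"` makes this a boolean flag defaulting to False; the previous
    # `required=False` defined an option that still expected a value argument
    parser.add_argument("--with_tracking", action="store_true")
    return parser.parse_args()


def main():
    args = parse_args()
    # log_with="all" picks up every tracker installed in the environment; logging_dir points
    # file-based trackers such as TensorBoard at the run's output directory
    accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()

    if args.with_tracking:
        experiment_config = vars(args)
        # TensorBoard cannot log Enums, so store the raw string value instead
        experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
        accelerator.init_trackers("my_experiment", experiment_config)

    completed_steps = 0
    for epoch in range(2):  # stand-in for the real training loop
        completed_steps += 100
        if args.with_tracking:
            # the step counter now travels inside the metrics dict instead of the `step=` kwarg
            accelerator.log({"train_loss": 0.0, "epoch": epoch, "step": completed_steps})

    if args.with_tracking:
        accelerator.end_training()


if __name__ == "__main__":
    main()

Running the sketch as, say, `python sketch.py --with_tracking --output_dir runs` would create a "my_experiment" subdirectory under runs/, which is the same layout the new assertTrue checks in test_accelerate_examples.py rely on.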