diff --git a/training/benchmarks/swin_transformer/pytorch/config/_base.py b/training/benchmarks/swin_transformer/pytorch/config/_base.py index 79395efeb..6bf62e6da 100644 --- a/training/benchmarks/swin_transformer/pytorch/config/_base.py +++ b/training/benchmarks/swin_transformer/pytorch/config/_base.py @@ -4,6 +4,8 @@ vendor: str = None # model name name: str = "swin_transformer" +cudnn_benchmark: bool = False +cudnn_deterministic: bool = True # ----------------------------------------------------------------------------- # Data settings diff --git a/training/benchmarks/swin_transformer/pytorch/dataloaders/dataloader.py b/training/benchmarks/swin_transformer/pytorch/dataloaders/dataloader.py index 7664a7fb3..69a4378aa 100644 --- a/training/benchmarks/swin_transformer/pytorch/dataloaders/dataloader.py +++ b/training/benchmarks/swin_transformer/pytorch/dataloaders/dataloader.py @@ -16,10 +16,11 @@ from .cached_image_folder import CachedImageFolder from .samplers import SubsetRandomSampler +from driver import dist_pytorch + try: from torchvision.transforms import InterpolationMode - def _pil_interp(method): if method == 'bicubic': return InterpolationMode.BICUBIC @@ -31,7 +32,6 @@ def _pil_interp(method): # default bilinear, do we want to allow nearest? 
return InterpolationMode.BILINEAR - import timm.data.transforms as timm_transforms timm_transforms._pil_interp = _pil_interp @@ -43,21 +43,26 @@ def build_loader(config): # config.defrost() dataset_train, config.model_num_classes = build_dataset(is_train=True, config=config) # config.freeze() - print(f"local rank {config.local_rank} / global rank {dist.get_rank()} successfully build train dataset") + # bugfix for single-card training + if dist_pytorch.is_dist_avail_and_initialized(): + print(f"local rank {config.local_rank} / global rank {dist.get_rank()} successfully build train dataset") dataset_val, _ = build_dataset(is_train=False, config=config) - print(f"local rank {config.local_rank} / global rank {dist.get_rank()} successfully build val dataset") - num_tasks = dist.get_world_size() - global_rank = dist.get_rank() + # bugfix for single-card training + if dist_pytorch.is_dist_avail_and_initialized(): + print(f"local rank {config.local_rank} / global rank {dist.get_rank()} successfully build val dataset") + + num_tasks = dist.get_world_size() if dist_pytorch.is_dist_avail_and_initialized() else 1 + global_rank = dist.get_rank() if dist_pytorch.is_dist_avail_and_initialized() else 0 if config.data_zip_mode and config.data_cache_mode == 'part': - indices = np.arange(dist.get_rank(), len(dataset_train), dist.get_world_size()) + indices = np.arange(global_rank, len(dataset_train), num_tasks) sampler_train = SubsetRandomSampler(indices) else: sampler_train = torch.utils.data.DistributedSampler( dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True ) - if config.test_sequential: + if config.test_sequential or num_tasks == 1: sampler_val = torch.utils.data.SequentialSampler(dataset_val) else: sampler_val = torch.utils.data.distributed.DistributedSampler( diff --git a/training/benchmarks/swin_transformer/pytorch/run_pretraining.py b/training/benchmarks/swin_transformer/pytorch/run_pretraining.py index 564450dbb..e3f39f0e0 100755 --- 
a/training/benchmarks/swin_transformer/pytorch/run_pretraining.py +++ b/training/benchmarks/swin_transformer/pytorch/run_pretraining.py @@ -78,10 +78,11 @@ def main() -> Tuple[Any, Any]: dist_pytorch.barrier(config.vendor) model_driver.event(Event.TRAIN_START) - raw_train_start_time = logger.previous_log_time epoch = -1 max_accuracy = 0.0 + + train_start_time = time.time() for epoch in range(config.train_start_epoch, config.train_epochs): training_state.epoch = epoch @@ -102,10 +103,8 @@ def main() -> Tuple[Any, Any]: end_training_state = trainer.detect_training_status(training_state) model_driver.event(Event.TRAIN_END) - raw_train_end_time = logger.previous_log_time - training_state.raw_train_time = (raw_train_end_time - - raw_train_start_time) / 1e+3 + training_state.raw_train_time = time.time() - train_start_time return config, training_state @@ -131,5 +130,12 @@ def main() -> Tuple[Any, Any]: "final_acc5": state.eval_acc5, "raw_train_time": state.raw_train_time, "init_time": state.init_time, + "train_no_eval_time": state.no_eval_time, + "pure_training_computing_time": state.pure_compute_time, + "throughput(ips)_raw": state.num_trained_samples / state.raw_train_time, + "throughput(ips)_no_eval": + state.num_trained_samples / state.no_eval_time, + "throughput(ips)_pure_compute": + state.num_trained_samples / state.pure_compute_time, } logger.log(Event.FINISHED, message=finished_info, stacklevel=0) diff --git a/training/benchmarks/swin_transformer/pytorch/train/trainer.py b/training/benchmarks/swin_transformer/pytorch/train/trainer.py index 98e0436d8..c4a12d825 100644 --- a/training/benchmarks/swin_transformer/pytorch/train/trainer.py +++ b/training/benchmarks/swin_transformer/pytorch/train/trainer.py @@ -1,11 +1,9 @@ -import os -import sys import time import datetime import torch from torch.types import Device -from timm.utils import accuracy, AverageMeter +from timm.utils import AverageMeter from driver import Driver, Event, dist_pytorch from 
train.training_state import TrainingState @@ -27,6 +25,7 @@ def train_one_epoch(self, model, criterion, dataloader, optimizer, epoch, mixup_ model.train() optimizer.zero_grad() + no_eval_start_time = time.time() num_steps = len(dataloader) batch_time = AverageMeter() @@ -41,6 +40,9 @@ def train_one_epoch(self, model, criterion, dataloader, optimizer, epoch, mixup_ state.global_steps += 1 samples = samples.cuda(non_blocking=True) targets = targets.cuda(non_blocking=True) + state.num_trained_samples += samples.size(0) * self.config.n_device + + pure_compute_start_time = time.time() if mixup_fn is not None: samples, targets = mixup_fn(samples, targets) @@ -70,6 +72,8 @@ def train_one_epoch(self, model, criterion, dataloader, optimizer, epoch, mixup_ end = time.time() state.loss = loss_meter.val + state.pure_compute_time += time.time() - pure_compute_start_time + other_state = dict() if state.global_steps % self.config.gradient_accumulation_steps == 0: step_end_time = time.time() @@ -91,6 +95,7 @@ def train_one_epoch(self, model, criterion, dataloader, optimizer, epoch, mixup_ loss=state.loss) epoch_time = time.time() - start + state.no_eval_time += time.time() - no_eval_start_time if config.local_rank == 0: print("EPOCH {} training takes {}".format(epoch, datetime.timedelta(seconds=int(epoch_time)))) diff --git a/training/benchmarks/swin_transformer/pytorch/train/training_state.py b/training/benchmarks/swin_transformer/pytorch/train/training_state.py index ca7753267..e9b3575dd 100644 --- a/training/benchmarks/swin_transformer/pytorch/train/training_state.py +++ b/training/benchmarks/swin_transformer/pytorch/train/training_state.py @@ -27,6 +27,8 @@ class TrainingState: init_time = 0 raw_train_time = 0 + no_eval_time = 0 + pure_compute_time = 0 def status(self): if self.converged: diff --git a/training/benchmarks/swin_transformer/pytorch/utils/utils.py b/training/benchmarks/swin_transformer/pytorch/utils/utils.py index cc21b6032..1b2225a5d 100644 --- 
a/training/benchmarks/swin_transformer/pytorch/utils/utils.py +++ b/training/benchmarks/swin_transformer/pytorch/utils/utils.py @@ -5,7 +5,6 @@ # Written by Ze Liu # -------------------------------------------------------- -import os import torch import torch.distributed as dist from torch._six import inf @@ -13,8 +12,12 @@ def reduce_tensor(tensor): rt = tensor.clone() - dist.all_reduce(rt, op=dist.ReduceOp.SUM) - rt /= dist.get_world_size() + # bugfix for 1x1 training + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + rt /= dist.get_world_size() + else: + return tensor return rt diff --git a/training/nvidia/swin_transformer-pytorch/README.md b/training/nvidia/swin_transformer-pytorch/README.md index 46ac31803..734369646 100644 --- a/training/nvidia/swin_transformer-pytorch/README.md +++ b/training/nvidia/swin_transformer-pytorch/README.md @@ -18,10 +18,29 @@ ### 运行情况 -| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s) | -| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- | -| 单机1卡 | config_A100x1x1 | | | | | | -| 单机2卡 | config_A100x1x2 | | | | | | -| 单机4卡 | config_A100x1x4 | | | | | | -| 单机8卡 | config_A100x1x8 | 109571.12 | 81.00 | 81.12 | 187500 | 3505.07 | -| 两机8卡 | config_A100x2x8 | | | | | | +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | --------------------------------------------- | ------------------------------------------- | +| 任务类别 | Image Classification && Semantic Segmentation | | +| 模型 | swin_transformer | | +| 数据集 | Imagenet2012 1K | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | nvidia A100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 |
val_loss,见“性能指标” | 验证loss | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem | +| ----------------- | --------- | ------ | -------- | ------- | ------- | ------ | ---------- | --------- | +| A100单机8卡(1x8) | amp | / | 109832 | 3410 | 3481 | 3511 | 81.12 | 28.9/40.0 | +| A100单机8卡(1x8) | amp | bs=384 | | 3457 | 3535 | 3573 | | 37.6/40.0 | +| A100单机单卡(1x1) | amp | bs=384 | | 451 | 457 | 458 | | 36.0/40.0 | +| A100两机8卡(2x8) | amp | bs=384 | | 6733 | 6947 | 7073 | | 39.6/40.0 |