-
Notifications
You must be signed in to change notification settings - Fork 82
/
Copy pathbenchmarks.yml
84 lines (80 loc) · 2.24 KB
/
benchmarks.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
---
# Settings shared by the benchmark entries in this file; each entry pulls
# them in through the `*common_options` alias in its merge key.
common_options: &common_options
  # Reported metrics. The second element of each pair names an entry under
  # `data`; the first is presumably the unit/label shown for it — confirm
  # against the benchmark harness.
  output:
    - [samples/sec, throughput]
    - [loss, loss]
  # How each metric is extracted: one capture-group regexp per metric.
  # `skip` and `reduction_type` are interpreted by the harness (presumably
  # "ignore the first match" and "keep the last value" — TODO confirm).
  data:
    throughput:
      # NOTE(review): `:*` matches zero or more colons; `: *` (as in the loss
      # pattern below) may have been intended — confirm against the log format.
      regexp: 'throughput:*(.*?) avg:.* samples\/sec'
      skip: 1
    loss:
      reduction_type: final
      regexp: 'loss: *(\d*\.\d*)'
      skip: 1
  # Environment variables for the benchmark command.
  env:
    # JSON object passed as a single string. Every line of this folded
    # scalar sits at the same indent so the lines join with single spaces.
    POPLAR_ENGINE_OPTIONS: >-
      {
      "opt.enableMultiAccessCopies":"false",
      "target.hostSyncTimeout":"3000"
      }
    # Referenced as $PYTORCH_EXE_DIR by the pod64 command in this file.
    PYTORCH_EXE_DIR: '/tmp/pt_cache/'
  description: |
    Dino training with real data
# Per-benchmark setup shared through the `*config_options` alias: the
# requirements file to use and the commands listed to run beforehand.
config_options: &config_options
  requirements_path: requirements.txt
  pre_run_commands:
    - sh make_ema.sh
# Runs script/linear_train.py on the vit_base architecture, loading the
# pretrained weights from checkpoint_dino_vit_base/checkpoint.pth.
pytorch_dino_finetune_pod16:
  # Shared settings; keys written directly in this entry take precedence
  # over merged ones.
  <<:
    - *common_options
    - *config_options
  # Folded scalar: all lines share one indent, so they join into a single
  # space-separated command line.
  cmd: >-
    python3 script/linear_train.py
    --arch vit_base --n_last_blocks 1
    --data_path $DATASETS_DIR/imagenet-raw-dataset
    --pretrained_weights checkpoint_dino_vit_base/checkpoint.pth
    --replica 8 --ga 64 --batch_size 4
    --output vit_linear
# Runs train_ipu.py with the vit_base_pod16 config on the ImageNet train
# split for 2 epochs with no warmup epochs.
pytorch_dino_train_real_pod16:
  # Shared settings; keys written directly in this entry take precedence
  # over merged ones.
  <<:
    - *common_options
    - *config_options
  # Folded scalar: all lines share one indent, so they join into a single
  # space-separated command line.
  cmd: >-
    python3 train_ipu.py
    --config vit_base_pod16
    --data_path $DATASETS_DIR/imagenet-raw-dataset/train
    --epochs 2 --warmup_epochs 0
# Launches train_ipu.py (vit_base_pod64 config) through poprun with
# 8 instances, 8 replicas and 8 IPUs per replica.
# Needs $HOSTS, $TCP_IF_INCLUDE, $IPUOF_VIPU_API_HOST and
# $IPUOF_VIPU_API_PARTITION_ID to be set in the launching environment.
pytorch_dino_train_real_pod64_conv:
  # Shared settings; keys written directly in this entry take precedence
  # over merged ones.
  <<:
    - *common_options
    - *config_options
  # Folded scalar: all lines share one indent, so they join into a single
  # space-separated command line. The lines that open the quoted
  # --mpi-global-args / --mpi-local-args strings are kept on their own so
  # the folded value keeps the space after each opening quote.
  cmd: >-
    poprun -vv
    --host $HOSTS
    --num-instances=8 --num-replicas=8 --ipus-per-replica=8
    --remove-partition=no
    --vipu-server-host=$IPUOF_VIPU_API_HOST
    --vipu-partition=$IPUOF_VIPU_API_PARTITION_ID
    --vipu-server-timeout=3600
    --mpi-global-args="
    --mca oob_tcp_if_include $TCP_IF_INCLUDE
    --mca btl_tcp_if_include $TCP_IF_INCLUDE"
    --mpi-local-args="
    -x OPAL_PREFIX -x LD_LIBRARY_PATH -x PATH -x PYTHONPATH -x CPATH
    -x IPUOF_VIPU_API_TIMEOUT=3600
    -x POPLAR_LOG_LEVEL=WARN
    -x POPLAR_SDK_ENABLED
    -x POPLAR_ENGINE_OPTIONS"
    python3 train_ipu.py
    --config vit_base_pod64
    --data_path $DATASETS_DIR/imagenet-raw-dataset/train
    --ga 200 --batch_size 2 --rebatched_worker_size 400
    --executable-cache-dir $PYTORCH_EXE_DIR
    --wandb --wandb-run-name pytorch_dino_train_real_pod64_conv
    --checkpoint-output-dir output