# landscapes_audio-cond_cfg.yaml
# Experiment settings: output/log paths, logger selection, and seed.
# All three paths share the "exps/audio-cond_animation/landscapes_audio-cond_cfg"
# prefix; keep them in sync when renaming the run.
exp:
  output_dir: "exps/audio-cond_animation/landscapes_audio-cond_cfg"
  project_dir: "exps/audio-cond_animation/landscapes_audio-cond_cfg/log"
  log_file: "exps/audio-cond_animation/landscapes_audio-cond_cfg/train.log"
  log_with: "wandb"
  seed: 123
# Model components: noise scheduler, audio encoder, VAE, and UNet.
# Each component is its own nested mapping; written flat, the repeated `name`
# and `pretrained_model_name_or_path` keys would collide in a single mapping.
model:
  scheduler:
    name: "DDPMScheduler"
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    dynamic_thresholding_ratio: 0.995
    num_train_timesteps: 1000
    prediction_type: "epsilon"
    steps_offset: 1
    clip_sample: false
    sample_max_value: 1.0
    thresholding: false
    trained_betas: null
  audio_encoder:
    name: "ImageBindSegmaskAudioEncoder"
    # NOTE(review): same value as `video_num_frame` (12) in the dataset
    # settings; presumably the two must stay equal - TODO confirm.
    n_segment: 12
  vae:
    name: "AutoencoderKL"
    pretrained_model_name_or_path: "./pretrained/stable-diffusion-v1-5"
    subfolder: "vae"
  unet:
    pretrained_model_name_or_path: "./pretrained/stable-diffusion-v1-5"
    # Four down blocks and four up blocks; only the innermost pair
    # (last down, first up) uses the non-audio "Res" block type.
    down_block_types:
      - "FFSpatioAudioTempCrossAttnDownBlock3D"
      - "FFSpatioAudioTempCrossAttnDownBlock3D"
      - "FFSpatioAudioTempCrossAttnDownBlock3D"
      - "FFSpatioTempResDownBlock3D"
    up_block_types:
      - "FFSpatioTempResUpBlock3D"
      - "FFSpatioAudioTempCrossAttnUpBlock3D"
      - "FFSpatioAudioTempCrossAttnUpBlock3D"
      - "FFSpatioAudioTempCrossAttnUpBlock3D"
    mid_block_type: "FFSpatioAudioTempCrossAttnUNetMidBlock3D"
    # NOTE(review): the keys from here down are placed under `unet` because
    # they directly follow the UNet keys. They may instead belong directly
    # under `model` - confirm against the code that reads this config.
    train_image_modules: false
    trainable_modules: ["_temp", "_audio"]
    # Only the audio condition has a non-zero drop probability.
    audio_cond_drop_prob: 0.2
    text_cond_drop_prob: 0.0
    image_cond_drop_prob: 0.0
    loss_on_first_frame: false
# Training data settings: dataset selection, batch size, logging interval,
# and the options for the Landscapes training split.
train:
  dataset_name: "base_audio_video"
  batch_size: 4
  log_steps: 10
  dataset:
    data_root: "./datasets/Landscapes/videos/train"
    example_list_path: "./datasets/Landscapes/train.txt"
    example_list_type: "video"
    mode: "train"
    img_size: [256, 256]
    randflip: true
    randcrop: false
    # 12 frames at 6 fps, i.e. two seconds of video per clip.
    video_fps: 6
    video_num_frame: 12
    class_mapping_json: "./datasets/Landscapes/class_mapping.json"
    class_text_encoding_mapping_pt: "./datasets/Landscapes/class_clip_text_encodings_stable-diffusion-v1-5.pt"
# Optimizer, learning-rate schedule, checkpointing, and precision settings.
# NOTE(review): `optim` is written as a top-level section here; if the training
# code reads it from under `train`, indent this whole block by two spaces.
optim:
  max_train_steps: 400000
  # Exponent floats carry an explicit ".0": YAML 1.1 loaders (e.g. PyYAML)
  # read a bare `1e-4` as a string rather than a float.
  learning_rate: 1.0e-4
  scale_lr: false
  # NOTE(review): if this name is passed to diffusers' `get_scheduler`, the
  # "constant" schedule does not use `lr_warmup_steps`; warmup would need
  # "constant_with_warmup" - confirm which is intended.
  lr_scheduler: "constant"
  lr_warmup_steps: 10
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_weight_decay: 1.0e-2
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0
  gradient_accumulation_steps: 2
  checkpointing_steps: 1000
  checkpointing_milestones: 24000
  resume_from_checkpoint: "latest"
  mixed_precision: "fp16"
  use_8bit_adam: false
  enable_gradient_checkpoint: true