Files
ViGent2/models/LatentSync/configs/syncnet/syncnet_16_latent.yaml
2026-01-21 10:30:32 +08:00

47 lines
1.2 KiB
YAML

# Network architecture: one encoder per modality.
# In each encoder, block_out_channels, downsample_factors and attn_blocks
# all list six entries (presumably one per encoder block — confirm against
# the model code).
model:
  audio_encoder:  # input (1, 80, 52)
    in_channels: 1
    block_out_channels: [32, 64, 128, 256, 512, 1024]
    # A nested pair such as [2, 1] presumably gives a separate factor per
    # axis, while a bare integer applies to both — TODO confirm.
    downsample_factors: [[2, 1], 2, 2, 2, 2, [2, 3]]
    attn_blocks: [0, 0, 0, 0, 0, 0]
    dropout: 0.0
  visual_encoder:  # input (64, 32, 32)
    in_channels: 64
    block_out_channels: [64, 128, 256, 256, 512, 1024]
    downsample_factors: [2, 2, 2, 1, 2, 2]
    attn_blocks: [0, 0, 0, 0, 0, 0]
    dropout: 0.0
# Checkpoint settings. Both paths are empty strings in this config.
ckpt:
  resume_ckpt_path: ""
  inference_ckpt_path: ""
  # Presumably a checkpoint is written every this many training steps
  # — confirm against the training script.
  save_ckpt_steps: 2500
# Dataset, loader and preprocessing settings.
# NOTE(review): the absolute /mnt/bn/... paths below are machine-specific and
# must be adjusted for the local environment.
data:
  train_output_dir: debug/syncnet
  num_val_samples: 1200
  batch_size: 120  # 40
  gradient_accumulation_steps: 1
  num_workers: 12  # 12
  latent_space: true
  num_frames: 16
  resolution: 256
  # Training uses a file list with an empty data dir; validation uses a data
  # dir with an empty file list (presumably the two are alternatives —
  # verify against the dataset loader).
  train_fileslist: /mnt/bn/maliva-gen-ai-v2/chunyu.li/fileslist/data_v10_core.txt
  train_data_dir: ""
  val_fileslist: ""
  val_data_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/VoxCeleb2/high_visual_quality/val
  audio_mel_cache_dir: /mnt/bn/maliva-gen-ai-v2/chunyu.li/audio_cache/mel
  lower_half: false
  audio_sample_rate: 16000
  video_fps: 25
# Optimizer settings.
optimizer:
  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
  # read the bare form `1e-5` as a string rather than a float.
  lr: 1.0e-5
  max_grad_norm: 1.0
# Run-level training settings.
run:
  max_train_steps: 10000000
  # Presumably validation runs every this many training steps — confirm
  # against the training script.
  validation_steps: 2500
  mixed_precision_training: true
  seed: 42