LLM Laboratory

Date: 11.08.2025

WAN 2.1 1.3B diffusers ROCm PyTorch Test

Requirements

- An AMD GPU supported by ROCm 6.2 (this test targets an MI200-class card)
- Python 3 with a ROCm build of PyTorch
- bitsandbytes compiled for ROCm, plus diffusers, transformers, and accelerate
- git with git-lfs to fetch the model weights

Steps

Prepare the Python environment for ROCm + PyTorch + BitsAndBytes:
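
For example (a sketch assuming ROCm 6.2 and the official PyTorch ROCm wheel index; the package list is inferred from the imports the test script uses, so adjust to your setup):

python3 -m venv venv
source venv/bin/activate
pip install --upgrade pip
# ROCm build of PyTorch; match the index to your ROCm version
pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm6.2
# Libraries used by the script below; imageio/imageio-ffmpeg back export_to_video
pip install diffusers transformers accelerate sentencepiece imageio imageio-ffmpeg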

Compile bitsandbytes from source for ROCm 6.2:
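
A minimal sketch following the upstream bitsandbytes build instructions for the HIP backend (pin whatever branch or commit your ROCm version needs):

git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git
cd bitsandbytes
cmake -DCOMPUTE_BACKEND=hip -S .    # HIP backend targets ROCm
make
pip install .

Then check that PyTorch sees the GPU through ROCm and that bitsandbytes imports cleanly: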

python3 -c "import torch; print(torch.__version__); print(torch.cuda.is_available()); print(torch.version.hip);print(torch.cuda.get_device_name(0));"
python -m bitsandbytes

Download the Wan2.1-VACE-1.3B weights:

git lfs install
git clone https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B-diffusers wan2.1d

Create the script test_rocm_wan2.1d.py:

import torch
from diffusers import AutoencoderKLWan, WanVACEPipeline, WanVACETransformer3DModel
from diffusers import BitsAndBytesConfig as DF_BNB
from transformers import UMT5EncoderModel
from transformers import BitsAndBytesConfig as TF_BNB
import time

import numpy as np
from PIL import Image
from diffusers.utils import load_image, export_to_video

model_dir = "/home/sysadmin/llm/wan2.1d"

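# 4-bit NF4 quantization for the transformer (diffusers-side BitsAndBytesConfig)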
bnb_tx = DF_BNB(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
transformer = WanVACETransformer3DModel.from_pretrained(
    model_dir, 
    subfolder="transformer",
    quantization_config=bnb_tx, 
    torch_dtype=torch.float16
)

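# 4-bit NF4 quantization for the UMT5 text encoder (transformers-side BitsAndBytesConfig)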
bnb_te = TF_BNB(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
text_encoder = UMT5EncoderModel.from_pretrained(
    model_dir, 
    subfolder="text_encoder",
    quantization_config=bnb_te, 
    torch_dtype=torch.float16
)

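# The VAE stays in float32: Wan's VAE is kept at full precision for reconstruction quality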
vae = AutoencoderKLWan.from_pretrained(model_dir, subfolder="vae", torch_dtype=torch.float32)

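# Assemble the pipeline around the quantized transformer and text encoder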
pipe = WanVACEPipeline.from_pretrained(
    model_dir, 
    vae=vae, 
    transformer=transformer, 
    text_encoder=text_encoder,
    torch_dtype=torch.float16
)
# alternative to CPU offload below: keep the whole pipeline on the GPU
#).to("cuda")

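# Trade speed for VRAM: sliced attention plus offloading idle components to the CPU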
pipe.enable_attention_slicing("max")
pipe.enable_model_cpu_offload() 

#time.sleep(30)

####

def fit_hw_for_pipe(width, height, pipe, max_area=832*480):
    # Scale to roughly max_area pixels, preserving aspect ratio, and snap
    # both sides to the pipeline's spatial granularity (VAE spatial scale
    # factor times the transformer's patch size).
    aspect = height / width
    mod = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
    H = round(np.sqrt(max_area * aspect)) // mod * mod
    W = round(np.sqrt(max_area / aspect)) // mod * mod
    return int(W), int(H)

ref = load_image("test_bnb.png") 
W, H = fit_hw_for_pipe(ref.width, ref.height, pipe, max_area=832*480)

ref = ref.resize((W, H), Image.LANCZOS)

prompt = "ginger cat sits on a chair"
negative = "text on screen, watermarks, blurry, distortion, low quality"

result = pipe(
    prompt=prompt,
    negative_prompt=negative,
    reference_images=[ref], 
    height=H,
    width=W,
    num_frames=41,  # Wan works on 4k+1 frames (e.g. 41, 81)
    num_inference_steps=20,
    guidance_scale=5.0
).frames[0]

export_to_video(result, "test.mp4", fps=16)

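Run it (plain python3 should do on supported hardware; the HSA_OVERRIDE_GFX_VERSION line is only an assumption for consumer GPUs that ROCm does not officially support):

python3 test_rocm_wan2.1d.py
# on an unsupported consumer card you may need something like:
# HSA_OVERRIDE_GFX_VERSION=11.0.0 python3 test_rocm_wan2.1d.py
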
Open test.mp4 and enjoy the result!

Currently this does not work for me: I don't have an MI200 to test on.