Reward-Forcing

Paused

App Files Files Community

fffiloni committed 24 days ago

Commit

fa10854

verified ·

1 Parent(s): 5a46abc

English version

Browse files

Files changed (1) hide show

app_wip.py +55 -51

app_wip.py CHANGED Viewed

@@ -15,11 +15,11 @@ from pipeline import (
     CausalInferencePipeline,
 )
 from utils.dataset import TextDataset
-from utils.misc import set_seed
 from demo_utils.memory import get_cuda_free_memory_gb, DynamicSwapInstaller
 # -------------------------------------------------------------------
-# Téléchargement des checkpoints (une fois au démarrage du Space)
 # -------------------------------------------------------------------
 snapshot_download(
     repo_id="Wan-AI/Wan2.1-T2V-1.3B",
@@ -41,7 +41,7 @@ snapshot_download(
     local_dir="./checkpoints/Reward-Forcing-T2V-1.3B",
 )
-# === Chemins ===
 CONFIG_PATH = "configs/reward_forcing.yaml"
 CHECKPOINT_PATH = "checkpoints/Reward-Forcing-T2V-1.3B/rewardforcing.pt"
@@ -60,14 +60,14 @@ def reward_forcing_inference(
     progress: gr.Progress,
 ):
     """
-    Version inline / simplifiée de inference.py :
     - single GPU
-    - T2V uniquement
-    - 1 fichier .txt = n prompts (mais on retourne la 1ère vidéo)
     """
     logs = ""
-    # --------------------- Device & seed ---------------------
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     set_seed(0)
@@ -77,29 +77,31 @@ def reward_forcing_inference(
     torch.set_grad_enabled(False)
-    # --------------------- Phase 1 : init modèle / config ---------------------
-    progress(0.05, desc="Initialisation : chargement de la config")
-    logs += "Chargement de la config...\n"
     config = OmegaConf.load(CONFIG_PATH)
     default_config = OmegaConf.load("configs/default_config.yaml")
     config = OmegaConf.merge(default_config, config)
-    progress(0.15, desc="Initialisation : création de la pipeline")
-    logs += "Initialisation de la pipeline...\n"
     if hasattr(config, "denoising_step_list"):
         pipeline = CausalInferencePipeline(config, device=device)
     else:
         pipeline = CausalDiffusionInferencePipeline(config, device=device)
-    progress(0.35, desc="Initialisation : chargement du checkpoint")
-    logs += "Chargement des poids du checkpoint...\n"
     state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu")
     pipeline.generator.load_state_dict(state_dict)
     checkpoint_step = os.path.basename(os.path.dirname(CHECKPOINT_PATH))
     checkpoint_step = checkpoint_step.split("_")[-1]
-    progress(0.55, desc="Initialisation : placement sur le device")
-    logs += "Placement du modèle sur le device...\n"
     pipeline = pipeline.to(dtype=torch.bfloat16)
     if low_memory:
         DynamicSwapInstaller.install_model(pipeline.text_encoder, device=device)
@@ -108,9 +110,9 @@ def reward_forcing_inference(
     pipeline.generator.to(device=device)
     pipeline.vae.to(device=device)
-    # --------------------- Dataset / DataLoader ---------------------
-    progress(0.65, desc="Préparation du dataset")
-    logs += "Préparation du dataset (TextDataset)...\n"
     dataset = TextDataset(prompt_path=prompt_txt_path, extended_prompt_path=None)
     num_prompts = len(dataset)
     logs += f"Number of prompts: {num_prompts}\n"
@@ -122,26 +124,26 @@ def reward_forcing_inference(
         dataset, batch_size=1, sampler=sampler, num_workers=0, drop_last=False
     )
-    # --------------------- Output folder (on le vide) ---------------------
-    progress(0.7, desc="Nettoyage du dossier de sortie")
     output_folder = os.path.join(
         output_root, f"rewardforcing-{num_output_frames}f", checkpoint_step
     )
     shutil.rmtree(output_folder, ignore_errors=True)
     os.makedirs(output_folder, exist_ok=True)
-    logs += f"Dossier de sortie: {output_folder}\n"
-    # --------------------- Phase 2 : boucle d'inférence ---------------------
-    # Ici on peut utiliser progress.tqdm sur la boucle dataloader
     for i, batch_data in progress.tqdm(
         enumerate(dataloader),
         total=num_prompts,
-        desc="Génération vidéo",
         unit="prompt",
     ):
         idx = batch_data["idx"].item()
-        # Unpack batch
         if isinstance(batch_data, dict):
             batch = batch_data
         elif isinstance(batch_data, list):
@@ -151,7 +153,7 @@ def reward_forcing_inference(
         all_video = []
-        # TEXT-TO-VIDEO uniquement (pas d'I2V ici)
         prompt = batch["prompts"][0]
         extended_prompt = batch.get("extended_prompts", [None])[0]
         if extended_prompt is not None:
@@ -161,15 +163,16 @@ def reward_forcing_inference(
         initial_latent = None
         sampled_noise = torch.randn(
             [1, num_output_frames, 16, 60, 104],
             device=device,
             dtype=torch.bfloat16,
         )
-        logs += f"Génération pour le prompt: {prompt[:80]}...\n"
-        # Appel au pipeline
         video, latents = pipeline.inference(
             noise=sampled_noise,
             text_prompts=prompts,
@@ -181,23 +184,24 @@ def reward_forcing_inference(
         current_video = rearrange(video, "b t c h w -> b t h w c").cpu()
         all_video.append(current_video)
         video = 255.0 * torch.cat(all_video, dim=1)
-        # Clear VAE cache
         pipeline.vae.model.clear_cache()
-        # Sauvegarde vidéo (on retourne la 1ère vidéo)
         if idx < num_prompts:
             model = "regular" if not use_ema else "ema"
             safe_name = prompt[:50].replace("/", "_").replace("\\", "_")
             output_path = os.path.join(output_folder, f"{safe_name}.mp4")
             write_video(output_path, video[0], fps=16)
-            logs += f"Vidéo enregistrée: {output_path}\n"
-            progress(1.0, desc="Terminé ✅")
             return output_path, logs
-    logs += "[WARN] Aucune vidéo générée dans la boucle.\n"
     return None, logs
@@ -205,15 +209,15 @@ def gradio_generate(
     prompt: str, duration: str, use_ema: bool, progress=gr.Progress(track_tqdm=True)
 ):
     """
-    Fonction appelée par Gradio :
-    - écrit le prompt dans un .txt
-    - appelle reward_forcing_inference
-    - retourne (video_path, logs)
     """
     if not prompt or not prompt.strip():
-        raise gr.Error("Veuillez entrer un prompt texte 🙂")
-    # Durée -> frames
     if duration == "5s (21 frames)":
         num_output_frames = 21
     else:
@@ -236,15 +240,15 @@ def gradio_generate(
     if video_path is None or not os.path.exists(video_path):
         raise gr.Error(
-            "Aucune vidéo trouvée après l'inférence.\n"
-            "Regarde les logs ci-dessous pour voir ce qui a coincé."
         )
     return video_path, logs
 # -------------------------------------------------------------------
-# UI Gradio
 # -------------------------------------------------------------------
 with gr.Blocks(title="Reward Forcing T2V Demo (inline inference)") as demo:
@@ -252,10 +256,10 @@ with gr.Blocks(title="Reward Forcing T2V Demo (inline inference)") as demo:
         """
         # 🎬 Reward Forcing – Text-to-Video (inline)
-        Cette version appelle directement la logique d'inférence en Python,
-        ce qui permet à Gradio de suivre :
-        - l'initialisation du modèle (via `progress(...)`)
-        - la boucle de génération (via `progress.tqdm(...)`)
         """
     )
@@ -270,14 +274,14 @@ with gr.Blocks(title="Reward Forcing T2V Demo (inline inference)") as demo:
         duration = gr.Radio(
             ["5s (21 frames)", "30s (120 frames)"],
             value="5s (21 frames)",
-            label="Durée",
         )
-        use_ema = gr.Checkbox(value=True, label="Utiliser les poids EMA (--use_ema)")
-    generate_btn = gr.Button("🚀 Générer la vidéo", variant="primary")
     with gr.Row():
-        video_out = gr.Video(label="Vidéo générée")
     logs_out = gr.Textbox(
         label="Logs",
         lines=12,

     CausalInferencePipeline,
 )
 from utils.dataset import TextDataset
+    from utils.misc import set_seed
 from demo_utils.memory import get_cuda_free_memory_gb, DynamicSwapInstaller
 # -------------------------------------------------------------------
+# Download checkpoints once when the Space starts
 # -------------------------------------------------------------------
 snapshot_download(
     repo_id="Wan-AI/Wan2.1-T2V-1.3B",
     local_dir="./checkpoints/Reward-Forcing-T2V-1.3B",
 )
+# === Paths ===
 CONFIG_PATH = "configs/reward_forcing.yaml"
 CHECKPOINT_PATH = "checkpoints/Reward-Forcing-T2V-1.3B/rewardforcing.pt"
     progress: gr.Progress,
 ):
     """
+    Inline / simplified version of inference.py:
     - single GPU
+    - text-to-video only
+    - one .txt file = N prompts, but we return only the first generated video
     """
     logs = ""
+    # --------------------- Device & randomness ---------------------
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     set_seed(0)
     torch.set_grad_enabled(False)
+    # --------------------- Stage 1: model & config init ---------------------
+    progress(0.05, desc="Init: loading config")
+    logs += "Loading config...\n"
     config = OmegaConf.load(CONFIG_PATH)
     default_config = OmegaConf.load("configs/default_config.yaml")
     config = OmegaConf.merge(default_config, config)
+    progress(0.15, desc="Init: creating pipeline")
+    logs += "Creating pipeline...\n"
     if hasattr(config, "denoising_step_list"):
+        # few-step sampling pipeline
         pipeline = CausalInferencePipeline(config, device=device)
     else:
+        # full diffusion pipeline
         pipeline = CausalDiffusionInferencePipeline(config, device=device)
+    progress(0.35, desc="Init: loading checkpoint")
+    logs += "Loading checkpoint weights...\n"
     state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu")
     pipeline.generator.load_state_dict(state_dict)
     checkpoint_step = os.path.basename(os.path.dirname(CHECKPOINT_PATH))
     checkpoint_step = checkpoint_step.split("_")[-1]
+    progress(0.55, desc="Init: moving model to device")
+    logs += "Moving model to device...\n"
     pipeline = pipeline.to(dtype=torch.bfloat16)
     if low_memory:
         DynamicSwapInstaller.install_model(pipeline.text_encoder, device=device)
     pipeline.generator.to(device=device)
     pipeline.vae.to(device=device)
+    # --------------------- Dataset setup ---------------------
+    progress(0.65, desc="Preparing dataset")
+    logs += "Preparing dataset (TextDataset)...\n"
     dataset = TextDataset(prompt_path=prompt_txt_path, extended_prompt_path=None)
     num_prompts = len(dataset)
     logs += f"Number of prompts: {num_prompts}\n"
         dataset, batch_size=1, sampler=sampler, num_workers=0, drop_last=False
     )
+    # --------------------- Make a clean output directory ---------------------
+    progress(0.7, desc="Cleaning output folder")
     output_folder = os.path.join(
         output_root, f"rewardforcing-{num_output_frames}f", checkpoint_step
     )
     shutil.rmtree(output_folder, ignore_errors=True)
     os.makedirs(output_folder, exist_ok=True)
+    logs += f"Output directory: {output_folder}\n"
+    # --------------------- Stage 2: inference loop ---------------------
+    # Gradio can track tqdm progress on iterable loops
     for i, batch_data in progress.tqdm(
         enumerate(dataloader),
         total=num_prompts,
+        desc="Video generation",
         unit="prompt",
     ):
         idx = batch_data["idx"].item()
+        # Unpack dataset batch
         if isinstance(batch_data, dict):
             batch = batch_data
         elif isinstance(batch_data, list):
         all_video = []
+        # TEXT-TO-VIDEO only (no I2V here)
         prompt = batch["prompts"][0]
         extended_prompt = batch.get("extended_prompts", [None])[0]
         if extended_prompt is not None:
         initial_latent = None
+        # Noise tensor shape matches WAN2 expected latent dims
         sampled_noise = torch.randn(
             [1, num_output_frames, 16, 60, 104],
             device=device,
             dtype=torch.bfloat16,
         )
+        logs += f"Generating for prompt: {prompt[:80]}...\n"
+        # Run WAN inference
         video, latents = pipeline.inference(
             noise=sampled_noise,
             text_prompts=prompts,
         current_video = rearrange(video, "b t c h w -> b t h w c").cpu()
         all_video.append(current_video)
+        # scale pixel values by 255 *after* concatenation (no uint8 cast happens here)
         video = 255.0 * torch.cat(all_video, dim=1)
+        # free VAE cache between clips
         pipeline.vae.model.clear_cache()
+        # Save only the first video
         if idx < num_prompts:
             model = "regular" if not use_ema else "ema"
             safe_name = prompt[:50].replace("/", "_").replace("\\", "_")
             output_path = os.path.join(output_folder, f"{safe_name}.mp4")
             write_video(output_path, video[0], fps=16)
+            logs += f"Saved video: {output_path}\n"
+            progress(1.0, desc="Done ✅")
             return output_path, logs
+    logs += "[WARN] No video generated in loop.\n"
     return None, logs
     prompt: str, duration: str, use_ema: bool, progress=gr.Progress(track_tqdm=True)
 ):
     """
+    Triggered by Gradio:
+    - writes prompt to a temporary .txt file
+    - runs reward_forcing_inference
+    - returns video + logs
     """
     if not prompt or not prompt.strip():
+        raise gr.Error("Please type a text prompt 🙂")
+    # Duration -> number of frames passed as num_output_frames
     if duration == "5s (21 frames)":
         num_output_frames = 21
     else:
     if video_path is None or not os.path.exists(video_path):
         raise gr.Error(
+            "No video generated.\n"
+            "Check the logs below for errors."
         )
     return video_path, logs
 # -------------------------------------------------------------------
+# Gradio UI
 # -------------------------------------------------------------------
 with gr.Blocks(title="Reward Forcing T2V Demo (inline inference)") as demo:
         """
         # 🎬 Reward Forcing – Text-to-Video (inline)
+        This version directly calls the inference logic in Python,
+        allowing Gradio to track:
+        - model initialization via `progress(...)`
+        - video generation progress via `progress.tqdm(...)`
         """
     )
         duration = gr.Radio(
             ["5s (21 frames)", "30s (120 frames)"],
             value="5s (21 frames)",
+            label="Duration",
         )
+        use_ema = gr.Checkbox(value=True, label="Use EMA weights (--use_ema)")
+    generate_btn = gr.Button("🚀 Generate Video", variant="primary")
     with gr.Row():
+        video_out = gr.Video(label="Generated Video")
     logs_out = gr.Textbox(
         label="Logs",
         lines=12,