Spaces:

Warholt
/

CaroTTS-DE

Running on Zero

App Files Files Community

Warholt committed on 29 days ago

Commit

7c81a73

1 Parent(s): a2ea06b

wrap aot packages in lazy torch module

Browse files

Files changed (1) hide show

app.py +64 -94

app.py CHANGED Viewed

@@ -4,63 +4,67 @@ import torch._inductor
 import spaces
 from char_tokenizers import GermanCharsTokenizer
-# Initialize tokenizer
 TOKENIZER = GermanCharsTokenizer()
-# Model paths
-AOT_MODELS = {
     "Caro": {
-        "encoder": "aot_package/caro_fastpitch_encoder.pt2",
-        "decoder": "aot_package/caro_fastpitch_decoder.pt2",
-        "vocoder": "aot_package/caro_hifigan.pt2",
     },
     "Karlsson": {
-        "encoder": "aot_package/karlsson_fastpitch_encoder.pt2",
-        "decoder": "aot_package/karlsson_fastpitch_decoder.pt2",
-        "vocoder": "aot_package/karlsson_hifigan.pt2",
     },
 }
-# Global variable to hold loaded models
-aot_sessions = {}
-@spaces.GPU(duration=60)
-def load_models():
-    """Load AOT models on GPU."""
-    global aot_sessions
-    if aot_sessions:  # Already loaded
-        return
-    print("Loading AOT models for GPU...")
-    for voice_name, paths in AOT_MODELS.items():
-        print(f"Loading {voice_name} AOT models...")
-        aot_sessions[voice_name] = {
-            "encoder": torch._inductor.aoti_load_package(paths["encoder"]),
-            "decoder": torch._inductor.aoti_load_package(paths["decoder"]),
-            "vocoder": torch._inductor.aoti_load_package(paths["vocoder"]),
-        }
-    print("AOT models loaded successfully!")
 @spaces.GPU(duration=60)
 def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     """
-    Synthesize speech from text using AOT compiled models on GPU.
-    Args:
-        text: Input text to synthesize
-        voice: Voice to use (Caro or Karlsson)
-        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
-    Returns:
-        Tuple of (sample_rate, audio_array)
     """
-    # Load models if not already loaded
-    if not aot_sessions:
-        load_models()
     if not text.strip():
         return None
@@ -69,22 +73,25 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
     # Prepare control parameters
-    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32)
-    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace
     with torch.inference_mode():
-        # Run encoder to get latent representation and length
-        encoder = aot_sessions[voice]["encoder"]
         len_regulated, dec_lens, spk_emb = encoder(
             tokens_tensor, pitch_tensor, pace_tensor
         )
-        # Run decoder to get mel-spectrogram
-        decoder = aot_sessions[voice]["decoder"]
         spec = decoder(len_regulated, dec_lens, spk_emb)
-        # Run vocoder to generate audio waveform
-        vocoder = aot_sessions[voice]["vocoder"]
         audio = vocoder(spec)
     # Convert to numpy and return
@@ -94,15 +101,12 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     return (sample_rate, audio_array)
-# Create Gradio interface
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
         """
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
-        Enter your German text below and select a voice to synthesize speech.
         """
     )
@@ -110,54 +114,20 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
         with gr.Column():
             text_input = gr.Textbox(
                 label="Text to synthesize",
-                placeholder="Geben Sie hier Ihren deutschen Text ein...",
-                lines=5,
                 value="Hallo! Willkommen zur deutschen Sprachsynthese.",
             )
             voice_dropdown = gr.Dropdown(
                 choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
             )
             pace_slider = gr.Slider(
-                minimum=0.5,
-                maximum=2.0,
-                value=1.0,
-                step=0.1,
-                label="Speaking Rate",
-                info="1.0 is normal speed",
             )
             generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Generated Audio", type="numpy")
-    gr.Examples(
-        examples=[
-            ["Guten Tag! Wie geht es Ihnen heute?", "Caro", 1.0],
-            [
-                "Die Wissenschaft hat in den letzten Jahren große Fortschritte gemacht.",
-                "Karlsson",
-                1.0,
-            ],
-            [
-                "Es war einmal ein kleines Mädchen, das durch den Wald spazierte.",
-                "Caro",
-                0.9,
-            ],
-            [
-                "Berlin ist die Hauptstadt und zugleich ein Land der Bundesrepublik Deutschland.",
-                "Karlsson",
-                1.0,
-            ],
-        ],
-        inputs=[text_input, voice_dropdown, pace_slider],
-        outputs=audio_output,
-        fn=synthesize_speech,
-        cache_examples=False,
-    )
     generate_btn.click(
         fn=synthesize_speech,
         inputs=[text_input, voice_dropdown, pace_slider],
@@ -165,4 +135,4 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 import spaces
 from char_tokenizers import GermanCharsTokenizer
+# --- 1. Define a Wrapper for Lazy Loading ---
class LazyAotPackage(torch.nn.Module):
    """Lazily loaded wrapper around an AOTInductor ``.pt2`` package.

    Constructing the wrapper only records the package path. The package is
    loaded with ``torch._inductor.aoti_load_package`` the first time
    ``forward()`` is called, so the load happens inside whatever context the
    caller is running in at that moment (in this app, the function decorated
    with ``@spaces.GPU``).

    Attributes:
        package_path: Path to the ``.pt2`` package to load on first use.
        runner: The loaded package callable, or ``None`` until first use.
    """

    def __init__(self, package_path):
        super().__init__()
        self.package_path = package_path
        self.runner = None

    def _load(self):
        """Load (or reload) the package, cache it on ``self.runner``, return it."""
        # Path-only call: the loader is not given a ``device`` keyword because
        # the documented signature does not declare one, so passing it fails
        # with TypeError before anything is loaded.
        # NOTE(review): the default placement is expected to be the current
        # CUDA device for CUDA-compiled packages -- confirm against the
        # installed torch version.
        self.runner = torch._inductor.aoti_load_package(self.package_path)
        return self.runner

    def forward(self, *args, **kwargs):
        """Run the package, loading it first if it has not been loaded yet.

        All positional and keyword arguments are passed through unchanged to
        the loaded runner, and its result is returned as-is.

        Raises:
            RuntimeError: If the call still fails after one reload-and-retry.
        """
        runner = self.runner if self.runner is not None else self._load()
        try:
            return runner(*args, **kwargs)
        except RuntimeError:
            # Best-effort recovery: the cached runner may have been invalidated
            # (e.g. the underlying device changed between requests), so reload
            # once and retry. A second failure propagates to the caller.
            return self._load()(*args, **kwargs)
+# --- 2. Initialize Global Components ---
 TOKENIZER = GermanCharsTokenizer()
+# Instead of a dict of raw paths, we instantiate our Lazy Loaders immediately.
+# These act like standard PyTorch modules but use almost no RAM until inference.
+MODELS = {
     "Caro": {
+        "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"),
+        "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"),
+        "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"),
     },
     "Karlsson": {
+        "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"),
+        "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"),
+        "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"),
     },
 }
+# --- 3. Inference Function ---
 @spaces.GPU(duration=60)
 def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     """
+    Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned
+    for the duration of this function.
     """
     if not text.strip():
         return None
     tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
     # Prepare control parameters
+    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32).to("cuda")
+    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32).to("cuda") * pace
+    # Retrieve the correct lazy-loaded models
+    # The .forward() call inside these objects will trigger the load to GPU
+    encoder = MODELS[voice]["encoder"]
+    decoder = MODELS[voice]["decoder"]
+    vocoder = MODELS[voice]["vocoder"]
     with torch.inference_mode():
+        # 1. Run Encoder (Loads .pt2 to GPU if needed -> Runs)
         len_regulated, dec_lens, spk_emb = encoder(
             tokens_tensor, pitch_tensor, pace_tensor
         )
+        # 2. Run Decoder (Loads .pt2 to GPU if needed -> Runs)
         spec = decoder(len_regulated, dec_lens, spk_emb)
+        # 3. Run Vocoder (Loads .pt2 to GPU if needed -> Runs)
         audio = vocoder(spec)
     # Convert to numpy and return
     return (sample_rate, audio_array)
+# --- 4. Gradio Interface ---
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
         """
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
         """
     )
         with gr.Column():
             text_input = gr.Textbox(
                 label="Text to synthesize",
                 value="Hallo! Willkommen zur deutschen Sprachsynthese.",
+                lines=3,
             )
             voice_dropdown = gr.Dropdown(
                 choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
             )
             pace_slider = gr.Slider(
+                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
             )
             generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Generated Audio", type="numpy")
     generate_btn.click(
         fn=synthesize_speech,
         inputs=[text_input, voice_dropdown, pace_slider],
     )
 if __name__ == "__main__":
+    demo.launch()