Spaces:

Warholt
/

CaroTTS-DE

Running on Zero

App Files Files Community

Warholt committed 29 days ago

Commit

a2ea06b

1 Parent(s): dff6ae5

remove onnx fallback, load with gpu decorator

Browse files

Files changed (1) hide show

app.py +22 -121

app.py CHANGED Viewed

@@ -1,29 +1,12 @@
 import gradio as gr
-import onnxruntime as ort
-import numpy as np
 import torch
 import torch._inductor
 from char_tokenizers import GermanCharsTokenizer
-# Try to import spaces for Zero GPU support
-try:
-    import spaces
-    HAS_SPACES = True
-except ImportError:
-    HAS_SPACES = False
-    print("spaces not available, running without Zero GPU support")
 # Initialize tokenizer
 TOKENIZER = GermanCharsTokenizer()
-# Check if CUDA is available
-USE_GPU = torch.cuda.is_available()
-DEVICE = "cuda" if USE_GPU else "cpu"
-print(f"Using device: {DEVICE}")
-print(f"Zero GPU support: {HAS_SPACES}")
 # Model paths
 AOT_MODELS = {
     "Caro": {
@@ -38,21 +21,19 @@ AOT_MODELS = {
     },
 }
-ONNX_MODELS = {
-    "Caro": {
-        "fastpitch": "onnx/caro_fastpitch.onnx",
-        "hifigan": "onnx/caro_hifigan.onnx",
-    },
-    "Karlsson": {
-        "fastpitch": "onnx/karlsson_fastpitch.onnx",
-        "hifigan": "onnx/karlsson_hifigan.onnx",
-    },
-}
-# Load models based on device
-if USE_GPU:
     print("Loading AOT models for GPU...")
-    aot_sessions = {}
     for voice_name, paths in AOT_MODELS.items():
         print(f"Loading {voice_name} AOT models...")
         aot_sessions[voice_name] = {
@@ -61,44 +42,34 @@ if USE_GPU:
             "vocoder": torch._inductor.aoti_load_package(paths["vocoder"]),
         }
     print("AOT models loaded successfully!")
-    onnx_sessions = None
-else:
-    print("Loading ONNX models for CPU...")
-    onnx_sessions = {}
-    for voice_name, paths in ONNX_MODELS.items():
-        print(f"Loading {voice_name} ONNX models...")
-        onnx_sessions[voice_name] = {
-            "fastpitch": ort.InferenceSession(paths["fastpitch"]),
-            "hifigan": ort.InferenceSession(paths["hifigan"]),
-        }
-    print("ONNX models loaded successfully!")
-    aot_sessions = None
-def synthesize_speech_aot(
-    text: str, voice: str, pace: float = 1.0, pitch_shift: float = 0.0
-):
     """
-    Synthesize speech using AOT compiled models (GPU).
     Args:
         text: Input text to synthesize
         voice: Voice to use (Caro or Karlsson)
         pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
-        pitch_shift: Pitch adjustment (0.0 = no change)
     Returns:
         Tuple of (sample_rate, audio_array)
     """
     if not text.strip():
         return None
     # Tokenize text
     tokens = TOKENIZER.encode(text)
-    tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to(DEVICE)
     # Prepare control parameters
-    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32) + pitch_shift
     pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace
     with torch.inference_mode():
@@ -123,84 +94,14 @@ def synthesize_speech_aot(
     return (sample_rate, audio_array)
-def synthesize_speech_onnx(text: str, voice: str, pace: float = 1.0):
-    """
-    Synthesize speech using ONNX models (CPU).
-    Args:
-        text: Input text to synthesize
-        voice: Voice to use (Caro or Karlsson)
-        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
-    Returns:
-        Tuple of (sample_rate, audio_array)
-    """
-    if not text.strip():
-        return None
-    # Tokenize text
-    tokens = TOKENIZER.encode(text)
-    # Prepare inputs for FastPitch
-    paces = np.zeros(len(tokens), dtype=np.float32) + pace
-    pitches = np.zeros(len(tokens), dtype=np.float32)  # Keep pitch at 0.0
-    inputs = {
-        "text": np.array([tokens], dtype=np.int64),
-        "pace": np.array([paces], dtype=np.float32),
-        "pitch": np.array([pitches], dtype=np.float32),
-    }
-    # Generate spectrogram with FastPitch
-    fastpitch_session = onnx_sessions[voice]["fastpitch"]
-    spec = fastpitch_session.run(None, inputs)[0]
-    # Generate audio with HiFiGAN
-    hifigan_session = onnx_sessions[voice]["hifigan"]
-    gan_inputs = {"spec": spec}
-    audio = hifigan_session.run(None, gan_inputs)[0]
-    # Return sample rate and audio
-    sample_rate = 44100
-    audio_array = audio.squeeze()
-    return (sample_rate, audio_array)
-def synthesize_speech(text: str, voice: str, pace: float = 1.0):
-    """
-    Synthesize speech from text using the selected voice.
-    Uses AOT models on GPU or ONNX models on CPU.
-    Args:
-        text: Input text to synthesize
-        voice: Voice to use (Caro or Karlsson)
-        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
-    Returns:
-        Tuple of (sample_rate, audio_array)
-    """
-    if USE_GPU:
-        return synthesize_speech_aot(text, voice, pace)
-    else:
-        return synthesize_speech_onnx(text, voice, pace)
-# Apply Zero GPU decorator if available
-if HAS_SPACES and USE_GPU:
-    synthesize_speech = spaces.GPU(synthesize_speech)
 # Create Gradio interface
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
-        f"""
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
-        **Running on:** {DEVICE.upper()} {"(AOT models)" if USE_GPU else "(ONNX models)"}
         Enter your German text below and select a voice to synthesize speech.
         """
     )

 import gradio as gr
 import torch
 import torch._inductor
+import spaces
 from char_tokenizers import GermanCharsTokenizer
 # Initialize tokenizer
 TOKENIZER = GermanCharsTokenizer()
 # Model paths
 AOT_MODELS = {
     "Caro": {
     },
 }
+# Global variable to hold loaded models
+aot_sessions = {}
+@spaces.GPU(duration=60)
+def load_models():
+    """Load AOT models on GPU."""
+    global aot_sessions
+    if aot_sessions:  # Already loaded
+        return
     print("Loading AOT models for GPU...")
     for voice_name, paths in AOT_MODELS.items():
         print(f"Loading {voice_name} AOT models...")
         aot_sessions[voice_name] = {
             "vocoder": torch._inductor.aoti_load_package(paths["vocoder"]),
         }
     print("AOT models loaded successfully!")
+@spaces.GPU(duration=60)
+def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     """
+    Synthesize speech from text using AOT compiled models on GPU.
     Args:
         text: Input text to synthesize
         voice: Voice to use (Caro or Karlsson)
         pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
     Returns:
         Tuple of (sample_rate, audio_array)
     """
+    # Load models if not already loaded
+    if not aot_sessions:
+        load_models()
     if not text.strip():
         return None
     # Tokenize text
     tokens = TOKENIZER.encode(text)
+    tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
     # Prepare control parameters
+    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32)
     pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace
     with torch.inference_mode():
     return (sample_rate, audio_array)
 # Create Gradio interface
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
+        """
         # 🎙️ German Text-to-Speech
         Generate German speech using two different voices: **Caro** and **Karlsson**.
         Enter your German text below and select a voice to synthesize speech.
         """
     )