Warholt committed on
Commit
a2ea06b
·
1 Parent(s): dff6ae5

remove onnx fallback, load with gpu decorator

Browse files
Files changed (1) hide show
  1. app.py +22 -121
app.py CHANGED
@@ -1,29 +1,12 @@
1
  import gradio as gr
2
- import onnxruntime as ort
3
- import numpy as np
4
  import torch
5
  import torch._inductor
 
6
  from char_tokenizers import GermanCharsTokenizer
7
 
8
- # Try to import spaces for Zero GPU support
9
- try:
10
- import spaces
11
-
12
- HAS_SPACES = True
13
- except ImportError:
14
- HAS_SPACES = False
15
- print("spaces not available, running without Zero GPU support")
16
-
17
  # Initialize tokenizer
18
  TOKENIZER = GermanCharsTokenizer()
19
 
20
- # Check if CUDA is available
21
- USE_GPU = torch.cuda.is_available()
22
- DEVICE = "cuda" if USE_GPU else "cpu"
23
-
24
- print(f"Using device: {DEVICE}")
25
- print(f"Zero GPU support: {HAS_SPACES}")
26
-
27
  # Model paths
28
  AOT_MODELS = {
29
  "Caro": {
@@ -38,21 +21,19 @@ AOT_MODELS = {
38
  },
39
  }
40
 
41
- ONNX_MODELS = {
42
- "Caro": {
43
- "fastpitch": "onnx/caro_fastpitch.onnx",
44
- "hifigan": "onnx/caro_hifigan.onnx",
45
- },
46
- "Karlsson": {
47
- "fastpitch": "onnx/karlsson_fastpitch.onnx",
48
- "hifigan": "onnx/karlsson_hifigan.onnx",
49
- },
50
- }
51
 
52
- # Load models based on device
53
- if USE_GPU:
 
 
 
 
 
 
 
54
  print("Loading AOT models for GPU...")
55
- aot_sessions = {}
56
  for voice_name, paths in AOT_MODELS.items():
57
  print(f"Loading {voice_name} AOT models...")
58
  aot_sessions[voice_name] = {
@@ -61,44 +42,34 @@ if USE_GPU:
61
  "vocoder": torch._inductor.aoti_load_package(paths["vocoder"]),
62
  }
63
  print("AOT models loaded successfully!")
64
- onnx_sessions = None
65
- else:
66
- print("Loading ONNX models for CPU...")
67
- onnx_sessions = {}
68
- for voice_name, paths in ONNX_MODELS.items():
69
- print(f"Loading {voice_name} ONNX models...")
70
- onnx_sessions[voice_name] = {
71
- "fastpitch": ort.InferenceSession(paths["fastpitch"]),
72
- "hifigan": ort.InferenceSession(paths["hifigan"]),
73
- }
74
- print("ONNX models loaded successfully!")
75
- aot_sessions = None
76
 
77
 
78
- def synthesize_speech_aot(
79
- text: str, voice: str, pace: float = 1.0, pitch_shift: float = 0.0
80
- ):
81
  """
82
- Synthesize speech using AOT compiled models (GPU).
83
 
84
  Args:
85
  text: Input text to synthesize
86
  voice: Voice to use (Caro or Karlsson)
87
  pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
88
- pitch_shift: Pitch adjustment (0.0 = no change)
89
 
90
  Returns:
91
  Tuple of (sample_rate, audio_array)
92
  """
 
 
 
 
93
  if not text.strip():
94
  return None
95
 
96
  # Tokenize text
97
  tokens = TOKENIZER.encode(text)
98
- tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to(DEVICE)
99
 
100
  # Prepare control parameters
101
- pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32) + pitch_shift
102
  pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace
103
 
104
  with torch.inference_mode():
@@ -123,84 +94,14 @@ def synthesize_speech_aot(
123
  return (sample_rate, audio_array)
124
 
125
 
126
- def synthesize_speech_onnx(text: str, voice: str, pace: float = 1.0):
127
- """
128
- Synthesize speech using ONNX models (CPU).
129
-
130
- Args:
131
- text: Input text to synthesize
132
- voice: Voice to use (Caro or Karlsson)
133
- pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
134
-
135
- Returns:
136
- Tuple of (sample_rate, audio_array)
137
- """
138
- if not text.strip():
139
- return None
140
-
141
- # Tokenize text
142
- tokens = TOKENIZER.encode(text)
143
-
144
- # Prepare inputs for FastPitch
145
- paces = np.zeros(len(tokens), dtype=np.float32) + pace
146
- pitches = np.zeros(len(tokens), dtype=np.float32) # Keep pitch at 0.0
147
-
148
- inputs = {
149
- "text": np.array([tokens], dtype=np.int64),
150
- "pace": np.array([paces], dtype=np.float32),
151
- "pitch": np.array([pitches], dtype=np.float32),
152
- }
153
-
154
- # Generate spectrogram with FastPitch
155
- fastpitch_session = onnx_sessions[voice]["fastpitch"]
156
- spec = fastpitch_session.run(None, inputs)[0]
157
-
158
- # Generate audio with HiFiGAN
159
- hifigan_session = onnx_sessions[voice]["hifigan"]
160
- gan_inputs = {"spec": spec}
161
- audio = hifigan_session.run(None, gan_inputs)[0]
162
-
163
- # Return sample rate and audio
164
- sample_rate = 44100
165
- audio_array = audio.squeeze()
166
-
167
- return (sample_rate, audio_array)
168
-
169
-
170
- def synthesize_speech(text: str, voice: str, pace: float = 1.0):
171
- """
172
- Synthesize speech from text using the selected voice.
173
- Uses AOT models on GPU or ONNX models on CPU.
174
-
175
- Args:
176
- text: Input text to synthesize
177
- voice: Voice to use (Caro or Karlsson)
178
- pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
179
-
180
- Returns:
181
- Tuple of (sample_rate, audio_array)
182
- """
183
- if USE_GPU:
184
- return synthesize_speech_aot(text, voice, pace)
185
- else:
186
- return synthesize_speech_onnx(text, voice, pace)
187
-
188
-
189
- # Apply Zero GPU decorator if available
190
- if HAS_SPACES and USE_GPU:
191
- synthesize_speech = spaces.GPU(synthesize_speech)
192
-
193
-
194
  # Create Gradio interface
195
  with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
196
  gr.Markdown(
197
- f"""
198
  # 🎙️ German Text-to-Speech
199
 
200
  Generate German speech using two different voices: **Caro** and **Karlsson**.
201
 
202
- **Running on:** {DEVICE.upper()} {"(AOT models)" if USE_GPU else "(ONNX models)"}
203
-
204
  Enter your German text below and select a voice to synthesize speech.
205
  """
206
  )
 
1
  import gradio as gr
 
 
2
  import torch
3
  import torch._inductor
4
+ import spaces
5
  from char_tokenizers import GermanCharsTokenizer
6
 
 
 
 
 
 
 
 
 
 
7
  # Initialize tokenizer
8
  TOKENIZER = GermanCharsTokenizer()
9
 
 
 
 
 
 
 
 
10
  # Model paths
11
  AOT_MODELS = {
12
  "Caro": {
 
21
  },
22
  }
23
 
24
+ # Global variable to hold loaded models
25
+ aot_sessions = {}
 
 
 
 
 
 
 
 
26
 
27
+
28
+ @spaces.GPU(duration=60)
29
+ def load_models():
30
+ """Load AOT models on GPU."""
31
+ global aot_sessions
32
+
33
+ if aot_sessions: # Already loaded
34
+ return
35
+
36
  print("Loading AOT models for GPU...")
 
37
  for voice_name, paths in AOT_MODELS.items():
38
  print(f"Loading {voice_name} AOT models...")
39
  aot_sessions[voice_name] = {
 
42
  "vocoder": torch._inductor.aoti_load_package(paths["vocoder"]),
43
  }
44
  print("AOT models loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
+ @spaces.GPU(duration=60)
48
+ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
 
49
  """
50
+ Synthesize speech from text using AOT compiled models on GPU.
51
 
52
  Args:
53
  text: Input text to synthesize
54
  voice: Voice to use (Caro or Karlsson)
55
  pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
 
56
 
57
  Returns:
58
  Tuple of (sample_rate, audio_array)
59
  """
60
+ # Load models if not already loaded
61
+ if not aot_sessions:
62
+ load_models()
63
+
64
  if not text.strip():
65
  return None
66
 
67
  # Tokenize text
68
  tokens = TOKENIZER.encode(text)
69
+ tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
70
 
71
  # Prepare control parameters
72
+ pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32)
73
  pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace
74
 
75
  with torch.inference_mode():
 
94
  return (sample_rate, audio_array)
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  # Create Gradio interface
98
  with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
99
  gr.Markdown(
100
+ """
101
  # 🎙️ German Text-to-Speech
102
 
103
  Generate German speech using two different voices: **Caro** and **Karlsson**.
104
 
 
 
105
  Enter your German text below and select a voice to synthesize speech.
106
  """
107
  )