Warholt committed
Commit 7c81a73 · 1 Parent(s): a2ea06b

wrap aot packages in lazy torch module
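In short: the eager load_models() step goes away; each .pt2 path is instead wrapped in a small torch.nn.Module that calls torch._inductor.aoti_load_package() on its first forward(), i.e. only once execution is already inside the @spaces.GPU-decorated handler. A minimal sketch of the pattern (LazyPt2 is a hypothetical name used here for illustration; the class actually added in app.py below is LazyAotPackage):

    import torch
    import torch._inductor

    class LazyPt2(torch.nn.Module):
        def __init__(self, path):
            super().__init__()
            self.path = path      # only the path is kept at import time
            self.runner = None    # nothing touches CUDA yet

        def forward(self, *args, **kwargs):
            if self.runner is None:
                # First call happens inside the GPU context, so load the package now
                self.runner = torch._inductor.aoti_load_package(self.path)
            return self.runner(*args, **kwargs)

    # Created at module scope with negligible cost; the .pt2 is only loaded when the
    # wrapper is first called from a @spaces.GPU-decorated handler.
    vocoder = LazyPt2("aot_package/caro_hifigan.pt2")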

Files changed (1)
  1. app.py +64 -94
app.py CHANGED
@@ -4,63 +4,67 @@ import torch._inductor
 import spaces
 from char_tokenizers import GermanCharsTokenizer
 
-# Initialize tokenizer
+# --- 1. Define a Wrapper for Lazy Loading ---
+class LazyAotPackage(torch.nn.Module):
+    """
+    A wrapper that holds the path to an AOT package and loads it
+    to the GPU only when forward() is called.
+    """
+
+    def __init__(self, package_path):
+        super().__init__()
+        self.package_path = package_path
+        self.runner = None
+
+    def forward(self, *args, **kwargs):
+        # We are now inside the @spaces.GPU decorated function.
+        # Valid GPU context exists.
+
+        # If runner is not loaded, load it now.
+        if self.runner is None:
+            # Load directly to the active CUDA device
+            self.runner = torch._inductor.aoti_load_package(
+                self.package_path, device="cuda"
+            )
+
+        # Run inference
+        # We add a try/except block because if ZeroGPU swaps the underlying hardware
+        # between requests, the old runner might be invalid.
+        try:
+            return self.runner(*args, **kwargs)
+        except RuntimeError:
+            # Context might be stale, reload
+            self.runner = torch._inductor.aoti_load_package(
+                self.package_path, device="cuda"
+            )
+            return self.runner(*args, **kwargs)
+
+
+# --- 2. Initialize Global Components ---
 TOKENIZER = GermanCharsTokenizer()
 
-# Model paths
-AOT_MODELS = {
+# Instead of a dict of raw paths, we instantiate our Lazy Loaders immediately.
+# These act like standard PyTorch modules but use almost no RAM until inference.
+MODELS = {
     "Caro": {
-        "encoder": "aot_package/caro_fastpitch_encoder.pt2",
-        "decoder": "aot_package/caro_fastpitch_decoder.pt2",
-        "vocoder": "aot_package/caro_hifigan.pt2",
+        "encoder": LazyAotPackage("aot_package/caro_fastpitch_encoder.pt2"),
+        "decoder": LazyAotPackage("aot_package/caro_fastpitch_decoder.pt2"),
+        "vocoder": LazyAotPackage("aot_package/caro_hifigan.pt2"),
     },
     "Karlsson": {
-        "encoder": "aot_package/karlsson_fastpitch_encoder.pt2",
-        "decoder": "aot_package/karlsson_fastpitch_decoder.pt2",
-        "vocoder": "aot_package/karlsson_hifigan.pt2",
+        "encoder": LazyAotPackage("aot_package/karlsson_fastpitch_encoder.pt2"),
+        "decoder": LazyAotPackage("aot_package/karlsson_fastpitch_decoder.pt2"),
+        "vocoder": LazyAotPackage("aot_package/karlsson_hifigan.pt2"),
     },
 }
 
-# Global variable to hold loaded models
-aot_sessions = {}
-
-
-@spaces.GPU(duration=60)
-def load_models():
-    """Load AOT models on GPU."""
-    global aot_sessions
-
-    if aot_sessions:  # Already loaded
-        return
-
-    print("Loading AOT models for GPU...")
-    for voice_name, paths in AOT_MODELS.items():
-        print(f"Loading {voice_name} AOT models...")
-        aot_sessions[voice_name] = {
-            "encoder": torch._inductor.aoti_load_package(paths["encoder"]),
-            "decoder": torch._inductor.aoti_load_package(paths["decoder"]),
-            "vocoder": torch._inductor.aoti_load_package(paths["vocoder"]),
-        }
-    print("AOT models loaded successfully!")
-
-
+# --- 3. Inference Function ---
 @spaces.GPU(duration=60)
 def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     """
-    Synthesize speech from text using AOT compiled models on GPU.
-
-    Args:
-        text: Input text to synthesize
-        voice: Voice to use (Caro or Karlsson)
-        pace: Speaking rate (1.0 is normal, <1.0 is slower, >1.0 is faster)
-
-    Returns:
-        Tuple of (sample_rate, audio_array)
+    Synthesize speech. The @spaces.GPU decorator ensures a GPU is assigned
+    for the duration of this function.
     """
-    # Load models if not already loaded
-    if not aot_sessions:
-        load_models()
-
     if not text.strip():
         return None
 
@@ -69,22 +73,25 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     tokens_tensor = torch.tensor([tokens], dtype=torch.int64).to("cuda")
 
     # Prepare control parameters
-    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32)
-    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32) * pace
+    pitch_tensor = torch.zeros_like(tokens_tensor, dtype=torch.float32).to("cuda")
+    pace_tensor = torch.ones_like(tokens_tensor, dtype=torch.float32).to("cuda") * pace
+
+    # Retrieve the correct lazy-loaded models
+    # The .forward() call inside these objects will trigger the load to GPU
+    encoder = MODELS[voice]["encoder"]
+    decoder = MODELS[voice]["decoder"]
+    vocoder = MODELS[voice]["vocoder"]
 
     with torch.inference_mode():
-        # Run encoder to get latent representation and length
-        encoder = aot_sessions[voice]["encoder"]
+        # 1. Run Encoder (Loads .pt2 to GPU if needed -> Runs)
         len_regulated, dec_lens, spk_emb = encoder(
             tokens_tensor, pitch_tensor, pace_tensor
         )
 
-        # Run decoder to get mel-spectrogram
-        decoder = aot_sessions[voice]["decoder"]
+        # 2. Run Decoder (Loads .pt2 to GPU if needed -> Runs)
         spec = decoder(len_regulated, dec_lens, spk_emb)
 
-        # Run vocoder to generate audio waveform
-        vocoder = aot_sessions[voice]["vocoder"]
+        # 3. Run Vocoder (Loads .pt2 to GPU if needed -> Runs)
        audio = vocoder(spec)
 
     # Convert to numpy and return
@@ -94,15 +101,12 @@ def synthesize_speech(text: str, voice: str, pace: float = 1.0):
     return (sample_rate, audio_array)
 
 
-# Create Gradio interface
+# --- 4. Gradio Interface ---
 with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     gr.Markdown(
         """
         # 🎙️ German Text-to-Speech
-
         Generate German speech using two different voices: **Caro** and **Karlsson**.
-
-        Enter your German text below and select a voice to synthesize speech.
         """
     )
 
@@ -110,54 +114,20 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
         with gr.Column():
             text_input = gr.Textbox(
                 label="Text to synthesize",
-                placeholder="Geben Sie hier Ihren deutschen Text ein...",
-                lines=5,
                 value="Hallo! Willkommen zur deutschen Sprachsynthese.",
+                lines=3,
             )
-
             voice_dropdown = gr.Dropdown(
                 choices=["Caro", "Karlsson"], label="Voice", value="Karlsson"
            )
-
            pace_slider = gr.Slider(
-                minimum=0.5,
-                maximum=2.0,
-                value=1.0,
-                step=0.1,
-                label="Speaking Rate",
-                info="1.0 is normal speed",
+                minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speaking Rate"
            )
-
            generate_btn = gr.Button("Generate Speech 🔊", variant="primary")
 
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")
 
-    gr.Examples(
-        examples=[
-            ["Guten Tag! Wie geht es Ihnen heute?", "Caro", 1.0],
-            [
-                "Die Wissenschaft hat in den letzten Jahren große Fortschritte gemacht.",
-                "Karlsson",
-                1.0,
-            ],
-            [
-                "Es war einmal ein kleines Mädchen, das durch den Wald spazierte.",
-                "Caro",
-                0.9,
-            ],
-            [
-                "Berlin ist die Hauptstadt und zugleich ein Land der Bundesrepublik Deutschland.",
-                "Karlsson",
-                1.0,
-            ],
-        ],
-        inputs=[text_input, voice_dropdown, pace_slider],
-        outputs=audio_output,
-        fn=synthesize_speech,
-        cache_examples=False,
-    )
-
     generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_dropdown, pace_slider],
@@ -165,4 +135,4 @@ with gr.Blocks(title="German TTS - Caro & Karlsson") as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()