LiamKhoaLe committed on
Commit
5040e2f
·
1 Parent(s): a863763

Change Translation, ASR, TTS services to MCP

Browse files
Files changed (2) hide show
  1. README.md +23 -2
  2. app.py +257 -16
README.md CHANGED
@@ -86,6 +86,9 @@ tags:
86
  - **Embedding Model**: abhinand/MedEmbed-large-v0.1 (domain-tuned medical embeddings)
87
  - **RAG Framework**: LlamaIndex with hierarchical node parsing
88
  - **Web Search**: Model Context Protocol (MCP) tools with automatic fallback to DuckDuckGo
 
 
 
89
  - **MCP Client**: Python MCP SDK for standardized tool integration
90
 
91
  ## 📋 Requirements
@@ -97,9 +100,27 @@ See `requirements.txt` for full dependency list. Key dependencies:
97
  - **RAG Framework**: `llama-index`
98
  - **Utilities**: `langdetect`, `gradio`, `spaces`
99
 
100
- ### MCP Configuration (Optional)
101
 
102
- The application uses MCP tools by default when available. To configure MCP servers:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  1. **Install MCP Python SDK** (already in requirements.txt):
105
  ```bash
 
86
  - **Embedding Model**: abhinand/MedEmbed-large-v0.1 (domain-tuned medical embeddings)
87
  - **RAG Framework**: LlamaIndex with hierarchical node parsing
88
  - **Web Search**: Model Context Protocol (MCP) tools with automatic fallback to DuckDuckGo
89
+ - **Speech-to-Text**: MCP Whisper integration with local Whisper fallback
90
+ - **Text-to-Speech**: MCP TTS integration with local TTS fallback
91
+ - **Translation**: MCP translation tools with local DeepSeek-R1 fallback
92
  - **MCP Client**: Python MCP SDK for standardized tool integration
93
 
94
  ## 📋 Requirements
 
100
  - **RAG Framework**: `llama-index`
101
  - **Utilities**: `langdetect`, `gradio`, `spaces`
102
 
103
+ ### 🔌 MCP Configuration
104
 
105
+ The application supports MCP (Model Context Protocol) tools for various services. Configure MCP servers via environment variables:
106
+
107
+ ```bash
108
+ # Web Search MCP Server
109
+ export MCP_SERVER_COMMAND="python"
110
+ export MCP_SERVER_ARGS="-m duckduckgo_mcp_server"
111
+
112
+ # Or use npx for Node.js MCP servers
113
+ export MCP_SERVER_COMMAND="npx"
114
+ export MCP_SERVER_ARGS="-y @modelcontextprotocol/server-duckduckgo"
115
+ ```
116
+
117
+ **Available MCP Tools:**
118
+ - **Web Search**: DuckDuckGo search via MCP (automatic fallback to direct API)
119
+ - **Speech-to-Text**: Whisper transcription via MCP (automatic fallback to local Whisper)
120
+ - **Text-to-Speech**: TTS generation via MCP (automatic fallback to local TTS)
121
+ - **Translation**: Multi-language translation via MCP (automatic fallback to local DeepSeek-R1)
122
+
123
+ The application automatically detects and uses MCP tools when available, falling back to local implementations seamlessly.
124
 
125
  1. **Install MCP Python SDK** (already in requirements.txt):
126
  ```bash
app.py CHANGED
@@ -238,11 +238,49 @@ def initialize_whisper_model():
238
  if global_whisper_model is None:
239
  logger.info("Initializing Whisper model for speech transcription...")
240
  try:
241
- # Try loading from HuggingFace
242
- global_whisper_model = whisper.load_model("large-v3-turbo")
243
- except:
244
- # Fallback to base model
245
- global_whisper_model = whisper.load_model("base")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  logger.info("Whisper model initialized successfully")
247
  return global_whisper_model
248
 
@@ -263,11 +301,46 @@ def initialize_tts_model():
263
  global_tts_model = None
264
  return global_tts_model
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  def transcribe_audio(audio):
267
- """Transcribe audio to text using Whisper"""
268
  global global_whisper_model
269
- if global_whisper_model is None:
270
- initialize_whisper_model()
271
 
272
  if audio is None:
273
  return ""
@@ -286,6 +359,35 @@ def transcribe_audio(audio):
286
  else:
287
  audio_path = audio
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  # Transcribe
290
  result = global_whisper_model.transcribe(audio_path, language="en")
291
  transcribed_text = result["text"].strip()
@@ -295,11 +397,81 @@ def transcribe_audio(audio):
295
  logger.error(f"Transcription error: {e}")
296
  return ""
297
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  def generate_speech(text: str):
299
- """Generate speech from text using TTS model"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  if not TTS_AVAILABLE:
301
  logger.error("TTS library not installed. Please install TTS to use voice generation.")
302
  return None
 
303
  global global_tts_model
304
  if global_tts_model is None:
305
  initialize_tts_model()
@@ -308,9 +480,6 @@ def generate_speech(text: str):
308
  logger.error("TTS model not available. Please check dependencies.")
309
  return None
310
 
311
- if not text or len(text.strip()) == 0:
312
- return None
313
-
314
  try:
315
  # Generate audio
316
  wav = global_tts_model.tts(text)
@@ -360,8 +529,71 @@ def detect_language(text: str) -> str:
360
  except LangDetectException:
361
  return "en" # Default to English if detection fails
362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  def translate_text(text: str, target_lang: str = "en", source_lang: str = None) -> str:
364
- """Translate text using DeepSeek-R1 model"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  global global_translation_model, global_translation_tokenizer
366
  if global_translation_model is None or global_translation_tokenizer is None:
367
  initialize_translation_model()
@@ -1550,17 +1782,26 @@ if __name__ == "__main__":
1550
  logger.info("Initializing default medical model (MedSwin SFT)...")
1551
  initialize_medical_model(DEFAULT_MEDICAL_MODEL)
1552
  logger.info("Preloading Whisper model...")
1553
- initialize_whisper_model()
 
 
 
 
 
 
 
 
 
1554
  logger.info("Preloading TTS model...")
1555
  try:
1556
  initialize_tts_model()
1557
  if global_tts_model is not None:
1558
  logger.info("TTS model preloaded successfully!")
1559
  else:
1560
- logger.warning("TTS model not available - voice generation will be disabled")
1561
  except Exception as e:
1562
  logger.warning(f"TTS model preloading failed: {e}")
1563
- logger.warning("Voice generation features will be disabled")
1564
  logger.info("Model preloading complete!")
1565
  demo = create_demo()
1566
  demo.launch()
 
238
  if global_whisper_model is None:
239
  logger.info("Initializing Whisper model for speech transcription...")
240
  try:
241
+ # Check if we're in a spaces environment (has spaces patching)
242
+ in_spaces_env = hasattr(torch, '_spaces_patched') or 'spaces' in str(type(torch.Tensor.to))
243
+
244
+ # Try loading from HuggingFace with device handling for spaces compatibility
245
+ try:
246
+ if in_spaces_env:
247
+ # In spaces environment, load on CPU and don't move to device
248
+ logger.info("Detected spaces environment, loading Whisper on CPU")
249
+ global_whisper_model = whisper.load_model("large-v3-turbo", device="cpu")
250
+ # Don't move to GPU in spaces environment
251
+ else:
252
+ # Normal environment, let whisper handle device
253
+ global_whisper_model = whisper.load_model("large-v3-turbo")
254
+ except NotImplementedError as e:
255
+ # Handle sparse tensor error from spaces library
256
+ if "SparseTensorImpl" in str(e) or "storage" in str(e).lower():
257
+ logger.warning(f"Spaces library compatibility issue: {e}")
258
+ logger.info("Trying to load Whisper model with workaround...")
259
+ try:
260
+ # Try loading on CPU explicitly
261
+ global_whisper_model = whisper.load_model("base", device="cpu")
262
+ except Exception as e2:
263
+ logger.error(f"Failed to load Whisper with workaround: {e2}")
264
+ global_whisper_model = None
265
+ return None
266
+ else:
267
+ raise
268
+ except Exception as e1:
269
+ logger.warning(f"Failed to load large-v3-turbo: {e1}")
270
+ try:
271
+ # Fallback to base model with CPU
272
+ global_whisper_model = whisper.load_model("base", device="cpu")
273
+ except Exception as e2:
274
+ logger.error(f"Failed to load Whisper base model: {e2}")
275
+ # Set to None to indicate failure - will use MCP or skip transcription
276
+ global_whisper_model = None
277
+ return None
278
+ except Exception as e:
279
+ logger.error(f"Whisper model initialization error: {e}")
280
+ import traceback
281
+ logger.debug(traceback.format_exc())
282
+ global_whisper_model = None
283
+ return None
284
  logger.info("Whisper model initialized successfully")
285
  return global_whisper_model
286
 
 
301
  global_tts_model = None
302
  return global_tts_model
303
 
304
async def transcribe_audio_mcp(audio_path: str) -> str:
    """Transcribe an audio file via an MCP Whisper/ASR tool.

    Args:
        audio_path: Path to the audio file to transcribe.

    Returns:
        The transcribed text, or "" when MCP is unavailable, no matching
        tool is registered, or the remote call fails — callers treat ""
        as "fall back to the local Whisper model".
    """
    global global_mcp_client

    # Bail out early when the MCP SDK is missing or the client was never
    # connected. (The previous comment claimed this branch "initializes"
    # the client — it does not; client setup happens elsewhere.)
    if not MCP_AVAILABLE or global_mcp_client is None:
        return ""

    try:
        # Pick the first registered tool whose name suggests speech-to-text.
        tools = await global_mcp_client.list_tools()
        whisper_tool = next(
            (
                tool
                for tool in tools.tools
                if any(key in tool.name.lower() for key in ("whisper", "transcribe", "speech"))
            ),
            None,
        )
        if whisper_tool is None:
            return ""
        logger.info(f"Found MCP Whisper tool: {whisper_tool.name}")

        result = await global_mcp_client.call_tool(
            whisper_tool.name,
            arguments={"audio_path": audio_path, "language": "en"},
        )

        # MCP results carry a list of content items; return the first
        # textual payload, stripped of surrounding whitespace.
        if hasattr(result, 'content') and result.content:
            for item in result.content:
                if hasattr(item, 'text'):
                    return item.text.strip()
        return ""
    except Exception as e:
        # Best-effort path: log quietly and let the caller fall back
        # to local transcription.
        logger.debug(f"MCP transcription error: {e}")
        return ""
340
+
341
  def transcribe_audio(audio):
342
+ """Transcribe audio to text using Whisper (with MCP fallback)"""
343
  global global_whisper_model
 
 
344
 
345
  if audio is None:
346
  return ""
 
359
  else:
360
  audio_path = audio
361
 
362
+ # Try MCP first if available
363
+ if MCP_AVAILABLE:
364
+ try:
365
+ loop = asyncio.get_event_loop()
366
+ if loop.is_running():
367
+ try:
368
+ import nest_asyncio
369
+ transcribed = nest_asyncio.run(transcribe_audio_mcp(audio_path))
370
+ if transcribed:
371
+ logger.info(f"Transcribed via MCP: {transcribed}")
372
+ return transcribed
373
+ except:
374
+ pass
375
+ else:
376
+ transcribed = loop.run_until_complete(transcribe_audio_mcp(audio_path))
377
+ if transcribed:
378
+ logger.info(f"Transcribed via MCP: {transcribed}")
379
+ return transcribed
380
+ except Exception as e:
381
+ logger.debug(f"MCP transcription not available: {e}")
382
+
383
+ # Fallback to local Whisper model
384
+ if global_whisper_model is None:
385
+ initialize_whisper_model()
386
+
387
+ if global_whisper_model is None:
388
+ logger.warning("Whisper model not available and MCP not working")
389
+ return ""
390
+
391
  # Transcribe
392
  result = global_whisper_model.transcribe(audio_path, language="en")
393
  transcribed_text = result["text"].strip()
 
397
  logger.error(f"Transcription error: {e}")
398
  return ""
399
 
400
async def generate_speech_mcp(text: str) -> "str | None":
    """Generate speech audio for *text* via an MCP TTS tool.

    Args:
        text: The text to synthesize.

    Returns:
        Path to a WAV file with the generated audio, or None when MCP is
        unavailable, no TTS tool is registered, or the call fails —
        callers treat None as "fall back to the local TTS model".
        (Previous annotation said ``-> str`` but None is a routine result.)
    """
    global global_mcp_client

    # MCP SDK missing or client never connected: nothing we can do here.
    if not MCP_AVAILABLE or global_mcp_client is None:
        return None

    try:
        # Find the first registered tool whose name suggests text-to-speech.
        tools = await global_mcp_client.list_tools()
        tts_tool = None
        for tool in tools.tools:
            name = tool.name.lower()
            if "tts" in name or "speech" in name or "synthesize" in name:
                tts_tool = tool
                logger.info(f"Found MCP TTS tool: {tool.name}")
                break

        if tts_tool is None:
            return None

        result = await global_mcp_client.call_tool(
            tts_tool.name,
            arguments={"text": text, "language": "en"},
        )

        # The tool may return either a file path (text item) or raw audio
        # bytes (data item); normalize both cases to a local file path.
        if hasattr(result, 'content') and result.content:
            for item in result.content:
                if hasattr(item, 'text'):
                    # Treat the text payload as a path only if it exists on disk.
                    if os.path.exists(item.text):
                        return item.text
                elif hasattr(item, 'data') and item.data:
                    # Persist binary audio to a temp file; delete=False so the
                    # caller can read it after we return (caller owns cleanup).
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                        tmp_file.write(item.data)
                        return tmp_file.name
        return None
    except Exception as e:
        # Best-effort path: log quietly and let the caller fall back to local TTS.
        logger.debug(f"MCP TTS error: {e}")
        return None
443
+
444
  def generate_speech(text: str):
445
+ """Generate speech from text using TTS model (with MCP fallback)"""
446
+ if not text or len(text.strip()) == 0:
447
+ return None
448
+
449
+ # Try MCP first if available
450
+ if MCP_AVAILABLE:
451
+ try:
452
+ loop = asyncio.get_event_loop()
453
+ if loop.is_running():
454
+ try:
455
+ import nest_asyncio
456
+ audio_path = nest_asyncio.run(generate_speech_mcp(text))
457
+ if audio_path:
458
+ logger.info("Generated speech via MCP")
459
+ return audio_path
460
+ except:
461
+ pass
462
+ else:
463
+ audio_path = loop.run_until_complete(generate_speech_mcp(text))
464
+ if audio_path:
465
+ logger.info("Generated speech via MCP")
466
+ return audio_path
467
+ except Exception as e:
468
+ logger.debug(f"MCP TTS not available: {e}")
469
+
470
+ # Fallback to local TTS model
471
  if not TTS_AVAILABLE:
472
  logger.error("TTS library not installed. Please install TTS to use voice generation.")
473
  return None
474
+
475
  global global_tts_model
476
  if global_tts_model is None:
477
  initialize_tts_model()
 
480
  logger.error("TTS model not available. Please check dependencies.")
481
  return None
482
 
 
 
 
483
  try:
484
  # Generate audio
485
  wav = global_tts_model.tts(text)
 
529
  except LangDetectException:
530
  return "en" # Default to English if detection fails
531
 
532
async def translate_text_mcp(text: str, target_lang: str = "en", source_lang: str = None) -> str:
    """Translate *text* through an MCP translation tool.

    Returns the translated text, or "" when MCP is unavailable, no
    translation tool is registered, or the remote call fails — callers
    treat "" as "fall back to the local translation model".
    """
    global global_mcp_client

    if not MCP_AVAILABLE:
        return ""

    try:
        # A missing client means MCP was never set up for this session.
        if global_mcp_client is None:
            return ""

        # Locate the first tool that advertises translation capability.
        tools = await global_mcp_client.list_tools()
        translate_tool = next(
            (
                t
                for t in tools.tools
                if "translate" in t.name.lower() or "translation" in t.name.lower()
            ),
            None,
        )
        if translate_tool is None:
            return ""
        logger.info(f"Found MCP translation tool: {translate_tool.name}")

        # Source language is optional; only send it when the caller gave one.
        args = {"text": text, "target_language": target_lang}
        if source_lang:
            args["source_language"] = source_lang

        result = await global_mcp_client.call_tool(
            translate_tool.name,
            arguments=args,
        )

        # Return the first textual content item, stripped.
        content = getattr(result, 'content', None)
        if content:
            for item in content:
                if hasattr(item, 'text'):
                    return item.text.strip()
        return ""
    except Exception as e:
        logger.debug(f"MCP translation error: {e}")
        return ""
+ return ""
572
+
573
  def translate_text(text: str, target_lang: str = "en", source_lang: str = None) -> str:
574
+ """Translate text using DeepSeek-R1 model (with MCP fallback)"""
575
+ # Try MCP first if available
576
+ if MCP_AVAILABLE:
577
+ try:
578
+ loop = asyncio.get_event_loop()
579
+ if loop.is_running():
580
+ try:
581
+ import nest_asyncio
582
+ translated = nest_asyncio.run(translate_text_mcp(text, target_lang, source_lang))
583
+ if translated:
584
+ logger.info(f"Translated via MCP: {translated[:50]}...")
585
+ return translated
586
+ except:
587
+ pass
588
+ else:
589
+ translated = loop.run_until_complete(translate_text_mcp(text, target_lang, source_lang))
590
+ if translated:
591
+ logger.info(f"Translated via MCP: {translated[:50]}...")
592
+ return translated
593
+ except Exception as e:
594
+ logger.debug(f"MCP translation not available: {e}")
595
+
596
+ # Fallback to local translation model
597
  global global_translation_model, global_translation_tokenizer
598
  if global_translation_model is None or global_translation_tokenizer is None:
599
  initialize_translation_model()
 
1782
  logger.info("Initializing default medical model (MedSwin SFT)...")
1783
  initialize_medical_model(DEFAULT_MEDICAL_MODEL)
1784
  logger.info("Preloading Whisper model...")
1785
+ try:
1786
+ initialize_whisper_model()
1787
+ if global_whisper_model is not None:
1788
+ logger.info("Whisper model preloaded successfully!")
1789
+ else:
1790
+ logger.warning("Whisper model not available - will use MCP or disable transcription")
1791
+ except Exception as e:
1792
+ logger.warning(f"Whisper model preloading failed: {e}")
1793
+ logger.warning("Speech-to-text will use MCP or be disabled")
1794
+ global_whisper_model = None
1795
  logger.info("Preloading TTS model...")
1796
  try:
1797
  initialize_tts_model()
1798
  if global_tts_model is not None:
1799
  logger.info("TTS model preloaded successfully!")
1800
  else:
1801
+ logger.warning("TTS model not available - will use MCP or disable voice generation")
1802
  except Exception as e:
1803
  logger.warning(f"TTS model preloading failed: {e}")
1804
+ logger.warning("Text-to-speech will use MCP or be disabled")
1805
  logger.info("Model preloading complete!")
1806
  demo = create_demo()
1807
  demo.launch()