#!/usr/bin/env python3
"""
Full pipeline: YouTube -> audio with cloned voice -> lip sync

Usage:
    python full_pipeline.py --youtube-url "..." --text "Your text" --output video_final.mp4
"""

import argparse
import os
import subprocess
import tempfile

# Fix for PyTorch 2.6+: torch.load now defaults to weights_only=True, which
# rejects the pickled (non-tensor) objects inside StyleTTS2/MuseTalk
# checkpoints. Default to weights_only=False, but respect an explicit
# caller-supplied value (setdefault, not overwrite).
import torch

original_load = torch.load


def patched_load(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only=False`` by default."""
    kwargs.setdefault('weights_only', False)
    return original_load(*args, **kwargs)


torch.load = patched_load


def download_youtube_video(url: str, output_path: str, start: int = 0, duration: int = 15) -> str:
    """Download a clip of a YouTube video via yt-dlp.

    Args:
        url: YouTube video URL.
        output_path: Destination file path for the downloaded clip.
        start: Second offset where the clip starts (applied via ffmpeg postprocessor).
        duration: Clip length in seconds.

    Returns:
        ``output_path``.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits non-zero.
    """
    print("[1/4] Baixando video do YouTube...")
    cmd = [
        'yt-dlp',
        '-f', 'best[height<=720]',
        # Trim on the postprocessing step rather than re-downloading ranges.
        '--postprocessor-args', f'ffmpeg:-ss {start} -t {duration}',
        '-o', output_path,
        url,
    ]
    subprocess.run(cmd, check=True)
    return output_path


def extract_voice_reference(video_path: str, output_path: str) -> str:
    """Extract a mono 22.05 kHz reference audio track (max 15 s) from a video.

    Args:
        video_path: Source video file.
        output_path: Destination WAV path for the reference audio.

    Returns:
        ``output_path``.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    print("[2/4] Extraindo audio de referencia...")
    cmd = [
        'ffmpeg', '-y',
        '-i', video_path,
        '-ar', '22050',  # StyleTTS2 reference sample rate
        '-ac', '1',      # mono
        '-t', '15',      # cap reference length at 15 s
        output_path,
    ]
    subprocess.run(cmd, capture_output=True, check=True)
    return output_path


def generate_cloned_audio(text: str, voice_ref: str, output_path: str) -> str:
    """Synthesize speech in the reference speaker's voice with StyleTTS2.

    Args:
        text: Text to speak.
        voice_ref: Path to the reference-voice WAV file.
        output_path: Destination WAV path (written at 24 kHz).

    Returns:
        ``output_path``.
    """
    print("[3/4] Gerando audio com StyleTTS2...")
    # Heavy imports kept local so the module loads without StyleTTS2 installed.
    from styletts2 import tts
    import scipy.io.wavfile as wavfile

    my_tts = tts.StyleTTS2()
    wav = my_tts.inference(
        text,
        target_voice_path=voice_ref,
        diffusion_steps=10,
    )
    # StyleTTS2 outputs audio at 24 kHz.
    wavfile.write(output_path, 24000, wav)
    return output_path


def run_lipsync(video_path: str, audio_path: str, output_dir: str):
    """Run MuseTalk lip sync on the video using the generated audio.

    Args:
        video_path: Source video file.
        audio_path: Audio file to lip-sync the video to.
        output_dir: Directory where MuseTalk writes its results.

    Returns:
        Path to the produced .mp4 inside ``output_dir/v15``, or ``None``
        if no output video was found.

    Raises:
        subprocess.CalledProcessError: If the MuseTalk inference script fails.
    """
    print("[4/4] Executando lip sync...")
    import yaml

    # MuseTalk expects a YAML inference config with absolute paths.
    config = {
        'task_0': {
            'video_path': os.path.abspath(video_path),
            'audio_path': os.path.abspath(audio_path),
            'bbox_shift': 5,
        }
    }

    config_file = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False)
    yaml.dump(config, config_file)
    config_file.close()

    musetalk_dir = os.environ.get('MUSETALK_DIR', '/root/musetalk-space')
    cmd = [
        'python3', '-m', 'scripts.inference',
        '--inference_config', config_file.name,
        '--result_dir', output_dir,
    ]
    try:
        subprocess.run(cmd, cwd=musetalk_dir, check=True)
    finally:
        # Always clean up the temp config, even if inference fails.
        os.unlink(config_file.name)

    # MuseTalk writes results under <result_dir>/v15; pick the first .mp4.
    result_subdir = os.path.join(output_dir, 'v15')
    if os.path.isdir(result_subdir):
        for f in os.listdir(result_subdir):
            if f.endswith('.mp4'):
                return os.path.join(result_subdir, f)
    return None


def main():
    """Parse CLI arguments and run the four pipeline stages in order."""
    parser = argparse.ArgumentParser(description='Pipeline completo de video com lip sync')
    parser.add_argument('--youtube-url', '-y', required=True, help='URL do YouTube')
    parser.add_argument('--text', '-t', required=True, help='Texto para falar')
    parser.add_argument('--output', '-o', default='./output', help='Diretorio de saida')
    parser.add_argument('--start', '-s', type=int, default=0, help='Segundo inicial do video')
    parser.add_argument('--duration', '-d', type=int, default=15, help='Duracao em segundos')
    args = parser.parse_args()

    # Create output directory
    os.makedirs(args.output, exist_ok=True)

    # Intermediate artifacts live alongside the final output.
    video_path = os.path.join(args.output, 'source_video.mp4')
    voice_ref_path = os.path.join(args.output, 'voice_ref.wav')
    audio_path = os.path.join(args.output, 'generated_audio.wav')

    # Run the pipeline stages.
    download_youtube_video(args.youtube_url, video_path, args.start, args.duration)
    extract_voice_reference(video_path, voice_ref_path)
    generate_cloned_audio(args.text, voice_ref_path, audio_path)
    final_video = run_lipsync(video_path, audio_path, args.output)

    print(f"\n{'='*50}")
    if final_video is None:
        # run_lipsync completed but produced no .mp4 — report it explicitly
        # instead of printing "Video final: None".
        print("Pipeline concluido, mas nenhum video de saida foi encontrado.")
    else:
        print("Pipeline concluido!")
        print(f"Video final: {final_video}")
    print(f"{'='*50}")


if __name__ == '__main__':
    main()