# MuseTalk / scripts / full_pipeline.py
# marcos
# Add StyleTTS2 integration scripts for voice cloning and lip sync pipeline
# 66e2a44
#!/usr/bin/env python3
"""
Pipeline completo: YouTube -> Audio com voz clonada -> Lip Sync
Uso: python full_pipeline.py --youtube-url "..." --text "Seu texto" --output video_final.mp4
"""
import argparse
import os
import subprocess
import tempfile
# Workaround for PyTorch 2.6+, where ``torch.load`` defaults to the restricted
# ``weights_only=True`` unpickler and rejects checkpoints that contain
# arbitrary pickled objects.
import torch

original_load = torch.load


def patched_load(*args, **kwargs):
    """Call the original ``torch.load`` with ``weights_only=False`` by default.

    Only the *default* is changed: when the caller passes an explicit
    ``weights_only`` value (e.g. ``True``) it is forwarded untouched, so code
    that deliberately asks for the restricted unpickler still gets it.

    NOTE: ``weights_only=False`` performs full unpickling; only load
    checkpoints that come from a trusted source.
    """
    if kwargs.get('weights_only') is None:
        kwargs['weights_only'] = False
    return original_load(*args, **kwargs)


torch.load = patched_load
def download_youtube_video(url: str, output_path: str, start: int = 0, duration: int = 15):
    """Download a clip of a YouTube video with yt-dlp.

    Args:
        url: Address of the video to download.
        output_path: Value handed to yt-dlp's ``-o`` option.
        start: Offset, in seconds, at which the clip begins.
        duration: Length of the clip, in seconds.

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: If ``start`` is negative or ``duration`` is not positive.
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    """
    if start < 0:
        raise ValueError(f"start must be >= 0, got {start}")
    if duration <= 0:
        raise ValueError(f"duration must be > 0, got {duration}")

    print("[1/4] Baixando video do YouTube...")
    cmd = [
        'yt-dlp',
        '-f', 'best[height<=720]',
        # Ask yt-dlp for the time range directly. Post-processor arguments
        # only reach ffmpeg when a post-processing step actually runs, which
        # is not guaranteed for a single pre-merged format, so they cannot be
        # relied on to trim the download.
        '--download-sections', f'*{start}-{start + duration}',
        '-o', output_path,
        # End of options: keeps a URL/ID starting with "-" from being parsed
        # as a flag.
        '--', url,
    ]
    subprocess.run(cmd, check=True)
    return output_path
def extract_voice_reference(video_path: str, output_path: str, duration: int = 15):
    """Extract a mono 22050 Hz reference audio clip from a video using ffmpeg.

    Args:
        video_path: Input media file passed to ffmpeg's ``-i``.
        output_path: Destination audio file; overwritten if it exists (``-y``).
        duration: Maximum length of the extracted audio, in seconds.

    Returns:
        ``output_path``, unchanged.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import sys

    print("[2/4] Extraindo audio de referencia...")
    cmd = [
        'ffmpeg', '-y',
        '-i', video_path,
        '-ar', '22050',
        '-ac', '1',
        '-t', str(duration),
        output_path,
    ]
    try:
        subprocess.run(cmd, capture_output=True, check=True)
    except subprocess.CalledProcessError as err:
        # ffmpeg's output is captured to keep successful runs quiet; surface
        # it on failure so the cause is not lost with the exception.
        if err.stderr:
            sys.stderr.write(err.stderr.decode(errors='replace'))
        raise
    return output_path
def generate_cloned_audio(text: str, voice_ref: str, output_path: str):
    """Synthesize ``text`` with StyleTTS2, using ``voice_ref`` as the target voice.

    Args:
        text: Text to be spoken.
        voice_ref: Path to the reference audio, passed as ``target_voice_path``.
        output_path: Path of the WAV file to write.

    Returns:
        ``output_path``, unchanged.
    """
    print("[3/4] Gerando audio com StyleTTS2...")

    # Third-party packages are imported lazily, only when synthesis runs.
    import scipy.io.wavfile as wavfile
    from styletts2 import tts

    # Rate written into the WAV header; presumably StyleTTS2's output
    # sample rate -- TODO confirm against the styletts2 package.
    sample_rate = 24000

    synthesizer = tts.StyleTTS2()
    waveform = synthesizer.inference(
        text,
        target_voice_path=voice_ref,
        diffusion_steps=10,
    )

    wavfile.write(output_path, sample_rate, waveform)
    return output_path
def run_lipsync(video_path: str, audio_path: str, output_dir: str):
    """Run the MuseTalk inference script to lip-sync a video to an audio track.

    A temporary YAML config describing the task is written, then
    ``python3 -m scripts.inference`` is executed inside the MuseTalk checkout
    (``$MUSETALK_DIR``, default ``/root/musetalk-space``).

    Args:
        video_path: Source video; converted to an absolute path for the config.
        audio_path: Driving audio; converted to an absolute path for the config.
        output_dir: Directory passed to the script as ``--result_dir``.

    Returns:
        Absolute path of the most recently modified ``.mp4`` inside
        ``<output_dir>/v15``, or ``None`` when that directory does not exist
        or holds no ``.mp4`` file.

    Raises:
        subprocess.CalledProcessError: If the inference script fails.
    """
    print("[4/4] Executando lip sync...")
    import yaml

    # The subprocess runs with cwd=musetalk_dir, so a relative result
    # directory would resolve there rather than where this process looks
    # for the output afterwards. Pin it to an absolute path.
    result_dir = os.path.abspath(output_dir)

    config = {
        'task_0': {
            'video_path': os.path.abspath(video_path),
            'audio_path': os.path.abspath(audio_path),
            'bbox_shift': 5,
        }
    }
    musetalk_dir = os.environ.get('MUSETALK_DIR', '/root/musetalk-space')

    config_file = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False)
    try:
        # Close the file before the subprocess reads it.
        with config_file:
            yaml.dump(config, config_file)
        cmd = [
            'python3', '-m', 'scripts.inference',
            '--inference_config', config_file.name,
            '--result_dir', result_dir,
        ]
        subprocess.run(cmd, cwd=musetalk_dir, check=True)
    finally:
        # Remove the temporary config even when inference fails.
        os.unlink(config_file.name)

    # Locate the output video.
    versioned_dir = os.path.join(result_dir, 'v15')
    if not os.path.isdir(versioned_dir):
        return None
    candidates = [
        os.path.join(versioned_dir, name)
        for name in os.listdir(versioned_dir)
        if name.endswith('.mp4')
    ]
    if not candidates:
        return None
    # Prefer the newest file so a stale result from an earlier run is not
    # returned by accident (os.listdir order is arbitrary).
    return max(candidates, key=os.path.getmtime)
def main(argv=None):
    """Parse the command line and run the four pipeline stages in order.

    Stages: download the source video, extract the voice reference, generate
    the cloned-voice audio, and run lip sync. All intermediate files are
    placed inside the ``--output`` directory.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid arguments, or when lip sync produced no video.
    """
    parser = argparse.ArgumentParser(description='Pipeline completo de video com lip sync')
    parser.add_argument('--youtube-url', '-y', required=True, help='URL do YouTube')
    parser.add_argument('--text', '-t', required=True, help='Texto para falar')
    parser.add_argument('--output', '-o', default='./output', help='Diretorio de saida')
    parser.add_argument('--start', '-s', type=int, default=0, help='Segundo inicial do video')
    parser.add_argument('--duration', '-d', type=int, default=15, help='Duracao em segundos')
    args = parser.parse_args(argv)

    # Create the output directory.
    os.makedirs(args.output, exist_ok=True)

    # Intermediate files.
    video_path = os.path.join(args.output, 'source_video.mp4')
    voice_ref_path = os.path.join(args.output, 'voice_ref.wav')
    audio_path = os.path.join(args.output, 'generated_audio.wav')

    # Run the pipeline.
    download_youtube_video(args.youtube_url, video_path, args.start, args.duration)
    extract_voice_reference(video_path, voice_ref_path)
    generate_cloned_audio(args.text, voice_ref_path, audio_path)
    final_video = run_lipsync(video_path, audio_path, args.output)

    # Do not report success when no output video was found; exit non-zero
    # with a message on stderr instead.
    if final_video is None:
        raise SystemExit("Erro: o lip sync nao gerou nenhum video .mp4")

    print(f"\n{'='*50}")
    print("Pipeline concluido!")
    print(f"Video final: {final_video}")
    print(f"{'='*50}")


if __name__ == '__main__':
    main()