File size: 4,062 Bytes
66e2a44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/env python3
"""
Pipeline completo: YouTube -> Audio com voz clonada -> Lip Sync
Uso: python full_pipeline.py --youtube-url "..." --text "Seu texto" --output video_final.mp4
"""

import argparse
import os
import subprocess
import tempfile

# Compatibility shim for PyTorch 2.6+: after this point every call made
# through ``torch.load`` runs with ``weights_only=False``, overriding whatever
# value the caller supplied.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects,
# so only checkpoints from trusted sources should be loaded -- confirm this is
# acceptable for the models this pipeline pulls in.
import torch

original_load = torch.load


def patched_load(*args, **kwargs):
    """Forward to the saved ``torch.load`` with ``weights_only`` forced to False."""
    forced_kwargs = {**kwargs, 'weights_only': False}
    return original_load(*args, **forced_kwargs)


torch.load = patched_load


def download_youtube_video(url: str, output_path: str, start: int = 0, duration: int = 15):
    """Download a clip of a YouTube video with yt-dlp.

    Args:
        url: Address of the video to download.
        output_path: File path handed to yt-dlp's ``-o`` option.
        start: Offset, in seconds, at which the clip begins.
        duration: Length of the clip, in seconds.

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: If ``start`` is negative or ``duration`` is not positive.
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    """
    # Reject impossible ranges up front instead of letting the external tool
    # fail with a less obvious message.
    if start < 0:
        raise ValueError(f"start must be >= 0, got {start}")
    if duration <= 0:
        raise ValueError(f"duration must be > 0, got {duration}")

    print("[1/4] Baixando video do YouTube...")

    cmd = [
        'yt-dlp',
        '-f', 'best[height<=720]',
        # Ask yt-dlp itself for the time range ("*" marks a time range rather
        # than a chapter name). Arguments passed via --postprocessor-args only
        # reach ffmpeg post-processing steps, which are not guaranteed to run
        # for a single pre-merged format, so the clip could come back untrimmed.
        # Cuts land on keyframes, so the boundaries may be approximate.
        '--download-sections', f'*{start}-{start + duration}',
        '-o', output_path,
        # End of options: a URL that begins with "-" must not be parsed as a flag.
        '--', url,
    ]
    subprocess.run(cmd, check=True)
    return output_path


def extract_voice_reference(video_path: str, output_path: str, duration: int = 15,
                            sample_rate: int = 22050):
    """Extract a mono reference audio clip from a video using ffmpeg.

    Args:
        video_path: Input media file passed to ffmpeg's ``-i``.
        output_path: Destination audio file; overwritten if present (``-y``).
        duration: Maximum length of the extracted audio, in seconds.
        sample_rate: Output sampling rate in Hz (ffmpeg ``-ar``).

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: If ``duration`` or ``sample_rate`` is not positive.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import sys

    if duration <= 0:
        raise ValueError(f"duration must be > 0, got {duration}")
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be > 0, got {sample_rate}")

    print("[2/4] Extraindo audio de referencia...")

    cmd = [
        'ffmpeg', '-y',
        '-i', video_path,
        '-ar', str(sample_rate),
        '-ac', '1',  # single (mono) channel
        '-t', str(duration),
        output_path,
    ]
    try:
        subprocess.run(cmd, capture_output=True, check=True)
    except subprocess.CalledProcessError as err:
        # ffmpeg's output was captured, so it would otherwise be lost; show its
        # diagnostics before propagating the same exception to the caller.
        if err.stderr:
            print(err.stderr.decode(errors='replace'), file=sys.stderr)
        raise
    return output_path


def generate_cloned_audio(text: str, voice_ref: str, output_path: str,
                          diffusion_steps: int = 10):
    """Synthesize ``text`` with StyleTTS2 using ``voice_ref`` as the target voice.

    Args:
        text: Text to be spoken; must contain non-whitespace characters.
        voice_ref: Path to the reference audio, passed as ``target_voice_path``.
        output_path: Destination WAV file.
        diffusion_steps: Value forwarded to ``StyleTTS2.inference``.

    Returns:
        ``output_path``, unchanged.

    Raises:
        ValueError: If ``text`` is blank or ``diffusion_steps`` is not positive.
    """
    # Validate before the (expensive) model import and load below.
    if not text or not text.strip():
        raise ValueError("text must not be empty")
    if diffusion_steps <= 0:
        raise ValueError(f"diffusion_steps must be > 0, got {diffusion_steps}")

    print("[3/4] Gerando audio com StyleTTS2...")

    # Imported lazily so the heavy dependencies load only when this step runs.
    from styletts2 import tts
    import scipy.io.wavfile as wavfile

    my_tts = tts.StyleTTS2()
    wav = my_tts.inference(
        text,
        target_voice_path=voice_ref,
        diffusion_steps=diffusion_steps,
    )
    # Rate written to the WAV header; assumed to match the rate of the samples
    # StyleTTS2 returns -- TODO confirm against the library.
    output_rate = 24000
    wavfile.write(output_path, output_rate, wav)
    return output_path


def run_lipsync(video_path: str, audio_path: str, output_dir: str):
    """Run the MuseTalk inference script and locate the video it produced.

    A temporary YAML config describing a single task is written, then
    ``python3 -m scripts.inference`` is executed inside the directory named by
    the ``MUSETALK_DIR`` environment variable (default ``/root/musetalk-space``).

    Args:
        video_path: Source video; converted to an absolute path for the config.
        audio_path: Driving audio; converted to an absolute path for the config.
        output_dir: Directory passed as ``--result_dir``.

    Returns:
        Absolute path of the most recently modified ``.mp4`` inside
        ``<output_dir>/v15``, or ``None`` when that directory is missing or
        holds no ``.mp4`` file.

    Raises:
        subprocess.CalledProcessError: If the inference script fails.
    """
    print("[4/4] Executando lip sync...")

    import yaml

    # The inference script runs with cwd=musetalk_dir, so a relative result
    # directory would be resolved there while the search below would resolve
    # it against this process's cwd. Pin it to one absolute location.
    output_dir = os.path.abspath(output_dir)

    # Build the single-task config
    config = {
        'task_0': {
            'video_path': os.path.abspath(video_path),
            'audio_path': os.path.abspath(audio_path),
            'bbox_shift': 5
        }
    }

    musetalk_dir = os.environ.get('MUSETALK_DIR', '/root/musetalk-space')

    # delete=False keeps the file on disk after it is closed so the child
    # process can read it; the finally clause below removes it.
    config_file = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False)
    try:
        with config_file:
            yaml.dump(config, config_file)

        cmd = [
            'python3', '-m', 'scripts.inference',
            '--inference_config', config_file.name,
            '--result_dir', output_dir
        ]
        subprocess.run(cmd, cwd=musetalk_dir, check=True)
    finally:
        # Remove the temp config even when dumping or inference fails.
        os.unlink(config_file.name)

    # Find the output video
    result_dir = os.path.join(output_dir, 'v15')
    if not os.path.isdir(result_dir):
        return None

    candidates = [
        os.path.join(result_dir, name)
        for name in os.listdir(result_dir)
        if name.endswith('.mp4')
    ]
    if not candidates:
        return None
    # listdir order is arbitrary; prefer the newest file so results left over
    # from an earlier run in the same directory are not returned.
    return max(candidates, key=os.path.getmtime)


def main():
    """Parse command-line arguments and run the four pipeline stages in order.

    Intermediate files (``source_video.mp4``, ``voice_ref.wav``,
    ``generated_audio.wav``) are written inside the ``--output`` directory,
    which is created if it does not exist.

    Raises:
        SystemExit: On invalid arguments (raised by argparse), or with an error
            message when the lip-sync stage produced no output video.
    """
    parser = argparse.ArgumentParser(description='Pipeline completo de video com lip sync')
    parser.add_argument('--youtube-url', '-y', required=True, help='URL do YouTube')
    parser.add_argument('--text', '-t', required=True, help='Texto para falar')
    parser.add_argument('--output', '-o', default='./output', help='Diretorio de saida')
    parser.add_argument('--start', '-s', type=int, default=0, help='Segundo inicial do video')
    parser.add_argument('--duration', '-d', type=int, default=15, help='Duracao em segundos')

    args = parser.parse_args()

    # Create the output directory
    os.makedirs(args.output, exist_ok=True)

    # Intermediate files
    video_path = os.path.join(args.output, 'source_video.mp4')
    voice_ref_path = os.path.join(args.output, 'voice_ref.wav')
    audio_path = os.path.join(args.output, 'generated_audio.wav')

    # Run the pipeline
    download_youtube_video(args.youtube_url, video_path, args.start, args.duration)
    extract_voice_reference(video_path, voice_ref_path)
    generate_cloned_audio(args.text, voice_ref_path, audio_path)
    final_video = run_lipsync(video_path, audio_path, args.output)

    # Do not report success (and exit status 0) when nothing was produced.
    if final_video is None:
        raise SystemExit("Erro: nenhum video de saida foi encontrado.")

    print(f"\n{'='*50}")
    print("Pipeline concluido!")
    print(f"Video final: {final_video}")
    print(f"{'='*50}")


if __name__ == '__main__':
    main()