import spaces
import numpy as np
import gradio as gr
import torch
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration
from PyPDF2 import PdfReader
import re
import textwrap

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize models and tokenizers
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
# Load the feature extractor from the same checkpoint as the model so the
# sampling rate is guaranteed to match the generated audio
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-large-v1")
SAMPLE_RATE = feature_extractor.sampling_rate
SEED = 42

# Helper function to extract text from a PDF
def pdf_to_text(pdf_file):
    with open(pdf_file, 'rb') as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() or ""
    return text

# Helper function to split text into sentences, keeping the terminal
# punctuation so the TTS model can use it for prosody
def split_text_into_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Translation function: loads the relevant Helsinki-NLP MarianMT checkpoint
# on demand and translates the text in batches
@spaces.GPU(duration=120)
def translate(source_text, source_lang, target_lang, batch_size=16):
    # en->tr has a dedicated "tc-big" checkpoint; every other pair follows
    # the standard opus-mt naming scheme
    if source_lang == 'en' and target_lang == 'tr':
        model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
    else:
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)
    
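    # Rough character-based chunking; the tokenizer call below also truncates
    # at 512 tokens as a safety net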
    text_chunks = textwrap.wrap(source_text, 512)
    translated_text = ""
    
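    # Run the chunks through the model in batches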
    for i in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[i:i+batch_size]
        input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
        output_ids = model.generate(input_ids, max_new_tokens=512)
        
        for output in output_ids:
            output_text = tokenizer.decode(output, skip_special_tokens=True)
            translated_text += output_text + " "
    
    return translated_text.strip()

# Function to combine audio arrays
def combine_audio_arrays(audio_list):
    combined_audio = np.concatenate(audio_list, axis=0)
    return combined_audio

# Function to generate audio for a single sentence
@spaces.GPU(duration=35)
def generate_single_wav_from_text(sentence, description):
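    # Re-seed before every sentence so the sampled voice stays consistent
    # across the whole document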
    torch.manual_seed(SEED)
    inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt = tts_tokenizer(sentence, return_tensors="pt").to(device)

    generation = tts_model.generate(
        input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
        prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
    )
    audio_arr = generation.cpu().numpy().squeeze()
    return SAMPLE_RATE, audio_arr

# Gradio Interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
            pdf_input = gr.File(label="Upload PDF", file_types=['pdf'], visible=False)
            text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(label="Voice Description", lines=2, 
                                     value="Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
            markdown_output = gr.Markdown()

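    # Restrict the target-language dropdown to the pairs the app supports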
    def update_target_lang(source_lang):
        options = {
            "en": ["de", "fr", "tr"],
            "tr": ["en"],
            "de": ["en", "fr"],
            "fr": ["en", "de"]
        }
        return gr.update(choices=options[source_lang], value=options[source_lang][0])

    def handle_input(input_mode, pdf_input, text_input):
        if input_mode == "Upload PDF":
            # gr.File may return a filepath string or a tempfile-like object
            # with a .name attribute, depending on the Gradio version
            pdf_path = pdf_input if isinstance(pdf_input, str) else pdf_input.name
            return pdf_to_text(pdf_path)
        return text_input

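    # End-to-end pipeline: read the input, optionally translate it, then
    # synthesize and stream the audio sentence by sentence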
    def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
        text = handle_input(input_mode, pdf_input, text_input)
        
        if translate_checkbox:
            text = translate(text, source_lang, target_lang)
        
        sentences = split_text_into_sentences(text)
        all_audio = []
        all_text = ""
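        # Stream incrementally: after each sentence, yield the audio so far
        # plus the running transcript so the UI updates during generation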
        for sentence in sentences:
            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
            all_audio.append(audio_arr)
            combined_audio = combine_audio_arrays(all_audio)
            all_text += f"**Sentence**: {sentence}\n\n"
            yield (sample_rate, combined_audio), all_text

    examples = [
        [
            "Type Text",  # Example for text input mode
            None,  # No PDF
            "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her palace window, which had a carved frame of black wood.",
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "In an inferior recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average."
        ],
        [
            "Upload PDF",  # Example for PDF input mode
            "Ethics.pdf",  # PDF name
            None,  # No direct text input
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise."
        ]            
    ]

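    # Show only the input widget that matches the selected mode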
    input_mode.change(
        fn=lambda choice: [gr.update(visible=choice == "Upload PDF"), gr.update(visible=choice == "Type Text")],
        inputs=input_mode,
        outputs=[pdf_input, text_input],
    )
    gr.Examples(
        examples=examples,
        fn=run_pipeline,
        inputs=[input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description],
        outputs=[audio_output, markdown_output],
        cache_examples=False,
    )
    source_lang.change(update_target_lang, inputs=source_lang, outputs=target_lang)

    run_button.click(
        run_pipeline,
        inputs=[input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description],
        outputs=[audio_output, markdown_output],
    )

demo.launch(share=True)