# Hugging Face Space — status: Running on Zero
import spaces | |
import numpy as np | |
import gradio as gr | |
import torch | |
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor | |
from parler_tts import ParlerTTSForConditionalGeneration | |
from PyPDF2 import PdfReader | |
import re | |
import textwrap | |
import soundfile as sf | |
# Device configuration: use CUDA when torch reports it, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoints used by the text-to-speech stage.
# NOTE(review): the feature extractor is loaded from the "mini" checkpoint
# while the model and tokenizer come from "large"; in the code visible here it
# is only read for its sampling rate — confirm both checkpoints share that rate.
_TTS_CHECKPOINT = "parler-tts/parler-tts-large-v1"
_FEATURE_EXTRACTOR_CHECKPOINT = "parler-tts/parler-tts-mini-v1"

# Initialize models and tokenizers
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(_TTS_CHECKPOINT)
tts_model = tts_model.to(device)
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
feature_extractor = AutoFeatureExtractor.from_pretrained(_FEATURE_EXTRACTOR_CHECKPOINT)

# Sampling rate returned alongside every generated waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed passed to torch.manual_seed() before each generation.
SEED = 42
# Helper function to extract text from a PDF
def pdf_to_text(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Path of the PDF file to open.

    Returns:
        The text extracted from each page, joined with newlines. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
    """
    with open(pdf_file, "rb") as file:
        pdf_reader = PdfReader(file)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next one; the generator is consumed while the
        # file is still open.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
# Helper function to split text into sentences using regex
# A sentence boundary is any run of whitespace that directly follows ".", "!"
# or "?". Requiring the whitespace keeps decimals such as "3.14" intact, and
# the look-behind keeps the terminator attached to its sentence.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def split_text_into_sentences(text):
    """Split text into sentences, keeping each sentence's end punctuation.

    Args:
        text: The text to split. ``None`` or an empty string is accepted.

    Returns:
        A list of stripped, non-empty sentences in their original order.
        Text without any terminator is returned as a single sentence.
    """
    if not text:
        return []
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(text))
    return [piece for piece in pieces if piece]
# Translation function
# The file imports `spaces` (Hugging Face ZeroGPU); GPU work has to run inside
# a function decorated with @spaces.GPU. The decorator is documented as having
# no effect outside ZeroGPU environments.
@spaces.GPU
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate text with a Helsinki-NLP MarianMT checkpoint.

    Args:
        source_text: The text to translate.
        source_lang: Source language code used to build the checkpoint name.
        target_lang: Target language code used to build the checkpoint name.
        batch_size: Number of text chunks translated per ``generate`` call.

    Returns:
        The translated chunks joined by single spaces; an empty string when
        ``source_text`` produces no chunks.
    """
    if source_lang == "en" and target_lang == "tr":
        # English -> Turkish uses the dedicated "tc-big" checkpoint.
        model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
    else:
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    # Chunks are at most 512 characters wide; the tokenizer additionally
    # truncates each chunk to 512 tokens.
    text_chunks = textwrap.wrap(source_text, 512)
    translated_chunks = []
    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Pass the whole encoding, not just input_ids: the batch is padded, so
        # generate() needs the attention mask to ignore the pad tokens.
        output_ids = model.generate(**encoded, max_new_tokens=512)
        translated_chunks.extend(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        )
    return " ".join(translated_chunks)
# Function to combine audio arrays
def combine_audio_arrays(audio_list):
    """Concatenate audio clips end to end along the first axis.

    Args:
        audio_list: Iterable of NumPy arrays (or NumPy scalars) to join in
            order.

    Returns:
        One array holding all clips back to back. An empty input yields an
        empty float32 array instead of raising.
    """
    # A clip that was squeezed down to a single sample is 0-d, which
    # np.concatenate rejects; promote such clips to 1-d first.
    clips = [np.atleast_1d(clip) for clip in audio_list]
    if not clips:
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(clips, axis=0)
# Function to generate audio for a single sentence
# The file imports `spaces` (Hugging Face ZeroGPU); GPU work has to run inside
# a function decorated with @spaces.GPU. The decorator is documented as having
# no effect outside ZeroGPU environments.
@spaces.GPU
def generate_single_wav_from_text(sentence, description):
    """Synthesize one sentence with the module-level Parler-TTS model.

    Args:
        sentence: The text passed to the model as the prompt.
        description: The voice description passed as the model's main input;
            surrounding whitespace is stripped before tokenizing.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the
        generated output moved to the CPU, converted to NumPy and squeezed.
    """
    # Re-seed on every call so that sampling (do_sample=True) starts from the
    # same RNG state for each sentence.
    torch.manual_seed(SEED)
    inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt = tts_tokenizer(sentence, return_tensors="pt").to(device)
    generation = tts_model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_input_ids=prompt.input_ids,
        prompt_attention_mask=prompt.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    audio_arr = generation.cpu().numpy().squeeze()
    return SAMPLE_RATE, audio_arr
# Gradio Interface

# Target languages offered for each source language. The same table fills the
# dropdowns initially and refreshes the target list when the source changes,
# so the two can no longer disagree.
TARGET_LANGS_BY_SOURCE = {
    "en": ["de", "fr", "tr"],
    "tr": ["en"],
    "de": ["en", "fr"],
    "fr": ["en", "de"],
}

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
            # File extensions in `file_types` need their leading dot.
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"], visible=False)
            text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(choices=list(TARGET_LANGS_BY_SOURCE), label="Source Language", value="en", interactive=True)
            target_lang = gr.Dropdown(choices=TARGET_LANGS_BY_SOURCE["en"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(
                label="Voice Description",
                lines=2,
                value="Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
            )
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
            markdown_output = gr.Markdown()

    def update_target_lang(source_lang):
        """Return a dropdown update listing the targets for `source_lang`.

        The first available target becomes the selected value.
        """
        targets = TARGET_LANGS_BY_SOURCE[source_lang]
        return gr.update(choices=targets, value=targets[0])

    def handle_input(input_mode, pdf_input, text_input):
        """Return the text to process for the selected input mode.

        In "Upload PDF" mode the text is extracted from the uploaded file;
        otherwise the typed text is returned unchanged.

        Raises:
            gr.Error: If PDF mode is selected but no file was provided.
        """
        if input_mode != "Upload PDF":
            return text_input
        if pdf_input is None:
            raise gr.Error("Please upload a PDF file first.")
        # Accept both file objects exposing `.name` and plain path strings
        # (such as the path used in the examples below).
        pdf_path = getattr(pdf_input, "name", pdf_input)
        return pdf_to_text(pdf_path)

    def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
        """Stream audio and a transcript, one sentence at a time.

        Yields:
            ``((sample_rate, combined_audio), markdown_text)`` after every
            synthesized sentence, where ``combined_audio`` holds all audio
            generated so far.

        Raises:
            gr.Error: If there is no text to synthesize.
        """
        text = handle_input(input_mode, pdf_input, text_input)
        if not text or not text.strip():
            # Without this the generator would end without yielding anything
            # (or fail on None) and the user would get no feedback.
            raise gr.Error("No text to synthesize. Type some text or upload a PDF that contains text.")
        if translate_checkbox:
            text = translate(text, source_lang, target_lang)
        sentences = split_text_into_sentences(text)
        all_audio = []
        all_text = ""
        for sentence in sentences:
            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
            all_audio.append(audio_arr)
            combined_audio = combine_audio_arrays(all_audio)
            all_text += f"**Sentence**: {sentence}\n\n"
            yield (sample_rate, combined_audio), all_text

    examples = [
        [
            "Type Text",  # Example for text input mode
            None,  # No PDF
            "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her palace window, which had a carved frame of black wood.",
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "In an inferior recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average.",
        ],
        [
            "Upload PDF",  # Example for PDF input mode
            "Ethics.pdf",  # PDF name
            None,  # No direct text input
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.",
        ],
    ]

    # Components shared by the examples widget and the run button.
    pipeline_inputs = [input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description]
    pipeline_outputs = [audio_output, markdown_output]

    # Show only the widget that matches the selected input mode.
    input_mode.change(
        fn=lambda choice: [gr.update(visible=choice == "Upload PDF"), gr.update(visible=choice == "Type Text")],
        inputs=input_mode,
        outputs=[pdf_input, text_input],
    )
    gr.Examples(examples=examples, fn=run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs, cache_examples=False)
    source_lang.change(update_target_lang, inputs=source_lang, outputs=target_lang)
    run_button.click(run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs)

demo.launch(share=True)