emirhanno committed on
Commit
7f5b6cf
1 Parent(s): 297157b

initial commits

Browse files
Files changed (2) hide show
  1. app.py +144 -0
  2. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import MarianTokenizer, MarianMTModel
4
+ from pdf2docx import Converter
5
+ from docx import Document
6
+ from parler_tts import ParlerTTSForConditionalGeneration
7
+ from transformers import AutoTokenizer
8
+ import soundfile as sf
9
+ from pydub import AudioSegment
10
+ import os
11
+ import nltk
12
+ from PyPDF2 import PdfReader
13
+ import textwrap
14
+
15
# Download the sentence-splitting data used by nltk.sent_tokenize.
# Recent NLTK releases (3.9 and later) look up the "punkt_tab" resource
# instead of the older pickled "punkt" models, so fetch both to work with
# whichever NLTK version ends up installed.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

# Device configuration: use the GPU when one is available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
+
21
+ # Translation function
22
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate text with a Helsinki-NLP OPUS-MT (Marian) model.

    The text is wrapped into chunks of at most 512 characters, the chunks are
    translated in batches, and the translated chunks are joined with spaces.

    Args:
        source_text: Text to translate.
        source_lang: Source language code used in the model name (e.g. "en").
        target_lang: Target language code used in the model name (e.g. "de").
        batch_size: Number of chunks passed to the model per generate() call.

    Returns:
        The translated text, or "" when there is nothing to translate.
    """
    # Nothing to translate: return before the expensive model download/load.
    if not source_text or not source_text.strip():
        return ""

    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    text_chunks = textwrap.wrap(source_text, 512)
    translated_chunks = []

    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Pass the attention mask explicitly so padded positions in a batch
        # of unequal-length chunks are ignored by the encoder.
        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_new_tokens=512,
        )
        translated_chunks.extend(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        )

    # join() avoids repeated string concatenation and the trailing space.
    return " ".join(translated_chunks)
41
+
42
+ # Function to extract text from PDF
43
def pdf_to_text(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, in page order, separated by newlines.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PdfReader(file)
        # Guard against extract_text() yielding None (e.g. a page without a
        # text layer on some PyPDF2 versions), which would break concatenation.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # A newline between pages keeps the last word of one page from being
    # glued to the first word of the next.
    return "\n".join(page_texts)
51
+
52
+ # Load TTS model and tokenizer
53
# Load the Parler-TTS tokenizer and model once, at import time, from the same
# checkpoint; the model is moved to the selected device.
TTS_CHECKPOINT = "parler-tts/parler-tts-large-v1"
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_CHECKPOINT)
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_CHECKPOINT).to(device)
55
+
56
+ # Function to split text into sentences
57
def split_text_into_sentences(text):
    """Return the sentences of `text` as produced by nltk.sent_tokenize."""
    return nltk.sent_tokenize(text)
60
+
61
+ # Function to generate audio from text
62
def generate_wav_from_text(prompt, description, output_file_prefix):
    """Synthesize `prompt` as speech and write it to a WAV file.

    Args:
        prompt: The text to be spoken.
        description: Natural-language description of the desired voice.
        output_file_prefix: Output path without extension; ".wav" is appended.

    Returns:
        The path of the WAV file that was written.
    """
    # Parler-TTS takes the voice description as `input_ids` and the text to
    # speak as `prompt_input_ids`. Passing them the other way round makes the
    # model read the description aloud instead of the sentence.
    description_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
    prompt_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    generation = tts_model.generate(input_ids=description_ids, prompt_input_ids=prompt_ids)
    audio_arr = generation.cpu().numpy().squeeze()
    output_file = f"{output_file_prefix}.wav"
    sf.write(output_file, audio_arr, tts_model.config.sampling_rate)
    return output_file
71
+
72
+ # Function to combine audio files
73
def combine_wav_files(output_file, *input_files, silence_duration=500):
    """Concatenate WAV files into one, with a pause after each clip.

    Args:
        output_file: Path of the combined WAV file to write.
        *input_files: Paths of the WAV files to append, in order.
        silence_duration: Value passed to AudioSegment.silent(duration=...)
            for the pause appended after every clip.
    """
    pause = AudioSegment.silent(duration=silence_duration)
    merged = AudioSegment.empty()

    for wav_path in input_files:
        clip = AudioSegment.from_wav(wav_path)
        merged = merged + (clip + pause)

    merged.export(output_file, format='wav')
82
+
83
+ # Function to update target language options based on the source language
84
def update_target_lang_options(source_lang):
    """Build a dropdown update with the target languages for `source_lang`.

    Args:
        source_lang: The currently selected source language code.

    Returns:
        A gr.update() whose choices are the supported targets and whose value
        is the first of them, or None when the source language is unknown.
    """
    options = {
        "en": ["de", "fr", "tr"],
        "tr": ["en"],
        "de": ["en", "fr"],
        "fr": ["en", "de"],
    }
    targets = options.get(source_lang, [])
    # Indexing [0] on an empty list would raise IndexError for an unknown or
    # cleared source language, so fall back to no selection instead.
    return gr.update(choices=targets, value=targets[0] if targets else None)
92
+
93
+ # Main Gradio function
94
def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description):
    """Turn an uploaded PDF into per-sentence audio plus one combined track.

    Args:
        pdf_file: The value of the upload component (an object exposing the
            file path as `.name`, or a plain path string).
        translate_checkbox: Whether to translate the text before synthesis.
        source_lang: Source language code for translation.
        target_lang: Target language code for translation.
        description: Voice description forwarded to the TTS model.

    Returns:
        A tuple (outputs, combined_output_file), where outputs is a list of
        (sentence, audio_file_path) pairs and combined_output_file is the
        path of the WAV file containing all sentences.

    Raises:
        gr.Error: If no file was uploaded or no text could be extracted.
    """
    import tempfile

    if pdf_file is None:
        raise gr.Error("Please upload a PDF file first.")

    # Accept both an object carrying the path in `.name` and a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = pdf_to_text(pdf_path)

    # Translate if translation checkbox is selected
    if translate_checkbox:
        text = translate(text, source_lang, target_lang)

    # Skip blank fragments so the TTS model is never asked to speak nothing.
    sentences = [s for s in split_text_into_sentences(text) if s.strip()]
    if not sentences:
        raise gr.Error("No readable text was found in the PDF.")

    # Write into a per-request directory: fixed file names in the working
    # directory would be overwritten by concurrent requests.
    work_dir = tempfile.mkdtemp(prefix="pdf_tts_")

    audio_files = []
    outputs = []
    for index, sentence in enumerate(sentences, start=1):
        output_file_prefix = os.path.join(work_dir, f"sentence_{index}")
        audio_file = generate_wav_from_text(sentence, description, output_file_prefix)
        audio_files.append(audio_file)
        outputs.append((sentence, audio_file))

    combined_output_file = os.path.join(work_dir, "sentences_combined.wav")
    combine_wav_files(combined_output_file, *audio_files)

    return outputs, combined_output_file
115
+
116
+ # Gradio interface
117
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=1):
            # File-type filters that are extensions need the leading dot.
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'])
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            # The language pickers start hidden so the initial layout matches
            # the unchecked "Enable Translation" box; the toggle handler below
            # shows them when translation is enabled.
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True, visible=False)
            # Initial choices mirror what update_target_lang_options offers
            # for the default source language "en".
            target_lang = gr.Dropdown(choices=["de", "fr", "tr"], label="Target Language", value="tr", interactive=True, visible=False)
            description = gr.Textbox(label="Voice Description",
                                     value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
            process_btn = gr.Button("Process")
        with gr.Column(scale=2):
            # NOTE(review): "audio" may not be a datatype gr.Dataframe
            # supports; confirm how the second column is rendered.
            output = gr.Dataframe(headers=["Sentence", "Audio"], label="Generated Audio", datatype=["str", "audio"])
            combined_audio = gr.Audio(label="Combined Audio with Silence", type="filepath")

    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
        """Forward the button click to process_pdf."""
        return process_pdf(pdf_input, translate_checkbox, source_lang, target_lang, description)

    def handle_translation_toggle(translate_checkbox):
        """Show both language dropdowns only while translation is enabled."""
        show = bool(translate_checkbox)
        return gr.update(visible=show), gr.update(visible=show)

    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
    process_btn.click(handle_process, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[output, combined_audio])

demo.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ torch>=1.9.0
3
+ transformers>=4.11.3
4
+ sentencepiece
5
+ pdf2docx
6
+ python-docx
7
+ PyPDF2
8
+ pydub
9
+ soundfile
10
+ nltk