import spaces
import numpy as np
import gradio as gr
import torch
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration
from PyPDF2 import PdfReader
import re
import textwrap
import soundfile as sf

# Device configuration: use CUDA when torch reports it as available, else the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Initialize models and tokenizers
# The TTS model and its tokenizer are loaded from the large-v1 checkpoint and
# the model is then moved onto the selected device.
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1")
tts_model = tts_model.to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
# The feature extractor is loaded from the mini-v1 checkpoint; in this file it
# is only read for its sampling rate.
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed handed to torch.manual_seed before each TTS generation
SEED = 42

# Helper function to extract text from a PDF
def pdf_to_text(pdf_file):
    """Return the extracted text of every page of a PDF as one string.

    Args:
        pdf_file: Path of the PDF file; it is opened in binary mode.

    Returns:
        The page texts joined back to back with no separator. A page whose
        extraction yields a falsy value contributes an empty string.
    """
    with open(pdf_file, 'rb') as handle:
        reader = PdfReader(handle)
        # Extraction happens while the file handle is still open.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)

# Helper function to split text into sentences using regex
# A sentence boundary is a run of whitespace that directly follows '.', '!' or
# '?' (optionally with one closing quote/bracket in between). Compiled once at
# import time instead of on every call.
_SENTENCE_BOUNDARY = re.compile(r'(?:(?<=[.!?])|(?<=[.!?]["\')\]]))\s+')


def split_text_into_sentences(text):
    """Split *text* into sentences, keeping each sentence's end punctuation.

    The text is cut only at whitespace that follows '.', '!' or '?', so a
    period inside a token such as "3.14" does not start a new sentence, and
    the terminating punctuation stays attached to its sentence. A period that
    is not followed by whitespace is never treated as a boundary.

    Args:
        text: The text to split. ``None`` or an empty string is accepted.

    Returns:
        A list of stripped, non-empty sentences in their original order.
        Fragments without any letter or digit (e.g. a stray "...") are
        dropped.
    """
    if not text:
        return []

    sentences = []
    for fragment in _SENTENCE_BOUNDARY.split(text):
        fragment = fragment.strip()
        # Skip fragments that carry no readable content at all.
        if any(ch.isalnum() for ch in fragment):
            sentences.append(fragment)
    return sentences

@spaces.GPU(duration=120)
# Translation function
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate *source_text* with a Helsinki-NLP MarianMT checkpoint.

    The checkpoint name is built from the language pair, except for
    English -> Turkish, which uses the "tc-big" checkpoint. The tokenizer and
    model are loaded on every call.

    Args:
        source_text: Text to translate.
        source_lang: Source language code used in the checkpoint name.
        target_lang: Target language code used in the checkpoint name.
        batch_size: Number of text chunks translated per ``generate`` call.

    Returns:
        The translated chunks joined by single spaces; an empty string when
        *source_text* produces no chunks.
    """
    if source_lang == 'en' and target_lang == 'tr':
        model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
    else:
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    # Chunks are at most 512 characters wide; the tokenizer additionally
    # truncates each chunk to 512 tokens below.
    text_chunks = textwrap.wrap(source_text, 512)
    translated_chunks = []

    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(device)
        # Pass the whole encoding so the attention mask reaches generate();
        # the batch is padded and the mask marks which positions are padding.
        output_ids = model.generate(**encoded, max_new_tokens=512)
        translated_chunks.extend(
            tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids
        )

    return " ".join(translated_chunks)

# Function to combine audio arrays
def combine_audio_arrays(audio_list):
    """Join the arrays in *audio_list* end to end along the first axis.

    Args:
        audio_list: Sequence of NumPy arrays to concatenate, in order.

    Returns:
        A single array holding all inputs one after another.
    """
    return np.concatenate(audio_list, axis=0)

@spaces.GPU(duration=35)
# Function to generate audio for a single sentence
def generate_single_wav_from_text(sentence, description):
    """Synthesize speech for one sentence in the voice given by *description*.

    Args:
        sentence: The text to speak; tokenized as the generation prompt.
        description: Free-text voice description; stripped, then tokenized as
            the model's main input.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple where ``audio`` is the generated
        output moved to the CPU, converted to NumPy and squeezed.
    """
    # Re-seed on every call so each generation starts from the same RNG state.
    torch.manual_seed(SEED)

    description_enc = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    sentence_enc = tts_tokenizer(sentence, return_tensors="pt").to(device)

    generate_kwargs = dict(
        input_ids=description_enc.input_ids,
        prompt_input_ids=sentence_enc.input_ids,
        attention_mask=description_enc.attention_mask,
        prompt_attention_mask=sentence_enc.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    waveform = tts_model.generate(**generate_kwargs)
    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()

# Gradio Interface

# Translation targets offered for each source language. Used both for the
# initial contents of the target dropdown and for refreshing it whenever the
# source language changes, so the two can never disagree.
TARGET_LANGS_BY_SOURCE = {
    "en": ["de", "fr", "tr"],
    "tr": ["en"],
    "de": ["en", "fr"],
    "fr": ["en", "de"],
}

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
            # Gradio expects file extensions to be written with a leading dot.
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'], visible=False)
            text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
            # Start with every target available for the default source ("en"),
            # not just the preselected one.
            target_lang = gr.Dropdown(choices=TARGET_LANGS_BY_SOURCE["en"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(label="Voice Description", lines=2,
                                     value="Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
            markdown_output = gr.Markdown()

    def update_target_lang(source_lang):
        """Return a dropdown update listing the targets valid for *source_lang*.

        The first valid target becomes the selected value.
        """
        targets = TARGET_LANGS_BY_SOURCE[source_lang]
        return gr.update(choices=targets, value=targets[0])

    def toggle_input_widgets(choice):
        """Show the PDF uploader or the textbox depending on the input mode."""
        return [gr.update(visible=choice == "Upload PDF"), gr.update(visible=choice == "Type Text")]

    def handle_input(input_mode, pdf_input, text_input):
        """Return the text to process for the selected input mode.

        In "Upload PDF" mode the uploaded file is read with ``pdf_to_text``;
        otherwise the typed text is returned (an empty string when it is
        missing).

        Raises:
            gr.Error: If "Upload PDF" is selected but no file was uploaded.
        """
        if input_mode == "Upload PDF":
            if pdf_input is None:
                raise gr.Error("Please upload a PDF file first.")
            # Accept either an object exposing the path as ``.name`` or a
            # plain path string.
            pdf_path = getattr(pdf_input, "name", pdf_input)
            return pdf_to_text(pdf_path)
        return text_input or ""

    def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
        """Turn the chosen input into speech, one sentence at a time.

        The input text is optionally translated, split into sentences, and
        each sentence is synthesized in turn. After every sentence the
        function yields the audio generated so far together with a Markdown
        transcript of the sentences spoken so far.

        Yields:
            ``((sample_rate, combined_audio), transcript_markdown)`` tuples.

        Raises:
            gr.Error: If there is no text to read.
        """
        text = handle_input(input_mode, pdf_input, text_input)
        if not text or not text.strip():
            # Without this the generator would finish without yielding and
            # the user would get no feedback at all.
            raise gr.Error("No text to read: type some text or upload a PDF with extractable text.")

        if translate_checkbox:
            text = translate(text, source_lang, target_lang)

        sentences = split_text_into_sentences(text)
        audio_chunks = []
        transcript_parts = []
        for sentence in sentences:
            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
            audio_chunks.append(audio_arr)
            transcript_parts.append(f"**Sentence**: {sentence}\n\n")
            yield (sample_rate, combine_audio_arrays(audio_chunks)), "".join(transcript_parts)

    examples = [
        [
            "Type Text",  # Example for text input mode
            None,  # No PDF
            "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her palace window, which had a carved frame of black wood.",
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "In an inferior recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average."
        ],
        [
            "Upload PDF",  # Example for PDF input mode
            "Ethics.pdf",  # PDF name
            None,  # No direct text input
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise."
        ]
    ]

    # All pipeline inputs/outputs, shared by the examples and the run button.
    pipeline_inputs = [input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description]
    pipeline_outputs = [audio_output, markdown_output]

    input_mode.change(
        fn=toggle_input_widgets,
        inputs=input_mode,
        outputs=[pdf_input, text_input],
    )
    gr.Examples(examples=examples, fn=run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs, cache_examples=False)
    source_lang.change(update_target_lang, inputs=source_lang, outputs=target_lang)

    run_button.click(run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs)

# Start the Gradio app; share=True asks Gradio for a public share link.
demo.launch(share=True)