"""Gradio demo that timestamps voice activity in an audio clip.

The script loads the ``pyannote/voice-activity-detection`` model through
``gr.Interface.load``, wraps it in a small formatting layer and launches a
web UI at import time.
"""
import ast

import gradio as gr

# Loaded once at start-up; the resulting object is called like a function
# with an audio file path (see ``inference``).
model = gr.Interface.load("huggingface/pyannote/voice-activity-detection")


def format_inference(output):
    """Render detected voice segments as human-readable text.

    Args:
        output: A sequence of mappings, each providing ``'start'`` and
            ``'stop'`` entries. May be empty or ``None``.

    Returns:
        One ``"Start: <start>s; Stop: <stop>s"`` line per segment, joined
        with newlines, or ``"No voice activity detected."`` when ``output``
        is falsy.
    """
    if not output:
        return "No voice activity detected."
    # The original emitted "Stop{...}" without the ": " separator, which
    # produced text such as "Stop1.5s"; both halves now use the same layout.
    return "\n".join(
        f"Start: {out['start']}s; Stop: {out['stop']}s" for out in output
    )


def inference(audio_file):
    """Run voice-activity detection on ``audio_file`` and format the result.

    Args:
        audio_file: Value forwarded unchanged to ``model``; the UI below
            supplies a file path (``type="filepath"``).

    Returns:
        The text produced by ``format_inference``.
    """
    output = model(audio_file)
    # Presumably the model returns the textual repr of a Python list of
    # dicts -- TODO confirm against the hosted model's response format.
    # ``literal_eval`` only accepts literals, so it cannot execute code.
    output_list = ast.literal_eval(output)
    return format_inference(output_list)


# NOTE(review): ``gr.inputs`` / ``gr.outputs`` and ``theme="grass"`` look like
# an older Gradio API surface -- confirm against the pinned Gradio version
# before upgrading.
inputs = gr.inputs.Audio(label="Input Audio", type="filepath", source="microphone")
outputs = gr.outputs.Textbox(label="Voice timestamps", type="auto")

title = "Voice Activity Detection"
description = "Record or upload an audio file and detected voices will be timestamped."
examples = ['samples/talk.wav', 'samples/talk2.wav', 'samples/silence.wav', ]
article = "pyannote, https://github.com/pyannote/pyannote-audio"

gr.Interface(inference,
             inputs,
             outputs,
             title=title,
             description=description,
             article=article,
             examples=examples,
             theme="grass").launch(debug=True)