openai/whisper-large-v3 · Single word transcription for a audio file with ~1.5m frames

import torchaudio

processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3")

sample_wav = download_asset(audio_file)
print(torchaudio.info(sample_wav))

AudioMetaData(sample_rate=8000, num_frames=1564224, num_channels=1, bits_per_sample=8, encoding=PCM_U)

# Resample  to match Whisper sampling rate
target_sample_rate = processor.feature_extractor.sampling_rate
print(f'Whisper sampling rate: {target_sample_rate}')

if orig_sample_rate != target_sample_rate:
    transform = torchaudio.transforms.Resample(orig_freq=orig_sample_rate, new_freq=target_sample_rate)
    waveform = transform(waveform)

print(f"waveform original shape 2D: {waveform.shape}")
waveform1 = waveform.squeeze(0)
print(f"waveform modified shape 1D: {waveform1.shape}")

waveform original shape 2D: torch.Size([1, 3128448])
waveform modified shape 1D: torch.Size([3128448])

# Load and preprocess the audio data
inputs = processor(waveform1, sampling_rate=processor.feature_extractor.sampling_rate, return_tensors="pt").input_features.to(device)

# Generate transcription
predicted_ids = model.generate(inputs, language="en")

# Decode the transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) 

# Print the transcription
print(transcription)

[' Thank you.']

The audio is 1.5m frames. How come I only get transcription of a single-word? Does it not auto-regressively decode each input id?