I am developing an app that creates videos. The audio track is generated with gemini-2.5-pro-tts through the Google GenAI Python SDK. The problem is that the generated audio is sometimes cut off in the middle of the last word. For example, the complete phrase is “welcome to my home”, but the generated audio stops mid-word at “welcome to my ho-”. This happens in roughly one out of every 10 API calls.
Below is a minimal script that reproduces the issue:
# script.py
import argparse
import asyncio
import os
import wave
from google import genai
from google.genai import types
from google.genai.types import HttpOptions
def save_wave(path: str, audio_data: bytes) -> None:
    """Write raw PCM bytes to ``path`` as a mono, 16-bit, 24 kHz WAV file.

    The parent directory of ``path`` is created when it does not exist yet.

    Args:
        path: Destination file. May be a bare filename (written to the current
            working directory) or include directory components.
        audio_data: Raw PCM frames. They are written unmodified; the header
            values below are assumed to match the TTS output format
            (NOTE(review): confirm against the model documentation).
    """
    parent_dir = os.path.dirname(path)
    # os.path.dirname("file.wav") is "", and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with wave.open(path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(24000)
        wf.writeframes(audio_data)
async def generate_tts(text: str, output_path: str, voice_name: str, model: str) -> None:
    """Synthesize ``text`` to speech and save the audio to ``output_path``.

    A Vertex AI client is created from the ``GOOGLE_CLOUD_PROJECT`` and
    ``GOOGLE_CLOUD_LOCATION`` (default ``us-central1``) environment variables,
    the text is wrapped in a short style instruction, and the audio returned by
    ``generate_content`` is written with ``save_wave``.

    Args:
        text: The sentence(s) to synthesize.
        output_path: Destination WAV file path.
        voice_name: Name of the prebuilt voice to use.
        model: Model identifier passed to the API.

    Raises:
        KeyError: If ``GOOGLE_CLOUD_PROJECT`` is not set.
        RuntimeError: If the response contains no candidate or no audio bytes.
    """
    client = genai.Client(
        http_options=HttpOptions(api_version="v1"),
        vertexai=True,
        project=os.environ["GOOGLE_CLOUD_PROJECT"],
        location=os.environ.get("GOOGLE_CLOUD_LOCATION", "us-central1"),
    )
    prompt = (
        "Please deliver this in a professional, straightforward, friendly, and engaging tone.\n"
        "Ensure every word is articulated clearly. Include occasional pauses for breath.\n"
        "Text to synthesize:\n"
        f"{text}"
    )
    response = await client.aio.models.generate_content(
        model=model,
        contents=prompt,
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
        ),
    )

    candidates = response.candidates or []
    if not candidates:
        raise RuntimeError(
            f"TTS response for {output_path!r} contains no candidates"
        )
    candidate = candidates[0]
    finish_reason = getattr(candidate, "finish_reason", None)

    content = candidate.content
    parts = (content.parts if content is not None else None) or []
    # Collect the audio from every part instead of only parts[0]: if the
    # service ever splits the audio across parts, reading just the first one
    # would silently drop the tail of the clip.
    audio_chunks = [
        part.inline_data.data
        for part in parts
        if part.inline_data is not None and part.inline_data.data
    ]
    if not audio_chunks:
        raise RuntimeError(
            f"TTS response for {output_path!r} contains no audio data "
            f"(finish_reason={finish_reason!r})"
        )

    # Surface the finish reason so a truncated clip can be correlated with it.
    # Presumably a value other than STOP means generation ended early --
    # TODO confirm against the API documentation.
    if finish_reason is not None and getattr(finish_reason, "name", str(finish_reason)) != "STOP":
        print(f"warning path={output_path} finish_reason={finish_reason!r}")

    save_wave(output_path, b"".join(audio_chunks))
async def main() -> None:
    """Read prompts from a file and synthesize one WAV file per prompt.

    Command-line options:
        --prompts-file: Required. UTF-8 text file with one prompt per line;
            blank lines are skipped and surrounding whitespace is stripped.
        --output-dir: Directory that receives ``segment_NNN.wav`` files.
        --model: Model identifier forwarded to ``generate_tts``.
        --voice: Prebuilt voice name forwarded to ``generate_tts``.

    Prompts are processed sequentially, and a status line is printed after
    each file is written.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--prompts-file", required=True)
    cli.add_argument("--output-dir", default="tts_repro_out")
    cli.add_argument("--model", default="gemini-2.0-flash-exp")
    cli.add_argument("--voice", default="Achernar")
    options = cli.parse_args()

    # Keep only non-empty lines, with leading/trailing whitespace removed.
    prompts = []
    with open(options.prompts_file, "r", encoding="utf-8") as prompts_handle:
        for raw_line in prompts_handle:
            cleaned = raw_line.strip()
            if cleaned:
                prompts.append(cleaned)

    for idx, text in enumerate(prompts):
        out_path = os.path.join(options.output_dir, f"segment_{idx:03d}.wav")
        await generate_tts(
            text=text,
            output_path=out_path,
            voice_name=options.voice,
            model=options.model,
        )
        print(f"ok index={idx} path={out_path}")


# Run the async entry point only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
Run the script with:
python script.py \
--prompts-file list_of_prompts.txt \
--output-dir tts_repro_out \
--model gemini-2.5-pro-tts \
--voice Achernar
Could anyone help me? Is this a mistake in my implementation, or is it an issue on Google's side?