I tried to follow your steps, but I cannot make it work.
For context, my codebase is acting as a middleware integrating an external app that streams voice conversations via websockets to Gemini (thanks to my middleware), so Gemini multimodal live API can act as a real-time voice assistant.
See some methods on my codebase:
async def connect(self, instructions: Optional[str] = None,
                  voice: Optional[str] = None,
                  temperature: Optional[float] = None,
                  model: Optional[str] = None,
                  max_output_tokens: Optional[int] = None,
                  agent_name: Optional[str] = None,
                  company_name: Optional[str] = None) -> None:
    """
    Establish a connection to the Gemini Multimodal Live API using API key
    authentication.

    Stores the per-session configuration on ``self``, builds the final system
    prompt, creates the ``genai`` client, opens a live session, and starts the
    background task that reads streamed responses.

    Args:
        instructions: Admin-supplied system instructions (may be None).
        voice: Prebuilt voice name; falls back to "Puck" when empty/blank.
        temperature: Optional sampling temperature override.
        model: Optional model name override.
        max_output_tokens: Optional output token limit override.
        agent_name: Optional agent display name used in the system prompt.
        company_name: Optional company name used in the system prompt.

    Raises:
        RuntimeError: If any step of the connection fails (original exception
            is chained as the cause).
    """
    try:
        # Store per-call configuration; overrides only apply when provided.
        self.admin_instructions = instructions
        self.voice = voice if voice and voice.strip() else "Puck"
        self.agent_name = agent_name
        self.company_name = company_name
        if temperature is not None:
            self.temperature = temperature
        if model is not None:
            self.model = model
        if max_output_tokens is not None:
            self.max_output_tokens = max_output_tokens

        # Build the final system prompt from admin instructions + context.
        self.final_instructions = create_final_system_prompt(
            self.admin_instructions,
            self.language,
            self.customer_data,
            self.agent_name,
            self.company_name
        )

        # Initialize Gemini client with API key from the environment.
        api_key = os.getenv('GOOGLE_API_KEY')
        if not api_key:
            raise ValueError("GOOGLE_API_KEY environment variable not set")
        self.logger.info("Initializing Gemini client with API key...")
        self.client = genai.Client(
            api_key=api_key,
            http_options={'api_version': 'v1alpha'}
        )

        # Prepare session configuration.
        # NOTE(review): the Live API effectively supports a single response
        # modality per session; requesting both "TEXT" and "AUDIO" typically
        # results in audio-only model turns (no chunk.text) — confirm against
        # the current Multimodal Live API docs. This is the likely reason a
        # later text-only request in the same session yields no text.
        generation_config = {
            "responseModalities": ["TEXT", "AUDIO"],
            "candidateCount": 1,
            "maxOutputTokens": self.max_output_tokens,
            "temperature": self.temperature,
            "speechConfig": {
                "voiceConfig": {
                    "prebuiltVoiceConfig": {
                        "voiceName": self.voice
                    }
                }
            },
            "safetySettings": [
                {
                    "category": setting["category"],
                    "threshold": setting["threshold"]
                }
                for setting in GEMINI_SAFETY_SETTINGS
            ]
        }
        setup_message = {
            "model": self.model,
            "generation_config": generation_config,
            "system_instruction": {
                "parts": [{"text": self.final_instructions}]
            }
        }
        self.logger.debug(f"Sending setup message: {json.dumps(setup_message, indent=2)}")

        # Connect to the Gemini Live API. The async context manager is kept
        # on self so close() can call __aexit__ later.
        self.logger.info("Creating Gemini session...")
        self._session_manager = self.client.aio.live.connect(
            model=self.model,
            config=setup_message
        )
        self.session = await self._session_manager.__aenter__()
        self.running = True
        # Background task that pumps audio/text/tool-call responses.
        self.read_task = asyncio.create_task(self._read_responses())
        self.logger.info("Successfully connected to Gemini API")
    except Exception as e:
        self.logger.error(f"Failed to connect to Gemini: {str(e)}")
        self.logger.debug("Full connection error details:", exc_info=True)
        # Chain the original exception so the root cause is preserved.
        raise RuntimeError(f"Failed to connect to Gemini: {str(e)}") from e
async def _read_responses(self) -> None:
    """Reads responses (text/audio/function calls) from Gemini.

    Runs as a long-lived background task: loops over turns from the live
    session and dispatches each response chunk to the registered callbacks.
    A chunk may carry any combination of tool call, audio bytes, and text,
    so every branch below is checked independently (not elif).

    Side effects:
        - Invokes on_audio_ready_callback with raw audio bytes, and with
          None as an end-of-turn sentinel when turn_complete is received.
        - Invokes on_speech_started_callback when text arrives.
        - Updates self.last_response with the latest text.
        - Always clears self.running on exit (finally).
    """
    try:
        while self.running:
            # One turn = one generator of streamed response chunks.
            turn = self.session.receive()
            async for response in turn:
                if not self.running:
                    break
                # Only handle function calls if enabled
                if ENABLE_FUNCTIONS and hasattr(response, "tool_call") and response.tool_call:
                    await self._handle_tool_call(response.tool_call)
                # Handle audio data
                if response.data:
                    self.logger.debug(f"Received audio data: {len(response.data)} bytes")
                    if self.on_audio_ready_callback:
                        await self.on_audio_ready_callback(response.data)
                # Handle text responses
                if response.text:
                    self.logger.info(f"Received text response: {response.text}")
                    self.last_response = {
                        "text": response.text,
                        "usage": {}
                    }
                    if self.on_speech_started_callback:
                        await self.on_speech_started_callback()
                # hasattr on a None server_content is simply False, so this
                # is safe when the chunk carries no server content.
                if hasattr(response, 'server_content') and hasattr(response.server_content, 'turn_complete'):
                    if response.server_content.turn_complete:
                        self.logger.debug("Turn complete signal received")
                        if self.on_audio_ready_callback:
                            # None acts as the end-of-turn sentinel for the
                            # audio consumer — do not drop this call.
                            await self.on_audio_ready_callback(None)
    except asyncio.CancelledError:
        self.logger.info("Response reading task cancelled")
        raise
    except Exception as e:
        self.logger.error(f"Error reading responses: {e}", exc_info=True)
        raise
    finally:
        # Ensure other coroutines observe that the reader has stopped.
        self.running = False
This is the method for generating the final summary:
async def generate_session_summary(self) -> Optional[Dict[str, Any]]:
    """
    Sends a text-based summary request to Gemini in the SAME live session.

    Cancels the background reader first (so this coroutine is the only
    consumer of the session), sends the ending prompt, then collects text
    from the response turn.

    Returns:
        {"summary": <text>} on success, or None on timeout / no text / error.

    NOTE(review): a live session configured for AUDIO output generally
    answers with audio-only model turns, so no text may ever arrive here —
    that matches the observed logs (turn_complete with empty text). The
    robust fix is to request the summary through a separate non-live
    ``client.models.generate_content`` call with the conversation
    transcript, or to run the live session with a single TEXT modality.
    Confirm against the current Multimodal Live API docs.
    """
    from config import ENDING_PROMPT, DEBUG
    try:
        if DEBUG == 'true':
            self.logger.debug("=== Starting Summary Generation ===")
        # Cancel the background reader so it does not steal our response.
        if self.read_task:
            if DEBUG == 'true':
                self.logger.debug("Cancelling existing read task...")
            self.read_task.cancel()
            try:
                await self.read_task
                if DEBUG == 'true':
                    self.logger.debug("Read task cancelled successfully")
            except asyncio.CancelledError:
                if DEBUG == 'true':
                    self.logger.debug("Read task cancellation handled")
            self.read_task = None
            # Give the event loop a beat to settle the cancellation.
            await asyncio.sleep(0.1)

        if DEBUG == 'true':
            self.logger.debug("Making text request in existing session...")
        summary_prompt = ENDING_PROMPT
        if DEBUG == 'true':
            self.logger.debug(f"Sending summary request with text: {summary_prompt}")
        # Send the prompt as a complete user turn.
        await self.session.send(summary_prompt, end_of_turn=True)

        # Collect text from the response turn.
        full_response = ""
        try:
            async with asyncio.timeout(30):
                turn = self.session.receive()
                async for chunk in turn:
                    if DEBUG == 'true':
                        self.logger.debug(f"Received chunk: {chunk}")
                    if chunk.text:
                        if DEBUG == 'true':
                            self.logger.debug(f"Got text: {chunk.text}")
                        full_response += chunk.text
                    else:
                        # Fallback: some SDK versions expose text only via
                        # the model_turn parts rather than chunk.text.
                        server_content = getattr(chunk, 'server_content', None)
                        model_turn = getattr(server_content, 'model_turn', None)
                        if model_turn is not None:
                            for part in (model_turn.parts or []):
                                part_text = getattr(part, 'text', None)
                                if part_text:
                                    full_response += part_text
                    if (hasattr(chunk, 'server_content') and
                        hasattr(chunk.server_content, 'turn_complete') and
                        chunk.server_content.turn_complete):
                        if DEBUG == 'true':
                            self.logger.debug("Turn complete")
                        # The model has finished its turn: stop reading even
                        # if no text arrived, instead of waiting for the
                        # 30 s timeout.
                        break
        except asyncio.TimeoutError:
            self.logger.error("Timeout waiting for summary")
            return None

        if DEBUG == 'true':
            self.logger.debug(f"Final response: {full_response}")
        if full_response:
            return {"summary": full_response}
        self.logger.warning("No summary text received")
        return None
    except Exception as e:
        self.logger.error(f"Error generating summary: {e}", exc_info=True)
        if DEBUG == 'true':
            self.logger.debug("Full error details:", exc_info=True)
        return None
    finally:
        if DEBUG == 'true':
            self.logger.debug("=== Summary Generation Complete ===")
During the voice conversation I correctly get the audio responses back from Gemini. However, when the voice call ends, the external app sends me a "close" message, and at that moment I want to generate the text summary by invoking `generate_session_summary`.
Excerpt from my server logs when the voice conversation ends and I try to get the summary:
2024-12-26 10:46:52.575 [INFO] ExternalAppGeminiBridge.ExternalAppServer-81fb-4294-903d-05ad43319bc5: Received 'close' from ExternalApp. Reason: end
2024-12-26 10:46:52.576 [INFO] ExternalAppGeminiBridge.ExternalAppServer-81fb-4294-903d-05ad43319bc5: Audio processing task cancelled
2024-12-26 10:46:52.576 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: === Starting Summary Generation ===
2024-12-26 10:46:52.577 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Cancelling existing read task...
2024-12-26 10:46:52.577 [INFO] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Response reading task cancelled
2024-12-26 10:46:52.577 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Read task cancellation handled
2024-12-26 10:46:52.682 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Making text request in existing session...
2024-12-26 10:46:52.682 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Sending summary request with text: Here is a summary request for the voice conversation we just had:
Please analyze this conversation and provide a structured summary including:
{
"main_topics": [],
"key_decisions": [],
"action_items": [],
"sentiment": ""
}
Please analyze our conversation and provide the summary in the requested format.
2024-12-26 10:46:52.802 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Received chunk: setup_complete=None server_content=LiveServerContent(model_turn=None, turn_complete=None, interrupted=None) tool_call=None tool_call_cancellation=None
2024-12-26 10:46:52.803 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Received chunk: setup_complete=None server_content=LiveServerContent(model_turn=None, turn_complete=True, interrupted=None) tool_call=None tool_call_cancellation=None
2024-12-26 10:46:52.803 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Turn complete
2024-12-26 10:46:52.804 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Final response:
2024-12-26 10:46:52.804 [WARNING] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: No summary text received
2024-12-26 10:46:52.804 [DEBUG] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: === Summary Generation Complete ===
2024-12-26 10:46:52.804 [INFO] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Closing Gemini connection...
2024-12-26 10:46:52.915 [INFO] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Gemini session reference cleared
2024-12-26 10:46:52.915 [INFO] ExternalAppGeminiBridge.GeminiClient_30e03fcf-6cb7-48a4-aebc-b63fda305922: Connection closed after 15.34s
2024-12-26 10:46:52.915 [DEBUG] ExternalAppGeminiBridge.ExternalAppServer-81fb-4294-903d-05ad43319bc5: Sending message to ExternalApp:
{
"version": "2",
"type": "disconnect",
"seq": 6,
"clientseq": 15,
"id": "30e03fcf-6cb7-48a4-aebc-b63fda305922",
"parameters": {
"reason": "completed",
"outputVariables": {
"CONVERSATION_SUMMARY": "",
"CONVERSATION_DURATION": "15.495136260986328"
}
}
}
2024-12-26 10:46:52.916 [INFO] ExternalAppGeminiBridge.ExternalAppServer-81fb-4294-903d-05ad43319bc5: Session stats - Duration: 15.50s, Frames sent: 35, Frames received: 75
2024-12-26 10:46:52.916 [INFO] ExternalAppGeminiBridge: [WS-0a011fd2] Connection handler finished
So as you can see, I can’t get any conversation summary.
Thanks in advance!