How do I use the streaming TTS API in Go? I want to send text and receive audio simultaneously.

I want to use the TTS API to send text and receive audio simultaneously, and the streaming API looks like the right approach. However, I found that I have to call CloseSend() once the text stream is finished; only after calling CloseSend() do I receive the audio stream. If I don't call CloseSend(), I get an error.

So does this mean it cannot actually send and receive simultaneously? It seems I can only send some text on the stream, call CloseSend(), and then receive the audio stream. Can someone tell me the right way to use the TTS API in Go? Thanks a lot.

Here is my log:

Receiving streaming responses…
2025/02/22 16:23:12 call close send
2025/02/22 16:23:12 Sent all requests.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.00800775s
2025/02/22 16:23:13 Received audio data: 1865
2025/02/22 16:23:13 first audio spent: 1.1212305s
2025/02/22 16:23:13 Received audio data: 2113
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.211656625s
2025/02/22 16:23:13 Received audio data: 2139
Audio data received.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.256695417s
2025/02/22 16:23:13 Received audio data: 2139
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.370593125s
2025/02/22 16:23:13 Received audio data: 2193
2025/02/22 16:23:13 first audio spent: 1.419397459s
2025/02/22 16:23:13 Received audio data: 2143
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.511148917s
2025/02/22 16:23:13 Received audio data: 2179
Audio data received.
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.601768667s
2025/02/22 16:23:13 Received audio data: 2191
Audio data received.
2025/02/22 16:23:13 first audio spent: 1.61639975s
2025/02/22 16:23:13 Received audio data: 1188
Streaming completed successfully.
2025/02/22 16:23:13 save audio data to file: google-20250222162312-output.pcm
2025/02/22 16:23:13 total time: 1.664321167s

Here is my Go code (I removed the private_key):

package main

import (
    "context"
    "fmt"
    "io"
    "log"
    "os"
    "time"

    texttospeech "cloud.google.com/go/texttospeech/apiv1"
    "cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
    "google.golang.org/api/option"
)

// main runs the streaming text-to-speech demo and terminates the process
// with a logged message if the demo reports an error.
func main() {
	err := GoogleTts()
	if err == nil {
		return
	}
	log.Fatalf("GoogleTts err: %v", err)
}

// GoogleTts synthesizes a fixed three-part English sentence through the Cloud
// Text-to-Speech StreamingSynthesize call and appends every received audio
// chunk to a timestamped file in the working directory.
//
// Requests are written by a separate goroutine while this function reads
// responses, so sending and receiving overlap on the same stream. The first
// request carries the streaming configuration and the following ones carry
// text. This sample queues all text immediately, so CloseSend is called
// almost as soon as the stream opens; in a real application CloseSend should
// be called only once no more text will follow.
//
// It returns the first error hit while creating the client, opening the
// stream, sending, receiving, or writing the file, and nil on success.
func GoogleTts() error {
	start := time.Now()

	// Cancelling on return tears the stream down, so the sender goroutine
	// cannot stay blocked if this function exits early.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// NOTE(review): service-account credentials are embedded in source.
	// Prefer loading them from a file or the environment so the key never
	// lands in version control or in a public post.
	credentialsJSON := `
{
  "type": "service_account",
  "project_id": "teamsun-workspace",
  "private_key_id": "cc43756f5a0e46c30f34e53f8ed1343a2a92a34a",
  "private_key": "xxxxxxxx",
  "client_email": "teamsun-tts-test@teamsun-workspace.iam.gserviceaccount.com",
  "client_id": "105978219795698433242",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/teamsun-tts-test%40teamsun-workspace.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}
`

	client, err := texttospeech.NewClient(ctx, option.WithCredentialsJSON([]byte(credentialsJSON)))
	if err != nil {
		return fmt.Errorf("creating text-to-speech client: %w", err)
	}
	defer client.Close()

	// Open a streaming connection.
	stream, err := client.StreamingSynthesize(ctx)
	if err != nil {
		return fmt.Errorf("starting streaming synthesis: %w", err)
	}

	// The first request configures the stream; the rest carry text chunks.
	requests := []*texttospeechpb.StreamingSynthesizeRequest{
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_StreamingConfig{
				StreamingConfig: &texttospeechpb.StreamingSynthesizeConfig{
					Voice: &texttospeechpb.VoiceSelectionParams{
						LanguageCode: "en-US",
						Name:         "en-US-Chirp-HD-F",
						SsmlGender:   texttospeechpb.SsmlVoiceGender_FEMALE,
					},
					StreamingAudioConfig: &texttospeechpb.StreamingAudioConfig{
						AudioEncoding:   texttospeechpb.AudioEncoding_OGG_OPUS,
						SampleRateHertz: 8000,
					},
				},
			},
		},
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_Input{
				Input: &texttospeechpb.StreamingSynthesisInput{
					InputSource: &texttospeechpb.StreamingSynthesisInput_Text{
						Text: "hello, ",
					},
				},
			},
		},
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_Input{
				Input: &texttospeechpb.StreamingSynthesisInput{
					InputSource: &texttospeechpb.StreamingSynthesisInput_Text{
						Text: "good to see you, ",
					},
				},
			},
		},
		{
			StreamingRequest: &texttospeechpb.StreamingSynthesizeRequest_Input{
				Input: &texttospeechpb.StreamingSynthesisInput{
					InputSource: &texttospeechpb.StreamingSynthesisInput_Text{
						Text: "and welcome to the world of AI.",
					},
				},
			},
		},
	}

	// send writes every request and then half-closes the stream.
	send := func() error {
		for _, req := range requests {
			if err := stream.Send(req); err != nil {
				return fmt.Errorf("sending request: %w", err)
			}
		}

		log.Printf("call close send")
		if err := stream.CloseSend(); err != nil {
			return fmt.Errorf("closing send side of stream: %w", err)
		}

		log.Printf("Sent all requests.")
		return nil
	}

	// Buffered so the sender never blocks on reporting its result, even if
	// the receive loop has already returned.
	sendErr := make(chan error, 1)
	go func() {
		err := send()
		sendErr <- err
		if err != nil {
			// Unblock the receive loop; it picks the root cause up from sendErr.
			cancel()
		}
	}()

	// Receive and handle streaming responses.
	fmt.Println("Receiving streaming responses...")
	// NOTE(review): the stream is configured for OGG_OPUS above, so the
	// ".pcm" extension does not describe the file contents; consider renaming.
	filename := "google-" + time.Now().Format("20060102150405") + "-output.pcm"
	firstAudio := true
	for {
		resp, err := stream.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			// A failed send cancels the context, which surfaces here as a
			// receive error; report the send failure instead when there is one.
			select {
			case serr := <-sendErr:
				if serr != nil {
					return serr
				}
			default:
			}
			return fmt.Errorf("receiving stream: %w", err)
		}

		if len(resp.AudioContent) == 0 {
			continue
		}

		// Time-to-first-audio is only meaningful for the first audio chunk.
		if firstAudio {
			log.Printf("first audio spent: %v", time.Since(start))
			firstAudio = false
		}

		fmt.Println("Audio data received.")
		log.Printf("Received audio data: %v", len(resp.AudioContent))
		if err := appendToFile(filename, resp.AudioContent); err != nil {
			return fmt.Errorf("writing audio data to %q: %w", filename, err)
		}
	}

	// Wait for the sender so its error is not lost and no goroutine is left
	// running after return.
	if err := <-sendErr; err != nil {
		return err
	}

	fmt.Println("Streaming completed successfully.")
	log.Printf("save audio data to file: %v", filename)
	log.Printf("total time: %v", time.Since(start))

	return nil
}

// appendToFile appends data to the file at filename, creating the file with
// mode 0644 if it does not exist.
//
// It returns the error from opening or writing the file. If both succeed, it
// returns the error from closing it, so a failure reported at close time is
// not silently treated as success.
func appendToFile(filename string, data []byte) (err error) {
	f, err := os.OpenFile(filename, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the earlier write error if there is one; otherwise surface Close's.
		if cerr := f.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	_, err = f.Write(data)
	return err
}

"You’re correct — most TTS APIs don’t support fully bi-directional streaming where you send text and get audio back at the exact same time. In Golang, the common flow is:

  1. Stream or chunk your text input.

  2. When the text input is finished, call CloseSend().

  3. The server will then finalize and start streaming the audio response back.

So the behavior you’re seeing is expected — it’s more of a half-duplex stream (send → close → receive). If you want near-real-time output, the workaround is to send the text in smaller chunks, closing the send side after each one, so that audio starts sooner. But fully simultaneous (true duplex) streaming isn’t usually supported in standard TTS APIs."