I am trying to implement Google Cloud's Speech-to-Text API in a web application, so that users can speak into a microphone and see what they say transcribed in real time. I am using React on the frontend and Express on the backend. I capture and stream the user's audio with the microphone-stream npm package and send it over a WebSocket with the @httptoolkit/websocket-stream fork of websocket-stream. Here is my frontend source code:
import { useRef, useState } from "react";
import MicrophoneStream from "microphone-stream";
import webSocketStream from "@httptoolkit/websocket-stream";

function Test() {
  const [isRecording, setIsRecording] = useState(false);
  const mediaStream = useRef(null);
  const micStream = useRef(null);
  const webSocket = useRef(null);

  const listen = async () => {
    // toggle off if we are already recording
    if (isRecording) {
      micStream.current.stop();
      setIsRecording(false);
      return;
    }

    const sampleRate = 16000;

    // get media stream
    mediaStream.current = await navigator.mediaDevices.getUserMedia({
      audio: {
        deviceId: "default",
        sampleRate: sampleRate,
        sampleSize: 16,
        channelCount: 1,
      },
      video: false,
    });
    setIsRecording(true);

    // wrap the media stream in a readable audio stream
    micStream.current = new MicrophoneStream();
    micStream.current.setStream(mediaStream.current);
    micStream.current.on("data", (chunk) => {
      console.log("data received from mic stream");
    });
    micStream.current.on("error", (error) => {
      console.error(error);
    });
    micStream.current.on("close", () => {
      console.log("mic stream closed");
      mediaStream.current.getAudioTracks()[0].stop();
      setIsRecording(false);
    });

    // open a duplex stream over the WebSocket and pipe the mic into it
    webSocket.current = webSocketStream("ws://localhost:8000/ws/stt", {
      perMessageDeflate: false,
    });
    webSocket.current.on("data", (data) => {
      console.log("Data received:", data);
    });
    webSocket.current.on("error", (error) => {
      console.error(error);
    });
    webSocket.current.on("close", () => {
      console.log("web socket stream closed");
    });
    micStream.current.pipe(webSocket.current);

    // stop automatically after 3 seconds while testing
    setTimeout(() => {
      // micStream.current.unpipe(webSocket.current);
      micStream.current.stop();
    }, 3000);
  };

  // render trimmed to the essentials
  return <button onClick={listen}>{isRecording ? "Stop" : "Start"}</button>;
}

export default Test;
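In case the wire format matters: my understanding from the microphone-stream docs is that, outside object mode, the stream emits Node-style Buffers of raw 32-bit float samples taken from the Web Audio API, and MicrophoneStream.toRaw(chunk) reinterprets a chunk as a Float32Array. This is a minimal sketch of how I inspect the chunks (the logging is mine, not from the package):

// sketch: inspect what the mic stream actually emits
micStream.current.on("data", (chunk) => {
  const raw = MicrophoneStream.toRaw(chunk); // Buffer -> Float32Array
  console.log("chunk bytes:", chunk.length, "samples:", raw.length);
  console.log("first samples:", raw.slice(0, 4)); // float values in [-1, 1]
});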
And here is how I handle it on the backend:
import express from "express";
import { SpeechClient } from "@google-cloud/speech";
import websocketStream from "@httptoolkit/websocket-stream";

// router.ws() is available because express-ws is applied to the app
// in the server entry point (shown further down)
const router = express.Router();
const sttClient = new SpeechClient();

router.ws("/ws/stt", (ws, req) => {
  console.log("Client connected");

  // open a streaming recognition request to Google Cloud STT
  const recognizeStream = sttClient
    .streamingRecognize({
      config: {
        encoding: "LINEAR16",
        sampleRateHertz: 16000,
        languageCode: "en-GB",
        enableAutomaticPunctuation: true,
      },
      interimResults: true,
    })
    .on("error", (error) => {
      console.error("Error:", error);
    })
    .on("data", (data) => {
      console.log("Received data:", data);
      // guard against responses without results/alternatives
      const transcript =
        data.results[0] && data.results[0].alternatives[0]
          ? data.results[0].alternatives[0].transcript
          : "";
      console.log("transcript:", transcript);
      ws.send(transcript);
    });

  // wrap the raw WebSocket in a duplex stream and pipe incoming
  // audio straight into the recognize stream
  const wss = websocketStream(ws, { perMessageDeflate: false });
  wss.pipe(recognizeStream);

  ws.on("close", () => {
    console.log("Client disconnected");
    wss.end();
  });
  ws.on("message", (message) => {
    console.log("Received message:", message);
  });
});

export default router;
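For completeness, router.ws comes from express-ws, which I apply in the server entry point. This is a trimmed sketch of that file; "./stt.js" stands for whatever module holds the router above, and the port matches the ws://localhost:8000 URL on the frontend:

import express from "express";
import expressWs from "express-ws";

const app = express();
expressWs(app); // adds .ws() support to the app and to express.Router

// import the router only after express-ws has patched Router
const { default: router } = await import("./stt.js");

app.use("/", router);
app.listen(8000, () => console.log("Listening on port 8000"));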
The data appears to be sent through correctly, but I am getting very unexpected results: the transcript keeps reading "play" or "play radio", even when I say nothing at all. Here is an example response:
Received data: {
  results: [
    {
      alternatives: [Array],
      isFinal: true,
      stability: 0,
      resultEndTime: [Object],
      channelTag: 0,
      languageCode: 'en-gb'
    }
  ],
  error: null,
  speechEventType: 'SPEECH_EVENT_UNSPECIFIED',
  totalBilledTime: { seconds: '18', nanos: 0 },
  speechAdaptationInfo: null,
  requestId: '4181042299479530578'
}
transcript: Play radio.
Am I approaching this correctly? Any help or advice would be greatly appreciated.