Skip to content Skip to sidebar Skip to footer

Google Streaming Speech Recognition On An Audio Stream Python

I have searched through all of Google's available docs, but I could not find an example of streaming speech recognition on an audio stream in Python. Currently, I am using the Google Cloud Speech API with audio files rather than a live stream.

Solution 1:

Google provides an example of the streaming Python API here.

Rather than opening an audio file to create the stream (as on line 34 of that example), pass the stream directly to the audio sample object (as on line 36).

Solution 2:

This is a working code for the above requirement.

Code:

import asyncio
import websockets
import json
import threading
from six.moves import queue
from google.cloud import speech
from google.cloud.speech import types


IP = '0.0.0.0'
PORT = 8000


class Transcoder(object):
    """
    Converts audio chunks pushed via write() into text using the Google
    Cloud Speech streaming API. A background thread consumes the internal
    buffer and stores each final transcript in ``self.transcript``.
    """

    def __init__(self, encoding, rate, language):
        # Thread-safe FIFO of raw audio chunks fed by the websocket handler.
        self.buff = queue.Queue()
        self.encoding = encoding
        self.language = language
        self.rate = rate
        # While True, stream_generator() yields nothing; the producer flips
        # this to False once audio starts arriving.
        self.closed = True
        # Latest final transcript; the consumer resets it to None after reading.
        self.transcript = None

    def start(self):
        """Start up the streaming speech call on a background thread."""
        # daemon=True so a stuck recognition thread cannot block process exit.
        threading.Thread(target=self.process, daemon=True).start()

    def response_loop(self, responses):
        """
        Pick up the final result of the speech-to-text conversion and
        publish it on self.transcript.
        """
        for response in responses:
            if not response.results:
                continue
            result = response.results[0]
            if not result.alternatives:
                continue
            transcript = result.alternatives[0].transcript
            # Only keep finalized results (interim_results is False anyway).
            if result.is_final:
                self.transcript = transcript

    def process(self):
        """
        Run streaming recognition over the buffered audio and parse results.
        """
        # You can add speech contexts for better recognition.
        cap_speech_context = types.SpeechContext(phrases=["Add your phrases here"])
        client = speech.SpeechClient()
        config = types.RecognitionConfig(
            encoding=self.encoding,
            sample_rate_hertz=self.rate,
            language_code=self.language,
            speech_contexts=[cap_speech_context],
            model='command_and_search',
        )
        streaming_config = types.StreamingRecognitionConfig(
            config=config,
            interim_results=False,
            single_utterance=False,
        )
        audio_generator = self.stream_generator()
        requests = (types.StreamingRecognizeRequest(audio_content=content)
                    for content in audio_generator)

        responses = client.streaming_recognize(streaming_config, requests)
        try:
            self.response_loop(responses)
        except Exception:
            # The streaming API limits request duration (and can drop the
            # connection); restart the stream so recognition continues.
            self.start()

    def stream_generator(self):
        """Yield concatenated audio chunks from the buffer until closed."""
        while not self.closed:
            # Block until at least one chunk is available; None is the
            # end-of-stream sentinel.
            chunk = self.buff.get()
            if chunk is None:
                return
            data = [chunk]
            # Drain whatever else is already queued without blocking, so each
            # request carries as much audio as possible.
            while True:
                try:
                    chunk = self.buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b''.join(data)

    def write(self, data):
        """
        Writes an audio chunk to the buffer.
        """
        self.buff.put(data)


async def audio_processor(websocket, path):
    """
    Collects audio from the websocket stream, writes it to the transcoder's
    buffer, and sends back each final transcript from Google speech-to-text.
    """
    # The first frame must be a JSON text message describing the audio
    # format, e.g. {"format": ..., "rate": ..., "language": ...}.
    config = await websocket.recv()
    if not isinstance(config, str):
        print("ERROR, no config")
        return
    config = json.loads(config)
    transcoder = Transcoder(
        encoding=config["format"],
        rate=config["rate"],
        language=config["language"],
    )
    transcoder.start()
    while True:
        try:
            data = await websocket.recv()
        except websockets.ConnectionClosed:
            print("Connection closed")
            break
        transcoder.write(data)
        # Un-pause the transcoder's stream generator now that audio flows.
        transcoder.closed = False
        if transcoder.transcript:
            print(transcoder.transcript)
            await websocket.send(transcoder.transcript)
            # Reset so the same transcript is not sent twice.
            transcoder.transcript = None

# Bind the websocket handler and run the event loop forever.
start_server = websockets.serve(audio_processor, IP, PORT)
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()

Solution 3:

If you're using a React web app to stream the client's audio, you can refer to this repository for code samples (or just clone it and add your proprietary code): https://github.com/saharmor/realtime-transcription-playground

Post a Comment for "Google Streaming Speech Recognition On An Audio Stream Python"