mirror of
https://github.com/m1ngsama/robot_arm.git
synced 2026-03-25 19:53:49 +00:00
whisper_main.py: - Remove RobotEar.start_recording() and record_callback() which called the nonexistent sd.start_stream() API (correct API is sd.InputStream). These methods were never called by voice_main.py and contained a broken sounddevice API call that would raise AttributeError (#2). - Remove unused recording_buffer field - Translate Chinese comment/docstring to English (#5) voice_main.py: - Move `import scipy.io.wavfile as wav` from inside get_audio_text() function body to module top-level where all imports belong (#4 related) - Sort imports: stdlib before third-party, local last - Remove Chinese comment, replace with English equivalent
27 lines
907 B
Python
27 lines
907 B
Python
import numpy as np
|
|
import scipy.io.wavfile as wav
|
|
import sounddevice as sd
|
|
from faster_whisper import WhisperModel
|
|
|
|
|
|
class RobotEar:
    """Speech recognition module backed by faster-whisper."""

    def __init__(self, model_size="base", device="cuda", compute_type="float16"):
        """Load a faster-whisper model.

        Args:
            model_size: Whisper model size identifier (e.g. "base", "small").
            device: Inference device. Defaults to "cuda" (previous hard-coded
                behavior); pass "cpu" on machines without a GPU.
            compute_type: Numeric precision forwarded to WhisperModel.
        """
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
        # Sample rate (Hz) the incoming audio frames are expected to use.
        self.fs = 16000

    def get_text(self, audio_data):
        """Transcribe audio frames to text.

        Args:
            audio_data: list of numpy float arrays (samples in [-1, 1])
                captured from a sounddevice InputStream.

        Returns:
            Transcribed string (stripped); "" when no audio was captured.
        """
        import os
        import tempfile

        # np.concatenate raises ValueError on an empty list; treat
        # "no frames captured" as silence instead of crashing.
        if not audio_data:
            return ""

        audio_np = np.concatenate(audio_data, axis=0)

        # Use a unique temporary file instead of a fixed "temp_voice.wav"
        # in the CWD, so concurrent instances don't clobber each other,
        # and remove it afterwards so no artifact is left behind.
        fd, temp_file = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            # Convert float samples in [-1, 1] to 16-bit PCM for the WAV file.
            wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
            segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
            # Join inside the try: faster-whisper yields segments lazily, so
            # the file must still exist while the generator is consumed.
            return "".join(s.text for s in segments).strip()
        finally:
            os.remove(temp_file)