mirror of
https://github.com/m1ngsama/robot_arm.git
synced 2026-03-25 19:53:49 +00:00
whisper_main.py: - Remove RobotEar.start_recording() and record_callback() which called the nonexistent sd.start_stream() API (correct API is sd.InputStream). These methods were never called by voice_main.py and contained a broken sounddevice API call that would raise AttributeError (#2). - Remove unused recording_buffer field - Translate Chinese comment/docstring to English (#5) voice_main.py: - Move `import scipy.io.wavfile as wav` from inside get_audio_text() function body to module top-level where all imports belong (#4 related) - Sort imports: stdlib before third-party, local last - Remove Chinese comment, replace with English equivalent
27 lines
907 B
Python
27 lines
907 B
Python
import numpy as np
|
|
import scipy.io.wavfile as wav
|
|
import sounddevice as sd
|
|
from faster_whisper import WhisperModel
|
|
|
|
|
|
class RobotEar:
    """Speech recognition module backed by faster-whisper."""

    def __init__(self, model_size="base", device="cuda", compute_type="float16"):
        """Load a faster-whisper model.

        Args:
            model_size: Whisper model size identifier (e.g. "base", "small").
            device: Inference device. Defaults to "cuda" (previous hard-coded
                behavior); pass "cpu" on machines without a GPU.
            compute_type: Numeric precision forwarded to WhisperModel.
        """
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
        # Sample rate (Hz) the incoming audio frames are expected to use.
        self.fs = 16000

    def get_text(self, audio_data):
        """Transcribe audio frames to text.

        Args:
            audio_data: list of numpy float arrays (samples in [-1, 1])
                captured from a sounddevice InputStream.

        Returns:
            Transcribed string (stripped); "" when no audio was captured.
        """
        import os
        import tempfile

        # np.concatenate raises ValueError on an empty list; treat
        # "no frames captured" as silence instead of crashing.
        if not audio_data:
            return ""

        audio_np = np.concatenate(audio_data, axis=0)

        # Use a unique temporary file instead of a fixed "temp_voice.wav"
        # in the CWD, so concurrent instances don't clobber each other,
        # and remove it afterwards so no artifact is left behind.
        fd, temp_file = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            # Convert float samples in [-1, 1] to 16-bit PCM for the WAV file.
            wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
            segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
            # Join inside the try: faster-whisper yields segments lazily, so
            # the file must still exist while the generator is consumed.
            return "".join(s.text for s in segments).strip()
        finally:
            os.remove(temp_file)