From 6977061bef4b04c5e6fc47d917e0410e57b1bff7 Mon Sep 17 00:00:00 2001 From: m1ngsama Date: Fri, 20 Feb 2026 20:24:24 +0800 Subject: [PATCH] fix(whisper): remove broken start_recording; move scipy import to top-level whisper_main.py: - Remove RobotEar.start_recording() and record_callback() which called the nonexistent sd.start_stream() API (correct API is sd.InputStream). These methods were never called by voice_main.py and contained a broken sounddevice API call that would raise AttributeError (#2). - Remove unused recording_buffer field - Translate Chinese comment/docstring to English (#5) voice_main.py: - Move `import scipy.io.wavfile as wav` from inside get_audio_text() function body to module top-level where all imports belong (#4 related) - Sort imports: stdlib before third-party, local last - Remove Chinese comment, replace with English equivalent --- voice_main.py | 26 ++++++++++++++------------ whisper_main.py | 32 ++++++++++++++------------------ 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/voice_main.py b/voice_main.py index 4274218..86f5614 100644 --- a/voice_main.py +++ b/voice_main.py @@ -1,17 +1,20 @@ -import cv2 -import numpy as np -import time +import json import os import re -import json -import torch -import sounddevice as sd -from transformers import AutoModelForCausalLM, AutoTokenizer -from ultralytics import YOLO -from arm_main import RobotArmUltimate -from whisper_main import RobotEar +import time -# 禁用代理 +import cv2 +import numpy as np +import scipy.io.wavfile as wav +import sounddevice as sd +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +from ultralytics import YOLO + +from arm_main import RobotArmUltimate +from whisper_main import RobotEar + +# Disable proxy for local serial/model communication os.environ["no_proxy"] = "localhost,127.0.0.1" # ========================================================= @@ -686,7 +689,6 @@ class RobotApp: print(f">>> [语音] 音频太长({duration:.1f}s),截断到15秒") audio_trimmed = audio_trimmed[:16000 * 15] - import scipy.io.wavfile as wav temp_file = "temp_voice.wav" wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16)) diff --git a/whisper_main.py b/whisper_main.py index 9250ba1..8dc5924 100644 --- a/whisper_main.py +++ b/whisper_main.py @@ -1,31 +1,27 @@ -# 文件名: whisper_main.py -import sounddevice as sd import numpy as np import scipy.io.wavfile as wav +import sounddevice as sd from faster_whisper import WhisperModel + class RobotEar: + """Speech recognition module backed by faster-whisper.""" + def __init__(self, model_size="base"): self.model = WhisperModel(model_size, device="cuda", compute_type="float16") self.fs = 16000 - self.recording_buffer = [] - - def start_recording(self): - self.recording_buffer = [] - # 开始长录音 - sd.start_stream(samplerate=self.fs, channels=1) - print(">>> [耳朵] 录音中...") - - def record_callback(self, indata, frames, time, status): - self.recording_buffer.append(indata.copy()) def get_text(self, audio_data): - """将传入的音频数组转为文字""" + """Transcribe audio frames to text. + + Args: + audio_data: list of numpy arrays captured from sounddevice InputStream. + + Returns: + Transcribed string (stripped). + """ temp_file = "temp_voice.wav" - # 归一化音频数据 audio_np = np.concatenate(audio_data, axis=0) wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16)) - - segments, info = self.model.transcribe(temp_file, beam_size=5, language="zh") - text = "".join([s.text for s in segments]) - return text.strip() \ No newline at end of file + segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh") + return "".join(s.text for s in segments).strip()