merge: fix whisper broken start_recording (closes #2)

This commit is contained in:
m1ngsama 2026-02-20 20:45:23 +08:00
commit f631157887
2 changed files with 28 additions and 30 deletions

View file

@@ -1,17 +1,20 @@
import cv2
import numpy as np
import time
import json
import os
import re
import json
import torch
import sounddevice as sd
from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO
from arm_main import RobotArmUltimate
from whisper_main import RobotEar
import time
# Disable proxy
import cv2
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO
from arm_main import RobotArmUltimate
from whisper_main import RobotEar
# Disable proxy for local serial/model communication
os.environ["no_proxy"] = "localhost,127.0.0.1"
# =========================================================
@@ -686,7 +689,6 @@ class RobotApp:
print(f">>> [语音] 音频太长({duration:.1f}s)截断到15秒")
audio_trimmed = audio_trimmed[:16000 * 15]
import scipy.io.wavfile as wav
temp_file = "temp_voice.wav"
wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16))

View file

@@ -1,31 +1,27 @@
# Filename: whisper_main.py
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
from faster_whisper import WhisperModel
class RobotEar:
    """Speech recognition module backed by faster-whisper.

    Captures microphone audio via a sounddevice InputStream into an
    in-memory buffer, then transcribes the accumulated frames with a
    faster-whisper model (Chinese, beam search).
    """

    def __init__(self, model_size="base"):
        # float16 on CUDA keeps inference latency low; requires a GPU at runtime.
        self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
        self.fs = 16000  # Whisper models expect 16 kHz mono audio
        self.recording_buffer = []
        # Created by start_recording(); kept so the stream can be stopped/closed later.
        self.stream = None

    def start_recording(self):
        """Start capturing microphone audio into recording_buffer.

        Fix for broken recording: sounddevice has no module-level
        ``start_stream()`` function. Open-ended recording must go through an
        ``InputStream`` with a callback; the original code also never
        registered ``record_callback``, so no frames were ever captured.
        """
        self.recording_buffer = []
        self.stream = sd.InputStream(
            samplerate=self.fs,
            channels=1,
            callback=self.record_callback,
        )
        self.stream.start()
        print(">>> [耳朵] 录音中...")

    def record_callback(self, indata, frames, time, status):
        """sounddevice callback: append each incoming audio block to the buffer.

        Runs on the audio thread. ``indata`` is reused by sounddevice between
        calls, so a copy must be stored, not the array itself.
        """
        self.recording_buffer.append(indata.copy())

    def get_text(self, audio_data):
        """Transcribe audio frames to text.

        Args:
            audio_data: list of numpy arrays captured from sounddevice InputStream.

        Returns:
            Transcribed string (stripped).
        """
        temp_file = "temp_voice.wav"
        audio_np = np.concatenate(audio_data, axis=0)
        # Scale float samples (assumed in [-1, 1]) to int16 PCM for the WAV file.
        wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
        segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
        return "".join(s.text for s in segments).strip()