merge: fix whisper broken start_recording (closes #2)

This commit is contained in:
m1ngsama 2026-02-20 20:45:23 +08:00
commit f631157887
2 changed files with 28 additions and 30 deletions

View file

@@ -1,17 +1,20 @@
import cv2 import json
import numpy as np
import time
import os import os
import re import re
import json import time
import torch
import cv2
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd import sounddevice as sd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO from ultralytics import YOLO
from arm_main import RobotArmUltimate from arm_main import RobotArmUltimate
from whisper_main import RobotEar from whisper_main import RobotEar
# 禁用代理 # Disable proxy for local serial/model communication
os.environ["no_proxy"] = "localhost,127.0.0.1" os.environ["no_proxy"] = "localhost,127.0.0.1"
# ========================================================= # =========================================================
@@ -686,7 +689,6 @@ class RobotApp:
print(f">>> [语音] 音频太长({duration:.1f}s)截断到15秒") print(f">>> [语音] 音频太长({duration:.1f}s)截断到15秒")
audio_trimmed = audio_trimmed[:16000 * 15] audio_trimmed = audio_trimmed[:16000 * 15]
import scipy.io.wavfile as wav
temp_file = "temp_voice.wav" temp_file = "temp_voice.wav"
wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16)) wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16))

View file

@@ -1,31 +1,27 @@
# 文件名: whisper_main.py
import sounddevice as sd
import numpy as np import numpy as np
import scipy.io.wavfile as wav import scipy.io.wavfile as wav
import sounddevice as sd
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
class RobotEar:
    """Speech recognition module backed by faster-whisper."""

    def __init__(self, model_size="base"):
        # NOTE(review): hard-codes CUDA + float16 — assumes a GPU host; confirm
        # this matches the deployment target.
        self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
        # Sample rate (Hz) expected for incoming audio frames.
        self.fs = 16000

    def get_text(self, audio_data):
        """Transcribe captured audio frames to text.

        Args:
            audio_data: list of numpy float arrays (sounddevice InputStream
                chunks), samples nominally in [-1.0, 1.0].

        Returns:
            Transcribed string, stripped of surrounding whitespace.
        """
        temp_file = "temp_voice.wav"
        audio_np = np.concatenate(audio_data, axis=0)
        # Clip before quantizing: samples slightly outside [-1, 1] would
        # otherwise wrap around in int16 and produce loud pops in the WAV.
        audio_np = np.clip(audio_np, -1.0, 1.0)
        wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
        # beam_size=5 trades a little latency for accuracy; language pinned
        # to Chinese to skip auto-detection.
        segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
        return "".join(s.text for s in segments).strip()