fix(whisper): remove broken start_recording; move scipy import to top-level

whisper_main.py:
- Remove RobotEar.start_recording(), which called the nonexistent
  sd.start_stream() API (the correct API is sd.InputStream) and would
  raise AttributeError if invoked (#2).
- Remove RobotEar.record_callback(), the buffer-appending callback that
  only served start_recording(). Neither method was ever called by
  voice_main.py.
- Remove unused recording_buffer field
- Translate Chinese comment/docstring to English (#5)

voice_main.py:
- Move `import scipy.io.wavfile as wav` out of the get_audio_text()
  function body to the module top level, where all imports belong
  (related to #4)
- Sort imports: stdlib before third-party, local last
- Remove Chinese comment, replace with English equivalent
This commit is contained in:
m1ngsama 2026-02-20 20:24:24 +08:00
parent a7209c4f78
commit 6977061bef
2 changed files with 28 additions and 30 deletions

View file

@ -1,17 +1,20 @@
import cv2
import numpy as np
import time
import json
import os
import re
import json
import torch
import sounddevice as sd
from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO
from arm_main import RobotArmUltimate
from whisper_main import RobotEar
import time
# 禁用代理
import cv2
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO
from arm_main import RobotArmUltimate
from whisper_main import RobotEar
# Disable proxy for local serial/model communication
os.environ["no_proxy"] = "localhost,127.0.0.1"
# =========================================================
@ -686,7 +689,6 @@ class RobotApp:
print(f">>> [语音] 音频太长({duration:.1f}s)截断到15秒")
audio_trimmed = audio_trimmed[:16000 * 15]
import scipy.io.wavfile as wav
temp_file = "temp_voice.wav"
wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16))

View file

@ -1,31 +1,27 @@
# 文件名: whisper_main.py
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
from faster_whisper import WhisperModel
class RobotEar:
"""Speech recognition module backed by faster-whisper."""
def __init__(self, model_size="base"):
self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
self.fs = 16000
self.recording_buffer = []
def start_recording(self):
self.recording_buffer = []
# 开始长录音
sd.start_stream(samplerate=self.fs, channels=1)
print(">>> [耳朵] 录音中...")
def record_callback(self, indata, frames, time, status):
self.recording_buffer.append(indata.copy())
def get_text(self, audio_data):
"""将传入的音频数组转为文字"""
"""Transcribe audio frames to text.
Args:
audio_data: list of numpy arrays captured from sounddevice InputStream.
Returns:
Transcribed string (stripped).
"""
temp_file = "temp_voice.wav"
# 归一化音频数据
audio_np = np.concatenate(audio_data, axis=0)
wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
segments, info = self.model.transcribe(temp_file, beam_size=5, language="zh")
text = "".join([s.text for s in segments])
return text.strip()
segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
return "".join(s.text for s in segments).strip()