merge: fix whisper broken start_recording (closes #2)

This commit is contained in:
m1ngsama 2026-02-20 20:45:23 +08:00
commit f631157887
2 changed files with 28 additions and 30 deletions

View file

@@ -1,17 +1,20 @@
import cv2
import numpy as np
import time
import json
import os
import re
import json
import torch
import sounddevice as sd
from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO
from arm_main import RobotArmUltimate
from whisper_main import RobotEar
import time
# Disable proxy
import cv2
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO
from arm_main import RobotArmUltimate
from whisper_main import RobotEar
# Disable proxy for local serial/model communication
os.environ["no_proxy"] = "localhost,127.0.0.1"
# =========================================================
@@ -686,7 +689,6 @@ class RobotApp:
print(f">>> [语音] 音频太长({duration:.1f}s)截断到15秒")
audio_trimmed = audio_trimmed[:16000 * 15]
import scipy.io.wavfile as wav
temp_file = "temp_voice.wav"
wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16))

View file

@@ -1,31 +1,27 @@
# Filename: whisper_main.py
import sounddevice as sd
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
from faster_whisper import WhisperModel
class RobotEar:
    """Speech recognition module backed by faster-whisper.

    Captures microphone audio via a sounddevice InputStream into an
    in-memory buffer, then transcribes the accumulated frames with a
    faster-whisper model (Chinese, beam search).
    """

    def __init__(self, model_size="base"):
        # float16 on CUDA keeps inference latency low; requires a GPU at runtime.
        self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
        self.fs = 16000  # Whisper models expect 16 kHz mono audio
        self.recording_buffer = []
        # Created by start_recording(); kept so the stream can be stopped/closed later.
        self.stream = None

    def start_recording(self):
        """Start capturing microphone audio into recording_buffer.

        Fix for broken recording: sounddevice has no module-level
        ``start_stream()`` function. Open-ended recording must go through an
        ``InputStream`` with a callback; the original code also never
        registered ``record_callback``, so no frames were ever captured.
        """
        self.recording_buffer = []
        self.stream = sd.InputStream(
            samplerate=self.fs,
            channels=1,
            callback=self.record_callback,
        )
        self.stream.start()
        print(">>> [耳朵] 录音中...")

    def record_callback(self, indata, frames, time, status):
        """sounddevice callback: append each incoming audio block to the buffer.

        Runs on the audio thread. ``indata`` is reused by sounddevice between
        calls, so a copy must be stored, not the array itself.
        """
        self.recording_buffer.append(indata.copy())

    def get_text(self, audio_data):
        """Transcribe audio frames to text.

        Args:
            audio_data: list of numpy arrays captured from sounddevice InputStream.

        Returns:
            Transcribed string (stripped).
        """
        temp_file = "temp_voice.wav"
        audio_np = np.concatenate(audio_data, axis=0)
        # Scale float samples (assumed in [-1, 1]) to int16 PCM for the WAV file.
        wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
        segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
        return "".join(s.text for s in segments).strip()