merge: fix whisper broken start_recording (closes #2)

This commit is contained in:
m1ngsama 2026-02-20 20:45:23 +08:00
commit f631157887
2 changed files with 28 additions and 30 deletions

View file

@@ -1,17 +1,20 @@
import cv2 import json
import numpy as np
import time
import os import os
import re import re
import json import time
import torch
import cv2
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd import sounddevice as sd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer
from ultralytics import YOLO from ultralytics import YOLO
from arm_main import RobotArmUltimate from arm_main import RobotArmUltimate
from whisper_main import RobotEar from whisper_main import RobotEar
# 禁用代理 # Disable proxy for local serial/model communication
os.environ["no_proxy"] = "localhost,127.0.0.1" os.environ["no_proxy"] = "localhost,127.0.0.1"
# ========================================================= # =========================================================
@@ -686,7 +689,6 @@ class RobotApp:
print(f">>> [语音] 音频太长({duration:.1f}s)截断到15秒") print(f">>> [语音] 音频太长({duration:.1f}s)截断到15秒")
audio_trimmed = audio_trimmed[:16000 * 15] audio_trimmed = audio_trimmed[:16000 * 15]
import scipy.io.wavfile as wav
temp_file = "temp_voice.wav" temp_file = "temp_voice.wav"
wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16)) wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16))

View file

@@ -1,31 +1,27 @@
# 文件名: whisper_main.py
import sounddevice as sd
import numpy as np import numpy as np
import scipy.io.wavfile as wav import scipy.io.wavfile as wav
import sounddevice as sd
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
class RobotEar:
    """Speech recognition module backed by faster-whisper."""

    def __init__(self, model_size="base"):
        # NOTE(review): hard-codes CUDA + float16 — assumes a GPU host; confirm
        # this matches the deployment target.
        self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
        # Sample rate (Hz) expected for incoming audio frames.
        self.fs = 16000

    def get_text(self, audio_data):
        """Transcribe captured audio frames to text.

        Args:
            audio_data: list of numpy float arrays (sounddevice InputStream
                chunks), samples nominally in [-1.0, 1.0].

        Returns:
            Transcribed string, stripped of surrounding whitespace.
        """
        temp_file = "temp_voice.wav"
        audio_np = np.concatenate(audio_data, axis=0)
        # Clip before quantizing: samples slightly outside [-1, 1] would
        # otherwise wrap around in int16 and produce loud pops in the WAV.
        audio_np = np.clip(audio_np, -1.0, 1.0)
        wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
        # beam_size=5 trades a little latency for accuracy; language pinned
        # to Chinese to skip auto-detection.
        segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
        return "".join(s.text for s in segments).strip()