mirror of
https://github.com/m1ngsama/robot_arm.git
synced 2026-03-25 19:53:49 +00:00
fix(whisper): remove broken start_recording; move scipy import to top-level
whisper_main.py: - Remove RobotEar.start_recording() and record_callback(), which called the nonexistent sd.start_stream() API (the correct API is sd.InputStream). These methods were never called by voice_main.py and contained a broken sounddevice API call that would raise AttributeError (#2). - Remove the unused recording_buffer field. - Translate a Chinese comment/docstring to English (#5). voice_main.py: - Move `import scipy.io.wavfile as wav` from inside the get_audio_text() function body to the module top level, where all imports belong (related to #4). - Sort imports: stdlib first, then third-party, local imports last. - Replace a Chinese comment with its English equivalent.
This commit is contained in:
parent
a7209c4f78
commit
6977061bef
2 changed files with 28 additions and 30 deletions
|
|
@ -1,17 +1,20 @@
|
|||
import cv2
|
||||
import numpy as np
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import torch
|
||||
import sounddevice as sd
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from ultralytics import YOLO
|
||||
from arm_main import RobotArmUltimate
|
||||
from whisper_main import RobotEar
|
||||
import time
|
||||
|
||||
# 禁用代理
|
||||
import cv2
|
||||
import numpy as np
|
||||
import scipy.io.wavfile as wav
|
||||
import sounddevice as sd
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from ultralytics import YOLO
|
||||
|
||||
from arm_main import RobotArmUltimate
|
||||
from whisper_main import RobotEar
|
||||
|
||||
# Disable proxy for local serial/model communication
|
||||
os.environ["no_proxy"] = "localhost,127.0.0.1"
|
||||
|
||||
# =========================================================
|
||||
|
|
@ -686,7 +689,6 @@ class RobotApp:
|
|||
print(f">>> [语音] 音频太长({duration:.1f}s),截断到15秒")
|
||||
audio_trimmed = audio_trimmed[:16000 * 15]
|
||||
|
||||
import scipy.io.wavfile as wav
|
||||
temp_file = "temp_voice.wav"
|
||||
wav.write(temp_file, 16000, (audio_trimmed * 32767).astype(np.int16))
|
||||
|
||||
|
|
|
|||
|
|
@ -1,31 +1,27 @@
|
|||
# 文件名: whisper_main.py
|
||||
import sounddevice as sd
|
||||
import numpy as np
|
||||
import scipy.io.wavfile as wav
|
||||
import sounddevice as sd
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
|
||||
class RobotEar:
|
||||
"""Speech recognition module backed by faster-whisper."""
|
||||
|
||||
def __init__(self, model_size="base"):
|
||||
self.model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
||||
self.fs = 16000
|
||||
self.recording_buffer = []
|
||||
|
||||
def start_recording(self):
|
||||
self.recording_buffer = []
|
||||
# 开始长录音
|
||||
sd.start_stream(samplerate=self.fs, channels=1)
|
||||
print(">>> [耳朵] 录音中...")
|
||||
|
||||
def record_callback(self, indata, frames, time, status):
|
||||
self.recording_buffer.append(indata.copy())
|
||||
|
||||
def get_text(self, audio_data):
|
||||
"""将传入的音频数组转为文字"""
|
||||
"""Transcribe audio frames to text.
|
||||
|
||||
Args:
|
||||
audio_data: list of numpy arrays captured from sounddevice InputStream.
|
||||
|
||||
Returns:
|
||||
Transcribed string (stripped).
|
||||
"""
|
||||
temp_file = "temp_voice.wav"
|
||||
# 归一化音频数据
|
||||
audio_np = np.concatenate(audio_data, axis=0)
|
||||
wav.write(temp_file, self.fs, (audio_np * 32767).astype(np.int16))
|
||||
|
||||
segments, info = self.model.transcribe(temp_file, beam_size=5, language="zh")
|
||||
text = "".join([s.text for s in segments])
|
||||
return text.strip()
|
||||
segments, _ = self.model.transcribe(temp_file, beam_size=5, language="zh")
|
||||
return "".join(s.text for s in segments).strip()
|
||||
|
|
|
|||
Loading…
Reference in a new issue