从飞书消息中获取语音文件的file_key,下载为.ogg或.m4a格式。
使用 Python 的 soundfile 库将音频转换为 16kHz 采样率的单声道 WAV 格式:
# Convert a downloaded voice file into a 16 kHz mono WAV ('output.wav').
import numpy as np
import soundfile as sf

# Sample rate written to the output file; the recognition step that follows
# passes sampling_rate=16000 to the Whisper feature extractor.
TARGET_SR = 16000

# `voice_file` is the path of the downloaded voice message and must be
# defined before this snippet runs.
# NOTE(review): soundfile relies on libsndfile, which may not decode every
# container (e.g. .m4a) -- confirm the downloaded format is readable.
audio, sr = sf.read(voice_file)

# Multi-channel audio comes back as (frames, channels); average to mono.
if audio.ndim > 1:
    audio = audio.mean(axis=1)

# Writing the samples with a 16 kHz header without resampling would change
# the playback speed/pitch, so resample whenever the source rate differs.
# Linear interpolation keeps this dependency-free (no anti-aliasing filter);
# swap in a dedicated resampler if higher fidelity is required.
if sr != TARGET_SR and len(audio) > 0:
    n_target = max(1, int(round(len(audio) * TARGET_SR / sr)))
    src_times = np.arange(len(audio)) / sr
    dst_times = np.arange(n_target) / TARGET_SR
    audio = np.interp(dst_times, src_times, audio)

sf.write('output.wav', audio, TARGET_SR)
# Transcribe 'output.wav' with the openai/whisper-tiny model; the recognised
# text ends up in `result`.
import os

# Route Hugging Face downloads through the mainland-China mirror. Set this
# before importing transformers so the hub client picks up the endpoint.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

import soundfile as sf
import torch  # required for torch.no_grad() below
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperFeatureExtractor

# Read the audio; down-mix to mono if the file has several channels.
audio, sr = sf.read('output.wav')
if audio.ndim > 1:
    audio = audio.mean(axis=1)

# The feature extractor below is told the audio is 16 kHz; fail loudly if
# the file says otherwise instead of silently producing a bad transcript.
if sr != 16000:
    raise ValueError(f"expected 16000 Hz audio, got {sr} Hz; resample first")

# Load the processor (used for decoding), the model, and the feature extractor.
processor = WhisperProcessor.from_pretrained('openai/whisper-tiny')
model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-tiny')
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-tiny')

# Turn the waveform into model input features (PyTorch tensors).
# NOTE(review): Whisper works on fixed-length windows; long recordings may be
# truncated by the feature extractor -- confirm for messages over ~30 s.
input_features = feature_extractor(audio, sampling_rate=16000, return_tensors='pt').input_features

# Inference only: disable gradient tracking.
with torch.no_grad():
    predicted_ids = model.generate(input_features)

# Decode token ids to text, dropping special tokens; take the single result.
result = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
pip install torch transformers soundfile
共 1 个版本