使用 Whisper JAX 批量语音转文字
最近在练听力,找到了 EnglishPod 的 mp3,内容非常不错,但是缺少同步字幕,所以打算用 OpenAI 的 Whisper 来把语音转换成文字。
但是官方的 Whisper 每次只能处理 30 秒的音频,较长的文件需要自己分片处理。后来找到了 Whisper JAX,它声称速度更快,而且不需要自己处理分片的问题。
import jax
import jax.numpy as jnp
import os
import time
import json
import magic
from whisper_jax import FlaxWhisperPipline
def get_mp3_files(directory):
    """Yield the path of every MPEG audio file found under *directory*.

    The tree is walked recursively with ``os.walk``. Each file is classified
    by the MIME type that ``magic`` reports for it rather than by its
    extension; only files reported as ``audio/mpeg`` are yielded.

    Because this is a generator, nothing is scanned until iteration starts,
    and the "Total files" summary is printed only after the caller has
    consumed every path.

    Args:
        directory: Root directory to scan.

    Yields:
        The joined ``root``/file-name path of each matching file.
    """
    total_files = 0
    print("scanning files...")
    # One detector serves the whole scan; re-creating it for each file is
    # redundant work.
    mime = magic.Magic(mime=True)
    for root, _dirs, files in os.walk(directory):
        for file in files:
            path = os.path.join(root, file)
            if mime.from_file(path) == 'audio/mpeg':
                total_files += 1
                yield path
    print(f"Total files: {total_files}")
def mp3_to_text(directory):
    """Run the Whisper JAX pipeline over every MP3 under *directory*.

    For each path yielded by ``get_mp3_files`` the pipeline result is
    serialized as JSON into a sibling file that has the same base name and a
    ``.txt`` extension. An MP3 whose ``.txt`` already exists is skipped, so an
    interrupted batch can simply be restarted.

    Args:
        directory: Root directory to scan for MP3 files.
    """
    # Build the pipeline once; the same instance is reused for every file.
    pipeline = FlaxWhisperPipline("openai/whisper-large-v2", dtype=jnp.bfloat16)
    for mp3_file in get_mp3_files(directory):
        print(f"Processing file: {mp3_file}")
        txt_file = os.path.splitext(mp3_file)[0] + '.txt'
        if os.path.isfile(txt_file):
            print(f"Skipping file: {mp3_file}")
            continue
        start_time = time.time()
        # NOTE(review): task="translate" with language="chinese" looks at odds
        # with producing subtitles for English audio; presumably
        # task="transcribe" / language="english" was intended. Confirm against
        # the whisper-jax documentation before changing it.
        text = pipeline(mp3_file, task="translate", language="chinese", return_timestamps=True)
        text_length = len(text['text'])  # length of the returned text
        print(f"Text length: {text_length} characters")
        end_time = time.time()
        print(f"Processed in {end_time - start_time} seconds")
        # Serialize BEFORE opening the output: open(..., 'w') creates the file
        # immediately, so a json.dumps failure inside the with-block would
        # leave an empty .txt behind and every later run would skip this MP3.
        payload = json.dumps(text)
        # json.dumps emits ASCII-only text by default, so pinning the encoding
        # does not change the bytes written; it only removes the dependence on
        # the platform's default encoding.
        with open(txt_file, 'w', encoding='utf-8') as f:
            f.write(payload)
# Script driver: process the whole podcast directory and report the wall-clock
# time taken by the complete batch.
start_time = time.time()
mp3_to_text('/data/englishpod')
end_time = time.time()
total_seconds = end_time - start_time
print(f"Total time: {total_seconds} seconds")