2024年3月

最近在练听力,找到了 EnglishPod 的 mp3,内容非常不错,但是缺少同步字幕,所以打算用 OpenAI 的 Whisper 把音频转换成文字。
不过官方的 Whisper 每次只能处理 30 秒的音频,长音频需要自己分片处理;后来找到了 Whisper JAX,它声称速度更快,而且不需要自己处理分片的问题。

import jax
import jax.numpy as jnp
import os
import time
import json
import magic
from whisper_jax import FlaxWhisperPipline

def get_mp3_files(directory):
    """Yield the path of every MPEG audio file found under ``directory``.

    The tree is walked recursively with ``os.walk``. Files are selected by
    the MIME type that ``magic`` reports for them (``audio/mpeg``), not by
    their file extension. A message is printed when iteration starts, and
    the number of yielded files is printed once the walk is exhausted.

    Args:
        directory: Root directory to scan.

    Yields:
        str: ``root`` joined with the file name, for each matching file.
    """
    print("scanning files...")
    # A single detector serves the whole scan; creating one per file would
    # repeat its setup on every iteration of the inner loop.
    mime = magic.Magic(mime=True)
    total_files = 0
    for root, _dirs, files in os.walk(directory):
        for file in files:
            path = os.path.join(root, file)
            if mime.from_file(path) == 'audio/mpeg':
                total_files += 1
                yield path
    print(f"Total files: {total_files}")

def mp3_to_text(directory):
    """Run the Whisper pipeline on every MP3 under ``directory``.

    For each file yielded by ``get_mp3_files`` the pipeline result is
    serialized as JSON into a sibling file with the same base name and a
    ``.txt`` extension. Files whose ``.txt`` already exists are skipped, so
    an interrupted run can simply be started again.

    Args:
        directory: Root directory scanned for MP3 files.
    """
    # Created lazily: when every file already has its .txt there is no
    # reason to load the model at all.
    pipeline = None
    for mp3_file in get_mp3_files(directory):
        print(f"Processing file: {mp3_file}")

        txt_file = os.path.splitext(mp3_file)[0] + '.txt'
        if os.path.isfile(txt_file):
            print(f"Skipping file: {mp3_file}")
            continue

        if pipeline is None:
            pipeline = FlaxWhisperPipline("openai/whisper-large-v2", dtype=jnp.bfloat16)

        # NOTE(review): task="translate" with language="chinese" is kept as
        # is; Whisper's translate task is documented as producing English
        # text -- confirm this is what is wanted for these recordings.
        start_time = time.time()
        text = pipeline(mp3_file, task="translate", language="chinese", return_timestamps=True)
        # Stop the clock right after inference so only the pipeline call is timed.
        end_time = time.time()
        text_length = len(text['text'])
        print(f"Text length: {text_length} characters")
        print(f"Processed in {end_time - start_time} seconds")

        # Write to a temporary name and rename afterwards: the skip check
        # above treats any existing .txt as finished, so a partially written
        # file must never appear under the final name.
        tmp_file = txt_file + '.tmp'
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(text, f)
        os.replace(tmp_file, txt_file)


# Script driver: process everything under /data/englishpod and report the
# total wall-clock time. These statements sit at module level, so they run
# whenever the file is executed or imported.
start_time = time.time()
mp3_to_text('/data/englishpod')
end_time = time.time()
print(f"Total time: {end_time - start_time} seconds")