#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2023/05/12 18:19:33
@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :   whisper voice to text
'''

import torch
from transformers import pipeline
import gradio as gr
# import pytube as pt

MODE_NAME = "openai/whisper-large-v2"

# Use CUDA device 0 when torch reports CUDA is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Speech-recognition pipeline, built once at import time and shared by every
# call to transcribe(). Audio is processed in 30-second chunks.
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODE_NAME,
    device=device,
    chunk_length_s=30,
)

# NOTE(review): these negative indices assume a fixed ordering of the
# tokenizer's special tokens (transcribe at -5, translate at -6) -- confirm
# against the installed transformers version.
all_special_ids = pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]


def transcribe(microphone, state, task="transcribe"):
    """Run speech recognition on a recording and append it to the transcript.

    Args:
        microphone: Path to the recorded audio file, or ``None`` when no
            recording is available (the audio input is optional).
        state: Transcript accumulated so far in this session.
        task: ``"transcribe"`` selects the transcribe token; any other value
            selects the translate token.

    Returns:
        A ``(textbox_value, new_state)`` pair; both elements are the updated
        transcript.
    """
    # The audio input is optional and the interface is live, so the callback
    # can fire without a recording. Leave the transcript untouched then
    # instead of passing None to the pipeline.
    if microphone is None:
        return state, state

    task_token_id = (
        transcribe_token_id if task == "transcribe" else translate_token_id
    )
    # Force the task token at decoder position 2.
    # NOTE(review): presumably position 1 is the language token -- confirm.
    pipe.model.config.forced_decoder_ids = [[2, task_token_id]]

    text = pipe(microphone)["text"]

    # Only insert the separator when there is earlier text, so the first
    # result does not start with an empty line.
    transcript = state + "\n" + text if state else text
    return transcript, transcript


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", optional=True),
        gr.State(value=""),
    ],
    outputs=[
        gr.Textbox(lines=15),
        gr.State(),
    ],
    title="Speech to Text",
    layout="horizontal",
    theme="huggingface",
    live=True,
    description="Transcribe speech from your microphone or from a youtube video",
    allow_flagging="never",
)

iface.launch(enable_queue=True)