app.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time    : 2023/05/12 18:19:33
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    : whisper voice to text
'''
import torch
from transformers import pipeline
import gradio as gr
# import pytube as pt
MODEL_NAME = "openai/whisper-large-v2"
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)
# Whisper's task tokens (<|transcribe|> / <|translate|>) sit at fixed
# positions near the end of the tokenizer's special-token list.
all_special_ids = pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
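# Note: picking task tokens by position in all_special_ids is brittle; newer
# transformers releases also expose a documented helper that builds the forced
# decoder ids from names instead (an alternative, not used here), e.g.:
#   pipe.tokenizer.get_decoder_prompt_ids(task="transcribe")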
def transcribe(microphone, state, task="transcribe"):
    file = microphone
    # Force the chosen task token at decoder position 2 (transcribe vs. translate).
    pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id if task == "transcribe" else translate_token_id]]
    text = pipe(file)["text"]
    # Append the new transcript to the running state; return it for both the textbox and the state.
    return state + "\n" + text, state + "\n" + text
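# The commented-out pytube import and the "YouTube video" note in the
# description suggest YouTube support was planned. A minimal sketch of such a
# helper (hypothetical; assumes pytube is installed, and is not wired into the
# Gradio interface below):
def yt_transcribe(yt_url, state, task="transcribe"):
    import pytube as pt  # imported lazily so the app still runs without pytube
    # Grab the audio-only stream, save it locally, and reuse the same pipeline.
    stream = pt.YouTube(yt_url).streams.filter(only_audio=True).first()
    audio_path = stream.download(filename="audio.mp4")
    return transcribe(audio_path, state, task)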
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", optional=True),
        gr.State(value=""),
    ],
    outputs=[
        gr.Textbox(lines=15),
        gr.State(),
    ],
    title="Speech to Text",
    layout="horizontal",
    theme="huggingface",
    live=True,
    description="Transcribe speech from your microphone or from a YouTube video",
    allow_flagging="never",
)
iface.launch(enable_queue=True)
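# Quick local check without the UI (hypothetical path to an existing audio file):
#   text, new_state = transcribe("sample.wav", state="")
#   print(text)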