diff --git a/configs/audio-whisper/aws.yaml b/configs/audio-whisper/aws.yaml
new file mode 100644
index 000000000..09144cb1d
--- /dev/null
+++ b/configs/audio-whisper/aws.yaml
@@ -0,0 +1,10 @@
+head_node_type:
+  name: head_node_type
+  instance_type: m5.2xlarge
+
+worker_node_types:
+- name: cpu_worker
+  instance_type: m5.2xlarge
+  min_workers: 0
+  max_workers: 2
+  use_spot: false
\ No newline at end of file
diff --git a/configs/audio-whisper/gce.yaml b/configs/audio-whisper/gce.yaml
new file mode 100644
index 000000000..49bfccb86
--- /dev/null
+++ b/configs/audio-whisper/gce.yaml
@@ -0,0 +1,10 @@
+head_node_type:
+  name: head_node_type
+  instance_type: n2-standard-8
+
+worker_node_types:
+- name: cpu_worker
+  instance_type: n2-standard-8
+  min_workers: 0
+  max_workers: 2
+  use_spot: false
\ No newline at end of file
diff --git a/templates/audio-whisper/README.md b/templates/audio-whisper/README.md
new file mode 100644
index 000000000..6dd701f76
--- /dev/null
+++ b/templates/audio-whisper/README.md
@@ -0,0 +1,5 @@
+# Audio demo with OpenAI Whisper
+
+This demo shows how to run OpenAI Whisper in an Anyscale Workspace.
+It also demonstrates a cluster environment with audio-processing dependencies such as ffmpeg.
+
diff --git a/templates/audio-whisper/sample-ch.mp3 b/templates/audio-whisper/sample-ch.mp3
new file mode 100644
index 000000000..f31f772fc
Binary files /dev/null and b/templates/audio-whisper/sample-ch.mp3 differ
diff --git a/templates/audio-whisper/sample-en.mp3 b/templates/audio-whisper/sample-en.mp3
new file mode 100644
index 000000000..61de3714a
Binary files /dev/null and b/templates/audio-whisper/sample-en.mp3 differ
diff --git a/templates/audio-whisper/whisper.ipynb b/templates/audio-whisper/whisper.ipynb
new file mode 100644
index 000000000..4490d64f5
--- /dev/null
+++ b/templates/audio-whisper/whisper.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c4719f38-0166-4b80-a03f-290255b5c528",
+   "metadata": {},
+   "source": [
+    "# OpenAI Whisper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aaa04be4-6913-43bb-b20f-93b2cde52acc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import whisper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca8fe87e-2912-46b1-8a27-5271b2ffbc9b",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model = whisper.load_model(\"base\")\n",
+    "result = model.transcribe(\"sample-en.mp3\")\n",
+    "print(result[\"text\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "50542e68-b306-4c12-ae72-a8a2b21347db",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "model = whisper.load_model(\"base\")\n",
+    "\n",
+    "# load audio and pad/trim it to fit 30 seconds\n",
+    "audio = whisper.load_audio(\"sample-ch.mp3\")\n",
+    "audio = whisper.pad_or_trim(audio)\n",
+    "\n",
+    "# make log-Mel spectrogram and move to the same device as the model\n",
+    "mel = whisper.log_mel_spectrogram(audio).to(model.device)\n",
+    "\n",
+    "# detect the spoken language\n",
+    "_, probs = model.detect_language(mel)\n",
+    "print(f\"Detected language: {max(probs, key=probs.get)}\")\n",
+    "\n",
+    "# decode the audio\n",
+    "options = whisper.DecodingOptions(fp16 = False)\n",
+    "print(options)\n",
+    "result = whisper.decode(model, mel, options)\n",
+    "\n",
+    "# print the recognized text\n",
+    "print(result.text)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
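
For reference, a minimal sketch of how the same transcription could be fanned out across the `cpu_worker` nodes defined in `configs/audio-whisper/aws.yaml` / `gce.yaml` using Ray tasks. This is not part of the patch: it assumes `openai-whisper` and `ffmpeg` are installed on every node, that the audio files are readable from the workers (e.g. via the synced workspace directory), and the file list and `num_cpus` value are hypothetical.

```python
# Illustrative sketch (not part of this patch): distribute Whisper transcription
# across the cpu_worker nodes from configs/audio-whisper/{aws,gce}.yaml.
# Assumes openai-whisper and ffmpeg are installed on every node and that the
# audio files are readable from the workers; paths and num_cpus are hypothetical.
import ray
import whisper

ray.init()  # connect to the running Anyscale/Ray cluster (or start a local one)


@ray.remote(num_cpus=4)
def transcribe(path: str) -> str:
    # Each task loads its own copy of the small "base" model, which fits
    # comfortably in the memory of an m5.2xlarge / n2-standard-8 worker.
    model = whisper.load_model("base")
    result = model.transcribe(path)
    return result["text"]


audio_files = ["sample-en.mp3", "sample-ch.mp3"]  # hypothetical local paths
texts = ray.get([transcribe.remote(f) for f in audio_files])
for path, text in zip(audio_files, texts):
    print(f"{path}: {text}")
```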