Skip to content

Latest commit

 

History

History
241 lines (201 loc) · 8.72 KB

TODO.org

File metadata and controls

241 lines (201 loc) · 8.72 KB

Tasks to do for voice-fn

Add pipeline interruptions

Make assistant context aggregator support interrupt

Add support for first message greeting in the pipeline

Add support for ultravox

Add support for Silero VAD

Add support for google gemini

Add support for telnyx transport

Add support for openai realtime API

Research webrtc support

Add local transport (microphone + speaker out)

Add TTFT metric

Add LLM usage metrics based on chunk responses (see the API docs for usage)

Research clojure-media for dedicated ffmpeg support for media conversion

Make a helper to create easier connections between processors

;; Sketch of the desired helper API: a single declarative map handed to
;; `voice-fn/create-flow`, with one entry per pipeline stage
;; (transport, transcriptor, llm, tts).
(def phone-flow
  (let [transport-cfg {:mode :telephony
                       :in (input-channel)
                       :out (output-channel)}
        asr-cfg       {:proc asr/deepgram-processor
                       :args {:transcription/api-key (secret [:deepgram :api-key])
                              :transcription/model :nova-2}}
        llm-cfg       {:proc llm/openai-llm-process
                       :args {:openai/api-key (secret [:openai :new-api-sk])
                              :llm/model "gpt-4o-mini"}}
        tts-cfg       {:proc tts/elevenlabs-tts-process
                       :args {:elevenlabs/api-key (secret [:elevenlabs :api-key])
                              :elevenlabs/model-id "eleven_flash_v2_5"}}]
    (voice-fn/create-flow {:language :en
                           :transport transport-cfg
                           :transcriptor asr-cfg
                           :llm llm-cfg
                           :tts tts-cfg})))

Add Gladia as a transcription provider

Some code from another project

;;;;;;;;; Gladia ASR ;;;;;;;;;;;;;
;; Leftover (commented-out) configuration options:
;; :frames_format "base64"
;; :word_timestamps true})
(def ^:private gladia-url
  "WebSocket endpoint for Gladia's audio transcription stream."
  "wss://api.gladia.io/audio/text/audio-transcription")

;; NOTE: this configuration may be outdated.
(def ^:private asr-configuration
  "Configuration map for the Gladia transcription session: API key, audio
  sample rate and encoding, and a manually selected language."
  {:x_gladia_key       api-key
   :sample_rate        8000
   :encoding           "WAV/ULAW"
   :language_behaviour "manual"
   :language           "romanian"})

(defn transcript?
  "Returns true when the `:event` of message `m` is \"transcript\",
  false otherwise."
  [m]
  (let [event (get m :event)]
    (= "transcript" event)))

(defn final-transcription?
  "Returns true when `m` is a transcript message (see `transcript?`) whose
  `:type` is \"final\"; false otherwise."
  [m]
  (if (transcript? m)
    (= "final" (:type m))
    false))

(defn partial-transcription?
  "Returns true when `m` is a transcript message (see `transcript?`) whose
  `:type` is \"partial\"; false otherwise."
  [m]
  (if (transcript? m)
    (= "partial" (:type m))
    false))

(defrecord GladiaASR [ws asr-chan]
  ASR
  ;; Sends the value found at [:media :payload] in `data` over the websocket,
  ;; wrapped as {:frames payload}.
  (send-audio-chunk [_ data]
    (let [payload (get-in data [:media :payload])]
      (send! ws {:frames payload} false)))
  ;; Closes the websocket only; `asr-chan` is left untouched.
  (close! [_]
    (ws/close! ws)))

(defn- make-gladia-asr!
  "Opens a websocket to `gladia-url` and returns a `GladiaASR` record that
  wraps it.

  Takes a map with key `:asr-text`, a channel onto which the `:transcription`
  of every final transcription message is put (via `>!`).

  On open, `asr-configuration` is sent as the first message. Incoming
  messages are parsed with `json/parse-if-json` and logged; only final
  transcriptions are forwarded. Errors and close events are logged only."
  [{:keys [asr-text]}]
  ;; TODO: Handle reconnect & errors
  (let [ws @(websocket gladia-url
                       {:on-open (fn [ws]
                                   (prn "Open ASR Stream")
                                   (send! ws asr-configuration)
                                   (u/log ::gladia-asr-connected))
                        :on-message (fn [_ws ^HeapCharBuffer data _last?]
                                      (let [m (json/parse-if-json (str data))]
                                        (u/log ::gladia-msg :m m)
                                        (when (final-transcription? m)
                                          (u/log ::gladia-asr-transcription :sentence (:transcription m) :transcription m)
                                          ;; Channels reject nil: putting a nil value throws
                                          ;; inside the go block, so skip final messages that
                                          ;; carry no :transcription.
                                          (when-some [text (:transcription m)]
                                            (go (>! asr-text text))))))
                        :on-error (fn [_ e]
                                    (u/log ::gladia-asr-error :exception e))
                        :on-close (fn [_ code reason]
                                    (u/log ::gladia-asr-closed :code code :reason reason))})]
    (->GladiaASR ws asr-text)))

Add openai text to speech

(require '[wkok.openai-clojure.api :as openai])

(defn openai
  "Generates speech for `input` through `openai/create-speech`.

  `config` (optional map) is merged over the default request, so any of
  `:input`, `:voice`, `:response_format` and `:model` can be overridden.
  The HTTP call is made with `{:version :http-2 :as :stream}`."
  ([input]
   (openai input {}))
  ([input config]
   (let [defaults  {:input input
                    :voice "alloy"
                    :response_format "wav"
                    :model "tts-1"}
         request   (merge defaults config)
         http-opts {:version :http-2 :as :stream}]
     (openai/create-speech request http-opts))))

(defn tts-stage-openai
  "Text-to-speech stage backed by `tts/openai`.

  Takes sentences from channel `in` until a nil take (channel closed). For
  each sentence it:
    1. records it with `append-message!` as an \"assistant\" message for `sid`;
    2. requests the audio, converts it with `audio/->twilio-phone`;
    3. forwards the audio in chunks of at most 256 bytes, base64-encoded,
       through `twilio/send-msg!` on the websocket of session `sid`.

  Any exception raised while synthesising or sending is logged and the loop
  moves on to the next sentence. Returns the value of the `a/go-loop`."
  [sid in]
  (a/go-loop []
    (let [sentence (a/<! in)]
      (when-not (nil? sentence)
        (append-message! sid "assistant" sentence)
        (try
          ;; with-open guarantees the response stream is closed even when
          ;; reading or sending fails half-way through.
          (with-open [sentence-stream (-> (tts/openai sentence) (io/input-stream))]
            (let [ais (AudioSystem/getAudioInputStream sentence-stream)
                  twilio-ais (audio/->twilio-phone ais)
                  buffer (byte-array 256)]
              ;; NOTE(review): `.read` is a blocking call inside a go block.
              (loop []
                (let [bytes-read (.read twilio-ais buffer)]
                  (when (pos? bytes-read)
                    (twilio/send-msg! (sessions/ws sid)
                                      sid
                                      ;; Encode only the bytes actually read; a short
                                      ;; read would otherwise send stale bytes left
                                      ;; in the buffer by the previous iteration.
                                      (e/encode-base64
                                       (java.util.Arrays/copyOf ^bytes buffer (int bytes-read))))
                    (recur))))))
          (catch Exception e
            (u/log ::tts-stage-error :exception e)))
        (recur)))))

Add rime ai text to speech

(def ^:private rime-tts-url
  "HTTP endpoint of the rime.ai text-to-speech API."
  "https://users.rime.ai/v1/rime-tts")

(defn rime
  "Requests speech for `sentence` from the rime.ai provider.

  POSTs a JSON payload to `rime-tts-url`, asking for `audio/x-mulaw` at a
  sampling rate of 8000, and returns the `:body` of the response (the
  request is made with `:as :stream`)."
  [sentence]
  (let [payload  (json/->json-str {:text sentence
                                   :reduceLatency false
                                   :samplingRate 8000
                                   :speedAlpha 1.0
                                   :modelId "v1"
                                   :speaker "Colby"})
        headers  {"Authorization" (str "Bearer " rime-api-key)
                  "Accept" "audio/x-mulaw"
                  "Content-Type" "application/json"}
        response (client/request {:method :post
                                  :url rime-tts-url
                                  :as :stream
                                  :body payload
                                  :headers headers})]
    (:body response)))

(defn rime-async
  "Asynchronous variant of `rime`: the audio for `sentence` is handed to
  `au/input-stream->chan` together with a channel buffered to 1024 and a
  size argument of 1024. Returns whatever `au/input-stream->chan` returns."
  [sentence]
  (let [size     1024
        audio-in (io/input-stream (rime sentence))
        out      (a/chan size)]
    (au/input-stream->chan audio-in out size)))

(defn tts-stage
  "Text-to-speech stage backed by `tts/rime`.

  Takes sentences from channel `in` until a nil take (channel closed). For
  each sentence it:
    1. records it with `append-message!` as an \"assistant\" message for `sid`;
    2. requests the audio stream from `tts/rime`;
    3. forwards the audio in chunks of at most 256 bytes, base64-encoded,
       through `twilio/send-msg!` on the websocket of session `sid`.

  Any exception raised while synthesising or sending is logged and the loop
  moves on to the next sentence. Returns the value of the `a/go-loop`."
  [sid in]
  (a/go-loop []
    (let [sentence (a/<! in)]
      (when-not (nil? sentence)
        (append-message! sid "assistant" sentence)
        (try
          ;; with-open guarantees the response stream is closed even when
          ;; reading or sending fails half-way through.
          (with-open [sentence-stream (-> (tts/rime sentence) (io/input-stream))]
            (let [buffer (byte-array 256)]
              ;; NOTE(review): `.read` is a blocking call inside a go block.
              (loop []
                (let [bytes-read (.read sentence-stream buffer)]
                  (when (pos? bytes-read)
                    (twilio/send-msg! (sessions/ws sid)
                                      sid
                                      ;; Encode only the bytes actually read; a short
                                      ;; read would otherwise send stale bytes left
                                      ;; in the buffer by the previous iteration.
                                      (e/encode-base64
                                       (java.util.Arrays/copyOf ^bytes buffer (int bytes-read))))
                    (recur))))))
          (catch Exception e
            (u/log ::tts-stage-error :exception e)))
        (recur)))))

Implement diagram flows into voice-fn

This means implementing flow diagrams

;; Example flow definition: `:initial-node` names the entry node and `:nodes`
;; maps node ids to their configuration.
{:initial-node :start
 :nodes
 {:start {:role_messages [{:role :system
                           :content "You are an order-taking assistant. You must ALWAYS use the available functions to progress the conversation. This is a phone conversation and your responses will be converted to audio. Keep the conversation friendly, casual, and polite. Avoid outputting special characters and emojis."}]
          :task_messages [{:role :system
                           :content "For this step, ask the user if they want pizza or sushi, and wait for them to use a function to choose. Start off by greeting them. Be friendly and casual; you're taking an order for food over the phone."}]
          ;; :functions lives inside the node it applies to; as a direct child
          ;; of :nodes it would read as a node named :functions.
          :functions [{:type :function
                       :function {:name :choose_sushi
                                  :description "User wants to order sushi. Let's get that order started"}}]}}}

Implement pre-actions & post-actions

Implement background noise filtering with krisp.ai

Add support for Talon STT