Jetpack AI: Add machinery to generate audio transcriptions (#35691)

lhkowalski · web-flow · commit a2392b00a3de · 2024-02-15T16:28:06.000-03:00
* Create function to process transcriptions

* Expose the audio transcription function to the world

* Add first version of the use-audio-transcription hook

* Expose the hook to the world

* Expose use-audio-transcription hook types

* Add changelog file

* Add missing line

* Add audio transcription demo component

* changelog

* Update testing component labels

* Bump ai-client version

* Fix imports after module settings change

* Import apiFetch the right way

* Move apiFetch special handling to a dedicated file

* Introduce onReady and onError callbacks for the transcription hook

* Add demo for the transcription hook
diff --git a/projects/js-packages/ai-client/changelog/update-voice-to-content-handle-audio-transcription-request b/projects/js-packages/ai-client/changelog/update-voice-to-content-handle-audio-transcription-request
@@ -0,0 +1,4 @@
+Significance: minor
+Type: added
+
+AI Client: add support for audio transcriptions.
diff --git a/projects/js-packages/ai-client/package.json b/projects/js-packages/ai-client/package.json
@@ -1,7 +1,7 @@
 {
 	"private": false,
 	"name": "@automattic/jetpack-ai-client",
-	"version": "0.6.2-alpha",
+	"version": "0.7.0-alpha",
 	"description": "A JS client for consuming Jetpack AI services",
 	"homepage": "https://github.com/Automattic/jetpack/tree/HEAD/projects/js-packages/ai-client/#readme",
 	"bugs": {
diff --git a/projects/js-packages/ai-client/src/api-fetch/index.ts b/projects/js-packages/ai-client/src/api-fetch/index.ts
@@ -0,0 +1,13 @@
+/**
+ * External dependencies
+ */
+import apiFetchMod from '@wordpress/api-fetch';
+
+// As of 6.47.0, @wordpress/api-fetch declares itself so that tsc and node see the function at
+// apiFetchMod.default, while some other environments (including code running inside WordPress
+// itself) see it as the imported module value, apiFetchMod. See https://arethetypeswrong.github.io/?p=@wordpress/api-fetch@6.47.0
+// This helper picks whichever of the two is present, so the ai-client package imports it from one place.
+type ApiFetch = typeof apiFetchMod.default;
+const resolvedApiFetch = ( apiFetchMod.default ?? apiFetchMod ) as ApiFetch;
+
+export default resolvedApiFetch;
diff --git a/projects/js-packages/ai-client/src/audio-transcription/index.ts b/projects/js-packages/ai-client/src/audio-transcription/index.ts
@@ -0,0 +1,67 @@
+/**
+ * External dependencies
+ */
+import debugFactory from 'debug';
+/**
+ * Internal dependencies
+ */
+import apiFetch from '../api-fetch/index.js';
+import requestJwt from '../jwt/index.js';
+
+const debug = debugFactory( 'jetpack-ai-client:audio-transcription' );
+
+/**
+ * Shape of the response body that transcribeAudio reads from the transcription request.
+ */
+type AudioTranscriptionResponse = {
+	/**
+	 * The transcribed text; this is the value transcribeAudio resolves with.
+	 */
+	text: string;
+};
+
+/**
+ * Requests a token, then posts the audio blob to the transcription endpoint and returns its text.
+ *
+ * @param {Blob} audio - The audio to be transcribed, from a recording or from a file.
+ * @param {string} [feature] - Optional feature name calling the transcription; percent-encoded into the query string.
+ * @returns {Promise<string>} - Resolves with the transcribed text; rejects with the error from the token or transcription request.
+ */
+export default async function transcribeAudio( audio: Blob, feature?: string ): Promise< string > {
+	debug( 'Transcribing audio: %o. Feature: %o', audio, feature );
+
+	// Get a token to use the transcription service
+	let token = '';
+	try {
+		token = ( await requestJwt() ).token;
+	} catch ( error ) {
+		debug( 'Error getting token: %o', error );
+		throw error;
+	}
+
+	// Build a FormData object to hold the audio file
+	const formData = new FormData();
+	formData.append( 'audio_file', audio );
+
+	try {
+		const headers = {
+			Authorization: `Bearer ${ token }`,
+		};
+
+		const response: AudioTranscriptionResponse = await apiFetch( {
+			url: `https://public-api.wordpress.com/wpcom/v2/jetpack-ai-transcription${
+				feature ? `?feature=${ encodeURIComponent( feature ) }` : ''
+			}`,
+			method: 'POST',
+			body: formData,
+			headers,
+		} );
+
+		debug( 'Transcription response: %o', response );
+
+		return response.text;
+	} catch ( error ) {
+		debug( 'Transcription error response: %o', error );
+		throw error;
+	}
+}
diff --git a/projects/js-packages/ai-client/src/hooks/use-audio-transcription/index.ts b/projects/js-packages/ai-client/src/hooks/use-audio-transcription/index.ts
@@ -0,0 +1,81 @@
+/**
+ * External dependencies
+ */
+import { useCallback, useState } from '@wordpress/element';
+import debugFactory from 'debug';
+/**
+ * Internal dependencies
+ */
+import transcribeAudio from '../../audio-transcription/index.js';
+
+const debug = debugFactory( 'jetpack-ai-client:use-audio-transcription' );
+
+/**
+ * Values returned by useAudioTranscription: result text, in-flight flag, error text and the trigger.
+ */
+export type UseAudioTranscriptionReturn = {
+	transcriptionResult: string;
+	isTranscribingAudio: boolean;
+	transcriptionError: string;
+	transcribeAudio: ( audio: Blob ) => void;
+};
+
+/**
+ * Options accepted by useAudioTranscription: the feature name and optional completion callbacks.
+ */
+export type UseAudioTranscriptionProps = {
+	feature: string;
+	onReady?: ( transcription: string ) => void;
+	onError?: ( error: string ) => void;
+};
+
+/**
+ * A hook to handle audio transcription, exposing the result, the error and the in-progress state.
+ *
+ * @param {UseAudioTranscriptionProps} props - The feature name calling the transcription, plus optional onReady and onError callbacks.
+ * @returns {UseAudioTranscriptionReturn} - Object with properties to get the transcription data.
+ */
+export default function useAudioTranscription( {
+	feature,
+	onReady,
+	onError,
+}: UseAudioTranscriptionProps ): UseAudioTranscriptionReturn {
+	const [ transcriptionResult, setTranscriptionResult ] = useState< string >( '' );
+	const [ transcriptionError, setTranscriptionError ] = useState< string >( '' );
+	const [ isTranscribingAudio, setIsTranscribingAudio ] = useState( false );
+
+	const handleAudioTranscription = useCallback(
+		( audio: Blob ) => {
+			debug( 'Transcribing audio' );
+
+			// Reset the previous result and error before starting a new request.
+			setTranscriptionResult( '' );
+			setTranscriptionError( '' );
+			setIsTranscribingAudio( true );
+
+			/**
+			 * Call the audio transcription library.
+			 */
+			transcribeAudio( audio, feature )
+				.then( transcriptionText => {
+					setTranscriptionResult( transcriptionText );
+					onReady?.( transcriptionText );
+				} )
+				.catch( error => {
+					setTranscriptionError( error.message );
+					onError?.( error.message );
+				} )
+				.finally( () => setIsTranscribingAudio( false ) );
+		},
+		// feature and both callbacks are read inside the callback, so they must be dependencies;
+		// without them a re-render with new props would keep using the first render's values.
+		[ feature, onReady, onError ]
+	);
+
+	return {
+		transcriptionResult,
+		isTranscribingAudio,
+		transcriptionError,
+		transcribeAudio: handleAudioTranscription,
+	};
+}
diff --git a/projects/js-packages/ai-client/src/index.ts b/projects/js-packages/ai-client/src/index.ts
@@ -4,12 +4,14 @@
 export { default as requestJwt } from './jwt/index.js';
 export { default as SuggestionsEventSource } from './suggestions-event-source/index.js';
 export { default as askQuestion } from './ask-question/index.js';
+export { default as transcribeAudio } from './audio-transcription/index.js';
 
 /*
  * Hooks
  */
 export { default as useAiSuggestions } from './hooks/use-ai-suggestions/index.js';
 export { default as useMediaRecording } from './hooks/use-media-recording/index.js';
+export { default as useAudioTranscription } from './hooks/use-audio-transcription/index.js';
 
 /*
  * Components: Icons
diff --git a/projects/js-packages/ai-client/src/jwt/index.ts b/projects/js-packages/ai-client/src/jwt/index.ts
@@ -2,8 +2,11 @@
  * External dependencies
  */
 import { isSimpleSite } from '@automattic/jetpack-shared-extension-utils';
-import apiFetchMod from '@wordpress/api-fetch';
 import debugFactory from 'debug';
+/**
+ * Internal dependencies
+ */
+import apiFetch from '../api-fetch/index.js';
 /*
  * Types & constants
  */
@@ -27,12 +30,6 @@ type TokenDataEndpointResponseProps = {
 
 const debug = debugFactory( 'jetpack-ai-client:jwt' );
 
-// @wordpress/api-fetch (as of 6.47.0) declares itself in such a way that tsc and node see the function at apiFetchMod.default
-// while some other environments (including code running inside WordPress itself) see it at apiFetch.
-// See https://arethetypeswrong.github.io/?p=@wordpress/api-fetch@6.47.0
-type ApiFetchType = typeof apiFetchMod.default;
-const apiFetch: ApiFetchType = ( apiFetchMod.default ?? apiFetchMod ) as ApiFetchType;
-
 const JWT_TOKEN_ID = 'jetpack-ai-jwt';
 const JWT_TOKEN_EXPIRATION_TIME = 2 * 60 * 1000; // 2 minutes
 
diff --git a/projects/js-packages/ai-client/src/types.ts b/projects/js-packages/ai-client/src/types.ts
@@ -32,7 +32,15 @@ export type PromptProp = PromptMessagesProp | string;
  * Data Flow types
  */
 export type { UseAiContextOptions } from './data-flow/use-ai-context.js';
+
+/*
+ * Hook types
+ */
 export type { RequestingErrorProps } from './hooks/use-ai-suggestions/index.js';
+export type {
+	UseAudioTranscriptionProps,
+	UseAudioTranscriptionReturn,
+} from './hooks/use-audio-transcription/index.js';
 
 /*
  * Requests types
diff --git a/projects/plugins/jetpack/changelog/update-voice-to-content-handle-audio-transcription-request b/projects/plugins/jetpack/changelog/update-voice-to-content-handle-audio-transcription-request
@@ -0,0 +1,4 @@
+Significance: minor
+Type: other
+
+Jetpack AI: add an audio transcription usage example to the Voice-to-Content block.
diff --git a/projects/plugins/jetpack/extensions/blocks/voice-to-content/edit.tsx b/projects/plugins/jetpack/extensions/blocks/voice-to-content/edit.tsx
@@ -6,9 +6,11 @@ import {
 	micIcon,
 	playerPauseIcon,
 	useMediaRecording,
+	useAudioTranscription,
+	UseAudioTranscriptionReturn,
 } from '@automattic/jetpack-ai-client';
 import { ThemeProvider } from '@automattic/jetpack-components';
-import { Button, Modal, Icon } from '@wordpress/components';
+import { Button, Modal, Icon, FormFileUpload } from '@wordpress/components';
 import { useDispatch } from '@wordpress/data';
 import { useCallback } from '@wordpress/element';
 import { __ } from '@wordpress/i18n';
@@ -80,6 +82,22 @@ function ContextualRow( { state, error = null, audioURL = null } ) {
 function ActionButtons( { state, mediaControls } ) {
 	const { start, pause, resume, stop } = mediaControls ?? {};
 
+	const onTranscriptionReady = ( transcription: string ) => {
+		// eslint-disable-next-line no-console
+		console.log( 'Transcription ready: ', transcription );
+	};
+
+	const onTranscriptionError = ( error: string ) => {
+		// eslint-disable-next-line no-console
+		console.log( 'Transcription error: ', error );
+	};
+
+	const { transcribeAudio }: UseAudioTranscriptionReturn = useAudioTranscription( {
+		feature: 'voice-to-content',
+		onReady: onTranscriptionReady,
+		onError: onTranscriptionError,
+	} );
+
 	const recordingHandler = useCallback( () => {
 		if ( state === 'inactive' ) {
 			start?.( 1000 ); // Stream audio on 1 second intervals
@@ -90,8 +108,11 @@ function ActionButtons( { state, mediaControls } ) {
 		}
 	}, [ state, start, pause, resume ] );
 
-	const uploadHandler = () => {
-		throw new Error( 'Not implemented' );
+	const uploadHandler = event => {
+		if ( event.currentTarget.files.length > 0 ) {
+			const file = event.currentTarget.files[ 0 ];
+			transcribeAudio( file );
+		}
 	};
 
 	const doneHandler = useCallback( () => {
@@ -122,13 +143,14 @@ function ActionButtons( { state, mediaControls } ) {
 				</Button>
 			) }
 			{ [ 'inactive', 'error' ].includes( state ) && (
-				<Button
-					className="jetpack-ai-voice-to-content__button"
+				<FormFileUpload
+					accept="audio/*"
+					onChange={ uploadHandler }
 					variant="secondary"
-					onClick={ uploadHandler }
+					className="jetpack-ai-voice-to-content__button"
 				>
 					{ __( 'Upload audio', 'jetpack' ) }
-				</Button>
+				</FormFileUpload>
 			) }
 			{ [ 'recording', 'paused' ].includes( state ) && (
 				<Button

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"private": false,`
`3`	`3`	`"name": "@automattic/jetpack-ai-client",`
`4`		`- "version": "0.6.2-alpha",`
	`4`	`+ "version": "0.7.0-alpha",`
`5`	`5`	`"description": "A JS client for consuming Jetpack AI services",`
`6`	`6`	`"homepage": "https://github.com/Automattic/jetpack/tree/HEAD/projects/js-packages/ai-client/#readme",`
`7`	`7`	`"bugs": {`