diff --git a/.mock/definition/empathic-voice/chat.yml b/.mock/definition/empathic-voice/chat.yml
index 6d9d6ac5..ee5e412a 100644
--- a/.mock/definition/empathic-voice/chat.yml
+++ b/.mock/definition/empathic-voice/chat.yml
@@ -113,12 +113,12 @@ channel:
       For more details, refer to the [Authentication Strategies
       Guide](/docs/introduction/api-key#authentication-strategies).
   messages:
-    publish:
-      origin: client
-      body: PublishEvent
     subscribe:
      origin: server
      body: SubscribeEvent
+    publish:
+      origin: client
+      body: PublishEvent
   examples:
     - messages:
         - type: publish
@@ -131,19 +131,6 @@ channel:
 imports:
   root: __package__.yml
 types:
-  PublishEvent:
-    discriminated: false
-    union:
-      - type: root.AudioInput
-      - type: root.SessionSettings
-      - type: root.UserInput
-      - type: root.AssistantInput
-      - type: root.ToolResponseMessage
-      - type: root.ToolErrorMessage
-      - type: root.PauseAssistantMessage
-      - type: root.ResumeAssistantMessage
-    source:
-      openapi: evi-asyncapi.json
   SubscribeEvent:
     discriminated: false
     union:
@@ -159,3 +146,16 @@ types:
       - type: root.ToolErrorMessage
     source:
       openapi: evi-asyncapi.json
+  PublishEvent:
+    discriminated: false
+    union:
+      - type: root.AudioInput
+      - type: root.SessionSettings
+      - type: root.UserInput
+      - type: root.AssistantInput
+      - type: root.ToolResponseMessage
+      - type: root.ToolErrorMessage
+      - type: root.PauseAssistantMessage
+      - type: root.ResumeAssistantMessage
+    source:
+      openapi: evi-asyncapi.json
diff --git a/.mock/definition/empathic-voice/configs.yml b/.mock/definition/empathic-voice/configs.yml
index 04623260..430d92d3 100644
--- a/.mock/definition/empathic-voice/configs.yml
+++ b/.mock/definition/empathic-voice/configs.yml
@@ -658,6 +658,7 @@ service:
       response:
         docs: Success
         type: text
+        status-code: 200
       errors:
         - root.BadRequestError
       examples:
diff --git a/.mock/definition/empathic-voice/prompts.yml b/.mock/definition/empathic-voice/prompts.yml
index ad364ad5..9b00c47d 100644
--- a/.mock/definition/empathic-voice/prompts.yml
+++ b/.mock/definition/empathic-voice/prompts.yml
@@ -377,6 +377,7 @@ service:
       response:
         docs: Success
         type: text
+        status-code: 200
       errors:
         - root.BadRequestError
       examples:
diff --git a/.mock/definition/empathic-voice/tools.yml b/.mock/definition/empathic-voice/tools.yml
index 1a0964bf..c926e230 100644
--- a/.mock/definition/empathic-voice/tools.yml
+++ b/.mock/definition/empathic-voice/tools.yml
@@ -427,6 +427,8 @@ service:
         content-type: application/json
       response:
         docs: Success
+        type: text
+        status-code: 200
       errors:
         - root.BadRequestError
       examples:
diff --git a/.mock/definition/expression-measurement/stream/stream.yml b/.mock/definition/expression-measurement/stream/stream.yml
index bdb6f041..1ed74b71 100644
--- a/.mock/definition/expression-measurement/stream/stream.yml
+++ b/.mock/definition/expression-measurement/stream/stream.yml
@@ -7,14 +7,14 @@ channel:
       type: string
       name: humeApiKey
   messages:
+    subscribe:
+      origin: server
+      body: SubscribeEvent
     publish:
       origin: client
       body:
         type: StreamModelsEndpointPayload
         docs: Models endpoint payload
-    subscribe:
-      origin: server
-      body: SubscribeEvent
   examples:
     - messages:
         - type: publish
@@ -22,209 +22,6 @@ channel:
         - type: subscribe
           body: {}
 types:
-  StreamFace:
-    docs: >
-      Configuration for the facial expression emotion model.
-
-
-      Note: Using the `reset_stream` parameter does not have any effect on face
-      identification. A single face identifier cache is maintained over a full
-      session whether `reset_stream` is used or not.
-    properties:
-      facs:
-        type: optional<map<string, unknown>>
-        docs: >-
-          Configuration for FACS predictions. If missing or null, no FACS
-          predictions will be generated.
-      descriptions:
-        type: optional<map<string, unknown>>
-        docs: >-
-          Configuration for Descriptions predictions. If missing or null, no
-          Descriptions predictions will be generated.
-      identify_faces:
-        type: optional<boolean>
-        docs: >
-          Whether to return identifiers for faces across frames. If true, unique
-          identifiers will be assigned to face bounding boxes to differentiate
-          different faces. If false, all faces will be tagged with an "unknown"
-          ID.
-        default: false
-      fps_pred:
-        type: optional<double>
-        docs: >
-          Number of frames per second to process. Other frames will be omitted
-          from the response.
-        default: 3
-      prob_threshold:
-        type: optional<double>
-        docs: >
-          Face detection probability threshold. Faces detected with a
-          probability less than this threshold will be omitted from the
-          response.
-        default: 3
-      min_face_size:
-        type: optional<double>
-        docs: >
-          Minimum bounding box side length in pixels to treat as a face. Faces
-          detected with a bounding box side length in pixels less than this
-          threshold will be omitted from the response.
-        default: 3
-    source:
-      openapi: streaming-asyncapi.yml
-    inline: true
-  StreamLanguage:
-    docs: Configuration for the language emotion model.
-    properties:
-      sentiment:
-        type: optional<map<string, unknown>>
-        docs: >-
-          Configuration for sentiment predictions. If missing or null, no
-          sentiment predictions will be generated.
-      toxicity:
-        type: optional<map<string, unknown>>
-        docs: >-
-          Configuration for toxicity predictions. If missing or null, no
-          toxicity predictions will be generated.
-      granularity:
-        type: optional<string>
-        docs: >-
-          The granularity at which to generate predictions. Values are `word`,
-          `sentence`, `utterance`, or `passage`. To get a single prediction for
-          the entire text of your streaming payload use `passage`. Default value
-          is `word`.
-    source:
-      openapi: streaming-asyncapi.yml
-    inline: true
-  Config:
-    docs: >
-      Configuration used to specify which models should be used and with what
-      settings.
-    properties:
-      burst:
-        type: optional<map<string, unknown>>
-        docs: |
-          Configuration for the vocal burst emotion model.
-
-          Note: Model configuration is not currently available in streaming.
-
-          Please use the default configuration by passing an empty object `{}`.
-      face:
-        type: optional<StreamFace>
-        docs: >
-          Configuration for the facial expression emotion model.
-
-
-          Note: Using the `reset_stream` parameter does not have any effect on
-          face identification. A single face identifier cache is maintained over
-          a full session whether `reset_stream` is used or not.
-      facemesh:
-        type: optional<map<string, unknown>>
-        docs: |
-          Configuration for the facemesh emotion model.
-
-          Note: Model configuration is not currently available in streaming.
-
-          Please use the default configuration by passing an empty object `{}`.
-      language:
-        type: optional<StreamLanguage>
-        docs: Configuration for the language emotion model.
-      prosody:
-        type: optional<map<string, unknown>>
-        docs: |
-          Configuration for the speech prosody emotion model.
-
-          Note: Model configuration is not currently available in streaming.
-
-          Please use the default configuration by passing an empty object `{}`.
-    source:
-      openapi: streaming-asyncapi.yml
-    inline: true
-  StreamModelsEndpointPayload:
-    docs: Models endpoint payload
-    properties:
-      data:
-        type: optional<string>
-      models:
-        type: optional<Config>
-        docs: >
-          Configuration used to specify which models should be used and with
-          what settings.
-      stream_window_ms:
-        type: optional<double>
-        docs: >
-          Length in milliseconds of streaming sliding window.
-
-
-          Extending the length of this window will prepend media context from
-          past payloads into the current payload.
-
-
-          For example, if on the first payload you send 500ms of data and on the
-          second payload you send an additional 500ms of data, a window of at
-          least 1000ms will allow the model to process all 1000ms of stream
-          data.
-
-
-          A window of 600ms would append the full 500ms of the second payload to
-          the last 100ms of the first payload.
-
-
-          Note: This feature is currently only supported for audio data and
-          audio models. For other file types and models this parameter will be
-          ignored.
-        default: 5000
-        validation:
-          min: 500
-          max: 10000
-      reset_stream:
-        type: optional<boolean>
-        docs: >
-          Whether to reset the streaming sliding window before processing the
-          current payload.
-
-
-          If this parameter is set to `true` then past context will be deleted
-          before processing the current payload.
-
-
-          Use reset_stream when one audio file is done being processed and you
-          do not want context to leak across files.
-        default: false
-      raw_text:
-        type: optional<boolean>
-        docs: >
-          Set to `true` to enable the data parameter to be parsed as raw text
-          rather than base64 encoded bytes.
-
-          This parameter is useful if you want to send text to be processed by
-          the language model, but it cannot be used with other file types like
-          audio, image, or video.
-        default: false
-      job_details:
-        type: optional<boolean>
-        docs: >
-          Set to `true` to get details about the job.
-
-
-          This parameter can be set in the same payload as data or it can be set
-          without data and models configuration to get the job details between
-          payloads.
-
-
-          This parameter is useful to get the unique job ID.
-        default: false
-      payload_id:
-        type: optional<string>
-        docs: >
-          Pass an arbitrary string as the payload ID and get it back at the top
-          level of the socket response.
-
-
-          This can be useful if you have multiple requests running
-          asynchronously and want to disambiguate responses as they are
-          received.
-    source:
-      openapi: streaming-asyncapi.yml
   StreamModelPredictionsJobDetails:
     docs: >
       If the job_details flag was set in the request, details about the current
@@ -434,5 +231,208 @@ types:
       docs: Warning message
     source:
       openapi: streaming-asyncapi.yml
+  StreamFace:
+    docs: >
+      Configuration for the facial expression emotion model.
+
+
+      Note: Using the `reset_stream` parameter does not have any effect on face
+      identification. A single face identifier cache is maintained over a full
+      session whether `reset_stream` is used or not.
+    properties:
+      facs:
+        type: optional<map<string, unknown>>
+        docs: >-
+          Configuration for FACS predictions. If missing or null, no FACS
+          predictions will be generated.
+      descriptions:
+        type: optional<map<string, unknown>>
+        docs: >-
+          Configuration for Descriptions predictions. If missing or null, no
+          Descriptions predictions will be generated.
+      identify_faces:
+        type: optional<boolean>
+        docs: >
+          Whether to return identifiers for faces across frames. If true, unique
+          identifiers will be assigned to face bounding boxes to differentiate
+          different faces. If false, all faces will be tagged with an "unknown"
+          ID.
+        default: false
+      fps_pred:
+        type: optional<double>
+        docs: >
+          Number of frames per second to process. Other frames will be omitted
+          from the response.
+        default: 3
+      prob_threshold:
+        type: optional<double>
+        docs: >
+          Face detection probability threshold. Faces detected with a
+          probability less than this threshold will be omitted from the
+          response.
+        default: 3
+      min_face_size:
+        type: optional<double>
+        docs: >
+          Minimum bounding box side length in pixels to treat as a face. Faces
+          detected with a bounding box side length in pixels less than this
+          threshold will be omitted from the response.
+        default: 3
+    source:
+      openapi: streaming-asyncapi.yml
+    inline: true
+  StreamLanguage:
+    docs: Configuration for the language emotion model.
+    properties:
+      sentiment:
+        type: optional<map<string, unknown>>
+        docs: >-
+          Configuration for sentiment predictions. If missing or null, no
+          sentiment predictions will be generated.
+      toxicity:
+        type: optional<map<string, unknown>>
+        docs: >-
+          Configuration for toxicity predictions. If missing or null, no
+          toxicity predictions will be generated.
+      granularity:
+        type: optional<string>
+        docs: >-
+          The granularity at which to generate predictions. Values are `word`,
+          `sentence`, `utterance`, or `passage`. To get a single prediction for
+          the entire text of your streaming payload use `passage`. Default value
+          is `word`.
+    source:
+      openapi: streaming-asyncapi.yml
+    inline: true
+  Config:
+    docs: >
+      Configuration used to specify which models should be used and with what
+      settings.
+    properties:
+      burst:
+        type: optional<map<string, unknown>>
+        docs: |
+          Configuration for the vocal burst emotion model.
+
+          Note: Model configuration is not currently available in streaming.
+
+          Please use the default configuration by passing an empty object `{}`.
+      face:
+        type: optional<StreamFace>
+        docs: >
+          Configuration for the facial expression emotion model.
+
+
+          Note: Using the `reset_stream` parameter does not have any effect on
+          face identification. A single face identifier cache is maintained over
+          a full session whether `reset_stream` is used or not.
+      facemesh:
+        type: optional<map<string, unknown>>
+        docs: |
+          Configuration for the facemesh emotion model.
+
+          Note: Model configuration is not currently available in streaming.
+
+          Please use the default configuration by passing an empty object `{}`.
+      language:
+        type: optional<StreamLanguage>
+        docs: Configuration for the language emotion model.
+      prosody:
+        type: optional<map<string, unknown>>
+        docs: |
+          Configuration for the speech prosody emotion model.
+
+          Note: Model configuration is not currently available in streaming.
+
+          Please use the default configuration by passing an empty object `{}`.
+    source:
+      openapi: streaming-asyncapi.yml
+    inline: true
+  StreamModelsEndpointPayload:
+    docs: Models endpoint payload
+    properties:
+      data:
+        type: optional<string>
+      models:
+        type: optional<Config>
+        docs: >
+          Configuration used to specify which models should be used and with
+          what settings.
+      stream_window_ms:
+        type: optional<double>
+        docs: >
+          Length in milliseconds of streaming sliding window.
+
+
+          Extending the length of this window will prepend media context from
+          past payloads into the current payload.
+
+
+          For example, if on the first payload you send 500ms of data and on the
+          second payload you send an additional 500ms of data, a window of at
+          least 1000ms will allow the model to process all 1000ms of stream
+          data.
+
+
+          A window of 600ms would append the full 500ms of the second payload to
+          the last 100ms of the first payload.
+
+
+          Note: This feature is currently only supported for audio data and
+          audio models. For other file types and models this parameter will be
+          ignored.
+        default: 5000
+        validation:
+          min: 500
+          max: 10000
+      reset_stream:
+        type: optional<boolean>
+        docs: >
+          Whether to reset the streaming sliding window before processing the
+          current payload.
+
+
+          If this parameter is set to `true` then past context will be deleted
+          before processing the current payload.
+
+
+          Use reset_stream when one audio file is done being processed and you
+          do not want context to leak across files.
+        default: false
+      raw_text:
+        type: optional<boolean>
+        docs: >
+          Set to `true` to enable the data parameter to be parsed as raw text
+          rather than base64 encoded bytes.
+
+          This parameter is useful if you want to send text to be processed by
+          the language model, but it cannot be used with other file types like
+          audio, image, or video.
+        default: false
+      job_details:
+        type: optional<boolean>
+        docs: >
+          Set to `true` to get details about the job.
+
+
+          This parameter can be set in the same payload as data or it can be set
+          without data and models configuration to get the job details between
+          payloads.
+
+
+          This parameter is useful to get the unique job ID.
+        default: false
+      payload_id:
+        type: optional<string>
+        docs: >
+          Pass an arbitrary string as the payload ID and get it back at the top
+          level of the socket response.
+
+
+          This can be useful if you have multiple requests running
+          asynchronously and want to disambiguate responses as they are
+          received.
+    source:
+      openapi: streaming-asyncapi.yml
 imports:
   streamRoot: __package__.yml
diff --git a/.mock/definition/tts/__package__.yml b/.mock/definition/tts/__package__.yml
index 5c7af469..e094b295 100644
--- a/.mock/definition/tts/__package__.yml
+++ b/.mock/definition/tts/__package__.yml
@@ -73,11 +73,12 @@ service:
                 audio: //PExAA0DDYRvkpNfhv3JI5JZ...etc.
                 snippets:
                   - - audio: //PExAA0DDYRvkpNfhv3JI5JZ...etc.
+                      generation_id: 795c949a-1510-4a80-9646-7d0863b023ab
                       id: 37b1b1b1-1b1b-1b1b-1b1b-1b1b1b1b1b1b
                       text: >-
                         Beauty is no quality in things themselves: It exists
                         merely in the mind which contemplates them.
-                      generation_id: 795c949a-1510-4a80-9646-7d0863b023ab
+                      utterance_index: 0
                 request_id: 66e01f90-4501-4aa0-bbaf-74f45dc15aa725906
   synthesize-file:
     path: /v0/tts/file
@@ -260,9 +261,10 @@ types:
         multiple requests.
       snippets:
         docs: >-
-          A list of speech segments, each containing a portion of the original
-          text optimized for natural speech delivery. These segments represent
-          the input text divided into more natural-sounding units.
+          A list of snippet groups where each group corresponds to an utterance
+          in the request. Each group contains segmented snippets that represent
+          the original utterance divided into more natural-sounding units
+          optimized for speech delivery.
         type: list<list<Snippet>>
     source:
       openapi: tts-openapi.yml
diff --git a/.mock/definition/tts/voices.yml b/.mock/definition/tts/voices.yml
index 8718b05c..e7a2b496 100644
--- a/.mock/definition/tts/voices.yml
+++ b/.mock/definition/tts/voices.yml
@@ -44,7 +44,8 @@ service:
               For example, if `page_size` is set to 10, each page will include up
               to 10 items. Defaults to 10.
-          ascending_order: optional<boolean>
+          ascending_order:
+            type: optional<boolean>
       response:
         docs: Success
         type: root.ReturnPagedVoices
@@ -59,8 +60,10 @@ service:
           voices_page:
             - name: David Hume
               id: c42352c0-4566-455d-b180-0f654b65b525
+              provider: CUSTOM_VOICE
             - name: Goliath Hume
               id: d87352b0-26a3-4b11-081b-d157a5674d19
+              provider: CUSTOM_VOICE
     create:
       path: /v0/tts/voices
       method: POST
@@ -100,6 +103,7 @@ service:
         body:
           name: David Hume
          id: c42352c0-4566-455d-b180-0f654b65b525
+          provider: CUSTOM_VOICE
     delete:
       path: /v0/tts/voices
      method: DELETE
diff --git a/.mock/fern.config.json b/.mock/fern.config.json
index 9d619049..a8152b41 100644
--- a/.mock/fern.config.json
+++ b/.mock/fern.config.json
@@ -1,4 +1,4 @@
 {
   "organization" : "hume",
-  "version" : "0.57.14"
+  "version" : "0.56.23"
 }
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 36ae50e1..1e56dcda 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "hume"
-version = "0.7.13"
+version = "0.7.14"
 description = "A Python SDK for Hume AI"
 readme = "README.md"
 authors = []
diff --git a/src/hume/tts/types/return_generation.py b/src/hume/tts/types/return_generation.py
index 27672325..13af43e1 100644
--- a/src/hume/tts/types/return_generation.py
+++ b/src/hume/tts/types/return_generation.py
@@ -32,7 +32,7 @@ class ReturnGeneration(UniversalBaseModel):
     snippets: typing.List[typing.List[Snippet]] = pydantic.Field()
     """
-    A list of speech segments, each containing a portion of the original text optimized for natural speech delivery. These segments represent the input text divided into more natural-sounding units.
+    A list of snippet groups where each group corresponds to an utterance in the request. Each group contains segmented snippets that represent the original utterance divided into more natural-sounding units optimized for speech delivery.
""" if IS_PYDANTIC_V2: diff --git a/tests/tts/test_root.py b/tests/tts/test_root.py index c95a8215..c70eb237 100644 --- a/tests/tts/test_root.py +++ b/tests/tts/test_root.py @@ -22,9 +22,10 @@ async def test_synthesize_json(client: HumeClient, async_client: AsyncHumeClient [ { "audio": "//PExAA0DDYRvkpNfhv3JI5JZ...etc.", + "generation_id": "795c949a-1510-4a80-9646-7d0863b023ab", "id": "37b1b1b1-1b1b-1b1b-1b1b-1b1b1b1b1b1b", "text": "Beauty is no quality in things themselves: It exists merely in the mind which contemplates them.", - "generation_id": "795c949a-1510-4a80-9646-7d0863b023ab", + "utterance_index": 0, } ] ], @@ -44,7 +45,20 @@ async def test_synthesize_json(client: HumeClient, async_client: AsyncHumeClient "audio": None, "snippets": ( "list", - {0: ("list", {0: {"audio": None, "id": None, "text": None, "generation_id": None}})}, + { + 0: ( + "list", + { + 0: { + "audio": None, + "generation_id": None, + "id": None, + "text": None, + "utterance_index": "integer", + } + }, + ) + }, ), } }, diff --git a/tests/tts/test_voices.py b/tests/tts/test_voices.py index bdb8f88c..564a3007 100644 --- a/tests/tts/test_voices.py +++ b/tests/tts/test_voices.py @@ -9,12 +9,15 @@ async def test_list_(client: HumeClient, async_client: AsyncHumeClient) -> None: expected_response: typing.Any = { "voices_page": [ - {"name": "David Hume", "id": "c42352c0-4566-455d-b180-0f654b65b525"}, - {"name": "Goliath Hume", "id": "d87352b0-26a3-4b11-081b-d157a5674d19"}, + {"name": "David Hume", "id": "c42352c0-4566-455d-b180-0f654b65b525", "provider": "CUSTOM_VOICE"}, + {"name": "Goliath Hume", "id": "d87352b0-26a3-4b11-081b-d157a5674d19", "provider": "CUSTOM_VOICE"}, ] } expected_types: typing.Any = { - "voices_page": ("list", {0: {"name": None, "id": None}, 1: {"name": None, "id": None}}) + "voices_page": ( + "list", + {0: {"name": None, "id": None, "provider": None}, 1: {"name": None, "id": None, "provider": None}}, + ) } response = client.tts.voices.list(provider="CUSTOM_VOICE") validate_response(response, expected_response, expected_types) @@ -24,8 +27,12 @@ async def test_list_(client: HumeClient, async_client: AsyncHumeClient) -> None: async def test_create(client: HumeClient, async_client: AsyncHumeClient) -> None: - expected_response: typing.Any = {"name": "David Hume", "id": "c42352c0-4566-455d-b180-0f654b65b525"} - expected_types: typing.Any = {"name": None, "id": None} + expected_response: typing.Any = { + "name": "David Hume", + "id": "c42352c0-4566-455d-b180-0f654b65b525", + "provider": "CUSTOM_VOICE", + } + expected_types: typing.Any = {"name": None, "id": None, "provider": None} response = client.tts.voices.create(generation_id="795c949a-1510-4a80-9646-7d0863b023ab", name="David Hume") validate_response(response, expected_response, expected_types)