Add tuning for embeddings (#69)

vhaldemar · web-flow · commit 363776a94838 · 2025-03-19T12:38:07.000+01:00
diff --git a/examples/async/tuning/embeddings.py b/examples/async/tuning/embeddings.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import asyncio
+import pathlib
+import uuid
+
+from yandex_cloud_ml_sdk import AsyncYCloudML
+
+
+def local_path(path: str) -> pathlib.Path:
+    return pathlib.Path(__file__).parent / path
+
+
+async def get_datasets(sdk, name, dataset_function):
+    """
+    This function gets an existing dataset object or creates a new one.
+
+    In real life you could just use dataset ids, for example:
+
+    ```
+    dataset = await sdk.datasets.get("some_id")
+    tuning_task = await base_model.tune_deferred(
+        "dataset_id",
+        validation_datasets=dataset
+    )
+    ```
+    """
+
+    async for dataset in sdk.datasets.list(status='READY', name_pattern=name):
+        print(f'using old dataset {dataset=}')
+        break
+    else:
+        print('no old datasets found, creating new one')
+        dataset_draft = dataset_function.draft_from_path(
+            path=local_path(f'{name}.jsonlines'),
+            upload_format='jsonlines',
+            name=name,
+        )
+
+        dataset = await dataset_draft.upload()
+        print(f'created new dataset {dataset=}')
+
+    return dataset, dataset
+
+
+async def main() -> None:
+    sdk = AsyncYCloudML(folder_id='b1ghsjum2v37c2un8h64')
+    sdk.setup_default_logging()
+    base_model = sdk.models.text_embeddings('yandexgpt-lite')
+
+    for name, tune_type, dataset_function in [
+        ('embeddings_pair', 'pair', sdk.datasets.text_embeddings_pair),
+        ('embeddings_triplet', 'triplet', sdk.datasets.text_embeddings_triplet),
+    ]:
+        train_dataset, validation_dataset = await get_datasets(sdk, name, dataset_function)
+        result = await base_model.run("hi")
+        print(f'pretrain model inference result: {result}')
+
+        # `.tune(...)` is a shortcut for:
+        # tuning_task = await base_model.tune_deferred(...)
+        # new_model = await tuning_task.wait(...)
+        # But it gives you less control over tuning cancellation and
+        # reporting.
+        new_model = await base_model.tune(
+            train_dataset,
+            validation_datasets=validation_dataset,
+            embeddings_tune_type=tune_type,
+            name=str(uuid.uuid4())
+        )
+        print(f'resulting {new_model}')
+
+        # you can save model.uri somewhere and reuse it later
+        tuned_uri = new_model.uri
+        model = sdk.models.text_embeddings(tuned_uri)
+        result = await model.run("hi")
+        print(f'posttrain model inference result: {result}')
+
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/examples/async/tuning/embeddings_pair.jsonlines b/examples/async/tuning/embeddings_pair.jsonlines
@@ -0,0 +1 @@
+{"anchor": "hello", "positive": "hi"}
diff --git a/examples/async/tuning/embeddings_triplet.jsonlines b/examples/async/tuning/embeddings_triplet.jsonlines
@@ -0,0 +1 @@
+{"anchor": "hello", "positive": "hi", "negative": "bye"}
diff --git a/examples/sync/tuning/embeddings.py b/examples/sync/tuning/embeddings.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import pathlib
+import uuid
+
+from yandex_cloud_ml_sdk import YCloudML
+
+
+def local_path(path: str) -> pathlib.Path:
+    return pathlib.Path(__file__).parent / path
+
+
+def get_datasets(sdk, name, dataset_function):
+    """
+    This function gets an existing dataset object or creates a new one.
+
+    In real life you could just use dataset ids, for example:
+
+    ```
+    dataset = sdk.datasets.get("some_id")
+    tuning_task = base_model.tune_deferred(
+        "dataset_id",
+        validation_datasets=dataset
+    )
+    ```
+    """
+
+    for dataset in sdk.datasets.list(status='READY', name_pattern=name):
+        print(f'using old dataset {dataset=}')
+        break
+    else:
+        print('no old datasets found, creating new one')
+        dataset_draft = dataset_function.draft_from_path(
+            path=local_path(f'{name}.jsonlines'),
+            upload_format='jsonlines',
+            name=name,
+        )
+
+        dataset = dataset_draft.upload()
+        print(f'created new dataset {dataset=}')
+
+    return dataset, dataset
+
+
+def main() -> None:
+    sdk = YCloudML(folder_id='b1ghsjum2v37c2un8h64')
+    sdk.setup_default_logging()
+    base_model = sdk.models.text_embeddings('yandexgpt-lite')
+
+    for name, tune_type, dataset_function in [
+        ('embeddings_pair', 'pair', sdk.datasets.text_embeddings_pair),
+        ('embeddings_triplet', 'triplet', sdk.datasets.text_embeddings_triplet),
+    ]:
+        train_dataset, validation_dataset = get_datasets(sdk, name, dataset_function)
+        result = base_model.run("hi")
+        print(f'pretrain model inference result: {result}')
+
+        # `.tune(...)` is a shortcut for:
+        # tuning_task = base_model.tune_deferred(...)
+        # new_model = tuning_task.wait(...)
+        # But it gives you less control over tuning cancellation and
+        # reporting.
+        new_model = base_model.tune(
+            train_dataset,
+            validation_datasets=validation_dataset,
+            embeddings_tune_type=tune_type,
+            name=str(uuid.uuid4())
+        )
+        print(f'resulting {new_model}')
+
+        # you can save model.uri somewhere and reuse it later
+        tuned_uri = new_model.uri
+        model = sdk.models.text_embeddings(tuned_uri)
+        result = model.run("hi")
+        print(f'posttrain model inference result: {result}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/sync/tuning/embeddings_pair.jsonlines b/examples/sync/tuning/embeddings_pair.jsonlines
@@ -0,0 +1 @@
+{"anchor": "hello", "positive": "hi"}
diff --git a/examples/sync/tuning/embeddings_triplet.jsonlines b/examples/sync/tuning/embeddings_triplet.jsonlines
@@ -0,0 +1 @@
+{"anchor": "hello", "positive": "hi", "negative": "bye"}
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,7 +33,7 @@ classifiers = [
 requires-python = ">=3.9"
 dynamic = ["version"]
 dependencies = [
-    "yandexcloud>=0.334.0",
+    "yandexcloud>=0.335.0",
     "grpcio>=1.70.0",
     "get-annotations",
     "httpx>=0.27,<1",
diff --git a/src/yandex_cloud_ml_sdk/_models/text_embeddings/model.py b/src/yandex_cloud_ml_sdk/_models/text_embeddings/model.py
diff --git a/src/yandex_cloud_ml_sdk/_models/text_embeddings/tune_params.py b/src/yandex_cloud_ml_sdk/_models/text_embeddings/tune_params.py
diff --git a/src/yandex_cloud_ml_sdk/_types/tuning/params.py b/src/yandex_cloud_ml_sdk/_types/tuning/params.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+{"anchor": "hello", "positive": "hi", "negative": "bye"}`