
Commit 5949c2f

docs: update docs for name and destination changes
fix: destination_table( not just destination(

Update docs/vectorizer/api-reference.md
Co-authored-by: Matvey Arye <[email protected]>
Signed-off-by: Jascha Beste <[email protected]>

Update docs/vectorizer/api-reference.md
Co-authored-by: Matvey Arye <[email protected]>
Signed-off-by: Jascha Beste <[email protected]>

docs: fix some typos and more precise language
1 parent d8bb4f3 commit 5949c2f

File tree

11 files changed: +288 -66 lines


docs/vectorizer-quick-start.md

+2-2
@@ -90,8 +90,8 @@ Now we can create and run a vectorizer. A vectorizer is a pgai concept, it proce
 SELECT ai.create_vectorizer(
     'blog'::regclass,
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
-    embedding => ai.embedding_ollama('nomic-embed-text', 768),
+    destination => ai.destination_table('blog_contents_embeddings'),
+    embedding => ai.embedding_ollama('nomic-embed-text', 768)
 );
 ```

docs/vectorizer/api-reference.md

+205-42
Large diffs are not rendered by default.

docs/vectorizer/overview.md

+56-4
@@ -117,8 +117,9 @@ query like this:
 ```sql
 SELECT ai.create_vectorizer(
     'blog'::regclass,
+    name => 'blog_embeddings', -- Optional custom name for easier reference
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
+    destination => ai.destination_table('blog_contents_embeddings'),
     embedding => ai.embedding_ollama('nomic-embed-text', 768)
 );
 ```
@@ -150,7 +151,7 @@ into each chunk:
 SELECT ai.create_vectorizer(
     'blog'::regclass,
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
+    destination => ai.destination_table('blog_contents_embeddings'),
     embedding => ai.embedding_ollama('nomic-embed-text', 768),
     formatting => ai.formatting_python_template('$title: $chunk')
 );
@@ -284,7 +285,7 @@ accordingly:
 SELECT ai.create_vectorizer(
     'blog'::regclass,
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
+    destination => ai.destination_table('blog_contents_embeddings'),
     embedding => ai.embedding_ollama('nomic-embed-text', 768),
     formatting => ai.formatting_python_template('$title - by $author - $chunk')
 );
@@ -304,7 +305,7 @@ example uses a HNSW index:
 SELECT ai.create_vectorizer(
     'blog'::regclass,
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
+    destination => ai.destination_table('blog_contents_embeddings'),
     embedding => ai.embedding_ollama('nomic-embed-text', 768),
     formatting => ai.formatting_python_template('$title - by $author - $chunk'),
     indexing => ai.indexing_hnsw(min_rows => 100000, opclass => 'vector_l2_ops')
@@ -344,6 +345,57 @@ CREATE TABLE blog_contents_embeddings_store(
 );
 ```

+## Destination Options for Embeddings
+
+Vectorizer supports two ways to store your embeddings:
+
+### 1. Table Destination (Default)
+
+The default approach creates a separate table to store embeddings, plus a view that joins it with the source table:
+
+```sql
+SELECT ai.create_vectorizer(
+    'blog'::regclass,
+    name => 'blog_vectorizer', -- Optional custom name for easier reference
+    loading => ai.loading_column('contents'),
+    destination => ai.destination_table(
+        target_schema => 'public',
+        target_table => 'blog_embeddings_store',
+        view_name => 'blog_embeddings'
+    ),
+    embedding => ai.embedding_ollama('nomic-embed-text', 768)
+);
+```
+
+**When to use table destination:**
+- When you need multiple embeddings per row (chunking)
+- For large text fields that need to be split
+- When you are vectorizing documents (which typically require chunking)
+
+### 2. Column Destination
+
+For simpler cases, you can add an embedding column directly to the source table. This can only be used when the vectorizer does not perform chunking, because it requires a one-to-one relationship between the source data and the embedding. It is useful when you know the source text is short (as is common when chunking has already been done upstream in your data pipeline).
+
+The workflow: your application inserts a row with a NULL in the embedding column; the vectorizer then reads the row, generates the embedding, and updates the row with the embedding value.
+```sql
+SELECT ai.create_vectorizer(
+    'product_descriptions'::regclass,
+    name => 'product_descriptions_vectorizer',
+    loading => ai.loading_column('description'),
+    destination => ai.destination_column('description_embedding'),
+    embedding => ai.embedding_openai('text-embedding-3-small', 768),
+    chunking => ai.chunking_none() -- Required for column destination
+);
+```
+
+**When to use column destination:**
+- When you need exactly one embedding per row
+- For shorter text that doesn't require chunking
+- When your application already takes care of chunking before inserting into the database
+- When you want to avoid creating additional database objects
+
+**Note:** Column destination requires chunking to be set to `ai.chunking_none()` since it can only store one embedding per row.
+
 ## Monitor a vectorizer

 Since embeddings are created asynchronously, a delay may occur before they
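
For context on the table-destination layout added above: once the vectorizer has populated `blog_embeddings_store`, searches typically go through the generated view. The following is a minimal sketch only; it assumes the view is named `blog_embeddings` as configured above, that it exposes the source columns plus `chunk` and `embedding`, and that pgai's `ai.ollama_embed` helper and pgvector's `<=>` cosine-distance operator are available.

```sql
-- Sketch: semantic search over the view created by the table destination.
-- Assumes the view exposes `title`, `chunk`, and `embedding` columns.
SELECT title, chunk
FROM blog_embeddings
ORDER BY embedding <=> ai.ollama_embed('nomic-embed-text', 'postgres performance tips')
LIMIT 5;
```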

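The column-destination workflow described above (insert a row with a NULL embedding, let the vectorizer fill it in) can be pictured with a short sketch against the `product_descriptions` example. The sample data and the `id` column are illustrative assumptions.

```sql
-- The application inserts rows without providing an embedding...
INSERT INTO product_descriptions (description)
VALUES ('Stainless steel water bottle, 750 ml, vacuum insulated');

-- ...and once the vectorizer has processed the row, description_embedding is populated:
SELECT id, description, description_embedding IS NOT NULL AS embedded
FROM product_descriptions;
```
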
docs/vectorizer/python-integration.md

+12-5
@@ -13,11 +13,14 @@ Then you can create a vectorizer from python:

 ```python
 from pgai.vectorizer import CreateVectorizer
-from pgai.vectorizer.configuration import EmbeddingOpenaiConfig, ChunkingCharacterTextSplitterConfig, FormattingPythonTemplateConfig, LoadingColumnConfig
+from pgai.vectorizer.configuration import EmbeddingOpenaiConfig, ChunkingCharacterTextSplitterConfig, FormattingPythonTemplateConfig, LoadingColumnConfig, DestinationTableConfig

 vectorizer_statement = CreateVectorizer(
     source="blog",
-    target_table='blog_embeddings',
+    name="blog_content_embedder", # Optional custom name for easier reference
+    destination=DestinationTableConfig(
+        destination='blog_embeddings'
+    ),
     loading=LoadingColumnConfig(column_name='content'),
     embedding=EmbeddingOpenaiConfig(
         model='text-embedding-3-small',
@@ -237,14 +240,18 @@ from pgai.vectorizer.configuration import (
     EmbeddingOpenaiConfig,
     ChunkingCharacterTextSplitterConfig,
     FormattingPythonTemplateConfig,
-    LoadingColumnConfig
+    LoadingColumnConfig,
+    DestinationTableConfig
 )


 def upgrade() -> None:
     op.create_vectorizer(
         source="blog",
-        target_table='blog_embeddings',
+        name="blog_content_embedder", # Optional custom name for easier reference
+        destination=DestinationTableConfig(
+            destination='blog_embeddings'
+        ),
         loading=LoadingColumnConfig(column_name='content'),
         embedding=EmbeddingOpenaiConfig(
             model='text-embedding-3-small',
@@ -261,7 +268,7 @@ def upgrade() -> None:


 def downgrade() -> None:
-    op.drop_vectorizer(target_table="blog_embeddings", drop_all=True)
+    op.drop_vectorizer(name="blog_content_embedder", drop_all=True)
 ```

 The `create_vectorizer` operation supports all configuration options available in the [SQL API](/docs/vectorizer/api-reference.md).
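
As a usage note for the Python configuration in this file's diff: `CreateVectorizer` builds a SQL statement, and the pgai docs describe rendering it with `to_sql()`. Executing the definition might then look roughly like the sketch below; the psycopg connection handling and the DSN are assumptions, not part of this commit's docs.

```python
# Sketch: executing the CreateVectorizer statement built in the snippet above.
# `vectorizer_statement` is the CreateVectorizer instance from the diff;
# to_sql() renders it as a `SELECT ai.create_vectorizer(...)` statement.
import psycopg

DB_URL = "postgresql://postgres:postgres@localhost:5432/postgres"  # assumed DSN

with psycopg.connect(DB_URL) as conn:
    with conn.cursor() as cur:
        cur.execute(vectorizer_statement.to_sql())
    conn.commit()
```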

docs/vectorizer/quick-start-openai.md

+1-1
@@ -92,7 +92,7 @@ To create and run a vectorizer, then query the auto-generated embeddings created
 SELECT ai.create_vectorizer(
     'blog'::regclass,
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
+    destination => ai.destination_table('blog_contents_embeddings'),
     embedding => ai.embedding_openai('text-embedding-3-small', 768)
 );
 ```

docs/vectorizer/quick-start-voyage.md

+1-1
@@ -88,7 +88,7 @@ Now you can create and run a vectorizer. A vectorizer is a pgai concept, it proc
 SELECT ai.create_vectorizer(
     'blog'::regclass,
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
+    destination => ai.destination_table('blog_contents_embeddings'),
     embedding => ai.embedding_voyageai(
         'voyage-3-lite',
         512

docs/vectorizer/quick-start.md

+1-1
@@ -90,7 +90,7 @@ Now we can create and run a vectorizer. A vectorizer is a pgai concept, it proce
 SELECT ai.create_vectorizer(
     'blog'::regclass,
     loading => ai.loading_column('contents'),
-    destination => 'blog_contents_embeddings',
+    destination => ai.destination_table('blog_contents_embeddings'),
     embedding => ai.embedding_ollama('nomic-embed-text', 768)
 );
 ```

examples/embeddings_from_documents/documents/pgai.md

+1-1
@@ -120,7 +120,7 @@ Please note that using Ollama requires a large (>4GB) download of the docker ima
 ```sql
 SELECT ai.create_vectorizer(
     'wiki'::regclass,
-    destination => 'wiki_embeddings',
+    destination => ai.destination_table('wiki_embeddings'),
     embedding => ai.embedding_ollama('all-minilm', 384),
     chunking => ai.chunking_recursive_character_text_splitter('text')
 );

examples/evaluations/litellm_vectorizer/README.md

+3-3
@@ -54,7 +54,7 @@ The evaluation generates diverse question types (short, long, direct, implied, a

 SELECT ai.create_vectorizer(
     'paul_graham_essays'::regclass,
-    destination => 'essays_cohere_embeddings',
+    destination => ai.destination_table('essays_cohere_embeddings'),
     embedding => ai.embedding_litellm(
         'cohere/embed-english-v3.0',
         1024,
@@ -65,7 +65,7 @@ The evaluation generates diverse question types (short, long, direct, implied, a

 SELECT ai.create_vectorizer(
     'paul_graham_essays'::regclass,
-    destination => 'essays_mistral_embeddings',
+    destination => ai.destination_table('essays_mistral_embeddings'),
     embedding => ai.embedding_litellm(
         'mistral/mistral-embed',
         1024,
@@ -76,7 +76,7 @@ The evaluation generates diverse question types (short, long, direct, implied, a

 SELECT ai.create_vectorizer(
     'paul_graham_essays'::regclass,
-    destination => 'essays_openai_small_embeddings',
+    destination => ai.destination_table('essays_openai_small_embeddings'),
     embedding => ai.embedding_openai(
         'text-embedding-3-small',
         1024,

examples/evaluations/ollama_vectorizer/README.md

+4-4
@@ -61,7 +61,7 @@ Dataset Setup:
 SELECT ai.create_vectorizer(
     'pg_essays'::regclass,
     loading => ai.loading_column('text'),
-    destination => 'essays_nomic_embeddings',
+    destination => ai.destination_table('essays_nomic_embeddings'),
     embedding => ai.embedding_ollama('nomic-embed-text', 768),
     chunking => ai.chunking_recursive_character_text_splitter(512, 50)
 );
@@ -70,7 +70,7 @@ Dataset Setup:
 SELECT ai.create_vectorizer(
     'pg_essays'::regclass,
     loading => ai.loading_column('text'),
-    destination => 'essays_openai_small_embeddings',
+    destination => ai.destination_table('essays_openai_small_embeddings'),
     embedding => ai.embedding_openai('text-embedding-3-small', 768),
     chunking => ai.chunking_recursive_character_text_splitter(512, 50)
 );
@@ -79,7 +79,7 @@ Dataset Setup:
 SELECT ai.create_vectorizer(
     'pg_essays'::regclass,
     loading => ai.loading_column('text'),
-    destination => 'essays_bge_large_embeddings',
+    destination => ai.destination_table('essays_bge_large_embeddings'),
     embedding => ai.embedding_ollama('bge-large', 1024),
     chunking => ai.chunking_recursive_character_text_splitter(512, 50)
 );
@@ -88,7 +88,7 @@ Dataset Setup:
 SELECT ai.create_vectorizer(
     'pg_essays'::regclass,
     loading => ai.loading_column('text'),
-    destination => 'essays_openai_large_embeddings',
+    destination => ai.destination_table('essays_openai_large_embeddings'),
     embedding => ai.embedding_openai('text-embedding-3-large', 1536),
     chunking => ai.chunking_recursive_character_text_splitter(512, 50)
 );

examples/evaluations/voyage_vectorizer/README.md

+2-2
@@ -69,7 +69,7 @@ Dataset Setup:
 SELECT ai.create_vectorizer(
     'sec_filings'::regclass,
     loading => ai.loading_column('text'),
-    destination => 'sec_filings_openai_embeddings',
+    destination => ai.destination_table('sec_filings_openai_embeddings'),
     embedding => ai.embedding_openai('text-embedding-3-small', 768),
     chunking => ai.chunking_recursive_character_text_splitter(512, 50)
 );
@@ -78,7 +78,7 @@ Dataset Setup:
 SELECT ai.create_vectorizer(
     'sec_filings'::regclass,
     loading => ai.loading_column('text'),
-    destination => 'sec_filings_voyage_embeddings',
+    destination => ai.destination_table('sec_filings_voyage_embeddings'),
     embedding => ai.embedding_voyageai('voyage-finance-2', 1024),
     chunking => ai.chunking_recursive_character_text_splitter(512, 50)
 );
