z3z1ma
diff --git a/‎pyproject.toml‎
Lines changed: 2 additions & 2 deletions b/‎pyproject.toml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/dbt_osmosis/cli/main.py‎
Lines changed: 17 additions & 0 deletions b/‎src/dbt_osmosis/cli/main.py‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎src/dbt_osmosis/core/llm.py‎
Lines changed: 233 additions & 38 deletions b/‎src/dbt_osmosis/core/llm.py‎
Lines changed: 233 additions & 38 deletions
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "dbt-osmosis"
-version = "1.1.1"
+version = "1.1.2"
 description = "A dbt utility for managing YAML to make developing with dbt more delightful."
 readme = "README.md"
 license = { text = "Apache-2.0" }
@@ -43,7 +43,7 @@ workbench = [
   "setuptools>=70",
 ]
 
-openai = ["openai>0.28.0"]
+openai = ["openai~=1.58.1"]
 
 dev = [
   "ruff~=0.8.4",
 
@@ -28,6 +28,7 @@
     sort_columns_as_in_database,
     sync_node_to_yaml,
     synchronize_data_types,
+    synthesize_missing_documentation_with_openai,
 )
 
 T = t.TypeVar("T")
@@ -210,6 +211,11 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
     is_flag=True,
     help="Automatically apply the restructure plan without confirmation.",
 )
+@click.option(
+    "--synthesize",
+    is_flag=True,
+    help="Automatically synthesize missing documentation with OpenAI.",
+)
 def refactor(
     target: str | None = None,
     profile: str | None = None,
@@ -219,6 +225,7 @@ def refactor(
     auto_apply: bool = False,
     check: bool = False,
     threads: int | None = None,
+    synthesize: bool = False,
     **kwargs: t.Any,
 ) -> None:
     """Executes organize which syncs yaml files with database schema and organizes the dbt models
@@ -254,6 +261,8 @@ def refactor(
     inherit_upstream_column_knowledge(context=context)
     sort_columns_as_in_database(context=context)
     synchronize_data_types(context=context)
+    if synthesize:
+        synthesize_missing_documentation_with_openai(context=context)
     sync_node_to_yaml(context=context)
     commit_yamls(context=context)
 
@@ -382,6 +391,11 @@ def organize(
     is_flag=True,
     help="Automatically apply the restructure plan without confirmation.",
 )
+@click.option(
+    "--synthesize",
+    is_flag=True,
+    help="Automatically synthesize missing documentation with OpenAI.",
+)
 def document(
     target: str | None = None,
     profile: str | None = None,
@@ -390,6 +404,7 @@ def document(
     vars: str | None = None,
     check: bool = False,
     threads: int | None = None,
+    synthesize: bool = False,
     **kwargs: t.Any,
 ) -> None:
     """Column level documentation inheritance for existing models
@@ -418,6 +433,8 @@ def document(
     inject_missing_columns(context=context)
     inherit_upstream_column_knowledge(context=context)
     sort_columns_as_in_database(context=context)
+    if synthesize:
+        synthesize_missing_documentation_with_openai(context=context)
     sync_node_to_yaml(context=context)
     commit_yamls(context=context)
 
 
@@ -1,51 +1,246 @@
-# WIP: this will eventually be a class that will handle the LLM process
+"""Supplementary module for LLM synthesis of dbt documentation."""
 
+import json
+import os
+import typing as t
+from textwrap import dedent
 
-def create_llm_prompt(sql_content: str, filename: str) -> list[dict[str, str]]:
-    header = """DO NOT ADD A HEADER TO DBT YAML.
-THIS CODE WILL APPEND TO AN EXISTING YAML FILE.
+import openai
 
-Examples of YAML structure:
+openai.api_key = os.environ.get("OPENAI_API_KEY")
 
-"""
-    prompt = f"""
-You are a helpful SQL Developer and Expert in dbt.
-Your job is to receive a SQL and generate the YAML in dbt format.
-You will not respond anything else, just the YAML code formated to be saved into a file.
+__all__ = [
+    "generate_model_spec_as_json",
+    "generate_column_doc",
+]
 
-IMPORTANT RULES:
 
-1. DO NOT PROSE.
-2. DO NOT DEVIATE OR INVENT FROM THE CONTEXT.
-3. Always follow dbt convetion!
-4. The context will always be ONE FULL SQL.
-5. DO NOT WRAP WITH MARKDOWN.
-6. The model name will always be the file name.
-7. NO NEW LINE BETWEEN COLUMNS!
+def _create_llm_prompt_for_model_docs_as_json(
+    sql_content: str,
+    existing_context: str | None = None,
+    upstream_docs: list[str] | None = None,
+) -> list[dict[str, t.Any]]:
+    """Builds a system + user prompt instructing the model to produce a JSON structure describing the entire model (including columns)."""
+    if upstream_docs is None:
+        upstream_docs = []
 
-{header}
+    example_json = dedent("""\
+    {
+      "description": "A short description for the model",
+      "columns": [
+        {
+          "name": "id",
+          "description": "Unique identifier for each record",
+        },
+        {
+          "name": "email",
+          "description": "User email address",
+        }
+      ]
+    }
+    """)
 
-  - name: model_name
-    description: markdown_string
+    system_prompt = dedent(f"""
+    You are a helpful SQL Developer and an Expert in dbt.
+    You must produce a JSON object that documents a single model and its columns.
+    The object must match the structure shown below.
+    DO NOT WRITE EXTRA EXPLANATION OR MARKDOWN FENCES, ONLY VALID JSON.
 
-    columns:
-      - name: column_name
-        description: markdown_string
-      - name: column_name
-        description: markdown_string
-      - name: column_name
-        description: markdown_string
-      - name: column_name
-        description: markdown_string
+    Example of desired JSON structure:
+    {example_json}
 
-INCLUDE TESTS IF YOU KNOW WHAT THE COLUMN NEEDS.
+    IMPORTANT RULES:
+    1. "description" should be short and gleaned from the SQL or the provided docs if possible.
+    2. "columns" is an array of objects. Each object MUST contain:
+       - "name": the column name
+       - "description": short explanation of what the column is
+    3. If you have "upstream_docs", you may incorporate them as you see fit, but do NOT invent details.
+    4. Do not output any extra text besides valid JSON.
+    """)
 
-File Name to be used as MODEL NAME: {filename}
+    user_message = dedent(f"""
+    The SQL for the model is:
 
-Convert the following DBT SQL code to YAML:
-"""
-    messages = [
-        {"role": "system", "content": prompt},
-        {"role": "user", "content": sql_content},
+    >>> SQL CODE START
+    {sql_content}
+    >>> SQL CODE END
+
+    The context for the model is:
+    {existing_context or "(none)"}
+
+    The upstream documentation is:
+    {os.linesep.join(upstream_docs)}
+
+    Please return only a valid JSON that matches the structure described above.
+    """)
+
+    return [
+        {"role": "system", "content": system_prompt.strip()},
+        {"role": "user", "content": user_message.strip()},
+    ]
+
+
+def _create_llm_prompt_for_column(
+    column_name: str,
+    existing_context: str | None = None,
+    table_name: str | None = None,
+    upstream_docs: list[str] | None = None,
+) -> list[dict[str, str]]:
+    """Builds a system + user prompt for generating a docstring for a single column. The final answer should be just the docstring text, not JSON or YAML."""
+    if upstream_docs is None:
+        upstream_docs = []
+
+    table_context = f"in the table '{table_name}'." if table_name else "."
+
+    system_prompt = dedent(f"""
+    You are a helpful SQL Developer and an Expert in dbt.
+    Your job is to produce a concise documentation string
+    for a single column {table_context}
+
+    IMPORTANT RULES:
+    1. DO NOT output extra commentary or Markdown fences.
+    2. Provide only the column description text, nothing else.
+    3. If upstream docs exist, you may incorporate them. If none exist,
+       a short placeholder is acceptable.
+    4. Avoid speculation. Keep it short and relevant.
+    """)
+
+    user_message = dedent(f"""
+    The column name is: {column_name}
+
+    Existing context:
+    {existing_context or "(none)"}
+
+    Upstream docs:
+    {os.linesep.join(upstream_docs)}
+
+    Return ONLY the text suitable for the "description" field.
+    """)
+
+    return [
+        {"role": "system", "content": system_prompt.strip()},
+        {"role": "user", "content": user_message.strip()},
+    ]
+
+
+def generate_model_spec_as_json(
+    sql_content: str,
+    upstream_docs: list[str] | None = None,
+    existing_context: str | None = None,
+    model_engine: str = "gpt-4o",
+    temperature: float = 0.3,
+) -> dict[str, t.Any]:
+    """Calls OpenAI to generate a JSON specification for a model's metadata and columns.
+
+    The structure is:
+      {
+        "description": "...",
+        "columns": [
+          {"name": "...", "description": "..."},
+          ...
+        ]
+      }
+
+    Args:
+        sql_content (str): Full SQL code of the model
+        upstream_docs (list[str] | None): Optional list of strings containing context or upstream docs
+        model_engine (str): Which OpenAI model to use (e.g., 'gpt-3.5-turbo', 'gpt-4')
+        temperature (float): OpenAI completion temperature
+
+    Returns:
+        dict[str, t.Any]: A dictionary with keys "description", "columns".
+    """
+    messages = _create_llm_prompt_for_model_docs_as_json(
+        sql_content, existing_context, upstream_docs
+    )
+    response = openai.chat.completions.create(
+        model=model_engine,
+        messages=messages,  # pyright: ignore[reportArgumentType]
+        temperature=temperature,
+    )
+
+    content = response.choices[0].message.content
+    if content is None:
+        raise ValueError("OpenAI returned an empty response")
+    content = content.strip()
+    try:
+        data = t.cast(dict[str, t.Any], json.loads(content))
+    except json.JSONDecodeError:
+        raise ValueError("OpenAI returned invalid JSON:\n" + content)
+
+    return data
+
+
+def generate_column_doc(
+    column_name: str,
+    existing_context: str | None = None,
+    table_name: str | None = None,
+    upstream_docs: list[str] | None = None,
+    model_engine: str = "gpt-4o",
+    temperature: float = 0.7,
+) -> str:
+    """Calls OpenAI to generate documentation for a single column in a table.
+
+    Args:
+        column_name (str): The name of the column to describe
+        existing_context (str | None): Any relevant metadata or table definitions
+        table_name (str | None): Name of the table/model (optional)
+        upstream_docs (list[str] | None): Optional docs or references you might have
+        model_engine (str): The OpenAI model to use (e.g., 'gpt-3.5-turbo')
+        temperature (float): OpenAI completion temperature
+
+    Returns:
+        str: A short docstring suitable for a "description" field
+    """
+    messages = _create_llm_prompt_for_column(
+        column_name, existing_context, table_name, upstream_docs
+    )
+    response = openai.chat.completions.create(
+        model=model_engine,
+        messages=messages,  # pyright: ignore[reportArgumentType]
+        temperature=temperature,
+    )
+
+    content = response.choices[0].message.content
+    if not content:
+        raise ValueError("OpenAI returned an empty response")
+    return content.strip()
+
+
+if __name__ == "__main__":
+    # Kitchen sink
+    sample_sql = """
+        SELECT
+            user_id,
+            email,
+            created_at,
+            is_active
+        FROM some_source_table
+        WHERE created_at > '2021-01-01'
+    """
+    docs = [
+        "user_id: unique integer ID for each user",
+        "email: user email address",
+        "created_at: record creation time",
+        "is_active: boolean flag indicating active user",
     ]
-    return messages
+    model_spec = generate_model_spec_as_json(
+        sql_content=sample_sql,
+        upstream_docs=docs,
+        model_engine="gpt-3.5-turbo",
+        temperature=0.3,
+    )
+
+    print("\n=== Generated Model JSON Spec ===")
+    print(json.dumps(model_spec, indent=2))
+
+    col_doc = generate_column_doc(
+        column_name="email",
+        existing_context="This table tracks basic user information.",
+        table_name="user_activity_model",
+        upstream_docs=["Stores the user's primary email address."],
+        model_engine="gpt-3.5-turbo",
+        temperature=0.2,
+    )
+    print("\n=== Single Column Documentation ===")
+    print(f"Column: email => {col_doc}")