Skip to content

Commit b222cec

Browse files
committed
feat: add support for llm documentation synthesis via openai
1 parent 5ce84bc commit b222cec

File tree

6 files changed

+315
-43
lines changed

6 files changed

+315
-43
lines changed

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "dbt-osmosis"
7-
version = "1.1.1"
7+
version = "1.1.2"
88
description = "A dbt utility for managing YAML to make developing with dbt more delightful."
99
readme = "README.md"
1010
license = { text = "Apache-2.0" }
@@ -43,7 +43,7 @@ workbench = [
4343
"setuptools>=70",
4444
]
4545

46-
openai = ["openai>0.28.0"]
46+
openai = ["openai~=1.58.1"]
4747

4848
dev = [
4949
"ruff~=0.8.4",

src/dbt_osmosis/cli/main.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
sort_columns_as_in_database,
2929
sync_node_to_yaml,
3030
synchronize_data_types,
31+
synthesize_missing_documentation_with_openai,
3132
)
3233

3334
T = t.TypeVar("T")
@@ -210,6 +211,11 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
210211
is_flag=True,
211212
help="Automatically apply the restructure plan without confirmation.",
212213
)
214+
@click.option(
215+
"--synthesize",
216+
is_flag=True,
217+
help="Automatically synthesize missing documentation with OpenAI.",
218+
)
213219
def refactor(
214220
target: str | None = None,
215221
profile: str | None = None,
@@ -219,6 +225,7 @@ def refactor(
219225
auto_apply: bool = False,
220226
check: bool = False,
221227
threads: int | None = None,
228+
synthesize: bool = False,
222229
**kwargs: t.Any,
223230
) -> None:
224231
"""Executes organize which syncs yaml files with database schema and organizes the dbt models
@@ -254,6 +261,8 @@ def refactor(
254261
inherit_upstream_column_knowledge(context=context)
255262
sort_columns_as_in_database(context=context)
256263
synchronize_data_types(context=context)
264+
if synthesize:
265+
synthesize_missing_documentation_with_openai(context=context)
257266
sync_node_to_yaml(context=context)
258267
commit_yamls(context=context)
259268

@@ -382,6 +391,11 @@ def organize(
382391
is_flag=True,
383392
help="Automatically apply the restructure plan without confirmation.",
384393
)
394+
@click.option(
395+
"--synthesize",
396+
is_flag=True,
397+
help="Automatically synthesize missing documentation with OpenAI.",
398+
)
385399
def document(
386400
target: str | None = None,
387401
profile: str | None = None,
@@ -390,6 +404,7 @@ def document(
390404
vars: str | None = None,
391405
check: bool = False,
392406
threads: int | None = None,
407+
synthesize: bool = False,
393408
**kwargs: t.Any,
394409
) -> None:
395410
"""Column level documentation inheritance for existing models
@@ -418,6 +433,8 @@ def document(
418433
inject_missing_columns(context=context)
419434
inherit_upstream_column_knowledge(context=context)
420435
sort_columns_as_in_database(context=context)
436+
if synthesize:
437+
synthesize_missing_documentation_with_openai(context=context)
421438
sync_node_to_yaml(context=context)
422439
commit_yamls(context=context)
423440

src/dbt_osmosis/core/llm.py

Lines changed: 233 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,246 @@
1-
# WIP: this will eventually be a class that will handle the LLM process
1+
"""Supplementary module for LLM synthesis of dbt documentation."""
22

3+
import json
4+
import os
5+
import typing as t
6+
from textwrap import dedent
37

4-
def create_llm_prompt(sql_content: str, filename: str) -> list[dict[str, str]]:
5-
header = """DO NOT ADD A HEADER TO DBT YAML.
6-
THIS CODE WILL APPEND TO AN EXISTING YAML FILE.
8+
import openai
79

8-
Examples of YAML structure:
10+
openai.api_key = os.environ.get("OPENAI_API_KEY")
911

10-
"""
11-
prompt = f"""
12-
You are a helpful SQL Developer and Expert in dbt.
13-
Your job is to receive a SQL and generate the YAML in dbt format.
14-
You will not respond anything else, just the YAML code formated to be saved into a file.
12+
__all__ = [
13+
"generate_model_spec_as_json",
14+
"generate_column_doc",
15+
]
1516

16-
IMPORTANT RULES:
1717

18-
1. DO NOT PROSE.
19-
2. DO NOT DEVIATE OR INVENT FROM THE CONTEXT.
20-
3. Always follow dbt convetion!
21-
4. The context will always be ONE FULL SQL.
22-
5. DO NOT WRAP WITH MARKDOWN.
23-
6. The model name will always be the file name.
24-
7. NO NEW LINE BETWEEN COLUMNS!
18+
def _create_llm_prompt_for_model_docs_as_json(
19+
sql_content: str,
20+
existing_context: str | None = None,
21+
upstream_docs: list[str] | None = None,
22+
) -> list[dict[str, t.Any]]:
23+
"""Builds a system + user prompt instructing the model to produce a JSON structure describing the entire model (including columns)."""
24+
if upstream_docs is None:
25+
upstream_docs = []
2526

26-
{header}
27+
example_json = dedent("""\
28+
{
29+
"description": "A short description for the model",
30+
"columns": [
31+
{
32+
"name": "id",
33+
"description": "Unique identifier for each record",
34+
},
35+
{
36+
"name": "email",
37+
"description": "User email address",
38+
}
39+
]
40+
}
41+
""")
2742

28-
- name: model_name
29-
description: markdown_string
43+
system_prompt = dedent(f"""
44+
You are a helpful SQL Developer and an Expert in dbt.
45+
You must produce a JSON object that documents a single model and its columns.
46+
The object must match the structure shown below.
47+
DO NOT WRITE EXTRA EXPLANATION OR MARKDOWN FENCES, ONLY VALID JSON.
3048
31-
columns:
32-
- name: column_name
33-
description: markdown_string
34-
- name: column_name
35-
description: markdown_string
36-
- name: column_name
37-
description: markdown_string
38-
- name: column_name
39-
description: markdown_string
49+
Example of desired JSON structure:
50+
{example_json}
4051
41-
INCLUDE TESTS IF YOU KNOW WHAT THE COLUMN NEEDS.
52+
IMPORTANT RULES:
53+
1. "description" should be short and gleaned from the SQL or the provided docs if possible.
54+
2. "columns" is an array of objects. Each object MUST contain:
55+
- "name": the column name
56+
- "description": short explanation of what the column is
57+
3. If you have "upstream_docs", you may incorporate them as you see fit, but do NOT invent details.
58+
4. Do not output any extra text besides valid JSON.
59+
""")
4260

43-
File Name to be used as MODEL NAME: {filename}
61+
user_message = dedent(f"""
62+
The SQL for the model is:
4463
45-
Convert the following DBT SQL code to YAML:
46-
"""
47-
messages = [
48-
{"role": "system", "content": prompt},
49-
{"role": "user", "content": sql_content},
64+
>>> SQL CODE START
65+
{sql_content}
66+
>>> SQL CODE END
67+
68+
The context for the model is:
69+
{existing_context or "(none)"}
70+
71+
The upstream documentation is:
72+
{os.linesep.join(upstream_docs)}
73+
74+
Please return only a valid JSON that matches the structure described above.
75+
""")
76+
77+
return [
78+
{"role": "system", "content": system_prompt.strip()},
79+
{"role": "user", "content": user_message.strip()},
80+
]
81+
82+
83+
def _create_llm_prompt_for_column(
84+
column_name: str,
85+
existing_context: str | None = None,
86+
table_name: str | None = None,
87+
upstream_docs: list[str] | None = None,
88+
) -> list[dict[str, str]]:
89+
"""Builds a system + user prompt for generating a docstring for a single column. The final answer should be just the docstring text, not JSON or YAML."""
90+
if upstream_docs is None:
91+
upstream_docs = []
92+
93+
table_context = f"in the table '{table_name}'." if table_name else "."
94+
95+
system_prompt = dedent(f"""
96+
You are a helpful SQL Developer and an Expert in dbt.
97+
Your job is to produce a concise documentation string
98+
for a single column {table_context}
99+
100+
IMPORTANT RULES:
101+
1. DO NOT output extra commentary or Markdown fences.
102+
2. Provide only the column description text, nothing else.
103+
3. If upstream docs exist, you may incorporate them. If none exist,
104+
a short placeholder is acceptable.
105+
4. Avoid speculation. Keep it short and relevant.
106+
""")
107+
108+
user_message = dedent(f"""
109+
The column name is: {column_name}
110+
111+
Existing context:
112+
{existing_context or "(none)"}
113+
114+
Upstream docs:
115+
{os.linesep.join(upstream_docs)}
116+
117+
Return ONLY the text suitable for the "description" field.
118+
""")
119+
120+
return [
121+
{"role": "system", "content": system_prompt.strip()},
122+
{"role": "user", "content": user_message.strip()},
123+
]
124+
125+
126+
def generate_model_spec_as_json(
127+
sql_content: str,
128+
upstream_docs: list[str] | None = None,
129+
existing_context: str | None = None,
130+
model_engine: str = "gpt-4o",
131+
temperature: float = 0.3,
132+
) -> dict[str, t.Any]:
133+
"""Calls OpenAI to generate a JSON specification for a model's metadata and columns.
134+
135+
The structure is:
136+
{
137+
"description": "...",
138+
"columns": [
139+
{"name": "...", "description": "..."},
140+
...
141+
]
142+
}
143+
144+
Args:
145+
sql_content (str): Full SQL code of the model
146+
upstream_docs (list[str] | None): Optional list of strings containing context or upstream docs
147+
model_engine (str): Which OpenAI model to use (e.g., 'gpt-3.5-turbo', 'gpt-4')
148+
temperature (float): OpenAI completion temperature
149+
150+
Returns:
151+
dict[str, t.Any]: A dictionary with keys "description", "columns".
152+
"""
153+
messages = _create_llm_prompt_for_model_docs_as_json(
154+
sql_content, existing_context, upstream_docs
155+
)
156+
response = openai.chat.completions.create(
157+
model=model_engine,
158+
messages=messages, # pyright: ignore[reportArgumentType]
159+
temperature=temperature,
160+
)
161+
162+
content = response.choices[0].message.content
163+
if content is None:
164+
raise ValueError("OpenAI returned an empty response")
165+
content = content.strip()
166+
try:
167+
data = t.cast(dict[str, t.Any], json.loads(content))
168+
except json.JSONDecodeError:
169+
raise ValueError("OpenAI returned invalid JSON:\n" + content)
170+
171+
return data
172+
173+
174+
def generate_column_doc(
    column_name: str,
    existing_context: str | None = None,
    table_name: str | None = None,
    upstream_docs: list[str] | None = None,
    model_engine: str = "gpt-4o",
    temperature: float = 0.7,
) -> str:
    """Calls OpenAI to generate documentation for a single column in a table.

    Args:
        column_name (str): The name of the column to describe
        existing_context (str | None): Any relevant metadata or table definitions
        table_name (str | None): Name of the table/model (optional)
        upstream_docs (list[str] | None): Optional docs or references you might have
        model_engine (str): The OpenAI chat model to use (defaults to 'gpt-4o')
        temperature (float): OpenAI completion temperature

    Returns:
        str: The completion text with surrounding whitespace removed, intended for a
            "description" field

    Raises:
        ValueError: If the response is missing, empty, or whitespace-only.
    """
    messages = _create_llm_prompt_for_column(
        column_name, existing_context, table_name, upstream_docs
    )
    response = openai.chat.completions.create(
        model=model_engine,
        messages=messages,  # pyright: ignore[reportArgumentType]
        temperature=temperature,
    )

    # Strip before the emptiness check so a whitespace-only completion is rejected
    # instead of being returned as an empty description.
    description = (response.choices[0].message.content or "").strip()
    if not description:
        raise ValueError("OpenAI returned an empty response")
    return description
208+
209+
210+
if __name__ == "__main__":
    # Manual smoke test: runs both public helpers against the live OpenAI API
    # and prints what comes back.
    demo_sql = """
SELECT
user_id,
email,
created_at,
is_active
FROM some_source_table
WHERE created_at > '2021-01-01'
"""
    upstream_notes = [
        "user_id: unique integer ID for each user",
        "email: user email address",
        "created_at: record creation time",
        "is_active: boolean flag indicating active user",
    ]

    # 1) Whole-model documentation, returned as a parsed JSON object.
    spec = generate_model_spec_as_json(
        sql_content=demo_sql,
        upstream_docs=upstream_notes,
        model_engine="gpt-3.5-turbo",
        temperature=0.3,
    )
    print("\n=== Generated Model JSON Spec ===")
    print(json.dumps(spec, indent=2))

    # 2) Description text for a single column.
    email_doc = generate_column_doc(
        column_name="email",
        existing_context="This table tracks basic user information.",
        table_name="user_activity_model",
        upstream_docs=["Stores the user's primary email address."],
        model_engine="gpt-3.5-turbo",
        temperature=0.2,
    )
    print("\n=== Single Column Documentation ===")
    print(f"Column: email => {email_doc}")

0 commit comments

Comments
 (0)