Skip to content

Commit c2be8c4

Browse files
ziltozilto
zilto
authored and
zilto
committed
reviews incorporated
1 parent e031082 commit c2be8c4

File tree

3 files changed

+55
-16
lines changed

3 files changed

+55
-16
lines changed

contrib/hamilton/contrib/user/zilto/llm_generate_code/README.md

+23-4
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,34 @@ This module uses the OpenAI completion API to generate code.
44

55
For any language, you can request `generated_code` to get the generated response. If you are generating Python code, you can execute it in a subprocess by requesting `execution_output` and `execution_error`.
66

7-
# Configuration Options
8-
## Config.when
7+
## Example
8+
```python
9+
from hamilton import driver
10+
import __init__ as llm_generate_code
11+
12+
dr = driver.Builder().with_modules(llm_generate_code).build()
13+
14+
dr.execute(
15+
["execution_output", "execution_error"],
16+
inputs=dict(
17+
query="Retrieve the primary type from a `typing.Annotated` object",
18+
)
19+
)
20+
```
21+
22+
## Configuration Options
23+
### Config.when
924
This module doesn't receive configurations.
1025

11-
## Inputs
26+
### Inputs
1227
- `query`: The query for which you want code generated.
1328
- `api_key`: Set the OpenAI API key to use. If None, read the environment variable `OPENAI_API_KEY`
1429
- `code_language`: Set the code language to generate the response in. Defaults to `python`
1530

16-
## Overrides
31+
### Overrides
1732
- `prompt_template_to_generate_code`: Create a new prompt template with the fields `query` and `code_language`.
1833
- `prompt_to_generate_code`: Manually provide a prompt to generate Python code
34+
35+
## Extension / Limitations
36+
- Executing arbitrary generated code is a security risk. Proceed with caution.
37+
- You need to manually install dependencies for your generated code to be executed (i.e., you need to `pip install pandas` yourself)

contrib/hamilton/contrib/user/zilto/llm_generate_code/__init__.py

+31-11
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,18 @@
1414

1515

1616
def llm_client(api_key: Optional[str] = None) -> openai.OpenAI:
17-
"""Create an OpenAI client"""
17+
"""Create an OpenAI client."""
1818
if api_key is None:
1919
api_key = os.environ.get("OPENAI_API_KEY")
2020

2121
return openai.OpenAI(api_key=api_key)
2222

2323

2424
def prompt_template_to_generate_code() -> str:
25+
"""Prompt template to generate code.
26+
27+
It must include the fields `code_language` and `query`.
28+
"""
2529
return """Write some {code_language} code to solve the user's problem.
2630
2731
Return only python code in Markdown format, e.g.:
@@ -40,36 +44,47 @@ def prompt_template_to_generate_code() -> str:
4044
def prompt_to_generate_code(
4145
prompt_template_to_generate_code: str, query: str, code_language: str = "python"
4246
) -> str:
47+
"""Fill the prompt template with the code language and the user query."""
4348
return prompt_template_to_generate_code.format(
4449
query=query,
4550
code_language=code_language,
4651
)
4752

4853

4954
def response_generated_code(llm_client: openai.OpenAI, prompt_to_generate_code: str) -> str:
55+
"""Call the OpenAI API completion endpoint with the prompt to generate code."""
5056
response = llm_client.completions.create(
5157
model="gpt-3.5-turbo-instruct",
5258
prompt=prompt_to_generate_code,
5359
)
5460
return response.choices[0].text
5561

5662

57-
def generated_code(response_generated_code: str) -> str:
58-
_, _, lower_part = response_generated_code.partition("```python")
63+
def parsed_generated_code(response_generated_code: str, code_language: str = "python") -> str:
64+
"""Retrieve the code section from the generated text."""
65+
_, _, lower_part = response_generated_code.partition(f"```{code_language}")
5966
code_part, _, _ = lower_part.partition("```")
6067
return code_part
6168

6269

63-
def code_prepared_for_execution(generated_code: str, code_language: str = "python") -> str:
70+
def code_prepared_for_execution(parsed_generated_code: str, code_language: str = "python") -> str:
71+
"""If code is Python, append statements that prepare it to be run in a subprocess.
72+
73+
We collect all local variables in a dictionary and filter out Python builtins to keep
74+
only the variables from the generated code. print() is used to send string data from
75+
the subprocess back to the parent process via its `stdout`.
76+
"""
77+
6478
if code_language != "python":
6579
raise ValueError("Can only execute the generated code if `code_language` = 'python'")
6680

67-
code_to_get_vars = """
68-
excluded_vars = { 'excluded_vars', '__builtins__', '__annotations__'} | set(dir(__builtins__))
69-
local_vars = {k:v for k,v in locals().items() if k not in excluded_vars}
70-
print(local_vars)
71-
"""
72-
return generated_code + code_to_get_vars
81+
code_to_get_vars = (
82+
"excluded_vars = { 'excluded_vars', '__builtins__', '__annotations__'} | set(dir(__builtins__))\n"
83+
"local_vars = {k:v for k,v in locals().items() if k not in excluded_vars}\n"
84+
"print(local_vars)"
85+
)
86+
87+
return parsed_generated_code + code_to_get_vars
7388

7489

7590
@extract_fields(
@@ -78,7 +93,12 @@ def code_prepared_for_execution(generated_code: str, code_language: str = "pytho
7893
execution_error=str,
7994
)
8095
)
81-
def execute_output(code_prepared_for_execution: str) -> dict:
96+
def executed_output(code_prepared_for_execution: str) -> dict:
97+
"""Execute the generated Python code + appended utilities in a subprocess.
98+
99+
The output and errors from the code are collected as strings. Executing
100+
the code in a subprocess provides isolation, but isn't a security guarantee.
101+
"""
82102
process = subprocess.Popen(
83103
["python", "-c", code_prepared_for_execution],
84104
stdout=subprocess.PIPE,
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"schema": "1.0",
3-
"use_case_tags": ["LLM", "OpenAI"],
3+
"use_case_tags": ["LLM", "OpenAI", "code generation"],
44
"secondary_tags": {}
55
}

0 commit comments

Comments
 (0)