Fix execute_function string parsing with serverless (unitycatalog#689)

serena-ruan · BenWilson2 · web-flow · commit 92532036aed8 · 2024-11-06T12:14:36.000+08:00
**PR Checklist**

- [ ] A description of the changes is added to the description of this
PR.
- [ ] If there is a related issue, make sure it is linked to this PR.
- [ ] If you've fixed a bug or added code that should be tested, add
tests!
- [ ] If you've added or modified a feature, documentation in `docs` is
updated

**Description of changes**
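Update the serverless execution logic to pass arguments as named
parameters (using the `:` prefix) bound through `spark.sql(..., args=...)`
instead of inlining them as quoted SQL literals. Wrap execution exceptions
to provide a better error message on failure.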
Drop the previous sanitization logic, as it is no longer needed :) The SQL
side correctly handles strings that mix single and double quotes.
Updated tests.


<!-- Please state what you've changed and how it might affect the users.
-->

---------

Signed-off-by: serena-ruan_data <serena.ruan@databricks.com>
Signed-off-by: Serena Ruan <82044803+serena-ruan@users.noreply.github.com>
Co-authored-by: Ben Wilson <39283302+BenWilson2@users.noreply.github.com>
diff --git a/ai/core/src/unitycatalog/ai/core/databricks.py b/ai/core/src/unitycatalog/ai/core/databricks.py
@@ -22,9 +22,6 @@
 )
 from unitycatalog.ai.core.paged_list import PagedList
 from unitycatalog.ai.core.utils.callable_utils import generate_sql_function_body
-from unitycatalog.ai.core.utils.function_processing_utils import (
-    sanitize_string_inputs_of_function_params,
-)
 from unitycatalog.ai.core.utils.type_utils import (
     column_type_to_python_type,
     convert_timedelta_to_interval_str,
@@ -146,7 +143,15 @@ def retry_on_session_expiration(func):
     def wrapper(self, *args, **kwargs):
         for attempt in range(1, max_attempts + 1):
             try:
-                return func(self, *args, **kwargs)
+                result = func(self, *args, **kwargs)
+                # for non-session related error in the result, we should directly return the result
+                if (
+                    isinstance(result, FunctionExecutionResult)
+                    and result.error
+                    and SESSION_EXCEPTION_MESSAGE in result.error
+                ):
+                    raise Exception(result.error)
+                return result
             except Exception as e:
                 error_message = str(e)
                 if SESSION_EXCEPTION_MESSAGE in error_message:
@@ -657,7 +662,11 @@ def _execute_uc_functions_with_serverless(
         _logger.info("Using databricks connect to execute functions with serverless compute.")
         self.set_default_spark_session()
         sql_command = get_execute_function_sql_command(function_info, parameters)
-        result = self.spark.sql(sqlQuery=sql_command)
+        try:
+            result = self.spark.sql(sqlQuery=sql_command.sql_query, args=sql_command.args or None)
+        except Exception as e:
+            error = f"Failed to execute function with command `{sql_command}`; Error: {e}"
+            return FunctionExecutionResult(error=error)
         if is_scalar(function_info):
             return FunctionExecutionResult(format="SCALAR", value=str(result.collect()[0][0]))
         else:
@@ -813,7 +822,15 @@ def get_execute_function_sql_stmt(
     return ParameterizedStatement(statement=statement, parameters=output_params)
 
 
-def get_execute_function_sql_command(function: "FunctionInfo", parameters: Dict[str, Any]) -> str:
+@dataclass
+class SparkSqlCommand:
+    sql_query: str
+    args: dict[str, Any]
+
+
+def get_execute_function_sql_command(
+    function: "FunctionInfo", parameters: Dict[str, Any]
+) -> SparkSqlCommand:
     from databricks.sdk.service.catalog import ColumnTypeName
 
     sql_query = ""
@@ -824,6 +841,7 @@ def get_execute_function_sql_command(function: "FunctionInfo", parameters: Dict[
             f"SELECT * FROM `{function.catalog_name}`.`{function.schema_name}`.`{function.name}`("
         )
 
+    params_dict: dict[str, Any] = {}
     if parameters and function.input_params and function.input_params.parameters:
         args: List[str] = []
         use_named_args = False
@@ -865,11 +883,9 @@ def get_execute_function_sql_command(function: "FunctionInfo", parameters: Dict[
                         param_value, Decimal
                     ):
                         param_value = float(param_value)
-                    # Handle all other types as string types and santitize escape characters
-                    # since this is likely a code block being executed
-                    param_value = sanitize_string_inputs_of_function_params(param_value)
-                    arg_clause += f"'{param_value}'"
+                    arg_clause += f":{param_info.name}"
+                    params_dict[param_info.name] = param_value
                 args.append(arg_clause)
         sql_query += ",".join(args)
     sql_query += ")"
-    return sql_query
+    return SparkSqlCommand(sql_query=sql_query, args=params_dict)
diff --git a/ai/core/src/unitycatalog/ai/core/utils/function_processing_utils.py b/ai/core/src/unitycatalog/ai/core/utils/function_processing_utils.py
@@ -1,4 +1,3 @@
-import ast
 import decimal
 import json
 import logging
@@ -283,63 +282,3 @@ def supported_function_info_types():
         pass
 
     return types
-
-
-def is_python_code(code_str: str) -> bool:
-    """Check if the provided string is valid Python code."""
-    try:
-        ast.parse(code_str)
-        return True
-    except SyntaxError:
-        return False
-
-
-def convert_quoting_to_sql_safe_format(string_value: str) -> str:
-    """
-    Convert a string to a SQL-safe format by escaping single quotes.
-
-    Args:
-        string_value: The string to be converted.
-
-    Returns:
-        str: The SQL-safe string.
-    """
-    has_single_quote = "'" in string_value
-    has_double_quote = '"' in string_value
-
-    if not has_single_quote and not has_double_quote:
-        return string_value
-
-    if has_single_quote and not has_double_quote:
-        string_value = string_value.replace("'", '"')
-    elif has_single_quote and has_double_quote:
-        raise ValueError(
-            "The argument passed in has been detected as Python code that contains both single and double quotes. "
-            "This is not supported. Code must use only one style of quotation. Please fix the code and try again."
-        )
-    return string_value
-
-
-def sanitize_string_inputs_of_function_params(param_value: Any) -> str:
-    """
-    Sanitize string inputs of function parameters to allow for code block submission.
-
-    Args:
-        param_value: The value of the parameter to sanitize.
-
-    Returns:
-        A sanitized string of the argument value.
-    """
-
-    if isinstance(param_value, str) and is_python_code(param_value):
-        # Escape single quotes, backslashes, and control characters that would otherwise break Python code execution
-        parsed = (
-            param_value.replace("\\", "\\\\")
-            .replace("\r", "\\r")
-            .replace("\n", "\\n")
-            .replace("\t", "\\t")
-        )
-        quotes_parsed = convert_quoting_to_sql_safe_format(parsed)
-    else:
-        quotes_parsed = param_value
-    return str(quotes_parsed)
diff --git a/ai/core/src/unitycatalog/ai/test_utils/function_utils.py b/ai/core/src/unitycatalog/ai/test_utils/function_utils.py
@@ -86,7 +86,9 @@ def create_python_function_and_cleanup(
 ) -> Generator[FunctionObj, None, None]:
     func_name = f"{CATALOG}.{schema}.{func.__name__}"
     try:
-        func_info = client.create_python_function(func=func, catalog=CATALOG, schema=schema)
+        func_info = client.create_python_function(
+            func=func, catalog=CATALOG, schema=schema, replace=True
+        )
         yield FunctionObj(
             full_function_name=func_name,
             comment=func_info.comment,
diff --git a/ai/core/tests/core/databricks/test_databricks_integration_tests.py b/ai/core/tests/core/databricks/test_databricks_integration_tests.py
@@ -1,3 +1,4 @@
+import math
 import os
 import time
 from typing import Callable, Dict, List
@@ -38,13 +39,16 @@
     UCAI_DATABRICKS_WAREHOUSE_RETRY_TIMEOUT,
 )
 from unitycatalog.ai.test_utils.client_utils import (
+    USE_SERVERLESS,
     client,  # noqa: F401
+    get_client,
     requires_databricks,
     retry_flaky_test,
     serverless_client,  # noqa: F401
 )
 from unitycatalog.ai.test_utils.function_utils import (
     CATALOG,
+    create_function_and_cleanup,
     create_python_function_and_cleanup,
     generate_func_name_and_cleanup,
     random_func_name,
@@ -418,14 +422,60 @@ def simple_func(x: int) -> str:
 print(calculate_sum([1, 2, 3, 4, 5]))""",
         "15\n",
     ),
+    # Simple print statement
+    ("print('Hello, world!')", "Hello, world!\n"),
+    # Code with double quotes
+    ('print("He said, \\"Hi!\\"")', 'He said, "Hi!"\n'),
+    # Code with backslashes
+    (r"print('C:\\path\\into\\dir')", "C:\\path\\into\\dir\n"),
+    # Multi-line code with newlines
+    ("for i in range(3):\n    print(i)", "0\n1\n2\n"),
+    # Code with tabs and indents
+    ("def greet(name):\n    print(f'Hello, {name}!')\ngreet('Alice')", "Hello, Alice!\n"),
+    # Code with special characters
+    ("print('Special chars: !@#$%^&*()')", "Special chars: !@#$%^&*()\n"),
+    # Unicode characters
+    ("print('Unicode test: ü, é, 漢字')", "Unicode test: ü, é, 漢字\n"),
+    # Code with comments
+    ("# This is a comment\nprint('Comment test')", "Comment test\n"),
+    # Code raising an exception
+    (
+        "try:\n    raise ValueError('Test error')\nexcept Exception as e:\n    print(f'Caught an error: {e}')",
+        "Caught an error: Test error\n",
+    ),
+    # Code with triple quotes
+    ('print("""Triple quote test""")', "Triple quote test\n"),
+    # Code with raw strings
+    ("print('Raw string: \\\\n new line')", "Raw string: \\n new line\n"),
+    # Empty code string
+    ("", ""),
+    # Code with carriage return
+    ("print('Line1\\\\rLine2')", "Line1\\rLine2\n"),
+    # Code with encoding declarations (Note: encoding declarations should be in the first or second line)
+    ("# -*- coding: utf-8 -*-\nprint('Encoding test')", "Encoding test\n"),
+    # Code importing a standard library
+    ("import math\nprint(math.pi)", f"{math.pi}\n"),
+    # Code with nested functions
+    (
+        "def outer():\n    def inner():\n        return 'Nested'\n    return inner()\nprint(outer())",
+        "Nested\n",
+    ),
+    # Code with list comprehensions
+    ("squares = [x**2 for x in range(5)]\nprint(squares)", "[0, 1, 4, 9, 16]\n"),
+    # Code with multi-line strings
+    ("multi_line = '''Line1\nLine2\nLine3'''\nprint(multi_line)", "Line1\nLine2\nLine3\n"),
 ]
 
 
 @requires_databricks
 @pytest.mark.parametrize("code, expected_output", integration_test_cases)
+@pytest.mark.parametrize("use_serverless", [True, False])
 def test_execute_python_code_integration(
-    client: DatabricksFunctionClient, code: str, expected_output: str
+    code: str, expected_output: str, use_serverless: bool, monkeypatch
 ):
+    monkeypatch.setenv(USE_SERVERLESS, str(use_serverless))
+    client = get_client()
+
     def python_exec(code: str) -> str:
         """
         Execute the provided Python code and return the output.
@@ -451,3 +501,28 @@ def python_exec(code: str) -> str:
         assert result.error is None, f"Function execution failed with error: {result.error}"
 
         assert result.value == expected_output
+
+
+@requires_databricks
+@pytest.mark.parametrize("use_serverless", [True, False])
+@pytest.mark.parametrize(
+    "text",
+    [
+        "MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It was developed by Databricks and is now a part of the Linux Foundation's AI Foundation.",
+        "print('Hello, \"world!\"')",
+        "'return '2' + \"" '3"' "' is a valid input to this function",
+    ],
+)
+def test_string_param_passing_work(text: str, use_serverless: bool, monkeypatch):
+    monkeypatch.setenv(USE_SERVERLESS, str(use_serverless))
+    client = get_client()
+    function_name = random_func_name(schema=SCHEMA)
+    summarize_in_20_words = f"""CREATE OR REPLACE FUNCTION {function_name}(text STRING)
+RETURNS STRING
+RETURN SELECT ai_summarize(text, 20)
+"""
+    with create_function_and_cleanup(client=client, schema=SCHEMA, sql_body=summarize_in_20_words):
+        result = client.execute_function(function_name, {"text": text})
+        assert result.error is None, f"Function execution failed with error: {result.error}"
+        # number of words should be no more than 20
+        assert len(result.value.split(" ")) <= 20
diff --git a/ai/core/tests/core/databricks/test_databricks_unit_tests.py b/ai/core/tests/core/databricks/test_databricks_unit_tests.py
diff --git a/ai/core/tests/core/test_utils.py b/ai/core/tests/core/test_utils.py