acryldata
diff --git a/‎metadata-ingestion/src/datahub/sql_parsing/split_statements.py
Lines changed: 209 additions & 122 deletions b/‎metadata-ingestion/src/datahub/sql_parsing/split_statements.py
Lines changed: 209 additions & 122 deletions
@@ -1,6 +1,10 @@
 import re
 from enum import Enum
-from typing import Generator, List, Tuple
+from typing import Iterator, List, Tuple
+
+SELECT_KEYWORD = "SELECT"
+CASE_KEYWORD = "CASE"
+END_KEYWORD = "END"
 
 CONTROL_FLOW_KEYWORDS = [
     "GO",
@@ -9,18 +13,36 @@
     "BEGIN",
     r"END\w+TRY",
     r"END\w+CATCH",
-    "END",
+    # This isn't strictly correct, but we assume that IF | (condition) | (block) should all be split up
+    # This mainly ensures that IF statements don't get tacked onto the previous statement incorrectly
+    "IF",
+    # For things like CASE, END does not mean the end of a statement.
+    # We have special handling for this.
+    END_KEYWORD,
+    # "ELSE",  # ELSE is also valid inside CASE, so we can't use it here.
 ]
 
 # There's an exception to this rule, which is when the statement
-# is preceeded by a CTE.
-FORCE_NEW_STATEMENT_KEYWORDS = [
+# is preceded by a CTE. For those, we have to check if the character
+# before this is a ")".
+NEW_STATEMENT_KEYWORDS = [
     # SELECT is used inside queries as well, so we can't include it here.
+    # (The splitter appends SELECT to this list on the fly while its
+    # does_select_mean_new_statement flag is set.)
+    "CREATE",
     "INSERT",
     "UPDATE",
     "DELETE",
     "MERGE",
 ]
+STRICT_NEW_STATEMENT_KEYWORDS = [
+    # For these keywords, a SELECT following it does indicate a new statement.
+    # Unlike NEW_STATEMENT_KEYWORDS, these start a new statement unconditionally:
+    # the splitter does not apply the "previous character is )" CTE check to them.
+    "DROP",
+    "TRUNCATE",
+]
+
+
+class _AlreadyIncremented(Exception):
+    # Internal signal raised by _StatementSplitter._process_normal once it has
+    # consumed a whole keyword and moved the cursor itself. It is caught in
+    # _StatementSplitter.process, which then skips its usual `self.i += 1`.
+    # Using exceptions for control flow isn't great - but the code is clearer so it's fine.
+    pass
 
 
 class ParserState(Enum):
@@ -30,134 +52,199 @@ class ParserState(Enum):
     MULTILINE_COMMENT = 4
 
 
-def _is_keyword_at_position(sql: str, pos: int, keyword: str) -> bool:
-    """
-    Check if a keyword exists at the given position using regex word boundaries.
-    """
-    if pos + len(keyword) > len(sql):
-        return False
+class _StatementSplitter:
+    def __init__(self, sql: str):
+        """
+        Store the SQL text and initialise the parser state.
+
+        Nothing is parsed here; the work happens when process() is iterated.
+        """
+        self.sql = sql
 
-    # If we're not at a word boundary, we can't generate a keyword.
-    if pos > 0 and not (
-        bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
-        or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
-    ):
-        return False
+        # Main parser state.
+        # i: index into self.sql of the character currently being examined.
+        # state: which lexical context (normal code, string, comment) we are in.
+        # current_statement: fragments of the statement being accumulated; they
+        # are joined and stripped when the statement is yielded.
+        self.i = 0
+        self.state = ParserState.NORMAL
+        self.current_statement: List[str] = []
 
-    pattern = rf"^{re.escape(keyword)}\b"
-    match = re.match(pattern, sql[pos:], re.IGNORECASE)
-    return bool(match)
+        # Additional parser state.
 
+        # If we see a SELECT, should we start a new statement?
+        # If we previously saw a drop/truncate/etc, a SELECT does mean a new statement.
+        # But if we're in a select/create/etc, a select could just be a subquery.
+        self.does_select_mean_new_statement = False
 
-def _look_ahead_for_keywords(
-    sql: str, pos: int, keywords: List[str]
-) -> Tuple[bool, str, int]:
-    """
-    Look ahead for SQL keywords at the current position.
-    """
+        # The END keyword terminates CASE and BEGIN blocks.
+        # We need to match the CASE statements with END blocks to determine
+        # what a given END is closing.
+        # This is the number of CASE keywords seen that have not yet been closed.
+        self.current_case_statements = 0
 
-    for keyword in keywords:
-        if _is_keyword_at_position(sql, pos, keyword):
-            return True, keyword, len(keyword)
-    return False, "", 0
+    def _is_keyword_at_position(self, pos: int, keyword: str) -> bool:
+        """
+        Check if a keyword exists at the given position using regex word boundaries.
+
+        The match is case-insensitive. For pos > 0 the character before pos and
+        the character at pos must form a word/non-word transition, and the
+        keyword must be followed by a word boundary.
+
+        NOTE(review): the keyword is passed through re.escape(), so the entries
+        in CONTROL_FLOW_KEYWORDS that are written as regex fragments (the
+        END...TRY / END...CATCH ones) are compared as literal text rather than
+        as patterns - confirm whether that is intended.
+        """
+        sql = self.sql
 
+        # Not enough characters left in the input for the keyword to fit.
+        if pos + len(keyword) > len(sql):
+            return False
 
-def split_statements(sql: str) -> Generator[str, None, None]:
-    """
-    Split T-SQL code into individual statements, handling various SQL constructs.
-    """
-    if not sql or not sql.strip():
-        return
+        # If we're not at a word boundary, we can't generate a keyword.
+        # (pos == 0 is the start of the input and needs no check.)
+        if pos > 0 and not (
+            bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
+            or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
+        ):
+            return False
+
+        # Anchor at the start of the remaining text; the trailing \b makes sure
+        # we don't match a mere prefix of a longer identifier.
+        pattern = rf"^{re.escape(keyword)}\b"
+        match = re.match(pattern, sql[pos:], re.IGNORECASE)
+        return bool(match)
 
-    current_statement: List[str] = []
-    state = ParserState.NORMAL
-    i = 0
+    def _look_ahead_for_keywords(self, keywords: List[str]) -> Tuple[bool, str, int]:
+        """
+        Find the first entry of `keywords` that starts at the cursor (self.i).
+
+        Returns (True, keyword, len(keyword)) for the first entry, in list order,
+        that matches at the cursor, or (False, "", 0) when none of them do.
+        """
 
-    def yield_if_complete() -> Generator[str, None, None]:
-        statement = "".join(current_statement).strip()
+        cursor = self.i
+        matched = next(
+            (kw for kw in keywords if self._is_keyword_at_position(cursor, kw)),
+            None,
+        )
+        if matched is None:
+            return False, "", 0
+        return True, matched, len(matched)
+
+    def _yield_if_complete(self) -> Iterator[str]:
+        """
+        Flush the statement accumulated so far, then reset per-statement state.
+
+        Yields the joined, stripped statement when it is non-empty and clears the
+        buffer. A whitespace-only buffer is neither yielded nor cleared. The
+        SELECT flag and the open-CASE counter are reset on every call.
+        """
+        statement = "".join(self.current_statement).strip()
         if statement:
+            # Subtle - to avoid losing full whitespace, they get merged into the next statement.
             yield statement
-            current_statement.clear()
-
-    prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
-    while i < len(sql):
-        c = sql[i]
-        next_char = sql[i + 1] if i < len(sql) - 1 else "\0"
-
-        if state == ParserState.NORMAL:
-            if c == "'":
-                state = ParserState.STRING
-                current_statement.append(c)
-                prev_real_char = c
-            elif c == "-" and next_char == "-":
-                state = ParserState.COMMENT
-                current_statement.append(c)
-                current_statement.append(next_char)
-                i += 1
-            elif c == "/" and next_char == "*":
-                state = ParserState.MULTILINE_COMMENT
-                current_statement.append(c)
-                current_statement.append(next_char)
-                i += 1
-            else:
-                most_recent_real_char = prev_real_char
-                if not c.isspace():
+            self.current_statement.clear()
+
+        # Reset current_statement-specific state.
+        self.does_select_mean_new_statement = False
+        # A non-zero counter here means a CASE was left without a matching END
+        # when the statement was cut. That is tolerated: the counter is simply
+        # reset so the imbalance cannot leak into the next statement. (Never
+        # drop into a debugger here - this runs in non-interactive pipelines.)
+        self.current_case_statements = 0
+
+    def process(self) -> Iterator[str]:
+        """
+        Walk self.sql one character at a time and yield each completed statement.
+
+        Yields nothing for empty or whitespace-only input. String literals and
+        comments are copied through verbatim; keyword and `;` handling for
+        ordinary code is delegated to _process_normal(). The cursor and state
+        live on the instance and are not reset here.
+        """
+        if not self.sql or not self.sql.strip():
+            return
+
+        prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
+        while self.i < len(self.sql):
+            c = self.sql[self.i]
+            # "\0" stands in for "no next character" at the end of the input.
+            next_char = self.sql[self.i + 1] if self.i < len(self.sql) - 1 else "\0"
+
+            if self.state == ParserState.NORMAL:
+                if c == "'":
+                    self.state = ParserState.STRING
+                    self.current_statement.append(c)
                     prev_real_char = c
-
-                is_control_keyword, keyword, keyword_len = _look_ahead_for_keywords(
-                    sql, i, keywords=CONTROL_FLOW_KEYWORDS
-                )
-                if is_control_keyword:
-                    # Yield current statement if any
-                    yield from yield_if_complete()
-                    # Yield keyword as its own statement
-                    yield keyword
-                    i += keyword_len
-                    continue
-
-                (
-                    is_force_new_statement_keyword,
-                    keyword,
-                    keyword_len,
-                ) = _look_ahead_for_keywords(
-                    sql, i, keywords=FORCE_NEW_STATEMENT_KEYWORDS
-                )
-                if (
-                    is_force_new_statement_keyword and most_recent_real_char != ")"
-                ):  # usually we'd have a close paren that closes a CTE
-                    # Force termination of current statement
-                    yield from yield_if_complete()
-
-                    current_statement.append(keyword)
-                    i += keyword_len
-                    continue
-
-                elif c == ";":
-                    yield from yield_if_complete()
+                elif c == "-" and next_char == "-":
+                    # Start of a `--` line comment: consume both dashes.
+                    self.state = ParserState.COMMENT
+                    self.current_statement.append(c)
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                elif c == "/" and next_char == "*":
+                    # Start of a `/* ... */` comment: consume both characters.
+                    self.state = ParserState.MULTILINE_COMMENT
+                    self.current_statement.append(c)
+                    self.current_statement.append(next_char)
+                    self.i += 1
                 else:
-                    current_statement.append(c)
-
-        elif state == ParserState.STRING:
-            current_statement.append(c)
-            if c == "'" and next_char == "'":
-                current_statement.append(next_char)
-                i += 1
-            elif c == "'":
-                state = ParserState.NORMAL
-
-        elif state == ParserState.COMMENT:
-            current_statement.append(c)
-            if c == "\n":
-                state = ParserState.NORMAL
-
-        elif state == ParserState.MULTILINE_COMMENT:
-            current_statement.append(c)
-            if c == "*" and next_char == "/":
-                current_statement.append(next_char)
-                i += 1
-                state = ParserState.NORMAL
-
-        i += 1
-
-    # Handle the last statement
-    yield from yield_if_complete()
+                    # Remember the real character seen *before* this one, then
+                    # record the current one if it is not whitespace.
+                    most_recent_real_char = prev_real_char
+                    if not c.isspace():
+                        prev_real_char = c
+
+                    try:
+                        yield from self._process_normal(
+                            most_recent_real_char=most_recent_real_char
+                        )
+                    except _AlreadyIncremented:
+                        # Skip the normal i += 1 step.
+                        continue
+
+            elif self.state == ParserState.STRING:
+                self.current_statement.append(c)
+                if c == "'" and next_char == "'":
+                    # A doubled quote is an escaped quote; stay inside the string.
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                elif c == "'":
+                    self.state = ParserState.NORMAL
+
+            elif self.state == ParserState.COMMENT:
+                self.current_statement.append(c)
+                # A line comment ends at the newline.
+                if c == "\n":
+                    self.state = ParserState.NORMAL
+
+            elif self.state == ParserState.MULTILINE_COMMENT:
+                self.current_statement.append(c)
+                if c == "*" and next_char == "/":
+                    self.current_statement.append(next_char)
+                    self.i += 1
+                    self.state = ParserState.NORMAL
+
+            self.i += 1
+
+        # Handle the last statement
+        yield from self._yield_if_complete()
+
+    def _process_normal(self, most_recent_real_char: str) -> Iterator[str]:
+        """
+        Handle the character at self.i while in NORMAL state.
+
+        Called by process() for characters that do not open a string or a
+        comment. The checks run in this order: CASE tracking, control-flow
+        keywords, strict new-statement keywords, ordinary new-statement keywords
+        (plus SELECT while does_select_mean_new_statement is set), the `;`
+        terminator, and finally plain accumulation of the character.
+
+        Args:
+            most_recent_real_char: the last non-whitespace, non-comment character
+                seen before the current one; a ")" suppresses the split for
+                ordinary new-statement keywords.
+
+        Yields:
+            Any statement completed at this position and, for control-flow
+            keywords, the keyword itself as a separate statement.
+
+        Raises:
+            _AlreadyIncremented: when a whole keyword was consumed and self.i has
+                already been advanced past it.
+        """
+        c = self.sql[self.i]
+
+        # Count every CASE so that the END closing it is not treated as control flow.
+        if self._is_keyword_at_position(self.i, CASE_KEYWORD):
+            self.current_case_statements += 1
+
+        is_control_keyword, keyword, keyword_len = self._look_ahead_for_keywords(
+            keywords=CONTROL_FLOW_KEYWORDS
+        )
+        if (
+            is_control_keyword
+            and keyword == END_KEYWORD
+            and self.current_case_statements > 0
+        ):
+            # If we're closing a CASE statement with END, we can just decrement the counter and continue.
+            # Execution falls through, so the END text is appended character by
+            # character like any other part of the statement.
+            self.current_case_statements -= 1
+        elif is_control_keyword:
+            # Yield current statement if any
+            yield from self._yield_if_complete()
+            # Yield keyword as its own statement
+            # (this is the entry from the keyword list, not the text as written in the input)
+            yield keyword
+            self.i += keyword_len
+            self.does_select_mean_new_statement = True
+            raise _AlreadyIncremented()
+
+        (
+            is_strict_new_statement_keyword,
+            keyword,
+            keyword_len,
+        ) = self._look_ahead_for_keywords(keywords=STRICT_NEW_STATEMENT_KEYWORDS)
+        if is_strict_new_statement_keyword:
+            # These always begin a new statement; no ")" check is applied here.
+            yield from self._yield_if_complete()
+            self.current_statement.append(keyword)
+            self.i += keyword_len
+            # Must be set after the flush above, which resets the flag.
+            self.does_select_mean_new_statement = True
+            raise _AlreadyIncremented()
+
+        (
+            is_force_new_statement_keyword,
+            keyword,
+            keyword_len,
+        ) = self._look_ahead_for_keywords(
+            keywords=(
+                NEW_STATEMENT_KEYWORDS
+                + ([SELECT_KEYWORD] if self.does_select_mean_new_statement else [])
+            ),
+        )
+        if (
+            is_force_new_statement_keyword and most_recent_real_char != ")"
+        ):  # usually we'd have a close paren that closes a CTE
+            # Force termination of current statement
+            yield from self._yield_if_complete()
+
+            self.current_statement.append(keyword)
+            self.i += keyword_len
+            raise _AlreadyIncremented()
+
+        # No keyword handling applied: `;` ends the statement (and is dropped),
+        # anything else is accumulated.
+        if c == ";":
+            yield from self._yield_if_complete()
+        else:
+            self.current_statement.append(c)
+
+
+def split_statements(sql: str) -> Iterator[str]:
+    """
+    Lazily yield the individual statements found in a T-SQL script.
+
+    This is a thin wrapper: all parsing is delegated to a fresh
+    _StatementSplitter created for this call.
+    """
+    yield from _StatementSplitter(sql).process()