TreeEditDistanceNode for Code Similarity Analysis

mgrange1998 · facebook-github-bot · commit 7163f3b1b119 · 2026-02-12T13:37:10.000-08:00
Summary: Add the tree edit distance analysis node to PrivacyGuard, completing the code memorization measurement pipeline (see https://arxiv.org/html/2404.08817v1). This diff: - Introduces `TreeEditDistanceNode`: a new `BaseAnalysisNode` that computes normalized tree edit distance similarity between AST pairs produced by `PyTreeSitterAttack` (from Diff 1). It uses the Zhang-Shasha algorithm via `zss.simple_distance()` with the normalization `max(1 - distance / max(n1, n2), 0)` to produce a 0-1 similarity score, and supports per-language grouping when a `language` column is present. - Introduces `TreeEditDistanceNodeOutput`: a `BaseAnalysisOutput` dataclass with fields for `num_samples`, `num_both_parsed`, `per_sample_similarity`, `avg_similarity`, and optional `avg_similarity_by_language`. - Works with the partial AST parsing from Diff 1: since `PyTreeSitterAttack` now always produces an AST (full or partial), the analysis node computes similarity for all rows unconditionally. Consumers can use the `parse_status` columns from the input to distinguish full vs partial parse results. - Adds both targets to the `analysis_library` umbrella. Differential Revision: D93109088
diff --git a/privacy_guard/analysis/code_similarity/tree_edit_distance_node.py b/privacy_guard/analysis/code_similarity/tree_edit_distance_node.py
@@ -0,0 +1,135 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+import logging
+from dataclasses import dataclass, field
+from typing import cast
+
+import pandas as pd
+from privacy_guard.analysis.base_analysis_node import BaseAnalysisNode
+from privacy_guard.analysis.base_analysis_output import BaseAnalysisOutput
+from privacy_guard.analysis.code_similarity.code_similarity_analysis_input import (
+    CodeSimilarityAnalysisInput,
+)
+from zss import Node as ZSSNode, simple_distance
+
+
+# Module-level logger, named after this module via ``__name__``.
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _count_nodes(node: ZSSNode) -> int:
+    """Count the nodes in a zss tree, including the root.
+
+    The traversal uses an explicit stack instead of recursion so that very
+    deep trees (for example the AST of a long chained expression) cannot
+    exhaust the interpreter's recursion limit.
+
+    Args:
+        node: root of the tree; every node must expose a ``children``
+            iterable.
+
+    Returns:
+        The number of nodes reachable from ``node`` (always at least 1,
+        because the root itself is counted).
+    """
+    count = 0
+    pending = [node]
+    while pending:
+        current = pending.pop()
+        count += 1
+        # Visit order is irrelevant for counting, so a LIFO stack is fine.
+        pending.extend(current.children)
+    return count
+
+
+@dataclass
+class TreeEditDistanceNodeOutput(BaseAnalysisOutput):
+    """Output of :class:`TreeEditDistanceNode`.
+
+    Attributes:
+        num_samples: total number of sample rows in the analysed frame.
+        num_both_parsed: number of rows for which a similarity score was
+            computed.  The node scores every row, so this equals
+            *num_samples* (the attack returns partial ASTs for malformed
+            code rather than omitting them).
+        per_sample_similarity: DataFrame with a ``similarity`` column, plus
+            a ``language`` column when the input provides one.  Excluded
+            from ``repr`` to keep printed output short.
+        avg_similarity: average similarity across all pairs; ``0.0`` when
+            there are no rows.
+        avg_similarity_by_language: per-language average similarity, or
+            ``None`` when no ``language`` column is present.
+    """
+
+    num_samples: int
+    num_both_parsed: int
+    # ``repr=False`` keeps the (potentially large) frame out of ``repr()``.
+    per_sample_similarity: pd.DataFrame = field(repr=False)
+    avg_similarity: float
+    avg_similarity_by_language: dict[str, float] | None
+
+
+class TreeEditDistanceNode(BaseAnalysisNode):
+    """Compute tree-edit-distance similarity between AST pairs.
+
+    Uses the Zhang-Shasha algorithm (via ``zss.simple_distance``) to
+    compute edit distance, then normalises to a 0-1 similarity score::
+
+        similarity = max(1 - distance / max(n1, n2), 0)
+
+    where *n1* and *n2* are the node counts of the two trees.
+
+    Args:
+        analysis_input: a :class:`CodeSimilarityAnalysisInput` produced
+            by :class:`PyTreeSitterAttack`.
+    """
+
+    def __init__(self, analysis_input: CodeSimilarityAnalysisInput) -> None:
+        super().__init__(analysis_input=analysis_input)
+
+    # ------------------------------------------------------------------
+    # Public static helper
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def compute_similarity(tree1: ZSSNode, tree2: ZSSNode) -> float:
+        """Compute normalised tree-edit-distance similarity.
+
+        Args:
+            tree1: first zss Node tree.
+            tree2: second zss Node tree.
+
+        Returns:
+            Similarity in [0, 1] where 1.0 means identical trees.
+        """
+        dist: int = simple_distance(tree1, tree2)
+        n1 = _count_nodes(tree1)
+        n2 = _count_nodes(tree2)
+        max_nodes = max(n1, n2)
+        # Defensive guard against division by zero; a tree with a root is
+        # expected to have at least one node.
+        if max_nodes == 0:
+            return 1.0
+        # The distance can exceed the larger node count, so clamp at 0.
+        return max(1.0 - dist / max_nodes, 0.0)
+
+    # ------------------------------------------------------------------
+    # BaseAnalysisNode interface
+    # ------------------------------------------------------------------
+
+    def run_analysis(self) -> TreeEditDistanceNodeOutput:
+        """Score every row of the input frame and aggregate the results.
+
+        Reads the ``target_ast`` and ``generated_ast`` columns of
+        ``analysis_input.generation_df`` and, when present, the
+        ``language`` column for per-language averages.
+
+        Returns:
+            A :class:`TreeEditDistanceNodeOutput` holding the per-row
+            similarities, their overall mean (``0.0`` for an empty frame)
+            and the optional per-language means.
+        """
+        analysis_input = cast(CodeSimilarityAnalysisInput, self.analysis_input)
+        df = analysis_input.generation_df
+
+        # Build the Series explicitly instead of using ``df.apply(axis=1)``:
+        # on a frame with no rows ``apply`` can hand back an empty DataFrame
+        # rather than a Series, which would break the column construction
+        # below.  Iterating the two columns directly also avoids creating a
+        # Series object per row.
+        similarities = pd.Series(
+            [
+                TreeEditDistanceNode.compute_similarity(target_ast, generated_ast)
+                for target_ast, generated_ast in zip(
+                    df["target_ast"], df["generated_ast"]
+                )
+            ],
+            index=df.index,
+            dtype="float64",
+        )
+        per_sample = pd.DataFrame({"similarity": similarities})
+
+        num_both_parsed = len(similarities)
+        # ``mean()`` of an empty Series is NaN, so report 0.0 instead.
+        avg_similarity = float(similarities.mean()) if num_both_parsed > 0 else 0.0
+
+        avg_by_lang: dict[str, float] | None = None
+        if "language" in df.columns:
+            # ``.values`` copies positionally, independent of index labels.
+            per_sample["language"] = df["language"].values
+            grouped = per_sample.groupby("language")["similarity"].mean()
+            avg_by_lang = grouped.to_dict()
+
+        return TreeEditDistanceNodeOutput(
+            num_samples=len(df),
+            num_both_parsed=num_both_parsed,
+            per_sample_similarity=per_sample,
+            avg_similarity=avg_similarity,
+            avg_similarity_by_language=avg_by_lang,
+        )
diff --git a/privacy_guard/analysis/tests/test_tree_edit_distance_node.py b/privacy_guard/analysis/tests/test_tree_edit_distance_node.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+import unittest
+
+import pandas as pd
+from privacy_guard.analysis.code_similarity.tree_edit_distance_node import (
+    TreeEditDistanceNode,
+    TreeEditDistanceNodeOutput,
+)
+from privacy_guard.attacks.code_similarity.py_tree_sitter_attack import (
+    PyTreeSitterAttack,
+)
+
+
+def _run_e2e(
+    df: pd.DataFrame,
+    default_language: str = "python",
+) -> TreeEditDistanceNodeOutput:
+    """Run the attack on ``df`` and feed its result to the analysis node.
+
+    Args:
+        df: input frame handed to :class:`PyTreeSitterAttack` as ``data``.
+        default_language: forwarded to the attack's ``default_language``.
+
+    Returns:
+        The output of :meth:`TreeEditDistanceNode.run_analysis`.
+    """
+    attack_result = PyTreeSitterAttack(
+        data=df, default_language=default_language
+    ).run_attack()
+    return TreeEditDistanceNode(analysis_input=attack_result).run_analysis()
+
+
+class TestTreeEditDistanceNode(unittest.TestCase):
+    """End-to-end and standalone checks for :class:`TreeEditDistanceNode`."""
+
+    @staticmethod
+    def _single_pair(target: str, generated: str) -> pd.DataFrame:
+        """Build a one-row input frame from a target/generated code pair."""
+        return pd.DataFrame(
+            {
+                "target_code_string": [target],
+                "model_generated_code_string": [generated],
+            }
+        )
+
+    def test_similarity_values(self) -> None:
+        """Identical code should yield ~1.0; different code should be low."""
+        with self.subTest("identical_python"):
+            snippet = "def foo():\n    return 1\n"
+            result = _run_e2e(self._single_pair(snippet, snippet))
+            self.assertIsInstance(result, TreeEditDistanceNodeOutput)
+            self.assertAlmostEqual(result.avg_similarity, 1.0, places=5)
+            self.assertEqual(result.num_both_parsed, 1)
+
+        with self.subTest("different_python"):
+            small_function = "def foo():\n    return 1\n"
+            unrelated_class = (
+                "class Bar:\n    def __init__(self):\n"
+                "        self.x = 1\n"
+                "    def method(self, a, b):\n"
+                "        return a + b\n"
+            )
+            result = _run_e2e(self._single_pair(small_function, unrelated_class))
+            self.assertLess(result.avg_similarity, 0.5)
+
+        with self.subTest("cpp_similarity"):
+            frame = self._single_pair(
+                "int add(int a, int b) { return a + b; }",
+                "int sum(int x, int y) { return x + y; }",
+            )
+            result = _run_e2e(frame, default_language="cpp")
+            self.assertGreater(result.avg_similarity, 0.7)
+
+        with self.subTest("partial_parse_high_similarity"):
+            # The generated snippet wraps the target function in syntax
+            # errors.  Once error nodes are filtered out, the partial AST
+            # should still score highly against the clean target.
+            clean = "def foo():\n    x = 1\n    return x\n"
+            noisy = "))))\ndef foo():\n    x = 1\n    @@@@\n    return x\n(((\n"
+            result = _run_e2e(self._single_pair(clean, noisy))
+            # A partial parse must still produce a score (not NaN).
+            self.assertEqual(result.num_both_parsed, 1)
+            self.assertGreater(result.avg_similarity, 0.5)
+
+        with self.subTest("ast_equivalence_different_strings"):
+            # Snippets that share syntax but differ in identifier names and
+            # string literals should score ≈ 1.0: tree-sitter AST nodes are
+            # labelled by grammar category (e.g. "identifier", "string"),
+            # not by their text content.
+            first = 'def compute():\n    result = "hello"\n    return result\n'
+            second = 'def process():\n    output = "world"\n    return output\n'
+            result = _run_e2e(self._single_pair(first, second))
+            self.assertAlmostEqual(result.avg_similarity, 1.0, places=5)
+
+    def test_avg_similarity_by_language(self) -> None:
+        """Mixed Python+C++ input produces per-language averages."""
+        python_code = "def foo():\n    return 1\n"
+        cpp_code = "int main() { return 0; }"
+        frame = pd.DataFrame(
+            {
+                "target_code_string": [python_code, cpp_code],
+                "model_generated_code_string": [python_code, cpp_code],
+                "language": ["python", "cpp"],
+            }
+        )
+        result = _run_e2e(frame)
+        assert result.avg_similarity_by_language is not None
+        averages = result.avg_similarity_by_language
+        for language in ("python", "cpp"):
+            self.assertIn(language, averages)
+        self.assertAlmostEqual(averages["python"], 1.0, places=5)
+        self.assertAlmostEqual(averages["cpp"], 1.0, places=5)
+
+    def test_compute_similarity_static_method(self) -> None:
+        """TreeEditDistanceNode.compute_similarity works standalone."""
+        left_tree, _ = PyTreeSitterAttack.parse_code("x = 1\n", language="python")
+        right_tree, _ = PyTreeSitterAttack.parse_code("x = 1\n", language="python")
+
+        score = TreeEditDistanceNode.compute_similarity(left_tree, right_tree)
+        self.assertAlmostEqual(score, 1.0, places=5)