Fix check_diff and format

Ashvin-Ranjan · Ashvin-Ranjan · commit d21a76cdc57d · 2025-10-10T20:15:51.000-07:00
diff --git a/src/examples/colors/exp1/check_diff.py b/src/examples/colors/exp1/check_diff.py
@@ -1,3 +1,4 @@
+from typing import Any
 from scipy.stats import ttest_1samp
 import pandas as pd
 import numpy as np
@@ -20,66 +21,58 @@ def get_closest(points: np.ndarray, point: np.ndarray, amount: int) -> np.ndarra
 
 def convert_model(
     model: pd.DataFrame,
-) -> tuple[
-    np.ndarray,
-    np.ndarray,
-    np.ndarray,
-    np.ndarray,
-    np.ndarray,
-    np.ndarray,
-    np.ndarray,
-    np.ndarray,
-]:
+) -> dict[str, dict[str, Any]]:
     """
-    Converts an input model taken from the color model `.csv` file and returns helpful arrays which are used in calculations
+    Converts an input model taken from the color model `.csv` file and a helpful dict to use in calculations
 
     Args:
         model (DataFrame): The DataFrame which can be directly loaded from the color model's minimized `.csv` file
 
     Returns:
-        tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
-            The following arrays, in order:
-        - The points from optimal encoders
-        - The points from suboptimal encoders
-        - The points from natural language encoders
-        - The concatination of all points (in order: optimal, suboptimal, natural)
-        - The concatination of suboptimal and natural language encoder points
-        - The concatination of optimal and suboptimal encoder points
-        - The quasi-convexity of the q(m|w) distributions for all encoders in points
-        - The quasi-convexity of the q(u|w) distributions for all encoders in points
+        dict[str, dict[str, Any]]: Contains the following data
+            For "optimal", "suboptimal", and "natural" has a dictionary with the following keys:
+            - "points": All the points of encoders of that type
+            - "check": All the points to check against in the neighbor comparison
+            - "check_convexities": A dict with the quasi-convexity of the q(m|w) and q(u|w) distributions for the encoders in "check"
+            - "point_convexities": A dict with the quasi-convexity of the q(m|w) and q(u|w) distributions for the encoders in "points"
     """
-    frontier = []
-    suboptimal = []
-    natural = []
-    convexities_qmw = []
-    convexities_quw = []
+    output = {
+        "optimal": {
+            "points": [],
+            "check": [],
+            "check_convexities": {"qmw": [], "quw": []},
+            "point_convexities": {"qmw": [], "quw": []},
+        },
+        "suboptimal": {
+            "points": [],
+            "check": [],
+            "check_convexities": {"qmw": [], "quw": []},
+            "point_convexities": {"qmw": [], "quw": []},
+        },
+        "natural": {
+            "points": [],
+            "check": [],
+            "check_convexities": {"qmw": [], "quw": []},
+            "point_convexities": {"qmw": [], "quw": []},
+        },
+    }
 
     for _, row in model.iterrows():
-        if row["type"] == "optimal":
-            frontier.append([row["complexity"], row["accuracy"]])
-        if row["type"] == "suboptimal":
-            suboptimal.append([row["complexity"], row["accuracy"]])
-        if row["type"] == "natural":
-            natural.append([row["complexity"], row["accuracy"]])
-        convexities_qmw.append(row["convexity-qmw"])
-        convexities_quw.append(row["convexity-quw"])
-
-    frontier = np.array(frontier)
-    suboptimal = np.array(suboptimal)
-    natural = np.array(natural)
-    points = np.concat((frontier, suboptimal, natural))
-    check_frontier = np.concat((suboptimal, natural))
-    check_natural = np.concat((frontier, suboptimal))
-    return (
-        frontier,
-        suboptimal,
-        natural,
-        points,
-        check_frontier,
-        check_natural,
-        convexities_qmw,
-        convexities_quw,
-    )
+        for k in output.keys():
+            if k == row["type"]:
+                output[k]["points"].append([row["complexity"], row["accuracy"]])
+                output[k]["point_convexities"]["quw"].append(row["convexity-quw"])
+                output[k]["point_convexities"]["qmw"].append(row["convexity-qmw"])
+            if k != row["type"] or row["type"] == "suboptimal":
+                output[k]["check"].append([row["complexity"], row["accuracy"]])
+                output[k]["check_convexities"]["quw"].append(row["convexity-quw"])
+                output[k]["check_convexities"]["qmw"].append(row["convexity-qmw"])
+
+    for k in output.keys():
+        output[k]["points"] = np.array(output[k]["points"])
+        output[k]["check"] = np.array(output[k]["check"])
+
+    return output
 
 
 def get_neighbor_comparison(amount: int, model: pd.DataFrame):
@@ -90,51 +83,77 @@ def get_neighbor_comparison(amount: int, model: pd.DataFrame):
         amount (int): The number of neighbors to compare to.
         model (DataFrame): The DataFrame which can be directly loaded from the color model's minimized `.csv` file.
     """
-    _, _, _, points, check_frontier, check_natural, convexities_qmw, convexities_quw = (
-        convert_model(model)
+    converted_model = convert_model(model)
+
+    comparison_qmw = {"optimal": [], "suboptimal": [], "natural": []}
+    comparison_quw = {"optimal": [], "suboptimal": [], "natural": []}
+    for k in converted_model.keys():
+        suboptimal = k == "suboptimal"
+        for i, p in enumerate(converted_model[k]["points"]):
+            higher_than_qmw = -0.5 if suboptimal else 0
+            higher_than_quw = -0.5 if suboptimal else 0
+            if suboptimal:
+                # Check includes itself so get 11 instead of 10
+                closest = get_closest(converted_model[k]["check"], p, amount + 1)
+            else:
+                closest = get_closest(converted_model[k]["check"], p, amount)
+            for c in closest:
+                qmw_diff = (
+                    converted_model[k]["point_convexities"]["qmw"][i]
+                    - converted_model[k]["check_convexities"]["qmw"][c]
+                )
+                quw_diff = (
+                    converted_model[k]["point_convexities"]["quw"][i]
+                    - converted_model[k]["check_convexities"]["quw"][c]
+                )
+                if qmw_diff > 0:
+                    higher_than_qmw += 1
+                if qmw_diff == 0:
+                    higher_than_qmw += 0.5
+                if quw_diff > 0:
+                    higher_than_quw += 1
+                if quw_diff == 0:
+                    higher_than_quw += 0.5
+            comparison_qmw[k].append(higher_than_qmw / amount)
+            comparison_quw[k].append(higher_than_quw / amount)
+
+    total_qmw = (
+        comparison_qmw["natural"]
+        + comparison_qmw["suboptimal"]
+        + comparison_qmw["optimal"]
+    )
+    print("Average comparison (q(m|w)):", sum(total_qmw) / len(total_qmw))
+    print(
+        "Average natural language (q(m|w)):",
+        sum(comparison_qmw["natural"]) / len(comparison_qmw["natural"]),
+    )
+    print(
+        "Average optimal encoder (q(m|w)):",
+        sum(comparison_qmw["optimal"]) / len(comparison_qmw["optimal"]),
     )
-
-    comparison_qmw = []
-    comparison_quw = []
-    for i, p in enumerate(points):
-        higher_than_qmw = 0
-        higher_than_quw = 0
-        if i < 1501:
-            closest = get_closest(check_frontier, p, amount)
-        elif i >= 2601:
-            closest = get_closest(check_natural, p, amount)
-        else:
-            closest = get_closest(points, p, amount + 1)
-        for c in closest:
-            if i < 1501:
-                c += 1501
-            if c == i:
-                continue
-            if convexities_qmw[i] - convexities_qmw[c] > 0:
-                higher_than_qmw += 1
-            if convexities_qmw[i] - convexities_qmw[c] == 0:
-                higher_than_qmw += 0.5
-            if convexities_quw[i] - convexities_quw[c] > 0:
-                higher_than_quw += 1
-            if convexities_quw[i] - convexities_quw[c] == 0:
-                higher_than_quw += 0.5
-        comparison_qmw.append(higher_than_qmw / amount)
-        comparison_quw.append(higher_than_quw / amount)
-
-    print("Average comparison (q(m|w)):", sum(comparison_qmw) / len(comparison_qmw))
-    print("Average natural language (q(m|w)):", sum(comparison_qmw[-110:]) / 110)
-    print("Average optimal encoder (q(m|w)):", sum(comparison_qmw[:1501]) / 1501)
     print(
         "Average suboptimal encoder (q(m|w)):",
-        sum(comparison_qmw[1501:-110]) / len(comparison_qmw[1501:-110]),
+        sum(comparison_qmw["suboptimal"]) / len(comparison_qmw["suboptimal"]),
     )
     print()
-    print("Average comparison (q(u|w)):", sum(comparison_quw) / len(comparison_quw))
-    print("Average natural language (q(u|w)):", sum(comparison_quw[-110:]) / 110)
-    print("Average optimal encoder (q(u|w)):", sum(comparison_quw[:1501]) / 1501)
+
+    total_quw = (
+        comparison_quw["natural"]
+        + comparison_quw["suboptimal"]
+        + comparison_quw["optimal"]
+    )
+    print("Average comparison (q(u|w)):", sum(total_quw) / len(total_quw))
+    print(
+        "Average natural language (q(u|w)):",
+        sum(comparison_quw["natural"]) / len(comparison_quw["natural"]),
+    )
+    print(
+        "Average optimal encoder (q(u|w)):",
+        sum(comparison_quw["optimal"]) / len(comparison_quw["optimal"]),
+    )
     print(
         "Average suboptimal encoder (q(u|w)):",
-        sum(comparison_quw[1501:-110]) / len(comparison_quw[1501:-110]),
+        sum(comparison_quw["suboptimal"]) / len(comparison_quw["suboptimal"]),
     )
 
 
@@ -147,20 +166,22 @@ def check_difference_significance(amount: int, model: pd.DataFrame):
         amount (int): The number of neighbors to compare to.
         model (DataFrame): The DataFrame which can be directly loaded from the color model's minimized `.csv` file.
     """
-    _, _, natural, _, _, check_natural, convexities_qmw, convexities_quw = (
-        convert_model(model)
-    )
-
-    offset = len(check_natural)
+    converted = convert_model(model)
 
     comparison_qmw = []
     comparison_quw = []
 
-    for i, p in enumerate(natural):
-        closest = get_closest(check_natural, p, amount)
+    for i, p in enumerate(converted["natural"]["points"]):
+        closest = get_closest(converted["natural"]["check"], p, amount)
         for c in closest:
-            comparison_qmw.append(convexities_qmw[i + offset] - convexities_qmw[c])
-            comparison_quw.append(convexities_quw[i + offset] - convexities_quw[c])
+            comparison_qmw.append(
+                converted["natural"]["point_convexities"]["qmw"][i]
+                - converted["natural"]["check_convexities"]["qmw"][c]
+            )
+            comparison_quw.append(
+                converted["natural"]["point_convexities"]["quw"][i]
+                - converted["natural"]["check_convexities"]["quw"][c]
+            )
 
     print(ttest_1samp(comparison_qmw, 0))
     print(ttest_1samp(comparison_quw, 0))
diff --git a/src/examples/colors/utils/minimize_models.py b/src/examples/colors/utils/minimize_models.py
@@ -45,7 +45,7 @@ def minimize_model(name: str):
         "type": [],
         "beta": [],
         "optimality": [],
-        "base_item_id": []
+        "base_item_id": [],
     }
 
     frontier = []
@@ -96,7 +96,7 @@ def minimize_model(name: str):
         df_data["optimality"].append(
             find_frontier_optimality(frontier, np.array([s.complexity, s.iwu]))
         )
-        df_data["base_item_id"].append(i//10)
+        df_data["base_item_id"].append(i // 10)
 
     df = pd.DataFrame(data=df_data)
     df.index.name = "item_id"