JuliaSymbolics · olynch · Aug 13, 2024 · Sep 24, 2024 · Oct 11, 2024 · 0x0f0f0f
diff --git a/src/EGraphs/egraph.jl b/src/EGraphs/egraph.jl
@@ -1,13 +1,16 @@
 # Functional implementation of https://egraphs-good.github.io/
 # https://dl.acm.org/doi/10.1145/3434304
 
+# ==============================================================
+# Interface to implement for custom analyses
+# ==============================================================
 
 """
     modify!(eclass::EClass{Analysis})
 
 The `modify!` function for EGraph Analysis can optionally modify the eclass
 `eclass` after it has been analyzed, typically by adding an e-node.
-It should be **idempotent** if no other changes occur to the EClass. 
+It should be **idempotent** if no other changes occur to the EClass.
 (See the [egg paper](https://dl.acm.org/doi/pdf/10.1145/3434304)).
 """
 function modify! end
@@ -25,10 +28,13 @@ function join end
 """
     make(g::EGraph{ExpressionType, AnalysisType}, n::VecExpr)::AnalysisType where {ExpressionType}
 
-Given an e-node `n`, `make` should return the corresponding analysis value. 
+Given an e-node `n`, `make` should return the corresponding analysis value.
 """
 function make end
 
+# ==============================================================
+# EClasses
+# ==============================================================
 
 """
     EClass{D}
@@ -69,7 +75,18 @@ function addparent!(@nospecialize(a::EClass), n::VecExpr, id::Id)
   push!(a.parents, (n => id))
 end
 
+"""
+    merge_analysis_data!(a::EClass{D}, b::EClass{D})::Tuple{Bool,Bool,Union{D,Nothing}} where {D}
+
+This is an internal function and should not be called directly by users; this
+docstring is only for those who wish to understand Metatheory internals.
+
+Returns a tuple of: `(did_update_a, did_update_b, newdata)` where:
 
+- `did_update_a` is a boolean indicating whether `a`'s analysis class was updated
+- `did_update_b` is a boolean indicating whether `b`'s analysis class was updated
+- `newdata` is the merged analysis data
+"""
 function merge_analysis_data!(a::EClass{D}, b::EClass{D})::Tuple{Bool,Bool,Union{D,Nothing}} where {D}
   if !isnothing(a.data) && !isnothing(b.data)
     new_a_data = join(a.data, b.data)
@@ -95,7 +112,16 @@ Trick from: https://discourse.julialang.org/t/dictionary-with-custom-hash-functi
 struct IdKey
   val::Id
 end
+
+"""
+Recall that `Base.hash` combines an existing "seed" (h) with a new value (a).
+
+In this case, we just use bitwise XOR; very cheap! This is because
+[`IdKey`](@ref) is supposed to just hash to itself, so we don't need to do
+anything special to `a.val`.
+"""
 Base.hash(a::IdKey, h::UInt) = xor(a.val, h)
+
 Base.:(==)(a::IdKey, b::IdKey) = a.val == b.val
 
 """
@@ -112,24 +138,71 @@ not necessarily very informative, but you can access the terms of each e-node
 via `Metatheory.to_expr`.
 
 See the [egg paper](https://dl.acm.org/doi/pdf/10.1145/3434304)
-for implementation details.
+for implementation details. Of special notice is the e-graph invariants,
+and when they do or do not hold. One of the main innovations of `egg` was to
+"batch" the maintenance of the e-graph invariants. We use the `clean` field
+on this struct to keep track of whether there is pending work to do in order
+to re-establish the e-graph invariants.
 """
 mutable struct EGraph{ExpressionType,Analysis}
-  "stores the equality relations over e-class ids"
+  """
+  stores the equality relations over e-class ids
+
+  More specifically, the `(potentially non-root id) --> (root id)` mapping.
+  """
   uf::UnionFind
-  "map from eclass id to eclasses"
+
+  """
+  map from eclass id to eclasses
+
+  The `(root id) --> e-class` mapping.
+  """
   classes::Dict{IdKey,EClass{Analysis}}
-  "hashcons mapping e-nodes to their e-class id"
+
+  """
+  hashcons mapping e-nodes to their e-class id
+
+  The `e-node --> (potentially non-root id)` mapping.
+  """
   memo::Dict{VecExpr,Id}
-  "Hashcons the constants in the e-graph"
+
+  """
+  hashcons the constants in the e-graph
+
+  For performance reasons, the "head" of an e-node is stored in the e-node as a
+  hash (so that it fits in a flat vector of unsigned integers
+  ([`VecExpr`](@ref))) and this dictionary is used to get the actual Julia
+  object of the head when extracting terms.
+  """
   constants::Dict{UInt64,Any}
-  "Nodes which need to be processed for rebuilding. The id is the id of the enode, not the canonical id of the eclass."
+
+  """
+  Nodes which need to be processed for rebuilding. The id is the id of the enode, not the canonical id of the eclass.
+  """
   pending::Vector{Pair{VecExpr,Id}}
+
+  """
+  When an e-node is added to an e-graph for the first time, we add analysis data to the
+  newly-created e-class by calling [`make`]() on the head of the e-node and the analysis
+  data for the arguments to that e-node. However, the analysis data for the arguments to
+  that e-node could get updated at some point, as e-classes are merged.
+
+  This is a queue for e-nodes which have had the analysis of some of their arguments
+  updated, but have not updated the analysis of their parent e-class yet.
+  """
   analysis_pending::UniqueQueue{Pair{VecExpr,Id}}
+
+  """
+  The Id of the e-class that we have built this e-graph to simplify.
+  """
   root::Id
+
   "a cache mapping signatures (function symbols and their arity) to e-classes that contain e-nodes with that function symbol."
   classes_by_op::Dict{IdKey,Vector{Id}}
+
+  "do we need to do extra work in order to re-establish the e-graph invariants"
   clean::Bool
+
   "If we use global buffers we may need to lock. Defaults to false."
   needslock::Bool
   lock::ReentrantLock
@@ -174,6 +247,8 @@ EGraph(e; kwargs...) = EGraph{typeof(e),Nothing}(e; kwargs...)
 @inline get_constant(@nospecialize(g::EGraph), hash::UInt64) = g.constants[hash]
 @inline has_constant(@nospecialize(g::EGraph), hash::UInt64)::Bool = haskey(g.constants, hash)
 
+# Why does one of these use `get!` and the other use `setindex!`?
+
 @inline function add_constant!(@nospecialize(g::EGraph), @nospecialize(c))::Id
   h = hash(c)
   get!(g.constants, h, c)
@@ -224,13 +299,17 @@ Returns the canonical e-class id for a given e-class.
 
 @inline Base.getindex(g::EGraph, i::Id) = g.classes[IdKey(find(g, i))]
 
+"""
+Make sure all of the arguments of `n` point to root nodes in the unionfind
+data structure for `g`.
+"""
 function canonicalize!(g::EGraph, n::VecExpr)
-  v_isexpr(n) || @goto ret
-  for i in (VECEXPR_META_LENGTH + 1):length(n)
-    @inbounds n[i] = find(g, n[i])
+  if v_isexpr(n)
+    for i in (VECEXPR_META_LENGTH + 1):length(n)
+        @inbounds n[i] = find(g, n[i])
+    end
+    v_unset_hash!(n)
   end
-  v_unset_hash!(n)
-  @label ret
   v_hash!(n)
   n
 end
@@ -284,7 +363,7 @@ end
 
 """
 Extend this function on your types to do preliminary
-preprocessing of a symbolic term before adding it to 
+preprocessing of a symbolic term before adding it to
 an EGraph. Most common preprocessing techniques are binarization
 of n-ary terms and metadata stripping.
 """
@@ -304,6 +383,7 @@ function addexpr!(g::EGraph, se)::Id
 
   isexpr(e) || return add!(g, VecExpr(Id[Id(0), Id(0), Id(0), add_constant!(g, e)]), false)
 
+<<<<<<< HEAD
   args = iscall(e) ? arguments(e) : children(e)
   ar = length(args)
   n = v_new(ar)
@@ -315,14 +395,29 @@ function addexpr!(g::EGraph, se)::Id
   v_set_signature!(n, hash(maybe_quote_operation(h), hash(ar)))
   for i in v_children_range(n)
     @inbounds n[i] = addexpr!(g, args[i - VECEXPR_META_LENGTH])
+=======
+    h = iscall(e) ? operation(e) : head(e)
+    v_set_head!(n, add_constant!(g, h))
+
+    # get the signature from op and arity
+    v_set_signature!(n, hash(maybe_quote_operation(h), hash(ar)))
+
+    for i in v_children_range(n)
+      @inbounds n[i] = addexpr!(g, args[i - VECEXPR_META_LENGTH])
+    end
+    n
+  else # constant enode
+    VecExpr(Id[Id(0), Id(0), Id(0), add_constant!(g, e)])
+>>>>>>> 4521a82 (Added more documentation to internal data structures)
   end
 
   add!(g, n, false)
 end
 
 """
-Given an [`EGraph`](@ref) and two e-class ids, set
-the two e-classes as equal.
+Given an [`EGraph`](@ref) and two e-class ids, merge the two corresponding e-classes.
+
+This includes merging the analysis data of the e-classes.
 """
 function Base.union!(
   g::EGraph{ExpressionType,AnalysisType},
@@ -365,6 +460,9 @@ function Base.union!(
   return true
 end
 
+"""
+Returns whether all of `ids...` are the same e-class in `g`.
+"""
 function in_same_class(g::EGraph, ids::Id...)::Bool
   nids = length(ids)
   nids == 1 && return true
@@ -492,7 +590,10 @@ end
 
 # Thanks to Max Willsey and Yihong Zhang
 
-
+"""
+Given a ground pattern, which is a pattern that has no pattern variables, find
+the eclass id in the egraph that represents that ground pattern.
+"""
 function lookup_pat(g::EGraph{ExpressionType}, p::PatExpr)::Id where {ExpressionType}
   @assert isground(p)
 

diff --git a/src/EGraphs/unionfind.jl b/src/EGraphs/unionfind.jl
@@ -1,9 +1,42 @@
+"""
+    UnionFind()
+
+Creates a new empty unionfind.
+
+A unionfind data structure is an eventually-idempotent endomorphism on `1:n`.
+
+What does this mean? It is a function `parents : 1:n -> 1:n` such that
+
+```
+parents^(n) = parents^(n+1)
+```
+
+This means that if you start at any `i ∈ 1:n`, repeatedly apply `parents` to `i`
+will eventually reach a fixed-point. We call the fixed points of `parents` the "roots" of the unionfind. This is implemented in [`find`].
+
+For the remaining discussion, let `parents*` the function which sends `i ∈ 1:n` to its fixed point with respect to `parents`.
+
+The point of a unionfind is to store a *partition* of `1:n`. To test if two
+elements `i, j ∈ 1:n` are in the same partition, we check if
+
+```
+parents*(i) = parents*(j)
+```
+
+That is, we check if `find(uf, i) == find(uf, j)`.
+"""
 struct UnionFind
   parents::Vector{Id}
 end
 
 UnionFind() = UnionFind(Id[])
 
+"""
+    Base.push(uf::UnionFind)::Id
+
+This extends the domain of `uf` from `1:n` to `1:n+1` and returns `n+1`. The
+element `n+1` is originally in its own equivalence class.
+"""
 function Base.push!(uf::UnionFind)::Id
   l = length(uf.parents) + 1
   push!(uf.parents, l)
@@ -12,11 +45,70 @@ end
 
 Base.length(uf::UnionFind) = length(uf.parents)
 
+"""
+    Base.union!(uf::UnionFind, i::Id, j::Id)
+
+Precondition: `i` and `j` must be roots of `uf`.
+
+Thus, we typically call this as `union!(uf, find(uf, i), find(uf, j))`. If this
+precondition is not satisfied, then it is easy to violate the eventually-idempotent criterion of the unionfind.
+
+Specifically,
+
+```
+union!(uf, 1, 2)
+union!(uf, 2, 1)
+```
+
+will create a cycle that will make `find(uf, 1)` non-terminate.
+"""
 function Base.union!(uf::UnionFind, i::Id, j::Id)
   uf.parents[j] = i
   i
 end
 
+
+# Potential optimization:
+
+# ```julia
+# j = i
+# while j != uf.parents[j]
+#   j = uf.parents[j]
+# end
+# root = j
+# while i != uf.parents[i]
+#   uf.parents[i] = root
+#   i = uf.parents[i]
+# end
+# root
+# ```
+
+# This optimization sets up a "short-circuit". That is, before, the parents array
+# might be set up as
+
+# ```
+# 1 -> 5 -> 2 -> 3 -> 3
+# ```
+
+# After, we have
+
+# ```
+# 1 -> 3
+# 5 -> 3
+# 2 -> 3
+# 3 -> 3
+# ```
+
+# Note: why don't we do this optimization? Question for Alessandro.
+
+"""
+    find(uf::UnionFind, i::Id)
+
+This computes the fixed point of `uf.parents` when applied to `i`.
+
+We know we are at a fixed point once `i == uf.parents[i]`. So, we continually
+set `i = uf.parents[i]` until this becomes true.
+"""
 function find(uf::UnionFind, i::Id)
   while i != uf.parents[i]
     i = uf.parents[i]