clean up more comments

mbutrovich · mbutrovich · commit 0b57f11aed95 · 2026-05-20T08:27:08.000-04:00
diff --git a/spark/src/main/java/org/apache/comet/codegen/CometBatchKernel.java b/spark/src/main/java/org/apache/comet/codegen/CometBatchKernel.java
@@ -44,17 +44,17 @@ protected CometBatchKernel(Object[] references) {
    * Deterministic expressions leave this as a no-op.
    *
    * <p>The caller invokes this before the first {@code process} call of each partition. The
-   * generated subclass is not thread-safe across concurrent {@code process} calls; the dispatcher
+   * generated subclass is not thread-safe across concurrent {@code process} calls. The dispatcher
    * allocates one per partition and serializes calls.
    */
   public void init(int partitionIndex) {}
 
   /**
    * Process one batch.
    *
-   * @param inputs Arrow input vectors; length and concrete classes match the schema the kernel was
-   *     compiled against
-   * @param output Arrow output vector; caller allocates to the expression's {@code dataType}
+   * @param inputs Arrow input vectors. Length and concrete classes match the schema the kernel was
+   *     compiled against.
+   * @param output Arrow output vector. Caller allocates to the expression's {@code dataType}.
    * @param numRows number of rows in this batch
    */
   public abstract void process(ValueVector[] inputs, FieldVector output, int numRows);
diff --git a/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala b/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegen.scala
@@ -35,18 +35,18 @@ import org.apache.comet.shims.CometExprTraitShim
  * fuses Arrow input reads, Spark expression evaluation, and Arrow output writes into one
  * Janino-compiled method per `(expression, schema)` pair.
  *
- * The kernel compiles any bound Catalyst expression; the tree need not be rooted at a `ScalaUDF`.
+ * The kernel compiles any bound Catalyst expression. The tree need not be rooted at a `ScalaUDF`.
  * Today's only consumer is [[org.apache.comet.udf.codegen.CometScalaUDFCodegen]].
  *
- * Constraints: one output vector per kernel; per-row scalar evaluation only (aggregate, window,
+ * Constraints: one output vector per kernel, per-row scalar evaluation only (aggregate, window,
  * generator are rejected by [[canHandle]]).
  *
  * Input- and output-side emission live in [[CometBatchKernelCodegenInput]] and
  * [[CometBatchKernelCodegenOutput]]. This file owns the [[ArrowColumnSpec]] vocabulary, the
  * [[canHandle]] / [[allocateOutput]] / [[compile]] / [[generateSource]] entry points, and
  * cross-cutting kernel-shape decisions (NullIntolerant short-circuit, CSE variant).
  *
- * The generated kernel is the `InternalRow` that Spark's `BoundReference.genCode` reads from; see
+ * The generated kernel is the `InternalRow` that Spark's `BoundReference.genCode` reads from. See
  * [[generateSource]] for how the wiring is set up.
  */
 object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
@@ -128,7 +128,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
           s"spark.sql.codegen.maxFields=$maxFields)")
     }
     // HOFs are `CodegenFallback` but admitted: `CodegenFallback.doGenCode` emits one
-    // `((Expression) references[N]).eval(row)` call site per HOF; the kernel dispatches to the
+    // `((Expression) references[N]).eval(row)` call site per HOF. The kernel dispatches to the
     // HOF's interpreted `eval`, which mutates `NamedLambdaVariable.value` per element and reads
     // the input array through the kernel's typed Arrow getters. Per-task `boundExpr` isolation
     // in `CometScalaUDFCodegen.kernelCache` prevents concurrent partitions from racing on the
@@ -140,8 +140,8 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
     //
     // `ExecSubqueryExpression` (`ScalarSubquery`, `InSubqueryExec`) is accepted: the surrounding
     // Comet operator's inherited `SparkPlan.waitForSubqueries` populates the subquery's
-    // `result` field before evaluation; the closure serializer captures that value into the
-    // arg-0 bytes; the dispatcher keys its compile cache on those bytes, so distinct subquery
+    // `result` field before evaluation. The closure serializer captures that value into the
+    // arg-0 bytes, and the dispatcher keys its compile cache on those bytes, so distinct subquery
     // results produce distinct cache entries.
     //
     // `Unevaluable`: rejected by default. `isCodegenInertUnevaluable` exempts version-specific
@@ -207,7 +207,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
           .mkString(","))
     // ScalaUDF embeds stateful `ExpressionEncoder` serializers via `ctx.addReferenceObj` that
     // reuse internal `UnsafeRow` / `byte[]` buffers per `apply`. Each kernel instance needs its
-    // own copy; the closure regenerates the references array per call so the dispatcher can hand
+    // own copy. The closure regenerates the references array per call so the dispatcher can hand
     // a fresh array to every kernel it allocates from this `CompiledKernel`.
     val freshReferences: () => Array[Any] = () =>
       generateSource(boundExpr, inputSchema).references
@@ -245,12 +245,12 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
     // empty string here.
     //
     // TODO(method-size): perRowBody is inlined inside process's for-loop and not split.
-    // Sufficiently deep trees can exceed Janino's 64KB method size; wrap in
+    // Sufficiently deep trees can exceed the JVM's 64KB method size. Wrap in
     // ctx.splitExpressionsWithCurrentInputs when hit.
     val (concreteOutClass, outputSetup, perRowBody) = {
       // Class-field CSE. `generateExpressions` runs `subexpressionElimination` under the hood,
       // populating `ctx.subexprFunctions` with per-row helper calls that write common subtree
-      // results into `addMutableState` fields; the returned `ExprCode` references those fields.
+      // results into `addMutableState` fields. The returned `ExprCode` references those fields.
       // `subexprFunctionsCode` is the concatenated helper invocation block, spliced into the
       // per-row body by `defaultBody`.
       val ev = if (SQLConf.get.subexpressionEliminationEnabled) {
@@ -338,7 +338,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
    * skipped on null rows. Otherwise the standard shape: run `ev.code`, then `setNull` or write
    * based on `ev.isNull`.
    *
-   * `subExprsCode` is the CSE helper-invocation block; it must run before `ev.code`. Inside the
+   * `subExprsCode` is the CSE helper-invocation block. It must run before `ev.code`. Inside the
    * short-circuit it lives in the else branch so null rows skip CSE too.
    */
   private def defaultBody(
@@ -418,8 +418,8 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
 
   /**
    * Array column: an Arrow `ListVector` wrapping a child spec. `elementSparkType` lets the
-   * nested-class emitter pick the right read template; the child carries the Arrow vector class.
-   * Nested arrays compose recursively.
+   * nested-class emitter pick the right read template, and the child carries the Arrow vector
+   * class. Nested arrays compose recursively.
    */
   final case class ArrayColumnSpec(
       nullable: Boolean,
@@ -449,8 +449,9 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
 
   /**
    * Map column: an Arrow `MapVector` (subclass of `ListVector`) whose data vector is a
-   * `StructVector` with key at child 0 and value at child 1. Nullable keys/values are carried in
-   * the child specs. Nested keys and values compose recursively.
+   * `StructVector` with key at child 0 and value at child 1. Nested keys and values compose
+   * recursively. The child specs' `nullable` field is unused on the read path. Output-side null
+   * guards for map values come from `MapType.valueContainsNull` on the Spark `DataType`.
    */
   final case class MapColumnSpec(
       nullable: Boolean,
@@ -463,9 +464,9 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
   }
 
   /**
-   * Compiled kernel handle. `factory` is a Spark-generated stateless class safe to share across
-   * partitions; `freshReferences` regenerates the references array per kernel allocation because
-   * `ScalaUDF` embeds stateful `ExpressionEncoder` serializers that cannot be shared.
+   * Compiled kernel handle. `freshReferences` regenerates the references array per kernel
+   * allocation because `ScalaUDF` embeds stateful `ExpressionEncoder` serializers that cannot be
+   * shared.
    */
   final case class CompiledKernel(factory: GeneratedClass, freshReferences: () => Array[Any]) {
     def newInstance(): CometBatchKernel =
@@ -474,7 +475,7 @@ object CometBatchKernelCodegen extends Logging with CometExprTraitShim {
 
   /**
    * Output of [[generateSource]]. Tests inspect `body` to assert the shape of the generated
-   * source; see `CometCodegenSourceSuite`.
+   * source. See `CometCodegenSourceSuite`.
    */
   final case class GeneratedSource(body: String, code: CodeAndComment, references: Array[Any])
 
diff --git a/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala b/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenInput.scala
@@ -88,15 +88,12 @@ private[codegen] object CometBatchKernelCodegenInput {
   }
 
   /**
-   * Emit typed-getter overrides. Each switches on column ordinal; with the inlined constant
+   * Emit typed-getter overrides. Each switches on column ordinal. With the inlined constant
    * ordinal from `BoundReference.genCode`, JIT folds the switch to one branch.
    *
    * `decimalTypeByOrdinal` lets the decimal getter specialize per ordinal: when only a
    * `DecimalType(precision <= 18)` `BoundReference` reads the ordinal, the case skips the
    * `BigDecimal` allocation and reads the unscaled long directly.
-   *
-   * TODO(unsafe-readers): primitive `v.get(i)` performs a bounds check that is redundant given `i
-   * in [0, numRows)`.
    */
   def emitTypedGetters(
       inputSchema: Seq[ArrowColumnSpec],
@@ -679,8 +676,8 @@ private[codegen] object CometBatchKernelCodegenInput {
 
   /**
    * Emit one `InputStruct_${path}` nested class. Constructor takes `rowIdx` and stores it in a
-   * `final` field. Scalar getters switch on field ordinal; complex getters allocate fresh inner
-   * views (offsets computed for array/map children; rowIdx passed through for struct children).
+   * `final` field. Scalar getters switch on field ordinal. Complex getters allocate fresh inner
+   * views (offsets computed for array/map children, rowIdx passed through for struct children).
    */
   private def emitStructClass(path: String, spec: StructColumnSpec): String = {
     val baseClassName = classOf[CometInternalRow].getName
diff --git a/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenOutput.scala b/spark/src/main/scala/org/apache/comet/codegen/CometBatchKernelCodegenOutput.scala
@@ -73,7 +73,7 @@ private[codegen] object CometBatchKernelCodegenOutput {
    * Complex top-level types route through a [[RenamedListVector]] / [[RenamedMapVector]] /
    * [[RenamedStructVector]] (see those for the runtime-vs-export naming gap).
    *
-   * `estimatedBytes` pre-sizes the data buffer for variable-length scalar outputs; ignored for
+   * `estimatedBytes` pre-sizes the data buffer for variable-length scalar outputs. Ignored for
    * other root types, and not propagated into nested var-width children (their `allocateNew` runs
    * through the parent's `allocateNew`, which resets child buffers).
    *
@@ -184,7 +184,7 @@ private[codegen] object CometBatchKernelCodegenOutput {
    * typed child-vector casts and whose `perRow` writes `source` into `targetVec` at `idx`.
    * `targetVec` is assumed pre-cast to the right Arrow class (root prelude or a parent's setup).
    *
-   * Scalars emit `perRow` only; complex types emit both. Inner setup bubbles up so deep child
+   * Scalars emit `perRow` only. Complex types emit both. Inner setup bubbles up so deep child
    * casts land at the batch prelude.
    */
   private def emitWrite(
@@ -239,7 +239,7 @@ private[codegen] object CometBatchKernelCodegenOutput {
       // write each into the `ListVector`'s child, bracket with `startNewValue`/`endValue`. The
       // element write recurses through `emitWrite` on the child vector so any supported scalar
       // becomes a valid element. Nested complex types compose. `targetVec` is a `ListVector` at
-      // the call site; only its data vector needs casting (in setup).
+      // the call site, and only its data vector needs casting (in setup).
       //
       // NullableElementElision: when `containsNull == false` drop the `isNullAt` guard at
       // source level rather than relying on JIT folding.
@@ -274,7 +274,7 @@ private[codegen] object CometBatchKernelCodegenOutput {
       OutputEmit(setup, perRow)
     case st: StructType =>
       // Spark's `doGenCode` for StructType produces an `InternalRow`. Typed child-vector casts
-      // hoist to setup; the per-row body references the hoisted names.
+      // hoist to setup, and the per-row body references the hoisted names.
       //
       // For non-nullable fields, drop the `row.isNullAt($fi)` guard at source level so HotSpot
       // emits a straight write path per field rather than a branch.
@@ -311,7 +311,7 @@ private[codegen] object CometBatchKernelCodegenOutput {
       // entries struct and the key/value children hoist to setup.
       //
       // Per-row: read keyArray/valueArray, open via `startNewValue(idx)`, write each pair into
-      // the entries struct (key always non-null per Spark/Arrow invariant; value guarded on
+      // the entries struct (key always non-null per Spark/Arrow invariant, value guarded on
       // `valueContainsNull`), close via `endValue(idx, n)`.
       val entriesVar = ctx.freshName("outMapEntries")
       val keyVar = ctx.freshName("outMapKey")
diff --git a/spark/src/main/scala/org/apache/comet/codegen/CometInternalRow.scala b/spark/src/main/scala/org/apache/comet/codegen/CometInternalRow.scala
@@ -28,7 +28,7 @@ import org.apache.comet.shims.CometInternalRowShim
 
 /**
  * Throwing-default `InternalRow` base for the codegen kernel. Subclasses override only the
- * getters their input shape needs; centralizing the throws absorbs forward-compat breakage when
+ * getters their input shape needs. Centralizing the throws absorbs forward-compat breakage when
  * Spark adds abstract methods.
  *
  * Two consumers: the compiled kernel (`ctx.INPUT_ROW = "row"` aliases `this`) and per-column
diff --git a/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala b/spark/src/main/scala/org/apache/comet/serde/CometScalaUDF.scala
@@ -62,7 +62,7 @@ object CometScalaUDF extends CometExpressionSerde[ScalaUDF] {
     val attrs = expr.collect { case a: AttributeReference => a }.distinct
     val boundExpr = BindReferences.bindReference(expr, AttributeSeq(attrs))
 
-    // Gate at plan time; surface the reason via withInfo rather than crashing Janino at execute.
+    // Gate at plan time. Surface the reason via withInfo rather than crashing Janino at execute.
     CometBatchKernelCodegen.canHandle(boundExpr) match {
       case Some(reason) =>
         withInfo(expr, reason)
diff --git a/spark/src/main/scala/org/apache/comet/udf/codegen/CometScalaUDFCodegen.scala b/spark/src/main/scala/org/apache/comet/udf/codegen/CometScalaUDFCodegen.scala
@@ -69,7 +69,7 @@ import org.apache.comet.udf.CometUDF
  *
  * `evaluate` runs under `this.synchronized` because DataFusion operators like `HashJoinExec`
  * pipeline build/probe via `OnceAsync` (`tokio::spawn`), so multiple Tokio worker threads can
- * call back into one task's dispatcher; the kernel's per-batch instance fields would race
+ * call back into one task's dispatcher. The kernel's per-batch instance fields would race
  * otherwise.
  *
  * TODO(udf-codegen-pool): if intra-task UDF parallelism shows up as a bottleneck, replace the
@@ -98,7 +98,7 @@ class CometScalaUDFCodegen extends CometUDF with Logging {
       "CometScalaUDFCodegen requires non-null serialized expression bytes at arg 0")
     val bytes = exprVec.get(0)
 
-    // TODO(dict-encoded): kernels assume materialized inputs; dict-encoded vectors would fail the
+    // TODO(dict-encoded): kernels assume materialized inputs. Dict-encoded vectors would fail the
     // cast in `specFor` below. Fix is to materialize at the dispatcher (via
     // `CDataDictionaryProvider`) or widen `emitTypedGetters` with a dict-index + lookup path.
 
@@ -193,7 +193,7 @@ class CometScalaUDFCodegen extends CometUDF with Logging {
    */
   private def specFor(v: ValueVector): ArrowColumnSpec = v match {
     case map: MapVector =>
-      // MapVector extends ListVector; match it first.
+      // MapVector extends ListVector, so match it first.
       val struct = map.getDataVector.asInstanceOf[StructVector]
       val keyVec = struct.getChildByOrdinal(0).asInstanceOf[ValueVector]
       val valueVec = struct.getChildByOrdinal(1).asInstanceOf[ValueVector]
@@ -253,7 +253,7 @@ class CometScalaUDFCodegen extends CometUDF with Logging {
 object CometScalaUDFCodegen {
 
   // JVM-wide counters across all per-task instances. Compile work is deduped JVM-wide via
-  // `CodeGenerator.compile`'s source cache; these track this dispatcher's per-task cache activity.
+  // `CodeGenerator.compile`'s source cache. These track this dispatcher's per-task cache activity.
   private val compileCount = new AtomicLong(0)
   private val cacheHitCount = new AtomicLong(0)
 
diff --git a/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprTraitShim.scala b/spark/src/main/spark-4.x/org/apache/comet/shims/CometExprTraitShim.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.expressions.{Expression, ResolvedCollation}
 
 /**
  * Spark 4.x replaced the `NullIntolerant` marker trait with a boolean method on `Expression` and
- * added a `stateful` boolean. Neither exists as a trait in 4.x; this shim routes the checks
+ * added a `stateful` boolean. Neither exists as a trait in 4.x. This shim routes the checks
  * through the method form.
  */
 trait CometExprTraitShim {
diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenFuzzSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenFuzzSuite.scala
@@ -202,7 +202,7 @@ class CometCodegenFuzzSuite
 
   /**
    * Element-level fuzz for nested array reads: `ArrayMax.doGenCode` walks every element of every
-   * row, calling the kernel's nested element getter — the path the unsafe-getter optimization
+   * row, calling the kernel's nested element getter: the path the unsafe-getter optimization
    * touches and which the cardinality probe deliberately skips.
    */
   test("array_max element fuzz: every Array<primitive> column") {
@@ -283,7 +283,7 @@ class CometCodegenFuzzSuite
 
   /**
    * Element-level fuzz for `Array<Struct<...>>`. `array_distinct` is a non-HOF unary expression
-   * that hashes each element to dedupe; struct hashing is field-wise, so the kernel emits element
+   * that hashes each element to dedupe. Struct hashing is field-wise, so the kernel emits element
    * reads on each struct's fields. `cardinality` consumes the result without materialization.
    * Asserts the optimizer keeps `ArrayDistinct` so the coverage isn't vacuously folded.
    */
@@ -316,8 +316,8 @@ class CometCodegenFuzzSuite
   }
 
   /**
-   * Top-level Array / Map → cardinality probe. Struct → drill into each scalar child via
-   * `GetStructField`; nested Array / Map sub-fields also get the cardinality probe (depth bound:
+   * Top-level Array / Map produces a cardinality probe. Struct drills into each scalar child via
+   * `GetStructField`. Nested Array / Map sub-fields also get the cardinality probe (depth bound:
    * deeper struct-of-struct nesting is skipped to keep the sweep finite).
    */
   private def probeComplexColumn(field: StructField, viewName: String): Unit = {
@@ -355,7 +355,7 @@ class CometCodegenFuzzSuite
     val intDigits = precision - scale
     // `BigInt.apply(bits, rng)` samples uniformly on `[0, 2^bits - 1]`; bound to the decimal's
     // integer-part range (10^intDigits - 1) so the result fits the schema. `BigInteger.bitLength`
-    // would overshoot slightly; min with the exact max is cheap insurance.
+    // would overshoot slightly. Min with the exact max is cheap insurance.
     val intMax = BigInt(10).pow(intDigits) - 1
     val bits = math.max(intMax.bitLength, 1)
     (0 until RowCount).map { _ =>
diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenHOFSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenHOFSuite.scala
diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenSourceSuite.scala
diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenSuite.scala

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@ import org.apache.comet.shims.CometInternalRowShim`
`28`	`28`
`29`	`29`	`/**`
`30`	`30`	* Throwing-default `InternalRow` base for the codegen kernel. Subclasses override only the
`31`		`- * getters their input shape needs; centralizing the throws absorbs forward-compat breakage when`
	`31`	`+ * getters their input shape needs. Centralizing the throws absorbs forward-compat breakage when`
`32`	`32`	`* Spark adds abstract methods.`
`33`	`33`	`*`
`34`	`34`	* Two consumers: the compiled kernel (`ctx.INPUT_ROW = "row"` aliases `this`) and per-column