Skip to content

Commit e37d61e

Browse files
committed
Clean up docs trying to get line count on the diff down.
1 parent 1895a17 commit e37d61e

12 files changed

Lines changed: 49 additions & 141 deletions

File tree

docs/source/user-guide/latest/s3-credential-providers.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ Vendor implementations need the Comet SPI classes at compile time only. Use `pro
256256
```xml
257257
<dependency>
258258
<groupId>org.apache.datafusion</groupId>
259-
<artifactId>comet-common-spark${spark.version.short}_${scala.binary.version}</artifactId>
259+
<artifactId>comet-spark-spark${spark.version.short}_${scala.binary.version}</artifactId>
260260
<version>${comet.version}</version>
261261
<scope>provided</scope>
262262
</dependency>

native/core/src/cloud/s3/credential_bridge.rs

Lines changed: 8 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,7 @@
1717

1818
//! JNI bridge to the JVM `CometS3CredentialDispatcher` SPI, exposed as
1919
//! `object_store::CredentialProvider` (raw Parquet path) and `reqsign_core::ProvideCredential`
20-
//! (Iceberg via `opendal`).
21-
//!
22-
//! ```text
23-
//! JVM Native (Rust)
24-
//! --- -------------
25-
//!
26-
//! spark.hadoop.fs.s3a.comet.credential parquet/objectstore/s3.rs (object_store)
27-
//! .provider.class execution/operators/iceberg_scan.rs (opendal)
28-
//! | |
29-
//! v v
30-
//! CometS3CredentialDispatcher CometS3CredentialBridge
31-
//! ^ |
32-
//! | ensureInitialized -> long handle |
33-
//! +<--- getCredentialsForPath(handle, ...) ------+
34-
//! v
35-
//! vendor CometS3CredentialProvider -> CometS3Credentials
36-
//! ```
20+
//! (Iceberg via `opendal`). See `docs/source/contributor-guide/s3-credential-provider-design.md`.
3721
3822
use crate::execution::operators::ExecutionError;
3923
use crate::jvm_bridge::{jni_new_global_ref, jni_static_call, JVMClasses};
@@ -61,35 +45,29 @@ use std::time::Duration;
6145
/// executor from holding a stale credential for the entire job lifetime.
6246
const DEFAULT_EXPIRY_WHEN_UNKNOWN: Duration = Duration::from_secs(300);
6347

64-
/// Once-per-process latch for the "missing expiry" warning; bridges are per-scan so a per-bridge
65-
/// latch would log once per scan on the same misbehaving provider.
48+
/// Once-per-process latch for the "missing expiry" warning. Bridges are per-scan, so a per-bridge
49+
/// latch would re-log on every scan.
6650
static WARNED_MISSING_EXPIRY: OnceCell<()> = OnceCell::new();
6751

6852
/// Access intent forwarded to the Java SPI. Ordinal must match the JVM `CometS3AccessMode` enum.
6953
#[derive(Debug, Clone, Copy)]
7054
pub enum AccessMode {
7155
Read = 0,
72-
/// No native write path yet; kept so the SPI contract is complete.
7356
#[allow(dead_code)]
7457
Write = 1,
7558
}
7659

77-
/// Per-scan credential provider that delegates to the JVM SPI via JNI.
78-
///
79-
/// `handle` is the JVM-allocated identity for the `(provider_class, dispatch_key,
80-
/// catalog_properties)` triple, returned by `ensureInitialized` at construction. Per-request
81-
/// calls carry `(handle, bucket, path, mode)`, which lets the JVM disambiguate multi-tenant
82-
/// providers without re-sending the property bag and saves one JNI string allocation on the hot
83-
/// path. `bucket` and `path` are immutable for the bridge's lifetime so we cache them as JNI
84-
/// global refs.
60+
/// Per-scan credential provider that delegates to the JVM SPI via JNI. `handle` is the JVM-side
61+
/// identity for the `(provider_class, dispatch_key, catalog_properties)` triple returned by
62+
/// `ensureInitialized`. `bucket_jstr` / `path_jstr` are interned once at construction to avoid
63+
/// per-call `new_string` allocations on the hot path.
8564
pub struct CometS3CredentialBridge {
8665
provider_class: String,
8766
dispatch_key: String,
8867
bucket: String,
8968
path: String,
9069
mode: AccessMode,
9170
handle: i64,
92-
/// Cached JNI globals for the two constant String arguments to `getCredentialsForPath`.
9371
bucket_jstr: Arc<Global<JString<'static>>>,
9472
path_jstr: Arc<Global<JString<'static>>>,
9573
}
@@ -108,9 +86,6 @@ impl fmt::Debug for CometS3CredentialBridge {
10886
}
10987

11088
impl CometS3CredentialBridge {
111-
/// Run `ensureInitialized` synchronously and stash the returned handle for the bridge's
112-
/// lifetime. `dispatch_key` is the bucket on the Parquet path, the catalog name on the Iceberg
113-
/// path. `catalog_properties` is forwarded into the vendor's `initialize(Map)`.
11489
pub fn new(
11590
provider_class: impl Into<String>,
11691
dispatch_key: impl Into<String>,
@@ -232,7 +207,7 @@ fn ensure_initialized(
232207
}
233208

234209
/// Construct a `java.util.HashMap<String,String>` and populate it. Called once per bridge at
235-
/// construction (per-scan), so the per-call HashMap/put cost is amortized away from the hot path.
210+
/// construction, so per-call HashMap/put cost stays off the hot path.
236211
fn build_java_string_map<'a>(
237212
env: &mut jni::Env<'a>,
238213
map: &HashMap<String, String>,

native/core/src/execution/operators/iceberg_scan.rs

Lines changed: 8 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,8 @@ use datafusion_comet_spark_expr::EvalMode;
5151
use datafusion_physical_expr_adapter::{PhysicalExprAdapter, PhysicalExprAdapterFactory};
5252
use iceberg::scan::FileScanTask;
5353

54-
/// Iceberg-namespaced activation knob for the `CometS3CredentialProvider` SPI, read from a Spark
55-
/// catalog's `s3.*` property bag (`spark.sql.catalog.<name>.s3.comet.credential.provider.class`).
56-
/// Mirrors the Hadoop-style `comet.credential.provider.class` used on the Parquet/object_store
57-
/// path, but lives here because only the Iceberg path consumes it.
54+
/// Activation key for the `CometS3CredentialProvider` SPI on the Iceberg path, read from a Spark
55+
/// catalog's `s3.*` property bag.
5856
const ICEBERG_PROVIDER_CLASS_PROPERTY: &str = "s3.comet.credential.provider.class";
5957

6058
/// Iceberg table scan operator that uses iceberg-rust to read Iceberg tables.
@@ -71,9 +69,8 @@ pub struct IcebergScanExec {
7169
/// may contain OAuth tokens, REST `credentials.uri`, and other secrets the credential bridge
7270
/// needs. Redacted in `Debug` so plan dumps and tracing do not leak credentials.
7371
catalog_properties: HashMap<String, String>,
74-
/// Spark V2 catalog name; forwarded as dispatchKey to the credential bridge so multiple
75-
/// catalogs sharing one provider FQCN get isolated provider instances. Empty when the table
76-
/// has no catalog identity.
72+
/// Spark V2 catalog name; forwarded as dispatchKey to the credential bridge. Empty when the
73+
/// table has no catalog identity.
7774
catalog_name: String,
7875
/// Pre-planned file scan tasks
7976
tasks: Vec<FileScanTask>,
@@ -269,10 +266,8 @@ impl IcebergScanExec {
269266

270267
const STORAGE_PROPERTY_PREFIXES: &[&str] = &["s3.", "gcs.", "adls.", "client."];
271268

272-
/// Wires the configured Comet credential provider into opendal's S3 service for this scan, or
273-
/// returns `None` so opendal falls back to its default credential chain. Iceberg passes its
274-
/// per-catalog properties (`spark.sql.catalog.<name>.*` after Spark stripping), so the activation
275-
/// key here is `s3.comet.credential.provider.class` to match Iceberg's `s3.*` namespace.
269+
/// Wires the configured Comet credential provider into opendal's S3 service, or returns `None`
270+
/// so opendal falls back to its default credential chain.
276271
fn build_s3_credential_loader(
277272
metadata_location: &str,
278273
catalog_properties: &HashMap<String, String>,
@@ -284,10 +279,8 @@ fn build_s3_credential_loader(
284279
.get(ICEBERG_PROVIDER_CLASS_PROPERTY)
285280
.map(|s| s.trim())
286281
.filter(|s| !s.is_empty())?;
287-
// Catalog name scopes provider instances on the JVM dispatcher so two catalogs sharing one
288-
// provider class get isolated state. Falls back to the bucket when the table has no catalog
289-
// identity (e.g. HadoopTables loaded by raw path), keeping the previous behavior in that
290-
// case.
282+
// Fall back to the bucket when the table has no catalog identity (e.g. HadoopTables loaded by
283+
// raw path).
291284
let dispatch_key: &str = if catalog_name.is_empty() {
292285
bucket
293286
} else {

native/core/src/parquet/objectstore/s3.rs

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,7 @@ pub fn create_store(
7979
source: "Missing bucket name in S3 URL".into(),
8080
})?;
8181

82-
// Parquet path: dispatch_key = bucket (matches the per-bucket override config namespace);
83-
// catalog_properties is empty since vendors on the Parquet path read from Hadoop conf, not
84-
// catalog props.
82+
// Parquet path: catalog_properties is empty; vendors here read from Hadoop conf.
8583
let empty_props: HashMap<String, String> = HashMap::new();
8684
let bridge = match lookup_provider_class(configs, bucket) {
8785
Some(provider_class) => match CometS3CredentialBridge::new(
@@ -322,14 +320,10 @@ pub(super) fn get_config_trimmed<'a>(
322320
get_config(configs, bucket, property).map(|s| s.trim())
323321
}
324322

325-
/// Hadoop-style config key (without `fs.s3a.` prefix) naming the vendor `CometS3CredentialProvider`
326-
/// FQCN. Looked up via [`get_config_trimmed`] so the per-bucket override
327-
/// (`fs.s3a.bucket.<name>.comet.credential.provider.class`) is honored before the global
328-
/// (`fs.s3a.comet.credential.provider.class`).
323+
/// Activation key (without `fs.s3a.` prefix) naming the vendor `CometS3CredentialProvider` FQCN.
324+
/// Per-bucket override is honored via [`get_config_trimmed`].
329325
const PROVIDER_CLASS_PROPERTY: &str = "comet.credential.provider.class";
330326

331-
/// Returns the configured `CometS3CredentialProvider` FQCN for the given bucket, or `None` when
332-
/// no provider is registered. Trims surrounding whitespace and treats an empty string as unset.
333327
fn lookup_provider_class<'a>(
334328
configs: &'a HashMap<String, String>,
335329
bucket: &str,

native/jni-bridge/src/lib.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,8 +236,6 @@ pub struct JVMClasses<'a> {
236236
/// `None` if the class is not on the classpath.
237237
pub comet_udf_bridge: Option<CometUdfBridge<'a>>,
238238
/// JNI handles for the CometS3CredentialDispatcher SPI and the CometS3Credentials POJO.
239-
/// Always present (the classes ship in `comet-common`); whether a vendor provider is actually
240-
/// registered is a separate runtime check.
241239
pub comet_s3_credential_dispatcher: CometS3CredentialDispatcher<'a>,
242240
}
243241

spark/src/main/java/org/apache/comet/cloud/s3/CometS3CredentialContext.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,8 @@
2424
/**
2525
* Per-request context passed to {@link
2626
* CometS3CredentialProvider#getCredentialsForPath(CometS3CredentialContext)}. New fields can be
27-
* added here without breaking vendors compiled against earlier Comet versions, so the SPI method
28-
* signature does not change when Comet learns to forward additional per-request information (e.g.
29-
* region, write-target ARN).
27+
* added here without changing the SPI method signature, so vendors compiled against earlier
28+
* versions stay binary-compatible.
3029
*/
3130
public final class CometS3CredentialContext {
3231

spark/src/main/java/org/apache/comet/cloud/s3/CometS3CredentialProvider.java

Lines changed: 11 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,50 +22,32 @@
2222
import java.util.Map;
2323

2424
/**
25-
* SPI for supplying AWS credentials to Comet's native S3 readers, which bypass Spark's Hadoop S3A
26-
* code path. Vendors implement this when path-aware or vendor-managed credential mechanisms cannot
27-
* be reached through the standard parameterless {@code AWSCredentialsProvider.getCredentials()}
28-
* contract.
25+
* SPI for supplying AWS credentials to Comet's native S3 readers, which bypass Hadoop S3A. See the
26+
* user guide (operator setup, vendor contract) and the contributor-guide design notes for the
27+
* rationale.
2928
*
30-
* <p>Vendors register an implementation by setting {@code
31-
* spark.hadoop.fs.s3a.comet.credential.provider.class} (or the per-bucket form {@code
32-
* spark.hadoop.fs.s3a.bucket.<name>.comet.credential.provider.class}) for the Parquet path, or
33-
* {@code spark.sql.catalog.<catalog>.s3.comet.credential.provider.class} for the Iceberg path. The
34-
* class must have a public no-arg constructor.
35-
*
36-
* <p>{@link #initialize(Map)} runs once per Comet-cached instance before any {@link
37-
* #getCredentialsForPath} call, must be cheap and non-blocking, and may receive secrets in its map.
38-
* {@link #getCredentialsForPath} may be invoked concurrently from many native worker threads so
39-
* implementations must be thread-safe; it returns credentials or throws (no fall-through).
40-
*
41-
* <p>See the user guide on S3 credential providers for caching, refresh, and multi-tenant isolation
42-
* guidance.
29+
* <p>{@link #getCredentialsForPath} may be invoked concurrently from many native worker threads;
30+
* implementations must be thread-safe. It returns credentials or throws (no fall-through).
4331
*/
4432
public interface CometS3CredentialProvider extends AutoCloseable {
4533

4634
/**
47-
* Called once per {@code (FQCN, dispatchKey)} on each executor before any {@link
48-
* #getCredentialsForPath} call. The {@code catalogProperties} map carries the full FileIO
49-
* property bag for the Iceberg path (including {@code credentials.uri}, OAuth tokens, vendor keys
50-
* like {@code tenant-id}) and is empty on the Parquet path. The default no-op keeps Parquet
51-
* vendors source-compatible.
35+
* Called once per Comet-cached instance before any {@link #getCredentialsForPath} call. Must be
36+
* cheap and non-blocking. On the Iceberg path the map carries the unfiltered FileIO bag; on the
37+
* Parquet path it is empty.
5238
*
53-
* @param catalogProperties unfiltered FileIO/catalog properties; may contain secrets, do not log
39+
* @param catalogProperties may contain secrets, do not log
5440
*/
5541
default void initialize(Map<String, String> catalogProperties) {}
5642

5743
/**
58-
* @param context per-request context (bucket, path, access mode). Fields can be added to {@link
59-
* CometS3CredentialContext} in future Comet releases without changing this method signature,
60-
* so vendors compiled against today's API stay binary-compatible.
6144
* @return non-null credentials; {@code null} is a contract violation
6245
*/
6346
CometS3Credentials getCredentialsForPath(CometS3CredentialContext context) throws Exception;
6447

6548
/**
66-
* Releases vendor-owned resources (HTTP clients, refresh executors, STS connection pools).
67-
* Invoked from a best-effort JVM shutdown hook installed by the dispatcher; default no-op suits
68-
* stateless providers. The hook swallows exceptions thrown here.
49+
* Invoked from the dispatcher's best-effort JVM shutdown hook. Default no-op suits stateless
50+
* providers; override to release HTTP clients, refresh executors, etc.
6951
*/
7052
@Override
7153
default void close() throws Exception {}

spark/src/main/java/org/apache/comet/cloud/s3/CometS3Credentials.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,9 @@
2222
import java.util.Objects;
2323

2424
/**
25-
* Credentials returned by a {@link CometS3CredentialProvider}. Fields are read back over JNI by
26-
* name, so the field names are part of the cross-language contract.
27-
*
28-
* <p>{@code sessionToken} is null for non-STS credentials. {@code expirationEpochMillis} of {@code
29-
* 0} means "unknown"; the Iceberg path then caps opendal's cache at a short fallback to avoid
30-
* serving stale credentials for the executor lifetime.
25+
* Credentials returned by a {@link CometS3CredentialProvider}. Field names are read back over JNI
26+
* by name and are part of the cross-language contract. {@code sessionToken} is null for non-STS
27+
* credentials; {@code expirationEpochMillis} of {@code 0} means "unknown".
3128
*/
3229
public final class CometS3Credentials {
3330

common/src/main/java/org/apache/comet/util/ClassLoaders.java renamed to spark/src/main/java/org/apache/comet/util/ClassLoaders.java

File renamed without changes.

spark/src/main/scala/org/apache/comet/iceberg/IcebergReflection.scala

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -664,10 +664,8 @@ object IcebergReflection extends Logging {
664664
* @param catalogProperties
665665
* Catalog properties for FileIO (S3 credentials, regions, etc.)
666666
* @param catalogName
667-
* Spark V2 catalog name that loaded this table, if it can be derived. Forwarded as
668-
* `dispatchKey` to the native CometS3CredentialBridge so two catalogs sharing one provider FQCN
669-
* get isolated provider instances. `None` when the table has no catalog identity (e.g.
670-
* HadoopTables loaded by raw path).
667+
* Spark V2 catalog name forwarded as `dispatchKey` to CometS3CredentialBridge. `None` when the
668+
* table has no catalog identity (e.g. HadoopTables loaded by raw path).
671669
*/
672670
case class CometIcebergNativeScanMetadata(
673671
table: Any,
@@ -740,20 +738,16 @@ object CometIcebergNativeScanMetadata extends Logging {
740738
}
741739

742740
/**
743-
* Best-effort extraction of the Spark V2 catalog name from an Iceberg `Table`. Iceberg's
744-
* `Table.name()` returns `catalog.namespace.table` for tables loaded through a catalog. We
745-
* intersect that name against the V2 catalogs Spark has registered so a value like `s3.foo` is
746-
* not mistaken for a catalog `s3` when no such catalog exists. Falls back to the dotted-prefix
747-
* split when the catalog manager is not reachable or the name does not match. Returns `None`
748-
* when the table has no catalog identity (e.g. HadoopTables loaded by raw path) or when
749-
* reflection fails.
741+
* Extracts the Spark V2 catalog name from an Iceberg `Table`. `Table.name()` returns
742+
* `catalog.namespace.table` for tables loaded through a catalog; we intersect against the
743+
* registered V2 catalogs so a value like `s3.foo` is not mistaken for a catalog `s3`. Returns
744+
* `None` for HadoopTables loaded by raw path or when reflection fails.
750745
*/
751746
private[iceberg] def deriveCatalogName(table: Any): Option[String] =
752747
deriveCatalogName(table, registeredCatalogNames _)
753748

754749
/**
755-
* Test seam for [[deriveCatalogName(table:Any)]]. The `knownCatalogNames` thunk lets tests
756-
* inject a fixed catalog set without bootstrapping a SparkSession.
750+
* Test seam that lets tests inject a fixed catalog set without bootstrapping a SparkSession.
757751
*/
758752
private[iceberg] def deriveCatalogName(
759753
table: Any,
@@ -773,15 +767,6 @@ object CometIcebergNativeScanMetadata extends Logging {
773767
}
774768
}
775769

776-
/**
777-
* Calls Iceberg's public `Table.name()` reflectively. Uses `getMethod` so the interface default
778-
* is reachable when a concrete table class does not override it. Matches the pattern used for
779-
* `Field.name()` / `Column.name()` elsewhere in this file.
780-
*
781-
* `name()` is a default method on `org.apache.iceberg.Table`. A thrown exception here means the
782-
* classpath is wrong or the object is not actually an Iceberg Table, so log at `warn` to make
783-
* it visible. A `null` return is legitimate (anonymous tables) and not noteworthy.
784-
*/
785770
private def invokeTableName(table: Any): Option[String] = {
786771
try {
787772
table.getClass.getMethod("name").invoke(table) match {
@@ -791,10 +776,8 @@ object CometIcebergNativeScanMetadata extends Logging {
791776
}
792777
} catch {
793778
case e: Exception =>
794-
logWarning(
795-
s"Iceberg reflection: Table.name() not callable on ${table.getClass.getName}; " +
796-
"native S3 credential dispatch will fall back to bucket-keyed isolation: " +
797-
s"${e.getMessage}")
779+
logWarning(s"Iceberg reflection: Table.name() not callable on ${table.getClass.getName}; " +
780+
s"native S3 credential dispatch will fall back to bucket-keyed isolation: ${e.getMessage}")
798781
None
799782
}
800783
}

0 commit comments

Comments
 (0)