Skip to content

Commit bd5f363

Browse files
authored
Buffer pool refactor (#584)
1 parent fadd3dc commit bd5f363

22 files changed

+2577
-221
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ jobs:
223223
uses: actions/checkout@v4
224224
- name: Build ${{ env.PACKAGE_NAME }} + consumers
225225
run: |
226+
python3 -m venv .venv
227+
source .venv/bin/activate
226228
python3 -c "from urllib.request import urlretrieve; urlretrieve('${{ env.BUILDER_HOST }}/${{ env.BUILDER_SOURCE }}/${{ env.BUILDER_VERSION }}/builder.pyz?run=${{ env.RUN }}', 'builder')"
227229
chmod a+x builder
228230
./builder build -p ${{ env.PACKAGE_NAME }} --cmake-extra=-DASSERT_LOCK_HELD=ON

include/aws/s3/private/s3_auto_ranged_get.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ struct aws_s3_auto_ranged_get {
2323

2424
/* Estimated object stored part size based on ETag analysis */
2525
uint64_t estimated_object_stored_part_size;
26+
/* Number of parts stored in S3. We derive this from the ETag; if the ETag is not formatted as expected, this will
27+
 * default to 1.
28+
 * Note: For S3Express Append, the object will be treated as a single part, even though it can be stored as
29+
 * multiple parts in S3.
30+
 */
31+
uint64_t num_stored_parts;
2632
/* Part size was set or not from user for this meta request. */
2733
bool part_size_set;
2834
bool force_dynamic_part_size;

include/aws/s3/private/s3_default_buffer_pool.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
* SPDX-License-Identifier: Apache-2.0.
77
*/
88

9+
#include <aws/common/hash_table.h>
10+
#include <aws/common/mutex.h>
911
#include <aws/s3/s3.h>
1012
#include <aws/s3/s3_buffer_pool.h>
1113

@@ -59,11 +61,79 @@ struct aws_s3_default_buffer_pool_usage_stats {
5961
/* Secondary memory reserved, but not yet used. Accurate, maps directly to base allocator. */
6062
size_t secondary_reserved;
6163

64+
/* Overall memory allocated for special-sized blocks. */
65+
size_t special_blocks_allocated;
66+
/* Number of special block sizes created. */
67+
size_t special_blocks_num;
68+
/* Memory reserved in special-sized blocks. */
69+
size_t special_blocks_reserved;
70+
/* Memory used in special-sized blocks. */
71+
size_t special_blocks_used;
72+
6273
/* Bytes used in "forced" buffers (created even if they exceed memory limits).
6374
* This is always <= primary_used + secondary_used */
6475
size_t forced_used;
6576
};
6677

78+
/* Structure to track special-sized blocks */
79+
struct s3_special_block_list {
80+
struct aws_allocator *allocator;
81+
size_t buffer_size; /* Size of buffers in this list */
82+
struct aws_array_list blocks; /* Array of uint8_t* pointers to allocated blocks */
83+
};
84+
85+
struct aws_s3_default_buffer_pool {
86+
struct aws_allocator *base_allocator;
87+
struct aws_mutex mutex;
88+
89+
size_t block_size;
90+
size_t chunk_size;
91+
/* size at which allocations should go to secondary */
92+
size_t primary_size_cutoff;
93+
94+
/* NOTE: See aws_s3_buffer_pool_usage_stats for descriptions of most fields */
95+
96+
size_t mem_limit;
97+
98+
size_t primary_allocated;
99+
size_t primary_reserved;
100+
size_t primary_used;
101+
102+
size_t special_blocks_allocated;
103+
size_t special_blocks_reserved;
104+
size_t special_blocks_used;
105+
106+
size_t secondary_reserved;
107+
size_t secondary_used;
108+
109+
size_t forced_used;
110+
111+
struct aws_array_list blocks;
112+
113+
struct aws_linked_list pending_reserves;
114+
115+
/* Special-sized blocks: hash table mapping size -> struct s3_special_block_list * */
116+
/* TODO: discuss the special-list lifetime. Should we just keep it with the memory pool? The concern is that
117+
 * the pool lives as long as the client, which may result in all sorts of special lists staying around. */
118+
struct aws_hash_table special_blocks;
119+
120+
/* TEST ONLY: to force the special blocks alive during trim. */
121+
bool force_keeping_special_blocks;
122+
};
123+
124+
struct s3_pending_reserve {
125+
struct aws_linked_list_node node;
126+
struct aws_future_s3_buffer_ticket *ticket_future;
127+
struct aws_s3_default_buffer_ticket *ticket;
128+
struct aws_s3_buffer_pool_reserve_meta meta;
129+
};
130+
131+
struct s3_buffer_pool_block {
132+
size_t block_size;
133+
uint8_t *block_ptr;
134+
uint16_t alloc_bit_mask;
135+
};
136+
67137
/*
68138
* Create new buffer pool.
69139
* chunk_size - specifies the size of memory that will most commonly be acquired

include/aws/s3/private/s3_meta_request_impl.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,8 @@ struct aws_s3_meta_request {
157157

158158
/* Part size to use for uploads and downloads. Passed down by the creating client. */
159159
const size_t part_size;
160+
/* Hard limit on max connections set through the meta request option. */
161+
const uint32_t max_active_connections_override;
160162

161163
struct aws_cached_signing_config_aws *cached_signing_config;
162164

@@ -166,6 +168,9 @@ struct aws_s3_meta_request {
166168

167169
struct aws_s3_endpoint *endpoint;
168170

171+
/* Number of requests being sent/received over network for the meta request. */
172+
struct aws_atomic_var num_requests_network;
173+
169174
/* Event loop to schedule IO work related on, ie, reading from streams, streaming parts back to the caller, etc...
170175
* After the meta request is finished, this will be reset along with the client reference.*/
171176
struct aws_event_loop *io_event_loop;
@@ -185,6 +190,10 @@ struct aws_s3_meta_request {
185190

186191
enum aws_s3_meta_request_type type;
187192
struct aws_string *s3express_session_host;
193+
/* Whether the meta request is made to an S3 Express bucket. */
194+
bool is_express;
195+
/* Whether the buffer pool has been optimized for this meta request's specific part size. */
196+
bool buffer_pool_optimized;
188197

189198
struct {
190199
struct aws_mutex lock;
@@ -269,6 +278,9 @@ struct aws_s3_meta_request {
269278
/* True if this meta request is currently in the client's list. */
270279
bool scheduled;
271280

281+
/* Track the number of requests being prepared for this meta request. */
282+
size_t num_request_being_prepared;
283+
272284
} client_process_work_threaded_data;
273285

274286
/* Anything in this structure should only ever be accessed by the meta-request from its io_event_loop thread. */

include/aws/s3/private/s3_util.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,14 @@ extern const uint64_t g_default_max_part_size;
169169

170170
AWS_S3_API
171171
extern const uint64_t g_s3_optimal_range_size_alignment;
172+
173+
AWS_S3_API
174+
extern const uint32_t g_s3express_connection_limitation;
175+
AWS_S3_API
176+
extern const uint64_t g_s3express_connection_limitation_part_size_threshold;
177+
AWS_S3_API
178+
extern const uint64_t g_s3express_connection_limitation_object_size_threshold;
179+
172180
/**
173181
* Returns AWS_S3_REQUEST_TYPE_UNKNOWN if name doesn't map to an enum value.
174182
*/
@@ -359,13 +367,15 @@ int aws_s3_calculate_client_optimal_range_size(
359367
*
360368
* @param client_optimal_range_size The client-level optimal range size from initialization
361369
* @param estimated_object_stored_part_size Estimated size of object stored parts in S3
370+
* @param is_express Whether the request is an S3 Express request.
362371
* @param out_request_optimal_range_size Output parameter for calculated request-level optimal range size
363372
* @return AWS_OP_SUCCESS on success, AWS_OP_ERR on failure (caller should fall back to client size)
364373
*/
365374
AWS_S3_API
366375
int aws_s3_calculate_request_optimal_range_size(
367376
uint64_t client_optimal_range_size,
368377
uint64_t estimated_object_stored_part_size,
378+
bool is_express,
369379
uint64_t *out_request_optimal_range_size);
370380

371381
/**

include/aws/s3/s3_buffer_pool.h

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,24 @@ struct aws_s3_buffer_pool_vtable {
102102
**/
103103
void (*trim)(struct aws_s3_buffer_pool *pool);
104104

105+
/**
106+
* Optimize the buffer pool for allocations of a specific size.
107+
* Creates a separate list of blocks dedicated to this size for better memory efficiency.
108+
*/
109+
int (*add_special_size)(struct aws_s3_buffer_pool *pool, size_t buffer_size);
110+
111+
/**
112+
* Release all special-sized blocks from the buffer pool.
113+
* This frees all memory allocated for the special size optimization.
114+
*/
115+
void (*release_special_size)(struct aws_s3_buffer_pool *pool, size_t buffer_size);
116+
117+
/**
118+
* Align a range size to the buffer pool's allocation strategy.
119+
* Returns the optimal aligned size based on the buffer pool's configuration.
120+
*/
121+
uint64_t (*derive_aligned_buffer_size)(struct aws_s3_buffer_pool *pool, uint64_t size);
122+
105123
/* Implement below for custom ref count behavior. Alternatively set those to null and init the ref count. */
106124
struct aws_s3_buffer_pool *(*acquire)(struct aws_s3_buffer_pool *pool);
107125
struct aws_s3_buffer_pool *(*release)(struct aws_s3_buffer_pool *pool);
@@ -144,6 +162,41 @@ typedef struct aws_s3_buffer_pool *(aws_s3_buffer_pool_factory_fn)(struct aws_al
144162
struct aws_s3_buffer_pool_config config,
145163
void *user_data);
146164

165+
/**
166+
* Optimize the buffer pool for allocations of a specific size.
167+
* Creates a separate list of blocks dedicated to this size for better memory efficiency.
168+
* Allocations of exactly this size will use these special blocks instead of the regular primary/secondary storage.
169+
*
170+
* @param buffer_pool The buffer pool to optimize
171+
* @param buffer_size The size to optimize for (must be > 0)
172+
* @return AWS_OP_SUCCESS on success, AWS_OP_ERR on failure
173+
*/
174+
AWS_S3_API
175+
int aws_s3_buffer_pool_add_special_size(struct aws_s3_buffer_pool *buffer_pool, size_t buffer_size);
176+
177+
/**
178+
* Release the special-sized blocks from the buffer pool.
179+
* Should be called when done with the special-sized allocations.
180+
*
181+
* @param buffer_pool The buffer pool
182+
* @param buffer_size The special size to release blocks for
183+
*/
184+
AWS_S3_API
185+
void aws_s3_buffer_pool_release_special_size(struct aws_s3_buffer_pool *buffer_pool, size_t buffer_size);
186+
187+
/**
188+
* Align a range size to the buffer pool's allocation strategy.
189+
* This function determines the optimal aligned size based on the buffer pool's configuration.
190+
* For sizes within the primary allocation range, it aligns to chunk boundaries.
191+
* For larger sizes that go to secondary storage, it returns the size as-is.
192+
*
193+
* @param buffer_pool The buffer pool to use for alignment (can be NULL, in which case size is returned unchanged)
194+
* @param size The size to align
195+
* @return The aligned size that's optimal for the buffer pool's allocation strategy
196+
*/
197+
AWS_S3_API
198+
uint64_t aws_s3_buffer_pool_derive_aligned_buffer_size(struct aws_s3_buffer_pool *buffer_pool, uint64_t size);
199+
147200
AWS_EXTERN_C_END
148201
AWS_POP_SANE_WARNING_LEVEL
149202

include/aws/s3/s3_client.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,6 +1005,10 @@ struct aws_s3_meta_request_options {
10051005
* This will be ignored for other operations.
10061006
*/
10071007
struct aws_byte_cursor copy_source_uri;
1008+
1009+
/* When set, this caps the number of active connections for the meta request. When 0 (recommended), the client
1010+
 * determines it based on client-side settings. */
1011+
uint32_t max_active_connections_override;
10081012
};
10091013

10101014
/* Result details of a meta request.

source/s3_auto_ranged_get.c

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -787,7 +787,7 @@ static void s_s3_auto_ranged_get_request_finished(
787787
error_code = AWS_ERROR_S3_MISSING_ETAG;
788788
goto update_synced_data;
789789
}
790-
/* Extract number of parts from ETag and calculate estimated part size */
790+
/* Extract number of parts stored in S3 from ETag and calculate estimated part size */
791791
uint32_t num_parts = 0;
792792
if (aws_s3_extract_parts_from_etag(etag_header_value, &num_parts) == AWS_OP_SUCCESS && num_parts > 0) {
793793
auto_ranged_get->estimated_object_stored_part_size = object_size / num_parts;
@@ -801,6 +801,7 @@ static void s_s3_auto_ranged_get_request_finished(
801801
num_parts,
802802
auto_ranged_get->estimated_object_stored_part_size);
803803
} else {
804+
num_parts = 1;
804805
/* Failed to parse ETags */
805806
AWS_LOGF_WARN(
806807
AWS_LS_S3_META_REQUEST,
@@ -809,24 +810,49 @@ static void s_s3_auto_ranged_get_request_finished(
809810
auto_ranged_get->estimated_object_stored_part_size = g_default_part_size_fallback;
810811
goto update_synced_data;
811812
}
813+
auto_ranged_get->num_stored_parts = num_parts;
812814
}
813815

814816
/* If we were able to discover the object-range/content length successfully, then any error code that was passed
815817
* into this function is being handled and does not indicate an overall failure.*/
816818
error_code = AWS_ERROR_SUCCESS;
817819
found_object_size = true;
820+
uint32_t max_connections = aws_s3_client_get_max_active_connections(meta_request->client, meta_request);
818821

819822
if (auto_ranged_get->force_dynamic_part_size ||
820823
(!auto_ranged_get->part_size_set && !meta_request->client->part_size_set)) {
821824
/* No part size has been set from user. Now we use the optimal part size based on the throughput and memory
822825
* limit */
823826
uint64_t out_request_optimal_range_size = 0;
827+
824828
if (aws_s3_calculate_request_optimal_range_size(
825829
meta_request->client->optimal_range_size,
826830
auto_ranged_get->estimated_object_stored_part_size,
831+
meta_request->is_express,
827832
&out_request_optimal_range_size) == AWS_OP_SUCCESS) {
833+
/* Apply a buffer pool alignment to the calculated result. */
834+
out_request_optimal_range_size = aws_s3_buffer_pool_derive_aligned_buffer_size(
835+
meta_request->client->buffer_pool, out_request_optimal_range_size);
836+
AWS_LOGF_INFO(
837+
AWS_LS_S3_META_REQUEST,
838+
"id=%p: Override the part size to be optimal. part_size=%" PRIu64 ".",
839+
(void *)meta_request,
840+
out_request_optimal_range_size);
828841
/* Override the part size to be optimal */
829842
*((size_t *)&meta_request->part_size) = (size_t)out_request_optimal_range_size;
843+
uint64_t parts_threshold = aws_mul_u64_saturating(max_connections, 2);
844+
if (auto_ranged_get->num_stored_parts > parts_threshold) {
845+
/* The number of parts is greater than the threshold, so buffers will be reused from the
846+
 * buffer pool often enough to justify adding a special block size to the buffer pool to
847+
 * optimize this case. */
848+
AWS_LOGF_INFO(
849+
AWS_LS_S3_META_REQUEST,
850+
"id=%p: Apply buffer pool optimization for the size=%zu.",
851+
(void *)meta_request,
852+
meta_request->part_size);
853+
aws_s3_buffer_pool_add_special_size(meta_request->client->buffer_pool, meta_request->part_size);
854+
meta_request->buffer_pool_optimized = true;
855+
}
830856
if (request->request_tag == AWS_S3_AUTO_RANGE_GET_REQUEST_TYPE_HEAD_OBJECT) {
831857
/* Update the first part size as well, if we haven't made the request yet. */
832858
first_part_size = meta_request->part_size;
@@ -847,6 +873,31 @@ static void s_s3_auto_ranged_get_request_finished(
847873
}
848874
}
849875

876+
if (meta_request->is_express &&
877+
meta_request->part_size < g_s3express_connection_limitation_part_size_threshold &&
878+
object_size > g_s3express_connection_limitation_object_size_threshold) {
879+
/**
880+
* TODO: THIS IS A TEMP WORKAROUND, not the long term solution.
881+
* 1. If the Part Size we set is larger than the possible size to hit the limitation, we are safe to
882+
* make as many connections as we want.
883+
* 2. If the object size is less than the threshold, we keep our previous behavior, as it's less likely
884+
* to hit the server side limitation.
885+
*
886+
* Otherwise, we need to make sure the number of concurrent connections is lower than the limitation.
887+
*/
888+
uint32_t max_active_connections_override = aws_min_u32(g_s3express_connection_limitation, max_connections);
889+
if (max_active_connections_override < max_connections) {
890+
/* Override the max active connections to be the limitation. */
891+
*((uint32_t *)&meta_request->max_active_connections_override) =
892+
(uint32_t)max_active_connections_override;
893+
AWS_LOGF_WARN(
894+
AWS_LS_S3_META_REQUEST,
895+
"id=%p: Override the max active connections for the meta request to be the limitation: %d",
896+
(void *)meta_request,
897+
max_active_connections_override);
898+
}
899+
}
900+
850901
if (!empty_file_error && meta_request->headers_callback != NULL) {
851902
/* Modify the header received to fake the header for the whole meta request. */
852903
if (request->request_tag == AWS_S3_AUTO_RANGE_GET_REQUEST_TYPE_GET_OBJECT_WITH_RANGE ||
@@ -936,8 +987,8 @@ static void s_s3_auto_ranged_get_request_finished(
936987
if (empty_file_error) {
937988
/*
938989
* Try to download the object again using GET_OBJECT_WITH_PART_NUMBER_1. If the file is still
939-
* empty, successful response headers will be provided to users. If not, the newer version of the
940-
* file will be downloaded.
990+
* empty, successful response headers will be provided to users. If not, the newer version of
991+
* the file will be downloaded.
941992
*/
942993
auto_ranged_get->synced_data.num_parts_requested = 0;
943994
auto_ranged_get->synced_data.object_range_known = 0;
@@ -999,7 +1050,8 @@ static void s_s3_auto_ranged_get_request_finished(
9991050
}
10001051
aws_s3_meta_request_set_fail_synced(meta_request, request, error_code);
10011052
if (error_code == AWS_ERROR_S3_RESPONSE_CHECKSUM_MISMATCH) {
1002-
/* It's a mismatch of checksum, tell user that we validated the checksum and the algorithm we validated
1053+
/* It's a mismatch of checksum, tell user that we validated the checksum and the algorithm we
1054+
* validated
10031055
*/
10041056
meta_request->synced_data.finish_result.did_validate = true;
10051057
meta_request->synced_data.finish_result.validation_algorithm = request->validation_algorithm;

0 commit comments

Comments
 (0)