Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions include/aws/s3/private/s3_meta_request_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,13 @@ struct aws_s3_meta_request {
FILE *recv_file;
struct aws_string *recv_filepath;
bool recv_file_delete_on_failure;
/* When true, use O_DIRECT for writing received data to file */
bool recv_file_direct_io;
/* Counter for how many times we fell back from O_DIRECT to buffered I/O for a single part.
* For unaligned last part: expected to be 1.
* For unsupported platform: 1 on first fallback, then direct_io is disabled (no further increments).
* The warning is only logged when this transitions from 0, to avoid log spam. */
size_t recv_file_direct_io_fallback_count;

/* File I/O options. */
struct aws_s3_file_io_options fio_opts;
Expand Down
5 changes: 4 additions & 1 deletion include/aws/s3/s3_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,10 @@ struct aws_s3_file_io_options {
* Enable direct IO to bypass the OS cache. Helpful when the disk I/O outperforms the kernel cache.
* Notes:
* - Only supported on linux for now.
* - Only supports upload for now.
* - Supported for both upload (send_filepath) and download (recv_filepath).
* - For download, O_DIRECT is only supported with AWS_S3_RECV_FILE_CREATE_OR_REPLACE
* and AWS_S3_RECV_FILE_CREATE_NEW (i.e. writing from the beginning of the file).
* APPEND and WRITE_TO_POSITION are not supported with O_DIRECT.
* - Check NOTES for O_DIRECT for additional info https://man7.org/linux/man-pages/man2/openat.2.html
* In summary, O_DIRECT is a potentially powerful tool that should be used with caution.
*/
Expand Down
11 changes: 11 additions & 0 deletions source/s3_auto_ranged_get.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "aws/s3/private/s3_request_messages.h"
#include "aws/s3/private/s3_util.h"
#include <aws/common/string.h>
#include <aws/common/system_info.h>
#include <inttypes.h>

/* Don't use the buffer pool when we know the response size, and it's below this number,
Expand Down Expand Up @@ -837,6 +838,16 @@ static void s_s3_auto_ranged_get_request_finished(
/* Apply a buffer pool alignment to the calculated result. */
out_request_optimal_range_size = aws_s3_buffer_pool_derive_aligned_buffer_size(
meta_request->client->buffer_pool, out_request_optimal_range_size);
/* For O_DIRECT download, also ensure page alignment.
* Buffer pool typically aligns to chunk_size which is page-aligned, but apply
* a defensive round-up here to guarantee the invariant. */
if (meta_request->recv_file_direct_io) {
size_t page_size = aws_system_info_page_size();
if (out_request_optimal_range_size % page_size != 0) {
out_request_optimal_range_size =
((out_request_optimal_range_size / page_size) + 1) * page_size;
}
}
AWS_LOGF_INFO(
AWS_LS_S3_META_REQUEST,
"id=%p: Override the part size to be optimal. part_size=%" PRIu64 ".",
Expand Down
176 changes: 151 additions & 25 deletions source/s3_meta_request.c
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,27 @@ int aws_s3_meta_request_init_base(
if (options->recv_filepath.len > 0) {

meta_request->recv_filepath = aws_string_new_from_cursor(allocator, &options->recv_filepath);
meta_request->recv_file_delete_on_failure = options->recv_file_delete_on_failure;

bool direct_io = meta_request->fio_opts.direct_io;

/* For O_DIRECT download, part_size must be page-aligned so that all parts except the last
* can be written via O_DIRECT. The last part's unaligned tail falls back to buffered write. */
if (direct_io) {
size_t page_size = aws_system_info_page_size();
if (part_size % page_size != 0) {
AWS_LOGF_ERROR(
AWS_LS_S3_META_REQUEST,
"id=%p: Invalid meta request configuration - direct_io download requires part size "
"to be aligned with page size. part size is:%zu, while page size is:%zu",
(void *)meta_request,
part_size,
page_size);
aws_raise_error(AWS_ERROR_INVALID_ARGUMENT);
goto error;
}
}

switch (options->recv_file_option) {
case AWS_S3_RECV_FILE_CREATE_OR_REPLACE:
meta_request->recv_file = aws_fopen(aws_string_c_str(meta_request->recv_filepath), "wb");
Expand All @@ -299,40 +320,65 @@ int aws_s3_meta_request_init_base(
"id=%p Cannot receive file via CREATE_NEW: file already exists",
(void *)meta_request);
aws_raise_error(AWS_ERROR_S3_RECV_FILE_ALREADY_EXISTS);
break;
} else {
meta_request->recv_file = aws_fopen(aws_string_c_str(meta_request->recv_filepath), "wb");
break;
goto error;
}
meta_request->recv_file = aws_fopen(aws_string_c_str(meta_request->recv_filepath), "wb");
break;

case AWS_S3_RECV_FILE_CREATE_OR_APPEND:
if (direct_io) {
AWS_LOGF_ERROR(
AWS_LS_S3_META_REQUEST,
"id=%p O_DIRECT for download is only supported with CREATE_OR_REPLACE and CREATE_NEW",
(void *)meta_request);
aws_raise_error(AWS_ERROR_UNSUPPORTED_OPERATION);
goto error;
}
meta_request->recv_file = aws_fopen(aws_string_c_str(meta_request->recv_filepath), "ab");
break;

case AWS_S3_RECV_FILE_WRITE_TO_POSITION:
if (direct_io) {
AWS_LOGF_ERROR(
AWS_LS_S3_META_REQUEST,
"id=%p O_DIRECT for download is only supported with CREATE_OR_REPLACE and CREATE_NEW",
(void *)meta_request);
aws_raise_error(AWS_ERROR_UNSUPPORTED_OPERATION);
goto error;
}
if (!aws_path_exists(meta_request->recv_filepath)) {
AWS_LOGF_ERROR(
AWS_LS_S3_META_REQUEST,
"id=%p Cannot receive file via WRITE_TO_POSITION: file not found.",
(void *)meta_request);
aws_raise_error(AWS_ERROR_S3_RECV_FILE_NOT_FOUND);
break;
} else {
meta_request->recv_file = aws_fopen(aws_string_c_str(meta_request->recv_filepath), "r+");
if (meta_request->recv_file &&
aws_fseek(meta_request->recv_file, options->recv_file_position, SEEK_SET) != AWS_OP_SUCCESS) {
/* error out. */
goto error;
}
break;
goto error;
}
meta_request->recv_file = aws_fopen(aws_string_c_str(meta_request->recv_filepath), "r+");
if (meta_request->recv_file &&
aws_fseek(meta_request->recv_file, options->recv_file_position, SEEK_SET) != AWS_OP_SUCCESS) {
goto error;
}
break;

default:
AWS_ASSERT(false);
aws_raise_error(AWS_ERROR_INVALID_ARGUMENT);
break;
goto error;
}

if (!meta_request->recv_file) {
goto error;
}

/* For O_DIRECT, the file is already created via aws_fopen above (file now exists on disk).
* Keep the FILE* open — it's used as the fallback when O_DIRECT can't be used
* (unaligned last part or platform doesn't support O_DIRECT). */
if (direct_io) {
meta_request->recv_file_direct_io = true;
AWS_LOGF_DEBUG(
AWS_LS_S3_META_REQUEST, "id=%p: O_DIRECT enabled for download write path.", (void *)meta_request);
}
}

/* If the request's body is being passed in some other way, set that up.
Expand Down Expand Up @@ -569,6 +615,9 @@ static void s_s3_meta_request_destroy(void *user_data) {
/* If the meta request had succeeded, the file would already have been closed by the finish call, so it must have failed. */
aws_file_delete(meta_request->recv_filepath);
}
} else if (meta_request->recv_file_direct_io && meta_request->recv_file_delete_on_failure) {
/* O_DIRECT path: no FILE* to close, but still honor delete-on-failure during teardown */
aws_file_delete(meta_request->recv_filepath);
}
aws_string_destroy(meta_request->recv_filepath);

Expand Down Expand Up @@ -2014,6 +2063,24 @@ static bool s_should_apply_backpressure(struct aws_s3_request *request) {
return false;
}

/**
 * Helper: write the response body to meta_request->recv_file with a single fwrite() call.
 *
 * @param meta_request   Meta request whose recv_file stream receives the data.
 * @param response_body  Bytes to write (ptr/len).
 * @param out_error_code Set to aws_last_error() when the write fails; left untouched on success.
 *
 * On failure the error is raised through aws_translate_and_raise_io_error_or() (with
 * AWS_ERROR_FILE_WRITE_FAILURE passed as the fallback code) and logged.
 *
 * An empty body is treated as a successful no-op: fwrite() returns 0 whenever size or nmemb
 * is 0, so without the guard below a zero-length write would be misreported as a failure.
 */
static void s_buffered_write_to_recv_file(
    struct aws_s3_meta_request *meta_request,
    const struct aws_byte_cursor *response_body,
    int *out_error_code) {

    if (response_body->len == 0) {
        /* Nothing to write; do not let fwrite()'s 0 return value look like an error. */
        return;
    }

    /* Write the whole body as one item, so a return value < 1 means the write did not complete. */
    if (fwrite((void *)response_body->ptr, response_body->len, 1, meta_request->recv_file) < 1) {
        /* Always cache errno right away; only trust it when the stream's error flag is set. */
        int errno_value = ferror(meta_request->recv_file) ? errno : 0;
        aws_translate_and_raise_io_error_or(errno_value, AWS_ERROR_FILE_WRITE_FAILURE);
        *out_error_code = aws_last_error();
        AWS_LOGF_ERROR(
            AWS_LS_S3_META_REQUEST,
            "id=%p Failed writing to file. errno:%d. aws-error:%s",
            (void *)meta_request,
            errno_value,
            aws_error_name(*out_error_code));
    }
}

/* Deliver events in event_delivery_array.
* This task runs on the meta-request's io_event_loop thread. */
static void s_s3_meta_request_event_delivery_task(struct aws_task *task, void *arg, enum aws_task_status task_status) {
Expand Down Expand Up @@ -2139,20 +2206,75 @@ static void s_s3_meta_request_event_delivery_task(struct aws_task *task, void *a
aws_high_res_clock_get_ticks((uint64_t *)&metric->time_metrics.deliver_start_timestamp_ns);
}

if (meta_request->recv_file) {
/* Write the data directly to the file. No need to seek, since the event will always be
* delivered with the right order. */
if (fwrite((void *)response_body.ptr, response_body.len, 1, meta_request->recv_file) < 1) {
int errno_value = ferror(meta_request->recv_file) ? errno : 0; /* Always cache errno */
aws_translate_and_raise_io_error_or(errno_value, AWS_ERROR_FILE_WRITE_FAILURE);
error_code = aws_last_error();
AWS_LOGF_ERROR(
if (meta_request->recv_file_direct_io) {
/* O_DIRECT write path — use offset-based direct I/O */
uint64_t write_offset = delivery_range_start;
struct aws_byte_cursor write_cursor =
aws_byte_cursor_from_array(response_body.ptr, response_body.len);

/* Check if this chunk is page-aligned. Only the last part of a download
* can have unaligned length. Use buffered write for that case. */
size_t page_size = aws_system_info_page_size();
bool use_direct_io = (response_body.len % page_size == 0);

if (!use_direct_io && meta_request->recv_file_direct_io_fallback_count == 0) {
AWS_LOGF_WARN(
AWS_LS_S3_META_REQUEST,
"id=%p Failed writing to file. errno:%d. aws-error:%s",
"id=%p: O_DIRECT requested but data length %zu is not page-aligned "
"(page size %zu). Falling back to buffered I/O for this part. "
"This is expected for the last part of a download.",
(void *)meta_request,
errno_value,
aws_error_name(error_code));
response_body.len,
page_size);
}
if (!use_direct_io) {
++meta_request->recv_file_direct_io_fallback_count;
}

if (use_direct_io) {
if (aws_file_path_write_to_offset_direct_io(
meta_request->recv_filepath, write_offset, write_cursor)) {
if (aws_last_error() == AWS_ERROR_UNSUPPORTED_OPERATION) {
/* Platform doesn't support O_DIRECT, fall back to buffered I/O */
if (meta_request->recv_file_direct_io_fallback_count == 0) {
AWS_LOGF_WARN(
AWS_LS_S3_META_REQUEST,
"id=%p: O_DIRECT write not supported on this platform, "
"falling back to buffered I/O for the rest of this download",
(void *)meta_request);
}
++meta_request->recv_file_direct_io_fallback_count;
meta_request->recv_file_direct_io = false;
aws_reset_error();
use_direct_io = false;
} else {
/* Real I/O error — hard fail */
error_code = aws_last_error();
AWS_LOGF_ERROR(
AWS_LS_S3_META_REQUEST,
"id=%p Failed writing to file with O_DIRECT. aws-error:%s",
(void *)meta_request,
aws_error_name(error_code));
}
}
}

if (!use_direct_io && error_code == AWS_ERROR_SUCCESS) {
/* Buffered write fallback. recv_file is already open from init.
* Need to seek because direct_io path doesn't update FILE* position. */
if (aws_fseek(meta_request->recv_file, (int64_t)write_offset, SEEK_SET) !=
AWS_OP_SUCCESS) {
error_code = aws_last_error();
} else {
s_buffered_write_to_recv_file(meta_request, &response_body, &error_code);
}
}
if (meta_request->client->enable_read_backpressure) {
aws_s3_meta_request_increment_read_window(meta_request, response_body.len);
}
} else if (meta_request->recv_file) {
/* Regular FILE* path. No need to seek — events arrive in order. */
s_buffered_write_to_recv_file(meta_request, &response_body, &error_code);
if (meta_request->client->enable_read_backpressure) {
aws_s3_meta_request_increment_read_window(meta_request, response_body.len);
}
Expand Down Expand Up @@ -2417,6 +2539,10 @@ void aws_s3_meta_request_finish_default(struct aws_s3_meta_request *meta_request
if (finish_result.error_code && meta_request->recv_file_delete_on_failure) {
aws_file_delete(meta_request->recv_filepath);
}
} else if (
meta_request->recv_file_direct_io && finish_result.error_code && meta_request->recv_file_delete_on_failure) {
/* O_DIRECT path has no FILE* to close, but still honor delete-on-failure */
aws_file_delete(meta_request->recv_filepath);
}

while (!aws_linked_list_empty(&release_request_list)) {
Expand Down
7 changes: 7 additions & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,13 @@ add_net_test_case(test_s3_get_object_file_path)
add_net_test_case(test_s3_get_object_file_path_create_new)
add_net_test_case(test_s3_get_object_file_path_append)
add_net_test_case(test_s3_get_object_file_path_to_position)
add_net_test_case(test_s3_get_object_file_path_direct_io)
add_net_test_case(test_s3_get_object_file_path_direct_io_content_verify)
add_net_test_case(test_s3_get_object_file_path_direct_io_unsupported_append)
add_net_test_case(test_s3_get_object_file_path_direct_io_unsupported_write_to_position)
add_net_test_case(test_s3_get_object_file_path_direct_io_multi_part)
add_net_test_case(test_s3_get_object_file_path_direct_io_unaligned_part_size)
add_net_test_case(test_s3_get_object_file_path_direct_io_unaligned_last_part)
add_net_test_case(test_s3_get_object_empty_object)
add_net_test_case(test_s3_get_object_multiple)
add_net_test_case(test_s3_get_object_multiple_serial)
Expand Down
Loading
Loading