Skip to content

Commit 33307a2

Browse files
json_printer.cu changed to use a 4KB write-out buffer
The json_printer::do_process_bulk_data_float64 method used to write out one float32 value (4 bytes) at a time. This PR introduces a 4KB buffer that is filled with values and written out whenever it becomes full; any values remaining in a partially filled buffer are written out after the loop. The 4KB size aligns with the system memory page size and seems appropriate for the relatively small data sizes of duration measurements.
1 parent 0c24f02 commit 33307a2

File tree

1 file changed

+32
-13
lines changed

1 file changed

+32
-13
lines changed

nvbench/json_printer.cu

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,17 @@ static_assert(false, "No <filesystem> or <experimental/filesystem> found.");
5858
namespace
5959
{
6060

61-
bool is_little_endian()
62-
{
6361
#if NVBENCH_CPP_DIALECT >= 2020
64-
return std::endian::native == std::endian::little;
62+
constexpr bool is_little_endian() noexcept { return std::endian::native == std::endian::little; }
6563
#else
64+
bool is_little_endian() noexcept
65+
{
6666
const nvbench::uint32_t word = {0xBadDecaf};
6767
nvbench::uint8_t bytes[4];
6868
std::memcpy(bytes, &word, 4);
6969
return bytes[0] == 0xaf;
70-
#endif
7170
}
71+
#endif
7272

7373
template <typename JsonNode>
7474
void write_named_values(JsonNode &node, const nvbench::named_values &values)
@@ -167,23 +167,42 @@ void json_printer::do_process_bulk_data_float64(state &state,
167167
out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
168168
out.open(result_path, std::ios::binary | std::ios::out);
169169

170-
// FIXME: SLOW -- Writing the binary file, 4 bytes at a time...
171-
// There are a lot of optimizations that could be done here if this ends
172-
// up being a noticeable bottleneck.
170+
// choose buffer to be block size of modern SSD
171+
static constexpr std::size_t buffer_nbytes = 4096;
172+
static constexpr std::size_t value_nbytes = sizeof(nvbench::float32_t);
173+
static_assert(buffer_nbytes % value_nbytes == 0);
174+
175+
alignas(alignof(nvbench::float32_t)) char buffer[buffer_nbytes];
176+
std::size_t bytes_in_buffer = 0;
177+
173178
for (auto value64 : data)
174179
{
175-
const auto value32 = static_cast<nvbench::float32_t>(value64);
176-
char buffer[4];
177-
std::memcpy(buffer, &value32, 4);
180+
const auto value32 = static_cast<nvbench::float32_t>(value64);
181+
auto value_subbuffer = &buffer[bytes_in_buffer];
182+
std::memcpy(value_subbuffer, &value32, value_nbytes);
183+
178184
// the c++17 implementation of is_little_endian isn't constexpr, but
179185
// all supported compilers optimize this branch as if it were.
180186
if (!is_little_endian())
181187
{
182188
using std::swap;
183-
swap(buffer[0], buffer[3]);
184-
swap(buffer[1], buffer[2]);
189+
swap(value_subbuffer[0], value_subbuffer[3]);
190+
swap(value_subbuffer[1], value_subbuffer[2]);
191+
}
192+
bytes_in_buffer += value_nbytes;
193+
194+
// if buffer is full, write it out and wrap around
195+
if (bytes_in_buffer == buffer_nbytes)
196+
{
197+
out.write(buffer, buffer_nbytes);
198+
bytes_in_buffer = 0;
185199
}
186-
out.write(buffer, 4);
200+
} // end of foreach value64 in data
201+
202+
if (bytes_in_buffer)
203+
{
204+
out.write(buffer, bytes_in_buffer);
205+
bytes_in_buffer = 0;
187206
}
188207
}
189208
catch (std::exception &e)

0 commit comments

Comments
 (0)