Skip to content

Commit fa8dd48

Browse files
json_printer.cu changed to use write-out buffer of 4KB (#259)
* json_printer.cu changed to use write-out buffer of 4KB The json_printer::do_process_bulk_data_float64 used to write out one float32 value at a time. This PR introduces a buffer of 4KB that is filled with values until full, and then written out. The 4KB value aligns with system memory page size and seems appropriate for relatively small data sizes of duration measurements. * Add explicit static cast from std::size_t to std::streamsize The explicit cast avoids a narrowing error. * Factor out writing array out to binary file into standalone function This function is templated based on buffer size. The function can be reused to also write out frequency samples in the future.
1 parent 080052a commit fa8dd48

File tree

1 file changed

+47
-22
lines changed

1 file changed

+47
-22
lines changed

nvbench/json_printer.cu

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,17 @@ static_assert(false, "No <filesystem> or <experimental/filesystem> found.");
5858
namespace
5959
{
6060

61-
bool is_little_endian()
62-
{
6361
#if NVBENCH_CPP_DIALECT >= 2020
64-
return std::endian::native == std::endian::little;
62+
constexpr bool is_little_endian() noexcept { return std::endian::native == std::endian::little; }
6563
#else
64+
bool is_little_endian() noexcept
65+
{
6666
const nvbench::uint32_t word = {0xBadDecaf};
6767
nvbench::uint8_t bytes[4];
6868
std::memcpy(bytes, &word, 4);
6969
return bytes[0] == 0xaf;
70-
#endif
7170
}
71+
#endif
7272

7373
template <typename JsonNode>
7474
void write_named_values(JsonNode &node, const nvbench::named_values &values)
@@ -106,6 +106,45 @@ void write_named_values(JsonNode &node, const nvbench::named_values &values)
106106
} // end foreach value name
107107
}
108108

109+
template <std::size_t buffer_nbytes>
110+
void write_out_values(std::ofstream &out, const std::vector<nvbench::float64_t> &data)
111+
{
112+
static constexpr std::size_t value_nbytes = sizeof(nvbench::float32_t);
113+
static_assert(buffer_nbytes % value_nbytes == 0);
114+
115+
alignas(alignof(nvbench::float32_t)) char buffer[buffer_nbytes];
116+
std::size_t bytes_in_buffer = 0;
117+
118+
for (auto value64 : data)
119+
{
120+
const auto value32 = static_cast<nvbench::float32_t>(value64);
121+
auto value_subbuffer = &buffer[bytes_in_buffer];
122+
std::memcpy(value_subbuffer, &value32, value_nbytes);
123+
124+
// the c++17 implementation of is_little_endian isn't constexpr, but
125+
// all supported compilers optimize this branch as if it were.
126+
if (!is_little_endian())
127+
{
128+
std::swap(value_subbuffer[0], value_subbuffer[3]);
129+
std::swap(value_subbuffer[1], value_subbuffer[2]);
130+
}
131+
bytes_in_buffer += value_nbytes;
132+
133+
// if buffer is full, write it out and wrap around
134+
if (bytes_in_buffer == buffer_nbytes)
135+
{
136+
out.write(buffer, static_cast<std::streamsize>(buffer_nbytes));
137+
bytes_in_buffer = 0;
138+
}
139+
} // end of foreach value64 in data
140+
141+
if (bytes_in_buffer)
142+
{
143+
out.write(buffer, static_cast<std::streamsize>(bytes_in_buffer));
144+
bytes_in_buffer = 0;
145+
}
146+
}
147+
109148
} // end namespace
110149

111150
namespace nvbench
@@ -167,24 +206,10 @@ void json_printer::do_process_bulk_data_float64(state &state,
167206
out.exceptions(out.exceptions() | std::ios::failbit | std::ios::badbit);
168207
out.open(result_path, std::ios::binary | std::ios::out);
169208

170-
// FIXME: SLOW -- Writing the binary file, 4 bytes at a time...
171-
// There are a lot of optimizations that could be done here if this ends
172-
// up being a noticeable bottleneck.
173-
for (auto value64 : data)
174-
{
175-
const auto value32 = static_cast<nvbench::float32_t>(value64);
176-
char buffer[4];
177-
std::memcpy(buffer, &value32, 4);
178-
// the c++17 implementation of is_little_endian isn't constexpr, but
179-
// all supported compilers optimize this branch as if it were.
180-
if (!is_little_endian())
181-
{
182-
using std::swap;
183-
swap(buffer[0], buffer[3]);
184-
swap(buffer[1], buffer[2]);
185-
}
186-
out.write(buffer, 4);
187-
}
209+
// choose buffer to be block size of modern SSD
210+
// see: https://github.com/NVIDIA/nvbench/issues/255
211+
constexpr std::size_t buffer_nbytes = 4096;
212+
write_out_values<buffer_nbytes>(out, data);
188213
}
189214
catch (std::exception &e)
190215
{

0 commit comments

Comments
 (0)