I wrote an example server and a benchmark tool, but the performance I see is not good. A similar plain TCP server that I wrote can reach 3 GB/s with one TCP connection and one thread,
but the Beast version shows very low bandwidth.
// -----------------------------------------------------------------------------
// Boost.Beast Benchmark - Runs Server and Client Together
//
// This program starts an async HTTP server and then runs a client benchmark
// against it, measuring pure HTTP performance.
// -----------------------------------------------------------------------------
#include <boost/asio.hpp>
#include <boost/beast.hpp>
#include <boost/beast/http.hpp>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include <chrono>
#include <algorithm>
#include <iomanip>
#include <mutex>
#include <condition_variable>
// Include server and client components
// For simplicity, we'll inline the key components here
// Short namespace aliases used by every class below.
namespace asio = boost::asio;
namespace beast = boost::beast;
namespace http = beast::http;
using tcp = asio::ip::tcp;
// Forward declarations - we'll use simplified versions
// NOTE(review): neither class is defined or referenced in the visible part of
// this file; these declarations look like leftovers - confirm before removing.
class BenchmarkServer;
class BenchmarkClient;
// Collects per-request outcomes from the client sessions and prints the final
// report. The record_* functions and print_results() take `mutex`; the two
// time points are written directly by the caller.
struct BenchmarkStats
{
    std::mutex mutex;
    int success_count = 0;
    int failure_count = 0;
    std::vector<double> latencies_ms;
    size_t total_bytes = 0;
    std::chrono::steady_clock::time_point start_time;
    std::chrono::steady_clock::time_point end_time;

    // Registers one successful request with its latency and payload size.
    void record_success(double latency_ms, size_t bytes)
    {
        std::lock_guard<std::mutex> guard(mutex);
        ++success_count;
        latencies_ms.push_back(latency_ms);
        total_bytes += bytes;
    }

    // Registers one failed request.
    void record_failure()
    {
        std::lock_guard<std::mutex> guard(mutex);
        ++failure_count;
    }

    // Writes the summary to std::cout. Sorts latencies_ms in place so that
    // the percentile rows can be read by index.
    void print_results()
    {
        std::lock_guard<std::mutex> guard(mutex);

        const std::string rule(70, '=');
        const std::chrono::duration<double> elapsed = end_time - start_time;
        const double seconds = elapsed.count();

        std::cout << "\n" << rule << "\n"
                  << "Boost.Beast Benchmark Results\n"
                  << rule << "\n"
                  << "Total requests: " << (success_count + failure_count) << "\n"
                  << " Success: " << success_count << "\n"
                  << " Failed: " << failure_count << "\n"
                  << "Duration: " << std::fixed << std::setprecision(2)
                  << seconds << " seconds\n";

        const bool have_rate = success_count > 0 && seconds > 0;
        if (have_rate)
        {
            const double megabits_per_sec = (total_bytes * 8.0) / (seconds * 1'000'000.0);
            const double requests_per_sec = success_count / seconds;
            std::cout << "Throughput: " << std::fixed << std::setprecision(2)
                      << megabits_per_sec << " Mbps ("
                      << (megabits_per_sec / 8.0) << " MB/s)\n"
                      << "Operations/sec: " << requests_per_sec << "\n";
        }

        if (!latencies_ms.empty())
        {
            std::sort(latencies_ms.begin(), latencies_ms.end());
            const size_t count = latencies_ms.size();

            double total = 0;
            for (size_t i = 0; i < count; ++i)
                total += latencies_ms[i];

            // Sample at a given fraction of the sorted range (truncating index).
            const auto at_fraction = [this, count](double fraction) {
                return latencies_ms[static_cast<size_t>(count * fraction)];
            };

            std::cout << "\nLatency (milliseconds):\n"
                      << " Min: " << std::fixed << std::setprecision(2)
                      << latencies_ms.front() << "\n"
                      << " Max: " << latencies_ms.back() << "\n"
                      << " Average: " << (total / count) << "\n"
                      << " Median (p50): " << latencies_ms[count / 2] << "\n"
                      << " p95: " << at_fraction(0.95) << "\n"
                      << " p99: " << at_fraction(0.99) << "\n";
        }

        std::cout << rule << "\n";
    }
};
// Simplified server session (inline version)
class server_session : public std::enable_shared_from_this<server_session>
{
public:
explicit server_session(tcp::socket socket)
: socket_(std::move(socket))
, strand_(socket_.get_executor())
{
// Configure socket for maximum performance
// These settings are critical for achieving high bandwidth (3GB/s target)
beast::error_code ec;
// Disable Nagle's algorithm for low latency
socket_.set_option(tcp::no_delay(true), ec);
// Use large buffers (8MB) to match epolls_benchmark performance
// Small default buffers (64KB-256KB) severely limit bandwidth
socket_.set_option(asio::socket_base::send_buffer_size(8 * 1024 * 1024), ec);
socket_.set_option(asio::socket_base::receive_buffer_size(8 * 1024 * 1024), ec);
}
void run()
{
do_read();
}
private:
tcp::socket socket_;
asio::strand<asio::any_io_executor> strand_;
beast::flat_buffer buffer_;
http::request<http::string_body> req_;
void do_read()
{
auto self = shared_from_this();
// For single-threaded io_context, strand is unnecessary and hurts performance
// Remove strand binding for maximum throughput (only safe if single-threaded)
http::async_read(
socket_,
buffer_,
req_,
[self](beast::error_code ec, std::size_t) {
if (!ec)
{
self->handle_request();
}
});
}
void handle_request()
{
http::response<http::string_body> res{http::status::ok, req_.version()};
res.set(http::field::server, "BeastBench/1.0");
res.set(http::field::content_type, "application/octet-stream");
res.keep_alive(req_.keep_alive());
// Optimized: Use move to avoid copying large bodies
res.body() = std::move(req_.body());
res.prepare_payload();
send_response(std::move(res));
}
void send_response(http::response<http::string_body>&& res)
{
auto sp = std::make_shared<http::response<http::string_body>>(std::move(res));
auto self = shared_from_this();
// For single-threaded io_context, strand is unnecessary and hurts performance
// Remove strand binding for maximum throughput (only safe if single-threaded)
http::async_write(
socket_,
*sp,
[sp, self](beast::error_code ec, std::size_t) {
if (!ec && sp->keep_alive())
{
self->req_ = {};
self->buffer_.consume(self->buffer_.size());
self->do_read();
}
else if (!ec)
{
beast::error_code ec_shutdown;
self->socket_.shutdown(tcp::socket::shutdown_send, ec_shutdown);
}
});
}
};
// Simplified listener
class server_listener : public std::enable_shared_from_this<server_listener>
{
public:
server_listener(asio::any_io_executor exec, tcp::endpoint endpoint)
: acceptor_(exec)
{
beast::error_code ec;
acceptor_.open(endpoint.protocol(), ec);
acceptor_.set_option(asio::socket_base::reuse_address(true), ec);
acceptor_.bind(endpoint, ec);
acceptor_.listen(asio::socket_base::max_listen_connections, ec);
}
void run()
{
if (acceptor_.is_open())
do_accept();
}
private:
tcp::acceptor acceptor_;
void do_accept()
{
auto self = shared_from_this();
acceptor_.async_accept(
asio::make_strand(acceptor_.get_executor()),
[self](beast::error_code ec, tcp::socket socket) {
if (!ec)
{
std::make_shared<server_session>(std::move(socket))->run();
}
self->do_accept();
});
}
};
// Simplified client session
class client_session : public std::enable_shared_from_this<client_session>
{
public:
client_session(
asio::io_context& ioc,
const std::string& host,
const std::string& port,
const std::string& path,
const std::string& body,
BenchmarkStats& stats)
: resolver_(ioc)
, socket_(ioc)
, host_(host)
, port_(port)
, path_(path)
, body_(body)
, stats_(stats)
{
}
void run()
{
start_time_ = std::chrono::steady_clock::now();
resolver_.async_resolve(
host_,
port_,
[self = shared_from_this()](beast::error_code ec, tcp::resolver::results_type results) {
if (!ec)
{
self->on_resolve(results);
}
else
{
self->stats_.record_failure();
}
});
}
private:
tcp::resolver resolver_;
tcp::socket socket_;
beast::flat_buffer buffer_;
http::request<http::string_body> req_;
http::response<http::string_body> res_;
std::string host_;
std::string port_;
std::string path_;
std::string body_;
BenchmarkStats& stats_;
std::chrono::steady_clock::time_point start_time_;
void on_resolve(tcp::resolver::results_type results)
{
auto self = shared_from_this();
asio::async_connect(
socket_,
results,
[self](beast::error_code ec, const tcp::endpoint&) {
if (!ec)
{
self->on_connect();
}
else
{
self->stats_.record_failure();
}
});
}
void on_connect()
{
// Start timing here (after connection is established)
// This excludes connection setup time from latency measurement
start_time_ = std::chrono::steady_clock::now();
// Configure socket for maximum performance
// These settings are critical for achieving high bandwidth (3GB/s target)
beast::error_code ec;
// Disable Nagle's algorithm for low latency
socket_.set_option(tcp::no_delay(true), ec);
// Use large buffers (8MB) to match epolls_benchmark performance
// Small default buffers (64KB-256KB) severely limit bandwidth
socket_.set_option(asio::socket_base::send_buffer_size(8 * 1024 * 1024), ec);
socket_.set_option(asio::socket_base::receive_buffer_size(8 * 1024 * 1024), ec);
req_.method(http::verb::put);
req_.target(path_);
req_.version(11);
req_.set(http::field::host, host_);
req_.set(http::field::user_agent, "BeastBench/1.0");
req_.set(http::field::content_type, "application/octet-stream");
req_.set(http::field::connection, "keep-alive"); // Enable keep-alive
req_.body() = body_;
req_.prepare_payload();
auto self = shared_from_this();
http::async_write(
socket_,
req_,
[self](beast::error_code ec, std::size_t) {
if (!ec)
{
self->on_write();
}
else
{
self->stats_.record_failure();
}
});
}
void on_write()
{
auto self = shared_from_this();
http::async_read(
socket_,
buffer_,
res_,
[self](beast::error_code ec, std::size_t) {
if (!ec)
{
self->on_read();
}
else
{
self->stats_.record_failure();
}
});
}
void on_read()
{
auto end = std::chrono::steady_clock::now();
auto latency = std::chrono::duration<double, std::milli>(end - start_time_).count();
socket_.shutdown(tcp::socket::shutdown_both);
if (res_.result() == http::status::ok)
{
stats_.record_success(latency, body_.size());
}
else
{
stats_.record_failure();
}
}
};
//------------------------------------------------------------------------------
int main(int argc, char* argv[])
{
try
{
unsigned short port = 8080;
int requests = 10000;
int concurrency = 50;
size_t body_size = 1048576; // 1 MB
int server_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
// Parse arguments
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if (arg == "--port" && i + 1 < argc)
{
port = static_cast<unsigned short>(std::stoi(argv[++i]));
}
else if (arg == "--requests" && i + 1 < argc)
{
requests = std::stoi(argv[++i]);
}
else if (arg == "--concurrency" && i + 1 < argc)
{
concurrency = std::stoi(argv[++i]);
}
else if (arg == "--size" && i + 1 < argc)
{
body_size = std::stoull(argv[++i]);
}
else if (arg == "--threads" && i + 1 < argc)
{
server_threads = std::stoi(argv[++i]);
}
else if (arg == "--help" || arg == "-h")
{
std::cout << "Usage: " << argv[0] << " [OPTIONS]\n"
<< "Options:\n"
<< " --port PORT Server port (default: 8080)\n"
<< " --requests N Number of requests (default: 10000)\n"
<< " --concurrency N Concurrent requests (default: 50)\n"
<< " --size N Request body size in bytes (default: 1048576)\n"
<< " --threads N Server worker threads (default: CPU count)\n"
<< " --help, -h Show this help\n";
return 0;
}
}
std::cout << "Boost.Beast Benchmark - Server + Client\n";
std::cout << "========================================\n";
std::cout << "Port: " << port << "\n";
std::cout << "Requests: " << requests << "\n";
std::cout << "Concurrency: " << concurrency << "\n";
std::cout << "Body size: " << body_size << " bytes ("
<< (body_size / 1024 / 1024.0) << " MB)\n";
std::cout << "Server threads: " << server_threads << "\n\n";
// Start server in separate thread
asio::io_context server_ioc{server_threads};
auto address = asio::ip::make_address("0.0.0.0");
auto server_lst = std::make_shared<server_listener>(
server_ioc.get_executor(),
tcp::endpoint{address, port});
server_lst->run();
std::vector<std::thread> server_threads_vec;
server_threads_vec.reserve(server_threads - 1);
for (auto i = server_threads - 1; i > 0; --i)
server_threads_vec.emplace_back([&server_ioc] { server_ioc.run(); });
// Give server a moment to start
std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Run client benchmark
std::string body(body_size, 'X');
asio::io_context client_ioc;
BenchmarkStats stats;
stats.start_time = std::chrono::steady_clock::now();
// Create and run client sessions
// Simple approach: start all requests, let asio handle concurrency
std::vector<std::shared_ptr<client_session>> sessions;
sessions.reserve(requests);
for (int i = 0; i < requests; ++i)
{
auto session = std::make_shared<client_session>(
client_ioc, "localhost", std::to_string(port),
"/test" + std::to_string(i), body, stats);
session->run();
sessions.push_back(session);
}
// Run client io_context
std::thread client_thread([&client_ioc] { client_ioc.run(); });
// Wait for all requests to complete
// (We'll wait until stats show all requests are done)
while (true)
{
std::this_thread::sleep_for(std::chrono::milliseconds(100));
std::lock_guard<std::mutex> lock(stats.mutex);
if (stats.success_count + stats.failure_count >= requests)
break;
}
// Stop and wait
client_ioc.stop();
client_thread.join();
stats.end_time = std::chrono::steady_clock::now();
// Stop server
server_ioc.stop();
for (auto& t : server_threads_vec)
t.join();
stats.print_results();
}
catch (const std::exception& e)
{
std::cerr << "Error: " << e.what() << "\n";
return 1;
}
return 0;
}
Hello experts,
I wrote an example server and a benchmark tool, but the performance I see is not good. A similar plain TCP server that I wrote can reach 3 GB/s with one TCP connection and one thread,
but the Beast version shows very low bandwidth.
This is my server benchmark code.
Could you please shed some light on this?