[perf tuning] benchmark shows only 200MB/s even with 128 threads

Hello experts,

I wrote an example server and a benchmark tool, but the performance is not good. A similar plain TCP server that I wrote can reach 3 GB/s with one TCP connection and one thread.

But Beast shows very low bandwidth.

My server benchmark code:
```
// -----------------------------------------------------------------------------
// Boost.Beast Benchmark - Runs Server and Client Together
//
// This program starts an async HTTP server and then runs a client benchmark
// against it, measuring pure HTTP performance.
// -----------------------------------------------------------------------------

#include <boost/asio.hpp>
#include <boost/beast.hpp>
#include <boost/beast/http.hpp>
#include <iostream>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include <chrono>
#include <algorithm>
#include <iomanip>
#include <mutex>
#include <condition_variable>

// Include server and client components
// For simplicity, we'll inline the key components here

// Short aliases for the Boost namespaces and the TCP protocol type used
// throughout the rest of this file.
namespace asio = boost::asio;
namespace beast = boost::beast;
namespace http = beast::http;
using tcp = asio::ip::tcp;

// Forward declarations - we'll use simplified versions
// NOTE(review): neither class is defined or referenced in the code shown
// here; these look like leftovers - confirm before removing.
class BenchmarkServer;
class BenchmarkClient;

// Mutex-protected accumulator for benchmark outcomes, plus a report printer.
//
// record_success() / record_failure() may be called from any thread; each
// takes `mutex` for the duration of the update. `start_time` and `end_time`
// are plain fields that the caller fills in around the measured region.
struct BenchmarkStats
{
    std::mutex mutex;
    int success_count = 0;
    int failure_count = 0;
    std::vector<double> latencies_ms;
    size_t total_bytes = 0;
    std::chrono::steady_clock::time_point start_time;
    std::chrono::steady_clock::time_point end_time;

    // Counts one successful request, remembering its latency (milliseconds)
    // and adding `bytes` to the running payload total.
    void record_success(double latency_ms, size_t bytes)
    {
        std::lock_guard<std::mutex> guard(mutex);
        ++success_count;
        latencies_ms.push_back(latency_ms);
        total_bytes += bytes;
    }

    // Counts one failed request.
    void record_failure()
    {
        std::lock_guard<std::mutex> guard(mutex);
        ++failure_count;
    }

    // Writes the summary report to std::cout.
    //
    // Side effects: sorts `latencies_ms` in place (needed for the percentile
    // lookups) and leaves std::cout in fixed notation with precision 2.
    void print_results()
    {
        std::lock_guard<std::mutex> guard(mutex);

        std::ostream& out = std::cout;
        const std::string rule(70, '=');
        const std::chrono::duration<double> elapsed = end_time - start_time;
        const double seconds = elapsed.count();

        out << "\n" << rule << "\n"
            << "Boost.Beast Benchmark Results\n"
            << rule << "\n"
            << "Total requests:  " << (success_count + failure_count) << "\n"
            << "  Success:      " << success_count << "\n"
            << "  Failed:       " << failure_count << "\n"
            << "Duration:       " << std::fixed << std::setprecision(2)
            << seconds << " seconds\n";

        // Rates are only meaningful with at least one success and a
        // non-zero measured interval.
        if (success_count > 0 && seconds > 0)
        {
            const double mbps = (total_bytes * 8.0) / (seconds * 1'000'000.0);
            const double ops = success_count / seconds;
            out << "Throughput:     " << std::fixed << std::setprecision(2)
                << mbps << " Mbps (" << (mbps / 8.0) << " MB/s)\n"
                << "Operations/sec: " << ops << "\n";
        }

        if (!latencies_ms.empty())
        {
            std::sort(latencies_ms.begin(), latencies_ms.end());
            const size_t count = latencies_ms.size();

            // Summed in ascending order, after the sort above.
            double total = 0;
            for (size_t i = 0; i < count; ++i)
                total += latencies_ms[i];

            // Element at the given fraction of the sorted samples; the
            // index is truncated toward zero, so it is always < count.
            const auto at_fraction = [this, count](double fraction) {
                return latencies_ms[static_cast<size_t>(count * fraction)];
            };

            out << "\nLatency (milliseconds):\n"
                << "  Min:          " << std::fixed << std::setprecision(2)
                << latencies_ms.front() << "\n"
                << "  Max:          " << latencies_ms.back() << "\n"
                << "  Average:      " << (total / count) << "\n"
                << "  Median (p50): " << latencies_ms[count / 2] << "\n"
                << "  p95:          " << at_fraction(0.95) << "\n"
                << "  p99:          " << at_fraction(0.99) << "\n";
        }

        out << rule << "\n";
    }
};

// Simplified server session (inline version)
class server_session : public std::enable_shared_from_this<server_session>
{
public:
    explicit server_session(tcp::socket socket)
        : socket_(std::move(socket))
        , strand_(socket_.get_executor())
    {
        // Configure socket for maximum performance
        // These settings are critical for achieving high bandwidth (3GB/s target)
        beast::error_code ec;
        
        // Disable Nagle's algorithm for low latency
        socket_.set_option(tcp::no_delay(true), ec);
        
        // Use large buffers (8MB) to match epolls_benchmark performance
        // Small default buffers (64KB-256KB) severely limit bandwidth
        socket_.set_option(asio::socket_base::send_buffer_size(8 * 1024 * 1024), ec);
        socket_.set_option(asio::socket_base::receive_buffer_size(8 * 1024 * 1024), ec);
    }

    void run()
    {
        do_read();
    }

private:
    tcp::socket socket_;
    asio::strand<asio::any_io_executor> strand_;
    beast::flat_buffer buffer_;
    http::request<http::string_body> req_;

    void do_read()
    {
        auto self = shared_from_this();
        // For single-threaded io_context, strand is unnecessary and hurts performance
        // Remove strand binding for maximum throughput (only safe if single-threaded)
        http::async_read(
            socket_,
            buffer_,
            req_,
            [self](beast::error_code ec, std::size_t) {
                if (!ec)
                {
                    self->handle_request();
                }
            });
    }

    void handle_request()
    {
        http::response<http::string_body> res{http::status::ok, req_.version()};
        res.set(http::field::server, "BeastBench/1.0");
        res.set(http::field::content_type, "application/octet-stream");
        res.keep_alive(req_.keep_alive());
        // Optimized: Use move to avoid copying large bodies
        res.body() = std::move(req_.body());
        res.prepare_payload();
        send_response(std::move(res));
    }

    void send_response(http::response<http::string_body>&& res)
    {
        auto sp = std::make_shared<http::response<http::string_body>>(std::move(res));
        auto self = shared_from_this();

        // For single-threaded io_context, strand is unnecessary and hurts performance
        // Remove strand binding for maximum throughput (only safe if single-threaded)
        http::async_write(
            socket_,
            *sp,
            [sp, self](beast::error_code ec, std::size_t) {
                if (!ec && sp->keep_alive())
                {
                    self->req_ = {};
                    self->buffer_.consume(self->buffer_.size());
                    self->do_read();
                }
                else if (!ec)
                {
                    beast::error_code ec_shutdown;
                    self->socket_.shutdown(tcp::socket::shutdown_send, ec_shutdown);
                }
            });
    }
};

// Simplified listener
class server_listener : public std::enable_shared_from_this<server_listener>
{
public:
    server_listener(asio::any_io_executor exec, tcp::endpoint endpoint)
        : acceptor_(exec)
    {
        beast::error_code ec;
        acceptor_.open(endpoint.protocol(), ec);
        acceptor_.set_option(asio::socket_base::reuse_address(true), ec);
        acceptor_.bind(endpoint, ec);
        acceptor_.listen(asio::socket_base::max_listen_connections, ec);
    }

    void run()
    {
        if (acceptor_.is_open())
            do_accept();
    }

private:
    tcp::acceptor acceptor_;

    void do_accept()
    {
        auto self = shared_from_this();
        acceptor_.async_accept(
            asio::make_strand(acceptor_.get_executor()),
            [self](beast::error_code ec, tcp::socket socket) {
                if (!ec)
                {
                    std::make_shared<server_session>(std::move(socket))->run();
                }
                self->do_accept();
            });
    }
};

// Simplified client session
class client_session : public std::enable_shared_from_this<client_session>
{
public:
    client_session(
        asio::io_context& ioc,
        const std::string& host,
        const std::string& port,
        const std::string& path,
        const std::string& body,
        BenchmarkStats& stats)
        : resolver_(ioc)
        , socket_(ioc)
        , host_(host)
        , port_(port)
        , path_(path)
        , body_(body)
        , stats_(stats)
    {
    }

    void run()
    {
        start_time_ = std::chrono::steady_clock::now();
        resolver_.async_resolve(
            host_,
            port_,
            [self = shared_from_this()](beast::error_code ec, tcp::resolver::results_type results) {
                if (!ec)
                {
                    self->on_resolve(results);
                }
                else
                {
                    self->stats_.record_failure();
                }
            });
    }

private:
    tcp::resolver resolver_;
    tcp::socket socket_;
    beast::flat_buffer buffer_;
    http::request<http::string_body> req_;
    http::response<http::string_body> res_;
    std::string host_;
    std::string port_;
    std::string path_;
    std::string body_;
    BenchmarkStats& stats_;
    std::chrono::steady_clock::time_point start_time_;

    void on_resolve(tcp::resolver::results_type results)
    {
        auto self = shared_from_this();
        asio::async_connect(
            socket_,
            results,
            [self](beast::error_code ec, const tcp::endpoint&) {
                if (!ec)
                {
                    self->on_connect();
                }
                else
                {
                    self->stats_.record_failure();
                }
            });
    }

    void on_connect()
    {
        // Start timing here (after connection is established)
        // This excludes connection setup time from latency measurement
        start_time_ = std::chrono::steady_clock::now();
        
        // Configure socket for maximum performance
        // These settings are critical for achieving high bandwidth (3GB/s target)
        beast::error_code ec;
        
        // Disable Nagle's algorithm for low latency
        socket_.set_option(tcp::no_delay(true), ec);
        
        // Use large buffers (8MB) to match epolls_benchmark performance
        // Small default buffers (64KB-256KB) severely limit bandwidth
        socket_.set_option(asio::socket_base::send_buffer_size(8 * 1024 * 1024), ec);
        socket_.set_option(asio::socket_base::receive_buffer_size(8 * 1024 * 1024), ec);
        
        req_.method(http::verb::put);
        req_.target(path_);
        req_.version(11);
        req_.set(http::field::host, host_);
        req_.set(http::field::user_agent, "BeastBench/1.0");
        req_.set(http::field::content_type, "application/octet-stream");
        req_.set(http::field::connection, "keep-alive");  // Enable keep-alive
        req_.body() = body_;
        req_.prepare_payload();

        auto self = shared_from_this();
        http::async_write(
            socket_,
            req_,
            [self](beast::error_code ec, std::size_t) {
                if (!ec)
                {
                    self->on_write();
                }
                else
                {
                    self->stats_.record_failure();
                }
            });
    }

    void on_write()
    {
        auto self = shared_from_this();
        http::async_read(
            socket_,
            buffer_,
            res_,
            [self](beast::error_code ec, std::size_t) {
                if (!ec)
                {
                    self->on_read();
                }
                else
                {
                    self->stats_.record_failure();
                }
            });
    }

    void on_read()
    {
        auto end = std::chrono::steady_clock::now();
        auto latency = std::chrono::duration<double, std::milli>(end - start_time_).count();
        
        socket_.shutdown(tcp::socket::shutdown_both);
        
        if (res_.result() == http::status::ok)
        {
            stats_.record_success(latency, body_.size());
        }
        else
        {
            stats_.record_failure();
        }
    }
};

//------------------------------------------------------------------------------

int main(int argc, char* argv[])
{
    try
    {
        unsigned short port = 8080;
        int requests = 10000;
        int concurrency = 50;
        size_t body_size = 1048576; // 1 MB
        int server_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));

        // Parse arguments
        for (int i = 1; i < argc; ++i)
        {
            std::string arg = argv[i];
            if (arg == "--port" && i + 1 < argc)
            {
                port = static_cast<unsigned short>(std::stoi(argv[++i]));
            }
            else if (arg == "--requests" && i + 1 < argc)
            {
                requests = std::stoi(argv[++i]);
            }
            else if (arg == "--concurrency" && i + 1 < argc)
            {
                concurrency = std::stoi(argv[++i]);
            }
            else if (arg == "--size" && i + 1 < argc)
            {
                body_size = std::stoull(argv[++i]);
            }
            else if (arg == "--threads" && i + 1 < argc)
            {
                server_threads = std::stoi(argv[++i]);
            }
            else if (arg == "--help" || arg == "-h")
            {
                std::cout << "Usage: " << argv[0] << " [OPTIONS]\n"
                          << "Options:\n"
                          << "  --port PORT        Server port (default: 8080)\n"
                          << "  --requests N       Number of requests (default: 10000)\n"
                          << "  --concurrency N    Concurrent requests (default: 50)\n"
                          << "  --size N           Request body size in bytes (default: 1048576)\n"
                          << "  --threads N        Server worker threads (default: CPU count)\n"
                          << "  --help, -h         Show this help\n";
                return 0;
            }
        }

        std::cout << "Boost.Beast Benchmark - Server + Client\n";
        std::cout << "========================================\n";
        std::cout << "Port:          " << port << "\n";
        std::cout << "Requests:      " << requests << "\n";
        std::cout << "Concurrency:   " << concurrency << "\n";
        std::cout << "Body size:     " << body_size << " bytes (" 
                  << (body_size / 1024 / 1024.0) << " MB)\n";
        std::cout << "Server threads: " << server_threads << "\n\n";

        // Start server in separate thread
        asio::io_context server_ioc{server_threads};
        auto address = asio::ip::make_address("0.0.0.0");
        auto server_lst = std::make_shared<server_listener>(
            server_ioc.get_executor(),
            tcp::endpoint{address, port});
        server_lst->run();

        std::vector<std::thread> server_threads_vec;
        server_threads_vec.reserve(server_threads - 1);
        for (auto i = server_threads - 1; i > 0; --i)
            server_threads_vec.emplace_back([&server_ioc] { server_ioc.run(); });

        // Give server a moment to start
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        // Run client benchmark
        std::string body(body_size, 'X');
        asio::io_context client_ioc;
        BenchmarkStats stats;
        stats.start_time = std::chrono::steady_clock::now();

        // Create and run client sessions
        // Simple approach: start all requests, let asio handle concurrency
        std::vector<std::shared_ptr<client_session>> sessions;
        sessions.reserve(requests);

        for (int i = 0; i < requests; ++i)
        {
            auto session = std::make_shared<client_session>(
                client_ioc, "localhost", std::to_string(port), 
                "/test" + std::to_string(i), body, stats);
            session->run();
            sessions.push_back(session);
        }

        // Run client io_context
        std::thread client_thread([&client_ioc] { client_ioc.run(); });

        // Wait for all requests to complete
        // (We'll wait until stats show all requests are done)
        while (true)
        {
            std::this_thread::sleep_for(std::chrono::milliseconds(100));
            std::lock_guard<std::mutex> lock(stats.mutex);
            if (stats.success_count + stats.failure_count >= requests)
                break;
        }

        // Stop and wait
        client_ioc.stop();
        client_thread.join();
        stats.end_time = std::chrono::steady_clock::now();

        // Stop server
        server_ioc.stop();
        for (auto& t : server_threads_vec)
            t.join();

        stats.print_results();
    }
    catch (const std::exception& e)
    {
        std::cerr << "Error: " << e.what() << "\n";
        return 1;
    }

    return 0;
}



```
Could you please shed some light on this?


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[perf tuning] benchmark shows only 200MB/s even with 128 threads #3064

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

[perf tuning] benchmark shows only 200MB/s even with 128 threads #3064

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions