@@ -8937,10 +8937,257 @@ index 662539cd89..29f545c69e 100644
89378937 r = cf_ssl_peer_key_add_path(&buf, "CA", ssl->CAfile, &is_local);
89388938 if(r)
89398939diff --git a/lib/ws.c b/lib/ws.c
8940- index 7f8a688ca1..ab34231bcc 100644
8940+ index 7f8a688ca1..150ab6f38d 100644
89418941--- a/lib/ws.c
89428942+++ b/lib/ws.c
8943- @@ -1587,7 +1587,7 @@ const struct Curl_handler Curl_handler_wss = {
8943+ @@ -26,6 +26,13 @@
8944+
8945+ #if !defined(CURL_DISABLE_WEBSOCKETS) && !defined(CURL_DISABLE_HTTP)
8946+
8947+ + #include <stdint.h>
8948+ + #if defined(__x86_64__) || defined(_M_X64)
8949+ + #include <immintrin.h>
8950+ + #elif defined(__aarch64__) || defined(_M_ARM64)
8951+ + #include <arm_neon.h>
8952+ + #endif
8953+ +
8954+ #include "urldata.h"
8955+ #include "url.h"
8956+ #include "bufq.h"
8957+ @@ -74,8 +81,74 @@
8958+ #define WSBIT_MASK 0x80
8959+
8960+ /* buffer dimensioning */
8961+ - #define WS_CHUNK_SIZE 65535
8962+ - #define WS_CHUNK_COUNT 2
8963+ + #define WS_CHUNK_SIZE 131072
8964+ + #define WS_CHUNK_COUNT 4
8965+ +
8966+ + #ifndef WS_ENC_XBUF_SIZE
8967+ + #define WS_ENC_XBUF_SIZE 8192
8968+ + #endif
8969+ +
8970+ + /* CPU Feature Bitmask Constants */
8971+ + #define WS_CPU_FEAT_INIT ((uint32_t)1 << 0)
8972+ + #define WS_CPU_FEAT_AVX2 ((uint32_t)1 << 1)
8973+ + #define WS_CPU_FEAT_AVX512 ((uint32_t)1 << 2)
8974+ +
8975+ + /* Determine endianness */
8976+ + #if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || \
8977+ + defined(_WIN32) || defined(_WIN64) || defined(__LITTLE_ENDIAN__)
8978+ + #define WS_IS_LITTLE_ENDIAN 1
8979+ + #else
8980+ + #define WS_IS_LITTLE_ENDIAN 0
8981+ + #endif
8982+ +
8983+ + #if WS_IS_LITTLE_ENDIAN
8984+ + #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
8985+ + /* AVX-512 Path: 128 bytes per iteration (2x unrolled) */
8986+ + __attribute__((target("avx512f")))
8987+ + static size_t ws_xor_avx512(const unsigned char *src, unsigned char *dst, size_t len, uint32_t m32) {
8988+ + size_t j = 0;
8989+ + __m512i vmask = _mm512_set1_epi32((int)m32);
8990+ + for(; j + 128 <= len; j += 128) {
8991+ + __m512i v0 = _mm512_loadu_si512((const __m512i*)(const void *)(src + j));
8992+ + __m512i v1 = _mm512_loadu_si512((const __m512i*)(const void *)(src + j + 64));
8993+ + _mm512_storeu_si512((__m512i*)(void *)(dst + j), _mm512_xor_si512(v0, vmask));
8994+ + _mm512_storeu_si512((__m512i*)(void *)(dst + j + 64), _mm512_xor_si512(v1, vmask));
8995+ + }
8996+ + return j;
8997+ + }
8998+ +
8999+ + /* AVX2 Path: 64 bytes per iteration (2x unrolled) */
9000+ + __attribute__((target("avx2")))
9001+ + static size_t ws_xor_avx2(const unsigned char *src, unsigned char *dst, size_t len, uint32_t m32) {
9002+ + size_t j = 0;
9003+ + __m256i vmask = _mm256_set1_epi32((int)m32);
9004+ + for(; j + 64 <= len; j += 64) {
9005+ + __m256i v0 = _mm256_loadu_si256((const __m256i*)(const void *)(src + j));
9006+ + __m256i v1 = _mm256_loadu_si256((const __m256i*)(const void *)(src + j + 32));
9007+ + _mm256_storeu_si256((__m256i*)(void *)(dst + j), _mm256_xor_si256(v0, vmask));
9008+ + _mm256_storeu_si256((__m256i*)(void *)(dst + j + 32), _mm256_xor_si256(v1, vmask));
9009+ + }
9010+ + return j;
9011+ + }
9012+ + #endif
9013+ +
9014+ + #if defined(__aarch64__) || defined(_M_ARM64)
9015+ + /* NEON Path: 32 bytes per iteration (2x unrolled) */
9016+ + static size_t ws_xor_neon(const unsigned char *src, unsigned char *dst, size_t len, uint32_t m32) {
9017+ + size_t j = 0;
9018+ + uint32x4_t vmask = vdupq_n_u32(m32);
9019+ + for(; j + 32 <= len; j += 32) {
9020+ + uint8x16_t v0 = vld1q_u8(src + j);
9021+ + uint8x16_t v1 = vld1q_u8(src + j + 16);
9022+ + v0 = veorq_u8(v0, vreinterpretq_u8_u32(vmask));
9023+ + v1 = veorq_u8(v1, vreinterpretq_u8_u32(vmask));
9024+ + vst1q_u8(dst + j, v0);
9025+ + vst1q_u8(dst + j + 16, v1);
9026+ + }
9027+ + return j;
9028+ + }
9029+ + #endif
9030+ + #endif
9031+
9032+
9033+ /* a client-side WS frame decoder, parsing frame headers and
9034+ @@ -682,9 +755,9 @@ static CURLcode ws_cw_write(struct Curl_easy *data,
9035+ }
9036+ }
9037+
9038+ - if((type & CLIENTWRITE_EOS) && !Curl_bufq_is_empty(&ctx->buf)) {
9039+ - failf(data, "[WS] decode ending with %zd frame bytes remaining",
9040+ - Curl_bufq_len(&ctx->buf));
9041+ + if((type & CLIENTWRITE_EOS) && (!Curl_bufq_is_empty(&ctx->buf) || ws->dec.state != WS_DEC_INIT)) {
9042+ + failf(data, "[WS] decode ending with %zd bytes remaining and "
9043+ + "incomplete frame", Curl_bufq_len(&ctx->buf));
9044+ return CURLE_RECV_ERROR;
9045+ }
9046+
9047+ @@ -853,29 +926,102 @@ static ssize_t ws_enc_write_payload(struct ws_encoder *enc,
9048+ const unsigned char *buf, size_t buflen,
9049+ struct bufq *out, CURLcode *err)
9050+ {
9051+ - size_t i, len, n;
9052+ + size_t i = 0, len, n, chunk, j;
9053+ +
9054+ + /* Cache line alignment */
9055+ + #if defined(__GNUC__) || defined(__clang__)
9056+ + __attribute__((aligned(64))) unsigned char xbuf[WS_ENC_XBUF_SIZE];
9057+ + #elif defined(_MSC_VER)
9058+ + __declspec(align(64)) unsigned char xbuf[WS_ENC_XBUF_SIZE];
9059+ + #else
9060+ + unsigned char xbuf[WS_ENC_XBUF_SIZE];
9061+ + #endif
9062+ +
9063+ + /* Defensive check for finished payloads */
9064+ + if(enc->payload_remain <= 0) {
9065+ + ws_enc_info(enc, data, "buffered");
9066+ + return 0;
9067+ + }
9068+ +
9069+ + /* CPU feature cache. */
9070+ + #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
9071+ + static uint32_t cpu_feats = 0;
9072+ + if(!__atomic_load_n(&cpu_feats, __ATOMIC_ACQUIRE)) {
9073+ + uint32_t f = WS_CPU_FEAT_INIT;
9074+ + __builtin_cpu_init();
9075+ + if(__builtin_cpu_supports("avx2"))
9076+ + f |= WS_CPU_FEAT_AVX2;
9077+ + if(__builtin_cpu_supports("avx512f"))
9078+ + f |= WS_CPU_FEAT_AVX512;
9079+ + __atomic_store_n(&cpu_feats, f, __ATOMIC_RELEASE);
9080+ + }
9081+ + #endif
9082+
9083+ if(Curl_bufq_is_full(out)) {
9084+ *err = CURLE_AGAIN;
9085+ return -1;
9086+ }
9087+
9088+ - /* not the most performant way to do this */
9089+ len = buflen;
9090+ if((curl_off_t)len > enc->payload_remain)
9091+ len = (size_t)enc->payload_remain;
9092+
9093+ - for(i = 0; i < len; ++i) {
9094+ - unsigned char c = buf[i] ^ enc->mask[enc->xori];
9095+ - *err = Curl_bufq_write(out, &c, 1, &n);
9096+ + while(i < len) {
9097+ + unsigned int sx = enc->xori;
9098+ + unsigned char m[4] = { enc->mask[sx&3], enc->mask[(sx+1)&3],
9099+ + enc->mask[(sx+2)&3], enc->mask[(sx+3)&3] };
9100+ +
9101+ + chunk = len - i;
9102+ + if(chunk > sizeof(xbuf)) chunk = sizeof(xbuf);
9103+ +
9104+ + j = 0;
9105+ +
9106+ + /* SIMD and 32-bit Little-Endian gated scalar paths */
9107+ + #if WS_IS_LITTLE_ENDIAN
9108+ +
9109+ + uint32_t m32;
9110+ + memcpy(&m32, m, 4);
9111+ +
9112+ + #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
9113+ + {
9114+ + /* Reload feats before dispatch */
9115+ + uint32_t f = __atomic_load_n(&cpu_feats, __ATOMIC_RELAXED);
9116+ + if(f & WS_CPU_FEAT_AVX512) {
9117+ + j = ws_xor_avx512(buf + i, xbuf, chunk, m32);
9118+ + } else if(f & WS_CPU_FEAT_AVX2) {
9119+ + j = ws_xor_avx2(buf + i, xbuf, chunk, m32);
9120+ + }
9121+ + }
9122+ + #elif defined(__aarch64__) || defined(_M_ARM64)
9123+ + /* NEON path - little endian AARCH64 */
9124+ + j = ws_xor_neon(buf + i, xbuf, chunk, m32);
9125+ + #endif
9126+ +
9127+ + for(; j + 4 <= chunk; j += 4) {
9128+ + uint32_t d32;
9129+ + memcpy(&d32, buf + i + j, 4);
9130+ + d32 ^= m32;
9131+ + memcpy(xbuf + j, &d32, 4);
9132+ + }
9133+ + #endif
9134+ +
9135+ + /* Universal Scalar Fallback */
9136+ + for(; j < chunk; ++j) {
9137+ + xbuf[j] = buf[i + j] ^ m[j & 3];
9138+ + }
9139+ +
9140+ + *err = Curl_bufq_write(out, xbuf, chunk, &n);
9141+ + if(n > 0) {
9142+ + i += n;
9143+ + enc->xori = (sx + (unsigned int)n) & 3;
9144+ + }
9145+ if(*err) {
9146+ if((*err != CURLE_AGAIN) || !i)
9147+ return -1;
9148+ break;
9149+ }
9150+ - enc->xori++;
9151+ - enc->xori &= 3;
9152+ }
9153+ +
9154+ enc->payload_remain -= (curl_off_t)i;
9155+ ws_enc_info(enc, data, "buffered");
9156+ return (ssize_t)i;
9157+ @@ -1270,6 +1416,12 @@ static CURLcode ws_flush(struct Curl_easy *data, struct websocket *ws,
9158+ result = CURLE_AGAIN;
9159+ }
9160+
9161+ + /* Advance buffer for any bytes successfully written before handling errors */
9162+ + if(n > 0) {
9163+ + CURL_TRC_WS(data, "flushed %zu bytes", n);
9164+ + Curl_bufq_skip(&ws->sendbuf, n);
9165+ + }
9166+ +
9167+ if(result == CURLE_AGAIN) {
9168+ CURL_TRC_WS(data, "flush EAGAIN, %zu bytes remain in buffer",
9169+ Curl_bufq_len(&ws->sendbuf));
9170+ @@ -1279,10 +1431,6 @@ static CURLcode ws_flush(struct Curl_easy *data, struct websocket *ws,
9171+ failf(data, "[WS] flush, write error %d", result);
9172+ return result;
9173+ }
9174+ - else {
9175+ - CURL_TRC_WS(data, "flushed %zu bytes", n);
9176+ - Curl_bufq_skip(&ws->sendbuf, n);
9177+ - }
9178+ }
9179+ }
9180+ return CURLE_OK;
9181+ @@ -1353,6 +1501,8 @@ static CURLcode ws_send_raw(struct Curl_easy *data, const void *buffer,
9182+ if(result)
9183+ return result;
9184+ result = ws_send_raw_blocking(data, ws, buffer, buflen);
9185+ + if(!result)
9186+ + *pnwritten = buflen;
9187+ }
9188+ else {
9189+ /* We need any pending data to be sent or EAGAIN this call. */
9190+ @@ -1587,7 +1737,7 @@ const struct Curl_handler Curl_handler_wss = {
89449191 CURLPROTO_WSS, /* protocol */
89459192 CURLPROTO_HTTP, /* family */
89469193 PROTOPT_SSL | PROTOPT_CREDSPERREQUEST | /* flags */
0 commit comments