Skip to content

Commit e967342

Browse files
committed
Add in SIMD patch
1 parent 5dcb561 commit e967342

1 file changed

Lines changed: 249 additions & 2 deletions

File tree

patches/curl.patch

Lines changed: 249 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8937,10 +8937,257 @@ index 662539cd89..29f545c69e 100644
89378937
r = cf_ssl_peer_key_add_path(&buf, "CA", ssl->CAfile, &is_local);
89388938
if(r)
89398939
diff --git a/lib/ws.c b/lib/ws.c
8940-
index 7f8a688ca1..ab34231bcc 100644
8940+
index 7f8a688ca1..150ab6f38d 100644
89418941
--- a/lib/ws.c
89428942
+++ b/lib/ws.c
8943-
@@ -1587,7 +1587,7 @@ const struct Curl_handler Curl_handler_wss = {
8943+
@@ -26,6 +26,13 @@
8944+
8945+
#if !defined(CURL_DISABLE_WEBSOCKETS) && !defined(CURL_DISABLE_HTTP)
8946+
8947+
+#include <stdint.h>
8948+
+#if defined(__x86_64__) || defined(_M_X64)
8949+
+ #include <immintrin.h>
8950+
+#elif defined(__aarch64__) || defined(_M_ARM64)
8951+
+ #include <arm_neon.h>
8952+
+#endif
8953+
+
8954+
#include "urldata.h"
8955+
#include "url.h"
8956+
#include "bufq.h"
8957+
@@ -74,8 +81,74 @@
8958+
#define WSBIT_MASK 0x80
8959+
8960+
/* buffer dimensioning */
8961+
-#define WS_CHUNK_SIZE 65535
8962+
-#define WS_CHUNK_COUNT 2
8963+
+#define WS_CHUNK_SIZE 131072
8964+
+#define WS_CHUNK_COUNT 4
8965+
+
8966+
+#ifndef WS_ENC_XBUF_SIZE
8967+
+#define WS_ENC_XBUF_SIZE 8192
8968+
+#endif
8969+
+
8970+
+/* CPU Feature Bitmask Constants */
8971+
+#define WS_CPU_FEAT_INIT ((uint32_t)1 << 0)
8972+
+#define WS_CPU_FEAT_AVX2 ((uint32_t)1 << 1)
8973+
+#define WS_CPU_FEAT_AVX512 ((uint32_t)1 << 2)
8974+
+
8975+
+/* Determine endianness */
8976+
+#if (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || \
8977+
+ defined(_WIN32) || defined(_WIN64) || defined(__LITTLE_ENDIAN__)
8978+
+#define WS_IS_LITTLE_ENDIAN 1
8979+
+#else
8980+
+#define WS_IS_LITTLE_ENDIAN 0
8981+
+#endif
8982+
+
8983+
+#if WS_IS_LITTLE_ENDIAN
8984+
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
8985+
/* Mask 'len' bytes of 'src' into 'dst' by XOR-ing with the repeating
 * 4-byte WebSocket key, pre-replicated into the 32-bit word 'm32'.
 * AVX-512 path: two ZMM registers (128 bytes) per iteration, unaligned
 * loads/stores. Returns the number of bytes actually processed; the
 * caller must mask any remaining tail with a scalar loop. */
__attribute__((target("avx512f")))
static size_t ws_xor_avx512(const unsigned char *src, unsigned char *dst,
                            size_t len, uint32_t m32)
{
  const __m512i key = _mm512_set1_epi32((int)m32);
  size_t off = 0;

  while(off + 128 <= len) {
    __m512i lo = _mm512_loadu_si512((const void *)(src + off));
    __m512i hi = _mm512_loadu_si512((const void *)(src + off + 64));
    _mm512_storeu_si512((void *)(dst + off), _mm512_xor_si512(lo, key));
    _mm512_storeu_si512((void *)(dst + off + 64), _mm512_xor_si512(hi, key));
    off += 128;
  }
  return off;
}
8998+
+
8999+
/* Mask 'len' bytes of 'src' into 'dst' by XOR-ing with the repeating
 * 4-byte WebSocket key, pre-replicated into the 32-bit word 'm32'.
 * AVX2 path: two YMM registers (64 bytes) per iteration, unaligned
 * loads/stores. Returns the number of bytes actually processed; the
 * caller must mask any remaining tail with a scalar loop. */
__attribute__((target("avx2")))
static size_t ws_xor_avx2(const unsigned char *src, unsigned char *dst,
                          size_t len, uint32_t m32)
{
  const __m256i key = _mm256_set1_epi32((int)m32);
  size_t off = 0;

  while(off + 64 <= len) {
    __m256i lo = _mm256_loadu_si256((const __m256i *)(const void *)(src + off));
    __m256i hi = _mm256_loadu_si256((const __m256i *)(const void *)(src + off + 32));
    _mm256_storeu_si256((__m256i *)(void *)(dst + off),
                        _mm256_xor_si256(lo, key));
    _mm256_storeu_si256((__m256i *)(void *)(dst + off + 32),
                        _mm256_xor_si256(hi, key));
    off += 64;
  }
  return off;
}
9012+
+#endif
9013+
+
9014+
+#if defined(__aarch64__) || defined(_M_ARM64)
9015+
/* Mask 'len' bytes of 'src' into 'dst' by XOR-ing with the repeating
 * 4-byte WebSocket key, pre-replicated into the 32-bit word 'm32'.
 * NEON path: two 128-bit Q registers (32 bytes) per iteration.
 * Returns the number of bytes actually processed; the caller must
 * mask any remaining tail with a scalar loop. */
static size_t ws_xor_neon(const unsigned char *src, unsigned char *dst,
                          size_t len, uint32_t m32)
{
  const uint8x16_t key = vreinterpretq_u8_u32(vdupq_n_u32(m32));
  size_t off = 0;

  while(off + 32 <= len) {
    vst1q_u8(dst + off, veorq_u8(vld1q_u8(src + off), key));
    vst1q_u8(dst + off + 16, veorq_u8(vld1q_u8(src + off + 16), key));
    off += 32;
  }
  return off;
}
9029+
+#endif
9030+
+#endif
9031+
9032+
9033+
/* a client-side WS frame decoder, parsing frame headers and
9034+
@@ -682,9 +755,9 @@ static CURLcode ws_cw_write(struct Curl_easy *data,
9035+
}
9036+
}
9037+
9038+
- if((type & CLIENTWRITE_EOS) && !Curl_bufq_is_empty(&ctx->buf)) {
9039+
- failf(data, "[WS] decode ending with %zd frame bytes remaining",
9040+
- Curl_bufq_len(&ctx->buf));
9041+
+ if((type & CLIENTWRITE_EOS) && (!Curl_bufq_is_empty(&ctx->buf) || ws->dec.state != WS_DEC_INIT)) {
9042+
+ failf(data, "[WS] decode ending with %zd bytes remaining and "
9043+
+ "incomplete frame", Curl_bufq_len(&ctx->buf));
9044+
return CURLE_RECV_ERROR;
9045+
}
9046+
9047+
@@ -853,29 +926,102 @@ static ssize_t ws_enc_write_payload(struct ws_encoder *enc,
9048+
const unsigned char *buf, size_t buflen,
9049+
struct bufq *out, CURLcode *err)
9050+
{
9051+
- size_t i, len, n;
9052+
+ size_t i = 0, len, n, chunk, j;
9053+
+
9054+
+/* Cache line alignment */
9055+
+#if defined(__GNUC__) || defined(__clang__)
9056+
+ __attribute__((aligned(64))) unsigned char xbuf[WS_ENC_XBUF_SIZE];
9057+
+#elif defined(_MSC_VER)
9058+
+ __declspec(align(64)) unsigned char xbuf[WS_ENC_XBUF_SIZE];
9059+
+#else
9060+
+ unsigned char xbuf[WS_ENC_XBUF_SIZE];
9061+
+#endif
9062+
+
9063+
+ /* Defensive check for finished payloads */
9064+
+ if(enc->payload_remain <= 0) {
9065+
+ ws_enc_info(enc, data, "buffered");
9066+
+ return 0;
9067+
+ }
9068+
+
9069+
+ /* CPU feature cache. */
9070+
+#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
9071+
+ static uint32_t cpu_feats = 0;
9072+
+ if(!__atomic_load_n(&cpu_feats, __ATOMIC_ACQUIRE)) {
9073+
+ uint32_t f = WS_CPU_FEAT_INIT;
9074+
+ __builtin_cpu_init();
9075+
+ if(__builtin_cpu_supports("avx2"))
9076+
+ f |= WS_CPU_FEAT_AVX2;
9077+
+ if(__builtin_cpu_supports("avx512f"))
9078+
+ f |= WS_CPU_FEAT_AVX512;
9079+
+ __atomic_store_n(&cpu_feats, f, __ATOMIC_RELEASE);
9080+
+ }
9081+
+#endif
9082+
9083+
if(Curl_bufq_is_full(out)) {
9084+
*err = CURLE_AGAIN;
9085+
return -1;
9086+
}
9087+
9088+
- /* not the most performant way to do this */
9089+
len = buflen;
9090+
if((curl_off_t)len > enc->payload_remain)
9091+
len = (size_t)enc->payload_remain;
9092+
9093+
- for(i = 0; i < len; ++i) {
9094+
- unsigned char c = buf[i] ^ enc->mask[enc->xori];
9095+
- *err = Curl_bufq_write(out, &c, 1, &n);
9096+
+ while(i < len) {
9097+
+ unsigned int sx = enc->xori;
9098+
+ unsigned char m[4] = { enc->mask[sx&3], enc->mask[(sx+1)&3],
9099+
+ enc->mask[(sx+2)&3], enc->mask[(sx+3)&3] };
9100+
+
9101+
+ chunk = len - i;
9102+
+ if(chunk > sizeof(xbuf)) chunk = sizeof(xbuf);
9103+
+
9104+
+ j = 0;
9105+
+
9106+
+/* SIMD and 32-bit Little-Endian gated scalar paths */
9107+
+#if WS_IS_LITTLE_ENDIAN
9108+
+
9109+
+ uint32_t m32;
9110+
+ memcpy(&m32, m, 4);
9111+
+
9112+
+ #if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
9113+
+ {
9114+
+ /* Reload feats before dispatch */
9115+
+ uint32_t f = __atomic_load_n(&cpu_feats, __ATOMIC_RELAXED);
9116+
+ if(f & WS_CPU_FEAT_AVX512) {
9117+
+ j = ws_xor_avx512(buf + i, xbuf, chunk, m32);
9118+
+ } else if(f & WS_CPU_FEAT_AVX2) {
9119+
+ j = ws_xor_avx2(buf + i, xbuf, chunk, m32);
9120+
+ }
9121+
+ }
9122+
+ #elif defined(__aarch64__) || defined(_M_ARM64)
9123+
+ /* NEON path - little endian AARCH64 */
9124+
+ j = ws_xor_neon(buf + i, xbuf, chunk, m32);
9125+
+ #endif
9126+
+
9127+
+ for(; j + 4 <= chunk; j += 4) {
9128+
+ uint32_t d32;
9129+
+ memcpy(&d32, buf + i + j, 4);
9130+
+ d32 ^= m32;
9131+
+ memcpy(xbuf + j, &d32, 4);
9132+
+ }
9133+
+#endif
9134+
+
9135+
+ /* Universal Scalar Fallback */
9136+
+ for(; j < chunk; ++j) {
9137+
+ xbuf[j] = buf[i + j] ^ m[j & 3];
9138+
+ }
9139+
+
9140+
+ *err = Curl_bufq_write(out, xbuf, chunk, &n);
9141+
+ if(n > 0) {
9142+
+ i += n;
9143+
+ enc->xori = (sx + (unsigned int)n) & 3;
9144+
+ }
9145+
if(*err) {
9146+
if((*err != CURLE_AGAIN) || !i)
9147+
return -1;
9148+
break;
9149+
}
9150+
- enc->xori++;
9151+
- enc->xori &= 3;
9152+
}
9153+
+
9154+
enc->payload_remain -= (curl_off_t)i;
9155+
ws_enc_info(enc, data, "buffered");
9156+
return (ssize_t)i;
9157+
@@ -1270,6 +1416,12 @@ static CURLcode ws_flush(struct Curl_easy *data, struct websocket *ws,
9158+
result = CURLE_AGAIN;
9159+
}
9160+
9161+
+ /* Advance buffer for any bytes successfully written before handling errors */
9162+
+ if(n > 0) {
9163+
+ CURL_TRC_WS(data, "flushed %zu bytes", n);
9164+
+ Curl_bufq_skip(&ws->sendbuf, n);
9165+
+ }
9166+
+
9167+
if(result == CURLE_AGAIN) {
9168+
CURL_TRC_WS(data, "flush EAGAIN, %zu bytes remain in buffer",
9169+
Curl_bufq_len(&ws->sendbuf));
9170+
@@ -1279,10 +1431,6 @@ static CURLcode ws_flush(struct Curl_easy *data, struct websocket *ws,
9171+
failf(data, "[WS] flush, write error %d", result);
9172+
return result;
9173+
}
9174+
- else {
9175+
- CURL_TRC_WS(data, "flushed %zu bytes", n);
9176+
- Curl_bufq_skip(&ws->sendbuf, n);
9177+
- }
9178+
}
9179+
}
9180+
return CURLE_OK;
9181+
@@ -1353,6 +1501,8 @@ static CURLcode ws_send_raw(struct Curl_easy *data, const void *buffer,
9182+
if(result)
9183+
return result;
9184+
result = ws_send_raw_blocking(data, ws, buffer, buflen);
9185+
+ if(!result)
9186+
+ *pnwritten = buflen;
9187+
}
9188+
else {
9189+
/* We need any pending data to be sent or EAGAIN this call. */
9190+
@@ -1587,7 +1737,7 @@ const struct Curl_handler Curl_handler_wss = {
89449191
CURLPROTO_WSS, /* protocol */
89459192
CURLPROTO_HTTP, /* family */
89469193
PROTOPT_SSL | PROTOPT_CREDSPERREQUEST | /* flags */

0 commit comments

Comments
 (0)