
Commit 2bb3239

issue: 1557652 Improve memory copy for Blue Flame usage

Select the write-to-BF (Blue Flame) operation based on the CPU instruction set.

Signed-off-by: Igor Ivanov <igor.ivanov.va@gmail.com>

1 parent: b173291

File tree: 7 files changed (+182, -66 lines)

src/utils/asm-arm64.h

Lines changed: 0 additions & 12 deletions
@@ -37,22 +37,12 @@
 #include <stdint.h>
 #include <unistd.h>
 
-#define COPY_64B_NT(dst, src) \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++
 
 #define mb() asm volatile("dsb sy" ::: "memory")
 #define rmb() asm volatile("dsb ld" ::: "memory")
 #define wmb() asm volatile("dsb st" ::: "memory")
 #define wc_wmb() wmb()
 
-
 /**
  * Read RDTSC register
  */
@@ -84,6 +74,4 @@ static inline void prefetch_range(void *addr, size_t len)
         prefetch(cp);
 }
 
-
-
 #endif

src/utils/asm-ppc64.h

Lines changed: 0 additions & 9 deletions
@@ -37,15 +37,6 @@
 #include <stdint.h>
 #include <unistd.h>
 
-#define COPY_64B_NT(dst, src) \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++; \
-    *dst++ = *src++
 
 #define mb() asm volatile("sync" ::: "memory")
 #define rmb() asm volatile("lwsync" ::: "memory")

src/utils/asm-x86.h

Lines changed: 131 additions & 15 deletions
@@ -43,21 +43,6 @@
 #define wmb() asm volatile("" ::: "memory")
 #define wc_wmb() asm volatile("sfence" ::: "memory")
 
-#define COPY_64B_NT(dst, src) \
-    __asm__ __volatile__ ( \
-    " movdqa (%1),%%xmm0\n" \
-    " movdqa 16(%1),%%xmm1\n" \
-    " movdqa 32(%1),%%xmm2\n" \
-    " movdqa 48(%1),%%xmm3\n" \
-    " movntdq %%xmm0, (%0)\n" \
-    " movntdq %%xmm1, 16(%0)\n" \
-    " movntdq %%xmm2, 32(%0)\n" \
-    " movntdq %%xmm3, 48(%0)\n" \
-    : : "r" (dst), "r" (src) : "memory"); \
-    dst += 8; \
-    src += 8
-
-
 /**
  * Add to the atomic variable.
  * @param i integer value to add.
@@ -117,4 +102,135 @@ static inline void prefetch_range(void *addr, size_t len)
         prefetch(cp);
 }
 
+enum {
+    CPU_FLAG_CMOV  = (1 << 0),
+    CPU_FLAG_MMX   = (1 << 1),
+    CPU_FLAG_MMX2  = (1 << 2),
+    CPU_FLAG_SSE   = (1 << 3),
+    CPU_FLAG_SSE2  = (1 << 4),
+    CPU_FLAG_SSE3  = (1 << 5),
+    CPU_FLAG_SSSE3 = (1 << 6),
+    CPU_FLAG_SSE41 = (1 << 7),
+    CPU_FLAG_SSE42 = (1 << 8),
+    CPU_FLAG_AVX   = (1 << 9),
+    CPU_FLAG_AVX2  = (1 << 10)
+};
+
+#define X86_CPUID_GET_MODEL      0x00000001u
+#define X86_CPUID_GET_BASE_VALUE 0x00000000u
+#define X86_CPUID_GET_EXTD_VALUE 0x00000007u
+#define X86_CPUID_GET_MAX_VALUE  0x80000000u
+
+VMA_ATTRIBUTE_OPTIMIZE_NONE
+static inline void __x86_cpuid(uint32_t level,
+        uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
+{
+    asm volatile ("cpuid\n\t"
+            : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
+            : "0" (level));
+}
+
+/* This allows the CPU detection to work with assemblers not supporting
+ * the xgetbv mnemonic.
+ */
+#define __x86_xgetbv(_index, _eax, _edx) \
+    asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a"(_eax), "=d"(_edx) : "c" (_index))
+
+/**
+ * Read CPU instruction set
+ */
+VMA_ATTRIBUTE_OPTIMIZE_NONE
+static inline int cpuid_flags()
+{
+    static int cpu_flag = -1;
+
+    if (cpu_flag < 0) {
+        uint32_t result = 0;
+        uint32_t base_value;
+        uint32_t _eax, _ebx, _ecx, _edx;
+
+        __x86_cpuid(X86_CPUID_GET_BASE_VALUE, &_eax, &_ebx, &_ecx, &_edx);
+        base_value = _eax;
+
+        if (base_value >= 1) {
+            __x86_cpuid(X86_CPUID_GET_MODEL, &_eax, &_ebx, &_ecx, &_edx);
+            if (_edx & (1 << 15)) {
+                result |= CPU_FLAG_CMOV;
+            }
+            if (_edx & (1 << 23)) {
+                result |= CPU_FLAG_MMX;
+            }
+            if (_edx & (1 << 25)) {
+                result |= CPU_FLAG_MMX2;
+            }
+            if (_edx & (1 << 25)) {
+                result |= CPU_FLAG_SSE;
+            }
+            if (_edx & (1 << 26)) {
+                result |= CPU_FLAG_SSE2;
+            }
+            if (_ecx & 1) {
+                result |= CPU_FLAG_SSE3;
+            }
+            if (_ecx & (1 << 9)) {
+                result |= CPU_FLAG_SSSE3;
+            }
+            if (_ecx & (1 << 19)) {
+                result |= CPU_FLAG_SSE41;
+            }
+            if (_ecx & (1 << 20)) {
+                result |= CPU_FLAG_SSE42;
+            }
+            if ((_ecx & 0x18000000) == 0x18000000) {
+                __x86_xgetbv(0, _eax, _edx);
+                if ((_eax & 0x6) == 0x6) {
+                    result |= CPU_FLAG_AVX;
+                }
+            }
+        }
+        if (base_value >= 7) {
+            __x86_cpuid(X86_CPUID_GET_EXTD_VALUE, &_eax, &_ebx, &_ecx, &_edx);
+            if ((result & CPU_FLAG_AVX) && (_ebx & (1 << 5))) {
+                result |= CPU_FLAG_AVX2;
+            }
+        }
+        cpu_flag = result;
+    }
+
+    return cpu_flag;
+}
+
+#define __vma_memory_copy64(_dst, _src) \
+{ \
+    static int is_wc_simd = cpuid_flags() & \
+            (CPU_FLAG_SSE3 | CPU_FLAG_SSSE3 | \
+             CPU_FLAG_SSE41 | CPU_FLAG_SSE42 | \
+             CPU_FLAG_AVX | CPU_FLAG_AVX2); \
+\
+    if (is_wc_simd) { \
+        __asm__ __volatile__ ( \
+        " movdqa (%1), %%xmm0\n" \
+        " movdqa 16(%1), %%xmm1\n" \
+        " movdqa 32(%1), %%xmm2\n" \
+        " movdqa 48(%1), %%xmm3\n" \
+\
+        " movntdq %%xmm0, (%0)\n" \
+        " movntdq %%xmm1, 16(%0)\n" \
+        " movntdq %%xmm2, 32(%0)\n" \
+        " movntdq %%xmm3, 48(%0)\n" \
+        : : "r" (_dst), "r" (_src) : "memory"); \
+        _dst += 8; \
+        _src += 8; \
+    } else { \
+        *_dst++ = *_src++; \
+        *_dst++ = *_src++; \
+        *_dst++ = *_src++; \
+        *_dst++ = *_src++; \
+        *_dst++ = *_src++; \
+        *_dst++ = *_src++; \
+        *_dst++ = *_src++; \
+        *_dst++ = *_src++; \
+    } \
+}
+
 #endif
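
The detection above follows the standard x86 procedure. As a reference point, here is a minimal standalone harness (hypothetical, not part of this commit) that exercises the same checks: CPUID leaf 1 for the SSE-family feature bits, then OSXSAVE plus XGETBV to confirm the OS actually saves XMM/YMM state before claiming AVX — the CPU bit alone is not sufficient.

// cpuid_probe.cpp - illustrative harness only; build with g++ on x86-64.
#include <cstdint>
#include <cstdio>

static void cpuid(uint32_t leaf, uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d)
{
    asm volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(leaf));
}

int main()
{
    uint32_t eax, ebx, ecx, edx;
    cpuid(1, eax, ebx, ecx, edx);
    printf("SSE2   : %d\n", !!(edx & (1u << 26)));
    printf("SSE3   : %d\n", !!(ecx & (1u << 0)));
    printf("SSE4.1 : %d\n", !!(ecx & (1u << 19)));
    printf("SSE4.2 : %d\n", !!(ecx & (1u << 20)));

    // AVX needs the CPU bit (ECX[28]) and OS-enabled XSAVE (OSXSAVE, ECX[27]);
    // XCR0 bits 1-2 then confirm that XMM and YMM state are saved on context switch.
    int avx = ((ecx & 0x18000000u) == 0x18000000u);
    if (avx) {
        uint32_t xlo, xhi;
        asm volatile(".byte 0x0f, 0x01, 0xd0" : "=a"(xlo), "=d"(xhi) : "c"(0));
        avx = ((xlo & 0x6u) == 0x6u);
    }
    printf("AVX    : %d\n", avx);
    return 0;
}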

src/utils/asm.h

Lines changed: 16 additions & 0 deletions
@@ -34,6 +34,8 @@
 #ifndef ASM_H_
 #define ASM_H_
 
+#include "utils/compiler.h"
+
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
@@ -54,4 +56,18 @@ typedef atomic_int atomic_t;
 #error No architecture specific memory barrier definitions found!
 #endif
 
+#ifndef __vma_memory_copy64
+#define memory_copy64(dst, src) \
+    *dst++ = *src++; \
+    *dst++ = *src++; \
+    *dst++ = *src++; \
+    *dst++ = *src++; \
+    *dst++ = *src++; \
+    *dst++ = *src++; \
+    *dst++ = *src++; \
+    *dst++ = *src++
+#else
+#define memory_copy64 __vma_memory_copy64
+#endif /* atomic_load_explicit */
+
 #endif
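
The shim's idea: an architecture header may provide __vma_memory_copy64 (after this commit, only asm-x86.h does), and everyone else falls back to a plain eight-quadword copy. A self-contained sketch of the same override-or-fallback pattern, simplified from the header (loop form and test code are illustrative, not from the commit):

// Simplified illustration of the asm.h pattern: use the arch-specific
// macro when present, otherwise a portable 64-byte copy. Both variants
// advance dst and src by eight uint64_t elements (64 bytes).
#include <cstdint>

#ifndef __vma_memory_copy64
#define memory_copy64(dst, src) \
    do { \
        for (int i_ = 0; i_ < 8; ++i_) \
            *dst++ = *src++; \
    } while (0)
#else
#define memory_copy64 __vma_memory_copy64
#endif

int main()
{
    uint64_t a[16] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint64_t b[16] = {0};
    uint64_t *src = a, *dst = b;
    memory_copy64(dst, src); // copies one 64-byte block, advances both pointers
    return b[7] == 8 ? 0 : 1;
}

Note that the fallback macro in the commit expands to eight bare statements without a do { } while (0) wrapper, so it relies on callers always bracing the call site, as ring_doorbell() in qp_mgr_eth_mlx5.cpp does.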

src/utils/atomic.h

Lines changed: 31 additions & 27 deletions
@@ -36,29 +36,33 @@
 
 #include "asm.h"
 
+#if defined(__clang__) && __has_builtin(__atomic_load_n) \
+    && __has_builtin(__atomic_store_n) \
+    && __has_builtin(__atomic_add_fetch) \
+    && __has_builtin(__atomic_exchange_n) \
+    && __has_builtin(__atomic_compare_exchange_n) \
+    && defined(__ATOMIC_RELAXED) \
+    && defined(__ATOMIC_CONSUME) \
+    && defined(__ATOMIC_ACQUIRE) \
+    && defined(__ATOMIC_RELEASE) \
+    && defined(__ATOMIC_ACQ_REL) \
+    && defined(__ATOMIC_SEQ_CST)
+#define USE_BUILTIN_ATOMIC
+#elif defined(__GNUC__) && \
+    ((__GNUC__ >= 5) || (__GNUC__ >= 4 && __GNUC_MINOR__ >= 7))
+#define USE_BUILTIN_ATOMIC
+#else
+#define __ATOMIC_RELAXED 0
+#define __ATOMIC_CONSUME 1
+#define __ATOMIC_ACQUIRE 2
+#define __ATOMIC_RELEASE 3
+#define __ATOMIC_ACQ_REL 4
+#define __ATOMIC_SEQ_CST 5
+#endif
 
 /*
  * C++11 memory model
  */
-#ifndef __ATOMIC_RELAXED
-#define __ATOMIC_RELAXED 0
-#endif
-#ifndef __ATOMIC_CONSUME
-#define __ATOMIC_CONSUME 1
-#endif
-#ifndef __ATOMIC_ACQUIRE
-#define __ATOMIC_ACQUIRE 2
-#endif
-#ifndef __ATOMIC_RELEASE
-#define __ATOMIC_RELEASE 3
-#endif
-#ifndef __ATOMIC_ACQ_REL
-#define __ATOMIC_ACQ_REL 4
-#endif
-#ifndef __ATOMIC_SEQ_CST
-#define __ATOMIC_SEQ_CST 5
-#endif
-
 enum memory_order {
     /* memory_order_relaxed:
      * Only atomicity is provided there are no constraints on reordering of memory
@@ -84,11 +88,11 @@ enum memory_order {
 #define ATOMIC_INIT(i) { (i) }
 
 #ifndef __vma_atomic_fetch_add_explicit
-#if defined(__ATOMIC_RELAXED)
-#define atomic_fetch_add_explicit(_obj, _operand, _order) \
+#if defined(USE_BUILTIN_ATOMIC)
+#define atomic_fetch_add_explicit(_obj, _operand, _order) \
     __atomic_fetch_add(&(obj)->value, _operand, _order)
 #elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
-#define atomic_fetch_add_explicit(_obj, _order) \
+#define atomic_fetch_add_explicit(_obj, _order) \
    __sync_fetch_and_add(&(_obj)->value, _operand)
 #else
 #error "atomic_fetch_add_explicit() is not supported"
@@ -105,7 +109,7 @@ enum memory_order {
  * @param _order memory order.
  */
 #ifndef __vma_atomic_store_explicit
-#if defined(__ATOMIC_RELAXED)
+#if defined(USE_BUILTIN_ATOMIC)
 #define atomic_store_explicit(_obj, _val, _order) \
     __atomic_store_n(&(_obj)->value, (_val), (_order))
 #elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
@@ -130,12 +134,12 @@ enum memory_order {
  * @return Value before add.
  */
 #ifndef __vma_atomic_load_explicit
-#if defined(__ATOMIC_RELAXED)
-#define atomic_load_explicit(_obj, _order) \
+#if defined(USE_BUILTIN_ATOMIC)
+#define atomic_load_explicit(_obj, _order) \
     __atomic_load_n(&(_obj)->value, (_order))
 #elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
-#define atomic_load_explicit(_obj, _order) \
-    __sync_fetch_and_add(&(object)->value, 0)
+#define atomic_load_explicit(_obj, _order) \
+    __sync_fetch_and_add(&(_obj)->value, 0)
 #else
 #error "atomic_load_explicit() is not supported"
 #endif
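
The header's selection strategy, reduced to a self-contained sketch (GCC or Clang assumed; the demo_load name is illustrative, not from VMA): prefer the C++11-style __atomic builtins when the compiler provides them, otherwise emulate an atomic load with the legacy __sync builtin, where adding zero returns the current value at the cost of a full barrier.

#include <cstdio>

// Illustrative reduction of the atomic.h strategy: modern __atomic
// builtins when available, legacy __sync builtins otherwise.
#if defined(__ATOMIC_RELAXED)
#define demo_load(ptr) __atomic_load_n((ptr), __ATOMIC_RELAXED)
#else
// Add-of-zero returns the previous value; heavier, since __sync
// builtins imply a full memory barrier.
#define demo_load(ptr) __sync_fetch_and_add((ptr), 0)
#endif

int main()
{
    int counter = 42;
    printf("%d\n", demo_load(&counter)); // prints 42
    return 0;
}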

src/vma/dev/qp_mgr_eth_mlx5.cpp

Lines changed: 3 additions & 2 deletions
@@ -34,6 +34,7 @@
 #if defined(DEFINED_DIRECT_VERBS)
 
 #include <sys/mman.h>
+#include "utils/asm.h"
 #include "cq_mgr_mlx5.h"
 #include "vma/util/utils.h"
 #include "vlogger/vlogger.h"
@@ -322,11 +323,11 @@ inline void qp_mgr_eth_mlx5::ring_doorbell(uint64_t* wqe, int num_wqebb, int num
      * which do not guarantee order of copying.
      */
     while (num_wqebb--) {
-        COPY_64B_NT(dst, src);
+        memory_copy64(dst, src);
     }
     src = (uint64_t*)m_sq_wqes;
     while (num_wqebb_top--) {
-        COPY_64B_NT(dst, src);
+        memory_copy64(dst, src);
     }
 } else {
     *dst = *src;
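
For context, a sketch (function name and signature hypothetical, distilled from the loops above) of the wrap-around pattern in ring_doorbell(): the send queue is a ring, so a multi-WQEBB post that crosses the end of the buffer is copied to the Blue Flame register in two runs — the tail of the ring first, then the remainder from the queue's base.

#include <cstdint>

// Portable stand-in for memory_copy64 from asm.h: copy one 64-byte
// block (eight quadwords) and advance both pointers.
static inline void copy64(uint64_t *&dst, uint64_t *&src)
{
    for (int i = 0; i < 8; ++i)
        *dst++ = *src++;
}

// Hypothetical distillation of the two-phase copy in
// qp_mgr_eth_mlx5::ring_doorbell(): blocks up to the ring's end,
// then wrap to the start of the send queue for the rest.
static void copy_wqe_wrapped(uint64_t *dst, uint64_t *src, uint64_t *sq_base,
                             int num_wqebb, int num_wqebb_top)
{
    while (num_wqebb--)
        copy64(dst, src);
    src = sq_base;
    while (num_wqebb_top--)
        copy64(dst, src);
}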

src/vma/proto/mem_buf_desc.h

Lines changed: 1 addition & 1 deletion
@@ -130,7 +130,7 @@ class mem_buf_desc_t {
     // Rx: cq_mgr owns the mem_buf_desc and the associated data buffer
     ring_slave* p_desc_owner;
 
-    inline int get_ref_count() const {return atomic_read(&n_ref_count);}
+    inline int get_ref_count() {return atomic_read(&n_ref_count);}
     inline void reset_ref_count() {atomic_set(&n_ref_count, 0);}
     inline int inc_ref_count() {return atomic_fetch_and_inc(&n_ref_count);}
    inline int dec_ref_count() {return atomic_fetch_and_dec(&n_ref_count);}
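
The const removal is presumably needed because an atomic_read() implemented over the __sync fallback above performs an add of zero, i.e. it writes to the counter, which a const member function cannot do. A minimal reproduction (types and names hypothetical, not from VMA):

// Hypothetical reproduction of why `const` was dropped: a load built on
// __sync_fetch_and_add(ptr, 0) modifies the location, so it cannot take
// a pointer into a const object.
struct counter_t { int value; };

struct buf_desc {
    counter_t n_ref_count;
    int get_ref_count() { return __sync_fetch_and_add(&n_ref_count.value, 0); }
    // With `const` on this method, &n_ref_count.value would be `const int*`,
    // which __sync_fetch_and_add cannot accept - hence the signature change.
};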
