Dense Arrayed Matrix Math (DAMM)

DAMM is a high performance numerical computation package that exposes a full SIMD optimized BLAS surface through template metaprogramming in C++

BLAS Specialization

In addition to standard matrix operators like transpose, multiply and inverse, The BLAS functions are reduced to template set-like operators:

union
fused union
reduce
fused reduce

The operators could be specialized to BLAS-named functions:

	template<typename T, typename S>
	struct it
	{
		using scalar_t = T;
		using vector_t = T*;
		using matrix_t = T**;

	/*  LEVEL 1 BLAS-like operations */
		
		/**
		 * \brief Optimized dot product: x^T * y
		 * Equivalent to: fused_reduce<T, std::multiplies<>, std::plus<>, S>
		 */
		static constexpr auto dot_product = [](vector_t x, vector_t y, const size_t N) -> T 
		{ 
			return fused_reduce<T, std::multiplies<>, std::plus<>, S>(x, y, T(0), 1, N);
		};
		
		/**
		 * \brief Vector 2-norm: ||x||_2 = sqrt(x^T * x)
		 */
		static constexpr auto norm2 = [](vector_t x, const size_t N) -> T 
		{
			T sum_squares = fused_reduce<T, std::multiplies<>, std::plus<>, S>( x, x, T(0), 1, N);
			return std::sqrt(sum_squares);
		};
		
		/**
		 * \brief AXPY operation: y = a*x + y
		 */
		static constexpr auto axpy = [](const T a, vector_t x, vector_t y, const size_t N) -> void 
		{
			scalar::fused_union<FusionPolicy::FUSION_FIRST, T, std::plus<>, std::multiplies<>, S>(y, a, x, y, 1, N);
		};
		
		/**
		 * \brief SCAL operation: x = a*x  
		 */
		static constexpr auto scal = [](const T a, vector_t x, const size_t N) -> void 
		{
			scalar::unite<T, std::multiplies<>, S>(x, a, x, 1, N);
		};
		
		/**
		 * \brief Vector sum: sum(x)
		 */
		static constexpr auto sum = [](vector_t x, const size_t N) -> T 
		{
			return reduce<T, std::plus<>, S>(x, T(0), 1, N);
		};


		
	/*  LEVEL 2 BLAS-like operations */
		
		/**
		 * \brief Matrix-vector product: y = A*x
		 */
		static constexpr auto gemv = [](matrix_t A, vector_t x, vector_t y, const size_t M, const size_t N) -> void 
		{
			auto x_mat = view_as_2D(x, N, 1);
			auto y_mat = view_as_2D(y, M, 1);
			multiply<T, S>(A, x_mat.get(), y_mat.get(), M, N, 1);
		};
		
		/**
		 * \brief Rank-1 update: A = A + alpha*x*y^T
		 */
		static constexpr auto ger = [](const T alpha, vector_t x, vector_t y, matrix_t A,
									 const size_t M, const size_t N) -> void {
			auto x_mat = view_as_2D(x, M, 1);
			auto y_mat = view_as_2D(y, 1, N);
			auto outer_prod = aligned_alloc_2D<T, S::bytes>(M, N);
			
			multiply<T, S>(x_mat.get(), y_mat.get(), outer_prod.get(), M, 1, N);
			scalar::unite<T, std::multiplies<>, S>(
				outer_prod.get(), alpha, outer_prod.get(), M, N);
			matrix::unite<T, std::plus<>, S>(
				A, outer_prod.get(), A, M, N);
		};
		
	/*  LEVEL 3 BLAS-like operations */
		
		/**
		 * \brief General matrix multiply: C = A*B
		 */
		static constexpr auto gemm = [](matrix_t A, matrix_t B, matrix_t C,
									  const size_t M, const size_t N, const size_t P) -> void {
			multiply<T, S>(A, B, C, M, N, P);
		};
		
		/**
		 * \brief Symmetric matrix multiply: C = A*A^T
		 */
		static constexpr auto syrk = [](matrix_t A, matrix_t C, const size_t M, const size_t N) -> void {
			auto A_T = aligned_alloc_2D<T, S::bytes>(N, M);
			transpose<T, S>(A, A_T.get(), M, N);
			multiply<T, S>(A, A_T.get(), C, M, N, M);
		};
		
		/**
		 * \brief Matrix transpose: B = A^T
		 */
		static constexpr auto transpose_op = [](matrix_t A, matrix_t B, 
											  const size_t M, const size_t N) -> void {
			transpose<T, S>(A, B, M, N);
		};
		
		/**
		 * \brief Matrix trace: tr(A) = sum(diag(A))
		 */
		static constexpr auto trace = [](matrix_t A, const size_t N) -> T {
			T result = T(0);
			for (size_t i = 0; i < N; ++i)
				result += A[i][i];
			return result;
		};
		
		/**
		 * \brief Frobenius norm: ||A||_F = sqrt(sum(A.*A))
		 */
		static constexpr auto frobenius_norm = [](matrix_t A, const size_t M, const size_t N) -> T {
			T sum_squares = fused_reduce<T, std::multiplies<>, std::plus<>, S>(
				A, A, T(0), M, N);
			return std::sqrt(sum_squares);
		};
		
		/**
		 * \brief Hadamard product: C = A .* B (element-wise multiply)
		 */
		static constexpr auto hadamard = [](matrix_t A, matrix_t B, matrix_t C,
										  const size_t M, const size_t N) -> void {
			matrix::unite<T, std::multiplies<>, S>(A, B, C, M, N);
		};
		
		/**
		 * \brief Element-wise addition: C = A + B
		 */
		static constexpr auto add = [](matrix_t A, matrix_t B, matrix_t C,
									 const size_t M, const size_t N) -> void {
			matrix::unite<T, std::plus<>, S>(A, B, C, M, N);
		};
		
		/**
		 * \brief Element-wise subtraction: C = A - B
		 */
		static constexpr auto subtract = [](matrix_t A, matrix_t B, matrix_t C,
										  const size_t M, const size_t N) -> void {
			matrix::unite<T, std::minus<>, S>(A, B, C, M, N);
		};
		
		/**
		 * \brief LU decomposition: P*A = L*U
		 */
		static constexpr auto lu_decompose = [](matrix_t A, size_t* P, const size_t N) -> bool {
			return lu::decompose<T, S>(A, P, N);
		};
		
		/**
		 * \brief QR decomposition: A = Q*R
		 */
		static constexpr auto qr_decompose = [](matrix_t A, matrix_t Q, matrix_t R,
											   const size_t M, const size_t N) -> bool {
			return qr::decompose<T, S>(A, Q, R, M, N);
		};
		
		/**
		 * \brief Matrix inversion using LU
		 */
		static constexpr auto invert_lu = [](matrix_t A, matrix_t A_inv, const size_t N) -> bool {
			return lu::inverse<T, S>(A, A_inv, N);
		};
		
		/**
		 * \brief Matrix inversion using QR
		 */
		static constexpr auto invert_qr = [](matrix_t A, matrix_t A_inv, 
										   const size_t M, const size_t N) -> bool {
			return qr::inverse<T, S>(A, A_inv, M, N);
		};
		
		/**
		 * \brief Fused multiply-add: D = (A + B) * C
		 */
		static constexpr auto fma_union = [](matrix_t A, matrix_t B, const T C, matrix_t D,
										   const size_t M, const size_t N) -> void {
			scalar::fused_union<FusionPolicy::UNION_FIRST, T, std::plus<>, std::multiplies<>, S,
							  block_size>(A, B, C, D, M, N);
		};
		
		/**
		 * \brief Fused multiply-subtract: D = A - B * C  
		 */
		static constexpr auto fms_fusion = [](matrix_t A, matrix_t B, const T C, matrix_t D,
											const size_t M, const size_t N) -> void {
			scalar::fused_union<FusionPolicy::FUSION_FIRST, T, std::minus<>, std::multiplies<>, S,
							  block_size>(A, B, C, D, M, N);
		};
		
		/**
		 * \brief Fill matrix with zeros
		 */
		static constexpr auto zero_fill = [](matrix_t A, const size_t M, const size_t N) -> void {
			zeros<T, S>(A, M, N);
		};
		
		/**
		 * \brief Fill matrix with ones
		 */
		static constexpr auto one_fill = [](matrix_t A, const size_t M, const size_t N) -> void {
			ones<T, S>(A, M, N);
		};
		
		/**
		 * \brief Set identity matrix
		 */
		static constexpr auto set_identity_matrix = [](matrix_t A, const size_t M, const size_t N) -> void {
			identity<T, S>(A, M, N);
		};
	};

SIMD Specialization

Each operator is SIMD aware and can be specialized for SSE, AVX and AVX512 hardware.

/**:
 * \brief damm-blas (dblas) environment. 
 * 
 * Aliases the damm:: functions using BLAS-like semantics.
 * Allows the environment to be easily reconfigured for different targets 
 * 
 * \usage double result = dblas<double>::dot_product(x, y, N);
 * \usage dblas<float>::gemm(A, B, C, M, N, P);
 *
 */
template<typename T> using dblas = damm::it<T, AVX512>;///< BLAS-like environment optimized for target configuration.

Parallel Scaling

DAMM is capable of parallel computation on its operational kernels with OpenMP. Compile with flags -fopenmp and -lgomp for parallelization or omit the flags to avoid parallel region overheads if operating without parallelization.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Dense Arrayed Matrix Math (DAMM)

BLAS Specialization

SIMD Specialization

Parallel Scaling

FilesExpand file tree

README.md

Latest commit

History

README.md

File metadata and controls

Dense Arrayed Matrix Math (DAMM)

BLAS Specialization

SIMD Specialization

Parallel Scaling