NVIDIA
diff --git a/‎dali/kernels/signal/dct/dct_gpu.cu
Lines changed: 19 additions & 8 deletions b/‎dali/kernels/signal/dct/dct_gpu.cu
Lines changed: 19 additions & 8 deletions
diff --git a/‎dali/kernels/signal/dct/dct_gpu.h
Lines changed: 1 addition & 1 deletion b/‎dali/kernels/signal/dct/dct_gpu.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎dali/kernels/signal/dct/dct_gpu_test.cc
Lines changed: 28 additions & 6 deletions b/‎dali/kernels/signal/dct/dct_gpu_test.cc
Lines changed: 28 additions & 6 deletions
diff --git a/‎dali/kernels/signal/dct/dct_test.h
Lines changed: 17 additions & 13 deletions b/‎dali/kernels/signal/dct/dct_test.h
Lines changed: 17 additions & 13 deletions
diff --git a/‎dali/operators/audio/mfcc/mfcc.cc
Lines changed: 30 additions & 9 deletions b/‎dali/operators/audio/mfcc/mfcc.cc
Lines changed: 30 additions & 9 deletions
@@ -31,9 +31,9 @@ namespace dct {
 
 // The kernel processes data with the shape reduced to 3D.
 // Transform is applied over the middle axis.
-template <typename OutputType, typename InputType>
+template <typename OutputType, typename InputType, bool HasLifter>
 __global__ void ApplyDct(const typename Dct1DGpu<OutputType, InputType>::SampleDesc *samples,
-                         const BlockDesc<3> *blocks)  {
+                         const BlockDesc<3> *blocks,  const float *lifter_coeffs)  {
   int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
   auto block = blocks[bid];
   const auto &sample = samples[block.sample_idx];
@@ -43,6 +43,7 @@ __global__ void ApplyDct(const typename Dct1DGpu<OutputType, InputType>::SampleD
   for (int z = block.start.z + threadIdx.z; z < block.end.z; z += blockDim.z) {
     for (int y = block.start.y + threadIdx.y; y < block.end.y; y += blockDim.y) {
       const OutputType *cos_row = sample.cos_table + sample.input_length * y;
+      float coeff = HasLifter ? lifter_coeffs[y] : 1.f;
       for (int x = block.start.x + threadIdx.x; x < block.end.x; x += blockDim.x) {
         int output_idx = dot(out_stride, ivec3{z, y, x});
         const InputType *input = sample.input + dot(in_stride, ivec3{z, 0, x});
@@ -51,7 +52,7 @@ __global__ void ApplyDct(const typename Dct1DGpu<OutputType, InputType>::SampleD
           out_val += *input * cos_row[i];
           input += in_stride[1];
         }
-        sample.output[output_idx] = out_val;
+        sample.output[output_idx] = HasLifter ? out_val * coeff : out_val;
       }
     }
   }
@@ -60,8 +61,7 @@ __global__ void ApplyDct(const typename Dct1DGpu<OutputType, InputType>::SampleD
 template <typename OutputType, typename InputType>
 KernelRequirements Dct1DGpu<OutputType, InputType>::Setup(KernelContext &ctx,
                                                           const InListGPU<InputType> &in,
-                                                          span<const DctArgs> args,
-                                                          int axis) {
+                                                          span<const DctArgs> args, int axis) {
   DALI_ENFORCE(args.size() == in.num_samples());
   KernelRequirements req{};
   ScratchpadEstimator se{};
@@ -120,7 +120,7 @@ template <typename OutputType, typename InputType>
 DLL_PUBLIC void Dct1DGpu<OutputType, InputType>::Run(KernelContext &ctx,
                                                      const OutListGPU<OutputType> &out,
                                                      const InListGPU<InputType> &in,
-                                                     span<const DctArgs>, int) {
+                                                     InTensorGPU<float, 1> lifter_coeffs) {
   OutputType *cpu_cos_table[2];
   cpu_cos_table[0] =
     ctx.scratchpad->Allocate<OutputType>(AllocType::Pinned, max_cos_table_size_);
@@ -148,6 +148,10 @@ DLL_PUBLIC void Dct1DGpu<OutputType, InputType>::Run(KernelContext &ctx,
   for (auto arg : args_) {
     auto in_shape = reduce_shape(in.tensor_shape_span(s), axis_);
     auto out_shape = reduce_shape(out.tensor_shape_span(s), axis_);
+    DALI_ENFORCE(lifter_coeffs.num_elements() == 0 || out_shape[1] <= lifter_coeffs.num_elements(),
+                 make_string("Not enough lifter coefficients. NDCT for sample ", s, " is ",
+                             out_shape[1], " and only ", lifter_coeffs.num_elements(),
+                             " coefficients were passed."));
     ivec3 out_stride = GetStrides(ivec3{out_shape[0], out_shape[1], out_shape[2]});
     ivec3 in_stride = GetStrides(ivec3{in_shape[0], in_shape[1], in_shape[2]});;
     int n = in_shape[1];
@@ -162,8 +166,15 @@ DLL_PUBLIC void Dct1DGpu<OutputType, InputType>::Run(KernelContext &ctx,
     ctx.scratchpad->ToContiguousGPU(ctx.gpu.stream, sample_descs_, block_setup_.Blocks());
   dim3 grid_dim = block_setup_.GridDim();
   dim3 block_dim = block_setup_.BlockDim();
-  ApplyDct<OutputType, InputType>
-    <<<grid_dim, block_dim, 0, ctx.gpu.stream>>>(sample_descs_gpu, block_descs_gpu);
+  if (lifter_coeffs.num_elements() > 0) {
+    ApplyDct<OutputType, InputType, true>
+      <<<grid_dim, block_dim, 0, ctx.gpu.stream>>>(sample_descs_gpu, block_descs_gpu,
+                                                   lifter_coeffs.data);
+  } else {
+    ApplyDct<OutputType, InputType, false>
+      <<<grid_dim, block_dim, 0, ctx.gpu.stream>>>(sample_descs_gpu, block_descs_gpu,
+                                                   nullptr);
+  }
 }
 
 template class Dct1DGpu<float, float>;
 
@@ -82,7 +82,7 @@ class DLL_PUBLIC Dct1DGpu {
   DLL_PUBLIC void Run(KernelContext &context,
                       const OutListGPU<OutputType> &out,
                       const InListGPU<InputType> &in,
-                      span<const DctArgs> args, int axis);
+                      InTensorGPU<float, 1> lifter_coeffs);
 
  private:
   std::map<std::pair<int, DctArgs>, OutputType*> cos_tables_{};
 
@@ -19,6 +19,7 @@
 #include "dali/test/tensor_test_utils.h"
 #include "dali/test/test_tensors.h"
 #include "dali/kernels/signal/dct/dct_test.h"
+#include "dali/core/dev_buffer.h"
 
 namespace dali {
 namespace kernels {
@@ -27,13 +28,21 @@ namespace dct {
 namespace test {
 
 class Dct1DGpuTest : public ::testing::TestWithParam<
-  std::tuple<int, std::pair<int, std::vector<int>>>> {
+  std::tuple<int, float, std::pair<int, std::vector<int>>>> {
  public:
   Dct1DGpuTest()
       : batch_size_(std::get<0>(GetParam()))
-      , dims_(std::get<1>(GetParam()).first)
-      , axes_(std::get<1>(GetParam()).second)
+      , lifter_(std::get<1>(GetParam()))
+      , dims_(std::get<2>(GetParam()).first)
+      , axes_(std::get<2>(GetParam()).second)
       , in_shape_(batch_size_, dims_) {
+        if (lifter_) {
+          FillLifter();
+          lifter_coeffs_gpu_buffer.resize(max_ndct);
+          lifter_coeffs_gpu_ = make_tensor_gpu<1>(lifter_coeffs_gpu_buffer.data(), {max_ndct});
+          cudaMemcpy(lifter_coeffs_gpu_.data, lifter_coeffs_.data(),
+                    lifter_coeffs_.size() * sizeof(float), cudaMemcpyHostToDevice);
+        }
         while (args_.size() < static_cast<size_t>(batch_size_) * axes_.size()) {
           for (auto dct : dct_type) {
             for (auto norm : normalize) {
@@ -49,6 +58,13 @@ class Dct1DGpuTest : public ::testing::TestWithParam<
   ~Dct1DGpuTest() override = default;
 
  protected:
+  // Populates lifter_coeffs_ with max_ndct host-side liftering coefficients derived from
+  // lifter_. The expression is kept identical to the `coeff` computation used by the
+  // reference DCT helpers so that both sides evaluate the same values.
+  void FillLifter() {
+    lifter_coeffs_.resize(max_ndct);
+    int k = 0;
+    for (auto &coeff : lifter_coeffs_) {
+      // Coefficient index is 1-based inside the sine term, hence (k + 1)
+      coeff = 1.0 + lifter_ / 2 * std::sin(M_PI / lifter_ * (k + 1));
+      ++k;
+    }
+  }
+
   void PrepareInput() {
     std::mt19937_64 rng{12345};
     std::uniform_int_distribution<> dim_dist(1, 3);
@@ -82,17 +98,22 @@ class Dct1DGpuTest : public ::testing::TestWithParam<
   }
 
   int batch_size_;
+  float lifter_;
   int dims_;
   std::vector<int> axes_;
   TensorListShape<> in_shape_;
   TestTensorList<float> ttl_in_;
   TestTensorList<float> ttl_out_;
   std::vector<DctArgs> args_;
+  std::vector<float> lifter_coeffs_;
+  DeviceBuffer<float> lifter_coeffs_gpu_buffer;
+  OutTensorGPU<float, 1> lifter_coeffs_gpu_{};
   int args_idx_ = 0;
   span<const DctArgs> args_span_;
   const std::array<int, 4> dct_type = {{1, 2, 3, 4}};
   const std::array<bool, 2> normalize = {{false, true}};
   const std::array<int, 3> ndct = {{-1, 10, 20}};
+  const int max_ndct = 40;
 };
 
 
@@ -112,7 +133,7 @@ TEST_P(Dct1DGpuTest, DctTest) {
     ASSERT_EQ(out_shape, req.output_shapes[0]);
     ttl_out_.reshape(out_shape);
     auto out_view = ttl_out_.gpu();
-    kmgr.Run<Kernel>(0, 0, ctx, out_view, in_view, args_span_, axis);
+    kmgr.Run<Kernel>(0, 0, ctx, out_view, in_view, lifter_coeffs_gpu_);
     cudaStreamSynchronize(ctx.gpu.stream);
     auto cpu_in_view = ttl_in_.cpu();
     auto cpu_out_view = ttl_out_.cpu();
@@ -148,7 +169,7 @@ TEST_P(Dct1DGpuTest, DctTest) {
           LOG_LINE << "\n";
           int ndct = args.ndct > 0 ? args.ndct : in_shape_[s][axis];
           std::vector<float> ref(ndct, 0);
-          ReferenceDct(args.dct_type, make_span(ref), make_cspan(in_buf), args.normalize);
+          ReferenceDct(args.dct_type, make_span(ref), make_cspan(in_buf), args.normalize, lifter_);
           LOG_LINE << "DCT (type " << args.dct_type << "):";
           for (int k = 0; k < ndct; k++) {
             EXPECT_NEAR(ref[k], out[out_idx], 1e-5);
@@ -163,7 +184,8 @@ TEST_P(Dct1DGpuTest, DctTest) {
 }
 
 INSTANTIATE_TEST_SUITE_P(Dct1DGpuTest, Dct1DGpuTest, testing::Combine(
-    testing::Values(1, 6, 12),  // batch_size
+    testing::Values(1, 12),  // batch_size
+    testing::Values(0.f, 0.5f),  // lifter
     testing::Values(std::make_pair(2, std::vector<int>{1}),
                     std::make_pair(4, std::vector<int>{0, 3, 1}),
                     std::make_pair(1, std::vector<int>{0, 0}))  // dims, axes
 
@@ -24,7 +24,7 @@ namespace dct {
 namespace test {
 
 template <typename T>
-void ReferenceDctTypeI(span<T> out, span<const T> in, bool normalize) {
+void ReferenceDctTypeI(span<T> out, span<const T> in, bool normalize, float lifter) {
   int64_t in_length = in.size();
   int64_t out_length = out.size();
   double phase_mul = M_PI / (in_length - 1);
@@ -34,12 +34,13 @@ void ReferenceDctTypeI(span<T> out, span<const T> in, bool normalize) {
     for (int64_t n = 1; n < in_length - 1; n++) {
       out_val += in[n] * std::cos(phase_mul * n * k);
     }
-    out[k] = out_val;
+    float coeff = lifter ? (1.0 + lifter / 2 * std::sin(M_PI / lifter * (k + 1))) : 1.f;
+    out[k] = out_val * coeff;
   }
 }
 
 template <typename T>
-void ReferenceDctTypeII(span<T> out, span<const T> in, bool normalize) {
+void ReferenceDctTypeII(span<T> out, span<const T> in, bool normalize, float lifter) {
   int64_t in_length = in.size();
   int64_t out_length = out.size();
   double phase_mul = M_PI / in_length;
@@ -54,12 +55,13 @@ void ReferenceDctTypeII(span<T> out, span<const T> in, bool normalize) {
       out_val += in[n] * std::cos(phase_mul * (n + 0.5) * k);
     }
     double factor = (k == 0) ? factor_k_0 : factor_k_i;
-    out[k] = factor * out_val;
+    float coeff = lifter ? (1.0 + lifter / 2 * std::sin(M_PI / lifter * (k + 1))) : 1.f;
+    out[k] = factor * out_val * coeff;
   }
 }
 
 template <typename T>
-void ReferenceDctTypeIII(span<T> out, span<const T> in, bool normalize) {
+void ReferenceDctTypeIII(span<T> out, span<const T> in, bool normalize, float lifter) {
   int64_t in_length = in.size();
   int64_t out_length = out.size();
   double phase_mul = M_PI / in_length;
@@ -74,12 +76,13 @@ void ReferenceDctTypeIII(span<T> out, span<const T> in, bool normalize) {
     for (int64_t n = 1; n < in_length; n++) {
       out_val += factor_n_i * in[n] * std::cos(phase_mul * n * (k + 0.5));
     }
-    out[k] = out_val;
+    float coeff = lifter ? (1.0 + lifter / 2 * std::sin(M_PI / lifter * (k + 1))) : 1.f;
+    out[k] = out_val * coeff;
   }
 }
 
 template <typename T>
-void ReferenceDctTypeIV(span<T> out, span<const T> in, bool normalize) {
+void ReferenceDctTypeIV(span<T> out, span<const T> in, bool normalize, float lifter) {
   int64_t in_length = in.size();
   int64_t out_length = out.size();
   double phase_mul = M_PI / in_length;
@@ -89,28 +92,29 @@ void ReferenceDctTypeIV(span<T> out, span<const T> in, bool normalize) {
     for (int64_t n = 0; n < in_length; n++) {
       out_val += factor * in[n] * std::cos(phase_mul * (n + 0.5) * (k + 0.5));
     }
-    out[k] = out_val;
+    float coeff = lifter ? (1.0 + lifter / 2 * std::sin(M_PI / lifter * (k + 1))) : 1.f;
+    out[k] = out_val * coeff;
   }
 }
 
 
 template <typename T>
-void ReferenceDct(int dct_type, span<T> out, span<const T> in, bool normalize) {
+void ReferenceDct(int dct_type, span<T> out, span<const T> in, bool normalize, float lifter = 0) {
   switch (dct_type) {
     case 1:
-      ReferenceDctTypeI(out, in, normalize);
+      ReferenceDctTypeI(out, in, normalize, lifter);
       break;
 
     case 2:
-      ReferenceDctTypeII(out, in, normalize);
+      ReferenceDctTypeII(out, in, normalize, lifter);
       break;
 
     case 3:
-      ReferenceDctTypeIII(out, in, normalize);
+      ReferenceDctTypeIII(out, in, normalize, lifter);
       break;
 
     case 4:
-      ReferenceDctTypeIV(out, in, normalize);
+      ReferenceDctTypeIV(out, in, normalize, lifter);
       break;
 
     default:
 
@@ -1,4 +1,4 @@
-// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -19,8 +19,6 @@
 #include "dali/kernels/common/for_axis.h"
 #include "dali/pipeline/data/views.h"
 
-
-#define MFCC_SUPPORTED_TYPES (float)
 #define MFCC_SUPPORTED_NDIMS (2, 3, 4)
 
 static constexpr int kNumInputs = 1;
@@ -30,6 +28,28 @@ namespace dali {
 
 namespace detail {
 
+// CPU specialization: makes sure coeffs_ holds at least `target_length` entries for the
+// requested `lifter` value. The stream parameter is unnamed and not used here.
+//   target_length - minimum number of coefficients required after the call
+//   lifter        - liftering parameter; 0 disables liftering (nothing is computed)
+template <>
+DLL_PUBLIC void LifterCoeffs<CPUBackend>::Calculate(int64_t target_length, float lifter,
+                                                    cudaStream_t)  {
+  // Coefficients stored for a different lifter value are stale - drop them and remember
+  // the new value
+  if (lifter_ != lifter) {
+    coeffs_.clear();
+    lifter_ = lifter;
+  }
+
+  // 0 means no liftering
+  if (lifter_ == 0.0f)
+    return;
+
+  // Grow the storage if needed; only the newly appended range is handed to CalculateCoeffs,
+  // entries that were already present are not recomputed
+  if (static_cast<int64_t>(coeffs_.size()) < target_length) {
+    int64_t start = coeffs_.size();
+    coeffs_.resize(target_length);
+    CalculateCoeffs(coeffs_.data() + start, start, target_length - start);
+  }
+}
+
+
 template <typename T, int Dims>
 void ApplyLifter(const kernels::OutTensorCPU<T, Dims> &inout, int axis, const T* lifter_coeffs) {
   assert(axis >= 0 && axis < Dims);
@@ -93,6 +113,7 @@ the following formula::
 template <>
 bool MFCC<CPUBackend>::SetupImpl(std::vector<OutputDesc> &output_desc,
                                  const workspace_t<CPUBackend> &ws) {
+  GetArguments(ws);
   output_desc.resize(kNumOutputs);
   const auto &input = ws.InputRef<CPUBackend>(0);
   auto &output = ws.OutputRef<CPUBackend>(0);
@@ -116,11 +137,11 @@ bool MFCC<CPUBackend>::SetupImpl(std::vector<OutputDesc> &output_desc,
       output_desc[0].shape.resize(nsamples, Dims);
       for (int i = 0; i < nsamples; i++) {
         const auto in_view = view<const T, Dims>(input[i]);
-        auto &req = kmgr_.Setup<DctKernel>(i, ctx, in_view, args_, axis_);
-        output_desc[0].shape.set_tensor_shape(i, req.output_shapes[0][0].shape);
-
-        if (in_view.shape[axis_] > max_length) {
-          max_length = in_view.shape[axis_];
+        auto &req = kmgr_.Setup<DctKernel>(i, ctx, in_view, args_[i], axis_);
+        auto out_shape = req.output_shapes[0][0];
+        output_desc[0].shape.set_tensor_shape(i, out_shape);
+        if (out_shape[axis_] > max_length) {
+          max_length = out_shape[axis_];
         }
       }
     ), DALI_FAIL(make_string("Unsupported number of dimensions ", in_shape.size())));  // NOLINT
@@ -147,7 +168,7 @@ void MFCC<CPUBackend>::RunImpl(workspace_t<CPUBackend> &ws) {
             kernels::KernelContext ctx;
             auto in_view = view<const T, Dims>(input[i]);
             auto out_view = view<T, Dims>(output[i]);
-            kmgr_.Run<DctKernel>(thread_id, i, ctx, out_view, in_view, args_, axis_);
+            kmgr_.Run<DctKernel>(thread_id, i, ctx, out_view, in_view, args_[i], axis_);
             if (lifter_ != 0.0f) {
               assert(static_cast<int64_t>(lifter_coeffs_.size()) >= out_view.shape[axis_]);
               detail::ApplyLifter(out_view, axis_, lifter_coeffs_.data());