espressif
diff --git a/‎.gitlab/ci/build.yml‎
Lines changed: 12 additions & 1 deletion b/‎.gitlab/ci/build.yml‎
Lines changed: 12 additions & 1 deletion
diff --git a/‎.gitlab/ci/rules.yml‎
Lines changed: 13 additions & 0 deletions b/‎.gitlab/ci/rules.yml‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎.gitlab/ci/target_test.yml‎
Lines changed: 20 additions & 0 deletions b/‎.gitlab/ci/target_test.yml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎esp-dl/CMakeLists.txt‎
Lines changed: 6 additions & 1 deletion b/‎esp-dl/CMakeLists.txt‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎esp-dl/README.md‎
Lines changed: 65 additions & 1 deletion b/‎esp-dl/README.md‎
Lines changed: 65 additions & 1 deletion
diff --git a/‎esp-dl/audio/README.md‎
Lines changed: 107 additions & 0 deletions b/‎esp-dl/audio/README.md‎
Lines changed: 107 additions & 0 deletions
@@ -280,4 +280,15 @@ build_dl_fft:
       - IMAGE: [espressif/idf:release-v5.3, espressif/idf:release-v5.5]
         TARGET: [esp32p4, esp32s3, esp32c3, esp32]
   variables:
-    EXAMPLE_DIR: test_apps/dl_fft
+    EXAMPLE_DIR: test_apps/dl_fft
+
+build_dl_audio:
+  extends:
+    - .build_examples_template
+    - .rules:build:test_dl_audio
+  parallel:
+    matrix:
+      - IMAGE: [espressif/idf:release-v5.3, espressif/idf:release-v5.5]
+        TARGET: [esp32p4, esp32s3, esp32]
+  variables:
+    EXAMPLE_DIR: test_apps/dl_audio
@@ -101,6 +101,8 @@
 .patterns-test_dl_fft: &patterns-test_dl_fft
   - "tools/dl_fft/**/*"
 
+.patterns-test_dl_audio: &patterns-test_dl_audio
+  - "esp-dl/audio/**/*"
 
 ##############
 # if anchors #
@@ -302,6 +304,17 @@
     - <<: *if-dev-push
       changes: *patterns-gitlab-ci
 
+.rules:build:test_dl_audio:
+  rules:
+    - <<: *if-protected
+    - <<: *if-label-build
+    - <<: *if-dev-push
+      changes: *patterns-test_dl_fft
+    - <<: *if-dev-push
+      changes: *patterns-test_dl_audio
+    - <<: *if-dev-push
+      changes: *patterns-gitlab-ci
+
 .rules:pre_check:readme:
   rules:
     - <<: *if-protected
 
@@ -151,3 +151,23 @@ test_dl_image:
     TEST_FOLDER: 'test_apps/dl_image'
     TEST_TARGET: ${IDF_TARGET}
     TEST_ENV: ${IDF_TARGET}
+
+test_dl_audio:
+  extends:
+    - .pytest_api_template
+    - .rules:build:test_dl_audio
+  needs:
+    - job: "build_dl_audio"
+      artifacts: true
+      optional: true
+  parallel:
+    matrix:
+      - IDF_TARGET: [esp32p4, esp32s3, esp32]
+        IDF_VERSION: "5.3"
+  tags:
+    - ${IDF_TARGET}
+  image: $DOCKER_TARGET_TEST_v5_3_ENV_IMAGE
+  variables:
+    TEST_FOLDER: 'test_apps/dl_audio'
+    TEST_TARGET: ${IDF_TARGET}
+    TEST_ENV: ${IDF_TARGET}
@@ -11,6 +11,8 @@ set(src_dirs        ./dl/tool/src
                     ./vision/image
                     ./vision/recognition
                     ./vision/classification
+                    ./audio/common
+                    ./audio/speech_features
                     )
 
 set(include_dirs    ./dl
@@ -26,6 +28,8 @@ set(include_dirs    ./dl
                     ./vision/image
                     ./vision/recognition
                     ./vision/classification
+                    ./audio/common
+                    ./audio/speech_features
                     )
 
 if(CONFIG_IDF_TARGET_ESP32)
@@ -47,7 +51,8 @@ elseif(CONFIG_IDF_TARGET_ESP32P4)
     list(APPEND src_dirs        dl/base/isa/esp32p4)
 endif()
 
-set(requires        esp_mm
+set(requires        dl_fft
+                    esp_mm
                     esp_new_jpeg
                     esp_driver_jpeg
                     esp_driver_ppa
 
@@ -1,6 +1,6 @@
 # ESP-DL
 
-ESP-DL is designed to maintain optimal performance while significantly reducing the workload in model deployment. Our project has achieved the following key features:
+ESP-DL is a lightweight and efficient neural network inference framework specifically designed for ESP series chips (ESP32, ESP32-S3, ESP32-P4). It is built to maintain optimal performance while significantly reducing the workload in model deployment. Our project has achieved the following key features:
 
 ### ESP-DL Standard Model Format
 
@@ -29,4 +29,68 @@ The automatic dual-core scheduling enables computationally intensive operators t
 
 ---
 
+## Project Structure
+
+The ESP-DL project is organized to provide a clear separation of concerns for different functionalities. Here's a breakdown of the main directories and their purposes to help beginners get started quickly:
+
+```
+esp-dl/
+├── dl/                  # Core deep learning library
+│   ├── base/            # Fundamental data types and utilities
+│   ├── tensor/          # TensorBase class for data handling
+│   ├── model/           # Model class for loading, building, and running neural networks
+│   ├── module/          # Base Module class for operators/layers
+│   ├── math/            # Mathematical functions and operations
+│   ├── tool/            # Utility tools for the framework
+│   ├── dl_define.hpp    # Global definitions, quantization and activation types
+│   └── dl_define_private.hpp # Private definitions
+├── fbs_loader/          # FlatBuffers model loading functionality
+│   ├── include/         # Header files for the loader
+│   ├── src/             # Source files for the loader
+│   ├── lib/             # Pre-compiled FlatBuffers model library
+│   └── pack_espdl_models.py # Script to pack multiple models
+├── audio/               # Audio processing module
+│   ├── common/          # Common audio processing utilities (WAV decoding, etc.)
+│   ├── speech_features/ # Speech feature extraction (Fbank, MFCC, Spectrogram)
+│   └── README.md        # Detailed documentation for audio processing
+├── vision/              # Vision processing module
+│   ├── image/           # Image processing utilities (JPEG, BMP, drawing, preprocessing)
+│   ├── detect/          # Object detection post-processors (YOLO, etc.)
+│   ├── classification/  # Image classification post-processors (ImageNet, etc.)
+│   └── recognition/     # Face recognition components
+├── CMakeLists.txt       # CMake build configuration for the ESP-IDF component
+├── idf_component.yml    # ESP-IDF component manifest
+├── LICENSE              # Project license information
+└── README.md            # This file
+```
+
+### Core Components (`dl/`)
+
+This is the heart of the ESP-DL framework. It contains the fundamental classes and functions required for neural network inference.
+
+- `base/`: Contains basic utilities and low-level operations.
+- `tensor/`: Defines the `TensorBase` class, which is used throughout the framework to represent data.
+- `model/`: Contains the `Model` class, which handles loading `.espdl` files, building an execution plan, and running inference.
+- `module/`: Defines the `Module` base class, from which all neural network operators (like Conv2D, Pool2D) are derived.
+- `math/`: Provides optimized mathematical functions used by operators.
+- `tool/`: Offers various utility functions for the framework.
+- `dl_define.hpp`: Central place for global definitions like quantization and activation types.
+
+### FlatBuffers Loader (`fbs_loader/`)
+
+This component is responsible for loading models stored in the `.espdl` format, which is based on FlatBuffers.
+
+### Audio Processing (`audio/`)
+
+This module provides functionalities for audio signal processing, particularly focused on speech feature extraction. It includes utilities for WAV decoding and extracting features like Fbank, MFCC, and Spectrogram, optimized for ESP platforms.
+
+### Vision Processing (`vision/`)
+
+This module provides functionalities for computer vision tasks.
+
+- `image/`: Utilities for image loading (JPEG, BMP), preprocessing, color space conversion, and drawing.
+- `detect/`: Post-processors for object detection models (e.g., YOLO variants).
+- `classification/`: Post-processors for image classification models (e.g., ImageNet classifiers).
+- `recognition/`: Components for face recognition tasks.
+
 Explore ESP-DL to streamline your AI model deployment and achieve optimal performance with minimal resource usage.
@@ -0,0 +1,107 @@
+# ESP-DL Audio Processing Module
+
+The ESP-DL Audio Processing Module is a C++ library designed for audio signal processing, particularly focused on speech feature extraction. It provides implementations of common audio processing algorithms optimized for ESP platforms.
+
+## Features
+
+- WAV file decoding
+- Speech feature extraction, including:
+  - Fbank (Filter Bank) features
+  - MFCC (Mel-Frequency Cepstral Coefficients)
+  - Spectrogram
+- Support for various window functions (Hanning, Hamming, Povey, etc.)
+- Configurable parameters for feature extraction
+- Optimized for ESP platforms with memory allocation capabilities
+- Aligned with Kaldi's implementation [(torchaudio.compliance.kaldi)](https://docs.pytorch.org/audio/stable/compliance.kaldi.html).
+
+## Directory Structure
+
+```
+audio/
+├── common/              # Common audio processing utilities
+│   ├── dl_audio_common.cpp/hpp  # Common audio functions and definitions
+│   └── dl_audio_wav.cpp/hpp     # WAV file decoding utilities
+└── speech_features/     # Speech feature extraction algorithms
+    ├── dl_fbank.cpp/hpp         # Fbank (Filter Bank) feature extraction
+    ├── dl_mfcc.cpp/hpp          # MFCC (Mel-Frequency Cepstral Coefficients)
+    ├── dl_spectrogram.cpp/hpp   # Spectrogram feature extraction
+    └── dl_speech_features.cpp/hpp # Base class for speech features
+```
+
+## Common Audio Utilities
+
+### WAV Decoding
+The module provides functionality to decode WAV audio files into raw PCM data.
+
+```cpp
+#include "dl_audio_wav.hpp"
+
+dl::audio::dl_audio_t *audio = dl::audio::decode_wav(wav_data, data_len);
+```
+
+### Audio Common Functions
+Provides common audio processing functions such as:
+- Window function generation (Hanning, Hamming, Blackman, etc.)
+- Mel filterbank initialization
+- Pre-emphasis filtering
+- FFT-related operations
+
+## Speech Feature Extraction
+
+All speech feature extraction classes inherit from the `SpeechFeatureBase` class, which provides a common interface.
+
+### Configuration
+Speech feature extraction can be configured using the `SpeechFeatureConfig` structure:
+
+```cpp
+dl::audio::SpeechFeatureConfig config;
+config.sample_rate = 16000;
+config.frame_length = 25;  // ms
+config.frame_shift = 10;   // ms
+config.num_mel_bins = 26;
+config.window_type = dl::audio::WinType::HANNING;
+```
+
+### Fbank (Filter Bank)
+Extracts filter bank features from audio signals.
+
+```cpp
+#include "dl_fbank.hpp"
+
+dl::audio::Fbank fbank(config);
+// Process audio data
+std::vector<int> shape = fbank.get_output_shape(audio_length);
+float *output_features = (float*) malloc(shape[0] * shape[1] * sizeof(float));
+fbank.process(audio_data, audio_length, output_features);
+```
+
+### MFCC (Mel-Frequency Cepstral Coefficients)
+Extracts MFCC features, which are commonly used in speech recognition.
+
+```cpp
+#include "dl_mfcc.hpp"
+
+dl::audio::MFCC mfcc(config);
+// Process audio data
+std::vector<int> shape = mfcc.get_output_shape(audio_length);
+float *output_features = (float*) malloc(shape[0] * shape[1] * sizeof(float));
+mfcc.process(audio_data, audio_length, output_features);
+```
+
+### Spectrogram
+Computes spectrogram features aligned with torchaudio.compliance.kaldi.spectrogram.
+
+```cpp
+#include "dl_spectrogram.hpp"
+
+dl::audio::Spectrogram spectrogram(config);
+// Process audio data
+std::vector<int> shape = spectrogram.get_output_shape(audio_length);
+float *output_features = (float*) malloc(shape[0] * shape[1] * sizeof(float));
+spectrogram.process(audio_data, audio_length, output_features);
+```
+
+
+## License
+
+MIT License