diff --git a/.gitignore b/.gitignore index 2cf8e138..60af5560 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ flatpak/build-dir flatpak/repo flatpak/.flatpak-builder *.user +models/*.bin +models/*.param +logs diff --git a/io.github.saeugetier.photobooth.json b/io.github.saeugetier.photobooth.json index fa0d4d86..cdc3d596 100644 --- a/io.github.saeugetier.photobooth.json +++ b/io.github.saeugetier.photobooth.json @@ -45,7 +45,7 @@ ], "cleanup": [ "/bin/*", - "/share/opencv4/*" + "/share/*" ] }, { @@ -75,6 +75,25 @@ } ] }, + { + "name": "ncnn", + "buildsystem": "cmake-ninja", + "builddir": true, + "config-opts": [ + "-DNCNN_BUILD_TESTS=OFF", + "-DNCNN_BUILD_EXAMPLES=OFF", + "-DNCNN_BUILD_TOOLS=OFF", + "-DNCNN_BUILD_BENCHMARK=OFF", + "-DNCNN_SHARED_LIB=ON" + ], + "sources": [ + { + "type": "archive", + "url": "https://github.com/Tencent/ncnn/archive/refs/tags/20250503.tar.gz", + "sha256": "3afea4cf092ce97d06305b72c6affbcfb3530f536ae8e81a4f22007d82b729e9" + } + ] + }, { "name": "qtvirtualkeyboard", "buildsystem": "cmake-ninja", @@ -92,6 +111,38 @@ "rmdir ${FLATPAK_DEST}/lib/${FLATPAK_ARCH}-linux-gnu" ] }, + { + "name": "yolo-models", + "buildsystem": "simple", + "build-options": { + "build-args": [ + "--share=network" + ] + }, + "build-commands": [ + "python3 -m venv yolo-venv", + ". yolo-venv/bin/activate && pip install --no-cache-dir numpy==1.26.4", + ". yolo-venv/bin/activate && pip install --no-cache-dir torch==2.4.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu", + ". yolo-venv/bin/activate && pip install --no-cache-dir ultralytics==8.3.168 onnxruntime onnx==1.16.1 onnxslim==0.1.59 ncnn==1.0.20250503 rknn-toolkit2==2.3.2", + ". yolo-venv/bin/activate && yolo export model=yolo11n-seg.pt format=onnx", + ". yolo-venv/bin/activate && yolo export model=yolo11x-seg.pt format=onnx", + ". yolo-venv/bin/activate && yolo export model=yolo11n-seg.pt format=ncnn", + ". 
yolo-venv/bin/activate && yolo export model=yolo11x-seg.pt format=ncnn", + "rm -rf yolo-venv" + ], + "post-install": [ + "rm *.pt", + "rm *.torchscript", + "mkdir ${FLATPAK_DEST}/share/models", + "mv * ${FLATPAK_DEST}/share/models" + ], + "sources": [ + { + "type": "file", + "path": "models/coco.names" + } + ] + }, { "name": "qtbooth", "buildsystem": "qmake", diff --git a/io.github.saeugetier.photobooth.metainfo.xml b/io.github.saeugetier.photobooth.metainfo.xml index 47ef54fc..40a7364e 100644 --- a/io.github.saeugetier.photobooth.metainfo.xml +++ b/io.github.saeugetier.photobooth.metainfo.xml @@ -36,6 +36,11 @@ + + +

Added NCNN as a secondary neural network runtime for accelerated execution on Raspberry Pi. Several bug fixes for a better user experience with the Flatpak build.

+
+

Initial release with flatpak. New implementation of background removal filter. Port from Qt5 to Qt6.

diff --git a/models/scripts/download_export_models.py b/models/scripts/download_export_models.py new file mode 100644 index 00000000..d2978fbc --- /dev/null +++ b/models/scripts/download_export_models.py @@ -0,0 +1,13 @@ +from ultralytics import YOLO + +# Load a model +model_n = YOLO("yolo11n-seg.pt") # load an official model +model_x = YOLO("yolo11x-seg.pt") # load an official model + +# Export the model +model_n.export(format="onnx") +model_n.export(format="ncnn") + +model_x.export(format="onnx") +model_x.export(format="ncnn") + diff --git a/qml/Application.qml b/qml/Application.qml index 2385ca33..4924433e 100644 --- a/qml/Application.qml +++ b/qml/Application.qml @@ -153,6 +153,12 @@ ApplicationWindow { console.log("Camera orientation changed to: " + applicationSettings.cameraOrientation) } + settingsMenu.comboBoxNeuralNetworkRuntime.onCurrentValueChanged: + { + applicationSettings.neuralNetworkRuntime = String(settingsMenu.comboBoxNeuralNetworkRuntime.currentValue) + console.log("Neural network runtime changed to: " + applicationSettings.neuralNetworkRuntime) + } + mainMenu.printerBusy: printer ? 
printer.busy : false } @@ -174,6 +180,7 @@ ApplicationWindow { property bool printFromGallery: true property bool enableSettingsPassword: true property int cameraOrientation: 0 + property string neuralNetworkRuntime: "ONNX" Component.onCompleted: { @@ -190,6 +197,7 @@ ApplicationWindow { flow.collageMenu.multiplePrints = multiplePrints flow.snapshotMenu.hideSnapshotSettingsPane = disableSnapshotSettingsPane flow.imagePreview.effectButton.visible = !disableEffectPopup + flow.settingsMenu.comboBoxNeuralNetworkRuntime.currentIndex = flow.settingsMenu.comboBoxNeuralNetworkRuntime.indexOfValue(neuralNetworkRuntime) } onPrinterNameChanged: diff --git a/qml/SettingsMenuForm.ui.qml b/qml/SettingsMenuForm.ui.qml index 21911a9e..7d610300 100644 --- a/qml/SettingsMenuForm.ui.qml +++ b/qml/SettingsMenuForm.ui.qml @@ -27,6 +27,7 @@ Item { property alias switchEnableSettingsPassword: switchEnableSettingsPassword property alias versionText: labelVersionText.text property alias comboBoxCameraOrientation: comboBoxCameraOrientation + property alias comboBoxNeuralNetworkRuntime: comboBoxNeuralNetworkRuntime ColumnLayout { anchors.fill: parent @@ -191,6 +192,27 @@ Item { }] } } + + RowLayout + { + spacing: 10 + Label + { + text: qsTr("Neural Network Runtime") + } + Item + { + Layout.fillWidth: true + } + ComboBox + { + id: comboBoxNeuralNetworkRuntime + textRole: "text" + valueRole: "value" + model: [{text: "ONNX Runtime", value: "ONNX"}, {text: "NCNN Runtime", value: "NCNN"}] + Layout.preferredWidth: 200 + } + } } } diff --git a/qml/SnapshotMenu.qml b/qml/SnapshotMenu.qml index d4a675cc..bf506bb1 100644 --- a/qml/SnapshotMenu.qml +++ b/qml/SnapshotMenu.qml @@ -126,6 +126,7 @@ SnapshotMenuForm { cameraRenderer.backgroundFilter.method: snapshotSettings.backgroundFilterEnabled ? (snapshotSettings.chromaKeyEnabled ? 
"Chroma" : "Neural") : "None" cameraRenderer.backgroundFilterEnabled: snapshotSettings.backgroundFilterEnabled cameraRenderer.backgroundFilter.keyColor: snapshotSettings.chromaKeyColor + cameraRenderer.backgroundFilter.neuralNetworkRuntime: applicationSettings.neuralNetworkRuntime cameraRenderer.backgroundImage: snapshotSettings.backgroundImage SequentialAnimation diff --git a/qtbooth.pro b/qtbooth.pro index ffd4cc23..62b43b70 100644 --- a/qtbooth.pro +++ b/qtbooth.pro @@ -21,15 +21,15 @@ SOURCES += src/collageiconmodel.cpp \ src/noprinter.cpp \ src/printerfactory.cpp \ src/replacebackgroundvideofilter.cpp \ + src/segmentation.cpp \ src/selphyprinter.cpp \ src/standardprinter.cpp \ src/system.cpp \ src/translationhelper.cpp \ - src/yolo11seg.cpp + src/yolo11segncnn.cpp \ + src/yolo11segonnx.cpp -RESOURCES += qml.qrc \ - yolomodel.large.qrc \ - yolomodel.small.qrc +RESOURCES += qml.qrc # Additional import path used to resolve QML modules in Qt Creator's code model QML_IMPORT_PATH = @@ -52,6 +52,7 @@ DISTFILES += \ INCLUDEPATH += src/ \ libs/onnxruntime/include/ \ + libs/ncnn/include/ \ HEADERS += \ src/abstractprinter.h \ @@ -67,11 +68,14 @@ HEADERS += \ src/noprinter.h \ src/printerfactory.h \ src/replacebackgroundvideofilter.h \ + src/segmentation.h \ src/selphyprinter.h \ src/standardprinter.h \ src/system.h \ src/translationhelper.h \ - src/yolo11seg.h + src/yolo11segncnn.h \ + src/yolo11segonnx.h \ + src/yolobackend.h contains(ANDROID_TARGET_ARCH,x86) { ANDROID_PACKAGE_SOURCE_DIR = \ @@ -81,6 +85,7 @@ contains(ANDROID_TARGET_ARCH,x86) { DEFINES += GIT_CURRENT_SHA1="$(shell git -C \""$$_PRO_FILE_PWD_"\" describe)" LIBS += -L"$$PWD/libs/onnxruntime/lib" -lonnxruntime +LIBS += -L"$$PWD/libs/ncnn/lib" -lncnn !isEmpty(PREFIX) { INSTALLS += target diff --git a/src/main.cpp b/src/main.cpp index 87cbdccb..16072d5d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -111,6 +111,9 @@ int main(int argc, char *argv[]) if 
(QFontDatabase::addApplicationFont(":/font/DejaVuSerif/DejaVuSerif.ttf") == -1) qWarning() << "Failed to load DejaVuSerif.ttf"; + qDebug() << "Standard path: " << QStandardPaths::standardLocations(QStandardPaths::AppDataLocation); + qDebug() << "Standard path: " << QStandardPaths::standardLocations(QStandardPaths::GenericDataLocation); + qmlRegisterType("CollageModel", 1, 0, "CollageModelFactory"); qmlRegisterUncreatableType("CollageModel", 1, 0, "CollageIconModel", "CollageIconModel can only be created via CollageModeFactory"); qmlRegisterUncreatableType("CollageModel", 1, 0, "CollageImageModel", "CollageImageModel can only be created via CollageModeFactory"); diff --git a/src/replacebackgroundvideofilter.cpp b/src/replacebackgroundvideofilter.cpp index 59a97ddb..38ff129c 100644 --- a/src/replacebackgroundvideofilter.cpp +++ b/src/replacebackgroundvideofilter.cpp @@ -8,7 +8,7 @@ #include ReplaceBackgroundVideoFilter::ReplaceBackgroundVideoFilter(QObject *parent) : QVideoFrameInput(parent), - mBackgroundImage(320, 240, CV_8UC3, cv::Scalar(0, 0, 0)), mRunable(new ReplaceBackgroundFilterRunable(this)) + mBackgroundImage(320, 240, CV_8UC3, cv::Scalar(0, 0, 0)), mRunable(new ReplaceBackgroundFilterRunable(this)) { mRunable->moveToThread(&mWorkerThread); @@ -16,6 +16,7 @@ ReplaceBackgroundVideoFilter::ReplaceBackgroundVideoFilter(QObject *parent) : QV connect(this, &ReplaceBackgroundVideoFilter::asyncProcessFrame, mRunable, &ReplaceBackgroundFilterRunable::run); connect(mRunable, &ReplaceBackgroundFilterRunable::processingFinished, this, &ReplaceBackgroundVideoFilter::onProcessingFinished); connect(mRunable, &ReplaceBackgroundFilterRunable::imageFileSaved, this, &ReplaceBackgroundVideoFilter::onImageSaved); + connect(this, &ReplaceBackgroundVideoFilter::changeNeuralNetworkRuntime, mRunable, &ReplaceBackgroundFilterRunable::changeNeuralNetworkRuntime); mWorkerThread.start(); } @@ -33,11 +34,11 @@ void ReplaceBackgroundVideoFilter::setKeyColor(float color) void 
ReplaceBackgroundVideoFilter::setMethod(QString method) { - if(method.contains("Chroma")) + if (method.contains("Chroma")) { mFilterMethod = FilterMethod::CHROMA; } - else if(method.contains("Neural")) + else if (method.contains("Neural")) { mFilterMethod = FilterMethod::NEURAL; } @@ -47,10 +48,35 @@ void ReplaceBackgroundVideoFilter::setMethod(QString method) } } -void ReplaceBackgroundVideoFilter::setBackground(QImage const& image) +void ReplaceBackgroundVideoFilter::setNeuralNetworkRuntime(QString runtime) +{ + NeuralNetworkRuntime newRuntime = mNeuralNetworkRuntime; + runtime = runtime.toUpper(); + + if (runtime.contains("ONNX")) + { + newRuntime = NeuralNetworkRuntime::ONNX; + } + else if (runtime.contains("NCNN")) + { + newRuntime = NeuralNetworkRuntime::NCNN; + } + else + { + throw std::runtime_error("Unknown neural network runtime: " + runtime.toStdString()); + } + + if (newRuntime != mNeuralNetworkRuntime) + { + mNeuralNetworkRuntime = newRuntime; + emit changeNeuralNetworkRuntime(mNeuralNetworkRuntime); + } +} + +void ReplaceBackgroundVideoFilter::setBackground(QImage const &image) { QImage img = image.convertToFormat(QImage::Format_RGB32).rgbSwapped(); - cv::Mat tmp(img.height(), img.width(), CV_8UC4, (void *) img.bits(), img.bytesPerLine()); + cv::Mat tmp(img.height(), img.width(), CV_8UC4, (void *)img.bits(), img.bytesPerLine()); mBackgroundImage = tmp.clone(); } @@ -61,11 +87,11 @@ float ReplaceBackgroundVideoFilter::getKeyColor() const QString ReplaceBackgroundVideoFilter::getMethod() const { - if(mFilterMethod == FilterMethod::CHROMA) + if (mFilterMethod == FilterMethod::CHROMA) { return QString("Chroma"); } - else if(mFilterMethod == FilterMethod::NEURAL) + else if (mFilterMethod == FilterMethod::NEURAL) { return QString("Neural"); } @@ -75,16 +101,32 @@ QString ReplaceBackgroundVideoFilter::getMethod() const } } +QString ReplaceBackgroundVideoFilter::getNeuralNetworkRuntime() const +{ + if (mNeuralNetworkRuntime == NeuralNetworkRuntime::ONNX) + { 
+ return QString("ONNX"); + } + else if (mNeuralNetworkRuntime == NeuralNetworkRuntime::NCNN) + { + return QString("NCNN"); + } + else + { + return QString("Unknown"); + } +} + void ReplaceBackgroundVideoFilter::setVideoSink(QObject *videoSink) { - mVideoSink = qobject_cast(videoSink); + mVideoSink = qobject_cast(videoSink); connect(mVideoSink, &QVideoSink::videoFrameChanged, this, &ReplaceBackgroundVideoFilter::processFrame); } -void ReplaceBackgroundVideoFilter::processCapture(const QString& capture) +void ReplaceBackgroundVideoFilter::processCapture(const QString &capture) { - if(!mCaptureProcessing) + if (!mCaptureProcessing) { mCaptureProcessing = true; emit asyncProcessFrame(capture, true, true); @@ -93,7 +135,7 @@ void ReplaceBackgroundVideoFilter::processCapture(const QString& capture) void ReplaceBackgroundVideoFilter::processFrame(const QVideoFrame &frame) { - if(!mProcessing) + if (!mProcessing) { mProcessing = true; QVariant frameVariant; @@ -102,9 +144,9 @@ void ReplaceBackgroundVideoFilter::processFrame(const QVideoFrame &frame) } } -void ReplaceBackgroundVideoFilter::onProcessingFinished(const QImage& maskImage) +void ReplaceBackgroundVideoFilter::onProcessingFinished(const QImage &maskImage) { - if(!maskImage.isNull()) + if (!maskImage.isNull()) { sendVideoFrame(QVideoFrame(maskImage)); } @@ -112,34 +154,37 @@ void ReplaceBackgroundVideoFilter::onProcessingFinished(const QImage& maskImage) mProcessing = false; } -void ReplaceBackgroundVideoFilter::onImageSaved(const QString& fileName) +void ReplaceBackgroundVideoFilter::onImageSaved(const QString &fileName) { mCaptureProcessing = false; emit captureProcessingFinished(fileName); } -ReplaceBackgroundFilterRunable::ReplaceBackgroundFilterRunable(ReplaceBackgroundVideoFilter* filter) : mFilter(filter), mYoloSegmentorFast(":/models/yolo11n-seg.onnx", ":/models/coco.names" , false), mYoloSegmentorSlow(":/models/yolo11l-seg.onnx", ":/models/coco.names" , false) 
+ReplaceBackgroundFilterRunable::ReplaceBackgroundFilterRunable(ReplaceBackgroundVideoFilter *filter) : mFilter(filter) { + mYoloSegmentorPreview.reset(new YOLOv11SegDetectorOnnx("yolo11n-seg.onnx", "coco.names", false)); + mYoloSegmentorHighRes.reset(new YOLOv11SegDetectorOnnx("yolo11x-seg.onnx", "coco.names", false)); } -void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBackground, bool highResFilter) +void ReplaceBackgroundFilterRunable::run(const QVariant &variant, bool applyBackground, bool highResFilter) { - if(variant.canConvert()) + if (variant.canConvert()) { QImage image; QVideoFrame input = variant.value(); // Supports YUV (I420 and NV12) and RGB. The GL path is readback-based and slow. - if (!input.isValid() - || (input.handleType() != QVideoFrame::HandleType::NoHandle && input.handleType() != QVideoFrame::RhiTextureHandle)) { + if (!input.isValid() || (input.handleType() != QVideoFrame::HandleType::NoHandle && input.handleType() != QVideoFrame::RhiTextureHandle)) + { qWarning("Invalid input format"); emit processingFinished(QImage()); } QVideoFrame frame = input; - if (!frame.map(QVideoFrame::ReadOnly)) { + if (!frame.map(QVideoFrame::ReadOnly)) + { qWarning() << "Failed to map QVideoFrame"; emit processingFinished(QImage()); return; @@ -147,7 +192,7 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack image = frame.toImage(); - if(ReplaceBackgroundVideoFilter::FilterMethod::NONE == mFilter->mFilterMethod) + if (ReplaceBackgroundVideoFilter::FilterMethod::NONE == mFilter->mFilterMethod) { // do not go any further. no filter selected! 
emit processingFinished(image); @@ -159,7 +204,7 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack frame.unmap(); } - else if(variant.canConvert()) + else if (variant.canConvert()) { mMat = cv::imread(variant.toString().toStdString()); } @@ -170,7 +215,7 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack ensureC3(&mMat); - switch(mFilter->mFilterMethod) + switch (mFilter->mFilterMethod) { case ReplaceBackgroundVideoFilter::FilterMethod::CHROMA: { @@ -183,7 +228,7 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack cv::Scalar lower_color = (lower_green * (1.0 - mFilter->mKeyColor)) + (lower_blue * mFilter->mKeyColor); cv::Scalar upper_color = (upper_green * (1.0 - mFilter->mKeyColor)) + (upper_blue * mFilter->mKeyColor); - if(!highResFilter) + if (!highResFilter) { mMat = chromaKeyMask(mMat, lower_color, upper_color); } @@ -197,29 +242,29 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack case ReplaceBackgroundVideoFilter::FilterMethod::NEURAL: { std::vector results; - if(!highResFilter) + if (!highResFilter) { - results = mYoloSegmentorFast.segment(mMat, 0.2f, 0.45f); + results = mYoloSegmentorPreview->segment(mMat, 0.2f, 0.45f); } else { - results = mYoloSegmentorSlow.segment(mMat, 0.2f, 0.45f); + results = mYoloSegmentorHighRes->segment(mMat, 0.2f, 0.45f); } cv::Mat mask = cv::Mat::zeros(mMat.size(), CV_8UC1); std::vector objectFilter = {0, 27, 39, 40, 41, 67}; - if(!highResFilter) + if (!highResFilter) { - mYoloSegmentorFast.drawSegmentationMask(mask, results, objectFilter); + mYoloSegmentorPreview->drawSegmentationMask(mask, results, objectFilter); } else { - mYoloSegmentorSlow.drawSegmentationMask(mask, results, objectFilter); + mYoloSegmentorHighRes->drawSegmentationMask(mask, results, objectFilter); // use a gaussian blur for the mask in order to make it less sharp // kernel size of 30 is determined by experiment - 
cv::GaussianBlur(mask, mask, cv::Size(31,31), 0, 0); + cv::GaussianBlur(mask, mask, cv::Size(31, 31), 0, 0); } std::vector channels; @@ -234,7 +279,7 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack break; } - if(applyBackground) + if (applyBackground) { prepareBackground(mFilter->mBackgroundImage, mMat.size()); // use alpha channel to add bg image to mMat. The alpha channel is converted to float in order to multiply it with the color values. @@ -244,11 +289,11 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack } // Output is an RGB video frame. - if(variant.canConvert()) + if (variant.canConvert()) { emit processingFinished(mat8ToImage(mMat)); } - else if(variant.canConvert()) + else if (variant.canConvert()) { // Save the image to a file QString fileName = variant.toString(); @@ -258,15 +303,32 @@ void ReplaceBackgroundFilterRunable::run(const QVariant& variant, bool applyBack } } +void ReplaceBackgroundFilterRunable::changeNeuralNetworkRuntime(const NeuralNetworkRuntime &runtime) +{ + if (runtime == NeuralNetworkRuntime::ONNX) + { + qDebug() << "[INFO] Change YOLOv11Segmentation runtime to ONNX"; + mYoloSegmentorPreview.reset(new YOLOv11SegDetectorOnnx("yolo11n-seg.onnx", "coco.names", false)); + mYoloSegmentorHighRes.reset(new YOLOv11SegDetectorOnnx("yolo11x-seg.onnx", "coco.names", false)); + } + else if (runtime == NeuralNetworkRuntime::NCNN) + { + qDebug() << "[INFO] Change YOLOv11Segmentation runtime to NCNN"; + mYoloSegmentorPreview.reset(new YOLOv11SegDetectorNcnn("yolo11n-seg_ncnn_model", "coco.names", false)); + mYoloSegmentorHighRes.reset(new YOLOv11SegDetectorNcnn("yolo11x-seg_ncnn_model", "coco.names", false)); + } +} + void ReplaceBackgroundFilterRunable::prepareBackground(cv::Mat &bg, cv::Size size) { cv::resize(bg, bg, size); } -void ReplaceBackgroundFilterRunable::alphaBlend(const cv::Mat &src, const cv::Mat &bg, const cv::Mat& alpha, cv::Mat &dst) +void 
ReplaceBackgroundFilterRunable::alphaBlend(const cv::Mat &src, const cv::Mat &bg, const cv::Mat &alpha, cv::Mat &dst) { // Ensure the source and background images are the same size - if (src.size() != bg.size()) { + if (src.size() != bg.size()) + { cv::resize(bg, bg, src.size()); } @@ -285,8 +347,8 @@ void ReplaceBackgroundFilterRunable::alphaBlend(const cv::Mat &src, const cv::Ma // Convert the alpha channel to float cv::Mat alpha_float(alpha.size(), CV_8UC3); std::vector channels; - channels.push_back(alpha); // Kanal 1: Grauwert - channels.push_back(alpha); // Kanal 2: Grauwert (optional duplizieren) + channels.push_back(alpha); // Kanal 1: Grauwert + channels.push_back(alpha); // Kanal 2: Grauwert (optional duplizieren) channels.push_back(alpha); cv::merge(channels, alpha_float); alpha_float.convertTo(alpha_float, CV_32FC3, 1.0 / 255.0); @@ -297,14 +359,14 @@ void ReplaceBackgroundFilterRunable::alphaBlend(const cv::Mat &src, const cv::Ma // Blend the images cv::Mat ouImage(src_float.size(), src_float.type()); cv::multiply(alpha_float, src_float, src_float); - cv::multiply(cv::Scalar::all(1.0)-alpha_float, bg_float, bg_float); + cv::multiply(cv::Scalar::all(1.0) - alpha_float, bg_float, bg_float); cv::add(src_float, bg_float, ouImage); // Convert the result back to 8-bit ouImage.convertTo(ouImage, CV_8UC3, 255.0); ouImage.copyTo(dst); } -cv::Mat ReplaceBackgroundFilterRunable::chromaKeyMask(const cv::Mat& img, const cv::Scalar& lower_color, const cv::Scalar& upper_color) +cv::Mat ReplaceBackgroundFilterRunable::chromaKeyMask(const cv::Mat &img, const cv::Scalar &lower_color, const cv::Scalar &upper_color) { // Convert the image from BGR to YUV cv::Mat hsv_img; @@ -319,9 +381,9 @@ cv::Mat ReplaceBackgroundFilterRunable::chromaKeyMask(const cv::Mat& img, const int dilation_elem = 5; int dilation_size = 5; - cv::Mat element = getStructuringElement( cv::MORPH_DILATE, - cv::Size( 2*dilation_size + 1, 2*dilation_size+1 ), - cv::Point( dilation_size, dilation_size ) 
); + cv::Mat element = getStructuringElement(cv::MORPH_DILATE, + cv::Size(2 * dilation_size + 1, 2 * dilation_size + 1), + cv::Point(dilation_size, dilation_size)); cv::dilate(mask, mask, element); @@ -335,7 +397,8 @@ cv::Mat ReplaceBackgroundFilterRunable::chromaKeyMask(const cv::Mat& img, const return result; } -cv::Mat ReplaceBackgroundFilterRunable::grabcutChromaKey(const cv::Mat& img, const cv::Scalar& lower_color, const cv::Scalar& upper_color) { +cv::Mat ReplaceBackgroundFilterRunable::grabcutChromaKey(const cv::Mat &img, const cv::Scalar &lower_color, const cv::Scalar &upper_color) +{ // Convert the image from BGR to YUV cv::Mat hsv_img; cv::Mat result; @@ -348,7 +411,7 @@ cv::Mat ReplaceBackgroundFilterRunable::grabcutChromaKey(const cv::Mat& img, con int num_foreground_pixels = countNonZero(mask); int num_background_pixels = mask.total() - num_foreground_pixels; - if(num_background_pixels != 0) + if (num_background_pixels != 0) { // Initialize GrabCut mask cv::Mat grabcut_mask = cv::Mat::zeros(img.size(), CV_8UC1); @@ -360,7 +423,7 @@ cv::Mat ReplaceBackgroundFilterRunable::grabcutChromaKey(const cv::Mat& img, con // Run GrabCut algorithm cv::Mat bgd_model, fgd_model; cv::Rect rect(1, 1, img.cols - 2, img.rows - 2); - //cv::Rect rect(0, 0, img.cols, img.rows); // Use entire image + // cv::Rect rect(0, 0, img.cols, img.rows); // Use entire image cv::grabCut(img, grabcut_mask, rect, bgd_model, fgd_model, 1, cv::GC_INIT_WITH_MASK); // Create final mask @@ -370,14 +433,14 @@ cv::Mat ReplaceBackgroundFilterRunable::grabcutChromaKey(const cv::Mat& img, con int dilation_size = 5; - cv::Mat element = getStructuringElement( cv::MORPH_DILATE, - cv::Size( 2*dilation_size + 1, 2*dilation_size+1 ), - cv::Point( dilation_size, dilation_size ) ); + cv::Mat element = getStructuringElement(cv::MORPH_DILATE, + cv::Size(2 * dilation_size + 1, 2 * dilation_size + 1), + cv::Point(dilation_size, dilation_size)); cv::dilate(final_mask, final_mask, element); // use a gaussian 
blur for the mask in order to make it less sharp - cv::GaussianBlur(final_mask, final_mask, cv::Size(31,31), 0, 0); + cv::GaussianBlur(final_mask, final_mask, cv::Size(31, 31), 0, 0); // Extract the foreground cv::Mat fg, bg; @@ -397,14 +460,15 @@ cv::Mat ReplaceBackgroundFilterRunable::grabcutChromaKey(const cv::Mat& img, con cv::Mat ReplaceBackgroundFilterRunable::imageToMat8(const QImage &image) { QImage img = image.convertToFormat(QImage::Format_RGB32).rgbSwapped(); - cv::Mat tmp(img.height(), img.width(), CV_8UC4, (void *) img.bits(), img.bytesPerLine()); + cv::Mat tmp(img.height(), img.width(), CV_8UC4, (void *)img.bits(), img.bytesPerLine()); return tmp.clone(); } void ReplaceBackgroundFilterRunable::ensureC3(cv::Mat *mat) { Q_ASSERT(mat->type() == CV_8UC3 || mat->type() == CV_8UC4); - if (mat->type() != CV_8UC3) { + if (mat->type() != CV_8UC3) + { cv::Mat tmp; cvtColor(*mat, tmp, cv::COLOR_BGRA2BGR); *mat = tmp; @@ -413,13 +477,14 @@ void ReplaceBackgroundFilterRunable::ensureC3(cv::Mat *mat) QImage ReplaceBackgroundFilterRunable::mat8ToImage(const cv::Mat &mat) { - switch (mat.type()) { + switch (mat.type()) + { case CV_8UC1: { QVector ct; for (int i = 0; i < 256; ++i) ct.append(qRgb(i, i, i)); - QImage result(mat.data, mat.cols, mat.rows, (int) mat.step, QImage::Format_Indexed8); + QImage result(mat.data, mat.cols, mat.rows, (int)mat.step, QImage::Format_Indexed8); result.setColorTable(ct); return result.copy(); } @@ -431,7 +496,7 @@ QImage ReplaceBackgroundFilterRunable::mat8ToImage(const cv::Mat &mat) } case CV_8UC4: { - QImage result(mat.data, mat.cols, mat.rows, (int) mat.step, QImage::Format_RGB32); + QImage result(mat.data, mat.cols, mat.rows, (int)mat.step, QImage::Format_RGB32); return result.rgbSwapped(); } default: @@ -445,7 +510,7 @@ cv::Mat ReplaceBackgroundFilterRunable::yuvFrameToMat8(const QVideoFrame &frame) Q_ASSERT(frame.handleType() == QVideoFrame::HandleType::NoHandle && frame.isReadable()); Q_ASSERT(frame.pixelFormat() == 
QVideoFrameFormat::Format_YUV420P || frame.pixelFormat() == QVideoFrameFormat::Format_NV12); - cv::Mat tmp(frame.height() + frame.height() / 2, frame.width(), CV_8UC1, (uchar *) frame.bits(0)); + cv::Mat tmp(frame.height() + frame.height() / 2, frame.width(), CV_8UC1, (uchar *)frame.bits(0)); cv::Mat result(frame.height(), frame.width(), CV_8UC3); cvtColor(tmp, result, frame.pixelFormat() == QVideoFrameFormat::Format_YUV420P ? cv::COLOR_YUV2BGR_YV12 : cv::COLOR_YUV2BGR_NV12); return result; @@ -454,14 +519,17 @@ cv::Mat ReplaceBackgroundFilterRunable::yuvFrameToMat8(const QVideoFrame &frame) class YUVBuffer : public QAbstractVideoBuffer { public: - YUVBuffer(cv::Mat *mat) : m_mode(QVideoFrame::NotMapped) { + YUVBuffer(cv::Mat *mat) : m_mode(QVideoFrame::NotMapped) + { m_yuvMat.reset(mat); } QVideoFrameFormat format() const Q_DECL_OVERRIDE { return QVideoFrameFormat(QSize(m_yuvMat->cols, m_yuvMat->rows), QVideoFrameFormat::Format_YUV420P); } - QAbstractVideoBuffer::MapData map(QVideoFrame::MapMode mode) Q_DECL_OVERRIDE { + QAbstractVideoBuffer::MapData map(QVideoFrame::MapMode mode) Q_DECL_OVERRIDE + { QAbstractVideoBuffer::MapData data; - if (mode != QVideoFrame::NotMapped && m_mode == QVideoFrame::NotMapped) { + if (mode != QVideoFrame::NotMapped && m_mode == QVideoFrame::NotMapped) + { data.planeCount = 1; data.dataSize[0] = m_yuvMat->rows * m_yuvMat->cols; @@ -475,6 +543,7 @@ class YUVBuffer : public QAbstractVideoBuffer return data; } void unmap() Q_DECL_OVERRIDE { m_mode = QVideoFrame::NotMapped; } + private: QVideoFrame::MapMode m_mode; QScopedPointer m_yuvMat; @@ -512,7 +581,8 @@ void ReplaceBackgroundFilterRunable::mat8ToYuvFrame(const cv::Mat &mat, uchar *d QImage ReplaceBackgroundFilterRunable::imageWrapper(const QVideoFrame &frame) { #ifndef QT_NO_OPENGL - if (frame.handleType() == QVideoFrame::RhiTextureHandle) { + if (frame.handleType() == QVideoFrame::RhiTextureHandle) + { // Slow and inefficient path. 
Ideally what's on the GPU should remain on the GPU, instead of readbacks like this. QImage img(frame.width(), frame.height(), QImage::Format_RGBA8888); // GLuint textureId = frame.handle().toUInt(); @@ -527,10 +597,12 @@ QImage ReplaceBackgroundFilterRunable::imageWrapper(const QVideoFrame &frame) // f->glReadPixels(0, 0, frame.width(), frame.height(), GL_RGBA, GL_UNSIGNED_BYTE, img.bits()); // f->glBindFramebuffer(GL_FRAMEBUFFER, prevFbo); return img; - } else + } + else #endif // QT_NO_OPENGL { - if (!frame.isReadable()) { + if (!frame.isReadable()) + { qWarning("imageWrapper: No mapped image data available for read"); return QImage(); } @@ -547,12 +619,12 @@ QImage ReplaceBackgroundFilterRunable::imageWrapper(const QVideoFrame &frame) class TextureBuffer : public QAbstractVideoBuffer { public: - TextureBuffer(uint id) : QAbstractVideoBuffer(), m_id(id) { } + TextureBuffer(uint id) : QAbstractVideoBuffer(), m_id(id) {} QVideoFrame::MapMode mapMode() const { return QVideoFrame::NotMapped; } QAbstractVideoBuffer::MapData map(QVideoFrame::MapMode) override { return QAbstractVideoBuffer::MapData(); } - void unmap() override { } + void unmap() override {} QVariant handle() const { return QVariant::fromValue(m_id); } - QVideoFrameFormat format() const override {return QVideoFrameFormat(); } + QVideoFrameFormat format() const override { return QVideoFrameFormat(); } private: GLuint m_id; diff --git a/src/replacebackgroundvideofilter.h b/src/replacebackgroundvideofilter.h index 436bdfff..770df6cc 100644 --- a/src/replacebackgroundvideofilter.h +++ b/src/replacebackgroundvideofilter.h @@ -8,10 +8,18 @@ #include #include #include -#include "yolo11seg.h" +#include +#include "yolo11segonnx.h" +#include "yolo11segncnn.h" class ReplaceBackgroundFilterRunable; +enum class NeuralNetworkRuntime +{ + ONNX, + NCNN +}; + class ReplaceBackgroundVideoFilter : public QVideoFrameInput { friend ReplaceBackgroundFilterRunable; @@ -24,12 +32,15 @@ class ReplaceBackgroundVideoFilter : 
public QVideoFrameInput Q_PROPERTY(QUrl background WRITE setBackground) /* video frame input */ Q_PROPERTY(QObject* videoSink WRITE setVideoSink) + /* neural network runtime */ + Q_PROPERTY(QString neuralNetworkRuntime READ getNeuralNetworkRuntime WRITE setNeuralNetworkRuntime) public: ReplaceBackgroundVideoFilter(QObject *parent = nullptr); ~ReplaceBackgroundVideoFilter(); void setMethod(QString method); + void setNeuralNetworkRuntime(QString runtime); void setKeyColor(float color); void setBackground(QImage const& image); Q_INVOKABLE void setBackground(QUrl& imagePath) @@ -58,6 +69,7 @@ class ReplaceBackgroundVideoFilter : public QVideoFrameInput } } QString getMethod() const; + QString getNeuralNetworkRuntime() const; float getKeyColor() const; void setVideoSink(QObject *videoSink); @@ -78,6 +90,8 @@ class ReplaceBackgroundVideoFilter : public QVideoFrameInput FilterMethod mFilterMethod = FilterMethod::CHROMA; + NeuralNetworkRuntime mNeuralNetworkRuntime = NeuralNetworkRuntime::ONNX; + QVideoSink *mVideoSink; QThread mWorkerThread; @@ -95,6 +109,8 @@ protected slots: void asyncProcessFrame(const QVariant& frame, bool applyBackground, bool highResFilter); void captureProcessingFinished(const QString& fileName); + + void changeNeuralNetworkRuntime(const NeuralNetworkRuntime& runtime); }; @@ -105,6 +121,8 @@ class ReplaceBackgroundFilterRunable : public QObject ReplaceBackgroundFilterRunable(ReplaceBackgroundVideoFilter* filter); public slots: void run(const QVariant &input, bool highResStill, bool highResFilter); + + void changeNeuralNetworkRuntime(const NeuralNetworkRuntime& runtime); protected: cv::Mat chromaKeyMask(const cv::Mat& img, const cv::Scalar& lower_color, const cv::Scalar& upper_color); @@ -155,9 +173,8 @@ public slots: bool mYuv; ReplaceBackgroundVideoFilter* mFilter; - YOLOv11SegDetector mYoloSegmentorFast; - YOLOv11SegDetector mYoloSegmentorSlow; - + QSharedPointer mYoloSegmentorPreview; + QSharedPointer mYoloSegmentorHighRes; signals: void 
processingFinished(const QImage& maskImage); void imageFileSaved(const QString& fileName); diff --git a/src/segmentation.cpp b/src/segmentation.cpp new file mode 100644 index 00000000..4ff9f71c --- /dev/null +++ b/src/segmentation.cpp @@ -0,0 +1,207 @@ +#include "segmentation.h" +#include +#include +#include +#include +#include +#include "utils.h" + +Yolo11Segementation::Yolo11Segementation(const std::string &labelsFile) +{ + classNames = utils::getClassNames(getModelRessourcePath(labelsFile)); + classColors = utils::generateColors(classNames); +} + +void Yolo11Segementation::drawSegmentationsAndBoxes(cv::Mat &image, + const std::vector &results, + float maskAlpha) const +{ + for (const auto &seg : results) + { + if (seg.conf < CONFIDENCE_THRESHOLD) + { + continue; + } + cv::Scalar color = classColors[seg.classId % classColors.size()]; + + // ----------------------------- + // 1. Draw Bounding Box + // ----------------------------- + cv::rectangle(image, + cv::Point(seg.box.x, seg.box.y), + cv::Point(seg.box.x + seg.box.width, seg.box.y + seg.box.height), + color, 2); + + // ----------------------------- + // 2. Draw Label + // ----------------------------- + std::string label = classNames[seg.classId] + " " + std::to_string(static_cast(seg.conf * 100)) + "%"; + int baseLine = 0; + double fontScale = 0.5; + int thickness = 1; + cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, fontScale, thickness, &baseLine); + int top = std::max(seg.box.y, labelSize.height + 5); + cv::rectangle(image, + cv::Point(seg.box.x, top - labelSize.height - 5), + cv::Point(seg.box.x + labelSize.width + 5, top), + color, cv::FILLED); + cv::putText(image, label, + cv::Point(seg.box.x + 2, top - 2), + cv::FONT_HERSHEY_SIMPLEX, + fontScale, + cv::Scalar(255, 255, 255), + thickness); + + // ----------------------------- + // 3. 
Apply Segmentation Mask + // ----------------------------- + if (!seg.mask.empty()) + { + // Ensure the mask is single-channel + cv::Mat mask_gray; + if (seg.mask.channels() == 3) + { + cv::cvtColor(seg.mask, mask_gray, cv::COLOR_BGR2GRAY); + } + else + { + mask_gray = seg.mask.clone(); + } + + // Threshold the mask to binary (object: 255, background: 0) + cv::Mat mask_binary; + cv::threshold(mask_gray, mask_binary, 127, 255, cv::THRESH_BINARY); + + // Create a colored version of the mask + cv::Mat colored_mask; + cv::cvtColor(mask_binary, colored_mask, cv::COLOR_GRAY2BGR); + colored_mask.setTo(color, mask_binary); // Apply color where mask is present + + // Blend the colored mask with the original image + cv::addWeighted(image, 1.0, colored_mask, maskAlpha, 0, image); + } + } +} + +void Yolo11Segementation::drawSegmentations(cv::Mat &image, + const std::vector &results, + float maskAlpha) const +{ + for (const auto &seg : results) + { + if (seg.conf < CONFIDENCE_THRESHOLD) + { + continue; + } + cv::Scalar color = classColors[seg.classId % classColors.size()]; + + // ----------------------------- + // Draw Segmentation Mask Only + // ----------------------------- + if (!seg.mask.empty()) + { + // Ensure the mask is single-channel + cv::Mat mask_gray; + if (seg.mask.channels() == 3) + { + cv::cvtColor(seg.mask, mask_gray, cv::COLOR_BGR2GRAY); + } + else + { + mask_gray = seg.mask.clone(); + } + + // Threshold the mask to binary (object: 255, background: 0) + cv::Mat mask_binary; + cv::threshold(mask_gray, mask_binary, 127, 255, cv::THRESH_BINARY); + + // Create a colored version of the mask + cv::Mat colored_mask; + cv::cvtColor(mask_binary, colored_mask, cv::COLOR_GRAY2BGR); + colored_mask.setTo(color, mask_binary); // Apply color where mask is present + + // Blend the colored mask with the original image + cv::addWeighted(image, 1.0, colored_mask, maskAlpha, 0, image); + } + } +} + +void Yolo11Segementation::drawSegmentationMask(cv::Mat &image, + const std::vector 
&results, + const std::vector &classesFilter) const +{ + for (const auto &seg : results) + { + if (seg.conf < CONFIDENCE_THRESHOLD) + { + continue; + } + + if (!classesFilter.empty()) + { + if (std::find(classesFilter.begin(), classesFilter.end(), seg.classId) == classesFilter.end()) + { + // class id not in filter + continue; + } + } + + // ----------------------------- + // Draw Segmentation Mask Only + // ----------------------------- + if (!seg.mask.empty()) + { + // Ensure the mask is single-channel + cv::Mat mask_gray; + if (seg.mask.channels() == 3) + { + cv::cvtColor(seg.mask, mask_gray, cv::COLOR_BGR2GRAY); + } + else + { + mask_gray = seg.mask.clone(); + mask_gray *= 255; + } + + // Threshold the mask to binary (object: 255, background: 0) + cv::Mat mask_binary; + cv::threshold(mask_gray, mask_binary, 127, 255, cv::THRESH_BINARY); + + cv::normalize(mask_binary, mask_binary, 0, 255, cv::NORM_MINMAX, CV_8UC1); + + int image_type = image.type(); + int mask_type = mask_binary.type(); + + // Blend the mask together into one single mask + cv::add(image, mask_binary, image); + } + } +} + +std::string Yolo11Segementation::getModelRessourcePath(const std::string &filename) const +{ + QString ressourcePathGeneric = QStandardPaths::locate(QStandardPaths::GenericDataLocation, "models", QStandardPaths::LocateDirectory); + QString ressourcePathApp = QStandardPaths::locate(QStandardPaths::AppDataLocation, "models", QStandardPaths::LocateDirectory); + if (ressourcePathApp.isEmpty() && ressourcePathGeneric.isEmpty()) + { + throw std::runtime_error("Failed to locate the models directory."); + } + QString ressourcePath = ""; + + if (QFile(ressourcePathApp + "/" + QString::fromStdString(filename)).exists()) + { + ressourcePath = ressourcePathApp + "/" + QString::fromStdString(filename); + qDebug() << "Using model from app data path:" << ressourcePath; + } + else if (QFile(ressourcePathGeneric + "/" + QString::fromStdString(filename)).exists()) + { + ressourcePath = 
ressourcePathGeneric + "/" + QString::fromStdString(filename); + qDebug() << "Using model from generic data path:" << ressourcePath; + } + else + { + throw std::runtime_error("Model file not found: " + filename); + } + + return ressourcePath.toStdString(); +} diff --git a/src/segmentation.h b/src/segmentation.h new file mode 100644 index 00000000..fe7c5211 --- /dev/null +++ b/src/segmentation.h @@ -0,0 +1,73 @@ +#ifndef SEGMENTATION_H +#define SEGMENTATION_H + +#include +#include +#include + +struct BoundingBox { + int x{0}; + int y{0}; + int width{0}; + int height{0}; + + BoundingBox() = default; + BoundingBox(int _x, int _y, int w, int h) + : x(_x), y(_y), width(w), height(h) {} + + float area() const { return static_cast(width * height); } + + BoundingBox intersect(const BoundingBox &other) const { + int xStart = std::max(x, other.x); + int yStart = std::max(y, other.y); + int xEnd = std::min(x + width, other.x + other.width); + int yEnd = std::min(y + height, other.y + other.height); + int iw = std::max(0, xEnd - xStart); + int ih = std::max(0, yEnd - yStart); + return BoundingBox(xStart, yStart, iw, ih); + } +}; + +struct Segmentation { + BoundingBox box; + float conf{0.f}; + int classId{0}; + cv::Mat mask; // Single-channel (8UC1) mask in full resolution +}; + +class Yolo11Segementation +{ +public: + Yolo11Segementation(const std::string &labelsPath); + + virtual std::vector segment(const cv::Mat &image, + float confThreshold = CONFIDENCE_THRESHOLD, + float iouThreshold = IOU_THRESHOLD) = 0; + + // Draw results + void drawSegmentationsAndBoxes(cv::Mat &image, + const std::vector &results, + float maskAlpha = 0.5f) const; + + void drawSegmentations(cv::Mat &image, + const std::vector &results, + float maskAlpha = 0.5f) const; + + void drawSegmentationMask(cv::Mat &image, + const std::vector &results, + const std::vector &classesFilter) const; + // Accessors + const std::vector &getClassNames() const { return classNames; } + const std::vector &getClassColors() 
const { return classColors; } +protected: + std::vector classNames; + std::vector classColors; + + std::string getModelRessourcePath(const std::string &filename) const; + + static constexpr float CONFIDENCE_THRESHOLD = 0.40f; // Filter boxes below this confidence + static constexpr float IOU_THRESHOLD = 0.45f; // NMS IoU threshold + static constexpr float MASK_THRESHOLD = 0.40f; // Slightly lower to capture partial objects +}; + +#endif // SEGMENTATION_H diff --git a/src/utils.h b/src/utils.h new file mode 100644 index 00000000..e6353c69 --- /dev/null +++ b/src/utils.h @@ -0,0 +1,201 @@ +#ifndef UTILS_H_ +#define UTILS_H_ + +#include +#include +#include +#include +#include +#include + +namespace utils { + + template + T clamp(const T &val, const T &low, const T &high) { + return std::max(low, std::min(val, high)); + } + + inline std::vector getClassNames(const std::string &path) { + std::vector classNames; + QFile f(path.c_str()); + if (!f.open(QIODevice::ReadOnly)) { + qWarning() << "[ERROR] Could not open class names file: " << path.c_str(); + return classNames; + } + std::string line; + while (!(line = f.readLine()).empty()) { + if (!line.empty() && line.back() == '\r') { + line.pop_back(); + } + classNames.push_back(line); + } + qDebug() << "Loaded " << classNames.size() << " class names from " << path; + return classNames; + } + + inline size_t vectorProduct(const std::vector &shape) { + return std::accumulate(shape.begin(), shape.end(), 1ull, std::multiplies()); + } + + inline void letterBox(const cv::Mat &image, + cv::Mat &outImage, + const cv::Size &newShape, + const cv::Scalar &color = cv::Scalar(114, 114, 114), + bool auto_ = true, + bool scaleFill = false, + bool scaleUp = true, + int stride = 32) { + float r = std::min((float)newShape.height / (float)image.rows, + (float)newShape.width / (float)image.cols); + if (!scaleUp) { + r = std::min(r, 1.0f); + } + + int newW = static_cast(std::round(image.cols * r)); + int newH = 
static_cast(std::round(image.rows * r)); + + int dw = newShape.width - newW; + int dh = newShape.height - newH; + + if (auto_) { + dw = dw % stride; + dh = dh % stride; + } + else if (scaleFill) { + newW = newShape.width; + newH = newShape.height; + dw = 0; + dh = 0; + } + + cv::Mat resized; + cv::resize(image, resized, cv::Size(newW, newH), 0, 0, cv::INTER_LINEAR); + + int top = dh / 2; + int bottom = dh - top; + int left = dw / 2; + int right = dw - left; + cv::copyMakeBorder(resized, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color); + } + + inline BoundingBox scaleCoords(const cv::Size &letterboxShape, + const BoundingBox &coords, + const cv::Size &originalShape, + bool p_Clip = true) { + float gain = std::min((float)letterboxShape.height / (float)originalShape.height, + (float)letterboxShape.width / (float)originalShape.width); + + int padW = static_cast(std::round(((float)letterboxShape.width - (float)originalShape.width * gain) / 2.f)); + int padH = static_cast(std::round(((float)letterboxShape.height - (float)originalShape.height * gain) / 2.f)); + + BoundingBox ret; + ret.x = static_cast(std::round(((float)coords.x - (float)padW) / gain)); + ret.y = static_cast(std::round(((float)coords.y - (float)padH) / gain)); + ret.width = static_cast(std::round((float)coords.width / gain)); + ret.height = static_cast(std::round((float)coords.height / gain)); + + if (p_Clip) { + ret.x = clamp(ret.x, 0, originalShape.width); + ret.y = clamp(ret.y, 0, originalShape.height); + ret.width = clamp(ret.width, 0, originalShape.width - ret.x); + ret.height = clamp(ret.height, 0, originalShape.height - ret.y); + } + + return ret; + } + + inline std::vector generateColors(const std::vector &classNames, int seed = 42) { + static std::unordered_map> cache; + size_t key = 0; + for (const auto &name : classNames) { + size_t h = std::hash{}(name); + key ^= (h + 0x9e3779b9 + (key << 6) + (key >> 2)); + } + auto it = cache.find(key); + if (it != cache.end()) { + return 
it->second; + } + std::mt19937 rng(seed); + std::uniform_int_distribution dist(0, 255); + std::vector colors; + colors.reserve(classNames.size()); + for (size_t i = 0; i < classNames.size(); ++i) { + colors.emplace_back(cv::Scalar(dist(rng), dist(rng), dist(rng))); + } + cache[key] = colors; + return colors; + } + + + + inline cv::Mat sigmoid(const cv::Mat& src) { + cv::Mat dst; + cv::exp(-src, dst); + dst = 1.0 / (1.0 + dst); + return dst; + } + + inline void NMSBoxes(const std::vector &boxes, + const std::vector &scores, + float scoreThreshold, + float nmsThreshold, + std::vector &indices) { + indices.clear(); + if (boxes.empty()) { + return; + } + + std::vector order; + order.reserve(boxes.size()); + for (size_t i = 0; i < boxes.size(); ++i) { + if (scores[i] >= scoreThreshold) { + order.push_back((int)i); + } + } + if (order.empty()) return; + + std::sort(order.begin(), order.end(), + [&scores](int a, int b) { + return scores[a] > scores[b]; + }); + + std::vector areas(boxes.size()); + for (size_t i = 0; i < boxes.size(); ++i) { + areas[i] = (float)(boxes[i].width * boxes[i].height); + } + + std::vector suppressed(boxes.size(), false); + for (size_t i = 0; i < order.size(); ++i) { + int idx = order[i]; + if (suppressed[idx]) continue; + + indices.push_back(idx); + + for (size_t j = i + 1; j < order.size(); ++j) { + int idx2 = order[j]; + if (suppressed[idx2]) continue; + + const BoundingBox &a = boxes[idx]; + const BoundingBox &b = boxes[idx2]; + int interX1 = std::max(a.x, b.x); + int interY1 = std::max(a.y, b.y); + int interX2 = std::min(a.x + a.width, b.x + b.width); + int interY2 = std::min(a.y + a.height, b.y + b.height); + + int w = interX2 - interX1; + int h = interY2 - interY1; + if (w > 0 && h > 0) { + float interArea = (float)(w * h); + float unionArea = areas[idx] + areas[idx2] - interArea; + float iou = (unionArea > 0.f)? 
(interArea / unionArea) : 0.f; + if (iou > nmsThreshold) { + suppressed[idx2] = true; + } + } + } + } + } + + } // namespace utils + +#endif // UTILS_H_ diff --git a/src/yolo11seg.cpp b/src/yolo11seg.cpp deleted file mode 100644 index bb333276..00000000 --- a/src/yolo11seg.cpp +++ /dev/null @@ -1,693 +0,0 @@ -#include "yolo11seg.h" -#include - -// Original Author: Abdalrahman M. Amer, www.linkedin.com/in/abdalrahman-m-amer -// Date: 25.01.2025 -// Modified for use in photbooth - -// ============================================================================ -// Utility Namespace -// ============================================================================ -namespace utils { - -template -T clamp(const T &val, const T &low, const T &high) { - return std::max(low, std::min(val, high)); -} - -inline std::vector getClassNames(const std::string &path) { - std::vector classNames; - QFile f(path.c_str()); - if (!f.open(QIODevice::ReadOnly)) { - qWarning() << "[ERROR] Could not open class names file: " << path.c_str(); - return classNames; - } - std::string line; - while (!(line = f.readLine()).empty()) { - if (!line.empty() && line.back() == '\r') { - line.pop_back(); - } - classNames.push_back(line); - } - qDebug() << "Loaded " << classNames.size() << " class names from " << path; - return classNames; -} - -inline size_t vectorProduct(const std::vector &shape) { - return std::accumulate(shape.begin(), shape.end(), 1ull, std::multiplies()); -} - -inline void letterBox(const cv::Mat &image, - cv::Mat &outImage, - const cv::Size &newShape, - const cv::Scalar &color = cv::Scalar(114, 114, 114), - bool auto_ = true, - bool scaleFill = false, - bool scaleUp = true, - int stride = 32) { - float r = std::min((float)newShape.height / (float)image.rows, - (float)newShape.width / (float)image.cols); - if (!scaleUp) { - r = std::min(r, 1.0f); - } - - int newW = static_cast(std::round(image.cols * r)); - int newH = static_cast(std::round(image.rows * r)); - - int dw = 
newShape.width - newW; - int dh = newShape.height - newH; - - if (auto_) { - dw = dw % stride; - dh = dh % stride; - } - else if (scaleFill) { - newW = newShape.width; - newH = newShape.height; - dw = 0; - dh = 0; - } - - cv::Mat resized; - cv::resize(image, resized, cv::Size(newW, newH), 0, 0, cv::INTER_LINEAR); - - int top = dh / 2; - int bottom = dh - top; - int left = dw / 2; - int right = dw - left; - cv::copyMakeBorder(resized, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color); -} - -inline BoundingBox scaleCoords(const cv::Size &letterboxShape, - const BoundingBox &coords, - const cv::Size &originalShape, - bool p_Clip = true) { - float gain = std::min((float)letterboxShape.height / (float)originalShape.height, - (float)letterboxShape.width / (float)originalShape.width); - - int padW = static_cast(std::round(((float)letterboxShape.width - (float)originalShape.width * gain) / 2.f)); - int padH = static_cast(std::round(((float)letterboxShape.height - (float)originalShape.height * gain) / 2.f)); - - BoundingBox ret; - ret.x = static_cast(std::round(((float)coords.x - (float)padW) / gain)); - ret.y = static_cast(std::round(((float)coords.y - (float)padH) / gain)); - ret.width = static_cast(std::round((float)coords.width / gain)); - ret.height = static_cast(std::round((float)coords.height / gain)); - - if (p_Clip) { - ret.x = clamp(ret.x, 0, originalShape.width); - ret.y = clamp(ret.y, 0, originalShape.height); - ret.width = clamp(ret.width, 0, originalShape.width - ret.x); - ret.height = clamp(ret.height, 0, originalShape.height - ret.y); - } - - return ret; -} - -inline std::vector generateColors(const std::vector &classNames, int seed = 42) { - static std::unordered_map> cache; - size_t key = 0; - for (const auto &name : classNames) { - size_t h = std::hash{}(name); - key ^= (h + 0x9e3779b9 + (key << 6) + (key >> 2)); - } - auto it = cache.find(key); - if (it != cache.end()) { - return it->second; - } - std::mt19937 rng(seed); - 
std::uniform_int_distribution dist(0, 255); - std::vector colors; - colors.reserve(classNames.size()); - for (size_t i = 0; i < classNames.size(); ++i) { - colors.emplace_back(cv::Scalar(dist(rng), dist(rng), dist(rng))); - } - cache[key] = colors; - return colors; -} - - - -cv::Mat sigmoid(const cv::Mat& src) { - cv::Mat dst; - cv::exp(-src, dst); - dst = 1.0 / (1.0 + dst); - return dst; -} -inline void NMSBoxes(const std::vector &boxes, - const std::vector &scores, - float scoreThreshold, - float nmsThreshold, - std::vector &indices) { - indices.clear(); - if (boxes.empty()) { - return; - } - - std::vector order; - order.reserve(boxes.size()); - for (size_t i = 0; i < boxes.size(); ++i) { - if (scores[i] >= scoreThreshold) { - order.push_back((int)i); - } - } - if (order.empty()) return; - - std::sort(order.begin(), order.end(), - [&scores](int a, int b) { - return scores[a] > scores[b]; - }); - - std::vector areas(boxes.size()); - for (size_t i = 0; i < boxes.size(); ++i) { - areas[i] = (float)(boxes[i].width * boxes[i].height); - } - - std::vector suppressed(boxes.size(), false); - for (size_t i = 0; i < order.size(); ++i) { - int idx = order[i]; - if (suppressed[idx]) continue; - - indices.push_back(idx); - - for (size_t j = i + 1; j < order.size(); ++j) { - int idx2 = order[j]; - if (suppressed[idx2]) continue; - - const BoundingBox &a = boxes[idx]; - const BoundingBox &b = boxes[idx2]; - int interX1 = std::max(a.x, b.x); - int interY1 = std::max(a.y, b.y); - int interX2 = std::min(a.x + a.width, b.x + b.width); - int interY2 = std::min(a.y + a.height, b.y + b.height); - - int w = interX2 - interX1; - int h = interY2 - interY1; - if (w > 0 && h > 0) { - float interArea = (float)(w * h); - float unionArea = areas[idx] + areas[idx2] - interArea; - float iou = (unionArea > 0.f)? 
(interArea / unionArea) : 0.f; - if (iou > nmsThreshold) { - suppressed[idx2] = true; - } - } - } - } -} - -} // namespace utils - -YOLOv11SegDetector::YOLOv11SegDetector(const std::string &modelPath, - const std::string &labelsPath, - bool useGPU) - : env(ORT_LOGGING_LEVEL_WARNING, "YOLOv11Seg") -{ - sessionOptions.SetIntraOpNumThreads(std::min(6, static_cast(std::thread::hardware_concurrency()))); - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - - std::vector providers = Ort::GetAvailableProviders(); - if (useGPU && std::find(providers.begin(), providers.end(), "CUDAExecutionProvider") != providers.end()) { - OrtCUDAProviderOptions cudaOptions; - sessionOptions.AppendExecutionProvider_CUDA(cudaOptions); - qDebug() << "[INFO] Using GPU (CUDA) for YOLOv11 Seg inference."; - } else { - qDebug() << "[INFO] Using CPU for YOLOv11 Seg inference."; - } - - QFile modelFile(modelPath.c_str()); - - if (!modelFile.open(QIODevice::ReadOnly)) { - qWarning() << "Failed to open the model file!"; - } - - QByteArray binaryData = modelFile.readAll(); - -#ifdef _WIN32 - std::wstring w_modelPath(modelPath.begin(), modelPath.end()); - session = Ort::Session(env, w_modelPath.c_str(), sessionOptions); -#else - session = Ort::Session(env, binaryData.data(), binaryData.size(), sessionOptions); -#endif - - numInputNodes = session.GetInputCount(); - numOutputNodes = session.GetOutputCount(); - - Ort::AllocatorWithDefaultOptions allocator; - - // Input - { - auto inNameAlloc = session.GetInputNameAllocated(0, allocator); - inputNameAllocs.emplace_back(std::move(inNameAlloc)); - inputNames.push_back(inputNameAllocs.back().get()); - - auto inTypeInfo = session.GetInputTypeInfo(0); - auto inShape = inTypeInfo.GetTensorTypeAndShapeInfo().GetShape(); - - if (inShape.size() == 4) { - if (inShape[2] == -1 || inShape[3] == -1) { - isDynamicInputShape = true; - inputImageShape = cv::Size(640, 640); // Fallback if dynamic - } else { - inputImageShape = 
cv::Size(static_cast(inShape[3]), static_cast(inShape[2])); - } - } else { - throw std::runtime_error("Model input is not 4D! Expect [N, C, H, W]."); - } - } - - // Outputs - if (numOutputNodes != 2) { - throw std::runtime_error("Expected exactly 2 output nodes: output0 and output1."); - } - - for (size_t i = 0; i < numOutputNodes; ++i) { - auto outNameAlloc = session.GetOutputNameAllocated(i, allocator); - outputNameAllocs.emplace_back(std::move(outNameAlloc)); - outputNames.push_back(outputNameAllocs.back().get()); - } - - classNames = utils::getClassNames(labelsPath); - classColors = utils::generateColors(classNames); - - qDebug() << "[INFO] YOLOv11Seg loaded: " << modelPath; - qDebug() << " Input shape: " << inputImageShape.height << "x" << inputImageShape.width - << (isDynamicInputShape ? " (dynamic)" : ""); - qDebug() << " #Outputs : " << numOutputNodes; - qDebug() << " #Classes : " << classNames.size(); -} - -cv::Mat YOLOv11SegDetector::preprocess(const cv::Mat &image, - float *&blobPtr, - std::vector &inputTensorShape) -{ - cv::Mat letterboxImage; - utils::letterBox(image, letterboxImage, inputImageShape, - cv::Scalar(114,114,114), /*auto_=*/isDynamicInputShape, - /*scaleFill=*/false, /*scaleUp=*/true, /*stride=*/32); - - // Update if dynamic - inputTensorShape[2] = static_cast(letterboxImage.rows); - inputTensorShape[3] = static_cast(letterboxImage.cols); - - letterboxImage.convertTo(letterboxImage, CV_32FC3, 1.0f/255.0f); - - size_t size = static_cast(letterboxImage.rows) * static_cast(letterboxImage.cols) * 3; - blobPtr = new float[size]; - - std::vector channels(3); - for (int c = 0; c < 3; ++c) { - channels[c] = cv::Mat(letterboxImage.rows, letterboxImage.cols, CV_32FC1, - blobPtr + c * (letterboxImage.rows * letterboxImage.cols)); - } - cv::split(letterboxImage, channels); - - return letterboxImage; -} - -std::vector YOLOv11SegDetector::postprocess( - const cv::Size &origSize, - const cv::Size &letterboxSize, - const std::vector &outputs, - float 
confThreshold, - float iouThreshold) -{ - std::vector results; - - // Validate outputs size - if (outputs.size() < 2) { - throw std::runtime_error("Insufficient outputs from the model. Expected at least 2 outputs."); - } - - // Extract outputs - const float* output0_ptr = outputs[0].GetTensorData(); - const float* output1_ptr = outputs[1].GetTensorData(); - - // Get shapes - auto shape0 = outputs[0].GetTensorTypeAndShapeInfo().GetShape(); // [1, 116, num_detections] - auto shape1 = outputs[1].GetTensorTypeAndShapeInfo().GetShape(); // [1, 32, maskH, maskW] - - if (shape1.size() != 4 || shape1[0] != 1 || shape1[1] != 32) - throw std::runtime_error("Unexpected output1 shape. Expected [1, 32, maskH, maskW]."); - - const size_t num_features = shape0[1]; // e.g 80 class + 4 bbox parms + 32 seg masks = 116 - const size_t num_detections = shape0[2]; - - // Early exit if no detections - if (num_detections == 0) - { - return results; - } - - const int numClasses = static_cast(num_features - 4 - 32); // Corrected number of classes - - // Validate numClasses - if (numClasses <= 0) - { - throw std::runtime_error("Invalid number of classes."); - } - - const int numBoxes = static_cast(num_detections); - const int maskH = static_cast(shape1[2]); - const int maskW = static_cast(shape1[3]); - - // Constants from model architecture - constexpr int BOX_OFFSET = 0; - constexpr int CLASS_CONF_OFFSET = 4; - const int MASK_COEFF_OFFSET = numClasses + CLASS_CONF_OFFSET; - - // 1. Process prototype masks - // Store all prototype masks in a vector for easy access - std::vector prototypeMasks; - prototypeMasks.reserve(32); - for (int m = 0; m < 32; ++m) { - // Each mask is maskH x maskW - cv::Mat proto(maskH, maskW, CV_32F, const_cast(output1_ptr + m * maskH * maskW)); - prototypeMasks.emplace_back(proto.clone()); // Clone to ensure data integrity - } - - // 2. 
Process detections - std::vector boxes; - boxes.reserve(numBoxes); - std::vector confidences; - confidences.reserve(numBoxes); - std::vector classIds; - classIds.reserve(numBoxes); - std::vector> maskCoefficientsList; - maskCoefficientsList.reserve(numBoxes); - - for (int i = 0; i < numBoxes; ++i) { - // Extract box coordinates - float xc = output0_ptr[BOX_OFFSET * numBoxes + i]; - float yc = output0_ptr[(BOX_OFFSET + 1) * numBoxes + i]; - float w = output0_ptr[(BOX_OFFSET + 2) * numBoxes + i]; - float h = output0_ptr[(BOX_OFFSET + 3) * numBoxes + i]; - - // Convert to xyxy format - BoundingBox box{ - static_cast(std::round(xc - w / 2.0f)), - static_cast(std::round(yc - h / 2.0f)), - static_cast(std::round(w)), - static_cast(std::round(h)) - }; - - // Get class confidence - float maxConf = 0.0f; - int classId = -1; - for (int c = 0; c < numClasses; ++c) { - float conf = output0_ptr[(CLASS_CONF_OFFSET + c) * numBoxes + i]; - if (conf > maxConf) { - maxConf = conf; - classId = c; - } - } - - if (maxConf < confThreshold) continue; - - // Store detection - boxes.push_back(box); - confidences.push_back(maxConf); - classIds.push_back(classId); - - // Store mask coefficients - std::vector maskCoeffs(32); - for (int m = 0; m < 32; ++m) { - maskCoeffs[m] = output0_ptr[(MASK_COEFF_OFFSET + m) * numBoxes + i]; - } - maskCoefficientsList.emplace_back(std::move(maskCoeffs)); - } - - // Early exit if no boxes after confidence threshold - if (boxes.empty()) { - return results; - } - - // 3. Apply NMS - std::vector nmsIndices; - utils::NMSBoxes(boxes, confidences, confThreshold, iouThreshold, nmsIndices); - - if (nmsIndices.empty()) { - return results; - } - - // 4. 
Prepare final results - results.reserve(nmsIndices.size()); - - // Calculate letterbox parameters - const float gain = std::min(static_cast(letterboxSize.height) / origSize.height, - static_cast(letterboxSize.width) / origSize.width); - const int scaledW = static_cast(origSize.width * gain); - const int scaledH = static_cast(origSize.height * gain); - const float padW = (letterboxSize.width - scaledW) / 2.0f; - const float padH = (letterboxSize.height - scaledH) / 2.0f; - - // Precompute mask scaling factors - const float maskScaleX = static_cast(maskW) / letterboxSize.width; - const float maskScaleY = static_cast(maskH) / letterboxSize.height; - - for (const int idx : nmsIndices) { - Segmentation seg; - seg.box = boxes[idx]; - seg.conf = confidences[idx]; - seg.classId = classIds[idx]; - - // 5. Scale box to original image - seg.box = utils::scaleCoords(letterboxSize, seg.box, origSize, true); - - // 6. Process mask - const auto& maskCoeffs = maskCoefficientsList[idx]; - - // Linear combination of prototype masks - cv::Mat finalMask = cv::Mat::zeros(maskH, maskW, CV_32F); - for (int m = 0; m < 32; ++m) { - finalMask += maskCoeffs[m] * prototypeMasks[m]; - } - - // Apply sigmoid activation - finalMask = utils::sigmoid(finalMask); - - // Crop mask to letterbox area with a slight padding to avoid border issues - int x1 = static_cast(std::round((padW - 0.1f) * maskScaleX)); - int y1 = static_cast(std::round((padH - 0.1f) * maskScaleY)); - int x2 = static_cast(std::round((letterboxSize.width - padW + 0.1f) * maskScaleX)); - int y2 = static_cast(std::round((letterboxSize.height - padH + 0.1f) * maskScaleY)); - - // Ensure coordinates are within mask bounds - x1 = std::max(0, std::min(x1, maskW - 1)); - y1 = std::max(0, std::min(y1, maskH - 1)); - x2 = std::max(x1, std::min(x2, maskW)); - y2 = std::max(y1, std::min(y2, maskH)); - - // Handle cases where cropping might result in zero area - if (x2 <= x1 || y2 <= y1) { - // Skip this mask as cropping is invalid - continue; 
- } - - cv::Rect cropRect(x1, y1, x2 - x1, y2 - y1); - cv::Mat croppedMask = finalMask(cropRect).clone(); // Clone to ensure data integrity - - // Resize to original dimensions - cv::Mat resizedMask; - cv::resize(croppedMask, resizedMask, origSize, 0, 0, cv::INTER_LINEAR); - - // Threshold and convert to binary - cv::Mat binaryMask; - cv::threshold(resizedMask, binaryMask, 0.5, 255.0, cv::THRESH_BINARY); - binaryMask.convertTo(binaryMask, CV_8U); - - // Crop to bounding box - cv::Mat finalBinaryMask = cv::Mat::zeros(origSize, CV_8U); - cv::Rect roi(seg.box.x, seg.box.y, seg.box.width, seg.box.height); - roi &= cv::Rect(0, 0, binaryMask.cols, binaryMask.rows); // Ensure ROI is within mask - if (roi.area() > 0) { - binaryMask(roi).copyTo(finalBinaryMask(roi)); - } - - seg.mask = finalBinaryMask; - results.push_back(seg); - } - - return results; -} - -void YOLOv11SegDetector::drawSegmentationsAndBoxes(cv::Mat &image, - const std::vector &results, - float maskAlpha) const -{ - for (const auto &seg : results) { - if (seg.conf < CONFIDENCE_THRESHOLD) { - continue; - } - cv::Scalar color = classColors[seg.classId % classColors.size()]; - - // ----------------------------- - // 1. Draw Bounding Box - // ----------------------------- - cv::rectangle(image, - cv::Point(seg.box.x, seg.box.y), - cv::Point(seg.box.x + seg.box.width, seg.box.y + seg.box.height), - color, 2); - - // ----------------------------- - // 2. 
Draw Label - // ----------------------------- - std::string label = classNames[seg.classId] + " " + std::to_string(static_cast(seg.conf * 100)) + "%"; - int baseLine = 0; - double fontScale = 0.5; - int thickness = 1; - cv::Size labelSize = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, fontScale, thickness, &baseLine); - int top = std::max(seg.box.y, labelSize.height + 5); - cv::rectangle(image, - cv::Point(seg.box.x, top - labelSize.height - 5), - cv::Point(seg.box.x + labelSize.width + 5, top), - color, cv::FILLED); - cv::putText(image, label, - cv::Point(seg.box.x + 2, top - 2), - cv::FONT_HERSHEY_SIMPLEX, - fontScale, - cv::Scalar(255, 255, 255), - thickness); - - // ----------------------------- - // 3. Apply Segmentation Mask - // ----------------------------- - if (!seg.mask.empty()) { - // Ensure the mask is single-channel - cv::Mat mask_gray; - if (seg.mask.channels() == 3) { - cv::cvtColor(seg.mask, mask_gray, cv::COLOR_BGR2GRAY); - } else { - mask_gray = seg.mask.clone(); - } - - // Threshold the mask to binary (object: 255, background: 0) - cv::Mat mask_binary; - cv::threshold(mask_gray, mask_binary, 127, 255, cv::THRESH_BINARY); - - // Create a colored version of the mask - cv::Mat colored_mask; - cv::cvtColor(mask_binary, colored_mask, cv::COLOR_GRAY2BGR); - colored_mask.setTo(color, mask_binary); // Apply color where mask is present - - // Blend the colored mask with the original image - cv::addWeighted(image, 1.0, colored_mask, maskAlpha, 0, image); - } - } -} - -void YOLOv11SegDetector::drawSegmentations(cv::Mat &image, - const std::vector &results, - float maskAlpha) const -{ - for (const auto &seg : results) { - if (seg.conf < CONFIDENCE_THRESHOLD) { - continue; - } - cv::Scalar color = classColors[seg.classId % classColors.size()]; - - // ----------------------------- - // Draw Segmentation Mask Only - // ----------------------------- - if (!seg.mask.empty()) { - // Ensure the mask is single-channel - cv::Mat mask_gray; - if 
(seg.mask.channels() == 3) { - cv::cvtColor(seg.mask, mask_gray, cv::COLOR_BGR2GRAY); - } else { - mask_gray = seg.mask.clone(); - } - - // Threshold the mask to binary (object: 255, background: 0) - cv::Mat mask_binary; - cv::threshold(mask_gray, mask_binary, 127, 255, cv::THRESH_BINARY); - - // Create a colored version of the mask - cv::Mat colored_mask; - cv::cvtColor(mask_binary, colored_mask, cv::COLOR_GRAY2BGR); - colored_mask.setTo(color, mask_binary); // Apply color where mask is present - - // Blend the colored mask with the original image - cv::addWeighted(image, 1.0, colored_mask, maskAlpha, 0, image); - } - } -} - -void YOLOv11SegDetector::drawSegmentationMask(cv::Mat &image, - const std::vector &results, - const std::vector &classesFilter) const -{ - for (const auto &seg : results) { - if (seg.conf < CONFIDENCE_THRESHOLD) { - continue; - } - - if(!classesFilter.empty()) - { - if(std::find(classesFilter.begin(), classesFilter.end(), seg.classId) == classesFilter.end()) - { - // class id not in filter - continue; - } - } - - // ----------------------------- - // Draw Segmentation Mask Only - // ----------------------------- - if (!seg.mask.empty()) { - // Ensure the mask is single-channel - cv::Mat mask_gray; - if (seg.mask.channels() == 3) { - cv::cvtColor(seg.mask, mask_gray, cv::COLOR_BGR2GRAY); - } else { - mask_gray = seg.mask.clone(); - mask_gray *= 255; - } - - // Threshold the mask to binary (object: 255, background: 0) - cv::Mat mask_binary; - cv::threshold(mask_gray, mask_binary, 127, 255, cv::THRESH_BINARY); - - cv::normalize(mask_binary, mask_binary, 0, 255, cv::NORM_MINMAX, CV_8UC1); - - int image_type = image.type(); - int mask_type = mask_binary.type(); - - // Blend the mask together into one single mask - cv::add(image, mask_binary, image); - } - } -} - -std::vector YOLOv11SegDetector::segment(const cv::Mat &image, - float confThreshold, - float iouThreshold) -{ - - float *blobPtr = nullptr; - std::vector inputShape = {1, 3, 
inputImageShape.height, inputImageShape.width}; - cv::Mat letterboxImg = preprocess(image, blobPtr, inputShape); - - size_t inputSize = utils::vectorProduct(inputShape); - std::vector inputVals(blobPtr, blobPtr + inputSize); - delete[] blobPtr; - - Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - Ort::Value inputTensor = Ort::Value::CreateTensor( - memInfo, - inputVals.data(), - inputSize, - inputShape.data(), - inputShape.size() - ); - - std::vector outputs = session.Run( - Ort::RunOptions{nullptr}, - inputNames.data(), - &inputTensor, - numInputNodes, - outputNames.data(), - numOutputNodes); - - cv::Size letterboxSize(static_cast(inputShape[3]), static_cast(inputShape[2])); - return postprocess(image.size(), letterboxSize, outputs, confThreshold, iouThreshold); -} diff --git a/src/yolo11segncnn.cpp b/src/yolo11segncnn.cpp new file mode 100644 index 00000000..111e4070 --- /dev/null +++ b/src/yolo11segncnn.cpp @@ -0,0 +1,293 @@ +#include "yolo11segncnn.h" +#include +#include "utils.h" +#include + +// Original Author: Abdalrahman M. 
Amer, www.linkedin.com/in/abdalrahman-m-amer +// Date: 25.01.2025 +// Modified for use in photbooth + +YOLOv11SegDetectorNcnn::YOLOv11SegDetectorNcnn(const std::string &modelPath, + const std::string &labelsPath, + bool useGPU) : Yolo11Segementation(labelsPath) +{ + QString ressourcePathGeneric = QStandardPaths::locate(QStandardPaths::GenericDataLocation, "models", QStandardPaths::LocateDirectory); + QString ressourcePathApp = QStandardPaths::locate(QStandardPaths::AppDataLocation, "models", QStandardPaths::LocateDirectory); + if (ressourcePathApp.isEmpty() && ressourcePathGeneric.isEmpty()) + { + throw std::runtime_error("Failed to locate the models directory."); + } + QString ressourcePath = ""; + if (!ressourcePathApp.isEmpty()) + { + ressourcePath = QDir::cleanPath(ressourcePathApp); + } + else if (!ressourcePathGeneric.isEmpty()) + { + ressourcePath = QDir::cleanPath(ressourcePathGeneric); + } + + qDebug() << "[INFO] Using model path: " << ressourcePath; + std::string params_path = ressourcePath.toStdString() + "/" + modelPath + "/model.ncnn.param"; + std::string model_path = ressourcePath.toStdString() + "/" + modelPath + "/model.ncnn.bin"; + + if (0 != net.load_param(params_path.c_str())) + { + throw std::runtime_error("Failed to load model parameters from: " + params_path); + } + + if (0 != net.load_model(model_path.c_str())) + { + throw std::runtime_error("Failed to load model binary from: " + model_path); + } + + // Set options + net.opt.use_vulkan_compute = useGPU; + + numInputNodes = net.input_names().size(); + numOutputNodes = net.output_names().size(); + + isDynamicInputShape = false; // Assume static input shape by default. NCNN models typically have fixed input shapes. + inputImageShape = cv::Size(640, 640); // Default shape. 
This is fixed for YOLOv11SegNCNN + + // Input + if (numInputNodes != 1) + { + throw std::runtime_error("Expected exactly 1 input node."); + } + + inputNames = net.input_names(); + + // Outputs + if (numOutputNodes != 2) + { + throw std::runtime_error("Expected exactly 2 output nodes: output0 and output1."); + } + + outputNames = net.output_names(); + + qDebug() << "[INFO] YOLOv11Seg loaded: " << modelPath; + qDebug() << " Input shape: " << inputImageShape.height << "x" << inputImageShape.width + << (isDynamicInputShape ? " (dynamic)" : ""); + qDebug() << " #Outputs : " << numOutputNodes; + qDebug() << " #Classes : " << classNames.size(); +} + +cv::Mat YOLOv11SegDetectorNcnn::preprocess(const cv::Mat &image, + float *&blobPtr, + std::vector &inputTensorShape) +{ + cv::Mat letterboxImage; + utils::letterBox(image, letterboxImage, inputImageShape, + cv::Scalar(114, 114, 114), /*auto_=*/false, + /*scaleFill=*/false, /*scaleUp=*/true, /*stride=*/32); + + // No dynamic shape in NCNN, so inputTensorShape is not used + + size_t size = static_cast(letterboxImage.rows) * static_cast(letterboxImage.cols) * 3; + blobPtr = new float[size]; + + return letterboxImage; +} + +std::vector YOLOv11SegDetectorNcnn::postprocess( + const cv::Size &origSize, + const cv::Size &letterboxSize, + const ncnn::Mat &outputs_boxes, + const ncnn::Mat &outputs_masks, + float confThreshold, + float iouThreshold) +{ + std::vector results; + + // output0: [num_features, num_boxes] + // output1: [32, maskH, maskW] + int num_features = outputs_boxes.h; + int num_boxes = outputs_boxes.w; + int maskC = outputs_masks.c; // Should be 32 + if (maskC != 32) + { + throw std::runtime_error("Expected 32 prototype masks in output1."); + } + int maskH = outputs_masks.h; + int maskW = outputs_masks.w; + + if (num_boxes == 0) + { + return results; // Early exit if no boxes + } + + const int numClasses = num_features - 4 - 32; + constexpr int BOX_OFFSET = 0; + constexpr int CLASS_CONF_OFFSET = 4; + const int 
MASK_COEFF_OFFSET = numClasses + CLASS_CONF_OFFSET; + + // 1. Process prototype masks + std::vector prototypeMasks; + prototypeMasks.reserve(32); + for (int m = 0; m < 32; ++m) + { + // Each mask is maskH x maskW + cv::Mat proto(maskH, maskW, CV_32F, (void *)outputs_masks.channel(m).data); + prototypeMasks.emplace_back(proto.clone()); + } + + // 2. Process detections + std::vector boxes; + std::vector confidences; + std::vector classIds; + std::vector> maskCoefficientsList; + + for (int i = 0; i < num_boxes; ++i) + { + // Extract box coordinates + float xc = outputs_boxes.row(BOX_OFFSET + 0)[i]; + float yc = outputs_boxes.row(BOX_OFFSET + 1)[i]; + float w = outputs_boxes.row(BOX_OFFSET + 2)[i]; + float h = outputs_boxes.row(BOX_OFFSET + 3)[i]; + + BoundingBox box{ + static_cast(std::round(xc - w / 2.0f)), + static_cast(std::round(yc - h / 2.0f)), + static_cast(std::round(w)), + static_cast(std::round(h))}; + + // Get class confidence + float maxConf = 0.0f; + int classId = -1; + for (int c = 0; c < numClasses; ++c) + { + float conf = outputs_boxes.row(CLASS_CONF_OFFSET + c)[i]; + if (conf > maxConf) + { + maxConf = conf; + classId = c; + } + } + + if (maxConf < confThreshold) + continue; + + boxes.push_back(box); + confidences.push_back(maxConf); + classIds.push_back(classId); + + // Mask coefficients + std::vector maskCoeffs(32); + for (int m = 0; m < 32; ++m) + { + maskCoeffs[m] = outputs_boxes.row(MASK_COEFF_OFFSET + m)[i]; + } + maskCoefficientsList.emplace_back(std::move(maskCoeffs)); + } + + if (boxes.empty()) + return results; + + // 3. Apply NMS + std::vector nmsIndices; + utils::NMSBoxes(boxes, confidences, confThreshold, iouThreshold, nmsIndices); + + if (nmsIndices.empty()) + return results; + + // 4. 
Prepare final results + results.reserve(nmsIndices.size()); + + const float gain = std::min(static_cast(letterboxSize.height) / origSize.height, + static_cast(letterboxSize.width) / origSize.width); + const int scaledW = static_cast(origSize.width * gain); + const int scaledH = static_cast(origSize.height * gain); + const float padW = (letterboxSize.width - scaledW) / 2.0f; + const float padH = (letterboxSize.height - scaledH) / 2.0f; + + const float maskScaleX = static_cast(maskW) / letterboxSize.width; + const float maskScaleY = static_cast(maskH) / letterboxSize.height; + + for (const int idx : nmsIndices) + { + Segmentation seg; + seg.box = boxes[idx]; + seg.conf = confidences[idx]; + seg.classId = classIds[idx]; + + // 5. Scale box to original image + seg.box = utils::scaleCoords(letterboxSize, seg.box, origSize, true); + + // 6. Process mask + const auto &maskCoeffs = maskCoefficientsList[idx]; + + // Linear combination of prototype masks + cv::Mat finalMask = cv::Mat::zeros(maskH, maskW, CV_32F); + for (int m = 0; m < 32; ++m) + { + finalMask += maskCoeffs[m] * prototypeMasks[m]; + } + + // Apply sigmoid activation + finalMask = utils::sigmoid(finalMask); + + // Crop mask to letterbox area with a slight padding to avoid border issues + int x1 = static_cast(std::round((padW - 0.1f) * maskScaleX)); + int y1 = static_cast(std::round((padH - 0.1f) * maskScaleY)); + int x2 = static_cast(std::round((letterboxSize.width - padW + 0.1f) * maskScaleX)); + int y2 = static_cast(std::round((letterboxSize.height - padH + 0.1f) * maskScaleY)); + + x1 = std::max(0, std::min(x1, maskW - 1)); + y1 = std::max(0, std::min(y1, maskH - 1)); + x2 = std::max(x1, std::min(x2, maskW)); + y2 = std::max(y1, std::min(y2, maskH)); + + if (x2 <= x1 || y2 <= y1) + continue; + + cv::Rect cropRect(x1, y1, x2 - x1, y2 - y1); + cv::Mat croppedMask = finalMask(cropRect).clone(); + + cv::Mat resizedMask; + cv::resize(croppedMask, resizedMask, origSize, 0, 0, cv::INTER_LINEAR); + + cv::Mat 
binaryMask; + cv::threshold(resizedMask, binaryMask, 0.5, 255.0, cv::THRESH_BINARY); + binaryMask.convertTo(binaryMask, CV_8U); + + cv::Mat finalBinaryMask = cv::Mat::zeros(origSize, CV_8U); + cv::Rect roi(seg.box.x, seg.box.y, seg.box.width, seg.box.height); + roi &= cv::Rect(0, 0, binaryMask.cols, binaryMask.rows); + if (roi.area() > 0) + { + binaryMask(roi).copyTo(finalBinaryMask(roi)); + } + + seg.mask = finalBinaryMask; + results.push_back(seg); + } + + return results; +} + +std::vector YOLOv11SegDetectorNcnn::segment(const cv::Mat &image, + float confThreshold, + float iouThreshold) +{ + static int counter = 0; + + cv::Mat letterboxImage; + utils::letterBox(image, letterboxImage, inputImageShape, + cv::Scalar(114, 114, 114), /*auto_=*/false, + /*scaleFill=*/false, /*scaleUp=*/true, /*stride=*/32); + + ncnn::Mat in = ncnn::Mat::from_pixels_resize(letterboxImage.data, ncnn::Mat::PIXEL_BGR2RGB, letterboxImage.cols, letterboxImage.rows, 640, 640); + + const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f}; + in.substract_mean_normalize(0, norm_vals); + + ncnn::Extractor ex = net.create_extractor(); + ex.input("in0", in); // adjust input name as needed + + ncnn::Mat out_boxes, out_masks; + ex.extract("out0", out_boxes); // [num, 6] (x, y, w, h, conf, class) + ex.extract("out1", out_masks); // [num, mask_dim, mask_dim] + + return postprocess(image.size(), inputImageShape, out_boxes, out_masks, confThreshold, iouThreshold); +} diff --git a/src/yolo11segncnn.h b/src/yolo11segncnn.h new file mode 100644 index 00000000..02c24c62 --- /dev/null +++ b/src/yolo11segncnn.h @@ -0,0 +1,70 @@ +#ifndef YOLO11SEGNCNN_H +#define YOLO11SEGNCNN_H + +// Original Author: Abdalrahman M. 
Amer, www.linkedin.com/in/abdalrahman-m-amer +// Date: 25.01.2025 +// Modified for use in photbooth + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "segmentation.h" + + +class YOLOv11SegDetectorNcnn : public Yolo11Segementation { +public: + YOLOv11SegDetectorNcnn(const std::string &modelPath, + const std::string &labelsPath, + bool useGPU = false); + + // Main API + std::vector segment(const cv::Mat &image, + float confThreshold = CONFIDENCE_THRESHOLD, + float iouThreshold = IOU_THRESHOLD); +private: + ncnn::Net net; + + bool isDynamicInputShape{false}; + cv::Size inputImageShape; + + std::vector inputNames; + std::vector outputNames; + + size_t numInputNodes = 0; + size_t numOutputNodes = 0; + + // Helpers + cv::Mat preprocess(const cv::Mat &image, + float *&blobPtr, + std::vector &inputTensorShape); + + std::vector postprocess(const cv::Size &origSize, + const cv::Size &letterboxSize, + const ncnn::Mat &outputs_boxes, + const ncnn::Mat &outputs_masks, + float confThreshold, + float iouThreshold); + + static constexpr float CONFIDENCE_THRESHOLD = 0.40f; // Filter boxes below this confidence + static constexpr float IOU_THRESHOLD = 0.45f; // NMS IoU threshold + static constexpr float MASK_THRESHOLD = 0.40f; // Slightly lower to capture partial objects + +}; + + +#endif // YOLO11SEG_H diff --git a/src/yolo11segonnx.cpp b/src/yolo11segonnx.cpp new file mode 100644 index 00000000..67ba1775 --- /dev/null +++ b/src/yolo11segonnx.cpp @@ -0,0 +1,375 @@ +#include "yolo11segonnx.h" +#include +#include "utils.h" + +// Original Author: Abdalrahman M. 
Amer, www.linkedin.com/in/abdalrahman-m-amer +// Date: 25.01.2025 +// Modified for use in photbooth + +YOLOv11SegDetectorOnnx::YOLOv11SegDetectorOnnx(const std::string &modelPath, + const std::string &labelsPath, + bool useGPU) + : Yolo11Segementation(labelsPath), env(ORT_LOGGING_LEVEL_WARNING, "YOLOv11Seg") +{ + sessionOptions.SetIntraOpNumThreads(std::min(8, static_cast(std::thread::hardware_concurrency()))); + sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + + std::vector providers = Ort::GetAvailableProviders(); + if (useGPU && std::find(providers.begin(), providers.end(), "CUDAExecutionProvider") != providers.end()) + { + OrtCUDAProviderOptions cudaOptions; + sessionOptions.AppendExecutionProvider_CUDA(cudaOptions); + qDebug() << "[INFO] Using GPU (CUDA) for YOLOv11 Seg inference."; + } + else + { + qDebug() << "[INFO] Using CPU for YOLOv11 Seg inference."; + } + + QFile modelFile(getModelRessourcePath(modelPath.c_str()).c_str()); + + if (!modelFile.open(QIODevice::ReadOnly)) + { + qWarning() << "Failed to open the model file!"; + } + + QByteArray binaryData = modelFile.readAll(); + +#ifdef _WIN32 + std::wstring w_modelPath(modelPath.begin(), modelPath.end()); + session = Ort::Session(env, w_modelPath.c_str(), sessionOptions); +#else + session = Ort::Session(env, binaryData.data(), binaryData.size(), sessionOptions); +#endif + + numInputNodes = session.GetInputCount(); + numOutputNodes = session.GetOutputCount(); + + Ort::AllocatorWithDefaultOptions allocator; + + // Input + { + auto inNameAlloc = session.GetInputNameAllocated(0, allocator); + inputNameAllocs.emplace_back(std::move(inNameAlloc)); + inputNames.push_back(inputNameAllocs.back().get()); + + auto inTypeInfo = session.GetInputTypeInfo(0); + auto inShape = inTypeInfo.GetTensorTypeAndShapeInfo().GetShape(); + + if (inShape.size() == 4) + { + if (inShape[2] == -1 || inShape[3] == -1) + { + isDynamicInputShape = true; + inputImageShape = cv::Size(640, 640); // Fallback 
if dynamic + } + else + { + inputImageShape = cv::Size(static_cast(inShape[3]), static_cast(inShape[2])); + } + } + else + { + throw std::runtime_error("Model input is not 4D! Expect [N, C, H, W]."); + } + } + + // Outputs + if (numOutputNodes != 2) + { + throw std::runtime_error("Expected exactly 2 output nodes: output0 and output1."); + } + + for (size_t i = 0; i < numOutputNodes; ++i) + { + auto outNameAlloc = session.GetOutputNameAllocated(i, allocator); + outputNameAllocs.emplace_back(std::move(outNameAlloc)); + outputNames.push_back(outputNameAllocs.back().get()); + } + + qDebug() << "[INFO] YOLOv11Seg loaded: " << modelPath; + qDebug() << " Input shape: " << inputImageShape.height << "x" << inputImageShape.width + << (isDynamicInputShape ? " (dynamic)" : ""); + qDebug() << " #Outputs : " << numOutputNodes; + qDebug() << " #Classes : " << classNames.size(); +} + +cv::Mat YOLOv11SegDetectorOnnx::preprocess(const cv::Mat &image, + float *&blobPtr, + std::vector &inputTensorShape) +{ + cv::Mat letterboxImage; + utils::letterBox(image, letterboxImage, inputImageShape, + cv::Scalar(114, 114, 114), /*auto_=*/isDynamicInputShape, + /*scaleFill=*/false, /*scaleUp=*/true, /*stride=*/32); + + // Update if dynamic + inputTensorShape[2] = static_cast(letterboxImage.rows); + inputTensorShape[3] = static_cast(letterboxImage.cols); + + letterboxImage.convertTo(letterboxImage, CV_32FC3, 1.0f / 255.0f); + + size_t size = static_cast(letterboxImage.rows) * static_cast(letterboxImage.cols) * 3; + blobPtr = new float[size]; + + std::vector channels(3); + for (int c = 0; c < 3; ++c) + { + channels[c] = cv::Mat(letterboxImage.rows, letterboxImage.cols, CV_32FC1, + blobPtr + c * (letterboxImage.rows * letterboxImage.cols)); + } + cv::split(letterboxImage, channels); + + return letterboxImage; +} + +std::vector YOLOv11SegDetectorOnnx::postprocess( + const cv::Size &origSize, + const cv::Size &letterboxSize, + const std::vector &outputs, + float confThreshold, + float iouThreshold) 
+{ + std::vector results; + + // Validate outputs size + if (outputs.size() < 2) + { + throw std::runtime_error("Insufficient outputs from the model. Expected at least 2 outputs."); + } + + // Extract outputs + const float *output0_ptr = outputs[0].GetTensorData(); + const float *output1_ptr = outputs[1].GetTensorData(); + + // Get shapes + auto shape0 = outputs[0].GetTensorTypeAndShapeInfo().GetShape(); // [1, 116, num_detections] + auto shape1 = outputs[1].GetTensorTypeAndShapeInfo().GetShape(); // [1, 32, maskH, maskW] + + if (shape1.size() != 4 || shape1[0] != 1 || shape1[1] != 32) + throw std::runtime_error("Unexpected output1 shape. Expected [1, 32, maskH, maskW]."); + + const size_t num_features = shape0[1]; // e.g 80 class + 4 bbox parms + 32 seg masks = 116 + const size_t num_detections = shape0[2]; + + // Early exit if no detections + if (num_detections == 0) + { + return results; + } + + const int numClasses = static_cast(num_features - 4 - 32); // Corrected number of classes + + // Validate numClasses + if (numClasses <= 0) + { + throw std::runtime_error("Invalid number of classes."); + } + + const int numBoxes = static_cast(num_detections); + const int maskH = static_cast(shape1[2]); + const int maskW = static_cast(shape1[3]); + + // Constants from model architecture + constexpr int BOX_OFFSET = 0; + constexpr int CLASS_CONF_OFFSET = 4; + const int MASK_COEFF_OFFSET = numClasses + CLASS_CONF_OFFSET; + + // 1. Process prototype masks + // Store all prototype masks in a vector for easy access + std::vector prototypeMasks; + prototypeMasks.reserve(32); + for (int m = 0; m < 32; ++m) + { + // Each mask is maskH x maskW + cv::Mat proto(maskH, maskW, CV_32F, const_cast(output1_ptr + m * maskH * maskW)); + prototypeMasks.emplace_back(proto.clone()); // Clone to ensure data integrity + } + + // 2. 
Process detections + std::vector boxes; + boxes.reserve(numBoxes); + std::vector confidences; + confidences.reserve(numBoxes); + std::vector classIds; + classIds.reserve(numBoxes); + std::vector> maskCoefficientsList; + maskCoefficientsList.reserve(numBoxes); + + for (int i = 0; i < numBoxes; ++i) + { + // Extract box coordinates + float xc = output0_ptr[BOX_OFFSET * numBoxes + i]; + float yc = output0_ptr[(BOX_OFFSET + 1) * numBoxes + i]; + float w = output0_ptr[(BOX_OFFSET + 2) * numBoxes + i]; + float h = output0_ptr[(BOX_OFFSET + 3) * numBoxes + i]; + + // Convert to xyxy format + BoundingBox box{ + static_cast(std::round(xc - w / 2.0f)), + static_cast(std::round(yc - h / 2.0f)), + static_cast(std::round(w)), + static_cast(std::round(h))}; + + // Get class confidence + float maxConf = 0.0f; + int classId = -1; + for (int c = 0; c < numClasses; ++c) + { + float conf = output0_ptr[(CLASS_CONF_OFFSET + c) * numBoxes + i]; + if (conf > maxConf) + { + maxConf = conf; + classId = c; + } + } + + if (maxConf < confThreshold) + continue; + + // Store detection + boxes.push_back(box); + confidences.push_back(maxConf); + classIds.push_back(classId); + + // Store mask coefficients + std::vector maskCoeffs(32); + for (int m = 0; m < 32; ++m) + { + maskCoeffs[m] = output0_ptr[(MASK_COEFF_OFFSET + m) * numBoxes + i]; + } + maskCoefficientsList.emplace_back(std::move(maskCoeffs)); + } + + // Early exit if no boxes after confidence threshold + if (boxes.empty()) + { + return results; + } + + // 3. Apply NMS + std::vector nmsIndices; + utils::NMSBoxes(boxes, confidences, confThreshold, iouThreshold, nmsIndices); + + if (nmsIndices.empty()) + { + return results; + } + + // 4. 
Prepare final results + results.reserve(nmsIndices.size()); + + // Calculate letterbox parameters + const float gain = std::min(static_cast(letterboxSize.height) / origSize.height, + static_cast(letterboxSize.width) / origSize.width); + const int scaledW = static_cast(origSize.width * gain); + const int scaledH = static_cast(origSize.height * gain); + const float padW = (letterboxSize.width - scaledW) / 2.0f; + const float padH = (letterboxSize.height - scaledH) / 2.0f; + + // Precompute mask scaling factors + const float maskScaleX = static_cast(maskW) / letterboxSize.width; + const float maskScaleY = static_cast(maskH) / letterboxSize.height; + + for (const int idx : nmsIndices) + { + Segmentation seg; + seg.box = boxes[idx]; + seg.conf = confidences[idx]; + seg.classId = classIds[idx]; + + // 5. Scale box to original image + seg.box = utils::scaleCoords(letterboxSize, seg.box, origSize, true); + + // 6. Process mask + const auto &maskCoeffs = maskCoefficientsList[idx]; + + // Linear combination of prototype masks + cv::Mat finalMask = cv::Mat::zeros(maskH, maskW, CV_32F); + for (int m = 0; m < 32; ++m) + { + finalMask += maskCoeffs[m] * prototypeMasks[m]; + } + + // Apply sigmoid activation + finalMask = utils::sigmoid(finalMask); + + // Crop mask to letterbox area with a slight padding to avoid border issues + int x1 = static_cast(std::round((padW - 0.1f) * maskScaleX)); + int y1 = static_cast(std::round((padH - 0.1f) * maskScaleY)); + int x2 = static_cast(std::round((letterboxSize.width - padW + 0.1f) * maskScaleX)); + int y2 = static_cast(std::round((letterboxSize.height - padH + 0.1f) * maskScaleY)); + + // Ensure coordinates are within mask bounds + x1 = std::max(0, std::min(x1, maskW - 1)); + y1 = std::max(0, std::min(y1, maskH - 1)); + x2 = std::max(x1, std::min(x2, maskW)); + y2 = std::max(y1, std::min(y2, maskH)); + + // Handle cases where cropping might result in zero area + if (x2 <= x1 || y2 <= y1) + { + // Skip this mask as cropping is invalid + 
continue; + } + + cv::Rect cropRect(x1, y1, x2 - x1, y2 - y1); + cv::Mat croppedMask = finalMask(cropRect).clone(); // Clone to ensure data integrity + + // Resize to original dimensions + cv::Mat resizedMask; + cv::resize(croppedMask, resizedMask, origSize, 0, 0, cv::INTER_LINEAR); + + // Threshold and convert to binary + cv::Mat binaryMask; + cv::threshold(resizedMask, binaryMask, 0.5, 255.0, cv::THRESH_BINARY); + binaryMask.convertTo(binaryMask, CV_8U); + + // Crop to bounding box + cv::Mat finalBinaryMask = cv::Mat::zeros(origSize, CV_8U); + cv::Rect roi(seg.box.x, seg.box.y, seg.box.width, seg.box.height); + roi &= cv::Rect(0, 0, binaryMask.cols, binaryMask.rows); // Ensure ROI is within mask + if (roi.area() > 0) + { + binaryMask(roi).copyTo(finalBinaryMask(roi)); + } + + seg.mask = finalBinaryMask; + results.push_back(seg); + } + + return results; +} + +std::vector YOLOv11SegDetectorOnnx::segment(const cv::Mat &image, + float confThreshold, + float iouThreshold) +{ + + float *blobPtr = nullptr; + std::vector inputShape = {1, 3, inputImageShape.height, inputImageShape.width}; + cv::Mat letterboxImg = preprocess(image, blobPtr, inputShape); + + size_t inputSize = utils::vectorProduct(inputShape); + std::vector inputVals(blobPtr, blobPtr + inputSize); + delete[] blobPtr; + + Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::Value inputTensor = Ort::Value::CreateTensor( + memInfo, + inputVals.data(), + inputSize, + inputShape.data(), + inputShape.size()); + + std::vector outputs = session.Run( + Ort::RunOptions{nullptr}, + inputNames.data(), + &inputTensor, + numInputNodes, + outputNames.data(), + numOutputNodes); + + cv::Size letterboxSize(static_cast(inputShape[3]), static_cast(inputShape[2])); + return postprocess(image.size(), letterboxSize, outputs, confThreshold, iouThreshold); +} diff --git a/src/yolo11seg.h b/src/yolo11segonnx.h similarity index 53% rename from src/yolo11seg.h rename to src/yolo11segonnx.h 
index 9dad2f4a..b0d1b185 100644 --- a/src/yolo11seg.h +++ b/src/yolo11segonnx.h @@ -1,5 +1,5 @@ -#ifndef YOLO11SEG_H -#define YOLO11SEG_H +#ifndef YOLO11SEGONNX_H +#define YOLO11SEGONNX_H // Original Author: Abdalrahman M. Amer, www.linkedin.com/in/abdalrahman-m-amer // Date: 25.01.2025 @@ -22,6 +22,8 @@ #include +#include "segmentation.h" + // ============================================================================ // Constants / Thresholds // ============================================================================ @@ -29,45 +31,13 @@ static const float CONFIDENCE_THRESHOLD = 0.40f; // Filter boxes below this conf static const float IOU_THRESHOLD = 0.45f; // NMS IoU threshold static const float MASK_THRESHOLD = 0.40f; // Slightly lower to capture partial objects -// ============================================================================ -// Structs -// ============================================================================ -struct BoundingBox { - int x{0}; - int y{0}; - int width{0}; - int height{0}; - - BoundingBox() = default; - BoundingBox(int _x, int _y, int w, int h) - : x(_x), y(_y), width(w), height(h) {} - - float area() const { return static_cast(width * height); } - - BoundingBox intersect(const BoundingBox &other) const { - int xStart = std::max(x, other.x); - int yStart = std::max(y, other.y); - int xEnd = std::min(x + width, other.x + other.width); - int yEnd = std::min(y + height, other.y + other.height); - int iw = std::max(0, xEnd - xStart); - int ih = std::max(0, yEnd - yStart); - return BoundingBox(xStart, yStart, iw, ih); - } -}; - -struct Segmentation { - BoundingBox box; - float conf{0.f}; - int classId{0}; - cv::Mat mask; // Single-channel (8UC1) mask in full resolution -}; // ============================================================================ // YOLOv11SegDetector Class // ============================================================================ -class YOLOv11SegDetector { +class YOLOv11SegDetectorOnnx : public 
Yolo11Segementation { public: - YOLOv11SegDetector(const std::string &modelPath, + YOLOv11SegDetectorOnnx(const std::string &modelPath, const std::string &labelsPath, bool useGPU = false); @@ -76,22 +46,6 @@ class YOLOv11SegDetector { float confThreshold = CONFIDENCE_THRESHOLD, float iouThreshold = IOU_THRESHOLD); - // Draw results - void drawSegmentationsAndBoxes(cv::Mat &image, - const std::vector &results, - float maskAlpha = 0.5f) const; - - void drawSegmentations(cv::Mat &image, - const std::vector &results, - float maskAlpha = 0.5f) const; - - void drawSegmentationMask(cv::Mat &image, - const std::vector &results, - const std::vector &classesFilter) const; - // Accessors - const std::vector &getClassNames() const { return classNames; } - const std::vector &getClassColors() const { return classColors; } - private: Ort::Env env; Ort::SessionOptions sessionOptions; @@ -108,9 +62,6 @@ class YOLOv11SegDetector { size_t numInputNodes = 0; size_t numOutputNodes = 0; - std::vector classNames; - std::vector classColors; - // Helpers cv::Mat preprocess(const cv::Mat &image, float *&blobPtr, @@ -124,4 +75,4 @@ class YOLOv11SegDetector { }; -#endif // YOLO11SEG_H +#endif // YOLO11SEGONNX_H diff --git a/src/yolobackend.h b/src/yolobackend.h new file mode 100644 index 00000000..a309f975 --- /dev/null +++ b/src/yolobackend.h @@ -0,0 +1,13 @@ +#ifndef YOLOBACKEND_H +#define YOLOBACKEND_H + + +#include "segmentation.h" + +class YoloBackend { +public: + virtual ~YoloBackend() = default; + virtual std::vector segment(const cv::Mat &image, float confThreshold, float iouThreshold) = 0; +}; + +#endif // YOLOBACKEND_H diff --git a/yolomodel.large.qrc b/yolomodel.large.qrc deleted file mode 100644 index bfd9853e..00000000 --- a/yolomodel.large.qrc +++ /dev/null @@ -1,5 +0,0 @@ - - - models/yolo11l-seg.onnx - - diff --git a/yolomodel.small.qrc b/yolomodel.small.qrc deleted file mode 100644 index 8a142bd0..00000000 --- a/yolomodel.small.qrc +++ /dev/null @@ -1,5 +0,0 @@ - - - 
models/yolo11n-seg.onnx - -