rouault
diff --git a/alg/gdal_tps.cpp b/alg/gdal_tps.cpp
Lines changed: 73 additions & 6 deletions
diff --git a/alg/gdallinearsystem.cpp b/alg/gdallinearsystem.cpp
Lines changed: 59 additions & 24 deletions
diff --git a/alg/gdallinearsystem.h b/alg/gdallinearsystem.h
Lines changed: 2 additions & 1 deletion
diff --git a/alg/gdaltransformer.cpp b/alg/gdaltransformer.cpp
Lines changed: 6 additions & 1 deletion
@@ -16,6 +16,8 @@
 
 #include <stdlib.h>
 #include <string.h>
+
+#include <algorithm>
 #include <map>
 #include <utility>
 
@@ -25,6 +27,7 @@
 #include "cpl_minixml.h"
 #include "cpl_multiproc.h"
 #include "cpl_string.h"
+#include "cpl_vsi.h"  // CPLGetUsablePhysicalRAM()
 #include "gdal.h"
 #include "gdal_alg.h"
 #include "gdal_alg_priv.h"
@@ -47,6 +50,7 @@ struct TPSTransformInfo
     double dfSrcApproxErrorReverse{};
 
     bool bReversed{};
+    bool bForceBuiltinMethod = false;
 
     std::vector<gdal::GCP> asGCPs{};
 
@@ -130,7 +134,8 @@ void *GDALCreateTPSTransformer(int nGCPCount, const GDAL_GCP *pasGCPList,
 static void GDALTPSComputeForwardInThread(void *pData)
 {
     TPSTransformInfo *psInfo = static_cast<TPSTransformInfo *>(pData);
-    psInfo->bForwardSolved = psInfo->poForward->solve() != 0;
+    psInfo->bForwardSolved =
+        psInfo->poForward->solve(psInfo->bForceBuiltinMethod) != 0;
 }
 
 void *GDALCreateTPSTransformerInt(int nGCPCount, const GDAL_GCP *pasGCPList,
@@ -233,6 +238,9 @@ void *GDALCreateTPSTransformerInt(int nGCPCount, const GDAL_GCP *pasGCPList,
         CSLFetchNameValueDef(papszOptions, "SRC_APPROX_ERROR_IN_PIXEL", "0"));
 
     int nThreads = 1;
+    bool bForceBuiltinMethod = false;
+    // Arbitrary threshold above which running the two solves in parallel
+    // is worthwhile, and above which available memory must be checked.
     if (nGCPCount > 100)
     {
         const char *pszWarpThreads =
@@ -243,23 +251,82 @@ void *GDALCreateTPSTransformerInt(int nGCPCount, const GDAL_GCP *pasGCPList,
             nThreads = CPLGetNumCPUs();
         else
             nThreads = atoi(pszWarpThreads);
+        nThreads = std::clamp(nThreads, 1, 2);
+
+        // Do sanity checks w.r.t. available RAM
+
+        const auto nRAM = CPLGetUsablePhysicalRAM();
+        if (nRAM > 0)
+        {
+            const int nMatrixSize = nGCPCount + 3;
+#ifdef HAVE_ARMADILLO
+            // Armadillo requires up to 3 matrices of size nMatrixSize x nMatrixSize
+            // for each transformation direction
+            constexpr int NUM_TEMP_MATRICES = 3;
+            if (nMatrixSize >
+                nRAM / (nThreads * NUM_TEMP_MATRICES * nMatrixSize *
+                        static_cast<int>(sizeof(double))))
+            {
+                if (nMatrixSize > nRAM / (NUM_TEMP_MATRICES * nMatrixSize *
+                                          static_cast<int>(sizeof(double))))
+                {
+                    CPLDebug("GDAL", "Not enough memory to use Armadillo "
+                                     "solver for thinplatespline. Falling back "
+                                     "to LU decomposition method");
+                    bForceBuiltinMethod = true;
+                }
+                else
+                {
+                    nThreads = 1;
+                }
+            }
+            if (bForceBuiltinMethod)
+#endif
+            {
+                if (nMatrixSize > nRAM / (nThreads * nMatrixSize *
+                                          static_cast<int>(sizeof(double))))
+                {
+                    nThreads = 1;
+                    if (nMatrixSize >
+                        nRAM / (nMatrixSize * static_cast<int>(sizeof(double))))
+                    {
+                        CPLError(CE_Failure, CPLE_OutOfMemory,
+                                 "thinplatespline: not enough memory. At least "
+                                 "%u MB are required",
+                                 static_cast<unsigned>(
+                                     static_cast<uint64_t>(nMatrixSize) *
+                                     nMatrixSize * sizeof(double) /
+                                     (1024 * 1024)));
+                        GDALDestroyTPSTransformer(psInfo);
+                        return nullptr;
+                    }
+                }
+            }
+        }
     }
 
-    if (nThreads > 1)
+    psInfo->bForceBuiltinMethod = bForceBuiltinMethod;
+
+    if (nThreads == 2)
     {
         // Compute direct and reverse transforms in parallel.
         CPLJoinableThread *hThread =
             CPLCreateJoinableThread(GDALTPSComputeForwardInThread, psInfo);
-        psInfo->bReverseSolved = psInfo->poReverse->solve() != 0;
+        psInfo->bReverseSolved =
+            psInfo->poReverse->solve(bForceBuiltinMethod) != 0;
         if (hThread != nullptr)
             CPLJoinThread(hThread);
         else
-            psInfo->bForwardSolved = psInfo->poForward->solve() != 0;
+            psInfo->bForwardSolved =
+                psInfo->poForward->solve(bForceBuiltinMethod) != 0;
     }
     else
     {
-        psInfo->bForwardSolved = psInfo->poForward->solve() != 0;
-        psInfo->bReverseSolved = psInfo->poReverse->solve() != 0;
+        psInfo->bForwardSolved =
+            psInfo->poForward->solve(bForceBuiltinMethod) != 0;
+        if (psInfo->bForwardSolved)
+            psInfo->bReverseSolved =
+                psInfo->poReverse->solve(bForceBuiltinMethod) != 0;
     }
 
     if (!psInfo->bForwardSolved || !psInfo->bReverseSolved)
 
@@ -30,7 +30,6 @@
 #include <cassert>
 #include <cmath>
 
-#ifndef HAVE_ARMADILLO
 namespace
 {
 // LU decomposition of the quadratic matrix A
@@ -47,8 +46,22 @@ bool solve(GDALMatrix &A, GDALMatrix &RHS, GDALMatrix &X, double eps)
     for (int iRow = 0; iRow < m; ++iRow)
         perm[iRow] = iRow;
 
+    // Arbitrary threshold above which progress is logged through CPLDebug()
+    const bool bDebug = (m > 10000);
+    int nLastPct = -1;
+
     for (int step = 0; step < m - 1; ++step)
     {
+        if (bDebug)
+        {
+            const int nPct = (step * 100 * 10 / m) / 2;
+            if (nPct != nLastPct)
+            {
+                CPLDebug("GDAL", "solve(): %d.%d %%", nPct / 10, nPct % 10);
+                nLastPct = nPct;
+            }
+        }
+
         // determine pivot element
         int iMax = step;
         double dMax = std::abs(A(step, step));
@@ -91,6 +104,16 @@ bool solve(GDALMatrix &A, GDALMatrix &RHS, GDALMatrix &X, double eps)
     // LUP solve;
     for (int iCol = 0; iCol < n; ++iCol)
     {
+        if (bDebug)
+        {
+            const int nPct = 500 + (iCol * 100 * 10 / n) / 2;
+            if (nPct != nLastPct)
+            {
+                CPLDebug("GDAL", "solve(): %d.%d %%", nPct / 10, nPct % 10);
+                nLastPct = nPct;
+            }
+        }
+
         for (int iRow = 0; iRow < m; ++iRow)
         {
             X(iRow, iCol) = RHS(perm[iRow], iCol);
@@ -108,48 +131,60 @@ bool solve(GDALMatrix &A, GDALMatrix &RHS, GDALMatrix &X, double eps)
             X(iRow, iCol) /= A(iRow, iRow);
         }
     }
+
+    if (bDebug)
+    {
+        CPLDebug("GDAL", "solve(): 100.0 %%");
+    }
+
     return true;
 }
 }  // namespace
-#endif
+
 /************************************************************************/
 /*                       GDALLinearSystemSolve()                        */
 /*                                                                      */
 /*   Solves the linear system A*X_i = RHS_i for each column i           */
 /*   where A is a square matrix.                                        */
 /************************************************************************/
-bool GDALLinearSystemSolve(GDALMatrix &A, GDALMatrix &RHS, GDALMatrix &X)
+bool GDALLinearSystemSolve(GDALMatrix &A, GDALMatrix &RHS, GDALMatrix &X,
+                           [[maybe_unused]] bool bForceBuiltinMethod)
 {
     assert(A.getNumRows() == RHS.getNumRows());
     assert(A.getNumCols() == X.getNumRows());
     assert(RHS.getNumCols() == X.getNumCols());
-    try
-    {
+
 #ifdef HAVE_ARMADILLO
-        arma::mat matA(A.data(), A.getNumRows(), A.getNumCols(), false, true);
-        arma::mat matRHS(RHS.data(), RHS.getNumRows(), RHS.getNumCols(), false,
-                         true);
-        arma::mat matOut(X.data(), X.getNumRows(), X.getNumCols(), false, true);
+    if (!bForceBuiltinMethod)
+    {
+        try
+        {
+            arma::mat matA(A.data(), A.getNumRows(), A.getNumCols(), false,
+                           true);
+            arma::mat matRHS(RHS.data(), RHS.getNumRows(), RHS.getNumCols(),
+                             false, true);
+            arma::mat matOut(X.data(), X.getNumRows(), X.getNumCols(), false,
+                             true);
 #if ARMA_VERSION_MAJOR > 6 ||                                                  \
     (ARMA_VERSION_MAJOR == 6 && ARMA_VERSION_MINOR >= 500)
-        // Perhaps available in earlier versions, but didn't check
-        return arma::solve(matOut, matA, matRHS,
-                           arma::solve_opts::equilibrate +
-                               arma::solve_opts::no_approx);
+            // Perhaps available in earlier versions, but didn't check
+            return arma::solve(matOut, matA, matRHS,
+                               arma::solve_opts::equilibrate +
+                                   arma::solve_opts::no_approx);
 #else
-        return arma::solve(matOut, matA, matRHS);
-#endif
-
-#else  // HAVE_ARMADILLO
-        return solve(A, RHS, X, 0);
+            return arma::solve(matOut, matA, matRHS);
 #endif
+        }
+        catch (std::exception const &e)
+        {
+            CPLError(CE_Failure, CPLE_AppDefined, "GDALLinearSystemSolve: %s",
+                     e.what());
+            return false;
+        }
     }
-    catch (std::exception const &e)
-    {
-        CPLError(CE_Failure, CPLE_AppDefined, "GDALLinearSystemSolve: %s",
-                 e.what());
-        return false;
-    }
+#endif  // HAVE_ARMADILLO
+
+    return solve(A, RHS, X, 0);
 }
 
 /*! @endcond */
@@ -84,7 +84,8 @@ struct GDALMatrix
     std::vector<double> v;
 };
 
-bool GDALLinearSystemSolve(GDALMatrix &A, GDALMatrix &RHS, GDALMatrix &X);
+bool GDALLinearSystemSolve(GDALMatrix &A, GDALMatrix &RHS, GDALMatrix &X,
+                           bool bForceBuiltinMethod = false);
 
 #endif /* #ifndef GDALLINEARSYSTEM_H_INCLUDED */
 
 
@@ -2083,7 +2083,8 @@ const char *GDALGetGenImgProjTranformerOptionList(void)
  * </li>
  * <li> MAX_GCP_ORDER: the maximum order to use for GCP derived polynomials if
  * possible.  The default is to autoselect based on the number of GCPs.
- * A value of -1 triggers use of Thin Plate Spline instead of polynomials.
+ * A value of -1 triggers use of Thin Plate Spline instead of polynomials if
+ * SRC_METHOD/DST_METHOD is not specified.
  * </li>
  * <li>GCP_ANTIMERIDIAN_UNWRAP=AUTO/YES/NO. (GDAL &gt;= 3.8) Whether to
  * "unwrap" longitudes of ground control points that span the antimeridian.
@@ -2100,6 +2101,8 @@ const char *GDALGetGenImgProjTranformerOptionList(void)
  * method to be considered on the source dataset. Will be used for pixel/line
  * to georef transformation on the source dataset. NO_GEOTRANSFORM can be
  * used to specify the identity geotransform (ungeoreferenced image)
+ * Note: using GCP_TPS with more than a few thousand GCPs requires
+ * significant RAM (at least numGCPs * numGCPs * 8 bytes) and processing time.
  * </li>
  * <li> DST_METHOD: may have a value which is one of GEOTRANSFORM,
  * GCP_POLYNOMIAL, GCP_HOMOGRAPHY, GCP_TPS, GEOLOC_ARRAY (added in 3.5), RPC to
@@ -2108,6 +2111,8 @@ const char *GDALGetGenImgProjTranformerOptionList(void)
  * pixel/line to georef transformation on the destination dataset.
  * NO_GEOTRANSFORM can be used to specify the identity geotransform
  * (ungeoreferenced image)
+ * Note: using GCP_TPS with more than a few thousand GCPs requires
+ * significant RAM (at least numGCPs * numGCPs * 8 bytes) and processing time.
  * </li>
  * <li> RPC_HEIGHT: A fixed height to be used with RPC
  * calculations. If RPC_HEIGHT and RPC_DEM are not specified but that the RPC