GerHobbelt
diff --git a/‎include/tesseract/baseapi.h
Lines changed: 32 additions & 0 deletions b/‎include/tesseract/baseapi.h
Lines changed: 32 additions & 0 deletions
diff --git a/‎include/tesseract/memcost_estimate.h
Lines changed: 62 additions & 0 deletions b/‎include/tesseract/memcost_estimate.h
Lines changed: 62 additions & 0 deletions
diff --git a/‎src/api/baseapi.cpp
Lines changed: 79 additions & 0 deletions b/‎src/api/baseapi.cpp
Lines changed: 79 additions & 0 deletions
diff --git a/‎src/ccmain/tesseractclass.cpp
Lines changed: 41 additions & 0 deletions b/‎src/ccmain/tesseractclass.cpp
Lines changed: 41 additions & 0 deletions
diff --git a/‎src/ccmain/tesseractclass.h
Lines changed: 14 additions & 0 deletions b/‎src/ccmain/tesseractclass.h
Lines changed: 14 additions & 0 deletions
diff --git a/‎src/ccmain/thresholder.cpp
Lines changed: 14 additions & 3 deletions b/‎src/ccmain/thresholder.cpp
Lines changed: 14 additions & 3 deletions
@@ -28,6 +28,7 @@
 #include "unichar.h"
 
 #include <tesseract/version.h>
+#include <tesseract/memcost_estimate.h>  // for ImageCostEstimate
 
 #include <cstdio>
 #include <vector> // for std::vector
@@ -114,6 +115,37 @@ class TESS_API TessBaseAPI {
   int GetSourceYResolution();
   const char *GetDatapath();
 
+  /**
+  * Return a memory capacity cost estimate for the given image dimensions and
+  * some heuristics re tesseract behaviour, e.g. input images will be normalized/greyscaled,
+  * then thresholded, all of which will be kept in memory while the session runs.
+  *
+  * Also uses the Tesseract Variable `allowed_image_memory_capacity` to indicate
+  * whether the estimated cost is oversized --> `cost.is_too_large()`
+  *
+  * For user convenience, static functions are provided:
+  * the static functions MAY be used by userland code *before* the high cost of
+  * instantiating a Tesseract instance is incurred.
+  */
+  static ImageCostEstimate EstimateImageMemoryCost(int image_width, int image_height, float allowance = 1.0e30f /* a.k.a.dont_care, use system limit and be done */ );
+  static ImageCostEstimate EstimateImageMemoryCost(const Pix* pix, float allowance = 1.0e30f /* a.k.a. dont_care, use system limit and be done */ );
+
+  /**
+  * Ditto, but this API may be invoked after SetInputImage() or equivalent has been called
+  * and reports the cost estimate for the current instance/image.
+  */
+  ImageCostEstimate EstimateImageMemoryCost() const;
+
+  /**
+  * Helper, which may be invoked after SetInputImage() or equivalent has been called:
+  * reports the cost estimate for the current instance/image via `tprintf()` and returns
+  * `true` when the cost is expected to be too high.
+  *
+  * You can use this as a fast pre-flight check. Many major tesseract APIs perform
+  * this same check as part of their startup routine.
+  */
+  bool CheckAndReportIfImageTooLarge(const Pix* pix = nullptr /* default: use GetInputImage() data */ ) const;
+
   /** Set the name of the bonus output files. Needed only for debugging. */
   void SetOutputName(const char *name);
 
 
@@ -0,0 +1,62 @@
+/**********************************************************************
+ * File:        memcost_estimate.h
+ * Description: Image memory capacity cost estimate report (ImageCostEstimate)
+ * Author:      Ger Hobbelt
+ *
+ * (C) Copyright 1990, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef T_MEMCOST_ESTIMATE_H
+#define T_MEMCOST_ESTIMATE_H
+
+#include <string>
+
+namespace tesseract {
+
+  // Image memory capacity cost estimate report. Cost is measured in BYTES. Cost is reported
+  // (`to_string()`) in GBYTES.
+  //
+  // Uses `allowed_image_memory_capacity` plus some compile-time heuristics to indicate
+  // whether the estimated cost is oversized --> `cost.is_too_large()`
+  struct ImageCostEstimate {
+    float cost;  // estimated memory cost, measured in BYTES
+
+  protected:
+    float allowed_image_memory_capacity;  // allowance this estimate is judged against; presumably in bytes like `cost` -- TODO confirm
+
+  public:
+    ImageCostEstimate()  // default: zero cost, paired with the 1e30f allowance
+      : ImageCostEstimate(0.0f, 1e30f) {
+    }
+
+    ImageCostEstimate(float c, float allowance = 1e30f);  // defined out of line; not `explicit`, so a bare float converts implicitly
+
+    static float get_max_system_allowance();  // defined out of line; NOTE(review): presumably the system-wide upper bound for the allowance -- confirm
+
+    float get_max_allowance() const;  // NOTE(review): presumably returns `allowed_image_memory_capacity` -- confirm
+
+    void set_max_allowance(float allowance);  // NOTE(review): presumably overrides `allowed_image_memory_capacity` -- confirm
+
+    bool is_too_large() const;  // indicates whether the estimated cost is oversized
+
+    std::string to_string() const;  // textual cost report, expressed in GBYTES
+
+    // implicit conversion: lets an estimate initialise a std::string directly
+    operator std::string() const;
+
+    static std::string capacity_to_string(float cost);  // NOTE(review): presumably formats an arbitrary byte count the way to_string() does -- confirm
+  };
+
+} // namespace tesseract.
+
+#endif
@@ -74,6 +74,7 @@
 #include <set>      // for std::pair
 #include <sstream>  // for std::stringstream
 #include <vector>   // for std::vector
+#include <cfloat>
 
 #include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
 #ifdef HAVE_LIBCURL
@@ -102,6 +103,8 @@ static STRING_VAR(document_title, "", "Title of output document (used for hOCR a
 #ifdef HAVE_LIBCURL
 static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
 #endif
+double_VAR(allowed_image_memory_capacity, ImageCostEstimate::get_max_system_allowance(), "Set maximum memory allowance for image data: this will be used as part of a sanity check for oversized input images.");  // EstimateImageMemoryCost() clamps the caller's allowance to this value only while it is > 0
+
 
 /** Minimum sensible image size to be worth running tesseract. */
 const int kMinRectSize = 10;
@@ -271,6 +274,66 @@ void TessBaseAPI::SetInputName(const char *name) {
   input_file_ = name ? name : "";
 }
 
+/**
+* Estimate the memory capacity cost (in bytes) of processing an image of the
+* given pixel dimensions.
+*
+* The estimate is heuristic: it assumes the input image is normalized/greyscaled
+* and then thresholded, with all of those images kept in memory while the
+* session runs.
+*
+* The Tesseract Variable `allowed_image_memory_capacity` bounds the `allowance`:
+* while that variable is positive, an allowance that exceeds it (or that is
+* zero or negative) is replaced by the variable's value. The resulting
+* allowance determines `cost.is_too_large()`.
+*
+* This is a static function, so userland code MAY call it *before* the high
+* cost of instantiating a Tesseract instance is incurred.
+*/
+ImageCostEstimate TessBaseAPI::EstimateImageMemoryCost(int image_width, int image_height, float allowance) {
+  // Heuristics:
+  // - leptonica Pix storage is reckoned at 4 bytes per pixel,
+  // - tesseract stores (worst case) 3 different images: original, greyscale,
+  //   binary thresholded,
+  // - a 20% overdraft serves as a VERY ROUGH estimate of the storage costs of
+  //   internal operations.
+  // NOT reckoned with: the extra image that may serve as background for PDF
+  // outputs, the memory cost of the OCR match tree, etc.
+  constexpr int kBytesPerPixel = 4;
+  constexpr int kRetainedImages = 3;
+  constexpr float kOverdraftFactor = 1.20f;
+
+  float cost = kBytesPerPixel * kRetainedImages * kOverdraftFactor;
+  cost *= image_width;
+  cost *= image_height;
+
+  // Any ridiculous allowance is replaced by the Tesseract configuration value,
+  // provided that configuration value itself is positive.
+  const bool have_configured_limit = (allowed_image_memory_capacity > 0.0);
+  const bool allowance_out_of_range =
+      (allowance > allowed_image_memory_capacity || allowance <= 0.0);
+  if (have_configured_limit && allowance_out_of_range) {
+    allowance = allowed_image_memory_capacity;
+  }
+
+  return ImageCostEstimate(cost, allowance);
+}
+
+ImageCostEstimate TessBaseAPI::EstimateImageMemoryCost(const Pix* pix, float allowance) {
+  // Read the pixel dimensions off the image, then defer to the (width, height) overload.
+  const auto width_px = pixGetWidth(pix);
+  const auto height_px = pixGetHeight(pix);
+
+  return EstimateImageMemoryCost(width_px, height_px, allowance);
+}
+
+/**
+* Instance flavour of the estimate: may be invoked after SetInputImage() or
+* equivalent has been called. Forwards to the underlying Tesseract instance,
+* which reports the cost estimate for the current instance/image.
+*/
+ImageCostEstimate TessBaseAPI::EstimateImageMemoryCost() const {
+  auto estimate = tesseract_->EstimateImageMemoryCost();
+  return estimate;
+}
+
+/**
+* Fast pre-flight check, which may be invoked after SetInputImage() or
+* equivalent has been called. Forwards to the underlying Tesseract instance,
+* which reports the cost estimate via `tprintf()`.
+*
+* Returns `true` when the cost is expected to be too high.
+*
+* Many major tesseract APIs perform this same check as part of their startup
+* routine.
+*/
+bool TessBaseAPI::CheckAndReportIfImageTooLarge(const Pix* pix) const {
+  const bool too_large = tesseract_->CheckAndReportIfImageTooLarge(pix);
+  return too_large;
+}
+
 /** Set the name of the output files. Needed only for debugging. */
 void TessBaseAPI::SetOutputName(const char *name) {
   output_file_ = name ? name : "";
@@ -1255,6 +1318,22 @@ bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
                               TessResultRenderer *renderer) {
   SetInputName(filename);
   SetImage(pix);
+  // Before we start to do *real* work, do a preliminary sanity check re expected memory pressure.
+  // The check MAY recur in some (semi)public APIs that MAY be called later, but this is the big one
+  // and it's a simple check at negligible cost, saving us some headaches when we start feeding large
+  // material to the Tesseract animal.
+  //
+  // TODO: rescale overlarge input images? Or is that left to userland code? (as it'll be pretty fringe anyway)
+  {
+    auto cost = TessBaseAPI::EstimateImageMemoryCost(pix);
+    std::string cost_report = cost;
+    tprintf("Estimated memory pressure: {} for input image size {} x {} px\n", cost_report, pixGetWidth(pix), pixGetHeight(pix));
+
+    if (CheckAndReportIfImageTooLarge(pix)) {
+      return false;   // fail early
+    }
+  }
+
   bool failed = false;
 
   if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
 
@@ -617,4 +617,45 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, O
   splitter_.Clear();
 }
 
+// Memory capacity cost estimate for `pix` or, when no image is passed in, for
+// the current original image -- i.e. the cost estimate of this run.
+//
+// The configured `allowed_image_memory_capacity` is handed on as the allowance.
+ImageCostEstimate Tesseract::EstimateImageMemoryCost(const Pix* pix) const {
+  if (!pix) {
+    // no image given: fall back to the pix_original() data
+    pix = pix_original();
+  }
+
+  auto estimate = TessBaseAPI::EstimateImageMemoryCost(pix, allowed_image_memory_capacity);
+  return estimate;
+}
+
+// Pre-flight helper, which may be invoked after SetInputImage() or equivalent
+// has been called. Takes the dimensions of `pix` (or of the current original
+// image when `pix` is null) and defers to the (width, height) overload.
+//
+// Returns `true` when the cost is expected to be too high.
+bool Tesseract::CheckAndReportIfImageTooLarge(const Pix* pix) const {
+  if (!pix) {
+    // no image given: fall back to the pix_original() data
+    pix = pix_original();
+  }
+
+  const auto width_px = pixGetWidth(pix);
+  const auto height_px = pixGetHeight(pix);
+
+  return CheckAndReportIfImageTooLarge(width_px, height_px);
+}
+
+// Checks the given image dimensions against TDIMENSION_MAX and the memory cost
+// estimate against `allowed_image_memory_capacity`.
+//
+// With `debug_all` set, the estimate is always reported via `tprintf()`; an
+// oversized image is always reported as an error.
+// Returns `true` when the image is too large, `false` otherwise.
+bool Tesseract::CheckAndReportIfImageTooLarge(int width, int height) const {
+  const auto estimate = TessBaseAPI::EstimateImageMemoryCost(width, height, allowed_image_memory_capacity);
+
+  if (debug_all) {
+    tprintf("Image size & memory cost estimate: {} x {} px, estimated cost {} vs. {} allowed capacity.\n",
+      width, height, estimate.to_string(), ImageCostEstimate::capacity_to_string(allowed_image_memory_capacity));
+  }
+
+  // The cost verdict is only consulted once both dimensions are within bounds.
+  const bool dimensions_ok = (width < TDIMENSION_MAX && height < TDIMENSION_MAX);
+  if (dimensions_ok && !estimate.is_too_large()) {
+    return false;
+  }
+
+  tprintf("ERROR: Image is too large: ({} x {} px, {})\n", width, height, estimate.to_string());
+  return true;
+}
+
 } // namespace tesseract
@@ -46,6 +46,7 @@
 
 #include <tesseract/publictypes.h> // for OcrEngineMode, PageSegMode, OEM_L...
 #include <tesseract/unichar.h>     // for UNICHAR_ID
+#include <tesseract/memcost_estimate.h>  // for ImageCostEstimate
 
 #include <allheaders.h> // for pixDestroy, pixGetWidth, pixGetHe...
 
@@ -165,6 +166,7 @@ struct WordData {
   PointerVector<WERD_RES> lang_words;
 };
 
+
 // Definition of a Tesseract WordRecognizer. The WordData provides the context
 // of row/block, in_word holds an initialized, possibly pre-classified word,
 // that the recognizer may or may not consume (but if so it sets
@@ -227,6 +229,18 @@ class TESS_API Tesseract : public Wordrec {
       lang->set_pix_original(original_pix ? original_pix.clone() : nullptr);
     }
   }
+
+  // Return a memory capacity cost estimate for the given image / current original image.
+  //
+  // (unless overridden by the `pix` argument) uses the current original image for the estimate,
+  // i.e. tells you the cost estimate of this run:
+  ImageCostEstimate EstimateImageMemoryCost(const Pix* pix = nullptr /* default: use pix_original() data */) const;
+  // Helper, which may be invoked after SetInputImage() or equivalent has been called:
+  // reports the cost estimate for the current instance/image via `tprintf()` and returns
+  // `true` when the cost is expected to be too high.
+  bool CheckAndReportIfImageTooLarge(const Pix* pix = nullptr /* default: use pix_original() data */) const;
+  bool CheckAndReportIfImageTooLarge(int width, int height) const;
+
   // Returns a pointer to a Pix representing the best available resolution image
   // of the page, with best available bit depth as second priority. Result can
   // be of any bit depth, but never color-mapped, as that has always been
 
@@ -281,13 +281,24 @@ std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
 // Threshold the source image as efficiently as possible to the output Pix.
 // Creates a Pix and sets pix to point to the resulting pointer.
 // Caller must use pixDestroy to free the created Pix.
+//
 /// Returns false on error.
 bool ImageThresholder::ThresholdToPix(Image *pix) {
-  if (image_width_ > INT16_MAX || image_height_ > INT16_MAX) {
-    tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
-    return false;
+  // tolerate overlarge images when they're about to be cropped by GetPixRect():
+  if (IsFullImage()) {
+    if (tesseract_->CheckAndReportIfImageTooLarge(pix_)) {
+      return false;
+    }
   }
+  else {
+    // validate against the future cropped image size:
+    if (tesseract_->CheckAndReportIfImageTooLarge(rect_width_, rect_height_)) {
+      return false;
+    }
+  }
+
   Image original = GetPixRect();
+
   if (pix_channels_ == 0) {
     // We have a binary image, but it still has to be copied, as this API
     // allows the caller to modify the output.