From c702b488f4b6becda961d23eb655e515c861fd9d Mon Sep 17 00:00:00 2001 From: Alex Jank Date: Sat, 25 Jan 2025 14:12:05 +0100 Subject: [PATCH 1/2] Fix duplicate ALTO XML IDs when multiple pages are present --- src/api/altorenderer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index e373f73aa4..4c1607abce 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -168,7 +168,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { case PT_PULLOUT_IMAGE: { // Handle all kinds of images. // TODO: optionally add TYPE, for example TYPE="photo". - alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); @@ -177,7 +177,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { case PT_HORZ_LINE: case PT_VERT_LINE: // Handle horizontal and vertical lines. - alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); @@ -190,24 +190,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { } if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - alto_str << "\t\t\t\tIsAtBeginningOf(RIL_PARA)) { - alto_str << "\t\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { - alto_str << "\t\t\t\t\t\t Date: Sun, 26 Jan 2025 10:30:03 +0100 Subject: [PATCH 2/2] Convert ALTO XML ID generation to conditional code based on the current page This ensures that valid ALTO XML output is generated while keeping the IDs for the first page the same as before. 
--- src/api/altorenderer.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 4c1607abce..602649c012 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -51,6 +51,20 @@ static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, } } +static std::string GetID(char const * prefix, int page_number, int counter) { + std::stringstream idstr; + // IDs will only have the counter for the first page to keep them consistent + // with the IDs assigned before this change was made. + // From the second page on, IDs will also contain the page number to make them unique. + if (page_number == 0) { + idstr << prefix << "_" << counter; + } else { + idstr << prefix << "_" << page_number << "_" << counter; + } + + return idstr.str(); +} + /// /// Append the ALTO XML for the beginning of the document /// @@ -168,7 +182,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { case PT_PULLOUT_IMAGE: { // Handle all kinds of images. // TODO: optionally add TYPE, for example TYPE="photo". - alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); @@ -177,7 +191,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { case PT_HORZ_LINE: case PT_VERT_LINE: // Handle horizontal and vertical lines. - alto_str << "\t\t\t\t\n"; res_it->Next(RIL_BLOCK); @@ -190,24 +204,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { } if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - alto_str << "\t\t\t\tIsAtBeginningOf(RIL_PARA)) { - alto_str << "\t\t\t\t\tIsAtBeginningOf(RIL_TEXTLINE)) { - alto_str << "\t\t\t\t\t\t