Skip to content

Commit c6b7e05

Browse files
committed
Convert ALTO XML ID generation to conditional code based on the current page
This ensures that valid ALTO XML output is generated, while keeping the IDs for the first page the same as before.
1 parent c702b48 commit c6b7e05

File tree

1 file changed

+20
-6
lines changed

1 file changed

+20
-6
lines changed

src/api/altorenderer.cpp

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,20 @@ static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
5151
}
5252
}
5353

54+
///
/// Build the ID attribute value for an ALTO element.
///
/// @param prefix       Element-kind prefix placed at the start of the ID
///                     (callers pass literals such as "cblock", "block",
///                     "line" or "string"). Must not be null.
/// @param page_number  Number of the current page; 0 denotes the first page.
/// @param counter      Running number of the element.
/// @return "<prefix>_<counter>" for the first page, otherwise
///         "<prefix>_<page_number>_<counter>".
///
static std::string GetID(const char *prefix, int page_number, int counter) {
  // The numbers are appended with std::to_string instead of a stream:
  // a default-constructed stream uses the global locale, which may insert
  // digit-grouping separators (e.g. "1,234") and thereby corrupt the ID.
  std::string id(prefix);
  id += '_';
  // IDs will only have the counter for the first page to keep them consistent
  // with the IDs assigned before this change was made.
  // From the second page on, IDs will also contain the page number to make them unique.
  if (page_number != 0) {
    id += std::to_string(page_number);
    id += '_';
  }
  id += std::to_string(counter);
  return id;
}
67+
5468
///
5569
/// Append the ALTO XML for the beginning of the document
5670
///
@@ -168,7 +182,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
168182
case PT_PULLOUT_IMAGE: {
169183
// Handle all kinds of images.
170184
// TODO: optionally add TYPE, for example TYPE="photo".
171-
alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << page_number << "_" << bcnt++ << "\"";
185+
alto_str << "\t\t\t\t<Illustration ID=\"" << GetID("cblock", page_number, bcnt++) << "\"";
172186
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
173187
alto_str << "</Illustration>\n";
174188
res_it->Next(RIL_BLOCK);
@@ -177,7 +191,7 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
177191
case PT_HORZ_LINE:
178192
case PT_VERT_LINE:
179193
// Handle horizontal and vertical lines.
180-
alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << page_number << "_" << bcnt++ << "\"";
194+
alto_str << "\t\t\t\t<GraphicalElement ID=\"" << GetID("cblock", page_number, bcnt++) << "\"";
181195
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
182196
alto_str << "</GraphicalElement >\n";
183197
res_it->Next(RIL_BLOCK);
@@ -190,24 +204,24 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
190204
}
191205

192206
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
193-
alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << page_number << "_" << bcnt << "\"";
207+
alto_str << "\t\t\t\t<ComposedBlock ID=\"" << GetID("cblock", page_number, bcnt) << "\"";
194208
AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
195209
alto_str << "\n";
196210
}
197211

198212
if (res_it->IsAtBeginningOf(RIL_PARA)) {
199-
alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << page_number << "_" << tcnt << "\"";
213+
alto_str << "\t\t\t\t\t<TextBlock ID=\"" << GetID("block", page_number, tcnt) << "\"";
200214
AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
201215
alto_str << "\n";
202216
}
203217

204218
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
205-
alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << page_number << "_" << lcnt << "\"";
219+
alto_str << "\t\t\t\t\t\t<TextLine ID=\"" << GetID("line", page_number, lcnt) << "\"";
206220
AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
207221
alto_str << "\n";
208222
}
209223

210-
alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << page_number << "_" << wcnt << "\"";
224+
alto_str << "\t\t\t\t\t\t\t<String ID=\"" << GetID("string", page_number, wcnt) << "\"";
211225
AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
212226
alto_str << " CONTENT=\"";
213227

0 commit comments

Comments
 (0)