From 48900fd5b466ec0de4266d7a6726b99f040566b8 Mon Sep 17 00:00:00 2001 From: Iryna Kopchak Date: Fri, 14 Feb 2025 16:18:32 +0200 Subject: [PATCH] Optimize regex in trimAdjacentBlankLines() method of ExtractedTextFormatter to prevent stack overflow Closes #2247 Signed-off-by: Iryna Kopchak --- .../springframework/ai/reader/ExtractedTextFormatter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spring-ai-core/src/main/java/org/springframework/ai/reader/ExtractedTextFormatter.java b/spring-ai-core/src/main/java/org/springframework/ai/reader/ExtractedTextFormatter.java index 0ad3e3290b3..95df2f6b02f 100644 --- a/spring-ai-core/src/main/java/org/springframework/ai/reader/ExtractedTextFormatter.java +++ b/spring-ai-core/src/main/java/org/springframework/ai/reader/ExtractedTextFormatter.java @@ -32,8 +32,9 @@ * An instance of this formatter can be customized using the {@link Builder} nested class. * * @author Christian Tzolov + * @author Iryna Kopchak */ -public final class ExtractedTextFormatter { +public class ExtractedTextFormatter { /** Flag indicating if the text should be left-aligned */ private final boolean leftAlignment; @@ -84,7 +85,7 @@ public static ExtractedTextFormatter defaults() { * @return Returns the same text but with blank lines trimmed. */ public static String trimAdjacentBlankLines(String pageText) { - return pageText.replaceAll("(?m)(^ *\n)", "\n").replaceAll("(?m)^$([\r\n]+?)(^$[\r\n]+?^)+", "$1"); + return pageText.replaceAll("(?m)^(?:\\s*\\r?\\n)+", "\n"); } /**