Add HashSet based filtering optimization to XContentMapValues (#17160)

hye-on · msfroh · web-flow · commit c06f53e3a73c · 2025-02-06T16:15:35.000-08:00
This optimization speeds up document filtering when the requested field names are simple, i.e. contain no dots or wildcards. In such cases, it uses a HashSet-based implementation instead of automaton matching, which prevents TooComplexToDeterminizeException when processing documents with numerous long field names.

Changes:
- Add HashSet optimization for simple field names
- Split filter implementation into set-based and automaton-based
- Add helper methods to check field name patterns

---------

Signed-off-by: hye-on <ain0103@naver.com>
Signed-off-by: Michael Froh <froh@amazon.com>
Co-authored-by: Michael Froh <froh@amazon.com>
diff --git a/CHANGELOG-3.0.md b/CHANGELOG-3.0.md
@@ -30,6 +30,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/))
 - Deprecate CamelCase `PathHierarchy` tokenizer name in favor to lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894))
 - Breaking change: Do not request "search_pipelines" metrics by default in NodesInfoRequest ([#12497](https://github.com/opensearch-project/OpenSearch/pull/12497))
+- Use simpler matching logic for source fields when explicit field names (no wildcards or dot-paths) are specified ([#17160](https://github.com/opensearch-project/OpenSearch/pull/17160))
 - Refactor `:libs` module `bootstrap` package to eliminate top level split packages for JPMS support ([#17117](https://github.com/opensearch-project/OpenSearch/pull/17117))
 - Refactor the codebase to eliminate top level split packages for JPMS support ([#17153](https://github.com/opensearch-project/OpenSearch/pull/17153)
 - Refactor `:server` module `org.apacge.lucene` package to eliminate top level split packages for JPMS support ([#17241](https://github.com/opensearch-project/OpenSearch/pull/17241))
diff --git a/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java b/server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java
@@ -45,9 +45,12 @@
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.function.Function;
 
 /**
@@ -216,6 +219,54 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
      * @see #filter(Map, String[], String[]) for details
      */
     public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
+        // Names without '*' or '.' on both sides can be matched by exact set lookup,
+        // so the automaton is only built when a pattern or a dotted path is present.
+        final boolean plainNamesOnly = hasNoWildcardsOrDots(includes) && hasNoWildcardsOrDots(excludes);
+        return plainNamesOnly ? createSetBasedFilter(includes, excludes) : createAutomatonFilter(includes, excludes);
+    }
+
+    /** Returns true when no name contains '*' or '.'; a null or empty array counts as having none. */
+    private static boolean hasNoWildcardsOrDots(String[] fields) {
+        if (fields != null) {
+            for (String name : fields) {
+                // A single '*' or '.' anywhere disqualifies the whole array.
+                if (name.contains("*") || name.contains(".")) {
+                    return false;
+                }
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Creates a simple HashSet-based filter for exact field name matching
+     */
+    private static Function<Map<String, ?>, Map<String, Object>> createSetBasedFilter(String[] includes, String[] excludes) {
+        Set<String> includeSet = (includes == null || includes.length == 0) ? null : new HashSet<>(Arrays.asList(includes));
+        Set<String> excludeSet = (excludes == null || excludes.length == 0)
+            ? Collections.emptySet()
+            : new HashSet<>(Arrays.asList(excludes));
+
+        return (map) -> {
+            Map<String, Object> filtered = new HashMap<>();
+            for (Map.Entry<String, ?> entry : map.entrySet()) {
+                String key = entry.getKey();
+                int dotPos = key.indexOf('.');
+                if (dotPos > 0) {
+                    key = key.substring(0, dotPos);
+                }
+                if ((includeSet == null || includeSet.contains(key)) && !excludeSet.contains(key)) {
+                    filtered.put(entry.getKey(), entry.getValue());
+                }
+            }
+            return filtered;
+        };
+    }
+
+    /**
+     * Creates an automaton-based filter for complex pattern matching
+     */
+    public static Function<Map<String, ?>, Map<String, Object>> createAutomatonFilter(String[] includes, String[] excludes) {
         CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());
 
         CharacterRunAutomaton include;