Skip to content

Commit c06f53e

Browse files
hye-on and msfroh authored
Add HashSet based filtering optimization to XContentMapValues (#17160)
This optimization enhances document filtering when field names are simple (no dots or wildcards in field names). In such cases, it uses a HashSet-based implementation instead of automaton matching to prevent TooComplexToDeterminizeException when processing documents with numerous long field names.

Changes:
- Add HashSet optimization for simple field names
- Split filter implementation into set-based and automaton-based
- Add helper methods to check field name patterns

---------

Signed-off-by: hye-on <[email protected]>
Signed-off-by: Michael Froh <[email protected]>
Co-authored-by: Michael Froh <[email protected]>
1 parent 3f793b6 commit c06f53e

File tree

2 files changed

+52
-0
lines changed

2 files changed

+52
-0
lines changed

CHANGELOG-3.0.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3030
- Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/))
3131
- Deprecate CamelCase `PathHierarchy` tokenizer name in favor to lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894))
3232
- Breaking change: Do not request "search_pipelines" metrics by default in NodesInfoRequest ([#12497](https://github.com/opensearch-project/OpenSearch/pull/12497))
33+
- Use simpler matching logic for source fields when explicit field names (no wildcards or dot-paths) are specified ([#17160](https://github.com/opensearch-project/OpenSearch/pull/17160))
3334
- Refactor `:libs` module `bootstrap` package to eliminate top level split packages for JPMS support ([#17117](https://github.com/opensearch-project/OpenSearch/pull/17117))
3435
- Refactor the codebase to eliminate top level split packages for JPMS support ([#17153](https://github.com/opensearch-project/OpenSearch/pull/17153))
3536
- Refactor `:server` module `org.apache.lucene` package to eliminate top level split packages for JPMS support ([#17241](https://github.com/opensearch-project/OpenSearch/pull/17241))

server/src/main/java/org/opensearch/common/xcontent/support/XContentMapValues.java

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,12 @@
4545

4646
import java.util.ArrayList;
4747
import java.util.Arrays;
48+
import java.util.Collections;
4849
import java.util.HashMap;
50+
import java.util.HashSet;
4951
import java.util.List;
5052
import java.util.Map;
53+
import java.util.Set;
5154
import java.util.function.Function;
5255

5356
/**
@@ -216,6 +219,54 @@ public static Map<String, Object> filter(Map<String, ?> map, String[] includes,
216219
* @see #filter(Map, String[], String[]) for details
217220
*/
218221
public static Function<Map<String, ?>, Map<String, Object>> filter(String[] includes, String[] excludes) {
222+
if (hasNoWildcardsOrDots(includes) && hasNoWildcardsOrDots(excludes)) {
223+
return createSetBasedFilter(includes, excludes);
224+
}
225+
return createAutomatonFilter(includes, excludes);
226+
}
227+
228+
private static boolean hasNoWildcardsOrDots(String[] fields) {
229+
if (fields == null || fields.length == 0) {
230+
return true;
231+
}
232+
233+
for (String field : fields) {
234+
if (field.indexOf('*') != -1 || field.indexOf('.') != -1) {
235+
return false;
236+
}
237+
}
238+
return true;
239+
}
240+
241+
/**
242+
* Creates a simple HashSet-based filter for exact field name matching
243+
*/
244+
private static Function<Map<String, ?>, Map<String, Object>> createSetBasedFilter(String[] includes, String[] excludes) {
245+
Set<String> includeSet = (includes == null || includes.length == 0) ? null : new HashSet<>(Arrays.asList(includes));
246+
Set<String> excludeSet = (excludes == null || excludes.length == 0)
247+
? Collections.emptySet()
248+
: new HashSet<>(Arrays.asList(excludes));
249+
250+
return (map) -> {
251+
Map<String, Object> filtered = new HashMap<>();
252+
for (Map.Entry<String, ?> entry : map.entrySet()) {
253+
String key = entry.getKey();
254+
int dotPos = key.indexOf('.');
255+
if (dotPos > 0) {
256+
key = key.substring(0, dotPos);
257+
}
258+
if ((includeSet == null || includeSet.contains(key)) && !excludeSet.contains(key)) {
259+
filtered.put(entry.getKey(), entry.getValue());
260+
}
261+
}
262+
return filtered;
263+
};
264+
}
265+
266+
/**
267+
* Creates an automaton-based filter for complex pattern matching
268+
*/
269+
public static Function<Map<String, ?>, Map<String, Object>> createAutomatonFilter(String[] includes, String[] excludes) {
219270
CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());
220271

221272
CharacterRunAutomaton include;

0 commit comments

Comments
 (0)