Skip to content

Decode doc ids in BKD leaves with auto-vectorized loops #14203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 39 commits into from
Mar 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
97af1d2
bpv24
gf2121 Jan 27, 2025
617cec7
only reduce virtual call
gf2121 Feb 5, 2025
c72f9f4
iter
gf2121 Feb 5, 2025
4446855
jmh
gf2121 Feb 6, 2025
b031449
iter
gf2121 Feb 6, 2025
e7a3056
stash
gf2121 Feb 6, 2025
10511a2
e2e benchmark
gf2121 Feb 6, 2025
bb1b923
bwc issue
gf2121 Feb 8, 2025
28cb597
jmh fix
gf2121 Feb 8, 2025
62c214f
fix
gf2121 Feb 8, 2025
d977c02
iter
gf2121 Feb 8, 2025
c5653f6
license
gf2121 Feb 8, 2025
9bf0870
iter
gf2121 Feb 8, 2025
3c91bf3
add license
gf2121 Feb 8, 2025
a474043
add java doc
gf2121 Feb 8, 2025
5c4e1e9
private
gf2121 Feb 8, 2025
6f8fc2b
simplify
gf2121 Feb 8, 2025
654f0a6
add CHANGES
gf2121 Feb 8, 2025
6273aa9
iter
gf2121 Feb 8, 2025
285ae58
Merge branch 'main' into vector_bpv24
gf2121 Feb 10, 2025
5ee86c9
iter
gf2121 Feb 10, 2025
decf31b
iter
gf2121 Feb 10, 2025
114d677
iter
gf2121 Feb 10, 2025
4b5e2f2
iter
gf2121 Feb 10, 2025
ae65f61
inner loop
gf2121 Feb 11, 2025
5eea20a
profile
gf2121 Feb 12, 2025
bac612b
tidy
gf2121 Feb 12, 2025
da73235
inner loop iter
gf2121 Feb 12, 2025
158b939
stash
gf2121 Feb 12, 2025
7c21d47
refactor to inner loop
gf2121 Feb 12, 2025
6de960c
unnecessary diff
gf2121 Feb 12, 2025
7d3adf3
fix test
gf2121 Feb 12, 2025
38c2ef0
show benchmark code
gf2121 Mar 15, 2025
ec87c3c
specialized decoding for DEFAULT_MAX_POINTS_IN_LEAF_NODE
gf2121 Mar 16, 2025
3766875
unnecessary diff
gf2121 Mar 16, 2025
908de60
drop BKDCodecBenchmark as well
gf2121 Mar 16, 2025
eb14ddd
add more chance to test BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE
gf2121 Mar 16, 2025
346210c
no more public
gf2121 Mar 16, 2025
be09930
review iter
gf2121 Mar 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ Optimizations

* GITHUB#14176: Reduce virtual calls when visiting bpv24-encoded doc ids in BKD leaves. (Guo Feng)

* GITHUB#14203: Decode doc ids in BKD leaves with auto-vectorized loops when using DEFAULT_MAX_POINTS_IN_LEAF_NODE. (Guo Feng)

Bug Fixes
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@
package org.apache.lucene.codecs.lucene90;

import java.io.IOException;
import java.util.Map;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PointsWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.util.bkd.BKDWriter;

/**
* Lucene 9.0 point format, which encodes dimensional values in a block KD-tree structure for fast
Expand Down Expand Up @@ -59,18 +61,40 @@ public final class Lucene90PointsFormat extends PointsFormat {
public static final String META_EXTENSION = "kdm";

static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
static final int VERSION_BKD_VECTORIZED_BPV24 = 1;
static final int VERSION_CURRENT = VERSION_BKD_VECTORIZED_BPV24;

private static final Map<Integer, Integer> VERSION_TO_BKD_VERSION =
Map.of(
VERSION_START, BKDWriter.VERSION_META_FILE,
VERSION_BKD_VECTORIZED_BPV24, BKDWriter.VERSION_VECTORIZED_DOCID);

private final int version;

/** Sole constructor */
public Lucene90PointsFormat() {}
public Lucene90PointsFormat() {
this(VERSION_CURRENT);
}

/** Constructor that takes a version. This is used for testing with older versions. */
Lucene90PointsFormat(int version) {
if (VERSION_TO_BKD_VERSION.containsKey(version) == false) {
throw new IllegalArgumentException("Invalid version: " + version);
}
this.version = version;
}

@Override
public PointsWriter fieldsWriter(SegmentWriteState state) throws IOException {
return new Lucene90PointsWriter(state);
return new Lucene90PointsWriter(state, version);
}

@Override
public PointsReader fieldsReader(SegmentReadState state) throws IOException {
return new Lucene90PointsReader(state);
}

/**
 * Maps a points format version to the BKD version registered for it in
 * {@code VERSION_TO_BKD_VERSION}.
 *
 * @param version a points format version, e.g. {@code VERSION_START} or {@code
 *     VERSION_BKD_VECTORIZED_BPV24}
 * @return the BKD version mapped to {@code version}
 * @throws IllegalArgumentException if {@code version} has no mapping
 */
static int bkdVersion(int version) {
  final Integer bkdVersion = VERSION_TO_BKD_VERSION.get(version);
  if (bkdVersion == null) {
    // Map.get returns null for unknown keys; fail with a descriptive error instead of the
    // NullPointerException that auto-unboxing null to int would otherwise raise.
    throw new IllegalArgumentException("Invalid version: " + version);
  }
  return bkdVersion;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,18 @@ public class Lucene90PointsWriter extends PointsWriter {
final SegmentWriteState writeState;
final int maxPointsInLeafNode;
final double maxMBSortInHeap;
final int version;
private boolean finished;

/** Full constructor */
public Lucene90PointsWriter(
SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap)
SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap, int version)
throws IOException {
assert writeState.fieldInfos.hasPointValues();
this.writeState = writeState;
this.maxPointsInLeafNode = maxPointsInLeafNode;
this.maxMBSortInHeap = maxMBSortInHeap;
this.version = version;
String dataFileName =
IndexFileNames.segmentFileName(
writeState.segmentInfo.name,
Expand Down Expand Up @@ -105,6 +107,12 @@ public Lucene90PointsWriter(
}
}

/**
 * Creates a writer with the given {@code maxPointsInLeafNode} and {@code maxMBSortInHeap},
 * delegating to the full constructor with {@code Lucene90PointsFormat.VERSION_CURRENT} as the
 * version.
 */
public Lucene90PointsWriter(
SegmentWriteState writeState, int maxPointsInLeafNode, double maxMBSortInHeap)
throws IOException {
this(writeState, maxPointsInLeafNode, maxMBSortInHeap, Lucene90PointsFormat.VERSION_CURRENT);
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's make all constructors that take a version pkg-private?


/**
* Uses the default values for {@code maxPointsInLeafNode} (512) and {@code maxMBSortInHeap}
* (16.0)
Expand All @@ -113,7 +121,17 @@ public Lucene90PointsWriter(SegmentWriteState writeState) throws IOException {
this(
writeState,
BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP);
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
Lucene90PointsFormat.VERSION_CURRENT);
}

/** Constructor that takes a version. This is used for testing with older versions. */
Lucene90PointsWriter(SegmentWriteState writeState, int version) throws IOException {
this(
writeState,
BKDConfig.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
version);
}

@Override
Expand All @@ -135,7 +153,8 @@ public void writeField(FieldInfo fieldInfo, PointsReader reader) throws IOExcept
writeState.segmentInfo.name,
config,
maxMBSortInHeap,
values.size())) {
values.size(),
Lucene90PointsFormat.bkdVersion(version))) {

if (values instanceof MutablePointTree) {
IORunnable finalizer =
Expand Down Expand Up @@ -233,7 +252,8 @@ public void merge(MergeState mergeState) throws IOException {
writeState.segmentInfo.name,
config,
maxMBSortInHeap,
totMaxSize)) {
totMaxSize,
Lucene90PointsFormat.bkdVersion(version))) {
List<PointValues> pointValues = new ArrayList<>();
List<MergeState.DocMap> docMaps = new ArrayList<>();
for (int i = 0; i < mergeState.pointsReaders.length; i++) {
Expand Down
10 changes: 5 additions & 5 deletions lucene/core/src/java/org/apache/lucene/util/bkd/BKDReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
* @lucene.experimental
*/
public class BKDReader extends PointValues {

final BKDConfig config;
final int numLeaves;
final IndexInput in;
Expand Down Expand Up @@ -261,7 +260,7 @@ private BKDPointTree(
1,
minPackedValue,
maxPackedValue,
new BKDReaderDocIDSetIterator(config.maxPointsInLeafNode()),
new BKDReaderDocIDSetIterator(config.maxPointsInLeafNode(), version),
new byte[config.packedBytesLength()],
new byte[config.packedIndexBytesLength()],
new byte[config.packedIndexBytesLength()],
Expand Down Expand Up @@ -590,7 +589,8 @@ public void addAll(PointValues.IntersectVisitor visitor, boolean grown) throws I
// How many points are stored in this leaf cell:
int count = leafNodes.readVInt();
// No need to call grow(), it has been called up-front
docIdsWriter.readInts(leafNodes, count, visitor);
// Borrow scratchIterator.docIDs as the decoding buffer
docIdsWriter.readInts(leafNodes, count, visitor, scratchIterator.docIDs);
} else {
pushLeft();
addAll(visitor, grown);
Expand Down Expand Up @@ -1028,9 +1028,9 @@ private static class BKDReaderDocIDSetIterator extends DocIdSetIterator {
final int[] docIDs;
private final DocIdsWriter docIdsWriter;

public BKDReaderDocIDSetIterator(int maxPointsInLeafNode) {
public BKDReaderDocIDSetIterator(int maxPointsInLeafNode, int version) {
this.docIDs = new int[maxPointsInLeafNode];
this.docIdsWriter = new DocIdsWriter(maxPointsInLeafNode);
this.docIdsWriter = new DocIdsWriter(maxPointsInLeafNode, version);
}

@Override
Expand Down
31 changes: 28 additions & 3 deletions lucene/core/src/java/org/apache/lucene/util/bkd/BKDWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,8 @@ public class BKDWriter implements Closeable {
public static final int VERSION_SELECTIVE_INDEXING = 6;
public static final int VERSION_LOW_CARDINALITY_LEAVES = 7;
public static final int VERSION_META_FILE = 9;
public static final int VERSION_CURRENT = VERSION_META_FILE;
public static final int VERSION_VECTORIZED_DOCID = 10;
public static final int VERSION_CURRENT = VERSION_VECTORIZED_DOCID;

/** Number of splits before we compute the exact bounding box of an inner node. */
private static final int SPLITS_BEFORE_EXACT_BOUNDS = 4;
Expand All @@ -103,6 +104,7 @@ public class BKDWriter implements Closeable {
final TrackingDirectoryWrapper tempDir;
final String tempFileNamePrefix;
final double maxMBSortInHeap;
final int version;

final byte[] scratchDiff;
final byte[] scratch;
Expand Down Expand Up @@ -139,6 +141,29 @@ public BKDWriter(
BKDConfig config,
double maxMBSortInHeap,
long totalPointCount) {
this(
maxDoc,
tempDir,
tempFileNamePrefix,
config,
maxMBSortInHeap,
totalPointCount,
BKDWriter.VERSION_CURRENT);
}

/** This ctor should only be used for testing with older versions. */
public BKDWriter(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add javadocs that this ctor should be only used for testing with older versions?

int maxDoc,
Directory tempDir,
String tempFileNamePrefix,
BKDConfig config,
double maxMBSortInHeap,
long totalPointCount,
int version) {
if (version < VERSION_START || version > VERSION_CURRENT) {
throw new IllegalArgumentException("Version out of range: " + version);
}
this.version = version;
verifyParams(maxMBSortInHeap, totalPointCount);
// We use tracking dir to deal with removing files on exception, so each place that
// creates temp files doesn't need crazy try/finally/success logic:
Expand All @@ -165,7 +190,7 @@ public BKDWriter(

// Maximum number of points we hold in memory at any time
maxPointsSortInHeap = (int) ((maxMBSortInHeap * 1024 * 1024) / (config.bytesPerDoc()));
docIdsWriter = new DocIdsWriter(config.maxPointsInLeafNode());
docIdsWriter = new DocIdsWriter(config.maxPointsInLeafNode(), version);
// Finally, we must be able to hold at least the leaf node in heap during build:
if (maxPointsSortInHeap < config.maxPointsInLeafNode()) {
throw new IllegalArgumentException(
Expand Down Expand Up @@ -1245,7 +1270,7 @@ private void writeIndex(
byte[] packedIndex,
long dataStartFP)
throws IOException {
CodecUtil.writeHeader(metaOut, CODEC_NAME, VERSION_CURRENT);
CodecUtil.writeHeader(metaOut, CODEC_NAME, version);
metaOut.writeVInt(config.numDims());
metaOut.writeVInt(config.numIndexDims());
metaOut.writeVInt(countPerLeaf);
Expand Down
Loading