CNDB-12937: enable jvector 4; remove jvector 3 write support #1685

Merged · 2 commits · Apr 10, 2025
2 changes: 1 addition & 1 deletion build.xml
@@ -754,7 +754,7 @@
 <dependency groupId="org.apache.lucene" artifactId="lucene-core" version="9.8.0-5ea8bb4f21" />
 <dependency groupId="org.apache.lucene" artifactId="lucene-analysis-common" version="9.8.0-5ea8bb4f21" />
 <dependency groupId="org.apache.lucene" artifactId="lucene-backward-codecs" version="9.8.0-5ea8bb4f21" />
-<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.2" />
+<dependency groupId="io.github.jbellis" artifactId="jvector" version="4.0.0-beta.3" />
 <dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>

 <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">
V3OnDiskFormat.java
@@ -50,7 +50,14 @@ public class V3OnDiskFormat extends V2OnDiskFormat
 public static volatile boolean WRITE_JVECTOR3_FORMAT = Boolean.parseBoolean(System.getProperty("cassandra.sai.write_jv3_format", "false"));
 public static final boolean ENABLE_LTM_CONSTRUCTION = Boolean.parseBoolean(System.getProperty("cassandra.sai.ltm_construction", "true"));

-public static final int JVECTOR_2_VERSION = 2;
+// These are built to be backwards and forwards compatible. Not final only for testing.
+public static int JVECTOR_VERSION = Integer.parseInt(System.getProperty("cassandra.sai.jvector_version", "4"));
+static
+{
+    // JVector 3 is not compatible with the latest jvector changes, so we fail fast if the config is enabled.
+    assert JVECTOR_VERSION != 3 : "JVector version 3 is no longer supported";
+    assert !WRITE_JVECTOR3_FORMAT : "JVector version 3 is no longer supported";
+}

 private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
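The new `JVECTOR_VERSION` knob replaces the old `JVECTOR_2_VERSION` constant: the writer version is read once from the `cassandra.sai.jvector_version` system property (default 4) and validated at class load. Note that the checks above use `assert`, which only fires when the JVM runs with `-ea`. A minimal sketch of the same fail-fast idea with unconditional enforcement (illustrative class name, not part of this patch):

```java
// Illustrative sketch only: unconditional fail-fast validation of the jvector
// version property, mirroring the static block above but throwing instead of
// asserting so it also triggers on JVMs started without -ea.
public final class JVectorVersionCheck
{
    public static final int JVECTOR_VERSION =
        Integer.parseInt(System.getProperty("cassandra.sai.jvector_version", "4"));

    static
    {
        if (JVECTOR_VERSION == 3)
            throw new ExceptionInInitializerError("JVector version 3 is no longer supported");
    }

    private JVectorVersionCheck() {}
}
```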

@@ -110,10 +117,4 @@ public Set<IndexComponentType> perIndexComponentTypes(AbstractType<?> validator)
     return VECTOR_COMPONENTS_V3;
     return super.perIndexComponentTypes(validator);
 }
-
-@VisibleForTesting
-public static void enableJVector3Format()
-{
-    WRITE_JVECTOR3_FORMAT = true;
-}
 }
CassandraDiskAnn.java
@@ -180,7 +180,8 @@ public long ramBytesUsed()

 public int size()
 {
-    return graph.size();
+    // The base layer of the graph has all nodes.
+    return graph.size(0);
 }

 /**
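For context on `graph.size(0)`: with hierarchy enabled, a jvector graph is layered in the HNSW style, so the total node count is the size of the base layer rather than a sum over layers. A small hedged illustration of the invariant, assuming the `size(int level)` accessor used in the diff:

```java
// Layer 0 contains every node; each higher layer holds a sparser subset used
// as entry points, so base-layer size == total node count.
int total = graph.size(0);
int upper = graph.size(1); // assumes a hierarchical graph with at least two layers
assert upper <= total : "upper layers are subsets of the base layer";
```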
@@ -210,6 +211,8 @@ public CloseableIterator<RowIdWithScore> search(VectorFloat<?> queryVector,
 {
     var view = (GraphIndex.ScoringView) searcher.getView();
     SearchScoreProvider ssp;
+    // FusedADC can no longer be written due to the jvector upgrade. However, it's possible these index files
+    // still exist, so we have to support them.
     if (features.contains(FeatureId.FUSED_ADC))
     {
         var asf = view.approximateScoreFunctionFor(queryVector, similarityFunction);
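The comment added here is the crux of the read-side story: jvector 4 drops FusedADC as a writable feature, but files written by earlier versions may still carry it, so the search path keeps both branches. A hedged sketch of the branch shape; the `rerankerFor` helper, `SearchScoreProvider` constructor, and `exact` factory are assumptions based on the jvector API, not verbatim from this patch:

```java
// Sketch: pick an approximate-then-rerank provider for legacy FusedADC files,
// and full-precision scoring otherwise. Helper names are assumptions.
SearchScoreProvider ssp;
if (features.contains(FeatureId.FUSED_ADC))
{
    var approximate = view.approximateScoreFunctionFor(queryVector, similarityFunction);
    var reranker = view.rerankerFor(queryVector, similarityFunction);  // assumed API
    ssp = new SearchScoreProvider(approximate, reranker);              // assumed ctor
}
else
{
    ssp = SearchScoreProvider.exact(queryVector, similarityFunction, view); // assumed factory
}
```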
CassandraOnHeapGraph.java
@@ -92,7 +92,7 @@
 import org.apache.cassandra.utils.CloseableIterator;
 import org.apache.lucene.util.StringHelper;

-import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_2_VERSION;
+import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_VERSION;

 public class CassandraOnHeapGraph<T> implements Accountable
 {
@@ -153,13 +153,18 @@ public CassandraOnHeapGraph(IndexContext context, boolean forSearching, Memtable
     vectorsByKey = forSearching ? new NonBlockingHashMap<>() : null;
     invalidVectorBehavior = forSearching ? InvalidVectorBehavior.FAIL : InvalidVectorBehavior.IGNORE;

+    // This is only a warning since it's not a fatal error to write without hierarchy
+    if (indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION < 4)
+        logger.warn("Hierarchical graphs configured but the node is configured with V3OnDiskFormat.JVECTOR_VERSION {}. " +
+                    "Skipping the setting for {}", V3OnDiskFormat.JVECTOR_VERSION, indexConfig.getIndexName());
+
     builder = new GraphIndexBuilder(vectorValues,
                                     similarityFunction,
                                     indexConfig.getAnnMaxDegree(),
                                     indexConfig.getConstructionBeamWidth(),
                                     indexConfig.getNeighborhoodOverflow(1.0f), // no overflow means add will be a bit slower but flush will be faster
                                     indexConfig.getAlpha(dimension > 3 ? 1.2f : 2.0f),
-                                    indexConfig.isHierarchyEnabled());
+                                    indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION >= 4);
     searchers = ThreadLocal.withInitial(() -> new GraphSearcherAccessManager(new GraphSearcher(builder.getGraph())));
 }
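The same guard appears in both `CassandraOnHeapGraph` and `CompactionGraph`: hierarchy is only honored when the effective jvector version is at least 4, and a mismatch is a warning rather than an error because writing a flat graph is still correct. A hypothetical helper, not in the patch, that centralizes the rule:

```java
// Hypothetical refactor: one place for the "hierarchy requires jvector >= 4" rule.
private static boolean effectiveHierarchyEnabled(boolean configured, String indexName)
{
    if (configured && V3OnDiskFormat.JVECTOR_VERSION < 4)
    {
        logger.warn("Hierarchical graphs configured but the node is configured with " +
                    "V3OnDiskFormat.JVECTOR_VERSION {}. Skipping the setting for {}",
                    V3OnDiskFormat.JVECTOR_VERSION, indexName);
        return false;
    }
    return configured;
}
```

Call sites would then pass `effectiveHierarchyEnabled(indexConfig.isHierarchyEnabled(), indexConfig.getIndexName())` to `GraphIndexBuilder`, keeping the warning and the gating logic in sync.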

@@ -421,7 +426,7 @@ public SegmentMetadata.ComponentMetadataMap flush(IndexComponents.ForWrite perIn
 try (var pqOutput = perIndexComponents.addOrGet(IndexComponentType.PQ).openOutput(true);
      var postingsOutput = perIndexComponents.addOrGet(IndexComponentType.POSTING_LISTS).openOutput(true);
      var indexWriter = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), indexFile.toPath())
-                       .withVersion(JVECTOR_2_VERSION) // always write old-version format since we're not using the new features
+                       .withVersion(JVECTOR_VERSION)
                        .withMapper(ordinalMapper)
                        .with(new InlineVectors(vectorValues.dimension()))
                        .withStartOffset(termsOffset)
@@ -560,14 +565,14 @@ private long writePQ(SequentialWriter writer, V5VectorPostingsWriter.RemappedPos
     return writer.position();

 // save (outside the synchronized block, this is io-bound not CPU)
-cv.write(writer, JVECTOR_2_VERSION);
+cv.write(writer, JVECTOR_VERSION);
 return writer.position();
 }

 static void writePqHeader(DataOutput writer, boolean unitVectors, CompressionType type)
 throws IOException
 {
-    if (V3OnDiskFormat.WRITE_JVECTOR3_FORMAT)
+    if (V3OnDiskFormat.JVECTOR_VERSION >= 3)
     {
         // version and optional fields
         writer.writeInt(CassandraDiskAnn.PQ_MAGIC);
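`writePqHeader` now keys on the version number instead of the retired `WRITE_JVECTOR3_FORMAT` flag: any version >= 3 gets the `PQ_MAGIC`-prefixed header, which a reader can detect by peeking at the first int. A hedged read-side sketch; the field layout past the magic is an assumption for illustration, and `input` is assumed to be a `DataInput` positioned at the start of the PQ section:

```java
// Sketch: distinguish headered (jvector >= 3) PQ files from bare legacy ones.
// Assumes the first int of a headered file is PQ_MAGIC, per writePqHeader above.
int first = input.readInt();
if (first == CassandraDiskAnn.PQ_MAGIC)
{
    int headerVersion = input.readInt(); // assumed to follow the magic
    // ... read the optional fields written for this version ...
}
else
{
    // legacy file: 'first' is already part of the PQ payload
}
```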
CompactionGraph.java
@@ -41,7 +41,6 @@
 import io.github.jbellis.jvector.graph.ListRandomAccessVectorValues;
 import io.github.jbellis.jvector.graph.disk.feature.Feature;
 import io.github.jbellis.jvector.graph.disk.feature.FeatureId;
-import io.github.jbellis.jvector.graph.disk.feature.FusedADC;
 import io.github.jbellis.jvector.graph.disk.feature.InlineVectors;
 import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndex;
 import io.github.jbellis.jvector.graph.disk.OnDiskGraphIndexWriter;
@@ -91,7 +90,7 @@

 import static java.lang.Math.max;
 import static java.lang.Math.min;
-import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_2_VERSION;
+import static org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat.JVECTOR_VERSION;

 public class CompactionGraph implements Closeable, Accountable
 {
@@ -200,13 +199,17 @@ else if (compressor instanceof BinaryQuantization)
 {
     throw new IllegalArgumentException("Unsupported compressor: " + compressor);
 }
+if (indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION < 4)
+    logger.warn("Hierarchical graphs configured but the node is configured with V3OnDiskFormat.JVECTOR_VERSION {}. " +
+                "Skipping the setting for {}", V3OnDiskFormat.JVECTOR_VERSION, indexConfig.getIndexName());

 builder = new GraphIndexBuilder(bsp,
                                 dimension,
                                 indexConfig.getAnnMaxDegree(),
                                 indexConfig.getConstructionBeamWidth(),
                                 indexConfig.getNeighborhoodOverflow(1.2f),
                                 indexConfig.getAlpha(dimension > 3 ? 1.2f : 1.4f),
-                                indexConfig.isHierarchyEnabled(),
+                                indexConfig.isHierarchyEnabled() && V3OnDiskFormat.JVECTOR_VERSION >= 4,
                                 compactionSimdPool, compactionFjp);

 termsFile = perIndexComponents.addOrGet(IndexComponentType.TERMS_DATA).file();
@@ -220,19 +223,10 @@ else if (compressor instanceof BinaryQuantization)

 private OnDiskGraphIndexWriter.Builder createTermsWriterBuilder() throws IOException
 {
-    var indexConfig = context.getIndexWriterConfig();
-    var writerBuilder = new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
-                        .withStartOffset(termsOffset)
-                        .with(new InlineVectors(dimension));
-    if (V3OnDiskFormat.WRITE_JVECTOR3_FORMAT && compressor instanceof ProductQuantization)
-    {
-        writerBuilder = writerBuilder.with(new FusedADC(indexConfig.getAnnMaxDegree(), (ProductQuantization) compressor));
-    }
-    else
-    {
-        writerBuilder = writerBuilder.withVersion(JVECTOR_2_VERSION);
-    }
-    return writerBuilder;
+    return new OnDiskGraphIndexWriter.Builder(builder.getGraph(), termsFile.toPath())
+           .withStartOffset(termsOffset)
+           .with(new InlineVectors(dimension))
+           .withVersion(JVECTOR_VERSION);
 }

@Override
@@ -401,7 +395,7 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException
 // write PQ (time to do this is negligible, don't bother doing it async)
 long pqOffset = pqOutput.getFilePointer();
 CassandraOnHeapGraph.writePqHeader(pqOutput.asSequentialWriter(), unitVectors, VectorCompression.CompressionType.PRODUCT_QUANTIZATION);
-compressedVectors.write(pqOutput.asSequentialWriter(), JVECTOR_2_VERSION); // VSTODO old version until we add APQ
+compressedVectors.write(pqOutput.asSequentialWriter(), JVECTOR_VERSION);
 long pqLength = pqOutput.getFilePointer() - pqOffset;

 // write postings asynchronously while we run cleanup()
Expand Down Expand Up @@ -455,18 +449,9 @@ public SegmentMetadata.ComponentMetadataMap flush() throws IOException

// write the graph edge lists and optionally fused adc features
var start = System.nanoTime();
if (writer.getFeatureSet().contains(FeatureId.FUSED_ADC))
{
try (var view = builder.getGraph().getView())
{
var supplier = Feature.singleStateFactory(FeatureId.FUSED_ADC, ordinal -> new FusedADC.State(view, (PQVectors) compressedVectors, ordinal));
writer.write(supplier);
}
}
else
{
writer.write(Map.of());
}
// Required becuase jvector 3 wrote the fused adc map here. We no longer write jvector 3, but we still
// write out the empty map.
writer.write(Map.of());
SAICodecUtils.writeFooter(writer.getOutput(), writer.checksum());
logger.info("Writing graph took {}ms", (System.nanoTime() - start) / 1_000_000);
long termsLength = writer.getOutput().position() - termsOffset;
VectorDotProductWithLengthTest.java
@@ -33,7 +33,8 @@ public class VectorDotProductWithLengthTest extends VectorTester
 public void setup() throws Throwable
 {
     super.setup();
-    V3OnDiskFormat.enableJVector3Format(); // we are testing unit vector detection which is part of the v3 changes
+    // we are testing unit vector detection which is part of the v3 changes, but continues in all subsequent versions
+    assert V3OnDiskFormat.JVECTOR_VERSION >= 3 : "This test assumes JVector version 3 or greater";
 }

// This tests our detection of unit-length vectors used with dot product and PQ.
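Replacing the deleted `enableJVector3Format()` hook with an assert keeps the test honest about its precondition, though a plain `assert` only fires when the test JVM enables assertions. A hedged alternative using JUnit 4's `Assume`, which skips rather than fails (not what the patch does):

```java
// Skip the test rather than assert when the configured version is too old.
org.junit.Assume.assumeTrue("unit-vector detection requires jvector version >= 3",
                            V3OnDiskFormat.JVECTOR_VERSION >= 3);
```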
64 changes: 53 additions & 11 deletions test/unit/org/apache/cassandra/index/sai/cql/VectorTypeTest.java
@@ -48,6 +48,7 @@
 import org.apache.cassandra.index.sai.disk.format.Version;
 import org.apache.cassandra.index.sai.disk.v1.IndexWriterConfig;
 import org.apache.cassandra.index.sai.disk.v1.SegmentBuilder;
+import org.apache.cassandra.index.sai.disk.v3.V3OnDiskFormat;
 import org.apache.cassandra.index.sai.disk.vector.CassandraOnHeapGraph;
 import org.apache.cassandra.index.sai.disk.vector.VectorSourceModel;
 import org.apache.cassandra.index.sai.plan.QueryController;
@@ -1120,8 +1121,22 @@ public void testPartitionKeyRestrictionCombinedWithSearchPredicate() throws Thro
 }

 @Test
-public void newJVectorOptionsTest()
+public void newJVectorOptionsTestVersion2()
 {
+    newJVectorOptionsTest(2);
+}
+
+// We skip version 3 since it isn't supported anymore
+@Test
+public void newJVectorOptionsTestVersion4()
+{
+    newJVectorOptionsTest(4);
+}
+
+public void newJVectorOptionsTest(int version)
+{
+    // Configure the version to ensure we don't fail for settings that are unsupported on earlier versions of jvector
+    V3OnDiskFormat.JVECTOR_VERSION = version;
+
     // This test ensures that we can set and retrieve new jvector parameters
     // (neighborhood_overflow, alpha, enable_hierarchy), and that they are honored at index build time.

@@ -1142,20 +1157,20 @@ public void newJVectorOptionsTest()
+ " 'alpha' : '1.8' "
+ '}');

// Insert some data
execute("INSERT INTO %s (pk, txt, vec) VALUES (0, 'row0', [1.0, 2.0, 3.0, 4.0])");
execute("INSERT INTO %s (pk, txt, vec) VALUES (1, 'row1', [2.0, 2.5, 3.5, 4.5])");
execute("INSERT INTO %s (pk, txt, vec) VALUES (2, 'row2', [5.0, 1.0, 1.0, 1.0])");
// Run basic query
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
// Insert many rows
for (int i = 0; i < 2000; i++)
execute("INSERT INTO %s (pk, txt, vec) VALUES (?, ?, ?)", i, "row" + i, randomVectorBoxed(4));

// Run basic query to confirm we can, no need to validate results
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
// Confirm that we can flush with custom options
flush();
// Run basic query
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
// Run basic query to confirm we can, no need to validate results
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
// Confirm that we can compact with custom options
compact();
// Run basic query
assertRows(execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2"), row(1), row(0));
// Run basic query to confirm we can, no need to validate results
execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");

// Confirm that the config picks up our custom settings.
StorageAttachedIndex saiIndex =
@@ -1172,4 +1187,31 @@
assertEquals(VectorSimilarityFunction.EUCLIDEAN, config.getSimilarityFunction());
}

+@Test
+public void testMultiVersionJVectorCompatibility() throws Throwable
+{
+    createTable("CREATE TABLE %s (pk int, vec vector<float, 4>, PRIMARY KEY(pk))");
+    createIndex("CREATE CUSTOM INDEX ON %s(vec) USING 'StorageAttachedIndex'");
+
+    // Note that we do not test the multi-version path where compaction produces different sstables, which is
+    // the norm in CNDB. If we had a way to compact individual sstables, we could.
+    disableCompaction();
+
+    // Capture the bound first: the loop body mutates JVECTOR_VERSION, so using it directly
+    // in the loop condition would end the loop after a single iteration.
+    int maxVersion = V3OnDiskFormat.JVECTOR_VERSION;
+    // Create index files for each valid version
+    for (int version = 2; version <= maxVersion; version++)
+    {
+        // Version 3 is no longer supported, so there is mild risk that it isn't covered here, but we can't write
+        // it any more, so there isn't much we can do.
+        if (version == 3)
+            continue;
+        V3OnDiskFormat.JVECTOR_VERSION = version;
+        for (int i = 0; i < CassandraOnHeapGraph.MIN_PQ_ROWS; i++)
+            execute("INSERT INTO %s (pk, vec) VALUES (?, ?)", i, randomVectorBoxed(4));
+        flush();
+    }
+
+    // Run basic query to confirm we can, no need to validate results
+    execute("SELECT pk FROM %s ORDER BY vec ANN OF [2.0, 2.0, 3.0, 4.0] LIMIT 2");
+}

}
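Because `JVECTOR_VERSION` is a mutable static, tests like `newJVectorOptionsTest(int)` and `testMultiVersionJVectorCompatibility` leave the last value they set in place. A hypothetical guard, not in the patch, that restores the default to avoid cross-test pollution:

```java
// Hypothetical pattern: restore the static version knob after mutating it.
int saved = V3OnDiskFormat.JVECTOR_VERSION;
try
{
    V3OnDiskFormat.JVECTOR_VERSION = 2;
    // ... write sstables at the old version ...
}
finally
{
    V3OnDiskFormat.JVECTOR_VERSION = saved;
}
```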