Skip to content

add public bioasq dataset #507

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions vectordb_bench/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class config:
LOAD_TIMEOUT_1536D_500K = 24 * 3600 # 24h
LOAD_TIMEOUT_1536D_5M = 240 * 3600 # 10d

LOAD_TIMEOUT_1024D_1M = 24 * 3600 # 24h
LOAD_TIMEOUT_1024D_10M = 240 * 3600 # 10d

OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_768D_100K = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_768D_1M = 24 * 3600 # 24h
Expand All @@ -62,6 +65,9 @@ class config:
OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600 # 10d

OPTIMIZE_TIMEOUT_1024D_1M = 24 * 3600 # 24h
OPTIMIZE_TIMEOUT_1024D_10M = 240 * 3600 # 10d

def display(self) -> str:
return [
i
Expand Down
29 changes: 29 additions & 0 deletions vectordb_bench/backend/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ class CaseType(Enum):
Performance1536D500K99P = 14
Performance1536D5M99P = 15

Performance1024D1M = 17
Performance1024D10M = 20

Performance1536D50K = 50

Custom = 100
Expand Down Expand Up @@ -309,6 +312,30 @@ class Performance1536D5M99P(IntFilterPerformanceCase):
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M


class Performance1024D1M(PerformanceCase):
    """Unfiltered search-performance case on the 1M-vector, 1024-dim Bioasq dataset.

    The class consists only of class-level annotated assignments; it defines
    no methods of its own.
    """

    # Enum member that identifies this case.
    case_id: CaseType = CaseType.Performance1024D1M
    # No filter rate is configured for this case.
    filter_rate: float | int | None = None
    # Bioasq dataset manager requested at the 1_000_000-vector size.
    dataset: DatasetManager = Dataset.BIOASQ.manager(1_000_000)
    name: str = "Search Performance Test (1M Dataset, 1024 Dim)"
    # The description embeds HTML (<b>) markup. Its continuation lines are part
    # of the string literal, so their leading whitespace is runtime text.
    description: str = """This case tests the search performance of a vector database with a medium 1M dataset
(<b>Bioasq 1M vectors</b>, 1024 dimensions), at varying parallel levels. Results will show index building time,
recall, and maximum QPS."""
    # Load / optimize time limits come from the 1024D-1M config constants
    # (presumably expressed in seconds -- confirm against `config`).
    load_timeout: float | int = config.LOAD_TIMEOUT_1024D_1M
    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1024D_1M


class Performance1024D10M(PerformanceCase):
    """Unfiltered search-performance case on the 10M-vector, 1024-dim Bioasq dataset.

    The class consists only of class-level annotated assignments; it defines
    no methods of its own.
    """

    # Enum member that identifies this case.
    case_id: CaseType = CaseType.Performance1024D10M
    # No filter rate is configured for this case.
    filter_rate: float | int | None = None
    # Bioasq dataset manager requested at the 10_000_000-vector size.
    dataset: DatasetManager = Dataset.BIOASQ.manager(10_000_000)
    name: str = "Search Performance Test (10M Dataset, 1024 Dim)"
    # The description embeds HTML (<b>) markup. Its continuation lines are part
    # of the string literal, so their leading whitespace is runtime text.
    description: str = """This case tests the search performance of a vector database with a large 10M dataset
(<b>Bioasq 10M vectors</b>, 1024 dimensions), at varying parallel levels. Results will show index building time,
recall, and maximum QPS."""
    # Load / optimize time limits come from the 1024D-10M config constants
    # (presumably expressed in seconds -- confirm against `config`).
    load_timeout: float | int = config.LOAD_TIMEOUT_1024D_10M
    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1024D_10M


class Performance1536D50K(PerformanceCase):
case_id: CaseType = CaseType.Performance1536D50K
filter_rate: float | int | None = None
Expand Down Expand Up @@ -497,6 +524,8 @@ def filters(self) -> Filter:
CaseType.Performance1536D5M1P: Performance1536D5M1P,
CaseType.Performance1536D500K99P: Performance1536D500K99P,
CaseType.Performance1536D5M99P: Performance1536D5M99P,
CaseType.Performance1024D1M: Performance1024D1M,
CaseType.Performance1024D10M: Performance1024D10M,
CaseType.Performance1536D50K: Performance1536D50K,
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
Expand Down
19 changes: 19 additions & 0 deletions vectordb_bench/backend/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,20 @@ class Cohere(BaseDataset):
scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]


class Bioasq(BaseDataset):
    """Dataset descriptor for Bioasq: 1024-dimensional vectors compared with cosine distance.

    Two sizes are declared, 1M ("MEDIUM") and 10M ("LARGE").
    """

    name: str = "Bioasq"
    dim: int = 1024
    metric_type: MetricType = MetricType.COSINE
    # Whether the shuffled variant is used follows the global config flag.
    use_shuffled: bool = config.USE_SHUFFLED_DATA
    # Declares that ground truth is available for this dataset.
    with_gt: bool = True
    # Maps each supported vector count to its SizeLabel.
    # NOTE(review): the third SizeLabel argument (1 / 10) is presumably a file
    # count -- confirm against the SizeLabel definition.
    _size_label: dict = {
        1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
        10_000_000: SizeLabel(10_000_000, "LARGE", 10),
    }
    # Declares that scalar labels are available, with the listed percentages
    # (values are fractions, ranging from 0.001 to 0.5).
    with_scalar_labels: bool = True
    scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]


class Glove(BaseDataset):
name: str = "Glove"
dim: int = 200
Expand Down Expand Up @@ -361,6 +375,7 @@ class Dataset(Enum):
LAION = LAION
GIST = GIST
COHERE = Cohere
BIOASQ = Bioasq
GLOVE = Glove
SIFT = SIFT
OPENAI = OpenAI
Expand All @@ -376,6 +391,8 @@ class DatasetWithSizeType(Enum):
CohereSmall = "Small Cohere (768dim, 100K)"
CohereMedium = "Medium Cohere (768dim, 1M)"
CohereLarge = "Large Cohere (768dim, 10M)"
BioasqMedium = "Medium Bioasq (1024dim, 1M)"
BioasqLarge = "Large Bioasq (1024dim, 10M)"
OpenAISmall = "Small OpenAI (1536dim, 50K)"
OpenAIMedium = "Medium OpenAI (1536dim, 500K)"
OpenAILarge = "Large OpenAI (1536dim, 5M)"
Expand Down Expand Up @@ -410,6 +427,8 @@ def get_optimize_timeout(self) -> float:
DatasetWithSizeType.CohereSmall: Dataset.COHERE.manager(100_000),
DatasetWithSizeType.CohereMedium: Dataset.COHERE.manager(1_000_000),
DatasetWithSizeType.CohereLarge: Dataset.COHERE.manager(10_000_000),
DatasetWithSizeType.BioasqMedium: Dataset.BIOASQ.manager(1_000_000),
DatasetWithSizeType.BioasqLarge: Dataset.BIOASQ.manager(10_000_000),
DatasetWithSizeType.OpenAISmall: Dataset.OPENAI.manager(50_000),
DatasetWithSizeType.OpenAIMedium: Dataset.OPENAI.manager(500_000),
DatasetWithSizeType.OpenAILarge: Dataset.OPENAI.manager(5_000_000),
Expand Down
5 changes: 5 additions & 0 deletions vectordb_bench/frontend/config/dbCaseConfigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,9 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
UICaseItem(cases=generate_normal_cases(CaseType.Performance768D10M)),
UICaseItem(cases=generate_normal_cases(CaseType.Performance768D1M)),
UICaseItem(isLine=True),
UICaseItem(cases=generate_normal_cases(CaseType.Performance1024D1M)),
UICaseItem(cases=generate_normal_cases(CaseType.Performance1024D10M)),
UICaseItem(isLine=True),
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D5M)),
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D500K)),
UICaseItem(cases=generate_normal_cases(CaseType.Performance1536D50K)),
Expand Down Expand Up @@ -301,6 +304,8 @@ def generate_label_filter_cases(dataset_with_size_type: DatasetWithSizeType) ->
CaseType.Performance1536D5M,
CaseType.Performance1536D500K,
CaseType.Performance1536D50K,
CaseType.Performance1024D1M,
CaseType.Performance1024D10M,
CaseType.CapacityDim960,
CaseType.CapacityDim128,
]
Expand Down
Loading