Skip to content

Commit

Permalink
Add YAML config files for parquet for BEIR (#2599)
Browse files Browse the repository at this point in the history
  • Loading branch information
valamuri2020 committed Sep 13, 2024
1 parent dd31b52 commit 11b543e
Show file tree
Hide file tree
Showing 116 changed files with 6,148 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Regression configuration: BEIR v1.0.0 ArguAna, BGE-base-en-v1.5 vectors
# stored as Parquet, flat index, queries read from the `vector` topic field.
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet

# Indexing settings.
index_path: indexes/parquet/arguana
index_type: flat
collection_class: ParquetDenseVectorCollection
generator_class: ParquetDenseVectorDocumentGenerator
index_threads: 16
index_options: ""

# Evaluation metrics; every entry invokes bin/trec_eval with its own params.
metrics:
  - metric: nDCG@10
    command: bin/trec_eval
    params: -c -m ndcg_cut.10
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: bin/trec_eval
    params: -c -m recall.100
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: bin/trec_eval
    params: -c -m recall.1000
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and relevance judgments.
topic_reader: JsonStringVector
topics:
  - name: "BEIR (v1.0.0): ArguAna"
    id: test
    path: topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.gz
    qrel: qrels.beir-v1.0.0-arguana.test.txt

# Retrieval runs; `results` is keyed by the metric names declared above.
models:
  - name: bge-flat-cached
    display: BGE-base-en-v1.5
    type: flat
    # Folded scalar: the lines below are joined with single spaces.
    params: >-
      -generator VectorQueryGenerator -topicField vector -removeQuery
      -threads 16 -hits 1000
    results:
      nDCG@10:
        - 0.6361
      R@100:
        - 0.9915
      R@1000:
        - 0.9964
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Regression configuration: BEIR v1.0.0 ArguAna, BGE-base-en-v1.5 vectors
# stored as Parquet, flat index, text queries encoded via `-encoder`.
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet

# Indexing settings.
index_path: indexes/parquet/arguana
index_type: flat
collection_class: ParquetDenseVectorCollection
generator_class: ParquetDenseVectorDocumentGenerator
index_threads: 16
index_options: ""

# Evaluation metrics; every entry invokes bin/trec_eval with its own params.
metrics:
  - metric: nDCG@10
    command: bin/trec_eval
    params: -c -m ndcg_cut.10
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: bin/trec_eval
    params: -c -m recall.100
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: bin/trec_eval
    params: -c -m recall.1000
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and relevance judgments. The topics file is plain TSV text, not the
# pre-encoded .jsonl.gz used by the cached configuration.
topic_reader: TsvString
topics:
  - name: "BEIR (v1.0.0): ArguAna"
    id: test
    path: topics.beir-v1.0.0-arguana.test.tsv.gz
    qrel: qrels.beir-v1.0.0-arguana.test.txt

# Retrieval runs; `results` is keyed by the metric names declared above.
models:
  - name: bge-flat-onnx
    display: BGE-base-en-v1.5
    type: flat
    # TSV topics carry query text rather than vectors, so the query is taken
    # from `title` (as the hnsw-onnx configuration for this corpus does) and
    # handed to -encoder. Folded scalar: lines are joined with single spaces.
    params: >-
      -generator VectorQueryGenerator -topicField title -removeQuery
      -threads 16 -hits 1000 -encoder BgeBaseEn15
    results:
      nDCG@10:
        - 0.6361
      R@100:
        - 0.9915
      R@1000:
        - 0.9964
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Regression configuration: BEIR v1.0.0 ArguAna, BGE-base-en-v1.5 vectors
# stored as Parquet, HNSW index, queries read from the `vector` topic field.
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet

# Indexing settings.
# NOTE(review): the flat configurations for this corpus declare the same
# index_path; confirm a flat and an HNSW index are never expected to coexist.
index_path: indexes/parquet/arguana
index_type: hnsw
collection_class: ParquetDenseVectorCollection
generator_class: ParquetDenseVectorDocumentGenerator
index_threads: 16
index_options: -M 16 -efC 100

# Evaluation metrics; every entry invokes bin/trec_eval with its own params.
metrics:
  - metric: nDCG@10
    command: bin/trec_eval
    params: -c -m ndcg_cut.10
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: bin/trec_eval
    params: -c -m recall.100
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: bin/trec_eval
    params: -c -m recall.1000
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and relevance judgments.
topic_reader: JsonStringVector
topics:
  - name: "BEIR (v1.0.0): ArguAna"
    id: test
    path: topics.beir-v1.0.0-arguana.test.bge-base-en-v1.5.jsonl.gz
    qrel: qrels.beir-v1.0.0-arguana.test.txt

# Retrieval runs; `results` is keyed by the metric names declared above.
models:
  - name: bge-hnsw-cached
    display: BGE-base-en-v1.5
    type: hnsw
    # Folded scalar: the lines below are joined with single spaces.
    params: >-
      -generator VectorQueryGenerator -topicField vector -removeQuery
      -threads 16 -hits 1000 -efSearch 1000
    results:
      nDCG@10:
        - 0.6361
      R@100:
        - 0.9915
      R@1000:
        - 0.9964
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Regression configuration: BEIR v1.0.0 ArguAna, BGE-base-en-v1.5 vectors
# stored as Parquet, HNSW index, text queries encoded via `-encoder`.
corpus: beir-v1.0.0-arguana.bge-base-en-v1.5
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/arguana.parquet

# Indexing settings.
# NOTE(review): the hnsw-cached configuration for this corpus targets the same
# index_path but builds with only `-M 16 -efC 100`; confirm the additional
# `-memoryBuffer 65536 -noMerge` options here are intended.
index_path: indexes/parquet/arguana
index_type: hnsw
collection_class: ParquetDenseVectorCollection
generator_class: ParquetDenseVectorDocumentGenerator
index_threads: 16
index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge

# Evaluation metrics; every entry invokes bin/trec_eval with its own params.
metrics:
  - metric: nDCG@10
    command: bin/trec_eval
    params: -c -m ndcg_cut.10
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: bin/trec_eval
    params: -c -m recall.100
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: bin/trec_eval
    params: -c -m recall.1000
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and relevance judgments. The topics file is plain TSV text.
topic_reader: TsvString
topics:
  - name: "BEIR (v1.0.0): ArguAna"
    id: test
    path: topics.beir-v1.0.0-arguana.test.tsv.gz
    qrel: qrels.beir-v1.0.0-arguana.test.txt

# Retrieval runs; `results` is keyed by the metric names declared above.
models:
  - name: bge-hnsw-onnx
    display: BGE-base-en-v1.5
    type: hnsw
    # The query is taken from the `title` topic field and handed to -encoder.
    # Folded scalar: the lines below are joined with single spaces.
    params: >-
      -generator VectorQueryGenerator -topicField title -removeQuery
      -threads 16 -hits 1000 -efSearch 1000 -encoder BgeBaseEn15
    results:
      nDCG@10:
        - 0.6361
      R@100:
        - 0.9915
      R@1000:
        - 0.9964
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Regression configuration: BEIR v1.0.0 BioASQ, BGE-base-en-v1.5 vectors
# stored as Parquet, flat index, queries read from the `vector` topic field.
corpus: beir-v1.0.0-bioasq.bge-base-en-v1.5
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/bioasq.parquet

# Indexing settings.
index_path: indexes/parquet/bioasq
index_type: flat
collection_class: ParquetDenseVectorCollection
generator_class: ParquetDenseVectorDocumentGenerator
index_threads: 16
index_options: ""

# Evaluation metrics; every entry invokes bin/trec_eval with its own params.
metrics:
  - metric: nDCG@10
    command: bin/trec_eval
    params: -c -m ndcg_cut.10
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: bin/trec_eval
    params: -c -m recall.100
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: bin/trec_eval
    params: -c -m recall.1000
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and relevance judgments.
topic_reader: JsonStringVector
topics:
  - name: "BEIR (v1.0.0): BioASQ"
    id: test
    path: topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.gz
    qrel: qrels.beir-v1.0.0-bioasq.test.txt

# Retrieval runs; `results` is keyed by the metric names declared above.
models:
  - name: bge-flat-cached
    display: BGE-base-en-v1.5
    type: flat
    # Folded scalar: the lines below are joined with single spaces.
    params: >-
      -generator VectorQueryGenerator -topicField vector -removeQuery
      -threads 16 -hits 1000
    results:
      nDCG@10:
        - 0.4149
      R@100:
        - 0.6317
      R@1000:
        - 0.8059
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Regression configuration: BEIR v1.0.0 BioASQ, BGE-base-en-v1.5 vectors
# stored as Parquet, flat index, text queries encoded via `-encoder`.
corpus: beir-v1.0.0-bioasq.bge-base-en-v1.5
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/bioasq.parquet

# Indexing settings.
index_path: indexes/parquet/bioasq
index_type: flat
collection_class: ParquetDenseVectorCollection
generator_class: ParquetDenseVectorDocumentGenerator
index_threads: 16
index_options: ""

# Evaluation metrics; every entry invokes bin/trec_eval with its own params.
metrics:
  - metric: nDCG@10
    command: bin/trec_eval
    params: -c -m ndcg_cut.10
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: bin/trec_eval
    params: -c -m recall.100
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: bin/trec_eval
    params: -c -m recall.1000
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and relevance judgments. The topics file is plain TSV text, not the
# pre-encoded .jsonl.gz used by the cached configuration.
topic_reader: TsvString
topics:
  - name: "BEIR (v1.0.0): BioASQ"
    id: test
    path: topics.beir-v1.0.0-bioasq.test.tsv.gz
    qrel: qrels.beir-v1.0.0-bioasq.test.txt

# Retrieval runs; `results` is keyed by the metric names declared above.
models:
  - name: bge-flat-onnx
    display: BGE-base-en-v1.5
    type: flat
    # TSV topics carry query text rather than vectors, so the query is taken
    # from `title` (as the ArguAna hnsw-onnx configuration does) and handed to
    # -encoder. Folded scalar: lines are joined with single spaces.
    params: >-
      -generator VectorQueryGenerator -topicField title -removeQuery
      -threads 16 -hits 1000 -encoder BgeBaseEn15
    results:
      nDCG@10:
        - 0.4149
      R@100:
        - 0.6317
      R@1000:
        - 0.8059
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Regression configuration: BEIR v1.0.0 BioASQ, BGE-base-en-v1.5 vectors
# stored as Parquet, HNSW index, queries read from the `vector` topic field.
corpus: beir-v1.0.0-bioasq.bge-base-en-v1.5
corpus_path: collections/beir-v1.0.0/bge-base-en-v1.5/bioasq.parquet

# Indexing settings.
# NOTE(review): the flat configurations for this corpus declare the same
# index_path; confirm a flat and an HNSW index are never expected to coexist.
index_path: indexes/parquet/bioasq
index_type: hnsw
collection_class: ParquetDenseVectorCollection
generator_class: ParquetDenseVectorDocumentGenerator
index_threads: 16
index_options: -M 16 -efC 100

# Evaluation metrics; every entry invokes bin/trec_eval with its own params.
metrics:
  - metric: nDCG@10
    command: bin/trec_eval
    params: -c -m ndcg_cut.10
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@100
    command: bin/trec_eval
    params: -c -m recall.100
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false
  - metric: R@1000
    command: bin/trec_eval
    params: -c -m recall.1000
    separator: "\t"
    parse_index: 2
    metric_precision: 4
    can_combine: false

# Topics and relevance judgments.
topic_reader: JsonStringVector
topics:
  - name: "BEIR (v1.0.0): BioASQ"
    id: test
    path: topics.beir-v1.0.0-bioasq.test.bge-base-en-v1.5.jsonl.gz
    qrel: qrels.beir-v1.0.0-bioasq.test.txt

# Retrieval runs; `results` is keyed by the metric names declared above.
models:
  - name: bge-hnsw-cached
    display: BGE-base-en-v1.5
    type: hnsw
    # NOTE(review): -efSearch is 5000 here but 1000 in the ArguAna HNSW
    # configurations; confirm the larger value is intentional.
    # Folded scalar: the lines below are joined with single spaces.
    params: >-
      -generator VectorQueryGenerator -topicField vector -removeQuery
      -threads 16 -hits 1000 -efSearch 5000
    results:
      nDCG@10:
        - 0.4149
      R@100:
        - 0.6317
      R@1000:
        - 0.8059
Loading

0 comments on commit 11b543e

Please sign in to comment.