From 4ddc64066aedecda9a772414b4b249589ffc1102 Mon Sep 17 00:00:00 2001
From: Jie Min <66545235+Stefan824@users.noreply.github.com>
Date: Sun, 8 Sep 2024 12:40:11 -0400
Subject: [PATCH] Add rank fusion - initial implementation (#2590)
---
pom.xml | 12 +
.../java/io/anserini/fusion/FuseTrecRuns.java | 131 +++++++++
.../io/anserini/fusion/RescoreMethod.java | 23 ++
src/main/java/io/anserini/fusion/TrecRun.java | 264 ++++++++++++++++++
.../java/io/anserini/fusion/TrecRunFuser.java | 153 ++++++++++
5 files changed, 583 insertions(+)
create mode 100644 src/main/java/io/anserini/fusion/FuseTrecRuns.java
create mode 100644 src/main/java/io/anserini/fusion/RescoreMethod.java
create mode 100644 src/main/java/io/anserini/fusion/TrecRun.java
create mode 100644 src/main/java/io/anserini/fusion/TrecRunFuser.java
diff --git a/pom.xml b/pom.xml
index df84a30a6..86551972e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -536,5 +536,17 @@
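+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>4.13.2</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-engine</artifactId>
+      <version>5.8.2</version>
+      <scope>test</scope>
+    </dependency>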
diff --git a/src/main/java/io/anserini/fusion/FuseTrecRuns.java b/src/main/java/io/anserini/fusion/FuseTrecRuns.java
new file mode 100644
index 000000000..5f2d45a11
--- /dev/null
+++ b/src/main/java/io/anserini/fusion/FuseTrecRuns.java
@@ -0,0 +1,131 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.fusion;
+
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.kohsuke.args4j.ParserProperties;
+import org.kohsuke.args4j.spi.StringArrayOptionHandler;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.List;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+/**
+ * Main entry point for Fusion.
+ */
+public class FuseTrecRuns {
+ private static final Logger LOG = LogManager.getLogger(FuseTrecRuns.class);
+
+ public static class Args extends TrecRunFuser.Args {
+ @Option(name = "-options", required = false, usage = "Print information about options.")
+ public Boolean options = false;
+
+ @Option(name = "-runs", handler = StringArrayOptionHandler.class, metaVar = "[file]", required = true,
+ usage = "Path to both run files to fuse")
+ public String[] runs;
+
+ @Option (name = "-resort", required = false, metaVar = "[flag]", usage="We Resort the Trec run files or not")
+ public boolean resort = false;
+ }
+
+ private final Args args;
+ private final TrecRunFuser fuser;
+ private final List runs = new ArrayList();
+
+ public FuseTrecRuns(Args args) throws IOException {
+ this.args = args;
+ this.fuser = new TrecRunFuser(args);
+
+ LOG.info(String.format("============ Initializing %s ============", FuseTrecRuns.class.getSimpleName()));
+ LOG.info("Runs: " + Arrays.toString(args.runs));
+ LOG.info("Run tag: " + args.runtag);
+ LOG.info("Fusion method: " + args.method);
+ LOG.info("Reciprocal Rank Fusion K value (rrf_k): " + args.rrf_k);
+ LOG.info("Alpha value for interpolation: " + args.alpha);
+ LOG.info("Max documents to output (k): " + args.k);
+ LOG.info("Pool depth: " + args.depth);
+ LOG.info("Resort TREC run files: " + args.resort);
+
+ try {
+ // Ensure positive depth and k values
+ if (args.depth <= 0) {
+ throw new IllegalArgumentException("Option depth must be greater than 0");
+ }
+ if (args.k <= 0) {
+ throw new IllegalArgumentException("Option k must be greater than 0");
+ }
+ } catch (Exception e) {
+ throw new IllegalArgumentException(String.format("Error: %s. Please check the provided arguments. Use the \"-options\" flag to print out detailed information about available options and their usage.\n",
+ e.getMessage()));
+ }
+
+ for (String runFile : args.runs) {
+ try {
+ Path path = Paths.get(runFile);
+ TrecRun run = new TrecRun(path, args.resort);
+ runs.add(run);
+ } catch (Exception e) {
+ throw new IllegalArgumentException(String.format("Error: %s. Please check the provided arguments. Use the \"-options\" flag to print out detailed information about available options and their usage.\n",
+ e.getMessage()));
+ }
+ }
+ }
+
+ public void run() throws IOException {
+ LOG.info("============ Launching Fusion ============");
+ fuser.fuse(runs);
+ }
+
+ public static void main(String[] args) throws Exception {
+ Args fuseArgs = new Args();
+ CmdLineParser parser = new CmdLineParser(fuseArgs, ParserProperties.defaults().withUsageWidth(120));
+
+ try {
+ parser.parseArgument(args);
+ } catch (CmdLineException e) {
+ if (fuseArgs.options) {
+ System.err.printf("Options for %s:\n\n", FuseTrecRuns.class.getSimpleName());
+ parser.printUsage(System.err);
+ ArrayList required = new ArrayList<>();
+ parser.getOptions().forEach(option -> {
+ if (option.option.required()) {
+ required.add(option.option.toString());
+ }
+ });
+ System.err.printf("\nRequired options are %s\n", required);
+ } else {
+ System.err.printf("Error: %s. For help, use \"-options\" to print out information about options.\n",
+ e.getMessage());
+ }
+ return;
+ }
+
+ try {
+ FuseTrecRuns fuser = new FuseTrecRuns(fuseArgs);
+ fuser.run();
+ } catch (Exception e) {
+ System.err.println(e.getMessage());
+ }
+ }
+}
diff --git a/src/main/java/io/anserini/fusion/RescoreMethod.java b/src/main/java/io/anserini/fusion/RescoreMethod.java
new file mode 100644
index 000000000..e07e9f082
--- /dev/null
+++ b/src/main/java/io/anserini/fusion/RescoreMethod.java
@@ -0,0 +1,23 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.fusion;
+
+public enum RescoreMethod {
+ RRF,
+ SCALE,
+ NORMALIZE;
+}
diff --git a/src/main/java/io/anserini/fusion/TrecRun.java b/src/main/java/io/anserini/fusion/TrecRun.java
new file mode 100644
index 000000000..14c63c7c2
--- /dev/null
+++ b/src/main/java/io/anserini/fusion/TrecRun.java
@@ -0,0 +1,264 @@
+/*
+* Anserini: A Lucene toolkit for reproducible information retrieval research
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package io.anserini.fusion;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.apache.commons.io.FileUtils;
+
+/**
+ * Wrapper class for a TREC run.
+*/
+public class TrecRun {
+ // Enum representing the columns in the TREC run file
+ public enum Column {
+ TOPIC, Q0, DOCID, RANK, SCORE, TAG
+ }
+
+ private List