Skip to content

Commit

Permalink
Add test case for tweets / delinting code (#2534)
Browse files Browse the repository at this point in the history
  • Loading branch information
lintool committed Aug 18, 2024
1 parent 46b6834 commit abf36d1
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 43 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.anserini.collection;

import org.junit.Before;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;

/**
 * Exercises {@link TweetCollection} over the sample collection stored under
 * {@code src/test/resources/sample_docs/tweets/collection3}, whose single
 * segment is the file {@code tweets.jsonl.gz}.
 *
 * <p>The iteration itself is driven by {@link DocumentCollectionTest}; this class
 * only supplies the fixture (paths, counts, expected field values) and the
 * per-document assertions.
 */
public class TweetCollectionCompressedTest extends DocumentCollectionTest<TweetCollection.Document> {

  /**
   * Populates the fixture fields read by the base class: the collection under
   * test, its one segment with the number of documents expected in it, the
   * overall totals, and the expected field values keyed by document id.
   *
   * @throws Exception if the base-class set-up fails
   */
  @Before
  public void setUp() throws Exception {
    super.setUp();

    collectionPath = Paths.get("src/test/resources/sample_docs/tweets/collection3");
    collection = new TweetCollection(collectionPath);

    Path segment1 = Paths.get("src/test/resources/sample_docs/tweets/collection3/tweets.jsonl.gz");

    segmentPaths.add(segment1);
    segmentDocCounts.put(segment1, 2);

    totalSegments = 1;
    totalDocs = 2;

    expected.put("123456789",
        Map.of("id", "123456789",
            "content", "this is the tweet contents.",
            "screen_name", "foo",
            "timestamp_ms", "1517482567000"));

    expected.put("123456787",
        Map.of("id", "123456787",
            "content", "this is the tweet contents, iteration should not have stopped.",
            "screen_name", "UserName",
            "timestamp_ms", "1362038400000"));
  }

  /**
   * Asserts that a document read from the collection matches its expected values.
   *
   * @param doc      the document produced by the collection; cast to
   *                 {@link TweetCollection.Document} to reach the tweet-specific accessors
   * @param expected expected values under the keys {@code id}, {@code content},
   *                 {@code screen_name} and {@code timestamp_ms}
   */
  @Override
  void checkDocument(SourceDocument doc, Map<String, String> expected) {
    // Cast once; the screen name and timestamp accessors are not on SourceDocument.
    TweetCollection.Document tweet = (TweetCollection.Document) doc;

    assertTrue(doc.indexable());
    assertEquals(expected.get("id"), doc.id());
    // Both contents() and raw() are checked against the same expected text.
    assertEquals(expected.get("content"), doc.contents());
    assertEquals(expected.get("content"), doc.raw());
    assertEquals(expected.get("screen_name"), tweet.getScreenName());
    // parseLong yields a primitive directly, avoiding the box-then-unbox of Long.valueOf.
    assertEquals(Long.parseLong(expected.get("timestamp_ms")), tweet.getTimestampMs().getAsLong());
  }
}
34 changes: 16 additions & 18 deletions src/test/java/io/anserini/doc/DataModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ public void setConversions(List<Conversion> conversions) {
this.conversions = conversions;
}

static class Topic {
static public class Topic {
private String name;
private String id;
private String path;
Expand All @@ -215,7 +215,7 @@ static class Topic {
public void setConvert_params(String convert_params) { this.convert_params = convert_params; }
}

static class Model {
static public class Model {
private String name;
private String display;
private String type;
Expand All @@ -235,7 +235,7 @@ static class Model {
public void setParams(String params) { this.params = params; }
}

static class Conversion {
static public class Conversion {
private String command;
private String in_file_ext;
private String out_file_ext;
Expand All @@ -251,7 +251,7 @@ static class Conversion {
public void setParams(String params) { this.params = params; }
}

static class Metric {
static public class Metric {
private String command;
private String params;
private String separator;
Expand Down Expand Up @@ -288,16 +288,14 @@ public String generateIndexingCommand(String collection) {
indexCommand = INDEX_INVERTED_DENSE_COMMAND;
}

StringBuilder builder = new StringBuilder();
builder.append(indexCommand).append(" \\\n");
builder.append(" -collection ").append(getCollection_class()).append(" \\\n");
builder.append(" -input ").append("/path/to/"+collection).append(" \\\n");
builder.append(" -generator ").append(getGenerator_class()).append(" \\\n");
builder.append(" -index ").append(getIndex_path()).append(" \\\n");
builder.append(" -threads ").append(getIndex_threads());
builder.append(" ").append(getIndex_options()).append(" \\\n");
builder.append(String.format(" >& logs/log.%s &", collection));
return builder.toString();
return indexCommand + " \\\n" +
" -collection " + getCollection_class() + " \\\n" +
" -input " + "/path/to/" + collection + " \\\n" +
" -generator " + getGenerator_class() + " \\\n" +
" -index " + getIndex_path() + " \\\n" +
" -threads " + getIndex_threads() +
" " + getIndex_options() + " \\\n" +
String.format(" >& logs/log.%s &", collection);
}

private String generateRunFile(String collection, Model model, Topic topic) {
Expand Down Expand Up @@ -354,8 +352,8 @@ public String generateConvertingCommand(String collection) {
builder.append(conversion.getCommand()).append(" \\\n");
builder.append(" --index").append(" ").append(getIndex_path()).append(" \\\n");
builder.append(" --topics").append(" ").append(topic.getId()).append(" \\\n");
builder.append(" --input").append(" ").append(generateRunFile(collection, model, topic) + ((conversion.getIn_file_ext() == null) ? "" : conversion.getIn_file_ext())).append(" \\\n");
builder.append(" --output").append(" ").append(generateRunFile(collection, model, topic) + conversion.getOut_file_ext()).append(" \\\n");
builder.append(" --input").append(" ").append(generateRunFile(collection, model, topic)).append((conversion.getIn_file_ext() == null) ? "" : conversion.getIn_file_ext()).append(" \\\n");
builder.append(" --output").append(" ").append(generateRunFile(collection, model, topic)).append(conversion.getOut_file_ext()).append(" \\\n");
if (conversion.getParams() != null) {
builder.append(" ").append(conversion.getParams());
}
Expand Down Expand Up @@ -400,12 +398,12 @@ public String generateEvalCommand(String collection) {
combinedEvalCmd.get(evalCmd).putIfAbsent(evalCmdResidual, new ArrayList<>());
combinedEvalCmd.get(evalCmd).get(evalCmdResidual).add(evalCmdOption);
} else {
builder.append(evalCmd + evalCmdOption + evalCmdResidual);
builder.append(evalCmd).append(evalCmdOption).append(evalCmdResidual);
}
}
for (Map.Entry<String, Map<String, List<String>>> entry : combinedEvalCmd.entrySet()) {
for (Map.Entry<String, List<String>> innerEntry : entry.getValue().entrySet()) {
builder.append(entry.getKey() + String.join("", innerEntry.getValue()) + innerEntry.getKey());
builder.append(entry.getKey()).append(String.join("", innerEntry.getValue())).append(innerEntry.getKey());
}
}
}
Expand Down
8 changes: 6 additions & 2 deletions src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@

import java.io.File;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;

public class GenerateRegressionDocsTest {
Expand All @@ -34,7 +36,8 @@ public void generateDocs() throws Exception {
ObjectMapper mapper = new ObjectMapper(new YAMLFactory());
URL templatesRoot = GenerateRegressionDocsTest.class.getResource("/docgen/templates/");

for (final File fileEntry : new File(templatesRoot.toURI()).listFiles()) {
assert templatesRoot != null;
for (final File fileEntry : Objects.requireNonNull(new File(templatesRoot.toURI()).listFiles())) {
// This is the name of the test, which can be different from the name of the collection,
// e.g., multiple topics run on the same collection.
String testName = fileEntry.getName().replaceAll(".template", "");
Expand All @@ -61,7 +64,8 @@ public void generateDocs() throws Exception {

StringSubstitutor sub = new StringSubstitutor(valuesMap);
URL template = GenerateRegressionDocsTest.class.getResource(String.format("/docgen/templates/%s.template", testName));
Scanner scanner = new Scanner(new File(template.toURI()), "UTF-8");
assert template != null;
Scanner scanner = new Scanner(new File(template.toURI()), StandardCharsets.UTF_8);
String text = scanner.useDelimiter("\\A").next();
scanner.close();
String resolvedString = sub.replace(text);
Expand Down
21 changes: 11 additions & 10 deletions src/test/java/io/anserini/doc/GenerateReproductionDocsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import java.io.File;
import java.io.FileInputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
Expand Down Expand Up @@ -120,14 +121,14 @@ public void generateReport() throws Exception {
.replace("$output", runFile);

tempCommands.put(shortTopicKey, commandString);
String evalCommandString = "";
for (Entry<String, Double> entry : topic.scores.get(0).entrySet()) {
StringBuilder evalCommandString = new StringBuilder();
for (Entry<String, Double> entry : topic.scores.getFirst().entrySet()) {
final String tempEvalCommand = "tools/eval/trec_eval.9.0.4/trec_eval "
+ evalCommandMap.get(evalKey).get(entry.getKey()) + " " + evalKey + " " + runFile;
evalCommandString += tempEvalCommand + "\n";
evalCommandString.append(tempEvalCommand).append("\n");
metricScoreMap.put(entry.getKey(), (Double) entry.getValue());
}
tempEvalCommands.put(shortTopicKey, evalCommandString);
tempEvalCommands.put(shortTopicKey, evalCommandString.toString());
topicMetricMap.put(shortTopicKey, metricScoreMap);

}
Expand All @@ -138,8 +139,8 @@ public void generateReport() throws Exception {

// Additional logic to generate report
int rowCounter = 1;
String htmlString = "";
Scanner rowScanner = new Scanner(new File(ROW_TEMPLATE_PATH), "UTF-8");
StringBuilder htmlString = new StringBuilder();
Scanner rowScanner = new Scanner(new File(ROW_TEMPLATE_PATH), StandardCharsets.UTF_8);
String rowTemplateString = rowScanner.useDelimiter("\\A").next();
rowScanner.close();

Expand All @@ -164,19 +165,19 @@ public void generateReport() throws Exception {
valuesMap.put("eval_cmd3", formatEvalCommand(evalCommands.get(model).get("dev")));

StringSubstitutor sub = new StringSubstitutor(valuesMap);
htmlString += sub.replace(rowTemplateString) + "\n";
htmlString.append(sub.replace(rowTemplateString)).append("\n");
rowCounter++;
}
Scanner htmlScanner = new Scanner(new File(HTML_TEMPLATE_PATH), "UTF-8");
Scanner htmlScanner = new Scanner(new File(HTML_TEMPLATE_PATH), StandardCharsets.UTF_8);
String htmlTemplateString = htmlScanner.useDelimiter("\\A").next();
htmlScanner.close();

Map<String, String> outputValuesMap = new HashMap<>();
outputValuesMap.put("title", "MS MARCO V1 Passage");
outputValuesMap.put("rows", htmlString);
outputValuesMap.put("rows", htmlString.toString());

StringSubstitutor sub = new StringSubstitutor(outputValuesMap);
String resolvedString = new String(sub.replace(htmlTemplateString));
String resolvedString = sub.replace(htmlTemplateString);
FileUtils.writeStringToFile(new File("docs/reproduce/msmarco-v1-passage.html"), resolvedString, "UTF-8");
}
}
14 changes: 8 additions & 6 deletions src/test/java/io/anserini/doc/JDIQ2018EffectivenessDocsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

import java.io.File;
import java.net.URL;
import java.util.Arrays;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
Expand Down Expand Up @@ -68,9 +68,9 @@ public Map<String, Object> transform() {
public String generateEffectiveness() {
Map<String, Object> data = transform();
StringBuilder builder = new StringBuilder();
for (String collection: Arrays.asList(new String[] { "disk12", "robust04", "robust05", "core17",
"wt10g", "gov2", "cw09b", "cw12b13", "mb11", "mb13"})) {
builder.append("#### "+collection+"\n");
for (String collection: new String[] { "disk12", "robust04", "robust05", "core17",
"wt10g", "gov2", "cw09b", "cw12b13", "mb11", "mb13"}) {
builder.append("#### ").append(collection).append("\n");
for (Map.Entry<String, Object> entry2 : ((Map<String, Object>)data.get(collection)).entrySet()) {
String metric = entry2.getKey();
builder.append(String.format("%1$-40s|", metric.toUpperCase()));
Expand Down Expand Up @@ -113,16 +113,18 @@ public String generateEffectiveness() {
}

@Test
public void main() throws Exception {
public void mainTest() throws Exception {
ObjectMapper mapper = new ObjectMapper(new YAMLFactory());
URL yaml = JDIQ2018EffectivenessDocsTest.class.getResource("/jdiq2018/models.yaml");
assert yaml != null;
Model data = mapper.readValue(new File(yaml.toURI()), Model.class);
Map<String, String> valuesMap = new HashMap<>();
valuesMap.put("results", data.generateEffectiveness());

StringSubstitutor sub = new StringSubstitutor(valuesMap);
URL template = GenerateRegressionDocsTest.class.getResource("/jdiq2018/doc.template");
Scanner scanner = new Scanner(new File(template.toURI()), "UTF-8");
assert template != null;
Scanner scanner = new Scanner(new File(template.toURI()), StandardCharsets.UTF_8);
String text = scanner.useDelimiter("\\A").next();
scanner.close();
String resolvedString = sub.replace(text);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

Expand Down Expand Up @@ -1098,7 +1097,7 @@ public void basic() throws OrtException, IOException, URISyntaxException {
}

@Test
public void maxlen() throws OrtException, IOException {
public void testMaxLength() throws OrtException {
DenseEncoder encoder = null;
float[] expectedWeights = (float[]) longExamples[0][1];
String[] inputStrings = (String[]) longExamples[0][0];
Expand Down
9 changes: 5 additions & 4 deletions src/test/java/io/anserini/index/BasicIndexOperationsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,11 @@ public void readNorms() throws Exception {

Map<Integer, Integer> norms = new HashMap<>();
for (LeafReaderContext context : reader.leaves()) {
LeafReader leafReader = context.reader();
NumericDocValues docValues = leafReader.getNormValues("contents");
while (docValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
norms.put(docValues.docID() + context.docBase, SmallFloat.byte4ToInt((byte) docValues.longValue()));
try(LeafReader leafReader = context.reader()) {
NumericDocValues docValues = leafReader.getNormValues("contents");
while (docValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
norms.put(docValues.docID() + context.docBase, SmallFloat.byte4ToInt((byte) docValues.longValue()));
}
}
}

Expand Down
4 changes: 3 additions & 1 deletion src/test/java/io/anserini/index/CloneIndexTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jetbrains.annotations.NotNull;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
Expand Down Expand Up @@ -81,7 +82,7 @@ public void testCloneIndex() throws Exception {
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir2, config);

LeafReader leafReader = reader.leaves().get(0).reader();
LeafReader leafReader = reader.leaves().getFirst().reader();
CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
writer.addIndexes(new MyFilterCodecReader(codecReader));
writer.commit();
Expand Down Expand Up @@ -149,6 +150,7 @@ public void checkIntegrity() {
fieldsProducer.iterator();
}

@NotNull
@Override
public Iterator<String> iterator() {
return fieldsProducer.iterator();
Expand Down
Binary file not shown.

0 comments on commit abf36d1

Please sign in to comment.