diff --git a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java
index 5cbe62d89365..f9ae81837db7 100644
--- a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java
+++ b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java
@@ -20,9 +20,13 @@
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream;
-import org.languagetool.*;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.Languages;
+import org.languagetool.MultiThreadedJLanguageTool;
import org.languagetool.bitext.TabBitextReader;
-import org.languagetool.language.*;
+import org.languagetool.language.AmericanEnglish;
+import org.languagetool.language.English;
import org.languagetool.language.identifier.LanguageIdentifier;
import org.languagetool.language.identifier.LanguageIdentifierService;
import org.languagetool.rules.Rule;
@@ -35,7 +39,13 @@
import org.xml.sax.SAXException;
import javax.xml.parsers.ParserConfigurationException;
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
diff --git a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java
index f9e6c3f87917..f130c02887fd 100644
--- a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java
+++ b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java
@@ -277,7 +277,7 @@ public void testEnglishStdIn4() throws Exception {
String[] args = {"-l", "en", "--json", "-"};
Main.main(args);
String output = new String(this.out.toByteArray());
- assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":\""));
+ assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":"));
assertTrue("Got: " + output, output.contains("\"language\":{\"name\":\"English\",\"code\":\"en\""));
assertTrue("Got: " + output, output.contains("{\"message\":\"Use \\\"a\\\" instead of 'an' if the following word doesn't start with a vowel sound, e.g. 'a sentence', 'a university'.\""));
assertTrue("Got: " + output, output.contains("\"replacements\":[{\"value\":\"a\"}]"));
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe
index b798387ded9d..ee091ce48847 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs
index af3abe0553ad..ba80b675fe1d 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si
index 7fce7afe0e39..4a4978085455 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1
index 6e93fa9e6973..e49de5d67abe 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe
index d04ba2ee5c95..15e6f67ed0a2 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs
index 36ff0f65c139..5d62d79a170c 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si
index 3b116de2a4f0..ce3c306f6fab 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1
index 6e93fa9e6973..15102c4ff7f2 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe
index dfbaafafcd61..fb7f2188cace 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs
index 552a0d09868e..c853d3bd23f0 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si
index 3f231fb03f90..10159d3fd64c 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1
index 6e93fa9e6973..4b452afb987e 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 differ
diff --git a/languagetool-dev/pom.xml b/languagetool-dev/pom.xml
index de8a11b203bb..bba8bd88b0cd 100644
--- a/languagetool-dev/pom.xml
+++ b/languagetool-dev/pom.xml
@@ -58,6 +58,10 @@
<groupId>org.languagetool</groupId>
<artifactId>languagetool-wikipedia</artifactId>
</dependency>
+ <dependency>
+   <groupId>org.apache.lucene</groupId>
+   <artifactId>lucene-core</artifactId>
+ </dependency>
<dependency>
<groupId>org.mariadb.jdbc</groupId>
<artifactId>mariadb-java-client</artifactId>
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
index ab45341f7edc..39c001d227a5 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
@@ -47,32 +47,33 @@ public static void main(String[] args) throws IOException {
try (FSDirectory directory = FSDirectory.open(dir.toPath());
IndexReader reader = DirectoryReader.open(directory)) {
IndexSearcher searcher = new IndexSearcher(reader);
- Fields fields = MultiFields.getFields(reader);
- Terms ngrams = fields.terms("ngram");
- TermsEnum iterator = ngrams.iterator();
- BytesRef next;
- int i = 0;
- while ((next = iterator.next()) != null) {
- String term = next.utf8ToString();
- if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
- if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
- //System.out.println("ignore: " + term);
- continue;
- }
- TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
- if (topDocs.totalHits == 0) {
- throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
- } else if (topDocs.totalHits == 1) {
- int docId = topDocs.scoreDocs[0].doc;
- Document document = reader.document(docId);
- Long count = Long.parseLong(document.get("count"));
- //System.out.println(term + " -> " + count);
- totalCount += count;
- if (++i % 10_000 == 0) {
- System.out.println(i + " ... " + totalCount);
+ for (String field : FieldInfos.getIndexedFields(reader)) {
+ Terms ngrams = MultiTerms.getTerms(reader, field);
+ TermsEnum iterator = ngrams.iterator();
+ BytesRef next;
+ int i = 0;
+ while ((next = iterator.next()) != null) {
+ String term = next.utf8ToString();
+ if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
+ if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
+ //System.out.println("ignore: " + term);
+ continue;
+ }
+ TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
+ if (topDocs.totalHits.value == 0) {
+ throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits.value);
+ } else if (topDocs.totalHits.value == 1) {
+ int docId = topDocs.scoreDocs[0].doc;
+ Document document = reader.document(docId);
+ Long count = Long.parseLong(document.get("count"));
+ //System.out.println(term + " -> " + count);
+ totalCount += count;
+ if (++i % 10_000 == 0) {
+ System.out.println(i + " ... " + totalCount);
+ }
+ } else {
+ throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
}
- } else {
- throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
}
}
}
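The StartTokenCounter hunk follows the standard Lucene 8 migration: MultiFields.getFields was removed, so indexed field names now come from FieldInfos.getIndexedFields and their terms from MultiTerms.getTerms, while TopDocs.totalHits became a TotalHits object whose count is the long field value. A minimal, self-contained sketch of that enumeration pattern under the same assumptions as the code above (an index with an "ngram" field; the index path argument is a placeholder):

    import org.apache.lucene.index.*;
    import org.apache.lucene.search.*;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.BytesRef;
    import java.nio.file.Paths;

    public class NgramTermDump {
      public static void main(String[] args) throws Exception {
        // args[0]: path to an ngram index directory (placeholder)
        try (FSDirectory dir = FSDirectory.open(Paths.get(args[0]));
             IndexReader reader = DirectoryReader.open(dir)) {
          IndexSearcher searcher = new IndexSearcher(reader);
          for (String field : FieldInfos.getIndexedFields(reader)) {   // replaces MultiFields.getFields(reader)
            Terms terms = MultiTerms.getTerms(reader, field);          // replaces fields.terms(field)
            if (terms == null) {
              continue;
            }
            TermsEnum it = terms.iterator();
            BytesRef next;
            while ((next = it.next()) != null) {
              String term = next.utf8ToString();
              TopDocs topDocs = searcher.search(new TermQuery(new Term(field, term)), 1);
              long hits = topDocs.totalHits.value;                     // was a plain int in Lucene 5
              System.out.println(term + " -> " + hits);
            }
          }
        }
      }
    }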
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java
index 8b352f75be7b..3a06ee9819d0 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java
@@ -1,4 +1,4 @@
-/* LanguageTool, a natural language style checker
+/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
@@ -20,7 +20,11 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.document.StringField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
@@ -95,13 +99,8 @@ private Document getDoc(String ngram, long count) {
}
@NotNull
- private LongField getCountField(long count) {
- FieldType fieldType = new FieldType();
- fieldType.setStored(true);
- fieldType.setOmitNorms(true);
- fieldType.setNumericType(FieldType.NumericType.LONG);
- fieldType.setDocValuesType(DocValuesType.NUMERIC);
- return new LongField("count", count, fieldType);
+ private LongPoint getCountField(long count) {
+ return new LongPoint("count", count);
}
private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java
index 4e130e87bcb5..8e2e92b25566 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java
@@ -177,16 +177,16 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
if (newReader != null) {
reader = newReader;
}*/
- index.reader = DirectoryReader.open(index.indexWriter, true);
+ index.reader = DirectoryReader.open(index.indexWriter, true, true);
index.searcher = new IndexSearcher(index.reader);
for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
Term ngram = new Term("ngram", entry.getKey());
TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2);
//System.out.println(ngram + " ==> " + topDocs.totalHits);
- if (topDocs.totalHits == 0) {
+ if (topDocs.totalHits.value == 0) {
Document doc = getDoc(entry.getKey(), entry.getValue());
index.indexWriter.addDocument(doc);
- } else if (topDocs.totalHits == 1) {
+ } else if (topDocs.totalHits.value == 1) {
int docNumber = topDocs.scoreDocs[0].doc;
Document document = index.reader.document(docNumber);
long oldCount = Long.parseLong(document.getField("count").stringValue());
@@ -195,7 +195,7 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
// would probably be faster, but we currently rely on the count being a common field:
//indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue());
- } else if (topDocs.totalHits > 1) {
+ } else if (topDocs.totalHits.value > 1) {
throw new RuntimeException("Got more than one hit for: " + ngram);
}
//System.out.println(" " + entry.getKey() + " -> " + entry.getValue());
@@ -221,13 +221,8 @@ private Document getDoc(String ngram, long count) {
}
@NotNull
- private LongField getCountField(long count) {
- FieldType fieldType = new FieldType();
- fieldType.setStored(true);
- fieldType.setOmitNorms(true);
- fieldType.setNumericType(FieldType.NumericType.LONG);
- fieldType.setDocValuesType(DocValuesType.NUMERIC);
- return new LongField("count", count, fieldType);
+ private LongPoint getCountField(long count) {
+ return new LongPoint("count", count);
}
private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
@@ -269,7 +264,7 @@ static class LuceneLiveIndex {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
directory = FSDirectory.open(dir.toPath());
indexWriter = new IndexWriter(directory, config);
- reader = DirectoryReader.open(indexWriter, false);
+ reader = DirectoryReader.open(indexWriter, false, false);
searcher = new IndexSearcher(reader);
}
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java
index ad0e60af36a3..e8a2365c3111 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java
@@ -34,7 +34,9 @@
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.List;
+import java.util.Set;
/**
* Prototype to find potential upper-only phrases like "Persischer Golf".
@@ -57,47 +59,48 @@ public static void main(String[] args) throws IOException {
FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
IndexSearcher searcher = new IndexSearcher(reader);
- Fields fields = MultiFields.getFields(reader);
- Terms terms = fields.terms("ngram");
- TermsEnum termsEnum = terms.iterator();
- int count = 0;
- BytesRef next;
- while ((next = termsEnum.next()) != null) {
- String term = next.utf8ToString();
- count++;
- //term = "persischer Golf"; // for testing
- String[] parts = term.split(" ");
- boolean useful = true;
- int lcCount = 0;
- List<String> ucParts = new ArrayList<>();
- for (String part : parts) {
- if (part.length() < MIN_TERM_LEN) {
- useful = false;
- break;
+ for (String field : FieldInfos.getIndexedFields(reader)) {
+ Terms terms = MultiTerms.getTerms(reader, field);
+ TermsEnum termsEnum = terms.iterator();
+ int count = 0;
+ BytesRef next;
+ while ((next = termsEnum.next()) != null) {
+ String term = next.utf8ToString();
+ count++;
+ //term = "persischer Golf"; // for testing
+ String[] parts = term.split(" ");
+ boolean useful = true;
+ int lcCount = 0;
+ List<String> ucParts = new ArrayList<>();
+ for (String part : parts) {
+ if (part.length() < MIN_TERM_LEN) {
+ useful = false;
+ break;
+ }
+ String uc = StringTools.uppercaseFirstChar(part);
+ if (!part.equals(uc)) {
+ lcCount++;
+ }
+ ucParts.add(uc);
}
- String uc = StringTools.uppercaseFirstChar(part);
- if (!part.equals(uc)) {
- lcCount++;
+ if (!useful || lcCount == 0 || lcCount == 2) {
+ continue;
}
- ucParts.add(uc);
- }
- if (!useful || lcCount == 0 || lcCount == 2) {
- continue;
- }
- String uppercase = String.join(" ", ucParts);
- if (term.equals(uppercase)){
- continue;
- }
- long thisCount = getOccurrenceCount(reader, searcher, term);
- long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
- if (count % 10_000 == 0) {
- System.err.println(count + " @ " + term);
- }
- if (thisCount > LIMIT || thisUpperCount > LIMIT) {
- if (thisUpperCount > thisCount) {
- if (isRelevant(lt, term)) {
- float factor = (float)thisUpperCount / thisCount;
- System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
+ String uppercase = String.join(" ", ucParts);
+ if (term.equals(uppercase)) {
+ continue;
+ }
+ long thisCount = getOccurrenceCount(reader, searcher, term);
+ long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
+ if (count % 10_000 == 0) {
+ System.err.println(count + " @ " + term);
+ }
+ if (thisCount > LIMIT || thisUpperCount > LIMIT) {
+ if (thisUpperCount > thisCount) {
+ if (isRelevant(lt, term)) {
+ float factor = (float) thisUpperCount / thisCount;
+ System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
+ }
}
}
}
@@ -117,7 +120,7 @@ private static boolean isRelevant(JLanguageTool lt, String term) throws IOExcept
private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term) throws IOException {
TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
- if (topDocs.totalHits == 0) {
+ if (topDocs.totalHits.value == 0) {
return 0;
}
int docId = topDocs.scoreDocs[0].doc;
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java
index 54d6dc8d8587..74a6b0c8f887 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java
@@ -47,10 +47,9 @@ public static void main(String[] args) throws IOException {
FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
IndexSearcher searcher = new IndexSearcher(reader);
- Fields fields = MultiFields.getFields(reader);
+ Terms terms = MultiTerms.getTerms(reader, "ngram");
long max = 0;
String maxTerm = "";
- Terms terms = fields.terms("ngram");
TermsEnum termsEnum = terms.iterator();
int count = 0;
BytesRef next;
@@ -71,5 +70,6 @@ public static void main(String[] args) throws IOException {
}
System.out.println("Max: " + max + " for " + maxTerm);
}
+
}
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java
index d5caea350778..b7c43dc46619 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java
@@ -58,8 +58,7 @@ public static void main(String[] args) throws IOException {
String ngramIndexDir = args[0];
FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
- Fields fields = MultiFields.getFields(reader);
- Terms terms = fields.terms("ngram");
+ Terms terms = MultiTerms.getTerms(reader, "ngram");
TermsEnum termsEnum = terms.iterator();
int i = 0;
int needed = 0;
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java
index 0d97a4df0bab..e4ed2832f7a0 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java
@@ -20,7 +20,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
diff --git a/languagetool-language-modules/ja/pom.xml b/languagetool-language-modules/ja/pom.xml
index 12dfea7942b2..58ee2b8b9d68 100644
--- a/languagetool-language-modules/ja/pom.xml
+++ b/languagetool-language-modules/ja/pom.xml
@@ -40,7 +40,7 @@
- <groupId>com.github.lucene-gosen</groupId>
+ <groupId>org.omegat.lucene</groupId>
<artifactId>lucene-gosen</artifactId>
<classifier>ipadic</classifier>
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java
index d3b2adfc4abe..5f99596bf348 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java
@@ -19,8 +19,8 @@
package org.languagetool.dev.dumpcheck;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java
index effff0cf9460..8cb404ed5924 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java
@@ -20,8 +20,8 @@
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;
import java.io.IOException;
@@ -36,7 +36,6 @@ public final class AnyCharTokenizer extends Tokenizer {
private static final int MAX_WORD_LEN = Integer.MAX_VALUE; // extend the word length!
private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
- private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class);
@@ -83,7 +82,7 @@ public boolean incrementToken() throws IOException {
while(true) {
if(this.bufferIndex >= this.dataLen) {
this.offset += this.dataLen;
- this.charUtils.fill(this.ioBuffer, this.input);
+ CharacterUtils.fill(this.ioBuffer, this.input);
if(this.ioBuffer.getLength() == 0) {
this.dataLen = 0;
if(length <= 0) {
@@ -97,7 +96,7 @@ public boolean incrementToken() throws IOException {
this.bufferIndex = 0;
}
- int c = this.charUtils.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength());
+ int c = Character.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex);
int charCount = Character.charCount(c);
this.bufferIndex += charCount;
if(this.isTokenChar(c)) {
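In Lucene 7+ CharacterUtils moved to org.apache.lucene.analysis and exposes only static helpers, which is why the charUtils instance field disappears above: the buffer is filled with the static CharacterUtils.fill and the next code point is read with plain Character.codePointAt. A condensed sketch of that read loop (the reader and the token handling are stand-ins for what AnyCharTokenizer does):

    import org.apache.lucene.analysis.CharacterUtils;
    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    final class CodePointScan {
      // Walks a Reader buffer by buffer and code point by code point, Lucene-8 style.
      static void scan(Reader input) throws IOException {
        CharacterUtils.CharacterBuffer buffer = CharacterUtils.newCharacterBuffer(4096);
        while (true) {
          CharacterUtils.fill(buffer, input);            // static call; no CharacterUtils.getInstance()
          if (buffer.getLength() == 0) {
            break;                                       // reader exhausted
          }
          int pos = buffer.getOffset();
          int end = buffer.getOffset() + buffer.getLength();
          while (pos < end) {
            int c = Character.codePointAt(buffer.getBuffer(), pos);  // replaces charUtils.codePointAt(...)
            pos += Character.charCount(c);
            // ... classify c / append it to the current token here ...
          }
        }
      }

      public static void main(String[] args) throws IOException {
        scan(new StringReader("example input with a supplementary char: \uD834\uDD1E"));
      }
    }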
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java
index 622c7f7291f5..e356998b9f95 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java
@@ -124,7 +124,7 @@ private SpanQuery asSpanQuery(BooleanClause query) {
} else {
Set<Term> terms = new HashSet<>();
try {
- indexSearcher.createWeight(query.getQuery(), false).extractTerms(terms);
+ indexSearcher.createWeight(query.getQuery(), ScoreMode.COMPLETE_NO_SCORES, 1.0f).extractTerms(terms);
} catch (IOException e) {
throw new RuntimeException(e);
}
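IndexSearcher.createWeight in Lucene 8 takes a ScoreMode and a boost instead of a needsScores boolean; since the query builder only needs the matched terms, ScoreMode.COMPLETE_NO_SCORES with a boost of 1.0f is the natural replacement, as in the hunk above. A short sketch of term extraction with that signature (the explicit rewrite is a defensive step for queries that are not already in primitive form):

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.ScoreMode;
    import java.io.IOException;
    import java.util.HashSet;
    import java.util.Set;

    final class TermExtraction {
      // Collects the terms a query would match, without computing scores.
      static Set<Term> extractTerms(IndexSearcher searcher, Query query) throws IOException {
        Set<Term> terms = new HashSet<>();
        Query rewritten = searcher.rewrite(query);
        searcher.createWeight(rewritten, ScoreMode.COMPLETE_NO_SCORES, 1.0f).extractTerms(terms);
        return terms;
      }
    }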
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java
index 22dbb89e625c..8564e197a446 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java
@@ -33,6 +33,7 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
@@ -101,7 +102,7 @@ public int getDocCount() throws IOException {
private int getDocCount(IndexSearcher indexSearcher) throws IOException {
Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL);
TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1);
- if (search.totalHits != 1) {
+ if (search.totalHits.value != 1) {
return -1;
}
ScoreDoc scoreDoc = search.scoreDocs[0];
@@ -334,7 +335,7 @@ class SearchRunnable implements Runnable {
private List<MatchingSentence> matchingSentences;
private Exception exception;
private boolean tooManyLuceneMatches;
- private int luceneMatchCount;
+ private long luceneMatchCount;
private int maxDocChecked;
private int docsChecked;
private int numDocs;
@@ -356,7 +357,7 @@ public void run() {
PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(query);
long luceneTime = System.currentTimeMillis() - t2;
long t3 = System.currentTimeMillis();
- luceneMatchCount = limitedTopDocs.topDocs.totalHits;
+ luceneMatchCount = limitedTopDocs.topDocs.totalHits.value;
tooManyLuceneMatches = limitedTopDocs.topDocs.scoreDocs.length >= maxHits;
MatchingSentencesResult res = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool);
matchingSentences = res.matchingSentences;
@@ -382,7 +383,7 @@ boolean hasTooManyLuceneMatches() {
return tooManyLuceneMatches;
}
- int getLuceneMatchCount() {
+ long getLuceneMatchCount() {
return luceneMatchCount;
}
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java
index 40c860af0650..6c39036346e0 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java
@@ -35,7 +35,7 @@ public class SearcherResult {
private int docCount;
private int maxDocChecked;
private boolean hasTooManyLuceneMatches;
- private int luceneMatchCount;
+ private long luceneMatchCount;
private int skipHits;
private int numDocs;
@@ -81,11 +81,11 @@ public boolean hasTooManyLuceneMatches() {
return hasTooManyLuceneMatches;
}
- public void setLuceneMatchCount(int luceneMatchCount) {
+ public void setLuceneMatchCount(long luceneMatchCount) {
this.luceneMatchCount = luceneMatchCount;
}
- public int getLuceneMatchCount() {
+ public long getLuceneMatchCount() {
return luceneMatchCount;
}
diff --git a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java
index 8febf19486a4..be0b8a93e752 100644
--- a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java
+++ b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java
@@ -271,11 +271,11 @@ public void testSeveralElements() throws Exception {
assertMatches(makeRule("How do you"), 1); // known overmatching
}
- private void assertMatches(AbstractPatternRule patternRule, int expectedMatches) throws Exception {
+ private void assertMatches(AbstractPatternRule patternRule, long expectedMatches) throws Exception {
PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language, searcher);
Query query = queryBuilder.buildRelaxedQuery(patternRule);
//System.out.println("QUERY: " + query);
- int matches = searcher.search(query, 1000).totalHits;
+ long matches = searcher.search(query, 1000).totalHits.value;
assertEquals("Query failed: " + query, expectedMatches, matches);
}
diff --git a/pom.xml b/pom.xml
index f6ccc7e61d3c..b1423e960ece 100644
--- a/pom.xml
+++ b/pom.xml
@@ -163,6 +163,7 @@
0.8.2
2.1.2
6.2.1
+ 8.11.0
1.2.2
portable-1.8.2
70.1
@@ -217,7 +218,7 @@
2.16.1
0.02
1.18.30
- 5.5.5
+ 8.11.3
2.1.9
0.6
@@ -275,9 +276,9 @@
<version>${jackson.version}</version>
- <groupId>com.github.lucene-gosen</groupId>
+ <groupId>org.omegat.lucene</groupId>
<artifactId>lucene-gosen</artifactId>
- <version>${com.github.lucene-gosen.version}</version>
+ <version>${org.omegat.lucene.lucene-gosen.version}</version>
<classifier>ipadic</classifier>