diff --git a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java
index 5cbe62d89365..f9ae81837db7 100644
--- a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java
+++ b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java
@@ -20,9 +20,13 @@
 import org.apache.commons.io.ByteOrderMark;
 import org.apache.commons.io.input.BOMInputStream;
 
-import org.languagetool.*;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.Languages;
+import org.languagetool.MultiThreadedJLanguageTool;
 import org.languagetool.bitext.TabBitextReader;
-import org.languagetool.language.*;
+import org.languagetool.language.AmericanEnglish;
+import org.languagetool.language.English;
 import org.languagetool.language.identifier.LanguageIdentifier;
 import org.languagetool.language.identifier.LanguageIdentifierService;
 import org.languagetool.rules.Rule;
@@ -35,7 +39,13 @@
 import org.xml.sax.SAXException;
 
 import javax.xml.parsers.ParserConfigurationException;
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
diff --git a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java
index f9e6c3f87917..f130c02887fd 100644
--- a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java
+++ b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java
@@ -277,7 +277,7 @@ public void testEnglishStdIn4() throws Exception {
     String[] args = {"-l", "en", "--json", "-"};
     Main.main(args);
     String output = new String(this.out.toByteArray());
-    assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":\""));
+    assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":"));
     assertTrue("Got: " + output, output.contains("\"language\":{\"name\":\"English\",\"code\":\"en\""));
     assertTrue("Got: " + output, output.contains("{\"message\":\"Use \\\"a\\\" instead of 'an' if the following word doesn't start with a vowel sound, e.g. 'a sentence', 'a university'.\""));
     assertTrue("Got: " + output, output.contains("\"replacements\":[{\"value\":\"a\"}]"));
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe
index b798387ded9d..ee091ce48847 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs
index af3abe0553ad..ba80b675fe1d 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si
index 7fce7afe0e39..4a4978085455 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1
index 6e93fa9e6973..e49de5d67abe 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe
index d04ba2ee5c95..15e6f67ed0a2 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs
index 36ff0f65c139..5d62d79a170c 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si
index 3b116de2a4f0..ce3c306f6fab 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1
index 6e93fa9e6973..15102c4ff7f2 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe
index dfbaafafcd61..fb7f2188cace 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs
index 552a0d09868e..c853d3bd23f0 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si
index 3f231fb03f90..10159d3fd64c 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si differ
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1
index 6e93fa9e6973..4b452afb987e 100644
Binary files a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 and b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 differ
diff --git a/languagetool-dev/pom.xml b/languagetool-dev/pom.xml
index de8a11b203bb..bba8bd88b0cd 100644
--- a/languagetool-dev/pom.xml
+++ b/languagetool-dev/pom.xml
@@ -58,6 +58,10 @@
       <groupId>org.languagetool</groupId>
       <artifactId>languagetool-wikipedia</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-core</artifactId>
+    </dependency>
     <dependency>
       <groupId>org.mariadb.jdbc</groupId>
       <artifactId>mariadb-java-client</artifactId>
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
index ab45341f7edc..39c001d227a5 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
@@ -47,32 +47,33 @@ public static void main(String[] args) throws IOException {
     try (FSDirectory directory = FSDirectory.open(dir.toPath());
          IndexReader reader = DirectoryReader.open(directory)) {
       IndexSearcher searcher = new IndexSearcher(reader);
-      Fields fields = MultiFields.getFields(reader);
-      Terms ngrams = fields.terms("ngram");
-      TermsEnum iterator = ngrams.iterator();
-      BytesRef next;
-      int i = 0;
-      while ((next = iterator.next()) != null) {
-        String term = next.utf8ToString();
-        if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
-          if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
-            //System.out.println("ignore: " + term);
-            continue;
-          }
-          TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
-          if (topDocs.totalHits == 0) {
-            throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
-          } else if (topDocs.totalHits == 1) {
-            int docId = topDocs.scoreDocs[0].doc;
-            Document document = reader.document(docId);
-            Long count = Long.parseLong(document.get("count"));
-            //System.out.println(term + " -> " + count);
-            totalCount += count;
-            if (++i % 10_000 == 0) {
-              System.out.println(i + " ... " + totalCount);
+      for (String field : FieldInfos.getIndexedFields(reader)) {
+        Terms ngrams = MultiTerms.getTerms(reader, field);
+        TermsEnum iterator = ngrams.iterator();
+        BytesRef next;
+        int i = 0;
+        while ((next = iterator.next()) != null) {
+          String term = next.utf8ToString();
+          if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
+            if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
+              //System.out.println("ignore: " + term);
+              continue;
+            }
+            TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
+            if (topDocs.totalHits.value == 0) {
+              throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits.value);
+            } else if (topDocs.totalHits.value == 1) {
+              int docId = topDocs.scoreDocs[0].doc;
+              Document document = reader.document(docId);
+              Long count = Long.parseLong(document.get("count"));
+              //System.out.println(term + " -> " + count);
+              totalCount += count;
+              if (++i % 10_000 == 0) {
+                System.out.println(i + " ... " + totalCount);
+              }
+            } else {
+              throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
             }
-          } else {
-            throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
           }
         }
       }
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java
index 8b352f75be7b..3a06ee9819d0 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java
@@ -1,4 +1,4 @@
-/* LanguageTool, a natural language style checker 
+/* LanguageTool, a natural language style checker
  * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
  *
  * This library is free software; you can redistribute it and/or
@@ -20,7 +20,11 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
@@ -95,13 +99,8 @@ private Document getDoc(String ngram, long count) {
   }
 
   @NotNull
-  private LongField getCountField(long count) {
-    FieldType fieldType = new FieldType();
-    fieldType.setStored(true);
-    fieldType.setOmitNorms(true);
-    fieldType.setNumericType(FieldType.NumericType.LONG);
-    fieldType.setDocValuesType(DocValuesType.NUMERIC);
-    return new LongField("count", count, fieldType);
+  private LongPoint getCountField(long count) {
+    return new LongPoint("count", count);
   }
 
   private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java
index 4e130e87bcb5..8e2e92b25566 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java
@@ -177,16 +177,16 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
       if (newReader != null) {
         reader = newReader;
       }*/
-      index.reader = DirectoryReader.open(index.indexWriter, true);
+      index.reader = DirectoryReader.open(index.indexWriter, true, true);
       index.searcher = new IndexSearcher(index.reader);
       for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
         Term ngram = new Term("ngram", entry.getKey());
         TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2);
         //System.out.println(ngram + " ==> " + topDocs.totalHits);
-        if (topDocs.totalHits == 0) {
+        if (topDocs.totalHits.value == 0) {
           Document doc = getDoc(entry.getKey(), entry.getValue());
           index.indexWriter.addDocument(doc);
-        } else if (topDocs.totalHits == 1) {
+        } else if (topDocs.totalHits.value == 1) {
           int docNumber = topDocs.scoreDocs[0].doc;
           Document document = index.reader.document(docNumber);
           long oldCount = Long.parseLong(document.getField("count").stringValue());
@@ -195,7 +195,7 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
           index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
           // would probably be faster, but we currently rely on the count being a common field:
           //indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue());
-        } else if (topDocs.totalHits > 1) {
+        } else if (topDocs.totalHits.value > 1) {
           throw new RuntimeException("Got more than one hit for: " + ngram);
         }
         //System.out.println(" " + entry.getKey() + " -> " + entry.getValue());
@@ -221,13 +221,8 @@ private Document getDoc(String ngram, long count) {
   }
 
   @NotNull
-  private LongField getCountField(long count) {
-    FieldType fieldType = new FieldType();
-    fieldType.setStored(true);
-    fieldType.setOmitNorms(true);
-    fieldType.setNumericType(FieldType.NumericType.LONG);
-    fieldType.setDocValuesType(DocValuesType.NUMERIC);
-    return new LongField("count", count, fieldType);
+  private LongPoint getCountField(long count) {
+    return new LongPoint("count", count);
   }
 
   private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
@@ -269,7 +264,7 @@ static class LuceneLiveIndex {
       IndexWriterConfig config = new IndexWriterConfig(analyzer);
       directory = FSDirectory.open(dir.toPath());
       indexWriter = new IndexWriter(directory, config);
-      reader = DirectoryReader.open(indexWriter, false);
+      reader = DirectoryReader.open(indexWriter, false, false);
       searcher = new IndexSearcher(reader);
     }
 
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java
index ad0e60af36a3..e8a2365c3111 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java
@@ -34,7 +34,9 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
+import java.util.Set;
 
 /**
  * Prototype to find potential upper-only phrases like "Persischer Golf".
@@ -57,47 +59,48 @@ public static void main(String[] args) throws IOException {
     FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
     IndexReader reader = DirectoryReader.open(fsDir);
     IndexSearcher searcher = new IndexSearcher(reader);
-    Fields fields = MultiFields.getFields(reader);
-    Terms terms = fields.terms("ngram");
-    TermsEnum termsEnum = terms.iterator();
-    int count = 0;
-    BytesRef next;
-    while ((next = termsEnum.next()) != null) {
-      String term = next.utf8ToString();
-      count++;
-      //term = "persischer Golf"; // for testing
-      String[] parts = term.split(" ");
-      boolean useful = true;
-      int lcCount = 0;
-      List<String> ucParts = new ArrayList<>();
-      for (String part : parts) {
-        if (part.length() < MIN_TERM_LEN) {
-          useful = false;
-          break;
+    for (String field: FieldInfos.getIndexedFields(reader)) {
+      Terms terms = MultiTerms.getTerms(reader, field);
+      TermsEnum termsEnum = terms.iterator();
+      int count = 0;
+      BytesRef next;
+      while ((next = termsEnum.next()) != null) {
+        String term = next.utf8ToString();
+        count++;
+        //term = "persischer Golf"; // for testing
+        String[] parts = term.split(" ");
+        boolean useful = true;
+        int lcCount = 0;
+        List<String> ucParts = new ArrayList<>();
+        for (String part : parts) {
+          if (part.length() < MIN_TERM_LEN) {
+            useful = false;
+            break;
+          }
+          String uc = StringTools.uppercaseFirstChar(part);
+          if (!part.equals(uc)) {
+            lcCount++;
+          }
+          ucParts.add(uc);
         }
-        String uc = StringTools.uppercaseFirstChar(part);
-        if (!part.equals(uc)) {
-          lcCount++;
+        if (!useful || lcCount == 0 || lcCount == 2) {
+          continue;
         }
-        ucParts.add(uc);
-      }
-      if (!useful || lcCount == 0 || lcCount == 2) {
-        continue;
-      }
-      String uppercase = String.join(" ", ucParts);
-      if (term.equals(uppercase)){
-        continue;
-      }
-      long thisCount = getOccurrenceCount(reader, searcher, term);
-      long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
-      if (count % 10_000 == 0) {
-        System.err.println(count + " @ " + term);
-      }
-      if (thisCount > LIMIT || thisUpperCount > LIMIT) {
-        if (thisUpperCount > thisCount) {
-          if (isRelevant(lt, term)) {
-            float factor = (float)thisUpperCount / thisCount;
-            System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
+        String uppercase = String.join(" ", ucParts);
+        if (term.equals(uppercase)) {
+          continue;
+        }
+        long thisCount = getOccurrenceCount(reader, searcher, term);
+        long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
+        if (count % 10_000 == 0) {
+          System.err.println(count + " @ " + term);
+        }
+        if (thisCount > LIMIT || thisUpperCount > LIMIT) {
+          if (thisUpperCount > thisCount) {
+            if (isRelevant(lt, term)) {
+              float factor = (float) thisUpperCount / thisCount;
+              System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
+            }
           }
         }
       }
@@ -117,7 +120,7 @@ private static boolean isRelevant(JLanguageTool lt, String term) throws IOExcept
 
   private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term) throws IOException {
     TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
-    if (topDocs.totalHits == 0) {
+    if (topDocs.totalHits.value == 0) {
       return 0;
     }
     int docId = topDocs.scoreDocs[0].doc;
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java
index 54d6dc8d8587..74a6b0c8f887 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java
@@ -47,10 +47,9 @@ public static void main(String[] args) throws IOException {
     FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
     IndexReader reader = DirectoryReader.open(fsDir);
     IndexSearcher searcher = new IndexSearcher(reader);
-    Fields fields = MultiFields.getFields(reader);
+    Terms terms = MultiTerms.getTerms(reader, "ngram");
     long max = 0;
     String maxTerm = "";
-    Terms terms = fields.terms("ngram");
     TermsEnum termsEnum = terms.iterator();
     int count = 0;
     BytesRef next;
@@ -71,5 +70,6 @@ public static void main(String[] args) throws IOException {
     }
     System.out.println("Max: " + max + " for " + maxTerm);
   }
+
 }
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java
index d5caea350778..b7c43dc46619 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java
@@ -58,8 +58,7 @@ public static void main(String[] args) throws IOException {
     String ngramIndexDir = args[0];
     FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath());
     IndexReader reader = DirectoryReader.open(fsDir);
-    Fields fields = MultiFields.getFields(reader);
-    Terms terms = fields.terms("ngram");
+    Terms terms = MultiTerms.getTerms(reader, "ngram");
     TermsEnum termsEnum = terms.iterator();
     int i = 0;
     int needed = 0;
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java
index 0d97a4df0bab..e4ed2832f7a0 100644
--- a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java
+++ b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java
@@ -20,7 +20,7 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;
diff --git a/languagetool-language-modules/ja/pom.xml b/languagetool-language-modules/ja/pom.xml
index 12dfea7942b2..58ee2b8b9d68 100644
--- a/languagetool-language-modules/ja/pom.xml
+++ b/languagetool-language-modules/ja/pom.xml
@@ -40,7 +40,7 @@
     <dependency>
-      <groupId>com.github.lucene-gosen</groupId>
+      <groupId>org.omegat.lucene</groupId>
       <artifactId>lucene-gosen</artifactId>
       <classifier>ipadic</classifier>
     </dependency>
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java
index d3b2adfc4abe..5f99596bf348 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java
@@ -19,8 +19,8 @@
 package org.languagetool.dev.dumpcheck;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.StringField;
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java
index effff0cf9460..8cb404ed5924 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java
@@ -20,8 +20,8 @@
 
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.AttributeFactory;
 
 import java.io.IOException;
@@ -36,7 +36,6 @@ public final class AnyCharTokenizer extends Tokenizer {
 
   private static final int MAX_WORD_LEN = Integer.MAX_VALUE; // extend the word length!
 
   private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
   private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class);
@@ -83,7 +82,7 @@ public boolean incrementToken() throws IOException {
     while(true) {
       if(this.bufferIndex >= this.dataLen) {
         this.offset += this.dataLen;
-        this.charUtils.fill(this.ioBuffer, this.input);
+        CharacterUtils.fill(this.ioBuffer, this.input);
         if(this.ioBuffer.getLength() == 0) {
           this.dataLen = 0;
           if(length <= 0) {
@@ -97,7 +96,7 @@ public boolean incrementToken() throws IOException {
         this.bufferIndex = 0;
       }
 
-      int c = this.charUtils.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength());
+      int c = Character.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex);
       int charCount = Character.charCount(c);
       this.bufferIndex += charCount;
       if(this.isTokenChar(c)) {
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java
index 622c7f7291f5..e356998b9f95 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/PatternRuleQueryBuilder.java
@@ -124,7 +124,7 @@ private SpanQuery asSpanQuery(BooleanClause query) {
     } else {
       Set<Term> terms = new HashSet<>();
       try {
-        indexSearcher.createWeight(query.getQuery(), false).extractTerms(terms);
+        indexSearcher.createWeight(query.getQuery(), ScoreMode.COMPLETE_NO_SCORES, 1.0f).extractTerms(terms);
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java
index 22dbb89e625c..8564e197a446 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/Searcher.java
@@ -33,6 +33,7 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.search.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
@@ -101,7 +102,7 @@ public int getDocCount() throws IOException {
   private int getDocCount(IndexSearcher indexSearcher) throws IOException {
     Term searchTerm = new Term(MAX_DOC_COUNT_FIELD, MAX_DOC_COUNT_FIELD_VAL);
     TopDocs search = indexSearcher.search(new TermQuery(searchTerm), 1);
-    if (search.totalHits != 1) {
+    if (search.totalHits.value != 1) {
       return -1;
     }
     ScoreDoc scoreDoc = search.scoreDocs[0];
@@ -334,7 +335,7 @@ class SearchRunnable implements Runnable {
     private List<MatchingSentence> matchingSentences;
     private Exception exception;
     private boolean tooManyLuceneMatches;
-    private int luceneMatchCount;
+    private long luceneMatchCount;
    private int maxDocChecked;
     private int docsChecked;
     private int numDocs;
@@ -356,7 +357,7 @@ public void run() {
         PossiblyLimitedTopDocs limitedTopDocs = getTopDocs(query);
         long luceneTime = System.currentTimeMillis() - t2;
         long t3 = System.currentTimeMillis();
-        luceneMatchCount = limitedTopDocs.topDocs.totalHits;
+        luceneMatchCount = limitedTopDocs.topDocs.totalHits.value;
         tooManyLuceneMatches = limitedTopDocs.topDocs.scoreDocs.length >= maxHits;
         MatchingSentencesResult res = findMatchingSentences(indexSearcher, limitedTopDocs.topDocs, languageTool);
         matchingSentences = res.matchingSentences;
@@ -382,7 +383,7 @@ boolean hasTooManyLuceneMatches() {
       return tooManyLuceneMatches;
     }
 
-    int getLuceneMatchCount() {
+    long getLuceneMatchCount() {
       return luceneMatchCount;
     }
 
diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java
index 40c860af0650..6c39036346e0 100644
--- a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java
+++ b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/SearcherResult.java
@@ -35,7 +35,7 @@ public class SearcherResult {
   private int docCount;
   private int maxDocChecked;
   private boolean hasTooManyLuceneMatches;
-  private int luceneMatchCount;
+  private long luceneMatchCount;
   private int skipHits;
   private int numDocs;
 
@@ -81,11 +81,11 @@ public boolean hasTooManyLuceneMatches() {
     return hasTooManyLuceneMatches;
   }
 
-  public void setLuceneMatchCount(int luceneMatchCount) {
+  public void setLuceneMatchCount(long luceneMatchCount) {
     this.luceneMatchCount = luceneMatchCount;
   }
 
-  public int getLuceneMatchCount() {
+  public long getLuceneMatchCount() {
     return luceneMatchCount;
   }
 
diff --git a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java
index 8febf19486a4..be0b8a93e752 100644
--- a/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java
+++ b/languagetool-wikipedia/src/test/java/org/languagetool/dev/index/PatternRuleQueryBuilderTest.java
@@ -271,11 +271,11 @@ public void testSeveralElements() throws Exception {
     assertMatches(makeRule("How do you"), 1); // known overmatching
   }
 
-  private void assertMatches(AbstractPatternRule patternRule, int expectedMatches) throws Exception {
+  private void assertMatches(AbstractPatternRule patternRule, long expectedMatches) throws Exception {
     PatternRuleQueryBuilder queryBuilder = new PatternRuleQueryBuilder(language, searcher);
     Query query = queryBuilder.buildRelaxedQuery(patternRule);
     //System.out.println("QUERY: " + query);
-    int matches = searcher.search(query, 1000).totalHits;
+    long matches = searcher.search(query, 1000).totalHits.value;
     assertEquals("Query failed: " + query, expectedMatches, matches);
   }
 
diff --git a/pom.xml b/pom.xml
index f6ccc7e61d3c..b1423e960ece 100644
--- a/pom.xml
+++ b/pom.xml
@@ -163,6 +163,7 @@
     0.8.2
     2.1.2
     6.2.1
+    8.11.0
    1.2.2
     portable-1.8.2
     70.1
@@ -217,7 +218,7 @@
     2.16.1
     0.02
     1.18.30
-    5.5.5
+    8.11.3
     2.1.9
     0.6
@@ -275,9 +276,9 @@
         <version>${jackson.version}</version>
-        <groupId>com.github.lucene-gosen</groupId>
+        <groupId>org.omegat.lucene</groupId>
         <artifactId>lucene-gosen</artifactId>
-        <version>${com.github.lucene-gosen.version}</version>
+        <version>${org.omegat.lucene.lucene-gosen.version}</version>
        <classifier>ipadic</classifier>
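
Migration notes, not part of the patch: the changes above follow the usual Lucene 5.x to 8.x API moves. MultiFields.getFields(reader) no longer exists, so term enumeration goes through FieldInfos.getIndexedFields(reader) and MultiTerms.getTerms(reader, field); TopDocs.totalHits is now a TotalHits object, so hit counts are read from the long totalHits.value (which is why luceneMatchCount and the test helper widen from int to long); createWeight() takes a ScoreMode and a boost; and the removed LongField is replaced by LongPoint. The sketch below is only an illustration of those idioms against a hypothetical ngram index; the class name, the index path argument and the "ngram"/"count" field names are assumptions, not taken from this patch.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

// Hypothetical Lucene 8 index reader; "ngram" and "count" field names are assumptions.
public class Lucene8NgramReadSketch {
  public static void main(String[] args) throws Exception {
    try (FSDirectory dir = FSDirectory.open(Paths.get(args[0]));
         IndexReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);
      // Lucene 8: enumerate indexed fields instead of MultiFields.getFields(reader)
      for (String field : FieldInfos.getIndexedFields(reader)) {
        Terms terms = MultiTerms.getTerms(reader, field);
        if (terms == null) {
          continue;  // no term postings for this field
        }
        TermsEnum iterator = terms.iterator();
        BytesRef next;
        while ((next = iterator.next()) != null) {
          String term = next.utf8ToString();
          TopDocs topDocs = searcher.search(new TermQuery(new Term(field, term)), 1);
          // Lucene 8: totalHits is a TotalHits object; the hit count is the long totalHits.value
          if (topDocs.totalHits.value == 1) {
            Document doc = reader.document(topDocs.scoreDocs[0].doc);
            System.out.println(term + " -> " + doc.get("count"));
          }
        }
      }
    }
  }
}

Note that a LongPoint only indexes the numeric value for exact and range queries; it is not a stored field, so code that reads a count back from a retrieved document still relies on a separately stored field.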