feat: Bump lucene and lucene-gosen versions

languagetool-org · Aug 6, 2024 · 7a07955 · 7a07955
1 parent 70509a8
commit 7a07955
Show file tree

Hide file tree

Showing 30 changed files with 126 additions and 114 deletions.
diff --git a/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java b/languagetool-commandline/src/main/java/org/languagetool/commandline/Main.java
@@ -20,9 +20,13 @@
 
 import org.apache.commons.io.ByteOrderMark;
 import org.apache.commons.io.input.BOMInputStream;
-import org.languagetool.*;
+import org.languagetool.JLanguageTool;
+import org.languagetool.Language;
+import org.languagetool.Languages;
+import org.languagetool.MultiThreadedJLanguageTool;
 import org.languagetool.bitext.TabBitextReader;
-import org.languagetool.language.*;
+import org.languagetool.language.AmericanEnglish;
+import org.languagetool.language.English;
 import org.languagetool.language.identifier.LanguageIdentifier;
 import org.languagetool.language.identifier.LanguageIdentifierService;
 import org.languagetool.rules.Rule;
@@ -35,7 +39,13 @@
 import org.xml.sax.SAXException;
 
 import javax.xml.parsers.ParserConfigurationException;
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;

diff --git a/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java b/languagetool-commandline/src/test/java/org/languagetool/commandline/MainTest.java
@@ -277,7 +277,7 @@ public void testEnglishStdIn4() throws Exception {
     String[] args = {"-l", "en", "--json", "-"};
     Main.main(args);
     String output = new String(this.out.toByteArray());
-    assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":\""));
+    assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":"));
     assertTrue("Got: " + output, output.contains("\"language\":{\"name\":\"English\",\"code\":\"en\""));
     assertTrue("Got: " + output, output.contains("{\"message\":\"Use \\\"a\\\" instead of 'an' if the following word doesn't start with a vowel sound, e.g. 'a sentence', 'a university'.\""));
     assertTrue("Got: " + output, output.contains("\"replacements\":[{\"value\":\"a\"}]"));

diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfe
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.cfs
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/_0.si
diff --git a/...getool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1 b/...getool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/1grams/segments_1
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfe
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.cfs
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/_0.si
diff --git a/...getool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1 b/...getool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/2grams/segments_1
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfe
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.cfs
diff --git a/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si b/languagetool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/_0.si
diff --git a/...getool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1 b/...getool-core/src/test/resources/org/languagetool/resource/yy/ngram-index/3grams/segments_1
diff --git a/languagetool-dev/pom.xml b/languagetool-dev/pom.xml
@@ -58,6 +58,10 @@
             <groupId>org.languagetool</groupId>
             <artifactId>languagetool-wikipedia</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+        </dependency>
         <dependency>
             <groupId>org.mariadb.jdbc</groupId>
             <artifactId>mariadb-java-client</artifactId>

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/archive/StartTokenCounter.java
@@ -47,32 +47,33 @@ public static void main(String[] args) throws IOException {
     try (FSDirectory directory = FSDirectory.open(dir.toPath());
          IndexReader reader = DirectoryReader.open(directory)) {
       IndexSearcher searcher = new IndexSearcher(reader);
-      Fields fields = MultiFields.getFields(reader);
-      Terms ngrams = fields.terms("ngram");
-      TermsEnum iterator = ngrams.iterator();
-      BytesRef next;
-      int i = 0;
-      while ((next = iterator.next()) != null) {
-        String term = next.utf8ToString();
-        if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
-          if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
-            //System.out.println("ignore: " + term);
-            continue;
-          }
-          TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
-          if (topDocs.totalHits == 0) {
-            throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
-          } else if (topDocs.totalHits == 1) {
-            int docId = topDocs.scoreDocs[0].doc;
-            Document document = reader.document(docId);
-            Long count = Long.parseLong(document.get("count"));
-            //System.out.println(term + " -> " + count);
-            totalCount += count;
-            if (++i % 10_000 == 0) {
-              System.out.println(i + " ... " + totalCount);
+      for (String field : FieldInfos.getIndexedFields(reader)) {
+        Terms ngrams = MultiTerms.getTerms(reader, field);
+        TermsEnum iterator = ngrams.iterator();
+        BytesRef next;
+        int i = 0;
+        while ((next = iterator.next()) != null) {
+          String term = next.utf8ToString();
+          if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
+            if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
+              //System.out.println("ignore: " + term);
+              continue;
+            }
+            TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
+            if (topDocs.totalHits.value == 0) {
+              throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits.value);
+            } else if (topDocs.totalHits.value == 1) {
+              int docId = topDocs.scoreDocs[0].doc;
+              Document document = reader.document(docId);
+              Long count = Long.parseLong(document.get("count"));
+              //System.out.println(term + " -> " + count);
+              totalCount += count;
+              if (++i % 10_000 == 0) {
+                System.out.println(i + " ... " + totalCount);
+              }
+            } else {
+              throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
             }
-          } else {
-            throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
           }
         }
       }

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/AggregatedNgramToLucene.java
@@ -1,4 +1,4 @@
-/* LanguageTool, a natural language style checker 
+/* LanguageTool, a natural language style checker
  * Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
  * 
  * This library is free software; you can redistribute it and/or
@@ -20,7 +20,11 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.*;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
@@ -95,13 +99,8 @@ private Document getDoc(String ngram, long count) {
   }
 
   @NotNull
-  private LongField getCountField(long count) {
-    FieldType fieldType = new FieldType();
-    fieldType.setStored(true);
-    fieldType.setOmitNorms(true);
-    fieldType.setNumericType(FieldType.NumericType.LONG);
-    fieldType.setDocValuesType(DocValuesType.NUMERIC);
-    return new LongField("count", count, fieldType);
+  private LongPoint getCountField(long count) {
+    return new LongPoint("count", count);
   }
 
   private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/CommonCrawlToNgram.java
@@ -177,16 +177,16 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
     if (newReader != null) {
       reader = newReader;
     }*/
-    index.reader = DirectoryReader.open(index.indexWriter, true);
+    index.reader = DirectoryReader.open(index.indexWriter, true, true);
     index.searcher = new IndexSearcher(index.reader);
     for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
       Term ngram = new Term("ngram", entry.getKey());
       TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2);
       //System.out.println(ngram + " ==> " + topDocs.totalHits);
-      if (topDocs.totalHits == 0) {
+      if (topDocs.totalHits.value == 0) {
         Document doc = getDoc(entry.getKey(), entry.getValue());
         index.indexWriter.addDocument(doc);
-      } else if (topDocs.totalHits == 1) {
+      } else if (topDocs.totalHits.value == 1) {
         int docNumber = topDocs.scoreDocs[0].doc;
         Document document = index.reader.document(docNumber);
         long oldCount = Long.parseLong(document.getField("count").stringValue());
@@ -195,7 +195,7 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
         index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
         // would probably be faster, but we currently rely on the count being a common field:
         //indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue());
-      } else if (topDocs.totalHits > 1) {
+      } else if (topDocs.totalHits.value > 1) {
         throw new RuntimeException("Got more than one hit for: " + ngram);
       }
       //System.out.println("   " + entry.getKey() + " -> " + entry.getValue());
@@ -221,13 +221,8 @@ private Document getDoc(String ngram, long count) {
   }
 
   @NotNull
-  private LongField getCountField(long count) {
-    FieldType fieldType = new FieldType();
-    fieldType.setStored(true);
-    fieldType.setOmitNorms(true);
-    fieldType.setNumericType(FieldType.NumericType.LONG);
-    fieldType.setDocValuesType(DocValuesType.NUMERIC);
-    return new LongField("count", count, fieldType);
+  private LongPoint getCountField(long count) {
+    return new LongPoint("count", count);
   }
 
   private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
@@ -269,7 +264,7 @@ static class LuceneLiveIndex {
       IndexWriterConfig config = new IndexWriterConfig(analyzer);
       directory = FSDirectory.open(dir.toPath());
       indexWriter = new IndexWriter(directory, config);
-      reader = DirectoryReader.open(indexWriter, false);
+      reader = DirectoryReader.open(indexWriter, false, false);
       searcher = new IndexSearcher(reader);
     }
 

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/GermanUppercasePhraseFinder.java
@@ -34,7 +34,9 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
+import java.util.Set;
 
 /**
  * Prototype to find potential upper-only phrases like "Persischer Golf".
@@ -57,47 +59,48 @@ public static void main(String[] args) throws IOException {
     FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
     IndexReader reader = DirectoryReader.open(fsDir);
     IndexSearcher searcher = new IndexSearcher(reader);
-    Fields fields = MultiFields.getFields(reader);
-    Terms terms = fields.terms("ngram");
-    TermsEnum termsEnum = terms.iterator();
-    int count = 0;
-    BytesRef next;
-    while ((next = termsEnum.next()) != null) {
-      String term = next.utf8ToString();
-      count++;
-      //term = "persischer Golf";  // for testing
-      String[] parts = term.split(" ");
-      boolean useful = true;
-      int lcCount = 0;
-      List<String> ucParts = new ArrayList<>();
-      for (String part : parts) {
-        if (part.length() < MIN_TERM_LEN) {
-          useful = false;
-          break;
+    for (String field: FieldInfos.getIndexedFields(reader)) {
+      Terms terms = MultiTerms.getTerms(reader, field);
+      TermsEnum termsEnum = terms.iterator();
+      int count = 0;
+      BytesRef next;
+      while ((next = termsEnum.next()) != null) {
+        String term = next.utf8ToString();
+        count++;
+        //term = "persischer Golf";  // for testing
+        String[] parts = term.split(" ");
+        boolean useful = true;
+        int lcCount = 0;
+        List<String> ucParts = new ArrayList<>();
+        for (String part : parts) {
+          if (part.length() < MIN_TERM_LEN) {
+            useful = false;
+            break;
+          }
+          String uc = StringTools.uppercaseFirstChar(part);
+          if (!part.equals(uc)) {
+            lcCount++;
+          }
+          ucParts.add(uc);
         }
-        String uc = StringTools.uppercaseFirstChar(part);
-        if (!part.equals(uc)) {
-          lcCount++;
+        if (!useful || lcCount == 0 || lcCount == 2) {
+          continue;
         }
-        ucParts.add(uc);
-      }
-      if (!useful || lcCount == 0 || lcCount == 2) {
-        continue;
-      }
-      String uppercase = String.join(" ", ucParts);
-      if (term.equals(uppercase)){
-        continue;
-      }
-      long thisCount = getOccurrenceCount(reader, searcher, term);
-      long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
-      if (count % 10_000 == 0) {
-        System.err.println(count + " @ " + term);
-      }
-      if (thisCount > LIMIT || thisUpperCount > LIMIT) {
-        if (thisUpperCount > thisCount) {
-          if (isRelevant(lt, term)) {
-            float factor = (float)thisUpperCount / thisCount;
-            System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
+        String uppercase = String.join(" ", ucParts);
+        if (term.equals(uppercase)) {
+          continue;
+        }
+        long thisCount = getOccurrenceCount(reader, searcher, term);
+        long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
+        if (count % 10_000 == 0) {
+          System.err.println(count + " @ " + term);
+        }
+        if (thisCount > LIMIT || thisUpperCount > LIMIT) {
+          if (thisUpperCount > thisCount) {
+            if (isRelevant(lt, term)) {
+              float factor = (float) thisUpperCount / thisCount;
+              System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
+            }
           }
         }
       }
@@ -117,7 +120,7 @@ private static boolean isRelevant(JLanguageTool lt, String term) throws IOExcept
 
   private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term) throws IOException {
     TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
-    if (topDocs.totalHits == 0) {
+    if (topDocs.totalHits.value == 0) {
       return 0;
     }
     int docId = topDocs.scoreDocs[0].doc;

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/LargestNGramFinder.java
@@ -47,10 +47,9 @@ public static void main(String[] args) throws IOException {
     FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
     IndexReader reader = DirectoryReader.open(fsDir);
     IndexSearcher searcher = new IndexSearcher(reader);
-    Fields fields = MultiFields.getFields(reader);
+    Terms terms = MultiTerms.getTerms(reader, "ngram");
     long max = 0;
     String maxTerm = "";
-    Terms terms = fields.terms("ngram");
     TermsEnum termsEnum = terms.iterator();
     int count = 0;
     BytesRef next;
@@ -71,5 +70,6 @@ public static void main(String[] args) throws IOException {
     }
     System.out.println("Max: " + max + " for " + maxTerm);
   }
+
 
 }
diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/NeededNGramCounter.java
@@ -58,8 +58,7 @@ public static void main(String[] args) throws IOException {
     String ngramIndexDir = args[0];
     FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath());
     IndexReader reader = DirectoryReader.open(fsDir);
-    Fields fields = MultiFields.getFields(reader);
-    Terms terms = fields.terms("ngram");
+    Terms terms = MultiTerms.getTerms(reader, "ngram");
     TermsEnum termsEnum = terms.iterator();
     int i = 0;
     int needed = 0;

diff --git a/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java b/languagetool-dev/src/main/java/org/languagetool/dev/bigdata/TextIndexCreator.java
@@ -20,7 +20,7 @@
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.TextField;

diff --git a/languagetool-language-modules/ja/pom.xml b/languagetool-language-modules/ja/pom.xml
@@ -40,7 +40,7 @@
 
     <dependencies>
         <dependency>
-            <groupId>com.github.lucene-gosen</groupId>
+            <groupId>org.omegat.lucene</groupId>
             <artifactId>lucene-gosen</artifactId>
             <classifier>ipadic</classifier>
         </dependency>

diff --git a/...agetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java b/...agetool-wikipedia/src/main/java/org/languagetool/dev/dumpcheck/SentenceSourceIndexer.java
@@ -19,8 +19,8 @@
 package org.languagetool.dev.dumpcheck;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.StringField;

diff --git a/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java b/languagetool-wikipedia/src/main/java/org/languagetool/dev/index/AnyCharTokenizer.java
@@ -20,8 +20,8 @@
 
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.CharacterUtils;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.util.CharacterUtils;
 import org.apache.lucene.util.AttributeFactory;
 
 import java.io.IOException;
@@ -36,7 +36,6 @@ public final class AnyCharTokenizer extends Tokenizer {
   private static final int MAX_WORD_LEN = Integer.MAX_VALUE; // extend the word length!
 
   private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
-  private final CharacterUtils charUtils = CharacterUtils.getInstance();
   private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class);
 
@@ -83,7 +82,7 @@ public boolean incrementToken() throws IOException {
     while(true) {
       if(this.bufferIndex >= this.dataLen) {
         this.offset += this.dataLen;
-        this.charUtils.fill(this.ioBuffer, this.input);
+        CharacterUtils.fill(this.ioBuffer, this.input);
         if(this.ioBuffer.getLength() == 0) {
           this.dataLen = 0;
           if(length <= 0) {
@@ -97,7 +96,7 @@ public boolean incrementToken() throws IOException {
         this.bufferIndex = 0;
       }
 
-      int c = this.charUtils.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength());
+      int c = Character.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex);
       int charCount = Character.charCount(c);
       this.bufferIndex += charCount;
       if(this.isTokenChar(c)) {