Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
miurahr committed Aug 6, 2024
1 parent 70509a8 commit 7a07955
Show file tree
Hide file tree
Showing 30 changed files with 126 additions and 114 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,13 @@

import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.BOMInputStream;
import org.languagetool.*;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.MultiThreadedJLanguageTool;
import org.languagetool.bitext.TabBitextReader;
import org.languagetool.language.*;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.language.English;
import org.languagetool.language.identifier.LanguageIdentifier;
import org.languagetool.language.identifier.LanguageIdentifierService;
import org.languagetool.rules.Rule;
Expand All @@ -35,7 +39,13 @@
import org.xml.sax.SAXException;

import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ public void testEnglishStdIn4() throws Exception {
String[] args = {"-l", "en", "--json", "-"};
Main.main(args);
String output = new String(this.out.toByteArray());
assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":\""));
assertTrue("Got: " + output, output.contains("{\"software\":{\"name\":\"LanguageTool\",\"version\":"));
assertTrue("Got: " + output, output.contains("\"language\":{\"name\":\"English\",\"code\":\"en\""));
assertTrue("Got: " + output, output.contains("{\"message\":\"Use \\\"a\\\" instead of 'an' if the following word doesn't start with a vowel sound, e.g. 'a sentence', 'a university'.\""));
assertTrue("Got: " + output, output.contains("\"replacements\":[{\"value\":\"a\"}]"));
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
4 changes: 4 additions & 0 deletions languagetool-dev/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@
<groupId>org.languagetool</groupId>
<artifactId>languagetool-wikipedia</artifactId>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
</dependency>
<dependency>
<groupId>org.mariadb.jdbc</groupId>
<artifactId>mariadb-java-client</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,32 +47,33 @@ public static void main(String[] args) throws IOException {
try (FSDirectory directory = FSDirectory.open(dir.toPath());
IndexReader reader = DirectoryReader.open(directory)) {
IndexSearcher searcher = new IndexSearcher(reader);
Fields fields = MultiFields.getFields(reader);
Terms ngrams = fields.terms("ngram");
TermsEnum iterator = ngrams.iterator();
BytesRef next;
int i = 0;
while ((next = iterator.next()) != null) {
String term = next.utf8ToString();
if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
//System.out.println("ignore: " + term);
continue;
}
TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
if (topDocs.totalHits == 0) {
throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits);
} else if (topDocs.totalHits == 1) {
int docId = topDocs.scoreDocs[0].doc;
Document document = reader.document(docId);
Long count = Long.parseLong(document.get("count"));
//System.out.println(term + " -> " + count);
totalCount += count;
if (++i % 10_000 == 0) {
System.out.println(i + " ... " + totalCount);
for (String field : FieldInfos.getIndexedFields(reader)) {
Terms ngrams = MultiTerms.getTerms(reader, field);
TermsEnum iterator = ngrams.iterator();
BytesRef next;
int i = 0;
while ((next = iterator.next()) != null) {
String term = next.utf8ToString();
if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) {
if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) {
//System.out.println("ignore: " + term);
continue;
}
TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3);
if (topDocs.totalHits.value == 0) {
throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits.value);
} else if (topDocs.totalHits.value == 1) {
int docId = topDocs.scoreDocs[0].doc;
Document document = reader.document(docId);
Long count = Long.parseLong(document.get("count"));
//System.out.println(term + " -> " + count);
totalCount += count;
if (++i % 10_000 == 0) {
System.out.println(i + " ... " + totalCount);
}
} else {
throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
}
} else {
throw new RuntimeException("More hits than expected for " + term + ": " + topDocs.totalHits);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* LanguageTool, a natural language style checker
/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
Expand All @@ -20,7 +20,11 @@

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
Expand Down Expand Up @@ -95,13 +99,8 @@ private Document getDoc(String ngram, long count) {
}

@NotNull
private LongField getCountField(long count) {
FieldType fieldType = new FieldType();
fieldType.setStored(true);
fieldType.setOmitNorms(true);
fieldType.setNumericType(FieldType.NumericType.LONG);
fieldType.setDocValuesType(DocValuesType.NUMERIC);
return new LongField("count", count, fieldType);
private LongPoint getCountField(long count) {
return new LongPoint("count", count);
}

private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,16 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
if (newReader != null) {
reader = newReader;
}*/
index.reader = DirectoryReader.open(index.indexWriter, true);
index.reader = DirectoryReader.open(index.indexWriter, true, true);
index.searcher = new IndexSearcher(index.reader);
for (Map.Entry<String, Long> entry : ngramToCount.entrySet()) {
Term ngram = new Term("ngram", entry.getKey());
TopDocs topDocs = index.searcher.search(new TermQuery(ngram), 2);
//System.out.println(ngram + " ==> " + topDocs.totalHits);
if (topDocs.totalHits == 0) {
if (topDocs.totalHits.value == 0) {
Document doc = getDoc(entry.getKey(), entry.getValue());
index.indexWriter.addDocument(doc);
} else if (topDocs.totalHits == 1) {
} else if (topDocs.totalHits.value == 1) {
int docNumber = topDocs.scoreDocs[0].doc;
Document document = index.reader.document(docNumber);
long oldCount = Long.parseLong(document.getField("count").stringValue());
Expand All @@ -195,7 +195,7 @@ private void writeToLucene(int ngramSize, Map<String, Long> ngramToCount) throws
index.indexWriter.addDocument(getDoc(entry.getKey(), oldCount + entry.getValue()));
// would probably be faster, but we currently rely on the count being a common field:
//indexWriter.updateNumericDocValue(ngram, "count", oldCount + entry.getValue());
} else if (topDocs.totalHits > 1) {
} else if (topDocs.totalHits.value > 1) {
throw new RuntimeException("Got more than one hit for: " + ngram);
}
//System.out.println(" " + entry.getKey() + " -> " + entry.getValue());
Expand All @@ -221,13 +221,8 @@ private Document getDoc(String ngram, long count) {
}

@NotNull
private LongField getCountField(long count) {
FieldType fieldType = new FieldType();
fieldType.setStored(true);
fieldType.setOmitNorms(true);
fieldType.setNumericType(FieldType.NumericType.LONG);
fieldType.setDocValuesType(DocValuesType.NUMERIC);
return new LongField("count", count, fieldType);
private LongPoint getCountField(long count) {
return new LongPoint("count", count);
}

private void addTotalTokenCountDoc(long totalTokenCount, IndexWriter writer) throws IOException {
Expand Down Expand Up @@ -269,7 +264,7 @@ static class LuceneLiveIndex {
IndexWriterConfig config = new IndexWriterConfig(analyzer);
directory = FSDirectory.open(dir.toPath());
indexWriter = new IndexWriter(directory, config);
reader = DirectoryReader.open(indexWriter, false);
reader = DirectoryReader.open(indexWriter, false, false);
searcher = new IndexSearcher(reader);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;

/**
* Prototype to find potential upper-only phrases like "Persischer Golf".
Expand All @@ -57,47 +59,48 @@ public static void main(String[] args) throws IOException {
FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
IndexSearcher searcher = new IndexSearcher(reader);
Fields fields = MultiFields.getFields(reader);
Terms terms = fields.terms("ngram");
TermsEnum termsEnum = terms.iterator();
int count = 0;
BytesRef next;
while ((next = termsEnum.next()) != null) {
String term = next.utf8ToString();
count++;
//term = "persischer Golf"; // for testing
String[] parts = term.split(" ");
boolean useful = true;
int lcCount = 0;
List<String> ucParts = new ArrayList<>();
for (String part : parts) {
if (part.length() < MIN_TERM_LEN) {
useful = false;
break;
for (String field: FieldInfos.getIndexedFields(reader)) {
Terms terms = MultiTerms.getTerms(reader, field);
TermsEnum termsEnum = terms.iterator();
int count = 0;
BytesRef next;
while ((next = termsEnum.next()) != null) {
String term = next.utf8ToString();
count++;
//term = "persischer Golf"; // for testing
String[] parts = term.split(" ");
boolean useful = true;
int lcCount = 0;
List<String> ucParts = new ArrayList<>();
for (String part : parts) {
if (part.length() < MIN_TERM_LEN) {
useful = false;
break;
}
String uc = StringTools.uppercaseFirstChar(part);
if (!part.equals(uc)) {
lcCount++;
}
ucParts.add(uc);
}
String uc = StringTools.uppercaseFirstChar(part);
if (!part.equals(uc)) {
lcCount++;
if (!useful || lcCount == 0 || lcCount == 2) {
continue;
}
ucParts.add(uc);
}
if (!useful || lcCount == 0 || lcCount == 2) {
continue;
}
String uppercase = String.join(" ", ucParts);
if (term.equals(uppercase)){
continue;
}
long thisCount = getOccurrenceCount(reader, searcher, term);
long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
if (count % 10_000 == 0) {
System.err.println(count + " @ " + term);
}
if (thisCount > LIMIT || thisUpperCount > LIMIT) {
if (thisUpperCount > thisCount) {
if (isRelevant(lt, term)) {
float factor = (float)thisUpperCount / thisCount;
System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
String uppercase = String.join(" ", ucParts);
if (term.equals(uppercase)) {
continue;
}
long thisCount = getOccurrenceCount(reader, searcher, term);
long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase);
if (count % 10_000 == 0) {
System.err.println(count + " @ " + term);
}
if (thisCount > LIMIT || thisUpperCount > LIMIT) {
if (thisUpperCount > thisCount) {
if (isRelevant(lt, term)) {
float factor = (float) thisUpperCount / thisCount;
System.out.printf("%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor);
}
}
}
}
Expand All @@ -117,7 +120,7 @@ private static boolean isRelevant(JLanguageTool lt, String term) throws IOExcept

private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term) throws IOException {
TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
if (topDocs.totalHits == 0) {
if (topDocs.totalHits.value == 0) {
return 0;
}
int docId = topDocs.scoreDocs[0].doc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,9 @@ public static void main(String[] args) throws IOException {
FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
IndexSearcher searcher = new IndexSearcher(reader);
Fields fields = MultiFields.getFields(reader);
Terms terms = MultiTerms.getTerms(reader, "ngram");
long max = 0;
String maxTerm = "";
Terms terms = fields.terms("ngram");
TermsEnum termsEnum = terms.iterator();
int count = 0;
BytesRef next;
Expand All @@ -71,5 +70,6 @@ public static void main(String[] args) throws IOException {
}
System.out.println("Max: " + max + " for " + maxTerm);
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,7 @@ public static void main(String[] args) throws IOException {
String ngramIndexDir = args[0];
FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath());
IndexReader reader = DirectoryReader.open(fsDir);
Fields fields = MultiFields.getFields(reader);
Terms terms = fields.terms("ngram");
Terms terms = MultiTerms.getTerms(reader, "ngram");
TermsEnum termsEnum = terms.iterator();
int i = 0;
int needed = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
Expand Down
2 changes: 1 addition & 1 deletion languagetool-language-modules/ja/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

<dependencies>
<dependency>
<groupId>com.github.lucene-gosen</groupId>
<groupId>org.omegat.lucene</groupId>
<artifactId>lucene-gosen</artifactId>
<classifier>ipadic</classifier>
</dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
package org.languagetool.dev.dumpcheck;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;

import java.io.IOException;
Expand All @@ -36,7 +36,6 @@ public final class AnyCharTokenizer extends Tokenizer {
private static final int MAX_WORD_LEN = Integer.MAX_VALUE; // extend the word length!

private final CharacterUtils.CharacterBuffer ioBuffer = CharacterUtils.newCharacterBuffer(4096);
private final CharacterUtils charUtils = CharacterUtils.getInstance();
private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class);

Expand Down Expand Up @@ -83,7 +82,7 @@ public boolean incrementToken() throws IOException {
while(true) {
if(this.bufferIndex >= this.dataLen) {
this.offset += this.dataLen;
this.charUtils.fill(this.ioBuffer, this.input);
CharacterUtils.fill(this.ioBuffer, this.input);
if(this.ioBuffer.getLength() == 0) {
this.dataLen = 0;
if(length <= 0) {
Expand All @@ -97,7 +96,7 @@ public boolean incrementToken() throws IOException {
this.bufferIndex = 0;
}

int c = this.charUtils.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex, this.ioBuffer.getLength());
int c = Character.codePointAt(this.ioBuffer.getBuffer(), this.bufferIndex);
int charCount = Character.charCount(c);
this.bufferIndex += charCount;
if(this.isTokenChar(c)) {
Expand Down
Loading

0 comments on commit 7a07955

Please sign in to comment.