Skip to content

Commit

Permalink
Merge pull request #437 from dice-group/IndQNERDataset
Browse files Browse the repository at this point in the history
added indqNER dataset
  • Loading branch information
MichaelRoeder authored Jun 22, 2023
2 parents dd1b569 + 343b08a commit 7d4ec0f
Show file tree
Hide file tree
Showing 10 changed files with 319 additions and 157 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,27 @@
import java.util.Map;

/**
* Class to retrieve entity types for the specified annotations
* from a CoNLL formatted dataset.
* Class to retrieve entity types for the specified annotations from a CoNLL
* formatted dataset.
*/
public class CoNLLTypeRetriever {

private static final String PLACE_URI = "http://dbpedia.org/ontology/Place";
private static final String COMPANY_URI = "http://dbpedia.org/ontology/company";
private static final String FILM_URI = "http://dbpedia.org/ontology/Film";
private static final String MUSICAL_ARTIST_URI = "http://dbpedia.org/ontology/MusicalArtist";
private static final String UNKNOWN_URI = "http://dbpedia.org/ontology/Unknown";
private static final String PERSON_URI = "http://dbpedia.org/ontology/Person";
private static final String PRODUCT_URI = "http://dbpedia.org/ontology/product";
private static final String SPORTS_TEAM_URI = "http://dbpedia.org/ontology/SportsTeam";
private static final String COMPANY_URI = "http://dbpedia.org/ontology/Company";
private static final String FILM_URI = "http://dbpedia.org/ontology/Film";
private static final String MUSICAL_ARTIST_URI = "http://dbpedia.org/ontology/MusicalArtist";
private static final String UNKNOWN_URI = "http://dbpedia.org/ontology/Unknown";
private static final String PERSON_URI = "http://dbpedia.org/ontology/Person";
private static final String PRODUCT_URI = "http://dbpedia.org/ontology/product"; // TODO this IRI is a property but
// not a class.
private static final String SPORTS_TEAM_URI = "http://dbpedia.org/ontology/SportsTeam";
private static final String TV_SHOW_URI = "http://dbpedia.org/ontology/TelevisionShow";
private static final String ORGANISATION_URI = "http://dbpedia.org/ontology/Organisation";

private Map<String, String> annotationToType;

public CoNLLTypeRetriever(String place, String company, String film, String musicalArtist, String unknown, String person,
String product, String sportsTeam, String tvShow, String organisation) {
public CoNLLTypeRetriever(String place, String company, String film, String musicalArtist, String unknown,
String person, String product, String sportsTeam, String tvShow, String organisation) {
annotationToType = new HashMap<>();
annotationToType.put(place, PLACE_URI);
annotationToType.put(company, COMPANY_URI);
Expand All @@ -53,10 +54,23 @@ public CoNLLTypeRetriever(String place, String company, String film, String musi
annotationToType.put(organisation, ORGANISATION_URI);
}

/**
* Returns the IRI for the given key or {@code null} if there is no value for
* the given key.
*
* @param key the type key found in the dataset
* @return the IRI of the type
*/
public String getTypeURI(String key) {
return annotationToType.get(key);
return annotationToType.getOrDefault(key, null);
}

/**
 * Registers a mapping from a type key to a type IRI. The mapping is stored with
 * {@link Map#put(Object, Object)}, so an IRI that has been registered for the
 * same key before is replaced.
 *
 * @param key the type key as it can be found in the dataset
 * @param uri the IRI that should be returned for this key
 */
public void addTypeURI(String key, String uri) {
    this.annotationToType.put(key, uri);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,21 @@
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.aksw.gerbil.transfer.nif.data.SpanImpl;
import org.aksw.gerbil.transfer.nif.data.TypedNamedEntity;
import org.aksw.gerbil.transfer.nif.data.TypedSpanImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Dataset Adapter to load a dataset that follows the general structure of
* CoNLL.
*/
public class GenericCoNLLDataset extends AbstractDataset implements InitializableDataset {

private static final Logger LOGGER = LoggerFactory.getLogger(GenericCoNLLDataset.class);

/**
* Prefix of a value in the marking column that expresses the start of a
* marking. TODO think about removing the '-' or make it configurable.
Expand Down Expand Up @@ -209,14 +216,15 @@ protected List<Marking> findMarkings(List<String> linesOfCurrentDoc, StringBuild
return markings;
}

protected TypedNamedEntity getWholeMarking(List<String> linesOfCurrentDoc, int pos, StringBuilder currentText) {
protected Marking getWholeMarking(List<String> linesOfCurrentDoc, int pos, StringBuilder currentText) {
String[] tokens = linesOfCurrentDoc.get(pos).split("\t");

// get type of the marking
// get type of the marking TODO if the B- and I- are configurable, the
// substring(2) has to be configurable as well.
String type = typeRetriever.getTypeURI(tokens[annotationColumn].substring(2));

// get uri of the marking if given in the dataset
String uri = "";
String uri = null;
if (uriColumn != -1 && tokens[uriColumn].startsWith("http")) {
uri = tokens[uriColumn];
}
Expand All @@ -233,7 +241,23 @@ protected TypedNamedEntity getWholeMarking(List<String> linesOfCurrentDoc, int p
break;
}
}
return new TypedNamedEntity(currentText.length(), surfaceForm.length(), uri,
new HashSet<String>(Arrays.asList(type)));
if (type != null) {
if (uri != null) {
return new TypedNamedEntity(currentText.length(), surfaceForm.length(), uri,
new HashSet<String>(Arrays.asList(type)));
} else {
return new TypedSpanImpl(currentText.length(), surfaceForm.length(),
new HashSet<String>(Arrays.asList(type)));
}
} else {
if (uri != null) {
return new NamedEntity(currentText.length(), surfaceForm.length(), uri);
} else {
LOGGER.warn(
"Found a marked piece of text without any further information: \"{}\". This is either an error in the dataset or this adapter is not correctly configured.",
surfaceForm);
return new SpanImpl(currentText.length(), surfaceForm.length());
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package org.aksw.gerbil.dataset.impl.indqner;

import org.aksw.gerbil.dataset.impl.conll.CoNLLTypeRetriever;
import org.aksw.gerbil.dataset.impl.conll.GenericCoNLLDataset;

/**
 * Dataset adapter for the IndQNER dataset. It configures the
 * {@link GenericCoNLLDataset} with the annotation column and the type tags
 * used by IndQNER.
 *
 * @author Neha
 * @author Michael R&ouml;der ([email protected])
 *
 */
public class IndQNERDataset extends GenericCoNLLDataset {

    /** Index of the column that contains the annotation tag of a token. */
    private static final int ANNOTATION_COLUMN = 1;
    /** The value -1 signals that the dataset has no URI column. */
    private static final int URI_COLUMN = -1;
    /**
     * Type retriever shared by all instances of this class. It is completely
     * populated once, when the class is initialised, and not modified afterwards.
     */
    private static final CoNLLTypeRetriever TYPE_TAGS = createTypeRetriever();

    /**
     * Constructor.
     *
     * @param file the path of the CoNLL formatted file that should be loaded
     */
    public IndQNERDataset(String file) {
        super(file, ANNOTATION_COLUMN, URI_COLUMN, TYPE_TAGS);
    }

    /**
     * Creates the type retriever that maps the type tags of IndQNER to type IRIs.
     * Doing this in one place, before the retriever is handed to any dataset
     * instance, avoids re-registering the mappings with every constructor call
     * and avoids writing to the shared retriever while it may already be in use.
     *
     * @return the completely configured type retriever
     */
    private static CoNLLTypeRetriever createTypeRetriever() {
        // Only the place and the person tag are given for the predefined types of
        // the retriever; no key is configured for the remaining ones.
        CoNLLTypeRetriever retriever = new CoNLLTypeRetriever("GeographicalLocation", null, null, null, null,
                "Person", null, null, null, null);
        retriever.addTypeURI("AfterlifeLocation", "https://corpus.quran.com/concept.jsp?id=afterlife-location");
        retriever.addTypeURI("Allah", "https://corpus.quran.com/concept.jsp?id=allah");
        retriever.addTypeURI("Angel", "https://github.com/dice-group/IndQNER/Angel"); // TODO: replace
        retriever.addTypeURI("Artifact", "https://corpus.quran.com/concept.jsp?id=artifact");
        retriever.addTypeURI("AstronomicalBody", "https://corpus.quran.com/concept.jsp?id=astronomical-body");
        retriever.addTypeURI("Color", "https://corpus.quran.com/concept.jsp?id=color");
        retriever.addTypeURI("Event", "https://corpus.quran.com/concept.jsp?id=event");
        retriever.addTypeURI("Food", "https://github.com/dice-group/IndQNER/Food"); // TODO: replace
        retriever.addTypeURI("HolyBook", "https://corpus.quran.com/concept.jsp?id=holy-book");
        retriever.addTypeURI("Language", "https://corpus.quran.com/concept.jsp?id=language");
        retriever.addTypeURI("Messenger", "https://github.com/dice-group/IndQNER/Messenger"); // TODO: replace
        retriever.addTypeURI("Prophet", "https://github.com/dice-group/IndQNER/Prophet"); // TODO: replace
        retriever.addTypeURI("Religion", "https://corpus.quran.com/concept.jsp?id=religion");
        retriever.addTypeURI("Throne", "https://corpus.quran.com/concept.jsp?id=allah%27s-throne");
        return retriever;
    }
}
20 changes: 19 additions & 1 deletion src/main/properties/datasets.properties
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,24 @@ org.aksw.gerbil.datasets.definition.IITB.cacheable=true
org.aksw.gerbil.datasets.definition.IITB.experimentType=A2KB
org.aksw.gerbil.datasets.definition.IITB.constructorArgs=${org.aksw.gerbil.datasets.IITBDatasetConfig.crawledDocs},${org.aksw.gerbil.datasets.IITBDatasetConfig.annotations}

### IndQNER
# The dev, test and train splits are loaded by the same adapter class. The class
# name has to match the package of IndQNERDataset (org.aksw.gerbil.dataset.impl.indqner).
org.aksw.gerbil.datasets.indqnerDatasetConfiguration.datasetdir=${org.aksw.gerbil.DataPath}/datasets/indqner
org.aksw.gerbil.datasets.definition.IndQNER-Dev.name=IndQNER-Dev
org.aksw.gerbil.datasets.definition.IndQNER-Dev.class=org.aksw.gerbil.dataset.impl.indqner.IndQNERDataset
org.aksw.gerbil.datasets.definition.IndQNER-Dev.constructorArgs=${org.aksw.gerbil.datasets.indqnerDatasetConfiguration.datasetdir}/dev.txt
org.aksw.gerbil.datasets.definition.IndQNER-Dev.cacheable=true
org.aksw.gerbil.datasets.definition.IndQNER-Dev.experimentType=RT2KB
org.aksw.gerbil.datasets.definition.IndQNER-Test.name=IndQNER-Test
org.aksw.gerbil.datasets.definition.IndQNER-Test.class=org.aksw.gerbil.dataset.impl.indqner.IndQNERDataset
org.aksw.gerbil.datasets.definition.IndQNER-Test.constructorArgs=${org.aksw.gerbil.datasets.indqnerDatasetConfiguration.datasetdir}/test.txt
org.aksw.gerbil.datasets.definition.IndQNER-Test.cacheable=true
org.aksw.gerbil.datasets.definition.IndQNER-Test.experimentType=RT2KB
org.aksw.gerbil.datasets.definition.IndQNER-Train.name=IndQNER-Train
org.aksw.gerbil.datasets.definition.IndQNER-Train.class=org.aksw.gerbil.dataset.impl.indqner.IndQNERDataset
org.aksw.gerbil.datasets.definition.IndQNER-Train.constructorArgs=${org.aksw.gerbil.datasets.indqnerDatasetConfiguration.datasetdir}/train.txt
org.aksw.gerbil.datasets.definition.IndQNER-Train.cacheable=true
org.aksw.gerbil.datasets.definition.IndQNER-Train.experimentType=RT2KB

### Kore50
org.aksw.gerbil.datasets.KORE50.file=${org.aksw.gerbil.DataPath}/datasets/KORE50/kore50-nif.ttl
org.aksw.gerbil.datasets.definition.KORE50.name=KORE50
Expand Down Expand Up @@ -645,4 +663,4 @@ org.aksw.gerbil.datasets.definition.N4T.cacheable=true
org.aksw.gerbil.datasets.definition.N4T.experimentType=A2KB
org.aksw.gerbil.datasets.definition.N4T.constructorArgs=${org.aksw.gerbil.datasets.N4TDatasetConfiguration.datasetdir}
org.aksw.gerbil.datasets.definition.N4T.check.class=org.aksw.gerbil.web.config.check.DirectoryChecker
org.aksw.gerbil.datasets.definition.N4T.check.args=${org.aksw.gerbil.datasets.N4TDatasetConfiguration.datasetdir}
org.aksw.gerbil.datasets.definition.N4T.check.args=${org.aksw.gerbil.datasets.N4TDatasetConfiguration.datasetdir}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.conll;

import java.io.File;
import java.io.IOException;
import java.util.List;

import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.apache.commons.io.FileUtils;
import org.junit.Assert;
import org.junit.Test;

/**
 * Base class for tests of CoNLL based dataset adapters. The given file content
 * is written to a temporary file, the dataset created by
 * {@link #createDataset(File)} is initialised with it, and one document text
 * and one marking of the loaded data are compared with the expected values.
 */
public abstract class AbstractGenericCoNLLDatasetTest {

    /** Content of the dataset file that is written to the temporary file. */
    private final String fileContent;
    /** Expected text of the document with the id {@link #documentId}. */
    private final String text;
    /** Expected marking at position {@link #markingId} of the checked document. */
    private final Marking expectedMarking;
    /** Index of the checked document within the list of loaded documents. */
    private final int documentId;
    /** Index of the checked marking within the markings of the checked document. */
    private final int markingId;

    /**
     * Constructor.
     *
     * @param fileContent     the content of the dataset file
     * @param text            the expected text of the checked document
     * @param expectedMarking the expected marking of the checked document
     * @param documentId      the index of the document that should be checked
     * @param markingId       the index of the marking that should be checked
     */
    public AbstractGenericCoNLLDatasetTest(String fileContent, String text, Marking expectedMarking, int documentId,
            int markingId) {
        this.fileContent = fileContent;
        this.text = text;
        this.expectedMarking = expectedMarking;
        this.documentId = documentId;
        this.markingId = markingId;
    }

    @Test
    public void test() throws IOException, GerbilException {
        // Create temporary file with given text and make sure that it does not
        // pile up in the temp directory, even if the test fails
        File file = File.createTempFile("test-dataset-", ".tsv");
        file.deleteOnExit();
        // NOTE(review): this overload writes with the platform default charset;
        // confirm that this matches the charset the dataset adapter reads with.
        FileUtils.write(file, fileContent);

        InitializableDataset dataset = createDataset(file);
        try {
            dataset.init();
            List<Document> documents = dataset.getInstances();
            Assert.assertNotNull(documents);
            Assert.assertTrue(documents.size() > documentId);
            Assert.assertEquals(text, documents.get(documentId).getText());
            List<Marking> markings = documents.get(documentId).getMarkings();
            Assert.assertNotNull(markings);
            Assert.assertTrue(markings.size() > markingId);
            Assert.assertEquals(expectedMarking, markings.get(markingId));
        } finally {
            // Close the dataset even if one of the assertions above failed
            dataset.close();
        }
    }

    /**
     * Creates the dataset instance that should be tested.
     *
     * @param file the temporary file containing the dataset content
     * @return the dataset that should load the given file
     */
    public abstract InitializableDataset createDataset(File file);

}
Loading

0 comments on commit 7d4ec0f

Please sign in to comment.