Fixed a bug that caused a loading error if a mention couldn't be found inside a tweet. Fixed the problem that some mentions are listed without hashtags even though they contain hashtags inside the real tweet. Added a test for this type of 'search with ignored hashtags'. refs #81
MichaelRoeder committed Jan 20, 2015
1 parent a543e9b commit 2e6627e
Showing 4 changed files with 196 additions and 64 deletions.
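The core of the change: the Microposts2014 gold standard sometimes lists a mention without the '#' characters that the tweet itself contains, so a plain indexOf lookup fails and the annotation was silently dropped. A minimal sketch of the fallback idea, using a tweet from the new test file (illustrative only, not the committed code — see findMentionInsideTweetIgnoringHashes in the diff below):

public class HashIgnoringSearchSketch {
    public static void main(String[] args) {
        // Tweet and mention taken from the new test file in this commit.
        String tweet = "#Amy #Winehouse Is #Dead After a Suspected Drug Overdose http://t.co/9KBWCeN via @YahooNews";
        String mention = "Amy Winehouse";

        // Direct search fails because of the '#' characters inside the tweet.
        System.out.println(tweet.indexOf(mention)); // -1

        // Searching the hash-stripped tweet succeeds.
        System.out.println(tweet.replaceAll("#", "").indexOf(mention)); // 0

        // The committed fix then shifts the match positions by the number of
        // removed '#' characters to recover "#Amy #Winehouse" in the original tweet.
    }
}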
133 changes: 106 additions & 27 deletions src/main/java/org/aksw/gerbil/bat/datasets/Microposts2014Dataset.java
@@ -30,6 +30,7 @@
import it.acubelab.batframework.utils.AnnotationException;
import it.acubelab.batframework.utils.ProblemReduction;
import it.acubelab.batframework.utils.WikipediaApiInterface;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.lang.MutableString;

import java.io.BufferedReader;
@@ -48,13 +49,17 @@
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

/**
* @author Giuseppe Rizzo <[email protected]>
*/
public class Microposts2014Dataset implements A2WDataset {

private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2014Dataset.class);

private List<HashSet<Annotation>> annotations = new Vector<HashSet<Annotation>>();
private List<MutableString> tweets = new Vector<MutableString>();
private Pattern dbpediaUrlPattern = Pattern.compile("http://dbpedia.org/resource/(.*)");
@@ -89,33 +94,52 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
if (mTweet.matches())
{
// current tweet
String tweet = mTweet.group(1);
tweets.add(new MutableString(tweet));

String pairs = mRecord.group(4);
if (pairs != null && !pairs.equals(""))
{
String[] tAnn = pairs.split("\t");
for (int i = 0; i < tAnn.length; i = i + 2)
{
// fetch the DBpedia name
// TODO: naive assumption that all DBpedia resources have the corresponding Wikipedia ones
// better to be verified
Matcher mDBpedia = dbpediaUrlPattern.matcher(tAnn[i + 1]);
if (mDBpedia.matches())
{
String mention = tAnn[i];

// Let's start getting the title
currentTitle = mDBpedia.group(1);
currentTitle = URLDecoder.decode(currentTitle, "utf-8");

// Try to create a Microposts2014Annotation object by searching the mention inside the
// tweet
Microposts2014Annotation annotation = null;
int offset = indexMentionAlreadySpotted(mention, currentAnns);
int currentPos = tweet.indexOf(mention, offset);

if (currentPos >= 0) {
annotation = new Microposts2014Annotation(mention, currentPos, mention.length(),
currentTitle);
}
if (annotation == null) {
// Micha: In some cases the mention is not exactly the same as the part of the text.
// For now, we only can try to remove hash tags and search again.
annotation = findMentionInsideTweetIgnoringHashes(tweet, mention, offset,
currentTitle);
}
if (annotation == null) {
LOGGER.error(
"Couldn't find mention=\"{}\" inside the tweet=\"{}\" (should be there after the offset {}). Ignoring this mention.",
mention, tweet, offset);
} else {
currentAnns.add(annotation);
// System.out.println(mention + " " + currentPos + " " + mention.length() + " "
// + currentTitle);
titlesToPrefetch.add(currentTitle);
}
}

}
@@ -141,7 +165,7 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
for (Microposts2014Annotation aA : s) {
int wid = wikiApi.getIdByTitle(aA.title);
if (wid == -1)
System.out.println("ERROR: Dataset is malformed: Wikipedia API could not find page " + aA.title);
LOGGER.warn("Dataset is malformed: Wikipedia API could not find page " + aA.title);
else
sA.add(new Annotation(aA.position, aA.length, wid));
}
@@ -150,6 +174,61 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
}
}

/**
 * A very simple workaround to search for a mention without hashes. Note that this only works if the mention
 * couldn't be found because the tweet contains hash tags that should be part of the mention.
 *
 * @param tweet
 *            the tweet
 * @param mention
 *            the mention that couldn't be found directly inside the tweet
 * @param offset
 *            the position from which the search should start
 * @param wikiTitle
 *            the title of the entity inside the Wikipedia
 *
 * @return the annotation that was found, or null if the mention couldn't be found even after removing the hashes
 */
protected static Microposts2014Annotation findMentionInsideTweetIgnoringHashes(String tweet, String mention,
int offset, String wikiTitle) {
IntArrayList hashes = new IntArrayList();
int pos = tweet.indexOf('#');
while (pos >= 0) {
hashes.add(pos);
pos = tweet.indexOf('#', pos + 1);
}
// There are no hashes --> the problem of finding the mention can't be solved by removing the hashes
if (hashes.size() == 0) {
return null;
}
// The offset might have been moved through the removing of the hashes.
int newOffset = offset;
for (int i = 0; (i < hashes.size() && (hashes.get(i) < newOffset)); ++i) {
--newOffset;
}
String newTweet = tweet.replaceAll("#", "");
pos = newTweet.indexOf(mention, newOffset);
// if the mention couldn't be found
if (pos < 0) {
return null;
}
// find the start and end positions of the mention inside the original tweet by looking at the list of hashes
int startPos = pos;
int endPos = pos + mention.length();
for (int i = 0; i < hashes.size(); ++i) {
if (hashes.get(i) < endPos) {
++endPos;
if (hashes.get(i) < startPos) {
++startPos;
}
}
}
String newMention = new String(tweet.substring(startPos, endPos));
LOGGER.debug("Couldn't find \"{}\" but found \"{}\" instead.", mention, newMention);
return new Microposts2014Annotation(newMention, startPos, newMention.length(),
wikiTitle);
}

@Override
public int getSize() {
return annotations.size();
@@ -199,22 +278,22 @@ public String getName() {

private int indexMentionAlreadySpotted(String mention, List<Microposts2014Annotation> currentAnns)
{
int result = 0;
for (Microposts2014Annotation a : currentAnns) {
if (a.mention.equals(mention))
result = a.position + mention.length(); // if many, then we get the last
}
return result;
}

protected static class Microposts2014Annotation {
public Microposts2014Annotation(String mention, int position, int length, String title) {
this.mention = mention;
this.position = position;
this.title = title;
this.length = length;
}

public String mention;
public String title;
public int position;
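The position remapping at the end of findMentionInsideTweetIgnoringHashes deserves a closer look: the hash positions are collected in ascending order, and the match found in the stripped tweet is widened by one character for every '#' that falls before its (growing) end position. A worked trace on one of the new test cases, added here for illustration (0-based positions):

// tweet   = "#MLB Live Score Update #White #Sox (4) - #Indians (2) ..."
// mention = "White Sox"
// hashes  = [0, 23, 30, 41]                     // '#' positions in the tweet
// stripped tweet: "MLB Live Score Update White Sox (4) - Indians (2) ..."
// pos = 22  ->  startPos = 22, endPos = 31
// hash 0:  0 < 31  -> endPos = 32;  0 < 22 -> startPos = 23
// hash 23: 23 < 32 -> endPos = 33; 23 < 23 is false, startPos stays 23
// hash 30: 30 < 33 -> endPos = 34; 30 < 23 is false
// hash 41: 41 < 34 is false -> loop has no further effect
// tweet.substring(23, 34) = "#White #Sox"       // the recovered mention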
38 changes: 19 additions & 19 deletions src/main/java/org/aksw/gerbil/datasets/Microposts2014Config.java
@@ -33,29 +33,29 @@
import org.aksw.gerbil.datatypes.ExperimentType;

/**
* Configuration class that is able to load the Microposts2014 datasets (train and test).
* The datasets are distinguished using the {@link Microposts2014Chunk} enum.
*
* @author Giuseppe Rizzo <[email protected]>
*/
public class Microposts2014Config extends AbstractDatasetConfiguration {

public static final String DATASET_NAME_START = "Microposts2014";
private static final String DATASET_FILE_PROPERTY_NAME = "org.aksw.gerbil.datasets.Microposts2014DatasetConfig";

private Microposts2014Chunk chunk;
private WikipediaApiInterface wikiApi;

public static enum Microposts2014Chunk {
TRAIN, TEST
}

public Microposts2014Config(
Microposts2014Chunk chunk,
WikipediaApiInterface wikiApi)
{
super(DATASET_NAME_START, true, ExperimentType.Sa2KB);
this.chunk = chunk;
this.wikiApi = wikiApi;
// Set the correct name
switch (chunk) {
@@ -68,27 +68,27 @@ public Microposts2014Config(
break;
}
}

@Override
protected TopicDataset loadDataset() throws Exception {
switch (chunk) {
case TRAIN: {
String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Train"));
if (file == null) {
throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
}
return new Microposts2014Dataset(file, wikiApi);
}
case TEST: {
String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Test"));
if (file == null) {
throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
}
return new Microposts2014Dataset(file, wikiApi);
}
}
return null;
}

}
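For reference, loadDataset() resolves the dataset files through GERBIL's configuration by appending ".Train" or ".Test" to the property name defined above. The corresponding entries would look roughly like this (only the property keys come from the code; the file name gerbil.properties and the paths are assumptions, not part of this commit):

# Assumed configuration entries; only the property keys are taken from the class above.
org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Train=gerbil_data/datasets/microposts2014/train.tsv
org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Test=gerbil_data/datasets/microposts2014/test.tsv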
36 changes: 18 additions & 18 deletions src/test/java/org/aksw/gerbil/Microposts2014Test.java
@@ -36,30 +36,30 @@
import org.junit.Ignore;

/**
* Class for testing the Microposts2014 dataset.
*
* @author Giuseppe Rizzo <[email protected]>
*/
@Ignore
public class Microposts2014Test {

public static void main(String[] args) {

WikipediaApiInterface wikiAPI = SingletonWikipediaApi.getInstance();
ExperimentTaskConfiguration taskConfigs[] =
new ExperimentTaskConfiguration[]
{
new ExperimentTaskConfiguration(
new NERDAnnotatorConfig(wikiAPI),
new Microposts2014Config(Microposts2014Chunk.TRAIN, SingletonWikipediaApi.getInstance()),
ExperimentType.D2KB,
Matching.STRONG_ANNOTATION_MATCH) };
Experimenter experimenter = new Experimenter(wikiAPI,
new SimpleLoggingDAO4Debugging(),
taskConfigs,
"MICROPOSTS_TEST");
experimenter.run();

}

}
53 changes: 53 additions & 0 deletions src/test/java/org/aksw/gerbil/bat/datasets/Microposts2014DatasetMentionSearchTest.java
@@ -0,0 +1,53 @@
package org.aksw.gerbil.bat.datasets;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

import org.aksw.gerbil.bat.datasets.Microposts2014Dataset.Microposts2014Annotation;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

@RunWith(Parameterized.class)
public class Microposts2014DatasetMentionSearchTest {

@Parameters
public static Collection<Object[]> data() {
List<Object[]> testConfigs = new ArrayList<Object[]>();
testConfigs
.add(new Object[] {
"NOTW phone hacking",
"Rupert #Murdoch, asked who was responsible for #NOTW phone #hacking? 'The people I trusted & maybe the people they trusted'",
"#NOTW phone #hacking" });
testConfigs.add(new Object[] { "Amy Winehouse",
"#Amy #Winehouse Is #Dead After a Suspected Drug Overdose http://t.co/9KBWCeN via @YahooNews",
"#Amy #Winehouse" });
testConfigs
.add(new Object[] {
"White Sox",
"#MLB Live Score Update #White #Sox (4) - #Indians (2) Final Play By Play Click link: http://rotoinfo.com/gameview?310724105",
"#White #Sox" });
return testConfigs;
}

private String mention;
private String tweet;
private String expectedMention;

public Microposts2014DatasetMentionSearchTest(String mention, String tweet, String expectedMention) {
this.mention = mention;
this.tweet = tweet;
this.expectedMention = expectedMention;
}

@Test
public void test() {
Microposts2014Annotation annotation = Microposts2014Dataset.findMentionInsideTweetIgnoringHashes(tweet,
mention, 0, null);
Assert.assertNotNull(annotation);
Assert.assertEquals(expectedMention, annotation.mention);
}
}
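The parameterized test calls the static helper directly, passing offset 0 and a null Wikipedia title (which the helper stores but never dereferences). With a standard Maven Surefire setup it should be runnable in isolation, e.g.:

mvn test -Dtest=Microposts2014DatasetMentionSearchTest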
