diff --git a/src/main/java/org/aksw/gerbil/bat/datasets/Microposts2014Dataset.java b/src/main/java/org/aksw/gerbil/bat/datasets/Microposts2014Dataset.java
index 779adc3f0..1fd128793 100644
--- a/src/main/java/org/aksw/gerbil/bat/datasets/Microposts2014Dataset.java
+++ b/src/main/java/org/aksw/gerbil/bat/datasets/Microposts2014Dataset.java
@@ -30,6 +30,7 @@
 import it.acubelab.batframework.utils.AnnotationException;
 import it.acubelab.batframework.utils.ProblemReduction;
 import it.acubelab.batframework.utils.WikipediaApiInterface;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.lang.MutableString;
 
 import java.io.BufferedReader;
@@ -48,13 +49,17 @@
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.xpath.XPathExpressionException;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;
 
-/** 
+/**
  * @author Giuseppe Rizzo
  */
 public class Microposts2014Dataset implements A2WDataset {
 
+    private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2014Dataset.class);
+
     private List<HashSet<Microposts2014Annotation>> annotations = new Vector<HashSet<Microposts2014Annotation>>();
     private List<MutableString> tweets = new Vector<MutableString>();
     private Pattern dbpediaUrlPattern = Pattern.compile("http://dbpedia.org/resource/(.*)");
@@ -89,33 +94,52 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
 
             if (mTweet.matches()) {
                 // current tweet
-                String tweet = mTweet.group(1); 
+                String tweet = mTweet.group(1);
                 tweets.add(new MutableString(tweet));
 
                 String pairs = mRecord.group(4);
                 if (pairs != null && !pairs.equals("")) {
                     String[] tAnn = pairs.split("\t");
-                    for (int i = 0; i < tAnn.length; i = i + 2) 
-                    {
+                    for (int i = 0; i < tAnn.length; i = i + 2)
+                    {
                         // fetch the DBpedia name
                         // TODO: naive assumption that all DBpedia resources have the corresponding Wikipedia ones
                         // better to be verified
                         Matcher mDBpedia = dbpediaUrlPattern.matcher(tAnn[i + 1]);
-                        if (mDBpedia.matches()) 
+                        if (mDBpedia.matches())
                         {
-                            String mention = tAnn[i]; 
-
+                            String mention = tAnn[i];
+
+                            // Let's start getting the title
+                            currentTitle = mDBpedia.group(1);
+                            currentTitle = URLDecoder.decode(currentTitle, "utf-8");
+
+                            // Try to create a Microposts2014Annotation object by searching the mention inside the
+                            // tweet
+                            Microposts2014Annotation annotation = null;
                             int offset = indexMentionAlreadySpotted(mention, currentAnns);
                             int currentPos = tweet.indexOf(mention, offset);
-
-                            currentTitle = mDBpedia.group(1);
-                            currentTitle = URLDecoder.decode(currentTitle, "utf-8");
-                            currentAnns.add(new Microposts2014Annotation(mention,currentPos, mention.length(), currentTitle));
-
-                            System.out.println(mention + " " + currentPos + " " + mention.length() + " " + currentTitle);
-
-                            titlesToPrefetch.add(currentTitle);
+                            if (currentPos >= 0) {
+                                annotation = new Microposts2014Annotation(mention, currentPos, mention.length(),
+                                        currentTitle);
+                            }
+                            if (annotation == null) {
+                                // Micha: In some cases the mention is not exactly the same as the part of the text.
+                                // For now, we only can try to remove hash tags and search again.
+                                annotation = findMentionInsideTweetIgnoringHashes(tweet, mention, offset,
+                                        currentTitle);
+                            }
+                            if (annotation == null) {
+                                LOGGER.error(
+                                        "Couldn't find mention=\"{}\" inside the tweet=\"{}\" (should be there after the offset {}). Ignoring this mention.",
+                                        mention, tweet, offset);
+                            } else {
+                                currentAnns.add(annotation);
+                                // System.out.println(mention + " " + currentPos + " " + mention.length() + " "
+                                // + currentTitle);
+                                titlesToPrefetch.add(currentTitle);
+                            }
                         }
                     }
 
@@ -141,7 +165,7 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
             for (Microposts2014Annotation aA : s) {
                 int wid = wikiApi.getIdByTitle(aA.title);
                 if (wid == -1)
-                    System.out.println("ERROR: Dataset is malformed: Wikipedia API could not find page " + aA.title);
+                    LOGGER.warn("Dataset is malformed: Wikipedia API could not find page " + aA.title);
                 else
                     sA.add(new Annotation(aA.position, aA.length, wid));
             }
@@ -150,6 +174,61 @@
         }
     }
 
+    /**
+     * A very simple workaround to search for a mention without hashes. Note that this only works if the mention
+     * couldn't be found because the tweet contains hash tags that should be part of the mention.
+     * 
+     * @param tweet
+     *            the tweet
+     * @param mention
+     *            the mention that couldn't be found directly inside the tweet
+     * @param offset
+     *            the position from which the search should start
+     * @param wikiTitle
+     *            the title of the entity inside the Wikipedia
+     * 
+     * @return the annotation containing the recovered mention, or null if the mention couldn't be found
+     */
+    protected static Microposts2014Annotation findMentionInsideTweetIgnoringHashes(String tweet, String mention,
+            int offset, String wikiTitle) {
+        IntArrayList hashes = new IntArrayList();
+        int pos = tweet.indexOf('#');
+        while (pos >= 0) {
+            hashes.add(pos);
+            pos = tweet.indexOf('#', pos + 1);
+        }
+        // There are no hashes --> the problem of finding the mention can't be solved by removing the hashes
+        if (hashes.size() == 0) {
+            return null;
+        }
+        // The offset might have been moved through the removing of the hashes.
+        int newOffset = offset;
+        for (int i = 0; (i < hashes.size()) && (hashes.get(i) < offset); ++i) {
+            --newOffset;
+        }
+        String newTweet = tweet.replaceAll("#", "");
+        pos = newTweet.indexOf(mention, newOffset);
+        // if the mention couldn't be found
+        if (pos < 0) {
+            return null;
+        }
+        // find the start and end positions of the mention inside the original tweet by looking at the list of hashes
+        int startPos = pos;
+        int endPos = pos + mention.length();
+        for (int i = 0; i < hashes.size(); ++i) {
+            if (hashes.get(i) < endPos) {
+                ++endPos;
+                if (hashes.get(i) < startPos) {
+                    ++startPos;
+                }
+            }
+        }
+        String newMention = new String(tweet.substring(startPos, endPos));
+        LOGGER.debug("Couldn't find \"{}\" but found \"{}\" instead.", mention, newMention);
+        return new Microposts2014Annotation(newMention, startPos, newMention.length(),
+                wikiTitle);
+    }
+
     @Override
     public int getSize() {
         return annotations.size();
@@ -199,22 +278,22 @@ public String getName() {
 
     private int indexMentionAlreadySpotted(String mention, List<Microposts2014Annotation> currentAnns) {
-        int result = 0;
-        for (Microposts2014Annotation a : currentAnns) {
-            if(a.mention.equals(mention))
-                result = a.position + mention.length(); //if many, then we get the last
-        }
-        return result;
-    }
-
-    private class Microposts2014Annotation {
+        int result = 0;
+        for (Microposts2014Annotation a : currentAnns) {
+            if (a.mention.equals(mention))
+                result = a.position + mention.length(); // if many, then we get the last
+        }
+        return result;
+    }
+
+    protected static class Microposts2014Annotation {
         public Microposts2014Annotation(String mention, int position, int length, String title) {
-            this.mention = mention; 
+            this.mention = mention;
             this.position = position;
             this.title = title;
             this.length = length;
         }
-        
+
         public String mention;
         public String title;
         public int position;
diff --git a/src/main/java/org/aksw/gerbil/datasets/Microposts2014Config.java b/src/main/java/org/aksw/gerbil/datasets/Microposts2014Config.java
index 03a610191..f87db6ef2 100644
--- a/src/main/java/org/aksw/gerbil/datasets/Microposts2014Config.java
+++ b/src/main/java/org/aksw/gerbil/datasets/Microposts2014Config.java
@@ -33,29 +33,29 @@
 import org.aksw.gerbil.datatypes.ExperimentType;
 
 /**
- * ...
- * 
+ * Configuration class that is able to load the Microposts2014 datasets (train and test).
+ * The datasets are distinguished using the {@link Microposts2014Chunk} enum.
+ * 
  * @author Giuseppe Rizzo
  */
 public class Microposts2014Config extends AbstractDatasetConfiguration {
 
-    public static final String DATASET_NAME_START = "Microposts2014"; 
+    public static final String DATASET_NAME_START = "Microposts2014";
 
     private static final String DATASET_FILE_PROPERTY_NAME = "org.aksw.gerbil.datasets.Microposts2014DatasetConfig";
-    
+
     private Microposts2014Chunk chunk;
     private WikipediaApiInterface wikiApi;
 
     public static enum Microposts2014Chunk {
         TRAIN, TEST
     }
-    
-    public Microposts2014Config(
-            Microposts2014Chunk chunk,
-            WikipediaApiInterface wikiApi
-            )
-    {
-        super(DATASET_NAME_START, true, ExperimentType.Sa2KB);
-        this.chunk = chunk;
+
+    public Microposts2014Config(
+            Microposts2014Chunk chunk,
+            WikipediaApiInterface wikiApi)
+    {
+        super(DATASET_NAME_START, true, ExperimentType.Sa2KB);
+        this.chunk = chunk;
         this.wikiApi = wikiApi;
         // Set the correct name
         switch (chunk) {
@@ -68,27 +68,27 @@ public Microposts2014Config(
             break;
         }
         }
-    } 
+    }
 
-    @Override
-    protected TopicDataset loadDataset() throws Exception {
+    @Override
+    protected TopicDataset loadDataset() throws Exception {
         switch (chunk) {
         case TRAIN: {
-            String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Train")); 
+            String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Train"));
             if (file == null) {
                 throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
             }
             return new Microposts2014Dataset(file, wikiApi);
         }
         case TEST: {
-            String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Test")); 
+            String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Test"));
             if (file == null) {
                 throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
-            } 
+            }
             return new Microposts2014Dataset(file, wikiApi);
         }
         }
         return null;
-    } 
+    }
 }
diff --git a/src/test/java/org/aksw/gerbil/Microposts2014Test.java b/src/test/java/org/aksw/gerbil/Microposts2014Test.java
index dc7046234..3eb496b3d 100644
--- a/src/test/java/org/aksw/gerbil/Microposts2014Test.java
+++ b/src/test/java/org/aksw/gerbil/Microposts2014Test.java
@@ -36,30 +36,30 @@
 import org.junit.Ignore;
 
 /**
- * ...
- * 
+ * Class for testing the Microposts2014 dataset.
+ * 
  * @author Giuseppe Rizzo
  */
 @Ignore
 public class Microposts2014Test {
 
-    public static void main(String[] args) {
-
-        WikipediaApiInterface wikiAPI = SingletonWikipediaApi.getInstance();
-        ExperimentTaskConfiguration taskConfigs[] =
-                new ExperimentTaskConfiguration[]
-                {
-                        new ExperimentTaskConfiguration(
-                                new NERDAnnotatorConfig(wikiAPI),
-                                new Microposts2014Config(Microposts2014Chunk.TRAIN, SingletonWikipediaApi.getInstance()),
-                                ExperimentType.D2KB,
-                                Matching.STRONG_ANNOTATION_MATCH) };
-        Experimenter experimenter = new Experimenter(wikiAPI,
-                new SimpleLoggingDAO4Debugging(),
-                taskConfigs,
-                "NERD_TEST");
+    public static void main(String[] args) {
+
+        WikipediaApiInterface wikiAPI = SingletonWikipediaApi.getInstance();
+        ExperimentTaskConfiguration taskConfigs[] =
+                new ExperimentTaskConfiguration[]
+                {
+                        new ExperimentTaskConfiguration(
+                                new NERDAnnotatorConfig(wikiAPI),
+                                new Microposts2014Config(Microposts2014Chunk.TRAIN, SingletonWikipediaApi.getInstance()),
+                                ExperimentType.D2KB,
+                                Matching.STRONG_ANNOTATION_MATCH) };
+        Experimenter experimenter = new Experimenter(wikiAPI,
+                new SimpleLoggingDAO4Debugging(),
+                taskConfigs,
+                "MICROPOSTS_TEST");
         experimenter.run();
-    } 
+    }
 }
diff --git a/src/test/java/org/aksw/gerbil/bat/datasets/Microposts2014DatasetMentionSearchTest.java b/src/test/java/org/aksw/gerbil/bat/datasets/Microposts2014DatasetMentionSearchTest.java
new file mode 100644
index 000000000..903fa283a
--- /dev/null
+++ b/src/test/java/org/aksw/gerbil/bat/datasets/Microposts2014DatasetMentionSearchTest.java
@@ -0,0 +1,53 @@
+package org.aksw.gerbil.bat.datasets;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.aksw.gerbil.bat.datasets.Microposts2014Dataset.Microposts2014Annotation;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(Parameterized.class)
+public class Microposts2014DatasetMentionSearchTest {
+
+    @Parameters
+    public static Collection<Object[]> data() {
+        List<Object[]> testConfigs = new ArrayList<Object[]>();
+        testConfigs
+                .add(new Object[] {
+                        "NOTW phone hacking",
+                        "Rupert #Murdoch, asked who was responsible for #NOTW phone #hacking? 'The people I trusted & maybe the people they trusted'",
+                        "#NOTW phone #hacking" });
+        testConfigs.add(new Object[] { "Amy Winehouse",
+                "#Amy #Winehouse Is #Dead After a Suspected Drug Overdose http://t.co/9KBWCeN via @YahooNews",
+                "#Amy #Winehouse" });
+        testConfigs
+                .add(new Object[] {
+                        "White Sox",
+                        "#MLB Live Score Update #White #Sox (4) - #Indians (2) Final Play By Play Click link: http://rotoinfo.com/gameview?310724105",
+                        "#White #Sox" });
+        return testConfigs;
+    }
+
+    private String mention;
+    private String tweet;
+    private String expectedMention;
+
+    public Microposts2014DatasetMentionSearchTest(String mention, String tweet, String expectedMention) {
+        this.mention = mention;
+        this.tweet = tweet;
+        this.expectedMention = expectedMention;
+    }
+
+    @Test
+    public void test() {
+        Microposts2014Annotation annotation = Microposts2014Dataset.findMentionInsideTweetIgnoringHashes(tweet,
+                mention, 0, null);
+        Assert.assertNotNull(annotation);
+        Assert.assertEquals(expectedMention, annotation.mention);
+    }
+}
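
Reviewer note (not part of the patch): the index-mapping trick in findMentionInsideTweetIgnoringHashes is easier to check in isolation. Below is a minimal, self-contained sketch of the same idea; the class and method names are made up for illustration, and unlike the real method it ignores the search offset and the Wikipedia title. The steps: record the position of every '#', search the hash-free tweet, then shift each match boundary one character to the right for every removed hash in front of it.

    import java.util.ArrayList;
    import java.util.List;

    public class HashIgnoringSearchSketch {

        // Standalone copy of the index-mapping idea used by findMentionInsideTweetIgnoringHashes
        static String findIgnoringHashes(String tweet, String mention) {
            // remember where every '#' occurs in the original tweet
            List<Integer> hashes = new ArrayList<Integer>();
            for (int pos = tweet.indexOf('#'); pos >= 0; pos = tweet.indexOf('#', pos + 1)) {
                hashes.add(pos);
            }
            if (hashes.isEmpty()) {
                return null; // no hashes, so removing them can't help
            }
            // search inside the hash-free version of the tweet
            int pos = tweet.replaceAll("#", "").indexOf(mention);
            if (pos < 0) {
                return null;
            }
            // map the match back to original coordinates: every removed '#'
            // lying before a boundary shifts that boundary one step right
            int start = pos;
            int end = pos + mention.length();
            for (int hash : hashes) {
                if (hash < end) {
                    ++end;
                    if (hash < start) {
                        ++start;
                    }
                }
            }
            return tweet.substring(start, end);
        }

        public static void main(String[] args) {
            // prints "#Amy #Winehouse", matching the second parameterized test case
            System.out.println(findIgnoringHashes(
                    "#Amy #Winehouse Is #Dead After a Suspected Drug Overdose", "Amy Winehouse"));
        }
    }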
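
A related setup note: loadDataset reads the dataset locations from the properties org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Train and org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Test, which this patch does not define anywhere. Assuming they are declared alongside GERBIL's other dataset properties (the file paths below are placeholders, not part of this change), the entries would look roughly like:

    org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Train=/path/to/Microposts2014/train.tsv
    org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Test=/path/to/Microposts2014/test.tsv

If either key is missing, GerbilConfiguration.getInstance().getString(...) returns null and loadDataset throws the IOException shown above (note that the exception message prints only the property base name, without the .Train or .Test suffix).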