Commit
Fixed a bug that caused a loading error if a mention couldn't be found inside a tweet. Fixed the problem that some mentions are listed without their hashtags even though the real tweet contains hashtags inside the mention. Added a test for this type of 'search with ignored hashtags'. refs #81
1 parent a543e9b · commit 2e6627e
Showing 4 changed files with 196 additions and 64 deletions.
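The central addition of this commit is the fallback search Microposts2014Dataset.findMentionInsideTweetIgnoringHashes, which strips the '#' characters from a tweet, searches for the mention again, and maps the match back to positions in the original tweet. A minimal sketch of the intended behaviour, built from one of the test cases added below (the MentionSearchSketch class and its main harness are hypothetical; the method and the annotation fields are package-visible, so the sketch assumes it lives in org.aksw.gerbil.bat.datasets):

    package org.aksw.gerbil.bat.datasets;

    import org.aksw.gerbil.bat.datasets.Microposts2014Dataset.Microposts2014Annotation;

    public class MentionSearchSketch {
        public static void main(String[] args) {
            String tweet = "#Amy #Winehouse Is #Dead After a Suspected Drug Overdose http://t.co/9KBWCeN via @YahooNews";
            String mention = "Amy Winehouse"; // the mention as it is listed in the dataset file

            // A plain indexOf fails: the tweet spells the mention with hash tags.
            System.out.println(tweet.indexOf(mention)); // -1, which used to cause the loading error

            // The new fallback removes the hashes, searches again and re-expands the match.
            Microposts2014Annotation annotation =
                    Microposts2014Dataset.findMentionInsideTweetIgnoringHashes(tweet, mention, 0, null);
            System.out.println(annotation.mention);  // "#Amy #Winehouse"
            System.out.println(annotation.position); // 0, relative to the original tweet
        }
    }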
Microposts2014Dataset.java
@@ -30,6 +30,7 @@
 import it.acubelab.batframework.utils.AnnotationException;
 import it.acubelab.batframework.utils.ProblemReduction;
 import it.acubelab.batframework.utils.WikipediaApiInterface;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.lang.MutableString;

 import java.io.BufferedReader;
@@ -48,13 +49,17 @@
 import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.xpath.XPathExpressionException;

+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;

 /**
  * @author Giuseppe Rizzo <[email protected]>
  */
 public class Microposts2014Dataset implements A2WDataset {

+    private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2014Dataset.class);
+
     private List<HashSet<Annotation>> annotations = new Vector<HashSet<Annotation>>();
     private List<MutableString> tweets = new Vector<MutableString>();
     private Pattern dbpediaUrlPattern = Pattern.compile("http://dbpedia.org/resource/(.*)");
@@ -89,33 +94,52 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
         if (mTweet.matches())
         {
             // current tweet
             String tweet = mTweet.group(1);
             tweets.add(new MutableString(tweet));

             String pairs = mRecord.group(4);
             if (pairs != null && !pairs.equals(""))
             {
                 String[] tAnn = pairs.split("\t");
                 for (int i = 0; i < tAnn.length; i = i + 2)
                 {
                     // fetch the DBpedia name
                     // TODO: naive assumption that all DBpedia resources have the corresponding Wikipedia ones
                     // better to be verified
                     Matcher mDBpedia = dbpediaUrlPattern.matcher(tAnn[i + 1]);
                     if (mDBpedia.matches())
                     {
                         String mention = tAnn[i];

                         // Let's start getting the title
                         currentTitle = mDBpedia.group(1);
                         currentTitle = URLDecoder.decode(currentTitle, "utf-8");

+                        // Try to create a Microposts2014Annotation object by searching the mention inside the
+                        // tweet
+                        Microposts2014Annotation annotation = null;
                         int offset = indexMentionAlreadySpotted(mention, currentAnns);
                         int currentPos = tweet.indexOf(mention, offset);

-                        currentAnns.add(new Microposts2014Annotation(mention, currentPos, mention.length(), currentTitle));
-
-                        System.out.println(mention + " " + currentPos + " " + mention.length() + " " + currentTitle);
-
-                        titlesToPrefetch.add(currentTitle);
+                        if (currentPos >= 0) {
+                            annotation = new Microposts2014Annotation(mention, currentPos, mention.length(),
+                                    currentTitle);
+                        }
+                        if (annotation == null) {
+                            // Micha: In some cases the mention is not exactly the same as the part of the text.
+                            // For now, we only can try to remove hash tags and search again.
+                            annotation = findMentionInsideTweetIgnoringHashes(tweet, mention, offset,
+                                    currentTitle);
+                        }
+                        if (annotation == null) {
+                            LOGGER.error(
+                                    "Couldn't find mention=\"{}\" inside the tweet=\"{}\" (should be there after the offset {}). Ignoring this mention.",
+                                    mention, tweet, offset);
+                        } else {
+                            currentAnns.add(annotation);
+                            // System.out.println(mention + " " + currentPos + " " + mention.length() + " "
+                            //         + currentTitle);
+                            titlesToPrefetch.add(currentTitle);
+                        }
                     }

                 }
@@ -141,7 +165,7 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
             for (Microposts2014Annotation aA : s) {
                 int wid = wikiApi.getIdByTitle(aA.title);
                 if (wid == -1)
-                    System.out.println("ERROR: Dataset is malformed: Wikipedia API could not find page " + aA.title);
+                    LOGGER.warn("Dataset is malformed: Wikipedia API could not find page " + aA.title);
                 else
                     sA.add(new Annotation(aA.position, aA.length, wid));
             }
@@ -150,6 +174,61 @@ public Microposts2014Dataset(String file, WikipediaApiInterface wikiApi)
         }
     }

+    /**
+     * A very simple workaround to search for a mention without hashes. Note that this only works if the mention
+     * couldn't be found because the tweet contains hash tags that should be part of the mention.
+     *
+     * @param tweet
+     *            the tweet
+     * @param mention
+     *            the mention that couldn't be found directly inside the tweet
+     * @param offset
+     *            the position from which the search should start
+     * @param wikiTitle
+     *            the title of the entity inside Wikipedia
+     *
+     * @return the annotation pointing at the mention as it occurs inside the tweet (including its hash tags), or
+     *         null if the mention couldn't be found even with the hash tags removed
+     */
+    protected static Microposts2014Annotation findMentionInsideTweetIgnoringHashes(String tweet, String mention,
+            int offset, String wikiTitle) {
+        IntArrayList hashes = new IntArrayList();
+        int pos = tweet.indexOf('#');
+        while (pos >= 0) {
+            hashes.add(pos);
+            pos = tweet.indexOf('#', pos + 1);
+        }
+        // There are no hashes --> the problem of finding the mention can't be solved by removing the hashes
+        if (hashes.size() == 0) {
+            return null;
+        }
+        // The offset has to be shifted, too, since the hashes in front of it are removed.
+        int newOffset = offset;
+        for (int i = 0; (i < hashes.size()) && (hashes.get(i) < offset); ++i) {
+            --newOffset;
+        }
+        String newTweet = tweet.replaceAll("#", "");
+        pos = newTweet.indexOf(mention, newOffset);
+        // if the mention couldn't be found
+        if (pos < 0) {
+            return null;
+        }
+        // find the start and end positions of the mention inside the original tweet by looking at the list of hashes
+        int startPos = pos;
+        int endPos = pos + mention.length();
+        for (int i = 0; i < hashes.size(); ++i) {
+            if (hashes.get(i) < endPos) {
+                ++endPos;
+                if (hashes.get(i) < startPos) {
+                    ++startPos;
+                }
+            }
+        }
+        String newMention = tweet.substring(startPos, endPos);
+        LOGGER.debug("Couldn't find \"{}\" but found \"{}\" instead.", mention, newMention);
+        return new Microposts2014Annotation(newMention, startPos, newMention.length(), wikiTitle);
+    }
+
     @Override
     public int getSize() {
         return annotations.size();
@@ -199,22 +278,22 @@ public String getName() {

     private int indexMentionAlreadySpotted(String mention, List<Microposts2014Annotation> currentAnns)
     {
         int result = 0;
         for (Microposts2014Annotation a : currentAnns) {
-            if(a.mention.equals(mention))
-                result = a.position + mention.length(); //if many, then we get the last
+            if (a.mention.equals(mention))
+                result = a.position + mention.length(); // if many, then we get the last
         }
         return result;
     }

-    private class Microposts2014Annotation {
+    protected static class Microposts2014Annotation {
         public Microposts2014Annotation(String mention, int position, int length, String title) {
             this.mention = mention;
             this.position = position;
             this.title = title;
             this.length = length;
         }

         public String mention;
         public String title;
         public int position;
Microposts2014Config.java
@@ -33,29 +33,29 @@
 import org.aksw.gerbil.datatypes.ExperimentType;

 /**
- * ...
+ * Configuration class that is able to load the Micropost2014 datasets (train and test).
+ * The datasets are distinguished using the {@link Microposts2014Chunk} enum.
  *
  * @author Giuseppe Rizzo <[email protected]>
  */
 public class Microposts2014Config extends AbstractDatasetConfiguration {

     public static final String DATASET_NAME_START = "Microposts2014";
     private static final String DATASET_FILE_PROPERTY_NAME = "org.aksw.gerbil.datasets.Microposts2014DatasetConfig";

     private Microposts2014Chunk chunk;
     private WikipediaApiInterface wikiApi;

     public static enum Microposts2014Chunk {
         TRAIN, TEST
     }

     public Microposts2014Config(
             Microposts2014Chunk chunk,
-            WikipediaApiInterface wikiApi
-    )
+            WikipediaApiInterface wikiApi)
     {
         super(DATASET_NAME_START, true, ExperimentType.Sa2KB);
         this.chunk = chunk;
         this.wikiApi = wikiApi;
         // Set the correct name
         switch (chunk) {
@@ -68,27 +68,27 @@ public Microposts2014Config(
                 break;
         }
     }

     @Override
     protected TopicDataset loadDataset() throws Exception {
         switch (chunk) {
         case TRAIN: {
             String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Train"));
             if (file == null) {
                 throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
             }
             return new Microposts2014Dataset(file, wikiApi);
         }
         case TEST: {
             String file = GerbilConfiguration.getInstance().getString(DATASET_FILE_PROPERTY_NAME.concat(".Test"));
             if (file == null) {
                 throw new IOException("Couldn't load needed Property \"" + DATASET_FILE_PROPERTY_NAME + "\".");
             }
             return new Microposts2014Dataset(file, wikiApi);
         }
         }
         return null;
     }
 }
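loadDataset() resolves the dataset files through the two properties derived from DATASET_FILE_PROPERTY_NAME. For reference, a sketch of the corresponding entries in GERBIL's property file (the file paths are placeholders, and the exact location of the property file is an assumption):

    org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Train=/path/to/microposts2014-train.tsv
    org.aksw.gerbil.datasets.Microposts2014DatasetConfig.Test=/path/to/microposts2014-test.tsv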
Microposts2014Test.java
@@ -36,30 +36,30 @@
 import org.junit.Ignore;

 /**
- * ...
+ * Class for testing the microposts dataset.
  *
  * @author Giuseppe Rizzo <[email protected]>
  */
 @Ignore
 public class Microposts2014Test {

     public static void main(String[] args) {
         WikipediaApiInterface wikiAPI = SingletonWikipediaApi.getInstance();
         ExperimentTaskConfiguration taskConfigs[] =
                 new ExperimentTaskConfiguration[]
                 {
                         new ExperimentTaskConfiguration(
                                 new NERDAnnotatorConfig(wikiAPI),
                                 new Microposts2014Config(Microposts2014Chunk.TRAIN, SingletonWikipediaApi.getInstance()),
                                 ExperimentType.D2KB,
                                 Matching.STRONG_ANNOTATION_MATCH) };
         Experimenter experimenter = new Experimenter(wikiAPI,
                 new SimpleLoggingDAO4Debugging(),
                 taskConfigs,
-                "NERD_TEST");
+                "MICROPOSTS_TEST");
         experimenter.run();
     }
 }
src/test/java/org/aksw/gerbil/bat/datasets/Microposts2014DatasetMentionSearchTest.java (new file, 53 additions)
@@ -0,0 +1,53 @@
+package org.aksw.gerbil.bat.datasets;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import org.aksw.gerbil.bat.datasets.Microposts2014Dataset.Microposts2014Annotation;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.junit.runners.Parameterized.Parameters;
+
+@RunWith(Parameterized.class)
+public class Microposts2014DatasetMentionSearchTest {
+
+    @Parameters
+    public static Collection<Object[]> data() {
+        List<Object[]> testConfigs = new ArrayList<Object[]>();
+        testConfigs.add(new Object[] {
+                "NOTW phone hacking",
+                "Rupert #Murdoch, asked who was responsible for #NOTW phone #hacking? 'The people I trusted & maybe the people they trusted'",
+                "#NOTW phone #hacking" });
+        testConfigs.add(new Object[] {
+                "Amy Winehouse",
+                "#Amy #Winehouse Is #Dead After a Suspected Drug Overdose http://t.co/9KBWCeN via @YahooNews",
+                "#Amy #Winehouse" });
+        testConfigs.add(new Object[] {
+                "White Sox",
+                "#MLB Live Score Update #White #Sox (4) - #Indians (2) Final Play By Play Click link: http://rotoinfo.com/gameview?310724105",
+                "#White #Sox" });
+        return testConfigs;
+    }
+
+    private String mention;
+    private String tweet;
+    private String expectedMention;
+
+    public Microposts2014DatasetMentionSearchTest(String mention, String tweet, String expectedMention) {
+        this.mention = mention;
+        this.tweet = tweet;
+        this.expectedMention = expectedMention;
+    }
+
+    @Test
+    public void test() {
+        Microposts2014Annotation annotation = Microposts2014Dataset.findMentionInsideTweetIgnoringHashes(tweet,
+                mention, 0, null);
+        Assert.assertNotNull(annotation);
+        Assert.assertEquals(expectedMention, annotation.mention);
+    }
+}
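Because Microposts2014DatasetMentionSearchTest is a plain parameterized JUnit 4 test, it runs without any dataset files or API access; assuming the project's usual Maven Surefire setup, something like mvn test -Dtest=Microposts2014DatasetMentionSearchTest executes all three hashtag cases.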