Skip to content

Commit

Permalink
Merge pull request #303 from korpling/feature/import-conll-enhanced-d…
Browse files Browse the repository at this point in the history
…ependencies

Feature/import conll enhanced dependencies
  • Loading branch information
MartinKl authored Aug 20, 2024
2 parents 1722704 + 6950057 commit 076965e
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 100 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- `table` export has feature to customize n/a-value, which by default is the empty string
- Add `conllu` as export format
- Import of `conllu` now supports enhanced dependencies

## [0.15.0] - 2024-08-14

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
source: src/exporter/conllu.rs
expression: actual.unwrap()
---
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|2:nsubj _
1 They they PRON PRP Case=Nom|Number=Plur 2 nsubj 2:nsubj|4:nsubj _
2 buy buy VERB VBP Number=Plur|Person=3|Tense=Pres _ _ _ _
3 and and CONJ CC _ 4 cc 4:cc|4:cc _
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 2:conj|2:conj _
5 books book NOUN NNS Number=Plur 2 obj 2:obj|2:obj SpaceAfter=No
6 . . PUNCT . _ 2 punct 2:punct|2:punct _
3 and and CONJ CC _ 4 cc 4:cc _
4 sell sell VERB VBP Number=Plur|Person=3|Tense=Pres 2 conj 2:conj _
5 books book NOUN NNS Number=Plur 2 obj 2:obj|4:obj SpaceAfter=No
6 . . PUNCT . _ 2 punct 2:punct _

1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj 2:nsubj|2:nsubj _
1 I I PRON PRP Case=Nom|Number=Sing|Person=1 2 nsubj 2:nsubj _
2 have have VERB VBP Number=Sing|Person=1|Tense=Pres _ _ _ _
3 no no DET DT PronType=Neg 4 det 4:det|4:det _
4 clue clue NOUN NN Number=Sing 2 obj 2:obj|2:obj SpaceAfter=No
5 . . PUNCT . _ 2 punct 2:punct|2:punct _
3 no no DET DT PronType=Neg 4 det 4:det _
4 clue clue NOUN NN Number=Sing 2 obj 2:obj SpaceAfter=No
5 . . PUNCT . _ 2 punct 2:punct _

63 changes: 45 additions & 18 deletions src/importer/conllu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use graphannis_core::{
util::{join_qname, split_qname},
};
use itertools::Itertools;
use linked_hash_set::LinkedHashSet;
use pest::{
iterators::{Pair, Pairs},
Parser,
Expand Down Expand Up @@ -107,7 +108,7 @@ impl Display for Rule {
}
}

type DepSpec = (usize, Option<String>);
type DepSpec = LinkedHashSet<(usize, Option<String>)>;

impl ImportCoNLLU {
fn import_document(
Expand Down Expand Up @@ -187,11 +188,21 @@ impl ImportCoNLLU {
for member in sentence.into_inner() {
match member.as_rule() {
Rule::token => {
let (tok_name, tok_id, dep) =
let (tok_name, tok_id, mut deps) =
self.map_token(step_id, update, document_node_name, member, tx)?;
id_to_tok_name.insert(tok_id, tok_name.to_string());
if let Some(dependency) = dep {
dependencies.push((tok_name, dependency.0, dependency.1));
if let Some(dependency) = deps.pop_front() {
dependencies.push((
tok_name.to_string(),
dependency.0,
dependency.1.clone(),
"",
"dep",
));
}

for (h, r) in deps {
dependencies.push((tok_name.to_string(), h, r, "enh", "dep"));
}
}
Rule::multi_token | Rule::invalid_multi_token => {
Expand Down Expand Up @@ -268,23 +279,23 @@ impl ImportCoNLLU {
component_name: "".to_string(),
})?;
}
for (target_node_name, head_id, deprel) in dependencies {
for (target_node_name, head_id, deprel, clayer, cname) in dependencies {
if head_id > 0 {
if let Some(source_node_name) = id_to_tok_name.get(&head_id) {
update.add_event(UpdateEvent::AddEdge {
source_node: source_node_name.to_string(),
target_node: target_node_name.to_string(),
layer: "".to_string(),
layer: clayer.to_string(),
component_type: AnnotationComponentType::Pointing.to_string(),
component_name: "dep".to_string(),
component_name: cname.to_string(),
})?;
if let Some(deprel_value) = deprel {
update.add_event(UpdateEvent::AddEdgeLabel {
source_node: source_node_name.to_string(),
target_node: target_node_name.to_string(),
layer: "".to_string(),
layer: clayer.to_string(),
component_type: AnnotationComponentType::Pointing.to_string(),
component_name: "dep".to_string(),
component_name: cname.to_string(),
anno_ns: "".to_string(),
anno_name: "deprel".to_string(),
anno_value: deprel_value.to_string(),
Expand Down Expand Up @@ -313,7 +324,7 @@ impl ImportCoNLLU {
document_node_name: &str,
token: Pair<Rule>,
_tx: &Option<StatusSender>,
) -> anyhow::Result<(String, usize, Option<DepSpec>)> {
) -> anyhow::Result<(String, usize, DepSpec)> {
let (l, c) = token.line_col();
let line = token.as_str().to_string();
let node_name = format!("{document_node_name}#t{l}_{c}");
Expand All @@ -335,8 +346,7 @@ impl ImportCoNLLU {
anno_value: "default_layer".to_string(),
})?;
let mut token_id = None;
let mut head_id = None;
let mut deprel = None;
let mut dependencies = DepSpec::default();
for member in token.into_inner() {
let rule = member.as_rule();
match rule {
Expand Down Expand Up @@ -390,21 +400,38 @@ impl ImportCoNLLU {
Rule::head => {
for id_or_else in member.into_inner() {
if id_or_else.as_rule() == Rule::id {
head_id = Some(id_or_else.as_str().trim().parse::<usize>()?);
break;
dependencies
.insert((id_or_else.as_str().trim().parse::<usize>()?, None));
}
}
}
Rule::deprel => {
deprel = Some(member.as_str().trim().to_string());
if let Some((base_head, None)) = dependencies.pop_back() {
dependencies.insert((base_head, Some(member.as_str().trim().to_string())));
}
}
Rule::enhanced_deps => {
for enh_dep in member.into_inner() {
let mut inner = enh_dep.into_inner();
if let Some(enh_id) = inner.next() {
let head = enh_id.as_str().trim().parse::<usize>()?;
if let Some(enh_rel) = inner.next() {
let rel = enh_rel.as_str().to_string();
let value = (head, Some(rel));
// Only insert the enhanced dependency if it is not already present, so that the basic dependency
// stays in the first position; that position needs to be treated differently to avoid cycles in the graph.
if !dependencies.contains(&value) {
dependencies.insert(value);
}
}
}
}
}
Rule::enhanced_deps => {}
_ => {}
}
}
let dependency = head_id.map(|v| (v, deprel));
if let Some(id) = token_id {
Ok((node_name, id, dependency))
Ok((node_name, id, dependencies))
} else {
// by grammar spec this branch should never be possible
let reason = format!("Token `{line}` ({l}, {c}) has no id which is invalid.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,53 +194,59 @@ expression: actual.unwrap()
<edge id="e19" source="valid/website_example#t16_38" target="valid/website_example#t15_64" label="Pointing//dep">
<data key="k15">det</data>
</edge>
<edge id="e20" source="valid/website_example#t5_1" target="valid/website_example#t5_68" label="Ordering/annis/">
<edge id="e20" source="valid/website_example#t7_32" target="valid/website_example#t5_1" label="Pointing/enh/dep">
<data key="k15">nsubj</data>
</edge>
<edge id="e21" source="valid/website_example#t7_32" target="valid/website_example#t8_76" label="Pointing/enh/dep">
<data key="k15">obj</data>
</edge>
<edge id="e22" source="valid/website_example#t5_1" target="valid/website_example#t5_68" label="Ordering/annis/">
</edge>
<edge id="e21" source="valid/website_example#t5_68" target="valid/website_example#t6_67" label="Ordering/annis/">
<edge id="e23" source="valid/website_example#t5_68" target="valid/website_example#t6_67" label="Ordering/annis/">
</edge>
<edge id="e22" source="valid/website_example#t6_67" target="valid/website_example#t7_32" label="Ordering/annis/">
<edge id="e24" source="valid/website_example#t6_67" target="valid/website_example#t7_32" label="Ordering/annis/">
</edge>
<edge id="e23" source="valid/website_example#t7_32" target="valid/website_example#t8_76" label="Ordering/annis/">
<edge id="e25" source="valid/website_example#t7_32" target="valid/website_example#t8_76" label="Ordering/annis/">
</edge>
<edge id="e24" source="valid/website_example#t8_76" target="valid/website_example#t9_66" label="Ordering/annis/">
<edge id="e26" source="valid/website_example#t8_76" target="valid/website_example#t9_66" label="Ordering/annis/">
</edge>
<edge id="e25" source="valid/website_example#t9_66" target="valid/website_example#t14_1" label="Ordering/annis/">
<edge id="e27" source="valid/website_example#t9_66" target="valid/website_example#t14_1" label="Ordering/annis/">
</edge>
<edge id="e26" source="valid/website_example#t14_1" target="valid/website_example#t14_57" label="Ordering/annis/">
<edge id="e28" source="valid/website_example#t14_1" target="valid/website_example#t14_57" label="Ordering/annis/">
</edge>
<edge id="e27" source="valid/website_example#t14_57" target="valid/website_example#t15_64" label="Ordering/annis/">
<edge id="e29" source="valid/website_example#t14_57" target="valid/website_example#t15_64" label="Ordering/annis/">
</edge>
<edge id="e28" source="valid/website_example#t15_64" target="valid/website_example#t16_38" label="Ordering/annis/">
<edge id="e30" source="valid/website_example#t15_64" target="valid/website_example#t16_38" label="Ordering/annis/">
</edge>
<edge id="e29" source="valid/website_example#t16_38" target="valid/website_example#t17_54" label="Ordering/annis/">
<edge id="e31" source="valid/website_example#t16_38" target="valid/website_example#t17_54" label="Ordering/annis/">
</edge>
<edge id="e30" source="valid/website_example" target="valid" label="PartOf/annis/">
<edge id="e32" source="valid/website_example" target="valid" label="PartOf/annis/">
</edge>
<edge id="e31" source="valid/website_example#t5_1" target="valid/website_example" label="PartOf/annis/">
<edge id="e33" source="valid/website_example#t5_1" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e32" source="valid/website_example#t5_68" target="valid/website_example" label="PartOf/annis/">
<edge id="e34" source="valid/website_example#t5_68" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e33" source="valid/website_example#t6_67" target="valid/website_example" label="PartOf/annis/">
<edge id="e35" source="valid/website_example#t6_67" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e34" source="valid/website_example#t7_32" target="valid/website_example" label="PartOf/annis/">
<edge id="e36" source="valid/website_example#t7_32" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e35" source="valid/website_example#t8_76" target="valid/website_example" label="PartOf/annis/">
<edge id="e37" source="valid/website_example#t8_76" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e36" source="valid/website_example#t9_66" target="valid/website_example" label="PartOf/annis/">
<edge id="e38" source="valid/website_example#t9_66" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e37" source="valid/website_example#s1_1" target="valid/website_example" label="PartOf/annis/">
<edge id="e39" source="valid/website_example#s1_1" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e38" source="valid/website_example#t14_1" target="valid/website_example" label="PartOf/annis/">
<edge id="e40" source="valid/website_example#t14_1" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e39" source="valid/website_example#t14_57" target="valid/website_example" label="PartOf/annis/">
<edge id="e41" source="valid/website_example#t14_57" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e40" source="valid/website_example#t15_64" target="valid/website_example" label="PartOf/annis/">
<edge id="e42" source="valid/website_example#t15_64" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e41" source="valid/website_example#t16_38" target="valid/website_example" label="PartOf/annis/">
<edge id="e43" source="valid/website_example#t16_38" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e42" source="valid/website_example#t17_54" target="valid/website_example" label="PartOf/annis/">
<edge id="e44" source="valid/website_example#t17_54" target="valid/website_example" label="PartOf/annis/">
</edge>
<edge id="e43" source="valid/website_example#s10_34" target="valid/website_example" label="PartOf/annis/">
<edge id="e45" source="valid/website_example#s10_34" target="valid/website_example" label="PartOf/annis/">
</edge>
</graph>
</graphml>
Loading

0 comments on commit 076965e

Please sign in to comment.