Skip to content

Commit

Permalink
Merge branch 'mwestats' into anaregres
Browse files Browse the repository at this point in the history
  • Loading branch information
oktaal committed Mar 14, 2024
2 parents 4030644 + f3c637d commit 3333b35
Show file tree
Hide file tree
Showing 8 changed files with 121 additions and 25 deletions.
29 changes: 29 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# This CITATION.cff file was generated with cffinit.
# Visit https://bit.ly/cffinit to generate yours today!

cff-version: 1.2.0
title: MWE Query
message: >-
If you use this software, please cite it using the
metadata from this file.
type: software
authors:
- given-names: Jan
family-names: Odijk
affiliation: Utrecht University
- given-names: Martin
family-names: Kroon
affiliation: Utrecht University
orcid: 'https://orcid.org/0000-0003-3059-6872'
- name: >-
Research Software Lab, Centre for Digital Humanities,
Utrecht University
website: >-
https://cdh.uu.nl/centre-for-digital-humanities/research-software-lab/
city: Utrecht
country: NL
identifiers:
- type: doi
value: 10.5281/zenodo.10410636
repository-code: 'https://github.com/UUDigitalHumanitieslab/mwe-query'
license: BSD-3-Clause
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# MWE Query
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10410636.svg)](https://doi.org/10.5281/zenodo.10410636)
[![PyPI version](https://badge.fury.io/py/mwe-query.svg)](https://badge.fury.io/py/mwe-query)
[![Actions Status](https://github.com/UUDigitalHumanitiesLab/mwe-query/workflows/Tests/badge.svg)](https://github.com/UUDigitalHumanitiesLab/mwe-query/actions)

# MWE Query

## Run Locally

Expand Down
36 changes: 24 additions & 12 deletions mwe_query/canonicalform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
to generate queries from them and to search using these queries.
"""

from typing import Dict, Iterable, List, Sequence, Optional, Set, Tuple, TypeVar
from typing import cast, Dict, Iterable, List, Sequence, Optional, Set, Tuple, TypeVar
from sastadev.sastatypes import SynTree
import re
import sys
Expand Down Expand Up @@ -403,7 +403,9 @@ def all_leaves(stree: SynTree, annotations: List[Annotation], allowedannotations

def headmodifiable(stree: SynTree, mwetop: int, annotations: List[int]):
head = getchild(stree, 'hd')
if terminal(head):
if head is None:
return False
elif terminal(head):
beginint = int(gav(head, 'begin'))
if 0 <= beginint < len(annotations):
if mwetop == notop:
Expand Down Expand Up @@ -467,6 +469,7 @@ def mknewnode(stree: SynTree, mwetop: int, atts: List[str], annotations: List[in
newnode.attrib['maxnodecount'] = f'{len(stree)}'
return newnode


def expandnonheadwordnode(nonheadwordnode, phrasenodeproperties):
phraserel = gav(nonheadwordnode, 'rel')
newnonheadwordnode = copy.copy(nonheadwordnode)
Expand All @@ -475,10 +478,14 @@ def expandnonheadwordnode(nonheadwordnode, phrasenodeproperties):
phrasenode.attrib['rel'] = phraserel
phrasenode.append(newnonheadwordnode)
return phrasenode


def zullenheadclause(stree: SynTree) -> bool:
if stree.tag == 'node':
cat = gav(stree, 'cat')
head = getchild(stree, 'hd')
if head is None:
return False
headlemma = gav(head, 'lemma')
headpt = gav(head, 'pt')
result = cat in {
Expand Down Expand Up @@ -552,6 +559,8 @@ def transformtree(stree: SynTree, annotations: List[Annotation], mwetop=notop, a
return results
elif cat in {'smain', 'sv1'}:
head = getchild(stree, 'hd')
if head is None:
return []
lemma = gav(head, 'lemma')
vc = getchild(stree, 'vc')
# predm, if present, must be moved downwards here
Expand Down Expand Up @@ -1070,8 +1079,9 @@ def lowerpredm(stree: SynTree) -> SynTree:
predmparent = predmnode.getparent()
lowestvcnode = finddeepestvc(predmparent) # this xpath does not yield the right results './/node[@rel="vc" and not(node[@rel="vc"])]')
if lowestvcnode is not None:
predmparent.remove(predmnode)
lowestvcnode.append(predmnode)
if predmparent is not None:
predmparent.remove(predmnode)
lowestvcnode.append(predmnode)
# print('lowerpredm: newstree')
# ET.dump(newstree)
return newstree
Expand All @@ -1096,7 +1106,8 @@ def newgenvariants(stree: SynTree, nodeidwordmap: Dict[int, str]) -> List[SynTre
vblsu = find1(newstree, f'.//node[@rel="su" and {vblnode}]')
if vblsu is not None:
parent = vblsu.getparent()
parent.remove(vblsu)
if parent is not None:
parent.remove(vblsu)

# moving predm down is not needed here: it is already done in transformtree
# newstree = lowerpredm(newstree)
Expand Down Expand Up @@ -1128,12 +1139,13 @@ def newgenvariants(stree: SynTree, nodeidwordmap: Dict[int, str]) -> List[SynTre
newvcnode1 = nodecopy(vcnode)
newvcnode2 = nodecopy(vcnode)
parent = obj1node.getparent()
parent.remove(obj1node)
alternativesnode = mkalternativesnode(
[[obj1node], [newvcnode1], [newpobj1node, newvcnode2]])
if ppshow:
showtree(alternativesnode, 'alternativesnode')
parent.append(alternativesnode)
if parent is not None:
parent.remove(obj1node)
alternativesnode = mkalternativesnode(
[[obj1node], [newvcnode1], [newpobj1node, newvcnode2]])
if ppshow:
showtree(alternativesnode, 'alternativesnode')
parent.append(alternativesnode)

vblppnodeids = globalresult.xpath(vblppnodeidxpath)
for vblppnodeid in vblppnodeids:
Expand Down Expand Up @@ -1660,7 +1672,7 @@ def relpronsubst(stree: SynTree) -> SynTree:
if govprep is not None:
govprep.attrib['vztype'] = 'init'
govprep.attrib['lemma'] = adaptvzlemma_inv(
govprep.attrib['lemma'])
cast(str, govprep.attrib['lemma']))
# ET.dump(newstree)

elif rhdframe.startswith('waar_adverb'):
Expand Down
3 changes: 1 addition & 2 deletions mwe_query/lcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
phrasal node is generated for each (relevant) non-head single word.
"""


from typing import Optional
from sastadev.sastatypes import SynTree
from sastadev.treebankfunctions import getattval as gav, terminal, allcats as validcats, find1
Expand Down Expand Up @@ -52,7 +51,7 @@ def getlcatatt(node: SynTree) -> str:
def mkphrase(child: SynTree) -> SynTree:
newnode = ET.Element('node')
if 'íd' in child.attrib:
newnode.attrib['id'] = child.attrib['id'] + 'a'
newnode.attrib['id'] = str(child.attrib['id']) + 'a'
lcat = getlcatatt(child)
if lcat in validcats:
newnode.attrib['cat'] = lcat
Expand Down
3 changes: 1 addition & 2 deletions mwe_query/mwestats.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,7 +646,7 @@ def displayfullstats(stats: MWEstats, outfile, header=''):
rows: List[str] = []
for clemmas, cwords, utt in compliststats.data:
rows.append(f'{clemmas}: {cwords}: {utt}'.strip())

rows.sort()

for row in rows:
Expand Down Expand Up @@ -679,7 +679,6 @@ def displayfullstats(stats: MWEstats, outfile, header=''):
for row in rows:
print(row, file=outfile)

allcompnodes = stats.compnodes
modstats = stats.modstats
displaystats('Modification', modstats, outfile)

Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
# This file is autogenerated by pip-compile with python 3.10
# To update, run:
#
# pip-compile
#
Expand Down Expand Up @@ -36,7 +36,7 @@ requests==2.31.0
# via
# alpino-query
# mwe-query (setup.py)
sastadev==0.1.1
sastadev==0.1.4
# via
# auchann
# mwe-query (setup.py)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
package_data={"mwe_query": ["py.typed"]},
zip_safe=True,
install_requires=[
'alpino-query>=2.1.8', 'requests', 'BaseXClient', 'sastadev>=0.1.1'
'alpino-query>=2.1.8', 'requests', 'BaseXClient', 'sastadev>=0.1.4'
],
entry_points={
'console_scripts': [
Expand Down
63 changes: 59 additions & 4 deletions tests/update_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from alpino_query import parse_sentence # type: ignore
import sys
from os import path
from os import listdir, path
import glob
import lxml.etree as ET

Expand All @@ -15,7 +15,19 @@
# import this implementation
sys.path.insert(0, path.join(testdir, ".."))
from mwe_query import Mwe
from mwe_query.canonicalform import preprocess_MWE, transformtree
from mwe_query.canonicalform import (
preprocess_MWE,
transformtree,
generatemwestructures,
generatequeries,
applyqueries,
)

from mwe_query.mwestats import (
displayfullstats,
getstats,
gettreebank,
)


def datapath(dirname, filename):
Expand Down Expand Up @@ -57,11 +69,51 @@ def update_generate(basename):

def gettopnode(stree):
for child in stree:
if child.tag == 'node':
if child.tag == "node":
return child
return None


def update_full_mwe_stats(treebank_name: str, mwe: str):
    """Regenerate the expected full-statistics files for one MWE and treebank.

    Every ``.xml`` file in the folder ``datapath("mwetreebanks", treebank_name)``
    is loaded into a treebank via ``gettreebank``. For each structure returned
    by ``generatemwestructures(mwe)`` the MWE, near-miss and superset queries
    are generated and applied to that treebank, and the resulting statistics
    are written to ``full_mwe_stats_<treebank_name>_<i>.txt`` under
    ``mwetreebanks/expected``.

    Args:
        treebank_name: name of the treebank folder below ``mwetreebanks``;
            it is also used in the output file names.
        mwe: the multiword expression text handed to the structure and query
            generators.
    """
    dotbfolder = datapath("mwetreebanks", treebank_name)
    # NOTE(review): os.listdir gives no ordering guarantee; confirm that
    # gettreebank / the statistics do not depend on file order.
    treebankfilenames = [
        path.join(dotbfolder, fn)
        for fn in listdir(dotbfolder)
        if fn.endswith(".xml")
    ]
    treebank = gettreebank(treebankfilenames)

    expecteddir = path.join("mwetreebanks", "expected")
    mwestructures = generatemwestructures(mwe)
    # NOTE(review): the individual structures are not used; each iteration
    # derives its queries from `mwe` alone, so only the output file index
    # differs between iterations - confirm this is intended.
    for i, _ in enumerate(mwestructures):
        mwequery, nearmissquery, supersetquery = generatequeries(mwe)
        queryresults = applyqueries(
            treebank, mwe, mwequery, nearmissquery, supersetquery, verbose=False
        )

        fullmwestats = getstats(mwe, queryresults, treebank)

        filename = f"full_mwe_stats_{treebank_name}_{i}.txt"
        outputfilename = datapath(expecteddir, filename)

        # One file holds three consecutive sections, each with its own header.
        with open(outputfilename, "w", encoding="utf8") as outfile:
            displayfullstats(
                fullmwestats.mwestats, outfile, header="*****MWE statistics*****"
            )
            displayfullstats(
                fullmwestats.nearmissstats,
                outfile,
                header="*****Near-miss statistics*****",
            )
            displayfullstats(
                fullmwestats.diffstats,
                outfile,
                header="*****Near-miss - MWE statistics*****",
            )


def update_transform():
mwes = read("transform", "mwes.txt").splitlines()

Expand All @@ -82,8 +134,11 @@ def update_transform():
i += 1


input_files = glob.glob(path.join(datadir, "generate", '*.txt'))
input_files = glob.glob(path.join(datadir, "generate", "*.txt"))
for input in input_files:
head, ext = path.splitext(path.basename(input))
update_generate(head)

update_full_mwe_stats("dansontspringena", "iemand zal de dans ontspringen")
update_full_mwe_stats("hartbreken", "iemand zal iemands hart breken")
update_transform()

0 comments on commit 3333b35

Please sign in to comment.