clausal analysis preference

UUDigitalHumanitieslab · Sep 10, 2024 · e1e98b8 · e1e98b8
1 parent 15bd8f9
commit e1e98b8
Show file tree

Hide file tree

Showing 5 changed files with 34 additions and 11 deletions.
diff --git a/src/sastadev/alpinoparsing.py b/src/sastadev/alpinoparsing.py
@@ -83,7 +83,11 @@ def parse(origsent: str, escape: bool = True):
         if 300 > r1.status >= 200:
             streebytes = r1.read()
             # print(streebytes.decode('utf8'))
-            stree = etree.fromstring(streebytes)
+            try:
+                stree = etree.fromstring(streebytes)
+            except etree.XMLSyntaxError as e:
+                sastadev.conf.settings.LOGGER.error(f'Error: {e} for {sent}')
+                stree = None
             return stree
         else:
             sastadev.conf.settings.LOGGER.error('parsing failed:', r1.status, r1.reason, sent)

diff --git a/src/sastadev/correcttreebank.py b/src/sastadev/correcttreebank.py
@@ -22,7 +22,7 @@
 from sastadev.sva import phicompatible
 from sastadev.syllablecount import countsyllables
 from sastadev.targets import get_mustbedone
-from sastadev.treebankfunctions import (adaptsentence, add_metadata, countav, deflate,
+from sastadev.treebankfunctions import (adaptsentence, add_metadata, clausecats, countav, deflate,
                                         deletewordnodes, fatparse, find1,
                                         getattval, getbeginend,
                                         getcompoundcount, getneighbourwordnode, getnodeyield, getorigutt,
@@ -55,7 +55,7 @@
 ParsedCorrection = Tuple[List[str], SynTree, List[Meta]]
 TupleNint = Tuple[19 * (int,)]
 
-altpropertiesheader = ['penalty', 'dpcount', 'dhyphencount', 'mainclausecount', 'complsucount', 'dimcount', 'compcount', 'supcount',
+altpropertiesheader = ['penalty', 'dpcount', 'dhyphencount', 'mainclausecount', 'topclause', 'complsucount', 'dimcount', 'compcount', 'supcount',
                        'compoundcount', 'unknownwordcount', 'wrongposwordcount', 'smainsucount', 'sucount', 'svaokcount', 'deplusneutcount', 'badcatcount',
                        'hyphencount', 'lonelytoecount', 'basicreplaceecount', 'ambigcount', 'subjunctivecount', 'unknownnouncount',
                        'unknownnamecount', 'dezebwcount', 'noun1c_count']
@@ -70,7 +70,7 @@
 
 
 class Alternative():
-    def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, complsucount, dimcount,
+    def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, topclause, complsucount, dimcount,
                  compcount, supcount, compoundcount, unknownwordcount, wrongposwordcount, smainsucount, sucount, svaok, deplusneutcount, badcatcount,
                  hyphencount, lonelytoecount,
                  basicreplaceecount, ambigcount, subjunctivecount, unknownnouncount, unknownnamecount,
@@ -82,6 +82,7 @@ def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, maincl
         self.dpcount: int = int(dpcount)
         self.dhyphencount: int = int(dhyphencount)
         self.mainclausecount: int = int(mainclausecount)
+        self.topclause: int = int(topclause)
         self.complsucount: int = int(complsucount)
         self.dimcount: int = int(dimcount)
         self.compcount: int = int(compcount)
@@ -117,7 +118,7 @@ def alt2row(self, uttid: UttId, base: str, user1: str = '', user2: str = '', use
         score = ampersand.join(scores)
         part4 = list(
             map(str, [self.altid, self.altsent, score, self.penalty, self.dpcount, self.dhyphencount,
-                      self.mainclausecount, self.complsucount,
+                      self.mainclausecount, self.topclause, self.complsucount,
                       self.dimcount, self.compcount, self.supcount, self.compoundcount, self.unknownwordcount,
                       self.wrongposwordcount, self.smainsucount, self.sucount,
                       self.svaok, self.deplusneutcount, self.badcatcount, self.hyphencount, self.lonelytoecount,
@@ -975,7 +976,7 @@ def oldgetuttid(stree: SynTree) -> UttId:
 
 def scorefunction(obj: Alternative) -> TupleNint:
     return (-obj.unknownwordcount, -obj.wrongposwordcount,-obj.unknownnouncount, -obj.unknownnamecount, -obj.ambigcount, -obj.dpcount,
-            -obj.dhyphencount, -obj.mainclausecount,
+            -obj.dhyphencount, -obj.mainclausecount, obj.topclause,
             -obj.complsucount, -obj.badcatcount,
             -obj.basicreplaceecount, -obj.ambigcount, -obj.hyphencount, -obj.lonelytoecount,
             -obj.subjunctivecount, obj.smainsucount, obj.dimcount,
@@ -1087,6 +1088,7 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc
         sucount = countav(nt, 'rel', 'su')
         lonelytoecount = getlonelytoecount(nt)
         mainclausecount = getmainclausecount(nt)
+        topclause = gettopclause(nt)
         smainsucount = countsmainsu(nt)
         svaokcount = getsvaokcount(nt)
         deplusneutcount = getdeplusneutcount(nt)
@@ -1112,7 +1114,7 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc
         # overregcount but these will mostly be unknown words
         # mwunamecount well maybe unknownpropernoun first
 
-        alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, complsucount, dimcount, compcount,
+        alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, topclause, complsucount, dimcount, compcount,
                           supcount,
                           compoundcount, unknownwordcount,  wrongposwordcount, smainsucount, sucount, svaokcount, deplusneutcount, badcatcount,
                           hyphencount, lonelytoecount,
@@ -1167,6 +1169,21 @@ def getmainclausecount(nt: SynTree) -> int:
         result = lmatches
     return result
 
+topxpath = './/node[@cat="top"]'
+def gettopclause(nt: SynTree) -> int:
+    """Return 1 if the first cat="top" node under nt has exactly one real child whose cat is in clausecats, else 0.
+
+    Children whose 'pt' (read via getattval) is 'let' or 'tsw' are not counted as real children.
+    """
+    tops = nt.xpath(topxpath)
+    if not tops:
+        return 0
+    realchildren = [child for child in tops[0]
+                    if getattval(child, 'pt') not in ('let', 'tsw')]
+    if len(realchildren) != 1:
+        return 0
+    return 1 if getattval(realchildren[0], 'cat') in clausecats else 0
+
 toexpath = './/node[@lemma="toe" or (@lemma="tot" and @vztype="fin")]'
 naarxpath = './/node[@lemma="naar"]'
 def getlonelytoecount(nt: SynTree) -> int:

diff --git a/src/sastadev/smallclauses.py b/src/sastadev/smallclauses.py
@@ -293,7 +293,7 @@ def getauxform(aux: str, node:SynTree) -> str:
             result = 'heeft' if aux == 'hebben' else 'is'
     return result
 
-def mkinsertmeta(inserttokens, resultlist, penalty=defaultpenalty):
+def mkinsertmeta(inserttokens, resultlist, penalty=defaultpenalty, cat=smallclause):
     insertposs = [token.pos + token.subpos for token in inserttokens]
     insertwordlist = [token.word for token in inserttokens]
     tokenmappinglist = [token.pos if token.subpos == 0 else None for token in resultlist]

diff --git a/src/sastadev/toe.py b/src/sastadev/toe.py
@@ -7,6 +7,8 @@
 from sastadev.smallclauses import bg, mkinsertmeta, realword, word
 from sastadev.tokenmd import TokenListMD
 
+# NOTE(review): this module also defines `def lonelytoe(tokensmd, tree)` further down,
+# which rebinds this module-level name; `cat=lonelytoe` inside that function then
+# passes the function object instead of this string. Presumably a distinct constant
+# name was intended -- TODO confirm and rename.
+lonelytoe = 'Lonely toe'
+
 def isdet(node) -> bool:
     nodept = getattval(node, 'pt')
     nodepdtype = getattval(node, 'pdtype' )
@@ -50,7 +52,7 @@ def lonelytoe(tokensmd: TokenListMD, tree: SynTree) -> List[TokenListMD]:
                 if isdet(thisnode) and getattval(nextnode, 'pt') == 'n':
                     naartoken = Token('naar', token.pos, subpos=5)
                     inserttokens = [naartoken]
-                    metadata += mkinsertmeta(inserttokens, newtokens)
+                    metadata += mkinsertmeta(inserttokens, newtokens, cat=lonelytoe)
                     naarfound = True
                     newtokens.append(naartoken)
                     insertiondone = True
@@ -62,7 +64,7 @@ def lonelytoe(tokensmd: TokenListMD, tree: SynTree) -> List[TokenListMD]:
                     naarfound = True
                     newtokens.append(naartoken)
                     inserttokens = [naartoken]
-                    metadata += mkinsertmeta(inserttokens, newtokens)
+                    metadata += mkinsertmeta(inserttokens, newtokens, cat=lonelytoe)
                     insertiondone = True
         newtokens.append(token)
     if insertiondone:

diff --git a/src/sastadev/treebankfunctions.py b/src/sastadev/treebankfunctions.py
@@ -610,7 +610,7 @@ def mktoken2nodemap(tokens: List[Token], tree: SynTree) -> Dict[int, SynTree]:
     tokennodes = tree.xpath('.//node[@pt or @pos or @word]')
     tokennodesdict = {int(getattval(n, 'begin')): n for n in tokennodes}
     token2nodemap = {token.pos: tokennodesdict[token.pos]
-                     for token in tokens if keycheck(token.pos, tokennodesdict)}
+                     for token in tokens if not token.skip and keycheck(token.pos, tokennodesdict)}
     return token2nodemap