Skip to content

Commit

Permalink
clausal analysis preference
Browse files Browse the repository at this point in the history
  • Loading branch information
JanOdijk committed Sep 10, 2024
1 parent 15bd8f9 commit e1e98b8
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 11 deletions.
6 changes: 5 additions & 1 deletion src/sastadev/alpinoparsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,11 @@ def parse(origsent: str, escape: bool = True):
if 300 > r1.status >= 200:
streebytes = r1.read()
# print(streebytes.decode('utf8'))
stree = etree.fromstring(streebytes)
try:
stree = etree.fromstring(streebytes)
except etree.XMLSyntaxError as e:
sastadev.conf.settings.LOGGER.error(f'Error: {e} for {sent}')
stree = None
return stree
else:
sastadev.conf.settings.LOGGER.error('parsing failed:', r1.status, r1.reason, sent)
Expand Down
29 changes: 23 additions & 6 deletions src/sastadev/correcttreebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from sastadev.sva import phicompatible
from sastadev.syllablecount import countsyllables
from sastadev.targets import get_mustbedone
from sastadev.treebankfunctions import (adaptsentence, add_metadata, countav, deflate,
from sastadev.treebankfunctions import (adaptsentence, add_metadata, clausecats, countav, deflate,
deletewordnodes, fatparse, find1,
getattval, getbeginend,
getcompoundcount, getneighbourwordnode, getnodeyield, getorigutt,
Expand Down Expand Up @@ -55,7 +55,7 @@
ParsedCorrection = Tuple[List[str], SynTree, List[Meta]]
TupleNint = Tuple[19 * (int,)]

altpropertiesheader = ['penalty', 'dpcount', 'dhyphencount', 'mainclausecount', 'complsucount', 'dimcount', 'compcount', 'supcount',
altpropertiesheader = ['penalty', 'dpcount', 'dhyphencount', 'mainclausecount', 'topclause', 'complsucount', 'dimcount', 'compcount', 'supcount',
'compoundcount', 'unknownwordcount', 'wrongposwordcount', 'smainsucount', 'sucount', 'svaokcount', 'deplusneutcount', 'badcatcount',
'hyphencount', 'lonelytoecount', 'basicreplaceecount', 'ambigcount', 'subjunctivecount', 'unknownnouncount',
'unknownnamecount', 'dezebwcount', 'noun1c_count']
Expand All @@ -70,7 +70,7 @@


class Alternative():
def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, complsucount, dimcount,
def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, topclause, complsucount, dimcount,
compcount, supcount, compoundcount, unknownwordcount, wrongposwordcount, smainsucount, sucount, svaok, deplusneutcount, badcatcount,
hyphencount, lonelytoecount,
basicreplaceecount, ambigcount, subjunctivecount, unknownnouncount, unknownnamecount,
Expand All @@ -82,6 +82,7 @@ def __init__(self, stree, altid, altsent, penalty, dpcount, dhyphencount, maincl
self.dpcount: int = int(dpcount)
self.dhyphencount: int = int(dhyphencount)
self.mainclausecount: int = int(mainclausecount)
self.topclause: int = int(topclause)
self.complsucount: int = int(complsucount)
self.dimcount: int = int(dimcount)
self.compcount: int = int(compcount)
Expand Down Expand Up @@ -117,7 +118,7 @@ def alt2row(self, uttid: UttId, base: str, user1: str = '', user2: str = '', use
score = ampersand.join(scores)
part4 = list(
map(str, [self.altid, self.altsent, score, self.penalty, self.dpcount, self.dhyphencount,
self.mainclausecount, self.complsucount,
self.mainclausecount, self.topclause, self.complsucount,
self.dimcount, self.compcount, self.supcount, self.compoundcount, self.unknownwordcount,
self.wrongposwordcount, self.smainsucount, self.sucount,
self.svaok, self.deplusneutcount, self.badcatcount, self.hyphencount, self.lonelytoecount,
Expand Down Expand Up @@ -975,7 +976,7 @@ def oldgetuttid(stree: SynTree) -> UttId:

def scorefunction(obj: Alternative) -> TupleNint:
return (-obj.unknownwordcount, -obj.wrongposwordcount,-obj.unknownnouncount, -obj.unknownnamecount, -obj.ambigcount, -obj.dpcount,
-obj.dhyphencount, -obj.mainclausecount,
-obj.dhyphencount, -obj.mainclausecount, obj.topclause,
-obj.complsucount, -obj.badcatcount,
-obj.basicreplaceecount, -obj.ambigcount, -obj.hyphencount, -obj.lonelytoecount,
-obj.subjunctivecount, obj.smainsucount, obj.dimcount,
Expand Down Expand Up @@ -1087,6 +1088,7 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc
sucount = countav(nt, 'rel', 'su')
lonelytoecount = getlonelytoecount(nt)
mainclausecount = getmainclausecount(nt)
topclause = gettopclause(nt)
smainsucount = countsmainsu(nt)
svaokcount = getsvaokcount(nt)
deplusneutcount = getdeplusneutcount(nt)
Expand All @@ -1112,7 +1114,7 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc
# overregcount but these will mostly be unknown words
# mwunamecount well maybe unknownpropernoun first

alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, complsucount, dimcount, compcount,
alt = Alternative(stree, altid, altsent, penalty, dpcount, dhyphencount, mainclausecount, topclause, complsucount, dimcount, compcount,
supcount,
compoundcount, unknownwordcount, wrongposwordcount, smainsucount, sucount, svaokcount, deplusneutcount, badcatcount,
hyphencount, lonelytoecount,
Expand Down Expand Up @@ -1167,6 +1169,21 @@ def getmainclausecount(nt: SynTree) -> int:
result = lmatches
return result

topxpath = './/node[@cat="top"]'
def gettopclause(nt: SynTree) -> int:
    """Tell whether the first ``top`` node of *nt* dominates exactly one clause.

    The first node matched by ``topxpath`` is inspected. Its children whose
    ``pt`` attribute is ``'let'`` or ``'tsw'`` are disregarded (presumably
    punctuation and interjections -- confirm against the tag set in use).

    :param nt: the syntactic structure to inspect
    :return: 1 if exactly one child remains after the filtering and its
        ``cat`` attribute is a member of ``clausecats``; 0 in every other
        case, including when no ``top`` node is found
    """
    topnodes = nt.xpath(topxpath)
    if not topnodes:
        return 0

    ignoredpts = ['let', 'tsw']
    contentchildren = []
    for daughter in topnodes[0]:
        if getattval(daughter, 'pt') in ignoredpts:
            continue
        contentchildren.append(daughter)

    # Only a top node with a single remaining child can count as a top clause.
    if len(contentchildren) == 1:
        onlychildcat = getattval(contentchildren[0], 'cat')
        return 1 if onlychildcat in clausecats else 0
    return 0

toexpath = './/node[@lemma="toe" or (@lemma="tot" and @vztype="fin")]'
naarxpath = './/node[@lemma="naar"]'
def getlonelytoecount(nt: SynTree) -> int:
Expand Down
2 changes: 1 addition & 1 deletion src/sastadev/smallclauses.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def getauxform(aux: str, node:SynTree) -> str:
result = 'heeft' if aux == 'hebben' else 'is'
return result

def mkinsertmeta(inserttokens, resultlist, penalty=defaultpenalty):
def mkinsertmeta(inserttokens, resultlist, penalty=defaultpenalty, cat=smallclause):
insertposs = [token.pos + token.subpos for token in inserttokens]
insertwordlist = [token.word for token in inserttokens]
tokenmappinglist = [token.pos if token.subpos == 0 else None for token in resultlist]
Expand Down
6 changes: 4 additions & 2 deletions src/sastadev/toe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from sastadev.smallclauses import bg, mkinsertmeta, realword, word
from sastadev.tokenmd import TokenListMD

# Category label handed to mkinsertmeta through its ``cat`` keyword argument.
# NOTE(review): this module also defines a function named ``lonelytoe``. If that
# definition follows this assignment it rebinds the name, so ``cat=lonelytoe``
# inside the function would pass the function object rather than this string --
# confirm, and consider giving the constant a distinct name.
lonelytoe = 'Lonely toe'

def isdet(node) -> bool:
nodept = getattval(node, 'pt')
nodepdtype = getattval(node, 'pdtype' )
Expand Down Expand Up @@ -50,7 +52,7 @@ def lonelytoe(tokensmd: TokenListMD, tree: SynTree) -> List[TokenListMD]:
if isdet(thisnode) and getattval(nextnode, 'pt') == 'n':
naartoken = Token('naar', token.pos, subpos=5)
inserttokens = [naartoken]
metadata += mkinsertmeta(inserttokens, newtokens)
metadata += mkinsertmeta(inserttokens, newtokens, cat=lonelytoe)
naarfound = True
newtokens.append(naartoken)
insertiondone = True
Expand All @@ -62,7 +64,7 @@ def lonelytoe(tokensmd: TokenListMD, tree: SynTree) -> List[TokenListMD]:
naarfound = True
newtokens.append(naartoken)
inserttokens = [naartoken]
metadata += mkinsertmeta(inserttokens, newtokens)
metadata += mkinsertmeta(inserttokens, newtokens, cat=lonelytoe)
insertiondone = True
newtokens.append(token)
if insertiondone:
Expand Down
2 changes: 1 addition & 1 deletion src/sastadev/treebankfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ def mktoken2nodemap(tokens: List[Token], tree: SynTree) -> Dict[int, SynTree]:
tokennodes = tree.xpath('.//node[@pt or @pos or @word]')
tokennodesdict = {int(getattval(n, 'begin')): n for n in tokennodes}
token2nodemap = {token.pos: tokennodesdict[token.pos]
for token in tokens if keycheck(token.pos, tokennodesdict)}
for token in tokens if not token.skip and keycheck(token.pos, tokennodesdict)}
return token2nodemap


Expand Down

0 comments on commit e1e98b8

Please sign in to comment.