Skip to content

Commit

Permalink
childescorrections history update, correcttreebank: reducing alternat…
Browse files Browse the repository at this point in the history
…ives, add aha/oho etc
  • Loading branch information
JanOdijk committed Sep 22, 2024
1 parent 50a615a commit 4fc7174
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 12 deletions.
17 changes: 17 additions & 0 deletions src/sastadev/corrector.py
Original file line number Diff line number Diff line change
Expand Up @@ -1098,6 +1098,23 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
name='Emphasis', value='Phoneme lengthening', cat='Pronunciation',
backplacement=bpl_word)

# aha oho uhu ehe
ahapattern = r'([aeouy])h\1'
ahare = re.compile(ahapattern)
if not known_word(token.word) and ahare.search(token.word):
newwords = [ahare.sub(r'\1', token.word)]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
backplacement=bpl_word)
# iehie ijhij
iehiepattern = r'(ie|ij)h\1'
iehiere = re.compile(iehiepattern)
if not known_word(token.word) and iehiere.search(token.word):
newwords = [iehiere.sub(r'\1', token.word)]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
backplacement=bpl_word)

# basic replacements replace as by als, isse by is
# here come the replacements
if token.word in basicreplacements:
Expand Down
36 changes: 28 additions & 8 deletions src/sastadev/correcttreebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from sastadev.metadata import (Meta, bpl_delete, bpl_indeze, bpl_node,
bpl_none, bpl_replacement, bpl_word, bpl_wordlemma, bpl_word_delprec, insertion)
from sastadev.sastatok import sasta_tokenize
from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist
from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist, tokenlist2string
from sastadev.sastatypes import (AltId, CorrectionMode, ErrorDict, MetaElement,
MethodName, Penalty, Position, PositionStr,
SynTree, Targets, Treebank, UttId)
Expand Down Expand Up @@ -613,9 +613,10 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
debug = False
# (fatstree, text='fattened tree:')

ctmds: List[Correction] = getcorrections(cleanutttokens, correctionparameters, fatstree)

rawctmds: List[Correction] = getcorrections(cleanutttokens, correctionparameters, fatstree)

ctmds = reducecorrections(rawctmds)
# ctmds = rawctmds

debug = False
if debug:
Expand Down Expand Up @@ -665,7 +666,7 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
# make sure to include the xmeta from CHAT cleaning!! variable allmetadata, or better metadata but perhaps rename to chatmetadata
fatnewstree = add_metadata(fatstree, chatmetadata)

ptmds.append((correctionwordlist, fatnewstree, cwmdmetadata))
ptmds.append((correctiontokenlist, fatnewstree, cwmdmetadata))

# select the stree for the most promising correction
debug = False
Expand All @@ -675,16 +676,16 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
debug = False

if ptmds == []:
thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
thecorrection, orandalts = (cleanutttokens, fatstree, origmetadata), None
elif corr in [corr1, corrn]:
thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr)
else:
settings.LOGGER.error(
'Illegal correction value: {}. No corrections applied'.format(corr))
thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
thecorrection, orandalts = (cleanutttokens, fatstree, origmetadata), None

thetree = deepcopy(thecorrection[1])

correctiontokenlist = thecorrection[0]
debuga = False
# debuga = False
if debuga:
Expand Down Expand Up @@ -943,7 +944,25 @@ def oldgetuttid(stree: SynTree) -> UttId:
uttid = uttidlist[0]
return uttid

def reducecorrections(ctmds: List[Correction]) -> List[Correction]:
    """Deduplicate corrections that render to the same utterance string.

    Corrections whose token lists stringify identically (via
    tokenlist2string) are considered duplicates; for each such string only
    the correction with the lowest penalty (per compute_penalty) is kept.
    On a penalty tie the earliest correction wins. Result order follows
    first occurrence of each distinct utterance string.
    """
    # utterance string -> (penalty, tokenlist, metadata); dict insertion
    # order preserves first-seen ordering of the distinct strings.
    best = {}
    for toklist, md in ctmds:
        key = tokenlist2string(toklist)
        penalty = compute_penalty(md)
        # strict '<' keeps the earlier candidate on equal penalties,
        # matching the original behavior
        if key not in best or penalty < best[key][0]:
            best[key] = (penalty, toklist, md)

    return [(toklist, md) for _, toklist, md in best.values()]


def scorefunction(obj: Alternative) -> TupleNint:
Expand Down Expand Up @@ -1071,7 +1090,8 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc

altid: AltId = 0
alts: Dict[AltId, Alternative] = {}
for cw, nt, md in ptmds:
for ct, nt, md in ptmds:
cw = tokenlist2stringlist(ct)
altsent = space.join(cw)
penalty = compute_penalty(md)

Expand Down
36 changes: 32 additions & 4 deletions src/sastadev/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,16 @@ class HistoryCorrection:
HistoryCorrectionDict = Dict[str, List[HistoryCorrection]]
space = ' '
eps = ''
nocorrectiontype = 'nocorrectiontype'

correctionset = [CHAT_explanation, CHAT_replacement, CHAT_wordnoncompletion]
correctionset = [nocorrectiontype, CHAT_explanation, CHAT_replacement, CHAT_wordnoncompletion]

chatshorttypedict = {CHAT_explanation: 'explanation',
CHAT_wordnoncompletion: 'noncompletion',
CHAT_replacement: 'replacement'}
CHAT_replacement: 'replacement',
nocorrectiontype: nocorrectiontype}

shortcorrectionset = [chatshorttypedict[v] for v in correctionset]

def getshortchattype(metaname: str) -> str:
if metaname in chatshorttypedict:
Expand Down Expand Up @@ -73,15 +76,40 @@ def gathercorrections(treebank: TreeBank) -> defaultdict:

def getcorrections(filename) -> defaultdict:
    """Read history corrections from *filename* into a per-word mapping.

    Each CSV row is (wrong, correction, correctiontype, frequency). Rows
    sharing the same (wrong, correction) pair are merged into a single
    HistoryCorrection by unifycorrections; the result maps each wrong
    word to the list of its unified corrections.
    """
    # group raw rows by (wrong, correction) pair
    grouped = defaultdict(list)
    for _, row in readcsv(filename, header=False):
        hc = HistoryCorrection(wrong=row[0], correction=row[1],
                               correctiontype=row[2], frequency=int(row[3]))
        grouped[(row[0], row[1])].append(hc)

    # collapse each group into one unified correction per wrong word
    resultdict = defaultdict(list)
    for (wrong, _), hclist in grouped.items():
        resultdict[wrong].append(unifycorrections(hclist))
    return resultdict


def unifycorrections(hcs: List[HistoryCorrection]) -> HistoryCorrection:
    """Merge a list of HistoryCorrections for one (wrong, correction) pair.

    Frequencies are summed across all entries; the correction type kept is
    the highest-ranked one encountered, as ordered by isbetter (later in
    shortcorrectionset = better).

    :param hcs: nonempty list of corrections; all entries are assumed to
        share the same ``wrong`` and ``correction`` (the caller groups by
        that pair)
    :raises ValueError: if *hcs* is empty (the original code died with an
        unhelpful NameError on unbound ``wrong``)
    """
    if not hcs:
        raise ValueError('unifycorrections requires a nonempty list')
    # all entries share wrong/correction, so take them from the first one
    # instead of reassigning on every loop iteration
    wrong = hcs[0].wrong
    correction = hcs[0].correction
    currentcorrectiontype = nocorrectiontype
    totalfrq = 0
    for hc in hcs:
        if isbetter(hc.correctiontype, currentcorrectiontype):
            currentcorrectiontype = hc.correctiontype
        totalfrq += hc.frequency
    return HistoryCorrection(wrong=wrong, correction=correction,
                             correctiontype=currentcorrectiontype,
                             frequency=totalfrq)


def isbetter(corrtype1: str, corrtype2: str) -> bool:
    """Return True iff *corrtype1* ranks strictly higher than *corrtype2*.

    Ranking is positional in ``shortcorrectionset``: a correction type that
    appears later in that list is considered better. The original signature
    was annotated ``-> str`` although a bool is returned; fixed here.

    :raises ValueError: if either type is absent from shortcorrectionset
        (``list.index`` raises; preserved deliberately so unknown types
        fail loudly rather than silently ranking lowest)
    """
    c1score = shortcorrectionset.index(corrtype1)
    c2score = shortcorrectionset.index(corrtype2)
    return c1score > c2score

def getdonefilenames(filename) -> set:
result = set()
idata = readcsv(filename, header=False)
Expand Down

0 comments on commit 4fc7174

Please sign in to comment.