diff --git a/src/sastadev/corrector.py b/src/sastadev/corrector.py index 33c8dda..2b7b78f 100644 --- a/src/sastadev/corrector.py +++ b/src/sastadev/corrector.py @@ -1098,6 +1098,23 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int name='Emphasis', value='Phoneme lengthening', cat='Pronunciation', backplacement=bpl_word) + # aha oho uhu ehe + ahapattern = r'([aeouy])h\1' + ahare = re.compile(ahapattern) + if not known_word(token.word) and ahare.search(token.word): + newwords = [ahare.sub(r'\1', token.word)] + newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata, + name='Emphasis', value='Phoneme Duplication', cat='Pronunciation', + backplacement=bpl_word) + # iehie ijhij + iehiepattern = r'(ie|ij)h\1' + iehiere = re.compile(iehiepattern) + if not known_word(token.word) and iehiere.search(token.word): + newwords = [iehiere.sub(r'\1', token.word)] + newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata, + name='Emphasis', value='Phoneme Duplication', cat='Pronunciation', + backplacement=bpl_word) + # basic replacements replace as by als, isse by is # here come the replacements if token.word in basicreplacements: diff --git a/src/sastadev/correcttreebank.py b/src/sastadev/correcttreebank.py index 739878d..31b818e 100644 --- a/src/sastadev/correcttreebank.py +++ b/src/sastadev/correcttreebank.py @@ -17,7 +17,7 @@ from sastadev.metadata import (Meta, bpl_delete, bpl_indeze, bpl_node, bpl_none, bpl_replacement, bpl_word, bpl_wordlemma, bpl_word_delprec, insertion) from sastadev.sastatok import sasta_tokenize -from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist +from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist, tokenlist2string from sastadev.sastatypes import (AltId, CorrectionMode, ErrorDict, MetaElement, MethodName, Penalty, Position, PositionStr, SynTree, Targets, Treebank, UttId) @@ -613,9 +613,10 @@ def correct_stree(stree: SynTree, corr: 
CorrectionMode, correctionparameters: C debug = False # (fatstree, text='fattened tree:') - ctmds: List[Correction] = getcorrections(cleanutttokens, correctionparameters, fatstree) - + rawctmds: List[Correction] = getcorrections(cleanutttokens, correctionparameters, fatstree) + ctmds = reducecorrections(rawctmds) + # ctmds = rawctmds debug = False if debug: @@ -665,7 +666,7 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C # make sure to include the xmeta from CHAT cleaning!! variable allmetadata, or better metadata but perhaps rename to chatmetadata fatnewstree = add_metadata(fatstree, chatmetadata) - ptmds.append((correctionwordlist, fatnewstree, cwmdmetadata)) + ptmds.append((correctiontokenlist, fatnewstree, cwmdmetadata)) # select the stree for the most promising correction debug = False @@ -675,16 +676,16 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C debug = False if ptmds == []: - thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None + thecorrection, orandalts = (cleanutttokens, fatstree, origmetadata), None elif corr in [corr1, corrn]: thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr) else: settings.LOGGER.error( 'Illegal correction value: {}. 
No corrections applied'.format(corr)) - thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None + thecorrection, orandalts = (cleanutttokens, fatstree, origmetadata), None thetree = deepcopy(thecorrection[1]) - + correctiontokenlist = thecorrection[0] debuga = False # debuga = False if debuga: @@ -943,7 +944,25 @@ def oldgetuttid(stree: SynTree) -> UttId: uttid = uttidlist[0] return uttid +def reducecorrections(ctmds: List[Correction]) -> List[Correction]: + tempdict = {} + for tokenlist, metadata in ctmds: + tokenstr = tokenlist2string(tokenlist) + newpenalty = compute_penalty(metadata) + if tokenstr in tempdict: + oldpenalty = tempdict[tokenstr][0] + if newpenalty < oldpenalty: + tempdict[tokenstr] = (newpenalty, tokenlist, metadata) + else: + tempdict[tokenstr] = (newpenalty, tokenlist, metadata) + resultlist = [] + for tokenstr in tempdict: + cand = tempdict[tokenstr] + result = (cand[1], cand[2]) + resultlist.append(result) + + return resultlist def scorefunction(obj: Alternative) -> TupleNint: @@ -1071,7 +1090,8 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc altid: AltId = 0 alts: Dict[AltId, Alternative] = {} - for cw, nt, md in ptmds: + for ct, nt, md in ptmds: + cw = tokenlist2stringlist(ct) altsent = space.join(cw) penalty = compute_penalty(md) diff --git a/src/sastadev/history.py b/src/sastadev/history.py index b3980a3..a199b9a 100644 --- a/src/sastadev/history.py +++ b/src/sastadev/history.py @@ -28,13 +28,16 @@ class HistoryCorrection: HistoryCorrectionDict = Dict[str, List[HistoryCorrection]] space = ' ' eps = '' +nocorrectiontype = 'nocorrectiontype' -correctionset = [CHAT_explanation, CHAT_replacement, CHAT_wordnoncompletion] +correctionset = [nocorrectiontype, CHAT_explanation, CHAT_replacement, CHAT_wordnoncompletion] chatshorttypedict = {CHAT_explanation: 'explanation', CHAT_wordnoncompletion: 'noncompletion', - CHAT_replacement: 'replacement'} + CHAT_replacement: 'replacement', + nocorrectiontype: 
nocorrectiontype} +shortcorrectionset = [chatshorttypedict[v] for v in correctionset] def getshortchattype(metaname: str) -> str: if metaname in chatshorttypedict: @@ -73,15 +76,40 @@ def gathercorrections(treebank: TreeBank) -> defaultdict: def getcorrections(filename) -> defaultdict: resultdict = defaultdict(list) + tempdict = defaultdict(list) idata = readcsv(filename, header=False) for i, row in idata: wrong = row[0] - newhc = HistoryCorrection(wrong=wrong, correction=row[1], correctiontype=row[2], frequency=int(row[3])) - resultdict[wrong].append(newhc) + correction = row[1] + newhc = HistoryCorrection(wrong=wrong, correction=correction, correctiontype=row[2], frequency=int(row[3])) + tempdict[(wrong, correction)].append(newhc) + for (wrong, correction) in tempdict: + unifiedcorrection = unifycorrections(tempdict[(wrong, correction)]) + resultdict[wrong].append(unifiedcorrection) return resultdict +def unifycorrections(hcs: List[HistoryCorrection]) -> HistoryCorrection: + currentcorrectiontype = nocorrectiontype + totalfrq = 0 + for hc in hcs: + wrong = hc.wrong + correction = hc.correction + if isbetter(hc.correctiontype, currentcorrectiontype): + currentcorrectiontype = hc.correctiontype + totalfrq += hc.frequency + result = HistoryCorrection(wrong=wrong, correction=correction, + correctiontype=currentcorrectiontype, frequency=totalfrq) + return result + + +def isbetter(corrtype1, corrtype2) -> bool: + c1score = shortcorrectionset.index(corrtype1) + c2score = shortcorrectionset.index(corrtype2) + result = c1score > c2score + return result + + def getdonefilenames(filename) -> set: result = set() idata = readcsv(filename, header=False)