Skip to content

Commit

Permalink
childescorrections history update, correcttreebank: reducing alternat…
Browse files Browse the repository at this point in the history
…ives, add aha/oho etc
  • Loading branch information
JanOdijk committed Sep 22, 2024
1 parent 50a615a commit 4fc7174
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 12 deletions.
17 changes: 17 additions & 0 deletions src/sastadev/corrector.py
Original file line number Diff line number Diff line change
Expand Up @@ -1098,6 +1098,23 @@ def getalternativetokenmds(tokenmd: TokenMD, tokens: List[Token], tokenctr: int
name='Emphasis', value='Phoneme lengthening', cat='Pronunciation',
backplacement=bpl_word)

# aha oho uhu ehe
ahapattern = r'([aeouy])h\1'
ahare = re.compile(ahapattern)
if not known_word(token.word) and ahare.search(token.word):
newwords = [ahare.sub(r'\1', token.word)]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
backplacement=bpl_word)
# iehie ijhij
iehiepattern = r'(ie|ij)h\1'
iehiere = re.compile(iehiepattern)
if not known_word(token.word) and iehiere.search(token.word):
newwords = [iehiere.sub(r'\1', token.word)]
newtokenmds = updatenewtokenmds(newtokenmds, token, newwords, beginmetadata,
name='Emphasis', value='Phoneme Duplication', cat='Pronunciation',
backplacement=bpl_word)

# basic replacements replace as by als, isse by is
# here come the replacements
if token.word in basicreplacements:
Expand Down
36 changes: 28 additions & 8 deletions src/sastadev/correcttreebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from sastadev.metadata import (Meta, bpl_delete, bpl_indeze, bpl_node,
bpl_none, bpl_replacement, bpl_word, bpl_wordlemma, bpl_word_delprec, insertion)
from sastadev.sastatok import sasta_tokenize
from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist
from sastadev.sastatoken import Token, insertinflate, tokenlist2stringlist, tokenlist2string
from sastadev.sastatypes import (AltId, CorrectionMode, ErrorDict, MetaElement,
MethodName, Penalty, Position, PositionStr,
SynTree, Targets, Treebank, UttId)
Expand Down Expand Up @@ -613,9 +613,10 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
debug = False
# (fatstree, text='fattened tree:')

ctmds: List[Correction] = getcorrections(cleanutttokens, correctionparameters, fatstree)

rawctmds: List[Correction] = getcorrections(cleanutttokens, correctionparameters, fatstree)

ctmds = reducecorrections(rawctmds)
# ctmds = rawctmds

debug = False
if debug:
Expand Down Expand Up @@ -665,7 +666,7 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
# make sure to include the xmeta from CHAT cleaning!! variable allmetadata, or better metadata but perhaps rename to chatmetadata
fatnewstree = add_metadata(fatstree, chatmetadata)

ptmds.append((correctionwordlist, fatnewstree, cwmdmetadata))
ptmds.append((correctiontokenlist, fatnewstree, cwmdmetadata))

# select the stree for the most promising correction
debug = False
Expand All @@ -675,16 +676,16 @@ def correct_stree(stree: SynTree, corr: CorrectionMode, correctionparameters: C
debug = False

if ptmds == []:
thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
thecorrection, orandalts = (cleanutttokens, fatstree, origmetadata), None
elif corr in [corr1, corrn]:
thecorrection, orandalts = selectcorrection(fatstree, ptmds, corr)
else:
settings.LOGGER.error(
'Illegal correction value: {}. No corrections applied'.format(corr))
thecorrection, orandalts = (cleanutt, fatstree, origmetadata), None
thecorrection, orandalts = (cleanutttokens, fatstree, origmetadata), None

thetree = deepcopy(thecorrection[1])

correctiontokenlist = thecorrection[0]
debuga = False
# debuga = False
if debuga:
Expand Down Expand Up @@ -943,7 +944,25 @@ def oldgetuttid(stree: SynTree) -> UttId:
uttid = uttidlist[0]
return uttid

def reducecorrections(ctmds: List[Correction]) -> List[Correction]:
    """Deduplicate corrections that render to the same utterance string.

    Corrections whose token lists stringify identically (via
    tokenlist2string) are considered duplicates; for each such string only
    the correction with the lowest penalty (per compute_penalty) is kept.
    On a penalty tie the earliest correction wins. Result order follows
    first occurrence of each distinct utterance string.
    """
    # utterance string -> (penalty, tokenlist, metadata); dict insertion
    # order preserves first-seen ordering of the distinct strings.
    best = {}
    for toklist, md in ctmds:
        key = tokenlist2string(toklist)
        penalty = compute_penalty(md)
        # strict '<' keeps the earlier candidate on equal penalties,
        # matching the original behavior
        if key not in best or penalty < best[key][0]:
            best[key] = (penalty, toklist, md)

    return [(toklist, md) for _, toklist, md in best.values()]


def scorefunction(obj: Alternative) -> TupleNint:
Expand Down Expand Up @@ -1071,7 +1090,8 @@ def selectcorrection(stree: SynTree, ptmds: List[ParsedCorrection], corr: Correc

altid: AltId = 0
alts: Dict[AltId, Alternative] = {}
for cw, nt, md in ptmds:
for ct, nt, md in ptmds:
cw = tokenlist2stringlist(ct)
altsent = space.join(cw)
penalty = compute_penalty(md)

Expand Down
36 changes: 32 additions & 4 deletions src/sastadev/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,16 @@ class HistoryCorrection:
HistoryCorrectionDict = Dict[str, List[HistoryCorrection]]
space = ' '
eps = ''
nocorrectiontype = 'nocorrectiontype'

correctionset = [CHAT_explanation, CHAT_replacement, CHAT_wordnoncompletion]
correctionset = [nocorrectiontype, CHAT_explanation, CHAT_replacement, CHAT_wordnoncompletion]

chatshorttypedict = {CHAT_explanation: 'explanation',
CHAT_wordnoncompletion: 'noncompletion',
CHAT_replacement: 'replacement'}
CHAT_replacement: 'replacement',
nocorrectiontype: nocorrectiontype}

shortcorrectionset = [chatshorttypedict[v] for v in correctionset]

def getshortchattype(metaname: str) -> str:
if metaname in chatshorttypedict:
Expand Down Expand Up @@ -73,15 +76,40 @@ def gathercorrections(treebank: TreeBank) -> defaultdict:

def getcorrections(filename) -> defaultdict:
    """Read history corrections from *filename* into a per-word mapping.

    Each CSV row is (wrong, correction, correctiontype, frequency). Rows
    sharing the same (wrong, correction) pair are merged into a single
    HistoryCorrection by unifycorrections; the result maps each wrong
    word to the list of its unified corrections.
    """
    # group raw rows by (wrong, correction) pair
    grouped = defaultdict(list)
    for _, row in readcsv(filename, header=False):
        hc = HistoryCorrection(wrong=row[0], correction=row[1],
                               correctiontype=row[2], frequency=int(row[3]))
        grouped[(row[0], row[1])].append(hc)

    # collapse each group into one unified correction per wrong word
    resultdict = defaultdict(list)
    for (wrong, _), hclist in grouped.items():
        resultdict[wrong].append(unifycorrections(hclist))
    return resultdict


def unifycorrections(hcs: List[HistoryCorrection]) -> HistoryCorrection:
    """Merge a list of HistoryCorrections for one (wrong, correction) pair.

    Frequencies are summed across all entries; the correction type kept is
    the highest-ranked one encountered, as ordered by isbetter (later in
    shortcorrectionset = better).

    :param hcs: nonempty list of corrections; all entries are assumed to
        share the same ``wrong`` and ``correction`` (the caller groups by
        that pair)
    :raises ValueError: if *hcs* is empty (the original code died with an
        unhelpful NameError on unbound ``wrong``)
    """
    if not hcs:
        raise ValueError('unifycorrections requires a nonempty list')
    # all entries share wrong/correction, so take them from the first one
    # instead of reassigning on every loop iteration
    wrong = hcs[0].wrong
    correction = hcs[0].correction
    currentcorrectiontype = nocorrectiontype
    totalfrq = 0
    for hc in hcs:
        if isbetter(hc.correctiontype, currentcorrectiontype):
            currentcorrectiontype = hc.correctiontype
        totalfrq += hc.frequency
    return HistoryCorrection(wrong=wrong, correction=correction,
                             correctiontype=currentcorrectiontype,
                             frequency=totalfrq)


def isbetter(corrtype1: str, corrtype2: str) -> bool:
    """Return True iff *corrtype1* ranks strictly higher than *corrtype2*.

    Ranking is positional in ``shortcorrectionset``: a correction type that
    appears later in that list is considered better. The original signature
    was annotated ``-> str`` although a bool is returned; fixed here.

    :raises ValueError: if either type is absent from shortcorrectionset
        (``list.index`` raises; preserved deliberately so unknown types
        fail loudly rather than silently ranking lowest)
    """
    c1score = shortcorrectionset.index(corrtype1)
    c2score = shortcorrectionset.index(corrtype2)
    return c1score > c2score

def getdonefilenames(filename) -> set:
result = set()
idata = readcsv(filename, header=False)
Expand Down

0 comments on commit 4fc7174

Please sign in to comment.