Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add non exact match strategy #11

Open
lsmith77 opened this issue Nov 22, 2022 · 0 comments
Open

add non exact match strategy #11

lsmith77 opened this issue Nov 22, 2022 · 0 comments

Comments

@lsmith77
Copy link

lsmith77 commented Nov 22, 2022

In our testing we found more missing words than we can realistically add to Wiktionary (see also #8), so we have now implemented the following strategy, which at least allows us to detect the genus. Would this be interesting to add to your package?

# Word endings mapped to a genus code ("n" neuter, "f" feminine, "m" masculine).
# determine_genus_from_ending() walks the genera and their endings in the order
# written here and returns on the first match, so the ordering is significant
# (e.g. "lein" under "n" is tried before "in" under "f").
primary_german_genus_endings = {
    "n": [
        "chen", "ett", "eau", "lein", "icht", "il", "ium",
        "it", "ma", "ment", "tel", "tum", "um",
    ],
    "f": [
        "in", "a", "ade", "age", "anz", "elle", "ette", "ere", "enz",
        "ei", "ine", "isse", "itis", "ive", "ie", "heit", "keit", "ik",
        "sion", "se", "sis", "tät", "ung", "ur", "schaft",
    ],
    "m": [
        "ant", "ast", "ich", "ist", "ig", "ling", "or",
        "us", "ismus", "är", "eur", "iker", "ps",
    ],
}

# Weaker ending heuristics, keyed by the same genus codes as the primary table.
secondary_german_genus_endings = {
    # About three out of four nouns ending in -nis or -sal are neuter.
    "n": ["nis", "sal"],
    # The overwhelming majority of German -ion nouns are feminine, although
    # exceptions exist (e.g. "Postillion", which is masculine).
    "f": ["ion"],
    # More than half of the nouns ending in -er, -en or -el are masculine.
    "m": ["er", "en", "el"],
}

def determine_genus_from_ending(word, german_genus_endings):
    """Guess the genus of ``word`` from its ending.

    Args:
        word: The noun to classify.
        german_genus_endings: Mapping of genus code to a list of word endings.

    Returns:
        ``{"genus": <code>}`` for the first genus (in mapping order) that has
        an ending ``word`` ends with, or ``None`` when no ending matches.
    """
    for genus_code, suffixes in german_genus_endings.items():
        if any(word.endswith(suffix) for suffix in suffixes):
            return {"genus": genus_code}

    return None


def german_noun_lookup(word):
    """Return the first ``german_nouns`` entry for ``word`` with a ``"genus"`` key.

    The genus is resolved in this order:

    1. the entry's own ``"genus"`` value,
    2. the entry's ``"genus 1"`` value (presumably used for nouns listed with
       more than one genus -- confirm against the data set),
    3. ``"f"`` for words ending in "leute",
    4. a guess from ``primary_german_genus_endings``,
    5. a guess from ``secondary_german_genus_endings``.

    Args:
        word: The noun to look up, spelled as it is keyed in ``german_nouns``.

    Returns:
        The entry dict including a ``"genus"`` key, or ``None`` when the word
        has no entry or no genus could be determined. When the genus had to
        be derived (steps 2-5) a shallow copy of the entry is returned, so the
        looked-up entry itself is never modified.
    """
    entries = german_nouns[word]
    if not entries:
        return None

    entry = entries[0]

    if "genus" in entry:
        return entry

    # From here on a "genus" key is added. Do that on a copy: the object
    # handed back by german_nouns may be the data set's own record, and a
    # derived or guessed genus must not leak into later lookups.
    entry = dict(entry)

    if "genus 1" in entry:
        entry["genus"] = entry["genus 1"]
        return entry

    if word[-5:].lower() == "leute":
        entry["genus"] = "f"
        return entry

    guess = determine_genus_from_ending(word, primary_german_genus_endings)
    if guess is None or "genus" not in guess:
        guess = determine_genus_from_ending(word, secondary_german_genus_endings)
        if guess is None or "genus" not in guess:
            return None

    entry["genus"] = guess["genus"]
    return entry


def german_noun_analysis(word, genus_only=False):
    """Analyse ``word``, falling back to compound splitting and ending heuristics.

    Strategy, in order:

    1. Exact lookup via ``german_noun_lookup``.
    2. With ``genus_only``: a guess from ``primary_german_genus_endings``.
    3. Compound splitting: progressively shorter tails of ``word`` (skipping
       the first two letters, keeping at least three) are capitalised and
       looked up. The first hit is returned with ``"Lemma"`` set to ``word``
       and, unless ``genus_only``, every ``"flexion"`` form prefixed with the
       leading part of ``word``.
    4. With ``genus_only``: a guess from ``secondary_german_genus_endings``.

    Args:
        word: The noun to analyse.
        genus_only: When true, only the genus matters; ending-based guesses
            (``{"genus": ...}``) are allowed and flexion forms are not
            rewritten.

    Returns:
        An entry dict, a ``{"genus": ...}`` guess, or ``None`` if nothing
        matched.
    """
    entry = german_noun_lookup(word)
    if entry is not None:
        return entry

    if genus_only:
        guess = determine_genus_from_ending(word, primary_german_genus_endings)
        if guess is not None:
            return guess

    # Skip the first 2 letters and never try a tail shorter than 3 letters.
    for split_at in range(2, len(word) - 2):
        tail = word[split_at:]

        # avoid cases like 'Ende' at the end of 'Arbeitgebende'
        if tail == "ende":
            break

        base_entry = german_noun_lookup(tail.capitalize())
        if base_entry is None:
            continue

        # Build a new dict rather than editing base_entry: it may be the data
        # set's own record for the base noun, and overwriting its "Lemma" and
        # "flexion" would corrupt later lookups of that noun.
        compound = dict(base_entry)
        compound["Lemma"] = word
        if not genus_only:
            prefix = word[:split_at]
            compound["flexion"] = {
                case: prefix + form.lower()
                for case, form in base_entry["flexion"].items()
            }

        return compound

    if genus_only:
        # The primary endings were already tried above and yielded nothing, so
        # repeating them here could never succeed; fall back to the weaker
        # secondary endings, mirroring german_noun_lookup.
        return determine_genus_from_ending(word, secondary_german_genus_endings)

    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant