Merge branch 'JuliaText:master' into fix/ROUGE

JuliaText · Oct 15, 2023 · 96e1cdc · 96e1cdc
2 parents 66d7657 + dfb4fd9
commit 96e1cdc
Show file tree

Hide file tree

Showing 8 changed files with 233 additions and 6 deletions.
diff --git a/Project.toml b/Project.toml
@@ -2,7 +2,7 @@ name = "TextAnalysis"
 uuid = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
 license = "MIT"
 desc = "Julia package for text analysis"
-version = "0.7.4"
+version = "0.7.5"
 
 [deps]
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
@@ -23,6 +23,7 @@ WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
 
 [compat]
 DataStructures = "0.17, 0.18"
+DelimitedFiles = "1"
 JSON = "0.21"
 Languages = "0.4"
 ProgressMeter = "1.5"

diff --git a/README.md b/README.md
@@ -3,14 +3,14 @@
 A Julia package for working with text.
 
 [![CI](https://github.com/juliatext/TextAnalysis.jl/workflows/CI/badge.svg?event=push&branch=master)](https://github.com/JuliaText/TextAnalysis.jl/actions?query=workflow%3ACI)
-[![version](https://juliahub.com/docs/TextAnalysis/version.svg)](https://juliahub.com/ui/Packages/TextAnalysis/5Mwet)
-[![docs](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliahub.com/docs/TextAnalysis/5Mwet) 
+[![version](https://juliahub.com/docs/TextAnalysis/version.svg)](https://juliahub.com/ui/Packages/General/TextAnalysis/)
+[![docs](https://img.shields.io/badge/docs-stable-blue.svg)](https://docs.juliahub.com/TextAnalysis/) 
 
 ## Introduction
 
 TextAnalysis provides support for standard tools and models for working with textual data and natural languages in the Julia language. 
 
-- **Documentation** :  [Documentation for stable released version](https://juliahub.com/docs/TextAnalysis/5Mwet)
+- **Documentation** :  [Documentation for stable released version](https://docs.juliahub.com/TextAnalysis)
 - **License** : [MIT License](https://github.com/JuliaText/TextAnalysis.jl/blob/master/LICENSE.md)
 
 ## Features

diff --git a/docs/src/evaluation_metrics.md b/docs/src/evaluation_metrics.md
@@ -6,11 +6,15 @@ As of now TextAnalysis provides the following evaluation metrics.
 * [ROUGE-N](https://en.wikipedia.org/wiki/ROUGE_(metric))
 * [ROUGE-L](https://en.wikipedia.org/wiki/ROUGE_(metric))
 
+* [BLEU (bilingual evaluation understudy)](https://en.wikipedia.org/wiki/BLEU)
+
 ## ROUGE-N
 This metric is an evaluation based on the overlap of N-grams
 between the system and reference summaries.
 
-    rouge_n(references, candidate, n; avg, lang)
+```@docs
+rouge_n
+```
 
 The function takes the following arguments -
 
@@ -35,3 +39,39 @@ julia> rouge_n(reference_summaries, candidate_summary, 2, avg=true)
 julia> rouge_n(reference_summaries, candidate_summary, 1, avg=true)
 0.5051282051282051
 ```
+
+## BLEU (bilingual evaluation understudy)
+
+```@docs
+bleu_score
+```
+
+[NLTK sample](https://www.nltk.org/api/nltk.translate.bleu_score.html)
+```@example
+    using TextAnalysis
+
+    reference1 = [
+        "It", "is", "a", "guide", "to", "action", "that",
+        "ensures", "that", "the", "military", "will", "forever",
+        "heed", "Party", "commands"
+    ]
+    reference2 = [
+        "It", "is", "the", "guiding", "principle", "which",
+        "guarantees", "the", "military", "forces", "always",
+        "being", "under", "the", "command", "of", "the",
+        "Party"
+    ]
+    reference3 = [
+        "It", "is", "the", "practical", "guide", "for", "the",
+        "army", "always", "to", "heed", "the", "directions",
+        "of", "the", "party"
+    ]
+
+    hypothesis1 = [
+        "It", "is", "a", "guide", "to", "action", "which",
+        "ensures", "that", "the", "military", "always",
+        "obeys", "the", "commands", "of", "the", "party"
+    ]
+
+    score = bleu_score([[reference1, reference2, reference3]], [hypothesis1])
+```
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
@@ -61,6 +61,7 @@ module TextAnalysis
     export NaiveBayesClassifier
     export tag_scheme!
     export rouge_l_summary, rouge_l_sentence, rouge_n
+    export bleu_score
     export PerceptronTagger, fit!, predict
 
     export Vocabulary, lookup, update
@@ -91,6 +92,7 @@ module TextAnalysis
     include("utils.jl")
 
     include("evaluation_metrics.jl")
+    include("translate_evaluation/bleu_score.jl")
     include("coom.jl")
 
 

diff --git a/src/coom.jl b/src/coom.jl
@@ -145,7 +145,7 @@ function CooMatrix{T}(doc; window::Int=5, normalize::Bool=true) where T<:Abstrac
     CooMatrix{T}(doc, terms, window=window, normalize=normalize)
 end
 
-CooMatrix(doc; window::Int=5, normalize::Bool=true) where T<:AbstractFloat =
+CooMatrix(doc; window::Int=5, normalize::Bool=true) =
     CooMatrix{Float64}(doc, window=window, normalize=normalize)
 
 """

diff --git a/src/translate_evaluation/bleu_score.jl b/src/translate_evaluation/bleu_score.jl
@@ -0,0 +1,145 @@
+# Julia Implementation of BLEU and Smooth BLEU score
+# ref: https://github.com/tensorflow/nmt/blob/master/nmt/scripts/bleu.py#L56
+
+# Example: bleu_score([[["apple", "is", "apple"]]], [["apple", "is", "appl"]])
+
+# Julia implementation of BLEU and smooth-BLEU.
+# Based on https://github.com/AdarshKumar712/Metrics.jl/blob/master/src/NLP_Metrics/bleu.jl
+
+# This module provides a Julia implementation of BLEU and smooth-BLEU.
+# Smooth BLEU is computed following the method outlined in the paper:
+# Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
+# evaluation metrics for machine translation. COLING 2004.
+
+
+"""
+    get_ngrams(segment, max_order)
+
+Extract all n-grams up to a given maximum order from an input segment.
+
+Return a `Dict{Vector{Symbol},Int}` that maps every n-gram of length `1:max_order` found in
+`segment` to the number of times it occurs. Each key is the vector of the n-gram's tokens
+converted to `Symbol`s.
+
+# Arguments
+ - `segment`: tokenized text segment from which n-grams will be extracted.
+ - `max_order`: maximum length, in tokens, of the n-grams returned by this method.
+
+"""
+function get_ngrams(segment::Vector{<:AbstractString}, max_order::Integer)
+    ngrams_count = Dict{Vector{Symbol},Int}()
+    # Convert every token once instead of once per n-gram that contains it.
+    tokens = Symbol.(segment)
+    for order in 1:max_order
+        # The range is empty when the segment is shorter than `order`.
+        for i in 1:(length(tokens)-order+1)
+            ngram = tokens[i:i+order-1]
+            ngrams_count[ngram] = get(ngrams_count, ngram, 0) + 1
+        end
+    end
+    return ngrams_count
+end
+
+const ListOfTokens = Vector{<:AbstractString}
+const DocumentWithTokenizedSentences = Vector{<:ListOfTokens}
+
+"""
+    bleu_score(reference_corpus, translation_corpus; max_order=4, smooth=false)
+
+Compute the corpus-level BLEU score of translated segments against one or more references.
+
+Return a `NamedTuple` with the fields `bleu` (the BLEU score), `precisions` (n-gram precisions
+for the orders `1:max_order`), `bp` (brevity penalty), `geo_mean` (geometric mean of the n-gram
+precisions), `translation_length` and `reference_length`.
+
+# Arguments
+ - `reference_corpus`: list that holds, for each translation, the list of its references. Each reference should be tokenized into a list of tokens.
+ - `translation_corpus`: list of translations to score. Each translation should be tokenized into a list of tokens.
+ - `max_order=4`: maximum n-gram order to use when computing BLEU score. Must be at least 1.
+ - `smooth=false`: whether or not to apply Lin et al. 2004 smoothing.
+
+References and translations are paired positionally; entries beyond the shorter of the two
+corpora are ignored, and a translation whose list of references is empty is skipped.
+
+Example:
+```julia
+one_doc_references = [
+    ["apple", "is", "apple"],
+    ["apple", "is", "a", "fruit"]
+]  
+one_doc_translation = [
+    "apple", "is", "appl"
+]
+bleu_score([one_doc_references], [one_doc_translation], smooth=true)
+```
+"""
+function bleu_score(
+    reference_corpus::Vector{<:T}, translation_corpus::T;
+    max_order=4, smooth=false
+) where {T<:DocumentWithTokenizedSentences}
+    max_order >= 1 ||
+        throw(ArgumentError("max_order must be at least 1, got $max_order"))
+
+    matches_by_order = zeros(max_order)
+    possible_matches_by_order = zeros(max_order)
+
+    reference_length = 0
+    translation_length = 0
+    # `zip` over an empty corpus performs no iterations, so no emptiness guard is needed.
+    for (references, translation) in zip(reference_corpus, translation_corpus)
+        isempty(references) && continue
+
+        # The shortest reference contributes to the reference length.
+        reference_length += minimum(length, references)
+        translation_length += length(translation)
+
+        # For every n-gram keep the largest count seen in any single reference.
+        merged_ref_ngram_counts = Dict{Vector{Symbol},Int}()
+        for reference in references
+            for (ngram, n) in get_ngrams(reference, max_order)
+                merged_ref_ngram_counts[ngram] = max(get(merged_ref_ngram_counts, ngram, 0), n)
+            end
+        end
+
+        # Clip each translation n-gram count by the merged reference count.
+        for (ngram, n) in get_ngrams(translation, max_order)
+            matches_by_order[length(ngram)] += min(get(merged_ref_ngram_counts, ngram, 0), n)
+        end
+
+        # A translation shorter than `order` has no n-grams of that order.
+        for order in 1:max_order
+            possible_matches_by_order[order] += max(length(translation) - order + 1, 0)
+        end
+    end
+
+    precisions = map(1:max_order) do i
+        if smooth
+            (matches_by_order[i] + 1.0) / (possible_matches_by_order[i] + 1.0)
+        elseif possible_matches_by_order[i] > 0
+            matches_by_order[i] / possible_matches_by_order[i]
+        else
+            0.0
+        end
+    end
+
+    # The geometric mean stays 0 as soon as one order has no match at all.
+    geo_mean = 0.0
+    if all(>(0), precisions)
+        geo_mean = exp(sum(log, precisions) / max_order)
+    end
+
+    # When both lengths are 0 the ratio is NaN, the comparison is false and `bp` stays 1.
+    ratio = translation_length / reference_length
+    bp = 1.0
+    if ratio < 1.0
+        bp = exp(1 - 1.0 / ratio)
+    end
+
+    bleu = geo_mean * bp
+
+    return (
+        bleu=bleu,
+        precisions=precisions,
+        bp=bp,
+        geo_mean=geo_mean,
+        translation_length=translation_length,
+        reference_length=reference_length
+    )
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -24,3 +24,4 @@ include("bayes.jl")
 include("taggingschemes.jl")
 include("evaluation_metrics.jl")
 include("LM.jl")
+include("translate_evaluation.jl")
diff --git a/test/translate_evaluation.jl b/test/translate_evaluation.jl
@@ -0,0 +1,38 @@
+using TextAnalysis
+
+@testset "Evaluation/BLEU" begin
+    # Token-based n-grams: every order in 1:max_order must be present, and no other order.
+    max_order = 4
+    token_ngrams = TextAnalysis.get_ngrams(split("it is a dog "), max_order)
+    observed_orders = Set(length(key) for key in keys(token_ngrams))
+
+    @test count(in(observed_orders), 1:max_order) == max_order
+    @test isempty(setdiff(observed_orders, 1:max_order))
+
+    # NLTK sample https://www.nltk.org/api/nltk.translate.bleu_score.html
+    tokenize = sentence -> String.(split(sentence))
+
+    reference1 = tokenize(
+        "It is a guide to action that ensures that the military will forever heed Party commands"
+    )
+    reference2 = tokenize(
+        "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+    )
+    reference3 = tokenize(
+        "It is the practical guide for the army always to heed the directions of the party"
+    )
+
+    hypothesis1 = tokenize(
+        "It is a guide to action which ensures that the military always obeys the commands of the party"
+    )
+
+    # One document: three references scored against a single hypothesis.
+    result = bleu_score([[reference1, reference2, reference3]], [hypothesis1])
+    @test result.bleu ≈ 0.5045 atol = 1e-4 #(NLTK)
+end