ROUGE: changed output types, optimized, aligned with ROUGE-1.5.5 and …

…checked with Google Research implementation
JuliaText · Oct 16, 2023 · 177e5d4 · 177e5d4
1 parent 66d7657
commit 177e5d4
Show file tree

Hide file tree

Showing 4 changed files with 149 additions and 77 deletions.
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
@@ -60,7 +60,10 @@ module TextAnalysis
 
     export NaiveBayesClassifier
     export tag_scheme!
-    export rouge_l_summary, rouge_l_sentence, rouge_n
+
+    export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax
+    export bleu_score
+
     export PerceptronTagger, fit!, predict
 
     export Vocabulary, lookup, update

diff --git a/src/evaluation_metrics.jl b/src/evaluation_metrics.jl
@@ -1,20 +1,68 @@
 """
-    rouge_n(references::Array{T}, candidate::AbstractString, n; avg::Bool, lang::Language) where T<: AbstractString
+A score with precision, recall and fmeasure
+"""
+@kwdef struct Score
+    precision::Float32
+    recall::Float32
+    fmeasure::Float32
+end
+
+Base.show(io::IO, score::Score) = Base.write(io,
+    string(
+        "Score(precision=", score.precision,
+        ", recall=", score.recall,
+        ", fmeasure=", score.fmeasure,
+        ")"
+    )
+)
+
+"""
+    average(scores::Vector{Score})::Score
+
+Returns average values of scores by precision/recall/fmeasure separately
+"""
+function average(scores::Vector{Score})::Score
+    res = reduce(scores, init=zeros(Float32, 3)) do acc, i
+        acc + [
+            i.precision
+            i.recall
+            i.fmeasure
+        ]
+    end
+    Score((res ./ length(scores))...)
+end
+
+"""
+    argmax(scores::Vector{Score})::Score
+
+Returns maximum by precision value
+"""
+Base.argmax(scores::Vector{Score})::Score = argmax(s -> s.fmeasure, scores)
+
+"""
+    rouge_n(references::Vector{<:AbstractString}, candidate::AbstractString, n::Int; lang::Language)
 
 Compute n-gram recall between `candidate` and the `references` summaries.
 
 See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
 
 See also: [`rouge_l_sentence`](@ref), [`rouge_l_summary`](@ref)
 """
-function rouge_n(references, candidate, n; avg = true, lang = Languages.English())
+function rouge_n(references::Vector{<:AbstractString}, candidate::AbstractString, n::Int;
+    lang=Languages.English())::Vector{Score}
     ng_candidate = ngramize(lang, candidate, n)
     rouge_recall = map(references) do ref
         ng_ref = ngramize(lang, ref, n)
-        rouge_match_score(keys(ng_ref), ng_candidate) / sum(values(ng_ref))
+        totalGramHit = rouge_match_score(keys(ng_ref), ng_candidate)
+        score_r = totalGramHit / sum(values(ng_ref))
+        score_p = totalGramHit / sum(values(ng_candidate))
+        Score(
+            score_p,
+            score_r,
+            fmeasure_lcs(score_r, score_p)
+        )
     end
 
-    avg == true && return jackknife_avg(rouge_recall)
     return rouge_recall
 end
 
@@ -28,65 +76,73 @@ function rouge_match_score(ref, candidate::Dict)
 end
 
 """
-    rouge_l_sentence(references, candidate, β, average)
+    rouge_l_sentence(references::Vector{<:AbstractString}, candidate::AbstractString, β=8; weighted=false)
 
 Calculate the ROUGE-L score between `references` and `candidate` at sentence level.
 
 See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
 
 See also: [`rouge_n`](@ref), [`rouge_l_summary`](@ref)
 """
-function rouge_l_sentence(references, candidate, β=8, average = true)
+function rouge_l_sentence(references::Vector{<:AbstractString}, candidate::AbstractString, β=8; weighted=true)
     ngram_cand = tokenize(Languages.English(), candidate)
-    rouge_l_list = Float64[]
+    rouge_l_list = Score[]
 
     for ref in references
         ngram_ref = tokenize(Languages.English(), ref)
-        r_lcs = weighted_lcs(ngram_ref, ngram_cand, true, sqrt) / length(ngram_ref)
-        p_lcs = weighted_lcs(ngram_ref, ngram_cand, true, sqrt) / length(ngram_cand)
-        score = fmeasure_lcs(r_lcs, p_lcs, β)
-        push!(rouge_l_list, score)
+        r_lcs = weighted_lcs(ngram_ref, ngram_cand, weighted, sqrt) / length(ngram_ref)
+        p_lcs = weighted_lcs(ngram_ref, ngram_cand, weighted, sqrt) / length(ngram_cand)
+        fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
+        push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
     end
 
-    if average == true
-        rouge_l_list = jackknife_avg(rouge_l_list)
-    end
     return rouge_l_list
 end
 
 """
-    rouge_l_summary(references, candidate, β, average)
+    rouge_l_summary(references, candidate, β)
 
 Calculate the ROUGE-L score between `references` and `candidate` at summary level.
 
 See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
 
 See also: [`rouge_l_sentence()`](@ref), [`rouge_l_summary`](@ref)
 """
-function rouge_l_summary(references, candidate, β, averaging=true)
-    rouge_l_list = Float64[]
+function rouge_l_summary(references::Vector{<:AbstractString}, candidate::AbstractString, β::Int)::Vector{Score}
+    rouge_l_list = Score[]
+    ref_sent_tokens = map(references) do ref_sents
+        map(split_sentences(ref_sents)) do ref_sent
+            tokenize(Languages.English(), ref_sent)
+        end
+    end
+
+    ref_sent_total_tokens = map(ref_sent_tokens) do ref_tokens
+        sum(length, ref_tokens)
+    end
+
     cand_sent_list = split_sentences(candidate)
+    cand_sent_tokens = map(cand_sent_list) do cand_sent
+        tokenize(Languages.English(), cand_sent)
+    end
 
-    for ref in references
-        ref_sent_list = split_sentences(ref)
+    cand_total_tokens_length = sum(length, cand_sent_tokens)
+
+    for i in eachindex(ref_sent_tokens)
         sum_value = 0
 
-        for ref_sent in ref_sent_list
+        for ref_sent in ref_sent_tokens[i]
             l_ = []
-            arg1 = tokenize(Languages.English(), ref_sent)
-            for cand_sent in cand_sent_list
-                arg2 = tokenize(Languages.English(), cand_sent)
-                append!(l_, weighted_lcs_tokens(arg1, arg2, false, sqrt))
+            for cand_sent in cand_sent_tokens
+                append!(l_, weighted_lcs_tokens(ref_sent, cand_sent, false))
             end
-            sum_value += length(unique(l_))
+            sum_value += count(!isempty, unique(l_))
         end
 
-        r_lcs = sum_value / length(tokenize(Languages.English(), ref))
-        p_lcs = sum_value / length(tokenize(Languages.English(), candidate))
-        score = fmeasure_lcs(r_lcs, p_lcs, β)
-        push!(rouge_l_list,score)
+        r_lcs = sum_value / ref_sent_total_tokens[i]
+        p_lcs = sum_value / cand_total_tokens_length
+        fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
+        push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
     end
 
-    averaging == true && return jackknife_avg(rouge_l_list)
     return rouge_l_list
 end
diff --git a/src/utils.jl b/src/utils.jl
@@ -1,27 +1,3 @@
-# The jackknife is a resampling technique especially useful for variance and bias estimation.
-"""
-    jackknife_avg(`scores`)
-
-Apply jackknife on the input list of `scores`
-"""
-function jackknife_avg(scores)
-    if length(collect(Set(scores))) == 1
-        #= In case the elements of the array are all equal=#
-        return scores[1]
-    else
-        #=store the maximum scores
-        from the m different sets of m-1 scores.
-        such that m is the len(score_list)=#
-        average = []
-        for i in scores
-            # dummy : list a particular combo of m-1 scores
-            dummy = [j for j in scores if i != j]
-            append!(average, max(dummy...))
-            end
-
-        return sum(average)/length(average)
-    end
-end
 
 """
     weighted_lcs(X, Y, weight_score::Bool, returns_string::Bool, weigthing_function::Function)
@@ -31,16 +7,16 @@ Compute the Weighted Longest Common Subsequence of X and Y.
 function weighted_lcs(X, Y, weighted=true, f=sqrt)
     result = weighted_lcs_inner(X, Y, weighted, f)
 
-    return result.lcs_length
+    return result.c_table[end, end]
 end
 
 function weighted_lcs_tokens(X, Y, weighted=true, f=sqrt)
-    m, n, c_table, w_table, lcs_length = weighted_lcs_inner(X, Y, weighted, f)
+    m, n, c_table, w_table = weighted_lcs_inner(X, Y, weighted, f)
 
     # if weighted == true 
     #     lcs_length = c_table[m, n]^(2) # ?....
     # end
-
+    lcs_length = m
     lcs = ["" for i in 1:(lcs_length+1)]
     i = m + 1
     j = n + 1
@@ -63,7 +39,7 @@ end
 
 function weighted_lcs_inner(X, Y, weighted=true, f=sqrt)
     m, n = length(X), length(Y)
-    c_table = zeros(Int32, m + 1, n + 1)
+    c_table = zeros(Float32, m + 1, n + 1)
     w_table = zeros(Int32, m + 1, n + 1)
     increment = 1
 
@@ -78,12 +54,12 @@ function weighted_lcs_inner(X, Y, weighted=true, f=sqrt)
                 w_table[i, j] = k + 1
             else
                 c_table[i, j] = max(c_table[i-1, j], c_table[i, j-1])
-                w_table[i, j] = 0  # no match at i,j
+                # w_table[i, j] = 0  # no match at i,j
             end
         end
     end
 
-    (m=m, n=n, c_table=c_table, w_table=w_table, lcs_length=c_table[m+1, n+1])
+    (m=m, n=n, c_table=c_table, w_table=w_table)
 end
 
 
@@ -98,14 +74,7 @@ Compute the F-measure based on WLCS.
 - `PLCS` - Precision Factor
 - `β` - Parameter
 """
-function fmeasure_lcs(RLCS, PLCS, β=1)
-    try
-        return ((1 + β^2) * RLCS * PLCS) / (RLCS + (β^2) * PLCS)
-    catch ex
-        if ex isa DivideError
-            return 0
-        else
-            rethrow(ex)
-        end
-    end
+function fmeasure_lcs(RLCS::Real, PLCS::Real, β=1.0)::Real
+    divider = RLCS + (β^2) * PLCS
+    return iszero(divider) ? 0.0 : (1 + β^2) * RLCS * PLCS / divider
 end
diff --git a/test/evaluation_metrics.jl b/test/evaluation_metrics.jl
@@ -1,17 +1,61 @@
+using TextAnalysis
+using Test
+
+@testset "Service functions check" begin
+    @test argmax([
+        Score(0, 1, 2),
+        Score(3, 0, 0),
+        Score(0, 6, 1)
+    ]) == Score(0, 1, 2)
+
+    @test average([
+        Score(1, 10, 100),
+        Score(2, 20, 200),
+        Score(3, 30, 300)
+    ]) == Score(2, 20, 200)
+end
+
 @testset "Evaluation Metrics" begin
     @testset "Rouge" begin
         candidate_sentence = "Brazil, Russia, China and India are growing nations"
-        candidate_summary =  "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
+        candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
 
         reference_sentences = ["Brazil, Russia, India and China are growing nations", "Brazil and India are two of the developing nations that are part of the BRIC"]
         reference_summaries = ["Brazil, Russia, India and China are the next big poltical powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the  BRIC(S) and have been invited to the G20 summit."]
 
-        @test rouge_n(reference_summaries, candidate_summary, 1, avg=true) >= 0.505
-        @test rouge_n(reference_summaries, candidate_summary, 2, avg=true) >= 0.131
+        @test argmax(@show rouge_n(reference_summaries, candidate_summary, 1)).fmeasure >= 0.505
+        @test argmax(rouge_n(reference_summaries, candidate_summary, 2)).fmeasure >= 0.131
 
-        @test rouge_n(reference_sentences, candidate_sentence, 2, avg=true) >= 0.349
-        @test rouge_n(reference_sentences, candidate_sentence, 1, avg=true) >= 0.666
+        @test argmax(rouge_n(reference_sentences, candidate_sentence, 2)).fmeasure >= 0.349
+        @test argmax(rouge_n(reference_sentences, candidate_sentence, 1)).fmeasure >= 0.666
 
-        @test rouge_l_summary(reference_summaries, candidate_summary, 8, true) >= 0.4256
+        @test argmax(rouge_l_sentence(reference_summaries, candidate_summary, 8, weighted=true)).recall >= 0.23
+
+        @test argmax(rouge_l_summary(reference_summaries, candidate_summary, 8)).recall >= 0.23
     end
 end
+
+# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py
+# 
+# from rouge_score import rouge_scorer
+# 
+# scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
+# scores = scorer.score('The quick brown fox jumps over the lazy dog',
+#                       'The quick brown dog jumps on the log.')
+# print(scores)
+# {
+#   'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765), 
+#   'rouge2': Score(precision=0.2857142857142857, recall=0.25, fmeasure=0.26666666666666666), 
+#   'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471), 
+#   'rougeLsum': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)
+# }
+
+@testset "Compare with google-research/rouge" begin
+    reference_summaries = ["The quick brown fox jumps over the lazy dog"]
+    candidate_summary = "The quick brown dog jumps on the log"
+    @test argmax(rouge_n(reference_summaries, candidate_summary, 1)).fmeasure ≈ 0.70588
+    @test argmax(rouge_n(reference_summaries, candidate_summary, 2)).fmeasure ≈ 0.26667
+    @show rouge_l_sentence(reference_summaries, candidate_summary, 1) # result is different
+    # @test argmax(@show rouge_l_sentence(reference_summaries, candidate_summary, 1)).fmeasure ≈ 0.5882
+    @test argmax(rouge_l_summary(reference_summaries, candidate_summary, 1)).fmeasure ≈ 0.5882
+end