ROUGE: fixed sentence calculation and some minor refactoring (#272)
ROUGE:
* fixed sentence calculation and some minor refactoring
* changed output types, optimized, and aligned with ROUGE-1.5.5; checked against the Google Research implementation
* updated docs
* rouge_l_sentence: added an argument for a weighting function
rssdev10 authored Oct 24, 2023
1 parent e4b1564 commit 14533f0
Showing 6 changed files with 262 additions and 144 deletions.
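Before the diffs, a minimal sketch of the reworked API that the changes above describe (the summary strings are illustrative, not from the commit):

```julia
using TextAnalysis

refs = ["Brazil, Russia, India and China are known as the BRIC(S)."]
cand = "Brazil, Russia, China and India are growing nations."

scores = rouge_n(refs, cand, 1)   # now a Vector{Score}, one score per reference
best   = argmax(scores)           # the Score with the highest f-measure
avg    = average(scores)          # field-wise mean of the scores
```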
2 changes: 2 additions & 0 deletions Project.toml
@@ -7,6 +7,7 @@ version = "0.7.5"
[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,6 +25,7 @@ WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
[compat]
DataStructures = "0.17, 0.18"
DelimitedFiles = "1"
DocStringExtensions = "0.9"
JSON = "0.21"
Languages = "0.4"
ProgressMeter = "1.5"
34 changes: 13 additions & 21 deletions docs/src/evaluation_metrics.md
@@ -8,36 +8,28 @@ As of now TextAnalysis provides the following evaluation metrics.

* [BLEU (bilingual evaluation understudy)](https://en.wikipedia.org/wiki/BLEU)

## ROUGE-N
## ROUGE-N, ROUGE-L, ROUGE-L-Summary
These metrics evaluate the overlap of N-grams
between the system and reference summaries.

```@docs
argmax
average
rouge_n
rouge_l_sentence
rouge_l_summary
```

The function takes the following arguments -

* `references::Array{T} where T<: AbstractString` = The list of reference summaries.
* `candidate::AbstractString` = Input candidate summary, to be scored against reference summaries.
* `n::Integer` = Order of NGrams
* `avg::Bool` = Setting this parameter to `true`, applies jackkniving the calculated scores. Defaults to `true`
* `lang::Language` = Language of the text, useful while generating N-grams. Defaults to English i.e. Languages.English()

```julia
julia> candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
"Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."

julia> reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]
2-element Array{String,1}:
"Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea."
"Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."
```@example
using TextAnalysis
julia> rouge_n(reference_summaries, candidate_summary, 2, avg=true)
0.1317241379310345
candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]
julia> rouge_n(reference_summaries, candidate_summary, 1, avg=true)
0.5051282051282051
results = [
rouge_n(reference_summaries, candidate_summary, 2),
rouge_n(reference_summaries, candidate_summary, 1)
] .|> argmax
```

## BLEU (bilingual evaluation understudy)
5 changes: 4 additions & 1 deletion src/TextAnalysis.jl
@@ -12,6 +12,7 @@ module TextAnalysis
using Statistics
using Serialization
using ProgressMeter
using DocStringExtensions

import Base: depwarn, merge!
import Serialization: serialize, deserialize
@@ -60,8 +61,10 @@

export NaiveBayesClassifier
export tag_scheme!
export rouge_l_summary, rouge_l_sentence, rouge_n

export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax
export bleu_score

export PerceptronTagger, fit!, predict

export Vocabulary, lookup, update
184 changes: 143 additions & 41 deletions src/evaluation_metrics.jl
@@ -1,95 +1,197 @@
"""
rouge_n(references::Array{T}, candidate::AbstractString, n; avg::Bool, lang::Language) where T<: AbstractString
$(TYPEDEF)
$(TYPEDFIELDS)
"""
struct Score
precision::Float32
recall::Float32
fmeasure::Float32

@doc """
$(TYPEDSIGNATURES)
Stores the result of an evaluation
"""
Score(precision::AbstractFloat, recall::AbstractFloat, fmeasure::AbstractFloat) =
new(precision, recall, fmeasure)

@doc """
$(TYPEDSIGNATURES)
"""
Score(; precision=0.0, recall=0.0, fmeasure=0.0) =
new(precision, recall, fmeasure)
end

Base.show(io::IO, score::Score) = Base.write(io,
string(
"Score(precision=", score.precision,
", recall=", score.recall,
", fmeasure=", score.fmeasure,
")"
)
)
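
As an annotation (not part of the diff): a `Score` can be built positionally or via keywords, and `show` prints all three fields, which are stored as `Float32`.

```julia
using TextAnalysis

s = Score(0.25, 0.5, 1 / 3)                  # positional constructor
println(s)                                   # Score(precision=0.25, recall=0.5, fmeasure=0.33333334)
println(Score(precision=0.25, recall=0.5))   # keyword constructor; fmeasure defaults to 0.0
```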

"""
average(scores::Vector{Score})::Score
* `scores` - vector of [`Score`](@ref)
Returns the average of the scores as a single [`Score`](@ref) with field-wise averaged precision/recall/fmeasure
"""
function average(scores::Vector{Score})::Score
res = reduce(scores, init=zeros(Float32, 3)) do acc, i
acc + [
i.precision
i.recall
i.fmeasure
]
end
Score((res ./ length(scores))...)
end
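
A quick usage sketch for `average` (annotation, values illustrative): the reduction sums each field and divides by the number of scores.

```julia
scores = [Score(0.2, 0.4, 0.26), Score(0.4, 0.6, 0.48)]
average(scores)   # ≈ Score(precision=0.3, recall=0.5, fmeasure=0.37)
```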

"""
argmax(scores::Vector{Score})::Score
* `scores` - vector of [`Score`](@ref)
Returns the [`Score`](@ref) with the maximum `fmeasure` field
"""
Base.argmax(scores::Vector{Score})::Score = argmax(s -> s.fmeasure, scores)
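
Note that this `argmax` method returns the element itself rather than an index, selecting by `fmeasure`; a quick sketch:

```julia
argmax([Score(0.9, 0.1, 0.2), Score(0.3, 0.5, 0.4)])   # returns the second Score (fmeasure 0.4)
```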

"""
rouge_n(
references::Vector{<:AbstractString},
candidate::AbstractString,
n::Int;
lang::Language
)::Vector{Score}
Compute n-gram precision, recall, and f-measure between `candidate` and each of the `references` summaries.
The function takes the following arguments -
* `references::Vector{T} where T<: AbstractString` = The list of reference summaries.
* `candidate::AbstractString` = Input candidate summary, to be scored against reference summaries.
* `n::Integer` = Order of NGrams
* `lang::Language` = Language of the text, useful while generating N-grams. Default value is Languages.English()
Returns a vector of [`Score`](@ref)
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
See also: [`rouge_l_sentence`](@ref), [`rouge_l_summary`](@ref)
"""
function rouge_n(references, candidate, n; avg = true, lang = Languages.English())
function rouge_n(references::Vector{<:AbstractString}, candidate::AbstractString, n::Int;
lang=Languages.English())::Vector{Score}
ng_candidate = ngramize(lang, candidate, n)
ng_refs = [ngramize(lang, ref, n) for ref in references]

rouge_recall = Array{Float64,1}()
for ref in ng_refs
push!(rouge_recall, rouge_match_score(keys(ref), ng_candidate) / sum(values(ref)) )
rouge_recall = map(references) do ref
ng_ref = ngramize(lang, ref, n)
totalGramHit = rouge_match_score(keys(ng_ref), ng_candidate)
score_r = totalGramHit / sum(values(ng_ref))
score_p = totalGramHit / sum(values(ng_candidate))
Score(
score_p,
score_r,
fmeasure_lcs(score_r, score_p)
)
end

avg == true && return jackknife_avg(rouge_recall)
return rouge_recall
end
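
A hedged example of the new per-reference behaviour of `rouge_n` (annotation; the exact numbers depend on tokenization, so none are shown):

```julia
using TextAnalysis, Languages

refs = ["The cat sat on the mat.", "A cat was on the mat."]
cand = "The cat was on the mat."

bigram_scores = rouge_n(refs, cand, 2; lang=Languages.English())   # Vector{Score}, one per reference
argmax(bigram_scores)                                              # best-matching reference's Score
```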

function rouge_match_score(ref, candidate::Dict)
matches = 0
for p in keys(candidate)
for (p, v) in candidate
p ∉ ref && continue
matches += candidate[p]
matches += v
end
return matches
end

"""
rouge_l_sentence(references, candidate, β, average)
rouge_l_sentence(
references::Vector{<:AbstractString}, candidate::AbstractString, β=8;
weighted=false, weight_func=sqrt,
lang=Languages.English()
)::Vector{Score}
Calculate the ROUGE-L score between `references` and `candidate` at sentence level.
Returns a vector of [`Score`](@ref)
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
Note: the `weighted` argument enables weighting of values when calculating the longest common subsequence.
The original ROUGE-1.5.5.pl implementation uses a power function; here `weight_func` defaults to `sqrt`, i.e. a power of 0.5.
See also: [`rouge_n`](@ref), [`rouge_l_summary`](@ref)
"""
function rouge_l_sentence(references, candidate, β=8, average = true)
ngram_cand = tokenize(Languages.English(), candidate)
rouge_l_list = []
function rouge_l_sentence(references::Vector{<:AbstractString}, candidate::AbstractString, β=8;
weighted=false, weight_func=sqrt, lang=Languages.English())::Vector{Score}
ngram_cand = tokenize(lang, candidate)
rouge_l_list = Score[]

for ref in references
ngram_ref = tokenize(Languages.English(), ref)
r_lcs = weighted_lcs(ngram_ref, ngram_cand, true, false, sqrt) / length(ngram_ref)
p_lcs = weighted_lcs(ngram_ref, ngram_cand, true, false, sqrt) / length(ngram_cand)
score = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, score)
ngram_ref = tokenize(lang, ref)
lcs = weighted_lcs(ngram_ref, ngram_cand, weighted, weight_func)
r_lcs = lcs / length(ngram_ref)
p_lcs = lcs / length(ngram_cand)
fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
end

if average == true
rouge_l_list = jackknife_avg(rouge_l_list)
end
return rouge_l_list
end
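
A sketch of the reworked keyword interface (annotation, not from the commit): `weighted=true` with the default `weight_func=sqrt` reproduces the 0.5 power noted in the docstring.

```julia
refs = ["The cat sat on the mat."]
cand = "The cat was found on the mat."

plain    = rouge_l_sentence(refs, cand)                    # unweighted LCS, β defaults to 8
weighted = rouge_l_sentence(refs, cand, 8; weighted=true)  # weighted LCS via weight_func=sqrt
```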

"""
rouge_l_summary(references, candidate, β, average)
rouge_l_summary(
references::Vector{<:AbstractString}, candidate::AbstractString, β::Int;
lang=Languages.English()
)::Vector{Score}
Calculate the ROUGE-L score between `references` and `candidate` at summary level.
Returns a vector of [`Score`](@ref)
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
See also: [`rouge_l_sentence()`](@ref), [`rouge_l_summary`](@ref)
See also: [`rouge_l_sentence()`](@ref), [`rouge_n`](@ref)
"""
function rouge_l_summary(references, candidate, β, averaging=true)
rouge_l_list = []
function rouge_l_summary(references::Vector{<:AbstractString}, candidate::AbstractString, β::Int;
lang=Languages.English())::Vector{Score}
rouge_l_list = Score[]
ref_sent_tokens = map(references) do ref_sents
map(split_sentences(ref_sents)) do ref_sent
tokenize(lang, ref_sent)
end
end

ref_sent_total_tokens = map(ref_sent_tokens) do ref_tokens
sum(length, ref_tokens)
end

cand_sent_list = split_sentences(candidate)
cand_sent_tokens = map(cand_sent_list) do cand_sent
tokenize(lang, cand_sent)
end

for ref in references
ref_sent_list = split_sentences(ref)
cand_total_tokens_length = sum(length, cand_sent_tokens)

for i in eachindex(ref_sent_tokens)
sum_value = 0

for ref_sent in ref_sent_list
l_ = []
arg1 = tokenize(Languages.English(), ref)
for cand_sent in cand_sent_list
arg2 = tokenize(Languages.English(), cand_sent)
d = tokenize(Languages.English(), weighted_lcs(arg1, arg2, false, true, sqrt))
append!(l_,d)
for ref_sent in ref_sent_tokens[i]
l_ = reduce(cand_sent_tokens, init=String[]) do acc, cand_sent
append!(acc, weighted_lcs_tokens(ref_sent, cand_sent, false))
end
sum_value += length(unique(l_))
sum_value += count(!isempty, unique(l_))
end

r_lcs = sum_value / length(tokenize(Languages.English(), ref))
p_lcs = sum_value / length(tokenize(Languages.English(), candidate))
score = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list,score)
r_lcs = sum_value / ref_sent_total_tokens[i]
p_lcs = sum_value / cand_total_tokens_length
fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
end

averaging == true && return jackknife_avg(rouge_l_list)
return rouge_l_list
end
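
Finally, a sketch of the summary-level call (annotation; β=8 mirrors the sentence-level default): both candidate and references are split into sentences before LCS matching.

```julia
refs = ["The cat sat on the mat. It was black.",
        "A black cat was sitting on the mat."]
cand = "The cat was on the mat. The cat was black."

average(rouge_l_summary(refs, cand, 8))   # mean of the per-reference Scores
```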