Skip to content

Commit

Permalink
ROUGE: changed output types, optimized, aligned with ROUGE-1.5.5 and …
Browse files Browse the repository at this point in the history
…checked with Google Research implementation
  • Loading branch information
rssdev10 committed Oct 16, 2023
1 parent 66d7657 commit 177e5d4
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 77 deletions.
5 changes: 4 additions & 1 deletion src/TextAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ module TextAnalysis

export NaiveBayesClassifier
export tag_scheme!
export rouge_l_summary, rouge_l_sentence, rouge_n

export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax
export bleu_score

export PerceptronTagger, fit!, predict

export Vocabulary, lookup, update
Expand Down
116 changes: 86 additions & 30 deletions src/evaluation_metrics.jl
Original file line number Diff line number Diff line change
@@ -1,20 +1,68 @@
"""
rouge_n(references::Array{T}, candidate::AbstractString, n; avg::Bool, lang::Language) where T<: AbstractString
A score with precision, recall and fmeasure
"""
@kwdef struct Score
precision::Float32
recall::Float32
fmeasure::Float32
end

Base.show(io::IO, score::Score) = Base.write(io,
string(
"Score(precision=", score.precision,
", recall=", score.recall,
", fmeasure=", score.fmeasure,
")"
)
)

"""
average(scores::Vector{Score})::Score
Returns average values of scores by precision/recall/fmeasure separately
"""
function average(scores::Vector{Score})::Score
res = reduce(scores, init=zeros(Float32, 3)) do acc, i
acc + [
i.precision
i.recall
i.fmeasure
]
end
Score((res ./ length(scores))...)
end

"""
argmax(scores::Vector{Score})::Score
Returns maximum by precision value
"""
Base.argmax(scores::Vector{Score})::Score = argmax(s -> s.fmeasure, scores)

"""
rouge_n(references::Vector{<:AbstractString}, candidate::AbstractString, n::Int; lang::Language)
Compute n-gram recall between `candidate` and the `references` summaries.
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
See also: [`rouge_l_sentence`](@ref), [`rouge_l_summary`](@ref)
"""
function rouge_n(references, candidate, n; avg = true, lang = Languages.English())
function rouge_n(references::Vector{<:AbstractString}, candidate::AbstractString, n::Int;
lang=Languages.English())::Vector{Score}
ng_candidate = ngramize(lang, candidate, n)
rouge_recall = map(references) do ref
ng_ref = ngramize(lang, ref, n)
rouge_match_score(keys(ng_ref), ng_candidate) / sum(values(ng_ref))
totalGramHit = rouge_match_score(keys(ng_ref), ng_candidate)
score_r = totalGramHit / sum(values(ng_ref))
score_p = totalGramHit / sum(values(ng_candidate))
Score(
score_p,
score_r,
fmeasure_lcs(score_r, score_p)
)
end

avg == true && return jackknife_avg(rouge_recall)
return rouge_recall
end

Expand All @@ -28,65 +76,73 @@ function rouge_match_score(ref, candidate::Dict)
end

"""
rouge_l_sentence(references, candidate, β, average)
rouge_l_sentence(references::Vector{<:AbstractString}, candidate::AbstractString, β=8; weighted=false)
Calculate the ROUGE-L score between `references` and `candidate` at sentence level.
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
See also: [`rouge_n`](@ref), [`rouge_l_summary`](@ref)
"""
function rouge_l_sentence(references, candidate, β=8, average = true)
function rouge_l_sentence(references::Vector{<:AbstractString}, candidate::AbstractString, β=8; weighted=true)
ngram_cand = tokenize(Languages.English(), candidate)
rouge_l_list = Float64[]
rouge_l_list = Score[]

for ref in references
ngram_ref = tokenize(Languages.English(), ref)
r_lcs = weighted_lcs(ngram_ref, ngram_cand, true, sqrt) / length(ngram_ref)
p_lcs = weighted_lcs(ngram_ref, ngram_cand, true, sqrt) / length(ngram_cand)
score = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, score)
r_lcs = weighted_lcs(ngram_ref, ngram_cand, weighted, sqrt) / length(ngram_ref)
p_lcs = weighted_lcs(ngram_ref, ngram_cand, weighted, sqrt) / length(ngram_cand)
fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
end

if average == true
rouge_l_list = jackknife_avg(rouge_l_list)
end
return rouge_l_list
end

"""
rouge_l_summary(references, candidate, β, average)
rouge_l_summary(references, candidate, β)
Calculate the ROUGE-L score between `references` and `candidate` at summary level.
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
See also: [`rouge_l_sentence()`](@ref), [`rouge_l_summary`](@ref)
"""
function rouge_l_summary(references, candidate, β, averaging=true)
rouge_l_list = Float64[]
function rouge_l_summary(references::Vector{<:AbstractString}, candidate::AbstractString, β::Int)::Vector{Score}
rouge_l_list = Score[]
ref_sent_tokens = map(references) do ref_sents
map(split_sentences(ref_sents)) do ref_sent
tokenize(Languages.English(), ref_sent)
end
end

ref_sent_total_tokens = map(ref_sent_tokens) do ref_tokens
sum(length, ref_tokens)
end

cand_sent_list = split_sentences(candidate)
cand_sent_tokens = map(cand_sent_list) do cand_sent
tokenize(Languages.English(), cand_sent)
end

for ref in references
ref_sent_list = split_sentences(ref)
cand_total_tokens_length = sum(length, cand_sent_tokens)

for i in eachindex(ref_sent_tokens)
sum_value = 0

for ref_sent in ref_sent_list
for ref_sent in ref_sent_tokens[i]
l_ = []
arg1 = tokenize(Languages.English(), ref_sent)
for cand_sent in cand_sent_list
arg2 = tokenize(Languages.English(), cand_sent)
append!(l_, weighted_lcs_tokens(arg1, arg2, false, sqrt))
for cand_sent in cand_sent_tokens
append!(l_, weighted_lcs_tokens(ref_sent, cand_sent, false))
end
sum_value += length(unique(l_))
sum_value += count(!isempty, unique(l_))
end

r_lcs = sum_value / length(tokenize(Languages.English(), ref))
p_lcs = sum_value / length(tokenize(Languages.English(), candidate))
score = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list,score)
r_lcs = sum_value / ref_sent_total_tokens[i]
p_lcs = sum_value / cand_total_tokens_length
fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
end

averaging == true && return jackknife_avg(rouge_l_list)
return rouge_l_list
end
49 changes: 9 additions & 40 deletions src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,27 +1,3 @@
# The jackknife is a resampling technique especially useful for variance and bias estimation.
"""
jackknife_avg(`scores`)
Apply jackknife on the input list of `scores`
"""
function jackknife_avg(scores)
if length(collect(Set(scores))) == 1
#= In case the elements of the array are all equal=#
return scores[1]
else
#=store the maximum scores
from the m different sets of m-1 scores.
such that m is the len(score_list)=#
average = []
for i in scores
# dummy : list a particular combo of m-1 scores
dummy = [j for j in scores if i != j]
append!(average, max(dummy...))
end

return sum(average)/length(average)
end
end

"""
weighted_lcs(X, Y, weight_score::Bool, returns_string::Bool, weigthing_function::Function)
Expand All @@ -31,16 +7,16 @@ Compute the Weighted Longest Common Subsequence of X and Y.
function weighted_lcs(X, Y, weighted=true, f=sqrt)
result = weighted_lcs_inner(X, Y, weighted, f)

return result.lcs_length
return result.c_table[end, end]
end

function weighted_lcs_tokens(X, Y, weighted=true, f=sqrt)
m, n, c_table, w_table, lcs_length = weighted_lcs_inner(X, Y, weighted, f)
m, n, c_table, w_table = weighted_lcs_inner(X, Y, weighted, f)

# if weighted == true
# lcs_length = c_table[m, n]^(2) # ?....
# end

lcs_length = m
lcs = ["" for i in 1:(lcs_length+1)]
i = m + 1
j = n + 1
Expand All @@ -63,7 +39,7 @@ end

function weighted_lcs_inner(X, Y, weighted=true, f=sqrt)
m, n = length(X), length(Y)
c_table = zeros(Int32, m + 1, n + 1)
c_table = zeros(Float32, m + 1, n + 1)
w_table = zeros(Int32, m + 1, n + 1)
increment = 1

Expand All @@ -78,12 +54,12 @@ function weighted_lcs_inner(X, Y, weighted=true, f=sqrt)
w_table[i, j] = k + 1
else
c_table[i, j] = max(c_table[i-1, j], c_table[i, j-1])
w_table[i, j] = 0 # no match at i,j
# w_table[i, j] = 0 # no match at i,j
end
end
end

(m=m, n=n, c_table=c_table, w_table=w_table, lcs_length=c_table[m+1, n+1])
(m=m, n=n, c_table=c_table, w_table=w_table)
end


Expand All @@ -98,14 +74,7 @@ Compute the F-measure based on WLCS.
- `PLCS` - Precision Factor
- `β` - Parameter
"""
function fmeasure_lcs(RLCS, PLCS, β=1)
try
return ((1 + β^2) * RLCS * PLCS) / (RLCS +^2) * PLCS)
catch ex
if ex isa DivideError
return 0
else
rethrow(ex)
end
end
function fmeasure_lcs(RLCS::Real, PLCS::Real, β=1.0)::Real
divider = RLCS +^2) * PLCS
return iszero(divider) ? 0.0 : (1 + β^2) * RLCS * PLCS / divider
end
56 changes: 50 additions & 6 deletions test/evaluation_metrics.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,61 @@
using TextAnalysis
using Test

@testset "Service functions check" begin
@test argmax([
Score(0, 1, 2),
Score(3, 0, 0),
Score(0, 6, 1)
]) == Score(0, 1, 2)

@test average([
Score(1, 10, 100),
Score(2, 20, 200),
Score(3, 30, 300)
]) == Score(2, 20, 200)
end

@testset "Evaluation Metrics" begin
@testset "Rouge" begin
candidate_sentence = "Brazil, Russia, China and India are growing nations"
candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."

reference_sentences = ["Brazil, Russia, India and China are growing nations", "Brazil and India are two of the developing nations that are part of the BRIC"]
reference_summaries = ["Brazil, Russia, India and China are the next big poltical powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]

@test rouge_n(reference_summaries, candidate_summary, 1, avg=true) >= 0.505
@test rouge_n(reference_summaries, candidate_summary, 2, avg=true) >= 0.131
@test argmax(@show rouge_n(reference_summaries, candidate_summary, 1)).fmeasure >= 0.505
@test argmax(rouge_n(reference_summaries, candidate_summary, 2)).fmeasure >= 0.131

@test rouge_n(reference_sentences, candidate_sentence, 2, avg=true) >= 0.349
@test rouge_n(reference_sentences, candidate_sentence, 1, avg=true) >= 0.666
@test argmax(rouge_n(reference_sentences, candidate_sentence, 2)).fmeasure >= 0.349
@test argmax(rouge_n(reference_sentences, candidate_sentence, 1)).fmeasure >= 0.666

@test rouge_l_summary(reference_summaries, candidate_summary, 8, true) >= 0.4256
@test argmax(rouge_l_sentence(reference_summaries, candidate_summary, 8, weighted=true)).recall >= 0.23

@test argmax(rouge_l_summary(reference_summaries, candidate_summary, 8)).recall >= 0.23
end
end

# https://github.com/google-research/google-research/blob/master/rouge/rouge_scorer.py
#
# from rouge_score import rouge_scorer
#
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
# scores = scorer.score('The quick brown fox jumps over the lazy dog',
# 'The quick brown dog jumps on the log.')
# print(scores)
# {
# 'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765),
# 'rouge2': Score(precision=0.2857142857142857, recall=0.25, fmeasure=0.26666666666666666),
# 'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471),
# 'rougeLsum': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)
# }

@testset "Compare with google-research/rouge" begin
reference_summaries = ["The quick brown fox jumps over the lazy dog"]
candidate_summary = "The quick brown dog jumps on the log"
@test argmax(rouge_n(reference_summaries, candidate_summary, 1)).fmeasure 0.70588
@test argmax(rouge_n(reference_summaries, candidate_summary, 2)).fmeasure 0.26667
@show rouge_l_sentence(reference_summaries, candidate_summary, 1) # result is different
# @test argmax(@show rouge_l_sentence(reference_summaries, candidate_summary, 1)).fmeasure ≈ 0.5882
@test argmax(rouge_l_summary(reference_summaries, candidate_summary, 1)).fmeasure 0.5882
end

0 comments on commit 177e5d4

Please sign in to comment.