ROUGE: fixed sentence calculation and some minor refactoring (#272)
ROUGE:
* fixed sentence calculation and some minor refactoring
* changed output types, optimized, and aligned with ROUGE-1.5.5; checked against the Google Research implementation
* updated docs
* rouge_l_sentence: added an argument for a weighting function
rssdev10 authored Oct 24, 2023
1 parent e4b1564 commit 14533f0
Showing 6 changed files with 262 additions and 144 deletions.
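Before the diffs, a minimal sketch of the reworked API that the changes above describe (the summary strings are illustrative, not from the commit):

```julia
using TextAnalysis

refs = ["Brazil, Russia, India and China are known as the BRIC(S)."]
cand = "Brazil, Russia, China and India are growing nations."

scores = rouge_n(refs, cand, 1)   # now a Vector{Score}, one score per reference
best   = argmax(scores)           # the Score with the highest f-measure
avg    = average(scores)          # field-wise mean of the scores
```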
2 changes: 2 additions & 0 deletions Project.toml
@@ -7,6 +7,7 @@ version = "0.7.5"
[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,6 +25,7 @@ WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
[compat]
DataStructures = "0.17, 0.18"
DelimitedFiles = "1"
DocStringExtensions = "0.9"
JSON = "0.21"
Languages = "0.4"
ProgressMeter = "1.5"
34 changes: 13 additions & 21 deletions docs/src/evaluation_metrics.md
@@ -8,36 +8,28 @@ As of now TextAnalysis provides the following evaluation metrics.

* [BLEU (bilingual evaluation understudy)](https://en.wikipedia.org/wiki/BLEU)

## ROUGE-N
## ROUGE-N, ROUGE-L, ROUGE-L-Summary
These metrics evaluate the overlap of N-grams
between the system and reference summaries.

```@docs
argmax
average
rouge_n
rouge_l_sentence
rouge_l_summary
```

The function takes the following arguments -

* `references::Array{T} where T<: AbstractString` = The list of reference summaries.
* `candidate::AbstractString` = Input candidate summary, to be scored against reference summaries.
* `n::Integer` = Order of NGrams
* `avg::Bool` = Setting this parameter to `true`, applies jackkniving the calculated scores. Defaults to `true`
* `lang::Language` = Language of the text, useful while generating N-grams. Defaults to English i.e. Languages.English()

```julia
julia> candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
"Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."

julia> reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]
2-element Array{String,1}:
"Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea."
"Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."
```@example
using TextAnalysis
julia> rouge_n(reference_summaries, candidate_summary, 2, avg=true)
0.1317241379310345
candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]
julia> rouge_n(reference_summaries, candidate_summary, 1, avg=true)
0.5051282051282051
results = [
rouge_n(reference_summaries, candidate_summary, 2),
rouge_n(reference_summaries, candidate_summary, 1)
] .|> argmax
```

## BLEU (bilingual evaluation understudy)
5 changes: 4 additions & 1 deletion src/TextAnalysis.jl
@@ -12,6 +12,7 @@ module TextAnalysis
using Statistics
using Serialization
using ProgressMeter
using DocStringExtensions

import Base: depwarn, merge!
import Serialization: serialize, deserialize
@@ -60,8 +61,10 @@

export NaiveBayesClassifier
export tag_scheme!
export rouge_l_summary, rouge_l_sentence, rouge_n

export rouge_l_summary, rouge_l_sentence, rouge_n, Score, average, argmax
export bleu_score

export PerceptronTagger, fit!, predict

export Vocabulary, lookup, update
184 changes: 143 additions & 41 deletions src/evaluation_metrics.jl
@@ -1,95 +1,197 @@
"""
rouge_n(references::Array{T}, candidate::AbstractString, n; avg::Bool, lang::Language) where T<: AbstractString
$(TYPEDEF)
$(TYPEDFIELDS)
"""
struct Score
precision::Float32
recall::Float32
fmeasure::Float32

@doc """
$(TYPEDSIGNATURES)
Stores the result of an evaluation
"""
Score(precision::AbstractFloat, recall::AbstractFloat, fmeasure::AbstractFloat) =
new(precision, recall, fmeasure)

@doc """
$(TYPEDSIGNATURES)
"""
Score(; precision=0.0, recall=0.0, fmeasure=0.0) =
new(precision, recall, fmeasure)
end

Base.show(io::IO, score::Score) = Base.write(io,
string(
"Score(precision=", score.precision,
", recall=", score.recall,
", fmeasure=", score.fmeasure,
")"
)
)
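
As an annotation (not part of the diff): a `Score` can be built positionally or via keywords, and `show` prints all three fields, which are stored as `Float32`.

```julia
using TextAnalysis

s = Score(0.25, 0.5, 1 / 3)                  # positional constructor
println(s)                                   # Score(precision=0.25, recall=0.5, fmeasure=0.33333334)
println(Score(precision=0.25, recall=0.5))   # keyword constructor; fmeasure defaults to 0.0
```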

"""
average(scores::Vector{Score})::Score
* `scores` - vector of [`Score`](@ref)
Returns the average of the scores as a single [`Score`](@ref) with field-wise averaged precision/recall/fmeasure
"""
function average(scores::Vector{Score})::Score
res = reduce(scores, init=zeros(Float32, 3)) do acc, i
acc + [
i.precision
i.recall
i.fmeasure
]
end
Score((res ./ length(scores))...)
end
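
A quick usage sketch for `average` (annotation, values illustrative): the reduction sums each field and divides by the number of scores.

```julia
scores = [Score(0.2, 0.4, 0.26), Score(0.4, 0.6, 0.48)]
average(scores)   # ≈ Score(precision=0.3, recall=0.5, fmeasure=0.37)
```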

"""
argmax(scores::Vector{Score})::Score
* `scores` - vector of [`Score`](@ref)
Returns the [`Score`](@ref) with the maximum `fmeasure` field
"""
Base.argmax(scores::Vector{Score})::Score = argmax(s -> s.fmeasure, scores)
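
Note that this `argmax` method returns the element itself rather than an index, selecting by `fmeasure`; a quick sketch:

```julia
argmax([Score(0.9, 0.1, 0.2), Score(0.3, 0.5, 0.4)])   # returns the second Score (fmeasure 0.4)
```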

"""
rouge_n(
references::Vector{<:AbstractString},
candidate::AbstractString,
n::Int;
lang::Language
)::Vector{Score}
Compute n-gram precision, recall, and f-measure between `candidate` and each of the `references` summaries.
The function takes the following arguments -
* `references::Vector{T} where T<: AbstractString` = The list of reference summaries.
* `candidate::AbstractString` = Input candidate summary, to be scored against reference summaries.
* `n::Integer` = Order of NGrams
* `lang::Language` = Language of the text, useful while generating N-grams. Default value is Languages.English()
Returns a vector of [`Score`](@ref)
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
See also: [`rouge_l_sentence`](@ref), [`rouge_l_summary`](@ref)
"""
function rouge_n(references, candidate, n; avg = true, lang = Languages.English())
function rouge_n(references::Vector{<:AbstractString}, candidate::AbstractString, n::Int;
lang=Languages.English())::Vector{Score}
ng_candidate = ngramize(lang, candidate, n)
ng_refs = [ngramize(lang, ref, n) for ref in references]

rouge_recall = Array{Float64,1}()
for ref in ng_refs
push!(rouge_recall, rouge_match_score(keys(ref), ng_candidate) / sum(values(ref)) )
rouge_recall = map(references) do ref
ng_ref = ngramize(lang, ref, n)
totalGramHit = rouge_match_score(keys(ng_ref), ng_candidate)
score_r = totalGramHit / sum(values(ng_ref))
score_p = totalGramHit / sum(values(ng_candidate))
Score(
score_p,
score_r,
fmeasure_lcs(score_r, score_p)
)
end

avg == true && return jackknife_avg(rouge_recall)
return rouge_recall
end
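
A hedged example of the new per-reference behaviour of `rouge_n` (annotation; the exact numbers depend on tokenization, so none are shown):

```julia
using TextAnalysis, Languages

refs = ["The cat sat on the mat.", "A cat was on the mat."]
cand = "The cat was on the mat."

bigram_scores = rouge_n(refs, cand, 2; lang=Languages.English())   # Vector{Score}, one per reference
argmax(bigram_scores)                                              # best-matching reference's Score
```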

function rouge_match_score(ref, candidate::Dict)
matches = 0
for p in keys(candidate)
for (p, v) in candidate
p ∉ ref && continue
matches += candidate[p]
matches += v
end
return matches
end

"""
rouge_l_sentence(references, candidate, β, average)
rouge_l_sentence(
references::Vector{<:AbstractString}, candidate::AbstractString, β=8;
weighted=false, weight_func=sqrt,
lang=Languages.English()
)::Vector{Score}
Calculate the ROUGE-L score between `references` and `candidate` at sentence level.
Returns a vector of [`Score`](@ref)
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
Note: the `weighted` argument enables weighting of values when calculating the longest common subsequence.
The original ROUGE-1.5.5.pl implementation uses a power function; here `weight_func` defaults to `sqrt`, i.e. a power of 0.5.
See also: [`rouge_n`](@ref), [`rouge_l_summary`](@ref)
"""
function rouge_l_sentence(references, candidate, β=8, average = true)
ngram_cand = tokenize(Languages.English(), candidate)
rouge_l_list = []
function rouge_l_sentence(references::Vector{<:AbstractString}, candidate::AbstractString, β=8;
weighted=false, weight_func=sqrt, lang=Languages.English())::Vector{Score}
ngram_cand = tokenize(lang, candidate)
rouge_l_list = Score[]

for ref in references
ngram_ref = tokenize(Languages.English(), ref)
r_lcs = weighted_lcs(ngram_ref, ngram_cand, true, false, sqrt) / length(ngram_ref)
p_lcs = weighted_lcs(ngram_ref, ngram_cand, true, false, sqrt) / length(ngram_cand)
score = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, score)
ngram_ref = tokenize(lang, ref)
lcs = weighted_lcs(ngram_ref, ngram_cand, weighted, weight_func)
r_lcs = lcs / length(ngram_ref)
p_lcs = lcs / length(ngram_cand)
fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
end

if average == true
rouge_l_list = jackknife_avg(rouge_l_list)
end
return rouge_l_list
end
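
A sketch of the reworked keyword interface (annotation, not from the commit): `weighted=true` with the default `weight_func=sqrt` reproduces the 0.5 power noted in the docstring.

```julia
refs = ["The cat sat on the mat."]
cand = "The cat was found on the mat."

plain    = rouge_l_sentence(refs, cand)                    # unweighted LCS, β defaults to 8
weighted = rouge_l_sentence(refs, cand, 8; weighted=true)  # weighted LCS via weight_func=sqrt
```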

"""
rouge_l_summary(references, candidate, β, average)
rouge_l_summary(
references::Vector{<:AbstractString}, candidate::AbstractString, β::Int;
lang=Languages.English()
)::Vector{Score}
Calculate the ROUGE-L score between `references` and `candidate` at summary level.
Returns a vector of [`Score`](@ref)
See [Rouge: A package for automatic evaluation of summaries](http://www.aclweb.org/anthology/W04-1013)
See also: [`rouge_l_sentence()`](@ref), [`rouge_l_summary`](@ref)
See also: [`rouge_l_sentence()`](@ref), [`rouge_n`](@ref)
"""
function rouge_l_summary(references, candidate, β, averaging=true)
rouge_l_list = []
function rouge_l_summary(references::Vector{<:AbstractString}, candidate::AbstractString, β::Int;
lang=Languages.English())::Vector{Score}
rouge_l_list = Score[]
ref_sent_tokens = map(references) do ref_sents
map(split_sentences(ref_sents)) do ref_sent
tokenize(lang, ref_sent)
end
end

ref_sent_total_tokens = map(ref_sent_tokens) do ref_tokens
sum(length, ref_tokens)
end

cand_sent_list = split_sentences(candidate)
cand_sent_tokens = map(cand_sent_list) do cand_sent
tokenize(lang, cand_sent)
end

for ref in references
ref_sent_list = split_sentences(ref)
cand_total_tokens_length = sum(length, cand_sent_tokens)

for i in eachindex(ref_sent_tokens)
sum_value = 0

for ref_sent in ref_sent_list
l_ = []
arg1 = tokenize(Languages.English(), ref)
for cand_sent in cand_sent_list
arg2 = tokenize(Languages.English(), cand_sent)
d = tokenize(Languages.English(), weighted_lcs(arg1, arg2, false, true, sqrt))
append!(l_,d)
for ref_sent in ref_sent_tokens[i]
l_ = reduce(cand_sent_tokens, init=String[]) do acc, cand_sent
append!(acc, weighted_lcs_tokens(ref_sent, cand_sent, false))
end
sum_value += length(unique(l_))
sum_value += count(!isempty, unique(l_))
end

r_lcs = sum_value / length(tokenize(Languages.English(), ref))
p_lcs = sum_value / length(tokenize(Languages.English(), candidate))
score = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list,score)
r_lcs = sum_value / ref_sent_total_tokens[i]
p_lcs = sum_value / cand_total_tokens_length
fmeasure = fmeasure_lcs(r_lcs, p_lcs, β)
push!(rouge_l_list, Score(p_lcs, r_lcs, fmeasure))
end

averaging == true && return jackknife_avg(rouge_l_list)
return rouge_l_list
end
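
Finally, a sketch of the summary-level call (annotation; β=8 mirrors the sentence-level default): both candidate and references are split into sentences before LCS matching.

```julia
refs = ["The cat sat on the mat. It was black.",
        "A black cat was sitting on the mat."]
cand = "The cat was on the mat. The cat was black."

average(rouge_l_summary(refs, cand, 8))   # mean of the per-reference Scores
```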