diff --git a/Project.toml b/Project.toml
index aad38d52..da220465 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,6 +7,7 @@ version = "0.7.5"
 [deps]
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
+DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Languages = "8ef0a80b-9436-5d2c-a485-80b904378c43"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -24,6 +25,7 @@ WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
 [compat]
 DataStructures = "0.17, 0.18"
 DelimitedFiles = "1"
+DocStringExtensions = "0.9"
 JSON = "0.21"
 Languages = "0.4"
 ProgressMeter = "1.5"
diff --git a/docs/src/evaluation_metrics.md b/docs/src/evaluation_metrics.md
index 07a7d484..a07258cf 100644
--- a/docs/src/evaluation_metrics.md
+++ b/docs/src/evaluation_metrics.md
@@ -8,36 +8,28 @@ As of now TextAnalysis provides the following evaluation metrics.
 * [BLEU (bilingual evaluation understudy)](https://en.wikipedia.org/wiki/BLEU)
 
-## ROUGE-N
+## ROUGE-N, ROUGE-L, ROUGE-L-Summary
 
 This metric evaluation based on the overlap of N-grams between the system and reference summaries.
 
 ```@docs
+argmax
+average
 rouge_n
+rouge_l_sentence
+rouge_l_summary
 ```
 
-The function takes the following arguments -
-
-* `references::Array{T} where T<: AbstractString` = The list of reference summaries.
-* `candidate::AbstractString` = Input candidate summary, to be scored against reference summaries.
-* `n::Integer` = Order of NGrams
-* `avg::Bool` = Setting this parameter to `true`, applies jackkniving the calculated scores. Defaults to `true`
-* `lang::Language` = Language of the text, useful while generating N-grams. Defaults to English i.e. Languages.English()
-
-```julia
-julia> candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
-"Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
-
-julia> reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]
-2-element Array{String,1}:
- "Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea."
- "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."
+```@example
+using TextAnalysis
 
-julia> rouge_n(reference_summaries, candidate_summary, 2, avg=true)
-0.1317241379310345
+candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."
+reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]
 
-julia> rouge_n(reference_summaries, candidate_summary, 1, avg=true)
-0.5051282051282051
+results = [
+    rouge_n(reference_summaries, candidate_summary, 2),
+    rouge_n(reference_summaries, candidate_summary, 1)
+] .|> argmax
 ```
 
 ## BLEU (bilingual evaluation understudy)
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 699d3eb1..53c1470d 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -12,6 +12,7 @@ module TextAnalysis
     using Statistics
     using Serialization
     using ProgressMeter
+    using DocStringExtensions
 
     import Base: depwarn, merge!
     import Serialization: serialize, deserialize
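
Reviewer note: the reworked manual example pipes `rouge_n` results through `argmax`, which suggests the ROUGE functions now return one score per reference summary rather than a single averaged `Float64` (the old `avg` keyword is gone). A minimal usage sketch under that assumption, where `argmax` picks the score against the best-matching reference and `average` (also listed in the new `@docs` block) reduces the vector to a mean; the variable names here are illustrative, not part of the PR:

```julia
using TextAnalysis

candidate = "Brazil, Russia, China and India are growing nations."
references = [
    "Brazil, Russia, India and China are the next big political powers in the global economy.",
    "Brazil, Russia, India and China are together known as the BRIC(S).",
]

# Assumed return shape: one score per reference summary.
scores = rouge_n(references, candidate, 2)

best = argmax(scores)    # score against the best-matching reference
avg = average(scores)    # mean score across all references
```

If that reading is right, the docs example could also call `average`, since it is documented in the same `@docs` block but never demonstrated.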
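The new DocStringExtensions dependency is presumably what backs the expanded docstrings behind those `@docs` entries, via interpolated abbreviations. For anyone unfamiliar with the pattern, a self-contained sketch; the `greet` function is purely illustrative and not part of this PR:

```julia
using DocStringExtensions

# The SIGNATURES abbreviation below is replaced with the generated
# method signature when the documentation is built.
"""
$(SIGNATURES)

Return a greeting for `name`.
"""
greet(name::AbstractString) = "Hello, " * name
```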