From ce69b59d41ea2fb06f5fa0e1cbdc6b62e6ab6e08 Mon Sep 17 00:00:00 2001 From: Alex Tantos Date: Tue, 9 Jan 2024 11:55:31 +0200 Subject: [PATCH] Directional coom (#264) * Adding the directional co-occurrence matrix. * Added tests --- src/coom.jl | 108 +++++++++++++++++++++++++++++++-------------------- test/coom.jl | 62 +++++++++++++++++++++++------ 2 files changed, 116 insertions(+), 54 deletions(-) diff --git a/src/coom.jl b/src/coom.jl index b5246bc8..e76cb151 100644 --- a/src/coom.jl +++ b/src/coom.jl @@ -8,14 +8,14 @@ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ - coo_matrix(::Type{T}, doc::Vector{AbstractString}, vocab::OrderedDict{AbstractString, Int}, window::Int, normalize::Bool) + coo_matrix(::Type{T}, doc::Vector{AbstractString}, vocab::OrderedDict{AbstractString, Int}, window::Int, normalize::Bool, mode::Symbol) Basic low-level function that calculates the co-occurrence matrix of a document. Returns a sparse co-occurrence matrix sized `n × n` where `n = length(vocab)` with elements of type `T`. The document `doc` is represented by a vector of its terms (in order)`. The keywords `window` and `normalize` indicate the size of the sliding word window in which co-occurrences are counted and whether to normalize -of not the counts by the distance between word positions. +or not the counts by the distance between word positions. The `mode` keyword can be either `:default` or `:directional` and indicates whether the co-occurrence matrix should be directional or not. 
This means that if `mode` is `:directional` then the co-occurrence matrix will be an `n × n` matrix where `n = length(vocab)` and `coom[i,j]` will be the number of times the term with index `i` in `vocab` co-occurs with the term with index `j` in the document `doc` (each co-occurrence is weighted by the inverse of the distance between the two word positions when `normalize` is `true`). If `mode` is `:default` then the co-occurrence matrix will be an `n × n` matrix where `n = length(vocab)` and `coom[i,j]` will be twice the number of times the term with index `i` co-occurs with the term with index `j` in the document `doc` (once for each direction, from i to j and from j to i). # Example ``` @@ -30,28 +30,48 @@ julia> using TextAnalysis, DataStructures [1, 2] = 2.0 [3, 2] = 0.3999 [2, 3] = 0.3999 + +julia> using TextAnalysis, DataStructures + doc = StringDocument("This is a text about an apple. There are many texts about apples.") + docv = TextAnalysis.tokenize(language(doc), text(doc)) + vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3) + TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional) + +3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries: + [2, 1] = 1.0 + [1, 2] = 1.0 + [3, 2] = 0.1999 + [2, 3] = 0.1999 ``` """ function coo_matrix(::Type{T}, - doc::Vector{<:AbstractString}, - vocab::OrderedDict{<:AbstractString, Int}, - window::Int, - normalize::Bool=true) where T<:AbstractFloat + doc::Vector{<:AbstractString}, + vocab::OrderedDict{<:AbstractString, + Int}, + window::Int, + normalize::Bool=true, + mode::Symbol=:default) where {T<:AbstractFloat} + # Initializations n = length(vocab) m = length(doc) coom = spzeros(T, n, n) # Count co-occurrences for (i, token) in enumerate(doc) + inner_range = if mode == :directional + i:min(m, i + window) + else + max(1, i - window):min(m, i + window) + end row = get(vocab, token, nothing) isnothing(row) && continue - @inbounds for j in max(1, i - window):min(m, i + window) + # looking forward + @inbounds for j in inner_range i == j && continue wtoken = doc[j] col = get(vocab, wtoken, nothing) isnothing(col) && continue - nm = T(ifelse(normalize, abs(i - j), 1)) coom[row, 
col] += one(T) / nm coom[col, row] = coom[row, col] @@ -60,9 +80,9 @@ function coo_matrix(::Type{T}, return coom end -coo_matrix(::Type{T}, doc::Vector{<:AbstractString}, vocab::Dict{<:AbstractString, Int}, - window::Int, normalize::Bool=true) where T<:AbstractFloat = - coo_matrix(T, doc, OrderedDict(vocab), window, normalize) +coo_matrix(::Type{T}, doc::Vector{<:AbstractString}, vocab::Dict{<:AbstractString,Int}, + window::Int, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} = + coo_matrix(T, doc, OrderedDict(vocab), window, normalize, mode) """ Basic Co-occurrence Matrix (COOM) type. @@ -75,9 +95,9 @@ the document or corpus columns of the co-occurrence matrix """ struct CooMatrix{T} - coom::SparseMatrixCSC{T, Int} + coom::SparseMatrixCSC{T,Int} terms::Vector{String} - column_indices::OrderedDict{String, Int} + column_indices::OrderedDict{String,Int} end @@ -91,66 +111,68 @@ can be a `Vector{String}`, an `AbstractDict` where the keys are the lexicon, or can be omitted, in which case the `lexicon` field of the corpus is used. 
""" function CooMatrix{T}(crps::Corpus, - terms::Vector{String}; - window::Int=5, - normalize::Bool=true) where T<:AbstractFloat + terms::Vector{String}; + window::Int=5, + normalize::Bool=true, + mode::Symbol=:default) where {T<:AbstractFloat} column_indices = OrderedDict(columnindices(terms)) n = length(terms) coom = spzeros(T, n, n) for doc in crps - coom .+= coo_matrix(T, tokens(doc), column_indices, window, normalize) + coom .+= coo_matrix(T, tokens(doc), column_indices, window, normalize, mode) end return CooMatrix{T}(coom, terms, column_indices) end -CooMatrix(crps::Corpus, terms::Vector{String}; window::Int=5, normalize::Bool=true) = - CooMatrix{Float64}(crps, terms, window=window, normalize=normalize) +CooMatrix(crps::Corpus, terms::Vector{String}; window::Int=5, normalize::Bool=true, mode::Symbol=:default) = + CooMatrix{Float64}(crps, terms, window=window, normalize=normalize, mode=mode) -CooMatrix{T}(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true - ) where T<:AbstractFloat = - CooMatrix{T}(crps, collect(keys(lex)), window=window, normalize=normalize) +CooMatrix{T}(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} = + CooMatrix{T}(crps, collect(keys(lex)), window=window, normalize=normalize, mode=mode) -CooMatrix(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true) = - CooMatrix{Float64}(crps, lex, window=window, normalize=normalize) +CooMatrix(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true, mode::Symbol=:default) = + CooMatrix{Float64}(crps, lex, window=window, normalize=normalize, mode=mode) -CooMatrix{T}(crps::Corpus; window::Int=5, normalize::Bool=true) where T<:AbstractFloat = begin +CooMatrix{T}(crps::Corpus; window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} = begin isempty(lexicon(crps)) && update_lexicon!(crps) - CooMatrix{T}(crps, lexicon(crps), window=window, normalize=normalize) + 
CooMatrix{T}(crps, lexicon(crps), window=window, normalize=normalize, mode=mode) end -CooMatrix(crps::Corpus; window::Int=5, normalize::Bool=true) = begin +CooMatrix(crps::Corpus; window::Int=5, normalize::Bool=true, mode::Symbol=:default) = begin isempty(lexicon(crps)) && update_lexicon!(crps) - CooMatrix{Float64}(crps, lexicon(crps), window=window, normalize=normalize) + CooMatrix{Float64}(crps, lexicon(crps), window=window, normalize=normalize, mode=mode) end # Document methods function CooMatrix{T}(doc::AbstractDocument, - terms::Vector{String}; - window::Int=5, - normalize::Bool=true) where T<:AbstractFloat + terms::Vector{String}; + window::Int=5, + normalize::Bool=true, + mode::Symbol=:default) where {T<:AbstractFloat} # Initializations column_indices = OrderedDict(columnindices(terms)) - coom = coo_matrix(T, tokens(doc), column_indices, window, normalize) + coom = coo_matrix(T, tokens(doc), column_indices, window, normalize, mode) return CooMatrix{T}(coom, terms, column_indices) end function CooMatrix{T}(doc::NGramDocument, - terms::Vector{String}; - window::Int=5, - normalize::Bool=true) where T <: AbstractFloat + terms::Vector{String}; + window::Int=5, + normalize::Bool=true, + mode::Symbol=:default) where {T<:AbstractFloat} error("The Co occurrence matrix of an NGramDocument can't be created.") end -CooMatrix(doc, terms::Vector{String}; window::Int=5, normalize::Bool=true) = - CooMatrix{Float64}(doc, terms, window=window, normalize=normalize) +CooMatrix(doc, terms::Vector{String}; window::Int=5, normalize::Bool=true, mode::Symbol=:default) = + CooMatrix{Float64}(doc, terms, window=window, normalize=normalize, mode=mode) -function CooMatrix{T}(doc; window::Int=5, normalize::Bool=true) where T<:AbstractFloat +function CooMatrix{T}(doc; window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} terms = unique(String.(tokens(doc))) - CooMatrix{T}(doc, terms, window=window, normalize=normalize) + CooMatrix{T}(doc, terms, 
window=window, normalize=normalize, mode=mode) end -CooMatrix(doc; window::Int=5, normalize::Bool=true) = - CooMatrix{Float64}(doc, window=window, normalize=normalize) +CooMatrix(doc; window::Int=5, normalize::Bool=true, mode::Symbol=:default) = + CooMatrix{Float64}(doc, window=window, normalize=normalize, mode=mode) """ coom(c::CooMatrix) @@ -167,5 +189,5 @@ with the `entity`. The `CooMatrix{T}` will first have to be created in order for the actual matrix to be accessed. """ coom(entity, eltype::Type{T}=Float; - window::Int=5, normalize::Bool=true) where T<:AbstractFloat = - coom(CooMatrix{T}(entity, window=window, normalize=normalize)) + window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} = + coom(CooMatrix{T}(entity, window=window, normalize=normalize, mode=mode)) diff --git a/test/coom.jl b/test/coom.jl index 4b710374..20e9759e 100644 --- a/test/coom.jl +++ b/test/coom.jl @@ -1,6 +1,6 @@ @testset "COOM (Co-occurence Matrix)" begin doc_raw = StringDocument("This is a document. It has two sentences.") - prepare!(doc_raw, strip_punctuation|strip_whitespace|strip_case) + prepare!(doc_raw, strip_punctuation | strip_whitespace | strip_case) doc = text(doc_raw) sd = StringDocument(doc) td = TokenDocument(doc) @@ -8,7 +8,8 @@ crps = Corpus([sd, td]) T = Float16 # Results for window = 5, all terms in document used - expected_result = [ # for window == 5 + # expected_result_C is the expected matrix for the normalized and default mode case. + expected_result_C = [ # for window == 5 0.0 2.0 1.0 2/3 0.5 0.4 0.0 0.0 2.0 0.0 2.0 1.0 2/3 0.5 0.4 0.0 1.0 2.0 0.0 2.0 1.0 2/3 0.5 0.4 @@ -17,14 +18,28 @@ 0.4 0.5 2/3 1.0 2.0 0.0 2.0 1.0 0.0 0.4 0.5 2/3 1.0 2.0 0.0 2.0 0.0 0.0 0.4 0.5 2/3 1.0 2.0 0.0] + + # expected_result_D is the expected matrix for the normalized and directional mode case. 
+ expected_result_D = [ # for window == 5 + 0.0 1.0 0.5 1/3 0.25 0.2 0.0 0.0 + 1.0 0.0 1.0 0.5 1/3 0.25 0.2 0.0 + 0.5 1.0 0.0 1.0 0.5 1/3 0.25 0.2 + 1/3 0.5 1.0 0.0 1.0 0.5 1/3 0.25 + 0.25 1/3 0.5 1.0 0.0 1.0 0.5 1/3 + 0.2 0.25 1/3 0.5 1.0 0.0 1.0 0.5 + 0.0 0.2 0.25 1/3 0.5 1.0 0.0 1.0 + 0.0 0.0 0.2 0.25 1/3 0.5 1.0 0.0] # Verify untyped constructor terms = tokens(td) for d in [sd, td, crps] C = TextAnalysis.CooMatrix(d, terms) + D = TextAnalysis.CooMatrix(d, terms, mode=:directional) if !(d isa Corpus) - @test TextAnalysis.coom(C) == expected_result + @test TextAnalysis.coom(C) == expected_result_C + @test TextAnalysis.coom(D) == expected_result_D else - @test TextAnalysis.coom(C) == length(crps) * expected_result + @test TextAnalysis.coom(C) == length(crps) * expected_result_C + @test TextAnalysis.coom(D) == length(crps) * expected_result_D end end @test_throws ErrorException TextAnalysis.CooMatrix(nd) @@ -33,28 +48,53 @@ terms = tokens(td) for d in [sd, td, crps] C = TextAnalysis.CooMatrix{T}(d, terms) + D = TextAnalysis.CooMatrix{T}(d, terms, mode=:directional) @test C isa TextAnalysis.CooMatrix{T} if !(d isa Corpus) - @test TextAnalysis.coom(C) == T.(expected_result) + @test TextAnalysis.coom(C) == T.(expected_result_C) + @test TextAnalysis.coom(D) == T.(expected_result_D) else - @test TextAnalysis.coom(C) == length(crps) * T.(expected_result) + @test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C) + @test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D) end end @test_throws ErrorException TextAnalysis.CooMatrix{T}(nd) # Results for window = 1, custom terms terms = ["this", "document", "it"] - expected_result = [0.0 0.0 0.0; # document - 0.0 0.0 2.0; # it - 0.0 2.0 0.0] # this + expected_result_C = [0.0 0.0 0.0; # document + 0.0 0.0 2.0; # it + 0.0 2.0 0.0] # this + + expected_result_D = [0.0 0.0 0.0; # document + 0.0 0.0 1.0; # it + 0.0 1.0 0.0] # this + # Verify untyped constructor + for d in [sd, td, crps] + C = 
TextAnalysis.CooMatrix(d, terms, window=1) + D = TextAnalysis.CooMatrix(d, terms, window=1, mode=:directional) + if !(d isa Corpus) + @test TextAnalysis.coom(C) == T.(expected_result_C) + @test TextAnalysis.coom(D) == T.(expected_result_D) + else + @test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C) + @test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D) + end + end + @test_throws ErrorException TextAnalysis.CooMatrix(nd) + + # Verify typed constructor for d in [sd, td, crps] C = TextAnalysis.CooMatrix{T}(d, terms, window=1) + D = TextAnalysis.CooMatrix{T}(d, terms, window=1, mode=:directional) @test C isa TextAnalysis.CooMatrix{T} if !(d isa Corpus) - @test TextAnalysis.coom(C) == T.(expected_result) + @test TextAnalysis.coom(C) == T.(expected_result_C) + @test TextAnalysis.coom(D) == T.(expected_result_D) else - @test TextAnalysis.coom(C) == length(crps) * T.(expected_result) + @test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C) + @test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D) end end @test_throws ErrorException TextAnalysis.CooMatrix{T}(nd)