Skip to content
This repository has been archived by the owner on May 29, 2024. It is now read-only.

Commit

Permalink
feat(utils): allow half-cost string dist case-sub
Browse files Browse the repository at this point in the history
  • Loading branch information
tecosaur committed Jul 16, 2023
1 parent 6df5ed2 commit a937ade
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 16 deletions.
6 changes: 3 additions & 3 deletions src/model/errors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,15 @@ function Base.showerror(io::IO, err::UnresolveableIdentifier{DataSet, String})
for ident in Identifier.(collection.datasets, nothing)
istr = @advise collection string(ident)
push!(candidates,
(ident, collection, stringsimilarity(err.identifier, istr)))
(ident, collection, stringsimilarity(err.identifier, istr; halfcase=true)))
end
end
elseif isnothing(err.collection) && !isempty(STACK)
for collection in last(Iterators.peel(STACK))
for ident in Identifier.(collection.datasets)
istr = @advise collection string(ident)
push!(candidates,
(ident, collection, stringsimilarity(err.identifier, istr)))
(ident, collection, stringsimilarity(err.identifier, istr; halfcase=true)))
end
end
end
Expand Down Expand Up @@ -118,7 +118,7 @@ function Base.showerror(io::IO, err::UnresolveableIdentifier{DataCollection})
for collection in STACK
if !isnothing(collection.name)
push!(candidates,
(collection, stringsimilarity(err.identifier, collection.name)))
(collection, stringsimilarity(err.identifier, collection.name; halfcase=true)))
end
end
if maximum(last.(candidates), init=0.0) >= 0.5
Expand Down
48 changes: 35 additions & 13 deletions src/model/utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ function natkeygen(key::String)
end

"""
stringdist(a::AbstractString, b::AbstractString)
stringdist(a::AbstractString, b::AbstractString; halfcase::Bool=false)
Calculate the Restricted Damerau-Levenshtein distance (aka. Optimal String
Alignment) between `a` and `b`.
Expand All @@ -47,6 +47,9 @@ This is the minimum number of edits required to transform `a` to `b`, where each
edit is a *deletion*, *insertion*, *substitution*, or *transposition* of a
character, with the restriction that no substring is edited more than once.
When `halfcase` is true, substitutions that just switch the case of a character
cost half as much.
# Examples
```jldoctest; setup = :(import DataToolkitBase.stringdist)
Expand All @@ -57,11 +60,14 @@ julia> stringdist("The quick brown fox jumps over the lazy dog",
julia> stringdist("typo", "tpyo")
1
julia> DataToolkitBase.stringdist("frog", "cat")
julia> stringdist("frog", "cat")
4
julia> stringdist("Thing", "thing", halfcase=true)
0.5
```
"""
function stringdist(a::AbstractString, b::AbstractString)
function stringdist(a::AbstractString, b::AbstractString; halfcase::Bool=false)
if length(a) > length(b)
a, b = b, a
end
Expand All @@ -74,14 +80,14 @@ function stringdist(a::AbstractString, b::AbstractString)
end
end
start == length(a) && return length(b) - start
v₀ = collect(1:(length(b) - start))
v₀ = collect(2:2:2*(length(b) - start))
v₁ = similar(v₀)
aᵢ₋₁, bⱼ₋₁ = first(a), first(b)
current = 0
for (i, aᵢ) in enumerate(a)
i > start || (aᵢ₋₁ = aᵢ; continue)
left = i - start - 1
current = i - start
left = 2*(i - start - 1)
current = 2*(i - start)
transition_next = 0
@inbounds for (j, bⱼ) in enumerate(b)
j > start || (bⱼ₋₁ = bⱼ; continue)
Expand All @@ -92,25 +98,38 @@ function stringdist(a::AbstractString, b::AbstractString)
v₁[j - start] = current = left
left = v₀[j - start]
if aᵢ != bⱼ
# (Potentially) cheaper substitution when just
# switching case.
substitutecost = if halfcase
aᵢswitchcap = if isuppercase(aᵢ)
lowercase(aᵢ)
elseif islowercase(aᵢ)
uppercase(aᵢ)
else aᵢ end
ifelse(aᵢswitchcap == bⱼ, 1, 2)
else
2
end
# Minimum between substitution, deletion and insertion
current = min(current + 1, above + 1, left + 1)
current = min(current + substitutecost,
above + 2, left + 2) # deletion or insertion
if i > start + 1 && j > start + 1 && aᵢ == bⱼ₋₁ && aᵢ₋₁ == bⱼ
current = min(current, (this_transition += 1))
current = min(current, (this_transition += 2))
end
end
v₀[j - start] = current
bⱼ₋₁ = bⱼ
end
aᵢ₋₁ = aᵢ
end
current
if halfcase current/2 else current÷2 end
end

"""
stringsimilarity(a::AbstractString, b::AbstractString)
stringsimilarity(a::AbstractString, b::AbstractString; halfcase::Bool=false)
Return the `stringdist` as a proportion of the maximum length of `a` and `b`,
take one.
subtracted from one. When `halfcase` is true, case switches cost half as much.
# Example
Expand All @@ -120,10 +139,13 @@ julia> stringsimilarity("same", "same")
julia> stringsimilarity("semi", "demi")
0.75
julia> stringsimilarity("Same", "same", halfcase=true)
0.875
```
"""
stringsimilarity(a::AbstractString, b::AbstractString) =
1 - stringdist(a, b) / max(length(a), length(b))
# Similarity score for `a` and `b`: one minus `stringdist(a, b; halfcase)`
# divided by the length (in characters) of the longer of the two strings.
# `halfcase` is forwarded to `stringdist` unchanged.
function stringsimilarity(a::AbstractString, b::AbstractString; halfcase::Bool=false)
    maxlen = max(length(a), length(b))
    # Two empty strings are identical. Without this guard the division below
    # has a zero denominator and produces a non-finite value, which fails
    # every threshold comparison a caller might make on the result.
    maxlen == 0 && return 1.0
    return 1 - stringdist(a, b; halfcase) / maxlen
end

"""
longest_common_subsequence(a, b)
Expand Down

0 comments on commit a937ade

Please sign in to comment.