diff --git a/docs/src/features.md b/docs/src/features.md
index 0b4b4526..c7322c97 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -226,6 +226,32 @@ julia> summarize(s, ns=2)
 "This has too foo sentences."
 ```
 
+## Tagging Schemes
+
+There are many tagging schemes used for sequence labelling.
+TextAnalysis currently offers functions for conversion between these tagging formats.
+
+* BIO1
+* BIO2
+* BIOES
+
+```julia
+julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]
+
+julia> tag_scheme!(tags, "BIO1", "BIOES")
+
+julia> tags
+8-element Array{String,1}:
+ "S-LOC"
+ "O"
+ "S-PER"
+ "B-MISC"
+ "E-MISC"
+ "B-PER"
+ "I-PER"
+ "E-PER"
+```
+
 ## Parts of Speech Tagger
 
 This tagger can be used to find the POS tag of a word or token in a given sentence. It is a based on `Average Perceptron Algorithm`.
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index e34285b0..6702fa57 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -50,6 +50,7 @@ module TextAnalysis
 export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
 export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags
 export SentimentAnalyzer
+ export tag_scheme!
export jackknife_avg, listify_ngrams, weighted_lcs, fmeasure_lcs export rouge_l_summary, rouge_l_sentence, rouge_n export PerceptronTagger, fit!, predict @@ -78,6 +79,7 @@ module TextAnalysis include("sentiment.jl") include("bayes.jl") include("deprecations.jl") + include("tagging_schemes.jl") include("utils.jl") include("rouge.jl") include("averagePerceptronTagger.jl") diff --git a/src/tagging_schemes.jl b/src/tagging_schemes.jl new file mode 100644 index 00000000..983368ef --- /dev/null +++ b/src/tagging_schemes.jl @@ -0,0 +1,146 @@ +# Ref: +# https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) +# https://chameleonmetadata.com/Education/NLP-3/ref_nlp_encoding_schemes_list.php + +abstract type tag_scheme end + +struct BIO1 <: tag_scheme end # BIO +struct BIO2 <: tag_scheme end +struct BIOES <: tag_scheme end + +const available_schemes = ["BIO1", "BIO2", "BIOES"] + +""" + tag_scheme!(tags, current_scheme::String, new_scheme::String) + +Convert `tags` from `current_scheme` to `new_scheme`. 
+ +List of tagging schemes currently supported- + * BIO1 (BIO) + * BIO2 + * BIOES + +# Example +```julia-repl +julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"] + +julia> tag_scheme!(tags, "BIO1", "BIOES") + +julia> tags +8-element Array{String,1}: + "S-LOC" + "O" + "S-PER" + "B-MISC" + "E-MISC" + "B-PER" + "I-PER" + "E-PER" +``` +""" +function tag_scheme!(tags, current_scheme::String, new_scheme::String) + current_scheme = uppercase(current_scheme) + new_scheme = uppercase(new_scheme) + (length(tags) == 0 || current_scheme == new_scheme) && return + + if new_scheme ∉ available_schemes || current_scheme ∉ available_schemes + error("Invalid tagging scheme") + end + + current_scheme = eval(Symbol(current_scheme))() + new_scheme = eval(Symbol(new_scheme))() + + tag_scheme!(tags, current_scheme, new_scheme) +end + +function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2) + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == "O" + tags[i] = "O" + continue + end + (tags[i][1] == 'O' || tags[i][1] == 'B') && continue + + if tags[i][1] == 'I' + if i == 1 + tags[i] = 'B' * tags[i][2:end] + elseif tags[i - 1] == "O" || tags[i - 1][2:end] != tags[i][2:end] + tags[i] = 'B' * tags[i][2:end] + else + continue + end + else + error("Invalid tags") + end + end +end + +function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1) + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == "O" + tags[i] = "O" + continue + end + (tags[i][1] == 'O' || tags[i][1] == 'I') && continue + + if tags[i][1] == 'B' + if i == length(tags) + tags[i] = 'I' * tags[i][2:end] + elseif tags[i + 1] == "O" || tags[i + 1][2:end] != tags[i][2:end] + tags[i] = 'I' * tags[i][2:end] + else + continue + end + else + error("Invalid tags") + end + end +end + +function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES) + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == 'O' + tags[i] = "O" + continue + end + + if tags[i][1] == 
'I' && (i == length(tags) || + tags[i+1][2:end] != tags[i][2:end]) + tags[i] = 'E' * tags[i][2:end] + elseif tags[i][1] == 'B' && (i == length(tags) || + tags[i+1][2:end] != tags[i][2:end]) + tags[i] = 'S' * tags[i][2:end] + else + (tags[i][1] == 'I' || tags[i][1] == 'B') && continue + error("Invalid tags") + end + end +end + +function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2) + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == 'O' + tags[i] = "O" + continue + end + (tags[i][1] == 'B' || tags[i][1] == 'I') && continue + + if tags[i][1] == 'E' + tags[i] = 'I' * tags[i][2:end] + elseif tags[i][1] == 'S' + tags[i] = 'B' * tags[i][2:end] + else + error("Invalid tags") + end + end +end + +function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES) + tag_scheme!(tags, BIO1(), BIO2()) + tag_scheme!(tags, BIO2(), BIOES()) +end + +function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1) + tag_scheme!(tags, BIOES(), BIO2()) + tag_scheme!(tags, BIO2(), BIO1()) +end diff --git a/test/runtests.jl b/test/runtests.jl index bffc62d2..d0f6a48d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -22,6 +22,7 @@ include("lda.jl") include("summarizer.jl") include("sentiment.jl") include("bayes.jl") +include("taggingschemes.jl") include("rouge.jl") include("averagePerceptronTagger.jl") diff --git a/test/taggingschemes.jl b/test/taggingschemes.jl new file mode 100644 index 00000000..0a96c4d5 --- /dev/null +++ b/test/taggingschemes.jl @@ -0,0 +1,41 @@ +@testset "Tagging_Schemes" begin + @testset "BIO1 and BIO2" begin + tags_BIO1 = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "I-ORG"] + tags_BIO2 = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-ORG"] + + output_tags = deepcopy(tags_BIO1) + tag_scheme!(tags_BIO1, "BIO1", "BIO2") + @test tags_BIO1 == tags_BIO2 + + tag_scheme!(tags_BIO1, "BIO2", "BIO1") + @test tags_BIO1 == output_tags + end + + @testset "BIO1 and BIOES" begin + tags_BIO1 = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", 
+ "I-PER", "I-PER"] + tags_BIOES = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", + "I-PER", "E-PER"] + + output_tags = deepcopy(tags_BIO1) + tag_scheme!(tags_BIO1, "BIO1", "BIOES") + @test tags_BIO1 == tags_BIOES + + tag_scheme!(tags_BIO1, "BIOES", "BIO1") + @test tags_BIO1 == output_tags + end + + @testset "BIO2 and BIOES" begin + tags_BIO2 = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER", + "I-PER", "I-PER"] + tags_BIOES = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", + "I-PER", "E-PER"] + + output_tags = deepcopy(tags_BIO2) + tag_scheme!(tags_BIO2, "BIO2", "BIOES") + @test tags_BIO2 == tags_BIOES + + tag_scheme!(tags_BIO2, "BIOES", "BIO2") + @test tags_BIO2 == output_tags + end +end