-
Notifications
You must be signed in to change notification settings - Fork 96
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #161 from Ayushk4/tagging_schemes_patch
Add functions for Tagging Schemes and Conversion.
- Loading branch information
Showing
5 changed files
with
216 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
# Ref: | ||
# https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) | ||
# https://chameleonmetadata.com/Education/NLP-3/ref_nlp_encoding_schemes_list.php | ||
|
||
# Abstract supertype shared by the tagging-scheme marker types declared below. | ||
abstract type tag_scheme end | ||
|
||
# Field-less marker types, one per tagging scheme. The `tag_scheme!` conversion | ||
# methods take instances of them as `current_scheme` / `new_scheme` arguments. | ||
struct BIO1 <: tag_scheme end # also referred to as plain BIO | ||
struct BIO2 <: tag_scheme end | ||
struct BIOES <: tag_scheme end | ||
|
||
# Scheme names accepted by the string-based `tag_scheme!` method. Each entry must be | ||
# spelled exactly like the matching struct above, because that method resolves the | ||
# marker type from the name. | ||
const available_schemes = ["BIO1", "BIO2", "BIOES"] | ||
|
||
""" | ||
tag_scheme!(tags, current_scheme::String, new_scheme::String) | ||
Convert `tags` in place from `current_scheme` to `new_scheme`. | ||
Scheme names are case-insensitive. The tagging schemes currently supported are: | ||
* BIO1 (also known as BIO; pass it as "BIO1") | ||
* BIO2 | ||
* BIOES | ||
# Example | ||
```julia-repl | ||
julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"] | ||
julia> tag_scheme!(tags, "BIO1", "BIOES") | ||
julia> tags | ||
8-element Array{String,1}: | ||
"S-LOC" | ||
"O" | ||
"S-PER" | ||
"B-MISC" | ||
"E-MISC" | ||
"B-PER" | ||
"I-PER" | ||
"E-PER" | ||
``` | ||
""" | ||
function tag_scheme!(tags, current_scheme::String, new_scheme::String)
    # Scheme names are matched case-insensitively.
    current_name = uppercase(current_scheme)
    new_name = uppercase(new_scheme)

    # Nothing to convert: return before validation, exactly as callers have come to
    # expect (an empty `tags` or identical names never raise).
    (isempty(tags) || current_name == new_name) && return

    if new_name ∉ available_schemes || current_name ∉ available_schemes
        error("Invalid tagging scheme")
    end

    # Both names were validated against `available_schemes`, whose entries are spelled
    # like the marker structs, so a plain module lookup is enough. This avoids calling
    # `eval` at run time and keeps each local bound to a single type.
    from = getfield(@__MODULE__, Symbol(current_name))()
    to = getfield(@__MODULE__, Symbol(new_name))()

    return tag_scheme!(tags, from, to)
end
|
||
function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2)
    # Convert `tags` in place from BIO1 to BIO2.
    #
    # * Any tag whose first character is 'O' is normalised to "O" (the same rule the
    #   BIOES conversions apply).
    # * `B-` tags are left untouched.
    # * An `I-` tag becomes `B-` when it is the first tag, follows "O", or follows a
    #   tag with a different type suffix; otherwise it is left untouched.
    # * Empty tags or any other leading character raise `error("Invalid tags")`.
    #
    # Returns `nothing`.
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = tag[1]

        if prefix == 'O'
            tags[i] = "O"
        elseif prefix == 'I'
            # tags[i - 1] has already been processed, so it is either "O" or carries a
            # one-character prefix; comparing from index 2 compares the type suffixes.
            starts_chunk = i == firstindex(tags) || tags[i - 1] == "O" ||
                           tags[i - 1][2:end] != tag[2:end]
            if starts_chunk
                tags[i] = 'B' * tag[2:end]
            end
        elseif prefix != 'B'
            error("Invalid tags")
        end
    end
    return nothing
end
|
||
function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1)
    # Convert `tags` in place from BIO2 to BIO1.
    #
    # * Any tag whose first character is 'O' is normalised to "O" (the same rule the
    #   BIOES conversions apply).
    # * `I-` tags are left untouched.
    # * A `B-` tag becomes `I-` when it is the last tag, is followed by "O", or is
    #   followed by a tag with a different type suffix; otherwise it stays `B-`.
    # * Empty tags or any other leading character raise `error("Invalid tags")`.
    #
    # NOTE(review): the rule looks at the *following* tag, so a multi-token chunk keeps
    # its leading `B-`. That behaviour is preserved here unchanged; confirm that it is
    # the intended reading of BIO1 before altering it.
    #
    # Returns `nothing`.
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = tag[1]

        if prefix == 'O'
            tags[i] = "O"
        elseif prefix == 'B'
            # tags[i + 1] has not been visited yet, so it is still in its input form.
            stands_alone = i == lastindex(tags) || tags[i + 1] == "O" ||
                           tags[i + 1][2:end] != tag[2:end]
            if stands_alone
                tags[i] = 'I' * tag[2:end]
            end
        elseif prefix != 'I'
            error("Invalid tags")
        end
    end
    return nothing
end
|
||
function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES)
    # Convert `tags` in place from BIO2 to BIOES.
    #
    # * Any tag whose first character is 'O' is normalised to "O".
    # * A chunk ends at the last tag, or whenever the next tag is not exactly
    #   `I-<same type>`. At a chunk end `B-` becomes `S-` and `I-` becomes `E-`.
    # * `B-` / `I-` tags inside a chunk are left untouched.
    # * Empty tags or any other leading character raise `error("Invalid tags")`.
    #
    # Returns `nothing`.
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = tag[1]

        if prefix == 'O'
            tags[i] = "O"
            continue
        end
        (prefix == 'B' || prefix == 'I') || error("Invalid tags")

        suffix = tag[2:end]
        # tags[i + 1] is still in BIO2 form here. Only an `I-` tag of the same type
        # continues the current chunk; a following `B-` of the same type opens a new
        # chunk, so comparing type suffixes alone is not sufficient.
        chunk_ends = i == lastindex(tags) || tags[i + 1] != 'I' * suffix
        if chunk_ends
            tags[i] = (prefix == 'B' ? 'S' : 'E') * suffix
        end
    end
    return nothing
end
|
||
function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2)
    # Convert `tags` in place from BIOES to BIO2.
    #
    # * Any tag whose first character is 'O' is normalised to "O".
    # * `E-` becomes `I-` and `S-` becomes `B-`; `B-` / `I-` tags are left untouched.
    # * Empty tags or any other leading character raise `error("Invalid tags")`
    #   (an empty tag previously surfaced as a `BoundsError`).
    #
    # Returns `nothing`.
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = tag[1]

        if prefix == 'O'
            tags[i] = "O"
        elseif prefix == 'E'
            tags[i] = 'I' * tag[2:end]
        elseif prefix == 'S'
            tags[i] = 'B' * tag[2:end]
        elseif !(prefix == 'B' || prefix == 'I')
            error("Invalid tags")
        end
    end
    return nothing
end
|
||
function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES)
    # Convert `tags` in place from BIO1 to BIOES by chaining the two existing
    # conversions: BIO1 -> BIO2, then BIO2 -> BIOES. Returns whatever the second
    # conversion returns.
    via = BIO2()
    tag_scheme!(tags, current_scheme, via)
    return tag_scheme!(tags, via, new_scheme)
end
|
||
function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1)
    # Convert `tags` in place from BIOES to BIO1 by chaining the two existing
    # conversions: BIOES -> BIO2, then BIO2 -> BIO1. Returns whatever the second
    # conversion returns.
    via = BIO2()
    tag_scheme!(tags, current_scheme, via)
    return tag_scheme!(tags, via, new_scheme)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Round-trip checks for `tag_scheme!`: convert a tag sequence to the other scheme,
# compare with the expected sequence, convert back, and compare with the input.
@testset "Tagging_Schemes" begin
    # (testset name, source scheme, target scheme, source tags, expected target tags)
    round_trips = [
        (
            "BIO1 and BIO2", "BIO1", "BIO2",
            ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "I-ORG"],
            ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-ORG"],
        ),
        (
            "BIO1 and BIOES", "BIO1", "BIOES",
            ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"],
            ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", "I-PER", "E-PER"],
        ),
        (
            "BIO2 and BIOES", "BIO2", "BIOES",
            ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"],
            ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", "I-PER", "E-PER"],
        ),
    ]

    @testset "$label" for (label, from, to, source, target) in round_trips
        # Work on a copy so the reference sequences stay untouched.
        working = copy(source)

        tag_scheme!(working, from, to)
        @test working == target

        tag_scheme!(working, to, from)
        @test working == source
    end
end