From bb8d5c99f8e52592b23ed795e5496cc68cfa6142 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 29 May 2019 00:14:12 +0530 Subject: [PATCH 1/6] Add tagging_schemes --- src/tagging_schemes.jl | 108 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 src/tagging_schemes.jl diff --git a/src/tagging_schemes.jl b/src/tagging_schemes.jl new file mode 100644 index 00000000..6ae492a6 --- /dev/null +++ b/src/tagging_schemes.jl @@ -0,0 +1,108 @@ +# Ref: +# https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) +# https://chameleonmetadata.com/Education/NLP-3/ref_nlp_encoding_schemes_list.php + +# Tagging schemes for NER - BIO = BIO1, BIO2, BIOES / BILOU + +abstract type tag_scheme end + +struct BIO1 <: tag_scheme end # BIO +struct BIO2 <: tag_scheme end +struct BIOES <: tag_scheme end + +const available_schemes = ["BIO1", "BIO2", "BIOES"] + +""" + tag_scheme(tags) => str::String + +Identify tagging scheme and raise error for an invalid tagging scheme. +""" +function tag_scheme(tags) + tag_scheme(tags, BIO1()) && return "BIO1" + tag_scheme(tags, BIO2()) && return "BIO2" + tag_scheme(tags, BIOES()) && return "BIOES" +end + +function tag_scheme(tags, scheme::String) + return tag_scheme(tags, eval(Symbol(scheme))()) +end + +# Validate the tagging scheme. Return false for invalid +function tag_scheme(tags, scheme::BIO1) + return true +end + +function tag_scheme(tags, scheme::BIO2) + return true +end + +function tag_scheme(tags, scheme::BIOES) + return true +end + +""" + convert_tag_scheme(tags, current_scheme, new_scheme) + convert_tag_scheme(tags, new_scheme) + +Convert `tags` from `current_scheme` to `new_scheme`. + +Delimiter between prefix and tag type is assumed to be `-`. 
+List of tagging schemes currently supported- + * BIO1 (BIO) + * BIO2 + * BIOES +""" +function tag_scheme!(tags, new_scheme::String) + new_scheme = uppercase(new_scheme)) + length(tags) == 0 && return + current_scheme = tag_scheme(tags) + new_scheme ∈ available_schemes || error("Invalid tagging scheme") + + current_scheme == new_scheme && return + tag_scheme!(tags, current_scheme, new_scheme) +end + +function tag_scheme!(tags, current_scheme::String, new_scheme::String) + current_scheme = uppercase(current_scheme)) + new_scheme = uppercase(new_scheme)) + (length(tags) == 0 || !tag_scheme(tags, current_scheme)) && return + current_scheme != new_scheme || return + if new_scheme ∉ available_schemes || !tag_scheme(tags, current_scheme) + error("Invalid tagging scheme") + end + + current_scheme = eval(Symbol(current_scheme)) + new_scheme = eval(Symbol(new_scheme)) + + tag_scheme!(tags, current_scheme, new_scheme) +end + + + +function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2) + tag_scheme(tags, current_scheme) || error("Wrong Tagging scheme ") + + # If I: If prev not of same type then change to 'B'. + # If O: Then same. Also change this to String if it is a char. + # If B: Then same. +end + +function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES) + tag_scheme!(tag_scheme!(tags, BIO1(), BIO2()), BIO2(), BIOES()) +end + +function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1) + # If I: If prev not of same type then change to 'B'. + # If O: Then same. Also change this to String if it is a char. + # If B: Then same. 
+end + +function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES) +end + +function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1) + tag_scheme!(tag_scheme!(tags, BIOES(), BIO2()), BIO2(), BIO1()) +end + +function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2) +end From 6036ea5279eaf2084b93f53fec34591ff24d276a Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 29 May 2019 01:30:34 +0530 Subject: [PATCH 2/6] Add BIO1 to BIO2 conversion and reverse --- src/TextAnalysis.jl | 2 + src/tagging_schemes.jl | 101 ++++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 57 deletions(-) diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl index 13d12a43..23ae16d9 100644 --- a/src/TextAnalysis.jl +++ b/src/TextAnalysis.jl @@ -50,6 +50,7 @@ module TextAnalysis export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags export SentimentAnalyzer + export tag_scheme! include("tokenizer.jl") include("ngramizer.jl") @@ -75,4 +76,5 @@ module TextAnalysis include("sentiment.jl") include("bayes.jl") include("deprecations.jl") + include("tagging_schemes.jl") end diff --git a/src/tagging_schemes.jl b/src/tagging_schemes.jl index 6ae492a6..590d79ae 100644 --- a/src/tagging_schemes.jl +++ b/src/tagging_schemes.jl @@ -2,8 +2,6 @@ # https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging) # https://chameleonmetadata.com/Education/NLP-3/ref_nlp_encoding_schemes_list.php -# Tagging schemes for NER - BIO = BIO1, BIO2, BIOES / BILOU - abstract type tag_scheme end struct BIO1 <: tag_scheme end # BIO @@ -12,37 +10,8 @@ struct BIOES <: tag_scheme end const available_schemes = ["BIO1", "BIO2", "BIOES"] -""" - tag_scheme(tags) => str::String - -Identify tagging scheme and raise error for an invalid tagging scheme. 
-""" -function tag_scheme(tags) - tag_scheme(tags, BIO1()) && return "BIO1" - tag_scheme(tags, BIO2()) && return "BIO2" - tag_scheme(tags, BIOES()) && return "BIOES" -end - -function tag_scheme(tags, scheme::String) - return tag_scheme(tags, eval(Symbol(scheme))()) -end - -# Validate the tagging scheme. Return false for invalid -function tag_scheme(tags, scheme::BIO1) - return true -end - -function tag_scheme(tags, scheme::BIO2) - return true -end - -function tag_scheme(tags, scheme::BIOES) - return true -end - """ convert_tag_scheme(tags, current_scheme, new_scheme) - convert_tag_scheme(tags, new_scheme) Convert `tags` from `current_scheme` to `new_scheme`. @@ -52,39 +21,41 @@ List of tagging schemes currently supported- * BIO2 * BIOES """ -function tag_scheme!(tags, new_scheme::String) - new_scheme = uppercase(new_scheme)) - length(tags) == 0 && return - current_scheme = tag_scheme(tags) - new_scheme ∈ available_schemes || error("Invalid tagging scheme") - - current_scheme == new_scheme && return - tag_scheme!(tags, current_scheme, new_scheme) -end - function tag_scheme!(tags, current_scheme::String, new_scheme::String) - current_scheme = uppercase(current_scheme)) - new_scheme = uppercase(new_scheme)) - (length(tags) == 0 || !tag_scheme(tags, current_scheme)) && return - current_scheme != new_scheme || return + current_scheme = uppercase(current_scheme) + new_scheme = uppercase(new_scheme) + (length(tags) == 0 || current_scheme == new_scheme) && return + if new_scheme ∉ available_schemes || !tag_scheme(tags, current_scheme) error("Invalid tagging scheme") end - current_scheme = eval(Symbol(current_scheme)) - new_scheme = eval(Symbol(new_scheme)) + current_scheme = eval(Symbol(current_scheme))() + new_scheme = eval(Symbol(new_scheme))() tag_scheme!(tags, current_scheme, new_scheme) end - - function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2) - tag_scheme(tags, current_scheme) || error("Wrong Tagging scheme ") - - # If I: If prev not of same type 
then change to 'B'. - # If O: Then same. Also change this to String if it is a char. - # If B: Then same. + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == "O" + tags[i] = "O" + continue + end + (tags[i][1] == 'O' || tags[i][1] == 'B') && continue + + if tags[i][1] == 'I' + if i == 1 + tags[i] = 'B' * tags[i][2:end] + elseif tags[i - 1] == "O" || tags[i - 1][2:end] != tags[i][2:end] + tags[i] = 'B' * tags[i][2:end] + else + continue + end + else + error("Invalid tags") + end + end end function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES) @@ -92,9 +63,25 @@ function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES) end function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1) - # If I: If prev not of same type then change to 'B'. - # If O: Then same. Also change this to String if it is a char. - # If B: Then same. + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == "O" + tags[i] = "O" + continue + end + (tags[i][1] == 'O' || tags[i][1] == 'I') && continue + + if tags[i][1] == 'B' + if i == length(tags) + tags[i] = 'I' * tags[i][2:end] + elseif tags[i + 1] == "O" || tags[i + 1][2:end] != tags[i][2:end] + tags[i] = 'I' * tags[i][2:end] + else + continue + end + else + error("Invalid tags") + end + end end function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES) From 0641dc41795e658bdffb5ed22b9213e1f883a99f Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 29 May 2019 11:52:13 +0530 Subject: [PATCH 3/6] Add BIOES and it's conversion --- src/tagging_schemes.jl | 48 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/src/tagging_schemes.jl b/src/tagging_schemes.jl index 590d79ae..9a46eb75 100644 --- a/src/tagging_schemes.jl +++ b/src/tagging_schemes.jl @@ -58,10 +58,6 @@ function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2) end end -function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES) - tag_scheme!(tag_scheme!(tags, BIO1(), 
BIO2()), BIO2(), BIOES()) -end - function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1) for i in eachindex(tags) if tags[i] == 'O' || tags[i][1] == "O" @@ -85,11 +81,49 @@ function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1) end function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES) -end + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == 'O' + tags[i] = "O" + continue + end -function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1) - tag_scheme!(tag_scheme!(tags, BIOES(), BIO2()), BIO2(), BIO1()) + if tags[i][1] == 'I' && (i == length(tags) || + tags[i+1][2:end] != tags[i][2:end]) + tags[i] = 'E' * tags[i][2:end] + elseif tags[i][1] == 'B' && (i == length(tags) || + tags[i+1][2:end] != tags[i][2:end]) + tags[i] = 'S' * tags[i][2:end] + else + (tags[i][1] == 'I' || tags[i][1] == 'B') && continue + error("Invalid tags") + end + end end function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2) + for i in eachindex(tags) + if tags[i] == 'O' || tags[i][1] == 'O' + tags[i] = "O" + continue + end + (tags[i][1] == 'B' || tags[i][1] == 'I') && continue + + if tags[i][1] == 'E' + tags[i] = 'I' * tags[i][2:end] + elseif tags[i][1] == 'S' + tags[i] = 'B' * tags[i][2:end] + else + error("Invalid tags") + end + end +end + +function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES) + tag_scheme!(tags, BIO1(), BIO2()) + tag_scheme!(tags, BIO2(), BIOES()) +end + +function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1) + tag_scheme!(tags, BIOES(), BIO2()) + tag_scheme!(tags, BIO2(), BIO1()) end From ff10b806a8f7c2dd8ed64cf141fe7939272dd615 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 29 May 2019 12:13:34 +0530 Subject: [PATCH 4/6] Add tests for tagging_schemes --- src/tagging_schemes.jl | 4 ++-- test/runtests.jl | 3 +-- test/taggingschemes.jl | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 test/taggingschemes.jl diff 
--git a/src/tagging_schemes.jl b/src/tagging_schemes.jl index 9a46eb75..0abb3be2 100644 --- a/src/tagging_schemes.jl +++ b/src/tagging_schemes.jl @@ -11,7 +11,7 @@ struct BIOES <: tag_scheme end const available_schemes = ["BIO1", "BIO2", "BIOES"] """ - convert_tag_scheme(tags, current_scheme, new_scheme) + tag_scheme!(tags, current_scheme, new_scheme) Convert `tags` from `current_scheme` to `new_scheme`. @@ -26,7 +26,7 @@ function tag_scheme!(tags, current_scheme::String, new_scheme::String) new_scheme = uppercase(new_scheme) (length(tags) == 0 || current_scheme == new_scheme) && return - if new_scheme ∉ available_schemes || !tag_scheme(tags, current_scheme) + if new_scheme ∉ available_schemes error("Invalid tagging scheme") end diff --git a/test/runtests.jl b/test/runtests.jl index 993c8c86..0c0ca57d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -22,9 +22,8 @@ include("lda.jl") include("summarizer.jl") include("sentiment.jl") include("bayes.jl") - +include("taggingschemes.jl") # end end - diff --git a/test/taggingschemes.jl b/test/taggingschemes.jl new file mode 100644 index 00000000..0a96c4d5 --- /dev/null +++ b/test/taggingschemes.jl @@ -0,0 +1,41 @@ +@testset "Tagging_Schemes" begin + @testset "BIO1 and BIO2" begin + tags_BIO1 = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "I-ORG"] + tags_BIO2 = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-ORG"] + + output_tags = deepcopy(tags_BIO1) + tag_scheme!(tags_BIO1, "BIO1", "BIO2") + @test tags_BIO1 == tags_BIO2 + + tag_scheme!(tags_BIO1, "BIO2", "BIO1") + @test tags_BIO1 == output_tags + end + + @testset "BIO1 and BIOES" begin + tags_BIO1 = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", + "I-PER", "I-PER"] + tags_BIOES = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", + "I-PER", "E-PER"] + + output_tags = deepcopy(tags_BIO1) + tag_scheme!(tags_BIO1, "BIO1", "BIOES") + @test tags_BIO1 == tags_BIOES + + tag_scheme!(tags_BIO1, "BIOES", "BIO1") + @test tags_BIO1 == output_tags + end + + @testset "BIO2 
and BIOES" begin + tags_BIO2 = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER", + "I-PER", "I-PER"] + tags_BIOES = ["S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", + "I-PER", "E-PER"] + + output_tags = deepcopy(tags_BIO2) + tag_scheme!(tags_BIO2, "BIO2", "BIOES") + @test tags_BIO2 == tags_BIOES + + tag_scheme!(tags_BIO2, "BIOES", "BIO2") + @test tags_BIO2 == output_tags + end +end From 2b4da02335fa13f4a35ada1476b374a31a8f7713 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 29 May 2019 12:27:01 +0530 Subject: [PATCH 5/6] Add docstrings --- src/tagging_schemes.jl | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/tagging_schemes.jl b/src/tagging_schemes.jl index 0abb3be2..983368ef 100644 --- a/src/tagging_schemes.jl +++ b/src/tagging_schemes.jl @@ -11,22 +11,39 @@ struct BIOES <: tag_scheme end const available_schemes = ["BIO1", "BIO2", "BIOES"] """ - tag_scheme!(tags, current_scheme, new_scheme) + tag_scheme!(tags, current_scheme::String, new_scheme::String) Convert `tags` from `current_scheme` to `new_scheme`. -Delimiter between prefix and tag type is assumed to be `-`. 
List of tagging schemes currently supported- * BIO1 (BIO) * BIO2 * BIOES + +# Example +```julia-repl +julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"] + +julia> tag_scheme!(tags, "BIO1", "BIOES") + +julia> tags +8-element Array{String,1}: + "S-LOC" + "O" + "S-PER" + "B-MISC" + "E-MISC" + "B-PER" + "I-PER" + "E-PER" +``` """ function tag_scheme!(tags, current_scheme::String, new_scheme::String) current_scheme = uppercase(current_scheme) new_scheme = uppercase(new_scheme) (length(tags) == 0 || current_scheme == new_scheme) && return - if new_scheme ∉ available_schemes + if new_scheme ∉ available_schemes || current_scheme ∉ available_schemes error("Invalid tagging scheme") end From f74dbb5d1ffa795693b121d6c1a55523a8b597a2 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Wed, 29 May 2019 12:33:17 +0530 Subject: [PATCH 6/6] Add documentation for tagging_schemes --- docs/src/features.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/src/features.md b/docs/src/features.md index c1a4c401..c3d01e06 100644 --- a/docs/src/features.md +++ b/docs/src/features.md @@ -225,3 +225,29 @@ julia> summarize(s, ns=2) "Assume this Short Document as an example." "This has too foo sentences." ``` + +## Tagging_schemes + +There are many tagging schemes used for sequence labelling. +TextAnalysis currently offers functions for conversion between these tagging formats. + +* BIO1 +* BIO2 +* BIOES + +```julia +julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"] + +julia> tag_scheme!(tags, "BIO1", "BIOES") + +julia> tags +8-element Array{String,1}: + "S-LOC" + "O" + "S-PER" + "B-MISC" + "E-MISC" + "B-PER" + "I-PER" + "E-PER" +```