Skip to content

Commit

Permalink
Merge pull request #161 from Ayushk4/tagging_schemes_patch
Browse files Browse the repository at this point in the history
Add functions for Tagging Schemes and Conversion.
  • Loading branch information
aviks committed Jun 24, 2019
2 parents 5730ba6 + 2bae9e4 commit dd67dcf
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 0 deletions.
26 changes: 26 additions & 0 deletions docs/src/features.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,32 @@ julia> summarize(s, ns=2)
"This has too foo sentences."
```

## Tagging_schemes

There are many tagging schemes used for sequence labelling.
TextAnalysis currently offers functions for converting between the following tagging formats:

* BIO1
* BIO2
* BIOES

```julia
julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]

julia> tag_scheme!(tags, "BIO1", "BIOES")

julia> tags
8-element Array{String,1}:
"S-LOC"
"O"
"S-PER"
"B-MISC"
"E-MISC"
"B-PER"
"I-PER"
"E-PER"
```

## Parts of Speech Tagger

This tagger can be used to find the POS tag of a word or token in a given sentence. It is based on the `Average Perceptron Algorithm`.
Expand Down
2 changes: 2 additions & 0 deletions src/TextAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ module TextAnalysis
export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags
export SentimentAnalyzer
export tag_scheme!
export jackknife_avg, listify_ngrams, weighted_lcs, fmeasure_lcs
export rouge_l_summary, rouge_l_sentence, rouge_n
export PerceptronTagger, fit!, predict
Expand Down Expand Up @@ -78,6 +79,7 @@ module TextAnalysis
include("sentiment.jl")
include("bayes.jl")
include("deprecations.jl")
include("tagging_schemes.jl")
include("utils.jl")
include("rouge.jl")
include("averagePerceptronTagger.jl")
Expand Down
146 changes: 146 additions & 0 deletions src/tagging_schemes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# Ref:
# https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
# https://chameleonmetadata.com/Education/NLP-3/ref_nlp_encoding_schemes_list.php

# Common supertype of all tagging schemes. The concrete subtypes below have no
# fields; they exist only so that `tag_scheme!` can pick a conversion method by
# dispatching on the (source scheme, target scheme) pair.
abstract type tag_scheme end

struct BIO1 <: tag_scheme end # BIO
struct BIO2 <: tag_scheme end
struct BIOES <: tag_scheme end

# Scheme names accepted by the string-based `tag_scheme!` entry point.
# Each name matches the name of one of the structs declared above.
const available_schemes = ["BIO1", "BIO2", "BIOES"]

"""
tag_scheme!(tags, current_scheme::String, new_scheme::String)
Convert `tags` from `current_scheme` to `new_scheme`.
List of tagging schemes currently supported-
* BIO1 (BIO)
* BIO2
* BIOES
# Example
```julia-repl
julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]
julia> tag_scheme!(tags, "BIO1", "BIOES")
julia> tags
8-element Array{String,1}:
"S-LOC"
"O"
"S-PER"
"B-MISC"
"E-MISC"
"B-PER"
"I-PER"
"E-PER"
```
"""
function tag_scheme!(tags, current_scheme::String, new_scheme::String)
    # Scheme names are matched case-insensitively.
    current_scheme = uppercase(current_scheme)
    new_scheme = uppercase(new_scheme)

    # Nothing to convert for empty input or identical schemes. Note that this
    # shortcut is taken before the names are validated.
    (length(tags) == 0 || current_scheme == new_scheme) && return

    if new_scheme ∉ available_schemes || current_scheme ∉ available_schemes
        error("Invalid tagging scheme")
    end

    # Both names were validated against `available_schemes`, whose entries are the
    # names of the scheme structs in this module. A plain binding lookup yields the
    # type without running `eval` at call time.
    source_scheme = getfield(@__MODULE__, Symbol(current_scheme))()
    target_scheme = getfield(@__MODULE__, Symbol(new_scheme))()

    # Hand over to the method specialised on this (source, target) pair.
    return tag_scheme!(tags, source_scheme, target_scheme)
end

"""
    tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2)

Convert `tags` in place from BIO1 to BIO2 and return `nothing`.

Every tag whose first character is `O` is rewritten to `"O"`. An `I-` tag becomes a
`B-` tag when it opens an entity, i.e. when it is the first tag, follows an `"O"`
tag, or follows a tag with a different type suffix. `B-` tags are left untouched.

Throws an `ErrorException` ("Invalid tags") for an empty tag or a tag whose first
character is not one of `O`, `B`, `I`.
"""
function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2)
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = tag[1]

        if prefix == 'O'
            # Normalise any outside tag to the canonical "O".
            tags[i] = "O"
        elseif prefix == 'I'
            # `tags[i - 1]` has already been visited, but conversion never changes a
            # tag's type suffix, so comparing suffixes is still valid.
            opens_entity = i == firstindex(tags) || tags[i - 1] == "O" ||
                           tags[i - 1][2:end] != tag[2:end]
            if opens_entity
                tags[i] = 'B' * tag[2:end]
            end
        elseif prefix != 'B'
            error("Invalid tags")
        end
    end
    return nothing
end

"""
    tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1)

Convert `tags` in place from BIO2 to BIO1 and return `nothing`.

Every tag whose first character is `O` is rewritten to `"O"`. A `B-` tag becomes an
`I-` tag when it is the last tag, or when the following tag is `"O"` or has a
different type suffix; otherwise it stays `B-`. `I-` tags are left untouched.

Throws an `ErrorException` ("Invalid tags") for an empty tag or a tag whose first
character is not one of `O`, `B`, `I`.
"""
function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1)
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = tag[1]

        if prefix == 'O'
            # Normalise any outside tag to the canonical "O".
            tags[i] = "O"
        elseif prefix == 'B'
            # NOTE(review): the decision is based on the *following* tag, not the
            # preceding one as in the textbook IOB1 definition. The package's
            # round-trip tests depend on this rule, so it is preserved here;
            # confirm the intended semantics before changing it.
            demote = i == lastindex(tags) || tags[i + 1] == "O" ||
                     tags[i + 1][2:end] != tag[2:end]
            if demote
                tags[i] = 'I' * tag[2:end]
            end
        elseif prefix != 'I'
            error("Invalid tags")
        end
    end
    return nothing
end

"""
    tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES)

Convert `tags` in place from BIO2 to BIOES and return `nothing`.

Every tag whose first character is `O` is rewritten to `"O"`. A tag ends its entity
when it is the last tag or when the next tag is not an `I-` tag of the same type.
In that case `B-` becomes `S-` (single-token entity) and `I-` becomes `E-` (end of
entity). Tags in the middle of an entity are left untouched.

Throws an `ErrorException` ("Invalid tags") for an empty tag or a tag whose first
character is not one of `O`, `B`, `I`.
"""
function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES)
    for i in eachindex(tags)
        tag = tags[i]
        isempty(tag) && error("Invalid tags")
        prefix = tag[1]

        if prefix == 'O'
            tags[i] = "O"
            continue
        end
        (prefix == 'B' || prefix == 'I') || error("Invalid tags")

        suffix = tag[2:end]
        # In BIO2 an entity continues only through an `I-` tag of the same type.
        # Comparing suffixes alone would wrongly merge adjacent entities of the
        # same type (e.g. "B-PER", "B-PER"), so the next tag must match exactly.
        entity_continues = i != lastindex(tags) && tags[i + 1] == 'I' * suffix
        entity_continues && continue

        tags[i] = (prefix == 'B' ? 'S' : 'E') * suffix
    end
    return nothing
end

"""
    tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2)

Convert `tags` in place from BIOES to BIO2.

Tags starting with `O` are rewritten to `"O"`, `E-` tags become `I-` tags and `S-`
tags become `B-` tags; `B-` and `I-` tags are kept as they are. Any other leading
character raises an error ("Invalid tags").
"""
function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2)
    for idx in eachindex(tags)
        label = tags[idx]

        if label == 'O' || label[1] == 'O'
            tags[idx] = "O"
        else
            marker = label[1]
            if marker == 'E'
                # End-of-entity collapses into an ordinary inside tag.
                tags[idx] = 'I' * label[2:end]
            elseif marker == 'S'
                # A single-token entity is just a beginning tag in BIO2.
                tags[idx] = 'B' * label[2:end]
            elseif !(marker == 'B' || marker == 'I')
                error("Invalid tags")
            end
        end
    end
end

"""
    tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES)

Convert `tags` in place from BIO1 to BIOES by going through BIO2: first the
BIO1 -> BIO2 conversion is applied, then the BIO2 -> BIOES conversion.
"""
function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES)
    via = BIO2()
    tag_scheme!(tags, current_scheme, via)
    return tag_scheme!(tags, via, new_scheme)
end

"""
    tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1)

Convert `tags` in place from BIOES to BIO1 by going through BIO2: first the
BIOES -> BIO2 conversion is applied, then the BIO2 -> BIO1 conversion.
"""
function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1)
    via = BIO2()
    tag_scheme!(tags, current_scheme, via)
    return tag_scheme!(tags, via, new_scheme)
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ include("lda.jl")
include("summarizer.jl")
include("sentiment.jl")
include("bayes.jl")
include("taggingschemes.jl")
include("rouge.jl")
include("averagePerceptronTagger.jl")

Expand Down
41 changes: 41 additions & 0 deletions test/taggingschemes.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Round-trip checks: each pair of schemes is converted forward on a working copy and
# then back again, comparing against hand-written reference sequences both times.
@testset "Tagging_Schemes" begin
    @testset "BIO1 and BIO2" begin
        bio1_reference = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "I-ORG"]
        bio2_reference = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-ORG"]

        working = copy(bio1_reference)

        tag_scheme!(working, "BIO1", "BIO2")
        @test working == bio2_reference

        tag_scheme!(working, "BIO2", "BIO1")
        @test working == bio1_reference
    end

    @testset "BIO1 and BIOES" begin
        bio1_reference = [
            "I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER",
        ]
        bioes_reference = [
            "S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", "I-PER", "E-PER",
        ]

        working = copy(bio1_reference)

        tag_scheme!(working, "BIO1", "BIOES")
        @test working == bioes_reference

        tag_scheme!(working, "BIOES", "BIO1")
        @test working == bio1_reference
    end

    @testset "BIO2 and BIOES" begin
        bio2_reference = [
            "B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER",
        ]
        bioes_reference = [
            "S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", "I-PER", "E-PER",
        ]

        working = copy(bio2_reference)

        tag_scheme!(working, "BIO2", "BIOES")
        @test working == bioes_reference

        tag_scheme!(working, "BIOES", "BIO2")
        @test working == bio2_reference
    end
end

0 comments on commit dd67dcf

Please sign in to comment.