From 1dacfc58bb3348deca7b53b61d91c3a71080109e Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 16 Mar 2019 02:44:44 +0530
Subject: [PATCH 1/8] Update section under Creating Documents

---
 docs/src/documents.md | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/docs/src/documents.md b/docs/src/documents.md
index 1d0b59e2..050bfc56 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -8,7 +8,8 @@ allows one to work with documents stored in a variety of formats:
* _TokenDocument_ : A document represented as a sequence of UTF8 tokens
* _NGramDocument_ : A document represented as a bag of n-grams, which are UTF8 n-grams that map to counts

-These format represent a hierarchy: you can always move down the hierachy, but can generally not move up the hierachy. A `FileDocument` can easily become a `StringDocument`, but an `NGramDocument` cannot easily become a `FileDocument`.
+!!! note
+    These formats represent a hierarchy: you can always move down the hierarchy, but can generally not move up the hierarchy. A `FileDocument` can easily become a `StringDocument`, but an `NGramDocument` cannot easily become a `FileDocument`.

Creating any of the four basic types of documents is very easy:

@@ -55,9 +56,16 @@ NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not
For every type of document except a `FileDocument`, you can also construct a
new document by simply passing in a string of text:

-    sd = StringDocument("To be or not to be...")
-    td = TokenDocument("To be or not to be...")
-    ngd = NGramDocument("To be or not to be...")
+```julia
+julia> sd = StringDocument("To be or not to be...")
+StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+
+julia> td = TokenDocument("To be or not to be...")
+TokenDocument{String}(["To", "be", "or", "not", "to", "be..", "."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+
+julia> ngd = NGramDocument("To be or not to be...")
+NGramDocument{String}(Dict("or"=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>1,"be.."=>1,"."=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+```

The system will automatically perform tokenization or n-gramization in order
to produce the required data.
Unfortunately, `FileDocument`'s cannot be @@ -68,10 +76,19 @@ That said, there is one way around this restriction: you can use the generic `Document()` constructor function, which will guess at the type of the inputs and construct the appropriate type of document object: - Document("To be or not to be...") - Document("/usr/share/dict/words") - Document(String["To", "be", "or", "not", "to", "be..."]) - Document(Dict{String, Int}("a" => 1, "b" => 3)) +```julia +julia> Document("To be or not to be...") +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> Document("/usr/share/dict/words") +FileDocument("/usr/share/dict/words", TextAnalysis.DocumentMetadata(Languages.English(), "/usr/share/dict/words", "Unknown Author", "Unknown Time")) + +julia> Document(String["To", "be", "or", "not", "to", "be..."]) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> Document(Dict{String, Int}("a" => 1, "b" => 3)) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("b"=>3,"a"=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` This constructor is very convenient for working in the REPL, but should be avoided in permanent code because, unlike the other constructors, the return type of the `Document` function cannot be known at compile-time. From 43349ecabef24f171257656e5fa054da85c194d8 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 16 Mar 2019 11:05:59 +0530 Subject: [PATCH 2/8] Update section - functions to work with Documents --- docs/src/documents.md | 81 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/docs/src/documents.md b/docs/src/documents.md index 050bfc56..a5b3f18f 100644 --- a/docs/src/documents.md +++ b/docs/src/documents.md @@ -53,6 +53,14 @@ julia> ngd = NGramDocument(my_ngrams) NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>2), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) ``` +An NGramDocument consisting of bigrams or any higher order representation `N` +can be easily created by passing the parameter `N` to `NGramDocument` + +```julia +julia> ngd = NGramDocument("To be or not to be ...", 2) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` + For every type of document except a `FileDocument`, you can also construct a new document by simply passing in a string of text: @@ -97,32 +105,79 @@ This constructor is very convenient for working in the REPL, but should be avoid Once you've created a document object, you can work with it in many ways. The most obvious thing is to access its text using the `text()` function: - text(sd) +```julia +julia> sd = StringDocument("To be or not to be...") +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) -This function works without warnings on `StringDocument`'s and -`FileDocument`'s. 
For `TokenDocument`'s it is not possible to know if the -text can be reconstructed perfectly, so calling -`text(TokenDocument("This is text"))` will produce a warning message before -returning an approximate reconstruction of the text as it existed before -tokenization. It is entirely impossible to reconstruct the text of an -`NGramDocument`, so `text(NGramDocument("This is text"))` raises an error. +julia> text(sd) +"To be or not to be..." +``` +!!! note + This function works without warnings on `StringDocument`'s and + `FileDocument`'s. For `TokenDocument`'s it is not possible to know if the + text can be reconstructed perfectly, so calling + `text(TokenDocument("This is text"))` will produce a warning message before + returning an approximate reconstruction of the text as it existed before + tokenization. It is entirely impossible to reconstruct the text of an + `NGramDocument`, so `text(NGramDocument("This is text"))` raises an error. Instead of working with the text itself, you can work with the tokens or n-grams of a document using the `tokens()` and `ngrams()` functions: - tokens(sd) - ngrams(sd) +```julia +julia> tokens(sd) +7-element Array{String,1}: + "To" + "be" + "or" + "not" + "to" + "be.." + "." + + julia> ngrams(sd) + Dict{String,Int64} with 7 entries: + "or" => 1 + "not" => 1 + "to" => 1 + "To" => 1 + "be" => 1 + "be.." => 1 + "." => 1 +``` By default the `ngrams()` function produces unigrams. If you would like to produce bigrams or trigrams, you can specify that directly using a numeric argument to the `ngrams()` function: - ngrams(sd, 2) +```julia +julia> ngrams(sd, 2) +Dict{AbstractString,Int64} with 13 entries: + "not" => 1 + "be.." => 1 + "be or" => 1 + "or" => 1 + "not to" => 1 + "To" => 1 + "." => 1 + "be" => 1 + "To be" => 1 + "or not" => 1 + "to be.." => 1 + "be.. ." => 1 + "to" => 1 +``` If you have a `NGramDocument`, you can determine whether an `NGramDocument` contains unigrams, bigrams or a higher-order representation using the `ngram_complexity()` function: - ngram_complexity(ngd) +```julia +julia> ngd = NGramDocument("To be or not to be ...", 2) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> ngram_complexity(ngd) +2 +``` This information is not available for other types of `Document` objects because it is possible to produce any level of complexity when constructing @@ -234,7 +289,7 @@ These special classes can all be removed using specially-named parameters: * `prepare!(sd, strip_html_tags)` These functions use words lists, so they are capable of working for many -different languages without change, also these operations can be combined +different languages without change, also these operations can be combined together for improved performance: * `prepare!(sd, strip_articles| strip_numbers| strip_html_tags)` From b5e69222d2f0ddd3f5175e12e5fa0b2eb11d5ede Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 16 Mar 2019 15:44:27 +0530 Subject: [PATCH 3/8] Finish updating documents.md --- docs/src/corpus.md | 25 ++++++++++++ docs/src/documents.md | 91 +++++++++++++++++++++++++------------------ 2 files changed, 79 insertions(+), 37 deletions(-) diff --git a/docs/src/corpus.md b/docs/src/corpus.md index 876027c8..333fc7e2 100644 --- a/docs/src/corpus.md +++ b/docs/src/corpus.md @@ -83,3 +83,28 @@ corpus. 
The easiest way to do this is to convert a
`Corpus` object into a `DataFrame`:

    convert(DataFrame, crps)
+
+## Corpus Metadata
+
+You can also retrieve the metadata for every document in a `Corpus` at once:
+
+    languages(crps)
+    names(crps)
+    authors(crps)
+    timestamps(crps)
+
+It is possible to change the metadata fields for each document in a `Corpus`.
+These functions use the same metadata value for every document:
+
+    languages!(crps, Languages.German())
+    names!(crps, "")
+    authors!(crps, "Me")
+    timestamps!(crps, "Now")
+
+Additionally, you can specify the metadata fields for each document in
+a `Corpus` individually:
+
+    languages!(crps, [Languages.German(), Languages.English()])
+    names!(crps, ["", "Untitled"])
+    authors!(crps, ["Ich", "You"])
+    timestamps!(crps, ["Unbekannt", "2018"])
diff --git a/docs/src/documents.md b/docs/src/documents.md
index a5b3f18f..9e5736d0 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -197,41 +197,39 @@ including the following pieces of information:

Try these functions out on a `StringDocument` to see how the defaults work
in practice:

-    language(sd)
-    name(sd)
-    author(sd)
-    timestamp(sd)
+```julia
+julia> sd = StringDocument("This document has too foo words")
+StringDocument{String}("This document has too foo words", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))

-If you need reset these fields, you can use the mutating versions of the same
-functions:
+julia> language(sd)
+Languages.English()

-    language!(sd, Languages.Spanish())
-    name!(sd, "El Cid")
+julia> name(sd)
+"Unnamed Document"

-    author!(sd, "Desconocido")
-    timestamp!(sd, "Desconocido")
+julia> author(sd)
+"Unknown Author"

-You can also retrieve the metadata for every document in a `Corpus` at once:
+julia> timestamp(sd)
+"Unknown Time"
+```

-    languages(crps)
-    names(crps)
-    authors(crps)
-    timestamps(crps)
+If you need to reset these fields, you can use the mutating versions of the same
+functions:

-It is possible to change the metadata fields for each document in a `Corpus`.
-These functions use the same metadata value for every document:
+```julia
+julia> language!(sd, Languages.Spanish())
+Languages.Spanish()

-    languages!(crps, Languages.German())
-    names!(crps, "")
-    authors!(crps, "Me")
-    timestamps!(crps, "Now")
+julia> name!(sd, "El Cid")
+"El Cid"

-Additionally, you can specify the metadata fields for each document in
-a `Corpus` individually:
+julia> author!(sd, "Desconocido")
+"Desconocido"

-    languages!(crps, [Languages.German(), Languages.English()])
-    names!(crps, ["", "Untitled"])
-    authors!(crps, ["Ich", "You"])
-    timestamps!(crps, ["Unbekannt", "2018"])
+julia> timestamp!(sd, "Desconocido")
+"Desconocido"
+```

## Preprocessing Documents

@@ -248,21 +246,33 @@ to process automatically. For example, our sample text sentence taken from Hamlet
has three periods that we might like to discard. We can remove this kind of
punctuation using the `prepare!()` function:

-    prepare!(sd, strip_punctuation)
+```julia
+julia> str = StringDocument("here are some punctuations !!!...")
+
+julia> prepare!(str, strip_punctuation)
+
+julia> str
+StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+```

-Like punctuation, numbers and case distinctions are often easier removed than
-dealt with.
To remove numbers or case distinctions, use the -`remove_numbers!()` and `remove_case!()` functions: +julia> prepare!(str, strip_punctuation) - remove_numbers!(sd) - remove_case!(sd) +julia> str +StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` -At times you'll want to remove specific words from a document like a person's +* To case distinctions, use `remove_case!()` function: +* At times you'll want to remove specific words from a document like a person's name. To do that, use the `remove_words!()` function: - sd = StringDocument("Lear is mad") - remove_words!(sd, ["Lear"]) +```julia +julia> sd = StringDocument("Lear is mad") +StringDocument{String}("Lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> remove_case!(sd) + +julia> sd +StringDocument{String}("lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> remove_words!(sd, ["lear"]) +julia> sd +StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` At other times, you'll want to remove whole classes of words. To make this easier, we can use several classes of basic words defined by the Languages.jl package: @@ -298,4 +308,11 @@ closely related like "dog" and "dogs" and stem them in order to produce a smaller set of words for analysis. We can do this using the `stem!()` function: - stem!(sd) +```julia +julia> sd = StringDocument("Foo writes and foo bar write") + +julia> stem!(sd) + +julia> sd +StringDocument{String}("Foo write and foo bar write", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` From 43cde898998466f85c37387374615a56a33030a0 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 17 Mar 2019 01:43:50 +0530 Subject: [PATCH 4/8] Update corpus.md --- docs/src/corpus.md | 100 ++++++++++++++++++++++++++++++++---------- docs/src/documents.md | 2 +- 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/docs/src/corpus.md b/docs/src/corpus.md index 333fc7e2..5178ea91 100644 --- a/docs/src/corpus.md +++ b/docs/src/corpus.md @@ -4,35 +4,54 @@ Working with isolated documents gets boring quickly. We typically want to work with a collection of documents. We represent collections of documents using the Corpus type: - crps = Corpus([StringDocument("Document 1"), - StringDocument("Document 2")]) +```julia +julia> crps = Corpus([StringDocument("Document 1"), + StringDocument("Document 2")]) +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document 1", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document 2", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` ## Standardizing a Corpus -A `Corpus` may contain many different types of documents: +!!! note + It is recommended to read about `Documents` types and their heirarchy in the package. 
- crps = Corpus([StringDocument("Document 1"), - TokenDocument("Document 2"), - NGramDocument("Document 3")]) +A `Corpus` may contain many different types of documents: +```julia +julia> crps = Corpus([StringDocument("Document 1"), + TokenDocument("Document 2"), + NGramDocument("Document 3")]) +Corpus{AbstractDocument}(AbstractDocument[StringDocument{String}("Document 1", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), TokenDocument{String}(["Document", "2"], DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` It is generally more convenient to standardize all of the documents in a corpus using a single type. This can be done using the `standardize!` function: - standardize!(crps, NGramDocument) +```julia +julia> standardize!(crps, NGramDocument) +``` After this step, you can check that the corpus only contains `NGramDocument`'s: - crps +```julia +julia> crps +Corpus{AbstractDocument}(AbstractDocument[NGramDocument{String}(Dict("1"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("2"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` ## Processing a Corpus We can apply the same sort of preprocessing steps that are defined for individual documents to an entire corpus at once: - crps = Corpus([StringDocument("Document 1"), - StringDocument("Document 2")]) - remove_punctuation!(crps) +```julia +julia> crps = Corpus([StringDocument("Document !!1"), + StringDocument("Document 2!!")]) + +julia> prepare!(crps, strip_punctuation) + +julia> crps +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` These operations are run on each document in the corpus individually. 
@@ -47,34 +66,71 @@ In particular, we want to work with two constructs:
Because computations involving the lexicon can take a long time, a `Corpus`'s
default lexicon is blank:

-    lexicon(crps)
+```julia
+julia> julia> crps = Corpus([StringDocument("Name Foo"),
+                             StringDocument("Name Bar")])
+julia> lexicon(crps)
+Dict{String,Int64} with 0 entries
+```

In order to work with the lexicon, you have to update it and then access it:

-    update_lexicon!(crps)
-    lexicon(crps)
+```julia
+julia> update_lexicon!(crps)
+
+julia> lexicon(crps)
+Dict{String,Int64} with 3 entries:
+  "Bar"  => 1
+  "Foo"  => 1
+  "Name" => 2
+```

But once this work is done, you can easily address lots of interesting
questions about a corpus:
+```julia
+julia> lexical_frequency(crps, "Name")
+0.5

-    lexical_frequency(crps, "Summer")
-    lexical_frequency(crps, "Document")
+julia> lexical_frequency(crps, "Foo")
+0.25
+```

Like the lexicon, the inverse index for a corpus is blank by default:

-    inverse_index(crps)
+```julia
+julia> inverse_index(crps)
+Dict{String,Array{Int64,1}} with 0 entries
+```

Again, you need to update it before you can work with it:

-    update_inverse_index!(crps)
-    inverse_index(crps)
+```julia
+julia> update_inverse_index!(crps)
+
+julia> inverse_index(crps)
+Dict{String,Array{Int64,1}} with 3 entries:
+  "Bar"  => [2]
+  "Foo"  => [1]
+  "Name" => [1, 2]
+```

But once you've updated the inverse index, you can easily search the entire
corpus:

-    crps["Document"]
-    crps["1"]
-    crps["Summer"]
+```julia
+julia> crps["Name"]
+
+2-element Array{Int64,1}:
+ 1
+ 2
+
+julia> crps["Foo"]
+1-element Array{Int64,1}:
+ 1
+
+julia> crps["Summer"]
+0-element Array{Int64,1}
+```

## Converting a DataFrame from a Corpus
diff --git a/docs/src/documents.md b/docs/src/documents.md
index 9e5736d0..df17f183 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -255,7 +255,7 @@ julia> str
StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
```

-* To case distinctions, use `remove_case!()` function:
+* To remove case distinctions, use `remove_case!()` function:
* At times you'll want to remove specific words from a document like a person's
name.
To do that, use the `remove_words!()` function: From 9a85a9026366236bb58345c0cc30ce9cb0b24b68 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 17 Mar 2019 14:57:42 +0530 Subject: [PATCH 5/8] Finish update for corpus.md --- docs/src/corpus.md | 57 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/docs/src/corpus.md b/docs/src/corpus.md index 5178ea91..c9af53ef 100644 --- a/docs/src/corpus.md +++ b/docs/src/corpus.md @@ -44,13 +44,13 @@ We can apply the same sort of preprocessing steps that are defined for individual documents to an entire corpus at once: ```julia -julia> crps = Corpus([StringDocument("Document !!1"), - StringDocument("Document 2!!")]) +julia> crps = Corpus([StringDocument("Document ..!!"), + StringDocument("Document ..!!")]) julia> prepare!(crps, strip_punctuation) julia> crps -Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` These operations are run on each document in the corpus individually. @@ -67,7 +67,7 @@ Because computations involving the lexicon can take a long time, a `Corpus`'s default lexicon is blank: ```julia -julia> julia> crps = Corpus([StringDocument("Name Foo"), +julia> crps = Corpus([StringDocument("Name Foo"), StringDocument("Name Bar")]) julia> lexicon(crps) Dict{String,Int64} with 0 entries @@ -144,23 +144,46 @@ a `DataFrame`: You can also retrieve the metadata for every document in a `Corpus` at once: - languages(crps) - names(crps) - authors(crps) - timestamps(crps) +```julia +julia> crps = Corpus([StringDocument("Name Foo"), + StringDocument("Name Bar")]) + +julia> languages(crps) +2-element Array{Languages.English,1}: + Languages.English() + Languages.English() + +julia> names(crps) +2-element Array{String,1}: + "Unnamed Document" + "Unnamed Document" + +julia> authors(crps) +2-element Array{String,1}: + "Unknown Author" + "Unknown Author" + +julia> timestamps(crps) +2-element Array{String,1}: + "Unknown Time" + "Unknown Time" +``` It is possible to change the metadata fields for each document in a `Corpus`. 
These functions use the same metadata value for every document:

-    languages!(crps, Languages.German())
-    names!(crps, "")
-    authors!(crps, "Me")
-    timestamps!(crps, "Now")
+```julia
+julia> languages!(crps, Languages.German())
+julia> names!(crps, "")
+julia> authors!(crps, "Me")
+julia> timestamps!(crps, "Now")
+```

Additionally, you can specify the metadata fields for each document in
a `Corpus` individually:

-    languages!(crps, [Languages.German(), Languages.English()])
-    names!(crps, ["", "Untitled"])
-    authors!(crps, ["Ich", "You"])
-    timestamps!(crps, ["Unbekannt", "2018"])
+```julia
+julia> languages!(crps, [Languages.German(), Languages.English()])
+julia> names!(crps, ["", "Untitled"])
+julia> authors!(crps, ["Ich", "You"])
+julia> timestamps!(crps, ["Unbekannt", "2018"])
+```

From 29405d0be531d57c9182b31cc9add7b6eb9715cc Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Mon, 18 Mar 2019 13:56:10 +0530
Subject: [PATCH 6/8] Update docs/features.md

---
 docs/src/corpus.md   |  5 +++
 docs/src/features.md | 91 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/docs/src/corpus.md b/docs/src/corpus.md
index c9af53ef..5fa6a597 100644
--- a/docs/src/corpus.md
+++ b/docs/src/corpus.md
@@ -144,6 +144,11 @@ a `DataFrame`:

You can also retrieve the metadata for every document in a `Corpus` at once:

+* `languages()`: What language is the document in? Defaults to `Languages.English()`, a Language instance defined by the Languages package.
+* `names()`: What is the name of the document? Defaults to `"Unnamed Document"`.
+* `authors()`: Who wrote the document? Defaults to `"Unknown Author"`.
+* `timestamps()`: When was the document written? Defaults to `"Unknown Time"`.
+
```julia
julia> crps = Corpus([StringDocument("Name Foo"),
diff --git a/docs/src/features.md b/docs/src/features.md
index cda25231..481fcf41 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -4,18 +4,53 @@
Often we want to represent documents as a matrix of word counts so that we can
apply linear algebra operations and statistical techniques. Before we do this,
we need to update the lexicon:

-    update_lexicon!(crps)
-    m = DocumentTermMatrix(crps)
+```julia
+julia> crps = Corpus([StringDocument("To be or not to be"),
+                      StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+DocumentTermMatrix(
+  [1, 1]  =  1
+  [2, 1]  =  1
+  [1, 2]  =  2
+  [2, 3]  =  2
+  [1, 4]  =  1
+  [2, 4]  =  1
+  [1, 5]  =  1
+  [2, 5]  =  1
+  [1, 6]  =  1
+  [2, 6]  =  1, ["To", "be", "become", "not", "or", "to"], Dict("or"=>5,"not"=>4,"to"=>6,"To"=>1,"be"=>2,"become"=>3))
+```

A `DocumentTermMatrix` object is a special type. If you would like to use a
simple sparse matrix, call `dtm()` on this object:

-    dtm(m)
+```julia
+julia> dtm(m)
+2×6 SparseArrays.SparseMatrixCSC{Int64,Int64} with 10 stored entries:
+  [1, 1]  =  1
+  [2, 1]  =  1
+  [1, 2]  =  2
+  [2, 3]  =  2
+  [1, 4]  =  1
+  [2, 4]  =  1
+  [1, 5]  =  1
+  [2, 5]  =  1
+  [1, 6]  =  1
+  [2, 6]  =  1
+```

If you would like to use a dense matrix instead, you can pass this as an
argument to the `dtm` function:

-    dtm(m, :dense)
+```julia
+julia> dtm(m, :dense)
+2×6 Array{Int64,2}:
+ 1  2  0  1  1  1
+ 1  0  2  1  1  1
+```

## Creating Individual Rows of a Document Term Matrix

In many cases, we don't want to build a DTM for an entire corpus: we can
make do with just a single row. You can get this using the `dtv` function.
Because individual documents do not have a lexicon associated with them, we
have to pass in a lexicon as an additional argument:

-    dtv(crps[1], lexicon(crps))
+```julia
+julia> dtv(crps[1], lexicon(crps))
+1×6 Array{Int64,2}:
+ 1  2  0  1  1  1
+```

## The Hash Trick

The need to create a lexicon before we can construct a document term matrix is
often prohibitive. We can work around this by replacing the lexicon with a hash
function that outputs integers from 1 to N. To construct such a hash function,
you can use the `TextHashFunction(N)` constructor:

-    h = TextHashFunction(10)
+```julia
+julia> h = TextHashFunction(10)
+TextHashFunction(hash, 10)
+```

You can see how this function maps strings to numbers by calling the
`index_hash` function:

-    index_hash("a", h)
-    index_hash("b", h)
+```julia
+julia> index_hash("a", h)
+8
+
+julia> index_hash("b", h)
+7
+```

Using a text hash function, we can represent a document as a vector with N
entries by calling the `hash_dtv` function:

-    hash_dtv(crps[1], h)
+```julia
+julia> hash_dtv(crps[1], h)
+1×10 Array{Int64,2}:
+ 0  2  0  0  1  3  0  0  0  0
+```

This can be done for a corpus as a whole to construct a DTM without defining
a lexicon in advance:

-    hash_dtm(crps, h)
+```julia
+julia> hash_dtm(crps, h)
+2×10 Array{Int64,2}:
+ 0  2  0  0  1  3  0  0  0  0
+ 0  2  0  0  1  1  0  0  2  0
+```

Every corpus has a hash function built-in, so this function can be called
using just one argument:

-    hash_dtm(crps)
+```julia
+julia> hash_dtm(crps)
+2×100 Array{Int64,2}:
+ 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
+ 0  0  0  0  0  0  0  0  2  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
+```

Moreover, if you do not specify a hash function for just one row of the hash
DTM, a default hash function will be constructed for you:

-    hash_dtv(crps[1])
+```julia
+julia> hash_dtv(crps[1])
+1×100 Array{Int64,2}:
+ 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
+```

-## TF-IDF
+## TF-IDF (Term Frequency - Inverse Document Frequency)

In many cases, raw word counts are not appropriate for use because:
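The workaround, per the final patch in this series, is TF-IDF weighting on a `DocumentTermMatrix`; a minimal sketch, reusing the `m` built above and assuming `tf_idf` returns a sparse matrix of weights with the same shape as `dtm(m)`:

```julia
julia> weights = tf_idf(m)   # assumed: sparse Float64 matrix of TF-IDF weights

julia> Matrix(weights)       # densify to inspect; zeros appear for words present in every document
```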
From 121ea8f58c905d4120189354b0e80e6fb20dd3a7 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Wed, 20 Mar 2019 18:24:52 +0530
Subject: [PATCH 7/8] Update semantic.md

---
 docs/src/corpus.md   |  3 ---
 docs/src/semantic.md | 38 +++++++++++++++++++++++++++++++-------
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/docs/src/corpus.md b/docs/src/corpus.md
index 5fa6a597..e73a96e9 100644
--- a/docs/src/corpus.md
+++ b/docs/src/corpus.md
@@ -12,9 +12,6 @@

## Standardizing a Corpus

-!!! note
-    It is recommended to read about `Documents` types and their heirarchy in the package.
-
A `Corpus` may contain many different types of documents:

diff --git a/docs/src/semantic.md b/docs/src/semantic.md
index cfd47c43..f9dc957a 100644
--- a/docs/src/semantic.md
+++ b/docs/src/semantic.md
@@ -11,11 +11,35 @@ Analysis or LSA on the corpus. You can do this using the `lsa` function:

Another way to get a handle on the semantic content of a corpus is to use
[Latent Dirichlet Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation):

-    m = DocumentTermMatrix(crps)
-    k = 2            # number of topics
-    iteration = 1000 # number of gibbs sampling iterations
-    alpha = 0.1      # hyper parameter
-    beta = 0.1       # hyber parameter
-    ϕ, θ = lda(m, k, iteration, alpha, beta) # ϕ is k x word matrix.
-                     # value is probablity of occurrence of a word in a topic.
+First we need to produce the `DocumentTermMatrix`:
+```julia
+julia> crps = Corpus([StringDocument("This is the Foo Bar Document"), StringDocument("This document has too Foo words")])
+julia> update_lexicon!(crps)
+julia> m = DocumentTermMatrix(crps)
+```
+
+Latent Dirichlet Allocation has two hyperparameters:
+* _α_ : The hyperparameter for topic distribution per document. `α<1` yields a sparse topic mixture for each document. `α>1` yields a more uniform topic mixture for each document.
+* _β_ : The hyperparameter for word distribution per topic. `β<1` yields a sparse word mixture for each topic. `β>1` yields a more uniform word mixture for each topic.
+
+```julia
+julia> k = 2               # number of topics
+julia> iterations = 1000   # number of gibbs sampling iterations
+
+julia> α = 0.1             # hyper parameter
+julia> β  = 0.1            # hyper parameter
+
+julia> ϕ, θ = lda(m, k, iterations, α, β)
+(
+  [2 ,  1]  =  0.333333
+  [2 ,  2]  =  0.333333
+  [1 ,  3]  =  0.222222
+  [1 ,  4]  =  0.222222
+  [1 ,  5]  =  0.111111
+  [1 ,  6]  =  0.111111
+  [1 ,  7]  =  0.111111
+  [2 ,  8]  =  0.333333
+  [1 ,  9]  =  0.111111
+  [1 , 10]  =  0.111111, [0.5 1.0; 0.5 0.0])
+```

See `?lda` for more help.

From f4b5561664e45ecb04d668f80cb707e5d6603b50 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Wed, 20 Mar 2019 19:39:39 +0530
Subject: [PATCH 8/8] Change name -> title in docs

---
 docs/src/corpus.md    | 20 ++++++++++----------
 docs/src/documents.md | 44 +++++++++++++++++++++----------------------
 docs/src/features.md  | 12 ++++++++++++
 src/TextAnalysis.jl   | 10 +++++-----
 src/corpus.jl         |  4 ++--
 src/document.jl       |  6 +++---
 src/metadata.jl       | 14 +++++++-------
 src/sentiment.jl      | 33 ++++++++++++++++++++++------
 src/show.jl           |  2 +-
 src/summarizer.jl     |  1 -
 test/metadata.jl      | 16 ++++++++--------
 test/sentiment.jl     | 21 +++++++++++++++++++++
 12 files changed, 118 insertions(+), 65 deletions(-)

diff --git a/docs/src/corpus.md b/docs/src/corpus.md
index e73a96e9..4e3f7da7 100644
--- a/docs/src/corpus.md
+++ b/docs/src/corpus.md
@@ -7,7 +7,7 @@ using the Corpus type:
```julia
julia> crps = Corpus([StringDocument("Document 1"),
                      StringDocument("Document 2")])
-Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document 1", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document 2", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(),
Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{AbstractDocument}(AbstractDocument[StringDocument{String}("Document 1", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), TokenDocument{String}(["Document", "2"], DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` It is generally more convenient to standardize all of the documents in a corpus using a single type. This can be done using the `standardize!` @@ -32,7 +32,7 @@ After this step, you can check that the corpus only contains `NGramDocument`'s: ```julia julia> crps -Corpus{AbstractDocument}(AbstractDocument[NGramDocument{String}(Dict("1"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("2"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{AbstractDocument}(AbstractDocument[NGramDocument{String}(Dict("1"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("2"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` ## Processing a Corpus @@ -47,7 +47,7 @@ julia> crps = Corpus([StringDocument("Document ..!!"), julia> prepare!(crps, strip_punctuation) julia> crps -Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` These operations are run on each document in the corpus individually. @@ -142,7 +142,7 @@ a `DataFrame`: You can also retrieve the metadata for every document in a `Corpus` at once: * `languages()`: What language is the document in? Defaults to `Languages.English()`, a Language instance defined by the Languages package. -* `names()`: What is the name of the document? Defaults to `"Unnamed Document"`. +* `titles()`: What is the title of the document? Defaults to `"Untitled Document"`. * `authors()`: Who wrote the document? Defaults to `"Unknown Author"`. * `timestamps()`: When was the document written? Defaults to `"Unknown Time"`. 
@@ -155,10 +155,10 @@ julia> languages(crps) Languages.English() Languages.English() -julia> names(crps) +julia> titles(crps) 2-element Array{String,1}: - "Unnamed Document" - "Unnamed Document" + "Untitled Document" + "Untitled Document" julia> authors(crps) 2-element Array{String,1}: @@ -176,7 +176,7 @@ These functions use the same metadata value for every document: ```julia julia> languages!(crps, Languages.German()) -julia> names!(crps, "") +julia> titles!(crps, "") julia> authors!(crps, "Me") julia> timestamps!(crps, "Now") ``` @@ -185,7 +185,7 @@ a `Corpus` individually: ```julia julia> languages!(crps, [Languages.German(), Languages.English -julia> names!(crps, ["", "Untitled"]) +julia> titles!(crps, ["", "Untitled"]) julia> authors!(crps, ["Ich", "You"]) julia> timestamps!(crps, ["Unbekannt", "2018"]) ``` diff --git a/docs/src/documents.md b/docs/src/documents.md index df17f183..7ba964f7 100644 --- a/docs/src/documents.md +++ b/docs/src/documents.md @@ -18,7 +18,7 @@ julia> str = "To be or not to be..." "To be or not to be..." julia> sd = StringDocument(str) -StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> pathname = "/usr/share/dict/words" "/usr/share/dict/words" @@ -36,7 +36,7 @@ julia> my_tokens = String["To", "be", "or", "not", "to", "be..."] "be..." julia> td = TokenDocument(my_tokens) -TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> my_ngrams = Dict{String, Int}("To" => 1, "be" => 2, "or" => 1, "not" => 1, @@ -50,7 +50,7 @@ Dict{String,Int64} with 6 entries: "be" => 2 julia> ngd = NGramDocument(my_ngrams) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>2), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>2), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` An NGramDocument consisting of bigrams or any higher order representation `N` @@ -58,7 +58,7 @@ can be easily created by passing the parameter `N` to `NGramDocument` ```julia julia> ngd = NGramDocument("To be or not to be ...", 2) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` For every type of document except a `FileDocument`, you can also construct a @@ -66,13 +66,13 @@ new document by simply passing in a string of text: ```julia julia> sd = StringDocument("To be or not to be...") 
-StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> td = TokenDocument("To be or not to be...") -TokenDocument{String}(["To", "be", "or", "not", "to", "be..", "."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..", "."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> ngd = NGramDocument("To be or not to be...") -NGramDocument{String}(Dict("or"=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>1,"be.."=>1,"."=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{String}(Dict("or"=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>1,"be.."=>1,"."=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` The system will automatically perform tokenization or n-gramization in order @@ -86,16 +86,16 @@ and construct the appropriate type of document object: ```julia julia> Document("To be or not to be...") -StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> Document("/usr/share/dict/words") FileDocument("/usr/share/dict/words", TextAnalysis.DocumentMetadata(Languages.English(), "/usr/share/dict/words", "Unknown Author", "Unknown Time")) julia> Document(String["To", "be", "or", "not", "to", "be..."]) -TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> Document(Dict{String, Int}("a" => 1, "b" => 3)) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("b"=>3,"a"=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("b"=>3,"a"=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` This constructor is very convenient for working in the REPL, but should be avoided in permanent code because, unlike the other constructors, the return type of the `Document` function cannot be known at compile-time. @@ -107,7 +107,7 @@ most obvious thing is to access its text using the `text()` function: ```julia julia> sd = StringDocument("To be or not to be...") -StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> text(sd) "To be or not to be..." 
@@ -173,7 +173,7 @@ contains unigrams, bigrams or a higher-order representation using the `ngram_com ```julia julia> ngd = NGramDocument("To be or not to be ...", 2) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> ngram_complexity(ngd) 2 @@ -190,7 +190,7 @@ document, every document object also stores basic metadata about itself, including the following pieces of information: * `language()`: What language is the document in? Defaults to `Languages.English()`, a Language instance defined by the Languages package. -* `name()`: What is the name of the document? Defaults to `"Unnamed Document"`. +* `title()`: What is the title of the document? Defaults to `"Untitled Document"`. * `author()`: Who wrote the document? Defaults to `"Unknown Author"`. * `timestamp()`: When was the document written? Defaults to `"Unknown Time"`. @@ -199,13 +199,13 @@ in practice: ```julia julia> sd = StringDocument("This document has too foo words") -StringDocument{String}("This document has too foo words", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("This document has too foo words", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> language(sd) Languages.English() -julia> name(sd) -"Unnamed Document" +julia> title(sd) +"Untitled Document" julia> author(sd) "Unknown Author" @@ -221,7 +221,7 @@ functions: julia> language!(sd, Languages.Spanish()) Languages.Spanish() -julia> name!(sd, "El Cid") +julia> title!(sd, "El Cid") "El Cid" julia> author!(sd, "Desconocido") @@ -252,7 +252,7 @@ julia> str = StringDocument("here are some punctuations !!!...") julia> prepare!(str, strip_punctuation) julia> str -StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` * To remove case distinctions, use `remove_case!()` function: @@ -261,17 +261,17 @@ name. 
To do that, use the `remove_words!()` function:

```julia
julia> sd = StringDocument("Lear is mad")
-StringDocument{String}("Lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}("Lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))

julia> remove_case!(sd)

julia> sd
-StringDocument{String}("lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}("lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))

julia> remove_words!(sd, ["lear"])
julia> sd
-StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
```

julia> sd = StringDocument("Foo writes and foo bar write")

julia> stem!(sd)

julia> sd
-StringDocument{String}("Foo write and foo bar write", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}("Foo write and foo bar write", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
```
diff --git a/docs/src/features.md b/docs/src/features.md
index 481fcf41..a6813a84 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -141,3 +141,15 @@ You can work around this by performing TF-IDF on a DocumentTermMatrix:

As you can see, TF-IDF has the effect of inserting 0's into the columns of
words that occur in all documents. This is a useful way to avoid having to
remove those words during preprocessing.
+
+## Sentiment Analyzer
+
+The Sentiment Analyzer can be used to find the sentiment score (between 0 and 1) of a word, sentence or a document.
+A Flux model trained on the IMDB word corpus, with pre-trained weights saved in the package, is used to calculate the sentiment.
+
+    model = SentimentAnalyzer(doc)
+    model = SentimentAnalyzer(doc, handle_unknown)
+
+* doc = Input document whose sentiment is to be calculated (`AbstractDocument` type)
+* handle_unknown = A function for handling unknown words. Should return an iterable of replacement words (default `x -> tuple()`, which skips them)
+
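A minimal usage sketch of the API documented above. Assumptions, hedged: a no-argument `SentimentAnalyzer()` constructor loads the saved weights (as suggested by `src/sentiment.jl` below), scores above 0.5 read as positive, and exact scores depend on the trained model, so outputs are omitted. The lowercasing `handle_unknown` follows the pattern used in `test/sentiment.jl` at the end of this patch:

```julia
julia> using TextAnalysis

julia> model = SentimentAnalyzer()      # assumed no-argument constructor; loads the Flux weights

julia> d = StringDocument("a very nice thing that everyone likes")

julia> model(d)                         # score in (0, 1); > 0.5 suggests positive sentiment

julia> model(d, x -> [lowercase(x)])    # retry out-of-vocabulary words in lowercase
```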
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 08127aa3..db0bcd2c 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -4,7 +4,7 @@ module TextAnalysis
    using SparseArrays
    using Printf
    using LinearAlgebra
-
+
    using Languages
    using DataFrames
    using WordTokenizers
@@ -21,10 +21,10 @@ module TextAnalysis
    export text, tokens, ngrams
    export text!, tokens!, ngrams!
    export documents
-    export language, name, author, timestamp
-    export languages, names, authors, timestamps
-    export language!, name!, author!, timestamp!
-    export languages!, names!, authors!, timestamps!
+    export language, title, author, timestamp
+    export languages, titles, authors, timestamps
+    export language!, title!, author!, timestamp!
+    export languages!, titles!, authors!, timestamps!
    export ngram_complexity
    export lexicon, update_lexicon!, lexical_frequency, lexicon_size
    export inverse_index, update_inverse_index!, index_size
diff --git a/src/corpus.jl b/src/corpus.jl
index bd94f410..85ad1c10 100644
--- a/src/corpus.jl
+++ b/src/corpus.jl
@@ -79,7 +79,7 @@ function Base.convert(::Type{DataFrame}, crps::Corpus)
    df = DataFrame()
    n = length(crps)
    df[:Language] = Array{Union{String,Missing}}(n)
-    df[:Name] = Array{Union{String,Missing}}(n)
+    df[:Title] = Array{Union{String,Missing}}(n)
    df[:Author] = Array{Union{String,Missing}}(n)
    df[:TimeStamp] = Array{Union{String,Missing}}(n)
    df[:Length] = Array{Union{Int,Missing}}(n)
@@ -87,7 +87,7 @@
    for i in 1:n
        d = crps.documents[i]
        df[i, :Language] = string(language(d))
-        df[i, :Name] = name(d)
+        df[i, :Title] = title(d)
        df[i, :Author] = author(d)
        df[i, :TimeStamp] = timestamp(d)
        df[i, :Length] = length(d)
diff --git a/src/document.jl b/src/document.jl
index aeaedaf5..2657b68b 100644
--- a/src/document.jl
+++ b/src/document.jl
@@ -6,13 +6,13 @@

mutable struct DocumentMetadata
    language
-    name::String
+    title::String
    author::String
    timestamp::String
end
DocumentMetadata() = DocumentMetadata(
    Languages.English(),
-    "Unnamed Document",
+    "Untitled Document",
    "Unknown Author",
    "Unknown Time"
)

@@ -38,7 +38,7 @@ end

function FileDocument(f::AbstractString)
    d = FileDocument(String(f), DocumentMetadata())
-    d.metadata.name = f
+    d.metadata.title = f
    return d
end
diff --git a/src/metadata.jl b/src/metadata.jl
index 5ba94a4a..f8700570 100644
--- a/src/metadata.jl
+++ b/src/metadata.jl
@@ -6,13 +6,13 @@

import Languages.name

-name(d::AbstractDocument) = d.metadata.name
+title(d::AbstractDocument) = d.metadata.title
language(d::AbstractDocument) = d.metadata.language
author(d::AbstractDocument) = d.metadata.author
timestamp(d::AbstractDocument) = d.metadata.timestamp

-function name!(d::AbstractDocument, nv::AbstractString)
-    d.metadata.name = nv
+function title!(d::AbstractDocument, nv::AbstractString)
+    d.metadata.title = nv
end

function language!(d::AbstractDocument, nv::T) where T <: Language
@@ -33,20 +33,20 @@ end

-names(c::Corpus) = map(d -> name(d), documents(c))
+titles(c::Corpus) = map(d -> title(d), documents(c))
languages(c::Corpus) = map(d -> language(d), documents(c))
authors(c::Corpus) = map(d -> author(d), documents(c))
timestamps(c::Corpus) = map(d -> timestamp(d), documents(c))

-names!(c::Corpus, nv::AbstractString) = name!.(documents(c), nv)
+titles!(c::Corpus, nv::AbstractString) = title!.(documents(c), nv)
languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), Ref(nv)) #Ref to force scalar broadcast
authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), Ref(nv))
timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), Ref(nv))

-function names!(c::Corpus, nvs::Vector{String})
+function titles!(c::Corpus, nvs::Vector{String})
    length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
    for (i, d) in pairs(IndexLinear(), documents(c))
-        name!(d, nvs[i])
+        title!(d, nvs[i])
    end
end
diff --git a/src/sentiment.jl b/src/sentiment.jl
index f9c980b4..91ec5804 100644
--- a/src/sentiment.jl
+++ b/src/sentiment.jl
@@ -35,7 +35,7 @@ function flatten(x)
    return reshape(x, (l, 1))
end

-function get_sentiment(ip::Array{T, 1}, weight, rwi) where T <: AbstractString
+function get_sentiment(handle_unknown, ip::Array{T, 1}, weight, rwi) where T <: AbstractString
    model = (x,) -> begin
        a_1 = embedding(weight[:embedding_1]["embedding_1"]["embeddings:0"], x)
        a_2 = flatten(a_1)
@@ -45,7 +45,16 @@ function get_sentiment(ip::Array{T, 1}, weight, rwi) where T <: AbstractString
    end
    res = Array{Int, 1}()
    for ele in ip
-        push!(res, rwi[ele])
+        if ele in keys(rwi) && rwi[ele] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2] # there are only 5000 unique embeddings
+            push!(res, rwi[ele])
+        else
+            for words in handle_unknown(ele)
+                if words in keys(rwi) && rwi[words] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2]
+                    push!(res, rwi[words])
+                end
+            end
+
+        end
    end
    return model(pad_sequences(res))[1]
end
@@ -74,10 +83,22 @@
function Base.show(io::IO, s::SentimentAnalyzer)
end

-function(m::SentimentModel)(text::Array{T, 1}) where T <: AbstractString
-    return get_sentiment(text, m.weight, m.words)
+function(m::SentimentModel)(handle_unknown, text::Array{T, 1}) where T <: AbstractString
+    return get_sentiment(handle_unknown, text, m.weight, m.words)
end

+
+"""
+    ```
+    model = SentimentAnalyzer(doc)
+    model = SentimentAnalyzer(doc, handle_unknown)
+    ```
+    Return the sentiment of the input doc as a score between 0 and 1, where 0 is the lowest
+    sentiment score and 1 is the highest:
+    - doc = Input document whose sentiment is to be calculated (AbstractDocument type)
+    - handle_unknown = A function for handling unknown words. Should return an iterable of words (default x->tuple())
+    """
+
-function(m::SentimentAnalyzer)(d::AbstractDocument)
-    m.model(tokens(d))
+function(m::SentimentAnalyzer)(d::AbstractDocument, handle_unknown = x->tuple())
+    m.model(handle_unknown, tokens(d))
end
diff --git a/src/show.jl b/src/show.jl
index 53588783..0c68094d 100644
--- a/src/show.jl
+++ b/src/show.jl
@@ -18,7 +18,7 @@ function summary(d::AbstractDocument)
    o = ""
    o *= "A $(typeof(d))\n"
    o *= " * Language: $(language(d))\n"
-    o *= " * Name: $(name(d))\n"
+    o *= " * Title: $(title(d))\n"
    o *= " * Author: $(author(d))\n"
    o *= " * Timestamp: $(timestamp(d))\n"
    if contains(Any[TokenDocument, NGramDocument], typeof(d))
diff --git a/src/summarizer.jl b/src/summarizer.jl
index 1179bafc..0f371fa5 100644
--- a/src/summarizer.jl
+++ b/src/summarizer.jl
@@ -1,4 +1,3 @@
-
function summarize(d::AbstractDocument; ns=5)
    sentences = sentence_tokenize(language(d), text(d))
    s = StringDocument.(sentences)
diff --git a/test/metadata.jl b/test/metadata.jl
index 29ee45cd..19a96e54 100644
--- a/test/metadata.jl
+++ b/test/metadata.jl
@@ -7,41 +7,41 @@
    crps = Corpus([sd1, sd2])

    # Single document metadata getters
-    @test isequal(name(sd1), "Unnamed Document")
+    @test isequal(title(sd1), "Untitled Document")
    @test isequal(language(sd1), Languages.English())
    @test isequal(author(sd1), "Unknown Author")
    @test isequal(timestamp(sd1), "Unknown Time")

    # Single document metadata setters
-    name!(sd1, "Document")
+    title!(sd1, "Document")
    language!(sd1, Languages.German())
    author!(sd1, "Author")
    timestamp!(sd1, "Time")
-    @test isequal(name(sd1), "Document")
+    @test isequal(title(sd1), "Document")
    @test isequal(language(sd1), Languages.German())
    @test isequal(author(sd1), "Author")
    @test isequal(timestamp(sd1), "Time")

    # Metadata getters for an entire corpus
-    @test isequal(TextAnalysis.names(crps), ["Document", "Unnamed Document"])
+    @test isequal(TextAnalysis.titles(crps), ["Document", "Untitled Document"])
    @test isequal(languages(crps), [Languages.German(), Languages.English()])
    @test isequal(authors(crps), ["Author", "Unknown Author"])
    @test isequal(timestamps(crps), ["Time", "Unknown Time"])

    # Metadata setters for
an entire corpus - names!(crps, "Document") + titles!(crps, "Document") languages!(crps, Languages.Spanish()) authors!(crps, "Author") timestamps!(crps, "Time") - @test isequal(TextAnalysis.names(crps), ["Document", "Document"]) + @test isequal(TextAnalysis.titles(crps), ["Document", "Document"]) @test isequal(languages(crps), [Languages.Spanish(), Languages.Spanish()]) @test isequal(authors(crps), ["Author", "Author"]) @test isequal(timestamps(crps), ["Time", "Time"]) - names!(crps, ["Unnamed Document", "Unnamed Document"]) + titles!(crps, ["Untitled Document", "Untitled Document"]) languages!(crps, [Languages.English(), Languages.English()]) authors!(crps, ["Unknown Author", "Unknown Author"]) timestamps!(crps, ["Unknown Time", "Unknown Time"]) - @test isequal(TextAnalysis.names(crps), ["Unnamed Document", "Unnamed Document"]) + @test isequal(TextAnalysis.titles(crps), ["Untitled Document", "Untitled Document"]) @test isequal(languages(crps), [Languages.English(), Languages.English()]) @test isequal(authors(crps), ["Unknown Author", "Unknown Author"]) @test isequal(timestamps(crps), ["Unknown Time", "Unknown Time"]) diff --git a/test/sentiment.jl b/test/sentiment.jl index ce6233cf..4ec131e1 100644 --- a/test/sentiment.jl +++ b/test/sentiment.jl @@ -8,4 +8,25 @@ d = StringDocument("a horrible thing that everyone hates") @test m(d) < 0.5 + + # testing default behaviour of handle_unknown + d = StringDocument("some sense and some nonSense") + + @test m(d) < 0.5 + + # testing behaviour of words which are present in dictionary but do not have embedding assigned + d = StringDocument("some sense and some duh") + + @test m(d) < 0.5 + + # testing user given handle_unknown function + d = Document("a Horrible thing that Everyone Hates") + + @test m(d, (x) -> [lowercase(x)]) < 0.5 + + # Make it throw an error when unknown word encountered + d = Document("some sense and some Hectic") + + @test_throws ErrorException m(d, (x) -> error("OOV word $x encountered")) + end