From 1dacfc58bb3348deca7b53b61d91c3a71080109e Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Sat, 16 Mar 2019 02:44:44 +0530
Subject: [PATCH 1/8] Update section under Creating Documents

---
 docs/src/documents.md | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/docs/src/documents.md b/docs/src/documents.md
index 1d0b59e2..050bfc56 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -8,7 +8,8 @@ allows one to work with documents stored in a variety of formats:
* _TokenDocument_ : A document represented as a sequence of UTF8 tokens
* _NGramDocument_ : A document represented as a bag of n-grams, which are UTF8 n-grams that map to counts

-These format represent a hierarchy: you can always move down the hierachy, but can generally not move up the hierachy. A `FileDocument` can easily become a `StringDocument`, but an `NGramDocument` cannot easily become a `FileDocument`.
+!!! note
+    These formats represent a hierarchy: you can always move down the hierarchy, but can generally not move up the hierarchy. A `FileDocument` can easily become a `StringDocument`, but an `NGramDocument` cannot easily become a `FileDocument`.

Creating any of the four basic types of documents is very easy:

@@ -55,9 +56,16 @@ NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not
For every type of document except a `FileDocument`, you can also construct a
new document by simply passing in a string of text:

-    sd = StringDocument("To be or not to be...")
-    td = TokenDocument("To be or not to be...")
-    ngd = NGramDocument("To be or not to be...")
+```julia
+julia> sd = StringDocument("To be or not to be...")
+StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+
+julia> td = TokenDocument("To be or not to be...")
+TokenDocument{String}(["To", "be", "or", "not", "to", "be..", "."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+
+julia> ngd = NGramDocument("To be or not to be...")
+NGramDocument{String}(Dict("or"=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>1,"be.."=>1,"."=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+```

The system will automatically perform tokenization or n-gramization in order
to produce the required data.
Unfortunately, `FileDocument`'s cannot be @@ -68,10 +76,19 @@ That said, there is one way around this restriction: you can use the generic `Document()` constructor function, which will guess at the type of the inputs and construct the appropriate type of document object: - Document("To be or not to be...") - Document("/usr/share/dict/words") - Document(String["To", "be", "or", "not", "to", "be..."]) - Document(Dict{String, Int}("a" => 1, "b" => 3)) +```julia +julia> Document("To be or not to be...") +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> Document("/usr/share/dict/words") +FileDocument("/usr/share/dict/words", TextAnalysis.DocumentMetadata(Languages.English(), "/usr/share/dict/words", "Unknown Author", "Unknown Time")) + +julia> Document(String["To", "be", "or", "not", "to", "be..."]) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> Document(Dict{String, Int}("a" => 1, "b" => 3)) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("b"=>3,"a"=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` This constructor is very convenient for working in the REPL, but should be avoided in permanent code because, unlike the other constructors, the return type of the `Document` function cannot be known at compile-time. From 43349ecabef24f171257656e5fa054da85c194d8 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 16 Mar 2019 11:05:59 +0530 Subject: [PATCH 2/8] Update section - functions to work with Documents --- docs/src/documents.md | 81 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/docs/src/documents.md b/docs/src/documents.md index 050bfc56..a5b3f18f 100644 --- a/docs/src/documents.md +++ b/docs/src/documents.md @@ -53,6 +53,14 @@ julia> ngd = NGramDocument(my_ngrams) NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>2), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) ``` +An NGramDocument consisting of bigrams or any higher order representation `N` +can be easily created by passing the parameter `N` to `NGramDocument` + +```julia +julia> ngd = NGramDocument("To be or not to be ...", 2) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` + For every type of document except a `FileDocument`, you can also construct a new document by simply passing in a string of text: @@ -97,32 +105,79 @@ This constructor is very convenient for working in the REPL, but should be avoid Once you've created a document object, you can work with it in many ways. The most obvious thing is to access its text using the `text()` function: - text(sd) +```julia +julia> sd = StringDocument("To be or not to be...") +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) -This function works without warnings on `StringDocument`'s and -`FileDocument`'s. 
For `TokenDocument`'s it is not possible to know if the -text can be reconstructed perfectly, so calling -`text(TokenDocument("This is text"))` will produce a warning message before -returning an approximate reconstruction of the text as it existed before -tokenization. It is entirely impossible to reconstruct the text of an -`NGramDocument`, so `text(NGramDocument("This is text"))` raises an error. +julia> text(sd) +"To be or not to be..." +``` +!!! note + This function works without warnings on `StringDocument`'s and + `FileDocument`'s. For `TokenDocument`'s it is not possible to know if the + text can be reconstructed perfectly, so calling + `text(TokenDocument("This is text"))` will produce a warning message before + returning an approximate reconstruction of the text as it existed before + tokenization. It is entirely impossible to reconstruct the text of an + `NGramDocument`, so `text(NGramDocument("This is text"))` raises an error. Instead of working with the text itself, you can work with the tokens or n-grams of a document using the `tokens()` and `ngrams()` functions: - tokens(sd) - ngrams(sd) +```julia +julia> tokens(sd) +7-element Array{String,1}: + "To" + "be" + "or" + "not" + "to" + "be.." + "." + + julia> ngrams(sd) + Dict{String,Int64} with 7 entries: + "or" => 1 + "not" => 1 + "to" => 1 + "To" => 1 + "be" => 1 + "be.." => 1 + "." => 1 +``` By default the `ngrams()` function produces unigrams. If you would like to produce bigrams or trigrams, you can specify that directly using a numeric argument to the `ngrams()` function: - ngrams(sd, 2) +```julia +julia> ngrams(sd, 2) +Dict{AbstractString,Int64} with 13 entries: + "not" => 1 + "be.." => 1 + "be or" => 1 + "or" => 1 + "not to" => 1 + "To" => 1 + "." => 1 + "be" => 1 + "To be" => 1 + "or not" => 1 + "to be.." => 1 + "be.. ." => 1 + "to" => 1 +``` If you have a `NGramDocument`, you can determine whether an `NGramDocument` contains unigrams, bigrams or a higher-order representation using the `ngram_complexity()` function: - ngram_complexity(ngd) +```julia +julia> ngd = NGramDocument("To be or not to be ...", 2) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> ngram_complexity(ngd) +2 +``` This information is not available for other types of `Document` objects because it is possible to produce any level of complexity when constructing @@ -234,7 +289,7 @@ These special classes can all be removed using specially-named parameters: * `prepare!(sd, strip_html_tags)` These functions use words lists, so they are capable of working for many -different languages without change, also these operations can be combined +different languages without change, also these operations can be combined together for improved performance: * `prepare!(sd, strip_articles| strip_numbers| strip_html_tags)` From b5e69222d2f0ddd3f5175e12e5fa0b2eb11d5ede Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sat, 16 Mar 2019 15:44:27 +0530 Subject: [PATCH 3/8] Finish updating documents.md --- docs/src/corpus.md | 25 ++++++++++++ docs/src/documents.md | 91 +++++++++++++++++++++++++------------------ 2 files changed, 79 insertions(+), 37 deletions(-) diff --git a/docs/src/corpus.md b/docs/src/corpus.md index 876027c8..333fc7e2 100644 --- a/docs/src/corpus.md +++ b/docs/src/corpus.md @@ -83,3 +83,28 @@ corpus. 
The easiest way to do this is to convert a
`Corpus` object into a `DataFrame`:

    convert(DataFrame, crps)
+
+## Corpus Metadata
+
+You can also retrieve the metadata for every document in a `Corpus` at once:
+
+    languages(crps)
+    names(crps)
+    authors(crps)
+    timestamps(crps)
+
+It is possible to change the metadata fields for each document in a `Corpus`.
+These functions use the same metadata value for every document:
+
+    languages!(crps, Languages.German())
+    names!(crps, "")
+    authors!(crps, "Me")
+    timestamps!(crps, "Now")
+
+Additionally, you can specify the metadata fields for each document in
+a `Corpus` individually:
+
+    languages!(crps, [Languages.German(), Languages.English()])
+    names!(crps, ["", "Untitled"])
+    authors!(crps, ["Ich", "You"])
+    timestamps!(crps, ["Unbekannt", "2018"])
diff --git a/docs/src/documents.md b/docs/src/documents.md
index a5b3f18f..9e5736d0 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -197,41 +197,39 @@ including the following pieces of information:

Try these functions out on a `StringDocument` to see how the defaults work
in practice:

-    language(sd)
-    name(sd)
-    author(sd)
-    timestamp(sd)
+```julia
+julia> sd = StringDocument("This document has too foo words")
+StringDocument{String}("This document has too foo words", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))

-If you need reset these fields, you can use the mutating versions of the same
-functions:
+julia> language(sd)
+Languages.English()

-    language!(sd, Languages.Spanish())
-    name!(sd, "El Cid")
+julia> name(sd)
+"Unnamed Document"

-    author!(sd, "Desconocido")
-    timestamp!(sd, "Desconocido")
+julia> author(sd)
+"Unknown Author"

-You can also retrieve the metadata for every document in a `Corpus` at once:
+julia> timestamp(sd)
+"Unknown Time"
+```

-    languages(crps)
-    names(crps)
-    authors(crps)
-    timestamps(crps)
+If you need to reset these fields, you can use the mutating versions of the same
+functions:

-It is possible to change the metadata fields for each document in a `Corpus`.
-These functions use the same metadata value for every document:
+```julia
+julia> language!(sd, Languages.Spanish())
+Languages.Spanish()

-    languages!(crps, Languages.German())
-    names!(crps, "")
-    authors!(crps, "Me")
-    timestamps!(crps, "Now")
+julia> name!(sd, "El Cid")
+"El Cid"

-Additionally, you can specify the metadata fields for each document in
-a `Corpus` individually:
+julia> author!(sd, "Desconocido")
+"Desconocido"

-    languages!(crps, [Languages.German(), Languages.English()])
-    names!(crps, ["", "Untitled"])
-    authors!(crps, ["Ich", "You"])
-    timestamps!(crps, ["Unbekannt", "2018"])
+julia> timestamp!(sd, "Desconocido")
+"Desconocido"
+```

## Preprocessing Documents

@@ -248,21 +246,33 @@ to process automatically. For example, our sample text sentence taken from Hamlet
has three periods that we might like to discard. We can remove this kind of
punctuation using the `prepare!()` function:

-    prepare!(sd, strip_punctuation)
+```julia
+julia> str = StringDocument("here are some punctuations !!!...")
+
+julia> prepare!(str, strip_punctuation)
+
+julia> str
+StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+```

-Like punctuation, numbers and case distinctions are often easier removed than
-dealt with.
To remove numbers or case distinctions, use the -`remove_numbers!()` and `remove_case!()` functions: +julia> prepare!(str, strip_punctuation) - remove_numbers!(sd) - remove_case!(sd) +julia> str +StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` -At times you'll want to remove specific words from a document like a person's +* To case distinctions, use `remove_case!()` function: +* At times you'll want to remove specific words from a document like a person's name. To do that, use the `remove_words!()` function: - sd = StringDocument("Lear is mad") - remove_words!(sd, ["Lear"]) +```julia +julia> sd = StringDocument("Lear is mad") +StringDocument{String}("Lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> remove_case!(sd) + +julia> sd +StringDocument{String}("lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) + +julia> remove_words!(sd, ["lear"]) +julia> sd +StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` At other times, you'll want to remove whole classes of words. To make this easier, we can use several classes of basic words defined by the Languages.jl package: @@ -298,4 +308,11 @@ closely related like "dog" and "dogs" and stem them in order to produce a smaller set of words for analysis. We can do this using the `stem!()` function: - stem!(sd) +```julia +julia> sd = StringDocument("Foo writes and foo bar write") + +julia> stem!(sd) + +julia> sd +StringDocument{String}("Foo write and foo bar write", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +``` From 43cde898998466f85c37387374615a56a33030a0 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 17 Mar 2019 01:43:50 +0530 Subject: [PATCH 4/8] Update corpus.md --- docs/src/corpus.md | 100 ++++++++++++++++++++++++++++++++---------- docs/src/documents.md | 2 +- 2 files changed, 79 insertions(+), 23 deletions(-) diff --git a/docs/src/corpus.md b/docs/src/corpus.md index 333fc7e2..5178ea91 100644 --- a/docs/src/corpus.md +++ b/docs/src/corpus.md @@ -4,35 +4,54 @@ Working with isolated documents gets boring quickly. We typically want to work with a collection of documents. We represent collections of documents using the Corpus type: - crps = Corpus([StringDocument("Document 1"), - StringDocument("Document 2")]) +```julia +julia> crps = Corpus([StringDocument("Document 1"), + StringDocument("Document 2")]) +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document 1", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document 2", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` ## Standardizing a Corpus -A `Corpus` may contain many different types of documents: +!!! note + It is recommended to read about `Documents` types and their heirarchy in the package. 
- crps = Corpus([StringDocument("Document 1"), - TokenDocument("Document 2"), - NGramDocument("Document 3")]) +A `Corpus` may contain many different types of documents: +```julia +julia> crps = Corpus([StringDocument("Document 1"), + TokenDocument("Document 2"), + NGramDocument("Document 3")]) +Corpus{AbstractDocument}(AbstractDocument[StringDocument{String}("Document 1", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), TokenDocument{String}(["Document", "2"], DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` It is generally more convenient to standardize all of the documents in a corpus using a single type. This can be done using the `standardize!` function: - standardize!(crps, NGramDocument) +```julia +julia> standardize!(crps, NGramDocument) +``` After this step, you can check that the corpus only contains `NGramDocument`'s: - crps +```julia +julia> crps +Corpus{AbstractDocument}(AbstractDocument[NGramDocument{String}(Dict("1"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("2"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` ## Processing a Corpus We can apply the same sort of preprocessing steps that are defined for individual documents to an entire corpus at once: - crps = Corpus([StringDocument("Document 1"), - StringDocument("Document 2")]) - remove_punctuation!(crps) +```julia +julia> crps = Corpus([StringDocument("Document !!1"), + StringDocument("Document 2!!")]) + +julia> prepare!(crps, strip_punctuation) + +julia> crps +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +``` These operations are run on each document in the corpus individually. 
@@ -47,34 +66,71 @@ In particular, we want to work with two constructs:
Because computations involving the lexicon can take a long time, a `Corpus`'s
default lexicon is blank:

-    lexicon(crps)
+```julia
+julia> julia> crps = Corpus([StringDocument("Name Foo"),
+                             StringDocument("Name Bar")])
+julia> lexicon(crps)
+Dict{String,Int64} with 0 entries
+```

In order to work with the lexicon, you have to update it and then access it:

-    update_lexicon!(crps)
-    lexicon(crps)
+```julia
+julia> update_lexicon!(crps)
+
+julia> lexicon(crps)
+Dict{String,Int64} with 3 entries:
+  "Bar"  => 1
+  "Foo"  => 1
+  "Name" => 2
+```

But once this work is done, you can easily address lots of interesting
questions about a corpus:
+```julia
+julia> lexical_frequency(crps, "Name")
+0.5

-    lexical_frequency(crps, "Summer")
-    lexical_frequency(crps, "Document")
+julia> lexical_frequency(crps, "Foo")
+0.25
+```

Like the lexicon, the inverse index for a corpus is blank by default:

-    inverse_index(crps)
+```julia
+julia> inverse_index(crps)
+Dict{String,Array{Int64,1}} with 0 entries
+```

Again, you need to update it before you can work with it:

-    update_inverse_index!(crps)
-    inverse_index(crps)
+```julia
+julia> update_inverse_index!(crps)
+
+julia> inverse_index(crps)
+Dict{String,Array{Int64,1}} with 3 entries:
+  "Bar"  => [2]
+  "Foo"  => [1]
+  "Name" => [1, 2]
+```

But once you've updated the inverse index, you can easily search the entire
corpus:

-    crps["Document"]
-    crps["1"]
-    crps["Summer"]
+```julia
+julia> crps["Name"]
+
+2-element Array{Int64,1}:
+ 1
+ 2
+
+julia> crps["Foo"]
+1-element Array{Int64,1}:
+ 1
+
+julia> crps["Summer"]
+0-element Array{Int64,1}
+```

## Converting a DataFrame from a Corpus
diff --git a/docs/src/documents.md b/docs/src/documents.md
index 9e5736d0..df17f183 100644
--- a/docs/src/documents.md
+++ b/docs/src/documents.md
@@ -255,7 +255,7 @@ julia> str
StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
```

-* To case distinctions, use `remove_case!()` function:
+* To remove case distinctions, use `remove_case!()` function:
* At times you'll want to remove specific words from a document like a person's
name.
To do that, use the `remove_words!()` function: From 9a85a9026366236bb58345c0cc30ce9cb0b24b68 Mon Sep 17 00:00:00 2001 From: Ayushk4 Date: Sun, 17 Mar 2019 14:57:42 +0530 Subject: [PATCH 5/8] Finish update for corpus.md --- docs/src/corpus.md | 57 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/docs/src/corpus.md b/docs/src/corpus.md index 5178ea91..c9af53ef 100644 --- a/docs/src/corpus.md +++ b/docs/src/corpus.md @@ -44,13 +44,13 @@ We can apply the same sort of preprocessing steps that are defined for individual documents to an entire corpus at once: ```julia -julia> crps = Corpus([StringDocument("Document !!1"), - StringDocument("Document 2!!")]) +julia> crps = Corpus([StringDocument("Document ..!!"), + StringDocument("Document ..!!")]) julia> prepare!(crps, strip_punctuation) julia> crps -Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` These operations are run on each document in the corpus individually. @@ -67,7 +67,7 @@ Because computations involving the lexicon can take a long time, a `Corpus`'s default lexicon is blank: ```julia -julia> julia> crps = Corpus([StringDocument("Name Foo"), +julia> crps = Corpus([StringDocument("Name Foo"), StringDocument("Name Bar")]) julia> lexicon(crps) Dict{String,Int64} with 0 entries @@ -144,23 +144,46 @@ a `DataFrame`: You can also retrieve the metadata for every document in a `Corpus` at once: - languages(crps) - names(crps) - authors(crps) - timestamps(crps) +```julia +julia> crps = Corpus([StringDocument("Name Foo"), + StringDocument("Name Bar")]) + +julia> languages(crps) +2-element Array{Languages.English,1}: + Languages.English() + Languages.English() + +julia> names(crps) +2-element Array{String,1}: + "Unnamed Document" + "Unnamed Document" + +julia> authors(crps) +2-element Array{String,1}: + "Unknown Author" + "Unknown Author" + +julia> timestamps(crps) +2-element Array{String,1}: + "Unknown Time" + "Unknown Time" +``` It is possible to change the metadata fields for each document in a `Corpus`. 
These functions use the same metadata value for every document:

-    languages!(crps, Languages.German())
-    names!(crps, "")
-    authors!(crps, "Me")
-    timestamps!(crps, "Now")
+```julia
+julia> languages!(crps, Languages.German())
+julia> names!(crps, "")
+julia> authors!(crps, "Me")
+julia> timestamps!(crps, "Now")
+```

Additionally, you can specify the metadata fields for each document in
a `Corpus` individually:

-    languages!(crps, [Languages.German(), Languages.English()])
-    names!(crps, ["", "Untitled"])
-    authors!(crps, ["Ich", "You"])
-    timestamps!(crps, ["Unbekannt", "2018"])
+```julia
+julia> languages!(crps, [Languages.German(), Languages.English()])
+julia> names!(crps, ["", "Untitled"])
+julia> authors!(crps, ["Ich", "You"])
+julia> timestamps!(crps, ["Unbekannt", "2018"])
+```

From 29405d0be531d57c9182b31cc9add7b6eb9715cc Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Mon, 18 Mar 2019 13:56:10 +0530
Subject: [PATCH 6/8] Update docs/features.md

---
 docs/src/corpus.md   |  5 +++
 docs/src/features.md | 91 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 83 insertions(+), 13 deletions(-)

diff --git a/docs/src/corpus.md b/docs/src/corpus.md
index c9af53ef..5fa6a597 100644
--- a/docs/src/corpus.md
+++ b/docs/src/corpus.md
@@ -144,6 +144,11 @@ a `DataFrame`:

You can also retrieve the metadata for every document in a `Corpus` at once:

+* `languages()`: What language is the document in? Defaults to `Languages.English()`, a Language instance defined by the Languages package.
+* `names()`: What is the name of the document? Defaults to `"Unnamed Document"`.
+* `authors()`: Who wrote the document? Defaults to `"Unknown Author"`.
+* `timestamps()`: When was the document written? Defaults to `"Unknown Time"`.
+
```julia
julia> crps = Corpus([StringDocument("Name Foo"),
diff --git a/docs/src/features.md b/docs/src/features.md
index cda25231..481fcf41 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -4,18 +4,53 @@
Often we want to represent documents as a matrix of word counts so that we can
apply linear algebra operations and statistical techniques. Before we do this,
we need to update the lexicon:

-    update_lexicon!(crps)
-    m = DocumentTermMatrix(crps)
+```julia
+julia> crps = Corpus([StringDocument("To be or not to be"),
+                      StringDocument("To become or not to become")])
+
+julia> update_lexicon!(crps)
+
+julia> m = DocumentTermMatrix(crps)
+DocumentTermMatrix(
+  [1, 1]  =  1
+  [2, 1]  =  1
+  [1, 2]  =  2
+  [2, 3]  =  2
+  [1, 4]  =  1
+  [2, 4]  =  1
+  [1, 5]  =  1
+  [2, 5]  =  1
+  [1, 6]  =  1
+  [2, 6]  =  1, ["To", "be", "become", "not", "or", "to"], Dict("or"=>5,"not"=>4,"to"=>6,"To"=>1,"be"=>2,"become"=>3))
+```

A `DocumentTermMatrix` object is a special type. If you would like to use a
simple sparse matrix, call `dtm()` on this object:

-    dtm(m)
+```julia
+julia> dtm(m)
+2×6 SparseArrays.SparseMatrixCSC{Int64,Int64} with 10 stored entries:
+  [1, 1]  =  1
+  [2, 1]  =  1
+  [1, 2]  =  2
+  [2, 3]  =  2
+  [1, 4]  =  1
+  [2, 4]  =  1
+  [1, 5]  =  1
+  [2, 5]  =  1
+  [1, 6]  =  1
+  [2, 6]  =  1
+```

If you would like to use a dense matrix instead, you can pass this as an
argument to the `dtm` function:

-    dtm(m, :dense)
+```julia
+julia> dtm(m, :dense)
+2×6 Array{Int64,2}:
+ 1  2  0  1  1  1
+ 1  0  2  1  1  1
+```

## Creating Individual Rows of a Document Term Matrix

In many cases, we don't want to build a DTM for an entire corpus: we can
make do with just a single row. You can get this using the `dtv` function.
Because individual documents do not have a lexicon associated with them, we
have to pass in a lexicon as an additional argument:

-    dtv(crps[1], lexicon(crps))
+```julia
+julia> dtv(crps[1], lexicon(crps))
+1×6 Array{Int64,2}:
+ 1  2  0  1  1  1
+```

## The Hash Trick

The need to create a lexicon before we can construct a document term matrix is
often prohibitive. We can work around this by replacing the lexicon with a hash
function that outputs integers from 1 to N. To construct such a hash function,
you can use the `TextHashFunction(N)` constructor:

-    h = TextHashFunction(10)
+```julia
+julia> h = TextHashFunction(10)
+TextHashFunction(hash, 10)
+```

You can see how this function maps strings to numbers by calling the
`index_hash` function:

-    index_hash("a", h)
-    index_hash("b", h)
+```julia
+julia> index_hash("a", h)
+8
+
+julia> index_hash("b", h)
+7
+```

Using a text hash function, we can represent a document as a vector with N
entries by calling the `hash_dtv` function:

-    hash_dtv(crps[1], h)
+```julia
+julia> hash_dtv(crps[1], h)
+1×10 Array{Int64,2}:
+ 0  2  0  0  1  3  0  0  0  0
+```

This can be done for a corpus as a whole to construct a DTM without defining
a lexicon in advance:

-    hash_dtm(crps, h)
+```julia
+julia> hash_dtm(crps, h)
+2×10 Array{Int64,2}:
+ 0  2  0  0  1  3  0  0  0  0
+ 0  2  0  0  1  1  0  0  2  0
+```

Every corpus has a hash function built-in, so this function can be called
using just one argument:

-    hash_dtm(crps)
+```julia
+julia> hash_dtm(crps)
+2×100 Array{Int64,2}:
+ 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
+ 0  0  0  0  0  0  0  0  2  0  0  0  0     0  0  0  0  0  0  0  0  0  0  0  0
+```

Moreover, if you do not specify a hash function for just one row of the hash
DTM, a default hash function will be constructed for you:

-    hash_dtv(crps[1])
+```julia
+julia> hash_dtv(crps[1])
+1×100 Array{Int64,2}:
+ 0  0  0  0  0  0  0  0  0  0  0  0  0  …  0  0  0  0  0  0  0  0  0  0  0  0
+```

-## TF-IDF
+## TF-IDF (Term Frequency - Inverse Document Frequency)

In many cases, raw word counts are not appropriate for use because:
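The workaround, per the final patch in this series, is TF-IDF weighting on a `DocumentTermMatrix`; a minimal sketch, reusing the `m` built above and assuming `tf_idf` returns a sparse matrix of weights with the same shape as `dtm(m)`:

```julia
julia> weights = tf_idf(m)   # assumed: sparse Float64 matrix of TF-IDF weights

julia> Matrix(weights)       # densify to inspect; zeros appear for words present in every document
```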
From 121ea8f58c905d4120189354b0e80e6fb20dd3a7 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Wed, 20 Mar 2019 18:24:52 +0530
Subject: [PATCH 7/8] Update semantic.md

---
 docs/src/corpus.md   |  3 ---
 docs/src/semantic.md | 38 +++++++++++++++++++++++++++++++-------
 2 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/docs/src/corpus.md b/docs/src/corpus.md
index 5fa6a597..e73a96e9 100644
--- a/docs/src/corpus.md
+++ b/docs/src/corpus.md
@@ -12,9 +12,6 @@

## Standardizing a Corpus

-!!! note
-    It is recommended to read about `Documents` types and their heirarchy in the package.
-
A `Corpus` may contain many different types of documents:

diff --git a/docs/src/semantic.md b/docs/src/semantic.md
index cfd47c43..f9dc957a 100644
--- a/docs/src/semantic.md
+++ b/docs/src/semantic.md
@@ -11,11 +11,35 @@ Analysis or LSA on the corpus. You can do this using the `lsa` function:

Another way to get a handle on the semantic content of a corpus is to use
[Latent Dirichlet Allocation](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation):

-    m = DocumentTermMatrix(crps)
-    k = 2            # number of topics
-    iteration = 1000 # number of gibbs sampling iterations
-    alpha = 0.1      # hyper parameter
-    beta = 0.1       # hyber parameter
-    ϕ, θ = lda(m, k, iteration, alpha, beta) # ϕ is k x word matrix.
-                     # value is probablity of occurrence of a word in a topic.
+First we need to produce the `DocumentTermMatrix`:
+```julia
+julia> crps = Corpus([StringDocument("This is the Foo Bar Document"), StringDocument("This document has too Foo words")])
+julia> update_lexicon!(crps)
+julia> m = DocumentTermMatrix(crps)
+```
+
+Latent Dirichlet Allocation has two hyperparameters:
+* _α_ : The hyperparameter for topic distribution per document. `α<1` yields a sparse topic mixture for each document. `α>1` yields a more uniform topic mixture for each document.
+* _β_ : The hyperparameter for word distribution per topic. `β<1` yields a sparse word mixture for each topic. `β>1` yields a more uniform word mixture for each topic.
+
+```julia
+julia> k = 2               # number of topics
+julia> iterations = 1000   # number of gibbs sampling iterations
+
+julia> α = 0.1             # hyper parameter
+julia> β  = 0.1            # hyper parameter
+
+julia> ϕ, θ = lda(m, k, iterations, α, β)
+(
+  [2 ,  1]  =  0.333333
+  [2 ,  2]  =  0.333333
+  [1 ,  3]  =  0.222222
+  [1 ,  4]  =  0.222222
+  [1 ,  5]  =  0.111111
+  [1 ,  6]  =  0.111111
+  [1 ,  7]  =  0.111111
+  [2 ,  8]  =  0.333333
+  [1 ,  9]  =  0.111111
+  [1 , 10]  =  0.111111, [0.5 1.0; 0.5 0.0])
+```

See `?lda` for more help.

From f4b5561664e45ecb04d668f80cb707e5d6603b50 Mon Sep 17 00:00:00 2001
From: Ayushk4
Date: Wed, 20 Mar 2019 19:39:39 +0530
Subject: [PATCH 8/8] Change name -> title in docs

---
 docs/src/corpus.md    | 20 ++++++++++----------
 docs/src/documents.md | 44 +++++++++++++++++++++----------------------
 docs/src/features.md  | 12 ++++++++++++
 src/TextAnalysis.jl   | 10 +++++-----
 src/corpus.jl         |  4 ++--
 src/document.jl       |  6 +++---
 src/metadata.jl       | 14 +++++++-------
 src/sentiment.jl      | 33 ++++++++++++++++++++++------
 src/show.jl           |  2 +-
 src/summarizer.jl     |  1 -
 test/metadata.jl      | 16 ++++++++--------
 test/sentiment.jl     | 21 +++++++++++++++++++++
 12 files changed, 118 insertions(+), 65 deletions(-)

diff --git a/docs/src/corpus.md b/docs/src/corpus.md
index e73a96e9..4e3f7da7 100644
--- a/docs/src/corpus.md
+++ b/docs/src/corpus.md
@@ -7,7 +7,7 @@ using the Corpus type:
```julia
julia> crps = Corpus([StringDocument("Document 1"),
                      StringDocument("Document 2")])
-Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document 1", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document 2", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(),
Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{AbstractDocument}(AbstractDocument[StringDocument{String}("Document 1", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), TokenDocument{String}(["Document", "2"], DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` It is generally more convenient to standardize all of the documents in a corpus using a single type. This can be done using the `standardize!` @@ -32,7 +32,7 @@ After this step, you can check that the corpus only contains `NGramDocument`'s: ```julia julia> crps -Corpus{AbstractDocument}(AbstractDocument[NGramDocument{String}(Dict("1"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("2"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{AbstractDocument}(AbstractDocument[NGramDocument{String}(Dict("1"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("2"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` ## Processing a Corpus @@ -47,7 +47,7 @@ julia> crps = Corpus([StringDocument("Document ..!!"), julia> prepare!(crps, strip_punctuation) julia> crps -Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Unnamed Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) +Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100)) ``` These operations are run on each document in the corpus individually. @@ -142,7 +142,7 @@ a `DataFrame`: You can also retrieve the metadata for every document in a `Corpus` at once: * `languages()`: What language is the document in? Defaults to `Languages.English()`, a Language instance defined by the Languages package. -* `names()`: What is the name of the document? Defaults to `"Unnamed Document"`. +* `titles()`: What is the title of the document? Defaults to `"Untitled Document"`. * `authors()`: Who wrote the document? Defaults to `"Unknown Author"`. * `timestamps()`: When was the document written? Defaults to `"Unknown Time"`. 
@@ -155,10 +155,10 @@ julia> languages(crps) Languages.English() Languages.English() -julia> names(crps) +julia> titles(crps) 2-element Array{String,1}: - "Unnamed Document" - "Unnamed Document" + "Untitled Document" + "Untitled Document" julia> authors(crps) 2-element Array{String,1}: @@ -176,7 +176,7 @@ These functions use the same metadata value for every document: ```julia julia> languages!(crps, Languages.German()) -julia> names!(crps, "") +julia> titles!(crps, "") julia> authors!(crps, "Me") julia> timestamps!(crps, "Now") ``` @@ -185,7 +185,7 @@ a `Corpus` individually: ```julia julia> languages!(crps, [Languages.German(), Languages.English -julia> names!(crps, ["", "Untitled"]) +julia> titles!(crps, ["", "Untitled"]) julia> authors!(crps, ["Ich", "You"]) julia> timestamps!(crps, ["Unbekannt", "2018"]) ``` diff --git a/docs/src/documents.md b/docs/src/documents.md index df17f183..7ba964f7 100644 --- a/docs/src/documents.md +++ b/docs/src/documents.md @@ -18,7 +18,7 @@ julia> str = "To be or not to be..." "To be or not to be..." julia> sd = StringDocument(str) -StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> pathname = "/usr/share/dict/words" "/usr/share/dict/words" @@ -36,7 +36,7 @@ julia> my_tokens = String["To", "be", "or", "not", "to", "be..."] "be..." julia> td = TokenDocument(my_tokens) -TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> my_ngrams = Dict{String, Int}("To" => 1, "be" => 2, "or" => 1, "not" => 1, @@ -50,7 +50,7 @@ Dict{String,Int64} with 6 entries: "be" => 2 julia> ngd = NGramDocument(my_ngrams) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>2), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>2), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` An NGramDocument consisting of bigrams or any higher order representation `N` @@ -58,7 +58,7 @@ can be easily created by passing the parameter `N` to `NGramDocument` ```julia julia> ngd = NGramDocument("To be or not to be ...", 2) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` For every type of document except a `FileDocument`, you can also construct a @@ -66,13 +66,13 @@ new document by simply passing in a string of text: ```julia julia> sd = StringDocument("To be or not to be...") 
-StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> td = TokenDocument("To be or not to be...") -TokenDocument{String}(["To", "be", "or", "not", "to", "be..", "."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..", "."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> ngd = NGramDocument("To be or not to be...") -NGramDocument{String}(Dict("or"=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>1,"be.."=>1,"."=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{String}(Dict("or"=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>1,"be.."=>1,"."=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` The system will automatically perform tokenization or n-gramization in order @@ -86,16 +86,16 @@ and construct the appropriate type of document object: ```julia julia> Document("To be or not to be...") -StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> Document("/usr/share/dict/words") FileDocument("/usr/share/dict/words", TextAnalysis.DocumentMetadata(Languages.English(), "/usr/share/dict/words", "Unknown Author", "Unknown Time")) julia> Document(String["To", "be", "or", "not", "to", "be..."]) -TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> Document(Dict{String, Int}("a" => 1, "b" => 3)) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("b"=>3,"a"=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("b"=>3,"a"=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` This constructor is very convenient for working in the REPL, but should be avoided in permanent code because, unlike the other constructors, the return type of the `Document` function cannot be known at compile-time. @@ -107,7 +107,7 @@ most obvious thing is to access its text using the `text()` function: ```julia julia> sd = StringDocument("To be or not to be...") -StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> text(sd) "To be or not to be..." 
@@ -173,7 +173,7 @@ contains unigrams, bigrams or a higher-order representation using the `ngram_com ```julia julia> ngd = NGramDocument("To be or not to be ...", 2) -NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2…), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> ngram_complexity(ngd) 2 @@ -190,7 +190,7 @@ document, every document object also stores basic metadata about itself, including the following pieces of information: * `language()`: What language is the document in? Defaults to `Languages.English()`, a Language instance defined by the Languages package. -* `name()`: What is the name of the document? Defaults to `"Unnamed Document"`. +* `title()`: What is the title of the document? Defaults to `"Untitled Document"`. * `author()`: Who wrote the document? Defaults to `"Unknown Author"`. * `timestamp()`: When was the document written? Defaults to `"Unknown Time"`. @@ -199,13 +199,13 @@ in practice: ```julia julia> sd = StringDocument("This document has too foo words") -StringDocument{String}("This document has too foo words", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("This document has too foo words", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) julia> language(sd) Languages.English() -julia> name(sd) -"Unnamed Document" +julia> title(sd) +"Untitled Document" julia> author(sd) "Unknown Author" @@ -221,7 +221,7 @@ functions: julia> language!(sd, Languages.Spanish()) Languages.Spanish() -julia> name!(sd, "El Cid") +julia> title!(sd, "El Cid") "El Cid" julia> author!(sd, "Desconocido") @@ -252,7 +252,7 @@ julia> str = StringDocument("here are some punctuations !!!...") julia> prepare!(str, strip_punctuation) julia> str -StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time")) +StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time")) ``` * To remove case distinctions, use `remove_case!()` function: @@ -261,17 +261,17 @@ name. 
To do that, use the `remove_words!()` function:

```julia
julia> sd = StringDocument("Lear is mad")
-StringDocument{String}("Lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}("Lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))

julia> remove_case!(sd)

julia> sd
-StringDocument{String}("lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}("lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))

julia> remove_words!(sd, ["lear"])
julia> sd
-StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
```

julia> sd = StringDocument("Foo writes and foo bar write")

julia> stem!(sd)

julia> sd
-StringDocument{String}("Foo write and foo bar write", TextAnalysis.DocumentMetadata(Languages.English(), "Unnamed Document", "Unknown Author", "Unknown Time"))
+StringDocument{String}("Foo write and foo bar write", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
```
diff --git a/docs/src/features.md b/docs/src/features.md
index 481fcf41..a6813a84 100644
--- a/docs/src/features.md
+++ b/docs/src/features.md
@@ -141,3 +141,15 @@ You can work around this by performing TF-IDF on a DocumentTermMatrix:

As you can see, TF-IDF has the effect of inserting 0's into the columns of
words that occur in all documents. This is a useful way to avoid having to
remove those words during preprocessing.
+
+## Sentiment Analyzer
+
+The Sentiment Analyzer can be used to find the sentiment score (between 0 and 1) of a word, sentence or a document.
+A Flux model trained on the IMDB word corpus, with pre-trained weights saved in the package, is used to calculate the sentiment.
+
+    model = SentimentAnalyzer(doc)
+    model = SentimentAnalyzer(doc, handle_unknown)
+
+* doc = Input document whose sentiment is to be calculated (`AbstractDocument` type)
+* handle_unknown = A function for handling unknown words. Should return an iterable of replacement words (default `x -> tuple()`, which skips them)
+
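A minimal usage sketch of the API documented above. Assumptions, hedged: a no-argument `SentimentAnalyzer()` constructor loads the saved weights (as suggested by `src/sentiment.jl` below), scores above 0.5 read as positive, and exact scores depend on the trained model, so outputs are omitted. The lowercasing `handle_unknown` follows the pattern used in `test/sentiment.jl` at the end of this patch:

```julia
julia> using TextAnalysis

julia> model = SentimentAnalyzer()      # assumed no-argument constructor; loads the Flux weights

julia> d = StringDocument("a very nice thing that everyone likes")

julia> model(d)                         # score in (0, 1); > 0.5 suggests positive sentiment

julia> model(d, x -> [lowercase(x)])    # retry out-of-vocabulary words in lowercase
```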
diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
index 08127aa3..db0bcd2c 100644
--- a/src/TextAnalysis.jl
+++ b/src/TextAnalysis.jl
@@ -4,7 +4,7 @@ module TextAnalysis
    using SparseArrays
    using Printf
    using LinearAlgebra
-
+
    using Languages
    using DataFrames
    using WordTokenizers
@@ -21,10 +21,10 @@ module TextAnalysis
    export text, tokens, ngrams
    export text!, tokens!, ngrams!
    export documents
-    export language, name, author, timestamp
-    export languages, names, authors, timestamps
-    export language!, name!, author!, timestamp!
-    export languages!, names!, authors!, timestamps!
+    export language, title, author, timestamp
+    export languages, titles, authors, timestamps
+    export language!, title!, author!, timestamp!
+    export languages!, titles!, authors!, timestamps!
    export ngram_complexity
    export lexicon, update_lexicon!, lexical_frequency, lexicon_size
    export inverse_index, update_inverse_index!, index_size
diff --git a/src/corpus.jl b/src/corpus.jl
index bd94f410..85ad1c10 100644
--- a/src/corpus.jl
+++ b/src/corpus.jl
@@ -79,7 +79,7 @@ function Base.convert(::Type{DataFrame}, crps::Corpus)
    df = DataFrame()
    n = length(crps)
    df[:Language] = Array{Union{String,Missing}}(n)
-    df[:Name] = Array{Union{String,Missing}}(n)
+    df[:Title] = Array{Union{String,Missing}}(n)
    df[:Author] = Array{Union{String,Missing}}(n)
    df[:TimeStamp] = Array{Union{String,Missing}}(n)
    df[:Length] = Array{Union{Int,Missing}}(n)
@@ -87,7 +87,7 @@
    for i in 1:n
        d = crps.documents[i]
        df[i, :Language] = string(language(d))
-        df[i, :Name] = name(d)
+        df[i, :Title] = title(d)
        df[i, :Author] = author(d)
        df[i, :TimeStamp] = timestamp(d)
        df[i, :Length] = length(d)
diff --git a/src/document.jl b/src/document.jl
index aeaedaf5..2657b68b 100644
--- a/src/document.jl
+++ b/src/document.jl
@@ -6,13 +6,13 @@

mutable struct DocumentMetadata
    language
-    name::String
+    title::String
    author::String
    timestamp::String
end
DocumentMetadata() = DocumentMetadata(
    Languages.English(),
-    "Unnamed Document",
+    "Untitled Document",
    "Unknown Author",
    "Unknown Time"
)

@@ -38,7 +38,7 @@ end

function FileDocument(f::AbstractString)
    d = FileDocument(String(f), DocumentMetadata())
-    d.metadata.name = f
+    d.metadata.title = f
    return d
end
diff --git a/src/metadata.jl b/src/metadata.jl
index 5ba94a4a..f8700570 100644
--- a/src/metadata.jl
+++ b/src/metadata.jl
@@ -6,13 +6,13 @@

import Languages.name

-name(d::AbstractDocument) = d.metadata.name
+title(d::AbstractDocument) = d.metadata.title
language(d::AbstractDocument) = d.metadata.language
author(d::AbstractDocument) = d.metadata.author
timestamp(d::AbstractDocument) = d.metadata.timestamp

-function name!(d::AbstractDocument, nv::AbstractString)
-    d.metadata.name = nv
+function title!(d::AbstractDocument, nv::AbstractString)
+    d.metadata.title = nv
end

function language!(d::AbstractDocument, nv::T) where T <: Language
@@ -33,20 +33,20 @@ end

-names(c::Corpus) = map(d -> name(d), documents(c))
+titles(c::Corpus) = map(d -> title(d), documents(c))
languages(c::Corpus) = map(d -> language(d), documents(c))
authors(c::Corpus) = map(d -> author(d), documents(c))
timestamps(c::Corpus) = map(d -> timestamp(d), documents(c))

-names!(c::Corpus, nv::AbstractString) = name!.(documents(c), nv)
+titles!(c::Corpus, nv::AbstractString) = title!.(documents(c), nv)
languages!(c::Corpus, nv::T) where {T <: Language} = language!.(documents(c), Ref(nv)) #Ref to force scalar broadcast
authors!(c::Corpus, nv::AbstractString) = author!.(documents(c), Ref(nv))
timestamps!(c::Corpus, nv::AbstractString) = timestamp!.(documents(c), Ref(nv))

-function names!(c::Corpus, nvs::Vector{String})
+function titles!(c::Corpus, nvs::Vector{String})
    length(c) == length(nvs) || throw(DimensionMismatch("dimensions must match"))
    for (i, d) in pairs(IndexLinear(), documents(c))
-        name!(d, nvs[i])
+        title!(d, nvs[i])
    end
end
diff --git a/src/sentiment.jl b/src/sentiment.jl
index f9c980b4..91ec5804 100644
--- a/src/sentiment.jl
+++ b/src/sentiment.jl
@@ -35,7 +35,7 @@ function flatten(x)
    return reshape(x, (l, 1))
end

-function get_sentiment(ip::Array{T, 1}, weight, rwi) where T <: AbstractString
+function get_sentiment(handle_unknown, ip::Array{T, 1}, weight, rwi) where T <: AbstractString
    model = (x,) -> begin
        a_1 = embedding(weight[:embedding_1]["embedding_1"]["embeddings:0"], x)
        a_2 = flatten(a_1)
@@ -45,7 +45,16 @@ function get_sentiment(ip::Array{T, 1}, weight, rwi) where T <: AbstractString
    end
    res = Array{Int, 1}()
    for ele in ip
-        push!(res, rwi[ele])
+        if ele in keys(rwi) && rwi[ele] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2] # there are only 5000 unique embeddings
+            push!(res, rwi[ele])
+        else
+            for words in handle_unknown(ele)
+                if words in keys(rwi) && rwi[words] <= size(weight[:embedding_1]["embedding_1"]["embeddings:0"])[2]
+                    push!(res, rwi[words])
+                end
+            end
+
+        end
    end
    return model(pad_sequences(res))[1]
end
@@ -74,10 +83,22 @@
function Base.show(io::IO, s::SentimentAnalyzer)
end

-function(m::SentimentModel)(text::Array{T, 1}) where T <: AbstractString
-    return get_sentiment(text, m.weight, m.words)
+function(m::SentimentModel)(handle_unknown, text::Array{T, 1}) where T <: AbstractString
+    return get_sentiment(handle_unknown, text, m.weight, m.words)
end

+
+"""
+    ```
+    model = SentimentAnalyzer(doc)
+    model = SentimentAnalyzer(doc, handle_unknown)
+    ```
+    Return the sentiment of the input doc as a score between 0 and 1, where 0 is the lowest
+    sentiment score and 1 is the highest:
+    - doc = Input document whose sentiment is to be calculated (AbstractDocument type)
+    - handle_unknown = A function for handling unknown words. Should return an iterable of words (default x->tuple())
+    """
+
-function(m::SentimentAnalyzer)(d::AbstractDocument)
-    m.model(tokens(d))
+function(m::SentimentAnalyzer)(d::AbstractDocument, handle_unknown = x->tuple())
+    m.model(handle_unknown, tokens(d))
end
diff --git a/src/show.jl b/src/show.jl
index 53588783..0c68094d 100644
--- a/src/show.jl
+++ b/src/show.jl
@@ -18,7 +18,7 @@ function summary(d::AbstractDocument)
    o = ""
    o *= "A $(typeof(d))\n"
    o *= " * Language: $(language(d))\n"
-    o *= " * Name: $(name(d))\n"
+    o *= " * Title: $(title(d))\n"
    o *= " * Author: $(author(d))\n"
    o *= " * Timestamp: $(timestamp(d))\n"
    if contains(Any[TokenDocument, NGramDocument], typeof(d))
diff --git a/src/summarizer.jl b/src/summarizer.jl
index 1179bafc..0f371fa5 100644
--- a/src/summarizer.jl
+++ b/src/summarizer.jl
@@ -1,4 +1,3 @@
-
function summarize(d::AbstractDocument; ns=5)
    sentences = sentence_tokenize(language(d), text(d))
    s = StringDocument.(sentences)
diff --git a/test/metadata.jl b/test/metadata.jl
index 29ee45cd..19a96e54 100644
--- a/test/metadata.jl
+++ b/test/metadata.jl
@@ -7,41 +7,41 @@
    crps = Corpus([sd1, sd2])

    # Single document metadata getters
-    @test isequal(name(sd1), "Unnamed Document")
+    @test isequal(title(sd1), "Untitled Document")
    @test isequal(language(sd1), Languages.English())
    @test isequal(author(sd1), "Unknown Author")
    @test isequal(timestamp(sd1), "Unknown Time")

    # Single document metadata setters
-    name!(sd1, "Document")
+    title!(sd1, "Document")
    language!(sd1, Languages.German())
    author!(sd1, "Author")
    timestamp!(sd1, "Time")
-    @test isequal(name(sd1), "Document")
+    @test isequal(title(sd1), "Document")
    @test isequal(language(sd1), Languages.German())
    @test isequal(author(sd1), "Author")
    @test isequal(timestamp(sd1), "Time")

    # Metadata getters for an entire corpus
-    @test isequal(TextAnalysis.names(crps), ["Document", "Unnamed Document"])
+    @test isequal(TextAnalysis.titles(crps), ["Document", "Untitled Document"])
    @test isequal(languages(crps), [Languages.German(), Languages.English()])
    @test isequal(authors(crps), ["Author", "Unknown Author"])
    @test isequal(timestamps(crps), ["Time", "Unknown Time"])

    # Metadata setters for
an entire corpus - names!(crps, "Document") + titles!(crps, "Document") languages!(crps, Languages.Spanish()) authors!(crps, "Author") timestamps!(crps, "Time") - @test isequal(TextAnalysis.names(crps), ["Document", "Document"]) + @test isequal(TextAnalysis.titles(crps), ["Document", "Document"]) @test isequal(languages(crps), [Languages.Spanish(), Languages.Spanish()]) @test isequal(authors(crps), ["Author", "Author"]) @test isequal(timestamps(crps), ["Time", "Time"]) - names!(crps, ["Unnamed Document", "Unnamed Document"]) + titles!(crps, ["Untitled Document", "Untitled Document"]) languages!(crps, [Languages.English(), Languages.English()]) authors!(crps, ["Unknown Author", "Unknown Author"]) timestamps!(crps, ["Unknown Time", "Unknown Time"]) - @test isequal(TextAnalysis.names(crps), ["Unnamed Document", "Unnamed Document"]) + @test isequal(TextAnalysis.titles(crps), ["Untitled Document", "Untitled Document"]) @test isequal(languages(crps), [Languages.English(), Languages.English()]) @test isequal(authors(crps), ["Unknown Author", "Unknown Author"]) @test isequal(timestamps(crps), ["Unknown Time", "Unknown Time"]) diff --git a/test/sentiment.jl b/test/sentiment.jl index ce6233cf..4ec131e1 100644 --- a/test/sentiment.jl +++ b/test/sentiment.jl @@ -8,4 +8,25 @@ d = StringDocument("a horrible thing that everyone hates") @test m(d) < 0.5 + + # testing default behaviour of handle_unknown + d = StringDocument("some sense and some nonSense") + + @test m(d) < 0.5 + + # testing behaviour of words which are present in dictionary but do not have embedding assigned + d = StringDocument("some sense and some duh") + + @test m(d) < 0.5 + + # testing user given handle_unknown function + d = Document("a Horrible thing that Everyone Hates") + + @test m(d, (x) -> [lowercase(x)]) < 0.5 + + # Make it throw an error when unknown word encountered + d = Document("some sense and some Hectic") + + @test_throws ErrorException m(d, (x) -> error("OOV word $x encountered")) + end