
Commit

Merge pull request #159 from Ayushk4/Ayushk4_patch
Update the documentation reflecting changes in show.jl
aviks authored May 23, 2019
2 parents e1f3b05 + 7ba449f commit bd4b84d
Showing 4 changed files with 145 additions and 49 deletions.
5 changes: 0 additions & 5 deletions docs/src/classify.md
@@ -21,8 +21,6 @@ It takes two arguments-
* `dict`: (optional) an array of possible tokens (words). This is updated automatically whenever a new token is detected in step 2) or 3).




2- Fitting the model weights on input -

fit!(model, str, class)
@@ -36,7 +34,6 @@ It takes two arguments-
```julia
julia> m = NaiveBayesClassifier([:legal, :financial])
NaiveBayesClassifier{Symbol}(String[], Symbol[:legal, :financial], Array{Int64}(0,2))

```

```julia
@@ -45,13 +42,11 @@ NaiveBayesClassifier{Symbol}(["financial", "this", "is", "doc"], Symbol[:legal,

julia> fit!(m, "this is legal doc", :legal)
NaiveBayesClassifier{Symbol}(["financial", "this", "is", "doc", "legal"], Symbol[:legal, :financial], [1 2; 2 2; ; 2 2; 2 1])

```

```julia
julia> predict(m, "this should be predicted as a legal document")
Dict{Symbol,Float64} with 2 entries:
:legal => 0.666667
:financial => 0.333333

```
35 changes: 30 additions & 5 deletions docs/src/corpus.md
@@ -7,7 +7,14 @@ using the Corpus type:
```julia
julia> crps = Corpus([StringDocument("Document 1"),
StringDocument("Document 2")])
Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document 1", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document 2", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100))
A Corpus with 2 documents:
* 2 StringDocument's
* 0 FileDocument's
* 0 TokenDocument's
* 0 NGramDocument's

Corpus's lexicon contains 0 tokens
Corpus's index contains 0 tokens
```
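The lexicon and inverse index start out empty, which is why both report 0 tokens above. A minimal sketch of populating them, assuming the `update_lexicon!` and `update_inverse_index!` functions exported by TextAnalysis:

```julia
using TextAnalysis

crps = Corpus([StringDocument("Document 1"),
               StringDocument("Document 2")])

# Both maps default to empty; these calls fill them from the documents.
update_lexicon!(crps)
update_inverse_index!(crps)

lexicon(crps)  # maps each token to its corpus-wide count
```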

## Standardizing a Corpus
@@ -18,8 +25,16 @@ A `Corpus` may contain many different types of documents:
julia> crps = Corpus([StringDocument("Document 1"),
TokenDocument("Document 2"),
NGramDocument("Document 3")])
Corpus{AbstractDocument}(AbstractDocument[StringDocument{String}("Document 1", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), TokenDocument{String}(["Document", "2"], DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100))
A Corpus with 3 documents:
* 1 StringDocument's
* 0 FileDocument's
* 1 TokenDocument's
* 1 NGramDocument's

Corpus's lexicon contains 0 tokens
Corpus's index contains 0 tokens
```

It is generally more convenient to standardize all of the documents in a
corpus using a single type. This can be done using the `standardize!`
function:
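The call elided from the hunk above is roughly the following (a sketch; `standardize!` rewrites each document in place):

```julia
# Convert every document in the corpus to the NGramDocument representation.
standardize!(crps, NGramDocument)
```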
@@ -32,7 +47,14 @@ After this step, you can check that the corpus only contains `NGramDocument`'s:

```julia
julia> crps
Corpus{AbstractDocument}(AbstractDocument[NGramDocument{String}(Dict("1"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("2"=>1,"Document"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), NGramDocument{String}(Dict("Document"=>1,"3"=>1), 1, DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100))
A Corpus with 3 documents:
* 0 StringDocument's
* 0 FileDocument's
* 0 TokenDocument's
* 3 NGramDocument's

Corpus's lexicon contains 0 tokens
Corpus's index contains 0 tokens
```

## Processing a Corpus
@@ -46,8 +68,11 @@ julia> crps = Corpus([StringDocument("Document ..!!"),

julia> prepare!(crps, strip_punctuation)

julia> crps
Corpus{StringDocument{String}}(StringDocument{String}[StringDocument{String}("Document ", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time")), StringDocument{String}("Document ", DocumentMetadata(English(), "Untitled Document", "Unknown Author", "Unknown Time"))], 0, Dict{String,Int64}(), Dict{String,Array{Int64,1}}(), TextHashFunction(hash, 100))
julia> text(crps[1])
"Document "

julia> text(crps[2])
"Document "
```

These operations are run on each document in the corpus individually.
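Beyond single flags, preprocessing flags can be OR-ed together so that one `prepare!` pass applies several transformations; a sketch, assuming the flag constants exported by TextAnalysis:

```julia
crps = Corpus([StringDocument("Document 1 ..!!"),
               StringDocument("Document 2 ..!!")])

# Strip punctuation and digits in a single pass over the corpus.
prepare!(crps, strip_punctuation | strip_numbers)

text(crps[1])
```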
142 changes: 114 additions & 28 deletions docs/src/documents.md
@@ -18,13 +18,23 @@ julia> str = "To be or not to be..."
"To be or not to be..."

julia> sd = StringDocument(str)
StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A StringDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: To be or not to be...

julia> pathname = "/usr/share/dict/words"
"/usr/share/dict/words"

julia> fd = FileDocument(pathname)
FileDocument("/usr/share/dict/words", TextAnalysis.DocumentMetadata(Languages.English(), "/usr/share/dict/words", "Unknown Author", "Unknown Time"))
A FileDocument
* Language: Languages.English()
* Title: /usr/share/dict/words
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: A A's AMD AMD's AOL AOL's Aachen Aachen's Aaliyah

julia> my_tokens = String["To", "be", "or", "not", "to", "be..."]
6-element Array{String,1}:
@@ -36,7 +46,13 @@ julia> my_tokens = String["To", "be", "or", "not", "to", "be..."]
"be..."

julia> td = TokenDocument(my_tokens)
TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A TokenDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***


julia> my_ngrams = Dict{String, Int}("To" => 1, "be" => 2,
"or" => 1, "not" => 1,
@@ -50,29 +66,54 @@ Dict{String,Int64} with 6 entries:
"be" => 2

julia> ngd = NGramDocument(my_ngrams)
NGramDocument{AbstractString}(Dict{AbstractString,Int64}("or"=>1,"be..."=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>2), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A NGramDocument{AbstractString}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***
```

An `NGramDocument` consisting of bigrams or any higher-order representation can easily be created by passing the order parameter `N` to `NGramDocument`:

```julia
julia> ngd = NGramDocument("To be or not to be ...", 2)
NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
julia> NGramDocument("To be or not to be ...", 2)
A NGramDocument{AbstractString}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***
```

For every type of document except a `FileDocument`, you can also construct a
new document by simply passing in a string of text:

```julia
julia> sd = StringDocument("To be or not to be...")
StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A StringDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: To be or not to be...

julia> td = TokenDocument("To be or not to be...")
TokenDocument{String}(["To", "be", "or", "not", "to", "be..", "."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A TokenDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***

julia> ngd = NGramDocument("To be or not to be...")
NGramDocument{String}(Dict("or"=>1,"not"=>1,"to"=>1,"To"=>1,"be"=>1,"be.."=>1,"."=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A NGramDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***
```

The system will automatically perform tokenization or n-gramization in order
Expand All @@ -86,16 +127,35 @@ and construct the appropriate type of document object:

```julia
julia> Document("To be or not to be...")
StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))

A StringDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: To be or not to be...
julia> Document("/usr/share/dict/words")
FileDocument("/usr/share/dict/words", TextAnalysis.DocumentMetadata(Languages.English(), "/usr/share/dict/words", "Unknown Author", "Unknown Time"))
A FileDocument
* Language: Languages.English()
* Title: /usr/share/dict/words
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: A A's AMD AMD's AOL AOL's Aachen Aachen's Aaliyah

julia> Document(String["To", "be", "or", "not", "to", "be..."])
TokenDocument{String}(["To", "be", "or", "not", "to", "be..."], TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A TokenDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***

julia> Document(Dict{String, Int}("a" => 1, "b" => 3))
NGramDocument{AbstractString}(Dict{AbstractString,Int64}("b"=>3,"a"=>1), 1, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A NGramDocument{AbstractString}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***
```

This constructor is very convenient for working in the REPL, but should be avoided in permanent code because, unlike the other constructors, the return type of the `Document` function cannot be known at compile-time.
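The type instability mentioned above is easy to demonstrate: the concrete type returned by `Document` depends on the run-time value of its argument (a sketch; the types mirror the REPL output shown earlier):

```julia
typeof(Document("To be or not to be..."))     # StringDocument{String}
typeof(Document(String["To", "be"]))          # TokenDocument{String}
typeof(Document(Dict{String,Int}("a" => 1)))  # NGramDocument{AbstractString}
```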
Expand All @@ -107,7 +167,12 @@ most obvious thing is to access its text using the `text()` function:

```julia
julia> sd = StringDocument("To be or not to be...")
StringDocument{String}("To be or not to be...", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A StringDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: To be or not to be...

julia> text(sd)
"To be or not to be..."
@@ -166,7 +231,12 @@ contains unigrams, bigrams or a higher-order representation using the `ngram_complexity`

```julia
julia> ngd = NGramDocument("To be or not to be ...", 2)
NGramDocument{AbstractString}(Dict{AbstractString,Int64}("to be"=>1,"not"=>1,"be or"=>1,"or"=>1,"not to"=>1,"To"=>1,".."=>1,"."=>1,"be .."=>1,"be"=>2), 2, TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A NGramDocument{AbstractString}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: ***SAMPLE TEXT NOT AVAILABLE***

julia> ngram_complexity(ngd)
2
@@ -191,8 +261,13 @@ Try these functions out on a `StringDocument` to see how the defaults work
in practice:

```julia
julia> sd = StringDocument("This document has too foo words")
StringDocument{String}("This document has too foo words", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
julia> sd = StringDocument("This document has too foo words")
A StringDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: This document has too foo words

julia> language(sd)
Languages.English()
@@ -244,8 +319,8 @@ julia> str = StringDocument("here are some punctuations !!!...")

julia> prepare!(str, strip_punctuation)

julia> str
StringDocument{String}("here are some punctuations ", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
julia> text(str)
"here are some punctuations "
```

* To remove case distinctions, use `remove_case!()` function:
@@ -254,17 +329,22 @@ name. To do that, use the `remove_words!()` function:

```julia
julia> sd = StringDocument("Lear is mad")
StringDocument{String}("Lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
A StringDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: Lear is mad

julia> remove_case!(sd)

julia> sd
StringDocument{String}("lear is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
julia> text(sd)
"lear is mad"

julia> remove_words!(sd, ["lear"])

julia> sd
StringDocument{String}(" is mad", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
julia> text(sd)
" is mad"
```

At other times, you'll want to remove whole classes of words. To make this
@@ -304,10 +384,16 @@ smaller set of words for analysis. We can do this using the `stem!()`
function:

```julia
julia> sd = StringDocument("Foo writes and foo bar write")
julia> sd = StringDocument("They write, it writes")
A StringDocument{String}
* Language: Languages.English()
* Title: Untitled Document
* Author: Unknown Author
* Timestamp: Unknown Time
* Snippet: They write, it writes

julia> stem!(sd)

julia> sd
StringDocument{String}("Foo write and foo bar write", TextAnalysis.DocumentMetadata(Languages.English(), "Untitled Document", "Unknown Author", "Unknown Time"))
julia> text(sd)
"They write , it write"
```
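`stem!` is not limited to single documents; it also accepts a `Corpus` and stems every document in place (a sketch, assuming the `stem!(::Corpus)` method exists as in current TextAnalysis):

```julia
crps = Corpus([StringDocument("They write, it writes"),
               StringDocument("It writes")])
stem!(crps)
text(crps[1])  # expected: "They write , it write"
```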
12 changes: 1 addition & 11 deletions docs/src/features.md
@@ -11,17 +11,7 @@ julia> crps = Corpus([StringDocument("To be or not to be"),
julia> update_lexicon!(crps)

julia> m = DocumentTermMatrix(crps)
DocumentTermMatrix(
[1, 1] = 1
[2, 1] = 1
[1, 2] = 2
[2, 3] = 2
[1, 4] = 1
[2, 4] = 1
[1, 5] = 1
[2, 5] = 1
[1, 6] = 1
[2, 6] = 1, ["To", "be", "become", "not", "or", "to"], Dict("or"=>5,"not"=>4,"to"=>6,"To"=>1,"be"=>2,"become"=>3))
A 2 X 6 DocumentTermMatrix
```
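To get at the underlying counts, the `dtm` function converts the object into an ordinary matrix; a sketch based on the standard TextAnalysis API (`:dense` requests a dense copy):

```julia
using TextAnalysis

crps = Corpus([StringDocument("To be or not to be"),
               StringDocument("To become or not to become")])
update_lexicon!(crps)
m = DocumentTermMatrix(crps)

dtm(m)          # sparse term-count matrix, one row per document
dtm(m, :dense)  # the same counts as a dense integer matrix
```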

A `DocumentTermMatrix` object is a special type. If you would like to use
