Added ROUGE Score to TextAnalysis.jl #156

Merged: 25 commits, Jun 9, 2019. Changes shown are from 11 commits.
5 changes: 5 additions & 0 deletions src/TextAnalysis.jl
@@ -50,6 +50,8 @@ module TextAnalysis
export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags
export SentimentAnalyzer
export jackknife_avg, listify_ngrams, weighted_lcs, FMeasureLCS
export rouge_l_summary, rouge_l_sentence, rouge_n

include("tokenizer.jl")
include("ngramizer.jl")
@@ -75,4 +77,7 @@ module TextAnalysis
include("sentiment.jl")
include("bayes.jl")
include("deprecations.jl")
include("utils.jl")
include("rouge.jl")
include("evaluate.jl")
end
152 changes: 152 additions & 0 deletions src/rouge.jl
@@ -0,0 +1,152 @@
#= ROUGE score implementation
Lin, C.-Y. (2004).
ROUGE: A Package for Automatic Evaluation of Summaries.
Proceedings of the Workshop on Text Summarization Branches Out (WAS 2004), pp. 25-26.
Link to paper:
http://www.aclweb.org/anthology/W04-1013 =#

using TextAnalysis
using WordTokenizers
function rouge_n(references, candidate, n, averaging = true)
#= Computes the n-gram recall between a candidate summary
and a set of reference summaries.

param references : list of reference strings
type (references) : Array{String,1}

param candidate : the candidate string
type (candidate) : String

param n : length of the n-grams
type (n) : Int

ngram_cand : list of n-grams in the candidate
ngram_ref : list of n-grams in a reference
rouge_recall : list containing one ROUGE-N score for
every reference against the candidate =#

ngram_cand = listify_ngrams(ngrams(StringDocument(candidate), n))
rouge_recall = []

for ref in references
matches = 0 # number of candidate n-grams found in the reference
ngram_ref = listify_ngrams(ngrams(StringDocument(ref), n))

for ngr in ngram_cand
if ngr in ngram_ref
matches += 1
end

end

push!(rouge_recall, matches/length(ngram_ref))

end

if averaging == true
rouge_recall = jackknife_avg(rouge_recall)
end

return(rouge_recall)

end
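A minimal usage sketch for rouge_n, assuming this branch of TextAnalysis is loaded; the strings below are illustrative only:

refs = ["Brazil, Russia, India and China are growing nations",
"Brazil and India are both part of BRIC"]
cand = "Brazil, Russia, China and India are growing nations"
rouge_n(refs, cand, 1) # jackknife-averaged unigram recall
rouge_n(refs, cand, 2, false) # one bigram recall score per reference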


function rouge_l_sentence(references, candidate, beta=8, averaging = true)
#= Calculates the ROUGE-L score between the candidate
and the reference at the sentence level.

param references : list of reference strings
type (references) : Array{String,1}

param candidate : the candidate string
type (candidate) : String

param beta : user-defined parameter. Default value = 8
type (beta) : Number

rouge_l_list : list containing all the rouge scores for
every reference against the candidate
r_lcs : recall factor
p_lcs : precision factor
score : rouge-l score between the reference sentence and
the candidate sentence =#

ngram_cand = tokenize(candidate)
rouge_l_list = []

for ref in references
ngram_ref = tokenize(ref)
lcs = weighted_lcs(ngram_ref, ngram_cand, true, false, sqrt)
r_lcs = lcs/length(ngram_ref)
p_lcs = lcs/length(ngram_cand)
score = FMeasureLCS(r_lcs, p_lcs, beta)
push!(rouge_l_list,score)

end
if averaging == true
rouge_l_list = jackknife_avg(rouge_l_list)
end
return rouge_l_list

end
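A similar sketch for rouge_l_sentence; beta defaults to 8, which weights recall more heavily than precision in the F-measure:

refs = ["Brazil, Russia, India and China are growing nations"]
cand = "Brazil, Russia, China and India are growing nations"
rouge_l_sentence(refs, cand) # default beta = 8, jackknife-averaged
rouge_l_sentence(refs, cand, 8, false) # raw per-reference ROUGE-L scores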

function rouge_l_summary(references, candidate, beta, averaging=true)
#= Calculates the ROUGE-L score between the candidate
and the reference at the summary level.

param references : list of reference summary strings
type (references) : Array{String,1}

param candidate : the candidate summary string
type (candidate) : String

param beta : user-defined parameter
type (beta) : Number

rouge_l_list : list containing all the rouge scores for
every reference against the candidate

r_lcs : recall factor
p_lcs : precision factor
score : rouge-l score between a reference and the candidate
=#

rouge_l_list = []
cand_sent_list = split_sentences(candidate)

for ref in references
ref_sent_list = split_sentences(ref)
sum_value = 0

for ref_sent in ref_sent_list
l_ = []
arg1 = tokenize(ref_sent)

for cand_sent in cand_sent_list
arg2 = tokenize(cand_sent)
d = tokenize(weighted_lcs(arg1, arg2, false, true, sqrt))
append!(l_,d)
end

sum_value += length(unique(l_))

end

r_lcs = sum_value/length(tokenize(ref))
p_lcs = sum_value/length(tokenize(candidate))
score = FMeasureLCS(r_lcs, p_lcs, beta)
push!(rouge_l_list,score)

end

if averaging == true
rouge_l_list = jackknife_avg(rouge_l_list)
end

return rouge_l_list

end
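And a sketch for rouge_l_summary, which compares summaries sentence by sentence (beta has no default here):

refs = ["Brazil, Russia, India and China are growing nations. They form part of BRIC."]
cand = "Brazil, Russia, China and India are growing nations. They are members of BRIC."
rouge_l_summary(refs, cand, 8) # jackknife-averaged summary-level score
rouge_l_summary(refs, cand, 8, false) # per-reference scores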
158 changes: 158 additions & 0 deletions src/utils.jl
@@ -0,0 +1,158 @@
# JuliaText TextAnalysis.jl Utility Functions

function jackknife_avg(scores)

#= The jackknife is a resampling technique especially useful for variance and bias estimation.
Currently used for averaging ROUGE scores over multiple references.
param scores : list of scores to average
type (scores) : Array{Float64,1} =#

if length(collect(Set(scores))) == 1

#= In case the elements of the array are all equal=#
return scores[1]

else

#= store the maximum score from each of the
m different sets of m-1 scores, where m = length(scores) =#

average = []

for i in scores
# dummy : the m-1 scores that remain when the current score is left out
dummy = [j for j in scores if i != j]
push!(average, maximum(dummy))
end

return sum(average)/length(average)

end
end
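A worked example of the leave-one-out averaging this implements (the values are made up):

# For [0.2, 0.4, 0.6] the three leave-one-out maxima are 0.6, 0.6 and 0.4,
# so the result is (0.6 + 0.6 + 0.4) / 3 ≈ 0.533.
jackknife_avg([0.2, 0.4, 0.6])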

function listify_ngrams(ngram_doc)
# Flatten an n-gram => count dictionary into a list in which every
# n-gram appears as many times as its count.
flattened = []
for item in ngram_doc
for i in 1:item[2]
push!(flattened, item[1])
end
end
return flattened
end
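For context, a sketch of what the flattening produces (the sentence is made up; ordering follows the underlying Dict and is not guaranteed):

ngram_counts = ngrams(StringDocument("the cat and the dog"), 1)
listify_ngrams(ngram_counts) # e.g. ["the", "the", "cat", "and", "dog"], in no particular order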

function weighted_lcs(X, Y, weighted = true, return_string = false, f = sqrt)
#= Computes the (weighted) longest common subsequence of two
tokenized strings using dynamic programming.
param X : first string in tokenized form
type (X) : Array{SubString{String},1}
param Y : second string in tokenized form
type (Y) : Array{SubString{String},1}
param weighted : the weighted LCS is computed when true (default)
type (weighted) : Bool
param return_string : the function returns the (weighted) LCS length when false (default)
and the longest common subsequence itself when true
type (return_string) : Bool
param f : weighting function. It must have the property
that f(x+y) > f(x) + f(y) for any positive integers x and y.
type (f) : function that takes a number and returns a number
=#

m, n = length(X), length(Y)
c_table = [zeros(n+1) for i in 1:m+1]
w_table = [zeros(n+1) for i in 1:m+1]

for i in 1:(m+1)

for j in 1:(n+1)

if i == 1 || j == 1
continue

elseif X[i-1] == Y[j-1]

k = w_table[i-1][j-1]
if weighted == true
increment = (f(k+1)) - (f(k))
else
increment = 1
end
c_table[i][j] = c_table[i-1][j-1] + increment
w_table[i][j] = k + 1

else

if c_table[i-1][j] > c_table[i][j-1]
c_table[i][j] = c_table[i-1][j]
w_table[i][j] = 0 # no match at i,j
else
c_table[i][j] = c_table[i][j-1]
w_table[i][j] = 0 # no match at i,j
end

end

end

end

lcs_length = (c_table[m+1][n+1])
if return_string == false
return lcs_length
end

if weighted == true
lcs_length = c_table[m][n]^(2)
end

lcs_length = round(Int64, lcs_length)
lcs = ["" for i in 1:(lcs_length+1)]
i = m+1
j = n+1

while i>1 && j>1
if X[i-1] == Y[j-1]
lcs[lcs_length+1] = X[i-1]
i -= 1
j -= 1
lcs_length -= 1

elseif c_table[i-1][j] > c_table[i][j-1]
i -= 1
else
j -= 1
end
end

return (join(lcs, " ")) # the lcs string

end
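A small sketch of the three calling modes on made-up token vectors:

X = ["the", "cat", "sat", "on", "the", "mat"]
Y = ["the", "cat", "lay", "on", "a", "mat"]
weighted_lcs(X, Y, false, false, sqrt) # plain LCS length, 4.0 here ("the cat on mat")
weighted_lcs(X, Y, false, true, sqrt) # the common subsequence joined into a string
weighted_lcs(X, Y, true, false, sqrt) # weighted LCS score, increments governed by f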

function FMeasureLCS(RLCS, PLCS, beta=1)
#= F-measure based on the weighted LCS (WLCS)

param beta : user-defined parameter weighting recall relative to precision
type (beta) : Number

param RLCS : recall factor
type (RLCS) : Float64

param PLCS : precision factor
type (PLCS) : Float64

score : F-measure score between a candidate
and a reference
=#

# floating-point division by zero does not throw DivideError, so guard explicitly
denominator = RLCS + (beta^2)*PLCS
if denominator == 0
return 0
end
return ((1+beta^2)*RLCS*PLCS)/denominator
end
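Finally, a worked instance of the F-measure used above:

# (1 + beta^2) * R * P / (R + beta^2 * P); with R = 0.6, P = 0.5 and beta = 1
# this is the harmonic mean: 2 * 0.6 * 0.5 / 1.1 ≈ 0.545.
FMeasureLCS(0.6, 0.5) # default beta = 1
FMeasureLCS(0.6, 0.5, 8) # beta = 8, the recall-heavy setting used by the ROUGE-L functions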
19 changes: 19 additions & 0 deletions test/rouge.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
@testset "ROUGE" begin

candidate_sentence = "Brazil, Russia, China and India are growing nations"
candidate_summary = "Brazil, Russia, China and India are growing nations. They are all an important part of BRIC as well as regular part of G20 summits."

reference_sentences = ["Brazil, Russia, India and China are growing nations", "Brazil and India are two of the developing nations that are part of the BRIC"]
reference_summaries = ["Brazil, Russia, India and China are the next big political powers in the global economy. Together referred to as BRIC(S) along with South Korea.", "Brazil, Russia, India and China are together known as the BRIC(S) and have been invited to the G20 summit."]

@test rouge_l_summary(reference_summaries, candidate_summary, 8, true) == 0.42565327352779836

@test rouge_n(reference_summaries, candidate_summary, 1, true) == 0.5051282051282051
@test rouge_n(reference_summaries, candidate_summary, 2, true) == 0.1317241379310345

@test rouge_n(reference_sentences, candidate_sentence, 2, true) == 0.3492063492063492
@test rouge_n(reference_sentences, candidate_sentence, 1, true) == 0.6666666666666666

@test rouge_l_sentence(reference_sentences, candidate_sentence, 8, true) == 0.36164547980729794

end