diff --git a/src/rankcorr.jl b/src/rankcorr.jl
index ea8041058..0592530b8 100644
--- a/src/rankcorr.jl
+++ b/src/rankcorr.jl
@@ -33,66 +33,50 @@ corspearman(X::RealMatrix) = (Z = mapslices(tiedrank, X, dims=1); cor(Z, Z))
 #
 #######################################
 
-# Knight JASA (1966)
-
-function corkendall!(x::RealVector, y::RealVector)
+# Knight, William R. “A Computer Method for Calculating Kendall's Tau with Ungrouped Data.”
+# Journal of the American Statistical Association, vol. 61, no. 314, 1966, pp. 436–439.
+# JSTOR, www.jstor.org/stable/2282833.
+function corkendall!(x::RealVector, y::RealVector, permx::AbstractVector{<:Integer}=sortperm(x))
     if any(isnan, x) || any(isnan, y) return NaN end
     n = length(x)
     if n != length(y) error("Vectors must have same length") end
 
     # Initial sorting
-    pm = sortperm(y)
-    x[:] = x[pm]
-    y[:] = y[pm]
-    pm[:] = sortperm(x)
-    x[:] = x[pm]
-
-    # Counting ties in x and y
-    iT = 1
-    nT = 0
-    iU = 1
-    nU = 0
-    for i = 2:n
-        if x[i] == x[i-1]
-            iT += 1
-        else
-            nT += iT*(iT - 1)
-            iT = 1
-        end
-        if y[i] == y[i-1]
-            iU += 1
-        else
-            nU += iU*(iU - 1)
-            iU = 1
+    permute!(x, permx)
+    permute!(y, permx)
+    
+    # Use widen to avoid overflows on both 32bit and 64bit
+    npairs = div(widen(n) * (n - 1), 2)
+    ntiesx = ndoubleties = nswaps = widen(0)
+    k = 0
+
+    @inbounds for i = 2:n
+        if x[i - 1] == x[i]
+            k += 1
+        elseif k > 0
+            # Sort the corresponding chunk of y, so the rows of hcat(x,y) are 
+            # sorted first on x, then (where x values are tied) on y. Hence 
+            # double ties can be counted by calling countties.
+            sort!(view(y, (i - k - 1):(i - 1)))
+            ntiesx += div(widen(k) * (k + 1), 2) # Must use wide integers here
+            ndoubleties += countties(y,  i - k - 1, i - 1)
+            k = 0
         end
     end
-    if iT > 1 nT += iT*(iT - 1) end
-    nT = div(nT,2)
-    if iU > 1 nU += iU*(iU - 1) end
-    nU = div(nU,2)
-
-    # Sort y after x
-    y[:] = y[pm]
-
-    # Calculate double ties
-    iV = 1
-    nV = 0
-    jV = 1
-    for i = 2:n
-        if x[i] == x[i-1] && y[i] == y[i-1]
-            iV += 1
-        else
-            nV += iV*(iV - 1)
-            iV = 1
-        end
+    if k > 0
+        sort!(view(y, (n - k):n))
+        ntiesx += div(widen(k) * (k + 1), 2)
+        ndoubleties += countties(y, n - k, n)
     end
-    if iV > 1 nV += iV*(iV - 1) end
-    nV = div(nV,2)
 
-    nD = div(n*(n - 1),2)
-    return (nD - nT - nU + nV - 2swaps!(y)) / (sqrt(nD - nT) * sqrt(nD - nU))
-end
+    nswaps = merge_sort!(y, 1, n)
+    ntiesy = countties(y, 1, n)
 
+    # Calls to float below prevent possible overflow errors when
+    # length(x) exceeds 77_936 (32 bit) or 5_107_605_667 (64 bit)
+    (npairs + ndoubleties - ntiesx - ntiesy - 2 * nswaps) /
+     sqrt(float(npairs - ntiesx) * float(npairs - ntiesy))
+end
 
 """
     corkendall(x, y=x)
@@ -100,21 +84,40 @@ end
 Compute Kendall's rank correlation coefficient, τ. `x` and `y` must both be either
 matrices or vectors.
 """
-corkendall(x::RealVector, y::RealVector) = corkendall!(float(copy(x)), float(copy(y)))
+corkendall(x::RealVector, y::RealVector) = corkendall!(copy(x), copy(y))
 
-corkendall(X::RealMatrix, y::RealVector) = Float64[corkendall!(float(X[:,i]), float(copy(y))) for i in 1:size(X, 2)]
-
-corkendall(x::RealVector, Y::RealMatrix) = (n = size(Y,2); reshape(Float64[corkendall!(float(copy(x)), float(Y[:,i])) for i in 1:n], 1, n))
+function corkendall(X::RealMatrix, y::RealVector)
+    permy = sortperm(y)
+    return([corkendall!(copy(y), X[:,i], permy) for i in 1:size(X, 2)])
+end
 
-corkendall(X::RealMatrix, Y::RealMatrix) = Float64[corkendall!(float(X[:,i]), float(Y[:,j])) for i in 1:size(X, 2), j in 1:size(Y, 2)]
+function corkendall(x::RealVector, Y::RealMatrix)
+    n = size(Y, 2)
+    permx = sortperm(x)
+    return(reshape([corkendall!(copy(x), Y[:,i], permx) for i in 1:n], 1, n))
+end
 
 function corkendall(X::RealMatrix)
     n = size(X, 2)
-    C = Matrix{eltype(X)}(I, n, n)
+    C = Matrix{Float64}(I, n, n)
     for j = 2:n
-        for i = 1:j-1
-            C[i,j] = corkendall!(X[:,i],X[:,j])
-            C[j,i] = C[i,j]
+        permx = sortperm(X[:,j])
+        for i = 1:j - 1
+            C[j,i] = corkendall!(X[:,j], X[:,i], permx)
+            C[i,j] = C[j,i]
+        end
+    end
+    return C
+end
+
+function corkendall(X::RealMatrix, Y::RealMatrix)
+    nr = size(X, 2)
+    nc = size(Y, 2)
+    C = Matrix{Float64}(undef, nr, nc)
+    for j = 1:nr
+        permx = sortperm(X[:,j])
+        for i = 1:nc
+            C[j,i] = corkendall!(X[:,j], Y[:,i], permx)
         end
     end
     return C
@@ -122,32 +125,111 @@ end
 
 # Auxilliary functions for Kendall's rank correlation
 
-function swaps!(x::RealVector)
-    n = length(x)
-    if n == 1 return 0 end
-    n2 = div(n, 2)
-    xl = view(x, 1:n2)
-    xr = view(x, n2+1:n)
-    nsl = swaps!(xl)
-    nsr = swaps!(xr)
-    sort!(xl)
-    sort!(xr)
-    return nsl + nsr + mswaps(xl,xr)
+"""
+    countties(x::RealVector, lo::Integer, hi::Integer)
+
+Return the number of ties within `x[lo:hi]`. Assumes `x` is sorted. 
+"""
+function countties(x::AbstractVector, lo::Integer, hi::Integer)
+    # Use of widen below prevents possible overflow errors when
+    # length(x) exceeds 2^16 (32 bit) or 2^32 (64 bit)
+    thistiecount = result = widen(0)
+    checkbounds(x, lo:hi)
+    @inbounds for i = (lo + 1):hi
+        if x[i] == x[i - 1]
+            thistiecount += 1
+        elseif thistiecount > 0
+            result += div(thistiecount * (thistiecount + 1), 2)
+            thistiecount = widen(0)
+        end
+    end
+
+    if thistiecount > 0
+        result += div(thistiecount * (thistiecount + 1), 2)
+    end
+    result
 end
 
-function mswaps(x::RealVector, y::RealVector)
-    i = 1
-    j = 1
-    nSwaps = 0
-    n = length(x)
-    while i <= n && j <= length(y)
-        if y[j] < x[i]
-            nSwaps += n - i + 1
+# Tests appear to show that a value of 64 is optimal,
+# but note that the equivalent constant in base/sort.jl is 20.
+const SMALL_THRESHOLD = 64
+
+# merge_sort! copied from Julia Base
+# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020)
+"""
+    merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0))    
+
+Mutates `v` by sorting elements `x[lo:hi]` using the merge sort algorithm. 
+This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance.
+"""
+function merge_sort!(v::AbstractVector, lo::Integer, hi::Integer, t::AbstractVector=similar(v, 0))
+    # Use of widen below prevents possible overflow errors when
+    # length(v) exceeds 2^16 (32 bit) or 2^32 (64 bit)
+    nswaps = widen(0)
+    @inbounds if lo < hi
+        hi - lo <= SMALL_THRESHOLD && return insertion_sort!(v, lo, hi)
+
+        m = midpoint(lo, hi)
+        (length(t) < m - lo + 1) && resize!(t, m - lo + 1)
+
+        nswaps = merge_sort!(v, lo,  m, t)
+        nswaps += merge_sort!(v, m + 1, hi, t)
+
+        i, j = 1, lo
+        while j <= m
+            t[i] = v[j]
+            i += 1
             j += 1
-        else
+        end
+
+        i, k = 1, lo
+        while k < j <= hi
+            if v[j] < t[i]
+                v[k] = v[j]
+                j += 1
+                nswaps += m - lo + 1 - (i - 1)
+            else
+                v[k] = t[i]
+                i += 1
+            end
+            k += 1
+        end
+        while k < j
+            v[k] = t[i]
+            k += 1
             i += 1
         end
     end
-    return nSwaps
+    return nswaps
 end
 
+# insertion_sort! and midpoint copied from Julia Base
+# (commit 28330a2fef4d9d149ba0fd3ffa06347b50067647, dated 20 Sep 2020)
+midpoint(lo::T, hi::T) where T <: Integer = lo + ((hi - lo) >>> 0x01)
+midpoint(lo::Integer, hi::Integer) = midpoint(promote(lo, hi)...)
+
+"""
+    insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer)
+
+Mutates `v` by sorting elements `x[lo:hi]` using the insertion sort algorithm. 
+This method is a copy-paste-edit of sort! in base/sort.jl, amended to return the bubblesort distance.
+"""
+function insertion_sort!(v::AbstractVector, lo::Integer, hi::Integer)
+    if lo == hi return widen(0) end
+    nswaps = widen(0)
+    @inbounds for i = lo + 1:hi
+        j = i
+        x = v[i]
+        while j > lo
+            if x < v[j - 1]
+                nswaps += 1
+                v[j] = v[j - 1]
+                j -= 1
+                continue
+            end
+            break
+        end
+        v[j] = x
+    end
+    return nswaps
+end
diff --git a/test/rankcorr.jl b/test/rankcorr.jl
index e5505c2a0..6110d4c41 100644
--- a/test/rankcorr.jl
+++ b/test/rankcorr.jl
@@ -23,20 +23,86 @@ c22 = corspearman(x2, x2)
 @test corspearman(X, X) ≈ [c11 c12; c12 c22]
 @test corspearman(X)    ≈ [c11 c12; c12 c22]
 
-
 # corkendall
 
-@test corkendall(x1, y) ≈ -0.105409255338946
-@test corkendall(x2, y) ≈ -0.117851130197758
+# Check error, handling of NaN, Inf etc
+@test_throws ErrorException("Vectors must have same length") corkendall([1,2,3,4], [1,2,3])
+@test isnan(corkendall([1,2], [3,NaN]))
+@test isnan(corkendall([1,1,1], [1,2,3]))
+@test corkendall([-Inf,-0.0,Inf],[1,2,3]) == 1.0
+
+# Test, with exact equality, some known results. 
+# RealVector, RealVector
+@test corkendall(x1, y) == -1/sqrt(90)
+@test corkendall(x2, y) == -1/sqrt(72)
+# RealMatrix, RealVector
+@test corkendall(X, y)  == [-1/sqrt(90), -1/sqrt(72)]
+# RealVector, RealMatrix
+@test corkendall(y, X)  == [-1/sqrt(90) -1/sqrt(72)]
+
+# n = 78_000 tests for overflow errors on 32 bit
+# Testing for overflow errors on 64bit would require n be too large for practicality
+# This also tests merge_sort! since n is (much) greater than SMALL_THRESHOLD.
+n = 78_000
+# Test with many repeats
+@test corkendall(repeat(x1, n), repeat(y, n)) ≈ -1/sqrt(90)
+@test corkendall(repeat(x2, n), repeat(y, n)) ≈ -1/sqrt(72)
+@test corkendall(repeat(X, n), repeat(y, n))  ≈ [-1/sqrt(90), -1/sqrt(72)]
+@test corkendall(repeat(y, n), repeat(X, n))  ≈ [-1/sqrt(90) -1/sqrt(72)]
+@test corkendall(repeat([0,1,1,0], n), repeat([1,0,1,0], n)) == 0.0
+
+# Test with no repeats, note testing for exact equality
+@test corkendall(collect(1:n), collect(1:n))          == 1.0
+@test corkendall(collect(1:n), reverse(collect(1:n))) == -1.0
 
-@test corkendall(X, y) ≈ [-0.105409255338946, -0.117851130197758]
-@test corkendall(y, X) ≈ [-0.105409255338946 -0.117851130197758]
+# All elements identical should yield NaN
+@test isnan(corkendall(repeat([1], n), collect(1:n)))
 
 c11 = corkendall(x1, x1)
 c12 = corkendall(x1, x2)
 c22 = corkendall(x2, x2)
 
-@test c11 ≈ 1.0
-@test c22 ≈ 1.0
+# RealMatrix, RealMatrix
 @test corkendall(X, X) ≈ [c11 c12; c12 c22]
+# RealMatrix
 @test corkendall(X)    ≈ [c11 c12; c12 c22]
+
+@test c11 == 1.0
+@test c22 == 1.0
+@test c12 == 3/sqrt(20)
+
+# Finished testing for overflow, so redefine n for speedier tests
+n = 100
+
+@test corkendall(repeat(X, n), repeat(X, n)) ≈ [c11 c12; c12 c22]
+@test corkendall(repeat(X, n))               ≈ [c11 c12; c12 c22]
+
+# All eight three-element permutations
+z = [1  1  1;
+     1  1  2;
+     1  2  2;
+     1  2  2;
+     1  2  1;
+     2  1  2;
+     1  1  2;
+     2  2  2]
+
+@test corkendall(z)         == [1 0 1/3; 0 1 0; 1/3 0 1]
+@test corkendall(z, z)      == [1 0 1/3; 0 1 0; 1/3 0 1]
+@test corkendall(z[:,1], z) == [1 0 1/3]
+@test corkendall(z, z[:,1]) == [1; 0; 1/3]
+
+z = float(z)
+@test corkendall(z)         == [1 0 1/3; 0 1 0; 1/3 0 1]
+@test corkendall(z, z)      == [1 0 1/3; 0 1 0; 1/3 0 1]
+@test corkendall(z[:,1], z) == [1 0 1/3]
+@test corkendall(z, z[:,1]) == [1; 0; 1/3]
+
+w = repeat(z, n)
+@test corkendall(w)         == [1 0 1/3; 0 1 0; 1/3 0 1]
+@test corkendall(w, w)      == [1 0 1/3; 0 1 0; 1/3 0 1]
+@test corkendall(w[:,1], w) == [1 0 1/3]
+@test corkendall(w, w[:,1]) == [1; 0; 1/3]
+
+StatsBase.midpoint(1,10)        == 5
+StatsBase.midpoint(1,widen(10)) == 5