From d0b1bb6f422c0193baf89b7aa08a66813f54fbb1 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Fri, 11 Oct 2019 11:06:10 -0500
Subject: [PATCH 1/4] Partitions of all arrays use views

Previously only `::Vector` was special-cased to use views. The trade-off here is that we lose the ability to predict the concrete eltype -- since arrays can potentially choose to return something different from `vec` or `view`. Generic iterables still collect their elements into a freshly-allocated `Vector`, like before.
---
 NEWS.md           |  2 ++
 base/iterators.jl | 31 +++++++++++++++++++++++++------
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 7ac3d162a924a..034ea5d85fbae 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -42,6 +42,8 @@ Standard library changes
 
 * Verbose `display` of `Char` (`text/plain` output) now shows the codepoint value in standard-conforming `"U+XXXX"` format ([#33291]).
 
+* `Iterators.partition` now uses views (or smartly re-computed ranges) for partitions of all `AbstractArray`s ([#33533]).
+
 
 #### Libdl
 
diff --git a/base/iterators.jl b/base/iterators.jl
index 9c813d7e21632..4617a139b9b4d 100644
--- a/base/iterators.jl
+++ b/base/iterators.jl
@@ -923,37 +923,56 @@ Iterate over a collection `n` elements at a time.
 # Examples
 ```jldoctest
 julia> collect(Iterators.partition([1,2,3,4,5], 2))
-3-element Array{Array{Int64,1},1}:
+3-element Array{SubArray{Int64,1,Array{Int64,1},Tuple{UnitRange{Int64}},true},1}:
  [1, 2]
  [3, 4]
  [5]
 ```
 """
-partition(c::T, n::Integer) where {T} = PartitionIterator{T}(c, Int(n))
+function partition(c, n::Integer)
+    n < 1 && throw(ArgumentError("cannot create partitions of length $n"))
+    return PartitionIterator(c, Int(n))
+end
 
 struct PartitionIterator{T}
     c::T
     n::Int
 end
+# Partitions are explicitly a linear indexing operation, so reshape to 1-d immediately
+PartitionIterator(A::AbstractArray, n::Int) = PartitionIterator(vec(A), n)
+PartitionIterator(v::AbstractVector, n::Int) = PartitionIterator{typeof(v)}(v, n)
 
 eltype(::Type{PartitionIterator{T}}) where {T} = Vector{eltype(T)}
+# Arrays use a generic `view`-of-a-`vec`, so we cannot exactly predict what we'll get back
+eltype(::Type{PartitionIterator{T}}) where {T<:AbstractArray} = AbstractVector{eltype(T)}
+# But for some common implementations in Base we know the answer exactly
+eltype(::Type{PartitionIterator{T}}) where {T<:Vector} = SubArray{eltype(T), 1, T, Tuple{UnitRange{Int}}, true}
+
+IteratorEltype(::Type{<:PartitionIterator{T}}) where {T} = IteratorEltype(T)
+IteratorEltype(::Type{<:PartitionIterator{T}}) where {T<:AbstractArray} = EltypeUnknown()
+IteratorEltype(::Type{<:PartitionIterator{T}}) where {T<:Vector} = IteratorEltype(T)
+
 partition_iteratorsize(::HasShape) = HasLength()
 partition_iteratorsize(isz) = isz
 function IteratorSize(::Type{PartitionIterator{T}}) where {T}
     partition_iteratorsize(IteratorSize(T))
 end
 
-IteratorEltype(::Type{<:PartitionIterator{T}}) where {T} = IteratorEltype(T)
-
 function length(itr::PartitionIterator)
     l = length(itr.c)
     return div(l, itr.n) + ((mod(l, itr.n) > 0) ? 1 : 0)
 end
 
-function iterate(itr::PartitionIterator{<:Vector}, state=1)
+function iterate(itr::PartitionIterator{<:AbstractRange}, state=1)
+    state > length(itr.c) && return nothing
+    r = min(state + itr.n - 1, length(itr.c))
+    return @inbounds itr.c[state:r], r + 1
+end
+
+function iterate(itr::PartitionIterator{<:AbstractArray}, state=1)
     state > length(itr.c) && return nothing
     r = min(state + itr.n - 1, length(itr.c))
-    return view(itr.c, state:r), r + 1
+    return @inbounds view(itr.c, state:r), r + 1
 end
 
 struct IterationCutShort; end

From 9f24a06ccd5f47df902354919fd7440ac161ea1d Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Fri, 11 Oct 2019 11:10:10 -0500
Subject: [PATCH 2/4] Optimize partitioned views of CartesianIndices

And also recompute ranges instead of using views for partitions of ranges.

Since `Iterators.partition` is so handy for dividing up iteration spaces, it makes sense to optimize this as much as possible. While it is enherently a "linear" operation, it is a batched linear operation that allows us to skip doing all the effective ind2sub work on every single iteration.
---
 base/multidimensional.jl | 47 +++++++++++++++++++++++-
 test/iterators.jl        | 79 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index f3cebeec249ff..1978f45da44e3 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -8,8 +8,9 @@ module IteratorsMD
 
     import .Base: +, -, *, (:)
     import .Base: simd_outer_range, simd_inner_length, simd_index
-    using .Base: IndexLinear, IndexCartesian, AbstractCartesianIndex, fill_to_length, tail
-    using .Base.Iterators: Reverse
+    using .Base: IndexLinear, IndexCartesian, AbstractCartesianIndex, fill_to_length, tail,
+        ReshapedArray, ReshapedArrayLF, OneTo
+    using .Base.Iterators: Reverse, PartitionIterator
 
     export CartesianIndex, CartesianIndices
 
@@ -463,6 +464,48 @@ module IteratorsMD
     iterate(iter::Reverse{<:CartesianIndices{0}}, state=false) = state ? nothing : (CartesianIndex(), true)
 
     Base.LinearIndices(inds::CartesianIndices{N,R}) where {N,R} = LinearIndices{N,R}(inds.indices)
+
+    # Views of reshaped CartesianIndices are used for partitions — ensure these are fast
+    const CartesianPartition{T<:CartesianIndex, P<:CartesianIndices, R<:ReshapedArray{T,1,P}} = SubArray{T,1,R,Tuple{UnitRange{Int}},false}
+    eltype(::Type{PartitionIterator{T}}) where {T<:ReshapedArrayLF} = SubArray{eltype(T), 1, T, Tuple{UnitRange{Int}}, true}
+    eltype(::Type{PartitionIterator{T}}) where {T<:ReshapedArray} = SubArray{eltype(T), 1, T, Tuple{UnitRange{Int}}, false}
+    Iterators.IteratorEltype(::Type{<:PartitionIterator{T}}) where {T<:ReshapedArray} = Iterators.IteratorEltype(T)
+
+    eltype(::Type{PartitionIterator{T}}) where {T<:OneTo} = UnitRange{eltype(T)}
+    eltype(::Type{PartitionIterator{T}}) where {T<:Union{UnitRange, StepRange, StepRangeLen, LinRange}} = T
+    Iterators.IteratorEltype(::Type{<:PartitionIterator{T}}) where {T<:Union{OneTo, UnitRange, StepRange, StepRangeLen, LinRange}} = Iterators.IteratorEltype(T)
+
+
+    @inline function iterate(iter::CartesianPartition)
+        isempty(iter) && return nothing
+        f = first(iter)
+        return (f, (f, 1))
+    end
+    @inline function iterate(iter::CartesianPartition, (state, n))
+        n >= length(iter) && return nothing
+        I = IteratorsMD.inc(state.I, first(iter.parent.parent).I, last(iter.parent.parent).I)
+        return I, (I, n+1)
+    end
+
+    simd_outer_range(iter::CartesianPartition{CartesianIndex{2}}) = @inbounds CartesianIndices((iter[1][2]:iter[end][2],))
+    @inline function simd_outer_range(iter::CartesianPartition)
+        t = tail(iter.parent.parent.indices)
+        ci = CartesianIndices(t)
+        li = LinearIndices(t)
+        return @inbounds view(ci, li[tail(iter[1].I)...]:li[tail(iter[end].I)...])
+    end
+    @inline function simd_inner_length(iter::CartesianPartition, I::CartesianIndex)
+        inner = iter.parent.parent.indices[1]
+        @inbounds fi = iter[1].I
+        @inbounds li = iter[end].I
+        inner_start = I.I == tail(fi) ? fi[1] : first(inner)
+        inner_end   = I.I == tail(li) ? li[1] : last(inner)
+        return inner_end - inner_start + 1
+    end
+    @inline function simd_index(iter::CartesianPartition, Ilast::CartesianIndex, I1::Int)
+        f = @inbounds iter[1]
+        CartesianIndex((I1+first(iter.parent.parent.indices[1])+(Ilast.I==tail(f.I))*(f[1]-1), Ilast.I...))
+    end
 end  # IteratorsMD
 
 
diff --git a/test/iterators.jl b/test/iterators.jl
index eabe5e0905b98..4156942daefda 100644
--- a/test/iterators.jl
+++ b/test/iterators.jl
@@ -410,6 +410,85 @@ for n in [5,6]
           [(1,1),(2,2),(3,3),(4,4),(5,5)]
 end
 
+function iterate_length(iter)
+    n=0
+    for i in iter
+        n += 1
+    end
+    return n
+end
+function simd_iterate_length(iter)
+    n=0
+    @simd for i in iter
+        n += 1
+    end
+    return n
+end
+function simd_trip_count(iter)
+    return sum(Base.SimdLoop.simd_inner_length(iter, i) for i in Base.SimdLoop.simd_outer_range(iter))
+end
+function iterate_elements(iter)
+    vals = Vector{eltype(iter)}(undef, length(iter))
+    i = 1
+    for v in iter
+        @inbounds vals[i] = v
+        i += 1
+    end
+    return vals
+end
+function simd_iterate_elements(iter)
+    vals = Vector{eltype(iter)}(undef, length(iter))
+    i = 1
+    @simd for v in iter
+        @inbounds vals[i] = v
+        i += 1
+    end
+    return vals
+end
+function index_elements(iter)
+    vals = Vector{eltype(iter)}(undef, length(iter))
+    i = 1
+    for j in eachindex(iter)
+        @inbounds vals[i] = iter[j]
+        i += 1
+    end
+    return vals
+end
+
+@testset "CartesianPartition optimizations" for dims in ((1,), (64,), (101,),
+                                                         (1,1), (8,8), (11, 13),
+                                                         (1,1,1), (8, 4, 2), (11, 13, 17)),
+                                                part in (1, 7, 8, 11, 63, 64, 65, 142, 143, 144)
+    P = partition(CartesianIndices(dims), part)
+    for I in P
+        @test length(I) == iterate_length(I) == simd_iterate_length(I) == simd_trip_count(I)
+        @test collect(I) == iterate_elements(I) == simd_iterate_elements(I) == index_elements(I)
+    end
+    @test all(Base.splat(==), zip(Iterators.flatten(map(collect, P)), CartesianIndices(dims)))
+end
+@testset "empty/invalid partitions" begin
+    @test_throws ArgumentError partition(1:10, 0)
+    @test_throws ArgumentError partition(1:10, -1)
+    @test_throws ArgumentError partition(1:0, 0)
+    @test_throws ArgumentError partition(1:0, -1)
+    @test isempty(partition(1:0, 1))
+    @test isempty(partition(CartesianIndices((0,1)), 1))
+end
+@testset "exact partition eltypes" for a in (Base.OneTo(24), 1:24, 1:1:24, LinRange(1,10,24), .1:.1:2.4, Vector(1:24),
+                                             CartesianIndices((4, 6)), Dict((1:24) .=> (1:24)))
+    P = partition(a, 2)
+    @test eltype(P) === typeof(first(P))
+    @test Iterators.IteratorEltype(P) == Iterators.HasEltype()
+    if a isa AbstractArray
+        P = partition(vec(a), 2)
+        @test eltype(P) === typeof(first(P))
+        P = partition(reshape(a, 6, 4), 2)
+        @test eltype(P) === typeof(first(P))
+        P = partition(reshape(a, 2, 3, 4), 2)
+        @test eltype(P) === typeof(first(P))
+    end
+end
+
 @test join(map(x->string(x...), partition("Hello World!", 5)), "|") ==
       "Hello| Worl|d!"
 

From 9e10015337204c0b216011cad583d002574428ca Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Fri, 11 Oct 2019 11:10:43 -0500
Subject: [PATCH 3/4] Simplify, simdify, and speed up BitArray broadcasts with
 partitions

---
 base/broadcast.jl | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 16c0653be515d..ebce0afaba6c1 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -919,20 +919,19 @@ end
     length(dest) < 256 && return invoke(copyto!, Tuple{AbstractArray, Broadcasted{Nothing}}, dest, bc)
     tmp = Vector{Bool}(undef, bitcache_size)
     destc = dest.chunks
-    ind = cind = 1
+    cind = 1
     bc′ = preprocess(dest, bc)
-    @simd for I in eachindex(bc′)
-        @inbounds tmp[ind] = bc′[I]
-        ind += 1
-        if ind > bitcache_size
-            dumpbitcache(destc, cind, tmp)
-            cind += bitcache_chunks
-            ind = 1
+    for P in Iterators.partition(eachindex(bc′), bitcache_size)
+        ind = 1
+        @simd for I in P
+            @inbounds tmp[ind] = bc′[I]
+            ind += 1
+        end
+        @simd for i in ind:bitcache_size
+            @inbounds tmp[i] = false
         end
-    end
-    if ind > 1
-        @inbounds tmp[ind:bitcache_size] .= false
         dumpbitcache(destc, cind, tmp)
+        cind += bitcache_chunks
     end
     return dest
 end

From e472b14f50888ed03729685027b9817b5fbda2ab Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@gmail.com>
Date: Thu, 31 Oct 2019 15:05:10 -0500
Subject: [PATCH 4/4] NFC: style changes per review

shorter lines, more spaces, and comments (because I had already forgotten how this worked myself)
---
 base/multidimensional.jl | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 1978f45da44e3..3745186f5cc4f 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -487,13 +487,21 @@ module IteratorsMD
         return I, (I, n+1)
     end
 
-    simd_outer_range(iter::CartesianPartition{CartesianIndex{2}}) = @inbounds CartesianIndices((iter[1][2]:iter[end][2],))
     @inline function simd_outer_range(iter::CartesianPartition)
+        # In general, the Cartesian Partition might start and stop in the middle of the outer
+        # dimensions — thus the outer range of a CartesianPartition is itself a
+        # CartesianPartition.
         t = tail(iter.parent.parent.indices)
         ci = CartesianIndices(t)
         li = LinearIndices(t)
         return @inbounds view(ci, li[tail(iter[1].I)...]:li[tail(iter[end].I)...])
     end
+    function simd_outer_range(iter::CartesianPartition{CartesianIndex{2}})
+        # But for two-dimensional Partitions the above is just a simple one-dimensional range
+        # over the second dimension; we don't need to worry about non-rectangular staggers in
+        # higher dimensions.
+        return @inbounds CartesianIndices((iter[1][2]:iter[end][2],))
+    end
     @inline function simd_inner_length(iter::CartesianPartition, I::CartesianIndex)
         inner = iter.parent.parent.indices[1]
         @inbounds fi = iter[1].I
@@ -503,8 +511,12 @@ module IteratorsMD
         return inner_end - inner_start + 1
     end
     @inline function simd_index(iter::CartesianPartition, Ilast::CartesianIndex, I1::Int)
+        # I1 is the 0-based distance from the first dimension's offest
+        offset = first(iter.parent.parent.indices[1]) # (this is 1 for 1-based arrays)
+        # In the first column we need to also add in the iter's starting point (branchlessly)
         f = @inbounds iter[1]
-        CartesianIndex((I1+first(iter.parent.parent.indices[1])+(Ilast.I==tail(f.I))*(f[1]-1), Ilast.I...))
+        startoffset = (Ilast.I == tail(f.I))*(f[1] - 1)
+        CartesianIndex((I1 + offset + startoffset, Ilast.I...))
     end
 end  # IteratorsMD