diff --git a/Src/ILGPU/Util/ParallelCache.cs b/Src/ILGPU/Util/ParallelCache.cs
new file mode 100644
index 0000000000..ecb4adbb96
--- /dev/null
+++ b/Src/ILGPU/Util/ParallelCache.cs
@@ -0,0 +1,378 @@
+// ---------------------------------------------------------------------------------------
+// ILGPU
+// Copyright (c) 2023 ILGPU Project
+// www.ilgpu.net
+//
+// File: ParallelCache.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+using System.Threading.Tasks;
+
+namespace ILGPU.Util
+{
+    /// <summary>
+    /// Represents a parallel object cache to be used in combination with a
+    /// <see cref="Parallel"/> for implementation to avoid unnecessary temporary object
+    /// creation. Intermediate objects handed out by <see cref="GetOrCreate"/> are
+    /// tracked as used and returned to the cache by <see cref="FinishProcessing()"/>.
+    /// </summary>
+    /// <typeparam name="T">The type of the elements to cache.</typeparam>
+    public abstract class ParallelCache<T> : DisposeBase, IParallelCache<T>
+        where T : class
+    {
+        #region Instance
+
+        // Idle intermediates that can be handed out again.
+        private InlineList<T> cache;
+
+        // Intermediates handed out since the last call to FinishProcessing().
+        private InlineList<T> used;
+
+        /// <summary>
+        /// Creates a new parallel cache.
+        /// </summary>
+        /// <param name="initialCapacity">
+        /// The initial number of processing threads (if any). If omitted, twice the
+        /// number of available processors is used.
+        /// </param>
+        protected ParallelCache(int? initialCapacity = null)
+        {
+            int capacity = initialCapacity ?? Environment.ProcessorCount * 2;
+            cache = InlineList<T>.Create(capacity);
+            used = InlineList<T>.Create(capacity);
+
+            // Both delegates are created once and reused for every parallel
+            // operation. The finalizer binds to the FinishProcessing(T) overload.
+            LocalInitializer = GetOrCreate;
+            LocalFinalizer = FinishProcessing;
+        }
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>
+        /// Returns the underlying sync root object that guards the internal lists
+        /// while intermediates are handed out.
+        /// </summary>
+        public object SyncRoot { get; } = new object();
+
+        /// <summary>
+        /// Returns the local initializer function (see <see cref="GetOrCreate"/>).
+        /// </summary>
+        public Func<T> LocalInitializer { get; }
+
+        /// <summary>
+        /// Returns the local finalizer action (see
+        /// <see cref="FinishProcessing(T)"/>).
+        /// </summary>
+        public Action<T> LocalFinalizer { get; }
+
+        /// <summary>
+        /// Returns the underlying used intermediates.
+        /// </summary>
+        protected ReadOnlySpan<T> Used => used;
+
+        #endregion
+
+        #region Methods
+
+        /// <summary>
+        /// Initializes this parallel cache for the next parallel operation.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void InitializeProcessing()
+        {
+            // This method does not perform an operation at the moment but this may
+            // change in the future. For this reason, this (empty) method remains here
+            // and should be called in all cases prior to calling GetOrCreate().
+        }
+
+        /// <summary>
+        /// Gets a cached intermediate state or creates a new one if the cache is
+        /// empty, initializes it and registers it as being in use.
+        /// </summary>
+        /// <returns>The initialized intermediate state.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public T GetOrCreate()
+        {
+            // Check the cache for a previously created intermediate to reuse
+            T intermediate;
+            lock (SyncRoot)
+            {
+                if (cache.Count > 0)
+                {
+                    // Take the most recently returned entry
+                    int lastIndex = cache.Count - 1;
+                    intermediate = cache[lastIndex];
+                    cache.RemoveAt(lastIndex);
+                }
+                else
+                {
+                    // Create a new intermediate result
+                    intermediate = CreateIntermediate();
+                }
+            }
+
+            // Initialize the intermediate result outside of the lock
+            InitializeIntermediate(intermediate);
+
+            // Add to our list of used intermediates
+            lock (SyncRoot)
+                used.Add(intermediate);
+
+            return intermediate;
+        }
+
+        /// <summary>
+        /// Finishes a parallel processing step by returning all used intermediates
+        /// to the cache.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void FinishProcessing()
+        {
+            // NOTE(review): unlike GetOrCreate() this does not take SyncRoot;
+            // presumably it must only be called once all workers are done - confirm.
+            cache.AddRange(used);
+            used.Clear();
+        }
+
+        /// <summary>
+        /// Creates a new intermediate instance without initializing it properly.
+        /// </summary>
+        /// <returns>The created intermediate state.</returns>
+        protected abstract T CreateIntermediate();
+
+        /// <summary>
+        /// Initializes the given intermediate state in order to prepare it for
+        /// processing. The default implementation does nothing.
+        /// </summary>
+        /// <param name="intermediateState">The intermediate state to prepare.</param>
+        protected virtual void InitializeIntermediate(T intermediateState) { }
+
+        /// <summary>
+        /// Finishes processing of the current thread while getting an intermediate
+        /// state. The default implementation does nothing.
+        /// </summary>
+        /// <param name="intermediateState">
+        /// The intermediate state to operate on.
+        /// </param>
+        protected virtual void FinishProcessing(T intermediateState) { }
+
+        #endregion
+
+        #region IParallelCache
+
+        /// <summary>
+        /// Creates a new intermediate instance without initializing it properly.
+        /// </summary>
+        /// <returns>The created intermediate state.</returns>
+        T IParallelCache<T>.CreateIntermediate() => CreateIntermediate();
+
+        /// <summary>
+        /// Initializes the given intermediate state in order to prepare it for
+        /// processing.
+        /// </summary>
+        /// <param name="intermediateState">The intermediate state to prepare.</param>
+        void IParallelCache<T>.InitializeIntermediate(T intermediateState) =>
+            InitializeIntermediate(intermediateState);
+
+        /// <summary>
+        /// Finishes processing of the current thread while getting an intermediate
+        /// state.
+        /// </summary>
+        /// <param name="intermediateState">
+        /// The intermediate state to operate on.
+        /// </param>
+        void IParallelCache<T>.FinishProcessing(T intermediateState) =>
+            FinishProcessing(intermediateState);
+
+        #endregion
+
+        #region IDisposable
+
+        /// <summary>
+        /// Disposes all created intermediate states (if required). This includes
+        /// intermediates that are still registered as used, e.g. because
+        /// <see cref="FinishProcessing()"/> has not been called for them.
+        /// </summary>
+        /// <param name="disposing">
+        /// The disposing flag that is forwarded to the base implementation.
+        /// </param>
+        protected override void Dispose(bool disposing)
+        {
+            // NOTE(review): elements are disposed regardless of the disposing flag,
+            // as before - confirm this is intended for DisposeBase.
+            if (typeof(IDisposable).IsAssignableFrom(typeof(T)))
+            {
+                // An intermediate lives in exactly one list at a time: it is removed
+                // from the cache when handed out and only re-added when the used
+                // list is cleared. Hence, nothing is disposed twice here.
+                if (cache.Count > 0)
+                {
+                    foreach (var intermediateState in cache)
+                        intermediateState.AsNotNullCast<IDisposable>().Dispose();
+                }
+                if (used.Count > 0)
+                {
+                    foreach (var intermediateState in used)
+                        intermediateState.AsNotNullCast<IDisposable>().Dispose();
+                }
+
+                // Drop all references to avoid disposing them a second time
+                cache.Clear();
+                used.Clear();
+            }
+            base.Dispose(disposing);
+        }
+
+        #endregion
+    }
+
+    /// <summary>
+    /// An abstract parallel cache interface operating on intermediate states.
+    /// </summary>
+    /// <typeparam name="T">The type of all intermediate states.</typeparam>
+    public interface IParallelCache<T>
+    {
+        /// <summary>
+        /// Creates a new intermediate instance without initializing it properly.
+        /// </summary>
+        /// <returns>The created intermediate state.</returns>
+        T CreateIntermediate();
+
+        /// <summary>
+        /// Initializes the given intermediate state in order to prepare it for
+        /// processing.
+        /// </summary>
+        /// <param name="intermediateState">The intermediate state to prepare.</param>
+        void InitializeIntermediate(T intermediateState);
+
+        /// <summary>
+        /// Finishes processing of the current thread while getting an intermediate
+        /// state.
+        /// </summary>
+        /// <param name="intermediateState">
+        /// The intermediate state to operate on.
+        /// </param>
+        void FinishProcessing(T intermediateState);
+    }
+
+    /// <summary>
+    /// An abstract parallel processing body representing a function to be executed
+    /// concurrently on a given value range. It operates on intermediate values that
+    /// are managed by its surrounding processing cache.
+    /// </summary>
+    /// <typeparam name="T">The type of all intermediate states.</typeparam>
+    public interface IParallelProcessingBody<T>
+        where T : class
+    {
+        /// <summary>
+        /// Initializes this processing body to prepare the upcoming parallel
+        /// processing steps.
+        /// </summary>
+        void Initialize();
+
+        /// <summary>
+        /// Processes a single element concurrently while accepting an intermediate
+        /// state on which this body operates.
+        /// </summary>
+        /// <param name="index">The current processing element index.</param>
+        /// <param name="loopState">The parallel loop state (if any).</param>
+        /// <param name="intermediateState">
+        /// The current intermediate state for this thread.
+        /// </param>
+        void Process(
+            int index,
+            ParallelLoopState? loopState,
+            T intermediateState);
+
+        /// <summary>
+        /// Finalizes the current body operation while having the ability to inspect
+        /// all previously used intermediate states.
+        /// </summary>
+        /// <param name="intermediateStates">
+        /// A span referring to all previously used intermediate states.
+        /// </param>
+        void Finalize(ReadOnlySpan<T> intermediateStates);
+    }
+
+    /// <summary>
+    /// Static helpers for parallel processing extensions.
+    /// </summary>
+    public static class ParallelProcessing
+    {
+        /// <summary>
+        /// Gets or sets whether debug mode is enabled. Note that this value needs to
+        /// be changed before the first
+        /// <see cref="ParallelProcessingCache{T, TBody}"/> instance has been created
+        /// since the flag is cached locally to enable JIT optimizations.
+        /// </summary>
+        public static bool DebugMode { get; set; }
+    }
+
+    /// <summary>
+    /// Represents a parallel object cache to be used in combination with a
+    /// <see cref="Parallel"/> for implementation to avoid unnecessary temporary object
+    /// creation. Furthermore, this implementation operates on specialized body
+    /// instances to avoid virtual function calls in each processing step.
+    /// </summary>
+    /// <typeparam name="T">The type of the elements to cache.</typeparam>
+    /// <typeparam name="TBody">The type of the custom loop body instance.</typeparam>
+    public abstract class ParallelProcessingCache<T, TBody> : ParallelCache<T>
+        where T : class
+        where TBody : IParallelProcessingBody<T>
+    {
+        /// <summary>
+        /// Returns true if the debug mode is enabled for all parallel processing
+        /// operations. The value of <see cref="ParallelProcessing.DebugMode"/> is
+        /// captured once when this static field is initialized.
+        /// </summary>
+        private static readonly bool DebugMode = ParallelProcessing.DebugMode;
+
+        // Loop body delegate wrapping bodyImplementation.Process; created once.
+        private readonly Func<int, ParallelLoopState?, T, T> body;
+        private readonly TBody bodyImplementation;
+        private readonly ParallelOptions defaultOptions = new();
+
+        /// <summary>
+        /// Creates a new parallel processing cache operating on intermediate states.
+        /// </summary>
+        /// <param name="initialCapacity">
+        /// The initial number of processing threads (if any).
+        /// </param>
+        [SuppressMessage(
+            "Usage",
+            "CA2214:Do not call overridable methods in constructors",
+            Justification = "This method is called here as it represents an abstract " +
+            "static factory method")]
+        protected ParallelProcessingCache(int? initialCapacity = null)
+            : base(initialCapacity)
+        {
+            bodyImplementation = CreateBody();
+
+            // The intermediate state is passed through unchanged so that the same
+            // instance reaches the next iteration and the local finalizer.
+            body = (i, state, intermediate) =>
+            {
+                bodyImplementation.Process(i, state, intermediate);
+                return intermediate;
+            };
+        }
+
+        /// <summary>
+        /// Creates the required parallel processing body to be used.
+        /// </summary>
+        /// <returns>The processing body to use.</returns>
+        protected abstract TBody CreateBody();
+
+        /// <summary>
+        /// Performs the current operation in parallel using default options.
+        /// </summary>
+        /// <param name="fromInclusive">The inclusive start index.</param>
+        /// <param name="toExclusive">The exclusive end index.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void ParallelFor(int fromInclusive, int toExclusive) =>
+            ParallelFor(fromInclusive, toExclusive, defaultOptions);
+
+        /// <summary>
+        /// Performs the current operation in parallel. If debug mode is enabled, all
+        /// indices are processed sequentially on the calling thread using a single
+        /// intermediate state.
+        /// </summary>
+        /// <param name="fromInclusive">The inclusive start index.</param>
+        /// <param name="toExclusive">The exclusive end index.</param>
+        /// <param name="options">The parallel execution options.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void ParallelFor(
+            int fromInclusive,
+            int toExclusive,
+            ParallelOptions options)
+        {
+            // Initialize processing cache
+            InitializeProcessing();
+
+            try
+            {
+                // Initialize operation
+                bodyImplementation.Initialize();
+
+                // Check for enabled debug mode
+                if (DebugMode)
+                {
+                    var intermediate = GetOrCreate();
+                    try
+                    {
+                        // No loop state is available in the sequential debug path
+                        for (int i = fromInclusive; i < toExclusive; ++i)
+                            body(i, null, intermediate);
+                    }
+                    finally
+                    {
+                        // Mirror the parallel path, which passes each local state
+                        // to LocalFinalizer once its work is done.
+                        FinishProcessing(intermediate);
+                    }
+                }
+                else
+                {
+                    Parallel.For(
+                        fromInclusive,
+                        toExclusive,
+                        options,
+                        LocalInitializer,
+                        body,
+                        LocalFinalizer);
+                }
+
+                // Finalize operation
+                bodyImplementation.Finalize(Used);
+            }
+            finally
+            {
+                // Always return the used intermediates to the cache, even if the
+                // operation failed, so that they can be reused or disposed later.
+                FinishProcessing();
+            }
+        }
+    }
+}