diff --git a/README.md b/README.md
index 1d8fe978de..b3be611454 100755
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ Now you're set up and ready to check out some of the other examples from our
 ## Authors and Citation
 
 Qiskit Aer is the work of [many people](https://github.com/Qiskit/qiskit-aer/graphs/contributors) who contribute
-to the project at different levels. If you use Qiskit, please cite as per the included [BibTeX file](https://github.com/Qiskit/qiskit/blob/master/Qiskit.bib).
+to the project at different levels. If you use Qiskit, please cite as per the included [BibTeX file](https://github.com/Qiskit/qiskit-terra/blob/main/CITATION.bib).
 
 ## License
diff --git a/qiskit_aer/backends/aer_compiler.py b/qiskit_aer/backends/aer_compiler.py
index 2e7930d198..4909f73537 100644
--- a/qiskit_aer/backends/aer_compiler.py
+++ b/qiskit_aer/backends/aer_compiler.py
@@ -465,6 +465,8 @@ def compile_circuit(circuits, basis_gates=None, optypes=None):
     "chunk_swap_buffer_qubits": (int, np.integer),
     "batched_shots_gpu": (bool, np.bool_),
     "batched_shots_gpu_max_qubits": (int, np.integer),
+    "shot_branching_enable": (bool, np.bool_),
+    "shot_branching_sampling_enable": (bool, np.bool_),
     "num_threads_per_device": (int, np.integer),
     "statevector_parallel_threshold": (int, np.integer),
     "statevector_sample_measure_opt": (int, np.integer),
@@ -488,6 +490,7 @@ def compile_circuit(circuits, basis_gates=None, optypes=None):
     "use_cuTensorNet_autotuning": (bool, np.bool_),
     "parameterizations": (list),
     "fusion_parallelization_threshold": (int, np.integer),
+    "target_gpus": (list),
 }
diff --git a/qiskit_aer/backends/aer_simulator.py b/qiskit_aer/backends/aer_simulator.py
index 0b7565e539..1154dab407 100644
--- a/qiskit_aer/backends/aer_simulator.py
+++ b/qiskit_aer/backends/aer_simulator.py
@@ -170,6 +170,10 @@ class AerSimulator(AerBackend):
       If AerSimulator is built with cuStateVec support, cuStateVec APIs are
       enabled by setting ``cuStateVec_enable=True``.
 
+    * ``target_gpus`` (list): A list of GPU IDs, starting from 0, that sets
+      the target GPUs used for the simulation. If this option is not
+      specified, all available GPUs are used for chunk/shot distribution.
+
     **Additional Backend Options**
 
@@ -287,6 +291,30 @@ class AerSimulator(AerBackend):
       threads per GPU. This parameter is used to optimize Pauli noise
       simulation with multiple-GPUs (Default: 1).
 
+    * ``shot_branching_enable`` (bool): This option enables/disables
+      applying the shot-branching technique to speed up multi-shot
+      simulations of dynamic circuits, or of circuits with noise models
+      (Default: False).
+      Starting from a single state shared by multiple shots, the state is
+      branched dynamically at runtime.
+      This option can reduce the number of per-shot runs when there are
+      fewer branches than the total number of shots.
+      This option is available for ``"statevector"``, ``"density_matrix"``
+      and ``"tensor_network"``.
+
+    * ``shot_branching_sampling_enable`` (bool): This option enables/disables
+      applying sampling measure if the input circuit has all of its measure
+      operations at the end of the circuit (Default: False).
+      Because a measure operation branches a state into 2 states, it is not
+      efficient to apply branching for measure.
+      Sampling measure improves the speed of obtaining counts for multiple
+      shots sharing the same state.
+      Note that the counts obtained by sampling measure may not be the same
+      as the counts calculated by multiple measure operations, because
+      sampling measure takes only one random number per shot.
+      This option is available for ``"statevector"``, ``"density_matrix"``
+      and ``"tensor_network"``.
+
     * ``accept_distributed_results`` (bool): This option enables storing
       results independently in each process (Default: None).
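For orientation, here is a minimal usage sketch of the two shot-branching options documented above. It is not part of the diff; the circuit is illustrative, and the options are assumed to be accepted as ``AerSimulator`` keyword arguments like the other backend options:

```python
from qiskit import QuantumCircuit
from qiskit_aer import AerSimulator

# A dynamic circuit: the mid-circuit measurement and the conditional
# X gate are the points where shot branching can occur at runtime.
circ = QuantumCircuit(2, 2)
circ.h(0)
circ.measure(0, 0)
circ.x(1).c_if(0, 1)  # applied only in shots where clbit 0 reads 1
circ.measure(1, 1)

# shot_branching_enable shares one statevector across all shots and
# branches it on demand; shot_branching_sampling_enable additionally
# samples measurements that sit at the end of the circuit instead of
# branching on them (it has no effect on mid-circuit measurements).
sim = AerSimulator(
    method="statevector",
    shot_branching_enable=True,
    shot_branching_sampling_enable=True,
)
result = sim.run(circ, shots=1000).result()
print(result.get_counts())
```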
@@ -709,6 +737,9 @@ def _default_options(cls):
             batched_shots_gpu=False,
             batched_shots_gpu_max_qubits=16,
             num_threads_per_device=1,
+            # multi-shot branching
+            shot_branching_enable=False,
+            shot_branching_sampling_enable=False,
             # statevector options
             statevector_parallel_threshold=14,
             statevector_sample_measure_opt=10,
diff --git a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
index bf5296b18a..67e057c74f 100644
--- a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
+++ b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
@@ -182,6 +182,11 @@ void bind_aer_controller(MODULE m) {
       [](Config &config, uint_t val) {
         config.num_threads_per_device.value(val);
       });
+  // # multi-shot branching
+  aer_config.def_readwrite("shot_branching_enable",
+                           &Config::shot_branching_enable);
+  aer_config.def_readwrite("shot_branching_sampling_enable",
+                           &Config::shot_branching_sampling_enable);
   // # statevector options
   aer_config.def_readwrite("statevector_parallel_threshold",
                            &Config::statevector_parallel_threshold);
@@ -403,6 +408,10 @@ void bind_aer_controller(MODULE m) {
       [](Config &config, uint_t val) {
         config.extended_stabilizer_norm_estimation_default_samples.value(val);
       });
+  aer_config.def_property(
+      "target_gpus",
+      [](const Config &config) { return config.target_gpus.val; },
+      [](Config &config, reg_t val) { config.target_gpus.value(val); });
 
   aer_config.def(py::pickle(
       [](const AER::Config &config) {
@@ -488,12 +497,14 @@ void bind_aer_controller(MODULE m) {
         write_value(77, config.unitary_parallel_threshold),
         write_value(78, config.memory_blocking_bits),
         write_value(
-            79,
-            config.extended_stabilizer_norm_estimation_default_samples));
+            79, config.extended_stabilizer_norm_estimation_default_samples),
+        write_value(80, config.shot_branching_enable),
+        write_value(81, config.shot_branching_sampling_enable),
+        write_value(82, config.target_gpus));
       },
       [](py::tuple t) {
         AER::Config config;
-        if (t.size() != 79)
+        if (t.size() != 82)
           throw std::runtime_error("Invalid serialization format.");
 
         read_value(t, 0, config.shots);
@@ -580,6 +591,9 @@ void bind_aer_controller(MODULE m) {
         read_value(t, 78, config.memory_blocking_bits);
         read_value(t, 79,
                    config.extended_stabilizer_norm_estimation_default_samples);
+        read_value(t, 80, config.shot_branching_enable);
+        read_value(t, 81, config.shot_branching_sampling_enable);
+        read_value(t, 82, config.target_gpus);
         return config;
       }));
 }
diff --git a/releasenotes/notes/add_executor-a03f2d23cf6f4ca9.yaml b/releasenotes/notes/add_executor-a03f2d23cf6f4ca9.yaml
new file mode 100644
index 0000000000..3d27fd9482
--- /dev/null
+++ b/releasenotes/notes/add_executor-a03f2d23cf6f4ca9.yaml
@@ -0,0 +1,30 @@
+---
+features:
+  - |
+    This release restructures the ``State`` classes, adding circuit executor
+    classes that run a circuit and manage multiple states, either for
+    multi-shot simulations or for multi-chunk simulations of large numbers
+    of qubits.
+    Previously the ``StateChunk`` class managed multiple chunks for
+    multi-shot or multi-chunk simulations, but now a ``State`` class holds
+    only a single state and all of the parallelization code has moved into
+    the ``Executor`` classes, so the ``State`` classes are independent of
+    parallelization.
+    Some of the functions in the ``Aer::Controller`` class have also moved
+    to the ``CircuitExecutor::Executor`` class (see the sketch below).
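To make the restructured control flow concrete, here is a minimal, runnable Python sketch (not from the diff) of the factory-plus-polymorphic-executor pattern that the new ``Controller::make_circuit_executor``/``run_circuit`` pair follows later in this diff; every name below is an illustrative stand-in, not a real Aer class:

```python
from abc import ABC, abstractmethod

class CircuitExecutorBase(ABC):
    """Stand-in for CircuitExecutor::Base: the controller only sees this."""
    @abstractmethod
    def run_circuit(self, circuit, result):
        ...

class Executor(CircuitExecutorBase):
    """One executor per state type; it owns shot/chunk parallelization."""
    def __init__(self, state_cls):
        self.state_cls = state_cls

    def run_circuit(self, circuit, result):
        state = self.state_cls()  # a single, parallelization-free state
        result["ran_with"] = type(state).__name__

class StatevectorState:
    """Stand-in for a single Statevector::State with no parallel logic."""

def make_circuit_executor(method):
    # Mirrors the factory idea: map a method name to a concrete executor.
    executors = {"statevector": lambda: Executor(StatevectorState)}
    return executors[method]()

result = {}
make_circuit_executor("statevector").run_circuit(None, result)
print(result)  # {'ran_with': 'StatevectorState'}
```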
+  - |
+    A shot-branching technique that accelerates simulations of dynamic
+    circuits is implemented on top of the restructured ``Executor`` classes.
+    Shot-branching is currently applicable to the ``statevector``,
+    ``density_matrix`` and ``tensor_network`` methods.
+    Shot-branching provides dynamic distribution of multi-shot simulations
+    by branching states when applying dynamic operations
+    (measure, reset, initialize, noise).
+    By default, ``shot_branching_enable`` is disabled.
+    By also setting ``shot_branching_sampling_enable``, the final
+    measurements are done by sampling measure, which speeds up obtaining
+    counts for multiple shots sharing the same state.
+  - |
+    A new option ``target_gpus`` is added to select the GPUs used for the
+    simulation. A list of target GPU IDs is passed; for example,
+    ``target_gpus=[0, 2]`` selects two GPUs to be used (see the example
+    below). Without this option, all available GPUs are used.
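As a usage illustration only (assuming a GPU-enabled Aer build on a machine with at least three GPUs; the circuit itself is arbitrary):

```python
from qiskit import QuantumCircuit
from qiskit_aer import AerSimulator

# A simple GHZ-style circuit; any circuit works here.
circ = QuantumCircuit(25)
circ.h(0)
for i in range(24):
    circ.cx(i, i + 1)
circ.measure_all()

# Restrict the simulation to GPUs 0 and 2. Without target_gpus,
# chunks/shots are distributed across every available GPU.
sim = AerSimulator(method="statevector", device="GPU", target_gpus=[0, 2])
result = sim.run(circ, shots=100).result()
print(result.get_counts())
```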
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
old mode 100644
new mode 100755
index 2a45353dcd..7ea6c35553
--- a/src/controllers/aer_controller.hpp
+++ b/src/controllers/aer_controller.hpp
@@ -52,15 +52,15 @@
 #include "transpile/cacheblocking.hpp"
 #include "transpile/fusion.hpp"
 
-#include "simulators/density_matrix/densitymatrix_state.hpp"
-#include "simulators/extended_stabilizer/extended_stabilizer_state.hpp"
-#include "simulators/matrix_product_state/matrix_product_state.hpp"
-#include "simulators/stabilizer/stabilizer_state.hpp"
-#include "simulators/statevector/qubitvector.hpp"
-#include "simulators/statevector/statevector_state.hpp"
-#include "simulators/superoperator/superoperator_state.hpp"
-#include "simulators/tensor_network/tensor_net_state.hpp"
-#include "simulators/unitary/unitary_state.hpp"
+#include "simulators/simulators.hpp"
+
+#include "simulators/circuit_executor.hpp"
+#include "simulators/multi_state_executor.hpp"
+
+#include "simulators/density_matrix/densitymatrix_executor.hpp"
+#include "simulators/statevector/statevector_executor.hpp"
+#include "simulators/tensor_network/tensor_net_executor.hpp"
+#include "simulators/unitary/unitary_executor.hpp"
 
 namespace AER {
 
@@ -104,35 +104,6 @@ class Controller {
   // Simulation types
   //-----------------------------------------------------------------------
 
-  // Simulation methods for the Qasm Controller
-  enum class Method {
-    automatic,
-    statevector,
-    density_matrix,
-    matrix_product_state,
-    stabilizer,
-    extended_stabilizer,
-    unitary,
-    superop,
-    tensor_network
-  };
-
-  enum class Device { CPU, GPU, ThrustCPU };
-
-  // Simulation precision
-  enum class Precision { Double, Single };
-
-  const std::unordered_map<Method, std::string> method_names_ = {
-      {Method::automatic, "automatic"},
-      {Method::statevector, "statevector"},
-      {Method::density_matrix, "density_matrix"},
-      {Method::matrix_product_state, "matrix_product_state"},
-      {Method::stabilizer, "stabilizer"},
-      {Method::extended_stabilizer, "extended_stabilizer"},
-      {Method::unitary, "unitary"},
-      {Method::superop, "superop"},
-      {Method::tensor_network, "tensor_network"}};
-
   //-----------------------------------------------------------------------
   // Config
   //-----------------------------------------------------------------------
 
@@ -140,12 +111,6 @@ class Controller {
   // Timer type
   using myclock_t = std::chrono::high_resolution_clock;
 
-  // Validation threshold for validating states and operators
-  double validation_threshold_ = 1e-8;
-
-  // Save counts as memory list
-  bool save_creg_memory_ = false;
-
   // Simulation method
   Method method_ = Method::automatic;
 
   // Simulation device
   Device sim_device_ = Device::CPU;
 
   // Simulation precision
   Precision sim_precision_ = Precision::Double;
 
@@ -156,78 +121,6 @@ class Controller {
-  // Controller-level parameter for CH method
-  bool extended_stabilizer_measure_sampling_ = false;
-
-  //-----------------------------------------------------------------------
-  // Circuit Execution
-  //-----------------------------------------------------------------------
-
-  // Abstract method for executing a circuit.
-  // This method must initialize a state and return output data for
-  // the required number of shots.
-  void run_circuit(const Circuit &circ, const Noise::NoiseModel &noise,
-                   const Method method, const Config &config,
-                   ExperimentResult &result) const;
-
-  //----------------------------------------------------------------
-  // Run circuit helpers
-  //----------------------------------------------------------------
-
-  // Execute n-shots of a circuit on the input state
-  template <class State_t>
-  void run_circuit_helper(const Circuit &circ, const Noise::NoiseModel &noise,
-                          const Config &config, const Method method,
-                          ExperimentResult &result) const;
-
-  // Execute a single shot a of circuit by initializing the state vector,
-  // running all ops in circ, and updating data with
-  // simulation output.
-  template <class State_t>
-  void run_single_shot(const Circuit &circ, State_t &state,
-                       ExperimentResult &result, RngEngine &rng) const;
-
-  // Execute a single shot a of circuit by initializing the state vector,
-  // running all ops in circ, and updating data with
-  // simulation output.
-  template <class State_t>
-  void run_with_sampling(const Circuit &circ, State_t &state,
-                         ExperimentResult &result, RngEngine &rng,
-                         const uint_t block_bits, const uint_t shots) const;
-
-  // Execute multiple shots a of circuit by initializing the state vector,
-  // running all ops in circ, and updating data with
-  // simulation output. Will use measurement sampling if possible
-  template <class State_t>
-  void run_circuit_without_sampled_noise(Circuit &circ,
-                                         const Noise::NoiseModel &noise,
-                                         const Config &config,
-                                         const Method method,
-                                         ExperimentResult &result) const;
-
-  template <class State_t>
-  void run_circuit_with_sampled_noise(const Circuit &circ,
-                                      const Noise::NoiseModel &noise,
-                                      const Config &config, const Method method,
-                                      ExperimentResult &result) const;
-
-  //----------------------------------------------------------------
-  // Measurement
-  //----------------------------------------------------------------
-
-  // Sample measurement outcomes for the input measure ops from the
-  // current state of the input State_t
-  template <class InputIterator, class State_t>
-  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
-                       uint_t shots, State_t &state, ExperimentResult &result,
-                       RngEngine &rng, int_t shot_index = -1) const;
-
-  // Check if measure sampling optimization is valid for the input circuit
-  // for the given method. This checks if operation types before
-  // the first measurement in the circuit prevent sampling
-  bool check_measure_sampling_opt(const Circuit &circ,
-                                  const Method method) const;
-
   //-------------------------------------------------------------------------
   // State validation
   //-------------------------------------------------------------------------
 
@@ -242,53 +135,28 @@ class Controller {
                        const Noise::NoiseModel &noise,
                        bool throw_except = false) const;
 
-  template <class state_t>
-  bool validate_state(const state_t &state, const Circuit &circ,
-                      const Noise::NoiseModel &noise,
-                      bool throw_except = false) const;
-
-  // Return an estimate of the required memory for a circuit.
-  size_t required_memory_mb(const Circuit &circuit,
-                            const Noise::NoiseModel &noise,
-                            const Method method) const;
-
   //----------------------------------------------------------------
   // Utility functions
   //----------------------------------------------------------------
+  std::shared_ptr<CircuitExecutor::Base>
+  make_circuit_executor(const Method method) const;
 
   // Return a vector of simulation methods for each circuit.
   // If the default method is automatic this will be computed based on the
   // circuit and noise model.
   // The noise model will be modified to enable superop or kraus sampling
   // methods if required by the chosen methods.
-  std::vector<Controller::Method>
+  std::vector<Method>
   simulation_methods(std::vector<std::shared_ptr<Circuit>> &circuits,
                      Noise::NoiseModel &noise_model) const;
 
   // Return the simulation method to use based on the input circuit
   // and noise model
-  Controller::Method
+  Method
   automatic_simulation_method(const Circuit &circ,
                               const Noise::NoiseModel &noise_model) const;
 
-  // Return a fusion transpilation pass configured for the current
-  // method, circuit and config
-  Transpile::Fusion transpile_fusion(Method method,
-                                     const Operations::OpSet &opset,
-                                     const Config &config) const;
-
-  // Return cache blocking transpiler pass
-  Transpile::CacheBlocking
-  transpile_cache_blocking(Controller::Method method, const Circuit &circ,
-                           const Noise::NoiseModel &noise,
-                           const Config &config) const;
-
-  // return maximum number of qubits for matrix
-  int_t get_max_matrix_qubits(const Circuit &circ) const;
-  int_t get_matrix_bits(const Operations::Op &op) const;
-
-  bool has_statevector_ops(const Circuit &circuit) const;
-
   //-----------------------------------------------------------------------
   // Parallelization Config
   //-----------------------------------------------------------------------
 
@@ -301,82 +169,32 @@ class Controller {
       const std::vector<std::shared_ptr<Circuit>> &circuits,
       const Noise::NoiseModel &noise, const std::vector<Method> &methods);
 
-  // Set circuit parallelization
-  void set_parallelization_circuit(const Circuit &circ,
-                                   const Noise::NoiseModel &noise,
-                                   const Method method);
-
-  bool multiple_chunk_required(const Circuit &circuit,
-                               const Noise::NoiseModel &noise,
-                               const Method method) const;
-
-  bool multiple_shots_required(const Circuit &circuit,
-                               const Noise::NoiseModel &noise,
-                               const Method method) const;
-
   void save_exception_to_results(Result &result, const std::exception &e) const;
 
   // Get system memory size
   size_t get_system_memory_mb();
   size_t get_gpu_memory_mb();
 
-  size_t get_min_memory_mb() const {
-    if (sim_device_ == Device::GPU && num_gpus_ > 0) {
-      return max_gpu_memory_mb_ / num_gpus_; // return per GPU memory size
-    }
-    return max_memory_mb_;
-  }
-
   // The maximum number of threads to use for various levels of
   // parallelization
   int max_parallel_threads_;
 
   // Parameters for parallelization management in configuration
   int max_parallel_experiments_;
-  int max_parallel_shots_;
 
   size_t
max_memory_mb_; size_t max_gpu_memory_mb_; - int num_gpus_; // max number of GPU per process // use explicit parallelization bool explicit_parallelization_; // Parameters for parallelization management for experiments int parallel_experiments_; - int parallel_shots_; - int parallel_state_update_; bool parallel_nested_ = false; - // max number of states can be stored on memory for batched - // multi-shots/experiments optimization - int max_batched_states_; - - // max number of qubits in given circuits - int max_qubits_; - - // results are stored independently in each process if true - bool accept_distributed_results_ = true; - // process information (MPI) int myrank_ = 0; int num_processes_ = 1; int num_process_per_experiment_ = 1; - - uint_t cache_block_qubit_ = 0; - - // multi-chunks are required to simulate circuits - bool multi_chunk_required_ = false; - - // config setting for multi-shot parallelization - bool batched_shots_gpu_ = true; - int_t batched_shots_gpu_max_qubits_ = - 16; // multi-shot parallelization is applied if qubits is less than max - // qubits - bool enable_batch_multi_shots_ = - false; // multi-shot parallelization can be applied - - // settings for cuStateVec - bool cuStateVec_enable_ = false; }; //========================================================================= @@ -389,21 +207,12 @@ class Controller { void Controller::set_config(const Config &config) { - // Load validation threshold - validation_threshold_ = config.validation_threshold; - - // Load config for memory (creg list data) - if (config.memory.has_value()) - save_creg_memory_ = config.memory.value(); - #ifdef _OPENMP // Load OpenMP maximum thread settings if (config.max_parallel_threads.has_value()) max_parallel_threads_ = config.max_parallel_threads.value(); if (config.max_parallel_experiments.has_value()) max_parallel_experiments_ = config.max_parallel_experiments.value(); - if (config.max_parallel_shots.has_value()) - max_parallel_shots_ = config.max_parallel_shots.value(); // Limit max threads based on number of available OpenMP threads auto omp_threads = omp_get_max_threads(); max_parallel_threads_ = (max_parallel_threads_ > 0) @@ -412,7 +221,6 @@ void Controller::set_config(const Config &config) { #else // No OpenMP so we disable parallelization max_parallel_threads_ = 1; - max_parallel_shots_ = 1; max_parallel_experiments_ = 1; parallel_nested_ = false; #endif @@ -430,38 +238,18 @@ void Controller::set_config(const Config &config) { // for debugging if (config._parallel_shots.has_value()) { - parallel_shots_ = config._parallel_shots.value(); explicit_parallelization_ = true; } // for debugging if (config._parallel_state_update.has_value()) { - parallel_state_update_ = config._parallel_state_update.value(); explicit_parallelization_ = true; } if (explicit_parallelization_) { parallel_experiments_ = std::max({parallel_experiments_, 1}); - parallel_shots_ = std::max({parallel_shots_, 1}); - parallel_state_update_ = std::max({parallel_state_update_, 1}); } - if (config.accept_distributed_results.has_value()) - accept_distributed_results_ = config.accept_distributed_results.value(); - - // enable multiple qregs if cache blocking is enabled - if (config.blocking_qubits.has_value()) - cache_block_qubit_ = config.blocking_qubits.value(); - - // enable batched multi-shots/experiments optimization - batched_shots_gpu_ = config.batched_shots_gpu; - batched_shots_gpu_max_qubits_ = config.batched_shots_gpu_max_qubits; - - // cuStateVec configs - cuStateVec_enable_ = false; - if 
(config.cuStateVec_enable.has_value()) - cuStateVec_enable_ = config.cuStateVec_enable.value(); - // Override automatic simulation method with a fixed method std::string method = config.method; if (config.method == "statevector") { @@ -485,9 +273,6 @@ void Controller::set_config(const Config &config) { method + std::string(").")); } - if (method_ == Method::density_matrix || method_ == Method::unitary) - batched_shots_gpu_max_qubits_ /= 2; - // Override automatic simulation method with a fixed method sim_device_name_ = config.device; if (sim_device_name_ == "CPU") { @@ -506,10 +291,13 @@ void Controller::set_config(const Config &config) { #else #ifndef AER_CUSTATEVEC - if (cuStateVec_enable_) { - // Aer is not built for cuStateVec - throw std::runtime_error("Simulation device \"GPU\" does not support " - "cuStateVec on this system"); + // cuStateVec configs + if (config.cuStateVec_enable.has_value()) { + if (config.cuStateVec_enable.value()) { + // Aer is not built for cuStateVec + throw std::runtime_error("Simulation device \"GPU\" does not support " + "cuStateVec on this system"); + } } #endif int nDev; @@ -546,7 +334,6 @@ void Controller::set_config(const Config &config) { void Controller::clear_config() { clear_parallelization(); - validation_threshold_ = 1e-8; method_ = Method::automatic; sim_device_ = Device::CPU; sim_precision_ = Precision::Double; @@ -555,18 +342,12 @@ void Controller::clear_config() { void Controller::clear_parallelization() { max_parallel_threads_ = 0; max_parallel_experiments_ = 1; - max_parallel_shots_ = 0; - max_batched_states_ = 1; parallel_experiments_ = 1; - parallel_shots_ = 1; - parallel_state_update_ = 1; parallel_nested_ = false; num_process_per_experiment_ = 1; - num_gpus_ = 0; - explicit_parallelization_ = false; max_memory_mb_ = get_system_memory_mb(); max_gpu_memory_mb_ = get_gpu_memory_mb(); @@ -575,35 +356,6 @@ void Controller::clear_parallelization() { void Controller::set_parallelization_experiments( const std::vector> &circuits, const Noise::NoiseModel &noise, const std::vector &methods) { - std::vector required_memory_mb_list(circuits.size()); - max_qubits_ = 0; - for (size_t j = 0; j < circuits.size(); j++) { - if (circuits[j]->num_qubits > max_qubits_) - max_qubits_ = circuits[j]->num_qubits; - required_memory_mb_list[j] = - required_memory_mb(*circuits[j], noise, methods[j]); - } - std::sort(required_memory_mb_list.begin(), required_memory_mb_list.end(), - std::greater<>()); - - // set max batchable number of circuits - if (batched_shots_gpu_) { - if (required_memory_mb_list[0] == 0 || max_qubits_ == 0) - max_batched_states_ = 1; - else { - if (sim_device_ == Device::GPU) { - max_batched_states_ = ((max_gpu_memory_mb_ / num_gpus_ * 8 / 10) / - required_memory_mb_list[0]) * - num_gpus_; - } else { - max_batched_states_ = - (max_memory_mb_ * 8 / 10) / required_memory_mb_list[0]; - } - } - } - if (max_qubits_ == 0) - max_qubits_ = 1; - if (explicit_parallelization_) return; @@ -626,6 +378,17 @@ void Controller::set_parallelization_experiments( } // If memory allows, execute experiments in parallel + std::vector required_memory_mb_list(circuits.size()); + for (size_t j = 0; j < circuits.size(); j++) { + std::shared_ptr executor = + make_circuit_executor(methods[j]); + required_memory_mb_list[j] = + executor->required_memory_mb(*circuits[j], noise); + executor.reset(); + } + std::sort(required_memory_mb_list.begin(), required_memory_mb_list.end(), + std::greater<>()); + size_t total_memory = 0; int parallel_experiments = 0; for (size_t 
required_memory_mb : required_memory_mb_list) { @@ -643,139 +406,6 @@ void Controller::set_parallelization_experiments( max_parallel_threads_, static_cast(circuits.size())}); } -void Controller::set_parallelization_circuit(const Circuit &circ, - const Noise::NoiseModel &noise, - const Method method) { - enable_batch_multi_shots_ = false; - if (batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && - max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits) { - enable_batch_multi_shots_ = true; - } - - if (sim_device_ == Device::GPU && cuStateVec_enable_) { - enable_batch_multi_shots_ = - false; // cuStateVec does not support batch execution of multi-shots - return; - } - - if (explicit_parallelization_) - return; - - // Check for trivial parallelization conditions - switch (method) { - case Method::statevector: - case Method::stabilizer: - case Method::unitary: - case Method::matrix_product_state: { - if (circ.shots == 1 || num_process_per_experiment_ > 1 || - (!noise.has_quantum_errors() && - check_measure_sampling_opt(circ, method))) { - parallel_shots_ = 1; - parallel_state_update_ = - std::max({1, max_parallel_threads_ / parallel_experiments_}); - return; - } - break; - } - case Method::density_matrix: - case Method::superop: - case Method::tensor_network: { - if (circ.shots == 1 || num_process_per_experiment_ > 1 || - check_measure_sampling_opt(circ, method)) { - parallel_shots_ = 1; - parallel_state_update_ = - std::max({1, max_parallel_threads_ / parallel_experiments_}); - return; - } - break; - } - case Method::extended_stabilizer: - break; - default: - throw std::invalid_argument( - "Cannot set parallelization for unresolved method."); - } - - // Use a local variable to not override stored maximum based - // on currently executed circuits - const auto max_shots = - (max_parallel_shots_ > 0) - ? std::min({max_parallel_shots_, max_parallel_threads_}) - : max_parallel_threads_; - - // If we are executing circuits in parallel we disable - // parallel shots - if (max_shots == 1 || parallel_experiments_ > 1) { - parallel_shots_ = 1; - } else { - // Parallel shots is > 1 - // Limit parallel shots by available memory and number of shots - // And assign the remaining threads to state update - int circ_memory_mb = - required_memory_mb(circ, noise, method) / num_process_per_experiment_; - size_t mem_size = - (sim_device_ == Device::GPU) ? max_gpu_memory_mb_ : max_memory_mb_; - if (mem_size < circ_memory_mb) - throw std::runtime_error( - "a circuit requires more memory than max_memory_mb."); - // If circ memory is 0, set it to 1 so that we don't divide by zero - circ_memory_mb = std::max({1, circ_memory_mb}); - - int shots = circ.shots; - parallel_shots_ = std::min( - {static_cast(mem_size / (circ_memory_mb * 2)), max_shots, shots}); - } - parallel_state_update_ = - (parallel_shots_ > 1) - ? 
std::max({1, max_parallel_threads_ / parallel_shots_}) - : std::max({1, max_parallel_threads_ / parallel_experiments_}); -} - -bool Controller::multiple_chunk_required(const Circuit &circ, - const Noise::NoiseModel &noise, - const Method method) const { - if (circ.num_qubits < 3) - return false; - if (cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits) - return true; - - if (num_process_per_experiment_ == 1 && sim_device_ == Device::GPU && - num_gpus_ > 0) { - return (max_gpu_memory_mb_ / num_gpus_ < - required_memory_mb(circ, noise, method)); - } - if (num_process_per_experiment_ > 1) { - size_t total_mem = max_memory_mb_; - if (sim_device_ == Device::GPU) - total_mem += max_gpu_memory_mb_; - if (total_mem * num_process_per_experiment_ > - required_memory_mb(circ, noise, method)) - return true; - } - - return false; -} - -bool Controller::multiple_shots_required(const Circuit &circ, - const Noise::NoiseModel &noise, - const Method method) const { - if (circ.shots < 2) - return false; - if (method == Method::density_matrix || method == Method::superop || - method == Method::unitary) { - return false; - } - - bool can_sample = check_measure_sampling_opt(circ, method); - - if (noise.is_ideal()) { - return !can_sample; - } - - return true; -} - size_t Controller::get_system_memory_mb() { size_t total_physical_memory = Utils::get_system_memory_mb(); #ifdef AER_MPI @@ -803,7 +433,6 @@ size_t Controller::get_gpu_memory_mb() { cudaMemGetInfo(&freeMem, &totalMem); total_physical_memory += totalMem; } - num_gpus_ = nDev; #endif #ifdef AER_MPI @@ -812,41 +441,11 @@ size_t Controller::get_gpu_memory_mb() { locMem = total_physical_memory; MPI_Allreduce(&locMem, &minMem, 1, MPI_UINT64_T, MPI_MIN, MPI_COMM_WORLD); total_physical_memory = minMem; - - int t = num_gpus_; - MPI_Allreduce(&t, &num_gpus_, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); #endif return total_physical_memory >> 20; } -Transpile::CacheBlocking Controller::transpile_cache_blocking( - Controller::Method method, const Circuit &circ, - const Noise::NoiseModel &noise, const Config &config) const { - Transpile::CacheBlocking cache_block_pass; - - const bool is_matrix = - (method == Method::density_matrix || method == Method::unitary); - const auto complex_size = (sim_precision_ == Precision::Single) - ? 
sizeof(std::complex) - : sizeof(std::complex); - - cache_block_pass.set_num_processes(num_process_per_experiment_); - cache_block_pass.set_config(config); - - if (!cache_block_pass.enabled()) { - // if blocking is not set by config, automatically set if required - if (multiple_chunk_required(circ, noise, method)) { - int nplace = num_process_per_experiment_; - if (sim_device_ == Device::GPU && num_gpus_ > 0) - nplace *= num_gpus_; - cache_block_pass.set_blocking(circ.num_qubits, get_min_memory_mb() << 20, - nplace, complex_size, is_matrix); - } - } - return cache_block_pass; -} - //------------------------------------------------------------------------- // Qobj execution //------------------------------------------------------------------------- @@ -916,18 +515,7 @@ Result Controller::execute(std::vector> &circuits, // Execute each circuit in a try block try { - // check if multi-chunk distribution is required - bool multi_chunk_required_ = false; - for (size_t j = 0; j < circuits.size(); j++) { - if (circuits[j]->num_qubits > 0) { - if (multiple_chunk_required(*circuits[j], noise_model, methods[j])) - multi_chunk_required_ = true; - } - } - if (multi_chunk_required_) - num_process_per_experiment_ = num_processes_; - else - num_process_per_experiment_ = 1; + num_process_per_experiment_ = num_processes_; // set parallelization for experiments try { @@ -938,23 +526,13 @@ Result Controller::execute(std::vector> &circuits, save_exception_to_results(result, e); } -#ifdef _OPENMP - result.metadata.add(true, "omp_enabled"); -#else - result.metadata.add(false, "omp_enabled"); -#endif result.metadata.add(parallel_experiments_, "parallel_experiments"); result.metadata.add(max_memory_mb_, "max_memory_mb"); result.metadata.add(max_gpu_memory_mb_, "max_gpu_memory_mb"); - // store rank and number of processes, if no distribution rank=0 procs=1 is - // set - result.metadata.add(num_process_per_experiment_, - "num_processes_per_experiments"); - result.metadata.add(num_processes_, "num_mpi_processes"); - result.metadata.add(myrank_, "mpi_rank"); - #ifdef _OPENMP + result.metadata.add(true, "omp_enabled"); + // Check if circuit parallelism is nested with one of the others if (parallel_experiments_ > 1 && parallel_experiments_ < max_parallel_threads_) { @@ -972,9 +550,18 @@ Result Controller::execute(std::vector> &circuits, } else { parallel_nested_ = false; } +#else + result.metadata.add(false, "omp_enabled"); #endif #ifdef AER_MPI + // store rank and number of processes, if no distribution rank=0 procs=1 is + // set + result.metadata.add(num_process_per_experiment_, + "num_processes_per_experiments"); + result.metadata.add(num_processes_, "num_mpi_processes"); + result.metadata.add(myrank_, "mpi_rank"); + // average random seed to set the same seed to each process (when // seed_simulator is not set) if (num_processes_ > 1) { @@ -995,16 +582,20 @@ Result Controller::execute(std::vector> &circuits, // in #pragma omp) if (parallel_experiments_ == 1) { for (int j = 0; j < NUM_RESULTS; ++j) { - set_parallelization_circuit(*circuits[j], noise_model, methods[j]); - run_circuit(*circuits[j], noise_model, methods[j], config, - result.results[j]); + std::shared_ptr executor = + make_circuit_executor(methods[j]); + executor->run_circuit(*circuits[j], noise_model, config, methods[j], + sim_device_, result.results[j]); + executor.reset(); } } else { #pragma omp parallel for num_threads(parallel_experiments_) for (int j = 0; j < NUM_RESULTS; ++j) { - set_parallelization_circuit(*circuits[j], noise_model, methods[j]); - 
run_circuit(*circuits[j], noise_model, methods[j], config,
-                    result.results[j]);
+        std::shared_ptr<CircuitExecutor::Base> executor =
+            make_circuit_executor(methods[j]);
+        executor->run_circuit(*circuits[j], noise_model, config, methods[j],
+                              sim_device_, result.results[j]);
+        executor.reset();
       }
     }
 
@@ -1042,813 +633,128 @@
 }
 
 //-------------------------------------------------------------------------
-// Base class override
+// Utility methods
 //-------------------------------------------------------------------------
 
-void Controller::run_circuit(const Circuit &circ,
-                             const Noise::NoiseModel &noise,
-                             const Method method, const Config &config,
-                             ExperimentResult &result) const {
-  // Run the circuit
-  switch (method) {
-  case Method::statevector: {
-    if (sim_device_ == Device::CPU) {
-      // Chunk based simualtion
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVector<double>>>(
-            circ, noise, config, Method::statevector, result);
-      } else {
-        // Single-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVector<float>>>(
-            circ, noise, config, Method::statevector, result);
-      }
-    } else {
-#ifdef AER_THRUST_SUPPORTED
-      // Chunk based simulation
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVectorThrust<double>>>(
-            circ, noise, config, Method::statevector, result);
-      } else {
-        // Single-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVectorThrust<float>>>(
-            circ, noise, config, Method::statevector, result);
-      }
-#endif
-    }
-  }
-  case Method::density_matrix: {
-    if (sim_device_ == Device::CPU) {
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrix<double>>>(
-            circ, noise, config, Method::density_matrix, result);
-      } else {
-        // Single-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrix<float>>>(
-            circ, noise, config, Method::density_matrix, result);
-      }
-    } else {
-#ifdef AER_THRUST_SUPPORTED
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrixThrust<double>>>(
-            circ, noise, config, Method::density_matrix, result);
-      } else {
-        // Single-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrixThrust<float>>>(
-            circ, noise, config, Method::density_matrix, result);
-      }
-#endif
-    }
-  }
-  case Method::unitary: {
-    if (sim_device_ == Device::CPU) {
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrix<double>>>(
-            circ, noise, config, Method::unitary, result);
-      } else {
-        // Single-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrix<float>>>(
-            circ, noise, config, Method::unitary, result);
-      }
-    } else {
-#ifdef AER_THRUST_SUPPORTED
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrixThrust<double>>>(
-            circ, noise, config, Method::unitary, result);
-      } else {
-        // Single-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrixThrust<float>>>(
-            circ, noise, config, Method::unitary, result);
-      }
-#endif
-    }
-  }
-  case Method::superop: {
-    if (sim_precision_ == Precision::Double) {
-      return run_circuit_helper<
-          QubitSuperoperator::State<QV::Superoperator<double>>>(
-          circ, noise, config, Method::superop, result);
-    } else {
-      return run_circuit_helper<
-          QubitSuperoperator::State<QV::Superoperator<float>>>(
-          circ, noise, config, Method::superop, result);
-    }
-  }
-  case Method::stabilizer:
-    // Stabilizer simulation
-    // TODO: Stabilizer doesn't yet support custom state initialization
-    return run_circuit_helper<Stabilizer::State>(circ, noise, config,
-                                                 Method::stabilizer, result);
-  case Method::extended_stabilizer:
-    return run_circuit_helper<ExtendedStabilizer::State>(
-        circ, noise, config, Method::extended_stabilizer, result);
-  case Method::matrix_product_state:
-    return run_circuit_helper<MatrixProductState::State>(
-        circ, noise, config, Method::matrix_product_state, result);
-  case Method::tensor_network: {
-    if (sim_precision_ == Precision::Double) {
-      return run_circuit_helper<
-          TensorNetwork::State<TensorNetwork::TensorNet<double>>>(
-          circ, noise, config, Method::tensor_network, result);
-    } else {
-      return run_circuit_helper<
-          TensorNetwork::State<TensorNetwork::TensorNet<float>>>(
-          circ, noise, config, Method::tensor_network, result);
-    }
-  }
-  default:
-    throw std::runtime_error("Controller:Invalid simulation method");
-  }
-}
+std::shared_ptr<CircuitExecutor::Base>
+Controller::make_circuit_executor(const Method method) const {
+  // Run the circuit
+  switch (method) {
+  case Method::statevector:
+    if (sim_device_ == Device::CPU) {
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVector<double>>>>();
+      } else {
+        // Single-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVector<float>>>>();
+      }
+    } else {
+#ifdef AER_THRUST_SUPPORTED
+      // Chunk based simulation
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVectorThrust<double>>>>();
+      } else {
+        // Single-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVectorThrust<float>>>>();
+      }
+#endif
+    }
+    break;
+  case Method::density_matrix:
+    if (sim_device_ == Device::CPU) {
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrix<double>>>>();
+      } else {
+        // Single-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrix<float>>>>();
+      }
+    } else {
+#ifdef AER_THRUST_SUPPORTED
+      // Chunk based simulation
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrixThrust<double>>>>();
+      } else {
+        // Single-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrixThrust<float>>>>();
+      }
+#endif
+    }
+    break;
+  case Method::unitary:
+    if (sim_device_ == Device::CPU) {
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrix<double>>>>();
+      } else {
+        // Single-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrix<float>>>>();
+      }
+    } else {
+#ifdef AER_THRUST_SUPPORTED
+      // Chunk based simulation
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrixThrust<double>>>>();
+      } else {
+        // Single-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrixThrust<float>>>>();
+      }
+#endif
+    }
+    break;
+  case Method::superop:
+    if (sim_precision_ == Precision::Double) {
+      return std::make_shared<CircuitExecutor::Executor<
+          QubitSuperoperator::State<QV::Superoperator<double>>>>();
+    } else {
+      return std::make_shared<CircuitExecutor::Executor<
+          QubitSuperoperator::State<QV::Superoperator<float>>>>();
+    }
+    break;
+  case Method::stabilizer: {
+    return std::make_shared<CircuitExecutor::Executor<Stabilizer::State>>();
+  } break;
+  case Method::extended_stabilizer: {
+    return std::make_shared<
+        CircuitExecutor::Executor<ExtendedStabilizer::State>>();
+  } break;
+  case Method::matrix_product_state: {
+    return std::make_shared<
+        CircuitExecutor::Executor<MatrixProductState::State>>();
+  } break;
+  case Method::tensor_network: {
+    if (sim_precision_ == Precision::Double) {
+      return std::make_shared<TensorNetwork::Executor<
+          TensorNetwork::State<TensorNetwork::TensorNet<double>>>>();
+    } else {
+      return std::make_shared<TensorNetwork::Executor<
+          TensorNetwork::State<TensorNetwork::TensorNet<float>>>>();
+    }
+  } break;
+  case Method::automatic:
+    throw std::runtime_error(
+        "Cannot make circuit executor for automatic simulation method.");
+  default:
+    throw std::runtime_error("Controller:Invalid simulation method");
+  }
+}
-
-//-------------------------------------------------------------------------
-// Utility methods
-//-------------------------------------------------------------------------
-
-size_t Controller::required_memory_mb(const Circuit &circ,
-                                      const Noise::NoiseModel &noise,
-                                      const Method method) const {
-  switch (method) {
-  case Method::statevector: {
-    if (sim_precision_ == Precision::Single) {
-      Statevector::State<QV::QubitVector<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      Statevector::State<QV::QubitVector<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::density_matrix: {
-    if (sim_precision_ == Precision::Single) {
-      DensityMatrix::State<QV::DensityMatrix<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      DensityMatrix::State<QV::DensityMatrix<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::unitary: {
-    if (sim_precision_ == Precision::Single) {
-      QubitUnitary::State<QV::UnitaryMatrix<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      QubitUnitary::State<QV::UnitaryMatrix<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::superop: {
-    if (sim_precision_ == Precision::Single) {
-      QubitSuperoperator::State<QV::Superoperator<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      QubitSuperoperator::State<QV::Superoperator<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::stabilizer: {
-    Stabilizer::State state;
-    return state.required_memory_mb(circ.num_qubits, circ.ops);
-  }
-  case Method::extended_stabilizer: {
-    ExtendedStabilizer::State state;
-    return state.required_memory_mb(circ.num_qubits, circ.ops);
-  }
-  case Method::matrix_product_state: {
-    MatrixProductState::State state;
-    return state.required_memory_mb(circ.num_qubits, circ.ops);
-  }
-  case Method::tensor_network: {
-    if (sim_precision_ == Precision::Single) {
-      TensorNetwork::State<TensorNetwork::TensorNet<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      TensorNetwork::State<TensorNetwork::TensorNet<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  default:
-    // We shouldn't get here, so throw an exception if we do
-    throw std::runtime_error("Controller: Invalid simulation method");
-  }
-}
-
-Transpile::Fusion Controller::transpile_fusion(Method method,
-                                               const Operations::OpSet &opset,
-                                               const Config &config) const {
-  Transpile::Fusion fusion_pass;
-  fusion_pass.set_parallelization(parallel_state_update_);
-
-  if (opset.contains(Operations::OpType::superop)) {
-    fusion_pass.allow_superop = true;
-  }
-  if (opset.contains(Operations::OpType::kraus)) {
-    fusion_pass.allow_kraus = true;
-  }
-  switch (method) {
-  case Method::density_matrix:
-  case Method::superop: {
-    // Halve the default threshold and max fused qubits for density matrix
-    fusion_pass.threshold /= 2;
-    fusion_pass.max_qubit /= 2;
-    break;
-  }
-  case Method::matrix_product_state: {
-    fusion_pass.active = false;
-    return fusion_pass; // Do not allow the config to set active for MPS
-  }
-  case Method::statevector: {
-    if (fusion_pass.allow_kraus) {
-      // Halve default max fused qubits for Kraus noise fusion
-      fusion_pass.max_qubit /= 2;
-    }
-    break;
-  }
-  case Method::unitary: {
-    // max_qubit is the same with statevector
-    fusion_pass.threshold /= 2;
-    break;
-  }
-  case Method::tensor_network: {
-    if (opset.contains(Operations::OpType::save_statevec) ||
-        opset.contains(Operations::OpType::save_statevec_dict)) {
-      if (fusion_pass.allow_kraus) {
-        // Halve default max fused qubits for Kraus noise fusion
-        fusion_pass.max_qubit /= 2;
-      }
-    } else {
-      // Halve the default threshold and max fused qubits for density matrix
-      fusion_pass.threshold /= 2;
-      fusion_pass.max_qubit /= 2;
-    }
-    break;
-  }
-  default: {
-    fusion_pass.active = false;
-    return fusion_pass;
-  }
-  }
-  // Override default fusion settings with custom config
-  fusion_pass.set_config(config);
-  return fusion_pass;
-}
-
-//-------------------------------------------------------------------------
-// Run circuit helpers
-//-------------------------------------------------------------------------
-
-template <class State_t>
-void Controller::run_circuit_helper(const Circuit &circ,
-                                    const Noise::NoiseModel &noise,
-                                    const Config &config, const Method method,
-                                    ExperimentResult &result) const {
-  // Start individual circuit timer
-  auto timer_start = myclock_t::now(); // state circuit timer
-
-  // Execute in try block so we can catch errors and return the error message
-  // for individual circuit failures.
- try { - // Rng engine (this one is used to add noise on circuit) - RngEngine rng; - rng.set_seed(circ.seed); - - // Output data container - result.set_config(config); - result.metadata.add(method_names_.at(method), "method"); - if (method == Method::statevector || method == Method::density_matrix || - method == Method::unitary || method == Method::tensor_network) { - result.metadata.add(sim_device_name_, "device"); - } else { - result.metadata.add("CPU", "device"); - } - - // Circuit qubit metadata - result.metadata.add(circ.num_qubits, "num_qubits"); - result.metadata.add(circ.num_memory, "num_clbits"); - result.metadata.add(circ.qubits(), "active_input_qubits"); - result.metadata.add(circ.qubit_map(), "input_qubit_map"); - result.metadata.add(circ.remapped_qubits, "remapped_qubits"); - - // Add measure sampling to metadata - // Note: this will set to `true` if sampling is enabled for the circuit - result.metadata.add(false, "measure_sampling"); - result.metadata.add(false, "batched_shots_optimization"); - - if (circ.num_qubits > 0) { // do nothing for query steps - // Choose execution method based on noise and method - Circuit opt_circ; - bool noise_sampling = false; - - // Ideal circuit - if (noise.is_ideal()) { - opt_circ = circ; - result.metadata.add("ideal", "noise"); - } - // Readout error only - else if (noise.has_quantum_errors() == false) { - opt_circ = noise.sample_noise(circ, rng); - result.metadata.add("readout", "noise"); - } - // Superop noise sampling - else if (method == Method::density_matrix || method == Method::superop || - (method == Method::tensor_network && - !has_statevector_ops(circ))) { - // Sample noise using SuperOp method - opt_circ = - noise.sample_noise(circ, rng, Noise::NoiseModel::Method::superop); - result.metadata.add("superop", "noise"); - } - // Kraus noise sampling - else if (noise.opset().contains(Operations::OpType::kraus) || - noise.opset().contains(Operations::OpType::superop)) { - opt_circ = - noise.sample_noise(circ, rng, Noise::NoiseModel::Method::kraus); - result.metadata.add("kraus", "noise"); - } - // General circuit noise sampling - else { - if (enable_batch_multi_shots_ && !multi_chunk_required_) { - // batched optimization samples noise at runtime - opt_circ = noise.sample_noise( - circ, rng, Noise::NoiseModel::Method::circuit, true); - } else { - noise_sampling = true; - } - result.metadata.add("circuit", "noise"); - } - - if (noise_sampling) { - run_circuit_with_sampled_noise(circ, noise, config, method, - result); - } else { - // Run multishot simulation without noise sampling - run_circuit_without_sampled_noise(opt_circ, noise, config, - method, result); - } - } - - // Report success - result.status = ExperimentResult::Status::completed; - - // Pass through circuit header and add metadata - result.header = circ.header; - result.shots = circ.shots; - result.seed = circ.seed; - result.metadata.add(parallel_shots_, "parallel_shots"); - result.metadata.add(parallel_state_update_, "parallel_state_update"); - if (parallel_shots_ > 1 && parallel_state_update_ > 1) - result.metadata.add(true, "omp_nested"); - else - result.metadata.add(false, "omp_nested"); - - // Add timer data - auto timer_stop = myclock_t::now(); // stop timer - double time_taken = - std::chrono::duration(timer_stop - timer_start).count(); - result.time_taken = time_taken; - } - // If an exception occurs during execution, catch it and pass it to the output - catch (std::exception &e) { - result.status = ExperimentResult::Status::error; - result.message = e.what(); - } -} 
- -template -void Controller::run_single_shot(const Circuit &circ, State_t &state, - ExperimentResult &result, - RngEngine &rng) const { - state.initialize_qreg(circ.num_qubits); - state.initialize_creg(circ.num_memory, circ.num_registers); - state.apply_ops(circ.ops.cbegin(), circ.ops.cend(), result, rng, true); - result.save_count_data(state.cregs(), save_creg_memory_); -} - -template -void Controller::run_with_sampling(const Circuit &circ, State_t &state, - ExperimentResult &result, RngEngine &rng, - const uint_t block_bits, - const uint_t shots) const { - auto &ops = circ.ops; - auto first_meas = circ.first_measure_pos; // Position of first measurement op - bool final_ops = (first_meas == ops.size()); - - // allocate qubit register - state.allocate(circ.num_qubits, block_bits); - - // Run circuit instructions before first measure - state.initialize_qreg(circ.num_qubits); - state.initialize_creg(circ.num_memory, circ.num_registers); - - state.apply_ops(ops.cbegin(), ops.cbegin() + first_meas, result, rng, - final_ops); - - // Get measurement operations and set of measured qubits - measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), shots, state, - result, rng); -} - -template -void Controller::run_circuit_without_sampled_noise( - Circuit &circ, const Noise::NoiseModel &noise, const Config &config, - const Method method, ExperimentResult &result) const { - State_t state; - - // Validate gateset and memory requirements, raise exception if they're - // exceeded - validate_state(state, circ, noise, true); - - // Set state config - state.set_config(config); - state.set_parallelization(parallel_state_update_); - state.set_global_phase(circ.global_phase_angle); - state.enable_density_matrix(!has_statevector_ops(circ)); - - bool can_sample = circ.can_sample; - - // Optimize circuit - Noise::NoiseModel dummy_noise; - - auto fusion_pass = transpile_fusion(method, circ.opset(), config); - fusion_pass.optimize_circuit(circ, dummy_noise, state.opset(), result); - - // Cache blocking pass - uint_t block_bits = circ.num_qubits; - if (state.multi_chunk_distribution_supported()) { - auto cache_block_pass = - transpile_cache_blocking(method, circ, dummy_noise, config); - cache_block_pass.set_sample_measure(can_sample); - cache_block_pass.optimize_circuit(circ, dummy_noise, state.opset(), result); - if (cache_block_pass.enabled()) { - block_bits = cache_block_pass.block_bits(); - } - } - // Check if measure sampling supported - can_sample &= check_measure_sampling_opt(circ, method); - auto max_bits = get_max_matrix_qubits(circ); - - // Check if measure sampler and optimization are valid - if (can_sample) { - // Implement measure sampler - if (parallel_shots_ <= 1) { - state.set_distribution(num_process_per_experiment_); - state.set_max_matrix_qubits(max_bits); - RngEngine rng; - rng.set_seed(circ.seed); - run_with_sampling(circ, state, result, rng, block_bits, circ.shots); - } else { - // Vector to store parallel thread output data - std::vector par_results(parallel_shots_); - -#pragma omp parallel for num_threads(parallel_shots_) - for (int i = 0; i < parallel_shots_; i++) { - uint_t i_shot = circ.shots * i / parallel_shots_; - uint_t shot_end = circ.shots * (i + 1) / parallel_shots_; - uint_t this_shot = shot_end - i_shot; - - State_t shot_state; - // Set state config - shot_state.set_config(config); - shot_state.set_parallelization(parallel_state_update_); - shot_state.set_global_phase(circ.global_phase_angle); - shot_state.enable_density_matrix(!has_statevector_ops(circ)); - - 
shot_state.set_max_matrix_qubits(max_bits); - - RngEngine rng; - rng.set_seed(circ.seed + i); - - run_with_sampling(circ, shot_state, par_results[i], rng, block_bits, - this_shot); - - shot_state.add_metadata(par_results[i]); - } - for (auto &res : par_results) { - result.combine(std::move(res)); - } - - if (sim_device_name_ == "GPU") { - if (parallel_shots_ >= num_gpus_) - result.metadata.add(num_gpus_, "gpu_parallel_shots_"); - else - result.metadata.add(parallel_shots_, "gpu_parallel_shots_"); - } - } - // Add measure sampling metadata - result.metadata.add(true, "measure_sampling"); - - } else { - // Perform standard execution if we cannot apply the - // measurement sampling optimization - - if (block_bits == circ.num_qubits && enable_batch_multi_shots_ && - state.multi_shot_parallelization_supported()) { - // apply batched multi-shots optimization (currenly only on GPU) - state.set_max_bached_shots(max_batched_states_); - state.set_distribution(num_processes_); - state.set_max_matrix_qubits(max_bits); - state.set_num_creg_bits(circ.num_memory, circ.num_registers); - state.allocate(circ.num_qubits, circ.num_qubits, - circ.shots); // allocate multiple-shots - - // qreg is initialized inside state class - state.initialize_creg(circ.num_memory, circ.num_registers); - - state.apply_ops_multi_shots(circ.ops.cbegin(), circ.ops.cend(), noise, - result, circ.seed, true); - - result.save_count_data(state.cregs(), save_creg_memory_); - - // Add batched multi-shots optimizaiton metadata - result.metadata.add(true, "batched_shots_optimization"); - } else { - std::vector par_results(parallel_shots_); - int_t par_shots = parallel_shots_; - if (block_bits != circ.num_qubits) - par_shots = 1; - - auto run_circuit_without_sampled_noise_lambda = - [this, &par_results, circ, noise, config, method, block_bits, - max_bits, par_shots](int_t i) { - uint_t i_shot, shot_end; - i_shot = circ.shots * i / par_shots; - shot_end = circ.shots * (i + 1) / par_shots; - - State_t par_state; - // Set state config - par_state.set_config(config); - par_state.set_parallelization(parallel_state_update_); - par_state.set_global_phase(circ.global_phase_angle); - par_state.enable_density_matrix(!has_statevector_ops(circ)); - - par_state.set_distribution(num_process_per_experiment_); - par_state.set_max_matrix_qubits(max_bits); - - // allocate qubit register - par_state.allocate(circ.num_qubits, block_bits); - - for (; i_shot < shot_end; i_shot++) { - RngEngine rng; - rng.set_seed(circ.seed + i_shot); - run_single_shot(circ, par_state, par_results[i], rng); - } - par_state.add_metadata(par_results[i]); - }; - Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, - run_circuit_without_sampled_noise_lambda); - - for (auto &res : par_results) { - result.combine(std::move(res)); - } - if (sim_device_name_ == "GPU") { - if (par_shots >= num_gpus_) - result.metadata.add(num_gpus_, "gpu_parallel_shots_"); - else - result.metadata.add(par_shots, "gpu_parallel_shots_"); - } - } - } - state.add_metadata(result); -} - -template -void Controller::run_circuit_with_sampled_noise( - const Circuit &circ, const Noise::NoiseModel &noise, const Config &config, - const Method method, ExperimentResult &result) const { - std::vector par_results(parallel_shots_); - - auto run_circuit_with_sampled_noise_lambda = [this, &par_results, circ, noise, - config, method](int_t i) { - State_t state; - uint_t i_shot, shot_end; - Noise::NoiseModel dummy_noise; - - // Validate gateset and memory requirements, raise exception if they're - // exceeded - 
validate_state(state, circ, noise, true); - - // Set state config - state.set_config(config); - state.set_parallelization(parallel_state_update_); - state.set_global_phase(circ.global_phase_angle); - state.enable_density_matrix(!has_statevector_ops(circ)); - - // Transpilation for circuit noise method - auto fusion_pass = transpile_fusion(method, circ.opset(), config); - auto cache_block_pass = - transpile_cache_blocking(method, circ, noise, config); - - i_shot = circ.shots * i / parallel_shots_; - shot_end = circ.shots * (i + 1) / parallel_shots_; - - for (; i_shot < shot_end; i_shot++) { - RngEngine rng; - rng.set_seed(circ.seed + i_shot); - - // Sample noise using circuit method - Circuit noise_circ = noise.sample_noise(circ, rng); - - noise_circ.shots = 1; - fusion_pass.optimize_circuit(noise_circ, dummy_noise, state.opset(), - par_results[i]); - uint_t block_bits = circ.num_qubits; - if (state.multi_chunk_distribution_supported()) { - cache_block_pass.optimize_circuit(noise_circ, dummy_noise, - state.opset(), par_results[i]); - if (cache_block_pass.enabled()) { - block_bits = cache_block_pass.block_bits(); - } - } - - state.set_distribution(num_process_per_experiment_); - state.set_max_matrix_qubits(get_max_matrix_qubits(noise_circ)); - // allocate qubit register - state.allocate(noise_circ.num_qubits, block_bits); - - run_single_shot(noise_circ, state, par_results[i], rng); - } - state.add_metadata(par_results[i]); - }; - Utils::apply_omp_parallel_for((parallel_shots_ > 1), 0, parallel_shots_, - run_circuit_with_sampled_noise_lambda); - - for (auto &res : par_results) { - result.combine(std::move(res)); - } - - if (sim_device_name_ == "GPU") { - if (parallel_shots_ >= num_gpus_) - result.metadata.add(num_gpus_, "gpu_parallel_shots_"); - else - result.metadata.add(parallel_shots_, "gpu_parallel_shots_"); - } -} - -//------------------------------------------------------------------------- -// Measure sampling optimization -//------------------------------------------------------------------------- - -bool Controller::check_measure_sampling_opt(const Circuit &circ, - const Method method) const { - // Check if circuit has sampling flag disabled - if (circ.can_sample == false) { - return false; - } - - // If density matrix, unitary, superop method all supported instructions - // allow sampling - if (method == Method::density_matrix || method == Method::superop || - method == Method::unitary) { - return true; - } - if (method == Method::tensor_network) { - // if there are no save statevec ops, tensor network simulator runs as - // density matrix simulator - if ((!circ.opset().contains(Operations::OpType::save_statevec)) && - (!circ.opset().contains(Operations::OpType::save_statevec_dict))) { - return true; - } - } - - // If circuit contains a non-initial initialize that is not a full width - // instruction we can't sample - if (circ.can_sample_initialize == false) { - return false; - } - - // Check if non-density matrix simulation and circuit contains - // a stochastic instruction before measurement - // ie. reset, kraus, superop - // TODO: - // * Resets should be allowed if applied to |0> state (no gates before). 
- if (circ.opset().contains(Operations::OpType::reset) || - circ.opset().contains(Operations::OpType::kraus) || - circ.opset().contains(Operations::OpType::superop) || - circ.opset().contains(Operations::OpType::jump) || - circ.opset().contains(Operations::OpType::mark)) { - return false; - } - // Otherwise true - return true; -} - -template -void Controller::measure_sampler(InputIterator first_meas, - InputIterator last_meas, uint_t shots, - State_t &state, ExperimentResult &result, - RngEngine &rng, int_t shot_index) const { - // Check if meas_circ is empty, and if so return initial creg - if (first_meas == last_meas) { - while (shots-- > 0) { - result.save_count_data(state.cregs(), save_creg_memory_); - } - return; - } - - std::vector meas_ops; - std::vector roerror_ops; - for (auto op = first_meas; op != last_meas; op++) { - if (op->type == Operations::OpType::roerror) { - roerror_ops.push_back(*op); - } else { /*(op.type == Operations::OpType::measure) */ - meas_ops.push_back(*op); - } - } - - // Get measured qubits from circuit sort and delete duplicates - std::vector meas_qubits; // measured qubits - for (const auto &op : meas_ops) { - for (size_t j = 0; j < op.qubits.size(); ++j) - meas_qubits.push_back(op.qubits[j]); - } - sort(meas_qubits.begin(), meas_qubits.end()); - meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()), - meas_qubits.end()); - - // Generate the samples - uint_t shots_or_index; - if (shot_index < 0) - shots_or_index = shots; - else - shots_or_index = shot_index; - - auto timer_start = myclock_t::now(); - auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng); - auto time_taken = - std::chrono::duration(myclock_t::now() - timer_start).count(); - result.metadata.add(time_taken, "sample_measure_time"); - - // Make qubit map of position in vector of measured qubits - std::unordered_map qubit_map; - for (uint_t j = 0; j < meas_qubits.size(); ++j) { - qubit_map[meas_qubits[j]] = j; - } - - // Maps of memory and register to qubit position - std::map memory_map; - std::map register_map; - for (const auto &op : meas_ops) { - for (size_t j = 0; j < op.qubits.size(); ++j) { - auto pos = qubit_map[op.qubits[j]]; - if (!op.memory.empty()) - memory_map[op.memory[j]] = pos; - if (!op.registers.empty()) - register_map[op.registers[j]] = pos; - } - } - - // Process samples - uint_t num_memory = - (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first; - uint_t num_registers = - (register_map.empty()) ? 
0ULL : 1 + register_map.rbegin()->first; - ClassicalRegister creg; - while (!all_samples.empty()) { - auto sample = all_samples.back(); - creg.initialize(num_memory, num_registers); - - // process memory bit measurements - for (const auto &pair : memory_map) { - creg.store_measure(reg_t({sample[pair.second]}), reg_t({pair.first}), - reg_t()); - } - // process register bit measurements - for (const auto &pair : register_map) { - creg.store_measure(reg_t({sample[pair.second]}), reg_t(), - reg_t({pair.first})); - } - - // process read out errors for memory and registers - for (const Operations::Op &roerror : roerror_ops) { - creg.apply_roerror(roerror, rng); - } - - // Save count data - result.save_count_data(creg, save_creg_memory_); - - // pop off processed sample - all_samples.pop_back(); + throw std::runtime_error("Controller:Invalid simulation method"); } } -//------------------------------------------------------------------------- -// Validation -//------------------------------------------------------------------------- - -std::vector +std::vector Controller::simulation_methods(std::vector> &circuits, Noise::NoiseModel &noise_model) const { // Does noise model contain kraus noise @@ -1904,7 +810,7 @@ Controller::simulation_methods(std::vector> &circuits, return sim_methods; } -Controller::Method Controller::automatic_simulation_method( +Method Controller::automatic_simulation_method( const Circuit &circ, const Noise::NoiseModel &noise_model) const { // If circuit and noise model are Clifford run on Stabilizer simulator if (validate_method(Method::stabilizer, circ, noise_model, false)) { @@ -1918,7 +824,7 @@ Controller::Method Controller::automatic_simulation_method( if (noise_model.has_quantum_errors() && circ.num_qubits < 64 && circ.shots > (1ULL << circ.num_qubits) && validate_method(Method::density_matrix, circ, noise_model, false) && - check_measure_sampling_opt(circ, Method::density_matrix)) { + circ.can_sample) { return Method::density_matrix; } @@ -1942,95 +848,6 @@ Controller::Method Controller::automatic_simulation_method( return Method::statevector; } -bool Controller::validate_method(Method method, const Circuit &circ, - const Noise::NoiseModel &noise_model, - bool throw_except) const { - // Switch wrapper for templated function validate_state - switch (method) { - case Method::stabilizer: - return validate_state(Stabilizer::State(), circ, noise_model, throw_except); - case Method::extended_stabilizer: - return validate_state(ExtendedStabilizer::State(), circ, noise_model, - throw_except); - case Method::matrix_product_state: - return validate_state(MatrixProductState::State(), circ, noise_model, - throw_except); - case Method::statevector: - return validate_state(Statevector::State<>(), circ, noise_model, - throw_except); - case Method::density_matrix: - return validate_state(DensityMatrix::State<>(), circ, noise_model, - throw_except); - case Method::unitary: - return validate_state(QubitUnitary::State<>(), circ, noise_model, - throw_except); - case Method::superop: - return validate_state(QubitSuperoperator::State<>(), circ, noise_model, - throw_except); - case Method::tensor_network: - return validate_state(TensorNetwork::State<>(), circ, noise_model, - throw_except); - case Method::automatic: - throw std::runtime_error( - "Cannot validate circuit for unresolved simulation method."); - } -} - -template -bool Controller::validate_state(const state_t &state, const Circuit &circ, - const Noise::NoiseModel &noise, - bool throw_except) const { - std::stringstream error_msg; 
- std::string circ_name; - JSON::get_value(circ_name, "name", circ.header); - - // Check if a circuit is valid for state ops - bool circ_valid = state.opset().contains(circ.opset()); - if (throw_except && !circ_valid) { - error_msg << "Circuit " << circ_name << " contains invalid instructions "; - error_msg << state.opset().difference(circ.opset()); - error_msg << " for \"" << state.name() << "\" method."; - } - - // Check if a noise model valid for state ops - bool noise_valid = noise.is_ideal() || state.opset().contains(noise.opset()); - if (throw_except && !noise_valid) { - error_msg << "Noise model contains invalid instructions "; - error_msg << state.opset().difference(noise.opset()); - error_msg << " for \"" << state.name() << "\" method."; - } - - // Validate memory requirements - bool memory_valid = true; - if (max_memory_mb_ > 0) { - size_t required_mb = state.required_memory_mb(circ.num_qubits, circ.ops) / - num_process_per_experiment_; - size_t mem_size = (sim_device_ == Device::GPU) - ? max_memory_mb_ + max_gpu_memory_mb_ - : max_memory_mb_; - memory_valid = (required_mb <= mem_size); - if (throw_except && !memory_valid) { - error_msg << "Insufficient memory to run circuit " << circ_name; - error_msg << " using the " << state.name() << " simulator."; - error_msg << " Required memory: " << required_mb - << "M, max memory: " << max_memory_mb_ << "M"; - if (sim_device_ == Device::GPU) { - error_msg << " (Host) + " << max_gpu_memory_mb_ << "M (GPU)"; - } - } - } - - if (noise_valid && circ_valid && memory_valid) { - return true; - } - - // One of the validation checks failed for the current state - if (throw_except) { - throw std::runtime_error(error_msg.str()); - } - return false; -} - void Controller::save_exception_to_results(Result &result, const std::exception &e) const { result.status = Result::Status::error; @@ -2041,40 +858,25 @@ void Controller::save_exception_to_results(Result &result, } } -int_t Controller::get_matrix_bits(const Operations::Op &op) const { - int_t bit = 1; - if (op.type == Operations::OpType::matrix || - op.type == Operations::OpType::diagonal_matrix || - op.type == Operations::OpType::initialize) - bit = op.qubits.size(); - else if (op.type == Operations::OpType::kraus || - op.type == Operations::OpType::superop) { - if (method_ == Method::density_matrix) - bit = op.qubits.size() * 2; - else - bit = op.qubits.size(); - } - return bit; -} - -int_t Controller::get_max_matrix_qubits(const Circuit &circ) const { - int_t max_bits = 0; - int_t i; - - for (i = 0; i < circ.ops.size(); i++) { - int_t bit = 1; - bit = get_matrix_bits(circ.ops[i]); - max_bits = std::max(max_bits, bit); - } - return max_bits; -} - bool Controller::has_statevector_ops(const Circuit &circ) const { return circ.opset().contains(Operations::OpType::save_statevec) || circ.opset().contains(Operations::OpType::save_statevec_dict) || circ.opset().contains(Operations::OpType::save_amps); } +//------------------------------------------------------------------------- +// Validation +//------------------------------------------------------------------------- +bool Controller::validate_method(Method method, const Circuit &circ, + const Noise::NoiseModel &noise_model, + bool throw_except) const { + std::shared_ptr executor = + make_circuit_executor(method); + bool ret = executor->validate_state(circ, noise_model, throw_except); + executor.reset(); + return ret; +} + //------------------------------------------------------------------------- } // end namespace AER 
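// Aside (illustrative sketch, not part of this patch): validate_method above
// relies on a make_circuit_executor factory that is referenced but not shown
// here; assuming it dispatches on Method the way the old switch did, a
// minimal statevector case might look like the following (other methods
// would map to their own executor/state types):
//
//   std::shared_ptr<CircuitExecutor::Base>
//   Controller::make_circuit_executor(const Method method) const {
//     if (method == Method::statevector)
//       return std::make_shared<
//           CircuitExecutor::Executor<Statevector::State<>>>();
//     // ... remaining methods ...
//     throw std::runtime_error("Controller: Invalid simulation method");
//   }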
//------------------------------------------------------------------------- diff --git a/src/controllers/state_controller.hpp b/src/controllers/state_controller.hpp index bb7ac166e7..52791d16f6 100644 --- a/src/controllers/state_controller.hpp +++ b/src/controllers/state_controller.hpp @@ -804,7 +804,7 @@ reg_t AerState::initialize_statevector(uint_t num_of_qubits, complex_t *data, auto qv = QV::QubitVector(); qv.move_from_vector(std::move(vec)); - state->initialize_qreg(num_of_qubits_, std::move(qv)); + state->initialize_statevector(num_of_qubits_, std::move(qv)); state->initialize_creg(num_of_qubits_, num_of_qubits_); initialized_ = true; diff --git a/src/framework/config.hpp b/src/framework/config.hpp index 56a8015a0b..60a5d7c313 100644 --- a/src/framework/config.hpp +++ b/src/framework/config.hpp @@ -100,6 +100,9 @@ struct Config { bool batched_shots_gpu = false; uint_t batched_shots_gpu_max_qubits = 16; optional num_threads_per_device; + // # multi-shot branching + bool shot_branching_enable = false; + bool shot_branching_sampling_enable = false; // # statevector options uint_t statevector_parallel_threshold = 14; uint_t statevector_sample_measure_opt = 10; @@ -167,6 +170,7 @@ struct Config { optional unitary_parallel_threshold; optional memory_blocking_bits; optional extended_stabilizer_norm_estimation_default_samples; + optional target_gpus; void clear() { shots = 1024; @@ -201,6 +205,9 @@ struct Config { batched_shots_gpu = false; batched_shots_gpu_max_qubits = 16; num_threads_per_device.clear(); + // # multi-shot branching + shot_branching_enable = false; + shot_branching_sampling_enable = false; // # statevector options statevector_parallel_threshold = 14; statevector_sample_measure_opt = 10; @@ -263,6 +270,7 @@ struct Config { unitary_parallel_threshold.clear(); memory_blocking_bits.clear(); extended_stabilizer_norm_estimation_default_samples.clear(); + target_gpus.clear(); } void merge(const Config &other) { @@ -312,6 +320,9 @@ struct Config { batched_shots_gpu_max_qubits = other.batched_shots_gpu_max_qubits; if (other.num_threads_per_device.has_value()) num_threads_per_device.value(other.num_threads_per_device.value()); + // # multi-shot branching + shot_branching_enable = other.shot_branching_enable; + shot_branching_sampling_enable = other.shot_branching_sampling_enable; // # statevector options statevector_parallel_threshold = other.statevector_parallel_threshold; statevector_sample_measure_opt = other.statevector_sample_measure_opt; @@ -401,6 +412,8 @@ struct Config { if (other.extended_stabilizer_norm_estimation_default_samples.has_value()) extended_stabilizer_norm_estimation_default_samples.value( other.extended_stabilizer_norm_estimation_default_samples.value()); + if (other.target_gpus.has_value()) + target_gpus.value(other.target_gpus.value()); } }; @@ -440,6 +453,10 @@ inline void from_json(const json_t &js, Config &config) { get_value(config.batched_shots_gpu_max_qubits, "batched_shots_gpu_max_qubits", js); get_value(config.num_threads_per_device, "num_threads_per_device", js); + // # multi-shot branching + get_value(config.shot_branching_enable, "shot_branching_enable", js); + get_value(config.shot_branching_sampling_enable, + "shot_branching_sampling_enable", js); // # statevector options get_value(config.statevector_parallel_threshold, "statevector_parallel_threshold", js); @@ -511,6 +528,7 @@ inline void from_json(const json_t &js, Config &config) { get_value(config.memory_blocking_bits, "memory_blocking_bits", js); 
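// Aside (illustrative, not part of this patch): the options added to Config
// can be set programmatically as well as via JSON; plain bools are assigned
// directly, while optional fields such as target_gpus use their value()
// setter, as the merge() logic above does:
//
//   AER::Config config;
//   config.shot_branching_enable = true;          // branch states at runtime
//   config.shot_branching_sampling_enable = true; // sample final measures
//   config.target_gpus.value(reg_t({0, 1}));      // restrict to GPUs 0 and 1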
get_value(config.extended_stabilizer_norm_estimation_default_samples, "extended_stabilizer_norm_estimation_default_samples", js); + get_value(config.target_gpus, "target_gpus", js); } } // namespace AER diff --git a/src/framework/operations.hpp b/src/framework/operations.hpp old mode 100644 new mode 100755 index da1f575054..4ec55757ff --- a/src/framework/operations.hpp +++ b/src/framework/operations.hpp @@ -61,6 +61,7 @@ enum class OpType { superop, roerror, noise_switch, + sample_noise, // Save instructions save_state, save_expval, @@ -207,6 +208,9 @@ inline std::ostream &operator<<(std::ostream &stream, const OpType &type) { case OpType::qerror_loc: stream << "qerror_loc"; break; + case OpType::sample_noise: + stream << "sample_noise"; + break; case OpType::noise_switch: stream << "noise_switch"; break; diff --git a/src/framework/results/experiment_result.hpp b/src/framework/results/experiment_result.hpp index b956e5f06b..f8e2771e61 100644 --- a/src/framework/results/experiment_result.hpp +++ b/src/framework/results/experiment_result.hpp @@ -62,8 +62,6 @@ struct ExperimentResult { // save creg as count data void save_count_data(const ClassicalRegister &creg, bool save_memory); - void save_count_data(const std::vector &cregs, - bool save_memory); // Save data type which can be averaged over all shots. // This supports DataSubTypes: list, c_list, accum, c_accum, average, @@ -148,12 +146,6 @@ void ExperimentResult::save_count_data(const ClassicalRegister &creg, } } -void ExperimentResult::save_count_data( - const std::vector &cregs, bool save_memory) { - for (int_t i = 0; i < cregs.size(); i++) - save_count_data(cregs[i], save_memory); -} - template void ExperimentResult::save_data_average(const ClassicalRegister &creg, const std::string &key, const T &datum, diff --git a/src/framework/utils.hpp b/src/framework/utils.hpp old mode 100644 new mode 100755 index e2d3b8407b..6c3cc52d77 --- a/src/framework/utils.hpp +++ b/src/framework/utils.hpp @@ -1327,6 +1327,30 @@ double apply_omp_parallel_for_reduction(bool enabled, int_t i_begin, return val; } +// apply OpenMP parallel loop to lambda function and return reduced integer if +// enabled +template +int apply_omp_parallel_for_reduction_int(bool enabled, int_t i_begin, + int_t i_end, Lambda &func, + int nthreads = 0) { + int val = 0; + if (enabled) { + if (nthreads > 0) { +#pragma omp parallel for reduction(+ : val) num_threads(nthreads) + for (int_t i = i_begin; i < i_end; i++) + val += func(i); + } else { +#pragma omp parallel for reduction(+ : val) + for (int_t i = i_begin; i < i_end; i++) + val += func(i); + } + } else { + for (int_t i = i_begin; i < i_end; i++) + val += func(i); + } + return val; +} + //------------------------------------------------------------------------------ } // end namespace Utils //------------------------------------------------------------------------------ diff --git a/src/noise/noise_model.hpp b/src/noise/noise_model.hpp index d1207fa4b2..feff38054e 100644 --- a/src/noise/noise_model.hpp +++ b/src/noise/noise_model.hpp @@ -528,7 +528,7 @@ NoiseModel::sample_noise_helper(const Operations::Op &op, RngEngine &rng, // Combine errors auto &noise_ops = noise_before; noise_ops.reserve(noise_before.size() + noise_after.size() + 1); - if (op.type != Operations::OpType::qerror_loc) { + if (op.type != Operations::OpType::sample_noise) { noise_ops.push_back(op); } noise_ops.insert(noise_ops.end(), @@ -802,7 +802,7 @@ NoiseModel::NoiseOps NoiseModel::create_noise_loc(const Operations::Op &op) const { NoiseOps ops(1); ops[0] 
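// Aside (illustrative, not part of this patch): the reduction helper added to
// framework/utils.hpp above sums the integer results of a lambda over an
// index range, in parallel when enabled. The lambda is taken by lvalue
// reference, so name it first; 'flags' and 'num_shots' are hypothetical:
//
//   auto count_set = [&flags](int_t i) -> int { return flags[i] ? 1 : 0; };
//   int total = Utils::apply_omp_parallel_for_reduction_int(
//       true, 0, num_shots, count_set);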
= op;
-  ops[0].type = Operations::OpType::qerror_loc;
+  ops[0].type = Operations::OpType::sample_noise;
   return ops;
 }
diff --git a/src/simulators/batch_shots_executor.hpp b/src/simulators/batch_shots_executor.hpp
new file mode 100644
index 0000000000..e0e7b544a8
--- /dev/null
+++ b/src/simulators/batch_shots_executor.hpp
@@ -0,0 +1,477 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _batch_shots_executor_hpp_
+#define _batch_shots_executor_hpp_
+
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+//-------------------------------------------------------------------------
+// batched-shots executor class implementation
+//-------------------------------------------------------------------------
+template <class state_t>
+class BatchShotsExecutor : public virtual MultiStateExecutor<state_t> {
+  using Base = MultiStateExecutor<state_t>;
+
+protected:
+  // config setting for multi-shot parallelization
+  bool batched_shots_gpu_ = true;
+  int_t batched_shots_gpu_max_qubits_ =
+      16; // multi-shot parallelization is applied if the number of qubits
+          // is less than this maximum
+  bool enable_batch_multi_shots_ =
+      false; // multi-shot parallelization can be applied
+  uint_t local_state_index_; // local shot ID of current loop
+public:
+  BatchShotsExecutor();
+  virtual ~BatchShotsExecutor();
+
+protected:
+  void set_config(const Config &config) override;
+  void set_parallelization(const Circuit &circ,
+                           const Noise::NoiseModel &noise) override;
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  // apply ops for multi-shots to one group
+  template <typename InputIterator>
+  void apply_ops_batched_shots_for_group(int_t i_group, InputIterator first,
+                                         InputIterator last,
+                                         const Noise::NoiseModel &noise,
+                                         ExperimentResult &result,
+                                         RngEngine &init_rng, uint_t rng_seed,
+                                         bool final_ops);
+
+  // apply op to multiple shots, return false if op is not supported to
+  // execute in a batch
+  virtual bool apply_batched_op(const int_t istate, const Operations::Op &op,
+                                ExperimentResult &result,
+                                std::vector<RngEngine> &rng,
+                                bool final_op = false) {
+    return false;
+  }
+
+  // apply sampled noise to multiple-shots (this is used for ops containing
+  // non-Pauli operators)
+  void apply_batched_noise_ops(
+      const int_t i_group, const std::vector<std::vector<Operations::Op>> &ops,
+      ExperimentResult &result, std::vector<RngEngine> &rng);
+};
+
+template <class state_t>
+BatchShotsExecutor<state_t>::BatchShotsExecutor() {}
+
+template <class state_t>
+BatchShotsExecutor<state_t>::~BatchShotsExecutor() {}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::set_config(const Config &config) {
+  Base::set_config(config);
+
+  // enable batched multi-shots/experiments optimization
+  batched_shots_gpu_ = config.batched_shots_gpu;
+
+  batched_shots_gpu_max_qubits_ = config.batched_shots_gpu_max_qubits;
+  if (Base::method_ == Method::density_matrix ||
+      Base::method_ == Method::unitary)
+    batched_shots_gpu_max_qubits_ /= 2;
+}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::set_parallelization(
+    const Circuit &circ, const Noise::NoiseModel &noise) {
+  Base::set_parallelization(circ, noise);
+
+  enable_batch_multi_shots_ = false;
+  if (batched_shots_gpu_ && Base::sim_device_ != Device::CPU) {
+    enable_batch_multi_shots_ = true;
+    if (circ.num_qubits >= batched_shots_gpu_max_qubits_)
+      enable_batch_multi_shots_ = false;
+    else if (circ.shots == 1)
+      enable_batch_multi_shots_ = false;
+    //    else if (Base::multiple_chunk_required(circ, noise))
+    //      enable_batch_multi_shots_ = false;
+  }
+
+#ifdef AER_CUSTATEVEC
+  // disable cuStateVec for batch-shots optimization
+  if (enable_batch_multi_shots_ && Base::cuStateVec_enable_)
+    Base::cuStateVec_enable_ = false;
+#endif
+}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::run_circuit_shots(
+    Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
+    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+  state_t dummy_state;
+  // if batched-shot is not applicable, use base multi-shots executor
+  if (!enable_batch_multi_shots_) {
+    return Base::run_circuit_shots(circ, noise, config, init_rng, result,
+                                   sample_noise);
+  }
+
+  Noise::NoiseModel dummy_noise;
+
+  Base::num_qubits_ = circ.num_qubits;
+  Base::num_creg_memory_ = circ.num_memory;
+  Base::num_creg_registers_ = circ.num_registers;
+
+  if (Base::sim_device_ == Device::GPU) {
+#ifdef _OPENMP
+    if (omp_get_num_threads() == 1)
+      Base::shot_omp_parallel_ = true;
+#endif
+  } else if (Base::sim_device_ == Device::ThrustCPU) {
+    Base::shot_omp_parallel_ = false;
+  }
+
+  Base::set_distribution(circ.shots);
+  Base::num_max_shots_ = Base::get_max_parallel_shots(circ, noise);
+  if (Base::num_max_shots_ == 0)
+    Base::num_max_shots_ = 1;
+
+  RngEngine rng = init_rng;
+
+  Circuit circ_opt;
+  if (sample_noise)
+    circ_opt =
+        noise.sample_noise(circ, rng, Noise::NoiseModel::Method::circuit, true);
+  else
+    circ_opt = circ;
+  auto fusion_pass = Base::transpile_fusion(circ_opt.opset(), config);
+
+  fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
+                               result);
+  Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
+
+  // Add batched multi-shots optimization metadata
+  result.metadata.add(true, "batched_shots_optimization");
+
+  int_t i;
+  int_t i_begin, n_shots;
+
+#ifdef AER_MPI
+  // if shots are distributed to MPI processes, allocate cregs to be gathered
+  if (Base::num_process_per_experiment_ > 1)
+    Base::cregs_.resize(circ_opt.shots);
+#endif
+
+  i_begin = 0;
+  while (i_begin < Base::num_local_states_) {
+    local_state_index_ = Base::global_state_index_ + i_begin;
+
+    // loop over as many states as fit in available memory at once
+    n_shots = std::min(Base::num_local_states_, Base::num_max_shots_);
+    if (i_begin + n_shots > Base::num_local_states_) {
+      n_shots = Base::num_local_states_ - i_begin;
+    }
+
+    // allocate shots
+    this->allocate_states(n_shots, config);
+
+    // Set state config
+    for (i = 0; i < n_shots; i++) {
+      Base::states_[i].set_parallelization(Base::parallel_state_update_);
+      Base::states_[i].set_global_phase(circ.global_phase_angle);
+    }
+    this->set_global_phase(circ_opt.global_phase_angle);
+
+    // initialization (equivalent to initialize_qreg + initialize_creg)
+    auto init_group = [this](int_t ig) {
+      for (uint_t j = Base::top_state_of_group_[ig];
+           j < Base::top_state_of_group_[ig + 1]; j++) {
+        // enabling batch shots optimization
+        Base::states_[j].qreg().enable_batch(true);
+
+        // initialize qreg here
+        Base::states_[j].qreg().set_num_qubits(Base::num_qubits_);
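+        // each state's qreg is sized and reset here; with batching enabled
+        // above, subsequent gate applications broadcast across all shots in
+        // the group instead of looping over one state at a time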
+        Base::states_[j].qreg().initialize();
+
+        // initialize creg here
+        Base::states_[j].qreg().initialize_creg(Base::num_creg_memory_,
+                                                Base::num_creg_registers_);
+      }
+    };
+    Utils::apply_omp_parallel_for(
+        (Base::num_groups_ > 1 && Base::shot_omp_parallel_), 0,
+        Base::num_groups_, init_group);
+
+    this->apply_global_phase(); // this is parallelized in sub-classes
+
+    // apply ops to multiple-shots
+    if (Base::num_groups_ > 1 && Base::shot_omp_parallel_) {
+      std::vector<ExperimentResult> par_results(Base::num_groups_);
+#pragma omp parallel for num_threads(Base::num_groups_)
+      for (i = 0; i < Base::num_groups_; i++)
+        apply_ops_batched_shots_for_group(
+            i, circ_opt.ops.cbegin(), circ_opt.ops.cend(), noise,
+            par_results[i], rng, circ_opt.seed, true);
+
+      for (auto &res : par_results)
+        result.combine(std::move(res));
+    } else {
+      for (i = 0; i < Base::num_groups_; i++)
+        apply_ops_batched_shots_for_group(i, circ_opt.ops.cbegin(),
+                                          circ_opt.ops.cend(), noise, result,
+                                          rng, circ_opt.seed, true);
+    }
+
+    // collect measured bits and copy memory
+    for (i = 0; i < n_shots; i++) {
+      if (Base::num_process_per_experiment_ > 1) {
+        Base::states_[i].qreg().read_measured_data(
+            Base::cregs_[local_state_index_ + i]);
+      } else {
+        Base::states_[i].qreg().read_measured_data(Base::states_[i].creg());
+        result.save_count_data(Base::states_[i].creg(),
+                               Base::save_creg_memory_);
+      }
+    }
+
+    i_begin += n_shots;
+  }
+
+  // gather cregs on MPI processes and save to result
+#ifdef AER_MPI
+  if (Base::num_process_per_experiment_ > 1) {
+    Base::gather_creg_memory(Base::cregs_, Base::state_index_begin_);
+
+    for (i = 0; i < circ_opt.shots; i++)
+      result.save_count_data(Base::cregs_[i], Base::save_creg_memory_);
+    Base::cregs_.clear();
+  }
+#endif
+
+#ifdef AER_THRUST_CUDA
+  if (Base::sim_device_ == Device::GPU) {
+    int nDev;
+    if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+      cudaGetLastError();
+      nDev = 0;
+    }
+    if (nDev > Base::num_groups_)
+      nDev = Base::num_groups_;
+    result.metadata.add(nDev, "batched_shots_optimization_parallel_gpus");
+  }
+#endif
+}
+
+template <class state_t>
+template <typename InputIterator>
+void BatchShotsExecutor<state_t>::apply_ops_batched_shots_for_group(
+    int_t i_group, InputIterator first, InputIterator last,
+    const Noise::NoiseModel &noise, ExperimentResult &result,
+    RngEngine &init_rng, uint_t rng_seed, bool final_ops) {
+  uint_t istate = Base::top_state_of_group_[i_group];
+  std::vector<RngEngine> rng(Base::num_states_in_group_[i_group]);
+#ifdef _OPENMP
+  int num_inner_threads = omp_get_max_threads() / omp_get_num_threads();
+#else
+  int num_inner_threads = 1;
+#endif
+
+  for (uint_t j = Base::top_state_of_group_[i_group];
+       j < Base::top_state_of_group_[i_group + 1]; j++)
+    if (local_state_index_ + j == 0)
+      rng[j - Base::top_state_of_group_[i_group]] = init_rng;
+    else {
+      rng[j - Base::top_state_of_group_[i_group]].set_seed(
+          rng_seed + local_state_index_ + j);
+    }
+
+  for (auto op = first; op != last; ++op) {
+    if (op->type == Operations::OpType::sample_noise) {
+      // sample error here
+      uint_t count = Base::num_states_in_group_[i_group];
+      std::vector<std::vector<Operations::Op>> noise_ops(count);
+
+      uint_t count_ops = 0;
+      uint_t non_pauli_gate_count = 0;
+      if (num_inner_threads > 1) {
+#pragma omp parallel for reduction(+: count_ops,non_pauli_gate_count) num_threads(num_inner_threads)
+        for (int_t j = 0; j < count; j++) {
+          noise_ops[j] = noise.sample_noise_loc(*op, rng[j]);
+
+          if (!(noise_ops[j].size() == 0 ||
+                (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) {
+            count_ops++;
+            for (int_t k = 0; k < noise_ops[j].size(); k++) {
+              if (noise_ops[j][k].name != "id" &&
+                  noise_ops[j][k].name != "x" &&
+                  noise_ops[j][k].name != "y" &&
+                  noise_ops[j][k].name != "z" &&
+                  noise_ops[j][k].name != "pauli") {
+                non_pauli_gate_count++;
+                break;
+              }
+            }
+          }
+        }
+      } else {
+        for (int_t j = 0; j < count; j++) {
+          noise_ops[j] = noise.sample_noise_loc(*op, rng[j]);
+
+          if (!(noise_ops[j].size() == 0 ||
+                (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) {
+            count_ops++;
+            for (int_t k = 0; k < noise_ops[j].size(); k++) {
+              if (noise_ops[j][k].name != "id" &&
+                  noise_ops[j][k].name != "x" &&
+                  noise_ops[j][k].name != "y" &&
+                  noise_ops[j][k].name != "z" &&
+                  noise_ops[j][k].name != "pauli") {
+                non_pauli_gate_count++;
+                break;
+              }
+            }
+          }
+        }
+      }
+
+      if (count_ops == 0) {
+        continue; // do nothing
+      }
+      if (non_pauli_gate_count == 0) { // optimization for Pauli error
+        Base::states_[istate].qreg().apply_batched_pauli_ops(noise_ops);
+      } else {
+        // otherwise execute each circuit
+        apply_batched_noise_ops(i_group, noise_ops, result, rng);
+      }
+    } else {
+      if (!apply_batched_op(istate, *op, result, rng,
+                            final_ops && (op + 1 == last))) {
+        // call apply_op for each state
+        for (uint_t j = Base::top_state_of_group_[i_group];
+             j < Base::top_state_of_group_[i_group + 1]; j++) {
+          Base::states_[j].qreg().enable_batch(false);
+          Base::states_[j].qreg().read_measured_data(Base::states_[j].creg());
+          Base::states_[j].apply_op(*op, result,
+                                    rng[j - Base::top_state_of_group_[i_group]],
+                                    final_ops && (op + 1 == last));
+          Base::states_[j].qreg().enable_batch(true);
+        }
+      }
+    }
+  }
+}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::apply_batched_noise_ops(
+    const int_t i_group, const std::vector<std::vector<Operations::Op>> &ops,
+    ExperimentResult &result, std::vector<RngEngine> &rng) {
+  int_t i, j, k, count, nop, pos = 0;
+  uint_t istate = Base::top_state_of_group_[i_group];
+  count = ops.size();
+
+  reg_t mask(count);
+  std::vector<bool> finished(count, false);
+  for (i = 0; i < count; i++) {
+    int_t cond_reg = -1;
+
+    if (finished[i])
+      continue;
+    if (ops[i].size() == 0 || (ops[i].size() == 1 && ops[i][0].name == "id")) {
+      finished[i] = true;
+      continue;
+    }
+    mask[i] = 1;
+
+    // find same ops to be executed in a batch
+    for (j = i + 1; j < count; j++) {
+      if (finished[j]) {
+        mask[j] = 0;
+        continue;
+      }
+      if (ops[j].size() == 0 ||
+          (ops[j].size() == 1 && ops[j][0].name == "id")) {
+        mask[j] = 0;
+        finished[j] = true;
+        continue;
+      }
+
+      if (ops[i].size() != ops[j].size()) {
+        mask[j] = 0;
+        continue;
+      }
+
+      mask[j] = true;
+      for (k = 0; k < ops[i].size(); k++) {
+        if (ops[i][k].conditional) {
+          cond_reg = ops[i][k].conditional_reg;
+        }
+        if (ops[i][k].type != ops[j][k].type ||
+            ops[i][k].name != ops[j][k].name) {
+          mask[j] = false;
+          break;
+        }
+      }
+      if (mask[j])
+        finished[j] = true;
+    }
+
+    // mask conditional register
+    int_t sys_reg = Base::states_[istate].qreg().set_batched_system_conditional(
+        cond_reg, mask);
+
+    // batched execution on same ops
+    for (k = 0; k < ops[i].size(); k++) {
+      Operations::Op cop = ops[i][k];
+
+      // mark op conditional to mask shots
+      cop.conditional = true;
+      cop.conditional_reg = sys_reg;
+
+      if (!apply_batched_op(istate, cop, result, rng, false)) {
+        // call apply_op for each state
+        /*if(cop.conditional){
+          //copy creg to local state
+          reg_t reg_pos(1);
+          reg_t mem_pos;
+          int bit =
+              Base::states_[j].qreg().measured_cregister(cop.conditional_reg);
+          const reg_t reg = Utils::int2reg(bit, 2, 1);
+          reg_pos[0] = cop.conditional_reg;
+          Base::states_[j].creg().store_measure(reg, mem_pos, reg_pos);
+        }*/
+        for (uint_t j = Base::top_state_of_group_[i_group];
+             j < Base::top_state_of_group_[i_group + 1]; j++) {
+          Base::states_[j].qreg().enable_batch(false);
+          Base::states_[j].apply_op(
+              cop, result, rng[j - Base::top_state_of_group_[i_group]], false);
+          Base::states_[j].qreg().enable_batch(true);
+        }
+      }
+    }
+    mask[i] = 0;
+    finished[i] = true;
+  }
+}
+
+//-------------------------------------------------------------------------
+} // end namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/chunk_utils.hpp b/src/simulators/chunk_utils.hpp
new file mode 100644
index 0000000000..3277e2c0fd
--- /dev/null
+++ b/src/simulators/chunk_utils.hpp
@@ -0,0 +1,116 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _chunk_utils_hpp
+#define _chunk_utils_hpp
+
+#include "framework/opset.hpp"
+#include "framework/types.hpp"
+
+namespace AER {
+
+namespace Chunk {
+
+void get_qubits_inout(const int chunk_qubits, const reg_t &qubits,
+                      reg_t &qubits_in, reg_t &qubits_out) {
+  int_t i;
+  qubits_in.clear();
+  qubits_out.clear();
+  for (i = 0; i < qubits.size(); i++) {
+    if (qubits[i] < chunk_qubits) { // in chunk
+      qubits_in.push_back(qubits[i]);
+    } else {
+      qubits_out.push_back(qubits[i]);
+    }
+  }
+}
+
+void get_inout_ctrl_qubits(const Operations::Op &op, const uint_t num_qubits,
+                           reg_t &qubits_in, reg_t &qubits_out) {
+  if (op.type == Operations::OpType::gate &&
+      (op.name[0] == 'c' || op.name.find("mc") == 0)) {
+    for (int i = 0; i < op.qubits.size(); i++) {
+      if (op.qubits[i] < num_qubits)
+        qubits_in.push_back(op.qubits[i]);
+      else
+        qubits_out.push_back(op.qubits[i]);
+    }
+  }
+}
+
+Operations::Op correct_gate_op_in_chunk(const Operations::Op &op,
+                                        reg_t &qubits_in) {
+  Operations::Op new_op = op;
+  new_op.qubits = qubits_in;
+  // change gate name if there are no control qubits inside chunk
+  if (op.name.find("swap") != std::string::npos && qubits_in.size() == 2) {
+    new_op.name = "swap";
+  }
+  if (op.name.find("ccx") != std::string::npos) {
+    if (qubits_in.size() == 1)
+      new_op.name = "x";
+    else
+      new_op.name = "cx";
+  } else if (qubits_in.size() == 1) {
+    if (op.name[0] == 'c')
+      new_op.name = op.name.substr(1);
+    else if (op.name == "mcphase")
+      new_op.name = "p";
+    else
+      new_op.name = op.name.substr(2); // remove "mc"
+  }
+  return new_op;
+}
+
+void block_diagonal_matrix(const uint_t gid, const uint_t chunk_bits,
+                           reg_t &qubits, cvector_t &diag) {
+  uint_t i;
+  uint_t mask_out = 0;
+  uint_t mask_id = 0;
+
+  reg_t qubits_in;
+  cvector_t diag_in;
+
+  for (i = 0; i < qubits.size(); i++) {
+    if (qubits[i] < chunk_bits) { // in chunk
+      qubits_in.push_back(qubits[i]);
+    } else {
+      mask_out |= (1ull << i);
+      if ((gid >> (qubits[i] - chunk_bits)) & 1)
+        mask_id |= (1ull << i);
+    }
+  }
+
+  if (qubits_in.size() < qubits.size()) {
+    for (i = 0; i < diag.size(); i++) {
+      if ((i & mask_out) == mask_id)
+        diag_in.push_back(diag[i]);
+    }
+
+    if (qubits_in.size() == 0) {
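+      // all target qubits lie outside this chunk, so the surviving entry is
+      // a global factor for the whole chunk; pad it to a one-qubit diagonal
+      // on qubit 0 with equal entries so it applies uniformly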
+      qubits_in.push_back(0);
+      diag_in.resize(2);
+      diag_in[1] = diag_in[0];
+    }
+    qubits = qubits_in;
+    diag = diag_in;
+  }
+}
+
+//-------------------------------------------------------------------------
+} // namespace Chunk
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/circuit_executor.hpp b/src/simulators/circuit_executor.hpp
new file mode 100644
index 0000000000..aaa0e7b217
--- /dev/null
+++ b/src/simulators/circuit_executor.hpp
@@ -0,0 +1,1189 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _circuit_executor_hpp_
+#define _circuit_executor_hpp_
+
+#include "framework/config.hpp"
+#include "framework/creg.hpp"
+#include "framework/json.hpp"
+#include "framework/opset.hpp"
+#include "framework/results/experiment_result.hpp"
+#include "framework/results/result.hpp"
+#include "framework/rng.hpp"
+#include "framework/types.hpp"
+#include "noise/noise_model.hpp"
+
+#include "transpile/cacheblocking.hpp"
+#include "transpile/fusion.hpp"
+
+#include "simulators/state.hpp"
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+using OpItr = std::vector<Operations::Op>::const_iterator;
+
+// Timer type
+using myclock_t = std::chrono::high_resolution_clock;
+
+//-------------------------------------------------------------------------
+// Executor base class
+//-------------------------------------------------------------------------
+class Base {
+protected:
+public:
+  Base() {}
+  virtual ~Base() {}
+
+  virtual void run_circuit(Circuit &circ, const Noise::NoiseModel &noise,
+                           const Config &config, const Method method,
+                           const Device device, ExperimentResult &result) = 0;
+
+  // Return an estimate of the required memory for a circuit.
+  virtual size_t required_memory_mb(const Circuit &circuit,
+                                    const Noise::NoiseModel &noise) const = 0;
+  virtual size_t max_memory_mb(void) = 0;
+
+  virtual bool validate_state(const Circuit &circ,
+                              const Noise::NoiseModel &noise,
+                              bool throw_except) const = 0;
+};
+
+//-------------------------------------------------------------------------
+// Simple Executor
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public Base {
+protected:
+  // Simulation method
+  Method method_;
+
+  // Simulation device
+  Device sim_device_ = Device::CPU;
+
+  // Simulation precision
+  Precision sim_precision_ = Precision::Double;
+
+  // Save counts as memory list
+  bool save_creg_memory_ = false;
+
+  // The maximum number of threads to use for various levels of parallelization
+  int max_parallel_threads_;
+
+  // Parameters for parallelization management in configuration
+  int max_parallel_shots_;
+  size_t max_memory_mb_;
+  size_t max_gpu_memory_mb_;
+  int num_gpus_;      // max number of GPU per process
+  reg_t target_gpus_; // GPUs to be used
+
+  // use explicit parallelization
+  bool explicit_parallelization_;
+
+  // Parameters for parallelization management for experiments
+  int parallel_experiments_;
+  int parallel_shots_;
+  int parallel_state_update_;
+
+  // results are stored independently in each process if true
+  bool accept_distributed_results_ = true;
+
+  uint_t myrank_;               // process ID
+  uint_t nprocs_;               // number of processes
+  uint_t distributed_rank_;     // process ID in communicator group
+  uint_t distributed_procs_;    // number of processes in communicator group
+  uint_t distributed_group_;    // group id of distribution
+  int_t distributed_proc_bits_; // distributed_procs_=2^distributed_proc_bits_
+                                // (if nprocs != power of 2, set -1)
+  int num_process_per_experiment_ = 1;
+
+#ifdef AER_MPI
+  // communicator group to simulate a circuit (for multi-experiments)
+  MPI_Comm distributed_comm_;
+#endif
+
+#ifdef AER_CUSTATEVEC
+  // settings for cuStateVec
+  bool cuStateVec_enable_ = false;
+#endif
+
+  // if circuit has statevector operations or not
+  bool has_statevector_ops_;
+
+public:
+  Executor();
+  virtual ~Executor() {}
+
+  void run_circuit(Circuit &circ, const Noise::NoiseModel &noise,
+                   const Config &config, const Method method,
+                   const Device device, ExperimentResult &result) override;
+
+  // Return an estimate of the required memory for a circuit.
+  size_t required_memory_mb(const Circuit &circuit,
+                            const Noise::NoiseModel &noise) const override {
+    state_t tmp;
+    return tmp.required_memory_mb(circuit.num_qubits, circuit.ops);
+  }
+  size_t max_memory_mb(void) override { return max_memory_mb_; }
+
+  bool validate_state(const Circuit &circ, const Noise::NoiseModel &noise,
+                      bool throw_except) const override;
+
+protected:
+  // Return a fusion transpilation pass configured for the current
+  // method, circuit and config
+  Transpile::Fusion transpile_fusion(const Operations::OpSet &opset,
+                                     const Config &config) const;
+
+  // return maximum number of qubits for matrix
+  int_t get_max_matrix_qubits(const Circuit &circ) const;
+  int_t get_matrix_bits(const Operations::Op &op) const;
+
+  // Get system memory size
+  size_t get_system_memory_mb();
+  size_t get_gpu_memory_mb();
+
+  size_t get_min_memory_mb() const {
+    if (sim_device_ == Device::GPU && num_gpus_ > 0) {
+      return max_gpu_memory_mb_ / num_gpus_; // return per GPU memory size
+    }
+    return max_memory_mb_;
+  }
+
+  // get max number of shots that can be stored in memory
+  uint_t get_max_parallel_shots(const Circuit &circuit,
+                                const Noise::NoiseModel &noise) const;
+
+  bool multiple_shots_required(const Circuit &circuit,
+                               const Noise::NoiseModel &noise) const;
+
+  // Check if measure sampling optimization is valid for the input circuit
+  // for the given method. This checks if operation types before
+  // the first measurement in the circuit prevent sampling
+  bool check_measure_sampling_opt(const Circuit &circ) const;
+
+  bool has_statevector_ops(const Circuit &circ) const;
+
+  virtual void set_config(const Config &config);
+  virtual void set_parallelization(const Circuit &circ,
+                                   const Noise::NoiseModel &noise);
+
+  virtual void run_circuit_with_sampling(Circuit &circ, const Config &config,
+                                         RngEngine &init_rng,
+                                         ExperimentResult &result);
+
+  virtual void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                                 const Config &config, RngEngine &init_rng,
+                                 ExperimentResult &result, bool sample_noise);
+
+  template <typename InputIterator>
+  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
+                       uint_t shots, state_t &state, ExperimentResult &result,
+                       RngEngine &rng) const;
+
+#ifdef AER_MPI
+  void gather_creg_memory(std::vector<ClassicalRegister> &cregs,
+                          reg_t &shot_index);
+#endif
+};
+
+template <class state_t>
+Executor<state_t>::Executor() {
+  max_memory_mb_ = 0;
+  max_gpu_memory_mb_ = 0;
+  max_parallel_threads_ = 0;
+  max_parallel_shots_ = 0;
+
+  parallel_shots_ = 1;
+  parallel_state_update_ = 1;
+
+  num_process_per_experiment_ = 0;
+
+  num_gpus_ = 0;
+
+  explicit_parallelization_ = false;
+
+  has_statevector_ops_ = false;
+
+  myrank_ = 0;
+  nprocs_ = 1;
+
+  distributed_procs_ = 1;
+  distributed_rank_ = 0;
+  distributed_group_ = 0;
+  distributed_proc_bits_ = 0;
+
+#ifdef AER_MPI
+  distributed_comm_ = MPI_COMM_WORLD;
+#endif
+}
+
+template <class state_t>
+void Executor<state_t>::set_config(const Config &config) {
+  // Load config for memory (creg list data)
+  if (config.memory.has_value())
+    save_creg_memory_ = config.memory.value();
+
+#ifdef _OPENMP
+  // Load OpenMP maximum thread settings
+  if (config.max_parallel_threads.has_value())
+    max_parallel_threads_ = config.max_parallel_threads.value();
+  if (config.max_parallel_shots.has_value())
+    max_parallel_shots_ = config.max_parallel_shots.value();
+  // Limit max threads based on number of available OpenMP threads
+  auto omp_threads = omp_get_max_threads();
+  max_parallel_threads_ = (max_parallel_threads_ > 0)
+                              ? std::min(max_parallel_threads_, omp_threads)
+                              : std::max(1, omp_threads);
+#else
+  // No OpenMP so we disable parallelization
+  max_parallel_threads_ = 1;
+  max_parallel_shots_ = 1;
+#endif
+
+  // Load configurations for parallelization
+
+  if (config.max_memory_mb.has_value())
+    max_memory_mb_ = config.max_memory_mb.value();
+
+  // for debugging
+  if (config._parallel_shots.has_value()) {
+    parallel_shots_ = config._parallel_shots.value();
+    explicit_parallelization_ = true;
+  }
+
+  // for debugging
+  if (config._parallel_state_update.has_value()) {
+    parallel_state_update_ = config._parallel_state_update.value();
+    explicit_parallelization_ = true;
+  }
+
+  if (explicit_parallelization_) {
+    parallel_shots_ = std::max({parallel_shots_, 1});
+    parallel_state_update_ = std::max({parallel_state_update_, 1});
+  }
+
+  if (config.accept_distributed_results.has_value())
+    accept_distributed_results_ = config.accept_distributed_results.value();
+
+#ifdef AER_CUSTATEVEC
+  // cuStateVec configs
+  cuStateVec_enable_ = false;
+  if (config.cuStateVec_enable.has_value())
+    cuStateVec_enable_ = config.cuStateVec_enable.value();
+#endif
+
+  std::string precision = config.precision;
+  if (precision == "double") {
+    sim_precision_ = Precision::Double;
+  } else if (precision == "single") {
+    sim_precision_ = Precision::Single;
+  }
+
+  // set target GPUs
+#ifdef AER_THRUST_CUDA
+  int nDev = 0;
+  if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+    cudaGetLastError();
+    nDev = 0;
+  }
+  if (config.target_gpus.has_value()) {
+    target_gpus_ = config.target_gpus.value();
+    if (nDev < target_gpus_.size()) {
+      throw std::invalid_argument("target_gpus has more GPUs than available.");
+    }
+    num_gpus_ = target_gpus_.size();
+  } else {
+    num_gpus_ = nDev;
+    target_gpus_.resize(num_gpus_);
+    for (int_t i = 0; i < num_gpus_; i++)
+      target_gpus_[i] = i;
+  }
+#endif
+}
+
+template <class state_t>
+size_t Executor<state_t>::get_system_memory_mb() {
+  size_t total_physical_memory = Utils::get_system_memory_mb();
+#ifdef AER_MPI
+  // get minimum memory size per process
+  uint64_t locMem, minMem;
+  locMem = total_physical_memory;
+  MPI_Allreduce(&locMem, &minMem, 1, MPI_UINT64_T, MPI_MIN, distributed_comm_);
+  total_physical_memory = minMem;
+#endif
+
+  return total_physical_memory;
+}
+
+template <class state_t>
+size_t Executor<state_t>::get_gpu_memory_mb() {
+  size_t total_physical_memory = 0;
+#ifdef AER_THRUST_CUDA
+  for (int_t iDev = 0; iDev < target_gpus_.size(); iDev++) {
+    size_t freeMem, totalMem;
+    cudaSetDevice(target_gpus_[iDev]);
+    cudaMemGetInfo(&freeMem, &totalMem);
+    total_physical_memory += totalMem;
+  }
+#endif
+
+#ifdef AER_MPI
+  // get minimum memory size per process
+  uint64_t locMem, minMem;
+  locMem = total_physical_memory;
+  MPI_Allreduce(&locMem, &minMem, 1, MPI_UINT64_T, MPI_MIN, distributed_comm_);
+  total_physical_memory = minMem;
+
+  int t = num_gpus_;
+  MPI_Allreduce(&t, &num_gpus_, 1, MPI_INT, MPI_MAX, distributed_comm_);
+#endif
+
+  return total_physical_memory >> 20;
+}
+
+template <class state_t>
+bool Executor<state_t>::multiple_shots_required(
+    const Circuit &circ, const Noise::NoiseModel &noise) const {
+  if (circ.shots < 2)
+    return false;
+  if (method_ == Method::density_matrix || method_ == Method::superop ||
+      method_ == Method::unitary) {
+    return false;
+  }
+
+  bool can_sample = check_measure_sampling_opt(circ);
+
+  if (noise.is_ideal()) {
+    return !can_sample;
+  }
+
+  return true;
+}
+
+template <class state_t>
+uint_t Executor<state_t>::get_max_parallel_shots(
+    const Circuit &circ, const Noise::NoiseModel &noise) const {
+  uint_t mem = required_memory_mb(circ, noise);
+  if (mem == 0)
+    return circ.shots;
+
+  if (sim_device_ == Device::GPU && num_gpus_ > 0) {
+    return std::min(circ.shots, (max_gpu_memory_mb_ * 8 / 10 / mem));
+  } else {
+    return std::min(circ.shots, (max_memory_mb_ / mem));
+  }
+}
+
+template <class state_t>
+void Executor<state_t>::set_parallelization(const Circuit &circ,
+                                            const Noise::NoiseModel &noise) {
+  // MPI setting
+  myrank_ = 0;
+  nprocs_ = 1;
+#ifdef AER_MPI
+  int t;
+  MPI_Comm_size(MPI_COMM_WORLD, &t);
+  nprocs_ = t;
+  MPI_Comm_rank(MPI_COMM_WORLD, &t);
+  myrank_ = t;
+#endif
+  if (num_process_per_experiment_ == 0)
+    num_process_per_experiment_ = nprocs_;
+
+  distributed_procs_ = num_process_per_experiment_;
+  distributed_rank_ = myrank_ % distributed_procs_;
+  distributed_group_ = myrank_ / distributed_procs_;
+
+  distributed_proc_bits_ = 0;
+  int proc_bits = 0;
+  uint_t p = distributed_procs_;
+  while (p > 1) {
+    if ((p & 1) != 0) { // procs is not power of 2
+      distributed_proc_bits_ = -1;
+      break;
+    }
+    distributed_proc_bits_++;
+    p >>= 1;
+  }
+
+#ifdef AER_MPI
+  if (num_process_per_experiment_ != nprocs_) {
+    MPI_Comm_split(MPI_COMM_WORLD, (int)distributed_group_,
+                   (int)distributed_rank_, &distributed_comm_);
+  } else {
+    distributed_comm_ = MPI_COMM_WORLD;
+  }
+#endif
+
+  if (max_memory_mb_ == 0)
+    max_memory_mb_ = get_system_memory_mb();
+  max_gpu_memory_mb_ = get_gpu_memory_mb();
+
+  // number of threads for parallel loop of experiments
+  parallel_experiments_ = omp_get_num_threads();
+
+  if (explicit_parallelization_)
+    return;
+
+  // Check for trivial parallelization conditions
+  switch (method_) {
+  case Method::statevector:
+  case Method::stabilizer:
+  case Method::unitary:
+  case Method::matrix_product_state: {
+    if (circ.shots == 1 || num_process_per_experiment_ > 1 ||
+        (!noise.has_quantum_errors() && check_measure_sampling_opt(circ))) {
+      parallel_shots_ = 1;
+      parallel_state_update_ =
+          std::max({1, max_parallel_threads_ / parallel_experiments_});
+      return;
+    }
+    break;
+  }
+  case Method::density_matrix:
+  case Method::superop:
+  case Method::tensor_network: {
+    if (circ.shots == 1 || num_process_per_experiment_ > 1 ||
+        check_measure_sampling_opt(circ)) {
+      parallel_shots_ = 1;
+      parallel_state_update_ =
+          std::max({1, max_parallel_threads_ / parallel_experiments_});
+      return;
+    }
+    break;
+  }
+  case Method::extended_stabilizer:
+    break;
+  default:
+    throw std::invalid_argument(
+        "Cannot set parallelization for unresolved method.");
+  }
+
+  // Use a local variable to not override stored maximum based
+  // on currently executed circuits
+  const auto max_shots =
+      (max_parallel_shots_ > 0)
+          ? std::min({max_parallel_shots_, max_parallel_threads_})
+          : max_parallel_threads_;
+
+  // If we are executing circuits in parallel we disable
+  // parallel shots
+  if (max_shots == 1 || parallel_experiments_ > 1) {
+    parallel_shots_ = 1;
+  } else {
+    // Parallel shots is > 1
+    // Limit parallel shots by available memory and number of shots
+    // And assign the remaining threads to state update
+    int circ_memory_mb =
+        required_memory_mb(circ, noise) / num_process_per_experiment_;
+    size_t mem_size =
+        (sim_device_ == Device::GPU) ? max_gpu_memory_mb_ : max_memory_mb_;
+    if (mem_size < circ_memory_mb)
+      throw std::runtime_error(
+          "a circuit requires more memory than max_memory_mb.");
+    // If circ memory is 0, set it to 1 so that we don't divide by zero
+    circ_memory_mb = std::max({1, circ_memory_mb});
+
+    int shots = circ.shots;
+    parallel_shots_ = std::min(
+        {static_cast<int>(mem_size / (circ_memory_mb * 2)), max_shots, shots});
+  }
+  parallel_state_update_ =
+      (parallel_shots_ > 1)
+          ? std::max({1, max_parallel_threads_ / parallel_shots_})
+          : std::max({1, max_parallel_threads_ / parallel_experiments_});
+}
+
+template <class state_t>
+void Executor<state_t>::run_circuit(Circuit &circ,
+                                    const Noise::NoiseModel &noise,
+                                    const Config &config, const Method method,
+                                    const Device device,
+                                    ExperimentResult &result) {
+  // Start individual circuit timer
+  auto timer_start = myclock_t::now(); // state circuit timer
+
+  // Execute in try block so we can catch errors and return the error message
+  // for individual circuit failures.
+  try {
+    // set configuration
+    method_ = method;
+    sim_device_ = device;
+
+    set_config(config);
+    set_parallelization(circ, noise);
+
+    // Rng engine (this one is used to add noise on circuit)
+    RngEngine rng;
+    rng.set_seed(circ.seed);
+
+    // Output data container
+    result.set_config(config);
+    result.metadata.add(method_names_.at(method), "method");
+    if (sim_device_ == Device::GPU)
+      result.metadata.add("GPU", "device");
+    else if (sim_device_ == Device::ThrustCPU)
+      result.metadata.add("Thrust", "device");
+    else
+      result.metadata.add("CPU", "device");
+
+    // Circuit qubit metadata
+    result.metadata.add(circ.num_qubits, "num_qubits");
+    result.metadata.add(circ.num_memory, "num_clbits");
+    result.metadata.add(circ.qubits(), "active_input_qubits");
+    result.metadata.add(circ.qubit_map(), "input_qubit_map");
+    result.metadata.add(circ.remapped_qubits, "remapped_qubits");
+
+    // Add measure sampling to metadata
+    // Note: this will set to `true` if sampling is enabled for the circuit
+    result.metadata.add(false, "measure_sampling");
+    result.metadata.add(false, "batched_shots_optimization");
+
+    // Validate gateset and memory requirements, raise exception if they're
+    // exceeded
+    validate_state(circ, noise, true);
+
+    has_statevector_ops_ = has_statevector_ops(circ);
+
+    if (circ.num_qubits > 0) { // do nothing for query steps
+      // Choose execution method based on noise and method
+      Circuit opt_circ;
+      bool noise_sampling = false;
+
+      // Ideal circuit
+      if (noise.is_ideal()) {
+        opt_circ = circ;
+        result.metadata.add("ideal", "noise");
+      }
+      // Readout error only
+      else if (noise.has_quantum_errors() == false) {
+        opt_circ = noise.sample_noise(circ, rng);
+        result.metadata.add("readout", "noise");
+      }
+      // Superop noise sampling
+      else if (method == Method::density_matrix || method == Method::superop ||
+               (method == Method::tensor_network && !has_statevector_ops_)) {
+        // Sample noise using SuperOp method
+        opt_circ =
+            noise.sample_noise(circ, rng, Noise::NoiseModel::Method::superop);
+        result.metadata.add("superop", "noise");
+      }
+      // Kraus noise sampling
+      else if (noise.opset().contains(Operations::OpType::kraus) ||
+               noise.opset().contains(Operations::OpType::superop)) {
+        opt_circ =
+            noise.sample_noise(circ, rng, Noise::NoiseModel::Method::kraus);
+        result.metadata.add("kraus", "noise");
+      }
+      // General circuit noise sampling
+      else {
+        noise_sampling = true;
+        result.metadata.add("circuit", "noise");
+      }
+
+      if (noise_sampling) {
+        run_circuit_shots(circ, noise, config, rng, result, true);
+      } else {
+        // Run multishot simulation without noise sampling
+        bool can_sample = opt_circ.can_sample;
+        can_sample &= check_measure_sampling_opt(opt_circ);
+
+        if (can_sample)
+          run_circuit_with_sampling(opt_circ, config, rng, result);
+        else
+          run_circuit_shots(opt_circ, noise, config, rng, result, false);
+      }
+    }
+    // Report success
+    result.status = ExperimentResult::Status::completed;
+
+    // Pass through circuit header and add metadata
+    result.header = circ.header;
+    result.shots = circ.shots;
+    result.seed = circ.seed;
+    result.metadata.add(parallel_shots_, "parallel_shots");
+    result.metadata.add(parallel_state_update_, "parallel_state_update");
+#ifdef AER_CUSTATEVEC
+    if (sim_device_ == Device::GPU)
+      result.metadata.add(cuStateVec_enable_, "cuStateVec_enable");
+#endif
+    if (sim_device_ == Device::GPU)
+      result.metadata.add(target_gpus_, "target_gpus");
+
+    // Add timer data
+    auto timer_stop = myclock_t::now(); // stop timer
+    double time_taken =
+        std::chrono::duration<double>(timer_stop - timer_start).count();
+    result.time_taken = time_taken;
+  }
+  // If an exception occurs during execution, catch it and pass it to the
+  // output
+  catch (std::exception &e) {
+    result.status = ExperimentResult::Status::error;
+    result.message = e.what();
+  }
+}
+
+template <class state_t>
+void Executor<state_t>::run_circuit_with_sampling(Circuit &circ,
+                                                  const Config &config,
+                                                  RngEngine &init_rng,
+                                                  ExperimentResult &result) {
+  state_t state;
+
+  // Optimize circuit
+  Noise::NoiseModel dummy_noise;
+
+  auto fusion_pass = transpile_fusion(circ.opset(), config);
+  fusion_pass.optimize_circuit(circ, dummy_noise, state.opset(), result);
+
+  auto max_bits = get_max_matrix_qubits(circ);
+
+  // Set state config
+  state.set_config(config);
+  state.set_parallelization(parallel_state_update_);
+  state.set_global_phase(circ.global_phase_angle);
+
+  state.set_distribution(1);
+  state.set_max_matrix_qubits(max_bits);
+
+  RngEngine rng = init_rng;
+
+  auto first_meas = circ.first_measure_pos; // Position of first measurement op
+  bool final_ops = (first_meas == circ.ops.size());
+
+  // allocate qubit register
+#ifdef AER_CUSTATEVEC
+  state.enable_cuStateVec(cuStateVec_enable_);
+#endif
+  state.allocate(circ.num_qubits, circ.num_qubits);
+  state.set_num_global_qubits(circ.num_qubits);
+  state.enable_density_matrix(!has_statevector_ops_);
+
+  // Run circuit instructions before first measure
+  state.initialize_qreg(circ.num_qubits);
+  state.initialize_creg(circ.num_memory, circ.num_registers);
+
+  state.apply_ops(circ.ops.cbegin(), circ.ops.cbegin() + first_meas, result,
+                  rng, final_ops);
+
+  // Get measurement operations and set of measured qubits
+  measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots,
+                  state, result, rng);
+
+  // Add measure sampling metadata
+  result.metadata.add(true, "measure_sampling");
+
+  state.add_metadata(result);
+}
+
+template <class state_t>
+void Executor<state_t>::run_circuit_shots(
+    Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
+    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+
+  // insert runtime noise sample ops here
+  int_t par_shots = (int_t)get_max_parallel_shots(circ, noise);
+  par_shots = std::min((int_t)parallel_shots_, par_shots);
+  std::vector<ExperimentResult> par_results(par_shots);
+
+  uint_t num_shots = circ.shots;
+  uint_t seed_begin = circ.seed;
+
+  // MPI distribution settings
+  std::vector<ClassicalRegister> cregs;
+  reg_t shot_begin(distributed_procs_);
+  reg_t shot_end(distributed_procs_);
+  for (int_t i = 0; i < distributed_procs_; i++) {
+    shot_begin[i] = circ.shots * i / distributed_procs_;
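+    // block-partition of shots across processes: process i owns shots
+    // [shots*i/procs, shots*(i+1)/procs); e.g. 10 shots on 4 processes
+    // gives local shot counts 2, 3, 2 and 3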
+    shot_end[i] = circ.shots * (i + 1) / distributed_procs_;
+  }
+  num_shots = shot_end[distributed_rank_] - shot_begin[distributed_rank_];
+  seed_begin += shot_begin[distributed_rank_];
+  cregs.resize(circ.shots);
+
+  int max_matrix_qubits;
+  auto fusion_pass = transpile_fusion(circ.opset(), config);
+  if (!sample_noise) {
+    Noise::NoiseModel dummy_noise;
+    state_t dummy_state;
+    fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                                 result);
+    max_matrix_qubits = get_max_matrix_qubits(circ);
+  } else {
+    max_matrix_qubits = get_max_matrix_qubits(circ);
+    max_matrix_qubits = std::max(max_matrix_qubits, (int)fusion_pass.max_qubit);
+  }
+
+  // run each shot
+  auto run_circuit_lambda = [this, &par_results, circ, noise, config, par_shots,
+                             sample_noise, num_shots, seed_begin, shot_begin,
+                             &cregs, init_rng, max_matrix_qubits,
+                             fusion_pass](int_t i) {
+    state_t state;
+    uint_t i_shot, shot_end;
+    i_shot = num_shots * i / par_shots;
+    shot_end = num_shots * (i + 1) / par_shots;
+
+    // Set state config
+    state.set_config(config);
+    state.set_parallelization(this->parallel_state_update_);
+    state.set_global_phase(circ.global_phase_angle);
+    state.enable_density_matrix(!has_statevector_ops_);
+
+    state.set_distribution(this->num_process_per_experiment_);
+    state.set_num_global_qubits(circ.num_qubits);
+    state.set_max_matrix_qubits(max_matrix_qubits);
+#ifdef AER_CUSTATEVEC
+    state.enable_cuStateVec(cuStateVec_enable_);
+#endif
+    state.allocate(circ.num_qubits, circ.num_qubits);
+
+    for (; i_shot < shot_end; i_shot++) {
+      RngEngine rng;
+      if (i_shot == 0)
+        rng = init_rng;
+      else
+        rng.set_seed(seed_begin + i_shot);
+
+      state.initialize_qreg(circ.num_qubits);
+      state.initialize_creg(circ.num_memory, circ.num_registers);
+
+      if (sample_noise) {
+        Circuit circ_opt;
+        Noise::NoiseModel dummy_noise;
+        circ_opt = noise.sample_noise(circ, rng);
+        fusion_pass.optimize_circuit(circ_opt, dummy_noise, state.opset(),
+                                     par_results[i]);
+        state.apply_ops(circ_opt.ops.cbegin(), circ_opt.ops.cend(),
+                        par_results[i], rng, true);
+      } else {
+        state.apply_ops(circ.ops.cbegin(), circ.ops.cend(), par_results[i],
+                        rng, true);
+      }
+      if (distributed_procs_ > 1) {
+        // save creg to be gathered
+        cregs[shot_begin[distributed_rank_] + i_shot] = state.creg();
+      } else {
+        par_results[i].save_count_data(state.creg(), save_creg_memory_);
+      }
+    }
+    state.add_metadata(par_results[i]);
+  };
+  Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots,
+                                run_circuit_lambda);
+
+  // gather cregs on MPI processes and save to result
+#ifdef AER_MPI
+  if (num_process_per_experiment_ > 1) {
+    gather_creg_memory(cregs, shot_begin);
+
+    // save cregs to result
+    num_shots = circ.shots;
+    auto save_cregs = [this, &par_results, par_shots, num_shots,
+                       cregs](int_t i) {
+      uint_t i_shot, shot_end;
+      i_shot = num_shots * i / par_shots;
+      shot_end = num_shots * (i + 1) / par_shots;
+
+      for (; i_shot < shot_end; i_shot++) {
+        par_results[i].save_count_data(cregs[i_shot], save_creg_memory_);
+      }
+    };
+    Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, save_cregs,
+                                  par_shots);
+  }
+#endif
+
+  for (auto &res : par_results) {
+    result.combine(std::move(res));
+  }
+#ifdef AER_CUSTATEVEC
+  if (sim_device_ == Device::GPU) {
+    result.metadata.add(cuStateVec_enable_, "cuStateVec_enable");
+    if (par_shots >= num_gpus_)
+      result.metadata.add(num_gpus_, "gpu_parallel_shots_");
+    else
+      result.metadata.add(par_shots, "gpu_parallel_shots_");
+  }
+#endif
+}
+
+template <class state_t>
+template <typename InputIterator>
+void Executor<state_t>::measure_sampler(InputIterator first_meas,
first_meas,
+                                            InputIterator last_meas,
+                                            uint_t shots, state_t &state,
+                                            ExperimentResult &result,
+                                            RngEngine &rng) const {
+  // Check if meas_circ is empty, and if so return initial creg
+  if (first_meas == last_meas) {
+    while (shots-- > 0) {
+      result.save_count_data(state.creg(), save_creg_memory_);
+    }
+    return;
+  }
+
+  std::vector<Operations::Op> meas_ops;
+  std::vector<Operations::Op> roerror_ops;
+  for (auto op = first_meas; op != last_meas; op++) {
+    if (op->type == Operations::OpType::roerror) {
+      roerror_ops.push_back(*op);
+    } else { /*(op.type == Operations::OpType::measure) */
+      meas_ops.push_back(*op);
+    }
+  }
+
+  // Get the measured qubits from the circuit, then sort and delete duplicates
+  std::vector<uint_t> meas_qubits; // measured qubits
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j)
+      meas_qubits.push_back(op.qubits[j]);
+  }
+  sort(meas_qubits.begin(), meas_qubits.end());
+  meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()),
+                    meas_qubits.end());
+
+  // Generate the samples
+  auto timer_start = myclock_t::now();
+  std::vector<reg_t> all_samples;
+  all_samples = state.sample_measure(meas_qubits, shots, rng);
+  auto time_taken =
+      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
+  result.metadata.add(time_taken, "sample_measure_time");
+
+  // Make qubit map of position in vector of measured qubits
+  std::unordered_map<uint_t, uint_t> qubit_map;
+  for (uint_t j = 0; j < meas_qubits.size(); ++j) {
+    qubit_map[meas_qubits[j]] = j;
+  }
+
+  // Maps of memory and register to qubit position
+  std::map<uint_t, uint_t> memory_map;
+  std::map<uint_t, uint_t> register_map;
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j) {
+      auto pos = qubit_map[op.qubits[j]];
+      if (!op.memory.empty())
+        memory_map[op.memory[j]] = pos;
+      if (!op.registers.empty())
+        register_map[op.registers[j]] = pos;
+    }
+  }
+
+  // Process samples
+  uint_t num_memory =
+      (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
+  uint_t num_registers =
+      (register_map.empty()) ?
0ULL : 1 + register_map.rbegin()->first;
+  ClassicalRegister creg;
+  for (int_t i = 0; i < all_samples.size(); i++) {
+    creg.initialize(num_memory, num_registers);
+
+    // process memory bit measurements
+    for (const auto &pair : memory_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}),
+                         reg_t({pair.first}), reg_t());
+    }
+    // process register bit measurements
+    for (const auto &pair : register_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
+                         reg_t({pair.first}));
+    }
+
+    // process read out errors for memory and registers
+    for (const Operations::Op &roerror : roerror_ops)
+      creg.apply_roerror(roerror, rng);
+
+    // Save count data
+    result.save_count_data(creg, save_creg_memory_);
+  }
+}
+
+template <class state_t>
+bool Executor<state_t>::validate_state(const Circuit &circ,
+                                       const Noise::NoiseModel &noise,
+                                       bool throw_except) const {
+  std::stringstream error_msg;
+  std::string circ_name;
+  state_t state;
+
+  JSON::get_value(circ_name, "name", circ.header);
+
+  // Check if a circuit is valid for state ops
+  bool circ_valid = state.opset().contains(circ.opset());
+  if (throw_except && !circ_valid) {
+    error_msg << "Circuit " << circ_name << " contains invalid instructions ";
+    error_msg << state.opset().difference(circ.opset());
+    error_msg << " for \"" << state.name() << "\" method.";
+  }
+
+  // Check if a noise model is valid for state ops
+  bool noise_valid = noise.is_ideal() || state.opset().contains(noise.opset());
+  if (throw_except && !noise_valid) {
+    error_msg << "Noise model contains invalid instructions ";
+    error_msg << state.opset().difference(noise.opset());
+    error_msg << " for \"" << state.name() << "\" method.";
+  }
+
+  // Validate memory requirements
+  bool memory_valid = true;
+  if (max_memory_mb_ > 0) {
+    size_t required_mb = state.required_memory_mb(circ.num_qubits, circ.ops) /
+                         num_process_per_experiment_;
+    size_t mem_size = (sim_device_ == Device::GPU)
+                          ?
max_memory_mb_ + max_gpu_memory_mb_
+                          : max_memory_mb_;
+    memory_valid = (required_mb <= mem_size);
+    if (throw_except && !memory_valid) {
+      error_msg << "Insufficient memory to run circuit " << circ_name;
+      error_msg << " using the " << state.name() << " simulator.";
+      error_msg << " Required memory: " << required_mb
+                << "M, max memory: " << max_memory_mb_ << "M";
+      if (sim_device_ == Device::GPU) {
+        error_msg << " (Host) + " << max_gpu_memory_mb_ << "M (GPU)";
+      }
+    }
+  }
+
+  if (noise_valid && circ_valid && memory_valid) {
+    return true;
+  }
+
+  // One of the validation checks failed for the current state
+  if (throw_except) {
+    throw std::runtime_error(error_msg.str());
+  }
+  return false;
+}
+
+template <class state_t>
+Transpile::Fusion
+Executor<state_t>::transpile_fusion(const Operations::OpSet &opset,
+                                    const Config &config) const {
+  Transpile::Fusion fusion_pass;
+  fusion_pass.set_parallelization(parallel_state_update_);
+
+  if (opset.contains(Operations::OpType::superop)) {
+    fusion_pass.allow_superop = true;
+  }
+  if (opset.contains(Operations::OpType::kraus)) {
+    fusion_pass.allow_kraus = true;
+  }
+  switch (method_) {
+  case Method::density_matrix:
+  case Method::superop: {
+    // Halve the default threshold and max fused qubits for density matrix
+    fusion_pass.threshold /= 2;
+    fusion_pass.max_qubit /= 2;
+    break;
+  }
+  case Method::matrix_product_state: {
+    fusion_pass.active = false;
+    return fusion_pass; // Do not allow the config to set active for MPS
+  }
+  case Method::statevector: {
+    if (fusion_pass.allow_kraus) {
+      // Halve default max fused qubits for Kraus noise fusion
+      fusion_pass.max_qubit /= 2;
+    }
+    break;
+  }
+  case Method::unitary: {
+    // max_qubit is the same as for statevector
+    fusion_pass.threshold /= 2;
+    break;
+  }
+  case Method::tensor_network: {
+    if (opset.contains(Operations::OpType::save_statevec) ||
+        opset.contains(Operations::OpType::save_statevec_dict)) {
+      if (fusion_pass.allow_kraus) {
+        // Halve default max fused qubits for Kraus noise fusion
+        fusion_pass.max_qubit /= 2;
+      }
+    } else {
+      // Halve the default threshold and max fused qubits for density matrix
+      fusion_pass.threshold /= 2;
+      fusion_pass.max_qubit /= 2;
+    }
+    break;
+  }
+  default: {
+    fusion_pass.active = false;
+    return fusion_pass;
+  }
+  }
+  // Override default fusion settings with custom config
+  fusion_pass.set_config(config);
+  return fusion_pass;
+}
+
+template <class state_t>
+bool Executor<state_t>::check_measure_sampling_opt(const Circuit &circ) const {
+  // Check if circuit has sampling flag disabled
+  if (circ.can_sample == false) {
+    return false;
+  }
+
+  // For the density matrix, unitary, and superop methods all supported
+  // instructions allow sampling
+  if (method_ == Method::density_matrix || method_ == Method::superop ||
+      method_ == Method::unitary) {
+    return true;
+  }
+  if (method_ == Method::tensor_network) {
+    // if there are no save statevec ops, tensor network simulator runs as
+    // density matrix simulator
+    if ((!circ.opset().contains(Operations::OpType::save_statevec)) &&
+        (!circ.opset().contains(Operations::OpType::save_statevec_dict))) {
+      return true;
+    }
+  }
+
+  // If circuit contains a non-initial initialize that is not a full width
+  // instruction we can't sample
+  if (circ.can_sample_initialize == false) {
+    return false;
+  }
+
+  // Check if this is a non-density-matrix simulation and the circuit
+  // contains a stochastic instruction before measurement,
+  // i.e. reset, kraus, superop
+  // TODO:
+  // * Resets should be allowed if applied to |0> state (no gates before).
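+  // (Illustrative note, not part of the patch) e.g. a circuit of unitary
+  // gates followed only by terminal measurements passes every check here and
+  // is sampled from a single final state, while a mid-circuit reset, kraus,
+  // or superop instruction falls through to one-run-per-shot execution.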
+ if (circ.opset().contains(Operations::OpType::reset) || + circ.opset().contains(Operations::OpType::kraus) || + circ.opset().contains(Operations::OpType::superop) || + circ.opset().contains(Operations::OpType::jump) || + circ.opset().contains(Operations::OpType::mark)) { + return false; + } + // Otherwise true + return true; +} + +template +int_t Executor::get_matrix_bits(const Operations::Op &op) const { + int_t bit = 1; + if (op.type == Operations::OpType::matrix || + op.type == Operations::OpType::diagonal_matrix || + op.type == Operations::OpType::initialize) + bit = op.qubits.size(); + else if (op.type == Operations::OpType::kraus || + op.type == Operations::OpType::superop) { + if (method_ == Method::density_matrix) + bit = op.qubits.size() * 2; + else + bit = op.qubits.size(); + } + return bit; +} + +template +int_t Executor::get_max_matrix_qubits(const Circuit &circ) const { + int_t max_bits = 0; + int_t i; + + if (sim_device_ != Device::CPU) { // Only applicable for GPU (and Thrust) + for (i = 0; i < circ.ops.size(); i++) { + int_t bit = 1; + bit = get_matrix_bits(circ.ops[i]); + max_bits = std::max(max_bits, bit); + } + } + return max_bits; +} + +template +bool Executor::has_statevector_ops(const Circuit &circ) const { + return circ.opset().contains(Operations::OpType::save_statevec) || + circ.opset().contains(Operations::OpType::save_statevec_dict) || + circ.opset().contains(Operations::OpType::save_amps); +} + +#ifdef AER_MPI +template +void Executor::gather_creg_memory( + std::vector &cregs, reg_t &shot_index) { + int_t i, j; + uint_t n64, i64, ibit, num_local_shots; + + if (distributed_procs_ == 0) + return; + if (cregs.size() == 0) + return; + int_t size = cregs[0].memory_size(); + if (size == 0) + return; + + if (distributed_rank_ == distributed_procs_ - 1) + num_local_shots = cregs.size() - shot_index[distributed_rank_]; + else + num_local_shots = + shot_index[distributed_rank_ + 1] - shot_index[distributed_rank_]; + + // number of 64-bit integers per memory + n64 = (size + 63) >> 6; + + reg_t bin_memory(n64 * num_local_shots, 0); + // compress memory string to binary +#pragma omp parallel for private(i, j, i64, ibit) + for (i = 0; i < num_local_shots; i++) { + for (j = 0; j < size; j++) { + i64 = j >> 6; + ibit = j & 63; + if (cregs[shot_index[distributed_rank_] + i].creg_memory()[j] == '1') { + bin_memory[i * n64 + i64] |= (1ull << ibit); + } + } + } + + reg_t recv(n64 * cregs.size()); + std::vector recv_counts(distributed_procs_); + std::vector recv_offset(distributed_procs_); + + for (i = 0; i < distributed_procs_ - 1; i++) { + recv_offset[i] = shot_index[i]; + recv_counts[i] = shot_index[i + 1] - shot_index[i]; + } + recv_offset[distributed_procs_ - 1] = shot_index[distributed_procs_ - 1]; + recv_counts[i] = cregs.size() - shot_index[distributed_procs_ - 1]; + + MPI_Allgatherv(&bin_memory[0], n64 * num_local_shots, MPI_UINT64_T, &recv[0], + &recv_counts[0], &recv_offset[0], MPI_UINT64_T, + distributed_comm_); + + // store gathered memory +#pragma omp parallel for private(i, j, i64, ibit) + for (i = 0; i < cregs.size(); i++) { + for (j = 0; j < size; j++) { + i64 = j >> 6; + ibit = j & 63; + if (((recv[i * n64 + i64] >> ibit) & 1) == 1) + cregs[i].creg_memory()[j] = '1'; + else + cregs[i].creg_memory()[j] = '0'; + } + } +} +#endif + +//------------------------------------------------------------------------- +} // end namespace CircuitExecutor +//------------------------------------------------------------------------- +} // end namespace AER 
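The `gather_creg_memory` implementation above compresses each classical-memory bit string into 64-bit words before the `MPI_Allgatherv` call, then unpacks the gathered words on every rank. A minimal standalone sketch of just that packing scheme, assuming the same `(size + 63) >> 6` word-count convention; `pack_memory` and `unpack_memory` are names invented here for illustration, not part of the patch:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Pack a '0'/'1' memory string into 64-bit words, low bit first,
// using the same (size + 63) >> 6 word count as gather_creg_memory.
std::vector<uint64_t> pack_memory(const std::string &mem) {
  std::vector<uint64_t> words((mem.size() + 63) >> 6, 0);
  for (std::size_t j = 0; j < mem.size(); j++)
    if (mem[j] == '1')
      words[j >> 6] |= (1ull << (j & 63));
  return words;
}

// Inverse transform: recover the character string from the packed words.
std::string unpack_memory(const std::vector<uint64_t> &words,
                          std::size_t size) {
  std::string mem(size, '0');
  for (std::size_t j = 0; j < size; j++)
    if ((words[j >> 6] >> (j & 63)) & 1)
      mem[j] = '1';
  return mem;
}

int main() {
  const std::string mem = "10110010";
  auto words = pack_memory(mem);
  std::cout << unpack_memory(words, mem.size()) << "\n"; // prints 10110010
}
```

Packing cuts the gathered payload by a factor of eight relative to sending the raw one-byte-per-bit character strings.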
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/density_matrix/densitymatrix.hpp b/src/simulators/density_matrix/densitymatrix.hpp
old mode 100644
new mode 100755
index d574bef6f6..cdbc6c8336
--- a/src/simulators/density_matrix/densitymatrix.hpp
+++ b/src/simulators/density_matrix/densitymatrix.hpp
@@ -59,6 +59,11 @@ class DensityMatrix : public UnitaryMatrix<data_t> {
   // Initializes the current vector so that all qubits are in the |0> state.
   void initialize();
 
+  // initialize from existing state (copy)
+  void initialize(const DensityMatrix &obj) {
+    BaseMatrix::initialize(obj);
+  }
+
   // Initializes the vector to a custom initial state.
   // The vector can be either a statevector or a vectorized density matrix
   // If the length of the data vector does not match either case for the
diff --git a/src/simulators/density_matrix/densitymatrix_executor.hpp b/src/simulators/density_matrix/densitymatrix_executor.hpp
new file mode 100644
index 0000000000..d656a6f9a0
--- /dev/null
+++ b/src/simulators/density_matrix/densitymatrix_executor.hpp
@@ -0,0 +1,1408 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _densitymatrix_executor_hpp_
+#define _densitymatrix_executor_hpp_
+
+#include "simulators/batch_shots_executor.hpp"
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace DensityMatrix {
+
+//-------------------------------------------------------------------------
+// batched-shots executor for density matrix
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
+                 public CircuitExecutor::BatchShotsExecutor<state_t> {
+  using Base = CircuitExecutor::MultiStateExecutor<state_t>;
+  using BasePar = CircuitExecutor::ParallelStateExecutor<state_t>;
+  using BaseBatch = CircuitExecutor::BatchShotsExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+protected:
+  void set_config(const Config &config) override;
+
+  bool shot_branching_supported(void) override { return true; }
+
+  // apply parallel operations
+  bool apply_parallel_op(const Operations::Op &op, ExperimentResult &result,
+                         RngEngine &rng, bool final_op) override;
+
+  // apply op to multiple shots; return false if the op is not supported for
+  // execution in a batch
+  bool apply_batched_op(const int_t istate, const Operations::Op &op,
+                        ExperimentResult &result, std::vector<RngEngine> &rng,
+                        bool final_op = false) override;
+
+  bool apply_branching_op(CircuitExecutor::Branch &root,
+                          const Operations::Op &op, ExperimentResult &result,
+                          bool final_op) override;
+
+  // Initializes an n-qubit state to the all |0> state
+  void initialize_qreg(uint_t num_qubits) override;
+
+  auto move_to_matrix();
+  auto copy_to_matrix();
+
+  template <typename list_t>
+  void initialize_from_vector(const list_t &vec);
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool
sample_noise) override; + + bool allocate_states(uint_t num_states, const Config &config) override { + return BasePar::allocate_states(num_states, config); + } + //----------------------------------------------------------------------- + // Apply instructions + //----------------------------------------------------------------------- + + // Measure qubits and return a list of outcomes [q0, q1, ...] + // If a state subclass supports this function it then "measure" + // should be contained in the set returned by the 'allowed_ops' + // method. + void apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng); + + // Reset the specified qubits to the |0> state by tracing out qubits + void apply_reset(const reg_t &qubits); + + // Apply a Kraus error operation + void apply_kraus(const reg_t &qubits, const std::vector &kraus); + + //----------------------------------------------------------------------- + // Save data instructions + //----------------------------------------------------------------------- + + // Save the current full density matrix + void apply_save_state(const Operations::Op &op, ExperimentResult &result, + bool last_op = false); + + // Save the current density matrix or reduced density matrix + void apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result, + bool last_op = false); + + // Helper function for computing expectation value + void apply_save_probs(const Operations::Op &op, ExperimentResult &result); + + // Helper function for saving amplitudes squared + void apply_save_amplitudes_sq(const Operations::Op &op, + ExperimentResult &result); + + // Helper function for computing expectation value + virtual double expval_pauli(const reg_t &qubits, + const std::string &pauli) override; + + // Return the reduced density matrix for the simulator + cmatrix_t reduced_density_matrix(const reg_t &qubits, bool last_op = false); + cmatrix_t reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted); + + //----------------------------------------------------------------------- + // Measurement Helpers + //----------------------------------------------------------------------- + + // Return vector of measure probabilities for specified qubits + // If a state subclass supports this function it then "measure" + // should be contained in the set returned by the 'allowed_ops' + // method. + rvector_t measure_probs(const reg_t &qubits) const; + + // Sample the measurement outcome for qubits + // return a pair (m, p) of the outcome m, and its corresponding + // probability p. 
+ // Outcome is given as an int: Eg for two-qubits {q0, q1} we have + // 0 -> |q1 = 0, q0 = 0> state + // 1 -> |q1 = 0, q0 = 1> state + // 2 -> |q1 = 1, q0 = 0> state + // 3 -> |q1 = 1, q0 = 1> state + std::pair sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng); + + void measure_reset_update(const std::vector &qubits, + const uint_t final_state, const uint_t meas_state, + const double meas_prob); + + // Sample n-measurement outcomes without applying the measure operation + // to the system state + std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) const override; + + rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits); + void measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs); + void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits, + const reg_t &cmemory, const reg_t &cregister); + + std::vector sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const override; + + //----------------------------------------------------------------------- + // Functions for multi-chunk distribution + //----------------------------------------------------------------------- + // swap between chunks + void apply_chunk_swap(const reg_t &qubits) override; + + // apply multiple swaps between chunks + void apply_multi_chunk_swap(const reg_t &qubits) override; + + // scale for density matrix = 2 + // this function is used in the base class to scale chunk qubits for + // multi-chunk distribution + uint_t qubit_scale(void) override { return 2; } +}; + +//------------------------------------------------------------------------- +// Initialization +//------------------------------------------------------------------------- +template +void Executor::initialize_qreg(uint_t num_qubits) { + for (int_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].qreg().set_num_qubits(BasePar::chunk_bits_); + } + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + if (Base::global_state_index_ + iChunk == 0) { + Base::states_[iChunk].qreg().initialize(); + } else { + Base::states_[iChunk].qreg().zero(); + } + } + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) { + if (Base::global_state_index_ + i == 0) { + Base::states_[i].qreg().initialize(); + } else { + Base::states_[i].qreg().zero(); + } + } + } +} + +template +template +void Executor::initialize_from_vector(const list_t &vec) { + if ((1ull << (Base::num_qubits_ * 2)) == vec.size()) { + BasePar::initialize_from_vector(vec); + } else if ((1ull << (Base::num_qubits_ * 2)) == vec.size() * vec.size()) { + int_t iChunk; + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + uint_t irow_chunk = ((iChunk + Base::global_state_index_) >> + ((Base::num_qubits_ - BasePar::chunk_bits_))) + << (BasePar::chunk_bits_); + uint_t icol_chunk = + ((iChunk + Base::global_state_index_) & + ((1ull << ((Base::num_qubits_ - BasePar::chunk_bits_))) - 1)) + << (BasePar::chunk_bits_); + + // copy part of state for this chunk + uint_t i, row, col; + list_t vec1(1ull << 
BasePar::chunk_bits_); + list_t vec2(1ull << BasePar::chunk_bits_); + + for (i = 0; i < (1ull << BasePar::chunk_bits_); i++) { + vec1[i] = vec[(irow_chunk << BasePar::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BasePar::chunk_bits_) + i]); + } + Base::states_[iChunk].qreg().initialize_from_vector( + AER::Utils::tensor_product(vec1, vec2)); + } + } + } else { + for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) { + uint_t irow_chunk = ((iChunk + Base::global_state_index_) >> + ((Base::num_qubits_ - BasePar::chunk_bits_))) + << (BasePar::chunk_bits_); + uint_t icol_chunk = + ((iChunk + Base::global_state_index_) & + ((1ull << ((Base::num_qubits_ - BasePar::chunk_bits_))) - 1)) + << (BasePar::chunk_bits_); + + // copy part of state for this chunk + uint_t i, row, col; + list_t vec1(1ull << BasePar::chunk_bits_); + list_t vec2(1ull << BasePar::chunk_bits_); + + for (i = 0; i < (1ull << BasePar::chunk_bits_); i++) { + vec1[i] = vec[(irow_chunk << BasePar::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BasePar::chunk_bits_) + i]); + } + Base::states_[iChunk].qreg().initialize_from_vector( + AER::Utils::tensor_product(vec1, vec2)); + } + } + } else { + throw std::runtime_error( + "DensityMatrixChunk::initialize input vector is incorrect length. " + "Expected: " + + std::to_string((1ull << (Base::num_qubits_ * 2))) + + " Received: " + std::to_string(vec.size())); + } +} + +template +auto Executor::move_to_matrix() { + return BasePar::apply_to_matrix(false); +} + +template +auto Executor::copy_to_matrix() { + return BasePar::apply_to_matrix(true); +} + +//------------------------------------------------------------------------- +// Utility +//------------------------------------------------------------------------- + +template +void Executor::set_config(const Config &config) { + BasePar::set_config(config); + BaseBatch::set_config(config); +} + +template +void Executor::run_circuit_shots( + Circuit &circ, const Noise::NoiseModel &noise, const Config &config, + RngEngine &init_rng, ExperimentResult &result, bool sample_noise) { + state_t dummy_state; + if (BasePar::multiple_chunk_required(circ, noise)) { + return BasePar::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } else { + return BaseBatch::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } +} + +//========================================================================= +// Implementation: apply operations +//========================================================================= + +template +bool Executor::apply_parallel_op(const Operations::Op &op, + ExperimentResult &result, + RngEngine &rng, bool final_ops) { + if (Base::states_[0].creg().check_conditional(op)) { + switch (op.type) { + case Operations::OpType::reset: + apply_reset(op.qubits); + break; + case Operations::OpType::measure: + apply_measure(op.qubits, op.memory, op.registers, rng); + break; + case Operations::OpType::bfunc: + BasePar::apply_bfunc(op); + break; + case Operations::OpType::roerror: + BasePar::apply_roerror(op, rng); + break; + case Operations::OpType::kraus: + apply_kraus(op.qubits, op.mats); + break; + case Operations::OpType::set_statevec: + initialize_from_vector(op.params); + break; + case Operations::OpType::set_densmat: + BasePar::initialize_from_matrix(op.mats[0]); + break; + case Operations::OpType::save_expval: + case Operations::OpType::save_expval_var: + BasePar::apply_save_expval(op, result); + break; + case Operations::OpType::save_state: + 
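+      // (Illustrative note, not part of the patch) full-state saves are
+      // validated against num_qubits_ and remapped from "single" to
+      // "average" data subtypes inside apply_save_state further below.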
apply_save_state(op, result, final_ops); + break; + case Operations::OpType::save_densmat: + apply_save_density_matrix(op, result, final_ops); + break; + case Operations::OpType::save_probs: + case Operations::OpType::save_probs_ket: + apply_save_probs(op, result); + break; + case Operations::OpType::save_amps_sq: + apply_save_amplitudes_sq(op, result); + break; + default: + return false; + } + } + return true; +} + +template +bool Executor::apply_batched_op(const int_t istate, + const Operations::Op &op, + ExperimentResult &result, + std::vector &rng, + bool final_op) { + if (op.conditional) { + Base::states_[istate].qreg().set_conditional(op.conditional_reg); + } + + switch (op.type) { + case Operations::OpType::barrier: + case Operations::OpType::nop: + case Operations::OpType::qerror_loc: + break; + case Operations::OpType::reset: + Base::states_[istate].apply_reset(op.qubits); + break; + case Operations::OpType::measure: + Base::states_[istate].qreg().apply_batched_measure(op.qubits, rng, + op.memory, op.registers); + break; + case Operations::OpType::bfunc: + Base::states_[istate].qreg().apply_bfunc(op); + break; + case Operations::OpType::roerror: + Base::states_[istate].qreg().apply_roerror(op, rng); + break; + case Operations::OpType::gate: + Base::states_[istate].apply_gate(op); + break; + case Operations::OpType::matrix: + Base::states_[istate].apply_matrix(op.qubits, op.mats[0]); + break; + case Operations::OpType::diagonal_matrix: + Base::states_[istate].apply_diagonal_unitary_matrix(op.qubits, op.params); + break; + case Operations::OpType::superop: + Base::states_[istate].qreg().apply_superop_matrix( + op.qubits, Utils::vectorize_matrix(op.mats[0])); + break; + case Operations::OpType::kraus: + Base::states_[istate].apply_kraus(op.qubits, op.mats); + break; + default: + // other operations should be called to indivisual chunks by apply_op + return false; + } + return true; +} + +template +bool Executor::apply_branching_op(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result, + bool final_op) { + RngEngine dummy; + if (Base::states_[root.state_index()].creg().check_conditional(op)) { + switch (op.type) { + // ops with branching + // case Operations::OpType::reset: + // apply_reset(root, op.qubits); + // break; + case Operations::OpType::measure: + apply_measure(root, op.qubits, op.memory, op.registers); + break; + // save ops + case Operations::OpType::save_expval: + case Operations::OpType::save_expval_var: + case Operations::OpType::save_state: + case Operations::OpType::save_densmat: + case Operations::OpType::save_probs: + case Operations::OpType::save_probs_ket: + case Operations::OpType::save_amps_sq: + // call save functions in state class + Base::states_[root.state_index()].apply_op(op, result, dummy, final_op); + break; + default: + return false; + } + } + return true; +} + +//========================================================================= +// Implementation: Save data +//========================================================================= + +template +void Executor::apply_save_probs(const Operations::Op &op, + ExperimentResult &result) { + auto probs = measure_probs(op.qubits); + if (op.type == Operations::OpType::save_probs_ket) { + result.save_data_average( + Base::states_[0].creg(), op.string_params[0], + Utils::vec2ket(probs, Base::json_chop_threshold_, 16), op.type, + op.save_type); + } else { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(probs), op.type, op.save_type); + } 
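+  // (Illustrative note, not part of the patch) save_probs stores the dense
+  // probability vector, while save_probs_ket keeps only the nonzero entries
+  // keyed by outcome label via Utils::vec2ket, dropping values below
+  // json_chop_threshold_.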
+} + +template +void Executor::apply_save_amplitudes_sq(const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes_sq instructions (empty params)."); + } + const int_t size = op.int_params.size(); + rvector_t amps_sq(size); + + int_t iChunk; +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) private(iChunk) + for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + iChunk) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + iChunk) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow != icol) + continue; + + for (int_t i = 0; i < size; ++i) { + uint_t idx = BasePar::mapped_index(op.int_params[i]); + if (idx >= (irow << BasePar::chunk_bits_) && + idx < ((irow + 1) << BasePar::chunk_bits_)) + amps_sq[i] = Base::states_[iChunk].qreg().probability( + idx - (irow << BasePar::chunk_bits_)); + } + } +#ifdef AER_MPI + BasePar::reduce_sum(amps_sq); +#endif + + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); +} + +template +double Executor::expval_pauli(const reg_t &qubits, + const std::string &pauli) { + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + std::string pauli_in_chunk; + std::string pauli_out_chunk; + int_t i, n; + double expval(0.); + + // get inner/outer chunk pauli string + n = pauli.size(); + for (i = 0; i < n; i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + pauli_in_chunk.push_back(pauli[n - i - 1]); + } else { + qubits_out_chunk.push_back(qubits[i]); + pauli_out_chunk.push_back(pauli[n - i - 1]); + } + } + + int_t nrows = 1ull << ((Base::num_qubits_ - BasePar::chunk_bits_)); + + if (qubits_out_chunk.size() > 0) { // there are bits out of chunk + std::complex phase = 1.0; + + std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); + std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); + + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = + AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); + + z_mask >>= (BasePar::chunk_bits_); + if (x_mask != 0) { + x_mask >>= (BasePar::chunk_bits_); + x_max -= (BasePar::chunk_bits_); + + AER::QV::add_y_phase(num_y, phase); + + const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); + const uint_t mask_l = (1ull << x_max) - 1; + + for (i = 0; i < nrows / 2; i++) { + uint_t irow = ((i << 1) & mask_u) | (i & mask_l); + uint_t iChunk = (irow ^ x_mask) + irow * nrows; + + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli_non_diagonal_chunk( + qubits_in_chunk, pauli_in_chunk, phase); + } + } + } else { + for (i = 0; i < nrows; i++) { + uint_t iChunk = i * (nrows + 1); + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += + sign * Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli(qubits_in_chunk, pauli_in_chunk, 1.0); + } + } + } + } else { // all bits are inside chunk + 
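+    // (Illustrative note, not part of the patch) when every Pauli term acts
+    // inside a chunk, only the diagonal blocks iChunk = i * (nrows + 1) of
+    // the chunked density matrix contribute to Tr[P * rho], so each block is
+    // evaluated locally and the partial sums are reduced over MPI below.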
for (i = 0; i < nrows; i++) { + uint_t iChunk = i * (nrows + 1); + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + expval += Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli(qubits, pauli, 1.0); + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(expval); +#endif + return expval; +} + +template +void Executor::apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + reduced_density_matrix(op.qubits, last_op), op.type, + op.save_type); +} + +template +void Executor::apply_save_state(const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + " was not applied to all qubits." + " Only the full state can be saved."); + } + // Renamp single data type to average + Operations::DataSubType save_type; + switch (op.save_type) { + case Operations::DataSubType::single: + save_type = Operations::DataSubType::average; + break; + case Operations::DataSubType::c_single: + save_type = Operations::DataSubType::c_average; + break; + default: + save_type = op.save_type; + } + + // Default key + std::string key = (op.string_params[0] == "_method_") ? "density_matrix" + : op.string_params[0]; + if (last_op) { + result.save_data_average(Base::states_[0].creg(), key, move_to_matrix(), + Operations::OpType::save_densmat, save_type); + } else { + result.save_data_average(Base::states_[0].creg(), key, copy_to_matrix(), + Operations::OpType::save_densmat, save_type); + } +} + +template +cmatrix_t Executor::reduced_density_matrix(const reg_t &qubits, + bool last_op) { + cmatrix_t reduced_state; + + // Check if tracing over all qubits + if (qubits.empty()) { + reduced_state = cmatrix_t(1, 1); + std::complex sum = 0.0; + for (int_t i = 0; i < Base::states_.size(); i++) { + sum += Base::states_[i].qreg().trace(); + } +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + reduced_state[0] = sum; + } else { + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + if ((qubits.size() == Base::num_qubits_) && (qubits == qubits_sorted)) { + if (last_op) { + reduced_state = move_to_matrix(); + } else { + reduced_state = copy_to_matrix(); + } + } else { + reduced_state = reduced_density_matrix_helper(qubits, qubits_sorted); + } + } + return reduced_state; +} + +template +cmatrix_t +Executor::reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted) { + int_t iChunk; + uint_t size = 1ull << (BasePar::chunk_bits_ * 2); + uint_t mask = (1ull << (BasePar::chunk_bits_)) - 1; + uint_t num_threads = Base::states_[0].qreg().get_omp_threads(); + + size_t size_required = + (sizeof(std::complex) << (qubits.size() * 2)) + + (sizeof(std::complex) << (BasePar::chunk_bits_ * 2)) * + Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store density matrix")); + } + cmatrix_t reduced_state(1ull << qubits.size(), 1ull << qubits.size(), true); + + if (Base::distributed_rank_ == 0) { + auto tmp = Base::states_[0].copy_to_matrix(); + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + int_t i; + uint_t irow_chunk = + (iChunk >> ((Base::num_qubits_ - BasePar::chunk_bits_))) + << BasePar::chunk_bits_; + uint_t icol_chunk = + (iChunk & 
+ ((1ull << ((Base::num_qubits_ - BasePar::chunk_bits_))) - 1)) + << BasePar::chunk_bits_; + + if (iChunk < Base::num_local_states_) + tmp = Base::states_[iChunk].qreg().copy_to_matrix(); +#ifdef AER_MPI + else + BasePar::recv_data(tmp.data(), size, 0, iChunk); +#endif +#pragma omp parallel for if (num_threads > 1) num_threads(num_threads) + for (i = 0; i < size; i++) { + uint_t irow = (i >> (BasePar::chunk_bits_)) + irow_chunk; + uint_t icol = (i & mask) + icol_chunk; + uint_t irow_out = 0; + uint_t icol_out = 0; + int j; + for (j = 0; j < qubits.size(); j++) { + if ((irow >> qubits[j]) & 1) { + irow &= ~(1ull << qubits[j]); + irow_out += (1ull << j); + } + if ((icol >> qubits[j]) & 1) { + icol &= ~(1ull << qubits[j]); + icol_out += (1ull << j); + } + } + if (irow == icol) { // only diagonal base can be reduced + uint_t idx = ((irow_out) << qubits.size()) + icol_out; +#pragma omp critical + reduced_state[idx] += tmp[i]; + } + } + } + } else { +#ifdef AER_MPI + // send matrices to process 0 + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + uint_t iProc = BasePar::get_process_by_chunk(iChunk); + if (iProc == Base::distributed_rank_) { + auto tmp = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .copy_to_matrix(); + BasePar::send_data(tmp.data(), size, iChunk, 0); + } + } +#endif + } + + return reduced_state; +} + +//========================================================================= +// Implementation: Reset and Measurement Sampling +//========================================================================= + +template +void Executor::apply_measure(const reg_t &qubits, + const reg_t &cmemory, + const reg_t &cregister, + RngEngine &rng) { + // Actual measurement outcome + const auto meas = sample_measure_with_prob(qubits, rng); + // Implement measurement update + measure_reset_update(qubits, meas.first, meas.first, meas.second); + const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); + BasePar::store_measure(outcome, cmemory, cregister); +} + +template +rvector_t Executor::measure_probs(const reg_t &qubits) const { + uint_t dim = 1ull << qubits.size(); + rvector_t sum(dim, 0.0); + int_t i, j, k; + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + for (i = 0; i < qubits.size(); i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + } else { + qubits_out_chunk.push_back(qubits[i]); + } + } + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i, j, k) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + + if (irow == icol) { // diagonal chunk + if (qubits_in_chunk.size() > 0) { + auto chunkSum = + Base::states_[i].qreg().probabilities(qubits_in_chunk); + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { +#pragma omp atomic + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < (BasePar::chunk_bits_)) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << (BasePar::chunk_bits_)) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } +#pragma omp atomic + sum[idx] += 
chunkSum[j]; + } + } + } else { // there is no bit in chunk + auto tr = std::real(Base::states_[i].qreg().trace()); + int idx = 0; + for (k = 0; k < qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) + << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += tr; + } + } + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + + if (irow == icol) { // diagonal chunk + if (qubits_in_chunk.size() > 0) { + auto chunkSum = + Base::states_[i].qreg().probabilities(qubits_in_chunk); + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < (BasePar::chunk_bits_)) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << (BasePar::chunk_bits_)) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } + } + } else { // there is no bit in chunk + auto tr = std::real(Base::states_[i].qreg().trace()); + int idx = 0; + for (k = 0; k < qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } + sum[idx] += tr; + } + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + + return sum; +} + +template +void Executor::apply_reset(const reg_t &qubits) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + Base::states_[iChunk].qreg().apply_reset(qubits); + } + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_reset(qubits); + } +} + +template +std::pair +Executor::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); + // Randomly pick outcome and return pair + uint_t outcome = rng.rand_int(probs); + return std::make_pair(outcome, probs[outcome]); +} + +template +void Executor::measure_reset_update(const reg_t &qubits, + const uint_t final_state, + const uint_t meas_state, + const double meas_prob) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + cvector_t mdiag(2, 0.); + mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + + // If it doesn't agree with the reset state update + if (final_state != meas_state) { + if (qubits[0] < BasePar::chunk_bits_) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_x(qubits[0]); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_x(qubits[0]); + } + } else { + BasePar::apply_chunk_x(qubits[0]); + BasePar::apply_chunk_x(qubits[0] + BasePar::chunk_bits_); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + cvector_t mdiag(dim, 0.); + mdiag[meas_state] = 1. / std::sqrt(meas_prob); + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + + // If it doesn't agree with the reset state update + // TODO This function could be optimized as a permutation update + if (final_state != meas_state) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != meas_state) + perm[j * dim + j] = 1.; + } + // apply permutation to swap state + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + for (int_t i = 0; i < qubits.size(); i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + } else { + qubits_out_chunk.push_back(qubits[i]); + } + } + if (qubits_in_chunk.size() > 0) { // in chunk exchange + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_unitary_matrix(qubits, perm); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_unitary_matrix(qubits, perm); + } + } + if (qubits_out_chunk.size() > 0) { // out of chunk exchange + for (int_t i = 0; i < qubits_out_chunk.size(); i++) { + BasePar::apply_chunk_x(qubits_out_chunk[i]); + BasePar::apply_chunk_x(qubits_out_chunk[i] + + (Base::num_qubits_ - BasePar::chunk_bits_)); + } + } + } + } +} + +template +std::vector Executor::sample_measure(const reg_t &qubits, + uint_t shots, + RngEngine &rng) const { + // Generate flat register for storing + std::vector rnds; + rnds.reserve(shots); + for (uint_t i = 0; i < shots; ++i) + rnds.push_back(rng.rand(0, 1)); + 
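+  // (Illustrative note, not part of the patch) one uniform draw in [0,1)
+  // per shot; the per-chunk trace prefix sums built below route each draw to
+  // the diagonal chunk whose cumulative-probability interval contains it,
+  // and the outcome is then sampled locally inside that chunk.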
reg_t allbit_samples(shots, 0); + + int_t i, j; + std::vector chunkSum(Base::states_.size() + 1, 0); + double sum, localSum; + // calculate per chunk sum + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow == icol) // only diagonal chunk has probabilities + chunkSum[i] = std::real(Base::states_[i].qreg().trace()); + else + chunkSum[i] = 0.0; + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow == icol) // only diagonal chunk has probabilities + chunkSum[i] = std::real(Base::states_[i].qreg().trace()); + else + chunkSum[i] = 0.0; + } + } + localSum = 0.0; + for (i = 0; i < Base::states_.size(); i++) { + sum = localSum; + localSum += chunkSum[i]; + chunkSum[i] = sum; + } + chunkSum[Base::states_.size()] = localSum; + + double globalSum = 0.0; + if (Base::nprocs_ > 1) { + std::vector procTotal(Base::nprocs_); + + for (i = 0; i < Base::nprocs_; i++) { + procTotal[i] = localSum; + } + BasePar::gather_value(procTotal); + + for (i = 0; i < Base::myrank_; i++) { + globalSum += procTotal[i]; + } + } + + reg_t local_samples(shots, 0); + + // get rnds positions for each chunk + for (i = 0; i < Base::states_.size(); i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow != icol) + continue; + + uint_t nIn; + std::vector vIdx; + std::vector vRnd; + + // find rnds in this chunk + nIn = 0; + for (j = 0; j < shots; j++) { + if (rnds[j] >= chunkSum[i] + globalSum && + rnds[j] < chunkSum[i + 1] + globalSum) { + vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); + vIdx.push_back(j); + nIn++; + } + } + + if (nIn > 0) { + auto chunkSamples = Base::states_[i].qreg().sample_measure(vRnd); + uint_t ir; + ir = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + + for (j = 0; j < chunkSamples.size(); j++) { + local_samples[vIdx[j]] = (ir << BasePar::chunk_bits_) + chunkSamples[j]; + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(local_samples); +#endif + allbit_samples = local_samples; + + // Convert to reg_t format + std::vector all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + return all_samples; +} + +template +rvector_t +Executor::sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = + Base::states_[root.state_index()].qreg().probabilities(qubits); + uint_t nshots = root.num_shots(); + reg_t shot_branch(nshots); + + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = root.rng_shots()[i].rand_int(probs); + } + + // branch shots + root.creg() = 
Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, probs.size()); + + return probs; +} + +template +void Executor::measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + for (int_t i = 0; i < 2; i++) { + cvector_t mdiag(2, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + Operations::Op op; + op.type = OpType::gate; + op.name = "x"; + op.qubits = qubits; + root.branches()[i]->add_op_after_branch(op); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + for (int_t i = 0; i < dim; i++) { + cvector_t mdiag(dim, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + i] = 1.; + perm[i * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != i) + perm[j * dim + j] = 1.; + } + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(Utils::devectorize_matrix(perm)); + root.branches()[i]->add_op_after_branch(op); + } + } + } +} + +template +void Executor::apply_measure(CircuitExecutor::Branch &root, + const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + // save result to cregs + for (int_t i = 0; i < probs.size(); i++) { + const reg_t outcome = Utils::int2reg(i, 2, qubits.size()); + root.branches()[i]->creg().store_measure(outcome, cmemory, cregister); + } + + measure_reset_update(root, qubits, -1, probs); +} +/* +template +void Executor::apply_reset(CircuitExecutor::Branch& root, const +reg_t &qubits) +{ + rvector_t probs = sample_measure_with_prob(root, qubits); + + measure_reset_update(root, qubits, 0, probs); +} +*/ + +template +std::vector +Executor::sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const { + int_t i, j; + std::vector rnds; + rnds.reserve(shots); + + /* + double norm = std::real( state.qreg().trace() ); + std::cout << " trace = " << norm< all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + return all_samples; +} + +//========================================================================= +// Implementation: Kraus Noise +//========================================================================= + +template +void Executor::apply_kraus(const reg_t &qubits, + const std::vector &kmats) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { 
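+    // (Illustrative note, not part of the patch) for a density matrix the
+    // whole Kraus channel rho -> sum_k K_k rho K_k^dag is applied in one
+    // step as its superoperator, built by Utils::kraus_superop and
+    // vectorized, instead of being sampled stochastically per shot.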
+#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + Base::states_[iChunk].qreg().apply_superop_matrix( + qubits, Utils::vectorize_matrix(Utils::kraus_superop(kmats))); + } + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_superop_matrix( + qubits, Utils::vectorize_matrix(Utils::kraus_superop(kmats))); + } +} + +//----------------------------------------------------------------------- +// Functions for multi-chunk distribution +//----------------------------------------------------------------------- +// swap between chunks +template +void Executor::apply_chunk_swap(const reg_t &qubits) { + uint_t q0, q1; + q0 = qubits[0]; + q1 = qubits[1]; + + std::swap(BasePar::qubit_map_[q0], BasePar::qubit_map_[q1]); + + if (qubits[0] >= BasePar::chunk_bits_) { + q0 += BasePar::chunk_bits_; + } + if (qubits[1] >= BasePar::chunk_bits_) { + q1 += BasePar::chunk_bits_; + } + reg_t qs0 = {{q0, q1}}; + BasePar::apply_chunk_swap(qs0); + + if (qubits[0] >= BasePar::chunk_bits_) { + q0 += (Base::num_qubits_ - BasePar::chunk_bits_); + } else { + q0 += BasePar::chunk_bits_; + } + if (qubits[1] >= BasePar::chunk_bits_) { + q1 += (Base::num_qubits_ - BasePar::chunk_bits_); + } else { + q1 += BasePar::chunk_bits_; + } + reg_t qs1 = {{q0, q1}}; + BasePar::apply_chunk_swap(qs1); +} + +template +void Executor::apply_multi_chunk_swap(const reg_t &qubits) { + reg_t qubits_density; + + for (int_t i = 0; i < qubits.size(); i += 2) { + uint_t q0, q1; + q0 = qubits[i * 2]; + q1 = qubits[i * 2 + 1]; + + std::swap(BasePar::qubit_map_[q0], BasePar::qubit_map_[q1]); + + if (q1 >= BasePar::chunk_bits_) { + q1 += BasePar::chunk_bits_; + } + qubits_density.push_back(q0); + qubits_density.push_back(q1); + + q0 += BasePar::chunk_bits_; + if (q1 >= BasePar::chunk_bits_) { + q1 += (Base::num_qubits_ - BasePar::chunk_bits_ * 2); + } + } + + BasePar::apply_multi_chunk_swap(qubits_density); +} + +//------------------------------------------------------------------------- +} // end namespace DensityMatrix +//------------------------------------------------------------------------- +} // end namespace AER +//------------------------------------------------------------------------- +#endif diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp index ab8e3b4fd3..a5bfa46585 100644 --- a/src/simulators/density_matrix/densitymatrix_state.hpp +++ b/src/simulators/density_matrix/densitymatrix_state.hpp @@ -23,7 +23,8 @@ #include "framework/json.hpp" #include "framework/opset.hpp" #include "framework/utils.hpp" -#include "simulators/state_chunk.hpp" +#include "simulators/chunk_utils.hpp" +#include "simulators/state.hpp" #ifdef AER_THRUST_SUPPORTED #include "densitymatrix_thrust.hpp" #endif @@ -90,9 +91,9 @@ enum class Gates { //========================================================================= template > -class State : public QuantumState::StateChunk { +class State : public QuantumState::State { public: - using BaseState = QuantumState::StateChunk; + using BaseState = QuantumState::State; State() : BaseState(StateOpSet) {} virtual ~State() = default; @@ -102,32 +103,35 @@ class State : public QuantumState::StateChunk { //----------------------------------------------------------------------- // Return the string name of the State class - virtual std::string name() const override { return 
densmat_t::name(); } + std::string name() const override { return densmat_t::name(); } // Apply an operation // If the op is not in allowed_ops an exeption will be raised. - virtual void apply_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, RngEngine &rng, - bool final_op = false) override; + void apply_op(const Operations::Op &op, ExperimentResult &result, + RngEngine &rng, bool final_op = false) override; + + // memory allocation (previously called before inisitalize_qreg) + bool allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots = 1) override; // Initializes an n-qubit state to the all |0> state - virtual void initialize_qreg(uint_t num_qubits) override; + void initialize_qreg(uint_t num_qubits) override; // Returns the required memory for storing an n-qubit state in megabytes. // For this state the memory is indepdentent of the number of ops // and is approximately 16 * 1 << num_qubits bytes - virtual size_t + size_t required_memory_mb(uint_t num_qubits, const std::vector &ops) const override; // Load the threshold for applying OpenMP parallelization // if the controller/engine allows threads for it - virtual void set_config(const Config &config) override; + void set_config(const Config &config) override; // Sample n-measurement outcomes without applying the measure operation // to the system state - virtual std::vector sample_measure(const reg_t &qubits, uint_t shots, - RngEngine &rng) override; + std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) override; //----------------------------------------------------------------------- // Additional methods @@ -139,96 +143,80 @@ class State : public QuantumState::StateChunk { // Initialize OpenMP settings for the underlying DensityMatrix class void initialize_omp(); - auto move_to_matrix(const int_t iChunk); - auto copy_to_matrix(const int_t iChunk); + auto move_to_matrix(); + auto copy_to_matrix(); -protected: template - void initialize_from_vector(const int_t iChunk, const list_t &vec); + void initialize_from_vector(const list_t &vec); //----------------------------------------------------------------------- // Apply instructions //----------------------------------------------------------------------- - // apply op to multiple shots , return flase if op is not supported to execute - // in a batch - bool apply_batched_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, std::vector &rng, - bool final_op = false) override; - // Applies a sypported Gate operation to the state class. // If the input is not in allowed_gates an exeption will be raised. - void apply_gate(const int_t iChunk, const Operations::Op &op); + void apply_gate(const Operations::Op &op); // apply (multi) control gate by statevector - void apply_gate_statevector(const int_t iChunk, const Operations::Op &op); + void apply_gate_statevector(const Operations::Op &op); // Measure qubits and return a list of outcomes [q0, q1, ...] // If a state subclass supports this function it then "measure" // should be contained in the set returned by the 'allowed_ops' // method. 
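   // NOTE (illustrative, not part of the patch): the iChunk parameter is
   // removed from the signatures below because a State now owns exactly one
   // chunk; iteration over chunks moved into the Executor classes above.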
- virtual void apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, const reg_t &cregister, - RngEngine &rng); + virtual void apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng); // Reset the specified qubits to the |0> state by tracing out qubits - void apply_reset(const int_t iChunk, const reg_t &qubits); + void apply_reset(const reg_t &qubits); // Apply a matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const reg_t &qubits, - const cmatrix_t &mat); + void apply_matrix(const reg_t &qubits, const cmatrix_t &mat); // Apply a vectorized matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const reg_t &qubits, - const cvector_t &vmat); + void apply_matrix(const reg_t &qubits, const cvector_t &vmat); // apply diagonal matrix - void apply_diagonal_unitary_matrix(const int_t iChunk, const reg_t &qubits, + void apply_diagonal_unitary_matrix(const reg_t &qubits, const cvector_t &diag); // Apply a Kraus error operation - void apply_kraus(const int_t iChunk, const reg_t &qubits, - const std::vector &kraus); + void apply_kraus(const reg_t &qubits, const std::vector &kraus); // Apply an N-qubit Pauli gate - void apply_pauli(const int_t iChunk, const reg_t &qubits, - const std::string &pauli); + void apply_pauli(const reg_t &qubits, const std::string &pauli); // apply phase - void apply_phase(const int_t iChunk, const uint_t qubit, - const complex_t phase); - void apply_phase(const int_t iChunk, const reg_t &qubits, - const complex_t phase); + void apply_phase(const uint_t qubit, const complex_t phase); + void apply_phase(const reg_t &qubits, const complex_t phase); +protected: //----------------------------------------------------------------------- // Save data instructions //----------------------------------------------------------------------- // Save the current full density matrix - void apply_save_state(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, bool last_op = false); + void apply_save_state(const Operations::Op &op, ExperimentResult &result, + bool last_op = false); // Save the current density matrix or reduced density matrix - void apply_save_density_matrix(const int_t iChunk, const Operations::Op &op, + void apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result, bool last_op = false); // Helper function for computing expectation value - void apply_save_probs(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result); + void apply_save_probs(const Operations::Op &op, ExperimentResult &result); // Helper function for saving amplitudes squared - void apply_save_amplitudes_sq(const int_t iChunk, const Operations::Op &op, + void apply_save_amplitudes_sq(const Operations::Op &op, ExperimentResult &result); // Helper function for computing expectation value - virtual double expval_pauli(const int_t iChunk, const reg_t &qubits, + virtual double expval_pauli(const reg_t &qubits, const std::string &pauli) override; // Return the reduced density matrix for the simulator - cmatrix_t reduced_density_matrix(const int_t iChunk, const reg_t &qubits, - bool last_op = false); - cmatrix_t reduced_density_matrix_helper(const int_t iChunk, - const reg_t &qubits, + cmatrix_t reduced_density_matrix(const reg_t &qubits, bool last_op = false); + cmatrix_t reduced_density_matrix_helper(const reg_t &qubits, const reg_t &qubits_sorted); 
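// -----------------------------------------------------------------------
// Aside: with the iChunk argument gone, the save-data helpers above act on
// the single local qreg_. A rough standalone sketch of the partial trace
// behind reduced_density_matrix, on a dense row-major matrix; the names
// cplx and partial_trace_msb are illustrative only, not part of this patch.
#include <complex>
#include <cstddef>
#include <vector>

using cplx = std::complex<double>;

// Trace out the most-significant qubit of an n-qubit density matrix rho
// (2^n x 2^n, row-major), producing a 2^(n-1) x 2^(n-1) matrix.
inline std::vector<cplx> partial_trace_msb(const std::vector<cplx> &rho,
                                           std::size_t n_qubits) {
  const std::size_t dim = std::size_t(1) << n_qubits;
  const std::size_t sub = dim >> 1;
  std::vector<cplx> out(sub * sub, cplx(0.0, 0.0));
  for (std::size_t b = 0; b < 2; ++b)       // value of the traced qubit
    for (std::size_t i = 0; i < sub; ++i)   // row of the reduced matrix
      for (std::size_t j = 0; j < sub; ++j) // column of the reduced matrix
        out[i * sub + j] += rho[(b * sub + i) * dim + (b * sub + j)];
  return out;
}
// Tracing out every qubit this way leaves a 1x1 matrix holding the trace,
// which matches the qubits.empty() branch of the implementation further on.
// -----------------------------------------------------------------------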
//----------------------------------------------------------------------- @@ -240,7 +228,7 @@ class State : public QuantumState::StateChunk { // should be contained in the set returned by the 'allowed_ops' // method. // TODO: move to private (no longer part of base class) - rvector_t measure_probs(const int_t iChunk, const reg_t &qubits) const; + rvector_t measure_probs(const reg_t &qubits) const; // Sample the measurement outcome for qubits // return a pair (m, p) of the outcome m, and its corresponding @@ -250,12 +238,10 @@ class State : public QuantumState::StateChunk { // 1 -> |q1 = 0, q0 = 1> state // 2 -> |q1 = 1, q0 = 0> state // 3 -> |q1 = 1, q0 = 1> state - std::pair sample_measure_with_prob(const int_t iChunk, - const reg_t &qubits, + std::pair sample_measure_with_prob(const reg_t &qubits, RngEngine &rng); - void measure_reset_update(const int_t iChunk, - const std::vector &qubits, + void measure_reset_update(const std::vector &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob); @@ -264,8 +250,8 @@ class State : public QuantumState::StateChunk { //----------------------------------------------------------------------- // Apply a waltz gate specified by parameters u3(theta, phi, lambda) - void apply_gate_u3(const int_t iChunk, const uint_t qubit, const double theta, - const double phi, const double lambda); + void apply_gate_u3(const uint_t qubit, const double theta, const double phi, + const double lambda); //----------------------------------------------------------------------- // Config Settings @@ -281,20 +267,6 @@ class State : public QuantumState::StateChunk { // Table of allowed gate names to gate enum class members const static stringmap_t gateset_; - - // scale for density matrix = 2 - // this function is used in the base class to scale chunk qubits for - // multi-chunk distribution - int qubit_scale(void) override { return 2; } - - //----------------------------------------------------------------------- - // Functions for multi-chunk distribution - //----------------------------------------------------------------------- - // swap between chunks - void apply_chunk_swap(const reg_t &qubits) override; - - // apply multiple swaps between chunks - void apply_multi_chunk_swap(const reg_t &qubits) override; }; //========================================================================= @@ -356,41 +328,22 @@ const stringmap_t State::gateset_({ //------------------------------------------------------------------------- template void State::initialize_qreg(uint_t num_qubits) { - if (BaseState::qregs_.size() == 0) - BaseState::allocate(num_qubits, num_qubits, 1); initialize_omp(); - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_num_qubits(BaseState::chunk_bits_); - } + BaseState::qreg_.set_num_qubits(num_qubits); + BaseState::qreg_.initialize(); +} - if (BaseState::multi_chunk_distribution_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - if (BaseState::global_chunk_index_ + iChunk == 0) { - BaseState::qregs_[iChunk].initialize(); - } else { - BaseState::qregs_[iChunk].zero(); - } - } - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - if (BaseState::global_chunk_index_ + i == 0) { - BaseState::qregs_[i].initialize(); - } else { - BaseState::qregs_[i].zero(); - } - 
} - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].initialize(); - } - } +template +bool State::allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots) { + if (BaseState::max_matrix_qubits_ > 0) + BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_); + + BaseState::qreg_.set_target_gpus(BaseState::target_gpus_); + BaseState::qreg_.chunk_setup(block_bits * 2, block_bits * 2, 0, 1); + + return true; } template @@ -401,159 +354,33 @@ void State::initialize_qreg(uint_t num_qubits, densmat_t &&state) { "initial state does not match qubit number"); } - if (BaseState::qregs_.size() == 1) { - BaseState::qregs_[0] = std::move(state); - } else { - initialize_omp(); - for (int_t iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); - } - - if (BaseState::multi_chunk_distribution_) { - auto matrix = state.move_to_matrix(); - uint_t size = 1ull << (BaseState::chunk_bits_ * 2); - uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; - - auto copy_matrix_to_chunks_lambda = [this, &matrix, size, - mask](int_t ig) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - uint_t irow_chunk = - ((iChunk + BaseState::global_chunk_index_) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - << (BaseState::chunk_bits_); - uint_t icol_chunk = - ((iChunk + BaseState::global_chunk_index_) & - ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - - 1)) - << (BaseState::chunk_bits_); - - auto sub_mat = - BaseState::qregs_[iChunk] - .copy_to_matrix(); // allocate sub-matrix by copying data type - // and storage from chunk - for (int_t i = 0; i < size; i++) { - uint_t irow = (i >> (BaseState::chunk_bits_)) + irow_chunk; - uint_t icol = (i & mask) + icol_chunk; - sub_mat[i] = matrix[(irow << BaseState::num_qubits_) + icol]; - } - BaseState::qregs_[iChunk].initialize_from_vector(sub_mat); - } - }; - Utils::apply_omp_parallel_for( - (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0), 0, - BaseState::num_groups_, copy_matrix_to_chunks_lambda); - } else { - auto mat = state.copy_to_matrix(); - for (int_t iChunk = 0; iChunk < BaseState::qregs_.size() - 1; iChunk++) { - BaseState::qregs_[iChunk].initialize_from_vector(mat); - } - BaseState::qregs_[BaseState::qregs_.size() - 1] = std::move(state); - } - } + BaseState::qreg_ = std::move(state); } template void State::initialize_omp() { uint_t i; - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_omp_threshold(omp_qubit_threshold_); - if (BaseState::threads_ > 0) - BaseState::qregs_[i].set_omp_threads( - BaseState::threads_); // set allowed OMP threads in qubitvector - } + BaseState::qreg_.set_omp_threshold(omp_qubit_threshold_); + if (BaseState::threads_ > 0) + BaseState::qreg_.set_omp_threads( + BaseState::threads_); // set allowed OMP threads in qubitvector } template template -void State::initialize_from_vector(const int_t iChunkIn, - const list_t &vec) { - if ((1ull << (BaseState::num_qubits_ * 2)) == vec.size()) { - BaseState::initialize_from_vector(iChunkIn, vec); - } else if ((1ull << (BaseState::num_qubits_ * 2)) == - vec.size() * vec.size()) { - int_t iChunk; - if (BaseState::multi_chunk_distribution_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = 
BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - uint_t irow_chunk = - ((iChunk + BaseState::global_chunk_index_) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & - ((1ull << ((BaseState::num_qubits_ - - BaseState::chunk_bits_))) - - 1)) - << (BaseState::chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - list_t vec1(1ull << BaseState::chunk_bits_); - list_t vec2(1ull << BaseState::chunk_bits_); - - for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) { - vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; - vec2[i] = - std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); - } - BaseState::qregs_[iChunk].initialize_from_vector( - AER::Utils::tensor_product(vec1, vec2)); - } - } - } else { - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - uint_t irow_chunk = - ((iChunk + BaseState::global_chunk_index_) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - << (BaseState::chunk_bits_); - uint_t icol_chunk = - ((iChunk + BaseState::global_chunk_index_) & - ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - - 1)) - << (BaseState::chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - list_t vec1(1ull << BaseState::chunk_bits_); - list_t vec2(1ull << BaseState::chunk_bits_); - - for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) { - vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; - vec2[i] = - std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); - } - BaseState::qregs_[iChunk].initialize_from_vector( - AER::Utils::tensor_product(vec1, vec2)); - } - } - } else { - BaseState::qregs_[iChunkIn].initialize_from_vector( - AER::Utils::tensor_product(AER::Utils::conjugate(vec), vec)); - } - } else { - throw std::runtime_error( - "DensityMatrixChunk::initialize input vector is incorrect length. 
" - "Expected: " + - std::to_string((1ull << (BaseState::num_qubits_ * 2))) + - " Received: " + std::to_string(vec.size())); - } +void State::initialize_from_vector(const list_t &vec) { + BaseState::qreg_.initialize_from_vector( + AER::Utils::tensor_product(AER::Utils::conjugate(vec), vec)); } template -auto State::move_to_matrix(const int_t iChunk) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].move_to_matrix(); - return BaseState::apply_to_matrix(false); +auto State::move_to_matrix() { + return BaseState::qreg_.move_to_matrix(); } template -auto State::copy_to_matrix(const int_t iChunk) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].copy_to_matrix(); - return BaseState::apply_to_matrix(true); +auto State::copy_to_matrix() { + return BaseState::qreg_.copy_to_matrix(); } //------------------------------------------------------------------------- @@ -575,9 +402,7 @@ void State::set_config(const Config &config) { // Set threshold for truncating snapshots json_chop_threshold_ = config.chop_threshold; uint_t i; - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_json_chop_threshold(json_chop_threshold_); - } + BaseState::qreg_.set_json_chop_threshold(json_chop_threshold_); // Set OMP threshold for state update functions omp_qubit_threshold_ = config.statevector_parallel_threshold; @@ -588,64 +413,64 @@ void State::set_config(const Config &config) { //========================================================================= template -void State::apply_op(const int_t iChunk, const Operations::Op &op, +void State::apply_op(const Operations::Op &op, ExperimentResult &result, RngEngine &rng, bool final_ops) { - if (BaseState::check_conditional(iChunk, op)) { + if (BaseState::creg().check_conditional(op)) { switch (op.type) { case OpType::barrier: case OpType::qerror_loc: break; case OpType::reset: - apply_reset(iChunk, op.qubits); + apply_reset(op.qubits); break; case OpType::measure: - apply_measure(iChunk, op.qubits, op.memory, op.registers, rng); + apply_measure(op.qubits, op.memory, op.registers, rng); break; case OpType::bfunc: - BaseState::cregs_[0].apply_bfunc(op); + BaseState::creg().apply_bfunc(op); break; case OpType::roerror: - BaseState::cregs_[0].apply_roerror(op, rng); + BaseState::creg().apply_roerror(op, rng); break; case OpType::gate: - apply_gate(iChunk, op); + apply_gate(op); break; case OpType::matrix: - apply_matrix(iChunk, op.qubits, op.mats[0]); + apply_matrix(op.qubits, op.mats[0]); break; case OpType::diagonal_matrix: - apply_diagonal_unitary_matrix(iChunk, op.qubits, op.params); + apply_diagonal_unitary_matrix(op.qubits, op.params); break; case OpType::superop: - BaseState::qregs_[iChunk].apply_superop_matrix( + BaseState::qreg_.apply_superop_matrix( op.qubits, Utils::vectorize_matrix(op.mats[0])); break; case OpType::kraus: - apply_kraus(iChunk, op.qubits, op.mats); + apply_kraus(op.qubits, op.mats); break; case OpType::set_statevec: - initialize_from_vector(iChunk, op.params); + initialize_from_vector(op.params); break; case OpType::set_densmat: - BaseState::initialize_from_matrix(iChunk, op.mats[0]); + BaseState::qreg_.initialize_from_matrix(op.mats[0]); break; case OpType::save_expval: case OpType::save_expval_var: - BaseState::apply_save_expval(iChunk, op, result); + BaseState::apply_save_expval(op, result); break; case OpType::save_state: - apply_save_state(iChunk, op, result, final_ops); + apply_save_state(op, result, final_ops); break; case OpType::save_densmat: - 
apply_save_density_matrix(iChunk, op, result, final_ops); + apply_save_density_matrix(op, result, final_ops); break; case OpType::save_probs: case OpType::save_probs_ket: - apply_save_probs(iChunk, op, result); + apply_save_probs(op, result); break; case OpType::save_amps_sq: - apply_save_amplitudes_sq(iChunk, op, result); + apply_save_amplitudes_sq(op, result); break; default: throw std::invalid_argument( @@ -654,80 +479,26 @@ void State::apply_op(const int_t iChunk, const Operations::Op &op, } } -template -bool State::apply_batched_op(const int_t iChunk, - const Operations::Op &op, - ExperimentResult &result, - std::vector &rng, - bool final_ops) { - if (op.conditional) - BaseState::qregs_[iChunk].set_conditional(op.conditional_reg); - - switch (op.type) { - case OpType::barrier: - case OpType::nop: - case OpType::qerror_loc: - break; - case OpType::reset: - BaseState::qregs_[iChunk].apply_reset(op.qubits); - break; - case OpType::measure: - BaseState::qregs_[iChunk].apply_batched_measure(op.qubits, rng, op.memory, - op.registers); - break; - case OpType::bfunc: - BaseState::qregs_[iChunk].apply_bfunc(op); - break; - case OpType::roerror: - BaseState::qregs_[iChunk].apply_roerror(op, rng); - break; - case OpType::gate: - apply_gate(iChunk, op); - break; - case OpType::matrix: - apply_matrix(iChunk, op.qubits, op.mats[0]); - break; - case OpType::diagonal_matrix: - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(op.qubits, - op.params); - break; - case OpType::superop: - BaseState::qregs_[iChunk].apply_superop_matrix( - op.qubits, Utils::vectorize_matrix(op.mats[0])); - break; - case OpType::kraus: - apply_kraus(iChunk, op.qubits, op.mats); - break; - default: - // other operations should be called to indivisual chunks by apply_op - return false; - } - return true; -} - //========================================================================= // Implementation: Save data //========================================================================= template -void State::apply_save_probs(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_probs(const Operations::Op &op, ExperimentResult &result) { - auto probs = measure_probs(iChunk, op.qubits); - auto cr = this->creg(BaseState::get_global_shot_index(iChunk)); + auto probs = measure_probs(op.qubits); if (op.type == OpType::save_probs_ket) { - result.save_data_average(cr, op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], Utils::vec2ket(probs, json_chop_threshold_, 16), op.type, op.save_type); } else { - result.save_data_average(cr, op.string_params[0], std::move(probs), op.type, - op.save_type); + result.save_data_average(BaseState::creg(), op.string_params[0], + std::move(probs), op.type, op.save_type); } } template -void State::apply_save_amplitudes_sq(const int_t iChunkIn, - const Operations::Op &op, +void State::apply_save_amplitudes_sq(const Operations::Op &op, ExperimentResult &result) { if (op.int_params.empty()) { throw std::invalid_argument( @@ -736,162 +507,37 @@ void State::apply_save_amplitudes_sq(const int_t iChunkIn, const int_t size = op.int_params.size(); rvector_t amps_sq(size); - if (BaseState::multi_chunk_distribution_) { - int_t iChunk; -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) private(iChunk) - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + iChunk) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + iChunk) - 
- (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow != icol) - continue; - -#pragma omp parallel for if (size > pow(2, omp_qubit_threshold_) && \ - BaseState::threads_ > 1 && \ - !BaseState::chunk_omp_parallel_) \ - num_threads(BaseState::threads_) - for (int_t i = 0; i < size; ++i) { - uint_t idx = BaseState::mapped_index(op.int_params[i]); - if (idx >= (irow << BaseState::chunk_bits_) && - idx < ((irow + 1) << BaseState::chunk_bits_)) - amps_sq[i] = BaseState::qregs_[iChunk].probability( - idx - (irow << BaseState::chunk_bits_)); - } - } -#ifdef AER_MPI - BaseState::reduce_sum(amps_sq); -#endif - } else { #pragma omp parallel for if (size > pow(2, omp_qubit_threshold_) && \ BaseState::threads_ > 1) \ num_threads(BaseState::threads_) - for (int_t i = 0; i < size; ++i) { - amps_sq[i] = BaseState::qregs_[iChunkIn].probability(op.int_params[i]); - } + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = BaseState::qreg_.probability(op.int_params[i]); } - auto cr = this->creg(BaseState::get_global_shot_index(iChunkIn)); - result.save_data_average(cr, op.string_params[0], std::move(amps_sq), op.type, - op.save_type); + + result.save_data_average(BaseState::creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); } template -double State::expval_pauli(const int_t iChunk, const reg_t &qubits, +double State::expval_pauli(const reg_t &qubits, const std::string &pauli) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].expval_pauli(qubits, pauli); - - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - std::string pauli_in_chunk; - std::string pauli_out_chunk; - int_t i, n; - double expval(0.); - - // get inner/outer chunk pauli string - n = pauli.size(); - for (i = 0; i < n; i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - pauli_in_chunk.push_back(pauli[n - i - 1]); - } else { - qubits_out_chunk.push_back(qubits[i]); - pauli_out_chunk.push_back(pauli[n - i - 1]); - } - } - - int_t nrows = 1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - - if (qubits_out_chunk.size() > 0) { // there are bits out of chunk - std::complex phase = 1.0; - - std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); - std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); - - uint_t x_mask, z_mask, num_y, x_max; - std::tie(x_mask, z_mask, num_y, x_max) = - AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); - - z_mask >>= (BaseState::chunk_bits_); - if (x_mask != 0) { - x_mask >>= (BaseState::chunk_bits_); - x_max -= (BaseState::chunk_bits_); - - AER::QV::add_y_phase(num_y, phase); - - const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); - const uint_t mask_l = (1ull << x_max) - 1; - - for (i = 0; i < nrows / 2; i++) { - uint_t irow = ((i << 1) & mask_u) | (i & mask_l); - uint_t iChunk = (irow ^ x_mask) + irow * nrows; - - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) { // on this process - double sign = 2.0; - if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) - sign = -2.0; - expval += sign * - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli_non_diagonal_chunk(qubits_in_chunk, - pauli_in_chunk, phase); - } - } - } else { - for (i = 0; i < nrows; i++) { - uint_t iChunk = i * (nrows + 1); - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) 
{ // on this process - double sign = 1.0; - if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) - sign = -1.0; - expval += - sign * BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli(qubits_in_chunk, pauli_in_chunk, 1.0); - } - } - } - } else { // all bits are inside chunk - for (i = 0; i < nrows; i++) { - uint_t iChunk = i * (nrows + 1); - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) { // on this process - expval += BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli(qubits, pauli, 1.0); - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(expval); -#endif - return expval; + return BaseState::qreg_.expval_pauli(qubits, pauli); } template -void State::apply_save_density_matrix(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result, bool last_op) { - auto cr = this->creg(BaseState::get_global_shot_index(iChunk)); - result.save_data_average(cr, op.string_params[0], - reduced_density_matrix(iChunk, op.qubits, last_op), - op.type, op.save_type); + result.save_data_average(BaseState::creg(), op.string_params[0], + reduced_density_matrix(op.qubits, last_op), op.type, + op.save_type); } template -void State::apply_save_state(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_state(const Operations::Op &op, ExperimentResult &result, bool last_op) { - if (op.qubits.size() != BaseState::num_qubits_) { + if (op.qubits.size() != BaseState::qreg_.num_qubits()) { throw std::invalid_argument(op.name + " was not applied to all qubits." " Only the full state can be saved."); } @@ -911,164 +557,75 @@ void State::apply_save_state(const int_t iChunk, // Default key std::string key = (op.string_params[0] == "_method_") ? 
"density_matrix" : op.string_params[0]; - auto cr = this->creg(BaseState::get_global_shot_index(iChunk)); if (last_op) { - result.save_data_average(cr, key, move_to_matrix(iChunk), + result.save_data_average(BaseState::creg(), key, move_to_matrix(), OpType::save_densmat, save_type); } else { - result.save_data_average(cr, key, copy_to_matrix(iChunk), + result.save_data_average(BaseState::creg(), key, copy_to_matrix(), OpType::save_densmat, save_type); } } template -cmatrix_t State::reduced_density_matrix(const int_t iChunk, - const reg_t &qubits, +cmatrix_t State::reduced_density_matrix(const reg_t &qubits, bool last_op) { cmatrix_t reduced_state; // Check if tracing over all qubits if (qubits.empty()) { reduced_state = cmatrix_t(1, 1); - if (!BaseState::multi_chunk_distribution_) { - reduced_state[0] = BaseState::qregs_[iChunk].trace(); - } else { - std::complex sum = 0.0; - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - sum += BaseState::qregs_[i].trace(); - } -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - reduced_state[0] = sum; - } + reduced_state[0] = BaseState::qreg_.trace(); } else { auto qubits_sorted = qubits; std::sort(qubits_sorted.begin(), qubits_sorted.end()); - if ((qubits.size() == BaseState::num_qubits_) && + if ((qubits.size() == BaseState::qreg_.num_qubits()) && (qubits == qubits_sorted)) { if (last_op) { - reduced_state = move_to_matrix(iChunk); + reduced_state = move_to_matrix(); } else { - reduced_state = copy_to_matrix(iChunk); + reduced_state = copy_to_matrix(); } } else { - reduced_state = - reduced_density_matrix_helper(iChunk, qubits, qubits_sorted); + reduced_state = reduced_density_matrix_helper(qubits, qubits_sorted); } } return reduced_state; } template -cmatrix_t State::reduced_density_matrix_helper( - const int_t iChunkIn, const reg_t &qubits, const reg_t &qubits_sorted) { - if (!BaseState::multi_chunk_distribution_) { - // Get superoperator qubits - const reg_t squbits = BaseState::qregs_[iChunkIn].superop_qubits(qubits); - const reg_t squbits_sorted = - BaseState::qregs_[iChunkIn].superop_qubits(qubits_sorted); - - // Get dimensions - const size_t N = qubits.size(); - const size_t DIM = 1ULL << N; - const int_t VDIM = 1ULL << (2 * N); - const size_t END = 1ULL << (BaseState::qregs_[iChunkIn].num_qubits() - N); - const size_t SHIFT = END + 1; - - // Copy vector to host memory - auto vmat = BaseState::qregs_[iChunkIn].vector(); - cmatrix_t reduced_state(DIM, DIM, false); - { - // Fill matrix with first iteration - const auto inds = QV::indexes(squbits, squbits_sorted, 0); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] = std::move(vmat[inds[i]]); - } - } - // Accumulate with remaning blocks - for (size_t k = 1; k < END; k++) { - const auto inds = QV::indexes(squbits, squbits_sorted, k * SHIFT); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] += complex_t(std::move(vmat[inds[i]])); - } +cmatrix_t +State::reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted) { + // Get superoperator qubits + const reg_t squbits = BaseState::qreg_.superop_qubits(qubits); + const reg_t squbits_sorted = BaseState::qreg_.superop_qubits(qubits_sorted); + + // Get dimensions + const size_t N = qubits.size(); + const size_t DIM = 1ULL << N; + const int_t VDIM = 1ULL << (2 * N); + const size_t END = 1ULL << (BaseState::qreg_.num_qubits() - N); + const size_t SHIFT = END + 1; + + // Copy vector to host memory + auto vmat = BaseState::qreg_.vector(); + cmatrix_t reduced_state(DIM, DIM, false); + { + // Fill matrix with first 
iteration
+    const auto inds = QV::indexes(squbits, squbits_sorted, 0);
+    for (int_t i = 0; i < VDIM; ++i) {
+      reduced_state[i] = std::move(vmat[inds[i]]);
     }
-    return reduced_state;
-  }
-
-  int_t iChunk;
-  uint_t size = 1ull << (BaseState::chunk_bits_ * 2);
-  uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1;
-  uint_t num_threads = BaseState::qregs_[0].get_omp_threads();
-
-  size_t size_required =
-      (sizeof(std::complex<double>) << (qubits.size() * 2)) +
-      (sizeof(std::complex<double>) << (BaseState::chunk_bits_ * 2)) *
-          BaseState::num_local_chunks_;
-  if ((size_required >> 20) > Utils::get_system_memory_mb()) {
-    throw std::runtime_error(
-        std::string("There is not enough memory to store density matrix"));
   }
-  cmatrix_t reduced_state(1ull << qubits.size(), 1ull << qubits.size(), true);
-
-  if (BaseState::distributed_rank_ == 0) {
-    auto tmp = BaseState::qregs_[0].copy_to_matrix();
-    for (iChunk = 0; iChunk < BaseState::num_global_chunks_; iChunk++) {
-      int_t i;
-      uint_t irow_chunk =
-          (iChunk >> ((BaseState::num_qubits_ - BaseState::chunk_bits_)))
-          << BaseState::chunk_bits_;
-      uint_t icol_chunk =
-          (iChunk &
-           ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - 1))
-          << BaseState::chunk_bits_;
-
-      if (iChunk < BaseState::num_local_chunks_)
-        tmp = BaseState::qregs_[iChunk].copy_to_matrix();
-#ifdef AER_MPI
-      else
-        BaseState::recv_data(tmp.data(), size, 0, iChunk);
-#endif
-#pragma omp parallel for if (num_threads > 1) num_threads(num_threads)
-      for (i = 0; i < size; i++) {
-        uint_t irow = (i >> (BaseState::chunk_bits_)) + irow_chunk;
-        uint_t icol = (i & mask) + icol_chunk;
-        uint_t irow_out = 0;
-        uint_t icol_out = 0;
-        int j;
-        for (j = 0; j < qubits.size(); j++) {
-          if ((irow >> qubits[j]) & 1) {
-            irow &= ~(1ull << qubits[j]);
-            irow_out += (1ull << j);
-          }
-          if ((icol >> qubits[j]) & 1) {
-            icol &= ~(1ull << qubits[j]);
-            icol_out += (1ull << j);
-          }
-        }
-        if (irow == icol) { // only diagonal base can be reduced
-          uint_t idx = ((irow_out) << qubits.size()) + icol_out;
-#pragma omp critical
-          reduced_state[idx] += tmp[i];
-        }
-      }
-    }
-  } else {
-#ifdef AER_MPI
-    // send matrices to process 0
-    for (iChunk = 0; iChunk < BaseState::num_global_chunks_; iChunk++) {
-      uint_t iProc = BaseState::get_process_by_chunk(iChunk);
-      if (iProc == BaseState::distributed_rank_) {
-        auto tmp = BaseState::qregs_[iChunk - BaseState::global_chunk_index_]
-                       .copy_to_matrix();
-        BaseState::send_data(tmp.data(), size, iChunk, 0);
-      }
+  // Accumulate with remaining blocks
+  for (size_t k = 1; k < END; k++) {
+    const auto inds = QV::indexes(squbits, squbits_sorted, k * SHIFT);
+    for (int_t i = 0; i < VDIM; ++i) {
+      reduced_state[i] += complex_t(std::move(vmat[inds[i]]));
     }
-#endif
   }
-
   return reduced_state;
 }
@@ -1077,40 +634,42 @@ cmatrix_t State<densmat_t>::reduced_density_matrix_helper(
 //=========================================================================

 template <class densmat_t>
-void State<densmat_t>::apply_gate(const int_t iChunk,
-                                  const Operations::Op &op) {
-  if (!BaseState::global_chunk_indexing_) {
+void State<densmat_t>::apply_gate(const Operations::Op &op) {
+  // CPU qubit vector does not handle chunk ID inside kernel, so modify op here
+  if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() &&
+      !BaseState::qreg_.support_global_indexing()) {
     reg_t qubits_in, qubits_out;
     bool ctrl_chunk = true;
     bool ctrl_chunk_sp = true;
-    BaseState::get_inout_ctrl_qubits(op, qubits_out, qubits_in);
+    if (op.name[0] == 'c' || op.name.find("mc") == 0) {
+      Chunk::get_inout_ctrl_qubits(op, BaseState::qreg_.num_qubits(), qubits_in,
+                                   qubits_out);
+ } if (qubits_out.size() > 0) { uint_t mask = 0; for (int i = 0; i < qubits_out.size(); i++) { - mask |= (1ull << (qubits_out[i] - BaseState::chunk_bits_)); + mask |= (1ull << (qubits_out[i] - BaseState::qreg_.num_qubits())); } - if (((BaseState::global_chunk_index_ + iChunk) & mask) != mask) { + if ((BaseState::qreg_.chunk_index() & mask) != mask) { ctrl_chunk = false; } - if ((((BaseState::global_chunk_index_ + iChunk) >> - (BaseState::num_qubits_ - BaseState::chunk_bits_)) & + if (((BaseState::qreg_.chunk_index() >> + (BaseState::num_global_qubits_ - BaseState::qreg_.num_qubits())) & mask) != mask) { ctrl_chunk_sp = false; } if (!ctrl_chunk && !ctrl_chunk_sp) return; // do nothing for this chunk else { - Operations::Op new_op = - BaseState::remake_gate_in_chunk_qubits(op, qubits_in); + Operations::Op new_op = Chunk::correct_gate_op_in_chunk(op, qubits_in); if (ctrl_chunk && ctrl_chunk_sp) - apply_gate(iChunk, - new_op); // apply gate by using op with internal qubits + apply_gate(new_op); // apply gate by using op with internal qubits else if (ctrl_chunk) - apply_gate_statevector(iChunk, new_op); + apply_gate_statevector(new_op); else { for (int i = 0; i < new_op.qubits.size(); i++) - new_op.qubits[i] += BaseState::chunk_bits_; - apply_gate_statevector(iChunk, new_op); + new_op.qubits[i] += BaseState::qreg_.num_qubits(); + apply_gate_statevector(new_op); } return; } @@ -1124,111 +683,106 @@ void State::apply_gate(const int_t iChunk, "DensityMatrixState::invalid gate instruction \'" + op.name + "\'."); switch (it->second) { case Gates::u3: - apply_gate_u3(iChunk, op.qubits[0], std::real(op.params[0]), + apply_gate_u3(op.qubits[0], std::real(op.params[0]), std::real(op.params[1]), std::real(op.params[2])); break; case Gates::u2: - apply_gate_u3(iChunk, op.qubits[0], M_PI / 2., std::real(op.params[0]), + apply_gate_u3(op.qubits[0], M_PI / 2., std::real(op.params[0]), std::real(op.params[1])); break; case Gates::u1: - apply_phase(iChunk, op.qubits[0], - std::exp(complex_t(0., 1.) * op.params[0])); + apply_phase(op.qubits[0], std::exp(complex_t(0., 1.) * op.params[0])); break; case Gates::cx: - BaseState::qregs_[iChunk].apply_cnot(op.qubits[0], op.qubits[1]); + BaseState::qreg_.apply_cnot(op.qubits[0], op.qubits[1]); break; case Gates::cy: - BaseState::qregs_[iChunk].apply_cy(op.qubits[0], op.qubits[1]); + BaseState::qreg_.apply_cy(op.qubits[0], op.qubits[1]); break; case Gates::cz: - BaseState::qregs_[iChunk].apply_cphase(op.qubits[0], op.qubits[1], -1); + BaseState::qreg_.apply_cphase(op.qubits[0], op.qubits[1], -1); break; case Gates::cp: - BaseState::qregs_[iChunk].apply_cphase( - op.qubits[0], op.qubits[1], std::exp(complex_t(0., 1.) * op.params[0])); + BaseState::qreg_.apply_cphase(op.qubits[0], op.qubits[1], + std::exp(complex_t(0., 1.) 
* op.params[0])); break; case Gates::id: break; case Gates::x: - BaseState::qregs_[iChunk].apply_x(op.qubits[0]); + BaseState::qreg_.apply_x(op.qubits[0]); break; case Gates::y: - BaseState::qregs_[iChunk].apply_y(op.qubits[0]); + BaseState::qreg_.apply_y(op.qubits[0]); break; case Gates::z: - apply_phase(iChunk, op.qubits[0], -1); + apply_phase(op.qubits[0], -1); break; case Gates::h: - apply_gate_u3(iChunk, op.qubits[0], M_PI / 2., 0., M_PI); + apply_gate_u3(op.qubits[0], M_PI / 2., 0., M_PI); break; case Gates::s: - apply_phase(iChunk, op.qubits[0], complex_t(0., 1.)); + apply_phase(op.qubits[0], complex_t(0., 1.)); break; case Gates::sdg: - apply_phase(iChunk, op.qubits[0], complex_t(0., -1.)); + apply_phase(op.qubits[0], complex_t(0., -1.)); break; case Gates::sx: - BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, - Linalg::VMatrix::SX); + BaseState::qreg_.apply_unitary_matrix(op.qubits, Linalg::VMatrix::SX); break; case Gates::sxdg: - BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, - Linalg::VMatrix::SXDG); + BaseState::qreg_.apply_unitary_matrix(op.qubits, Linalg::VMatrix::SXDG); break; case Gates::t: { const double isqrt2{1. / std::sqrt(2)}; - apply_phase(iChunk, op.qubits[0], complex_t(isqrt2, isqrt2)); + apply_phase(op.qubits[0], complex_t(isqrt2, isqrt2)); } break; case Gates::tdg: { const double isqrt2{1. / std::sqrt(2)}; - apply_phase(iChunk, op.qubits[0], complex_t(isqrt2, -isqrt2)); + apply_phase(op.qubits[0], complex_t(isqrt2, -isqrt2)); } break; case Gates::swap: { - BaseState::qregs_[iChunk].apply_swap(op.qubits[0], op.qubits[1]); + BaseState::qreg_.apply_swap(op.qubits[0], op.qubits[1]); } break; case Gates::ecr: { - BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, - Linalg::VMatrix::ECR); + BaseState::qreg_.apply_unitary_matrix(op.qubits, Linalg::VMatrix::ECR); } break; case Gates::ccx: - BaseState::qregs_[iChunk].apply_toffoli(op.qubits[0], op.qubits[1], - op.qubits[2]); + BaseState::qreg_.apply_toffoli(op.qubits[0], op.qubits[1], op.qubits[2]); break; case Gates::r: - BaseState::qregs_[iChunk].apply_unitary_matrix( + BaseState::qreg_.apply_unitary_matrix( op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); break; case Gates::rx: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::rx(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::rx(op.params[0])); break; case Gates::ry: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::ry(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::ry(op.params[0])); break; case Gates::rz: - apply_diagonal_unitary_matrix(iChunk, op.qubits, + apply_diagonal_unitary_matrix(op.qubits, Linalg::VMatrix::rz_diag(op.params[0])); break; case Gates::rxx: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::rxx(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::rxx(op.params[0])); break; case Gates::ryy: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::ryy(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::ryy(op.params[0])); break; case Gates::rzz: - apply_diagonal_unitary_matrix(iChunk, op.qubits, + apply_diagonal_unitary_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); break; case Gates::rzx: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::rzx(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + 
Linalg::VMatrix::rzx(op.params[0])); break; case Gates::pauli: - apply_pauli(iChunk, op.qubits, op.string_params[0]); + apply_pauli(op.qubits, op.string_params[0]); break; default: // We shouldn't reach here unless there is a bug in gateset @@ -1238,8 +792,7 @@ void State::apply_gate(const int_t iChunk, } template -void State::apply_gate_statevector(const int_t iChunk, - const Operations::Op &op) { +void State::apply_gate_statevector(const Operations::Op &op) { // Look for gate name in gateset auto it = gateset_.find(op.name); if (it == gateset_.end()) @@ -1248,22 +801,22 @@ void State::apply_gate_statevector(const int_t iChunk, switch (it->second) { case Gates::x: case Gates::cx: - BaseState::qregs_[iChunk].apply_mcx(op.qubits); + BaseState::qreg_.apply_mcx(op.qubits); break; case Gates::u1: - if (op.qubits[op.qubits.size() - 1] < BaseState::chunk_bits_) { - BaseState::qregs_[iChunk].apply_mcphase( + if (op.qubits[op.qubits.size() - 1] < BaseState::qreg_.num_qubits()) { + BaseState::qreg_.apply_mcphase( op.qubits, std::exp(complex_t(0., 1.) * op.params[0])); } else { - BaseState::qregs_[iChunk].apply_mcphase( + BaseState::qreg_.apply_mcphase( op.qubits, std::conj(std::exp(complex_t(0., 1.) * op.params[0]))); } break; case Gates::y: - BaseState::qregs_[iChunk].apply_mcy(op.qubits); + BaseState::qreg_.apply_mcy(op.qubits); break; case Gates::z: - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1); + BaseState::qreg_.apply_mcphase(op.qubits, -1); break; default: // We shouldn't reach here unless there is a bug in gateset @@ -1273,87 +826,84 @@ void State::apply_gate_statevector(const int_t iChunk, } template -void State::apply_matrix(const int_t iChunk, const reg_t &qubits, - const cmatrix_t &mat) { +void State::apply_matrix(const reg_t &qubits, const cmatrix_t &mat) { if (mat.GetRows() == 1) { - apply_diagonal_unitary_matrix(iChunk, qubits, Utils::vectorize_matrix(mat)); + apply_diagonal_unitary_matrix(qubits, Utils::vectorize_matrix(mat)); } else { - BaseState::qregs_[iChunk].apply_unitary_matrix( - qubits, Utils::vectorize_matrix(mat)); + BaseState::qreg_.apply_unitary_matrix(qubits, Utils::vectorize_matrix(mat)); } } template -void State::apply_gate_u3(const int_t iChunk, uint_t qubit, - double theta, double phi, double lambda) { - BaseState::qregs_[iChunk].apply_unitary_matrix( +void State::apply_gate_u3(uint_t qubit, double theta, double phi, + double lambda) { + BaseState::qreg_.apply_unitary_matrix( reg_t({qubit}), Linalg::VMatrix::u3(theta, phi, lambda)); } template -void State::apply_diagonal_unitary_matrix(const int_t iChunk, - const reg_t &qubits, +void State::apply_diagonal_unitary_matrix(const reg_t &qubits, const cvector_t &diag) { - if (BaseState::global_chunk_indexing_ || - !BaseState::multi_chunk_distribution_) { - // GPU computes all chunks in one kernel, so pass qubits and diagonal matrix - // as is - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits, diag); - } else { + if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() && + !BaseState::qreg_.support_global_indexing()) { reg_t qubits_in = qubits; reg_t qubits_row = qubits; cvector_t diag_in = diag; cvector_t diag_row = diag; - BaseState::block_diagonal_matrix(iChunk, qubits_in, diag_in); + Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(), + BaseState::qreg_.num_qubits(), qubits_in, + diag_in); if (qubits_in.size() == qubits.size()) { - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits, diag); + BaseState::qreg_.apply_diagonal_unitary_matrix(qubits, diag); } else { 
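      // In the superoperator picture a diagonal D acts as rho -> D rho D^dagger,
      // so the column (ket) qubits take diag_in while the row (bra) qubits take
      // its complex conjugate; tensor_product(conjugate(diag_row), diag_in)
      // below assembles that combined diagonal over the doubled qubit indices.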
for (int_t i = 0; i < qubits.size(); i++) { - if (qubits[i] >= BaseState::chunk_bits_) - qubits_row[i] = - qubits[i] + BaseState::num_qubits_ - BaseState::chunk_bits_; + if (qubits[i] >= BaseState::qreg_.num_qubits()) + qubits_row[i] = qubits[i] + BaseState::num_global_qubits_ - + BaseState::qreg_.num_qubits(); } - BaseState::block_diagonal_matrix(iChunk, qubits_row, diag_row); + Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(), + BaseState::qreg_.num_qubits(), qubits_row, + diag_row); reg_t qubits_chunk(qubits_in.size() * 2); for (int_t i = 0; i < qubits_in.size(); i++) { qubits_chunk[i] = qubits_in[i]; qubits_chunk[i + qubits_in.size()] = - qubits_in[i] + BaseState::chunk_bits_; + qubits_in[i] + BaseState::qreg_.num_qubits(); } - BaseState::qregs_[iChunk].apply_diagonal_matrix( + BaseState::qreg_.apply_diagonal_matrix( qubits_chunk, AER::Utils::tensor_product(AER::Utils::conjugate(diag_row), diag_in)); } + } else { + BaseState::qreg_.apply_diagonal_unitary_matrix(qubits, diag); } } template -void State::apply_phase(const int_t iChunk, const uint_t qubit, - const complex_t phase) { +void State::apply_phase(const uint_t qubit, const complex_t phase) { cvector_t diag(2); diag[0] = 1.0; diag[1] = phase; - apply_diagonal_unitary_matrix(iChunk, reg_t({qubit}), diag); + apply_diagonal_unitary_matrix(reg_t({qubit}), diag); } template -void State::apply_phase(const int_t iChunk, const reg_t &qubits, - const complex_t phase) { +void State::apply_phase(const reg_t &qubits, const complex_t phase) { cvector_t diag((1 << qubits.size()), 1.0); diag[(1 << qubits.size()) - 1] = phase; - apply_diagonal_unitary_matrix(iChunk, qubits, diag); + apply_diagonal_unitary_matrix(qubits, diag); } template -void State::apply_pauli(const int_t iChunk, const reg_t &qubits, +void State::apply_pauli(const reg_t &qubits, const std::string &pauli) { // Pauli as a superoperator is (-1)^num_y P\otimes P complex_t coeff = (std::count(pauli.begin(), pauli.end(), 'Y') % 2) ? 
-1 : 1; - BaseState::qregs_[iChunk].apply_pauli( - BaseState::qregs_[iChunk].superop_qubits(qubits), pauli + pauli, coeff); + BaseState::qreg_.apply_pauli(BaseState::qreg_.superop_qubits(qubits), + pauli + pauli, coeff); } //========================================================================= @@ -1361,171 +911,38 @@ void State::apply_pauli(const int_t iChunk, const reg_t &qubits, //========================================================================= template -void State::apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, +void State::apply_measure(const reg_t &qubits, const reg_t &cmemory, const reg_t &cregister, RngEngine &rng) { - int_t ishot = BaseState::get_global_shot_index(iChunk); // Actual measurement outcome - const auto meas = sample_measure_with_prob(iChunk, qubits, rng); + const auto meas = sample_measure_with_prob(qubits, rng); // Implement measurement update - measure_reset_update(iChunk, qubits, meas.first, meas.first, meas.second); + measure_reset_update(qubits, meas.first, meas.first, meas.second); const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); - BaseState::cregs_[ishot].store_measure(outcome, cmemory, cregister); + BaseState::creg().store_measure(outcome, cmemory, cregister); } template -rvector_t State::measure_probs(const int_t iChunk, - const reg_t &qubits) const { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].probabilities(qubits); - - uint_t dim = 1ull << qubits.size(); - rvector_t sum(dim, 0.0); - int_t i, j, k; - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - for (i = 0; i < qubits.size(); i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - } else { - qubits_out_chunk.push_back(qubits[i]); - } - } - - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(i, j, k) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - if (irow == icol) { // diagonal chunk - if (qubits_in_chunk.size() > 0) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { -#pragma omp atomic - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < (BaseState::chunk_bits_)) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } -#pragma omp atomic - sum[idx] += chunkSum[j]; - } - } - } else { // there is no bit in chunk - auto tr = std::real(BaseState::qregs_[i].trace()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } -#pragma omp atomic - sum[idx] += tr; - } - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = 
(BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - if (irow == icol) { // diagonal chunk - if (qubits_in_chunk.size() > 0) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < (BaseState::chunk_bits_)) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } - sum[idx] += chunkSum[j]; - } - } - } else { // there is no bit in chunk - auto tr = std::real(BaseState::qregs_[i].trace()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } - sum[idx] += tr; - } - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - - return sum; +rvector_t State::measure_probs(const reg_t &qubits) const { + return BaseState::qreg_.probabilities(qubits); } template -void State::apply_reset(const int_t iChunk, const reg_t &qubits) { - BaseState::qregs_[iChunk].apply_reset(qubits); +void State::apply_reset(const reg_t &qubits) { + BaseState::qreg_.apply_reset(qubits); } template -std::pair State::sample_measure_with_prob( - const int_t iChunk, const reg_t &qubits, RngEngine &rng) { - rvector_t probs = measure_probs(iChunk, qubits); +std::pair +State::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); // Randomly pick outcome and return pair uint_t outcome = rng.rand_int(probs); return std::make_pair(outcome, probs[outcome]); } template -void State::measure_reset_update(const int_t iChunk, - const reg_t &qubits, +void State::measure_reset_update(const reg_t &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob) { @@ -1536,44 +953,11 @@ void State::measure_reset_update(const int_t iChunk, // Diagonal matrix for projecting and renormalizing to measurement outcome cvector_t mdiag(2, 0.); mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } + apply_diagonal_unitary_matrix(qubits, mdiag); // If it doesn't agree with the reset state update if (final_state != meas_state) { - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_x(qubits[0]); - else { - if (qubits[0] < BaseState::chunk_bits_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - BaseState::qregs_[i].apply_x(qubits[0]); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].apply_x(qubits[0]); - } - } else { - BaseState::apply_chunk_x(qubits[0]); - BaseState::apply_chunk_x(qubits[0] + BaseState::chunk_bits_); - } - } + BaseState::qreg_.apply_x(qubits[0]); } } // Multi qubit case @@ -1582,21 +966,7 @@ void State::measure_reset_update(const int_t iChunk, const size_t dim = 1ULL << qubits.size(); cvector_t mdiag(dim, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } + apply_diagonal_unitary_matrix(qubits, mdiag); // If it doesn't agree with the reset state update // TODO This function could be optimized as a permutation update @@ -1610,41 +980,7 @@ void State::measure_reset_update(const int_t iChunk, perm[j * dim + j] = 1.; } // apply permutation to swap state - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_unitary_matrix(qubits, perm); - else { - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - for (int_t i = 0; i < qubits.size(); i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - } else { - qubits_out_chunk.push_back(qubits[i]); - } - } - if (qubits_in_chunk.size() > 0) { // in chunk exchange - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - BaseState::qregs_[i].apply_unitary_matrix(qubits, perm); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].apply_unitary_matrix(qubits, perm); - } - } - if (qubits_out_chunk.size() > 0) { // out of chunk exchange - for (int_t i = 0; i < qubits_out_chunk.size(); i++) { - BaseState::apply_chunk_x(qubits_out_chunk[i]); - 
BaseState::apply_chunk_x( - qubits_out_chunk[i] + - (BaseState::num_qubits_ - BaseState::chunk_bits_)); - } - } - } + BaseState::qreg_.apply_unitary_matrix(qubits, perm); } } } @@ -1660,115 +996,13 @@ std::vector State::sample_measure(const reg_t &qubits, rnds.push_back(rng.rand(0, 1)); reg_t allbit_samples(shots, 0); - if (!BaseState::multi_chunk_distribution_) - allbit_samples = BaseState::qregs_[0].sample_measure(rnds); - else { - int_t i, j; - std::vector chunkSum(BaseState::qregs_.size() + 1, 0); - double sum, localSum; - // calculate per chunk sum - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for private(i) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow == icol) // only diagonal chunk has probabilities - chunkSum[i] = std::real(BaseState::qregs_[i].trace()); - else - chunkSum[i] = 0.0; - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow == icol) // only diagonal chunk has probabilities - chunkSum[i] = std::real(BaseState::qregs_[i].trace()); - else - chunkSum[i] = 0.0; - } - } - localSum = 0.0; - for (i = 0; i < BaseState::qregs_.size(); i++) { - sum = localSum; - localSum += chunkSum[i]; - chunkSum[i] = sum; - } - chunkSum[BaseState::qregs_.size()] = localSum; - - double globalSum = 0.0; - if (BaseState::nprocs_ > 1) { - std::vector procTotal(BaseState::nprocs_); - - for (i = 0; i < BaseState::nprocs_; i++) { - procTotal[i] = localSum; - } - BaseState::gather_value(procTotal); - - for (i = 0; i < BaseState::myrank_; i++) { - globalSum += procTotal[i]; - } - } - - reg_t local_samples(shots, 0); - - // get rnds positions for each chunk - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow != icol) - continue; - - uint_t nIn; - std::vector vIdx; - std::vector vRnd; - - // find rnds in this chunk - nIn = 0; - for (j = 0; j < shots; j++) { - if (rnds[j] >= chunkSum[i] + globalSum && - rnds[j] < chunkSum[i + 1] + globalSum) { - vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); - vIdx.push_back(j); - nIn++; - } - } - - if (nIn > 0) { - auto chunkSamples = BaseState::qregs_[i].sample_measure(vRnd); - uint_t ir; - ir = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - - for (j = 0; j < chunkSamples.size(); j++) { - local_samples[vIdx[j]] = - (ir << BaseState::chunk_bits_) + chunkSamples[j]; - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(local_samples); -#endif - allbit_samples = local_samples; - } + allbit_samples = BaseState::qreg_.sample_measure(rnds); // Convert to reg_t format std::vector all_samples; all_samples.reserve(shots); for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, 
BaseState::num_qubits_); + reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits()); reg_t sample; sample.reserve(qubits.size()); for (uint_t qubit : qubits) { @@ -1784,73 +1018,12 @@ std::vector State::sample_measure(const reg_t &qubits, //========================================================================= template -void State::apply_kraus(const int_t iChunk, const reg_t &qubits, +void State::apply_kraus(const reg_t &qubits, const std::vector &kmats) { - BaseState::qregs_[iChunk].apply_superop_matrix( + BaseState::qreg_.apply_superop_matrix( qubits, Utils::vectorize_matrix(Utils::kraus_superop(kmats))); } -//----------------------------------------------------------------------- -// Functions for multi-chunk distribution -//----------------------------------------------------------------------- -// swap between chunks -template -void State::apply_chunk_swap(const reg_t &qubits) { - uint_t q0, q1; - q0 = qubits[0]; - q1 = qubits[1]; - - std::swap(BaseState::qubit_map_[q0], BaseState::qubit_map_[q1]); - - if (qubits[0] >= BaseState::chunk_bits_) { - q0 += BaseState::chunk_bits_; - } - if (qubits[1] >= BaseState::chunk_bits_) { - q1 += BaseState::chunk_bits_; - } - reg_t qs0 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs0); - - if (qubits[0] >= BaseState::chunk_bits_) { - q0 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } else { - q0 += BaseState::chunk_bits_; - } - if (qubits[1] >= BaseState::chunk_bits_) { - q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } else { - q1 += BaseState::chunk_bits_; - } - reg_t qs1 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs1); -} - -template -void State::apply_multi_chunk_swap(const reg_t &qubits) { - reg_t qubits_density; - - for (int_t i = 0; i < qubits.size(); i += 2) { - uint_t q0, q1; - q0 = qubits[i * 2]; - q1 = qubits[i * 2 + 1]; - - std::swap(BaseState::qubit_map_[q0], BaseState::qubit_map_[q1]); - - if (q1 >= BaseState::chunk_bits_) { - q1 += BaseState::chunk_bits_; - } - qubits_density.push_back(q0); - qubits_density.push_back(q1); - - q0 += BaseState::chunk_bits_; - if (q1 >= BaseState::chunk_bits_) { - q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_ * 2); - } - } - - BaseState::apply_multi_chunk_swap(qubits_density); -} - //------------------------------------------------------------------------- } // end namespace DensityMatrix //------------------------------------------------------------------------- diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp old mode 100644 new mode 100755 index cdb8b67acb..73f8dca3cf --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -61,6 +61,11 @@ class DensityMatrixThrust : public UnitaryMatrixThrust { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const DensityMatrixThrust &obj) { + BaseMatrix::initialize(obj); + } + // Initializes the vector to a custom initial state. 
// The vector can be either a statevector or a vectorized density matrix
// If the length of the data vector does not match either case for the
@@ -1275,12 +1280,13 @@ reg_t DensityMatrixThrust::sample_measure( const std::vector &rnds) const {
   uint_t count = 1;
   if (!BaseVector::multi_chunk_distribution_) {
-    if (BaseVector::enable_batch_ && BaseVector::chunk_.pos() != 0) {
-      return reg_t(); // first chunk execute all in batch
+    if (BaseVector::enable_batch_) {
+      if (BaseVector::chunk_.pos() != 0)
+        return reg_t(); // first chunk executes all in batch
+      else
+        count = BaseVector::chunk_.container()->num_chunks();
     }
-    count = BaseVector::chunk_.container()->num_chunks();
   }
-
   uint_t nrows = BaseMatrix::num_rows();
 #ifdef AER_DEBUG
diff --git a/src/simulators/extended_stabilizer/ch_runner.hpp b/src/simulators/extended_stabilizer/ch_runner.hpp index 78eb03a81c..489d6b77ad 100644 --- a/src/simulators/extended_stabilizer/ch_runner.hpp +++ b/src/simulators/extended_stabilizer/ch_runner.hpp @@ -77,6 +77,10 @@ class Runner {
   virtual ~Runner() = default;
   void initialize(uint_t n_qubits);
+  // initialize from existing state (copy)
+  void initialize(const Runner &obj) {
+  } // implement this if the extended stabilizer will support shot-branching
+
   void initialize_omp(uint_t n_threads, uint_t threshold_rank);
   bool empty() const { return (n_qubits_ == 0 || num_states_ == 0); }
diff --git a/src/simulators/multi_state_executor.hpp b/src/simulators/multi_state_executor.hpp new file mode 100644 index 0000000000..2d0da87e4a --- /dev/null +++ b/src/simulators/multi_state_executor.hpp @@ -0,0 +1,815 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals. 
+ */
+
+#ifndef _multi_state_executor_hpp_
+#define _multi_state_executor_hpp_
+
+#include "simulators/circuit_executor.hpp"
+
+#ifdef _OPENMP
+#include
+#endif
+
+#ifdef AER_MPI
+#include
+#endif
+
+#include "simulators/shot_branching.hpp"
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+//-------------------------------------------------------------------------
+// Multiple-shots executor class implementation
+//-------------------------------------------------------------------------
+template
+class MultiStateExecutor : public Executor {
+  using Base = Executor;
+
+protected:
+  std::vector states_;
+  std::vector cregs_; // classical registers for all shots
+
+  // number of qubits for the circuit
+  uint_t num_qubits_;
+
+  uint_t num_global_states_; // number of total shots
+  uint_t num_local_states_;  // number of local shots
+
+  uint_t global_state_index_; // beginning state index for this process
+  reg_t state_index_begin_;   // beginning state index for each process
+  reg_t state_index_end_;     // ending state index for each process
+  uint_t num_active_states_;  // number of active shots in current loop
+
+  bool shot_omp_parallel_; // whether to process the loop over states with
+                           // thread parallelism
+
+  bool set_parallelization_called_ =
+      false; // true if set_parallelization has already been called;
+             // if so, that call sets max_batched_shots_
+  uint_t num_max_shots_ =
+      1; // max number of shots that can be stored in available memory
+
+  int max_matrix_qubits_; // max qubits for matrix
+
+  // shot branching
+  bool shot_branching_enable_ = true;
+  bool shot_branching_sampling_enable_ = false;
+
+  // group of states (GPU devices)
+  uint_t num_groups_; // number of groups of states
+  reg_t top_state_of_group_;
+  reg_t num_states_in_group_;
+  int num_threads_per_group_; // number of outer threads per group
+
+  uint_t num_creg_memory_ =
+      0; // total number of creg bits (reserved for multi-shots)
+  uint_t num_creg_registers_ = 0;
+
+  // OpenMP qubit threshold
+  int omp_qubit_threshold_ = 14;
+
+  // Threshold for chopping small values to zero in JSON
+  double json_chop_threshold_ = 1e-10;
+
+  // Set a global phase exp(1j * theta) for the state
+  bool has_global_phase_ = false;
+  complex_t global_phase_ = 1;
+
+  // number of threads for inner loop of shot-branching
+  int_t shot_branch_parallel_ = 1;
+
+public:
+  MultiStateExecutor();
+  virtual ~MultiStateExecutor();
+
+  size_t required_memory_mb(const Circuit &circuit,
+                            const Noise::NoiseModel &noise) const override {
+    state_t tmp;
+    return tmp.required_memory_mb(circuit.num_qubits, circuit.ops);
+  }
+
+  uint_t get_process_by_chunk(uint_t cid);
+
+protected:
+  void set_config(const Config &config) override;
+
+  // distribute states on processes
+  void set_distribution(uint_t num_states);
+
+  virtual uint_t qubit_scale(void) { return 1; }
+
+  virtual bool allocate_states(uint_t num_shots, const Config &config);
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  void run_circuit_with_shot_branching(
+      uint_t top_state, uint_t num_states, Circuit &circ,
+      const Noise::NoiseModel &noise, const Config &config, RngEngine &init_rng,
+      uint_t ishot, uint_t nshots, ExperimentResult &result, bool sample_noise);
+
+  // apply op for shot-branching; return false if the op is not applied in the
+  // sub-class
+  virtual bool apply_branching_op(Branch &root, const Operations::Op &op,
+                                  ExperimentResult &result, bool final_op) {
+    
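// default implementation: a sub-class that supports shot-branching
+    // overrides this and returns true when it consumes the op; returning
+    // false makes the caller fall back to the ordinary state_t::apply_op()
+    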
std::cout << " base is called, implement for each method" << std::endl;
+    return false;
+  }
+
+  // Apply the global phase
+  virtual void apply_global_phase() {}
+  void set_global_phase(double theta);
+
+  void set_parallelization(const Circuit &circ,
+                           const Noise::NoiseModel &noise) override;
+
+  virtual bool shot_branching_supported(void) {
+    return false; // return true in the sub-class if it supports shot-branching
+  }
+
+  template
+  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
+                       uint_t shots, Branch &branch, ExperimentResult &result,
+                       std::vector &rng);
+
+  // sampling measure
+  virtual std::vector sample_measure(state_t &state, const reg_t &qubits,
+                                     uint_t shots,
+                                     std::vector &rng) const {
+    // this is for a single rng; implement in the sub-class for the
+    // multi-shots case
+    return state.sample_measure(qubits, shots, rng[0]);
+  }
+};
+
+template
+MultiStateExecutor::MultiStateExecutor() {
+  num_global_states_ = 0;
+  num_local_states_ = 0;
+
+  shot_omp_parallel_ = false;
+
+  shot_branching_enable_ = false;
+}
+
+template
+MultiStateExecutor::~MultiStateExecutor() {
+  states_.clear();
+  cregs_.clear();
+}
+
+template
+void MultiStateExecutor::set_config(const Config &config) {
+  Base::set_config(config);
+
+  // Set threshold for truncating states to be saved
+  json_chop_threshold_ = config.zero_threshold;
+
+  // Set OMP threshold for state update functions
+  omp_qubit_threshold_ = config.statevector_parallel_threshold;
+
+  // shot branching optimization
+  shot_branching_enable_ = config.shot_branching_enable;
+  shot_branching_sampling_enable_ = config.shot_branching_sampling_enable;
+
+  if (config.num_threads_per_device.has_value())
+    num_threads_per_group_ = config.num_threads_per_device.value();
+}
+
+template
+void MultiStateExecutor::set_global_phase(double theta) {
+  if (Linalg::almost_equal(theta, 0.0)) {
+    has_global_phase_ = false;
+    global_phase_ = 1;
+  } else {
+    has_global_phase_ = true;
+    global_phase_ = std::exp(complex_t(0.0, theta));
+  }
+}
+
+template
+void MultiStateExecutor::set_distribution(uint_t num_states) {
+
+  num_global_states_ = num_states;
+
+  state_index_begin_.resize(Base::distributed_procs_);
+  state_index_end_.resize(Base::distributed_procs_);
+  for (int_t i = 0; i < Base::distributed_procs_; i++) {
+    state_index_begin_[i] = num_global_states_ * i / Base::distributed_procs_;
+    state_index_end_[i] =
+        num_global_states_ * (i + 1) / Base::distributed_procs_;
+  }
+
+  num_local_states_ = state_index_end_[Base::distributed_rank_] -
+                      state_index_begin_[Base::distributed_rank_];
+  global_state_index_ = state_index_begin_[Base::distributed_rank_];
+}
+
+template
+void MultiStateExecutor::set_parallelization(
+    const Circuit &circ, const Noise::NoiseModel &noise) {
+  Base::set_parallelization(circ, noise);
+}
+
+template
+bool MultiStateExecutor::allocate_states(uint_t num_shots,
+                                         const Config &config) {
+  int_t i;
+  bool ret = true;
+
+  states_.resize(num_shots);
+
+  num_active_states_ = num_shots;
+
+  // initialize groups
+  top_state_of_group_.resize(1);
+  num_states_in_group_.resize(1);
+  num_groups_ = 1;
+  top_state_of_group_[0] = 0;
+  num_states_in_group_[0] = num_shots;
+
+  for (i = 0; i < num_shots; i++) {
+    states_[i].set_config(config);
+    states_[i].set_num_global_qubits(num_qubits_);
+  }
+
+  return ret;
+}
+
+template
+void MultiStateExecutor::run_circuit_shots(
+    Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
+    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+  num_qubits_ = circ.num_qubits;
+  
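// keep the circuit's creg sizes; they are used below when initializing
+  // classical registers for the states used in shot-branching
+  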
num_creg_memory_ = circ.num_memory; + num_creg_registers_ = circ.num_registers; + + if (this->sim_device_ == Device::GPU) { +#ifdef _OPENMP + if (omp_get_num_threads() == 1) + shot_omp_parallel_ = true; +#endif + } else if (this->sim_device_ == Device::ThrustCPU) { + shot_omp_parallel_ = false; + } + + set_distribution(circ.shots); + num_max_shots_ = Base::get_max_parallel_shots(circ, noise); + + bool shot_branching = false; + if (shot_branching_enable_ && num_local_states_ > 1 && + shot_branching_supported() && num_max_shots_ > 1) { + shot_branching = true; + } else + shot_branching = false; + + if (!shot_branching) { + return Base::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } + // disable cuStateVec if shot-branching is enabled +#ifdef AER_CUSTATEVEC + if (Base::cuStateVec_enable_) + Base::cuStateVec_enable_ = false; +#endif + + Noise::NoiseModel dummy_noise; + state_t dummy_state; + + Circuit circ_opt; + if (sample_noise) { + RngEngine dummy_rng; + circ_opt = noise.sample_noise(circ, dummy_rng, + Noise::NoiseModel::Method::circuit, true); + auto fusion_pass = Base::transpile_fusion(circ_opt.opset(), config); + fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(), + result); + max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt); + } else { + auto fusion_pass = Base::transpile_fusion(circ.opset(), config); + fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(), + result); + max_matrix_qubits_ = Base::get_max_matrix_qubits(circ); + } + +#ifdef AER_MPI + // if shots are distributed to MPI processes, allocate cregs to be gathered + if (Base::num_process_per_experiment_ > 1) + cregs_.resize(circ.shots); +#endif + + // reserve states + allocate_states(num_max_shots_, config); + + int_t par_shots; + if (Base::sim_device_ == Device::GPU) { + par_shots = num_groups_; + } else { + par_shots = + std::min((int_t)Base::parallel_shots_, (int_t)num_local_states_); + } + shot_branch_parallel_ = Base::parallel_shots_ / par_shots; + std::vector par_results(par_shots); + + auto parallel_shot_branching = [this, &par_results, par_shots, &circ, + &circ_opt, noise, config, &init_rng, + sample_noise](int_t i) { + // shot distribution + uint_t ishot = i * num_local_states_ / par_shots; + uint_t nshots = (i + 1) * num_local_states_ / par_shots; + nshots -= ishot; + + // state distribution + uint_t istate, nstates; + if (Base::sim_device_ == Device::GPU) { + istate = top_state_of_group_[i]; + nstates = num_states_in_group_[i]; + } else { + istate = i * num_active_states_ / par_shots; + nstates = (i + 1) * num_active_states_ / par_shots; + nstates -= istate; + } + + if (nshots > 0) { + if (sample_noise) { + run_circuit_with_shot_branching(istate, nstates, circ_opt, noise, + config, init_rng, ishot, nshots, + par_results[i], sample_noise); + } else { + run_circuit_with_shot_branching(istate, nstates, circ, noise, config, + init_rng, ishot, nshots, par_results[i], + sample_noise); + } + } + }; + Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, + parallel_shot_branching, par_shots); + + // gather cregs on MPI processes and save to result +#ifdef AER_MPI + if (Base::num_process_per_experiment_ > 1) { + Base::gather_creg_memory(cregs_, state_index_begin_); + + // save cregs to result + auto save_cregs = [this, &par_results, par_shots](int_t i) { + uint_t i_shot, shot_end; + i_shot = num_global_states_ * i / par_shots; + shot_end = num_global_states_ * (i + 1) / par_shots; + + for (; i_shot < shot_end; i_shot++) { + if 
(cregs_[i_shot].memory_size() > 0) {
+          std::string memory_hex = cregs_[i_shot].memory_hex();
+          par_results[i].data.add_accum(static_cast(1ULL), "counts",
+                                        memory_hex);
+          if (Base::save_creg_memory_) {
+            par_results[i].data.add_list(std::move(memory_hex), "memory");
+          }
+        }
+      }
+    };
+    Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, save_cregs,
+                                  par_shots);
+    cregs_.clear();
+  }
+#endif
+
+  for (auto &res : par_results) {
+    result.combine(std::move(res));
+  }
+
+  result.metadata.add(true, "shot_branching_enabled");
+}
+
+template
+void MultiStateExecutor::run_circuit_with_shot_branching(
+    uint_t top_state, uint_t num_states, Circuit &circ,
+    const Noise::NoiseModel &noise, const Config &config, RngEngine &init_rng,
+    uint_t ishot, uint_t nshots, ExperimentResult &result, bool sample_noise) {
+  std::vector> branches;
+  OpItr first;
+  OpItr last;
+
+  first = circ.ops.cbegin();
+  last = circ.ops.cend();
+
+  // check if there is a sequence of measures at the end of operations
+  bool can_sample = false;
+  OpItr measure_seq = last;
+  OpItr it = last - 1;
+  int_t num_measure = 0;
+
+  if (shot_branching_sampling_enable_) {
+    do {
+      if (it->type != Operations::OpType::measure) {
+        measure_seq = it + 1;
+        break;
+      }
+      num_measure += it->qubits.size();
+      it--;
+    } while (it != first);
+
+    if (num_measure >= num_qubits_ && measure_seq != last) {
+      can_sample = true;
+    } else {
+      measure_seq = last;
+    }
+  }
+
+  int_t par_shots = std::min(shot_branch_parallel_, (int_t)num_states);
+  if (par_shots == 0)
+    par_shots = 1;
+
+  // initialize local shots
+  std::vector shots_storage(nshots);
+  if (global_state_index_ + ishot == 0)
+    shots_storage[0] = init_rng;
+  else
+    shots_storage[0].set_seed(circ.seed + global_state_index_ + ishot);
+  if (par_shots > 1) {
+#pragma omp parallel for num_threads(par_shots)
+    for (int_t i = 1; i < nshots; i++)
+      shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+  } else {
+    for (int_t i = 1; i < nshots; i++)
+      shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+  }
+
+  std::vector par_results(par_shots);
+
+  uint_t num_shots_saved = 0;
+
+  // loop until all local shots are simulated
+  while (shots_storage.size() > 0) {
+    uint_t num_active_states = 1;
+
+    // initial state
+    branches.push_back(std::make_shared());
+    branches[0]->state_index() = top_state;
+    branches[0]->set_shots(shots_storage);
+    branches[0]->op_iterator() = first;
+    branches[0]->shot_index() =
+        global_state_index_ + nshots - shots_storage.size();
+    shots_storage.clear();
+
+    // initialize initial state
+    states_[top_state].set_parallelization(this->parallel_state_update_);
+    states_[top_state].set_global_phase(circ.global_phase_angle);
+    states_[top_state].enable_density_matrix(!Base::has_statevector_ops_);
+    states_[top_state].initialize_qreg(num_qubits_);
+    states_[top_state].initialize_creg(num_creg_memory_, num_creg_registers_);
+
+    while (num_active_states > 0) { // loop until all branches execute all ops
+      // functor for ops execution
+      auto apply_ops_func = [this, &branches, &noise, &par_results, measure_seq,
+                             par_shots, num_active_states](int_t i) {
+        uint_t istate, state_end;
+        istate = branches.size() * i / par_shots;
+        state_end = branches.size() * (i + 1) / par_shots;
+        uint_t nbranch = 0;
+        RngEngine dummy_rng;
+
+        for (; istate < state_end; istate++) {
+          while (branches[istate]->op_iterator() != measure_seq ||
+                 branches[istate]->additional_ops().size() > 0) {
+            // execute additional ops first if available
+            if 
(branches[istate]->additional_ops().size() > 0) { + int_t iadd = 0; + int_t num_add = branches[istate]->additional_ops().size(); + while (iadd < num_add) { + if (apply_branching_op(*branches[istate], + branches[istate]->additional_ops()[iadd], + par_results[i], false)) { + // check if there are new branches + if (branches[istate]->num_branches() > 0) { + // if there are additional ops remaining, queue them on new + // branches + for (int_t k = iadd + 1; + k < branches[istate]->additional_ops().size(); k++) { + for (int_t l = 0; l < branches[istate]->num_branches(); + l++) + branches[istate]->branches()[l]->add_op_after_branch( + branches[istate]->additional_ops()[k]); + } + branches[istate]->remove_empty_branches(); + states_[branches[istate]->state_index()].creg() = + branches[istate]->creg(); + // if there are some branches still remaining + if (branches[istate]->num_branches() > 0) { + nbranch += branches[istate]->num_branches(); + break; + } + iadd = 0; + num_add = branches[istate]->additional_ops().size(); + } + } else { + states_[branches[istate]->state_index()].apply_op( + branches[istate]->additional_ops()[iadd], par_results[i], + dummy_rng, false); + } + iadd++; + } + branches[istate]->clear_additional_ops(); + // if there are some branches still remaining + if (branches[istate]->num_branches() > 0) { + nbranch += branches[istate]->num_branches(); + break; + } + } + // then execute ops + if (branches[istate]->op_iterator() != measure_seq) { + if (!branches[istate]->apply_control_flow( + states_[branches[istate]->state_index()].creg(), + measure_seq)) { + if (!branches[istate]->apply_runtime_noise_sampling( + states_[branches[istate]->state_index()].creg(), + *branches[istate]->op_iterator(), noise)) { + if (!apply_branching_op(*branches[istate], + *branches[istate]->op_iterator(), + par_results[i], true)) { + states_[branches[istate]->state_index()].apply_op( + *branches[istate]->op_iterator(), par_results[i], + dummy_rng, true); + } + } + branches[istate]->advance_iterator(); + if (branches[istate]->num_branches() > 0) { + branches[istate]->remove_empty_branches(); + states_[branches[istate]->state_index()].creg() = + branches[istate]->creg(); + + // if there are some branches still remaining + if (branches[istate]->num_branches() > 0) { + nbranch += branches[istate]->num_branches(); + break; + } + } + } + } + } + } + return nbranch; + }; + + // apply ops until some branch operations are executed in some branches + uint_t nbranch = Utils::apply_omp_parallel_for_reduction_int( + (par_shots > 1 && branches.size() > 1 && shot_omp_parallel_), 0, + par_shots, apply_ops_func, par_shots); + + // repeat until new branch is available + if (nbranch > 0) { + uint_t num_states_prev = branches.size(); + for (int_t i = 0; i < num_states_prev; i++) { + // add new branches + if (branches[i]->num_branches() > 0) { + for (int_t j = 0; j < branches[i]->num_branches(); j++) { + if (branches[i]->branches()[j]->num_shots() > 0) { + // add new branched state + uint_t pos = branches.size(); + if (pos >= num_states) { // if there is not enough memory to + // allocate copied state, shots are + // reserved to the next iteration + // reset seed to reproduce same results + for (int_t k = 0; k < branches[i]->branches()[j]->num_shots(); + k++) { + branches[i]->branches()[j]->rng_shots()[k].set_seed( + branches[i] + ->branches()[j] + ->rng_shots()[k] + .initial_seed()); + } + shots_storage.insert( + shots_storage.end(), + branches[i]->branches()[j]->rng_shots().begin(), + 
branches[i]->branches()[j]->rng_shots().end()); + } else { + branches.push_back(branches[i]->branches()[j]); + branches[pos]->state_index() = top_state + pos; + branches[pos]->root_state_index() = + branches[i]->state_index(); + } + } else { + branches[i]->branches()[j].reset(); + } + } + branches[i]->clear_branch(); + } + } + + // copy state to new branch + uint_t num_new_branches = branches.size() - num_states_prev; + auto copy_branch_func = [this, &branches, par_shots, circ, + num_new_branches, num_states_prev](int_t i) { + uint_t pos, pos_end; + pos = num_states_prev + num_new_branches * i / par_shots; + pos_end = num_states_prev + num_new_branches * (i + 1) / par_shots; + for (; pos < pos_end; pos++) { + uint_t istate = branches[pos]->state_index(); + states_[istate].set_parallelization(this->parallel_state_update_); + states_[istate].set_global_phase(circ.global_phase_angle); + states_[istate].enable_density_matrix(!Base::has_statevector_ops_); + states_[istate].qreg().initialize( + states_[branches[pos]->root_state_index()].qreg()); + states_[istate].creg() = branches[pos]->creg(); + } + }; + Utils::apply_omp_parallel_for( + (par_shots > 1 && num_new_branches > 1 && shot_omp_parallel_), 0, + par_shots, copy_branch_func, par_shots); + } + + // check if there are remaining ops + num_active_states = 0; + for (int_t i = 0; i < branches.size(); i++) { + if (branches[i]->op_iterator() != measure_seq || + branches[i]->additional_ops().size() > 0) + num_active_states++; + } + } + + if (can_sample) { + // apply sampling measure for each branch + auto sampling_measure_func = [this, &branches, &par_results, measure_seq, + last, par_shots](int_t i) { + uint_t istate, state_end; + istate = branches.size() * i / par_shots; + state_end = branches.size() * (i + 1) / par_shots; + + for (; istate < state_end; istate++) { + measure_sampler(measure_seq, last, branches[istate]->num_shots(), + *branches[istate], par_results[i], + branches[istate]->rng_shots()); + } + }; + bool can_parallel = par_shots > 1 && branches.size() > 1; +#ifdef AER_CUSTATEVEC + can_parallel &= !Base::cuStateVec_enable_; +#endif + Utils::apply_omp_parallel_for(can_parallel, 0, par_shots, + sampling_measure_func, par_shots); + + result.metadata.add(true, "shot_branching_sampling_enabled"); + } else { + // save cregs to result + auto save_cregs = [this, &branches, &par_results, par_shots](int_t i) { + uint_t istate, state_end; + istate = branches.size() * i / par_shots; + state_end = branches.size() * (i + 1) / par_shots; + + for (; istate < state_end; istate++) { + if (Base::num_process_per_experiment_ > 1) { + for (int_t j = 0; j < branches[istate]->num_shots(); j++) { + cregs_[branches[istate]->shot_index() + j] = + states_[branches[istate]->state_index()].creg(); + } + } else { + std::string memory_hex = + states_[branches[istate]->state_index()].creg().memory_hex(); + for (int_t j = 0; j < branches[istate]->num_shots(); j++) + par_results[i].data.add_accum(static_cast(1ULL), "counts", + memory_hex); + if (Base::save_creg_memory_) { + for (int_t j = 0; j < branches[istate]->num_shots(); j++) + par_results[i].data.add_list(memory_hex, "memory"); + } + } + } + }; + Utils::apply_omp_parallel_for( + (par_shots > 1 && branches.size() > 1 && shot_omp_parallel_), 0, + par_shots, save_cregs, par_shots); + } + + // clear + for (int_t i = 0; i < branches.size(); i++) { + branches[i].reset(); + } + branches.clear(); + } + + for (auto &res : par_results) { + result.combine(std::move(res)); + } +} + +template +template +void 
MultiStateExecutor::measure_sampler(InputIterator first_meas, + InputIterator last_meas, + uint_t shots, Branch &branch, + ExperimentResult &result, + std::vector &rng) { + state_t &state = states_[branch.state_index()]; + // Check if meas_circ is empty, and if so return initial creg + if (first_meas == last_meas) { + for (int_t i = 0; i < shots; i++) { + if (Base::num_process_per_experiment_ > 1) { + cregs_[branch.shot_index() + i] = state.creg(); + } else { + result.save_count_data(state.creg(), Base::save_creg_memory_); + } + } + return; + } + + std::vector meas_ops; + std::vector roerror_ops; + for (auto op = first_meas; op != last_meas; op++) { + if (op->type == Operations::OpType::roerror) { + roerror_ops.push_back(*op); + } else { /*(op.type == Operations::OpType::measure) */ + meas_ops.push_back(*op); + } + } + + // Get measured qubits from circuit sort and delete duplicates + std::vector meas_qubits; // measured qubits + for (const auto &op : meas_ops) { + for (size_t j = 0; j < op.qubits.size(); ++j) + meas_qubits.push_back(op.qubits[j]); + } + sort(meas_qubits.begin(), meas_qubits.end()); + meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()), + meas_qubits.end()); + + // Generate the samples + auto timer_start = myclock_t::now(); + std::vector all_samples; + all_samples = sample_measure(state, meas_qubits, shots, rng); + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); + result.metadata.add(time_taken, "sample_measure_time"); + + // Make qubit map of position in vector of measured qubits + std::unordered_map qubit_map; + for (uint_t j = 0; j < meas_qubits.size(); ++j) { + qubit_map[meas_qubits[j]] = j; + } + + // Maps of memory and register to qubit position + std::map memory_map; + std::map register_map; + for (const auto &op : meas_ops) { + for (size_t j = 0; j < op.qubits.size(); ++j) { + auto pos = qubit_map[op.qubits[j]]; + if (!op.memory.empty()) + memory_map[op.memory[j]] = pos; + if (!op.registers.empty()) + register_map[op.registers[j]] = pos; + } + } + + // Process samples + uint_t num_memory = + (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first; + uint_t num_registers = + (register_map.empty()) ? 
0ULL : 1 + register_map.rbegin()->first;
+  ClassicalRegister creg;
+  for (int_t i = 0; i < all_samples.size(); i++) {
+    creg = state.creg();
+
+    // process memory bit measurements
+    for (const auto &pair : memory_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}),
+                         reg_t({pair.first}), reg_t());
+    }
+    // process register bit measurements
+    for (const auto &pair : register_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
+                         reg_t({pair.first}));
+    }
+
+    // process read out errors for memory and registers
+    for (const Operations::Op &roerror : roerror_ops)
+      creg.apply_roerror(roerror, rng[i]);
+
+    // save creg to gather
+    if (Base::num_process_per_experiment_ > 1) {
+      for (int_t j = 0; j < shots; j++)
+        cregs_[branch.shot_index() + j] = creg;
+    } else {
+      std::string memory_hex = creg.memory_hex();
+      result.data.add_accum(static_cast(1ULL), "counts", memory_hex);
+      if (Base::save_creg_memory_)
+        result.data.add_list(memory_hex, "memory");
+    }
+  }
+}
+
+//-------------------------------------------------------------------------
+} // end namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/parallel_state_executor.hpp b/src/simulators/parallel_state_executor.hpp new file mode 100644 index 0000000000..5e5074449c --- /dev/null +++ b/src/simulators/parallel_state_executor.hpp @@ -0,0 +1,1869 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals. 
+ */
+
+#ifndef _parallel_executor_hpp_
+#define _parallel_executor_hpp_
+
+#include "simulators/multi_state_executor.hpp"
+
+#ifdef _OPENMP
+#include
+#endif
+
+#ifdef AER_MPI
+#include
+#endif
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+//-------------------------------------------------------------------------
+// Parallel executor class implementation
+//-------------------------------------------------------------------------
+template
+class ParallelStateExecutor : public virtual MultiStateExecutor {
+  using Base = MultiStateExecutor;
+
+protected:
+  // extra parameters for parallel simulations
+  uint_t chunk_bits_; // number of qubits per chunk
+
+  bool chunk_omp_parallel_;    // whether to process the loop over chunks with
+                               // thread parallelism
+  bool global_chunk_indexing_; // use global indexing for control qubits and
+                               // diagonal matrices
+
+  reg_t qubit_map_; // qubit map to restore swapped qubits
+
+  bool multi_chunk_swap_enable_ = true; // enable multi-chunk swaps
+  // maximum buffer size in qubits for chunk swap
+  uint_t chunk_swap_buffer_qubits_ = 15;
+  uint_t max_multi_swap_; // maximum number of swaps that can be applied at a
+                          // time, calculated from chunk_swap_buffer_qubits_
+
+  uint_t cache_block_qubit_ = 0;
+
+public:
+  ParallelStateExecutor();
+  virtual ~ParallelStateExecutor();
+
+  size_t required_memory_mb(const Circuit &circuit,
+                            const Noise::NoiseModel &noise) const override {
+    state_t tmp;
+    return tmp.required_memory_mb(circuit.num_qubits, circuit.ops);
+  }
+
+  uint_t get_process_by_chunk(uint_t cid);
+
+protected:
+  void set_config(const Config &config) override;
+
+  virtual uint_t qubit_scale(void) { return 1; }
+
+  bool multiple_chunk_required(const Circuit &circuit,
+                               const Noise::NoiseModel &noise) const;
+
+  // Return cache blocking transpiler pass
+  Transpile::CacheBlocking
+  transpile_cache_blocking(const Circuit &circ, const Noise::NoiseModel &noise,
+                           const Config &config) const;
+
+  bool allocate(uint_t num_qubits, const Config &config);
+  bool allocate_states(uint_t num_shots, const Config &config) override;
+
+  void run_circuit_with_sampling(Circuit &circ, const Config &config,
+                                 RngEngine &init_rng,
+                                 ExperimentResult &result) override;
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  template
+  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
+                       uint_t shots, ExperimentResult &result,
+                       RngEngine &rng) const;
+
+  // apply operations for multi-chunk simulator
+  template
+  void apply_ops_chunks(InputIterator first, InputIterator last,
+                        ExperimentResult &result, RngEngine &rng,
+                        bool final_ops);
+
+  // apply ops on cache memory
+  template
+  void apply_cache_blocking_ops(const int_t iGroup, InputIterator first,
+                                InputIterator last, ExperimentResult &result,
+                                RngEngine &rng);
+
+  // apply parallel operations (implement for each simulation method)
+  virtual bool apply_parallel_op(const Operations::Op &op,
+                                 ExperimentResult &result, RngEngine &rng,
+                                 bool final_op) = 0;
+
+  // store measure to cregs
+  void store_measure(const reg_t &outcome, const reg_t &memory,
+                     const reg_t &registers);
+
+  void apply_bfunc(const Operations::Op &op);
+  void apply_roerror(const Operations::Op &op, RngEngine &rng);
+
+  //-----------------------------------------------------------------------
+  // Initialization
+  //-----------------------------------------------------------------------
+  template
+  void initialize_from_vector(const 
list_t &vec); + + template + void initialize_from_matrix(const list_t &mat); + + // Initializes an n-qubit state to the all |0> state + virtual void initialize_qreg(uint_t num_qubits) = 0; + + //----------------------------------------------------------------------- + // Functions for multi-chunk distribution + //----------------------------------------------------------------------- + // Helper function for computing expectation value + virtual double expval_pauli(const reg_t &qubits, + const std::string &pauli) = 0; + + // Apply a save expectation value instruction + void apply_save_expval(const Operations::Op &op, ExperimentResult &result); + + // Sample n-measurement outcomes without applying the measure operation + // to the system state + virtual std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) const { + std::vector ret; + return ret; + }; + + // swap between chunks + virtual void apply_chunk_swap(const reg_t &qubits); + + // apply multiple swaps between chunks + virtual void apply_multi_chunk_swap(const reg_t &qubits); + + // apply X gate over chunks + virtual void apply_chunk_x(const uint_t qubit); + + // send/receive chunk in receive buffer + void send_chunk(uint_t local_chunk_index, uint_t global_chunk_index); + void recv_chunk(uint_t local_chunk_index, uint_t global_chunk_index); + + template + void send_data(data_t *pSend, uint_t size, uint_t myid, uint_t pairid); + template + void recv_data(data_t *pRecv, uint_t size, uint_t myid, uint_t pairid); + + // reduce values over processes + void reduce_sum(reg_t &sum) const; + void reduce_sum(rvector_t &sum) const; + void reduce_sum(complex_t &sum) const; + void reduce_sum(double &sum) const; + + // gather values on each process + void gather_value(rvector_t &val) const; + + // barrier all processes + void sync_process(void) const; + + // gather distributed state into vector (if memory is enough) + template + void gather_state(std::vector> &state); + + template + void gather_state(AER::Vector> &state); + + // collect matrix over multiple chunks + auto apply_to_matrix(bool copy = false); + + // Apply the global phase + virtual void apply_global_phase(); + + uint_t mapped_index(const uint_t idx); +}; + +template +ParallelStateExecutor::ParallelStateExecutor() { + chunk_omp_parallel_ = false; + global_chunk_indexing_ = false; + chunk_bits_ = 0; + cache_block_qubit_ = 0; +} + +template +ParallelStateExecutor::~ParallelStateExecutor() {} + +template +void ParallelStateExecutor::set_config(const Config &config) { + Base::set_config(config); + + if (config.chunk_swap_buffer_qubits.has_value()) + chunk_swap_buffer_qubits_ = config.chunk_swap_buffer_qubits.value(); + + // enable multiple qregs if cache blocking is enabled + cache_block_qubit_ = 0; + if (config.blocking_qubits.has_value()) + cache_block_qubit_ = config.blocking_qubits.value(); +} + +template +bool ParallelStateExecutor::multiple_chunk_required( + const Circuit &circ, const Noise::NoiseModel &noise) const { + if (circ.num_qubits < 3) + return false; + if (cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits) + return true; + + if (Base::num_process_per_experiment_ == 1 && + Base::sim_device_ == Device::GPU && Base::num_gpus_ > 0) { + return (Base::max_gpu_memory_mb_ / Base::num_gpus_ < + Base::required_memory_mb(circ, noise)); + } + if (Base::num_process_per_experiment_ > 1) { + size_t total_mem = Base::max_memory_mb_; + if (Base::sim_device_ == Device::GPU) + total_mem += Base::max_gpu_memory_mb_; + if (total_mem * 
Base::num_process_per_experiment_ > + Base::required_memory_mb(circ, noise)) + return true; + } + + return false; +} + +template +Transpile::CacheBlocking +ParallelStateExecutor::transpile_cache_blocking( + const Circuit &circ, const Noise::NoiseModel &noise, + const Config &config) const { + Transpile::CacheBlocking cache_block_pass; + + const bool is_matrix = (Base::method_ == Method::density_matrix || + Base::method_ == Method::unitary); + const auto complex_size = (Base::sim_precision_ == Precision::Single) + ? sizeof(std::complex) + : sizeof(std::complex); + + cache_block_pass.set_num_processes(Base::num_process_per_experiment_); + cache_block_pass.set_config(config); + + if (!cache_block_pass.enabled()) { + // if blocking is not set by config, automatically set if required + if (multiple_chunk_required(circ, noise)) { + int nplace = Base::num_process_per_experiment_; + if (Base::sim_device_ == Device::GPU && Base::num_gpus_ > 0) + nplace *= Base::num_gpus_; + cache_block_pass.set_blocking(circ.num_qubits, + Base::get_min_memory_mb() << 20, nplace, + complex_size, is_matrix); + } + } + return cache_block_pass; +} + +template +bool ParallelStateExecutor::allocate(uint_t num_qubits, + const Config &config) { + int_t i; + Base::num_qubits_ = num_qubits; + chunk_bits_ = cache_block_qubit_; + + global_chunk_indexing_ = false; + chunk_omp_parallel_ = false; + if (Base::sim_device_ == Device::GPU) { +#ifdef _OPENMP + if (omp_get_num_threads() == 1) + chunk_omp_parallel_ = true; +#endif + + global_chunk_indexing_ = true; // cuStateVec does not handle global chunk + // index for diagonal matrix +#ifdef AER_CUSTATEVEC + if (!Base::cuStateVec_enable_) + global_chunk_indexing_ = false; +#endif + } else if (Base::sim_device_ == Device::ThrustCPU) { + global_chunk_indexing_ = true; + chunk_omp_parallel_ = false; + } + + allocate_states(Base::num_local_states_, config); + + // initialize qubit map + qubit_map_.resize(Base::num_qubits_); + for (i = 0; i < Base::num_qubits_; i++) { + qubit_map_[i] = i; + } + + if (chunk_bits_ <= chunk_swap_buffer_qubits_ + 1) + multi_chunk_swap_enable_ = false; + else + max_multi_swap_ = chunk_bits_ - chunk_swap_buffer_qubits_; + + return true; +} + +template +bool ParallelStateExecutor::allocate_states(uint_t num_states, + const Config &config) { + int_t i; + bool init_states = true; + bool ret = true; + // deallocate qregs before reallocation + if (Base::states_.size() > 0) { + if (Base::states_.size() == num_states) + init_states = false; // can reuse allocated chunks + else + Base::states_.clear(); + } + if (init_states) { + Base::states_.resize(num_states); + + if (Base::num_creg_memory_ != 0 || Base::num_creg_registers_ != 0) { + for (i = 0; i < num_states; i++) { + // set number of creg bits before actual initialization + Base::states_[i].initialize_creg(Base::num_creg_memory_, + Base::num_creg_registers_); + } + } + uint_t gqubits = Base::num_qubits_ * this->qubit_scale(); + uint_t squbits; + if (chunk_bits_ == 0) + squbits = Base::num_qubits_ * this->qubit_scale(); + else + squbits = chunk_bits_ * this->qubit_scale(); + + // allocate qregs + Base::states_[0].set_config(config); + Base::states_[0].qreg().set_max_matrix_bits(Base::max_matrix_qubits_); + Base::states_[0].qreg().set_num_threads_per_group( + Base::num_threads_per_group_); + Base::states_[0].set_num_global_qubits(Base::num_qubits_); +#ifdef AER_CUSTATEVEC + Base::states_[0].qreg().cuStateVec_enable(Base::cuStateVec_enable_); +#endif + Base::states_[0].qreg().set_target_gpus(Base::target_gpus_); + + 
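// the first state allocates the chunk storage; the remaining states
+    // created in the loop below attach to it through chunk_setup()
+    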
ret &= Base::states_[0].qreg().chunk_setup( + squbits, gqubits, Base::global_state_index_, num_states); + for (i = 1; i < num_states; i++) { + Base::states_[i].set_config(config); + ret &= Base::states_[i].qreg().chunk_setup(Base::states_[0].qreg(), + Base::global_state_index_ + i); + Base::states_[i].qreg().set_num_threads_per_group( + Base::num_threads_per_group_); + Base::states_[i].set_num_global_qubits(Base::num_qubits_); + } + } + Base::num_active_states_ = num_states; + + // initialize groups + Base::top_state_of_group_.clear(); + Base::num_groups_ = 0; + for (i = 0; i < num_states; i++) { + if (Base::states_[i].qreg().top_of_group()) { + Base::top_state_of_group_.push_back(i); + Base::num_groups_++; + } + } + Base::top_state_of_group_.push_back(num_states); + Base::num_states_in_group_.resize(Base::num_groups_); + for (i = 0; i < Base::num_groups_; i++) { + Base::num_states_in_group_[i] = + Base::top_state_of_group_[i + 1] - Base::top_state_of_group_[i]; + } + return ret; +} + +template +uint_t ParallelStateExecutor::get_process_by_chunk(uint_t cid) { + uint_t i; + for (i = 0; i < Base::distributed_procs_; i++) { + if (cid >= Base::state_index_begin_[i] && cid < Base::state_index_end_[i]) { + return i; + } + } + return Base::distributed_procs_; +} + +template +uint_t ParallelStateExecutor::mapped_index(const uint_t idx) { + uint_t i, ret = 0; + uint_t t = idx; + + for (i = 0; i < Base::num_qubits_; i++) { + if (t & 1) { + ret |= (1ull << qubit_map_[i]); + } + t >>= 1; + } + return ret; +} + +template +void ParallelStateExecutor::run_circuit_with_sampling( + Circuit &circ, const Config &config, RngEngine &init_rng, + ExperimentResult &result) { + + // Optimize circuit + Noise::NoiseModel dummy_noise; + state_t dummy_state; + + bool cache_block = false; + if (multiple_chunk_required(circ, dummy_noise)) { + auto fusion_pass = Base::transpile_fusion(circ.opset(), config); + fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(), + result); + + // Cache blocking pass + auto cache_block_pass = transpile_cache_blocking(circ, dummy_noise, config); + cache_block_pass.set_sample_measure(true); + cache_block_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(), + result); + cache_block = cache_block_pass.enabled(); + } + if (!cache_block) { + return Executor::run_circuit_with_sampling(circ, config, init_rng, + result); + } + Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ); + + uint_t nchunks = + 1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale()); + Base::set_distribution(nchunks); + allocate(circ.num_qubits, config); + // Set state config + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].set_parallelization(Base::parallel_state_update_); + Base::states_[i].set_global_phase(circ.global_phase_angle); + } + Base::set_global_phase(circ.global_phase_angle); + + // run with multi-chunks + RngEngine rng = init_rng; + + auto &ops = circ.ops; + auto first_meas = circ.first_measure_pos; // Position of first measurement op + bool final_ops = (first_meas == ops.size()); + + initialize_qreg(circ.num_qubits); + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers); + } + + // Run circuit instructions before first measure + apply_ops_chunks(ops.cbegin(), ops.cbegin() + first_meas, result, rng, + final_ops); + + // Get measurement operations and set of measured qubits + measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots, + result, rng); + + // Add 
measure sampling metadata + result.metadata.add(true, "measure_sampling"); + Base::states_[0].add_metadata(result); +} + +template +void ParallelStateExecutor::run_circuit_shots( + Circuit &circ, const Noise::NoiseModel &noise, const Config &config, + RngEngine &init_rng, ExperimentResult &result, bool sample_noise) { + + if (!multiple_chunk_required(circ, noise)) { + return Base::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } + + uint_t nchunks = + 1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale()); + Base::set_distribution(nchunks); + + auto fusion_pass = Base::transpile_fusion(circ.opset(), config); + auto cache_block_pass = transpile_cache_blocking(circ, noise, config); + + for (int_t ishot = 0; ishot < circ.shots; ishot++) { + RngEngine rng; + if (ishot == 0) + rng = init_rng; + else + rng.set_seed(circ.seed + ishot); + + // Optimize circuit + Noise::NoiseModel dummy_noise; + state_t dummy_state; + + Circuit circ_opt; + if (sample_noise) { + circ_opt = noise.sample_noise(circ, rng); + } else { + circ_opt = circ; + } + fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(), + result); + Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt); + + // Cache blocking pass + cache_block_pass.set_sample_measure(false); + cache_block_pass.optimize_circuit(circ_opt, dummy_noise, + dummy_state.opset(), result); + allocate(circ.num_qubits, config); + + // Set state config + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].set_parallelization(Base::parallel_state_update_); + Base::states_[i].set_global_phase(circ.global_phase_angle); + } + Base::set_global_phase(circ.global_phase_angle); + + initialize_qreg(circ.num_qubits); + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers); + } + + apply_ops_chunks(circ_opt.ops.cbegin(), circ_opt.ops.cend(), result, rng, + true); + result.save_count_data(Base::states_[0].creg(), Base::save_creg_memory_); + } + Base::states_[0].add_metadata(result); +} + +template +template +void ParallelStateExecutor::measure_sampler(InputIterator first_meas, + InputIterator last_meas, + uint_t shots, + ExperimentResult &result, + RngEngine &rng) const { + // Check if meas_circ is empty, and if so return initial creg + if (first_meas == last_meas) { + while (shots-- > 0) { + result.save_count_data(Base::states_[0].creg(), Base::save_creg_memory_); + } + return; + } + + std::vector meas_ops; + std::vector roerror_ops; + for (auto op = first_meas; op != last_meas; op++) { + if (op->type == Operations::OpType::roerror) { + roerror_ops.push_back(*op); + } else { /*(op.type == Operations::OpType::measure) */ + meas_ops.push_back(*op); + } + } + + // Get measured qubits from circuit sort and delete duplicates + std::vector meas_qubits; // measured qubits + for (const auto &op : meas_ops) { + for (size_t j = 0; j < op.qubits.size(); ++j) + meas_qubits.push_back(op.qubits[j]); + } + sort(meas_qubits.begin(), meas_qubits.end()); + meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()), + meas_qubits.end()); + + // Generate the samples + auto timer_start = myclock_t::now(); + auto all_samples = sample_measure(meas_qubits, shots, rng); + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); + result.metadata.add(time_taken, "sample_measure_time"); + + // Make qubit map of position in vector of measured qubits + std::unordered_map qubit_map; + for (uint_t j = 0; j < meas_qubits.size(); 
++j) {
+    qubit_map[meas_qubits[j]] = j;
+  }
+
+  // Maps of memory and register to qubit position
+  std::map memory_map;
+  std::map register_map;
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j) {
+      auto pos = qubit_map[op.qubits[j]];
+      if (!op.memory.empty())
+        memory_map[op.memory[j]] = pos;
+      if (!op.registers.empty())
+        register_map[op.registers[j]] = pos;
+    }
+  }
+
+  // Process samples
+  uint_t num_memory =
+      (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
+  uint_t num_registers =
+      (register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
+  ClassicalRegister creg;
+  while (!all_samples.empty()) {
+    auto sample = all_samples.back();
+    creg.initialize(num_memory, num_registers);
+
+    // process memory bit measurements
+    for (const auto &pair : memory_map) {
+      creg.store_measure(reg_t({sample[pair.second]}), reg_t({pair.first}),
+                         reg_t());
+    }
+    // process register bit measurements
+    for (const auto &pair : register_map) {
+      creg.store_measure(reg_t({sample[pair.second]}), reg_t(),
+                         reg_t({pair.first}));
+    }
+
+    // process read out errors for memory and registers
+    for (const Operations::Op &roerror : roerror_ops) {
+      creg.apply_roerror(roerror, rng);
+    }
+
+    // Save count data
+    result.save_count_data(creg, Base::save_creg_memory_);
+
+    // pop off processed sample
+    all_samples.pop_back();
+  }
+}
+
+template
+void ParallelStateExecutor::store_measure(const reg_t &outcome,
+                                          const reg_t &memory,
+                                          const reg_t &registers) {
+  auto apply_store_measure = [this, outcome, memory, registers](int_t iGroup) {
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if (Base::cuStateVec_enable_) {
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for (int_t i = 0; i < nChunk; i++)
+      Base::states_[iChunk + i].creg().store_measure(outcome, memory,
+                                                     registers);
+  };
+  Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
+                                0, Base::num_groups_, apply_store_measure);
+}
+
+template
+void ParallelStateExecutor::apply_bfunc(const Operations::Op &op) {
+  auto bfunc_kernel = [this, op](int_t iGroup) {
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if (Base::cuStateVec_enable_) {
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for (int_t i = 0; i < nChunk; i++)
+      Base::states_[iChunk + i].creg().apply_bfunc(op);
+  };
+  Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
+                                0, Base::num_groups_, bfunc_kernel);
+}
+
+template
+void ParallelStateExecutor::apply_roerror(const Operations::Op &op,
+                                          RngEngine &rng) {
+  auto roerror_kernel = [this, op, &rng](int_t iGroup) {
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if (Base::cuStateVec_enable_) {
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for (int_t i = 0; i < nChunk; i++)
+      Base::states_[iChunk + i].creg().apply_roerror(op, rng);
+  };
+  Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
+                                0, Base::num_groups_, roerror_kernel);
+}
+
+template
+template
+void ParallelStateExecutor::apply_ops_chunks(InputIterator first,
+                                             InputIterator last,
+                                             ExperimentResult &result,
+                                             RngEngine &rng,
+                                             bool final_ops) {
+  uint_t iOp, nOp;
+  reg_t multi_swap;
+
+  nOp = std::distance(first, last);
+  iOp = 0;
+
+  while (iOp < nOp) {
+    const Operations::Op op_iOp = *(first + iOp);
+    if (op_iOp.type == Operations::OpType::gate &&
+        op_iOp.name == "swap_chunk") {
+      // 
apply swap between chunks + if (multi_chunk_swap_enable_ && op_iOp.qubits[0] < chunk_bits_ && + op_iOp.qubits[1] >= chunk_bits_) { + if (Base::distributed_proc_bits_ < 0 || + (op_iOp.qubits[1] >= + (Base::num_qubits_ * qubit_scale() - + Base::distributed_proc_bits_))) { // apply multi-swap when swap is + // cross + // qubits + multi_swap.push_back(op_iOp.qubits[0]); + multi_swap.push_back(op_iOp.qubits[1]); + if (multi_swap.size() >= max_multi_swap_ * 2) { + apply_multi_chunk_swap(multi_swap); + multi_swap.clear(); + } + } else + apply_chunk_swap(op_iOp.qubits); + } else { + if (multi_swap.size() > 0) { + apply_multi_chunk_swap(multi_swap); + multi_swap.clear(); + } + apply_chunk_swap(op_iOp.qubits); + } + iOp++; + continue; + } else if (multi_swap.size() > 0) { + apply_multi_chunk_swap(multi_swap); + multi_swap.clear(); + } + + if (op_iOp.type == Operations::OpType::sim_op && + op_iOp.name == "begin_blocking") { + // applying sequence of gates inside each chunk + + uint_t iOpEnd = iOp; + while (iOpEnd < nOp) { + const Operations::Op op_iOpEnd = *(first + iOpEnd); + if (op_iOpEnd.type == Operations::OpType::sim_op && + op_iOpEnd.name == "end_blocking") { + break; + } + iOpEnd++; + } + + uint_t iOpBegin = iOp + 1; + if (Base::num_groups_ > 1 && chunk_omp_parallel_) { +#pragma omp parallel for num_threads(Base::num_groups_) + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, + rng); + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, + rng); + } + iOp = iOpEnd; + } else { + if (!apply_parallel_op(op_iOp, result, rng, + final_ops && nOp == iOp + 1)) { + if (Base::num_groups_ > 1 && chunk_omp_parallel_) { +#pragma omp parallel for num_threads(Base::num_groups_) + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, + rng); + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, + rng); + } + } + } + iOp++; + } + + if (multi_swap.size() > 0) + apply_multi_chunk_swap(multi_swap); + + if (Base::num_groups_ > 1 && chunk_omp_parallel_) { +#pragma omp parallel for num_threads(Base::num_groups_) + for (int_t ig = 0; ig < Base::num_groups_; ig++) + Base::states_[Base::top_state_of_group_[ig]].qreg().synchronize(); + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) + Base::states_[Base::top_state_of_group_[ig]].qreg().synchronize(); + } + + if (Base::sim_device_ == Device::GPU) { +#ifdef AER_THRUST_CUDA + int nDev; + if (cudaGetDeviceCount(&nDev) != cudaSuccess) { + cudaGetLastError(); + nDev = 0; + } + if (nDev > Base::num_groups_) + nDev = Base::num_groups_; + result.metadata.add(nDev, "cacheblocking", "chunk_parallel_gpus"); +#endif + } + +#ifdef AER_MPI + result.metadata.add(multi_chunk_swap_enable_, "cacheblocking", + "multiple_chunk_swaps_enable"); + if (multi_chunk_swap_enable_) { + result.metadata.add(chunk_swap_buffer_qubits_, "cacheblocking", + "multiple_chunk_swaps_buffer_qubits"); + result.metadata.add(max_multi_swap_, "cacheblocking", + "max_multiple_chunk_swaps"); + } +#endif +} + +template +template +void ParallelStateExecutor::apply_cache_blocking_ops( + const int_t iGroup, InputIterator first, InputIterator last, + ExperimentResult &result, RngEngine &rng) { + // for each chunk in group + for (int_t iChunk = Base::top_state_of_group_[iGroup]; + iChunk < Base::top_state_of_group_[iGroup 
+ 1]; iChunk++) {
+    // fetch chunk in cache
+    if (Base::states_[iChunk].qreg().fetch_chunk()) {
+      Base::states_[iChunk].apply_ops(first, last, result, rng, false);
+
+      // release chunk from cache
+      Base::states_[iChunk].qreg().release_chunk();
+    }
+  }
+}
+
+template
+template
+void ParallelStateExecutor::initialize_from_vector(const list_t &vec) {
+  int_t iChunk;
+
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for private(iChunk)
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) {
+        list_t tmp(1ull << (chunk_bits_ * qubit_scale()));
+        for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+          tmp[i] = vec[((Base::global_state_index_ + iChunk)
+                        << (chunk_bits_ * qubit_scale())) +
+                       i];
+        }
+        Base::states_[iChunk].qreg().initialize_from_vector(tmp);
+      }
+    }
+  } else {
+    for (iChunk = 0; iChunk < Base::num_local_states_; iChunk++) {
+      list_t tmp(1ull << (chunk_bits_ * qubit_scale()));
+      for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+        tmp[i] = vec[((Base::global_state_index_ + iChunk)
+                      << (chunk_bits_ * qubit_scale())) +
+                     i];
+      }
+      Base::states_[iChunk].qreg().initialize_from_vector(tmp);
+    }
+  }
+}
+
+template
+template
+void ParallelStateExecutor::initialize_from_matrix(const list_t &mat) {
+  int_t iChunk;
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for private(iChunk)
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) {
+        list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_));
+        uint_t irow_chunk = ((iChunk + Base::global_state_index_) >>
+                             ((Base::num_qubits_ - chunk_bits_)))
+                            << (chunk_bits_);
+        uint_t icol_chunk =
+            ((iChunk + Base::global_state_index_) &
+             ((1ull << ((Base::num_qubits_ - chunk_bits_))) - 1))
+            << (chunk_bits_);
+
+        // copy part of state for this chunk
+        uint_t i, row, col;
+        for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+          uint_t icol = i & ((1ull << chunk_bits_) - 1);
+          uint_t irow = i >> chunk_bits_;
+          tmp[i] = mat[icol_chunk + icol +
+                       ((irow_chunk + irow) << Base::num_qubits_)];
+        }
+        Base::states_[iChunk].qreg().initialize_from_matrix(tmp);
+      }
+    }
+  } else {
+    for (iChunk = 0; iChunk < Base::num_local_states_; iChunk++) {
+      list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_));
+      uint_t irow_chunk = ((iChunk + Base::global_state_index_) >>
+                           ((Base::num_qubits_ - chunk_bits_)))
+                          << (chunk_bits_);
+      uint_t icol_chunk = ((iChunk + Base::global_state_index_) &
+                           ((1ull << ((Base::num_qubits_ - chunk_bits_))) - 1))
+                          << (chunk_bits_);
+
+      // copy part of state for this chunk
+      uint_t i, row, col;
+      for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+        uint_t icol = i & ((1ull << chunk_bits_) - 1);
+        uint_t irow = i >> chunk_bits_;
+        tmp[i] =
+            mat[icol_chunk + icol + ((irow_chunk + irow) << Base::num_qubits_)];
+      }
+      Base::states_[iChunk].qreg().initialize_from_matrix(tmp);
+    }
+  }
+}
+
+template
+auto ParallelStateExecutor::apply_to_matrix(bool copy) {
+  // this function is used to collect states over chunks
+  int_t iChunk;
+  uint_t size = 1ull << (chunk_bits_ * qubit_scale());
+  uint_t mask = (1ull << (chunk_bits_)) - 1;
+  uint_t num_threads = Base::states_[0].qreg().get_omp_threads();
+
+  size_t size_required =
+      2 * (sizeof(std::complex) << (Base::num_qubits_ * 2)) +
+      (sizeof(std::complex) << (chunk_bits_ * 2)) *
+          
Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store states as matrix")); + } + + auto matrix = Base::states_[0].qreg().copy_to_matrix(); + + if (Base::distributed_rank_ == 0) { + matrix.resize(1ull << (Base::num_qubits_), 1ull << (Base::num_qubits_)); + + auto tmp = Base::states_[0].qreg().copy_to_matrix(); + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + int_t i; + uint_t irow_chunk = (iChunk >> ((Base::num_qubits_ - chunk_bits_))) + << chunk_bits_; + uint_t icol_chunk = + (iChunk & ((1ull << ((Base::num_qubits_ - chunk_bits_))) - 1)) + << chunk_bits_; + + if (iChunk < Base::num_local_states_) { + if (copy) + tmp = Base::states_[iChunk].qreg().copy_to_matrix(); + else + tmp = Base::states_[iChunk].qreg().move_to_matrix(); + } +#ifdef AER_MPI + else + recv_data(tmp.data(), size, 0, iChunk); +#endif +#pragma omp parallel for if (num_threads > 1) num_threads(num_threads) + for (i = 0; i < size; i++) { + uint_t irow = i >> (chunk_bits_); + uint_t icol = i & mask; + uint_t idx = + ((irow + irow_chunk) << (Base::num_qubits_)) + icol_chunk + icol; + matrix[idx] = tmp[i]; + } + } + } else { +#ifdef AER_MPI + // send matrices to process 0 + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + uint_t iProc = get_process_by_chunk(iChunk); + if (iProc == Base::distributed_rank_) { + if (copy) { + auto tmp = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .copy_to_matrix(); + send_data(tmp.data(), size, iChunk, 0); + } else { + auto tmp = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .move_to_matrix(); + send_data(tmp.data(), size, iChunk, 0); + } + } + } +#endif + } + + return matrix; +} + +template +void ParallelStateExecutor::apply_save_expval( + const Operations::Op &op, ExperimentResult &result) { + // Check empty edge case + if (op.expval_params.empty()) { + throw std::invalid_argument( + "Invalid save expval instruction (Pauli components are empty)."); + } + bool variance = (op.type == Operations::OpType::save_expval_var); + + // Accumulate expval components + double expval(0.); + double sq_expval(0.); + + for (const auto ¶m : op.expval_params) { + // param is tuple (pauli, coeff, sq_coeff) + const auto val = expval_pauli(op.qubits, std::get<0>(param)); + expval += std::get<1>(param) * val; + if (variance) { + sq_expval += std::get<2>(param) * val; + } + } + if (variance) { + std::vector expval_var(2); + expval_var[0] = expval; // mean + expval_var[1] = sq_expval - expval * expval; // variance + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + expval_var, op.type, op.save_type); + } else { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + expval, op.type, op.save_type); + } +} + +template +void ParallelStateExecutor::apply_global_phase() { + if (Base::has_global_phase_) { + if (chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) + Base::states_[iChunk].qreg().apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } +} + +template +void ParallelStateExecutor::apply_chunk_swap(const reg_t &qubits) { + 
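+  // Overview of the cases handled below: the last two entries of `qubits`
+  // are the swapped pair (normalized so that q0 < q1).
+  //  1. q1 inside the chunk: apply mcswap locally on every chunk.
+  //  2. swap crosses the chunk boundary but no process boundary: pair up
+  //     chunks whose indices differ in the out-of-chunk bit(s) and swap
+  //     amplitudes directly between the paired chunks.
+  //  3. swap crosses processes: enumerate chunk pairs with a mixed-radix
+  //     counter, exchange buffers with MPI_Isend/MPI_Irecv, then finish
+  //     the swap from the receive buffer.
+  // Illustration (statevector, qubit_scale() == 1): num_qubits_ = 30 and
+  // chunk_bits_ = 20 give 2^10 chunks; swapping q0 = 5 with q1 = 25 pairs
+  // each chunk ic having chunk-index bit 5 clear with chunk ic | (1 << 5).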
+  uint_t nLarge = 1;
+  uint_t q0, q1;
+  int_t iChunk;
+
+  q0 = qubits[qubits.size() - 2];
+  q1 = qubits[qubits.size() - 1];
+
+  if (qubit_scale() == 1) {
+    std::swap(qubit_map_[q0], qubit_map_[q1]);
+  }
+
+  if (q0 > q1) {
+    std::swap(q0, q1);
+  }
+
+  if (q1 < chunk_bits_ * qubit_scale()) {
+    // inside chunk
+    if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for num_threads(Base::num_groups_)
+      for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+        for (int_t iChunk = Base::top_state_of_group_[ig];
+             iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+          Base::states_[iChunk].qreg().apply_mcswap(qubits);
+      }
+    } else {
+      for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+        for (int_t iChunk = Base::top_state_of_group_[ig];
+             iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+          Base::states_[iChunk].qreg().apply_mcswap(qubits);
+      }
+    }
+  } else { // swap over chunks
+    uint_t mask0, mask1;
+
+    mask0 = (1ull << q0);
+    mask1 = (1ull << q1);
+    mask0 >>= (chunk_bits_ * qubit_scale());
+    mask1 >>= (chunk_bits_ * qubit_scale());
+
+    if (Base::distributed_procs_ == 1 ||
+        (Base::distributed_proc_bits_ >= 0 &&
+         q1 < (Base::num_qubits_ * qubit_scale() -
+               Base::distributed_proc_bits_))) { // no data transfer between
+                                                 // processes is needed
+      auto apply_chunk_swap_1qubit = [this, mask1, qubits](int_t iGroup) {
+        for (int_t ic = Base::top_state_of_group_[iGroup];
+             ic < Base::top_state_of_group_[iGroup + 1]; ic++) {
+          uint_t baseChunk;
+          baseChunk = ic & (~mask1);
+          if (ic == baseChunk)
+            Base::states_[ic].qreg().apply_chunk_swap(
+                qubits, Base::states_[ic | mask1].qreg(), true);
+        }
+      };
+      auto apply_chunk_swap_2qubits = [this, mask0, mask1,
+                                       qubits](int_t iGroup) {
+        for (int_t ic = Base::top_state_of_group_[iGroup];
+             ic < Base::top_state_of_group_[iGroup + 1]; ic++) {
+          uint_t baseChunk;
+          baseChunk = ic & (~(mask0 | mask1));
+          uint_t iChunk1 = baseChunk | mask0;
+          uint_t iChunk2 = baseChunk | mask1;
+          if (ic == iChunk1)
+            Base::states_[iChunk1].qreg().apply_chunk_swap(
+                qubits, Base::states_[iChunk2].qreg(), true);
+        }
+      };
+      if (q0 < chunk_bits_ * qubit_scale())
+        Utils::apply_omp_parallel_for(
+            (chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
+            Base::num_groups_, apply_chunk_swap_1qubit);
+      else
+        Utils::apply_omp_parallel_for(
+            (chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
+            Base::num_groups_, apply_chunk_swap_2qubits);
+    }
+#ifdef AER_MPI
+    else {
+      int_t iPair;
+      uint_t nPair;
+      uint_t baseChunk, iChunk1, iChunk2;
+
+      if (q0 < chunk_bits_ * qubit_scale())
+        nLarge = 1;
+      else
+        nLarge = 2;
+
+      // chunk scheduler that supports any number of processes
+      uint_t nu[3];
+      uint_t ub[3];
+      uint_t iu[3];
+      uint_t add;
+      uint_t iLocalChunk, iRemoteChunk, iProc;
+      int i;
+
+      if (q0 < chunk_bits_ * qubit_scale()) {
+        nLarge = 1;
+        nu[0] = 1ull << (q1 - chunk_bits_ * qubit_scale());
+        ub[0] = 0;
+        iu[0] = 0;
+
+        nu[1] = 1ull << (Base::num_qubits_ * qubit_scale() - q1 - 1);
+        ub[1] = (q1 - chunk_bits_ * qubit_scale()) + 1;
+        iu[1] = 0;
+      } else {
+        nLarge = 2;
+        nu[0] = 1ull << (q0 - chunk_bits_ * qubit_scale());
+        ub[0] = 0;
+        iu[0] = 0;
+
+        nu[1] = 1ull << (q1 - q0 - 1);
+        ub[1] = (q0 - chunk_bits_ * qubit_scale()) + 1;
+        iu[1] = 0;
+
+        nu[2] = 1ull << (Base::num_qubits_ * qubit_scale() - q1 - 1);
+        ub[2] = (q1 - chunk_bits_ * qubit_scale()) + 1;
+        iu[2] = 0;
+      }
+      nPair = 1ull << (Base::num_qubits_ * qubit_scale() -
+                       chunk_bits_ * qubit_scale() - nLarge);
+
+      for (iPair = 0; iPair < nPair; iPair++) {
+        // calculate index of pair of chunks
+        baseChunk = 0;
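+        // nu[]/ub[]/iu[] form a mixed-radix counter over the chunk-index
+        // bits that are not being swapped: digit iu[i] runs from 0 to
+        // nu[i]-1 at bit offset ub[i], so baseChunk = sum_i(iu[i] << ub[i])
+        // visits every chunk index with the swapped bit(s) cleared; the
+        // loop below rebuilds baseChunk from the digits and then advances
+        // the counter with a carry flag (add).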
+        add = 1;
+        for (i = nLarge; i >= 0; i--) {
+          baseChunk += (iu[i] << ub[i]);
+          // update for next
+          iu[i] += add;
+          add = 0;
+          if (iu[i] >= nu[i]) {
+            iu[i] = 0;
+            add = 1;
+          }
+        }
+
+        iChunk1 = baseChunk | mask0;
+        iChunk2 = baseChunk | mask1;
+
+        if (iChunk1 >= Base::state_index_begin_[Base::distributed_rank_] &&
+            iChunk1 < Base::state_index_end_
+                          [Base::distributed_rank_]) { // chunk1 is on
+                                                       // this process
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            Base::states_[iChunk1 - Base::global_state_index_]
+                .qreg()
+                .apply_chunk_swap(
+                    qubits,
+                    Base::states_[iChunk2 - Base::global_state_index_].qreg(),
+                    true);
+            continue;
+          } else {
+            iLocalChunk = iChunk1;
+            iRemoteChunk = iChunk2;
+            iProc = get_process_by_chunk(iChunk2);
+          }
+        } else {
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            iLocalChunk = iChunk2;
+            iRemoteChunk = iChunk1;
+            iProc = get_process_by_chunk(iChunk1);
+          } else {
+            continue; // there is no chunk for this pair on this process
+          }
+        }
+
+        MPI_Request reqSend, reqRecv;
+        MPI_Status st;
+        uint_t sizeRecv, sizeSend;
+
+        auto pRecv = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .recv_buffer(sizeRecv);
+        MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqRecv);
+
+        auto pSend = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .send_buffer(sizeSend);
+        MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqSend);
+
+        MPI_Wait(&reqSend, &st);
+        MPI_Wait(&reqRecv, &st);
+
+        Base::states_[iLocalChunk - Base::global_state_index_]
+            .qreg()
+            .apply_chunk_swap(qubits, iRemoteChunk);
+      }
+    }
+#endif
+  }
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::apply_multi_chunk_swap(
+    const reg_t &qubits) {
+  int_t nswap = qubits.size() / 2;
+  reg_t chunk_shuffle_qubits(nswap, 0);
+  reg_t local_swaps;
+  uint_t baseChunk = 0;
+  uint_t nchunk = 1ull << nswap;
+  reg_t chunk_procs(nchunk);
+  reg_t chunk_offset(nchunk);
+
+  if (qubit_scale() == 1) {
+    for (int_t i = 0; i < nswap; i++)
+      std::swap(qubit_map_[qubits[i * 2]], qubit_map_[qubits[i * 2] + 1]);
+  }
+
+  // define local swaps
+  for (int_t i = 0; i < nswap; i++) {
+    if (qubits[i * 2] >= chunk_bits_ * qubit_scale() - nswap) // no swap
+                                                              // required
+      chunk_shuffle_qubits[qubits[i * 2] + nswap -
+                           chunk_bits_ * qubit_scale()] = qubits[i * 2 + 1];
+  }
+  int_t pos = 0;
+  for (int_t i = 0; i < nswap; i++) {
+    if (qubits[i * 2] <
+        chunk_bits_ * qubit_scale() - nswap) { // local swap required
+      // find empty position
+      while (pos < nswap) {
+        if (chunk_shuffle_qubits[pos] < chunk_bits_ * qubit_scale()) {
+          chunk_shuffle_qubits[pos] = qubits[i * 2 + 1];
+          local_swaps.push_back(qubits[i * 2]);
+          local_swaps.push_back(chunk_bits_ * qubit_scale() - nswap + pos);
+          pos++;
+          break;
+        }
+        pos++;
+      }
+    }
+  }
+  for (int_t i = 0; i < nswap; i++)
+    chunk_shuffle_qubits[i] -= chunk_bits_ * qubit_scale();
+
+  // swap inside chunks to prepare for all-to-all shuffle
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  } else {
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  }
+
+  // apply all-to-all chunk shuffle
+  int_t nPair;
+  reg_t chunk_shuffle_qubits_sorted = chunk_shuffle_qubits;
+  std::sort(chunk_shuffle_qubits_sorted.begin(),
+            chunk_shuffle_qubits_sorted.end());
+
+  nPair = Base::num_global_states_ >> nswap;
+
+  for (uint_t i = 0; i < nchunk; i++) {
+    chunk_offset[i] = 0;
+    for (uint_t k = 0; k < nswap; k++) {
+      if (((i >> k) & 1) != 0)
+        chunk_offset[i] += (1ull << chunk_shuffle_qubits[k]);
+    }
+  }
+
+#ifdef AER_MPI
+  std::vector<MPI_Request> reqSend(nchunk);
+  std::vector<MPI_Request> reqRecv(nchunk);
+#endif
+
+  for (int_t iPair = 0; iPair < nPair; iPair++) {
+    uint_t i1, i2, k, ii, t;
+    baseChunk = 0;
+    ii = iPair;
+    for (k = 0; k < nswap; k++) {
+      t = ii & ((1ull << chunk_shuffle_qubits_sorted[k]) - 1);
+      baseChunk += t;
+      ii = (ii - t) << 1;
+    }
+    baseChunk += ii;
+
+    for (i1 = 0; i1 < nchunk; i1++) {
+      chunk_procs[i1] = get_process_by_chunk(baseChunk + chunk_offset[i1]);
+    }
+
+    // all-to-all
+    // send data
+    for (uint_t iswap = 1; iswap < nchunk; iswap++) {
+      uint_t sizeRecv, sizeSend;
+      uint_t num_local_swap = 0;
+      for (i1 = 0; i1 < nchunk; i1++) {
+        i2 = i1 ^ iswap;
+        if (i1 >= i2)
+          continue;
+
+        uint_t iProc1 = chunk_procs[i1];
+        uint_t iProc2 = chunk_procs[i2];
+        if (iProc1 != Base::distributed_rank_ &&
+            iProc2 != Base::distributed_rank_)
+          continue;
+        if (iProc1 == iProc2) { // on the same process
+          num_local_swap++;
+          continue; // swap while data is exchanged between processes
+        }
+#ifdef AER_MPI
+        uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap);
+        uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap);
+        uint_t iChunk1 =
+            baseChunk + chunk_offset[i1] - Base::global_state_index_;
+        uint_t iChunk2 =
+            baseChunk + chunk_offset[i2] - Base::global_state_index_;
+
+        int_t tid = (iPair << nswap) + iswap;
+
+        if (iProc1 == Base::distributed_rank_) {
+          auto pRecv = Base::states_[iChunk1].qreg().recv_buffer(sizeRecv);
+          MPI_Irecv(pRecv + offset2, (sizeRecv >> nswap), MPI_BYTE, iProc2,
+                    tid, Base::distributed_comm_, &reqRecv[i2]);
+
+          auto pSend = Base::states_[iChunk1].qreg().send_buffer(sizeSend);
+          MPI_Isend(pSend + offset2, (sizeSend >> nswap), MPI_BYTE, iProc2,
+                    tid, Base::distributed_comm_, &reqSend[i2]);
+        } else {
+          auto pRecv = Base::states_[iChunk2].qreg().recv_buffer(sizeRecv);
+          MPI_Irecv(pRecv + offset1, (sizeRecv >> nswap), MPI_BYTE, iProc1,
+                    tid, Base::distributed_comm_, &reqRecv[i1]);
+
+          auto pSend = Base::states_[iChunk2].qreg().send_buffer(sizeSend);
+          MPI_Isend(pSend + offset1, (sizeSend >> nswap), MPI_BYTE, iProc1,
+                    tid, Base::distributed_comm_, &reqSend[i1]);
+        }
+#endif
+      }
+
+      // swaps inside process
+      if (num_local_swap > 0) {
+        for (i1 = 0; i1 < nchunk; i1++) {
+          i2 = i1 ^ iswap;
+          if (i1 > i2)
+            continue;
+
+          uint_t iProc1 = chunk_procs[i1];
+          uint_t iProc2 = chunk_procs[i2];
+          if (iProc1 != Base::distributed_rank_ &&
+              iProc2 != Base::distributed_rank_)
+            continue;
+          if (iProc1 == iProc2) { // on the same process
+            uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap);
+            uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap);
+            uint_t iChunk1 =
+                baseChunk + chunk_offset[i1] - Base::global_state_index_;
+            uint_t iChunk2 =
+                baseChunk + chunk_offset[i2] - Base::global_state_index_;
+            Base::states_[iChunk1].qreg().apply_chunk_swap(
+                Base::states_[iChunk2].qreg(), offset2, offset1,
+                (1ull << (chunk_bits_ * qubit_scale() - nswap)));
+          }
+        }
+      }
+
+#ifdef AER_MPI
+      // recv data
+      for (i1 = 0; i1 < nchunk; i1++) {
+        i2 = i1 ^ iswap;
+
+        uint_t iProc1 = chunk_procs[i1];
+        uint_t iProc2 = chunk_procs[i2];
+        if (iProc1 != Base::distributed_rank_)
+          continue;
+        if (iProc1 == iProc2) { // on the same process
+          continue;
+        }
+        uint_t iChunk1 =
+            baseChunk + chunk_offset[i1] - Base::global_state_index_;
+        uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap);
+
+        MPI_Status st;
+        MPI_Wait(&reqSend[i2], &st);
+        MPI_Wait(&reqRecv[i2], &st);
+
+        // copy states from recv buffer to chunk
+        Base::states_[iChunk1].qreg().apply_chunk_swap(
+            Base::states_[iChunk1].qreg(), offset2, offset2,
+            (1ull << (chunk_bits_ * qubit_scale() - nswap)));
+      }
+#endif
+    }
+  }
+
+  // restore qubits order
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  } else {
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  }
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::apply_chunk_x(const uint_t qubit) {
+  int_t iChunk;
+  uint_t nLarge = 1;
+
+  if (qubit < chunk_bits_ * qubit_scale()) {
+    auto apply_mcx = [this, qubit](int_t ig) {
+      reg_t qubits(1, qubit);
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_mcx(qubits);
+    };
+    Utils::apply_omp_parallel_for(
+        (chunk_omp_parallel_ && Base::num_groups_ > 1), 0, Base::num_groups_,
+        apply_mcx);
+  } else { // exchange over chunks
+    int_t iPair;
+    uint_t nPair, mask;
+    uint_t baseChunk, iChunk1, iChunk2;
+    reg_t qubits(2);
+    qubits[0] = qubit;
+    qubits[1] = qubit;
+
+    mask = (1ull << qubit);
+    mask >>= (chunk_bits_ * qubit_scale());
+
+    if (Base::distributed_procs_ == 1 ||
+        (Base::distributed_proc_bits_ >= 0 &&
+         qubit < (Base::num_qubits_ * qubit_scale() -
+                  Base::distributed_proc_bits_))) { // no data transfer between
+                                                    // processes is needed
+      nPair = Base::num_local_states_ >> 1;
+
+      auto apply_chunk_swap = [this, mask, qubits](int_t iGroup) {
+        for (int_t ic = Base::top_state_of_group_[iGroup];
+             ic < Base::top_state_of_group_[iGroup + 1]; ic++) {
+          uint_t pairChunk;
+          pairChunk = ic ^ mask;
+          if (ic < pairChunk)
+            Base::states_[ic].qreg().apply_chunk_swap(
+                qubits, Base::states_[pairChunk].qreg(), true);
+        }
+      };
+      Utils::apply_omp_parallel_for(
+          (chunk_omp_parallel_ && Base::num_groups_ > 1), 0, nPair,
+          apply_chunk_swap);
+    }
+#ifdef AER_MPI
+    else {
+      // chunk scheduler that supports any number of processes
+      uint_t nu[3];
+      uint_t ub[3];
+      uint_t iu[3];
+      uint_t add;
+      uint_t iLocalChunk, iRemoteChunk, iProc;
+      int i;
+
+      nLarge = 1;
+      nu[0] = 1ull << (qubit - chunk_bits_ * qubit_scale());
+      ub[0] = 0;
+      iu[0] = 0;
+
+      nu[1] = 1ull << (Base::num_qubits_ * qubit_scale() - qubit - 1);
+      ub[1] = (qubit - chunk_bits_ * qubit_scale()) + 1;
+      iu[1] = 0;
+      nPair = 1ull << (Base::num_qubits_ * qubit_scale() -
+                       chunk_bits_ * qubit_scale() - 1);
+
+      for (iPair = 0; iPair < nPair; iPair++) {
+        // calculate index of pair of chunks
+        baseChunk = 0;
+        add = 1;
+        for (i = 1; i >= 0; i--) {
+          baseChunk += (iu[i] << ub[i]);
+          // update for next
+          iu[i] += add;
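+          // mixed-radix carry: clear the flag, then set it again if this
+          // digit wrapped around its radix (same scheme as apply_chunk_swap)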
+          add = 0;
+          if (iu[i] >= nu[i]) {
+            iu[i] = 0;
+            add = 1;
+          }
+        }
+
+        iChunk1 = baseChunk;
+        iChunk2 = baseChunk | mask;
+
+        if (iChunk1 >= Base::state_index_begin_[Base::distributed_rank_] &&
+            iChunk1 < Base::state_index_end_
+                          [Base::distributed_rank_]) { // chunk1 is on
+                                                       // this process
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            Base::states_[iChunk1 - Base::global_state_index_]
+                .qreg()
+                .apply_chunk_swap(
+                    qubits,
+                    Base::states_[iChunk2 - Base::global_state_index_].qreg(),
+                    true);
+            continue;
+          } else {
+            iLocalChunk = iChunk1;
+            iRemoteChunk = iChunk2;
+            iProc = get_process_by_chunk(iChunk2);
+          }
+        } else {
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            iLocalChunk = iChunk2;
+            iRemoteChunk = iChunk1;
+            iProc = get_process_by_chunk(iChunk1);
+          } else {
+            continue; // there is no chunk for this pair on this process
+          }
+        }
+
+        MPI_Request reqSend, reqRecv;
+        MPI_Status st;
+        uint_t sizeRecv, sizeSend;
+
+        auto pSend = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .send_buffer(sizeSend);
+        MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqSend);
+
+        auto pRecv = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .recv_buffer(sizeRecv);
+        MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqRecv);
+
+        MPI_Wait(&reqSend, &st);
+        MPI_Wait(&reqRecv, &st);
+
+        Base::states_[iLocalChunk - Base::global_state_index_]
+            .qreg()
+            .apply_chunk_swap(qubits, iRemoteChunk);
+      }
+    }
+#endif
+  }
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::send_chunk(uint_t local_chunk_index,
+                                                uint_t global_pair_index) {
+#ifdef AER_MPI
+  MPI_Request reqSend;
+  MPI_Status st;
+  uint_t sizeSend;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(global_pair_index);
+
+  auto pSend = Base::states_[local_chunk_index].qreg().send_buffer(sizeSend);
+  MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc,
+            local_chunk_index + Base::global_state_index_,
+            Base::distributed_comm_, &reqSend);
+
+  MPI_Wait(&reqSend, &st);
+
+  Base::states_[local_chunk_index].qreg().release_send_buffer();
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::recv_chunk(uint_t local_chunk_index,
+                                                uint_t global_pair_index) {
+#ifdef AER_MPI
+  MPI_Request reqRecv;
+  MPI_Status st;
+  uint_t sizeRecv;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(global_pair_index);
+
+  auto pRecv = Base::states_[local_chunk_index].qreg().recv_buffer(sizeRecv);
+  MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, global_pair_index,
+            Base::distributed_comm_, &reqRecv);
+
+  MPI_Wait(&reqRecv, &st);
+#endif
+}
+
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::send_data(data_t *pSend, uint_t size,
+                                               uint_t myid, uint_t pairid) {
+#ifdef AER_MPI
+  MPI_Request reqSend;
+  MPI_Status st;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(pairid);
+
+  MPI_Isend(pSend, size * sizeof(data_t), MPI_BYTE, iProc, myid,
+            Base::distributed_comm_, &reqSend);
+
+  MPI_Wait(&reqSend, &st);
+#endif
+}
+
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::recv_data(data_t *pRecv, uint_t size,
+                                               uint_t myid, uint_t pairid) {
+#ifdef AER_MPI
+  MPI_Request reqRecv;
+  MPI_Status st;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(pairid);
+
+  MPI_Irecv(pRecv, size * sizeof(data_t), MPI_BYTE, iProc, pairid,
+            Base::distributed_comm_, &reqRecv);
+
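+  // Irecv followed immediately by Wait acts as a blocking receive; the tag
+  // (pairid, the remote chunk id) matches the tag the sender passes as
+  // myid in send_data() above.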
+  MPI_Wait(&reqRecv, &st);
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(reg_t &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t i, n = sum.size();
+    reg_t tmp(n);
+    MPI_Allreduce(&sum[0], &tmp[0], n, MPI_UINT64_T, MPI_SUM,
+                  Base::distributed_comm_);
+    for (i = 0; i < n; i++) {
+      sum[i] = tmp[i];
+    }
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(rvector_t &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t i, n = sum.size();
+    rvector_t tmp(n);
+    MPI_Allreduce(&sum[0], &tmp[0], n, MPI_DOUBLE_PRECISION, MPI_SUM,
+                  Base::distributed_comm_);
+    for (i = 0; i < n; i++) {
+      sum[i] = tmp[i];
+    }
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(complex_t &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    complex_t tmp;
+    MPI_Allreduce(&sum, &tmp, 2, MPI_DOUBLE_PRECISION, MPI_SUM,
+                  Base::distributed_comm_);
+    sum = tmp;
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(double &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    double tmp;
+    MPI_Allreduce(&sum, &tmp, 1, MPI_DOUBLE_PRECISION, MPI_SUM,
+                  Base::distributed_comm_);
+    sum = tmp;
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::gather_value(rvector_t &val) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    rvector_t tmp = val;
+    MPI_Alltoall(&tmp[0], 1, MPI_DOUBLE_PRECISION, &val[0], 1,
+                 MPI_DOUBLE_PRECISION, Base::distributed_comm_);
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::sync_process(void) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    MPI_Barrier(Base::distributed_comm_);
+  }
+#endif
+}
+
+// gather distributed state into vector (if memory is enough)
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::gather_state(
+    std::vector<std::complex<data_t>> &state) {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t size, local_size, global_size, offset;
+    int i;
+    std::vector<int> recv_counts(Base::distributed_procs_);
+    std::vector<int> recv_offset(Base::distributed_procs_);
+
+    global_size = 0;
+    for (i = 0; i < Base::distributed_procs_; i++) {
+      recv_offset[i] =
+          (int)(Base::state_index_begin_[i] << (chunk_bits_ * qubit_scale())) *
+          2;
+      recv_counts[i] =
+          (int)((Base::state_index_end_[i] - Base::state_index_begin_[i])
+                << (chunk_bits_ * qubit_scale()));
+      global_size += recv_counts[i];
+      recv_counts[i] *= 2;
+    }
+    if ((global_size >> 21) > Utils::get_system_memory_mb()) {
+      throw std::runtime_error(
+          std::string("There is not enough memory to gather state"));
+    }
+    std::vector<std::complex<data_t>> local_state = state;
+    state.resize(global_size);
+
+    if (sizeof(std::complex<data_t>) == 16) {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0],
+                     &recv_offset[0], MPI_DOUBLE_PRECISION,
+                     Base::distributed_comm_);
+    } else {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0],
+                     MPI_FLOAT, Base::distributed_comm_);
+    }
+  }
+#endif
+}
+
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::gather_state(
+    AER::Vector<std::complex<data_t>> &state) {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t size, local_size, global_size, offset;
+    int i;
+
+    std::vector<int> recv_counts(Base::distributed_procs_);
+    std::vector<int> recv_offset(Base::distributed_procs_);
+
+    global_size = 0;
+    for (i = 0; i < Base::distributed_procs_; i++) {
+      recv_offset[i] =
+          (int)(Base::state_index_begin_[i] << (chunk_bits_ * qubit_scale())) *
+          2;
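+      // Allgatherv counts/offsets are in units of real scalars (two per
+      // complex amplitude), hence the factor of 2 here and the *= 2 below;
+      // global_size is accumulated before that scaling, so it stays in
+      // complex elements for the resize.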
+      recv_counts[i] =
+          (int)((Base::state_index_end_[i] - Base::state_index_begin_[i])
+                << (chunk_bits_ * qubit_scale()));
+      global_size += recv_counts[i];
+      recv_counts[i] *= 2;
+    }
+    if ((global_size >> 21) > Utils::get_system_memory_mb()) {
+      throw std::runtime_error(
+          std::string("There is not enough memory to gather state"));
+    }
+    AER::Vector<std::complex<data_t>> local_state = state;
+    state.resize(global_size);
+
+    if (sizeof(std::complex<data_t>) == 16) {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0],
+                     &recv_offset[0], MPI_DOUBLE_PRECISION,
+                     Base::distributed_comm_);
+    } else {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0],
+                     MPI_FLOAT, Base::distributed_comm_);
+    }
+  }
+#endif
+}
+
+//-------------------------------------------------------------------------
+} // end namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/shot_branching.hpp b/src/simulators/shot_branching.hpp
new file mode 100644
index 0000000000..358b07c08d
--- /dev/null
+++ b/src/simulators/shot_branching.hpp
@@ -0,0 +1,301 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _shot_branching_hpp
+#define _shot_branching_hpp
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+using OpItr = std::vector<Operations::Op>::const_iterator;
+
+class Branch;
+
+// class for shared state for shot-branching
+class Branch {
+protected:
+  uint_t state_index_; // state index
+  uint_t root_state_index_;
+
+  uint_t shot_index_; // starting shot index
+
+  // creg to be stored to the state
+  ClassicalRegister creg_;
+  // random generators for shots
+  std::vector<RngEngine> shots_;
+  // additional operations applied after shot branching
+  std::vector<Operations::Op> additional_ops_;
+
+  // mark for control flow
+  std::unordered_map<std::string, OpItr> flow_marks_;
+
+  // current iterator of operations
+  OpItr iter_;
+
+  // branches from this
+  std::vector<std::shared_ptr<Branch>> branches_;
+
+public:
+  Branch(void) {}
+  ~Branch() {
+    shots_.clear();
+    additional_ops_.clear();
+    branches_.clear();
+  }
+  Branch(const Branch &src) {
+    shots_ = src.shots_;
+    creg_ = src.creg_;
+    iter_ = src.iter_;
+    flow_marks_ = src.flow_marks_;
+  }
+
+  uint_t &state_index(void) { return state_index_; }
+  uint_t &root_state_index(void) { return root_state_index_; }
+  uint_t &shot_index(void) { return shot_index_; }
+  ClassicalRegister &creg(void) { return creg_; }
+  std::vector<RngEngine> &rng_shots(void) { return shots_; }
+  OpItr &op_iterator(void) { return iter_; }
+  std::unordered_map<std::string, OpItr> &marks(void) { return flow_marks_; }
+  uint_t num_branches(void) { return branches_.size(); }
+  std::vector<std::shared_ptr<Branch>> &branches(void) { return branches_; }
+
+  uint_t num_shots(void) { return shots_.size(); }
+  void clear(void) {
+    shots_.clear();
+    additional_ops_.clear();
+    branches_.clear();
+  }
+  void clear_branch(void) { branches_.clear(); }
+
+  void set_shots(std::vector<RngEngine> &shots) { shots_ = shots; }
+  void initialize_shots(const uint_t nshots, const uint_t seed) {
+    shots_.resize(nshots);
+    for (int_t i = 0; i < nshots; i++) {
+      shots_[i].set_seed(seed + i);
+    }
+  }
+
+  void add_op_after_branch(Operations::Op &op) {
+    additional_ops_.push_back(op);
+  }
+  void copy_ops_after_branch(std::vector<Operations::Op> &ops) {
+    additional_ops_ = ops;
+  }
+  void clear_additional_ops(void) { additional_ops_.clear(); }
+
+  std::vector<Operations::Op> &additional_ops(void) { return additional_ops_; }
+
+  void branch_shots(reg_t &shots, int_t nbranch);
+
+  bool apply_control_flow(ClassicalRegister &creg, OpItr last) {
+    if (iter_->type == Operations::OpType::mark) {
+      flow_marks_[iter_->string_params[0]] = iter_;
+      iter_++;
+      return true;
+    } else if (iter_->type == Operations::OpType::jump) {
+      if (creg.check_conditional(*iter_)) {
+        const auto &mark_name = iter_->string_params[0];
+        auto mark_it = flow_marks_.find(mark_name);
+        if (mark_it != flow_marks_.end()) {
+          iter_ = mark_it->second;
+        } else {
+          for (++iter_; iter_ != last; ++iter_) {
+            if (iter_->type == Operations::OpType::mark) {
+              flow_marks_[iter_->string_params[0]] = iter_;
+              if (iter_->string_params[0] == mark_name) {
+                break;
+              }
+            }
+          }
+          if (iter_ == last) {
+            std::stringstream msg;
+            msg << "Invalid jump destination:\"" << mark_name << "\"."
+                << std::endl;
+            throw std::runtime_error(msg.str());
+          }
+        }
+      }
+      iter_++;
+      return true;
+    }
+    return false;
+  }
+
+  void advance_iterator(void);
+
+  bool apply_runtime_noise_sampling(const ClassicalRegister &creg,
+                                    const Operations::Op &op,
+                                    const Noise::NoiseModel &noise);
+
+  void remove_empty_branches(void);
+};
+
+void Branch::branch_shots(reg_t &shots, int_t nbranch) {
+  branches_.resize(nbranch);
+
+  for (int_t i = 0; i < nbranch; i++) {
+    branches_[i] = std::make_shared<Branch>();
+    branches_[i]->creg_ = creg_;
+    branches_[i]->iter_ = iter_;
+    branches_[i]->flow_marks_ = flow_marks_;
+  }
+  for (int_t i = 0; i < shots.size(); i++) {
+    branches_[shots[i]]->shots_.push_back(shots_[i]);
+  }
+  // update shot indices
+  uint_t index = shot_index_;
+  for (int_t i = 0; i < nbranch; i++) {
+    branches_[i]->shot_index_ = index;
+    index += branches_[i]->shots_.size();
+  }
+}
+
+void Branch::advance_iterator(void) {
+  iter_++;
+  for (int_t i = 0; i < branches_.size(); i++) {
+    branches_[i]->iter_++;
+  }
+}
+
+bool Branch::apply_runtime_noise_sampling(const ClassicalRegister &creg,
+                                          const Operations::Op &op,
+                                          const Noise::NoiseModel &noise) {
+  if (op.type != Operations::OpType::sample_noise)
+    return false;
+
+  uint_t nshots = num_shots();
+  reg_t shot_map(nshots);
+  std::vector<std::vector<Operations::Op>> noises;
+
+  for (int_t i = 0; i < nshots; i++) {
+    std::vector<Operations::Op> noise_ops =
+        noise.sample_noise_loc(op, shots_[i]);
+
+    // search same noise ops
+    int_t pos = -1;
+    for (int_t j = 0; j < noises.size(); j++) {
+      if (noise_ops.size() != noises[j].size())
+        continue;
+      bool same = true;
+      for (int_t k = 0; k < noise_ops.size(); k++) {
+        if (noise_ops[k].type != noises[j][k].type ||
+            noise_ops[k].name != noises[j][k].name)
+          same = false;
+        else if (noise_ops[k].qubits.size() != noises[j][k].qubits.size())
+          same = false;
+        else {
+          for (int_t l = 0; l < noise_ops[k].qubits.size(); l++) {
+            if (noise_ops[k].qubits[l] != noises[j][k].qubits[l]) {
+              same = false;
+              break;
+            }
+          }
+        }
+        if (!same)
+          break;
+        if (noise_ops[k].type == Operations::OpType::gate) {
+          if (noise_ops[k].name == "pauli") {
+            if (noise_ops[k].string_params[0] != noises[j][k].string_params[0])
+              same = false;
+          } else if (noise_ops[k].params.size() != noises[j][k].params.size())
+            same = false;
+          else {
+            for (int_t l = 0; l < noise_ops[k].params.size(); l++) {
+              if (noise_ops[k].params[l] != noises[j][k].params[l]) {
+                same = false;
+                break;
+              }
+            }
+          }
+        } else if (noise_ops[k].type == Operations::OpType::matrix ||
+                   noise_ops[k].type == Operations::OpType::diagonal_matrix) {
+          if (noise_ops[k].mats.size() != noises[j][k].mats.size())
+            same = false;
+          else {
+            for (int_t l = 0; l < noise_ops[k].mats.size(); l++) {
+              if (noise_ops[k].mats[l].size() != noises[j][k].mats[l].size()) {
+                same = false;
+                break;
+              }
+              for (int_t m = 0; m < noise_ops[k].mats[l].size(); m++) {
+                if (noise_ops[k].mats[l][m] != noises[j][k].mats[l][m]) {
+                  same = false;
+                  break;
+                }
+              }
+              if (!same)
+                break;
+            }
+          }
+        }
+        if (!same)
+          break;
+      }
+      if (same) {
+        pos = j;
+        break;
+      }
+    }
+
+    if (pos < 0) { // if not found, add noise ops to the list
+      shot_map[i] = noises.size();
+      noises.push_back(noise_ops);
+    } else { // if found, add shot
+      shot_map[i] = pos;
+    }
+  }
+
+  creg_ = creg;
+  branch_shots(shot_map, noises.size());
+  for (int_t i = 0; i < noises.size(); i++) {
+    branches_[i]->copy_ops_after_branch(noises[i]);
+  }
+
+  return true;
+}
+
+void Branch::remove_empty_branches(void) {
+  int_t istart = 0;
+  for (int_t j = 0; j < branches_.size(); j++) {
+    if (branches_[j]->num_shots() > 0) {
+      // copy shots to the root
+      shots_ = branches_[j]->rng_shots();
+      additional_ops_ = branches_[j]->additional_ops();
+      shot_index_ = branches_[j]->shot_index();
+      creg_ = branches_[j]->creg();
+      branches_[j].reset();
+      istart = j + 1;
+      break;
+    }
+    branches_[j].reset();
+  }
+
+  std::vector<std::shared_ptr<Branch>> new_branches;
+
+  for (int_t j = istart; j < branches_.size(); j++) {
+    if (branches_[j]->num_shots() > 0)
+      new_branches.push_back(branches_[j]);
+    else
+      branches_[j].reset();
+  }
+  branches_ = new_branches;
+}
+
+//-------------------------------------------------------------------------
+} // namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/simulators.hpp b/src/simulators/simulators.hpp
new file mode 100644
index 0000000000..017979e8fd
--- /dev/null
+++ b/src/simulators/simulators.hpp
@@ -0,0 +1,61 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _aer_simulators_hpp_
+#define _aer_simulators_hpp_
+
+#include "simulators/density_matrix/densitymatrix_state.hpp"
+#include "simulators/extended_stabilizer/extended_stabilizer_state.hpp"
+#include "simulators/matrix_product_state/matrix_product_state.hpp"
+#include "simulators/stabilizer/stabilizer_state.hpp"
+#include "simulators/statevector/statevector_state.hpp"
+#include "simulators/superoperator/superoperator_state.hpp"
+#include "simulators/tensor_network/tensor_net_state.hpp"
+#include "simulators/unitary/unitary_state.hpp"
+
+namespace AER {
+
+// Simulation methods
+enum class Method {
+  automatic,
+  statevector,
+  density_matrix,
+  matrix_product_state,
+  stabilizer,
+  extended_stabilizer,
+  unitary,
+  superop,
+  tensor_network
+};
+
+enum class Device { CPU, GPU, ThrustCPU };
+
+// Simulation precision
+enum class Precision { Double, Single };
+
+const std::unordered_map<Method, std::string> method_names_ = {
+    {Method::automatic, "automatic"},
+    {Method::statevector, "statevector"},
+    {Method::density_matrix, "density_matrix"},
+    {Method::matrix_product_state, "matrix_product_state"},
+    {Method::stabilizer, "stabilizer"},
+    {Method::extended_stabilizer, "extended_stabilizer"},
+    {Method::unitary, "unitary"},
+    {Method::superop, "superop"},
+    {Method::tensor_network, "tensor_network"}};
+
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/stabilizer/clifford.hpp b/src/simulators/stabilizer/clifford.hpp
index 15c5bcc202..e54844e573 100644
--- a/src/simulators/stabilizer/clifford.hpp
+++ b/src/simulators/stabilizer/clifford.hpp
@@ -45,6 +45,9 @@ class Clifford {
   Clifford() = default;
   explicit Clifford(const uint64_t nqubit);
 
+  // initialize from existing state (copy)
+  void initialize(const Clifford &obj);
+
   //-----------------------------------------------------------------------
   // Utility functions
  //-----------------------------------------------------------------------
@@ -224,6 +227,17 @@ void Clifford::initialize(uint64_t nq) {
   stabilizer_phases_.setLength(nq);
 }
 
+void Clifford::initialize(const Clifford &obj) {
+  destabilizer_table_ = obj.destabilizer_table_;
+  stabilizer_table_ = obj.stabilizer_table_;
+  destabilizer_phases_ = obj.destabilizer_phases_;
+  stabilizer_phases_ = obj.stabilizer_phases_;
+  num_qubits_ = obj.num_qubits_;
+  omp_threads_ = obj.omp_threads_;
+  omp_threshold_ = obj.omp_threshold_;
+  json_chop_threshold_ = obj.json_chop_threshold_;
+}
+
 //------------------------------------------------------------------------------
 // Apply Clifford gates
 //------------------------------------------------------------------------------
diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp
index 4136230f8f..c8aebfef79 100644
--- a/src/simulators/state.hpp
+++ b/src/simulators/state.hpp
@@ -225,6 +225,11 @@ class Base {
   // can apply density matrix (without statevector output required)
   virtual void enable_density_matrix(bool flg) {}
 
+  void set_num_global_qubits(uint_t qubits) { num_global_qubits_ = qubits; }
+
+  void enable_cuStateVec(bool flg) { cuStateVec_enable_ = flg; }
+
   //-----------------------------------------------------------------------
   // Common instructions
   //-----------------------------------------------------------------------
@@ -250,10 +255,20 @@ class Base {
   int_t max_matrix_qubits_ = 0;
 
   std::string sim_device_name_ = "CPU";
+
+  uint_t num_global_qubits_; // used for chunk parallelization
+
+  bool cuStateVec_enable_ = false;
+
+  reg_t target_gpus_;
 };
 
 void Base::set_config(const Config &config) {
   sim_device_name_ = config.device;
+
+  if (config.target_gpus.has_value()) {
+    target_gpus_ = config.target_gpus.value();
+  }
 }
 
 std::vector<reg_t> Base::sample_measure(const reg_t &qubits, uint_t shots,
diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp
deleted file mode 100644
index 0b0c455d7e..0000000000
--- a/src/simulators/state_chunk.hpp
+++ /dev/null
@@ -1,2288 +0,0 @@
-/**
- * This code is part of Qiskit.
- *
- * (C) Copyright IBM 2018, 2019.
- *
- * This code is licensed under the Apache License, Version 2.0. You may
- * obtain a copy of this license in the LICENSE.txt file in the root directory
- * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
- *
- * Any modifications or derivative works of this code must retain this
- * copyright notice, and modified files need to carry a notice indicating
- * that they have been altered from the originals.
- */
-
-#ifndef _aer_base_state_chunk_hpp_
-#define _aer_base_state_chunk_hpp_
-
-#include "framework/creg.hpp"
-#include "framework/json.hpp"
-#include "framework/opset.hpp"
-#include "framework/results/experiment_result.hpp"
-#include "framework/types.hpp"
-
-#include "noise/noise_model.hpp"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#ifdef AER_MPI
-#include <mpi.h>
-#endif
-
-namespace AER {
-
-namespace QuantumState {
-
-#define STATE_APPLY_TO_ALL_CHUNKS 0
-
-//=========================================================================
-// StateChunk interface base class with multiple chunks for Qiskit-Aer
-// The base state class that supports multi-chunk distribution/ multi-shot
-// parallelization
-//=========================================================================
-
-template <class state_t>
-class StateChunk : public State<state_t> {
-
-public:
-  using ignore_argument = void;
-  using BaseState = State<state_t>;
-  using DataSubType = Operations::DataSubType;
-  using OpType = Operations::OpType;
-  using OpItr = std::vector<Operations::Op>::const_iterator;
-
-  //-----------------------------------------------------------------------
-  // Constructors
-  //-----------------------------------------------------------------------
-
-  // The constructor arguments are used to initialize the OpSet
-  // for the StateChunk class for checking supported simulator Operations
-  //
-  // Standard OpTypes that can be included here are:
-  // - `OpType::gate` if gates are supported
-  // - `OpType::measure` if measure is supported
-  // - `OpType::reset` if reset is supported
-  // - `OpType::barrier` if barrier is supported
-  // - `OpType::matrix` if arbitrary unitary matrices are supported
-  // - `OpType::kraus` if general Kraus noise channels are supported
-  //
-  // For gate ops allowed gates are specified by a set of string names,
-  // for example this could include {"u1", "u2", "u3", "U", "cx", "CX"}
-  //
-
-  StateChunk(const Operations::OpSet &opset) : BaseState(opset) {
-    num_global_chunks_ = 0;
-    num_local_chunks_ = 0;
-
-    myrank_ = 0;
-    nprocs_ = 1;
-
-    distributed_procs_ = 1;
-    distributed_rank_ = 0;
-    distributed_group_ = 0;
-    distributed_proc_bits_ = 0;
-
-    chunk_omp_parallel_ = false;
-    global_chunk_indexing_ = false;
-
-#ifdef AER_MPI
-    distributed_comm_ = MPI_COMM_WORLD;
-#endif
-  }
-
-  virtual ~StateChunk();
-
-  //-----------------------------------------------------------------------
-  // Data accessors
-  //-----------------------------------------------------------------------
-
-  // Return the state qreg object
-  auto &qreg(int_t idx = 0) { return qregs_[idx]; }
-  const auto &qreg(int_t idx = 0) const { return qregs_[idx]; }
-
-  // Return the creg object
-  auto &chunk_creg(uint_t iChunk) {
-    return BaseState::creg(get_global_shot_index(iChunk));
-  }
-  const auto &chunk_creg(uint_t iChunk) const {
-    return BaseState::creg(get_global_shot_index(iChunk));
-  }
-
-  //=======================================================================
-  // Subclass Override Methods
-  //
-  // The following methods should be implemented by any StateChunk subclasses.
-  // Abstract methods are required, while some methods are optional for
-  // StateChunk classes that support measurement to be compatible with a
-  // general QasmController.
-  //=======================================================================
-
-  //-----------------------------------------------------------------------
-  // Abstract methods
-  //
-  // The implementation of these methods must be defined in all subclasses
-  //-----------------------------------------------------------------------
-
-  // Return a string name for the StateChunk type
-  virtual std::string name() const = 0;
-
-  // Initializes the StateChunk to the default state.
-  // Typically this is the n-qubit all |0> state
-  virtual void initialize_qreg(uint_t num_qubits) = 0;
-
-  // Return an estimate of the required memory for implementing the
-  // specified sequence of operations on a `num_qubit` sized StateChunk.
-  virtual size_t
-  required_memory_mb(uint_t num_qubits,
-                     const std::vector<Operations::Op> &ops) const = 0;
-
-  // memory allocation (previously called before inisitalize_qreg)
-  virtual bool allocate(uint_t num_qubits, uint_t block_bits,
-                        uint_t num_parallel_shots = 1);
-
-  // Return the expectation value of a N-qubit Pauli operator
-  // If the simulator does not support Pauli expectation value this should
-  // raise an exception.
-  double expval_pauli(const reg_t &qubits,
-                      const std::string &pauli) override final {
-    return 0.0;
-  }
-
-  virtual double expval_pauli(const int_t iChunk, const reg_t &qubits,
-                              const std::string &pauli) = 0;
-
-  //-----------------------------------------------------------------------
-  // Optional: Load config settings
-  //-----------------------------------------------------------------------
-
-  // Load any settings for the StateChunk class from a config
-  virtual void set_config(const Config &config);
-
-  //=======================================================================
-  // Standard non-virtual methods
-  //
-  // These methods should not be modified in any StateChunk subclasses
-  //=======================================================================
-
-  //-----------------------------------------------------------------------
-  // Apply circuits and ops
-  //-----------------------------------------------------------------------
-
-  // Apply a single operation
-  // The `final_op` flag indicates no more instructions will be applied
-  // to the state after this sequence, so the state can be modified at the
-  // end of the instructions.
-
-  // this is not used for StateChunk
-  void apply_op(const Operations::Op &op, ExperimentResult &result,
-                RngEngine &rng, bool final_op = false) override final {
-    apply_op(0, op, result, rng, final_op);
-  }
-
-  // so this one is used
-  virtual void apply_op(const int_t iChunk, const Operations::Op &op,
-                        ExperimentResult &result, RngEngine &rng,
-                        bool final_op = false) = 0;
-
-  // Apply a sequence of operations to the current state of the StateChunk
-  // class. It is up to the StateChunk subclass to decide how this sequence
-  // should be executed (ie in sequence, or some other execution strategy.) If
-  // this sequence contains operations not in the supported opset an exeption
-  // will be thrown. The `final_ops` flag indicates no more instructions will
-  // be applied to the state after this sequence, so the state can be modified
-  // at the end of the instructions.
-  void apply_ops(OpItr first, OpItr last, ExperimentResult &result,
-                 RngEngine &rng, bool final_ops = false) override;
-
-  // apply ops to multiple shots
-  // this function should be separately defined since apply_ops is called in
-  // quantum_error
-  template <typename InputIterator>
-  void apply_ops_multi_shots(InputIterator first, InputIterator last,
-                             const Noise::NoiseModel &noise,
-                             ExperimentResult &result, uint_t rng_seed,
-                             bool final_ops = false);
-
-  //-----------------------------------------------------------------------
-  // Initialization
-  //-----------------------------------------------------------------------
-  template <class list_t>
-  void initialize_from_vector(const int_t iChunk, const list_t &vec);
-
-  template <class list_t>
-  void initialize_from_matrix(const int_t iChunk, const list_t &mat);
-
-  //-----------------------------------------------------------------------
-  // ClassicalRegister methods
-  //-----------------------------------------------------------------------
-
-  // Initialize classical memory and register to default value (all-0)
-  virtual void initialize_creg(uint_t num_memory, uint_t num_register);
-
-  // Initialize classical memory and register to specific values
-  virtual void initialize_creg(uint_t num_memory, uint_t num_register,
-                               const std::string &memory_hex,
-                               const std::string &register_hex);
-
-  //-----------------------------------------------------------------------
-  // Common instructions
-  //-----------------------------------------------------------------------
-
-  // Apply a save expectation value instruction
-  void apply_save_expval(const int_t iChunk, const Operations::Op &op,
-                         ExperimentResult &result);
-
-  //-----------------------------------------------------------------------
-  // Config Settings
-  //-----------------------------------------------------------------------
-
-  // set number of processes to be distributed
-  virtual void set_distribution(uint_t nprocs);
-
-  // set max number of shots to execute in a batch
-  void set_max_bached_shots(uint_t shots) { max_batched_shots_ = shots; }
-
-  // Does this state support multi-chunk distribution?
-  virtual bool multi_chunk_distribution_supported(void) { return true; }
-  // Does this state support multi-shot parallelization?
- virtual bool multi_shot_parallelization_supported(void) { return true; } - - // set creg bit counts before initialize creg - void set_num_creg_bits(uint_t num_memory, uint_t num_register) override { - num_creg_memory_ = num_memory; - num_creg_registers_ = num_register; - } - -protected: - // The array of the quantum state data structure - std::vector qregs_; - - // number of qubits for the circuit - uint_t num_qubits_; - - // extra parameters for parallel simulations - uint_t num_global_chunks_; // number of total chunks - uint_t num_local_chunks_; // number of local chunks - uint_t chunk_bits_; // number of qubits per chunk - uint_t block_bits_; // number of cache blocked qubits - - uint_t global_chunk_index_; // beginning chunk index for this process - reg_t chunk_index_begin_; // beginning chunk index for each process - reg_t chunk_index_end_; // ending chunk index for each process - uint_t local_shot_index_; // local shot ID of current batch loop - - uint_t myrank_; // process ID - uint_t nprocs_; // number of processes - uint_t distributed_rank_; // process ID in communicator group - uint_t distributed_procs_; // number of processes in communicator group - uint_t distributed_group_; // group id of distribution - int_t distributed_proc_bits_; // distributed_procs_=2^distributed_proc_bits_ - // (if nprocs != power of 2, set -1) - - bool chunk_omp_parallel_; // using thread parallel to process loop of chunks - // or not - bool global_chunk_indexing_; // using global index for control qubits and - // diagonal matrix - - bool multi_chunk_distribution_ = - false; // distributing chunks to apply cache blocking parallelization - bool multi_shots_parallelization_ = - false; // using chunks as multiple shots parallelization - bool set_parallelization_called_ = - false; // this flag is used to check set_parallelization is already - // called, if yes the call sets max_batched_shots_ - uint_t max_batched_shots_ = - 1; // max number of shots can be stored on available memory - - reg_t qubit_map_; // qubit map to restore swapped qubits - - bool multi_chunk_swap_enable_ = true; // enable multi-chunk swaps - uint_t chunk_swap_buffer_qubits_ = - 15; // maximum buffer size in qubits for chunk swap - uint_t max_multi_swap_; // maximum swaps can be applied at a time, calculated - // by chunk_swap_buffer_bits_ - - // group of states (GPU devices) - uint_t num_groups_; // number of groups of chunks - reg_t top_chunk_of_group_; - reg_t num_chunks_in_group_; - int num_threads_per_group_; // number of outer threads per group - - // cuStateVec settings - bool cuStateVec_enable_ = false; - - uint_t num_creg_memory_ = - 0; // number of total bits for creg (reserve for multi-shots) - uint_t num_creg_registers_ = 0; - - //----------------------------------------------------------------------- - // Apply circuits and ops - //----------------------------------------------------------------------- - // apply ops for multi-chunk distribution - template - void apply_ops_chunks(InputIterator first, InputIterator last, - ExperimentResult &result, RngEngine &rng, - bool final_ops = false); - - // apply cache blocked ops in each chunk - template - void apply_cache_blocking_ops(const int_t iGroup, InputIterator first, - InputIterator last, ExperimentResult &result, - RngEngine &rng); - - // apply ops for multi-shots to one group - template - void apply_ops_multi_shots_for_group(int_t i_group, InputIterator first, - InputIterator last, - const Noise::NoiseModel &noise, - ExperimentResult &result, - uint_t rng_seed, bool 
final_ops); - - // apply op to multiple shots , return flase if op is not supported to execute - // in a batch - virtual bool apply_batched_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, - std::vector &rng, - bool final_op = false) { - return false; - } - - // apply sampled noise to multiple-shots (this is used for ops contains - // non-Pauli operators) - void apply_batched_noise_ops( - const int_t i_group, const std::vector> &ops, - ExperimentResult &result, std::vector &rng); - - // check conditional - bool check_conditional(const int_t iChunk, const Operations::Op &op); - - // this function is used to scale chunk qubits for multi-chunk distribution - virtual int qubit_scale(void) { - return 1; // scale of qubit number (x2 for density and unitary matrices) - } - uint_t get_process_by_chunk(uint_t cid); - - // allocate qregs - bool allocate_qregs(uint_t num_chunks); - - //----------------------------------------------------------------------- - // Functions for multi-chunk distribution - //----------------------------------------------------------------------- - // swap between chunks - virtual void apply_chunk_swap(const reg_t &qubits); - - // apply multiple swaps between chunks - virtual void apply_multi_chunk_swap(const reg_t &qubits); - - // apply X gate over chunks - virtual void apply_chunk_x(const uint_t qubit); - - // send/receive chunk in receive buffer - void send_chunk(uint_t local_chunk_index, uint_t global_chunk_index); - void recv_chunk(uint_t local_chunk_index, uint_t global_chunk_index); - - template - void send_data(data_t *pSend, uint_t size, uint_t myid, uint_t pairid); - template - void recv_data(data_t *pRecv, uint_t size, uint_t myid, uint_t pairid); - - // reduce values over processes - void reduce_sum(reg_t &sum) const; - void reduce_sum(rvector_t &sum) const; - void reduce_sum(complex_t &sum) const; - void reduce_sum(double &sum) const; - - // gather values on each process - void gather_value(rvector_t &val) const; - - // gather cregs - void gather_creg_memory(void); - - // barrier all processes - void sync_process(void) const; - - // gather distributed state into vector (if memory is enough) - template - void gather_state(std::vector> &state); - - template - void gather_state(AER::Vector> &state); - - // block diagonal matrix in chunk - void block_diagonal_matrix(const int_t iChunk, reg_t &qubits, - cvector_t &diag); - void qubits_inout(const reg_t &qubits, reg_t &qubits_in, - reg_t &qubits_out) const; - - // collect matrix over multiple chunks - auto apply_to_matrix(bool copy = false); - - // Apply the global phase - virtual void apply_global_phase() override {} - - // check if the operator should be applied to each chunk - virtual bool is_applied_to_each_chunk(const Operations::Op &op); - - // return global shot index for the chunk - inline int_t get_global_shot_index(const int_t iChunk) const { - return multi_shots_parallelization_ - ? 
(iChunk + local_shot_index_ + global_chunk_index_) - : 0; - } - - // separate inside and outside qubits for (multi) control gates - void get_inout_ctrl_qubits(const Operations::Op &op, reg_t &qubits_out, - reg_t &qubits_in); - - // remake gate operation by qubits inside chunk - Operations::Op remake_gate_in_chunk_qubits(const Operations::Op &op, - reg_t &qubits_in); - -#ifdef AER_MPI - // communicator group to simulate a circuit (for multi-experiments) - MPI_Comm distributed_comm_; -#endif - - uint_t mapped_index(const uint_t idx); -}; - -//========================================================================= -// Implementations -//========================================================================= - -template -StateChunk::~StateChunk(void) { -#ifdef AER_MPI - if (distributed_comm_ != MPI_COMM_WORLD) { - MPI_Comm_free(&distributed_comm_); - } -#endif -} - -template -void StateChunk::set_config(const Config &config) { - BaseState::set_config(config); - - num_threads_per_group_ = 1; - if (config.num_threads_per_device.has_value()) - num_threads_per_group_ = config.num_threads_per_device.value(); - - if (config.chunk_swap_buffer_qubits.has_value()) - chunk_swap_buffer_qubits_ = config.chunk_swap_buffer_qubits.value(); - -#ifdef AER_CUSTATEVEC - // cuStateVec configs - if (config.cuStateVec_enable.has_value()) - cuStateVec_enable_ = config.cuStateVec_enable.value(); -#endif -} - -template -void StateChunk::set_distribution(uint_t nprocs) { - myrank_ = 0; - nprocs_ = 1; -#ifdef AER_MPI - int t; - MPI_Comm_size(MPI_COMM_WORLD, &t); - nprocs_ = t; - MPI_Comm_rank(MPI_COMM_WORLD, &t); - myrank_ = t; -#endif - - distributed_procs_ = nprocs; - distributed_rank_ = myrank_ % nprocs; - distributed_group_ = myrank_ / nprocs; - - distributed_proc_bits_ = 0; - int proc_bits = 0; - uint_t p = distributed_procs_; - while (p > 1) { - if ((p & 1) != 0) { // procs is not power of 2 - distributed_proc_bits_ = -1; - break; - } - distributed_proc_bits_++; - p >>= 1; - } - -#ifdef AER_MPI - if (nprocs != nprocs_) { - MPI_Comm_split(MPI_COMM_WORLD, (int)distributed_group_, - (int)distributed_rank_, &distributed_comm_); - } else { - distributed_comm_ = MPI_COMM_WORLD; - } -#endif -} - -template -bool StateChunk::allocate(uint_t num_qubits, uint_t block_bits, - uint_t num_parallel_shots) { - int_t i; - num_qubits_ = num_qubits; - block_bits_ = block_bits; - - if (block_bits_ > 0) { - chunk_bits_ = block_bits_; - if (chunk_bits_ > num_qubits_) { - chunk_bits_ = num_qubits_; - } - } else { - chunk_bits_ = num_qubits_; - } - - if (chunk_bits_ < num_qubits_) { - // multi-chunk distribution with cache blocking transpiler - multi_chunk_distribution_ = true; - multi_shots_parallelization_ = false; - num_global_chunks_ = 1ull << ((num_qubits_ - chunk_bits_) * qubit_scale()); - - BaseState::cregs_.resize(1); - } else { - // multi-shots parallelization - multi_chunk_distribution_ = false; - if (num_parallel_shots > 1) - multi_shots_parallelization_ = true; - else - multi_shots_parallelization_ = false; - num_global_chunks_ = num_parallel_shots; - - // classical registers for all shots - BaseState::cregs_.resize(num_parallel_shots); - } - - chunk_index_begin_.resize(distributed_procs_); - chunk_index_end_.resize(distributed_procs_); - for (i = 0; i < distributed_procs_; i++) { - chunk_index_begin_[i] = num_global_chunks_ * i / distributed_procs_; - chunk_index_end_[i] = num_global_chunks_ * (i + 1) / distributed_procs_; - } - - num_local_chunks_ = chunk_index_end_[distributed_rank_] - - 
chunk_index_begin_[distributed_rank_]; - global_chunk_index_ = chunk_index_begin_[distributed_rank_]; - local_shot_index_ = 0; - - global_chunk_indexing_ = false; - chunk_omp_parallel_ = false; - if (BaseState::sim_device_name_ == "GPU") { -#ifdef _OPENMP - if (omp_get_num_threads() == 1) - chunk_omp_parallel_ = true; -#endif - - // set cuStateVec_enable_ - if (cuStateVec_enable_) { - if (multi_shots_parallelization_) - cuStateVec_enable_ = false; // multi-shots parallelization is not - // supported for cuStateVec - } - - if (!cuStateVec_enable_) - global_chunk_indexing_ = true; // cuStateVec does not handle global chunk - // index for diagonal matrix - } else if (BaseState::sim_device_name_ == "Thrust") { - global_chunk_indexing_ = true; - chunk_omp_parallel_ = false; - } - - if (multi_shots_parallelization_) { - allocate_qregs(std::min(num_local_chunks_, max_batched_shots_)); - } else { - allocate_qregs(num_local_chunks_); - } - - // initialize qubit map - qubit_map_.resize(num_qubits_); - for (i = 0; i < num_qubits_; i++) { - qubit_map_[i] = i; - } - - if (chunk_bits_ <= chunk_swap_buffer_qubits_ + 1) - multi_chunk_swap_enable_ = false; - else - max_multi_swap_ = chunk_bits_ - chunk_swap_buffer_qubits_; - - return true; -} - -template -bool StateChunk::allocate_qregs(uint_t num_chunks) { - int_t i; - // deallocate qregs before reallocation - if (qregs_.size() > 0) { - if (qregs_.size() == num_chunks) - return true; // can reuse allocated chunks - - qregs_.clear(); - } - - qregs_.resize(num_chunks); - - if (num_creg_memory_ != 0 || num_creg_registers_ != 0) { - for (i = 0; i < num_chunks; i++) { - // set number of creg bits before actual initialization - qregs_[i].initialize_creg(num_creg_memory_, num_creg_registers_); - } - } - - // allocate qregs - uint_t chunk_id = multi_chunk_distribution_ ? 
global_chunk_index_ : 0; - bool ret = true; - qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_); - qregs_[0].set_num_threads_per_group(num_threads_per_group_); - qregs_[0].cuStateVec_enable(cuStateVec_enable_); - ret &= - qregs_[0].chunk_setup(chunk_bits_ * qubit_scale(), - num_qubits_ * qubit_scale(), chunk_id, num_chunks); - for (i = 1; i < num_chunks; i++) { - uint_t gid = i + chunk_id; - ret &= qregs_[i].chunk_setup(qregs_[0], gid); - qregs_[i].set_num_threads_per_group(num_threads_per_group_); - } - - // initialize groups - top_chunk_of_group_.clear(); - num_groups_ = 0; - for (i = 0; i < qregs_.size(); i++) { - if (qregs_[i].top_of_group()) { - top_chunk_of_group_.push_back(i); - num_groups_++; - } - } - top_chunk_of_group_.push_back(qregs_.size()); - num_chunks_in_group_.resize(num_groups_); - for (i = 0; i < num_groups_; i++) { - num_chunks_in_group_[i] = - top_chunk_of_group_[i + 1] - top_chunk_of_group_[i]; - } - - return ret; -} - -template -uint_t StateChunk::get_process_by_chunk(uint_t cid) { - uint_t i; - for (i = 0; i < distributed_procs_; i++) { - if (cid >= chunk_index_begin_[i] && cid < chunk_index_end_[i]) { - return i; - } - } - return distributed_procs_; -} - -template -void StateChunk::apply_ops(OpItr first, OpItr last, - ExperimentResult &result, RngEngine &rng, - bool final_ops) { - if (multi_chunk_distribution_) { - apply_ops_chunks(first, last, result, rng, final_ops); - } else { - Base::apply_ops(first, last, result, rng, final_ops); - } - - qregs_[0].synchronize(); - -#ifdef AER_CUSTATEVEC - result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); -#endif -} - -template -template -void StateChunk::apply_ops_chunks(InputIterator first, - InputIterator last, - ExperimentResult &result, - RngEngine &rng, bool final_ops) { - uint_t iOp, nOp; - reg_t multi_swap; - - nOp = std::distance(first, last); - iOp = 0; - - while (iOp < nOp) { - const Operations::Op op_iOp = *(first + iOp); - - if (op_iOp.type == Operations::OpType::gate && - op_iOp.name == "swap_chunk") { - // apply swap between chunks - if (multi_chunk_swap_enable_ && op_iOp.qubits[0] < chunk_bits_ && - op_iOp.qubits[1] >= chunk_bits_) { - if (distributed_proc_bits_ < 0 || - (op_iOp.qubits[1] >= - (num_qubits_ * qubit_scale() - - distributed_proc_bits_))) { // apply multi-swap when swap is cross - // qubits - multi_swap.push_back(op_iOp.qubits[0]); - multi_swap.push_back(op_iOp.qubits[1]); - if (multi_swap.size() >= max_multi_swap_ * 2) { - apply_multi_chunk_swap(multi_swap); - multi_swap.clear(); - } - } else - apply_chunk_swap(op_iOp.qubits); - } else { - if (multi_swap.size() > 0) { - apply_multi_chunk_swap(multi_swap); - multi_swap.clear(); - } - apply_chunk_swap(op_iOp.qubits); - } - iOp++; - continue; - } else if (multi_swap.size() > 0) { - apply_multi_chunk_swap(multi_swap); - multi_swap.clear(); - } - - if (op_iOp.type == Operations::OpType::sim_op && - op_iOp.name == "begin_blocking") { - // applying sequence of gates inside each chunk - - uint_t iOpEnd = iOp; - while (iOpEnd < nOp) { - const Operations::Op op_iOpEnd = *(first + iOpEnd); - if (op_iOpEnd.type == Operations::OpType::sim_op && - op_iOpEnd.name == "end_blocking") { - break; - } - iOpEnd++; - } - - uint_t iOpBegin = iOp + 1; - if (num_groups_ > 1 && chunk_omp_parallel_) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) - apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, - rng); - } else { - for (int_t ig = 0; ig < num_groups_; ig++) - 
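The `begin_blocking`/`end_blocking` markers above bracket runs of gates that act only on in-chunk qubits, and each chunk group (as partitioned by `top_chunk_of_group_`) replays that run on its own chunks under one OpenMP thread per group. A minimal sketch of that dispatch shape, with `Group`, `Op`, and `apply_to_chunk` as hypothetical stand-ins for the Aer types:

```cpp
#include <cstdint>
#include <vector>

struct Op {};                    // gate payload elided
struct Group {
  std::vector<int64_t> chunks;   // chunk indices owned by this group
};

// Replay the same blocked op sequence on every chunk of every group;
// one OpenMP thread per group mirrors num_threads(num_groups_) above.
void apply_blocked_ops(std::vector<Group> &groups, const std::vector<Op> &ops,
                       void (*apply_to_chunk)(int64_t, const Op &)) {
#pragma omp parallel for
  for (int64_t ig = 0; ig < (int64_t)groups.size(); ig++)
    for (int64_t chunk : groups[ig].chunks)
      for (const Op &op : ops)
        apply_to_chunk(chunk, op); // gates touch only in-chunk qubits here
}
```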
apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, - rng); - } - iOp = iOpEnd; - } else if (is_applied_to_each_chunk(op_iOp)) { - if (num_groups_ > 1 && chunk_omp_parallel_) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) - apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, - rng); - } else { - for (int_t ig = 0; ig < num_groups_; ig++) - apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, - rng); - } - } else { - // parallelize inside state implementations - apply_op(STATE_APPLY_TO_ALL_CHUNKS, op_iOp, result, rng, - final_ops && nOp == iOp + 1); - } - iOp++; - } - - if (multi_swap.size() > 0) - apply_multi_chunk_swap(multi_swap); - - if (num_groups_ > 1 && chunk_omp_parallel_) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) - qregs_[top_chunk_of_group_[ig]].synchronize(); - } else { - for (int_t ig = 0; ig < num_groups_; ig++) - qregs_[top_chunk_of_group_[ig]].synchronize(); - } - - if (BaseState::sim_device_name_ == "GPU") { -#ifdef AER_THRUST_CUDA - int nDev; - if (cudaGetDeviceCount(&nDev) != cudaSuccess) { - cudaGetLastError(); - nDev = 0; - } - if (nDev > num_groups_) - nDev = num_groups_; - result.metadata.add(nDev, "cacheblocking", "chunk_parallel_gpus"); -#endif - -#ifdef AER_CUSTATEVEC - result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); -#endif - } - -#ifdef AER_MPI - result.metadata.add(multi_chunk_swap_enable_, "cacheblocking", - "multiple_chunk_swaps_enable"); - if (multi_chunk_swap_enable_) { - result.metadata.add(chunk_swap_buffer_qubits_, "cacheblocking", - "multiple_chunk_swaps_buffer_qubits"); - result.metadata.add(max_multi_swap_, "cacheblocking", - "max_multiple_chunk_swaps"); - } -#endif -} - -template -template -void StateChunk::apply_cache_blocking_ops(const int_t iGroup, - InputIterator first, - InputIterator last, - ExperimentResult &result, - RngEngine &rng) { - // for each chunk in group - for (int_t iChunk = top_chunk_of_group_[iGroup]; - iChunk < top_chunk_of_group_[iGroup + 1]; iChunk++) { - // fetch chunk in cache - if (qregs_[iChunk].fetch_chunk()) { - for (auto it = first; it != last; ++it) { - apply_op(iChunk, *it, result, rng, false); - } - // release chunk from cache - qregs_[iChunk].release_chunk(); - } - } -} - -template -void StateChunk::get_inout_ctrl_qubits(const Operations::Op &op, - reg_t &qubits_out, - reg_t &qubits_in) { - if (op.type == Operations::OpType::gate && - (op.name[0] == 'c' || op.name.find("mc") == 0)) { - for (int i = 0; i < op.qubits.size(); i++) { - if (op.qubits[i] < chunk_bits_) - qubits_in.push_back(op.qubits[i]); - else - qubits_out.push_back(op.qubits[i]); - } - } -} - -template -Operations::Op -StateChunk::remake_gate_in_chunk_qubits(const Operations::Op &op, - reg_t &qubits_in) { - Operations::Op new_op = op; - new_op.qubits = qubits_in; - // change gate name if there are no control qubits inside chunk - if (op.name.find("swap") != std::string::npos && qubits_in.size() == 2) { - new_op.name = "swap"; - } - if (op.name.find("ccx") != std::string::npos) { - if (qubits_in.size() == 1) - new_op.name = "x"; - else - new_op.name = "cx"; - } else if (qubits_in.size() == 1) { - if (op.name[0] == 'c') - new_op.name = op.name.substr(1); - else if (op.name == "mcphase") - new_op.name = "p"; - else - new_op.name = op.name.substr(2); // remove "mc" - } - return new_op; -} - -template -bool StateChunk::is_applied_to_each_chunk(const Operations::Op &op) { - if (op.type == 
Operations::OpType::gate || - op.type == Operations::OpType::matrix || - op.type == Operations::OpType::diagonal_matrix || - op.type == Operations::OpType::multiplexer || - op.type == Operations::OpType::superop) { - return true; - } - return false; -} - -template -bool StateChunk::check_conditional(const int_t iChunk, - const Operations::Op &op) { - if (multi_shots_parallelization_) { - // multi-shots parallelization - if (op.conditional) { - qregs_[iChunk].set_conditional(op.conditional_reg); - } - return true; - } else { - return BaseState::cregs_[0].check_conditional(op); - } -} - -template -template -void StateChunk::apply_ops_multi_shots( - InputIterator first, InputIterator last, const Noise::NoiseModel &noise, - ExperimentResult &result, uint_t rng_seed, bool final_ops) { - int_t i; - int_t i_begin, n_shots; - - i_begin = 0; - while (i_begin < num_local_chunks_) { - local_shot_index_ = i_begin; - - // loop for states can be stored in available memory - n_shots = qregs_.size(); - if (i_begin + n_shots > num_local_chunks_) { - n_shots = num_local_chunks_ - i_begin; - // resize qregs - allocate_qregs(n_shots); - } - // initialization (equivalent to initialize_qreg + initialize_creg) - auto init_group = [this](int_t ig) { - for (uint_t j = top_chunk_of_group_[ig]; j < top_chunk_of_group_[ig + 1]; - j++) { - // enabling batch shots optimization - qregs_[j].enable_batch(true); - - // initialize qreg here - qregs_[j].set_num_qubits(chunk_bits_); - qregs_[j].initialize(); - - // initialize creg here - qregs_[j].initialize_creg(this->creg(0).memory_size(), - this->creg(0).register_size()); - } - }; - Utils::apply_omp_parallel_for((num_groups_ > 1 && chunk_omp_parallel_), 0, - num_groups_, init_group); - - apply_global_phase(); // this is parallelized in StateChunk sub-classes - - // apply ops to multiple-shots - if (num_groups_ > 1 && chunk_omp_parallel_) { - std::vector par_results(num_groups_); -#pragma omp parallel for num_threads(num_groups_) - for (i = 0; i < num_groups_; i++) - apply_ops_multi_shots_for_group(i, first, last, noise, par_results[i], - rng_seed, final_ops); - - for (auto &res : par_results) - result.combine(std::move(res)); - } else { - for (i = 0; i < num_groups_; i++) - apply_ops_multi_shots_for_group(i, first, last, noise, result, rng_seed, - final_ops); - } - - // collect measured bits and copy memory - for (i = 0; i < n_shots; i++) { - qregs_[i].read_measured_data( - this->creg(global_chunk_index_ + i_begin + i)); - } - - i_begin += n_shots; - } - - gather_creg_memory(); - -#ifdef AER_THRUST_CUDA - if (BaseState::sim_device_name_ == "GPU") { - int nDev; - if (cudaGetDeviceCount(&nDev) != cudaSuccess) { - cudaGetLastError(); - nDev = 0; - } - if (nDev > num_groups_) - nDev = num_groups_; - result.metadata.add(nDev, "batched_shots_optimization_parallel_gpus"); - } -#endif -} - -template -template -void StateChunk::apply_ops_multi_shots_for_group( - int_t i_group, InputIterator first, InputIterator last, - const Noise::NoiseModel &noise, ExperimentResult &result, uint_t rng_seed, - bool final_ops) { - uint_t istate = top_chunk_of_group_[i_group]; - std::vector rng(num_chunks_in_group_[i_group]); -#ifdef _OPENMP - int num_inner_threads = omp_get_max_threads() / omp_get_num_threads(); -#else - int num_inner_threads = 1; -#endif - - for (uint_t j = top_chunk_of_group_[i_group]; - j < top_chunk_of_group_[i_group + 1]; j++) - rng[j - top_chunk_of_group_[i_group]].set_seed( - rng_seed + global_chunk_index_ + local_shot_index_ + j); - - for (auto op = first; op != last; ++op) 
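Each shot's `RngEngine` above is seeded with `rng_seed + global_chunk_index_ + local_shot_index_ + j`, i.e. the base seed plus the shot's global index, so the sampled noise is reproducible however shots are partitioned across processes and batches. A standalone sketch of the scheme, with `std::mt19937_64` standing in for `RngEngine`:

```cpp
#include <cstdint>
#include <random>
#include <vector>

// One RNG per local shot, seeded by global shot index: process boundaries
// and batch sizes do not change which random stream a given shot sees.
std::vector<std::mt19937_64> make_shot_rngs(std::uint64_t base_seed,
                                            std::uint64_t first_global_shot,
                                            std::size_t num_local_shots) {
  std::vector<std::mt19937_64> rngs;
  rngs.reserve(num_local_shots);
  for (std::size_t j = 0; j < num_local_shots; j++)
    rngs.emplace_back(base_seed + first_global_shot + j);
  return rngs;
}
```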
{ - if (op->type == Operations::OpType::qerror_loc) { - // sample error here - uint_t count = num_chunks_in_group_[i_group]; - std::vector> noise_ops(count); - - uint_t count_ops = 0; - uint_t non_pauli_gate_count = 0; - if (num_inner_threads > 1) { -#pragma omp parallel for reduction(+: count_ops,non_pauli_gate_count) num_threads(num_inner_threads) - for (int_t j = 0; j < count; j++) { - noise_ops[j] = noise.sample_noise_loc(*op, rng[j]); - - if (!(noise_ops[j].size() == 0 || - (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) { - count_ops++; - for (int_t k = 0; k < noise_ops[j].size(); k++) { - if (noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && - noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && - noise_ops[j][k].name != "pauli") { - non_pauli_gate_count++; - break; - } - } - } - } - } else { - for (int_t j = 0; j < count; j++) { - noise_ops[j] = noise.sample_noise_loc(*op, rng[j]); - - if (!(noise_ops[j].size() == 0 || - (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) { - count_ops++; - for (int_t k = 0; k < noise_ops[j].size(); k++) { - if (noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && - noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && - noise_ops[j][k].name != "pauli") { - non_pauli_gate_count++; - break; - } - } - } - } - } - - if (count_ops == 0) { - continue; // do nothing - } - if (non_pauli_gate_count == 0) { // optimization for Pauli error - qregs_[istate].apply_batched_pauli_ops(noise_ops); - } else { - // otherwise execute each circuit - apply_batched_noise_ops(i_group, noise_ops, result, rng); - } - } else { - if (!apply_batched_op(istate, *op, result, rng, - final_ops && (op + 1 == last))) { - // call apply_op for each state - for (uint_t j = top_chunk_of_group_[i_group]; - j < top_chunk_of_group_[i_group + 1]; j++) { - qregs_[j].enable_batch(false); - apply_op(j, *op, result, rng[j - top_chunk_of_group_[i_group]], - final_ops && (op + 1 == last)); - qregs_[j].enable_batch(true); - } - } - } - } -} - -template -void StateChunk::apply_batched_noise_ops( - const int_t i_group, const std::vector> &ops, - ExperimentResult &result, std::vector &rng) { - int_t i, j, k, count, nop, pos = 0; - uint_t istate = top_chunk_of_group_[i_group]; - count = ops.size(); - - reg_t mask(count); - std::vector finished(count, false); - for (i = 0; i < count; i++) { - int_t cond_reg = -1; - - if (finished[i]) - continue; - if (ops[i].size() == 0 || (ops[i].size() == 1 && ops[i][0].name == "id")) { - finished[i] = true; - continue; - } - mask[i] = 1; - - // find same ops to be executed in a batch - for (j = i + 1; j < count; j++) { - if (finished[j]) { - mask[j] = 0; - continue; - } - if (ops[j].size() == 0 || - (ops[j].size() == 1 && ops[j][0].name == "id")) { - mask[j] = 0; - finished[j] = true; - continue; - } - - if (ops[i].size() != ops[j].size()) { - mask[j] = 0; - continue; - } - - mask[j] = true; - for (k = 0; k < ops[i].size(); k++) { - if (ops[i][k].conditional) { - cond_reg = ops[i][k].conditional_reg; - } - if (ops[i][k].type != ops[j][k].type || - ops[i][k].name != ops[j][k].name) { - mask[j] = false; - break; - } - } - if (mask[j]) - finished[j] = true; - } - - // mask conditional register - int_t sys_reg = - qregs_[istate].set_batched_system_conditional(cond_reg, mask); - - // batched execution on same ops - for (k = 0; k < ops[i].size(); k++) { - Operations::Op cop = ops[i][k]; - - // mark op conditional to mask shots - cop.conditional = true; - cop.conditional_reg = sys_reg; - - if 
(!apply_batched_op(istate, cop, result, rng, false)) { - // call apply_op for each state - for (uint_t j = top_chunk_of_group_[i_group]; - j < top_chunk_of_group_[i_group + 1]; j++) { - qregs_[j].enable_batch(false); - apply_op(j, cop, result, rng[j - top_chunk_of_group_[i_group]], - false); - qregs_[j].enable_batch(true); - } - } - } - mask[i] = 0; - finished[i] = true; - } -} - -template -void StateChunk::initialize_creg(uint_t num_memory, - uint_t num_register) { - for (int_t i = 0; i < BaseState::cregs_.size(); i++) { - BaseState::cregs_[i].initialize(num_memory, num_register); - } -} - -template -void StateChunk::initialize_creg(uint_t num_memory, - uint_t num_register, - const std::string &memory_hex, - const std::string ®ister_hex) { - for (int_t i = 0; i < BaseState::cregs_.size(); i++) { - BaseState::cregs_[i].initialize(num_memory, num_register, memory_hex, - register_hex); - } -} - -template -void StateChunk::apply_save_expval(const int_t iChunk, - const Operations::Op &op, - ExperimentResult &result) { - // Check empty edge case - if (op.expval_params.empty()) { - throw std::invalid_argument( - "Invalid save expval instruction (Pauli components are empty)."); - } - bool variance = (op.type == Operations::OpType::save_expval_var); - - // Accumulate expval components - double expval(0.); - double sq_expval(0.); - - for (const auto ¶m : op.expval_params) { - // param is tuple (pauli, coeff, sq_coeff) - const auto val = expval_pauli(iChunk, op.qubits, std::get<0>(param)); - expval += std::get<1>(param) * val; - if (variance) { - sq_expval += std::get<2>(param) * val; - } - } - if (variance) { - std::vector expval_var(2); - expval_var[0] = expval; // mean - expval_var[1] = sq_expval - expval * expval; // variance - result.save_data_average(BaseState::cregs_[get_global_shot_index(iChunk)], - op.string_params[0], expval_var, op.type, - op.save_type); - } else { - result.save_data_average(BaseState::cregs_[get_global_shot_index(iChunk)], - op.string_params[0], expval, op.type, - op.save_type); - } -} - -//------------------------------------------------------------------------- -// functions for multi-chunk distribution -//------------------------------------------------------------------------- -template -void StateChunk::block_diagonal_matrix(const int_t iChunk, - reg_t &qubits, - cvector_t &diag) { - uint_t gid = global_chunk_index_ + iChunk; - uint_t i; - uint_t mask_out = 0; - uint_t mask_id = 0; - - reg_t qubits_in; - cvector_t diag_in; - - for (i = 0; i < qubits.size(); i++) { - if (qubits[i] < chunk_bits_) { // in chunk - qubits_in.push_back(qubits[i]); - } else { - mask_out |= (1ull << i); - if ((gid >> (qubits[i] - chunk_bits_)) & 1) - mask_id |= (1ull << i); - } - } - - if (qubits_in.size() < qubits.size()) { - for (i = 0; i < diag.size(); i++) { - if ((i & mask_out) == mask_id) - diag_in.push_back(diag[i]); - } - - if (qubits_in.size() == 0) { - qubits_in.push_back(0); - diag_in.resize(2); - diag_in[1] = diag_in[0]; - } - qubits = qubits_in; - diag = diag_in; - } -} - -template -void StateChunk::qubits_inout(const reg_t &qubits, reg_t &qubits_in, - reg_t &qubits_out) const { - int_t i; - qubits_in.clear(); - qubits_out.clear(); - for (i = 0; i < qubits.size(); i++) { - if (qubits[i] < chunk_bits_) { // in chunk - qubits_in.push_back(qubits[i]); - } else { - qubits_out.push_back(qubits[i]); - } - } -} - -template -template -void StateChunk::initialize_from_vector(const int_t iChunkIn, - const list_t &vec) { - int_t iChunk; - - if (multi_chunk_distribution_) { - if 
(chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for private(iChunk) - for (int_t ig = 0; ig < num_groups_; ig++) { - for (iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) { - list_t tmp(1ull << (chunk_bits_ * qubit_scale())); - for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - tmp[i] = vec[((global_chunk_index_ + iChunk) - << (chunk_bits_ * qubit_scale())) + - i]; - } - qregs_[iChunk].initialize_from_vector(tmp); - } - } - } else { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - list_t tmp(1ull << (chunk_bits_ * qubit_scale())); - for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - tmp[i] = vec[((global_chunk_index_ + iChunk) - << (chunk_bits_ * qubit_scale())) + - i]; - } - qregs_[iChunk].initialize_from_vector(tmp); - } - } - } else { - if (iChunkIn == STATE_APPLY_TO_ALL_CHUNKS) { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - qregs_[iChunk].initialize_from_vector(vec); - } - } else - qregs_[iChunkIn].initialize_from_vector(vec); - } -} - -template -template -void StateChunk::initialize_from_matrix(const int_t iChunkIn, - const list_t &mat) { - int_t iChunk; - if (multi_chunk_distribution_) { - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for private(iChunk) - for (int_t ig = 0; ig < num_groups_; ig++) { - for (iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) { - list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_)); - uint_t irow_chunk = - ((iChunk + global_chunk_index_) >> ((num_qubits_ - chunk_bits_))) - << (chunk_bits_); - uint_t icol_chunk = ((iChunk + global_chunk_index_) & - ((1ull << ((num_qubits_ - chunk_bits_))) - 1)) - << (chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - uint_t icol = i & ((1ull << chunk_bits_) - 1); - uint_t irow = i >> chunk_bits_; - tmp[i] = - mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; - } - qregs_[iChunk].initialize_from_matrix(tmp); - } - } - } else { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_)); - uint_t irow_chunk = - ((iChunk + global_chunk_index_) >> ((num_qubits_ - chunk_bits_))) - << (chunk_bits_); - uint_t icol_chunk = ((iChunk + global_chunk_index_) & - ((1ull << ((num_qubits_ - chunk_bits_))) - 1)) - << (chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - uint_t icol = i & ((1ull << chunk_bits_) - 1); - uint_t irow = i >> chunk_bits_; - tmp[i] = - mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; - } - qregs_[iChunk].initialize_from_matrix(tmp); - } - } - } else { - if (iChunkIn == STATE_APPLY_TO_ALL_CHUNKS) { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - qregs_[iChunk].initialize_from_matrix(mat); - } - } else - qregs_[iChunkIn].initialize_from_matrix(mat); - } -} - -template -auto StateChunk::apply_to_matrix(bool copy) { - // this function is used to collect states over chunks - int_t iChunk; - uint_t size = 1ull << (chunk_bits_ * qubit_scale()); - uint_t mask = (1ull << (chunk_bits_)) - 1; - uint_t num_threads = qregs_[0].get_omp_threads(); - - size_t size_required = - 2 * (sizeof(std::complex) << (num_qubits_ * 2)) + - (sizeof(std::complex) << (chunk_bits_ * 2)) * num_local_chunks_; - if ((size_required >> 20) > Utils::get_system_memory_mb()) { - throw 
std::runtime_error( - std::string("There is not enough memory to store states as matrix")); - } - - auto matrix = qregs_[0].copy_to_matrix(); - - if (distributed_rank_ == 0) { - matrix.resize(1ull << (num_qubits_), 1ull << (num_qubits_)); - - auto tmp = qregs_[0].copy_to_matrix(); - for (iChunk = 0; iChunk < num_global_chunks_; iChunk++) { - int_t i; - uint_t irow_chunk = (iChunk >> ((num_qubits_ - chunk_bits_))) - << chunk_bits_; - uint_t icol_chunk = - (iChunk & ((1ull << ((num_qubits_ - chunk_bits_))) - 1)) - << chunk_bits_; - - if (iChunk < num_local_chunks_) { - if (copy) - tmp = qregs_[iChunk].copy_to_matrix(); - else - tmp = qregs_[iChunk].move_to_matrix(); - } -#ifdef AER_MPI - else - recv_data(tmp.data(), size, 0, iChunk); -#endif -#pragma omp parallel for if (num_threads > 1) num_threads(num_threads) - for (i = 0; i < size; i++) { - uint_t irow = i >> (chunk_bits_); - uint_t icol = i & mask; - uint_t idx = ((irow + irow_chunk) << (num_qubits_)) + icol_chunk + icol; - matrix[idx] = tmp[i]; - } - } - } else { -#ifdef AER_MPI - // send matrices to process 0 - for (iChunk = 0; iChunk < num_global_chunks_; iChunk++) { - uint_t iProc = get_process_by_chunk(iChunk); - if (iProc == distributed_rank_) { - if (copy) { - auto tmp = qregs_[iChunk - global_chunk_index_].copy_to_matrix(); - send_data(tmp.data(), size, iChunk, 0); - } else { - auto tmp = qregs_[iChunk - global_chunk_index_].move_to_matrix(); - send_data(tmp.data(), size, iChunk, 0); - } - } - } -#endif - } - - return matrix; -} - -template -uint_t StateChunk::mapped_index(const uint_t idx) { - uint_t i, ret = 0; - uint_t t = idx; - - for (i = 0; i < num_qubits_; i++) { - if (t & 1) { - ret |= (1ull << qubit_map_[i]); - } - t >>= 1; - } - return ret; -} - -template -void StateChunk::apply_chunk_swap(const reg_t &qubits) { - uint_t nLarge = 1; - uint_t q0, q1; - int_t iChunk; - - q0 = qubits[qubits.size() - 2]; - q1 = qubits[qubits.size() - 1]; - - if (qubit_scale() == 1) { - std::swap(qubit_map_[q0], qubit_map_[q1]); - } - - if (q0 > q1) { - std::swap(q0, q1); - } - - if (q1 < chunk_bits_ * qubit_scale()) { - // inside chunk - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_mcswap(qubits); - } - } else { - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_mcswap(qubits); - } - } - } else { // swap over chunks - uint_t mask0, mask1; - - mask0 = (1ull << q0); - mask1 = (1ull << q1); - mask0 >>= (chunk_bits_ * qubit_scale()); - mask1 >>= (chunk_bits_ * qubit_scale()); - - if (distributed_procs_ == 1 || - (distributed_proc_bits_ >= 0 && - q1 < (num_qubits_ * qubit_scale() - - distributed_proc_bits_))) { // no data transfer between processes - // is needed - auto apply_chunk_swap_1qubit = [this, mask1, qubits](int_t iGroup) { - for (int_t ic = top_chunk_of_group_[iGroup]; - ic < top_chunk_of_group_[iGroup + 1]; ic++) { - uint_t baseChunk; - baseChunk = ic & (~mask1); - if (ic == baseChunk) - qregs_[ic].apply_chunk_swap(qubits, qregs_[ic | mask1], true); - } - }; - auto apply_chunk_swap_2qubits = [this, mask0, mask1, - qubits](int_t iGroup) { - for (int_t ic = top_chunk_of_group_[iGroup]; - ic < top_chunk_of_group_[iGroup + 1]; ic++) { - uint_t baseChunk; - baseChunk = ic & (~(mask0 | mask1)); - uint_t iChunk1 
= baseChunk | mask0; - uint_t iChunk2 = baseChunk | mask1; - if (ic == iChunk1) - qregs_[iChunk1].apply_chunk_swap(qubits, qregs_[iChunk2], true); - } - }; - if (q0 < chunk_bits_ * qubit_scale()) - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), - 0, num_groups_, apply_chunk_swap_1qubit); - else - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), - 0, num_groups_, apply_chunk_swap_2qubits); - } -#ifdef AER_MPI - else { - int_t iPair; - uint_t nPair; - uint_t baseChunk, iChunk1, iChunk2; - - if (q0 < chunk_bits_ * qubit_scale()) - nLarge = 1; - else - nLarge = 2; - - // chunk scheduler that supports any number of processes - uint_t nu[3]; - uint_t ub[3]; - uint_t iu[3]; - uint_t add; - uint_t iLocalChunk, iRemoteChunk, iProc; - int i; - - if (q0 < chunk_bits_ * qubit_scale()) { - nLarge = 1; - nu[0] = 1ull << (q1 - chunk_bits_ * qubit_scale()); - ub[0] = 0; - iu[0] = 0; - - nu[1] = 1ull << (num_qubits_ * qubit_scale() - q1 - 1); - ub[1] = (q1 - chunk_bits_ * qubit_scale()) + 1; - iu[1] = 0; - } else { - nLarge = 2; - nu[0] = 1ull << (q0 - chunk_bits_ * qubit_scale()); - ub[0] = 0; - iu[0] = 0; - - nu[1] = 1ull << (q1 - q0 - 1); - ub[1] = (q0 - chunk_bits_ * qubit_scale()) + 1; - iu[1] = 0; - - nu[2] = 1ull << (num_qubits_ * qubit_scale() - q1 - 1); - ub[2] = (q1 - chunk_bits_ * qubit_scale()) + 1; - iu[2] = 0; - } - nPair = 1ull << (num_qubits_ * qubit_scale() - - chunk_bits_ * qubit_scale() - nLarge); - - for (iPair = 0; iPair < nPair; iPair++) { - // calculate index of pair of chunks - baseChunk = 0; - add = 1; - for (i = nLarge; i >= 0; i--) { - baseChunk += (iu[i] << ub[i]); - // update for next - iu[i] += add; - add = 0; - if (iu[i] >= nu[i]) { - iu[i] = 0; - add = 1; - } - } - - iChunk1 = baseChunk | mask0; - iChunk2 = baseChunk | mask1; - - if (iChunk1 >= chunk_index_begin_[distributed_rank_] && - iChunk1 < chunk_index_end_[distributed_rank_]) { // chunk1 is on - // this process - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - qregs_[iChunk1 - global_chunk_index_].apply_chunk_swap( - qubits, qregs_[iChunk2 - global_chunk_index_], true); - continue; - } else { - iLocalChunk = iChunk1; - iRemoteChunk = iChunk2; - iProc = get_process_by_chunk(iChunk2); - } - } else { - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - iLocalChunk = iChunk2; - iRemoteChunk = iChunk1; - iProc = get_process_by_chunk(iChunk1); - } else { - continue; // there is no chunk for this pair on this process - } - } - - MPI_Request reqSend, reqRecv; - MPI_Status st; - uint_t sizeRecv, sizeSend; - - auto pRecv = - qregs_[iLocalChunk - global_chunk_index_].recv_buffer(sizeRecv); - MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqRecv); - - auto pSend = - qregs_[iLocalChunk - global_chunk_index_].send_buffer(sizeSend); - MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqSend); - - MPI_Wait(&reqSend, &st); - MPI_Wait(&reqRecv, &st); - - qregs_[iLocalChunk - global_chunk_index_].apply_chunk_swap( - qubits, iRemoteChunk); - } - } -#endif - } -} - -template -void StateChunk::apply_multi_chunk_swap(const reg_t &qubits) { - int_t nswap = qubits.size() / 2; - reg_t chunk_shuffle_qubits(nswap, 0); - reg_t local_swaps; - uint_t baseChunk = 0; - uint_t nchunk = 1ull << nswap; - reg_t chunk_procs(nchunk); - reg_t chunk_offset(nchunk); - - if 
(qubit_scale() == 1) { - for (int_t i = 0; i < nswap; i++) - std::swap(qubit_map_[qubits[i * 2]], qubit_map_[qubits[i * 2] + 1]); - } - - // define local swaps - for (int_t i = 0; i < nswap; i++) { - if (qubits[i * 2] >= chunk_bits_ * qubit_scale() - nswap) // no swap - // required - chunk_shuffle_qubits[qubits[i * 2] + nswap - - chunk_bits_ * qubit_scale()] = qubits[i * 2 + 1]; - } - int_t pos = 0; - for (int_t i = 0; i < nswap; i++) { - if (qubits[i * 2] < - chunk_bits_ * qubit_scale() - nswap) { // local swap required - // find empty position - while (pos < nswap) { - if (chunk_shuffle_qubits[pos] < chunk_bits_ * qubit_scale()) { - chunk_shuffle_qubits[pos] = qubits[i * 2 + 1]; - local_swaps.push_back(qubits[i * 2]); - local_swaps.push_back(chunk_bits_ * qubit_scale() - nswap + pos); - pos++; - break; - } - pos++; - } - } - } - for (int_t i = 0; i < nswap; i++) - chunk_shuffle_qubits[i] -= chunk_bits_ * qubit_scale(); - - // swap inside chunks to prepare for all-to-all shuffle - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } else { - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } - - // apply all-to-all chunk shuffle - int_t nPair; - reg_t chunk_shuffle_qubits_sorted = chunk_shuffle_qubits; - std::sort(chunk_shuffle_qubits_sorted.begin(), - chunk_shuffle_qubits_sorted.end()); - - nPair = num_global_chunks_ >> nswap; - - for (uint_t i = 0; i < nchunk; i++) { - chunk_offset[i] = 0; - for (uint_t k = 0; k < nswap; k++) { - if (((i >> k) & 1) != 0) - chunk_offset[i] += (1ull << chunk_shuffle_qubits[k]); - } - } - -#ifdef AER_MPI - std::vector reqSend(nchunk); - std::vector reqRecv(nchunk); -#endif - - for (int_t iPair = 0; iPair < nPair; iPair++) { - uint_t i1, i2, k, ii, t; - baseChunk = 0; - ii = iPair; - for (k = 0; k < nswap; k++) { - t = ii & ((1ull << chunk_shuffle_qubits_sorted[k]) - 1); - baseChunk += t; - ii = (ii - t) << 1; - } - baseChunk += ii; - - for (i1 = 0; i1 < nchunk; i1++) { - chunk_procs[i1] = get_process_by_chunk(baseChunk + chunk_offset[i1]); - } - - // all-to-all - // send data - for (uint_t iswap = 1; iswap < nchunk; iswap++) { - uint_t sizeRecv, sizeSend; - uint_t num_local_swap = 0; - for (i1 = 0; i1 < nchunk; i1++) { - i2 = i1 ^ iswap; - if (i1 >= i2) - continue; - - uint_t iProc1 = chunk_procs[i1]; - uint_t iProc2 = chunk_procs[i2]; - if (iProc1 != distributed_rank_ && iProc2 != distributed_rank_) - continue; - if (iProc1 == iProc2) { // on the same process - num_local_swap++; - continue; // swap while data is exchanged between processes - } -#ifdef AER_MPI - uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap); - uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap); - uint_t iChunk1 = baseChunk + chunk_offset[i1] - global_chunk_index_; - uint_t iChunk2 = baseChunk + chunk_offset[i2] - global_chunk_index_; - - int_t tid = (iPair << nswap) + iswap; - - if (iProc1 == distributed_rank_) { - auto pRecv = qregs_[iChunk1].recv_buffer(sizeRecv); - MPI_Irecv(pRecv + offset2, (sizeRecv >> nswap), MPI_BYTE, iProc2, tid, - distributed_comm_, &reqRecv[i2]); - - auto pSend = qregs_[iChunk1].send_buffer(sizeSend); - MPI_Isend(pSend + offset2, (sizeSend >> nswap), MPI_BYTE, iProc2, tid, - 
distributed_comm_, &reqSend[i2]); - } else { - auto pRecv = qregs_[iChunk2].recv_buffer(sizeRecv); - MPI_Irecv(pRecv + offset1, (sizeRecv >> nswap), MPI_BYTE, iProc1, tid, - distributed_comm_, &reqRecv[i1]); - - auto pSend = qregs_[iChunk2].send_buffer(sizeSend); - MPI_Isend(pSend + offset1, (sizeSend >> nswap), MPI_BYTE, iProc1, tid, - distributed_comm_, &reqSend[i1]); - } -#endif - } - - // swaps inside process - if (num_local_swap > 0) { - for (i1 = 0; i1 < nchunk; i1++) { - i2 = i1 ^ iswap; - if (i1 > i2) - continue; - - uint_t iProc1 = chunk_procs[i1]; - uint_t iProc2 = chunk_procs[i2]; - if (iProc1 != distributed_rank_ && iProc2 != distributed_rank_) - continue; - if (iProc1 == iProc2) { // on the same process - uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap); - uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap); - uint_t iChunk1 = baseChunk + chunk_offset[i1] - global_chunk_index_; - uint_t iChunk2 = baseChunk + chunk_offset[i2] - global_chunk_index_; - qregs_[iChunk1].apply_chunk_swap( - qregs_[iChunk2], offset2, offset1, - (1ull << (chunk_bits_ * qubit_scale() - nswap))); - } - } - } - -#ifdef AER_MPI - // recv data - for (i1 = 0; i1 < nchunk; i1++) { - i2 = i1 ^ iswap; - - uint_t iProc1 = chunk_procs[i1]; - uint_t iProc2 = chunk_procs[i2]; - if (iProc1 != distributed_rank_) - continue; - if (iProc1 == iProc2) { // on the same process - continue; - } - uint_t iChunk1 = baseChunk + chunk_offset[i1] - global_chunk_index_; - uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap); - - MPI_Status st; - MPI_Wait(&reqSend[i2], &st); - MPI_Wait(&reqRecv[i2], &st); - - // copy states from recv buffer to chunk - qregs_[iChunk1].apply_chunk_swap( - qregs_[iChunk1], offset2, offset2, - (1ull << (chunk_bits_ * qubit_scale() - nswap))); - } -#endif - } - } - - // restore qubits order - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } else { - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } -} - -template -void StateChunk::apply_chunk_x(const uint_t qubit) { - int_t iChunk; - uint_t nLarge = 1; - - if (qubit < chunk_bits_ * qubit_scale()) { - auto apply_mcx = [this, qubit](int_t ig) { - reg_t qubits(1, qubit); - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_mcx(qubits); - }; - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), 0, - num_groups_, apply_mcx); - } else { // exchange over chunks - int_t iPair; - uint_t nPair, mask; - uint_t baseChunk, iChunk1, iChunk2; - reg_t qubits(2); - qubits[0] = qubit; - qubits[1] = qubit; - - mask = (1ull << qubit); - mask >>= (chunk_bits_ * qubit_scale()); - - if (distributed_procs_ == 1 || - (distributed_proc_bits_ >= 0 && - qubit < (num_qubits_ * qubit_scale() - - distributed_proc_bits_))) { // no data transfer between - // processes is needed - nPair = num_local_chunks_ >> 1; - - auto apply_chunk_swap = [this, mask, qubits](int_t iGroup) { - for (int_t ic = top_chunk_of_group_[iGroup]; - ic < top_chunk_of_group_[iGroup + 1]; ic++) { - uint_t pairChunk; - pairChunk = ic ^ mask; - if (ic < pairChunk) - qregs_[ic].apply_chunk_swap(qubits, qregs_[pairChunk], true); - } - 
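When the target qubit of an X lies above the chunk boundary, the gate reduces to exchanging each chunk with the partner whose global index differs in exactly that bit (`pairChunk = ic ^ mask` above). A self-contained sketch of the pairing that swaps whole amplitude blocks in place, ignoring the buffered send/receive path used for remote pairs:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// X on global qubit q (>= chunk_qubits) pairs chunk ic with ic ^ mask;
// visiting each pair once and swapping the blocks applies the gate.
void apply_x_over_chunks(std::vector<std::vector<double>> &chunks, int q,
                         int chunk_qubits) {
  const std::uint64_t mask = 1ull << (q - chunk_qubits);
  for (std::uint64_t ic = 0; ic < chunks.size(); ic++) {
    const std::uint64_t pair = ic ^ mask;
    if (ic < pair && pair < chunks.size())
      std::swap(chunks[ic], chunks[pair]); // O(1) buffer swap per pair
  }
}
```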
}; - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), 0, - nPair, apply_chunk_swap); - } -#ifdef AER_MPI - else { - // chunk scheduler that supports any number of processes - uint_t nu[3]; - uint_t ub[3]; - uint_t iu[3]; - uint_t add; - uint_t iLocalChunk, iRemoteChunk, iProc; - int i; - - nLarge = 1; - nu[0] = 1ull << (qubit - chunk_bits_ * qubit_scale()); - ub[0] = 0; - iu[0] = 0; - - nu[1] = 1ull << (num_qubits_ * qubit_scale() - qubit - 1); - ub[1] = (qubit - chunk_bits_ * qubit_scale()) + 1; - iu[1] = 0; - nPair = 1ull << (num_qubits_ * qubit_scale() - - chunk_bits_ * qubit_scale() - 1); - - for (iPair = 0; iPair < nPair; iPair++) { - // calculate index of pair of chunks - baseChunk = 0; - add = 1; - for (i = 1; i >= 0; i--) { - baseChunk += (iu[i] << ub[i]); - // update for next - iu[i] += add; - add = 0; - if (iu[i] >= nu[i]) { - iu[i] = 0; - add = 1; - } - } - - iChunk1 = baseChunk; - iChunk2 = baseChunk | mask; - - if (iChunk1 >= chunk_index_begin_[distributed_rank_] && - iChunk1 < chunk_index_end_[distributed_rank_]) { // chunk1 is on - // this process - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - qregs_[iChunk1 - global_chunk_index_].apply_chunk_swap( - qubits, qregs_[iChunk2 - global_chunk_index_], true); - continue; - } else { - iLocalChunk = iChunk1; - iRemoteChunk = iChunk2; - iProc = get_process_by_chunk(iChunk2); - } - } else { - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - iLocalChunk = iChunk2; - iRemoteChunk = iChunk1; - iProc = get_process_by_chunk(iChunk1); - } else { - continue; // there is no chunk for this pair on this process - } - } - - MPI_Request reqSend, reqRecv; - MPI_Status st; - uint_t sizeRecv, sizeSend; - - auto pSend = - qregs_[iLocalChunk - global_chunk_index_].send_buffer(sizeSend); - MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqSend); - - auto pRecv = - qregs_[iLocalChunk - global_chunk_index_].recv_buffer(sizeRecv); - MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqRecv); - - MPI_Wait(&reqSend, &st); - MPI_Wait(&reqRecv, &st); - - qregs_[iLocalChunk - global_chunk_index_].apply_chunk_swap( - qubits, iRemoteChunk); - } - } -#endif - } -} - -template -void StateChunk::send_chunk(uint_t local_chunk_index, - uint_t global_pair_index) { -#ifdef AER_MPI - MPI_Request reqSend; - MPI_Status st; - uint_t sizeSend; - uint_t iProc; - - iProc = get_process_by_chunk(global_pair_index); - - auto pSend = qregs_[local_chunk_index].send_buffer(sizeSend); - MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, - local_chunk_index + global_chunk_index_, distributed_comm_, - &reqSend); - - MPI_Wait(&reqSend, &st); - - qregs_[local_chunk_index].release_send_buffer(); -#endif -} - -template -void StateChunk::recv_chunk(uint_t local_chunk_index, - uint_t global_pair_index) { -#ifdef AER_MPI - MPI_Request reqRecv; - MPI_Status st; - uint_t sizeRecv; - uint_t iProc; - - iProc = get_process_by_chunk(global_pair_index); - - auto pRecv = qregs_[local_chunk_index].recv_buffer(sizeRecv); - MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, global_pair_index, - distributed_comm_, &reqRecv); - - MPI_Wait(&reqRecv, &st); -#endif -} - -template -template -void StateChunk::send_data(data_t *pSend, uint_t size, uint_t myid, - uint_t pairid) { -#ifdef AER_MPI - MPI_Request reqSend; - MPI_Status st; - uint_t iProc; - - iProc = 
get_process_by_chunk(pairid); - - MPI_Isend(pSend, size * sizeof(data_t), MPI_BYTE, iProc, myid, - distributed_comm_, &reqSend); - - MPI_Wait(&reqSend, &st); -#endif -} - -template -template -void StateChunk::recv_data(data_t *pRecv, uint_t size, uint_t myid, - uint_t pairid) { -#ifdef AER_MPI - MPI_Request reqRecv; - MPI_Status st; - uint_t iProc; - - iProc = get_process_by_chunk(pairid); - - MPI_Irecv(pRecv, size * sizeof(data_t), MPI_BYTE, iProc, pairid, - distributed_comm_, &reqRecv); - - MPI_Wait(&reqRecv, &st); -#endif -} - -template -void StateChunk::reduce_sum(reg_t &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t i, n = sum.size(); - reg_t tmp(n); - MPI_Allreduce(&sum[0], &tmp[0], n, MPI_UINT64_T, MPI_SUM, - distributed_comm_); - for (i = 0; i < n; i++) { - sum[i] = tmp[i]; - } - } -#endif -} - -template -void StateChunk::reduce_sum(rvector_t &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t i, n = sum.size(); - rvector_t tmp(n); - MPI_Allreduce(&sum[0], &tmp[0], n, MPI_DOUBLE_PRECISION, MPI_SUM, - distributed_comm_); - for (i = 0; i < n; i++) { - sum[i] = tmp[i]; - } - } -#endif -} - -template -void StateChunk::reduce_sum(complex_t &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - complex_t tmp; - MPI_Allreduce(&sum, &tmp, 2, MPI_DOUBLE_PRECISION, MPI_SUM, - distributed_comm_); - sum = tmp; - } -#endif -} - -template -void StateChunk::reduce_sum(double &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - double tmp; - MPI_Allreduce(&sum, &tmp, 1, MPI_DOUBLE_PRECISION, MPI_SUM, - distributed_comm_); - sum = tmp; - } -#endif -} - -template -void StateChunk::gather_value(rvector_t &val) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - rvector_t tmp = val; - MPI_Alltoall(&tmp[0], 1, MPI_DOUBLE_PRECISION, &val[0], 1, - MPI_DOUBLE_PRECISION, distributed_comm_); - } -#endif -} - -template -void StateChunk::sync_process(void) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - MPI_Barrier(distributed_comm_); - } -#endif -} - -// gather distributed state into vector (if memory is enough) -template -template -void StateChunk::gather_state( - std::vector> &state) { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t size, local_size, global_size, offset; - int i; - std::vector recv_counts(distributed_procs_); - std::vector recv_offset(distributed_procs_); - - global_size = 0; - for (i = 0; i < distributed_procs_; i++) { - recv_offset[i] = - (int)(chunk_index_begin_[i] << (chunk_bits_ * qubit_scale())) * 2; - recv_counts[i] = (int)((chunk_index_end_[i] - chunk_index_begin_[i]) - << (chunk_bits_ * qubit_scale())); - global_size += recv_counts[i]; - recv_counts[i] *= 2; - } - if ((global_size >> 21) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to gather state")); - } - std::vector> local_state = state; - state.resize(global_size); - - if (sizeof(std::complex) == 16) { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0], - &recv_offset[0], MPI_DOUBLE_PRECISION, distributed_comm_); - } else { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0], - MPI_FLOAT, distributed_comm_); - } - } -#endif -} - -template -template -void StateChunk::gather_state( - AER::Vector> &state) { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t size, local_size, global_size, offset; - int i; - - std::vector 
recv_counts(distributed_procs_); - std::vector recv_offset(distributed_procs_); - - global_size = 0; - for (i = 0; i < distributed_procs_; i++) { - recv_offset[i] = - (int)(chunk_index_begin_[i] << (chunk_bits_ * qubit_scale())) * 2; - recv_counts[i] = (int)((chunk_index_end_[i] - chunk_index_begin_[i]) - << (chunk_bits_ * qubit_scale())); - global_size += recv_counts[i]; - recv_counts[i] *= 2; - } - if ((global_size >> 21) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to gather state")); - } - AER::Vector> local_state = state; - state.resize(global_size); - - if (sizeof(std::complex) == 16) { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0], - &recv_offset[0], MPI_DOUBLE_PRECISION, distributed_comm_); - } else { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0], - MPI_FLOAT, distributed_comm_); - } - } -#endif -} - -template -void StateChunk::gather_creg_memory(void) { -#ifdef AER_MPI - int_t i, j; - uint_t n64, i64, ibit; - - if (distributed_procs_ == 1) - return; - if (BaseState::cregs_[0].memory_size() == 0) - return; - - // number of 64-bit integers per memory - n64 = (BaseState::cregs_[0].memory_size() + 63) >> 6; - - reg_t bin_memory(n64 * num_local_chunks_, 0); - // compress memory string to binary -#pragma omp parallel for private(i, j, i64, ibit) - for (i = 0; i < num_local_chunks_; i++) { - for (j = 0; j < BaseState::cregs_[0].memory_size(); j++) { - i64 = j >> 6; - ibit = j & 63; - if (BaseState::cregs_[global_chunk_index_ + i].creg_memory()[j] == '1') { - bin_memory[i * n64 + i64] |= (1ull << ibit); - } - } - } - - reg_t recv(n64 * num_global_chunks_); - std::vector recv_counts(distributed_procs_); - std::vector recv_offset(distributed_procs_); - - for (i = 0; i < distributed_procs_; i++) { - recv_offset[i] = num_global_chunks_ * i / distributed_procs_; - recv_counts[i] = - (num_global_chunks_ * (i + 1) / distributed_procs_) - recv_offset[i]; - } - - MPI_Allgatherv(&bin_memory[0], n64 * num_local_chunks_, MPI_UINT64_T, - &recv[0], &recv_counts[0], &recv_offset[0], MPI_UINT64_T, - distributed_comm_); - - // store gathered memory -#pragma omp parallel for private(i, j, i64, ibit) - for (i = 0; i < num_global_chunks_; i++) { - for (j = 0; j < BaseState::cregs_[0].memory_size(); j++) { - i64 = j >> 6; - ibit = j & 63; - if (((recv[i * n64 + i64] >> ibit) & 1) == 1) - BaseState::cregs_[i].creg_memory()[j] = '1'; - else - BaseState::cregs_[i].creg_memory()[j] = '0'; - } - } -#endif -} - -//------------------------------------------------------------------------- -} // namespace QuantumState -//------------------------------------------------------------------------- -} // end namespace AER -//------------------------------------------------------------------------- -#endif diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index 67c0c454ef..37067c172a 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -399,6 +399,12 @@ class Chunk { void probabilities(std::vector &probs, const reg_t &qubits) const { chunk_container_.lock()->probabilities(probs, chunk_pos_, qubits); } + // get norm of matrix multiplication + double expval_matrix(const reg_t &qubits, const cvector_t &mat, + const uint_t count) const { + return chunk_container_.lock()->expval_matrix(chunk_pos_, qubits, 
mat, + count); + } // Pauli expectation values double expval_pauli(const reg_t &qubits, const std::string &pauli, const complex_t initial_phase) const { diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index a609f135a8..b674e6217c 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -174,11 +174,11 @@ class ChunkContainer virtual thrust::complex Get(uint_t i) const = 0; virtual void StoreMatrix(const std::vector> &mat, - uint_t iChunk) = 0; + uint_t iChunk) const = 0; virtual void StoreMatrix(const std::complex *mat, uint_t iChunk, - uint_t size) = 0; + uint_t size) const = 0; virtual void StoreUintParams(const std::vector &prm, - uint_t iChunk) = 0; + uint_t iChunk) const = 0; virtual void ResizeMatrixBuffers(int bits) = 0; virtual void CopyIn(Chunk &src, uint_t iChunk) = 0; @@ -310,6 +310,11 @@ class ChunkContainer virtual void probabilities(std::vector &probs, const uint_t iChunk, const reg_t &qubits) const; + // get norm of matrix multiplication + virtual double expval_matrix(const uint_t iChunk, const reg_t &qubits, + const cvector_t &mat, + const uint_t count) const; + // Pauli expectation values virtual double expval_pauli(const uint_t iChunk, const reg_t &qubits, const std::string &pauli, @@ -1009,6 +1014,32 @@ double ChunkContainer::trace(uint_t iChunk, uint_t row, return ret; } +template +double ChunkContainer::expval_matrix(const uint_t iChunk, + const reg_t &qubits, + const cvector_t &mat, + const uint_t count) const { + double ret; + const size_t N = qubits.size(); + + if (N == 1) + ExecuteSum(&ret, NormMatrixMult2x2(mat, qubits[0]), iChunk, count); + else { + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + for (int_t i = 0; i < N; i++) { + qubits_sorted.push_back(qubits[i]); + } + + StoreMatrix(mat, iChunk); + StoreUintParams(qubits_sorted, iChunk); + + ExecuteSum(&ret, NormMatrixMultNxN(N), iChunk, count); + } + + return ret; +} + template double ChunkContainer::expval_pauli(const uint_t iChunk, const reg_t &qubits, diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index 8023b1699c..1efc57db52 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -57,6 +57,8 @@ class ChunkManager { int num_threads_per_group_; uint_t num_creg_bits_ = 0; + reg_t target_gpus_; + public: ChunkManager(); @@ -71,7 +73,7 @@ class ChunkManager { uint_t Allocate(int chunk_bits, int nqubits, uint_t nchunks, uint_t chunk_index, int matrix_bit, bool density_mat, - bool enable_cuStatevec); + reg_t &gpus, bool enable_cuStatevec); void Free(void); int num_devices(void) { return num_devices_; } @@ -160,7 +162,7 @@ template uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, uint_t nchunks, uint_t chunk_index, int matrix_bit, bool density_mat, - bool enable_cuStatevec) { + reg_t &gpus, bool enable_cuStatevec) { uint_t num_buffers; int iDev; uint_t is, ie, nc; @@ -183,6 +185,17 @@ uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, density_matrix_ = density_mat; enable_cuStatevec_ = enable_cuStatevec; + target_gpus_ = gpus; + if (target_gpus_.size() > 0) { + num_devices_ = target_gpus_.size(); + if (num_devices_ > 1) + multi_gpu = true; + } else { + target_gpus_.resize(num_devices_); + for (iDev = 0; iDev < num_devices_; iDev++) { + target_gpus_[iDev] = iDev; + } + } 
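The `target_gpus` handling above either honors an explicit device list or falls back to enumerating every visible device, and the id finally handed to `cudaSetDevice` is `target_gpus_[(iDev + idev_start) % num_devices_]`. A minimal sketch of just the selection rule, with `resolve_target_gpus` as a hypothetical helper:

```cpp
#include <cstdint>
#include <vector>

// An explicit list (e.g. {0, 2}) pins the simulation to those GPUs and
// implies multi-GPU mode when it names more than one device; an empty
// list means "use every device cudaGetDeviceCount reports".
std::vector<std::uint64_t>
resolve_target_gpus(const std::vector<std::uint64_t> &requested,
                    int visible_devices) {
  if (!requested.empty())
    return requested;
  std::vector<std::uint64_t> all(visible_devices);
  for (int i = 0; i < visible_devices; i++)
    all[i] = i;
  return all;
}
```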
chunk_index_ = chunk_index; @@ -246,7 +259,7 @@ uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, if (!multi_gpu) { size_t freeMem, totalMem; - cudaSetDevice(0); + cudaSetDevice(target_gpus_[0]); cudaMemGetInfo(&freeMem, &totalMem); if (freeMem > (((uint_t)sizeof(thrust::complex) * (nchunks + num_buffers + AER_DUMMY_BUFFERS)) @@ -295,14 +308,16 @@ uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, chunk_index_ + chunks_allocated); // set first chunk index for the container chunks_[iDev]->set_num_creg_bits(num_creg_bits_); - if (num_devices_ > 0) - chunks_allocated += chunks_[iDev]->Allocate( - (iDev + idev_start) % num_devices_, chunk_bits, nqubits, nc, - num_buffers, multi_shots_, matrix_bit, density_matrix_); - else + if (num_devices_ > 0) { + int id = target_gpus_[(iDev + idev_start) % num_devices_]; + chunks_allocated += + chunks_[iDev]->Allocate(id, chunk_bits, nqubits, nc, num_buffers, + multi_shots_, matrix_bit, density_matrix_); + } else { chunks_allocated += chunks_[iDev]->Allocate(iDev, chunk_bits, nqubits, nc, num_buffers, multi_shots_, matrix_bit, density_matrix_); + } } if (chunks_allocated < num_chunks_) { int nplaces_add = num_places_; diff --git a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp index 3fd95b94ce..9fe2fadefd 100644 --- a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp +++ b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp @@ -142,7 +142,7 @@ uint_t cuStateVecChunkContainer::Allocate( throw std::runtime_error(str.str()); } - err = custatevecSetStream(custatevec_handle_, BaseContainer::stream_); + err = custatevecSetStream(custatevec_handle_, BaseContainer::stream(0)); if (err != CUSTATEVEC_STATUS_SUCCESS) { std::stringstream str; str << "cuStateVecChunkContainer::allocate::custatevecSetStream : " @@ -214,13 +214,13 @@ reg_t cuStateVecChunkContainer::sample_measure( reg_t samples(SHOTS, 0); BaseContainer::set_device(); - custatevecSetStream(custatevec_handle_, BaseContainer::stream_); + custatevecSetStream(custatevec_handle_, BaseContainer::stream(0)); custatevecStatus_t err; custatevecSamplerDescriptor_t sampler; size_t extSize; - cudaStreamSynchronize(BaseContainer::stream_); + cudaStreamSynchronize(BaseContainer::stream(0)); cudaDataType_t state_type; if (sizeof(data_t) == sizeof(double)) diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index e4237dec67..bfd75cb92b 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -31,13 +31,17 @@ namespace Chunk { template class DeviceChunkContainer : public ChunkContainer { protected: - AERDeviceVector> - data_; // device vector to chunks and buffers - AERDeviceVector> matrix_; // storage for large matrix - mutable AERDeviceVector params_; // storage for additional parameters - AERDeviceVector reduce_buffer_; // buffer for reduction - AERDeviceVector - probability_buffer_; // buffer used for measure probability + // device vector to chunks and buffers + AERDeviceVector> data_; + // storage for large matrix + mutable AERDeviceVector> matrix_; + // storage for additional parameters + mutable AERDeviceVector params_; + // buffer for reduction + AERDeviceVector reduce_buffer_; + // buffer used for measure probability + AERDeviceVector probability_buffer_; + AERDeviceVector cregs_; AERHostVector cregs_host_; int device_id_; 
// device index @@ -51,6 +55,7 @@ class DeviceChunkContainer : public ChunkContainer { bool multi_shots_; // multi-shot parallelization bool creg_host_update_; + bool creg_dev_update_; // for register blocking thrust::host_vector blocked_qubits_holder_; @@ -60,8 +65,7 @@ class DeviceChunkContainer : public ChunkContainer { reg_t num_blocked_qubits_; #ifdef AER_THRUST_CUDA - cudaStream_t stream_; // asynchronous execution - cudaStream_t stream_cache_; // asynchronous execution + std::vector stream_; // asynchronous execution #endif public: @@ -72,10 +76,7 @@ class DeviceChunkContainer : public ChunkContainer { num_matrices_ = 1; multi_shots_ = false; creg_host_update_ = true; -#ifdef AER_THRUST_CUDA - stream_ = nullptr; - stream_cache_ = nullptr; -#endif + creg_dev_update_ = false; } ~DeviceChunkContainer(); @@ -106,10 +107,11 @@ class DeviceChunkContainer : public ChunkContainer { void Deallocate(void) override; void StoreMatrix(const std::vector> &mat, - uint_t iChunk) override; + uint_t iChunk) const override; void StoreMatrix(const std::complex *mat, uint_t iChunk, - uint_t size) override; - void StoreUintParams(const std::vector &prm, uint_t iChunk) override; + uint_t size) const override; + void StoreUintParams(const std::vector &prm, + uint_t iChunk) const override; void ResizeMatrixBuffers(int bits) override; void calculate_matrix_buffer_size(int bits); @@ -123,8 +125,10 @@ class DeviceChunkContainer : public ChunkContainer { #ifdef AER_THRUST_CUDA cudaStream_t stream(uint_t iChunk) const { if (iChunk >= this->num_chunks_) - return stream_cache_; - return stream_; + return stream_[(num_matrices_ + iChunk - this->num_chunks_)]; + if (num_matrices_ == 1) + return stream_[0]; + return stream_[iChunk]; } #endif @@ -212,9 +216,9 @@ class DeviceChunkContainer : public ChunkContainer { #ifdef AER_THRUST_CUDA cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_host_.data()), thrust::raw_pointer_cast(cregs_.data()), - sizeof(uint_t) * this->num_chunks_ * n64, - cudaMemcpyDeviceToHost, stream_); - cudaStreamSynchronize(stream_); + sizeof(uint_t) * num_matrices_ * n64, + cudaMemcpyDeviceToHost, stream_[0]); + cudaStreamSynchronize(stream_[0]); #else thrust::copy_n(cregs_.begin(), this->num_chunks_ * n64, cregs_host_.begin()); @@ -224,6 +228,50 @@ class DeviceChunkContainer : public ChunkContainer { return (cregs_host_[iChunk * n64 + i64] >> ibit) & 1; } + void write_cbit(uint_t iChunk, int qubit, int val) { + uint_t n64, i64, ibit; + if (qubit >= this->num_creg_bits_) + return; + n64 = (this->num_creg_bits_ + 63) >> 6; + i64 = qubit >> 6; + ibit = qubit & 63; + if (iChunk == 0 && creg_host_update_) { + creg_host_update_ = false; +#ifdef AER_THRUST_CUDA + cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_host_.data()), + thrust::raw_pointer_cast(cregs_.data()), + sizeof(uint_t) * num_matrices_ * n64, + cudaMemcpyDeviceToHost, stream_[0]); + cudaStreamSynchronize(stream_[0]); +#else + thrust::copy_n(cregs_.begin(), this->num_chunks_ * n64, + cregs_host_.begin()); +#endif + } + + cregs_host_[iChunk * n64 + i64] = + (cregs_host_[iChunk * n64 + i64] & (~(1ull << ibit))) | + (((uint_t)val & 1) << ibit); + creg_dev_update_ = true; + } + void store_cbits(void) { + if (creg_dev_update_) { + uint_t n64; + n64 = (this->num_creg_bits_ + 63) >> 6; + creg_dev_update_ = false; + creg_host_update_ = false; +#ifdef AER_THRUST_CUDA + cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_.data()), + thrust::raw_pointer_cast(cregs_host_.data()), + sizeof(uint_t) * num_matrices_ * n64, + cudaMemcpyHostToDevice, stream_[0]); 
+#else + thrust::copy_n(cregs_host_.begin(), this->num_chunks_ * n64, + cregs_.begin()); +#endif + } + } + uint_t *creg_buffer(uint_t iChunk) const { uint_t n64; n64 = (this->num_creg_bits_ + 63) >> 6; @@ -234,10 +282,7 @@ class DeviceChunkContainer : public ChunkContainer { void synchronize(uint_t iChunk) { #ifdef AER_THRUST_CUDA set_device(); - if (iChunk >= this->num_chunks_) - cudaStreamSynchronize(stream_cache_); - else - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(stream(iChunk)); #endif } @@ -276,28 +321,24 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, set_device(); #ifdef AER_THRUST_CUDA - if (!multi_shots) { - int ip, nd; - cudaGetDeviceCount(&nd); - peer_access_.resize(nd); - for (i = 0; i < nd; i++) { - ip = 1; - if (i != device_id_) { - cudaDeviceCanAccessPeer(&ip, device_id_, i); - } - if (ip) { - if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess) - cudaGetLastError(); - peer_access_[i] = true; - } else - peer_access_[i] = false; + int ip, nd; + cudaGetDeviceCount(&nd); + peer_access_.resize(nd); + for (i = 0; i < nd; i++) { + ip = 1; + if (i != device_id_) { + cudaDeviceCanAccessPeer(&ip, device_id_, i); } - } else { -#endif - peer_access_.resize(1); - peer_access_[0] = true; -#ifdef AER_THRUST_CUDA + if (ip) { + if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess) + cudaGetLastError(); + peer_access_[i] = true; + } else + peer_access_[i] = false; } +#else + peer_access_.resize(1); + peer_access_[0] = true; #endif this->num_buffers_ = buffers; @@ -352,10 +393,7 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, } } - cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking); - cudaStreamCreateWithFlags(&stream_cache_, cudaStreamNonBlocking); #endif - ResizeMatrixBuffers(matrix_bit); this->num_chunks_ = nc; @@ -369,14 +407,29 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, nc_tmp >>= 1; } + uint_t size = num_matrices_ + this->num_buffers_; + +#ifdef AER_THRUST_CUDA + stream_.resize(size); + for (int i = 0; i < size; i++) + cudaStreamCreateWithFlags(&stream_[i], cudaStreamNonBlocking); + + if (chunk_bits < 10) { + reduce_buffer_size_ = 1; + } else { + reduce_buffer_size_ = (1ull << (chunk_bits - 10)); + } +#else + reduce_buffer_size_ = 1; +#endif + + reduce_buffer_size_ *= 2; reduce_buffer_.resize(reduce_buffer_size_ * nc); - if (multi_shots) - probability_buffer_.resize(nc * QV_PROBABILITY_BUFFER_SIZE); + probability_buffer_.resize(nc * QV_PROBABILITY_BUFFER_SIZE); creg_host_update_ = false; this->num_creg_bits_ = num_qubits; - uint_t size = num_matrices_ + this->num_buffers_; num_blocked_gates_.resize(size); num_blocked_matrix_.resize(size); num_blocked_qubits_.resize(size); @@ -401,8 +454,10 @@ void DeviceChunkContainer::allocate_creg(uint_t num_mem, this->num_cmemory_ = num_mem; uint_t n64 = (this->num_creg_bits_ + 63) >> 6; - cregs_.resize(num_matrices_ * n64); - cregs_host_.resize(num_matrices_ * n64); + if (cregs_.size() != num_matrices_ * n64) { + cregs_.resize(num_matrices_ * n64); + cregs_host_.resize(num_matrices_ * n64); + } } template @@ -431,14 +486,9 @@ void DeviceChunkContainer::Deallocate(void) { blocked_qubits_holder_.clear(); #ifdef AER_THRUST_CUDA - if (stream_) { - cudaStreamDestroy(stream_); - stream_ = nullptr; - } - if (stream_cache_) { - cudaStreamDestroy(stream_cache_); - stream_cache_ = nullptr; - } + for (int i = 0; i < stream_.size(); i++) + cudaStreamDestroy(stream_[i]); + stream_.clear(); #endif ChunkContainer::deallocate_chunks(); } @@ -489,7 +539,7 @@ void 
DeviceChunkContainer::ResizeMatrixBuffers(int bits) { template void DeviceChunkContainer::StoreMatrix( - const std::vector> &mat, uint_t iChunk) { + const std::vector> &mat, uint_t iChunk) const { set_device(); #ifdef AER_THRUST_CUDA @@ -520,7 +570,8 @@ void DeviceChunkContainer::StoreMatrix( template void DeviceChunkContainer::StoreMatrix(const std::complex *mat, - uint_t iChunk, uint_t size) { + uint_t iChunk, + uint_t size) const { set_device(); #ifdef AER_THRUST_CUDA @@ -552,7 +603,7 @@ void DeviceChunkContainer::StoreMatrix(const std::complex *mat, template void DeviceChunkContainer::StoreUintParams( - const std::vector &prm, uint_t iChunk) { + const std::vector &prm, uint_t iChunk) const { set_device(); #ifdef AER_THRUST_CUDA @@ -589,10 +640,10 @@ void DeviceChunkContainer::CopyIn(Chunk &src, uint_t iChunk) { if (peer_access(src.device())) { cudaMemcpyAsync(chunk_pointer(iChunk), src.pointer(), size * sizeof(thrust::complex), - cudaMemcpyDeviceToDevice, stream_); + cudaMemcpyDeviceToDevice, stream(iChunk)); } else { cudaMemcpyPeerAsync(chunk_pointer(iChunk), device_id_, src.pointer(), - src.device(), size, stream_); + src.device(), size, stream(iChunk)); } } else { cudaMemcpyAsync(chunk_pointer(iChunk), src.pointer(), @@ -621,10 +672,10 @@ void DeviceChunkContainer::CopyOut(Chunk &dest, uint_t iChunk) { if (peer_access(dest.device())) { cudaMemcpyAsync(dest.pointer(), chunk_pointer(iChunk), size * sizeof(thrust::complex), - cudaMemcpyDeviceToDevice, stream_); + cudaMemcpyDeviceToDevice, stream(iChunk)); } else { cudaMemcpyPeerAsync(dest.pointer(), dest.device(), chunk_pointer(iChunk), - device_id_, size, stream_); + device_id_, size, stream(iChunk)); } } else { cudaMemcpyAsync(dest.pointer(), chunk_pointer(iChunk), @@ -650,8 +701,12 @@ template void DeviceChunkContainer::CopyIn(thrust::complex *src, uint_t iChunk, uint_t size) { uint_t this_size = 1ull << this->chunk_bits_; - if (this_size < size) - throw std::runtime_error("CopyIn chunk size is less than provided size"); + if (this_size < size) { + std::stringstream str; + str << "DeviceChunkContainer::CopyIn chunk size " << this_size + << " is less than " << size; + throw std::runtime_error(str.str()); + } synchronize(iChunk); thrust::copy_n(src, size, data_.begin() + (iChunk << this->chunk_bits_)); @@ -661,9 +716,12 @@ template void DeviceChunkContainer::CopyOut(thrust::complex *dest, uint_t iChunk, uint_t size) { uint_t this_size = 1ull << this->chunk_bits_; - if (this_size < size) - throw std::runtime_error("CopyOut chunk size is less than provided size"); - + if (this_size < size) { + std::stringstream str; + str << "DeviceChunkContainer::CopyOut chunk size " << this_size + << " is less than " << size; + throw std::runtime_error(str.str()); + } synchronize(iChunk); thrust::copy_n(data_.begin() + (iChunk << this->chunk_bits_), size, dest); } @@ -689,26 +747,26 @@ void DeviceChunkContainer::Swap(Chunk &src, uint_t iChunk, thrust::complex *pSrc = src.pointer(); cudaMemcpyPeerAsync(pBuffer + dest_offset, device_id_, pSrc + src_offset, src.device(), size * sizeof(thrust::complex), - stream_); + stream(iChunk)); this->Execute(BufferSwap_func(chunk_pointer(iChunk) + dest_offset, pBuffer + dest_offset, size, true), iChunk, 0, 1); - cudaMemcpyPeerAsync(pSrc + src_offset, src.device(), - pBuffer + dest_offset, device_id_, - size * sizeof(thrust::complex), stream_); + cudaMemcpyPeerAsync( + pSrc + src_offset, src.device(), pBuffer + dest_offset, device_id_, + size * sizeof(thrust::complex), stream(iChunk)); } } else { thrust::complex 
*pBuffer = buffer_pointer();
thrust::complex *pSrc = src.pointer();
cudaMemcpyAsync(pBuffer + dest_offset, pSrc + src_offset,
size * sizeof(thrust::complex),
- cudaMemcpyHostToDevice, stream_cache_);
+ cudaMemcpyHostToDevice, stream(this->num_chunks_));
this->Execute(BufferSwap_func(chunk_pointer(iChunk) + dest_offset,
pBuffer + dest_offset, size, true),
iChunk, 0, 1);
cudaMemcpyAsync(pSrc + src_offset, pBuffer + dest_offset,
size * sizeof(thrust::complex),
- cudaMemcpyDeviceToHost, stream_cache_);
+ cudaMemcpyDeviceToHost, stream(this->num_chunks_));
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
@@ -728,7 +786,7 @@ template
void DeviceChunkContainer::Zero(uint_t iChunk, uint_t count) {
set_device();
#ifdef AER_THRUST_CUDA
- thrust::fill_n(thrust::cuda::par.on(stream_),
+ thrust::fill_n(thrust::cuda::par.on(stream(iChunk)),
data_.begin() + (iChunk << this->chunk_bits_), count, 0.0);
#else
if (this->omp_threads_ > 1)
@@ -755,24 +813,31 @@ reg_t DeviceChunkContainer::sample_measure(
#ifdef AER_THRUST_CUDA
if (dot)
- thrust::transform_inclusive_scan(
- thrust::cuda::par.on(stream_), iter.begin(), iter.end(), iter.begin(),
- complex_dot_scan(), thrust::plus>());
+ thrust::transform_inclusive_scan(thrust::cuda::par.on(stream(iChunk)),
+ iter.begin(), iter.end(), iter.begin(),
+ complex_dot_scan(),
+ thrust::plus>());
else
- thrust::inclusive_scan(thrust::cuda::par.on(stream_), iter.begin(),
+ thrust::inclusive_scan(thrust::cuda::par.on(stream(iChunk)), iter.begin(),
iter.end(), iter.begin(),
thrust::plus>());
+ uint_t i, nshots, size;
uint_t iBuf = 0;
- if (multi_shots_)
+ if (multi_shots_) {
iBuf = iChunk;
+ size = matrix_buffer_size_ * 2;
+ if (size > params_buffer_size_)
+ size = params_buffer_size_;
+ } else {
+ size = matrix_.size() * 2;
+ if (size > params_.size())
+ size = params_.size();
+ }
double *pRnd = (double *)matrix_pointer(iBuf);
uint_t *pSmp = param_pointer(iBuf);
thrust::device_ptr rnd_dev_ptr = thrust::device_pointer_cast(pRnd);
- uint_t i, nshots, size = matrix_.size() * 2;
- if (size > params_.size())
- size = params_.size();
for (i = 0; i < SHOTS; i += size) {
nshots = size;
@@ -780,17 +845,17 @@ reg_t DeviceChunkContainer::sample_measure(
nshots = SHOTS - i;
cudaMemcpyAsync(pRnd, &rnds[i], nshots * sizeof(double),
- cudaMemcpyHostToDevice, stream_);
+ cudaMemcpyHostToDevice, stream(iChunk));
- thrust::lower_bound(thrust::cuda::par.on(stream_), iter.begin(), iter.end(),
- rnd_dev_ptr, rnd_dev_ptr + nshots,
+ thrust::lower_bound(thrust::cuda::par.on(stream(iChunk)), iter.begin(),
+ iter.end(), rnd_dev_ptr, rnd_dev_ptr + nshots,
params_.begin() + (iBuf * params_buffer_size_),
complex_less());
cudaMemcpyAsync(&samples[i], pSmp, nshots * sizeof(uint_t),
- cudaMemcpyDeviceToHost, stream_);
+ cudaMemcpyDeviceToHost, stream(iChunk));
}
- cudaStreamSynchronize(stream_);
+ cudaStreamSynchronize(stream(iChunk));
#else
if (this->omp_threads_ > 1) {
if (dot)
@@ -854,7 +919,7 @@ void DeviceChunkContainer::set_blocked_qubits(uint_t iChunk,
set_device();
cudaMemcpyAsync(param_pointer(iChunk), (uint_t *)&qubits_sorted[0],
qubits.size() * sizeof(uint_t), cudaMemcpyHostToDevice,
- stream_);
+ stream(iChunk));
#endif
num_blocked_gates_[iBlock] = 0;
@@ -944,7 +1009,7 @@ void DeviceChunkContainer::queue_blocked_gate(
num_blocked_qubits_[iBlock]) +
num_blocked_gates_[iBlock],
&params, sizeof(BlockedGateParams), cudaMemcpyHostToDevice,
- stream_);
+ stream(iChunk));
if (pMat != NULL) {
if (gate == 'd') { // diagonal matrix
@@ -953,14 +1018,14 @@ void
DeviceChunkContainer::queue_blocked_gate( cudaMemcpyAsync(matrix_pointer(iChunk) + num_blocked_matrix_[iBlock], (thrust::complex *)&mat[0], 2 * sizeof(thrust::complex), - cudaMemcpyHostToDevice, stream_); + cudaMemcpyHostToDevice, stream(iChunk)); num_blocked_matrix_[iBlock] += 2; } else if (gate == 'p') { // phase mat[0] = pMat[0]; cudaMemcpyAsync(matrix_pointer(iChunk) + num_blocked_matrix_[iBlock], (thrust::complex *)&mat[0], 1 * sizeof(thrust::complex), - cudaMemcpyHostToDevice, stream_); + cudaMemcpyHostToDevice, stream(iChunk)); num_blocked_matrix_[iBlock] += 1; } else { // otherwise, 2x2 matrix mat[0] = pMat[0]; @@ -970,7 +1035,7 @@ void DeviceChunkContainer::queue_blocked_gate( cudaMemcpyAsync(matrix_pointer(iChunk) + num_blocked_matrix_[iBlock], (thrust::complex *)&mat[0], 4 * sizeof(thrust::complex), - cudaMemcpyHostToDevice, stream_); + cudaMemcpyHostToDevice, stream(iChunk)); num_blocked_matrix_[iBlock] += 4; } } @@ -1265,13 +1330,14 @@ void DeviceChunkContainer::apply_blocked_gates(uint_t iChunk) { dev_apply_register_blocked_gates <<), - stream_>>>(chunk_pointer(iChunk), num_blocked_gates_[iBlock], - num_blocked_qubits_[iBlock], num_blocked_matrix_[iBlock], - pQubits, pParams, pMatrix); + stream(iChunk)>>>(chunk_pointer(iChunk), num_blocked_gates_[iBlock], + num_blocked_qubits_[iBlock], + num_blocked_matrix_[iBlock], pQubits, pParams, + pMatrix); } else { // using shared memory blocking (<=10 qubits) dev_apply_shared_memory_blocked_gates - <<), stream_>>>( + <<), stream(iChunk)>>>( chunk_pointer(iChunk), num_blocked_gates_[iBlock], num_blocked_qubits_[iBlock], pQubits, pParams, pMatrix); } @@ -1288,7 +1354,8 @@ void DeviceChunkContainer::copy_to_probability_buffer( #ifdef AER_THRUST_CUDA set_device(); cudaMemcpyAsync(probability_buffer(0) + pos * this->num_chunks_, &buf[0], - buf.size() * sizeof(double), cudaMemcpyHostToDevice, stream_); + buf.size() * sizeof(double), cudaMemcpyHostToDevice, + stream_[0]); #else thrust::copy_n(buf.begin(), buf.size(), probability_buffer_.begin()); #endif diff --git a/src/simulators/statevector/chunk/host_chunk_container.hpp b/src/simulators/statevector/chunk/host_chunk_container.hpp index 53d3de7a1f..9e95316fd2 100644 --- a/src/simulators/statevector/chunk/host_chunk_container.hpp +++ b/src/simulators/statevector/chunk/host_chunk_container.hpp @@ -29,8 +29,8 @@ class HostChunkContainer : public ChunkContainer { protected: AERHostVector> data_; // host vector for chunks + buffers - std::vector *> matrix_; // pointer to matrix - std::vector params_; // pointer to additional parameters + mutable std::vector *> matrix_; // pointer to matrix + mutable std::vector params_; // pointer to additional parameters public: HostChunkContainer() {} ~HostChunkContainer(); @@ -47,15 +47,16 @@ class HostChunkContainer : public ChunkContainer { void Deallocate(void) override; void StoreMatrix(const std::vector> &mat, - uint_t iChunk) override { + uint_t iChunk) const override { matrix_[iChunk] = (thrust::complex *)&mat[0]; } void StoreMatrix(const std::complex *mat, uint_t iChunk, - uint_t size) override { + uint_t size) const override { matrix_[iChunk] = (thrust::complex *)mat; } - void StoreUintParams(const std::vector &prm, uint_t iChunk) override { + void StoreUintParams(const std::vector &prm, + uint_t iChunk) const override { params_[iChunk] = (uint_t *)&prm[0]; } void ResizeMatrixBuffers(int bits) {} diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp index c7f9f11610..f8bec5f665 100644 
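The `sample_measure` path above is the GPU sampling technique: `transform_inclusive_scan` turns the squared amplitudes into a cumulative distribution in place, and `thrust::lower_bound` then binary-searches one uniform random number per shot into it, batched through the matrix/params buffers. A CPU analogue using only the C++17 standard library (illustrative sketch, not part of the patch; the device version compares the scanned complex values via `complex_less`):

```cpp
#include <algorithm>
#include <complex>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

int main() {
  // 2-qubit |++> state: all four outcomes equally likely
  std::vector<std::complex<double>> state(4, {0.5, 0.0});

  // inclusive scan of |amplitude|^2 builds the CDF in place
  std::vector<double> cdf(state.size());
  std::transform(state.begin(), state.end(), cdf.begin(),
                 [](const std::complex<double> &a) { return std::norm(a); });
  std::inclusive_scan(cdf.begin(), cdf.end(), cdf.begin());

  // one uniform random number per shot, mapped to an outcome by binary search
  std::mt19937_64 rng(42);
  std::uniform_real_distribution<double> dist(0.0, cdf.back());
  for (int shot = 0; shot < 5; ++shot) {
    auto it = std::lower_bound(cdf.begin(), cdf.end(), dist(rng));
    std::cout << "shot " << shot << " -> outcome " << (it - cdf.begin())
              << "\n";
  }
}
```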
--- a/src/simulators/statevector/chunk/thrust_kernels.hpp +++ b/src/simulators/statevector/chunk/thrust_kernels.hpp @@ -407,12 +407,15 @@ template class initialize_component_func : public GateFuncBase { protected: int nqubits; - uint_t matSize; + uint_t offset; + uint_t mat_pos; + uint_t mat_num; public: - initialize_component_func(const cvector_t &mat, const reg_t &qb) { - nqubits = qb.size(); - matSize = 1ull << nqubits; + initialize_component_func(const int nq, const uint_t pos, const uint_t num) { + nqubits = nq; + mat_pos = pos; + mat_num = num; } int qubits_count(void) { return nqubits; } @@ -445,13 +448,17 @@ class initialize_component_func : public GateFuncBase { idx += ii; q0 = vec[idx]; - for (k = 0; k < matSize; k++) { + for (k = mat_pos; k < mat_pos + mat_num; k++) { ii = idx; for (j = 0; j < nqubits; j++) { if (((k >> j) & 1) != 0) ii += (1ull << qubits[j]); } - q = q0 * state[k]; + if (ii == idx) { + if (mat_pos > 0) + continue; + } + q = q0 * state[k - mat_pos]; vec[ii] = q; } } @@ -459,44 +466,6 @@ class initialize_component_func : public GateFuncBase { const char *name(void) { return "initialize_component"; } }; -template -class initialize_large_component_func : public GateFuncBase { -protected: - int num_qubits_; - uint_t mask_; - uint_t cmask_; - thrust::complex init_; - -public: - initialize_large_component_func(thrust::complex m, - const reg_t &qubits, int i) { - num_qubits_ = qubits.size(); - init_ = m; - - mask_ = 0; - cmask_ = 0; - for (int k = 0; k < num_qubits_; k++) { - mask_ |= (1ull << qubits[k]); - - if (((i >> k) & 1) != 0) { - cmask_ |= (1ull << qubits[k]); - } - } - } - bool is_diagonal(void) { return true; } - - __host__ __device__ void operator()(const uint_t &i) const { - thrust::complex *vec; - thrust::complex q; - vec = this->data_; - if ((i & mask_) == cmask_) { - q = vec[i]; - vec[i] = init_ * q; - } - } - const char *name(void) { return "initialize_large_component"; } -}; - //------------------------------------------------------------------------------ // Zero clear //------------------------------------------------------------------------------ @@ -1371,7 +1340,7 @@ class DiagonalMult2x2Controlled : public GateFuncBase { m0 = mat[0]; m1 = mat[1]; - mask = (1ull << qubits[nqubits - 1]) - 1; + mask = (1ull << qubits[nqubits - 1]); cmask = 0; for (i = 0; i < nqubits - 1; i++) { cmask |= (1ull << qubits[i]); diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp old mode 100644 new mode 100755 index 4ae87ed670..3cc84d8a79 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -99,7 +99,7 @@ class QubitVector { static std::string name() { return "statevector"; } // Set the size of the vector in terms of qubit number - void set_num_qubits(size_t num_qubits); + virtual void set_num_qubits(size_t num_qubits); // Returns the number of qubits for the current vector virtual uint_t num_qubits() const { return num_qubits_; } @@ -147,6 +147,7 @@ class QubitVector { bool chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index, uint_t num_local_chunks); bool chunk_setup(QubitVector &base, const uint_t chunk_index); + uint_t chunk_index(void) { return chunk_index_; } // cache control for chunks on host bool fetch_chunk(void) const { return true; } @@ -159,6 +160,7 @@ class QubitVector { // prepare buffer for MPI send/recv std::complex *send_buffer(uint_t &size_in_byte); std::complex *recv_buffer(uint_t &size_in_byte); + void release_send_buffer(void) const; void 
release_recv_buffer(void) const; @@ -186,6 +188,9 @@ class QubitVector { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const QubitVector &obj) { copy_qv(obj); } + // Initializes the vector to a custom initial state. // If the length of the data vector does not match the number of qubits // an exception is raised. @@ -429,6 +434,8 @@ class QubitVector { // cuStateVec void cuStateVec_enable(bool flg) {} + void set_target_gpus(reg_t &t) {} + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -441,6 +448,8 @@ class QubitVector { virtual bool enable_batch(bool flg) const { return false; } + bool support_global_indexing(void) { return false; } + protected: //----------------------------------------------------------------------- // Protected data members @@ -623,6 +632,9 @@ class QubitVector { // Allocates memory for the checkoiunt void allocate_checkpoint(size_t data_size); + + // copy state from other QubitVector + void copy_qv(const QubitVector &obj); }; /******************************************************************************* @@ -741,6 +753,22 @@ QubitVector::~QubitVector() { free_checkpoint(); } +template +void QubitVector::copy_qv(const QubitVector &obj) { + data_ = nullptr; + checkpoint_ = nullptr; + set_num_qubits(obj.num_qubits()); + set_transformer_method(); + + initialize_from_data(obj.data_, obj.data_size_); + + chunk_index_ = obj.chunk_index_; + omp_threads_ = obj.omp_threads_; + omp_threshold_ = obj.omp_threshold_; + sample_measure_index_size_ = obj.sample_measure_index_size_; + json_chop_threshold_ = obj.json_chop_threshold_; +} + template QubitVector &QubitVector::operator=(QubitVector &&obj) { num_qubits_ = obj.num_qubits_; @@ -753,6 +781,7 @@ QubitVector &QubitVector::operator=(QubitVector &&obj) { omp_threshold_ = obj.omp_threshold_; sample_measure_index_size_ = obj.sample_measure_index_size_; json_chop_threshold_ = obj.json_chop_threshold_; + obj.data_ = nullptr; obj.checkpoint_ = nullptr; return *this; @@ -1298,7 +1327,6 @@ void QubitVector::apply_multiplexer(const reg_t &control_qubits, template void QubitVector::apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag) { - transformer_->apply_diagonal_matrix(data_, data_size_, omp_threads_managed(), qubits, diag); } diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 06aef07eec..2be6721de7 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -144,7 +144,9 @@ class QubitVectorThrust { // chunk setup bool chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index, uint_t num_local_chunks); - bool chunk_setup(QubitVectorThrust &base, const uint_t chunk_index); + bool chunk_setup(const QubitVectorThrust &base, + const uint_t chunk_index); + uint_t chunk_index(void) { return chunk_index_; } // cache control for chunks on host bool fetch_chunk(void) const; @@ -185,6 +187,9 @@ class QubitVectorThrust { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const QubitVectorThrust &obj) { copy_qv(obj); } + // Initializes the vector to a custom initial state. 
// If the length of the data vector does not match the number of qubits // an exception is raised. @@ -233,7 +238,8 @@ class QubitVectorThrust { // Apply a N-qubit diagonal matrix to the state vector. // The matrix is input as vector of the matrix diagonal. - void apply_diagonal_matrix(const reg_t &qubits, const cvector_t &mat); + virtual void apply_diagonal_matrix(const reg_t &qubits, + const cvector_t &mat); // Swap pairs of indicies in the underlying vector void @@ -321,7 +327,7 @@ class QubitVectorThrust { //----------------------------------------------------------------------- virtual bool batched_optimization_supported(void) { #ifdef AER_THRUST_CUDA - if (multi_shots_ && enable_batch_) + if (enable_batch_) return true; else return false; @@ -379,28 +385,12 @@ class QubitVectorThrust { // expectation value of A^\dagger A, and could probably be removed because // of this - // Return the norm for of the vector obtained after apply the 1-qubit - // matrix mat to the vector. - // The matrix is input as vector of the column-major vectorized 1-qubit - // matrix. - double norm(const uint_t qubit, const cvector_t &mat) const; - // Return the norm for of the vector obtained after apply the N-qubit // matrix mat to the vector. // The matrix is input as vector of the column-major vectorized N-qubit // matrix. double norm(const reg_t &qubits, const cvector_t &mat) const; - // Return the norm for of the vector obtained after apply the 1-qubit - // diagonal matrix mat to the vector. - // The matrix is input as vector of the matrix diagonal. - double norm_diagonal(const uint_t qubit, const cvector_t &mat) const; - - // Return the norm for of the vector obtained after apply the N-qubit - // diagonal matrix mat to the vector. - // The matrix is input as vector of the matrix diagonal. 
- double norm_diagonal(const reg_t &qubits, const cvector_t &mat) const; - //----------------------------------------------------------------------- // Expectation Value //----------------------------------------------------------------------- @@ -452,6 +442,9 @@ class QubitVectorThrust { // cuStateVec void cuStateVec_enable(bool flg) { cuStateVec_enable_ = flg; } + bool support_global_indexing(void) { return (!cuStateVec_enable_); } + + void set_target_gpus(reg_t &t) { target_gpus_ = t; } //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -479,9 +472,9 @@ class QubitVectorThrust { uint_t chunk_index_; bool multi_chunk_distribution_; - bool multi_shots_; mutable bool enable_batch_; bool cuStateVec_enable_ = false; + reg_t target_gpus_; bool register_blocking_; @@ -531,7 +524,10 @@ class QubitVectorThrust { bool async = false) const; // get number of chunk to be applied - uint_t get_chunk_count(void); + uint_t get_chunk_count(void) const; + + // copy from other qv + void copy_qv(const QubitVectorThrust &obj); #ifdef AER_DEBUG // for debugging @@ -649,7 +645,6 @@ QubitVectorThrust::QubitVectorThrust(size_t num_qubits) : num_qubits_(0) { chunk_index_ = 0; multi_chunk_distribution_ = false; - multi_shots_ = false; enable_batch_ = false; max_matrix_bits_ = 0; @@ -679,6 +674,25 @@ QubitVectorThrust::~QubitVectorThrust() { checkpoint_.clear(); } +template +void QubitVectorThrust::copy_qv(const QubitVectorThrust &obj) { + omp_threads_ = obj.omp_threads_; + omp_threshold_ = obj.omp_threshold_; + sample_measure_index_size_ = obj.sample_measure_index_size_; + json_chop_threshold_ = obj.json_chop_threshold_; + chunk_index_ = obj.chunk_index_; + num_threads_per_group_ = obj.num_threads_per_group_; + max_matrix_bits_ = obj.max_matrix_bits_; + + if (!chunk_setup(obj, obj.chunk_index_)) { + throw std::runtime_error( + "QubitVectorThrust: can not allocate chunk for copy"); + } + set_num_qubits(obj.num_qubits()); + + chunk_.set_device(); + chunk_.CopyIn(obj.chunk_); +} //------------------------------------------------------------------------------ // Element access operators //------------------------------------------------------------------------------ @@ -787,7 +801,7 @@ void QubitVectorThrust::initialize_component( if (qubits.size() == 1) { apply_function(Chunk::initialize_component_1qubit_func( qubits[0], state0[0], state0[1])); - } else if (qubits.size() <= chunk_.container()->matrix_bits()) { + } else { auto qubits_sorted = qubits; std::sort(qubits_sorted.begin(), qubits_sorted.end()); @@ -796,19 +810,19 @@ void QubitVectorThrust::initialize_component( for (i = 0; i < qubits.size(); i++) qubits_param.push_back(qubits_sorted[i]); - // chunk_.StoreMatrix(state0); - // chunk_.StoreUintParams(qubits_param); - - apply_function( - Chunk::initialize_component_func(state0, qubits_sorted), state0, - qubits_param); - } else { - // if initial state is larger that matrix buffer, set one by one. 
- uint_t DIM = 1ull << qubits.size(); - uint_t i; - for (i = 0; i < DIM; i++) { - apply_function( - Chunk::initialize_large_component_func(state0[i], qubits, i)); + int nbit = chunk_.container()->matrix_bits(); + if (nbit > qubits.size()) + nbit = qubits.size(); + + uint_t dim = 1ull << qubits.size(); + uint_t sub_dim = 1ull << nbit; + for (uint_t i = 0; i < dim; i += sub_dim) { + cvector_t state(sub_dim); + for (uint_t j = 0; j < sub_dim; j++) + state[j] = state0[dim - sub_dim - i + j]; + apply_function(Chunk::initialize_component_func( + qubits.size(), dim - sub_dim - i, sub_dim), + state, qubits_param); } } } @@ -858,7 +872,8 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits, int num_qubits, chunk_manager_->set_num_creg_bits(num_creg_bits_ + num_cmem_bits_); chunk_manager_->Allocate(chunk_bits, num_qubits, num_local_chunks, chunk_index_, max_matrix_bits_, - is_density_matrix(), cuStateVec_enable_); + is_density_matrix(), target_gpus_, + cuStateVec_enable_); } multi_chunk_distribution_ = false; @@ -866,8 +881,10 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits, int num_qubits, multi_chunk_distribution_ = true; } - chunk_.unmap(); - buffer_chunk_.unmap(); + if (chunk_.is_mapped()) + chunk_manager_->UnmapChunk(chunk_); + if (buffer_chunk_.is_mapped()) + chunk_manager_->UnmapBufferChunk(buffer_chunk_); send_chunk_.unmap(); recv_chunk_.unmap(); @@ -879,30 +896,28 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits, int num_qubits, } template -bool QubitVectorThrust::chunk_setup(QubitVectorThrust &base, - const uint_t chunk_index) { - chunk_manager_ = base.chunk_manager_; - +bool QubitVectorThrust::chunk_setup( + const QubitVectorThrust &base, const uint_t chunk_index) { multi_chunk_distribution_ = base.multi_chunk_distribution_; - if (!multi_chunk_distribution_) { - if (chunk_manager_->chunk_bits() == chunk_manager_->num_qubits()) { - multi_shots_ = true; - base.multi_shots_ = true; - } - } cuStateVec_enable_ = base.cuStateVec_enable_; + target_gpus_ = base.target_gpus_; // set global chunk ID / shot ID chunk_index_ = chunk_index; + chunk_.set_chunk_index(chunk_index_); - chunk_.unmap(); - buffer_chunk_.unmap(); + if (buffer_chunk_.is_mapped()) + chunk_manager_->UnmapBufferChunk(buffer_chunk_); send_chunk_.unmap(); recv_chunk_.unmap(); + if (chunk_.is_mapped()) { + return true; + } + // mapping/setting chunk + chunk_manager_ = base.chunk_manager_; bool mapped = chunk_manager_->MapChunk(chunk_, 0); - chunk_.set_chunk_index(chunk_index_); return mapped; } @@ -1160,7 +1175,7 @@ bool QubitVectorThrust::enable_batch(bool flg) const { } template -uint_t QubitVectorThrust::get_chunk_count(void) { +uint_t QubitVectorThrust::get_chunk_count(void) const { if (multi_chunk_distribution_) { if (chunk_.device() < 0 || cuStateVec_enable_) return 1; @@ -1169,6 +1184,8 @@ uint_t QubitVectorThrust::get_chunk_count(void) { } else { // multi-shots if (enable_batch_ && chunk_.pos() != 0) return 0; // first chunk execute all in batch + else if (!enable_batch_) + return 1; } return chunk_.container()->num_chunks(); } @@ -1921,65 +1938,19 @@ double QubitVectorThrust::norm() const { template double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t &mat) const { - const size_t N = qubits.size(); - - if (N == 1) { - return norm(qubits[0], mat); - } else { - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - for (int_t i = 0; i < N; i++) { - qubits_sorted.push_back(qubits[i]); + uint_t count = 1; +#ifdef AER_THRUST_CUDA + if (!cuStateVec_enable_ && + 
((multi_chunk_distribution_ && chunk_.device() >= 0 &&
+ num_qubits_ == num_qubits()) ||
+ (enable_batch_))) {
+ if (chunk_.pos() != 0) {
+ return 0.0;
}
-
- chunk_.StoreMatrix(mat);
- chunk_.StoreUintParams(qubits_sorted);
-
- double ret;
- apply_function_sum(&ret, Chunk::NormMatrixMultNxN(N));
- return ret;
- }
-}
-
-template
-double
-QubitVectorThrust::norm_diagonal(const reg_t &qubits,
- const cvector_t &mat) const {
-
- const uint_t N = qubits.size();
-
- if (N == 1) {
- return norm_diagonal(qubits[0], mat);
- } else {
- chunk_.StoreMatrix(mat);
- chunk_.StoreUintParams(qubits);
-
- double ret;
- apply_function_sum(&ret, Chunk::NormDiagonalMultNxN(qubits));
- return ret;
+ count = chunk_.container()->num_chunks();
}
-}
-
-//------------------------------------------------------------------------------
-// Single-qubit specialization
-//------------------------------------------------------------------------------
-template
-double QubitVectorThrust::norm(const uint_t qubit,
- const cvector_t &mat) const {
- double ret;
- apply_function_sum(&ret, Chunk::NormMatrixMult2x2(mat, qubit));
-
- return ret;
-}
-
-template
-double
-QubitVectorThrust::norm_diagonal(const uint_t qubit,
- const cvector_t &mat) const {
- double ret;
- apply_function_sum(&ret, Chunk::NormDiagonalMult2x2(mat, qubit));
-
- return ret;
+#endif
+ return chunk_.expval_matrix(qubits, mat, count);
}
/*******************************************************************************
@@ -2003,8 +1974,6 @@ std::vector QubitVectorThrust::probabilities() const {
DebugMsg("calling probabilities");
#endif
-#pragma omp parallel for if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) \
- num_threads(omp_threads_)
for (int_t j = 0; j < END; j++) {
probs[j] = probability(j);
}
diff --git a/src/simulators/statevector/statevector_executor.hpp b/src/simulators/statevector/statevector_executor.hpp
new file mode 100644
index 0000000000..28312f4aae
--- /dev/null
+++ b/src/simulators/statevector/statevector_executor.hpp
@@ -0,0 +1,1807 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _statevector_executor_hpp_
+#define _statevector_executor_hpp_
+
+#include "simulators/batch_shots_executor.hpp"
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace Statevector {
+
+//-------------------------------------------------------------------------
+// Executor for statevector
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
+                 public CircuitExecutor::BatchShotsExecutor<state_t> {
+  using Base = CircuitExecutor::MultiStateExecutor<state_t>;
+  using BasePar = CircuitExecutor::ParallelStateExecutor<state_t>;
+  using BaseBatch = CircuitExecutor::BatchShotsExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+protected:
+  void set_config(const Config &config) override;
+
+  void apply_global_phase() override;
+
+  bool shot_branching_supported(void) override { return true; }
+
+  // apply parallel operations
+  bool apply_parallel_op(const Operations::Op &op, ExperimentResult &result,
+                         RngEngine &rng, bool final_op) override;
+
+  // apply op to multiple shots; return false if the op cannot be executed
+  // in a batch
+  bool apply_batched_op(const int_t istate, const Operations::Op &op,
+                        ExperimentResult &result, std::vector<RngEngine> &rng,
+                        bool final_op = false) override;
+
+  bool apply_branching_op(CircuitExecutor::Branch &root,
+                          const Operations::Op &op, ExperimentResult &result,
+                          bool final_op) override;
+
+  // Initializes an n-qubit state to the all |0> state
+  void initialize_qreg(uint_t num_qubits) override;
+
+  auto move_to_vector(void);
+  auto copy_to_vector(void);
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  bool allocate_states(uint_t num_states, const Config &config) override {
+    return BasePar::allocate_states(num_states, config);
+  }
+
+  //-----------------------------------------------------------------------
+  // Apply instructions
+  //-----------------------------------------------------------------------
+  // Measure qubits and return a list of outcomes [q0, q1, ...]
+  // If a state subclass supports this function, then "measure"
+  // should be contained in the set returned by the 'allowed_ops'
+  // method.
+  void apply_measure(const reg_t &qubits, const reg_t &cmemory,
+                     const reg_t &cregister, RngEngine &rng);
+
+  // Reset the specified qubits to the |0> state by simulating
+  // a measurement, applying a conditional x-gate if the outcome is 1, and
+  // then discarding the outcome.
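The class above mixes the two execution strategies, and `run_circuit_shots` (defined further below) routes each circuit between them based on `BasePar::multiple_chunk_required`. A standalone sketch of that routing decision, with simplified, hypothetical types standing in for Aer's `Circuit` and executor bases:

```cpp
#include <iostream>

// Hypothetical stand-ins: in Aer the decision is made by
// BasePar::multiple_chunk_required(circ, noise) inside run_circuit_shots.
struct CircuitInfo {
  int num_qubits;   // qubits the circuit needs
  int chunk_qubits; // largest state a single chunk (one device) can hold
};

// True when the statevector cannot fit in a single chunk, so the
// chunk-parallel (ParallelStateExecutor) path must be taken.
bool multiple_chunk_required(const CircuitInfo &c) {
  return c.num_qubits > c.chunk_qubits;
}

void run_circuit_shots(const CircuitInfo &c) {
  if (multiple_chunk_required(c))
    std::cout << "multi-chunk path (ParallelStateExecutor)\n";
  else
    std::cout << "batched-shots path (BatchShotsExecutor, "
                 "shot-branching capable)\n";
}

int main() {
  run_circuit_shots({30, 25}); // state too large for one chunk
  run_circuit_shots({20, 25}); // fits on one device: batch the shots
}
```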
+  void apply_reset(const reg_t &qubits, RngEngine &rng);
+
+  // Initialize the specified qubits to a given state |psi>
+  // by applying a reset to these qubits and then
+  // computing the tensor product with the new state |psi>
+  // |psi> is given in params
+  void apply_initialize(const reg_t &qubits, const cvector_t &params,
+                        RngEngine &rng);
+
+  void initialize_from_vector(const cvector_t &params);
+
+  // Apply a Kraus error operation
+  void apply_kraus(const reg_t &qubits, const std::vector<cmatrix_t> &krausops,
+                   RngEngine &rng);
+
+  void apply_reset(CircuitExecutor::Branch &root, const reg_t &qubits);
+  void apply_initialize(CircuitExecutor::Branch &root, const reg_t &qubits,
+                        const cvector_t &params);
+  void apply_kraus(CircuitExecutor::Branch &root, const reg_t &qubits,
+                   const std::vector<cmatrix_t> &kmats);
+
+  //-----------------------------------------------------------------------
+  // Save data instructions
+  //-----------------------------------------------------------------------
+
+  // Save the current state of the statevector simulator
+  // If `last_op` is True this will use move semantics to move the simulator
+  // state to the results, otherwise it will use copy semantics to leave
+  // the current simulator state unchanged.
+  void apply_save_statevector(const Operations::Op &op,
+                              ExperimentResult &result, bool last_op);
+
+  // Save the current state of the statevector simulator as a ket-form map.
+  void apply_save_statevector_dict(const Operations::Op &op,
+                                   ExperimentResult &result);
+
+  // Save the current density matrix or reduced density matrix
+  void apply_save_density_matrix(const Operations::Op &op,
+                                 ExperimentResult &result);
+
+  // Helper function for saving measure probabilities
+  void apply_save_probs(const Operations::Op &op, ExperimentResult &result);
+
+  // Helper function for saving amplitudes and amplitudes squared
+  void apply_save_amplitudes(const Operations::Op &op,
+                             ExperimentResult &result);
+
+  void apply_save_statevector(CircuitExecutor::Branch &root,
+                              const Operations::Op &op,
+                              ExperimentResult &result, bool last_op);
+  void apply_save_statevector_dict(CircuitExecutor::Branch &root,
+                                   const Operations::Op &op,
+                                   ExperimentResult &result);
+  void apply_save_amplitudes(CircuitExecutor::Branch &root,
+                             const Operations::Op &op,
+                             ExperimentResult &result);
+
+  // Helper function for computing expectation value
+  double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
+  //-----------------------------------------------------------------------
+  // Measurement Helpers
+  //-----------------------------------------------------------------------
+
+  // Return vector of measure probabilities for specified qubits
+  // If a state subclass supports this function, then "measure"
+  // should be contained in the set returned by the 'allowed_ops'
+  // method.
+  rvector_t measure_probs(const reg_t &qubits) const;
+
+  // Sample the measurement outcome for qubits
+  // return a pair (m, p) of the outcome m, and its corresponding
+  // probability p.
+ // Outcome is given as an int: Eg for two-qubits {q0, q1} we have + // 0 -> |q1 = 0, q0 = 0> state + // 1 -> |q1 = 0, q0 = 1> state + // 2 -> |q1 = 1, q0 = 0> state + // 3 -> |q1 = 1, q0 = 1> state + std::pair sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng); + + void measure_reset_update(const std::vector &qubits, + const uint_t final_state, const uint_t meas_state, + const double meas_prob); + + rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits); + void measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs); + void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits, + const reg_t &cmemory, const reg_t &cregister); + + std::vector sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const override; + + // Return the reduced density matrix for the simulator + cmatrix_t density_matrix(const reg_t &qubits); + + // Sample n-measurement outcomes without applying the measure operation + // to the system state + std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) const override; +}; + +template +void Executor::set_config(const Config &config) { + BasePar::set_config(config); + BaseBatch::set_config(config); +} + +template +void Executor::apply_global_phase() { + if (Base::has_global_phase_) { + int_t i; + if (Base::shot_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) + Base::states_[iChunk].apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } else { + for (i = 0; i < Base::states_.size(); i++) + Base::states_[i].apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } +} + +template +void Executor::run_circuit_shots( + Circuit &circ, const Noise::NoiseModel &noise, const Config &config, + RngEngine &init_rng, ExperimentResult &result, bool sample_noise) { + state_t dummy_state; + if (BasePar::multiple_chunk_required(circ, noise)) { + return BasePar::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } else { + return BaseBatch::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } +} + +template +bool Executor::apply_parallel_op(const Operations::Op &op, + ExperimentResult &result, + RngEngine &rng, bool final_op) { + // temporary : this is for statevector + if (Base::states_[0].creg().check_conditional(op)) { + switch (op.type) { + case Operations::OpType::reset: + apply_reset(op.qubits, rng); + break; + case Operations::OpType::initialize: + apply_initialize(op.qubits, op.params, rng); + break; + case Operations::OpType::measure: + apply_measure(op.qubits, op.memory, op.registers, rng); + break; + case Operations::OpType::bfunc: + BasePar::apply_bfunc(op); + break; + case Operations::OpType::roerror: + BasePar::apply_roerror(op, rng); + break; + case Operations::OpType::kraus: + apply_kraus(op.qubits, op.mats, rng); + break; + case Operations::OpType::set_statevec: + initialize_from_vector(op.params); + break; + case Operations::OpType::save_expval: + case Operations::OpType::save_expval_var: + BasePar::apply_save_expval(op, result); + break; + case Operations::OpType::save_densmat: + apply_save_density_matrix(op, result); + break; + case Operations::OpType::save_state: + case 
Operations::OpType::save_statevec:
+      apply_save_statevector(op, result, final_op);
+      break;
+    case Operations::OpType::save_statevec_dict:
+      apply_save_statevector_dict(op, result);
+      break;
+    case Operations::OpType::save_probs:
+    case Operations::OpType::save_probs_ket:
+      apply_save_probs(op, result);
+      break;
+    case Operations::OpType::save_amps:
+    case Operations::OpType::save_amps_sq:
+      apply_save_amplitudes(op, result);
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_batched_op(const int_t istate,
+                                         const Operations::Op &op,
+                                         ExperimentResult &result,
+                                         std::vector<RngEngine> &rng,
+                                         bool final_op) {
+  if (op.conditional) {
+    Base::states_[istate].qreg().set_conditional(op.conditional_reg);
+  }
+
+  switch (op.type) {
+  case Operations::OpType::barrier:
+  case Operations::OpType::nop:
+  case Operations::OpType::qerror_loc:
+    break;
+  case Operations::OpType::reset:
+    Base::states_[istate].qreg().apply_batched_reset(op.qubits, rng);
+    break;
+  case Operations::OpType::initialize:
+    Base::states_[istate].qreg().apply_batched_reset(op.qubits, rng);
+    Base::states_[istate].qreg().initialize_component(op.qubits, op.params);
+    break;
+  case Operations::OpType::measure:
+    Base::states_[istate].qreg().apply_batched_measure(op.qubits, rng,
+                                                       op.memory, op.registers);
+    break;
+  case Operations::OpType::bfunc:
+    Base::states_[istate].qreg().apply_bfunc(op);
+    break;
+  case Operations::OpType::roerror:
+    Base::states_[istate].qreg().apply_roerror(op, rng);
+    break;
+  case Operations::OpType::gate:
+    Base::states_[istate].apply_gate(op);
+    break;
+  case Operations::OpType::matrix:
+    Base::states_[istate].apply_matrix(op);
+    break;
+  case Operations::OpType::diagonal_matrix:
+    Base::states_[istate].qreg().apply_diagonal_matrix(op.qubits, op.params);
+    break;
+  case Operations::OpType::multiplexer:
+    Base::states_[istate].apply_multiplexer(
+        op.regs[0], op.regs[1],
+        op.mats); // control qubits ([0]) & target qubits([1])
+    break;
+  case Operations::OpType::kraus:
+    Base::states_[istate].qreg().apply_batched_kraus(op.qubits, op.mats, rng);
+    break;
+  case Operations::OpType::sim_op:
+    if (op.name == "begin_register_blocking") {
+      Base::states_[istate].qreg().enter_register_blocking(op.qubits);
+    } else if (op.name == "end_register_blocking") {
+      Base::states_[istate].qreg().leave_register_blocking();
+    } else {
+      return false;
+    }
+    break;
+  case Operations::OpType::set_statevec:
+    Base::states_[istate].qreg().initialize_from_vector(op.params);
+    break;
+  default:
+    // other operations should be applied to individual chunks by apply_op
+    return false;
+  }
+  return true;
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
+                                           const Operations::Op &op,
+                                           ExperimentResult &result,
+                                           bool final_op) {
+  RngEngine dummy;
+  if (Base::states_[root.state_index()].creg().check_conditional(op)) {
+    switch (op.type) {
+    // ops with branching
+    case Operations::OpType::reset:
+      apply_reset(root, op.qubits);
+      break;
+    case Operations::OpType::initialize:
+      apply_initialize(root, op.qubits, op.params);
+      break;
+    case Operations::OpType::measure:
+      apply_measure(root, op.qubits, op.memory, op.registers);
+      break;
+    case Operations::OpType::kraus:
+      apply_kraus(root, op.qubits, op.mats);
+      break;
+    // save ops
+    case Operations::OpType::save_expval:
+    case Operations::OpType::save_expval_var:
+    case Operations::OpType::save_densmat:
+    case Operations::OpType::save_probs:
+    case Operations::OpType::save_probs_ket:
+      // call save functions in state
class + Base::states_[root.state_index()].apply_op(op, result, dummy, final_op); + break; + case Operations::OpType::save_state: + case Operations::OpType::save_statevec: + apply_save_statevector(root, op, result, final_op); + break; + case Operations::OpType::save_statevec_dict: + apply_save_statevector_dict(root, op, result); + break; + case Operations::OpType::save_amps: + case Operations::OpType::save_amps_sq: + apply_save_amplitudes(root, op, result); + break; + default: + return false; + } + } + return true; +} + +template +void Executor::initialize_qreg(uint_t num_qubits) { + int_t i; + + for (i = 0; i < Base::states_.size(); i++) { + Base::states_[i].qreg().set_num_qubits(BasePar::chunk_bits_); + } + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + if (Base::global_state_index_ + iChunk == 0 || + this->num_qubits_ == this->chunk_bits_) { + Base::states_[iChunk].qreg().initialize(); + } else { + Base::states_[iChunk].qreg().zero(); + } + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + if (Base::global_state_index_ + i == 0 || + this->num_qubits_ == this->chunk_bits_) { + Base::states_[i].qreg().initialize(); + } else { + Base::states_[i].qreg().zero(); + } + } + } + + BasePar::apply_global_phase(); +} + +template +auto Executor::move_to_vector(void) { + size_t size_required = + 2 * (sizeof(std::complex) << Base::num_qubits_) + + (sizeof(std::complex) << BasePar::chunk_bits_) * + Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store states")); + } + int_t iChunk; + auto state = Base::states_[0].qreg().move_to_vector(); + state.resize(Base::num_local_states_ << BasePar::chunk_bits_); + +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) private(iChunk) + for (iChunk = 1; iChunk < Base::states_.size(); iChunk++) { + auto tmp = Base::states_[iChunk].qreg().move_to_vector(); + uint_t j, offset = iChunk << BasePar::chunk_bits_; + for (j = 0; j < tmp.size(); j++) { + state[offset + j] = tmp[j]; + } + } + +#ifdef AER_MPI + BasePar::gather_state(state); +#endif + return state; +} + +template +auto Executor::copy_to_vector(void) { + size_t size_required = + 2 * (sizeof(std::complex) << Base::num_qubits_) + + (sizeof(std::complex) << BasePar::chunk_bits_) * + Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store states")); + } + int_t iChunk; + auto state = Base::states_[0].qreg().copy_to_vector(); + state.resize(Base::num_local_states_ << BasePar::chunk_bits_); + +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) private(iChunk) + for (iChunk = 1; iChunk < Base::states_.size(); iChunk++) { + auto tmp = Base::states_[iChunk].qreg().copy_to_vector(); + uint_t j, offset = iChunk << BasePar::chunk_bits_; + for (j = 0; j < tmp.size(); j++) { + state[offset + j] = tmp[j]; + } + } + +#ifdef AER_MPI + BasePar::gather_state(state); +#endif + return state; +} + +//========================================================================= +// Implementation: Save data +//========================================================================= + +template +void Executor::apply_save_probs(const Operations::Op &op, + ExperimentResult &result) { + 
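The `move_to_vector`/`copy_to_vector` pair above reassembles the full 2^n statevector from the distributed chunks: local chunk i is written at offset `i << chunk_bits`, then `gather_state` merges the per-process pieces over MPI. A host-only sketch of the offset arithmetic (illustrative; the MPI step is omitted):

```cpp
#include <complex>
#include <iostream>
#include <vector>

int main() {
  const unsigned chunk_bits = 2; // 4 amplitudes per chunk
  const unsigned num_chunks = 3;
  using cvec = std::vector<std::complex<double>>;

  // stand-ins for the per-state chunk vectors gathered above
  std::vector<cvec> chunks(num_chunks, cvec(1ull << chunk_bits));
  for (unsigned i = 0; i < num_chunks; ++i)
    for (size_t j = 0; j < chunks[i].size(); ++j)
      chunks[i][j] = {double(i), double(j)}; // dummy amplitudes

  // chunk i is copied to offset (i << chunk_bits) of the global vector
  cvec state(size_t(num_chunks) << chunk_bits);
  for (unsigned i = 0; i < num_chunks; ++i) {
    const size_t offset = size_t(i) << chunk_bits;
    for (size_t j = 0; j < chunks[i].size(); ++j)
      state[offset + j] = chunks[i][j];
  }
  std::cout << "global vector size = " << state.size() << "\n"; // 12
}
```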
// get probs as hexadecimal + auto probs = measure_probs(op.qubits); + if (op.type == Operations::OpType::save_probs_ket) { + // Convert to ket dict + result.save_data_average( + Base::states_[0].creg(), op.string_params[0], + Utils::vec2ket(probs, Base::json_chop_threshold_, 16), op.type, + op.save_type); + } else { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(probs), op.type, op.save_type); + } +} + +template +double Executor::expval_pauli(const reg_t &qubits, + const std::string &pauli) { + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + std::string pauli_in_chunk; + std::string pauli_out_chunk; + int_t i, n; + double expval(0.); + + // get inner/outer chunk pauli string + n = pauli.size(); + for (i = 0; i < n; i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + pauli_in_chunk.push_back(pauli[n - i - 1]); + } else { + qubits_out_chunk.push_back(qubits[i]); + pauli_out_chunk.push_back(pauli[n - i - 1]); + } + } + + if (qubits_out_chunk.size() > 0) { // there are bits out of chunk + std::complex phase = 1.0; + + std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); + std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); + + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = + AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); + + AER::QV::add_y_phase(num_y, phase); + + if (x_mask != 0) { // pairing state is out of chunk + bool on_same_process = true; +#ifdef AER_MPI + int proc_bits = 0; + uint_t procs = Base::distributed_procs_; + while (procs > 1) { + if ((procs & 1) != 0) { + proc_bits = -1; + break; + } + proc_bits++; + procs >>= 1; + } + if (x_mask & (~((1ull << (Base::num_qubits_ - proc_bits)) - 1)) != + 0) { // data exchange between processes is required + on_same_process = false; + } +#endif + + x_mask >>= BasePar::chunk_bits_; + z_mask >>= BasePar::chunk_bits_; + x_max -= BasePar::chunk_bits_; + + const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); + const uint_t mask_l = (1ull << x_max) - 1; + if (on_same_process) { + auto apply_expval_pauli_chunk = [this, x_mask, z_mask, x_max, mask_u, + mask_l, qubits_in_chunk, + pauli_in_chunk, phase](int_t iGroup) { + double expval = 0.0; + for (int_t iChunk = Base::top_state_of_group_[iGroup]; + iChunk < Base::top_state_of_group_[iGroup + 1]; iChunk++) { + uint_t pair_chunk = iChunk ^ x_mask; + if (iChunk < pair_chunk) { + uint_t z_count, z_count_pair; + z_count = AER::Utils::popcount(iChunk & z_mask); + z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); + + expval += Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli(qubits_in_chunk, pauli_in_chunk, + Base::states_[pair_chunk].qreg(), + z_count, z_count_pair, phase); + } + } + return expval; + }; + expval += Utils::apply_omp_parallel_for_reduction( + (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1), 0, + Base::num_global_states_ / 2, apply_expval_pauli_chunk); + } else { + for (int_t i = 0; i < Base::num_global_states_ / 2; i++) { + uint_t iChunk = ((i << 1) & mask_u) | (i & mask_l); + uint_t pair_chunk = iChunk ^ x_mask; + uint_t iProc = BasePar::get_process_by_chunk(pair_chunk); + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + uint_t z_count, z_count_pair; + z_count = AER::Utils::popcount(iChunk & z_mask); + z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); + + if (iProc == Base::distributed_rank_) { 
// pair is on the + // same process + expval += + Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli( + qubits_in_chunk, pauli_in_chunk, + Base::states_[pair_chunk - Base::global_state_index_] + .qreg(), + z_count, z_count_pair, phase); + } else { + BasePar::recv_chunk(iChunk - Base::global_state_index_, + pair_chunk); + // refer receive buffer to calculate expectation value + expval += + Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli( + qubits_in_chunk, pauli_in_chunk, + Base::states_[iChunk - Base::global_state_index_] + .qreg(), + z_count, z_count_pair, phase); + } + } else if (iProc == Base::distributed_rank_) { // pair is on + // this process + BasePar::send_chunk(iChunk - Base::global_state_index_, pair_chunk); + } + } + } + } else { // no exchange between chunks + z_mask >>= BasePar::chunk_bits_; + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for reduction(+ : expval) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + double e_tmp = 0.0; + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + double sign = 1.0; + if (z_mask && (AER::Utils::popcount( + (iChunk + Base::global_state_index_) & z_mask) & + 1)) + sign = -1.0; + e_tmp += sign * Base::states_[iChunk].qreg().expval_pauli( + qubits_in_chunk, pauli_in_chunk); + } + expval += e_tmp; + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + double sign = 1.0; + if (z_mask && + (AER::Utils::popcount((i + Base::global_state_index_) & z_mask) & + 1)) + sign = -1.0; + expval += sign * Base::states_[i].qreg().expval_pauli(qubits_in_chunk, + pauli_in_chunk); + } + } + } + } else { // all bits are inside chunk + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for reduction(+ : expval) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + double e_tmp = 0.0; + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) + e_tmp += Base::states_[iChunk].qreg().expval_pauli(qubits, pauli); + expval += e_tmp; + } + } else { + for (i = 0; i < Base::states_.size(); i++) + expval += Base::states_[i].qreg().expval_pauli(qubits, pauli); + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(expval); +#endif + return expval; +} + +template +void Executor::apply_save_statevector(const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + std::string key = + (op.string_params[0] == "_method_") ? "statevector" : op.string_params[0]; + + if (last_op) { + auto v = move_to_vector(); + result.save_data_pershot(Base::states_[0].creg(), key, std::move(v), + Operations::OpType::save_statevec, op.save_type); + } else { + result.save_data_pershot(Base::states_[0].creg(), key, copy_to_vector(), + Operations::OpType::save_statevec, op.save_type); + } +} + +template +void Executor::apply_save_statevector_dict(const Operations::Op &op, + ExperimentResult &result) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." 
+ " Only the full statevector can be saved."); + } + auto vec = copy_to_vector(); + std::map result_state_ket; + for (size_t k = 0; k < vec.size(); ++k) { + if (std::abs(vec[k]) >= Base::json_chop_threshold_) { + std::string key = Utils::int2hex(k); + result_state_ket.insert({key, vec[k]}); + } + } + result.save_data_pershot(Base::states_[0].creg(), op.string_params[0], + std::move(result_state_ket), op.type, op.save_type); +} + +template +void Executor::apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result) { + cmatrix_t reduced_state; + + // Check if tracing over all qubits + if (op.qubits.empty()) { + reduced_state = cmatrix_t(1, 1); + + double sum = 0.0; + if (BasePar::chunk_omp_parallel_) { +#pragma omp parallel for reduction(+ : sum) + for (int_t i = 0; i < Base::states_.size(); i++) + sum += Base::states_[i].qreg().norm(); + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + sum += Base::states_[i].qreg().norm(); + } +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + reduced_state[0] = sum; + } else { + reduced_state = density_matrix(op.qubits); + } + + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(reduced_state), op.type, op.save_type); +} + +template +void Executor::apply_save_amplitudes(const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes instructions (empty params)."); + } + const int_t size = op.int_params.size(); + if (op.type == Operations::OpType::save_amps) { + Vector amps(size, false); + for (int_t i = 0; i < size; ++i) { + uint_t idx = BasePar::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BasePar::chunk_bits_; + amps[i] = 0.0; + if (iChunk >= Base::global_state_index_ && + iChunk < Base::global_state_index_ + Base::states_.size()) { + amps[i] = + Base::states_[iChunk - Base::global_state_index_].qreg().get_state( + idx - (iChunk << BasePar::chunk_bits_)); + } +#ifdef AER_MPI + complex_t amp = amps[i]; + BasePar::reduce_sum(amp); + amps[i] = amp; +#endif + } + result.save_data_pershot(Base::states_[0].creg(), op.string_params[0], + std::move(amps), op.type, op.save_type); + } else { + rvector_t amps_sq(size, 0); + for (int_t i = 0; i < size; ++i) { + uint_t idx = BasePar::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BasePar::chunk_bits_; + if (iChunk >= Base::global_state_index_ && + iChunk < Base::global_state_index_ + Base::states_.size()) { + amps_sq[i] = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .probability(idx - (iChunk << BasePar::chunk_bits_)); + } + } +#ifdef AER_MPI + BasePar::reduce_sum(amps_sq); +#endif + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); + } +} + +template +cmatrix_t Executor::density_matrix(const reg_t &qubits) { + const size_t N = qubits.size(); + const size_t DIM = 1ULL << N; + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + auto vec = copy_to_vector(); + + // Return full density matrix + cmatrix_t densmat(DIM, DIM); + if ((N == Base::num_qubits_) && (qubits == qubits_sorted)) { + const int_t mask = QV::MASKS[N]; +#pragma omp parallel for + for (int_t rowcol = 0; rowcol < int_t(DIM * DIM); ++rowcol) { + const int_t row = rowcol >> N; + const int_t col = rowcol & mask; + densmat(row, col) = complex_t(vec[row]) * complex_t(std::conj(vec[col])); + } + } else { + const size_t END = 1ULL << (Base::num_qubits_ - N); + // Initialize 
matrix values with first block + { + const auto inds = QV::indexes(qubits, qubits_sorted, 0); + for (size_t row = 0; row < DIM; ++row) + for (size_t col = 0; col < DIM; ++col) { + densmat(row, col) = + complex_t(vec[inds[row]]) * complex_t(std::conj(vec[inds[col]])); + } + } + // Accumulate remaining blocks + for (size_t k = 1; k < END; k++) { + // store entries touched by U + const auto inds = QV::indexes(qubits, qubits_sorted, k); + for (size_t row = 0; row < DIM; ++row) + for (size_t col = 0; col < DIM; ++col) { + densmat(row, col) += + complex_t(vec[inds[row]]) * complex_t(std::conj(vec[inds[col]])); + } + } + } + return densmat; +} + +//========================================================================= +// Implementation: Reset, Initialize and Measurement Sampling +//========================================================================= + +template +void Executor::apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng) { + // Actual measurement outcome + const auto meas = sample_measure_with_prob(qubits, rng); + // Implement measurement update + measure_reset_update(qubits, meas.first, meas.first, meas.second); + const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); + BasePar::store_measure(outcome, cmemory, cregister); +} + +template +rvector_t Executor::measure_probs(const reg_t &qubits) const { + uint_t dim = 1ull << qubits.size(); + rvector_t sum(dim, 0.0); + int_t i, j, k; + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + Chunk::get_qubits_inout(BasePar::chunk_bits_, qubits, qubits_in_chunk, + qubits_out_chunk); + + if (qubits_in_chunk.size() > 0) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i, j, k) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + auto chunkSum = + Base::states_[i].qreg().probabilities(qubits_in_chunk); + + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { +#pragma omp atomic + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < BasePar::chunk_bits_) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << BasePar::chunk_bits_) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } +#pragma omp atomic + sum[idx] += chunkSum[j]; + } + } + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + auto chunkSum = Base::states_[i].qreg().probabilities(qubits_in_chunk); + + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < BasePar::chunk_bits_) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << BasePar::chunk_bits_) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } + } + } + } + } else { // there is no bit in chunk + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i, j, k) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + auto nr = std::real(Base::states_[i].qreg().norm()); + int idx = 0; + for (k = 0; k < 
qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += nr; + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + auto nr = std::real(Base::states_[i].qreg().norm()); + int idx = 0; + for (k = 0; k < qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } + sum[idx] += nr; + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + + return sum; +} + +template +void Executor::apply_reset(const reg_t &qubits, RngEngine &rng) { + // Simulate unobserved measurement + const auto meas = sample_measure_with_prob(qubits, rng); + // Apply update to reset state + measure_reset_update(qubits, 0, meas.first, meas.second); +} + +template +std::pair +Executor::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); + + // Randomly pick outcome and return pair + uint_t outcome = rng.rand_int(probs); + return std::make_pair(outcome, probs[outcome]); +} + +template +void Executor::measure_reset_update(const std::vector &qubits, + const uint_t final_state, + const uint_t meas_state, + const double meas_prob) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + cvector_t mdiag(2, 0.); + mdiag[meas_state] = 1. / std::sqrt(meas_prob); + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } + + // If it doesn't agree with the reset state update + if (final_state != meas_state) { + BasePar::apply_chunk_x(qubits[0]); + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + cvector_t mdiag(dim, 0.); + mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } + + // If it doesn't agree with the reset state update + // This function could be optimized as a permutation update + if (final_state != meas_state) { + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + Chunk::get_qubits_inout(BasePar::chunk_bits_, qubits, qubits_in_chunk, + qubits_out_chunk); + + if (qubits_in_chunk.size() == qubits.size()) { // all bits are inside + // chunk + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != meas_state) + perm[j * dim + j] = 1.; + } + // apply permutation to swap state + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, perm); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, perm); + } + } + } else { + for (int_t i = 0; i < qubits.size(); i++) { + if (((final_state >> i) & 1) != ((meas_state >> i) & 1)) { + BasePar::apply_chunk_x(qubits[i]); + } + } + } + } + } +} + +template +std::vector Executor::sample_measure(const reg_t &qubits, + uint_t shots, + RngEngine &rng) const { + int_t i, j; + // Generate flat register for storing + std::vector rnds; + rnds.reserve(shots); + reg_t allbit_samples(shots, 0); + + for (i = 0; i < shots; ++i) + rnds.push_back(rng.rand(0, 1)); + + std::vector chunkSum(Base::states_.size() + 1, 0); + double sum, localSum; + + // calculate per chunk sum + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) { + bool batched = Base::states_[ic].qreg().enable_batch( + true); // return sum of all chunks in group + chunkSum[ic] = Base::states_[ic].qreg().norm(); + Base::states_[ic].qreg().enable_batch(batched); + } + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) { + bool batched = Base::states_[ic].qreg().enable_batch( + true); // return sum of all chunks in group + chunkSum[ic] = Base::states_[ic].qreg().norm(); + Base::states_[ic].qreg().enable_batch(batched); + } + } + } + + localSum = 0.0; + for (i = 0; i < Base::states_.size(); i++) { + sum = localSum; + localSum += chunkSum[i]; + chunkSum[i] = sum; + } + chunkSum[Base::states_.size()] = localSum; + + double globalSum = 0.0; + if (Base::nprocs_ > 1) { + std::vector procTotal(Base::nprocs_); + + for (i = 0; i < Base::nprocs_; i++) { + procTotal[i] = localSum; + } + + 
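    // Illustrative sketch (not part of the patch): the gather below gives
    // every process the per-rank totals, from which each rank can derive the
    // probability mass owned by lower ranks. Assuming a hypothetical
    // `totals` array indexed by rank, the idea is:
    //
    //   double offset = 0.0;
    //   for (uint_t p = 0; p < myrank; ++p)
    //     offset += totals[p];          // norm mass on lower ranks
    //   // a shot with uniform r in [0,1) belongs to this rank iff
    //   //   offset <= r < offset + totals[myrank]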
BasePar::gather_value(procTotal); + + for (i = 0; i < Base::myrank_; i++) { + globalSum += procTotal[i]; + } + } + + reg_t local_samples(shots, 0); + + // get rnds positions for each chunk + for (i = 0; i < Base::states_.size(); i++) { + uint_t nIn; + std::vector vIdx; + std::vector vRnd; + + // find rnds in this chunk + nIn = 0; + for (j = 0; j < shots; j++) { + if (rnds[j] >= chunkSum[i] + globalSum && + rnds[j] < chunkSum[i + 1] + globalSum) { + vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); + vIdx.push_back(j); + nIn++; + } + } + + if (nIn > 0) { + auto chunkSamples = Base::states_[i].qreg().sample_measure(vRnd); + + for (j = 0; j < chunkSamples.size(); j++) { + local_samples[vIdx[j]] = + ((Base::global_state_index_ + i) << BasePar::chunk_bits_) + + chunkSamples[j]; + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(local_samples); +#endif + allbit_samples = local_samples; + + // Convert to reg_t format + std::vector all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + + return all_samples; +} + +template +void Executor::apply_initialize(const reg_t &qubits, + const cvector_t ¶ms, + RngEngine &rng) { + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); + if (qubits.size() == Base::num_qubits_) { + // If qubits is all ordered qubits in the statevector + // we can just initialize the whole state directly + if (qubits == sorted_qubits) { + initialize_from_vector(params); + return; + } + } + // Apply reset to qubits + apply_reset(qubits, rng); + + // Apply initialize_component + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + Chunk::get_qubits_inout(BasePar::chunk_bits_, qubits, qubits_in_chunk, + qubits_out_chunk); + + if (qubits_out_chunk.size() == 0) { // no qubits outside of chunk + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().initialize_component(qubits, params); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().initialize_component(qubits, params); + } + } else { + // scatter base states + if (qubits_in_chunk.size() > 0) { + // scatter inside chunks + const size_t dim = 1ULL << qubits_in_chunk.size(); + cvector_t perm(dim * dim, 0.); + for (int_t i = 0; i < dim; i++) { + perm[i] = 1.0; + } + + if (BasePar::chunk_omp_parallel_) { +#pragma omp parallel for + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_matrix(qubits_in_chunk, perm); + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_matrix(qubits_in_chunk, perm); + } + } + if (qubits_out_chunk.size() > 0) { + // then scatter outside chunk + auto sorted_qubits_out = qubits_out_chunk; + std::sort(sorted_qubits_out.begin(), sorted_qubits_out.end()); + + for (int_t i = 0; i < (1ull << (Base::num_qubits_ - BasePar::chunk_bits_ - + qubits_out_chunk.size())); + i++) { + uint_t baseChunk = 0; + uint_t j, ii, t; + ii = i; + for (j = 0; j < qubits_out_chunk.size(); j++) { + t = ii & ((1ull << qubits_out_chunk[j]) - 1); + baseChunk += t; + ii = (ii - t) << 1; + } + baseChunk += ii; + baseChunk >>= 
BasePar::chunk_bits_; + + for (j = 1; j < (1ull << qubits_out_chunk.size()); j++) { + int_t ic = baseChunk; + for (t = 0; t < qubits_out_chunk.size(); t++) { + if ((j >> t) & 1) + ic += (1ull << (qubits_out_chunk[t] - BasePar::chunk_bits_)); + } + + if (ic >= Base::state_index_begin_[Base::distributed_rank_] && + ic < Base::state_index_end_[Base::distributed_rank_]) { // on this + // process + if (baseChunk >= + Base::state_index_begin_[Base::distributed_rank_] && + baseChunk < Base::state_index_end_ + [Base::distributed_rank_]) { // base chunk is on + // this process + Base::states_[ic].qreg().initialize_from_data( + Base::states_[baseChunk].qreg().data(), + 1ull << BasePar::chunk_bits_); + } else { + BasePar::recv_chunk(ic, baseChunk); + // using swap chunk function to release send/recv buffers for + // Thrust + reg_t swap(2); + swap[0] = BasePar::chunk_bits_; + swap[1] = BasePar::chunk_bits_; + Base::states_[ic].qreg().apply_chunk_swap(swap, baseChunk); + } + } else if (baseChunk >= + Base::state_index_begin_[Base::distributed_rank_] && + baseChunk < Base::state_index_end_ + [Base::distributed_rank_]) { // base chunk + // is on this + // process + BasePar::send_chunk(baseChunk - Base::global_state_index_, ic); + } + } + } + } + + // initialize by params + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_diagonal_matrix(qubits, params); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_matrix(qubits, params); + } + } +} + +template +void Executor::initialize_from_vector(const cvector_t ¶ms) { + uint_t local_offset = Base::global_state_index_ << BasePar::chunk_bits_; + +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) + for (int_t i = 0; i < Base::states_.size(); i++) { + // copy part of state for this chunk + cvector_t tmp(1ull << BasePar::chunk_bits_); + std::copy(params.begin() + local_offset + (i << BasePar::chunk_bits_), + params.begin() + local_offset + ((i + 1) << BasePar::chunk_bits_), + tmp.begin()); + Base::states_[i].qreg().initialize_from_vector(tmp); + } +} + +//========================================================================= +// Implementation: Kraus Noise +//========================================================================= +template +void Executor::apply_kraus(const reg_t &qubits, + const std::vector &kmats, + RngEngine &rng) { + // Check edge case for empty Kraus set (this shouldn't happen) + if (kmats.empty()) + return; // end function early + + // Choose a real in [0, 1) to choose the applied kraus operator once + // the accumulated probability is greater than r. 
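  // Illustrative note (not part of the patch): each K_j is selected with
  // probability p_j = <psi| K_j^dag K_j |psi> = || K_j |psi> ||^2, and the
  // chosen operator is applied as K_j / sqrt(p_j) so the post-map state
  // stays normalized. Completeness, sum_j K_j^dag K_j = I, gives
  // sum_j p_j = 1, which is what lets the last probability be inferred.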
+ // We know that the Kraus noise must be normalized + // So we only compute probabilities for the first N-1 kraus operators + // and infer the probability of the last one from 1 - sum of the previous + + double r = rng.rand(0., 1.); + double accum = 0.; + double p; + bool complete = false; + + // Loop through N-1 kraus operators + for (size_t j = 0; j < kmats.size() - 1; j++) { + + // Calculate probability + cvector_t vmat = Utils::vectorize_matrix(kmats[j]); + + p = 0.0; + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for reduction(+ : p) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + p += Base::states_[i].qreg().norm(qubits, vmat); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + p += Base::states_[i].qreg().norm(qubits, vmat); + } + +#ifdef AER_MPI + BasePar::reduce_sum(p); +#endif + accum += p; + + // check if we need to apply this operator + if (accum > r) { + // rescale vmat so projection is normalized + Utils::scalar_multiply_inplace(vmat, 1 / std::sqrt(p)); + // apply Kraus projection operator + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } + complete = true; + break; + } + } + + // check if we haven't applied a kraus operator yet + if (complete == false) { + // Compute probability from accumulated + complex_t renorm = 1 / std::sqrt(1. - accum); + auto vmat = Utils::vectorize_matrix(renorm * kmats.back()); + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } + } +} + +template +rvector_t +Executor::sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = + Base::states_[root.state_index()].qreg().probabilities(qubits); + uint_t nshots = root.num_shots(); + reg_t shot_branch(nshots); + + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = root.rng_shots()[i].rand_int(probs); + } + + // branch shots + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, probs.size()); + + return probs; +} + +template +void Executor::measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + for (int_t i = 0; i < 2; i++) { + cvector_t mdiag(2, 0.); + mdiag[i] = 1. 
/ std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + Operations::Op op; + op.type = OpType::gate; + op.name = "mcx"; + op.qubits = qubits; + root.branches()[i]->add_op_after_branch(op); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + for (int_t i = 0; i < dim; i++) { + cvector_t mdiag(dim, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + i] = 1.; + perm[i * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != i) + perm[j * dim + j] = 1.; + } + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(Utils::devectorize_matrix(perm)); + root.branches()[i]->add_op_after_branch(op); + } + } + } +} + +template +void Executor::apply_measure(CircuitExecutor::Branch &root, + const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + // save result to cregs + for (int_t i = 0; i < probs.size(); i++) { + const reg_t outcome = Utils::int2reg(i, 2, qubits.size()); + root.branches()[i]->creg().store_measure(outcome, cmemory, cregister); + } + + measure_reset_update(root, qubits, -1, probs); +} + +template +void Executor::apply_reset(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + measure_reset_update(root, qubits, 0, probs); +} + +template +void Executor::apply_initialize(CircuitExecutor::Branch &root, + const reg_t &qubits, + const cvector_t ¶ms) { + if (qubits.size() == Base::num_qubits_) { + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); + // If qubits is all ordered qubits in the statevector + // we can just initialize the whole state directly + if (qubits == sorted_qubits) { + Base::states_[root.state_index()].initialize_from_vector(params); + return; + } + } + + if (root.additional_ops().size() == 0) { + apply_reset(root, qubits); + + Operations::Op op; + op.type = OpType::initialize; + op.name = "initialize"; + op.qubits = qubits; + op.params = params; + for (int_t i = 0; i < root.num_branches(); i++) { + root.branches()[i]->add_op_after_branch(op); + } + return; // initialization will be done in next call because of shot + // branching in reset + } + + Base::states_[root.state_index()].qreg().initialize_component(qubits, params); +} + +template +void Executor::apply_kraus(CircuitExecutor::Branch &root, + const reg_t &qubits, + const std::vector &kmats) { + // Check edge case for empty Kraus set (this shouldn't happen) + if (kmats.empty()) + return; // end function early + + // Choose a real in [0, 1) to choose the applied kraus operator once + // the accumulated probability is greater than r. 
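  // Illustrative note (not part of the patch): in this shot-branching
  // variant each shot i draws its own r_i, shots that select the same Kraus
  // index j are grouped into a single branch, and K_j / sqrt(p_j) is queued
  // once per branch via add_op_after_branch instead of being applied shot
  // by shot.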
+ // We know that the Kraus noise must be normalized + // So we only compute probabilities for the first N-1 kraus operators + // and infer the probability of the last one from 1 - sum of the previous + + double r; + double accum = 0.; + double p; + bool complete = false; + + reg_t shot_branch; + uint_t nshots; + rvector_t rshots, pmats; + uint_t nshots_multiplied = 0; + + nshots = root.num_shots(); + shot_branch.resize(nshots); + rshots.resize(nshots); + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = kmats.size() - 1; + rshots[i] = root.rng_shots()[i].rand(0., 1.); + } + pmats.resize(kmats.size()); + + // Loop through N-1 kraus operators + for (size_t j = 0; j < kmats.size() - 1; j++) { + // Calculate probability + cvector_t vmat = Utils::vectorize_matrix(kmats[j]); + + p = Base::states_[root.state_index()].qreg().norm(qubits, vmat); + accum += p; + + // check if we need to apply this operator + pmats[j] = p; + for (int_t i = 0; i < nshots; i++) { + if (shot_branch[i] >= kmats.size() - 1) { + if (accum > rshots[i]) { + shot_branch[i] = j; + nshots_multiplied++; + } + } + } + if (nshots_multiplied >= nshots) { + complete = true; + break; + } + } + + // check if we haven't applied a kraus operator yet + pmats[pmats.size() - 1] = 1. - accum; + + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, kmats.size()); + for (int_t i = 0; i < kmats.size(); i++) { + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(kmats[i]); + p = 1 / std::sqrt(pmats[i]); + for (int_t j = 0; j < op.mats[0].size(); j++) + op.mats[0][j] *= p; + root.branches()[i]->add_op_after_branch(op); + } +} + +template +void Executor::apply_save_statevector(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + std::string key = + (op.string_params[0] == "_method_") ? "statevector" : op.string_params[0]; + + if (last_op) { + const auto v = Base::states_[root.state_index()].move_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } else { + const auto v = Base::states_[root.state_index()].copy_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } +} + +template +void Executor::apply_save_statevector_dict( + CircuitExecutor::Branch &root, const Operations::Op &op, + ExperimentResult &result) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." 
+ " Only the full statevector can be saved."); + } + auto state_ket = Base::states_[root.state_index()].qreg().vector_ket( + Base::json_chop_threshold_); + std::map result_state_ket; + for (auto const &it : state_ket) { + result_state_ket[it.first] = it.second; + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const std::map &)result_state_ket, op.type, + op.save_type); + } +} + +template +void Executor::apply_save_amplitudes(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes instructions (empty params)."); + } + const int_t size = op.int_params.size(); + if (op.type == Operations::OpType::save_amps) { + Vector amps(size, false); + for (int_t i = 0; i < size; ++i) { + amps[i] = + Base::states_[root.state_index()].qreg().get_state(op.int_params[i]); + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const Vector &)amps, op.type, op.save_type); + } + } else { + rvector_t amps_sq(size, 0); + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = Base::states_[root.state_index()].qreg().probability( + op.int_params[i]); + } + result.save_data_average(Base::states_[root.state_index()].creg(), + op.string_params[0], amps_sq, op.type, + op.save_type); + } +} + +template +std::vector +Executor::sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const { + int_t i, j; + std::vector rnds; + rnds.reserve(shots); + + for (i = 0; i < shots; ++i) + rnds.push_back(rng[i].rand(0, 1)); + + bool flg = state.qreg().enable_batch(false); + auto allbit_samples = state.qreg().sample_measure(rnds); + state.qreg().enable_batch(flg); + + // Convert to reg_t format + std::vector all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + return all_samples; +} + +//------------------------------------------------------------------------- +} // end namespace Statevector +//------------------------------------------------------------------------- +} // end namespace AER +//------------------------------------------------------------------------- +#endif diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp old mode 100644 new mode 100755 index 9a257ef08d..6746cd897a --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -23,7 +23,9 @@ #include "framework/json.hpp" #include "framework/utils.hpp" #include "qubitvector.hpp" -#include "simulators/state_chunk.hpp" +#include "simulators/chunk_utils.hpp" +#include "simulators/state.hpp" + #ifdef AER_THRUST_SUPPORTED #include "qubitvector_thrust.hpp" #endif @@ -109,9 +111,9 @@ enum class Gates { //========================================================================= template > -class State : public QuantumState::StateChunk { +class State : public QuantumState::State { public: - using BaseState = QuantumState::StateChunk; + using BaseState = QuantumState::State; State() : BaseState(StateOpSet) {} virtual ~State() = default; @@ -125,12 +127,18 @@ class State : 
public QuantumState::StateChunk { // Apply an operation // If the op is not in allowed_ops an exeption will be raised. - void apply_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, RngEngine &rng, - bool final_op = false) override; + void apply_op(const Operations::Op &op, ExperimentResult &result, + RngEngine &rng, bool final_op = false) override; + + // memory allocation (previously called before inisitalize_qreg) + bool allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots = 1) override; // Initializes an n-qubit state to the all |0> state - virtual void initialize_qreg(uint_t num_qubits) override; + void initialize_qreg(uint_t num_qubits) override; + + // Initializes to a specific n-qubit state + void initialize_statevector(uint_t num_qubits, statevec_t &&state); // Returns the required memory for storing an n-qubit state in megabytes. // For this state the memory is independent of the number of ops @@ -155,74 +163,66 @@ class State : public QuantumState::StateChunk { // Initialize OpenMP settings for the underlying QubitVector class void initialize_omp(); - // Initializes to a specific n-qubit state - virtual void initialize_qreg(uint_t num_qubits, statevec_t &&state); - - auto move_to_vector(const int_t iChunk); - auto copy_to_vector(const int_t iChunk); + auto move_to_vector(void); + auto copy_to_vector(void); -protected: //----------------------------------------------------------------------- // Apply instructions //----------------------------------------------------------------------- - // apply op to multiple shots , return flase if op is not supported to execute - // in a batch - bool apply_batched_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, std::vector &rng, - bool final_op = false) override; // Applies a sypported Gate operation to the state class. // If the input is not in allowed_gates an exeption will be raised. - void apply_gate(const int_t iChunk, const Operations::Op &op); + void apply_gate(const Operations::Op &op); // Measure qubits and return a list of outcomes [q0, q1, ...] // If a state subclass supports this function it then "measure" // should be contained in the set returned by the 'allowed_ops' // method. - virtual void apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, const reg_t &cregister, - RngEngine &rng); + virtual void apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng); // Reset the specified qubits to the |0> state by simulating // a measurement, applying a conditional x-gate if the outcome is 1, and // then discarding the outcome. 
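  // Illustrative sketch (not part of the patch) of that reset recipe in
  // pseudocode:
  //
  //   outcome = measure(q);      // project q onto |0> or |1>
  //   if (outcome == 1) x(q);    // flip |1> back to |0>
  //   // outcome is discarded rather than stored in a classical register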
- void apply_reset(const int_t iChunk, const reg_t &qubits, RngEngine &rng); + void apply_reset(const reg_t &qubits, RngEngine &rng); // Initialize the specified qubits to a given state |psi> // by applying a reset to the these qubits and then // computing the tensor product with the new state |psi> // /psi> is given in params - void apply_initialize(const int_t iChunk, const reg_t &qubits, - const cvector_t ¶ms, RngEngine &rng); + void apply_initialize(const reg_t &qubits, const cvector_t ¶ms, + RngEngine &rng); - void initialize_from_vector(const int_t iChunk, const cvector_t ¶ms); + void initialize_from_vector(const cvector_t ¶ms); // Apply a matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const Operations::Op &op); + void apply_matrix(const Operations::Op &op); // Apply a vectorized matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const reg_t &qubits, - const cvector_t &vmat); + void apply_matrix(const reg_t &qubits, const cvector_t &vmat); // apply diagonal matrix - void apply_diagonal_matrix(const int_t iChunk, const reg_t &qubits, - const cvector_t &diag); + void apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag); // Apply a vector of control matrices to given qubits (identity on all other // qubits) - void apply_multiplexer(const int_t iChunk, const reg_t &control_qubits, + void apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const std::vector &mmat); // Apply stacked (flat) version of multiplexer matrix to target qubits (using // control qubits to select matrix instance) - void apply_multiplexer(const int_t iChunk, const reg_t &control_qubits, + void apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const cmatrix_t &mat); // Apply a Kraus error operation - void apply_kraus(const int_t iChunk, const reg_t &qubits, - const std::vector &krausops, RngEngine &rng); + void apply_kraus(const reg_t &qubits, const std::vector &krausops, + RngEngine &rng); + // Return the reduced density matrix for the simulator + cmatrix_t density_matrix(const reg_t &qubits); + +protected: //----------------------------------------------------------------------- // Save data instructions //----------------------------------------------------------------------- @@ -231,27 +231,26 @@ class State : public QuantumState::StateChunk { // If `last_op` is True this will use move semantics to move the simulator // state to the results, otherwise it will use copy semantics to leave // the current simulator state unchanged. - void apply_save_statevector(const int_t iChunk, const Operations::Op &op, + void apply_save_statevector(const Operations::Op &op, ExperimentResult &result, bool last_op); // Save the current state of the statevector simulator as a ket-form map. 
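  // Illustrative note (not part of the patch): for the two-qubit state
  // (|00> + |11>)/sqrt(2) the saved map is, roughly,
  //   { "0x0": (0.7071, 0), "0x3": (0.7071, 0) }
  // with hexadecimal basis-state keys; entries whose magnitude falls below
  // json_chop_threshold_ are dropped.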
- void apply_save_statevector_dict(const int_t iChunk, const Operations::Op &op, + void apply_save_statevector_dict(const Operations::Op &op, ExperimentResult &result); // Save the current density matrix or reduced density matrix - void apply_save_density_matrix(const int_t iChunk, const Operations::Op &op, + void apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result); // Helper function for computing expectation value - void apply_save_probs(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result); + void apply_save_probs(const Operations::Op &op, ExperimentResult &result); // Helper function for saving amplitudes and amplitudes squared - void apply_save_amplitudes(const int_t iChunk, const Operations::Op &op, + void apply_save_amplitudes(const Operations::Op &op, ExperimentResult &result); // Helper function for computing expectation value - virtual double expval_pauli(const int_t iChunk, const reg_t &qubits, + virtual double expval_pauli(const reg_t &qubits, const std::string &pauli) override; //----------------------------------------------------------------------- // Measurement Helpers @@ -262,7 +261,7 @@ class State : public QuantumState::StateChunk { // should be contained in the set returned by the 'allowed_ops' // method. // TODO: move to private (no longer part of base class) - rvector_t measure_probs(const int_t iChunk, const reg_t &qubits) const; + rvector_t measure_probs(const reg_t &qubits) const; // Sample the measurement outcome for qubits // return a pair (m, p) of the outcome m, and its corresponding @@ -272,18 +271,13 @@ class State : public QuantumState::StateChunk { // 1 -> |q1 = 0, q0 = 1> state // 2 -> |q1 = 1, q0 = 0> state // 3 -> |q1 = 1, q0 = 1> state - std::pair sample_measure_with_prob(const int_t iChunk, - const reg_t &qubits, + std::pair sample_measure_with_prob(const reg_t &qubits, RngEngine &rng); - void measure_reset_update(const int_t iChunk, - const std::vector &qubits, + void measure_reset_update(const std::vector &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob); - // Return the reduced density matrix for the simulator - cmatrix_t density_matrix(const int_t iChunk, const reg_t &qubits); - // Helper function to convert a vector to a reduced density matrix template cmatrix_t vec2density(const reg_t &qubits, const T &vec); @@ -293,8 +287,7 @@ class State : public QuantumState::StateChunk { //----------------------------------------------------------------------- // Optimize phase gate with diagonal [1, phase] - void apply_gate_phase(const int_t iChunk, const uint_t qubit, - const complex_t phase); + void apply_gate_phase(const uint_t qubit, const complex_t phase); //----------------------------------------------------------------------- // Multi-controlled u3 @@ -303,9 +296,8 @@ class State : public QuantumState::StateChunk { // Apply N-qubit multi-controlled single qubit gate specified by // 4 parameters u4(theta, phi, lambda, gamma) // NOTE: if N=1 this is just a regular u4 gate. 
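  // Illustrative note (not part of the patch), using the standard convention
  // in which gamma adds a phase on top of u3:
  //
  //   u4(theta, phi, lambda, gamma) =
  //     e^{i gamma} * | cos(theta/2)             -e^{i lambda} sin(theta/2)      |
  //                   | e^{i phi} sin(theta/2)    e^{i(phi+lambda)} cos(theta/2) |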
- void apply_gate_mcu(const int_t iChunk, const reg_t &qubits, - const double theta, const double phi, const double lambda, - const double gamma); + void apply_gate_mcu(const reg_t &qubits, const double theta, const double phi, + const double lambda, const double gamma); //----------------------------------------------------------------------- // Config Settings @@ -411,92 +403,23 @@ const stringmap_t State::gateset_( template void State::initialize_qreg(uint_t num_qubits) { int_t i; - if (BaseState::qregs_.size() == 0) - BaseState::allocate(num_qubits, num_qubits, 1); - initialize_omp(); - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_num_qubits(BaseState::chunk_bits_); - } + BaseState::qreg_.set_num_qubits(num_qubits); + BaseState::qreg_.initialize(); - if (BaseState::multi_chunk_distribution_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - if (BaseState::global_chunk_index_ + iChunk == 0 || - this->num_qubits_ == this->chunk_bits_) { - BaseState::qregs_[iChunk].initialize(); - } else { - BaseState::qregs_[iChunk].zero(); - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - if (BaseState::global_chunk_index_ + i == 0 || - this->num_qubits_ == this->chunk_bits_) { - BaseState::qregs_[i].initialize(); - } else { - BaseState::qregs_[i].zero(); - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].initialize(); - } - } apply_global_phase(); } template -void State::initialize_qreg(uint_t num_qubits, statevec_t &&state) { +void State::initialize_statevector(uint_t num_qubits, + statevec_t &&state) { if (state.num_qubits() != num_qubits) { throw std::invalid_argument("QubitVector::State::initialize: initial state " "does not match qubit number"); } - if (BaseState::qregs_.size() == 1) { - BaseState::qregs_[0] = std::move(state); - } else { - if (BaseState::qregs_.size() == 0) - BaseState::allocate(num_qubits, num_qubits, 1); - initialize_omp(); - - int_t iChunk; - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); - } - - if (BaseState::multi_chunk_distribution_) { - uint_t local_offset = BaseState::global_chunk_index_ - << BaseState::chunk_bits_; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(iChunk) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) - BaseState::qregs_[iChunk].initialize_from_data( - state.data() + local_offset + - (iChunk << BaseState::chunk_bits_), - 1ull << BaseState::chunk_bits_); - } - } else { - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) - BaseState::qregs_[iChunk].initialize_from_data( - state.data() + local_offset + (iChunk << BaseState::chunk_bits_), - 1ull << BaseState::chunk_bits_); - } - } else { - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - BaseState::qregs_[iChunk].initialize_from_data( - state.data(), 1ull << BaseState::chunk_bits_); - } - } - } + BaseState::qreg_ = std::move(state); apply_global_phase(); } @@ -505,12 +428,21 @@ template void State::initialize_omp() { uint_t i; - for (i = 0; i < BaseState::qregs_.size(); i++) { - 
BaseState::qregs_[i].set_omp_threshold(omp_qubit_threshold_); - if (BaseState::threads_ > 0) - BaseState::qregs_[i].set_omp_threads( - BaseState::threads_); // set allowed OMP threads in qubitvector - } + BaseState::qreg_.set_omp_threshold(omp_qubit_threshold_); + if (BaseState::threads_ > 0) // set allowed OMP threads in qubitvector + BaseState::qreg_.set_omp_threads(BaseState::threads_); +} + +template +bool State::allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots) { + if (BaseState::max_matrix_qubits_ > 0) + BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_); + + BaseState::qreg_.set_target_gpus(BaseState::target_gpus_); + BaseState::qreg_.chunk_setup(block_bits, num_qubits, 0, 1); + + return true; } //------------------------------------------------------------------------- @@ -519,30 +451,16 @@ void State::initialize_omp() { template void State::apply_global_phase() { - if (BaseState::has_global_phase_) { - int_t i; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) - BaseState::qregs_[iChunk].apply_diagonal_matrix( - {0}, {BaseState::global_phase_, BaseState::global_phase_}); - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].apply_diagonal_matrix( - {0}, {BaseState::global_phase_, BaseState::global_phase_}); - } - } + if (BaseState::has_global_phase_) + BaseState::qreg_.apply_diagonal_matrix( + {0}, {BaseState::global_phase_, BaseState::global_phase_}); } template size_t State::required_memory_mb( uint_t num_qubits, const std::vector &ops) const { (void)ops; // avoid unused variable compiler warning - statevec_t tmp; - return tmp.required_memory_mb(num_qubits); + return BaseState::qreg_.required_memory_mb(num_qubits); } template @@ -551,9 +469,7 @@ void State::set_config(const Config &config) { // Set threshold for truncating states to be saved json_chop_threshold_ = config.zero_threshold; - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_json_chop_threshold(json_chop_threshold_); - } + BaseState::qreg_.set_json_chop_threshold(json_chop_threshold_); // Set OMP threshold for state update functions omp_qubit_threshold_ = config.statevector_parallel_threshold; @@ -561,152 +477,95 @@ void State::set_config(const Config &config) { // Set the sample measure indexing size if (config.statevector_sample_measure_opt) { int index_size = config.statevector_sample_measure_opt; - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_sample_measure_index_size(index_size); - } + BaseState::qreg_.set_sample_measure_index_size(index_size); } } template -auto State::move_to_vector(const int_t iChunkIn) { - if (BaseState::multi_chunk_distribution_) { - size_t size_required = - 2 * (sizeof(std::complex) << BaseState::num_qubits_) + - (sizeof(std::complex) << BaseState::chunk_bits_) * - BaseState::num_local_chunks_; - if ((size_required >> 20) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to store states")); - } - int_t iChunk; - auto state = BaseState::qregs_[0].move_to_vector(); - state.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); - -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) private(iChunk) - for (iChunk = 1; iChunk < BaseState::qregs_.size(); 
iChunk++) { - auto tmp = BaseState::qregs_[iChunk].move_to_vector(); - uint_t j, offset = iChunk << BaseState::chunk_bits_; - for (j = 0; j < tmp.size(); j++) { - state[offset + j] = tmp[j]; - } - } - -#ifdef AER_MPI - BaseState::gather_state(state); -#endif - return state; - } else { - return std::move(BaseState::qregs_[iChunkIn].move_to_vector()); - } +auto State::move_to_vector(void) { + return std::move(BaseState::qreg_.move_to_vector()); } template -auto State::copy_to_vector(const int_t iChunkIn) { - if (BaseState::multi_chunk_distribution_) { - size_t size_required = - 2 * (sizeof(std::complex) << BaseState::num_qubits_) + - (sizeof(std::complex) << BaseState::chunk_bits_) * - BaseState::num_local_chunks_; - if ((size_required >> 20) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to store states")); - } - int_t iChunk; - auto state = BaseState::qregs_[0].copy_to_vector(); - state.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); - -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) private(iChunk) - for (iChunk = 1; iChunk < BaseState::qregs_.size(); iChunk++) { - auto tmp = BaseState::qregs_[iChunk].copy_to_vector(); - uint_t j, offset = iChunk << BaseState::chunk_bits_; - for (j = 0; j < tmp.size(); j++) { - state[offset + j] = tmp[j]; - } - } - -#ifdef AER_MPI - BaseState::gather_state(state); -#endif - return state; - } else - return BaseState::qregs_[iChunkIn].copy_to_vector(); +auto State::copy_to_vector(void) { + return BaseState::qreg_.copy_to_vector(); } //========================================================================= // Implementation: apply operations //========================================================================= template -void State::apply_op(const int_t iChunk, const Operations::Op &op, +void State::apply_op(const Operations::Op &op, ExperimentResult &result, RngEngine &rng, bool final_op) { - if (BaseState::check_conditional(iChunk, op)) { + if (BaseState::creg().check_conditional(op)) { switch (op.type) { case OpType::barrier: case OpType::nop: case OpType::qerror_loc: break; case OpType::reset: - apply_reset(iChunk, op.qubits, rng); + apply_reset(op.qubits, rng); break; case OpType::initialize: - apply_initialize(iChunk, op.qubits, op.params, rng); + apply_initialize(op.qubits, op.params, rng); break; case OpType::measure: - apply_measure(iChunk, op.qubits, op.memory, op.registers, rng); + apply_measure(op.qubits, op.memory, op.registers, rng); break; case OpType::bfunc: - BaseState::cregs_[0].apply_bfunc(op); + BaseState::creg().apply_bfunc(op); break; case OpType::roerror: - BaseState::cregs_[0].apply_roerror(op, rng); + BaseState::creg().apply_roerror(op, rng); break; case OpType::gate: - apply_gate(iChunk, op); + apply_gate(op); break; case OpType::matrix: - apply_matrix(iChunk, op); + apply_matrix(op); break; case OpType::diagonal_matrix: - apply_diagonal_matrix(iChunk, op.qubits, op.params); + apply_diagonal_matrix(op.qubits, op.params); break; case OpType::multiplexer: - apply_multiplexer(iChunk, op.regs[0], op.regs[1], + apply_multiplexer(op.regs[0], op.regs[1], op.mats); // control qubits ([0]) & target qubits([1]) break; case OpType::kraus: - apply_kraus(iChunk, op.qubits, op.mats, rng); + apply_kraus(op.qubits, op.mats, rng); break; case OpType::sim_op: if (op.name == "begin_register_blocking") { - BaseState::qregs_[iChunk].enter_register_blocking(op.qubits); + BaseState::qreg_.enter_register_blocking(op.qubits); } else if (op.name == 
"end_register_blocking") { - BaseState::qregs_[iChunk].leave_register_blocking(); + BaseState::qreg_.leave_register_blocking(); } break; case OpType::set_statevec: - initialize_from_vector(iChunk, op.params); + initialize_from_vector(op.params); break; case OpType::save_expval: case OpType::save_expval_var: - BaseState::apply_save_expval(iChunk, op, result); + BaseState::apply_save_expval(op, result); break; case OpType::save_densmat: - apply_save_density_matrix(iChunk, op, result); + apply_save_density_matrix(op, result); break; case OpType::save_state: case OpType::save_statevec: - apply_save_statevector(iChunk, op, result, final_op); + apply_save_statevector(op, result, final_op); break; case OpType::save_statevec_dict: - apply_save_statevector_dict(iChunk, op, result); + apply_save_statevector_dict(op, result); break; case OpType::save_probs: case OpType::save_probs_ket: - apply_save_probs(iChunk, op, result); + apply_save_probs(op, result); break; case OpType::save_amps: case OpType::save_amps_sq: - apply_save_amplitudes(iChunk, op, result); + apply_save_amplitudes(op, result); break; default: throw std::invalid_argument("QubitVector::State::invalid instruction \'" + @@ -715,282 +574,37 @@ void State::apply_op(const int_t iChunk, const Operations::Op &op, } } -template -bool State::apply_batched_op(const int_t iChunk, - const Operations::Op &op, - ExperimentResult &result, - std::vector &rng, - bool final_op) { - if (op.conditional) { - BaseState::qregs_[iChunk].set_conditional(op.conditional_reg); - } - - switch (op.type) { - case OpType::barrier: - case OpType::nop: - case OpType::qerror_loc: - break; - case OpType::reset: - BaseState::qregs_[iChunk].apply_batched_reset(op.qubits, rng); - break; - case OpType::initialize: - BaseState::qregs_[iChunk].apply_batched_reset(op.qubits, rng); - BaseState::qregs_[iChunk].initialize_component(op.qubits, op.params); - break; - case OpType::measure: - BaseState::qregs_[iChunk].apply_batched_measure(op.qubits, rng, op.memory, - op.registers); - break; - case OpType::bfunc: - BaseState::qregs_[iChunk].apply_bfunc(op); - break; - case OpType::roerror: - BaseState::qregs_[iChunk].apply_roerror(op, rng); - break; - case OpType::gate: - apply_gate(iChunk, op); - break; - case OpType::matrix: - apply_matrix(iChunk, op); - break; - case OpType::diagonal_matrix: - BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, op.params); - break; - case OpType::multiplexer: - apply_multiplexer(iChunk, op.regs[0], op.regs[1], - op.mats); // control qubits ([0]) & target qubits([1]) - break; - case OpType::kraus: - BaseState::qregs_[iChunk].apply_batched_kraus(op.qubits, op.mats, rng); - break; - case OpType::sim_op: - if (op.name == "begin_register_blocking") { - BaseState::qregs_[iChunk].enter_register_blocking(op.qubits); - } else if (op.name == "end_register_blocking") { - BaseState::qregs_[iChunk].leave_register_blocking(); - } else { - return false; - } - break; - case OpType::set_statevec: - BaseState::qregs_[iChunk].initialize_from_vector(op.params); - break; - default: - // other operations should be called to indivisual chunks by apply_op - return false; - } - return true; -} - //========================================================================= // Implementation: Save data //========================================================================= template -void State::apply_save_probs(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_probs(const Operations::Op &op, ExperimentResult &result) { // get probs as 
hexadecimal - auto probs = measure_probs(iChunk, op.qubits); + auto probs = measure_probs(op.qubits); if (op.type == Operations::OpType::save_probs_ket) { // Convert to ket dict - result.save_data_average(BaseState::chunk_creg(iChunk), op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], Utils::vec2ket(probs, json_chop_threshold_, 16), op.type, op.save_type); } else { - result.save_data_average(BaseState::chunk_creg(iChunk), op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], std::move(probs), op.type, op.save_type); } } template -double State::expval_pauli(const int_t iChunk, const reg_t &qubits, +double State::expval_pauli(const reg_t &qubits, const std::string &pauli) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].expval_pauli(qubits, pauli); - - // multi-chunk distribution - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - std::string pauli_in_chunk; - std::string pauli_out_chunk; - int_t i, n; - double expval(0.); - - // get inner/outer chunk pauli string - n = pauli.size(); - for (i = 0; i < n; i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - pauli_in_chunk.push_back(pauli[n - i - 1]); - } else { - qubits_out_chunk.push_back(qubits[i]); - pauli_out_chunk.push_back(pauli[n - i - 1]); - } - } - - if (qubits_out_chunk.size() > 0) { // there are bits out of chunk - std::complex phase = 1.0; - - std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); - std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); - - uint_t x_mask, z_mask, num_y, x_max; - std::tie(x_mask, z_mask, num_y, x_max) = - AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); - - AER::QV::add_y_phase(num_y, phase); - - if (x_mask != 0) { // pairing state is out of chunk - bool on_same_process = true; -#ifdef AER_MPI - int proc_bits = 0; - uint_t procs = BaseState::distributed_procs_; - while (procs > 1) { - if ((procs & 1) != 0) { - proc_bits = -1; - break; - } - proc_bits++; - procs >>= 1; - } - if (x_mask & (~((1ull << (BaseState::num_qubits_ - proc_bits)) - 1)) != - 0) { // data exchange between processes is required - on_same_process = false; - } -#endif - - x_mask >>= BaseState::chunk_bits_; - z_mask >>= BaseState::chunk_bits_; - x_max -= BaseState::chunk_bits_; - - const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); - const uint_t mask_l = (1ull << x_max) - 1; - if (on_same_process) { - auto apply_expval_pauli_chunk = [this, x_mask, z_mask, x_max, mask_u, - mask_l, qubits_in_chunk, - pauli_in_chunk, phase](int_t iGroup) { - double expval = 0.0; - for (int_t iChunk = BaseState::top_chunk_of_group_[iGroup]; - iChunk < BaseState::top_chunk_of_group_[iGroup + 1]; iChunk++) { - uint_t pair_chunk = iChunk ^ x_mask; - if (iChunk < pair_chunk) { - uint_t z_count, z_count_pair; - z_count = AER::Utils::popcount(iChunk & z_mask); - z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); - - expval += - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli(qubits_in_chunk, pauli_in_chunk, - BaseState::qregs_[pair_chunk], z_count, - z_count_pair, phase); - } - } - return expval; - }; - expval += Utils::apply_omp_parallel_for_reduction( - (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0), 0, - BaseState::num_global_chunks_ / 2, apply_expval_pauli_chunk); - } else { - for (int_t i = 0; i < BaseState::num_global_chunks_ / 2; i++) { - uint_t iChunk = ((i << 1) & mask_u) | (i & mask_l); - uint_t pair_chunk = iChunk ^ x_mask; 
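          // Illustrative note (not part of the patch): with Pauli X or Y on
          // out-of-chunk qubits, chunk i pairs with chunk i ^ x_mask, so the
          // expectation value is accumulated over chunk pairs; z_mask only
          // contributes a sign, (-1)^popcount(i & z_mask).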
- uint_t iProc = BaseState::get_process_by_chunk(pair_chunk); - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) { // on this process - uint_t z_count, z_count_pair; - z_count = AER::Utils::popcount(iChunk & z_mask); - z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); - - if (iProc == - BaseState::distributed_rank_) { // pair is on the same process - expval += - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli( - qubits_in_chunk, pauli_in_chunk, - BaseState::qregs_[pair_chunk - - BaseState::global_chunk_index_], - z_count, z_count_pair, phase); - } else { - BaseState::recv_chunk(iChunk - BaseState::global_chunk_index_, - pair_chunk); - // refer receive buffer to calculate expectation value - expval += - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli( - qubits_in_chunk, pauli_in_chunk, - BaseState::qregs_[iChunk - - BaseState::global_chunk_index_], - z_count, z_count_pair, phase); - } - } else if (iProc == - BaseState::distributed_rank_) { // pair is on this process - BaseState::send_chunk(iChunk - BaseState::global_chunk_index_, - pair_chunk); - } - } - } - } else { // no exchange between chunks - z_mask >>= BaseState::chunk_bits_; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for reduction(+ : expval) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - double e_tmp = 0.0; - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - double sign = 1.0; - if (z_mask && - (AER::Utils::popcount( - (iChunk + BaseState::global_chunk_index_) & z_mask) & - 1)) - sign = -1.0; - e_tmp += sign * BaseState::qregs_[iChunk].expval_pauli( - qubits_in_chunk, pauli_in_chunk); - } - expval += e_tmp; - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - double sign = 1.0; - if (z_mask && (AER::Utils::popcount( - (i + BaseState::global_chunk_index_) & z_mask) & - 1)) - sign = -1.0; - expval += sign * BaseState::qregs_[i].expval_pauli(qubits_in_chunk, - pauli_in_chunk); - } - } - } - } else { // all bits are inside chunk - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for reduction(+ : expval) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - double e_tmp = 0.0; - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) - e_tmp += BaseState::qregs_[iChunk].expval_pauli(qubits, pauli); - expval += e_tmp; - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) - expval += BaseState::qregs_[i].expval_pauli(qubits, pauli); - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(expval); -#endif - return expval; + return BaseState::qreg_.expval_pauli(qubits, pauli); } template -void State::apply_save_statevector(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_statevector(const Operations::Op &op, ExperimentResult &result, bool last_op) { - if (op.qubits.size() != BaseState::num_qubits_) { + if (op.qubits.size() != BaseState::qreg_.num_qubits()) { throw std::invalid_argument(op.name + " was not applied to all qubits." " Only the full statevector can be saved."); @@ -999,52 +613,34 @@ void State::apply_save_statevector(const int_t iChunk, (op.string_params[0] == "_method_") ? 
"statevector" : op.string_params[0]; if (last_op) { - auto v = move_to_vector(iChunk); - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, std::move(v), + auto v = move_to_vector(); + result.save_data_pershot(BaseState::creg(), key, std::move(v), OpType::save_statevec, op.save_type); } else { - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, - copy_to_vector(iChunk), OpType::save_statevec, - op.save_type); + result.save_data_pershot(BaseState::creg(), key, copy_to_vector(), + OpType::save_statevec, op.save_type); } } template -void State::apply_save_statevector_dict(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_statevector_dict(const Operations::Op &op, ExperimentResult &result) { - if (op.qubits.size() != BaseState::num_qubits_) { + if (op.qubits.size() != BaseState::qreg_.num_qubits()) { throw std::invalid_argument(op.name + " was not applied to all qubits." " Only the full statevector can be saved."); } - if (BaseState::multi_chunk_distribution_) { - auto vec = copy_to_vector(iChunk); - std::map result_state_ket; - for (size_t k = 0; k < vec.size(); ++k) { - if (std::abs(vec[k]) >= json_chop_threshold_) { - std::string key = Utils::int2hex(k); - result_state_ket.insert({key, vec[k]}); - } - } - result.save_data_pershot(BaseState::chunk_creg(iChunk), op.string_params[0], - std::move(result_state_ket), op.type, - op.save_type); - } else { - auto state_ket = BaseState::qregs_[iChunk].vector_ket(json_chop_threshold_); - std::map result_state_ket; - for (auto const &it : state_ket) { - result_state_ket[it.first] = it.second; - } - result.save_data_pershot(BaseState::chunk_creg(iChunk), op.string_params[0], - std::move(result_state_ket), op.type, - op.save_type); + auto state_ket = BaseState::qreg_.vector_ket(json_chop_threshold_); + std::map result_state_ket; + for (auto const &it : state_ket) { + result_state_ket[it.first] = it.second; } + result.save_data_pershot(BaseState::creg(), op.string_params[0], + std::move(result_state_ket), op.type, op.save_type); } template -void State::apply_save_density_matrix(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result) { cmatrix_t reduced_state; @@ -1052,34 +648,17 @@ void State::apply_save_density_matrix(const int_t iChunk, if (op.qubits.empty()) { reduced_state = cmatrix_t(1, 1); - if (!BaseState::multi_chunk_distribution_) { - reduced_state[0] = BaseState::qregs_[iChunk].norm(); - } else { - double sum = 0.0; - if (BaseState::chunk_omp_parallel_) { -#pragma omp parallel for reduction(+ : sum) - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - sum += BaseState::qregs_[i].norm(); - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - sum += BaseState::qregs_[i].norm(); - } -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - reduced_state[0] = sum; - } + reduced_state[0] = BaseState::qreg_.norm(); } else { - reduced_state = density_matrix(iChunk, op.qubits); + reduced_state = density_matrix(op.qubits); } - result.save_data_average(BaseState::chunk_creg(iChunk), op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], std::move(reduced_state), op.type, op.save_type); } template -void State::apply_save_amplitudes(const int_t iChunkIn, - const Operations::Op &op, +void State::apply_save_amplitudes(const Operations::Op &op, ExperimentResult &result) { if (op.int_params.empty()) { throw std::invalid_argument( @@ -1088,63 +667,24 @@ void 
State::apply_save_amplitudes(const int_t iChunkIn, const int_t size = op.int_params.size(); if (op.type == Operations::OpType::save_amps) { Vector amps(size, false); - if (!BaseState::multi_chunk_distribution_) { - for (int_t i = 0; i < size; ++i) { - amps[i] = BaseState::qregs_[iChunkIn].get_state(op.int_params[i]); - } - } else { - for (int_t i = 0; i < size; ++i) { - uint_t idx = BaseState::mapped_index(op.int_params[i]); - uint_t iChunk = idx >> BaseState::chunk_bits_; - amps[i] = 0.0; - if (iChunk >= BaseState::global_chunk_index_ && - iChunk < - BaseState::global_chunk_index_ + BaseState::qregs_.size()) { - amps[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .get_state(idx - (iChunk << BaseState::chunk_bits_)); - } -#ifdef AER_MPI - complex_t amp = amps[i]; - BaseState::reduce_sum(amp); - amps[i] = amp; -#endif - } + for (int_t i = 0; i < size; ++i) { + amps[i] = BaseState::qreg_.get_state(op.int_params[i]); } - result.save_data_pershot(BaseState::chunk_creg(iChunkIn), - op.string_params[0], std::move(amps), op.type, - op.save_type); + result.save_data_pershot(BaseState::creg(), op.string_params[0], + std::move(amps), op.type, op.save_type); } else { rvector_t amps_sq(size, 0); - if (!BaseState::multi_chunk_distribution_) { - for (int_t i = 0; i < size; ++i) { - amps_sq[i] = BaseState::qregs_[iChunkIn].probability(op.int_params[i]); - } - } else { - for (int_t i = 0; i < size; ++i) { - uint_t idx = BaseState::mapped_index(op.int_params[i]); - uint_t iChunk = idx >> BaseState::chunk_bits_; - if (iChunk >= BaseState::global_chunk_index_ && - iChunk < - BaseState::global_chunk_index_ + BaseState::qregs_.size()) { - amps_sq[i] = - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .probability(idx - (iChunk << BaseState::chunk_bits_)); - } - } -#ifdef AER_MPI - BaseState::reduce_sum(amps_sq); -#endif + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = BaseState::qreg_.probability(op.int_params[i]); } - result.save_data_average(BaseState::chunk_creg(iChunkIn), - op.string_params[0], std::move(amps_sq), op.type, - op.save_type); + result.save_data_average(BaseState::creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); } } template -cmatrix_t State::density_matrix(const int_t iChunk, - const reg_t &qubits) { - return vec2density(qubits, copy_to_vector(iChunk)); +cmatrix_t State::density_matrix(const reg_t &qubits) { + return vec2density(qubits, copy_to_vector()); } template @@ -1157,7 +697,7 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { // Return full density matrix cmatrix_t densmat(DIM, DIM); - if ((N == BaseState::num_qubits_) && (qubits == qubits_sorted)) { + if ((N == BaseState::qreg_.num_qubits()) && (qubits == qubits_sorted)) { const int_t mask = QV::MASKS[N]; #pragma omp parallel for if (2 * N > omp_qubit_threshold_ && \ BaseState::threads_ > 1) \ @@ -1168,7 +708,7 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { densmat(row, col) = complex_t(vec[row]) * complex_t(std::conj(vec[col])); } } else { - const size_t END = 1ULL << (BaseState::num_qubits_ - N); + const size_t END = 1ULL << (BaseState::qreg_.num_qubits() - N); // Initialize matrix values with first block { const auto inds = QV::indexes(qubits, qubits_sorted, 0); @@ -1197,20 +737,23 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { //========================================================================= template -void State::apply_gate(const int_t iChunk, - const Operations::Op &op) { - if 
(!BaseState::global_chunk_indexing_) { +void State::apply_gate(const Operations::Op &op) { + // CPU qubit vector does not handle chunk ID inside kernel, so modify op here + if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() && + !BaseState::qreg_.support_global_indexing()) { reg_t qubits_in, qubits_out; - BaseState::get_inout_ctrl_qubits(op, qubits_out, qubits_in); + if (op.name[0] == 'c' || op.name.find("mc") == 0) { + Chunk::get_inout_ctrl_qubits(op, BaseState::qreg_.num_qubits(), qubits_in, + qubits_out); + } if (qubits_out.size() > 0) { uint_t mask = 0; for (int i = 0; i < qubits_out.size(); i++) { - mask |= (1ull << (qubits_out[i] - BaseState::chunk_bits_)); + mask |= (1ull << (qubits_out[i] - BaseState::qreg_.num_qubits())); } - if (((BaseState::global_chunk_index_ + iChunk) & mask) == mask) { - Operations::Op new_op = - BaseState::remake_gate_in_chunk_qubits(op, qubits_in); - apply_gate(iChunk, new_op); + if ((BaseState::qreg_.chunk_index() & mask) == mask) { + Operations::Op new_op = Chunk::correct_gate_op_in_chunk(op, qubits_in); + apply_gate(new_op); } return; } @@ -1224,103 +767,102 @@ void State::apply_gate(const int_t iChunk, switch (it->second) { case Gates::mcx: // Includes X, CX, CCX, etc - BaseState::qregs_[iChunk].apply_mcx(op.qubits); + BaseState::qreg_.apply_mcx(op.qubits); break; case Gates::mcy: // Includes Y, CY, CCY, etc - BaseState::qregs_[iChunk].apply_mcy(op.qubits); + BaseState::qreg_.apply_mcy(op.qubits); break; case Gates::mcz: // Includes Z, CZ, CCZ, etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1); + BaseState::qreg_.apply_mcphase(op.qubits, -1); break; case Gates::mcr: - BaseState::qregs_[iChunk].apply_mcu( - op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); + BaseState::qreg_.apply_mcu(op.qubits, + Linalg::VMatrix::r(op.params[0], op.params[1])); break; case Gates::mcrx: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::x, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::x, + std::real(op.params[0])); break; case Gates::mcry: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::y, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::y, + std::real(op.params[0])); break; case Gates::mcrz: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::z, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::z, + std::real(op.params[0])); break; case Gates::rxx: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::xx, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::xx, + std::real(op.params[0])); break; case Gates::ryy: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::yy, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::yy, + std::real(op.params[0])); break; case Gates::rzz: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zz, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::zz, + std::real(op.params[0])); break; case Gates::rzx: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zx, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::zx, + std::real(op.params[0])); break; case Gates::ecr: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ECR); + BaseState::qreg_.apply_matrix(op.qubits, Linalg::VMatrix::ECR); case Gates::id: break; case Gates::h: - 
apply_gate_mcu(iChunk, op.qubits, M_PI / 2., 0., M_PI, 0.); + apply_gate_mcu(op.qubits, M_PI / 2., 0., M_PI, 0.); break; case Gates::s: - apply_gate_phase(iChunk, op.qubits[0], complex_t(0., 1.)); + apply_gate_phase(op.qubits[0], complex_t(0., 1.)); break; case Gates::sdg: - apply_gate_phase(iChunk, op.qubits[0], complex_t(0., -1.)); + apply_gate_phase(op.qubits[0], complex_t(0., -1.)); break; case Gates::t: { const double isqrt2{1. / std::sqrt(2)}; - apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, isqrt2)); + apply_gate_phase(op.qubits[0], complex_t(isqrt2, isqrt2)); } break; case Gates::tdg: { const double isqrt2{1. / std::sqrt(2)}; - apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, -isqrt2)); + apply_gate_phase(op.qubits[0], complex_t(isqrt2, -isqrt2)); } break; case Gates::mcswap: // Includes SWAP, CSWAP, etc - BaseState::qregs_[iChunk].apply_mcswap(op.qubits); + BaseState::qreg_.apply_mcswap(op.qubits); break; case Gates::mcu3: // Includes u3, cu3, etc - apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]), - std::real(op.params[1]), std::real(op.params[2]), 0.); + apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]), + std::real(op.params[2]), 0.); break; case Gates::mcu: // Includes u3, cu3, etc - apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]), - std::real(op.params[1]), std::real(op.params[2]), - std::real(op.params[3])); + apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]), + std::real(op.params[2]), std::real(op.params[3])); break; case Gates::mcu2: // Includes u2, cu2, etc - apply_gate_mcu(iChunk, op.qubits, M_PI / 2., std::real(op.params[0]), + apply_gate_mcu(op.qubits, M_PI / 2., std::real(op.params[0]), std::real(op.params[1]), 0.); break; case Gates::mcp: // Includes u1, cu1, p, cp, mcp etc - BaseState::qregs_[iChunk].apply_mcphase( - op.qubits, std::exp(complex_t(0, 1) * op.params[0])); + BaseState::qreg_.apply_mcphase(op.qubits, + std::exp(complex_t(0, 1) * op.params[0])); break; case Gates::mcsx: // Includes sx, csx, mcsx etc - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SX); + BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SX); break; case Gates::mcsxdg: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SXDG); + BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SXDG); break; case Gates::pauli: - BaseState::qregs_[iChunk].apply_pauli(op.qubits, op.string_params[0]); + BaseState::qreg_.apply_pauli(op.qubits, op.string_params[0]); break; default: // We shouldn't reach here unless there is a bug in gateset @@ -1330,74 +872,67 @@ void State::apply_gate(const int_t iChunk, } template -void State::apply_multiplexer(const int_t iChunk, - const reg_t &control_qubits, +void State::apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const cmatrix_t &mat) { if (control_qubits.empty() == false && target_qubits.empty() == false && mat.size() > 0) { cvector_t vmat = Utils::vectorize_matrix(mat); - BaseState::qregs_[iChunk].apply_multiplexer(control_qubits, target_qubits, - vmat); + BaseState::qreg_.apply_multiplexer(control_qubits, target_qubits, vmat); } } template -void State::apply_matrix(const int_t iChunk, - const Operations::Op &op) { +void State::apply_matrix(const Operations::Op &op) { if (op.qubits.empty() == false && op.mats[0].size() > 0) { if (Utils::is_diagonal(op.mats[0], .0)) { - apply_diagonal_matrix(iChunk, op.qubits, - Utils::matrix_diagonal(op.mats[0])); + apply_diagonal_matrix(op.qubits, 
Utils::matrix_diagonal(op.mats[0])); } else { - BaseState::qregs_[iChunk].apply_matrix( - op.qubits, Utils::vectorize_matrix(op.mats[0])); + BaseState::qreg_.apply_matrix(op.qubits, + Utils::vectorize_matrix(op.mats[0])); } } } template -void State::apply_matrix(const int_t iChunk, const reg_t &qubits, +void State::apply_matrix(const reg_t &qubits, const cvector_t &vmat) { // Check if diagonal matrix if (vmat.size() == 1ULL << qubits.size()) { - apply_diagonal_matrix(iChunk, qubits, vmat); + apply_diagonal_matrix(qubits, vmat); } else { - BaseState::qregs_[iChunk].apply_matrix(qubits, vmat); + BaseState::qreg_.apply_matrix(qubits, vmat); } } template -void State::apply_diagonal_matrix(const int_t iChunk, - const reg_t &qubits, +void State::apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag) { - if (BaseState::global_chunk_indexing_ || - !BaseState::multi_chunk_distribution_) { - // GPU computes all chunks in one kernel, so pass qubits and diagonal matrix - // as is - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, diag); - } else { + if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() && + !BaseState::qreg_.support_global_indexing()) { reg_t qubits_in = qubits; cvector_t diag_in = diag; - - BaseState::block_diagonal_matrix(iChunk, qubits_in, diag_in); - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_in, diag_in); + Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(), + BaseState::qreg_.num_qubits(), qubits_in, + diag_in); + BaseState::qreg_.apply_diagonal_matrix(qubits_in, diag_in); + } else { + BaseState::qreg_.apply_diagonal_matrix(qubits, diag); } } template -void State::apply_gate_mcu(const int_t iChunk, const reg_t &qubits, - double theta, double phi, double lambda, +void State::apply_gate_mcu(const reg_t &qubits, double theta, + double phi, double lambda, double gamma) { - BaseState::qregs_[iChunk].apply_mcu( - qubits, Linalg::VMatrix::u4(theta, phi, lambda, gamma)); + BaseState::qreg_.apply_mcu(qubits, + Linalg::VMatrix::u4(theta, phi, lambda, gamma)); } template -void State::apply_gate_phase(const int_t iChunk, uint_t qubit, - complex_t phase) { +void State::apply_gate_phase(uint_t qubit, complex_t phase) { cvector_t diag = {{1., phase}}; - apply_diagonal_matrix(iChunk, reg_t({qubit}), diag); + apply_diagonal_matrix(reg_t({qubit}), diag); } //========================================================================= @@ -1405,163 +940,41 @@ void State::apply_gate_phase(const int_t iChunk, uint_t qubit, //========================================================================= template -void State::apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, +void State::apply_measure(const reg_t &qubits, const reg_t &cmemory, const reg_t &cregister, RngEngine &rng) { - int_t ishot = BaseState::get_global_shot_index(iChunk); // Actual measurement outcome - const auto meas = sample_measure_with_prob(iChunk, qubits, rng); + const auto meas = sample_measure_with_prob(qubits, rng); // Implement measurement update - measure_reset_update(iChunk, qubits, meas.first, meas.first, meas.second); + measure_reset_update(qubits, meas.first, meas.first, meas.second); const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); - BaseState::cregs_[ishot].store_measure(outcome, cmemory, cregister); + BaseState::creg().store_measure(outcome, cmemory, cregister); } template -rvector_t State::measure_probs(const int_t iChunk, - const reg_t &qubits) const { - if (!BaseState::multi_chunk_distribution_) - return 
BaseState::qregs_[iChunk].probabilities(qubits); - - uint_t dim = 1ull << qubits.size(); - rvector_t sum(dim, 0.0); - int_t i, j, k; - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - BaseState::qubits_inout(qubits, qubits_in_chunk, qubits_out_chunk); - - if (qubits_in_chunk.size() > 0) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(i, j, k) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { -#pragma omp atomic - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < BaseState::chunk_bits_) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << BaseState::chunk_bits_) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } -#pragma omp atomic - sum[idx] += chunkSum[j]; - } - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < BaseState::chunk_bits_) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << BaseState::chunk_bits_) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } - sum[idx] += chunkSum[j]; - } - } - } - } - } else { // there is no bit in chunk - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(i, j, k) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - auto nr = std::real(BaseState::qregs_[i].norm()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } -#pragma omp atomic - sum[idx] += nr; - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - auto nr = std::real(BaseState::qregs_[i].norm()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } - sum[idx] += nr; - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - - return sum; +rvector_t State::measure_probs(const reg_t &qubits) const { + return BaseState::qreg_.probabilities(qubits); } template -void State::apply_reset(const int_t iChunk, const reg_t &qubits, - RngEngine &rng) { +void State::apply_reset(const reg_t &qubits, RngEngine &rng) { // Simulate unobserved measurement - const auto meas = sample_measure_with_prob(iChunk, qubits, rng); + const auto meas = sample_measure_with_prob(qubits, rng); // Apply update to reset state - measure_reset_update(iChunk, qubits, 0, meas.first, meas.second); + measure_reset_update(qubits, 0, meas.first, meas.second); } template -std::pair State::sample_measure_with_prob( - const int_t iChunk, const reg_t 
&qubits, RngEngine &rng) { - rvector_t probs = measure_probs(iChunk, qubits); +std::pair +State::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); // Randomly pick outcome and return pair uint_t outcome = rng.rand_int(probs); return std::make_pair(outcome, probs[outcome]); } template -void State::measure_reset_update(const int_t iChunk, - const std::vector &qubits, +void State::measure_reset_update(const std::vector &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob) { @@ -1575,32 +988,11 @@ void State::measure_reset_update(const int_t iChunk, cvector_t mdiag(2, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } - } + BaseState::qreg_.apply_diagonal_matrix(qubits, mdiag); // If it doesn't agree with the reset state update - if (final_state != meas_state) { - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_mcx(qubits); - else - BaseState::apply_chunk_x(qubits[0]); - } + if (final_state != meas_state) + BaseState::qreg_.apply_mcx(qubits); } // Multi qubit case else { @@ -1609,53 +1001,21 @@ void State::measure_reset_update(const int_t iChunk, cvector_t mdiag(dim, 0.); mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } - } + BaseState::qreg_.apply_diagonal_matrix(qubits, mdiag); // If it doesn't agree with the reset state update // This function could be optimized as a permutation update if (final_state != meas_state) { - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - BaseState::qubits_inout(qubits, qubits_in_chunk, qubits_out_chunk); - - if (!BaseState::multi_chunk_distribution_ || - qubits_in_chunk.size() == - qubits.size()) { // all bits are inside chunk - // build vectorized permutation matrix - cvector_t perm(dim * dim, 0.); - perm[final_state * dim + meas_state] = 1.; - perm[meas_state * dim + final_state] = 1.; - for (size_t j = 0; j < dim; j++) { - if (j != final_state && j != meas_state) - perm[j * dim + j] = 1.; - } - // apply permutation to swap state - apply_matrix(iChunk, qubits, perm); - } else { - for (int_t i = 0; i < qubits.size(); i++) { - if (((final_state >> i) & 1) != ((meas_state >> i) & 1)) { - BaseState::apply_chunk_x(qubits[i]); - } - } + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != meas_state) + perm[j * dim + j] = 1.; } + // apply permutation to swap state + apply_matrix(qubits, perm); } } } @@ -1673,100 +1033,13 @@ std::vector State::sample_measure(const reg_t &qubits, for (i = 0; i < shots; ++i) rnds.push_back(rng.rand(0, 1)); - if (!BaseState::multi_chunk_distribution_) - allbit_samples = BaseState::qregs_[0].sample_measure(rnds); - else { - std::vector chunkSum(BaseState::qregs_.size() + 1, 0); - double sum, localSum; - - // calculate per chunk sum - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) { - bool batched = BaseState::qregs_[ic].enable_batch( - true); // return sum of all chunks in group - chunkSum[ic] = BaseState::qregs_[ic].norm(); - BaseState::qregs_[ic].enable_batch(batched); - } - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) { - bool batched = BaseState::qregs_[ic].enable_batch( - true); // return sum of all chunks in group - chunkSum[ic] = BaseState::qregs_[ic].norm(); - BaseState::qregs_[ic].enable_batch(batched); - } - } - } - - localSum = 0.0; - for (i = 0; i < BaseState::qregs_.size(); i++) { - sum = localSum; - localSum += chunkSum[i]; - chunkSum[i] = sum; - } - chunkSum[BaseState::qregs_.size()] = localSum; - - double globalSum = 0.0; - if (BaseState::nprocs_ > 1) { - std::vector procTotal(BaseState::nprocs_); - - for (i = 0; i < BaseState::nprocs_; i++) { - 
procTotal[i] = localSum; - } - - BaseState::gather_value(procTotal); - - for (i = 0; i < BaseState::myrank_; i++) { - globalSum += procTotal[i]; - } - } - - reg_t local_samples(shots, 0); - - // get rnds positions for each chunk - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t nIn; - std::vector vIdx; - std::vector vRnd; - - // find rnds in this chunk - nIn = 0; - for (j = 0; j < shots; j++) { - if (rnds[j] >= chunkSum[i] + globalSum && - rnds[j] < chunkSum[i + 1] + globalSum) { - vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); - vIdx.push_back(j); - nIn++; - } - } - - if (nIn > 0) { - auto chunkSamples = BaseState::qregs_[i].sample_measure(vRnd); - - for (j = 0; j < chunkSamples.size(); j++) { - local_samples[vIdx[j]] = - ((BaseState::global_chunk_index_ + i) << BaseState::chunk_bits_) + - chunkSamples[j]; - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(local_samples); -#endif - allbit_samples = local_samples; - } + allbit_samples = BaseState::qreg_.sample_measure(rnds); // Convert to reg_t format std::vector all_samples; all_samples.reserve(shots); for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); + reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits()); reg_t sample; sample.reserve(qubits.size()); for (uint_t qubit : qubits) { @@ -1779,161 +1052,29 @@ std::vector State::sample_measure(const reg_t &qubits, } template -void State::apply_initialize(const int_t iChunk, - const reg_t &qubits, +void State::apply_initialize(const reg_t &qubits, const cvector_t ¶ms, RngEngine &rng) { auto sorted_qubits = qubits; std::sort(sorted_qubits.begin(), sorted_qubits.end()); - if (qubits.size() == BaseState::num_qubits_) { + if (qubits.size() == BaseState::qreg_.num_qubits()) { // If qubits is all ordered qubits in the statevector // we can just initialize the whole state directly if (qubits == sorted_qubits) { - initialize_from_vector(iChunk, params); + initialize_from_vector(params); return; } } // Apply reset to qubits - apply_reset(iChunk, qubits, rng); + apply_reset(qubits, rng); // Apply initialize_component - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].initialize_component(qubits, params); - else { - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - BaseState::qubits_inout(qubits, qubits_in_chunk, qubits_out_chunk); - - if (qubits_out_chunk.size() == 0) { // no qubits outside of chunk - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - BaseState::qregs_[i].initialize_component(qubits, params); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].initialize_component(qubits, params); - } - } else { - // scatter base states - if (qubits_in_chunk.size() > 0) { - // scatter inside chunks - const size_t dim = 1ULL << qubits_in_chunk.size(); - cvector_t perm(dim * dim, 0.); - for (int_t i = 0; i < dim; i++) { - perm[i] = 1.0; - } - - if (BaseState::chunk_omp_parallel_) { -#pragma omp parallel for - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_matrix(i, qubits_in_chunk, perm); - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_matrix(i, qubits_in_chunk, perm); - } - } - if (qubits_out_chunk.size() > 0) { - // then scatter outside chunk - auto sorted_qubits_out = qubits_out_chunk; - 
std::sort(sorted_qubits_out.begin(), sorted_qubits_out.end()); - - for (int_t i = 0; - i < (1ull << (BaseState::num_qubits_ - BaseState::chunk_bits_ - - qubits_out_chunk.size())); - i++) { - uint_t baseChunk = 0; - uint_t j, ii, t; - ii = i; - for (j = 0; j < qubits_out_chunk.size(); j++) { - t = ii & ((1ull << qubits_out_chunk[j]) - 1); - baseChunk += t; - ii = (ii - t) << 1; - } - baseChunk += ii; - baseChunk >>= BaseState::chunk_bits_; - - for (j = 1; j < (1ull << qubits_out_chunk.size()); j++) { - int_t ic = baseChunk; - for (t = 0; t < qubits_out_chunk.size(); t++) { - if ((j >> t) & 1) - ic += (1ull << (qubits_out_chunk[t] - BaseState::chunk_bits_)); - } - - if (ic >= BaseState::chunk_index_begin_ - [BaseState::distributed_rank_] && - ic < BaseState::chunk_index_end_ - [BaseState::distributed_rank_]) { // on this process - if (baseChunk >= BaseState::chunk_index_begin_ - [BaseState::distributed_rank_] && - baseChunk < - BaseState::chunk_index_end_ - [BaseState::distributed_rank_]) { // base chunk is on - // this process - BaseState::qregs_[ic].initialize_from_data( - BaseState::qregs_[baseChunk].data(), - 1ull << BaseState::chunk_bits_); - } else { - BaseState::recv_chunk(ic, baseChunk); - // using swap chunk function to release send/recv buffers for - // Thrust - reg_t swap(2); - swap[0] = BaseState::chunk_bits_; - swap[1] = BaseState::chunk_bits_; - BaseState::qregs_[ic].apply_chunk_swap(swap, baseChunk); - } - } else if (baseChunk >= BaseState::chunk_index_begin_ - [BaseState::distributed_rank_] && - baseChunk < - BaseState::chunk_index_end_ - [BaseState::distributed_rank_]) { // base chunk - // is on this - // process - BaseState::send_chunk(baseChunk - BaseState::global_chunk_index_, - ic); - } - } - } - } - - // initialize by params - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - apply_diagonal_matrix(i, qubits, params); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_diagonal_matrix(i, qubits, params); - } - } - } + BaseState::qreg_.initialize_component(qubits, params); } template -void State::initialize_from_vector(const int_t iChunk, - const cvector_t ¶ms) { - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].initialize_from_vector(params); - else { // multi-chunk distribution - uint_t local_offset = BaseState::global_chunk_index_ - << BaseState::chunk_bits_; - -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - // copy part of state for this chunk - cvector_t tmp(1ull << BaseState::chunk_bits_); - std::copy(params.begin() + local_offset + (i << BaseState::chunk_bits_), - params.begin() + local_offset + - ((i + 1) << BaseState::chunk_bits_), - tmp.begin()); - BaseState::qregs_[i].initialize_from_vector(tmp); - } - } +void State::initialize_from_vector(const cvector_t ¶ms) { + BaseState::qreg_.initialize_from_vector(params); } //========================================================================= @@ -1941,8 +1082,7 @@ void State::initialize_from_vector(const int_t iChunk, //========================================================================= template -void State::apply_multiplexer(const int_t iChunk, - const reg_t &control_qubits, +void State::apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const std::vector 
&mmat) { // (1) Pack vector of matrices into single (stacked) matrix ... note: matrix @@ -1950,14 +1090,14 @@ void State::apply_multiplexer(const int_t iChunk, cmatrix_t multiplexer_matrix = Utils::stacked_matrix(mmat); // (2) Treat as single, large(r), chained/batched matrix operator - apply_multiplexer(iChunk, control_qubits, target_qubits, multiplexer_matrix); + apply_multiplexer(control_qubits, target_qubits, multiplexer_matrix); } //========================================================================= // Implementation: Kraus Noise //========================================================================= template -void State::apply_kraus(const int_t iChunk, const reg_t &qubits, +void State::apply_kraus(const reg_t &qubits, const std::vector &kmats, RngEngine &rng) { // Check edge case for empty Kraus set (this shouldn't happen) @@ -1981,52 +1121,14 @@ void State::apply_kraus(const int_t iChunk, const reg_t &qubits, // Calculate probability cvector_t vmat = Utils::vectorize_matrix(kmats[j]); - if (!BaseState::multi_chunk_distribution_) { - p = BaseState::qregs_[iChunk].norm(qubits, vmat); - accum += p; - } else { - p = 0.0; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for reduction(+ : p) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - p += BaseState::qregs_[i].norm(qubits, vmat); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - p += BaseState::qregs_[i].norm(qubits, vmat); - } - -#ifdef AER_MPI - BaseState::reduce_sum(p); -#endif - accum += p; - } - + p = BaseState::qreg_.norm(qubits, vmat); + accum += p; // check if we need to apply this operator if (accum > r) { // rescale vmat so projection is normalized Utils::scalar_multiply_inplace(vmat, 1 / std::sqrt(p)); // apply Kraus projection operator - if (!BaseState::multi_chunk_distribution_) - apply_matrix(iChunk, qubits, vmat); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } - } + apply_matrix(qubits, vmat); complete = true; break; } @@ -2037,24 +1139,7 @@ void State::apply_kraus(const int_t iChunk, const reg_t &qubits, // Compute probability from accumulated complex_t renorm = 1 / std::sqrt(1. 
- accum); auto vmat = Utils::vectorize_matrix(renorm * kmats.back()); - if (!BaseState::multi_chunk_distribution_) - apply_matrix(iChunk, qubits, vmat); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } - } + apply_matrix(qubits, vmat); } } diff --git a/src/simulators/superoperator/superoperator.hpp b/src/simulators/superoperator/superoperator.hpp index c9264fb638..54343ad932 100644 --- a/src/simulators/superoperator/superoperator.hpp +++ b/src/simulators/superoperator/superoperator.hpp @@ -61,6 +61,12 @@ class Superoperator : public DensityMatrix { // Initialize to the identity superoperator void initialize(); + // initialize from existing state (copy) + void initialize(const Superoperator &obj) { + BaseDensity::copy_qv(obj); + num_qubits_ = obj.num_qubits_; + } + // Initializes the vector to a custom initial state. // The matrix can either be superoperator matrix or unitary matrix. // The type is inferred by the dimensions of the input matrix. diff --git a/src/simulators/superoperator/superoperator_thrust.hpp b/src/simulators/superoperator/superoperator_thrust.hpp index 6ad6c8ce1a..538122c29d 100644 --- a/src/simulators/superoperator/superoperator_thrust.hpp +++ b/src/simulators/superoperator/superoperator_thrust.hpp @@ -61,6 +61,12 @@ class SuperoperatorThrust : public DensityMatrixThrust { // Initialize to the identity superoperator void initialize(); + // initialize from existing state (copy) + void initialize(const SuperoperatorThrust &obj) { + BaseDensity::copy_qv(obj); + num_qubits_ = obj.num_qubits_; + } + // Initializes the vector to a custom initial state. // The matrix can either be superoperator matrix or unitary matrix. // The type is inferred by the dimensions of the input matrix. 
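For context, the `initialize(const Superoperator &obj)` overloads added above give an executor a way to clone a parent state when forking work for a new branch. A minimal sketch of the intended call pattern, assuming the repository's internal headers; the function and variable names below are illustrative, not part of this diff:

```cpp
// Minimal sketch (not part of this diff) of the copy-initialize overload.
// Assumes qiskit-aer's internal headers; names are illustrative.
#include "simulators/superoperator/superoperator.hpp"

void fork_example() {
  AER::QV::Superoperator<double> parent;
  parent.set_num_qubits(2); // setter assumed from the underlying vector class
  parent.initialize();      // identity superoperator

  AER::QV::Superoperator<double> branch;
  branch.initialize(parent); // copies the raw data (copy_qv) and num_qubits_
}
```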
diff --git a/src/simulators/tensor_network/tensor_net_contractor.hpp b/src/simulators/tensor_network/tensor_net_contractor.hpp
index b8d6f0c3b9..00b28fe733 100644
--- a/src/simulators/tensor_network/tensor_net_contractor.hpp
+++ b/src/simulators/tensor_network/tensor_net_contractor.hpp
@@ -54,6 +54,8 @@ class TensorNetContractor {
   virtual void
   allocate_sampling_buffers(uint_t size = AER_TENSOR_NET_MAX_SAMPLING) = 0;
   virtual void deallocate_sampling_buffers(void) = 0;
+
+  virtual void set_target_gpus(reg_t &t) {}
 };
 
 template
diff --git a/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp b/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp
index 1d52e1674b..cc69b93e38 100644
--- a/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp
+++ b/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp
@@ -842,6 +842,8 @@ class TensorNetContractor_cuTensorNet : public TensorNetContractor {
   int nprocs_ = 1;
   int myrank_ = 0;
 
+  reg_t target_gpus_;
+
 public:
   TensorNetContractor_cuTensorNet();
   ~TensorNetContractor_cuTensorNet();
@@ -872,6 +874,8 @@ class TensorNetContractor_cuTensorNet : public TensorNetContractor {
   allocate_sampling_buffers(uint_t size = AER_TENSOR_NET_MAX_SAMPLING) override;
   void deallocate_sampling_buffers(void) override;
 
+  void set_target_gpus(reg_t &t) override { target_gpus_ = t; }
+
 protected:
   void remove_additional_tensors(void);
@@ -903,10 +907,18 @@ void TensorNetContractor_cuTensorNet::set_network(
   // allocate tensor data storage for each device
   if (cudaGetDeviceCount(&num_devices_) != cudaSuccess)
     cudaGetLastError();
+  if (target_gpus_.size() > 0) {
+    num_devices_ = target_gpus_.size();
+  } else {
+    target_gpus_.resize(num_devices_);
+    for (int_t i = 0; i < num_devices_; i++)
+      target_gpus_[i] = i;
+  }
+
   tensor_data_.clear();
   tensor_data_.resize(num_devices_);
   for (int_t i = 0; i < num_devices_; i++) {
-    tensor_data_[i].set_device(i);
+    tensor_data_[i].set_device(target_gpus_[i]);
   }
 
   // count number of tensors
@@ -1022,7 +1034,7 @@ void TensorNetContractor_cuTensorNet::setup_contraction(
   // allocate work buffer on GPU
   if (!tensor_data_[0].work_allocated()) {
-    cudaSetDevice(0);
+    cudaSetDevice(target_gpus_[0]);
     HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
     work_size = (freeMem / nid) * 0.9;
     tensor_data_[0].allocate_work(work_size);
@@ -1049,7 +1061,7 @@
     if (ns > 0) {
       // setup for the device
       if (!tensor_data_[i].work_allocated()) {
-        cudaSetDevice(i);
+        cudaSetDevice(target_gpus_[i]);
         HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
         work_size = (freeMem / nid) * 0.9;
         tensor_data_[i].allocate_work(work_size);
diff --git a/src/simulators/tensor_network/tensor_net_executor.hpp b/src/simulators/tensor_network/tensor_net_executor.hpp
new file mode 100644
index 0000000000..74be04051e
--- /dev/null
+++ b/src/simulators/tensor_network/tensor_net_executor.hpp
@@ -0,0 +1,469 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _tensor_network_executor_hpp_
+#define _tensor_network_executor_hpp_
+
+#include "simulators/multi_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace TensorNetwork {
+
+//-------------------------------------------------------------------------
+// Batched-shots executor for tensor network
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public CircuitExecutor::MultiStateExecutor<state_t> {
+  using Base = CircuitExecutor::MultiStateExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+protected:
+  void set_config(const Config &config) override;
+
+  bool shot_branching_supported(void) override { return true; }
+
+  bool apply_branching_op(CircuitExecutor::Branch &root,
+                          const Operations::Op &op, ExperimentResult &result,
+                          bool final_op) override;
+
+  rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root,
+                                     const reg_t &qubits);
+  void measure_reset_update(CircuitExecutor::Branch &root,
+                            const std::vector<uint_t> &qubits,
+                            const int_t final_state,
+                            const rvector_t &meas_probs);
+  void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
+                     const reg_t &cmemory, const reg_t &cregister);
+  void apply_reset(CircuitExecutor::Branch &root, const reg_t &qubits);
+  void apply_initialize(CircuitExecutor::Branch &root, const reg_t &qubits,
+                        const cvector_t &params);
+  void apply_kraus(CircuitExecutor::Branch &root, const reg_t &qubits,
+                   const std::vector<cmatrix_t> &kmats);
+
+  std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
+                                    uint_t shots,
+                                    std::vector<RngEngine> &rng) const override;
+
+  void apply_save_statevector(CircuitExecutor::Branch &root,
+                              const Operations::Op &op,
+                              ExperimentResult &result, bool last_op);
+  void apply_save_statevector_dict(CircuitExecutor::Branch &root,
+                                   const Operations::Op &op,
+                                   ExperimentResult &result);
+  void apply_save_amplitudes(CircuitExecutor::Branch &root,
+                             const Operations::Op &op,
+                             ExperimentResult &result);
+};
+
+template <class state_t>
+void Executor<state_t>::set_config(const Config &config) {
+  Base::set_config(config);
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
+                                           const Operations::Op &op,
+                                           ExperimentResult &result,
+                                           bool final_op) {
+  RngEngine dummy;
+  if (Base::states_[root.state_index()].creg().check_conditional(op)) {
+    switch (op.type) {
+    case OpType::reset:
+      apply_reset(root, op.qubits);
+      break;
+    case OpType::initialize:
+      apply_initialize(root, op.qubits, op.params);
+      break;
+    case OpType::measure:
+      apply_measure(root, op.qubits, op.memory, op.registers);
+      break;
+    case OpType::kraus:
+      if (!Base::has_statevector_ops_)
+        return false;
+      apply_kraus(root, op.qubits, op.mats);
+      break;
+    case OpType::save_expval:
+    case OpType::save_expval_var:
+    case OpType::save_densmat:
+    case OpType::save_probs:
+    case OpType::save_probs_ket:
+      // call save functions in state class
+      Base::states_[root.state_index()].apply_op(op, result, dummy, final_op);
+      break;
+    case OpType::save_state:
+    case OpType::save_statevec:
+      apply_save_statevector(root, op, result, final_op);
+      break;
+    case OpType::save_statevec_dict:
+      apply_save_statevector_dict(root, op, result);
+      break;
+    case OpType::save_amps:
+    case OpType::save_amps_sq:
+      apply_save_amplitudes(root, op, result);
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class state_t>
+rvector_t
+Executor<state_t>::sample_measure_with_prob(CircuitExecutor::Branch &root,
+                                            const reg_t &qubits) {
+  rvector_t probs =
Base::states_[root.state_index()].qreg().probabilities(qubits); + uint_t nshots = root.num_shots(); + reg_t shot_branch(nshots); + + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = root.rng_shots()[i].rand_int(probs); + } + + // branch shots + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, probs.size()); + + return probs; +} + +template +void Executor::measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + for (int_t i = 0; i < 2; i++) { + cvector_t mdiag(2, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + Operations::Op op; + op.type = OpType::gate; + op.name = "mcx"; + op.qubits = qubits; + root.branches()[i]->add_op_after_branch(op); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + for (int_t i = 0; i < dim; i++) { + cvector_t mdiag(dim, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + i] = 1.; + perm[i * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != i) + perm[j * dim + j] = 1.; + } + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(Utils::devectorize_matrix(perm)); + root.branches()[i]->add_op_after_branch(op); + } + } + } +} + +template +void Executor::apply_measure(CircuitExecutor::Branch &root, + const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + // save result to cregs + for (int_t i = 0; i < probs.size(); i++) { + const reg_t outcome = Utils::int2reg(i, 2, qubits.size()); + root.branches()[i]->creg().store_measure(outcome, cmemory, cregister); + } + + measure_reset_update(root, qubits, -1, probs); +} + +template +void Executor::apply_reset(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + measure_reset_update(root, qubits, 0, probs); +} + +template +void Executor::apply_initialize(CircuitExecutor::Branch &root, + const reg_t &qubits, + const cvector_t ¶ms) { + if (qubits.size() == Base::num_qubits_) { + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); + // If qubits is all ordered qubits in the statevector + // we can just initialize the whole state directly + if (qubits == sorted_qubits) { + Base::states_[root.state_index()].initialize_from_vector(params); + return; + } + } + + if (root.additional_ops().size() == 0) { + apply_reset(root, qubits); + + Operations::Op op; + op.type = OpType::initialize; + op.name = "initialize"; + op.qubits = qubits; + op.params = params; + for (int_t i = 0; i 
< root.num_branches(); i++) { + root.branches()[i]->add_op_after_branch(op); + } + return; // initialization will be done in next call because of shot + // branching in reset + } + + Base::states_[root.state_index()].qreg().initialize_component(qubits, params); +} + +template +void Executor::apply_kraus(CircuitExecutor::Branch &root, + const reg_t &qubits, + const std::vector &kmats) { + // Check edge case for empty Kraus set (this shouldn't happen) + if (kmats.empty()) + return; // end function early + + // Choose a real in [0, 1) to choose the applied kraus operator once + // the accumulated probability is greater than r. + // We know that the Kraus noise must be normalized + // So we only compute probabilities for the first N-1 kraus operators + // and infer the probability of the last one from 1 - sum of the previous + + double r; + double accum = 0.; + double p; + bool complete = false; + + reg_t shot_branch; + uint_t nshots; + rvector_t rshots, pmats; + uint_t nshots_multiplied = 0; + + nshots = root.num_shots(); + shot_branch.resize(nshots); + rshots.resize(nshots); + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = kmats.size() - 1; + rshots[i] = root.rng_shots()[i].rand(0., 1.); + } + pmats.resize(kmats.size()); + + // Loop through N-1 kraus operators + for (size_t j = 0; j < kmats.size() - 1; j++) { + // Calculate probability + cvector_t vmat = Utils::vectorize_matrix(kmats[j]); + + p = Base::states_[root.state_index()].qreg().norm(qubits, vmat); + accum += p; + + // check if we need to apply this operator + pmats[j] = p; + for (int_t i = 0; i < nshots; i++) { + if (shot_branch[i] >= kmats.size() - 1) { + if (accum > rshots[i]) { + shot_branch[i] = j; + nshots_multiplied++; + } + } + } + if (nshots_multiplied >= nshots) { + complete = true; + break; + } + } + + // check if we haven't applied a kraus operator yet + pmats[pmats.size() - 1] = 1. - accum; + + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, kmats.size()); + for (int_t i = 0; i < kmats.size(); i++) { + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(kmats[i]); + p = 1 / std::sqrt(pmats[i]); + for (int_t j = 0; j < op.mats[0].size(); j++) + op.mats[0][j] *= p; + root.branches()[i]->add_op_after_branch(op); + } +} + +template +void Executor::apply_save_statevector(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + std::string key = + (op.string_params[0] == "_method_") ? "statevector" : op.string_params[0]; + + if (last_op) { + const auto v = Base::states_[root.state_index()].move_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } else { + const auto v = Base::states_[root.state_index()].copy_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } +} + +template +void Executor::apply_save_statevector_dict( + CircuitExecutor::Branch &root, const Operations::Op &op, + ExperimentResult &result) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." 
+ " Only the full statevector can be saved."); + } + auto state_ket = Base::states_[root.state_index()].qreg().vector_ket( + Base::json_chop_threshold_); + std::map result_state_ket; + for (auto const &it : state_ket) { + result_state_ket[it.first] = it.second; + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const std::map &)result_state_ket, op.type, + op.save_type); + } +} + +template +void Executor::apply_save_amplitudes(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes instructions (empty params)."); + } + const int_t size = op.int_params.size(); + if (op.type == Operations::OpType::save_amps) { + Vector amps(size, false); + for (int_t i = 0; i < size; ++i) { + amps[i] = + Base::states_[root.state_index()].qreg().get_state(op.int_params[i]); + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const Vector &)amps, op.type, op.save_type); + } + } else { + rvector_t amps_sq(size, 0); + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = Base::states_[root.state_index()].qreg().probability( + op.int_params[i]); + } + result.save_data_average(Base::states_[root.state_index()].creg(), + op.string_params[0], amps_sq, op.type, + op.save_type); + } +} + +template +std::vector +Executor::sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const { + int_t i, j; + std::vector rnds; + rnds.reserve(shots); + + for (i = 0; i < shots; ++i) + rnds.push_back(rng[i].rand(0, 1)); + + std::vector samples = state.qreg().sample_measure(rnds); + std::vector ret(shots); + + if (omp_get_num_threads() > 1) { + for (i = 0; i < shots; ++i) { + ret[i].resize(qubits.size()); + for (j = 0; j < qubits.size(); j++) + ret[i][j] = samples[i][qubits[j]]; + } + } else { +#pragma omp parallel for private(j) + for (i = 0; i < shots; ++i) { + ret[i].resize(qubits.size()); + for (j = 0; j < qubits.size(); j++) + ret[i][j] = samples[i][qubits[j]]; + } + } + return ret; +} + +//------------------------------------------------------------------------- +} // namespace TensorNetwork +//------------------------------------------------------------------------- +} // end namespace AER +//------------------------------------------------------------------------- +#endif diff --git a/src/simulators/tensor_network/tensor_net_state.hpp b/src/simulators/tensor_network/tensor_net_state.hpp index 44a6221c36..a1004a2312 100644 --- a/src/simulators/tensor_network/tensor_net_state.hpp +++ b/src/simulators/tensor_network/tensor_net_state.hpp @@ -23,7 +23,7 @@ #include "framework/json.hpp" #include "framework/opset.hpp" #include "framework/utils.hpp" -#include "simulators/state_chunk.hpp" +#include "simulators/state.hpp" #include "tensor_net.hpp" #include "simulators/tensor_network/tensor_net.hpp" @@ -152,6 +152,8 @@ class State : public QuantumState::State { // Initializes to a specific n-qubit state void initialize_qreg(const tensor_net_t &tensor); + void initialize_from_vector(const cvector_t ¶ms); + //----------------------------------------------------------------------- // Additional methods //----------------------------------------------------------------------- @@ -190,8 +192,6 @@ class State : public QuantumState::State { void apply_initialize(const reg_t &qubits, const cvector_t ¶ms, 
   RngEngine &rng);
 
-  void initialize_from_vector(const cvector_t &params);
-
   void initialize_from_matrix(const cmatrix_t &params);
 
   // Apply a matrix to given qubits (identity on all other qubits)
diff --git a/src/simulators/unitary/unitary_executor.hpp b/src/simulators/unitary/unitary_executor.hpp
new file mode 100644
index 0000000000..240d806870
--- /dev/null
+++ b/src/simulators/unitary/unitary_executor.hpp
@@ -0,0 +1,213 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _unitary_executor_hpp
+#define _unitary_executor_hpp
+
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace QubitUnitary {
+
+//-------------------------------------------------------------------------
+// Parallel executor for QubitUnitary
+//-------------------------------------------------------------------------
+
+template <class state_t>
+class Executor : public CircuitExecutor::ParallelStateExecutor<state_t> {
+  using Base = CircuitExecutor::ParallelStateExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+  auto move_to_matrix(void);
+  auto copy_to_matrix(void);
+
+protected:
+  void set_config(const Config &config) override;
+
+  // apply parallel operations
+  bool apply_parallel_op(const Operations::Op &op, ExperimentResult &result,
+                         RngEngine &rng, bool final_op) override;
+
+  void initialize_qreg(uint_t num_qubits) override;
+
+  //-----------------------------------------------------------------------
+  // Apply Instructions
+  //-----------------------------------------------------------------------
+  // swap between chunks
+  void apply_chunk_swap(const reg_t &qubits) override;
+
+  //-----------------------------------------------------------------------
+  // Save data instructions
+  //-----------------------------------------------------------------------
+
+  // Save the unitary matrix for the simulator
+  void apply_save_unitary(const Operations::Op &op, ExperimentResult &result,
+                          bool last_op);
+
+  // Helper function for computing expectation value
+  double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
+
+  // scale for unitary = 2
+  // this function is used in the base class to scale chunk qubits for
+  // multi-chunk distribution
+  uint_t qubit_scale(void) override { return 2; }
+};
+
+template <class state_t>
+void Executor<state_t>::set_config(const Config &config) {
+  Base::set_config(config);
+}
+
+template <class state_t>
+void Executor<state_t>::initialize_qreg(uint_t num_qubits) {
+  int_t iChunk;
+  for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) {
+    Base::states_[iChunk].qreg().set_num_qubits(Base::chunk_bits_);
+  }
+
+  if (Base::chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for private(iChunk)
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) {
+        uint_t irow, icol;
+        irow = (Base::global_state_index_ + iChunk) >>
+               ((Base::num_qubits_ - Base::chunk_bits_));
+        icol = (Base::global_state_index_ + iChunk) -
+               (irow <<
+        if (irow == icol)
+          Base::states_[iChunk].qreg().initialize();
+        else
+          Base::states_[iChunk].qreg().zero();
+      }
+    }
+  } else {
+    for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) {
+      uint_t irow, icol;
+      irow = (Base::global_state_index_ + iChunk) >>
+             ((Base::num_qubits_ - Base::chunk_bits_));
+      icol = (Base::global_state_index_ + iChunk) -
+             (irow << ((Base::num_qubits_ - Base::chunk_bits_)));
+      if (irow == icol)
+        Base::states_[iChunk].qreg().initialize();
+      else
+        Base::states_[iChunk].qreg().zero();
+    }
+  }
+
+  Base::apply_global_phase();
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_parallel_op(const Operations::Op &op,
+                                          ExperimentResult &result,
+                                          RngEngine &rng, bool final_op) {
+  // temporary : this is for statevector
+  if (Base::states_[0].creg().check_conditional(op)) {
+    switch (op.type) {
+    case Operations::OpType::bfunc:
+      Base::states_[0].creg().apply_bfunc(op);
+      break;
+    case Operations::OpType::roerror:
+      Base::states_[0].creg().apply_roerror(op, rng);
+      break;
+    case Operations::OpType::set_unitary:
+      Base::initialize_from_matrix(op.mats[0]);
+      break;
+    case Operations::OpType::save_state:
+    case Operations::OpType::save_unitary:
+      apply_save_unitary(op, result, final_op);
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class state_t>
+auto Executor<state_t>::move_to_matrix(void) {
+  return Base::apply_to_matrix(false);
+}
+
+template <class state_t>
+auto Executor<state_t>::copy_to_matrix(void) {
+  return Base::apply_to_matrix(true);
+}
+
+template <class state_t>
+void Executor<state_t>::apply_save_unitary(const Operations::Op &op,
+                                           ExperimentResult &result,
+                                           bool last_op) {
+  if (op.qubits.size() != Base::num_qubits_) {
+    throw std::invalid_argument(op.name +
+                                " was not applied to all qubits."
+                                " Only the full unitary can be saved.");
+  }
+  std::string key =
+      (op.string_params[0] == "_method_") ? "unitary" : op.string_params[0];
"unitary" : op.string_params[0]; + + if (last_op) { + result.save_data_pershot(Base::states_[0].creg(), key, move_to_matrix(), + Operations::OpType::save_unitary, op.save_type); + } else { + result.save_data_pershot(Base::states_[0].creg(), key, copy_to_matrix(), + Operations::OpType::save_unitary, op.save_type); + } +} + +template +double Executor::expval_pauli(const reg_t &qubits, + const std::string &pauli) { + throw std::runtime_error( + "Unitary simulator does not support Pauli expectation values."); +} + +// swap between chunks +template +void Executor::apply_chunk_swap(const reg_t &qubits) { + uint_t q0, q1; + q0 = qubits[0]; + q1 = qubits[1]; + + std::swap(Base::qubit_map_[q0], Base::qubit_map_[q1]); + + if (qubits[0] >= Base::chunk_bits_) { + q0 += Base::chunk_bits_; + } + if (qubits[1] >= Base::chunk_bits_) { + q1 += Base::chunk_bits_; + } + reg_t qs0 = {{q0, q1}}; + Base::apply_chunk_swap(qs0); +} + +//------------------------------------------------------------------------------ +} // namespace QubitUnitary +} // end namespace AER +//------------------------------------------------------------------------------ +#endif diff --git a/src/simulators/unitary/unitary_state.hpp b/src/simulators/unitary/unitary_state.hpp old mode 100644 new mode 100755 index 0b86625cef..e7352b84c2 --- a/src/simulators/unitary/unitary_state.hpp +++ b/src/simulators/unitary/unitary_state.hpp @@ -17,11 +17,10 @@ #include #define _USE_MATH_DEFINES -#include "framework/config.hpp" #include "framework/json.hpp" #include "framework/utils.hpp" +#include "simulators/chunk_utils.hpp" #include "simulators/state.hpp" -#include "simulators/state_chunk.hpp" #include "unitarymatrix.hpp" #include #ifdef AER_THRUST_SUPPORTED @@ -86,9 +85,9 @@ enum class Gates { //========================================================================= template > -class State : public virtual QuantumState::StateChunk { +class State : public virtual QuantumState::State { public: - using BaseState = QuantumState::StateChunk; + using BaseState = QuantumState::State; State() : BaseState(StateOpSet) {} virtual ~State() = default; @@ -102,9 +101,12 @@ class State : public virtual QuantumState::StateChunk { // Apply an operation // If the op is not in allowed_ops an exeption will be raised. 
-  virtual void apply_op(const int_t iChunk, const Operations::Op &op,
-                        ExperimentResult &result, RngEngine &rng,
-                        bool final_op = false) override;
+  virtual void apply_op(const Operations::Op &op, ExperimentResult &result,
+                        RngEngine &rng, bool final_op = false) override;
+
+  // memory allocation (previously called before initialize_qreg)
+  bool allocate(uint_t num_qubits, uint_t block_bits,
+                uint_t num_parallel_shots = 1) override;
 
   // Initializes an n-qubit unitary to the identity matrix
   virtual void initialize_qreg(uint_t num_qubits) override;
@@ -131,49 +133,35 @@ class State : public virtual QuantumState::StateChunk<unitary_matrix_t> {
   // Initialize OpenMP settings for the underlying QubitVector class
   void initialize_omp();
 
-  auto move_to_matrix(const int_t iChunk);
-  auto copy_to_matrix(const int_t iChunk);
+  auto move_to_matrix();
+  auto copy_to_matrix();
 
 protected:
   //-----------------------------------------------------------------------
   // Apply Instructions
   //-----------------------------------------------------------------------
 
-  // apply op to multiple shots , return flase if op is not supported to execute
-  // in a batch
-  bool apply_batched_op(const int_t iChunk, const Operations::Op &op,
-                        ExperimentResult &result, std::vector<RngEngine> &rng,
-                        bool final_op = false) override;
-
   // Applies a Gate operation to the state class.
   // This should support all and only the operations defined in
   // allowed_operations.
-  void apply_gate(const int_t iChunk, const Operations::Op &op);
+  void apply_gate(const Operations::Op &op);
 
   // Apply a matrix to given qubits (identity on all other qubits)
-  void apply_matrix(const int_t iChunk, const reg_t &qubits,
-                    const cmatrix_t &mat);
+  void apply_matrix(const reg_t &qubits, const cmatrix_t &mat);
 
   // Apply a matrix to given qubits (identity on all other qubits)
-  void apply_matrix(const int_t iChunk, const reg_t &qubits,
-                    const cvector_t &vmat);
+  void apply_matrix(const reg_t &qubits, const cvector_t &vmat);
 
   // Apply a diagonal matrix
-  void apply_diagonal_matrix(const int_t iChunk, const reg_t &qubits,
-                             const cvector_t &diag);
-
-  // swap between chunks
-  virtual void apply_chunk_swap(const reg_t &qubits) override;
+  void apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag);
 
   //-----------------------------------------------------------------------
   // 1-Qubit Gates
   //-----------------------------------------------------------------------
 
   // Optimize phase gate with diagonal [1, phase]
-  void apply_gate_phase(const int_t iChunk, const uint_t qubit,
-                        const complex_t phase);
+  void apply_gate_phase(const uint_t qubit, const complex_t phase);
 
-  void apply_gate_phase(const int_t iChunk, const reg_t &qubits,
-                        const complex_t phase);
+  void apply_gate_phase(const reg_t &qubits, const complex_t phase);
 
   //-----------------------------------------------------------------------
   // Multi-controlled u
   //-----------------------------------------------------------------------
@@ -182,19 +170,19 @@ class State : public virtual QuantumState::StateChunk<unitary_matrix_t> {
 
   // Apply N-qubit multi-controlled single qubit gate specified by
   // 4 parameters u4(theta, phi, lambda, gamma)
   // NOTE: if N=1 this is just a regular u4 gate.
-  void apply_gate_mcu(const int_t iChunk, const reg_t &qubits, double theta,
-                      double phi, double lambda, double gamma);
+  void apply_gate_mcu(const reg_t &qubits, double theta, double phi,
+                      double lambda, double gamma);
 
   //-----------------------------------------------------------------------
   // Save data instructions
   //-----------------------------------------------------------------------
 
   // Save the unitary matrix for the simulator
-  void apply_save_unitary(const int_t iChunk, const Operations::Op &op,
-                          ExperimentResult &result, bool last_op);
+  void apply_save_unitary(const Operations::Op &op, ExperimentResult &result,
+                          bool last_op);
 
   // Helper function for computing expectation value
-  virtual double expval_pauli(const int_t iChunk, const reg_t &qubits,
+  virtual double expval_pauli(const reg_t &qubits,
                               const std::string &pauli) override;
 
   //-----------------------------------------------------------------------
@@ -212,11 +200,6 @@ class State : public virtual QuantumState::StateChunk<unitary_matrix_t> {
 
   // Table of allowed gate names to gate enum class members
   const static stringmap_t gateset_;
-
-  // scale for unitary = 2
-  // this function is used in the base class to scale chunk qubits for
-  // multi-chunk distribution
-  int qubit_scale(void) override { return 2; }
 };
 
 //============================================================================
@@ -298,36 +281,35 @@ const stringmap_t State<unitary_matrix_t>::gateset_({
 //============================================================================
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_op(const int_t iChunk,
-                                       const Operations::Op &op,
+void State<unitary_matrix_t>::apply_op(const Operations::Op &op,
                                        ExperimentResult &result,
                                        RngEngine &rng, bool final_op) {
-  if (BaseState::check_conditional(iChunk, op)) {
+  if (BaseState::creg().check_conditional(op)) {
     switch (op.type) {
     case Operations::OpType::barrier:
    case Operations::OpType::qerror_loc:
       break;
     case Operations::OpType::bfunc:
-      BaseState::cregs_[0].apply_bfunc(op);
+      BaseState::creg().apply_bfunc(op);
       break;
     case Operations::OpType::roerror:
-      BaseState::cregs_[0].apply_roerror(op, rng);
+      BaseState::creg().apply_roerror(op, rng);
       break;
     case Operations::OpType::gate:
-      apply_gate(iChunk, op);
+      apply_gate(op);
       break;
     case Operations::OpType::set_unitary:
-      BaseState::initialize_from_matrix(iChunk, op.mats[0]);
+      BaseState::qreg_.initialize_from_matrix(op.mats[0]);
       break;
     case Operations::OpType::save_state:
     case Operations::OpType::save_unitary:
-      apply_save_unitary(iChunk, op, result, final_op);
+      apply_save_unitary(op, result, final_op);
       break;
     case Operations::OpType::matrix:
-      apply_matrix(iChunk, op.qubits, op.mats[0]);
+      apply_matrix(op.qubits, op.mats[0]);
       break;
     case Operations::OpType::diagonal_matrix:
-      apply_diagonal_matrix(iChunk, op.qubits, op.params);
+      apply_diagonal_matrix(op.qubits, op.params);
       break;
     default:
       throw std::invalid_argument(
@@ -336,42 +318,6 @@ void State<unitary_matrix_t>::apply_op(const int_t iChunk,
   }
 }
 
-template <class unitary_matrix_t>
-bool State<unitary_matrix_t>::apply_batched_op(const int_t iChunk,
-                                               const Operations::Op &op,
-                                               ExperimentResult &result,
-                                               std::vector<RngEngine> &rng,
-                                               bool final_ops) {
-  if (op.conditional)
-    BaseState::qregs_[iChunk].set_conditional(op.conditional_reg);
-
-  switch (op.type) {
-  case Operations::OpType::barrier:
-  case Operations::OpType::nop:
-  case Operations::OpType::qerror_loc:
-    break;
-  case Operations::OpType::bfunc:
-    BaseState::qregs_[iChunk].apply_bfunc(op);
-    break;
-  case Operations::OpType::roerror:
-    BaseState::qregs_[iChunk].apply_roerror(op, rng);
-    break;
-  case Operations::OpType::gate:
-    apply_gate(iChunk, op);
-    break;
-  case Operations::OpType::matrix:
-    apply_matrix(iChunk, op.qubits, op.mats[0]);
-    break;
-  case Operations::OpType::diagonal_matrix:
-    BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, op.params);
-    break;
-  default:
-    // other operations should be called to indivisual chunks by apply_op
-    return false;
-  }
-  return true;
-}
-
 template <class unitary_matrix_t>
 size_t State<unitary_matrix_t>::required_memory_mb(
     uint_t num_qubits, const std::vector<Operations::Op> &ops) const {
@@ -391,57 +337,16 @@ void State<unitary_matrix_t>::set_config(const Config &config) {
   // Set threshold for truncating snapshots
   json_chop_threshold_ = config.zero_threshold;
-  for (int_t i = 0; i < BaseState::qregs_.size(); i++)
-    BaseState::qregs_[i].set_json_chop_threshold(json_chop_threshold_);
+  BaseState::qreg_.set_json_chop_threshold(json_chop_threshold_);
 }
 
 template <class unitary_matrix_t>
 void State<unitary_matrix_t>::initialize_qreg(uint_t num_qubits) {
-  if (BaseState::qregs_.size() == 0)
-    BaseState::allocate(num_qubits, num_qubits, 1);
-
   initialize_omp();
-  int_t iChunk;
-  for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-    BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_);
-  }
+  BaseState::qreg_.set_num_qubits(num_qubits);
+  BaseState::qreg_.initialize();
 
-  if (BaseState::multi_chunk_distribution_) {
-    if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) {
-#pragma omp parallel for private(iChunk)
-      for (int_t ig = 0; ig < BaseState::num_groups_; ig++) {
-        for (iChunk = BaseState::top_chunk_of_group_[ig];
-             iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) {
-          uint_t irow, icol;
-          irow = (BaseState::global_chunk_index_ + iChunk) >>
-                 ((BaseState::num_qubits_ - BaseState::chunk_bits_));
-          icol = (BaseState::global_chunk_index_ + iChunk) -
-                 (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-          if (irow == icol)
-            BaseState::qregs_[iChunk].initialize();
-          else
-            BaseState::qregs_[iChunk].zero();
-        }
-      }
-    } else {
-      for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-        uint_t irow, icol;
-        irow = (BaseState::global_chunk_index_ + iChunk) >>
-               ((BaseState::num_qubits_ - BaseState::chunk_bits_));
-        icol = (BaseState::global_chunk_index_ + iChunk) -
-               (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-        if (irow == icol)
-          BaseState::qregs_[iChunk].initialize();
-        else
-          BaseState::qregs_[iChunk].zero();
-      }
-    }
-  } else {
-    for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-      BaseState::qregs_[iChunk].initialize();
-    }
-  }
+
   apply_global_phase();
 }
@@ -454,101 +359,43 @@ void State<unitary_matrix_t>::initialize_qreg(uint_t num_qubits,
         "Unitary::State::initialize: initial state does not match qubit "
         "number");
   }
-  if (BaseState::qregs_.size() == 0)
-    BaseState::allocate(num_qubits, num_qubits, 1);
 
   initialize_omp();
-  int_t iChunk;
-  for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++)
-    BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_);
+  BaseState::qreg_.set_num_qubits(num_qubits);
+  BaseState::qreg_.initialize_from_matrix(unitary);
 
-  if (BaseState::multi_chunk_distribution_) {
-    uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1;
-    for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-      // this function should be called in-order
-      BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_);
-    }
-
-    if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) {
-#pragma omp parallel for private(iChunk)
-      for (int_t ig = 0; ig < BaseState::num_groups_; ig++) {
-        for (iChunk = BaseState::top_chunk_of_group_[ig];
-             iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) {
-          uint_t irow_chunk =
-              ((iChunk + BaseState::global_chunk_index_) >>
-               ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-          uint_t icol_chunk =
-              ((iChunk + BaseState::global_chunk_index_) &
-               ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) -
-                1));
-
-          // copy part of state for this chunk
-          uint_t i, row, col;
-          cvector_t tmp(1ull << BaseState::chunk_bits_);
-          for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) {
-            uint_t icol = i >> (BaseState::chunk_bits_);
-            uint_t irow = i & mask;
-            uint_t idx = ((icol + (irow_chunk << BaseState::chunk_bits_))
-                          << (BaseState::num_qubits_)) +
-                         (icol_chunk << BaseState::chunk_bits_) + irow;
-            tmp[i] = unitary[idx];
-          }
-          BaseState::qregs_[iChunk].initialize_from_vector(tmp);
-        }
-      }
-    } else {
-      for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-        uint_t irow_chunk =
-            ((iChunk + BaseState::global_chunk_index_) >>
-             ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-        uint_t icol_chunk =
-            ((iChunk + BaseState::global_chunk_index_) &
-             ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) -
-              1));
-
-        // copy part of state for this chunk
-        uint_t i, row, col;
-        cvector_t tmp(1ull << BaseState::chunk_bits_);
-        for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) {
-          uint_t icol = i >> (BaseState::chunk_bits_);
-          uint_t irow = i & mask;
-          uint_t idx = ((icol + (irow_chunk << BaseState::chunk_bits_))
-                        << (BaseState::num_qubits_)) +
-                       (icol_chunk << BaseState::chunk_bits_) + irow;
-          tmp[i] = unitary[idx];
-        }
-        BaseState::qregs_[iChunk].initialize_from_vector(tmp);
-      }
-    }
-  } else {
-    BaseState::qregs_[iChunk].initialize_from_matrix(unitary);
-  }
+
   apply_global_phase();
 }
 
 template <class unitary_matrix_t>
 void State<unitary_matrix_t>::initialize_omp() {
   uint_t i;
-  for (i = 0; i < BaseState::qregs_.size(); i++) {
-    BaseState::qregs_[i].set_omp_threshold(omp_qubit_threshold_);
-    if (BaseState::threads_ > 0)
-      BaseState::qregs_[i].set_omp_threads(
-          BaseState::threads_); // set allowed OMP threads in qubitvector
-  }
+  BaseState::qreg_.set_omp_threshold(omp_qubit_threshold_);
+  if (BaseState::threads_ > 0)
+    BaseState::qreg_.set_omp_threads(
+        BaseState::threads_); // set allowed OMP threads in qubitvector
 }
 
 template <class unitary_matrix_t>
-auto State<unitary_matrix_t>::move_to_matrix(const int_t iChunk) {
-  if (!BaseState::multi_chunk_distribution_)
-    return BaseState::qregs_[iChunk].move_to_matrix();
-  return BaseState::apply_to_matrix(false);
+bool State<unitary_matrix_t>::allocate(uint_t num_qubits, uint_t block_bits,
+                                       uint_t num_parallel_shots) {
+  if (BaseState::max_matrix_qubits_ > 0)
+    BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_);
+
+  BaseState::qreg_.set_target_gpus(BaseState::target_gpus_);
+  BaseState::qreg_.chunk_setup(block_bits * 2, num_qubits * 2, 0, 1);
+
+  return true;
+}
+
+template <class unitary_matrix_t>
+auto State<unitary_matrix_t>::move_to_matrix() {
+  return BaseState::qreg_.move_to_matrix();
 }
 
 template <class unitary_matrix_t>
-auto State<unitary_matrix_t>::copy_to_matrix(const int_t iChunk) {
-  if (!BaseState::multi_chunk_distribution_)
-    return BaseState::qregs_[iChunk].copy_to_matrix();
-  return BaseState::apply_to_matrix(true);
+auto State<unitary_matrix_t>::copy_to_matrix() {
+  return BaseState::qreg_.copy_to_matrix();
 }
 
 //=========================================================================
@@ -556,20 +403,23 @@ auto State<unitary_matrix_t>::copy_to_matrix(const int_t iChunk) {
 //=========================================================================
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate(const int_t iChunk,
-                                         const Operations::Op &op) {
-  if (!BaseState::global_chunk_indexing_) {
+void State<unitary_matrix_t>::apply_gate(const Operations::Op &op) {
+  // CPU qubit vector does not handle chunk ID inside kernel, so modify op here
+  if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() &&
+      !BaseState::qreg_.support_global_indexing()) {
     reg_t qubits_in, qubits_out;
-    BaseState::get_inout_ctrl_qubits(op, qubits_out, qubits_in);
+    if (op.name[0] == 'c' || op.name.find("mc") == 0) {
+      Chunk::get_inout_ctrl_qubits(op, BaseState::qreg_.num_qubits(), qubits_in,
+                                   qubits_out);
+    }
     if (qubits_out.size() > 0) {
       uint_t mask = 0;
       for (int i = 0; i < qubits_out.size(); i++) {
-        mask |= (1ull << (qubits_out[i] - BaseState::chunk_bits_));
+        mask |= (1ull << (qubits_out[i] - BaseState::qreg_.num_qubits()));
       }
-      if (((BaseState::global_chunk_index_ + iChunk) & mask) == mask) {
-        Operations::Op new_op =
-            BaseState::remake_gate_in_chunk_qubits(op, qubits_in);
-        apply_gate(iChunk, new_op);
+      if ((BaseState::qreg_.chunk_index() & mask) == mask) {
+        Operations::Op new_op = Chunk::correct_gate_op_in_chunk(op, qubits_in);
+        apply_gate(new_op);
       }
       return;
     }
   }
@@ -584,104 +434,99 @@ void State<unitary_matrix_t>::apply_gate(const int_t iChunk,
   switch (g) {
   case Gates::mcx:
     // Includes X, CX, CCX, etc
-    BaseState::qregs_[iChunk].apply_mcx(op.qubits);
+    BaseState::qreg_.apply_mcx(op.qubits);
     break;
   case Gates::mcy:
     // Includes Y, CY, CCY, etc
-    BaseState::qregs_[iChunk].apply_mcy(op.qubits);
+    BaseState::qreg_.apply_mcy(op.qubits);
     break;
   case Gates::mcz:
     // Includes Z, CZ, CCZ, etc
-    BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1);
+    BaseState::qreg_.apply_mcphase(op.qubits, -1);
     break;
   case Gates::mcr:
-    BaseState::qregs_[iChunk].apply_mcu(
-        op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1]));
+    BaseState::qreg_.apply_mcu(op.qubits,
+                               Linalg::VMatrix::r(op.params[0], op.params[1]));
     break;
   case Gates::mcrx:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits,
-                                        Linalg::VMatrix::rx(op.params[0]));
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::rx(op.params[0]));
     break;
   case Gates::mcry:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits,
-                                        Linalg::VMatrix::ry(op.params[0]));
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::ry(op.params[0]));
     break;
   case Gates::mcrz:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits,
-                                        Linalg::VMatrix::rz(op.params[0]));
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::rz(op.params[0]));
     break;
   case Gates::rxx:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits,
-                                           Linalg::VMatrix::rxx(op.params[0]));
+    BaseState::qreg_.apply_matrix(op.qubits,
+                                  Linalg::VMatrix::rxx(op.params[0]));
     break;
   case Gates::ryy:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits,
-                                           Linalg::VMatrix::ryy(op.params[0]));
+    BaseState::qreg_.apply_matrix(op.qubits,
+                                  Linalg::VMatrix::ryy(op.params[0]));
     break;
   case Gates::rzz:
-    apply_diagonal_matrix(iChunk, op.qubits,
-                          Linalg::VMatrix::rzz_diag(op.params[0]));
+    apply_diagonal_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0]));
     break;
   case Gates::rzx:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits,
-                                           Linalg::VMatrix::rzx(op.params[0]));
+    BaseState::qreg_.apply_matrix(op.qubits,
+                                  Linalg::VMatrix::rzx(op.params[0]));
     break;
   case Gates::ecr:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ECR);
+    BaseState::qreg_.apply_matrix(op.qubits, Linalg::VMatrix::ECR);
     break;
   case Gates::id:
     break;
   case Gates::h:
-    apply_gate_mcu(iChunk, op.qubits, M_PI / 2., 0., M_PI, 0.);
+    apply_gate_mcu(op.qubits, M_PI / 2., 0., M_PI, 0.);
     break;
   case Gates::s:
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(0., 1.));
+    apply_gate_phase(op.qubits[0], complex_t(0., 1.));
     break;
   case Gates::sdg:
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(0., -1.));
+    apply_gate_phase(op.qubits[0], complex_t(0., -1.));
     break;
   case Gates::pauli:
-    BaseState::qregs_[iChunk].apply_pauli(op.qubits, op.string_params[0]);
+    BaseState::qreg_.apply_pauli(op.qubits, op.string_params[0]);
     break;
   case Gates::t: {
     const double isqrt2{1. / std::sqrt(2)};
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, isqrt2));
+    apply_gate_phase(op.qubits[0], complex_t(isqrt2, isqrt2));
   } break;
   case Gates::tdg: {
     const double isqrt2{1. / std::sqrt(2)};
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, -isqrt2));
+    apply_gate_phase(op.qubits[0], complex_t(isqrt2, -isqrt2));
   } break;
   case Gates::mcswap:
     // Includes SWAP, CSWAP, etc
-    BaseState::qregs_[iChunk].apply_mcswap(op.qubits);
+    BaseState::qreg_.apply_mcswap(op.qubits);
     break;
   case Gates::mcu3:
     // Includes u3, cu3, etc
-    apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]),
-                   std::real(op.params[1]), std::real(op.params[2]), 0.);
+    apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]),
+                   std::real(op.params[2]), 0.);
     break;
   case Gates::mcu:
     // Includes u, cu, etc
-    apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]),
-                   std::real(op.params[1]), std::real(op.params[2]),
-                   std::real(op.params[3]));
+    apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]),
+                   std::real(op.params[2]), std::real(op.params[3]));
     break;
   case Gates::mcu2:
     // Includes u2, cu2, etc
-    apply_gate_mcu(iChunk, op.qubits, M_PI / 2., std::real(op.params[0]),
+    apply_gate_mcu(op.qubits, M_PI / 2., std::real(op.params[0]),
                    std::real(op.params[1]), 0.);
     break;
   case Gates::mcp:
     // Includes u1, cu1, p, cp, mcp, etc
-    BaseState::qregs_[iChunk].apply_mcphase(
-        op.qubits, std::exp(complex_t(0, 1) * op.params[0]));
+    BaseState::qreg_.apply_mcphase(op.qubits,
+                                   std::exp(complex_t(0, 1) * op.params[0]));
     break;
   case Gates::mcsx:
     // Includes sx, csx, mcsx etc
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SX);
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SX);
     break;
   case Gates::mcsxdg:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SXDG);
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SXDG);
     break;
   default:
     // We shouldn't reach here unless there is a bug in gateset
@@ -691,102 +536,85 @@ void State<unitary_matrix_t>::apply_gate(const int_t iChunk,
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_matrix(const int_t iChunk,
-                                           const reg_t &qubits,
+void State<unitary_matrix_t>::apply_matrix(const reg_t &qubits,
                                            const cmatrix_t &mat) {
   if (qubits.empty() == false && mat.size() > 0) {
-    apply_matrix(iChunk, qubits, Utils::vectorize_matrix(mat));
+    apply_matrix(qubits, Utils::vectorize_matrix(mat));
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_matrix(const int_t iChunk,
-                                           const reg_t &qubits,
+void State<unitary_matrix_t>::apply_matrix(const reg_t &qubits,
                                            const cvector_t &vmat) {
   // Check if diagonal matrix
   if (vmat.size() == 1ULL << qubits.size()) {
-    apply_diagonal_matrix(iChunk, qubits, vmat);
+    apply_diagonal_matrix(qubits, vmat);
   } else {
-    BaseState::qregs_[iChunk].apply_matrix(qubits, vmat);
+    BaseState::qreg_.apply_matrix(qubits, vmat);
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_diagonal_matrix(const int_t iChunk,
-                                                    const reg_t &qubits,
+void State<unitary_matrix_t>::apply_diagonal_matrix(const reg_t &qubits,
                                                     const cvector_t &diag) {
-  if (BaseState::global_chunk_indexing_ ||
-      !BaseState::multi_chunk_distribution_) {
-    // GPU computes all chunks in one kernel, so pass qubits and diagonal matrix
-    // as is
-    reg_t qubits_chunk = qubits;
-    for (uint_t i = 0; i < qubits.size(); i++) {
-      if (qubits_chunk[i] >= BaseState::chunk_bits_) {
-        qubits_chunk[i] += BaseState::chunk_bits_;
+  if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits()) {
+    if (!BaseState::qreg_.support_global_indexing()) {
+      reg_t qubits_in = qubits;
+      cvector_t diag_in = diag;
+      Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(),
+                                   BaseState::qreg_.num_qubits(), qubits_in,
+                                   diag_in);
+      BaseState::qreg_.apply_diagonal_matrix(qubits_in, diag_in);
+    } else {
+      reg_t qubits_chunk = qubits;
+      for (uint_t i = 0; i < qubits.size(); i++) {
+        if (qubits_chunk[i] >= BaseState::qreg_.num_qubits())
+          qubits_chunk[i] += BaseState::qreg_.num_qubits();
       }
+      BaseState::qreg_.apply_diagonal_matrix(qubits_chunk, diag);
     }
-    BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_chunk, diag);
   } else {
-    reg_t qubits_in = qubits;
-    cvector_t diag_in = diag;
-
-    BaseState::block_diagonal_matrix(iChunk, qubits_in, diag_in);
-    BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_in, diag_in);
+    BaseState::qreg_.apply_diagonal_matrix(qubits, diag);
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate_phase(const int_t iChunk, uint_t qubit,
-                                               complex_t phase) {
+void State<unitary_matrix_t>::apply_gate_phase(uint_t qubit, complex_t phase) {
   cvector_t diag(2);
   diag[0] = 1.0;
   diag[1] = phase;
-  apply_diagonal_matrix(iChunk, reg_t({qubit}), diag);
+  apply_diagonal_matrix(reg_t({qubit}), diag);
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate_phase(const int_t iChunk,
-                                               const reg_t &qubits,
+void State<unitary_matrix_t>::apply_gate_phase(const reg_t &qubits,
                                                complex_t phase) {
   cvector_t diag((1 << qubits.size()), 1.0);
   diag[(1 << qubits.size()) - 1] = phase;
-  apply_diagonal_matrix(iChunk, qubits, diag);
+  apply_diagonal_matrix(qubits, diag);
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate_mcu(const int_t iChunk,
-                                             const reg_t &qubits, double theta,
+void State<unitary_matrix_t>::apply_gate_mcu(const reg_t &qubits, double theta,
                                              double phi, double lambda, double gamma) {
   const auto u4 = Linalg::Matrix::u4(theta, phi, lambda, gamma);
-  BaseState::qregs_[iChunk].apply_mcu(qubits, Utils::vectorize_matrix(u4));
+  BaseState::qreg_.apply_mcu(qubits, Utils::vectorize_matrix(u4));
 }
 
 template <class unitary_matrix_t>
 void State<unitary_matrix_t>::apply_global_phase() {
   if (BaseState::has_global_phase_) {
-    if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) {
-#pragma omp parallel for
-      for (int_t ig = 0; ig < BaseState::num_groups_; ig++) {
-        for (int_t i = BaseState::top_chunk_of_group_[ig];
-             i < BaseState::top_chunk_of_group_[ig + 1]; i++)
-          apply_diagonal_matrix(
-              i, {0}, {BaseState::global_phase_, BaseState::global_phase_});
-      }
-    } else {
-      for (int_t i = 0; i < BaseState::qregs_.size(); i++)
-        apply_diagonal_matrix(
-            i, {0}, {BaseState::global_phase_, BaseState::global_phase_});
-    }
+    apply_diagonal_matrix({0},
+                          {BaseState::global_phase_, BaseState::global_phase_});
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_save_unitary(const int_t iChunk,
-                                                 const Operations::Op &op,
+void State<unitary_matrix_t>::apply_save_unitary(const Operations::Op &op,
                                                  ExperimentResult &result,
                                                  bool last_op) {
-  if (op.qubits.size() != BaseState::num_qubits_) {
+  if (op.qubits.size() != BaseState::qreg_.num_qubits()) {
     throw std::invalid_argument(op.name +
                                 " was not applied to all qubits."
                                 " Only the full unitary can be saved.");
@@ -795,43 +623,21 @@ void State<unitary_matrix_t>::apply_save_unitary(const int_t iChunk,
   std::string key =
       (op.string_params[0] == "_method_") ? "unitary" : op.string_params[0];
"unitary" : op.string_params[0]; if (last_op) { - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, - move_to_matrix(iChunk), + result.save_data_pershot(BaseState::creg(), key, move_to_matrix(), Operations::OpType::save_unitary, op.save_type); } else { - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, - copy_to_matrix(iChunk), + result.save_data_pershot(BaseState::creg(), key, copy_to_matrix(), Operations::OpType::save_unitary, op.save_type); } } template -double State::expval_pauli(const int_t iChunk, - const reg_t &qubits, +double State::expval_pauli(const reg_t &qubits, const std::string &pauli) { throw std::runtime_error( "Unitary simulator does not support Pauli expectation values."); } -// swap between chunks -template -void State::apply_chunk_swap(const reg_t &qubits) { - uint_t q0, q1; - q0 = qubits[0]; - q1 = qubits[1]; - - std::swap(BaseState::qubit_map_[q0], BaseState::qubit_map_[q1]); - - if (qubits[0] >= BaseState::chunk_bits_) { - q0 += BaseState::chunk_bits_; - } - if (qubits[1] >= BaseState::chunk_bits_) { - q1 += BaseState::chunk_bits_; - } - reg_t qs0 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs0); -} - //------------------------------------------------------------------------------ } // namespace QubitUnitary } // end namespace AER diff --git a/src/simulators/unitary/unitarymatrix.hpp b/src/simulators/unitary/unitarymatrix.hpp index e45183299e..494d57e84e 100644 --- a/src/simulators/unitary/unitarymatrix.hpp +++ b/src/simulators/unitary/unitarymatrix.hpp @@ -52,7 +52,7 @@ class UnitaryMatrix : public QubitVector { //----------------------------------------------------------------------- // Set the size of the vector in terms of qubit number - void set_num_qubits(size_t num_qubits); + void set_num_qubits(size_t num_qubits) override; // Return the number of rows in the matrix size_t num_rows() const { return rows_; } @@ -75,6 +75,14 @@ class UnitaryMatrix : public QubitVector { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const UnitaryMatrix &obj) { + BaseVector::initialize(obj); + num_qubits_ = obj.num_qubits_; + rows_ = obj.rows_; + identity_threshold_ = obj.identity_threshold_; + } + // Initializes the vector to a custom initial state. // If the length of the statevector does not match the number of qubits // an exception is raised. diff --git a/src/simulators/unitary/unitarymatrix_thrust.hpp b/src/simulators/unitary/unitarymatrix_thrust.hpp old mode 100644 new mode 100755 index df95a9f027..f11e107425 --- a/src/simulators/unitary/unitarymatrix_thrust.hpp +++ b/src/simulators/unitary/unitarymatrix_thrust.hpp @@ -82,6 +82,14 @@ class UnitaryMatrixThrust : public QubitVectorThrust { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const UnitaryMatrixThrust &obj) { + BaseVector::initialize(obj); + num_qubits_ = obj.num_qubits_; + rows_ = obj.rows_; + identity_threshold_ = obj.identity_threshold_; + } + // Initializes the vector to a custom initial state. // If the length of the statevector does not match the number of qubits // an exception is raised. 
diff --git a/test/terra/backends/aer_simulator/test_conditional.py b/test/terra/backends/aer_simulator/test_conditional.py
index 07f1298557..13f3ccb5b7 100644
--- a/test/terra/backends/aer_simulator/test_conditional.py
+++ b/test/terra/backends/aer_simulator/test_conditional.py
@@ -29,7 +29,6 @@ class TestConditionalGates(SimulatorTestCase):
         "density_matrix",
         "matrix_product_state",
         "extended_stabilizer",
-        "tensor_network",
     ]
 
     # ---------------------------------------------------------------------
@@ -66,8 +65,6 @@ def test_conditional_gates_2bit(self, method, device):
     def test_conditional_gates_64bit(self, method, device):
         """Test conditional gate operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         # [value of conditional register, list of conditional values]
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
@@ -87,8 +84,6 @@ def test_conditional_gates_132bit(self, method, device):
         """Test conditional gate operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -112,7 +107,6 @@ class TestConditionalUnitary(SimulatorTestCase):
         "statevector",
         "density_matrix",
         "matrix_product_state",
-        "tensor_network",
     ]
 
     # ---------------------------------------------------------------------
@@ -149,8 +143,6 @@ def test_conditional_unitary_2bit(self, method, device):
     def test_conditional_unitary_64bit(self, method, device):
         """Test conditional unitary operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -167,8 +159,6 @@ def test_conditional_unitary_132bit(self, method, device):
         """Test conditional unitary operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -190,7 +180,6 @@ class TestConditionalKraus(SimulatorTestCase):
         "statevector",
         "density_matrix",
         "matrix_product_state",
-        "tensor_network",
     ]
 
     # ---------------------------------------------------------------------
@@ -227,8 +216,6 @@ def test_conditional_kraus_2bit(self, method, device):
     def test_conditional_kraus_64bit(self, method, device):
         """Test conditional kraus operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -245,8 +232,6 @@ def test_conditional_kraus_132bit(self, method, device):
         """Test conditional kraus operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -263,7 +248,7 @@ class TestConditionalSuperOp(SimulatorTestCase):
     """AerSimulator conditional superop tests."""
 
-    SUPPORTED_METHODS = ["automatic", "density_matrix", "tensor_network"]
+    SUPPORTED_METHODS = ["automatic", "density_matrix"]
 
     # ---------------------------------------------------------------------
     # Test conditional
@@ -299,8 +284,6 @@ def test_conditional_superop_2bit(self, method, device):
     def test_conditional_superop_64bit(self, method, device):
         """Test conditional superop operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -317,8 +300,6 @@ def test_conditional_superop_132bit(self, method, device):
         """Test conditional superop operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
diff --git a/test/terra/backends/aer_simulator/test_measure.py b/test/terra/backends/aer_simulator/test_measure.py
index f3a495befd..c705869817 100644
--- a/test/terra/backends/aer_simulator/test_measure.py
+++ b/test/terra/backends/aer_simulator/test_measure.py
@@ -93,9 +93,6 @@ def test_measure_nondeterministic_without_sampling(self, method, device):
         backend = self.backend(method=method, device=device)
         shots = 4000
         delta = 0.05
-        if "tensor_network" in method:
-            shots = 100
-            delta = 0.1
         circuits = ref_measure.measure_circuits_nondeterministic(allow_sampling=False)
         targets = ref_measure.measure_counts_nondeterministic(shots)
         result = backend.run(circuits, shots=shots).result()
@@ -195,9 +192,6 @@ def test_measure_nondeterministic_multi_qubit_without_sampling(self, method, dev
         backend = self.backend(method=method, device=device)
         shots = 4000
         delta = 0.05
-        if "tensor_network" in method:
-            shots = 100
-            delta = 0.1
         circuits = ref_measure.multiqubit_measure_circuits_nondeterministic(allow_sampling=False)
         targets = ref_measure.multiqubit_measure_counts_nondeterministic(shots)
         result = backend.run(circuits, shots=shots).result()
diff --git a/test/terra/backends/aer_simulator/test_shot_branching.py b/test/terra/backends/aer_simulator/test_shot_branching.py
new file mode 100644
index 0000000000..ac3ff0a810
--- /dev/null
+++ b/test/terra/backends/aer_simulator/test_shot_branching.py
@@ -0,0 +1,782 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2018, 2019.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+""" +AerSimulator Integration Tests +""" + +from ddt import ddt +from test.terra.reference import ref_measure +from test.terra.reference import ref_reset +from test.terra.reference import ref_initialize +from test.terra.reference import ref_kraus_noise +from test.terra.reference import ref_pauli_noise +from test.terra.reference import ref_readout_noise +from test.terra.reference import ref_reset_noise +from test.terra.reference import ref_conditionals + +from qiskit import QuantumCircuit +from qiskit import transpile +from qiskit_aer import AerSimulator +from qiskit_aer.noise import NoiseModel +from qiskit_aer.noise.errors import ReadoutError, depolarizing_error +from qiskit.circuit.library import QuantumVolume +from qiskit.quantum_info.random import random_unitary +from test.terra.backends.simulator_test_case import SimulatorTestCase, supported_methods + +from qiskit_aer import noise + +import qiskit.quantum_info as qi +from qiskit.circuit.library import QFT +from qiskit.circuit import QuantumCircuit, Reset +from qiskit.circuit.library.standard_gates import IGate, HGate +from qiskit.quantum_info.states.densitymatrix import DensityMatrix + +from qiskit.circuit import Parameter, Qubit, Clbit, QuantumRegister, ClassicalRegister +from qiskit.circuit.controlflow import * +from qiskit_aer.library.default_qubits import default_qubits +from qiskit_aer.library.control_flow_instructions import AerMark, AerJump + +import numpy as np + + +SUPPORTED_METHODS = [ + "statevector", + "density_matrix", +] +# tensor_network is tested in other test cases by setting shot_branching_enable by default + +SUPPORTED_METHODS_INITIALIZE = [ + "statevector", +] + + +@ddt +class TestShotBranching(SimulatorTestCase): + """AerSimulator measure tests.""" + + OPTIONS = {"seed_simulator": 41411} + + # --------------------------------------------------------------------- + # Test measure + # --------------------------------------------------------------------- + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_measure_nondeterministic_with_sampling(self, method, device): + """Test AerSimulator measure with non-deterministic counts with sampling""" + backend = self.backend(method=method, device=device) + shots = 4000 + circuits = ref_measure.measure_circuits_nondeterministic(allow_sampling=True) + targets = ref_measure.measure_counts_nondeterministic(shots) + result = backend.run( + circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True + ).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=0.05 * shots) + # Test sampling was enabled + for res in result.results: + self.assertIn("measure_sampling", res.metadata) + self.assertEqual(res.metadata["measure_sampling"], True) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_measure_nondeterministic_without_sampling(self, method, device): + """Test AerSimulator measure with non-deterministic counts without sampling""" + backend = self.backend(method=method, device=device) + shots = 4000 + delta = 0.05 + circuits = ref_measure.measure_circuits_nondeterministic(allow_sampling=False) + targets = ref_measure.measure_counts_nondeterministic(shots) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=delta * shots) + self.compare_result_metadata(result, circuits, "measure_sampling", False) + + @supported_methods(SUPPORTED_METHODS) + def 
+    def test_shot_branching_measure_sampling_with_quantum_noise(self, method, device):
+        """Test AerSimulator measure with deterministic counts with sampling and readout error"""
+        readout_error = [0.01, 0.1]
+        noise_model = NoiseModel()
+        depolarizing = {"u3": (1, 0.001), "cx": (2, 0.02)}
+        readout = [
+            [1.0 - readout_error[0], readout_error[0]],
+            [readout_error[1], 1.0 - readout_error[1]],
+        ]
+        noise_model.add_all_qubit_readout_error(ReadoutError(readout))
+        for gate, (num_qubits, gate_error) in depolarizing.items():
+            noise_model.add_all_qubit_quantum_error(
+                depolarizing_error(gate_error, num_qubits), gate
+            )
+
+        backend = self.backend(method=method, device=device, noise_model=noise_model)
+        shots = 1000
+        circuits = ref_measure.measure_circuits_deterministic(allow_sampling=True)
+        targets = ref_measure.measure_counts_deterministic(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        sampling = method == "density_matrix" or method == "tensor_network"
+        self.compare_result_metadata(result, circuits, "measure_sampling", sampling)
+
+    # ---------------------------------------------------------------------
+    # Test multi-qubit measure qobj instruction
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_measure_nondeterministic_multi_qubit_with_sampling(
+        self, method, device
+    ):
+        """Test AerSimulator measure with non-deterministic counts"""
+        backend = self.backend(method=method, device=device)
+        shots = 4000
+        circuits = ref_measure.multiqubit_measure_circuits_nondeterministic(allow_sampling=True)
+        targets = ref_measure.multiqubit_measure_counts_nondeterministic(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0.05 * shots)
+        self.compare_result_metadata(result, circuits, "measure_sampling", True)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_measure_nondeterministic_multi_qubit_without_sampling(
+        self, method, device
+    ):
+        """Test AerSimulator measure with non-deterministic counts"""
+        backend = self.backend(method=method, device=device)
+        shots = 4000
+        delta = 0.05
+        circuits = ref_measure.multiqubit_measure_circuits_nondeterministic(allow_sampling=False)
+        targets = ref_measure.multiqubit_measure_counts_nondeterministic(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+        self.compare_result_metadata(result, circuits, "measure_sampling", False)
+
+    # ---------------------------------------------------------------------
+    # Test reset
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_reset_nondeterministic(self, method, device):
+        """Test AerSimulator reset for circuits with non-deterministic counts"""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 4000
+        circuits = ref_reset.reset_circuits_nondeterministic(final_measure=True)
+        targets = ref_reset.reset_counts_nondeterministic(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_repeated_resets(self, method, device):
+        """Test repeated reset operations"""
+        backend = self.backend(method=method, device=device)
+        shots = 100
+        circuits = ref_reset.reset_circuits_repeated()
+        targets = ref_reset.reset_counts_repeated(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_reset_moving_qubits(self, method, device):
+        """Test AerSimulator reset for circuits where qubits have moved"""
+        backend = self.backend(method=method, device=device)
+        # count output circuits
+        shots = 1000
+        circuits = ref_reset.reset_circuits_with_entangled_and_moving_qubits(final_measure=True)
+        targets = ref_reset.reset_counts_with_entangled_and_moving_qubits(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0.05 * shots)
+
+    # ---------------------------------------------------------------------
+    # Test initialize instr make it through the wrapper
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_wrapper_1(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        shots = 100
+        if "tensor_network" in method:
+            shots = 10
+        lst = [0, 1]
+        init_states = [
+            np.array(lst),
+            np.array(lst, dtype=float),
+            np.array(lst, dtype=np.float32),
+            np.array(lst, dtype=complex),
+            np.array(lst, dtype=np.complex64),
+        ]
+        circuits = []
+        [
+            circuits.extend(ref_initialize.initialize_circuits_w_1(init_state))
+            for init_state in init_states
+        ]
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+
+    # ---------------------------------------------------------------------
+    # Test initialize instr make it through the wrapper
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_wrapper_2(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        shots = 100
+        lst = [0, 1, 0, 0]
+        init_states = [
+            np.array(lst),
+            np.array(lst, dtype=float),
+            np.array(lst, dtype=np.float32),
+            np.array(lst, dtype=complex),
+            np.array(lst, dtype=np.complex64),
+        ]
+        circuits = []
+        [
+            circuits.extend(ref_initialize.initialize_circuits_w_2(init_state))
+            for init_state in init_states
+        ]
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+
+    # ---------------------------------------------------------------------
+    # Test initialize
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_1(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 1000
+        delta = 0.05
+        circuits = ref_initialize.initialize_circuits_1(final_measure=True)
+        targets = ref_initialize.initialize_counts_1(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_2(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 1000
+        delta = 0.05
+        circuits = ref_initialize.initialize_circuits_2(final_measure=True)
+        targets = ref_initialize.initialize_counts_2(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_entangled_qubits(self, method, device):
+        """Test initialize entangled qubits"""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        delta = 0.05
+        circuits = ref_initialize.initialize_entangled_qubits()
+        targets = ref_initialize.initialize_counts_entangled_qubits(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_sampling_opt_disabled(self, method, device):
+        """Test sampling optimization"""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuit = QuantumCircuit(2)
+        circuit.h([0, 1])
+        circuit.initialize([0, 1], [1])
+        circuit.measure_all()
+        result = backend.run(
+            circuit, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        sampling = result.results[0].metadata.get("measure_sampling", None)
+        self.assertFalse(sampling)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_with_labels(self, method, device):
+        """Test initialize with labels"""
+        backend = self.backend(method=method, device=device)
+
+        circ = QuantumCircuit(4)
+        circ.initialize("+-rl")
+        circ.save_statevector()
+        actual = (
+            backend.run(circ, shot_branching_enable=True, shot_branching_sampling_enable=True)
+            .result()
+            .get_statevector(circ)
+        )
+
+        for q4, p4 in enumerate([1, 1]):
+            for q3, p3 in enumerate([1, -1]):
+                for q2, p2 in enumerate([1, 1j]):
+                    for q1, p1 in enumerate([1, -1j]):
+                        index = int("{}{}{}{}".format(q4, q3, q2, q1), 2)
+                        self.assertAlmostEqual(actual[index], 0.25 * p1 * p2 * p3 * p4)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_with_int(self, method, device):
+        """Test initialize with int"""
+        backend = self.backend(method=method, device=device)
+
+        circ = QuantumCircuit(4)
+        circ.initialize(5, [0, 1, 2])
+        circ.save_statevector()
+        actual = (
+            backend.run(circ, shot_branching_enable=True, shot_branching_sampling_enable=True)
+            .result()
+            .get_statevector(circ)
+        )
+
+        self.assertAlmostEqual(actual[5], 1)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_with_int_twice(self, method, device):
+        """Test sampling with int twice"""
+        backend = self.backend(method=method, device=device)
+
+        circ = QuantumCircuit(4)
+        circ.initialize(1, [0])
+        circ.initialize(1, [2])
+        circ.save_statevector()
+        actual = (
+            backend.run(circ, shot_branching_enable=True, shot_branching_sampling_enable=True)
+            .result()
+            .get_statevector(circ)
+        )
+
+        self.assertAlmostEqual(actual[5], 1)
+
+    # ---------------------------------------------------------------------
+    # Test noise
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_empty_circuit_noise(self, method, device):
+        """Test simulation with empty circuit and noise model."""
+        backend = self.backend(method=method, device=device)
+        noise_model = noise.NoiseModel()
+        noise_model.add_all_qubit_quantum_error(noise.depolarizing_error(0.1, 1), ["x"])
+        result = backend.run(
+            QuantumCircuit(), shots=1, noise_model=noise_model, shot_branching_enable=True
+        ).result()
+        self.assertSuccess(result)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_readout_noise(self, method, device):
+        """Test simulation with classical readout error noise model."""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 4000
+        circuits = ref_readout_noise.readout_error_circuits()
+        noise_models = ref_readout_noise.readout_error_noise_models()
+        targets = ref_readout_noise.readout_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_pauli_gate_noise(self, method, device):
+        """Test simulation with Pauli gate error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_pauli_noise.pauli_gate_error_circuits()
+        noise_models = ref_pauli_noise.pauli_gate_error_noise_models()
+        targets = ref_pauli_noise.pauli_gate_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_pauli_reset_noise(self, method, device):
+        """Test simulation with Pauli reset error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_pauli_noise.pauli_reset_error_circuits()
+        noise_models = ref_pauli_noise.pauli_reset_error_noise_models()
+        targets = ref_pauli_noise.pauli_reset_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_pauli_measure_noise(self, method, device):
+        """Test simulation with Pauli measure error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_pauli_noise.pauli_measure_error_circuits()
+        noise_models = ref_pauli_noise.pauli_measure_error_noise_models()
+        targets = ref_pauli_noise.pauli_measure_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_reset_gate_noise(self, method, device):
+        """Test simulation with reset gate error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_reset_noise.reset_gate_error_circuits()
+        noise_models = ref_reset_noise.reset_gate_error_noise_models()
+        targets = ref_reset_noise.reset_gate_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_kraus_gate_noise(self, method, device):
+        """Test simulation with Kraus gate error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_kraus_noise.kraus_gate_error_circuits()
+        noise_models = ref_kraus_noise.kraus_gate_error_noise_models()
+        targets = ref_kraus_noise.kraus_gate_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_kraus_gate_noise_on_QFT(self, method, device):
+        """Test Kraus noise on a QFT circuit"""
+        shots = 10000
+
+        # Build noise model
+        error1 = noise.amplitude_damping_error(0.2)
+        error2 = error1.tensor(error1)
+        noise_model = noise.NoiseModel()
+        noise_model.add_all_qubit_quantum_error(error1, ["h"])
+        noise_model.add_all_qubit_quantum_error(error2, ["cp", "swap"])
+
+        backend = self.backend(method=method, device=device, noise_model=noise_model)
+        ideal_circuit = transpile(QFT(3), backend)
+
+        # manually build noise circuit
+        noise_circuit = QuantumCircuit(3)
+        for inst, qargs, cargs in ideal_circuit.data:
+            noise_circuit.append(inst, qargs, cargs)
+            if inst.name == "h":
+                noise_circuit.append(error1, qargs)
+            elif inst.name in ["cp", "swap"]:
+                noise_circuit.append(error2, qargs)
+        # compute target counts
+        noise_state = DensityMatrix(noise_circuit)
+        ref_target = {i: shots * p for i, p in noise_state.probabilities_dict().items()}
+
+        # Run sim
+        ideal_circuit.measure_all()
+        result = backend.run(ideal_circuit, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(
+            result, [ideal_circuit], [ref_target], hex_counts=False, delta=0.1 * shots
+        )
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_clifford_circuit_noise(self, method, device):
+        """Test simulation with mixed Clifford quantum errors in circuit."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        error1 = noise.QuantumError(
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_clifford_circuit_noise(self, method, device):
+        """Test simulation with mixed Clifford quantum errors in circuit."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        error1 = noise.QuantumError(
+            [([(IGate(), [0])], 0.8), ([(Reset(), [0])], 0.1), ([(HGate(), [0])], 0.1)]
+        )
+
+        error2 = noise.QuantumError(
+            [
+                ([(IGate(), [0])], 0.75),
+                ([(Reset(), [0])], 0.1),
+                ([(Reset(), [1])], 0.1),
+                ([(Reset(), [0]), (Reset(), [1])], 0.05),
+            ]
+        )
+
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.append(error1, [0])
+        qc.cx(0, 1)
+        qc.append(error2, [0, 1])
+        target_probs = qi.DensityMatrix(qc).probabilities_dict()
+
+        # Add measurement
+        qc.measure_all()
+        result = backend.run(qc, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        probs = {key: val / shots for key, val in result.get_counts(0).items()}
+        self.assertDictAlmostEqual(target_probs, probs, delta=0.1)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_kraus_circuit_noise(self, method, device):
+        """Test simulation with Kraus quantum errors in circuit."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        error0 = noise.amplitude_damping_error(0.05)
+        error1 = noise.amplitude_damping_error(0.15)
+        error01 = error1.tensor(error0)
+
+        # Target Circuit 0
+        tc0 = QuantumCircuit(2)
+        tc0.h(0)
+        tc0.append(qi.Kraus(error0), [0])
+        tc0.cx(0, 1)
+        tc0.append(qi.Kraus(error01), [0, 1])
+        target_probs0 = qi.DensityMatrix(tc0).probabilities_dict()
+
+        # Sim circuit 0
+        qc0 = QuantumCircuit(2)
+        qc0.h(0)
+        qc0.append(error0, [0])
+        qc0.cx(0, 1)
+        qc0.append(error01, [0, 1])
+        qc0.measure_all()
+
+        # Target Circuit 1
+        tc1 = QuantumCircuit(2)
+        tc1.h(1)
+        tc1.append(qi.Kraus(error0), [1])
+        tc1.cx(1, 0)
+        tc1.append(qi.Kraus(error01), [1, 0])
+        target_probs1 = qi.DensityMatrix(tc1).probabilities_dict()
+
+        # Sim circuit 1
+        qc1 = QuantumCircuit(2)
+        qc1.h(1)
+        qc1.append(error0, [1])
+        qc1.cx(1, 0)
+        qc1.append(error01, [1, 0])
+        qc1.measure_all()
+
+        result = backend.run([qc0, qc1], shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        probs = [{key: val / shots for key, val in result.get_counts(i).items()} for i in range(2)]
+        self.assertDictAlmostEqual(target_probs0, probs[0], delta=0.1)
+        self.assertDictAlmostEqual(target_probs1, probs[1], delta=0.1)
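Beyond backend-level noise models, the two tests above exercise noise embedded in the circuit itself: a noise.QuantumError or a qi.Kraus channel is appended like any other instruction. A minimal sketch of that usage, with illustrative probabilities and qubit layout:

    # Hedged sketch: noise instructions appended directly to a circuit.
    from qiskit import QuantumCircuit
    from qiskit.circuit import Reset
    from qiskit.circuit.library import HGate, IGate
    import qiskit.quantum_info as qi
    from qiskit_aer import AerSimulator
    from qiskit_aer.noise import QuantumError, amplitude_damping_error

    # Mixed Clifford error: do nothing, reset, or apply H, with given probabilities
    error = QuantumError(
        [([(IGate(), [0])], 0.8), ([(Reset(), [0])], 0.1), ([(HGate(), [0])], 0.1)]
    )

    qc = QuantumCircuit(2)
    qc.h(0)
    qc.append(error, [0])  # QuantumError appended as an instruction
    qc.cx(0, 1)
    qc.append(qi.Kraus(amplitude_damping_error(0.05)), [0])  # Kraus channel as an instruction
    qc.measure_all()

    result = AerSimulator().run(qc, shots=1000, shot_branching_enable=True).result()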
+
+    # ---------------------------------------------------------------------
+    # Test conditional gates
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_1bit(self, method, device):
+        """Test conditional gate operations on 1-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        circuits = ref_conditionals.conditional_circuits_1bit(
+            final_measure=True, conditional_type="gate"
+        )
+        targets = ref_conditionals.conditional_counts_1bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_2bit(self, method, device):
+        """Test conditional gate operations on 2-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_2bit(
+            final_measure=True, conditional_type="gate"
+        )
+        targets = ref_conditionals.conditional_counts_2bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_64bit(self, method, device):
+        """Test conditional gate operations on 64-bit conditional register."""
+        shots = 100
+        # [value of conditional register, list of conditional values]
+        cases = ref_conditionals.conditional_cases_64bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
+            64, cases, final_measure=True, conditional_type="gate"
+        )
+        # not using hex counts because the number of leading zeros in results
+        # doesn't seem consistent
+        targets = ref_conditionals.condtional_counts_nbit(64, cases, shots, hex_counts=False)
+
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, hex_counts=False, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_132bit(self, method, device):
+        """Test conditional gate operations on 132-bit conditional register."""
+        shots = 100
+        cases = ref_conditionals.conditional_cases_132bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
+            132, cases, final_measure=True, conditional_type="gate"
+        )
+        targets = ref_conditionals.condtional_counts_nbit(132, cases, shots, hex_counts=False)
+        circuits = circuits[0:1]
+        targets = targets[0:1]
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, hex_counts=False, delta=0)
+
+    # ---------------------------------------------------------------------
+    # Test conditional unitaries
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_unitary_1bit(self, method, device):
+        """Test conditional unitary operations on 1-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        circuits = ref_conditionals.conditional_circuits_1bit(
+            final_measure=True, conditional_type="unitary"
+        )
+        targets = ref_conditionals.conditional_counts_1bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_unitary_2bit(self, method, device):
+        """Test conditional unitary operations on 2-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_2bit(
+            final_measure=True, conditional_type="unitary"
+        )
+        targets = ref_conditionals.conditional_counts_2bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_unitary_64bit(self, method, device):
+        """Test conditional unitary operations on 64-bit conditional register."""
+        shots = 100
+        cases = ref_conditionals.conditional_cases_64bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
conditional_type="unitary" + ) + targets = ref_conditionals.condtional_counts_nbit(64, cases, shots, hex_counts=False) + + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, hex_counts=False, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_unitary_132bit(self, method, device): + """Test conditional unitary operations on 132-bit conditional register.""" + shots = 100 + cases = ref_conditionals.conditional_cases_132bit() + backend = self.backend(method=method, device=device) + backend.set_options(max_parallel_experiments=0) + circuits = ref_conditionals.conditional_circuits_nbit( + 132, cases, final_measure=True, conditional_type="unitary" + ) + targets = ref_conditionals.condtional_counts_nbit(132, cases, shots, hex_counts=False) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, hex_counts=False, delta=0) + + # --------------------------------------------------------------------- + # Test conditional + # --------------------------------------------------------------------- + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_unitary_1bit(self, method, device): + """Test conditional kraus operations on 1-bit conditional register.""" + shots = 100 + backend = self.backend(method=method, device=device) + circuits = ref_conditionals.conditional_circuits_1bit( + final_measure=True, conditional_type="kraus" + ) + targets = ref_conditionals.conditional_counts_1bit(shots) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_kraus_2bit(self, method, device): + """Test conditional kraus operations on 2-bit conditional register.""" + shots = 100 + backend = self.backend(method=method, device=device) + backend.set_options(max_parallel_experiments=0) + circuits = ref_conditionals.conditional_circuits_2bit( + final_measure=True, conditional_type="kraus" + ) + targets = ref_conditionals.conditional_counts_2bit(shots) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_kraus_64bit(self, method, device): + """Test conditional kraus operations on 64-bit conditional register.""" + shots = 100 + cases = ref_conditionals.conditional_cases_64bit() + backend = self.backend(method=method, device=device) + backend.set_options(max_parallel_experiments=0) + circuits = ref_conditionals.conditional_circuits_nbit( + 64, cases, final_measure=True, conditional_type="kraus" + ) + targets = ref_conditionals.condtional_counts_nbit(64, cases, shots, hex_counts=False) + + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, hex_counts=False, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_kraus_132bit(self, method, device): + """Test conditional kraus operations on 132-bit conditional register.""" + shots = 100 + cases = ref_conditionals.conditional_cases_132bit() + backend = self.backend(method=method, 
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_kraus_132bit(self, method, device):
+        """Test conditional Kraus operations on 132-bit conditional register."""
+        shots = 100
+        cases = ref_conditionals.conditional_cases_132bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
+            132, cases, final_measure=True, conditional_type="kraus"
+        )
+        targets = ref_conditionals.condtional_counts_nbit(132, cases, shots, hex_counts=False)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, hex_counts=False, delta=0)
+
+    # ---------------------------------------------------------------------
+    # Test control flow
+    # ---------------------------------------------------------------------
diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py
index 2a41d69518..1e3c99b145 100644
--- a/test/terra/backends/simulator_test_case.py
+++ b/test/terra/backends/simulator_test_case.py
@@ -42,6 +42,10 @@ def backend(self, **options):
                 sim_options["batched_shots_gpu"] = True
             else:
                 sim_options[key] = val
+            # enable shot_branching if the method is tensor_network
+            if "method" == key and "tensor_network" in val:
+                sim_options["shot_branching_enable"] = True
+                sim_options["shot_branching_sampling_enable"] = True
         return self.BACKEND(**sim_options)
@@ -82,6 +86,7 @@ def _method_device(methods):
     cuStateVec = check_cuStateVec(available_devices)
 
     gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network"]
+    batchable_methods = ["statevector", "density_matrix"]
     data_args = []
     for method in methods:
         if method in available_methods:
@@ -94,8 +99,9 @@ def _method_device(methods):
             for device in available_devices:
                 data_args.append((method, device))
                 if device == "GPU":
-                    # add batched optimization test for GPU
-                    data_args.append((method, "GPU_batch"))
+                    if method in batchable_methods:
+                        # add batched optimization test for GPU
+                        data_args.append((method, "GPU_batch"))
                     # add test cases for cuStateVec if available using special device = 'GPU_cuStateVec'
                     # 'GPU_cuStateVec' is used only inside tests, not available in Aer,
                     # and this is converted to "device='GPU'" and option "cuStateVec_enable = True" is added
diff --git a/test/terra/common.py b/test/terra/common.py
index 8ce447f970..e7092df517 100644
--- a/test/terra/common.py
+++ b/test/terra/common.py
@@ -47,7 +47,7 @@ class QiskitAerTestCase(FullQiskitTestCase):
 
     def setUp(self):
         super().setUp()
-        self.useFixture(fixtures.Timeout(120, gentle=False))
+        self.useFixture(fixtures.Timeout(240, gentle=False))
 
     @classmethod
     def setUpClass(cls):
diff --git a/tox.ini b/tox.ini
index 8f418f383c..725e45bbc2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -38,6 +38,7 @@ commands =
     pylint -j 2 -rn qiskit_aer
 
 [testenv:clang-format]
+allowlist_externals = sh
 envdir = .tox/lint
 commands = sh tools/clang-format.sh -i
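The test-harness change above turns both shot-branching options on whenever the method is tensor_network; outside the harness, the equivalent is to set them once as backend defaults rather than as per-run keywords. A sketch, assuming an Aer build where the chosen method is available:

    # Hedged sketch: enabling shot branching as backend defaults.
    from qiskit_aer import AerSimulator

    backend = AerSimulator(method="statevector")
    backend.set_options(
        shot_branching_enable=True,
        shot_branching_sampling_enable=True,
    )
    # Every subsequent backend.run(...) call now uses these options by default.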