diff --git a/README.md b/README.md
index 1d8fe978de..b3be611454 100755
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ Now you're set up and ready to check out some of the other examples from our
 ## Authors and Citation
 
 Qiskit Aer is the work of [many people](https://github.com/Qiskit/qiskit-aer/graphs/contributors) who contribute
-to the project at different levels. If you use Qiskit, please cite as per the included [BibTeX file](https://github.com/Qiskit/qiskit/blob/master/Qiskit.bib).
+to the project at different levels. If you use Qiskit, please cite as per the included [BibTeX file](https://github.com/Qiskit/qiskit-terra/blob/main/CITATION.bib).
 
 ## License
diff --git a/qiskit_aer/backends/aer_compiler.py b/qiskit_aer/backends/aer_compiler.py
index 2e7930d198..4909f73537 100644
--- a/qiskit_aer/backends/aer_compiler.py
+++ b/qiskit_aer/backends/aer_compiler.py
@@ -465,6 +465,8 @@ def compile_circuit(circuits, basis_gates=None, optypes=None):
     "chunk_swap_buffer_qubits": (int, np.integer),
     "batched_shots_gpu": (bool, np.bool_),
     "batched_shots_gpu_max_qubits": (int, np.integer),
+    "shot_branching_enable": (bool, np.bool_),
+    "shot_branching_sampling_enable": (bool, np.bool_),
     "num_threads_per_device": (int, np.integer),
     "statevector_parallel_threshold": (int, np.integer),
     "statevector_sample_measure_opt": (int, np.integer),
@@ -488,6 +490,7 @@ def compile_circuit(circuits, basis_gates=None, optypes=None):
     "use_cuTensorNet_autotuning": (bool, np.bool_),
     "parameterizations": (list),
     "fusion_parallelization_threshold": (int, np.integer),
+    "target_gpus": (list),
 }
diff --git a/qiskit_aer/backends/aer_simulator.py b/qiskit_aer/backends/aer_simulator.py
index 0b7565e539..1154dab407 100644
--- a/qiskit_aer/backends/aer_simulator.py
+++ b/qiskit_aer/backends/aer_simulator.py
@@ -170,6 +170,10 @@ class AerSimulator(AerBackend):
       If AerSimulator is built with cuStateVec support, cuStateVec APIs are
       enabled by setting ``cuStateVec_enable=True``.
 
+    * ``target_gpus`` (list): A list of GPU IDs, starting from 0, that sets
+      the target GPUs used for the simulation. If this option is not
+      specified, all available GPUs are used for chunk/shot distribution.
+
     **Additional Backend Options**
 
@@ -287,6 +291,30 @@ class AerSimulator(AerBackend):
       threads per GPU. This parameter is used to optimize Pauli noise
       simulation with multiple-GPUs (Default: 1).
 
+    * ``shot_branching_enable`` (bool): This option enables/disables
+      applying the shot-branching technique to speed up multi-shot
+      simulations of dynamic circuits, or of circuits with noise models
+      (Default: False).
+      Starting from a single state shared by multiple shots, the state is
+      branched dynamically at runtime.
+      This option can reduce the number of per-shot runs when there are
+      fewer branches than the total number of shots.
+      This option is available for ``"statevector"``, ``"density_matrix"``
+      and ``"tensor_network"``.
+
+    * ``shot_branching_sampling_enable`` (bool): This option enables/disables
+      applying sampling measure if the input circuit has all of its measure
+      operations at the end of the circuit (Default: False).
+      Because a measure operation branches a state into 2 states, it is not
+      efficient to apply branching for measure.
+      Sampling measure improves the speed of obtaining counts for multiple
+      shots sharing the same state.
+      Note that the counts obtained by sampling measure may not be the same
+      as the counts calculated by multiple measure operations, because
+      sampling measure takes only one random number per shot.
+      This option is available for ``"statevector"``, ``"density_matrix"``
+      and ``"tensor_network"``.
+
     * ``accept_distributed_results`` (bool): This option enables storing
       results independently in each process (Default: None).
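For orientation, here is a minimal usage sketch of the two shot-branching options documented above. It is not part of the diff; the circuit is illustrative, and the options are assumed to be accepted as ``AerSimulator`` keyword arguments like the other backend options:

```python
from qiskit import QuantumCircuit
from qiskit_aer import AerSimulator

# A dynamic circuit: the mid-circuit measurement and the conditional
# X gate are the points where shot branching can occur at runtime.
circ = QuantumCircuit(2, 2)
circ.h(0)
circ.measure(0, 0)
circ.x(1).c_if(0, 1)  # applied only in shots where clbit 0 reads 1
circ.measure(1, 1)

# shot_branching_enable shares one statevector across all shots and
# branches it on demand; shot_branching_sampling_enable additionally
# samples measurements that sit at the end of the circuit instead of
# branching on them (it has no effect on mid-circuit measurements).
sim = AerSimulator(
    method="statevector",
    shot_branching_enable=True,
    shot_branching_sampling_enable=True,
)
result = sim.run(circ, shots=1000).result()
print(result.get_counts())
```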
@@ -709,6 +737,9 @@ def _default_options(cls):
             batched_shots_gpu=False,
             batched_shots_gpu_max_qubits=16,
             num_threads_per_device=1,
+            # multi-shot branching
+            shot_branching_enable=False,
+            shot_branching_sampling_enable=False,
             # statevector options
             statevector_parallel_threshold=14,
             statevector_sample_measure_opt=10,
diff --git a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
index bf5296b18a..67e057c74f 100644
--- a/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
+++ b/qiskit_aer/backends/wrappers/aer_controller_binding.hpp
@@ -182,6 +182,11 @@ void bind_aer_controller(MODULE m) {
       [](Config &config, uint_t val) {
         config.num_threads_per_device.value(val);
       });
+  // # multi-shot branching
+  aer_config.def_readwrite("shot_branching_enable",
+                           &Config::shot_branching_enable);
+  aer_config.def_readwrite("shot_branching_sampling_enable",
+                           &Config::shot_branching_sampling_enable);
   // # statevector options
   aer_config.def_readwrite("statevector_parallel_threshold",
                            &Config::statevector_parallel_threshold);
@@ -403,6 +408,10 @@ void bind_aer_controller(MODULE m) {
       [](Config &config, uint_t val) {
         config.extended_stabilizer_norm_estimation_default_samples.value(val);
       });
+  aer_config.def_property(
+      "target_gpus",
+      [](const Config &config) { return config.target_gpus.val; },
+      [](Config &config, reg_t val) { config.target_gpus.value(val); });
 
   aer_config.def(py::pickle(
       [](const AER::Config &config) {
@@ -488,12 +497,14 @@ void bind_aer_controller(MODULE m) {
         write_value(77, config.unitary_parallel_threshold),
         write_value(78, config.memory_blocking_bits),
         write_value(
-            79,
-            config.extended_stabilizer_norm_estimation_default_samples));
+            79, config.extended_stabilizer_norm_estimation_default_samples),
+        write_value(80, config.shot_branching_enable),
+        write_value(81, config.shot_branching_sampling_enable),
+        write_value(82, config.target_gpus));
       },
       [](py::tuple t) {
         AER::Config config;
-        if (t.size() != 79)
+        if (t.size() != 82)
           throw std::runtime_error("Invalid serialization format.");
 
         read_value(t, 0, config.shots);
@@ -580,6 +591,9 @@ void bind_aer_controller(MODULE m) {
         read_value(t, 78, config.memory_blocking_bits);
         read_value(t, 79,
                    config.extended_stabilizer_norm_estimation_default_samples);
+        read_value(t, 80, config.shot_branching_enable);
+        read_value(t, 81, config.shot_branching_sampling_enable);
+        read_value(t, 82, config.target_gpus);
         return config;
       }));
 }
diff --git a/releasenotes/notes/add_executor-a03f2d23cf6f4ca9.yaml b/releasenotes/notes/add_executor-a03f2d23cf6f4ca9.yaml
new file mode 100644
index 0000000000..3d27fd9482
--- /dev/null
+++ b/releasenotes/notes/add_executor-a03f2d23cf6f4ca9.yaml
@@ -0,0 +1,30 @@
+---
+features:
+  - |
+    This release restructures the ``State`` classes, adding circuit executor
+    classes that run a circuit and manage multiple states, either for
+    multi-shot simulations or for multi-chunk simulations of large numbers
+    of qubits.
+    Previously the ``StateChunk`` class managed multiple chunks for
+    multi-shot or multi-chunk simulations, but now a ``State`` class holds
+    only a single state and all of the parallelization code has moved into
+    the ``Executor`` classes, so the ``State`` classes are independent of
+    parallelization.
+    Some of the functions in the ``Aer::Controller`` class have also moved
+    to the ``CircuitExecutor::Executor`` class (see the sketch below).
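To make the restructured control flow concrete, here is a minimal, runnable Python sketch (not from the diff) of the factory-plus-polymorphic-executor pattern that the new ``Controller::make_circuit_executor``/``run_circuit`` pair follows later in this diff; every name below is an illustrative stand-in, not a real Aer class:

```python
from abc import ABC, abstractmethod

class CircuitExecutorBase(ABC):
    """Stand-in for CircuitExecutor::Base: the controller only sees this."""
    @abstractmethod
    def run_circuit(self, circuit, result):
        ...

class Executor(CircuitExecutorBase):
    """One executor per state type; it owns shot/chunk parallelization."""
    def __init__(self, state_cls):
        self.state_cls = state_cls

    def run_circuit(self, circuit, result):
        state = self.state_cls()  # a single, parallelization-free state
        result["ran_with"] = type(state).__name__

class StatevectorState:
    """Stand-in for a single Statevector::State with no parallel logic."""

def make_circuit_executor(method):
    # Mirrors the factory idea: map a method name to a concrete executor.
    executors = {"statevector": lambda: Executor(StatevectorState)}
    return executors[method]()

result = {}
make_circuit_executor("statevector").run_circuit(None, result)
print(result)  # {'ran_with': 'StatevectorState'}
```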
+  - |
+    A shot-branching technique that accelerates simulations of dynamic
+    circuits is implemented on top of the restructured ``Executor`` classes.
+    Shot-branching is currently applicable to the ``statevector``,
+    ``density_matrix`` and ``tensor_network`` methods.
+    Shot-branching provides dynamic distribution of multi-shot simulations
+    by branching states when applying dynamic operations
+    (measure, reset, initialize, noise).
+    By default, ``shot_branching_enable`` is disabled.
+    By also setting ``shot_branching_sampling_enable``, the final
+    measurements are done by sampling measure, which speeds up obtaining
+    counts for multiple shots sharing the same state.
+  - |
+    A new option ``target_gpus`` is added to select the GPUs used for the
+    simulation. A list of target GPU IDs is passed; for example,
+    ``target_gpus=[0, 2]`` selects two GPUs to be used (see the example
+    below). Without this option, all available GPUs are used.
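As a usage illustration only (assuming a GPU-enabled Aer build on a machine with at least three GPUs; the circuit itself is arbitrary):

```python
from qiskit import QuantumCircuit
from qiskit_aer import AerSimulator

# A simple GHZ-style circuit; any circuit works here.
circ = QuantumCircuit(25)
circ.h(0)
for i in range(24):
    circ.cx(i, i + 1)
circ.measure_all()

# Restrict the simulation to GPUs 0 and 2. Without target_gpus,
# chunks/shots are distributed across every available GPU.
sim = AerSimulator(method="statevector", device="GPU", target_gpus=[0, 2])
result = sim.run(circ, shots=100).result()
print(result.get_counts())
```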
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
old mode 100644
new mode 100755
index 2a45353dcd..7ea6c35553
--- a/src/controllers/aer_controller.hpp
+++ b/src/controllers/aer_controller.hpp
@@ -52,15 +52,15 @@
 #include "transpile/cacheblocking.hpp"
 #include "transpile/fusion.hpp"
 
-#include "simulators/density_matrix/densitymatrix_state.hpp"
-#include "simulators/extended_stabilizer/extended_stabilizer_state.hpp"
-#include "simulators/matrix_product_state/matrix_product_state.hpp"
-#include "simulators/stabilizer/stabilizer_state.hpp"
-#include "simulators/statevector/qubitvector.hpp"
-#include "simulators/statevector/statevector_state.hpp"
-#include "simulators/superoperator/superoperator_state.hpp"
-#include "simulators/tensor_network/tensor_net_state.hpp"
-#include "simulators/unitary/unitary_state.hpp"
+#include "simulators/simulators.hpp"
+
+#include "simulators/circuit_executor.hpp"
+#include "simulators/multi_state_executor.hpp"
+
+#include "simulators/density_matrix/densitymatrix_executor.hpp"
+#include "simulators/statevector/statevector_executor.hpp"
+#include "simulators/tensor_network/tensor_net_executor.hpp"
+#include "simulators/unitary/unitary_executor.hpp"
 
 namespace AER {
 
@@ -104,35 +104,6 @@ class Controller {
   // Simulation types
   //-----------------------------------------------------------------------
 
-  // Simulation methods for the Qasm Controller
-  enum class Method {
-    automatic,
-    statevector,
-    density_matrix,
-    matrix_product_state,
-    stabilizer,
-    extended_stabilizer,
-    unitary,
-    superop,
-    tensor_network
-  };
-
-  enum class Device { CPU, GPU, ThrustCPU };
-
-  // Simulation precision
-  enum class Precision { Double, Single };
-
-  const std::unordered_map<Method, std::string> method_names_ = {
-      {Method::automatic, "automatic"},
-      {Method::statevector, "statevector"},
-      {Method::density_matrix, "density_matrix"},
-      {Method::matrix_product_state, "matrix_product_state"},
-      {Method::stabilizer, "stabilizer"},
-      {Method::extended_stabilizer, "extended_stabilizer"},
-      {Method::unitary, "unitary"},
-      {Method::superop, "superop"},
-      {Method::tensor_network, "tensor_network"}};
-
   //-----------------------------------------------------------------------
   // Config
   //-----------------------------------------------------------------------
 
@@ -140,12 +111,6 @@ class Controller {
   // Timer type
   using myclock_t = std::chrono::high_resolution_clock;
 
-  // Validation threshold for validating states and operators
-  double validation_threshold_ = 1e-8;
-
-  // Save counts as memory list
-  bool save_creg_memory_ = false;
-
   // Simulation method
   Method method_ = Method::automatic;
 
   // Simulation device
   Device sim_device_ = Device::CPU;
 
   // Simulation precision
   Precision sim_precision_ = Precision::Double;
 
@@ -156,78 +121,6 @@ class Controller {
-  // Controller-level parameter for CH method
-  bool extended_stabilizer_measure_sampling_ = false;
-
-  //-----------------------------------------------------------------------
-  // Circuit Execution
-  //-----------------------------------------------------------------------
-
-  // Abstract method for executing a circuit.
-  // This method must initialize a state and return output data for
-  // the required number of shots.
-  void run_circuit(const Circuit &circ, const Noise::NoiseModel &noise,
-                   const Method method, const Config &config,
-                   ExperimentResult &result) const;
-
-  //----------------------------------------------------------------
-  // Run circuit helpers
-  //----------------------------------------------------------------
-
-  // Execute n-shots of a circuit on the input state
-  template <class State_t>
-  void run_circuit_helper(const Circuit &circ, const Noise::NoiseModel &noise,
-                          const Config &config, const Method method,
-                          ExperimentResult &result) const;
-
-  // Execute a single shot a of circuit by initializing the state vector,
-  // running all ops in circ, and updating data with
-  // simulation output.
-  template <class State_t>
-  void run_single_shot(const Circuit &circ, State_t &state,
-                       ExperimentResult &result, RngEngine &rng) const;
-
-  // Execute a single shot a of circuit by initializing the state vector,
-  // running all ops in circ, and updating data with
-  // simulation output.
-  template <class State_t>
-  void run_with_sampling(const Circuit &circ, State_t &state,
-                         ExperimentResult &result, RngEngine &rng,
-                         const uint_t block_bits, const uint_t shots) const;
-
-  // Execute multiple shots a of circuit by initializing the state vector,
-  // running all ops in circ, and updating data with
-  // simulation output. Will use measurement sampling if possible
-  template <class State_t>
-  void run_circuit_without_sampled_noise(Circuit &circ,
-                                         const Noise::NoiseModel &noise,
-                                         const Config &config,
-                                         const Method method,
-                                         ExperimentResult &result) const;
-
-  template <class State_t>
-  void run_circuit_with_sampled_noise(const Circuit &circ,
-                                      const Noise::NoiseModel &noise,
-                                      const Config &config, const Method method,
-                                      ExperimentResult &result) const;
-
-  //----------------------------------------------------------------
-  // Measurement
-  //----------------------------------------------------------------
-
-  // Sample measurement outcomes for the input measure ops from the
-  // current state of the input State_t
-  template <class InputIterator, class State_t>
-  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
-                       uint_t shots, State_t &state, ExperimentResult &result,
-                       RngEngine &rng, int_t shot_index = -1) const;
-
-  // Check if measure sampling optimization is valid for the input circuit
-  // for the given method. This checks if operation types before
-  // the first measurement in the circuit prevent sampling
-  bool check_measure_sampling_opt(const Circuit &circ,
-                                  const Method method) const;
-
   //-------------------------------------------------------------------------
   // State validation
   //-------------------------------------------------------------------------
 
@@ -242,53 +135,28 @@ class Controller {
                        const Noise::NoiseModel &noise,
                        bool throw_except = false) const;
 
-  template <class state_t>
-  bool validate_state(const state_t &state, const Circuit &circ,
-                      const Noise::NoiseModel &noise,
-                      bool throw_except = false) const;
-
-  // Return an estimate of the required memory for a circuit.
-  size_t required_memory_mb(const Circuit &circuit,
-                            const Noise::NoiseModel &noise,
-                            const Method method) const;
-
   //----------------------------------------------------------------
   // Utility functions
   //----------------------------------------------------------------
+  std::shared_ptr<CircuitExecutor::Base>
+  make_circuit_executor(const Method method) const;
 
   // Return a vector of simulation methods for each circuit.
   // If the default method is automatic this will be computed based on the
   // circuit and noise model.
   // The noise model will be modified to enable superop or kraus sampling
   // methods if required by the chosen methods.
-  std::vector<Controller::Method>
+  std::vector<Method>
   simulation_methods(std::vector<std::shared_ptr<Circuit>> &circuits,
                      Noise::NoiseModel &noise_model) const;
 
   // Return the simulation method to use based on the input circuit
   // and noise model
-  Controller::Method
+  Method
   automatic_simulation_method(const Circuit &circ,
                               const Noise::NoiseModel &noise_model) const;
 
-  // Return a fusion transpilation pass configured for the current
-  // method, circuit and config
-  Transpile::Fusion transpile_fusion(Method method,
-                                     const Operations::OpSet &opset,
-                                     const Config &config) const;
-
-  // Return cache blocking transpiler pass
-  Transpile::CacheBlocking
-  transpile_cache_blocking(Controller::Method method, const Circuit &circ,
-                           const Noise::NoiseModel &noise,
-                           const Config &config) const;
-
-  // return maximum number of qubits for matrix
-  int_t get_max_matrix_qubits(const Circuit &circ) const;
-  int_t get_matrix_bits(const Operations::Op &op) const;
-
-  bool has_statevector_ops(const Circuit &circuit) const;
-
   //-----------------------------------------------------------------------
   // Parallelization Config
   //-----------------------------------------------------------------------
 
@@ -301,82 +169,32 @@ class Controller {
       const std::vector<std::shared_ptr<Circuit>> &circuits,
       const Noise::NoiseModel &noise, const std::vector<Method> &methods);
 
-  // Set circuit parallelization
-  void set_parallelization_circuit(const Circuit &circ,
-                                   const Noise::NoiseModel &noise,
-                                   const Method method);
-
-  bool multiple_chunk_required(const Circuit &circuit,
-                               const Noise::NoiseModel &noise,
-                               const Method method) const;
-
-  bool multiple_shots_required(const Circuit &circuit,
-                               const Noise::NoiseModel &noise,
-                               const Method method) const;
-
   void save_exception_to_results(Result &result, const std::exception &e) const;
 
   // Get system memory size
   size_t get_system_memory_mb();
   size_t get_gpu_memory_mb();
 
-  size_t get_min_memory_mb() const {
-    if (sim_device_ == Device::GPU && num_gpus_ > 0) {
-      return max_gpu_memory_mb_ / num_gpus_; // return per GPU memory size
-    }
-    return max_memory_mb_;
-  }
-
   // The maximum number of threads to use for various levels of
   // parallelization
   int max_parallel_threads_;
 
   // Parameters for parallelization management in configuration
   int max_parallel_experiments_;
-  int max_parallel_shots_;
 
   size_t
max_memory_mb_; size_t max_gpu_memory_mb_; - int num_gpus_; // max number of GPU per process // use explicit parallelization bool explicit_parallelization_; // Parameters for parallelization management for experiments int parallel_experiments_; - int parallel_shots_; - int parallel_state_update_; bool parallel_nested_ = false; - // max number of states can be stored on memory for batched - // multi-shots/experiments optimization - int max_batched_states_; - - // max number of qubits in given circuits - int max_qubits_; - - // results are stored independently in each process if true - bool accept_distributed_results_ = true; - // process information (MPI) int myrank_ = 0; int num_processes_ = 1; int num_process_per_experiment_ = 1; - - uint_t cache_block_qubit_ = 0; - - // multi-chunks are required to simulate circuits - bool multi_chunk_required_ = false; - - // config setting for multi-shot parallelization - bool batched_shots_gpu_ = true; - int_t batched_shots_gpu_max_qubits_ = - 16; // multi-shot parallelization is applied if qubits is less than max - // qubits - bool enable_batch_multi_shots_ = - false; // multi-shot parallelization can be applied - - // settings for cuStateVec - bool cuStateVec_enable_ = false; }; //========================================================================= @@ -389,21 +207,12 @@ class Controller { void Controller::set_config(const Config &config) { - // Load validation threshold - validation_threshold_ = config.validation_threshold; - - // Load config for memory (creg list data) - if (config.memory.has_value()) - save_creg_memory_ = config.memory.value(); - #ifdef _OPENMP // Load OpenMP maximum thread settings if (config.max_parallel_threads.has_value()) max_parallel_threads_ = config.max_parallel_threads.value(); if (config.max_parallel_experiments.has_value()) max_parallel_experiments_ = config.max_parallel_experiments.value(); - if (config.max_parallel_shots.has_value()) - max_parallel_shots_ = config.max_parallel_shots.value(); // Limit max threads based on number of available OpenMP threads auto omp_threads = omp_get_max_threads(); max_parallel_threads_ = (max_parallel_threads_ > 0) @@ -412,7 +221,6 @@ void Controller::set_config(const Config &config) { #else // No OpenMP so we disable parallelization max_parallel_threads_ = 1; - max_parallel_shots_ = 1; max_parallel_experiments_ = 1; parallel_nested_ = false; #endif @@ -430,38 +238,18 @@ void Controller::set_config(const Config &config) { // for debugging if (config._parallel_shots.has_value()) { - parallel_shots_ = config._parallel_shots.value(); explicit_parallelization_ = true; } // for debugging if (config._parallel_state_update.has_value()) { - parallel_state_update_ = config._parallel_state_update.value(); explicit_parallelization_ = true; } if (explicit_parallelization_) { parallel_experiments_ = std::max({parallel_experiments_, 1}); - parallel_shots_ = std::max({parallel_shots_, 1}); - parallel_state_update_ = std::max({parallel_state_update_, 1}); } - if (config.accept_distributed_results.has_value()) - accept_distributed_results_ = config.accept_distributed_results.value(); - - // enable multiple qregs if cache blocking is enabled - if (config.blocking_qubits.has_value()) - cache_block_qubit_ = config.blocking_qubits.value(); - - // enable batched multi-shots/experiments optimization - batched_shots_gpu_ = config.batched_shots_gpu; - batched_shots_gpu_max_qubits_ = config.batched_shots_gpu_max_qubits; - - // cuStateVec configs - cuStateVec_enable_ = false; - if 
(config.cuStateVec_enable.has_value()) - cuStateVec_enable_ = config.cuStateVec_enable.value(); - // Override automatic simulation method with a fixed method std::string method = config.method; if (config.method == "statevector") { @@ -485,9 +273,6 @@ void Controller::set_config(const Config &config) { method + std::string(").")); } - if (method_ == Method::density_matrix || method_ == Method::unitary) - batched_shots_gpu_max_qubits_ /= 2; - // Override automatic simulation method with a fixed method sim_device_name_ = config.device; if (sim_device_name_ == "CPU") { @@ -506,10 +291,13 @@ void Controller::set_config(const Config &config) { #else #ifndef AER_CUSTATEVEC - if (cuStateVec_enable_) { - // Aer is not built for cuStateVec - throw std::runtime_error("Simulation device \"GPU\" does not support " - "cuStateVec on this system"); + // cuStateVec configs + if (config.cuStateVec_enable.has_value()) { + if (config.cuStateVec_enable.value()) { + // Aer is not built for cuStateVec + throw std::runtime_error("Simulation device \"GPU\" does not support " + "cuStateVec on this system"); + } } #endif int nDev; @@ -546,7 +334,6 @@ void Controller::set_config(const Config &config) { void Controller::clear_config() { clear_parallelization(); - validation_threshold_ = 1e-8; method_ = Method::automatic; sim_device_ = Device::CPU; sim_precision_ = Precision::Double; @@ -555,18 +342,12 @@ void Controller::clear_config() { void Controller::clear_parallelization() { max_parallel_threads_ = 0; max_parallel_experiments_ = 1; - max_parallel_shots_ = 0; - max_batched_states_ = 1; parallel_experiments_ = 1; - parallel_shots_ = 1; - parallel_state_update_ = 1; parallel_nested_ = false; num_process_per_experiment_ = 1; - num_gpus_ = 0; - explicit_parallelization_ = false; max_memory_mb_ = get_system_memory_mb(); max_gpu_memory_mb_ = get_gpu_memory_mb(); @@ -575,35 +356,6 @@ void Controller::clear_parallelization() { void Controller::set_parallelization_experiments( const std::vector> &circuits, const Noise::NoiseModel &noise, const std::vector &methods) { - std::vector required_memory_mb_list(circuits.size()); - max_qubits_ = 0; - for (size_t j = 0; j < circuits.size(); j++) { - if (circuits[j]->num_qubits > max_qubits_) - max_qubits_ = circuits[j]->num_qubits; - required_memory_mb_list[j] = - required_memory_mb(*circuits[j], noise, methods[j]); - } - std::sort(required_memory_mb_list.begin(), required_memory_mb_list.end(), - std::greater<>()); - - // set max batchable number of circuits - if (batched_shots_gpu_) { - if (required_memory_mb_list[0] == 0 || max_qubits_ == 0) - max_batched_states_ = 1; - else { - if (sim_device_ == Device::GPU) { - max_batched_states_ = ((max_gpu_memory_mb_ / num_gpus_ * 8 / 10) / - required_memory_mb_list[0]) * - num_gpus_; - } else { - max_batched_states_ = - (max_memory_mb_ * 8 / 10) / required_memory_mb_list[0]; - } - } - } - if (max_qubits_ == 0) - max_qubits_ = 1; - if (explicit_parallelization_) return; @@ -626,6 +378,17 @@ void Controller::set_parallelization_experiments( } // If memory allows, execute experiments in parallel + std::vector required_memory_mb_list(circuits.size()); + for (size_t j = 0; j < circuits.size(); j++) { + std::shared_ptr executor = + make_circuit_executor(methods[j]); + required_memory_mb_list[j] = + executor->required_memory_mb(*circuits[j], noise); + executor.reset(); + } + std::sort(required_memory_mb_list.begin(), required_memory_mb_list.end(), + std::greater<>()); + size_t total_memory = 0; int parallel_experiments = 0; for (size_t 
required_memory_mb : required_memory_mb_list) { @@ -643,139 +406,6 @@ void Controller::set_parallelization_experiments( max_parallel_threads_, static_cast(circuits.size())}); } -void Controller::set_parallelization_circuit(const Circuit &circ, - const Noise::NoiseModel &noise, - const Method method) { - enable_batch_multi_shots_ = false; - if (batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && - max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits) { - enable_batch_multi_shots_ = true; - } - - if (sim_device_ == Device::GPU && cuStateVec_enable_) { - enable_batch_multi_shots_ = - false; // cuStateVec does not support batch execution of multi-shots - return; - } - - if (explicit_parallelization_) - return; - - // Check for trivial parallelization conditions - switch (method) { - case Method::statevector: - case Method::stabilizer: - case Method::unitary: - case Method::matrix_product_state: { - if (circ.shots == 1 || num_process_per_experiment_ > 1 || - (!noise.has_quantum_errors() && - check_measure_sampling_opt(circ, method))) { - parallel_shots_ = 1; - parallel_state_update_ = - std::max({1, max_parallel_threads_ / parallel_experiments_}); - return; - } - break; - } - case Method::density_matrix: - case Method::superop: - case Method::tensor_network: { - if (circ.shots == 1 || num_process_per_experiment_ > 1 || - check_measure_sampling_opt(circ, method)) { - parallel_shots_ = 1; - parallel_state_update_ = - std::max({1, max_parallel_threads_ / parallel_experiments_}); - return; - } - break; - } - case Method::extended_stabilizer: - break; - default: - throw std::invalid_argument( - "Cannot set parallelization for unresolved method."); - } - - // Use a local variable to not override stored maximum based - // on currently executed circuits - const auto max_shots = - (max_parallel_shots_ > 0) - ? std::min({max_parallel_shots_, max_parallel_threads_}) - : max_parallel_threads_; - - // If we are executing circuits in parallel we disable - // parallel shots - if (max_shots == 1 || parallel_experiments_ > 1) { - parallel_shots_ = 1; - } else { - // Parallel shots is > 1 - // Limit parallel shots by available memory and number of shots - // And assign the remaining threads to state update - int circ_memory_mb = - required_memory_mb(circ, noise, method) / num_process_per_experiment_; - size_t mem_size = - (sim_device_ == Device::GPU) ? max_gpu_memory_mb_ : max_memory_mb_; - if (mem_size < circ_memory_mb) - throw std::runtime_error( - "a circuit requires more memory than max_memory_mb."); - // If circ memory is 0, set it to 1 so that we don't divide by zero - circ_memory_mb = std::max({1, circ_memory_mb}); - - int shots = circ.shots; - parallel_shots_ = std::min( - {static_cast(mem_size / (circ_memory_mb * 2)), max_shots, shots}); - } - parallel_state_update_ = - (parallel_shots_ > 1) - ? 
std::max({1, max_parallel_threads_ / parallel_shots_}) - : std::max({1, max_parallel_threads_ / parallel_experiments_}); -} - -bool Controller::multiple_chunk_required(const Circuit &circ, - const Noise::NoiseModel &noise, - const Method method) const { - if (circ.num_qubits < 3) - return false; - if (cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits) - return true; - - if (num_process_per_experiment_ == 1 && sim_device_ == Device::GPU && - num_gpus_ > 0) { - return (max_gpu_memory_mb_ / num_gpus_ < - required_memory_mb(circ, noise, method)); - } - if (num_process_per_experiment_ > 1) { - size_t total_mem = max_memory_mb_; - if (sim_device_ == Device::GPU) - total_mem += max_gpu_memory_mb_; - if (total_mem * num_process_per_experiment_ > - required_memory_mb(circ, noise, method)) - return true; - } - - return false; -} - -bool Controller::multiple_shots_required(const Circuit &circ, - const Noise::NoiseModel &noise, - const Method method) const { - if (circ.shots < 2) - return false; - if (method == Method::density_matrix || method == Method::superop || - method == Method::unitary) { - return false; - } - - bool can_sample = check_measure_sampling_opt(circ, method); - - if (noise.is_ideal()) { - return !can_sample; - } - - return true; -} - size_t Controller::get_system_memory_mb() { size_t total_physical_memory = Utils::get_system_memory_mb(); #ifdef AER_MPI @@ -803,7 +433,6 @@ size_t Controller::get_gpu_memory_mb() { cudaMemGetInfo(&freeMem, &totalMem); total_physical_memory += totalMem; } - num_gpus_ = nDev; #endif #ifdef AER_MPI @@ -812,41 +441,11 @@ size_t Controller::get_gpu_memory_mb() { locMem = total_physical_memory; MPI_Allreduce(&locMem, &minMem, 1, MPI_UINT64_T, MPI_MIN, MPI_COMM_WORLD); total_physical_memory = minMem; - - int t = num_gpus_; - MPI_Allreduce(&t, &num_gpus_, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); #endif return total_physical_memory >> 20; } -Transpile::CacheBlocking Controller::transpile_cache_blocking( - Controller::Method method, const Circuit &circ, - const Noise::NoiseModel &noise, const Config &config) const { - Transpile::CacheBlocking cache_block_pass; - - const bool is_matrix = - (method == Method::density_matrix || method == Method::unitary); - const auto complex_size = (sim_precision_ == Precision::Single) - ? 
sizeof(std::complex) - : sizeof(std::complex); - - cache_block_pass.set_num_processes(num_process_per_experiment_); - cache_block_pass.set_config(config); - - if (!cache_block_pass.enabled()) { - // if blocking is not set by config, automatically set if required - if (multiple_chunk_required(circ, noise, method)) { - int nplace = num_process_per_experiment_; - if (sim_device_ == Device::GPU && num_gpus_ > 0) - nplace *= num_gpus_; - cache_block_pass.set_blocking(circ.num_qubits, get_min_memory_mb() << 20, - nplace, complex_size, is_matrix); - } - } - return cache_block_pass; -} - //------------------------------------------------------------------------- // Qobj execution //------------------------------------------------------------------------- @@ -916,18 +515,7 @@ Result Controller::execute(std::vector> &circuits, // Execute each circuit in a try block try { - // check if multi-chunk distribution is required - bool multi_chunk_required_ = false; - for (size_t j = 0; j < circuits.size(); j++) { - if (circuits[j]->num_qubits > 0) { - if (multiple_chunk_required(*circuits[j], noise_model, methods[j])) - multi_chunk_required_ = true; - } - } - if (multi_chunk_required_) - num_process_per_experiment_ = num_processes_; - else - num_process_per_experiment_ = 1; + num_process_per_experiment_ = num_processes_; // set parallelization for experiments try { @@ -938,23 +526,13 @@ Result Controller::execute(std::vector> &circuits, save_exception_to_results(result, e); } -#ifdef _OPENMP - result.metadata.add(true, "omp_enabled"); -#else - result.metadata.add(false, "omp_enabled"); -#endif result.metadata.add(parallel_experiments_, "parallel_experiments"); result.metadata.add(max_memory_mb_, "max_memory_mb"); result.metadata.add(max_gpu_memory_mb_, "max_gpu_memory_mb"); - // store rank and number of processes, if no distribution rank=0 procs=1 is - // set - result.metadata.add(num_process_per_experiment_, - "num_processes_per_experiments"); - result.metadata.add(num_processes_, "num_mpi_processes"); - result.metadata.add(myrank_, "mpi_rank"); - #ifdef _OPENMP + result.metadata.add(true, "omp_enabled"); + // Check if circuit parallelism is nested with one of the others if (parallel_experiments_ > 1 && parallel_experiments_ < max_parallel_threads_) { @@ -972,9 +550,18 @@ Result Controller::execute(std::vector> &circuits, } else { parallel_nested_ = false; } +#else + result.metadata.add(false, "omp_enabled"); #endif #ifdef AER_MPI + // store rank and number of processes, if no distribution rank=0 procs=1 is + // set + result.metadata.add(num_process_per_experiment_, + "num_processes_per_experiments"); + result.metadata.add(num_processes_, "num_mpi_processes"); + result.metadata.add(myrank_, "mpi_rank"); + // average random seed to set the same seed to each process (when // seed_simulator is not set) if (num_processes_ > 1) { @@ -995,16 +582,20 @@ Result Controller::execute(std::vector> &circuits, // in #pragma omp) if (parallel_experiments_ == 1) { for (int j = 0; j < NUM_RESULTS; ++j) { - set_parallelization_circuit(*circuits[j], noise_model, methods[j]); - run_circuit(*circuits[j], noise_model, methods[j], config, - result.results[j]); + std::shared_ptr executor = + make_circuit_executor(methods[j]); + executor->run_circuit(*circuits[j], noise_model, config, methods[j], + sim_device_, result.results[j]); + executor.reset(); } } else { #pragma omp parallel for num_threads(parallel_experiments_) for (int j = 0; j < NUM_RESULTS; ++j) { - set_parallelization_circuit(*circuits[j], noise_model, methods[j]); - 
run_circuit(*circuits[j], noise_model, methods[j], config,
-                    result.results[j]);
+        std::shared_ptr<CircuitExecutor::Base> executor =
+            make_circuit_executor(methods[j]);
+        executor->run_circuit(*circuits[j], noise_model, config, methods[j],
+                              sim_device_, result.results[j]);
+        executor.reset();
       }
     }
 
@@ -1042,813 +633,128 @@
 }
 
 //-------------------------------------------------------------------------
-// Base class override
+// Utility methods
 //-------------------------------------------------------------------------
 
-void Controller::run_circuit(const Circuit &circ,
-                             const Noise::NoiseModel &noise,
-                             const Method method, const Config &config,
-                             ExperimentResult &result) const {
-  // Run the circuit
-  switch (method) {
-  case Method::statevector: {
-    if (sim_device_ == Device::CPU) {
-      // Chunk based simualtion
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVector<double>>>(
-            circ, noise, config, Method::statevector, result);
-      } else {
-        // Single-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVector<float>>>(
-            circ, noise, config, Method::statevector, result);
-      }
-    } else {
-#ifdef AER_THRUST_SUPPORTED
-      // Chunk based simulation
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVectorThrust<double>>>(
-            circ, noise, config, Method::statevector, result);
-      } else {
-        // Single-precision Statevector simulation
-        return run_circuit_helper<
-            Statevector::State<QV::QubitVectorThrust<float>>>(
-            circ, noise, config, Method::statevector, result);
-      }
-#endif
-    }
-  }
-  case Method::density_matrix: {
-    if (sim_device_ == Device::CPU) {
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrix<double>>>(
-            circ, noise, config, Method::density_matrix, result);
-      } else {
-        // Single-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrix<float>>>(
-            circ, noise, config, Method::density_matrix, result);
-      }
-    } else {
-#ifdef AER_THRUST_SUPPORTED
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrixThrust<double>>>(
-            circ, noise, config, Method::density_matrix, result);
-      } else {
-        // Single-precision density matrix simulation
-        return run_circuit_helper<
-            DensityMatrix::State<QV::DensityMatrixThrust<float>>>(
-            circ, noise, config, Method::density_matrix, result);
-      }
-#endif
-    }
-  }
-  case Method::unitary: {
-    if (sim_device_ == Device::CPU) {
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrix<double>>>(
-            circ, noise, config, Method::unitary, result);
-      } else {
-        // Single-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrix<float>>>(
-            circ, noise, config, Method::unitary, result);
-      }
-    } else {
-#ifdef AER_THRUST_SUPPORTED
-      if (sim_precision_ == Precision::Double) {
-        // Double-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrixThrust<double>>>(
-            circ, noise, config, Method::unitary, result);
-      } else {
-        // Single-precision unitary simulation
-        return run_circuit_helper<
-            QubitUnitary::State<QV::UnitaryMatrixThrust<float>>>(
-            circ, noise, config, Method::unitary, result);
-      }
-#endif
-    }
-  }
-  case Method::superop: {
-    if (sim_precision_ == Precision::Double) {
-      return run_circuit_helper<
-          QubitSuperoperator::State<QV::Superoperator<double>>>(
-          circ, noise, config, Method::superop, result);
-    } else {
-      return run_circuit_helper<
-          QubitSuperoperator::State<QV::Superoperator<float>>>(
-          circ, noise, config, Method::superop, result);
-    }
-  }
-  case Method::stabilizer:
-    // Stabilizer simulation
-    // TODO: Stabilizer doesn't yet support custom state initialization
-    return run_circuit_helper<Stabilizer::State>(circ, noise, config,
-                                                 Method::stabilizer, result);
-  case Method::extended_stabilizer:
-    return run_circuit_helper<ExtendedStabilizer::State>(
-        circ, noise, config, Method::extended_stabilizer, result);
-  case Method::matrix_product_state:
-    return run_circuit_helper<MatrixProductState::State>(
-        circ, noise, config, Method::matrix_product_state, result);
-  case Method::tensor_network: {
-    if (sim_precision_ == Precision::Double) {
-      return run_circuit_helper<
-          TensorNetwork::State<TensorNetwork::TensorNet<double>>>(
-          circ, noise, config, Method::tensor_network, result);
-    } else {
-      return run_circuit_helper<
-          TensorNetwork::State<TensorNetwork::TensorNet<float>>>(
-          circ, noise, config, Method::tensor_network, result);
-    }
-  }
-  default:
-    throw std::runtime_error("Controller:Invalid simulation method");
-  }
-}
+std::shared_ptr<CircuitExecutor::Base>
+Controller::make_circuit_executor(const Method method) const {
+  // Run the circuit
+  switch (method) {
+  case Method::statevector:
+    if (sim_device_ == Device::CPU) {
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVector<double>>>>();
+      } else {
+        // Single-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVector<float>>>>();
+      }
+    } else {
+#ifdef AER_THRUST_SUPPORTED
+      // Chunk based simulation
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVectorThrust<double>>>>();
+      } else {
+        // Single-precision Statevector simulation
+        return std::make_shared<Statevector::Executor<
+            Statevector::State<QV::QubitVectorThrust<float>>>>();
+      }
+#endif
+    }
+    break;
+  case Method::density_matrix:
+    if (sim_device_ == Device::CPU) {
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrix<double>>>>();
+      } else {
+        // Single-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrix<float>>>>();
+      }
+    } else {
+#ifdef AER_THRUST_SUPPORTED
+      // Chunk based simulation
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrixThrust<double>>>>();
+      } else {
+        // Single-precision DensityMatrix simulation
+        return std::make_shared<DensityMatrix::Executor<
+            DensityMatrix::State<QV::DensityMatrixThrust<float>>>>();
+      }
+#endif
+    }
+    break;
+  case Method::unitary:
+    if (sim_device_ == Device::CPU) {
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrix<double>>>>();
+      } else {
+        // Single-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrix<float>>>>();
+      }
+    } else {
+#ifdef AER_THRUST_SUPPORTED
+      // Chunk based simulation
+      if (sim_precision_ == Precision::Double) {
+        // Double-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrixThrust<double>>>>();
+      } else {
+        // Single-precision unitary simulation
+        return std::make_shared<QubitUnitary::Executor<
+            QubitUnitary::State<QV::UnitaryMatrixThrust<float>>>>();
+      }
+#endif
+    }
+    break;
+  case Method::superop:
+    if (sim_precision_ == Precision::Double) {
+      return std::make_shared<CircuitExecutor::Executor<
+          QubitSuperoperator::State<QV::Superoperator<double>>>>();
+    } else {
+      return std::make_shared<CircuitExecutor::Executor<
+          QubitSuperoperator::State<QV::Superoperator<float>>>>();
+    }
+    break;
+  case Method::stabilizer: {
+    return std::make_shared<CircuitExecutor::Executor<Stabilizer::State>>();
+  } break;
+  case Method::extended_stabilizer: {
+    return std::make_shared<
+        CircuitExecutor::Executor<ExtendedStabilizer::State>>();
+  } break;
+  case Method::matrix_product_state: {
+    return std::make_shared<
+        CircuitExecutor::Executor<MatrixProductState::State>>();
+  } break;
+  case Method::tensor_network: {
+    if (sim_precision_ == Precision::Double) {
+      return std::make_shared<TensorNetwork::Executor<
+          TensorNetwork::State<TensorNetwork::TensorNet<double>>>>();
+    } else {
+      return std::make_shared<TensorNetwork::Executor<
+          TensorNetwork::State<TensorNetwork::TensorNet<float>>>>();
+    }
+  } break;
+  case Method::automatic:
+    throw std::runtime_error(
+        "Cannot make circuit executor for automatic simulation method.");
+  default:
+    throw std::runtime_error("Controller:Invalid simulation method");
+  }
+}
-
-//-------------------------------------------------------------------------
-// Utility methods
-//-------------------------------------------------------------------------
-
-size_t Controller::required_memory_mb(const Circuit &circ,
-                                      const Noise::NoiseModel &noise,
-                                      const Method method) const {
-  switch (method) {
-  case Method::statevector: {
-    if (sim_precision_ == Precision::Single) {
-      Statevector::State<QV::QubitVector<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      Statevector::State<QV::QubitVector<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::density_matrix: {
-    if (sim_precision_ == Precision::Single) {
-      DensityMatrix::State<QV::DensityMatrix<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      DensityMatrix::State<QV::DensityMatrix<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::unitary: {
-    if (sim_precision_ == Precision::Single) {
-      QubitUnitary::State<QV::UnitaryMatrix<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      QubitUnitary::State<QV::UnitaryMatrix<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::superop: {
-    if (sim_precision_ == Precision::Single) {
-      QubitSuperoperator::State<QV::Superoperator<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      QubitSuperoperator::State<QV::Superoperator<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  case Method::stabilizer: {
-    Stabilizer::State state;
-    return state.required_memory_mb(circ.num_qubits, circ.ops);
-  }
-  case Method::extended_stabilizer: {
-    ExtendedStabilizer::State state;
-    return state.required_memory_mb(circ.num_qubits, circ.ops);
-  }
-  case Method::matrix_product_state: {
-    MatrixProductState::State state;
-    return state.required_memory_mb(circ.num_qubits, circ.ops);
-  }
-  case Method::tensor_network: {
-    if (sim_precision_ == Precision::Single) {
-      TensorNetwork::State<TensorNetwork::TensorNet<float>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    } else {
-      TensorNetwork::State<TensorNetwork::TensorNet<double>> state;
-      return state.required_memory_mb(circ.num_qubits, circ.ops);
-    }
-  }
-  default:
-    // We shouldn't get here, so throw an exception if we do
-    throw std::runtime_error("Controller: Invalid simulation method");
-  }
-}
-
-Transpile::Fusion Controller::transpile_fusion(Method method,
-                                               const Operations::OpSet &opset,
-                                               const Config &config) const {
-  Transpile::Fusion fusion_pass;
-  fusion_pass.set_parallelization(parallel_state_update_);
-
-  if (opset.contains(Operations::OpType::superop)) {
-    fusion_pass.allow_superop = true;
-  }
-  if (opset.contains(Operations::OpType::kraus)) {
-    fusion_pass.allow_kraus = true;
-  }
-  switch (method) {
-  case Method::density_matrix:
-  case Method::superop: {
-    // Halve the default threshold and max fused qubits for density matrix
-    fusion_pass.threshold /= 2;
-    fusion_pass.max_qubit /= 2;
-    break;
-  }
-  case Method::matrix_product_state: {
-    fusion_pass.active = false;
-    return fusion_pass; // Do not allow the config to set active for MPS
-  }
-  case Method::statevector: {
-    if (fusion_pass.allow_kraus) {
-      // Halve default max fused qubits for Kraus noise fusion
-      fusion_pass.max_qubit /= 2;
-    }
-    break;
-  }
-  case Method::unitary: {
-    // max_qubit is the same with statevector
-    fusion_pass.threshold /= 2;
-    break;
-  }
-  case Method::tensor_network: {
-    if (opset.contains(Operations::OpType::save_statevec) ||
-        opset.contains(Operations::OpType::save_statevec_dict)) {
-      if (fusion_pass.allow_kraus) {
-        // Halve default max fused qubits for Kraus noise fusion
-        fusion_pass.max_qubit /= 2;
-      }
-    } else {
-      // Halve the default threshold and max fused qubits for density matrix
-      fusion_pass.threshold /= 2;
-      fusion_pass.max_qubit /= 2;
-    }
-    break;
-  }
-  default: {
-    fusion_pass.active = false;
-    return fusion_pass;
-  }
-  }
-  // Override default fusion settings with custom config
-  fusion_pass.set_config(config);
-  return fusion_pass;
-}
-
-//-------------------------------------------------------------------------
-// Run circuit helpers
-//-------------------------------------------------------------------------
-
-template <class State_t>
-void Controller::run_circuit_helper(const Circuit &circ,
-                                    const Noise::NoiseModel &noise,
-                                    const Config &config, const Method method,
-                                    ExperimentResult &result) const {
-  // Start individual circuit timer
-  auto timer_start = myclock_t::now(); // state circuit timer
-
-  // Execute in try block so we can catch errors and return the error message
-  // for individual circuit failures.
- try { - // Rng engine (this one is used to add noise on circuit) - RngEngine rng; - rng.set_seed(circ.seed); - - // Output data container - result.set_config(config); - result.metadata.add(method_names_.at(method), "method"); - if (method == Method::statevector || method == Method::density_matrix || - method == Method::unitary || method == Method::tensor_network) { - result.metadata.add(sim_device_name_, "device"); - } else { - result.metadata.add("CPU", "device"); - } - - // Circuit qubit metadata - result.metadata.add(circ.num_qubits, "num_qubits"); - result.metadata.add(circ.num_memory, "num_clbits"); - result.metadata.add(circ.qubits(), "active_input_qubits"); - result.metadata.add(circ.qubit_map(), "input_qubit_map"); - result.metadata.add(circ.remapped_qubits, "remapped_qubits"); - - // Add measure sampling to metadata - // Note: this will set to `true` if sampling is enabled for the circuit - result.metadata.add(false, "measure_sampling"); - result.metadata.add(false, "batched_shots_optimization"); - - if (circ.num_qubits > 0) { // do nothing for query steps - // Choose execution method based on noise and method - Circuit opt_circ; - bool noise_sampling = false; - - // Ideal circuit - if (noise.is_ideal()) { - opt_circ = circ; - result.metadata.add("ideal", "noise"); - } - // Readout error only - else if (noise.has_quantum_errors() == false) { - opt_circ = noise.sample_noise(circ, rng); - result.metadata.add("readout", "noise"); - } - // Superop noise sampling - else if (method == Method::density_matrix || method == Method::superop || - (method == Method::tensor_network && - !has_statevector_ops(circ))) { - // Sample noise using SuperOp method - opt_circ = - noise.sample_noise(circ, rng, Noise::NoiseModel::Method::superop); - result.metadata.add("superop", "noise"); - } - // Kraus noise sampling - else if (noise.opset().contains(Operations::OpType::kraus) || - noise.opset().contains(Operations::OpType::superop)) { - opt_circ = - noise.sample_noise(circ, rng, Noise::NoiseModel::Method::kraus); - result.metadata.add("kraus", "noise"); - } - // General circuit noise sampling - else { - if (enable_batch_multi_shots_ && !multi_chunk_required_) { - // batched optimization samples noise at runtime - opt_circ = noise.sample_noise( - circ, rng, Noise::NoiseModel::Method::circuit, true); - } else { - noise_sampling = true; - } - result.metadata.add("circuit", "noise"); - } - - if (noise_sampling) { - run_circuit_with_sampled_noise(circ, noise, config, method, - result); - } else { - // Run multishot simulation without noise sampling - run_circuit_without_sampled_noise(opt_circ, noise, config, - method, result); - } - } - - // Report success - result.status = ExperimentResult::Status::completed; - - // Pass through circuit header and add metadata - result.header = circ.header; - result.shots = circ.shots; - result.seed = circ.seed; - result.metadata.add(parallel_shots_, "parallel_shots"); - result.metadata.add(parallel_state_update_, "parallel_state_update"); - if (parallel_shots_ > 1 && parallel_state_update_ > 1) - result.metadata.add(true, "omp_nested"); - else - result.metadata.add(false, "omp_nested"); - - // Add timer data - auto timer_stop = myclock_t::now(); // stop timer - double time_taken = - std::chrono::duration(timer_stop - timer_start).count(); - result.time_taken = time_taken; - } - // If an exception occurs during execution, catch it and pass it to the output - catch (std::exception &e) { - result.status = ExperimentResult::Status::error; - result.message = e.what(); - } -} 
- -template -void Controller::run_single_shot(const Circuit &circ, State_t &state, - ExperimentResult &result, - RngEngine &rng) const { - state.initialize_qreg(circ.num_qubits); - state.initialize_creg(circ.num_memory, circ.num_registers); - state.apply_ops(circ.ops.cbegin(), circ.ops.cend(), result, rng, true); - result.save_count_data(state.cregs(), save_creg_memory_); -} - -template -void Controller::run_with_sampling(const Circuit &circ, State_t &state, - ExperimentResult &result, RngEngine &rng, - const uint_t block_bits, - const uint_t shots) const { - auto &ops = circ.ops; - auto first_meas = circ.first_measure_pos; // Position of first measurement op - bool final_ops = (first_meas == ops.size()); - - // allocate qubit register - state.allocate(circ.num_qubits, block_bits); - - // Run circuit instructions before first measure - state.initialize_qreg(circ.num_qubits); - state.initialize_creg(circ.num_memory, circ.num_registers); - - state.apply_ops(ops.cbegin(), ops.cbegin() + first_meas, result, rng, - final_ops); - - // Get measurement operations and set of measured qubits - measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), shots, state, - result, rng); -} - -template -void Controller::run_circuit_without_sampled_noise( - Circuit &circ, const Noise::NoiseModel &noise, const Config &config, - const Method method, ExperimentResult &result) const { - State_t state; - - // Validate gateset and memory requirements, raise exception if they're - // exceeded - validate_state(state, circ, noise, true); - - // Set state config - state.set_config(config); - state.set_parallelization(parallel_state_update_); - state.set_global_phase(circ.global_phase_angle); - state.enable_density_matrix(!has_statevector_ops(circ)); - - bool can_sample = circ.can_sample; - - // Optimize circuit - Noise::NoiseModel dummy_noise; - - auto fusion_pass = transpile_fusion(method, circ.opset(), config); - fusion_pass.optimize_circuit(circ, dummy_noise, state.opset(), result); - - // Cache blocking pass - uint_t block_bits = circ.num_qubits; - if (state.multi_chunk_distribution_supported()) { - auto cache_block_pass = - transpile_cache_blocking(method, circ, dummy_noise, config); - cache_block_pass.set_sample_measure(can_sample); - cache_block_pass.optimize_circuit(circ, dummy_noise, state.opset(), result); - if (cache_block_pass.enabled()) { - block_bits = cache_block_pass.block_bits(); - } - } - // Check if measure sampling supported - can_sample &= check_measure_sampling_opt(circ, method); - auto max_bits = get_max_matrix_qubits(circ); - - // Check if measure sampler and optimization are valid - if (can_sample) { - // Implement measure sampler - if (parallel_shots_ <= 1) { - state.set_distribution(num_process_per_experiment_); - state.set_max_matrix_qubits(max_bits); - RngEngine rng; - rng.set_seed(circ.seed); - run_with_sampling(circ, state, result, rng, block_bits, circ.shots); - } else { - // Vector to store parallel thread output data - std::vector par_results(parallel_shots_); - -#pragma omp parallel for num_threads(parallel_shots_) - for (int i = 0; i < parallel_shots_; i++) { - uint_t i_shot = circ.shots * i / parallel_shots_; - uint_t shot_end = circ.shots * (i + 1) / parallel_shots_; - uint_t this_shot = shot_end - i_shot; - - State_t shot_state; - // Set state config - shot_state.set_config(config); - shot_state.set_parallelization(parallel_state_update_); - shot_state.set_global_phase(circ.global_phase_angle); - shot_state.enable_density_matrix(!has_statevector_ops(circ)); - - 
shot_state.set_max_matrix_qubits(max_bits); - - RngEngine rng; - rng.set_seed(circ.seed + i); - - run_with_sampling(circ, shot_state, par_results[i], rng, block_bits, - this_shot); - - shot_state.add_metadata(par_results[i]); - } - for (auto &res : par_results) { - result.combine(std::move(res)); - } - - if (sim_device_name_ == "GPU") { - if (parallel_shots_ >= num_gpus_) - result.metadata.add(num_gpus_, "gpu_parallel_shots_"); - else - result.metadata.add(parallel_shots_, "gpu_parallel_shots_"); - } - } - // Add measure sampling metadata - result.metadata.add(true, "measure_sampling"); - - } else { - // Perform standard execution if we cannot apply the - // measurement sampling optimization - - if (block_bits == circ.num_qubits && enable_batch_multi_shots_ && - state.multi_shot_parallelization_supported()) { - // apply batched multi-shots optimization (currenly only on GPU) - state.set_max_bached_shots(max_batched_states_); - state.set_distribution(num_processes_); - state.set_max_matrix_qubits(max_bits); - state.set_num_creg_bits(circ.num_memory, circ.num_registers); - state.allocate(circ.num_qubits, circ.num_qubits, - circ.shots); // allocate multiple-shots - - // qreg is initialized inside state class - state.initialize_creg(circ.num_memory, circ.num_registers); - - state.apply_ops_multi_shots(circ.ops.cbegin(), circ.ops.cend(), noise, - result, circ.seed, true); - - result.save_count_data(state.cregs(), save_creg_memory_); - - // Add batched multi-shots optimizaiton metadata - result.metadata.add(true, "batched_shots_optimization"); - } else { - std::vector par_results(parallel_shots_); - int_t par_shots = parallel_shots_; - if (block_bits != circ.num_qubits) - par_shots = 1; - - auto run_circuit_without_sampled_noise_lambda = - [this, &par_results, circ, noise, config, method, block_bits, - max_bits, par_shots](int_t i) { - uint_t i_shot, shot_end; - i_shot = circ.shots * i / par_shots; - shot_end = circ.shots * (i + 1) / par_shots; - - State_t par_state; - // Set state config - par_state.set_config(config); - par_state.set_parallelization(parallel_state_update_); - par_state.set_global_phase(circ.global_phase_angle); - par_state.enable_density_matrix(!has_statevector_ops(circ)); - - par_state.set_distribution(num_process_per_experiment_); - par_state.set_max_matrix_qubits(max_bits); - - // allocate qubit register - par_state.allocate(circ.num_qubits, block_bits); - - for (; i_shot < shot_end; i_shot++) { - RngEngine rng; - rng.set_seed(circ.seed + i_shot); - run_single_shot(circ, par_state, par_results[i], rng); - } - par_state.add_metadata(par_results[i]); - }; - Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, - run_circuit_without_sampled_noise_lambda); - - for (auto &res : par_results) { - result.combine(std::move(res)); - } - if (sim_device_name_ == "GPU") { - if (par_shots >= num_gpus_) - result.metadata.add(num_gpus_, "gpu_parallel_shots_"); - else - result.metadata.add(par_shots, "gpu_parallel_shots_"); - } - } - } - state.add_metadata(result); -} - -template -void Controller::run_circuit_with_sampled_noise( - const Circuit &circ, const Noise::NoiseModel &noise, const Config &config, - const Method method, ExperimentResult &result) const { - std::vector par_results(parallel_shots_); - - auto run_circuit_with_sampled_noise_lambda = [this, &par_results, circ, noise, - config, method](int_t i) { - State_t state; - uint_t i_shot, shot_end; - Noise::NoiseModel dummy_noise; - - // Validate gateset and memory requirements, raise exception if they're - // exceeded - 
validate_state(state, circ, noise, true); - - // Set state config - state.set_config(config); - state.set_parallelization(parallel_state_update_); - state.set_global_phase(circ.global_phase_angle); - state.enable_density_matrix(!has_statevector_ops(circ)); - - // Transpilation for circuit noise method - auto fusion_pass = transpile_fusion(method, circ.opset(), config); - auto cache_block_pass = - transpile_cache_blocking(method, circ, noise, config); - - i_shot = circ.shots * i / parallel_shots_; - shot_end = circ.shots * (i + 1) / parallel_shots_; - - for (; i_shot < shot_end; i_shot++) { - RngEngine rng; - rng.set_seed(circ.seed + i_shot); - - // Sample noise using circuit method - Circuit noise_circ = noise.sample_noise(circ, rng); - - noise_circ.shots = 1; - fusion_pass.optimize_circuit(noise_circ, dummy_noise, state.opset(), - par_results[i]); - uint_t block_bits = circ.num_qubits; - if (state.multi_chunk_distribution_supported()) { - cache_block_pass.optimize_circuit(noise_circ, dummy_noise, - state.opset(), par_results[i]); - if (cache_block_pass.enabled()) { - block_bits = cache_block_pass.block_bits(); - } - } - - state.set_distribution(num_process_per_experiment_); - state.set_max_matrix_qubits(get_max_matrix_qubits(noise_circ)); - // allocate qubit register - state.allocate(noise_circ.num_qubits, block_bits); - - run_single_shot(noise_circ, state, par_results[i], rng); - } - state.add_metadata(par_results[i]); - }; - Utils::apply_omp_parallel_for((parallel_shots_ > 1), 0, parallel_shots_, - run_circuit_with_sampled_noise_lambda); - - for (auto &res : par_results) { - result.combine(std::move(res)); - } - - if (sim_device_name_ == "GPU") { - if (parallel_shots_ >= num_gpus_) - result.metadata.add(num_gpus_, "gpu_parallel_shots_"); - else - result.metadata.add(parallel_shots_, "gpu_parallel_shots_"); - } -} - -//------------------------------------------------------------------------- -// Measure sampling optimization -//------------------------------------------------------------------------- - -bool Controller::check_measure_sampling_opt(const Circuit &circ, - const Method method) const { - // Check if circuit has sampling flag disabled - if (circ.can_sample == false) { - return false; - } - - // If density matrix, unitary, superop method all supported instructions - // allow sampling - if (method == Method::density_matrix || method == Method::superop || - method == Method::unitary) { - return true; - } - if (method == Method::tensor_network) { - // if there are no save statevec ops, tensor network simulator runs as - // density matrix simulator - if ((!circ.opset().contains(Operations::OpType::save_statevec)) && - (!circ.opset().contains(Operations::OpType::save_statevec_dict))) { - return true; - } - } - - // If circuit contains a non-initial initialize that is not a full width - // instruction we can't sample - if (circ.can_sample_initialize == false) { - return false; - } - - // Check if non-density matrix simulation and circuit contains - // a stochastic instruction before measurement - // ie. reset, kraus, superop - // TODO: - // * Resets should be allowed if applied to |0> state (no gates before). 
- if (circ.opset().contains(Operations::OpType::reset) || - circ.opset().contains(Operations::OpType::kraus) || - circ.opset().contains(Operations::OpType::superop) || - circ.opset().contains(Operations::OpType::jump) || - circ.opset().contains(Operations::OpType::mark)) { - return false; - } - // Otherwise true - return true; -} - -template -void Controller::measure_sampler(InputIterator first_meas, - InputIterator last_meas, uint_t shots, - State_t &state, ExperimentResult &result, - RngEngine &rng, int_t shot_index) const { - // Check if meas_circ is empty, and if so return initial creg - if (first_meas == last_meas) { - while (shots-- > 0) { - result.save_count_data(state.cregs(), save_creg_memory_); - } - return; - } - - std::vector meas_ops; - std::vector roerror_ops; - for (auto op = first_meas; op != last_meas; op++) { - if (op->type == Operations::OpType::roerror) { - roerror_ops.push_back(*op); - } else { /*(op.type == Operations::OpType::measure) */ - meas_ops.push_back(*op); - } - } - - // Get measured qubits from circuit sort and delete duplicates - std::vector meas_qubits; // measured qubits - for (const auto &op : meas_ops) { - for (size_t j = 0; j < op.qubits.size(); ++j) - meas_qubits.push_back(op.qubits[j]); - } - sort(meas_qubits.begin(), meas_qubits.end()); - meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()), - meas_qubits.end()); - - // Generate the samples - uint_t shots_or_index; - if (shot_index < 0) - shots_or_index = shots; - else - shots_or_index = shot_index; - - auto timer_start = myclock_t::now(); - auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng); - auto time_taken = - std::chrono::duration(myclock_t::now() - timer_start).count(); - result.metadata.add(time_taken, "sample_measure_time"); - - // Make qubit map of position in vector of measured qubits - std::unordered_map qubit_map; - for (uint_t j = 0; j < meas_qubits.size(); ++j) { - qubit_map[meas_qubits[j]] = j; - } - - // Maps of memory and register to qubit position - std::map memory_map; - std::map register_map; - for (const auto &op : meas_ops) { - for (size_t j = 0; j < op.qubits.size(); ++j) { - auto pos = qubit_map[op.qubits[j]]; - if (!op.memory.empty()) - memory_map[op.memory[j]] = pos; - if (!op.registers.empty()) - register_map[op.registers[j]] = pos; - } - } - - // Process samples - uint_t num_memory = - (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first; - uint_t num_registers = - (register_map.empty()) ? 
0ULL : 1 + register_map.rbegin()->first; - ClassicalRegister creg; - while (!all_samples.empty()) { - auto sample = all_samples.back(); - creg.initialize(num_memory, num_registers); - - // process memory bit measurements - for (const auto &pair : memory_map) { - creg.store_measure(reg_t({sample[pair.second]}), reg_t({pair.first}), - reg_t()); - } - // process register bit measurements - for (const auto &pair : register_map) { - creg.store_measure(reg_t({sample[pair.second]}), reg_t(), - reg_t({pair.first})); - } - - // process read out errors for memory and registers - for (const Operations::Op &roerror : roerror_ops) { - creg.apply_roerror(roerror, rng); - } - - // Save count data - result.save_count_data(creg, save_creg_memory_); - - // pop off processed sample - all_samples.pop_back(); + throw std::runtime_error("Controller:Invalid simulation method"); } } -//------------------------------------------------------------------------- -// Validation -//------------------------------------------------------------------------- - -std::vector +std::vector Controller::simulation_methods(std::vector> &circuits, Noise::NoiseModel &noise_model) const { // Does noise model contain kraus noise @@ -1904,7 +810,7 @@ Controller::simulation_methods(std::vector> &circuits, return sim_methods; } -Controller::Method Controller::automatic_simulation_method( +Method Controller::automatic_simulation_method( const Circuit &circ, const Noise::NoiseModel &noise_model) const { // If circuit and noise model are Clifford run on Stabilizer simulator if (validate_method(Method::stabilizer, circ, noise_model, false)) { @@ -1918,7 +824,7 @@ Controller::Method Controller::automatic_simulation_method( if (noise_model.has_quantum_errors() && circ.num_qubits < 64 && circ.shots > (1ULL << circ.num_qubits) && validate_method(Method::density_matrix, circ, noise_model, false) && - check_measure_sampling_opt(circ, Method::density_matrix)) { + circ.can_sample) { return Method::density_matrix; } @@ -1942,95 +848,6 @@ Controller::Method Controller::automatic_simulation_method( return Method::statevector; } -bool Controller::validate_method(Method method, const Circuit &circ, - const Noise::NoiseModel &noise_model, - bool throw_except) const { - // Switch wrapper for templated function validate_state - switch (method) { - case Method::stabilizer: - return validate_state(Stabilizer::State(), circ, noise_model, throw_except); - case Method::extended_stabilizer: - return validate_state(ExtendedStabilizer::State(), circ, noise_model, - throw_except); - case Method::matrix_product_state: - return validate_state(MatrixProductState::State(), circ, noise_model, - throw_except); - case Method::statevector: - return validate_state(Statevector::State<>(), circ, noise_model, - throw_except); - case Method::density_matrix: - return validate_state(DensityMatrix::State<>(), circ, noise_model, - throw_except); - case Method::unitary: - return validate_state(QubitUnitary::State<>(), circ, noise_model, - throw_except); - case Method::superop: - return validate_state(QubitSuperoperator::State<>(), circ, noise_model, - throw_except); - case Method::tensor_network: - return validate_state(TensorNetwork::State<>(), circ, noise_model, - throw_except); - case Method::automatic: - throw std::runtime_error( - "Cannot validate circuit for unresolved simulation method."); - } -} - -template -bool Controller::validate_state(const state_t &state, const Circuit &circ, - const Noise::NoiseModel &noise, - bool throw_except) const { - std::stringstream error_msg; 
- std::string circ_name; - JSON::get_value(circ_name, "name", circ.header); - - // Check if a circuit is valid for state ops - bool circ_valid = state.opset().contains(circ.opset()); - if (throw_except && !circ_valid) { - error_msg << "Circuit " << circ_name << " contains invalid instructions "; - error_msg << state.opset().difference(circ.opset()); - error_msg << " for \"" << state.name() << "\" method."; - } - - // Check if a noise model valid for state ops - bool noise_valid = noise.is_ideal() || state.opset().contains(noise.opset()); - if (throw_except && !noise_valid) { - error_msg << "Noise model contains invalid instructions "; - error_msg << state.opset().difference(noise.opset()); - error_msg << " for \"" << state.name() << "\" method."; - } - - // Validate memory requirements - bool memory_valid = true; - if (max_memory_mb_ > 0) { - size_t required_mb = state.required_memory_mb(circ.num_qubits, circ.ops) / - num_process_per_experiment_; - size_t mem_size = (sim_device_ == Device::GPU) - ? max_memory_mb_ + max_gpu_memory_mb_ - : max_memory_mb_; - memory_valid = (required_mb <= mem_size); - if (throw_except && !memory_valid) { - error_msg << "Insufficient memory to run circuit " << circ_name; - error_msg << " using the " << state.name() << " simulator."; - error_msg << " Required memory: " << required_mb - << "M, max memory: " << max_memory_mb_ << "M"; - if (sim_device_ == Device::GPU) { - error_msg << " (Host) + " << max_gpu_memory_mb_ << "M (GPU)"; - } - } - } - - if (noise_valid && circ_valid && memory_valid) { - return true; - } - - // One of the validation checks failed for the current state - if (throw_except) { - throw std::runtime_error(error_msg.str()); - } - return false; -} - void Controller::save_exception_to_results(Result &result, const std::exception &e) const { result.status = Result::Status::error; @@ -2041,40 +858,25 @@ void Controller::save_exception_to_results(Result &result, } } -int_t Controller::get_matrix_bits(const Operations::Op &op) const { - int_t bit = 1; - if (op.type == Operations::OpType::matrix || - op.type == Operations::OpType::diagonal_matrix || - op.type == Operations::OpType::initialize) - bit = op.qubits.size(); - else if (op.type == Operations::OpType::kraus || - op.type == Operations::OpType::superop) { - if (method_ == Method::density_matrix) - bit = op.qubits.size() * 2; - else - bit = op.qubits.size(); - } - return bit; -} - -int_t Controller::get_max_matrix_qubits(const Circuit &circ) const { - int_t max_bits = 0; - int_t i; - - for (i = 0; i < circ.ops.size(); i++) { - int_t bit = 1; - bit = get_matrix_bits(circ.ops[i]); - max_bits = std::max(max_bits, bit); - } - return max_bits; -} - bool Controller::has_statevector_ops(const Circuit &circ) const { return circ.opset().contains(Operations::OpType::save_statevec) || circ.opset().contains(Operations::OpType::save_statevec_dict) || circ.opset().contains(Operations::OpType::save_amps); } +//------------------------------------------------------------------------- +// Validation +//------------------------------------------------------------------------- +bool Controller::validate_method(Method method, const Circuit &circ, + const Noise::NoiseModel &noise_model, + bool throw_except) const { + std::shared_ptr executor = + make_circuit_executor(method); + bool ret = executor->validate_state(circ, noise_model, throw_except); + executor.reset(); + return ret; +} + //------------------------------------------------------------------------- } // end namespace AER 
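// Aside (illustrative sketch, not part of this patch): validate_method above
// relies on a make_circuit_executor factory that is referenced but not shown
// here; assuming it dispatches on Method the way the old switch did, a
// minimal statevector case might look like the following (other methods
// would map to their own executor/state types):
//
//   std::shared_ptr<CircuitExecutor::Base>
//   Controller::make_circuit_executor(const Method method) const {
//     if (method == Method::statevector)
//       return std::make_shared<
//           CircuitExecutor::Executor<Statevector::State<>>>();
//     // ... remaining methods ...
//     throw std::runtime_error("Controller: Invalid simulation method");
//   }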
//------------------------------------------------------------------------- diff --git a/src/controllers/state_controller.hpp b/src/controllers/state_controller.hpp index bb7ac166e7..52791d16f6 100644 --- a/src/controllers/state_controller.hpp +++ b/src/controllers/state_controller.hpp @@ -804,7 +804,7 @@ reg_t AerState::initialize_statevector(uint_t num_of_qubits, complex_t *data, auto qv = QV::QubitVector(); qv.move_from_vector(std::move(vec)); - state->initialize_qreg(num_of_qubits_, std::move(qv)); + state->initialize_statevector(num_of_qubits_, std::move(qv)); state->initialize_creg(num_of_qubits_, num_of_qubits_); initialized_ = true; diff --git a/src/framework/config.hpp b/src/framework/config.hpp index 56a8015a0b..60a5d7c313 100644 --- a/src/framework/config.hpp +++ b/src/framework/config.hpp @@ -100,6 +100,9 @@ struct Config { bool batched_shots_gpu = false; uint_t batched_shots_gpu_max_qubits = 16; optional num_threads_per_device; + // # multi-shot branching + bool shot_branching_enable = false; + bool shot_branching_sampling_enable = false; // # statevector options uint_t statevector_parallel_threshold = 14; uint_t statevector_sample_measure_opt = 10; @@ -167,6 +170,7 @@ struct Config { optional unitary_parallel_threshold; optional memory_blocking_bits; optional extended_stabilizer_norm_estimation_default_samples; + optional target_gpus; void clear() { shots = 1024; @@ -201,6 +205,9 @@ struct Config { batched_shots_gpu = false; batched_shots_gpu_max_qubits = 16; num_threads_per_device.clear(); + // # multi-shot branching + shot_branching_enable = false; + shot_branching_sampling_enable = false; // # statevector options statevector_parallel_threshold = 14; statevector_sample_measure_opt = 10; @@ -263,6 +270,7 @@ struct Config { unitary_parallel_threshold.clear(); memory_blocking_bits.clear(); extended_stabilizer_norm_estimation_default_samples.clear(); + target_gpus.clear(); } void merge(const Config &other) { @@ -312,6 +320,9 @@ struct Config { batched_shots_gpu_max_qubits = other.batched_shots_gpu_max_qubits; if (other.num_threads_per_device.has_value()) num_threads_per_device.value(other.num_threads_per_device.value()); + // # multi-shot branching + shot_branching_enable = other.shot_branching_enable; + shot_branching_sampling_enable = other.shot_branching_sampling_enable; // # statevector options statevector_parallel_threshold = other.statevector_parallel_threshold; statevector_sample_measure_opt = other.statevector_sample_measure_opt; @@ -401,6 +412,8 @@ struct Config { if (other.extended_stabilizer_norm_estimation_default_samples.has_value()) extended_stabilizer_norm_estimation_default_samples.value( other.extended_stabilizer_norm_estimation_default_samples.value()); + if (other.target_gpus.has_value()) + target_gpus.value(other.target_gpus.value()); } }; @@ -440,6 +453,10 @@ inline void from_json(const json_t &js, Config &config) { get_value(config.batched_shots_gpu_max_qubits, "batched_shots_gpu_max_qubits", js); get_value(config.num_threads_per_device, "num_threads_per_device", js); + // # multi-shot branching + get_value(config.shot_branching_enable, "shot_branching_enable", js); + get_value(config.shot_branching_sampling_enable, + "shot_branching_sampling_enable", js); // # statevector options get_value(config.statevector_parallel_threshold, "statevector_parallel_threshold", js); @@ -511,6 +528,7 @@ inline void from_json(const json_t &js, Config &config) { get_value(config.memory_blocking_bits, "memory_blocking_bits", js); 
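// Aside (illustrative, not part of this patch): the options added to Config
// can be set programmatically as well as via JSON; plain bools are assigned
// directly, while optional fields such as target_gpus use their value()
// setter, as the merge() logic above does:
//
//   AER::Config config;
//   config.shot_branching_enable = true;          // branch states at runtime
//   config.shot_branching_sampling_enable = true; // sample final measures
//   config.target_gpus.value(reg_t({0, 1}));      // restrict to GPUs 0 and 1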
get_value(config.extended_stabilizer_norm_estimation_default_samples, "extended_stabilizer_norm_estimation_default_samples", js); + get_value(config.target_gpus, "target_gpus", js); } } // namespace AER diff --git a/src/framework/operations.hpp b/src/framework/operations.hpp old mode 100644 new mode 100755 index da1f575054..4ec55757ff --- a/src/framework/operations.hpp +++ b/src/framework/operations.hpp @@ -61,6 +61,7 @@ enum class OpType { superop, roerror, noise_switch, + sample_noise, // Save instructions save_state, save_expval, @@ -207,6 +208,9 @@ inline std::ostream &operator<<(std::ostream &stream, const OpType &type) { case OpType::qerror_loc: stream << "qerror_loc"; break; + case OpType::sample_noise: + stream << "sample_noise"; + break; case OpType::noise_switch: stream << "noise_switch"; break; diff --git a/src/framework/results/experiment_result.hpp b/src/framework/results/experiment_result.hpp index b956e5f06b..f8e2771e61 100644 --- a/src/framework/results/experiment_result.hpp +++ b/src/framework/results/experiment_result.hpp @@ -62,8 +62,6 @@ struct ExperimentResult { // save creg as count data void save_count_data(const ClassicalRegister &creg, bool save_memory); - void save_count_data(const std::vector &cregs, - bool save_memory); // Save data type which can be averaged over all shots. // This supports DataSubTypes: list, c_list, accum, c_accum, average, @@ -148,12 +146,6 @@ void ExperimentResult::save_count_data(const ClassicalRegister &creg, } } -void ExperimentResult::save_count_data( - const std::vector &cregs, bool save_memory) { - for (int_t i = 0; i < cregs.size(); i++) - save_count_data(cregs[i], save_memory); -} - template void ExperimentResult::save_data_average(const ClassicalRegister &creg, const std::string &key, const T &datum, diff --git a/src/framework/utils.hpp b/src/framework/utils.hpp old mode 100644 new mode 100755 index e2d3b8407b..6c3cc52d77 --- a/src/framework/utils.hpp +++ b/src/framework/utils.hpp @@ -1327,6 +1327,30 @@ double apply_omp_parallel_for_reduction(bool enabled, int_t i_begin, return val; } +// apply OpenMP parallel loop to lambda function and return reduced integer if +// enabled +template +int apply_omp_parallel_for_reduction_int(bool enabled, int_t i_begin, + int_t i_end, Lambda &func, + int nthreads = 0) { + int val = 0; + if (enabled) { + if (nthreads > 0) { +#pragma omp parallel for reduction(+ : val) num_threads(nthreads) + for (int_t i = i_begin; i < i_end; i++) + val += func(i); + } else { +#pragma omp parallel for reduction(+ : val) + for (int_t i = i_begin; i < i_end; i++) + val += func(i); + } + } else { + for (int_t i = i_begin; i < i_end; i++) + val += func(i); + } + return val; +} + //------------------------------------------------------------------------------ } // end namespace Utils //------------------------------------------------------------------------------ diff --git a/src/noise/noise_model.hpp b/src/noise/noise_model.hpp index d1207fa4b2..feff38054e 100644 --- a/src/noise/noise_model.hpp +++ b/src/noise/noise_model.hpp @@ -528,7 +528,7 @@ NoiseModel::sample_noise_helper(const Operations::Op &op, RngEngine &rng, // Combine errors auto &noise_ops = noise_before; noise_ops.reserve(noise_before.size() + noise_after.size() + 1); - if (op.type != Operations::OpType::qerror_loc) { + if (op.type != Operations::OpType::sample_noise) { noise_ops.push_back(op); } noise_ops.insert(noise_ops.end(), @@ -802,7 +802,7 @@ NoiseModel::NoiseOps NoiseModel::create_noise_loc(const Operations::Op &op) const { NoiseOps ops(1); ops[0] 
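// Aside (illustrative, not part of this patch): the reduction helper added to
// framework/utils.hpp above sums the integer results of a lambda over an
// index range, in parallel when enabled. The lambda is taken by lvalue
// reference, so name it first; 'flags' and 'num_shots' are hypothetical:
//
//   auto count_set = [&flags](int_t i) -> int { return flags[i] ? 1 : 0; };
//   int total = Utils::apply_omp_parallel_for_reduction_int(
//       true, 0, num_shots, count_set);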
= op;
-  ops[0].type = Operations::OpType::qerror_loc;
+  ops[0].type = Operations::OpType::sample_noise;
   return ops;
 }
diff --git a/src/simulators/batch_shots_executor.hpp b/src/simulators/batch_shots_executor.hpp
new file mode 100644
index 0000000000..e0e7b544a8
--- /dev/null
+++ b/src/simulators/batch_shots_executor.hpp
@@ -0,0 +1,477 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _batch_shots_executor_hpp_
+#define _batch_shots_executor_hpp_
+
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+//-------------------------------------------------------------------------
+// batched-shots executor class implementation
+//-------------------------------------------------------------------------
+template <class state_t>
+class BatchShotsExecutor : public virtual MultiStateExecutor<state_t> {
+  using Base = MultiStateExecutor<state_t>;
+
+protected:
+  // config setting for multi-shot parallelization
+  bool batched_shots_gpu_ = true;
+  int_t batched_shots_gpu_max_qubits_ =
+      16; // multi-shot parallelization is applied if the number of qubits
+          // is less than this maximum
+  bool enable_batch_multi_shots_ =
+      false; // multi-shot parallelization can be applied
+  uint_t local_state_index_; // local shot ID of current loop
+public:
+  BatchShotsExecutor();
+  virtual ~BatchShotsExecutor();
+
+protected:
+  void set_config(const Config &config) override;
+  void set_parallelization(const Circuit &circ,
+                           const Noise::NoiseModel &noise) override;
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  // apply ops for multi-shots to one group
+  template <typename InputIterator>
+  void apply_ops_batched_shots_for_group(int_t i_group, InputIterator first,
+                                         InputIterator last,
+                                         const Noise::NoiseModel &noise,
+                                         ExperimentResult &result,
+                                         RngEngine &init_rng, uint_t rng_seed,
+                                         bool final_ops);
+
+  // apply op to multiple shots, return false if op is not supported to
+  // execute in a batch
+  virtual bool apply_batched_op(const int_t istate, const Operations::Op &op,
+                                ExperimentResult &result,
+                                std::vector<RngEngine> &rng,
+                                bool final_op = false) {
+    return false;
+  }
+
+  // apply sampled noise to multiple-shots (this is used for ops containing
+  // non-Pauli operators)
+  void apply_batched_noise_ops(
+      const int_t i_group, const std::vector<std::vector<Operations::Op>> &ops,
+      ExperimentResult &result, std::vector<RngEngine> &rng);
+};
+
+template <class state_t>
+BatchShotsExecutor<state_t>::BatchShotsExecutor() {}
+
+template <class state_t>
+BatchShotsExecutor<state_t>::~BatchShotsExecutor() {}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::set_config(const Config &config) {
+  Base::set_config(config);
+
+  // enable batched multi-shots/experiments optimization
+  batched_shots_gpu_ = config.batched_shots_gpu;
+
+  batched_shots_gpu_max_qubits_ = config.batched_shots_gpu_max_qubits;
+  if (Base::method_ == Method::density_matrix ||
+      Base::method_ == Method::unitary)
+    batched_shots_gpu_max_qubits_ /= 2;
+}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::set_parallelization(
+    const Circuit &circ, const Noise::NoiseModel &noise) {
+  Base::set_parallelization(circ, noise);
+
+  enable_batch_multi_shots_ = false;
+  if (batched_shots_gpu_ && Base::sim_device_ != Device::CPU) {
+    enable_batch_multi_shots_ = true;
+    if (circ.num_qubits >= batched_shots_gpu_max_qubits_)
+      enable_batch_multi_shots_ = false;
+    else if (circ.shots == 1)
+      enable_batch_multi_shots_ = false;
+    //    else if (Base::multiple_chunk_required(circ, noise))
+    //      enable_batch_multi_shots_ = false;
+  }
+
+#ifdef AER_CUSTATEVEC
+  // disable cuStateVec for batch-shots optimization
+  if (enable_batch_multi_shots_ && Base::cuStateVec_enable_)
+    Base::cuStateVec_enable_ = false;
+#endif
+}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::run_circuit_shots(
+    Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
+    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+  state_t dummy_state;
+  // if batched-shot is not applicable, use base multi-shots executor
+  if (!enable_batch_multi_shots_) {
+    return Base::run_circuit_shots(circ, noise, config, init_rng, result,
+                                   sample_noise);
+  }
+
+  Noise::NoiseModel dummy_noise;
+
+  Base::num_qubits_ = circ.num_qubits;
+  Base::num_creg_memory_ = circ.num_memory;
+  Base::num_creg_registers_ = circ.num_registers;
+
+  if (Base::sim_device_ == Device::GPU) {
+#ifdef _OPENMP
+    if (omp_get_num_threads() == 1)
+      Base::shot_omp_parallel_ = true;
+#endif
+  } else if (Base::sim_device_ == Device::ThrustCPU) {
+    Base::shot_omp_parallel_ = false;
+  }
+
+  Base::set_distribution(circ.shots);
+  Base::num_max_shots_ = Base::get_max_parallel_shots(circ, noise);
+  if (Base::num_max_shots_ == 0)
+    Base::num_max_shots_ = 1;
+
+  RngEngine rng = init_rng;
+
+  Circuit circ_opt;
+  if (sample_noise)
+    circ_opt =
+        noise.sample_noise(circ, rng, Noise::NoiseModel::Method::circuit, true);
+  else
+    circ_opt = circ;
+  auto fusion_pass = Base::transpile_fusion(circ_opt.opset(), config);
+
+  fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
+                               result);
+  Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
+
+  // Add batched multi-shots optimization metadata
+  result.metadata.add(true, "batched_shots_optimization");
+
+  int_t i;
+  int_t i_begin, n_shots;
+
+#ifdef AER_MPI
+  // if shots are distributed to MPI processes, allocate cregs to be gathered
+  if (Base::num_process_per_experiment_ > 1)
+    Base::cregs_.resize(circ_opt.shots);
+#endif
+
+  i_begin = 0;
+  while (i_begin < Base::num_local_states_) {
+    local_state_index_ = Base::global_state_index_ + i_begin;
+
+    // loop over as many states as fit in available memory at once
+    n_shots = std::min(Base::num_local_states_, Base::num_max_shots_);
+    if (i_begin + n_shots > Base::num_local_states_) {
+      n_shots = Base::num_local_states_ - i_begin;
+    }
+
+    // allocate shots
+    this->allocate_states(n_shots, config);
+
+    // Set state config
+    for (i = 0; i < n_shots; i++) {
+      Base::states_[i].set_parallelization(Base::parallel_state_update_);
+      Base::states_[i].set_global_phase(circ.global_phase_angle);
+    }
+    this->set_global_phase(circ_opt.global_phase_angle);
+
+    // initialization (equivalent to initialize_qreg + initialize_creg)
+    auto init_group = [this](int_t ig) {
+      for (uint_t j = Base::top_state_of_group_[ig];
+           j < Base::top_state_of_group_[ig + 1]; j++) {
+        // enabling batch shots optimization
+        Base::states_[j].qreg().enable_batch(true);
+
+        // initialize qreg here
+        Base::states_[j].qreg().set_num_qubits(Base::num_qubits_);
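+        // each state's qreg is sized and reset here; with batching enabled
+        // above, subsequent gate applications broadcast across all shots in
+        // the group instead of looping over one state at a time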
+        Base::states_[j].qreg().initialize();
+
+        // initialize creg here
+        Base::states_[j].qreg().initialize_creg(Base::num_creg_memory_,
+                                                Base::num_creg_registers_);
+      }
+    };
+    Utils::apply_omp_parallel_for(
+        (Base::num_groups_ > 1 && Base::shot_omp_parallel_), 0,
+        Base::num_groups_, init_group);
+
+    this->apply_global_phase(); // this is parallelized in sub-classes
+
+    // apply ops to multiple-shots
+    if (Base::num_groups_ > 1 && Base::shot_omp_parallel_) {
+      std::vector<ExperimentResult> par_results(Base::num_groups_);
+#pragma omp parallel for num_threads(Base::num_groups_)
+      for (i = 0; i < Base::num_groups_; i++)
+        apply_ops_batched_shots_for_group(
+            i, circ_opt.ops.cbegin(), circ_opt.ops.cend(), noise,
+            par_results[i], rng, circ_opt.seed, true);
+
+      for (auto &res : par_results)
+        result.combine(std::move(res));
+    } else {
+      for (i = 0; i < Base::num_groups_; i++)
+        apply_ops_batched_shots_for_group(i, circ_opt.ops.cbegin(),
+                                          circ_opt.ops.cend(), noise, result,
+                                          rng, circ_opt.seed, true);
+    }
+
+    // collect measured bits and copy memory
+    for (i = 0; i < n_shots; i++) {
+      if (Base::num_process_per_experiment_ > 1) {
+        Base::states_[i].qreg().read_measured_data(
+            Base::cregs_[local_state_index_ + i]);
+      } else {
+        Base::states_[i].qreg().read_measured_data(Base::states_[i].creg());
+        result.save_count_data(Base::states_[i].creg(),
+                               Base::save_creg_memory_);
+      }
+    }
+
+    i_begin += n_shots;
+  }
+
+  // gather cregs on MPI processes and save to result
+#ifdef AER_MPI
+  if (Base::num_process_per_experiment_ > 1) {
+    Base::gather_creg_memory(Base::cregs_, Base::state_index_begin_);
+
+    for (i = 0; i < circ_opt.shots; i++)
+      result.save_count_data(Base::cregs_[i], Base::save_creg_memory_);
+    Base::cregs_.clear();
+  }
+#endif
+
+#ifdef AER_THRUST_CUDA
+  if (Base::sim_device_ == Device::GPU) {
+    int nDev;
+    if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+      cudaGetLastError();
+      nDev = 0;
+    }
+    if (nDev > Base::num_groups_)
+      nDev = Base::num_groups_;
+    result.metadata.add(nDev, "batched_shots_optimization_parallel_gpus");
+  }
+#endif
+}
+
+template <class state_t>
+template <typename InputIterator>
+void BatchShotsExecutor<state_t>::apply_ops_batched_shots_for_group(
+    int_t i_group, InputIterator first, InputIterator last,
+    const Noise::NoiseModel &noise, ExperimentResult &result,
+    RngEngine &init_rng, uint_t rng_seed, bool final_ops) {
+  uint_t istate = Base::top_state_of_group_[i_group];
+  std::vector<RngEngine> rng(Base::num_states_in_group_[i_group]);
+#ifdef _OPENMP
+  int num_inner_threads = omp_get_max_threads() / omp_get_num_threads();
+#else
+  int num_inner_threads = 1;
+#endif
+
+  for (uint_t j = Base::top_state_of_group_[i_group];
+       j < Base::top_state_of_group_[i_group + 1]; j++)
+    if (local_state_index_ + j == 0)
+      rng[j - Base::top_state_of_group_[i_group]] = init_rng;
+    else {
+      rng[j - Base::top_state_of_group_[i_group]].set_seed(
+          rng_seed + local_state_index_ + j);
+    }
+
+  for (auto op = first; op != last; ++op) {
+    if (op->type == Operations::OpType::sample_noise) {
+      // sample error here
+      uint_t count = Base::num_states_in_group_[i_group];
+      std::vector<std::vector<Operations::Op>> noise_ops(count);
+
+      uint_t count_ops = 0;
+      uint_t non_pauli_gate_count = 0;
+      if (num_inner_threads > 1) {
+#pragma omp parallel for reduction(+: count_ops,non_pauli_gate_count) num_threads(num_inner_threads)
+        for (int_t j = 0; j < count; j++) {
+          noise_ops[j] = noise.sample_noise_loc(*op, rng[j]);
+
+          if (!(noise_ops[j].size() == 0 ||
+                (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) {
+            count_ops++;
+            for (int_t k = 0; k < noise_ops[j].size(); k++) {
+              if (noise_ops[j][k].name != "id" &&
+                  noise_ops[j][k].name != "x" &&
+                  noise_ops[j][k].name != "y" &&
+                  noise_ops[j][k].name != "z" &&
+                  noise_ops[j][k].name != "pauli") {
+                non_pauli_gate_count++;
+                break;
+              }
+            }
+          }
+        }
+      } else {
+        for (int_t j = 0; j < count; j++) {
+          noise_ops[j] = noise.sample_noise_loc(*op, rng[j]);
+
+          if (!(noise_ops[j].size() == 0 ||
+                (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) {
+            count_ops++;
+            for (int_t k = 0; k < noise_ops[j].size(); k++) {
+              if (noise_ops[j][k].name != "id" &&
+                  noise_ops[j][k].name != "x" &&
+                  noise_ops[j][k].name != "y" &&
+                  noise_ops[j][k].name != "z" &&
+                  noise_ops[j][k].name != "pauli") {
+                non_pauli_gate_count++;
+                break;
+              }
+            }
+          }
+        }
+      }
+
+      if (count_ops == 0) {
+        continue; // do nothing
+      }
+      if (non_pauli_gate_count == 0) { // optimization for Pauli error
+        Base::states_[istate].qreg().apply_batched_pauli_ops(noise_ops);
+      } else {
+        // otherwise execute each circuit
+        apply_batched_noise_ops(i_group, noise_ops, result, rng);
+      }
+    } else {
+      if (!apply_batched_op(istate, *op, result, rng,
+                            final_ops && (op + 1 == last))) {
+        // call apply_op for each state
+        for (uint_t j = Base::top_state_of_group_[i_group];
+             j < Base::top_state_of_group_[i_group + 1]; j++) {
+          Base::states_[j].qreg().enable_batch(false);
+          Base::states_[j].qreg().read_measured_data(Base::states_[j].creg());
+          Base::states_[j].apply_op(*op, result,
+                                    rng[j - Base::top_state_of_group_[i_group]],
+                                    final_ops && (op + 1 == last));
+          Base::states_[j].qreg().enable_batch(true);
+        }
+      }
+    }
+  }
+}
+
+template <class state_t>
+void BatchShotsExecutor<state_t>::apply_batched_noise_ops(
+    const int_t i_group, const std::vector<std::vector<Operations::Op>> &ops,
+    ExperimentResult &result, std::vector<RngEngine> &rng) {
+  int_t i, j, k, count, nop, pos = 0;
+  uint_t istate = Base::top_state_of_group_[i_group];
+  count = ops.size();
+
+  reg_t mask(count);
+  std::vector<bool> finished(count, false);
+  for (i = 0; i < count; i++) {
+    int_t cond_reg = -1;
+
+    if (finished[i])
+      continue;
+    if (ops[i].size() == 0 || (ops[i].size() == 1 && ops[i][0].name == "id")) {
+      finished[i] = true;
+      continue;
+    }
+    mask[i] = 1;
+
+    // find same ops to be executed in a batch
+    for (j = i + 1; j < count; j++) {
+      if (finished[j]) {
+        mask[j] = 0;
+        continue;
+      }
+      if (ops[j].size() == 0 ||
+          (ops[j].size() == 1 && ops[j][0].name == "id")) {
+        mask[j] = 0;
+        finished[j] = true;
+        continue;
+      }
+
+      if (ops[i].size() != ops[j].size()) {
+        mask[j] = 0;
+        continue;
+      }
+
+      mask[j] = true;
+      for (k = 0; k < ops[i].size(); k++) {
+        if (ops[i][k].conditional) {
+          cond_reg = ops[i][k].conditional_reg;
+        }
+        if (ops[i][k].type != ops[j][k].type ||
+            ops[i][k].name != ops[j][k].name) {
+          mask[j] = false;
+          break;
+        }
+      }
+      if (mask[j])
+        finished[j] = true;
+    }
+
+    // mask conditional register
+    int_t sys_reg = Base::states_[istate].qreg().set_batched_system_conditional(
+        cond_reg, mask);
+
+    // batched execution on same ops
+    for (k = 0; k < ops[i].size(); k++) {
+      Operations::Op cop = ops[i][k];
+
+      // mark op conditional to mask shots
+      cop.conditional = true;
+      cop.conditional_reg = sys_reg;
+
+      if (!apply_batched_op(istate, cop, result, rng, false)) {
+        // call apply_op for each state
+        /*if(cop.conditional){
+          //copy creg to local state
+          reg_t reg_pos(1);
+          reg_t mem_pos;
+          int bit =
+              Base::states_[j].qreg().measured_cregister(cop.conditional_reg);
+          const reg_t reg = Utils::int2reg(bit, 2, 1);
+          reg_pos[0] = cop.conditional_reg;
+          Base::states_[j].creg().store_measure(reg, mem_pos, reg_pos);
+        }*/
+        for (uint_t j = Base::top_state_of_group_[i_group];
+             j < Base::top_state_of_group_[i_group + 1]; j++) {
+          Base::states_[j].qreg().enable_batch(false);
+          Base::states_[j].apply_op(
+              cop, result, rng[j - Base::top_state_of_group_[i_group]], false);
+          Base::states_[j].qreg().enable_batch(true);
+        }
+      }
+    }
+    mask[i] = 0;
+    finished[i] = true;
+  }
+}
+
+//-------------------------------------------------------------------------
+} // end namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/chunk_utils.hpp b/src/simulators/chunk_utils.hpp
new file mode 100644
index 0000000000..3277e2c0fd
--- /dev/null
+++ b/src/simulators/chunk_utils.hpp
@@ -0,0 +1,116 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _chunk_utils_hpp
+#define _chunk_utils_hpp
+
+#include "framework/opset.hpp"
+#include "framework/types.hpp"
+
+namespace AER {
+
+namespace Chunk {
+
+void get_qubits_inout(const int chunk_qubits, const reg_t &qubits,
+                      reg_t &qubits_in, reg_t &qubits_out) {
+  int_t i;
+  qubits_in.clear();
+  qubits_out.clear();
+  for (i = 0; i < qubits.size(); i++) {
+    if (qubits[i] < chunk_qubits) { // in chunk
+      qubits_in.push_back(qubits[i]);
+    } else {
+      qubits_out.push_back(qubits[i]);
+    }
+  }
+}
+
+void get_inout_ctrl_qubits(const Operations::Op &op, const uint_t num_qubits,
+                           reg_t &qubits_in, reg_t &qubits_out) {
+  if (op.type == Operations::OpType::gate &&
+      (op.name[0] == 'c' || op.name.find("mc") == 0)) {
+    for (int i = 0; i < op.qubits.size(); i++) {
+      if (op.qubits[i] < num_qubits)
+        qubits_in.push_back(op.qubits[i]);
+      else
+        qubits_out.push_back(op.qubits[i]);
+    }
+  }
+}
+
+Operations::Op correct_gate_op_in_chunk(const Operations::Op &op,
+                                        reg_t &qubits_in) {
+  Operations::Op new_op = op;
+  new_op.qubits = qubits_in;
+  // change gate name if there are no control qubits inside chunk
+  if (op.name.find("swap") != std::string::npos && qubits_in.size() == 2) {
+    new_op.name = "swap";
+  }
+  if (op.name.find("ccx") != std::string::npos) {
+    if (qubits_in.size() == 1)
+      new_op.name = "x";
+    else
+      new_op.name = "cx";
+  } else if (qubits_in.size() == 1) {
+    if (op.name[0] == 'c')
+      new_op.name = op.name.substr(1);
+    else if (op.name == "mcphase")
+      new_op.name = "p";
+    else
+      new_op.name = op.name.substr(2); // remove "mc"
+  }
+  return new_op;
+}
+
+void block_diagonal_matrix(const uint_t gid, const uint_t chunk_bits,
+                           reg_t &qubits, cvector_t &diag) {
+  uint_t i;
+  uint_t mask_out = 0;
+  uint_t mask_id = 0;
+
+  reg_t qubits_in;
+  cvector_t diag_in;
+
+  for (i = 0; i < qubits.size(); i++) {
+    if (qubits[i] < chunk_bits) { // in chunk
+      qubits_in.push_back(qubits[i]);
+    } else {
+      mask_out |= (1ull << i);
+      if ((gid >> (qubits[i] - chunk_bits)) & 1)
+        mask_id |= (1ull << i);
+    }
+  }
+
+  if (qubits_in.size() < qubits.size()) {
+    for (i = 0; i < diag.size(); i++) {
+      if ((i & mask_out) == mask_id)
+        diag_in.push_back(diag[i]);
+    }
+
+    if (qubits_in.size() == 0) {
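+      // all target qubits lie outside this chunk, so the surviving entry is
+      // a global factor for the whole chunk; pad it to a one-qubit diagonal
+      // on qubit 0 with equal entries so it applies uniformly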
+      qubits_in.push_back(0);
+      diag_in.resize(2);
+      diag_in[1] = diag_in[0];
+    }
+    qubits = qubits_in;
+    diag = diag_in;
+  }
+}
+
+//-------------------------------------------------------------------------
+} // namespace Chunk
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/circuit_executor.hpp b/src/simulators/circuit_executor.hpp
new file mode 100644
index 0000000000..aaa0e7b217
--- /dev/null
+++ b/src/simulators/circuit_executor.hpp
@@ -0,0 +1,1189 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _circuit_executor_hpp_
+#define _circuit_executor_hpp_
+
+#include "framework/config.hpp"
+#include "framework/creg.hpp"
+#include "framework/json.hpp"
+#include "framework/opset.hpp"
+#include "framework/results/experiment_result.hpp"
+#include "framework/results/result.hpp"
+#include "framework/rng.hpp"
+#include "framework/types.hpp"
+#include "noise/noise_model.hpp"
+
+#include "transpile/cacheblocking.hpp"
+#include "transpile/fusion.hpp"
+
+#include "simulators/state.hpp"
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+using OpItr = std::vector<Operations::Op>::const_iterator;
+
+// Timer type
+using myclock_t = std::chrono::high_resolution_clock;
+
+//-------------------------------------------------------------------------
+// Executor base class
+//-------------------------------------------------------------------------
+class Base {
+protected:
+public:
+  Base() {}
+  virtual ~Base() {}
+
+  virtual void run_circuit(Circuit &circ, const Noise::NoiseModel &noise,
+                           const Config &config, const Method method,
+                           const Device device, ExperimentResult &result) = 0;
+
+  // Return an estimate of the required memory for a circuit.
+  virtual size_t required_memory_mb(const Circuit &circuit,
+                                    const Noise::NoiseModel &noise) const = 0;
+  virtual size_t max_memory_mb(void) = 0;
+
+  virtual bool validate_state(const Circuit &circ,
+                              const Noise::NoiseModel &noise,
+                              bool throw_except) const = 0;
+};
+
+//-------------------------------------------------------------------------
+// Simple Executor
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public Base {
+protected:
+  // Simulation method
+  Method method_;
+
+  // Simulation device
+  Device sim_device_ = Device::CPU;
+
+  // Simulation precision
+  Precision sim_precision_ = Precision::Double;
+
+  // Save counts as memory list
+  bool save_creg_memory_ = false;
+
+  // The maximum number of threads to use for various levels of parallelization
+  int max_parallel_threads_;
+
+  // Parameters for parallelization management in configuration
+  int max_parallel_shots_;
+  size_t max_memory_mb_;
+  size_t max_gpu_memory_mb_;
+  int num_gpus_;      // max number of GPU per process
+  reg_t target_gpus_; // GPUs to be used
+
+  // use explicit parallelization
+  bool explicit_parallelization_;
+
+  // Parameters for parallelization management for experiments
+  int parallel_experiments_;
+  int parallel_shots_;
+  int parallel_state_update_;
+
+  // results are stored independently in each process if true
+  bool accept_distributed_results_ = true;
+
+  uint_t myrank_;               // process ID
+  uint_t nprocs_;               // number of processes
+  uint_t distributed_rank_;     // process ID in communicator group
+  uint_t distributed_procs_;    // number of processes in communicator group
+  uint_t distributed_group_;    // group id of distribution
+  int_t distributed_proc_bits_; // distributed_procs_=2^distributed_proc_bits_
+                                // (if nprocs != power of 2, set -1)
+  int num_process_per_experiment_ = 1;
+
+#ifdef AER_MPI
+  // communicator group to simulate a circuit (for multi-experiments)
+  MPI_Comm distributed_comm_;
+#endif
+
+#ifdef AER_CUSTATEVEC
+  // settings for cuStateVec
+  bool cuStateVec_enable_ = false;
+#endif
+
+  // if circuit has statevector operations or not
+  bool has_statevector_ops_;
+
+public:
+  Executor();
+  virtual ~Executor() {}
+
+  void run_circuit(Circuit &circ, const Noise::NoiseModel &noise,
+                   const Config &config, const Method method,
+                   const Device device, ExperimentResult &result) override;
+
+  // Return an estimate of the required memory for a circuit.
+  size_t required_memory_mb(const Circuit &circuit,
+                            const Noise::NoiseModel &noise) const override {
+    state_t tmp;
+    return tmp.required_memory_mb(circuit.num_qubits, circuit.ops);
+  }
+  size_t max_memory_mb(void) override { return max_memory_mb_; }
+
+  bool validate_state(const Circuit &circ, const Noise::NoiseModel &noise,
+                      bool throw_except) const override;
+
+protected:
+  // Return a fusion transpilation pass configured for the current
+  // method, circuit and config
+  Transpile::Fusion transpile_fusion(const Operations::OpSet &opset,
+                                     const Config &config) const;
+
+  // return maximum number of qubits for matrix
+  int_t get_max_matrix_qubits(const Circuit &circ) const;
+  int_t get_matrix_bits(const Operations::Op &op) const;
+
+  // Get system memory size
+  size_t get_system_memory_mb();
+  size_t get_gpu_memory_mb();
+
+  size_t get_min_memory_mb() const {
+    if (sim_device_ == Device::GPU && num_gpus_ > 0) {
+      return max_gpu_memory_mb_ / num_gpus_; // return per GPU memory size
+    }
+    return max_memory_mb_;
+  }
+
+  // get max number of shots that can be stored in memory
+  uint_t get_max_parallel_shots(const Circuit &circuit,
+                                const Noise::NoiseModel &noise) const;
+
+  bool multiple_shots_required(const Circuit &circuit,
+                               const Noise::NoiseModel &noise) const;
+
+  // Check if measure sampling optimization is valid for the input circuit
+  // for the given method. This checks if operation types before
+  // the first measurement in the circuit prevent sampling
+  bool check_measure_sampling_opt(const Circuit &circ) const;
+
+  bool has_statevector_ops(const Circuit &circ) const;
+
+  virtual void set_config(const Config &config);
+  virtual void set_parallelization(const Circuit &circ,
+                                   const Noise::NoiseModel &noise);
+
+  virtual void run_circuit_with_sampling(Circuit &circ, const Config &config,
+                                         RngEngine &init_rng,
+                                         ExperimentResult &result);
+
+  virtual void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                                 const Config &config, RngEngine &init_rng,
+                                 ExperimentResult &result, bool sample_noise);
+
+  template <typename InputIterator>
+  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
+                       uint_t shots, state_t &state, ExperimentResult &result,
+                       RngEngine &rng) const;
+
+#ifdef AER_MPI
+  void gather_creg_memory(std::vector<ClassicalRegister> &cregs,
+                          reg_t &shot_index);
+#endif
+};
+
+template <class state_t>
+Executor<state_t>::Executor() {
+  max_memory_mb_ = 0;
+  max_gpu_memory_mb_ = 0;
+  max_parallel_threads_ = 0;
+  max_parallel_shots_ = 0;
+
+  parallel_shots_ = 1;
+  parallel_state_update_ = 1;
+
+  num_process_per_experiment_ = 0;
+
+  num_gpus_ = 0;
+
+  explicit_parallelization_ = false;
+
+  has_statevector_ops_ = false;
+
+  myrank_ = 0;
+  nprocs_ = 1;
+
+  distributed_procs_ = 1;
+  distributed_rank_ = 0;
+  distributed_group_ = 0;
+  distributed_proc_bits_ = 0;
+
+#ifdef AER_MPI
+  distributed_comm_ = MPI_COMM_WORLD;
+#endif
+}
+
+template <class state_t>
+void Executor<state_t>::set_config(const Config &config) {
+  // Load config for memory (creg list data)
+  if (config.memory.has_value())
+    save_creg_memory_ = config.memory.value();
+
+#ifdef _OPENMP
+  // Load OpenMP maximum thread settings
+  if (config.max_parallel_threads.has_value())
+    max_parallel_threads_ = config.max_parallel_threads.value();
+  if (config.max_parallel_shots.has_value())
+    max_parallel_shots_ = config.max_parallel_shots.value();
+  // Limit max threads based on number of available OpenMP threads
+  auto omp_threads = omp_get_max_threads();
+  max_parallel_threads_ = (max_parallel_threads_ > 0)
+                              ? std::min(max_parallel_threads_, omp_threads)
+                              : std::max(1, omp_threads);
+#else
+  // No OpenMP so we disable parallelization
+  max_parallel_threads_ = 1;
+  max_parallel_shots_ = 1;
+#endif
+
+  // Load configurations for parallelization
+
+  if (config.max_memory_mb.has_value())
+    max_memory_mb_ = config.max_memory_mb.value();
+
+  // for debugging
+  if (config._parallel_shots.has_value()) {
+    parallel_shots_ = config._parallel_shots.value();
+    explicit_parallelization_ = true;
+  }
+
+  // for debugging
+  if (config._parallel_state_update.has_value()) {
+    parallel_state_update_ = config._parallel_state_update.value();
+    explicit_parallelization_ = true;
+  }
+
+  if (explicit_parallelization_) {
+    parallel_shots_ = std::max({parallel_shots_, 1});
+    parallel_state_update_ = std::max({parallel_state_update_, 1});
+  }
+
+  if (config.accept_distributed_results.has_value())
+    accept_distributed_results_ = config.accept_distributed_results.value();
+
+#ifdef AER_CUSTATEVEC
+  // cuStateVec configs
+  cuStateVec_enable_ = false;
+  if (config.cuStateVec_enable.has_value())
+    cuStateVec_enable_ = config.cuStateVec_enable.value();
+#endif
+
+  std::string precision = config.precision;
+  if (precision == "double") {
+    sim_precision_ = Precision::Double;
+  } else if (precision == "single") {
+    sim_precision_ = Precision::Single;
+  }
+
+  // set target GPUs
+#ifdef AER_THRUST_CUDA
+  int nDev = 0;
+  if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+    cudaGetLastError();
+    nDev = 0;
+  }
+  if (config.target_gpus.has_value()) {
+    target_gpus_ = config.target_gpus.value();
+    if (nDev < target_gpus_.size()) {
+      throw std::invalid_argument("target_gpus has more GPUs than available.");
+    }
+    num_gpus_ = target_gpus_.size();
+  } else {
+    num_gpus_ = nDev;
+    target_gpus_.resize(num_gpus_);
+    for (int_t i = 0; i < num_gpus_; i++)
+      target_gpus_[i] = i;
+  }
+#endif
+}
+
+template <class state_t>
+size_t Executor<state_t>::get_system_memory_mb() {
+  size_t total_physical_memory = Utils::get_system_memory_mb();
+#ifdef AER_MPI
+  // get minimum memory size per process
+  uint64_t locMem, minMem;
+  locMem = total_physical_memory;
+  MPI_Allreduce(&locMem, &minMem, 1, MPI_UINT64_T, MPI_MIN, distributed_comm_);
+  total_physical_memory = minMem;
+#endif
+
+  return total_physical_memory;
+}
+
+template <class state_t>
+size_t Executor<state_t>::get_gpu_memory_mb() {
+  size_t total_physical_memory = 0;
+#ifdef AER_THRUST_CUDA
+  for (int_t iDev = 0; iDev < target_gpus_.size(); iDev++) {
+    size_t freeMem, totalMem;
+    cudaSetDevice(target_gpus_[iDev]);
+    cudaMemGetInfo(&freeMem, &totalMem);
+    total_physical_memory += totalMem;
+  }
+#endif
+
+#ifdef AER_MPI
+  // get minimum memory size per process
+  uint64_t locMem, minMem;
+  locMem = total_physical_memory;
+  MPI_Allreduce(&locMem, &minMem, 1, MPI_UINT64_T, MPI_MIN, distributed_comm_);
+  total_physical_memory = minMem;
+
+  int t = num_gpus_;
+  MPI_Allreduce(&t, &num_gpus_, 1, MPI_INT, MPI_MAX, distributed_comm_);
+#endif
+
+  return total_physical_memory >> 20;
+}
+
+template <class state_t>
+bool Executor<state_t>::multiple_shots_required(
+    const Circuit &circ, const Noise::NoiseModel &noise) const {
+  if (circ.shots < 2)
+    return false;
+  if (method_ == Method::density_matrix || method_ == Method::superop ||
+      method_ == Method::unitary) {
+    return false;
+  }
+
+  bool can_sample = check_measure_sampling_opt(circ);
+
+  if (noise.is_ideal()) {
+    return !can_sample;
+  }
+
+  return true;
+}
+
+template <class state_t>
+uint_t Executor<state_t>::get_max_parallel_shots(
+    const Circuit &circ, const Noise::NoiseModel &noise) const {
+  uint_t mem = required_memory_mb(circ, noise);
+  if (mem == 0)
+    return circ.shots;
+
+  if (sim_device_ == Device::GPU && num_gpus_ > 0) {
+    return std::min(circ.shots, (max_gpu_memory_mb_ * 8 / 10 / mem));
+  } else {
+    return std::min(circ.shots, (max_memory_mb_ / mem));
+  }
+}
+
+template <class state_t>
+void Executor<state_t>::set_parallelization(const Circuit &circ,
+                                            const Noise::NoiseModel &noise) {
+  // MPI setting
+  myrank_ = 0;
+  nprocs_ = 1;
+#ifdef AER_MPI
+  int t;
+  MPI_Comm_size(MPI_COMM_WORLD, &t);
+  nprocs_ = t;
+  MPI_Comm_rank(MPI_COMM_WORLD, &t);
+  myrank_ = t;
+#endif
+  if (num_process_per_experiment_ == 0)
+    num_process_per_experiment_ = nprocs_;
+
+  distributed_procs_ = num_process_per_experiment_;
+  distributed_rank_ = myrank_ % distributed_procs_;
+  distributed_group_ = myrank_ / distributed_procs_;
+
+  distributed_proc_bits_ = 0;
+  int proc_bits = 0;
+  uint_t p = distributed_procs_;
+  while (p > 1) {
+    if ((p & 1) != 0) { // procs is not power of 2
+      distributed_proc_bits_ = -1;
+      break;
+    }
+    distributed_proc_bits_++;
+    p >>= 1;
+  }
+
+#ifdef AER_MPI
+  if (num_process_per_experiment_ != nprocs_) {
+    MPI_Comm_split(MPI_COMM_WORLD, (int)distributed_group_,
+                   (int)distributed_rank_, &distributed_comm_);
+  } else {
+    distributed_comm_ = MPI_COMM_WORLD;
+  }
+#endif
+
+  if (max_memory_mb_ == 0)
+    max_memory_mb_ = get_system_memory_mb();
+  max_gpu_memory_mb_ = get_gpu_memory_mb();
+
+  // number of threads for parallel loop of experiments
+  parallel_experiments_ = omp_get_num_threads();
+
+  if (explicit_parallelization_)
+    return;
+
+  // Check for trivial parallelization conditions
+  switch (method_) {
+  case Method::statevector:
+  case Method::stabilizer:
+  case Method::unitary:
+  case Method::matrix_product_state: {
+    if (circ.shots == 1 || num_process_per_experiment_ > 1 ||
+        (!noise.has_quantum_errors() && check_measure_sampling_opt(circ))) {
+      parallel_shots_ = 1;
+      parallel_state_update_ =
+          std::max({1, max_parallel_threads_ / parallel_experiments_});
+      return;
+    }
+    break;
+  }
+  case Method::density_matrix:
+  case Method::superop:
+  case Method::tensor_network: {
+    if (circ.shots == 1 || num_process_per_experiment_ > 1 ||
+        check_measure_sampling_opt(circ)) {
+      parallel_shots_ = 1;
+      parallel_state_update_ =
+          std::max({1, max_parallel_threads_ / parallel_experiments_});
+      return;
+    }
+    break;
+  }
+  case Method::extended_stabilizer:
+    break;
+  default:
+    throw std::invalid_argument(
+        "Cannot set parallelization for unresolved method.");
+  }
+
+  // Use a local variable to not override stored maximum based
+  // on currently executed circuits
+  const auto max_shots =
+      (max_parallel_shots_ > 0)
+          ? std::min({max_parallel_shots_, max_parallel_threads_})
+          : max_parallel_threads_;
+
+  // If we are executing circuits in parallel we disable
+  // parallel shots
+  if (max_shots == 1 || parallel_experiments_ > 1) {
+    parallel_shots_ = 1;
+  } else {
+    // Parallel shots is > 1
+    // Limit parallel shots by available memory and number of shots
+    // And assign the remaining threads to state update
+    int circ_memory_mb =
+        required_memory_mb(circ, noise) / num_process_per_experiment_;
+    size_t mem_size =
+        (sim_device_ == Device::GPU) ? max_gpu_memory_mb_ : max_memory_mb_;
+    if (mem_size < circ_memory_mb)
+      throw std::runtime_error(
+          "a circuit requires more memory than max_memory_mb.");
+    // If circ memory is 0, set it to 1 so that we don't divide by zero
+    circ_memory_mb = std::max({1, circ_memory_mb});
+
+    int shots = circ.shots;
+    parallel_shots_ = std::min(
+        {static_cast<int>(mem_size / (circ_memory_mb * 2)), max_shots, shots});
+  }
+  parallel_state_update_ =
+      (parallel_shots_ > 1)
+          ? std::max({1, max_parallel_threads_ / parallel_shots_})
+          : std::max({1, max_parallel_threads_ / parallel_experiments_});
+}
+
+template <class state_t>
+void Executor<state_t>::run_circuit(Circuit &circ,
+                                    const Noise::NoiseModel &noise,
+                                    const Config &config, const Method method,
+                                    const Device device,
+                                    ExperimentResult &result) {
+  // Start individual circuit timer
+  auto timer_start = myclock_t::now(); // state circuit timer
+
+  // Execute in try block so we can catch errors and return the error message
+  // for individual circuit failures.
+  try {
+    // set configuration
+    method_ = method;
+    sim_device_ = device;
+
+    set_config(config);
+    set_parallelization(circ, noise);
+
+    // Rng engine (this one is used to add noise on circuit)
+    RngEngine rng;
+    rng.set_seed(circ.seed);
+
+    // Output data container
+    result.set_config(config);
+    result.metadata.add(method_names_.at(method), "method");
+    if (sim_device_ == Device::GPU)
+      result.metadata.add("GPU", "device");
+    else if (sim_device_ == Device::ThrustCPU)
+      result.metadata.add("Thrust", "device");
+    else
+      result.metadata.add("CPU", "device");
+
+    // Circuit qubit metadata
+    result.metadata.add(circ.num_qubits, "num_qubits");
+    result.metadata.add(circ.num_memory, "num_clbits");
+    result.metadata.add(circ.qubits(), "active_input_qubits");
+    result.metadata.add(circ.qubit_map(), "input_qubit_map");
+    result.metadata.add(circ.remapped_qubits, "remapped_qubits");
+
+    // Add measure sampling to metadata
+    // Note: this will set to `true` if sampling is enabled for the circuit
+    result.metadata.add(false, "measure_sampling");
+    result.metadata.add(false, "batched_shots_optimization");
+
+    // Validate gateset and memory requirements, raise exception if they're
+    // exceeded
+    validate_state(circ, noise, true);
+
+    has_statevector_ops_ = has_statevector_ops(circ);
+
+    if (circ.num_qubits > 0) { // do nothing for query steps
+      // Choose execution method based on noise and method
+      Circuit opt_circ;
+      bool noise_sampling = false;
+
+      // Ideal circuit
+      if (noise.is_ideal()) {
+        opt_circ = circ;
+        result.metadata.add("ideal", "noise");
+      }
+      // Readout error only
+      else if (noise.has_quantum_errors() == false) {
+        opt_circ = noise.sample_noise(circ, rng);
+        result.metadata.add("readout", "noise");
+      }
+      // Superop noise sampling
+      else if (method == Method::density_matrix || method == Method::superop ||
+               (method == Method::tensor_network && !has_statevector_ops_)) {
+        // Sample noise using SuperOp method
+        opt_circ =
+            noise.sample_noise(circ, rng, Noise::NoiseModel::Method::superop);
+        result.metadata.add("superop", "noise");
+      }
+      // Kraus noise sampling
+      else if (noise.opset().contains(Operations::OpType::kraus) ||
+               noise.opset().contains(Operations::OpType::superop)) {
+        opt_circ =
+            noise.sample_noise(circ, rng, Noise::NoiseModel::Method::kraus);
+        result.metadata.add("kraus", "noise");
+      }
+      // General circuit noise sampling
+      else {
+        noise_sampling = true;
+        result.metadata.add("circuit", "noise");
+      }
+
+      if (noise_sampling) {
+        run_circuit_shots(circ, noise, config, rng, result, true);
+      } else {
+        // Run multishot simulation without noise sampling
+        bool can_sample = opt_circ.can_sample;
+        can_sample &= check_measure_sampling_opt(opt_circ);
+
+        if (can_sample)
+          run_circuit_with_sampling(opt_circ, config, rng, result);
+        else
+          run_circuit_shots(opt_circ, noise, config, rng, result, false);
+      }
+    }
+    // Report success
+    result.status = ExperimentResult::Status::completed;
+
+    // Pass through circuit header and add metadata
+    result.header = circ.header;
+    result.shots = circ.shots;
+    result.seed = circ.seed;
+    result.metadata.add(parallel_shots_, "parallel_shots");
+    result.metadata.add(parallel_state_update_, "parallel_state_update");
+#ifdef AER_CUSTATEVEC
+    if (sim_device_ == Device::GPU)
+      result.metadata.add(cuStateVec_enable_, "cuStateVec_enable");
+#endif
+    if (sim_device_ == Device::GPU)
+      result.metadata.add(target_gpus_, "target_gpus");
+
+    // Add timer data
+    auto timer_stop = myclock_t::now(); // stop timer
+    double time_taken =
+        std::chrono::duration<double>(timer_stop - timer_start).count();
+    result.time_taken = time_taken;
+  }
+  // If an exception occurs during execution, catch it and pass it to the
+  // output
+  catch (std::exception &e) {
+    result.status = ExperimentResult::Status::error;
+    result.message = e.what();
+  }
+}
+
+template <class state_t>
+void Executor<state_t>::run_circuit_with_sampling(Circuit &circ,
+                                                  const Config &config,
+                                                  RngEngine &init_rng,
+                                                  ExperimentResult &result) {
+  state_t state;
+
+  // Optimize circuit
+  Noise::NoiseModel dummy_noise;
+
+  auto fusion_pass = transpile_fusion(circ.opset(), config);
+  fusion_pass.optimize_circuit(circ, dummy_noise, state.opset(), result);
+
+  auto max_bits = get_max_matrix_qubits(circ);
+
+  // Set state config
+  state.set_config(config);
+  state.set_parallelization(parallel_state_update_);
+  state.set_global_phase(circ.global_phase_angle);
+
+  state.set_distribution(1);
+  state.set_max_matrix_qubits(max_bits);
+
+  RngEngine rng = init_rng;
+
+  auto first_meas = circ.first_measure_pos; // Position of first measurement op
+  bool final_ops = (first_meas == circ.ops.size());
+
+  // allocate qubit register
+#ifdef AER_CUSTATEVEC
+  state.enable_cuStateVec(cuStateVec_enable_);
+#endif
+  state.allocate(circ.num_qubits, circ.num_qubits);
+  state.set_num_global_qubits(circ.num_qubits);
+  state.enable_density_matrix(!has_statevector_ops_);
+
+  // Run circuit instructions before first measure
+  state.initialize_qreg(circ.num_qubits);
+  state.initialize_creg(circ.num_memory, circ.num_registers);
+
+  state.apply_ops(circ.ops.cbegin(), circ.ops.cbegin() + first_meas, result,
+                  rng, final_ops);
+
+  // Get measurement operations and set of measured qubits
+  measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots,
+                  state, result, rng);
+
+  // Add measure sampling metadata
+  result.metadata.add(true, "measure_sampling");
+
+  state.add_metadata(result);
+}
+
+template <class state_t>
+void Executor<state_t>::run_circuit_shots(
+    Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
+    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+
+  // insert runtime noise sample ops here
+  int_t par_shots = (int_t)get_max_parallel_shots(circ, noise);
+  par_shots = std::min((int_t)parallel_shots_, par_shots);
+  std::vector<ExperimentResult> par_results(par_shots);
+
+  uint_t num_shots = circ.shots;
+  uint_t seed_begin = circ.seed;
+
+  // MPI distribution settings
+  std::vector<ClassicalRegister> cregs;
+  reg_t shot_begin(distributed_procs_);
+  reg_t shot_end(distributed_procs_);
+  for (int_t i = 0; i < distributed_procs_; i++) {
+    shot_begin[i] = circ.shots * i / distributed_procs_;
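+    // block-partition of shots across processes: process i owns shots
+    // [shots*i/procs, shots*(i+1)/procs); e.g. 10 shots on 4 processes
+    // gives local shot counts 2, 3, 2 and 3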
+    shot_end[i] = circ.shots * (i + 1) / distributed_procs_;
+  }
+  num_shots = shot_end[distributed_rank_] - shot_begin[distributed_rank_];
+  seed_begin += shot_begin[distributed_rank_];
+  cregs.resize(circ.shots);
+
+  int max_matrix_qubits;
+  auto fusion_pass = transpile_fusion(circ.opset(), config);
+  if (!sample_noise) {
+    Noise::NoiseModel dummy_noise;
+    state_t dummy_state;
+    fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                                 result);
+    max_matrix_qubits = get_max_matrix_qubits(circ);
+  } else {
+    max_matrix_qubits = get_max_matrix_qubits(circ);
+    max_matrix_qubits = std::max(max_matrix_qubits, (int)fusion_pass.max_qubit);
+  }
+
+  // run each shot
+  auto run_circuit_lambda = [this, &par_results, circ, noise, config, par_shots,
+                             sample_noise, num_shots, seed_begin, shot_begin,
+                             &cregs, init_rng, max_matrix_qubits,
+                             fusion_pass](int_t i) {
+    state_t state;
+    uint_t i_shot, shot_end;
+    i_shot = num_shots * i / par_shots;
+    shot_end = num_shots * (i + 1) / par_shots;
+
+    // Set state config
+    state.set_config(config);
+    state.set_parallelization(this->parallel_state_update_);
+    state.set_global_phase(circ.global_phase_angle);
+    state.enable_density_matrix(!has_statevector_ops_);
+
+    state.set_distribution(this->num_process_per_experiment_);
+    state.set_num_global_qubits(circ.num_qubits);
+    state.set_max_matrix_qubits(max_matrix_qubits);
+#ifdef AER_CUSTATEVEC
+    state.enable_cuStateVec(cuStateVec_enable_);
+#endif
+    state.allocate(circ.num_qubits, circ.num_qubits);
+
+    for (; i_shot < shot_end; i_shot++) {
+      RngEngine rng;
+      if (i_shot == 0)
+        rng = init_rng;
+      else
+        rng.set_seed(seed_begin + i_shot);
+
+      state.initialize_qreg(circ.num_qubits);
+      state.initialize_creg(circ.num_memory, circ.num_registers);
+
+      if (sample_noise) {
+        Circuit circ_opt;
+        Noise::NoiseModel dummy_noise;
+        circ_opt = noise.sample_noise(circ, rng);
+        fusion_pass.optimize_circuit(circ_opt, dummy_noise, state.opset(),
+                                     par_results[i]);
+        state.apply_ops(circ_opt.ops.cbegin(), circ_opt.ops.cend(),
+                        par_results[i], rng, true);
+      } else {
+        state.apply_ops(circ.ops.cbegin(), circ.ops.cend(), par_results[i],
+                        rng, true);
+      }
+      if (distributed_procs_ > 1) {
+        // save creg to be gathered
+        cregs[shot_begin[distributed_rank_] + i_shot] = state.creg();
+      } else {
+        par_results[i].save_count_data(state.creg(), save_creg_memory_);
+      }
+    }
+    state.add_metadata(par_results[i]);
+  };
+  Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots,
+                                run_circuit_lambda);
+
+  // gather cregs on MPI processes and save to result
+#ifdef AER_MPI
+  if (num_process_per_experiment_ > 1) {
+    gather_creg_memory(cregs, shot_begin);
+
+    // save cregs to result
+    num_shots = circ.shots;
+    auto save_cregs = [this, &par_results, par_shots, num_shots,
+                       cregs](int_t i) {
+      uint_t i_shot, shot_end;
+      i_shot = num_shots * i / par_shots;
+      shot_end = num_shots * (i + 1) / par_shots;
+
+      for (; i_shot < shot_end; i_shot++) {
+        par_results[i].save_count_data(cregs[i_shot], save_creg_memory_);
+      }
+    };
+    Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, save_cregs,
+                                  par_shots);
+  }
+#endif
+
+  for (auto &res : par_results) {
+    result.combine(std::move(res));
+  }
+#ifdef AER_CUSTATEVEC
+  if (sim_device_ == Device::GPU) {
+    result.metadata.add(cuStateVec_enable_, "cuStateVec_enable");
+    if (par_shots >= num_gpus_)
+      result.metadata.add(num_gpus_, "gpu_parallel_shots_");
+    else
+      result.metadata.add(par_shots, "gpu_parallel_shots_");
+  }
+#endif
+}
+
+template <class state_t>
+template <typename InputIterator>
+void Executor<state_t>::measure_sampler(InputIterator first_meas,
first_meas,
+                                            InputIterator last_meas,
+                                            uint_t shots, state_t &state,
+                                            ExperimentResult &result,
+                                            RngEngine &rng) const {
+  // Check if meas_circ is empty, and if so return initial creg
+  if (first_meas == last_meas) {
+    while (shots-- > 0) {
+      result.save_count_data(state.creg(), save_creg_memory_);
+    }
+    return;
+  }
+
+  std::vector<Operations::Op> meas_ops;
+  std::vector<Operations::Op> roerror_ops;
+  for (auto op = first_meas; op != last_meas; op++) {
+    if (op->type == Operations::OpType::roerror) {
+      roerror_ops.push_back(*op);
+    } else { /*(op.type == Operations::OpType::measure) */
+      meas_ops.push_back(*op);
+    }
+  }
+
+  // Get the measured qubits from the circuit, then sort and delete duplicates
+  std::vector<uint_t> meas_qubits; // measured qubits
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j)
+      meas_qubits.push_back(op.qubits[j]);
+  }
+  sort(meas_qubits.begin(), meas_qubits.end());
+  meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()),
+                    meas_qubits.end());
+
+  // Generate the samples
+  auto timer_start = myclock_t::now();
+  std::vector<reg_t> all_samples;
+  all_samples = state.sample_measure(meas_qubits, shots, rng);
+  auto time_taken =
+      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
+  result.metadata.add(time_taken, "sample_measure_time");
+
+  // Make qubit map of position in vector of measured qubits
+  std::unordered_map<uint_t, uint_t> qubit_map;
+  for (uint_t j = 0; j < meas_qubits.size(); ++j) {
+    qubit_map[meas_qubits[j]] = j;
+  }
+
+  // Maps of memory and register to qubit position
+  std::map<uint_t, uint_t> memory_map;
+  std::map<uint_t, uint_t> register_map;
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j) {
+      auto pos = qubit_map[op.qubits[j]];
+      if (!op.memory.empty())
+        memory_map[op.memory[j]] = pos;
+      if (!op.registers.empty())
+        register_map[op.registers[j]] = pos;
+    }
+  }
+
+  // Process samples
+  uint_t num_memory =
+      (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
+  uint_t num_registers =
+      (register_map.empty()) ?
0ULL : 1 + register_map.rbegin()->first;
+  ClassicalRegister creg;
+  for (int_t i = 0; i < all_samples.size(); i++) {
+    creg.initialize(num_memory, num_registers);
+
+    // process memory bit measurements
+    for (const auto &pair : memory_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}),
+                         reg_t({pair.first}), reg_t());
+    }
+    // process register bit measurements
+    for (const auto &pair : register_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
+                         reg_t({pair.first}));
+    }
+
+    // process read out errors for memory and registers
+    for (const Operations::Op &roerror : roerror_ops)
+      creg.apply_roerror(roerror, rng);
+
+    // Save count data
+    result.save_count_data(creg, save_creg_memory_);
+  }
+}
+
+template <class state_t>
+bool Executor<state_t>::validate_state(const Circuit &circ,
+                                       const Noise::NoiseModel &noise,
+                                       bool throw_except) const {
+  std::stringstream error_msg;
+  std::string circ_name;
+  state_t state;
+
+  JSON::get_value(circ_name, "name", circ.header);
+
+  // Check if a circuit is valid for state ops
+  bool circ_valid = state.opset().contains(circ.opset());
+  if (throw_except && !circ_valid) {
+    error_msg << "Circuit " << circ_name << " contains invalid instructions ";
+    error_msg << state.opset().difference(circ.opset());
+    error_msg << " for \"" << state.name() << "\" method.";
+  }
+
+  // Check if a noise model is valid for state ops
+  bool noise_valid = noise.is_ideal() || state.opset().contains(noise.opset());
+  if (throw_except && !noise_valid) {
+    error_msg << "Noise model contains invalid instructions ";
+    error_msg << state.opset().difference(noise.opset());
+    error_msg << " for \"" << state.name() << "\" method.";
+  }
+
+  // Validate memory requirements
+  bool memory_valid = true;
+  if (max_memory_mb_ > 0) {
+    size_t required_mb = state.required_memory_mb(circ.num_qubits, circ.ops) /
+                         num_process_per_experiment_;
+    size_t mem_size = (sim_device_ == Device::GPU)
+                          ?
max_memory_mb_ + max_gpu_memory_mb_
+                          : max_memory_mb_;
+    memory_valid = (required_mb <= mem_size);
+    if (throw_except && !memory_valid) {
+      error_msg << "Insufficient memory to run circuit " << circ_name;
+      error_msg << " using the " << state.name() << " simulator.";
+      error_msg << " Required memory: " << required_mb
+                << "M, max memory: " << max_memory_mb_ << "M";
+      if (sim_device_ == Device::GPU) {
+        error_msg << " (Host) + " << max_gpu_memory_mb_ << "M (GPU)";
+      }
+    }
+  }
+
+  if (noise_valid && circ_valid && memory_valid) {
+    return true;
+  }
+
+  // One of the validation checks failed for the current state
+  if (throw_except) {
+    throw std::runtime_error(error_msg.str());
+  }
+  return false;
+}
+
+template <class state_t>
+Transpile::Fusion
+Executor<state_t>::transpile_fusion(const Operations::OpSet &opset,
+                                    const Config &config) const {
+  Transpile::Fusion fusion_pass;
+  fusion_pass.set_parallelization(parallel_state_update_);
+
+  if (opset.contains(Operations::OpType::superop)) {
+    fusion_pass.allow_superop = true;
+  }
+  if (opset.contains(Operations::OpType::kraus)) {
+    fusion_pass.allow_kraus = true;
+  }
+  switch (method_) {
+  case Method::density_matrix:
+  case Method::superop: {
+    // Halve the default threshold and max fused qubits for density matrix
+    fusion_pass.threshold /= 2;
+    fusion_pass.max_qubit /= 2;
+    break;
+  }
+  case Method::matrix_product_state: {
+    fusion_pass.active = false;
+    return fusion_pass; // Do not allow the config to set active for MPS
+  }
+  case Method::statevector: {
+    if (fusion_pass.allow_kraus) {
+      // Halve default max fused qubits for Kraus noise fusion
+      fusion_pass.max_qubit /= 2;
+    }
+    break;
+  }
+  case Method::unitary: {
+    // max_qubit is the same as for statevector
+    fusion_pass.threshold /= 2;
+    break;
+  }
+  case Method::tensor_network: {
+    if (opset.contains(Operations::OpType::save_statevec) ||
+        opset.contains(Operations::OpType::save_statevec_dict)) {
+      if (fusion_pass.allow_kraus) {
+        // Halve default max fused qubits for Kraus noise fusion
+        fusion_pass.max_qubit /= 2;
+      }
+    } else {
+      // Halve the default threshold and max fused qubits for density matrix
+      fusion_pass.threshold /= 2;
+      fusion_pass.max_qubit /= 2;
+    }
+    break;
+  }
+  default: {
+    fusion_pass.active = false;
+    return fusion_pass;
+  }
+  }
+  // Override default fusion settings with custom config
+  fusion_pass.set_config(config);
+  return fusion_pass;
+}
+
+template <class state_t>
+bool Executor<state_t>::check_measure_sampling_opt(const Circuit &circ) const {
+  // Check if circuit has sampling flag disabled
+  if (circ.can_sample == false) {
+    return false;
+  }
+
+  // For the density matrix, unitary, and superop methods all supported
+  // instructions allow sampling
+  if (method_ == Method::density_matrix || method_ == Method::superop ||
+      method_ == Method::unitary) {
+    return true;
+  }
+  if (method_ == Method::tensor_network) {
+    // if there are no save statevec ops, tensor network simulator runs as
+    // density matrix simulator
+    if ((!circ.opset().contains(Operations::OpType::save_statevec)) &&
+        (!circ.opset().contains(Operations::OpType::save_statevec_dict))) {
+      return true;
+    }
+  }
+
+  // If circuit contains a non-initial initialize that is not a full width
+  // instruction we can't sample
+  if (circ.can_sample_initialize == false) {
+    return false;
+  }
+
+  // Check if this is a non-density-matrix simulation and the circuit
+  // contains a stochastic instruction before measurement,
+  // i.e. reset, kraus, superop
+  // TODO:
+  // * Resets should be allowed if applied to |0> state (no gates before).
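+  // (Illustrative note, not part of the patch) e.g. a circuit of unitary
+  // gates followed only by terminal measurements passes every check here and
+  // is sampled from a single final state, while a mid-circuit reset, kraus,
+  // or superop instruction falls through to one-run-per-shot execution.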
+ if (circ.opset().contains(Operations::OpType::reset) || + circ.opset().contains(Operations::OpType::kraus) || + circ.opset().contains(Operations::OpType::superop) || + circ.opset().contains(Operations::OpType::jump) || + circ.opset().contains(Operations::OpType::mark)) { + return false; + } + // Otherwise true + return true; +} + +template +int_t Executor::get_matrix_bits(const Operations::Op &op) const { + int_t bit = 1; + if (op.type == Operations::OpType::matrix || + op.type == Operations::OpType::diagonal_matrix || + op.type == Operations::OpType::initialize) + bit = op.qubits.size(); + else if (op.type == Operations::OpType::kraus || + op.type == Operations::OpType::superop) { + if (method_ == Method::density_matrix) + bit = op.qubits.size() * 2; + else + bit = op.qubits.size(); + } + return bit; +} + +template +int_t Executor::get_max_matrix_qubits(const Circuit &circ) const { + int_t max_bits = 0; + int_t i; + + if (sim_device_ != Device::CPU) { // Only applicable for GPU (and Thrust) + for (i = 0; i < circ.ops.size(); i++) { + int_t bit = 1; + bit = get_matrix_bits(circ.ops[i]); + max_bits = std::max(max_bits, bit); + } + } + return max_bits; +} + +template +bool Executor::has_statevector_ops(const Circuit &circ) const { + return circ.opset().contains(Operations::OpType::save_statevec) || + circ.opset().contains(Operations::OpType::save_statevec_dict) || + circ.opset().contains(Operations::OpType::save_amps); +} + +#ifdef AER_MPI +template +void Executor::gather_creg_memory( + std::vector &cregs, reg_t &shot_index) { + int_t i, j; + uint_t n64, i64, ibit, num_local_shots; + + if (distributed_procs_ == 0) + return; + if (cregs.size() == 0) + return; + int_t size = cregs[0].memory_size(); + if (size == 0) + return; + + if (distributed_rank_ == distributed_procs_ - 1) + num_local_shots = cregs.size() - shot_index[distributed_rank_]; + else + num_local_shots = + shot_index[distributed_rank_ + 1] - shot_index[distributed_rank_]; + + // number of 64-bit integers per memory + n64 = (size + 63) >> 6; + + reg_t bin_memory(n64 * num_local_shots, 0); + // compress memory string to binary +#pragma omp parallel for private(i, j, i64, ibit) + for (i = 0; i < num_local_shots; i++) { + for (j = 0; j < size; j++) { + i64 = j >> 6; + ibit = j & 63; + if (cregs[shot_index[distributed_rank_] + i].creg_memory()[j] == '1') { + bin_memory[i * n64 + i64] |= (1ull << ibit); + } + } + } + + reg_t recv(n64 * cregs.size()); + std::vector recv_counts(distributed_procs_); + std::vector recv_offset(distributed_procs_); + + for (i = 0; i < distributed_procs_ - 1; i++) { + recv_offset[i] = shot_index[i]; + recv_counts[i] = shot_index[i + 1] - shot_index[i]; + } + recv_offset[distributed_procs_ - 1] = shot_index[distributed_procs_ - 1]; + recv_counts[i] = cregs.size() - shot_index[distributed_procs_ - 1]; + + MPI_Allgatherv(&bin_memory[0], n64 * num_local_shots, MPI_UINT64_T, &recv[0], + &recv_counts[0], &recv_offset[0], MPI_UINT64_T, + distributed_comm_); + + // store gathered memory +#pragma omp parallel for private(i, j, i64, ibit) + for (i = 0; i < cregs.size(); i++) { + for (j = 0; j < size; j++) { + i64 = j >> 6; + ibit = j & 63; + if (((recv[i * n64 + i64] >> ibit) & 1) == 1) + cregs[i].creg_memory()[j] = '1'; + else + cregs[i].creg_memory()[j] = '0'; + } + } +} +#endif + +//------------------------------------------------------------------------- +} // end namespace CircuitExecutor +//------------------------------------------------------------------------- +} // end namespace AER 
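The `gather_creg_memory` implementation above compresses each classical-memory bit string into 64-bit words before the `MPI_Allgatherv` call, then unpacks the gathered words on every rank. A minimal standalone sketch of just that packing scheme, assuming the same `(size + 63) >> 6` word-count convention; `pack_memory` and `unpack_memory` are names invented here for illustration, not part of the patch:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Pack a '0'/'1' memory string into 64-bit words, low bit first,
// using the same (size + 63) >> 6 word count as gather_creg_memory.
std::vector<uint64_t> pack_memory(const std::string &mem) {
  std::vector<uint64_t> words((mem.size() + 63) >> 6, 0);
  for (std::size_t j = 0; j < mem.size(); j++)
    if (mem[j] == '1')
      words[j >> 6] |= (1ull << (j & 63));
  return words;
}

// Inverse transform: recover the character string from the packed words.
std::string unpack_memory(const std::vector<uint64_t> &words,
                          std::size_t size) {
  std::string mem(size, '0');
  for (std::size_t j = 0; j < size; j++)
    if ((words[j >> 6] >> (j & 63)) & 1)
      mem[j] = '1';
  return mem;
}

int main() {
  const std::string mem = "10110010";
  auto words = pack_memory(mem);
  std::cout << unpack_memory(words, mem.size()) << "\n"; // prints 10110010
}
```

Packing cuts the gathered payload by a factor of eight relative to sending the raw one-byte-per-bit character strings.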
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/density_matrix/densitymatrix.hpp b/src/simulators/density_matrix/densitymatrix.hpp
old mode 100644
new mode 100755
index d574bef6f6..cdbc6c8336
--- a/src/simulators/density_matrix/densitymatrix.hpp
+++ b/src/simulators/density_matrix/densitymatrix.hpp
@@ -59,6 +59,11 @@ class DensityMatrix : public UnitaryMatrix<data_t> {
   // Initializes the current vector so that all qubits are in the |0> state.
   void initialize();
 
+  // initialize from existing state (copy)
+  void initialize(const DensityMatrix &obj) {
+    BaseMatrix::initialize(obj);
+  }
+
   // Initializes the vector to a custom initial state.
   // The vector can be either a statevector or a vectorized density matrix
   // If the length of the data vector does not match either case for the
diff --git a/src/simulators/density_matrix/densitymatrix_executor.hpp b/src/simulators/density_matrix/densitymatrix_executor.hpp
new file mode 100644
index 0000000000..d656a6f9a0
--- /dev/null
+++ b/src/simulators/density_matrix/densitymatrix_executor.hpp
@@ -0,0 +1,1408 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _densitymatrix_executor_hpp_
+#define _densitymatrix_executor_hpp_
+
+#include "simulators/batch_shots_executor.hpp"
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace DensityMatrix {
+
+//-------------------------------------------------------------------------
+// batched-shots executor for density matrix
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
+                 public CircuitExecutor::BatchShotsExecutor<state_t> {
+  using Base = CircuitExecutor::MultiStateExecutor<state_t>;
+  using BasePar = CircuitExecutor::ParallelStateExecutor<state_t>;
+  using BaseBatch = CircuitExecutor::BatchShotsExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+protected:
+  void set_config(const Config &config) override;
+
+  bool shot_branching_supported(void) override { return true; }
+
+  // apply parallel operations
+  bool apply_parallel_op(const Operations::Op &op, ExperimentResult &result,
+                         RngEngine &rng, bool final_op) override;
+
+  // apply op to multiple shots; return false if the op is not supported for
+  // execution in a batch
+  bool apply_batched_op(const int_t istate, const Operations::Op &op,
+                        ExperimentResult &result, std::vector<RngEngine> &rng,
+                        bool final_op = false) override;
+
+  bool apply_branching_op(CircuitExecutor::Branch &root,
+                          const Operations::Op &op, ExperimentResult &result,
+                          bool final_op) override;
+
+  // Initializes an n-qubit state to the all |0> state
+  void initialize_qreg(uint_t num_qubits) override;
+
+  auto move_to_matrix();
+  auto copy_to_matrix();
+
+  template <typename list_t>
+  void initialize_from_vector(const list_t &vec);
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool
sample_noise) override; + + bool allocate_states(uint_t num_states, const Config &config) override { + return BasePar::allocate_states(num_states, config); + } + //----------------------------------------------------------------------- + // Apply instructions + //----------------------------------------------------------------------- + + // Measure qubits and return a list of outcomes [q0, q1, ...] + // If a state subclass supports this function it then "measure" + // should be contained in the set returned by the 'allowed_ops' + // method. + void apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng); + + // Reset the specified qubits to the |0> state by tracing out qubits + void apply_reset(const reg_t &qubits); + + // Apply a Kraus error operation + void apply_kraus(const reg_t &qubits, const std::vector &kraus); + + //----------------------------------------------------------------------- + // Save data instructions + //----------------------------------------------------------------------- + + // Save the current full density matrix + void apply_save_state(const Operations::Op &op, ExperimentResult &result, + bool last_op = false); + + // Save the current density matrix or reduced density matrix + void apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result, + bool last_op = false); + + // Helper function for computing expectation value + void apply_save_probs(const Operations::Op &op, ExperimentResult &result); + + // Helper function for saving amplitudes squared + void apply_save_amplitudes_sq(const Operations::Op &op, + ExperimentResult &result); + + // Helper function for computing expectation value + virtual double expval_pauli(const reg_t &qubits, + const std::string &pauli) override; + + // Return the reduced density matrix for the simulator + cmatrix_t reduced_density_matrix(const reg_t &qubits, bool last_op = false); + cmatrix_t reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted); + + //----------------------------------------------------------------------- + // Measurement Helpers + //----------------------------------------------------------------------- + + // Return vector of measure probabilities for specified qubits + // If a state subclass supports this function it then "measure" + // should be contained in the set returned by the 'allowed_ops' + // method. + rvector_t measure_probs(const reg_t &qubits) const; + + // Sample the measurement outcome for qubits + // return a pair (m, p) of the outcome m, and its corresponding + // probability p. 
+ // Outcome is given as an int: Eg for two-qubits {q0, q1} we have + // 0 -> |q1 = 0, q0 = 0> state + // 1 -> |q1 = 0, q0 = 1> state + // 2 -> |q1 = 1, q0 = 0> state + // 3 -> |q1 = 1, q0 = 1> state + std::pair sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng); + + void measure_reset_update(const std::vector &qubits, + const uint_t final_state, const uint_t meas_state, + const double meas_prob); + + // Sample n-measurement outcomes without applying the measure operation + // to the system state + std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) const override; + + rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits); + void measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs); + void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits, + const reg_t &cmemory, const reg_t &cregister); + + std::vector sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const override; + + //----------------------------------------------------------------------- + // Functions for multi-chunk distribution + //----------------------------------------------------------------------- + // swap between chunks + void apply_chunk_swap(const reg_t &qubits) override; + + // apply multiple swaps between chunks + void apply_multi_chunk_swap(const reg_t &qubits) override; + + // scale for density matrix = 2 + // this function is used in the base class to scale chunk qubits for + // multi-chunk distribution + uint_t qubit_scale(void) override { return 2; } +}; + +//------------------------------------------------------------------------- +// Initialization +//------------------------------------------------------------------------- +template +void Executor::initialize_qreg(uint_t num_qubits) { + for (int_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].qreg().set_num_qubits(BasePar::chunk_bits_); + } + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + if (Base::global_state_index_ + iChunk == 0) { + Base::states_[iChunk].qreg().initialize(); + } else { + Base::states_[iChunk].qreg().zero(); + } + } + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) { + if (Base::global_state_index_ + i == 0) { + Base::states_[i].qreg().initialize(); + } else { + Base::states_[i].qreg().zero(); + } + } + } +} + +template +template +void Executor::initialize_from_vector(const list_t &vec) { + if ((1ull << (Base::num_qubits_ * 2)) == vec.size()) { + BasePar::initialize_from_vector(vec); + } else if ((1ull << (Base::num_qubits_ * 2)) == vec.size() * vec.size()) { + int_t iChunk; + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + uint_t irow_chunk = ((iChunk + Base::global_state_index_) >> + ((Base::num_qubits_ - BasePar::chunk_bits_))) + << (BasePar::chunk_bits_); + uint_t icol_chunk = + ((iChunk + Base::global_state_index_) & + ((1ull << ((Base::num_qubits_ - BasePar::chunk_bits_))) - 1)) + << (BasePar::chunk_bits_); + + // copy part of state for this chunk + uint_t i, row, col; + list_t vec1(1ull << 
BasePar::chunk_bits_); + list_t vec2(1ull << BasePar::chunk_bits_); + + for (i = 0; i < (1ull << BasePar::chunk_bits_); i++) { + vec1[i] = vec[(irow_chunk << BasePar::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BasePar::chunk_bits_) + i]); + } + Base::states_[iChunk].qreg().initialize_from_vector( + AER::Utils::tensor_product(vec1, vec2)); + } + } + } else { + for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) { + uint_t irow_chunk = ((iChunk + Base::global_state_index_) >> + ((Base::num_qubits_ - BasePar::chunk_bits_))) + << (BasePar::chunk_bits_); + uint_t icol_chunk = + ((iChunk + Base::global_state_index_) & + ((1ull << ((Base::num_qubits_ - BasePar::chunk_bits_))) - 1)) + << (BasePar::chunk_bits_); + + // copy part of state for this chunk + uint_t i, row, col; + list_t vec1(1ull << BasePar::chunk_bits_); + list_t vec2(1ull << BasePar::chunk_bits_); + + for (i = 0; i < (1ull << BasePar::chunk_bits_); i++) { + vec1[i] = vec[(irow_chunk << BasePar::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BasePar::chunk_bits_) + i]); + } + Base::states_[iChunk].qreg().initialize_from_vector( + AER::Utils::tensor_product(vec1, vec2)); + } + } + } else { + throw std::runtime_error( + "DensityMatrixChunk::initialize input vector is incorrect length. " + "Expected: " + + std::to_string((1ull << (Base::num_qubits_ * 2))) + + " Received: " + std::to_string(vec.size())); + } +} + +template +auto Executor::move_to_matrix() { + return BasePar::apply_to_matrix(false); +} + +template +auto Executor::copy_to_matrix() { + return BasePar::apply_to_matrix(true); +} + +//------------------------------------------------------------------------- +// Utility +//------------------------------------------------------------------------- + +template +void Executor::set_config(const Config &config) { + BasePar::set_config(config); + BaseBatch::set_config(config); +} + +template +void Executor::run_circuit_shots( + Circuit &circ, const Noise::NoiseModel &noise, const Config &config, + RngEngine &init_rng, ExperimentResult &result, bool sample_noise) { + state_t dummy_state; + if (BasePar::multiple_chunk_required(circ, noise)) { + return BasePar::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } else { + return BaseBatch::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } +} + +//========================================================================= +// Implementation: apply operations +//========================================================================= + +template +bool Executor::apply_parallel_op(const Operations::Op &op, + ExperimentResult &result, + RngEngine &rng, bool final_ops) { + if (Base::states_[0].creg().check_conditional(op)) { + switch (op.type) { + case Operations::OpType::reset: + apply_reset(op.qubits); + break; + case Operations::OpType::measure: + apply_measure(op.qubits, op.memory, op.registers, rng); + break; + case Operations::OpType::bfunc: + BasePar::apply_bfunc(op); + break; + case Operations::OpType::roerror: + BasePar::apply_roerror(op, rng); + break; + case Operations::OpType::kraus: + apply_kraus(op.qubits, op.mats); + break; + case Operations::OpType::set_statevec: + initialize_from_vector(op.params); + break; + case Operations::OpType::set_densmat: + BasePar::initialize_from_matrix(op.mats[0]); + break; + case Operations::OpType::save_expval: + case Operations::OpType::save_expval_var: + BasePar::apply_save_expval(op, result); + break; + case Operations::OpType::save_state: + 
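+      // (Illustrative note, not part of the patch) full-state saves are
+      // validated against num_qubits_ and remapped from "single" to
+      // "average" data subtypes inside apply_save_state further below.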
apply_save_state(op, result, final_ops); + break; + case Operations::OpType::save_densmat: + apply_save_density_matrix(op, result, final_ops); + break; + case Operations::OpType::save_probs: + case Operations::OpType::save_probs_ket: + apply_save_probs(op, result); + break; + case Operations::OpType::save_amps_sq: + apply_save_amplitudes_sq(op, result); + break; + default: + return false; + } + } + return true; +} + +template +bool Executor::apply_batched_op(const int_t istate, + const Operations::Op &op, + ExperimentResult &result, + std::vector &rng, + bool final_op) { + if (op.conditional) { + Base::states_[istate].qreg().set_conditional(op.conditional_reg); + } + + switch (op.type) { + case Operations::OpType::barrier: + case Operations::OpType::nop: + case Operations::OpType::qerror_loc: + break; + case Operations::OpType::reset: + Base::states_[istate].apply_reset(op.qubits); + break; + case Operations::OpType::measure: + Base::states_[istate].qreg().apply_batched_measure(op.qubits, rng, + op.memory, op.registers); + break; + case Operations::OpType::bfunc: + Base::states_[istate].qreg().apply_bfunc(op); + break; + case Operations::OpType::roerror: + Base::states_[istate].qreg().apply_roerror(op, rng); + break; + case Operations::OpType::gate: + Base::states_[istate].apply_gate(op); + break; + case Operations::OpType::matrix: + Base::states_[istate].apply_matrix(op.qubits, op.mats[0]); + break; + case Operations::OpType::diagonal_matrix: + Base::states_[istate].apply_diagonal_unitary_matrix(op.qubits, op.params); + break; + case Operations::OpType::superop: + Base::states_[istate].qreg().apply_superop_matrix( + op.qubits, Utils::vectorize_matrix(op.mats[0])); + break; + case Operations::OpType::kraus: + Base::states_[istate].apply_kraus(op.qubits, op.mats); + break; + default: + // other operations should be called to indivisual chunks by apply_op + return false; + } + return true; +} + +template +bool Executor::apply_branching_op(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result, + bool final_op) { + RngEngine dummy; + if (Base::states_[root.state_index()].creg().check_conditional(op)) { + switch (op.type) { + // ops with branching + // case Operations::OpType::reset: + // apply_reset(root, op.qubits); + // break; + case Operations::OpType::measure: + apply_measure(root, op.qubits, op.memory, op.registers); + break; + // save ops + case Operations::OpType::save_expval: + case Operations::OpType::save_expval_var: + case Operations::OpType::save_state: + case Operations::OpType::save_densmat: + case Operations::OpType::save_probs: + case Operations::OpType::save_probs_ket: + case Operations::OpType::save_amps_sq: + // call save functions in state class + Base::states_[root.state_index()].apply_op(op, result, dummy, final_op); + break; + default: + return false; + } + } + return true; +} + +//========================================================================= +// Implementation: Save data +//========================================================================= + +template +void Executor::apply_save_probs(const Operations::Op &op, + ExperimentResult &result) { + auto probs = measure_probs(op.qubits); + if (op.type == Operations::OpType::save_probs_ket) { + result.save_data_average( + Base::states_[0].creg(), op.string_params[0], + Utils::vec2ket(probs, Base::json_chop_threshold_, 16), op.type, + op.save_type); + } else { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(probs), op.type, op.save_type); + } 
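+  // (Illustrative note, not part of the patch) save_probs stores the dense
+  // probability vector, while save_probs_ket keeps only the nonzero entries
+  // keyed by outcome label via Utils::vec2ket, dropping values below
+  // json_chop_threshold_.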
+} + +template +void Executor::apply_save_amplitudes_sq(const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes_sq instructions (empty params)."); + } + const int_t size = op.int_params.size(); + rvector_t amps_sq(size); + + int_t iChunk; +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) private(iChunk) + for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + iChunk) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + iChunk) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow != icol) + continue; + + for (int_t i = 0; i < size; ++i) { + uint_t idx = BasePar::mapped_index(op.int_params[i]); + if (idx >= (irow << BasePar::chunk_bits_) && + idx < ((irow + 1) << BasePar::chunk_bits_)) + amps_sq[i] = Base::states_[iChunk].qreg().probability( + idx - (irow << BasePar::chunk_bits_)); + } + } +#ifdef AER_MPI + BasePar::reduce_sum(amps_sq); +#endif + + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); +} + +template +double Executor::expval_pauli(const reg_t &qubits, + const std::string &pauli) { + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + std::string pauli_in_chunk; + std::string pauli_out_chunk; + int_t i, n; + double expval(0.); + + // get inner/outer chunk pauli string + n = pauli.size(); + for (i = 0; i < n; i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + pauli_in_chunk.push_back(pauli[n - i - 1]); + } else { + qubits_out_chunk.push_back(qubits[i]); + pauli_out_chunk.push_back(pauli[n - i - 1]); + } + } + + int_t nrows = 1ull << ((Base::num_qubits_ - BasePar::chunk_bits_)); + + if (qubits_out_chunk.size() > 0) { // there are bits out of chunk + std::complex phase = 1.0; + + std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); + std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); + + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = + AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); + + z_mask >>= (BasePar::chunk_bits_); + if (x_mask != 0) { + x_mask >>= (BasePar::chunk_bits_); + x_max -= (BasePar::chunk_bits_); + + AER::QV::add_y_phase(num_y, phase); + + const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); + const uint_t mask_l = (1ull << x_max) - 1; + + for (i = 0; i < nrows / 2; i++) { + uint_t irow = ((i << 1) & mask_u) | (i & mask_l); + uint_t iChunk = (irow ^ x_mask) + irow * nrows; + + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli_non_diagonal_chunk( + qubits_in_chunk, pauli_in_chunk, phase); + } + } + } else { + for (i = 0; i < nrows; i++) { + uint_t iChunk = i * (nrows + 1); + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += + sign * Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli(qubits_in_chunk, pauli_in_chunk, 1.0); + } + } + } + } else { // all bits are inside chunk + 
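+    // (Illustrative note, not part of the patch) when every Pauli term acts
+    // inside a chunk, only the diagonal blocks iChunk = i * (nrows + 1) of
+    // the chunked density matrix contribute to Tr[P * rho], so each block is
+    // evaluated locally and the partial sums are reduced over MPI below.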
for (i = 0; i < nrows; i++) { + uint_t iChunk = i * (nrows + 1); + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + expval += Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli(qubits, pauli, 1.0); + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(expval); +#endif + return expval; +} + +template +void Executor::apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + reduced_density_matrix(op.qubits, last_op), op.type, + op.save_type); +} + +template +void Executor::apply_save_state(const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + " was not applied to all qubits." + " Only the full state can be saved."); + } + // Renamp single data type to average + Operations::DataSubType save_type; + switch (op.save_type) { + case Operations::DataSubType::single: + save_type = Operations::DataSubType::average; + break; + case Operations::DataSubType::c_single: + save_type = Operations::DataSubType::c_average; + break; + default: + save_type = op.save_type; + } + + // Default key + std::string key = (op.string_params[0] == "_method_") ? "density_matrix" + : op.string_params[0]; + if (last_op) { + result.save_data_average(Base::states_[0].creg(), key, move_to_matrix(), + Operations::OpType::save_densmat, save_type); + } else { + result.save_data_average(Base::states_[0].creg(), key, copy_to_matrix(), + Operations::OpType::save_densmat, save_type); + } +} + +template +cmatrix_t Executor::reduced_density_matrix(const reg_t &qubits, + bool last_op) { + cmatrix_t reduced_state; + + // Check if tracing over all qubits + if (qubits.empty()) { + reduced_state = cmatrix_t(1, 1); + std::complex sum = 0.0; + for (int_t i = 0; i < Base::states_.size(); i++) { + sum += Base::states_[i].qreg().trace(); + } +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + reduced_state[0] = sum; + } else { + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + if ((qubits.size() == Base::num_qubits_) && (qubits == qubits_sorted)) { + if (last_op) { + reduced_state = move_to_matrix(); + } else { + reduced_state = copy_to_matrix(); + } + } else { + reduced_state = reduced_density_matrix_helper(qubits, qubits_sorted); + } + } + return reduced_state; +} + +template +cmatrix_t +Executor::reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted) { + int_t iChunk; + uint_t size = 1ull << (BasePar::chunk_bits_ * 2); + uint_t mask = (1ull << (BasePar::chunk_bits_)) - 1; + uint_t num_threads = Base::states_[0].qreg().get_omp_threads(); + + size_t size_required = + (sizeof(std::complex) << (qubits.size() * 2)) + + (sizeof(std::complex) << (BasePar::chunk_bits_ * 2)) * + Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store density matrix")); + } + cmatrix_t reduced_state(1ull << qubits.size(), 1ull << qubits.size(), true); + + if (Base::distributed_rank_ == 0) { + auto tmp = Base::states_[0].copy_to_matrix(); + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + int_t i; + uint_t irow_chunk = + (iChunk >> ((Base::num_qubits_ - BasePar::chunk_bits_))) + << BasePar::chunk_bits_; + uint_t icol_chunk = + (iChunk & 
+ ((1ull << ((Base::num_qubits_ - BasePar::chunk_bits_))) - 1)) + << BasePar::chunk_bits_; + + if (iChunk < Base::num_local_states_) + tmp = Base::states_[iChunk].qreg().copy_to_matrix(); +#ifdef AER_MPI + else + BasePar::recv_data(tmp.data(), size, 0, iChunk); +#endif +#pragma omp parallel for if (num_threads > 1) num_threads(num_threads) + for (i = 0; i < size; i++) { + uint_t irow = (i >> (BasePar::chunk_bits_)) + irow_chunk; + uint_t icol = (i & mask) + icol_chunk; + uint_t irow_out = 0; + uint_t icol_out = 0; + int j; + for (j = 0; j < qubits.size(); j++) { + if ((irow >> qubits[j]) & 1) { + irow &= ~(1ull << qubits[j]); + irow_out += (1ull << j); + } + if ((icol >> qubits[j]) & 1) { + icol &= ~(1ull << qubits[j]); + icol_out += (1ull << j); + } + } + if (irow == icol) { // only diagonal base can be reduced + uint_t idx = ((irow_out) << qubits.size()) + icol_out; +#pragma omp critical + reduced_state[idx] += tmp[i]; + } + } + } + } else { +#ifdef AER_MPI + // send matrices to process 0 + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + uint_t iProc = BasePar::get_process_by_chunk(iChunk); + if (iProc == Base::distributed_rank_) { + auto tmp = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .copy_to_matrix(); + BasePar::send_data(tmp.data(), size, iChunk, 0); + } + } +#endif + } + + return reduced_state; +} + +//========================================================================= +// Implementation: Reset and Measurement Sampling +//========================================================================= + +template +void Executor::apply_measure(const reg_t &qubits, + const reg_t &cmemory, + const reg_t &cregister, + RngEngine &rng) { + // Actual measurement outcome + const auto meas = sample_measure_with_prob(qubits, rng); + // Implement measurement update + measure_reset_update(qubits, meas.first, meas.first, meas.second); + const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); + BasePar::store_measure(outcome, cmemory, cregister); +} + +template +rvector_t Executor::measure_probs(const reg_t &qubits) const { + uint_t dim = 1ull << qubits.size(); + rvector_t sum(dim, 0.0); + int_t i, j, k; + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + for (i = 0; i < qubits.size(); i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + } else { + qubits_out_chunk.push_back(qubits[i]); + } + } + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i, j, k) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + + if (irow == icol) { // diagonal chunk + if (qubits_in_chunk.size() > 0) { + auto chunkSum = + Base::states_[i].qreg().probabilities(qubits_in_chunk); + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { +#pragma omp atomic + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < (BasePar::chunk_bits_)) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << (BasePar::chunk_bits_)) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } +#pragma omp atomic + sum[idx] += 
chunkSum[j]; + } + } + } else { // there is no bit in chunk + auto tr = std::real(Base::states_[i].qreg().trace()); + int idx = 0; + for (k = 0; k < qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) + << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += tr; + } + } + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + + if (irow == icol) { // diagonal chunk + if (qubits_in_chunk.size() > 0) { + auto chunkSum = + Base::states_[i].qreg().probabilities(qubits_in_chunk); + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < (BasePar::chunk_bits_)) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << (BasePar::chunk_bits_)) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } + } + } else { // there is no bit in chunk + auto tr = std::real(Base::states_[i].qreg().trace()); + int idx = 0; + for (k = 0; k < qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } + sum[idx] += tr; + } + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + + return sum; +} + +template +void Executor::apply_reset(const reg_t &qubits) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + Base::states_[iChunk].qreg().apply_reset(qubits); + } + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_reset(qubits); + } +} + +template +std::pair +Executor::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); + // Randomly pick outcome and return pair + uint_t outcome = rng.rand_int(probs); + return std::make_pair(outcome, probs[outcome]); +} + +template +void Executor::measure_reset_update(const reg_t &qubits, + const uint_t final_state, + const uint_t meas_state, + const double meas_prob) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + cvector_t mdiag(2, 0.); + mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + + // If it doesn't agree with the reset state update + if (final_state != meas_state) { + if (qubits[0] < BasePar::chunk_bits_) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_x(qubits[0]); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_x(qubits[0]); + } + } else { + BasePar::apply_chunk_x(qubits[0]); + BasePar::apply_chunk_x(qubits[0] + BasePar::chunk_bits_); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + cvector_t mdiag(dim, 0.); + mdiag[meas_state] = 1. / std::sqrt(meas_prob); + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_unitary_matrix(qubits, mdiag); + } + + // If it doesn't agree with the reset state update + // TODO This function could be optimized as a permutation update + if (final_state != meas_state) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != meas_state) + perm[j * dim + j] = 1.; + } + // apply permutation to swap state + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + for (int_t i = 0; i < qubits.size(); i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + } else { + qubits_out_chunk.push_back(qubits[i]); + } + } + if (qubits_in_chunk.size() > 0) { // in chunk exchange + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_unitary_matrix(qubits, perm); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_unitary_matrix(qubits, perm); + } + } + if (qubits_out_chunk.size() > 0) { // out of chunk exchange + for (int_t i = 0; i < qubits_out_chunk.size(); i++) { + BasePar::apply_chunk_x(qubits_out_chunk[i]); + BasePar::apply_chunk_x(qubits_out_chunk[i] + + (Base::num_qubits_ - BasePar::chunk_bits_)); + } + } + } + } +} + +template +std::vector Executor::sample_measure(const reg_t &qubits, + uint_t shots, + RngEngine &rng) const { + // Generate flat register for storing + std::vector rnds; + rnds.reserve(shots); + for (uint_t i = 0; i < shots; ++i) + rnds.push_back(rng.rand(0, 1)); + 
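+  // (Illustrative note, not part of the patch) one uniform draw in [0,1)
+  // per shot; the per-chunk trace prefix sums built below route each draw to
+  // the diagonal chunk whose cumulative-probability interval contains it,
+  // and the outcome is then sampled locally inside that chunk.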
reg_t allbit_samples(shots, 0); + + int_t i, j; + std::vector chunkSum(Base::states_.size() + 1, 0); + double sum, localSum; + // calculate per chunk sum + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow == icol) // only diagonal chunk has probabilities + chunkSum[i] = std::real(Base::states_[i].qreg().trace()); + else + chunkSum[i] = 0.0; + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow == icol) // only diagonal chunk has probabilities + chunkSum[i] = std::real(Base::states_[i].qreg().trace()); + else + chunkSum[i] = 0.0; + } + } + localSum = 0.0; + for (i = 0; i < Base::states_.size(); i++) { + sum = localSum; + localSum += chunkSum[i]; + chunkSum[i] = sum; + } + chunkSum[Base::states_.size()] = localSum; + + double globalSum = 0.0; + if (Base::nprocs_ > 1) { + std::vector procTotal(Base::nprocs_); + + for (i = 0; i < Base::nprocs_; i++) { + procTotal[i] = localSum; + } + BasePar::gather_value(procTotal); + + for (i = 0; i < Base::myrank_; i++) { + globalSum += procTotal[i]; + } + } + + reg_t local_samples(shots, 0); + + // get rnds positions for each chunk + for (i = 0; i < Base::states_.size(); i++) { + uint_t irow, icol; + irow = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + icol = (Base::global_state_index_ + i) - + (irow << ((Base::num_qubits_ - BasePar::chunk_bits_))); + if (irow != icol) + continue; + + uint_t nIn; + std::vector vIdx; + std::vector vRnd; + + // find rnds in this chunk + nIn = 0; + for (j = 0; j < shots; j++) { + if (rnds[j] >= chunkSum[i] + globalSum && + rnds[j] < chunkSum[i + 1] + globalSum) { + vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); + vIdx.push_back(j); + nIn++; + } + } + + if (nIn > 0) { + auto chunkSamples = Base::states_[i].qreg().sample_measure(vRnd); + uint_t ir; + ir = (Base::global_state_index_ + i) >> + ((Base::num_qubits_ - BasePar::chunk_bits_)); + + for (j = 0; j < chunkSamples.size(); j++) { + local_samples[vIdx[j]] = (ir << BasePar::chunk_bits_) + chunkSamples[j]; + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(local_samples); +#endif + allbit_samples = local_samples; + + // Convert to reg_t format + std::vector all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + return all_samples; +} + +template +rvector_t +Executor::sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = + Base::states_[root.state_index()].qreg().probabilities(qubits); + uint_t nshots = root.num_shots(); + reg_t shot_branch(nshots); + + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = root.rng_shots()[i].rand_int(probs); + } + + // branch shots + root.creg() = 
Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, probs.size()); + + return probs; +} + +template +void Executor::measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + for (int_t i = 0; i < 2; i++) { + cvector_t mdiag(2, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + Operations::Op op; + op.type = OpType::gate; + op.name = "x"; + op.qubits = qubits; + root.branches()[i]->add_op_after_branch(op); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + for (int_t i = 0; i < dim; i++) { + cvector_t mdiag(dim, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + i] = 1.; + perm[i * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != i) + perm[j * dim + j] = 1.; + } + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(Utils::devectorize_matrix(perm)); + root.branches()[i]->add_op_after_branch(op); + } + } + } +} + +template +void Executor::apply_measure(CircuitExecutor::Branch &root, + const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + // save result to cregs + for (int_t i = 0; i < probs.size(); i++) { + const reg_t outcome = Utils::int2reg(i, 2, qubits.size()); + root.branches()[i]->creg().store_measure(outcome, cmemory, cregister); + } + + measure_reset_update(root, qubits, -1, probs); +} +/* +template +void Executor::apply_reset(CircuitExecutor::Branch& root, const +reg_t &qubits) +{ + rvector_t probs = sample_measure_with_prob(root, qubits); + + measure_reset_update(root, qubits, 0, probs); +} +*/ + +template +std::vector +Executor::sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const { + int_t i, j; + std::vector rnds; + rnds.reserve(shots); + + /* + double norm = std::real( state.qreg().trace() ); + std::cout << " trace = " << norm< all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + return all_samples; +} + +//========================================================================= +// Implementation: Kraus Noise +//========================================================================= + +template +void Executor::apply_kraus(const reg_t &qubits, + const std::vector &kmats) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { 
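+    // (Illustrative note, not part of the patch) for a density matrix the
+    // whole Kraus channel rho -> sum_k K_k rho K_k^dag is applied in one
+    // step as its superoperator, built by Utils::kraus_superop and
+    // vectorized, instead of being sampled stochastically per shot.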
+#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + Base::states_[iChunk].qreg().apply_superop_matrix( + qubits, Utils::vectorize_matrix(Utils::kraus_superop(kmats))); + } + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_superop_matrix( + qubits, Utils::vectorize_matrix(Utils::kraus_superop(kmats))); + } +} + +//----------------------------------------------------------------------- +// Functions for multi-chunk distribution +//----------------------------------------------------------------------- +// swap between chunks +template +void Executor::apply_chunk_swap(const reg_t &qubits) { + uint_t q0, q1; + q0 = qubits[0]; + q1 = qubits[1]; + + std::swap(BasePar::qubit_map_[q0], BasePar::qubit_map_[q1]); + + if (qubits[0] >= BasePar::chunk_bits_) { + q0 += BasePar::chunk_bits_; + } + if (qubits[1] >= BasePar::chunk_bits_) { + q1 += BasePar::chunk_bits_; + } + reg_t qs0 = {{q0, q1}}; + BasePar::apply_chunk_swap(qs0); + + if (qubits[0] >= BasePar::chunk_bits_) { + q0 += (Base::num_qubits_ - BasePar::chunk_bits_); + } else { + q0 += BasePar::chunk_bits_; + } + if (qubits[1] >= BasePar::chunk_bits_) { + q1 += (Base::num_qubits_ - BasePar::chunk_bits_); + } else { + q1 += BasePar::chunk_bits_; + } + reg_t qs1 = {{q0, q1}}; + BasePar::apply_chunk_swap(qs1); +} + +template +void Executor::apply_multi_chunk_swap(const reg_t &qubits) { + reg_t qubits_density; + + for (int_t i = 0; i < qubits.size(); i += 2) { + uint_t q0, q1; + q0 = qubits[i * 2]; + q1 = qubits[i * 2 + 1]; + + std::swap(BasePar::qubit_map_[q0], BasePar::qubit_map_[q1]); + + if (q1 >= BasePar::chunk_bits_) { + q1 += BasePar::chunk_bits_; + } + qubits_density.push_back(q0); + qubits_density.push_back(q1); + + q0 += BasePar::chunk_bits_; + if (q1 >= BasePar::chunk_bits_) { + q1 += (Base::num_qubits_ - BasePar::chunk_bits_ * 2); + } + } + + BasePar::apply_multi_chunk_swap(qubits_density); +} + +//------------------------------------------------------------------------- +} // end namespace DensityMatrix +//------------------------------------------------------------------------- +} // end namespace AER +//------------------------------------------------------------------------- +#endif diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp index ab8e3b4fd3..a5bfa46585 100644 --- a/src/simulators/density_matrix/densitymatrix_state.hpp +++ b/src/simulators/density_matrix/densitymatrix_state.hpp @@ -23,7 +23,8 @@ #include "framework/json.hpp" #include "framework/opset.hpp" #include "framework/utils.hpp" -#include "simulators/state_chunk.hpp" +#include "simulators/chunk_utils.hpp" +#include "simulators/state.hpp" #ifdef AER_THRUST_SUPPORTED #include "densitymatrix_thrust.hpp" #endif @@ -90,9 +91,9 @@ enum class Gates { //========================================================================= template > -class State : public QuantumState::StateChunk { +class State : public QuantumState::State { public: - using BaseState = QuantumState::StateChunk; + using BaseState = QuantumState::State; State() : BaseState(StateOpSet) {} virtual ~State() = default; @@ -102,32 +103,35 @@ class State : public QuantumState::StateChunk { //----------------------------------------------------------------------- // Return the string name of the State class - virtual std::string name() const override { return 
densmat_t::name(); } + std::string name() const override { return densmat_t::name(); } // Apply an operation // If the op is not in allowed_ops an exeption will be raised. - virtual void apply_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, RngEngine &rng, - bool final_op = false) override; + void apply_op(const Operations::Op &op, ExperimentResult &result, + RngEngine &rng, bool final_op = false) override; + + // memory allocation (previously called before inisitalize_qreg) + bool allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots = 1) override; // Initializes an n-qubit state to the all |0> state - virtual void initialize_qreg(uint_t num_qubits) override; + void initialize_qreg(uint_t num_qubits) override; // Returns the required memory for storing an n-qubit state in megabytes. // For this state the memory is indepdentent of the number of ops // and is approximately 16 * 1 << num_qubits bytes - virtual size_t + size_t required_memory_mb(uint_t num_qubits, const std::vector &ops) const override; // Load the threshold for applying OpenMP parallelization // if the controller/engine allows threads for it - virtual void set_config(const Config &config) override; + void set_config(const Config &config) override; // Sample n-measurement outcomes without applying the measure operation // to the system state - virtual std::vector sample_measure(const reg_t &qubits, uint_t shots, - RngEngine &rng) override; + std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) override; //----------------------------------------------------------------------- // Additional methods @@ -139,96 +143,80 @@ class State : public QuantumState::StateChunk { // Initialize OpenMP settings for the underlying DensityMatrix class void initialize_omp(); - auto move_to_matrix(const int_t iChunk); - auto copy_to_matrix(const int_t iChunk); + auto move_to_matrix(); + auto copy_to_matrix(); -protected: template - void initialize_from_vector(const int_t iChunk, const list_t &vec); + void initialize_from_vector(const list_t &vec); //----------------------------------------------------------------------- // Apply instructions //----------------------------------------------------------------------- - // apply op to multiple shots , return flase if op is not supported to execute - // in a batch - bool apply_batched_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, std::vector &rng, - bool final_op = false) override; - // Applies a sypported Gate operation to the state class. // If the input is not in allowed_gates an exeption will be raised. - void apply_gate(const int_t iChunk, const Operations::Op &op); + void apply_gate(const Operations::Op &op); // apply (multi) control gate by statevector - void apply_gate_statevector(const int_t iChunk, const Operations::Op &op); + void apply_gate_statevector(const Operations::Op &op); // Measure qubits and return a list of outcomes [q0, q1, ...] // If a state subclass supports this function it then "measure" // should be contained in the set returned by the 'allowed_ops' // method. 
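   // NOTE (illustrative, not part of the patch): the iChunk parameter is
   // removed from the signatures below because a State now owns exactly one
   // chunk; iteration over chunks moved into the Executor classes above.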
- virtual void apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, const reg_t &cregister, - RngEngine &rng); + virtual void apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng); // Reset the specified qubits to the |0> state by tracing out qubits - void apply_reset(const int_t iChunk, const reg_t &qubits); + void apply_reset(const reg_t &qubits); // Apply a matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const reg_t &qubits, - const cmatrix_t &mat); + void apply_matrix(const reg_t &qubits, const cmatrix_t &mat); // Apply a vectorized matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const reg_t &qubits, - const cvector_t &vmat); + void apply_matrix(const reg_t &qubits, const cvector_t &vmat); // apply diagonal matrix - void apply_diagonal_unitary_matrix(const int_t iChunk, const reg_t &qubits, + void apply_diagonal_unitary_matrix(const reg_t &qubits, const cvector_t &diag); // Apply a Kraus error operation - void apply_kraus(const int_t iChunk, const reg_t &qubits, - const std::vector &kraus); + void apply_kraus(const reg_t &qubits, const std::vector &kraus); // Apply an N-qubit Pauli gate - void apply_pauli(const int_t iChunk, const reg_t &qubits, - const std::string &pauli); + void apply_pauli(const reg_t &qubits, const std::string &pauli); // apply phase - void apply_phase(const int_t iChunk, const uint_t qubit, - const complex_t phase); - void apply_phase(const int_t iChunk, const reg_t &qubits, - const complex_t phase); + void apply_phase(const uint_t qubit, const complex_t phase); + void apply_phase(const reg_t &qubits, const complex_t phase); +protected: //----------------------------------------------------------------------- // Save data instructions //----------------------------------------------------------------------- // Save the current full density matrix - void apply_save_state(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, bool last_op = false); + void apply_save_state(const Operations::Op &op, ExperimentResult &result, + bool last_op = false); // Save the current density matrix or reduced density matrix - void apply_save_density_matrix(const int_t iChunk, const Operations::Op &op, + void apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result, bool last_op = false); // Helper function for computing expectation value - void apply_save_probs(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result); + void apply_save_probs(const Operations::Op &op, ExperimentResult &result); // Helper function for saving amplitudes squared - void apply_save_amplitudes_sq(const int_t iChunk, const Operations::Op &op, + void apply_save_amplitudes_sq(const Operations::Op &op, ExperimentResult &result); // Helper function for computing expectation value - virtual double expval_pauli(const int_t iChunk, const reg_t &qubits, + virtual double expval_pauli(const reg_t &qubits, const std::string &pauli) override; // Return the reduced density matrix for the simulator - cmatrix_t reduced_density_matrix(const int_t iChunk, const reg_t &qubits, - bool last_op = false); - cmatrix_t reduced_density_matrix_helper(const int_t iChunk, - const reg_t &qubits, + cmatrix_t reduced_density_matrix(const reg_t &qubits, bool last_op = false); + cmatrix_t reduced_density_matrix_helper(const reg_t &qubits, const reg_t &qubits_sorted); 
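// -----------------------------------------------------------------------
// Aside: with the iChunk argument gone, the save-data helpers above act on
// the single local qreg_. A rough standalone sketch of the partial trace
// behind reduced_density_matrix, on a dense row-major matrix; the names
// cplx and partial_trace_msb are illustrative only, not part of this patch.
#include <complex>
#include <cstddef>
#include <vector>

using cplx = std::complex<double>;

// Trace out the most-significant qubit of an n-qubit density matrix rho
// (2^n x 2^n, row-major), producing a 2^(n-1) x 2^(n-1) matrix.
inline std::vector<cplx> partial_trace_msb(const std::vector<cplx> &rho,
                                           std::size_t n_qubits) {
  const std::size_t dim = std::size_t(1) << n_qubits;
  const std::size_t sub = dim >> 1;
  std::vector<cplx> out(sub * sub, cplx(0.0, 0.0));
  for (std::size_t b = 0; b < 2; ++b)       // value of the traced qubit
    for (std::size_t i = 0; i < sub; ++i)   // row of the reduced matrix
      for (std::size_t j = 0; j < sub; ++j) // column of the reduced matrix
        out[i * sub + j] += rho[(b * sub + i) * dim + (b * sub + j)];
  return out;
}
// Tracing out every qubit this way leaves a 1x1 matrix holding the trace,
// which matches the qubits.empty() branch of the implementation further on.
// -----------------------------------------------------------------------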
//----------------------------------------------------------------------- @@ -240,7 +228,7 @@ class State : public QuantumState::StateChunk { // should be contained in the set returned by the 'allowed_ops' // method. // TODO: move to private (no longer part of base class) - rvector_t measure_probs(const int_t iChunk, const reg_t &qubits) const; + rvector_t measure_probs(const reg_t &qubits) const; // Sample the measurement outcome for qubits // return a pair (m, p) of the outcome m, and its corresponding @@ -250,12 +238,10 @@ class State : public QuantumState::StateChunk { // 1 -> |q1 = 0, q0 = 1> state // 2 -> |q1 = 1, q0 = 0> state // 3 -> |q1 = 1, q0 = 1> state - std::pair sample_measure_with_prob(const int_t iChunk, - const reg_t &qubits, + std::pair sample_measure_with_prob(const reg_t &qubits, RngEngine &rng); - void measure_reset_update(const int_t iChunk, - const std::vector &qubits, + void measure_reset_update(const std::vector &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob); @@ -264,8 +250,8 @@ class State : public QuantumState::StateChunk { //----------------------------------------------------------------------- // Apply a waltz gate specified by parameters u3(theta, phi, lambda) - void apply_gate_u3(const int_t iChunk, const uint_t qubit, const double theta, - const double phi, const double lambda); + void apply_gate_u3(const uint_t qubit, const double theta, const double phi, + const double lambda); //----------------------------------------------------------------------- // Config Settings @@ -281,20 +267,6 @@ class State : public QuantumState::StateChunk { // Table of allowed gate names to gate enum class members const static stringmap_t gateset_; - - // scale for density matrix = 2 - // this function is used in the base class to scale chunk qubits for - // multi-chunk distribution - int qubit_scale(void) override { return 2; } - - //----------------------------------------------------------------------- - // Functions for multi-chunk distribution - //----------------------------------------------------------------------- - // swap between chunks - void apply_chunk_swap(const reg_t &qubits) override; - - // apply multiple swaps between chunks - void apply_multi_chunk_swap(const reg_t &qubits) override; }; //========================================================================= @@ -356,41 +328,22 @@ const stringmap_t State::gateset_({ //------------------------------------------------------------------------- template void State::initialize_qreg(uint_t num_qubits) { - if (BaseState::qregs_.size() == 0) - BaseState::allocate(num_qubits, num_qubits, 1); initialize_omp(); - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_num_qubits(BaseState::chunk_bits_); - } + BaseState::qreg_.set_num_qubits(num_qubits); + BaseState::qreg_.initialize(); +} - if (BaseState::multi_chunk_distribution_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - if (BaseState::global_chunk_index_ + iChunk == 0) { - BaseState::qregs_[iChunk].initialize(); - } else { - BaseState::qregs_[iChunk].zero(); - } - } - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - if (BaseState::global_chunk_index_ + i == 0) { - BaseState::qregs_[i].initialize(); - } else { - BaseState::qregs_[i].zero(); - } - 
} - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].initialize(); - } - } +template +bool State::allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots) { + if (BaseState::max_matrix_qubits_ > 0) + BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_); + + BaseState::qreg_.set_target_gpus(BaseState::target_gpus_); + BaseState::qreg_.chunk_setup(block_bits * 2, block_bits * 2, 0, 1); + + return true; } template @@ -401,159 +354,33 @@ void State::initialize_qreg(uint_t num_qubits, densmat_t &&state) { "initial state does not match qubit number"); } - if (BaseState::qregs_.size() == 1) { - BaseState::qregs_[0] = std::move(state); - } else { - initialize_omp(); - for (int_t iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); - } - - if (BaseState::multi_chunk_distribution_) { - auto matrix = state.move_to_matrix(); - uint_t size = 1ull << (BaseState::chunk_bits_ * 2); - uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; - - auto copy_matrix_to_chunks_lambda = [this, &matrix, size, - mask](int_t ig) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - uint_t irow_chunk = - ((iChunk + BaseState::global_chunk_index_) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - << (BaseState::chunk_bits_); - uint_t icol_chunk = - ((iChunk + BaseState::global_chunk_index_) & - ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - - 1)) - << (BaseState::chunk_bits_); - - auto sub_mat = - BaseState::qregs_[iChunk] - .copy_to_matrix(); // allocate sub-matrix by copying data type - // and storage from chunk - for (int_t i = 0; i < size; i++) { - uint_t irow = (i >> (BaseState::chunk_bits_)) + irow_chunk; - uint_t icol = (i & mask) + icol_chunk; - sub_mat[i] = matrix[(irow << BaseState::num_qubits_) + icol]; - } - BaseState::qregs_[iChunk].initialize_from_vector(sub_mat); - } - }; - Utils::apply_omp_parallel_for( - (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0), 0, - BaseState::num_groups_, copy_matrix_to_chunks_lambda); - } else { - auto mat = state.copy_to_matrix(); - for (int_t iChunk = 0; iChunk < BaseState::qregs_.size() - 1; iChunk++) { - BaseState::qregs_[iChunk].initialize_from_vector(mat); - } - BaseState::qregs_[BaseState::qregs_.size() - 1] = std::move(state); - } - } + BaseState::qreg_ = std::move(state); } template void State::initialize_omp() { uint_t i; - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_omp_threshold(omp_qubit_threshold_); - if (BaseState::threads_ > 0) - BaseState::qregs_[i].set_omp_threads( - BaseState::threads_); // set allowed OMP threads in qubitvector - } + BaseState::qreg_.set_omp_threshold(omp_qubit_threshold_); + if (BaseState::threads_ > 0) + BaseState::qreg_.set_omp_threads( + BaseState::threads_); // set allowed OMP threads in qubitvector } template template -void State::initialize_from_vector(const int_t iChunkIn, - const list_t &vec) { - if ((1ull << (BaseState::num_qubits_ * 2)) == vec.size()) { - BaseState::initialize_from_vector(iChunkIn, vec); - } else if ((1ull << (BaseState::num_qubits_ * 2)) == - vec.size() * vec.size()) { - int_t iChunk; - if (BaseState::multi_chunk_distribution_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = 
BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - uint_t irow_chunk = - ((iChunk + BaseState::global_chunk_index_) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & - ((1ull << ((BaseState::num_qubits_ - - BaseState::chunk_bits_))) - - 1)) - << (BaseState::chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - list_t vec1(1ull << BaseState::chunk_bits_); - list_t vec2(1ull << BaseState::chunk_bits_); - - for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) { - vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; - vec2[i] = - std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); - } - BaseState::qregs_[iChunk].initialize_from_vector( - AER::Utils::tensor_product(vec1, vec2)); - } - } - } else { - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - uint_t irow_chunk = - ((iChunk + BaseState::global_chunk_index_) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - << (BaseState::chunk_bits_); - uint_t icol_chunk = - ((iChunk + BaseState::global_chunk_index_) & - ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - - 1)) - << (BaseState::chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - list_t vec1(1ull << BaseState::chunk_bits_); - list_t vec2(1ull << BaseState::chunk_bits_); - - for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) { - vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; - vec2[i] = - std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); - } - BaseState::qregs_[iChunk].initialize_from_vector( - AER::Utils::tensor_product(vec1, vec2)); - } - } - } else { - BaseState::qregs_[iChunkIn].initialize_from_vector( - AER::Utils::tensor_product(AER::Utils::conjugate(vec), vec)); - } - } else { - throw std::runtime_error( - "DensityMatrixChunk::initialize input vector is incorrect length. 
" - "Expected: " + - std::to_string((1ull << (BaseState::num_qubits_ * 2))) + - " Received: " + std::to_string(vec.size())); - } +void State::initialize_from_vector(const list_t &vec) { + BaseState::qreg_.initialize_from_vector( + AER::Utils::tensor_product(AER::Utils::conjugate(vec), vec)); } template -auto State::move_to_matrix(const int_t iChunk) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].move_to_matrix(); - return BaseState::apply_to_matrix(false); +auto State::move_to_matrix() { + return BaseState::qreg_.move_to_matrix(); } template -auto State::copy_to_matrix(const int_t iChunk) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].copy_to_matrix(); - return BaseState::apply_to_matrix(true); +auto State::copy_to_matrix() { + return BaseState::qreg_.copy_to_matrix(); } //------------------------------------------------------------------------- @@ -575,9 +402,7 @@ void State::set_config(const Config &config) { // Set threshold for truncating snapshots json_chop_threshold_ = config.chop_threshold; uint_t i; - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_json_chop_threshold(json_chop_threshold_); - } + BaseState::qreg_.set_json_chop_threshold(json_chop_threshold_); // Set OMP threshold for state update functions omp_qubit_threshold_ = config.statevector_parallel_threshold; @@ -588,64 +413,64 @@ void State::set_config(const Config &config) { //========================================================================= template -void State::apply_op(const int_t iChunk, const Operations::Op &op, +void State::apply_op(const Operations::Op &op, ExperimentResult &result, RngEngine &rng, bool final_ops) { - if (BaseState::check_conditional(iChunk, op)) { + if (BaseState::creg().check_conditional(op)) { switch (op.type) { case OpType::barrier: case OpType::qerror_loc: break; case OpType::reset: - apply_reset(iChunk, op.qubits); + apply_reset(op.qubits); break; case OpType::measure: - apply_measure(iChunk, op.qubits, op.memory, op.registers, rng); + apply_measure(op.qubits, op.memory, op.registers, rng); break; case OpType::bfunc: - BaseState::cregs_[0].apply_bfunc(op); + BaseState::creg().apply_bfunc(op); break; case OpType::roerror: - BaseState::cregs_[0].apply_roerror(op, rng); + BaseState::creg().apply_roerror(op, rng); break; case OpType::gate: - apply_gate(iChunk, op); + apply_gate(op); break; case OpType::matrix: - apply_matrix(iChunk, op.qubits, op.mats[0]); + apply_matrix(op.qubits, op.mats[0]); break; case OpType::diagonal_matrix: - apply_diagonal_unitary_matrix(iChunk, op.qubits, op.params); + apply_diagonal_unitary_matrix(op.qubits, op.params); break; case OpType::superop: - BaseState::qregs_[iChunk].apply_superop_matrix( + BaseState::qreg_.apply_superop_matrix( op.qubits, Utils::vectorize_matrix(op.mats[0])); break; case OpType::kraus: - apply_kraus(iChunk, op.qubits, op.mats); + apply_kraus(op.qubits, op.mats); break; case OpType::set_statevec: - initialize_from_vector(iChunk, op.params); + initialize_from_vector(op.params); break; case OpType::set_densmat: - BaseState::initialize_from_matrix(iChunk, op.mats[0]); + BaseState::qreg_.initialize_from_matrix(op.mats[0]); break; case OpType::save_expval: case OpType::save_expval_var: - BaseState::apply_save_expval(iChunk, op, result); + BaseState::apply_save_expval(op, result); break; case OpType::save_state: - apply_save_state(iChunk, op, result, final_ops); + apply_save_state(op, result, final_ops); break; case OpType::save_densmat: - 
apply_save_density_matrix(iChunk, op, result, final_ops); + apply_save_density_matrix(op, result, final_ops); break; case OpType::save_probs: case OpType::save_probs_ket: - apply_save_probs(iChunk, op, result); + apply_save_probs(op, result); break; case OpType::save_amps_sq: - apply_save_amplitudes_sq(iChunk, op, result); + apply_save_amplitudes_sq(op, result); break; default: throw std::invalid_argument( @@ -654,80 +479,26 @@ void State::apply_op(const int_t iChunk, const Operations::Op &op, } } -template -bool State::apply_batched_op(const int_t iChunk, - const Operations::Op &op, - ExperimentResult &result, - std::vector &rng, - bool final_ops) { - if (op.conditional) - BaseState::qregs_[iChunk].set_conditional(op.conditional_reg); - - switch (op.type) { - case OpType::barrier: - case OpType::nop: - case OpType::qerror_loc: - break; - case OpType::reset: - BaseState::qregs_[iChunk].apply_reset(op.qubits); - break; - case OpType::measure: - BaseState::qregs_[iChunk].apply_batched_measure(op.qubits, rng, op.memory, - op.registers); - break; - case OpType::bfunc: - BaseState::qregs_[iChunk].apply_bfunc(op); - break; - case OpType::roerror: - BaseState::qregs_[iChunk].apply_roerror(op, rng); - break; - case OpType::gate: - apply_gate(iChunk, op); - break; - case OpType::matrix: - apply_matrix(iChunk, op.qubits, op.mats[0]); - break; - case OpType::diagonal_matrix: - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(op.qubits, - op.params); - break; - case OpType::superop: - BaseState::qregs_[iChunk].apply_superop_matrix( - op.qubits, Utils::vectorize_matrix(op.mats[0])); - break; - case OpType::kraus: - apply_kraus(iChunk, op.qubits, op.mats); - break; - default: - // other operations should be called to indivisual chunks by apply_op - return false; - } - return true; -} - //========================================================================= // Implementation: Save data //========================================================================= template -void State::apply_save_probs(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_probs(const Operations::Op &op, ExperimentResult &result) { - auto probs = measure_probs(iChunk, op.qubits); - auto cr = this->creg(BaseState::get_global_shot_index(iChunk)); + auto probs = measure_probs(op.qubits); if (op.type == OpType::save_probs_ket) { - result.save_data_average(cr, op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], Utils::vec2ket(probs, json_chop_threshold_, 16), op.type, op.save_type); } else { - result.save_data_average(cr, op.string_params[0], std::move(probs), op.type, - op.save_type); + result.save_data_average(BaseState::creg(), op.string_params[0], + std::move(probs), op.type, op.save_type); } } template -void State::apply_save_amplitudes_sq(const int_t iChunkIn, - const Operations::Op &op, +void State::apply_save_amplitudes_sq(const Operations::Op &op, ExperimentResult &result) { if (op.int_params.empty()) { throw std::invalid_argument( @@ -736,162 +507,37 @@ void State::apply_save_amplitudes_sq(const int_t iChunkIn, const int_t size = op.int_params.size(); rvector_t amps_sq(size); - if (BaseState::multi_chunk_distribution_) { - int_t iChunk; -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) private(iChunk) - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + iChunk) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + iChunk) - 
- (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow != icol) - continue; - -#pragma omp parallel for if (size > pow(2, omp_qubit_threshold_) && \ - BaseState::threads_ > 1 && \ - !BaseState::chunk_omp_parallel_) \ - num_threads(BaseState::threads_) - for (int_t i = 0; i < size; ++i) { - uint_t idx = BaseState::mapped_index(op.int_params[i]); - if (idx >= (irow << BaseState::chunk_bits_) && - idx < ((irow + 1) << BaseState::chunk_bits_)) - amps_sq[i] = BaseState::qregs_[iChunk].probability( - idx - (irow << BaseState::chunk_bits_)); - } - } -#ifdef AER_MPI - BaseState::reduce_sum(amps_sq); -#endif - } else { #pragma omp parallel for if (size > pow(2, omp_qubit_threshold_) && \ BaseState::threads_ > 1) \ num_threads(BaseState::threads_) - for (int_t i = 0; i < size; ++i) { - amps_sq[i] = BaseState::qregs_[iChunkIn].probability(op.int_params[i]); - } + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = BaseState::qreg_.probability(op.int_params[i]); } - auto cr = this->creg(BaseState::get_global_shot_index(iChunkIn)); - result.save_data_average(cr, op.string_params[0], std::move(amps_sq), op.type, - op.save_type); + + result.save_data_average(BaseState::creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); } template -double State::expval_pauli(const int_t iChunk, const reg_t &qubits, +double State::expval_pauli(const reg_t &qubits, const std::string &pauli) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].expval_pauli(qubits, pauli); - - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - std::string pauli_in_chunk; - std::string pauli_out_chunk; - int_t i, n; - double expval(0.); - - // get inner/outer chunk pauli string - n = pauli.size(); - for (i = 0; i < n; i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - pauli_in_chunk.push_back(pauli[n - i - 1]); - } else { - qubits_out_chunk.push_back(qubits[i]); - pauli_out_chunk.push_back(pauli[n - i - 1]); - } - } - - int_t nrows = 1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - - if (qubits_out_chunk.size() > 0) { // there are bits out of chunk - std::complex phase = 1.0; - - std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); - std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); - - uint_t x_mask, z_mask, num_y, x_max; - std::tie(x_mask, z_mask, num_y, x_max) = - AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); - - z_mask >>= (BaseState::chunk_bits_); - if (x_mask != 0) { - x_mask >>= (BaseState::chunk_bits_); - x_max -= (BaseState::chunk_bits_); - - AER::QV::add_y_phase(num_y, phase); - - const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); - const uint_t mask_l = (1ull << x_max) - 1; - - for (i = 0; i < nrows / 2; i++) { - uint_t irow = ((i << 1) & mask_u) | (i & mask_l); - uint_t iChunk = (irow ^ x_mask) + irow * nrows; - - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) { // on this process - double sign = 2.0; - if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) - sign = -2.0; - expval += sign * - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli_non_diagonal_chunk(qubits_in_chunk, - pauli_in_chunk, phase); - } - } - } else { - for (i = 0; i < nrows; i++) { - uint_t iChunk = i * (nrows + 1); - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) 
{ // on this process - double sign = 1.0; - if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) - sign = -1.0; - expval += - sign * BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli(qubits_in_chunk, pauli_in_chunk, 1.0); - } - } - } - } else { // all bits are inside chunk - for (i = 0; i < nrows; i++) { - uint_t iChunk = i * (nrows + 1); - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) { // on this process - expval += BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli(qubits, pauli, 1.0); - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(expval); -#endif - return expval; + return BaseState::qreg_.expval_pauli(qubits, pauli); } template -void State::apply_save_density_matrix(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result, bool last_op) { - auto cr = this->creg(BaseState::get_global_shot_index(iChunk)); - result.save_data_average(cr, op.string_params[0], - reduced_density_matrix(iChunk, op.qubits, last_op), - op.type, op.save_type); + result.save_data_average(BaseState::creg(), op.string_params[0], + reduced_density_matrix(op.qubits, last_op), op.type, + op.save_type); } template -void State::apply_save_state(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_state(const Operations::Op &op, ExperimentResult &result, bool last_op) { - if (op.qubits.size() != BaseState::num_qubits_) { + if (op.qubits.size() != BaseState::qreg_.num_qubits()) { throw std::invalid_argument(op.name + " was not applied to all qubits." " Only the full state can be saved."); } @@ -911,164 +557,75 @@ void State::apply_save_state(const int_t iChunk, // Default key std::string key = (op.string_params[0] == "_method_") ? 
"density_matrix" : op.string_params[0]; - auto cr = this->creg(BaseState::get_global_shot_index(iChunk)); if (last_op) { - result.save_data_average(cr, key, move_to_matrix(iChunk), + result.save_data_average(BaseState::creg(), key, move_to_matrix(), OpType::save_densmat, save_type); } else { - result.save_data_average(cr, key, copy_to_matrix(iChunk), + result.save_data_average(BaseState::creg(), key, copy_to_matrix(), OpType::save_densmat, save_type); } } template -cmatrix_t State::reduced_density_matrix(const int_t iChunk, - const reg_t &qubits, +cmatrix_t State::reduced_density_matrix(const reg_t &qubits, bool last_op) { cmatrix_t reduced_state; // Check if tracing over all qubits if (qubits.empty()) { reduced_state = cmatrix_t(1, 1); - if (!BaseState::multi_chunk_distribution_) { - reduced_state[0] = BaseState::qregs_[iChunk].trace(); - } else { - std::complex sum = 0.0; - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - sum += BaseState::qregs_[i].trace(); - } -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - reduced_state[0] = sum; - } + reduced_state[0] = BaseState::qreg_.trace(); } else { auto qubits_sorted = qubits; std::sort(qubits_sorted.begin(), qubits_sorted.end()); - if ((qubits.size() == BaseState::num_qubits_) && + if ((qubits.size() == BaseState::qreg_.num_qubits()) && (qubits == qubits_sorted)) { if (last_op) { - reduced_state = move_to_matrix(iChunk); + reduced_state = move_to_matrix(); } else { - reduced_state = copy_to_matrix(iChunk); + reduced_state = copy_to_matrix(); } } else { - reduced_state = - reduced_density_matrix_helper(iChunk, qubits, qubits_sorted); + reduced_state = reduced_density_matrix_helper(qubits, qubits_sorted); } } return reduced_state; } template -cmatrix_t State::reduced_density_matrix_helper( - const int_t iChunkIn, const reg_t &qubits, const reg_t &qubits_sorted) { - if (!BaseState::multi_chunk_distribution_) { - // Get superoperator qubits - const reg_t squbits = BaseState::qregs_[iChunkIn].superop_qubits(qubits); - const reg_t squbits_sorted = - BaseState::qregs_[iChunkIn].superop_qubits(qubits_sorted); - - // Get dimensions - const size_t N = qubits.size(); - const size_t DIM = 1ULL << N; - const int_t VDIM = 1ULL << (2 * N); - const size_t END = 1ULL << (BaseState::qregs_[iChunkIn].num_qubits() - N); - const size_t SHIFT = END + 1; - - // Copy vector to host memory - auto vmat = BaseState::qregs_[iChunkIn].vector(); - cmatrix_t reduced_state(DIM, DIM, false); - { - // Fill matrix with first iteration - const auto inds = QV::indexes(squbits, squbits_sorted, 0); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] = std::move(vmat[inds[i]]); - } - } - // Accumulate with remaning blocks - for (size_t k = 1; k < END; k++) { - const auto inds = QV::indexes(squbits, squbits_sorted, k * SHIFT); - for (int_t i = 0; i < VDIM; ++i) { - reduced_state[i] += complex_t(std::move(vmat[inds[i]])); - } +cmatrix_t +State::reduced_density_matrix_helper(const reg_t &qubits, + const reg_t &qubits_sorted) { + // Get superoperator qubits + const reg_t squbits = BaseState::qreg_.superop_qubits(qubits); + const reg_t squbits_sorted = BaseState::qreg_.superop_qubits(qubits_sorted); + + // Get dimensions + const size_t N = qubits.size(); + const size_t DIM = 1ULL << N; + const int_t VDIM = 1ULL << (2 * N); + const size_t END = 1ULL << (BaseState::qreg_.num_qubits() - N); + const size_t SHIFT = END + 1; + + // Copy vector to host memory + auto vmat = BaseState::qreg_.vector(); + cmatrix_t reduced_state(DIM, DIM, false); + { + // Fill matrix with first 
iteration
+    const auto inds = QV::indexes(squbits, squbits_sorted, 0);
+    for (int_t i = 0; i < VDIM; ++i) {
+      reduced_state[i] = std::move(vmat[inds[i]]);
     }
-    return reduced_state;
-  }
-
-  int_t iChunk;
-  uint_t size = 1ull << (BaseState::chunk_bits_ * 2);
-  uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1;
-  uint_t num_threads = BaseState::qregs_[0].get_omp_threads();
-
-  size_t size_required =
-      (sizeof(std::complex<double>) << (qubits.size() * 2)) +
-      (sizeof(std::complex<double>) << (BaseState::chunk_bits_ * 2)) *
-          BaseState::num_local_chunks_;
-  if ((size_required >> 20) > Utils::get_system_memory_mb()) {
-    throw std::runtime_error(
-        std::string("There is not enough memory to store density matrix"));
   }
-  cmatrix_t reduced_state(1ull << qubits.size(), 1ull << qubits.size(), true);
-
-  if (BaseState::distributed_rank_ == 0) {
-    auto tmp = BaseState::qregs_[0].copy_to_matrix();
-    for (iChunk = 0; iChunk < BaseState::num_global_chunks_; iChunk++) {
-      int_t i;
-      uint_t irow_chunk =
-          (iChunk >> ((BaseState::num_qubits_ - BaseState::chunk_bits_)))
-          << BaseState::chunk_bits_;
-      uint_t icol_chunk =
-          (iChunk &
-           ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) - 1))
-          << BaseState::chunk_bits_;
-
-      if (iChunk < BaseState::num_local_chunks_)
-        tmp = BaseState::qregs_[iChunk].copy_to_matrix();
-#ifdef AER_MPI
-      else
-        BaseState::recv_data(tmp.data(), size, 0, iChunk);
-#endif
-#pragma omp parallel for if (num_threads > 1) num_threads(num_threads)
-      for (i = 0; i < size; i++) {
-        uint_t irow = (i >> (BaseState::chunk_bits_)) + irow_chunk;
-        uint_t icol = (i & mask) + icol_chunk;
-        uint_t irow_out = 0;
-        uint_t icol_out = 0;
-        int j;
-        for (j = 0; j < qubits.size(); j++) {
-          if ((irow >> qubits[j]) & 1) {
-            irow &= ~(1ull << qubits[j]);
-            irow_out += (1ull << j);
-          }
-          if ((icol >> qubits[j]) & 1) {
-            icol &= ~(1ull << qubits[j]);
-            icol_out += (1ull << j);
-          }
-        }
-        if (irow == icol) { // only diagonal base can be reduced
-          uint_t idx = ((irow_out) << qubits.size()) + icol_out;
-#pragma omp critical
-          reduced_state[idx] += tmp[i];
-        }
-      }
-    }
-  } else {
-#ifdef AER_MPI
-    // send matrices to process 0
-    for (iChunk = 0; iChunk < BaseState::num_global_chunks_; iChunk++) {
-      uint_t iProc = BaseState::get_process_by_chunk(iChunk);
-      if (iProc == BaseState::distributed_rank_) {
-        auto tmp = BaseState::qregs_[iChunk - BaseState::global_chunk_index_]
-                       .copy_to_matrix();
-        BaseState::send_data(tmp.data(), size, iChunk, 0);
-      }
+  // Accumulate with remaining blocks
+  for (size_t k = 1; k < END; k++) {
+    const auto inds = QV::indexes(squbits, squbits_sorted, k * SHIFT);
+    for (int_t i = 0; i < VDIM; ++i) {
+      reduced_state[i] += complex_t(std::move(vmat[inds[i]]));
     }
-#endif
   }
-
   return reduced_state;
 }
@@ -1077,40 +634,42 @@ cmatrix_t State<densmat_t>::reduced_density_matrix_helper(
 //=========================================================================

 template <class densmat_t>
-void State<densmat_t>::apply_gate(const int_t iChunk,
-                                  const Operations::Op &op) {
-  if (!BaseState::global_chunk_indexing_) {
+void State<densmat_t>::apply_gate(const Operations::Op &op) {
+  // CPU qubit vector does not handle chunk ID inside kernel, so modify op here
+  if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() &&
+      !BaseState::qreg_.support_global_indexing()) {
     reg_t qubits_in, qubits_out;
     bool ctrl_chunk = true;
     bool ctrl_chunk_sp = true;
-    BaseState::get_inout_ctrl_qubits(op, qubits_out, qubits_in);
+    if (op.name[0] == 'c' || op.name.find("mc") == 0) {
+      Chunk::get_inout_ctrl_qubits(op, BaseState::qreg_.num_qubits(), qubits_in,
+                                   qubits_out);
+ } if (qubits_out.size() > 0) { uint_t mask = 0; for (int i = 0; i < qubits_out.size(); i++) { - mask |= (1ull << (qubits_out[i] - BaseState::chunk_bits_)); + mask |= (1ull << (qubits_out[i] - BaseState::qreg_.num_qubits())); } - if (((BaseState::global_chunk_index_ + iChunk) & mask) != mask) { + if ((BaseState::qreg_.chunk_index() & mask) != mask) { ctrl_chunk = false; } - if ((((BaseState::global_chunk_index_ + iChunk) >> - (BaseState::num_qubits_ - BaseState::chunk_bits_)) & + if (((BaseState::qreg_.chunk_index() >> + (BaseState::num_global_qubits_ - BaseState::qreg_.num_qubits())) & mask) != mask) { ctrl_chunk_sp = false; } if (!ctrl_chunk && !ctrl_chunk_sp) return; // do nothing for this chunk else { - Operations::Op new_op = - BaseState::remake_gate_in_chunk_qubits(op, qubits_in); + Operations::Op new_op = Chunk::correct_gate_op_in_chunk(op, qubits_in); if (ctrl_chunk && ctrl_chunk_sp) - apply_gate(iChunk, - new_op); // apply gate by using op with internal qubits + apply_gate(new_op); // apply gate by using op with internal qubits else if (ctrl_chunk) - apply_gate_statevector(iChunk, new_op); + apply_gate_statevector(new_op); else { for (int i = 0; i < new_op.qubits.size(); i++) - new_op.qubits[i] += BaseState::chunk_bits_; - apply_gate_statevector(iChunk, new_op); + new_op.qubits[i] += BaseState::qreg_.num_qubits(); + apply_gate_statevector(new_op); } return; } @@ -1124,111 +683,106 @@ void State::apply_gate(const int_t iChunk, "DensityMatrixState::invalid gate instruction \'" + op.name + "\'."); switch (it->second) { case Gates::u3: - apply_gate_u3(iChunk, op.qubits[0], std::real(op.params[0]), + apply_gate_u3(op.qubits[0], std::real(op.params[0]), std::real(op.params[1]), std::real(op.params[2])); break; case Gates::u2: - apply_gate_u3(iChunk, op.qubits[0], M_PI / 2., std::real(op.params[0]), + apply_gate_u3(op.qubits[0], M_PI / 2., std::real(op.params[0]), std::real(op.params[1])); break; case Gates::u1: - apply_phase(iChunk, op.qubits[0], - std::exp(complex_t(0., 1.) * op.params[0])); + apply_phase(op.qubits[0], std::exp(complex_t(0., 1.) * op.params[0])); break; case Gates::cx: - BaseState::qregs_[iChunk].apply_cnot(op.qubits[0], op.qubits[1]); + BaseState::qreg_.apply_cnot(op.qubits[0], op.qubits[1]); break; case Gates::cy: - BaseState::qregs_[iChunk].apply_cy(op.qubits[0], op.qubits[1]); + BaseState::qreg_.apply_cy(op.qubits[0], op.qubits[1]); break; case Gates::cz: - BaseState::qregs_[iChunk].apply_cphase(op.qubits[0], op.qubits[1], -1); + BaseState::qreg_.apply_cphase(op.qubits[0], op.qubits[1], -1); break; case Gates::cp: - BaseState::qregs_[iChunk].apply_cphase( - op.qubits[0], op.qubits[1], std::exp(complex_t(0., 1.) * op.params[0])); + BaseState::qreg_.apply_cphase(op.qubits[0], op.qubits[1], + std::exp(complex_t(0., 1.) 
* op.params[0])); break; case Gates::id: break; case Gates::x: - BaseState::qregs_[iChunk].apply_x(op.qubits[0]); + BaseState::qreg_.apply_x(op.qubits[0]); break; case Gates::y: - BaseState::qregs_[iChunk].apply_y(op.qubits[0]); + BaseState::qreg_.apply_y(op.qubits[0]); break; case Gates::z: - apply_phase(iChunk, op.qubits[0], -1); + apply_phase(op.qubits[0], -1); break; case Gates::h: - apply_gate_u3(iChunk, op.qubits[0], M_PI / 2., 0., M_PI); + apply_gate_u3(op.qubits[0], M_PI / 2., 0., M_PI); break; case Gates::s: - apply_phase(iChunk, op.qubits[0], complex_t(0., 1.)); + apply_phase(op.qubits[0], complex_t(0., 1.)); break; case Gates::sdg: - apply_phase(iChunk, op.qubits[0], complex_t(0., -1.)); + apply_phase(op.qubits[0], complex_t(0., -1.)); break; case Gates::sx: - BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, - Linalg::VMatrix::SX); + BaseState::qreg_.apply_unitary_matrix(op.qubits, Linalg::VMatrix::SX); break; case Gates::sxdg: - BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, - Linalg::VMatrix::SXDG); + BaseState::qreg_.apply_unitary_matrix(op.qubits, Linalg::VMatrix::SXDG); break; case Gates::t: { const double isqrt2{1. / std::sqrt(2)}; - apply_phase(iChunk, op.qubits[0], complex_t(isqrt2, isqrt2)); + apply_phase(op.qubits[0], complex_t(isqrt2, isqrt2)); } break; case Gates::tdg: { const double isqrt2{1. / std::sqrt(2)}; - apply_phase(iChunk, op.qubits[0], complex_t(isqrt2, -isqrt2)); + apply_phase(op.qubits[0], complex_t(isqrt2, -isqrt2)); } break; case Gates::swap: { - BaseState::qregs_[iChunk].apply_swap(op.qubits[0], op.qubits[1]); + BaseState::qreg_.apply_swap(op.qubits[0], op.qubits[1]); } break; case Gates::ecr: { - BaseState::qregs_[iChunk].apply_unitary_matrix(op.qubits, - Linalg::VMatrix::ECR); + BaseState::qreg_.apply_unitary_matrix(op.qubits, Linalg::VMatrix::ECR); } break; case Gates::ccx: - BaseState::qregs_[iChunk].apply_toffoli(op.qubits[0], op.qubits[1], - op.qubits[2]); + BaseState::qreg_.apply_toffoli(op.qubits[0], op.qubits[1], op.qubits[2]); break; case Gates::r: - BaseState::qregs_[iChunk].apply_unitary_matrix( + BaseState::qreg_.apply_unitary_matrix( op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); break; case Gates::rx: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::rx(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::rx(op.params[0])); break; case Gates::ry: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::ry(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::ry(op.params[0])); break; case Gates::rz: - apply_diagonal_unitary_matrix(iChunk, op.qubits, + apply_diagonal_unitary_matrix(op.qubits, Linalg::VMatrix::rz_diag(op.params[0])); break; case Gates::rxx: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::rxx(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::rxx(op.params[0])); break; case Gates::ryy: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::ryy(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + Linalg::VMatrix::ryy(op.params[0])); break; case Gates::rzz: - apply_diagonal_unitary_matrix(iChunk, op.qubits, + apply_diagonal_unitary_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); break; case Gates::rzx: - BaseState::qregs_[iChunk].apply_unitary_matrix( - op.qubits, Linalg::VMatrix::rzx(op.params[0])); + BaseState::qreg_.apply_unitary_matrix(op.qubits, + 
Linalg::VMatrix::rzx(op.params[0])); break; case Gates::pauli: - apply_pauli(iChunk, op.qubits, op.string_params[0]); + apply_pauli(op.qubits, op.string_params[0]); break; default: // We shouldn't reach here unless there is a bug in gateset @@ -1238,8 +792,7 @@ void State::apply_gate(const int_t iChunk, } template -void State::apply_gate_statevector(const int_t iChunk, - const Operations::Op &op) { +void State::apply_gate_statevector(const Operations::Op &op) { // Look for gate name in gateset auto it = gateset_.find(op.name); if (it == gateset_.end()) @@ -1248,22 +801,22 @@ void State::apply_gate_statevector(const int_t iChunk, switch (it->second) { case Gates::x: case Gates::cx: - BaseState::qregs_[iChunk].apply_mcx(op.qubits); + BaseState::qreg_.apply_mcx(op.qubits); break; case Gates::u1: - if (op.qubits[op.qubits.size() - 1] < BaseState::chunk_bits_) { - BaseState::qregs_[iChunk].apply_mcphase( + if (op.qubits[op.qubits.size() - 1] < BaseState::qreg_.num_qubits()) { + BaseState::qreg_.apply_mcphase( op.qubits, std::exp(complex_t(0., 1.) * op.params[0])); } else { - BaseState::qregs_[iChunk].apply_mcphase( + BaseState::qreg_.apply_mcphase( op.qubits, std::conj(std::exp(complex_t(0., 1.) * op.params[0]))); } break; case Gates::y: - BaseState::qregs_[iChunk].apply_mcy(op.qubits); + BaseState::qreg_.apply_mcy(op.qubits); break; case Gates::z: - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1); + BaseState::qreg_.apply_mcphase(op.qubits, -1); break; default: // We shouldn't reach here unless there is a bug in gateset @@ -1273,87 +826,84 @@ void State::apply_gate_statevector(const int_t iChunk, } template -void State::apply_matrix(const int_t iChunk, const reg_t &qubits, - const cmatrix_t &mat) { +void State::apply_matrix(const reg_t &qubits, const cmatrix_t &mat) { if (mat.GetRows() == 1) { - apply_diagonal_unitary_matrix(iChunk, qubits, Utils::vectorize_matrix(mat)); + apply_diagonal_unitary_matrix(qubits, Utils::vectorize_matrix(mat)); } else { - BaseState::qregs_[iChunk].apply_unitary_matrix( - qubits, Utils::vectorize_matrix(mat)); + BaseState::qreg_.apply_unitary_matrix(qubits, Utils::vectorize_matrix(mat)); } } template -void State::apply_gate_u3(const int_t iChunk, uint_t qubit, - double theta, double phi, double lambda) { - BaseState::qregs_[iChunk].apply_unitary_matrix( +void State::apply_gate_u3(uint_t qubit, double theta, double phi, + double lambda) { + BaseState::qreg_.apply_unitary_matrix( reg_t({qubit}), Linalg::VMatrix::u3(theta, phi, lambda)); } template -void State::apply_diagonal_unitary_matrix(const int_t iChunk, - const reg_t &qubits, +void State::apply_diagonal_unitary_matrix(const reg_t &qubits, const cvector_t &diag) { - if (BaseState::global_chunk_indexing_ || - !BaseState::multi_chunk_distribution_) { - // GPU computes all chunks in one kernel, so pass qubits and diagonal matrix - // as is - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits, diag); - } else { + if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() && + !BaseState::qreg_.support_global_indexing()) { reg_t qubits_in = qubits; reg_t qubits_row = qubits; cvector_t diag_in = diag; cvector_t diag_row = diag; - BaseState::block_diagonal_matrix(iChunk, qubits_in, diag_in); + Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(), + BaseState::qreg_.num_qubits(), qubits_in, + diag_in); if (qubits_in.size() == qubits.size()) { - BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits, diag); + BaseState::qreg_.apply_diagonal_unitary_matrix(qubits, diag); } else { 
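      // In the superoperator picture a diagonal D acts as rho -> D rho D^dagger,
      // so the column (ket) qubits take diag_in while the row (bra) qubits take
      // its complex conjugate; tensor_product(conjugate(diag_row), diag_in)
      // below assembles that combined diagonal over the doubled qubit indices.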
for (int_t i = 0; i < qubits.size(); i++) { - if (qubits[i] >= BaseState::chunk_bits_) - qubits_row[i] = - qubits[i] + BaseState::num_qubits_ - BaseState::chunk_bits_; + if (qubits[i] >= BaseState::qreg_.num_qubits()) + qubits_row[i] = qubits[i] + BaseState::num_global_qubits_ - + BaseState::qreg_.num_qubits(); } - BaseState::block_diagonal_matrix(iChunk, qubits_row, diag_row); + Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(), + BaseState::qreg_.num_qubits(), qubits_row, + diag_row); reg_t qubits_chunk(qubits_in.size() * 2); for (int_t i = 0; i < qubits_in.size(); i++) { qubits_chunk[i] = qubits_in[i]; qubits_chunk[i + qubits_in.size()] = - qubits_in[i] + BaseState::chunk_bits_; + qubits_in[i] + BaseState::qreg_.num_qubits(); } - BaseState::qregs_[iChunk].apply_diagonal_matrix( + BaseState::qreg_.apply_diagonal_matrix( qubits_chunk, AER::Utils::tensor_product(AER::Utils::conjugate(diag_row), diag_in)); } + } else { + BaseState::qreg_.apply_diagonal_unitary_matrix(qubits, diag); } } template -void State::apply_phase(const int_t iChunk, const uint_t qubit, - const complex_t phase) { +void State::apply_phase(const uint_t qubit, const complex_t phase) { cvector_t diag(2); diag[0] = 1.0; diag[1] = phase; - apply_diagonal_unitary_matrix(iChunk, reg_t({qubit}), diag); + apply_diagonal_unitary_matrix(reg_t({qubit}), diag); } template -void State::apply_phase(const int_t iChunk, const reg_t &qubits, - const complex_t phase) { +void State::apply_phase(const reg_t &qubits, const complex_t phase) { cvector_t diag((1 << qubits.size()), 1.0); diag[(1 << qubits.size()) - 1] = phase; - apply_diagonal_unitary_matrix(iChunk, qubits, diag); + apply_diagonal_unitary_matrix(qubits, diag); } template -void State::apply_pauli(const int_t iChunk, const reg_t &qubits, +void State::apply_pauli(const reg_t &qubits, const std::string &pauli) { // Pauli as a superoperator is (-1)^num_y P\otimes P complex_t coeff = (std::count(pauli.begin(), pauli.end(), 'Y') % 2) ? 
-1 : 1; - BaseState::qregs_[iChunk].apply_pauli( - BaseState::qregs_[iChunk].superop_qubits(qubits), pauli + pauli, coeff); + BaseState::qreg_.apply_pauli(BaseState::qreg_.superop_qubits(qubits), + pauli + pauli, coeff); } //========================================================================= @@ -1361,171 +911,38 @@ void State::apply_pauli(const int_t iChunk, const reg_t &qubits, //========================================================================= template -void State::apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, +void State::apply_measure(const reg_t &qubits, const reg_t &cmemory, const reg_t &cregister, RngEngine &rng) { - int_t ishot = BaseState::get_global_shot_index(iChunk); // Actual measurement outcome - const auto meas = sample_measure_with_prob(iChunk, qubits, rng); + const auto meas = sample_measure_with_prob(qubits, rng); // Implement measurement update - measure_reset_update(iChunk, qubits, meas.first, meas.first, meas.second); + measure_reset_update(qubits, meas.first, meas.first, meas.second); const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); - BaseState::cregs_[ishot].store_measure(outcome, cmemory, cregister); + BaseState::creg().store_measure(outcome, cmemory, cregister); } template -rvector_t State::measure_probs(const int_t iChunk, - const reg_t &qubits) const { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].probabilities(qubits); - - uint_t dim = 1ull << qubits.size(); - rvector_t sum(dim, 0.0); - int_t i, j, k; - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - for (i = 0; i < qubits.size(); i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - } else { - qubits_out_chunk.push_back(qubits[i]); - } - } - - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(i, j, k) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - if (irow == icol) { // diagonal chunk - if (qubits_in_chunk.size() > 0) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { -#pragma omp atomic - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < (BaseState::chunk_bits_)) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } -#pragma omp atomic - sum[idx] += chunkSum[j]; - } - } - } else { // there is no bit in chunk - auto tr = std::real(BaseState::qregs_[i].trace()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } -#pragma omp atomic - sum[idx] += tr; - } - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = 
(BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - if (irow == icol) { // diagonal chunk - if (qubits_in_chunk.size() > 0) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < (BaseState::chunk_bits_)) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } - sum[idx] += chunkSum[j]; - } - } - } else { // there is no bit in chunk - auto tr = std::real(BaseState::qregs_[i].trace()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } - sum[idx] += tr; - } - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - - return sum; +rvector_t State::measure_probs(const reg_t &qubits) const { + return BaseState::qreg_.probabilities(qubits); } template -void State::apply_reset(const int_t iChunk, const reg_t &qubits) { - BaseState::qregs_[iChunk].apply_reset(qubits); +void State::apply_reset(const reg_t &qubits) { + BaseState::qreg_.apply_reset(qubits); } template -std::pair State::sample_measure_with_prob( - const int_t iChunk, const reg_t &qubits, RngEngine &rng) { - rvector_t probs = measure_probs(iChunk, qubits); +std::pair +State::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); // Randomly pick outcome and return pair uint_t outcome = rng.rand_int(probs); return std::make_pair(outcome, probs[outcome]); } template -void State::measure_reset_update(const int_t iChunk, - const reg_t &qubits, +void State::measure_reset_update(const reg_t &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob) { @@ -1536,44 +953,11 @@ void State::measure_reset_update(const int_t iChunk, // Diagonal matrix for projecting and renormalizing to measurement outcome cvector_t mdiag(2, 0.); mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } + apply_diagonal_unitary_matrix(qubits, mdiag); // If it doesn't agree with the reset state update if (final_state != meas_state) { - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_x(qubits[0]); - else { - if (qubits[0] < BaseState::chunk_bits_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - BaseState::qregs_[i].apply_x(qubits[0]); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].apply_x(qubits[0]); - } - } else { - BaseState::apply_chunk_x(qubits[0]); - BaseState::apply_chunk_x(qubits[0] + BaseState::chunk_bits_); - } - } + BaseState::qreg_.apply_x(qubits[0]); } } // Multi qubit case @@ -1582,21 +966,7 @@ void State::measure_reset_update(const int_t iChunk, const size_t dim = 1ULL << qubits.size(); cvector_t mdiag(dim, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_diagonal_unitary_matrix(i, qubits, mdiag); - } - } + apply_diagonal_unitary_matrix(qubits, mdiag); // If it doesn't agree with the reset state update // TODO This function could be optimized as a permutation update @@ -1610,41 +980,7 @@ void State::measure_reset_update(const int_t iChunk, perm[j * dim + j] = 1.; } // apply permutation to swap state - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_unitary_matrix(qubits, perm); - else { - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - for (int_t i = 0; i < qubits.size(); i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - } else { - qubits_out_chunk.push_back(qubits[i]); - } - } - if (qubits_in_chunk.size() > 0) { // in chunk exchange - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - BaseState::qregs_[i].apply_unitary_matrix(qubits, perm); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].apply_unitary_matrix(qubits, perm); - } - } - if (qubits_out_chunk.size() > 0) { // out of chunk exchange - for (int_t i = 0; i < qubits_out_chunk.size(); i++) { - BaseState::apply_chunk_x(qubits_out_chunk[i]); - 
BaseState::apply_chunk_x( - qubits_out_chunk[i] + - (BaseState::num_qubits_ - BaseState::chunk_bits_)); - } - } - } + BaseState::qreg_.apply_unitary_matrix(qubits, perm); } } } @@ -1660,115 +996,13 @@ std::vector State::sample_measure(const reg_t &qubits, rnds.push_back(rng.rand(0, 1)); reg_t allbit_samples(shots, 0); - if (!BaseState::multi_chunk_distribution_) - allbit_samples = BaseState::qregs_[0].sample_measure(rnds); - else { - int_t i, j; - std::vector chunkSum(BaseState::qregs_.size() + 1, 0); - double sum, localSum; - // calculate per chunk sum - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for private(i) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow == icol) // only diagonal chunk has probabilities - chunkSum[i] = std::real(BaseState::qregs_[i].trace()); - else - chunkSum[i] = 0.0; - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow == icol) // only diagonal chunk has probabilities - chunkSum[i] = std::real(BaseState::qregs_[i].trace()); - else - chunkSum[i] = 0.0; - } - } - localSum = 0.0; - for (i = 0; i < BaseState::qregs_.size(); i++) { - sum = localSum; - localSum += chunkSum[i]; - chunkSum[i] = sum; - } - chunkSum[BaseState::qregs_.size()] = localSum; - - double globalSum = 0.0; - if (BaseState::nprocs_ > 1) { - std::vector procTotal(BaseState::nprocs_); - - for (i = 0; i < BaseState::nprocs_; i++) { - procTotal[i] = localSum; - } - BaseState::gather_value(procTotal); - - for (i = 0; i < BaseState::myrank_; i++) { - globalSum += procTotal[i]; - } - } - - reg_t local_samples(shots, 0); - - // get rnds positions for each chunk - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t irow, icol; - irow = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if (irow != icol) - continue; - - uint_t nIn; - std::vector vIdx; - std::vector vRnd; - - // find rnds in this chunk - nIn = 0; - for (j = 0; j < shots; j++) { - if (rnds[j] >= chunkSum[i] + globalSum && - rnds[j] < chunkSum[i + 1] + globalSum) { - vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); - vIdx.push_back(j); - nIn++; - } - } - - if (nIn > 0) { - auto chunkSamples = BaseState::qregs_[i].sample_measure(vRnd); - uint_t ir; - ir = (BaseState::global_chunk_index_ + i) >> - ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - - for (j = 0; j < chunkSamples.size(); j++) { - local_samples[vIdx[j]] = - (ir << BaseState::chunk_bits_) + chunkSamples[j]; - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(local_samples); -#endif - allbit_samples = local_samples; - } + allbit_samples = BaseState::qreg_.sample_measure(rnds); // Convert to reg_t format std::vector all_samples; all_samples.reserve(shots); for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, 
BaseState::num_qubits_); + reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits()); reg_t sample; sample.reserve(qubits.size()); for (uint_t qubit : qubits) { @@ -1784,73 +1018,12 @@ std::vector State::sample_measure(const reg_t &qubits, //========================================================================= template -void State::apply_kraus(const int_t iChunk, const reg_t &qubits, +void State::apply_kraus(const reg_t &qubits, const std::vector &kmats) { - BaseState::qregs_[iChunk].apply_superop_matrix( + BaseState::qreg_.apply_superop_matrix( qubits, Utils::vectorize_matrix(Utils::kraus_superop(kmats))); } -//----------------------------------------------------------------------- -// Functions for multi-chunk distribution -//----------------------------------------------------------------------- -// swap between chunks -template -void State::apply_chunk_swap(const reg_t &qubits) { - uint_t q0, q1; - q0 = qubits[0]; - q1 = qubits[1]; - - std::swap(BaseState::qubit_map_[q0], BaseState::qubit_map_[q1]); - - if (qubits[0] >= BaseState::chunk_bits_) { - q0 += BaseState::chunk_bits_; - } - if (qubits[1] >= BaseState::chunk_bits_) { - q1 += BaseState::chunk_bits_; - } - reg_t qs0 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs0); - - if (qubits[0] >= BaseState::chunk_bits_) { - q0 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } else { - q0 += BaseState::chunk_bits_; - } - if (qubits[1] >= BaseState::chunk_bits_) { - q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_); - } else { - q1 += BaseState::chunk_bits_; - } - reg_t qs1 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs1); -} - -template -void State::apply_multi_chunk_swap(const reg_t &qubits) { - reg_t qubits_density; - - for (int_t i = 0; i < qubits.size(); i += 2) { - uint_t q0, q1; - q0 = qubits[i * 2]; - q1 = qubits[i * 2 + 1]; - - std::swap(BaseState::qubit_map_[q0], BaseState::qubit_map_[q1]); - - if (q1 >= BaseState::chunk_bits_) { - q1 += BaseState::chunk_bits_; - } - qubits_density.push_back(q0); - qubits_density.push_back(q1); - - q0 += BaseState::chunk_bits_; - if (q1 >= BaseState::chunk_bits_) { - q1 += (BaseState::num_qubits_ - BaseState::chunk_bits_ * 2); - } - } - - BaseState::apply_multi_chunk_swap(qubits_density); -} - //------------------------------------------------------------------------- } // end namespace DensityMatrix //------------------------------------------------------------------------- diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp old mode 100644 new mode 100755 index cdb8b67acb..73f8dca3cf --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -61,6 +61,11 @@ class DensityMatrixThrust : public UnitaryMatrixThrust { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const DensityMatrixThrust &obj) { + BaseMatrix::initialize(obj); + } + // Initializes the vector to a custom initial state. 
// The vector can be either a statevector or a vectorized density matrix
// If the length of the data vector does not match either case for the
@@ -1275,12 +1280,13 @@ reg_t DensityMatrixThrust::sample_measure( const std::vector &rnds) const {
   uint_t count = 1;
   if (!BaseVector::multi_chunk_distribution_) {
-    if (BaseVector::enable_batch_ && BaseVector::chunk_.pos() != 0) {
-      return reg_t(); // first chunk execute all in batch
+    if (BaseVector::enable_batch_) {
+      if (BaseVector::chunk_.pos() != 0)
+        return reg_t(); // first chunk executes all in batch
+      else
+        count = BaseVector::chunk_.container()->num_chunks();
     }
-    count = BaseVector::chunk_.container()->num_chunks();
   }
-
   uint_t nrows = BaseMatrix::num_rows();
 #ifdef AER_DEBUG
diff --git a/src/simulators/extended_stabilizer/ch_runner.hpp b/src/simulators/extended_stabilizer/ch_runner.hpp index 78eb03a81c..489d6b77ad 100644 --- a/src/simulators/extended_stabilizer/ch_runner.hpp +++ b/src/simulators/extended_stabilizer/ch_runner.hpp @@ -77,6 +77,10 @@ class Runner {
   virtual ~Runner() = default;
   void initialize(uint_t n_qubits);
+  // initialize from existing state (copy)
+  void initialize(const Runner &obj) {
+  } // implement this if the extended stabilizer will support shot-branching
+
   void initialize_omp(uint_t n_threads, uint_t threshold_rank);
   bool empty() const { return (n_qubits_ == 0 || num_states_ == 0); }
diff --git a/src/simulators/multi_state_executor.hpp b/src/simulators/multi_state_executor.hpp new file mode 100644 index 0000000000..2d0da87e4a --- /dev/null +++ b/src/simulators/multi_state_executor.hpp @@ -0,0 +1,815 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals. 
+ */
+
+#ifndef _multi_state_executor_hpp_
+#define _multi_state_executor_hpp_
+
+#include "simulators/circuit_executor.hpp"
+
+#ifdef _OPENMP
+#include
+#endif
+
+#ifdef AER_MPI
+#include
+#endif
+
+#include "simulators/shot_branching.hpp"
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+//-------------------------------------------------------------------------
+// Multiple-shots executor class implementation
+//-------------------------------------------------------------------------
+template
+class MultiStateExecutor : public Executor {
+  using Base = Executor;
+
+protected:
+  std::vector states_;
+  std::vector cregs_; // classical registers for all shots
+
+  // number of qubits for the circuit
+  uint_t num_qubits_;
+
+  uint_t num_global_states_; // number of total shots
+  uint_t num_local_states_;  // number of local shots
+
+  uint_t global_state_index_; // beginning state index for this process
+  reg_t state_index_begin_;   // beginning state index for each process
+  reg_t state_index_end_;     // ending state index for each process
+  uint_t num_active_states_;  // number of active shots in current loop
+
+  bool shot_omp_parallel_; // whether to process the loop over states with
+                           // thread parallelism
+
+  bool set_parallelization_called_ =
+      false; // true if set_parallelization has already been called;
+             // if so, that call sets max_batched_shots_
+  uint_t num_max_shots_ =
+      1; // max number of shots that can be stored in available memory
+
+  int max_matrix_qubits_; // max qubits for matrix
+
+  // shot branching
+  bool shot_branching_enable_ = true;
+  bool shot_branching_sampling_enable_ = false;
+
+  // group of states (GPU devices)
+  uint_t num_groups_; // number of groups of states
+  reg_t top_state_of_group_;
+  reg_t num_states_in_group_;
+  int num_threads_per_group_; // number of outer threads per group
+
+  uint_t num_creg_memory_ =
+      0; // total number of creg bits (reserved for multi-shots)
+  uint_t num_creg_registers_ = 0;
+
+  // OpenMP qubit threshold
+  int omp_qubit_threshold_ = 14;
+
+  // Threshold for chopping small values to zero in JSON
+  double json_chop_threshold_ = 1e-10;
+
+  // Set a global phase exp(1j * theta) for the state
+  bool has_global_phase_ = false;
+  complex_t global_phase_ = 1;
+
+  // number of threads for inner loop of shot-branching
+  int_t shot_branch_parallel_ = 1;
+
+public:
+  MultiStateExecutor();
+  virtual ~MultiStateExecutor();
+
+  size_t required_memory_mb(const Circuit &circuit,
+                            const Noise::NoiseModel &noise) const override {
+    state_t tmp;
+    return tmp.required_memory_mb(circuit.num_qubits, circuit.ops);
+  }
+
+  uint_t get_process_by_chunk(uint_t cid);
+
+protected:
+  void set_config(const Config &config) override;
+
+  // distribute states on processes
+  void set_distribution(uint_t num_states);
+
+  virtual uint_t qubit_scale(void) { return 1; }
+
+  virtual bool allocate_states(uint_t num_shots, const Config &config);
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  void run_circuit_with_shot_branching(
+      uint_t top_state, uint_t num_states, Circuit &circ,
+      const Noise::NoiseModel &noise, const Config &config, RngEngine &init_rng,
+      uint_t ishot, uint_t nshots, ExperimentResult &result, bool sample_noise);
+
+  // apply op for shot-branching; return false if the op is not applied in the
+  // sub-class
+  virtual bool apply_branching_op(Branch &root, const Operations::Op &op,
+                                  ExperimentResult &result, bool final_op) {
+    
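// default implementation: a sub-class that supports shot-branching
+    // overrides this and returns true when it consumes the op; returning
+    // false makes the caller fall back to the ordinary state_t::apply_op()
+    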
std::cout << " base is called, implement for each method" << std::endl;
+    return false;
+  }
+
+  // Apply the global phase
+  virtual void apply_global_phase() {}
+  void set_global_phase(double theta);
+
+  void set_parallelization(const Circuit &circ,
+                           const Noise::NoiseModel &noise) override;
+
+  virtual bool shot_branching_supported(void) {
+    return false; // return true in the sub-class if it supports shot-branching
+  }
+
+  template
+  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
+                       uint_t shots, Branch &branch, ExperimentResult &result,
+                       std::vector &rng);
+
+  // sampling measure
+  virtual std::vector sample_measure(state_t &state, const reg_t &qubits,
+                                     uint_t shots,
+                                     std::vector &rng) const {
+    // this is for a single rng; implement in the sub-class for the
+    // multi-shots case
+    return state.sample_measure(qubits, shots, rng[0]);
+  }
+};
+
+template
+MultiStateExecutor::MultiStateExecutor() {
+  num_global_states_ = 0;
+  num_local_states_ = 0;
+
+  shot_omp_parallel_ = false;
+
+  shot_branching_enable_ = false;
+}
+
+template
+MultiStateExecutor::~MultiStateExecutor() {
+  states_.clear();
+  cregs_.clear();
+}
+
+template
+void MultiStateExecutor::set_config(const Config &config) {
+  Base::set_config(config);
+
+  // Set threshold for truncating states to be saved
+  json_chop_threshold_ = config.zero_threshold;
+
+  // Set OMP threshold for state update functions
+  omp_qubit_threshold_ = config.statevector_parallel_threshold;
+
+  // shot branching optimization
+  shot_branching_enable_ = config.shot_branching_enable;
+  shot_branching_sampling_enable_ = config.shot_branching_sampling_enable;
+
+  if (config.num_threads_per_device.has_value())
+    num_threads_per_group_ = config.num_threads_per_device.value();
+}
+
+template
+void MultiStateExecutor::set_global_phase(double theta) {
+  if (Linalg::almost_equal(theta, 0.0)) {
+    has_global_phase_ = false;
+    global_phase_ = 1;
+  } else {
+    has_global_phase_ = true;
+    global_phase_ = std::exp(complex_t(0.0, theta));
+  }
+}
+
+template
+void MultiStateExecutor::set_distribution(uint_t num_states) {
+
+  num_global_states_ = num_states;
+
+  state_index_begin_.resize(Base::distributed_procs_);
+  state_index_end_.resize(Base::distributed_procs_);
+  for (int_t i = 0; i < Base::distributed_procs_; i++) {
+    state_index_begin_[i] = num_global_states_ * i / Base::distributed_procs_;
+    state_index_end_[i] =
+        num_global_states_ * (i + 1) / Base::distributed_procs_;
+  }
+
+  num_local_states_ = state_index_end_[Base::distributed_rank_] -
+                      state_index_begin_[Base::distributed_rank_];
+  global_state_index_ = state_index_begin_[Base::distributed_rank_];
+}
+
+template
+void MultiStateExecutor::set_parallelization(
+    const Circuit &circ, const Noise::NoiseModel &noise) {
+  Base::set_parallelization(circ, noise);
+}
+
+template
+bool MultiStateExecutor::allocate_states(uint_t num_shots,
+                                         const Config &config) {
+  int_t i;
+  bool ret = true;
+
+  states_.resize(num_shots);
+
+  num_active_states_ = num_shots;
+
+  // initialize groups
+  top_state_of_group_.resize(1);
+  num_states_in_group_.resize(1);
+  num_groups_ = 1;
+  top_state_of_group_[0] = 0;
+  num_states_in_group_[0] = num_shots;
+
+  for (i = 0; i < num_shots; i++) {
+    states_[i].set_config(config);
+    states_[i].set_num_global_qubits(num_qubits_);
+  }
+
+  return ret;
+}
+
+template
+void MultiStateExecutor::run_circuit_shots(
+    Circuit &circ, const Noise::NoiseModel &noise, const Config &config,
+    RngEngine &init_rng, ExperimentResult &result, bool sample_noise) {
+  num_qubits_ = circ.num_qubits;
+  
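// keep the circuit's creg sizes; they are used below when initializing
+  // classical registers for the states used in shot-branching
+  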
num_creg_memory_ = circ.num_memory; + num_creg_registers_ = circ.num_registers; + + if (this->sim_device_ == Device::GPU) { +#ifdef _OPENMP + if (omp_get_num_threads() == 1) + shot_omp_parallel_ = true; +#endif + } else if (this->sim_device_ == Device::ThrustCPU) { + shot_omp_parallel_ = false; + } + + set_distribution(circ.shots); + num_max_shots_ = Base::get_max_parallel_shots(circ, noise); + + bool shot_branching = false; + if (shot_branching_enable_ && num_local_states_ > 1 && + shot_branching_supported() && num_max_shots_ > 1) { + shot_branching = true; + } else + shot_branching = false; + + if (!shot_branching) { + return Base::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } + // disable cuStateVec if shot-branching is enabled +#ifdef AER_CUSTATEVEC + if (Base::cuStateVec_enable_) + Base::cuStateVec_enable_ = false; +#endif + + Noise::NoiseModel dummy_noise; + state_t dummy_state; + + Circuit circ_opt; + if (sample_noise) { + RngEngine dummy_rng; + circ_opt = noise.sample_noise(circ, dummy_rng, + Noise::NoiseModel::Method::circuit, true); + auto fusion_pass = Base::transpile_fusion(circ_opt.opset(), config); + fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(), + result); + max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt); + } else { + auto fusion_pass = Base::transpile_fusion(circ.opset(), config); + fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(), + result); + max_matrix_qubits_ = Base::get_max_matrix_qubits(circ); + } + +#ifdef AER_MPI + // if shots are distributed to MPI processes, allocate cregs to be gathered + if (Base::num_process_per_experiment_ > 1) + cregs_.resize(circ.shots); +#endif + + // reserve states + allocate_states(num_max_shots_, config); + + int_t par_shots; + if (Base::sim_device_ == Device::GPU) { + par_shots = num_groups_; + } else { + par_shots = + std::min((int_t)Base::parallel_shots_, (int_t)num_local_states_); + } + shot_branch_parallel_ = Base::parallel_shots_ / par_shots; + std::vector par_results(par_shots); + + auto parallel_shot_branching = [this, &par_results, par_shots, &circ, + &circ_opt, noise, config, &init_rng, + sample_noise](int_t i) { + // shot distribution + uint_t ishot = i * num_local_states_ / par_shots; + uint_t nshots = (i + 1) * num_local_states_ / par_shots; + nshots -= ishot; + + // state distribution + uint_t istate, nstates; + if (Base::sim_device_ == Device::GPU) { + istate = top_state_of_group_[i]; + nstates = num_states_in_group_[i]; + } else { + istate = i * num_active_states_ / par_shots; + nstates = (i + 1) * num_active_states_ / par_shots; + nstates -= istate; + } + + if (nshots > 0) { + if (sample_noise) { + run_circuit_with_shot_branching(istate, nstates, circ_opt, noise, + config, init_rng, ishot, nshots, + par_results[i], sample_noise); + } else { + run_circuit_with_shot_branching(istate, nstates, circ, noise, config, + init_rng, ishot, nshots, par_results[i], + sample_noise); + } + } + }; + Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, + parallel_shot_branching, par_shots); + + // gather cregs on MPI processes and save to result +#ifdef AER_MPI + if (Base::num_process_per_experiment_ > 1) { + Base::gather_creg_memory(cregs_, state_index_begin_); + + // save cregs to result + auto save_cregs = [this, &par_results, par_shots](int_t i) { + uint_t i_shot, shot_end; + i_shot = num_global_states_ * i / par_shots; + shot_end = num_global_states_ * (i + 1) / par_shots; + + for (; i_shot < shot_end; i_shot++) { + if 
(cregs_[i_shot].memory_size() > 0) {
+          std::string memory_hex = cregs_[i_shot].memory_hex();
+          par_results[i].data.add_accum(static_cast(1ULL), "counts",
+                                        memory_hex);
+          if (Base::save_creg_memory_) {
+            par_results[i].data.add_list(std::move(memory_hex), "memory");
+          }
+        }
+      }
+    };
+    Utils::apply_omp_parallel_for((par_shots > 1), 0, par_shots, save_cregs,
+                                  par_shots);
+    cregs_.clear();
+  }
+#endif
+
+  for (auto &res : par_results) {
+    result.combine(std::move(res));
+  }
+
+  result.metadata.add(true, "shot_branching_enabled");
+}
+
+template
+void MultiStateExecutor::run_circuit_with_shot_branching(
+    uint_t top_state, uint_t num_states, Circuit &circ,
+    const Noise::NoiseModel &noise, const Config &config, RngEngine &init_rng,
+    uint_t ishot, uint_t nshots, ExperimentResult &result, bool sample_noise) {
+  std::vector> branches;
+  OpItr first;
+  OpItr last;
+
+  first = circ.ops.cbegin();
+  last = circ.ops.cend();
+
+  // check if there is a sequence of measures at the end of operations
+  bool can_sample = false;
+  OpItr measure_seq = last;
+  OpItr it = last - 1;
+  int_t num_measure = 0;
+
+  if (shot_branching_sampling_enable_) {
+    do {
+      if (it->type != Operations::OpType::measure) {
+        measure_seq = it + 1;
+        break;
+      }
+      num_measure += it->qubits.size();
+      it--;
+    } while (it != first);
+
+    if (num_measure >= num_qubits_ && measure_seq != last) {
+      can_sample = true;
+    } else {
+      measure_seq = last;
+    }
+  }
+
+  int_t par_shots = std::min(shot_branch_parallel_, (int_t)num_states);
+  if (par_shots == 0)
+    par_shots = 1;
+
+  // initialize local shots
+  std::vector shots_storage(nshots);
+  if (global_state_index_ + ishot == 0)
+    shots_storage[0] = init_rng;
+  else
+    shots_storage[0].set_seed(circ.seed + global_state_index_ + ishot);
+  if (par_shots > 1) {
+#pragma omp parallel for num_threads(par_shots)
+    for (int_t i = 1; i < nshots; i++)
+      shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+  } else {
+    for (int_t i = 1; i < nshots; i++)
+      shots_storage[i].set_seed(circ.seed + global_state_index_ + ishot + i);
+  }
+
+  std::vector par_results(par_shots);
+
+  uint_t num_shots_saved = 0;
+
+  // loop until all local shots are simulated
+  while (shots_storage.size() > 0) {
+    uint_t num_active_states = 1;
+
+    // initial state
+    branches.push_back(std::make_shared());
+    branches[0]->state_index() = top_state;
+    branches[0]->set_shots(shots_storage);
+    branches[0]->op_iterator() = first;
+    branches[0]->shot_index() =
+        global_state_index_ + nshots - shots_storage.size();
+    shots_storage.clear();
+
+    // initialize initial state
+    states_[top_state].set_parallelization(this->parallel_state_update_);
+    states_[top_state].set_global_phase(circ.global_phase_angle);
+    states_[top_state].enable_density_matrix(!Base::has_statevector_ops_);
+    states_[top_state].initialize_qreg(num_qubits_);
+    states_[top_state].initialize_creg(num_creg_memory_, num_creg_registers_);
+
+    while (num_active_states > 0) { // loop until all branches execute all ops
+      // functor for ops execution
+      auto apply_ops_func = [this, &branches, &noise, &par_results, measure_seq,
+                             par_shots, num_active_states](int_t i) {
+        uint_t istate, state_end;
+        istate = branches.size() * i / par_shots;
+        state_end = branches.size() * (i + 1) / par_shots;
+        uint_t nbranch = 0;
+        RngEngine dummy_rng;
+
+        for (; istate < state_end; istate++) {
+          while (branches[istate]->op_iterator() != measure_seq ||
+                 branches[istate]->additional_ops().size() > 0) {
+            // execute additional ops first if available
+            if 
(branches[istate]->additional_ops().size() > 0) { + int_t iadd = 0; + int_t num_add = branches[istate]->additional_ops().size(); + while (iadd < num_add) { + if (apply_branching_op(*branches[istate], + branches[istate]->additional_ops()[iadd], + par_results[i], false)) { + // check if there are new branches + if (branches[istate]->num_branches() > 0) { + // if there are additional ops remaining, queue them on new + // branches + for (int_t k = iadd + 1; + k < branches[istate]->additional_ops().size(); k++) { + for (int_t l = 0; l < branches[istate]->num_branches(); + l++) + branches[istate]->branches()[l]->add_op_after_branch( + branches[istate]->additional_ops()[k]); + } + branches[istate]->remove_empty_branches(); + states_[branches[istate]->state_index()].creg() = + branches[istate]->creg(); + // if there are some branches still remaining + if (branches[istate]->num_branches() > 0) { + nbranch += branches[istate]->num_branches(); + break; + } + iadd = 0; + num_add = branches[istate]->additional_ops().size(); + } + } else { + states_[branches[istate]->state_index()].apply_op( + branches[istate]->additional_ops()[iadd], par_results[i], + dummy_rng, false); + } + iadd++; + } + branches[istate]->clear_additional_ops(); + // if there are some branches still remaining + if (branches[istate]->num_branches() > 0) { + nbranch += branches[istate]->num_branches(); + break; + } + } + // then execute ops + if (branches[istate]->op_iterator() != measure_seq) { + if (!branches[istate]->apply_control_flow( + states_[branches[istate]->state_index()].creg(), + measure_seq)) { + if (!branches[istate]->apply_runtime_noise_sampling( + states_[branches[istate]->state_index()].creg(), + *branches[istate]->op_iterator(), noise)) { + if (!apply_branching_op(*branches[istate], + *branches[istate]->op_iterator(), + par_results[i], true)) { + states_[branches[istate]->state_index()].apply_op( + *branches[istate]->op_iterator(), par_results[i], + dummy_rng, true); + } + } + branches[istate]->advance_iterator(); + if (branches[istate]->num_branches() > 0) { + branches[istate]->remove_empty_branches(); + states_[branches[istate]->state_index()].creg() = + branches[istate]->creg(); + + // if there are some branches still remaining + if (branches[istate]->num_branches() > 0) { + nbranch += branches[istate]->num_branches(); + break; + } + } + } + } + } + } + return nbranch; + }; + + // apply ops until some branch operations are executed in some branches + uint_t nbranch = Utils::apply_omp_parallel_for_reduction_int( + (par_shots > 1 && branches.size() > 1 && shot_omp_parallel_), 0, + par_shots, apply_ops_func, par_shots); + + // repeat until new branch is available + if (nbranch > 0) { + uint_t num_states_prev = branches.size(); + for (int_t i = 0; i < num_states_prev; i++) { + // add new branches + if (branches[i]->num_branches() > 0) { + for (int_t j = 0; j < branches[i]->num_branches(); j++) { + if (branches[i]->branches()[j]->num_shots() > 0) { + // add new branched state + uint_t pos = branches.size(); + if (pos >= num_states) { // if there is not enough memory to + // allocate copied state, shots are + // reserved to the next iteration + // reset seed to reproduce same results + for (int_t k = 0; k < branches[i]->branches()[j]->num_shots(); + k++) { + branches[i]->branches()[j]->rng_shots()[k].set_seed( + branches[i] + ->branches()[j] + ->rng_shots()[k] + .initial_seed()); + } + shots_storage.insert( + shots_storage.end(), + branches[i]->branches()[j]->rng_shots().begin(), + 
branches[i]->branches()[j]->rng_shots().end()); + } else { + branches.push_back(branches[i]->branches()[j]); + branches[pos]->state_index() = top_state + pos; + branches[pos]->root_state_index() = + branches[i]->state_index(); + } + } else { + branches[i]->branches()[j].reset(); + } + } + branches[i]->clear_branch(); + } + } + + // copy state to new branch + uint_t num_new_branches = branches.size() - num_states_prev; + auto copy_branch_func = [this, &branches, par_shots, circ, + num_new_branches, num_states_prev](int_t i) { + uint_t pos, pos_end; + pos = num_states_prev + num_new_branches * i / par_shots; + pos_end = num_states_prev + num_new_branches * (i + 1) / par_shots; + for (; pos < pos_end; pos++) { + uint_t istate = branches[pos]->state_index(); + states_[istate].set_parallelization(this->parallel_state_update_); + states_[istate].set_global_phase(circ.global_phase_angle); + states_[istate].enable_density_matrix(!Base::has_statevector_ops_); + states_[istate].qreg().initialize( + states_[branches[pos]->root_state_index()].qreg()); + states_[istate].creg() = branches[pos]->creg(); + } + }; + Utils::apply_omp_parallel_for( + (par_shots > 1 && num_new_branches > 1 && shot_omp_parallel_), 0, + par_shots, copy_branch_func, par_shots); + } + + // check if there are remaining ops + num_active_states = 0; + for (int_t i = 0; i < branches.size(); i++) { + if (branches[i]->op_iterator() != measure_seq || + branches[i]->additional_ops().size() > 0) + num_active_states++; + } + } + + if (can_sample) { + // apply sampling measure for each branch + auto sampling_measure_func = [this, &branches, &par_results, measure_seq, + last, par_shots](int_t i) { + uint_t istate, state_end; + istate = branches.size() * i / par_shots; + state_end = branches.size() * (i + 1) / par_shots; + + for (; istate < state_end; istate++) { + measure_sampler(measure_seq, last, branches[istate]->num_shots(), + *branches[istate], par_results[i], + branches[istate]->rng_shots()); + } + }; + bool can_parallel = par_shots > 1 && branches.size() > 1; +#ifdef AER_CUSTATEVEC + can_parallel &= !Base::cuStateVec_enable_; +#endif + Utils::apply_omp_parallel_for(can_parallel, 0, par_shots, + sampling_measure_func, par_shots); + + result.metadata.add(true, "shot_branching_sampling_enabled"); + } else { + // save cregs to result + auto save_cregs = [this, &branches, &par_results, par_shots](int_t i) { + uint_t istate, state_end; + istate = branches.size() * i / par_shots; + state_end = branches.size() * (i + 1) / par_shots; + + for (; istate < state_end; istate++) { + if (Base::num_process_per_experiment_ > 1) { + for (int_t j = 0; j < branches[istate]->num_shots(); j++) { + cregs_[branches[istate]->shot_index() + j] = + states_[branches[istate]->state_index()].creg(); + } + } else { + std::string memory_hex = + states_[branches[istate]->state_index()].creg().memory_hex(); + for (int_t j = 0; j < branches[istate]->num_shots(); j++) + par_results[i].data.add_accum(static_cast(1ULL), "counts", + memory_hex); + if (Base::save_creg_memory_) { + for (int_t j = 0; j < branches[istate]->num_shots(); j++) + par_results[i].data.add_list(memory_hex, "memory"); + } + } + } + }; + Utils::apply_omp_parallel_for( + (par_shots > 1 && branches.size() > 1 && shot_omp_parallel_), 0, + par_shots, save_cregs, par_shots); + } + + // clear + for (int_t i = 0; i < branches.size(); i++) { + branches[i].reset(); + } + branches.clear(); + } + + for (auto &res : par_results) { + result.combine(std::move(res)); + } +} + +template +template +void 
MultiStateExecutor::measure_sampler(InputIterator first_meas, + InputIterator last_meas, + uint_t shots, Branch &branch, + ExperimentResult &result, + std::vector &rng) { + state_t &state = states_[branch.state_index()]; + // Check if meas_circ is empty, and if so return initial creg + if (first_meas == last_meas) { + for (int_t i = 0; i < shots; i++) { + if (Base::num_process_per_experiment_ > 1) { + cregs_[branch.shot_index() + i] = state.creg(); + } else { + result.save_count_data(state.creg(), Base::save_creg_memory_); + } + } + return; + } + + std::vector meas_ops; + std::vector roerror_ops; + for (auto op = first_meas; op != last_meas; op++) { + if (op->type == Operations::OpType::roerror) { + roerror_ops.push_back(*op); + } else { /*(op.type == Operations::OpType::measure) */ + meas_ops.push_back(*op); + } + } + + // Get measured qubits from circuit sort and delete duplicates + std::vector meas_qubits; // measured qubits + for (const auto &op : meas_ops) { + for (size_t j = 0; j < op.qubits.size(); ++j) + meas_qubits.push_back(op.qubits[j]); + } + sort(meas_qubits.begin(), meas_qubits.end()); + meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()), + meas_qubits.end()); + + // Generate the samples + auto timer_start = myclock_t::now(); + std::vector all_samples; + all_samples = sample_measure(state, meas_qubits, shots, rng); + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); + result.metadata.add(time_taken, "sample_measure_time"); + + // Make qubit map of position in vector of measured qubits + std::unordered_map qubit_map; + for (uint_t j = 0; j < meas_qubits.size(); ++j) { + qubit_map[meas_qubits[j]] = j; + } + + // Maps of memory and register to qubit position + std::map memory_map; + std::map register_map; + for (const auto &op : meas_ops) { + for (size_t j = 0; j < op.qubits.size(); ++j) { + auto pos = qubit_map[op.qubits[j]]; + if (!op.memory.empty()) + memory_map[op.memory[j]] = pos; + if (!op.registers.empty()) + register_map[op.registers[j]] = pos; + } + } + + // Process samples + uint_t num_memory = + (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first; + uint_t num_registers = + (register_map.empty()) ? 
0ULL : 1 + register_map.rbegin()->first;
+  ClassicalRegister creg;
+  for (int_t i = 0; i < all_samples.size(); i++) {
+    creg = state.creg();
+
+    // process memory bit measurements
+    for (const auto &pair : memory_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}),
+                         reg_t({pair.first}), reg_t());
+    }
+    // process register bit measurements
+    for (const auto &pair : register_map) {
+      creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
+                         reg_t({pair.first}));
+    }
+
+    // process read out errors for memory and registers
+    for (const Operations::Op &roerror : roerror_ops)
+      creg.apply_roerror(roerror, rng[i]);
+
+    // save creg to gather
+    if (Base::num_process_per_experiment_ > 1) {
+      for (int_t j = 0; j < shots; j++)
+        cregs_[branch.shot_index() + j] = creg;
+    } else {
+      std::string memory_hex = creg.memory_hex();
+      result.data.add_accum(static_cast(1ULL), "counts", memory_hex);
+      if (Base::save_creg_memory_)
+        result.data.add_list(memory_hex, "memory");
+    }
+  }
+}
+
+//-------------------------------------------------------------------------
+} // end namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/parallel_state_executor.hpp b/src/simulators/parallel_state_executor.hpp new file mode 100644 index 0000000000..5e5074449c --- /dev/null +++ b/src/simulators/parallel_state_executor.hpp @@ -0,0 +1,1869 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals. 
+ */
+
+#ifndef _parallel_executor_hpp_
+#define _parallel_executor_hpp_
+
+#include "simulators/multi_state_executor.hpp"
+
+#ifdef _OPENMP
+#include
+#endif
+
+#ifdef AER_MPI
+#include
+#endif
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+//-------------------------------------------------------------------------
+// Parallel executor class implementation
+//-------------------------------------------------------------------------
+template
+class ParallelStateExecutor : public virtual MultiStateExecutor {
+  using Base = MultiStateExecutor;
+
+protected:
+  // extra parameters for parallel simulations
+  uint_t chunk_bits_; // number of qubits per chunk
+
+  bool chunk_omp_parallel_;    // whether to process the loop over chunks with
+                               // thread parallelism
+  bool global_chunk_indexing_; // use global indexing for control qubits and
+                               // diagonal matrices
+
+  reg_t qubit_map_; // qubit map to restore swapped qubits
+
+  bool multi_chunk_swap_enable_ = true; // enable multi-chunk swaps
+  // maximum buffer size in qubits for chunk swap
+  uint_t chunk_swap_buffer_qubits_ = 15;
+  uint_t max_multi_swap_; // maximum number of swaps that can be applied at a
+                          // time, calculated from chunk_swap_buffer_qubits_
+
+  uint_t cache_block_qubit_ = 0;
+
+public:
+  ParallelStateExecutor();
+  virtual ~ParallelStateExecutor();
+
+  size_t required_memory_mb(const Circuit &circuit,
+                            const Noise::NoiseModel &noise) const override {
+    state_t tmp;
+    return tmp.required_memory_mb(circuit.num_qubits, circuit.ops);
+  }
+
+  uint_t get_process_by_chunk(uint_t cid);
+
+protected:
+  void set_config(const Config &config) override;
+
+  virtual uint_t qubit_scale(void) { return 1; }
+
+  bool multiple_chunk_required(const Circuit &circuit,
+                               const Noise::NoiseModel &noise) const;
+
+  // Return cache blocking transpiler pass
+  Transpile::CacheBlocking
+  transpile_cache_blocking(const Circuit &circ, const Noise::NoiseModel &noise,
+                           const Config &config) const;
+
+  bool allocate(uint_t num_qubits, const Config &config);
+  bool allocate_states(uint_t num_shots, const Config &config) override;
+
+  void run_circuit_with_sampling(Circuit &circ, const Config &config,
+                                 RngEngine &init_rng,
+                                 ExperimentResult &result) override;
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  template
+  void measure_sampler(InputIterator first_meas, InputIterator last_meas,
+                       uint_t shots, ExperimentResult &result,
+                       RngEngine &rng) const;
+
+  // apply operations for multi-chunk simulator
+  template
+  void apply_ops_chunks(InputIterator first, InputIterator last,
+                        ExperimentResult &result, RngEngine &rng,
+                        bool final_ops);
+
+  // apply ops on cache memory
+  template
+  void apply_cache_blocking_ops(const int_t iGroup, InputIterator first,
+                                InputIterator last, ExperimentResult &result,
+                                RngEngine &rng);
+
+  // apply parallel operations (implement for each simulation method)
+  virtual bool apply_parallel_op(const Operations::Op &op,
+                                 ExperimentResult &result, RngEngine &rng,
+                                 bool final_op) = 0;
+
+  // store measure to cregs
+  void store_measure(const reg_t &outcome, const reg_t &memory,
+                     const reg_t &registers);
+
+  void apply_bfunc(const Operations::Op &op);
+  void apply_roerror(const Operations::Op &op, RngEngine &rng);
+
+  //-----------------------------------------------------------------------
+  // Initialization
+  //-----------------------------------------------------------------------
+  template
+  void initialize_from_vector(const 
list_t &vec); + + template + void initialize_from_matrix(const list_t &mat); + + // Initializes an n-qubit state to the all |0> state + virtual void initialize_qreg(uint_t num_qubits) = 0; + + //----------------------------------------------------------------------- + // Functions for multi-chunk distribution + //----------------------------------------------------------------------- + // Helper function for computing expectation value + virtual double expval_pauli(const reg_t &qubits, + const std::string &pauli) = 0; + + // Apply a save expectation value instruction + void apply_save_expval(const Operations::Op &op, ExperimentResult &result); + + // Sample n-measurement outcomes without applying the measure operation + // to the system state + virtual std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) const { + std::vector ret; + return ret; + }; + + // swap between chunks + virtual void apply_chunk_swap(const reg_t &qubits); + + // apply multiple swaps between chunks + virtual void apply_multi_chunk_swap(const reg_t &qubits); + + // apply X gate over chunks + virtual void apply_chunk_x(const uint_t qubit); + + // send/receive chunk in receive buffer + void send_chunk(uint_t local_chunk_index, uint_t global_chunk_index); + void recv_chunk(uint_t local_chunk_index, uint_t global_chunk_index); + + template + void send_data(data_t *pSend, uint_t size, uint_t myid, uint_t pairid); + template + void recv_data(data_t *pRecv, uint_t size, uint_t myid, uint_t pairid); + + // reduce values over processes + void reduce_sum(reg_t &sum) const; + void reduce_sum(rvector_t &sum) const; + void reduce_sum(complex_t &sum) const; + void reduce_sum(double &sum) const; + + // gather values on each process + void gather_value(rvector_t &val) const; + + // barrier all processes + void sync_process(void) const; + + // gather distributed state into vector (if memory is enough) + template + void gather_state(std::vector> &state); + + template + void gather_state(AER::Vector> &state); + + // collect matrix over multiple chunks + auto apply_to_matrix(bool copy = false); + + // Apply the global phase + virtual void apply_global_phase(); + + uint_t mapped_index(const uint_t idx); +}; + +template +ParallelStateExecutor::ParallelStateExecutor() { + chunk_omp_parallel_ = false; + global_chunk_indexing_ = false; + chunk_bits_ = 0; + cache_block_qubit_ = 0; +} + +template +ParallelStateExecutor::~ParallelStateExecutor() {} + +template +void ParallelStateExecutor::set_config(const Config &config) { + Base::set_config(config); + + if (config.chunk_swap_buffer_qubits.has_value()) + chunk_swap_buffer_qubits_ = config.chunk_swap_buffer_qubits.value(); + + // enable multiple qregs if cache blocking is enabled + cache_block_qubit_ = 0; + if (config.blocking_qubits.has_value()) + cache_block_qubit_ = config.blocking_qubits.value(); +} + +template +bool ParallelStateExecutor::multiple_chunk_required( + const Circuit &circ, const Noise::NoiseModel &noise) const { + if (circ.num_qubits < 3) + return false; + if (cache_block_qubit_ >= 2 && cache_block_qubit_ < circ.num_qubits) + return true; + + if (Base::num_process_per_experiment_ == 1 && + Base::sim_device_ == Device::GPU && Base::num_gpus_ > 0) { + return (Base::max_gpu_memory_mb_ / Base::num_gpus_ < + Base::required_memory_mb(circ, noise)); + } + if (Base::num_process_per_experiment_ > 1) { + size_t total_mem = Base::max_memory_mb_; + if (Base::sim_device_ == Device::GPU) + total_mem += Base::max_gpu_memory_mb_; + if (total_mem * 
Base::num_process_per_experiment_ > + Base::required_memory_mb(circ, noise)) + return true; + } + + return false; +} + +template +Transpile::CacheBlocking +ParallelStateExecutor::transpile_cache_blocking( + const Circuit &circ, const Noise::NoiseModel &noise, + const Config &config) const { + Transpile::CacheBlocking cache_block_pass; + + const bool is_matrix = (Base::method_ == Method::density_matrix || + Base::method_ == Method::unitary); + const auto complex_size = (Base::sim_precision_ == Precision::Single) + ? sizeof(std::complex) + : sizeof(std::complex); + + cache_block_pass.set_num_processes(Base::num_process_per_experiment_); + cache_block_pass.set_config(config); + + if (!cache_block_pass.enabled()) { + // if blocking is not set by config, automatically set if required + if (multiple_chunk_required(circ, noise)) { + int nplace = Base::num_process_per_experiment_; + if (Base::sim_device_ == Device::GPU && Base::num_gpus_ > 0) + nplace *= Base::num_gpus_; + cache_block_pass.set_blocking(circ.num_qubits, + Base::get_min_memory_mb() << 20, nplace, + complex_size, is_matrix); + } + } + return cache_block_pass; +} + +template +bool ParallelStateExecutor::allocate(uint_t num_qubits, + const Config &config) { + int_t i; + Base::num_qubits_ = num_qubits; + chunk_bits_ = cache_block_qubit_; + + global_chunk_indexing_ = false; + chunk_omp_parallel_ = false; + if (Base::sim_device_ == Device::GPU) { +#ifdef _OPENMP + if (omp_get_num_threads() == 1) + chunk_omp_parallel_ = true; +#endif + + global_chunk_indexing_ = true; // cuStateVec does not handle global chunk + // index for diagonal matrix +#ifdef AER_CUSTATEVEC + if (!Base::cuStateVec_enable_) + global_chunk_indexing_ = false; +#endif + } else if (Base::sim_device_ == Device::ThrustCPU) { + global_chunk_indexing_ = true; + chunk_omp_parallel_ = false; + } + + allocate_states(Base::num_local_states_, config); + + // initialize qubit map + qubit_map_.resize(Base::num_qubits_); + for (i = 0; i < Base::num_qubits_; i++) { + qubit_map_[i] = i; + } + + if (chunk_bits_ <= chunk_swap_buffer_qubits_ + 1) + multi_chunk_swap_enable_ = false; + else + max_multi_swap_ = chunk_bits_ - chunk_swap_buffer_qubits_; + + return true; +} + +template +bool ParallelStateExecutor::allocate_states(uint_t num_states, + const Config &config) { + int_t i; + bool init_states = true; + bool ret = true; + // deallocate qregs before reallocation + if (Base::states_.size() > 0) { + if (Base::states_.size() == num_states) + init_states = false; // can reuse allocated chunks + else + Base::states_.clear(); + } + if (init_states) { + Base::states_.resize(num_states); + + if (Base::num_creg_memory_ != 0 || Base::num_creg_registers_ != 0) { + for (i = 0; i < num_states; i++) { + // set number of creg bits before actual initialization + Base::states_[i].initialize_creg(Base::num_creg_memory_, + Base::num_creg_registers_); + } + } + uint_t gqubits = Base::num_qubits_ * this->qubit_scale(); + uint_t squbits; + if (chunk_bits_ == 0) + squbits = Base::num_qubits_ * this->qubit_scale(); + else + squbits = chunk_bits_ * this->qubit_scale(); + + // allocate qregs + Base::states_[0].set_config(config); + Base::states_[0].qreg().set_max_matrix_bits(Base::max_matrix_qubits_); + Base::states_[0].qreg().set_num_threads_per_group( + Base::num_threads_per_group_); + Base::states_[0].set_num_global_qubits(Base::num_qubits_); +#ifdef AER_CUSTATEVEC + Base::states_[0].qreg().cuStateVec_enable(Base::cuStateVec_enable_); +#endif + Base::states_[0].qreg().set_target_gpus(Base::target_gpus_); + + 
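// the first state allocates the chunk storage; the remaining states
+    // created in the loop below attach to it through chunk_setup()
+    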
ret &= Base::states_[0].qreg().chunk_setup( + squbits, gqubits, Base::global_state_index_, num_states); + for (i = 1; i < num_states; i++) { + Base::states_[i].set_config(config); + ret &= Base::states_[i].qreg().chunk_setup(Base::states_[0].qreg(), + Base::global_state_index_ + i); + Base::states_[i].qreg().set_num_threads_per_group( + Base::num_threads_per_group_); + Base::states_[i].set_num_global_qubits(Base::num_qubits_); + } + } + Base::num_active_states_ = num_states; + + // initialize groups + Base::top_state_of_group_.clear(); + Base::num_groups_ = 0; + for (i = 0; i < num_states; i++) { + if (Base::states_[i].qreg().top_of_group()) { + Base::top_state_of_group_.push_back(i); + Base::num_groups_++; + } + } + Base::top_state_of_group_.push_back(num_states); + Base::num_states_in_group_.resize(Base::num_groups_); + for (i = 0; i < Base::num_groups_; i++) { + Base::num_states_in_group_[i] = + Base::top_state_of_group_[i + 1] - Base::top_state_of_group_[i]; + } + return ret; +} + +template +uint_t ParallelStateExecutor::get_process_by_chunk(uint_t cid) { + uint_t i; + for (i = 0; i < Base::distributed_procs_; i++) { + if (cid >= Base::state_index_begin_[i] && cid < Base::state_index_end_[i]) { + return i; + } + } + return Base::distributed_procs_; +} + +template +uint_t ParallelStateExecutor::mapped_index(const uint_t idx) { + uint_t i, ret = 0; + uint_t t = idx; + + for (i = 0; i < Base::num_qubits_; i++) { + if (t & 1) { + ret |= (1ull << qubit_map_[i]); + } + t >>= 1; + } + return ret; +} + +template +void ParallelStateExecutor::run_circuit_with_sampling( + Circuit &circ, const Config &config, RngEngine &init_rng, + ExperimentResult &result) { + + // Optimize circuit + Noise::NoiseModel dummy_noise; + state_t dummy_state; + + bool cache_block = false; + if (multiple_chunk_required(circ, dummy_noise)) { + auto fusion_pass = Base::transpile_fusion(circ.opset(), config); + fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(), + result); + + // Cache blocking pass + auto cache_block_pass = transpile_cache_blocking(circ, dummy_noise, config); + cache_block_pass.set_sample_measure(true); + cache_block_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(), + result); + cache_block = cache_block_pass.enabled(); + } + if (!cache_block) { + return Executor::run_circuit_with_sampling(circ, config, init_rng, + result); + } + Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ); + + uint_t nchunks = + 1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale()); + Base::set_distribution(nchunks); + allocate(circ.num_qubits, config); + // Set state config + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].set_parallelization(Base::parallel_state_update_); + Base::states_[i].set_global_phase(circ.global_phase_angle); + } + Base::set_global_phase(circ.global_phase_angle); + + // run with multi-chunks + RngEngine rng = init_rng; + + auto &ops = circ.ops; + auto first_meas = circ.first_measure_pos; // Position of first measurement op + bool final_ops = (first_meas == ops.size()); + + initialize_qreg(circ.num_qubits); + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers); + } + + // Run circuit instructions before first measure + apply_ops_chunks(ops.cbegin(), ops.cbegin() + first_meas, result, rng, + final_ops); + + // Get measurement operations and set of measured qubits + measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots, + result, rng); + + // Add 
measure sampling metadata + result.metadata.add(true, "measure_sampling"); + Base::states_[0].add_metadata(result); +} + +template +void ParallelStateExecutor::run_circuit_shots( + Circuit &circ, const Noise::NoiseModel &noise, const Config &config, + RngEngine &init_rng, ExperimentResult &result, bool sample_noise) { + + if (!multiple_chunk_required(circ, noise)) { + return Base::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } + + uint_t nchunks = + 1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale()); + Base::set_distribution(nchunks); + + auto fusion_pass = Base::transpile_fusion(circ.opset(), config); + auto cache_block_pass = transpile_cache_blocking(circ, noise, config); + + for (int_t ishot = 0; ishot < circ.shots; ishot++) { + RngEngine rng; + if (ishot == 0) + rng = init_rng; + else + rng.set_seed(circ.seed + ishot); + + // Optimize circuit + Noise::NoiseModel dummy_noise; + state_t dummy_state; + + Circuit circ_opt; + if (sample_noise) { + circ_opt = noise.sample_noise(circ, rng); + } else { + circ_opt = circ; + } + fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(), + result); + Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt); + + // Cache blocking pass + cache_block_pass.set_sample_measure(false); + cache_block_pass.optimize_circuit(circ_opt, dummy_noise, + dummy_state.opset(), result); + allocate(circ.num_qubits, config); + + // Set state config + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].set_parallelization(Base::parallel_state_update_); + Base::states_[i].set_global_phase(circ.global_phase_angle); + } + Base::set_global_phase(circ.global_phase_angle); + + initialize_qreg(circ.num_qubits); + for (uint_t i = 0; i < Base::states_.size(); i++) { + Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers); + } + + apply_ops_chunks(circ_opt.ops.cbegin(), circ_opt.ops.cend(), result, rng, + true); + result.save_count_data(Base::states_[0].creg(), Base::save_creg_memory_); + } + Base::states_[0].add_metadata(result); +} + +template +template +void ParallelStateExecutor::measure_sampler(InputIterator first_meas, + InputIterator last_meas, + uint_t shots, + ExperimentResult &result, + RngEngine &rng) const { + // Check if meas_circ is empty, and if so return initial creg + if (first_meas == last_meas) { + while (shots-- > 0) { + result.save_count_data(Base::states_[0].creg(), Base::save_creg_memory_); + } + return; + } + + std::vector meas_ops; + std::vector roerror_ops; + for (auto op = first_meas; op != last_meas; op++) { + if (op->type == Operations::OpType::roerror) { + roerror_ops.push_back(*op); + } else { /*(op.type == Operations::OpType::measure) */ + meas_ops.push_back(*op); + } + } + + // Get measured qubits from circuit sort and delete duplicates + std::vector meas_qubits; // measured qubits + for (const auto &op : meas_ops) { + for (size_t j = 0; j < op.qubits.size(); ++j) + meas_qubits.push_back(op.qubits[j]); + } + sort(meas_qubits.begin(), meas_qubits.end()); + meas_qubits.erase(unique(meas_qubits.begin(), meas_qubits.end()), + meas_qubits.end()); + + // Generate the samples + auto timer_start = myclock_t::now(); + auto all_samples = sample_measure(meas_qubits, shots, rng); + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); + result.metadata.add(time_taken, "sample_measure_time"); + + // Make qubit map of position in vector of measured qubits + std::unordered_map qubit_map; + for (uint_t j = 0; j < meas_qubits.size(); 
++j) {
+    qubit_map[meas_qubits[j]] = j;
+  }
+
+  // Maps of memory and register to qubit position
+  std::map memory_map;
+  std::map register_map;
+  for (const auto &op : meas_ops) {
+    for (size_t j = 0; j < op.qubits.size(); ++j) {
+      auto pos = qubit_map[op.qubits[j]];
+      if (!op.memory.empty())
+        memory_map[op.memory[j]] = pos;
+      if (!op.registers.empty())
+        register_map[op.registers[j]] = pos;
+    }
+  }
+
+  // Process samples
+  uint_t num_memory =
+      (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
+  uint_t num_registers =
+      (register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
+  ClassicalRegister creg;
+  while (!all_samples.empty()) {
+    auto sample = all_samples.back();
+    creg.initialize(num_memory, num_registers);
+
+    // process memory bit measurements
+    for (const auto &pair : memory_map) {
+      creg.store_measure(reg_t({sample[pair.second]}), reg_t({pair.first}),
+                         reg_t());
+    }
+    // process register bit measurements
+    for (const auto &pair : register_map) {
+      creg.store_measure(reg_t({sample[pair.second]}), reg_t(),
+                         reg_t({pair.first}));
+    }
+
+    // process read out errors for memory and registers
+    for (const Operations::Op &roerror : roerror_ops) {
+      creg.apply_roerror(roerror, rng);
+    }
+
+    // Save count data
+    result.save_count_data(creg, Base::save_creg_memory_);
+
+    // pop off processed sample
+    all_samples.pop_back();
+  }
+}
+
+template
+void ParallelStateExecutor::store_measure(const reg_t &outcome,
+                                          const reg_t &memory,
+                                          const reg_t &registers) {
+  auto apply_store_measure = [this, outcome, memory, registers](int_t iGroup) {
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if (Base::cuStateVec_enable_) {
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for (int_t i = 0; i < nChunk; i++)
+      Base::states_[iChunk + i].creg().store_measure(outcome, memory,
+                                                     registers);
+  };
+  Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
+                                0, Base::num_groups_, apply_store_measure);
+}
+
+template
+void ParallelStateExecutor::apply_bfunc(const Operations::Op &op) {
+  auto bfunc_kernel = [this, op](int_t iGroup) {
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if (Base::cuStateVec_enable_) {
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for (int_t i = 0; i < nChunk; i++)
+      Base::states_[iChunk + i].creg().apply_bfunc(op);
+  };
+  Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
+                                0, Base::num_groups_, bfunc_kernel);
+}
+
+template
+void ParallelStateExecutor::apply_roerror(const Operations::Op &op,
+                                          RngEngine &rng) {
+  auto roerror_kernel = [this, op, &rng](int_t iGroup) {
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if (Base::cuStateVec_enable_) {
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for (int_t i = 0; i < nChunk; i++)
+      Base::states_[iChunk + i].creg().apply_roerror(op, rng);
+  };
+  Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
+                                0, Base::num_groups_, roerror_kernel);
+}
+
+template
+template
+void ParallelStateExecutor::apply_ops_chunks(InputIterator first,
+                                             InputIterator last,
+                                             ExperimentResult &result,
+                                             RngEngine &rng,
+                                             bool final_ops) {
+  uint_t iOp, nOp;
+  reg_t multi_swap;
+
+  nOp = std::distance(first, last);
+  iOp = 0;
+
+  while (iOp < nOp) {
+    const Operations::Op op_iOp = *(first + iOp);
+    if (op_iOp.type == Operations::OpType::gate &&
+        op_iOp.name == "swap_chunk") {
+      // 
apply swap between chunks + if (multi_chunk_swap_enable_ && op_iOp.qubits[0] < chunk_bits_ && + op_iOp.qubits[1] >= chunk_bits_) { + if (Base::distributed_proc_bits_ < 0 || + (op_iOp.qubits[1] >= + (Base::num_qubits_ * qubit_scale() - + Base::distributed_proc_bits_))) { // apply multi-swap when swap is + // cross + // qubits + multi_swap.push_back(op_iOp.qubits[0]); + multi_swap.push_back(op_iOp.qubits[1]); + if (multi_swap.size() >= max_multi_swap_ * 2) { + apply_multi_chunk_swap(multi_swap); + multi_swap.clear(); + } + } else + apply_chunk_swap(op_iOp.qubits); + } else { + if (multi_swap.size() > 0) { + apply_multi_chunk_swap(multi_swap); + multi_swap.clear(); + } + apply_chunk_swap(op_iOp.qubits); + } + iOp++; + continue; + } else if (multi_swap.size() > 0) { + apply_multi_chunk_swap(multi_swap); + multi_swap.clear(); + } + + if (op_iOp.type == Operations::OpType::sim_op && + op_iOp.name == "begin_blocking") { + // applying sequence of gates inside each chunk + + uint_t iOpEnd = iOp; + while (iOpEnd < nOp) { + const Operations::Op op_iOpEnd = *(first + iOpEnd); + if (op_iOpEnd.type == Operations::OpType::sim_op && + op_iOpEnd.name == "end_blocking") { + break; + } + iOpEnd++; + } + + uint_t iOpBegin = iOp + 1; + if (Base::num_groups_ > 1 && chunk_omp_parallel_) { +#pragma omp parallel for num_threads(Base::num_groups_) + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, + rng); + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, + rng); + } + iOp = iOpEnd; + } else { + if (!apply_parallel_op(op_iOp, result, rng, + final_ops && nOp == iOp + 1)) { + if (Base::num_groups_ > 1 && chunk_omp_parallel_) { +#pragma omp parallel for num_threads(Base::num_groups_) + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, + rng); + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) + apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, + rng); + } + } + } + iOp++; + } + + if (multi_swap.size() > 0) + apply_multi_chunk_swap(multi_swap); + + if (Base::num_groups_ > 1 && chunk_omp_parallel_) { +#pragma omp parallel for num_threads(Base::num_groups_) + for (int_t ig = 0; ig < Base::num_groups_; ig++) + Base::states_[Base::top_state_of_group_[ig]].qreg().synchronize(); + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) + Base::states_[Base::top_state_of_group_[ig]].qreg().synchronize(); + } + + if (Base::sim_device_ == Device::GPU) { +#ifdef AER_THRUST_CUDA + int nDev; + if (cudaGetDeviceCount(&nDev) != cudaSuccess) { + cudaGetLastError(); + nDev = 0; + } + if (nDev > Base::num_groups_) + nDev = Base::num_groups_; + result.metadata.add(nDev, "cacheblocking", "chunk_parallel_gpus"); +#endif + } + +#ifdef AER_MPI + result.metadata.add(multi_chunk_swap_enable_, "cacheblocking", + "multiple_chunk_swaps_enable"); + if (multi_chunk_swap_enable_) { + result.metadata.add(chunk_swap_buffer_qubits_, "cacheblocking", + "multiple_chunk_swaps_buffer_qubits"); + result.metadata.add(max_multi_swap_, "cacheblocking", + "max_multiple_chunk_swaps"); + } +#endif +} + +template +template +void ParallelStateExecutor::apply_cache_blocking_ops( + const int_t iGroup, InputIterator first, InputIterator last, + ExperimentResult &result, RngEngine &rng) { + // for each chunk in group + for (int_t iChunk = Base::top_state_of_group_[iGroup]; + iChunk < Base::top_state_of_group_[iGroup 
+ 1]; iChunk++) {
+    // fetch chunk in cache
+    if (Base::states_[iChunk].qreg().fetch_chunk()) {
+      Base::states_[iChunk].apply_ops(first, last, result, rng, false);
+
+      // release chunk from cache
+      Base::states_[iChunk].qreg().release_chunk();
+    }
+  }
+}
+
+template
+template
+void ParallelStateExecutor::initialize_from_vector(const list_t &vec) {
+  int_t iChunk;
+
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for private(iChunk)
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) {
+        list_t tmp(1ull << (chunk_bits_ * qubit_scale()));
+        for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+          tmp[i] = vec[((Base::global_state_index_ + iChunk)
+                        << (chunk_bits_ * qubit_scale())) +
+                       i];
+        }
+        Base::states_[iChunk].qreg().initialize_from_vector(tmp);
+      }
+    }
+  } else {
+    for (iChunk = 0; iChunk < Base::num_local_states_; iChunk++) {
+      list_t tmp(1ull << (chunk_bits_ * qubit_scale()));
+      for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+        tmp[i] = vec[((Base::global_state_index_ + iChunk)
+                      << (chunk_bits_ * qubit_scale())) +
+                     i];
+      }
+      Base::states_[iChunk].qreg().initialize_from_vector(tmp);
+    }
+  }
+}
+
+template
+template
+void ParallelStateExecutor::initialize_from_matrix(const list_t &mat) {
+  int_t iChunk;
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for private(iChunk)
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) {
+        list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_));
+        uint_t irow_chunk = ((iChunk + Base::global_state_index_) >>
+                             ((Base::num_qubits_ - chunk_bits_)))
+                            << (chunk_bits_);
+        uint_t icol_chunk =
+            ((iChunk + Base::global_state_index_) &
+             ((1ull << ((Base::num_qubits_ - chunk_bits_))) - 1))
+            << (chunk_bits_);
+
+        // copy part of state for this chunk
+        uint_t i, row, col;
+        for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+          uint_t icol = i & ((1ull << chunk_bits_) - 1);
+          uint_t irow = i >> chunk_bits_;
+          tmp[i] = mat[icol_chunk + icol +
+                       ((irow_chunk + irow) << Base::num_qubits_)];
+        }
+        Base::states_[iChunk].qreg().initialize_from_matrix(tmp);
+      }
+    }
+  } else {
+    for (iChunk = 0; iChunk < Base::num_local_states_; iChunk++) {
+      list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_));
+      uint_t irow_chunk = ((iChunk + Base::global_state_index_) >>
+                           ((Base::num_qubits_ - chunk_bits_)))
+                          << (chunk_bits_);
+      uint_t icol_chunk = ((iChunk + Base::global_state_index_) &
+                           ((1ull << ((Base::num_qubits_ - chunk_bits_))) - 1))
+                          << (chunk_bits_);
+
+      // copy part of state for this chunk
+      uint_t i, row, col;
+      for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) {
+        uint_t icol = i & ((1ull << chunk_bits_) - 1);
+        uint_t irow = i >> chunk_bits_;
+        tmp[i] =
+            mat[icol_chunk + icol + ((irow_chunk + irow) << Base::num_qubits_)];
+      }
+      Base::states_[iChunk].qreg().initialize_from_matrix(tmp);
+    }
+  }
+}
+
+template
+auto ParallelStateExecutor::apply_to_matrix(bool copy) {
+  // this function is used to collect states over chunks
+  int_t iChunk;
+  uint_t size = 1ull << (chunk_bits_ * qubit_scale());
+  uint_t mask = (1ull << (chunk_bits_)) - 1;
+  uint_t num_threads = Base::states_[0].qreg().get_omp_threads();
+
+  size_t size_required =
+      2 * (sizeof(std::complex) << (Base::num_qubits_ * 2)) +
+      (sizeof(std::complex) << (chunk_bits_ * 2)) *
+          
Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store states as matrix")); + } + + auto matrix = Base::states_[0].qreg().copy_to_matrix(); + + if (Base::distributed_rank_ == 0) { + matrix.resize(1ull << (Base::num_qubits_), 1ull << (Base::num_qubits_)); + + auto tmp = Base::states_[0].qreg().copy_to_matrix(); + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + int_t i; + uint_t irow_chunk = (iChunk >> ((Base::num_qubits_ - chunk_bits_))) + << chunk_bits_; + uint_t icol_chunk = + (iChunk & ((1ull << ((Base::num_qubits_ - chunk_bits_))) - 1)) + << chunk_bits_; + + if (iChunk < Base::num_local_states_) { + if (copy) + tmp = Base::states_[iChunk].qreg().copy_to_matrix(); + else + tmp = Base::states_[iChunk].qreg().move_to_matrix(); + } +#ifdef AER_MPI + else + recv_data(tmp.data(), size, 0, iChunk); +#endif +#pragma omp parallel for if (num_threads > 1) num_threads(num_threads) + for (i = 0; i < size; i++) { + uint_t irow = i >> (chunk_bits_); + uint_t icol = i & mask; + uint_t idx = + ((irow + irow_chunk) << (Base::num_qubits_)) + icol_chunk + icol; + matrix[idx] = tmp[i]; + } + } + } else { +#ifdef AER_MPI + // send matrices to process 0 + for (iChunk = 0; iChunk < Base::num_global_states_; iChunk++) { + uint_t iProc = get_process_by_chunk(iChunk); + if (iProc == Base::distributed_rank_) { + if (copy) { + auto tmp = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .copy_to_matrix(); + send_data(tmp.data(), size, iChunk, 0); + } else { + auto tmp = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .move_to_matrix(); + send_data(tmp.data(), size, iChunk, 0); + } + } + } +#endif + } + + return matrix; +} + +template +void ParallelStateExecutor::apply_save_expval( + const Operations::Op &op, ExperimentResult &result) { + // Check empty edge case + if (op.expval_params.empty()) { + throw std::invalid_argument( + "Invalid save expval instruction (Pauli components are empty)."); + } + bool variance = (op.type == Operations::OpType::save_expval_var); + + // Accumulate expval components + double expval(0.); + double sq_expval(0.); + + for (const auto ¶m : op.expval_params) { + // param is tuple (pauli, coeff, sq_coeff) + const auto val = expval_pauli(op.qubits, std::get<0>(param)); + expval += std::get<1>(param) * val; + if (variance) { + sq_expval += std::get<2>(param) * val; + } + } + if (variance) { + std::vector expval_var(2); + expval_var[0] = expval; // mean + expval_var[1] = sq_expval - expval * expval; // variance + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + expval_var, op.type, op.save_type); + } else { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + expval, op.type, op.save_type); + } +} + +template +void ParallelStateExecutor::apply_global_phase() { + if (Base::has_global_phase_) { + if (chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) + Base::states_[iChunk].qreg().apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } +} + +template +void ParallelStateExecutor::apply_chunk_swap(const reg_t &qubits) { + 
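+  // Overview of the cases handled below: the last two entries of `qubits`
+  // are the swapped pair (normalized so that q0 < q1).
+  //  1. q1 inside the chunk: apply mcswap locally on every chunk.
+  //  2. swap crosses the chunk boundary but no process boundary: pair up
+  //     chunks whose indices differ in the out-of-chunk bit(s) and swap
+  //     amplitudes directly between the paired chunks.
+  //  3. swap crosses processes: enumerate chunk pairs with a mixed-radix
+  //     counter, exchange buffers with MPI_Isend/MPI_Irecv, then finish
+  //     the swap from the receive buffer.
+  // Illustration (statevector, qubit_scale() == 1): num_qubits_ = 30 and
+  // chunk_bits_ = 20 give 2^10 chunks; swapping q0 = 5 with q1 = 25 pairs
+  // each chunk ic having chunk-index bit 5 clear with chunk ic | (1 << 5).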
+  uint_t nLarge = 1;
+  uint_t q0, q1;
+  int_t iChunk;
+
+  q0 = qubits[qubits.size() - 2];
+  q1 = qubits[qubits.size() - 1];
+
+  if (qubit_scale() == 1) {
+    std::swap(qubit_map_[q0], qubit_map_[q1]);
+  }
+
+  if (q0 > q1) {
+    std::swap(q0, q1);
+  }
+
+  if (q1 < chunk_bits_ * qubit_scale()) {
+    // inside chunk
+    if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for num_threads(Base::num_groups_)
+      for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+        for (int_t iChunk = Base::top_state_of_group_[ig];
+             iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+          Base::states_[iChunk].qreg().apply_mcswap(qubits);
+      }
+    } else {
+      for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+        for (int_t iChunk = Base::top_state_of_group_[ig];
+             iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+          Base::states_[iChunk].qreg().apply_mcswap(qubits);
+      }
+    }
+  } else { // swap over chunks
+    uint_t mask0, mask1;
+
+    mask0 = (1ull << q0);
+    mask1 = (1ull << q1);
+    mask0 >>= (chunk_bits_ * qubit_scale());
+    mask1 >>= (chunk_bits_ * qubit_scale());
+
+    if (Base::distributed_procs_ == 1 ||
+        (Base::distributed_proc_bits_ >= 0 &&
+         q1 < (Base::num_qubits_ * qubit_scale() -
+               Base::distributed_proc_bits_))) { // no data transfer between
+                                                 // processes is needed
+      auto apply_chunk_swap_1qubit = [this, mask1, qubits](int_t iGroup) {
+        for (int_t ic = Base::top_state_of_group_[iGroup];
+             ic < Base::top_state_of_group_[iGroup + 1]; ic++) {
+          uint_t baseChunk;
+          baseChunk = ic & (~mask1);
+          if (ic == baseChunk)
+            Base::states_[ic].qreg().apply_chunk_swap(
+                qubits, Base::states_[ic | mask1].qreg(), true);
+        }
+      };
+      auto apply_chunk_swap_2qubits = [this, mask0, mask1,
+                                       qubits](int_t iGroup) {
+        for (int_t ic = Base::top_state_of_group_[iGroup];
+             ic < Base::top_state_of_group_[iGroup + 1]; ic++) {
+          uint_t baseChunk;
+          baseChunk = ic & (~(mask0 | mask1));
+          uint_t iChunk1 = baseChunk | mask0;
+          uint_t iChunk2 = baseChunk | mask1;
+          if (ic == iChunk1)
+            Base::states_[iChunk1].qreg().apply_chunk_swap(
+                qubits, Base::states_[iChunk2].qreg(), true);
+        }
+      };
+      if (q0 < chunk_bits_ * qubit_scale())
+        Utils::apply_omp_parallel_for(
+            (chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
+            Base::num_groups_, apply_chunk_swap_1qubit);
+      else
+        Utils::apply_omp_parallel_for(
+            (chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
+            Base::num_groups_, apply_chunk_swap_2qubits);
+    }
+#ifdef AER_MPI
+    else {
+      int_t iPair;
+      uint_t nPair;
+      uint_t baseChunk, iChunk1, iChunk2;
+
+      if (q0 < chunk_bits_ * qubit_scale())
+        nLarge = 1;
+      else
+        nLarge = 2;
+
+      // chunk scheduler that supports any number of processes
+      uint_t nu[3];
+      uint_t ub[3];
+      uint_t iu[3];
+      uint_t add;
+      uint_t iLocalChunk, iRemoteChunk, iProc;
+      int i;
+
+      if (q0 < chunk_bits_ * qubit_scale()) {
+        nLarge = 1;
+        nu[0] = 1ull << (q1 - chunk_bits_ * qubit_scale());
+        ub[0] = 0;
+        iu[0] = 0;
+
+        nu[1] = 1ull << (Base::num_qubits_ * qubit_scale() - q1 - 1);
+        ub[1] = (q1 - chunk_bits_ * qubit_scale()) + 1;
+        iu[1] = 0;
+      } else {
+        nLarge = 2;
+        nu[0] = 1ull << (q0 - chunk_bits_ * qubit_scale());
+        ub[0] = 0;
+        iu[0] = 0;
+
+        nu[1] = 1ull << (q1 - q0 - 1);
+        ub[1] = (q0 - chunk_bits_ * qubit_scale()) + 1;
+        iu[1] = 0;
+
+        nu[2] = 1ull << (Base::num_qubits_ * qubit_scale() - q1 - 1);
+        ub[2] = (q1 - chunk_bits_ * qubit_scale()) + 1;
+        iu[2] = 0;
+      }
+      nPair = 1ull << (Base::num_qubits_ * qubit_scale() -
+                       chunk_bits_ * qubit_scale() - nLarge);
+
+      for (iPair = 0; iPair < nPair; iPair++) {
+        // calculate index of pair of chunks
+        baseChunk = 0;
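+        // nu[]/ub[]/iu[] form a mixed-radix counter over the chunk-index
+        // bits that are not being swapped: digit iu[i] runs from 0 to
+        // nu[i]-1 at bit offset ub[i], so baseChunk = sum_i(iu[i] << ub[i])
+        // visits every chunk index with the swapped bit(s) cleared; the
+        // loop below rebuilds baseChunk from the digits and then advances
+        // the counter with a carry flag (add).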
+        add = 1;
+        for (i = nLarge; i >= 0; i--) {
+          baseChunk += (iu[i] << ub[i]);
+          // update for next
+          iu[i] += add;
+          add = 0;
+          if (iu[i] >= nu[i]) {
+            iu[i] = 0;
+            add = 1;
+          }
+        }
+
+        iChunk1 = baseChunk | mask0;
+        iChunk2 = baseChunk | mask1;
+
+        if (iChunk1 >= Base::state_index_begin_[Base::distributed_rank_] &&
+            iChunk1 < Base::state_index_end_
+                          [Base::distributed_rank_]) { // chunk1 is on
+                                                       // this process
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            Base::states_[iChunk1 - Base::global_state_index_]
+                .qreg()
+                .apply_chunk_swap(
+                    qubits,
+                    Base::states_[iChunk2 - Base::global_state_index_].qreg(),
+                    true);
+            continue;
+          } else {
+            iLocalChunk = iChunk1;
+            iRemoteChunk = iChunk2;
+            iProc = get_process_by_chunk(iChunk2);
+          }
+        } else {
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            iLocalChunk = iChunk2;
+            iRemoteChunk = iChunk1;
+            iProc = get_process_by_chunk(iChunk1);
+          } else {
+            continue; // there is no chunk for this pair on this process
+          }
+        }
+
+        MPI_Request reqSend, reqRecv;
+        MPI_Status st;
+        uint_t sizeRecv, sizeSend;
+
+        auto pRecv = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .recv_buffer(sizeRecv);
+        MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqRecv);
+
+        auto pSend = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .send_buffer(sizeSend);
+        MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqSend);
+
+        MPI_Wait(&reqSend, &st);
+        MPI_Wait(&reqRecv, &st);
+
+        Base::states_[iLocalChunk - Base::global_state_index_]
+            .qreg()
+            .apply_chunk_swap(qubits, iRemoteChunk);
+      }
+    }
+#endif
+  }
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::apply_multi_chunk_swap(
+    const reg_t &qubits) {
+  int_t nswap = qubits.size() / 2;
+  reg_t chunk_shuffle_qubits(nswap, 0);
+  reg_t local_swaps;
+  uint_t baseChunk = 0;
+  uint_t nchunk = 1ull << nswap;
+  reg_t chunk_procs(nchunk);
+  reg_t chunk_offset(nchunk);
+
+  if (qubit_scale() == 1) {
+    for (int_t i = 0; i < nswap; i++)
+      std::swap(qubit_map_[qubits[i * 2]], qubit_map_[qubits[i * 2] + 1]);
+  }
+
+  // define local swaps
+  for (int_t i = 0; i < nswap; i++) {
+    if (qubits[i * 2] >= chunk_bits_ * qubit_scale() - nswap) // no swap
+                                                              // required
+      chunk_shuffle_qubits[qubits[i * 2] + nswap -
+                           chunk_bits_ * qubit_scale()] = qubits[i * 2 + 1];
+  }
+  int_t pos = 0;
+  for (int_t i = 0; i < nswap; i++) {
+    if (qubits[i * 2] <
+        chunk_bits_ * qubit_scale() - nswap) { // local swap required
+      // find empty position
+      while (pos < nswap) {
+        if (chunk_shuffle_qubits[pos] < chunk_bits_ * qubit_scale()) {
+          chunk_shuffle_qubits[pos] = qubits[i * 2 + 1];
+          local_swaps.push_back(qubits[i * 2]);
+          local_swaps.push_back(chunk_bits_ * qubit_scale() - nswap + pos);
+          pos++;
+          break;
+        }
+        pos++;
+      }
+    }
+  }
+  for (int_t i = 0; i < nswap; i++)
+    chunk_shuffle_qubits[i] -= chunk_bits_ * qubit_scale();
+
+  // swap inside chunks to prepare for all-to-all shuffle
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  } else {
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  }
+
+  // apply all-to-all chunk shuffle
+  int_t nPair;
+  reg_t chunk_shuffle_qubits_sorted = chunk_shuffle_qubits;
+  std::sort(chunk_shuffle_qubits_sorted.begin(),
+            chunk_shuffle_qubits_sorted.end());
+
+  nPair = Base::num_global_states_ >> nswap;
+
+  for (uint_t i = 0; i < nchunk; i++) {
+    chunk_offset[i] = 0;
+    for (uint_t k = 0; k < nswap; k++) {
+      if (((i >> k) & 1) != 0)
+        chunk_offset[i] += (1ull << chunk_shuffle_qubits[k]);
+    }
+  }
+
+#ifdef AER_MPI
+  std::vector<MPI_Request> reqSend(nchunk);
+  std::vector<MPI_Request> reqRecv(nchunk);
+#endif
+
+  for (int_t iPair = 0; iPair < nPair; iPair++) {
+    uint_t i1, i2, k, ii, t;
+    baseChunk = 0;
+    ii = iPair;
+    for (k = 0; k < nswap; k++) {
+      t = ii & ((1ull << chunk_shuffle_qubits_sorted[k]) - 1);
+      baseChunk += t;
+      ii = (ii - t) << 1;
+    }
+    baseChunk += ii;
+
+    for (i1 = 0; i1 < nchunk; i1++) {
+      chunk_procs[i1] = get_process_by_chunk(baseChunk + chunk_offset[i1]);
+    }
+
+    // all-to-all
+    // send data
+    for (uint_t iswap = 1; iswap < nchunk; iswap++) {
+      uint_t sizeRecv, sizeSend;
+      uint_t num_local_swap = 0;
+      for (i1 = 0; i1 < nchunk; i1++) {
+        i2 = i1 ^ iswap;
+        if (i1 >= i2)
+          continue;
+
+        uint_t iProc1 = chunk_procs[i1];
+        uint_t iProc2 = chunk_procs[i2];
+        if (iProc1 != Base::distributed_rank_ &&
+            iProc2 != Base::distributed_rank_)
+          continue;
+        if (iProc1 == iProc2) { // on the same process
+          num_local_swap++;
+          continue; // swap while data is exchanged between processes
+        }
+#ifdef AER_MPI
+        uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap);
+        uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap);
+        uint_t iChunk1 =
+            baseChunk + chunk_offset[i1] - Base::global_state_index_;
+        uint_t iChunk2 =
+            baseChunk + chunk_offset[i2] - Base::global_state_index_;
+
+        int_t tid = (iPair << nswap) + iswap;
+
+        if (iProc1 == Base::distributed_rank_) {
+          auto pRecv = Base::states_[iChunk1].qreg().recv_buffer(sizeRecv);
+          MPI_Irecv(pRecv + offset2, (sizeRecv >> nswap), MPI_BYTE, iProc2,
+                    tid, Base::distributed_comm_, &reqRecv[i2]);
+
+          auto pSend = Base::states_[iChunk1].qreg().send_buffer(sizeSend);
+          MPI_Isend(pSend + offset2, (sizeSend >> nswap), MPI_BYTE, iProc2,
+                    tid, Base::distributed_comm_, &reqSend[i2]);
+        } else {
+          auto pRecv = Base::states_[iChunk2].qreg().recv_buffer(sizeRecv);
+          MPI_Irecv(pRecv + offset1, (sizeRecv >> nswap), MPI_BYTE, iProc1,
+                    tid, Base::distributed_comm_, &reqRecv[i1]);
+
+          auto pSend = Base::states_[iChunk2].qreg().send_buffer(sizeSend);
+          MPI_Isend(pSend + offset1, (sizeSend >> nswap), MPI_BYTE, iProc1,
+                    tid, Base::distributed_comm_, &reqSend[i1]);
+        }
+#endif
+      }
+
+      // swaps inside process
+      if (num_local_swap > 0) {
+        for (i1 = 0; i1 < nchunk; i1++) {
+          i2 = i1 ^ iswap;
+          if (i1 > i2)
+            continue;
+
+          uint_t iProc1 = chunk_procs[i1];
+          uint_t iProc2 = chunk_procs[i2];
+          if (iProc1 != Base::distributed_rank_ &&
+              iProc2 != Base::distributed_rank_)
+            continue;
+          if (iProc1 == iProc2) { // on the same process
+            uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap);
+            uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap);
+            uint_t iChunk1 =
+                baseChunk + chunk_offset[i1] - Base::global_state_index_;
+            uint_t iChunk2 =
+                baseChunk + chunk_offset[i2] - Base::global_state_index_;
+            Base::states_[iChunk1].qreg().apply_chunk_swap(
+                Base::states_[iChunk2].qreg(), offset2, offset1,
+                (1ull << (chunk_bits_ * qubit_scale() - nswap)));
+          }
+        }
+      }
+
+#ifdef AER_MPI
+      // recv data
+      for (i1 = 0; i1 < nchunk; i1++) {
+        i2 = i1 ^ iswap;
+
+        uint_t iProc1 = chunk_procs[i1];
+        uint_t iProc2 = chunk_procs[i2];
+        if (iProc1 != Base::distributed_rank_)
+          continue;
+        if (iProc1 == iProc2) { // on the same process
+          continue;
+        }
+        uint_t iChunk1 =
+            baseChunk + chunk_offset[i1] - Base::global_state_index_;
+        uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap);
+
+        MPI_Status st;
+        MPI_Wait(&reqSend[i2], &st);
+        MPI_Wait(&reqRecv[i2], &st);
+
+        // copy states from recv buffer to chunk
+        Base::states_[iChunk1].qreg().apply_chunk_swap(
+            Base::states_[iChunk1].qreg(), offset2, offset2,
+            (1ull << (chunk_bits_ * qubit_scale() - nswap)));
+      }
+#endif
+    }
+  }
+
+  // restore qubits order
+  if (chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  } else {
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_multi_swaps(local_swaps);
+    }
+  }
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::apply_chunk_x(const uint_t qubit) {
+  int_t iChunk;
+  uint_t nLarge = 1;
+
+  if (qubit < chunk_bits_ * qubit_scale()) {
+    auto apply_mcx = [this, qubit](int_t ig) {
+      reg_t qubits(1, qubit);
+      for (int_t iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++)
+        Base::states_[iChunk].qreg().apply_mcx(qubits);
+    };
+    Utils::apply_omp_parallel_for(
+        (chunk_omp_parallel_ && Base::num_groups_ > 1), 0, Base::num_groups_,
+        apply_mcx);
+  } else { // exchange over chunks
+    int_t iPair;
+    uint_t nPair, mask;
+    uint_t baseChunk, iChunk1, iChunk2;
+    reg_t qubits(2);
+    qubits[0] = qubit;
+    qubits[1] = qubit;
+
+    mask = (1ull << qubit);
+    mask >>= (chunk_bits_ * qubit_scale());
+
+    if (Base::distributed_procs_ == 1 ||
+        (Base::distributed_proc_bits_ >= 0 &&
+         qubit < (Base::num_qubits_ * qubit_scale() -
+                  Base::distributed_proc_bits_))) { // no data transfer between
+                                                    // processes is needed
+      nPair = Base::num_local_states_ >> 1;
+
+      auto apply_chunk_swap = [this, mask, qubits](int_t iGroup) {
+        for (int_t ic = Base::top_state_of_group_[iGroup];
+             ic < Base::top_state_of_group_[iGroup + 1]; ic++) {
+          uint_t pairChunk;
+          pairChunk = ic ^ mask;
+          if (ic < pairChunk)
+            Base::states_[ic].qreg().apply_chunk_swap(
+                qubits, Base::states_[pairChunk].qreg(), true);
+        }
+      };
+      Utils::apply_omp_parallel_for(
+          (chunk_omp_parallel_ && Base::num_groups_ > 1), 0, nPair,
+          apply_chunk_swap);
+    }
+#ifdef AER_MPI
+    else {
+      // chunk scheduler that supports any number of processes
+      uint_t nu[3];
+      uint_t ub[3];
+      uint_t iu[3];
+      uint_t add;
+      uint_t iLocalChunk, iRemoteChunk, iProc;
+      int i;
+
+      nLarge = 1;
+      nu[0] = 1ull << (qubit - chunk_bits_ * qubit_scale());
+      ub[0] = 0;
+      iu[0] = 0;
+
+      nu[1] = 1ull << (Base::num_qubits_ * qubit_scale() - qubit - 1);
+      ub[1] = (qubit - chunk_bits_ * qubit_scale()) + 1;
+      iu[1] = 0;
+      nPair = 1ull << (Base::num_qubits_ * qubit_scale() -
+                       chunk_bits_ * qubit_scale() - 1);
+
+      for (iPair = 0; iPair < nPair; iPair++) {
+        // calculate index of pair of chunks
+        baseChunk = 0;
+        add = 1;
+        for (i = 1; i >= 0; i--) {
+          baseChunk += (iu[i] << ub[i]);
+          // update for next
+          iu[i] += add;
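+          // mixed-radix carry: clear the flag, then set it again if this
+          // digit wrapped around its radix (same scheme as apply_chunk_swap)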
+          add = 0;
+          if (iu[i] >= nu[i]) {
+            iu[i] = 0;
+            add = 1;
+          }
+        }
+
+        iChunk1 = baseChunk;
+        iChunk2 = baseChunk | mask;
+
+        if (iChunk1 >= Base::state_index_begin_[Base::distributed_rank_] &&
+            iChunk1 < Base::state_index_end_
+                          [Base::distributed_rank_]) { // chunk1 is on
+                                                       // this process
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            Base::states_[iChunk1 - Base::global_state_index_]
+                .qreg()
+                .apply_chunk_swap(
+                    qubits,
+                    Base::states_[iChunk2 - Base::global_state_index_].qreg(),
+                    true);
+            continue;
+          } else {
+            iLocalChunk = iChunk1;
+            iRemoteChunk = iChunk2;
+            iProc = get_process_by_chunk(iChunk2);
+          }
+        } else {
+          if (iChunk2 >= Base::state_index_begin_[Base::distributed_rank_] &&
+              iChunk2 < Base::state_index_end_
+                            [Base::distributed_rank_]) { // chunk2 is on
+                                                         // this process
+            iLocalChunk = iChunk2;
+            iRemoteChunk = iChunk1;
+            iProc = get_process_by_chunk(iChunk1);
+          } else {
+            continue; // there is no chunk for this pair on this process
+          }
+        }
+
+        MPI_Request reqSend, reqRecv;
+        MPI_Status st;
+        uint_t sizeRecv, sizeSend;
+
+        auto pSend = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .send_buffer(sizeSend);
+        MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqSend);
+
+        auto pRecv = Base::states_[iLocalChunk - Base::global_state_index_]
+                         .qreg()
+                         .recv_buffer(sizeRecv);
+        MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair,
+                  Base::distributed_comm_, &reqRecv);
+
+        MPI_Wait(&reqSend, &st);
+        MPI_Wait(&reqRecv, &st);
+
+        Base::states_[iLocalChunk - Base::global_state_index_]
+            .qreg()
+            .apply_chunk_swap(qubits, iRemoteChunk);
+      }
+    }
+#endif
+  }
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::send_chunk(uint_t local_chunk_index,
+                                                uint_t global_pair_index) {
+#ifdef AER_MPI
+  MPI_Request reqSend;
+  MPI_Status st;
+  uint_t sizeSend;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(global_pair_index);
+
+  auto pSend = Base::states_[local_chunk_index].qreg().send_buffer(sizeSend);
+  MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc,
+            local_chunk_index + Base::global_state_index_,
+            Base::distributed_comm_, &reqSend);
+
+  MPI_Wait(&reqSend, &st);
+
+  Base::states_[local_chunk_index].qreg().release_send_buffer();
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::recv_chunk(uint_t local_chunk_index,
+                                                uint_t global_pair_index) {
+#ifdef AER_MPI
+  MPI_Request reqRecv;
+  MPI_Status st;
+  uint_t sizeRecv;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(global_pair_index);
+
+  auto pRecv = Base::states_[local_chunk_index].qreg().recv_buffer(sizeRecv);
+  MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, global_pair_index,
+            Base::distributed_comm_, &reqRecv);
+
+  MPI_Wait(&reqRecv, &st);
+#endif
+}
+
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::send_data(data_t *pSend, uint_t size,
+                                               uint_t myid, uint_t pairid) {
+#ifdef AER_MPI
+  MPI_Request reqSend;
+  MPI_Status st;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(pairid);
+
+  MPI_Isend(pSend, size * sizeof(data_t), MPI_BYTE, iProc, myid,
+            Base::distributed_comm_, &reqSend);
+
+  MPI_Wait(&reqSend, &st);
+#endif
+}
+
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::recv_data(data_t *pRecv, uint_t size,
+                                               uint_t myid, uint_t pairid) {
+#ifdef AER_MPI
+  MPI_Request reqRecv;
+  MPI_Status st;
+  uint_t iProc;
+
+  iProc = get_process_by_chunk(pairid);
+
+  MPI_Irecv(pRecv, size * sizeof(data_t), MPI_BYTE, iProc, pairid,
+            Base::distributed_comm_, &reqRecv);
+
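+  // Irecv followed immediately by Wait acts as a blocking receive; the tag
+  // (pairid, the remote chunk id) matches the tag the sender passes as
+  // myid in send_data() above.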
+  MPI_Wait(&reqRecv, &st);
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(reg_t &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t i, n = sum.size();
+    reg_t tmp(n);
+    MPI_Allreduce(&sum[0], &tmp[0], n, MPI_UINT64_T, MPI_SUM,
+                  Base::distributed_comm_);
+    for (i = 0; i < n; i++) {
+      sum[i] = tmp[i];
+    }
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(rvector_t &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t i, n = sum.size();
+    rvector_t tmp(n);
+    MPI_Allreduce(&sum[0], &tmp[0], n, MPI_DOUBLE_PRECISION, MPI_SUM,
+                  Base::distributed_comm_);
+    for (i = 0; i < n; i++) {
+      sum[i] = tmp[i];
+    }
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(complex_t &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    complex_t tmp;
+    MPI_Allreduce(&sum, &tmp, 2, MPI_DOUBLE_PRECISION, MPI_SUM,
+                  Base::distributed_comm_);
+    sum = tmp;
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::reduce_sum(double &sum) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    double tmp;
+    MPI_Allreduce(&sum, &tmp, 1, MPI_DOUBLE_PRECISION, MPI_SUM,
+                  Base::distributed_comm_);
+    sum = tmp;
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::gather_value(rvector_t &val) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    rvector_t tmp = val;
+    MPI_Alltoall(&tmp[0], 1, MPI_DOUBLE_PRECISION, &val[0], 1,
+                 MPI_DOUBLE_PRECISION, Base::distributed_comm_);
+  }
+#endif
+}
+
+template <class state_t>
+void ParallelStateExecutor<state_t>::sync_process(void) const {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    MPI_Barrier(Base::distributed_comm_);
+  }
+#endif
+}
+
+// gather distributed state into vector (if memory is enough)
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::gather_state(
+    std::vector<std::complex<data_t>> &state) {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t size, local_size, global_size, offset;
+    int i;
+    std::vector<int> recv_counts(Base::distributed_procs_);
+    std::vector<int> recv_offset(Base::distributed_procs_);
+
+    global_size = 0;
+    for (i = 0; i < Base::distributed_procs_; i++) {
+      recv_offset[i] =
+          (int)(Base::state_index_begin_[i] << (chunk_bits_ * qubit_scale())) *
+          2;
+      recv_counts[i] =
+          (int)((Base::state_index_end_[i] - Base::state_index_begin_[i])
+                << (chunk_bits_ * qubit_scale()));
+      global_size += recv_counts[i];
+      recv_counts[i] *= 2;
+    }
+    if ((global_size >> 21) > Utils::get_system_memory_mb()) {
+      throw std::runtime_error(
+          std::string("There is not enough memory to gather state"));
+    }
+    std::vector<std::complex<data_t>> local_state = state;
+    state.resize(global_size);
+
+    if (sizeof(std::complex<data_t>) == 16) {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0],
+                     &recv_offset[0], MPI_DOUBLE_PRECISION,
+                     Base::distributed_comm_);
+    } else {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0],
+                     MPI_FLOAT, Base::distributed_comm_);
+    }
+  }
+#endif
+}
+
+template <class state_t>
+template <class data_t>
+void ParallelStateExecutor<state_t>::gather_state(
+    AER::Vector<std::complex<data_t>> &state) {
+#ifdef AER_MPI
+  if (Base::distributed_procs_ > 1) {
+    uint_t size, local_size, global_size, offset;
+    int i;
+
+    std::vector<int> recv_counts(Base::distributed_procs_);
+    std::vector<int> recv_offset(Base::distributed_procs_);
+
+    global_size = 0;
+    for (i = 0; i < Base::distributed_procs_; i++) {
+      recv_offset[i] =
+          (int)(Base::state_index_begin_[i] << (chunk_bits_ * qubit_scale())) *
+          2;
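+      // Allgatherv counts/offsets are in units of real scalars (two per
+      // complex amplitude), hence the factor of 2 here and the *= 2 below;
+      // global_size is accumulated before that scaling, so it stays in
+      // complex elements for the resize.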
+      recv_counts[i] =
+          (int)((Base::state_index_end_[i] - Base::state_index_begin_[i])
+                << (chunk_bits_ * qubit_scale()));
+      global_size += recv_counts[i];
+      recv_counts[i] *= 2;
+    }
+    if ((global_size >> 21) > Utils::get_system_memory_mb()) {
+      throw std::runtime_error(
+          std::string("There is not enough memory to gather state"));
+    }
+    AER::Vector<std::complex<data_t>> local_state = state;
+    state.resize(global_size);
+
+    if (sizeof(std::complex<data_t>) == 16) {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0],
+                     &recv_offset[0], MPI_DOUBLE_PRECISION,
+                     Base::distributed_comm_);
+    } else {
+      MPI_Allgatherv(local_state.data(), recv_counts[Base::distributed_rank_],
+                     MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0],
+                     MPI_FLOAT, Base::distributed_comm_);
+    }
+  }
+#endif
+}
+
+//-------------------------------------------------------------------------
+} // end namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/shot_branching.hpp b/src/simulators/shot_branching.hpp
new file mode 100644
index 0000000000..358b07c08d
--- /dev/null
+++ b/src/simulators/shot_branching.hpp
@@ -0,0 +1,301 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _shot_branching_hpp
+#define _shot_branching_hpp
+
+namespace AER {
+
+namespace CircuitExecutor {
+
+using OpItr = std::vector<Operations::Op>::const_iterator;
+
+class Branch;
+
+// class for shared state for shot-branching
+class Branch {
+protected:
+  uint_t state_index_; // state index
+  uint_t root_state_index_;
+
+  uint_t shot_index_; // starting shot index
+
+  // creg to be stored to the state
+  ClassicalRegister creg_;
+  // random generators for shots
+  std::vector<RngEngine> shots_;
+  // additional operations applied after shot branching
+  std::vector<Operations::Op> additional_ops_;
+
+  // mark for control flow
+  std::unordered_map<std::string, OpItr> flow_marks_;
+
+  // current iterator of operations
+  OpItr iter_;
+
+  // branches from this
+  std::vector<std::shared_ptr<Branch>> branches_;
+
+public:
+  Branch(void) {}
+  ~Branch() {
+    shots_.clear();
+    additional_ops_.clear();
+    branches_.clear();
+  }
+  Branch(const Branch &src) {
+    shots_ = src.shots_;
+    creg_ = src.creg_;
+    iter_ = src.iter_;
+    flow_marks_ = src.flow_marks_;
+  }
+
+  uint_t &state_index(void) { return state_index_; }
+  uint_t &root_state_index(void) { return root_state_index_; }
+  uint_t &shot_index(void) { return shot_index_; }
+  ClassicalRegister &creg(void) { return creg_; }
+  std::vector<RngEngine> &rng_shots(void) { return shots_; }
+  OpItr &op_iterator(void) { return iter_; }
+  std::unordered_map<std::string, OpItr> &marks(void) { return flow_marks_; }
+  uint_t num_branches(void) { return branches_.size(); }
+  std::vector<std::shared_ptr<Branch>> &branches(void) { return branches_; }
+
+  uint_t num_shots(void) { return shots_.size(); }
+  void clear(void) {
+    shots_.clear();
+    additional_ops_.clear();
+    branches_.clear();
+  }
+  void clear_branch(void) { branches_.clear(); }
+
+  void set_shots(std::vector<RngEngine> &shots) { shots_ = shots; }
+  void initialize_shots(const uint_t nshots, const uint_t seed) {
+    shots_.resize(nshots);
+    for (int_t i = 0; i < nshots; i++) {
+      shots_[i].set_seed(seed + i);
+    }
+  }
+
+  void add_op_after_branch(Operations::Op &op) {
+    additional_ops_.push_back(op);
+  }
+  void copy_ops_after_branch(std::vector<Operations::Op> &ops) {
+    additional_ops_ = ops;
+  }
+  void clear_additional_ops(void) { additional_ops_.clear(); }
+
+  std::vector<Operations::Op> &additional_ops(void) { return additional_ops_; }
+
+  void branch_shots(reg_t &shots, int_t nbranch);
+
+  bool apply_control_flow(ClassicalRegister &creg, OpItr last) {
+    if (iter_->type == Operations::OpType::mark) {
+      flow_marks_[iter_->string_params[0]] = iter_;
+      iter_++;
+      return true;
+    } else if (iter_->type == Operations::OpType::jump) {
+      if (creg.check_conditional(*iter_)) {
+        const auto &mark_name = iter_->string_params[0];
+        auto mark_it = flow_marks_.find(mark_name);
+        if (mark_it != flow_marks_.end()) {
+          iter_ = mark_it->second;
+        } else {
+          for (++iter_; iter_ != last; ++iter_) {
+            if (iter_->type == Operations::OpType::mark) {
+              flow_marks_[iter_->string_params[0]] = iter_;
+              if (iter_->string_params[0] == mark_name) {
+                break;
+              }
+            }
+          }
+          if (iter_ == last) {
+            std::stringstream msg;
+            msg << "Invalid jump destination:\"" << mark_name << "\"."
+                << std::endl;
+            throw std::runtime_error(msg.str());
+          }
+        }
+      }
+      iter_++;
+      return true;
+    }
+    return false;
+  }
+
+  void advance_iterator(void);
+
+  bool apply_runtime_noise_sampling(const ClassicalRegister &creg,
+                                    const Operations::Op &op,
+                                    const Noise::NoiseModel &noise);
+
+  void remove_empty_branches(void);
+};
+
+void Branch::branch_shots(reg_t &shots, int_t nbranch) {
+  branches_.resize(nbranch);
+
+  for (int_t i = 0; i < nbranch; i++) {
+    branches_[i] = std::make_shared<Branch>();
+    branches_[i]->creg_ = creg_;
+    branches_[i]->iter_ = iter_;
+    branches_[i]->flow_marks_ = flow_marks_;
+  }
+  for (int_t i = 0; i < shots.size(); i++) {
+    branches_[shots[i]]->shots_.push_back(shots_[i]);
+  }
+  // update shot indices
+  uint_t index = shot_index_;
+  for (int_t i = 0; i < nbranch; i++) {
+    branches_[i]->shot_index_ = index;
+    index += branches_[i]->shots_.size();
+  }
+}
+
+void Branch::advance_iterator(void) {
+  iter_++;
+  for (int_t i = 0; i < branches_.size(); i++) {
+    branches_[i]->iter_++;
+  }
+}
+
+bool Branch::apply_runtime_noise_sampling(const ClassicalRegister &creg,
+                                          const Operations::Op &op,
+                                          const Noise::NoiseModel &noise) {
+  if (op.type != Operations::OpType::sample_noise)
+    return false;
+
+  uint_t nshots = num_shots();
+  reg_t shot_map(nshots);
+  std::vector<std::vector<Operations::Op>> noises;
+
+  for (int_t i = 0; i < nshots; i++) {
+    std::vector<Operations::Op> noise_ops =
+        noise.sample_noise_loc(op, shots_[i]);
+
+    // search same noise ops
+    int_t pos = -1;
+    for (int_t j = 0; j < noises.size(); j++) {
+      if (noise_ops.size() != noises[j].size())
+        continue;
+      bool same = true;
+      for (int_t k = 0; k < noise_ops.size(); k++) {
+        if (noise_ops[k].type != noises[j][k].type ||
+            noise_ops[k].name != noises[j][k].name)
+          same = false;
+        else if (noise_ops[k].qubits.size() != noises[j][k].qubits.size())
+          same = false;
+        else {
+          for (int_t l = 0; l < noise_ops[k].qubits.size(); l++) {
+            if (noise_ops[k].qubits[l] != noises[j][k].qubits[l]) {
+              same = false;
+              break;
+            }
+          }
+        }
+        if (!same)
+          break;
+        if (noise_ops[k].type == Operations::OpType::gate) {
+          if (noise_ops[k].name == "pauli") {
+            if (noise_ops[k].string_params[0] != noises[j][k].string_params[0])
+              same = false;
+          } else if (noise_ops[k].params.size() != noises[j][k].params.size())
+            same = false;
+          else {
+            for (int_t l = 0; l < noise_ops[k].params.size(); l++) {
+              if (noise_ops[k].params[l] != noises[j][k].params[l]) {
+                same = false;
+                break;
+              }
+            }
+          }
+        } else if (noise_ops[k].type == Operations::OpType::matrix ||
+                   noise_ops[k].type == Operations::OpType::diagonal_matrix) {
+          if (noise_ops[k].mats.size() != noises[j][k].mats.size())
+            same = false;
+          else {
+            for (int_t l = 0; l < noise_ops[k].mats.size(); l++) {
+              if (noise_ops[k].mats[l].size() != noises[j][k].mats[l].size()) {
+                same = false;
+                break;
+              }
+              for (int_t m = 0; m < noise_ops[k].mats[l].size(); m++) {
+                if (noise_ops[k].mats[l][m] != noises[j][k].mats[l][m]) {
+                  same = false;
+                  break;
+                }
+              }
+              if (!same)
+                break;
+            }
+          }
+        }
+        if (!same)
+          break;
+      }
+      if (same) {
+        pos = j;
+        break;
+      }
+    }
+
+    if (pos < 0) { // if not found, add noise ops to the list
+      shot_map[i] = noises.size();
+      noises.push_back(noise_ops);
+    } else { // if found, add shot
+      shot_map[i] = pos;
+    }
+  }
+
+  creg_ = creg;
+  branch_shots(shot_map, noises.size());
+  for (int_t i = 0; i < noises.size(); i++) {
+    branches_[i]->copy_ops_after_branch(noises[i]);
+  }
+
+  return true;
+}
+
+void Branch::remove_empty_branches(void) {
+  int_t istart = 0;
+  for (int_t j = 0; j < branches_.size(); j++) {
+    if (branches_[j]->num_shots() > 0) {
+      // copy shots to the root
+      shots_ = branches_[j]->rng_shots();
+      additional_ops_ = branches_[j]->additional_ops();
+      shot_index_ = branches_[j]->shot_index();
+      creg_ = branches_[j]->creg();
+      branches_[j].reset();
+      istart = j + 1;
+      break;
+    }
+    branches_[j].reset();
+  }
+
+  std::vector<std::shared_ptr<Branch>> new_branches;
+
+  for (int_t j = istart; j < branches_.size(); j++) {
+    if (branches_[j]->num_shots() > 0)
+      new_branches.push_back(branches_[j]);
+    else
+      branches_[j].reset();
+  }
+  branches_ = new_branches;
+}
+
+//-------------------------------------------------------------------------
+} // namespace CircuitExecutor
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/simulators.hpp b/src/simulators/simulators.hpp
new file mode 100644
index 0000000000..017979e8fd
--- /dev/null
+++ b/src/simulators/simulators.hpp
@@ -0,0 +1,61 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _aer_simulators_hpp_
+#define _aer_simulators_hpp_
+
+#include "simulators/density_matrix/densitymatrix_state.hpp"
+#include "simulators/extended_stabilizer/extended_stabilizer_state.hpp"
+#include "simulators/matrix_product_state/matrix_product_state.hpp"
+#include "simulators/stabilizer/stabilizer_state.hpp"
+#include "simulators/statevector/statevector_state.hpp"
+#include "simulators/superoperator/superoperator_state.hpp"
+#include "simulators/tensor_network/tensor_net_state.hpp"
+#include "simulators/unitary/unitary_state.hpp"
+
+namespace AER {
+
+// Simulation methods
+enum class Method {
+  automatic,
+  statevector,
+  density_matrix,
+  matrix_product_state,
+  stabilizer,
+  extended_stabilizer,
+  unitary,
+  superop,
+  tensor_network
+};
+
+enum class Device { CPU, GPU, ThrustCPU };
+
+// Simulation precision
+enum class Precision { Double, Single };
+
+const std::unordered_map<Method, std::string> method_names_ = {
+    {Method::automatic, "automatic"},
+    {Method::statevector, "statevector"},
+    {Method::density_matrix, "density_matrix"},
+    {Method::matrix_product_state, "matrix_product_state"},
+    {Method::stabilizer, "stabilizer"},
+    {Method::extended_stabilizer, "extended_stabilizer"},
+    {Method::unitary, "unitary"},
+    {Method::superop, "superop"},
+    {Method::tensor_network, "tensor_network"}};
+
+//-------------------------------------------------------------------------
+} // end namespace AER
+//-------------------------------------------------------------------------
+#endif
diff --git a/src/simulators/stabilizer/clifford.hpp b/src/simulators/stabilizer/clifford.hpp
index 15c5bcc202..e54844e573 100644
--- a/src/simulators/stabilizer/clifford.hpp
+++ b/src/simulators/stabilizer/clifford.hpp
@@ -45,6 +45,9 @@ class Clifford {
   Clifford() = default;
   explicit Clifford(const uint64_t nqubit);
 
+  // initialize from existing state (copy)
+  void initialize(const Clifford &obj);
+
   //-----------------------------------------------------------------------
   // Utility functions
  //-----------------------------------------------------------------------
@@ -224,6 +227,17 @@ void Clifford::initialize(uint64_t nq) {
   stabilizer_phases_.setLength(nq);
 }
 
+void Clifford::initialize(const Clifford &obj) {
+  destabilizer_table_ = obj.destabilizer_table_;
+  stabilizer_table_ = obj.stabilizer_table_;
+  destabilizer_phases_ = obj.destabilizer_phases_;
+  stabilizer_phases_ = obj.stabilizer_phases_;
+  num_qubits_ = obj.num_qubits_;
+  omp_threads_ = obj.omp_threads_;
+  omp_threshold_ = obj.omp_threshold_;
+  json_chop_threshold_ = obj.json_chop_threshold_;
+}
+
 //------------------------------------------------------------------------------
 // Apply Clifford gates
 //------------------------------------------------------------------------------
diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp
index 4136230f8f..c8aebfef79 100644
--- a/src/simulators/state.hpp
+++ b/src/simulators/state.hpp
@@ -225,6 +225,11 @@ class Base {
   // can apply density matrix (without statevector output required)
   virtual void enable_density_matrix(bool flg) {}
 
+  void set_num_global_qubits(uint_t qubits) { num_global_qubits_ = qubits; }
+
+  void enable_cuStateVec(bool flg) { cuStateVec_enable_ = flg; }
+
   //-----------------------------------------------------------------------
   // Common instructions
   //-----------------------------------------------------------------------
@@ -250,10 +255,20 @@ class Base {
   int_t max_matrix_qubits_ = 0;
 
   std::string sim_device_name_ = "CPU";
+
+  uint_t num_global_qubits_; // used for chunk parallelization
+
+  bool cuStateVec_enable_ = false;
+
+  reg_t target_gpus_;
 };
 
 void Base::set_config(const Config &config) {
   sim_device_name_ = config.device;
+
+  if (config.target_gpus.has_value()) {
+    target_gpus_ = config.target_gpus.value();
+  }
 }
 
 std::vector<reg_t> Base::sample_measure(const reg_t &qubits, uint_t shots,
diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp
deleted file mode 100644
index 0b0c455d7e..0000000000
--- a/src/simulators/state_chunk.hpp
+++ /dev/null
@@ -1,2288 +0,0 @@
-/**
- * This code is part of Qiskit.
- *
- * (C) Copyright IBM 2018, 2019.
- *
- * This code is licensed under the Apache License, Version 2.0. You may
- * obtain a copy of this license in the LICENSE.txt file in the root directory
- * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
- *
- * Any modifications or derivative works of this code must retain this
- * copyright notice, and modified files need to carry a notice indicating
- * that they have been altered from the originals.
- */
-
-#ifndef _aer_base_state_chunk_hpp_
-#define _aer_base_state_chunk_hpp_
-
-#include "framework/creg.hpp"
-#include "framework/json.hpp"
-#include "framework/opset.hpp"
-#include "framework/results/experiment_result.hpp"
-#include "framework/types.hpp"
-
-#include "noise/noise_model.hpp"
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#ifdef AER_MPI
-#include <mpi.h>
-#endif
-
-namespace AER {
-
-namespace QuantumState {
-
-#define STATE_APPLY_TO_ALL_CHUNKS 0
-
-//=========================================================================
-// StateChunk interface base class with multiple chunks for Qiskit-Aer
-// The base state class that supports multi-chunk distribution/ multi-shot
-// parallelization
-//=========================================================================
-
-template <class state_t>
-class StateChunk : public State<state_t> {
-
-public:
-  using ignore_argument = void;
-  using BaseState = State<state_t>;
-  using DataSubType = Operations::DataSubType;
-  using OpType = Operations::OpType;
-  using OpItr = std::vector<Operations::Op>::const_iterator;
-
-  //-----------------------------------------------------------------------
-  // Constructors
-  //-----------------------------------------------------------------------
-
-  // The constructor arguments are used to initialize the OpSet
-  // for the StateChunk class for checking supported simulator Operations
-  //
-  // Standard OpTypes that can be included here are:
-  // - `OpType::gate` if gates are supported
-  // - `OpType::measure` if measure is supported
-  // - `OpType::reset` if reset is supported
-  // - `OpType::barrier` if barrier is supported
-  // - `OpType::matrix` if arbitrary unitary matrices are supported
-  // - `OpType::kraus` if general Kraus noise channels are supported
-  //
-  // For gate ops allowed gates are specified by a set of string names,
-  // for example this could include {"u1", "u2", "u3", "U", "cx", "CX"}
-  //
-
-  StateChunk(const Operations::OpSet &opset) : BaseState(opset) {
-    num_global_chunks_ = 0;
-    num_local_chunks_ = 0;
-
-    myrank_ = 0;
-    nprocs_ = 1;
-
-    distributed_procs_ = 1;
-    distributed_rank_ = 0;
-    distributed_group_ = 0;
-    distributed_proc_bits_ = 0;
-
-    chunk_omp_parallel_ = false;
-    global_chunk_indexing_ = false;
-
-#ifdef AER_MPI
-    distributed_comm_ = MPI_COMM_WORLD;
-#endif
-  }
-
-  virtual ~StateChunk();
-
-  //-----------------------------------------------------------------------
-  // Data accessors
-  //-----------------------------------------------------------------------
-
-  // Return the state qreg object
-  auto &qreg(int_t idx = 0) { return qregs_[idx]; }
-  const auto &qreg(int_t idx = 0) const { return qregs_[idx]; }
-
-  // Return the creg object
-  auto &chunk_creg(uint_t iChunk) {
-    return BaseState::creg(get_global_shot_index(iChunk));
-  }
-  const auto &chunk_creg(uint_t iChunk) const {
-    return BaseState::creg(get_global_shot_index(iChunk));
-  }
-
-  //=======================================================================
-  // Subclass Override Methods
-  //
-  // The following methods should be implemented by any StateChunk subclasses.
-  // Abstract methods are required, while some methods are optional for
-  // StateChunk classes that support measurement to be compatible with a
-  // general QasmController.
-  //=======================================================================
-
-  //-----------------------------------------------------------------------
-  // Abstract methods
-  //
-  // The implementation of these methods must be defined in all subclasses
-  //-----------------------------------------------------------------------
-
-  // Return a string name for the StateChunk type
-  virtual std::string name() const = 0;
-
-  // Initializes the StateChunk to the default state.
-  // Typically this is the n-qubit all |0> state
-  virtual void initialize_qreg(uint_t num_qubits) = 0;
-
-  // Return an estimate of the required memory for implementing the
-  // specified sequence of operations on a `num_qubit` sized StateChunk.
-  virtual size_t
-  required_memory_mb(uint_t num_qubits,
-                     const std::vector<Operations::Op> &ops) const = 0;
-
-  // memory allocation (previously called before inisitalize_qreg)
-  virtual bool allocate(uint_t num_qubits, uint_t block_bits,
-                        uint_t num_parallel_shots = 1);
-
-  // Return the expectation value of a N-qubit Pauli operator
-  // If the simulator does not support Pauli expectation value this should
-  // raise an exception.
-  double expval_pauli(const reg_t &qubits,
-                      const std::string &pauli) override final {
-    return 0.0;
-  }
-
-  virtual double expval_pauli(const int_t iChunk, const reg_t &qubits,
-                              const std::string &pauli) = 0;
-
-  //-----------------------------------------------------------------------
-  // Optional: Load config settings
-  //-----------------------------------------------------------------------
-
-  // Load any settings for the StateChunk class from a config
-  virtual void set_config(const Config &config);
-
-  //=======================================================================
-  // Standard non-virtual methods
-  //
-  // These methods should not be modified in any StateChunk subclasses
-  //=======================================================================
-
-  //-----------------------------------------------------------------------
-  // Apply circuits and ops
-  //-----------------------------------------------------------------------
-
-  // Apply a single operation
-  // The `final_op` flag indicates no more instructions will be applied
-  // to the state after this sequence, so the state can be modified at the
-  // end of the instructions.
-
-  // this is not used for StateChunk
-  void apply_op(const Operations::Op &op, ExperimentResult &result,
-                RngEngine &rng, bool final_op = false) override final {
-    apply_op(0, op, result, rng, final_op);
-  }
-
-  // so this one is used
-  virtual void apply_op(const int_t iChunk, const Operations::Op &op,
-                        ExperimentResult &result, RngEngine &rng,
-                        bool final_op = false) = 0;
-
-  // Apply a sequence of operations to the current state of the StateChunk
-  // class. It is up to the StateChunk subclass to decide how this sequence
-  // should be executed (ie in sequence, or some other execution strategy.) If
-  // this sequence contains operations not in the supported opset an exeption
-  // will be thrown. The `final_ops` flag indicates no more instructions will
-  // be applied to the state after this sequence, so the state can be modified
-  // at the end of the instructions.
-  void apply_ops(OpItr first, OpItr last, ExperimentResult &result,
-                 RngEngine &rng, bool final_ops = false) override;
-
-  // apply ops to multiple shots
-  // this function should be separately defined since apply_ops is called in
-  // quantum_error
-  template <typename InputIterator>
-  void apply_ops_multi_shots(InputIterator first, InputIterator last,
-                             const Noise::NoiseModel &noise,
-                             ExperimentResult &result, uint_t rng_seed,
-                             bool final_ops = false);
-
-  //-----------------------------------------------------------------------
-  // Initialization
-  //-----------------------------------------------------------------------
-  template <class list_t>
-  void initialize_from_vector(const int_t iChunk, const list_t &vec);
-
-  template <class list_t>
-  void initialize_from_matrix(const int_t iChunk, const list_t &mat);
-
-  //-----------------------------------------------------------------------
-  // ClassicalRegister methods
-  //-----------------------------------------------------------------------
-
-  // Initialize classical memory and register to default value (all-0)
-  virtual void initialize_creg(uint_t num_memory, uint_t num_register);
-
-  // Initialize classical memory and register to specific values
-  virtual void initialize_creg(uint_t num_memory, uint_t num_register,
-                               const std::string &memory_hex,
-                               const std::string &register_hex);
-
-  //-----------------------------------------------------------------------
-  // Common instructions
-  //-----------------------------------------------------------------------
-
-  // Apply a save expectation value instruction
-  void apply_save_expval(const int_t iChunk, const Operations::Op &op,
-                         ExperimentResult &result);
-
-  //-----------------------------------------------------------------------
-  // Config Settings
-  //-----------------------------------------------------------------------
-
-  // set number of processes to be distributed
-  virtual void set_distribution(uint_t nprocs);
-
-  // set max number of shots to execute in a batch
-  void set_max_bached_shots(uint_t shots) { max_batched_shots_ = shots; }
-
-  // Does this state support multi-chunk distribution?
-  virtual bool multi_chunk_distribution_supported(void) { return true; }
-  // Does this state support multi-shot parallelization?
- virtual bool multi_shot_parallelization_supported(void) { return true; } - - // set creg bit counts before initialize creg - void set_num_creg_bits(uint_t num_memory, uint_t num_register) override { - num_creg_memory_ = num_memory; - num_creg_registers_ = num_register; - } - -protected: - // The array of the quantum state data structure - std::vector qregs_; - - // number of qubits for the circuit - uint_t num_qubits_; - - // extra parameters for parallel simulations - uint_t num_global_chunks_; // number of total chunks - uint_t num_local_chunks_; // number of local chunks - uint_t chunk_bits_; // number of qubits per chunk - uint_t block_bits_; // number of cache blocked qubits - - uint_t global_chunk_index_; // beginning chunk index for this process - reg_t chunk_index_begin_; // beginning chunk index for each process - reg_t chunk_index_end_; // ending chunk index for each process - uint_t local_shot_index_; // local shot ID of current batch loop - - uint_t myrank_; // process ID - uint_t nprocs_; // number of processes - uint_t distributed_rank_; // process ID in communicator group - uint_t distributed_procs_; // number of processes in communicator group - uint_t distributed_group_; // group id of distribution - int_t distributed_proc_bits_; // distributed_procs_=2^distributed_proc_bits_ - // (if nprocs != power of 2, set -1) - - bool chunk_omp_parallel_; // using thread parallel to process loop of chunks - // or not - bool global_chunk_indexing_; // using global index for control qubits and - // diagonal matrix - - bool multi_chunk_distribution_ = - false; // distributing chunks to apply cache blocking parallelization - bool multi_shots_parallelization_ = - false; // using chunks as multiple shots parallelization - bool set_parallelization_called_ = - false; // this flag is used to check set_parallelization is already - // called, if yes the call sets max_batched_shots_ - uint_t max_batched_shots_ = - 1; // max number of shots can be stored on available memory - - reg_t qubit_map_; // qubit map to restore swapped qubits - - bool multi_chunk_swap_enable_ = true; // enable multi-chunk swaps - uint_t chunk_swap_buffer_qubits_ = - 15; // maximum buffer size in qubits for chunk swap - uint_t max_multi_swap_; // maximum swaps can be applied at a time, calculated - // by chunk_swap_buffer_bits_ - - // group of states (GPU devices) - uint_t num_groups_; // number of groups of chunks - reg_t top_chunk_of_group_; - reg_t num_chunks_in_group_; - int num_threads_per_group_; // number of outer threads per group - - // cuStateVec settings - bool cuStateVec_enable_ = false; - - uint_t num_creg_memory_ = - 0; // number of total bits for creg (reserve for multi-shots) - uint_t num_creg_registers_ = 0; - - //----------------------------------------------------------------------- - // Apply circuits and ops - //----------------------------------------------------------------------- - // apply ops for multi-chunk distribution - template - void apply_ops_chunks(InputIterator first, InputIterator last, - ExperimentResult &result, RngEngine &rng, - bool final_ops = false); - - // apply cache blocked ops in each chunk - template - void apply_cache_blocking_ops(const int_t iGroup, InputIterator first, - InputIterator last, ExperimentResult &result, - RngEngine &rng); - - // apply ops for multi-shots to one group - template - void apply_ops_multi_shots_for_group(int_t i_group, InputIterator first, - InputIterator last, - const Noise::NoiseModel &noise, - ExperimentResult &result, - uint_t rng_seed, bool 
final_ops); - - // apply op to multiple shots , return flase if op is not supported to execute - // in a batch - virtual bool apply_batched_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, - std::vector &rng, - bool final_op = false) { - return false; - } - - // apply sampled noise to multiple-shots (this is used for ops contains - // non-Pauli operators) - void apply_batched_noise_ops( - const int_t i_group, const std::vector> &ops, - ExperimentResult &result, std::vector &rng); - - // check conditional - bool check_conditional(const int_t iChunk, const Operations::Op &op); - - // this function is used to scale chunk qubits for multi-chunk distribution - virtual int qubit_scale(void) { - return 1; // scale of qubit number (x2 for density and unitary matrices) - } - uint_t get_process_by_chunk(uint_t cid); - - // allocate qregs - bool allocate_qregs(uint_t num_chunks); - - //----------------------------------------------------------------------- - // Functions for multi-chunk distribution - //----------------------------------------------------------------------- - // swap between chunks - virtual void apply_chunk_swap(const reg_t &qubits); - - // apply multiple swaps between chunks - virtual void apply_multi_chunk_swap(const reg_t &qubits); - - // apply X gate over chunks - virtual void apply_chunk_x(const uint_t qubit); - - // send/receive chunk in receive buffer - void send_chunk(uint_t local_chunk_index, uint_t global_chunk_index); - void recv_chunk(uint_t local_chunk_index, uint_t global_chunk_index); - - template - void send_data(data_t *pSend, uint_t size, uint_t myid, uint_t pairid); - template - void recv_data(data_t *pRecv, uint_t size, uint_t myid, uint_t pairid); - - // reduce values over processes - void reduce_sum(reg_t &sum) const; - void reduce_sum(rvector_t &sum) const; - void reduce_sum(complex_t &sum) const; - void reduce_sum(double &sum) const; - - // gather values on each process - void gather_value(rvector_t &val) const; - - // gather cregs - void gather_creg_memory(void); - - // barrier all processes - void sync_process(void) const; - - // gather distributed state into vector (if memory is enough) - template - void gather_state(std::vector> &state); - - template - void gather_state(AER::Vector> &state); - - // block diagonal matrix in chunk - void block_diagonal_matrix(const int_t iChunk, reg_t &qubits, - cvector_t &diag); - void qubits_inout(const reg_t &qubits, reg_t &qubits_in, - reg_t &qubits_out) const; - - // collect matrix over multiple chunks - auto apply_to_matrix(bool copy = false); - - // Apply the global phase - virtual void apply_global_phase() override {} - - // check if the operator should be applied to each chunk - virtual bool is_applied_to_each_chunk(const Operations::Op &op); - - // return global shot index for the chunk - inline int_t get_global_shot_index(const int_t iChunk) const { - return multi_shots_parallelization_ - ? 
(iChunk + local_shot_index_ + global_chunk_index_) - : 0; - } - - // separate inside and outside qubits for (multi) control gates - void get_inout_ctrl_qubits(const Operations::Op &op, reg_t &qubits_out, - reg_t &qubits_in); - - // remake gate operation by qubits inside chunk - Operations::Op remake_gate_in_chunk_qubits(const Operations::Op &op, - reg_t &qubits_in); - -#ifdef AER_MPI - // communicator group to simulate a circuit (for multi-experiments) - MPI_Comm distributed_comm_; -#endif - - uint_t mapped_index(const uint_t idx); -}; - -//========================================================================= -// Implementations -//========================================================================= - -template -StateChunk::~StateChunk(void) { -#ifdef AER_MPI - if (distributed_comm_ != MPI_COMM_WORLD) { - MPI_Comm_free(&distributed_comm_); - } -#endif -} - -template -void StateChunk::set_config(const Config &config) { - BaseState::set_config(config); - - num_threads_per_group_ = 1; - if (config.num_threads_per_device.has_value()) - num_threads_per_group_ = config.num_threads_per_device.value(); - - if (config.chunk_swap_buffer_qubits.has_value()) - chunk_swap_buffer_qubits_ = config.chunk_swap_buffer_qubits.value(); - -#ifdef AER_CUSTATEVEC - // cuStateVec configs - if (config.cuStateVec_enable.has_value()) - cuStateVec_enable_ = config.cuStateVec_enable.value(); -#endif -} - -template -void StateChunk::set_distribution(uint_t nprocs) { - myrank_ = 0; - nprocs_ = 1; -#ifdef AER_MPI - int t; - MPI_Comm_size(MPI_COMM_WORLD, &t); - nprocs_ = t; - MPI_Comm_rank(MPI_COMM_WORLD, &t); - myrank_ = t; -#endif - - distributed_procs_ = nprocs; - distributed_rank_ = myrank_ % nprocs; - distributed_group_ = myrank_ / nprocs; - - distributed_proc_bits_ = 0; - int proc_bits = 0; - uint_t p = distributed_procs_; - while (p > 1) { - if ((p & 1) != 0) { // procs is not power of 2 - distributed_proc_bits_ = -1; - break; - } - distributed_proc_bits_++; - p >>= 1; - } - -#ifdef AER_MPI - if (nprocs != nprocs_) { - MPI_Comm_split(MPI_COMM_WORLD, (int)distributed_group_, - (int)distributed_rank_, &distributed_comm_); - } else { - distributed_comm_ = MPI_COMM_WORLD; - } -#endif -} - -template -bool StateChunk::allocate(uint_t num_qubits, uint_t block_bits, - uint_t num_parallel_shots) { - int_t i; - num_qubits_ = num_qubits; - block_bits_ = block_bits; - - if (block_bits_ > 0) { - chunk_bits_ = block_bits_; - if (chunk_bits_ > num_qubits_) { - chunk_bits_ = num_qubits_; - } - } else { - chunk_bits_ = num_qubits_; - } - - if (chunk_bits_ < num_qubits_) { - // multi-chunk distribution with cache blocking transpiler - multi_chunk_distribution_ = true; - multi_shots_parallelization_ = false; - num_global_chunks_ = 1ull << ((num_qubits_ - chunk_bits_) * qubit_scale()); - - BaseState::cregs_.resize(1); - } else { - // multi-shots parallelization - multi_chunk_distribution_ = false; - if (num_parallel_shots > 1) - multi_shots_parallelization_ = true; - else - multi_shots_parallelization_ = false; - num_global_chunks_ = num_parallel_shots; - - // classical registers for all shots - BaseState::cregs_.resize(num_parallel_shots); - } - - chunk_index_begin_.resize(distributed_procs_); - chunk_index_end_.resize(distributed_procs_); - for (i = 0; i < distributed_procs_; i++) { - chunk_index_begin_[i] = num_global_chunks_ * i / distributed_procs_; - chunk_index_end_[i] = num_global_chunks_ * (i + 1) / distributed_procs_; - } - - num_local_chunks_ = chunk_index_end_[distributed_rank_] - - 
chunk_index_begin_[distributed_rank_]; - global_chunk_index_ = chunk_index_begin_[distributed_rank_]; - local_shot_index_ = 0; - - global_chunk_indexing_ = false; - chunk_omp_parallel_ = false; - if (BaseState::sim_device_name_ == "GPU") { -#ifdef _OPENMP - if (omp_get_num_threads() == 1) - chunk_omp_parallel_ = true; -#endif - - // set cuStateVec_enable_ - if (cuStateVec_enable_) { - if (multi_shots_parallelization_) - cuStateVec_enable_ = false; // multi-shots parallelization is not - // supported for cuStateVec - } - - if (!cuStateVec_enable_) - global_chunk_indexing_ = true; // cuStateVec does not handle global chunk - // index for diagonal matrix - } else if (BaseState::sim_device_name_ == "Thrust") { - global_chunk_indexing_ = true; - chunk_omp_parallel_ = false; - } - - if (multi_shots_parallelization_) { - allocate_qregs(std::min(num_local_chunks_, max_batched_shots_)); - } else { - allocate_qregs(num_local_chunks_); - } - - // initialize qubit map - qubit_map_.resize(num_qubits_); - for (i = 0; i < num_qubits_; i++) { - qubit_map_[i] = i; - } - - if (chunk_bits_ <= chunk_swap_buffer_qubits_ + 1) - multi_chunk_swap_enable_ = false; - else - max_multi_swap_ = chunk_bits_ - chunk_swap_buffer_qubits_; - - return true; -} - -template -bool StateChunk::allocate_qregs(uint_t num_chunks) { - int_t i; - // deallocate qregs before reallocation - if (qregs_.size() > 0) { - if (qregs_.size() == num_chunks) - return true; // can reuse allocated chunks - - qregs_.clear(); - } - - qregs_.resize(num_chunks); - - if (num_creg_memory_ != 0 || num_creg_registers_ != 0) { - for (i = 0; i < num_chunks; i++) { - // set number of creg bits before actual initialization - qregs_[i].initialize_creg(num_creg_memory_, num_creg_registers_); - } - } - - // allocate qregs - uint_t chunk_id = multi_chunk_distribution_ ? 
global_chunk_index_ : 0; - bool ret = true; - qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_); - qregs_[0].set_num_threads_per_group(num_threads_per_group_); - qregs_[0].cuStateVec_enable(cuStateVec_enable_); - ret &= - qregs_[0].chunk_setup(chunk_bits_ * qubit_scale(), - num_qubits_ * qubit_scale(), chunk_id, num_chunks); - for (i = 1; i < num_chunks; i++) { - uint_t gid = i + chunk_id; - ret &= qregs_[i].chunk_setup(qregs_[0], gid); - qregs_[i].set_num_threads_per_group(num_threads_per_group_); - } - - // initialize groups - top_chunk_of_group_.clear(); - num_groups_ = 0; - for (i = 0; i < qregs_.size(); i++) { - if (qregs_[i].top_of_group()) { - top_chunk_of_group_.push_back(i); - num_groups_++; - } - } - top_chunk_of_group_.push_back(qregs_.size()); - num_chunks_in_group_.resize(num_groups_); - for (i = 0; i < num_groups_; i++) { - num_chunks_in_group_[i] = - top_chunk_of_group_[i + 1] - top_chunk_of_group_[i]; - } - - return ret; -} - -template -uint_t StateChunk::get_process_by_chunk(uint_t cid) { - uint_t i; - for (i = 0; i < distributed_procs_; i++) { - if (cid >= chunk_index_begin_[i] && cid < chunk_index_end_[i]) { - return i; - } - } - return distributed_procs_; -} - -template -void StateChunk::apply_ops(OpItr first, OpItr last, - ExperimentResult &result, RngEngine &rng, - bool final_ops) { - if (multi_chunk_distribution_) { - apply_ops_chunks(first, last, result, rng, final_ops); - } else { - Base::apply_ops(first, last, result, rng, final_ops); - } - - qregs_[0].synchronize(); - -#ifdef AER_CUSTATEVEC - result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); -#endif -} - -template -template -void StateChunk::apply_ops_chunks(InputIterator first, - InputIterator last, - ExperimentResult &result, - RngEngine &rng, bool final_ops) { - uint_t iOp, nOp; - reg_t multi_swap; - - nOp = std::distance(first, last); - iOp = 0; - - while (iOp < nOp) { - const Operations::Op op_iOp = *(first + iOp); - - if (op_iOp.type == Operations::OpType::gate && - op_iOp.name == "swap_chunk") { - // apply swap between chunks - if (multi_chunk_swap_enable_ && op_iOp.qubits[0] < chunk_bits_ && - op_iOp.qubits[1] >= chunk_bits_) { - if (distributed_proc_bits_ < 0 || - (op_iOp.qubits[1] >= - (num_qubits_ * qubit_scale() - - distributed_proc_bits_))) { // apply multi-swap when swap is cross - // qubits - multi_swap.push_back(op_iOp.qubits[0]); - multi_swap.push_back(op_iOp.qubits[1]); - if (multi_swap.size() >= max_multi_swap_ * 2) { - apply_multi_chunk_swap(multi_swap); - multi_swap.clear(); - } - } else - apply_chunk_swap(op_iOp.qubits); - } else { - if (multi_swap.size() > 0) { - apply_multi_chunk_swap(multi_swap); - multi_swap.clear(); - } - apply_chunk_swap(op_iOp.qubits); - } - iOp++; - continue; - } else if (multi_swap.size() > 0) { - apply_multi_chunk_swap(multi_swap); - multi_swap.clear(); - } - - if (op_iOp.type == Operations::OpType::sim_op && - op_iOp.name == "begin_blocking") { - // applying sequence of gates inside each chunk - - uint_t iOpEnd = iOp; - while (iOpEnd < nOp) { - const Operations::Op op_iOpEnd = *(first + iOpEnd); - if (op_iOpEnd.type == Operations::OpType::sim_op && - op_iOpEnd.name == "end_blocking") { - break; - } - iOpEnd++; - } - - uint_t iOpBegin = iOp + 1; - if (num_groups_ > 1 && chunk_omp_parallel_) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) - apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, - rng); - } else { - for (int_t ig = 0; ig < num_groups_; ig++) - 
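The `begin_blocking`/`end_blocking` markers above bracket runs of gates that act only on in-chunk qubits, and each chunk group (as partitioned by `top_chunk_of_group_`) replays that run on its own chunks under one OpenMP thread per group. A minimal sketch of that dispatch shape, with `Group`, `Op`, and `apply_to_chunk` as hypothetical stand-ins for the Aer types:

```cpp
#include <cstdint>
#include <vector>

struct Op {};                    // gate payload elided
struct Group {
  std::vector<int64_t> chunks;   // chunk indices owned by this group
};

// Replay the same blocked op sequence on every chunk of every group;
// one OpenMP thread per group mirrors num_threads(num_groups_) above.
void apply_blocked_ops(std::vector<Group> &groups, const std::vector<Op> &ops,
                       void (*apply_to_chunk)(int64_t, const Op &)) {
#pragma omp parallel for
  for (int64_t ig = 0; ig < (int64_t)groups.size(); ig++)
    for (int64_t chunk : groups[ig].chunks)
      for (const Op &op : ops)
        apply_to_chunk(chunk, op); // gates touch only in-chunk qubits here
}
```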
apply_cache_blocking_ops(ig, first + iOpBegin, first + iOpEnd, result, - rng); - } - iOp = iOpEnd; - } else if (is_applied_to_each_chunk(op_iOp)) { - if (num_groups_ > 1 && chunk_omp_parallel_) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) - apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, - rng); - } else { - for (int_t ig = 0; ig < num_groups_; ig++) - apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result, - rng); - } - } else { - // parallelize inside state implementations - apply_op(STATE_APPLY_TO_ALL_CHUNKS, op_iOp, result, rng, - final_ops && nOp == iOp + 1); - } - iOp++; - } - - if (multi_swap.size() > 0) - apply_multi_chunk_swap(multi_swap); - - if (num_groups_ > 1 && chunk_omp_parallel_) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) - qregs_[top_chunk_of_group_[ig]].synchronize(); - } else { - for (int_t ig = 0; ig < num_groups_; ig++) - qregs_[top_chunk_of_group_[ig]].synchronize(); - } - - if (BaseState::sim_device_name_ == "GPU") { -#ifdef AER_THRUST_CUDA - int nDev; - if (cudaGetDeviceCount(&nDev) != cudaSuccess) { - cudaGetLastError(); - nDev = 0; - } - if (nDev > num_groups_) - nDev = num_groups_; - result.metadata.add(nDev, "cacheblocking", "chunk_parallel_gpus"); -#endif - -#ifdef AER_CUSTATEVEC - result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); -#endif - } - -#ifdef AER_MPI - result.metadata.add(multi_chunk_swap_enable_, "cacheblocking", - "multiple_chunk_swaps_enable"); - if (multi_chunk_swap_enable_) { - result.metadata.add(chunk_swap_buffer_qubits_, "cacheblocking", - "multiple_chunk_swaps_buffer_qubits"); - result.metadata.add(max_multi_swap_, "cacheblocking", - "max_multiple_chunk_swaps"); - } -#endif -} - -template -template -void StateChunk::apply_cache_blocking_ops(const int_t iGroup, - InputIterator first, - InputIterator last, - ExperimentResult &result, - RngEngine &rng) { - // for each chunk in group - for (int_t iChunk = top_chunk_of_group_[iGroup]; - iChunk < top_chunk_of_group_[iGroup + 1]; iChunk++) { - // fetch chunk in cache - if (qregs_[iChunk].fetch_chunk()) { - for (auto it = first; it != last; ++it) { - apply_op(iChunk, *it, result, rng, false); - } - // release chunk from cache - qregs_[iChunk].release_chunk(); - } - } -} - -template -void StateChunk::get_inout_ctrl_qubits(const Operations::Op &op, - reg_t &qubits_out, - reg_t &qubits_in) { - if (op.type == Operations::OpType::gate && - (op.name[0] == 'c' || op.name.find("mc") == 0)) { - for (int i = 0; i < op.qubits.size(); i++) { - if (op.qubits[i] < chunk_bits_) - qubits_in.push_back(op.qubits[i]); - else - qubits_out.push_back(op.qubits[i]); - } - } -} - -template -Operations::Op -StateChunk::remake_gate_in_chunk_qubits(const Operations::Op &op, - reg_t &qubits_in) { - Operations::Op new_op = op; - new_op.qubits = qubits_in; - // change gate name if there are no control qubits inside chunk - if (op.name.find("swap") != std::string::npos && qubits_in.size() == 2) { - new_op.name = "swap"; - } - if (op.name.find("ccx") != std::string::npos) { - if (qubits_in.size() == 1) - new_op.name = "x"; - else - new_op.name = "cx"; - } else if (qubits_in.size() == 1) { - if (op.name[0] == 'c') - new_op.name = op.name.substr(1); - else if (op.name == "mcphase") - new_op.name = "p"; - else - new_op.name = op.name.substr(2); // remove "mc" - } - return new_op; -} - -template -bool StateChunk::is_applied_to_each_chunk(const Operations::Op &op) { - if (op.type == 
Operations::OpType::gate || - op.type == Operations::OpType::matrix || - op.type == Operations::OpType::diagonal_matrix || - op.type == Operations::OpType::multiplexer || - op.type == Operations::OpType::superop) { - return true; - } - return false; -} - -template -bool StateChunk::check_conditional(const int_t iChunk, - const Operations::Op &op) { - if (multi_shots_parallelization_) { - // multi-shots parallelization - if (op.conditional) { - qregs_[iChunk].set_conditional(op.conditional_reg); - } - return true; - } else { - return BaseState::cregs_[0].check_conditional(op); - } -} - -template -template -void StateChunk::apply_ops_multi_shots( - InputIterator first, InputIterator last, const Noise::NoiseModel &noise, - ExperimentResult &result, uint_t rng_seed, bool final_ops) { - int_t i; - int_t i_begin, n_shots; - - i_begin = 0; - while (i_begin < num_local_chunks_) { - local_shot_index_ = i_begin; - - // loop for states can be stored in available memory - n_shots = qregs_.size(); - if (i_begin + n_shots > num_local_chunks_) { - n_shots = num_local_chunks_ - i_begin; - // resize qregs - allocate_qregs(n_shots); - } - // initialization (equivalent to initialize_qreg + initialize_creg) - auto init_group = [this](int_t ig) { - for (uint_t j = top_chunk_of_group_[ig]; j < top_chunk_of_group_[ig + 1]; - j++) { - // enabling batch shots optimization - qregs_[j].enable_batch(true); - - // initialize qreg here - qregs_[j].set_num_qubits(chunk_bits_); - qregs_[j].initialize(); - - // initialize creg here - qregs_[j].initialize_creg(this->creg(0).memory_size(), - this->creg(0).register_size()); - } - }; - Utils::apply_omp_parallel_for((num_groups_ > 1 && chunk_omp_parallel_), 0, - num_groups_, init_group); - - apply_global_phase(); // this is parallelized in StateChunk sub-classes - - // apply ops to multiple-shots - if (num_groups_ > 1 && chunk_omp_parallel_) { - std::vector par_results(num_groups_); -#pragma omp parallel for num_threads(num_groups_) - for (i = 0; i < num_groups_; i++) - apply_ops_multi_shots_for_group(i, first, last, noise, par_results[i], - rng_seed, final_ops); - - for (auto &res : par_results) - result.combine(std::move(res)); - } else { - for (i = 0; i < num_groups_; i++) - apply_ops_multi_shots_for_group(i, first, last, noise, result, rng_seed, - final_ops); - } - - // collect measured bits and copy memory - for (i = 0; i < n_shots; i++) { - qregs_[i].read_measured_data( - this->creg(global_chunk_index_ + i_begin + i)); - } - - i_begin += n_shots; - } - - gather_creg_memory(); - -#ifdef AER_THRUST_CUDA - if (BaseState::sim_device_name_ == "GPU") { - int nDev; - if (cudaGetDeviceCount(&nDev) != cudaSuccess) { - cudaGetLastError(); - nDev = 0; - } - if (nDev > num_groups_) - nDev = num_groups_; - result.metadata.add(nDev, "batched_shots_optimization_parallel_gpus"); - } -#endif -} - -template -template -void StateChunk::apply_ops_multi_shots_for_group( - int_t i_group, InputIterator first, InputIterator last, - const Noise::NoiseModel &noise, ExperimentResult &result, uint_t rng_seed, - bool final_ops) { - uint_t istate = top_chunk_of_group_[i_group]; - std::vector rng(num_chunks_in_group_[i_group]); -#ifdef _OPENMP - int num_inner_threads = omp_get_max_threads() / omp_get_num_threads(); -#else - int num_inner_threads = 1; -#endif - - for (uint_t j = top_chunk_of_group_[i_group]; - j < top_chunk_of_group_[i_group + 1]; j++) - rng[j - top_chunk_of_group_[i_group]].set_seed( - rng_seed + global_chunk_index_ + local_shot_index_ + j); - - for (auto op = first; op != last; ++op) 
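Each shot's `RngEngine` above is seeded with `rng_seed + global_chunk_index_ + local_shot_index_ + j`, i.e. the base seed plus the shot's global index, so the sampled noise is reproducible however shots are partitioned across processes and batches. A standalone sketch of the scheme, with `std::mt19937_64` standing in for `RngEngine`:

```cpp
#include <cstdint>
#include <random>
#include <vector>

// One RNG per local shot, seeded by global shot index: process boundaries
// and batch sizes do not change which random stream a given shot sees.
std::vector<std::mt19937_64> make_shot_rngs(std::uint64_t base_seed,
                                            std::uint64_t first_global_shot,
                                            std::size_t num_local_shots) {
  std::vector<std::mt19937_64> rngs;
  rngs.reserve(num_local_shots);
  for (std::size_t j = 0; j < num_local_shots; j++)
    rngs.emplace_back(base_seed + first_global_shot + j);
  return rngs;
}
```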
{ - if (op->type == Operations::OpType::qerror_loc) { - // sample error here - uint_t count = num_chunks_in_group_[i_group]; - std::vector> noise_ops(count); - - uint_t count_ops = 0; - uint_t non_pauli_gate_count = 0; - if (num_inner_threads > 1) { -#pragma omp parallel for reduction(+: count_ops,non_pauli_gate_count) num_threads(num_inner_threads) - for (int_t j = 0; j < count; j++) { - noise_ops[j] = noise.sample_noise_loc(*op, rng[j]); - - if (!(noise_ops[j].size() == 0 || - (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) { - count_ops++; - for (int_t k = 0; k < noise_ops[j].size(); k++) { - if (noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && - noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && - noise_ops[j][k].name != "pauli") { - non_pauli_gate_count++; - break; - } - } - } - } - } else { - for (int_t j = 0; j < count; j++) { - noise_ops[j] = noise.sample_noise_loc(*op, rng[j]); - - if (!(noise_ops[j].size() == 0 || - (noise_ops[j].size() == 1 && noise_ops[j][0].name == "id"))) { - count_ops++; - for (int_t k = 0; k < noise_ops[j].size(); k++) { - if (noise_ops[j][k].name != "id" && noise_ops[j][k].name != "x" && - noise_ops[j][k].name != "y" && noise_ops[j][k].name != "z" && - noise_ops[j][k].name != "pauli") { - non_pauli_gate_count++; - break; - } - } - } - } - } - - if (count_ops == 0) { - continue; // do nothing - } - if (non_pauli_gate_count == 0) { // optimization for Pauli error - qregs_[istate].apply_batched_pauli_ops(noise_ops); - } else { - // otherwise execute each circuit - apply_batched_noise_ops(i_group, noise_ops, result, rng); - } - } else { - if (!apply_batched_op(istate, *op, result, rng, - final_ops && (op + 1 == last))) { - // call apply_op for each state - for (uint_t j = top_chunk_of_group_[i_group]; - j < top_chunk_of_group_[i_group + 1]; j++) { - qregs_[j].enable_batch(false); - apply_op(j, *op, result, rng[j - top_chunk_of_group_[i_group]], - final_ops && (op + 1 == last)); - qregs_[j].enable_batch(true); - } - } - } - } -} - -template -void StateChunk::apply_batched_noise_ops( - const int_t i_group, const std::vector> &ops, - ExperimentResult &result, std::vector &rng) { - int_t i, j, k, count, nop, pos = 0; - uint_t istate = top_chunk_of_group_[i_group]; - count = ops.size(); - - reg_t mask(count); - std::vector finished(count, false); - for (i = 0; i < count; i++) { - int_t cond_reg = -1; - - if (finished[i]) - continue; - if (ops[i].size() == 0 || (ops[i].size() == 1 && ops[i][0].name == "id")) { - finished[i] = true; - continue; - } - mask[i] = 1; - - // find same ops to be executed in a batch - for (j = i + 1; j < count; j++) { - if (finished[j]) { - mask[j] = 0; - continue; - } - if (ops[j].size() == 0 || - (ops[j].size() == 1 && ops[j][0].name == "id")) { - mask[j] = 0; - finished[j] = true; - continue; - } - - if (ops[i].size() != ops[j].size()) { - mask[j] = 0; - continue; - } - - mask[j] = true; - for (k = 0; k < ops[i].size(); k++) { - if (ops[i][k].conditional) { - cond_reg = ops[i][k].conditional_reg; - } - if (ops[i][k].type != ops[j][k].type || - ops[i][k].name != ops[j][k].name) { - mask[j] = false; - break; - } - } - if (mask[j]) - finished[j] = true; - } - - // mask conditional register - int_t sys_reg = - qregs_[istate].set_batched_system_conditional(cond_reg, mask); - - // batched execution on same ops - for (k = 0; k < ops[i].size(); k++) { - Operations::Op cop = ops[i][k]; - - // mark op conditional to mask shots - cop.conditional = true; - cop.conditional_reg = sys_reg; - - if 
(!apply_batched_op(istate, cop, result, rng, false)) { - // call apply_op for each state - for (uint_t j = top_chunk_of_group_[i_group]; - j < top_chunk_of_group_[i_group + 1]; j++) { - qregs_[j].enable_batch(false); - apply_op(j, cop, result, rng[j - top_chunk_of_group_[i_group]], - false); - qregs_[j].enable_batch(true); - } - } - } - mask[i] = 0; - finished[i] = true; - } -} - -template -void StateChunk::initialize_creg(uint_t num_memory, - uint_t num_register) { - for (int_t i = 0; i < BaseState::cregs_.size(); i++) { - BaseState::cregs_[i].initialize(num_memory, num_register); - } -} - -template -void StateChunk::initialize_creg(uint_t num_memory, - uint_t num_register, - const std::string &memory_hex, - const std::string ®ister_hex) { - for (int_t i = 0; i < BaseState::cregs_.size(); i++) { - BaseState::cregs_[i].initialize(num_memory, num_register, memory_hex, - register_hex); - } -} - -template -void StateChunk::apply_save_expval(const int_t iChunk, - const Operations::Op &op, - ExperimentResult &result) { - // Check empty edge case - if (op.expval_params.empty()) { - throw std::invalid_argument( - "Invalid save expval instruction (Pauli components are empty)."); - } - bool variance = (op.type == Operations::OpType::save_expval_var); - - // Accumulate expval components - double expval(0.); - double sq_expval(0.); - - for (const auto ¶m : op.expval_params) { - // param is tuple (pauli, coeff, sq_coeff) - const auto val = expval_pauli(iChunk, op.qubits, std::get<0>(param)); - expval += std::get<1>(param) * val; - if (variance) { - sq_expval += std::get<2>(param) * val; - } - } - if (variance) { - std::vector expval_var(2); - expval_var[0] = expval; // mean - expval_var[1] = sq_expval - expval * expval; // variance - result.save_data_average(BaseState::cregs_[get_global_shot_index(iChunk)], - op.string_params[0], expval_var, op.type, - op.save_type); - } else { - result.save_data_average(BaseState::cregs_[get_global_shot_index(iChunk)], - op.string_params[0], expval, op.type, - op.save_type); - } -} - -//------------------------------------------------------------------------- -// functions for multi-chunk distribution -//------------------------------------------------------------------------- -template -void StateChunk::block_diagonal_matrix(const int_t iChunk, - reg_t &qubits, - cvector_t &diag) { - uint_t gid = global_chunk_index_ + iChunk; - uint_t i; - uint_t mask_out = 0; - uint_t mask_id = 0; - - reg_t qubits_in; - cvector_t diag_in; - - for (i = 0; i < qubits.size(); i++) { - if (qubits[i] < chunk_bits_) { // in chunk - qubits_in.push_back(qubits[i]); - } else { - mask_out |= (1ull << i); - if ((gid >> (qubits[i] - chunk_bits_)) & 1) - mask_id |= (1ull << i); - } - } - - if (qubits_in.size() < qubits.size()) { - for (i = 0; i < diag.size(); i++) { - if ((i & mask_out) == mask_id) - diag_in.push_back(diag[i]); - } - - if (qubits_in.size() == 0) { - qubits_in.push_back(0); - diag_in.resize(2); - diag_in[1] = diag_in[0]; - } - qubits = qubits_in; - diag = diag_in; - } -} - -template -void StateChunk::qubits_inout(const reg_t &qubits, reg_t &qubits_in, - reg_t &qubits_out) const { - int_t i; - qubits_in.clear(); - qubits_out.clear(); - for (i = 0; i < qubits.size(); i++) { - if (qubits[i] < chunk_bits_) { // in chunk - qubits_in.push_back(qubits[i]); - } else { - qubits_out.push_back(qubits[i]); - } - } -} - -template -template -void StateChunk::initialize_from_vector(const int_t iChunkIn, - const list_t &vec) { - int_t iChunk; - - if (multi_chunk_distribution_) { - if 
(chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for private(iChunk) - for (int_t ig = 0; ig < num_groups_; ig++) { - for (iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) { - list_t tmp(1ull << (chunk_bits_ * qubit_scale())); - for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - tmp[i] = vec[((global_chunk_index_ + iChunk) - << (chunk_bits_ * qubit_scale())) + - i]; - } - qregs_[iChunk].initialize_from_vector(tmp); - } - } - } else { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - list_t tmp(1ull << (chunk_bits_ * qubit_scale())); - for (int_t i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - tmp[i] = vec[((global_chunk_index_ + iChunk) - << (chunk_bits_ * qubit_scale())) + - i]; - } - qregs_[iChunk].initialize_from_vector(tmp); - } - } - } else { - if (iChunkIn == STATE_APPLY_TO_ALL_CHUNKS) { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - qregs_[iChunk].initialize_from_vector(vec); - } - } else - qregs_[iChunkIn].initialize_from_vector(vec); - } -} - -template -template -void StateChunk::initialize_from_matrix(const int_t iChunkIn, - const list_t &mat) { - int_t iChunk; - if (multi_chunk_distribution_) { - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for private(iChunk) - for (int_t ig = 0; ig < num_groups_; ig++) { - for (iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) { - list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_)); - uint_t irow_chunk = - ((iChunk + global_chunk_index_) >> ((num_qubits_ - chunk_bits_))) - << (chunk_bits_); - uint_t icol_chunk = ((iChunk + global_chunk_index_) & - ((1ull << ((num_qubits_ - chunk_bits_))) - 1)) - << (chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - uint_t icol = i & ((1ull << chunk_bits_) - 1); - uint_t irow = i >> chunk_bits_; - tmp[i] = - mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; - } - qregs_[iChunk].initialize_from_matrix(tmp); - } - } - } else { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - list_t tmp(1ull << (chunk_bits_), 1ull << (chunk_bits_)); - uint_t irow_chunk = - ((iChunk + global_chunk_index_) >> ((num_qubits_ - chunk_bits_))) - << (chunk_bits_); - uint_t icol_chunk = ((iChunk + global_chunk_index_) & - ((1ull << ((num_qubits_ - chunk_bits_))) - 1)) - << (chunk_bits_); - - // copy part of state for this chunk - uint_t i, row, col; - for (i = 0; i < (1ull << (chunk_bits_ * qubit_scale())); i++) { - uint_t icol = i & ((1ull << chunk_bits_) - 1); - uint_t irow = i >> chunk_bits_; - tmp[i] = - mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; - } - qregs_[iChunk].initialize_from_matrix(tmp); - } - } - } else { - if (iChunkIn == STATE_APPLY_TO_ALL_CHUNKS) { - for (iChunk = 0; iChunk < num_local_chunks_; iChunk++) { - qregs_[iChunk].initialize_from_matrix(mat); - } - } else - qregs_[iChunkIn].initialize_from_matrix(mat); - } -} - -template -auto StateChunk::apply_to_matrix(bool copy) { - // this function is used to collect states over chunks - int_t iChunk; - uint_t size = 1ull << (chunk_bits_ * qubit_scale()); - uint_t mask = (1ull << (chunk_bits_)) - 1; - uint_t num_threads = qregs_[0].get_omp_threads(); - - size_t size_required = - 2 * (sizeof(std::complex) << (num_qubits_ * 2)) + - (sizeof(std::complex) << (chunk_bits_ * 2)) * num_local_chunks_; - if ((size_required >> 20) > Utils::get_system_memory_mb()) { - throw 
std::runtime_error( - std::string("There is not enough memory to store states as matrix")); - } - - auto matrix = qregs_[0].copy_to_matrix(); - - if (distributed_rank_ == 0) { - matrix.resize(1ull << (num_qubits_), 1ull << (num_qubits_)); - - auto tmp = qregs_[0].copy_to_matrix(); - for (iChunk = 0; iChunk < num_global_chunks_; iChunk++) { - int_t i; - uint_t irow_chunk = (iChunk >> ((num_qubits_ - chunk_bits_))) - << chunk_bits_; - uint_t icol_chunk = - (iChunk & ((1ull << ((num_qubits_ - chunk_bits_))) - 1)) - << chunk_bits_; - - if (iChunk < num_local_chunks_) { - if (copy) - tmp = qregs_[iChunk].copy_to_matrix(); - else - tmp = qregs_[iChunk].move_to_matrix(); - } -#ifdef AER_MPI - else - recv_data(tmp.data(), size, 0, iChunk); -#endif -#pragma omp parallel for if (num_threads > 1) num_threads(num_threads) - for (i = 0; i < size; i++) { - uint_t irow = i >> (chunk_bits_); - uint_t icol = i & mask; - uint_t idx = ((irow + irow_chunk) << (num_qubits_)) + icol_chunk + icol; - matrix[idx] = tmp[i]; - } - } - } else { -#ifdef AER_MPI - // send matrices to process 0 - for (iChunk = 0; iChunk < num_global_chunks_; iChunk++) { - uint_t iProc = get_process_by_chunk(iChunk); - if (iProc == distributed_rank_) { - if (copy) { - auto tmp = qregs_[iChunk - global_chunk_index_].copy_to_matrix(); - send_data(tmp.data(), size, iChunk, 0); - } else { - auto tmp = qregs_[iChunk - global_chunk_index_].move_to_matrix(); - send_data(tmp.data(), size, iChunk, 0); - } - } - } -#endif - } - - return matrix; -} - -template -uint_t StateChunk::mapped_index(const uint_t idx) { - uint_t i, ret = 0; - uint_t t = idx; - - for (i = 0; i < num_qubits_; i++) { - if (t & 1) { - ret |= (1ull << qubit_map_[i]); - } - t >>= 1; - } - return ret; -} - -template -void StateChunk::apply_chunk_swap(const reg_t &qubits) { - uint_t nLarge = 1; - uint_t q0, q1; - int_t iChunk; - - q0 = qubits[qubits.size() - 2]; - q1 = qubits[qubits.size() - 1]; - - if (qubit_scale() == 1) { - std::swap(qubit_map_[q0], qubit_map_[q1]); - } - - if (q0 > q1) { - std::swap(q0, q1); - } - - if (q1 < chunk_bits_ * qubit_scale()) { - // inside chunk - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for num_threads(num_groups_) - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_mcswap(qubits); - } - } else { - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_mcswap(qubits); - } - } - } else { // swap over chunks - uint_t mask0, mask1; - - mask0 = (1ull << q0); - mask1 = (1ull << q1); - mask0 >>= (chunk_bits_ * qubit_scale()); - mask1 >>= (chunk_bits_ * qubit_scale()); - - if (distributed_procs_ == 1 || - (distributed_proc_bits_ >= 0 && - q1 < (num_qubits_ * qubit_scale() - - distributed_proc_bits_))) { // no data transfer between processes - // is needed - auto apply_chunk_swap_1qubit = [this, mask1, qubits](int_t iGroup) { - for (int_t ic = top_chunk_of_group_[iGroup]; - ic < top_chunk_of_group_[iGroup + 1]; ic++) { - uint_t baseChunk; - baseChunk = ic & (~mask1); - if (ic == baseChunk) - qregs_[ic].apply_chunk_swap(qubits, qregs_[ic | mask1], true); - } - }; - auto apply_chunk_swap_2qubits = [this, mask0, mask1, - qubits](int_t iGroup) { - for (int_t ic = top_chunk_of_group_[iGroup]; - ic < top_chunk_of_group_[iGroup + 1]; ic++) { - uint_t baseChunk; - baseChunk = ic & (~(mask0 | mask1)); - uint_t iChunk1 
= baseChunk | mask0; - uint_t iChunk2 = baseChunk | mask1; - if (ic == iChunk1) - qregs_[iChunk1].apply_chunk_swap(qubits, qregs_[iChunk2], true); - } - }; - if (q0 < chunk_bits_ * qubit_scale()) - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), - 0, num_groups_, apply_chunk_swap_1qubit); - else - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), - 0, num_groups_, apply_chunk_swap_2qubits); - } -#ifdef AER_MPI - else { - int_t iPair; - uint_t nPair; - uint_t baseChunk, iChunk1, iChunk2; - - if (q0 < chunk_bits_ * qubit_scale()) - nLarge = 1; - else - nLarge = 2; - - // chunk scheduler that supports any number of processes - uint_t nu[3]; - uint_t ub[3]; - uint_t iu[3]; - uint_t add; - uint_t iLocalChunk, iRemoteChunk, iProc; - int i; - - if (q0 < chunk_bits_ * qubit_scale()) { - nLarge = 1; - nu[0] = 1ull << (q1 - chunk_bits_ * qubit_scale()); - ub[0] = 0; - iu[0] = 0; - - nu[1] = 1ull << (num_qubits_ * qubit_scale() - q1 - 1); - ub[1] = (q1 - chunk_bits_ * qubit_scale()) + 1; - iu[1] = 0; - } else { - nLarge = 2; - nu[0] = 1ull << (q0 - chunk_bits_ * qubit_scale()); - ub[0] = 0; - iu[0] = 0; - - nu[1] = 1ull << (q1 - q0 - 1); - ub[1] = (q0 - chunk_bits_ * qubit_scale()) + 1; - iu[1] = 0; - - nu[2] = 1ull << (num_qubits_ * qubit_scale() - q1 - 1); - ub[2] = (q1 - chunk_bits_ * qubit_scale()) + 1; - iu[2] = 0; - } - nPair = 1ull << (num_qubits_ * qubit_scale() - - chunk_bits_ * qubit_scale() - nLarge); - - for (iPair = 0; iPair < nPair; iPair++) { - // calculate index of pair of chunks - baseChunk = 0; - add = 1; - for (i = nLarge; i >= 0; i--) { - baseChunk += (iu[i] << ub[i]); - // update for next - iu[i] += add; - add = 0; - if (iu[i] >= nu[i]) { - iu[i] = 0; - add = 1; - } - } - - iChunk1 = baseChunk | mask0; - iChunk2 = baseChunk | mask1; - - if (iChunk1 >= chunk_index_begin_[distributed_rank_] && - iChunk1 < chunk_index_end_[distributed_rank_]) { // chunk1 is on - // this process - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - qregs_[iChunk1 - global_chunk_index_].apply_chunk_swap( - qubits, qregs_[iChunk2 - global_chunk_index_], true); - continue; - } else { - iLocalChunk = iChunk1; - iRemoteChunk = iChunk2; - iProc = get_process_by_chunk(iChunk2); - } - } else { - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - iLocalChunk = iChunk2; - iRemoteChunk = iChunk1; - iProc = get_process_by_chunk(iChunk1); - } else { - continue; // there is no chunk for this pair on this process - } - } - - MPI_Request reqSend, reqRecv; - MPI_Status st; - uint_t sizeRecv, sizeSend; - - auto pRecv = - qregs_[iLocalChunk - global_chunk_index_].recv_buffer(sizeRecv); - MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqRecv); - - auto pSend = - qregs_[iLocalChunk - global_chunk_index_].send_buffer(sizeSend); - MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqSend); - - MPI_Wait(&reqSend, &st); - MPI_Wait(&reqRecv, &st); - - qregs_[iLocalChunk - global_chunk_index_].apply_chunk_swap( - qubits, iRemoteChunk); - } - } -#endif - } -} - -template -void StateChunk::apply_multi_chunk_swap(const reg_t &qubits) { - int_t nswap = qubits.size() / 2; - reg_t chunk_shuffle_qubits(nswap, 0); - reg_t local_swaps; - uint_t baseChunk = 0; - uint_t nchunk = 1ull << nswap; - reg_t chunk_procs(nchunk); - reg_t chunk_offset(nchunk); - - if 
(qubit_scale() == 1) { - for (int_t i = 0; i < nswap; i++) - std::swap(qubit_map_[qubits[i * 2]], qubit_map_[qubits[i * 2] + 1]); - } - - // define local swaps - for (int_t i = 0; i < nswap; i++) { - if (qubits[i * 2] >= chunk_bits_ * qubit_scale() - nswap) // no swap - // required - chunk_shuffle_qubits[qubits[i * 2] + nswap - - chunk_bits_ * qubit_scale()] = qubits[i * 2 + 1]; - } - int_t pos = 0; - for (int_t i = 0; i < nswap; i++) { - if (qubits[i * 2] < - chunk_bits_ * qubit_scale() - nswap) { // local swap required - // find empty position - while (pos < nswap) { - if (chunk_shuffle_qubits[pos] < chunk_bits_ * qubit_scale()) { - chunk_shuffle_qubits[pos] = qubits[i * 2 + 1]; - local_swaps.push_back(qubits[i * 2]); - local_swaps.push_back(chunk_bits_ * qubit_scale() - nswap + pos); - pos++; - break; - } - pos++; - } - } - } - for (int_t i = 0; i < nswap; i++) - chunk_shuffle_qubits[i] -= chunk_bits_ * qubit_scale(); - - // swap inside chunks to prepare for all-to-all shuffle - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } else { - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } - - // apply all-to-all chunk shuffle - int_t nPair; - reg_t chunk_shuffle_qubits_sorted = chunk_shuffle_qubits; - std::sort(chunk_shuffle_qubits_sorted.begin(), - chunk_shuffle_qubits_sorted.end()); - - nPair = num_global_chunks_ >> nswap; - - for (uint_t i = 0; i < nchunk; i++) { - chunk_offset[i] = 0; - for (uint_t k = 0; k < nswap; k++) { - if (((i >> k) & 1) != 0) - chunk_offset[i] += (1ull << chunk_shuffle_qubits[k]); - } - } - -#ifdef AER_MPI - std::vector reqSend(nchunk); - std::vector reqRecv(nchunk); -#endif - - for (int_t iPair = 0; iPair < nPair; iPair++) { - uint_t i1, i2, k, ii, t; - baseChunk = 0; - ii = iPair; - for (k = 0; k < nswap; k++) { - t = ii & ((1ull << chunk_shuffle_qubits_sorted[k]) - 1); - baseChunk += t; - ii = (ii - t) << 1; - } - baseChunk += ii; - - for (i1 = 0; i1 < nchunk; i1++) { - chunk_procs[i1] = get_process_by_chunk(baseChunk + chunk_offset[i1]); - } - - // all-to-all - // send data - for (uint_t iswap = 1; iswap < nchunk; iswap++) { - uint_t sizeRecv, sizeSend; - uint_t num_local_swap = 0; - for (i1 = 0; i1 < nchunk; i1++) { - i2 = i1 ^ iswap; - if (i1 >= i2) - continue; - - uint_t iProc1 = chunk_procs[i1]; - uint_t iProc2 = chunk_procs[i2]; - if (iProc1 != distributed_rank_ && iProc2 != distributed_rank_) - continue; - if (iProc1 == iProc2) { // on the same process - num_local_swap++; - continue; // swap while data is exchanged between processes - } -#ifdef AER_MPI - uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap); - uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap); - uint_t iChunk1 = baseChunk + chunk_offset[i1] - global_chunk_index_; - uint_t iChunk2 = baseChunk + chunk_offset[i2] - global_chunk_index_; - - int_t tid = (iPair << nswap) + iswap; - - if (iProc1 == distributed_rank_) { - auto pRecv = qregs_[iChunk1].recv_buffer(sizeRecv); - MPI_Irecv(pRecv + offset2, (sizeRecv >> nswap), MPI_BYTE, iProc2, tid, - distributed_comm_, &reqRecv[i2]); - - auto pSend = qregs_[iChunk1].send_buffer(sizeSend); - MPI_Isend(pSend + offset2, (sizeSend >> nswap), MPI_BYTE, iProc2, tid, - 
distributed_comm_, &reqSend[i2]); - } else { - auto pRecv = qregs_[iChunk2].recv_buffer(sizeRecv); - MPI_Irecv(pRecv + offset1, (sizeRecv >> nswap), MPI_BYTE, iProc1, tid, - distributed_comm_, &reqRecv[i1]); - - auto pSend = qregs_[iChunk2].send_buffer(sizeSend); - MPI_Isend(pSend + offset1, (sizeSend >> nswap), MPI_BYTE, iProc1, tid, - distributed_comm_, &reqSend[i1]); - } -#endif - } - - // swaps inside process - if (num_local_swap > 0) { - for (i1 = 0; i1 < nchunk; i1++) { - i2 = i1 ^ iswap; - if (i1 > i2) - continue; - - uint_t iProc1 = chunk_procs[i1]; - uint_t iProc2 = chunk_procs[i2]; - if (iProc1 != distributed_rank_ && iProc2 != distributed_rank_) - continue; - if (iProc1 == iProc2) { // on the same process - uint_t offset1 = i1 << (chunk_bits_ * qubit_scale() - nswap); - uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap); - uint_t iChunk1 = baseChunk + chunk_offset[i1] - global_chunk_index_; - uint_t iChunk2 = baseChunk + chunk_offset[i2] - global_chunk_index_; - qregs_[iChunk1].apply_chunk_swap( - qregs_[iChunk2], offset2, offset1, - (1ull << (chunk_bits_ * qubit_scale() - nswap))); - } - } - } - -#ifdef AER_MPI - // recv data - for (i1 = 0; i1 < nchunk; i1++) { - i2 = i1 ^ iswap; - - uint_t iProc1 = chunk_procs[i1]; - uint_t iProc2 = chunk_procs[i2]; - if (iProc1 != distributed_rank_) - continue; - if (iProc1 == iProc2) { // on the same process - continue; - } - uint_t iChunk1 = baseChunk + chunk_offset[i1] - global_chunk_index_; - uint_t offset2 = i2 << (chunk_bits_ * qubit_scale() - nswap); - - MPI_Status st; - MPI_Wait(&reqSend[i2], &st); - MPI_Wait(&reqRecv[i2], &st); - - // copy states from recv buffer to chunk - qregs_[iChunk1].apply_chunk_swap( - qregs_[iChunk1], offset2, offset2, - (1ull << (chunk_bits_ * qubit_scale() - nswap))); - } -#endif - } - } - - // restore qubits order - if (chunk_omp_parallel_ && num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } else { - for (int_t ig = 0; ig < num_groups_; ig++) { - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_multi_swaps(local_swaps); - } - } -} - -template -void StateChunk::apply_chunk_x(const uint_t qubit) { - int_t iChunk; - uint_t nLarge = 1; - - if (qubit < chunk_bits_ * qubit_scale()) { - auto apply_mcx = [this, qubit](int_t ig) { - reg_t qubits(1, qubit); - for (int_t iChunk = top_chunk_of_group_[ig]; - iChunk < top_chunk_of_group_[ig + 1]; iChunk++) - qregs_[iChunk].apply_mcx(qubits); - }; - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), 0, - num_groups_, apply_mcx); - } else { // exchange over chunks - int_t iPair; - uint_t nPair, mask; - uint_t baseChunk, iChunk1, iChunk2; - reg_t qubits(2); - qubits[0] = qubit; - qubits[1] = qubit; - - mask = (1ull << qubit); - mask >>= (chunk_bits_ * qubit_scale()); - - if (distributed_procs_ == 1 || - (distributed_proc_bits_ >= 0 && - qubit < (num_qubits_ * qubit_scale() - - distributed_proc_bits_))) { // no data transfer between - // processes is needed - nPair = num_local_chunks_ >> 1; - - auto apply_chunk_swap = [this, mask, qubits](int_t iGroup) { - for (int_t ic = top_chunk_of_group_[iGroup]; - ic < top_chunk_of_group_[iGroup + 1]; ic++) { - uint_t pairChunk; - pairChunk = ic ^ mask; - if (ic < pairChunk) - qregs_[ic].apply_chunk_swap(qubits, qregs_[pairChunk], true); - } - 
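When the target qubit of an X lies above the chunk boundary, the gate reduces to exchanging each chunk with the partner whose global index differs in exactly that bit (`pairChunk = ic ^ mask` above). A self-contained sketch of the pairing that swaps whole amplitude blocks in place, ignoring the buffered send/receive path used for remote pairs:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// X on global qubit q (>= chunk_qubits) pairs chunk ic with ic ^ mask;
// visiting each pair once and swapping the blocks applies the gate.
void apply_x_over_chunks(std::vector<std::vector<double>> &chunks, int q,
                         int chunk_qubits) {
  const std::uint64_t mask = 1ull << (q - chunk_qubits);
  for (std::uint64_t ic = 0; ic < chunks.size(); ic++) {
    const std::uint64_t pair = ic ^ mask;
    if (ic < pair && pair < chunks.size())
      std::swap(chunks[ic], chunks[pair]); // O(1) buffer swap per pair
  }
}
```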
}; - Utils::apply_omp_parallel_for((chunk_omp_parallel_ && num_groups_ > 1), 0, - nPair, apply_chunk_swap); - } -#ifdef AER_MPI - else { - // chunk scheduler that supports any number of processes - uint_t nu[3]; - uint_t ub[3]; - uint_t iu[3]; - uint_t add; - uint_t iLocalChunk, iRemoteChunk, iProc; - int i; - - nLarge = 1; - nu[0] = 1ull << (qubit - chunk_bits_ * qubit_scale()); - ub[0] = 0; - iu[0] = 0; - - nu[1] = 1ull << (num_qubits_ * qubit_scale() - qubit - 1); - ub[1] = (qubit - chunk_bits_ * qubit_scale()) + 1; - iu[1] = 0; - nPair = 1ull << (num_qubits_ * qubit_scale() - - chunk_bits_ * qubit_scale() - 1); - - for (iPair = 0; iPair < nPair; iPair++) { - // calculate index of pair of chunks - baseChunk = 0; - add = 1; - for (i = 1; i >= 0; i--) { - baseChunk += (iu[i] << ub[i]); - // update for next - iu[i] += add; - add = 0; - if (iu[i] >= nu[i]) { - iu[i] = 0; - add = 1; - } - } - - iChunk1 = baseChunk; - iChunk2 = baseChunk | mask; - - if (iChunk1 >= chunk_index_begin_[distributed_rank_] && - iChunk1 < chunk_index_end_[distributed_rank_]) { // chunk1 is on - // this process - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - qregs_[iChunk1 - global_chunk_index_].apply_chunk_swap( - qubits, qregs_[iChunk2 - global_chunk_index_], true); - continue; - } else { - iLocalChunk = iChunk1; - iRemoteChunk = iChunk2; - iProc = get_process_by_chunk(iChunk2); - } - } else { - if (iChunk2 >= chunk_index_begin_[distributed_rank_] && - iChunk2 < chunk_index_end_[distributed_rank_]) { // chunk2 is on - // this process - iLocalChunk = iChunk2; - iRemoteChunk = iChunk1; - iProc = get_process_by_chunk(iChunk1); - } else { - continue; // there is no chunk for this pair on this process - } - } - - MPI_Request reqSend, reqRecv; - MPI_Status st; - uint_t sizeRecv, sizeSend; - - auto pSend = - qregs_[iLocalChunk - global_chunk_index_].send_buffer(sizeSend); - MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqSend); - - auto pRecv = - qregs_[iLocalChunk - global_chunk_index_].recv_buffer(sizeRecv); - MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, iPair, distributed_comm_, - &reqRecv); - - MPI_Wait(&reqSend, &st); - MPI_Wait(&reqRecv, &st); - - qregs_[iLocalChunk - global_chunk_index_].apply_chunk_swap( - qubits, iRemoteChunk); - } - } -#endif - } -} - -template -void StateChunk::send_chunk(uint_t local_chunk_index, - uint_t global_pair_index) { -#ifdef AER_MPI - MPI_Request reqSend; - MPI_Status st; - uint_t sizeSend; - uint_t iProc; - - iProc = get_process_by_chunk(global_pair_index); - - auto pSend = qregs_[local_chunk_index].send_buffer(sizeSend); - MPI_Isend(pSend, sizeSend, MPI_BYTE, iProc, - local_chunk_index + global_chunk_index_, distributed_comm_, - &reqSend); - - MPI_Wait(&reqSend, &st); - - qregs_[local_chunk_index].release_send_buffer(); -#endif -} - -template -void StateChunk::recv_chunk(uint_t local_chunk_index, - uint_t global_pair_index) { -#ifdef AER_MPI - MPI_Request reqRecv; - MPI_Status st; - uint_t sizeRecv; - uint_t iProc; - - iProc = get_process_by_chunk(global_pair_index); - - auto pRecv = qregs_[local_chunk_index].recv_buffer(sizeRecv); - MPI_Irecv(pRecv, sizeRecv, MPI_BYTE, iProc, global_pair_index, - distributed_comm_, &reqRecv); - - MPI_Wait(&reqRecv, &st); -#endif -} - -template -template -void StateChunk::send_data(data_t *pSend, uint_t size, uint_t myid, - uint_t pairid) { -#ifdef AER_MPI - MPI_Request reqSend; - MPI_Status st; - uint_t iProc; - - iProc = 
get_process_by_chunk(pairid); - - MPI_Isend(pSend, size * sizeof(data_t), MPI_BYTE, iProc, myid, - distributed_comm_, &reqSend); - - MPI_Wait(&reqSend, &st); -#endif -} - -template -template -void StateChunk::recv_data(data_t *pRecv, uint_t size, uint_t myid, - uint_t pairid) { -#ifdef AER_MPI - MPI_Request reqRecv; - MPI_Status st; - uint_t iProc; - - iProc = get_process_by_chunk(pairid); - - MPI_Irecv(pRecv, size * sizeof(data_t), MPI_BYTE, iProc, pairid, - distributed_comm_, &reqRecv); - - MPI_Wait(&reqRecv, &st); -#endif -} - -template -void StateChunk::reduce_sum(reg_t &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t i, n = sum.size(); - reg_t tmp(n); - MPI_Allreduce(&sum[0], &tmp[0], n, MPI_UINT64_T, MPI_SUM, - distributed_comm_); - for (i = 0; i < n; i++) { - sum[i] = tmp[i]; - } - } -#endif -} - -template -void StateChunk::reduce_sum(rvector_t &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t i, n = sum.size(); - rvector_t tmp(n); - MPI_Allreduce(&sum[0], &tmp[0], n, MPI_DOUBLE_PRECISION, MPI_SUM, - distributed_comm_); - for (i = 0; i < n; i++) { - sum[i] = tmp[i]; - } - } -#endif -} - -template -void StateChunk::reduce_sum(complex_t &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - complex_t tmp; - MPI_Allreduce(&sum, &tmp, 2, MPI_DOUBLE_PRECISION, MPI_SUM, - distributed_comm_); - sum = tmp; - } -#endif -} - -template -void StateChunk::reduce_sum(double &sum) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - double tmp; - MPI_Allreduce(&sum, &tmp, 1, MPI_DOUBLE_PRECISION, MPI_SUM, - distributed_comm_); - sum = tmp; - } -#endif -} - -template -void StateChunk::gather_value(rvector_t &val) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - rvector_t tmp = val; - MPI_Alltoall(&tmp[0], 1, MPI_DOUBLE_PRECISION, &val[0], 1, - MPI_DOUBLE_PRECISION, distributed_comm_); - } -#endif -} - -template -void StateChunk::sync_process(void) const { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - MPI_Barrier(distributed_comm_); - } -#endif -} - -// gather distributed state into vector (if memory is enough) -template -template -void StateChunk::gather_state( - std::vector> &state) { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t size, local_size, global_size, offset; - int i; - std::vector recv_counts(distributed_procs_); - std::vector recv_offset(distributed_procs_); - - global_size = 0; - for (i = 0; i < distributed_procs_; i++) { - recv_offset[i] = - (int)(chunk_index_begin_[i] << (chunk_bits_ * qubit_scale())) * 2; - recv_counts[i] = (int)((chunk_index_end_[i] - chunk_index_begin_[i]) - << (chunk_bits_ * qubit_scale())); - global_size += recv_counts[i]; - recv_counts[i] *= 2; - } - if ((global_size >> 21) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to gather state")); - } - std::vector> local_state = state; - state.resize(global_size); - - if (sizeof(std::complex) == 16) { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0], - &recv_offset[0], MPI_DOUBLE_PRECISION, distributed_comm_); - } else { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0], - MPI_FLOAT, distributed_comm_); - } - } -#endif -} - -template -template -void StateChunk::gather_state( - AER::Vector> &state) { -#ifdef AER_MPI - if (distributed_procs_ > 1) { - uint_t size, local_size, global_size, offset; - int i; - - std::vector 
recv_counts(distributed_procs_); - std::vector recv_offset(distributed_procs_); - - global_size = 0; - for (i = 0; i < distributed_procs_; i++) { - recv_offset[i] = - (int)(chunk_index_begin_[i] << (chunk_bits_ * qubit_scale())) * 2; - recv_counts[i] = (int)((chunk_index_end_[i] - chunk_index_begin_[i]) - << (chunk_bits_ * qubit_scale())); - global_size += recv_counts[i]; - recv_counts[i] *= 2; - } - if ((global_size >> 21) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to gather state")); - } - AER::Vector> local_state = state; - state.resize(global_size); - - if (sizeof(std::complex) == 16) { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_DOUBLE_PRECISION, state.data(), &recv_counts[0], - &recv_offset[0], MPI_DOUBLE_PRECISION, distributed_comm_); - } else { - MPI_Allgatherv(local_state.data(), recv_counts[distributed_rank_], - MPI_FLOAT, state.data(), &recv_counts[0], &recv_offset[0], - MPI_FLOAT, distributed_comm_); - } - } -#endif -} - -template -void StateChunk::gather_creg_memory(void) { -#ifdef AER_MPI - int_t i, j; - uint_t n64, i64, ibit; - - if (distributed_procs_ == 1) - return; - if (BaseState::cregs_[0].memory_size() == 0) - return; - - // number of 64-bit integers per memory - n64 = (BaseState::cregs_[0].memory_size() + 63) >> 6; - - reg_t bin_memory(n64 * num_local_chunks_, 0); - // compress memory string to binary -#pragma omp parallel for private(i, j, i64, ibit) - for (i = 0; i < num_local_chunks_; i++) { - for (j = 0; j < BaseState::cregs_[0].memory_size(); j++) { - i64 = j >> 6; - ibit = j & 63; - if (BaseState::cregs_[global_chunk_index_ + i].creg_memory()[j] == '1') { - bin_memory[i * n64 + i64] |= (1ull << ibit); - } - } - } - - reg_t recv(n64 * num_global_chunks_); - std::vector recv_counts(distributed_procs_); - std::vector recv_offset(distributed_procs_); - - for (i = 0; i < distributed_procs_; i++) { - recv_offset[i] = num_global_chunks_ * i / distributed_procs_; - recv_counts[i] = - (num_global_chunks_ * (i + 1) / distributed_procs_) - recv_offset[i]; - } - - MPI_Allgatherv(&bin_memory[0], n64 * num_local_chunks_, MPI_UINT64_T, - &recv[0], &recv_counts[0], &recv_offset[0], MPI_UINT64_T, - distributed_comm_); - - // store gathered memory -#pragma omp parallel for private(i, j, i64, ibit) - for (i = 0; i < num_global_chunks_; i++) { - for (j = 0; j < BaseState::cregs_[0].memory_size(); j++) { - i64 = j >> 6; - ibit = j & 63; - if (((recv[i * n64 + i64] >> ibit) & 1) == 1) - BaseState::cregs_[i].creg_memory()[j] = '1'; - else - BaseState::cregs_[i].creg_memory()[j] = '0'; - } - } -#endif -} - -//------------------------------------------------------------------------- -} // namespace QuantumState -//------------------------------------------------------------------------- -} // end namespace AER -//------------------------------------------------------------------------- -#endif diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index 67c0c454ef..37067c172a 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -399,6 +399,12 @@ class Chunk { void probabilities(std::vector &probs, const reg_t &qubits) const { chunk_container_.lock()->probabilities(probs, chunk_pos_, qubits); } + // get norm of matrix multiplication + double expval_matrix(const reg_t &qubits, const cvector_t &mat, + const uint_t count) const { + return chunk_container_.lock()->expval_matrix(chunk_pos_, qubits, 
mat, + count); + } // Pauli expectation values double expval_pauli(const reg_t &qubits, const std::string &pauli, const complex_t initial_phase) const { diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index a609f135a8..b674e6217c 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -174,11 +174,11 @@ class ChunkContainer virtual thrust::complex Get(uint_t i) const = 0; virtual void StoreMatrix(const std::vector> &mat, - uint_t iChunk) = 0; + uint_t iChunk) const = 0; virtual void StoreMatrix(const std::complex *mat, uint_t iChunk, - uint_t size) = 0; + uint_t size) const = 0; virtual void StoreUintParams(const std::vector &prm, - uint_t iChunk) = 0; + uint_t iChunk) const = 0; virtual void ResizeMatrixBuffers(int bits) = 0; virtual void CopyIn(Chunk &src, uint_t iChunk) = 0; @@ -310,6 +310,11 @@ class ChunkContainer virtual void probabilities(std::vector &probs, const uint_t iChunk, const reg_t &qubits) const; + // get norm of matrix multiplication + virtual double expval_matrix(const uint_t iChunk, const reg_t &qubits, + const cvector_t &mat, + const uint_t count) const; + // Pauli expectation values virtual double expval_pauli(const uint_t iChunk, const reg_t &qubits, const std::string &pauli, @@ -1009,6 +1014,32 @@ double ChunkContainer::trace(uint_t iChunk, uint_t row, return ret; } +template +double ChunkContainer::expval_matrix(const uint_t iChunk, + const reg_t &qubits, + const cvector_t &mat, + const uint_t count) const { + double ret; + const size_t N = qubits.size(); + + if (N == 1) + ExecuteSum(&ret, NormMatrixMult2x2(mat, qubits[0]), iChunk, count); + else { + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + for (int_t i = 0; i < N; i++) { + qubits_sorted.push_back(qubits[i]); + } + + StoreMatrix(mat, iChunk); + StoreUintParams(qubits_sorted, iChunk); + + ExecuteSum(&ret, NormMatrixMultNxN(N), iChunk, count); + } + + return ret; +} + template double ChunkContainer::expval_pauli(const uint_t iChunk, const reg_t &qubits, diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index 8023b1699c..1efc57db52 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -57,6 +57,8 @@ class ChunkManager { int num_threads_per_group_; uint_t num_creg_bits_ = 0; + reg_t target_gpus_; + public: ChunkManager(); @@ -71,7 +73,7 @@ class ChunkManager { uint_t Allocate(int chunk_bits, int nqubits, uint_t nchunks, uint_t chunk_index, int matrix_bit, bool density_mat, - bool enable_cuStatevec); + reg_t &gpus, bool enable_cuStatevec); void Free(void); int num_devices(void) { return num_devices_; } @@ -160,7 +162,7 @@ template uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, uint_t nchunks, uint_t chunk_index, int matrix_bit, bool density_mat, - bool enable_cuStatevec) { + reg_t &gpus, bool enable_cuStatevec) { uint_t num_buffers; int iDev; uint_t is, ie, nc; @@ -183,6 +185,17 @@ uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, density_matrix_ = density_mat; enable_cuStatevec_ = enable_cuStatevec; + target_gpus_ = gpus; + if (target_gpus_.size() > 0) { + num_devices_ = target_gpus_.size(); + if (num_devices_ > 1) + multi_gpu = true; + } else { + target_gpus_.resize(num_devices_); + for (iDev = 0; iDev < num_devices_; iDev++) { + target_gpus_[iDev] = iDev; + } + } 
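The `target_gpus` handling above either honors an explicit device list or falls back to enumerating every visible device, and the id finally handed to `cudaSetDevice` is `target_gpus_[(iDev + idev_start) % num_devices_]`. A minimal sketch of just the selection rule, with `resolve_target_gpus` as a hypothetical helper:

```cpp
#include <cstdint>
#include <vector>

// An explicit list (e.g. {0, 2}) pins the simulation to those GPUs and
// implies multi-GPU mode when it names more than one device; an empty
// list means "use every device cudaGetDeviceCount reports".
std::vector<std::uint64_t>
resolve_target_gpus(const std::vector<std::uint64_t> &requested,
                    int visible_devices) {
  if (!requested.empty())
    return requested;
  std::vector<std::uint64_t> all(visible_devices);
  for (int i = 0; i < visible_devices; i++)
    all[i] = i;
  return all;
}
```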
chunk_index_ = chunk_index; @@ -246,7 +259,7 @@ uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, if (!multi_gpu) { size_t freeMem, totalMem; - cudaSetDevice(0); + cudaSetDevice(target_gpus_[0]); cudaMemGetInfo(&freeMem, &totalMem); if (freeMem > (((uint_t)sizeof(thrust::complex) * (nchunks + num_buffers + AER_DUMMY_BUFFERS)) @@ -295,14 +308,16 @@ uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, chunk_index_ + chunks_allocated); // set first chunk index for the container chunks_[iDev]->set_num_creg_bits(num_creg_bits_); - if (num_devices_ > 0) - chunks_allocated += chunks_[iDev]->Allocate( - (iDev + idev_start) % num_devices_, chunk_bits, nqubits, nc, - num_buffers, multi_shots_, matrix_bit, density_matrix_); - else + if (num_devices_ > 0) { + int id = target_gpus_[(iDev + idev_start) % num_devices_]; + chunks_allocated += + chunks_[iDev]->Allocate(id, chunk_bits, nqubits, nc, num_buffers, + multi_shots_, matrix_bit, density_matrix_); + } else { chunks_allocated += chunks_[iDev]->Allocate(iDev, chunk_bits, nqubits, nc, num_buffers, multi_shots_, matrix_bit, density_matrix_); + } } if (chunks_allocated < num_chunks_) { int nplaces_add = num_places_; diff --git a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp index 3fd95b94ce..9fe2fadefd 100644 --- a/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp +++ b/src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp @@ -142,7 +142,7 @@ uint_t cuStateVecChunkContainer::Allocate( throw std::runtime_error(str.str()); } - err = custatevecSetStream(custatevec_handle_, BaseContainer::stream_); + err = custatevecSetStream(custatevec_handle_, BaseContainer::stream(0)); if (err != CUSTATEVEC_STATUS_SUCCESS) { std::stringstream str; str << "cuStateVecChunkContainer::allocate::custatevecSetStream : " @@ -214,13 +214,13 @@ reg_t cuStateVecChunkContainer::sample_measure( reg_t samples(SHOTS, 0); BaseContainer::set_device(); - custatevecSetStream(custatevec_handle_, BaseContainer::stream_); + custatevecSetStream(custatevec_handle_, BaseContainer::stream(0)); custatevecStatus_t err; custatevecSamplerDescriptor_t sampler; size_t extSize; - cudaStreamSynchronize(BaseContainer::stream_); + cudaStreamSynchronize(BaseContainer::stream(0)); cudaDataType_t state_type; if (sizeof(data_t) == sizeof(double)) diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index e4237dec67..bfd75cb92b 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -31,13 +31,17 @@ namespace Chunk { template class DeviceChunkContainer : public ChunkContainer { protected: - AERDeviceVector> - data_; // device vector to chunks and buffers - AERDeviceVector> matrix_; // storage for large matrix - mutable AERDeviceVector params_; // storage for additional parameters - AERDeviceVector reduce_buffer_; // buffer for reduction - AERDeviceVector - probability_buffer_; // buffer used for measure probability + // device vector to chunks and buffers + AERDeviceVector> data_; + // storage for large matrix + mutable AERDeviceVector> matrix_; + // storage for additional parameters + mutable AERDeviceVector params_; + // buffer for reduction + AERDeviceVector reduce_buffer_; + // buffer used for measure probability + AERDeviceVector probability_buffer_; + AERDeviceVector cregs_; AERHostVector cregs_host_; int device_id_; 
// device index @@ -51,6 +55,7 @@ class DeviceChunkContainer : public ChunkContainer { bool multi_shots_; // multi-shot parallelization bool creg_host_update_; + bool creg_dev_update_; // for register blocking thrust::host_vector blocked_qubits_holder_; @@ -60,8 +65,7 @@ class DeviceChunkContainer : public ChunkContainer { reg_t num_blocked_qubits_; #ifdef AER_THRUST_CUDA - cudaStream_t stream_; // asynchronous execution - cudaStream_t stream_cache_; // asynchronous execution + std::vector stream_; // asynchronous execution #endif public: @@ -72,10 +76,7 @@ class DeviceChunkContainer : public ChunkContainer { num_matrices_ = 1; multi_shots_ = false; creg_host_update_ = true; -#ifdef AER_THRUST_CUDA - stream_ = nullptr; - stream_cache_ = nullptr; -#endif + creg_dev_update_ = false; } ~DeviceChunkContainer(); @@ -106,10 +107,11 @@ class DeviceChunkContainer : public ChunkContainer { void Deallocate(void) override; void StoreMatrix(const std::vector> &mat, - uint_t iChunk) override; + uint_t iChunk) const override; void StoreMatrix(const std::complex *mat, uint_t iChunk, - uint_t size) override; - void StoreUintParams(const std::vector &prm, uint_t iChunk) override; + uint_t size) const override; + void StoreUintParams(const std::vector &prm, + uint_t iChunk) const override; void ResizeMatrixBuffers(int bits) override; void calculate_matrix_buffer_size(int bits); @@ -123,8 +125,10 @@ class DeviceChunkContainer : public ChunkContainer { #ifdef AER_THRUST_CUDA cudaStream_t stream(uint_t iChunk) const { if (iChunk >= this->num_chunks_) - return stream_cache_; - return stream_; + return stream_[(num_matrices_ + iChunk - this->num_chunks_)]; + if (num_matrices_ == 1) + return stream_[0]; + return stream_[iChunk]; } #endif @@ -212,9 +216,9 @@ class DeviceChunkContainer : public ChunkContainer { #ifdef AER_THRUST_CUDA cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_host_.data()), thrust::raw_pointer_cast(cregs_.data()), - sizeof(uint_t) * this->num_chunks_ * n64, - cudaMemcpyDeviceToHost, stream_); - cudaStreamSynchronize(stream_); + sizeof(uint_t) * num_matrices_ * n64, + cudaMemcpyDeviceToHost, stream_[0]); + cudaStreamSynchronize(stream_[0]); #else thrust::copy_n(cregs_.begin(), this->num_chunks_ * n64, cregs_host_.begin()); @@ -224,6 +228,50 @@ class DeviceChunkContainer : public ChunkContainer { return (cregs_host_[iChunk * n64 + i64] >> ibit) & 1; } + void write_cbit(uint_t iChunk, int qubit, int val) { + uint_t n64, i64, ibit; + if (qubit >= this->num_creg_bits_) + return; + n64 = (this->num_creg_bits_ + 63) >> 6; + i64 = qubit >> 6; + ibit = qubit & 63; + if (iChunk == 0 && creg_host_update_) { + creg_host_update_ = false; +#ifdef AER_THRUST_CUDA + cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_host_.data()), + thrust::raw_pointer_cast(cregs_.data()), + sizeof(uint_t) * num_matrices_ * n64, + cudaMemcpyDeviceToHost, stream_[0]); + cudaStreamSynchronize(stream_[0]); +#else + thrust::copy_n(cregs_.begin(), this->num_chunks_ * n64, + cregs_host_.begin()); +#endif + } + + cregs_host_[iChunk * n64 + i64] = + (cregs_host_[iChunk * n64 + i64] & (~(1ull << ibit))) | + (((uint_t)val & 1) << ibit); + creg_dev_update_ = true; + } + void store_cbits(void) { + if (creg_dev_update_) { + uint_t n64; + n64 = (this->num_creg_bits_ + 63) >> 6; + creg_dev_update_ = false; + creg_host_update_ = false; +#ifdef AER_THRUST_CUDA + cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_.data()), + thrust::raw_pointer_cast(cregs_host_.data()), + sizeof(uint_t) * num_matrices_ * n64, + cudaMemcpyHostToDevice, stream_[0]); 
+#else + thrust::copy_n(cregs_host_.begin(), this->num_chunks_ * n64, + cregs_.begin()); +#endif + } + } + uint_t *creg_buffer(uint_t iChunk) const { uint_t n64; n64 = (this->num_creg_bits_ + 63) >> 6; @@ -234,10 +282,7 @@ class DeviceChunkContainer : public ChunkContainer { void synchronize(uint_t iChunk) { #ifdef AER_THRUST_CUDA set_device(); - if (iChunk >= this->num_chunks_) - cudaStreamSynchronize(stream_cache_); - else - cudaStreamSynchronize(stream_); + cudaStreamSynchronize(stream(iChunk)); #endif } @@ -276,28 +321,24 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, set_device(); #ifdef AER_THRUST_CUDA - if (!multi_shots) { - int ip, nd; - cudaGetDeviceCount(&nd); - peer_access_.resize(nd); - for (i = 0; i < nd; i++) { - ip = 1; - if (i != device_id_) { - cudaDeviceCanAccessPeer(&ip, device_id_, i); - } - if (ip) { - if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess) - cudaGetLastError(); - peer_access_[i] = true; - } else - peer_access_[i] = false; + int ip, nd; + cudaGetDeviceCount(&nd); + peer_access_.resize(nd); + for (i = 0; i < nd; i++) { + ip = 1; + if (i != device_id_) { + cudaDeviceCanAccessPeer(&ip, device_id_, i); } - } else { -#endif - peer_access_.resize(1); - peer_access_[0] = true; -#ifdef AER_THRUST_CUDA + if (ip) { + if (cudaDeviceEnablePeerAccess(i, 0) != cudaSuccess) + cudaGetLastError(); + peer_access_[i] = true; + } else + peer_access_[i] = false; } +#else + peer_access_.resize(1); + peer_access_[0] = true; #endif this->num_buffers_ = buffers; @@ -352,10 +393,7 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, } } - cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking); - cudaStreamCreateWithFlags(&stream_cache_, cudaStreamNonBlocking); #endif - ResizeMatrixBuffers(matrix_bit); this->num_chunks_ = nc; @@ -369,14 +407,29 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, nc_tmp >>= 1; } + uint_t size = num_matrices_ + this->num_buffers_; + +#ifdef AER_THRUST_CUDA + stream_.resize(size); + for (int i = 0; i < size; i++) + cudaStreamCreateWithFlags(&stream_[i], cudaStreamNonBlocking); + + if (chunk_bits < 10) { + reduce_buffer_size_ = 1; + } else { + reduce_buffer_size_ = (1ull << (chunk_bits - 10)); + } +#else + reduce_buffer_size_ = 1; +#endif + + reduce_buffer_size_ *= 2; reduce_buffer_.resize(reduce_buffer_size_ * nc); - if (multi_shots) - probability_buffer_.resize(nc * QV_PROBABILITY_BUFFER_SIZE); + probability_buffer_.resize(nc * QV_PROBABILITY_BUFFER_SIZE); creg_host_update_ = false; this->num_creg_bits_ = num_qubits; - uint_t size = num_matrices_ + this->num_buffers_; num_blocked_gates_.resize(size); num_blocked_matrix_.resize(size); num_blocked_qubits_.resize(size); @@ -401,8 +454,10 @@ void DeviceChunkContainer::allocate_creg(uint_t num_mem, this->num_cmemory_ = num_mem; uint_t n64 = (this->num_creg_bits_ + 63) >> 6; - cregs_.resize(num_matrices_ * n64); - cregs_host_.resize(num_matrices_ * n64); + if (cregs_.size() != num_matrices_ * n64) { + cregs_.resize(num_matrices_ * n64); + cregs_host_.resize(num_matrices_ * n64); + } } template @@ -431,14 +486,9 @@ void DeviceChunkContainer::Deallocate(void) { blocked_qubits_holder_.clear(); #ifdef AER_THRUST_CUDA - if (stream_) { - cudaStreamDestroy(stream_); - stream_ = nullptr; - } - if (stream_cache_) { - cudaStreamDestroy(stream_cache_); - stream_cache_ = nullptr; - } + for (int i = 0; i < stream_.size(); i++) + cudaStreamDestroy(stream_[i]); + stream_.clear(); #endif ChunkContainer::deallocate_chunks(); } @@ -489,7 +539,7 @@ void 
DeviceChunkContainer::ResizeMatrixBuffers(int bits) { template void DeviceChunkContainer::StoreMatrix( - const std::vector> &mat, uint_t iChunk) { + const std::vector> &mat, uint_t iChunk) const { set_device(); #ifdef AER_THRUST_CUDA @@ -520,7 +570,8 @@ void DeviceChunkContainer::StoreMatrix( template void DeviceChunkContainer::StoreMatrix(const std::complex *mat, - uint_t iChunk, uint_t size) { + uint_t iChunk, + uint_t size) const { set_device(); #ifdef AER_THRUST_CUDA @@ -552,7 +603,7 @@ void DeviceChunkContainer::StoreMatrix(const std::complex *mat, template void DeviceChunkContainer::StoreUintParams( - const std::vector &prm, uint_t iChunk) { + const std::vector &prm, uint_t iChunk) const { set_device(); #ifdef AER_THRUST_CUDA @@ -589,10 +640,10 @@ void DeviceChunkContainer::CopyIn(Chunk &src, uint_t iChunk) { if (peer_access(src.device())) { cudaMemcpyAsync(chunk_pointer(iChunk), src.pointer(), size * sizeof(thrust::complex), - cudaMemcpyDeviceToDevice, stream_); + cudaMemcpyDeviceToDevice, stream(iChunk)); } else { cudaMemcpyPeerAsync(chunk_pointer(iChunk), device_id_, src.pointer(), - src.device(), size, stream_); + src.device(), size, stream(iChunk)); } } else { cudaMemcpyAsync(chunk_pointer(iChunk), src.pointer(), @@ -621,10 +672,10 @@ void DeviceChunkContainer::CopyOut(Chunk &dest, uint_t iChunk) { if (peer_access(dest.device())) { cudaMemcpyAsync(dest.pointer(), chunk_pointer(iChunk), size * sizeof(thrust::complex), - cudaMemcpyDeviceToDevice, stream_); + cudaMemcpyDeviceToDevice, stream(iChunk)); } else { cudaMemcpyPeerAsync(dest.pointer(), dest.device(), chunk_pointer(iChunk), - device_id_, size, stream_); + device_id_, size, stream(iChunk)); } } else { cudaMemcpyAsync(dest.pointer(), chunk_pointer(iChunk), @@ -650,8 +701,12 @@ template void DeviceChunkContainer::CopyIn(thrust::complex *src, uint_t iChunk, uint_t size) { uint_t this_size = 1ull << this->chunk_bits_; - if (this_size < size) - throw std::runtime_error("CopyIn chunk size is less than provided size"); + if (this_size < size) { + std::stringstream str; + str << "DeviceChunkContainer::CopyIn chunk size " << this_size + << " is less than " << size; + throw std::runtime_error(str.str()); + } synchronize(iChunk); thrust::copy_n(src, size, data_.begin() + (iChunk << this->chunk_bits_)); @@ -661,9 +716,12 @@ template void DeviceChunkContainer::CopyOut(thrust::complex *dest, uint_t iChunk, uint_t size) { uint_t this_size = 1ull << this->chunk_bits_; - if (this_size < size) - throw std::runtime_error("CopyOut chunk size is less than provided size"); - + if (this_size < size) { + std::stringstream str; + str << "DeviceChunkContainer::CopyOut chunk size " << this_size + << " is less than " << size; + throw std::runtime_error(str.str()); + } synchronize(iChunk); thrust::copy_n(data_.begin() + (iChunk << this->chunk_bits_), size, dest); } @@ -689,26 +747,26 @@ void DeviceChunkContainer::Swap(Chunk &src, uint_t iChunk, thrust::complex *pSrc = src.pointer(); cudaMemcpyPeerAsync(pBuffer + dest_offset, device_id_, pSrc + src_offset, src.device(), size * sizeof(thrust::complex), - stream_); + stream(iChunk)); this->Execute(BufferSwap_func(chunk_pointer(iChunk) + dest_offset, pBuffer + dest_offset, size, true), iChunk, 0, 1); - cudaMemcpyPeerAsync(pSrc + src_offset, src.device(), - pBuffer + dest_offset, device_id_, - size * sizeof(thrust::complex), stream_); + cudaMemcpyPeerAsync( + pSrc + src_offset, src.device(), pBuffer + dest_offset, device_id_, + size * sizeof(thrust::complex), stream(iChunk)); } } else { thrust::complex 
*pBuffer = buffer_pointer();
thrust::complex *pSrc = src.pointer();
cudaMemcpyAsync(pBuffer + dest_offset, pSrc + src_offset,
size * sizeof(thrust::complex),
- cudaMemcpyHostToDevice, stream_cache_);
+ cudaMemcpyHostToDevice, stream(this->num_chunks_));
this->Execute(BufferSwap_func(chunk_pointer(iChunk) + dest_offset,
pBuffer + dest_offset, size, true),
iChunk, 0, 1);
cudaMemcpyAsync(pSrc + src_offset, pBuffer + dest_offset,
size * sizeof(thrust::complex),
- cudaMemcpyDeviceToHost, stream_cache_);
+ cudaMemcpyDeviceToHost, stream(this->num_chunks_));
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
@@ -728,7 +786,7 @@ template
void DeviceChunkContainer::Zero(uint_t iChunk, uint_t count) {
set_device();
#ifdef AER_THRUST_CUDA
- thrust::fill_n(thrust::cuda::par.on(stream_),
+ thrust::fill_n(thrust::cuda::par.on(stream(iChunk)),
data_.begin() + (iChunk << this->chunk_bits_), count, 0.0);
#else
if (this->omp_threads_ > 1)
@@ -755,24 +813,31 @@ reg_t DeviceChunkContainer::sample_measure(
#ifdef AER_THRUST_CUDA
if (dot)
- thrust::transform_inclusive_scan(
- thrust::cuda::par.on(stream_), iter.begin(), iter.end(), iter.begin(),
- complex_dot_scan(), thrust::plus>());
+ thrust::transform_inclusive_scan(thrust::cuda::par.on(stream(iChunk)),
+ iter.begin(), iter.end(), iter.begin(),
+ complex_dot_scan(),
+ thrust::plus>());
else
- thrust::inclusive_scan(thrust::cuda::par.on(stream_), iter.begin(),
+ thrust::inclusive_scan(thrust::cuda::par.on(stream(iChunk)), iter.begin(),
iter.end(), iter.begin(),
thrust::plus>());
+ uint_t i, nshots, size;
uint_t iBuf = 0;
- if (multi_shots_)
+ if (multi_shots_) {
iBuf = iChunk;
+ size = matrix_buffer_size_ * 2;
+ if (size > params_buffer_size_)
+ size = params_buffer_size_;
+ } else {
+ size = matrix_.size() * 2;
+ if (size > params_.size())
+ size = params_.size();
+ }
double *pRnd = (double *)matrix_pointer(iBuf);
uint_t *pSmp = param_pointer(iBuf);
thrust::device_ptr rnd_dev_ptr = thrust::device_pointer_cast(pRnd);
- uint_t i, nshots, size = matrix_.size() * 2;
- if (size > params_.size())
- size = params_.size();
for (i = 0; i < SHOTS; i += size) {
nshots = size;
@@ -780,17 +845,17 @@ reg_t DeviceChunkContainer::sample_measure(
nshots = SHOTS - i;
cudaMemcpyAsync(pRnd, &rnds[i], nshots * sizeof(double),
- cudaMemcpyHostToDevice, stream_);
+ cudaMemcpyHostToDevice, stream(iChunk));
- thrust::lower_bound(thrust::cuda::par.on(stream_), iter.begin(), iter.end(),
- rnd_dev_ptr, rnd_dev_ptr + nshots,
+ thrust::lower_bound(thrust::cuda::par.on(stream(iChunk)), iter.begin(),
+ iter.end(), rnd_dev_ptr, rnd_dev_ptr + nshots,
params_.begin() + (iBuf * params_buffer_size_),
complex_less());
cudaMemcpyAsync(&samples[i], pSmp, nshots * sizeof(uint_t),
- cudaMemcpyDeviceToHost, stream_);
+ cudaMemcpyDeviceToHost, stream(iChunk));
}
- cudaStreamSynchronize(stream_);
+ cudaStreamSynchronize(stream(iChunk));
#else
if (this->omp_threads_ > 1) {
if (dot)
@@ -854,7 +919,7 @@ void DeviceChunkContainer::set_blocked_qubits(uint_t iChunk,
set_device();
cudaMemcpyAsync(param_pointer(iChunk), (uint_t *)&qubits_sorted[0],
qubits.size() * sizeof(uint_t), cudaMemcpyHostToDevice,
- stream_);
+ stream(iChunk));
#endif
num_blocked_gates_[iBlock] = 0;
@@ -944,7 +1009,7 @@ void DeviceChunkContainer::queue_blocked_gate(
num_blocked_qubits_[iBlock]) +
num_blocked_gates_[iBlock],
&params, sizeof(BlockedGateParams), cudaMemcpyHostToDevice,
- stream_);
+ stream(iChunk));
if (pMat != NULL) {
if (gate == 'd') { // diagonal matrix
@@ -953,14 +1018,14 @@ void
DeviceChunkContainer::queue_blocked_gate( cudaMemcpyAsync(matrix_pointer(iChunk) + num_blocked_matrix_[iBlock], (thrust::complex *)&mat[0], 2 * sizeof(thrust::complex), - cudaMemcpyHostToDevice, stream_); + cudaMemcpyHostToDevice, stream(iChunk)); num_blocked_matrix_[iBlock] += 2; } else if (gate == 'p') { // phase mat[0] = pMat[0]; cudaMemcpyAsync(matrix_pointer(iChunk) + num_blocked_matrix_[iBlock], (thrust::complex *)&mat[0], 1 * sizeof(thrust::complex), - cudaMemcpyHostToDevice, stream_); + cudaMemcpyHostToDevice, stream(iChunk)); num_blocked_matrix_[iBlock] += 1; } else { // otherwise, 2x2 matrix mat[0] = pMat[0]; @@ -970,7 +1035,7 @@ void DeviceChunkContainer::queue_blocked_gate( cudaMemcpyAsync(matrix_pointer(iChunk) + num_blocked_matrix_[iBlock], (thrust::complex *)&mat[0], 4 * sizeof(thrust::complex), - cudaMemcpyHostToDevice, stream_); + cudaMemcpyHostToDevice, stream(iChunk)); num_blocked_matrix_[iBlock] += 4; } } @@ -1265,13 +1330,14 @@ void DeviceChunkContainer::apply_blocked_gates(uint_t iChunk) { dev_apply_register_blocked_gates <<), - stream_>>>(chunk_pointer(iChunk), num_blocked_gates_[iBlock], - num_blocked_qubits_[iBlock], num_blocked_matrix_[iBlock], - pQubits, pParams, pMatrix); + stream(iChunk)>>>(chunk_pointer(iChunk), num_blocked_gates_[iBlock], + num_blocked_qubits_[iBlock], + num_blocked_matrix_[iBlock], pQubits, pParams, + pMatrix); } else { // using shared memory blocking (<=10 qubits) dev_apply_shared_memory_blocked_gates - <<), stream_>>>( + <<), stream(iChunk)>>>( chunk_pointer(iChunk), num_blocked_gates_[iBlock], num_blocked_qubits_[iBlock], pQubits, pParams, pMatrix); } @@ -1288,7 +1354,8 @@ void DeviceChunkContainer::copy_to_probability_buffer( #ifdef AER_THRUST_CUDA set_device(); cudaMemcpyAsync(probability_buffer(0) + pos * this->num_chunks_, &buf[0], - buf.size() * sizeof(double), cudaMemcpyHostToDevice, stream_); + buf.size() * sizeof(double), cudaMemcpyHostToDevice, + stream_[0]); #else thrust::copy_n(buf.begin(), buf.size(), probability_buffer_.begin()); #endif diff --git a/src/simulators/statevector/chunk/host_chunk_container.hpp b/src/simulators/statevector/chunk/host_chunk_container.hpp index 53d3de7a1f..9e95316fd2 100644 --- a/src/simulators/statevector/chunk/host_chunk_container.hpp +++ b/src/simulators/statevector/chunk/host_chunk_container.hpp @@ -29,8 +29,8 @@ class HostChunkContainer : public ChunkContainer { protected: AERHostVector> data_; // host vector for chunks + buffers - std::vector *> matrix_; // pointer to matrix - std::vector params_; // pointer to additional parameters + mutable std::vector *> matrix_; // pointer to matrix + mutable std::vector params_; // pointer to additional parameters public: HostChunkContainer() {} ~HostChunkContainer(); @@ -47,15 +47,16 @@ class HostChunkContainer : public ChunkContainer { void Deallocate(void) override; void StoreMatrix(const std::vector> &mat, - uint_t iChunk) override { + uint_t iChunk) const override { matrix_[iChunk] = (thrust::complex *)&mat[0]; } void StoreMatrix(const std::complex *mat, uint_t iChunk, - uint_t size) override { + uint_t size) const override { matrix_[iChunk] = (thrust::complex *)mat; } - void StoreUintParams(const std::vector &prm, uint_t iChunk) override { + void StoreUintParams(const std::vector &prm, + uint_t iChunk) const override { params_[iChunk] = (uint_t *)&prm[0]; } void ResizeMatrixBuffers(int bits) {} diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp index c7f9f11610..f8bec5f665 100644 
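The `sample_measure` path above is the GPU sampling technique: `transform_inclusive_scan` turns the squared amplitudes into a cumulative distribution in place, and `thrust::lower_bound` then binary-searches one uniform random number per shot into it, batched through the matrix/params buffers. A CPU analogue using only the C++17 standard library (illustrative sketch, not part of the patch; the device version compares the scanned complex values via `complex_less`):

```cpp
#include <algorithm>
#include <complex>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

int main() {
  // 2-qubit |++> state: all four outcomes equally likely
  std::vector<std::complex<double>> state(4, {0.5, 0.0});

  // inclusive scan of |amplitude|^2 builds the CDF in place
  std::vector<double> cdf(state.size());
  std::transform(state.begin(), state.end(), cdf.begin(),
                 [](const std::complex<double> &a) { return std::norm(a); });
  std::inclusive_scan(cdf.begin(), cdf.end(), cdf.begin());

  // one uniform random number per shot, mapped to an outcome by binary search
  std::mt19937_64 rng(42);
  std::uniform_real_distribution<double> dist(0.0, cdf.back());
  for (int shot = 0; shot < 5; ++shot) {
    auto it = std::lower_bound(cdf.begin(), cdf.end(), dist(rng));
    std::cout << "shot " << shot << " -> outcome " << (it - cdf.begin())
              << "\n";
  }
}
```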
--- a/src/simulators/statevector/chunk/thrust_kernels.hpp +++ b/src/simulators/statevector/chunk/thrust_kernels.hpp @@ -407,12 +407,15 @@ template class initialize_component_func : public GateFuncBase { protected: int nqubits; - uint_t matSize; + uint_t offset; + uint_t mat_pos; + uint_t mat_num; public: - initialize_component_func(const cvector_t &mat, const reg_t &qb) { - nqubits = qb.size(); - matSize = 1ull << nqubits; + initialize_component_func(const int nq, const uint_t pos, const uint_t num) { + nqubits = nq; + mat_pos = pos; + mat_num = num; } int qubits_count(void) { return nqubits; } @@ -445,13 +448,17 @@ class initialize_component_func : public GateFuncBase { idx += ii; q0 = vec[idx]; - for (k = 0; k < matSize; k++) { + for (k = mat_pos; k < mat_pos + mat_num; k++) { ii = idx; for (j = 0; j < nqubits; j++) { if (((k >> j) & 1) != 0) ii += (1ull << qubits[j]); } - q = q0 * state[k]; + if (ii == idx) { + if (mat_pos > 0) + continue; + } + q = q0 * state[k - mat_pos]; vec[ii] = q; } } @@ -459,44 +466,6 @@ class initialize_component_func : public GateFuncBase { const char *name(void) { return "initialize_component"; } }; -template -class initialize_large_component_func : public GateFuncBase { -protected: - int num_qubits_; - uint_t mask_; - uint_t cmask_; - thrust::complex init_; - -public: - initialize_large_component_func(thrust::complex m, - const reg_t &qubits, int i) { - num_qubits_ = qubits.size(); - init_ = m; - - mask_ = 0; - cmask_ = 0; - for (int k = 0; k < num_qubits_; k++) { - mask_ |= (1ull << qubits[k]); - - if (((i >> k) & 1) != 0) { - cmask_ |= (1ull << qubits[k]); - } - } - } - bool is_diagonal(void) { return true; } - - __host__ __device__ void operator()(const uint_t &i) const { - thrust::complex *vec; - thrust::complex q; - vec = this->data_; - if ((i & mask_) == cmask_) { - q = vec[i]; - vec[i] = init_ * q; - } - } - const char *name(void) { return "initialize_large_component"; } -}; - //------------------------------------------------------------------------------ // Zero clear //------------------------------------------------------------------------------ @@ -1371,7 +1340,7 @@ class DiagonalMult2x2Controlled : public GateFuncBase { m0 = mat[0]; m1 = mat[1]; - mask = (1ull << qubits[nqubits - 1]) - 1; + mask = (1ull << qubits[nqubits - 1]); cmask = 0; for (i = 0; i < nqubits - 1; i++) { cmask |= (1ull << qubits[i]); diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp old mode 100644 new mode 100755 index 4ae87ed670..3cc84d8a79 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -99,7 +99,7 @@ class QubitVector { static std::string name() { return "statevector"; } // Set the size of the vector in terms of qubit number - void set_num_qubits(size_t num_qubits); + virtual void set_num_qubits(size_t num_qubits); // Returns the number of qubits for the current vector virtual uint_t num_qubits() const { return num_qubits_; } @@ -147,6 +147,7 @@ class QubitVector { bool chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index, uint_t num_local_chunks); bool chunk_setup(QubitVector &base, const uint_t chunk_index); + uint_t chunk_index(void) { return chunk_index_; } // cache control for chunks on host bool fetch_chunk(void) const { return true; } @@ -159,6 +160,7 @@ class QubitVector { // prepare buffer for MPI send/recv std::complex *send_buffer(uint_t &size_in_byte); std::complex *recv_buffer(uint_t &size_in_byte); + void release_send_buffer(void) const; void 
release_recv_buffer(void) const; @@ -186,6 +188,9 @@ class QubitVector { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const QubitVector &obj) { copy_qv(obj); } + // Initializes the vector to a custom initial state. // If the length of the data vector does not match the number of qubits // an exception is raised. @@ -429,6 +434,8 @@ class QubitVector { // cuStateVec void cuStateVec_enable(bool flg) {} + void set_target_gpus(reg_t &t) {} + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -441,6 +448,8 @@ class QubitVector { virtual bool enable_batch(bool flg) const { return false; } + bool support_global_indexing(void) { return false; } + protected: //----------------------------------------------------------------------- // Protected data members @@ -623,6 +632,9 @@ class QubitVector { // Allocates memory for the checkoiunt void allocate_checkpoint(size_t data_size); + + // copy state from other QubitVector + void copy_qv(const QubitVector &obj); }; /******************************************************************************* @@ -741,6 +753,22 @@ QubitVector::~QubitVector() { free_checkpoint(); } +template +void QubitVector::copy_qv(const QubitVector &obj) { + data_ = nullptr; + checkpoint_ = nullptr; + set_num_qubits(obj.num_qubits()); + set_transformer_method(); + + initialize_from_data(obj.data_, obj.data_size_); + + chunk_index_ = obj.chunk_index_; + omp_threads_ = obj.omp_threads_; + omp_threshold_ = obj.omp_threshold_; + sample_measure_index_size_ = obj.sample_measure_index_size_; + json_chop_threshold_ = obj.json_chop_threshold_; +} + template QubitVector &QubitVector::operator=(QubitVector &&obj) { num_qubits_ = obj.num_qubits_; @@ -753,6 +781,7 @@ QubitVector &QubitVector::operator=(QubitVector &&obj) { omp_threshold_ = obj.omp_threshold_; sample_measure_index_size_ = obj.sample_measure_index_size_; json_chop_threshold_ = obj.json_chop_threshold_; + obj.data_ = nullptr; obj.checkpoint_ = nullptr; return *this; @@ -1298,7 +1327,6 @@ void QubitVector::apply_multiplexer(const reg_t &control_qubits, template void QubitVector::apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag) { - transformer_->apply_diagonal_matrix(data_, data_size_, omp_threads_managed(), qubits, diag); } diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 06aef07eec..2be6721de7 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -144,7 +144,9 @@ class QubitVectorThrust { // chunk setup bool chunk_setup(int chunk_bits, int num_qubits, uint_t chunk_index, uint_t num_local_chunks); - bool chunk_setup(QubitVectorThrust &base, const uint_t chunk_index); + bool chunk_setup(const QubitVectorThrust &base, + const uint_t chunk_index); + uint_t chunk_index(void) { return chunk_index_; } // cache control for chunks on host bool fetch_chunk(void) const; @@ -185,6 +187,9 @@ class QubitVectorThrust { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const QubitVectorThrust &obj) { copy_qv(obj); } + // Initializes the vector to a custom initial state. 
// If the length of the data vector does not match the number of qubits // an exception is raised. @@ -233,7 +238,8 @@ class QubitVectorThrust { // Apply a N-qubit diagonal matrix to the state vector. // The matrix is input as vector of the matrix diagonal. - void apply_diagonal_matrix(const reg_t &qubits, const cvector_t &mat); + virtual void apply_diagonal_matrix(const reg_t &qubits, + const cvector_t &mat); // Swap pairs of indicies in the underlying vector void @@ -321,7 +327,7 @@ class QubitVectorThrust { //----------------------------------------------------------------------- virtual bool batched_optimization_supported(void) { #ifdef AER_THRUST_CUDA - if (multi_shots_ && enable_batch_) + if (enable_batch_) return true; else return false; @@ -379,28 +385,12 @@ class QubitVectorThrust { // expectation value of A^\dagger A, and could probably be removed because // of this - // Return the norm for of the vector obtained after apply the 1-qubit - // matrix mat to the vector. - // The matrix is input as vector of the column-major vectorized 1-qubit - // matrix. - double norm(const uint_t qubit, const cvector_t &mat) const; - // Return the norm for of the vector obtained after apply the N-qubit // matrix mat to the vector. // The matrix is input as vector of the column-major vectorized N-qubit // matrix. double norm(const reg_t &qubits, const cvector_t &mat) const; - // Return the norm for of the vector obtained after apply the 1-qubit - // diagonal matrix mat to the vector. - // The matrix is input as vector of the matrix diagonal. - double norm_diagonal(const uint_t qubit, const cvector_t &mat) const; - - // Return the norm for of the vector obtained after apply the N-qubit - // diagonal matrix mat to the vector. - // The matrix is input as vector of the matrix diagonal. 
- double norm_diagonal(const reg_t &qubits, const cvector_t &mat) const; - //----------------------------------------------------------------------- // Expectation Value //----------------------------------------------------------------------- @@ -452,6 +442,9 @@ class QubitVectorThrust { // cuStateVec void cuStateVec_enable(bool flg) { cuStateVec_enable_ = flg; } + bool support_global_indexing(void) { return (!cuStateVec_enable_); } + + void set_target_gpus(reg_t &t) { target_gpus_ = t; } //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -479,9 +472,9 @@ class QubitVectorThrust { uint_t chunk_index_; bool multi_chunk_distribution_; - bool multi_shots_; mutable bool enable_batch_; bool cuStateVec_enable_ = false; + reg_t target_gpus_; bool register_blocking_; @@ -531,7 +524,10 @@ class QubitVectorThrust { bool async = false) const; // get number of chunk to be applied - uint_t get_chunk_count(void); + uint_t get_chunk_count(void) const; + + // copy from other qv + void copy_qv(const QubitVectorThrust &obj); #ifdef AER_DEBUG // for debugging @@ -649,7 +645,6 @@ QubitVectorThrust::QubitVectorThrust(size_t num_qubits) : num_qubits_(0) { chunk_index_ = 0; multi_chunk_distribution_ = false; - multi_shots_ = false; enable_batch_ = false; max_matrix_bits_ = 0; @@ -679,6 +674,25 @@ QubitVectorThrust::~QubitVectorThrust() { checkpoint_.clear(); } +template +void QubitVectorThrust::copy_qv(const QubitVectorThrust &obj) { + omp_threads_ = obj.omp_threads_; + omp_threshold_ = obj.omp_threshold_; + sample_measure_index_size_ = obj.sample_measure_index_size_; + json_chop_threshold_ = obj.json_chop_threshold_; + chunk_index_ = obj.chunk_index_; + num_threads_per_group_ = obj.num_threads_per_group_; + max_matrix_bits_ = obj.max_matrix_bits_; + + if (!chunk_setup(obj, obj.chunk_index_)) { + throw std::runtime_error( + "QubitVectorThrust: can not allocate chunk for copy"); + } + set_num_qubits(obj.num_qubits()); + + chunk_.set_device(); + chunk_.CopyIn(obj.chunk_); +} //------------------------------------------------------------------------------ // Element access operators //------------------------------------------------------------------------------ @@ -787,7 +801,7 @@ void QubitVectorThrust::initialize_component( if (qubits.size() == 1) { apply_function(Chunk::initialize_component_1qubit_func( qubits[0], state0[0], state0[1])); - } else if (qubits.size() <= chunk_.container()->matrix_bits()) { + } else { auto qubits_sorted = qubits; std::sort(qubits_sorted.begin(), qubits_sorted.end()); @@ -796,19 +810,19 @@ void QubitVectorThrust::initialize_component( for (i = 0; i < qubits.size(); i++) qubits_param.push_back(qubits_sorted[i]); - // chunk_.StoreMatrix(state0); - // chunk_.StoreUintParams(qubits_param); - - apply_function( - Chunk::initialize_component_func(state0, qubits_sorted), state0, - qubits_param); - } else { - // if initial state is larger that matrix buffer, set one by one. 
- uint_t DIM = 1ull << qubits.size(); - uint_t i; - for (i = 0; i < DIM; i++) { - apply_function( - Chunk::initialize_large_component_func(state0[i], qubits, i)); + int nbit = chunk_.container()->matrix_bits(); + if (nbit > qubits.size()) + nbit = qubits.size(); + + uint_t dim = 1ull << qubits.size(); + uint_t sub_dim = 1ull << nbit; + for (uint_t i = 0; i < dim; i += sub_dim) { + cvector_t state(sub_dim); + for (uint_t j = 0; j < sub_dim; j++) + state[j] = state0[dim - sub_dim - i + j]; + apply_function(Chunk::initialize_component_func( + qubits.size(), dim - sub_dim - i, sub_dim), + state, qubits_param); } } } @@ -858,7 +872,8 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits, int num_qubits, chunk_manager_->set_num_creg_bits(num_creg_bits_ + num_cmem_bits_); chunk_manager_->Allocate(chunk_bits, num_qubits, num_local_chunks, chunk_index_, max_matrix_bits_, - is_density_matrix(), cuStateVec_enable_); + is_density_matrix(), target_gpus_, + cuStateVec_enable_); } multi_chunk_distribution_ = false; @@ -866,8 +881,10 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits, int num_qubits, multi_chunk_distribution_ = true; } - chunk_.unmap(); - buffer_chunk_.unmap(); + if (chunk_.is_mapped()) + chunk_manager_->UnmapChunk(chunk_); + if (buffer_chunk_.is_mapped()) + chunk_manager_->UnmapBufferChunk(buffer_chunk_); send_chunk_.unmap(); recv_chunk_.unmap(); @@ -879,30 +896,28 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits, int num_qubits, } template -bool QubitVectorThrust::chunk_setup(QubitVectorThrust &base, - const uint_t chunk_index) { - chunk_manager_ = base.chunk_manager_; - +bool QubitVectorThrust::chunk_setup( + const QubitVectorThrust &base, const uint_t chunk_index) { multi_chunk_distribution_ = base.multi_chunk_distribution_; - if (!multi_chunk_distribution_) { - if (chunk_manager_->chunk_bits() == chunk_manager_->num_qubits()) { - multi_shots_ = true; - base.multi_shots_ = true; - } - } cuStateVec_enable_ = base.cuStateVec_enable_; + target_gpus_ = base.target_gpus_; // set global chunk ID / shot ID chunk_index_ = chunk_index; + chunk_.set_chunk_index(chunk_index_); - chunk_.unmap(); - buffer_chunk_.unmap(); + if (buffer_chunk_.is_mapped()) + chunk_manager_->UnmapBufferChunk(buffer_chunk_); send_chunk_.unmap(); recv_chunk_.unmap(); + if (chunk_.is_mapped()) { + return true; + } + // mapping/setting chunk + chunk_manager_ = base.chunk_manager_; bool mapped = chunk_manager_->MapChunk(chunk_, 0); - chunk_.set_chunk_index(chunk_index_); return mapped; } @@ -1160,7 +1175,7 @@ bool QubitVectorThrust::enable_batch(bool flg) const { } template -uint_t QubitVectorThrust::get_chunk_count(void) { +uint_t QubitVectorThrust::get_chunk_count(void) const { if (multi_chunk_distribution_) { if (chunk_.device() < 0 || cuStateVec_enable_) return 1; @@ -1169,6 +1184,8 @@ uint_t QubitVectorThrust::get_chunk_count(void) { } else { // multi-shots if (enable_batch_ && chunk_.pos() != 0) return 0; // first chunk execute all in batch + else if (!enable_batch_) + return 1; } return chunk_.container()->num_chunks(); } @@ -1921,65 +1938,19 @@ double QubitVectorThrust::norm() const { template double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t &mat) const { - const size_t N = qubits.size(); - - if (N == 1) { - return norm(qubits[0], mat); - } else { - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - for (int_t i = 0; i < N; i++) { - qubits_sorted.push_back(qubits[i]); + uint_t count = 1; +#ifdef AER_THRUST_CUDA + if (!cuStateVec_enable_ && + 
((multi_chunk_distribution_ && chunk_.device() >= 0 &&
+ num_qubits_ == num_qubits()) ||
+ (enable_batch_))) {
+ if (chunk_.pos() != 0) {
+ return 0.0;
}
-
- chunk_.StoreMatrix(mat);
- chunk_.StoreUintParams(qubits_sorted);
-
- double ret;
- apply_function_sum(&ret, Chunk::NormMatrixMultNxN(N));
- return ret;
- }
-}
-
-template
-double
-QubitVectorThrust::norm_diagonal(const reg_t &qubits,
- const cvector_t &mat) const {
-
- const uint_t N = qubits.size();
-
- if (N == 1) {
- return norm_diagonal(qubits[0], mat);
- } else {
- chunk_.StoreMatrix(mat);
- chunk_.StoreUintParams(qubits);
-
- double ret;
- apply_function_sum(&ret, Chunk::NormDiagonalMultNxN(qubits));
- return ret;
+ count = chunk_.container()->num_chunks();
}
-}
-
-//------------------------------------------------------------------------------
-// Single-qubit specialization
-//------------------------------------------------------------------------------
-template
-double QubitVectorThrust::norm(const uint_t qubit,
- const cvector_t &mat) const {
- double ret;
- apply_function_sum(&ret, Chunk::NormMatrixMult2x2(mat, qubit));
-
- return ret;
-}
-
-template
-double
-QubitVectorThrust::norm_diagonal(const uint_t qubit,
- const cvector_t &mat) const {
- double ret;
- apply_function_sum(&ret, Chunk::NormDiagonalMult2x2(mat, qubit));
-
- return ret;
+#endif
+ return chunk_.expval_matrix(qubits, mat, count);
}
/*******************************************************************************
@@ -2003,8 +1974,6 @@ std::vector QubitVectorThrust::probabilities() const {
DebugMsg("calling probabilities");
#endif
-#pragma omp parallel for if (num_qubits_ > omp_threshold_ && omp_threads_ > 1) \
- num_threads(omp_threads_)
for (int_t j = 0; j < END; j++) {
probs[j] = probability(j);
}
diff --git a/src/simulators/statevector/statevector_executor.hpp b/src/simulators/statevector/statevector_executor.hpp
new file mode 100644
index 0000000000..28312f4aae
--- /dev/null
+++ b/src/simulators/statevector/statevector_executor.hpp
@@ -0,0 +1,1807 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _statevector_executor_hpp_
+#define _statevector_executor_hpp_
+
+#include "simulators/batch_shots_executor.hpp"
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace Statevector {
+
+//-------------------------------------------------------------------------
+// Executor for statevector
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
+                 public CircuitExecutor::BatchShotsExecutor<state_t> {
+  using Base = CircuitExecutor::MultiStateExecutor<state_t>;
+  using BasePar = CircuitExecutor::ParallelStateExecutor<state_t>;
+  using BaseBatch = CircuitExecutor::BatchShotsExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+protected:
+  void set_config(const Config &config) override;
+
+  void apply_global_phase() override;
+
+  bool shot_branching_supported(void) override { return true; }
+
+  // apply parallel operations
+  bool apply_parallel_op(const Operations::Op &op, ExperimentResult &result,
+                         RngEngine &rng, bool final_op) override;
+
+  // apply op to multiple shots; return false if the op cannot be executed
+  // in a batch
+  bool apply_batched_op(const int_t istate, const Operations::Op &op,
+                        ExperimentResult &result, std::vector<RngEngine> &rng,
+                        bool final_op = false) override;
+
+  bool apply_branching_op(CircuitExecutor::Branch &root,
+                          const Operations::Op &op, ExperimentResult &result,
+                          bool final_op) override;
+
+  // Initializes an n-qubit state to the all |0> state
+  void initialize_qreg(uint_t num_qubits) override;
+
+  auto move_to_vector(void);
+  auto copy_to_vector(void);
+
+  void run_circuit_shots(Circuit &circ, const Noise::NoiseModel &noise,
+                         const Config &config, RngEngine &init_rng,
+                         ExperimentResult &result, bool sample_noise) override;
+
+  bool allocate_states(uint_t num_states, const Config &config) override {
+    return BasePar::allocate_states(num_states, config);
+  }
+
+  //-----------------------------------------------------------------------
+  // Apply instructions
+  //-----------------------------------------------------------------------
+  // Measure qubits and return a list of outcomes [q0, q1, ...]
+  // If a state subclass supports this function, then "measure"
+  // should be contained in the set returned by the 'allowed_ops'
+  // method.
+  void apply_measure(const reg_t &qubits, const reg_t &cmemory,
+                     const reg_t &cregister, RngEngine &rng);
+
+  // Reset the specified qubits to the |0> state by simulating
+  // a measurement, applying a conditional x-gate if the outcome is 1, and
+  // then discarding the outcome.
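The class above mixes the two execution strategies, and `run_circuit_shots` (defined further below) routes each circuit between them based on `BasePar::multiple_chunk_required`. A standalone sketch of that routing decision, with simplified, hypothetical types standing in for Aer's `Circuit` and executor bases:

```cpp
#include <iostream>

// Hypothetical stand-ins: in Aer the decision is made by
// BasePar::multiple_chunk_required(circ, noise) inside run_circuit_shots.
struct CircuitInfo {
  int num_qubits;   // qubits the circuit needs
  int chunk_qubits; // largest state a single chunk (one device) can hold
};

// True when the statevector cannot fit in a single chunk, so the
// chunk-parallel (ParallelStateExecutor) path must be taken.
bool multiple_chunk_required(const CircuitInfo &c) {
  return c.num_qubits > c.chunk_qubits;
}

void run_circuit_shots(const CircuitInfo &c) {
  if (multiple_chunk_required(c))
    std::cout << "multi-chunk path (ParallelStateExecutor)\n";
  else
    std::cout << "batched-shots path (BatchShotsExecutor, "
                 "shot-branching capable)\n";
}

int main() {
  run_circuit_shots({30, 25}); // state too large for one chunk
  run_circuit_shots({20, 25}); // fits on one device: batch the shots
}
```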
+  void apply_reset(const reg_t &qubits, RngEngine &rng);
+
+  // Initialize the specified qubits to a given state |psi>
+  // by applying a reset to these qubits and then
+  // computing the tensor product with the new state |psi>
+  // |psi> is given in params
+  void apply_initialize(const reg_t &qubits, const cvector_t &params,
+                        RngEngine &rng);
+
+  void initialize_from_vector(const cvector_t &params);
+
+  // Apply a Kraus error operation
+  void apply_kraus(const reg_t &qubits, const std::vector<cmatrix_t> &krausops,
+                   RngEngine &rng);
+
+  void apply_reset(CircuitExecutor::Branch &root, const reg_t &qubits);
+  void apply_initialize(CircuitExecutor::Branch &root, const reg_t &qubits,
+                        const cvector_t &params);
+  void apply_kraus(CircuitExecutor::Branch &root, const reg_t &qubits,
+                   const std::vector<cmatrix_t> &kmats);
+
+  //-----------------------------------------------------------------------
+  // Save data instructions
+  //-----------------------------------------------------------------------
+
+  // Save the current state of the statevector simulator
+  // If `last_op` is True this will use move semantics to move the simulator
+  // state to the results, otherwise it will use copy semantics to leave
+  // the current simulator state unchanged.
+  void apply_save_statevector(const Operations::Op &op,
+                              ExperimentResult &result, bool last_op);
+
+  // Save the current state of the statevector simulator as a ket-form map.
+  void apply_save_statevector_dict(const Operations::Op &op,
+                                   ExperimentResult &result);
+
+  // Save the current density matrix or reduced density matrix
+  void apply_save_density_matrix(const Operations::Op &op,
+                                 ExperimentResult &result);
+
+  // Helper function for saving measure probabilities
+  void apply_save_probs(const Operations::Op &op, ExperimentResult &result);
+
+  // Helper function for saving amplitudes and amplitudes squared
+  void apply_save_amplitudes(const Operations::Op &op,
+                             ExperimentResult &result);
+
+  void apply_save_statevector(CircuitExecutor::Branch &root,
+                              const Operations::Op &op,
+                              ExperimentResult &result, bool last_op);
+  void apply_save_statevector_dict(CircuitExecutor::Branch &root,
+                                   const Operations::Op &op,
+                                   ExperimentResult &result);
+  void apply_save_amplitudes(CircuitExecutor::Branch &root,
+                             const Operations::Op &op,
+                             ExperimentResult &result);
+
+  // Helper function for computing expectation value
+  double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
+  //-----------------------------------------------------------------------
+  // Measurement Helpers
+  //-----------------------------------------------------------------------
+
+  // Return vector of measure probabilities for specified qubits
+  // If a state subclass supports this function, then "measure"
+  // should be contained in the set returned by the 'allowed_ops'
+  // method.
+  rvector_t measure_probs(const reg_t &qubits) const;
+
+  // Sample the measurement outcome for qubits
+  // return a pair (m, p) of the outcome m, and its corresponding
+  // probability p.
+ // Outcome is given as an int: Eg for two-qubits {q0, q1} we have + // 0 -> |q1 = 0, q0 = 0> state + // 1 -> |q1 = 0, q0 = 1> state + // 2 -> |q1 = 1, q0 = 0> state + // 3 -> |q1 = 1, q0 = 1> state + std::pair sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng); + + void measure_reset_update(const std::vector &qubits, + const uint_t final_state, const uint_t meas_state, + const double meas_prob); + + rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits); + void measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs); + void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits, + const reg_t &cmemory, const reg_t &cregister); + + std::vector sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const override; + + // Return the reduced density matrix for the simulator + cmatrix_t density_matrix(const reg_t &qubits); + + // Sample n-measurement outcomes without applying the measure operation + // to the system state + std::vector sample_measure(const reg_t &qubits, uint_t shots, + RngEngine &rng) const override; +}; + +template +void Executor::set_config(const Config &config) { + BasePar::set_config(config); + BaseBatch::set_config(config); +} + +template +void Executor::apply_global_phase() { + if (Base::has_global_phase_) { + int_t i; + if (Base::shot_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) + Base::states_[iChunk].apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } else { + for (i = 0; i < Base::states_.size(); i++) + Base::states_[i].apply_diagonal_matrix( + {0}, {Base::global_phase_, Base::global_phase_}); + } + } +} + +template +void Executor::run_circuit_shots( + Circuit &circ, const Noise::NoiseModel &noise, const Config &config, + RngEngine &init_rng, ExperimentResult &result, bool sample_noise) { + state_t dummy_state; + if (BasePar::multiple_chunk_required(circ, noise)) { + return BasePar::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } else { + return BaseBatch::run_circuit_shots(circ, noise, config, init_rng, result, + sample_noise); + } +} + +template +bool Executor::apply_parallel_op(const Operations::Op &op, + ExperimentResult &result, + RngEngine &rng, bool final_op) { + // temporary : this is for statevector + if (Base::states_[0].creg().check_conditional(op)) { + switch (op.type) { + case Operations::OpType::reset: + apply_reset(op.qubits, rng); + break; + case Operations::OpType::initialize: + apply_initialize(op.qubits, op.params, rng); + break; + case Operations::OpType::measure: + apply_measure(op.qubits, op.memory, op.registers, rng); + break; + case Operations::OpType::bfunc: + BasePar::apply_bfunc(op); + break; + case Operations::OpType::roerror: + BasePar::apply_roerror(op, rng); + break; + case Operations::OpType::kraus: + apply_kraus(op.qubits, op.mats, rng); + break; + case Operations::OpType::set_statevec: + initialize_from_vector(op.params); + break; + case Operations::OpType::save_expval: + case Operations::OpType::save_expval_var: + BasePar::apply_save_expval(op, result); + break; + case Operations::OpType::save_densmat: + apply_save_density_matrix(op, result); + break; + case Operations::OpType::save_state: + case 
Operations::OpType::save_statevec:
+      apply_save_statevector(op, result, final_op);
+      break;
+    case Operations::OpType::save_statevec_dict:
+      apply_save_statevector_dict(op, result);
+      break;
+    case Operations::OpType::save_probs:
+    case Operations::OpType::save_probs_ket:
+      apply_save_probs(op, result);
+      break;
+    case Operations::OpType::save_amps:
+    case Operations::OpType::save_amps_sq:
+      apply_save_amplitudes(op, result);
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_batched_op(const int_t istate,
+                                         const Operations::Op &op,
+                                         ExperimentResult &result,
+                                         std::vector<RngEngine> &rng,
+                                         bool final_op) {
+  if (op.conditional) {
+    Base::states_[istate].qreg().set_conditional(op.conditional_reg);
+  }
+
+  switch (op.type) {
+  case Operations::OpType::barrier:
+  case Operations::OpType::nop:
+  case Operations::OpType::qerror_loc:
+    break;
+  case Operations::OpType::reset:
+    Base::states_[istate].qreg().apply_batched_reset(op.qubits, rng);
+    break;
+  case Operations::OpType::initialize:
+    Base::states_[istate].qreg().apply_batched_reset(op.qubits, rng);
+    Base::states_[istate].qreg().initialize_component(op.qubits, op.params);
+    break;
+  case Operations::OpType::measure:
+    Base::states_[istate].qreg().apply_batched_measure(op.qubits, rng,
+                                                       op.memory, op.registers);
+    break;
+  case Operations::OpType::bfunc:
+    Base::states_[istate].qreg().apply_bfunc(op);
+    break;
+  case Operations::OpType::roerror:
+    Base::states_[istate].qreg().apply_roerror(op, rng);
+    break;
+  case Operations::OpType::gate:
+    Base::states_[istate].apply_gate(op);
+    break;
+  case Operations::OpType::matrix:
+    Base::states_[istate].apply_matrix(op);
+    break;
+  case Operations::OpType::diagonal_matrix:
+    Base::states_[istate].qreg().apply_diagonal_matrix(op.qubits, op.params);
+    break;
+  case Operations::OpType::multiplexer:
+    Base::states_[istate].apply_multiplexer(
+        op.regs[0], op.regs[1],
+        op.mats); // control qubits ([0]) & target qubits([1])
+    break;
+  case Operations::OpType::kraus:
+    Base::states_[istate].qreg().apply_batched_kraus(op.qubits, op.mats, rng);
+    break;
+  case Operations::OpType::sim_op:
+    if (op.name == "begin_register_blocking") {
+      Base::states_[istate].qreg().enter_register_blocking(op.qubits);
+    } else if (op.name == "end_register_blocking") {
+      Base::states_[istate].qreg().leave_register_blocking();
+    } else {
+      return false;
+    }
+    break;
+  case Operations::OpType::set_statevec:
+    Base::states_[istate].qreg().initialize_from_vector(op.params);
+    break;
+  default:
+    // other operations should be applied to individual chunks by apply_op
+    return false;
+  }
+  return true;
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
+                                           const Operations::Op &op,
+                                           ExperimentResult &result,
+                                           bool final_op) {
+  RngEngine dummy;
+  if (Base::states_[root.state_index()].creg().check_conditional(op)) {
+    switch (op.type) {
+    // ops with branching
+    case Operations::OpType::reset:
+      apply_reset(root, op.qubits);
+      break;
+    case Operations::OpType::initialize:
+      apply_initialize(root, op.qubits, op.params);
+      break;
+    case Operations::OpType::measure:
+      apply_measure(root, op.qubits, op.memory, op.registers);
+      break;
+    case Operations::OpType::kraus:
+      apply_kraus(root, op.qubits, op.mats);
+      break;
+    // save ops
+    case Operations::OpType::save_expval:
+    case Operations::OpType::save_expval_var:
+    case Operations::OpType::save_densmat:
+    case Operations::OpType::save_probs:
+    case Operations::OpType::save_probs_ket:
+      // call save functions in state
class + Base::states_[root.state_index()].apply_op(op, result, dummy, final_op); + break; + case Operations::OpType::save_state: + case Operations::OpType::save_statevec: + apply_save_statevector(root, op, result, final_op); + break; + case Operations::OpType::save_statevec_dict: + apply_save_statevector_dict(root, op, result); + break; + case Operations::OpType::save_amps: + case Operations::OpType::save_amps_sq: + apply_save_amplitudes(root, op, result); + break; + default: + return false; + } + } + return true; +} + +template +void Executor::initialize_qreg(uint_t num_qubits) { + int_t i; + + for (i = 0; i < Base::states_.size(); i++) { + Base::states_[i].qreg().set_num_qubits(BasePar::chunk_bits_); + } + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + if (Base::global_state_index_ + iChunk == 0 || + this->num_qubits_ == this->chunk_bits_) { + Base::states_[iChunk].qreg().initialize(); + } else { + Base::states_[iChunk].qreg().zero(); + } + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + if (Base::global_state_index_ + i == 0 || + this->num_qubits_ == this->chunk_bits_) { + Base::states_[i].qreg().initialize(); + } else { + Base::states_[i].qreg().zero(); + } + } + } + + BasePar::apply_global_phase(); +} + +template +auto Executor::move_to_vector(void) { + size_t size_required = + 2 * (sizeof(std::complex) << Base::num_qubits_) + + (sizeof(std::complex) << BasePar::chunk_bits_) * + Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store states")); + } + int_t iChunk; + auto state = Base::states_[0].qreg().move_to_vector(); + state.resize(Base::num_local_states_ << BasePar::chunk_bits_); + +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) private(iChunk) + for (iChunk = 1; iChunk < Base::states_.size(); iChunk++) { + auto tmp = Base::states_[iChunk].qreg().move_to_vector(); + uint_t j, offset = iChunk << BasePar::chunk_bits_; + for (j = 0; j < tmp.size(); j++) { + state[offset + j] = tmp[j]; + } + } + +#ifdef AER_MPI + BasePar::gather_state(state); +#endif + return state; +} + +template +auto Executor::copy_to_vector(void) { + size_t size_required = + 2 * (sizeof(std::complex) << Base::num_qubits_) + + (sizeof(std::complex) << BasePar::chunk_bits_) * + Base::num_local_states_; + if ((size_required >> 20) > Utils::get_system_memory_mb()) { + throw std::runtime_error( + std::string("There is not enough memory to store states")); + } + int_t iChunk; + auto state = Base::states_[0].qreg().copy_to_vector(); + state.resize(Base::num_local_states_ << BasePar::chunk_bits_); + +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) private(iChunk) + for (iChunk = 1; iChunk < Base::states_.size(); iChunk++) { + auto tmp = Base::states_[iChunk].qreg().copy_to_vector(); + uint_t j, offset = iChunk << BasePar::chunk_bits_; + for (j = 0; j < tmp.size(); j++) { + state[offset + j] = tmp[j]; + } + } + +#ifdef AER_MPI + BasePar::gather_state(state); +#endif + return state; +} + +//========================================================================= +// Implementation: Save data +//========================================================================= + +template +void Executor::apply_save_probs(const Operations::Op &op, + ExperimentResult &result) { + 
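The `move_to_vector`/`copy_to_vector` pair above reassembles the full 2^n statevector from the distributed chunks: local chunk i is written at offset `i << chunk_bits`, then `gather_state` merges the per-process pieces over MPI. A host-only sketch of the offset arithmetic (illustrative; the MPI step is omitted):

```cpp
#include <complex>
#include <iostream>
#include <vector>

int main() {
  const unsigned chunk_bits = 2; // 4 amplitudes per chunk
  const unsigned num_chunks = 3;
  using cvec = std::vector<std::complex<double>>;

  // stand-ins for the per-state chunk vectors gathered above
  std::vector<cvec> chunks(num_chunks, cvec(1ull << chunk_bits));
  for (unsigned i = 0; i < num_chunks; ++i)
    for (size_t j = 0; j < chunks[i].size(); ++j)
      chunks[i][j] = {double(i), double(j)}; // dummy amplitudes

  // chunk i is copied to offset (i << chunk_bits) of the global vector
  cvec state(size_t(num_chunks) << chunk_bits);
  for (unsigned i = 0; i < num_chunks; ++i) {
    const size_t offset = size_t(i) << chunk_bits;
    for (size_t j = 0; j < chunks[i].size(); ++j)
      state[offset + j] = chunks[i][j];
  }
  std::cout << "global vector size = " << state.size() << "\n"; // 12
}
```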
// get probs as hexadecimal + auto probs = measure_probs(op.qubits); + if (op.type == Operations::OpType::save_probs_ket) { + // Convert to ket dict + result.save_data_average( + Base::states_[0].creg(), op.string_params[0], + Utils::vec2ket(probs, Base::json_chop_threshold_, 16), op.type, + op.save_type); + } else { + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(probs), op.type, op.save_type); + } +} + +template +double Executor::expval_pauli(const reg_t &qubits, + const std::string &pauli) { + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + std::string pauli_in_chunk; + std::string pauli_out_chunk; + int_t i, n; + double expval(0.); + + // get inner/outer chunk pauli string + n = pauli.size(); + for (i = 0; i < n; i++) { + if (qubits[i] < BasePar::chunk_bits_) { + qubits_in_chunk.push_back(qubits[i]); + pauli_in_chunk.push_back(pauli[n - i - 1]); + } else { + qubits_out_chunk.push_back(qubits[i]); + pauli_out_chunk.push_back(pauli[n - i - 1]); + } + } + + if (qubits_out_chunk.size() > 0) { // there are bits out of chunk + std::complex phase = 1.0; + + std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); + std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); + + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = + AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); + + AER::QV::add_y_phase(num_y, phase); + + if (x_mask != 0) { // pairing state is out of chunk + bool on_same_process = true; +#ifdef AER_MPI + int proc_bits = 0; + uint_t procs = Base::distributed_procs_; + while (procs > 1) { + if ((procs & 1) != 0) { + proc_bits = -1; + break; + } + proc_bits++; + procs >>= 1; + } + if (x_mask & (~((1ull << (Base::num_qubits_ - proc_bits)) - 1)) != + 0) { // data exchange between processes is required + on_same_process = false; + } +#endif + + x_mask >>= BasePar::chunk_bits_; + z_mask >>= BasePar::chunk_bits_; + x_max -= BasePar::chunk_bits_; + + const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); + const uint_t mask_l = (1ull << x_max) - 1; + if (on_same_process) { + auto apply_expval_pauli_chunk = [this, x_mask, z_mask, x_max, mask_u, + mask_l, qubits_in_chunk, + pauli_in_chunk, phase](int_t iGroup) { + double expval = 0.0; + for (int_t iChunk = Base::top_state_of_group_[iGroup]; + iChunk < Base::top_state_of_group_[iGroup + 1]; iChunk++) { + uint_t pair_chunk = iChunk ^ x_mask; + if (iChunk < pair_chunk) { + uint_t z_count, z_count_pair; + z_count = AER::Utils::popcount(iChunk & z_mask); + z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); + + expval += Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli(qubits_in_chunk, pauli_in_chunk, + Base::states_[pair_chunk].qreg(), + z_count, z_count_pair, phase); + } + } + return expval; + }; + expval += Utils::apply_omp_parallel_for_reduction( + (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1), 0, + Base::num_global_states_ / 2, apply_expval_pauli_chunk); + } else { + for (int_t i = 0; i < Base::num_global_states_ / 2; i++) { + uint_t iChunk = ((i << 1) & mask_u) | (i & mask_l); + uint_t pair_chunk = iChunk ^ x_mask; + uint_t iProc = BasePar::get_process_by_chunk(pair_chunk); + if (Base::state_index_begin_[Base::distributed_rank_] <= iChunk && + Base::state_index_end_[Base::distributed_rank_] > + iChunk) { // on this process + uint_t z_count, z_count_pair; + z_count = AER::Utils::popcount(iChunk & z_mask); + z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); + + if (iProc == Base::distributed_rank_) { 
// pair is on the + // same process + expval += + Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli( + qubits_in_chunk, pauli_in_chunk, + Base::states_[pair_chunk - Base::global_state_index_] + .qreg(), + z_count, z_count_pair, phase); + } else { + BasePar::recv_chunk(iChunk - Base::global_state_index_, + pair_chunk); + // refer receive buffer to calculate expectation value + expval += + Base::states_[iChunk - Base::global_state_index_] + .qreg() + .expval_pauli( + qubits_in_chunk, pauli_in_chunk, + Base::states_[iChunk - Base::global_state_index_] + .qreg(), + z_count, z_count_pair, phase); + } + } else if (iProc == Base::distributed_rank_) { // pair is on + // this process + BasePar::send_chunk(iChunk - Base::global_state_index_, pair_chunk); + } + } + } + } else { // no exchange between chunks + z_mask >>= BasePar::chunk_bits_; + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for reduction(+ : expval) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + double e_tmp = 0.0; + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) { + double sign = 1.0; + if (z_mask && (AER::Utils::popcount( + (iChunk + Base::global_state_index_) & z_mask) & + 1)) + sign = -1.0; + e_tmp += sign * Base::states_[iChunk].qreg().expval_pauli( + qubits_in_chunk, pauli_in_chunk); + } + expval += e_tmp; + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + double sign = 1.0; + if (z_mask && + (AER::Utils::popcount((i + Base::global_state_index_) & z_mask) & + 1)) + sign = -1.0; + expval += sign * Base::states_[i].qreg().expval_pauli(qubits_in_chunk, + pauli_in_chunk); + } + } + } + } else { // all bits are inside chunk + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for reduction(+ : expval) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + double e_tmp = 0.0; + for (int_t iChunk = Base::top_state_of_group_[ig]; + iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) + e_tmp += Base::states_[iChunk].qreg().expval_pauli(qubits, pauli); + expval += e_tmp; + } + } else { + for (i = 0; i < Base::states_.size(); i++) + expval += Base::states_[i].qreg().expval_pauli(qubits, pauli); + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(expval); +#endif + return expval; +} + +template +void Executor::apply_save_statevector(const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + std::string key = + (op.string_params[0] == "_method_") ? "statevector" : op.string_params[0]; + + if (last_op) { + auto v = move_to_vector(); + result.save_data_pershot(Base::states_[0].creg(), key, std::move(v), + Operations::OpType::save_statevec, op.save_type); + } else { + result.save_data_pershot(Base::states_[0].creg(), key, copy_to_vector(), + Operations::OpType::save_statevec, op.save_type); + } +} + +template +void Executor::apply_save_statevector_dict(const Operations::Op &op, + ExperimentResult &result) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." 
+ " Only the full statevector can be saved."); + } + auto vec = copy_to_vector(); + std::map result_state_ket; + for (size_t k = 0; k < vec.size(); ++k) { + if (std::abs(vec[k]) >= Base::json_chop_threshold_) { + std::string key = Utils::int2hex(k); + result_state_ket.insert({key, vec[k]}); + } + } + result.save_data_pershot(Base::states_[0].creg(), op.string_params[0], + std::move(result_state_ket), op.type, op.save_type); +} + +template +void Executor::apply_save_density_matrix(const Operations::Op &op, + ExperimentResult &result) { + cmatrix_t reduced_state; + + // Check if tracing over all qubits + if (op.qubits.empty()) { + reduced_state = cmatrix_t(1, 1); + + double sum = 0.0; + if (BasePar::chunk_omp_parallel_) { +#pragma omp parallel for reduction(+ : sum) + for (int_t i = 0; i < Base::states_.size(); i++) + sum += Base::states_[i].qreg().norm(); + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + sum += Base::states_[i].qreg().norm(); + } +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + reduced_state[0] = sum; + } else { + reduced_state = density_matrix(op.qubits); + } + + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(reduced_state), op.type, op.save_type); +} + +template +void Executor::apply_save_amplitudes(const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes instructions (empty params)."); + } + const int_t size = op.int_params.size(); + if (op.type == Operations::OpType::save_amps) { + Vector amps(size, false); + for (int_t i = 0; i < size; ++i) { + uint_t idx = BasePar::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BasePar::chunk_bits_; + amps[i] = 0.0; + if (iChunk >= Base::global_state_index_ && + iChunk < Base::global_state_index_ + Base::states_.size()) { + amps[i] = + Base::states_[iChunk - Base::global_state_index_].qreg().get_state( + idx - (iChunk << BasePar::chunk_bits_)); + } +#ifdef AER_MPI + complex_t amp = amps[i]; + BasePar::reduce_sum(amp); + amps[i] = amp; +#endif + } + result.save_data_pershot(Base::states_[0].creg(), op.string_params[0], + std::move(amps), op.type, op.save_type); + } else { + rvector_t amps_sq(size, 0); + for (int_t i = 0; i < size; ++i) { + uint_t idx = BasePar::mapped_index(op.int_params[i]); + uint_t iChunk = idx >> BasePar::chunk_bits_; + if (iChunk >= Base::global_state_index_ && + iChunk < Base::global_state_index_ + Base::states_.size()) { + amps_sq[i] = Base::states_[iChunk - Base::global_state_index_] + .qreg() + .probability(idx - (iChunk << BasePar::chunk_bits_)); + } + } +#ifdef AER_MPI + BasePar::reduce_sum(amps_sq); +#endif + result.save_data_average(Base::states_[0].creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); + } +} + +template +cmatrix_t Executor::density_matrix(const reg_t &qubits) { + const size_t N = qubits.size(); + const size_t DIM = 1ULL << N; + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + auto vec = copy_to_vector(); + + // Return full density matrix + cmatrix_t densmat(DIM, DIM); + if ((N == Base::num_qubits_) && (qubits == qubits_sorted)) { + const int_t mask = QV::MASKS[N]; +#pragma omp parallel for + for (int_t rowcol = 0; rowcol < int_t(DIM * DIM); ++rowcol) { + const int_t row = rowcol >> N; + const int_t col = rowcol & mask; + densmat(row, col) = complex_t(vec[row]) * complex_t(std::conj(vec[col])); + } + } else { + const size_t END = 1ULL << (Base::num_qubits_ - N); + // Initialize 
matrix values with first block + { + const auto inds = QV::indexes(qubits, qubits_sorted, 0); + for (size_t row = 0; row < DIM; ++row) + for (size_t col = 0; col < DIM; ++col) { + densmat(row, col) = + complex_t(vec[inds[row]]) * complex_t(std::conj(vec[inds[col]])); + } + } + // Accumulate remaining blocks + for (size_t k = 1; k < END; k++) { + // store entries touched by U + const auto inds = QV::indexes(qubits, qubits_sorted, k); + for (size_t row = 0; row < DIM; ++row) + for (size_t col = 0; col < DIM; ++col) { + densmat(row, col) += + complex_t(vec[inds[row]]) * complex_t(std::conj(vec[inds[col]])); + } + } + } + return densmat; +} + +//========================================================================= +// Implementation: Reset, Initialize and Measurement Sampling +//========================================================================= + +template +void Executor::apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng) { + // Actual measurement outcome + const auto meas = sample_measure_with_prob(qubits, rng); + // Implement measurement update + measure_reset_update(qubits, meas.first, meas.first, meas.second); + const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); + BasePar::store_measure(outcome, cmemory, cregister); +} + +template +rvector_t Executor::measure_probs(const reg_t &qubits) const { + uint_t dim = 1ull << qubits.size(); + rvector_t sum(dim, 0.0); + int_t i, j, k; + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + Chunk::get_qubits_inout(BasePar::chunk_bits_, qubits, qubits_in_chunk, + qubits_out_chunk); + + if (qubits_in_chunk.size() > 0) { + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i, j, k) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + auto chunkSum = + Base::states_[i].qreg().probabilities(qubits_in_chunk); + + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { +#pragma omp atomic + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < BasePar::chunk_bits_) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << BasePar::chunk_bits_) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } +#pragma omp atomic + sum[idx] += chunkSum[j]; + } + } + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + auto chunkSum = Base::states_[i].qreg().probabilities(qubits_in_chunk); + + if (qubits_in_chunk.size() == qubits.size()) { + for (j = 0; j < dim; j++) { + sum[j] += chunkSum[j]; + } + } else { + for (j = 0; j < chunkSum.size(); j++) { + int idx = 0; + int i_in = 0; + for (k = 0; k < qubits.size(); k++) { + if (qubits[k] < BasePar::chunk_bits_) { + idx += (((j >> i_in) & 1) << k); + i_in++; + } else { + if ((((i + Base::global_state_index_) + << BasePar::chunk_bits_) >> + qubits[k]) & + 1) { + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } + } + } + } + } else { // there is no bit in chunk + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for private(i, j, k) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) { + auto nr = std::real(Base::states_[i].qreg().norm()); + int idx = 0; + for (k = 0; k < 
qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += nr; + } + } + } else { + for (i = 0; i < Base::states_.size(); i++) { + auto nr = std::real(Base::states_[i].qreg().norm()); + int idx = 0; + for (k = 0; k < qubits_out_chunk.size(); k++) { + if ((((i + Base::global_state_index_) << (BasePar::chunk_bits_)) >> + qubits_out_chunk[k]) & + 1) { + idx += 1ull << k; + } + } + sum[idx] += nr; + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(sum); +#endif + + return sum; +} + +template +void Executor::apply_reset(const reg_t &qubits, RngEngine &rng) { + // Simulate unobserved measurement + const auto meas = sample_measure_with_prob(qubits, rng); + // Apply update to reset state + measure_reset_update(qubits, 0, meas.first, meas.second); +} + +template +std::pair +Executor::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); + + // Randomly pick outcome and return pair + uint_t outcome = rng.rand_int(probs); + return std::make_pair(outcome, probs[outcome]); +} + +template +void Executor::measure_reset_update(const std::vector &qubits, + const uint_t final_state, + const uint_t meas_state, + const double meas_prob) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + cvector_t mdiag(2, 0.); + mdiag[meas_state] = 1. / std::sqrt(meas_prob); + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } + + // If it doesn't agree with the reset state update + if (final_state != meas_state) { + BasePar::apply_chunk_x(qubits[0]); + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + cvector_t mdiag(dim, 0.); + mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); + + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].apply_diagonal_matrix(qubits, mdiag); + } + } + + // If it doesn't agree with the reset state update + // This function could be optimized as a permutation update + if (final_state != meas_state) { + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + + Chunk::get_qubits_inout(BasePar::chunk_bits_, qubits, qubits_in_chunk, + qubits_out_chunk); + + if (qubits_in_chunk.size() == qubits.size()) { // all bits are inside + // chunk + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != meas_state) + perm[j * dim + j] = 1.; + } + // apply permutation to swap state + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, perm); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, perm); + } + } + } else { + for (int_t i = 0; i < qubits.size(); i++) { + if (((final_state >> i) & 1) != ((meas_state >> i) & 1)) { + BasePar::apply_chunk_x(qubits[i]); + } + } + } + } + } +} + +template +std::vector Executor::sample_measure(const reg_t &qubits, + uint_t shots, + RngEngine &rng) const { + int_t i, j; + // Generate flat register for storing + std::vector rnds; + rnds.reserve(shots); + reg_t allbit_samples(shots, 0); + + for (i = 0; i < shots; ++i) + rnds.push_back(rng.rand(0, 1)); + + std::vector chunkSum(Base::states_.size() + 1, 0); + double sum, localSum; + + // calculate per chunk sum + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) { + bool batched = Base::states_[ic].qreg().enable_batch( + true); // return sum of all chunks in group + chunkSum[ic] = Base::states_[ic].qreg().norm(); + Base::states_[ic].qreg().enable_batch(batched); + } + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) { + bool batched = Base::states_[ic].qreg().enable_batch( + true); // return sum of all chunks in group + chunkSum[ic] = Base::states_[ic].qreg().norm(); + Base::states_[ic].qreg().enable_batch(batched); + } + } + } + + localSum = 0.0; + for (i = 0; i < Base::states_.size(); i++) { + sum = localSum; + localSum += chunkSum[i]; + chunkSum[i] = sum; + } + chunkSum[Base::states_.size()] = localSum; + + double globalSum = 0.0; + if (Base::nprocs_ > 1) { + std::vector procTotal(Base::nprocs_); + + for (i = 0; i < Base::nprocs_; i++) { + procTotal[i] = localSum; + } + + 
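    // Illustrative sketch (not part of the patch): the gather below gives
    // every process the per-rank totals, from which each rank can derive the
    // probability mass owned by lower ranks. Assuming a hypothetical
    // `totals` array indexed by rank, the idea is:
    //
    //   double offset = 0.0;
    //   for (uint_t p = 0; p < myrank; ++p)
    //     offset += totals[p];          // norm mass on lower ranks
    //   // a shot with uniform r in [0,1) belongs to this rank iff
    //   //   offset <= r < offset + totals[myrank]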
BasePar::gather_value(procTotal); + + for (i = 0; i < Base::myrank_; i++) { + globalSum += procTotal[i]; + } + } + + reg_t local_samples(shots, 0); + + // get rnds positions for each chunk + for (i = 0; i < Base::states_.size(); i++) { + uint_t nIn; + std::vector vIdx; + std::vector vRnd; + + // find rnds in this chunk + nIn = 0; + for (j = 0; j < shots; j++) { + if (rnds[j] >= chunkSum[i] + globalSum && + rnds[j] < chunkSum[i + 1] + globalSum) { + vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); + vIdx.push_back(j); + nIn++; + } + } + + if (nIn > 0) { + auto chunkSamples = Base::states_[i].qreg().sample_measure(vRnd); + + for (j = 0; j < chunkSamples.size(); j++) { + local_samples[vIdx[j]] = + ((Base::global_state_index_ + i) << BasePar::chunk_bits_) + + chunkSamples[j]; + } + } + } + +#ifdef AER_MPI + BasePar::reduce_sum(local_samples); +#endif + allbit_samples = local_samples; + + // Convert to reg_t format + std::vector all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + + return all_samples; +} + +template +void Executor::apply_initialize(const reg_t &qubits, + const cvector_t ¶ms, + RngEngine &rng) { + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); + if (qubits.size() == Base::num_qubits_) { + // If qubits is all ordered qubits in the statevector + // we can just initialize the whole state directly + if (qubits == sorted_qubits) { + initialize_from_vector(params); + return; + } + } + // Apply reset to qubits + apply_reset(qubits, rng); + + // Apply initialize_component + reg_t qubits_in_chunk; + reg_t qubits_out_chunk; + Chunk::get_qubits_inout(BasePar::chunk_bits_, qubits, qubits_in_chunk, + qubits_out_chunk); + + if (qubits_out_chunk.size() == 0) { // no qubits outside of chunk + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().initialize_component(qubits, params); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().initialize_component(qubits, params); + } + } else { + // scatter base states + if (qubits_in_chunk.size() > 0) { + // scatter inside chunks + const size_t dim = 1ULL << qubits_in_chunk.size(); + cvector_t perm(dim * dim, 0.); + for (int_t i = 0; i < dim; i++) { + perm[i] = 1.0; + } + + if (BasePar::chunk_omp_parallel_) { +#pragma omp parallel for + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_matrix(qubits_in_chunk, perm); + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_matrix(qubits_in_chunk, perm); + } + } + if (qubits_out_chunk.size() > 0) { + // then scatter outside chunk + auto sorted_qubits_out = qubits_out_chunk; + std::sort(sorted_qubits_out.begin(), sorted_qubits_out.end()); + + for (int_t i = 0; i < (1ull << (Base::num_qubits_ - BasePar::chunk_bits_ - + qubits_out_chunk.size())); + i++) { + uint_t baseChunk = 0; + uint_t j, ii, t; + ii = i; + for (j = 0; j < qubits_out_chunk.size(); j++) { + t = ii & ((1ull << qubits_out_chunk[j]) - 1); + baseChunk += t; + ii = (ii - t) << 1; + } + baseChunk += ii; + baseChunk >>= 
BasePar::chunk_bits_; + + for (j = 1; j < (1ull << qubits_out_chunk.size()); j++) { + int_t ic = baseChunk; + for (t = 0; t < qubits_out_chunk.size(); t++) { + if ((j >> t) & 1) + ic += (1ull << (qubits_out_chunk[t] - BasePar::chunk_bits_)); + } + + if (ic >= Base::state_index_begin_[Base::distributed_rank_] && + ic < Base::state_index_end_[Base::distributed_rank_]) { // on this + // process + if (baseChunk >= + Base::state_index_begin_[Base::distributed_rank_] && + baseChunk < Base::state_index_end_ + [Base::distributed_rank_]) { // base chunk is on + // this process + Base::states_[ic].qreg().initialize_from_data( + Base::states_[baseChunk].qreg().data(), + 1ull << BasePar::chunk_bits_); + } else { + BasePar::recv_chunk(ic, baseChunk); + // using swap chunk function to release send/recv buffers for + // Thrust + reg_t swap(2); + swap[0] = BasePar::chunk_bits_; + swap[1] = BasePar::chunk_bits_; + Base::states_[ic].qreg().apply_chunk_swap(swap, baseChunk); + } + } else if (baseChunk >= + Base::state_index_begin_[Base::distributed_rank_] && + baseChunk < Base::state_index_end_ + [Base::distributed_rank_]) { // base chunk + // is on this + // process + BasePar::send_chunk(baseChunk - Base::global_state_index_, ic); + } + } + } + } + + // initialize by params + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + Base::states_[i].qreg().apply_diagonal_matrix(qubits, params); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + Base::states_[i].qreg().apply_diagonal_matrix(qubits, params); + } + } +} + +template +void Executor::initialize_from_vector(const cvector_t ¶ms) { + uint_t local_offset = Base::global_state_index_ << BasePar::chunk_bits_; + +#pragma omp parallel for if (BasePar::chunk_omp_parallel_) + for (int_t i = 0; i < Base::states_.size(); i++) { + // copy part of state for this chunk + cvector_t tmp(1ull << BasePar::chunk_bits_); + std::copy(params.begin() + local_offset + (i << BasePar::chunk_bits_), + params.begin() + local_offset + ((i + 1) << BasePar::chunk_bits_), + tmp.begin()); + Base::states_[i].qreg().initialize_from_vector(tmp); + } +} + +//========================================================================= +// Implementation: Kraus Noise +//========================================================================= +template +void Executor::apply_kraus(const reg_t &qubits, + const std::vector &kmats, + RngEngine &rng) { + // Check edge case for empty Kraus set (this shouldn't happen) + if (kmats.empty()) + return; // end function early + + // Choose a real in [0, 1) to choose the applied kraus operator once + // the accumulated probability is greater than r. 
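  // Illustrative note (not part of the patch): each K_j is selected with
  // probability p_j = <psi| K_j^dag K_j |psi> = || K_j |psi> ||^2, and the
  // chosen operator is applied as K_j / sqrt(p_j) so the post-map state
  // stays normalized. Completeness, sum_j K_j^dag K_j = I, gives
  // sum_j p_j = 1, which is what lets the last probability be inferred.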
+ // We know that the Kraus noise must be normalized + // So we only compute probabilities for the first N-1 kraus operators + // and infer the probability of the last one from 1 - sum of the previous + + double r = rng.rand(0., 1.); + double accum = 0.; + double p; + bool complete = false; + + // Loop through N-1 kraus operators + for (size_t j = 0; j < kmats.size() - 1; j++) { + + // Calculate probability + cvector_t vmat = Utils::vectorize_matrix(kmats[j]); + + p = 0.0; + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for reduction(+ : p) + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t i = Base::top_state_of_group_[ig]; + i < Base::top_state_of_group_[ig + 1]; i++) + p += Base::states_[i].qreg().norm(qubits, vmat); + } + } else { + for (int_t i = 0; i < Base::states_.size(); i++) + p += Base::states_[i].qreg().norm(qubits, vmat); + } + +#ifdef AER_MPI + BasePar::reduce_sum(p); +#endif + accum += p; + + // check if we need to apply this operator + if (accum > r) { + // rescale vmat so projection is normalized + Utils::scalar_multiply_inplace(vmat, 1 / std::sqrt(p)); + // apply Kraus projection operator + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } + complete = true; + break; + } + } + + // check if we haven't applied a kraus operator yet + if (complete == false) { + // Compute probability from accumulated + complex_t renorm = 1 / std::sqrt(1. - accum); + auto vmat = Utils::vectorize_matrix(renorm * kmats.back()); + if (BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1) { +#pragma omp parallel for + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } else { + for (int_t ig = 0; ig < Base::num_groups_; ig++) { + for (int_t ic = Base::top_state_of_group_[ig]; + ic < Base::top_state_of_group_[ig + 1]; ic++) + Base::states_[ic].qreg().apply_matrix(qubits, vmat); + } + } + } +} + +template +rvector_t +Executor::sample_measure_with_prob(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = + Base::states_[root.state_index()].qreg().probabilities(qubits); + uint_t nshots = root.num_shots(); + reg_t shot_branch(nshots); + + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = root.rng_shots()[i].rand_int(probs); + } + + // branch shots + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, probs.size()); + + return probs; +} + +template +void Executor::measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + for (int_t i = 0; i < 2; i++) { + cvector_t mdiag(2, 0.); + mdiag[i] = 1. 
/ std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + Operations::Op op; + op.type = OpType::gate; + op.name = "mcx"; + op.qubits = qubits; + root.branches()[i]->add_op_after_branch(op); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + for (int_t i = 0; i < dim; i++) { + cvector_t mdiag(dim, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + i] = 1.; + perm[i * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != i) + perm[j * dim + j] = 1.; + } + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(Utils::devectorize_matrix(perm)); + root.branches()[i]->add_op_after_branch(op); + } + } + } +} + +template +void Executor::apply_measure(CircuitExecutor::Branch &root, + const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + // save result to cregs + for (int_t i = 0; i < probs.size(); i++) { + const reg_t outcome = Utils::int2reg(i, 2, qubits.size()); + root.branches()[i]->creg().store_measure(outcome, cmemory, cregister); + } + + measure_reset_update(root, qubits, -1, probs); +} + +template +void Executor::apply_reset(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + measure_reset_update(root, qubits, 0, probs); +} + +template +void Executor::apply_initialize(CircuitExecutor::Branch &root, + const reg_t &qubits, + const cvector_t ¶ms) { + if (qubits.size() == Base::num_qubits_) { + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); + // If qubits is all ordered qubits in the statevector + // we can just initialize the whole state directly + if (qubits == sorted_qubits) { + Base::states_[root.state_index()].initialize_from_vector(params); + return; + } + } + + if (root.additional_ops().size() == 0) { + apply_reset(root, qubits); + + Operations::Op op; + op.type = OpType::initialize; + op.name = "initialize"; + op.qubits = qubits; + op.params = params; + for (int_t i = 0; i < root.num_branches(); i++) { + root.branches()[i]->add_op_after_branch(op); + } + return; // initialization will be done in next call because of shot + // branching in reset + } + + Base::states_[root.state_index()].qreg().initialize_component(qubits, params); +} + +template +void Executor::apply_kraus(CircuitExecutor::Branch &root, + const reg_t &qubits, + const std::vector &kmats) { + // Check edge case for empty Kraus set (this shouldn't happen) + if (kmats.empty()) + return; // end function early + + // Choose a real in [0, 1) to choose the applied kraus operator once + // the accumulated probability is greater than r. 
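  // Illustrative note (not part of the patch): in this shot-branching
  // variant each shot i draws its own r_i, shots that select the same Kraus
  // index j are grouped into a single branch, and K_j / sqrt(p_j) is queued
  // once per branch via add_op_after_branch instead of being applied shot
  // by shot.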
+ // We know that the Kraus noise must be normalized + // So we only compute probabilities for the first N-1 kraus operators + // and infer the probability of the last one from 1 - sum of the previous + + double r; + double accum = 0.; + double p; + bool complete = false; + + reg_t shot_branch; + uint_t nshots; + rvector_t rshots, pmats; + uint_t nshots_multiplied = 0; + + nshots = root.num_shots(); + shot_branch.resize(nshots); + rshots.resize(nshots); + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = kmats.size() - 1; + rshots[i] = root.rng_shots()[i].rand(0., 1.); + } + pmats.resize(kmats.size()); + + // Loop through N-1 kraus operators + for (size_t j = 0; j < kmats.size() - 1; j++) { + // Calculate probability + cvector_t vmat = Utils::vectorize_matrix(kmats[j]); + + p = Base::states_[root.state_index()].qreg().norm(qubits, vmat); + accum += p; + + // check if we need to apply this operator + pmats[j] = p; + for (int_t i = 0; i < nshots; i++) { + if (shot_branch[i] >= kmats.size() - 1) { + if (accum > rshots[i]) { + shot_branch[i] = j; + nshots_multiplied++; + } + } + } + if (nshots_multiplied >= nshots) { + complete = true; + break; + } + } + + // check if we haven't applied a kraus operator yet + pmats[pmats.size() - 1] = 1. - accum; + + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, kmats.size()); + for (int_t i = 0; i < kmats.size(); i++) { + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(kmats[i]); + p = 1 / std::sqrt(pmats[i]); + for (int_t j = 0; j < op.mats[0].size(); j++) + op.mats[0][j] *= p; + root.branches()[i]->add_op_after_branch(op); + } +} + +template +void Executor::apply_save_statevector(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + std::string key = + (op.string_params[0] == "_method_") ? "statevector" : op.string_params[0]; + + if (last_op) { + const auto v = Base::states_[root.state_index()].move_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } else { + const auto v = Base::states_[root.state_index()].copy_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } +} + +template +void Executor::apply_save_statevector_dict( + CircuitExecutor::Branch &root, const Operations::Op &op, + ExperimentResult &result) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." 
+ " Only the full statevector can be saved."); + } + auto state_ket = Base::states_[root.state_index()].qreg().vector_ket( + Base::json_chop_threshold_); + std::map result_state_ket; + for (auto const &it : state_ket) { + result_state_ket[it.first] = it.second; + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const std::map &)result_state_ket, op.type, + op.save_type); + } +} + +template +void Executor::apply_save_amplitudes(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes instructions (empty params)."); + } + const int_t size = op.int_params.size(); + if (op.type == Operations::OpType::save_amps) { + Vector amps(size, false); + for (int_t i = 0; i < size; ++i) { + amps[i] = + Base::states_[root.state_index()].qreg().get_state(op.int_params[i]); + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const Vector &)amps, op.type, op.save_type); + } + } else { + rvector_t amps_sq(size, 0); + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = Base::states_[root.state_index()].qreg().probability( + op.int_params[i]); + } + result.save_data_average(Base::states_[root.state_index()].creg(), + op.string_params[0], amps_sq, op.type, + op.save_type); + } +} + +template +std::vector +Executor::sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const { + int_t i, j; + std::vector rnds; + rnds.reserve(shots); + + for (i = 0; i < shots; ++i) + rnds.push_back(rng[i].rand(0, 1)); + + bool flg = state.qreg().enable_batch(false); + auto allbit_samples = state.qreg().sample_measure(rnds); + state.qreg().enable_batch(flg); + + // Convert to reg_t format + std::vector all_samples; + all_samples.reserve(shots); + for (int_t val : allbit_samples) { + reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_); + reg_t sample; + sample.reserve(qubits.size()); + for (uint_t qubit : qubits) { + sample.push_back(allbit_sample[qubit]); + } + all_samples.push_back(sample); + } + return all_samples; +} + +//------------------------------------------------------------------------- +} // end namespace Statevector +//------------------------------------------------------------------------- +} // end namespace AER +//------------------------------------------------------------------------- +#endif diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp old mode 100644 new mode 100755 index 9a257ef08d..6746cd897a --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -23,7 +23,9 @@ #include "framework/json.hpp" #include "framework/utils.hpp" #include "qubitvector.hpp" -#include "simulators/state_chunk.hpp" +#include "simulators/chunk_utils.hpp" +#include "simulators/state.hpp" + #ifdef AER_THRUST_SUPPORTED #include "qubitvector_thrust.hpp" #endif @@ -109,9 +111,9 @@ enum class Gates { //========================================================================= template > -class State : public QuantumState::StateChunk { +class State : public QuantumState::State { public: - using BaseState = QuantumState::StateChunk; + using BaseState = QuantumState::State; State() : BaseState(StateOpSet) {} virtual ~State() = default; @@ -125,12 +127,18 @@ class State : 
public QuantumState::StateChunk { // Apply an operation // If the op is not in allowed_ops an exeption will be raised. - void apply_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, RngEngine &rng, - bool final_op = false) override; + void apply_op(const Operations::Op &op, ExperimentResult &result, + RngEngine &rng, bool final_op = false) override; + + // memory allocation (previously called before inisitalize_qreg) + bool allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots = 1) override; // Initializes an n-qubit state to the all |0> state - virtual void initialize_qreg(uint_t num_qubits) override; + void initialize_qreg(uint_t num_qubits) override; + + // Initializes to a specific n-qubit state + void initialize_statevector(uint_t num_qubits, statevec_t &&state); // Returns the required memory for storing an n-qubit state in megabytes. // For this state the memory is independent of the number of ops @@ -155,74 +163,66 @@ class State : public QuantumState::StateChunk { // Initialize OpenMP settings for the underlying QubitVector class void initialize_omp(); - // Initializes to a specific n-qubit state - virtual void initialize_qreg(uint_t num_qubits, statevec_t &&state); - - auto move_to_vector(const int_t iChunk); - auto copy_to_vector(const int_t iChunk); + auto move_to_vector(void); + auto copy_to_vector(void); -protected: //----------------------------------------------------------------------- // Apply instructions //----------------------------------------------------------------------- - // apply op to multiple shots , return flase if op is not supported to execute - // in a batch - bool apply_batched_op(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result, std::vector &rng, - bool final_op = false) override; // Applies a sypported Gate operation to the state class. // If the input is not in allowed_gates an exeption will be raised. - void apply_gate(const int_t iChunk, const Operations::Op &op); + void apply_gate(const Operations::Op &op); // Measure qubits and return a list of outcomes [q0, q1, ...] // If a state subclass supports this function it then "measure" // should be contained in the set returned by the 'allowed_ops' // method. - virtual void apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, const reg_t &cregister, - RngEngine &rng); + virtual void apply_measure(const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister, RngEngine &rng); // Reset the specified qubits to the |0> state by simulating // a measurement, applying a conditional x-gate if the outcome is 1, and // then discarding the outcome. 
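  // Illustrative sketch (not part of the patch) of that reset recipe in
  // pseudocode:
  //
  //   outcome = measure(q);      // project q onto |0> or |1>
  //   if (outcome == 1) x(q);    // flip |1> back to |0>
  //   // outcome is discarded rather than stored in a classical register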
- void apply_reset(const int_t iChunk, const reg_t &qubits, RngEngine &rng); + void apply_reset(const reg_t &qubits, RngEngine &rng); // Initialize the specified qubits to a given state |psi> // by applying a reset to the these qubits and then // computing the tensor product with the new state |psi> // /psi> is given in params - void apply_initialize(const int_t iChunk, const reg_t &qubits, - const cvector_t ¶ms, RngEngine &rng); + void apply_initialize(const reg_t &qubits, const cvector_t ¶ms, + RngEngine &rng); - void initialize_from_vector(const int_t iChunk, const cvector_t ¶ms); + void initialize_from_vector(const cvector_t ¶ms); // Apply a matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const Operations::Op &op); + void apply_matrix(const Operations::Op &op); // Apply a vectorized matrix to given qubits (identity on all other qubits) - void apply_matrix(const int_t iChunk, const reg_t &qubits, - const cvector_t &vmat); + void apply_matrix(const reg_t &qubits, const cvector_t &vmat); // apply diagonal matrix - void apply_diagonal_matrix(const int_t iChunk, const reg_t &qubits, - const cvector_t &diag); + void apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag); // Apply a vector of control matrices to given qubits (identity on all other // qubits) - void apply_multiplexer(const int_t iChunk, const reg_t &control_qubits, + void apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const std::vector &mmat); // Apply stacked (flat) version of multiplexer matrix to target qubits (using // control qubits to select matrix instance) - void apply_multiplexer(const int_t iChunk, const reg_t &control_qubits, + void apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const cmatrix_t &mat); // Apply a Kraus error operation - void apply_kraus(const int_t iChunk, const reg_t &qubits, - const std::vector &krausops, RngEngine &rng); + void apply_kraus(const reg_t &qubits, const std::vector &krausops, + RngEngine &rng); + // Return the reduced density matrix for the simulator + cmatrix_t density_matrix(const reg_t &qubits); + +protected: //----------------------------------------------------------------------- // Save data instructions //----------------------------------------------------------------------- @@ -231,27 +231,26 @@ class State : public QuantumState::StateChunk { // If `last_op` is True this will use move semantics to move the simulator // state to the results, otherwise it will use copy semantics to leave // the current simulator state unchanged. - void apply_save_statevector(const int_t iChunk, const Operations::Op &op, + void apply_save_statevector(const Operations::Op &op, ExperimentResult &result, bool last_op); // Save the current state of the statevector simulator as a ket-form map. 
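  // Illustrative note (not part of the patch): for the two-qubit state
  // (|00> + |11>)/sqrt(2) the saved map is, roughly,
  //   { "0x0": (0.7071, 0), "0x3": (0.7071, 0) }
  // with hexadecimal basis-state keys; entries whose magnitude falls below
  // json_chop_threshold_ are dropped.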
- void apply_save_statevector_dict(const int_t iChunk, const Operations::Op &op, + void apply_save_statevector_dict(const Operations::Op &op, ExperimentResult &result); // Save the current density matrix or reduced density matrix - void apply_save_density_matrix(const int_t iChunk, const Operations::Op &op, + void apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result); // Helper function for computing expectation value - void apply_save_probs(const int_t iChunk, const Operations::Op &op, - ExperimentResult &result); + void apply_save_probs(const Operations::Op &op, ExperimentResult &result); // Helper function for saving amplitudes and amplitudes squared - void apply_save_amplitudes(const int_t iChunk, const Operations::Op &op, + void apply_save_amplitudes(const Operations::Op &op, ExperimentResult &result); // Helper function for computing expectation value - virtual double expval_pauli(const int_t iChunk, const reg_t &qubits, + virtual double expval_pauli(const reg_t &qubits, const std::string &pauli) override; //----------------------------------------------------------------------- // Measurement Helpers @@ -262,7 +261,7 @@ class State : public QuantumState::StateChunk { // should be contained in the set returned by the 'allowed_ops' // method. // TODO: move to private (no longer part of base class) - rvector_t measure_probs(const int_t iChunk, const reg_t &qubits) const; + rvector_t measure_probs(const reg_t &qubits) const; // Sample the measurement outcome for qubits // return a pair (m, p) of the outcome m, and its corresponding @@ -272,18 +271,13 @@ class State : public QuantumState::StateChunk { // 1 -> |q1 = 0, q0 = 1> state // 2 -> |q1 = 1, q0 = 0> state // 3 -> |q1 = 1, q0 = 1> state - std::pair sample_measure_with_prob(const int_t iChunk, - const reg_t &qubits, + std::pair sample_measure_with_prob(const reg_t &qubits, RngEngine &rng); - void measure_reset_update(const int_t iChunk, - const std::vector &qubits, + void measure_reset_update(const std::vector &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob); - // Return the reduced density matrix for the simulator - cmatrix_t density_matrix(const int_t iChunk, const reg_t &qubits); - // Helper function to convert a vector to a reduced density matrix template cmatrix_t vec2density(const reg_t &qubits, const T &vec); @@ -293,8 +287,7 @@ class State : public QuantumState::StateChunk { //----------------------------------------------------------------------- // Optimize phase gate with diagonal [1, phase] - void apply_gate_phase(const int_t iChunk, const uint_t qubit, - const complex_t phase); + void apply_gate_phase(const uint_t qubit, const complex_t phase); //----------------------------------------------------------------------- // Multi-controlled u3 @@ -303,9 +296,8 @@ class State : public QuantumState::StateChunk { // Apply N-qubit multi-controlled single qubit gate specified by // 4 parameters u4(theta, phi, lambda, gamma) // NOTE: if N=1 this is just a regular u4 gate. 
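  // Illustrative note (not part of the patch), using the standard convention
  // in which gamma adds a phase on top of u3:
  //
  //   u4(theta, phi, lambda, gamma) =
  //     e^{i gamma} * | cos(theta/2)             -e^{i lambda} sin(theta/2)      |
  //                   | e^{i phi} sin(theta/2)    e^{i(phi+lambda)} cos(theta/2) |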
- void apply_gate_mcu(const int_t iChunk, const reg_t &qubits, - const double theta, const double phi, const double lambda, - const double gamma); + void apply_gate_mcu(const reg_t &qubits, const double theta, const double phi, + const double lambda, const double gamma); //----------------------------------------------------------------------- // Config Settings @@ -411,92 +403,23 @@ const stringmap_t State::gateset_( template void State::initialize_qreg(uint_t num_qubits) { int_t i; - if (BaseState::qregs_.size() == 0) - BaseState::allocate(num_qubits, num_qubits, 1); - initialize_omp(); - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_num_qubits(BaseState::chunk_bits_); - } + BaseState::qreg_.set_num_qubits(num_qubits); + BaseState::qreg_.initialize(); - if (BaseState::multi_chunk_distribution_) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - if (BaseState::global_chunk_index_ + iChunk == 0 || - this->num_qubits_ == this->chunk_bits_) { - BaseState::qregs_[iChunk].initialize(); - } else { - BaseState::qregs_[iChunk].zero(); - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - if (BaseState::global_chunk_index_ + i == 0 || - this->num_qubits_ == this->chunk_bits_) { - BaseState::qregs_[i].initialize(); - } else { - BaseState::qregs_[i].zero(); - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].initialize(); - } - } apply_global_phase(); } template -void State::initialize_qreg(uint_t num_qubits, statevec_t &&state) { +void State::initialize_statevector(uint_t num_qubits, + statevec_t &&state) { if (state.num_qubits() != num_qubits) { throw std::invalid_argument("QubitVector::State::initialize: initial state " "does not match qubit number"); } - if (BaseState::qregs_.size() == 1) { - BaseState::qregs_[0] = std::move(state); - } else { - if (BaseState::qregs_.size() == 0) - BaseState::allocate(num_qubits, num_qubits, 1); - initialize_omp(); - - int_t iChunk; - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); - } - - if (BaseState::multi_chunk_distribution_) { - uint_t local_offset = BaseState::global_chunk_index_ - << BaseState::chunk_bits_; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(iChunk) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) - BaseState::qregs_[iChunk].initialize_from_data( - state.data() + local_offset + - (iChunk << BaseState::chunk_bits_), - 1ull << BaseState::chunk_bits_); - } - } else { - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) - BaseState::qregs_[iChunk].initialize_from_data( - state.data() + local_offset + (iChunk << BaseState::chunk_bits_), - 1ull << BaseState::chunk_bits_); - } - } else { - for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) { - BaseState::qregs_[iChunk].initialize_from_data( - state.data(), 1ull << BaseState::chunk_bits_); - } - } - } + BaseState::qreg_ = std::move(state); apply_global_phase(); } @@ -505,12 +428,21 @@ template void State::initialize_omp() { uint_t i; - for (i = 0; i < BaseState::qregs_.size(); i++) { - 
BaseState::qregs_[i].set_omp_threshold(omp_qubit_threshold_); - if (BaseState::threads_ > 0) - BaseState::qregs_[i].set_omp_threads( - BaseState::threads_); // set allowed OMP threads in qubitvector - } + BaseState::qreg_.set_omp_threshold(omp_qubit_threshold_); + if (BaseState::threads_ > 0) // set allowed OMP threads in qubitvector + BaseState::qreg_.set_omp_threads(BaseState::threads_); +} + +template +bool State::allocate(uint_t num_qubits, uint_t block_bits, + uint_t num_parallel_shots) { + if (BaseState::max_matrix_qubits_ > 0) + BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_); + + BaseState::qreg_.set_target_gpus(BaseState::target_gpus_); + BaseState::qreg_.chunk_setup(block_bits, num_qubits, 0, 1); + + return true; } //------------------------------------------------------------------------- @@ -519,30 +451,16 @@ void State::initialize_omp() { template void State::apply_global_phase() { - if (BaseState::has_global_phase_) { - int_t i; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) - BaseState::qregs_[iChunk].apply_diagonal_matrix( - {0}, {BaseState::global_phase_, BaseState::global_phase_}); - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].apply_diagonal_matrix( - {0}, {BaseState::global_phase_, BaseState::global_phase_}); - } - } + if (BaseState::has_global_phase_) + BaseState::qreg_.apply_diagonal_matrix( + {0}, {BaseState::global_phase_, BaseState::global_phase_}); } template size_t State::required_memory_mb( uint_t num_qubits, const std::vector &ops) const { (void)ops; // avoid unused variable compiler warning - statevec_t tmp; - return tmp.required_memory_mb(num_qubits); + return BaseState::qreg_.required_memory_mb(num_qubits); } template @@ -551,9 +469,7 @@ void State::set_config(const Config &config) { // Set threshold for truncating states to be saved json_chop_threshold_ = config.zero_threshold; - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_json_chop_threshold(json_chop_threshold_); - } + BaseState::qreg_.set_json_chop_threshold(json_chop_threshold_); // Set OMP threshold for state update functions omp_qubit_threshold_ = config.statevector_parallel_threshold; @@ -561,152 +477,95 @@ void State::set_config(const Config &config) { // Set the sample measure indexing size if (config.statevector_sample_measure_opt) { int index_size = config.statevector_sample_measure_opt; - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - BaseState::qregs_[i].set_sample_measure_index_size(index_size); - } + BaseState::qreg_.set_sample_measure_index_size(index_size); } } template -auto State::move_to_vector(const int_t iChunkIn) { - if (BaseState::multi_chunk_distribution_) { - size_t size_required = - 2 * (sizeof(std::complex) << BaseState::num_qubits_) + - (sizeof(std::complex) << BaseState::chunk_bits_) * - BaseState::num_local_chunks_; - if ((size_required >> 20) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to store states")); - } - int_t iChunk; - auto state = BaseState::qregs_[0].move_to_vector(); - state.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); - -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) private(iChunk) - for (iChunk = 1; iChunk < BaseState::qregs_.size(); 
iChunk++) { - auto tmp = BaseState::qregs_[iChunk].move_to_vector(); - uint_t j, offset = iChunk << BaseState::chunk_bits_; - for (j = 0; j < tmp.size(); j++) { - state[offset + j] = tmp[j]; - } - } - -#ifdef AER_MPI - BaseState::gather_state(state); -#endif - return state; - } else { - return std::move(BaseState::qregs_[iChunkIn].move_to_vector()); - } +auto State::move_to_vector(void) { + return std::move(BaseState::qreg_.move_to_vector()); } template -auto State::copy_to_vector(const int_t iChunkIn) { - if (BaseState::multi_chunk_distribution_) { - size_t size_required = - 2 * (sizeof(std::complex) << BaseState::num_qubits_) + - (sizeof(std::complex) << BaseState::chunk_bits_) * - BaseState::num_local_chunks_; - if ((size_required >> 20) > Utils::get_system_memory_mb()) { - throw std::runtime_error( - std::string("There is not enough memory to store states")); - } - int_t iChunk; - auto state = BaseState::qregs_[0].copy_to_vector(); - state.resize(BaseState::num_local_chunks_ << BaseState::chunk_bits_); - -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) private(iChunk) - for (iChunk = 1; iChunk < BaseState::qregs_.size(); iChunk++) { - auto tmp = BaseState::qregs_[iChunk].copy_to_vector(); - uint_t j, offset = iChunk << BaseState::chunk_bits_; - for (j = 0; j < tmp.size(); j++) { - state[offset + j] = tmp[j]; - } - } - -#ifdef AER_MPI - BaseState::gather_state(state); -#endif - return state; - } else - return BaseState::qregs_[iChunkIn].copy_to_vector(); +auto State::copy_to_vector(void) { + return BaseState::qreg_.copy_to_vector(); } //========================================================================= // Implementation: apply operations //========================================================================= template -void State::apply_op(const int_t iChunk, const Operations::Op &op, +void State::apply_op(const Operations::Op &op, ExperimentResult &result, RngEngine &rng, bool final_op) { - if (BaseState::check_conditional(iChunk, op)) { + if (BaseState::creg().check_conditional(op)) { switch (op.type) { case OpType::barrier: case OpType::nop: case OpType::qerror_loc: break; case OpType::reset: - apply_reset(iChunk, op.qubits, rng); + apply_reset(op.qubits, rng); break; case OpType::initialize: - apply_initialize(iChunk, op.qubits, op.params, rng); + apply_initialize(op.qubits, op.params, rng); break; case OpType::measure: - apply_measure(iChunk, op.qubits, op.memory, op.registers, rng); + apply_measure(op.qubits, op.memory, op.registers, rng); break; case OpType::bfunc: - BaseState::cregs_[0].apply_bfunc(op); + BaseState::creg().apply_bfunc(op); break; case OpType::roerror: - BaseState::cregs_[0].apply_roerror(op, rng); + BaseState::creg().apply_roerror(op, rng); break; case OpType::gate: - apply_gate(iChunk, op); + apply_gate(op); break; case OpType::matrix: - apply_matrix(iChunk, op); + apply_matrix(op); break; case OpType::diagonal_matrix: - apply_diagonal_matrix(iChunk, op.qubits, op.params); + apply_diagonal_matrix(op.qubits, op.params); break; case OpType::multiplexer: - apply_multiplexer(iChunk, op.regs[0], op.regs[1], + apply_multiplexer(op.regs[0], op.regs[1], op.mats); // control qubits ([0]) & target qubits([1]) break; case OpType::kraus: - apply_kraus(iChunk, op.qubits, op.mats, rng); + apply_kraus(op.qubits, op.mats, rng); break; case OpType::sim_op: if (op.name == "begin_register_blocking") { - BaseState::qregs_[iChunk].enter_register_blocking(op.qubits); + BaseState::qreg_.enter_register_blocking(op.qubits); } else if (op.name == 
"end_register_blocking") { - BaseState::qregs_[iChunk].leave_register_blocking(); + BaseState::qreg_.leave_register_blocking(); } break; case OpType::set_statevec: - initialize_from_vector(iChunk, op.params); + initialize_from_vector(op.params); break; case OpType::save_expval: case OpType::save_expval_var: - BaseState::apply_save_expval(iChunk, op, result); + BaseState::apply_save_expval(op, result); break; case OpType::save_densmat: - apply_save_density_matrix(iChunk, op, result); + apply_save_density_matrix(op, result); break; case OpType::save_state: case OpType::save_statevec: - apply_save_statevector(iChunk, op, result, final_op); + apply_save_statevector(op, result, final_op); break; case OpType::save_statevec_dict: - apply_save_statevector_dict(iChunk, op, result); + apply_save_statevector_dict(op, result); break; case OpType::save_probs: case OpType::save_probs_ket: - apply_save_probs(iChunk, op, result); + apply_save_probs(op, result); break; case OpType::save_amps: case OpType::save_amps_sq: - apply_save_amplitudes(iChunk, op, result); + apply_save_amplitudes(op, result); break; default: throw std::invalid_argument("QubitVector::State::invalid instruction \'" + @@ -715,282 +574,37 @@ void State::apply_op(const int_t iChunk, const Operations::Op &op, } } -template -bool State::apply_batched_op(const int_t iChunk, - const Operations::Op &op, - ExperimentResult &result, - std::vector &rng, - bool final_op) { - if (op.conditional) { - BaseState::qregs_[iChunk].set_conditional(op.conditional_reg); - } - - switch (op.type) { - case OpType::barrier: - case OpType::nop: - case OpType::qerror_loc: - break; - case OpType::reset: - BaseState::qregs_[iChunk].apply_batched_reset(op.qubits, rng); - break; - case OpType::initialize: - BaseState::qregs_[iChunk].apply_batched_reset(op.qubits, rng); - BaseState::qregs_[iChunk].initialize_component(op.qubits, op.params); - break; - case OpType::measure: - BaseState::qregs_[iChunk].apply_batched_measure(op.qubits, rng, op.memory, - op.registers); - break; - case OpType::bfunc: - BaseState::qregs_[iChunk].apply_bfunc(op); - break; - case OpType::roerror: - BaseState::qregs_[iChunk].apply_roerror(op, rng); - break; - case OpType::gate: - apply_gate(iChunk, op); - break; - case OpType::matrix: - apply_matrix(iChunk, op); - break; - case OpType::diagonal_matrix: - BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, op.params); - break; - case OpType::multiplexer: - apply_multiplexer(iChunk, op.regs[0], op.regs[1], - op.mats); // control qubits ([0]) & target qubits([1]) - break; - case OpType::kraus: - BaseState::qregs_[iChunk].apply_batched_kraus(op.qubits, op.mats, rng); - break; - case OpType::sim_op: - if (op.name == "begin_register_blocking") { - BaseState::qregs_[iChunk].enter_register_blocking(op.qubits); - } else if (op.name == "end_register_blocking") { - BaseState::qregs_[iChunk].leave_register_blocking(); - } else { - return false; - } - break; - case OpType::set_statevec: - BaseState::qregs_[iChunk].initialize_from_vector(op.params); - break; - default: - // other operations should be called to indivisual chunks by apply_op - return false; - } - return true; -} - //========================================================================= // Implementation: Save data //========================================================================= template -void State::apply_save_probs(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_probs(const Operations::Op &op, ExperimentResult &result) { // get probs as 
hexadecimal - auto probs = measure_probs(iChunk, op.qubits); + auto probs = measure_probs(op.qubits); if (op.type == Operations::OpType::save_probs_ket) { // Convert to ket dict - result.save_data_average(BaseState::chunk_creg(iChunk), op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], Utils::vec2ket(probs, json_chop_threshold_, 16), op.type, op.save_type); } else { - result.save_data_average(BaseState::chunk_creg(iChunk), op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], std::move(probs), op.type, op.save_type); } } template -double State::expval_pauli(const int_t iChunk, const reg_t &qubits, +double State::expval_pauli(const reg_t &qubits, const std::string &pauli) { - if (!BaseState::multi_chunk_distribution_) - return BaseState::qregs_[iChunk].expval_pauli(qubits, pauli); - - // multi-chunk distribution - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - std::string pauli_in_chunk; - std::string pauli_out_chunk; - int_t i, n; - double expval(0.); - - // get inner/outer chunk pauli string - n = pauli.size(); - for (i = 0; i < n; i++) { - if (qubits[i] < BaseState::chunk_bits_) { - qubits_in_chunk.push_back(qubits[i]); - pauli_in_chunk.push_back(pauli[n - i - 1]); - } else { - qubits_out_chunk.push_back(qubits[i]); - pauli_out_chunk.push_back(pauli[n - i - 1]); - } - } - - if (qubits_out_chunk.size() > 0) { // there are bits out of chunk - std::complex phase = 1.0; - - std::reverse(pauli_out_chunk.begin(), pauli_out_chunk.end()); - std::reverse(pauli_in_chunk.begin(), pauli_in_chunk.end()); - - uint_t x_mask, z_mask, num_y, x_max; - std::tie(x_mask, z_mask, num_y, x_max) = - AER::QV::pauli_masks_and_phase(qubits_out_chunk, pauli_out_chunk); - - AER::QV::add_y_phase(num_y, phase); - - if (x_mask != 0) { // pairing state is out of chunk - bool on_same_process = true; -#ifdef AER_MPI - int proc_bits = 0; - uint_t procs = BaseState::distributed_procs_; - while (procs > 1) { - if ((procs & 1) != 0) { - proc_bits = -1; - break; - } - proc_bits++; - procs >>= 1; - } - if (x_mask & (~((1ull << (BaseState::num_qubits_ - proc_bits)) - 1)) != - 0) { // data exchange between processes is required - on_same_process = false; - } -#endif - - x_mask >>= BaseState::chunk_bits_; - z_mask >>= BaseState::chunk_bits_; - x_max -= BaseState::chunk_bits_; - - const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); - const uint_t mask_l = (1ull << x_max) - 1; - if (on_same_process) { - auto apply_expval_pauli_chunk = [this, x_mask, z_mask, x_max, mask_u, - mask_l, qubits_in_chunk, - pauli_in_chunk, phase](int_t iGroup) { - double expval = 0.0; - for (int_t iChunk = BaseState::top_chunk_of_group_[iGroup]; - iChunk < BaseState::top_chunk_of_group_[iGroup + 1]; iChunk++) { - uint_t pair_chunk = iChunk ^ x_mask; - if (iChunk < pair_chunk) { - uint_t z_count, z_count_pair; - z_count = AER::Utils::popcount(iChunk & z_mask); - z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); - - expval += - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli(qubits_in_chunk, pauli_in_chunk, - BaseState::qregs_[pair_chunk], z_count, - z_count_pair, phase); - } - } - return expval; - }; - expval += Utils::apply_omp_parallel_for_reduction( - (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0), 0, - BaseState::num_global_chunks_ / 2, apply_expval_pauli_chunk); - } else { - for (int_t i = 0; i < BaseState::num_global_chunks_ / 2; i++) { - uint_t iChunk = ((i << 1) & mask_u) | (i & mask_l); - uint_t pair_chunk = iChunk ^ x_mask; 
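          // Illustrative note (not part of the patch): with Pauli X or Y on
          // out-of-chunk qubits, chunk i pairs with chunk i ^ x_mask, so the
          // expectation value is accumulated over chunk pairs; z_mask only
          // contributes a sign, (-1)^popcount(i & z_mask).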
- uint_t iProc = BaseState::get_process_by_chunk(pair_chunk); - if (BaseState::chunk_index_begin_[BaseState::distributed_rank_] <= - iChunk && - BaseState::chunk_index_end_[BaseState::distributed_rank_] > - iChunk) { // on this process - uint_t z_count, z_count_pair; - z_count = AER::Utils::popcount(iChunk & z_mask); - z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); - - if (iProc == - BaseState::distributed_rank_) { // pair is on the same process - expval += - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli( - qubits_in_chunk, pauli_in_chunk, - BaseState::qregs_[pair_chunk - - BaseState::global_chunk_index_], - z_count, z_count_pair, phase); - } else { - BaseState::recv_chunk(iChunk - BaseState::global_chunk_index_, - pair_chunk); - // refer receive buffer to calculate expectation value - expval += - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .expval_pauli( - qubits_in_chunk, pauli_in_chunk, - BaseState::qregs_[iChunk - - BaseState::global_chunk_index_], - z_count, z_count_pair, phase); - } - } else if (iProc == - BaseState::distributed_rank_) { // pair is on this process - BaseState::send_chunk(iChunk - BaseState::global_chunk_index_, - pair_chunk); - } - } - } - } else { // no exchange between chunks - z_mask >>= BaseState::chunk_bits_; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for reduction(+ : expval) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - double e_tmp = 0.0; - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) { - double sign = 1.0; - if (z_mask && - (AER::Utils::popcount( - (iChunk + BaseState::global_chunk_index_) & z_mask) & - 1)) - sign = -1.0; - e_tmp += sign * BaseState::qregs_[iChunk].expval_pauli( - qubits_in_chunk, pauli_in_chunk); - } - expval += e_tmp; - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - double sign = 1.0; - if (z_mask && (AER::Utils::popcount( - (i + BaseState::global_chunk_index_) & z_mask) & - 1)) - sign = -1.0; - expval += sign * BaseState::qregs_[i].expval_pauli(qubits_in_chunk, - pauli_in_chunk); - } - } - } - } else { // all bits are inside chunk - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for reduction(+ : expval) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - double e_tmp = 0.0; - for (int_t iChunk = BaseState::top_chunk_of_group_[ig]; - iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) - e_tmp += BaseState::qregs_[iChunk].expval_pauli(qubits, pauli); - expval += e_tmp; - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) - expval += BaseState::qregs_[i].expval_pauli(qubits, pauli); - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(expval); -#endif - return expval; + return BaseState::qreg_.expval_pauli(qubits, pauli); } template -void State::apply_save_statevector(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_statevector(const Operations::Op &op, ExperimentResult &result, bool last_op) { - if (op.qubits.size() != BaseState::num_qubits_) { + if (op.qubits.size() != BaseState::qreg_.num_qubits()) { throw std::invalid_argument(op.name + " was not applied to all qubits." " Only the full statevector can be saved."); @@ -999,52 +613,34 @@ void State::apply_save_statevector(const int_t iChunk, (op.string_params[0] == "_method_") ? 
"statevector" : op.string_params[0]; if (last_op) { - auto v = move_to_vector(iChunk); - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, std::move(v), + auto v = move_to_vector(); + result.save_data_pershot(BaseState::creg(), key, std::move(v), OpType::save_statevec, op.save_type); } else { - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, - copy_to_vector(iChunk), OpType::save_statevec, - op.save_type); + result.save_data_pershot(BaseState::creg(), key, copy_to_vector(), + OpType::save_statevec, op.save_type); } } template -void State::apply_save_statevector_dict(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_statevector_dict(const Operations::Op &op, ExperimentResult &result) { - if (op.qubits.size() != BaseState::num_qubits_) { + if (op.qubits.size() != BaseState::qreg_.num_qubits()) { throw std::invalid_argument(op.name + " was not applied to all qubits." " Only the full statevector can be saved."); } - if (BaseState::multi_chunk_distribution_) { - auto vec = copy_to_vector(iChunk); - std::map result_state_ket; - for (size_t k = 0; k < vec.size(); ++k) { - if (std::abs(vec[k]) >= json_chop_threshold_) { - std::string key = Utils::int2hex(k); - result_state_ket.insert({key, vec[k]}); - } - } - result.save_data_pershot(BaseState::chunk_creg(iChunk), op.string_params[0], - std::move(result_state_ket), op.type, - op.save_type); - } else { - auto state_ket = BaseState::qregs_[iChunk].vector_ket(json_chop_threshold_); - std::map result_state_ket; - for (auto const &it : state_ket) { - result_state_ket[it.first] = it.second; - } - result.save_data_pershot(BaseState::chunk_creg(iChunk), op.string_params[0], - std::move(result_state_ket), op.type, - op.save_type); + auto state_ket = BaseState::qreg_.vector_ket(json_chop_threshold_); + std::map result_state_ket; + for (auto const &it : state_ket) { + result_state_ket[it.first] = it.second; } + result.save_data_pershot(BaseState::creg(), op.string_params[0], + std::move(result_state_ket), op.type, op.save_type); } template -void State::apply_save_density_matrix(const int_t iChunk, - const Operations::Op &op, +void State::apply_save_density_matrix(const Operations::Op &op, ExperimentResult &result) { cmatrix_t reduced_state; @@ -1052,34 +648,17 @@ void State::apply_save_density_matrix(const int_t iChunk, if (op.qubits.empty()) { reduced_state = cmatrix_t(1, 1); - if (!BaseState::multi_chunk_distribution_) { - reduced_state[0] = BaseState::qregs_[iChunk].norm(); - } else { - double sum = 0.0; - if (BaseState::chunk_omp_parallel_) { -#pragma omp parallel for reduction(+ : sum) - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - sum += BaseState::qregs_[i].norm(); - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - sum += BaseState::qregs_[i].norm(); - } -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - reduced_state[0] = sum; - } + reduced_state[0] = BaseState::qreg_.norm(); } else { - reduced_state = density_matrix(iChunk, op.qubits); + reduced_state = density_matrix(op.qubits); } - result.save_data_average(BaseState::chunk_creg(iChunk), op.string_params[0], + result.save_data_average(BaseState::creg(), op.string_params[0], std::move(reduced_state), op.type, op.save_type); } template -void State::apply_save_amplitudes(const int_t iChunkIn, - const Operations::Op &op, +void State::apply_save_amplitudes(const Operations::Op &op, ExperimentResult &result) { if (op.int_params.empty()) { throw std::invalid_argument( @@ -1088,63 +667,24 @@ void 
State::apply_save_amplitudes(const int_t iChunkIn, const int_t size = op.int_params.size(); if (op.type == Operations::OpType::save_amps) { Vector amps(size, false); - if (!BaseState::multi_chunk_distribution_) { - for (int_t i = 0; i < size; ++i) { - amps[i] = BaseState::qregs_[iChunkIn].get_state(op.int_params[i]); - } - } else { - for (int_t i = 0; i < size; ++i) { - uint_t idx = BaseState::mapped_index(op.int_params[i]); - uint_t iChunk = idx >> BaseState::chunk_bits_; - amps[i] = 0.0; - if (iChunk >= BaseState::global_chunk_index_ && - iChunk < - BaseState::global_chunk_index_ + BaseState::qregs_.size()) { - amps[i] = BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .get_state(idx - (iChunk << BaseState::chunk_bits_)); - } -#ifdef AER_MPI - complex_t amp = amps[i]; - BaseState::reduce_sum(amp); - amps[i] = amp; -#endif - } + for (int_t i = 0; i < size; ++i) { + amps[i] = BaseState::qreg_.get_state(op.int_params[i]); } - result.save_data_pershot(BaseState::chunk_creg(iChunkIn), - op.string_params[0], std::move(amps), op.type, - op.save_type); + result.save_data_pershot(BaseState::creg(), op.string_params[0], + std::move(amps), op.type, op.save_type); } else { rvector_t amps_sq(size, 0); - if (!BaseState::multi_chunk_distribution_) { - for (int_t i = 0; i < size; ++i) { - amps_sq[i] = BaseState::qregs_[iChunkIn].probability(op.int_params[i]); - } - } else { - for (int_t i = 0; i < size; ++i) { - uint_t idx = BaseState::mapped_index(op.int_params[i]); - uint_t iChunk = idx >> BaseState::chunk_bits_; - if (iChunk >= BaseState::global_chunk_index_ && - iChunk < - BaseState::global_chunk_index_ + BaseState::qregs_.size()) { - amps_sq[i] = - BaseState::qregs_[iChunk - BaseState::global_chunk_index_] - .probability(idx - (iChunk << BaseState::chunk_bits_)); - } - } -#ifdef AER_MPI - BaseState::reduce_sum(amps_sq); -#endif + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = BaseState::qreg_.probability(op.int_params[i]); } - result.save_data_average(BaseState::chunk_creg(iChunkIn), - op.string_params[0], std::move(amps_sq), op.type, - op.save_type); + result.save_data_average(BaseState::creg(), op.string_params[0], + std::move(amps_sq), op.type, op.save_type); } } template -cmatrix_t State::density_matrix(const int_t iChunk, - const reg_t &qubits) { - return vec2density(qubits, copy_to_vector(iChunk)); +cmatrix_t State::density_matrix(const reg_t &qubits) { + return vec2density(qubits, copy_to_vector()); } template @@ -1157,7 +697,7 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { // Return full density matrix cmatrix_t densmat(DIM, DIM); - if ((N == BaseState::num_qubits_) && (qubits == qubits_sorted)) { + if ((N == BaseState::qreg_.num_qubits()) && (qubits == qubits_sorted)) { const int_t mask = QV::MASKS[N]; #pragma omp parallel for if (2 * N > omp_qubit_threshold_ && \ BaseState::threads_ > 1) \ @@ -1168,7 +708,7 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { densmat(row, col) = complex_t(vec[row]) * complex_t(std::conj(vec[col])); } } else { - const size_t END = 1ULL << (BaseState::num_qubits_ - N); + const size_t END = 1ULL << (BaseState::qreg_.num_qubits() - N); // Initialize matrix values with first block { const auto inds = QV::indexes(qubits, qubits_sorted, 0); @@ -1197,20 +737,23 @@ cmatrix_t State::vec2density(const reg_t &qubits, const T &vec) { //========================================================================= template -void State::apply_gate(const int_t iChunk, - const Operations::Op &op) { - if 
(!BaseState::global_chunk_indexing_) { +void State::apply_gate(const Operations::Op &op) { + // CPU qubit vector does not handle chunk ID inside kernel, so modify op here + if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() && + !BaseState::qreg_.support_global_indexing()) { reg_t qubits_in, qubits_out; - BaseState::get_inout_ctrl_qubits(op, qubits_out, qubits_in); + if (op.name[0] == 'c' || op.name.find("mc") == 0) { + Chunk::get_inout_ctrl_qubits(op, BaseState::qreg_.num_qubits(), qubits_in, + qubits_out); + } if (qubits_out.size() > 0) { uint_t mask = 0; for (int i = 0; i < qubits_out.size(); i++) { - mask |= (1ull << (qubits_out[i] - BaseState::chunk_bits_)); + mask |= (1ull << (qubits_out[i] - BaseState::qreg_.num_qubits())); } - if (((BaseState::global_chunk_index_ + iChunk) & mask) == mask) { - Operations::Op new_op = - BaseState::remake_gate_in_chunk_qubits(op, qubits_in); - apply_gate(iChunk, new_op); + if ((BaseState::qreg_.chunk_index() & mask) == mask) { + Operations::Op new_op = Chunk::correct_gate_op_in_chunk(op, qubits_in); + apply_gate(new_op); } return; } @@ -1224,103 +767,102 @@ void State::apply_gate(const int_t iChunk, switch (it->second) { case Gates::mcx: // Includes X, CX, CCX, etc - BaseState::qregs_[iChunk].apply_mcx(op.qubits); + BaseState::qreg_.apply_mcx(op.qubits); break; case Gates::mcy: // Includes Y, CY, CCY, etc - BaseState::qregs_[iChunk].apply_mcy(op.qubits); + BaseState::qreg_.apply_mcy(op.qubits); break; case Gates::mcz: // Includes Z, CZ, CCZ, etc - BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1); + BaseState::qreg_.apply_mcphase(op.qubits, -1); break; case Gates::mcr: - BaseState::qregs_[iChunk].apply_mcu( - op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); + BaseState::qreg_.apply_mcu(op.qubits, + Linalg::VMatrix::r(op.params[0], op.params[1])); break; case Gates::mcrx: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::x, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::x, + std::real(op.params[0])); break; case Gates::mcry: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::y, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::y, + std::real(op.params[0])); break; case Gates::mcrz: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::z, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::z, + std::real(op.params[0])); break; case Gates::rxx: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::xx, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::xx, + std::real(op.params[0])); break; case Gates::ryy: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::yy, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::yy, + std::real(op.params[0])); break; case Gates::rzz: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zz, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::zz, + std::real(op.params[0])); break; case Gates::rzx: - BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zx, - std::real(op.params[0])); + BaseState::qreg_.apply_rotation(op.qubits, QV::Rotation::zx, + std::real(op.params[0])); break; case Gates::ecr: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ECR); + BaseState::qreg_.apply_matrix(op.qubits, Linalg::VMatrix::ECR); case Gates::id: break; case Gates::h: - 
apply_gate_mcu(iChunk, op.qubits, M_PI / 2., 0., M_PI, 0.); + apply_gate_mcu(op.qubits, M_PI / 2., 0., M_PI, 0.); break; case Gates::s: - apply_gate_phase(iChunk, op.qubits[0], complex_t(0., 1.)); + apply_gate_phase(op.qubits[0], complex_t(0., 1.)); break; case Gates::sdg: - apply_gate_phase(iChunk, op.qubits[0], complex_t(0., -1.)); + apply_gate_phase(op.qubits[0], complex_t(0., -1.)); break; case Gates::t: { const double isqrt2{1. / std::sqrt(2)}; - apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, isqrt2)); + apply_gate_phase(op.qubits[0], complex_t(isqrt2, isqrt2)); } break; case Gates::tdg: { const double isqrt2{1. / std::sqrt(2)}; - apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, -isqrt2)); + apply_gate_phase(op.qubits[0], complex_t(isqrt2, -isqrt2)); } break; case Gates::mcswap: // Includes SWAP, CSWAP, etc - BaseState::qregs_[iChunk].apply_mcswap(op.qubits); + BaseState::qreg_.apply_mcswap(op.qubits); break; case Gates::mcu3: // Includes u3, cu3, etc - apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]), - std::real(op.params[1]), std::real(op.params[2]), 0.); + apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]), + std::real(op.params[2]), 0.); break; case Gates::mcu: // Includes u3, cu3, etc - apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]), - std::real(op.params[1]), std::real(op.params[2]), - std::real(op.params[3])); + apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]), + std::real(op.params[2]), std::real(op.params[3])); break; case Gates::mcu2: // Includes u2, cu2, etc - apply_gate_mcu(iChunk, op.qubits, M_PI / 2., std::real(op.params[0]), + apply_gate_mcu(op.qubits, M_PI / 2., std::real(op.params[0]), std::real(op.params[1]), 0.); break; case Gates::mcp: // Includes u1, cu1, p, cp, mcp etc - BaseState::qregs_[iChunk].apply_mcphase( - op.qubits, std::exp(complex_t(0, 1) * op.params[0])); + BaseState::qreg_.apply_mcphase(op.qubits, + std::exp(complex_t(0, 1) * op.params[0])); break; case Gates::mcsx: // Includes sx, csx, mcsx etc - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SX); + BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SX); break; case Gates::mcsxdg: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SXDG); + BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SXDG); break; case Gates::pauli: - BaseState::qregs_[iChunk].apply_pauli(op.qubits, op.string_params[0]); + BaseState::qreg_.apply_pauli(op.qubits, op.string_params[0]); break; default: // We shouldn't reach here unless there is a bug in gateset @@ -1330,74 +872,67 @@ void State::apply_gate(const int_t iChunk, } template -void State::apply_multiplexer(const int_t iChunk, - const reg_t &control_qubits, +void State::apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const cmatrix_t &mat) { if (control_qubits.empty() == false && target_qubits.empty() == false && mat.size() > 0) { cvector_t vmat = Utils::vectorize_matrix(mat); - BaseState::qregs_[iChunk].apply_multiplexer(control_qubits, target_qubits, - vmat); + BaseState::qreg_.apply_multiplexer(control_qubits, target_qubits, vmat); } } template -void State::apply_matrix(const int_t iChunk, - const Operations::Op &op) { +void State::apply_matrix(const Operations::Op &op) { if (op.qubits.empty() == false && op.mats[0].size() > 0) { if (Utils::is_diagonal(op.mats[0], .0)) { - apply_diagonal_matrix(iChunk, op.qubits, - Utils::matrix_diagonal(op.mats[0])); + apply_diagonal_matrix(op.qubits, 
Utils::matrix_diagonal(op.mats[0])); } else { - BaseState::qregs_[iChunk].apply_matrix( - op.qubits, Utils::vectorize_matrix(op.mats[0])); + BaseState::qreg_.apply_matrix(op.qubits, + Utils::vectorize_matrix(op.mats[0])); } } } template -void State::apply_matrix(const int_t iChunk, const reg_t &qubits, +void State::apply_matrix(const reg_t &qubits, const cvector_t &vmat) { // Check if diagonal matrix if (vmat.size() == 1ULL << qubits.size()) { - apply_diagonal_matrix(iChunk, qubits, vmat); + apply_diagonal_matrix(qubits, vmat); } else { - BaseState::qregs_[iChunk].apply_matrix(qubits, vmat); + BaseState::qreg_.apply_matrix(qubits, vmat); } } template -void State::apply_diagonal_matrix(const int_t iChunk, - const reg_t &qubits, +void State::apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag) { - if (BaseState::global_chunk_indexing_ || - !BaseState::multi_chunk_distribution_) { - // GPU computes all chunks in one kernel, so pass qubits and diagonal matrix - // as is - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, diag); - } else { + if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() && + !BaseState::qreg_.support_global_indexing()) { reg_t qubits_in = qubits; cvector_t diag_in = diag; - - BaseState::block_diagonal_matrix(iChunk, qubits_in, diag_in); - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_in, diag_in); + Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(), + BaseState::qreg_.num_qubits(), qubits_in, + diag_in); + BaseState::qreg_.apply_diagonal_matrix(qubits_in, diag_in); + } else { + BaseState::qreg_.apply_diagonal_matrix(qubits, diag); } } template -void State::apply_gate_mcu(const int_t iChunk, const reg_t &qubits, - double theta, double phi, double lambda, +void State::apply_gate_mcu(const reg_t &qubits, double theta, + double phi, double lambda, double gamma) { - BaseState::qregs_[iChunk].apply_mcu( - qubits, Linalg::VMatrix::u4(theta, phi, lambda, gamma)); + BaseState::qreg_.apply_mcu(qubits, + Linalg::VMatrix::u4(theta, phi, lambda, gamma)); } template -void State::apply_gate_phase(const int_t iChunk, uint_t qubit, - complex_t phase) { +void State::apply_gate_phase(uint_t qubit, complex_t phase) { cvector_t diag = {{1., phase}}; - apply_diagonal_matrix(iChunk, reg_t({qubit}), diag); + apply_diagonal_matrix(reg_t({qubit}), diag); } //========================================================================= @@ -1405,163 +940,41 @@ void State::apply_gate_phase(const int_t iChunk, uint_t qubit, //========================================================================= template -void State::apply_measure(const int_t iChunk, const reg_t &qubits, - const reg_t &cmemory, +void State::apply_measure(const reg_t &qubits, const reg_t &cmemory, const reg_t &cregister, RngEngine &rng) { - int_t ishot = BaseState::get_global_shot_index(iChunk); // Actual measurement outcome - const auto meas = sample_measure_with_prob(iChunk, qubits, rng); + const auto meas = sample_measure_with_prob(qubits, rng); // Implement measurement update - measure_reset_update(iChunk, qubits, meas.first, meas.first, meas.second); + measure_reset_update(qubits, meas.first, meas.first, meas.second); const reg_t outcome = Utils::int2reg(meas.first, 2, qubits.size()); - BaseState::cregs_[ishot].store_measure(outcome, cmemory, cregister); + BaseState::creg().store_measure(outcome, cmemory, cregister); } template -rvector_t State::measure_probs(const int_t iChunk, - const reg_t &qubits) const { - if (!BaseState::multi_chunk_distribution_) - return 
BaseState::qregs_[iChunk].probabilities(qubits); - - uint_t dim = 1ull << qubits.size(); - rvector_t sum(dim, 0.0); - int_t i, j, k; - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - BaseState::qubits_inout(qubits, qubits_in_chunk, qubits_out_chunk); - - if (qubits_in_chunk.size() > 0) { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(i, j, k) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { -#pragma omp atomic - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < BaseState::chunk_bits_) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << BaseState::chunk_bits_) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } -#pragma omp atomic - sum[idx] += chunkSum[j]; - } - } - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - - if (qubits_in_chunk.size() == qubits.size()) { - for (j = 0; j < dim; j++) { - sum[j] += chunkSum[j]; - } - } else { - for (j = 0; j < chunkSum.size(); j++) { - int idx = 0; - int i_in = 0; - for (k = 0; k < qubits.size(); k++) { - if (qubits[k] < BaseState::chunk_bits_) { - idx += (((j >> i_in) & 1) << k); - i_in++; - } else { - if ((((i + BaseState::global_chunk_index_) - << BaseState::chunk_bits_) >> - qubits[k]) & - 1) { - idx += 1ull << k; - } - } - } - sum[idx] += chunkSum[j]; - } - } - } - } - } else { // there is no bit in chunk - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for private(i, j, k) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) { - auto nr = std::real(BaseState::qregs_[i].norm()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } -#pragma omp atomic - sum[idx] += nr; - } - } - } else { - for (i = 0; i < BaseState::qregs_.size(); i++) { - auto nr = std::real(BaseState::qregs_[i].norm()); - int idx = 0; - for (k = 0; k < qubits_out_chunk.size(); k++) { - if ((((i + BaseState::global_chunk_index_) - << (BaseState::chunk_bits_)) >> - qubits_out_chunk[k]) & - 1) { - idx += 1ull << k; - } - } - sum[idx] += nr; - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(sum); -#endif - - return sum; +rvector_t State::measure_probs(const reg_t &qubits) const { + return BaseState::qreg_.probabilities(qubits); } template -void State::apply_reset(const int_t iChunk, const reg_t &qubits, - RngEngine &rng) { +void State::apply_reset(const reg_t &qubits, RngEngine &rng) { // Simulate unobserved measurement - const auto meas = sample_measure_with_prob(iChunk, qubits, rng); + const auto meas = sample_measure_with_prob(qubits, rng); // Apply update to reset state - measure_reset_update(iChunk, qubits, 0, meas.first, meas.second); + measure_reset_update(qubits, 0, meas.first, meas.second); } template -std::pair State::sample_measure_with_prob( - const int_t iChunk, const reg_t 
&qubits, RngEngine &rng) { - rvector_t probs = measure_probs(iChunk, qubits); +std::pair +State::sample_measure_with_prob(const reg_t &qubits, + RngEngine &rng) { + rvector_t probs = measure_probs(qubits); // Randomly pick outcome and return pair uint_t outcome = rng.rand_int(probs); return std::make_pair(outcome, probs[outcome]); } template -void State::measure_reset_update(const int_t iChunk, - const std::vector &qubits, +void State::measure_reset_update(const std::vector &qubits, const uint_t final_state, const uint_t meas_state, const double meas_prob) { @@ -1575,32 +988,11 @@ void State::measure_reset_update(const int_t iChunk, cvector_t mdiag(2, 0.); mdiag[meas_state] = 1. / std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } - } + BaseState::qreg_.apply_diagonal_matrix(qubits, mdiag); // If it doesn't agree with the reset state update - if (final_state != meas_state) { - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_mcx(qubits); - else - BaseState::apply_chunk_x(qubits[0]); - } + if (final_state != meas_state) + BaseState::qreg_.apply_mcx(qubits); } // Multi qubit case else { @@ -1609,53 +1001,21 @@ void State::measure_reset_update(const int_t iChunk, cvector_t mdiag(dim, 0.); mdiag[meas_state] = 1. 
/ std::sqrt(meas_prob); - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_diagonal_matrix(ic, qubits, mdiag); - } - } - } + BaseState::qreg_.apply_diagonal_matrix(qubits, mdiag); // If it doesn't agree with the reset state update // This function could be optimized as a permutation update if (final_state != meas_state) { - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - - BaseState::qubits_inout(qubits, qubits_in_chunk, qubits_out_chunk); - - if (!BaseState::multi_chunk_distribution_ || - qubits_in_chunk.size() == - qubits.size()) { // all bits are inside chunk - // build vectorized permutation matrix - cvector_t perm(dim * dim, 0.); - perm[final_state * dim + meas_state] = 1.; - perm[meas_state * dim + final_state] = 1.; - for (size_t j = 0; j < dim; j++) { - if (j != final_state && j != meas_state) - perm[j * dim + j] = 1.; - } - // apply permutation to swap state - apply_matrix(iChunk, qubits, perm); - } else { - for (int_t i = 0; i < qubits.size(); i++) { - if (((final_state >> i) & 1) != ((meas_state >> i) & 1)) { - BaseState::apply_chunk_x(qubits[i]); - } - } + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != meas_state) + perm[j * dim + j] = 1.; } + // apply permutation to swap state + apply_matrix(qubits, perm); } } } @@ -1673,100 +1033,13 @@ std::vector State::sample_measure(const reg_t &qubits, for (i = 0; i < shots; ++i) rnds.push_back(rng.rand(0, 1)); - if (!BaseState::multi_chunk_distribution_) - allbit_samples = BaseState::qregs_[0].sample_measure(rnds); - else { - std::vector chunkSum(BaseState::qregs_.size() + 1, 0); - double sum, localSum; - - // calculate per chunk sum - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) { - bool batched = BaseState::qregs_[ic].enable_batch( - true); // return sum of all chunks in group - chunkSum[ic] = BaseState::qregs_[ic].norm(); - BaseState::qregs_[ic].enable_batch(batched); - } - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) { - bool batched = BaseState::qregs_[ic].enable_batch( - true); // return sum of all chunks in group - chunkSum[ic] = BaseState::qregs_[ic].norm(); - BaseState::qregs_[ic].enable_batch(batched); - } - } - } - - localSum = 0.0; - for (i = 0; i < BaseState::qregs_.size(); i++) { - sum = localSum; - localSum += chunkSum[i]; - chunkSum[i] = sum; - } - chunkSum[BaseState::qregs_.size()] = localSum; - - double globalSum = 0.0; - if (BaseState::nprocs_ > 1) { - std::vector procTotal(BaseState::nprocs_); - - for (i = 0; i < BaseState::nprocs_; i++) { - 
procTotal[i] = localSum; - } - - BaseState::gather_value(procTotal); - - for (i = 0; i < BaseState::myrank_; i++) { - globalSum += procTotal[i]; - } - } - - reg_t local_samples(shots, 0); - - // get rnds positions for each chunk - for (i = 0; i < BaseState::qregs_.size(); i++) { - uint_t nIn; - std::vector vIdx; - std::vector vRnd; - - // find rnds in this chunk - nIn = 0; - for (j = 0; j < shots; j++) { - if (rnds[j] >= chunkSum[i] + globalSum && - rnds[j] < chunkSum[i + 1] + globalSum) { - vRnd.push_back(rnds[j] - (globalSum + chunkSum[i])); - vIdx.push_back(j); - nIn++; - } - } - - if (nIn > 0) { - auto chunkSamples = BaseState::qregs_[i].sample_measure(vRnd); - - for (j = 0; j < chunkSamples.size(); j++) { - local_samples[vIdx[j]] = - ((BaseState::global_chunk_index_ + i) << BaseState::chunk_bits_) + - chunkSamples[j]; - } - } - } - -#ifdef AER_MPI - BaseState::reduce_sum(local_samples); -#endif - allbit_samples = local_samples; - } + allbit_samples = BaseState::qreg_.sample_measure(rnds); // Convert to reg_t format std::vector all_samples; all_samples.reserve(shots); for (int_t val : allbit_samples) { - reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::num_qubits_); + reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits()); reg_t sample; sample.reserve(qubits.size()); for (uint_t qubit : qubits) { @@ -1779,161 +1052,29 @@ std::vector State::sample_measure(const reg_t &qubits, } template -void State::apply_initialize(const int_t iChunk, - const reg_t &qubits, +void State::apply_initialize(const reg_t &qubits, const cvector_t ¶ms, RngEngine &rng) { auto sorted_qubits = qubits; std::sort(sorted_qubits.begin(), sorted_qubits.end()); - if (qubits.size() == BaseState::num_qubits_) { + if (qubits.size() == BaseState::qreg_.num_qubits()) { // If qubits is all ordered qubits in the statevector // we can just initialize the whole state directly if (qubits == sorted_qubits) { - initialize_from_vector(iChunk, params); + initialize_from_vector(params); return; } } // Apply reset to qubits - apply_reset(iChunk, qubits, rng); + apply_reset(qubits, rng); // Apply initialize_component - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].initialize_component(qubits, params); - else { - reg_t qubits_in_chunk; - reg_t qubits_out_chunk; - BaseState::qubits_inout(qubits, qubits_in_chunk, qubits_out_chunk); - - if (qubits_out_chunk.size() == 0) { // no qubits outside of chunk - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - BaseState::qregs_[i].initialize_component(qubits, params); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - BaseState::qregs_[i].initialize_component(qubits, params); - } - } else { - // scatter base states - if (qubits_in_chunk.size() > 0) { - // scatter inside chunks - const size_t dim = 1ULL << qubits_in_chunk.size(); - cvector_t perm(dim * dim, 0.); - for (int_t i = 0; i < dim; i++) { - perm[i] = 1.0; - } - - if (BaseState::chunk_omp_parallel_) { -#pragma omp parallel for - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_matrix(i, qubits_in_chunk, perm); - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_matrix(i, qubits_in_chunk, perm); - } - } - if (qubits_out_chunk.size() > 0) { - // then scatter outside chunk - auto sorted_qubits_out = qubits_out_chunk; - 
std::sort(sorted_qubits_out.begin(), sorted_qubits_out.end()); - - for (int_t i = 0; - i < (1ull << (BaseState::num_qubits_ - BaseState::chunk_bits_ - - qubits_out_chunk.size())); - i++) { - uint_t baseChunk = 0; - uint_t j, ii, t; - ii = i; - for (j = 0; j < qubits_out_chunk.size(); j++) { - t = ii & ((1ull << qubits_out_chunk[j]) - 1); - baseChunk += t; - ii = (ii - t) << 1; - } - baseChunk += ii; - baseChunk >>= BaseState::chunk_bits_; - - for (j = 1; j < (1ull << qubits_out_chunk.size()); j++) { - int_t ic = baseChunk; - for (t = 0; t < qubits_out_chunk.size(); t++) { - if ((j >> t) & 1) - ic += (1ull << (qubits_out_chunk[t] - BaseState::chunk_bits_)); - } - - if (ic >= BaseState::chunk_index_begin_ - [BaseState::distributed_rank_] && - ic < BaseState::chunk_index_end_ - [BaseState::distributed_rank_]) { // on this process - if (baseChunk >= BaseState::chunk_index_begin_ - [BaseState::distributed_rank_] && - baseChunk < - BaseState::chunk_index_end_ - [BaseState::distributed_rank_]) { // base chunk is on - // this process - BaseState::qregs_[ic].initialize_from_data( - BaseState::qregs_[baseChunk].data(), - 1ull << BaseState::chunk_bits_); - } else { - BaseState::recv_chunk(ic, baseChunk); - // using swap chunk function to release send/recv buffers for - // Thrust - reg_t swap(2); - swap[0] = BaseState::chunk_bits_; - swap[1] = BaseState::chunk_bits_; - BaseState::qregs_[ic].apply_chunk_swap(swap, baseChunk); - } - } else if (baseChunk >= BaseState::chunk_index_begin_ - [BaseState::distributed_rank_] && - baseChunk < - BaseState::chunk_index_end_ - [BaseState::distributed_rank_]) { // base chunk - // is on this - // process - BaseState::send_chunk(baseChunk - BaseState::global_chunk_index_, - ic); - } - } - } - } - - // initialize by params - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - apply_diagonal_matrix(i, qubits, params); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - apply_diagonal_matrix(i, qubits, params); - } - } - } + BaseState::qreg_.initialize_component(qubits, params); } template -void State::initialize_from_vector(const int_t iChunk, - const cvector_t ¶ms) { - if (!BaseState::multi_chunk_distribution_) - BaseState::qregs_[iChunk].initialize_from_vector(params); - else { // multi-chunk distribution - uint_t local_offset = BaseState::global_chunk_index_ - << BaseState::chunk_bits_; - -#pragma omp parallel for if (BaseState::chunk_omp_parallel_) - for (int_t i = 0; i < BaseState::qregs_.size(); i++) { - // copy part of state for this chunk - cvector_t tmp(1ull << BaseState::chunk_bits_); - std::copy(params.begin() + local_offset + (i << BaseState::chunk_bits_), - params.begin() + local_offset + - ((i + 1) << BaseState::chunk_bits_), - tmp.begin()); - BaseState::qregs_[i].initialize_from_vector(tmp); - } - } +void State::initialize_from_vector(const cvector_t ¶ms) { + BaseState::qreg_.initialize_from_vector(params); } //========================================================================= @@ -1941,8 +1082,7 @@ void State::initialize_from_vector(const int_t iChunk, //========================================================================= template -void State::apply_multiplexer(const int_t iChunk, - const reg_t &control_qubits, +void State::apply_multiplexer(const reg_t &control_qubits, const reg_t &target_qubits, const std::vector 
&mmat) { // (1) Pack vector of matrices into single (stacked) matrix ... note: matrix @@ -1950,14 +1090,14 @@ void State::apply_multiplexer(const int_t iChunk, cmatrix_t multiplexer_matrix = Utils::stacked_matrix(mmat); // (2) Treat as single, large(r), chained/batched matrix operator - apply_multiplexer(iChunk, control_qubits, target_qubits, multiplexer_matrix); + apply_multiplexer(control_qubits, target_qubits, multiplexer_matrix); } //========================================================================= // Implementation: Kraus Noise //========================================================================= template -void State::apply_kraus(const int_t iChunk, const reg_t &qubits, +void State::apply_kraus(const reg_t &qubits, const std::vector &kmats, RngEngine &rng) { // Check edge case for empty Kraus set (this shouldn't happen) @@ -1981,52 +1121,14 @@ void State::apply_kraus(const int_t iChunk, const reg_t &qubits, // Calculate probability cvector_t vmat = Utils::vectorize_matrix(kmats[j]); - if (!BaseState::multi_chunk_distribution_) { - p = BaseState::qregs_[iChunk].norm(qubits, vmat); - accum += p; - } else { - p = 0.0; - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) { -#pragma omp parallel for reduction(+ : p) - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t i = BaseState::top_chunk_of_group_[ig]; - i < BaseState::top_chunk_of_group_[ig + 1]; i++) - p += BaseState::qregs_[i].norm(qubits, vmat); - } - } else { - for (int_t i = 0; i < BaseState::qregs_.size(); i++) - p += BaseState::qregs_[i].norm(qubits, vmat); - } - -#ifdef AER_MPI - BaseState::reduce_sum(p); -#endif - accum += p; - } - + p = BaseState::qreg_.norm(qubits, vmat); + accum += p; // check if we need to apply this operator if (accum > r) { // rescale vmat so projection is normalized Utils::scalar_multiply_inplace(vmat, 1 / std::sqrt(p)); // apply Kraus projection operator - if (!BaseState::multi_chunk_distribution_) - apply_matrix(iChunk, qubits, vmat); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } - } + apply_matrix(qubits, vmat); complete = true; break; } @@ -2037,24 +1139,7 @@ void State::apply_kraus(const int_t iChunk, const reg_t &qubits, // Compute probability from accumulated complex_t renorm = 1 / std::sqrt(1. 
- accum); auto vmat = Utils::vectorize_matrix(renorm * kmats.back()); - if (!BaseState::multi_chunk_distribution_) - apply_matrix(iChunk, qubits, vmat); - else { - if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) { -#pragma omp parallel for - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } else { - for (int_t ig = 0; ig < BaseState::num_groups_; ig++) { - for (int_t ic = BaseState::top_chunk_of_group_[ig]; - ic < BaseState::top_chunk_of_group_[ig + 1]; ic++) - apply_matrix(ic, qubits, vmat); - } - } - } + apply_matrix(qubits, vmat); } } diff --git a/src/simulators/superoperator/superoperator.hpp b/src/simulators/superoperator/superoperator.hpp index c9264fb638..54343ad932 100644 --- a/src/simulators/superoperator/superoperator.hpp +++ b/src/simulators/superoperator/superoperator.hpp @@ -61,6 +61,12 @@ class Superoperator : public DensityMatrix { // Initialize to the identity superoperator void initialize(); + // initialize from existing state (copy) + void initialize(const Superoperator &obj) { + BaseDensity::copy_qv(obj); + num_qubits_ = obj.num_qubits_; + } + // Initializes the vector to a custom initial state. // The matrix can either be superoperator matrix or unitary matrix. // The type is inferred by the dimensions of the input matrix. diff --git a/src/simulators/superoperator/superoperator_thrust.hpp b/src/simulators/superoperator/superoperator_thrust.hpp index 6ad6c8ce1a..538122c29d 100644 --- a/src/simulators/superoperator/superoperator_thrust.hpp +++ b/src/simulators/superoperator/superoperator_thrust.hpp @@ -61,6 +61,12 @@ class SuperoperatorThrust : public DensityMatrixThrust { // Initialize to the identity superoperator void initialize(); + // initialize from existing state (copy) + void initialize(const SuperoperatorThrust &obj) { + BaseDensity::copy_qv(obj); + num_qubits_ = obj.num_qubits_; + } + // Initializes the vector to a custom initial state. // The matrix can either be superoperator matrix or unitary matrix. // The type is inferred by the dimensions of the input matrix. 
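For context, the `initialize(const Superoperator &obj)` overloads added above give an executor a way to clone a parent state when forking work for a new branch. A minimal sketch of the intended call pattern, assuming the repository's internal headers; the function and variable names below are illustrative, not part of this diff:

```cpp
// Minimal sketch (not part of this diff) of the copy-initialize overload.
// Assumes qiskit-aer's internal headers; names are illustrative.
#include "simulators/superoperator/superoperator.hpp"

void fork_example() {
  AER::QV::Superoperator<double> parent;
  parent.set_num_qubits(2); // setter assumed from the underlying vector class
  parent.initialize();      // identity superoperator

  AER::QV::Superoperator<double> branch;
  branch.initialize(parent); // copies the raw data (copy_qv) and num_qubits_
}
```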
diff --git a/src/simulators/tensor_network/tensor_net_contractor.hpp b/src/simulators/tensor_network/tensor_net_contractor.hpp
index b8d6f0c3b9..00b28fe733 100644
--- a/src/simulators/tensor_network/tensor_net_contractor.hpp
+++ b/src/simulators/tensor_network/tensor_net_contractor.hpp
@@ -54,6 +54,8 @@ class TensorNetContractor {
   virtual void
   allocate_sampling_buffers(uint_t size = AER_TENSOR_NET_MAX_SAMPLING) = 0;
   virtual void deallocate_sampling_buffers(void) = 0;
+
+  virtual void set_target_gpus(reg_t &t) {}
 };
 
 template
diff --git a/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp b/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp
index 1d52e1674b..cc69b93e38 100644
--- a/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp
+++ b/src/simulators/tensor_network/tensor_net_contractor_cuTensorNet.hpp
@@ -842,6 +842,8 @@ class TensorNetContractor_cuTensorNet : public TensorNetContractor {
   int nprocs_ = 1;
   int myrank_ = 0;
 
+  reg_t target_gpus_;
+
 public:
   TensorNetContractor_cuTensorNet();
   ~TensorNetContractor_cuTensorNet();
@@ -872,6 +874,8 @@ class TensorNetContractor_cuTensorNet : public TensorNetContractor {
   allocate_sampling_buffers(uint_t size = AER_TENSOR_NET_MAX_SAMPLING) override;
   void deallocate_sampling_buffers(void) override;
 
+  void set_target_gpus(reg_t &t) override { target_gpus_ = t; }
+
 protected:
   void remove_additional_tensors(void);
@@ -903,10 +907,18 @@ void TensorNetContractor_cuTensorNet::set_network(
   // allocate tensor data storage for each device
   if (cudaGetDeviceCount(&num_devices_) != cudaSuccess)
     cudaGetLastError();
+  if (target_gpus_.size() > 0) {
+    num_devices_ = target_gpus_.size();
+  } else {
+    target_gpus_.resize(num_devices_);
+    for (int_t i = 0; i < num_devices_; i++)
+      target_gpus_[i] = i;
+  }
+
   tensor_data_.clear();
   tensor_data_.resize(num_devices_);
   for (int_t i = 0; i < num_devices_; i++) {
-    tensor_data_[i].set_device(i);
+    tensor_data_[i].set_device(target_gpus_[i]);
   }
 
   // count number of tensors
@@ -1022,7 +1034,7 @@ void TensorNetContractor_cuTensorNet::setup_contraction(
   // allocate work buffer on GPU
   if (!tensor_data_[0].work_allocated()) {
-    cudaSetDevice(0);
+    cudaSetDevice(target_gpus_[0]);
     HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
     work_size = (freeMem / nid) * 0.9;
     tensor_data_[0].allocate_work(work_size);
@@ -1049,7 +1061,7 @@
     if (ns > 0) {
       // setup for the device
       if (!tensor_data_[i].work_allocated()) {
-        cudaSetDevice(i);
+        cudaSetDevice(target_gpus_[i]);
         HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
         work_size = (freeMem / nid) * 0.9;
         tensor_data_[i].allocate_work(work_size);
diff --git a/src/simulators/tensor_network/tensor_net_executor.hpp b/src/simulators/tensor_network/tensor_net_executor.hpp
new file mode 100644
index 0000000000..74be04051e
--- /dev/null
+++ b/src/simulators/tensor_network/tensor_net_executor.hpp
@@ -0,0 +1,469 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _tensor_network_executor_hpp_
+#define _tensor_network_executor_hpp_
+
+#include "simulators/multi_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace TensorNetwork {
+
+//-------------------------------------------------------------------------
+// Batched-shots executor for tensor network
+//-------------------------------------------------------------------------
+template <class state_t>
+class Executor : public CircuitExecutor::MultiStateExecutor<state_t> {
+  using Base = CircuitExecutor::MultiStateExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+protected:
+  void set_config(const Config &config) override;
+
+  bool shot_branching_supported(void) override { return true; }
+
+  bool apply_branching_op(CircuitExecutor::Branch &root,
+                          const Operations::Op &op, ExperimentResult &result,
+                          bool final_op) override;
+
+  rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root,
+                                     const reg_t &qubits);
+  void measure_reset_update(CircuitExecutor::Branch &root,
+                            const std::vector<uint_t> &qubits,
+                            const int_t final_state,
+                            const rvector_t &meas_probs);
+  void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
+                     const reg_t &cmemory, const reg_t &cregister);
+  void apply_reset(CircuitExecutor::Branch &root, const reg_t &qubits);
+  void apply_initialize(CircuitExecutor::Branch &root, const reg_t &qubits,
+                        const cvector_t &params);
+  void apply_kraus(CircuitExecutor::Branch &root, const reg_t &qubits,
+                   const std::vector<cmatrix_t> &kmats);
+
+  std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
+                                    uint_t shots,
+                                    std::vector<RngEngine> &rng) const override;
+
+  void apply_save_statevector(CircuitExecutor::Branch &root,
+                              const Operations::Op &op,
+                              ExperimentResult &result, bool last_op);
+  void apply_save_statevector_dict(CircuitExecutor::Branch &root,
+                                   const Operations::Op &op,
+                                   ExperimentResult &result);
+  void apply_save_amplitudes(CircuitExecutor::Branch &root,
+                             const Operations::Op &op,
+                             ExperimentResult &result);
+};
+
+template <class state_t>
+void Executor<state_t>::set_config(const Config &config) {
+  Base::set_config(config);
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_branching_op(CircuitExecutor::Branch &root,
+                                           const Operations::Op &op,
+                                           ExperimentResult &result,
+                                           bool final_op) {
+  RngEngine dummy;
+  if (Base::states_[root.state_index()].creg().check_conditional(op)) {
+    switch (op.type) {
+    case OpType::reset:
+      apply_reset(root, op.qubits);
+      break;
+    case OpType::initialize:
+      apply_initialize(root, op.qubits, op.params);
+      break;
+    case OpType::measure:
+      apply_measure(root, op.qubits, op.memory, op.registers);
+      break;
+    case OpType::kraus:
+      if (!Base::has_statevector_ops_)
+        return false;
+      apply_kraus(root, op.qubits, op.mats);
+      break;
+    case OpType::save_expval:
+    case OpType::save_expval_var:
+    case OpType::save_densmat:
+    case OpType::save_probs:
+    case OpType::save_probs_ket:
+      // call save functions in state class
+      Base::states_[root.state_index()].apply_op(op, result, dummy, final_op);
+      break;
+    case OpType::save_state:
+    case OpType::save_statevec:
+      apply_save_statevector(root, op, result, final_op);
+      break;
+    case OpType::save_statevec_dict:
+      apply_save_statevector_dict(root, op, result);
+      break;
+    case OpType::save_amps:
+    case OpType::save_amps_sq:
+      apply_save_amplitudes(root, op, result);
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class state_t>
+rvector_t
+Executor<state_t>::sample_measure_with_prob(CircuitExecutor::Branch &root,
+                                            const reg_t &qubits) {
+  rvector_t probs =
Base::states_[root.state_index()].qreg().probabilities(qubits); + uint_t nshots = root.num_shots(); + reg_t shot_branch(nshots); + + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = root.rng_shots()[i].rand_int(probs); + } + + // branch shots + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, probs.size()); + + return probs; +} + +template +void Executor::measure_reset_update(CircuitExecutor::Branch &root, + const std::vector &qubits, + const int_t final_state, + const rvector_t &meas_probs) { + // Update a state vector based on an outcome pair [m, p] from + // sample_measure_with_prob function, and a desired post-measurement + // final_state + + // Single-qubit case + if (qubits.size() == 1) { + // Diagonal matrix for projecting and renormalizing to measurement outcome + for (int_t i = 0; i < 2; i++) { + cvector_t mdiag(2, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + Operations::Op op; + op.type = OpType::gate; + op.name = "mcx"; + op.qubits = qubits; + root.branches()[i]->add_op_after_branch(op); + } + } + } + // Multi qubit case + else { + // Diagonal matrix for projecting and renormalizing to measurement outcome + const size_t dim = 1ULL << qubits.size(); + for (int_t i = 0; i < dim; i++) { + cvector_t mdiag(dim, 0.); + mdiag[i] = 1. / std::sqrt(meas_probs[i]); + + Operations::Op op; + op.type = OpType::diagonal_matrix; + op.qubits = qubits; + op.params = mdiag; + root.branches()[i]->add_op_after_branch(op); + + if (final_state >= 0 && final_state != i) { + // build vectorized permutation matrix + cvector_t perm(dim * dim, 0.); + perm[final_state * dim + i] = 1.; + perm[i * dim + final_state] = 1.; + for (size_t j = 0; j < dim; j++) { + if (j != final_state && j != i) + perm[j * dim + j] = 1.; + } + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(Utils::devectorize_matrix(perm)); + root.branches()[i]->add_op_after_branch(op); + } + } + } +} + +template +void Executor::apply_measure(CircuitExecutor::Branch &root, + const reg_t &qubits, const reg_t &cmemory, + const reg_t &cregister) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + // save result to cregs + for (int_t i = 0; i < probs.size(); i++) { + const reg_t outcome = Utils::int2reg(i, 2, qubits.size()); + root.branches()[i]->creg().store_measure(outcome, cmemory, cregister); + } + + measure_reset_update(root, qubits, -1, probs); +} + +template +void Executor::apply_reset(CircuitExecutor::Branch &root, + const reg_t &qubits) { + rvector_t probs = sample_measure_with_prob(root, qubits); + + measure_reset_update(root, qubits, 0, probs); +} + +template +void Executor::apply_initialize(CircuitExecutor::Branch &root, + const reg_t &qubits, + const cvector_t ¶ms) { + if (qubits.size() == Base::num_qubits_) { + auto sorted_qubits = qubits; + std::sort(sorted_qubits.begin(), sorted_qubits.end()); + // If qubits is all ordered qubits in the statevector + // we can just initialize the whole state directly + if (qubits == sorted_qubits) { + Base::states_[root.state_index()].initialize_from_vector(params); + return; + } + } + + if (root.additional_ops().size() == 0) { + apply_reset(root, qubits); + + Operations::Op op; + op.type = OpType::initialize; + op.name = "initialize"; + op.qubits = qubits; + op.params = params; + for (int_t i = 0; i 
< root.num_branches(); i++) { + root.branches()[i]->add_op_after_branch(op); + } + return; // initialization will be done in next call because of shot + // branching in reset + } + + Base::states_[root.state_index()].qreg().initialize_component(qubits, params); +} + +template +void Executor::apply_kraus(CircuitExecutor::Branch &root, + const reg_t &qubits, + const std::vector &kmats) { + // Check edge case for empty Kraus set (this shouldn't happen) + if (kmats.empty()) + return; // end function early + + // Choose a real in [0, 1) to choose the applied kraus operator once + // the accumulated probability is greater than r. + // We know that the Kraus noise must be normalized + // So we only compute probabilities for the first N-1 kraus operators + // and infer the probability of the last one from 1 - sum of the previous + + double r; + double accum = 0.; + double p; + bool complete = false; + + reg_t shot_branch; + uint_t nshots; + rvector_t rshots, pmats; + uint_t nshots_multiplied = 0; + + nshots = root.num_shots(); + shot_branch.resize(nshots); + rshots.resize(nshots); + for (int_t i = 0; i < nshots; i++) { + shot_branch[i] = kmats.size() - 1; + rshots[i] = root.rng_shots()[i].rand(0., 1.); + } + pmats.resize(kmats.size()); + + // Loop through N-1 kraus operators + for (size_t j = 0; j < kmats.size() - 1; j++) { + // Calculate probability + cvector_t vmat = Utils::vectorize_matrix(kmats[j]); + + p = Base::states_[root.state_index()].qreg().norm(qubits, vmat); + accum += p; + + // check if we need to apply this operator + pmats[j] = p; + for (int_t i = 0; i < nshots; i++) { + if (shot_branch[i] >= kmats.size() - 1) { + if (accum > rshots[i]) { + shot_branch[i] = j; + nshots_multiplied++; + } + } + } + if (nshots_multiplied >= nshots) { + complete = true; + break; + } + } + + // check if we haven't applied a kraus operator yet + pmats[pmats.size() - 1] = 1. - accum; + + root.creg() = Base::states_[root.state_index()].creg(); + root.branch_shots(shot_branch, kmats.size()); + for (int_t i = 0; i < kmats.size(); i++) { + Operations::Op op; + op.type = OpType::matrix; + op.qubits = qubits; + op.mats.push_back(kmats[i]); + p = 1 / std::sqrt(pmats[i]); + for (int_t j = 0; j < op.mats[0].size(); j++) + op.mats[0][j] *= p; + root.branches()[i]->add_op_after_branch(op); + } +} + +template +void Executor::apply_save_statevector(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result, + bool last_op) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." + " Only the full statevector can be saved."); + } + std::string key = + (op.string_params[0] == "_method_") ? "statevector" : op.string_params[0]; + + if (last_op) { + const auto v = Base::states_[root.state_index()].move_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } else { + const auto v = Base::states_[root.state_index()].copy_to_vector(); + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot(Base::states_[root.state_index()].creg(), key, v, + OpType::save_statevec, op.save_type); + } + } +} + +template +void Executor::apply_save_statevector_dict( + CircuitExecutor::Branch &root, const Operations::Op &op, + ExperimentResult &result) { + if (op.qubits.size() != Base::num_qubits_) { + throw std::invalid_argument(op.name + + " was not applied to all qubits." 
+ " Only the full statevector can be saved."); + } + auto state_ket = Base::states_[root.state_index()].qreg().vector_ket( + Base::json_chop_threshold_); + std::map result_state_ket; + for (auto const &it : state_ket) { + result_state_ket[it.first] = it.second; + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const std::map &)result_state_ket, op.type, + op.save_type); + } +} + +template +void Executor::apply_save_amplitudes(CircuitExecutor::Branch &root, + const Operations::Op &op, + ExperimentResult &result) { + if (op.int_params.empty()) { + throw std::invalid_argument( + "Invalid save_amplitudes instructions (empty params)."); + } + const int_t size = op.int_params.size(); + if (op.type == Operations::OpType::save_amps) { + Vector amps(size, false); + for (int_t i = 0; i < size; ++i) { + amps[i] = + Base::states_[root.state_index()].qreg().get_state(op.int_params[i]); + } + for (int_t i = 0; i < root.num_shots(); i++) { + result.save_data_pershot( + Base::states_[root.state_index()].creg(), op.string_params[0], + (const Vector &)amps, op.type, op.save_type); + } + } else { + rvector_t amps_sq(size, 0); + for (int_t i = 0; i < size; ++i) { + amps_sq[i] = Base::states_[root.state_index()].qreg().probability( + op.int_params[i]); + } + result.save_data_average(Base::states_[root.state_index()].creg(), + op.string_params[0], amps_sq, op.type, + op.save_type); + } +} + +template +std::vector +Executor::sample_measure(state_t &state, const reg_t &qubits, + uint_t shots, + std::vector &rng) const { + int_t i, j; + std::vector rnds; + rnds.reserve(shots); + + for (i = 0; i < shots; ++i) + rnds.push_back(rng[i].rand(0, 1)); + + std::vector samples = state.qreg().sample_measure(rnds); + std::vector ret(shots); + + if (omp_get_num_threads() > 1) { + for (i = 0; i < shots; ++i) { + ret[i].resize(qubits.size()); + for (j = 0; j < qubits.size(); j++) + ret[i][j] = samples[i][qubits[j]]; + } + } else { +#pragma omp parallel for private(j) + for (i = 0; i < shots; ++i) { + ret[i].resize(qubits.size()); + for (j = 0; j < qubits.size(); j++) + ret[i][j] = samples[i][qubits[j]]; + } + } + return ret; +} + +//------------------------------------------------------------------------- +} // namespace TensorNetwork +//------------------------------------------------------------------------- +} // end namespace AER +//------------------------------------------------------------------------- +#endif diff --git a/src/simulators/tensor_network/tensor_net_state.hpp b/src/simulators/tensor_network/tensor_net_state.hpp index 44a6221c36..a1004a2312 100644 --- a/src/simulators/tensor_network/tensor_net_state.hpp +++ b/src/simulators/tensor_network/tensor_net_state.hpp @@ -23,7 +23,7 @@ #include "framework/json.hpp" #include "framework/opset.hpp" #include "framework/utils.hpp" -#include "simulators/state_chunk.hpp" +#include "simulators/state.hpp" #include "tensor_net.hpp" #include "simulators/tensor_network/tensor_net.hpp" @@ -152,6 +152,8 @@ class State : public QuantumState::State { // Initializes to a specific n-qubit state void initialize_qreg(const tensor_net_t &tensor); + void initialize_from_vector(const cvector_t ¶ms); + //----------------------------------------------------------------------- // Additional methods //----------------------------------------------------------------------- @@ -190,8 +192,6 @@ class State : public QuantumState::State { void apply_initialize(const reg_t &qubits, const cvector_t ¶ms, 
   RngEngine &rng);
 
-  void initialize_from_vector(const cvector_t &params);
-
   void initialize_from_matrix(const cmatrix_t &params);
 
   // Apply a matrix to given qubits (identity on all other qubits)
diff --git a/src/simulators/unitary/unitary_executor.hpp b/src/simulators/unitary/unitary_executor.hpp
new file mode 100644
index 0000000000..240d806870
--- /dev/null
+++ b/src/simulators/unitary/unitary_executor.hpp
@@ -0,0 +1,213 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2023.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _unitary_executor_hpp
+#define _unitary_executor_hpp
+
+#include "simulators/parallel_state_executor.hpp"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#ifdef AER_MPI
+#include <mpi.h>
+#endif
+
+namespace AER {
+
+namespace QubitUnitary {
+
+//-------------------------------------------------------------------------
+// Parallel executor for QubitUnitary
+//-------------------------------------------------------------------------
+
+template <class state_t>
+class Executor : public CircuitExecutor::ParallelStateExecutor<state_t> {
+  using Base = CircuitExecutor::ParallelStateExecutor<state_t>;
+
+protected:
+public:
+  Executor() {}
+  virtual ~Executor() {}
+
+  auto move_to_matrix(void);
+  auto copy_to_matrix(void);
+
+protected:
+  void set_config(const Config &config) override;
+
+  // apply parallel operations
+  bool apply_parallel_op(const Operations::Op &op, ExperimentResult &result,
+                         RngEngine &rng, bool final_op) override;
+
+  void initialize_qreg(uint_t num_qubits) override;
+
+  //-----------------------------------------------------------------------
+  // Apply Instructions
+  //-----------------------------------------------------------------------
+  // swap between chunks
+  void apply_chunk_swap(const reg_t &qubits) override;
+
+  //-----------------------------------------------------------------------
+  // Save data instructions
+  //-----------------------------------------------------------------------
+
+  // Save the unitary matrix for the simulator
+  void apply_save_unitary(const Operations::Op &op, ExperimentResult &result,
+                          bool last_op);
+
+  // Helper function for computing expectation value
+  double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
+
+  // scale for unitary = 2
+  // this function is used in the base class to scale chunk qubits for
+  // multi-chunk distribution
+  uint_t qubit_scale(void) override { return 2; }
+};
+
+template <class state_t>
+void Executor<state_t>::set_config(const Config &config) {
+  Base::set_config(config);
+}
+
+template <class state_t>
+void Executor<state_t>::initialize_qreg(uint_t num_qubits) {
+  int_t iChunk;
+  for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) {
+    Base::states_[iChunk].qreg().set_num_qubits(Base::chunk_bits_);
+  }
+
+  if (Base::chunk_omp_parallel_ && Base::num_groups_ > 1) {
+#pragma omp parallel for private(iChunk)
+    for (int_t ig = 0; ig < Base::num_groups_; ig++) {
+      for (iChunk = Base::top_state_of_group_[ig];
+           iChunk < Base::top_state_of_group_[ig + 1]; iChunk++) {
+        uint_t irow, icol;
+        irow = (Base::global_state_index_ + iChunk) >>
+               ((Base::num_qubits_ - Base::chunk_bits_));
+        icol = (Base::global_state_index_ + iChunk) -
+               (irow <<
+        if (irow == icol)
+          Base::states_[iChunk].qreg().initialize();
+        else
+          Base::states_[iChunk].qreg().zero();
+      }
+    }
+  } else {
+    for (iChunk = 0; iChunk < Base::states_.size(); iChunk++) {
+      uint_t irow, icol;
+      irow = (Base::global_state_index_ + iChunk) >>
+             ((Base::num_qubits_ - Base::chunk_bits_));
+      icol = (Base::global_state_index_ + iChunk) -
+             (irow << ((Base::num_qubits_ - Base::chunk_bits_)));
+      if (irow == icol)
+        Base::states_[iChunk].qreg().initialize();
+      else
+        Base::states_[iChunk].qreg().zero();
+    }
+  }
+
+  Base::apply_global_phase();
+}
+
+template <class state_t>
+bool Executor<state_t>::apply_parallel_op(const Operations::Op &op,
+                                          ExperimentResult &result,
+                                          RngEngine &rng, bool final_op) {
+  // temporary : this is for statevector
+  if (Base::states_[0].creg().check_conditional(op)) {
+    switch (op.type) {
+    case Operations::OpType::bfunc:
+      Base::states_[0].creg().apply_bfunc(op);
+      break;
+    case Operations::OpType::roerror:
+      Base::states_[0].creg().apply_roerror(op, rng);
+      break;
+    case Operations::OpType::set_unitary:
+      Base::initialize_from_matrix(op.mats[0]);
+      break;
+    case Operations::OpType::save_state:
+    case Operations::OpType::save_unitary:
+      apply_save_unitary(op, result, final_op);
+      break;
+    default:
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class state_t>
+auto Executor<state_t>::move_to_matrix(void) {
+  return Base::apply_to_matrix(false);
+}
+
+template <class state_t>
+auto Executor<state_t>::copy_to_matrix(void) {
+  return Base::apply_to_matrix(true);
+}
+
+template <class state_t>
+void Executor<state_t>::apply_save_unitary(const Operations::Op &op,
+                                           ExperimentResult &result,
+                                           bool last_op) {
+  if (op.qubits.size() != Base::num_qubits_) {
+    throw std::invalid_argument(op.name +
+                                " was not applied to all qubits."
+                                " Only the full unitary can be saved.");
+  }
+  std::string key =
+      (op.string_params[0] == "_method_") ? "unitary" : op.string_params[0];
"unitary" : op.string_params[0]; + + if (last_op) { + result.save_data_pershot(Base::states_[0].creg(), key, move_to_matrix(), + Operations::OpType::save_unitary, op.save_type); + } else { + result.save_data_pershot(Base::states_[0].creg(), key, copy_to_matrix(), + Operations::OpType::save_unitary, op.save_type); + } +} + +template +double Executor::expval_pauli(const reg_t &qubits, + const std::string &pauli) { + throw std::runtime_error( + "Unitary simulator does not support Pauli expectation values."); +} + +// swap between chunks +template +void Executor::apply_chunk_swap(const reg_t &qubits) { + uint_t q0, q1; + q0 = qubits[0]; + q1 = qubits[1]; + + std::swap(Base::qubit_map_[q0], Base::qubit_map_[q1]); + + if (qubits[0] >= Base::chunk_bits_) { + q0 += Base::chunk_bits_; + } + if (qubits[1] >= Base::chunk_bits_) { + q1 += Base::chunk_bits_; + } + reg_t qs0 = {{q0, q1}}; + Base::apply_chunk_swap(qs0); +} + +//------------------------------------------------------------------------------ +} // namespace QubitUnitary +} // end namespace AER +//------------------------------------------------------------------------------ +#endif diff --git a/src/simulators/unitary/unitary_state.hpp b/src/simulators/unitary/unitary_state.hpp old mode 100644 new mode 100755 index 0b86625cef..e7352b84c2 --- a/src/simulators/unitary/unitary_state.hpp +++ b/src/simulators/unitary/unitary_state.hpp @@ -17,11 +17,10 @@ #include #define _USE_MATH_DEFINES -#include "framework/config.hpp" #include "framework/json.hpp" #include "framework/utils.hpp" +#include "simulators/chunk_utils.hpp" #include "simulators/state.hpp" -#include "simulators/state_chunk.hpp" #include "unitarymatrix.hpp" #include #ifdef AER_THRUST_SUPPORTED @@ -86,9 +85,9 @@ enum class Gates { //========================================================================= template > -class State : public virtual QuantumState::StateChunk { +class State : public virtual QuantumState::State { public: - using BaseState = QuantumState::StateChunk; + using BaseState = QuantumState::State; State() : BaseState(StateOpSet) {} virtual ~State() = default; @@ -102,9 +101,12 @@ class State : public virtual QuantumState::StateChunk { // Apply an operation // If the op is not in allowed_ops an exeption will be raised. 
-  virtual void apply_op(const int_t iChunk, const Operations::Op &op,
-                        ExperimentResult &result, RngEngine &rng,
-                        bool final_op = false) override;
+  virtual void apply_op(const Operations::Op &op, ExperimentResult &result,
+                        RngEngine &rng, bool final_op = false) override;
+
+  // memory allocation (previously called before initialize_qreg)
+  bool allocate(uint_t num_qubits, uint_t block_bits,
+                uint_t num_parallel_shots = 1) override;
 
   // Initializes an n-qubit unitary to the identity matrix
   virtual void initialize_qreg(uint_t num_qubits) override;
@@ -131,49 +133,35 @@ class State : public virtual QuantumState::StateChunk<unitary_matrix_t> {
   // Initialize OpenMP settings for the underlying QubitVector class
   void initialize_omp();
 
-  auto move_to_matrix(const int_t iChunk);
-  auto copy_to_matrix(const int_t iChunk);
+  auto move_to_matrix();
+  auto copy_to_matrix();
 
 protected:
   //-----------------------------------------------------------------------
   // Apply Instructions
   //-----------------------------------------------------------------------
 
-  // apply op to multiple shots , return flase if op is not supported to execute
-  // in a batch
-  bool apply_batched_op(const int_t iChunk, const Operations::Op &op,
-                        ExperimentResult &result, std::vector<RngEngine> &rng,
-                        bool final_op = false) override;
-
   // Applies a Gate operation to the state class.
   // This should support all and only the operations defined in
   // allowed_operations.
-  void apply_gate(const int_t iChunk, const Operations::Op &op);
+  void apply_gate(const Operations::Op &op);
 
   // Apply a matrix to given qubits (identity on all other qubits)
-  void apply_matrix(const int_t iChunk, const reg_t &qubits,
-                    const cmatrix_t &mat);
+  void apply_matrix(const reg_t &qubits, const cmatrix_t &mat);
 
   // Apply a matrix to given qubits (identity on all other qubits)
-  void apply_matrix(const int_t iChunk, const reg_t &qubits,
-                    const cvector_t &vmat);
+  void apply_matrix(const reg_t &qubits, const cvector_t &vmat);
 
   // Apply a diagonal matrix
-  void apply_diagonal_matrix(const int_t iChunk, const reg_t &qubits,
-                             const cvector_t &diag);
-
-  // swap between chunks
-  virtual void apply_chunk_swap(const reg_t &qubits) override;
+  void apply_diagonal_matrix(const reg_t &qubits, const cvector_t &diag);
 
   //-----------------------------------------------------------------------
   // 1-Qubit Gates
   //-----------------------------------------------------------------------
 
   // Optimize phase gate with diagonal [1, phase]
-  void apply_gate_phase(const int_t iChunk, const uint_t qubit,
-                        const complex_t phase);
+  void apply_gate_phase(const uint_t qubit, const complex_t phase);
 
-  void apply_gate_phase(const int_t iChunk, const reg_t &qubits,
-                        const complex_t phase);
+  void apply_gate_phase(const reg_t &qubits, const complex_t phase);
 
   //-----------------------------------------------------------------------
   // Multi-controlled u
   //-----------------------------------------------------------------------
@@ -182,19 +170,19 @@ class State : public virtual QuantumState::StateChunk<unitary_matrix_t> {
 
   // Apply N-qubit multi-controlled single qubit gate specified by
   // 4 parameters u4(theta, phi, lambda, gamma)
   // NOTE: if N=1 this is just a regular u4 gate.
-  void apply_gate_mcu(const int_t iChunk, const reg_t &qubits, double theta,
-                      double phi, double lambda, double gamma);
+  void apply_gate_mcu(const reg_t &qubits, double theta, double phi,
+                      double lambda, double gamma);
 
   //-----------------------------------------------------------------------
   // Save data instructions
   //-----------------------------------------------------------------------
 
   // Save the unitary matrix for the simulator
-  void apply_save_unitary(const int_t iChunk, const Operations::Op &op,
-                          ExperimentResult &result, bool last_op);
+  void apply_save_unitary(const Operations::Op &op, ExperimentResult &result,
+                          bool last_op);
 
   // Helper function for computing expectation value
-  virtual double expval_pauli(const int_t iChunk, const reg_t &qubits,
+  virtual double expval_pauli(const reg_t &qubits,
                               const std::string &pauli) override;
 
   //-----------------------------------------------------------------------
@@ -212,11 +200,6 @@ class State : public virtual QuantumState::StateChunk<unitary_matrix_t> {
 
   // Table of allowed gate names to gate enum class members
   const static stringmap_t gateset_;
-
-  // scale for unitary = 2
-  // this function is used in the base class to scale chunk qubits for
-  // multi-chunk distribution
-  int qubit_scale(void) override { return 2; }
 };
 
 //============================================================================
@@ -298,36 +281,35 @@ const stringmap_t State<unitary_matrix_t>::gateset_({
 //============================================================================
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_op(const int_t iChunk,
-                                       const Operations::Op &op,
+void State<unitary_matrix_t>::apply_op(const Operations::Op &op,
                                        ExperimentResult &result,
                                        RngEngine &rng, bool final_op) {
-  if (BaseState::check_conditional(iChunk, op)) {
+  if (BaseState::creg().check_conditional(op)) {
     switch (op.type) {
     case Operations::OpType::barrier:
    case Operations::OpType::qerror_loc:
       break;
     case Operations::OpType::bfunc:
-      BaseState::cregs_[0].apply_bfunc(op);
+      BaseState::creg().apply_bfunc(op);
       break;
     case Operations::OpType::roerror:
-      BaseState::cregs_[0].apply_roerror(op, rng);
+      BaseState::creg().apply_roerror(op, rng);
       break;
     case Operations::OpType::gate:
-      apply_gate(iChunk, op);
+      apply_gate(op);
       break;
     case Operations::OpType::set_unitary:
-      BaseState::initialize_from_matrix(iChunk, op.mats[0]);
+      BaseState::qreg_.initialize_from_matrix(op.mats[0]);
       break;
     case Operations::OpType::save_state:
     case Operations::OpType::save_unitary:
-      apply_save_unitary(iChunk, op, result, final_op);
+      apply_save_unitary(op, result, final_op);
       break;
     case Operations::OpType::matrix:
-      apply_matrix(iChunk, op.qubits, op.mats[0]);
+      apply_matrix(op.qubits, op.mats[0]);
       break;
     case Operations::OpType::diagonal_matrix:
-      apply_diagonal_matrix(iChunk, op.qubits, op.params);
+      apply_diagonal_matrix(op.qubits, op.params);
       break;
     default:
       throw std::invalid_argument(
@@ -336,42 +318,6 @@ void State<unitary_matrix_t>::apply_op(const int_t iChunk,
   }
 }
 
-template <class unitary_matrix_t>
-bool State<unitary_matrix_t>::apply_batched_op(const int_t iChunk,
-                                               const Operations::Op &op,
-                                               ExperimentResult &result,
-                                               std::vector<RngEngine> &rng,
-                                               bool final_ops) {
-  if (op.conditional)
-    BaseState::qregs_[iChunk].set_conditional(op.conditional_reg);
-
-  switch (op.type) {
-  case Operations::OpType::barrier:
-  case Operations::OpType::nop:
-  case Operations::OpType::qerror_loc:
-    break;
-  case Operations::OpType::bfunc:
-    BaseState::qregs_[iChunk].apply_bfunc(op);
-    break;
-  case Operations::OpType::roerror:
-    BaseState::qregs_[iChunk].apply_roerror(op, rng);
-    break;
-  case Operations::OpType::gate:
-    apply_gate(iChunk, op);
-    break;
-  case Operations::OpType::matrix:
-    apply_matrix(iChunk, op.qubits, op.mats[0]);
-    break;
-  case Operations::OpType::diagonal_matrix:
-    BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, op.params);
-    break;
-  default:
-    // other operations should be called to indivisual chunks by apply_op
-    return false;
-  }
-  return true;
-}
-
 template <class unitary_matrix_t>
 size_t State<unitary_matrix_t>::required_memory_mb(
     uint_t num_qubits, const std::vector<Operations::Op> &ops) const {
@@ -391,57 +337,16 @@ void State<unitary_matrix_t>::set_config(const Config &config) {
   // Set threshold for truncating snapshots
   json_chop_threshold_ = config.zero_threshold;
-  for (int_t i = 0; i < BaseState::qregs_.size(); i++)
-    BaseState::qregs_[i].set_json_chop_threshold(json_chop_threshold_);
+  BaseState::qreg_.set_json_chop_threshold(json_chop_threshold_);
 }
 
 template <class unitary_matrix_t>
 void State<unitary_matrix_t>::initialize_qreg(uint_t num_qubits) {
-  if (BaseState::qregs_.size() == 0)
-    BaseState::allocate(num_qubits, num_qubits, 1);
-
   initialize_omp();
-  int_t iChunk;
-  for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-    BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_);
-  }
+  BaseState::qreg_.set_num_qubits(num_qubits);
+  BaseState::qreg_.initialize();
 
-  if (BaseState::multi_chunk_distribution_) {
-    if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) {
-#pragma omp parallel for private(iChunk)
-      for (int_t ig = 0; ig < BaseState::num_groups_; ig++) {
-        for (iChunk = BaseState::top_chunk_of_group_[ig];
-             iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) {
-          uint_t irow, icol;
-          irow = (BaseState::global_chunk_index_ + iChunk) >>
-                 ((BaseState::num_qubits_ - BaseState::chunk_bits_));
-          icol = (BaseState::global_chunk_index_ + iChunk) -
-                 (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-          if (irow == icol)
-            BaseState::qregs_[iChunk].initialize();
-          else
-            BaseState::qregs_[iChunk].zero();
-        }
-      }
-    } else {
-      for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-        uint_t irow, icol;
-        irow = (BaseState::global_chunk_index_ + iChunk) >>
-               ((BaseState::num_qubits_ - BaseState::chunk_bits_));
-        icol = (BaseState::global_chunk_index_ + iChunk) -
-               (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-        if (irow == icol)
-          BaseState::qregs_[iChunk].initialize();
-        else
-          BaseState::qregs_[iChunk].zero();
-      }
-    }
-  } else {
-    for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-      BaseState::qregs_[iChunk].initialize();
-    }
-  }
+
   apply_global_phase();
 }
@@ -454,101 +359,43 @@ void State<unitary_matrix_t>::initialize_qreg(uint_t num_qubits,
         "Unitary::State::initialize: initial state does not match qubit "
         "number");
   }
-  if (BaseState::qregs_.size() == 0)
-    BaseState::allocate(num_qubits, num_qubits, 1);
 
   initialize_omp();
-  int_t iChunk;
-  for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++)
-    BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_);
+  BaseState::qreg_.set_num_qubits(num_qubits);
+  BaseState::qreg_.initialize_from_matrix(unitary);
 
-  if (BaseState::multi_chunk_distribution_) {
-    uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1;
-    for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-      // this function should be called in-order
-      BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_);
-    }
-
-    if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) {
-#pragma omp parallel for private(iChunk)
-      for (int_t ig = 0; ig < BaseState::num_groups_; ig++) {
-        for (iChunk = BaseState::top_chunk_of_group_[ig];
-             iChunk < BaseState::top_chunk_of_group_[ig + 1]; iChunk++) {
-          uint_t irow_chunk =
-              ((iChunk + BaseState::global_chunk_index_) >>
-               ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-          uint_t icol_chunk =
-              ((iChunk + BaseState::global_chunk_index_) &
-               ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) -
-                1));
-
-          // copy part of state for this chunk
-          uint_t i, row, col;
-          cvector_t tmp(1ull << BaseState::chunk_bits_);
-          for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) {
-            uint_t icol = i >> (BaseState::chunk_bits_);
-            uint_t irow = i & mask;
-            uint_t idx = ((icol + (irow_chunk << BaseState::chunk_bits_))
-                          << (BaseState::num_qubits_)) +
-                         (icol_chunk << BaseState::chunk_bits_) + irow;
-            tmp[i] = unitary[idx];
-          }
-          BaseState::qregs_[iChunk].initialize_from_vector(tmp);
-        }
-      }
-    } else {
-      for (iChunk = 0; iChunk < BaseState::qregs_.size(); iChunk++) {
-        uint_t irow_chunk =
-            ((iChunk + BaseState::global_chunk_index_) >>
-             ((BaseState::num_qubits_ - BaseState::chunk_bits_)));
-        uint_t icol_chunk =
-            ((iChunk + BaseState::global_chunk_index_) &
-             ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_))) -
-              1));
-
-        // copy part of state for this chunk
-        uint_t i, row, col;
-        cvector_t tmp(1ull << BaseState::chunk_bits_);
-        for (i = 0; i < (1ull << BaseState::chunk_bits_); i++) {
-          uint_t icol = i >> (BaseState::chunk_bits_);
-          uint_t irow = i & mask;
-          uint_t idx = ((icol + (irow_chunk << BaseState::chunk_bits_))
-                        << (BaseState::num_qubits_)) +
-                       (icol_chunk << BaseState::chunk_bits_) + irow;
-          tmp[i] = unitary[idx];
-        }
-        BaseState::qregs_[iChunk].initialize_from_vector(tmp);
-      }
-    }
-  } else {
-    BaseState::qregs_[iChunk].initialize_from_matrix(unitary);
-  }
+
   apply_global_phase();
 }
 
 template <class unitary_matrix_t>
 void State<unitary_matrix_t>::initialize_omp() {
   uint_t i;
-  for (i = 0; i < BaseState::qregs_.size(); i++) {
-    BaseState::qregs_[i].set_omp_threshold(omp_qubit_threshold_);
-    if (BaseState::threads_ > 0)
-      BaseState::qregs_[i].set_omp_threads(
-          BaseState::threads_); // set allowed OMP threads in qubitvector
-  }
+  BaseState::qreg_.set_omp_threshold(omp_qubit_threshold_);
+  if (BaseState::threads_ > 0)
+    BaseState::qreg_.set_omp_threads(
+        BaseState::threads_); // set allowed OMP threads in qubitvector
 }
 
 template <class unitary_matrix_t>
-auto State<unitary_matrix_t>::move_to_matrix(const int_t iChunk) {
-  if (!BaseState::multi_chunk_distribution_)
-    return BaseState::qregs_[iChunk].move_to_matrix();
-  return BaseState::apply_to_matrix(false);
+bool State<unitary_matrix_t>::allocate(uint_t num_qubits, uint_t block_bits,
+                                       uint_t num_parallel_shots) {
+  if (BaseState::max_matrix_qubits_ > 0)
+    BaseState::qreg_.set_max_matrix_bits(BaseState::max_matrix_qubits_);
+
+  BaseState::qreg_.set_target_gpus(BaseState::target_gpus_);
+  BaseState::qreg_.chunk_setup(block_bits * 2, num_qubits * 2, 0, 1);
+
+  return true;
+}
+
+template <class unitary_matrix_t>
+auto State<unitary_matrix_t>::move_to_matrix() {
+  return BaseState::qreg_.move_to_matrix();
 }
 
 template <class unitary_matrix_t>
-auto State<unitary_matrix_t>::copy_to_matrix(const int_t iChunk) {
-  if (!BaseState::multi_chunk_distribution_)
-    return BaseState::qregs_[iChunk].copy_to_matrix();
-  return BaseState::apply_to_matrix(true);
+auto State<unitary_matrix_t>::copy_to_matrix() {
+  return BaseState::qreg_.copy_to_matrix();
 }
 
 //=========================================================================
@@ -556,20 +403,23 @@ auto State<unitary_matrix_t>::copy_to_matrix(const int_t iChunk) {
 //=========================================================================
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate(const int_t iChunk,
-                                         const Operations::Op &op) {
-  if (!BaseState::global_chunk_indexing_) {
+void State<unitary_matrix_t>::apply_gate(const Operations::Op &op) {
+  // CPU qubit vector does not handle chunk ID inside kernel, so modify op here
+  if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits() &&
+      !BaseState::qreg_.support_global_indexing()) {
     reg_t qubits_in, qubits_out;
-    BaseState::get_inout_ctrl_qubits(op, qubits_out, qubits_in);
+    if (op.name[0] == 'c' || op.name.find("mc") == 0) {
+      Chunk::get_inout_ctrl_qubits(op, BaseState::qreg_.num_qubits(), qubits_in,
+                                   qubits_out);
+    }
     if (qubits_out.size() > 0) {
       uint_t mask = 0;
       for (int i = 0; i < qubits_out.size(); i++) {
-        mask |= (1ull << (qubits_out[i] - BaseState::chunk_bits_));
+        mask |= (1ull << (qubits_out[i] - BaseState::qreg_.num_qubits()));
       }
-      if (((BaseState::global_chunk_index_ + iChunk) & mask) == mask) {
-        Operations::Op new_op =
-            BaseState::remake_gate_in_chunk_qubits(op, qubits_in);
-        apply_gate(iChunk, new_op);
+      if ((BaseState::qreg_.chunk_index() & mask) == mask) {
+        Operations::Op new_op = Chunk::correct_gate_op_in_chunk(op, qubits_in);
+        apply_gate(new_op);
       }
       return;
     }
   }
@@ -584,104 +434,99 @@ void State<unitary_matrix_t>::apply_gate(const int_t iChunk,
   switch (g) {
   case Gates::mcx:
     // Includes X, CX, CCX, etc
-    BaseState::qregs_[iChunk].apply_mcx(op.qubits);
+    BaseState::qreg_.apply_mcx(op.qubits);
     break;
   case Gates::mcy:
     // Includes Y, CY, CCY, etc
-    BaseState::qregs_[iChunk].apply_mcy(op.qubits);
+    BaseState::qreg_.apply_mcy(op.qubits);
     break;
   case Gates::mcz:
     // Includes Z, CZ, CCZ, etc
-    BaseState::qregs_[iChunk].apply_mcphase(op.qubits, -1);
+    BaseState::qreg_.apply_mcphase(op.qubits, -1);
     break;
   case Gates::mcr:
-    BaseState::qregs_[iChunk].apply_mcu(
-        op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1]));
+    BaseState::qreg_.apply_mcu(op.qubits,
+                               Linalg::VMatrix::r(op.params[0], op.params[1]));
     break;
   case Gates::mcrx:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits,
-                                        Linalg::VMatrix::rx(op.params[0]));
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::rx(op.params[0]));
     break;
   case Gates::mcry:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits,
-                                        Linalg::VMatrix::ry(op.params[0]));
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::ry(op.params[0]));
     break;
   case Gates::mcrz:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits,
-                                        Linalg::VMatrix::rz(op.params[0]));
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::rz(op.params[0]));
     break;
   case Gates::rxx:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits,
-                                           Linalg::VMatrix::rxx(op.params[0]));
+    BaseState::qreg_.apply_matrix(op.qubits,
+                                  Linalg::VMatrix::rxx(op.params[0]));
     break;
   case Gates::ryy:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits,
-                                           Linalg::VMatrix::ryy(op.params[0]));
+    BaseState::qreg_.apply_matrix(op.qubits,
+                                  Linalg::VMatrix::ryy(op.params[0]));
     break;
   case Gates::rzz:
-    apply_diagonal_matrix(iChunk, op.qubits,
-                          Linalg::VMatrix::rzz_diag(op.params[0]));
+    apply_diagonal_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0]));
     break;
   case Gates::rzx:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits,
-                                           Linalg::VMatrix::rzx(op.params[0]));
+    BaseState::qreg_.apply_matrix(op.qubits,
+                                  Linalg::VMatrix::rzx(op.params[0]));
     break;
   case Gates::ecr:
-    BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ECR);
+    BaseState::qreg_.apply_matrix(op.qubits, Linalg::VMatrix::ECR);
     break;
   case Gates::id:
     break;
   case Gates::h:
-    apply_gate_mcu(iChunk, op.qubits, M_PI / 2., 0., M_PI, 0.);
+    apply_gate_mcu(op.qubits, M_PI / 2., 0., M_PI, 0.);
     break;
   case Gates::s:
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(0., 1.));
+    apply_gate_phase(op.qubits[0], complex_t(0., 1.));
     break;
   case Gates::sdg:
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(0., -1.));
+    apply_gate_phase(op.qubits[0], complex_t(0., -1.));
     break;
   case Gates::pauli:
-    BaseState::qregs_[iChunk].apply_pauli(op.qubits, op.string_params[0]);
+    BaseState::qreg_.apply_pauli(op.qubits, op.string_params[0]);
     break;
   case Gates::t: {
     const double isqrt2{1. / std::sqrt(2)};
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, isqrt2));
+    apply_gate_phase(op.qubits[0], complex_t(isqrt2, isqrt2));
   } break;
   case Gates::tdg: {
     const double isqrt2{1. / std::sqrt(2)};
-    apply_gate_phase(iChunk, op.qubits[0], complex_t(isqrt2, -isqrt2));
+    apply_gate_phase(op.qubits[0], complex_t(isqrt2, -isqrt2));
   } break;
   case Gates::mcswap:
     // Includes SWAP, CSWAP, etc
-    BaseState::qregs_[iChunk].apply_mcswap(op.qubits);
+    BaseState::qreg_.apply_mcswap(op.qubits);
     break;
   case Gates::mcu3:
     // Includes u3, cu3, etc
-    apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]),
-                   std::real(op.params[1]), std::real(op.params[2]), 0.);
+    apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]),
+                   std::real(op.params[2]), 0.);
     break;
   case Gates::mcu:
     // Includes u, cu, etc
-    apply_gate_mcu(iChunk, op.qubits, std::real(op.params[0]),
-                   std::real(op.params[1]), std::real(op.params[2]),
-                   std::real(op.params[3]));
+    apply_gate_mcu(op.qubits, std::real(op.params[0]), std::real(op.params[1]),
+                   std::real(op.params[2]), std::real(op.params[3]));
     break;
   case Gates::mcu2:
     // Includes u2, cu2, etc
-    apply_gate_mcu(iChunk, op.qubits, M_PI / 2., std::real(op.params[0]),
+    apply_gate_mcu(op.qubits, M_PI / 2., std::real(op.params[0]),
                    std::real(op.params[1]), 0.);
     break;
   case Gates::mcp:
     // Includes u1, cu1, p, cp, mcp, etc
-    BaseState::qregs_[iChunk].apply_mcphase(
-        op.qubits, std::exp(complex_t(0, 1) * op.params[0]));
+    BaseState::qreg_.apply_mcphase(op.qubits,
+                                   std::exp(complex_t(0, 1) * op.params[0]));
     break;
   case Gates::mcsx:
     // Includes sx, csx, mcsx etc
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SX);
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SX);
     break;
   case Gates::mcsxdg:
-    BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::SXDG);
+    BaseState::qreg_.apply_mcu(op.qubits, Linalg::VMatrix::SXDG);
     break;
   default:
     // We shouldn't reach here unless there is a bug in gateset
@@ -691,102 +536,85 @@ void State<unitary_matrix_t>::apply_gate(const int_t iChunk,
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_matrix(const int_t iChunk,
-                                           const reg_t &qubits,
+void State<unitary_matrix_t>::apply_matrix(const reg_t &qubits,
                                            const cmatrix_t &mat) {
   if (qubits.empty() == false && mat.size() > 0) {
-    apply_matrix(iChunk, qubits, Utils::vectorize_matrix(mat));
+    apply_matrix(qubits, Utils::vectorize_matrix(mat));
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_matrix(const int_t iChunk,
-                                           const reg_t &qubits,
+void State<unitary_matrix_t>::apply_matrix(const reg_t &qubits,
                                            const cvector_t &vmat) {
   // Check if diagonal matrix
   if (vmat.size() == 1ULL << qubits.size()) {
-    apply_diagonal_matrix(iChunk, qubits, vmat);
+    apply_diagonal_matrix(qubits, vmat);
   } else {
-    BaseState::qregs_[iChunk].apply_matrix(qubits, vmat);
+    BaseState::qreg_.apply_matrix(qubits, vmat);
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_diagonal_matrix(const int_t iChunk,
-                                                    const reg_t &qubits,
+void State<unitary_matrix_t>::apply_diagonal_matrix(const reg_t &qubits,
                                                     const cvector_t &diag) {
-  if (BaseState::global_chunk_indexing_ ||
-      !BaseState::multi_chunk_distribution_) {
-    // GPU computes all chunks in one kernel, so pass qubits and diagonal matrix
-    // as is
-    reg_t qubits_chunk = qubits;
-    for (uint_t i = 0; i < qubits.size(); i++) {
-      if (qubits_chunk[i] >= BaseState::chunk_bits_) {
-        qubits_chunk[i] += BaseState::chunk_bits_;
+  if (BaseState::num_global_qubits_ > BaseState::qreg_.num_qubits()) {
+    if (!BaseState::qreg_.support_global_indexing()) {
+      reg_t qubits_in = qubits;
+      cvector_t diag_in = diag;
+      Chunk::block_diagonal_matrix(BaseState::qreg_.chunk_index(),
+                                   BaseState::qreg_.num_qubits(), qubits_in,
+                                   diag_in);
+      BaseState::qreg_.apply_diagonal_matrix(qubits_in, diag_in);
+    } else {
+      reg_t qubits_chunk = qubits;
+      for (uint_t i = 0; i < qubits.size(); i++) {
+        if (qubits_chunk[i] >= BaseState::qreg_.num_qubits())
+          qubits_chunk[i] += BaseState::qreg_.num_qubits();
       }
+      BaseState::qreg_.apply_diagonal_matrix(qubits_chunk, diag);
     }
-    BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_chunk, diag);
   } else {
-    reg_t qubits_in = qubits;
-    cvector_t diag_in = diag;
-
-    BaseState::block_diagonal_matrix(iChunk, qubits_in, diag_in);
-    BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits_in, diag_in);
+    BaseState::qreg_.apply_diagonal_matrix(qubits, diag);
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate_phase(const int_t iChunk, uint_t qubit,
-                                               complex_t phase) {
+void State<unitary_matrix_t>::apply_gate_phase(uint_t qubit, complex_t phase) {
   cvector_t diag(2);
   diag[0] = 1.0;
   diag[1] = phase;
-  apply_diagonal_matrix(iChunk, reg_t({qubit}), diag);
+  apply_diagonal_matrix(reg_t({qubit}), diag);
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate_phase(const int_t iChunk,
-                                               const reg_t &qubits,
+void State<unitary_matrix_t>::apply_gate_phase(const reg_t &qubits,
                                                complex_t phase) {
   cvector_t diag((1 << qubits.size()), 1.0);
   diag[(1 << qubits.size()) - 1] = phase;
-  apply_diagonal_matrix(iChunk, qubits, diag);
+  apply_diagonal_matrix(qubits, diag);
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_gate_mcu(const int_t iChunk,
-                                             const reg_t &qubits, double theta,
+void State<unitary_matrix_t>::apply_gate_mcu(const reg_t &qubits, double theta,
                                              double phi, double lambda, double gamma) {
   const auto u4 = Linalg::Matrix::u4(theta, phi, lambda, gamma);
-  BaseState::qregs_[iChunk].apply_mcu(qubits, Utils::vectorize_matrix(u4));
+  BaseState::qreg_.apply_mcu(qubits, Utils::vectorize_matrix(u4));
 }
 
 template <class unitary_matrix_t>
 void State<unitary_matrix_t>::apply_global_phase() {
   if (BaseState::has_global_phase_) {
-    if (BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 0) {
-#pragma omp parallel for
-      for (int_t ig = 0; ig < BaseState::num_groups_; ig++) {
-        for (int_t i = BaseState::top_chunk_of_group_[ig];
-             i < BaseState::top_chunk_of_group_[ig + 1]; i++)
-          apply_diagonal_matrix(
-              i, {0}, {BaseState::global_phase_, BaseState::global_phase_});
-      }
-    } else {
-      for (int_t i = 0; i < BaseState::qregs_.size(); i++)
-        apply_diagonal_matrix(
-            i, {0}, {BaseState::global_phase_, BaseState::global_phase_});
-    }
+    apply_diagonal_matrix({0},
+                          {BaseState::global_phase_, BaseState::global_phase_});
   }
 }
 
 template <class unitary_matrix_t>
-void State<unitary_matrix_t>::apply_save_unitary(const int_t iChunk,
-                                                 const Operations::Op &op,
+void State<unitary_matrix_t>::apply_save_unitary(const Operations::Op &op,
                                                  ExperimentResult &result,
                                                  bool last_op) {
-  if (op.qubits.size() != BaseState::num_qubits_) {
+  if (op.qubits.size() != BaseState::qreg_.num_qubits()) {
     throw std::invalid_argument(op.name +
                                 " was not applied to all qubits."
                                 " Only the full unitary can be saved.");
@@ -795,43 +623,21 @@ void State<unitary_matrix_t>::apply_save_unitary(const int_t iChunk,
   std::string key =
       (op.string_params[0] == "_method_") ? "unitary" : op.string_params[0];
"unitary" : op.string_params[0]; if (last_op) { - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, - move_to_matrix(iChunk), + result.save_data_pershot(BaseState::creg(), key, move_to_matrix(), Operations::OpType::save_unitary, op.save_type); } else { - result.save_data_pershot(BaseState::chunk_creg(iChunk), key, - copy_to_matrix(iChunk), + result.save_data_pershot(BaseState::creg(), key, copy_to_matrix(), Operations::OpType::save_unitary, op.save_type); } } template -double State::expval_pauli(const int_t iChunk, - const reg_t &qubits, +double State::expval_pauli(const reg_t &qubits, const std::string &pauli) { throw std::runtime_error( "Unitary simulator does not support Pauli expectation values."); } -// swap between chunks -template -void State::apply_chunk_swap(const reg_t &qubits) { - uint_t q0, q1; - q0 = qubits[0]; - q1 = qubits[1]; - - std::swap(BaseState::qubit_map_[q0], BaseState::qubit_map_[q1]); - - if (qubits[0] >= BaseState::chunk_bits_) { - q0 += BaseState::chunk_bits_; - } - if (qubits[1] >= BaseState::chunk_bits_) { - q1 += BaseState::chunk_bits_; - } - reg_t qs0 = {{q0, q1}}; - BaseState::apply_chunk_swap(qs0); -} - //------------------------------------------------------------------------------ } // namespace QubitUnitary } // end namespace AER diff --git a/src/simulators/unitary/unitarymatrix.hpp b/src/simulators/unitary/unitarymatrix.hpp index e45183299e..494d57e84e 100644 --- a/src/simulators/unitary/unitarymatrix.hpp +++ b/src/simulators/unitary/unitarymatrix.hpp @@ -52,7 +52,7 @@ class UnitaryMatrix : public QubitVector { //----------------------------------------------------------------------- // Set the size of the vector in terms of qubit number - void set_num_qubits(size_t num_qubits); + void set_num_qubits(size_t num_qubits) override; // Return the number of rows in the matrix size_t num_rows() const { return rows_; } @@ -75,6 +75,14 @@ class UnitaryMatrix : public QubitVector { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const UnitaryMatrix &obj) { + BaseVector::initialize(obj); + num_qubits_ = obj.num_qubits_; + rows_ = obj.rows_; + identity_threshold_ = obj.identity_threshold_; + } + // Initializes the vector to a custom initial state. // If the length of the statevector does not match the number of qubits // an exception is raised. diff --git a/src/simulators/unitary/unitarymatrix_thrust.hpp b/src/simulators/unitary/unitarymatrix_thrust.hpp old mode 100644 new mode 100755 index df95a9f027..f11e107425 --- a/src/simulators/unitary/unitarymatrix_thrust.hpp +++ b/src/simulators/unitary/unitarymatrix_thrust.hpp @@ -82,6 +82,14 @@ class UnitaryMatrixThrust : public QubitVectorThrust { // Initializes the current vector so that all qubits are in the |0> state. void initialize(); + // initialize from existing state (copy) + void initialize(const UnitaryMatrixThrust &obj) { + BaseVector::initialize(obj); + num_qubits_ = obj.num_qubits_; + rows_ = obj.rows_; + identity_threshold_ = obj.identity_threshold_; + } + // Initializes the vector to a custom initial state. // If the length of the statevector does not match the number of qubits // an exception is raised. 
diff --git a/test/terra/backends/aer_simulator/test_conditional.py b/test/terra/backends/aer_simulator/test_conditional.py
index 07f1298557..13f3ccb5b7 100644
--- a/test/terra/backends/aer_simulator/test_conditional.py
+++ b/test/terra/backends/aer_simulator/test_conditional.py
@@ -29,7 +29,6 @@ class TestConditionalGates(SimulatorTestCase):
         "density_matrix",
         "matrix_product_state",
         "extended_stabilizer",
-        "tensor_network",
     ]
 
     # ---------------------------------------------------------------------
@@ -66,8 +65,6 @@ def test_conditional_gates_2bit(self, method, device):
     def test_conditional_gates_64bit(self, method, device):
         """Test conditional gate operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         # [value of conditional register, list of conditional values]
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
@@ -87,8 +84,6 @@ def test_conditional_gates_132bit(self, method, device):
         """Test conditional gate operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -112,7 +107,6 @@ class TestConditionalUnitary(SimulatorTestCase):
         "statevector",
         "density_matrix",
         "matrix_product_state",
-        "tensor_network",
     ]
 
     # ---------------------------------------------------------------------
@@ -149,8 +143,6 @@ def test_conditional_unitary_2bit(self, method, device):
     def test_conditional_unitary_64bit(self, method, device):
         """Test conditional unitary operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -167,8 +159,6 @@ def test_conditional_unitary_132bit(self, method, device):
         """Test conditional unitary operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -190,7 +180,6 @@ class TestConditionalKraus(SimulatorTestCase):
         "statevector",
         "density_matrix",
         "matrix_product_state",
-        "tensor_network",
     ]
 
     # ---------------------------------------------------------------------
@@ -227,8 +216,6 @@ def test_conditional_kraus_2bit(self, method, device):
     def test_conditional_kraus_64bit(self, method, device):
         """Test conditional kraus operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -245,8 +232,6 @@ def test_conditional_kraus_132bit(self, method, device):
         """Test conditional kraus operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -263,7 +248,7 @@ class TestConditionalSuperOp(SimulatorTestCase):
     """AerSimulator conditional superop tests."""
 
-    SUPPORTED_METHODS = ["automatic", "density_matrix", "tensor_network"]
+    SUPPORTED_METHODS = ["automatic", "density_matrix"]
 
     # ---------------------------------------------------------------------
     # Test conditional
@@ -299,8 +284,6 @@ def test_conditional_superop_2bit(self, method, device):
     def test_conditional_superop_64bit(self, method, device):
         """Test conditional superop operations on 64-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_64bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
@@ -317,8 +300,6 @@ def test_conditional_superop_132bit(self, method, device):
         """Test conditional superop operations on 132-bit conditional register."""
         shots = 100
-        if "tensor_network" in method:
-            shots = 1
         cases = ref_conditionals.conditional_cases_132bit()
         backend = self.backend(method=method, device=device)
         backend.set_options(max_parallel_experiments=0)
diff --git a/test/terra/backends/aer_simulator/test_measure.py b/test/terra/backends/aer_simulator/test_measure.py
index f3a495befd..c705869817 100644
--- a/test/terra/backends/aer_simulator/test_measure.py
+++ b/test/terra/backends/aer_simulator/test_measure.py
@@ -93,9 +93,6 @@ def test_measure_nondeterministic_without_sampling(self, method, device):
         backend = self.backend(method=method, device=device)
         shots = 4000
         delta = 0.05
-        if "tensor_network" in method:
-            shots = 100
-            delta = 0.1
         circuits = ref_measure.measure_circuits_nondeterministic(allow_sampling=False)
         targets = ref_measure.measure_counts_nondeterministic(shots)
         result = backend.run(circuits, shots=shots).result()
@@ -195,9 +192,6 @@ def test_measure_nondeterministic_multi_qubit_without_sampling(self, method, dev
         backend = self.backend(method=method, device=device)
         shots = 4000
         delta = 0.05
-        if "tensor_network" in method:
-            shots = 100
-            delta = 0.1
         circuits = ref_measure.multiqubit_measure_circuits_nondeterministic(allow_sampling=False)
         targets = ref_measure.multiqubit_measure_counts_nondeterministic(shots)
         result = backend.run(circuits, shots=shots).result()
diff --git a/test/terra/backends/aer_simulator/test_shot_branching.py b/test/terra/backends/aer_simulator/test_shot_branching.py
new file mode 100644
index 0000000000..ac3ff0a810
--- /dev/null
+++ b/test/terra/backends/aer_simulator/test_shot_branching.py
@@ -0,0 +1,782 @@
+# This code is part of Qiskit.
+#
+# (C) Copyright IBM 2018, 2019.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+""" +AerSimulator Integration Tests +""" + +from ddt import ddt +from test.terra.reference import ref_measure +from test.terra.reference import ref_reset +from test.terra.reference import ref_initialize +from test.terra.reference import ref_kraus_noise +from test.terra.reference import ref_pauli_noise +from test.terra.reference import ref_readout_noise +from test.terra.reference import ref_reset_noise +from test.terra.reference import ref_conditionals + +from qiskit import QuantumCircuit +from qiskit import transpile +from qiskit_aer import AerSimulator +from qiskit_aer.noise import NoiseModel +from qiskit_aer.noise.errors import ReadoutError, depolarizing_error +from qiskit.circuit.library import QuantumVolume +from qiskit.quantum_info.random import random_unitary +from test.terra.backends.simulator_test_case import SimulatorTestCase, supported_methods + +from qiskit_aer import noise + +import qiskit.quantum_info as qi +from qiskit.circuit.library import QFT +from qiskit.circuit import QuantumCircuit, Reset +from qiskit.circuit.library.standard_gates import IGate, HGate +from qiskit.quantum_info.states.densitymatrix import DensityMatrix + +from qiskit.circuit import Parameter, Qubit, Clbit, QuantumRegister, ClassicalRegister +from qiskit.circuit.controlflow import * +from qiskit_aer.library.default_qubits import default_qubits +from qiskit_aer.library.control_flow_instructions import AerMark, AerJump + +import numpy as np + + +SUPPORTED_METHODS = [ + "statevector", + "density_matrix", +] +# tensor_network is tested in other test cases by setting shot_branching_enable by default + +SUPPORTED_METHODS_INITIALIZE = [ + "statevector", +] + + +@ddt +class TestShotBranching(SimulatorTestCase): + """AerSimulator measure tests.""" + + OPTIONS = {"seed_simulator": 41411} + + # --------------------------------------------------------------------- + # Test measure + # --------------------------------------------------------------------- + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_measure_nondeterministic_with_sampling(self, method, device): + """Test AerSimulator measure with non-deterministic counts with sampling""" + backend = self.backend(method=method, device=device) + shots = 4000 + circuits = ref_measure.measure_circuits_nondeterministic(allow_sampling=True) + targets = ref_measure.measure_counts_nondeterministic(shots) + result = backend.run( + circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True + ).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=0.05 * shots) + # Test sampling was enabled + for res in result.results: + self.assertIn("measure_sampling", res.metadata) + self.assertEqual(res.metadata["measure_sampling"], True) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_measure_nondeterministic_without_sampling(self, method, device): + """Test AerSimulator measure with non-deterministic counts without sampling""" + backend = self.backend(method=method, device=device) + shots = 4000 + delta = 0.05 + circuits = ref_measure.measure_circuits_nondeterministic(allow_sampling=False) + targets = ref_measure.measure_counts_nondeterministic(shots) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=delta * shots) + self.compare_result_metadata(result, circuits, "measure_sampling", False) + + @supported_methods(SUPPORTED_METHODS) + def 
+    def test_shot_branching_measure_sampling_with_quantum_noise(self, method, device):
+        """Test AerSimulator measure with deterministic counts with sampling and readout error"""
+        readout_error = [0.01, 0.1]
+        noise_model = NoiseModel()
+        depolarizing = {"u3": (1, 0.001), "cx": (2, 0.02)}
+        readout = [
+            [1.0 - readout_error[0], readout_error[0]],
+            [readout_error[1], 1.0 - readout_error[1]],
+        ]
+        noise_model.add_all_qubit_readout_error(ReadoutError(readout))
+        for gate, (num_qubits, gate_error) in depolarizing.items():
+            noise_model.add_all_qubit_quantum_error(
+                depolarizing_error(gate_error, num_qubits), gate
+            )
+
+        backend = self.backend(method=method, device=device, noise_model=noise_model)
+        shots = 1000
+        circuits = ref_measure.measure_circuits_deterministic(allow_sampling=True)
+        targets = ref_measure.measure_counts_deterministic(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        sampling = method == "density_matrix" or method == "tensor_network"
+        self.compare_result_metadata(result, circuits, "measure_sampling", sampling)
+
+    # ---------------------------------------------------------------------
+    # Test multi-qubit measure qobj instruction
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_measure_nondeterministic_multi_qubit_with_sampling(
+        self, method, device
+    ):
+        """Test AerSimulator measure with non-deterministic counts"""
+        backend = self.backend(method=method, device=device)
+        shots = 4000
+        circuits = ref_measure.multiqubit_measure_circuits_nondeterministic(allow_sampling=True)
+        targets = ref_measure.multiqubit_measure_counts_nondeterministic(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0.05 * shots)
+        self.compare_result_metadata(result, circuits, "measure_sampling", True)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_measure_nondeterministic_multi_qubit_without_sampling(
+        self, method, device
+    ):
+        """Test AerSimulator measure with non-deterministic counts"""
+        backend = self.backend(method=method, device=device)
+        shots = 4000
+        delta = 0.05
+        circuits = ref_measure.multiqubit_measure_circuits_nondeterministic(allow_sampling=False)
+        targets = ref_measure.multiqubit_measure_counts_nondeterministic(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+        self.compare_result_metadata(result, circuits, "measure_sampling", False)
+
+    # ---------------------------------------------------------------------
+    # Test reset
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_reset_nondeterministic(self, method, device):
+        """Test AerSimulator reset for circuits with non-deterministic counts"""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 4000
+        circuits = ref_reset.reset_circuits_nondeterministic(final_measure=True)
+        targets = ref_reset.reset_counts_nondeterministic(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_repeated_resets(self, method, device):
+        """Test repeated reset operations"""
+        backend = self.backend(method=method, device=device)
+        shots = 100
+        circuits = ref_reset.reset_circuits_repeated()
+        targets = ref_reset.reset_counts_repeated(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_reset_moving_qubits(self, method, device):
+        """Test AerSimulator reset for circuits where qubits have moved"""
+        backend = self.backend(method=method, device=device)
+        # count output circuits
+        shots = 1000
+        circuits = ref_reset.reset_circuits_with_entangled_and_moving_qubits(final_measure=True)
+        targets = ref_reset.reset_counts_with_entangled_and_moving_qubits(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0.05 * shots)
+
+    # ---------------------------------------------------------------------
+    # Test initialize instr make it through the wrapper
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_wrapper_1(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        shots = 100
+        if "tensor_network" in method:
+            shots = 10
+        lst = [0, 1]
+        init_states = [
+            np.array(lst),
+            np.array(lst, dtype=float),
+            np.array(lst, dtype=np.float32),
+            np.array(lst, dtype=complex),
+            np.array(lst, dtype=np.complex64),
+        ]
+        circuits = []
+        [
+            circuits.extend(ref_initialize.initialize_circuits_w_1(init_state))
+            for init_state in init_states
+        ]
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+
+    # ---------------------------------------------------------------------
+    # Test initialize instr make it through the wrapper
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_wrapper_2(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        shots = 100
+        lst = [0, 1, 0, 0]
+        init_states = [
+            np.array(lst),
+            np.array(lst, dtype=float),
+            np.array(lst, dtype=np.float32),
+            np.array(lst, dtype=complex),
+            np.array(lst, dtype=np.complex64),
+        ]
+        circuits = []
+        [
+            circuits.extend(ref_initialize.initialize_circuits_w_2(init_state))
+            for init_state in init_states
+        ]
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+
+    # ---------------------------------------------------------------------
+    # Test initialize
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_1(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 1000
+        delta = 0.05
+        circuits = ref_initialize.initialize_circuits_1(final_measure=True)
+        targets = ref_initialize.initialize_counts_1(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_2(self, method, device):
+        """Test AerSimulator initialize"""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 1000
+        delta = 0.05
+        circuits = ref_initialize.initialize_circuits_2(final_measure=True)
+        targets = ref_initialize.initialize_counts_2(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_entangled_qubits(self, method, device):
+        """Test initialize entangled qubits"""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        delta = 0.05
+        circuits = ref_initialize.initialize_entangled_qubits()
+        targets = ref_initialize.initialize_counts_entangled_qubits(shots)
+        result = backend.run(
+            circuits, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=delta * shots)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_sampling_opt_disabled(self, method, device):
+        """Test sampling optimization"""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuit = QuantumCircuit(2)
+        circuit.h([0, 1])
+        circuit.initialize([0, 1], [1])
+        circuit.measure_all()
+        result = backend.run(
+            circuit, shots=shots, shot_branching_enable=True, shot_branching_sampling_enable=True
+        ).result()
+        self.assertSuccess(result)
+        sampling = result.results[0].metadata.get("measure_sampling", None)
+        self.assertFalse(sampling)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_with_labels(self, method, device):
+        """Test initialize with labels"""
+        backend = self.backend(method=method, device=device)
+
+        circ = QuantumCircuit(4)
+        circ.initialize("+-rl")
+        circ.save_statevector()
+        actual = (
+            backend.run(circ, shot_branching_enable=True, shot_branching_sampling_enable=True)
+            .result()
+            .get_statevector(circ)
+        )
+
+        for q4, p4 in enumerate([1, 1]):
+            for q3, p3 in enumerate([1, -1]):
+                for q2, p2 in enumerate([1, 1j]):
+                    for q1, p1 in enumerate([1, -1j]):
+                        index = int("{}{}{}{}".format(q4, q3, q2, q1), 2)
+                        self.assertAlmostEqual(actual[index], 0.25 * p1 * p2 * p3 * p4)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_with_int(self, method, device):
+        """Test initialize with int"""
+        backend = self.backend(method=method, device=device)
+
+        circ = QuantumCircuit(4)
+        circ.initialize(5, [0, 1, 2])
+        circ.save_statevector()
+        actual = (
+            backend.run(circ, shot_branching_enable=True, shot_branching_sampling_enable=True)
+            .result()
+            .get_statevector(circ)
+        )
+
+        self.assertAlmostEqual(actual[5], 1)
+
+    @supported_methods(SUPPORTED_METHODS_INITIALIZE)
+    def test_shot_branching_initialize_with_int_twice(self, method, device):
+        """Test sampling with int twice"""
+        backend = self.backend(method=method, device=device)
+
+        circ = QuantumCircuit(4)
+        circ.initialize(1, [0])
+        circ.initialize(1, [2])
+        circ.save_statevector()
+        actual = (
+            backend.run(circ, shot_branching_enable=True, shot_branching_sampling_enable=True)
+            .result()
+            .get_statevector(circ)
+        )
+
+        self.assertAlmostEqual(actual[5], 1)
+
+    # ---------------------------------------------------------------------
+    # Test noise
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_empty_circuit_noise(self, method, device):
+        """Test simulation with empty circuit and noise model."""
+        backend = self.backend(method=method, device=device)
+        noise_model = noise.NoiseModel()
+        noise_model.add_all_qubit_quantum_error(noise.depolarizing_error(0.1, 1), ["x"])
+        result = backend.run(
+            QuantumCircuit(), shots=1, noise_model=noise_model, shot_branching_enable=True
+        ).result()
+        self.assertSuccess(result)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_readout_noise(self, method, device):
+        """Test simulation with classical readout error noise model."""
+        backend = self.backend(method=method, device=device)
+        # For statevector output we can combine deterministic and non-deterministic
+        # count output circuits
+        shots = 4000
+        circuits = ref_readout_noise.readout_error_circuits()
+        noise_models = ref_readout_noise.readout_error_noise_models()
+        targets = ref_readout_noise.readout_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_pauli_gate_noise(self, method, device):
+        """Test simulation with Pauli gate error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_pauli_noise.pauli_gate_error_circuits()
+        noise_models = ref_pauli_noise.pauli_gate_error_noise_models()
+        targets = ref_pauli_noise.pauli_gate_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_pauli_reset_noise(self, method, device):
+        """Test simulation with Pauli reset error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_pauli_noise.pauli_reset_error_circuits()
+        noise_models = ref_pauli_noise.pauli_reset_error_noise_models()
+        targets = ref_pauli_noise.pauli_reset_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_pauli_measure_noise(self, method, device):
+        """Test simulation with Pauli measure error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_pauli_noise.pauli_measure_error_circuits()
+        noise_models = ref_pauli_noise.pauli_measure_error_noise_models()
+        targets = ref_pauli_noise.pauli_measure_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_reset_gate_noise(self, method, device):
+        """Test simulation with reset gate error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_reset_noise.reset_gate_error_circuits()
+        noise_models = ref_reset_noise.reset_gate_error_noise_models()
+        targets = ref_reset_noise.reset_gate_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_kraus_gate_noise(self, method, device):
+        """Test simulation with Kraus gate error noise model."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        circuits = ref_kraus_noise.kraus_gate_error_circuits()
+        noise_models = ref_kraus_noise.kraus_gate_error_noise_models()
+        targets = ref_kraus_noise.kraus_gate_error_counts(shots)
+
+        for circuit, noise_model, target in zip(circuits, noise_models, targets):
+            backend.set_options(noise_model=noise_model)
+            result = backend.run(circuit, shots=shots, shot_branching_enable=True).result()
+            self.assertSuccess(result)
+            self.compare_counts(result, [circuit], [target], delta=0.05 * shots)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_kraus_gate_noise_on_QFT(self, method, device):
+        """Test Kraus noise on a QFT circuit"""
+        shots = 10000
+
+        # Build noise model
+        error1 = noise.amplitude_damping_error(0.2)
+        error2 = error1.tensor(error1)
+        noise_model = noise.NoiseModel()
+        noise_model.add_all_qubit_quantum_error(error1, ["h"])
+        noise_model.add_all_qubit_quantum_error(error2, ["cp", "swap"])
+
+        backend = self.backend(method=method, device=device, noise_model=noise_model)
+        ideal_circuit = transpile(QFT(3), backend)
+
+        # manually build noise circuit
+        noise_circuit = QuantumCircuit(3)
+        for inst, qargs, cargs in ideal_circuit.data:
+            noise_circuit.append(inst, qargs, cargs)
+            if inst.name == "h":
+                noise_circuit.append(error1, qargs)
+            elif inst.name in ["cp", "swap"]:
+                noise_circuit.append(error2, qargs)
+        # compute target counts
+        noise_state = DensityMatrix(noise_circuit)
+        ref_target = {i: shots * p for i, p in noise_state.probabilities_dict().items()}
+
+        # Run sim
+        ideal_circuit.measure_all()
+        result = backend.run(ideal_circuit, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(
+            result, [ideal_circuit], [ref_target], hex_counts=False, delta=0.1 * shots
+        )
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_clifford_circuit_noise(self, method, device):
+        """Test simulation with mixed Clifford quantum errors in circuit."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        error1 = noise.QuantumError(
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_clifford_circuit_noise(self, method, device):
+        """Test simulation with mixed Clifford quantum errors in circuit."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        error1 = noise.QuantumError(
+            [([(IGate(), [0])], 0.8), ([(Reset(), [0])], 0.1), ([(HGate(), [0])], 0.1)]
+        )
+
+        error2 = noise.QuantumError(
+            [
+                ([(IGate(), [0])], 0.75),
+                ([(Reset(), [0])], 0.1),
+                ([(Reset(), [1])], 0.1),
+                ([(Reset(), [0]), (Reset(), [1])], 0.05),
+            ]
+        )
+
+        qc = QuantumCircuit(2)
+        qc.h(0)
+        qc.append(error1, [0])
+        qc.cx(0, 1)
+        qc.append(error2, [0, 1])
+        target_probs = qi.DensityMatrix(qc).probabilities_dict()
+
+        # Add measurement
+        qc.measure_all()
+        result = backend.run(qc, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        probs = {key: val / shots for key, val in result.get_counts(0).items()}
+        self.assertDictAlmostEqual(target_probs, probs, delta=0.1)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_kraus_circuit_noise(self, method, device):
+        """Test simulation with Kraus quantum errors in circuit."""
+        backend = self.backend(method=method, device=device)
+        shots = 1000
+        error0 = noise.amplitude_damping_error(0.05)
+        error1 = noise.amplitude_damping_error(0.15)
+        error01 = error1.tensor(error0)
+
+        # Target Circuit 0
+        tc0 = QuantumCircuit(2)
+        tc0.h(0)
+        tc0.append(qi.Kraus(error0), [0])
+        tc0.cx(0, 1)
+        tc0.append(qi.Kraus(error01), [0, 1])
+        target_probs0 = qi.DensityMatrix(tc0).probabilities_dict()
+
+        # Sim circuit 0
+        qc0 = QuantumCircuit(2)
+        qc0.h(0)
+        qc0.append(error0, [0])
+        qc0.cx(0, 1)
+        qc0.append(error01, [0, 1])
+        qc0.measure_all()
+
+        # Target Circuit 1
+        tc1 = QuantumCircuit(2)
+        tc1.h(1)
+        tc1.append(qi.Kraus(error0), [1])
+        tc1.cx(1, 0)
+        tc1.append(qi.Kraus(error01), [1, 0])
+        target_probs1 = qi.DensityMatrix(tc1).probabilities_dict()
+
+        # Sim circuit 1
+        qc1 = QuantumCircuit(2)
+        qc1.h(1)
+        qc1.append(error0, [1])
+        qc1.cx(1, 0)
+        qc1.append(error01, [1, 0])
+        qc1.measure_all()
+
+        result = backend.run([qc0, qc1], shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        probs = [{key: val / shots for key, val in result.get_counts(i).items()} for i in range(2)]
+        self.assertDictAlmostEqual(target_probs0, probs[0], delta=0.1)
+        self.assertDictAlmostEqual(target_probs1, probs[1], delta=0.1)
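Beyond backend-level noise models, the two tests above exercise noise embedded in the circuit itself: a noise.QuantumError or a qi.Kraus channel is appended like any other instruction. A minimal sketch of that usage, with illustrative probabilities and qubit layout:

    # Hedged sketch: noise instructions appended directly to a circuit.
    from qiskit import QuantumCircuit
    from qiskit.circuit import Reset
    from qiskit.circuit.library import HGate, IGate
    import qiskit.quantum_info as qi
    from qiskit_aer import AerSimulator
    from qiskit_aer.noise import QuantumError, amplitude_damping_error

    # Mixed Clifford error: do nothing, reset, or apply H, with given probabilities
    error = QuantumError(
        [([(IGate(), [0])], 0.8), ([(Reset(), [0])], 0.1), ([(HGate(), [0])], 0.1)]
    )

    qc = QuantumCircuit(2)
    qc.h(0)
    qc.append(error, [0])  # QuantumError appended as an instruction
    qc.cx(0, 1)
    qc.append(qi.Kraus(amplitude_damping_error(0.05)), [0])  # Kraus channel as an instruction
    qc.measure_all()

    result = AerSimulator().run(qc, shots=1000, shot_branching_enable=True).result()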
+
+    # ---------------------------------------------------------------------
+    # Test conditional gates
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_1bit(self, method, device):
+        """Test conditional gate operations on 1-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        circuits = ref_conditionals.conditional_circuits_1bit(
+            final_measure=True, conditional_type="gate"
+        )
+        targets = ref_conditionals.conditional_counts_1bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_2bit(self, method, device):
+        """Test conditional gate operations on 2-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_2bit(
+            final_measure=True, conditional_type="gate"
+        )
+        targets = ref_conditionals.conditional_counts_2bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_64bit(self, method, device):
+        """Test conditional gate operations on 64-bit conditional register."""
+        shots = 100
+        # [value of conditional register, list of conditional values]
+        cases = ref_conditionals.conditional_cases_64bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
+            64, cases, final_measure=True, conditional_type="gate"
+        )
+        # not using hex counts because the number of leading zeros in results
+        # doesn't seem consistent
+        targets = ref_conditionals.condtional_counts_nbit(64, cases, shots, hex_counts=False)
+
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, hex_counts=False, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_gates_132bit(self, method, device):
+        """Test conditional gate operations on 132-bit conditional register."""
+        shots = 100
+        cases = ref_conditionals.conditional_cases_132bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
+            132, cases, final_measure=True, conditional_type="gate"
+        )
+        targets = ref_conditionals.condtional_counts_nbit(132, cases, shots, hex_counts=False)
+        circuits = circuits[0:1]
+        targets = targets[0:1]
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, hex_counts=False, delta=0)
+
+    # ---------------------------------------------------------------------
+    # Test conditional unitaries
+    # ---------------------------------------------------------------------
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_unitary_1bit(self, method, device):
+        """Test conditional unitary operations on 1-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        circuits = ref_conditionals.conditional_circuits_1bit(
+            final_measure=True, conditional_type="unitary"
+        )
+        targets = ref_conditionals.conditional_counts_1bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_unitary_2bit(self, method, device):
+        """Test conditional unitary operations on 2-bit conditional register."""
+        shots = 100
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_2bit(
+            final_measure=True, conditional_type="unitary"
+        )
+        targets = ref_conditionals.conditional_counts_2bit(shots)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, delta=0)
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_unitary_64bit(self, method, device):
+        """Test conditional unitary operations on 64-bit conditional register."""
+        shots = 100
+        cases = ref_conditionals.conditional_cases_64bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
conditional_type="unitary" + ) + targets = ref_conditionals.condtional_counts_nbit(64, cases, shots, hex_counts=False) + + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, hex_counts=False, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_unitary_132bit(self, method, device): + """Test conditional unitary operations on 132-bit conditional register.""" + shots = 100 + cases = ref_conditionals.conditional_cases_132bit() + backend = self.backend(method=method, device=device) + backend.set_options(max_parallel_experiments=0) + circuits = ref_conditionals.conditional_circuits_nbit( + 132, cases, final_measure=True, conditional_type="unitary" + ) + targets = ref_conditionals.condtional_counts_nbit(132, cases, shots, hex_counts=False) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, hex_counts=False, delta=0) + + # --------------------------------------------------------------------- + # Test conditional + # --------------------------------------------------------------------- + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_unitary_1bit(self, method, device): + """Test conditional kraus operations on 1-bit conditional register.""" + shots = 100 + backend = self.backend(method=method, device=device) + circuits = ref_conditionals.conditional_circuits_1bit( + final_measure=True, conditional_type="kraus" + ) + targets = ref_conditionals.conditional_counts_1bit(shots) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_kraus_2bit(self, method, device): + """Test conditional kraus operations on 2-bit conditional register.""" + shots = 100 + backend = self.backend(method=method, device=device) + backend.set_options(max_parallel_experiments=0) + circuits = ref_conditionals.conditional_circuits_2bit( + final_measure=True, conditional_type="kraus" + ) + targets = ref_conditionals.conditional_counts_2bit(shots) + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_kraus_64bit(self, method, device): + """Test conditional kraus operations on 64-bit conditional register.""" + shots = 100 + cases = ref_conditionals.conditional_cases_64bit() + backend = self.backend(method=method, device=device) + backend.set_options(max_parallel_experiments=0) + circuits = ref_conditionals.conditional_circuits_nbit( + 64, cases, final_measure=True, conditional_type="kraus" + ) + targets = ref_conditionals.condtional_counts_nbit(64, cases, shots, hex_counts=False) + + result = backend.run(circuits, shots=shots, shot_branching_enable=True).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, hex_counts=False, delta=0) + + @supported_methods(SUPPORTED_METHODS) + def test_shot_branching_conditional_kraus_132bit(self, method, device): + """Test conditional kraus operations on 132-bit conditional register.""" + shots = 100 + cases = ref_conditionals.conditional_cases_132bit() + backend = self.backend(method=method, 
+
+    @supported_methods(SUPPORTED_METHODS)
+    def test_shot_branching_conditional_kraus_132bit(self, method, device):
+        """Test conditional Kraus operations on 132-bit conditional register."""
+        shots = 100
+        cases = ref_conditionals.conditional_cases_132bit()
+        backend = self.backend(method=method, device=device)
+        backend.set_options(max_parallel_experiments=0)
+        circuits = ref_conditionals.conditional_circuits_nbit(
+            132, cases, final_measure=True, conditional_type="kraus"
+        )
+        targets = ref_conditionals.condtional_counts_nbit(132, cases, shots, hex_counts=False)
+        result = backend.run(circuits, shots=shots, shot_branching_enable=True).result()
+        self.assertSuccess(result)
+        self.compare_counts(result, circuits, targets, hex_counts=False, delta=0)
+
+    # ---------------------------------------------------------------------
+    # Test control flow
+    # ---------------------------------------------------------------------
diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py
index 2a41d69518..1e3c99b145 100644
--- a/test/terra/backends/simulator_test_case.py
+++ b/test/terra/backends/simulator_test_case.py
@@ -42,6 +42,10 @@ def backend(self, **options):
                 sim_options["batched_shots_gpu"] = True
             else:
                 sim_options[key] = val
+            # enable shot_branching if the method is tensor_network
+            if "method" == key and "tensor_network" in val:
+                sim_options["shot_branching_enable"] = True
+                sim_options["shot_branching_sampling_enable"] = True
         return self.BACKEND(**sim_options)
@@ -82,6 +86,7 @@ def _method_device(methods):
     cuStateVec = check_cuStateVec(available_devices)
 
     gpu_methods = ["statevector", "density_matrix", "unitary", "tensor_network"]
+    batchable_methods = ["statevector", "density_matrix"]
     data_args = []
     for method in methods:
         if method in available_methods:
@@ -94,8 +99,9 @@ def _method_device(methods):
             for device in available_devices:
                 data_args.append((method, device))
                 if device == "GPU":
-                    # add batched optimization test for GPU
-                    data_args.append((method, "GPU_batch"))
+                    if method in batchable_methods:
+                        # add batched optimization test for GPU
+                        data_args.append((method, "GPU_batch"))
                     # add test cases for cuStateVec if available using special device = 'GPU_cuStateVec'
                     # 'GPU_cuStateVec' is used only inside tests, not available in Aer,
                     # and this is converted to "device='GPU'" and option "cuStateVec_enable = True" is added
diff --git a/test/terra/common.py b/test/terra/common.py
index 8ce447f970..e7092df517 100644
--- a/test/terra/common.py
+++ b/test/terra/common.py
@@ -47,7 +47,7 @@ class QiskitAerTestCase(FullQiskitTestCase):
 
     def setUp(self):
         super().setUp()
-        self.useFixture(fixtures.Timeout(120, gentle=False))
+        self.useFixture(fixtures.Timeout(240, gentle=False))
 
     @classmethod
     def setUpClass(cls):
diff --git a/tox.ini b/tox.ini
index 8f418f383c..725e45bbc2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -38,6 +38,7 @@ commands =
     pylint -j 2 -rn qiskit_aer
 
 [testenv:clang-format]
+allowlist_externals = sh
 envdir = .tox/lint
 commands = sh tools/clang-format.sh -i
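The test-harness change above turns both shot-branching options on whenever the method is tensor_network; outside the harness, the equivalent is to set them once as backend defaults rather than as per-run keywords. A sketch, assuming an Aer build where the chosen method is available:

    # Hedged sketch: enabling shot branching as backend defaults.
    from qiskit_aer import AerSimulator

    backend = AerSimulator(method="statevector")
    backend.set_options(
        shot_branching_enable=True,
        shot_branching_sampling_enable=True,
    )
    # Every subsequent backend.run(...) call now uses these options by default.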