merge shot-branching multi-GPU distribution
doichanj committed Jul 10, 2023
1 parent 3bd2583 commit 3c18608
Showing 12 changed files with 308 additions and 206 deletions.
6 changes: 5 additions & 1 deletion CMakeLists.txt
@@ -269,7 +269,11 @@ if(AER_THRUST_SUPPORTED)
   message(STATUS "PYTHON_SITE_PATH = ${PYTHON_SITE_PATH}")

   set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA)
-  set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC AER_CUTENSORNET)
+  if(DEFINED AER_DISABLE_TENSORNETWORK)
+    set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC)
+  else()
+    set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC AER_CUTENSORNET)
+  endif()

   set(CUDA_NVCC_FLAGS "${AER_CUDA_ARCH_FLAGS_EXPAND} -DAER_THRUST_CUDA -I${AER_SIMULATOR_CPP_SRC_DIR} -isystem ${AER_SIMULATOR_CPP_SRC_DIR}/third-party/headers -use_fast_math --expt-extended-lambda")
   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -I${PYTHON_SITE_PATH}/cuquantum/include")
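Editor's note: with this hunk, tensor-network support can be compiled out at configure time; any definition of AER_DISABLE_TENSORNETWORK takes the first branch, e.g. (illustrative invocation, not from this commit):

    cmake -DAER_DISABLE_TENSORNETWORK=ON ..

The default configuration keeps both AER_CUSTATEVEC and AER_CUTENSORNET defined.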
2 changes: 1 addition & 1 deletion src/simulators/batch_shots_executor.hpp
@@ -354,7 +354,7 @@ void BatchShotsExecutor<state_t>::run_circuit_shots(
   //convert parameters into matrix in cvector_t format
   Transpile::BatchConverter batch_converter;
   batch_converter.set_config(config);
-  batch_converter.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+  batch_converter.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
                                    fusion_result);

   Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
261 changes: 160 additions & 101 deletions src/simulators/multi_state_executor.hpp

Large diffs are not rendered by default.

180 changes: 116 additions & 64 deletions src/simulators/parallel_state_executor.hpp
@@ -357,7 +357,8 @@ bool ParallelStateExecutor<state_t>::allocate_states(uint_t num_states,
       squbits, gqubits, Base::global_state_index_, num_states);
   for (i = 1; i < num_states_allocated; i++) {
     Base::states_[i].set_config(config);
-    Base::states_[i].qreg().chunk_setup(Base::states_[0].qreg(),
+    Base::states_[i].qreg().chunk_setup(
+        Base::states_[0].qreg(),
                                         Base::global_state_index_ + i);
     Base::states_[i].qreg().set_num_threads_per_group(
         Base::num_threads_per_group_);
@@ -417,18 +418,20 @@ void ParallelStateExecutor<state_t>::run_circuit_with_sampling(
   // Optimize circuit
   Noise::NoiseModel dummy_noise;
   state_t dummy_state;
+  ExperimentResult fusion_result;

+  //optimize circuit
   bool cache_block = false;
   if (multiple_chunk_required(circ, dummy_noise)) {
     auto fusion_pass = Base::transpile_fusion(circ.opset(), config);
     fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
-                                 *result_it);
+                                 fusion_result);

     // Cache blocking pass
     auto cache_block_pass = transpile_cache_blocking(circ, dummy_noise, config);
     cache_block_pass.set_sample_measure(true);
     cache_block_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
-                                      *result_it);
+                                      fusion_result);
     cache_block = cache_block_pass.enabled();
   }
   if (!cache_block) {
@@ -441,9 +444,13 @@
   uint_t nchunks =
       1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale());

-  Base::set_distribution(nchunks);
-  allocate(circ.num_qubits, config);
+  for(uint_t iparam=0;iparam<Base::num_bind_params_;iparam++){
+    Base::set_distribution(nchunks);
+    allocate(circ.num_qubits, config);
+    ExperimentResult& result = *(result_it + iparam);
+    result.metadata.copy(fusion_result.metadata);

     // Set state config
     for (uint_t i = 0; i < Base::states_.size(); i++) {
       Base::states_[i].set_parallelization(Base::parallel_state_update_);
@@ -457,8 +464,10 @@
     RngEngine rng;
     if(iparam == 0)
       rng = init_rng;
-    else
+    else if(Base::num_bind_params_ > 1)
       rng.set_seed(circ.seed_for_params[iparam]);
+    else
+      rng.set_seed(circ.seed);

     auto &ops = circ.ops;
     auto first_meas = circ.first_measure_pos; // Position of first measurement op
@@ -471,15 +480,15 @@

     // Run circuit instructions before first measure
     apply_ops_chunks(ops.cbegin(), ops.cbegin() + first_meas,
-                     *(result_it + iparam), rng, iparam, final_ops);
+                     result, rng, iparam, final_ops);

     // Get measurement operations and set of measured qubits
     measure_sampler(circ.ops.begin() + first_meas, circ.ops.end(), circ.shots,
-                    *(result_it + iparam), rng);
+                    result, rng);

     // Add measure sampling metadata
-    (result_it + iparam)->metadata.add(true, "measure_sampling");
-    Base::states_[0].add_metadata(*(result_it + iparam));
+    result.metadata.add(true, "measure_sampling");
+    Base::states_[0].add_metadata(result);
   }
 }
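Editor's note: the seeding branch above decides how each parameter binding gets its RNG. A minimal standalone C++ sketch of that rule, using hypothetical stand-ins for Aer's RngEngine and circuit fields (nothing below is taken from the commit itself):

#include <cstdint>
#include <vector>

// Hypothetical stand-in for Aer's RngEngine, for illustration only.
struct RngEngine {
  std::uint64_t seed = 0;
  void set_seed(std::uint64_t s) { seed = s; }
};

// Rule implemented by the hunk above: binding 0 reuses the externally
// supplied RNG so existing runs stay reproducible; later bindings draw a
// per-parameter seed; a circuit without multiple bindings falls back to
// the plain circuit seed.
RngEngine select_rng(const RngEngine &init_rng, std::uint64_t iparam,
                     std::uint64_t num_bind_params,
                     const std::vector<std::uint64_t> &seed_for_params,
                     std::uint64_t seed) {
  RngEngine rng;
  if (iparam == 0)
    rng = init_rng;
  else if (num_bind_params > 1)
    rng.set_seed(seed_for_params[iparam]);
  else
    rng.set_seed(seed);
  return rng;
}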

@@ -495,41 +504,47 @@ void ParallelStateExecutor<state_t>::run_circuit_shots(

   uint_t nchunks =
       1ull << ((circ.num_qubits - cache_block_qubit_) * qubit_scale());
-  Base::set_distribution(nchunks);
-  Base::num_bind_params_ = circ.num_bind_params;

+  // Optimize circuit
+  Noise::NoiseModel dummy_noise;
+  state_t dummy_state;
+  auto fusion_pass = Base::transpile_fusion(circ.opset(), config);
+  auto cache_block_pass = transpile_cache_blocking(circ, noise, config);
+  ExperimentResult fusion_result;
+  if (!sample_noise) {
+    fusion_pass.optimize_circuit(circ, dummy_noise, dummy_state.opset(),
+                                 fusion_result);
+    // Cache blocking pass
+    cache_block_pass.set_sample_measure(false);
+    cache_block_pass.optimize_circuit(circ, dummy_noise,
+                                      dummy_state.opset(), fusion_result);
+    Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ);
+  }
+  else{
+    Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ);
+    Base::max_matrix_qubits_ = std::max(Base::max_matrix_qubits_, (int)fusion_pass.max_qubit);
+  }
+
+  Base::set_distribution(nchunks);
+  allocate(circ.num_qubits, config);
+
+  Base::num_bind_params_ = circ.num_bind_params;
   for(uint_t iparam=0;iparam<Base::num_bind_params_;iparam++){
+    if (!sample_noise) {
+      ExperimentResult& result = *(result_it + iparam);
+      result.metadata.copy(fusion_result.metadata);
+    }

     for (int_t ishot = 0; ishot < circ.shots; ishot++) {
       RngEngine rng;
       if(iparam == 0 && ishot == 0)
         rng = init_rng;
-      else
+      else if(Base::num_bind_params_ > 1)
         rng.set_seed(circ.seed_for_params[iparam] + ishot);
+      else
+        rng.set_seed(circ.seed + ishot);

-      // Optimize circuit
-      Noise::NoiseModel dummy_noise;
-      state_t dummy_state;
-
-      Circuit circ_opt;
-      if (sample_noise) {
-        circ_opt = noise.sample_noise(circ, rng);
-      } else {
-        circ_opt = circ;
-      }
-      fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
-                                   *result_it);
-      Base::max_matrix_qubits_ = Base::get_max_matrix_qubits(circ_opt);
-
-      // Cache blocking pass
-      cache_block_pass.set_sample_measure(false);
-      cache_block_pass.optimize_circuit(circ_opt, dummy_noise,
-                                        dummy_state.opset(), *result_it);
-      allocate(circ.num_qubits, config);
-
-      // Set state config
+      // Set state config and global phase
       for (uint_t i = 0; i < Base::states_.size(); i++) {
         Base::states_[i].set_parallelization(Base::parallel_state_update_);
         if(circ.global_phase_for_params.size() == circ.num_bind_params)
@@ -538,14 +553,27 @@
           Base::states_[i].set_global_phase(circ.global_phase_angle);
       }

+      //initialize
       initialize_qreg(circ.num_qubits);
       for (uint_t i = 0; i < Base::states_.size(); i++) {
         Base::states_[i].initialize_creg(circ.num_memory, circ.num_registers);
       }

-      apply_ops_chunks(circ_opt.ops.cbegin(), circ_opt.ops.cend(),
-                       *(result_it + iparam), rng, iparam, true);
-      result_it->save_count_data(Base::states_[0].creg(), Base::save_creg_memory_);
+      if (sample_noise) {
+        Circuit circ_opt = noise.sample_noise(circ, rng);
+        fusion_pass.optimize_circuit(circ_opt, dummy_noise, dummy_state.opset(),
+                                     *(result_it + iparam));
+        // Cache blocking pass
+        cache_block_pass.set_sample_measure(false);
+        cache_block_pass.optimize_circuit(circ_opt, dummy_noise,
+                                          dummy_state.opset(), *(result_it + iparam));
+
+        apply_ops_chunks(circ_opt.ops.cbegin(), circ_opt.ops.cend(),
+                         *(result_it + iparam), rng, iparam, true);
+      }
+      else{
+        apply_ops_chunks(circ.ops.cbegin(), circ.ops.cend(),
+                         *(result_it + iparam), rng, iparam, true);
+      }
+      (result_it + iparam)->save_count_data(Base::states_[0].creg(),
+                                            Base::save_creg_memory_);
     }
     Base::states_[0].add_metadata(*(result_it + iparam));
   }
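Editor's note: the net effect of this restructuring is that fusion and cache blocking are hoisted out of the shot loop whenever noise sampling is off, leaving per-shot re-optimization only on the noisy path. A runnable toy illustration of that pattern, with dummy types that merely stand in for Aer's Circuit and transpiler passes (not Aer code):

#include <cstdint>
#include <vector>

struct Circuit { std::vector<int> ops; };   // stand-in for Aer's Circuit

Circuit optimize(Circuit c) { return c; }   // fusion + cache-blocking stand-in
Circuit sample_noise(const Circuit &c, std::uint64_t seed) {
  Circuit noisy = c;
  noisy.ops.push_back(static_cast<int>(seed));  // pretend noise insertion
  return noisy;
}
void run(const Circuit &) {}

void run_shots(const Circuit &circ, bool with_noise, std::uint64_t nshots) {
  Circuit shared;
  if (!with_noise)
    shared = optimize(circ);  // hoisted: done once, as in the hunk above
  for (std::uint64_t ishot = 0; ishot < nshots; ishot++) {
    if (with_noise)
      run(optimize(sample_noise(circ, ishot)));  // per-shot circuit is unavoidable
    else
      run(shared);  // every shot reuses the pre-optimized circuit
  }
}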
@@ -651,10 +679,15 @@ void ParallelStateExecutor<state_t>::store_measure(const reg_t &outcome,
                                                    const reg_t &memory,
                                                    const reg_t &registers) {
   auto apply_store_measure = [this, outcome, memory, registers](int_t iGroup) {
-    // store creg to the all top states of groups so that conditional ops can be
-    // applied correctly
-    Base::states_[Base::top_state_of_group_[iGroup]].creg().store_measure(
-        outcome, memory, registers);
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if(Base::cuStateVec_enable_){
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for(int_t i=0;i<nChunk;i++)
+      Base::states_[iChunk + i].creg().store_measure(outcome, memory, registers);
   };
   Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
                                 0, Base::num_groups_, apply_store_measure);
@@ -663,9 +696,15 @@
 template <class state_t>
 void ParallelStateExecutor<state_t>::apply_bfunc(const Operations::Op &op) {
   auto bfunc_kernel = [this, op](int_t iGroup) {
-    // store creg to the all top states of groups so that conditional ops can be
-    // applied correctly
-    Base::states_[Base::top_state_of_group_[iGroup]].creg().apply_bfunc(op);
+    int_t iChunk = Base::top_state_of_group_[iGroup];
+    int_t nChunk = 1;
+#ifdef AER_CUSTATEVEC
+    if(Base::cuStateVec_enable_){
+      nChunk = Base::num_states_in_group_[iGroup];
+    }
+#endif
+    for(int_t i=0;i<nChunk;i++)
+      Base::states_[iChunk + i].creg().apply_bfunc(op);
   };
   Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
                                 0, Base::num_groups_, bfunc_kernel);
Expand All @@ -675,10 +714,15 @@ template <class state_t>
void ParallelStateExecutor<state_t>::apply_roerror(const Operations::Op &op,
RngEngine &rng) {
auto roerror_kernel = [this, op, &rng](int_t iGroup) {
// store creg to the all top states of groups so that conditional ops can be
// applied correctly
Base::states_[Base::top_state_of_group_[iGroup]].creg().apply_roerror(op,
rng);
int_t iChunk = Base::top_state_of_group_[iGroup];
int_t nChunk = 1;
#ifdef AER_CUSTATEVEC
if(Base::cuStateVec_enable_){
nChunk = Base::num_states_in_group_[iGroup];
}
#endif
for(int_t i=0;i<nChunk;i++)
Base::states_[iChunk + i].creg().apply_roerror(op, rng);
};
Utils::apply_omp_parallel_for((chunk_omp_parallel_ && Base::num_groups_ > 1),
0, Base::num_groups_, roerror_kernel);
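Editor's note: the three hunks above repeat a single pattern: a classical-register update is applied to the top state of each group, or, when cuStateVec batching is enabled, to every state in the group so that conditional ops read a consistent creg. A sketch of how that loop could be factored into a shared helper (hypothetical helper, not part of the commit; states_per_group stands in for the AER_CUSTATEVEC-guarded count):

#include <cstdint>
#include <functional>

// states_per_group returns 1 normally, or num_states_in_group_[iGroup]
// when cuStateVec batching is enabled, mirroring the #ifdef blocks above.
void for_each_creg_state(std::int64_t iGroup,
                         const std::function<std::int64_t(std::int64_t)> &top_state_of_group,
                         const std::function<std::int64_t(std::int64_t)> &states_per_group,
                         const std::function<void(std::int64_t)> &update_creg) {
  std::int64_t iChunk = top_state_of_group(iGroup);
  std::int64_t nChunk = states_per_group(iGroup);
  for (std::int64_t i = 0; i < nChunk; i++)
    update_creg(iChunk + i);  // e.g. store_measure / apply_bfunc / apply_roerror
}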
@@ -760,25 +804,33 @@ void ParallelStateExecutor<state_t>::apply_ops_chunks(InputIterator first,
       }
       iOp = iOpEnd;
     } else {
-      bool param_op = false;
       if(op_iOp.has_bind_params){
-        Operations::Op bind_op = Operations::make_parameter_bind(op_iOp, iparam, Base::num_bind_params_);
-        if(apply_parallel_op(bind_op, result, rng, final_ops && nOp == iOp + 1))
-          param_op = true;
+        std::vector<Operations::Op> bind_op(1);
+        bind_op[0] = Operations::make_parameter_bind(op_iOp, iparam, Base::num_bind_params_);
+        if(!apply_parallel_op(bind_op[0], result, rng, final_ops && nOp == iOp + 1)){
+          if (Base::num_groups_ > 1 && chunk_omp_parallel_) {
+#pragma omp parallel for num_threads(Base::num_groups_)
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, bind_op.cbegin(), bind_op.cend(), result,
+                                       rng, iparam);
+          } else {
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, bind_op.cbegin(), bind_op.cend(), result,
+                                       rng, iparam);
+          }
+        }
       }else{
-        if(apply_parallel_op(op_iOp, result, rng, final_ops && nOp == iOp + 1))
-          param_op = true;
-      }
-      if(!param_op){
-        if (Base::num_groups_ > 1 && chunk_omp_parallel_) {
+        if(!apply_parallel_op(op_iOp, result, rng, final_ops && nOp == iOp + 1)){
+          if (Base::num_groups_ > 1 && chunk_omp_parallel_) {
 #pragma omp parallel for num_threads(Base::num_groups_)
-        for (int_t ig = 0; ig < Base::num_groups_; ig++)
-          apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
-                                   rng, iparam);
-      } else {
-        for (int_t ig = 0; ig < Base::num_groups_; ig++)
-          apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
-                                   rng, iparam);
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
+                                       rng, iparam);
+          } else {
+            for (int_t ig = 0; ig < Base::num_groups_; ig++)
+              apply_cache_blocking_ops(ig, first + iOp, first + iOp + 1, result,
+                                       rng, iparam);
+          }
+        }
       }
     }
2 changes: 2 additions & 0 deletions src/simulators/shot_branching.hpp
@@ -27,6 +27,7 @@ class Branch;
 class Branch {
 protected:
   uint_t state_index_; // state index
+  uint_t root_state_index_;

   uint_t shot_index_; // starting shot index

@@ -61,6 +62,7 @@
   }

   uint_t &state_index(void) { return state_index_; }
+  uint_t &root_state_index(void) { return root_state_index_; }
   uint_t &shot_index(void) { return shot_index_; }
   ClassicalRegister &creg(void) { return creg_; }
   std::vector<RngEngine> &rng_shots(void) { return shots_; }
4 changes: 2 additions & 2 deletions src/simulators/statevector/qubitvector.hpp
@@ -1033,14 +1033,14 @@ uint_t QubitVector<data_t>::chunk_setup(int chunk_bits, int num_qubits,
                                         uint_t chunk_index,
                                         uint_t num_local_chunks) {
   chunk_index_ = chunk_index;
-  return 1;
+  return num_local_chunks;
 }

 template <typename data_t>
 uint_t QubitVector<data_t>::chunk_setup(QubitVector<data_t> &base,
                                         const uint_t chunk_index) {
   chunk_index_ = chunk_index;
-  return 1;
+  return 0;
 }

 // prepare buffer for MPI send/recv
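Editor's note: read together with the Thrust overloads below, these return-value changes give chunk_setup a consistent meaning across backends (inferred from this diff, not stated in it): the allocating overload reports the number of local chunks it covers, while the overload that attaches to an existing base vector returns 0 because it shares the base allocation. A runnable toy model of that convention:

#include <cstdint>
#include <iostream>

struct ToyVector {               // stand-in for QubitVector, illustration only
  std::uint64_t chunk_index_ = 0;
  std::uint64_t chunk_setup(int /*chunk_bits*/, int /*num_qubits*/,
                            std::uint64_t chunk_index,
                            std::uint64_t num_local_chunks) {
    chunk_index_ = chunk_index;
    return num_local_chunks;     // chunks this allocation accounts for
  }
  std::uint64_t chunk_setup(const ToyVector & /*base*/,
                            std::uint64_t chunk_index) {
    chunk_index_ = chunk_index;
    return 0;                    // no new chunks: shares the base allocation
  }
};

int main() {
  ToyVector base, child;
  std::cout << base.chunk_setup(10, 20, 0, 4) << "\n";  // prints 4
  std::cout << child.chunk_setup(base, 1) << "\n";      // prints 0
}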
17 changes: 8 additions & 9 deletions src/simulators/statevector/qubitvector_thrust.hpp
@@ -705,10 +705,8 @@ void QubitVectorThrust<data_t>::copy_qv(const QubitVectorThrust<data_t> &obj) {
   num_threads_per_group_ = obj.num_threads_per_group_;
   max_matrix_bits_ = obj.max_matrix_bits_;

-  if (!chunk_setup(obj, obj.chunk_index_)) {
-    throw std::runtime_error(
-        "QubitVectorThrust: can not allocate chunk for copy");
-  }
+  chunk_setup(obj, obj.chunk_index_);
+
   set_num_qubits(obj.num_qubits());

   chunk_.CopyIn(obj.chunk_);
@@ -881,7 +879,7 @@ uint_t QubitVectorThrust<data_t>::chunk_setup(int chunk_bits, int num_qubits,
       chunk_manager_->num_qubits() == num_qubits) {
     bool mapped = chunk_manager_->MapChunk(chunk_, 0);
     chunk_.set_chunk_index(chunk_index_);
-    return mapped;
+    return num_local_chunks;
   }
   chunk_manager_.reset();
 }
@@ -923,21 +921,22 @@ uint_t QubitVectorThrust<data_t>::chunk_setup(

   // set global chunk ID / shot ID
   chunk_index_ = chunk_index;
+  chunk_.set_chunk_index(chunk_index_);

+  if (chunk_.is_mapped())
+    chunk_manager_->UnmapChunk(chunk_);
+  if (buffer_chunk_.is_mapped())
+    chunk_manager_->UnmapBufferChunk(buffer_chunk_);
   send_chunk_.unmap();
   recv_chunk_.unmap();

-  chunk_.set_chunk_index(chunk_index_);
-  if (chunk_.is_mapped()) {
-    return 0;
-  }
-
   // mapping/setting chunk
   chunk_manager_ = base.chunk_manager_;
   bool mapped = chunk_manager_->MapChunk(chunk_, 0);

-  return chunk_manager_->num_chunks();
+  return 0;
 }

 template <typename data_t>