From 00e787ff4b6905062fd8e9d1bfc89eda5ece408d Mon Sep 17 00:00:00 2001
From: MarcMaussner <114133362+MarcMaussner@users.noreply.github.com>
Date: Tue, 22 Aug 2023 06:01:19 +0200
Subject: [PATCH 1/5] Implemented ecr for stabilizer simulator. (#1892)

* Implemented ecr for stabilizer simulator.

* Implemented ecr for stabilizer simulator.

* Adapted to coding style.

* fixed testcode for ecr in stabilizer.

---
 qiskit_aer/backends/backend_utils.py          |  1 +
 qiskit_aer/backends/qasm_simulator.py         |  1 +
 ...stabilizer_simulator-00110a1b39d35054.yaml |  6 ++
 .../stabilizer/stabilizer_state.hpp           | 45 ++++++++---
 .../backends/aer_simulator/test_cliffords.py  | 18 +++++
 test/terra/reference/ref_2q_clifford.py       | 79 +++++++++++++++++++
 6 files changed, 141 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/enhancement_ecr_for_stabilizer_simulator-00110a1b39d35054.yaml

diff --git a/qiskit_aer/backends/backend_utils.py b/qiskit_aer/backends/backend_utils.py
index 446406c796..717f0a4b59 100644
--- a/qiskit_aer/backends/backend_utils.py
+++ b/qiskit_aer/backends/backend_utils.py
@@ -210,6 +210,7 @@
             "swap",
             "delay",
             "pauli",
+            "ecr",
         ]
     ),
     "extended_stabilizer": sorted(
diff --git a/qiskit_aer/backends/qasm_simulator.py b/qiskit_aer/backends/qasm_simulator.py
index ade47cd01e..1901fa066f 100644
--- a/qiskit_aer/backends/qasm_simulator.py
+++ b/qiskit_aer/backends/qasm_simulator.py
@@ -785,6 +785,7 @@ def _method_basis_gates(self):
                     "swap",
                     "delay",
                     "pauli",
+                    "ecr",
                 ]
             )
         if method == "extended_stabilizer":
diff --git a/releasenotes/notes/enhancement_ecr_for_stabilizer_simulator-00110a1b39d35054.yaml b/releasenotes/notes/enhancement_ecr_for_stabilizer_simulator-00110a1b39d35054.yaml
new file mode 100644
index 0000000000..a2f6571c43
--- /dev/null
+++ b/releasenotes/notes/enhancement_ecr_for_stabilizer_simulator-00110a1b39d35054.yaml
@@ -0,0 +1,6 @@
+---
+features:
+  - |
+    Enable 2-qubit gate ECR for aer_stabilizer_simulator. Refer to
+    `#1883 <https://github.com/Qiskit/qiskit-aer/issues/1883>`__ for more
+    details.
diff --git a/src/simulators/stabilizer/stabilizer_state.hpp b/src/simulators/stabilizer/stabilizer_state.hpp
--- a/src/simulators/stabilizer/stabilizer_state.hpp
+++ b/src/simulators/stabilizer/stabilizer_state.hpp
@@ ... @@ State::gateset_({
     {"sx", Gates::sx},     // Sqrt X gate.
     {"sxdg", Gates::sxdg}, // Inverse Sqrt X gate.
// Two-qubit gates - {"CX", Gates::cx}, // Controlled-X gate (CNOT) - {"cx", Gates::cx}, // Controlled-X gate (CNOT), - {"cy", Gates::cy}, // Controlled-Y gate - {"cz", Gates::cz}, // Controlled-Z gate - {"swap", Gates::swap}, // SWAP gate - {"pauli", Gates::pauli} // Pauli gate + {"CX", Gates::cx}, // Controlled-X gate (CNOT) + {"cx", Gates::cx}, // Controlled-X gate (CNOT), + {"cy", Gates::cy}, // Controlled-Y gate + {"cz", Gates::cz}, // Controlled-Z gate + {"swap", Gates::swap}, // SWAP gate + {"pauli", Gates::pauli}, // Pauli gate + {"ecr", Gates::ecr} // ECR gate }); //============================================================================ @@ -342,6 +359,16 @@ void State::apply_gate(const Operations::Op &op) { case Gates::pauli: apply_pauli(op.qubits, op.string_params[0]); break; + case Gates::ecr: + BaseState::qreg_.append_h(op.qubits[1]); + BaseState::qreg_.append_s(op.qubits[0]); + BaseState::qreg_.append_z(op.qubits[1]); // sdg(1) + BaseState::qreg_.append_s(op.qubits[1]); // sdg(1) + BaseState::qreg_.append_h(op.qubits[1]); + BaseState::qreg_.append_cx(op.qubits[0], op.qubits[1]); + BaseState::qreg_.append_x(op.qubits[0]); + BaseState::qreg_.append_x(op.qubits[1]); + break; default: // We shouldn't reach here unless there is a bug in gateset throw std::invalid_argument( diff --git a/test/terra/backends/aer_simulator/test_cliffords.py b/test/terra/backends/aer_simulator/test_cliffords.py index 3c3c0af1b5..b6c430ae0e 100644 --- a/test/terra/backends/aer_simulator/test_cliffords.py +++ b/test/terra/backends/aer_simulator/test_cliffords.py @@ -28,6 +28,10 @@ "tensor_network", ] +SUPPORTED_ECR_METHODS = [ + "stabilizer", +] + @ddt class TestCliffords(SimulatorTestCase): @@ -242,6 +246,20 @@ def test_pauli_gate_deterministic(self, method, device): self.assertSuccess(result) self.compare_counts(result, circuits, targets, delta=0) + # --------------------------------------------------------------------- + # Test ecr gate + # --------------------------------------------------------------------- + @supported_methods(SUPPORTED_ECR_METHODS) + def test_ecr_gate_nondeterministic(self, method, device): + """Test ecr gate circuits""" + backend = self.backend(method=method, device=device, seed_simulator=self.SEED) + shots = 100 + circuits = ref_2q_clifford.ecr_gate_circuits_nondeterministic(final_measure=True) + targets = ref_2q_clifford.ecr_gate_counts_nondeterministic(shots) + result = backend.run(circuits, shots=shots).result() + self.assertSuccess(result) + self.compare_counts(result, circuits, targets, delta=0.05 * shots) + # --------------------------------------------------------------------- # Test identity gate # --------------------------------------------------------------------- diff --git a/test/terra/reference/ref_2q_clifford.py b/test/terra/reference/ref_2q_clifford.py index f6903dba7d..59cc2b0629 100644 --- a/test/terra/reference/ref_2q_clifford.py +++ b/test/terra/reference/ref_2q_clifford.py @@ -687,3 +687,82 @@ def swap_gate_unitary_nondeterministic(): / np.sqrt(2) ) return targets + + +# ========================================================================== +# ECR gate +# ========================================================================== + + +def ecr_gate_circuits_nondeterministic(final_measure=True): + """ECR-gate test circuits with nondeterministic counts.""" + circuits = [] + qr = QuantumRegister(2) + qr = QuantumRegister(2) + if final_measure: + cr = ClassicalRegister(2) + regs = (qr, cr) + else: + regs = (qr,) + + # ECR, |00> state + circuit = 
QuantumCircuit(*regs) + circuit.ecr(qr[0], qr[1]) + if final_measure: + circuit.barrier(qr) + circuit.measure(qr, cr) + circuits.append(circuit) + + # ECR, |01> state + circuit = QuantumCircuit(*regs) + circuit.x(qr[0]) + circuit.ecr(qr[0], qr[1]) + if final_measure: + circuit.barrier(qr) + circuit.measure(qr, cr) + circuits.append(circuit) + + # ECR, |10> state + circuit = QuantumCircuit(*regs) + circuit.x(qr[1]) + circuit.ecr(qr[0], qr[1]) + if final_measure: + circuit.barrier(qr) + circuit.measure(qr, cr) + circuits.append(circuit) + + # ECR, |11> state + circuit = QuantumCircuit(*regs) + circuit.x(qr[0]) + circuit.x(qr[1]) + circuit.ecr(qr[0], qr[1]) + if final_measure: + circuit.barrier(qr) + circuit.measure(qr, cr) + circuits.append(circuit) + return circuits + + +def ecr_gate_counts_nondeterministic(shots, hex_counts=True): + """ECR-gate circuits reference counts.""" + targets = [] + if hex_counts: + # ECR, |00> state + targets.append({"0x1": shots / 2, "0x3": shots / 2}) + # ECR, |01> state + targets.append({"0x0": shots / 2, "0x2": shots / 2}) + # ECR, |10> state + targets.append({"0x1": shots / 2, "0x3": shots / 2}) + # ECR, |11> state + targets.append({"0x0": shots / 2, "0x2": shots / 2}) + + else: + # ECR, |00> state + targets.append({"01": shots / 2, "11": shots / 2}) + # ECR, |01> state + targets.append({"00": shots / 2, "10": shots / 2}) + # ECR, |10> state + targets.append({"01": shots / 2, "11": shots / 2}) + # ECR, |11> state + targets.append({"00": shots / 2, "10": shots / 2}) + return targets From 9037f62a938cecf0e38939fca548b7a1aab59655 Mon Sep 17 00:00:00 2001 From: Luciano Bello Date: Tue, 22 Aug 2023 18:04:09 +0200 Subject: [PATCH 2/5] remove deploy documentation to /documentation/aer (#1891) Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- tools/deploy_documentation.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/deploy_documentation.sh b/tools/deploy_documentation.sh index a059bc7ac4..201e275c8f 100755 --- a/tools/deploy_documentation.sh +++ b/tools/deploy_documentation.sh @@ -12,7 +12,7 @@ # copyright notice, and modified files need to carry a notice indicating # that they have been altered from the originals. -# Script for pushing the documentation to qiskit.org. +# Script for pushing the documentation to qiskit.org/ecosystem. 
set -e curl https://downloads.rclone.org/rclone-current-linux-amd64.deb -o rclone.deb @@ -30,6 +30,3 @@ pwd openssl aes-256-cbc -K $encrypted_rclone_key -iv $encrypted_rclone_iv -in tools/rclone.conf.enc -out $RCLONE_CONFIG_PATH -d echo "Pushing built docs to website" rclone sync --progress ./docs/_build/html IBMCOS:qiskit-org-web-resources/ecosystem/aer - -# Push to qiskit.org/documentation -rclone sync --progress ./docs/_build/html IBMCOS:qiskit-org-web-resources/documentation/aer From 8998d9403be0c3d7f77191320381347d117a0511 Mon Sep 17 00:00:00 2001 From: Hiroshi Horii Date: Thu, 24 Aug 2023 14:45:58 +0900 Subject: [PATCH 3/5] correct bit ordering in measure of state_controller (#1898) Co-authored-by: Jun Doi --- ...orrect_measure_in_state_controller-a92692fd7083c476.yaml | 6 ++++++ src/controllers/state_controller.hpp | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/correct_measure_in_state_controller-a92692fd7083c476.yaml diff --git a/releasenotes/notes/correct_measure_in_state_controller-a92692fd7083c476.yaml b/releasenotes/notes/correct_measure_in_state_controller-a92692fd7083c476.yaml new file mode 100644 index 0000000000..30531dfa9f --- /dev/null +++ b/releasenotes/notes/correct_measure_in_state_controller-a92692fd7083c476.yaml @@ -0,0 +1,6 @@ +--- +fixes: + - | + ``measure`` in libaer.so wrongly read classical memory by assuming + opposite ordering of its indices. This fix corrects the assumed ordering. + This change affects only libaer.so and not python applications. diff --git a/src/controllers/state_controller.hpp b/src/controllers/state_controller.hpp index 52791d16f6..74b89afbbf 100644 --- a/src/controllers/state_controller.hpp +++ b/src/controllers/state_controller.hpp @@ -1313,8 +1313,9 @@ uint_t AerState::apply_measure(const reg_t &qubits) { uint_t bitstring = 0; uint_t bit = 1; + uint_t mem_size = state_->creg().memory_size(); for (const auto &qubit : qubits) { - if (state_->creg().creg_memory()[qubit] == '1') + if (state_->creg().creg_memory()[mem_size - qubit - 1] == '1') bitstring |= bit; bit <<= 1; } From d656553c57dbed1f9832243ce3a057976b7cfb86 Mon Sep 17 00:00:00 2001 From: Samuel F Antao Date: Thu, 24 Aug 2023 10:45:12 +0100 Subject: [PATCH 4/5] Change the the existing CUDA implementation to a generic GPU implementation. (#1903) * Add Eclipse IDE project files to .gitignore. * Change existing CUDA implementation to reflect a generic GPU. * Fix formatting. * Cancel .gitignore changes for Eclipse IDE. 
--------- Co-authored-by: Jun Doi --- CMakeLists.txt | 18 ++- src/controllers/aer_controller.hpp | 4 +- src/controllers/state_controller.hpp | 2 +- src/misc/gpu_static_properties.hpp | 24 ++++ src/misc/wrap_thrust.hpp | 2 +- src/simulators/batch_shots_executor.hpp | 2 +- src/simulators/circuit_executor.hpp | 4 +- .../density_matrix/densitymatrix_thrust.hpp | 2 +- src/simulators/parallel_state_executor.hpp | 2 +- src/simulators/statevector/chunk/chunk.hpp | 2 +- .../statevector/chunk/chunk_container.hpp | 14 +-- .../statevector/chunk/chunk_manager.hpp | 4 +- .../statevector/chunk/cuda_kernels.hpp | 111 ++++++++++-------- .../chunk/device_chunk_container.hpp | 68 ++++++----- .../statevector/chunk/thrust_kernels.hpp | 8 +- .../statevector/qubitvector_thrust.hpp | 16 +-- .../unitary/unitarymatrix_thrust.hpp | 2 +- 17 files changed, 166 insertions(+), 119 deletions(-) create mode 100644 src/misc/gpu_static_properties.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index da36e3fcca..ab1560d9df 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,8 +11,15 @@ cmake_minimum_required(VERSION 3.8 FATAL_ERROR) file(STRINGS "qiskit_aer/VERSION.txt" VERSION_NUM) +# Add CUDA to the project if needed. +set(EXTRA_LANGUAGES "") +if(AER_THRUST_BACKEND STREQUAL "CUDA") + list(APPEND EXTRA_LANGUAGES CUDA) +endif() + include(CheckLanguage) -project(qasm_simulator VERSION ${VERSION_NUM} LANGUAGES CXX C) +project(qasm_simulator VERSION ${VERSION_NUM} LANGUAGES CXX C ${EXTRA_LANGUAGES} ) + list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake) list(APPEND CMAKE_PREFIX_PATH ${CMAKE_SOURCE_DIR}/cmake) @@ -284,7 +291,7 @@ if(AER_THRUST_SUPPORTED) set(CMAKE_CUDA_RUNTIME_LIBRARY None) - set(CUDA_NVCC_FLAGS "${AER_CUDA_ARCH_FLAGS_EXPAND} -DAER_THRUST_CUDA -I${AER_SIMULATOR_CPP_SRC_DIR} -isystem ${AER_SIMULATOR_CPP_SRC_DIR}/third-party/headers -use_fast_math --expt-extended-lambda") + set(CUDA_NVCC_FLAGS "${AER_CUDA_ARCH_FLAGS_EXPAND} -DAER_THRUST_GPU -DAER_THRUST_CUDA -I${AER_SIMULATOR_CPP_SRC_DIR} -isystem ${AER_SIMULATOR_CPP_SRC_DIR}/third-party/headers -use_fast_math --expt-extended-lambda") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -I${PYTHON_SITE_PATH}/cuquantum/include") set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -I${PYTHON_SITE_PATH}/cutensor/include") set(THRUST_DEPENDANT_LIBS "${THRUST_DEPENDANT_LIBS} -Wl,--disable-new-dtags") @@ -311,7 +318,7 @@ if(AER_THRUST_SUPPORTED) string(STRIP ${CUDA_NVCC_FLAGS} CUDA_NVCC_FLAGS) string(STRIP ${THRUST_DEPENDANT_LIBS} THRUST_DEPENDANT_LIBS) else() - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${AER_CUDA_ARCH_FLAGS_EXPAND} -DAER_THRUST_CUDA -I${AER_SIMULATOR_CPP_SRC_DIR} -isystem ${AER_SIMULATOR_CPP_SRC_DIR}/third-party/headers -use_fast_math --expt-extended-lambda") + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} ${AER_CUDA_ARCH_FLAGS_EXPAND} -DAER_THRUST_GPU -DAER_THRUST_CUDA -I${AER_SIMULATOR_CPP_SRC_DIR} -isystem ${AER_SIMULATOR_CPP_SRC_DIR}/third-party/headers -use_fast_math --expt-extended-lambda") set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA) set(THRUST_DEPENDANT_LIBS "-L${CUDA_TOOLKIT_ROOT_DIR}/lib64") @@ -455,7 +462,6 @@ else() # Standalone build LINK_FLAGS ${AER_LINKER_FLAGS} RUNTIME_OUTPUT_DIRECTORY_DEBUG Debug RUNTIME_OUTPUT_DIRECTORY_RELEASE Release) - enable_language(CUDA) endfunction() function(build_cpu target src_file is_exec) @@ -519,7 +525,9 @@ else() # Standalone build if(BUILD_TESTS AND NOT AER_MPI) add_executable(test_libaer "${PROJECT_SOURCE_DIR}/test/runtime/runtime_sample.c") 
target_include_directories(test_libaer PUBLIC "${PROJECT_SOURCE_DIR}/contrib/runtime/") - set_target_properties(test_libaer PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bin) + # AER_LINKER_FLAGS carry eventual OpenMP linking flags. + set_target_properties(test_libaer PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bin + LINK_FLAGS ${AER_LINKER_FLAGS}) target_link_libraries(test_libaer PRIVATE ${AER_LIBRARIES}) target_link_libraries(test_libaer PRIVATE aer) add_test(NAME aer_runtime_test COMMAND bin/test_libaer) diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp index 7ea6c35553..c455f5fc20 100755 --- a/src/controllers/aer_controller.hpp +++ b/src/controllers/aer_controller.hpp @@ -285,7 +285,7 @@ void Controller::set_config(const Config &config) { sim_device_ = Device::ThrustCPU; #endif } else if (sim_device_name_ == "GPU") { -#ifndef AER_THRUST_CUDA +#ifndef AER_THRUST_GPU throw std::runtime_error( "Simulation device \"GPU\" is not supported on this system"); #else @@ -421,7 +421,7 @@ size_t Controller::get_system_memory_mb() { size_t Controller::get_gpu_memory_mb() { size_t total_physical_memory = 0; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU int iDev, nDev, j; if (cudaGetDeviceCount(&nDev) != cudaSuccess) { cudaGetLastError(); diff --git a/src/controllers/state_controller.hpp b/src/controllers/state_controller.hpp index 74b89afbbf..8b3cdf30cd 100644 --- a/src/controllers/state_controller.hpp +++ b/src/controllers/state_controller.hpp @@ -439,7 +439,7 @@ class AerState { }; bool AerState::is_gpu(bool raise_error) const { -#ifndef AER_THRUST_CUDA +#ifndef AER_THRUST_GPU if (raise_error) throw std::runtime_error( "Simulation device \"GPU\" is not supported on this system"); diff --git a/src/misc/gpu_static_properties.hpp b/src/misc/gpu_static_properties.hpp new file mode 100644 index 0000000000..5730797cf2 --- /dev/null +++ b/src/misc/gpu_static_properties.hpp @@ -0,0 +1,24 @@ +/** + * This code is part of Qiskit. + * + * (C) Copyright AMD 2023. + * + * This code is licensed under the Apache License, Version 2.0. You may + * obtain a copy of this license in the LICENSE.txt file in the root directory + * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. + * + * Any modifications or derivative works of this code must retain this + * copyright notice, and modified files need to carry a notice indicating + * that they have been altered from the originals. + */ +#ifndef __GPU_STATIC_PRIORITIES_H__ +#define __GPU_STATIC_PRIORITIES_H__ + +#ifdef AER_THRUST_CUDA +// In CUDA warpSize could not be a compile-time constant so we use 32 directly. +#define _WS 32 +// Maximum number of threads in a block. 
+#define _MAX_THD 1024 +#endif // AER_THRUST_CUDA + +#endif //__GPU_STATIC_PRIORITIES_H__ diff --git a/src/misc/wrap_thrust.hpp b/src/misc/wrap_thrust.hpp index 1737c918fa..b02122c73c 100644 --- a/src/misc/wrap_thrust.hpp +++ b/src/misc/wrap_thrust.hpp @@ -38,7 +38,7 @@ DISABLE_WARNING_PUSH #include #include -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU #include #endif #include diff --git a/src/simulators/batch_shots_executor.hpp b/src/simulators/batch_shots_executor.hpp index e0e7b544a8..eef2f85751 100644 --- a/src/simulators/batch_shots_executor.hpp +++ b/src/simulators/batch_shots_executor.hpp @@ -268,7 +268,7 @@ void BatchShotsExecutor::run_circuit_shots( } #endif -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (Base::sim_device_ == Device::GPU) { int nDev; if (cudaGetDeviceCount(&nDev) != cudaSuccess) { diff --git a/src/simulators/circuit_executor.hpp b/src/simulators/circuit_executor.hpp index aaa0e7b217..425bb7d097 100644 --- a/src/simulators/circuit_executor.hpp +++ b/src/simulators/circuit_executor.hpp @@ -294,7 +294,7 @@ void Executor::set_config(const Config &config) { } // set target GPUs -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU int nDev = 0; if (cudaGetDeviceCount(&nDev) != cudaSuccess) { cudaGetLastError(); @@ -332,7 +332,7 @@ size_t Executor::get_system_memory_mb() { template size_t Executor::get_gpu_memory_mb() { size_t total_physical_memory = 0; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU for (int_t iDev = 0; iDev < target_gpus_.size(); iDev++) { size_t freeMem, totalMem; cudaSetDevice(target_gpus_[iDev]); diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp index 73f8dca3cf..7cbce3cd45 100755 --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -51,7 +51,7 @@ class DensityMatrixThrust : public UnitaryMatrixThrust { //----------------------------------------------------------------------- // Return the string name of the class -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU static std::string name() { return "density_matrix_gpu"; } #else static std::string name() { return "density_matrix_thrust"; } diff --git a/src/simulators/parallel_state_executor.hpp b/src/simulators/parallel_state_executor.hpp index 5e5074449c..b40ac2556f 100644 --- a/src/simulators/parallel_state_executor.hpp +++ b/src/simulators/parallel_state_executor.hpp @@ -796,7 +796,7 @@ void ParallelStateExecutor::apply_ops_chunks(InputIterator first, } if (Base::sim_device_ == Device::GPU) { -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU int nDev; if (cudaGetDeviceCount(&nDev) != cudaSuccess) { cudaGetLastError(); diff --git a/src/simulators/statevector/chunk/chunk.hpp b/src/simulators/statevector/chunk/chunk.hpp index 37067c172a..7d5c66415b 100644 --- a/src/simulators/statevector/chunk/chunk.hpp +++ b/src/simulators/statevector/chunk/chunk.hpp @@ -230,7 +230,7 @@ class Chunk { return chunk_container_.lock()->trace(chunk_pos_, row, count); } -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaStream_t stream(void) { return std::static_pointer_cast>( chunk_container_.lock()) diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index b674e6217c..6afa8e09f4 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -49,7 +49,7 @@ DISABLE_WARNING_POP #define QV_PROBABILITY_BUFFER_SIZE 4 #define QV_NUM_INTERNAL_REGS 4 -#ifdef 
AER_THRUST_CUDA +#ifdef AER_THRUST_GPU #define AERDeviceVector thrust::device_vector #else #define AERDeviceVector thrust::host_vector @@ -58,7 +58,7 @@ DISABLE_WARNING_POP #include "framework/utils.hpp" -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU #include "simulators/statevector/chunk/cuda_kernels.hpp" #endif @@ -144,7 +144,7 @@ class ChunkContainer virtual void set_device(void) const {} -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU virtual cudaStream_t stream(uint_t iChunk) const { return nullptr; } #endif @@ -395,7 +395,7 @@ void ChunkContainer::Execute(Function func, uint_t iChunk, conditional_bit_ = -1; // reset conditional } -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaStream_t strm = stream(iChunk); if (strm) { uint_t nt, nb; @@ -457,7 +457,7 @@ template void ChunkContainer::ExecuteSum(double *pSum, Function func, uint_t iChunk, uint_t count) const { -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU uint_t size = count * func.size(chunk_bits_); set_device(); @@ -637,7 +637,7 @@ template void ChunkContainer::ExecuteSum2(double *pSum, Function func, uint_t iChunk, uint_t count) const { -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU uint_t size = count * func.size(chunk_bits_); set_device(); @@ -816,7 +816,7 @@ void ChunkContainer::apply_matrix( } else { auto qubits_sorted = qubits; std::sort(qubits_sorted.begin(), qubits_sorted.end()); -#ifndef AER_THRUST_CUDA +#ifndef AER_THRUST_GPU if (N == 3) { StoreMatrix(mat, iChunk); Execute(MatrixMult8x8(qubits, qubits_sorted), iChunk, gid, count); diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index 1efc57db52..6bfca5397e 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -122,7 +122,7 @@ ChunkManager::ChunkManager() { num_places_ = 1; #else -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (cudaGetDeviceCount(&num_devices_) == cudaSuccess) { num_places_ = num_devices_; } else { @@ -248,7 +248,7 @@ uint_t ChunkManager::Allocate(int chunk_bits, int nqubits, num_buffers = AER_MAX_BUFFERS; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU num_places_ = num_devices_; if (num_threads_per_group_ > 1) num_places_ *= num_threads_per_group_; diff --git a/src/simulators/statevector/chunk/cuda_kernels.hpp b/src/simulators/statevector/chunk/cuda_kernels.hpp index dc7936fcf1..b74bb514ce 100644 --- a/src/simulators/statevector/chunk/cuda_kernels.hpp +++ b/src/simulators/statevector/chunk/cuda_kernels.hpp @@ -15,6 +15,8 @@ #ifndef _qv_cuda_kernels_hpp_ #define _qv_cuda_kernels_hpp_ +#include "misc/gpu_static_properties.hpp" + namespace AER { namespace QV { namespace Chunk { @@ -32,7 +34,8 @@ __global__ void dev_apply_function(kernel_t func, uint_t count) { template __global__ void dev_apply_function_with_cache(kernel_t func, uint_t count) { - __shared__ thrust::complex cache[1024]; + // One cache entry per thread. 
+ __shared__ thrust::complex cache[_MAX_THD]; uint_t i, idx; i = blockIdx.x * blockDim.x + threadIdx.x; @@ -53,7 +56,8 @@ __global__ void dev_apply_function_with_cache(kernel_t func, uint_t count) { template __global__ void dev_apply_function_sum(double *pReduceBuffer, kernel_t func, uint_t buf_size, uint_t count) { - __shared__ double cache[32]; + // One cache entry per warp/wavefront + __shared__ double cache[_MAX_THD / _WS]; double sum; uint_t i, j, iChunk, nw; @@ -68,25 +72,25 @@ __global__ void dev_apply_function_sum(double *pReduceBuffer, kernel_t func, sum = func(i); // reduce in warp - nw = min(blockDim.x, warpSize); + nw = min(blockDim.x, _WS); for (j = 1; j < nw; j *= 2) { sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } - if (blockDim.x > warpSize) { + if (blockDim.x > _WS) { // reduce in thread block - if ((threadIdx.x & 31) == 0) { - cache[(threadIdx.x >> 5)] = sum; + if ((threadIdx.x & (_WS - 1)) == 0) { + cache[(threadIdx.x / _WS)] = sum; } __syncthreads(); - if (threadIdx.x < 32) { - if (threadIdx.x < ((blockDim.x + warpSize - 1) >> 5)) + if (threadIdx.x < _WS) { + if (threadIdx.x < ((blockDim.x + _WS - 1) / _WS)) sum = cache[threadIdx.x]; else sum = 0.0; // reduce in warp - nw = warpSize; + nw = _WS; for (j = 1; j < nw; j *= 2) { sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } @@ -101,7 +105,8 @@ template __global__ void dev_apply_function_sum_with_cache(double *pReduceBuffer, kernel_t func, uint_t buf_size, uint_t count) { - __shared__ thrust::complex cache[1024]; + // One cache entry per thread. + __shared__ thrust::complex cache[_MAX_THD]; uint_t i, idx; uint_t j, iChunk, nw; double sum; @@ -122,26 +127,26 @@ dev_apply_function_sum_with_cache(double *pReduceBuffer, kernel_t func, sum = func.run_with_cache_sum(threadIdx.x, idx, cache); // reduce in warp - nw = min(blockDim.x, warpSize); + nw = min(blockDim.x, _WS); for (j = 1; j < nw; j *= 2) { sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } - if (blockDim.x > warpSize) { + if (blockDim.x > _WS) { // reduce in thread block __syncthreads(); - if ((threadIdx.x & 31) == 0) { - ((double *)cache)[(threadIdx.x >> 5)] = sum; + if ((threadIdx.x & (_WS - 1)) == 0) { + ((double *)cache)[(threadIdx.x / _WS)] = sum; } __syncthreads(); - if (threadIdx.x < 32) { - if (threadIdx.x < ((blockDim.x + warpSize - 1) >> 5)) + if (threadIdx.x < _WS) { + if (threadIdx.x < ((blockDim.x + _WS - 1) / _WS)) sum = ((double *)cache)[threadIdx.x]; else sum = 0.0; // reduce in warp - nw = warpSize; + nw = _WS; for (j = 1; j < nw; j *= 2) { sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } @@ -154,7 +159,8 @@ dev_apply_function_sum_with_cache(double *pReduceBuffer, kernel_t func, __global__ void dev_reduce_sum(double *pReduceBuffer, uint_t n, uint_t buf_size) { - __shared__ double cache[32]; + // One cache entry per warp/wavefront + __shared__ double cache[_MAX_THD / _WS]; double sum; uint_t i, j, iChunk, nw; @@ -167,25 +173,25 @@ __global__ void dev_reduce_sum(double *pReduceBuffer, uint_t n, sum = 0.0; // reduce in warp - nw = min(blockDim.x, warpSize); + nw = min(blockDim.x, _WS); for (j = 1; j < nw; j *= 2) { sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } - if (blockDim.x > warpSize) { + if (blockDim.x > _WS) { // reduce in thread block - if ((threadIdx.x & 31) == 0) { - cache[(threadIdx.x >> 5)] = sum; + if ((threadIdx.x & (_WS - 1)) == 0) { + cache[(threadIdx.x / _WS)] = sum; } __syncthreads(); - if (threadIdx.x < 32) { - if (threadIdx.x < ((blockDim.x + warpSize - 1) >> 5)) + if (threadIdx.x < _WS) { + if (threadIdx.x < ((blockDim.x + 
_WS - 1) / _WS)) sum = cache[threadIdx.x]; else sum = 0.0; // reduce in warp - nw = warpSize; + nw = _WS; for (j = 1; j < nw; j *= 2) { sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } @@ -200,7 +206,8 @@ template __global__ void dev_apply_function_sum_complex(thrust::complex *pReduceBuffer, kernel_t func, uint_t buf_size, uint_t count) { - __shared__ thrust::complex cache[32]; + // One cache entry per warp/wavefront + __shared__ thrust::complex cache[_MAX_THD / _WS]; thrust::complex sum; double tr, ti; uint_t i, j, iChunk, nw; @@ -216,27 +223,27 @@ dev_apply_function_sum_complex(thrust::complex *pReduceBuffer, sum = func(i); // reduce in warp - nw = min(blockDim.x, warpSize); + nw = min(blockDim.x, _WS); for (j = 1; j < nw; j *= 2) { tr = __shfl_xor_sync(0xffffffff, sum.real(), j, 32); ti = __shfl_xor_sync(0xffffffff, sum.imag(), j, 32); sum += thrust::complex(tr, ti); } - if (blockDim.x > warpSize) { + if (blockDim.x > _WS) { // reduce in thread block - if ((threadIdx.x & 31) == 0) { - cache[(threadIdx.x >> 5)] = sum; + if ((threadIdx.x & (_WS - 1)) == 0) { + cache[(threadIdx.x / _WS)] = sum; } __syncthreads(); - if (threadIdx.x < warpSize) { - if (threadIdx.x < ((blockDim.x + warpSize - 1) >> 5)) + if (threadIdx.x < _WS) { + if (threadIdx.x < ((blockDim.x + _WS - 1) / _WS)) sum = cache[threadIdx.x]; else sum = 0.0; // reduce in warp - nw = warpSize; + nw = _WS; for (j = 1; j < nw; j *= 2) { tr = __shfl_xor_sync(0xffffffff, sum.real(), j, 32); ti = __shfl_xor_sync(0xffffffff, sum.imag(), j, 32); @@ -251,7 +258,8 @@ dev_apply_function_sum_complex(thrust::complex *pReduceBuffer, __global__ void dev_reduce_sum_complex(thrust::complex *pReduceBuffer, uint_t n, uint_t buf_size) { - __shared__ thrust::complex cache[32]; + // One cache entry per warp/wavefront + __shared__ thrust::complex cache[_MAX_THD / _WS]; thrust::complex sum; double tr, ti; uint_t i, j, iChunk, nw; @@ -265,27 +273,27 @@ __global__ void dev_reduce_sum_complex(thrust::complex *pReduceBuffer, sum = 0.0; // reduce in warp - nw = min(blockDim.x, warpSize); + nw = min(blockDim.x, _WS); for (j = 1; j < nw; j *= 2) { tr = __shfl_xor_sync(0xffffffff, sum.real(), j, 32); ti = __shfl_xor_sync(0xffffffff, sum.imag(), j, 32); sum += thrust::complex(tr, ti); } - if (blockDim.x > warpSize) { + if (blockDim.x > _WS) { // reduce in thread block - if ((threadIdx.x & 31) == 0) { - cache[(threadIdx.x >> 5)] = sum; + if ((threadIdx.x & (_WS - 1)) == 0) { + cache[(threadIdx.x / _WS)] = sum; } __syncthreads(); - if (threadIdx.x < 32) { - if (threadIdx.x < ((blockDim.x + warpSize - 1) >> 5)) + if (threadIdx.x < _WS) { + if (threadIdx.x < ((blockDim.x + _WS - 1) / _WS)) sum = cache[threadIdx.x]; else sum = 0.0; // reduce in warp - nw = warpSize; + nw = _WS; for (j = 1; j < nw; j *= 2) { tr = __shfl_xor_sync(0xffffffff, sum.real(), j, 32); ti = __shfl_xor_sync(0xffffffff, sum.imag(), j, 32); @@ -300,7 +308,8 @@ __global__ void dev_reduce_sum_complex(thrust::complex *pReduceBuffer, __global__ void dev_reduce_sum_uint(uint_t *pReduceBuffer, uint_t n, uint_t buf_size) { - __shared__ uint_t cache[32]; + // One cache entry per warp/wavefront + __shared__ uint_t cache[_MAX_THD / _WS]; uint_t sum; uint_t i, j, iChunk, nw; @@ -313,27 +322,27 @@ __global__ void dev_reduce_sum_uint(uint_t *pReduceBuffer, uint_t n, sum = 0; // reduce in warp - nw = min(blockDim.x, warpSize); + nw = min(blockDim.x, _WS); for (j = 1; j < nw; j *= 2) { - sum += __shfl_xor_sync(0xffffffff, sum, j, warpSize); + sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } - if (blockDim.x > 
warpSize) { + if (blockDim.x > _WS) { // reduce in thread block - if ((threadIdx.x & 31) == 0) { - cache[(threadIdx.x >> 5)] = sum; + if ((threadIdx.x & (_WS - 1)) == 0) { + cache[(threadIdx.x / _WS)] = sum; } __syncthreads(); - if (threadIdx.x < warpSize) { - if (threadIdx.x < ((blockDim.x + warpSize - 1) >> 5)) + if (threadIdx.x < _WS) { + if (threadIdx.x < ((blockDim.x + _WS - 1) / _WS)) sum = cache[threadIdx.x]; else sum = 0; // reduce in warp - nw = warpSize; + nw = _WS; for (j = 1; j < nw; j *= 2) { - sum += __shfl_xor_sync(0xffffffff, sum, j, warpSize); + sum += __shfl_xor_sync(0xffffffff, sum, j, 32); } } } diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index bfd75cb92b..6098fb613b 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -18,6 +18,12 @@ #include "simulators/statevector/chunk/chunk_container.hpp" #include "simulators/statevector/chunk/thrust_kernels.hpp" +#include "misc/gpu_static_properties.hpp" + +#ifdef AER_THRUST_CUDA +namespace thrust_gpu = thrust::cuda; +#endif + namespace AER { namespace QV { namespace Chunk { @@ -64,7 +70,7 @@ class DeviceChunkContainer : public ChunkContainer { reg_t num_blocked_matrix_; reg_t num_blocked_qubits_; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU std::vector stream_; // asynchronous execution #endif @@ -117,12 +123,12 @@ class DeviceChunkContainer : public ChunkContainer { void calculate_matrix_buffer_size(int bits); void set_device(void) const { -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaSetDevice(device_id_); #endif } -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaStream_t stream(uint_t iChunk) const { if (iChunk >= this->num_chunks_) return stream_[(num_matrices_ + iChunk - this->num_chunks_)]; @@ -213,7 +219,7 @@ class DeviceChunkContainer : public ChunkContainer { ibit = qubit & 63; if (iChunk == 0 && creg_host_update_) { creg_host_update_ = false; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_host_.data()), thrust::raw_pointer_cast(cregs_.data()), sizeof(uint_t) * num_matrices_ * n64, @@ -237,7 +243,7 @@ class DeviceChunkContainer : public ChunkContainer { ibit = qubit & 63; if (iChunk == 0 && creg_host_update_) { creg_host_update_ = false; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_host_.data()), thrust::raw_pointer_cast(cregs_.data()), sizeof(uint_t) * num_matrices_ * n64, @@ -260,7 +266,7 @@ class DeviceChunkContainer : public ChunkContainer { n64 = (this->num_creg_bits_ + 63) >> 6; creg_dev_update_ = false; creg_host_update_ = false; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaMemcpyAsync(thrust::raw_pointer_cast(cregs_.data()), thrust::raw_pointer_cast(cregs_host_.data()), sizeof(uint_t) * num_matrices_ * n64, @@ -280,7 +286,7 @@ class DeviceChunkContainer : public ChunkContainer { void request_creg_update(void) { creg_host_update_ = true; } void synchronize(uint_t iChunk) { -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU set_device(); cudaStreamSynchronize(stream(iChunk)); #endif @@ -320,7 +326,7 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, device_id_ = idev; set_device(); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU int ip, nd; cudaGetDeviceCount(&nd); peer_access_.resize(nd); @@ -363,7 +369,7 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, reduce_buffer_size_ = 2; -#ifdef 
AER_THRUST_CUDA +#ifdef AER_THRUST_GPU size_t param_size = sizeof(thrust::complex) * matrix_buffer_size_ + sizeof(uint_t) * params_buffer_size_; @@ -409,7 +415,7 @@ uint_t DeviceChunkContainer::Allocate(int idev, int chunk_bits, uint_t size = num_matrices_ + this->num_buffers_; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU stream_.resize(size); for (int i = 0; i < size; i++) cudaStreamCreateWithFlags(&stream_[i], cudaStreamNonBlocking); @@ -485,7 +491,7 @@ void DeviceChunkContainer::Deallocate(void) { num_blocked_qubits_.clear(); blocked_qubits_holder_.clear(); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU for (int i = 0; i < stream_.size(); i++) cudaStreamDestroy(stream_[i]); stream_.clear(); @@ -542,7 +548,7 @@ void DeviceChunkContainer::StoreMatrix( const std::vector> &mat, uint_t iChunk) const { set_device(); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaMemcpyAsync(matrix_pointer(iChunk), &mat[0], mat.size() * sizeof(thrust::complex), cudaMemcpyHostToDevice, stream(iChunk)); @@ -574,7 +580,7 @@ void DeviceChunkContainer::StoreMatrix(const std::complex *mat, uint_t size) const { set_device(); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaMemcpyAsync(matrix_pointer(iChunk), mat, size * sizeof(thrust::complex), cudaMemcpyHostToDevice, stream(iChunk)); @@ -606,7 +612,7 @@ void DeviceChunkContainer::StoreUintParams( const std::vector &prm, uint_t iChunk) const { set_device(); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaMemcpyAsync(param_pointer(iChunk), &prm[0], prm.size() * sizeof(uint_t), cudaMemcpyHostToDevice, stream(iChunk)); @@ -635,7 +641,7 @@ void DeviceChunkContainer::StoreUintParams( template void DeviceChunkContainer::CopyIn(Chunk &src, uint_t iChunk) { uint_t size = 1ull << this->chunk_bits_; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (src.device() >= 0) { if (peer_access(src.device())) { cudaMemcpyAsync(chunk_pointer(iChunk), src.pointer(), @@ -667,7 +673,7 @@ template void DeviceChunkContainer::CopyOut(Chunk &dest, uint_t iChunk) { uint_t size = 1ull << this->chunk_bits_; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (dest.device() >= 0) { if (peer_access(dest.device())) { cudaMemcpyAsync(dest.pointer(), chunk_pointer(iChunk), @@ -735,7 +741,7 @@ void DeviceChunkContainer::Swap(Chunk &src, uint_t iChunk, size = 1ull << this->chunk_bits_; set_device(); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (src.device() >= 0) { if (peer_access(src.device())) { this->Execute(BufferSwap_func(chunk_pointer(iChunk) + dest_offset, @@ -785,8 +791,8 @@ void DeviceChunkContainer::Swap(Chunk &src, uint_t iChunk, template void DeviceChunkContainer::Zero(uint_t iChunk, uint_t count) { set_device(); -#ifdef AER_THRUST_CUDA - thrust::fill_n(thrust::cuda::par.on(stream(iChunk)), +#ifdef AER_THRUST_GPU + thrust::fill_n(thrust_gpu::par.on(stream(iChunk)), data_.begin() + (iChunk << this->chunk_bits_), count, 0.0); #else if (this->omp_threads_ > 1) @@ -810,15 +816,15 @@ reg_t DeviceChunkContainer::sample_measure( strided_range *> iter( chunk_pointer(iChunk), chunk_pointer(iChunk + count), stride); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (dot) - thrust::transform_inclusive_scan(thrust::cuda::par.on(stream(iChunk)), + thrust::transform_inclusive_scan(thrust_gpu::par.on(stream(iChunk)), iter.begin(), iter.end(), iter.begin(), complex_dot_scan(), thrust::plus>()); else - thrust::inclusive_scan(thrust::cuda::par.on(stream(iChunk)), iter.begin(), + thrust::inclusive_scan(thrust_gpu::par.on(stream(iChunk)), iter.begin(), iter.end(), iter.begin(), thrust::plus>()); 
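Aside on the two sample_measure hunks above and below: the change here is purely the execution-policy rename (thrust::cuda::par becomes thrust_gpu::par, an alias that the patch currently points at thrust::cuda); the sampling algorithm itself is untouched. It is standard inverse-transform sampling: an inclusive scan over the squared amplitudes builds a monotonic cumulative distribution in place, and thrust::lower_bound then binary-searches one basis-state index per random draw. A minimal NumPy model of that scheme, for orientation only (the function and variable names here are illustrative, not Aer or Thrust APIs):

    import numpy as np

    def sample_measure_model(amplitudes, draws):
        # complex_dot_scan step: probability of each basis state
        probs = np.abs(amplitudes) ** 2
        # thrust::inclusive_scan step: running sum yields the CDF
        cdf = np.cumsum(probs)
        # thrust::lower_bound step: first index i with cdf[i] >= draw
        return np.searchsorted(cdf, draws, side="left")

    state = np.array([1.0, 1.0j, 0.0, 1.0]) / np.sqrt(3)  # 2-qubit example
    draws = np.random.default_rng(1234).random(5)          # uniform in [0, 1)
    print(sample_measure_model(state, draws))              # never yields 2 (p = 0)

Because the scanned values are already sorted, no separate sort is needed before the per-shot binary searches, which is why the kernel can reuse iter.begin() as both the input and the output of the scan.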
@@ -847,7 +853,7 @@ reg_t DeviceChunkContainer::sample_measure( cudaMemcpyAsync(pRnd, &rnds[i], nshots * sizeof(double), cudaMemcpyHostToDevice, stream(iChunk)); - thrust::lower_bound(thrust::cuda::par.on(stream(iChunk)), iter.begin(), + thrust::lower_bound(thrust_gpu::par.on(stream(iChunk)), iter.begin(), iter.end(), rnd_dev_ptr, rnd_dev_ptr + nshots, params_.begin() + (iBuf * params_buffer_size_), complex_less()); @@ -915,7 +921,7 @@ void DeviceChunkContainer::set_blocked_qubits(uint_t iChunk, for (i = 0; i < qubits.size(); i++) { blocked_qubits_holder_[iBlock * QV_MAX_REGISTERS + i] = qubits_sorted[i]; } -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU set_device(); cudaMemcpyAsync(param_pointer(iChunk), (uint_t *)&qubits_sorted[0], qubits.size() * sizeof(uint_t), cudaMemcpyHostToDevice, @@ -992,7 +998,7 @@ void DeviceChunkContainer::queue_blocked_gate( apply_blocked_gates(iChunk); } -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU BlockedGateParams params; params.mask_ = mask; @@ -1083,7 +1089,7 @@ void DeviceChunkContainer::queue_blocked_gate( #endif } -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU template __global__ void @@ -1100,7 +1106,7 @@ dev_apply_register_blocked_gates(thrust::complex *data, int num_gates, thrust::complex *matrix_load; i = blockIdx.x * blockDim.x + threadIdx.x; - laneID = i & 31; + laneID = i & (_WS - 1); // index for this thread idx = 0; @@ -1222,8 +1228,8 @@ dev_apply_shared_memory_blocked_gates(thrust::complex *data, // warp shuffle to get pair amplitude qr = q.real(); qi = q.imag(); - qr = __shfl_sync(0xffffffff, qr, iPair & 31, 32); - qi = __shfl_sync(0xffffffff, qi, iPair & 31, 32); + qr = __shfl_sync(0xffffffff, qr, iPair & (_WS - 1), 32); + qi = __shfl_sync(0xffffffff, qi, iPair & (_WS - 1), 32); qp = thrust::complex(qr, qi); } else { __syncthreads(); @@ -1298,7 +1304,7 @@ void DeviceChunkContainer::apply_blocked_gates(uint_t iChunk) { if (num_blocked_gates_[iBlock] == 0) return; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU uint_t size; uint_t *pQubits; @@ -1351,7 +1357,7 @@ void DeviceChunkContainer::apply_blocked_gates(uint_t iChunk) { template void DeviceChunkContainer::copy_to_probability_buffer( std::vector &buf, int pos) { -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU set_device(); cudaMemcpyAsync(probability_buffer(0) + pos * this->num_chunks_, &buf[0], buf.size() * sizeof(double), cudaMemcpyHostToDevice, diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp index f8bec5f665..2c27acae13 100644 --- a/src/simulators/statevector/chunk/thrust_kernels.hpp +++ b/src/simulators/statevector/chunk/thrust_kernels.hpp @@ -38,7 +38,7 @@ DISABLE_WARNING_POP #include "framework/utils.hpp" -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU #include "simulators/statevector/chunk/cuda_kernels.hpp" #endif @@ -60,7 +60,7 @@ class GateFuncBase { uint_t *cregs_; uint_t num_creg_bits_; int_t conditional_bit_; -#ifndef AER_THRUST_CUDA +#ifndef AER_THRUST_GPU uint_t index_offset_; #endif public: @@ -70,7 +70,7 @@ class GateFuncBase { cregs_ = NULL; num_creg_bits_ = 0; conditional_bit_ = -1; -#ifndef AER_THRUST_CUDA +#ifndef AER_THRUST_GPU index_offset_ = 0; #endif } @@ -86,7 +86,7 @@ class GateFuncBase { } void set_conditional(int_t bit) { conditional_bit_ = bit; } -#ifndef AER_THRUST_CUDA +#ifndef AER_THRUST_GPU void set_index_offset(uint_t i) { index_offset_ = i; } #endif diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index 
2be6721de7..57f09d9bee 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -93,7 +93,7 @@ class QubitVectorThrust { //----------------------------------------------------------------------- // Return the string name of the QubitVector class -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU static std::string name() { return "statevector_gpu"; } #else static std::string name() { return "statevector_thrust"; } @@ -326,7 +326,7 @@ class QubitVectorThrust { // for batched optimization //----------------------------------------------------------------------- virtual bool batched_optimization_supported(void) { -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (enable_batch_) return true; else @@ -1022,7 +1022,7 @@ std::complex QubitVectorThrust::inner_product() const { vec0 = (data_t *)chunk_.pointer(); vec1 = (data_t *)thrust::raw_pointer_cast(checkpoint_.data()); -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU cudaStream_t strm = chunk_.stream(); if (strm) dot = thrust::inner_product(thrust::device, vec0, vec0 + data_size_ * 2, @@ -1386,7 +1386,7 @@ template void QubitVectorThrust::apply_function_sum(double *pSum, Function func, bool async) const { uint_t count = 1; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (!cuStateVec_enable_ && func.batch_enable() && ((multi_chunk_distribution_ && chunk_.device() >= 0 && num_qubits_ == num_qubits()) || @@ -1416,7 +1416,7 @@ template void QubitVectorThrust::apply_function_sum2(double *pSum, Function func, bool async) const { uint_t count = 1; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (!cuStateVec_enable_ && func.batch_enable() && ((multi_chunk_distribution_ && chunk_.device() >= 0 && num_qubits_ == num_qubits()) || @@ -1917,7 +1917,7 @@ double QubitVectorThrust::norm() const { double ret; uint_t count = 1; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (enable_batch_ && ((multi_chunk_distribution_ && chunk_.device() >= 0) || !multi_chunk_distribution_)) { if (chunk_.pos() != 0) @@ -1939,7 +1939,7 @@ template double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t &mat) const { uint_t count = 1; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if (!cuStateVec_enable_ && ((multi_chunk_distribution_ && chunk_.device() >= 0 && num_qubits_ == num_qubits()) || @@ -2566,7 +2566,7 @@ template reg_t QubitVectorThrust::sample_measure( const std::vector &rnds) const { uint_t count = 1; -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU if ((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) { if (chunk_.pos() != 0) return reg_t(); // first chunk execute all in batch diff --git a/src/simulators/unitary/unitarymatrix_thrust.hpp b/src/simulators/unitary/unitarymatrix_thrust.hpp index f11e107425..8687f8e2c7 100755 --- a/src/simulators/unitary/unitarymatrix_thrust.hpp +++ b/src/simulators/unitary/unitarymatrix_thrust.hpp @@ -50,7 +50,7 @@ class UnitaryMatrixThrust : public QubitVectorThrust { //----------------------------------------------------------------------- // Return the string name of the class -#ifdef AER_THRUST_CUDA +#ifdef AER_THRUST_GPU static std::string name() { return "unitary_gpu"; } #else static std::string name() { return "unitary_thrust"; } From d8beeabcefb2bf92e672cfaafc9821866b406e79 Mon Sep 17 00:00:00 2001 From: Samuel F Antao Date: Fri, 25 Aug 2023 02:22:40 +0100 Subject: [PATCH 5/5] Fix errors coming from deprecations in recent NumPy versions. (#1904) * Add Eclipse IDE project files to .gitignore. 
* Change existing CUDA implementation to reflect a generic GPU. * Correct NumPy deprecation warnings. * Fix formatting. * Cancel .gitignore changes for Eclipse IDE. * Rebase without CUDA refactor changes. --------- Co-authored-by: Jun Doi --- qiskit_aer/quantum_info/states/aer_densitymatrix.py | 2 +- qiskit_aer/quantum_info/states/aer_statevector.py | 2 +- .../backends/aer_simulator/test_save_matrix_product_state.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/qiskit_aer/quantum_info/states/aer_densitymatrix.py b/qiskit_aer/quantum_info/states/aer_densitymatrix.py index f9c5090562..3c0d5fd7a2 100644 --- a/qiskit_aer/quantum_info/states/aer_densitymatrix.py +++ b/qiskit_aer/quantum_info/states/aer_densitymatrix.py @@ -274,7 +274,7 @@ def from_label(cls, label): @staticmethod def from_int(i, dims): - size = np.product(dims) + size = np.prod(dims) state = np.zeros((size, size), dtype=complex) state[i, i] = 1.0 return AerDensityMatrix(state, dims=dims) diff --git a/qiskit_aer/quantum_info/states/aer_statevector.py b/qiskit_aer/quantum_info/states/aer_statevector.py index b243dd2d4f..fa4acf321e 100644 --- a/qiskit_aer/quantum_info/states/aer_statevector.py +++ b/qiskit_aer/quantum_info/states/aer_statevector.py @@ -281,7 +281,7 @@ def from_label(cls, label): @staticmethod def from_int(i, dims): - size = np.product(dims) + size = np.prod(dims) state = np.zeros(size, dtype=complex) state[i] = 1.0 return AerStatevector(state, dims=dims) diff --git a/test/terra/backends/aer_simulator/test_save_matrix_product_state.py b/test/terra/backends/aer_simulator/test_save_matrix_product_state.py index 533c293cd8..79584ca136 100644 --- a/test/terra/backends/aer_simulator/test_save_matrix_product_state.py +++ b/test/terra/backends/aer_simulator/test_save_matrix_product_state.py @@ -13,6 +13,7 @@ AerSimulator Integration Tests for SaveMatrixProductState instruction """ from ddt import ddt +import math import numpy as np from qiskit import QuantumCircuit, transpile from test.terra.backends.simulator_test_case import SimulatorTestCase, supported_methods @@ -36,7 +37,7 @@ def test_save_matrix_product_state(self, method, device): target_qreg.append((np.array([[1]], dtype=complex), np.array([[0]], dtype=complex))) target_lambda_reg = [] - target_lambda_reg.append(np.array([1 / np.math.sqrt(2)], dtype=float)) + target_lambda_reg.append(np.array([1 / math.sqrt(2)], dtype=float)) target_lambda_reg.append(np.array([1], dtype=float)) # Matrix product state test circuit
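A closing note on the NumPy renames in this last patch: np.product has always been an alias of np.prod, and np.math is simply Python's built-in math module re-exported, so both substitutions are behavior-preserving; NumPy 1.25 deprecates the old spellings and NumPy 2.0 removes them. A quick illustrative check of the two replacements (a standalone sketch, not part of the test suite):

    import math

    import numpy as np

    dims = (2, 2, 4)
    size = np.prod(dims)                 # was: np.product(dims)
    amp = 1 / math.sqrt(2)               # was: 1 / np.math.sqrt(2)

    assert size == 16                    # total dimension of the composite system
    assert abs(amp - 2**-0.5) < 1e-15    # same double-precision value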