From 888280ce1ab9e6073a579f4d6462d327ff4e77ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jinguo=20Liu=20=28=E5=88=98=E9=87=91=E5=9B=BD=29?= Date: Thu, 29 Feb 2024 14:34:03 +0800 Subject: [PATCH] Make CuYao an extension (#494) * add CuYao as extension * tests pass * add makefile * update make file * rm CuYao LICENSE * add CuYao and YaoPlots doc --- Makefile | 37 ++++ Project.toml | 7 + docs/Project.toml | 1 + docs/make.jl | 1 + docs/src/man/cuda.md | 71 +++++++ docs/src/man/plot.md | 54 ++++- ext/CuYao.jl | 3 + ext/CuYao/.gitignore | 23 ++ ext/CuYao/README.md | 8 + ext/CuYao/benchmarks/TestCuda.jl | 54 +++++ ext/CuYao/benchmarks/gates.jl | 19 ++ ext/CuYao/benchmarks/gcompile.jl | 92 ++++++++ ext/CuYao/benchmarks/paralleldot.jl | 11 + ext/CuYao/benchmarks/statexpect.jl | 13 ++ ext/CuYao/src/CUDApatch.jl | 64 ++++++ ext/CuYao/src/CuYao.jl | 38 ++++ ext/CuYao/src/instructs.jl | 312 ++++++++++++++++++++++++++++ ext/CuYao/src/register.jl | 270 ++++++++++++++++++++++++ ext/CuYao/test/CUDApatch.jl | 35 ++++ ext/CuYao/test/Project.toml | 23 ++ ext/CuYao/test/extra.jl | 52 +++++ ext/CuYao/test/instructs.jl | 135 ++++++++++++ ext/CuYao/test/register.jl | 139 +++++++++++++ ext/CuYao/test/runtests.jl | 18 ++ lib/YaoPlots/src/vizcircuit.jl | 14 +- src/Yao.jl | 7 +- src/cudainterfaces.jl | 41 ++++ 27 files changed, 1532 insertions(+), 10 deletions(-) create mode 100644 Makefile create mode 100644 docs/src/man/cuda.md create mode 100644 ext/CuYao.jl create mode 100644 ext/CuYao/.gitignore create mode 100644 ext/CuYao/README.md create mode 100644 ext/CuYao/benchmarks/TestCuda.jl create mode 100644 ext/CuYao/benchmarks/gates.jl create mode 100644 ext/CuYao/benchmarks/gcompile.jl create mode 100644 ext/CuYao/benchmarks/paralleldot.jl create mode 100644 ext/CuYao/benchmarks/statexpect.jl create mode 100644 ext/CuYao/src/CUDApatch.jl create mode 100644 ext/CuYao/src/CuYao.jl create mode 100644 ext/CuYao/src/instructs.jl create mode 100644 ext/CuYao/src/register.jl create mode 100644 ext/CuYao/test/CUDApatch.jl create mode 100644 ext/CuYao/test/Project.toml create mode 100644 ext/CuYao/test/extra.jl create mode 100644 ext/CuYao/test/instructs.jl create mode 100644 ext/CuYao/test/register.jl create mode 100644 ext/CuYao/test/runtests.jl create mode 100644 src/cudainterfaces.jl diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..45e53f2d --- /dev/null +++ b/Makefile @@ -0,0 +1,37 @@ +JL = julia --project + +default: init test + +init: + $(JL) -e 'using Pkg; Pkg.develop([Pkg.PackageSpec(path = joinpath("lib", pkg)) for pkg in ["YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "YaoAPI"]]); Pkg.precompile()' +init-docs: + $(JL) -e 'using Pkg; Pkg.activate("docs"); Pkg.develop([Pkg.PackageSpec(path = joinpath("lib", pkg)) for pkg in ["YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "YaoAPI"]]); Pkg.precompile()' + +update: + $(JL) -e 'using Pkg; Pkg.update(); Pkg.precompile()' +update-docs: + $(JL) -e 'using Pkg; Pkg.activate("docs"); Pkg.update(); Pkg.precompile()' + +test-Yao: + $(JL) -e 'using Pkg; Pkg.test("Yao")' +test-CuYao: + $(JL) -e 'using Pkg; Pkg.activate("ext/CuYao/test"); Pkg.develop(path="."); Pkg.update()' + $(JL) -e 'using Pkg; Pkg.activate("ext/CuYao/test"); include("ext/CuYao/test/runtests.jl")' + +test-%: + $(JL) -e 'using Pkg; Pkg.test("$*")' + +test: + $(JL) -e 'using Pkg; Pkg.test(["YaoAPI", "YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "Yao"])' + +coverage: + $(JL) -e 'using Pkg; Pkg.test(["YaoAPI", "YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", 
"Yao"]; coverage=true)'
+
+servedocs:
+	$(JL) -e 'using Pkg; Pkg.activate("docs"); using LiveServer; servedocs(;skip_dirs=["docs/src/assets", "docs/src/generated"])'
+
+clean:
+	rm -rf docs/build
+	find . -name "*.cov" -type f -print0 | xargs -0 /bin/rm -f
+
+.PHONY: init test
diff --git a/Project.toml b/Project.toml
index 88493caf..4faa42d5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,8 +13,15 @@ YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df"
 YaoPlots = "32cfe2d9-419e-45f2-8191-2267705d8dbc"
 YaoSym = "3b27209a-d3d6-11e9-3c0f-41eb92b2cb9d"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+CuYao = "CUDA"
+
 [compat]
 BitBasis = "0.8, 0.9"
+CUDA = "4, 5"
 LinearAlgebra = "1"
 LuxurySparse = "0.7"
 Reexport = "1"
diff --git a/docs/Project.toml b/docs/Project.toml
index d6d7fde0..d0ab5651 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -7,6 +7,7 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
 Latexify = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
diff --git a/docs/make.jl b/docs/make.jl
index f67cad1a..9a549392 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -58,6 +58,7 @@ const PAGES = [
         "man/registers.md",
         "man/blocks.md",
         "man/symbolic.md",
+        "man/cuda.md",
         "man/plot.md",
         "man/automatic_differentiation.md",
         "man/simplification.md",
diff --git a/docs/src/man/cuda.md b/docs/src/man/cuda.md
new file mode 100644
index 00000000..98454f1b
--- /dev/null
+++ b/docs/src/man/cuda.md
@@ -0,0 +1,85 @@
+```@meta
+CurrentModule = Yao
+```
+
+# CUDA extension - CuYao
+
+## Tutorial
+`CuYao` is the CUDA extension of Yao, which lets you run Yao circuits on the GPU. Its usage is the same as `Yao`'s, with a few extra APIs to move registers between the CPU and the GPU:
+- `cu(reg)` to upload a register `reg` to the GPU, and
+- `cpu(cureg)` to download a register `cureg` from the GPU to the CPU.
+
+```julia
+julia> using Yao, CUDA
+
+# create a register on GPU
+julia> cureg = rand_state(9; nbatch=1000) |> cu; # or `curand_state(9; nbatch=1000)`.
+
+# run a circuit on GPU
+julia> cureg |> put(9, 2=>Z);
+
+# measure the register on GPU
+julia> measure!(cureg)
+1000-element CuArray{DitStr{2, 9, Int64}, 1, CUDA.Mem.DeviceBuffer}:
+ 110110100 ₍₂₎
+ 000100001 ₍₂₎
+ 111111001 ₍₂₎
+ ⋮
+ 010001101 ₍₂₎
+ 000100110 ₍₂₎
+
+# download the register to CPU
+julia> reg = cureg |> cpu;
+```
+
+
+## Features
+Supported gates:
+
+- general U(N) gate
+- general U(1) gate
+- X, Y, Z gate
+- T, S gate
+- SWAP gate
+- control gates
+
+Supported register operations:
+
+- measure!, measure_reset!, measure_remove!, select
+- append_qudits!, append_qubits!
+- insert_qudit!, insert_qubits!
+- focus!, relax!
+- join
+- density_matrix
+- fidelity
+- expect
+
+Autodiff:
+- automatic differentiation is supported when the only parameterized gates in a circuit are rotation gates.
+
+## API
+```@docs
+cpu
+curand_state
+cuzero_state
+cuproduct_state
+cuuniform_state
+cughz_state
+```
+
+!!! note
+    The `cu` function is not documented in this module; it is the standard CUDA.jl upload function, which here moves a register to the GPU.
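+
+For example, here is a minimal sketch (assuming a CUDA-capable device; the circuit and observable are illustrative) that evaluates an expectation value and its gradient entirely on the GPU:
+
+```julia
+julia> using Yao, CUDA
+
+julia> circuit = chain(put(4, 2=>Rx(0.5)), control(4, 2, 3=>X));
+
+julia> cureg = curand_state(4);
+
+julia> ev = expect(put(4, 3=>Z), cureg => circuit);
+
+julia> reggrad, paramgrad = expect'(put(4, 3=>Z), cureg => circuit);
+```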
\ No newline at end of file
diff --git a/docs/src/man/plot.md b/docs/src/man/plot.md
index 7c0a3d6e..7e1e9160 100644
--- a/docs/src/man/plot.md
+++ b/docs/src/man/plot.md
@@ -11,7 +11,60 @@ end
 
 # Quantum Circuit Visualization
 
-Quantum circuit visualization support for Yao.
+`YaoPlots` is the quantum circuit visualization component of Yao.
+
+
+## Examples
+#### Example 1: Visualize a QBIR defined in Yao
+
+```@example plot
+using Yao.EasyBuild, YaoPlots
+
+# show a qft circuit
+vizcircuit(qft_circuit(5))
+```
+
+If you are using a Pluto/Jupyter notebook or the Atom/VSCode editor, you should see the above image in your plotting panel.
+
+#### Example 2: Visualize a single qubit
+```@example plot
+using YaoPlots, Yao
+
+reg = zero_state(1) |> Rx(π/8) |> Rx(π/8)
+rho = density_matrix(ghz_state(2), 1)
+
+bloch_sphere("|ψ⟩"=>reg, "ρ"=>rho; show_projection_lines=true)
+```
+
+See more [examples](lib/YaoPlots/examples/circuits.jl).
+
+### Adjusting the plot attributes
+
+Various attributes of the visualizations can be altered.
+The plot can be modified by changing the following attributes:
+
+- `YaoPlots.CircuitStyles.linecolor[]` for the line color, default value being `"#000000"` (black)
+- `YaoPlots.CircuitStyles.gate_bgcolor[]` for the background color of square blocks, default value being `"#FFFFFF"` (white)
+- `YaoPlots.CircuitStyles.textcolor[]` for the text color, default value being `"#000000"`
+- `YaoPlots.CircuitStyles.fontfamily[]` for the font family
+- `YaoPlots.CircuitStyles.lw[]` for the line width, default value being `1` (pt)
+- `YaoPlots.CircuitStyles.textsize[]` for the text size, default value being `16` (pt)
+- `YaoPlots.CircuitStyles.paramtextsize[]` for the parameter text size of parameterized gates, default value being `10` (pt)
+
+For example,
+
+```@example plot
+using YaoPlots, Yao
+YaoPlots.CircuitStyles.linecolor[] = "pink"
+YaoPlots.CircuitStyles.gate_bgcolor[] = "yellow"
+YaoPlots.CircuitStyles.textcolor[] = "#000080" # the navy blue color
+YaoPlots.CircuitStyles.fontfamily[] = "JuliaMono"
+YaoPlots.CircuitStyles.lw[] = 2.5
+YaoPlots.CircuitStyles.textsize[] = 13
+YaoPlots.CircuitStyles.paramtextsize[] = 8
+
+vizcircuit(chain(3, put(1=>X), repeat(3, H), put(2=>Y), repeat(3, Rx(π/2))))
+```
 
 ## Circuit Visualization
 ```@docs
diff --git a/ext/CuYao.jl b/ext/CuYao.jl
new file mode 100644
index 00000000..1421abce
--- /dev/null
+++ b/ext/CuYao.jl
@@ -0,0 +1,3 @@
+module CuYao
+include("CuYao/src/CuYao.jl")
+end
diff --git a/ext/CuYao/.gitignore b/ext/CuYao/.gitignore
new file mode 100644
index 00000000..56b59ee8
--- /dev/null
+++ b/ext/CuYao/.gitignore
@@ -0,0 +1,23 @@
+*.jl.cov
+*.jl.*.cov
+*.jl.mem
+
+docs/build/
+docs/site/
+docs/src/tutorial/RegisterBasics.md
+docs/src/tutorial/BlockBasics.md
+docs/src/tutorial/BinaryBasics.md
+docs/src/tutorial/QCBM.md
+
+*.ipynb_checkpoints
+**/*.ipynb_checkpoints
+**/**/*.ipynb_checkpoints
+
+_*.dat
+*.swp
+__pycache__/
+.vscode/
+
+Manifest.toml
+
+_local/
diff --git a/ext/CuYao/README.md b/ext/CuYao/README.md
new file mode 100644
index 00000000..becb9632
--- /dev/null
+++ b/ext/CuYao/README.md
@@ -0,0 +1,7 @@
+## How to run tests
+
+Open a terminal, switch to the Yao folder, and run:
+```bash
+make test-CuYao
+```
+The `test-CuYao` target also sets up the test environment, so no separate init step is needed.
diff --git a/ext/CuYao/benchmarks/TestCuda.jl b/ext/CuYao/benchmarks/TestCuda.jl
new file mode 100644
index 00000000..8c6a6cd5
--- /dev/null
+++ b/ext/CuYao/benchmarks/TestCuda.jl
@@ -0,0 +1,54 @@
+using CUDA
+using LinearAlgebra
+using BenchmarkTools
+
+function ms!(X::CuArray, s::Number)
+    function kernel(X, s)
+        i = (blockIdx().x-1) * blockDim().x + 
threadIdx().x + @inbounds X[i] *= s + return + end + @cuda blocks=length(X) kernel(X, s) + X +end + +function ms1!(X::CuArray, a, b, c) + k = f(a) + function kernel(X, a, b, c) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + k(X, i) + k(X, i) + k(X, i) + #@inbounds X[i] *= a + #@inbounds X[i] *= b + #@inbounds X[i] *= c + return + end + @cuda blocks=length(X)÷256 threads=256 kernel(X, a, b, c) + X +end + +@inline function f(s) + @inline function kernel(X, i) + X[i]*=s + end +end +function ms2!(X::CuArray, s::Number) + function kernel(X, s) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + @inbounds X[i] *= s + return + end + @cuda blocks=length(X)÷256 threads=256 kernel(X, s) + X +end + +a = randn(1<<10) +cua = cu(a) +@benchmark ms1!($cua, 1.001, 1.001, 1.001) +@benchmark ms2!(ms2!(ms2!($cua, 1.001), 1.001), 1.001) +bss = 1:length(cua) +@benchmark f(1.001).(cua, bss) + +@benchmark rmul!($a, 1.01) +@benchmark ms!($cua, 1.0) diff --git a/ext/CuYao/benchmarks/gates.jl b/ext/CuYao/benchmarks/gates.jl new file mode 100644 index 00000000..b63dbd49 --- /dev/null +++ b/ext/CuYao/benchmarks/gates.jl @@ -0,0 +1,19 @@ +using Yao, CuYao, CUDA +using BenchmarkTools + +reg = rand_state(12; nbatch=1000) +creg = reg |> cu +@benchmark CUDA.@sync creg |> put(12, 3=>Z) +@benchmark CUDA.@sync creg |> put(12, 3=>X) +@benchmark reg |> put(12, 3=>Z) +@benchmark CUDA.@sync creg |> control(12, 6, 3=>X) +@benchmark reg |> control(12, 6, 3=>X) +@benchmark CUDA.@sync creg |> put(12, 3=>rot(X, 0.3)) +@benchmark reg |> put(12, 3=>rot(X, 0.3)) + +reg = rand_state(20) +creg = reg |> cu +g = swap(20, 7, 2) +g = put(20, (7, 2)=>matblock(rand_unitary(4))) +@benchmark reg |> g +@benchmark CUDA.@sync creg |> g diff --git a/ext/CuYao/benchmarks/gcompile.jl b/ext/CuYao/benchmarks/gcompile.jl new file mode 100644 index 00000000..62a90ae9 --- /dev/null +++ b/ext/CuYao/benchmarks/gcompile.jl @@ -0,0 +1,92 @@ +using Yao, Yao.Boost, Yao.Intrinsics, StaticArrays, Yao.Blocks +using CuYao, CUDA +using BenchmarkTools, Profile + +nbit = 12 +c = chain(put(nbit, 2=>X), put(nbit, 2=>rot(X, 0.2)), control(nbit, 3, 2=>rot(X,0.3))) +#c = chain(c..., c...,c...) +cc = c |> KernelCompiled +reg = rand_state(nbit) |> cu + +@benchmark $reg |> copy |> $c[1] seconds = 2 +@benchmark $reg |> copy |> $cc seconds = 2 + + +reg = rand_state(9, 1000) +creg = reg |> cu +@benchmark focus!(reg |> copy, 1) do r + measure!(r) + r +end + +@benchmark focus!(creg|>copy, 1) do r + measure!(r) + r +end + +@benchmark focus!(creg, 1) do r + measure_reset!(r) + r +end + +@profile for i=1:10 focus!(creg |> copy, 1) do r + measure!(r) + r +end +end + +a = randn(1<<10) +ca = a |> cu + +gpu_call(ca, (x->x^2, ca)) do state, f, ca + ilin = linear_index(state) + ca[ilin] = f(ca[ilin]) + return +end + +gpu_call(1:1<<10, (x->x^2, ca)) do state, f, ca + ilin = linear_index(state) + ca[ilin] = f(ca[ilin]) + return +end + + +function xx_kernel(bits::Ints) + ctrl = controller(bits[1], 0) + mask = bmask(bits...) 
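+    # the returned closure implements a fused multi-X: gating on `bits[1] == 0`
+    # visits each amplitude pair once; the partner index flips every bit in `mask`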
+    function kernel(state, inds)
+        i = inds[1]
+        b = i-1
+        ctrl(b) && swaprows!(piecewise(state, inds), i, flip(b, mask) + 1)
+    end
+end
+
+using CuYao: x_kernel, y_kernel, z_kernel
+fx = x_kernel((3,4))
+fy = y_kernel(3)
+fz = z_kernel((3,4))
+
+a = randn(1<<10)
+ca = a |> cu
+fs = (fx, fz)
+
+@benchmark gpu_call(ca, (fs, ca)) do state, f, ca
+    #ilin = linear_index(state)
+    ilin = @cartesianidx ca
+    for fi in fs
+        fi(ca, ilin)
+    end
+    return
+end
+
+@device_code_warntype gpu_call(ca, ((fx, fz), ca)) do state, fs, ca
+    ilin = @cartesianidx ca
+    #for fi in fs fi(ca, ilin) end
+    Base.Cartesian.@nexprs 2 i->fs[i](ca, ilin) # unrolled over the two kernels; `$`-interpolation is only valid inside quoted expressions
+    #fs(ca, ilin)
+    return
+end
+
+a ≈ Vector(ca)
+
+@edit controller(3,0)
diff --git a/ext/CuYao/benchmarks/paralleldot.jl b/ext/CuYao/benchmarks/paralleldot.jl
new file mode 100644
index 00000000..38e5dd6e
--- /dev/null
+++ b/ext/CuYao/benchmarks/paralleldot.jl
@@ -0,0 +1,13 @@
+# benchmark sketch, reworked from an unfinished draft: scale the amplitudes selected by
+# `mask` with a two-level diagonal (a, d); assumes CuYao's `piecewise` and GPUArrays' `gpu_call`.
+function paralleldot(state, a, d, mask)
+    @inline function kernel(ctx, state, a, d, mask)
+        inds = @cartesianidx state
+        i = inds[1]
+        piecewise(state, inds)[i] *= anyone(i-1, mask) ? d : a
+        return
+    end
+    gpu_call(kernel, state, a, d, mask; elements=length(state))
+    return state
+end
+
diff --git a/ext/CuYao/benchmarks/statexpect.jl b/ext/CuYao/benchmarks/statexpect.jl
new file mode 100644
index 00000000..9e3ca5dd
--- /dev/null
+++ b/ext/CuYao/benchmarks/statexpect.jl
@@ -0,0 +1,13 @@
+using CuYao
+using Yao
+using BenchmarkTools
+
+sf(x, y) = abs(x-y)
+a = randn(1024)
+ca = a |> cu
+b = randn(1024)
+cb = b |> cu
+@benchmark expect(StatFunctional{2}(sf), a, b) seconds=2
+@benchmark expect(StatFunctional{2}(sf), a) seconds=2
+@benchmark expect(StatFunctional{2}(sf), ca, cb) seconds=2
+@benchmark expect(StatFunctional{2}(sf), ca) seconds=2
diff --git a/ext/CuYao/src/CUDApatch.jl b/ext/CuYao/src/CUDApatch.jl
new file mode 100644
index 00000000..aeaf7d36
--- /dev/null
+++ b/ext/CuYao/src/CUDApatch.jl
@@ -0,0 +1,64 @@
+# TODO
+# support norm(view(reshape(A, m, n), :, 1))
+norm2(A::DenseCuArray; dims=1) = mapreduce(abs2, +, A, dims=dims) .|> CUDA.sqrt
+
+piecewise(state::AbstractVector, inds) = state
+piecewise(state::AbstractMatrix, inds) = @inbounds view(state,:,inds[2])
+
+function kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1, T2}
+    res = CUDA.zeros(promote_type(T1,T2), (size(A).*size(B))...)
+    @inline function kernel(ctx, res, A, B)
+        state = @linearidx res
+        @inbounds inds = CartesianIndices(res)[state].I
+        inds_A = (inds.-1) .÷ size(B) .+ 1
+        inds_B = (inds.-1) .% size(B) .+ 1
+        @inbounds res[state] = A[inds_A...]*B[inds_B...]
+        return
+    end
+
+    gpu_call(kernel, res, A, B)
+    res
+end
+
+"""
+    kron!(C::CuArray, A, B)
+
+Computes Kronecker products in-place on the GPU.
+The results are stored in `C`, overwriting its existing values.
+"""
+function Yao.YaoArrayRegister.kron!(C::CuArray{T3}, A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1, T2, T3}
+    @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2))) || throw(DimensionMismatch())
+    CI = Base.CartesianIndices(C)
+    @inline function kernel(ctx, C, A, B)
+        state = @linearidx C
+        @inbounds inds = CI[state].I
+        inds_A = (inds.-1) .÷ size(B) .+ 1
+        inds_B = (inds.-1) .% size(B) .+ 1
+        @inbounds C[state] = A[inds_A...]*B[inds_B...]
+        return
+    end
+
+    gpu_call(kernel, C, A, B)
+    C
+end
+
+function getindex(A::DenseCuVector{T}, B::DenseCuArray{<:Integer}) where T
+    res = CUDA.zeros(T, size(B)...)
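+    # gather kernel: thread `state` writes A[B[state]] into res[state]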
+ @inline function kernel(ctx, res, A, B) + state = @linearidx res + @inbounds res[state] = A[B[state]] + return + end + gpu_call(kernel, res, A, B) + res +end + +function getindex(A::AbstractVector, B::DenseCuArray{<:Integer}) + A[Array(B)] +end + +YaoBlocks.AD.as_scalar(x::DenseCuArray) = Array(x)[] + +# patch for ExponentialUtilities +YaoBlocks.compatible_multiplicative_operand(::CuArray, source::AbstractArray) = CuArray(source) +YaoBlocks.compatible_multiplicative_operand(::CuArray, source::CuArray) = source diff --git a/ext/CuYao/src/CuYao.jl b/ext/CuYao/src/CuYao.jl new file mode 100644 index 00000000..a04c2417 --- /dev/null +++ b/ext/CuYao/src/CuYao.jl @@ -0,0 +1,38 @@ +# module CuYao +using Yao.YaoArrayRegister.LuxurySparse, Yao.YaoArrayRegister.StaticArrays, LinearAlgebra, Base.Cartesian +using Yao.YaoArrayRegister.StatsBase, Yao.YaoArrayRegister.BitBasis +using Yao.Reexport +import Yao.YaoArrayRegister.TupleTools +using Yao.YaoArrayRegister.Random + +using Yao.YaoArrayRegister +using Yao +using CUDA +using CUDA.GPUArrays: gpu_call, @linearidx, @cartesianidx, linear_index +using Yao.YaoArrayRegister +using Yao.YaoBlocks +using Yao.ConstGate: SWAPGate +using Yao.ConstGate: S, T, Sdag, Tdag + +import Yao.YaoArrayRegister: insert_qudits!, join +import CUDA: cu +import Yao.YaoArrayRegister: _measure, measure, measure! +import Yao.YaoArrayRegister: batch_normalize! +import Yao.YaoBlocks: BlockedBasis, nblocks, subblock +import Yao: expect +import Yao.YaoArrayRegister: u1rows!, unrows!, autostatic, instruct!, swaprows!, mulrow! +import Yao.YaoArrayRegister.LinearAlgebra: norm +import Base: kron, getindex + +import Yao: cpu, cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state + +const Ints = NTuple{<:Any, Int} + +include("CUDApatch.jl") +include("register.jl") +include("instructs.jl") + +function __init__() + CUDA.allowscalar(false) +end +# end diff --git a/ext/CuYao/src/instructs.jl b/ext/CuYao/src/instructs.jl new file mode 100644 index 00000000..6ed8e35d --- /dev/null +++ b/ext/CuYao/src/instructs.jl @@ -0,0 +1,312 @@ +# get index +macro idx(shape, grididx=1, ctxsym=:ctx) + quote + x = $(esc(shape)) + i = $linear_index($(esc(ctxsym)), $(esc(grididx))) + i > $prod2(x) && return + @inbounds Base.CartesianIndices(x)[i].I + end +end +replace_first(x::NTuple{2}, v) = (v, x[2]) +replace_first(x::NTuple{1}, v) = (v,) +prod2(x::NTuple{2}) = x[1] * x[2] +prod2(x::NTuple{1}) = x[1] + +gpu_compatible(A::AbstractVecOrMat) = A |> staticize +gpu_compatible(A::StaticArray) = A + +###################### unapply! ############################ +function instruct!(::Val{2}, state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} + @debug "The generic U(N) matrix of size ($(size(U0))), on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + nbit = log2dim1(state) + # reorder a unirary matrix. + U = gpu_compatible(all(TupleTools.diff(locs).>0) ? U0 : reorder(U0, collect(locs)|>sortperm)) + locked_bits = [clocs..., locs...] + locked_vals = [cvals..., zeros(Int, M)...] 
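+    # `configs` enumerates base indices with the control bits fixed to `cvals` and the
+    # gate bits fixed to 0; `locs_raw` holds the 2^M offsets of the amplitudes that one
+    # kernel thread updates as a block via `unrows!`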
+    locs_raw = gpu_compatible([i+1 for i in itercontrol(nbit, setdiff(1:nbit, locs), zeros(Int, nbit-M))])
+    configs = itercontrol(nbit, locked_bits, locked_vals)
+
+    len = length(configs)
+    @inline function kernel(ctx, state, locs_raw, U, configs, len)
+        CUDA.assume(len > 0)
+        sz = size(state)
+        CUDA.assume(length(sz) == 1 || length(sz) == 2)
+        inds = @idx replace_first(sz, len)
+        x = @inbounds configs[inds[1]]
+        @inbounds unrows!(piecewise(state, inds), x .+ locs_raw, U)
+        return
+    end
+
+    @inline function kernel_single_entry_diag(ctx, state, loc, val, configs, len)
+        CUDA.assume(len > 0)
+        sz = size(state)
+        CUDA.assume(length(sz) == 1 || length(sz) == 2)
+        inds = @idx replace_first(sz, len)
+        x = @inbounds configs[inds[1]]
+        @inbounds piecewise(state, inds)[x + loc] *= val
+        return
+    end
+
+    elements = len*size(state,2)
+    if U isa Diagonal && count(!isone, U.diag) == 1
+        @debug "The single entry diagonal matrix, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
+        k = findfirst(!isone, U.diag)
+        loc = locs_raw[k]
+        val = U.diag[k]
+        gpu_call(kernel_single_entry_diag, state, loc, val, configs, len; elements)
+    else
+        gpu_call(kernel, state, locs_raw, U, configs, len; elements)
+    end
+    state
+end
+instruct!(::Val{2}, state::DenseCuVecOrMat, U0::IMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = state
+instruct!(::Val{2}, state::DenseCuVecOrMat, U0::SDSparseMatrixCSC, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = instruct!(Val(2), state, U0 |> Matrix, locs, clocs, cvals)
+
+################## General U1 apply! ###################
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDSparseMatrixCSC, loc::Int)
+    # densify and route to the generic path; uncontrolled, hence the empty control tuples
+    instruct!(Val(2), state, Matrix(U1), (loc,), (), ())
+end
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::AbstractMatrix, loc::Int)
+    @debug "The generic U(2) matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    a, c, b, d = U1
+    nbit = log2dim1(state)
+    step = 1<<(loc-1)
+    configs = itercontrol(nbit, [loc], [0])
+
+    len = length(configs)
+    @inline function kernel(ctx, state, a, b, c, d, len)
+        inds = @idx replace_first(size(state), len)
+        i = @inbounds configs[inds[1]]+1
+        @inbounds u1rows!(piecewise(state, inds), i, i+step, a, b, c, d)
+        return
+    end
+    gpu_call(kernel, state, a, b, c, d, len; elements=len*size(state,2))
+    return state
+end
+
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDPermMatrix, loc::Int)
+    @debug "The single qubit permutation matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    nbit = log2dim1(state)
+    b, c = U1.vals
+    step = 1<<(loc-1)
+    configs = itercontrol(nbit, [loc], [0])
+
+    len = length(configs)
+    function kernel(ctx, state, b, c, step, len, configs)
+        inds = @idx replace_first(size(state), len)
+        x = @inbounds configs[inds[1]] + 1
+        @inbounds swaprows!(piecewise(state, inds), x, x+step, c, b)
+        return
+    end
+    gpu_call(kernel, state, b, c, step, len, configs; elements=len*size(state,2))
+    return state
+end
+
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDDiagonal, loc::Int)
+    @debug "The single qubit diagonal matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    a, d = U1.diag
+    nbit = log2dim1(state)
+    mask = bmask(loc)
+    @inline function kernel(ctx, state, a, d, mask)
+        inds = @cartesianidx state
+        i = inds[1]
+        piecewise(state, inds)[i] *= anyone(i-1, mask) ? 
d : a + return + end + gpu_call(kernel, state, a, d, mask; elements=length(state)) + return state +end + +YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U::IMatrix, loc::Int) = state + +################## XYZ ############# + +_instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}) where {L} = _instruct!(state, Val(:X), locs, (), ()) +function _instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {L,C} + @debug "The X gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + length(locs) == 0 && return state + nbit = log2dim1(state) + configs = itercontrol(nbit, [clocs..., locs[1]], [cvals..., 0]) + mask = bmask(locs...) + len = length(configs) + @inline function kernel(ctx, state, mask, len, configs) + inds = @idx replace_first(size(state), len) + b = @inbounds configs[inds[1]] + @inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1) + return + end + gpu_call(kernel, state, mask, len, configs; elements=len*size(state,2)) + return state +end + +function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::NTuple{C,Int}) where C + @debug "The Y gate, on: GPU, locations: $(locs)." + length(locs) == 0 && return state + nbit = log2dim1(state) + mask = bmask(Int, locs...) + configs = itercontrol(nbit, [locs[1]], [0]) + bit_parity = length(locs)%2 == 0 ? 1 : -1 + factor = (-im)^length(locs) + len = length(configs) + @inline function kernel(ctx, state, mask, bit_parity, configs, len) + inds = @idx replace_first(size(state), len) + b = @inbounds configs[inds[1]] + i_ = flip(b, mask) + 1 + factor1 = count_ones(b&mask)%2 == 1 ? -factor : factor + factor2 = factor1*bit_parity + @inbounds swaprows!(piecewise(state, inds), b+1, i_, factor2, factor1) + return + end + gpu_call(kernel, state, mask, bit_parity, configs, len; elements=len*size(state,2)) + return state +end + +function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C + @debug "The Y gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + length(locs) == 0 && return state + nbit = log2dim1(state) + configs = itercontrol(nbit, [clocs..., locs...], [cvals..., 0]) + mask = bmask(locs...) + len = length(configs) + @inline function kernel(ctx, state, configs, mask, len) + inds = @idx replace_first(size(state), len) + b = @inbounds configs[inds[1]] + @inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1, im, -im) + return + end + gpu_call(kernel, state, configs, mask, len; elements=len*size(state,2)) + return state +end + +function _instruct!(state::DenseCuVecOrMat, ::Val{:Z}, locs::NTuple{C,Int}) where C + @debug "The Z gate, on: GPU, locations: $(locs)." + length(locs) == 0 && return state + nbit = log2dim1(state) + mask = bmask(locs...) + @inline function kernel(ctx, state, mask) + inds = @cartesianidx state + i = inds[1] + piecewise(state, inds)[i] *= count_ones((i-1)&mask)%2==1 ? -1 : 1 + return + end + gpu_call(kernel, state, mask; elements=length(state)) + return state +end + + +for (G, FACTOR) in zip([:Z, :S, :T, :Sdag, :Tdag], [:(-one(Int32)), :(1f0im), :($(exp(im*π/4))), :(-1f0im), :($(exp(-im*π/4)))]) + if G !== :Z + @eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C + @debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs)." + length(locs) == 0 && return state + nbit = log2dim1(state) + mask = bmask(Int32, locs...) 
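+            # diagonal phase gate: amplitude i picks up one FACTOR per set gate bit of i-1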
+ @inline function kernel(ctx, state, mask) + inds = @cartesianidx state + i = inds[1] + piecewise(state, inds)[i] *= $FACTOR ^ count_ones(Int32(i-1)&mask) + return + end + gpu_call(kernel, state, mask; elements=length(state)) + return state + end + end + @eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C + @debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + ctrl = controller((clocs..., locs...), (cvals..., 1)) + @inline function kernel(ctx, state, ctrl) + inds = @cartesianidx state + i = inds[1] + piecewise(state, inds)[i] *= ctrl(i-1) ? $FACTOR : one($FACTOR) + return + end + gpu_call(kernel, state, ctrl; elements=length(state)) + return state + end +end + +for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag] + @eval begin + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C + _instruct!(state, g, locs) + end + + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int}) + _instruct!(state, g, locs) + end + + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C + _instruct!(state, g, locs, clocs, cvals) + end + + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, locs::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int}) + _instruct!(state, vg, locs, cloc, cval) + end + end + +end + +function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:SWAP}, locs::Tuple{Int,Int}) + @debug "The SWAP gate, on: GPU, locations: $(locs)." + b1, b2 = locs + mask1 = bmask(b1) + mask2 = bmask(b2) + + configs = itercontrol(log2dim1(state), [locs...], [1,0]) + function kf(ctx, state, mask1, mask2) + inds = @idx replace_first(size(state), length(configs)) + + b = configs[inds[1]] + i = b+1 + i_ = b ⊻ (mask1|mask2) + 1 + swaprows!(piecewise(state, inds), i, i_) + nothing + end + gpu_call(kf, state, mask1, mask2; elements=length(configs)*size(state,2)) + state +end + +############## other gates ################ +# parametrized swap gate + +function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:PSWAP}, locs::Tuple{Int, Int}, θ::Real) + @debug "The PSWAP gate, on: GPU, locations: $(locs)." 
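+    # rot(SWAP, θ): basis states with equal bits on `locs` only gain the global phase
+    # e = exp(-iθ/2); the {|01⟩, |10⟩} pair is rotated with the Rx(θ) matrix entries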
+ nbit = log2dim1(state) + mask1 = bmask(locs[1]) + mask2 = bmask(locs[2]) + mask12 = mask1|mask2 + a, c, b_, d = mat(Rx(θ)) + e = exp(-im/2*θ) + configs = itercontrol(nbit, [locs...], [0,0]) + len = length(configs) + @inline function kernel(ctx, state, mask2, mask12, configs, a, b_, c, d) + inds = @idx replace_first(size(state), len) + @inbounds x = configs[inds[1]] + piecewise(state, inds)[x+1] *= e + piecewise(state, inds)[x⊻mask12+1] *= e + y = x ⊻ mask2 + @inbounds u1rows!(piecewise(state, inds), y+1, y⊻mask12+1, a, b_, c, d) + return + end + gpu_call(kernel, state, mask2, mask12, configs, a, b_, c, d; elements=len*size(state,2)) + state +end + +function YaoBlocks._apply_fallback!(r::AbstractCuArrayReg{B,T}, b::AbstractBlock) where {B,T} + YaoBlocks._check_size(r, b) + r.state .= CUDA.adapt(CuArray{T}, mat(T, b)) * r.state + return r +end + +for RG in [:Rx, :Ry, :Rz] + @eval function instruct!( + ::Val{2}, + state::DenseCuVecOrMat{T}, + ::Val{$(QuoteNode(RG))}, + locs::Tuple{Int}, + theta::Number + ) where {T} + YaoArrayRegister.instruct!(Val(2), state, Val($(QuoteNode(RG))), locs, (), (), theta) + return state + end +end diff --git a/ext/CuYao/src/register.jl b/ext/CuYao/src/register.jl new file mode 100644 index 00000000..01c83cbc --- /dev/null +++ b/ext/CuYao/src/register.jl @@ -0,0 +1,270 @@ +cu(reg::ArrayReg{D}) where D = ArrayReg{D}(CuArray(reg.state)) +cpu(reg::ArrayReg{D}) where D = ArrayReg{D}(Array(reg.state)) +cu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(CuArray(reg.state), reg.nbatch) +cpu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(Array(reg.state), reg.nbatch) +cu(reg::DensityMatrix{D}) where D = DensityMatrix{D}(CuArray(reg.state)) +cpu(reg::DensityMatrix{D}) where D = DensityMatrix{D}(Array(reg.state)) +const AbstractCuArrayReg{D, T, MT} = AbstractArrayReg{D, T, MT} where MT<:DenseCuArray +const CuArrayReg{D, T, MT} = ArrayReg{D, T, MT} where MT<:DenseCuArray +const CuBatchedArrayReg{D, T, MT} = BatchedArrayReg{D, T, MT} where MT<:DenseCuArray +const CuDensityMatrix{D, T, MT} = DensityMatrix{D, T, MT} where MT<:DenseCuMatrix + +function batch_normalize!(s::DenseCuArray, p::Real=2) + p!=2 && throw(ArgumentError("p must be 2!")) + s./=norm2(s, dims=1) + return s +end + +@inline function tri2ij(l::Int) + i = ceil(Int, sqrt(2*l+0.25)-0.5) + j = l-i*(i-1)÷2 + return i+1,j +end + +############### MEASURE ################## +function measure(::ComputationalBasis, reg::ArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T} + _measure(rng, basis(reg), reg |> probs |> Vector, nshots) +end + +# TODO: optimize the batch dimension using parallel sampling +function measure(::ComputationalBasis, reg::BatchedArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T} + regm = reg |> rank3 + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + return _measure(rng, basis(reg), pl |> Matrix, nshots) +end + +function measure!(::RemoveMeasured, ::ComputationalBasis, reg::AbstractCuArrayReg{D}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where D + regm = reg |> rank3 + B = size(regm, 3) + nregm = similar(regm, D ^ nremain(reg), B) + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + pl_cpu = pl |> Matrix + res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B) + res = CuArray(res_cpu) + CI = Base.CartesianIndices(nregm) + @inline function kernel(ctx, nregm, regm, res, pl) + state = @linearidx nregm + @inbounds i,j = 
CI[state].I + @inbounds r = Int(res[j])+1 + @inbounds nregm[i,j] = regm[r,i,j]/CUDA.sqrt(pl[r, j]) + return + end + gpu_call(kernel, nregm, regm, res, pl) + reg.state = reshape(nregm,1,:) + return reg isa ArrayReg ? Array(res)[] : res +end + +function measure!(::NoPostProcess, ::ComputationalBasis, reg::AbstractCuArrayReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T} + regm = reg |> rank3 + B = size(regm, 3) + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + pl_cpu = pl |> Matrix + res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B) + res = CuArray(res_cpu) + CI = Base.CartesianIndices(regm) + + @inline function kernel(ctx, regm, res, pl) + state = @linearidx regm + @inbounds k,i,j = CI[state].I + @inbounds rind = Int(res[j]) + 1 + @inbounds regm[k,i,j] = k==rind ? regm[k,i,j]/CUDA.sqrt(pl[k, j]) : T(0) + return + end + gpu_call(kernel, regm, res, pl) + return reg isa ArrayReg ? Array(res)[] : res +end + +function YaoArrayRegister.measure!( + ::NoPostProcess, + bb::BlockedBasis, + reg::AbstractCuArrayReg{D,T}, + ::AllLocs; + rng::AbstractRNG = Random.GLOBAL_RNG, +) where {D,T} + state = @inbounds (reg|>rank3)[bb.perm, :, :] # permute to make eigen values sorted + B = size(state, 3) + pl = dropdims(mapreduce(abs2, +, state, dims=2), dims=2) + pl_cpu = pl |> Matrix + pl_block = zeros(eltype(pl), nblocks(bb), B) + @inbounds for ib = 1:B + for i = 1:nblocks(bb) + for k in subblock(bb, i) + pl_block[i, ib] += pl_cpu[k, ib] + end + end + end + # perform measurements on CPU + res_cpu = Vector{Int}(undef, B) + @inbounds @views for ib = 1:B + ires = sample(rng, 1:nblocks(bb), Weights(pl_block[:, ib])) + # notice ires is `BitStr` type, can be use as indices directly. + range = subblock(bb, ires) + state[range, :, ib] ./= sqrt(pl_block[ires, ib]) + state[1:range.start-1, :, ib] .= zero(T) + state[range.stop+1:size(state, 1), :, ib] .= zero(T) + res_cpu[ib] = ires + end + # undo permute and assign back + _state = reshape(state, 1 << nactive(reg), :) + rstate = reshape(reg.state, 1 << nactive(reg), :) + @inbounds rstate[bb.perm, :] .= _state + return reg isa ArrayReg ? bb.values[res_cpu[]] : CuArray(bb.values[res_cpu]) +end + +function measure!(rst::ResetTo, ::ComputationalBasis, reg::AbstractCuArrayReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T} + regm = reg |> rank3 + B = size(regm, 3) + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + pl_cpu = pl |> Matrix + res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B) + res = CuArray(res_cpu) + CI = Base.CartesianIndices(regm) + + @inline function kernel(ctx, regm, res, pl, val) + state = @linearidx regm + @inbounds k,i,j = CI[state].I + @inbounds rind = Int(res[j]) + 1 + @inbounds k==val+1 && (regm[k,i,j] = regm[rind,i,j]/CUDA.sqrt(pl[rind, j])) + CUDA.sync_threads() + @inbounds k!=val+1 && (regm[k,i,j] = 0) + return + end + + gpu_call(kernel, regm, res, pl, rst.x) + return reg isa ArrayReg ? 
Array(res)[] : res +end + +function YaoArrayRegister.batched_kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1 ,T2} + res = CUDA.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3)) + CI = Base.CartesianIndices(res) + @inline function kernel(ctx, res, A, B) + state = @linearidx res + @inbounds i,j,b = CI[state].I + i_A, i_B = divrem((i-1), size(B,1)) + j_A, j_B = divrem((j-1), size(B,2)) + @inbounds res[state] = A[i_A+1, j_A+1, b]*B[i_B+1, j_B+1, b] + return + end + + gpu_call(kernel, res, A, B) + return res +end + +""" + YaoArrayRegister.batched_kron!(C::CuArray, A, B) + +Performs batched Kronecker products in-place on the GPU. +The results are stored in 'C', overwriting the existing values of 'C'. +""" +function YaoArrayRegister.batched_kron!(C::CuArray{T3, 3}, A::DenseCuArray, B::DenseCuArray) where {T3} + @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2), size(A,3))) || throw(DimensionMismatch()) + @boundscheck (size(A,3) == size(B,3) == size(C,3)) || throw(DimensionMismatch()) + CI = Base.CartesianIndices(C) + @inline function kernel(ctx, C, A, B) + state = @linearidx C + @inbounds i,j,b = CI[state].I + i_A, i_B = divrem((i-1), size(B,1)) + j_A, j_B = divrem((j-1), size(B,2)) + @inbounds C[state] = A[i_A+1, j_A+1, b]*B[i_B+1, j_B+1, b] + return + end + + gpu_call(kernel, C, A, B) + return C +end + +function join(reg1::AbstractCuArrayReg{D}, reg2::AbstractCuArrayReg{D}) where {D} + @assert nbatch(reg1) == nbatch(reg2) + s1 = reg1 |> rank3 + s2 = reg2 |> rank3 + state = YaoArrayRegister.batched_kron(s1, s2) + return arrayreg(copy(reshape(state, size(state, 1), :)); nlevel=D, nbatch=nbatch(reg1)) +end + +function Yao.insert_qudits!(reg::AbstractCuArrayReg{D}, loc::Int; nqudits::Int=1) where D + na = nactive(reg) + focus!(reg, 1:loc-1) + reg2 = join(zero_state(nqudits; nbatch=nbatch(reg)) |> cu, reg) |> relax! |> focus!((1:na+nqudits)...) + reg.state = reg2.state + return reg +end + +cuproduct_state(bit_str::BitStr; nbatch::Union{NoBatch,Int} = NoBatch()) = + cuproduct_state(ComplexF64, bit_str; nbatch = nbatch) +cuproduct_state(bit_str::AbstractVector; nbatch::Union{NoBatch,Int} = NoBatch()) = + cuproduct_state(ComplexF64, bit_str; nbatch = nbatch) +cuproduct_state(total::Int, bit_config::Integer; kwargs...) = + cuproduct_state(ComplexF64, total, bit_config; kwargs...) +cuproduct_state(::Type{T}, bit_str::BitStr{N}; kwargs...) where {T,N} = + cuproduct_state(T, N, buffer(bit_str); kwargs...) +cuproduct_state(::Type{T}, bit_configs::AbstractVector; kwargs...) where {T} = + cuproduct_state(T, bit_literal(bit_configs...); kwargs...) +function cuproduct_state( + ::Type{T}, + total::Int, + bit_config::Integer; + nbatch::Union{Int,NoBatch} = NoBatch(), + nlevel::Int=2, +) where {T} + raw = CUDA.zeros(T, nlevel ^ total, YaoArrayRegister._asint(nbatch)) + raw[Int(bit_config)+1,:] .= Ref(one(T)) + return arrayreg(raw; nbatch=nbatch, nlevel=nlevel) +end + +cuzero_state(n::Int; kwargs...) = cuzero_state(ComplexF64, n; kwargs...) +cuzero_state(::Type{T}, n::Int; kwargs...) where {T} = cuproduct_state(T, n, 0; kwargs...) + +curand_state(n::Int; kwargs...) = curand_state(ComplexF64, n; kwargs...) + +function curand_state( + ::Type{T}, + n::Int; + nbatch::Union{Int,NoBatch} = NoBatch(), + nlevel = 2, +) where {T} + raw = CUDA.randn(T, nlevel ^ n, YaoArrayRegister._asint(nbatch)) + return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel)) +end + +cuuniform_state(n::Int; kwargs...) = cuuniform_state(ComplexF64, n; kwargs...) 
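+# uniform superposition: upload a vector of ones, then normalize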
+function cuuniform_state(::Type{T}, n::Int; + nbatch::Union{Int,NoBatch} = NoBatch(), + nlevel::Int = 2, +) where {T} + raw = CUDA.ones(T, nlevel ^ n, YaoArrayRegister._asint(nbatch)) + return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel)) +end + +cughz_state(n::Int; kwargs...) = cughz_state(ComplexF64, n; kwargs...) +function cughz_state(::Type{T}, n::Int; kwargs...) where {T} + reg = cuzero_state(T, n; kwargs...) + reg.state[1:1,:] .= Ref(sqrt(T(0.5))) + reg.state[end:end,:] .= Ref(sqrt(T(0.5))) + return reg +end + +#= +for FUNC in [:measure!, :measure!] + @eval function $FUNC(rng::AbstractRNG, op::AbstractBlock, reg::AbstractCuArrayReg, al::AllLocs; kwargs...) where B + E, V = eigen!(mat(op) |> Matrix) + ei = Eigen(E|>cu, V|>cu) + $FUNC(rng::AbstractRNG, ei, reg, al; kwargs...) + end +end +=# + +function YaoBlocks.expect(op::AbstractAdd, dm::CuDensityMatrix) + sum(x->expect(x, dm), subblocks(op)) +end +function YaoBlocks.expect(op::AbstractBlock, dm::CuDensityMatrix{D}) where D + return tr(apply(ArrayReg{D}(dm.state), op).state) +end + +measure( + ::ComputationalBasis, + reg::CuDensityMatrix, + ::AllLocs; + nshots::Int = 1, + rng::AbstractRNG = Random.GLOBAL_RNG, +) = YaoArrayRegister._measure(rng, basis(reg), Array(reg |> probs), nshots) + diff --git a/ext/CuYao/test/CUDApatch.jl b/ext/CuYao/test/CUDApatch.jl new file mode 100644 index 00000000..136d7aeb --- /dev/null +++ b/ext/CuYao/test/CUDApatch.jl @@ -0,0 +1,35 @@ +using Yao +using CUDA +using Test +using Yao.YaoBlocks + +@testset "isapprox-complex" begin + ca = CuArray(randn(ComplexF64,3,3)) + cb = copy(ca) + #@test ca ≈ cb still error! + cb[1:1, 1:1] .+= 1e-7im + @test isapprox(ca, cb, atol=1e-5) + @test !isapprox(ca, cb, atol=1e-9) +end + +@testset "view general" begin + a = randn(5,6,8) |> CuArray + @test view(a, 2:4, 4, [1,4,3]) |> size == (3, 3) +end + +@testset "permutedims vector" begin + ca = randn(ComplexF64,3,4,5,1) + @test permutedims(CuArray(ca), [2,1,4,3]) |> Array ≈ permutedims(ca, [2,1,4,3]) +end + +@testset "Complex pow" begin + for T in [ComplexF64, ComplexF32] + a = CuArray(randn(T, 4, 4)) + @test Array(a .^ Int32(3)) ≈ Array(a).^3 + @test Array(a .^ real(T)(3)) ≈ Array(a).^3 + end +end + +@testset "as_scalar" begin + @test YaoBlocks.AD.as_scalar([2.0] |> CuArray) == 2.0 +end diff --git a/ext/CuYao/test/Project.toml b/ext/CuYao/test/Project.toml new file mode 100644 index 00000000..2e5f158c --- /dev/null +++ b/ext/CuYao/test/Project.toml @@ -0,0 +1,23 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" +Yao = "5872b779-8223-5990-8dd0-5abbb0748c8c" + +[compat] +CUDA = "4, 5" +Reexport = "0.2, 1" +TupleTools = "1" +Yao = "0.8" +YaoBlocks = "0.13.10" +julia = "1" + +[extras] +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df" + +[targets] +test = ["Test", "Statistics", "YaoBlocks"] diff --git a/ext/CuYao/test/extra.jl b/ext/CuYao/test/extra.jl new file mode 100644 index 00000000..95e759ce --- /dev/null +++ b/ext/CuYao/test/extra.jl @@ -0,0 +1,52 @@ +using Yao, Test, CUDA +CUDA.allowscalar(false) + +@testset "gradient" begin + reg1 = rand_state(10) + reg2 = rand_state(10) + c = EasyBuild.variational_circuit(10) + g1, g2 = fidelity'(reg1=>c, reg2) + cg1, cg2 = fidelity'(cu(reg1)=>c, cu(reg2)) + 
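+    # GPU gradients must match the CPU reference: `.first` is the register gradient, `.second` the parameter gradient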
@test g1.first ≈ cpu(cg1.first) + @test g1.second ≈ cg1.second + @test g2 ≈ cpu(cg2) + + h = EasyBuild.heisenberg(10) + g1 = expect'(h, reg1=>c) + cg1 = expect'(h, cu(reg1)=>c) + @test g1.first ≈ cpu(cg1.first) + @test g1.second ≈ cg1.second +end + +@testset "apply density matrix" begin + reg = rand_state(6) + creg = cu(reg) + rho = density_matrix(reg, (3,4)) + crho = density_matrix(creg, (3,4)) + @test rho ≈ cpu(crho) + g = put(2, 1=>Rx(0.3)) + @test cpu(apply(crho, g)) ≈ apply(rho, g) + rho = density_matrix(reg) + crho = density_matrix(creg) + @test rho ≈ cpu(crho) + @test probs(crho) ≈ probs(creg) + g = put(6, 1=>Rx(0.3)) + @test cpu(apply(crho, g)) ≈ apply(rho, g) + + # channel + c = UnitaryChannel([put(6, 1=>Rx(0.3)), put(6, 2=>Z)], [0.4, 0.6]) + @test cpu(apply(crho, c)) ≈ apply(rho, c) +end + +@testset "expect on density matrix" begin + reg = rand_state(6) + rho = density_matrix(reg, (3,4,5)) + crho = cu(rho) + h = EasyBuild.heisenberg(3) + a = expect(h, rho) + b = expect(h, crho) + @test a ≈ b + # fidelity + @test fidelity(crho, crho) ≈ 1 + @test measure(crho; nshots=2) isa Vector +end diff --git a/ext/CuYao/test/instructs.jl b/ext/CuYao/test/instructs.jl new file mode 100644 index 00000000..48ae4c69 --- /dev/null +++ b/ext/CuYao/test/instructs.jl @@ -0,0 +1,135 @@ +using LinearAlgebra, Yao.ConstGate +using Test, Random +using Yao +using Yao.YaoArrayRegister.StaticArrays +using Yao.ConstGate: SWAPGate +using CUDA + +@testset "gpu instruct nbit!" begin + Random.seed!(3) + nbit = 6 + N = 1<Z)), + mat(ComplexF32, put(2,2=>I2)), + mat(ComplexF32, put(2,2=>P0)) + ] + + @test instruct!(Val(2),v1 |> CuArray, UN, (3,1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, UN, (3,1)) + @test instruct!(Val(2),vn |> CuArray, UN, (3,1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, UN, (3,1)) + end + # sparse matrix like P0, P1 et. al. are not implemented. + for g in [mat(ComplexF32, ConstGate.T), mat(ComplexF32, ConstGate.H), mat(ComplexF32, rot(Z, 0.5))] + @test instruct!(Val(2), v1 |> CuArray, g, (3,), (4,), (1,)) |> Vector ≈ instruct!(Val(2), v1 |> copy, g, (3,), (4,), (1,)) + @test instruct!(Val(2), vn |> CuArray, g, (3,), (4,), (1,)) |> Matrix ≈ instruct!(Val(2), vn |> copy, g, (3,), (4,), (1,)) + @test instruct!(Val(2), v1 |> CuArray, g, (3,), (4,), (1,)) |> Vector ≈ instruct!(Val(2), v1 |> copy, g, (3,), (4,), (1,)) + @test instruct!(Val(2), vn |> CuArray, g, (3,), (4,), (1,)) |> Matrix ≈ instruct!(Val(2), vn |> copy, g, (3,), (4,), (1,)) + end +end + +@testset "gpu swapapply!" begin + nbit = 6 + N = 1< CuArray, Val(:SWAP), (3,5)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(:SWAP), (3,5)) + @test instruct!(Val(2),vn |> CuArray, Val(:SWAP), (3,5)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(:SWAP), (3,5)) +end + +@testset "gpu instruct! 1bit" begin + nbit = 6 + N = 1< CuArray, U1, (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, U1, (3,)) + @test instruct!(Val(2),vn |> CuArray, U1, (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, U1, (3,)) + end + # sparse matrix like P0, P1 et. al. are not implemented. +end + +@testset "gpu xyz-instruct!" 
+    nbit = 6
+    N = 1<<nbit
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 3)
+    for G in [:X, :Y, :Z, :H, :S, :T, :Sdag, :Tdag]
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,))
+        if G != :H
+            @test instruct!(Val(2),v1 |> CuArray, Val(G), (1,3,4)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (1,3,4))
+            @test instruct!(Val(2),vn |> CuArray, Val(G),(1,3,4)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (1,3,4))
+        end
+    end
+end
+
+@testset "gpu cn-xyz-instruct!" begin
+    nbit = 6
+    N = 1<<nbit
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 3)
+    for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (1,), (1,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G),(3,), (1,), (1,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (1,), (1,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G),(3,), (1,), (1,))
+    end
+end
+
+@testset "pswap" begin
+    ps = put(6, (2,4)=>rot(SWAP, π/2))
+    reg = rand_state(6; nbatch=10)
+    @test apply!(reg |> cu, ps) |> cpu ≈ apply!(copy(reg), ps)
+    @test apply!(reg |> cu, ps).state isa CuArray
+end
+
+@testset "regression test: Rx, Ry, Rz, CPHASE" begin
+    Random.seed!(3)
+    nbit = 6
+    for ps in [put(6, (2,)=>Rx(π/2)), put(6, 2=>Ry(0.5)), put(6, 2=>Rz(0.4))]
+        reg = rand_state(6; nbatch=10)
+        @test apply!(reg |> cu, ps) |> cpu ≈ apply!(copy(reg), ps)
+        @test apply!(reg |> cu, ps).state isa CuArray
+    end
+end
+
+@testset "density matrix" begin
+    nbit = 6
+    reg = rand_state(nbit)
+    rho = density_matrix(reg)
+    c = put(6, (2,)=>Rx(π/3))
+    @test apply(rho |> cu, c) |> cpu ≈ apply(rho, c)
+end
+
+@testset "time evolve" begin
+    g = time_evolve(kron(10, 2=>X, 3=>X), 0.5)
+    reg = rand_state(10)
+    @test apply!(copy(reg), g) ≈ apply!(reg |> cu, g) |> cpu
+end
+
+# fix: https://github.com/QuantumBFS/CuYao.jl/issues/81
+@testset "generic sparse (pqc circuit)" begin
+    # kron of Rx
+    pqc_circuit = subroutine(10, kron(Rx(0.4), Rx(0.5), Rx(0.6), Rx(0.8)), (1, 2, 6, 5))
+    proxy_reg = zero_state(10)
+    @test apply!(proxy_reg |> cu, pqc_circuit) |> cpu ≈ apply(proxy_reg, pqc_circuit)
+
+    # kron of Rz
+    pqc_circuit = subroutine(10, kron(Rz(0.4), Rz(0.5), Rz(0.6), Rz(0.8)), (1, 2, 6, 5))
+    proxy_reg = zero_state(10)
+    @test apply!(proxy_reg |> cu, pqc_circuit) |> cpu ≈ apply(proxy_reg, pqc_circuit)
+end
diff --git a/ext/CuYao/test/register.jl b/ext/CuYao/test/register.jl
new file mode 100644
index 00000000..8c6dc1d6
--- /dev/null
+++ b/ext/CuYao/test/register.jl
@@ -0,0 +1,139 @@
+using Test
+using Yao
+const CuYao = Base.get_extension(Yao, :CuYao)
+using LinearAlgebra
+using Yao.YaoArrayRegister.BitBasis
+using Statistics: mean
+using Yao.YaoArrayRegister.StaticArrays
+using CUDA
+using Yao.YaoArrayRegister: batched_kron, batched_kron!, batch_normalize!
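+# disable scalar indexing so accidental elementwise CPU fallbacks raise an error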
+CUDA.allowscalar(false)
+
+@testset "basics" begin
+    a = randn(ComplexF64, 50, 20)
+    ca = a|>cu
+    batch_normalize!(ca)
+    batch_normalize!(a)
+    @test ca |> Matrix ≈ a
+
+    for l = 1:100
+        i, j = CuYao.tri2ij(l)
+        @test (i-1)*(i-2)÷2+j == l
+    end
+end
+
+@testset "constructor and measure" begin
+    reg = rand_state(10)
+    greg = reg |> cu
+    @test greg isa CuYao.AbstractCuArrayReg
+    @test eltype(greg.state) == ComplexF64
+    myvec(x) = Vector(x)
+    myvec(x::Number) = [x]
+    for reg in [rand_state(10, nbatch=333), rand_state(10)]
+        greg = reg |> cu
+        @test size(measure(greg |> copy, nshots=10)) == size(measure(reg, nshots=10))
+        @test size(measure!(greg |> copy)) == size(measure!(reg |> copy))
+        @test size(measure!(ResetTo(0), greg |> copy)) == size(measure!(ResetTo(0), reg |> copy))
+        @test size(measure!(RemoveMeasured(), greg |> copy)) == size(measure!(RemoveMeasured(), reg |> copy))
+        @test select(greg |> copy, 12) |> cpu ≈ select(reg, 12)
+        @test size(measure!(greg |> copy |> focus!(3,4,1))) == size(measure!(reg |> copy |> focus!(3,4,1)))
+        @test greg |> copy |> focus!(3,4,1) |> relax!(3,4,1) |> cpu ≈ reg
+
+        if nbatch(greg) == 1
+            greg1 = greg |> copy |> focus!(1,4,3)
+            greg0 = copy(greg1)
+            res = measure!(RemoveMeasured(), greg1)
+            @test select(greg0, res |> myvec) |> normalize! |> cpu ≈ greg1 |> cpu
+        end
+
+        greg1 = greg |> copy |> focus!(1,4,3)
+        greg0 = copy(greg1)
+        res = measure!(ResetTo(3), greg1)
+        @test all(measure(greg1, nshots=10) .== 3)
+        @test greg1 |> isnormalized
+        @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), 3))
+
+        greg1 = greg |> copy |> focus!(1,4,3)
+        greg0 = copy(greg1)
+        res = measure!(greg1)
+        @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), res|>myvec))
+    end
+
+    @test join(rand_state(3) |> cu, rand_state(3) |> cu) |> nactive == 6
+    @test join(rand_state(3, nbatch=10) |> cu, rand_state(3, nbatch=10) |> cu) |> nactive == 6
+end
+
+@testset "insert_qubits!" begin
+    reg = rand_state(5; nbatch=10)
+    res = insert_qudits!(reg |> cu, 3, 2) |> cpu
+    @test insert_qudits!(reg, 3, 2) ≈ res
+    @test append_qudits!(copy(reg) |> cu, 3) |> cpu ≈ append_qudits!(copy(reg), 3)
+
+    reg = rand_state(5, nbatch=10) |> focus!(2,3)
+    res = insert_qudits!(reg |> cu, 3, 2) |> cpu
+    @test insert_qudits!(reg, 3, 2) ≈ res
+
+    @test append_qudits!(copy(reg) |> cu, 3) |> cpu ≈ append_qudits!(copy(reg), 3)
+end
+
+@testset "cuda-op-measures" begin
+    reg = rand_state(8; nbatch=32) |> cu
+    op = repeat(5, X, 1:5)
+
+    # measure!
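+    # projective measurement: repeating measure! on the collapsed register must reproduce the result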
+ reg2 = reg |> copy + res = measure!(op, reg2, 2:6) + res2 = measure!(op, reg2, 2:6) + @test size(res) == (32,) + @test res2 == res + + reg = clone(ArrayReg([1,-1+0im]/sqrt(2.0)), 10) |> cu + @test measure!(X, reg) |> mean ≈ -1 + reg = clone(ArrayReg([1.0,0+0im]), 1000) + @test abs(measure!(X, reg) |> mean) < 0.1 +end + +@testset "cuda kron getindex" begin + a = randn(3,4) + b = randn(4,2) + c = zeros(12,8) + ca, cb, cc = cu(a), cu(b), cu(c) + @test kron(ca, cb) |> Array ≈ kron(a, b) + @test Yao.YaoArrayRegister.kron!(cc, ca, cb) |> Array ≈ kron(a,b) + + Yao.YaoArrayRegister.kron!(c,a,b) + @test cc |> Array ≈ c + + v = randn(100) |> cu + inds = [3,5,2,1,7,1] + @test v[inds] ≈ v[inds |> CuVector] +end + +@testset "cuda batched_kron" begin + a = randn(3,4,5) + b = randn(4,2,5) + c = zeros(12,8,5) + ca, cb, cc = cu(a), cu(b), cu(c) + + @test batched_kron(ca, cb) |> Array ≈ batched_kron(a, b) + @test batched_kron!(cc, ca, cb) |> Array ≈ batched_kron(a, b) + + batched_kron!(c, a, b) + @test cc |> Array ≈ c +end + +@testset "zero_state, et al" begin + for b = [4, NoBatch()] + reg = cuzero_state(3; nbatch=b) + @test cpu(reg) ≈ zero_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + reg = curand_state(3; nbatch=b) + @test reg isa CuYao.AbstractCuArrayReg + reg = cuuniform_state(3; nbatch=b) + @test cpu(reg) ≈ uniform_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + reg = cuproduct_state(bit"110"; nbatch=b) + @test cpu(reg) ≈ product_state(bit"110"; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + reg = cughz_state(3; nbatch=b) + @test cpu(reg) ≈ ghz_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + end +end + diff --git a/ext/CuYao/test/runtests.jl b/ext/CuYao/test/runtests.jl new file mode 100644 index 00000000..420a735c --- /dev/null +++ b/ext/CuYao/test/runtests.jl @@ -0,0 +1,18 @@ +using CUDA, Yao, Test +CUDA.allowscalar(false) + +@testset "CUDA patch" begin + include("CUDApatch.jl") +end + +@testset "GPU reg" begin + include("register.jl") +end + +@testset "gpu applies" begin + include("instructs.jl") +end + +@testset "extra" begin + include("extra.jl") +end diff --git a/lib/YaoPlots/src/vizcircuit.jl b/lib/YaoPlots/src/vizcircuit.jl index 9e5478f9..d4d28cb9 100644 --- a/lib/YaoPlots/src/vizcircuit.jl +++ b/lib/YaoPlots/src/vizcircuit.jl @@ -367,13 +367,13 @@ for B in [:LabelBlock, :GeneralMatrixBlock, :Add] _draw!(c, [controls..., (address, c.gatestyles.g, string(cb))]) end end -for GT in [:KronBlock, :RepeatedBlock, :CachedBlock, :Subroutine, :(YaoBlocks.AD.NoParams)] - @eval function draw!(c::CircuitGrid, p::$GT, address, controls) - barrier_style = CircuitStyles.barrier_for_chain[] - CircuitStyles.barrier_for_chain[] = false - draw!(c, YaoBlocks.Optimise.to_basictypes(p), address, controls) - CircuitStyles.barrier_for_chain[] = barrier_style - end + +# [:KronBlock, :RepeatedBlock, :CachedBlock, :Subroutine, :(YaoBlocks.AD.NoParams)] +function draw!(c::CircuitGrid, p::CompositeBlock, address, controls) + barrier_style = CircuitStyles.barrier_for_chain[] + CircuitStyles.barrier_for_chain[] = false + draw!(c, YaoBlocks.Optimise.to_basictypes(p), address, controls) + CircuitStyles.barrier_for_chain[] = barrier_style end for (GATE, SYM) in [(:XGate, :Rx), (:YGate, :Ry), (:ZGate, :Rz)] @eval get_brush_texts(c, b::RotationGate{D,T,<:$GATE}) where {D,T} = (c.gatestyles.g, "$($(SYM))($(pretty_angle(b.theta)))") diff --git a/src/Yao.jl b/src/Yao.jl index a42710b0..87d1ac4b 100644 --- a/src/Yao.jl +++ b/src/Yao.jl @@ -16,7 +16,6 @@ const 幺 = Yao using Reexport @reexport using 
YaoArrayRegister, YaoBlocks, YaoSym, YaoPlots
-export EasyBuild
 
 using YaoArrayRegister.BitBasis, YaoAPI
 
 using YaoBlocks:
@@ -28,8 +27,12 @@ using YaoBlocks:
     print_title,
     print_block
 
+export EasyBuild
+# CUDA APIs
+export cpu, cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state
 
-include("deprecations.jl")
+include("cudainterfaces.jl")
 include("EasyBuild/easybuild.jl")
+include("deprecations.jl")
 
 end # module
diff --git a/src/cudainterfaces.jl b/src/cudainterfaces.jl
new file mode 100644
index 00000000..335ba000
--- /dev/null
+++ b/src/cudainterfaces.jl
@@ -0,0 +1,41 @@
+"""
+    cuproduct_state([T=ComplexF64], total::Int, bit_config::Integer; nbatch=NoBatch())
+
+The GPU version of [`product_state`](@ref).
+"""
+function cuproduct_state end
+
+"""
+    curand_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`rand_state`](@ref).
+"""
+function curand_state end
+
+"""
+    cuzero_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`zero_state`](@ref).
+"""
+function cuzero_state end
+
+"""
+    cuuniform_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`uniform_state`](@ref).
+"""
+function cuuniform_state end
+
+"""
+    cughz_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`ghz_state`](@ref).
+"""
+function cughz_state end
+
+"""
+    cpu(cureg)
+
+Download the register state from the GPU to the CPU.
+"""
+function cpu end