From 888280ce1ab9e6073a579f4d6462d327ff4e77ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jinguo=20Liu=20=28=E5=88=98=E9=87=91=E5=9B=BD=29?= Date: Thu, 29 Feb 2024 14:34:03 +0800 Subject: [PATCH] Make CuYao an extension (#494) * add CuYao as extension * tests pass * add makefile * update make file * rm CuYao LICENSE * add CuYao and YaoPlots doc --- Makefile | 37 ++++ Project.toml | 7 + docs/Project.toml | 1 + docs/make.jl | 1 + docs/src/man/cuda.md | 71 +++++++ docs/src/man/plot.md | 54 ++++- ext/CuYao.jl | 3 + ext/CuYao/.gitignore | 23 ++ ext/CuYao/README.md | 8 + ext/CuYao/benchmarks/TestCuda.jl | 54 +++++ ext/CuYao/benchmarks/gates.jl | 19 ++ ext/CuYao/benchmarks/gcompile.jl | 92 ++++++++ ext/CuYao/benchmarks/paralleldot.jl | 11 + ext/CuYao/benchmarks/statexpect.jl | 13 ++ ext/CuYao/src/CUDApatch.jl | 64 ++++++ ext/CuYao/src/CuYao.jl | 38 ++++ ext/CuYao/src/instructs.jl | 312 ++++++++++++++++++++++++++++ ext/CuYao/src/register.jl | 270 ++++++++++++++++++++++++ ext/CuYao/test/CUDApatch.jl | 35 ++++ ext/CuYao/test/Project.toml | 23 ++ ext/CuYao/test/extra.jl | 52 +++++ ext/CuYao/test/instructs.jl | 135 ++++++++++++ ext/CuYao/test/register.jl | 139 +++++++++++++ ext/CuYao/test/runtests.jl | 18 ++ lib/YaoPlots/src/vizcircuit.jl | 14 +- src/Yao.jl | 7 +- src/cudainterfaces.jl | 41 ++++ 27 files changed, 1532 insertions(+), 10 deletions(-) create mode 100644 Makefile create mode 100644 docs/src/man/cuda.md create mode 100644 ext/CuYao.jl create mode 100644 ext/CuYao/.gitignore create mode 100644 ext/CuYao/README.md create mode 100644 ext/CuYao/benchmarks/TestCuda.jl create mode 100644 ext/CuYao/benchmarks/gates.jl create mode 100644 ext/CuYao/benchmarks/gcompile.jl create mode 100644 ext/CuYao/benchmarks/paralleldot.jl create mode 100644 ext/CuYao/benchmarks/statexpect.jl create mode 100644 ext/CuYao/src/CUDApatch.jl create mode 100644 ext/CuYao/src/CuYao.jl create mode 100644 ext/CuYao/src/instructs.jl create mode 100644 ext/CuYao/src/register.jl create mode 100644 ext/CuYao/test/CUDApatch.jl create mode 100644 ext/CuYao/test/Project.toml create mode 100644 ext/CuYao/test/extra.jl create mode 100644 ext/CuYao/test/instructs.jl create mode 100644 ext/CuYao/test/register.jl create mode 100644 ext/CuYao/test/runtests.jl create mode 100644 src/cudainterfaces.jl diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..45e53f2d --- /dev/null +++ b/Makefile @@ -0,0 +1,37 @@ +JL = julia --project + +default: init test + +init: + $(JL) -e 'using Pkg; Pkg.develop([Pkg.PackageSpec(path = joinpath("lib", pkg)) for pkg in ["YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "YaoAPI"]]); Pkg.precompile()' +init-docs: + $(JL) -e 'using Pkg; Pkg.activate("docs"); Pkg.develop([Pkg.PackageSpec(path = joinpath("lib", pkg)) for pkg in ["YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "YaoAPI"]]); Pkg.precompile()' + +update: + $(JL) -e 'using Pkg; Pkg.update(); Pkg.precompile()' +update-docs: + $(JL) -e 'using Pkg; Pkg.activate("docs"); Pkg.update(); Pkg.precompile()' + +test-Yao: + $(JL) -e 'using Pkg; Pkg.test("Yao")' +test-CuYao: + $(JL) -e 'using Pkg; Pkg.activate("ext/CuYao/test"); Pkg.develop(path="."); Pkg.update()' + $(JL) -e 'using Pkg; Pkg.activate("ext/CuYao/test"); include("ext/CuYao/test/runtests.jl")' + +test-%: + $(JL) -e 'using Pkg; Pkg.test("$*")' + +test: + $(JL) -e 'using Pkg; Pkg.test(["YaoAPI", "YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", "Yao"])' + +coverage: + $(JL) -e 'using Pkg; Pkg.test(["YaoAPI", "YaoArrayRegister", "YaoBlocks", "YaoSym", "YaoPlots", 
"Yao"]; coverage=true)'
+
+servedocs:
+	$(JL) -e 'using Pkg; Pkg.activate("docs"); using LiveServer; servedocs(;skip_dirs=["docs/src/assets", "docs/src/generated"])'
+
+clean:
+	rm -rf docs/build
+	find . -name "*.cov" -type f -print0 | xargs -0 /bin/rm -f
+
+.PHONY: init test
diff --git a/Project.toml b/Project.toml
index 88493caf..4faa42d5 100644
--- a/Project.toml
+++ b/Project.toml
@@ -13,8 +13,15 @@ YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df"
 YaoPlots = "32cfe2d9-419e-45f2-8191-2267705d8dbc"
 YaoSym = "3b27209a-d3d6-11e9-3c0f-41eb92b2cb9d"
 
+[weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+
+[extensions]
+CuYao = "CUDA"
+
 [compat]
 BitBasis = "0.8, 0.9"
+CUDA = "4, 5"
 LinearAlgebra = "1"
 LuxurySparse = "0.7"
 Reexport = "1"
diff --git a/docs/Project.toml b/docs/Project.toml
index d6d7fde0..d0ab5651 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -7,6 +7,7 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
 Latexify = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
diff --git a/docs/make.jl b/docs/make.jl
index f67cad1a..9a549392 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -58,6 +58,7 @@ const PAGES = [
         "man/registers.md",
         "man/blocks.md",
         "man/symbolic.md",
+        "man/cuda.md",
         "man/plot.md",
         "man/automatic_differentiation.md",
         "man/simplification.md",
diff --git a/docs/src/man/cuda.md b/docs/src/man/cuda.md
new file mode 100644
index 00000000..98454f1b
--- /dev/null
+++ b/docs/src/man/cuda.md
@@ -0,0 +1,85 @@
+```@meta
+CurrentModule = Yao
+```
+
+# CUDA extension - CuYao
+
+## Tutorial
+`CuYao` is the CUDA extension of Yao, which lets you run Yao circuits on the GPU. Its usage is the same as `Yao`'s, with a few extra APIs to move registers between the CPU and the GPU:
+- `cu(reg)` to upload a register `reg` to the GPU, and
+- `cpu(cureg)` to download a register `cureg` from the GPU to the CPU.
+
+```julia
+julia> using Yao, CUDA
+
+# create a register on GPU
+julia> cureg = rand_state(9; nbatch=1000) |> cu; # or `curand_state(9; nbatch=1000)`.
+
+# run a circuit on GPU
+julia> cureg |> put(9, 2=>Z);
+
+# measure the register on GPU
+julia> measure!(cureg)
+1000-element CuArray{DitStr{2, 9, Int64}, 1, CUDA.Mem.DeviceBuffer}:
+ 110110100 ₍₂₎
+ 000100001 ₍₂₎
+ 111111001 ₍₂₎
+ ⋮
+ 010001101 ₍₂₎
+ 000100110 ₍₂₎
+
+# download the register to CPU
+julia> reg = cureg |> cpu;
+```
+
+
+## Features
+Supported gates:
+
+- general U(N) gate
+- general U(1) gate
+- X, Y, Z gate
+- T, S gate
+- SWAP gate
+- control gates
+
+Supported register operations:
+
+- measure!, measure_reset!, measure_remove!, select
+- append_qudits!, append_qubits!
+- insert_qudit!, insert_qubits!
+- focus!, relax!
+- join
+- density_matrix
+- fidelity
+- expect
+
+Autodiff:
+- automatic differentiation is supported when the only parameterized gates in a circuit are rotation gates.
+
+## API
+```@docs
+cpu
+curand_state
+cuzero_state
+cuproduct_state
+cuuniform_state
+cughz_state
+```
+
+!!! note
+    The `cu` function is not documented in this module; it is the standard CUDA.jl upload function, which here moves a register to the GPU.
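+
+For example, here is a minimal sketch (assuming a CUDA-capable device; the circuit and observable are illustrative) that evaluates an expectation value and its gradient entirely on the GPU:
+
+```julia
+julia> using Yao, CUDA
+
+julia> circuit = chain(put(4, 2=>Rx(0.5)), control(4, 2, 3=>X));
+
+julia> cureg = curand_state(4);
+
+julia> ev = expect(put(4, 3=>Z), cureg => circuit);
+
+julia> reggrad, paramgrad = expect'(put(4, 3=>Z), cureg => circuit);
+```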
\ No newline at end of file
diff --git a/docs/src/man/plot.md b/docs/src/man/plot.md
index 7c0a3d6e..7e1e9160 100644
--- a/docs/src/man/plot.md
+++ b/docs/src/man/plot.md
@@ -11,7 +11,60 @@ end
 
 # Quantum Circuit Visualization
 
-Quantum circuit visualization support for Yao.
+`YaoPlots` is the quantum circuit visualization component of Yao.
+
+
+## Examples
+#### Example 1: Visualize a QBIR defined in Yao
+
+```@example plot
+using Yao.EasyBuild, YaoPlots
+
+# show a qft circuit
+vizcircuit(qft_circuit(5))
+```
+
+If you are using a Pluto/Jupyter notebook or the Atom/VSCode editor, you should see the above image in your plotting panel.
+
+#### Example 2: Visualize a single qubit
+```@example plot
+using YaoPlots, Yao
+
+reg = zero_state(1) |> Rx(π/8) |> Rx(π/8)
+rho = density_matrix(ghz_state(2), 1)
+
+bloch_sphere("|ψ⟩"=>reg, "ρ"=>rho; show_projection_lines=true)
+```
+
+See more [examples](lib/YaoPlots/examples/circuits.jl).
+
+### Adjusting the plot attributes
+
+Various attributes of the visualizations can be altered.
+The plot can be modified by changing the following attributes:
+
+- `YaoPlots.CircuitStyles.linecolor[]` for the line color, default value being `"#000000"` (black)
+- `YaoPlots.CircuitStyles.gate_bgcolor[]` for the background color of square blocks, default value being `"#FFFFFF"` (white)
+- `YaoPlots.CircuitStyles.textcolor[]` for the text color, default value being `"#000000"`
+- `YaoPlots.CircuitStyles.fontfamily[]` for the font family
+- `YaoPlots.CircuitStyles.lw[]` for the line width, default value being `1` (pt)
+- `YaoPlots.CircuitStyles.textsize[]` for the text size, default value being `16` (pt)
+- `YaoPlots.CircuitStyles.paramtextsize[]` for the parameter text size of parameterized gates, default value being `10` (pt)
+
+For example,
+
+```@example plot
+using YaoPlots, Yao
+YaoPlots.CircuitStyles.linecolor[] = "pink"
+YaoPlots.CircuitStyles.gate_bgcolor[] = "yellow"
+YaoPlots.CircuitStyles.textcolor[] = "#000080" # the navy blue color
+YaoPlots.CircuitStyles.fontfamily[] = "JuliaMono"
+YaoPlots.CircuitStyles.lw[] = 2.5
+YaoPlots.CircuitStyles.textsize[] = 13
+YaoPlots.CircuitStyles.paramtextsize[] = 8
+
+vizcircuit(chain(3, put(1=>X), repeat(3, H), put(2=>Y), repeat(3, Rx(π/2))))
+```
 
 ## Circuit Visualization
 ```@docs
diff --git a/ext/CuYao.jl b/ext/CuYao.jl
new file mode 100644
index 00000000..1421abce
--- /dev/null
+++ b/ext/CuYao.jl
@@ -0,0 +1,3 @@
+module CuYao
+include("CuYao/src/CuYao.jl")
+end
diff --git a/ext/CuYao/.gitignore b/ext/CuYao/.gitignore
new file mode 100644
index 00000000..56b59ee8
--- /dev/null
+++ b/ext/CuYao/.gitignore
@@ -0,0 +1,23 @@
+*.jl.cov
+*.jl.*.cov
+*.jl.mem
+
+docs/build/
+docs/site/
+docs/src/tutorial/RegisterBasics.md
+docs/src/tutorial/BlockBasics.md
+docs/src/tutorial/BinaryBasics.md
+docs/src/tutorial/QCBM.md
+
+*.ipynb_checkpoints
+**/*.ipynb_checkpoints
+**/**/*.ipynb_checkpoints
+
+_*.dat
+*.swp
+__pycache__/
+.vscode/
+
+Manifest.toml
+
+_local/
diff --git a/ext/CuYao/README.md b/ext/CuYao/README.md
new file mode 100644
index 00000000..becb9632
--- /dev/null
+++ b/ext/CuYao/README.md
@@ -0,0 +1,7 @@
+## How to run tests
+
+Open a terminal, switch to the Yao folder, and run:
+```bash
+make test-CuYao
+```
+The `test-CuYao` target also sets up the test environment, so no separate init step is needed.
diff --git a/ext/CuYao/benchmarks/TestCuda.jl b/ext/CuYao/benchmarks/TestCuda.jl
new file mode 100644
index 00000000..8c6a6cd5
--- /dev/null
+++ b/ext/CuYao/benchmarks/TestCuda.jl
@@ -0,0 +1,54 @@
+using CUDA
+using LinearAlgebra
+using BenchmarkTools
+
+function ms!(X::CuArray, s::Number)
+    function kernel(X, s)
+        i = (blockIdx().x-1) * blockDim().x + 
threadIdx().x + @inbounds X[i] *= s + return + end + @cuda blocks=length(X) kernel(X, s) + X +end + +function ms1!(X::CuArray, a, b, c) + k = f(a) + function kernel(X, a, b, c) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + k(X, i) + k(X, i) + k(X, i) + #@inbounds X[i] *= a + #@inbounds X[i] *= b + #@inbounds X[i] *= c + return + end + @cuda blocks=length(X)÷256 threads=256 kernel(X, a, b, c) + X +end + +@inline function f(s) + @inline function kernel(X, i) + X[i]*=s + end +end +function ms2!(X::CuArray, s::Number) + function kernel(X, s) + i = (blockIdx().x-1) * blockDim().x + threadIdx().x + @inbounds X[i] *= s + return + end + @cuda blocks=length(X)÷256 threads=256 kernel(X, s) + X +end + +a = randn(1<<10) +cua = cu(a) +@benchmark ms1!($cua, 1.001, 1.001, 1.001) +@benchmark ms2!(ms2!(ms2!($cua, 1.001), 1.001), 1.001) +bss = 1:length(cua) +@benchmark f(1.001).(cua, bss) + +@benchmark rmul!($a, 1.01) +@benchmark ms!($cua, 1.0) diff --git a/ext/CuYao/benchmarks/gates.jl b/ext/CuYao/benchmarks/gates.jl new file mode 100644 index 00000000..b63dbd49 --- /dev/null +++ b/ext/CuYao/benchmarks/gates.jl @@ -0,0 +1,19 @@ +using Yao, CuYao, CUDA +using BenchmarkTools + +reg = rand_state(12; nbatch=1000) +creg = reg |> cu +@benchmark CUDA.@sync creg |> put(12, 3=>Z) +@benchmark CUDA.@sync creg |> put(12, 3=>X) +@benchmark reg |> put(12, 3=>Z) +@benchmark CUDA.@sync creg |> control(12, 6, 3=>X) +@benchmark reg |> control(12, 6, 3=>X) +@benchmark CUDA.@sync creg |> put(12, 3=>rot(X, 0.3)) +@benchmark reg |> put(12, 3=>rot(X, 0.3)) + +reg = rand_state(20) +creg = reg |> cu +g = swap(20, 7, 2) +g = put(20, (7, 2)=>matblock(rand_unitary(4))) +@benchmark reg |> g +@benchmark CUDA.@sync creg |> g diff --git a/ext/CuYao/benchmarks/gcompile.jl b/ext/CuYao/benchmarks/gcompile.jl new file mode 100644 index 00000000..62a90ae9 --- /dev/null +++ b/ext/CuYao/benchmarks/gcompile.jl @@ -0,0 +1,92 @@ +using Yao, Yao.Boost, Yao.Intrinsics, StaticArrays, Yao.Blocks +using CuYao, CUDA +using BenchmarkTools, Profile + +nbit = 12 +c = chain(put(nbit, 2=>X), put(nbit, 2=>rot(X, 0.2)), control(nbit, 3, 2=>rot(X,0.3))) +#c = chain(c..., c...,c...) +cc = c |> KernelCompiled +reg = rand_state(nbit) |> cu + +@benchmark $reg |> copy |> $c[1] seconds = 2 +@benchmark $reg |> copy |> $cc seconds = 2 + + +reg = rand_state(9, 1000) +creg = reg |> cu +@benchmark focus!(reg |> copy, 1) do r + measure!(r) + r +end + +@benchmark focus!(creg|>copy, 1) do r + measure!(r) + r +end + +@benchmark focus!(creg, 1) do r + measure_reset!(r) + r +end + +@profile for i=1:10 focus!(creg |> copy, 1) do r + measure!(r) + r +end +end + +a = randn(1<<10) +ca = a |> cu + +gpu_call(ca, (x->x^2, ca)) do state, f, ca + ilin = linear_index(state) + ca[ilin] = f(ca[ilin]) + return +end + +gpu_call(1:1<<10, (x->x^2, ca)) do state, f, ca + ilin = linear_index(state) + ca[ilin] = f(ca[ilin]) + return +end + + +function xx_kernel(bits::Ints) + ctrl = controller(bits[1], 0) + mask = bmask(bits...) 
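+    # the returned closure implements a fused multi-X: gating on `bits[1] == 0`
+    # visits each amplitude pair once; the partner index flips every bit in `mask`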
+    function kernel(state, inds)
+        i = inds[1]
+        b = i-1
+        ctrl(b) && swaprows!(piecewise(state, inds), i, flip(b, mask) + 1)
+    end
+end
+
+using CuYao: x_kernel, y_kernel, z_kernel
+fx = x_kernel((3,4))
+fy = y_kernel(3)
+fz = z_kernel((3,4))
+
+a = randn(1<<10)
+ca = a |> cu
+fs = (fx, fz)
+
+@benchmark gpu_call(ca, (fs, ca)) do state, f, ca
+    #ilin = linear_index(state)
+    ilin = @cartesianidx ca
+    for fi in fs
+        fi(ca, ilin)
+    end
+    return
+end
+
+@device_code_warntype gpu_call(ca, ((fx, fz), ca)) do state, fs, ca
+    ilin = @cartesianidx ca
+    #for fi in fs fi(ca, ilin) end
+    Base.Cartesian.@nexprs 2 i->fs[i](ca, ilin) # unrolled over the two kernels; `$`-interpolation is only valid inside quoted expressions
+    #fs(ca, ilin)
+    return
+end
+
+a ≈ Vector(ca)
+
+@edit controller(3,0)
diff --git a/ext/CuYao/benchmarks/paralleldot.jl b/ext/CuYao/benchmarks/paralleldot.jl
new file mode 100644
index 00000000..38e5dd6e
--- /dev/null
+++ b/ext/CuYao/benchmarks/paralleldot.jl
@@ -0,0 +1,13 @@
+# benchmark sketch, reworked from an unfinished draft: scale the amplitudes selected by
+# `mask` with a two-level diagonal (a, d); assumes CuYao's `piecewise` and GPUArrays' `gpu_call`.
+function paralleldot(state, a, d, mask)
+    @inline function kernel(ctx, state, a, d, mask)
+        inds = @cartesianidx state
+        i = inds[1]
+        piecewise(state, inds)[i] *= anyone(i-1, mask) ? d : a
+        return
+    end
+    gpu_call(kernel, state, a, d, mask; elements=length(state))
+    return state
+end
+
diff --git a/ext/CuYao/benchmarks/statexpect.jl b/ext/CuYao/benchmarks/statexpect.jl
new file mode 100644
index 00000000..9e3ca5dd
--- /dev/null
+++ b/ext/CuYao/benchmarks/statexpect.jl
@@ -0,0 +1,13 @@
+using CuYao
+using Yao
+using BenchmarkTools
+
+sf(x, y) = abs(x-y)
+a = randn(1024)
+ca = a |> cu
+b = randn(1024)
+cb = b |> cu
+@benchmark expect(StatFunctional{2}(sf), a, b) seconds=2
+@benchmark expect(StatFunctional{2}(sf), a) seconds=2
+@benchmark expect(StatFunctional{2}(sf), ca, cb) seconds=2
+@benchmark expect(StatFunctional{2}(sf), ca) seconds=2
diff --git a/ext/CuYao/src/CUDApatch.jl b/ext/CuYao/src/CUDApatch.jl
new file mode 100644
index 00000000..aeaf7d36
--- /dev/null
+++ b/ext/CuYao/src/CUDApatch.jl
@@ -0,0 +1,64 @@
+# TODO
+# support norm(view(reshape(A, m, n), :, 1))
+norm2(A::DenseCuArray; dims=1) = mapreduce(abs2, +, A, dims=dims) .|> CUDA.sqrt
+
+piecewise(state::AbstractVector, inds) = state
+piecewise(state::AbstractMatrix, inds) = @inbounds view(state,:,inds[2])
+
+function kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1, T2}
+    res = CUDA.zeros(promote_type(T1,T2), (size(A).*size(B))...)
+    @inline function kernel(ctx, res, A, B)
+        state = @linearidx res
+        @inbounds inds = CartesianIndices(res)[state].I
+        inds_A = (inds.-1) .÷ size(B) .+ 1
+        inds_B = (inds.-1) .% size(B) .+ 1
+        @inbounds res[state] = A[inds_A...]*B[inds_B...]
+        return
+    end
+
+    gpu_call(kernel, res, A, B)
+    res
+end
+
+"""
+    kron!(C::CuArray, A, B)
+
+Computes Kronecker products in-place on the GPU.
+The results are stored in `C`, overwriting its existing values.
+"""
+function Yao.YaoArrayRegister.kron!(C::CuArray{T3}, A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1, T2, T3}
+    @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2))) || throw(DimensionMismatch())
+    CI = Base.CartesianIndices(C)
+    @inline function kernel(ctx, C, A, B)
+        state = @linearidx C
+        @inbounds inds = CI[state].I
+        inds_A = (inds.-1) .÷ size(B) .+ 1
+        inds_B = (inds.-1) .% size(B) .+ 1
+        @inbounds C[state] = A[inds_A...]*B[inds_B...]
+        return
+    end
+
+    gpu_call(kernel, C, A, B)
+    C
+end
+
+function getindex(A::DenseCuVector{T}, B::DenseCuArray{<:Integer}) where T
+    res = CUDA.zeros(T, size(B)...)
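+    # gather kernel: thread `state` writes A[B[state]] into res[state]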
+ @inline function kernel(ctx, res, A, B) + state = @linearidx res + @inbounds res[state] = A[B[state]] + return + end + gpu_call(kernel, res, A, B) + res +end + +function getindex(A::AbstractVector, B::DenseCuArray{<:Integer}) + A[Array(B)] +end + +YaoBlocks.AD.as_scalar(x::DenseCuArray) = Array(x)[] + +# patch for ExponentialUtilities +YaoBlocks.compatible_multiplicative_operand(::CuArray, source::AbstractArray) = CuArray(source) +YaoBlocks.compatible_multiplicative_operand(::CuArray, source::CuArray) = source diff --git a/ext/CuYao/src/CuYao.jl b/ext/CuYao/src/CuYao.jl new file mode 100644 index 00000000..a04c2417 --- /dev/null +++ b/ext/CuYao/src/CuYao.jl @@ -0,0 +1,38 @@ +# module CuYao +using Yao.YaoArrayRegister.LuxurySparse, Yao.YaoArrayRegister.StaticArrays, LinearAlgebra, Base.Cartesian +using Yao.YaoArrayRegister.StatsBase, Yao.YaoArrayRegister.BitBasis +using Yao.Reexport +import Yao.YaoArrayRegister.TupleTools +using Yao.YaoArrayRegister.Random + +using Yao.YaoArrayRegister +using Yao +using CUDA +using CUDA.GPUArrays: gpu_call, @linearidx, @cartesianidx, linear_index +using Yao.YaoArrayRegister +using Yao.YaoBlocks +using Yao.ConstGate: SWAPGate +using Yao.ConstGate: S, T, Sdag, Tdag + +import Yao.YaoArrayRegister: insert_qudits!, join +import CUDA: cu +import Yao.YaoArrayRegister: _measure, measure, measure! +import Yao.YaoArrayRegister: batch_normalize! +import Yao.YaoBlocks: BlockedBasis, nblocks, subblock +import Yao: expect +import Yao.YaoArrayRegister: u1rows!, unrows!, autostatic, instruct!, swaprows!, mulrow! +import Yao.YaoArrayRegister.LinearAlgebra: norm +import Base: kron, getindex + +import Yao: cpu, cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state + +const Ints = NTuple{<:Any, Int} + +include("CUDApatch.jl") +include("register.jl") +include("instructs.jl") + +function __init__() + CUDA.allowscalar(false) +end +# end diff --git a/ext/CuYao/src/instructs.jl b/ext/CuYao/src/instructs.jl new file mode 100644 index 00000000..6ed8e35d --- /dev/null +++ b/ext/CuYao/src/instructs.jl @@ -0,0 +1,312 @@ +# get index +macro idx(shape, grididx=1, ctxsym=:ctx) + quote + x = $(esc(shape)) + i = $linear_index($(esc(ctxsym)), $(esc(grididx))) + i > $prod2(x) && return + @inbounds Base.CartesianIndices(x)[i].I + end +end +replace_first(x::NTuple{2}, v) = (v, x[2]) +replace_first(x::NTuple{1}, v) = (v,) +prod2(x::NTuple{2}) = x[1] * x[2] +prod2(x::NTuple{1}) = x[1] + +gpu_compatible(A::AbstractVecOrMat) = A |> staticize +gpu_compatible(A::StaticArray) = A + +###################### unapply! ############################ +function instruct!(::Val{2}, state::DenseCuVecOrMat, U0::AbstractMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} + @debug "The generic U(N) matrix of size ($(size(U0))), on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + nbit = log2dim1(state) + # reorder a unirary matrix. + U = gpu_compatible(all(TupleTools.diff(locs).>0) ? U0 : reorder(U0, collect(locs)|>sortperm)) + locked_bits = [clocs..., locs...] + locked_vals = [cvals..., zeros(Int, M)...] 
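+    # `configs` enumerates base indices with the control bits fixed to `cvals` and the
+    # gate bits fixed to 0; `locs_raw` holds the 2^M offsets of the amplitudes that one
+    # kernel thread updates as a block via `unrows!`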
+    locs_raw = gpu_compatible([i+1 for i in itercontrol(nbit, setdiff(1:nbit, locs), zeros(Int, nbit-M))])
+    configs = itercontrol(nbit, locked_bits, locked_vals)
+
+    len = length(configs)
+    @inline function kernel(ctx, state, locs_raw, U, configs, len)
+        CUDA.assume(len > 0)
+        sz = size(state)
+        CUDA.assume(length(sz) == 1 || length(sz) == 2)
+        inds = @idx replace_first(sz, len)
+        x = @inbounds configs[inds[1]]
+        @inbounds unrows!(piecewise(state, inds), x .+ locs_raw, U)
+        return
+    end
+
+    @inline function kernel_single_entry_diag(ctx, state, loc, val, configs, len)
+        CUDA.assume(len > 0)
+        sz = size(state)
+        CUDA.assume(length(sz) == 1 || length(sz) == 2)
+        inds = @idx replace_first(sz, len)
+        x = @inbounds configs[inds[1]]
+        @inbounds piecewise(state, inds)[x + loc] *= val
+        return
+    end
+
+    elements = len*size(state,2)
+    if U isa Diagonal && count(!isone, U.diag) == 1
+        @debug "The single entry diagonal matrix, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)."
+        k = findfirst(!isone, U.diag)
+        loc = locs_raw[k]
+        val = U.diag[k]
+        gpu_call(kernel_single_entry_diag, state, loc, val, configs, len; elements)
+    else
+        gpu_call(kernel, state, locs_raw, U, configs, len; elements)
+    end
+    state
+end
+instruct!(::Val{2}, state::DenseCuVecOrMat, U0::IMatrix, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = state
+instruct!(::Val{2}, state::DenseCuVecOrMat, U0::SDSparseMatrixCSC, locs::NTuple{M, Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {C, M} = instruct!(Val(2), state, U0 |> Matrix, locs, clocs, cvals)
+
+################## General U1 apply! ###################
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDSparseMatrixCSC, loc::Int)
+    # densify and route to the generic path; uncontrolled, hence the empty control tuples
+    instruct!(Val(2), state, Matrix(U1), (loc,), (), ())
+end
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::AbstractMatrix, loc::Int)
+    @debug "The generic U(2) matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    a, c, b, d = U1
+    nbit = log2dim1(state)
+    step = 1<<(loc-1)
+    configs = itercontrol(nbit, [loc], [0])
+
+    len = length(configs)
+    @inline function kernel(ctx, state, a, b, c, d, len)
+        inds = @idx replace_first(size(state), len)
+        i = @inbounds configs[inds[1]]+1
+        @inbounds u1rows!(piecewise(state, inds), i, i+step, a, b, c, d)
+        return
+    end
+    gpu_call(kernel, state, a, b, c, d, len; elements=len*size(state,2))
+    return state
+end
+
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDPermMatrix, loc::Int)
+    @debug "The single qubit permutation matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    nbit = log2dim1(state)
+    b, c = U1.vals
+    step = 1<<(loc-1)
+    configs = itercontrol(nbit, [loc], [0])
+
+    len = length(configs)
+    function kernel(ctx, state, b, c, step, len, configs)
+        inds = @idx replace_first(size(state), len)
+        x = @inbounds configs[inds[1]] + 1
+        @inbounds swaprows!(piecewise(state, inds), x, x+step, c, b)
+        return
+    end
+    gpu_call(kernel, state, b, c, step, len, configs; elements=len*size(state,2))
+    return state
+end
+
+function YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U1::SDDiagonal, loc::Int)
+    @debug "The single qubit diagonal matrix of size ($(size(U1))), on: GPU, locations: $(loc)."
+    a, d = U1.diag
+    nbit = log2dim1(state)
+    mask = bmask(loc)
+    @inline function kernel(ctx, state, a, d, mask)
+        inds = @cartesianidx state
+        i = inds[1]
+        piecewise(state, inds)[i] *= anyone(i-1, mask) ? 
d : a + return + end + gpu_call(kernel, state, a, d, mask; elements=length(state)) + return state +end + +YaoArrayRegister.single_qubit_instruct!(state::DenseCuVecOrMat, U::IMatrix, loc::Int) = state + +################## XYZ ############# + +_instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}) where {L} = _instruct!(state, Val(:X), locs, (), ()) +function _instruct!(state::DenseCuVecOrMat, ::Val{:X}, locs::NTuple{L,Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where {L,C} + @debug "The X gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + length(locs) == 0 && return state + nbit = log2dim1(state) + configs = itercontrol(nbit, [clocs..., locs[1]], [cvals..., 0]) + mask = bmask(locs...) + len = length(configs) + @inline function kernel(ctx, state, mask, len, configs) + inds = @idx replace_first(size(state), len) + b = @inbounds configs[inds[1]] + @inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1) + return + end + gpu_call(kernel, state, mask, len, configs; elements=len*size(state,2)) + return state +end + +function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::NTuple{C,Int}) where C + @debug "The Y gate, on: GPU, locations: $(locs)." + length(locs) == 0 && return state + nbit = log2dim1(state) + mask = bmask(Int, locs...) + configs = itercontrol(nbit, [locs[1]], [0]) + bit_parity = length(locs)%2 == 0 ? 1 : -1 + factor = (-im)^length(locs) + len = length(configs) + @inline function kernel(ctx, state, mask, bit_parity, configs, len) + inds = @idx replace_first(size(state), len) + b = @inbounds configs[inds[1]] + i_ = flip(b, mask) + 1 + factor1 = count_ones(b&mask)%2 == 1 ? -factor : factor + factor2 = factor1*bit_parity + @inbounds swaprows!(piecewise(state, inds), b+1, i_, factor2, factor1) + return + end + gpu_call(kernel, state, mask, bit_parity, configs, len; elements=len*size(state,2)) + return state +end + +function _instruct!(state::DenseCuVecOrMat, ::Val{:Y}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C + @debug "The Y gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + length(locs) == 0 && return state + nbit = log2dim1(state) + configs = itercontrol(nbit, [clocs..., locs...], [cvals..., 0]) + mask = bmask(locs...) + len = length(configs) + @inline function kernel(ctx, state, configs, mask, len) + inds = @idx replace_first(size(state), len) + b = @inbounds configs[inds[1]] + @inbounds swaprows!(piecewise(state, inds), b+1, flip(b, mask) + 1, im, -im) + return + end + gpu_call(kernel, state, configs, mask, len; elements=len*size(state,2)) + return state +end + +function _instruct!(state::DenseCuVecOrMat, ::Val{:Z}, locs::NTuple{C,Int}) where C + @debug "The Z gate, on: GPU, locations: $(locs)." + length(locs) == 0 && return state + nbit = log2dim1(state) + mask = bmask(locs...) + @inline function kernel(ctx, state, mask) + inds = @cartesianidx state + i = inds[1] + piecewise(state, inds)[i] *= count_ones((i-1)&mask)%2==1 ? -1 : 1 + return + end + gpu_call(kernel, state, mask; elements=length(state)) + return state +end + + +for (G, FACTOR) in zip([:Z, :S, :T, :Sdag, :Tdag], [:(-one(Int32)), :(1f0im), :($(exp(im*π/4))), :(-1f0im), :($(exp(-im*π/4)))]) + if G !== :Z + @eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C + @debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs)." + length(locs) == 0 && return state + nbit = log2dim1(state) + mask = bmask(Int32, locs...) 
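+            # diagonal phase gate: amplitude i picks up one FACTOR per set gate bit of i-1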
+ @inline function kernel(ctx, state, mask) + inds = @cartesianidx state + i = inds[1] + piecewise(state, inds)[i] *= $FACTOR ^ count_ones(Int32(i-1)&mask) + return + end + gpu_call(kernel, state, mask; elements=length(state)) + return state + end + end + @eval function _instruct!(state::DenseCuVecOrMat, ::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C + @debug "The $($(QuoteNode(G))) gate, on: GPU, locations: $(locs), controlled by: $(clocs) = $(cvals)." + ctrl = controller((clocs..., locs...), (cvals..., 1)) + @inline function kernel(ctx, state, ctrl) + inds = @cartesianidx state + i = inds[1] + piecewise(state, inds)[i] *= ctrl(i-1) ? $FACTOR : one($FACTOR) + return + end + gpu_call(kernel, state, ctrl; elements=length(state)) + return state + end +end + +for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag] + @eval begin + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::NTuple{C,Int}) where C + _instruct!(state, g, locs) + end + + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int}) + _instruct!(state, g, locs) + end + + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, g::Val{$(QuoteNode(G))}, locs::Tuple{Int}, clocs::NTuple{C, Int}, cvals::NTuple{C, Int}) where C + _instruct!(state, g, locs, clocs, cvals) + end + + function YaoArrayRegister.instruct!(::Val{2}, state::DenseCuVecOrMat, vg::Val{$(QuoteNode(G))}, locs::Tuple{Int}, cloc::Tuple{Int}, cval::Tuple{Int}) + _instruct!(state, vg, locs, cloc, cval) + end + end + +end + +function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:SWAP}, locs::Tuple{Int,Int}) + @debug "The SWAP gate, on: GPU, locations: $(locs)." + b1, b2 = locs + mask1 = bmask(b1) + mask2 = bmask(b2) + + configs = itercontrol(log2dim1(state), [locs...], [1,0]) + function kf(ctx, state, mask1, mask2) + inds = @idx replace_first(size(state), length(configs)) + + b = configs[inds[1]] + i = b+1 + i_ = b ⊻ (mask1|mask2) + 1 + swaprows!(piecewise(state, inds), i, i_) + nothing + end + gpu_call(kf, state, mask1, mask2; elements=length(configs)*size(state,2)) + state +end + +############## other gates ################ +# parametrized swap gate + +function instruct!(::Val{2}, state::DenseCuVecOrMat, ::Val{:PSWAP}, locs::Tuple{Int, Int}, θ::Real) + @debug "The PSWAP gate, on: GPU, locations: $(locs)." 
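+    # rot(SWAP, θ): basis states with equal bits on `locs` only gain the global phase
+    # e = exp(-iθ/2); the {|01⟩, |10⟩} pair is rotated with the Rx(θ) matrix entries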
+ nbit = log2dim1(state) + mask1 = bmask(locs[1]) + mask2 = bmask(locs[2]) + mask12 = mask1|mask2 + a, c, b_, d = mat(Rx(θ)) + e = exp(-im/2*θ) + configs = itercontrol(nbit, [locs...], [0,0]) + len = length(configs) + @inline function kernel(ctx, state, mask2, mask12, configs, a, b_, c, d) + inds = @idx replace_first(size(state), len) + @inbounds x = configs[inds[1]] + piecewise(state, inds)[x+1] *= e + piecewise(state, inds)[x⊻mask12+1] *= e + y = x ⊻ mask2 + @inbounds u1rows!(piecewise(state, inds), y+1, y⊻mask12+1, a, b_, c, d) + return + end + gpu_call(kernel, state, mask2, mask12, configs, a, b_, c, d; elements=len*size(state,2)) + state +end + +function YaoBlocks._apply_fallback!(r::AbstractCuArrayReg{B,T}, b::AbstractBlock) where {B,T} + YaoBlocks._check_size(r, b) + r.state .= CUDA.adapt(CuArray{T}, mat(T, b)) * r.state + return r +end + +for RG in [:Rx, :Ry, :Rz] + @eval function instruct!( + ::Val{2}, + state::DenseCuVecOrMat{T}, + ::Val{$(QuoteNode(RG))}, + locs::Tuple{Int}, + theta::Number + ) where {T} + YaoArrayRegister.instruct!(Val(2), state, Val($(QuoteNode(RG))), locs, (), (), theta) + return state + end +end diff --git a/ext/CuYao/src/register.jl b/ext/CuYao/src/register.jl new file mode 100644 index 00000000..01c83cbc --- /dev/null +++ b/ext/CuYao/src/register.jl @@ -0,0 +1,270 @@ +cu(reg::ArrayReg{D}) where D = ArrayReg{D}(CuArray(reg.state)) +cpu(reg::ArrayReg{D}) where D = ArrayReg{D}(Array(reg.state)) +cu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(CuArray(reg.state), reg.nbatch) +cpu(reg::BatchedArrayReg{D}) where D = BatchedArrayReg{D}(Array(reg.state), reg.nbatch) +cu(reg::DensityMatrix{D}) where D = DensityMatrix{D}(CuArray(reg.state)) +cpu(reg::DensityMatrix{D}) where D = DensityMatrix{D}(Array(reg.state)) +const AbstractCuArrayReg{D, T, MT} = AbstractArrayReg{D, T, MT} where MT<:DenseCuArray +const CuArrayReg{D, T, MT} = ArrayReg{D, T, MT} where MT<:DenseCuArray +const CuBatchedArrayReg{D, T, MT} = BatchedArrayReg{D, T, MT} where MT<:DenseCuArray +const CuDensityMatrix{D, T, MT} = DensityMatrix{D, T, MT} where MT<:DenseCuMatrix + +function batch_normalize!(s::DenseCuArray, p::Real=2) + p!=2 && throw(ArgumentError("p must be 2!")) + s./=norm2(s, dims=1) + return s +end + +@inline function tri2ij(l::Int) + i = ceil(Int, sqrt(2*l+0.25)-0.5) + j = l-i*(i-1)÷2 + return i+1,j +end + +############### MEASURE ################## +function measure(::ComputationalBasis, reg::ArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T} + _measure(rng, basis(reg), reg |> probs |> Vector, nshots) +end + +# TODO: optimize the batch dimension using parallel sampling +function measure(::ComputationalBasis, reg::BatchedArrayReg{D, T, MT} where MT<:DenseCuArray, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG, nshots::Int=1) where {D,T} + regm = reg |> rank3 + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + return _measure(rng, basis(reg), pl |> Matrix, nshots) +end + +function measure!(::RemoveMeasured, ::ComputationalBasis, reg::AbstractCuArrayReg{D}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where D + regm = reg |> rank3 + B = size(regm, 3) + nregm = similar(regm, D ^ nremain(reg), B) + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + pl_cpu = pl |> Matrix + res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B) + res = CuArray(res_cpu) + CI = Base.CartesianIndices(nregm) + @inline function kernel(ctx, nregm, regm, res, pl) + state = @linearidx nregm + @inbounds i,j = 
CI[state].I + @inbounds r = Int(res[j])+1 + @inbounds nregm[i,j] = regm[r,i,j]/CUDA.sqrt(pl[r, j]) + return + end + gpu_call(kernel, nregm, regm, res, pl) + reg.state = reshape(nregm,1,:) + return reg isa ArrayReg ? Array(res)[] : res +end + +function measure!(::NoPostProcess, ::ComputationalBasis, reg::AbstractCuArrayReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T} + regm = reg |> rank3 + B = size(regm, 3) + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + pl_cpu = pl |> Matrix + res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B) + res = CuArray(res_cpu) + CI = Base.CartesianIndices(regm) + + @inline function kernel(ctx, regm, res, pl) + state = @linearidx regm + @inbounds k,i,j = CI[state].I + @inbounds rind = Int(res[j]) + 1 + @inbounds regm[k,i,j] = k==rind ? regm[k,i,j]/CUDA.sqrt(pl[k, j]) : T(0) + return + end + gpu_call(kernel, regm, res, pl) + return reg isa ArrayReg ? Array(res)[] : res +end + +function YaoArrayRegister.measure!( + ::NoPostProcess, + bb::BlockedBasis, + reg::AbstractCuArrayReg{D,T}, + ::AllLocs; + rng::AbstractRNG = Random.GLOBAL_RNG, +) where {D,T} + state = @inbounds (reg|>rank3)[bb.perm, :, :] # permute to make eigen values sorted + B = size(state, 3) + pl = dropdims(mapreduce(abs2, +, state, dims=2), dims=2) + pl_cpu = pl |> Matrix + pl_block = zeros(eltype(pl), nblocks(bb), B) + @inbounds for ib = 1:B + for i = 1:nblocks(bb) + for k in subblock(bb, i) + pl_block[i, ib] += pl_cpu[k, ib] + end + end + end + # perform measurements on CPU + res_cpu = Vector{Int}(undef, B) + @inbounds @views for ib = 1:B + ires = sample(rng, 1:nblocks(bb), Weights(pl_block[:, ib])) + # notice ires is `BitStr` type, can be use as indices directly. + range = subblock(bb, ires) + state[range, :, ib] ./= sqrt(pl_block[ires, ib]) + state[1:range.start-1, :, ib] .= zero(T) + state[range.stop+1:size(state, 1), :, ib] .= zero(T) + res_cpu[ib] = ires + end + # undo permute and assign back + _state = reshape(state, 1 << nactive(reg), :) + rstate = reshape(reg.state, 1 << nactive(reg), :) + @inbounds rstate[bb.perm, :] .= _state + return reg isa ArrayReg ? bb.values[res_cpu[]] : CuArray(bb.values[res_cpu]) +end + +function measure!(rst::ResetTo, ::ComputationalBasis, reg::AbstractCuArrayReg{D, T}, ::AllLocs; rng::AbstractRNG=Random.GLOBAL_RNG) where {D, T} + regm = reg |> rank3 + B = size(regm, 3) + pl = dropdims(mapreduce(abs2, +, regm, dims=2), dims=2) + pl_cpu = pl |> Matrix + res_cpu = map(ib->_measure(rng, basis(reg), view(pl_cpu, :, ib), 1)[], 1:B) + res = CuArray(res_cpu) + CI = Base.CartesianIndices(regm) + + @inline function kernel(ctx, regm, res, pl, val) + state = @linearidx regm + @inbounds k,i,j = CI[state].I + @inbounds rind = Int(res[j]) + 1 + @inbounds k==val+1 && (regm[k,i,j] = regm[rind,i,j]/CUDA.sqrt(pl[rind, j])) + CUDA.sync_threads() + @inbounds k!=val+1 && (regm[k,i,j] = 0) + return + end + + gpu_call(kernel, regm, res, pl, rst.x) + return reg isa ArrayReg ? 
Array(res)[] : res +end + +function YaoArrayRegister.batched_kron(A::DenseCuArray{T1}, B::DenseCuArray{T2}) where {T1 ,T2} + res = CUDA.zeros(promote_type(T1,T2), size(A,1)*size(B, 1), size(A,2)*size(B,2), size(A, 3)) + CI = Base.CartesianIndices(res) + @inline function kernel(ctx, res, A, B) + state = @linearidx res + @inbounds i,j,b = CI[state].I + i_A, i_B = divrem((i-1), size(B,1)) + j_A, j_B = divrem((j-1), size(B,2)) + @inbounds res[state] = A[i_A+1, j_A+1, b]*B[i_B+1, j_B+1, b] + return + end + + gpu_call(kernel, res, A, B) + return res +end + +""" + YaoArrayRegister.batched_kron!(C::CuArray, A, B) + +Performs batched Kronecker products in-place on the GPU. +The results are stored in 'C', overwriting the existing values of 'C'. +""" +function YaoArrayRegister.batched_kron!(C::CuArray{T3, 3}, A::DenseCuArray, B::DenseCuArray) where {T3} + @boundscheck (size(C) == (size(A,1)*size(B,1), size(A,2)*size(B,2), size(A,3))) || throw(DimensionMismatch()) + @boundscheck (size(A,3) == size(B,3) == size(C,3)) || throw(DimensionMismatch()) + CI = Base.CartesianIndices(C) + @inline function kernel(ctx, C, A, B) + state = @linearidx C + @inbounds i,j,b = CI[state].I + i_A, i_B = divrem((i-1), size(B,1)) + j_A, j_B = divrem((j-1), size(B,2)) + @inbounds C[state] = A[i_A+1, j_A+1, b]*B[i_B+1, j_B+1, b] + return + end + + gpu_call(kernel, C, A, B) + return C +end + +function join(reg1::AbstractCuArrayReg{D}, reg2::AbstractCuArrayReg{D}) where {D} + @assert nbatch(reg1) == nbatch(reg2) + s1 = reg1 |> rank3 + s2 = reg2 |> rank3 + state = YaoArrayRegister.batched_kron(s1, s2) + return arrayreg(copy(reshape(state, size(state, 1), :)); nlevel=D, nbatch=nbatch(reg1)) +end + +function Yao.insert_qudits!(reg::AbstractCuArrayReg{D}, loc::Int; nqudits::Int=1) where D + na = nactive(reg) + focus!(reg, 1:loc-1) + reg2 = join(zero_state(nqudits; nbatch=nbatch(reg)) |> cu, reg) |> relax! |> focus!((1:na+nqudits)...) + reg.state = reg2.state + return reg +end + +cuproduct_state(bit_str::BitStr; nbatch::Union{NoBatch,Int} = NoBatch()) = + cuproduct_state(ComplexF64, bit_str; nbatch = nbatch) +cuproduct_state(bit_str::AbstractVector; nbatch::Union{NoBatch,Int} = NoBatch()) = + cuproduct_state(ComplexF64, bit_str; nbatch = nbatch) +cuproduct_state(total::Int, bit_config::Integer; kwargs...) = + cuproduct_state(ComplexF64, total, bit_config; kwargs...) +cuproduct_state(::Type{T}, bit_str::BitStr{N}; kwargs...) where {T,N} = + cuproduct_state(T, N, buffer(bit_str); kwargs...) +cuproduct_state(::Type{T}, bit_configs::AbstractVector; kwargs...) where {T} = + cuproduct_state(T, bit_literal(bit_configs...); kwargs...) +function cuproduct_state( + ::Type{T}, + total::Int, + bit_config::Integer; + nbatch::Union{Int,NoBatch} = NoBatch(), + nlevel::Int=2, +) where {T} + raw = CUDA.zeros(T, nlevel ^ total, YaoArrayRegister._asint(nbatch)) + raw[Int(bit_config)+1,:] .= Ref(one(T)) + return arrayreg(raw; nbatch=nbatch, nlevel=nlevel) +end + +cuzero_state(n::Int; kwargs...) = cuzero_state(ComplexF64, n; kwargs...) +cuzero_state(::Type{T}, n::Int; kwargs...) where {T} = cuproduct_state(T, n, 0; kwargs...) + +curand_state(n::Int; kwargs...) = curand_state(ComplexF64, n; kwargs...) + +function curand_state( + ::Type{T}, + n::Int; + nbatch::Union{Int,NoBatch} = NoBatch(), + nlevel = 2, +) where {T} + raw = CUDA.randn(T, nlevel ^ n, YaoArrayRegister._asint(nbatch)) + return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel)) +end + +cuuniform_state(n::Int; kwargs...) = cuuniform_state(ComplexF64, n; kwargs...) 
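+# uniform superposition: upload a vector of ones, then normalize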
+function cuuniform_state(::Type{T}, n::Int; + nbatch::Union{Int,NoBatch} = NoBatch(), + nlevel::Int = 2, +) where {T} + raw = CUDA.ones(T, nlevel ^ n, YaoArrayRegister._asint(nbatch)) + return normalize!(arrayreg(raw; nbatch=nbatch, nlevel=nlevel)) +end + +cughz_state(n::Int; kwargs...) = cughz_state(ComplexF64, n; kwargs...) +function cughz_state(::Type{T}, n::Int; kwargs...) where {T} + reg = cuzero_state(T, n; kwargs...) + reg.state[1:1,:] .= Ref(sqrt(T(0.5))) + reg.state[end:end,:] .= Ref(sqrt(T(0.5))) + return reg +end + +#= +for FUNC in [:measure!, :measure!] + @eval function $FUNC(rng::AbstractRNG, op::AbstractBlock, reg::AbstractCuArrayReg, al::AllLocs; kwargs...) where B + E, V = eigen!(mat(op) |> Matrix) + ei = Eigen(E|>cu, V|>cu) + $FUNC(rng::AbstractRNG, ei, reg, al; kwargs...) + end +end +=# + +function YaoBlocks.expect(op::AbstractAdd, dm::CuDensityMatrix) + sum(x->expect(x, dm), subblocks(op)) +end +function YaoBlocks.expect(op::AbstractBlock, dm::CuDensityMatrix{D}) where D + return tr(apply(ArrayReg{D}(dm.state), op).state) +end + +measure( + ::ComputationalBasis, + reg::CuDensityMatrix, + ::AllLocs; + nshots::Int = 1, + rng::AbstractRNG = Random.GLOBAL_RNG, +) = YaoArrayRegister._measure(rng, basis(reg), Array(reg |> probs), nshots) + diff --git a/ext/CuYao/test/CUDApatch.jl b/ext/CuYao/test/CUDApatch.jl new file mode 100644 index 00000000..136d7aeb --- /dev/null +++ b/ext/CuYao/test/CUDApatch.jl @@ -0,0 +1,35 @@ +using Yao +using CUDA +using Test +using Yao.YaoBlocks + +@testset "isapprox-complex" begin + ca = CuArray(randn(ComplexF64,3,3)) + cb = copy(ca) + #@test ca ≈ cb still error! + cb[1:1, 1:1] .+= 1e-7im + @test isapprox(ca, cb, atol=1e-5) + @test !isapprox(ca, cb, atol=1e-9) +end + +@testset "view general" begin + a = randn(5,6,8) |> CuArray + @test view(a, 2:4, 4, [1,4,3]) |> size == (3, 3) +end + +@testset "permutedims vector" begin + ca = randn(ComplexF64,3,4,5,1) + @test permutedims(CuArray(ca), [2,1,4,3]) |> Array ≈ permutedims(ca, [2,1,4,3]) +end + +@testset "Complex pow" begin + for T in [ComplexF64, ComplexF32] + a = CuArray(randn(T, 4, 4)) + @test Array(a .^ Int32(3)) ≈ Array(a).^3 + @test Array(a .^ real(T)(3)) ≈ Array(a).^3 + end +end + +@testset "as_scalar" begin + @test YaoBlocks.AD.as_scalar([2.0] |> CuArray) == 2.0 +end diff --git a/ext/CuYao/test/Project.toml b/ext/CuYao/test/Project.toml new file mode 100644 index 00000000..2e5f158c --- /dev/null +++ b/ext/CuYao/test/Project.toml @@ -0,0 +1,23 @@ +[deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" +Yao = "5872b779-8223-5990-8dd0-5abbb0748c8c" + +[compat] +CUDA = "4, 5" +Reexport = "0.2, 1" +TupleTools = "1" +Yao = "0.8" +YaoBlocks = "0.13.10" +julia = "1" + +[extras] +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +YaoBlocks = "418bc28f-b43b-5e0b-a6e7-61bbc1a2c1df" + +[targets] +test = ["Test", "Statistics", "YaoBlocks"] diff --git a/ext/CuYao/test/extra.jl b/ext/CuYao/test/extra.jl new file mode 100644 index 00000000..95e759ce --- /dev/null +++ b/ext/CuYao/test/extra.jl @@ -0,0 +1,52 @@ +using Yao, Test, CUDA +CUDA.allowscalar(false) + +@testset "gradient" begin + reg1 = rand_state(10) + reg2 = rand_state(10) + c = EasyBuild.variational_circuit(10) + g1, g2 = fidelity'(reg1=>c, reg2) + cg1, cg2 = fidelity'(cu(reg1)=>c, cu(reg2)) + 
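+    # GPU gradients must match the CPU reference: `.first` is the register gradient, `.second` the parameter gradient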
@test g1.first ≈ cpu(cg1.first) + @test g1.second ≈ cg1.second + @test g2 ≈ cpu(cg2) + + h = EasyBuild.heisenberg(10) + g1 = expect'(h, reg1=>c) + cg1 = expect'(h, cu(reg1)=>c) + @test g1.first ≈ cpu(cg1.first) + @test g1.second ≈ cg1.second +end + +@testset "apply density matrix" begin + reg = rand_state(6) + creg = cu(reg) + rho = density_matrix(reg, (3,4)) + crho = density_matrix(creg, (3,4)) + @test rho ≈ cpu(crho) + g = put(2, 1=>Rx(0.3)) + @test cpu(apply(crho, g)) ≈ apply(rho, g) + rho = density_matrix(reg) + crho = density_matrix(creg) + @test rho ≈ cpu(crho) + @test probs(crho) ≈ probs(creg) + g = put(6, 1=>Rx(0.3)) + @test cpu(apply(crho, g)) ≈ apply(rho, g) + + # channel + c = UnitaryChannel([put(6, 1=>Rx(0.3)), put(6, 2=>Z)], [0.4, 0.6]) + @test cpu(apply(crho, c)) ≈ apply(rho, c) +end + +@testset "expect on density matrix" begin + reg = rand_state(6) + rho = density_matrix(reg, (3,4,5)) + crho = cu(rho) + h = EasyBuild.heisenberg(3) + a = expect(h, rho) + b = expect(h, crho) + @test a ≈ b + # fidelity + @test fidelity(crho, crho) ≈ 1 + @test measure(crho; nshots=2) isa Vector +end diff --git a/ext/CuYao/test/instructs.jl b/ext/CuYao/test/instructs.jl new file mode 100644 index 00000000..48ae4c69 --- /dev/null +++ b/ext/CuYao/test/instructs.jl @@ -0,0 +1,135 @@ +using LinearAlgebra, Yao.ConstGate +using Test, Random +using Yao +using Yao.YaoArrayRegister.StaticArrays +using Yao.ConstGate: SWAPGate +using CUDA + +@testset "gpu instruct nbit!" begin + Random.seed!(3) + nbit = 6 + N = 1<Z)), + mat(ComplexF32, put(2,2=>I2)), + mat(ComplexF32, put(2,2=>P0)) + ] + + @test instruct!(Val(2),v1 |> CuArray, UN, (3,1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, UN, (3,1)) + @test instruct!(Val(2),vn |> CuArray, UN, (3,1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, UN, (3,1)) + end + # sparse matrix like P0, P1 et. al. are not implemented. + for g in [mat(ComplexF32, ConstGate.T), mat(ComplexF32, ConstGate.H), mat(ComplexF32, rot(Z, 0.5))] + @test instruct!(Val(2), v1 |> CuArray, g, (3,), (4,), (1,)) |> Vector ≈ instruct!(Val(2), v1 |> copy, g, (3,), (4,), (1,)) + @test instruct!(Val(2), vn |> CuArray, g, (3,), (4,), (1,)) |> Matrix ≈ instruct!(Val(2), vn |> copy, g, (3,), (4,), (1,)) + @test instruct!(Val(2), v1 |> CuArray, g, (3,), (4,), (1,)) |> Vector ≈ instruct!(Val(2), v1 |> copy, g, (3,), (4,), (1,)) + @test instruct!(Val(2), vn |> CuArray, g, (3,), (4,), (1,)) |> Matrix ≈ instruct!(Val(2), vn |> copy, g, (3,), (4,), (1,)) + end +end + +@testset "gpu swapapply!" begin + nbit = 6 + N = 1< CuArray, Val(:SWAP), (3,5)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(:SWAP), (3,5)) + @test instruct!(Val(2),vn |> CuArray, Val(:SWAP), (3,5)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(:SWAP), (3,5)) +end + +@testset "gpu instruct! 1bit" begin + nbit = 6 + N = 1< CuArray, U1, (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, U1, (3,)) + @test instruct!(Val(2),vn |> CuArray, U1, (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, U1, (3,)) + end + # sparse matrix like P0, P1 et. al. are not implemented. +end + +@testset "gpu xyz-instruct!" 
+    nbit = 6
+    N = 1<<nbit
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 3)
+    for G in [:X, :Y, :Z, :H, :S, :T, :Sdag, :Tdag]
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,))
+        if G != :H
+            @test instruct!(Val(2),v1 |> CuArray, Val(G), (1,3,4)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (1,3,4))
+            @test instruct!(Val(2),vn |> CuArray, Val(G),(1,3,4)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (1,3,4))
+        end
+    end
+end
+
+@testset "gpu cn-xyz-instruct!" begin
+    nbit = 6
+    N = 1<<nbit
+    v1 = randn(ComplexF32, N)
+    vn = randn(ComplexF32, N, 3)
+    for G in [:X, :Y, :Z, :S, :T, :Sdag, :Tdag]
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (4,5), (0, 1)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G), (3,), (4,5), (0, 1))
+        @test instruct!(Val(2),v1 |> CuArray, Val(G), (3,), (1,), (1,)) |> Vector ≈ instruct!(Val(2),v1 |> copy, Val(G),(3,), (1,), (1,))
+        @test instruct!(Val(2),vn |> CuArray, Val(G), (3,), (1,), (1,)) |> Matrix ≈ instruct!(Val(2),vn |> copy, Val(G),(3,), (1,), (1,))
+    end
+end
+
+@testset "pswap" begin
+    ps = put(6, (2,4)=>rot(SWAP, π/2))
+    reg = rand_state(6; nbatch=10)
+    @test apply!(reg |> cu, ps) |> cpu ≈ apply!(copy(reg), ps)
+    @test apply!(reg |> cu, ps).state isa CuArray
+end
+
+@testset "regression test: Rx, Ry, Rz, CPHASE" begin
+    Random.seed!(3)
+    nbit = 6
+    for ps in [put(6, (2,)=>Rx(π/2)), put(6, 2=>Ry(0.5)), put(6, 2=>Rz(0.4))]
+        reg = rand_state(6; nbatch=10)
+        @test apply!(reg |> cu, ps) |> cpu ≈ apply!(copy(reg), ps)
+        @test apply!(reg |> cu, ps).state isa CuArray
+    end
+end
+
+@testset "density matrix" begin
+    nbit = 6
+    reg = rand_state(nbit)
+    rho = density_matrix(reg)
+    c = put(6, (2,)=>Rx(π/3))
+    @test apply(rho |> cu, c) |> cpu ≈ apply(rho, c)
+end
+
+@testset "time evolve" begin
+    g = time_evolve(kron(10, 2=>X, 3=>X), 0.5)
+    reg = rand_state(10)
+    @test apply!(copy(reg), g) ≈ apply!(reg |> cu, g) |> cpu
+end
+
+# fix: https://github.com/QuantumBFS/CuYao.jl/issues/81
+@testset "generic sparse (pqc circuit)" begin
+    # kron of Rx
+    pqc_circuit = subroutine(10, kron(Rx(0.4), Rx(0.5), Rx(0.6), Rx(0.8)), (1, 2, 6, 5))
+    proxy_reg = zero_state(10)
+    @test apply!(proxy_reg |> cu, pqc_circuit) |> cpu ≈ apply(proxy_reg, pqc_circuit)
+
+    # kron of Rz
+    pqc_circuit = subroutine(10, kron(Rz(0.4), Rz(0.5), Rz(0.6), Rz(0.8)), (1, 2, 6, 5))
+    proxy_reg = zero_state(10)
+    @test apply!(proxy_reg |> cu, pqc_circuit) |> cpu ≈ apply(proxy_reg, pqc_circuit)
+end
diff --git a/ext/CuYao/test/register.jl b/ext/CuYao/test/register.jl
new file mode 100644
index 00000000..8c6dc1d6
--- /dev/null
+++ b/ext/CuYao/test/register.jl
@@ -0,0 +1,139 @@
+using Test
+using Yao
+const CuYao = Base.get_extension(Yao, :CuYao)
+using LinearAlgebra
+using Yao.YaoArrayRegister.BitBasis
+using Statistics: mean
+using Yao.YaoArrayRegister.StaticArrays
+using CUDA
+using Yao.YaoArrayRegister: batched_kron, batched_kron!, batch_normalize!
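+# disable scalar indexing so accidental elementwise CPU fallbacks raise an error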
+CUDA.allowscalar(false)
+
+@testset "basics" begin
+    a = randn(ComplexF64, 50, 20)
+    ca = a|>cu
+    batch_normalize!(ca)
+    batch_normalize!(a)
+    @test ca |> Matrix ≈ a
+
+    for l = 1:100
+        i, j = CuYao.tri2ij(l)
+        @test (i-1)*(i-2)÷2+j == l
+    end
+end
+
+@testset "constructor and measure" begin
+    reg = rand_state(10)
+    greg = reg |> cu
+    @test greg isa CuYao.AbstractCuArrayReg
+    @test eltype(greg.state) == ComplexF64
+    myvec(x) = Vector(x)
+    myvec(x::Number) = [x]
+    for reg in [rand_state(10, nbatch=333), rand_state(10)]
+        greg = reg |> cu
+        @test size(measure(greg |> copy, nshots=10)) == size(measure(reg, nshots=10))
+        @test size(measure!(greg |> copy)) == size(measure!(reg |> copy))
+        @test size(measure!(ResetTo(0), greg |> copy)) == size(measure!(ResetTo(0), reg |> copy))
+        @test size(measure!(RemoveMeasured(), greg |> copy)) == size(measure!(RemoveMeasured(), reg |> copy))
+        @test select(greg |> copy, 12) |> cpu ≈ select(reg, 12)
+        @test size(measure!(greg |> copy |> focus!(3,4,1))) == size(measure!(reg |> copy |> focus!(3,4,1)))
+        @test greg |> copy |> focus!(3,4,1) |> relax!(3,4,1) |> cpu ≈ reg
+
+        if nbatch(greg) == 1
+            greg1 = greg |> copy |> focus!(1,4,3)
+            greg0 = copy(greg1)
+            res = measure!(RemoveMeasured(), greg1)
+            @test select(greg0, res |> myvec) |> normalize! |> cpu ≈ greg1 |> cpu
+        end
+
+        greg1 = greg |> copy |> focus!(1,4,3)
+        greg0 = copy(greg1)
+        res = measure!(ResetTo(3), greg1)
+        @test all(measure(greg1, nshots=10) .== 3)
+        @test greg1 |> isnormalized
+        @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), 3))
+
+        greg1 = greg |> copy |> focus!(1,4,3)
+        greg0 = copy(greg1)
+        res = measure!(greg1)
+        @test all(select.(BatchedArrayReg(greg0 |> cpu), res |> myvec) .|> normalize! .≈ select.(BatchedArrayReg(greg1 |> cpu), res|>myvec))
+    end
+
+    @test join(rand_state(3) |> cu, rand_state(3) |> cu) |> nactive == 6
+    @test join(rand_state(3, nbatch=10) |> cu, rand_state(3, nbatch=10) |> cu) |> nactive == 6
+end
+
+@testset "insert_qubits!" begin
+    reg = rand_state(5; nbatch=10)
+    res = insert_qudits!(reg |> cu, 3, 2) |> cpu
+    @test insert_qudits!(reg, 3, 2) ≈ res
+    @test append_qudits!(copy(reg) |> cu, 3) |> cpu ≈ append_qudits!(copy(reg), 3)
+
+    reg = rand_state(5, nbatch=10) |> focus!(2,3)
+    res = insert_qudits!(reg |> cu, 3, 2) |> cpu
+    @test insert_qudits!(reg, 3, 2) ≈ res
+
+    @test append_qudits!(copy(reg) |> cu, 3) |> cpu ≈ append_qudits!(copy(reg), 3)
+end
+
+@testset "cuda-op-measures" begin
+    reg = rand_state(8; nbatch=32) |> cu
+    op = repeat(5, X, 1:5)
+
+    # measure!
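+    # projective measurement: repeating measure! on the collapsed register must reproduce the result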
+ reg2 = reg |> copy + res = measure!(op, reg2, 2:6) + res2 = measure!(op, reg2, 2:6) + @test size(res) == (32,) + @test res2 == res + + reg = clone(ArrayReg([1,-1+0im]/sqrt(2.0)), 10) |> cu + @test measure!(X, reg) |> mean ≈ -1 + reg = clone(ArrayReg([1.0,0+0im]), 1000) + @test abs(measure!(X, reg) |> mean) < 0.1 +end + +@testset "cuda kron getindex" begin + a = randn(3,4) + b = randn(4,2) + c = zeros(12,8) + ca, cb, cc = cu(a), cu(b), cu(c) + @test kron(ca, cb) |> Array ≈ kron(a, b) + @test Yao.YaoArrayRegister.kron!(cc, ca, cb) |> Array ≈ kron(a,b) + + Yao.YaoArrayRegister.kron!(c,a,b) + @test cc |> Array ≈ c + + v = randn(100) |> cu + inds = [3,5,2,1,7,1] + @test v[inds] ≈ v[inds |> CuVector] +end + +@testset "cuda batched_kron" begin + a = randn(3,4,5) + b = randn(4,2,5) + c = zeros(12,8,5) + ca, cb, cc = cu(a), cu(b), cu(c) + + @test batched_kron(ca, cb) |> Array ≈ batched_kron(a, b) + @test batched_kron!(cc, ca, cb) |> Array ≈ batched_kron(a, b) + + batched_kron!(c, a, b) + @test cc |> Array ≈ c +end + +@testset "zero_state, et al" begin + for b = [4, NoBatch()] + reg = cuzero_state(3; nbatch=b) + @test cpu(reg) ≈ zero_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + reg = curand_state(3; nbatch=b) + @test reg isa CuYao.AbstractCuArrayReg + reg = cuuniform_state(3; nbatch=b) + @test cpu(reg) ≈ uniform_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + reg = cuproduct_state(bit"110"; nbatch=b) + @test cpu(reg) ≈ product_state(bit"110"; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + reg = cughz_state(3; nbatch=b) + @test cpu(reg) ≈ ghz_state(3; nbatch=b) && reg isa CuYao.AbstractCuArrayReg + end +end + diff --git a/ext/CuYao/test/runtests.jl b/ext/CuYao/test/runtests.jl new file mode 100644 index 00000000..420a735c --- /dev/null +++ b/ext/CuYao/test/runtests.jl @@ -0,0 +1,18 @@ +using CUDA, Yao, Test +CUDA.allowscalar(false) + +@testset "CUDA patch" begin + include("CUDApatch.jl") +end + +@testset "GPU reg" begin + include("register.jl") +end + +@testset "gpu applies" begin + include("instructs.jl") +end + +@testset "extra" begin + include("extra.jl") +end diff --git a/lib/YaoPlots/src/vizcircuit.jl b/lib/YaoPlots/src/vizcircuit.jl index 9e5478f9..d4d28cb9 100644 --- a/lib/YaoPlots/src/vizcircuit.jl +++ b/lib/YaoPlots/src/vizcircuit.jl @@ -367,13 +367,13 @@ for B in [:LabelBlock, :GeneralMatrixBlock, :Add] _draw!(c, [controls..., (address, c.gatestyles.g, string(cb))]) end end -for GT in [:KronBlock, :RepeatedBlock, :CachedBlock, :Subroutine, :(YaoBlocks.AD.NoParams)] - @eval function draw!(c::CircuitGrid, p::$GT, address, controls) - barrier_style = CircuitStyles.barrier_for_chain[] - CircuitStyles.barrier_for_chain[] = false - draw!(c, YaoBlocks.Optimise.to_basictypes(p), address, controls) - CircuitStyles.barrier_for_chain[] = barrier_style - end + +# [:KronBlock, :RepeatedBlock, :CachedBlock, :Subroutine, :(YaoBlocks.AD.NoParams)] +function draw!(c::CircuitGrid, p::CompositeBlock, address, controls) + barrier_style = CircuitStyles.barrier_for_chain[] + CircuitStyles.barrier_for_chain[] = false + draw!(c, YaoBlocks.Optimise.to_basictypes(p), address, controls) + CircuitStyles.barrier_for_chain[] = barrier_style end for (GATE, SYM) in [(:XGate, :Rx), (:YGate, :Ry), (:ZGate, :Rz)] @eval get_brush_texts(c, b::RotationGate{D,T,<:$GATE}) where {D,T} = (c.gatestyles.g, "$($(SYM))($(pretty_angle(b.theta)))") diff --git a/src/Yao.jl b/src/Yao.jl index a42710b0..87d1ac4b 100644 --- a/src/Yao.jl +++ b/src/Yao.jl @@ -16,7 +16,6 @@ const 幺 = Yao using Reexport @reexport using 
YaoArrayRegister, YaoBlocks, YaoSym, YaoPlots
-export EasyBuild
 
 using YaoArrayRegister.BitBasis, YaoAPI
 
 using YaoBlocks:
@@ -28,8 +27,12 @@ using YaoBlocks:
     print_title,
     print_block
 
+export EasyBuild
+# CUDA APIs
+export cpu, cuzero_state, cuuniform_state, curand_state, cuproduct_state, cughz_state
 
-include("deprecations.jl")
+include("cudainterfaces.jl")
 include("EasyBuild/easybuild.jl")
+include("deprecations.jl")
 
 end # module
diff --git a/src/cudainterfaces.jl b/src/cudainterfaces.jl
new file mode 100644
index 00000000..335ba000
--- /dev/null
+++ b/src/cudainterfaces.jl
@@ -0,0 +1,41 @@
+"""
+    cuproduct_state([T=ComplexF64], total::Int, bit_config::Integer; nbatch=NoBatch())
+
+The GPU version of [`product_state`](@ref).
+"""
+function cuproduct_state end
+
+"""
+    curand_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`rand_state`](@ref).
+"""
+function curand_state end
+
+"""
+    cuzero_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`zero_state`](@ref).
+"""
+function cuzero_state end
+
+"""
+    cuuniform_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`uniform_state`](@ref).
+"""
+function cuuniform_state end
+
+"""
+    cughz_state([T=ComplexF64], n::Int; nbatch=NoBatch())
+
+The GPU version of [`ghz_state`](@ref).
+"""
+function cughz_state end
+
+"""
+    cpu(cureg)
+
+Download the register state from the GPU to the CPU.
+"""
+function cpu end