Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable usage of multiple GPUs in PyKokkos #93

Merged
merged 5 commits into from
Sep 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions examples/pykokkos/multi_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pykokkos as pk

import numpy as np
import cupy as cp

# Run every kernel on the GPU by default.
pk.set_default_space(pk.Cuda)

size = 10000

# Allocate one CuPy array per device: CuPy places the allocation on
# whichever device is active at allocation time.
pk.set_device_id(0)
cp_arr_0 = cp.arange(size).astype(np.int32)

pk.set_device_id(1)
cp_arr_1 = cp.arange(size).astype(np.int32)

print(cp_arr_0.device)
print(cp_arr_1.device)

# Reduction workunit over an externally-allocated (CuPy) view.
@pk.workunit(cp_arr=pk.ViewTypeInfo(space=pk.CudaSpace))
def reduction_cp(i: int, acc: pk.Acc[int], cp_arr: pk.View1D[int]):
    acc += cp_arr[i]

# Wrap each CuPy array in a PyKokkos view and reduce it on its own
# device; the active device id must match the device the array lives
# on.  (Names are kept consistent: array N -> view N -> result N.)
pk.set_device_id(0)
cp_view_0 = pk.from_cupy(cp_arr_0)
result_0 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduction_cp, cp_arr=cp_view_0)

pk.set_device_id(1)
cp_view_1 = pk.from_cupy(cp_arr_1)
result_1 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduction_cp, cp_arr=cp_view_1)

print(f"Reducing array 0: {result_0}")
print(f"Reducing array 1: {result_1}")
print(f"Sum: {result_0 + result_1}")

# Same exercise with views allocated directly by PyKokkos, again one
# per device.
pk.set_device_id(0)
view_0 = pk.View((size,), dtype=int)

pk.set_device_id(1)
view_1 = pk.View((size,), dtype=int)

@pk.workunit
def init_view(i: int, view: pk.View1D[int]):
    view[i] = i

@pk.workunit
def reduce_view(i: int, acc: pk.Acc[int], view: pk.View1D[int]):
    acc += view[i]

# Initialize and reduce each view on the device that owns it.
pk.set_device_id(0)
pk.parallel_for(pk.RangePolicy(pk.Cuda, 0, size), init_view, view=view_0)
result_0 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduce_view, view=view_0)

pk.set_device_id(1)
pk.parallel_for(pk.RangePolicy(pk.Cuda, 0, size), init_view, view=view_1)
result_1 = pk.parallel_reduce(pk.RangePolicy(pk.Cuda, 0, size), reduce_view, view=view_1)

print(f"Reducing view 0: {result_0}")
print(f"Reducing view 1: {result_1}")
print(f"Sum: {result_0 + result_1}")
3 changes: 2 additions & 1 deletion pykokkos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
initialize, finalize,
get_default_space, set_default_space,
get_default_precision, set_default_precision,
is_uvm_enabled, enable_uvm, disable_uvm
is_uvm_enabled, enable_uvm, disable_uvm,
set_device_id
)

initialize()
Expand Down
14 changes: 8 additions & 6 deletions pykokkos/core/compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ PK_REAL="${6}"
KOKKOS_LIB_PATH="${7}"
KOKKOS_INCLUDE_PATH="${8}"
COMPUTE_CAPABILITY="${9}"
LIB_SUFFIX="${10}"
COMPILER_PATH="${11}"
SRC=$(find -name "*.cpp")


Expand All @@ -34,11 +36,11 @@ if [ "${COMPILER}" == "g++" ]; then
-shared \
-fopenmp \
"${SRC}".o -o "${MODULE}" \
"${KOKKOS_LIB_PATH}/libkokkoscontainers.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore.so"
"${KOKKOS_LIB_PATH}/libkokkoscontainers${LIB_SUFFIX}.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore${LIB_SUFFIX}.so"

elif [ "${COMPILER}" == "nvcc" ]; then
"${KOKKOS_LIB_PATH}/../bin/nvcc_wrapper" \
"${COMPILER_PATH}" \
`python3 -m pybind11 --includes` \
-I.. \
-O3 \
Expand All @@ -54,14 +56,14 @@ elif [ "${COMPILER}" == "nvcc" ]; then
-Dpk_exec_space="Kokkos::${EXEC_SPACE}" \
-Dpk_real="${PK_REAL}"

"${KOKKOS_LIB_PATH}/../bin/nvcc_wrapper" \
"${COMPILER_PATH}" \
-I.. \
-O3 \
-shared \
-arch="${COMPUTE_CAPABILITY}" \
--expt-extended-lambda \
-fopenmp \
"${SRC}".o -o "${MODULE}" \
"${KOKKOS_LIB_PATH}/libkokkoscontainers.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore.so"
"${KOKKOS_LIB_PATH}/libkokkoscontainers${LIB_SUFFIX}.so" \
"${KOKKOS_LIB_PATH}/libkokkoscore${LIB_SUFFIX}.so"
fi
2 changes: 1 addition & 1 deletion pykokkos/core/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def compile_entity(
if module_setup.is_compiled():
return

cpp_setup = CppSetup(module_setup.module_file, self.functor_file, self.bindings_file)
cpp_setup = CppSetup(module_setup.module_file, module_setup.gpu_module_files, self.functor_file, self.bindings_file)
translator = StaticTranslator(module_setup.name, self.functor_file, members)

t_start: float = time.perf_counter()
Expand Down
118 changes: 100 additions & 18 deletions pykokkos/core/cpp_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,13 @@
import shutil
import subprocess
import sys
from types import ModuleType
from typing import List, Tuple


from pykokkos.interface import ExecutionSpace, get_default_layout, get_default_memory_space
from pykokkos.interface import (
ExecutionSpace, get_default_layout, get_default_memory_space,
is_host_execution_space
)
import pykokkos.kokkos_manager as km


Expand All @@ -15,16 +18,18 @@ class CppSetup:
Creates the directory to hold the translation and invokes the compiler
"""

def __init__(self, module_file: str, functor: str, bindings: str):
def __init__(self, module_file: str, gpu_module_files: List[str], functor: str, bindings: str):
"""
CppSetup constructor

:param module: the name of the file containing the compiled Python module
:param gpu_module_files: the list of names of files containing for each gpu module
:param functor: the name of the generated functor file
:param bindings: the name of the generated bindings file
"""

self.module_file: str = module_file
self.gpu_module_files: List[str] = gpu_module_files
self.functor_file: str = functor
self.bindings_file: str = bindings

Expand Down Expand Up @@ -58,6 +63,8 @@ def compile(
self.write_source(output_dir, functor, bindings)
self.copy_script(output_dir)
self.invoke_script(output_dir, space, enable_uvm, compiler)
if space is ExecutionSpace.Cuda and km.is_multi_gpu_enabled():
self.copy_multi_gpu_kernel(output_dir)


def initialize_directory(self, name: Path) -> None:
Expand Down Expand Up @@ -115,15 +122,17 @@ def copy_script(self, output_dir: Path) -> None:
print(f"Exception while copying views and makefile: {ex}")
sys.exit(1)

def get_kokkos_paths(self) -> Tuple[Path, Path]:
def get_kokkos_paths(self, space: ExecutionSpace, compiler: str) -> Tuple[Path, Path, Path]:
"""
Get the paths of the Kokkos instal lib and include
directories. If the environment variable is set, use that
Kokkos install. If not, fall back to installed pykokkos-base
package.
Kokkos install. If not, fall back to the installed
pykokkos-base package.

:returns: a tuple of paths to the Kokkos lib/ and include/
directories respectively
:param space: the execution space to compile for
:param compiler: what compiler to use
:returns: a tuple of paths to the Kokkos lib/, include/,
and compiler to be used
"""

lib_path: Path
Expand All @@ -139,20 +148,46 @@ def get_kokkos_paths(self) -> Tuple[Path, Path]:

return lib_path, include_path

from pykokkos.bindings import kokkos
install_path = Path(kokkos.__path__[0]).parent
is_cpu: bool = is_host_execution_space(space)
kokkos_lib: ModuleType = km.get_kokkos_module(is_cpu)
install_path = Path(kokkos_lib.__path__[0])
lib_parent_path: Path
if km.is_multi_gpu_enabled():
lib_parent_path = install_path
else:
lib_parent_path = install_path.parent

if (install_path / "lib").is_dir():
lib_path = install_path / "lib"
elif (install_path / "lib64").is_dir():
lib_path = install_path / "lib64"
if (lib_parent_path / "lib").is_dir():
lib_path = lib_parent_path / "lib"
elif (lib_parent_path / "lib64").is_dir():
lib_path = lib_parent_path / "lib64"
else:
raise RuntimeError("lib/ or lib64/ directories not found in installed pykokkos-base package."
f" Try setting {self.lib_path_env} instead.")

include_path = lib_path.parent / "include/kokkos"
include_path = install_path.parent / "include/kokkos"

compiler_path: Path
if compiler != "nvcc":
compiler_path = Path("g++")
else:
compiler_path = install_path.parent / "bin/nvcc_wrapper"

return lib_path, include_path, compiler_path

def get_kokkos_lib_suffix(self, space: ExecutionSpace) -> str:
    """
    Get the suffix of the libkokkoscore and libkokkoscontainers
    libraries corresponding to the enabled device

    :param space: the execution space to compile for
    :returns: the suffix as a string; empty for host execution spaces
        or when multi-GPU support is not enabled, otherwise
        "_<device id>" matching the per-device Kokkos library names
    """

    if is_host_execution_space(space) or not km.is_multi_gpu_enabled():
        return ""

    # NOTE(review): removed a stray `return lib_path, include_path`
    # left over from get_kokkos_paths — it referenced locals that do
    # not exist in this method and could never return the documented
    # str type.
    return f"_{km.get_device_id()}"

def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: bool, compiler: str) -> None:
"""
Expand All @@ -176,8 +211,10 @@ def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: boo
precision: str = km.get_default_precision().__name__.split(".")[-1]
lib_path: Path
include_path: Path
lib_path, include_path = self.get_kokkos_paths()
compiler_path: Path
lib_path, include_path, compiler_path = self.get_kokkos_paths(space, compiler)
compute_capability: str = self.get_cuda_compute_capability(compiler)
lib_suffix: str = self.get_kokkos_lib_suffix(space)

command: List[str] = [f"./{self.script}",
compiler, # What compiler to use
Expand All @@ -188,7 +225,9 @@ def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: boo
precision, # Default real precision
str(lib_path), # Path to Kokkos install lib/ directory
str(include_path), # Path to Kokkos install include/ directory
compute_capability] # Device compute capability
compute_capability, # Device compute capability
lib_suffix, # The libkokkos* suffix identifying the gpu
str(compiler_path)] # The path to the compiler to use
compile_result = subprocess.run(command, cwd=output_dir, capture_output=True, check=False)

if compile_result.returncode != 0:
Expand All @@ -207,6 +246,49 @@ def invoke_script(self, output_dir: Path, space: ExecutionSpace, enable_uvm: boo
print(f"patchelf failed")
sys.exit(1)

def copy_multi_gpu_kernel(self, output_dir: Path) -> None:
    """
    Copy the compiled kernel .so file once for each device and run
    patchelf on each copy so its rpath and needed libkokkos*
    libraries point at that device's Kokkos build

    :param output_dir: the base directory containing the compiled module
    """

    original_module: Path = output_dir / self.module_file
    # `device_id` (not `id`, which shadows the builtin) is the index of
    # the GPU this copy of the kernel is being patched for.
    for device_id, (kernel_filename, kokkos_gpu_module) in enumerate(
            zip(self.gpu_module_files, km.get_kokkos_gpu_modules())):
        kernel_path: Path = output_dir / kernel_filename

        try:
            shutil.copy(original_module, kernel_path)
        except Exception as ex:
            print(f"Exception while copying kernel: {ex}")
            sys.exit(1)

        # Point the copy's rpath at this device's Kokkos libraries.
        lib_path: Path = Path(kokkos_gpu_module.__path__[0]) / "lib"
        patchelf: List[str] = ["patchelf",
                               "--set-rpath",
                               str(lib_path),
                               kernel_filename]

        patchelf_result = subprocess.run(patchelf, cwd=output_dir, capture_output=True, check=False)
        if patchelf_result.returncode != 0:
            print(patchelf_result.stderr.decode("utf-8"))
            print("patchelf failed")
            sys.exit(1)

        # Now replace the needed libkokkos* libraries with the correct version
        needed_libraries: str = subprocess.run(["patchelf", "--print-needed", kernel_filename], cwd=output_dir, capture_output=True, check=False).stdout.decode("utf-8")

        for line in needed_libraries.splitlines():
            if "libkokkoscore" in line or "libkokkoscontainers" in line:
                # Line will be of the form f"libkokkoscore_{id}.so.3.4";
                # extract the device id baked in at link time and swap
                # it for this copy's device id.
                current_id: int = int(line.split("_")[1].split(".")[0])
                to_remove: str = line
                to_add: str = line.replace(f"_{current_id}", f"_{device_id}")

                subprocess.run(["patchelf", "--replace-needed", to_remove, to_add, kernel_filename], cwd=output_dir, capture_output=True, check=False)

def get_cuda_compute_capability(self, compiler: str) -> str:
"""
Get the compute capability of an Nvidia GPU
Expand Down
8 changes: 7 additions & 1 deletion pykokkos/core/module_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys
import sysconfig
import time
from typing import Callable, Optional, Union
from typing import Callable, List, Optional, Union

from pykokkos.interface import ExecutionSpace
import pykokkos.kokkos_manager as km
Expand Down Expand Up @@ -105,9 +105,15 @@ def __init__(

self.main: Path = self.get_main_path()
self.output_dir: Optional[Path] = self.get_output_dir(self.main, self.metadata, space)
self.gpu_module_files: List[str] = []
if km.is_multi_gpu_enabled():
self.gpu_module_files = [f"kernel{device_id}{suffix}" for device_id in range(km.get_num_gpus())]

if self.output_dir is not None:
self.path: str = os.path.join(self.output_dir, self.module_file)
if km.is_multi_gpu_enabled():
self.gpu_module_paths: str = [os.path.join(self.output_dir, module_file) for module_file in self.gpu_module_files]

self.name: str = self.path.replace("/", "_")
self.name: str = self.name.replace("-", "_")
self.name: str = self.name.replace(".", "_")
Expand Down
Loading