Skip to content

Commit

Permalink
Create v1.0-kubeshare
Browse files Browse the repository at this point in the history
  • Loading branch information
kerwenwwer committed Oct 16, 2022
1 parent b05ea75 commit 953052b
Show file tree
Hide file tree
Showing 15 changed files with 784 additions and 361 deletions.
5 changes: 1 addition & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
# Gemini

## Note !!!
Version v1.1 supports the latest CUDA version and multiple GPUs.

## About

Gemini is an efficient GPU resource sharing system with fine-grained control for Linux platforms.
Expand Down Expand Up @@ -61,4 +58,4 @@ For more details, refer to those scripts and source code.
[jim90247](https://github.com/jim90247)
[eee4017](https://github.com/eee4017)
[ncy9371](https://github.com/ncy9371)
[kerwenwwer](https://github.com/kerwenwwer)

110 changes: 110 additions & 0 deletions launcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import argparse
import inotify.adapters
import os
import sys
import signal
import shlex
import subprocess as sp
import time

# Parsed command-line arguments; assigned in main() and read by the helper
# functions below.
args = None
# Running pod managers, keyed by the raw "<name> <port>" line from the pod
# list file. Each value is a two-item list:
# [entry still present in the latest file, subprocess.Popen handle].
podlist = {}

def prepare_env(name, port, schd_port):
    """Return the environment for a pod-manager child process.

    The result is a copy of the launcher's own environment extended with the
    scheduler address, the pod manager's listen address, and the pod name.
    Port values are stored as strings.
    """
    overrides = {
        'SCHEDULER_IP': '127.0.0.1',
        'SCHEDULER_PORT': str(schd_port),
        'POD_MANAGER_IP': '0.0.0.0',
        'POD_MANAGER_PORT': str(port),
        'POD_NAME': name,
    }
    child_env = dict(os.environ)
    child_env.update(overrides)
    return child_env

def launch_scheduler():
    """Start the scheduler executable and return its ``Popen`` handle.

    ``args.pod_list`` is split into a directory (passed as ``-p``) and a file
    name (passed as ``-f``); an empty directory part falls back to the current
    working directory. The base port and quota/window settings are forwarded
    as ``-P``, ``-q``, ``-m`` and ``-w``. The scheduler's stderr is appended
    to ``/kubeshare/log/gemini-scheduler.log``.

    Raises:
        OSError: if the log file cannot be opened or the executable cannot
            be started.
    """
    cfg_dir, cfg_name = os.path.split(args.pod_list)
    if cfg_dir == '':
        cfg_dir = os.getcwd()

    # Build argv as a list instead of formatting one string and re-splitting
    # it: that way a directory or file name containing spaces or quotes is
    # passed through as a single argument. args.schd is still tokenised so a
    # value carrying extra arguments keeps working.
    cmd = shlex.split(args.schd) + [
        '-p', cfg_dir,
        '-f', cfg_name,
        '-P', str(args.port),
        '-q', str(args.base_quota),
        '-m', str(args.min_quota),
        '-w', str(args.window),
    ]
    # The child receives its own copy of the descriptor, so the launcher's
    # handle can be closed as soon as the process has been spawned.
    with open("/kubeshare/log/gemini-scheduler.log", "a") as err:
        proc = sp.Popen(cmd, universal_newlines=True, bufsize=1, stderr=err)
    return proc

def update_podmanager(file):
    """Reconcile the running pod managers with the pod list stored in *file*.

    File layout as parsed here: the first line holds the number of entries N,
    and each of the following N lines is ``<name> <port>``. Lines beyond the
    first N entries are ignored. An empty file is a no-op.

    For every listed entry without a running pod manager, one is started in
    its own process group with stderr appended to
    ``/kubeshare/log/pod-manager.log``. Every pod manager whose entry is no
    longer listed has its process group killed and is dropped from
    ``podlist``.

    Raises:
        ValueError: if the count is not an integer, the file has fewer entry
            lines than the count declares, or an entry does not consist of
            exactly two whitespace-separated fields.
        OSError: if *file* or the log file cannot be opened.
    """
    with open(file) as f:
        lines = f.readlines()
    if not lines:
        return

    # Parse and validate the whole file before touching any state, so a
    # truncated or malformed file cannot leave podlist half-updated or start
    # only some of the pod managers.
    podnum = int(lines[0])
    if podnum > len(lines) - 1:
        raise ValueError(
            "pod list declares {} entries but only {} are present".format(
                podnum, len(lines) - 1))
    wanted = []
    for i in range(1, podnum + 1):
        name, port = lines[i].split()
        # Strip only the line terminator; slicing off the last character
        # would eat a digit of the port when the final line has no newline.
        wanted.append((lines[i].rstrip('\n'), name, port))

    # Mark everything as stale, then re-mark (or start) what is still listed.
    for entry in podlist.values():
        entry[0] = False
    for name_port, name, port in wanted:
        if name_port in podlist:
            podlist[name_port][0] = True
            continue
        sys.stderr.write("[launcher] pod manager id '{}' port '{}' start running\n".format(name_port, port))
        sys.stderr.flush()
        with open("/kubeshare/log/pod-manager.log", "a") as err:
            proc = sp.Popen(
                shlex.split(args.pmgr),
                env=prepare_env(name, port, args.port),
                preexec_fn=os.setpgrp,  # own group, so killpg hits its children too
                stderr=err
            )
        podlist[name_port] = [True, proc]

    # Collect the keys first: the dict must not change size while iterating.
    stale = [key for key, entry in podlist.items() if not entry[0]]
    for key in stale:
        proc = podlist[key][1]
        try:
            os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
        except ProcessLookupError:
            # Process (group) is already gone; still reap and forget it
            # instead of aborting the rest of the cleanup.
            pass
        proc.wait()
        sys.stderr.write("[launcher] pod manager id '{}' has been deleted\n".format(key))
        sys.stderr.flush()
        del podlist[key]

def main():
    """Parse the command line, start the scheduler, and keep pod managers in sync.

    After launching the scheduler, the pod list file
    ``<pmgr_port_dir>/<gpu_uuid>`` is applied once. The function then watches
    ``pmgr_port_dir`` for close-after-write events and re-applies the file
    every time it is rewritten. The watch loop does not end on its own.
    """
    global args
    parser = argparse.ArgumentParser()
    parser.add_argument('schd', help='path to scheduler executable')
    parser.add_argument('pmgr', help='path to pod-manager executable')
    parser.add_argument('gpu_uuid', help='scheduling system GPU UUID')
    parser.add_argument('pod_list', help='path to pod list file')
    parser.add_argument('pmgr_port_dir', help='path to pod port dir')
    parser.add_argument('--port', type=int, default=49901, help='base port')
    parser.add_argument('--base_quota', type=float, default=300, help='base quota (ms)')
    parser.add_argument('--min_quota', type=float, default=20, help='minimum quota (ms)')
    parser.add_argument('--window', type=float, default=10000, help='time window (ms)')
    args = parser.parse_args()

    launch_scheduler()
    sys.stderr.write(f"[launcher] scheduler started on 0.0.0.0:{args.port}\n")
    sys.stderr.flush()

    pod_file = os.path.join(args.pmgr_port_dir, args.gpu_uuid)
    update_podmanager(pod_file)  # initial sync before any change event arrives

    ino = inotify.adapters.Inotify()
    ino.add_watch(args.pmgr_port_dir, inotify.constants.IN_CLOSE_WRITE)
    for event in ino.event_gen(yield_nones=False):
        _, _, _, filename = event
        if filename != args.gpu_uuid:
            continue
        try:
            update_podmanager(pod_file)
        except Exception:
            # The file content may be incomplete or malformed; report it and
            # keep watching. Only Exception is caught so KeyboardInterrupt
            # and SystemExit can still stop the loop.
            sys.stderr.write("Catch exception in update_podmanager: {}\n".format(sys.exc_info()))
            sys.stderr.flush()

if __name__ == '__main__':
    # Put the launcher into its own process group; the final killpg(0, ...)
    # below signals that whole group, the launcher included.
    os.setpgrp()
    try:
        main()
    except:
        # Report whatever stopped main() before tearing everything down.
        sys.stderr.write("Catch exception: {}\n".format(sys.exc_info()))
        sys.stderr.flush()
    finally:
        # Pod managers were started in separate process groups, so each
        # group is killed explicitly.
        for entry in podlist.values():
            manager = entry[1]
            os.killpg(os.getpgid(manager.pid), signal.SIGKILL)
        os.killpg(0, signal.SIGKILL)
7 changes: 4 additions & 3 deletions resource-config.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
2
client1 0.5 0.5 327374182400
client2 0.5 0.5 327374182400
3
client1 0.1 0.5 1073741824
client2 0.2 0.8 1073741824
client3 0.4 0.5 2147483648

16 changes: 8 additions & 8 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,19 @@ CUDA_PATH ?= /usr/local/cuda-10.0
PWD := $(shell pwd)
PREFIX ?= $(PWD)/..

SMS ?= 35 37 50 52 60 61 70
SMS ?= 35 37 50 52 60 61 70
GENCODE_FLAGS += $(foreach sm,$(SMS),-gencode arch=compute_$(sm),code=sm_$(sm))

CXX ?= g++-8
CXX ?= g++
NVCC ?= $(CUDA_PATH)/bin/nvcc -ccbin $(CXX)

CUDA_LDFLAGS += -lcuda -L$(CUDA_PATH)/lib64 -L$(CUDA_PATH)/lib64/stubs
LDFLAGS += -ldl -lrt
CUDA_LDFLAGS += -lcuda -L$(CUDA_PATH)/lib64 -L$(CUDA_PATH)/lib64/stubs
LDFLAGS += -ldl -lrt

CXXFLAGS += -std=c++11 -fPIC
CXXFLAGS += -std=c++11 -fPIC

ifeq ($(DEBUG),1)
CXXFLAGS += -g -D_DEBUG -Wall
CXXFLAGS += -g -D_DEBUG -Wall
else
CXXFLAGS += -O2
endif
Expand Down Expand Up @@ -61,7 +61,7 @@ schd-priority.o: schd-priority.cpp scheduler.h
$(EXEC) g++ $(CXXFLAGS) -o $@ -c $<

gem-schd: scheduler.o schd-priority.o debug.o comm.o
$(EXEC) g++ $(LDFLAGS) -pthread -rdynamic $+ -o $@
$(EXEC) g++ $(LDFLAGS) -pthread -rdynamic $+ -o $@
$(EXEC) mkdir -p $(PREFIX)/bin
$(EXEC) cp $@ $(PREFIX)/bin

Expand All @@ -74,4 +74,4 @@ gem-pmgr: pod-manager.o debug.o comm.o
$(EXEC) cp $@ $(PREFIX)/bin

# Remove object files and built artifacts. A single `rm -f` keeps
# `make clean` from failing (and from stopping early) when one of the
# artifacts has not been built.
clean:
	rm -f *.o ./gem-schd ./gem-pmgr ./libgemhook.so.1
7 changes: 4 additions & 3 deletions src/comm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/

#include "comm.h"
#include <cerrno>

#include "debug.h"

reqid_t prepare_request(char *buf, comm_request_t type, ...) {
Expand Down Expand Up @@ -123,12 +123,13 @@ char *parse_response(char *buf, reqid_t *id) {
// -1, errno will be returned.
// Calls func up to max_attempt times, stopping at the first call that
// returns 0. A return value of -1 from func is replaced by the current errno.
// Every failed attempt is logged, and the function sleeps `interval` seconds
// between attempts when interval > 0.
// Returns 0 on success, otherwise the error value of the last attempt
// (or -1 if max_attempt <= 0 and func was never called).
int multiple_attempt(std::function<int()> func, int max_attempt, int interval) {
  // Give rc a defined value: when max_attempt <= 0 the loop body never runs
  // and rc would otherwise be returned uninitialized.
  int rc = -1;
  // String literals are const; binding one to a plain char* is ill-formed in
  // C++11.
  const char *log_name = "/kubeshare/log/comm.log";
  for (int attempt = 1; attempt <= max_attempt; attempt++) {
    rc = func();
    if (rc == 0) break;
    if (rc == -1) rc = errno;
    hERROR(log_name, __FILE__, (long)__LINE__, "attempt %d: %s", attempt, strerror(rc));
    // No point in waiting after the final attempt: nothing follows it.
    if (interval > 0 && attempt < max_attempt) sleep(interval);
  }
  return rc;
}
36 changes: 28 additions & 8 deletions src/debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
*/

#include "debug.h"

#include<iostream>
#include<fstream>
void sprint_date(char *buf, const size_t len) {
time_t timer;
struct tm *tm_info;
Expand All @@ -34,22 +35,41 @@ void sprint_date(char *buf, const size_t len) {
}

/*
 * Defines a printf-style logging function `func` that writes one line to
 * stderr in the form "<date> <level>:<file>:<line> <message>".
 * The message is formatted into a DEBUG_MSG_LEN-byte buffer, so longer
 * messages are truncated.
 * NOTE(review): log_name is part of the signature but is not used by this
 * variant -- confirm that is intended.
 */
#define GENERATE_PRINT(func, level) \
  void func(const char* log_name, const char* file, long line, const char *format, ...) { \
    char buf[DEBUG_MSG_LEN], date_buf[100]; \
    va_list args; \
    \
    sprint_date(date_buf, 100); \
    va_start(args, format); \
    vsnprintf(buf, DEBUG_MSG_LEN, format, args); \
    va_end(args); /* every va_start must be paired with va_end */ \
    fprintf(stderr, "%s " level ":%s:%ld %s\n", date_buf, file, line, buf); \
  }

/*
 * Defines a printf-style logging function `func` that appends one line to
 * /kubeshare/log/hook.log in the form
 * "<date> <level>:<file>:<line> <message>".
 * The message is formatted into a DEBUG_MSG_LEN-byte buffer, so longer
 * messages are truncated. The log file is opened and closed on every call.
 * NOTE(review): log_name is part of the signature but the output path is
 * fixed to hook.log -- confirm whether log_name was meant to select the file.
 */
#define GENERATE_f(func, level) \
  void func(const char* log_name, const char* file, long line, const char *format, ...) { \
    char buf[DEBUG_MSG_LEN], date_buf[100]; \
    va_list args; \
    \
    sprint_date(date_buf, 100); \
    va_start(args, format); \
    vsnprintf(buf, DEBUG_MSG_LEN, format, args); \
    va_end(args); /* every va_start must be paired with va_end */ \
    std::ofstream logger; \
    logger.open ("/kubeshare/log/hook.log", std::ios::out | std::ios::app); \
    logger<<date_buf<<" "<<level<<":"<<file<<":"<<line<<" "<<buf<<std::endl; \
    logger.close(); \
  }

// Definitions of the logging functions declared in debug.h.
// The DEBUG-level loggers only do work in _DEBUG builds; otherwise they are
// empty stubs. The stub signatures must match the declarations in debug.h
// exactly: a stub with a different parameter list is a separate overload,
// and callers of the declared function would be left without a definition.
#ifdef _DEBUG
GENERATE_PRINT(DEBUG, "DEBU")
GENERATE_f(hDEBUG, "DEBU")
#else
void DEBUG(const char* log_name, const char* file, long line, const char *format, ...) {}
void hDEBUG(const char* log_name, const char* file, long line, const char *format, ...) {}
#endif
GENERATE_PRINT(INFO, "INFO")
GENERATE_PRINT(WARNING, "WARN")
GENERATE_PRINT(ERROR, "ERRO")
GENERATE_f(hINFO, "INFO")
GENERATE_f(hWARNING, "WARN")
GENERATE_f(hERROR, "ERRO")
16 changes: 9 additions & 7 deletions src/debug.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/**
* Copyright 2020 Hung-Hsin Chen, LSA Lab, National Tsing Hua University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
Expand All @@ -23,14 +23,16 @@
#include <cstdio>
#include <cstring>
#include <ctime>
#include <sys/types.h>
#include <unistd.h>

#define DEBUG_MSG_LEN 256

void DEBUG(const char *format, ...);
void INFO(const char *format, ...);
void WARNING(const char *format, ...);
void ERROR(const char *format, ...);
// Logging entry points. Each takes a log name, the caller's source file and
// line (e.g. __FILE__ and (long)__LINE__), and a printf-style format string
// followed by its arguments.
//
// As defined in debug.cpp, DEBUG/INFO/WARNING/ERROR write to stderr, while
// the h-prefixed variants append to /kubeshare/log/hook.log.
// The DEBUG-level functions are compiled as no-ops unless _DEBUG is defined.
void DEBUG(const char* log_name, const char* file, long line, const char *format, ...);
void INFO(const char* log_name, const char* file, long line, const char *format, ...);
void WARNING(const char* log_name, const char* file, long line, const char *format, ...);
void ERROR(const char* log_name, const char* file, long line, const char *format, ...);
void hDEBUG(const char* log_name, const char* file, long line, const char *format, ...);
void hINFO(const char* log_name, const char* file, long line, const char *format, ...);
void hWARNING(const char* log_name, const char* file, long line, const char *format, ...);
void hERROR(const char* log_name, const char* file, long line, const char *format, ...);

#endif
Loading

0 comments on commit 953052b

Please sign in to comment.