WIP; adding mac support #54

Open · wants to merge 6 commits into main
3 changes: 3 additions & 0 deletions .gitignore
@@ -162,3 +162,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Mac finder directory settings
.DS_Store
121 changes: 64 additions & 57 deletions gradio_app.py
@@ -1,4 +1,7 @@
import os
import platform
is_mac = platform.system() == 'Darwin'
from huggingface_hub import snapshot_download

os.environ['HF_HOME'] = os.path.join(os.path.dirname(__file__), 'hf_download')
HF_TOKEN = None
@@ -11,11 +14,12 @@
import gradio as gr
import tempfile

from openai import OpenAI
import subprocess
import time  # used by kill_process() below

gradio_temp_dir = os.path.join(tempfile.gettempdir(), 'gradio')
os.makedirs(gradio_temp_dir, exist_ok=True)

from threading import Thread

# Phi3 Hijack
from transformers.models.phi3.modeling_phi3 import Phi3PreTrainedModel

@@ -32,6 +36,9 @@

import lib_omost.canvas as omost_canvas

# https://medium.com/@natsunoyuki/using-civitai-models-with-diffusers-package-45e0c475a67e
# https://huggingface.co/docs/diffusers/en/api/loaders/single_file
# https://github.com/huggingface/diffusers/blob/v0.28.0/scripts/convert_original_stable_diffusion_to_diffusers.py

# SDXL

@@ -66,25 +73,49 @@

memory_management.unload_all_models([text_encoder, text_encoder_2, vae, unet])

openai_api_base = "http://127.0.0.1:8080/v1"
client = OpenAI(api_key="EMPTY", base_url=openai_api_base)

# LLM
# llm_name = "mlx-community/Phi-3-mini-128k-instruct-8bit"
llm_name = "mlx-community/Meta-Llama-3-8B-4bit"
# llm_name = "mlx-community/dolphin-2.9.1-llama-3-8b-4bit"

# llm_name = 'lllyasviel/omost-phi-3-mini-128k-8bits'
llm_name = 'lllyasviel/omost-llama-3-8b-4bits'
# llm_name = 'lllyasviel/omost-dolphin-2.9-llama3-8b-4bits'
def load_model(model_name):
global process

llm_model = AutoModelForCausalLM.from_pretrained(
llm_name,
torch_dtype=torch.bfloat16, # This is computation type, not load/memory type. The loading quant type is baked in config.
token=HF_TOKEN,
device_map="auto" # This will load model to gpu with an offload system
)
local_model_dir = os.path.join(
os.environ['HF_HOME'], llm_name.split("/")[1]
)

llm_tokenizer = AutoTokenizer.from_pretrained(
llm_name,
token=HF_TOKEN
)
if not os.path.exists(local_model_dir):
snapshot_download(repo_id=llm_name, local_dir=local_model_dir)

command = ["python3", "-m", "mlx_lm.server", "--model", local_model_dir]

try:
process = subprocess.Popen(
command, stdin=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)
process.stdin.write("y\n")
process.stdin.flush()
print("Model Loaded")
return True #{model_status: "Model Loaded"}
except Exception as e:
print(f"Exception occurred: {str(e)}")
return False #{model_status: f"Exception occurred: {str(e)}"}

load_model(llm_name)

def kill_process():
global process
process.terminate()
time.sleep(2)
if process.poll() is None: # Check if the process has indeed terminated
process.kill() # Force kill if still running

    # NOTE: the LLM now runs inside the mlx_lm.server subprocess, so there is no
    # in-process llm_model left to hand to memory_management.unload_all_models().
print("Model Killed")
return {model_status: "Model Unloaded"}
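Reviewer note: `load_model()` returns as soon as `mlx_lm.server` is spawned, so the first chat request can race the server startup. A minimal health-check sketch (the `wait_for_server` helper and the polling interval are my own; the URL mirrors `openai_api_base` above):

```python
import time
import urllib.error
import urllib.request

def wait_for_server(base_url: str = "http://127.0.0.1:8080/v1", timeout_s: float = 60.0) -> bool:
    """Poll the local mlx_lm.server until it answers HTTP, or give up after timeout_s."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            urllib.request.urlopen(base_url, timeout=2)
            return True
        except urllib.error.HTTPError:
            return True  # any HTTP response (even 404) means the server is listening
        except (urllib.error.URLError, OSError):
            time.sleep(1)  # connection refused: the server is still starting
    return False
```

Called right after `load_model(llm_name)`, this would block the Gradio launch until the endpoint is reachable.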


@torch.inference_mode()
@@ -110,7 +141,6 @@ def resize_without_crop(image, target_width, target_height):
resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
return np.array(resized_image)


@torch.inference_mode()
def chat_fn(message: str, history: list, seed:int, temperature: float, top_p: float, max_new_tokens: int) -> str:
np.random.seed(int(seed))
@@ -125,49 +155,26 @@ def chat_fn(message: str, history: list, seed:int, temperature: float, top_p: fl

conversation.append({"role": "user", "content": message})

memory_management.load_models_to_gpu(llm_model)

input_ids = llm_tokenizer.apply_chat_template(
conversation, return_tensors="pt", add_generation_prompt=True).to(llm_model.device)

streamer = TextIteratorStreamer(llm_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

def interactive_stopping_criteria(*args, **kwargs) -> bool:
if getattr(streamer, 'user_interrupted', False):
print('User stopped generation')
return True
else:
return False

stopping_criteria = StoppingCriteriaList([interactive_stopping_criteria])

def interrupter():
streamer.user_interrupted = True
return

generate_kwargs = dict(
input_ids=input_ids,
streamer=streamer,
stopping_criteria=stopping_criteria,
max_new_tokens=max_new_tokens,
do_sample=True,
response = client.chat.completions.create(
model="gpt",
messages=conversation,
temperature=temperature,
top_p=top_p,
# frequency_penalty=freq_penalty,
max_tokens=max_new_tokens,
stream=True,
)

if temperature == 0:
generate_kwargs['do_sample'] = False

Thread(target=llm_model.generate, kwargs=generate_kwargs).start()

outputs = []
for text in streamer:
outputs.append(text)
# print(outputs)
yield "".join(outputs), interrupter

return

    stop = ["<|im_end|>", "<|endoftext|>"]
    partial_message = ""
    for chunk in response:
        if len(chunk.choices) != 0:
            content = chunk.choices[0].delta.content
            # The final streamed chunk carries delta.content == None; guard against
            # concatenating None and skip explicit stop tokens.
            if content and content not in stop:
                partial_message = partial_message + content
            yield partial_message

return partial_message
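Reviewer note: the previous implementation yielded an `interrupter` callback alongside the partial text so the UI could stop generation mid-stream; the OpenAI-client rewrite yields only the text. If the Gradio handlers still expect the two-tuple, a shim along these lines would keep that interface (a sketch; the flag-based cancel is an assumption, not the PR's code):

```python
def stream_with_interrupter(response):
    """Wrap an OpenAI streaming response and expose a cancel callback, mirroring the old API."""
    state = {"stopped": False}

    def interrupter():
        state["stopped"] = True  # remaining chunks are simply discarded

    partial_message = ""
    for chunk in response:
        if state["stopped"]:
            break
        if chunk.choices and chunk.choices[0].delta.content:
            partial_message += chunk.choices[0].delta.content
        yield partial_message, interrupter
```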

@torch.inference_mode()
def post_chat(history):
16 changes: 13 additions & 3 deletions lib_omost/memory_management.py
@@ -1,13 +1,19 @@
import torch
from contextlib import contextmanager

import platform
is_mac = platform.system() == 'Darwin'

high_vram = False
gpu = torch.device('cuda')
if is_mac:
gpu = torch.device('mps')
else:
gpu = torch.device('cuda')
cpu = torch.device('cpu')

torch.zeros((1, 1)).to(gpu, torch.float32)
torch.cuda.empty_cache()

torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()

models_in_gpu = []

@@ -27,6 +33,8 @@ def movable_bnb_model(m):


def load_models_to_gpu(models):
if is_mac: return

global models_in_gpu

if not isinstance(models, (tuple, list)):
@@ -49,11 +57,13 @@ def load_models_to_gpu(models):
print('Load to GPU:', m.__class__.__name__)

models_in_gpu = list(set(models_in_gpu + models))
torch.cuda.empty_cache()
torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()
return


def unload_all_models(extra_models=None):
if is_mac: return

global models_in_gpu

if extra_models is None:
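Reviewer note: the `torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()` expression now appears twice; a small helper would keep the backend branching in one place. A sketch (the helper names and the `mps.is_available()` guard are mine, not part of the PR):

```python
import platform
import torch

IS_MAC = platform.system() == "Darwin"

def pick_gpu_device() -> torch.device:
    """Prefer Apple's Metal backend on macOS, CUDA elsewhere, falling back to CPU."""
    if IS_MAC and torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

def empty_cache() -> None:
    """Release cached allocator memory on whichever backend is active."""
    if IS_MAC:
        torch.mps.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()
```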
89 changes: 89 additions & 0 deletions mlx_lm_wrapper.py
@@ -0,0 +1,89 @@
from typing import Any, Callable, Dict, Generator, Optional, Tuple, Union
# from mlx_lm import load, PreTrainedTokenizer, TokenizerWrapper
import mlx.core as mx
import mlx.nn
import mlx_lm
from mlx_lm.utils import generate_step  # token-level streaming generator used below
import transformers as tf
# from transformers import AutoTokenizer, TextIteratorStreamer
from transformers.generation.stopping_criteria import StoppingCriteriaList
from transformers.generation.utils import GenerateOutput
import numpy as np
import torch

def load_mlx_lm(llm_name: str) -> Tuple[mlx.nn.Module, tf.PreTrainedTokenizer]:
llm_model, llm_tokenizer = mlx_lm.load(llm_name)
return MLX_LLM_TransformersWrapper(llm_model, llm_tokenizer), llm_tokenizer

class MLX_LLM_TransformersWrapper(mlx.nn.Module):
    def __init__(self, model: mlx.nn.Module, tokenizer: tf.PreTrainedTokenizer):
        super().__init__()  # mlx.nn.Module subclasses must run the base initializer
        self.model = model
        self.tokenizer = tokenizer

def generate(self,
input_ids: np.ndarray,
streamer: tf.TextIteratorStreamer, #Optional["BaseStreamer"] = None,
# inputs: Optional[torch.Tensor] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
max_new_tokens: int = 100,
do_sample: bool = True,
temperature: float = 1.0,
top_p: float = 1.0,
**kwargs
) -> Union[GenerateOutput, torch.LongTensor]:

if streamer is not None:
streamer.put(input_ids.cpu())

# has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
# return self.__stream_generate(self.model, self.tokenizer, input_ids, max_new_tokens, **kwargs)


def __stream_generate(self,
model: torch.nn.Module,
tokenizer: tf.PreTrainedTokenizer,
prompt: Union[str, np.ndarray],
max_tokens: int = 100,
**kwargs,
) -> Union[str, Generator[str, None, None]]:
"""
A generator producing text based on the given prompt from the model.

Args:
prompt (mx.array): The input prompt.
model (nn.Module): The model to use for generation.
max_tokens (int): The maximum number of tokens to generate.
kwargs: The remaining options get passed to :func:`generate_step`.
See :func:`generate_step` for more details.

Yields:
Generator[Tuple[mx.array, mx.array]]: A generator producing text.
"""
# if not isinstance(tokenizer, TokenizerWrapper):
# tokenizer = TokenizerWrapper(tokenizer)

if isinstance(prompt, str):
prompt_tokens = mx.array(tokenizer.encode(prompt))
else:
prompt_tokens = mx.array(prompt)

detokenizer = tokenizer.detokenizer
detokenizer.reset()
print("generating...")
for (token, prob), n in zip(
generate_step(
prompt=prompt_tokens,
model=model,
temp=kwargs.pop("temperature", 1.0),
**kwargs),
range(max_tokens),
):
print(f"n: {n}")
if token == tokenizer.eos_token_id:
print("EOS")
break
detokenizer.add_token(token)
print(f"Token: {token}")
# Yield the last segment if streaming
yield detokenizer.last_segment

detokenizer.finalize()
yield detokenizer.last_segment
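Reviewer note: for context, this is roughly how `mlx_lm` is driven without the wrapper, using the same model family the PR references (the model id and prompt are illustrative; `mlx_lm.load` and `mlx_lm.generate` are the library's documented entry points):

```python
import mlx_lm

# Returns the MLX model plus a Hugging Face-style tokenizer wrapper.
model, tokenizer = mlx_lm.load("mlx-community/Meta-Llama-3-8B-4bit")

conversation = [{"role": "user", "content": "generate a canvas for a sunset over the sea"}]
prompt = tokenizer.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)

# Non-streaming convenience wrapper around generate_step().
text = mlx_lm.generate(model, tokenizer, prompt=prompt, max_tokens=256)
print(text)
```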
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1,7 +1,8 @@
diffusers==0.28.0
transformers==4.41.1
gradio==4.31.5
bitsandbytes==0.43.1
mlx-lm==0.14.3; sys_platform == 'darwin'
bitsandbytes==0.43.1; sys_platform != 'darwin'
accelerate==0.30.1
protobuf==3.20
opencv-python
@@ -11,3 +12,4 @@ pillow
einops
torch
peft
openai
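
Reviewer note: with the dependencies now split by `sys_platform` marker, a quick sanity check of which backend actually got installed might look like this (a sketch; the prints are illustrative):

```python
import platform

if platform.system() == "Darwin":
    import mlx.core as mx  # pulled in via the mlx-lm marker
    print("MLX default device:", mx.default_device())
else:
    import bitsandbytes  # installed on every platform except macOS
    import torch
    print("CUDA available:", torch.cuda.is_available())
```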