WIP; adding mac support #54

Open · wants to merge 6 commits into main
3 changes: 3 additions & 0 deletions .gitignore
@@ -162,3 +162,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Mac finder directory settings
.DS_Store
121 changes: 64 additions & 57 deletions gradio_app.py
@@ -1,4 +1,7 @@
import os
import platform
is_mac = platform.system() == 'Darwin'
from huggingface_hub import snapshot_download

os.environ['HF_HOME'] = os.path.join(os.path.dirname(__file__), 'hf_download')
HF_TOKEN = None
@@ -11,11 +14,12 @@
import gradio as gr
import tempfile

from openai import OpenAI
import subprocess
import time  # used by kill_process() below

gradio_temp_dir = os.path.join(tempfile.gettempdir(), 'gradio')
os.makedirs(gradio_temp_dir, exist_ok=True)

from threading import Thread

# Phi3 Hijack
from transformers.models.phi3.modeling_phi3 import Phi3PreTrainedModel

@@ -32,6 +36,9 @@

import lib_omost.canvas as omost_canvas

# https://medium.com/@natsunoyuki/using-civitai-models-with-diffusers-package-45e0c475a67e
# https://huggingface.co/docs/diffusers/en/api/loaders/single_file
# https://github.com/huggingface/diffusers/blob/v0.28.0/scripts/convert_original_stable_diffusion_to_diffusers.py

# SDXL

@@ -66,25 +73,49 @@

memory_management.unload_all_models([text_encoder, text_encoder_2, vae, unet])

openai_api_base = "http://127.0.0.1:8080/v1"
client = OpenAI(api_key="EMPTY", base_url=openai_api_base)

# LLM
# llm_name = "mlx-community/Phi-3-mini-128k-instruct-8bit"
llm_name = "mlx-community/Meta-Llama-3-8B-4bit"
# llm_name = "mlx-community/dolphin-2.9.1-llama-3-8b-4bit"

# llm_name = 'lllyasviel/omost-phi-3-mini-128k-8bits'
llm_name = 'lllyasviel/omost-llama-3-8b-4bits'
# llm_name = 'lllyasviel/omost-dolphin-2.9-llama3-8b-4bits'
def load_model(model_name):
global process

llm_model = AutoModelForCausalLM.from_pretrained(
llm_name,
torch_dtype=torch.bfloat16, # This is computation type, not load/memory type. The loading quant type is baked in config.
token=HF_TOKEN,
device_map="auto" # This will load model to gpu with an offload system
)
local_model_dir = os.path.join(
os.environ['HF_HOME'], llm_name.split("/")[1]
)

llm_tokenizer = AutoTokenizer.from_pretrained(
llm_name,
token=HF_TOKEN
)
if not os.path.exists(local_model_dir):
snapshot_download(repo_id=llm_name, local_dir=local_model_dir)

command = ["python3", "-m", "mlx_lm.server", "--model", local_model_dir]

try:
process = subprocess.Popen(
command, stdin=subprocess.PIPE, stderr=subprocess.PIPE, text=True
)
process.stdin.write("y\n")
process.stdin.flush()
print("Model Loaded")
return True #{model_status: "Model Loaded"}
except Exception as e:
print(f"Exception occurred: {str(e)}")
return False #{model_status: f"Exception occurred: {str(e)}"}

load_model(llm_name)

def kill_process():
global process
process.terminate()
time.sleep(2)
if process.poll() is None: # Check if the process has indeed terminated
process.kill() # Force kill if still running

    # NOTE: the LLM now runs inside the mlx_lm.server subprocess, so there is no
    # in-process llm_model left to hand to memory_management.unload_all_models().
print("Model Killed")
return {model_status: "Model Unloaded"}
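Reviewer note: `load_model()` returns as soon as `mlx_lm.server` is spawned, so the first chat request can race the server startup. A minimal health-check sketch (the `wait_for_server` helper and the polling interval are my own; the URL mirrors `openai_api_base` above):

```python
import time
import urllib.error
import urllib.request

def wait_for_server(base_url: str = "http://127.0.0.1:8080/v1", timeout_s: float = 60.0) -> bool:
    """Poll the local mlx_lm.server until it answers HTTP, or give up after timeout_s."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            urllib.request.urlopen(base_url, timeout=2)
            return True
        except urllib.error.HTTPError:
            return True  # any HTTP response (even 404) means the server is listening
        except (urllib.error.URLError, OSError):
            time.sleep(1)  # connection refused: the server is still starting
    return False
```

Called right after `load_model(llm_name)`, this would block the Gradio launch until the endpoint is reachable.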


@torch.inference_mode()
@@ -110,7 +141,6 @@ def resize_without_crop(image, target_width, target_height):
resized_image = pil_image.resize((target_width, target_height), Image.LANCZOS)
return np.array(resized_image)


@torch.inference_mode()
def chat_fn(message: str, history: list, seed:int, temperature: float, top_p: float, max_new_tokens: int) -> str:
np.random.seed(int(seed))
@@ -125,49 +155,26 @@ def chat_fn(message: str, history: list, seed:int, temperature: float, top_p: fl

conversation.append({"role": "user", "content": message})

memory_management.load_models_to_gpu(llm_model)

input_ids = llm_tokenizer.apply_chat_template(
conversation, return_tensors="pt", add_generation_prompt=True).to(llm_model.device)

streamer = TextIteratorStreamer(llm_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

def interactive_stopping_criteria(*args, **kwargs) -> bool:
if getattr(streamer, 'user_interrupted', False):
print('User stopped generation')
return True
else:
return False

stopping_criteria = StoppingCriteriaList([interactive_stopping_criteria])

def interrupter():
streamer.user_interrupted = True
return

generate_kwargs = dict(
input_ids=input_ids,
streamer=streamer,
stopping_criteria=stopping_criteria,
max_new_tokens=max_new_tokens,
do_sample=True,
response = client.chat.completions.create(
model="gpt",
messages=conversation,
temperature=temperature,
top_p=top_p,
# frequency_penalty=freq_penalty,
max_tokens=max_new_tokens,
stream=True,
)

if temperature == 0:
generate_kwargs['do_sample'] = False

Thread(target=llm_model.generate, kwargs=generate_kwargs).start()

outputs = []
for text in streamer:
outputs.append(text)
# print(outputs)
yield "".join(outputs), interrupter

return

    stop = ["<|im_end|>", "<|endoftext|>"]
    partial_message = ""
    for chunk in response:
        if len(chunk.choices) != 0:
            content = chunk.choices[0].delta.content
            # The final streamed chunk carries delta.content == None; guard against
            # concatenating None and skip explicit stop tokens.
            if content and content not in stop:
                partial_message = partial_message + content
            yield partial_message

return partial_message
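Reviewer note: the previous implementation yielded an `interrupter` callback alongside the partial text so the UI could stop generation mid-stream; the OpenAI-client rewrite yields only the text. If the Gradio handlers still expect the two-tuple, a shim along these lines would keep that interface (a sketch; the flag-based cancel is an assumption, not the PR's code):

```python
def stream_with_interrupter(response):
    """Wrap an OpenAI streaming response and expose a cancel callback, mirroring the old API."""
    state = {"stopped": False}

    def interrupter():
        state["stopped"] = True  # remaining chunks are simply discarded

    partial_message = ""
    for chunk in response:
        if state["stopped"]:
            break
        if chunk.choices and chunk.choices[0].delta.content:
            partial_message += chunk.choices[0].delta.content
        yield partial_message, interrupter
```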

@torch.inference_mode()
def post_chat(history):
16 changes: 13 additions & 3 deletions lib_omost/memory_management.py
@@ -1,13 +1,19 @@
import torch
from contextlib import contextmanager

import platform
is_mac = platform.system() == 'Darwin'

high_vram = False
gpu = torch.device('cuda')
if is_mac:
gpu = torch.device('mps')
else:
gpu = torch.device('cuda')
cpu = torch.device('cpu')

torch.zeros((1, 1)).to(gpu, torch.float32)
torch.cuda.empty_cache()

torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()

models_in_gpu = []

@@ -27,6 +33,8 @@ def movable_bnb_model(m):


def load_models_to_gpu(models):
if is_mac: return

global models_in_gpu

if not isinstance(models, (tuple, list)):
@@ -49,11 +57,13 @@ def load_models_to_gpu(models):
print('Load to GPU:', m.__class__.__name__)

models_in_gpu = list(set(models_in_gpu + models))
torch.cuda.empty_cache()
torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()
return


def unload_all_models(extra_models=None):
if is_mac: return

global models_in_gpu

if extra_models is None:
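Reviewer note: the `torch.cuda.empty_cache() if not is_mac else torch.mps.empty_cache()` expression now appears twice; a small helper would keep the backend branching in one place. A sketch (the helper names and the `mps.is_available()` guard are mine, not part of the PR):

```python
import platform
import torch

IS_MAC = platform.system() == "Darwin"

def pick_gpu_device() -> torch.device:
    """Prefer Apple's Metal backend on macOS, CUDA elsewhere, falling back to CPU."""
    if IS_MAC and torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

def empty_cache() -> None:
    """Release cached allocator memory on whichever backend is active."""
    if IS_MAC:
        torch.mps.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()
```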
89 changes: 89 additions & 0 deletions mlx_lm_wrapper.py
@@ -0,0 +1,89 @@
from typing import Any, Callable, Dict, Generator, Optional, Tuple, Union
# from mlx_lm import load, PreTrainedTokenizer, TokenizerWrapper
import mlx.core as mx
import mlx.nn
import mlx_lm
from mlx_lm.utils import generate_step  # token-level streaming generator used below
import transformers as tf
# from transformers import AutoTokenizer, TextIteratorStreamer
from transformers.generation.stopping_criteria import StoppingCriteriaList
from transformers.generation.utils import GenerateOutput
import numpy as np
import torch

def load_mlx_lm(llm_name: str) -> Tuple[mlx.nn.Module, tf.PreTrainedTokenizer]:
llm_model, llm_tokenizer = mlx_lm.load(llm_name)
return MLX_LLM_TransformersWrapper(llm_model, llm_tokenizer), llm_tokenizer

class MLX_LLM_TransformersWrapper(mlx.nn.Module):
    def __init__(self, model: mlx.nn.Module, tokenizer: tf.PreTrainedTokenizer):
        super().__init__()  # mlx.nn.Module subclasses must run the base initializer
        self.model = model
        self.tokenizer = tokenizer

def generate(self,
input_ids: np.ndarray,
streamer: tf.TextIteratorStreamer, #Optional["BaseStreamer"] = None,
# inputs: Optional[torch.Tensor] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
max_new_tokens: int = 100,
do_sample: bool = True,
temperature: float = 1.0,
top_p: float = 1.0,
**kwargs
) -> Union[GenerateOutput, torch.LongTensor]:

if streamer is not None:
streamer.put(input_ids.cpu())

# has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
# return self.__stream_generate(self.model, self.tokenizer, input_ids, max_new_tokens, **kwargs)


def __stream_generate(self,
model: torch.nn.Module,
tokenizer: tf.PreTrainedTokenizer,
prompt: Union[str, np.ndarray],
max_tokens: int = 100,
**kwargs,
) -> Union[str, Generator[str, None, None]]:
"""
A generator producing text based on the given prompt from the model.

Args:
prompt (mx.array): The input prompt.
model (nn.Module): The model to use for generation.
max_tokens (int): The maximum number of tokens to generate.
kwargs: The remaining options get passed to :func:`generate_step`.
See :func:`generate_step` for more details.

Yields:
Generator[Tuple[mx.array, mx.array]]: A generator producing text.
"""
# if not isinstance(tokenizer, TokenizerWrapper):
# tokenizer = TokenizerWrapper(tokenizer)

if isinstance(prompt, str):
prompt_tokens = mx.array(tokenizer.encode(prompt))
else:
prompt_tokens = mx.array(prompt)

detokenizer = tokenizer.detokenizer
detokenizer.reset()
print("generating...")
for (token, prob), n in zip(
generate_step(
prompt=prompt_tokens,
model=model,
temp=kwargs.pop("temperature", 1.0),
**kwargs),
range(max_tokens),
):
print(f"n: {n}")
if token == tokenizer.eos_token_id:
print("EOS")
break
detokenizer.add_token(token)
print(f"Token: {token}")
# Yield the last segment if streaming
yield detokenizer.last_segment

detokenizer.finalize()
yield detokenizer.last_segment
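Reviewer note: for context, this is roughly how `mlx_lm` is driven without the wrapper, using the same model family the PR references (the model id and prompt are illustrative; `mlx_lm.load` and `mlx_lm.generate` are the library's documented entry points):

```python
import mlx_lm

# Returns the MLX model plus a Hugging Face-style tokenizer wrapper.
model, tokenizer = mlx_lm.load("mlx-community/Meta-Llama-3-8B-4bit")

conversation = [{"role": "user", "content": "generate a canvas for a sunset over the sea"}]
prompt = tokenizer.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)

# Non-streaming convenience wrapper around generate_step().
text = mlx_lm.generate(model, tokenizer, prompt=prompt, max_tokens=256)
print(text)
```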
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1,7 +1,8 @@
diffusers==0.28.0
transformers==4.41.1
gradio==4.31.5
bitsandbytes==0.43.1
mlx-lm==0.14.3; sys_platform == 'darwin'
bitsandbytes==0.43.1; sys_platform != 'darwin'
accelerate==0.30.1
protobuf==3.20
opencv-python
@@ -11,3 +12,4 @@ pillow
einops
torch
peft
openai
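
Reviewer note: with the dependencies now split by `sys_platform` marker, a quick sanity check of which backend actually got installed might look like this (a sketch; the prints are illustrative):

```python
import platform

if platform.system() == "Darwin":
    import mlx.core as mx  # pulled in via the mlx-lm marker
    print("MLX default device:", mx.default_device())
else:
    import bitsandbytes  # installed on every platform except macOS
    import torch
    print("CUDA available:", torch.cuda.is_available())
```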