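"""Quick latency test for a local Llama 2 7B chat checkpoint: build the
model, run a few warmup generations, then time one profiled completion."""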
import os
import sys
import time

# Be sure we're using the user's site-packages instead of root's.
# If re-enabled, this must run before the third-party imports below.
# INTRINSIC_SITE_PKGS = '/home/intrinsic/.local/lib/python3.8/site-packages'
# if INTRINSIC_SITE_PKGS not in sys.path:
#     print("using intrinsic site-packages")
#     sys.path.insert(0, INTRINSIC_SITE_PKGS)

import torch

import llama
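# Machine-specific paths to the local Llama 2 checkpoint and tokenizer.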
ROOT_DIR = "/DATA1/LLM_repo/steps/llama"
CKPT_DIR = os.path.join(ROOT_DIR, "llama-2-7b-chat")
TOKENIZER_PATH = os.path.join(ROOT_DIR, "tokenizer.model")
MAX_SEQ_LEN = 2048
print("loading model...")
model = llama.Llama.build(
    ckpt_dir=CKPT_DIR,
    tokenizer_path=TOKENIZER_PATH,
    max_seq_len=MAX_SEQ_LEN,
    max_batch_size=1,
)
print("model loaded.")
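# Warmup generations amortize one-time costs (CUDA context creation,
# kernel selection, allocator growth) so the measured run below reflects
# steady-state latency.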
print("Begin warmup executions...")
for i in range(3):
    start = time.time()
    completion = model.text_completion(prompts=["What is the weather in New York?"], max_gen_len=100)
    duration = time.time() - start
    print(f"warmup {i + 1} executed in {duration:.2f}s")
print("Executing the measured run...")
start = time.time()
# Bracket the region of interest so an external profiler (e.g. nsys with
# --capture-range=cudaProfilerApi) records only this generation.
torch.cuda.cudart().cudaProfilerStart()
completion = model.text_completion(prompts=["What is the weather in Washington DC?"], max_gen_len=100)
torch.cuda.cudart().cudaProfilerStop()
duration = time.time() - start
print(f"inference executed in {duration:.2f}s")
print(completion[0]["generation"])