forked from jjhw/llama.openvino
quantize.py
import os
import openvino as ov
from transformers import AutoConfig, AutoTokenizer
import nncf
from pathlib import Path
import argparse
import shutil
from utils.gptq import is_gptq

parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('-h',
                    '--help',
                    action='help',
                    help='Show this help message and exit.')
parser.add_argument('-m',
                    '--model_id',
                    default='ir_model',
                    required=False,
                    type=str,
                    help='Path to the original IR model')
parser.add_argument('-o',
                    '--output',
                    default='./compressed_model',
                    required=False,
                    type=str,
                    help='Path to save the compressed IR model')
parser.add_argument('-p',
                    '--precision',
                    required=False,
                    default="int4",
                    type=str,
                    choices=["int8", "int4"],
                    help='Weight compression precision: int8 or int4')
args = parser.parse_args()
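
# Example invocations (directory names are illustrative), assuming the IR
# model was already exported locally with export.py:
#   python quantize.py -m ir_model -o compressed_model -p int4
#   python quantize.py -m ir_model -o compressed_model -p int8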
compressed_model_path = Path(args.output)
original_model_path = Path(args.model_id)
compressed_model_path.mkdir(parents=True, exist_ok=True)

# GPTQ checkpoints already contain quantized weights, so they must not be
# compressed a second time with NNCF.
model_config = AutoConfig.from_pretrained(
    args.model_id, trust_remote_code=True)
gptq_applied = is_gptq(model_config)
print(" --- loading model --- ")
if not orignal_model_path.exists():
print(" Please run 'export.py' to export IR model to local ")
else:
ov_model = ov.Core().read_model(orignal_model_path / "openvino_model.xml")
print(" --- compressing model --- ")
if args.precision == "int4" and not gptq_applied:
    print(" --- exporting int4 model --- ")
    # INT4 asymmetric weight compression: 80% of the weights are packed to
    # INT4 in groups of 128 elements; the remaining 20% stay in INT8.
    compressed_model = nncf.compress_weights(ov_model,
                                             mode=nncf.CompressWeightsMode.INT4_ASYM,
                                             group_size=128,
                                             ratio=0.8)
elif args.precision == "int8" and not gptq_applied:
    print(" --- exporting int8 model --- ")
    # Default NNCF weight compression (INT8).
    compressed_model = nncf.compress_weights(ov_model)
else:
    raise RuntimeError(
        "Cannot quantize a GPTQ model: its weights are already compressed")
ov.save_model(compressed_model, compressed_model_path / "openvino_model.xml")
shutil.copy(original_model_path / 'config.json',
            compressed_model_path / 'config.json')
print(" --- exporting tokenizer --- ")
tokenizer = AutoTokenizer.from_pretrained(args.model_id)
tokenizer.save_pretrained(compressed_model_path)
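
# A minimal loading sketch (the device name is illustrative): after this
# script finishes, the compressed IR can be read back with the standard
# OpenVINO runtime API, e.g.:
#
#   core = ov.Core()
#   model = core.read_model(compressed_model_path / "openvino_model.xml")
#   compiled = core.compile_model(model, "CPU")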