-
Notifications
You must be signed in to change notification settings - Fork 14
/
config.py
21 lines (16 loc) · 1.04 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
"""
tokenizer 和 训练数据配置文件
"""
LANGUAGE = "enzh" # [en, zh, enzh]
# https://huggingface.co/baichuan-inc/Baichuan-7B/blob/main/tokenizer.model
# TOKENIZER_MODEL = "tokenizers/baichuan/tokenizer.model" # the baichuan sentencepiece tokenizer model
# TOKENIZER_BIN = "tokenizers/baichuan/tokenizer.bin" # binary version of the tokenizer for inference in C
# https://huggingface.co/ziqingyang/chinese-llama-2-7b/blob/main/tokenizer.model
# TOKENIZER_MODEL = "tokenizers/llama2enzh/tokenizer.model" # the llama2.
# TOKENIZER_BIN = "tokenizers/llama2enzh/tokenizer.bin" # binary version of the tokenizer for inference in C
# base llama2,
# TOKENIZER_MODEL = "tokenizers/llama2en/tokenizer.model" # the llama2-enzh.
# TOKENIZER_BIN = "tokenizers/llama2en/tokenizer.bin" # binary version of the tokenizer for inference in C
#自定义中文词表(红楼梦.txt)
TOKENIZER_MODEL = "tokenizers/custom_tokenizer/meng.model" # the llama2-zh.
TOKENIZER_BIN = "tokenizers/custom_tokenizer/meng.bin" # binary version of the tokenizer for inference in C