Composable SFT #28

Open · wants to merge 6 commits into base: master
241 changes: 203 additions & 38 deletions scripts/lang_adapt/madx_run_clm.py
@@ -1,7 +1,11 @@
"""
Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py
"""

#TODO: hailey composable sft impl. (this comment shouldn't make it into main!)
# use the LT Trainer class from composable-sft
# see how this interacts with adapters
# see how this interacts with the embeddings being trained
# todo: computations for changing
import logging
import math
import os
@@ -41,6 +45,10 @@
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

from sft import (
LotteryTicketSparseFineTuner,
)


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.11.0")
@@ -53,6 +61,8 @@
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

trainer_class_mapping = {'emb': Trainer, 'emb-and-adpt': AdapterTrainer, 'emb-then-adpt': AdapterTrainer, 'emb-and-sft': LotteryTicketSparseFineTuner}
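# The --lang_adapt_strategies value selects the trainer implementation at runtime;
# 'emb-and-sft' maps to LotteryTicketSparseFineTuner from the composable-sft package.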


@dataclass
class ModelArguments:
@@ -105,7 +115,7 @@ class ModelArguments:
)
lang_adapt_strategies: str = field(
default="",
metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt'"},
metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt', 'emb-and-sft'"},
)
embedding_strategies: str = field(
default="",
@@ -227,6 +237,92 @@ class ParamEfficientArguments(MultiLingAdapterArguments):
default='lora',
metadata={"help": "If True, add LoRA to the output MLP weights of a model. Defaults to False."},
)
### composable SFT unique args ###
train_sft: bool = field(
default=False, metadata={"help": "Whether to train with sparse fine-tuning (composable SFT)."}
)
full_ft_max_steps_per_iteration: Optional[int] = field(
default=5000,
metadata={
"help": "Maximum number of steps per parameter selection iteration during full fine-tuning."},
)
sparse_ft_max_steps_per_iteration: Optional[int] = field(
default=None,
metadata={
"help": "Maximum of steps per sparse fine-tuning iteration. Overriden by `--max_steps` if set."},
)
full_ft_min_steps_per_iteration: Optional[int] = field(
default=None,
metadata={
"help": "Minimum number of steps per parameter selection iteration during full fine-tuning."
},
)
sparse_ft_min_steps_per_iteration: Optional[int] = field(
default=None,
metadata={
"help": "Minimum of steps per parameter selection iteration during sparse fine-tuning."
},
)
full_ft_max_epochs_per_iteration: Optional[int] = field(
default=None,
metadata={
"help": "Maximum number of epochs per parameter selection iteration during full fine-tuning."
},
)
sparse_ft_max_epochs_per_iteration: Optional[int] = field(
default=None,
metadata={
"help": "Maximum number of epochs per parameter selection iteration during sparse fine-tuning."
},
)
n_ft_iterations: Optional[int] = field(
default=1,
metadata={
"help": "The number of parameter selection iterations during fine-tuning."},
)
ft_params_proportion: Optional[float] = field(
default=None,
metadata={
"help": "The proportion of model parameters for which to learn non-zero differences during fine-tuning.\
Will override `ft_params_num` if both are set."},
)
ft_params_num: Optional[int] = field(
default=None,
metadata={
"help": "The number of model parameters for which to learn non-zero differences during fine-tuning. \
Defaults to a number equivalent to the number of adapter (reduction factor 16) parameters."},
)
freeze_head: bool = field(
default=False,
metadata={"help": "Whether to freeze language modeling head."},
)
untie_embeddings: bool = field(
default=False,
metadata={"help": "Whether to untie input and output embeddings."},
)
freeze_layer_norm: bool = field(
default=True,
metadata={"help": "Whether to freeze layer normalisation parameters."},
) # changed from False to True
full_l1_reg: Optional[float] = field(
default=0.1, metadata={"help": "Coefficient of L1 regularisation during full fine-tuning."}
) # changed from 0.0 to 0.1
sparse_l1_reg: Optional[float] = field(
default=0.1, metadata={"help": "Coefficient of L1 regularisation during sparse fine-tuning."}
) # changed from 0.0 to 0.1
apply_reg_to_sparse_only: bool = field(
default=False,
metadata={
"help": "If true, only applies regularisation to those parameters which are eligible for sparse fine-tuning."
},
)
sparse_ft_method: Optional[str] = field(
default='LotteryTicket',
metadata={"help": 'Sparse fine-tuning method. Can be LotteryTicket or Random.'},
)


### end of composable SFT unique args ###
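# Rough sketch of the LT-SFT schedule these arguments configure, following the
# composable-sft paper (https://arxiv.org/abs/2110.07560); the step numbering is
# illustrative, not taken from this repo:
#   1. full fine-tune for up to `full_ft_max_steps_per_iteration` steps
#      (optionally with `full_l1_reg`),
#   2. rank parameters by how far they moved from the pretrained values and keep
#      the top `ft_params_num` (or `ft_params_proportion`) as the sparse mask,
#   3. rewind to the pretrained weights and fine-tune only the masked entries for
#      up to `sparse_ft_max_steps_per_iteration` steps (with `sparse_l1_reg`),
#   repeated `n_ft_iterations` times.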

def load_tokenizer(model_args):
tokenizer_kwargs = {
@@ -552,7 +648,7 @@ def get_adapter_config(adapter_args, model_args):
raise ValueError(
"Adapters can only be loaded in adapters training mode."
"Use --train_adapter to enable adapter training"
)
)

print(f"✅ Use Embedding Strategy: {model_args.embedding_strategies}")

Expand Down Expand Up @@ -602,33 +698,56 @@ def zero_grad(grad):
return grad

embedding_layer.weight.register_hook(lambda grad: zero_grad(grad))

if adapter_args.train_sft: # Hailey: might need to put some more args here.
lm_head = model.lm_head

if adapter_args.freeze_head:
for param in lm_head.parameters():
param.requires_grad = False
# if adapter_args.load_sft:
# model.load_sft(adapter_args.load_sft)
if adapter_args.freeze_layer_norm:
for name, param in model.named_parameters():
if "layer_norm" in name or "ln_f" in name:
param.requires_grad = False


#if model_args.embedding_strategies == "overlap-replace":
# if not tokenizer.name_or_path == model_args.model_name_or_path:
# orig_tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
# model.add_embeddings('lng_emb', tokenizer, reference_embedding='default', reference_tokenizer=orig_tokenizer )
# model._active_embedding = "lng_emb"
# model.delete_embeddings('default')
# model.tie_weights()
#elif model_args.embedding_strategies == "replace":
# model.resize_token_embeddings(len(tokenizer))

trainable_params = 0
frozen_params = 0
emb_params = 0
for name, param in model.named_parameters():
if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name:
param.requires_grad = True
emb_params += param.numel()
elif model_args.lang_adapt_strategies == "emb":
param.requires_grad = False

if not param.requires_grad:
print(f"🥶 Frozen layer '{name}'")
frozen_params += param.numel()
else:
print(f"🚀 Trainable layer '{name}'")
trainable_params += param.numel()
if adapter_args.train_adapter:
for name, param in model.named_parameters():
if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name:
param.requires_grad = True
emb_params += param.numel()
elif model_args.lang_adapt_strategies == "emb":
param.requires_grad = False

if not param.requires_grad:
print(f"🥶 Frozen layer '{name}'")
frozen_params += param.numel()
else:
print(f"🚀 Trainable layer '{name}'")
trainable_params += param.numel()
elif adapter_args.train_sft:
for name, param in model.named_parameters():
if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name:
param.requires_grad = True
emb_params += param.numel()
elif model_args.lang_adapt_strategies == "emb":
param.requires_grad = True

if not param.requires_grad:
print(f"🥶 Frozen layer '{name}'")
frozen_params += param.numel()
elif "word_embeddings" in name or "wte" in name or "wpe" in name and param.requires_grad:
print(f"🚀 Trainable layer '{name}'")
trainable_params += param.numel()
else:
print(f"🚀 Sparsely Trainable layer '{name}'")
trainable_params += param.numel()


print(f"Total frozen parameters: {frozen_params}")
@@ -652,8 +771,12 @@ def main():
model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

training_args.data_dir = f'{training_args.output_dir}'

if adapter_args.train_sft and training_args.max_steps:
# override sparse_ft_max_steps_per_iteration if training_args.max_steps is set
adapter_args.sparse_ft_max_steps_per_iteration = training_args.max_steps

assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt', 'lora')
assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt', 'lora', 'emb-and-sft')
assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace')

# Setup logging
@@ -711,11 +834,50 @@ def main():
if training_args.do_eval:
eval_dataset = lm_datasets["validation"]

# compute K value for SFT (https://arxiv.org/pdf/2110.07560.pdf)
if adapter_args.train_sft and not adapter_args.train_adapter:
# override the K value if adapter_reduction_factor is set
if adapter_args.adapter_reduction_factor:
logger.info(f"Overriding K value for SFT with adapter_reduction_factor: {adapter_args.adapter_reduction_factor}")
# calc appropriate K value
num_layers = len(model.transformer.h)
sft_k = num_layers * model.transformer.word_embeddings.weight.shape[1] ** 2 // adapter_args.adapter_reduction_factor * 2 #* 2 for the up and down proj

sft_k += model.transformer.word_embeddings.weight.shape[1] ** 2 // 2 # inv adapters. TODO: if we use other adapter configs, this breaks (code works, but K no longer matches adapter budget)

adapter_args.ft_params_num = int(sft_k)
logger.info(f"K value for SFT is {adapter_args.ft_params_num}")

if adapter_args.train_adapter:
trainable_params = 0
for name, param in model.named_parameters():
if "adapter" in name:
print(f"🚀 Trainable layer '{name}'")
trainable_params += param.numel()
logger.info(f"adapter elements: {trainable_params}")

num_layers = len(model.transformer.h)
sft_k = num_layers * model.transformer.word_embeddings.weight.shape[1] ** 2 // adapter_args.adapter_reduction_factor * 2 #* 2 for the up and down proj

sft_k += model.transformer.word_embeddings.weight.shape[1] ** 2 // 2 # inv adapters. TODO: if we use other adapter configs, this breaks (code works, but K no longer matches adapter budget)

adapter_args.ft_params_num = int(sft_k)
logger.info(f"K value for SFT is {adapter_args.ft_params_num}")

# only needed for composable sft
maskable_params = [
n for n, p in model.named_parameters()
if n.startswith(model.base_model_prefix) and p.requires_grad and not
("wte" in n or "wpe" in n or "word_embedding" in n or "lm_head" in n)
]
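# maskable_params limits the sparse mask to transformer-body weights: embedding
# matrices (wte/wpe/word_embeddings) and the LM head are excluded here because
# they are handled by the embedding strategy rather than the SFT mask.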

# Initialize our Trainer
trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer
trainer_class = trainer_class_mapping[model_args.lang_adapt_strategies]
trainer = trainer_class(
model=model,
args=training_args,
**{'sft_args': adapter_args} if 'sft' in model_args.lang_adapt_strategies else {},
**{'maskable_params': maskable_params} if 'sft' in model_args.lang_adapt_strategies else {},
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
tokenizer=tokenizer,
@@ -728,7 +890,7 @@


# print("Embeddings at start of run:", model.get_input_embeddings().weight[250880:,:]) # get original weight for embedding layer
# orig_embeddings = model.get_input_embeddings().weight.detach().clone() # clone original weight for embedding layer
orig_embeddings = model.get_input_embeddings().weight.detach().clone() # clone original weight for embedding layer
# Training
if training_args.do_train:
checkpoint = None
@@ -763,17 +925,20 @@
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


if 'sft' in model_args.lang_adapt_strategies:
trainer.sft().save(f'{training_args.output_dir}/')
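# A minimal sketch of re-applying the saved difference later with the
# composable-sft library; the SFT class and .apply() follow that repo's README
# and are an assumption here, not something this script exercises:
#   from sft import SFT
#   lang_sft = SFT(f'{training_args.output_dir}/')
#   lang_sft.apply(model)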

# uncomment to test whether extending vocab gradient masking is working correctly.
# if model_args.embedding_strategies == "extend":
# print("Unsliced, post-training:", model.get_input_embeddings().weight) # get updated weight
# if not torch.equal(orig_embeddings[:250880, :], model.get_input_embeddings().weight[:250880, :]):
# raise ValueError("embedding layer is updated where it shouldn't....")

# if torch.equal(orig_embeddings[250880:, :], model.get_input_embeddings().weight[250880:, :]):
# print("original embeddings:", orig_embeddings[250880:, :])
# print("updated embeddings:", model.get_input_embeddings().weight[250880:, :])
# raise ValueError("embedding layer is not updated where it should....")
if model_args.embedding_strategies == "extend":
print("Unsliced, post-training:", model.get_input_embeddings().weight) # get updated weight
if not torch.equal(orig_embeddings[:250880, :], model.get_input_embeddings().weight[:250880, :]):
raise ValueError("embedding layer is updated where it shouldn't....")

if torch.equal(orig_embeddings[250880:, :], model.get_input_embeddings().weight[250880:, :]):
print("original embeddings:", orig_embeddings[250880:, :])
print("updated embeddings:", model.get_input_embeddings().weight[250880:, :])
raise ValueError("embedding layer is not updated where it should....")


# Evaluation
57 changes: 57 additions & 0 deletions scripts/lang_adapt/run_clm_sft.sh
@@ -0,0 +1,57 @@
# axis
LANG="th"
MAX_TRAIN_SAMPLES=100000
BIGS_MODEL="bigscience/bloom-350m"
ADPT_REDUCTION_FACTOR=16
ADPT_STRATEGY="emb-and-sft"
EMB_STRATEGY="replace"

tokenizer_dir=./tokenizers/tok_bloom-350m_th_oscar_10000samples_5000vocab_extend/
cache_dir="./cache"
output_dir="./sft_testing_save"
logging_dir="./sft_testing_save"
mkdir -p $output_dir
mkdir -p $logging_dir

CUDA_VISIBLE_DEVICES=5 python madx_run_clm.py \
--seed 0 \
--fp16 \
--model_name_or_path $BIGS_MODEL \
--tokenizer_name $tokenizer_dir \
--dataset_name oscar \
--dataset_config_name "unshuffled_deduplicated_$LANG" \
--cache_dir $cache_dir \
--logging_dir $logging_dir \
--report_to "tensorboard" \
--learning_rate 0.001 \
--do_train \
--do_eval \
--train_sft \
--load_best_model_at_end \
--output_dir $output_dir \
--preprocessing_num_workers 8 \
--overwrite_output_dir \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 8 \
--per_device_eval_batch_size 1 \
--eval_accumulation_steps 8 \
--eval_steps 500 \
--evaluation_strategy "steps" \
--max_eval_samples 5000 \
--logging_steps 100 \
--save_steps 5000 \
--save_strategy "steps" \
--max_train_samples $MAX_TRAIN_SAMPLES \
--max_steps 50000 \
--lang_adapt_strategies "$ADPT_STRATEGY" \
--embedding_strategies "$EMB_STRATEGY" \
--adapter_reduction_factor $ADPT_REDUCTION_FACTOR \
--language $LANG \
--full_ft_max_steps_per_iteration 2500 \
--sparse_ft_max_steps_per_iteration 10000 \
--n_ft_iterations 1
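
# Assumes the composable-sft package (https://github.com/cambridgeltl/composable-sft)
# is installed alongside adapter-transformers, and that $tokenizer_dir points to an
# existing tokenizer; adjust CUDA_VISIBLE_DEVICES and the cache/output paths to your setup.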




