Fix missing required argument for new datasets package #1334

Merged · 2 commits · Aug 12, 2024
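
All of the edits in this PR follow one pattern: recent releases of the Hugging Face datasets package (roughly 2.16 and later — a version threshold assumed here, not stated in the PR) require an explicit trust_remote_code=True when loading a dataset that ships its own loading script, such as competition_math and openai_humaneval. A minimal sketch of the pattern, lifted from the first notebook changed below:

import datasets

seed = 41
# Script-based datasets now need an explicit opt-in to execute their loading code.
data = datasets.load_dataset("competition_math", trust_remote_code=True)
train_data = data["train"].shuffle(seed=seed)
test_data = data["test"].shuffle(seed=seed)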
4 changes: 2 additions & 2 deletions notebook/autogen_chatgpt_gpt4.ipynb
@@ -174,7 +174,7 @@
 "import datasets\n",
 "\n",
 "seed = 41\n",
-"data = datasets.load_dataset(\"competition_math\")\n",
+"data = datasets.load_dataset(\"competition_math\", trust_remote_code=True)\n",
 "train_data = data[\"train\"].shuffle(seed=seed)\n",
 "test_data = data[\"test\"].shuffle(seed=seed)\n",
 "n_tune_data = 20\n",
@@ -390,7 +390,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m[I 2023-08-01 22:38:01,549]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n"
+"\u001B[32m[I 2023-08-01 22:38:01,549]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n"
 ]
 },
 {
6 changes: 3 additions & 3 deletions notebook/autogen_openai_completion.ipynb
@@ -196,7 +196,7 @@
 "import datasets\n",
 "\n",
 "seed = 41\n",
-"data = datasets.load_dataset(\"openai_humaneval\")[\"test\"].shuffle(seed=seed)\n",
+"data = datasets.load_dataset(\"openai_humaneval\", trust_remote_code=True)[\"test\"].shuffle(seed=seed)\n",
 "n_tune_data = 20\n",
 "tune_data = [\n",
 " {\n",
@@ -444,8 +444,8 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m[I 2023-07-30 04:19:08,150]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n",
-"\u001b[32m[I 2023-07-30 04:19:08,153]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n"
+"\u001B[32m[I 2023-07-30 04:19:08,150]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n",
+"\u001B[32m[I 2023-07-30 04:19:08,153]\u001B[0m A new study created in memory with name: optuna\u001B[0m\n"
 ]
 },
 {
2 changes: 1 addition & 1 deletion notebook/research/autogen_code.ipynb
@@ -152,7 +152,7 @@
 "import datasets\n",
 "\n",
 "seed = 41\n",
-"data = datasets.load_dataset(\"openai_humaneval\")[\"test\"].shuffle(seed=seed)\n",
+"data = datasets.load_dataset(\"openai_humaneval\", trust_remote_code=True)[\"test\"].shuffle(seed=seed)\n",
 "data = data.select(range(len(data))).rename_column(\"prompt\", \"definition\").remove_columns([\"task_id\", \"canonical_solution\"])"
 ]
 },
2 changes: 1 addition & 1 deletion notebook/research/math_level5counting.ipynb
@@ -121,7 +121,7 @@
 "import datasets\n",
 "\n",
 "seed = 41\n",
-"data = datasets.load_dataset(\"competition_math\")\n",
+"data = datasets.load_dataset(\"competition_math\", trust_remote_code=True)\n",
 "train_data = data[\"train\"].shuffle(seed=seed)\n",
 "test_data = data[\"test\"].shuffle(seed=seed)\n",
 "n_tune_data = 20\n",
40 changes: 18 additions & 22 deletions notebook/tune_huggingface.ipynb
@@ -112,9 +112,7 @@
 ]
 }
 ],
-"source": [
-"raw_dataset = datasets.load_dataset(\"glue\", TASK)"
-]
+"source": "raw_dataset = datasets.load_dataset(\"glue\", TASK, trust_remote_code=True)"
 },
 {
 "cell_type": "code",
@@ -425,9 +423,7 @@
 "execution_count": 14,
 "metadata": {},
 "outputs": [],
-"source": [
-"metric = datasets.load_metric(\"glue\", TASK)"
-]
+"source": "metric = datasets.load_metric(\"glue\", TASK, trust_remote_code=True)"
 },
 {
 "cell_type": "code",
@@ -646,15 +642,15 @@
 "def train_distilbert(config: dict):\n",
 "\n",
 " # Load CoLA dataset and apply tokenizer\n",
-" cola_raw = datasets.load_dataset(\"glue\", TASK)\n",
+" cola_raw = datasets.load_dataset(\"glue\", TASK, trust_remote_code=True)\n",
 " cola_encoded = cola_raw.map(tokenize, batched=True)\n",
 " train_dataset, eval_dataset = cola_encoded[\"train\"], cola_encoded[\"validation\"]\n",
 "\n",
 " model = AutoModelForSequenceClassification.from_pretrained(\n",
 " MODEL_CHECKPOINT, num_labels=NUM_LABELS\n",
 " )\n",
 "\n",
-" metric = datasets.load_metric(\"glue\", TASK)\n",
+" metric = datasets.load_metric(\"glue\", TASK, trust_remote_code=True)\n",
 " def compute_metrics(eval_pred):\n",
 " predictions, labels = eval_pred\n",
 " predictions = np.argmax(predictions, axis=1)\n",
@@ -847,7 +843,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m Reusing dataset glue (/home/ec2-user/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
 " 0%| | 0/9 [00:00<?, ?ba/s]\n",
 " 22%|██▏ | 2/9 [00:00<00:00, 19.41ba/s]\n",
 " 56%|█████▌ | 5/9 [00:00<00:00, 20.98ba/s]\n",
@@ -856,25 +852,25 @@
 "100%|██████████| 2/2 [00:00<00:00, 42.79ba/s]\n",
 " 0%| | 0/2 [00:00<?, ?ba/s]\n",
 "100%|██████████| 2/2 [00:00<00:00, 41.48ba/s]\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
 ]
 },
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m To disable this warning, you can either:\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m To disable this warning, you can either:\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Avoid using `tokenizers` before the fork if possible\n",
-"\u001b[2m\u001b[36m(pid=11344)\u001b[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m To disable this warning, you can either:\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m \t- Avoid using `tokenizers` before the fork if possible\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m To disable this warning, you can either:\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m \t- Avoid using `tokenizers` before the fork if possible\n",
+"\u001B[2m\u001B[36m(pid=11344)\u001B[0m \t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
 ]
 }
 ],
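
Note that this notebook also passes the flag to datasets.load_metric. As an aside not covered by the diff: load_metric is deprecated in recent datasets releases in favor of the standalone evaluate package, so a later migration of the metric code might look roughly like the following sketch (the evaluate-based API here is an assumption about future cleanup, not part of this PR):

import numpy as np
import evaluate  # assumed follow-up migration; not part of this PR

# CoLA is scored with Matthews correlation under the GLUE metric.
metric = evaluate.load("glue", "cola")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)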
4 changes: 2 additions & 2 deletions test/autogen/oai/test_completion.py
@@ -187,7 +187,7 @@ def test_humaneval(num_samples=1):
 )

 seed = 41
-data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed)
+data = datasets.load_dataset("openai_humaneval", trust_remote_code=True)["test"].shuffle(seed=seed)
 n_tune_data = 20
 tune_data = [
 {
@@ -334,7 +334,7 @@ def test_math(num_samples=-1):
 return

 seed = 41
-data = datasets.load_dataset("competition_math")
+data = datasets.load_dataset("competition_math", trust_remote_code=True)
 train_data = data["train"].shuffle(seed=seed)
 test_data = data["test"].shuffle(seed=seed)
 n_tune_data = 20
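
The tests pass trust_remote_code=True unconditionally. If the suite ever needs to keep running against an older datasets release that predates the argument, a guarded call along these lines could support both (the 2.16.0 cutoff and the helper name are assumptions for illustration, not something this PR introduces):

from packaging import version
import datasets

def load_dataset_compat(path, **kwargs):
    # Hypothetical helper: only pass trust_remote_code where the installed
    # datasets release is assumed to understand it (2.16.0 and newer).
    if version.parse(datasets.__version__) >= version.parse("2.16.0"):
        kwargs.setdefault("trust_remote_code", True)
    return datasets.load_dataset(path, **kwargs)

data = load_dataset_compat("competition_math")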