@ARTICLE{Wang2022-px,
title = "Self-Consistency Improves Chain of Thought Reasoning in
Language Models",
author = "Wang, Xuezhi and Wei, Jason and Schuurmans, Dale and Le, Quoc
and Chi, Ed and Narang, Sharan and Chowdhery, Aakanksha and
Zhou, Denny",
journal = "arXiv [cs.CL]",
abstract = "Chain-of-thought prompting combined with pre-trained large
language models has achieved encouraging results on complex
reasoning tasks. In this paper, we propose a new decoding
strategy, self-consistency, to replace the naive greedy
decoding used in chain-of-thought prompting. It first samples
a diverse set of reasoning paths instead of only taking the
greedy one, and then selects the most consistent answer by
marginalizing out the sampled reasoning paths.
Self-consistency leverages the intuition that a complex
reasoning problem typically admits multiple different ways of
thinking leading to its unique correct answer. Our extensive
empirical evaluation shows that self-consistency boosts the
performance of chain-of-thought prompting with a striking
margin on a range of popular arithmetic and commonsense
reasoning benchmarks, including GSM8K (+17.9\%), SVAMP
(+11.0\%), AQuA (+12.2\%), StrategyQA (+6.4\%) and
ARC-challenge (+3.9\%).",
month = "21~" # mar,
year = 2022,
url = "http://arxiv.org/abs/2203.11171",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "MiniChain;o1"
}
@ARTICLE{Wei2022-uj,
title = "Chain-of-thought prompting elicits reasoning in large
language models",
author = "Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma,
Maarten and Ichter, Brian and Xia, Fei and Chi, Ed and Le,
Quoc and Zhou, Denny",
editor = "Koyejo, S and Mohamed, S and Agarwal, A and Belgrave, D and
Cho, K and Oh, A",
journal = "arXiv [cs.CL]",
pages = "24824--24837",
abstract = "We explore how generating a chain of thought -- a series of
intermediate reasoning steps -- significantly improves the
ability of large language models to perform complex
reasoning. In particular, we show how such reasoning
abilities emerge naturally in sufficiently large language
models via a simple method called chain of thought prompting,
where a few chain of thought demonstrations are provided as
exemplars in prompting. Experiments on three large language
models show that chain of thought prompting improves
performance on a range of arithmetic, commonsense, and
symbolic reasoning tasks. The empirical gains can be
striking. For instance, prompting a 540B-parameter language
model with just eight chain of thought exemplars achieves
state of the art accuracy on the GSM8K benchmark of math word
problems, surpassing even finetuned GPT-3 with a verifier.",
month = "27~" # jan,
year = 2022,
url = "https://proceedings.neurips.cc/paper_files/paper/2022/file/9d5609613524ecf4f15af0f7b31abca4-Paper-Conference.pdf",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "MiniChain;o1"
}
@ARTICLE{Hendrycks2021-jr,
title = "Measuring Mathematical Problem Solving With the {MATH}
Dataset",
author = "Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and
Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn
and Steinhardt, Jacob",
journal = "arXiv [cs.LG]",
abstract = "Many intellectual endeavors require mathematical problem
solving, but this skill remains beyond the capabilities of
computers. To measure this ability in machine learning
models, we introduce MATH, a new dataset of 12,500
challenging competition mathematics problems. Each problem in
MATH has a full step-by-step solution which can be used to
teach models to generate answer derivations and explanations.
To facilitate future research and increase accuracy on MATH,
we also contribute a large auxiliary pretraining dataset
which helps teach models the fundamentals of mathematics.
Even though we are able to increase accuracy on MATH, our
results show that accuracy remains relatively low, even with
enormous Transformer models. Moreover, we find that simply
increasing budgets and model parameter counts will be
impractical for achieving strong mathematical reasoning if
scaling trends continue. While scaling Transformers is
automatically solving most other text-based tasks, scaling is
not currently solving MATH. To have more traction on
mathematical problem solving we will likely need new
algorithmic advancements from the broader research community.",
month = "5~" # mar,
year = 2021,
url = "http://arxiv.org/abs/2103.03874",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "MiniChain;o1"
}
@ARTICLE{Radford2019-lx,
title = "Language models are unsupervised multitask learners",
author = "Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David
and Amodei, Dario and Sutskever, Ilya",
journal = "OpenAI Blog",
publisher = "OpenAI",
volume = 1,
number = 8,
pages = 9,
abstract = "Our largest model, GPT-2, is a 1.5B parameter Transformer
that achieves state of the art results on 7 out of 8 tested
language modeling datasets in a zero-shot setting but still
underfits WebText.",
year = 2019,
url = "https://www.ceid.upatras.gr/webpages/faculty/zaro/teaching/alg-ds/PRESENTATIONS/PAPERS/2019-Radford-et-al_Language-Models-Are-Unsupervised-Multitask-%20Learners.pdf",
keywords = "Transformers;o1"
}
@ARTICLE{Hendrycks2021-tt,
title = "Measuring Massive Multitask Language Understanding",
author = "Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou,
Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob",
journal = "arXiv [cs.CY]",
year = 2021,
url = "http://arxiv.org/abs/2009.03300",
archivePrefix = "arXiv",
primaryClass = "cs.CY",
keywords = "zephyr;o1"
}
@ARTICLE{Ouyang2022-ut,
title = "Training language models to follow instructions with human
feedback",
author = "Ouyang, Long and Wu, Jeff and Jiang, Xu and Almeida, Diogo
and Wainwright, Carroll L and Mishkin, Pamela and Zhang,
Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex
and Schulman, John and Hilton, Jacob and Kelton, Fraser and
Miller, Luke and Simens, Maddie and Askell, Amanda and
Welinder, Peter and Christiano, Paul and Leike, Jan and Lowe,
Ryan",
editor = "Koyejo, S and Mohamed, S and Agarwal, A and Belgrave, D and
Cho, K and Oh, A",
journal = "arXiv [cs.CL]",
pages = "27730--27744",
abstract = "Making language models bigger does not inherently make them
better at following a user's intent. For example, large
language models can generate outputs that are untruthful,
toxic, or simply not helpful to the user. In other words,
these models are not aligned with their users. In this paper,
we show an avenue for aligning language models with user
intent on a wide range of tasks by fine-tuning with human
feedback. Starting with a set of labeler-written prompts and
prompts submitted through the OpenAI API, we collect a
dataset of labeler demonstrations of the desired model
behavior, which we use to fine-tune GPT-3 using supervised
learning. We then collect a dataset of rankings of model
outputs, which we use to further fine-tune this supervised
model using reinforcement learning from human feedback. We
call the resulting models InstructGPT. In human evaluations
on our prompt distribution, outputs from the 1.3B parameter
InstructGPT model are preferred to outputs from the 175B
GPT-3, despite having 100x fewer parameters. Moreover,
InstructGPT models show improvements in truthfulness and
reductions in toxic output generation while having minimal
performance regressions on public NLP datasets. Even though
InstructGPT still makes simple mistakes, our results show
that fine-tuning with human feedback is a promising direction
for aligning language models with human intent.",
month = "4~" # mar,
year = 2022,
url = "https://proceedings.neurips.cc/paper_files/paper/2022/file/b1efde53be364a73914f58805a001731-Paper-Conference.pdf",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "zephyr;o1"
}
@ARTICLE{Hubert2021-ju,
title = "Learning and planning in complex action spaces",
author = "Hubert, Thomas and Schrittwieser, Julian and Antonoglou,
Ioannis and Barekatain, Mohammadamin and Schmitt, Simon and
Silver, David",
journal = "arXiv [cs.LG]",
abstract = "Many important real-world problems have action spaces that
are high-dimensional, continuous or both, making full
enumeration of all possible actions infeasible. Instead, only
small subsets of actions can be sampled for the purpose of
policy evaluation and improvement. In this paper, we propose
a general framework to reason in a principled way about
policy evaluation and improvement over such sampled action
subsets. This sample-based policy iteration framework can in
principle be applied to any reinforcement learning algorithm
based upon policy iteration. Concretely, we propose Sampled
MuZero, an extension of the MuZero algorithm that is able to
learn in domains with arbitrarily complex action spaces by
planning over sampled actions. We demonstrate this approach
on the classical board game of Go and on two continuous
control benchmark domains: DeepMind Control Suite and
Real-World RL Suite.",
month = "13~" # apr,
year = 2021,
url = "http://arxiv.org/abs/2104.06303",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Brown2020-on,
title = "Language Models are Few-Shot Learners",
author = "Brown, Tom B and Mann, Benjamin and Ryder, Nick and Subbiah,
Melanie and Kaplan, Jared and Dhariwal, Prafulla and
Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and
Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel
and Krueger, Gretchen and Henighan, Tom and Child, Rewon and
Ramesh, Aditya and Ziegler, Daniel M and Wu, Jeffrey and
Winter, Clemens and Hesse, Christopher and Chen, Mark and
Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess,
Benjamin and Clark, Jack and Berner, Christopher and
McCandlish, Sam and Radford, Alec and Sutskever, Ilya and
Amodei, Dario",
journal = "arXiv [cs.CL]",
abstract = "Recent work has demonstrated substantial gains on many NLP
tasks and benchmarks by pre-training on a large corpus of
text followed by fine-tuning on a specific task. While
typically task-agnostic in architecture, this method still
requires task-specific fine-tuning datasets of thousands or
tens of thousands of examples. By contrast, humans can
generally perform a new language task from only a few
examples or from simple instructions - something which
current NLP systems still largely struggle to do. Here we
show that scaling up language models greatly improves
task-agnostic, few-shot performance, sometimes even reaching
competitiveness with prior state-of-the-art fine-tuning
approaches. Specifically, we train GPT-3, an autoregressive
language model with 175 billion parameters, 10x more than any
previous non-sparse language model, and test its performance
in the few-shot setting. For all tasks, GPT-3 is applied
without any gradient updates or fine-tuning, with tasks and
few-shot demonstrations specified purely via text interaction
with the model. GPT-3 achieves strong performance on many NLP
datasets, including translation, question-answering, and
cloze tasks, as well as several tasks that require on-the-fly
reasoning or domain adaptation, such as unscrambling words,
using a novel word in a sentence, or performing 3-digit
arithmetic. At the same time, we also identify some datasets
where GPT-3's few-shot learning still struggles, as well as
some datasets where GPT-3 faces methodological issues related
to training on large web corpora. Finally, we find that GPT-3
can generate samples of news articles which human evaluators
have difficulty distinguishing from articles written by
humans. We discuss broader societal impacts of this finding
and of GPT-3 in general.",
month = "28~" # may,
year = 2020,
url = "http://arxiv.org/abs/2005.14165",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "MiniChain;o1"
}
@ARTICLE{Hoffmann2022-mn,
title = "Training Compute-Optimal Large Language Models",
author = "Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur
and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza
and de Las Casas, Diego and Hendricks, Lisa Anne and Welbl,
Johannes and Clark, Aidan and Hennigan, Tom and Noland, Eric
and Millican, Katie and van den Driessche, George and Damoc,
Bogdan and Guy, Aurelia and Osindero, Simon and Simonyan,
Karen and Elsen, Erich and Rae, Jack W and Vinyals, Oriol and
Sifre, Laurent",
journal = "arXiv [cs.CL]",
abstract = "We investigate the optimal model size and number of tokens
for training a transformer language model under a given
compute budget. We find that current large language models
are significantly undertrained, a consequence of the recent
focus on scaling language models whilst keeping the amount of
training data constant. By training over 400 language models
ranging from 70 million to over 16 billion parameters on 5 to
500 billion tokens, we find that for compute-optimal
training, the model size and the number of training tokens
should be scaled equally: for every doubling of model size
the number of training tokens should also be doubled. We test
this hypothesis by training a predicted compute-optimal
model, Chinchilla, that uses the same compute budget as
Gopher but with 70B parameters and 4$\times$ more data.
Chinchilla uniformly and significantly outperforms Gopher
(280B), GPT-3 (175B), Jurassic-1 (178B), and Megatron-Turing
NLG (530B) on a large range of downstream evaluation tasks.
This also means that Chinchilla uses substantially less
compute for fine-tuning and inference, greatly facilitating
downstream usage. As a highlight, Chinchilla reaches a
state-of-the-art average accuracy of 67.5\% on the MMLU
benchmark, greater than a 7\% improvement over Gopher.",
month = "29~" # mar,
year = 2022,
url = "http://arxiv.org/abs/2203.15556",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "o1"
}
@MISC{Sutton2019-my,
title = "The Bitter Lesson",
author = "Sutton, R",
journal = "Incomplete Ideas (blog)",
year = 2019,
url = "https://www.cs.utexas.edu/~eunsol/courses/data/bitter_lesson.pdf",
keywords = "o1"
}
@MISC{OpenAI2024-jh,
title = "Learning to Reason with {LLMs}",
author = "{OpenAI}",
abstract = "We are introducing OpenAI o1, a new large language model
trained with reinforcement learning to perform complex
reasoning. o1 thinks before it answers—it can produce a long
internal chain of thought before responding to the user.",
year = 2024,
howpublished = "\url{https://openai.com/index/learning-to-reason-with-llms/}",
note = "Accessed: 2024-10-29",
keywords = "o1",
language = "en"
}
@ARTICLE{Feng2023-sz,
title = "{AlphaZero}-like tree-search can guide large language model
decoding and training",
author = "Feng, Xidong and Wan, Ziyu and Wen, Muning and McAleer,
Stephen Marcus and Wen, Ying and Zhang, Weinan and Wang, Jun",
journal = "arXiv [cs.LG]",
abstract = "Recent works like Tree-of-Thought (ToT) and Reasoning via
Planning (RAP) aim to augment the reasoning capabilities of
LLMs by using tree-search algorithms to guide multi-step
reasoning. These methods rely on prompting a pre-trained
model to serve as a value function and focus on problems with
low search depth. As a result, these methods will not work in
domains where the pre-trained LLM does not have enough
knowledge to serve as an effective value function or in
domains that require long-horizon planning. To address these
limitations, we present an AlphaZero-like tree-search
learning framework for LLMs (termed TS-LLM), systematically
illustrating how tree-search with a learned value function
can guide LLM decoding. TS-LLM distinguishes itself in two
key ways. (1) Leveraging a learned value function and
AlphaZero-like algorithms, our approach can be generally
adaptable to a wide range of tasks, language models of any
size, and tasks of varying search depths. (2) Our approach
can guide LLMs during both inference and training,
iteratively improving the LLM. Empirical results across
reasoning, planning, alignment, and decision-making tasks
show that TS-LLM outperforms existing approaches and can
handle trees with a depth of 64.",
month = "29~" # sep,
year = 2023,
url = "http://arxiv.org/abs/2309.17179",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Wang2023-ur,
title = "Math-Shepherd: Verify and reinforce {LLMs} step-by-step
without human annotations",
author = "Wang, Peiyi and Li, Lei and Shao, Zhihong and Xu, R X and
Dai, Damai and Li, Yifei and Chen, Deli and Wu, Y and Sui,
Zhifang",
journal = "arXiv [cs.AI]",
abstract = "In this paper, we present an innovative process-oriented math
process reward model called \textbf{Math-Shepherd}, which
assigns a reward score to each step of math problem
solutions. The training of Math-Shepherd is achieved using
automatically constructed process-wise supervision data,
breaking the bottleneck of heavy reliance on manual
annotation in existing work. We explore the effectiveness of
Math-Shepherd in two scenarios: 1) \textit{Verification}:
Math-Shepherd is utilized for reranking multiple outputs
generated by Large Language Models (LLMs); 2)
\textit{Reinforcement Learning}: Math-Shepherd is employed to
reinforce LLMs with step-by-step Proximal Policy Optimization
(PPO). With Math-Shepherd, a series of open-source LLMs
demonstrates exceptional performance. For instance, the
step-by-step PPO with Math-Shepherd significantly improves
the accuracy of Mistral-7B (77.9\%$\to$84.1\% on GSM8K and
28.6\%$\to$33.0\% on MATH). The accuracy can be further
enhanced to 89.1\% and 43.5\% on GSM8K and MATH with the
verification of Math-Shepherd, respectively. We believe that
automatic process supervision holds significant potential for
the future evolution of LLMs.",
month = "14~" # dec,
year = 2023,
url = "http://arxiv.org/abs/2312.08935",
archivePrefix = "arXiv",
primaryClass = "cs.AI",
keywords = "o1"
}
@ARTICLE{Singh2023-eb,
title = "Beyond human data: Scaling self-training for problem-solving
with language models",
author = "Singh, Avi and Co-Reyes, John D and Agarwal, Rishabh and
Anand, Ankesh and Patil, Piyush and Garcia, Xavier and Liu,
Peter J and Harrison, James and Lee, Jaehoon and Xu, Kelvin
and Parisi, Aaron and Kumar, Abhishek and Alemi, Alex and
Rizkowsky, Alex and Nova, Azade and Adlam, Ben and Bohnet,
Bernd and Elsayed, Gamaleldin and Sedghi, Hanie and Mordatch,
Igor and Simpson, Isabelle and Gur, Izzeddin and Snoek,
Jasper and Pennington, Jeffrey and Hron, Jiri and Kenealy,
Kathleen and Swersky, Kevin and Mahajan, Kshiteej and Culp,
Laura and Xiao, Lechao and Bileschi, Maxwell L and Constant,
Noah and Novak, Roman and Liu, Rosanne and Warkentin, Tris
and Qian, Yundi and Bansal, Yamini and Dyer, Ethan and
Neyshabur, Behnam and Sohl-Dickstein, Jascha and Fiedel, Noah",
journal = "arXiv [cs.LG]",
abstract = "Fine-tuning language models~(LMs) on human-generated data
remains a prevalent practice. However, the performance of
such models is often limited by the quantity and diversity of
high-quality human data. In this paper, we explore whether we
can go beyond human data on tasks where we have access to
scalar feedback, for example, on math problems where one can
verify correctness. To do so, we investigate a simple
self-training method based on expectation-maximization, which
we call ReST$^{EM}$, where we (1) generate samples from the
model and filter them using binary feedback, (2) fine-tune
the model on these samples, and (3) repeat this process a few
times. Testing on advanced MATH reasoning and APPS coding
benchmarks using PaLM-2 models, we find that ReST$^{EM}$
scales favorably with model size and significantly surpasses
fine-tuning only on human data. Overall, our findings suggest
self-training with feedback can substantially reduce
dependence on human-generated data.",
month = "11~" # dec,
year = 2023,
url = "http://arxiv.org/abs/2312.06585",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Zhang2024-tq,
title = "Backtracking improves generation safety",
author = "Zhang, Yiming and Chi, Jianfeng and Nguyen, Hailey and
Upasani, Kartikeya and Bikel, Daniel M and Weston, Jason and
Smith, Eric Michael",
journal = "arXiv [cs.LG]",
abstract = "Text generation has a fundamental limitation almost by
definition: there is no taking back tokens that have been
generated, even when they are clearly problematic. In the
context of language model safety, when a partial unsafe
generation is produced, language models by their nature tend
to happily keep on generating similarly unsafe additional
text. This is in fact how safety alignment of frontier models
gets circumvented in the wild, despite great efforts in
improving their safety. Deviating from the paradigm of
approaching safety alignment as prevention (decreasing the
probability of harmful responses), we propose backtracking, a
technique that allows language models to ``undo'' and recover
from their own unsafe generation through the introduction of
a special [RESET] token. Our method can be incorporated into
either SFT or DPO training to optimize helpfulness and
harmlessness. We show that models trained to backtrack are
consistently safer than baseline models: backtracking
Llama-3-8B is four times more safe than the baseline model
(6.1\% $\to$ 1.5\%) in our evaluations without regression in
helpfulness. Our method additionally provides protection
against four adversarial attacks including an adaptive
attack, despite not being trained to do so.",
month = "22~" # sep,
year = 2024,
url = "http://arxiv.org/abs/2409.14586",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Hao2023-gp,
title = "Reasoning with language model is planning with world model",
author = "Hao, Shibo and Gu, Yi and Ma, Haodi and Hong, Joshua Jiahua
and Wang, Zhen and Wang, Daisy Zhe and Hu, Zhiting",
journal = "arXiv [cs.CL]",
abstract = "Large language models (LLMs) have shown remarkable reasoning
capabilities, especially when prompted to generate
intermediate reasoning steps (e.g., Chain-of-Thought, CoT).
However, LLMs can still struggle with problems that are easy
for humans, such as generating action plans for executing
tasks in a given environment, or performing complex math,
logical, and commonsense reasoning. The deficiency stems from
the key fact that LLMs lack an internal $\textit{world
model}$ to predict the world $\textit{state}$ (e.g.,
environment status, intermediate variable values) and
simulate long-term outcomes of actions. This prevents LLMs
from performing deliberate planning akin to human brains,
which involves exploring alternative reasoning paths,
anticipating future states and rewards, and iteratively
refining existing reasoning steps. To overcome the
limitations, we propose a new LLM reasoning framework,
$\underline{R}$easoning vi$\underline{a}$
$\underline{P}$lanning $\textbf{(RAP)}$. RAP repurposes the
LLM as both a world model and a reasoning agent, and
incorporates a principled planning algorithm (based on Monte
Carlo Tree Search) for strategic exploration in the vast
reasoning space. During reasoning, the LLM (as agent)
incrementally builds a reasoning tree under the guidance of
the LLM (as world model) and task-specific rewards, and
obtains a high-reward reasoning path efficiently with a
proper balance between exploration $\textit{vs.}$
exploitation. We apply RAP to a variety of challenging
reasoning problems including plan generation, math reasoning,
and logical inference. Empirical results on these tasks
demonstrate the superiority of RAP over various strong
baselines, including CoT and least-to-most prompting with
self-consistency. RAP on LLAMA-33B surpasses CoT on GPT-4
with 33\% relative improvement in a plan generation setting.",
month = "24~" # may,
year = 2023,
url = "http://arxiv.org/abs/2305.14992",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "o1"
}
@INCOLLECTION{Neal1998-np,
title = "A View of the {EM} Algorithm that Justifies Incremental, Sparse,
and other Variants",
author = "Neal, Radford M and Hinton, Geoffrey E",
booktitle = "Learning in Graphical Models",
publisher = "Springer Netherlands",
address = "Dordrecht",
pages = "355--368",
abstract = "The EM algorithm performs maximum likelihood estimation for data
in which some variables are unobserved. We present a function
that resembles negative free energy and show that the M step
maximizes this function with respect to the model parameters and
the E step maximizes it with respect to the distribution over the
unobserved variables. From this perspective, it is easy to
justify an incremental variant of the EM algorithm in which the
distribution for only one of the unobserved variables is
recalculated in each E step. This variant is shown empirically to
give faster convergence in a mixture estimation problem. A
variant of the algorithm that exploits sparse conditional
distributions is also described, and a wide range of other
variant algorithms are also seen to be possible.",
year = 1998,
url = "https://link.springer.com/chapter/10.1007/978-94-011-5014-9_12",
keywords = "o1",
language = "en"
}
@ARTICLE{Dempster1977-sw,
title = "Maximum likelihood from incomplete data via the \textit{EM}
algorithm",
author = "Dempster, A P and Laird, N M and Rubin, D B",
journal = "J. R. Stat. Soc. Series B Stat. Methodol.",
publisher = "Oxford University Press (OUP)",
volume = 39,
number = 1,
pages = "1--22",
abstract = "Summary A broadly applicable algorithm for computing maximum
likelihood estimates from incomplete data is presented at various
levels of generality. Theory showing the monotone behaviour of
the likelihood and convergence of the algorithm is derived. Many
examples are sketched, including missing value situations,
applications to grouped, censored or truncated data, finite
mixture models, variance component estimation, hyperparameter
estimation, iteratively reweighted least squares and factor
analysis.",
month = "1~" # sep,
year = 1977,
url = "https://onlinelibrary.wiley.com/doi/abs/10.1111/j.2517-6161.1977.tb01600.x",
keywords = "maximum likelihood; incomplete data; em algorithm; posterior
mode;o1",
language = "en"
}
@ARTICLE{Hendrycks2021-nv,
title = "Measuring coding challenge competence with {APPS}",
author = "Hendrycks, Dan and Basart, Steven and Kadavath, Saurav and
Mazeika, Mantas and Arora, Akul and Guo, Ethan and Burns,
Collin and Puranik, Samir and He, Horace and Song, Dawn and
Steinhardt, Jacob",
journal = "arXiv [cs.SE]",
abstract = "While programming is one of the most broadly applicable
skills in modern society, modern machine learning models
still cannot code solutions to basic problems. Despite its
importance, there has been surprisingly little work on
evaluating code generation, and it can be difficult to
accurately assess code generation performance rigorously. To
meet this challenge, we introduce APPS, a benchmark for code
generation. Unlike prior work in more restricted settings,
our benchmark measures the ability of models to take an
arbitrary natural language specification and generate
satisfactory Python code. Similar to how companies assess
candidate software developers, we then evaluate models by
checking their generated code on test cases. Our benchmark
includes 10,000 problems, which range from having simple
one-line solutions to being substantial algorithmic
challenges. We fine-tune large language models on both GitHub
and our training set, and we find that the prevalence of
syntax errors is decreasing exponentially as models improve.
Recent models such as GPT-Neo can pass approximately 20\% of
the test cases of introductory problems, so we find that
machine learning models are now beginning to learn how to
code. As the social significance of automatic code generation
increases over the coming years, our benchmark can provide an
important measure for tracking advancements.",
month = "20~" # may,
year = 2021,
url = "http://arxiv.org/abs/2105.09938",
archivePrefix = "arXiv",
primaryClass = "cs.SE",
keywords = "o1"
}
@ARTICLE{Tunstall2023-kv,
title = "Zephyr: Direct distillation of {LM} alignment",
author = "Tunstall, Lewis and Beeching, Edward and Lambert, Nathan and
Rajani, Nazneen and Rasul, Kashif and Belkada, Younes and
Huang, Shengyi and von Werra, Leandro and Fourrier,
Clémentine and Habib, Nathan and Sarrazin, Nathan and
Sanseviero, Omar and Rush, Alexander M and Wolf, Thomas",
journal = "arXiv [cs.LG]",
abstract = "We aim to produce a smaller language model that is aligned to
user intent. Previous research has shown that applying
distilled supervised fine-tuning (dSFT) on larger models
significantly improves task accuracy; however, these models
are unaligned, i.e. they do not respond well to natural
prompts. To distill this property, we experiment with the use
of preference data from AI Feedback (AIF). Starting from a
dataset of outputs ranked by a teacher model, we apply
distilled direct preference optimization (dDPO) to learn a
chat model with significantly improved intent alignment. The
approach requires only a few hours of training without any
additional sampling during fine-tuning. The final result,
Zephyr-7B, sets the state-of-the-art on chat benchmarks for
7B parameter models, and requires no human annotation. In
particular, results on MT-Bench show that Zephyr-7B surpasses
Llama2-Chat-70B, the best open-access RLHF-based model. Code,
models, data, and tutorials for the system are available at
https://github.com/huggingface/alignment-handbook.",
month = "25~" # oct,
year = 2023,
url = "http://arxiv.org/abs/2310.16944",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@INCOLLECTION{Kocsis2006-er,
title = "Bandit Based Monte-Carlo Planning",
author = "Kocsis, Levente and Szepesvári, Csaba",
booktitle = "Machine Learning: {ECML} 2006",
publisher = "Springer Berlin Heidelberg",
address = "Berlin, Heidelberg",
pages = "282--293",
series = "Lecture Notes in Computer Science",
year = 2006,
url = "https://dl.acm.org/doi/10.1007/11871842_29",
keywords = "o1",
language = "en"
}
@ARTICLE{Zhang2024-sa,
title = "Generative verifiers: Reward modeling as next-token
prediction",
author = "Zhang, Lunjun and Hosseini, Arian and Bansal, Hritik and
Kazemi, Mehran and Kumar, Aviral and Agarwal, Rishabh",
journal = "arXiv [cs.LG]",
abstract = "Verifiers or reward models are often used to enhance the
reasoning performance of large language models (LLMs). A
common approach is the Best-of-N method, where N candidate
solutions generated by the LLM are ranked by a verifier, and
the best one is selected. While LLM-based verifiers are
typically trained as discriminative classifiers to score
solutions, they do not utilize the text generation
capabilities of pretrained LLMs. To overcome this limitation,
we instead propose training verifiers using the ubiquitous
next-token prediction objective, jointly on verification and
solution generation. Compared to standard verifiers, such
generative verifiers (GenRM) can benefit from several
advantages of LLMs: they integrate seamlessly with
instruction tuning, enable chain-of-thought reasoning, and
can utilize additional test-time compute via majority voting
for better verification. We demonstrate that GenRM
outperforms discriminative, DPO verifiers, and
LLM-as-a-Judge, resulting in a 16-40\% improvement in the
number of problems solved with Best-of-N on algorithmic and
math reasoning tasks. Furthermore, we find that training
GenRM with synthetic verification rationales is sufficient to
pick out subtle errors on math problems. Finally, we
demonstrate that generative verifiers scale favorably with
model size and inference-time compute.",
month = "27~" # aug,
year = 2024,
url = "http://arxiv.org/abs/2408.15240",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Welleck2022-xr,
title = "Generating sequences by learning to self-correct",
author = "Welleck, Sean and Lu, Ximing and West, Peter and Brahman,
Faeze and Shen, Tianxiao and Khashabi, Daniel and Choi, Yejin",
journal = "arXiv [cs.CL]",
abstract = "Sequence generation applications require satisfying semantic
constraints, such as ensuring that programs are correct,
using certain keywords, or avoiding undesirable content.
Language models, whether fine-tuned or prompted with few-shot
demonstrations, frequently violate these constraints, and
lack a mechanism to iteratively revise their outputs.
Moreover, some powerful language models are of extreme scale
or inaccessible, making it inefficient, if not infeasible, to
update their parameters for task-specific adaptation. We
present Self-Correction, an approach that decouples an
imperfect base generator (an off-the-shelf language model or
supervised sequence-to-sequence model) from a separate
corrector that learns to iteratively correct imperfect
generations. To train the corrector, we propose an online
training procedure that can use either scalar or natural
language feedback on intermediate imperfect generations. We
show that Self-Correction improves upon the base generator in
three diverse generation tasks - mathematical program
synthesis, lexically-constrained generation, and toxicity
control - even when the corrector is much smaller than the
base generator.",
month = "31~" # oct,
year = 2022,
url = "http://arxiv.org/abs/2211.00053",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "o1"
}
@ARTICLE{Xin2024-su,
title = "{DeepSeek}-Prover-{V1}.5: Harnessing proof assistant feedback
for reinforcement learning and Monte-Carlo tree search",
author = "Xin, Huajian and Ren, Z Z and Song, Junxiao and Shao, Zhihong
and Zhao, Wanjia and Wang, Haocheng and Liu, Bo and Zhang,
Liyue and Lu, Xuan and Du, Qiushi and Gao, Wenjun and Zhu,
Qihao and Yang, Dejian and Gou, Zhibin and Wu, Z F and Luo,
Fuli and Ruan, Chong",
journal = "arXiv [cs.CL]",
abstract = "We introduce DeepSeek-Prover-V1.5, an open-source language
model designed for theorem proving in Lean 4, which enhances
DeepSeek-Prover-V1 by optimizing both training and inference
processes. Pre-trained on DeepSeekMath-Base with
specialization in formal mathematical languages, the model
undergoes supervised fine-tuning using an enhanced formal
theorem proving dataset derived from DeepSeek-Prover-V1.
Further refinement is achieved through reinforcement learning
from proof assistant feedback (RLPAF). Beyond the single-pass
whole-proof generation approach of DeepSeek-Prover-V1, we
propose RMaxTS, a variant of Monte-Carlo tree search that
employs an intrinsic-reward-driven exploration strategy to
generate diverse proof paths. DeepSeek-Prover-V1.5
demonstrates significant improvements over
DeepSeek-Prover-V1, achieving new state-of-the-art results on
the test set of the high school level miniF2F benchmark
($63.5\%$) and the undergraduate level ProofNet benchmark
($25.3\%$).",
month = "15~" # aug,
year = 2024,
url = "http://arxiv.org/abs/2408.08152",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "o1"
}
@ARTICLE{Ankner2024-uw,
title = "Critique-out-Loud reward models",
author = "Ankner, Zachary and Paul, Mansheej and Cui, Brandon and
Chang, Jonathan D and Ammanabrolu, Prithviraj",
journal = "arXiv [cs.LG]",
abstract = "Traditionally, reward models used for reinforcement learning
from human feedback (RLHF) are trained to directly predict
preference scores without leveraging the generation
capabilities of the underlying large language model (LLM).
This limits the capabilities of reward models as they must
reason implicitly about the quality of a response, i.e.,
preference modeling must be performed in a single forward
pass through the model. To enable reward models to reason
explicitly about the quality of a response, we introduce
Critique-out-Loud (CLoud) reward models. CLoud reward models
operate by first generating a natural language critique of
the assistant's response that is then used to predict a
scalar reward for the quality of the response. We demonstrate
the success of CLoud reward models for both Llama-3-8B and
70B base models: compared to classic reward models CLoud
reward models improve pairwise preference classification
accuracy on RewardBench by 4.65 and 5.84 percentage points
for the 8B and 70B base models respectively. Furthermore,
CLoud reward models lead to a Pareto improvement for win rate
on ArenaHard when used as the scoring model for Best-of-N.
Finally, we explore how to exploit the dynamic inference
compute capabilities of CLoud reward models by performing
self-consistency decoding for reward prediction.",
month = "21~" # aug,
year = 2024,
url = "http://arxiv.org/abs/2408.11791",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Shao2024-fb,
title = "{DeepSeekMath}: Pushing the limits of mathematical reasoning
in open language models",
author = "Shao, Zhihong and Wang, Peiyi and Zhu, Qihao and Xu, Runxin
and Song, Junxiao and Zhang, Mingchuan and Li, Y K and Wu, Y
and Guo, Daya",
journal = "arXiv [cs.CL]",
abstract = "Mathematical reasoning poses a significant challenge for
language models due to its complex and structured nature. In
this paper, we introduce DeepSeekMath 7B, which continues
pre-training DeepSeek-Coder-Base-v1.5 7B with 120B
math-related tokens sourced from Common Crawl, together with
natural language and code data. DeepSeekMath 7B has achieved
an impressive score of 51.7\% on the competition-level MATH
benchmark without relying on external toolkits and voting
techniques, approaching the performance level of Gemini-Ultra
and GPT-4. Self-consistency over 64 samples from DeepSeekMath
7B achieves 60.9\% on MATH. The mathematical reasoning
capability of DeepSeekMath is attributed to two key factors:
First, we harness the significant potential of publicly
available web data through a meticulously engineered data
selection pipeline. Second, we introduce Group Relative
Policy Optimization (GRPO), a variant of Proximal Policy
Optimization (PPO), that enhances mathematical reasoning
abilities while concurrently optimizing the memory usage of
PPO.",
month = "5~" # feb,
year = 2024,
url = "http://arxiv.org/abs/2402.03300",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "o1"
}
@MISC{Paul-G-Allen-School2024-da,
title = "Parables on the Power of Planning in {AI}: From Poker to
Diplomacy: Noam Brown ({OpenAI})",
author = "{Paul G. Allen School}",
publisher = "YouTube",
abstract = "Title: Parables on the Power of Planning in AI: From Poker to
Diplomacy. Speaker: Noam Brown (OpenAI). Date: Thursday, May 23,
2024. Abstract: from Deep Blue in 19...",
month = "17~" # sep,
year = 2024,
url = "https://www.youtube.com/watch?v=eaAonE58sLU",
keywords = "Paul G. Allen School of Computer Science \& Engineering;
University of Washington;o1"
}
@ARTICLE{Silver2016-ag,
title = "Mastering the game of Go with deep neural networks and tree
search",
author = "Silver, David and Huang, Aja and Maddison, Chris J and Guez,
Arthur and Sifre, Laurent and van den Driessche, George and
Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam,
Veda and Lanctot, Marc and Dieleman, Sander and Grewe, Dominik
and Nham, John and Kalchbrenner, Nal and Sutskever, Ilya and
Lillicrap, Timothy and Leach, Madeleine and Kavukcuoglu, Koray
and Graepel, Thore and Hassabis, Demis",
journal = "Nature",
publisher = "Nature Publishing Group",
volume = 529,
number = 7587,
pages = "484--489",
abstract = "The game of Go has long been viewed as the most challenging of
classic games for artificial intelligence owing to its enormous
search space and the difficulty of evaluating board positions and
moves. Here we introduce a new approach to computer Go that uses
'value networks' to evaluate board positions and 'policy
networks' to select moves. These deep neural networks are trained
by a novel combination of supervised learning from human expert
games, and reinforcement learning from games of self-play.
Without any lookahead search, the neural networks play Go at the
level of state-of-the-art Monte Carlo tree search programs that
simulate thousands of random games of self-play. We also
introduce a new search algorithm that combines Monte Carlo
simulation with value and policy networks. Using this search
algorithm, our program AlphaGo achieved a 99.8\% winning rate
against other Go programs, and defeated the human European Go
champion by 5 games to 0. This is the first time that a computer
program has defeated a human professional player in the
full-sized game of Go, a feat previously thought to be at least a
decade away.",
month = "28~" # jan,
year = 2016,
url = "https://www.nature.com/articles/nature16961",
keywords = "o1",
language = "en"
}
@ARTICLE{Jones2021-di,
title = "Scaling scaling laws with board games",
author = "Jones, Andy L",
journal = "arXiv [cs.LG]",
abstract = "The largest experiments in machine learning now require
resources far beyond the budget of all but a few
institutions. Fortunately, it has recently been shown that
the results of these huge experiments can often be
extrapolated from the results of a sequence of far smaller,
cheaper experiments. In this work, we show that not only can
the extrapolation be done based on the size of the model, but
on the size of the problem as well. By conducting a sequence
of experiments using AlphaZero and Hex, we show that the
performance achievable with a fixed amount of compute
degrades predictably as the game gets larger and harder.
Along with our main result, we further show that the
test-time and train-time compute available to an agent can be
traded off while maintaining performance.",
month = "7~" # apr,
year = 2021,
url = "http://arxiv.org/abs/2104.03113",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Brown2024-bs,
title = "Large language monkeys: Scaling inference compute with
repeated sampling",
author = "Brown, Bradley and Juravsky, Jordan and Ehrlich, Ryan and
Clark, Ronald and Le, Quoc V and Ré, Christopher and
Mirhoseini, Azalia",
journal = "arXiv [cs.LG]",
abstract = "Scaling the amount of compute used to train language models
has dramatically improved their capabilities. However, when
it comes to inference, we often limit the amount of compute
to only one attempt per problem. Here, we explore inference
compute as another axis for scaling by increasing the number
of generated samples. Across multiple tasks and models, we
observe that coverage - the fraction of problems solved by
any attempt - scales with the number of samples over four
orders of magnitude. In domains like coding and formal
proofs, where all answers can be automatically verified,
these increases in coverage directly translate into improved
performance. When we apply repeated sampling to SWE-bench
Lite, the fraction of issues solved with
DeepSeek-V2-Coder-Instruct increases from 15.9\% with one
sample to 56\% with 250 samples, outperforming the
single-attempt state-of-the-art of 43\% which uses more
capable frontier models. Moreover, using current API pricing,
amplifying the cheaper DeepSeek model with five samples is
more cost-effective and solves more issues than paying a
premium for one sample from GPT-4o or Claude 3.5 Sonnet.
Interestingly, the relationship between coverage and the
number of samples is often log-linear and can be modelled
with an exponentiated power law, suggesting the existence of
inference-time scaling laws. Finally, we find that
identifying correct samples out of many generations remains
an important direction for future research in domains without
automatic verifiers. When solving math word problems from
GSM8K and MATH, coverage with Llama-3 models grows to over
95\% with 10,000 samples. However, common methods to pick
correct solutions from a sample collection, such as majority
voting or reward models, plateau beyond several hundred
samples and fail to fully scale with the sample budget.",
month = "31~" # jul,
year = 2024,
url = "http://arxiv.org/abs/2407.21787",
archivePrefix = "arXiv",
primaryClass = "cs.LG",
keywords = "o1"
}
@ARTICLE{Welleck2024-yr,
title = "From decoding to meta-generation: Inference-time algorithms
for large language models",
author = "Welleck, Sean and Bertsch, Amanda and Finlayson, Matthew and
Schoelkopf, Hailey and Xie, Alex and Neubig, Graham and
Kulikov, Ilia and Harchaoui, Zaid",
journal = "arXiv [cs.CL]",
abstract = "One of the most striking findings in modern research on large
language models (LLMs) is that scaling up compute during
training leads to better results. However, less attention has
been given to the benefits of scaling compute during
inference. This survey focuses on these inference-time
approaches. We explore three areas under a unified
mathematical formalism: token-level generation algorithms,
meta-generation algorithms, and efficient generation.
Token-level generation algorithms, often called decoding
algorithms, operate by sampling a single token at a time or
constructing a token-level search space and then selecting an
output. These methods typically assume access to a language
model's logits, next-token distributions, or probability
scores. Meta-generation algorithms work on partial or full
sequences, incorporating domain knowledge, enabling
backtracking, and integrating external information. Efficient
generation methods aim to reduce token costs and improve the
speed of generation. Our survey unifies perspectives from
three research communities: traditional natural language
processing, modern LLMs, and machine learning systems.",
month = "24~" # jun,
year = 2024,
url = "http://arxiv.org/abs/2406.16838",
archivePrefix = "arXiv",
primaryClass = "cs.CL",
keywords = "o1"