% Cobbe et al. (2021), cited as an arXiv preprint.
@article{cobbe2021gsm8k,
  author  = {Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and Hesse, Christopher and Schulman, John},
  title   = {Training Verifiers to Solve Math Word Problems},
  journal = {arXiv preprint arXiv:2110.14168},
  year    = {2021},
}
% Wang et al. (2023), conference paper with an OpenReview URL.
% The citation key sits on the opening line and booktitle carries no
% trailing whitespace, so tools that do not trim field values render it cleanly.
@inproceedings{wang2023selfconsistency,
  author    = {Xuezhi Wang and Jason Wei and Dale Schuurmans and Quoc V. Le and Ed H. Chi and Sharan Narang and Aakanksha Chowdhery and Denny Zhou},
  title     = {Self-Consistency Improves Chain of Thought Reasoning in Language Models},
  booktitle = {The Eleventh International Conference on Learning Representations},
  year      = {2023},
  url       = {https://openreview.net/forum?id=1PL1NIMMrw},
}
% Qwen2.5 technical report (2024), cited as an arXiv preprint.
@article{qwen2.5,
  author  = {An Yang and Baosong Yang and Beichen Zhang and Binyuan Hui and Bo Zheng and Bowen Yu and Chengyuan Li and Dayiheng Liu and Fei Huang and Haoran Wei and Huan Lin and Jian Yang and Jianhong Tu and Jianwei Zhang and Jianxin Yang and Jiaxi Yang and Jingren Zhou and Junyang Lin and Kai Dang and Keming Lu and Keqin Bao and Kexin Yang and Le Yu and Mei Li and Mingfeng Xue and Pei Zhang and Qin Zhu and Rui Men and Runji Lin and Tianhao Li and Tingyu Xia and Xingzhang Ren and Xuancheng Ren and Yang Fan and Yang Su and Yichang Zhang and Yu Wan and Yuqiong Liu and Zeyu Cui and Zhenru Zhang and Zihan Qiu},
  title   = {Qwen2.5 Technical Report},
  journal = {arXiv preprint arXiv:2412.15115},
  year    = {2024},
}
% Guo et al. (2025), cited as an arXiv preprint.
% Title is stored in title case; the model name and the acronym are wrapped
% in braces so sentence-casing styles cannot lowercase them.
@article{guo2025deepseek,
  author  = {Guo, Daya and Yang, Dejian and Zhang, Haowei and Song, Junxiao and Zhang, Ruoyu and Xu, Runxin and Zhu, Qihao and Ma, Shirong and Wang, Peiyi and Bi, Xiao and others},
  title   = {{DeepSeek-R1}: Incentivizing Reasoning Capability in {LLMs} via Reinforcement Learning},
  journal = {arXiv preprint arXiv:2501.12948},
  year    = {2025},
}
% Agarwal et al. (2025), model card cited as an arXiv preprint.
% The two model names are deliberately lowercase; braces keep any
% case-changing style from altering them.
@article{agarwal2025gpt,
  author  = {Agarwal, Sandhini and Ahmad, Lama and Ai, Jason and Altman, Sam and Applebaum, Andy and Arbus, Edwin and Arora, Rahul K and Bai, Yu and Baker, Bowen and Bao, Haiming and others},
  title   = {{gpt-oss-120b} \& {gpt-oss-20b} Model Card},
  journal = {arXiv preprint arXiv:2508.10925},
  year    = {2025},
}
% Chollet (2019), cited as an arXiv preprint.
% Title is stored in title case so case-preserving styles print it correctly;
% sentence-casing styles can still downcase it.
@article{chollet2019measure,
  author  = {Chollet, Fran{\c{c}}ois},
  title   = {On the Measure of Intelligence},
  journal = {arXiv preprint arXiv:1911.01547},
  year    = {2019},
}
% Chollet et al. (2025), cited as an arXiv preprint.
% Benchmark name and acronym are brace-protected against sentence-casing;
% the first author's given name carries its cedilla as a BibTeX special character.
@article{chollet2025arc,
  author  = {Chollet, Fran{\c{c}}ois and Knoop, Mike and Kamradt, Gregory and Landers, Bryan and Pinkard, Henry},
  title   = {{ARC-AGI-2}: A New Challenge for Frontier {AI} Reasoning Systems},
  journal = {arXiv preprint arXiv:2505.11831},
  year    = {2025},
}
% Moffitt (2025), arXiv e-print described through the eprint fields.
% The acronym and the corpus name are brace-protected so sentence-casing
% styles do not lowercase them.
@misc{Moffitt2025ARCGEN,
  author        = {Michael D. Moffitt},
  title         = {{ARC-GEN}: A Mimetic Procedural Benchmark Generator for the {Abstraction and Reasoning Corpus}},
  year          = {2025},
  eprint        = {2511.00162},
  archivePrefix = {arXiv},
  primaryClass  = {cs.AI},
  url           = {https://arxiv.org/abs/2511.00162},
}
% METR blog post (March 2025).
% The organisation is double-braced so it is treated as one indivisible name;
% month uses the predefined macro, which styles expand themselves.
@misc{metr2025measure,
  author       = {{METR}},
  title        = {Measuring {AI} Ability to Complete Long Tasks},
  howpublished = {\url{https://metr.org/blog/2025-03-19-measuring-ai-ability-to-complete-long-tasks/}},
  year         = {2025},
  month        = mar,
}
% OpenAI announcement page (December 2025).
% The organisation is double-braced so it is treated as one indivisible name;
% the product name is brace-protected against sentence-casing.
@misc{openai2025gpt52,
  author = {{OpenAI}},
  title  = {Introducing {GPT-5.2}},
  year   = {2025},
  month  = dec,
  url    = {https://openai.com/index/introducing-gpt-5-2/},
  note   = {Accessed: 2026-01-20},
}
% AIME 2025 competition problems, attributed to the organising body.
% The organisation is double-braced: without the extra braces BibTeX parses it
% as a personal name (First "Mathematical Association", von "of", Last "America")
% and may abbreviate or mis-sort it.
@misc{aime2025,
  author       = {{Mathematical Association of America}},
  title        = {{American Invitational Mathematics Examination} ({AIME}) 2025},
  year         = {2025},
  howpublished = {\url{https://maa.org}},
  note         = {Official competition problems},
}
% Rein et al. (2024), conference paper.
% Title is stored in title case; the acronym, the company name and the
% abbreviation are brace-protected against sentence-casing.
@inproceedings{rein2024gpqa,
  author    = {Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
  title     = {{GPQA}: A Graduate-Level {Google}-Proof {Q\&A} Benchmark},
  booktitle = {First Conference on Language Modeling},
  year      = {2024},
}
% Google blog post (2025).
% The product name is brace-protected so sentence-casing styles keep its capital.
@online{google_gemini3_2025,
  author = {{Google}},
  title  = {A New Era of Intelligence with {Gemini} 3},
  year   = {2025},
  url    = {https://blog.google/products-and-platforms/products/gemini/gemini-3/},
  note   = {Note from the CEO, accessed January 2026},
}
% Levy and Geva (2025), short paper in conference proceedings, with DOI and URL.
@inproceedings{levy2025digit,
  author    = {Levy, Amit Arnold and Geva, Mor},
  title     = {Language Models Encode Numbers Using Digit Representations in Base 10},
  editor    = {Chiruzzo, Luis and Ritter, Alan and Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Albuquerque, New Mexico},
  month     = apr,
  year      = {2025},
  pages     = {385--395},
  doi       = {10.18653/v1/2025.naacl-short.33},
  url       = {https://aclanthology.org/2025.naacl-short.33/},
  isbn      = {979-8-89176-190-2},
}