Final draft

1jamesthompson1
2025-08-29 12:34:45 +12:00
parent c0f2890dd3
commit 1ca06db8cf
3 changed files with 101 additions and 23 deletions


@@ -85,6 +85,24 @@
file = {/home/james/snap/zotero-snap/common/Zotero/storage/HAEKTKFQ/Järviniemi and Hubinger - 2024 - Uncovering Deceptive Tendencies in Language Models A Simulated Company AI Assistant.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/C632CXHL/2405.html}
}
@misc{kangExploitingProgrammaticBehavior2023,
title = {Exploiting {{Programmatic Behavior}} of {{LLMs}}: {{Dual-Use Through Standard Security Attacks}}},
shorttitle = {Exploiting {{Programmatic Behavior}} of {{LLMs}}},
author = {Kang, Daniel and Li, Xuechen and Stoica, Ion and Guestrin, Carlos and Zaharia, Matei and Hashimoto, Tatsunori},
year = {2023},
month = feb,
number = {arXiv:2302.05733},
eprint = {2302.05733},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2302.05733},
urldate = {2025-08-28},
abstract = {Recent advances in instruction-following large language models (LLMs) have led to dramatic improvements in a range of NLP tasks. Unfortunately, we find that the same improved capabilities amplify the dual-use risks for malicious purposes of these models. Dual-use is difficult to prevent as instruction-following capabilities now enable standard attacks from computer security. The capabilities of these instruction-following LLMs provide strong economic incentives for dual-use by malicious actors. In particular, we show that instruction-following LLMs can produce targeted malicious content, including hate speech and scams, bypassing in-the-wild defenses implemented by LLM API vendors. Our analysis shows that this content can be generated economically and at cost likely lower than with human effort alone. Together, our findings suggest that LLMs will increasingly attract more sophisticated adversaries and attacks, and addressing these attacks may require new approaches to mitigations.},
archiveprefix = {arXiv},
keywords = {Computer Science - Cryptography and Security,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/SUK4VGFG/Kang et al. - 2023 - Exploiting Programmatic Behavior of LLMs Dual-Use Through Standard Security Attacks.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3DPCT287/2302.html}
}
@misc{marksAuditingLanguageModels2025,
title = {Auditing Language Models for Hidden Objectives},
author = {Marks, Samuel and Treutlein, Johannes and Bricken, Trenton and Lindsey, Jack and Marcus, Jonathan and {Mishra-Sharma}, Siddharth and Ziegler, Daniel and Ameisen, Emmanuel and Batson, Joshua and Belonax, Tim and Bowman, Samuel R. and Carter, Shan and Chen, Brian and Cunningham, Hoagy and Denison, Carson and Dietz, Florian and Golechha, Satvik and Khan, Akbir and Kirchner, Jan and Leike, Jan and Meek, Austin and {Nishimura-Gasparian}, Kei and Ong, Euan and Olah, Christopher and Pearce, Adam and Roger, Fabien and Salle, Jeanne and Shih, Andy and Tong, Meg and Thomas, Drake and Rivoire, Kelley and Jermyn, Adam and MacDiarmid, Monte and Henighan, Tom and Hubinger, Evan},
@@ -102,7 +120,7 @@
file = {/home/james/snap/zotero-snap/common/Zotero/storage/JXU3M77C/Marks et al. - 2025 - Auditing language models for hidden objectives.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/T5PG969M/2503.html}
}
-@misc{meinkeFrontierModelsAre2025a,
+@misc{meinkeFrontierModelsAre2025,
title = {Frontier {{Models}} Are {{Capable}} of {{In-context Scheming}}},
author = {Meinke, Alexander and Schoen, Bronson and Scheurer, J{\'e}r{\'e}my and Balesni, Mikita and Shah, Rusheb and Hobbhahn, Marius},
year = {2025},
@@ -119,6 +137,24 @@
file = {/home/james/snap/zotero-snap/common/Zotero/storage/QIECYJ4G/Meinke et al. - 2025 - Frontier Models are Capable of In-context Scheming.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/PYUPEAER/2412.html}
}
@misc{needhamLargeLanguageModels2025,
title = {Large {{Language Models Often Know When They Are Being Evaluated}}},
author = {Needham, Joe and Edkins, Giles and Pimpale, Govind and Bartsch, Henning and Hobbhahn, Marius},
year = {2025},
month = jul,
number = {arXiv:2505.23836},
eprint = {2505.23836},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2505.23836},
urldate = {2025-08-28},
abstract = {If AI models can detect when they are being evaluated, the effectiveness of evaluations might be compromised. For example, models could have systematically different behavior during evaluations, leading to less reliable benchmarks for deployment and governance decisions. We investigate whether frontier language models can accurately classify transcripts based on whether they originate from evaluations or real-world deployment, a capability we call evaluation awareness. To achieve this, we construct a diverse benchmark of 1,000 prompts and transcripts from 61 distinct datasets. These span public benchmarks (e.g., MMLU, SWEBench), real-world deployment interactions, and agent trajectories from scaffolding frameworks (e.g., web-browsing agents). Frontier models clearly demonstrate above-random evaluation awareness (Gemini-2.5-Pro reaches an AUC of 0.83), but do not yet surpass our simple human baseline (AUC of 0.92). Furthermore, both AI models and humans are better at identifying evaluations in agentic settings compared to chat settings. Additionally, we test whether models can identify the purpose of the evaluation. Under multiple-choice and open-ended questioning, AI models far outperform random chance in identifying what an evaluation is testing for. Our results indicate that frontier models already exhibit a substantial, though not yet superhuman, level of evaluation-awareness. We recommend tracking this capability in future models. Dataset and code are available at https://huggingface.co/datasets/jjpn2/eval\_awareness and https://github.com/jjpn97/eval\_awareness.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/N3WDBL54/Needham et al. - 2025 - Large Language Models Often Know When They Are Being Evaluated.pdf}
}
@misc{perezDiscoveringLanguageModel2022,
title = {Discovering {{Language Model Behaviors}} with {{Model-Written Evaluations}}},
author = {Perez, Ethan and Ringer, Sam and Luko{\v s}i{\=u}t{\.e}, Kamil{\.e} and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and {Tran-Johnson}, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noem{\'i} and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and Showk, Sheer El and Lanham, Tamera and {Telleen-Lawton}, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and {Hatfield-Dodds}, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared},
@@ -136,6 +172,23 @@
file = {/home/james/snap/zotero-snap/common/Zotero/storage/XXZTZSET/Perez et al. - 2022 - Discovering Language Model Behaviors with Model-Written Evaluations.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/MTZFSHAV/2212.html}
}
@misc{shevlaneModelEvaluationExtreme2023,
title = {Model Evaluation for Extreme Risks},
author = {Shevlane, Toby and Farquhar, Sebastian and Garfinkel, Ben and Phuong, Mary and Whittlestone, Jess and Leung, Jade and Kokotajlo, Daniel and Marchal, Nahema and Anderljung, Markus and Kolt, Noam and Ho, Lewis and Siddarth, Divya and Avin, Shahar and Hawkins, Will and Kim, Been and Gabriel, Iason and Bolina, Vijay and Clark, Jack and Bengio, Yoshua and Christiano, Paul and Dafoe, Allan},
year = {2023},
month = sep,
number = {arXiv:2305.15324},
eprint = {2305.15324},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.15324},
urldate = {2025-08-28},
abstract = {Current approaches to building general-purpose AI systems tend to produce systems with both beneficial and harmful capabilities. Further progress in AI development could lead to capabilities that pose extreme risks, such as offensive cyber capabilities or strong manipulation skills. We explain why model evaluation is critical for addressing extreme risks. Developers must be able to identify dangerous capabilities (through "dangerous capability evaluations") and the propensity of models to apply their capabilities for harm (through "alignment evaluations"). These evaluations will become critical for keeping policymakers and other stakeholders informed, and for making responsible decisions about model training, deployment, and security.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/C79UVMNS/Shevlane et al. - 2023 - Model evaluation for extreme risks.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/L56SVZUV/2305.html}
}
@misc{thomasSupportingHumanRaters2024,
title = {Supporting {{Human Raters}} with the {{Detection}} of {{Harmful Content}} Using {{Large Language Models}}},
author = {Thomas, Kurt and Kelley, Patrick Gage and Tao, David and Meiklejohn, Sarah and Vallis, Owen and Tan, Shunwen and Bratani{\v c}, Bla{\v z} and Ferreira, Felipe Tiengo and Eranti, Vijay Kumar and Bursztein, Elie},
@@ -153,6 +206,23 @@
file = {/home/james/snap/zotero-snap/common/Zotero/storage/43E2KV3V/Thomas et al. - 2024 - Supporting Human Raters with the Detection of Harmful Content using Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NA64MNC9/2406.html}
}
@misc{wangLargeLanguageModels2023,
title = {Large {{Language Models}} Are Not {{Fair Evaluators}}},
author = {Wang, Peiyi and Li, Lei and Chen, Liang and Cai, Zefan and Zhu, Dawei and Lin, Binghuai and Cao, Yunbo and Liu, Qi and Liu, Tianyu and Sui, Zhifang},
year = {2023},
month = aug,
number = {arXiv:2305.17926},
eprint = {2305.17926},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.17926},
urldate = {2025-08-28},
abstract = {In this paper, we uncover a systematic bias in the evaluation paradigm of adopting large language models{\textasciitilde}(LLMs), e.g., GPT-4, as a referee to score and compare the quality of responses generated by candidate models. We find that the quality ranking of candidate responses can be easily hacked by simply altering their order of appearance in the context. This manipulation allows us to skew the evaluation result, making one model appear considerably superior to the other, e.g., Vicuna-13B could beat ChatGPT on 66 over 80 tested queries with ChatGPT as an evaluator. To address this issue, we propose a calibration framework with three simple yet effective strategies: 1) Multiple Evidence Calibration, which requires the evaluator model to generate multiple evaluation evidence before assigning ratings; 2) Balanced Position Calibration, which aggregates results across various orders to determine the final score; 3) Human-in-the-Loop Calibration, which introduces a balanced position diversity entropy to measure the difficulty of each example and seeks human assistance when needed. We also manually annotate the "win/tie/lose" outcomes of responses from ChatGPT and Vicuna-13B in the Vicuna Benchmark's question prompt, and extensive experiments demonstrate that our approach successfully mitigates evaluation bias, resulting in closer alignment with human judgments. We release our code and human annotation at {\textbackslash}url\{https://github.com/i-Eval/FairEval\} to facilitate future research.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Information Retrieval},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/LCJXUS3U/Wang et al. - 2023 - Large Language Models are not Fair Evaluators.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/5ZHM4NIQ/2305.html}
}
@misc{yuanSEvalAutomatedComprehensive2025,
title = {S-{{Eval}}: {{Towards Automated}} and {{Comprehensive Safety Evaluation}} for {{Large Language Models}}},
shorttitle = {S-{{Eval}}},