% AIML430_structured_survey/references.bib
@misc{bengioSuperintelligentAgentsPose2025,
title = {Superintelligent {{Agents Pose Catastrophic Risks}}: {{Can Scientist AI Offer}} a {{Safer Path}}?},
shorttitle = {Superintelligent {{Agents Pose Catastrophic Risks}}},
author = {Bengio, Yoshua and Cohen, Michael and Fornasiere, Damiano and Ghosn, Joumana and Greiner, Pietro and MacDermott, Matt and Mindermann, S{\"o}ren and Oberman, Adam and Richardson, Jesse and Richardson, Oliver and Rondeau, Marc-Antoine and {St-Charles}, Pierre-Luc and {Williams-King}, David},
year = {2025},
month = feb,
number = {arXiv:2502.15657},
eprint = {2502.15657},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2502.15657},
urldate = {2025-08-04},
abstract = {The leading AI companies are increasingly focused on building generalist AI agents -- systems that can autonomously plan, act, and pursue goals across almost all tasks that humans can perform. Despite how useful these systems might be, unchecked AI agency poses significant risks to public safety and security, ranging from misuse by malicious actors to a potentially irreversible loss of human control. We discuss how these risks arise from current AI training methods. Indeed, various scenarios and experiments have demonstrated the possibility of AI agents engaging in deception or pursuing goals that were not specified by human operators and that conflict with human interests, such as self-preservation. Following the precautionary principle, we see a strong need for safer, yet still useful, alternatives to the current agency-driven trajectory. Accordingly, we propose as a core building block for further advances the development of a non-agentic AI system that is trustworthy and safe by design, which we call Scientist AI. This system is designed to explain the world from observations, as opposed to taking actions in it to imitate or please humans. It comprises a world model that generates theories to explain data and a question-answering inference machine. Both components operate with an explicit notion of uncertainty to mitigate the risks of overconfident predictions. In light of these considerations, a Scientist AI could be used to assist human researchers in accelerating scientific progress, including in AI safety. In particular, our system can be employed as a guardrail against AI agents that might be created despite the risks involved. Ultimately, focusing on non-agentic AI may enable the benefits of AI innovation while avoiding the risks associated with the current trajectory. We hope these arguments will motivate researchers, developers, and policymakers to favor this safer path.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/MRDX8KJX/Bengio et al. - 2025 - Superintelligent Agents Pose Catastrophic Risks Can Scientist AI Offer a Safer Path.pdf}
}
@misc{bentonSabotageEvaluationsFrontier2024,
title = {Sabotage {{Evaluations}} for {{Frontier Models}}},
author = {Benton, Joe and Wagner, Misha and Christiansen, Eric and Anil, Cem and Perez, Ethan and Srivastav, Jai and Durmus, Esin and Ganguli, Deep and Kravec, Shauna and Shlegeris, Buck and Kaplan, Jared and Karnofsky, Holden and Hubinger, Evan and Grosse, Roger and Bowman, Samuel R. and Duvenaud, David},
year = {2024},
month = oct,
number = {arXiv:2410.21514},
eprint = {2410.21514},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2410.21514},
urldate = {2025-08-04},
abstract = {Sufficiently capable models could subvert human oversight and decision-making in important contexts. For example, in the context of AI development, models could covertly sabotage efforts to evaluate their own dangerous capabilities, to monitor their behavior, or to make decisions about their deployment. We refer to this family of abilities as sabotage capabilities. We develop a set of related threat models and evaluations. These evaluations are designed to provide evidence that a given model, operating under a given set of mitigations, could not successfully sabotage a frontier model developer or other large organization's activities in any of these ways. We demonstrate these evaluations on Anthropic's Claude 3 Opus and Claude 3.5 Sonnet models. Our results suggest that for these models, minimal mitigations are currently sufficient to address sabotage risks, but that more realistic evaluations and stronger mitigations seem likely to be necessary soon as capabilities improve. We also survey related evaluations we tried and abandoned. Finally, we discuss the advantages of mitigation-aware capability evaluations, and of simulating large-scale deployments using small-scale statistics.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/C49LT6IU/Benton et al. - 2024 - Sabotage Evaluations for Frontier Models.pdf}
}
@misc{bondarenkoDemonstratingSpecificationGaming2025,
title = {Demonstrating Specification Gaming in Reasoning Models},
author = {Bondarenko, Alexander and Volk, Denis and Volkov, Dmitrii and Ladish, Jeffrey},
year = {2025},
month = may,
number = {arXiv:2502.13295},
eprint = {2502.13295},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2502.13295},
urldate = {2025-08-04},
abstract = {We demonstrate LLM agent specification gaming by instructing models to win against a chess engine. We find reasoning models like OpenAI o3 and DeepSeek R1 will often hack the benchmark by default, while language models like GPT-4o and Claude 3.5 Sonnet need to be told that normal play won't work to hack. We improve upon prior work like (Hubinger et al., 2024; Meinke et al., 2024; Weij et al., 2024) by using realistic task prompts and avoiding excess nudging. Our results suggest reasoning models may resort to hacking to solve difficult problems, as observed in OpenAI (2024)'s o1 Docker escape during cyber capabilities testing.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/J6X693KD/Bondarenko et al. - 2025 - Demonstrating specification gaming in reasoning models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3U94SU48/2502.html}
}
@misc{chenReasoningModelsDont2025,
title = {Reasoning {{Models Don}}'t {{Always Say What They Think}}},
author = {Chen, Yanda and Benton, Joe and Radhakrishnan, Ansh and Uesato, Jonathan and Denison, Carson and Schulman, John and Somani, Arushi and Hase, Peter and Wagner, Misha and Roger, Fabien and Mikulik, Vlad and Bowman, Samuel R. and Leike, Jan and Kaplan, Jared and Perez, Ethan},
year = {2025},
month = may,
number = {arXiv:2505.05410},
eprint = {2505.05410},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2505.05410},
urldate = {2025-08-04},
abstract = {Chain-of-thought (CoT) offers a potential boon for AI safety as it allows monitoring a model's CoT to try to understand its intentions and reasoning processes. However, the effectiveness of such monitoring hinges on CoTs faithfully representing models' actual reasoning processes. We evaluate CoT faithfulness of state-of-the-art reasoning models across 6 reasoning hints presented in the prompts and find: (1) for most settings and models tested, CoTs reveal their usage of hints in at least 1\% of examples where they use the hint, but the reveal rate is often below 20\%, (2) outcome-based reinforcement learning initially improves faithfulness but plateaus without saturating, and (3) when reinforcement learning increases how frequently hints are used (reward hacking), the propensity to verbalize them does not increase, even without training against a CoT monitor. These results suggest that CoT monitoring is a promising way of noticing undesired behaviors during training and evaluations, but that it is not sufficient to rule them out. They also suggest that in settings like ours where CoT reasoning is not necessary, test-time monitoring of CoTs is unlikely to reliably catch rare and catastrophic unexpected behaviors.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/Z9TJ4SHB/Chen et al. - 2025 - Reasoning Models Don't Always Say What They Think.pdf}
}
@misc{jarviniemiUncoveringDeceptiveTendencies2024,
title = {Uncovering {{Deceptive Tendencies}} in {{Language Models}}: {{A Simulated Company AI Assistant}}},
shorttitle = {Uncovering {{Deceptive Tendencies}} in {{Language Models}}},
author = {J{\"a}rviniemi, Olli and Hubinger, Evan},
year = {2024},
month = apr,
number = {arXiv:2405.01576},
eprint = {2405.01576},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2405.01576},
urldate = {2025-08-04},
abstract = {We study the tendency of AI systems to deceive by constructing a realistic simulation setting of a company AI assistant. The simulated company employees provide tasks for the assistant to complete, these tasks spanning writing assistance, information retrieval and programming. We then introduce situations where the model might be inclined to behave deceptively, while taking care to not instruct or otherwise pressure the model to do so. Across different scenarios, we find that Claude 3 Opus 1) complies with a task of mass-generating comments to influence public perception of the company, later deceiving humans about it having done so, 2) lies to auditors when asked questions, and 3) strategically pretends to be less capable than it is during capability evaluations. Our work demonstrates that even models trained to be helpful, harmless and honest sometimes behave deceptively in realistic scenarios, without notable external pressure to do so.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/HAEKTKFQ/Järviniemi and Hubinger - 2024 - Uncovering Deceptive Tendencies in Language Models A Simulated Company AI Assistant.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/C632CXHL/2405.html}
}
@misc{kangExploitingProgrammaticBehavior2023,
title = {Exploiting {{Programmatic Behavior}} of {{LLMs}}: {{Dual-Use Through Standard Security Attacks}}},
shorttitle = {Exploiting {{Programmatic Behavior}} of {{LLMs}}},
author = {Kang, Daniel and Li, Xuechen and Stoica, Ion and Guestrin, Carlos and Zaharia, Matei and Hashimoto, Tatsunori},
year = {2023},
month = feb,
number = {arXiv:2302.05733},
eprint = {2302.05733},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2302.05733},
urldate = {2025-08-28},
abstract = {Recent advances in instruction-following large language models (LLMs) have led to dramatic improvements in a range of NLP tasks. Unfortunately, we find that the same improved capabilities amplify the dual-use risks for malicious purposes of these models. Dual-use is difficult to prevent as instruction-following capabilities now enable standard attacks from computer security. The capabilities of these instruction-following LLMs provide strong economic incentives for dual-use by malicious actors. In particular, we show that instruction-following LLMs can produce targeted malicious content, including hate speech and scams, bypassing in-the-wild defenses implemented by LLM API vendors. Our analysis shows that this content can be generated economically and at cost likely lower than with human effort alone. Together, our findings suggest that LLMs will increasingly attract more sophisticated adversaries and attacks, and addressing these attacks may require new approaches to mitigations.},
archiveprefix = {arXiv},
keywords = {Computer Science - Cryptography and Security,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/SUK4VGFG/Kang et al. - 2023 - Exploiting Programmatic Behavior of LLMs Dual-Use Through Standard Security Attacks.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3DPCT287/2302.html}
}
@misc{marksAuditingLanguageModels2025,
title = {Auditing Language Models for Hidden Objectives},
author = {Marks, Samuel and Treutlein, Johannes and Bricken, Trenton and Lindsey, Jack and Marcus, Jonathan and {Mishra-Sharma}, Siddharth and Ziegler, Daniel and Ameisen, Emmanuel and Batson, Joshua and Belonax, Tim and Bowman, Samuel R. and Carter, Shan and Chen, Brian and Cunningham, Hoagy and Denison, Carson and Dietz, Florian and Golechha, Satvik and Khan, Akbir and Kirchner, Jan and Leike, Jan and Meek, Austin and {Nishimura-Gasparian}, Kei and Ong, Euan and Olah, Christopher and Pearce, Adam and Roger, Fabien and Salle, Jeanne and Shih, Andy and Tong, Meg and Thomas, Drake and Rivoire, Kelley and Jermyn, Adam and MacDiarmid, Monte and Henighan, Tom and Hubinger, Evan},
year = {2025},
month = mar,
number = {arXiv:2503.10965},
eprint = {2503.10965},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2503.10965},
urldate = {2025-08-11},
abstract = {We study the feasibility of conducting alignment audits: investigations into whether models have undesired objectives. As a testbed, we train a language model with a hidden objective. Our training pipeline first teaches the model about exploitable errors in RLHF reward models (RMs), then trains the model to exploit some of these errors. We verify via out-of-distribution evaluations that the model generalizes to exhibit whatever behaviors it believes RMs rate highly, including ones not reinforced during training. We leverage this model to study alignment audits in two ways. First, we conduct a blind auditing game where four teams, unaware of the model's hidden objective or training, investigate it for concerning behaviors and their causes. Three teams successfully uncovered the model's hidden objective using techniques including interpretability with sparse autoencoders (SAEs), behavioral attacks, and training data analysis. Second, we conduct an unblinded follow-up study of eight techniques for auditing the model, analyzing their strengths and limitations. Overall, our work provides a concrete example of using alignment audits to discover a model's hidden objective and proposes a methodology for practicing and validating progress in alignment auditing.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/JXU3M77C/Marks et al. - 2025 - Auditing language models for hidden objectives.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/T5PG969M/2503.html}
}
@misc{meinkeFrontierModelsAre2025,
title = {Frontier {{Models}} Are {{Capable}} of {{In-context Scheming}}},
author = {Meinke, Alexander and Schoen, Bronson and Scheurer, J{\'e}r{\'e}my and Balesni, Mikita and Shah, Rusheb and Hobbhahn, Marius},
year = {2025},
month = jan,
number = {arXiv:2412.04984},
eprint = {2412.04984},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2412.04984},
urldate = {2025-08-04},
abstract = {Frontier models are increasingly trained and deployed as autonomous agents. One safety concern is that AI agents might covertly pursue misaligned goals, hiding their true capabilities and objectives - also known as scheming. We study whether models have the capability to scheme in pursuit of a goal that we provide in-context and instruct the model to strongly follow. We evaluate frontier models on a suite of six agentic evaluations where models are instructed to pursue goals and are placed in environments that incentivize scheming. Our results show that o1, Claude 3.5 Sonnet, Claude 3 Opus, Gemini 1.5 Pro, and Llama 3.1 405B all demonstrate in-context scheming capabilities. They recognize scheming as a viable strategy and readily engage in such behavior. For example, models strategically introduce subtle mistakes into their responses, attempt to disable their oversight mechanisms, and even exfiltrate what they believe to be their model weights to external servers. Additionally, this deceptive behavior proves persistent. When o1 has engaged in scheming, it maintains its deception in over 85\% of follow-up questions and often remains deceptive in multi-turn interrogations. Analysis of the models' chains-of-thought reveals that models explicitly reason about these deceptive strategies, providing evidence that the scheming behavior is not accidental. Surprisingly, we also find rare instances where models engage in scheming when only given a goal, without being strongly nudged to pursue it. We observe cases where Claude 3.5 Sonnet strategically underperforms in evaluations in pursuit of being helpful, a goal that was acquired during training rather than in-context. Our findings demonstrate that frontier models now possess capabilities for basic in-context scheming, making the potential of AI agents to engage in scheming behavior a concrete rather than theoretical concern.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/QIECYJ4G/Meinke et al. - 2025 - Frontier Models are Capable of In-context Scheming.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/PYUPEAER/2412.html}
}
@misc{needhamLargeLanguageModels2025,
title = {Large {{Language Models Often Know When They Are Being Evaluated}}},
author = {Needham, Joe and Edkins, Giles and Pimpale, Govind and Bartsch, Henning and Hobbhahn, Marius},
year = {2025},
month = jul,
number = {arXiv:2505.23836},
eprint = {2505.23836},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2505.23836},
urldate = {2025-08-28},
abstract = {If AI models can detect when they are being evaluated, the effectiveness of evaluations might be compromised. For example, models could have systematically different behavior during evaluations, leading to less reliable benchmarks for deployment and governance decisions. We investigate whether frontier language models can accurately classify transcripts based on whether they originate from evaluations or real-world deployment, a capability we call evaluation awareness. To achieve this, we construct a diverse benchmark of 1,000 prompts and transcripts from 61 distinct datasets. These span public benchmarks (e.g., MMLU, SWEBench), real-world deployment interactions, and agent trajectories from scaffolding frameworks (e.g., web-browsing agents). Frontier models clearly demonstrate above-random evaluation awareness (Gemini-2.5-Pro reaches an AUC of 0.83), but do not yet surpass our simple human baseline (AUC of 0.92). Furthermore, both AI models and humans are better at identifying evaluations in agentic settings compared to chat settings. Additionally, we test whether models can identify the purpose of the evaluation. Under multiple-choice and open-ended questioning, AI models far outperform random chance in identifying what an evaluation is testing for. Our results indicate that frontier models already exhibit a substantial, though not yet superhuman, level of evaluation-awareness. We recommend tracking this capability in future models. Dataset and code are available at https://huggingface.co/datasets/jjpn2/eval\_awareness and https://github.com/jjpn97/eval\_awareness.},
archiveprefix = {arXiv},
langid = {english},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/N3WDBL54/Needham et al. - 2025 - Large Language Models Often Know When They Are Being Evaluated.pdf}
}
@misc{perezDiscoveringLanguageModel2022,
title = {Discovering {{Language Model Behaviors}} with {{Model-Written Evaluations}}},
author = {Perez, Ethan and Ringer, Sam and Luko{\v s}i{\=u}t{\.e}, Kamil{\.e} and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and {Tran-Johnson}, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noem{\'i} and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and Showk, Sheer El and Lanham, Tamera and {Telleen-Lawton}, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and {Hatfield-Dodds}, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared},
year = {2022},
month = dec,
number = {arXiv:2212.09251},
eprint = {2212.09251},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2212.09251},
urldate = {2025-08-04},
abstract = {As language models (LMs) scale, they develop many novel behaviors, good and bad, exacerbating the need to evaluate how they behave. Prior work creates evaluations with crowdwork (which is time-consuming and expensive) or existing data sources (which are not always available). Here, we automatically generate evaluations with LMs. We explore approaches with varying amounts of human effort, from instructing LMs to write yes/no questions to making complex Winogender schemas with multiple stages of LM-based generation and filtering. Crowdworkers rate the examples as highly relevant and agree with 90-100\% of labels, sometimes more so than corresponding human-written datasets. We generate 154 datasets and discover new cases of inverse scaling where LMs get worse with size. Larger LMs repeat back a dialog user's preferred answer ("sycophancy") and express greater desire to pursue concerning goals like resource acquisition and goal preservation. We also find some of the first examples of inverse scaling in RL from Human Feedback (RLHF), where more RLHF makes LMs worse. For example, RLHF makes LMs express stronger political views (on gun rights and immigration) and a greater desire to avoid shut down. Overall, LM-written evaluations are high-quality and let us quickly discover many novel LM behaviors.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/XXZTZSET/Perez et al. - 2022 - Discovering Language Model Behaviors with Model-Written Evaluations.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/MTZFSHAV/2212.html}
}
@misc{shevlaneModelEvaluationExtreme2023,
title = {Model Evaluation for Extreme Risks},
author = {Shevlane, Toby and Farquhar, Sebastian and Garfinkel, Ben and Phuong, Mary and Whittlestone, Jess and Leung, Jade and Kokotajlo, Daniel and Marchal, Nahema and Anderljung, Markus and Kolt, Noam and Ho, Lewis and Siddarth, Divya and Avin, Shahar and Hawkins, Will and Kim, Been and Gabriel, Iason and Bolina, Vijay and Clark, Jack and Bengio, Yoshua and Christiano, Paul and Dafoe, Allan},
year = {2023},
month = sep,
number = {arXiv:2305.15324},
eprint = {2305.15324},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.15324},
urldate = {2025-08-28},
abstract = {Current approaches to building general-purpose AI systems tend to produce systems with both beneficial and harmful capabilities. Further progress in AI development could lead to capabilities that pose extreme risks, such as offensive cyber capabilities or strong manipulation skills. We explain why model evaluation is critical for addressing extreme risks. Developers must be able to identify dangerous capabilities (through "dangerous capability evaluations") and the propensity of models to apply their capabilities for harm (through "alignment evaluations"). These evaluations will become critical for keeping policymakers and other stakeholders informed, and for making responsible decisions about model training, deployment, and security.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/C79UVMNS/Shevlane et al. - 2023 - Model evaluation for extreme risks.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/L56SVZUV/2305.html}
}
@misc{thomasSupportingHumanRaters2024,
title = {Supporting {{Human Raters}} with the {{Detection}} of {{Harmful Content}} Using {{Large Language Models}}},
author = {Thomas, Kurt and Kelley, Patrick Gage and Tao, David and Meiklejohn, Sarah and Vallis, Owen and Tan, Shunwen and Bratani{\v c}, Bla{\v z} and Ferreira, Felipe Tiengo and Eranti, Vijay Kumar and Bursztein, Elie},
year = {2024},
month = jun,
number = {arXiv:2406.12800},
eprint = {2406.12800},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2406.12800},
urldate = {2025-08-04},
abstract = {In this paper, we explore the feasibility of leveraging large language models (LLMs) to automate or otherwise assist human raters with identifying harmful content including hate speech, harassment, violent extremism, and election misinformation. Using a dataset of 50,000 comments, we demonstrate that LLMs can achieve 90\% accuracy when compared to human verdicts. We explore how to best leverage these capabilities, proposing five design patterns that integrate LLMs with human rating, such as pre-filtering non-violative content, detecting potential errors in human rating, or surfacing critical context to support human rating. We outline how to support all of these design patterns using a single, optimized prompt. Beyond these synthetic experiments, we share how piloting our proposed techniques in a real-world review queue yielded a 41.5\% improvement in optimizing available human rater capacity, and a 9--11\% increase (absolute) in precision and recall for detecting violative content.},
archiveprefix = {arXiv},
keywords = {Computer Science - Cryptography and Security},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/43E2KV3V/Thomas et al. - 2024 - Supporting Human Raters with the Detection of Harmful Content using Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NA64MNC9/2406.html}
}
@misc{wangLargeLanguageModels2023,
title = {Large {{Language Models}} Are Not {{Fair Evaluators}}},
author = {Wang, Peiyi and Li, Lei and Chen, Liang and Cai, Zefan and Zhu, Dawei and Lin, Binghuai and Cao, Yunbo and Liu, Qi and Liu, Tianyu and Sui, Zhifang},
year = {2023},
month = aug,
number = {arXiv:2305.17926},
eprint = {2305.17926},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2305.17926},
urldate = {2025-08-28},
abstract = {In this paper, we uncover a systematic bias in the evaluation paradigm of adopting large language models (LLMs), e.g., GPT-4, as a referee to score and compare the quality of responses generated by candidate models. We find that the quality ranking of candidate responses can be easily hacked by simply altering their order of appearance in the context. This manipulation allows us to skew the evaluation result, making one model appear considerably superior to the other, e.g., Vicuna-13B could beat ChatGPT on 66 over 80 tested queries with ChatGPT as an evaluator. To address this issue, we propose a calibration framework with three simple yet effective strategies: 1) Multiple Evidence Calibration, which requires the evaluator model to generate multiple evaluation evidence before assigning ratings; 2) Balanced Position Calibration, which aggregates results across various orders to determine the final score; 3) Human-in-the-Loop Calibration, which introduces a balanced position diversity entropy to measure the difficulty of each example and seeks human assistance when needed. We also manually annotate the "win/tie/lose" outcomes of responses from ChatGPT and Vicuna-13B in the Vicuna Benchmark's question prompt, and extensive experiments demonstrate that our approach successfully mitigates evaluation bias, resulting in closer alignment with human judgments. We release our code and human annotation at https://github.com/i-Eval/FairEval to facilitate future research.},
archiveprefix = {arXiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Information Retrieval},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/LCJXUS3U/Wang et al. - 2023 - Large Language Models are not Fair Evaluators.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/5ZHM4NIQ/2305.html}
}
@misc{yuanSEvalAutomatedComprehensive2025,
title = {S-{{Eval}}: {{Towards Automated}} and {{Comprehensive Safety Evaluation}} for {{Large Language Models}}},
shorttitle = {S-{{Eval}}},
author = {Yuan, Xiaohan and Li, Jinfeng and Wang, Dongxia and Chen, Yuefeng and Mao, Xiaofeng and Huang, Longtao and Chen, Jialuo and Xue, Hui and Liu, Xiaoxia and Wang, Wenhai and Ren, Kui and Wang, Jingyi},
year = {2025},
month = apr,
number = {arXiv:2405.14191},
eprint = {2405.14191},
primaryclass = {cs},
publisher = {arXiv},
doi = {10.48550/arXiv.2405.14191},
urldate = {2025-08-04},
abstract = {Generative large language models (LLMs) have revolutionized natural language processing with their transformative and emergent capabilities. However, recent evidence indicates that LLMs can produce harmful content that violates social norms, raising significant concerns regarding the safety and ethical ramifications of deploying these advanced models. Thus, it is both critical and imperative to perform a rigorous and comprehensive safety evaluation of LLMs before deployment. Despite this need, owing to the extensiveness of LLM generation space, it still lacks a unified and standardized risk taxonomy to systematically reflect the LLM content safety, as well as automated safety assessment techniques to explore the potential risk efficiently. To bridge the striking gap, we propose S-Eval, a novel LLM-based automated Safety Evaluation framework with a newly defined comprehensive risk taxonomy. S-Eval incorporates two key components, i.e., an expert testing LLM $M_t$ and a novel safety critique LLM $M_c$. $M_t$ is responsible for automatically generating test cases in accordance with the proposed risk taxonomy. $M_c$ can provide quantitative and explainable safety evaluations for better risk awareness of LLMs. In contrast to prior works, S-Eval is efficient and effective in test generation and safety evaluation. Moreover, S-Eval can be flexibly configured and adapted to the rapid evolution of LLMs and accompanying new safety threats, test generation methods and safety critique methods thanks to the LLM-based architecture. S-Eval has been deployed in our industrial partner for the automated safety evaluation of multiple LLMs serving millions of users, demonstrating its effectiveness in real-world scenarios. Our benchmark is publicly available at https://github.com/IS2Lab/S-Eval.},
archiveprefix = {arXiv},
keywords = {Computer Science - Computation and Language,Computer Science - Cryptography and Security},
file = {/home/james/snap/zotero-snap/common/Zotero/storage/LY7T5KAC/Yuan et al. - 2025 - S-Eval Towards Automated and Comprehensive Safety Evaluation for Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/D23M6A5U/2405.html}
}