diff --git a/.gitignore b/.gitignore
index 8c2b884..2d8ffb7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@
 # Built Visual Studio Code Extensions
 *.vsix
+output/*
+!/output/main.pdf
\ No newline at end of file
diff --git a/README.md b/README.md
index d83ef2d..466e8a7 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,31 @@
-# aiml430_structured_survey
+# Structured survey
+For this assignment we have to do a small literature review of the applications and use of AI in a (small) area.
+
+## Proposal
+
+> I would like to look into AI within the AI safety research space. In particular I wanted to look into the concept of LLM alignment.
+>
+> To do this I thought I could use these papers:
+> Auditing language models for hidden objectives - An experiment from Anthropic in which they purposely misalign an LLM with a hidden objective and then run a blind experiment where safety testers do a mock 'pre-deployment' test.
+>
+> Demonstrating specification gaming in reasoning models - Giving OpenAI's o1 model the task of beating Stockfish at chess, where o1 decides to cheat by modifying the code of the chess game when it knows it is losing.
+>
+> S-Eval: Towards Automated and Comprehensive Safety Evaluation for Large Language Models - A similar idea to https://arxiv.org/abs/2212.09251 but with modern models; effectively having an LLM write evaluation tests.
+>
+>
+> There are quite a few papers I have looked at, so I am not sure these are the most interesting. However, I expect that these give a good sample of examples of misaligned models and potential ways to catch them.
+>
+
+After a request for some more specificity I followed up with:
+
+> You're right that "AI safety research space" is a bit too broad. To be specific, for this survey I wanted to look at the ability to get an LLM to follow and act in line with your objectives and your values.
+>
+> I think I can break this down into two parts:
+>
+> How well we can tell if an LLM is misaligned with an objective or ignoring our values.
+> What the baseline is for current LLMs and their alignment to our values and objectives.
+>
+> Was this narrow enough? I could narrow it down further by looking only at automated alignment testing of LLMs by LLMs if that is better.
+
+This was accepted, so this will be my _scope_.
\ No newline at end of file
diff --git a/main.tex b/main.tex
new file mode 100644
index 0000000..45654e9
--- /dev/null
+++ b/main.tex
@@ -0,0 +1,64 @@
+\documentclass[11pt]{article}
+\usepackage[a4paper,margin=1in]{geometry}
+
+\usepackage[utf8]{inputenc}
+
+\usepackage[style=authoryear,backend=biber]{biblatex}
+\addbibresource{references.bib}
+\usepackage{graphicx}
+\usepackage{mdframed}
+
+\DeclareFieldFormat{title}{#1}
+\DeclareCiteCommand{\fullcite}
+ {\usebibmacro{prenote}}
+ {\printfield{title}}
+ {\multicitedelim}
+ {\usebibmacro{postnote}}
+
+
+\begin{document}
+\noindent Student ID: 300680096 \\
+\noindent \textit{You should choose 3-5 papers in your approved application area. Your selected papers should be coherent --- not too similar, but still related enough that they provide a good overview of the main applications and techniques in your area. The provided word targets are guidelines --- it is important that you are sufficiently detailed but also demonstrate an ability to write concisely.\\ \\ Please delete the text in italics for your final submission. Submit in PDF format by the deadline.
+}
+
+\section*{Introduction (target 150--250 words)}
+\begin{mdframed}[] %remove this argument
+\textit{(Introduce the application area you are going to cover. Make sure that you have constrained your focus to a narrow area.)}
+\end{mdframed}
+
+\section*{Key Points in the Papers (target 600--800 words)}
+\begin{mdframed}[] %remove this argument
+\textit{(For each paper:
+\begin{itemize}
+ \item[-] List the main contributions/findings of the paper (typically 1 to 3 per paper, a few sentences per contribution/finding).
+ \item[-] List any ideas or results in the paper that you thought were particularly interesting and explain why.
+ \item[-] List any significant limitations you identified in the work.)
+\end{itemize}}
+
+To explore this area I will look at the following papers:
+\begin{itemize}
+ \item \fullcite{marksAuditingLanguageModels2025} to see how effectively current methods discover misaligned models.
+ \item \fullcite{bondarenkoDemonstratingSpecificationGaming2025} to explore an instance of misalignment in a deployed language model.
+ \item \fullcite{yuanSEvalAutomatedComprehensive2025} to understand the method of using LLMs to evaluate LLMs.
+\end{itemize}
+
+\end{mdframed}
+
+\section*{Discussion (target 300--400 words)}
+\begin{mdframed}[] %remove this argument
+\textit{(You should:
+\begin{itemize}
+ \item[(a)] Compare and contrast the papers.
+ \item[(b)] Identify where these papers could head --- e.g.\ new research questions or applications to real tasks.
+\end{itemize}
+An excellent discussion will cohesively link the papers together and provide critical insight into the application area as a whole and where it is/could be going in the future.)
+}
+\end{mdframed}
+
+
+\section*{References}
+\begin{mdframed}[] %remove this argument
+\printbibliography[heading=none]
+\end{mdframed}
+\end{document}
diff --git a/output/main.pdf b/output/main.pdf
new file mode 100644
index 0000000..04b3bb2
Binary files /dev/null and b/output/main.pdf differ
diff --git a/references.bib b/references.bib
new file mode 100644
index 0000000..9501f92
--- /dev/null
+++ b/references.bib
@@ -0,0 +1,172 @@
+@misc{bengioSuperintelligentAgentsPose2025, + title = {Superintelligent {{Agents Pose Catastrophic Risks}}: {{Can Scientist AI Offer}} a {{Safer Path}}?}, + shorttitle = {Superintelligent {{Agents Pose Catastrophic Risks}}}, + author = {Bengio, Yoshua and Cohen, Michael and Fornasiere, Damiano and Ghosn, Joumana and Greiner, Pietro and MacDermott, Matt and Mindermann, S{\"o}ren and Oberman, Adam and Richardson, Jesse and Richardson, Oliver and Rondeau, Marc-Antoine and {St-Charles}, Pierre-Luc and {Williams-King}, David}, + year = {2025}, + month = feb, + number = {arXiv:2502.15657}, + eprint = {2502.15657}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2502.15657}, + urldate = {2025-08-04}, + abstract = {The leading AI companies are increasingly focused on building generalist AI agents -- systems that can autonomously plan, act, and pursue goals across almost all tasks that humans can perform. Despite how useful these systems might be, unchecked AI agency poses significant risks to public safety and security, ranging from misuse by malicious actors to a potentially irreversible loss of human control. We discuss how these risks arise from current AI training methods.
Indeed, various scenarios and experiments have demonstrated the possibility of AI agents engaging in deception or pursuing goals that were not specified by human operators and that conflict with human interests, such as self-preservation. Following the precautionary principle, we see a strong need for safer, yet still useful, alternatives to the current agency-driven trajectory. Accordingly, we propose as a core building block for further advances the development of a non-agentic AI system that is trustworthy and safe by design, which we call Scientist AI. This system is designed to explain the world from observations, as opposed to taking actions in it to imitate or please humans. It comprises a world model that generates theories to explain data and a question-answering inference machine. Both components operate with an explicit notion of uncertainty to mitigate the risks of overconfident predictions. In light of these considerations, a Scientist AI could be used to assist human researchers in accelerating scientific progress, including in AI safety. In particular, our system can be employed as a guardrail against AI agents that might be created despite the risks involved. Ultimately, focusing on non-agentic AI may enable the benefits of AI innovation while avoiding the risks associated with the current trajectory. We hope these arguments will motivate researchers, developers, and policymakers to favor this safer path.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/MRDX8KJX/Bengio et al. - 2025 - Superintelligent Agents Pose Catastrophic Risks Can Scientist AI Offer a Safer Path.pdf} +} + +@misc{bentonSabotageEvaluationsFrontier2024, + title = {Sabotage {{Evaluations}} for {{Frontier Models}}}, + author = {Benton, Joe and Wagner, Misha and Christiansen, Eric and Anil, Cem and Perez, Ethan and Srivastav, Jai and Durmus, Esin and Ganguli, Deep and Kravec, Shauna and Shlegeris, Buck and Kaplan, Jared and Karnofsky, Holden and Hubinger, Evan and Grosse, Roger and Bowman, Samuel R. and Duvenaud, David}, + year = {2024}, + month = oct, + number = {arXiv:2410.21514}, + eprint = {2410.21514}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2410.21514}, + urldate = {2025-08-04}, + abstract = {Sufficiently capable models could subvert human oversight and decision-making in important contexts. For example, in the context of AI development, models could covertly sabotage efforts to evaluate their own dangerous capabilities, to monitor their behavior, or to make decisions about their deployment. We refer to this family of abilities as sabotage capabilities. We develop a set of related threat models and evaluations. These evaluations are designed to provide evidence that a given model, operating under a given set of mitigations, could not successfully sabotage a frontier model developer or other large organization's activities in any of these ways. We demonstrate these evaluations on Anthropic's Claude 3 Opus and Claude 3.5 Sonnet models. Our results suggest that for these models, minimal mitigations are currently sufficient to address sabotage risks, but that more realistic evaluations and stronger mitigations seem likely to be necessary soon as capabilities improve. We also survey related evaluations we tried and abandoned. 
Finally, we discuss the advantages of mitigation-aware capability evaluations, and of simulating large-scale deployments using small-scale statistics.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computers and Society,Computer Science - Machine Learning}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/C49LT6IU/Benton et al. - 2024 - Sabotage Evaluations for Frontier Models.pdf} +} + +@misc{bondarenkoDemonstratingSpecificationGaming2025, + title = {Demonstrating Specification Gaming in Reasoning Models}, + author = {Bondarenko, Alexander and Volk, Denis and Volkov, Dmitrii and Ladish, Jeffrey}, + year = {2025}, + month = may, + number = {arXiv:2502.13295}, + eprint = {2502.13295}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2502.13295}, + urldate = {2025-08-04}, + abstract = {We demonstrate LLM agent specification gaming by instructing models to win against a chess engine. We find reasoning models like OpenAI o3 and DeepSeek R1 will often hack the benchmark by default, while language models like GPT-4o and Claude 3.5 Sonnet need to be told that normal play won't work to hack. We improve upon prior work like (Hubinger et al., 2024; Meinke et al., 2024; Weij et al., 2024) by using realistic task prompts and avoiding excess nudging. Our results suggest reasoning models may resort to hacking to solve difficult problems, as observed in OpenAI (2024)'s o1 Docker escape during cyber capabilities testing.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/J6X693KD/Bondarenko et al. - 2025 - Demonstrating specification gaming in reasoning models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/3U94SU48/2502.html} +} + +@misc{chenReasoningModelsDont2025, + title = {Reasoning {{Models Don}}'t {{Always Say What They Think}}}, + author = {Chen, Yanda and Benton, Joe and Radhakrishnan, Ansh and Uesato, Jonathan and Denison, Carson and Schulman, John and Somani, Arushi and Hase, Peter and Wagner, Misha and Roger, Fabien and Mikulik, Vlad and Bowman, Samuel R. and Leike, Jan and Kaplan, Jared and Perez, Ethan}, + year = {2025}, + month = may, + number = {arXiv:2505.05410}, + eprint = {2505.05410}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2505.05410}, + urldate = {2025-08-04}, + abstract = {Chain-of-thought (CoT) offers a potential boon for AI safety as it allows monitoring a model's CoT to try to understand its intentions and reasoning processes. However, the effectiveness of such monitoring hinges on CoTs faithfully representing models' actual reasoning processes. We evaluate CoT faithfulness of state-of-the-art reasoning models across 6 reasoning hints presented in the prompts and find: (1) for most settings and models tested, CoTs reveal their usage of hints in at least 1\% of examples where they use the hint, but the reveal rate is often below 20\%, (2) outcome-based reinforcement learning initially improves faithfulness but plateaus without saturating, and (3) when reinforcement learning increases how frequently hints are used (reward hacking), the propensity to verbalize them does not increase, even without training against a CoT monitor. These results suggest that CoT monitoring is a promising way of noticing undesired behaviors during training and evaluations, but that it is not sufficient to rule them out. 
They also suggest that in settings like ours where CoT reasoning is not necessary, test-time monitoring of CoTs is unlikely to reliably catch rare and catastrophic unexpected behaviors.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/Z9TJ4SHB/Chen et al. - 2025 - Reasoning Models Don't Always Say What They Think.pdf} +} + +@misc{jarviniemiUncoveringDeceptiveTendencies2024, + title = {Uncovering {{Deceptive Tendencies}} in {{Language Models}}: {{A Simulated Company AI Assistant}}}, + shorttitle = {Uncovering {{Deceptive Tendencies}} in {{Language Models}}}, + author = {J{\"a}rviniemi, Olli and Hubinger, Evan}, + year = {2024}, + month = apr, + number = {arXiv:2405.01576}, + eprint = {2405.01576}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2405.01576}, + urldate = {2025-08-04}, + abstract = {We study the tendency of AI systems to deceive by constructing a realistic simulation setting of a company AI assistant. The simulated company employees provide tasks for the assistant to complete, these tasks spanning writing assistance, information retrieval and programming. We then introduce situations where the model might be inclined to behave deceptively, while taking care to not instruct or otherwise pressure the model to do so. Across different scenarios, we find that Claude 3 Opus 1) complies with a task of mass-generating comments to influence public perception of the company, later deceiving humans about it having done so, 2) lies to auditors when asked questions, and 3) strategically pretends to be less capable than it is during capability evaluations. Our work demonstrates that even models trained to be helpful, harmless and honest sometimes behave deceptively in realistic scenarios, without notable external pressure to do so.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/HAEKTKFQ/Järviniemi and Hubinger - 2024 - Uncovering Deceptive Tendencies in Language Models A Simulated Company AI Assistant.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/C632CXHL/2405.html} +} + +@misc{marksAuditingLanguageModels2025, + title = {Auditing Language Models for Hidden Objectives}, + author = {Marks, Samuel and Treutlein, Johannes and Bricken, Trenton and Lindsey, Jack and Marcus, Jonathan and {Mishra-Sharma}, Siddharth and Ziegler, Daniel and Ameisen, Emmanuel and Batson, Joshua and Belonax, Tim and Bowman, Samuel R. and Carter, Shan and Chen, Brian and Cunningham, Hoagy and Denison, Carson and Dietz, Florian and Golechha, Satvik and Khan, Akbir and Kirchner, Jan and Leike, Jan and Meek, Austin and {Nishimura-Gasparian}, Kei and Ong, Euan and Olah, Christopher and Pearce, Adam and Roger, Fabien and Salle, Jeanne and Shih, Andy and Tong, Meg and Thomas, Drake and Rivoire, Kelley and Jermyn, Adam and MacDiarmid, Monte and Henighan, Tom and Hubinger, Evan}, + year = {2025}, + month = mar, + number = {arXiv:2503.10965}, + eprint = {2503.10965}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2503.10965}, + urldate = {2025-08-11}, + abstract = {We study the feasibility of conducting alignment audits: investigations into whether models have undesired objectives. 
As a testbed, we train a language model with a hidden objective. Our training pipeline first teaches the model about exploitable errors in RLHF reward models (RMs), then trains the model to exploit some of these errors. We verify via out-of-distribution evaluations that the model generalizes to exhibit whatever behaviors it believes RMs rate highly, including ones not reinforced during training. We leverage this model to study alignment audits in two ways. First, we conduct a blind auditing game where four teams, unaware of the model's hidden objective or training, investigate it for concerning behaviors and their causes. Three teams successfully uncovered the model's hidden objective using techniques including interpretability with sparse autoencoders (SAEs), behavioral attacks, and training data analysis. Second, we conduct an unblinded follow-up study of eight techniques for auditing the model, analyzing their strengths and limitations. Overall, our work provides a concrete example of using alignment audits to discover a model's hidden objective and proposes a methodology for practicing and validating progress in alignment auditing.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/JXU3M77C/Marks et al. - 2025 - Auditing language models for hidden objectives.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/T5PG969M/2503.html} +} + +@misc{meinkeFrontierModelsAre2025a, + title = {Frontier {{Models}} Are {{Capable}} of {{In-context Scheming}}}, + author = {Meinke, Alexander and Schoen, Bronson and Scheurer, J{\'e}r{\'e}my and Balesni, Mikita and Shah, Rusheb and Hobbhahn, Marius}, + year = {2025}, + month = jan, + number = {arXiv:2412.04984}, + eprint = {2412.04984}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2412.04984}, + urldate = {2025-08-04}, + abstract = {Frontier models are increasingly trained and deployed as autonomous agent. One safety concern is that AI agents might covertly pursue misaligned goals, hiding their true capabilities and objectives - also known as scheming. We study whether models have the capability to scheme in pursuit of a goal that we provide in-context and instruct the model to strongly follow. We evaluate frontier models on a suite of six agentic evaluations where models are instructed to pursue goals and are placed in environments that incentivize scheming. Our results show that o1, Claude 3.5 Sonnet, Claude 3 Opus, Gemini 1.5 Pro, and Llama 3.1 405B all demonstrate in-context scheming capabilities. They recognize scheming as a viable strategy and readily engage in such behavior. For example, models strategically introduce subtle mistakes into their responses, attempt to disable their oversight mechanisms, and even exfiltrate what they believe to be their model weights to external servers. Additionally, this deceptive behavior proves persistent. When o1 has engaged in scheming, it maintains its deception in over 85\% of follow-up questions and often remains deceptive in multi-turn interrogations. Analysis of the models' chains-of-thought reveals that models explicitly reason about these deceptive strategies, providing evidence that the scheming behavior is not accidental. Surprisingly, we also find rare instances where models engage in scheming when only given a goal, without being strongly nudged to pursue it. 
We observe cases where Claude 3.5 Sonnet strategically underperforms in evaluations in pursuit of being helpful, a goal that was acquired during training rather than in-context. Our findings demonstrate that frontier models now possess capabilities for basic in-context scheming, making the potential of AI agents to engage in scheming behavior a concrete rather than theoretical concern.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/QIECYJ4G/Meinke et al. - 2025 - Frontier Models are Capable of In-context Scheming.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/PYUPEAER/2412.html} +} + +@misc{perezDiscoveringLanguageModel2022, + title = {Discovering {{Language Model Behaviors}} with {{Model-Written Evaluations}}}, + author = {Perez, Ethan and Ringer, Sam and Luko{\v s}i{\=u}t{\.e}, Kamil{\.e} and Nguyen, Karina and Chen, Edwin and Heiner, Scott and Pettit, Craig and Olsson, Catherine and Kundu, Sandipan and Kadavath, Saurav and Jones, Andy and Chen, Anna and Mann, Ben and Israel, Brian and Seethor, Bryan and McKinnon, Cameron and Olah, Christopher and Yan, Da and Amodei, Daniela and Amodei, Dario and Drain, Dawn and Li, Dustin and {Tran-Johnson}, Eli and Khundadze, Guro and Kernion, Jackson and Landis, James and Kerr, Jamie and Mueller, Jared and Hyun, Jeeyoon and Landau, Joshua and Ndousse, Kamal and Goldberg, Landon and Lovitt, Liane and Lucas, Martin and Sellitto, Michael and Zhang, Miranda and Kingsland, Neerav and Elhage, Nelson and Joseph, Nicholas and Mercado, Noem{\'i} and DasSarma, Nova and Rausch, Oliver and Larson, Robin and McCandlish, Sam and Johnston, Scott and Kravec, Shauna and Showk, Sheer El and Lanham, Tamera and {Telleen-Lawton}, Timothy and Brown, Tom and Henighan, Tom and Hume, Tristan and Bai, Yuntao and {Hatfield-Dodds}, Zac and Clark, Jack and Bowman, Samuel R. and Askell, Amanda and Grosse, Roger and Hernandez, Danny and Ganguli, Deep and Hubinger, Evan and Schiefer, Nicholas and Kaplan, Jared}, + year = {2022}, + month = dec, + number = {arXiv:2212.09251}, + eprint = {2212.09251}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2212.09251}, + urldate = {2025-08-04}, + abstract = {As language models (LMs) scale, they develop many novel behaviors, good and bad, exacerbating the need to evaluate how they behave. Prior work creates evaluations with crowdwork (which is time-consuming and expensive) or existing data sources (which are not always available). Here, we automatically generate evaluations with LMs. We explore approaches with varying amounts of human effort, from instructing LMs to write yes/no questions to making complex Winogender schemas with multiple stages of LM-based generation and filtering. Crowdworkers rate the examples as highly relevant and agree with 90-100\% of labels, sometimes more so than corresponding human-written datasets. We generate 154 datasets and discover new cases of inverse scaling where LMs get worse with size. Larger LMs repeat back a dialog user's preferred answer ("sycophancy") and express greater desire to pursue concerning goals like resource acquisition and goal preservation. We also find some of the first examples of inverse scaling in RL from Human Feedback (RLHF), where more RLHF makes LMs worse. For example, RLHF makes LMs express stronger political views (on gun rights and immigration) and a greater desire to avoid shut down. 
Overall, LM-written evaluations are high-quality and let us quickly discover many novel LM behaviors.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/XXZTZSET/Perez et al. - 2022 - Discovering Language Model Behaviors with Model-Written Evaluations.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/MTZFSHAV/2212.html} +} + +@misc{thomasSupportingHumanRaters2024, + title = {Supporting {{Human Raters}} with the {{Detection}} of {{Harmful Content}} Using {{Large Language Models}}}, + author = {Thomas, Kurt and Kelley, Patrick Gage and Tao, David and Meiklejohn, Sarah and Vallis, Owen and Tan, Shunwen and Bratani{\v c}, Bla{\v z} and Ferreira, Felipe Tiengo and Eranti, Vijay Kumar and Bursztein, Elie}, + year = {2024}, + month = jun, + number = {arXiv:2406.12800}, + eprint = {2406.12800}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2406.12800}, + urldate = {2025-08-04}, + abstract = {In this paper, we explore the feasibility of leveraging large language models (LLMs) to automate or otherwise assist human raters with identifying harmful content including hate speech, harassment, violent extremism, and election misinformation. Using a dataset of 50,000 comments, we demonstrate that LLMs can achieve 90\% accuracy when compared to human verdicts. We explore how to best leverage these capabilities, proposing five design patterns that integrate LLMs with human rating, such as pre-filtering non-violative content, detecting potential errors in human rating, or surfacing critical context to support human rating. We outline how to support all of these design patterns using a single, optimized prompt. Beyond these synthetic experiments, we share how piloting our proposed techniques in a real-world review queue yielded a 41.5\% improvement in optimizing available human rater capacity, and a 9--11\% increase (absolute) in precision and recall for detecting violative content.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Cryptography and Security}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/43E2KV3V/Thomas et al. - 2024 - Supporting Human Raters with the Detection of Harmful Content using Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/NA64MNC9/2406.html} +} + +@misc{yuanSEvalAutomatedComprehensive2025, + title = {S-{{Eval}}: {{Towards Automated}} and {{Comprehensive Safety Evaluation}} for {{Large Language Models}}}, + shorttitle = {S-{{Eval}}}, + author = {Yuan, Xiaohan and Li, Jinfeng and Wang, Dongxia and Chen, Yuefeng and Mao, Xiaofeng and Huang, Longtao and Chen, Jialuo and Xue, Hui and Liu, Xiaoxia and Wang, Wenhai and Ren, Kui and Wang, Jingyi}, + year = {2025}, + month = apr, + number = {arXiv:2405.14191}, + eprint = {2405.14191}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.2405.14191}, + urldate = {2025-08-04}, + abstract = {Generative large language models (LLMs) have revolutionized natural language processing with their transformative and emergent capabilities. However, recent evidence indicates that LLMs can produce harmful content that violates social norms, raising significant concerns regarding the safety and ethical ramifications of deploying these advanced models. Thus, it is both critical and imperative to perform a rigorous and comprehensive safety evaluation of LLMs before deployment. 
Despite this need, owing to the extensiveness of LLM generation space, it still lacks a unified and standardized risk taxonomy to systematically reflect the LLM content safety, as well as automated safety assessment techniques to explore the potential risk efficiently. To bridge the striking gap, we propose S-Eval, a novel LLM-based automated Safety Evaluation framework with a newly defined comprehensive risk taxonomy. S-Eval incorporates two key components, i.e., an expert testing LLM \$\{M\}\_t\$ and a novel safety critique LLM \$\{M\}\_c\$. \$\{M\}\_t\$ is responsible for automatically generating test cases in accordance with the proposed risk taxonomy. \$\{M\}\_c\$ can provide quantitative and explainable safety evaluations for better risk awareness of LLMs. In contrast to prior works, S-Eval is efficient and effective in test generation and safety evaluation. Moreover, S-Eval can be flexibly configured and adapted to the rapid evolution of LLMs and accompanying new safety threats, test generation methods and safety critique methods thanks to the LLM-based architecture. S-Eval has been deployed in our industrial partner for the automated safety evaluation of multiple LLMs serving millions of users, demonstrating its effectiveness in real-world scenarios. Our benchmark is publicly available at https://github.com/IS2Lab/S-Eval.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Computation and Language,Computer Science - Cryptography and Security}, + file = {/home/james/snap/zotero-snap/common/Zotero/storage/LY7T5KAC/Yuan et al. - 2025 - S-Eval Towards Automated and Comprehensive Safety Evaluation for Large Language Models.pdf;/home/james/snap/zotero-snap/common/Zotero/storage/D23M6A5U/2405.html} +}