-
Notifications
You must be signed in to change notification settings - Fork 667
FEAT: Jailbreak Scenario Expansion #1340
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
022f70a
e85cdb9
fc260c3
89a8079
b18f224
96ddf6c
eb4e936
243ea0a
946fdde
132caf5
c4e625f
79d1a64
cb28fda
f399b6d
75436ea
c0022f6
9f579f2
ccf7025
349cc6b
9fa6430
513cbf3
b57b35a
999a0c6
f3ec8bb
89fd8bd
66650a6
fa5b01a
44bc05c
db5270c
9d9666f
302101f
9c7b757
737aabe
472bd20
b07e197
c31d088
6dcf318
ec9d731
163e582
a503a4b
af32046
6da95f9
73d77a6
827ec0e
8168db8
5ac7651
20ef0c3
06bb694
03a1e9b
6a67ac4
4b441d4
b14f564
07b6142
36b6b95
f39aecd
a43eeaf
be5045a
11347d9
4a4f77a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,25 +3,26 @@ | |
|
|
||
| import os | ||
| from pathlib import Path | ||
| from typing import List, Optional | ||
| from typing import List, Optional, Union | ||
|
|
||
| from pyrit.common import apply_defaults | ||
| from pyrit.datasets import TextJailBreak | ||
| from pyrit.executor.attack.core.attack_config import ( | ||
| AttackConverterConfig, | ||
| AttackScoringConfig, | ||
| ) | ||
| from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack | ||
| from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack | ||
| from pyrit.executor.attack.single_turn.role_play import RolePlayAttack, RolePlayPaths | ||
| from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack | ||
| from pyrit.models import SeedAttackGroup | ||
| from pyrit.prompt_converter import TextJailbreakConverter | ||
| from pyrit.prompt_normalizer import PromptConverterConfiguration | ||
| from pyrit.prompt_target import OpenAIChatTarget | ||
| from pyrit.scenario.core.atomic_attack import AtomicAttack | ||
| from pyrit.scenario.core.dataset_configuration import DatasetConfiguration | ||
| from pyrit.scenario.core.scenario import Scenario | ||
| from pyrit.scenario.core.scenario_strategy import ( | ||
| ScenarioStrategy, | ||
| ) | ||
| from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy | ||
| from pyrit.score import ( | ||
| SelfAskRefusalScorer, | ||
| TrueFalseInverterScorer, | ||
|
|
@@ -31,13 +32,38 @@ | |
|
|
||
| class JailbreakStrategy(ScenarioStrategy): | ||
| """ | ||
| Strategy for single-turn jailbreak attacks. | ||
| Strategy for jailbreak attacks. | ||
|
|
||
| The SIMPLE strategy just sends the jailbroken prompt and records the response. It is meant to | ||
| expose an obvious way of using this scenario without worrying about additional tweaks and changes | ||
| to the prompt. | ||
|
|
||
| There is currently only one, running all jailbreaks. | ||
| COMPLEX strategies | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there supposed to be more of a description here of the complex strategies? |
||
| """ | ||
|
|
||
| # Aggregate members (special markers that expand to strategies with matching tags) | ||
ValbuenaVC marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ALL = ("all", {"all"}) | ||
| PYRIT = ("pyrit", {"pyrit"}) | ||
| SIMPLE = ("simple", {"simple"}) | ||
| COMPLEX = ("complex", {"complex"}) | ||
|
|
||
| # Simple strategies | ||
| PromptSending = ("prompt_sending", {"simple"}) | ||
|
|
||
| # Complex strategies | ||
| ManyShot = ("many_shot", {"complex"}) | ||
| SkeletonKey = ("skeleton", {"complex"}) | ||
| RolePlay = ("role_play", {"complex"}) | ||
|
|
||
| @classmethod | ||
| def get_aggregate_tags(cls) -> set[str]: | ||
| """ | ||
| Get the set of tags that represent aggregate categories. | ||
|
|
||
| Returns: | ||
| set[str]: Set of tags that are aggregate markers. | ||
| """ | ||
| # Include base class aggregates ("all") and add scenario-specific ones | ||
| return super().get_aggregate_tags() | {"simple", "complex"} | ||
|
|
||
|
|
||
| class Jailbreak(Scenario): | ||
|
|
@@ -67,9 +93,9 @@ def get_default_strategy(cls) -> ScenarioStrategy: | |
| Get the default strategy used when no strategies are specified. | ||
|
|
||
| Returns: | ||
| ScenarioStrategy: JailbreakStrategy.ALL. | ||
| ScenarioStrategy: JailbreakStrategy.PromptSending. | ||
| """ | ||
| return JailbreakStrategy.ALL | ||
| return JailbreakStrategy.PromptSending | ||
|
|
||
| @classmethod | ||
| def required_datasets(cls) -> list[str]: | ||
|
|
@@ -84,7 +110,7 @@ def default_dataset_config(cls) -> DatasetConfiguration: | |
| Returns: | ||
| DatasetConfiguration: Configuration with airt_harms dataset. | ||
| """ | ||
| return DatasetConfiguration(dataset_names=["airt_harms"], max_dataset_size=4) | ||
| return DatasetConfiguration(dataset_names=["airt_harms"]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any reason for removing max dataset size? I think we have this set so that our integration tests don't run the entire dataset by default, which would slow it down. Is there anywhere where the user provides a max prompt number that we could pass through to here if it's set, and otherwise if not set we keep at default of 4? |
||
|
|
||
| @apply_defaults | ||
| def __init__( | ||
|
|
@@ -93,7 +119,9 @@ def __init__( | |
| objective_scorer: Optional[TrueFalseScorer] = None, | ||
| include_baseline: bool = False, | ||
| scenario_result_id: Optional[str] = None, | ||
| n_jailbreaks: Optional[int] = 3, | ||
| k_jailbreaks: Optional[int] = None, | ||
ValbuenaVC marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| num_tries: int = 1, | ||
ValbuenaVC marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| jailbreak_names: Optional[List[str]] = None, | ||
| ) -> None: | ||
| """ | ||
| Initialize the jailbreak scenario. | ||
|
|
@@ -104,13 +132,39 @@ def __init__( | |
| include_baseline (bool): Whether to include a baseline atomic attack that sends all | ||
| objectives without modifications. Defaults to True. | ||
| scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume. | ||
| n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them. | ||
| k_jailbreaks (Optional[int]): Choose k random jailbreaks rather than using all of them. | ||
| num_tries (Optional[int]): Number of times to try each jailbreak. | ||
| jailbreak_names (Optional[List[str]]): List of jailbreak names from the template list under datasets. | ||
| to use. | ||
|
|
||
| Raises: | ||
| ValueError: If both jailbreak_names and k_jailbreaks are provided, as random selection | ||
| is incompatible with a predetermined list. | ||
| ValueError: If the jailbreak_names list contains a jailbreak that isn't in the listed | ||
| templates. | ||
|
|
||
| """ | ||
| if jailbreak_names and k_jailbreaks: | ||
| raise ValueError( | ||
| "Please provide only one of `k_jailbreaks` (random selection) or `jailbreaks` (specific selection)." | ||
| ) | ||
|
|
||
| if not objective_scorer: | ||
| objective_scorer = self._get_default_objective_scorer() | ||
| self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer) | ||
|
|
||
| self._n = n_jailbreaks | ||
| self._k = k_jailbreaks | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: while it doesn't really matter what our private vars are called, i'm team explicit naming so i'd change these to be
|
||
| self._n = num_tries | ||
|
|
||
| all_templates = TextJailBreak.get_jailbreak_templates() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. would it work for us to change this line to `jailbreak_names = jailbreak_names or TextJailBreak.get_jailbreak_templates(k=k_jailbreaks)`?
Then you won't need an else block below, and you can use the value of `jailbreak_names` directly. From what I see you already have validation above so that we can't have both `jailbreak_names` and `k_jailbreaks` set. |
||
|
|
||
| if jailbreak_names: | ||
| diff = set(jailbreak_names) - set(all_templates) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. curiosity: whoa my brain doesn't compute this logic lol is the diff = the names that are in `jailbreak_names` but not in `all_templates`? could we make the same comparison by checking `set(jailbreak_names).issubset(all_templates)`?
||
| if len(diff) > 0: | ||
| raise ValueError(f"Error: could not find templates `{diff}`!") | ||
| self._jailbreaks = jailbreak_names | ||
| else: | ||
| self._jailbreaks = TextJailBreak.get_jailbreak_templates(k=self._k) | ||
|
|
||
| super().__init__( | ||
| name="Jailbreak", | ||
|
|
@@ -146,6 +200,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer: | |
| ) | ||
| return refusal_scorer | ||
|
|
||
| def _get_default_adversarial_target(self) -> OpenAIChatTarget: | ||
| """ | ||
| Create and retrieve the default adversarial target. | ||
|
|
||
| Returns: | ||
| OpenAIChatTarget: Default adversarial target using an unfiltered endpoint. | ||
| """ | ||
| return OpenAIChatTarget( | ||
| endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"), | ||
| api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"), | ||
| model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), | ||
| temperature=1.2, | ||
| ) | ||
|
|
||
| def _resolve_seed_groups(self) -> List[SeedAttackGroup]: | ||
| """ | ||
| Resolve seed groups from dataset configuration. | ||
|
|
@@ -161,23 +229,14 @@ def _resolve_seed_groups(self) -> List[SeedAttackGroup]: | |
|
|
||
| return list(seed_groups) | ||
|
|
||
| def _get_all_jailbreak_templates(self) -> List[str]: | ||
| """ | ||
| Retrieve all available jailbreak templates. | ||
|
|
||
| Returns: | ||
| List[str]: List of jailbreak template file names. | ||
| """ | ||
| if not self._n: | ||
| return TextJailBreak.get_all_jailbreak_templates() | ||
| else: | ||
| return TextJailBreak.get_all_jailbreak_templates(n=self._n) | ||
|
|
||
| async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack: | ||
| async def _get_atomic_attack_from_strategy_async( | ||
| self, *, strategy: str, jailbreak_template_name: str | ||
| ) -> AtomicAttack: | ||
| """ | ||
| Create an atomic attack for a specific jailbreak template. | ||
|
|
||
| Args: | ||
| strategy (str): JailbreakStrategy to use. | ||
| jailbreak_template_name (str): Name of the jailbreak template file. | ||
|
|
||
| Returns: | ||
|
|
@@ -202,12 +261,33 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na | |
| request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter]) | ||
| ) | ||
|
|
||
| # Create the attack | ||
| attack = PromptSendingAttack( | ||
| objective_target=self._objective_target, | ||
| attack_scoring_config=self._scorer_config, | ||
| attack_converter_config=converter_config, | ||
| ) | ||
| attack: Optional[Union[ManyShotJailbreakAttack, PromptSendingAttack, RolePlayAttack, SkeletonKeyAttack]] = None | ||
| args = { | ||
| "objective_target": self._objective_target, | ||
| "attack_scoring_config": self._scorer_config, | ||
| "attack_converter_config": converter_config, | ||
| } | ||
| match strategy: | ||
| case "many_shot": | ||
| attack = ManyShotJailbreakAttack(**args) | ||
| case "prompt_sending": | ||
| attack = PromptSendingAttack(**args) | ||
| case "skeleton": | ||
| attack = SkeletonKeyAttack(**args) | ||
| case "role_play": | ||
| args["adversarial_chat"] = OpenAIChatTarget( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can we just call `self._get_default_adversarial_target()` here? |
||
| endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"), | ||
| api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"), | ||
| model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"), | ||
| temperature=1.2, | ||
| ) | ||
| args["role_play_definition_path"] = RolePlayPaths.PERSUASION_SCRIPT.value | ||
| attack = RolePlayAttack(**args) | ||
| case _: | ||
| raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.") | ||
|
|
||
| if not attack: | ||
| raise ValueError(f"Attack cannot be None!") | ||
|
|
||
| # Extract template name without extension for the atomic attack name | ||
| template_name = Path(jailbreak_template_name).stem | ||
|
|
@@ -230,11 +310,16 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]: | |
| # Retrieve seed prompts based on selected strategies | ||
| self._seed_groups = self._resolve_seed_groups() | ||
|
|
||
| # Get all jailbreak template names | ||
| jailbreak_template_names = self._get_all_jailbreak_templates() | ||
| strategies = ScenarioCompositeStrategy.extract_single_strategy_values( | ||
| composites=self._scenario_composites, strategy_type=JailbreakStrategy | ||
| ) | ||
|
|
||
| for template_name in jailbreak_template_names: | ||
| atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name) | ||
| atomic_attacks.append(atomic_attack) | ||
| for strategy in strategies: | ||
| for template_name in self._jailbreaks: | ||
| for _ in range(0, self._n): | ||
| atomic_attack = await self._get_atomic_attack_from_strategy_async( | ||
| strategy=strategy, jailbreak_template_name=template_name | ||
| ) | ||
| atomic_attacks.append(atomic_attack) | ||
|
|
||
| return atomic_attacks | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we make this param name more descriptive?
`num_templates` or `num_jailbreaks` should suffice 🫡