Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
022f70a
Scaffolding
Jan 26, 2026
e85cdb9
Precommit
Jan 26, 2026
fc260c3
fixtures and basic tests
Jan 27, 2026
89a8079
basic tests
Jan 27, 2026
b18f224
basic tests
Jan 27, 2026
96ddf6c
last test
Jan 28, 2026
eb4e936
jailbreak format test
Jan 28, 2026
243ea0a
sample jailbreak prompt
Jan 28, 2026
946fdde
Merge branch 'main' into jailbreak
ValbuenaVC Jan 28, 2026
132caf5
real jailbreaks added
Jan 28, 2026
c4e625f
Merge branch 'main' into jailbreak
ValbuenaVC Jan 28, 2026
79d1a64
Merge branch 'main' into jailbreak
ValbuenaVC Jan 29, 2026
cb28fda
changing dataset name
Jan 29, 2026
f399b6d
moved jailbreak discovery
Jan 29, 2026
75436ea
changed path resolution
Jan 29, 2026
c0022f6
minor changes
Jan 29, 2026
9f579f2
minor bug
Jan 29, 2026
ccf7025
Merge branch 'main' into jailbreak
ValbuenaVC Jan 29, 2026
349cc6b
old dataset name
Jan 30, 2026
9fa6430
precommit
Jan 30, 2026
513cbf3
random jailbreak selection
Jan 30, 2026
b57b35a
error handling
Jan 30, 2026
999a0c6
error handling docstring
Jan 30, 2026
f3ec8bb
Merge branch 'Azure:main' into jailbreak2
ValbuenaVC Jan 30, 2026
89fd8bd
scaffolding
Jan 30, 2026
66650a6
scaffolding for subset
Jan 30, 2026
fa5b01a
scaffolding
Jan 30, 2026
44bc05c
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 5, 2026
db5270c
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 5, 2026
9d9666f
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 7, 2026
302101f
subset
Feb 9, 2026
9c7b757
tweaking
Feb 10, 2026
737aabe
new strategy template
Feb 10, 2026
472bd20
types'
Feb 10, 2026
b07e197
adversarial
Feb 10, 2026
c31d088
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 10, 2026
6dcf318
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 11, 2026
ec9d731
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 11, 2026
163e582
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 11, 2026
a503a4b
unit test fixes
Feb 11, 2026
af32046
Merge branch 'jailbreak2' of https://github.com/ValbuenaVC/PyRIT into…
Feb 11, 2026
6da95f9
unit test fix
Feb 11, 2026
73d77a6
mypy
Feb 11, 2026
827ec0e
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 12, 2026
8168db8
params
Feb 12, 2026
5ac7651
tweaks
Feb 12, 2026
20ef0c3
dataset_size
Feb 12, 2026
06bb694
k_jailbreak bug
Feb 13, 2026
03a1e9b
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 13, 2026
6a67ac4
tests
Feb 13, 2026
4b441d4
new strategies
Feb 14, 2026
b14f564
adversarial chat
Feb 14, 2026
07b6142
roleplay path
Feb 14, 2026
36b6b95
roleplay
Feb 14, 2026
f39aecd
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 17, 2026
a43eeaf
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 17, 2026
be5045a
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 18, 2026
11347d9
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 19, 2026
4a4f77a
Merge branch 'main' into jailbreak2
ValbuenaVC Feb 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pyrit/datasets/jailbreak/text_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,12 @@ def __init__(
self.template.value = self.template.render_template_value_silent(**kwargs)

@classmethod
def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
def get_jailbreak_templates(cls, k: Optional[int] = None) -> List[str]:
"""
Retrieve all jailbreaks from the JAILBREAK_TEMPLATES_PATH.

Args:
n (int, optional): Number of jailbreak templates to return. None to get all.
k (int, optional): Number of jailbreak templates to return. None to get all.
Copy link
Contributor

@nina-msft nina-msft Feb 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we make this param name more descriptive?

num_templates or num_jailbreaks should suffice 🫡


Returns:
List[str]: List of jailbreak template file names.
Expand All @@ -122,12 +122,12 @@ def get_all_jailbreak_templates(cls, n: Optional[int] = None) -> List[str]:
if not jailbreak_template_names:
raise ValueError("No jailbreak templates found in the jailbreak directory")

if n:
if n > len(jailbreak_template_names):
if k:
if k > len(jailbreak_template_names):
raise ValueError(
f"Attempted to pull {n} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
f"Attempted to pull {k} jailbreaks from a dataset with only {len(jailbreak_template_names)} jailbreaks!"
)
jailbreak_template_names = random.choices(jailbreak_template_names, k=n)
jailbreak_template_names = random.choices(jailbreak_template_names, k=k)
return jailbreak_template_names

def get_jailbreak_system_prompt(self) -> str:
Expand Down
159 changes: 122 additions & 37 deletions pyrit/scenario/scenarios/airt/jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,26 @@

import os
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Union

from pyrit.common import apply_defaults
from pyrit.datasets import TextJailBreak
from pyrit.executor.attack.core.attack_config import (
AttackConverterConfig,
AttackScoringConfig,
)
from pyrit.executor.attack.single_turn.many_shot_jailbreak import ManyShotJailbreakAttack
from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
from pyrit.executor.attack.single_turn.role_play import RolePlayAttack, RolePlayPaths
from pyrit.executor.attack.single_turn.skeleton_key import SkeletonKeyAttack
from pyrit.models import SeedAttackGroup
from pyrit.prompt_converter import TextJailbreakConverter
from pyrit.prompt_normalizer import PromptConverterConfiguration
from pyrit.prompt_target import OpenAIChatTarget
from pyrit.scenario.core.atomic_attack import AtomicAttack
from pyrit.scenario.core.dataset_configuration import DatasetConfiguration
from pyrit.scenario.core.scenario import Scenario
from pyrit.scenario.core.scenario_strategy import (
ScenarioStrategy,
)
from pyrit.scenario.core.scenario_strategy import ScenarioCompositeStrategy, ScenarioStrategy
from pyrit.score import (
SelfAskRefusalScorer,
TrueFalseInverterScorer,
Expand All @@ -31,13 +32,38 @@

class JailbreakStrategy(ScenarioStrategy):
"""
Strategy for single-turn jailbreak attacks.
Strategy for jailbreak attacks.

The SIMPLE strategy just sends the jailbroken prompt and records the response. It is meant to
expose an obvious way of using this scenario without worrying about additional tweaks and changes
to the prompt.

There is currently only one, running all jailbreaks.
COMPLEX strategies
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there supposed to be more of a description here of the complex strategies?

"""

# Aggregate members (special markers that expand to strategies with matching tags)
ALL = ("all", {"all"})
PYRIT = ("pyrit", {"pyrit"})
SIMPLE = ("simple", {"simple"})
COMPLEX = ("complex", {"complex"})

# Simple strategies
PromptSending = ("prompt_sending", {"simple"})

# Complex strategies
ManyShot = ("many_shot", {"complex"})
SkeletonKey = ("skeleton", {"complex"})
RolePlay = ("role_play", {"complex"})

@classmethod
def get_aggregate_tags(cls) -> set[str]:
    """
    Return every tag that acts as an aggregate marker for this strategy enum.

    Returns:
        set[str]: The parent class's aggregate tags combined with the
            scenario-specific "simple" and "complex" tags.
    """
    # Start from whatever the parent class reports, then layer the
    # scenario-specific aggregate tags on top.
    inherited_tags = super().get_aggregate_tags()
    scenario_tags = {"simple", "complex"}
    return inherited_tags | scenario_tags


class Jailbreak(Scenario):
Expand Down Expand Up @@ -67,9 +93,9 @@ def get_default_strategy(cls) -> ScenarioStrategy:
Get the default strategy used when no strategies are specified.

Returns:
ScenarioStrategy: JailbreakStrategy.ALL.
ScenarioStrategy: JailbreakStrategy.PromptSending.
"""
return JailbreakStrategy.ALL
return JailbreakStrategy.PromptSending

@classmethod
def required_datasets(cls) -> list[str]:
Expand All @@ -84,7 +110,7 @@ def default_dataset_config(cls) -> DatasetConfiguration:
Returns:
DatasetConfiguration: Configuration with airt_harms dataset.
"""
return DatasetConfiguration(dataset_names=["airt_harms"], max_dataset_size=4)
return DatasetConfiguration(dataset_names=["airt_harms"])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason for removing max dataset size? I think we have this set so that our integration tests don't run the entire dataset by default, which would slow it down.

Is there anywhere the user provides a max prompt number that we could pass through to here if it's set, and otherwise keep the default of 4?


@apply_defaults
def __init__(
Expand All @@ -93,7 +119,9 @@ def __init__(
objective_scorer: Optional[TrueFalseScorer] = None,
include_baseline: bool = False,
scenario_result_id: Optional[str] = None,
n_jailbreaks: Optional[int] = 3,
k_jailbreaks: Optional[int] = None,
num_tries: int = 1,
jailbreak_names: Optional[List[str]] = None,
) -> None:
"""
Initialize the jailbreak scenario.
Expand All @@ -104,13 +132,39 @@ def __init__(
include_baseline (bool): Whether to include a baseline atomic attack that sends all
objectives without modifications. Defaults to True.
scenario_result_id (Optional[str]): Optional ID of an existing scenario result to resume.
n_jailbreaks (Optional[int]): Choose n random jailbreaks rather than using all of them.
k_jailbreaks (Optional[int]): Choose k random jailbreaks rather than using all of them.
num_tries (Optional[int]): Number of times to try each jailbreak.
jailbreak_names (Optional[List[str]]): List of jailbreak names from the template list under datasets.
to use.

Raises:
ValueError: If both jailbreak_names and k_jailbreaks are provided, as random selection
is incompatible with a predetermined list.
ValueError: If the jailbreak_names list contains a jailbreak that isn't in the listed
templates.

"""
if jailbreak_names and k_jailbreaks:
raise ValueError(
"Please provide only one of `k_jailbreaks` (random selection) or `jailbreaks` (specific selection)."
)

if not objective_scorer:
objective_scorer = self._get_default_objective_scorer()
self._scorer_config = AttackScoringConfig(objective_scorer=objective_scorer)

self._n = n_jailbreaks
self._k = k_jailbreaks
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: while it doesn't really matter what our private vars are called, i'm team explicit naming so i'd change these to be

self._num_jailbreaks and self._num_tries (or _num_attempts) respectively

self._n = num_tries

all_templates = TextJailBreak.get_jailbreak_templates()
Copy link
Contributor

@nina-msft nina-msft Feb 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would it work for us to change this line to

self._jailbreaks = TextJailBreak.get_jailbreak_templates(k=self._k) - if _k is None this will grab all templates, otherwise it will get the selected ones.

Then you won't need an else block below, and you can use the value of self._jailbreaks where you're using all_templates right now in the if block.

From what I see you already have validation above so that we can't have k_jailbreaks and jailbreak_names at the same time so if we enter the if jailbreak_names block we can assume that the list of templates is all of them.


if jailbreak_names:
diff = set(jailbreak_names) - set(all_templates)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

curiosity: whoa my brain doesn't compute this logic lol

is diff the set of names that are in jailbreak_names but not in all_templates?

could we make the same comparison by checking, for each name in jailbreak_names, whether name is not in set(all_templates) and raising an error if so - and is this just a more efficient way of doing that?

if len(diff) > 0:
raise ValueError(f"Error: could not find templates `{diff}`!")
self._jailbreaks = jailbreak_names
else:
self._jailbreaks = TextJailBreak.get_jailbreak_templates(k=self._k)

super().__init__(
name="Jailbreak",
Expand Down Expand Up @@ -146,6 +200,20 @@ def _get_default_objective_scorer(self) -> TrueFalseScorer:
)
return refusal_scorer

def _get_default_adversarial_target(self) -> OpenAIChatTarget:
    """
    Build the default adversarial chat target.

    Connection settings are read from the AZURE_OPENAI_GPT4O_UNSAFE_CHAT_*
    environment variables. A variable that is not set is passed through as
    None rather than raising here.

    Returns:
        OpenAIChatTarget: Target configured from the environment with a
            temperature of 1.2.
    """
    # Look up each setting first so the constructor call stays on one line.
    endpoint = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT")
    api_key = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY")
    model_name = os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL")
    return OpenAIChatTarget(endpoint=endpoint, api_key=api_key, model_name=model_name, temperature=1.2)

def _resolve_seed_groups(self) -> List[SeedAttackGroup]:
"""
Resolve seed groups from dataset configuration.
Expand All @@ -161,23 +229,14 @@ def _resolve_seed_groups(self) -> List[SeedAttackGroup]:

return list(seed_groups)

def _get_all_jailbreak_templates(self) -> List[str]:
"""
Retrieve all available jailbreak templates.

Returns:
List[str]: List of jailbreak template file names.
"""
if not self._n:
return TextJailBreak.get_all_jailbreak_templates()
else:
return TextJailBreak.get_all_jailbreak_templates(n=self._n)

async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_name: str) -> AtomicAttack:
async def _get_atomic_attack_from_strategy_async(
self, *, strategy: str, jailbreak_template_name: str
) -> AtomicAttack:
"""
Create an atomic attack for a specific jailbreak template.

Args:
strategy (str): JailbreakStrategy to use.
jailbreak_template_name (str): Name of the jailbreak template file.

Returns:
Expand All @@ -202,12 +261,33 @@ async def _get_atomic_attack_from_jailbreak_async(self, *, jailbreak_template_na
request_converters=PromptConverterConfiguration.from_converters(converters=[jailbreak_converter])
)

# Create the attack
attack = PromptSendingAttack(
objective_target=self._objective_target,
attack_scoring_config=self._scorer_config,
attack_converter_config=converter_config,
)
attack: Optional[Union[ManyShotJailbreakAttack, PromptSendingAttack, RolePlayAttack, SkeletonKeyAttack]] = None
args = {
"objective_target": self._objective_target,
"attack_scoring_config": self._scorer_config,
"attack_converter_config": converter_config,
}
match strategy:
case "many_shot":
attack = ManyShotJailbreakAttack(**args)
case "prompt_sending":
attack = PromptSendingAttack(**args)
case "skeleton":
attack = SkeletonKeyAttack(**args)
case "role_play":
args["adversarial_chat"] = OpenAIChatTarget(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we just call _get_default_adversarial_target here?

endpoint=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_ENDPOINT"),
api_key=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_KEY"),
model_name=os.environ.get("AZURE_OPENAI_GPT4O_UNSAFE_CHAT_MODEL"),
temperature=1.2,
)
args["role_play_definition_path"] = RolePlayPaths.PERSUASION_SCRIPT.value
attack = RolePlayAttack(**args)
case _:
raise ValueError(f"Unknown JailbreakStrategy `{strategy}`.")

if not attack:
raise ValueError(f"Attack cannot be None!")

# Extract template name without extension for the atomic attack name
template_name = Path(jailbreak_template_name).stem
Expand All @@ -230,11 +310,16 @@ async def _get_atomic_attacks_async(self) -> List[AtomicAttack]:
# Retrieve seed prompts based on selected strategies
self._seed_groups = self._resolve_seed_groups()

# Get all jailbreak template names
jailbreak_template_names = self._get_all_jailbreak_templates()
strategies = ScenarioCompositeStrategy.extract_single_strategy_values(
composites=self._scenario_composites, strategy_type=JailbreakStrategy
)

for template_name in jailbreak_template_names:
atomic_attack = await self._get_atomic_attack_from_jailbreak_async(jailbreak_template_name=template_name)
atomic_attacks.append(atomic_attack)
for strategy in strategies:
for template_name in self._jailbreaks:
for _ in range(0, self._n):
atomic_attack = await self._get_atomic_attack_from_strategy_async(
strategy=strategy, jailbreak_template_name=template_name
)
atomic_attacks.append(atomic_attack)

return atomic_attacks
Loading