Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions doc/code/executor/attack/role_play_attack.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
"source": [
"# Role Play Attack (Single-Turn) - optional\n",
"\n",
"This attack prepends some prompts defined in `role_play_definition`, along with an `adversarial_chat` target LLM to generate the first turns to send. Typically these prompts describe a fictional scenario to attempt and elicit harmful responses.\n",
"Any converters that you provide will be applied to the prompt that has already been converted by the role play definition (using the provided `adversarial_chat` target). You may see better success if you provide a LLM that has no content moderation or other safety mechanisms. Otherwise, it may refuse to convert the prompt as expected.\n",
"This attack prepends some prompts defined in `role_play_definition`, and uses the adversarial chat target LLM supplied via `attack_adversarial_config` to generate the first turns to send. Typically these prompts describe a fictional scenario in an attempt to elicit harmful responses.\n",
"Any converters that you provide will be applied to the prompt that has already been converted by the role play definition (using the provided `attack_adversarial_config` target). You may see better success if you provide an LLM that has no content moderation or other safety mechanisms. Otherwise, it may refuse to convert the prompt as expected.\n",
"\n",
"\n",
"The results and intermediate interactions will be saved to memory according to the environment settings. For details, see the [Memory Configuration Guide](../../memory/0_memory.md)."
Expand Down Expand Up @@ -343,6 +343,7 @@
"\n",
"from pyrit.auth import get_azure_openai_auth\n",
"from pyrit.executor.attack import (\n",
" AttackAdversarialConfig,\n",
" AttackConverterConfig,\n",
" AttackExecutor,\n",
" AttackScoringConfig,\n",
Expand Down Expand Up @@ -375,7 +376,7 @@
"\n",
"attack = RolePlayAttack(\n",
" objective_target=objective_target,\n",
" adversarial_chat=adversarial_chat,\n",
" attack_adversarial_config=AttackAdversarialConfig(target=adversarial_chat),\n",
" role_play_definition_path=RolePlayPaths.MOVIE_SCRIPT.value,\n",
" attack_scoring_config=scoring_config,\n",
" attack_converter_config=converter_config,\n",
Expand Down
7 changes: 4 additions & 3 deletions doc/code/executor/attack/role_play_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
# %% [markdown]
# # Role Play Attack (Single-Turn) - optional
#
# This attack prepends some prompts defined in `role_play_definition`, along with an `adversarial_chat` target LLM to generate the first turns to send. Typically these prompts describe a fictional scenario to attempt and elicit harmful responses.
# Any converters that you provide will be applied to the prompt that has already been converted by the role play definition (using the provided `adversarial_chat` target). You may see better success if you provide a LLM that has no content moderation or other safety mechanisms. Otherwise, it may refuse to convert the prompt as expected.
# This attack prepends some prompts defined in `role_play_definition`, and uses the adversarial chat target LLM supplied via `attack_adversarial_config` to generate the first turns to send. Typically these prompts describe a fictional scenario in an attempt to elicit harmful responses.
# Any converters that you provide will be applied to the prompt that has already been converted by the role play definition (using the provided `attack_adversarial_config` target). You may see better success if you provide an LLM that has no content moderation or other safety mechanisms. Otherwise, it may refuse to convert the prompt as expected.
#
#
# The results and intermediate interactions will be saved to memory according to the environment settings. For details, see the [Memory Configuration Guide](../../memory/0_memory.md).
Expand All @@ -26,6 +26,7 @@

from pyrit.auth import get_azure_openai_auth
from pyrit.executor.attack import (
AttackAdversarialConfig,
AttackConverterConfig,
AttackExecutor,
AttackScoringConfig,
Expand Down Expand Up @@ -58,7 +59,7 @@

attack = RolePlayAttack(
objective_target=objective_target,
adversarial_chat=adversarial_chat,
attack_adversarial_config=AttackAdversarialConfig(target=adversarial_chat),
role_play_definition_path=RolePlayPaths.MOVIE_SCRIPT.value,
attack_scoring_config=scoring_config,
attack_converter_config=converter_config,
Expand Down
1 change: 1 addition & 0 deletions pyrit/executor/attack/single_turn/flip_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ class FlipAttack(PromptSendingAttack):
@apply_defaults
def __init__(
self,
*,
objective_target: PromptChatTarget = REQUIRED_VALUE, # type: ignore[assignment]
attack_converter_config: Optional[AttackConverterConfig] = None,
attack_scoring_config: Optional[AttackScoringConfig] = None,
Expand Down
1 change: 1 addition & 0 deletions pyrit/executor/attack/single_turn/many_shot_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class ManyShotJailbreakAttack(PromptSendingAttack):
@apply_defaults
def __init__(
self,
*,
objective_target: PromptTarget = REQUIRED_VALUE, # type: ignore[assignment]
attack_converter_config: Optional[AttackConverterConfig] = None,
attack_scoring_config: Optional[AttackScoringConfig] = None,
Expand Down
12 changes: 6 additions & 6 deletions pyrit/executor/attack/single_turn/role_play.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from pyrit.common.apply_defaults import REQUIRED_VALUE, apply_defaults
from pyrit.common.path import EXECUTOR_SEED_PROMPT_PATH
from pyrit.executor.attack.core.attack_config import AttackConverterConfig, AttackScoringConfig
from pyrit.executor.attack.core.attack_config import AttackAdversarialConfig, AttackConverterConfig, AttackScoringConfig
from pyrit.executor.attack.core.attack_parameters import AttackParameters
from pyrit.executor.attack.single_turn.prompt_sending import PromptSendingAttack
from pyrit.executor.attack.single_turn.single_turn_attack_strategy import (
Expand All @@ -20,7 +20,7 @@
)
from pyrit.prompt_converter import LLMGenericTextConverter
from pyrit.prompt_normalizer import PromptConverterConfiguration, PromptNormalizer
from pyrit.prompt_target import PromptChatTarget, PromptTarget
from pyrit.prompt_target import PromptTarget

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -66,7 +66,7 @@ def __init__(
self,
*,
objective_target: PromptTarget = REQUIRED_VALUE, # type: ignore[assignment]
adversarial_chat: PromptChatTarget,
attack_adversarial_config: AttackAdversarialConfig,
role_play_definition_path: pathlib.Path,
attack_converter_config: Optional[AttackConverterConfig] = None,
attack_scoring_config: Optional[AttackScoringConfig] = None,
Expand All @@ -78,8 +78,8 @@ def __init__(

Args:
objective_target (PromptTarget): The target system to attack.
adversarial_chat (PromptChatTarget): The adversarial chat target used to rephrase
objectives into role-play scenarios.
attack_adversarial_config (AttackAdversarialConfig): Configuration for the adversarial component,
including the adversarial chat target used to rephrase objectives into role-play scenarios.
role_play_definition_path (pathlib.Path): Path to the YAML file containing role-play
definitions (rephrase instructions, user start turn, assistant start turn).
attack_converter_config (Optional[AttackConverterConfig]): Configuration for prompt converters.
Expand All @@ -102,7 +102,7 @@ def __init__(
)

# Store the adversarial chat for role-play rephrasing
self._adversarial_chat = adversarial_chat
self._adversarial_chat = attack_adversarial_config.target

# Load role-play definitions
role_play_definition = SeedDataset.from_yaml_file(role_play_definition_path)
Expand Down
2 changes: 1 addition & 1 deletion pyrit/scenario/scenarios/airt/content_harms.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def _get_single_turn_attacks(

role_play_attack = RolePlayAttack(
objective_target=self._objective_target,
adversarial_chat=self._adversarial_chat,
attack_adversarial_config=AttackAdversarialConfig(target=self._adversarial_chat),
role_play_definition_path=RolePlayPaths.MOVIE_SCRIPT.value,
)

Expand Down
5 changes: 4 additions & 1 deletion pyrit/scenario/scenarios/airt/jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pyrit.common import apply_defaults
from pyrit.datasets import TextJailBreak
from pyrit.executor.attack.core.attack_config import (
AttackAdversarialConfig,
AttackConverterConfig,
AttackScoringConfig,
)
Expand Down Expand Up @@ -279,7 +280,9 @@ async def _get_atomic_attack_from_strategy_async(
case "skeleton":
attack = SkeletonKeyAttack(**args)
case "role_play":
args["adversarial_chat"] = self._get_or_create_adversarial_target()
args["attack_adversarial_config"] = AttackAdversarialConfig(
target=self._get_or_create_adversarial_target()
)
args["role_play_definition_path"] = RolePlayPaths.PERSUASION_SCRIPT.value
attack = RolePlayAttack(**args)
case _:
Expand Down
2 changes: 1 addition & 1 deletion pyrit/scenario/scenarios/airt/leakage.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,9 +361,9 @@ async def _create_role_play_attack(self) -> RolePlayAttack:
"""
return RolePlayAttack(
objective_target=self._objective_target,
adversarial_chat=self._adversarial_chat,
role_play_definition_path=RolePlayPaths.PERSUASION_SCRIPT.value,
attack_scoring_config=self._scorer_config,
attack_adversarial_config=self._adversarial_config,
)

def _resolve_seed_groups(self) -> list[SeedAttackGroup]:
Expand Down
2 changes: 1 addition & 1 deletion pyrit/scenario/scenarios/airt/psychosocial.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ def _create_single_turn_attacks(
)
role_play = RolePlayAttack(
objective_target=self._objective_target,
adversarial_chat=self._adversarial_chat,
role_play_definition_path=RolePlayPaths.MOVIE_SCRIPT.value,
attack_scoring_config=scoring_config,
attack_adversarial_config=AttackAdversarialConfig(target=self._adversarial_chat),
)
attacks.append(
AtomicAttack(
Expand Down
2 changes: 1 addition & 1 deletion pyrit/scenario/scenarios/airt/scam.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,9 @@ def _get_atomic_attack_from_strategy(self, strategy: str) -> AtomicAttack:
elif strategy == "role_play":
attack_strategy = RolePlayAttack(
objective_target=self._objective_target,
adversarial_chat=self._adversarial_chat,
role_play_definition_path=RolePlayPaths.PERSUASION_SCRIPT_WRITTEN.value,
attack_scoring_config=self._scorer_config,
attack_adversarial_config=self._adversarial_config,
)
elif strategy == "context_compliance":
# Set system prompt to default
Expand Down
19 changes: 10 additions & 9 deletions tests/unit/executor/attack/single_turn/test_role_play.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from unit.mocks import get_mock_scorer_identifier, get_mock_target_identifier

from pyrit.executor.attack import (
AttackAdversarialConfig,
AttackConverterConfig,
AttackParameters,
AttackScoringConfig,
Expand Down Expand Up @@ -96,7 +97,7 @@ def role_play_attack(mock_objective_target, mock_adversarial_chat_target, role_p
"""Create a RolePlayAttack instance with default configuration"""
return RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
)

Expand All @@ -118,7 +119,7 @@ def test_init(self, mock_objective_target, mock_adversarial_chat_target, role_pl
"""Test RolePlayAttack initialization with default parameters"""
attack = RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
)

Expand All @@ -133,7 +134,7 @@ def test_init_with_valid_true_false_scorer(
"""Test RolePlayAttack initialization with a valid true/false scorer"""
attack = RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
attack_scoring_config=AttackScoringConfig(objective_scorer=mock_scorer),
)
Expand All @@ -148,7 +149,7 @@ def test_init_with_invalid_scorer_type(
with pytest.raises(ValueError, match="Objective scorer must be a TrueFalseScorer"):
RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
attack_scoring_config=AttackScoringConfig(objective_scorer=scorer),
)
Expand All @@ -159,7 +160,7 @@ def test_init_with_invalid_role_play_definition_path(self, mock_objective_target
with pytest.raises(FileNotFoundError):
RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=invalid_path,
)

Expand All @@ -172,7 +173,7 @@ def test_init_with_custom_parameters(

attack = RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
attack_converter_config=AttackConverterConfig(
request_converters=request_converters, response_converters=response_converters
Expand All @@ -194,7 +195,7 @@ def test_init_with_negative_max_attempts(
with pytest.raises(ValueError, match="max_attempts_on_failure must be a non-negative integer"):
RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
max_attempts_on_failure=-1,
)
Expand All @@ -205,7 +206,7 @@ def test_init_loads_role_play_definition_correctly(
"""Test that role play definitions are loaded correctly from YAML"""
attack = RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
)

Expand All @@ -220,7 +221,7 @@ def test_rephrase_converter_created(
"""Test that the rephrase converter is properly created"""
attack = RolePlayAttack(
objective_target=mock_objective_target,
adversarial_chat=mock_adversarial_chat_target,
attack_adversarial_config=AttackAdversarialConfig(target=mock_adversarial_chat_target),
role_play_definition_path=role_play_definition_file,
)

Expand Down
Loading