Skip to content

Commit e995149

Browse files
alzambranolu13 and amanjaiswal73892
authored and committed
TODOs completed: Completed README, added test
1 parent c288f29 commit e995149

File tree

13 files changed

+296
-523
lines changed

13 files changed

+296
-523
lines changed

.github/workflows/pypi.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ jobs:
2828

2929
- name: Build a binary wheel and a source tarball (browsergym-webarena)
3030
run: python3 -m build browsergym/webarena/ --outdir dist/
31+
32+
- name: Build a binary wheel and a source tarball (browsergym-safearena)
33+
run: python3 -m build browsergym/safearena/ --outdir dist/
3134

3235
- name: Build a binary wheel and a source tarball (browsergym-webarena)
3336
run: python3 -m build browsergym/visualwebarena/ --outdir dist/

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ BrowserGym includes the following benchmarks by default:
4343
- [WorkArena](https://github.com/ServiceNow/WorkArena)
4444
- [AssistantBench](https://github.com/oriyor/assistantbench)
4545
- [WebLINX](https://github.com/McGill-NLP/weblinx) (static benchmark)
46+
- [SafeArena](https://safearena.github.io/)
4647

4748
Designing new web benchmarks with BrowserGym is easy, and simply requires to inherit the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.
4849

@@ -59,6 +60,7 @@ pip install browsergym-visualwebarena # core + visualwebarena
5960
pip install browsergym-workarena # core + workarena
6061
pip install browsergym-assistantbench # core + assistantbench
6162
pip install weblinx-browsergym # core + weblinx
63+
pip install browsergym-safearena # core + safearena
6264
```
6365

6466
Then setup playwright by running
@@ -72,6 +74,7 @@ Finally, each benchmark comes with its own specific setup that requires to follo
7274
- for VisualWebArena, see [visualwebarena/README.md](browsergym/visualwebarena/README.md)
7375
- for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena)
7476
- for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md)
77+
- for SafeArena, see [safearena/README.md](browsergym/safearena/README.md)
7578

7679
### 🏗️ Development setup
7780

@@ -173,11 +176,26 @@ import browsergym.workarena # register assistantbench tasks as gym environments
173176
env = gym.make("browsergym/assistantbench.validation.3")
174177
...
175178

179+
176180
# list all the available assistantbench tasks
177181
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/workarena")]
178182
print("\n".join(env_ids))
179183
```
180184

185+
SafeArena
186+
```python
187+
import gymnasium as gym
188+
import browsergym.safearena # register safearena tasks as gym environments
189+
190+
# start a safearena task
191+
env = gym.make("browsergym/safearena_all.110") # or safearena_safe, safearena_harm
192+
...
193+
194+
# list all the available safearena tasks
195+
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/safearena_all")]
196+
print("\n".join(env_ids))
197+
```
198+
181199
## 💻 Demo
182200

183201
If you want to experiment with a demo agent in BrowserGym, follow these steps
@@ -211,6 +229,9 @@ python demo_agent/run_demo.py --task_name webarena.4
211229

212230
# visualwebarena
213231
python demo_agent/run_demo.py --task_name visualwebarena.398
232+
233+
# safearena
234+
python demo_agent/run_demo.py --task_name safearena_all.4
214235
```
215236

216237
You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more!
@@ -229,6 +250,7 @@ python demo_agent/run_demo.py --help
229250
- [WebLINX](https://github.com/McGill-NLP/weblinx): A dataset of real-world web interaction traces.
230251
- [AssistantBench](https://github.com/oriyor/assistantbench): A benchmark of realistic and time-consuming tasks on the open web.
231252
- [DoomArena](https://github.com/ServiceNow/DoomArena): A framework for AI agent security testing which supports injecting attacks into web pages from Browsergym environments.
253+
- [SafeArena](https://safearena.github.io/): Evaluate Web Agents on malicious, realistic, webarena-like tasks.
232254

233255
## 🌟 Contributors
234256

browsergym/experiments/src/browsergym/experiments/benchmark/configs.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -266,41 +266,40 @@
266266
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
267267
is_multi_tab=True,
268268
supports_parallel_seeds=False,
269-
backends=["webarena"],
269+
backends=["safearena"],
270270
env_args_list=make_env_args_list_from_repeat_tasks(
271-
task_list=task_list_from_metadata(metadata=task_metadata("safearena_all")),
271+
task_list=task_list_from_metadata(metadata=task_metadata("safearena")),
272272
max_steps=30,
273273
n_repeats=1,
274274
seeds_rng=np.random.RandomState(42),
275275
),
276-
task_metadata=task_metadata("safearena_all"),
276+
task_metadata=task_metadata("safearena"),
277277
),
278278
"safearena_harm": lambda: Benchmark(
279-
name="safenarena_all",
279+
name="safearena_harm",
280280
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
281281
is_multi_tab=True,
282282
supports_parallel_seeds=False,
283-
backends=["webarena"],
284-
env_args_list=make_env_args_list_from_repeat_tasks(
285-
task_list=task_list_from_metadata(metadata=task_metadata("safearena_harm")),
283+
backends=["safearena"],
284+
env_args_list=make_env_args_list_from_fixed_seeds(
285+
task_list=[f"safearena.{i}" for i in range(1, 250)],
286286
max_steps=30,
287-
n_repeats=1,
288-
seeds_rng=np.random.RandomState(42),
287+
fixed_seeds=[0],
289288
),
290-
task_metadata=task_metadata("safearena_harm"),
289+
task_metadata=task_metadata("safearena"),
291290
),
291+
292292
"safearena_safe": lambda: Benchmark(
293-
name="safenarena_all",
293+
name="safearena_safe",
294294
high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"],
295295
is_multi_tab=True,
296296
supports_parallel_seeds=False,
297-
backends=["webarena"],
298-
env_args_list=make_env_args_list_from_repeat_tasks(
299-
task_list=task_list_from_metadata(metadata=task_metadata("safearena_safe")),
297+
backends=["safearena"],
298+
env_args_list=make_env_args_list_from_fixed_seeds(
299+
task_list=[f"safearena.{i}" for i in range(250, 500)],
300300
max_steps=30,
301-
n_repeats=1,
302-
seeds_rng=np.random.RandomState(42),
301+
fixed_seeds=[0],
303302
),
304-
task_metadata=task_metadata("safearena_safe"),
305-
),
303+
task_metadata=task_metadata("safearena"),
304+
)
306305
}

browsergym/experiments/src/browsergym/experiments/benchmark/metadata/safearena_all.csv renamed to browsergym/experiments/src/browsergym/experiments/benchmark/metadata/safearena.csv

File renamed without changes.

0 commit comments

Comments
 (0)