Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
157fda5
feat: add evaluation summary dashboard with filtering and directory m…
Apr 9, 2026
f82134e
refactor: apply black formatting to viewer/main.py for improved code …
Apr 9, 2026
bd3a3b0
feat: add requester field, sorting, and UI styling to evaluation view…
Apr 9, 2026
f393e17
feat: configure min instances for Cloud Run and clean up unused filte…
Apr 9, 2026
d3a7723
Merge branch 'main' into viewer-v2
IsmailMehdi Apr 9, 2026
3284eb4
chore: bump version to 1.2.0, add error logging to file readers, and …
Apr 9, 2026
5472499
Merge branch 'main' into viewer-v2
IsmailMehdi Apr 9, 2026
d347be9
Merge branch 'main' into viewer-v2
IsmailMehdi Apr 9, 2026
7ccf093
feat: add git commit version tracking to build process and display in…
Apr 9, 2026
ae832d1
chore: remove unused version.txt file
Apr 9, 2026
4e36317
chore: ignore autogenerated version file in viewer directory
Apr 9, 2026
4ea1def
feat: add background precomputation worker for trends and update UI s…
Apr 9, 2026
610385e
feat: implement incremental precomputation of trends with persistent …
Apr 9, 2026
dc943bb
feat: add experimental configuration summary dashboard panel to the v…
Apr 9, 2026
4053d3a
refactor: replace plotly charts with custom d3.js implementation for …
Apr 10, 2026
f8d970b
Merge branch 'main' into viewer-v2
Apr 10, 2026
ceb814c
refactor: remove unused make_on_click helper function from main viewe…
Apr 10, 2026
08df159
refactor: remove redundant inline logging imports in main.py
Apr 10, 2026
6a3afb7
refactor: remove unused imports, improve exception handling, and add …
Apr 10, 2026
4e9e498
fix: add logging for failed run_time date parsing in trend precomputa…
Apr 10, 2026
a4b7847
refactor: consolidate State class, introduce status dashboard, and up…
Apr 10, 2026
9f0402b
feat: add dataset column to eval viewer, enable product click-to-filt…
Apr 10, 2026
082a5be
feat: add goal completion fallback to precompute and display in viewer
Apr 11, 2026
2e5a202
feat: add cache clearing functionality and increase Gunicorn timeout …
Apr 11, 2026
f938c0b
Merge remote-tracking branch 'origin/main' into viewer-v2
Apr 12, 2026
bb8f83a
feat: update UI, increase gunicorn worker count to 12, and bump packa…
Apr 12, 2026
615f3c7
feat: integrate AI-driven evaluation scoring and summary generation i…
Apr 13, 2026
17f3eac
Merge origin/main and adopt supervisord for Cloud Run with fixes
Apr 13, 2026
f1ba5da
feat: update Gemini model to flash, enhance chart interactivity with …
Apr 13, 2026
c311926
Resolve merge conflicts with origin/main
Apr 13, 2026
3cd1f22
fix: remove unused json import and add fallback error message to summ…
Apr 13, 2026
1aefae6
feat: add rows-to-show filter, sanitize event handler names, and refi…
Apr 14, 2026
e1438a7
refactor: decouple frontend startup script, fix event handler routing…
Apr 14, 2026
73a198e
Merge branch 'main' into viewer-v2
IsmailMehdi Apr 14, 2026
a77ea13
feat: add RWLock to SessionManager for thread safety and return error…
Apr 14, 2026
60fea62
chore: cleanup whitespace in Python files and expand Makefile phony t…
Apr 14, 2026
d98d90f
feat: implement Gemini-based evaluation comparison tool and update la…
Apr 14, 2026
3529693
style: increase dropdown trigger width to 300px
Apr 14, 2026
7f8e2f0
refactor: relocate ai_comparer config and update general score parsin…
Apr 14, 2026
bcbf018
Merge branch 'main' into comparer
IsmailMehdi Apr 15, 2026
dada696
refactor: remove redundant local json imports in main.py
Apr 15, 2026
d789012
Merge branch 'comparer' of github.com:GoogleCloudPlatform/evalbench i…
Apr 15, 2026
788df7c
Merge origin/main into comparer
Apr 15, 2026
0ff51e2
Merge branch 'main' into comparer
IsmailMehdi Apr 15, 2026
b623313
Merge branch 'comparer' of github.com:GoogleCloudPlatform/evalbench i…
Apr 15, 2026
c438fb8
fix: include database identifier in error logs when connection acquis…
Apr 15, 2026
bbb7645
fix: update error handling to log critical failures using logging.err…
Apr 15, 2026
b07489a
feat: update VPA mode to Initial to prevent disruptive evictions
Apr 15, 2026
b7e10d2
feat: set pod resources to 2/3 of node capacity and disable VPA overr…
Apr 15, 2026
20d151f
Merge branch 'main' into logcleanup
IsmailMehdi Apr 15, 2026
8d535ba
Merge branch 'main' into logcleanup
IsmailMehdi Apr 15, 2026
676ea5c
refactor: remove redundant whitespace after filtering summaries
Apr 15, 2026
64905e5
feat: include session_id in EvalResponse for Ping, Connect, EvalConfi…
Apr 15, 2026
c4f2326
chore: reduce k8s resource requests and implement session persistence…
Apr 15, 2026
dabef41
Merge branch 'origin/main' into dynamic-chart
Apr 16, 2026
23aebed
feat: add interactive zooming and legend-based filtering to trend charts
Apr 16, 2026
787a38a
refactor: replace custom tab button implementation with me.button_tog…
Apr 16, 2026
e894f0c
Merge branch 'origin/main' into dynamic-chart
Apr 16, 2026
85e0989
fix: correct syntax error by replacing trailing parenthesis with semi…
Apr 16, 2026
f15c196
feat: add support for cross-evaluation comparison via query parameter…
Apr 17, 2026
24d3c7e
Merge branch 'main' of github.com:GoogleCloudPlatform/evalbench into …
Apr 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion evalbench/util/sessionmgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,28 @@ def __init__(
self.sessions = {}
self.ttl = 10800
self.lock = RWLock()
self.load_sessions_from_disk()
logging.debug("Starting reaper...")
reaper = Thread(target=self.reaper, args=[])
reaper.daemon = True
reaper.start()

def load_sessions_from_disk(self):
    """Repopulate ``self.sessions`` from session directories found on disk.

    Every sub-directory of ``SESSION_RESOURCES_PATH`` is registered as one
    session: the directory name is used as the session id and the
    directory's modification time is stored as ``create_ts``. Entries that
    are not directories are ignored, and a missing
    ``SESSION_RESOURCES_PATH`` is a silent no-op.

    Loading is best-effort and never raises. A failure while inspecting a
    single entry (for example, it is removed between the listing and the
    stat call) is logged and that entry is skipped, so the remaining
    sessions are still registered.
    """
    try:
        if not os.path.exists(SESSION_RESOURCES_PATH):
            return
        session_ids = os.listdir(SESSION_RESOURCES_PATH)
    except Exception as e:
        # Deliberately broad: restoring sessions is optional, so a problem
        # with the resources root must be logged rather than propagated.
        logging.error(f"Error loading sessions from disk: {e}")
        return

    for sid in session_ids:
        dir_path = os.path.join(SESSION_RESOURCES_PATH, sid)
        try:
            if not os.path.isdir(dir_path):
                continue
            mtime = os.path.getmtime(dir_path)
        except OSError as e:
            # Isolate the failure to this entry; keep loading the others.
            logging.error(f"Error loading session {sid} from disk: {e}")
            continue
        logging.info(f"Loading session {sid} from disk with mtime {mtime}.")
        self.sessions[sid] = {
            "create_ts": mtime,
            "session_id": sid,
        }

def set_ttl(self, ttl):
    """Replace the time-to-live value stored on this manager.

    Args:
        ttl: New value assigned to ``self.ttl`` (presumably seconds, to
            match the other timestamps used for sessions; confirm against
            the reaper before relying on the unit).
    """
    self.ttl = ttl

Expand Down Expand Up @@ -78,7 +95,10 @@ def prune_resource_files(self, session_id):
os.remove(file_path)
for dir in dirs:
dir_path = os.path.join(root, dir)
os.rmdir(dir_path)
if os.path.islink(dir_path):
os.unlink(dir_path)
else:
os.rmdir(dir_path)
os.rmdir(path)

def create_session(self, session_id):
Expand Down
8 changes: 4 additions & 4 deletions evalbench_service/k8s/evalbench.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ spec:
name: evalbench-eval
resources:
requests:
cpu: "42"
memory: "168Gi"
cpu: "20"
memory: "80Gi"
limits:
cpu: "42"
memory: "168Gi"
cpu: "20"
memory: "80Gi"
securityContext:
allowPrivilegeEscalation: true
capabilities:
Expand Down
56 changes: 29 additions & 27 deletions viewer/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,19 @@ def on_load(e: me.LoadEvent):
if job_id and job_id in directories:
state.selected_directory = job_id

tab = me.query_params.get("tab")
eval1 = me.query_params.get("eval1")
eval2 = me.query_params.get("eval2")

if tab == "compare" and eval1 and eval2:
state.selected_main_tab = "Compare"
state.compare_tab_visible = True
state.compare_evals = json.dumps([eval1, eval2])
# Trigger the AI comparison
state.ai_comparison = compare_evals(eval1, eval2)





def status_component():
Expand Down Expand Up @@ -2281,36 +2294,25 @@ def on_next_conversation(e: me.ClickEvent):
from trends import trends_component
state = me.state(State)

def on_main_tab_change(e: me.ButtonToggleChangeEvent):
    """Store the tab chosen in the main button toggle and log the switch.

    Args:
        e: Change event whose ``value`` is written to
            ``State.selected_main_tab``.
    """
    chosen_tab = e.value
    me.state(State).selected_main_tab = chosen_tab
    logging.info(f"Main tab changed to: {chosen_tab}")

with me.box(style=me.Style(margin=me.Margin(bottom="12px"))):
tabs = ["Status", "List", "Charts"]
buttons = [
me.ButtonToggleButton(label="Status", value="Status"),
me.ButtonToggleButton(label="List", value="List"),
me.ButtonToggleButton(label="Charts", value="Charts"),
]
if state.compare_tab_visible:
tabs.append("Compare")
for tab in tabs:
is_active = state.selected_main_tab == tab
tab_text = tab
if tab == "Compare" and state.ai_comparison == "Comparing...":
tab_text += " (Loading...)"

click_handler = None
if tab == "Status": click_handler = on_status_tab_click
elif tab == "List": click_handler = on_list_tab_click
elif tab == "Charts": click_handler = on_charts_tab_click
elif tab == "Compare": click_handler = on_compare_tab_click
buttons.append(me.ButtonToggleButton(label="Compare", value="Compare"))

me.button(
tab_text,
on_click=click_handler,
style=me.Style(
padding=me.Padding.symmetric(vertical="6px", horizontal="12px"),
background="#1e293b" if is_active else "#f1f5f9",
color="#ffffff" if is_active else "#475569",
border_radius="4px",
cursor="pointer",
font_weight="600" if is_active else "500",
font_size="14px",
margin=me.Margin(right="8px")
),
)
me.button_toggle(
value=state.selected_main_tab,
buttons=buttons,
on_change=on_main_tab_change,
)

if state.selected_main_tab == "List":
try:
Expand Down
Loading
Loading