From 87516aa4f98025bde768128a4d5ef477d0a51997 Mon Sep 17 00:00:00 2001 From: Mohammed Taha Khan Date: Mon, 20 Apr 2026 11:49:55 +0200 Subject: [PATCH] fix: improve harvester reports access and log downloads --- invenio.cfg | 4 +- .../administration/harvester_reports.py | 7 +- .../harvesterReports/DownloadButton.js | 20 ++- .../harvesterReports/SearchBar.js | 9 +- site/cds_rdm/generators.py | 13 +- .../harvester_download/resources/resource.py | 155 ++++++++++++++---- site/cds_rdm/permissions.py | 9 + .../invenio_jobs/emails/run_notification.html | 10 +- .../invenio_jobs/emails/run_notification.txt | 2 - site/tests/test_permissions.py | 23 ++- 10 files changed, 190 insertions(+), 62 deletions(-) diff --git a/invenio.cfg b/invenio.cfg index 47994fe6..ba92aa8f 100644 --- a/invenio.cfg +++ b/invenio.cfg @@ -19,7 +19,8 @@ from cds_rdm.permissions import ( CDSRequestsPermissionPolicy, CDSRDMPreservationSyncPermissionPolicy, lock_edit_record_published_files, - CDSAuditLogPermissionPolicy + CDSAuditLogPermissionPolicy, + CDSJobLogsPermissionPolicy, ) from cds_rdm.files import storage_factory from cds_rdm.inspire_harvester.reader import InspireHTTPReader @@ -774,3 +775,4 @@ AUDIT_LOGS_ENABLED = True RDM_ALLOW_OWNERS_REMOVE_COMMUNITY_FROM_RECORD = False AUDIT_LOGS_PERMISSION_POLICY = CDSAuditLogPermissionPolicy +APP_LOGS_PERMISSION_POLICY = CDSJobLogsPermissionPolicy diff --git a/site/cds_rdm/administration/harvester_reports.py b/site/cds_rdm/administration/harvester_reports.py index 1bcd6b93..e78a25fd 100644 --- a/site/cds_rdm/administration/harvester_reports.py +++ b/site/cds_rdm/administration/harvester_reports.py @@ -125,5 +125,8 @@ def init_search_config(self, **kwargs): headers=self.get_search_request_headers(**kwargs), pagination_options=(20, 50), default_size=20, - hidden_params=[["action", "record.publish"]], - ) \ No newline at end of file + hidden_params=[ + ["action", "record.publish"], + ["user_id", "system"], + ], + ) diff --git a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/DownloadButton.js b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/DownloadButton.js index 63f11ba3..8ca4c67a 100644 --- a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/DownloadButton.js +++ b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/DownloadButton.js @@ -8,18 +8,21 @@ import React from "react"; import { withState } from "react-searchkit"; import { Button, Icon } from "semantic-ui-react"; import { i18next } from "@translations/invenio_administration/i18next"; +import { extractRunIdFromQuery } from "./utils"; const DownloadButtonComponent = ({ currentQueryState }) => { - const handleDownload = () => { - const query = currentQueryState.queryString || ""; - const hiddenParams = currentQueryState.hiddenParams || []; + const domContainer = document.getElementById("invenio-search-config"); + const runs = JSON.parse(domContainer?.dataset.harvesterRuns || "[]"); - const params = new URLSearchParams(); - if (query) params.set("q", query); - hiddenParams.forEach(([key, value]) => params.append(key, value)); + const runId = extractRunIdFromQuery( + currentQueryState.queryString || "", + runs + ); - const downloadUrl = `/harvester-reports/download?${params.toString()}`; - window.location.href = downloadUrl; + const handleDownload = () => { + if (!runId) return; + const params = new URLSearchParams({ run_id: runId }); + window.location.href = `/harvester-reports/download?${params.toString()}`; }; return ( @@ -27,6 +30,7 @@ const DownloadButtonComponent = ({ currentQueryState }) => { icon labelPosition="left" onClick={handleDownload} + disabled={!runId} className="harvester-download-button" size="small" > diff --git a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js index feadb6ff..ad37da32 100644 --- a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js +++ b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js @@ -16,6 +16,11 @@ import { DownloadButton } from "./DownloadButton"; * Custom SearchBar component with run selector */ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => { + const hiddenParams = [ + ["action", "record.publish"], + ["user_id", "system"], + ]; + // Get runs from data attributes const domContainer = document.getElementById("invenio-search-config"); const runs = JSON.parse(domContainer?.dataset.harvesterRuns || "[]"); @@ -48,7 +53,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => { updateQueryState({ ...currentQueryState, queryString, - hiddenParams: [["action", "record.publish"]], + hiddenParams, }); }; @@ -61,7 +66,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => { updateQueryState({ ...currentQueryState, queryString: inputValue, - hiddenParams: [["action", "record.publish"]], + hiddenParams, }); }; diff --git a/site/cds_rdm/generators.py b/site/cds_rdm/generators.py index f027c31a..813b521a 100644 --- a/site/cds_rdm/generators.py +++ b/site/cds_rdm/generators.py @@ -117,10 +117,15 @@ def needs(self, **kwargs): return [harvester_admin_access_action] def query_filter(self, identity=None, **kwargs): - """Restrict harvester curators to system user audit logs only.""" - for need in identity.provides: - if need == harvester_admin_access_action: - return dsl.Q("term", **{"user.id": "system"}) + """Restrict harvester curators to system-user ``record.publish`` audit logs.""" + if identity and Permission(harvester_admin_access_action).allows(identity): + return dsl.Q( + "bool", + must=[ + dsl.Q("term", **{"user.id": "system"}), + dsl.Q("term", action="record.publish"), + ], + ) return [] diff --git a/site/cds_rdm/harvester_download/resources/resource.py b/site/cds_rdm/harvester_download/resources/resource.py index 2a5ecdee..52f30fbb 100644 --- a/site/cds_rdm/harvester_download/resources/resource.py +++ b/site/cds_rdm/harvester_download/resources/resource.py @@ -7,14 +7,20 @@ """Harvester download resource.""" +import re +import uuid from datetime import datetime -from flask import Response, g, request, stream_with_context +from flask import Response, current_app, request from flask_resources import Resource, route -from invenio_audit_logs.proxies import current_audit_logs_service +from invenio_access.permissions import system_identity +from invenio_jobs.models import Run +from invenio_jobs.proxies import current_jobs_logs_service from cds_rdm.administration.permissions import curators_permission +INSPIRE_HARVESTER_TASK = "process_inspire" + class HarvesterDownloadResource(Resource): """Harvester download resource.""" @@ -27,44 +33,129 @@ def create_url_rules(self): ] def download(self): - """Download audit logs for harvester reports as plain text file.""" + """Download a harvester run's logs as a plain-text ``.log`` file. + + Mirrors the admin job-run page: status header, failure banner, + truncation warning, and task-grouped entries formatted as + ``[yyyy-MM-dd HH:mm] LEVEL message``. + """ permission = curators_permission if not permission.can(): return {"message": "Permission denied"}, 403 - query = request.args.get("q", "") - action = request.args.get("action", "") - - if not query: - return {"message": "No query provided"}, 400 - - params = {"q": query, "size": 1000} - if action: - params["action"] = action - - result = current_audit_logs_service.search( - identity=g.identity, - params=params, - ) - - def generate_logs(): - """Generate log lines one by one.""" - for hit in result.hits: - timestamp = hit.get("created", "N/A") - action = hit.get("action", "N/A") - resource_type = hit.get("resource", {}).get("type", "N/A") - resource_id = hit.get("resource", {}).get("id", "N/A") - user_email = hit.get("user", {}).get("email", "N/A") - - # Format: [timestamp] action resource_type/resource_id user - line = f"[{timestamp}] {action} {resource_type}/{resource_id} {user_email}\n" - yield line + run_id = request.args.get("run_id", "").strip() + if not run_id: + return {"message": "Missing run_id"}, 400 + try: + uuid.UUID(run_id) + except ValueError: + return {"message": "Invalid run_id"}, 400 + + run = Run.query.filter_by(id=run_id, parent_run_id=None).one_or_none() + if not run: + return {"message": "Run not found"}, 404 + + if not run.job or run.job.task != INSPIRE_HARVESTER_TASK: + return {"message": "Run is not a harvester run"}, 404 + + + max_results = current_app.config.get("JOBS_LOGS_MAX_RESULTS", 2000) + try: + result = current_jobs_logs_service.search( + system_identity, + params={"q": str(run.id), "sort": "timestamp"}, + ) + hits = list(result.hits) + total = result.total or len(hits) + except Exception: + current_app.logger.exception( + "Failed to fetch structured job logs for harvester run %s", run.id + ) + hits = [] + total = 0 + + def _format_timestamp(raw): + # Admin UI (RunsLogs.js) format. + if not raw: + return "N/A" + try: + return datetime.fromisoformat( + raw.replace("Z", "+00:00") + ).strftime("%Y-%m-%d %H:%M") + except (ValueError, TypeError): + return raw + + # Group by context.task_id in first-seen order (RunsLogs.js buildLogTree). + task_groups = {} + seen = set() + error_count = 0 + warning_count = 0 + for hit in hits: + raw_ts = hit.get("timestamp") + level = hit.get("level", "INFO") + # Collapse whitespace so multi-line errors render on one line + # (admin UI does the same via ``white-space: normal``). + message = re.sub(r"\s+", " ", (hit.get("message") or "")).strip() + key = (raw_ts, level, message) + if key in seen: + continue + seen.add(key) + if level == "ERROR": + error_count += 1 + elif level == "WARNING": + warning_count += 1 + task_id = (hit.get("context") or {}).get("task_id") or "unknown" + task_groups.setdefault(task_id, []).append( + f"[{_format_timestamp(raw_ts)}] {level} {message}" + ) + + lines = [line for group in task_groups.values() for line in group] + + header = [] + status = getattr(run.status, "name", str(run.status)) + header.append(f"Status: {status}") + header.append(f"Started: {_format_timestamp(run.started_at.isoformat())}") + if run.finished_at: + header.append( + f"Finished: {_format_timestamp(run.finished_at.isoformat())}" + ) + + summary = [] + if status in ("FAILED", "PARTIAL_SUCCESS", "SUCCESS"): + summary.append( + { + "FAILED": "Job failed", + "PARTIAL_SUCCESS": "Job partially succeeded", + "SUCCESS": "Job completed successfully", + }[status] + ) + if run.message: + summary.append(run.message) + if error_count: + summary.append(f"{error_count} error(s) found in logs below") + if warning_count: + summary.append(f"{warning_count} warning(s) found in logs below") + if summary: + header.append("") + header.extend(summary) + + if total and total > len(lines): + header.append( + f"Showing first {len(lines)} of {total} log entries " + f"(truncated at JOBS_LOGS_MAX_RESULTS={max_results})." + ) + header.append("=" * 80) + + logs = "\n".join(header + lines) + + if not lines: + logs += "\n" + (run.message or "No logs available for this run.\n") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"harvester_logs_{timestamp}.txt" + filename = f"harvester_logs_{run.id}_{timestamp}.log" return Response( - stream_with_context(generate_logs()), + logs, mimetype="text/plain", headers={"Content-Disposition": f'attachment; filename="{filename}"'}, ) diff --git a/site/cds_rdm/permissions.py b/site/cds_rdm/permissions.py index bfd0cd1e..33127df3 100644 --- a/site/cds_rdm/permissions.py +++ b/site/cds_rdm/permissions.py @@ -12,6 +12,7 @@ from invenio_administration.permissions import administration_permission from invenio_audit_logs.services.permissions import AuditLogPermissionPolicy from invenio_communities.permissions import CommunityPermissionPolicy +from invenio_jobs.services.permissions import JobLogsPermissionPolicy from invenio_preservation_sync.services.permissions import ( DefaultPreservationInfoPermissionPolicy, ) @@ -71,6 +72,9 @@ class CDSRDMRecordPermissionPolicy(RDMRecordPermissionPolicy): can_create = [AuthenticatedRegularUser(), SystemProcess()] can_read = RDMRecordPermissionPolicy.can_read + [ArchiverRead()] can_search = RDMRecordPermissionPolicy.can_search + [ArchiverRead()] + can_search_revisions = RDMRecordPermissionPolicy.can_search_revisions + [ + HarvesterCurator() + ] can_read_files = RDMRecordPermissionPolicy.can_read_files + [ArchiverRead()] can_get_content_files = RDMRecordPermissionPolicy.can_get_content_files + [ ArchiverRead() @@ -111,6 +115,11 @@ class CDSAuditLogPermissionPolicy(AuditLogPermissionPolicy): can_read = AuditLogPermissionPolicy.can_read + [HarvesterCurator()] +class CDSJobLogsPermissionPolicy(JobLogsPermissionPolicy): + """Job logs permission policy for CDS.""" + + can_read = JobLogsPermissionPolicy.can_search + class CDSRequestsPermissionPolicy(RDMRequestsPermissionPolicy): """Requests permission policy.""" diff --git a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html index 75726324..dcb53f11 100644 --- a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html +++ b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html @@ -31,14 +31,6 @@

Summary

{% endif %} - -
- - View Full Details - -
- {% if job.task == "process_inspire" %} {% set start_time = run.started_at | string | replace(' ', 'T') %} @@ -122,4 +114,4 @@

Harvester Actions

{% endif %} -{% endblock content %} \ No newline at end of file +{% endblock content %} diff --git a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt index 7db59b8a..07f278f8 100644 --- a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt +++ b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt @@ -12,8 +12,6 @@ {{ status_info.action }} -View full details: {{ run_url }} - {% if run.status.name == "PARTIAL_SUCCESS" and run.errored_entries and run.total_entries %} Summary: - Successfully processed: {{ success_count }} items diff --git a/site/tests/test_permissions.py b/site/tests/test_permissions.py index ba62749a..3e10f612 100644 --- a/site/tests/test_permissions.py +++ b/site/tests/test_permissions.py @@ -13,6 +13,7 @@ from invenio_rdm_records.records.api import RDMDraft, RDMParent, RDMRecord from invenio_records_resources.services.errors import PermissionDeniedError +from cds_rdm import generators from cds_rdm.administration.permissions import harvester_admin_access_action from cds_rdm.generators import HarvesterCurator @@ -44,12 +45,30 @@ def test_archiver_permissions( @pytest.mark.parametrize( ("provides", "expected_filter"), [ - ({harvester_admin_access_action}, {"term": {"user.id": "system"}}), + ( + {harvester_admin_access_action}, + { + "bool": { + "must": [ + {"term": {"user.id": "system"}}, + {"term": {"action": "record.publish"}}, + ] + } + }, + ), (set(), []), ], ) -def test_harvester_curator_permissions(provides, expected_filter): +def test_harvester_curator_permissions(monkeypatch, provides, expected_filter): """Harvester permissions use the action need and filter system logs only.""" + monkeypatch.setattr( + generators, + "Permission", + lambda *_: SimpleNamespace( + allows=lambda i: harvester_admin_access_action in i.provides + ), + ) + assert HarvesterCurator().needs() == [harvester_admin_access_action] identity = SimpleNamespace(provides=provides)