diff --git a/site/cds_rdm/administration/harvester_reports.py b/site/cds_rdm/administration/harvester_reports.py index 1bcd6b93..e78a25fd 100644 --- a/site/cds_rdm/administration/harvester_reports.py +++ b/site/cds_rdm/administration/harvester_reports.py @@ -125,5 +125,8 @@ def init_search_config(self, **kwargs): headers=self.get_search_request_headers(**kwargs), pagination_options=(20, 50), default_size=20, - hidden_params=[["action", "record.publish"]], - ) \ No newline at end of file + hidden_params=[ + ["action", "record.publish"], + ["user_id", "system"], + ], + ) diff --git a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js index feadb6ff..ad37da32 100644 --- a/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js +++ b/site/cds_rdm/assets/semantic-ui/js/cds_rdm/administration/harvesterReports/SearchBar.js @@ -16,6 +16,11 @@ import { DownloadButton } from "./DownloadButton"; * Custom SearchBar component with run selector */ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => { + const hiddenParams = [ + ["action", "record.publish"], + ["user_id", "system"], + ]; + // Get runs from data attributes const domContainer = document.getElementById("invenio-search-config"); const runs = JSON.parse(domContainer?.dataset.harvesterRuns || "[]"); @@ -48,7 +53,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => { updateQueryState({ ...currentQueryState, queryString, - hiddenParams: [["action", "record.publish"]], + hiddenParams, }); }; @@ -61,7 +66,7 @@ const SearchBarComponent = ({ updateQueryState, currentQueryState }) => { updateQueryState({ ...currentQueryState, queryString: inputValue, - hiddenParams: [["action", "record.publish"]], + hiddenParams, }); }; diff --git a/site/cds_rdm/harvester_download/resources/resource.py b/site/cds_rdm/harvester_download/resources/resource.py index 2a5ecdee..5979ed97 100644 --- a/site/cds_rdm/harvester_download/resources/resource.py +++ b/site/cds_rdm/harvester_download/resources/resource.py @@ -7,11 +7,14 @@ """Harvester download resource.""" +import re from datetime import datetime -from flask import Response, g, request, stream_with_context +from flask import Response, current_app, request from flask_resources import Resource, route -from invenio_audit_logs.proxies import current_audit_logs_service +from invenio_jobs.models import Run +from invenio_search import current_search_client +from invenio_search.utils import prefix_index from cds_rdm.administration.permissions import curators_permission @@ -19,6 +22,10 @@ class HarvesterDownloadResource(Resource): """Harvester download resource.""" + TIMESTAMP_QUERY_PATTERN = re.compile( + r'@timestamp:\["?([^"\]]+)"?\s+TO\s+"?([^"\]]+|\*)"?\]' + ) + def create_url_rules(self): """Create the URL rules for the download resource.""" routes = self.config.routes @@ -27,44 +34,149 @@ def create_url_rules(self): ] def download(self): - """Download audit logs for harvester reports as plain text file.""" + """Download a harvester run's logs as a plain-text ``.log`` file. + + Mirrors the admin job-run page: status header, failure banner, + truncation warning, and task-grouped entries formatted as + ``[yyyy-MM-dd HH:mm] LEVEL message``. + """ permission = curators_permission if not permission.can(): return {"message": "Permission denied"}, 403 query = request.args.get("q", "") - action = request.args.get("action", "") - if not query: return {"message": "No query provided"}, 400 - params = {"q": query, "size": 1000} - if action: - params["action"] = action - - result = current_audit_logs_service.search( - identity=g.identity, - params=params, - ) - - def generate_logs(): - """Generate log lines one by one.""" - for hit in result.hits: - timestamp = hit.get("created", "N/A") - action = hit.get("action", "N/A") - resource_type = hit.get("resource", {}).get("type", "N/A") - resource_id = hit.get("resource", {}).get("id", "N/A") - user_email = hit.get("user", {}).get("email", "N/A") - - # Format: [timestamp] action resource_type/resource_id user - line = f"[{timestamp}] {action} {resource_type}/{resource_id} {user_email}\n" - yield line + timestamp_match = self.TIMESTAMP_QUERY_PATTERN.search(query) + if not timestamp_match: + return {"message": "Invalid harvester run query"}, 400 + + start_time, end_time = timestamp_match.groups() + try: + started_at = datetime.fromisoformat(start_time) + except ValueError: + return {"message": "Invalid harvester run query"}, 400 + + run_query = Run.query.filter_by(started_at=started_at, parent_run_id=None) + if end_time != "*": + try: + finished_at = datetime.fromisoformat(end_time) + except ValueError: + return {"message": "Invalid harvester run query"}, 400 + run_query = run_query.filter_by(finished_at=finished_at) + else: + run_query = run_query.filter_by(finished_at=None) + + run = run_query.one_or_none() + if not run: + return {"message": "Run not found"}, 404 + + # Query the job-logs index directly: the stock JobLogsPermissionPolicy + # has ``can_read = [Disable()]``, which blocks every non-superuser + # identity from reading logs via ``current_jobs_logs_service``. The + # caller is already gated by ``curators_permission.can()`` above. + full_index_name = prefix_index(current_app.config["JOBS_LOGGING_INDEX"]) + max_results = current_app.config.get("JOBS_LOGS_MAX_RESULTS", 2000) + search_query = { + "query": { + "bool": { + "filter": [ + {"term": {"context.run_id": str(run.id)}}, + {"term": {"context.job_id": str(run.job_id)}}, + ] + } + }, + "sort": [ + {"@timestamp": {"order": "asc"}}, + {"_id": {"order": "asc"}}, + ], + "size": max_results, + "track_total_hits": True, + } + + try: + response = current_search_client.search( + index=full_index_name, body=search_query + ) + except Exception: + current_app.logger.exception( + "Failed to fetch structured job logs for harvester run %s", run.id + ) + response = {} + + hits = response.get("hits", {}).get("hits", []) + total = response.get("hits", {}).get("total", {}).get("value", len(hits)) + + def _format_timestamp(raw): + # Admin UI (RunsLogs.js) format. + if not raw: + return "N/A" + try: + return datetime.fromisoformat( + raw.replace("Z", "+00:00") + ).strftime("%Y-%m-%d %H:%M") + except (ValueError, TypeError): + return raw + + # Group by context.task_id in first-seen order (RunsLogs.js buildLogTree). + task_groups = {} + seen = set() + error_count = 0 + for hit in hits: + src = hit.get("_source", {}) + raw_ts = src.get("@timestamp") + level = src.get("level", "INFO") + # Collapse whitespace so multi-line errors render on one line + # (admin UI does the same via ``white-space: normal``). + message = re.sub(r"\s+", " ", (src.get("message") or "")).strip() + key = (raw_ts, level, message) + if key in seen: + continue + seen.add(key) + if level == "ERROR": + error_count += 1 + task_id = (src.get("context") or {}).get("task_id") or "unknown" + task_groups.setdefault(task_id, []).append( + f"[{_format_timestamp(raw_ts)}] {level} {message}" + ) + + lines = [line for group in task_groups.values() for line in group] + + # Admin UI header: run metadata + FAILED/PARTIAL_SUCCESS banner. + header = [] + status = getattr(run.status, "name", str(run.status)) + header.append(f"Status: {status}") + header.append(f"Started: {_format_timestamp(run.started_at.isoformat())}") + if run.finished_at: + header.append( + f"Finished: {_format_timestamp(run.finished_at.isoformat())}" + ) + if status in ("FAILED", "PARTIAL_SUCCESS"): + banner = "Job failed" if status == "FAILED" else "Job partially succeeded" + header.append("") + header.append(banner) + if run.message: + header.append(run.message) + if error_count: + header.append(f"{error_count} error(s) found in logs below") + if total and total > len(lines): + header.append( + f"Showing first {len(lines)} of {total} log entries " + f"(truncated at JOBS_LOGS_MAX_RESULTS={max_results})." + ) + header.append("=" * 80) + + logs = "\n".join(header + lines) + + if not lines: + logs += "\n" + (run.message or "No logs available for this run.\n") timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"harvester_logs_{timestamp}.txt" + filename = f"harvester_logs_{run.id}_{timestamp}.log" return Response( - stream_with_context(generate_logs()), + logs, mimetype="text/plain", headers={"Content-Disposition": f'attachment; filename="{filename}"'}, ) diff --git a/site/cds_rdm/permissions.py b/site/cds_rdm/permissions.py index bfd0cd1e..7c915fd7 100644 --- a/site/cds_rdm/permissions.py +++ b/site/cds_rdm/permissions.py @@ -71,6 +71,9 @@ class CDSRDMRecordPermissionPolicy(RDMRecordPermissionPolicy): can_create = [AuthenticatedRegularUser(), SystemProcess()] can_read = RDMRecordPermissionPolicy.can_read + [ArchiverRead()] can_search = RDMRecordPermissionPolicy.can_search + [ArchiverRead()] + can_search_revisions = RDMRecordPermissionPolicy.can_search_revisions + [ + HarvesterCurator() + ] can_read_files = RDMRecordPermissionPolicy.can_read_files + [ArchiverRead()] can_get_content_files = RDMRecordPermissionPolicy.can_get_content_files + [ ArchiverRead() diff --git a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html index 75726324..dcb53f11 100644 --- a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html +++ b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.html @@ -31,14 +31,6 @@

Summary

{% endif %} - -
- - View Full Details - -
- {% if job.task == "process_inspire" %} {% set start_time = run.started_at | string | replace(' ', 'T') %} @@ -122,4 +114,4 @@

Harvester Actions

{% endif %} -{% endblock content %} \ No newline at end of file +{% endblock content %} diff --git a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt index 7db59b8a..07f278f8 100644 --- a/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt +++ b/site/cds_rdm/templates/semantic-ui/invenio_jobs/emails/run_notification.txt @@ -12,8 +12,6 @@ {{ status_info.action }} -View full details: {{ run_url }} - {% if run.status.name == "PARTIAL_SUCCESS" and run.errored_entries and run.total_entries %} Summary: - Successfully processed: {{ success_count }} items