-
Notifications
You must be signed in to change notification settings - Fork 25
fix: improve harvester reports access and log downloads #773
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,18 +7,25 @@ | |
|
|
||
| """Harvester download resource.""" | ||
|
|
||
| import re | ||
| from datetime import datetime | ||
|
|
||
| from flask import Response, g, request, stream_with_context | ||
| from flask import Response, current_app, request | ||
| from flask_resources import Resource, route | ||
| from invenio_audit_logs.proxies import current_audit_logs_service | ||
| from invenio_jobs.models import Run | ||
| from invenio_search import current_search_client | ||
| from invenio_search.utils import prefix_index | ||
|
|
||
| from cds_rdm.administration.permissions import curators_permission | ||
|
|
||
|
|
||
| class HarvesterDownloadResource(Resource): | ||
| """Harvester download resource.""" | ||
|
|
||
| TIMESTAMP_QUERY_PATTERN = re.compile( | ||
| r'@timestamp:\["?([^"\]]+)"?\s+TO\s+"?([^"\]]+|\*)"?\]' | ||
| ) | ||
|
|
||
| def create_url_rules(self): | ||
| """Create the URL rules for the download resource.""" | ||
| routes = self.config.routes | ||
|
|
@@ -27,44 +34,149 @@ def create_url_rules(self): | |
| ] | ||
|
|
||
| def download(self): | ||
| """Download audit logs for harvester reports as plain text file.""" | ||
| """Download a harvester run's logs as a plain-text ``.log`` file. | ||
|
|
||
| Mirrors the admin job-run page: status header, failure banner, | ||
| truncation warning, and task-grouped entries formatted as | ||
| ``[yyyy-MM-dd HH:mm] LEVEL message``. | ||
| """ | ||
| permission = curators_permission | ||
| if not permission.can(): | ||
| return {"message": "Permission denied"}, 403 | ||
|
|
||
| query = request.args.get("q", "") | ||
| action = request.args.get("action", "") | ||
|
|
||
| if not query: | ||
| return {"message": "No query provided"}, 400 | ||
|
|
||
| params = {"q": query, "size": 1000} | ||
| if action: | ||
| params["action"] = action | ||
|
|
||
| result = current_audit_logs_service.search( | ||
| identity=g.identity, | ||
| params=params, | ||
| ) | ||
|
|
||
| def generate_logs(): | ||
| """Generate log lines one by one.""" | ||
| for hit in result.hits: | ||
| timestamp = hit.get("created", "N/A") | ||
| action = hit.get("action", "N/A") | ||
| resource_type = hit.get("resource", {}).get("type", "N/A") | ||
| resource_id = hit.get("resource", {}).get("id", "N/A") | ||
| user_email = hit.get("user", {}).get("email", "N/A") | ||
|
|
||
| # Format: [timestamp] action resource_type/resource_id user | ||
| line = f"[{timestamp}] {action} {resource_type}/{resource_id} {user_email}\n" | ||
| yield line | ||
| timestamp_match = self.TIMESTAMP_QUERY_PATTERN.search(query) | ||
| if not timestamp_match: | ||
| return {"message": "Invalid harvester run query"}, 400 | ||
|
|
||
| start_time, end_time = timestamp_match.groups() | ||
| try: | ||
| started_at = datetime.fromisoformat(start_time) | ||
| except ValueError: | ||
| return {"message": "Invalid harvester run query"}, 400 | ||
|
|
||
| run_query = Run.query.filter_by(started_at=started_at, parent_run_id=None) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure I understand the intention correctly - are you selecting the run based only on the started at time? if yes - I would make it more precise - it needs to be a run only of the specific harvester type of task (not a job - because a job is an instance of celery task, and if it is not created on the environment, then it will not exist) also, after additional thought... wouldn't it be better to rely on the run identifier rather than on its timestamp? identifier would be more reliable, no? |
||
| if end_time != "*": | ||
| try: | ||
| finished_at = datetime.fromisoformat(end_time) | ||
| except ValueError: | ||
| return {"message": "Invalid harvester run query"}, 400 | ||
| run_query = run_query.filter_by(finished_at=finished_at) | ||
| else: | ||
| run_query = run_query.filter_by(finished_at=None) | ||
|
|
||
| run = run_query.one_or_none() | ||
| if not run: | ||
| return {"message": "Run not found"}, 404 | ||
|
|
||
| # Query the job-logs index directly: the stock JobLogsPermissionPolicy | ||
| # has ``can_read = [Disable()]``, which blocks every non-superuser | ||
| # identity from reading logs via ``current_jobs_logs_service``. The | ||
| # caller is already gated by ``curators_permission.can()`` above. | ||
| full_index_name = prefix_index(current_app.config["JOBS_LOGGING_INDEX"]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should use the service layer here.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree |
||
| max_results = current_app.config.get("JOBS_LOGS_MAX_RESULTS", 2000) | ||
| search_query = { | ||
| "query": { | ||
| "bool": { | ||
| "filter": [ | ||
| {"term": {"context.run_id": str(run.id)}}, | ||
| {"term": {"context.job_id": str(run.job_id)}}, | ||
| ] | ||
| } | ||
| }, | ||
| "sort": [ | ||
| {"@timestamp": {"order": "asc"}}, | ||
| {"_id": {"order": "asc"}}, | ||
| ], | ||
| "size": max_results, | ||
| "track_total_hits": True, | ||
| } | ||
|
|
||
| try: | ||
| response = current_search_client.search( | ||
| index=full_index_name, body=search_query | ||
| ) | ||
| except Exception: | ||
| current_app.logger.exception( | ||
| "Failed to fetch structured job logs for harvester run %s", run.id | ||
| ) | ||
| response = {} | ||
|
|
||
| hits = response.get("hits", {}).get("hits", []) | ||
| total = response.get("hits", {}).get("total", {}).get("value", len(hits)) | ||
|
|
||
| def _format_timestamp(raw): | ||
| # Admin UI (RunsLogs.js) format. | ||
| if not raw: | ||
| return "N/A" | ||
| try: | ||
| return datetime.fromisoformat( | ||
| raw.replace("Z", "+00:00") | ||
| ).strftime("%Y-%m-%d %H:%M") | ||
| except (ValueError, TypeError): | ||
| return raw | ||
|
|
||
| # Group by context.task_id in first-seen order (RunsLogs.js buildLogTree). | ||
| task_groups = {} | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wasn't this grouping already done somewhere in the code? I am having flashbacks :) |
||
| seen = set() | ||
| error_count = 0 | ||
| for hit in hits: | ||
| src = hit.get("_source", {}) | ||
| raw_ts = src.get("@timestamp") | ||
| level = src.get("level", "INFO") | ||
| # Collapse whitespace so multi-line errors render on one line | ||
| # (admin UI does the same via ``white-space: normal``). | ||
| message = re.sub(r"\s+", " ", (src.get("message") or "")).strip() | ||
| key = (raw_ts, level, message) | ||
| if key in seen: | ||
| continue | ||
| seen.add(key) | ||
| if level == "ERROR": | ||
| error_count += 1 | ||
| task_id = (src.get("context") or {}).get("task_id") or "unknown" | ||
| task_groups.setdefault(task_id, []).append( | ||
| f"[{_format_timestamp(raw_ts)}] {level} {message}" | ||
| ) | ||
|
|
||
| lines = [line for group in task_groups.values() for line in group] | ||
|
|
||
| # Admin UI header: run metadata + FAILED/PARTIAL_SUCCESS banner. | ||
| header = [] | ||
| status = getattr(run.status, "name", str(run.status)) | ||
| header.append(f"Status: {status}") | ||
| header.append(f"Started: {_format_timestamp(run.started_at.isoformat())}") | ||
| if run.finished_at: | ||
| header.append( | ||
| f"Finished: {_format_timestamp(run.finished_at.isoformat())}" | ||
| ) | ||
| if status in ("FAILED", "PARTIAL_SUCCESS"): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would include the success ones too, because it might contain warnings (or debug or info, depending on the settings), even if it was successful run |
||
| banner = "Job failed" if status == "FAILED" else "Job partially succeeded" | ||
| header.append("") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How does the end result look like? Can you attach an example file?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| header.append(banner) | ||
| if run.message: | ||
| header.append(run.message) | ||
| if error_count: | ||
| header.append(f"{error_count} error(s) found in logs below") | ||
| if total and total > len(lines): | ||
| header.append( | ||
| f"Showing first {len(lines)} of {total} log entries " | ||
| f"(truncated at JOBS_LOGS_MAX_RESULTS={max_results})." | ||
| ) | ||
| header.append("=" * 80) | ||
|
|
||
| logs = "\n".join(header + lines) | ||
|
|
||
| if not lines: | ||
| logs += "\n" + (run.message or "No logs available for this run.\n") | ||
|
|
||
| timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | ||
| filename = f"harvester_logs_{timestamp}.txt" | ||
| filename = f"harvester_logs_{run.id}_{timestamp}.log" | ||
|
|
||
| return Response( | ||
| stream_with_context(generate_logs()), | ||
| logs, | ||
| mimetype="text/plain", | ||
| headers={"Content-Disposition": f'attachment; filename="{filename}"'}, | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -71,6 +71,9 @@ class CDSRDMRecordPermissionPolicy(RDMRecordPermissionPolicy): | |
| can_create = [AuthenticatedRegularUser(), SystemProcess()] | ||
| can_read = RDMRecordPermissionPolicy.can_read + [ArchiverRead()] | ||
| can_search = RDMRecordPermissionPolicy.can_search + [ArchiverRead()] | ||
| can_search_revisions = RDMRecordPermissionPolicy.can_search_revisions + [ | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this needed?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So curators can use the "View Changes" button on the Harvester Reports page.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you make sure they can only see revisions made by system user? |
||
| HarvesterCurator() | ||
| ] | ||
| can_read_files = RDMRecordPermissionPolicy.can_read_files + [ArchiverRead()] | ||
| can_get_content_files = RDMRecordPermissionPolicy.can_get_content_files + [ | ||
| ArchiverRead() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
did you check by any chance what does the REST API return for audit logs? we need to make sure REST API does not return wrong entries for curators role (on audit logs endpoint directly, not on this admin one)