From 6ea5363c8839a2e23678a72eddbd3c836c97b76a Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 23 May 2026 22:32:53 -0500
Subject: [PATCH 1/3] feat: add timestamp for scraper content

---
 apps/api/src/services/inoreader/__init__.py   | 10 +++++
 .../database/mongodb/cache.py                 |  4 +-
 .../database/mongodb/models/metadata.py       |  1 +
 .../models/metadata_item.py                   | 34 +++++++++++----
 .../services/file_export/video_download.py    | 32 +++++++++++++-
 .../services/scrapers/bluesky/__init__.py     |  1 +
 .../services/scrapers/bluesky/scraper.py      | 14 ++++++
 .../services/scrapers/common.py               |  1 +
 .../services/scrapers/douban/__init__.py      |  6 ++-
 .../services/scrapers/general/__init__.py     |  1 +
 .../services/scrapers/general/base.py         | 13 +++++-
 .../services/scrapers/general/firecrawl.py    |  5 ++-
 .../services/scrapers/general/zyte.py         | 20 +++++++++
 .../services/scrapers/instagram/__init__.py   | 26 ++++++++++-
 .../services/scrapers/reddit/__init__.py      |  4 +-
 .../services/scrapers/threads/__init__.py     |  2 +
 .../services/scrapers/twitter/__init__.py     | 21 ++++++++-
 .../services/scrapers/wechat/__init__.py      |  3 ++
 .../services/scrapers/weibo/__init__.py       |  2 +-
 .../services/scrapers/weibo/scraper.py        | 20 ++++++++-
 .../services/scrapers/xiaohongshu/__init__.py | 21 +++++++--
 .../services/scrapers/zhihu/__init__.py       | 12 ++++++
 .../shared/fastfetchbot_shared/utils/parse.py |  6 ++-
 tests/unit/database/mongodb/test_cache.py     | 30 +++++++++++++
 tests/unit/scrapers/test_common_cache.py      |  6 +++
 tests/unit/scrapers/test_wechat.py            |  2 +
 tests/unit/test_published_timestamp.py        | 43 +++++++++++++++++++
 27 files changed, 315 insertions(+), 25 deletions(-)
 create mode 100644 tests/unit/test_published_timestamp.py

diff --git a/apps/api/src/services/inoreader/__init__.py b/apps/api/src/services/inoreader/__init__.py
index 2bbaa15..e072bc6 100644
--- a/apps/api/src/services/inoreader/__init__.py
+++ b/apps/api/src/services/inoreader/__init__.py
@@ -30,10 +30,13 @@ def __init__(self, url: str = None, data: dict = None, **kwargs):
             self.category = data.get("category", "")
             self.raw_content = data.get("content", "")
             self.content = self.raw_content
+            self.timestamp = _parse_inoreader_timestamp(data.get("timestamp"))
         if kwargs.get("category"):
             self.category = kwargs["category"]
         self.media_files = []
         self.message_type = MessageType.LONG
+        if not hasattr(self, "timestamp"):
+            self.timestamp = None
 
     def _from_data(self, data: dict):
         self.title = data.get("title", "")
@@ -43,6 +46,7 @@ def _from_data(self, data: dict):
         self.category = data.get("category", "")
         self.raw_content = data.get("content", "")
         self.content = self.raw_content
+        self.timestamp = _parse_inoreader_timestamp(data.get("timestamp"))
 
     async def get_item(self, api: bool = False) -> dict:
         if api:
@@ -161,3 +165,9 @@ async def get_api_info(
                 headers=headers,
             )
             return resp
+
+
+def _parse_inoreader_timestamp(timestamp: int | None) -> int | None:
+    """Return ``timestamp`` when it is a positive, non-bool int; otherwise ``None``."""
+    usable = isinstance(timestamp, int) and not isinstance(timestamp, bool)
+    return timestamp if usable and timestamp > 0 else None
diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py
index 3684332..20a01e9 100644
--- a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py
+++ b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py
@@ -72,7 +72,9 @@ async def save_metadata(metadata_item: dict) -> Metadata:
     new_version = (latest.version + 1) if latest else 1
     metadata_item["version"] = new_version
 
-    doc = Metadata.model_construct(**metadata_item)
+    document_data = dict(metadata_item)
+    document_data["published_timestamp"] = document_data.pop("timestamp", None)
+    doc = Metadata.model_construct(**document_data)
     await Metadata.insert(doc)
 
     logger.info(f"Saved metadata for {url} (version={new_version})")
diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py b/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py
index 358f87f..96ee0e8 100644
--- a/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py
+++ b/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py
@@ -34,6 +34,7 @@ class Metadata(Document):
     source: Optional[str] = None
     media_files: Optional[list[DatabaseMediaFile]] = None
     telegraph_url: Optional[str] = None
+    published_timestamp: Optional[int] = None
     timestamp: datetime = Field(default_factory=datetime.utcnow)
     scrape_status: bool = False
     version: int = Field(default=1, ge=1)
diff --git a/packages/shared/fastfetchbot_shared/models/metadata_item.py b/packages/shared/fastfetchbot_shared/models/metadata_item.py
index 4c37c1c..2613802 100644
--- a/packages/shared/fastfetchbot_shared/models/metadata_item.py
+++ b/packages/shared/fastfetchbot_shared/models/metadata_item.py
@@ -20,6 +20,12 @@ def from_str(x: Any) -> str:
     return x
 
 
+def from_optional_int(x: Any) -> Optional[int]:
+    """Pass ``x`` through only if it is a positive int (bools excluded), else ``None``."""
+    is_positive_int = isinstance(x, int) and not isinstance(x, bool) and x > 0
+    return x if is_positive_int else None
+
+
 def from_list(f: Callable[[Any], T], x: Any) -> List[T]:
     assert isinstance(x, list)
     return [f(y) for y in x]
@@ -75,6 +81,7 @@ class MetadataItem:
     author_url: Optional[str]
     category: str
     message_type: Optional[MessageType]
+    timestamp: Optional[int] = None
 
     @staticmethod
     def from_dict(obj: Any) -> "MetadataItem":
@@ -89,6 +96,7 @@ def from_dict(obj: Any) -> "MetadataItem":
         author_url = from_str(obj.get("author_url"))
         category = from_str(obj.get("category"))
         message_type = MessageType(obj.get("message_type"))
+        timestamp = from_optional_int(obj.get("timestamp"))
         return MetadataItem(
             url,
             telegraph_url,
@@ -100,21 +108,29 @@ def from_dict(obj: Any) -> "MetadataItem":
             author_url,
             category,
             message_type,
+            timestamp,
         )
 
     def to_dict(self) -> dict:
+        """Serialize to a plain dict; attributes that were never set use fallbacks."""
+        kind = getattr(self, "message_type", None)
+        # Enum members are flattened to ``.value``; any other object passes through.
+        kind_value = kind.value if isinstance(kind, MessageType) else kind
+        published = from_optional_int(getattr(self, "timestamp", None))
         result: dict = {
-            "url": from_str(self.url),
-            "telegraph_url": "", "content": from_str(self.content),
-            "text": from_str(self.text),
+            "url": from_str(getattr(self, "url", "")),
+            "telegraph_url": "",
+            "content": from_str(getattr(self, "content", "")),
+            "text": from_str(getattr(self, "text", "")),
             "media_files": from_list(
-                lambda x: to_class(MediaFile, x), self.media_files
+                lambda x: to_class(MediaFile, x), getattr(self, "media_files", [])
             ),
-            "author": from_str(self.author),
-            "title": from_str(self.title),
-            "author_url": from_str(self.author_url),
-            "category": from_str(self.category),
-            "message_type": self.message_type.value
+            "author": from_str(getattr(self, "author", "")),
+            "title": from_str(getattr(self, "title", "")),
+            "author_url": from_str(getattr(self, "author_url", "")),
+            "category": from_str(getattr(self, "category", "")),
+            "message_type": kind_value,
+            "timestamp": published,
         }
         return result
 
diff --git a/packages/shared/fastfetchbot_shared/services/file_export/video_download.py b/packages/shared/fastfetchbot_shared/services/file_export/video_download.py
index 5be1735..519a699 100644
--- a/packages/shared/fastfetchbot_shared/services/file_export/video_download.py
+++ b/packages/shared/fastfetchbot_shared/services/file_export/video_download.py
@@ -6,12 +6,18 @@
 """
 
 import asyncio
+import datetime
+import re
 from urllib.parse import urlparse, parse_qs
 
 import httpx
 
 from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile
-from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, second_to_time, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    second_to_time,
+    unix_timestamp_to_utc,
+    wrap_text_into_html,
+)
 from fastfetchbot_shared.utils.logger import logger
 from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV
 
@@ -58,6 +64,7 @@ def __init__(
         self.category = category
         self.media_files = []
         self.created = None
+        self.timestamp = None
         self.duration = None
         self.celery_app = celery_app
         self.timeout = timeout
@@ -219,6 +226,7 @@ def _video_info_formatting(self, meta_info: dict):
         if len(meta_info["description"]) > 800:
             meta_info["description"] = meta_info["description"][:800] + "..."
         self.created = meta_info["upload_date"]
+        self.timestamp = meta_info.get("timestamp")
         self.duration = meta_info["duration"]
         self.text = video_info_template.render(
             data={
@@ -250,6 +258,7 @@ def _youtube_info_parse(video_info: dict) -> dict:
             "playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u8bc4\u8bba\u6570\uff1a{video_info['comment_count']}",
             "author_avatar": video_info["thumbnail"],
             "upload_date": str(video_info["upload_date"]),
+            "timestamp": _parse_youtube_upload_date(video_info["upload_date"]),
             "duration": second_to_time(round(video_info["duration"])),
         }
 
@@ -266,5 +275,26 @@ def _bilibili_info_parse(video_info: dict) -> dict:
             "description": video_info["description"],
             "playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u5f39\u5e55\u6570\uff1a{video_info['comment_count']} \u70b9\u8d5e\u6570\uff1a{video_info['like_count']}",
             "upload_date": unix_timestamp_to_utc(video_info["timestamp"]),
+            "timestamp": _parse_bilibili_timestamp(video_info["timestamp"]),
             "duration": second_to_time(round(video_info["duration"])),
         }
+
+
+def _parse_youtube_upload_date(upload_date: str | int | None) -> int | None:
+    """Convert an eight-digit YYYYMMDD upload date to a UTC-midnight Unix timestamp."""
+    text = "" if upload_date is None else str(upload_date).strip()
+    if re.fullmatch(r"\d{8}", text) is None:
+        return None
+    try:
+        day = datetime.datetime.strptime(text, "%Y%m%d")
+    except ValueError:
+        # Eight digits that do not form a real calendar date (e.g. month 13).
+        return None
+    utc_midnight = day.replace(tzinfo=datetime.timezone.utc)
+    return int(utc_midnight.timestamp())
+
+
+def _parse_bilibili_timestamp(timestamp: int | None) -> int | None:
+    """Keep a positive integer Unix timestamp; map everything else to ``None``."""
+    rejected = isinstance(timestamp, bool) or not isinstance(timestamp, int)
+    return None if rejected or timestamp <= 0 else timestamp
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py
index 274d049..27f7987 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py
@@ -32,6 +32,7 @@ def from_dict(obj: Any) -> "Bluesky":
             media_files=bluesky_item.media_files,
             category=bluesky_item.category,
             message_type=bluesky_item.message_type,
+            timestamp=bluesky_item.timestamp,
             cid=bluesky_item.cid,
             author_did=bluesky_item.author_did,
         )
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py
index 8c3d163..0799fd5 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py
@@ -1,3 +1,4 @@
+import datetime
 from typing import Optional
 from urllib.parse import urlparse
 
@@ -119,6 +120,7 @@ async def _resolve_single_post_data(post_data: PostView) -> dict:
             "category": "bluesky",
             "media_files": [],
             "created_at": created_at,
+            "timestamp": _parse_bluesky_created_at(created_at),
             "author_did": author_did,
         }
 
@@ -190,3 +192,15 @@ async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost:
         except Exception as e:
             logger.error(f"Error while getting post data: {e}")
             raise
+
+
+def _parse_bluesky_created_at(created_at: str | None) -> int | None:
+    """Parse an ISO-8601 ``created_at`` into Unix seconds; None if missing or invalid."""
+    if not created_at:
+        return None
+    try:  # str()/strip() send non-str or padded input down the ValueError path
+        parsed = datetime.datetime.fromisoformat(str(created_at).strip().replace("Z", "+00:00"))
+    except (TypeError, ValueError):
+        return None
+    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=datetime.timezone.utc)
+    return int(aware.timestamp())
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/common.py b/packages/shared/fastfetchbot_shared/services/scrapers/common.py
index f1d3570..9c133a1 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/common.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/common.py
@@ -90,6 +90,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict:
                 if cached is not None:
                     logger.info("Cache hit, returning cached metadata")
                     result = cached.model_dump(mode="json", exclude={"id"})
+                    result["timestamp"] = result.pop("published_timestamp", None)
                     result["_cached"] = True
                     return result
             except Exception as e:
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py
index ee9a083..6b759c3 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py
@@ -6,7 +6,10 @@
 from bs4 import BeautifulSoup
 from lxml import etree
 
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 from fastfetchbot_shared.utils.network import get_selector, HEADERS
 from fastfetchbot_shared.utils.logger import logger
 from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType
@@ -57,6 +60,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
         self.text_group: Optional[str] = None
         self.raw_content: Optional[str] = None
         self.date: Optional[str] = None
+        self.timestamp: Optional[int] = None
         # reqeust fields
         self.headers = HEADERS
         self.headers["Cookie"] = kwargs.get("cookie", "")
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py
index f256512..46090cc 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py
@@ -27,6 +27,7 @@ def from_dict(obj: Any) -> "GeneralItem":
             media_files=metadata_item.media_files,
             category=metadata_item.category,
             message_type=metadata_item.message_type,
+            timestamp=metadata_item.timestamp,
             id=obj.get("id", ""),
             raw_content=obj.get("raw_content", ""),
             scraper_type=obj.get("scraper_type", ""),
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py
index 62817d1..d8eb84b 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py
@@ -11,7 +11,10 @@
 from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType
 from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor
 from fastfetchbot_shared.services.scrapers.general import GeneralItem
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 from fastfetchbot_shared.utils.logger import logger
 
 GENERAL_TEXT_LIMIT = 800
@@ -67,6 +70,7 @@ async def _build_item_data(
         markdown_content: str,
         html_content: str,
         og_image: Optional[str] = None,
+        timestamp: Optional[int] = None,
     ) -> None:
         """
         Common method to build item data from scraped content.
@@ -79,6 +83,13 @@ async def _build_item_data(
             "author": author or self.url_parser.netloc,
             "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
             "scraper_type": self.scraper_type,
+            "timestamp": (
+                timestamp
+                if not isinstance(timestamp, bool)
+                and isinstance(timestamp, int)
+                and timestamp > 0
+                else None
+            ),
         }
 
         # Process text content - use description or first part of markdown
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py
index c018daf..62e1fd9 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py
@@ -10,7 +10,10 @@
 from fastfetchbot_shared.services.scrapers.scraper import DataProcessor
 from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType
 from fastfetchbot_shared.utils.logger import logger
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 
 # HTML tags to exclude from Firecrawl output at the source
 FIRECRAWL_EXCLUDE_TAGS = [
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py
index d011d6f..0186373 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py
@@ -1,3 +1,5 @@
+import datetime
+
 from zyte_api import AsyncZyteAPI
 
 from fastfetchbot_shared.services.scrapers.config import settings
@@ -59,6 +61,11 @@ async def _process_zyte_result(self, result: dict) -> None:
         # Extract main image
         main_image = article.get("mainImage", {})
         og_image = main_image.get("url") if main_image else None
+        timestamp = _parse_zyte_date_published(
+            article.get("datePublished")
+            or article.get("date_published")
+            or article.get("publishedDate")
+        )
 
         await self._build_item_data(
             title=title,
@@ -67,6 +74,7 @@ async def _process_zyte_result(self, result: dict) -> None:
             markdown_content=markdown_content,
             html_content=html_content,
             og_image=og_image,
+            timestamp=timestamp,
         )
 
 
@@ -77,3 +85,15 @@ class ZyteScraper(BaseGeneralScraper):
 
     async def get_processor_by_url(self, url: str) -> DataProcessor:
         return ZyteDataProcessor(url)
+
+
+def _parse_zyte_date_published(value: str | None) -> int | None:
+    """Turn an ISO-8601 publication date into Unix seconds; None if absent or unparseable."""
+    iso_text = str(value).strip().replace("Z", "+00:00") if value else ""
+    try:
+        moment = datetime.datetime.fromisoformat(iso_text)
+    except ValueError:  # also covers the empty string used for missing values
+        return None
+    if moment.tzinfo is None:  # no offset given: interpret as UTC
+        moment = moment.replace(tzinfo=datetime.timezone.utc)
+    return int(moment.timestamp())
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py
index ae99e0c..52af9da 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py
@@ -7,7 +7,9 @@
 
 from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile
 from fastfetchbot_shared.utils.network import get_response
-from fastfetchbot_shared.utils.parse import get_html_text_length
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+)
 from fastfetchbot_shared.utils.logger import logger
 from .config import API_HEADERS_LIST, ALL_SCRAPERS
 from fastfetchbot_shared.services.scrapers.config import settings
@@ -23,6 +25,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
             "/", ""
         )
         self.message_type = MessageType.SHORT
+        self.timestamp = None
 
     async def get_item(self):
         await self.get_instagram()
@@ -115,6 +118,9 @@ def _get_ins_post_looter2(ins_data: dict) -> dict:
         )
         ins_info["content"] = ""
         ins_info["text"] = ins_text_data
+        ins_info["timestamp"] = _parse_instagram_timestamp(
+            ins_data.get("taken_at_timestamp") or ins_data.get("taken_at")
+        )
         ins_info["author"] = ins_data["owner"]["username"]
         if ins_data["owner"]["full_name"]:
             ins_info["author"] += "(" + ins_data["owner"]["full_name"] + ")"
@@ -190,6 +196,10 @@ def _get_ins_post_ins28_scraper2(ins_data):
         )
         ins_info["content"] = ""
         ins_info["text"] = ins_text_data
+        ins_info["timestamp"] = _parse_instagram_timestamp(
+            ins_data["items"][0].get("taken_at")
+            or ins_data["items"][0].get("taken_at_timestamp")
+        )
         ins_info["author"] = ins_data["items"][0]["user"]["username"]
         if ins_data["items"][0]["user"]["full_name"]:
             ins_info["author"] += "(" + ins_data["items"][0]["user"]["full_name"] + ")"
@@ -269,3 +279,17 @@ def _get_ins_post_ins28_scraper2(ins_data):
 
     async def _get_story_info(self):
         pass
+
+
+def _parse_instagram_timestamp(value: Any) -> int | None:
+    """Normalize an Instagram ``taken_at`` value to Unix seconds; None if unusable."""
+    if isinstance(value, bool):  # bool is an int subclass, never a timestamp
+        return None
+    try:
+        timestamp = int(value)
+    except (TypeError, ValueError, OverflowError):  # None, "", junk, inf/nan
+        return None
+    if timestamp <= 0:
+        return None
+    # Values above 1e10 are taken to be milliseconds and scaled to seconds.
+    return timestamp // 1000 if timestamp > 10_000_000_000 else timestamp
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py
index 9a907cb..98c804f 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py
@@ -19,6 +19,7 @@ def __init__(self, url, data: Optional[Any] = None, **kwargs):
         self.category = "reddit"
         self.media_files = []
         self.message_type = MessageType.LONG
+        self.timestamp = None
 
     async def get_item(self) -> dict:
         await self.get_reddit()
@@ -47,7 +48,8 @@ async def _process_reddit_data(self, reddit_data) -> None:
         self.author = reddit_data["author"].name
         self.author_url = f"https://www.reddit.com/user/{self.author}"
         self.raw_content = reddit_data["selftext_html"] or ""
-        self.created = unix_timestamp_to_utc(int(reddit_data["created_utc"]))
+        self.timestamp = int(reddit_data["created_utc"])
+        self.created = unix_timestamp_to_utc(self.timestamp)
         self.score = reddit_data["score"]
         self.comments_count = reddit_data["num_comments"]
         self.upvote_ratio = reddit_data["upvote_ratio"]
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py
index 69be60a..22b2ca7 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py
@@ -30,6 +30,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
         self.code = urlparse(url).path.split("/")[2]
         self.pics_url = []
         self.videos_url = []
+        self.timestamp = None
 
     async def get_item(self) -> dict:
         await self.get_threads()
@@ -115,6 +116,7 @@ def process_single_threads(self, thread: Dict) -> None:
             self.title = thread["username"] + "'s Threads"
             self.author = thread["username"]
             self.author_url = f"https://threads.net/@{thread['username']}"
+            self.timestamp = thread["published_on"]
             created_at = unix_timestamp_to_utc(thread["published_on"])
             reply_count = thread["reply_count"]
             like_count = thread["like_count"]
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py
index 4c5b280..b79af00 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py
@@ -1,5 +1,7 @@
 # TODO: https://rapidapi.com/Glavier/api/twitter135
 import asyncio
+import datetime
+from email.utils import parsedate_to_datetime
 from urllib.parse import urlparse
 from typing import Dict, List, Optional, Any, Tuple
 
@@ -7,7 +9,10 @@
 import jmespath
 
 from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError
 from twitter.scraper import Scraper
 from .config import (
@@ -40,6 +45,7 @@ def __init__(
         self.media_files: list[MediaFile] = []
         self.category = "twitter"
         self.message_type = MessageType.SHORT
+        self.timestamp = None
         # auxiliary fields
         self.tid = urlparse(url).path.split("/")[-1]
         self.text_group = ""
@@ -168,6 +174,7 @@ def process_single_tweet_Twitter135(self, tweet: Dict, retweeted=False) -> None:
             self.author = tweet["name"]
             self.author_url = f"https://twitter.com/{tweet['username']}"
             self.date = tweet["date"]
+            self.timestamp = _parse_twitter_created_at(tweet["date"])
         tweet_info = self.parse_single_tweet_Twitter135(tweet, retweeted=retweeted)
         self.text_group += tweet_info["text_group"]
         self.content_group += tweet_info["content_group"]
@@ -326,6 +333,18 @@ def _find_article_media_url(article: Dict, media_id: str) -> str:
     return ""
 
 
+def _parse_twitter_created_at(created_at: str | None) -> int | None:
+    """Convert a tweet date string to Unix seconds; None when it is empty or unparseable."""
+    if not created_at:
+        return None
+    try:
+        moment = parsedate_to_datetime(created_at)
+    except (TypeError, ValueError, IndexError):
+        return None
+    naive = moment.tzinfo is None  # no offset parsed: the value is taken as UTC
+    return int((moment.replace(tzinfo=datetime.timezone.utc) if naive else moment).timestamp())
+
+
 def _apply_inline_formatting(
         text: str,
         style_ranges: List[Dict],
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py
index 2951684..3f81eb3 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py
@@ -19,6 +19,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
         self.media_files: list[MediaFile] = []
         self.category = "wechat"
         self.message_type = MessageType.LONG
+        self.timestamp = None
         # auxiliary fields
         self.sid = ""
         self.official_account = ""
@@ -52,6 +53,8 @@ def _wechat_data_parse(wechat_data: etree.HTML) -> Dict:
             ),
         }
         for k, v in meta_data.items():
+            if v is None:
+                continue
             new_string = v.replace("\n", "")
             meta_data[k] = new_string.strip()
         return meta_data
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py
index c84494d..056816b 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py
@@ -44,6 +44,7 @@ def from_dict(obj: Any) -> "Weibo":
             media_files=weibo_item.media_files,
             category=weibo_item.category,
             message_type=weibo_item.message_type,
+            timestamp=weibo_item.timestamp,
             id=weibo_item.id,
         )
 
@@ -51,4 +52,3 @@ def to_dict(self) -> dict:
         result: dict = super().to_dict()
         result["id"] = self.id
         return result
-
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py
index acbf414..4913a8a 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py
@@ -1,4 +1,6 @@
 import json
+import datetime
+from email.utils import parsedate_to_datetime
 from typing import Optional, Any, Union
 from urllib.parse import urlparse
 
@@ -12,7 +14,10 @@
 from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor
 from fastfetchbot_shared.services.scrapers.weibo import Weibo
 from fastfetchbot_shared.utils.network import get_response_json, get_random_user_agent
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 from .config import (
     AJAX_HOST,
     AJAX_LONGTEXT_HOST,
@@ -153,6 +158,7 @@ async def _process_weibo_item(self, weibo_info: dict) -> None:
             "author_url": weibo_info.get("author_url"),
             "title": weibo_info.get("author") + "的微博",
             "date": weibo_info.get("created", None),
+            "timestamp": _parse_weibo_created_at(weibo_info.get("created")),
             "source": weibo_info.get("source", None),
             "region_name": weibo_info.get("region_name", None),
             "attitudes_count": self._string_to_int(weibo_info.get("attitudes_count", 0)),
@@ -523,3 +529,15 @@ class WeiboScraper(Scraper):
 
     async def get_processor_by_url(self, url) -> DataProcessor:
         return WeiboDataProcessor(url, cookies=self.weibo_cookies)
+
+
+def _parse_weibo_created_at(created_at: str | None) -> int | None:
+    """Convert a Weibo ``created`` date string to Unix seconds; None if empty or unparseable."""
+    if not created_at:
+        return None
+    try:
+        stamp = parsedate_to_datetime(created_at)
+    except (TypeError, ValueError, IndexError):
+        return None
+    zone = stamp.tzinfo if stamp.tzinfo is not None else datetime.timezone.utc
+    return int(stamp.replace(tzinfo=zone).timestamp())
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py
index b586215..72047e5 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py
@@ -30,6 +30,7 @@ def __init__(self, url: str, data: Any, **kwargs):
         self.like_count = None
         self.updated = None
         self.created = None
+        self.timestamp = None
         self.raw_content = None
 
     async def get_item(self) -> dict:
@@ -59,11 +60,11 @@ async def _process_xiaohongshu_note(self, json_data: dict):
         self.raw_content = json_data.get("desc", "")
         raw_time = json_data.get("time", 0)
         raw_updated = json_data.get("last_update_time", 0)
-        self.created = (
-            unix_timestamp_to_utc(int(raw_time) / 1000) if raw_time else None
-        )
+        self.timestamp = _parse_xiaohongshu_timestamp(raw_time)
+        updated_timestamp = _parse_xiaohongshu_timestamp(raw_updated)
+        self.created = unix_timestamp_to_utc(self.timestamp) if self.timestamp else None
         self.updated = (
-            unix_timestamp_to_utc(int(raw_updated) / 1000) if raw_updated else None
+            unix_timestamp_to_utc(updated_timestamp) if updated_timestamp else None
         )
         self.like_count = json_data.get("liked_count")
         self.collected_count = json_data.get("collected_count")
@@ -92,3 +93,15 @@ async def _process_xiaohongshu_note(self, json_data: dict):
                     f'<p><video src="{media_file.url}" controls="controls"></video></p>'
                 )
         self.content = content_template.render(data=data)
+
+
+def _parse_xiaohongshu_timestamp(value: Any) -> int | None:
+    if value in (None, ""):
+        return None
+    try:
+        timestamp = int(value)
+    except (TypeError, ValueError):
+        return None
+    if timestamp <= 0:
+        return None
+    return timestamp // 1000 if timestamp > 10_000_000_000 else timestamp
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py
index 8f8923d..1d7f847 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py
@@ -90,6 +90,12 @@ def replace_inner_quotes(match):
     return raw_str
 
 
+def _parse_zhihu_timestamp(timestamp: Any) -> int | None:
+    if isinstance(timestamp, bool) or not isinstance(timestamp, int) or timestamp <= 0:
+        return None
+    return timestamp
+
+
 class Zhihu(MetadataItem):
     def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
         # metadata fields
@@ -112,6 +118,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
         self.raw_content = ""
         self.date = ""
         self.updated = ""
+        self.timestamp = None
         self.retweet_html = ""
         self.upvote: int = 0
         self.retweeted: bool = False
@@ -374,6 +381,7 @@ async def _get_zhihu_status(self):
                 self.raw_content = unmask_zhihu_links(self.raw_content)
             self.media_files.extend(data["media_files"])
             self.date = unix_timestamp_to_utc(data["created"])
+            self.timestamp = _parse_zhihu_timestamp(data["created"])
             self.updated = unix_timestamp_to_utc(data["updated"])
             self.upvote = data["like_count"]
             if data["origin_pin_id"]:
@@ -464,6 +472,7 @@ def _process_picture(pictures, content_attr):
                 )
                 self.raw_content = status_data["content"]
                 self.date = unix_timestamp_to_utc(status_data["created"])
+                self.timestamp = _parse_zhihu_timestamp(status_data["created"])
                 self.updated = unix_timestamp_to_utc(status_data["updated"])
                 self.upvote = status_data["like_count"]
                 self.comment_count = status_data["comment_count"]
@@ -549,6 +558,7 @@ async def _get_zhihu_article(self):
                 self.upvote = json_data["voteup_count"]
                 self.comment_count = json_data.get("comment_count", 0)
                 self.date = unix_timestamp_to_utc(json_data.get("created", 0))
+                self.timestamp = _parse_zhihu_timestamp(json_data.get("created", 0))
                 self.updated = unix_timestamp_to_utc(json_data.get("updated", 0))
                 if json_data.get("column"):
                     self.column = json_data["column"].get("title", "")
@@ -579,6 +589,7 @@ async def _get_zhihu_article(self):
                 self.upvote = article_data["voteup_count"]
                 self.comment_count = article_data["comment_count"]
                 self.date = unix_timestamp_to_utc(article_data["created"])
+                self.timestamp = _parse_zhihu_timestamp(article_data["created"])
                 self.updated = unix_timestamp_to_utc(article_data["updated"])
                 self.column = article_data["column"]
                 self.column_url = article_data["column_url"]
@@ -711,6 +722,7 @@ def _resolve_answer_json_data(self, answer_data: Dict) -> None:
                           ) or ""
         self.raw_content = answer_data["content"] or ""
         self.date = unix_timestamp_to_utc(answer_data["created"] or "") or ""
+        self.timestamp = _parse_zhihu_timestamp(answer_data["created"] or "")
         self.updated = unix_timestamp_to_utc(answer_data["updated"] or "") or ""
         self.comment_count = answer_data["comment_count"] or 0
         self.upvote = answer_data["voteup_count"] or 0
diff --git a/packages/shared/fastfetchbot_shared/utils/parse.py b/packages/shared/fastfetchbot_shared/utils/parse.py
index 98a5d4a..08a02fd 100644
--- a/packages/shared/fastfetchbot_shared/utils/parse.py
+++ b/packages/shared/fastfetchbot_shared/utils/parse.py
@@ -11,6 +11,7 @@
 from fastfetchbot_shared.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS, BANNED_PATTERNS
 
 TELEGRAM_TEXT_LIMIT = 900
+BEIJING_TZ = datetime.timezone(datetime.timedelta(hours=8))
 
 mimetypes.init()
 
@@ -47,8 +48,9 @@ def format_telegram_short_text(soup: BeautifulSoup) -> BeautifulSoup:
 def unix_timestamp_to_utc(timestamp: int) -> str | None:
     if not timestamp:
         return None
-    utc_time = datetime.datetime.utcfromtimestamp(timestamp)
-    beijing_time = utc_time + datetime.timedelta(hours=8)
+    beijing_time = datetime.datetime.fromtimestamp(
+        timestamp, datetime.timezone.utc
+    ).astimezone(BEIJING_TZ)
     return beijing_time.strftime("%Y-%m-%d %H:%M")
 
 
diff --git a/tests/unit/database/mongodb/test_cache.py b/tests/unit/database/mongodb/test_cache.py
index 0bdb549..e40d3bc 100644
--- a/tests/unit/database/mongodb/test_cache.py
+++ b/tests/unit/database/mongodb/test_cache.py
@@ -132,9 +132,39 @@ async def test_first_save_uses_version_1(self):
 
         assert item["version"] == 1
         MockMetadata.model_construct.assert_called_once()
+        assert (
+            MockMetadata.model_construct.call_args.kwargs["published_timestamp"]
+            is None
+        )
+        assert "timestamp" not in MockMetadata.model_construct.call_args.kwargs
         MockMetadata.insert.assert_awaited_once_with(mock_constructed)
         assert result is mock_constructed
 
+    @pytest.mark.asyncio
+    async def test_maps_metadata_timestamp_to_published_timestamp(self):
+        mock_find = _make_find_chain(None)
+
+        with patch(
+            "fastfetchbot_shared.database.mongodb.cache.Metadata"
+        ) as MockMetadata:
+            MockMetadata.find.return_value = mock_find
+            MockMetadata.model_construct.return_value = MagicMock()
+            MockMetadata.insert = AsyncMock()
+
+            from fastfetchbot_shared.database.mongodb.cache import save_metadata
+
+            item = {
+                "url": "https://example.com",
+                "title": "Test",
+                "timestamp": 1704067200,
+            }
+            await save_metadata(item)
+
+        construct_kwargs = MockMetadata.model_construct.call_args.kwargs
+        assert construct_kwargs["published_timestamp"] == 1704067200
+        assert "timestamp" not in construct_kwargs
+        assert item["timestamp"] == 1704067200
+
     @pytest.mark.asyncio
     async def test_increments_version_from_existing(self):
         existing_doc = _make_mock_metadata(version=3)
diff --git a/tests/unit/scrapers/test_common_cache.py b/tests/unit/scrapers/test_common_cache.py
index f55b6e1..e3ff819 100644
--- a/tests/unit/scrapers/test_common_cache.py
+++ b/tests/unit/scrapers/test_common_cache.py
@@ -69,6 +69,9 @@ async def test_cache_hit_returns_cached_result(self, make_service):
             "title": "Cached Title",
             "url": "https://example.com/post/1",
             "media_files": [],
+            "content": "<p>Cached body</p>",
+            "timestamp": "2026-01-01T00:00:00",
+            "published_timestamp": 1704067200,
         }
 
         svc = make_service(store_database=True, database_cache_ttl=3600)
@@ -82,6 +85,9 @@ async def test_cache_hit_returns_cached_result(self, make_service):
 
         assert result["_cached"] is True
         assert result["title"] == "Cached Title"
+        assert result["timestamp"] == 1704067200
+        assert result["content"] == "<p>Cached body</p>"
+        assert "published_timestamp" not in result
         mock_cached_doc.model_dump.assert_called_once_with(
             mode="json", exclude={"id"}
         )
diff --git a/tests/unit/scrapers/test_wechat.py b/tests/unit/scrapers/test_wechat.py
index 2c77c9d..e5d1e0a 100644
--- a/tests/unit/scrapers/test_wechat.py
+++ b/tests/unit/scrapers/test_wechat.py
@@ -43,6 +43,7 @@ def test_parses_article_data(self):
         html_str = """
         <html>
         <body>
+        <script>var ct = "1704067200";</script>
         <div id="js_article">
             <h1 id="activity-name">  Test Title\n  </h1>
             <a id="js_name">  Test Author\n  </a>
@@ -56,6 +57,7 @@ def test_parses_article_data(self):
         assert result["title"] == "Test Title"
         assert result["author"] == "Test Author"
         assert "Test content paragraph" in result["content"]
+        assert "timestamp" not in result
 
     def test_strips_newlines_and_whitespace(self):
         html_str = """
diff --git a/tests/unit/test_published_timestamp.py b/tests/unit/test_published_timestamp.py
new file mode 100644
index 0000000..31c5d05
--- /dev/null
+++ b/tests/unit/test_published_timestamp.py
@@ -0,0 +1,43 @@
+from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType, MetadataItem
+
+
+def test_metadata_item_to_dict_includes_timestamp_without_changing_content():
+    item = MetadataItem(
+        url="https://example.com/post",
+        telegraph_url="",
+        content="<p>Body</p>",
+        text="Body",
+        media_files=[MediaFile(media_type="image", url="https://example.com/a.jpg")],
+        author="Author",
+        title="Title",
+        author_url="https://example.com/author",
+        category="example",
+        message_type=MessageType.SHORT,
+        timestamp=1704067200,
+    )
+
+    data = item.to_dict()
+
+    assert data["timestamp"] == 1704067200
+    assert data["content"] == "<p>Body</p>"
+
+
+def test_metadata_item_to_dict_does_not_parse_datetime_strings():
+    item = MetadataItem(
+        url="https://example.com/post",
+        telegraph_url="",
+        content="<p>Body</p>",
+        text="Body",
+        media_files=[],
+        author="Author",
+        title="Title",
+        author_url="https://example.com/author",
+        category="example",
+        message_type=MessageType.SHORT,
+        timestamp="2024-01-01T00:00:00",  # type: ignore[arg-type]
+    )
+
+    data = item.to_dict()
+
+    assert data["timestamp"] is None
+    assert data["content"] == "<p>Body</p>"

From 99eaeddaf5d299db4aa5c9413ef98a5a041be009 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 23 May 2026 22:38:46 -0500
Subject: [PATCH 2/3] refactor: remove Zyte datePublished timestamp parsing

---
 .../services/scrapers/general/zyte.py         | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py
index 0186373..d011d6f 100644
--- a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py
+++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py
@@ -1,5 +1,3 @@
-import datetime
-
 from zyte_api import AsyncZyteAPI
 
 from fastfetchbot_shared.services.scrapers.config import settings
@@ -61,11 +59,6 @@ async def _process_zyte_result(self, result: dict) -> None:
         # Extract main image
         main_image = article.get("mainImage", {})
         og_image = main_image.get("url") if main_image else None
-        timestamp = _parse_zyte_date_published(
-            article.get("datePublished")
-            or article.get("date_published")
-            or article.get("publishedDate")
-        )
 
         await self._build_item_data(
             title=title,
@@ -74,7 +67,6 @@ async def _process_zyte_result(self, result: dict) -> None:
             markdown_content=markdown_content,
             html_content=html_content,
             og_image=og_image,
-            timestamp=timestamp,
         )
 
 
@@ -85,15 +77,3 @@ class ZyteScraper(BaseGeneralScraper):
 
     async def get_processor_by_url(self, url: str) -> DataProcessor:
         return ZyteDataProcessor(url)
-
-
-def _parse_zyte_date_published(value: str | None) -> int | None:
-    if not value:
-        return None
-    try:
-        parsed = datetime.datetime.fromisoformat(str(value).strip().replace("Z", "+00:00"))
-    except ValueError:
-        return None
-    if parsed.tzinfo is None:
-        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
-    return int(parsed.timestamp())

From ff68610eca713a8e0235111063de30ada560f283 Mon Sep 17 00:00:00 2001
From: aturret <enturreopy@gmail.com>
Date: Sat, 23 May 2026 22:45:32 -0500
Subject: [PATCH 3/3] fix: validate Metadata document before MongoDB cache insert

---
 .../database/mongodb/cache.py                 |  9 ++++-
 tests/unit/database/mongodb/test_cache.py     | 40 +++++++++++++------
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py
index 20a01e9..83b838e 100644
--- a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py
+++ b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py
@@ -6,6 +6,8 @@
 from datetime import datetime, timedelta
 from typing import Optional
 
+from pydantic import ValidationError
+
 from fastfetchbot_shared.database.mongodb.models.metadata import Metadata
 from fastfetchbot_shared.utils.logger import logger
 
@@ -74,7 +76,12 @@ async def save_metadata(metadata_item: dict) -> Metadata:
 
     document_data = dict(metadata_item)
     document_data["published_timestamp"] = document_data.pop("timestamp", None)
-    doc = Metadata.model_construct(**document_data)
+    try:
+        doc = Metadata(**document_data)
+    except (ValidationError, ValueError) as e:
+        logger.error(f"Invalid metadata document for {url}: {e}")
+        raise ValueError("invalid metadata document") from e
+
     await Metadata.insert(doc)
 
     logger.info(f"Saved metadata for {url} (version={new_version})")
diff --git a/tests/unit/database/mongodb/test_cache.py b/tests/unit/database/mongodb/test_cache.py
index e40d3bc..a7d277d 100644
--- a/tests/unit/database/mongodb/test_cache.py
+++ b/tests/unit/database/mongodb/test_cache.py
@@ -121,8 +121,7 @@ async def test_first_save_uses_version_1(self):
             "fastfetchbot_shared.database.mongodb.cache.Metadata"
         ) as MockMetadata:
             MockMetadata.find.return_value = mock_find
-            mock_constructed = MagicMock()
-            MockMetadata.model_construct.return_value = mock_constructed
+            mock_document = MockMetadata.return_value
             MockMetadata.insert = AsyncMock()
 
             from fastfetchbot_shared.database.mongodb.cache import save_metadata
@@ -131,14 +130,14 @@ async def test_first_save_uses_version_1(self):
             result = await save_metadata(item)
 
         assert item["version"] == 1
-        MockMetadata.model_construct.assert_called_once()
+        MockMetadata.assert_called_once()
         assert (
-            MockMetadata.model_construct.call_args.kwargs["published_timestamp"]
+            MockMetadata.call_args.kwargs["published_timestamp"]
             is None
         )
-        assert "timestamp" not in MockMetadata.model_construct.call_args.kwargs
-        MockMetadata.insert.assert_awaited_once_with(mock_constructed)
-        assert result is mock_constructed
+        assert "timestamp" not in MockMetadata.call_args.kwargs
+        MockMetadata.insert.assert_awaited_once_with(mock_document)
+        assert result is mock_document
 
     @pytest.mark.asyncio
     async def test_maps_metadata_timestamp_to_published_timestamp(self):
@@ -148,7 +147,6 @@ async def test_maps_metadata_timestamp_to_published_timestamp(self):
             "fastfetchbot_shared.database.mongodb.cache.Metadata"
         ) as MockMetadata:
             MockMetadata.find.return_value = mock_find
-            MockMetadata.model_construct.return_value = MagicMock()
             MockMetadata.insert = AsyncMock()
 
             from fastfetchbot_shared.database.mongodb.cache import save_metadata
@@ -160,7 +158,7 @@ async def test_maps_metadata_timestamp_to_published_timestamp(self):
             }
             await save_metadata(item)
 
-        construct_kwargs = MockMetadata.model_construct.call_args.kwargs
+        construct_kwargs = MockMetadata.call_args.kwargs
         assert construct_kwargs["published_timestamp"] == 1704067200
         assert "timestamp" not in construct_kwargs
         assert item["timestamp"] == 1704067200
@@ -174,8 +172,6 @@ async def test_increments_version_from_existing(self):
             "fastfetchbot_shared.database.mongodb.cache.Metadata"
         ) as MockMetadata:
             MockMetadata.find.return_value = mock_find
-            mock_constructed = MagicMock()
-            MockMetadata.model_construct.return_value = mock_constructed
             MockMetadata.insert = AsyncMock()
 
             from fastfetchbot_shared.database.mongodb.cache import save_metadata
@@ -193,7 +189,6 @@ async def test_uses_url_from_metadata_item(self):
             "fastfetchbot_shared.database.mongodb.cache.Metadata"
         ) as MockMetadata:
             MockMetadata.find.return_value = mock_find
-            MockMetadata.model_construct.return_value = MagicMock()
             MockMetadata.insert = AsyncMock()
 
             from fastfetchbot_shared.database.mongodb.cache import save_metadata
@@ -204,6 +199,27 @@ async def test_uses_url_from_metadata_item(self):
         # Verify the find was called (to look up existing version)
         MockMetadata.find.assert_called()
 
+    @pytest.mark.asyncio
+    async def test_invalid_document_is_not_inserted(self):
+        mock_find = _make_find_chain(None)
+
+        with patch(
+            "fastfetchbot_shared.database.mongodb.cache.Metadata"
+        ) as MockMetadata, patch(
+            "fastfetchbot_shared.database.mongodb.cache.logger"
+        ) as mock_logger:
+            MockMetadata.find.return_value = mock_find
+            MockMetadata.side_effect = ValueError("bad payload")
+            MockMetadata.insert = AsyncMock()
+
+            from fastfetchbot_shared.database.mongodb.cache import save_metadata
+
+            with pytest.raises(ValueError, match="invalid metadata document"):
+                await save_metadata({"url": "https://example.com", "title": "Test"})
+
+        mock_logger.error.assert_called_once()
+        MockMetadata.insert.assert_not_awaited()
+
     @pytest.mark.asyncio
     async def test_missing_url_raises_value_error(self):
         from fastfetchbot_shared.database.mongodb.cache import save_metadata