aturret · aturret · May 24, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/apps/api/src/services/inoreader/__init__.py b/apps/api/src/services/inoreader/__init__.py
@@ -30,10 +30,13 @@ def __init__(self, url: str = None, data: dict = None, **kwargs):
             self.category = data.get("category", "")
             self.raw_content = data.get("content", "")
             self.content = self.raw_content
+            self.timestamp = _parse_inoreader_timestamp(data.get("timestamp"))
         if kwargs.get("category"):
             self.category = kwargs["category"]
         self.media_files = []
         self.message_type = MessageType.LONG
+        if not hasattr(self, "timestamp"):
+            self.timestamp = None
 
     def _from_data(self, data: dict):
         self.title = data.get("title", "")
@@ -43,6 +46,7 @@ def _from_data(self, data: dict):
         self.category = data.get("category", "")
         self.raw_content = data.get("content", "")
         self.content = self.raw_content
+        self.timestamp = _parse_inoreader_timestamp(data.get("timestamp"))
 
     async def get_item(self, api: bool = False) -> dict:
         if api:
@@ -161,3 +165,9 @@ async def get_api_info(
                 headers=headers,
             )
             return resp
+
+
+def _parse_inoreader_timestamp(timestamp: int | None) -> int | None:
+    """Return ``timestamp`` only when it is a positive, non-bool int; otherwise None."""
+    is_valid = isinstance(timestamp, int) and not isinstance(timestamp, bool) and timestamp > 0
+    return timestamp if is_valid else None
diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py
@@ -6,6 +6,8 @@
 from datetime import datetime, timedelta
 from typing import Optional
 
+from pydantic import ValidationError
+
 from fastfetchbot_shared.database.mongodb.models.metadata import Metadata
 from fastfetchbot_shared.utils.logger import logger
 
@@ -72,7 +74,14 @@ async def save_metadata(metadata_item: dict) -> Metadata:
     new_version = (latest.version + 1) if latest else 1
     metadata_item["version"] = new_version
 
-    doc = Metadata.model_construct(**metadata_item)
+    document_data = dict(metadata_item)
+    document_data["published_timestamp"] = document_data.pop("timestamp", None)
+    try:
+        doc = Metadata(**document_data)
+    except (ValidationError, ValueError) as e:
+        logger.error(f"Invalid metadata document for {url}: {e}")
+        raise ValueError("invalid metadata document") from e
+
     await Metadata.insert(doc)
 
     logger.info(f"Saved metadata for {url} (version={new_version})")

diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py b/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py
@@ -34,6 +34,7 @@ class Metadata(Document):
     source: Optional[str] = None
     media_files: Optional[list[DatabaseMediaFile]] = None
     telegraph_url: Optional[str] = None
+    published_timestamp: Optional[int] = None
     timestamp: datetime = Field(default_factory=datetime.utcnow)
     scrape_status: bool = False
     version: int = Field(default=1, ge=1)

diff --git a/packages/shared/fastfetchbot_shared/models/metadata_item.py b/packages/shared/fastfetchbot_shared/models/metadata_item.py
@@ -20,6 +20,12 @@ def from_str(x: Any) -> str:
     return x
 
 
+def from_optional_int(x: Any) -> Optional[int]:
+    """Pass a positive, non-bool int through unchanged; map every other value to None."""
+    accepted = not isinstance(x, bool) and isinstance(x, int) and x > 0
+    return x if accepted else None
+
+
 def from_list(f: Callable[[Any], T], x: Any) -> List[T]:
     assert isinstance(x, list)
     return [f(y) for y in x]
@@ -75,6 +81,7 @@ class MetadataItem:
     author_url: Optional[str]
     category: str
     message_type: Optional[MessageType]
+    timestamp: Optional[int] = None
 
     @staticmethod
     def from_dict(obj: Any) -> "MetadataItem":
@@ -89,6 +96,7 @@ def from_dict(obj: Any) -> "MetadataItem":
         author_url = from_str(obj.get("author_url"))
         category = from_str(obj.get("category"))
         message_type = MessageType(obj.get("message_type"))
+        timestamp = from_optional_int(obj.get("timestamp"))
         return MetadataItem(
             url,
             telegraph_url,
@@ -100,21 +108,29 @@ def from_dict(obj: Any) -> "MetadataItem":
             author_url,
             category,
             message_type,
+            timestamp,
         )
 
     def to_dict(self) -> dict:
+        timestamp = from_optional_int(getattr(self, "timestamp", None))  # bool, non-int or non-positive -> None
+        message_type = getattr(self, "message_type", None)
+        message_type_value = (  # MessageType members emit .value; any other value passes through as-is
+            message_type.value if isinstance(message_type, MessageType) else message_type
+        )
         result: dict = {
-            "url": from_str(self.url),
-            "telegraph_url": "", "content": from_str(self.content),
-            "text": from_str(self.text),
+            "url": from_str(getattr(self, "url", "")),  # getattr defaults cover instances missing the attribute
+            "telegraph_url": "",  # always serialized as an empty string here
+            "content": from_str(getattr(self, "content", "")),
+            "text": from_str(getattr(self, "text", "")),
             "media_files": from_list(
-                lambda x: to_class(MediaFile, x), self.media_files
+                lambda x: to_class(MediaFile, x), getattr(self, "media_files", [])
             ),
-            "author": from_str(self.author),
-            "title": from_str(self.title),
-            "author_url": from_str(self.author_url),
-            "category": from_str(self.category),
-            "message_type": self.message_type.value
+            "author": from_str(getattr(self, "author", "")),
+            "title": from_str(getattr(self, "title", "")),
+            "author_url": from_str(getattr(self, "author_url", "")),
+            "category": from_str(getattr(self, "category", "")),
+            "message_type": message_type_value,
+            "timestamp": timestamp,
         }
         return result
 

diff --git a/packages/shared/fastfetchbot_shared/services/file_export/video_download.py b/packages/shared/fastfetchbot_shared/services/file_export/video_download.py
@@ -6,12 +6,18 @@
 """
 
 import asyncio
+import datetime
+import re
 from urllib.parse import urlparse, parse_qs
 
 import httpx
 
 from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile
-from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, second_to_time, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    second_to_time,
+    unix_timestamp_to_utc,
+    wrap_text_into_html,
+)
 from fastfetchbot_shared.utils.logger import logger
 from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV
 
@@ -58,6 +64,7 @@ def __init__(
         self.category = category
         self.media_files = []
         self.created = None
+        self.timestamp = None
         self.duration = None
         self.celery_app = celery_app
         self.timeout = timeout
@@ -219,6 +226,7 @@ def _video_info_formatting(self, meta_info: dict):
         if len(meta_info["description"]) > 800:
             meta_info["description"] = meta_info["description"][:800] + "..."
         self.created = meta_info["upload_date"]
+        self.timestamp = meta_info.get("timestamp")
         self.duration = meta_info["duration"]
         self.text = video_info_template.render(
             data={
@@ -250,6 +258,7 @@ def _youtube_info_parse(video_info: dict) -> dict:
             "playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u8bc4\u8bba\u6570\uff1a{video_info['comment_count']}",
             "author_avatar": video_info["thumbnail"],
             "upload_date": str(video_info["upload_date"]),
+            "timestamp": _parse_youtube_upload_date(video_info["upload_date"]),
             "duration": second_to_time(round(video_info["duration"])),
         }
 
@@ -266,5 +275,26 @@ def _bilibili_info_parse(video_info: dict) -> dict:
             "description": video_info["description"],
             "playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u5f39\u5e55\u6570\uff1a{video_info['comment_count']} \u70b9\u8d5e\u6570\uff1a{video_info['like_count']}",
             "upload_date": unix_timestamp_to_utc(video_info["timestamp"]),
+            "timestamp": _parse_bilibili_timestamp(video_info["timestamp"]),
             "duration": second_to_time(round(video_info["duration"])),
         }
+
+
+def _parse_youtube_upload_date(upload_date: str | int | None) -> int | None:
+    """Turn a ``YYYYMMDD`` upload date into Unix seconds at 00:00 UTC, or None if invalid."""
+    if upload_date is None:
+        return None
+    digits = str(upload_date).strip()
+    if re.fullmatch(r"\d{8}", digits) is None:  # must be exactly eight digits
+        return None
+    try:
+        day = datetime.datetime.strptime(digits, "%Y%m%d")
+    except ValueError:  # eight digits that do not form a real calendar date
+        return None
+    return int(day.replace(tzinfo=datetime.timezone.utc).timestamp())
+
+
+def _parse_bilibili_timestamp(timestamp: int | None) -> int | None:
+    """Keep a positive, non-bool int ``timestamp`` as-is; map anything else to None."""
+    usable = isinstance(timestamp, int) and timestamp > 0 and not isinstance(timestamp, bool)
+    return timestamp if usable else None
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py
@@ -32,6 +32,7 @@ def from_dict(obj: Any) -> "Bluesky":
             media_files=bluesky_item.media_files,
             category=bluesky_item.category,
             message_type=bluesky_item.message_type,
+            timestamp=bluesky_item.timestamp,
             cid=bluesky_item.cid,
             author_did=bluesky_item.author_did,
         )

diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py
@@ -1,3 +1,4 @@
+import datetime
 from typing import Optional
 from urllib.parse import urlparse
 
@@ -119,6 +120,7 @@ async def _resolve_single_post_data(post_data: PostView) -> dict:
             "category": "bluesky",
             "media_files": [],
             "created_at": created_at,
+            "timestamp": _parse_bluesky_created_at(created_at),
             "author_did": author_did,
         }
 
@@ -190,3 +192,15 @@ async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost:
         except Exception as e:
             logger.error(f"Error while getting post data: {e}")
             raise
+
+
+def _parse_bluesky_created_at(created_at: str | None) -> int | None:
+    """Convert an ISO-8601 ``created_at`` to Unix seconds (naive values read as UTC), else None."""
+    if not created_at:
+        return None
+    try:
+        moment = datetime.datetime.fromisoformat(created_at.replace("Z", "+00:00"))
+    except (TypeError, ValueError):
+        return None
+    aware = moment if moment.tzinfo is not None else moment.replace(tzinfo=datetime.timezone.utc)
+    return int(aware.timestamp())
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/common.py b/packages/shared/fastfetchbot_shared/services/scrapers/common.py
@@ -90,6 +90,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict:
                 if cached is not None:
                     logger.info("Cache hit, returning cached metadata")
                     result = cached.model_dump(mode="json", exclude={"id"})
+                    result["timestamp"] = result.pop("published_timestamp", None)
                     result["_cached"] = True
                     return result
             except Exception as e:

diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py
@@ -6,7 +6,10 @@
 from bs4 import BeautifulSoup
 from lxml import etree
 
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 from fastfetchbot_shared.utils.network import get_selector, HEADERS
 from fastfetchbot_shared.utils.logger import logger
 from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType
@@ -57,6 +60,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
         self.text_group: Optional[str] = None
         self.raw_content: Optional[str] = None
         self.date: Optional[str] = None
+        self.timestamp: Optional[int] = None
         # reqeust fields
         self.headers = HEADERS
         self.headers["Cookie"] = kwargs.get("cookie", "")

diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py
@@ -27,6 +27,7 @@ def from_dict(obj: Any) -> "GeneralItem":
             media_files=metadata_item.media_files,
             category=metadata_item.category,
             message_type=metadata_item.message_type,
+            timestamp=metadata_item.timestamp,
             id=obj.get("id", ""),
             raw_content=obj.get("raw_content", ""),
             scraper_type=obj.get("scraper_type", ""),

diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py
@@ -11,7 +11,10 @@
 from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType
 from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor
 from fastfetchbot_shared.services.scrapers.general import GeneralItem
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 from fastfetchbot_shared.utils.logger import logger
 
 GENERAL_TEXT_LIMIT = 800
@@ -67,6 +70,7 @@ async def _build_item_data(
         markdown_content: str,
         html_content: str,
         og_image: Optional[str] = None,
+        timestamp: Optional[int] = None,
     ) -> None:
         """
         Common method to build item data from scraped content.
@@ -79,6 +83,13 @@ async def _build_item_data(
             "author": author or self.url_parser.netloc,
             "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
             "scraper_type": self.scraper_type,
+            "timestamp": (
+                timestamp
+                if not isinstance(timestamp, bool)
+                and isinstance(timestamp, int)
+                and timestamp > 0
+                else None
+            ),
         }
 
         # Process text content - use description or first part of markdown

diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py
@@ -10,7 +10,10 @@
 from fastfetchbot_shared.services.scrapers.scraper import DataProcessor
 from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType
 from fastfetchbot_shared.utils.logger import logger
-from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+    wrap_text_into_html,
+)
 
 # HTML tags to exclude from Firecrawl output at the source
 FIRECRAWL_EXCLUDE_TAGS = [

diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py
@@ -7,7 +7,9 @@
 
 from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile
 from fastfetchbot_shared.utils.network import get_response
-from fastfetchbot_shared.utils.parse import get_html_text_length
+from fastfetchbot_shared.utils.parse import (
+    get_html_text_length,
+)
 from fastfetchbot_shared.utils.logger import logger
 from .config import API_HEADERS_LIST, ALL_SCRAPERS
 from fastfetchbot_shared.services.scrapers.config import settings
@@ -23,6 +25,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
             "/", ""
         )
         self.message_type = MessageType.SHORT
+        self.timestamp = None
 
     async def get_item(self):
         await self.get_instagram()
@@ -115,6 +118,9 @@ def _get_ins_post_looter2(ins_data: dict) -> dict:
         )
         ins_info["content"] = ""
         ins_info["text"] = ins_text_data
+        ins_info["timestamp"] = _parse_instagram_timestamp(
+            ins_data.get("taken_at_timestamp") or ins_data.get("taken_at")
+        )
         ins_info["author"] = ins_data["owner"]["username"]
         if ins_data["owner"]["full_name"]:
             ins_info["author"] += "(" + ins_data["owner"]["full_name"] + ")"
@@ -190,6 +196,10 @@ def _get_ins_post_ins28_scraper2(ins_data):
         )
         ins_info["content"] = ""
         ins_info["text"] = ins_text_data
+        ins_info["timestamp"] = _parse_instagram_timestamp(
+            ins_data["items"][0].get("taken_at")
+            or ins_data["items"][0].get("taken_at_timestamp")
+        )
         ins_info["author"] = ins_data["items"][0]["user"]["username"]
         if ins_data["items"][0]["user"]["full_name"]:
             ins_info["author"] += "(" + ins_data["items"][0]["user"]["full_name"] + ")"
@@ -269,3 +279,17 @@ def _get_ins_post_ins28_scraper2(ins_data):
 
     async def _get_story_info(self):
         pass
+
+
+def _parse_instagram_timestamp(value: Any) -> int | None:
+    """Coerce an Instagram ``taken_at`` value to positive Unix seconds, or None if unusable."""
+    if isinstance(value, bool) or value in (None, ""):  # bool is an int subclass; reject it
+        return None
+    try:
+        timestamp = int(value)
+    except (TypeError, ValueError, OverflowError):  # OverflowError: int(float("inf"))
+        return None
+    if timestamp <= 0:
+        return None
+    # Values above 1e10 are presumably milliseconds, so scale them down to seconds.
+    return timestamp // 1000 if timestamp > 10_000_000_000 else timestamp
diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py
@@ -19,6 +19,7 @@ def __init__(self, url, data: Optional[Any] = None, **kwargs):
         self.category = "reddit"
         self.media_files = []
         self.message_type = MessageType.LONG
+        self.timestamp = None
 
     async def get_item(self) -> dict:
         await self.get_reddit()
@@ -47,7 +48,8 @@ async def _process_reddit_data(self, reddit_data) -> None:
         self.author = reddit_data["author"].name
         self.author_url = f"https://www.reddit.com/user/{self.author}"
         self.raw_content = reddit_data["selftext_html"] or ""
-        self.created = unix_timestamp_to_utc(int(reddit_data["created_utc"]))
+        self.timestamp = int(reddit_data["created_utc"])
+        self.created = unix_timestamp_to_utc(self.timestamp)
         self.score = reddit_data["score"]
         self.comments_count = reddit_data["num_comments"]
         self.upvote_ratio = reddit_data["upvote_ratio"]