From 6ea5363c8839a2e23678a72eddbd3c836c97b76a Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 23 May 2026 22:32:53 -0500 Subject: [PATCH 1/3] feat: add timestamp for scraper content --- apps/api/src/services/inoreader/__init__.py | 10 +++++ .../database/mongodb/cache.py | 4 +- .../database/mongodb/models/metadata.py | 1 + .../models/metadata_item.py | 34 +++++++++++---- .../services/file_export/video_download.py | 32 +++++++++++++- .../services/scrapers/bluesky/__init__.py | 1 + .../services/scrapers/bluesky/scraper.py | 14 ++++++ .../services/scrapers/common.py | 1 + .../services/scrapers/douban/__init__.py | 6 ++- .../services/scrapers/general/__init__.py | 1 + .../services/scrapers/general/base.py | 13 +++++- .../services/scrapers/general/firecrawl.py | 5 ++- .../services/scrapers/general/zyte.py | 20 +++++++++ .../services/scrapers/instagram/__init__.py | 26 ++++++++++- .../services/scrapers/reddit/__init__.py | 4 +- .../services/scrapers/threads/__init__.py | 2 + .../services/scrapers/twitter/__init__.py | 21 ++++++++- .../services/scrapers/wechat/__init__.py | 3 ++ .../services/scrapers/weibo/__init__.py | 2 +- .../services/scrapers/weibo/scraper.py | 20 ++++++++- .../services/scrapers/xiaohongshu/__init__.py | 21 +++++++-- .../services/scrapers/zhihu/__init__.py | 12 ++++++ .../shared/fastfetchbot_shared/utils/parse.py | 6 ++- tests/unit/database/mongodb/test_cache.py | 30 +++++++++++++ tests/unit/scrapers/test_common_cache.py | 6 +++ tests/unit/scrapers/test_wechat.py | 2 + tests/unit/test_published_timestamp.py | 43 +++++++++++++++++++ 27 files changed, 315 insertions(+), 25 deletions(-) create mode 100644 tests/unit/test_published_timestamp.py diff --git a/apps/api/src/services/inoreader/__init__.py b/apps/api/src/services/inoreader/__init__.py index 2bbaa15..e072bc6 100644 --- a/apps/api/src/services/inoreader/__init__.py +++ b/apps/api/src/services/inoreader/__init__.py @@ -30,10 +30,13 @@ def __init__(self, url: str = None, data: dict = None, **kwargs): self.category = data.get("category", "") self.raw_content = data.get("content", "") self.content = self.raw_content + self.timestamp = _parse_inoreader_timestamp(data.get("timestamp")) if kwargs.get("category"): self.category = kwargs["category"] self.media_files = [] self.message_type = MessageType.LONG + if not hasattr(self, "timestamp"): + self.timestamp = None def _from_data(self, data: dict): self.title = data.get("title", "") @@ -43,6 +46,7 @@ def _from_data(self, data: dict): self.category = data.get("category", "") self.raw_content = data.get("content", "") self.content = self.raw_content + self.timestamp = _parse_inoreader_timestamp(data.get("timestamp")) async def get_item(self, api: bool = False) -> dict: if api: @@ -161,3 +165,9 @@ async def get_api_info( headers=headers, ) return resp + + +def _parse_inoreader_timestamp(timestamp: int | None) -> int | None: + if isinstance(timestamp, bool) or not isinstance(timestamp, int) or timestamp <= 0: + return None + return timestamp diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py index 3684332..20a01e9 100644 --- a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py +++ b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py @@ -72,7 +72,9 @@ async def save_metadata(metadata_item: dict) -> Metadata: new_version = (latest.version + 1) if latest else 1 metadata_item["version"] = new_version - doc = Metadata.model_construct(**metadata_item) + document_data = dict(metadata_item) + document_data["published_timestamp"] = document_data.pop("timestamp", None) + doc = Metadata.model_construct(**document_data) await Metadata.insert(doc) logger.info(f"Saved metadata for {url} (version={new_version})") diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py b/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py index 358f87f..96ee0e8 100644 --- a/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py +++ b/packages/shared/fastfetchbot_shared/database/mongodb/models/metadata.py @@ -34,6 +34,7 @@ class Metadata(Document): source: Optional[str] = None media_files: Optional[list[DatabaseMediaFile]] = None telegraph_url: Optional[str] = None + published_timestamp: Optional[int] = None timestamp: datetime = Field(default_factory=datetime.utcnow) scrape_status: bool = False version: int = Field(default=1, ge=1) diff --git a/packages/shared/fastfetchbot_shared/models/metadata_item.py b/packages/shared/fastfetchbot_shared/models/metadata_item.py index 4c37c1c..2613802 100644 --- a/packages/shared/fastfetchbot_shared/models/metadata_item.py +++ b/packages/shared/fastfetchbot_shared/models/metadata_item.py @@ -20,6 +20,12 @@ def from_str(x: Any) -> str: return x +def from_optional_int(x: Any) -> Optional[int]: + if isinstance(x, bool) or not isinstance(x, int) or x <= 0: + return None + return x + + def from_list(f: Callable[[Any], T], x: Any) -> List[T]: assert isinstance(x, list) return [f(y) for y in x] @@ -75,6 +81,7 @@ class MetadataItem: author_url: Optional[str] category: str message_type: Optional[MessageType] + timestamp: Optional[int] = None @staticmethod def from_dict(obj: Any) -> "MetadataItem": @@ -89,6 +96,7 @@ def from_dict(obj: Any) -> "MetadataItem": author_url = from_str(obj.get("author_url")) category = from_str(obj.get("category")) message_type = MessageType(obj.get("message_type")) + timestamp = from_optional_int(obj.get("timestamp")) return MetadataItem( url, telegraph_url, @@ -100,21 +108,29 @@ def from_dict(obj: Any) -> "MetadataItem": author_url, category, message_type, + timestamp, ) def to_dict(self) -> dict: + timestamp = from_optional_int(getattr(self, "timestamp", None)) + message_type = getattr(self, "message_type", None) + message_type_value = ( + message_type.value if isinstance(message_type, MessageType) else message_type + ) result: dict = { - "url": from_str(self.url), - "telegraph_url": "", "content": from_str(self.content), - "text": from_str(self.text), + "url": from_str(getattr(self, "url", "")), + "telegraph_url": "", + "content": from_str(getattr(self, "content", "")), + "text": from_str(getattr(self, "text", "")), "media_files": from_list( - lambda x: to_class(MediaFile, x), self.media_files + lambda x: to_class(MediaFile, x), getattr(self, "media_files", []) ), - "author": from_str(self.author), - "title": from_str(self.title), - "author_url": from_str(self.author_url), - "category": from_str(self.category), - "message_type": self.message_type.value + "author": from_str(getattr(self, "author", "")), + "title": from_str(getattr(self, "title", "")), + "author_url": from_str(getattr(self, "author_url", "")), + "category": from_str(getattr(self, "category", "")), + "message_type": message_type_value, + "timestamp": timestamp, } return result diff --git a/packages/shared/fastfetchbot_shared/services/file_export/video_download.py b/packages/shared/fastfetchbot_shared/services/file_export/video_download.py index 5be1735..519a699 100644 --- a/packages/shared/fastfetchbot_shared/services/file_export/video_download.py +++ b/packages/shared/fastfetchbot_shared/services/file_export/video_download.py @@ -6,12 +6,18 @@ """ import asyncio +import datetime +import re from urllib.parse import urlparse, parse_qs import httpx from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile -from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, second_to_time, wrap_text_into_html +from fastfetchbot_shared.utils.parse import ( + second_to_time, + unix_timestamp_to_utc, + wrap_text_into_html, +) from fastfetchbot_shared.utils.logger import logger from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV @@ -58,6 +64,7 @@ def __init__( self.category = category self.media_files = [] self.created = None + self.timestamp = None self.duration = None self.celery_app = celery_app self.timeout = timeout @@ -219,6 +226,7 @@ def _video_info_formatting(self, meta_info: dict): if len(meta_info["description"]) > 800: meta_info["description"] = meta_info["description"][:800] + "..." self.created = meta_info["upload_date"] + self.timestamp = meta_info.get("timestamp") self.duration = meta_info["duration"] self.text = video_info_template.render( data={ @@ -250,6 +258,7 @@ def _youtube_info_parse(video_info: dict) -> dict: "playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u8bc4\u8bba\u6570\uff1a{video_info['comment_count']}", "author_avatar": video_info["thumbnail"], "upload_date": str(video_info["upload_date"]), + "timestamp": _parse_youtube_upload_date(video_info["upload_date"]), "duration": second_to_time(round(video_info["duration"])), } @@ -266,5 +275,26 @@ def _bilibili_info_parse(video_info: dict) -> dict: "description": video_info["description"], "playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u5f39\u5e55\u6570\uff1a{video_info['comment_count']} \u70b9\u8d5e\u6570\uff1a{video_info['like_count']}", "upload_date": unix_timestamp_to_utc(video_info["timestamp"]), + "timestamp": _parse_bilibili_timestamp(video_info["timestamp"]), "duration": second_to_time(round(video_info["duration"])), } + + +def _parse_youtube_upload_date(upload_date: str | int | None) -> int | None: + if upload_date is None: + return None + upload_date_text = str(upload_date).strip() + if not re.fullmatch(r"\d{8}", upload_date_text): + return None + try: + parsed = datetime.datetime.strptime(upload_date_text, "%Y%m%d") + except ValueError: + return None + parsed = parsed.replace(tzinfo=datetime.timezone.utc) + return int(parsed.timestamp()) + + +def _parse_bilibili_timestamp(timestamp: int | None) -> int | None: + if isinstance(timestamp, bool) or not isinstance(timestamp, int) or timestamp <= 0: + return None + return timestamp diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py index 274d049..27f7987 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/__init__.py @@ -32,6 +32,7 @@ def from_dict(obj: Any) -> "Bluesky": media_files=bluesky_item.media_files, category=bluesky_item.category, message_type=bluesky_item.message_type, + timestamp=bluesky_item.timestamp, cid=bluesky_item.cid, author_did=bluesky_item.author_did, ) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py index 8c3d163..0799fd5 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py @@ -1,3 +1,4 @@ +import datetime from typing import Optional from urllib.parse import urlparse @@ -119,6 +120,7 @@ async def _resolve_single_post_data(post_data: PostView) -> dict: "category": "bluesky", "media_files": [], "created_at": created_at, + "timestamp": _parse_bluesky_created_at(created_at), "author_did": author_did, } @@ -190,3 +192,15 @@ async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost: except Exception as e: logger.error(f"Error while getting post data: {e}") raise + + +def _parse_bluesky_created_at(created_at: str | None) -> int | None: + if not created_at: + return None + try: + parsed = datetime.datetime.fromisoformat(created_at.replace("Z", "+00:00")) + except (TypeError, ValueError): + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=datetime.timezone.utc) + return int(parsed.timestamp()) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/common.py b/packages/shared/fastfetchbot_shared/services/scrapers/common.py index f1d3570..9c133a1 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/common.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/common.py @@ -90,6 +90,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict: if cached is not None: logger.info("Cache hit, returning cached metadata") result = cached.model_dump(mode="json", exclude={"id"}) + result["timestamp"] = result.pop("published_timestamp", None) result["_cached"] = True return result except Exception as e: diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py index ee9a083..6b759c3 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/douban/__init__.py @@ -6,7 +6,10 @@ from bs4 import BeautifulSoup from lxml import etree -from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.parse import ( + get_html_text_length, + wrap_text_into_html, +) from fastfetchbot_shared.utils.network import get_selector, HEADERS from fastfetchbot_shared.utils.logger import logger from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType @@ -57,6 +60,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): self.text_group: Optional[str] = None self.raw_content: Optional[str] = None self.date: Optional[str] = None + self.timestamp: Optional[int] = None # reqeust fields self.headers = HEADERS self.headers["Cookie"] = kwargs.get("cookie", "") diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py index f256512..46090cc 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/__init__.py @@ -27,6 +27,7 @@ def from_dict(obj: Any) -> "GeneralItem": media_files=metadata_item.media_files, category=metadata_item.category, message_type=metadata_item.message_type, + timestamp=metadata_item.timestamp, id=obj.get("id", ""), raw_content=obj.get("raw_content", ""), scraper_type=obj.get("scraper_type", ""), diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py index 62817d1..d8eb84b 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/base.py @@ -11,7 +11,10 @@ from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor from fastfetchbot_shared.services.scrapers.general import GeneralItem -from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.parse import ( + get_html_text_length, + wrap_text_into_html, +) from fastfetchbot_shared.utils.logger import logger GENERAL_TEXT_LIMIT = 800 @@ -67,6 +70,7 @@ async def _build_item_data( markdown_content: str, html_content: str, og_image: Optional[str] = None, + timestamp: Optional[int] = None, ) -> None: """ Common method to build item data from scraped content. @@ -79,6 +83,13 @@ async def _build_item_data( "author": author or self.url_parser.netloc, "author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}", "scraper_type": self.scraper_type, + "timestamp": ( + timestamp + if not isinstance(timestamp, bool) + and isinstance(timestamp, int) + and timestamp > 0 + else None + ), } # Process text content - use description or first part of markdown diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py index c018daf..62e1fd9 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/firecrawl.py @@ -10,7 +10,10 @@ from fastfetchbot_shared.services.scrapers.scraper import DataProcessor from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType from fastfetchbot_shared.utils.logger import logger -from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.parse import ( + get_html_text_length, + wrap_text_into_html, +) # HTML tags to exclude from Firecrawl output at the source FIRECRAWL_EXCLUDE_TAGS = [ diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py index d011d6f..0186373 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py @@ -1,3 +1,5 @@ +import datetime + from zyte_api import AsyncZyteAPI from fastfetchbot_shared.services.scrapers.config import settings @@ -59,6 +61,11 @@ async def _process_zyte_result(self, result: dict) -> None: # Extract main image main_image = article.get("mainImage", {}) og_image = main_image.get("url") if main_image else None + timestamp = _parse_zyte_date_published( + article.get("datePublished") + or article.get("date_published") + or article.get("publishedDate") + ) await self._build_item_data( title=title, @@ -67,6 +74,7 @@ async def _process_zyte_result(self, result: dict) -> None: markdown_content=markdown_content, html_content=html_content, og_image=og_image, + timestamp=timestamp, ) @@ -77,3 +85,15 @@ class ZyteScraper(BaseGeneralScraper): async def get_processor_by_url(self, url: str) -> DataProcessor: return ZyteDataProcessor(url) + + +def _parse_zyte_date_published(value: str | None) -> int | None: + if not value: + return None + try: + parsed = datetime.datetime.fromisoformat(str(value).strip().replace("Z", "+00:00")) + except ValueError: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=datetime.timezone.utc) + return int(parsed.timestamp()) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py index ae99e0c..52af9da 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py @@ -7,7 +7,9 @@ from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile from fastfetchbot_shared.utils.network import get_response -from fastfetchbot_shared.utils.parse import get_html_text_length +from fastfetchbot_shared.utils.parse import ( + get_html_text_length, +) from fastfetchbot_shared.utils.logger import logger from .config import API_HEADERS_LIST, ALL_SCRAPERS from fastfetchbot_shared.services.scrapers.config import settings @@ -23,6 +25,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): "/", "" ) self.message_type = MessageType.SHORT + self.timestamp = None async def get_item(self): await self.get_instagram() @@ -115,6 +118,9 @@ def _get_ins_post_looter2(ins_data: dict) -> dict: ) ins_info["content"] = "" ins_info["text"] = ins_text_data + ins_info["timestamp"] = _parse_instagram_timestamp( + ins_data.get("taken_at_timestamp") or ins_data.get("taken_at") + ) ins_info["author"] = ins_data["owner"]["username"] if ins_data["owner"]["full_name"]: ins_info["author"] += "(" + ins_data["owner"]["full_name"] + ")" @@ -190,6 +196,10 @@ def _get_ins_post_ins28_scraper2(ins_data): ) ins_info["content"] = "" ins_info["text"] = ins_text_data + ins_info["timestamp"] = _parse_instagram_timestamp( + ins_data["items"][0].get("taken_at") + or ins_data["items"][0].get("taken_at_timestamp") + ) ins_info["author"] = ins_data["items"][0]["user"]["username"] if ins_data["items"][0]["user"]["full_name"]: ins_info["author"] += "(" + ins_data["items"][0]["user"]["full_name"] + ")" @@ -269,3 +279,17 @@ def _get_ins_post_ins28_scraper2(ins_data): async def _get_story_info(self): pass + + +def _parse_instagram_timestamp(value: Any) -> int | None: + if value in (None, ""): + return None + try: + timestamp = int(value) + except (TypeError, ValueError): + return None + if timestamp <= 0: + return None + if timestamp > 10_000_000_000: + timestamp //= 1000 + return timestamp diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py index 9a907cb..98c804f 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/reddit/__init__.py @@ -19,6 +19,7 @@ def __init__(self, url, data: Optional[Any] = None, **kwargs): self.category = "reddit" self.media_files = [] self.message_type = MessageType.LONG + self.timestamp = None async def get_item(self) -> dict: await self.get_reddit() @@ -47,7 +48,8 @@ async def _process_reddit_data(self, reddit_data) -> None: self.author = reddit_data["author"].name self.author_url = f"https://www.reddit.com/user/{self.author}" self.raw_content = reddit_data["selftext_html"] or "" - self.created = unix_timestamp_to_utc(int(reddit_data["created_utc"])) + self.timestamp = int(reddit_data["created_utc"]) + self.created = unix_timestamp_to_utc(self.timestamp) self.score = reddit_data["score"] self.comments_count = reddit_data["num_comments"] self.upvote_ratio = reddit_data["upvote_ratio"] diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py index 69be60a..22b2ca7 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/threads/__init__.py @@ -30,6 +30,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): self.code = urlparse(url).path.split("/")[2] self.pics_url = [] self.videos_url = [] + self.timestamp = None async def get_item(self) -> dict: await self.get_threads() @@ -115,6 +116,7 @@ def process_single_threads(self, thread: Dict) -> None: self.title = thread["username"] + "'s Threads" self.author = thread["username"] self.author_url = f"https://threads.net/@{thread['username']}" + self.timestamp = thread["published_on"] created_at = unix_timestamp_to_utc(thread["published_on"]) reply_count = thread["reply_count"] like_count = thread["like_count"] diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py index 4c5b280..b79af00 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/twitter/__init__.py @@ -1,5 +1,7 @@ # TODO: https://rapidapi.com/Glavier/api/twitter135 import asyncio +import datetime +from email.utils import parsedate_to_datetime from urllib.parse import urlparse from typing import Dict, List, Optional, Any, Tuple @@ -7,7 +9,10 @@ import jmespath from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType -from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.parse import ( + get_html_text_length, + wrap_text_into_html, +) from fastfetchbot_shared.exceptions import ScraperError, ScraperParseError from twitter.scraper import Scraper from .config import ( @@ -40,6 +45,7 @@ def __init__( self.media_files: list[MediaFile] = [] self.category = "twitter" self.message_type = MessageType.SHORT + self.timestamp = None # auxiliary fields self.tid = urlparse(url).path.split("/")[-1] self.text_group = "" @@ -168,6 +174,7 @@ def process_single_tweet_Twitter135(self, tweet: Dict, retweeted=False) -> None: self.author = tweet["name"] self.author_url = f"https://twitter.com/{tweet['username']}" self.date = tweet["date"] + self.timestamp = _parse_twitter_created_at(tweet["date"]) tweet_info = self.parse_single_tweet_Twitter135(tweet, retweeted=retweeted) self.text_group += tweet_info["text_group"] self.content_group += tweet_info["content_group"] @@ -326,6 +333,18 @@ def _find_article_media_url(article: Dict, media_id: str) -> str: return "" +def _parse_twitter_created_at(created_at: str | None) -> int | None: + if not created_at: + return None + try: + parsed = parsedate_to_datetime(created_at) + except (TypeError, ValueError, IndexError): + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=datetime.timezone.utc) + return int(parsed.timestamp()) + + def _apply_inline_formatting( text: str, style_ranges: List[Dict], diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py index 2951684..3f81eb3 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/wechat/__init__.py @@ -19,6 +19,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): self.media_files: list[MediaFile] = [] self.category = "wechat" self.message_type = MessageType.LONG + self.timestamp = None # auxiliary fields self.sid = "" self.official_account = "" @@ -52,6 +53,8 @@ def _wechat_data_parse(wechat_data: etree.HTML) -> Dict: ), } for k, v in meta_data.items(): + if v is None: + continue new_string = v.replace("\n", "") meta_data[k] = new_string.strip() return meta_data diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py index c84494d..056816b 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/__init__.py @@ -44,6 +44,7 @@ def from_dict(obj: Any) -> "Weibo": media_files=weibo_item.media_files, category=weibo_item.category, message_type=weibo_item.message_type, + timestamp=weibo_item.timestamp, id=weibo_item.id, ) @@ -51,4 +52,3 @@ def to_dict(self) -> dict: result: dict = super().to_dict() result["id"] = self.id return result - diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py index acbf414..4913a8a 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/weibo/scraper.py @@ -1,4 +1,6 @@ import json +import datetime +from email.utils import parsedate_to_datetime from typing import Optional, Any, Union from urllib.parse import urlparse @@ -12,7 +14,10 @@ from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor from fastfetchbot_shared.services.scrapers.weibo import Weibo from fastfetchbot_shared.utils.network import get_response_json, get_random_user_agent -from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html +from fastfetchbot_shared.utils.parse import ( + get_html_text_length, + wrap_text_into_html, +) from .config import ( AJAX_HOST, AJAX_LONGTEXT_HOST, @@ -153,6 +158,7 @@ async def _process_weibo_item(self, weibo_info: dict) -> None: "author_url": weibo_info.get("author_url"), "title": weibo_info.get("author") + "的微博", "date": weibo_info.get("created", None), + "timestamp": _parse_weibo_created_at(weibo_info.get("created")), "source": weibo_info.get("source", None), "region_name": weibo_info.get("region_name", None), "attitudes_count": self._string_to_int(weibo_info.get("attitudes_count", 0)), @@ -523,3 +529,15 @@ class WeiboScraper(Scraper): async def get_processor_by_url(self, url) -> DataProcessor: return WeiboDataProcessor(url, cookies=self.weibo_cookies) + + +def _parse_weibo_created_at(created_at: str | None) -> int | None: + if not created_at: + return None + try: + parsed = parsedate_to_datetime(created_at) + except (TypeError, ValueError, IndexError): + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=datetime.timezone.utc) + return int(parsed.timestamp()) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py index b586215..72047e5 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/xiaohongshu/__init__.py @@ -30,6 +30,7 @@ def __init__(self, url: str, data: Any, **kwargs): self.like_count = None self.updated = None self.created = None + self.timestamp = None self.raw_content = None async def get_item(self) -> dict: @@ -59,11 +60,11 @@ async def _process_xiaohongshu_note(self, json_data: dict): self.raw_content = json_data.get("desc", "") raw_time = json_data.get("time", 0) raw_updated = json_data.get("last_update_time", 0) - self.created = ( - unix_timestamp_to_utc(int(raw_time) / 1000) if raw_time else None - ) + self.timestamp = _parse_xiaohongshu_timestamp(raw_time) + updated_timestamp = _parse_xiaohongshu_timestamp(raw_updated) + self.created = unix_timestamp_to_utc(self.timestamp) if self.timestamp else None self.updated = ( - unix_timestamp_to_utc(int(raw_updated) / 1000) if raw_updated else None + unix_timestamp_to_utc(updated_timestamp) if updated_timestamp else None ) self.like_count = json_data.get("liked_count") self.collected_count = json_data.get("collected_count") @@ -92,3 +93,15 @@ async def _process_xiaohongshu_note(self, json_data: dict): f'

' ) self.content = content_template.render(data=data) + + +def _parse_xiaohongshu_timestamp(value: Any) -> int | None: + if value in (None, ""): + return None + try: + timestamp = int(value) + except (TypeError, ValueError): + return None + if timestamp <= 0: + return None + return timestamp // 1000 if timestamp > 10_000_000_000 else timestamp diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py index 8f8923d..1d7f847 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/zhihu/__init__.py @@ -90,6 +90,12 @@ def replace_inner_quotes(match): return raw_str +def _parse_zhihu_timestamp(timestamp: Any) -> int | None: + if isinstance(timestamp, bool) or not isinstance(timestamp, int) or timestamp <= 0: + return None + return timestamp + + class Zhihu(MetadataItem): def __init__(self, url: str, data: Optional[Any] = None, **kwargs): # metadata fields @@ -112,6 +118,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): self.raw_content = "" self.date = "" self.updated = "" + self.timestamp = None self.retweet_html = "" self.upvote: int = 0 self.retweeted: bool = False @@ -374,6 +381,7 @@ async def _get_zhihu_status(self): self.raw_content = unmask_zhihu_links(self.raw_content) self.media_files.extend(data["media_files"]) self.date = unix_timestamp_to_utc(data["created"]) + self.timestamp = _parse_zhihu_timestamp(data["created"]) self.updated = unix_timestamp_to_utc(data["updated"]) self.upvote = data["like_count"] if data["origin_pin_id"]: @@ -464,6 +472,7 @@ def _process_picture(pictures, content_attr): ) self.raw_content = status_data["content"] self.date = unix_timestamp_to_utc(status_data["created"]) + self.timestamp = _parse_zhihu_timestamp(status_data["created"]) self.updated = unix_timestamp_to_utc(status_data["updated"]) self.upvote = status_data["like_count"] self.comment_count = status_data["comment_count"] @@ -549,6 +558,7 @@ async def _get_zhihu_article(self): self.upvote = json_data["voteup_count"] self.comment_count = json_data.get("comment_count", 0) self.date = unix_timestamp_to_utc(json_data.get("created", 0)) + self.timestamp = _parse_zhihu_timestamp(json_data.get("created", 0)) self.updated = unix_timestamp_to_utc(json_data.get("updated", 0)) if json_data.get("column"): self.column = json_data["column"].get("title", "") @@ -579,6 +589,7 @@ async def _get_zhihu_article(self): self.upvote = article_data["voteup_count"] self.comment_count = article_data["comment_count"] self.date = unix_timestamp_to_utc(article_data["created"]) + self.timestamp = _parse_zhihu_timestamp(article_data["created"]) self.updated = unix_timestamp_to_utc(article_data["updated"]) self.column = article_data["column"] self.column_url = article_data["column_url"] @@ -711,6 +722,7 @@ def _resolve_answer_json_data(self, answer_data: Dict) -> None: ) or "" self.raw_content = answer_data["content"] or "" self.date = unix_timestamp_to_utc(answer_data["created"] or "") or "" + self.timestamp = _parse_zhihu_timestamp(answer_data["created"] or "") self.updated = unix_timestamp_to_utc(answer_data["updated"] or "") or "" self.comment_count = answer_data["comment_count"] or 0 self.upvote = answer_data["voteup_count"] or 0 diff --git a/packages/shared/fastfetchbot_shared/utils/parse.py b/packages/shared/fastfetchbot_shared/utils/parse.py index 98a5d4a..08a02fd 100644 --- a/packages/shared/fastfetchbot_shared/utils/parse.py +++ b/packages/shared/fastfetchbot_shared/utils/parse.py @@ -11,6 +11,7 @@ from fastfetchbot_shared.utils.config import SOCIAL_MEDIA_WEBSITE_PATTERNS, VIDEO_WEBSITE_PATTERNS, BANNED_PATTERNS TELEGRAM_TEXT_LIMIT = 900 +BEIJING_TZ = datetime.timezone(datetime.timedelta(hours=8)) mimetypes.init() @@ -47,8 +48,9 @@ def format_telegram_short_text(soup: BeautifulSoup) -> BeautifulSoup: def unix_timestamp_to_utc(timestamp: int) -> str | None: if not timestamp: return None - utc_time = datetime.datetime.utcfromtimestamp(timestamp) - beijing_time = utc_time + datetime.timedelta(hours=8) + beijing_time = datetime.datetime.fromtimestamp( + timestamp, datetime.timezone.utc + ).astimezone(BEIJING_TZ) return beijing_time.strftime("%Y-%m-%d %H:%M") diff --git a/tests/unit/database/mongodb/test_cache.py b/tests/unit/database/mongodb/test_cache.py index 0bdb549..e40d3bc 100644 --- a/tests/unit/database/mongodb/test_cache.py +++ b/tests/unit/database/mongodb/test_cache.py @@ -132,9 +132,39 @@ async def test_first_save_uses_version_1(self): assert item["version"] == 1 MockMetadata.model_construct.assert_called_once() + assert ( + MockMetadata.model_construct.call_args.kwargs["published_timestamp"] + is None + ) + assert "timestamp" not in MockMetadata.model_construct.call_args.kwargs MockMetadata.insert.assert_awaited_once_with(mock_constructed) assert result is mock_constructed + @pytest.mark.asyncio + async def test_maps_metadata_timestamp_to_published_timestamp(self): + mock_find = _make_find_chain(None) + + with patch( + "fastfetchbot_shared.database.mongodb.cache.Metadata" + ) as MockMetadata: + MockMetadata.find.return_value = mock_find + MockMetadata.model_construct.return_value = MagicMock() + MockMetadata.insert = AsyncMock() + + from fastfetchbot_shared.database.mongodb.cache import save_metadata + + item = { + "url": "https://example.com", + "title": "Test", + "timestamp": 1704067200, + } + await save_metadata(item) + + construct_kwargs = MockMetadata.model_construct.call_args.kwargs + assert construct_kwargs["published_timestamp"] == 1704067200 + assert "timestamp" not in construct_kwargs + assert item["timestamp"] == 1704067200 + @pytest.mark.asyncio async def test_increments_version_from_existing(self): existing_doc = _make_mock_metadata(version=3) diff --git a/tests/unit/scrapers/test_common_cache.py b/tests/unit/scrapers/test_common_cache.py index f55b6e1..e3ff819 100644 --- a/tests/unit/scrapers/test_common_cache.py +++ b/tests/unit/scrapers/test_common_cache.py @@ -69,6 +69,9 @@ async def test_cache_hit_returns_cached_result(self, make_service): "title": "Cached Title", "url": "https://example.com/post/1", "media_files": [], + "content": "

Cached body

", + "timestamp": "2026-01-01T00:00:00", + "published_timestamp": 1704067200, } svc = make_service(store_database=True, database_cache_ttl=3600) @@ -82,6 +85,9 @@ async def test_cache_hit_returns_cached_result(self, make_service): assert result["_cached"] is True assert result["title"] == "Cached Title" + assert result["timestamp"] == 1704067200 + assert result["content"] == "

Cached body

" + assert "published_timestamp" not in result mock_cached_doc.model_dump.assert_called_once_with( mode="json", exclude={"id"} ) diff --git a/tests/unit/scrapers/test_wechat.py b/tests/unit/scrapers/test_wechat.py index 2c77c9d..e5d1e0a 100644 --- a/tests/unit/scrapers/test_wechat.py +++ b/tests/unit/scrapers/test_wechat.py @@ -43,6 +43,7 @@ def test_parses_article_data(self): html_str = """ +

Test Title\n

Test Author\n @@ -56,6 +57,7 @@ def test_parses_article_data(self): assert result["title"] == "Test Title" assert result["author"] == "Test Author" assert "Test content paragraph" in result["content"] + assert "timestamp" not in result def test_strips_newlines_and_whitespace(self): html_str = """ diff --git a/tests/unit/test_published_timestamp.py b/tests/unit/test_published_timestamp.py new file mode 100644 index 0000000..31c5d05 --- /dev/null +++ b/tests/unit/test_published_timestamp.py @@ -0,0 +1,43 @@ +from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType, MetadataItem + + +def test_metadata_item_to_dict_includes_timestamp_without_changing_content(): + item = MetadataItem( + url="https://example.com/post", + telegraph_url="", + content="

Body

", + text="Body", + media_files=[MediaFile(media_type="image", url="https://example.com/a.jpg")], + author="Author", + title="Title", + author_url="https://example.com/author", + category="example", + message_type=MessageType.SHORT, + timestamp=1704067200, + ) + + data = item.to_dict() + + assert data["timestamp"] == 1704067200 + assert data["content"] == "

Body

" + + +def test_metadata_item_to_dict_does_not_parse_datetime_strings(): + item = MetadataItem( + url="https://example.com/post", + telegraph_url="", + content="

Body

", + text="Body", + media_files=[], + author="Author", + title="Title", + author_url="https://example.com/author", + category="example", + message_type=MessageType.SHORT, + timestamp="2024-01-01T00:00:00", # type: ignore[arg-type] + ) + + data = item.to_dict() + + assert data["timestamp"] is None + assert data["content"] == "

Body

" From 99eaeddaf5d299db4aa5c9413ef98a5a041be009 Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 23 May 2026 22:38:46 -0500 Subject: [PATCH 2/3] Update zyte.py --- .../services/scrapers/general/zyte.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py index 0186373..d011d6f 100644 --- a/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py +++ b/packages/shared/fastfetchbot_shared/services/scrapers/general/zyte.py @@ -1,5 +1,3 @@ -import datetime - from zyte_api import AsyncZyteAPI from fastfetchbot_shared.services.scrapers.config import settings @@ -61,11 +59,6 @@ async def _process_zyte_result(self, result: dict) -> None: # Extract main image main_image = article.get("mainImage", {}) og_image = main_image.get("url") if main_image else None - timestamp = _parse_zyte_date_published( - article.get("datePublished") - or article.get("date_published") - or article.get("publishedDate") - ) await self._build_item_data( title=title, @@ -74,7 +67,6 @@ async def _process_zyte_result(self, result: dict) -> None: markdown_content=markdown_content, html_content=html_content, og_image=og_image, - timestamp=timestamp, ) @@ -85,15 +77,3 @@ class ZyteScraper(BaseGeneralScraper): async def get_processor_by_url(self, url: str) -> DataProcessor: return ZyteDataProcessor(url) - - -def _parse_zyte_date_published(value: str | None) -> int | None: - if not value: - return None - try: - parsed = datetime.datetime.fromisoformat(str(value).strip().replace("Z", "+00:00")) - except ValueError: - return None - if parsed.tzinfo is None: - parsed = parsed.replace(tzinfo=datetime.timezone.utc) - return int(parsed.timestamp()) From ff68610eca713a8e0235111063de30ada560f283 Mon Sep 17 00:00:00 2001 From: aturret Date: Sat, 23 May 2026 22:45:32 -0500 Subject: [PATCH 3/3] fix mongodb cache --- .../database/mongodb/cache.py | 9 ++++- tests/unit/database/mongodb/test_cache.py | 40 +++++++++++++------ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py index 20a01e9..83b838e 100644 --- a/packages/shared/fastfetchbot_shared/database/mongodb/cache.py +++ b/packages/shared/fastfetchbot_shared/database/mongodb/cache.py @@ -6,6 +6,8 @@ from datetime import datetime, timedelta from typing import Optional +from pydantic import ValidationError + from fastfetchbot_shared.database.mongodb.models.metadata import Metadata from fastfetchbot_shared.utils.logger import logger @@ -74,7 +76,12 @@ async def save_metadata(metadata_item: dict) -> Metadata: document_data = dict(metadata_item) document_data["published_timestamp"] = document_data.pop("timestamp", None) - doc = Metadata.model_construct(**document_data) + try: + doc = Metadata(**document_data) + except (ValidationError, ValueError) as e: + logger.error(f"Invalid metadata document for {url}: {e}") + raise ValueError("invalid metadata document") from e + await Metadata.insert(doc) logger.info(f"Saved metadata for {url} (version={new_version})") diff --git a/tests/unit/database/mongodb/test_cache.py b/tests/unit/database/mongodb/test_cache.py index e40d3bc..a7d277d 100644 --- a/tests/unit/database/mongodb/test_cache.py +++ b/tests/unit/database/mongodb/test_cache.py @@ -121,8 +121,7 @@ async def test_first_save_uses_version_1(self): "fastfetchbot_shared.database.mongodb.cache.Metadata" ) as MockMetadata: MockMetadata.find.return_value = mock_find - mock_constructed = MagicMock() - MockMetadata.model_construct.return_value = mock_constructed + mock_document = MockMetadata.return_value MockMetadata.insert = AsyncMock() from fastfetchbot_shared.database.mongodb.cache import save_metadata @@ -131,14 +130,14 @@ async def test_first_save_uses_version_1(self): result = await save_metadata(item) assert item["version"] == 1 - MockMetadata.model_construct.assert_called_once() + MockMetadata.assert_called_once() assert ( - MockMetadata.model_construct.call_args.kwargs["published_timestamp"] + MockMetadata.call_args.kwargs["published_timestamp"] is None ) - assert "timestamp" not in MockMetadata.model_construct.call_args.kwargs - MockMetadata.insert.assert_awaited_once_with(mock_constructed) - assert result is mock_constructed + assert "timestamp" not in MockMetadata.call_args.kwargs + MockMetadata.insert.assert_awaited_once_with(mock_document) + assert result is mock_document @pytest.mark.asyncio async def test_maps_metadata_timestamp_to_published_timestamp(self): @@ -148,7 +147,6 @@ async def test_maps_metadata_timestamp_to_published_timestamp(self): "fastfetchbot_shared.database.mongodb.cache.Metadata" ) as MockMetadata: MockMetadata.find.return_value = mock_find - MockMetadata.model_construct.return_value = MagicMock() MockMetadata.insert = AsyncMock() from fastfetchbot_shared.database.mongodb.cache import save_metadata @@ -160,7 +158,7 @@ async def test_maps_metadata_timestamp_to_published_timestamp(self): } await save_metadata(item) - construct_kwargs = MockMetadata.model_construct.call_args.kwargs + construct_kwargs = MockMetadata.call_args.kwargs assert construct_kwargs["published_timestamp"] == 1704067200 assert "timestamp" not in construct_kwargs assert item["timestamp"] == 1704067200 @@ -174,8 +172,6 @@ async def test_increments_version_from_existing(self): "fastfetchbot_shared.database.mongodb.cache.Metadata" ) as MockMetadata: MockMetadata.find.return_value = mock_find - mock_constructed = MagicMock() - MockMetadata.model_construct.return_value = mock_constructed MockMetadata.insert = AsyncMock() from fastfetchbot_shared.database.mongodb.cache import save_metadata @@ -193,7 +189,6 @@ async def test_uses_url_from_metadata_item(self): "fastfetchbot_shared.database.mongodb.cache.Metadata" ) as MockMetadata: MockMetadata.find.return_value = mock_find - MockMetadata.model_construct.return_value = MagicMock() MockMetadata.insert = AsyncMock() from fastfetchbot_shared.database.mongodb.cache import save_metadata @@ -204,6 +199,27 @@ async def test_uses_url_from_metadata_item(self): # Verify the find was called (to look up existing version) MockMetadata.find.assert_called() + @pytest.mark.asyncio + async def test_invalid_document_is_not_inserted(self): + mock_find = _make_find_chain(None) + + with patch( + "fastfetchbot_shared.database.mongodb.cache.Metadata" + ) as MockMetadata, patch( + "fastfetchbot_shared.database.mongodb.cache.logger" + ) as mock_logger: + MockMetadata.find.return_value = mock_find + MockMetadata.side_effect = ValueError("bad payload") + MockMetadata.insert = AsyncMock() + + from fastfetchbot_shared.database.mongodb.cache import save_metadata + + with pytest.raises(ValueError, match="invalid metadata document"): + await save_metadata({"url": "https://example.com", "title": "Test"}) + + mock_logger.error.assert_called_once() + MockMetadata.insert.assert_not_awaited() + @pytest.mark.asyncio async def test_missing_url_raises_value_error(self): from fastfetchbot_shared.database.mongodb.cache import save_metadata