Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions apps/api/src/services/inoreader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,13 @@ def __init__(self, url: str = None, data: dict = None, **kwargs):
self.category = data.get("category", "")
self.raw_content = data.get("content", "")
self.content = self.raw_content
self.timestamp = _parse_inoreader_timestamp(data.get("timestamp"))
if kwargs.get("category"):
self.category = kwargs["category"]
self.media_files = []
self.message_type = MessageType.LONG
if not hasattr(self, "timestamp"):
self.timestamp = None

def _from_data(self, data: dict):
self.title = data.get("title", "")
Expand All @@ -43,6 +46,7 @@ def _from_data(self, data: dict):
self.category = data.get("category", "")
self.raw_content = data.get("content", "")
self.content = self.raw_content
self.timestamp = _parse_inoreader_timestamp(data.get("timestamp"))

async def get_item(self, api: bool = False) -> dict:
if api:
Expand Down Expand Up @@ -161,3 +165,9 @@ async def get_api_info(
headers=headers,
)
return resp


def _parse_inoreader_timestamp(timestamp: int | None) -> int | None:
if isinstance(timestamp, bool) or not isinstance(timestamp, int) or timestamp <= 0:
return None
return timestamp
11 changes: 10 additions & 1 deletion packages/shared/fastfetchbot_shared/database/mongodb/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from datetime import datetime, timedelta
from typing import Optional

from pydantic import ValidationError

from fastfetchbot_shared.database.mongodb.models.metadata import Metadata
from fastfetchbot_shared.utils.logger import logger

Expand Down Expand Up @@ -72,7 +74,14 @@ async def save_metadata(metadata_item: dict) -> Metadata:
new_version = (latest.version + 1) if latest else 1
metadata_item["version"] = new_version

doc = Metadata.model_construct(**metadata_item)
document_data = dict(metadata_item)
document_data["published_timestamp"] = document_data.pop("timestamp", None)
try:
doc = Metadata(**document_data)
except (ValidationError, ValueError) as e:
logger.error(f"Invalid metadata document for {url}: {e}")
raise ValueError("invalid metadata document") from e

await Metadata.insert(doc)

logger.info(f"Saved metadata for {url} (version={new_version})")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class Metadata(Document):
source: Optional[str] = None
media_files: Optional[list[DatabaseMediaFile]] = None
telegraph_url: Optional[str] = None
published_timestamp: Optional[int] = None
timestamp: datetime = Field(default_factory=datetime.utcnow)
scrape_status: bool = False
version: int = Field(default=1, ge=1)
Expand Down
34 changes: 25 additions & 9 deletions packages/shared/fastfetchbot_shared/models/metadata_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ def from_str(x: Any) -> str:
return x


def from_optional_int(x: Any) -> Optional[int]:
    """Pass ``x`` through if it is a positive, non-boolean int; else ``None``."""
    # Reject booleans first: isinstance(True, int) is True in Python.
    if isinstance(x, bool):
        return None
    if not isinstance(x, int):
        return None
    return x if x > 0 else None


def from_list(f: Callable[[Any], T], x: Any) -> List[T]:
    """Apply ``f`` to each element of ``x`` and collect the results in a list.

    ``x`` is asserted to be a ``list``; any other type fails the assertion.
    """
    assert isinstance(x, list)
    return list(map(f, x))
Expand Down Expand Up @@ -75,6 +81,7 @@ class MetadataItem:
author_url: Optional[str]
category: str
message_type: Optional[MessageType]
timestamp: Optional[int] = None

@staticmethod
def from_dict(obj: Any) -> "MetadataItem":
Expand All @@ -89,6 +96,7 @@ def from_dict(obj: Any) -> "MetadataItem":
author_url = from_str(obj.get("author_url"))
category = from_str(obj.get("category"))
message_type = MessageType(obj.get("message_type"))
timestamp = from_optional_int(obj.get("timestamp"))
return MetadataItem(
url,
telegraph_url,
Expand All @@ -100,21 +108,29 @@ def from_dict(obj: Any) -> "MetadataItem":
author_url,
category,
message_type,
timestamp,
)

def to_dict(self) -> dict:
timestamp = from_optional_int(getattr(self, "timestamp", None))
message_type = getattr(self, "message_type", None)
message_type_value = (
message_type.value if isinstance(message_type, MessageType) else message_type
)
result: dict = {
"url": from_str(self.url),
"telegraph_url": "", "content": from_str(self.content),
"text": from_str(self.text),
"url": from_str(getattr(self, "url", "")),
"telegraph_url": "",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Preserve telegraph_url during serialization.

Line 122 hardcodes telegraph_url to an empty string, which drops existing telegraph links and causes data loss in downstream storage/response payloads.

💡 Proposed fix
-            "telegraph_url": "",
+            "telegraph_url": from_str(getattr(self, "telegraph_url", "")),
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/shared/fastfetchbot_shared/models/metadata_item.py` at line 122, The
serializer is currently hardcoding "telegraph_url": "" which drops stored
telegraph links; update the serialization in the MetadataItem (likely the
to_dict/serialize method in metadata_item.py) to include the actual attribute
(e.g. self.telegraph_url or getattr(self, "telegraph_url", "")) instead of the
empty string so existing telegraph links are preserved in output payloads and
downstream storage/response.

"content": from_str(getattr(self, "content", "")),
"text": from_str(getattr(self, "text", "")),
"media_files": from_list(
lambda x: to_class(MediaFile, x), self.media_files
lambda x: to_class(MediaFile, x), getattr(self, "media_files", [])
),
"author": from_str(self.author),
"title": from_str(self.title),
"author_url": from_str(self.author_url),
"category": from_str(self.category),
"message_type": self.message_type.value
"author": from_str(getattr(self, "author", "")),
"title": from_str(getattr(self, "title", "")),
"author_url": from_str(getattr(self, "author_url", "")),
"category": from_str(getattr(self, "category", "")),
"message_type": message_type_value,
"timestamp": timestamp,
}
return result

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@
"""

import asyncio
import datetime
import re
from urllib.parse import urlparse, parse_qs

import httpx

from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile
from fastfetchbot_shared.utils.parse import unix_timestamp_to_utc, second_to_time, wrap_text_into_html
from fastfetchbot_shared.utils.parse import (
second_to_time,
unix_timestamp_to_utc,
wrap_text_into_html,
)
from fastfetchbot_shared.utils.logger import logger
from fastfetchbot_shared.services.scrapers.config import JINJA2_ENV

Expand Down Expand Up @@ -58,6 +64,7 @@ def __init__(
self.category = category
self.media_files = []
self.created = None
self.timestamp = None
self.duration = None
self.celery_app = celery_app
self.timeout = timeout
Expand Down Expand Up @@ -219,6 +226,7 @@ def _video_info_formatting(self, meta_info: dict):
if len(meta_info["description"]) > 800:
meta_info["description"] = meta_info["description"][:800] + "..."
self.created = meta_info["upload_date"]
self.timestamp = meta_info.get("timestamp")
self.duration = meta_info["duration"]
self.text = video_info_template.render(
data={
Expand Down Expand Up @@ -250,6 +258,7 @@ def _youtube_info_parse(video_info: dict) -> dict:
"playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u8bc4\u8bba\u6570\uff1a{video_info['comment_count']}",
"author_avatar": video_info["thumbnail"],
"upload_date": str(video_info["upload_date"]),
"timestamp": _parse_youtube_upload_date(video_info["upload_date"]),
"duration": second_to_time(round(video_info["duration"])),
}

Expand All @@ -266,5 +275,26 @@ def _bilibili_info_parse(video_info: dict) -> dict:
"description": video_info["description"],
"playback_data": f"\u89c6\u9891\u64ad\u653e\u91cf\uff1a{video_info['view_count']} \u5f39\u5e55\u6570\uff1a{video_info['comment_count']} \u70b9\u8d5e\u6570\uff1a{video_info['like_count']}",
"upload_date": unix_timestamp_to_utc(video_info["timestamp"]),
"timestamp": _parse_bilibili_timestamp(video_info["timestamp"]),
"duration": second_to_time(round(video_info["duration"])),
}


def _parse_youtube_upload_date(upload_date: str | int | None) -> int | None:
if upload_date is None:
return None
upload_date_text = str(upload_date).strip()
if not re.fullmatch(r"\d{8}", upload_date_text):
return None
try:
parsed = datetime.datetime.strptime(upload_date_text, "%Y%m%d")
except ValueError:
return None
parsed = parsed.replace(tzinfo=datetime.timezone.utc)
return int(parsed.timestamp())


def _parse_bilibili_timestamp(timestamp: int | None) -> int | None:
if isinstance(timestamp, bool) or not isinstance(timestamp, int) or timestamp <= 0:
return None
return timestamp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def from_dict(obj: Any) -> "Bluesky":
media_files=bluesky_item.media_files,
category=bluesky_item.category,
message_type=bluesky_item.message_type,
timestamp=bluesky_item.timestamp,
cid=bluesky_item.cid,
author_did=bluesky_item.author_did,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import datetime
from typing import Optional
from urllib.parse import urlparse

Expand Down Expand Up @@ -119,6 +120,7 @@ async def _resolve_single_post_data(post_data: PostView) -> dict:
"category": "bluesky",
"media_files": [],
"created_at": created_at,
"timestamp": _parse_bluesky_created_at(created_at),
"author_did": author_did,
}

Expand Down Expand Up @@ -190,3 +192,15 @@ async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost:
except Exception as e:
logger.error(f"Error while getting post data: {e}")
raise


def _parse_bluesky_created_at(created_at: str | None) -> int | None:
if not created_at:
return None
try:
parsed = datetime.datetime.fromisoformat(created_at.replace("Z", "+00:00"))
except (TypeError, ValueError):
return None
if parsed.tzinfo is None:
parsed = parsed.replace(tzinfo=datetime.timezone.utc)
return int(parsed.timestamp())
Comment on lines +197 to +206
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Avoid silent parse failures in timestamp helper.

This helper swallows parse exceptions and returns None without logging or documented handling. Add at least debug/exception logging (or explicitly document this fallback behavior).

As per coding guidelines, "Never silently swallow exceptions—always either re-raise, log with logger.exception(), or explicitly handle and document the behavior. Do not return None or empty data on failure without logging".

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/shared/fastfetchbot_shared/services/scrapers/bluesky/scraper.py`
around lines 197 - 206, The helper _parse_bluesky_created_at currently swallows
parse errors and returns None without any trace; update it to log the failure
before returning so exceptions aren't silent: catch the (TypeError, ValueError)
in _parse_bluesky_created_at and call logger.exception (or logger.debug with the
exception) including the input created_at and context (e.g., "failed to parse
created_at") before returning None, ensuring the module has an
imported/initialized logger variable referenced by name in the change.

Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict:
if cached is not None:
logger.info("Cache hit, returning cached metadata")
result = cached.model_dump(mode="json", exclude={"id"})
result["timestamp"] = result.pop("published_timestamp", None)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Keep legacy cache timestamps when published_timestamp is missing.

Line 93 sets timestamp to None for older cached records that only contain timestamp, which drops usable data.

💡 Proposed fix
-                    result["timestamp"] = result.pop("published_timestamp", None)
+                    result["timestamp"] = result.pop(
+                        "published_timestamp", result.get("timestamp")
+                    )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/shared/fastfetchbot_shared/services/scrapers/common.py` at line 93,
The current line sets result["timestamp"] using
result.pop("published_timestamp", None), which overwrites legacy records'
existing "timestamp" with None; change the logic in the block that manipulates
the result dict so that if "published_timestamp" exists you move it into
"timestamp" (remove the old key) but if it does not exist you leave any existing
result["timestamp"] intact (or leave it absent) — i.e., prefer
result["published_timestamp"] when present, otherwise keep result["timestamp"];
adjust the code around the result dict (the use of
result.pop("published_timestamp", None) and assignment to result["timestamp"])
accordingly.

result["_cached"] = True
return result
except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
from bs4 import BeautifulSoup
from lxml import etree

from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
from fastfetchbot_shared.utils.parse import (
get_html_text_length,
wrap_text_into_html,
)
from fastfetchbot_shared.utils.network import get_selector, HEADERS
from fastfetchbot_shared.utils.logger import logger
from fastfetchbot_shared.models.metadata_item import MetadataItem, MediaFile, MessageType
Expand Down Expand Up @@ -57,6 +60,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
self.text_group: Optional[str] = None
self.raw_content: Optional[str] = None
self.date: Optional[str] = None
self.timestamp: Optional[int] = None
# request fields
self.headers = HEADERS
self.headers["Cookie"] = kwargs.get("cookie", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def from_dict(obj: Any) -> "GeneralItem":
media_files=metadata_item.media_files,
category=metadata_item.category,
message_type=metadata_item.message_type,
timestamp=metadata_item.timestamp,
id=obj.get("id", ""),
raw_content=obj.get("raw_content", ""),
scraper_type=obj.get("scraper_type", ""),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType
from fastfetchbot_shared.services.scrapers.scraper import Scraper, DataProcessor
from fastfetchbot_shared.services.scrapers.general import GeneralItem
from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
from fastfetchbot_shared.utils.parse import (
get_html_text_length,
wrap_text_into_html,
)
from fastfetchbot_shared.utils.logger import logger

GENERAL_TEXT_LIMIT = 800
Expand Down Expand Up @@ -67,6 +70,7 @@ async def _build_item_data(
markdown_content: str,
html_content: str,
og_image: Optional[str] = None,
timestamp: Optional[int] = None,
) -> None:
"""
Common method to build item data from scraped content.
Expand All @@ -79,6 +83,13 @@ async def _build_item_data(
"author": author or self.url_parser.netloc,
"author_url": f"{self.url_parser.scheme}://{self.url_parser.netloc}",
"scraper_type": self.scraper_type,
"timestamp": (
timestamp
if not isinstance(timestamp, bool)
and isinstance(timestamp, int)
and timestamp > 0
else None
),
}

# Process text content - use description or first part of markdown
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
from fastfetchbot_shared.services.scrapers.scraper import DataProcessor
from fastfetchbot_shared.models.metadata_item import MediaFile, MessageType
from fastfetchbot_shared.utils.logger import logger
from fastfetchbot_shared.utils.parse import get_html_text_length, wrap_text_into_html
from fastfetchbot_shared.utils.parse import (
get_html_text_length,
wrap_text_into_html,
)

# HTML tags to exclude from Firecrawl output at the source
FIRECRAWL_EXCLUDE_TAGS = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@

from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile
from fastfetchbot_shared.utils.network import get_response
from fastfetchbot_shared.utils.parse import get_html_text_length
from fastfetchbot_shared.utils.parse import (
get_html_text_length,
)
from fastfetchbot_shared.utils.logger import logger
from .config import API_HEADERS_LIST, ALL_SCRAPERS
from fastfetchbot_shared.services.scrapers.config import settings
Expand All @@ -23,6 +25,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs):
"/", ""
)
self.message_type = MessageType.SHORT
self.timestamp = None

async def get_item(self):
await self.get_instagram()
Expand Down Expand Up @@ -115,6 +118,9 @@ def _get_ins_post_looter2(ins_data: dict) -> dict:
)
ins_info["content"] = ""
ins_info["text"] = ins_text_data
ins_info["timestamp"] = _parse_instagram_timestamp(
ins_data.get("taken_at_timestamp") or ins_data.get("taken_at")
)
ins_info["author"] = ins_data["owner"]["username"]
if ins_data["owner"]["full_name"]:
ins_info["author"] += "(" + ins_data["owner"]["full_name"] + ")"
Expand Down Expand Up @@ -190,6 +196,10 @@ def _get_ins_post_ins28_scraper2(ins_data):
)
ins_info["content"] = ""
ins_info["text"] = ins_text_data
ins_info["timestamp"] = _parse_instagram_timestamp(
ins_data["items"][0].get("taken_at")
or ins_data["items"][0].get("taken_at_timestamp")
)
ins_info["author"] = ins_data["items"][0]["user"]["username"]
if ins_data["items"][0]["user"]["full_name"]:
ins_info["author"] += "(" + ins_data["items"][0]["user"]["full_name"] + ")"
Expand Down Expand Up @@ -269,3 +279,17 @@ def _get_ins_post_ins28_scraper2(ins_data):

async def _get_story_info(self):
pass


def _parse_instagram_timestamp(value: Any) -> int | None:
if value in (None, ""):
return None
try:
timestamp = int(value)
except (TypeError, ValueError):
return None
if timestamp <= 0:
return None
if timestamp > 10_000_000_000:
timestamp //= 1000
return timestamp
Comment on lines +284 to +295
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Make invalid timestamp handling observable.

_parse_instagram_timestamp returns None on parse failures without logging or explicit documented behavior. Please add logging for failed conversions (or document silent fallback intent).

As per coding guidelines, "Never silently swallow exceptions—always either re-raise, log with logger.exception(), or explicitly handle and document the behavior. Do not return None or empty data on failure without logging".

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/shared/fastfetchbot_shared/services/scrapers/instagram/__init__.py`
around lines 284 - 295, The function _parse_instagram_timestamp currently
swallows parse errors and invalid values; update it to use a module-level logger
(e.g., logger = logging.getLogger(__name__)) and log failures instead of
silently returning None: call logger.exception or logger.warning inside the
except (TypeError, ValueError) block including the raw value, and also emit a
logger.warning when timestamp <= 0 (include the parsed timestamp and original
value); keep returning None for invalid cases but ensure all failure paths are
logged with clear context identifying _parse_instagram_timestamp and the
offending value.

Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(self, url, data: Optional[Any] = None, **kwargs):
self.category = "reddit"
self.media_files = []
self.message_type = MessageType.LONG
self.timestamp = None

async def get_item(self) -> dict:
await self.get_reddit()
Expand Down Expand Up @@ -47,7 +48,8 @@ async def _process_reddit_data(self, reddit_data) -> None:
self.author = reddit_data["author"].name
self.author_url = f"https://www.reddit.com/user/{self.author}"
self.raw_content = reddit_data["selftext_html"] or ""
self.created = unix_timestamp_to_utc(int(reddit_data["created_utc"]))
self.timestamp = int(reddit_data["created_utc"])
self.created = unix_timestamp_to_utc(self.timestamp)
self.score = reddit_data["score"]
self.comments_count = reddit_data["num_comments"]
self.upvote_ratio = reddit_data["upvote_ratio"]
Expand Down
Loading
Loading