-
Notifications
You must be signed in to change notification settings - Fork 4
feat: add timestamp for scraper content #84
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,4 @@ | ||
| import datetime | ||
| from typing import Optional | ||
| from urllib.parse import urlparse | ||
|
|
||
|
|
@@ -119,6 +120,7 @@ async def _resolve_single_post_data(post_data: PostView) -> dict: | |
| "category": "bluesky", | ||
| "media_files": [], | ||
| "created_at": created_at, | ||
| "timestamp": _parse_bluesky_created_at(created_at), | ||
| "author_did": author_did, | ||
| } | ||
|
|
||
|
|
@@ -190,3 +192,15 @@ async def _request_post_data(self, bluesky_post: BlueskyPost) -> ThreadViewPost: | |
| except Exception as e: | ||
| logger.error(f"Error while getting post data: {e}") | ||
| raise | ||
|
|
||
|
|
||
| def _parse_bluesky_created_at(created_at: str | None) -> int | None: | ||
| if not created_at: | ||
| return None | ||
| try: | ||
| parsed = datetime.datetime.fromisoformat(created_at.replace("Z", "+00:00")) | ||
| except (TypeError, ValueError): | ||
| return None | ||
| if parsed.tzinfo is None: | ||
| parsed = parsed.replace(tzinfo=datetime.timezone.utc) | ||
| return int(parsed.timestamp()) | ||
|
Comment on lines
+197
to
+206
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Avoid silent parse failures in timestamp helper. This helper swallows parse exceptions and returns `None` without logging. As per coding guidelines, "Never silently swallow exceptions—always either re-raise, log with […]". 🤖 Prompt for AI Agents |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -90,6 +90,7 @@ async def get_item(self, metadata_item: Optional[dict] = None) -> dict: | |
| if cached is not None: | ||
| logger.info("Cache hit, returning cached metadata") | ||
| result = cached.model_dump(mode="json", exclude={"id"}) | ||
| result["timestamp"] = result.pop("published_timestamp", None) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Keep legacy cache timestamps when `published_timestamp` is absent. Line 93 sets `timestamp` to `None` whenever `published_timestamp` is missing, which discards a `timestamp` value already present in the cached result. 💡 Proposed fix:
- result["timestamp"] = result.pop("published_timestamp", None)
+ result["timestamp"] = result.pop(
+ "published_timestamp", result.get("timestamp")
+ )
🤖 Prompt for AI Agents |
||
| result["_cached"] = True | ||
| return result | ||
| except Exception as e: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,7 +7,9 @@ | |
|
|
||
| from fastfetchbot_shared.models.metadata_item import MetadataItem, MessageType, MediaFile | ||
| from fastfetchbot_shared.utils.network import get_response | ||
| from fastfetchbot_shared.utils.parse import get_html_text_length | ||
| from fastfetchbot_shared.utils.parse import ( | ||
| get_html_text_length, | ||
| ) | ||
| from fastfetchbot_shared.utils.logger import logger | ||
| from .config import API_HEADERS_LIST, ALL_SCRAPERS | ||
| from fastfetchbot_shared.services.scrapers.config import settings | ||
|
|
@@ -23,6 +25,7 @@ def __init__(self, url: str, data: Optional[Any] = None, **kwargs): | |
| "/", "" | ||
| ) | ||
| self.message_type = MessageType.SHORT | ||
| self.timestamp = None | ||
|
|
||
| async def get_item(self): | ||
| await self.get_instagram() | ||
|
|
@@ -115,6 +118,9 @@ def _get_ins_post_looter2(ins_data: dict) -> dict: | |
| ) | ||
| ins_info["content"] = "" | ||
| ins_info["text"] = ins_text_data | ||
| ins_info["timestamp"] = _parse_instagram_timestamp( | ||
| ins_data.get("taken_at_timestamp") or ins_data.get("taken_at") | ||
| ) | ||
| ins_info["author"] = ins_data["owner"]["username"] | ||
| if ins_data["owner"]["full_name"]: | ||
| ins_info["author"] += "(" + ins_data["owner"]["full_name"] + ")" | ||
|
|
@@ -190,6 +196,10 @@ def _get_ins_post_ins28_scraper2(ins_data): | |
| ) | ||
| ins_info["content"] = "" | ||
| ins_info["text"] = ins_text_data | ||
| ins_info["timestamp"] = _parse_instagram_timestamp( | ||
| ins_data["items"][0].get("taken_at") | ||
| or ins_data["items"][0].get("taken_at_timestamp") | ||
| ) | ||
| ins_info["author"] = ins_data["items"][0]["user"]["username"] | ||
| if ins_data["items"][0]["user"]["full_name"]: | ||
| ins_info["author"] += "(" + ins_data["items"][0]["user"]["full_name"] + ")" | ||
|
|
@@ -269,3 +279,17 @@ def _get_ins_post_ins28_scraper2(ins_data): | |
|
|
||
| async def _get_story_info(self): | ||
| pass | ||
|
|
||
|
|
||
| def _parse_instagram_timestamp(value: Any) -> int | None: | ||
| if value in (None, ""): | ||
| return None | ||
| try: | ||
| timestamp = int(value) | ||
| except (TypeError, ValueError): | ||
| return None | ||
| if timestamp <= 0: | ||
| return None | ||
| if timestamp > 10_000_000_000: | ||
| timestamp //= 1000 | ||
| return timestamp | ||
|
Comment on lines
+284
to
+295
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Make invalid timestamp handling observable.
As per coding guidelines, "Never silently swallow exceptions—always either re-raise, log with […]". 🤖 Prompt for AI Agents |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Preserve `telegraph_url` during serialization. Line 122 hardcodes `telegraph_url` to an empty string, which drops existing telegraph links and causes data loss in downstream storage/response payloads. 💡 Proposed fix
🤖 Prompt for AI Agents