From 7471c3d36dc9a123693a7c1ab76a96b9de8f974c Mon Sep 17 00:00:00 2001 From: LockeShor <75901583+LockeShor@users.noreply.github.com> Date: Mon, 2 Mar 2026 18:31:01 -0500 Subject: [PATCH] use correct screenshots --- README.md | 1 + watcher.py | 106 ++++++++++++++++++++++------------------------------- 2 files changed, 45 insertions(+), 62 deletions(-) diff --git a/README.md b/README.md index eb93fe0..b4d05f4 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ It also listens for Telegram commands from the configured chat: - `LOG_LEVEL` (default: `INFO`) - `MAX_SCREENSHOTS_PER_APP` (default: `3`) - `TELEGRAM_POLL_SECONDS` (default: `10`) +- `MEDIA_BASE_URL` (default: `https://media.sys.truenas.net`) ## Build diff --git a/watcher.py b/watcher.py index 5189b27..12930c1 100644 --- a/watcher.py +++ b/watcher.py @@ -28,6 +28,7 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() MAX_MESSAGE_LEN = 3900 MAX_SCREENSHOTS_PER_APP = int(os.getenv("MAX_SCREENSHOTS_PER_APP", "3")) TELEGRAM_POLL_SECONDS = int(os.getenv("TELEGRAM_POLL_SECONDS", "10")) +MEDIA_BASE_URL = os.getenv("MEDIA_BASE_URL", "https://media.sys.truenas.net") last_telegram_update_id: Optional[int] = None @@ -246,6 +247,47 @@ def truncate_text(value: str, limit: int) -> str: return f"{text[: max(0, limit - 1)].rstrip()}…" +def extract_app_id_from_url(app_url: str) -> str: + path_parts = [part for part in urlparse(app_url).path.strip("/").split("/") if part] + if not path_parts: + return "" + + if "catalog" in path_parts: + catalog_index = path_parts.index("catalog") + if catalog_index + 1 < len(path_parts): + return path_parts[catalog_index + 1] + + return path_parts[-1] + + +def build_storj_screenshot_urls(session: requests.Session, app_id: str) -> List[str]: + if not app_id: + return [] + + screenshot_urls: List[str] = [] + for index in range(1, MAX_SCREENSHOTS_PER_APP + 1): + screenshot_url = f"{MEDIA_BASE_URL.rstrip('/')}/apps/{app_id}/screenshots/screenshot{index}.png" + try: + response = session.get( + screenshot_url, + timeout=REQUEST_TIMEOUT_SECONDS, + headers={"User-Agent": USER_AGENT}, + ) + except requests.RequestException: + break + + if response.status_code != 200: + break + + content_type = str(response.headers.get("Content-Type", "")).lower() + if content_type and "image" not in content_type: + break + + screenshot_urls.append(screenshot_url) + + return screenshot_urls + + def fetch_new_app_page_details(session: requests.Session, app_url: str) -> Dict[str, object]: response = session.get( app_url, @@ -257,68 +299,8 @@ def fetch_new_app_page_details(session: requests.Session, app_url: str) -> Dict[ soup = BeautifulSoup(response.text, "html.parser") page_title = normalize_text(soup.title.get_text(" ", strip=True)) if soup.title else "" - - screenshot_candidates: List[Tuple[int, int, str]] = [] - seen_urls: set[str] = set() - - og_image = soup.find("meta", attrs={"property": "og:image"}) - if og_image and og_image.get("content"): - og_image_url = urljoin(app_url, str(og_image["content"]).strip()) - if og_image_url.startswith("http"): - screenshot_candidates.append((2, 0, og_image_url)) - - for index, tag in enumerate(soup.find_all("img", src=True), start=1): - raw_src = str(tag.get("src", "")).strip() - if not raw_src or raw_src.startswith("data:"): - continue - - image_url = urljoin(app_url, raw_src) - if not image_url.startswith("http") or image_url in seen_urls: - continue - seen_urls.add(image_url) - - width_value = str(tag.get("width", "")).strip() - height_value = str(tag.get("height", "")).strip() - if width_value.isdigit() and int(width_value) < 200: - continue - if height_value.isdigit() and int(height_value) < 120: - continue - - descriptor = " ".join( - [ - str(tag.get("alt", "")), - str(tag.get("title", "")), - " ".join(tag.get("class", [])), - str(tag.get("id", "")), - image_url, - ] - ).lower() - - if any(skip in descriptor for skip in ["logo", "favicon", "icon", "avatar", "badge"]): - continue - - score = 0 - if "screenshot" in descriptor or "screen-shot" in descriptor or "screen shot" in descriptor: - score += 4 - if "gallery" in descriptor or "carousel" in descriptor or "preview" in descriptor: - score += 2 - if re.search(r"\.(png|jpe?g|webp)(\?|$)", image_url, flags=re.IGNORECASE): - score += 1 - - if score > 0: - screenshot_candidates.append((score, index, image_url)) - - screenshot_candidates.sort(key=lambda item: (-item[0], item[1])) - - screenshot_urls: List[str] = [] - emitted: set[str] = set() - for _, _, image_url in screenshot_candidates: - if image_url in emitted: - continue - emitted.add(image_url) - screenshot_urls.append(image_url) - if len(screenshot_urls) >= MAX_SCREENSHOTS_PER_APP: - break + app_id = extract_app_id_from_url(app_url) + screenshot_urls = build_storj_screenshot_urls(session, app_id) return { "page_title": page_title,