This commit is contained in:
@@ -14,7 +14,8 @@ The watcher pulls `https://apps.truenas.com/catalog` over plain HTTP (no browser
|
||||
|
||||
On changes, it sends Telegram messages with:
|
||||
|
||||
- one detailed message per newly added app (name, URL, train, added date, catalog summary, plus extra details parsed from the app page when available)
|
||||
- one detailed message per newly added app (name, URL, train, added date, catalog summary, and page title)
|
||||
- screenshot images from the app page, posted as Telegram photos (up to the configured per-app limit)
|
||||
- removed apps (`-`)
|
||||
- updated apps (`~`) and field-level diffs
|
||||
|
||||
@@ -27,6 +28,7 @@ On changes, it sends Telegram messages with:
|
||||
- `CATALOG_URL` (default: `https://apps.truenas.com/catalog`)
|
||||
- `REQUEST_TIMEOUT_SECONDS` (default: `30`)
|
||||
- `LOG_LEVEL` (default: `INFO`)
|
||||
- `MAX_SCREENSHOTS_PER_APP` (default: `3`)
|
||||
|
||||
## Build
|
||||
|
||||
|
||||
178
watcher.py
178
watcher.py
@@ -25,6 +25,7 @@ USER_AGENT = os.getenv(
|
||||
)
|
||||
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
|
||||
MAX_MESSAGE_LEN = 3900
|
||||
MAX_SCREENSHOTS_PER_APP = int(os.getenv("MAX_SCREENSHOTS_PER_APP", "3"))
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -225,6 +226,9 @@ def truncate_text(value: str, limit: int) -> str:
|
||||
if len(text) <= limit:
|
||||
return text
|
||||
return f"{text[: max(0, limit - 1)].rstrip()}…"
|
||||
|
||||
|
||||
def fetch_new_app_page_details(session: requests.Session, app_url: str) -> Dict[str, object]:
|
||||
response = session.get(
|
||||
app_url,
|
||||
timeout=REQUEST_TIMEOUT_SECONDS,
|
||||
@@ -235,77 +239,76 @@ def truncate_text(value: str, limit: int) -> str:
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
page_title = normalize_text(soup.title.get_text(" ", strip=True)) if soup.title else ""
|
||||
description = ""
|
||||
for attrs in (
|
||||
{"property": "og:description"},
|
||||
{"name": "description"},
|
||||
{"name": "twitter:description"},
|
||||
):
|
||||
tag = soup.find("meta", attrs=attrs)
|
||||
if tag and tag.get("content"):
|
||||
description = normalize_text(str(tag["content"]))
|
||||
if description:
|
||||
|
||||
screenshot_candidates: List[Tuple[int, int, str]] = []
|
||||
seen_urls: set[str] = set()
|
||||
|
||||
og_image = soup.find("meta", attrs={"property": "og:image"})
|
||||
if og_image and og_image.get("content"):
|
||||
og_image_url = urljoin(app_url, str(og_image["content"]).strip())
|
||||
if og_image_url.startswith("http"):
|
||||
screenshot_candidates.append((2, 0, og_image_url))
|
||||
|
||||
for index, tag in enumerate(soup.find_all("img", src=True), start=1):
|
||||
raw_src = str(tag.get("src", "")).strip()
|
||||
if not raw_src or raw_src.startswith("data:"):
|
||||
continue
|
||||
|
||||
image_url = urljoin(app_url, raw_src)
|
||||
if not image_url.startswith("http") or image_url in seen_urls:
|
||||
continue
|
||||
seen_urls.add(image_url)
|
||||
|
||||
width_value = str(tag.get("width", "")).strip()
|
||||
height_value = str(tag.get("height", "")).strip()
|
||||
if width_value.isdigit() and int(width_value) < 200:
|
||||
continue
|
||||
if height_value.isdigit() and int(height_value) < 120:
|
||||
continue
|
||||
|
||||
descriptor = " ".join(
|
||||
[
|
||||
str(tag.get("alt", "")),
|
||||
str(tag.get("title", "")),
|
||||
" ".join(tag.get("class", [])),
|
||||
str(tag.get("id", "")),
|
||||
image_url,
|
||||
]
|
||||
).lower()
|
||||
|
||||
if any(skip in descriptor for skip in ["logo", "favicon", "icon", "avatar", "badge"]):
|
||||
continue
|
||||
|
||||
score = 0
|
||||
if "screenshot" in descriptor or "screen-shot" in descriptor or "screen shot" in descriptor:
|
||||
score += 4
|
||||
if "gallery" in descriptor or "carousel" in descriptor or "preview" in descriptor:
|
||||
score += 2
|
||||
if re.search(r"\.(png|jpe?g|webp)(\?|$)", image_url, flags=re.IGNORECASE):
|
||||
score += 1
|
||||
|
||||
if score > 0:
|
||||
screenshot_candidates.append((score, index, image_url))
|
||||
|
||||
screenshot_candidates.sort(key=lambda item: (-item[0], item[1]))
|
||||
|
||||
screenshot_urls: List[str] = []
|
||||
emitted: set[str] = set()
|
||||
for _, _, image_url in screenshot_candidates:
|
||||
if image_url in emitted:
|
||||
continue
|
||||
emitted.add(image_url)
|
||||
screenshot_urls.append(image_url)
|
||||
if len(screenshot_urls) >= MAX_SCREENSHOTS_PER_APP:
|
||||
break
|
||||
|
||||
headings: List[str] = []
|
||||
for tag in soup.find_all(["h1", "h2", "h3"]):
|
||||
heading = normalize_text(tag.get_text(" ", strip=True))
|
||||
if not heading:
|
||||
continue
|
||||
if heading not in headings:
|
||||
headings.append(heading)
|
||||
if len(headings) >= 6:
|
||||
break
|
||||
|
||||
external_links: List[str] = []
|
||||
seen_links = set()
|
||||
for anchor in soup.find_all("a", href=True):
|
||||
href = str(anchor.get("href", "")).strip()
|
||||
if not href or href.startswith("#"):
|
||||
continue
|
||||
full_href = urljoin(app_url, href)
|
||||
if not full_href.startswith("http"):
|
||||
continue
|
||||
if full_href.startswith(CATALOG_URL):
|
||||
continue
|
||||
if full_href in seen_links:
|
||||
continue
|
||||
seen_links.add(full_href)
|
||||
|
||||
label = normalize_text(anchor.get_text(" ", strip=True))
|
||||
if not label:
|
||||
label = full_href
|
||||
label = truncate_text(label, 60)
|
||||
external_links.append(f"{label} -> {full_href}")
|
||||
if len(external_links) >= 5:
|
||||
break
|
||||
|
||||
detected_fields: List[str] = []
|
||||
body_text = normalize_text(soup.get_text(" ", strip=True))
|
||||
label_patterns = {
|
||||
"Version": r"(?:App\s+Version|Version)\s*[:\-]\s*([^\n\r|]{1,80})",
|
||||
"Chart": r"(?:Chart\s+Version|Helm\s+Chart)\s*[:\-]\s*([^\n\r|]{1,80})",
|
||||
"Category": r"Category\s*[:\-]\s*([^\n\r|]{1,80})",
|
||||
"Maintainer": r"Maintainer(?:s)?\s*[:\-]\s*([^\n\r|]{1,120})",
|
||||
"Homepage": r"Homepage\s*[:\-]\s*([^\n\r|]{1,160})",
|
||||
"Source": r"Source\s*[:\-]\s*([^\n\r|]{1,160})",
|
||||
}
|
||||
for label, pattern in label_patterns.items():
|
||||
match = re.search(pattern, body_text, flags=re.IGNORECASE)
|
||||
if match:
|
||||
value = truncate_text(match.group(1), 120)
|
||||
detected_fields.append(f"{label}: {value}")
|
||||
|
||||
return {
|
||||
"page_title": page_title,
|
||||
"description": description,
|
||||
"headings": headings,
|
||||
"external_links": external_links,
|
||||
"detected_fields": detected_fields,
|
||||
"screenshot_urls": screenshot_urls,
|
||||
}
|
||||
|
||||
|
||||
def build_new_app_message(session: requests.Session, app: AppSnapshot) -> str:
|
||||
def build_new_app_message(app: AppSnapshot, page_title: str = "", screenshot_count: int = 0) -> str:
|
||||
lines: List[str] = [
|
||||
"🆕 New TrueNAS app detected",
|
||||
f"Detected: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}",
|
||||
@@ -319,6 +322,10 @@ def build_new_app_message(session: requests.Session, app: AppSnapshot) -> str:
|
||||
lines.append(f"Added date: {app.added}")
|
||||
if app.summary:
|
||||
lines.append(f"Catalog summary: {truncate_text(app.summary, 700)}")
|
||||
if page_title:
|
||||
lines.append(f"Page title: {truncate_text(page_title, 180)}")
|
||||
if screenshot_count > 0:
|
||||
lines.append(f"Screenshots: {screenshot_count} attached")
|
||||
|
||||
message = "\n".join(lines)
|
||||
if len(message) <= MAX_MESSAGE_LEN:
|
||||
@@ -366,6 +373,23 @@ def send_telegram_message(session: requests.Session, text: str) -> None:
|
||||
response.raise_for_status()
|
||||
|
||||
|
||||
def send_telegram_photo(session: requests.Session, photo_url: str, caption: str = "") -> None:
    """Post one photo URL to the configured Telegram chat via the sendPhoto API.

    Skips silently (with a warning) when the bot token or chat id is not
    configured, mirroring the behavior of send_telegram_message. Raises
    requests.HTTPError on a non-2xx response.
    """
    # Without credentials there is nothing useful to do — warn and bail out.
    if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
        logging.warning("Telegram token/chat id missing; skipping photo")
        return

    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendPhoto"
    payload = {"chat_id": TELEGRAM_CHAT_ID, "photo": photo_url}
    if caption:
        # Telegram rejects over-long captions; keep well under the 1024 limit.
        payload["caption"] = truncate_text(caption, 900)

    response = session.post(endpoint, json=payload, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
|
||||
|
||||
|
||||
def send_startup_notification(session: requests.Session) -> None:
|
||||
message = (
|
||||
"TrueNAS catalog watcher is running ✅\n"
|
||||
@@ -405,7 +429,31 @@ def run_once(session: requests.Session, first_run: bool) -> bool:
|
||||
|
||||
for url in added_urls:
|
||||
app = current_state[url]
|
||||
send_telegram_message(session, build_new_app_message(session, app))
|
||||
page_title = ""
|
||||
screenshot_urls: List[str] = []
|
||||
try:
|
||||
details = fetch_new_app_page_details(session, app.url)
|
||||
page_title = str(details.get("page_title", ""))
|
||||
screenshot_data = details.get("screenshot_urls", [])
|
||||
if isinstance(screenshot_data, list):
|
||||
screenshot_urls = [str(item) for item in screenshot_data if str(item).startswith("http")]
|
||||
except requests.RequestException as exc:
|
||||
logging.warning("Unable to fetch app page details for %s: %s", app.url, exc)
|
||||
|
||||
send_telegram_message(
|
||||
session,
|
||||
build_new_app_message(app, page_title=page_title, screenshot_count=len(screenshot_urls)),
|
||||
)
|
||||
|
||||
for index, screenshot_url in enumerate(screenshot_urls, start=1):
|
||||
try:
|
||||
send_telegram_photo(
|
||||
session,
|
||||
screenshot_url,
|
||||
caption=f"{app.name} screenshot {index}/{len(screenshot_urls)}",
|
||||
)
|
||||
except requests.RequestException as exc:
|
||||
logging.warning("Failed to send screenshot for %s: %s", app.name, exc)
|
||||
|
||||
header, summary_lines = build_summary_message(
|
||||
added_count=len(added_urls),
|
||||
|
||||
Reference in New Issue
Block a user