diff --git a/README.md b/README.md index 9516a09..f716040 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,8 @@ The watcher pulls `https://apps.truenas.com/catalog` over plain HTTP (no browser On changes, it sends Telegram messages with: -- one detailed message per newly added app (name, URL, train, added date, catalog summary, plus extra details parsed from the app page when available) +- one detailed message per newly added app (name, URL, train, added date, catalog summary, and page title) +- screenshot images from the app page, posted as Telegram photos (up to the configured per-app limit) - removed apps (`-`) - updated apps (`~`) and field-level diffs @@ -27,6 +28,7 @@ On changes, it sends Telegram messages with: - `CATALOG_URL` (default: `https://apps.truenas.com/catalog`) - `REQUEST_TIMEOUT_SECONDS` (default: `30`) - `LOG_LEVEL` (default: `INFO`) +- `MAX_SCREENSHOTS_PER_APP` (default: `3`) ## Build diff --git a/watcher.py b/watcher.py index f65c724..5dbd75a 100644 --- a/watcher.py +++ b/watcher.py @@ -25,6 +25,7 @@ USER_AGENT = os.getenv( ) LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper() MAX_MESSAGE_LEN = 3900 +MAX_SCREENSHOTS_PER_APP = int(os.getenv("MAX_SCREENSHOTS_PER_APP", "3")) @dataclass @@ -225,6 +226,9 @@ def truncate_text(value: str, limit: int) -> str: if len(text) <= limit: return text return f"{text[: max(0, limit - 1)].rstrip()}…" + + +def fetch_new_app_page_details(session: requests.Session, app_url: str) -> Dict[str, object]: response = session.get( app_url, timeout=REQUEST_TIMEOUT_SECONDS, @@ -235,77 +239,76 @@ def truncate_text(value: str, limit: int) -> str: soup = BeautifulSoup(response.text, "html.parser") page_title = normalize_text(soup.title.get_text(" ", strip=True)) if soup.title else "" - description = "" - for attrs in ( - {"property": "og:description"}, - {"name": "description"}, - {"name": "twitter:description"}, - ): - tag = soup.find("meta", attrs=attrs) - if tag and tag.get("content"): - description = normalize_text(str(tag["content"])) - if description: - break - headings: List[str] = [] - for tag in soup.find_all(["h1", "h2", "h3"]): - heading = normalize_text(tag.get_text(" ", strip=True)) - if not heading: + screenshot_candidates: List[Tuple[int, int, str]] = [] + seen_urls: set[str] = set() + + og_image = soup.find("meta", attrs={"property": "og:image"}) + if og_image and og_image.get("content"): + og_image_url = urljoin(app_url, str(og_image["content"]).strip()) + if og_image_url.startswith("http"): + screenshot_candidates.append((2, 0, og_image_url)) + + for index, tag in enumerate(soup.find_all("img", src=True), start=1): + raw_src = str(tag.get("src", "")).strip() + if not raw_src or raw_src.startswith("data:"): continue - if heading not in headings: - headings.append(heading) - if len(headings) >= 6: + + image_url = urljoin(app_url, raw_src) + if not image_url.startswith("http") or image_url in seen_urls: + continue + seen_urls.add(image_url) + + width_value = str(tag.get("width", "")).strip() + height_value = str(tag.get("height", "")).strip() + if width_value.isdigit() and int(width_value) < 200: + continue + if height_value.isdigit() and int(height_value) < 120: + continue + + descriptor = " ".join( + [ + str(tag.get("alt", "")), + str(tag.get("title", "")), + " ".join(tag.get("class", [])), + str(tag.get("id", "")), + image_url, + ] + ).lower() + + if any(skip in descriptor for skip in ["logo", "favicon", "icon", "avatar", "badge"]): + continue + + score = 0 + if "screenshot" in descriptor or "screen-shot" in descriptor or "screen shot" in descriptor: + score += 4 + if "gallery" in descriptor or "carousel" in descriptor or "preview" in descriptor: + score += 2 + if re.search(r"\.(png|jpe?g|webp)(\?|$)", image_url, flags=re.IGNORECASE): + score += 1 + + if score > 0: + screenshot_candidates.append((score, index, image_url)) + + screenshot_candidates.sort(key=lambda item: (-item[0], item[1])) + + screenshot_urls: List[str] = [] + emitted: set[str] = set() + for _, _, image_url in screenshot_candidates: + if image_url in emitted: + continue + emitted.add(image_url) + screenshot_urls.append(image_url) + if len(screenshot_urls) >= MAX_SCREENSHOTS_PER_APP: break - external_links: List[str] = [] - seen_links = set() - for anchor in soup.find_all("a", href=True): - href = str(anchor.get("href", "")).strip() - if not href or href.startswith("#"): - continue - full_href = urljoin(app_url, href) - if not full_href.startswith("http"): - continue - if full_href.startswith(CATALOG_URL): - continue - if full_href in seen_links: - continue - seen_links.add(full_href) - - label = normalize_text(anchor.get_text(" ", strip=True)) - if not label: - label = full_href - label = truncate_text(label, 60) - external_links.append(f"{label} -> {full_href}") - if len(external_links) >= 5: - break - - detected_fields: List[str] = [] - body_text = normalize_text(soup.get_text(" ", strip=True)) - label_patterns = { - "Version": r"(?:App\s+Version|Version)\s*[:\-]\s*([^\n\r|]{1,80})", - "Chart": r"(?:Chart\s+Version|Helm\s+Chart)\s*[:\-]\s*([^\n\r|]{1,80})", - "Category": r"Category\s*[:\-]\s*([^\n\r|]{1,80})", - "Maintainer": r"Maintainer(?:s)?\s*[:\-]\s*([^\n\r|]{1,120})", - "Homepage": r"Homepage\s*[:\-]\s*([^\n\r|]{1,160})", - "Source": r"Source\s*[:\-]\s*([^\n\r|]{1,160})", - } - for label, pattern in label_patterns.items(): - match = re.search(pattern, body_text, flags=re.IGNORECASE) - if match: - value = truncate_text(match.group(1), 120) - detected_fields.append(f"{label}: {value}") - return { "page_title": page_title, - "description": description, - "headings": headings, - "external_links": external_links, - "detected_fields": detected_fields, + "screenshot_urls": screenshot_urls, } -def build_new_app_message(session: requests.Session, app: AppSnapshot) -> str: +def build_new_app_message(app: AppSnapshot, page_title: str = "", screenshot_count: int = 0) -> str: lines: List[str] = [ "🆕 New TrueNAS app detected", f"Detected: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}", @@ -319,6 +322,10 @@ def build_new_app_message(session: requests.Session, app: AppSnapshot) -> str: lines.append(f"Added date: {app.added}") if app.summary: lines.append(f"Catalog summary: {truncate_text(app.summary, 700)}") + if page_title: + lines.append(f"Page title: {truncate_text(page_title, 180)}") + if screenshot_count > 0: + lines.append(f"Screenshots: {screenshot_count} attached") message = "\n".join(lines) if len(message) <= MAX_MESSAGE_LEN: @@ -366,6 +373,23 @@ def send_telegram_message(session: requests.Session, text: str) -> None: response.raise_for_status() +def send_telegram_photo(session: requests.Session, photo_url: str, caption: str = "") -> None: + if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID: + logging.warning("Telegram token/chat id missing; skipping photo") + return + + endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendPhoto" + payload = { + "chat_id": TELEGRAM_CHAT_ID, + "photo": photo_url, + } + if caption: + payload["caption"] = truncate_text(caption, 900) + + response = session.post(endpoint, json=payload, timeout=REQUEST_TIMEOUT_SECONDS) + response.raise_for_status() + + def send_startup_notification(session: requests.Session) -> None: message = ( "TrueNAS catalog watcher is running ✅\n" @@ -405,7 +429,31 @@ def run_once(session: requests.Session, first_run: bool) -> bool: for url in added_urls: app = current_state[url] - send_telegram_message(session, build_new_app_message(session, app)) + page_title = "" + screenshot_urls: List[str] = [] + try: + details = fetch_new_app_page_details(session, app.url) + page_title = str(details.get("page_title", "")) + screenshot_data = details.get("screenshot_urls", []) + if isinstance(screenshot_data, list): + screenshot_urls = [str(item) for item in screenshot_data if str(item).startswith("http")] + except requests.RequestException as exc: + logging.warning("Unable to fetch app page details for %s: %s", app.url, exc) + + send_telegram_message( + session, + build_new_app_message(app, page_title=page_title, screenshot_count=len(screenshot_urls)), + ) + + for index, screenshot_url in enumerate(screenshot_urls, start=1): + try: + send_telegram_photo( + session, + screenshot_url, + caption=f"{app.name} screenshot {index}/{len(screenshot_urls)}", + ) + except requests.RequestException as exc: + logging.warning("Failed to send screenshot for %s: %s", app.name, exc) header, summary_lines = build_summary_message( added_count=len(added_urls),