diff --git a/README.md b/README.md
index 815fc61..9516a09 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ The watcher pulls `https://apps.truenas.com/catalog` over plain HTTP (no browser
 
 On changes, it sends Telegram messages with:
 
-- added apps (`+`)
+- one detailed message per newly added app (name, URL, train, added date, catalog summary, plus extra details parsed from the app page when available)
 - removed apps (`-`)
 - updated apps (`~`) and field-level diffs
 
diff --git a/watcher.py b/watcher.py
index 0917493..aa1d9ef 100644
--- a/watcher.py
+++ b/watcher.py
@@ -2,6 +2,7 @@ import hashlib
 import json
 import logging
 import os
+import re
 import sys
 import time
 from dataclasses import dataclass, asdict
@@ -156,10 +157,10 @@ def format_field_change(label: str, old: str, new: str) -> str:
     return f"{label}: '{old_clean}' -> '{new_clean}'"
 
 
-def build_diff_message(
+def collect_diffs(
     previous: Dict[str, AppSnapshot],
     current: Dict[str, AppSnapshot],
-) -> Tuple[str, List[str], int]:
+) -> Tuple[List[str], List[str], List[str], int]:
     prev_urls = set(previous.keys())
     curr_urls = set(current.keys())
 
@@ -193,22 +194,172 @@
         for detail in details:
             changed_lines.append(f"  - {detail}")
 
+    return added_urls, removed_urls, changed_lines, updated_count
+
+
+def build_summary_message(
+    added_count: int,
+    removed_urls: List[str],
+    changed_lines: List[str],
+    updated_count: int,
+    previous: Dict[str, AppSnapshot],
+) -> Tuple[str, List[str]]:
+    removed_count = len(removed_urls)
+
     header = (
         f"TrueNAS catalog changed at {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n"
-        f"Added: {len(added_urls)} | Removed: {len(removed_urls)} | Updated: {updated_count}"
+        f"Added: {added_count} | Removed: {removed_count} | Updated: {updated_count}"
     )
 
     lines: List[str] = []
-    for url in added_urls:
-        app = current[url]
-        lines.append(f"+ {app.name} ({app.url})")
-
     for url in removed_urls:
         app = previous[url]
         lines.append(f"- {app.name} ({app.url})")
 
     lines.extend(changed_lines)
-    return header, lines, updated_count
+    return header, lines
+
+
+def truncate_text(value: str, limit: int) -> str:
+    text = normalize_text(value)
+    if len(text) <= limit:
+        return text
+    return f"{text[: max(0, limit - 1)].rstrip()}…"
+
+
+def fetch_new_app_page_details(session: requests.Session, app_url: str) -> Dict[str, object]:
+    response = session.get(
+        app_url,
+        timeout=REQUEST_TIMEOUT_SECONDS,
+        headers={"User-Agent": USER_AGENT},
+    )
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    page_title = normalize_text(soup.title.get_text(" ", strip=True)) if soup.title else ""
+    description = ""
+    for attrs in (
+        {"property": "og:description"},
+        {"name": "description"},
+        {"name": "twitter:description"},
+    ):
+        tag = soup.find("meta", attrs=attrs)
+        if tag and tag.get("content"):
+            description = normalize_text(str(tag["content"]))
+            if description:
+                break
+
+    headings: List[str] = []
+    for tag in soup.find_all(["h1", "h2", "h3"]):
+        heading = normalize_text(tag.get_text(" ", strip=True))
+        if not heading:
+            continue
+        if heading not in headings:
+            headings.append(heading)
+        if len(headings) >= 6:
+            break
+
+    external_links: List[str] = []
+    seen_links = set()
+    for anchor in soup.find_all("a", href=True):
+        href = str(anchor.get("href", "")).strip()
+        if not href or href.startswith("#"):
+            continue
+        full_href = urljoin(app_url, href)
+        if not full_href.startswith("http"):
+            continue
+        if full_href.startswith(CATALOG_URL):
+            continue
+        if full_href in seen_links:
+            continue
+        seen_links.add(full_href)
+
+        label = normalize_text(anchor.get_text(" ", strip=True))
+        if not label:
+            label = full_href
+        label = truncate_text(label, 60)
+        external_links.append(f"{label} -> {full_href}")
+        if len(external_links) >= 5:
+            break
+
+    detected_fields: List[str] = []
+    body_text = normalize_text(soup.get_text(" ", strip=True))
+    label_patterns = {
+        "Version": r"(?:App\s+Version|Version)\s*[:\-]\s*([^\n\r|]{1,80})",
+        "Chart": r"(?:Chart\s+Version|Helm\s+Chart)\s*[:\-]\s*([^\n\r|]{1,80})",
+        "Category": r"Category\s*[:\-]\s*([^\n\r|]{1,80})",
+        "Maintainer": r"Maintainer(?:s)?\s*[:\-]\s*([^\n\r|]{1,120})",
+        "Homepage": r"Homepage\s*[:\-]\s*([^\n\r|]{1,160})",
+        "Source": r"Source\s*[:\-]\s*([^\n\r|]{1,160})",
+    }
+    for label, pattern in label_patterns.items():
+        match = re.search(pattern, body_text, flags=re.IGNORECASE)
+        if match:
+            value = truncate_text(match.group(1), 120)
+            detected_fields.append(f"{label}: {value}")
+
+    return {
+        "page_title": page_title,
+        "description": description,
+        "headings": headings,
+        "external_links": external_links,
+        "detected_fields": detected_fields,
+    }
+
+
+def build_new_app_message(session: requests.Session, app: AppSnapshot) -> str:
+    lines: List[str] = [
+        "🆕 New TrueNAS app detected",
+        f"Detected: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}",
+        f"Name: {app.name}",
+        f"URL: {app.url}",
+    ]
+
+    if app.train:
+        lines.append(f"Train: {app.train}")
+    if app.added:
+        lines.append(f"Added date: {app.added}")
+    if app.summary:
+        lines.append(f"Catalog summary: {truncate_text(app.summary, 700)}")
+
+    try:
+        details = fetch_new_app_page_details(session, app.url)
+    except requests.RequestException as exc:
+        logging.warning("Unable to fetch app details for %s: %s", app.url, exc)
+        details = {}
+
+    page_title = str(details.get("page_title", "")) if details else ""
+    if page_title:
+        lines.append(f"Page title: {truncate_text(page_title, 180)}")
+
+    description = str(details.get("description", "")) if details else ""
+    if description:
+        lines.append(f"Description: {truncate_text(description, 1000)}")
+
+    detected_fields = details.get("detected_fields", []) if details else []
+    if isinstance(detected_fields, list):
+        for field in detected_fields[:6]:
+            lines.append(str(field))
+
+    headings = details.get("headings", []) if details else []
+    if isinstance(headings, list) and headings:
+        lines.append(f"Headings: {truncate_text(' | '.join(headings[:6]), 320)}")
+
+    external_links = details.get("external_links", []) if details else []
+    if isinstance(external_links, list) and external_links:
+        lines.append("External links:")
+        for link in external_links[:5]:
+            lines.append(f"- {truncate_text(str(link), 220)}")
+
+    message = "\n".join(lines)
+    if len(message) <= MAX_MESSAGE_LEN:
+        return message
+
+    trimmed_lines = [line if len(line) <= 280 else truncate_text(line, 280) for line in lines]
+    while len("\n".join(trimmed_lines)) > MAX_MESSAGE_LEN and len(trimmed_lines) > 8:
+        trimmed_lines.pop()
+    return "\n".join(trimmed_lines)
 
 
 def split_message(header: str, lines: List[str], max_len: int = MAX_MESSAGE_LEN) -> List[str]:
@@ -273,13 +424,31 @@ def run_once(session: requests.Session, first_run: bool) -> bool:
         logging.info("Initial snapshot saved with %d apps", len(current_state))
         return False
 
-    header, diff_lines, _ = build_diff_message(previous_state, current_state)
-    changed = bool(diff_lines)
+    added_urls, removed_urls, changed_lines, updated_count = collect_diffs(previous_state, current_state)
+    changed = bool(added_urls or removed_urls or changed_lines)
 
     if changed:
-        logging.info("Catalog change detected with %d line items", len(diff_lines))
-        for message in split_message(header, diff_lines):
-            send_telegram_message(session, message)
+        logging.info(
+            "Catalog change detected (added=%d, removed=%d, updated=%d)",
+            len(added_urls),
+            len(removed_urls),
+            updated_count,
+        )
+
+        for url in added_urls:
+            app = current_state[url]
+            send_telegram_message(session, build_new_app_message(session, app))
+
+        header, summary_lines = build_summary_message(
+            added_count=len(added_urls),
+            removed_urls=removed_urls,
+            changed_lines=changed_lines,
+            updated_count=updated_count,
+            previous=previous_state,
+        )
+        if summary_lines:
+            for message in split_message(header, summary_lines):
+                send_telegram_message(session, message)
     else:
         logging.info("No catalog changes detected")
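
As a quick sanity check of the meta-tag fallback that `fetch_new_app_page_details` relies on, here is a minimal, self-contained sketch run against inline HTML instead of a live app page. Only the lookup order of the three meta tags comes from the patch; the sample markup and printed values are invented for illustration.

```python
from bs4 import BeautifulSoup

# Invented sample page standing in for a live app page.
HTML = """
<html><head>
  <title>Example App | TrueNAS Apps</title>
  <meta property="og:description" content="An example app summary.">
</head><body><h2>Overview</h2></body></html>
"""

soup = BeautifulSoup(HTML, "html.parser")

# Same fallback order as fetch_new_app_page_details:
# og:description, then description, then twitter:description.
description = ""
for attrs in (
    {"property": "og:description"},
    {"name": "description"},
    {"name": "twitter:description"},
):
    tag = soup.find("meta", attrs=attrs)
    if tag and tag.get("content"):
        description = str(tag["content"]).strip()
        if description:
            break

print(soup.title.get_text(" ", strip=True))  # Example App | TrueNAS Apps
print(description)                           # An example app summary.
```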