This commit is contained in:
195
watcher.py
195
watcher.py
@@ -2,6 +2,7 @@ import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, asdict
|
||||
@@ -156,10 +157,10 @@ def format_field_change(label: str, old: str, new: str) -> str:
|
||||
return f"{label}: '{old_clean}' -> '{new_clean}'"
|
||||
|
||||
|
||||
def build_diff_message(
|
||||
def collect_diffs(
|
||||
previous: Dict[str, AppSnapshot],
|
||||
current: Dict[str, AppSnapshot],
|
||||
) -> Tuple[str, List[str], int]:
|
||||
) -> Tuple[List[str], List[str], List[str], int]:
|
||||
prev_urls = set(previous.keys())
|
||||
curr_urls = set(current.keys())
|
||||
|
||||
@@ -193,22 +194,172 @@ def build_diff_message(
|
||||
for detail in details:
|
||||
changed_lines.append(f" - {detail}")
|
||||
|
||||
return added_urls, removed_urls, changed_lines, updated_count
|
||||
|
||||
|
||||
def build_summary_message(
    added_count: int,
    removed_urls: List[str],
    changed_lines: List[str],
    updated_count: int,
    previous: Dict[str, "AppSnapshot"],
) -> Tuple[str, List[str]]:
    """Build the catalog-change summary notification.

    Newly added apps each get their own detailed message elsewhere, so the
    summary only carries their count; removed apps are resolved by name via
    the prior snapshot.

    Args:
        added_count: Number of apps added since the previous snapshot.
        removed_urls: URLs of apps that disappeared from the catalog.
        changed_lines: Pre-formatted per-app change lines.
        updated_count: Number of apps whose fields changed.
        previous: Prior snapshot, keyed by app URL; used to look up the
            name/URL of removed apps.

    Returns:
        ``(header, lines)`` — a header with a UTC timestamp plus
        added/removed/updated counts, and the body lines describing
        removals followed by field-change details.
    """
    removed_count = len(removed_urls)

    header = (
        f"TrueNAS catalog changed at {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}\n"
        f"Added: {added_count} | Removed: {removed_count} | Updated: {updated_count}"
    )

    lines: List[str] = []
    for url in removed_urls:
        app = previous[url]
        lines.append(f"- {app.name} ({app.url})")

    lines.extend(changed_lines)
    return header, lines
|
||||
|
||||
|
||||
def truncate_text(value: str, limit: int) -> str:
    """Normalize *value* and shorten it to at most *limit* characters.

    When the text must be cut, trailing whitespace is stripped from the
    kept prefix and a single ellipsis character marks the truncation.
    """
    text = normalize_text(value)
    if len(text) > limit:
        kept = text[: max(0, limit - 1)].rstrip()
        text = f"{kept}…"
    return text
|
||||
|
||||
|
||||
def fetch_new_app_page_details(session: requests.Session, app_url: str) -> Dict[str, object]:
    """Scrape an app's catalog page for extra details to enrich notifications.

    Fetches *app_url* with the shared session and parses the HTML for a
    page title, a meta description, prominent headings, off-site links,
    and heuristic "Label: value" fields from the page text.

    Returns:
        Dict with keys ``page_title`` and ``description`` (str) plus
        ``headings``, ``external_links``, ``detected_fields`` (list[str]).
        Any of them may be empty if nothing was found.

    Raises:
        requests.HTTPError: via ``raise_for_status`` on a non-2xx response
            (other requests.RequestException may propagate from the GET).
    """
    response = session.get(
        app_url,
        timeout=REQUEST_TIMEOUT_SECONDS,
        headers={"User-Agent": USER_AGENT},
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # <title> text, if the page has one; normalized to collapsed whitespace.
    page_title = normalize_text(soup.title.get_text(" ", strip=True)) if soup.title else ""
    # Prefer OpenGraph description, then the standard meta, then Twitter card;
    # stop at the first non-empty content value.
    description = ""
    for attrs in (
        {"property": "og:description"},
        {"name": "description"},
        {"name": "twitter:description"},
    ):
        tag = soup.find("meta", attrs=attrs)
        if tag and tag.get("content"):
            description = normalize_text(str(tag["content"]))
            if description:
                break

    # Up to 6 unique, non-empty h1-h3 headings, kept in document order.
    headings: List[str] = []
    for tag in soup.find_all(["h1", "h2", "h3"]):
        heading = normalize_text(tag.get_text(" ", strip=True))
        if not heading:
            continue
        if heading not in headings:
            headings.append(heading)
        if len(headings) >= 6:
            break

    # Up to 5 unique absolute links that point away from the catalog site.
    external_links: List[str] = []
    seen_links = set()
    for anchor in soup.find_all("a", href=True):
        href = str(anchor.get("href", "")).strip()
        # Skip empty hrefs and in-page fragment links.
        if not href or href.startswith("#"):
            continue
        full_href = urljoin(app_url, href)
        if not full_href.startswith("http"):
            continue
        # Links back into the catalog are navigation, not app resources.
        if full_href.startswith(CATALOG_URL):
            continue
        if full_href in seen_links:
            continue
        seen_links.add(full_href)

        # Use the anchor text as the label, falling back to the URL itself.
        label = normalize_text(anchor.get_text(" ", strip=True))
        if not label:
            label = full_href
        label = truncate_text(label, 60)
        external_links.append(f"{label} -> {full_href}")
        if len(external_links) >= 5:
            break

    # Heuristic "Label: value" extraction from the page's flattened text;
    # each pattern captures a bounded-length value after ':' or '-'.
    detected_fields: List[str] = []
    body_text = normalize_text(soup.get_text(" ", strip=True))
    label_patterns = {
        "Version": r"(?:App\s+Version|Version)\s*[:\-]\s*([^\n\r|]{1,80})",
        "Chart": r"(?:Chart\s+Version|Helm\s+Chart)\s*[:\-]\s*([^\n\r|]{1,80})",
        "Category": r"Category\s*[:\-]\s*([^\n\r|]{1,80})",
        "Maintainer": r"Maintainer(?:s)?\s*[:\-]\s*([^\n\r|]{1,120})",
        "Homepage": r"Homepage\s*[:\-]\s*([^\n\r|]{1,160})",
        "Source": r"Source\s*[:\-]\s*([^\n\r|]{1,160})",
    }
    for label, pattern in label_patterns.items():
        match = re.search(pattern, body_text, flags=re.IGNORECASE)
        if match:
            value = truncate_text(match.group(1), 120)
            detected_fields.append(f"{label}: {value}")

    return {
        "page_title": page_title,
        "description": description,
        "headings": headings,
        "external_links": external_links,
        "detected_fields": detected_fields,
    }
|
||||
|
||||
|
||||
def build_new_app_message(session: requests.Session, app: AppSnapshot) -> str:
    """Compose a detailed Telegram notification for a newly added app.

    Combines catalog metadata from *app* with best-effort details scraped
    from the app's own page; a fetch failure is logged and tolerated so a
    notification is always produced.

    Args:
        session: Shared HTTP session used to fetch the app page.
        app: Snapshot of the newly detected app.

    Returns:
        The message text, guaranteed to be at most MAX_MESSAGE_LEN
        characters.
    """
    lines: List[str] = [
        "🆕 New TrueNAS app detected",
        f"Detected: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}",
        f"Name: {app.name}",
        f"URL: {app.url}",
    ]

    if app.train:
        lines.append(f"Train: {app.train}")
    if app.added:
        lines.append(f"Added date: {app.added}")
    if app.summary:
        lines.append(f"Catalog summary: {truncate_text(app.summary, 700)}")

    # Page scraping is best-effort: a network error must not block the alert.
    try:
        details = fetch_new_app_page_details(session, app.url)
    except requests.RequestException as exc:
        logging.warning("Unable to fetch app details for %s: %s", app.url, exc)
        details = {}

    page_title = str(details.get("page_title", "")) if details else ""
    if page_title:
        lines.append(f"Page title: {truncate_text(page_title, 180)}")

    description = str(details.get("description", "")) if details else ""
    if description:
        lines.append(f"Description: {truncate_text(description, 1000)}")

    # isinstance guards keep us safe if the details dict ever carries
    # unexpected value types.
    detected_fields = details.get("detected_fields", []) if details else []
    if isinstance(detected_fields, list):
        for field in detected_fields[:6]:
            lines.append(str(field))

    headings = details.get("headings", []) if details else []
    if isinstance(headings, list) and headings:
        lines.append(f"Headings: {truncate_text(' | '.join(headings[:6]), 320)}")

    external_links = details.get("external_links", []) if details else []
    if isinstance(external_links, list) and external_links:
        lines.append("External links:")
        for link in external_links[:5]:
            lines.append(f"- {truncate_text(str(link), 220)}")

    message = "\n".join(lines)
    if len(message) <= MAX_MESSAGE_LEN:
        return message

    # Too long: cap each line at 280 chars, then drop trailing detail lines,
    # but never below the 8 core lines.
    trimmed_lines = [line if len(line) <= 280 else truncate_text(line, 280) for line in lines]
    while len("\n".join(trimmed_lines)) > MAX_MESSAGE_LEN and len(trimmed_lines) > 8:
        trimmed_lines.pop()

    # Bug fix: even 8 capped lines can exceed the limit (e.g. a very long
    # summary), and the old code could return an oversized message that the
    # Telegram API would reject. Hard-truncate as a last resort.
    message = "\n".join(trimmed_lines)
    if len(message) > MAX_MESSAGE_LEN:
        message = message[:MAX_MESSAGE_LEN]
    return message
|
||||
|
||||
|
||||
def split_message(header: str, lines: List[str], max_len: int = MAX_MESSAGE_LEN) -> List[str]:
|
||||
@@ -273,13 +424,31 @@ def run_once(session: requests.Session, first_run: bool) -> bool:
|
||||
logging.info("Initial snapshot saved with %d apps", len(current_state))
|
||||
return False
|
||||
|
||||
header, diff_lines, _ = build_diff_message(previous_state, current_state)
|
||||
changed = bool(diff_lines)
|
||||
added_urls, removed_urls, changed_lines, updated_count = collect_diffs(previous_state, current_state)
|
||||
changed = bool(added_urls or removed_urls or changed_lines)
|
||||
|
||||
if changed:
|
||||
logging.info("Catalog change detected with %d line items", len(diff_lines))
|
||||
for message in split_message(header, diff_lines):
|
||||
send_telegram_message(session, message)
|
||||
logging.info(
|
||||
"Catalog change detected (added=%d, removed=%d, updated=%d)",
|
||||
len(added_urls),
|
||||
len(removed_urls),
|
||||
updated_count,
|
||||
)
|
||||
|
||||
for url in added_urls:
|
||||
app = current_state[url]
|
||||
send_telegram_message(session, build_new_app_message(session, app))
|
||||
|
||||
header, summary_lines = build_summary_message(
|
||||
added_count=len(added_urls),
|
||||
removed_urls=removed_urls,
|
||||
changed_lines=changed_lines,
|
||||
updated_count=updated_count,
|
||||
previous=previous_state,
|
||||
)
|
||||
if summary_lines:
|
||||
for message in split_message(header, summary_lines):
|
||||
send_telegram_message(session, message)
|
||||
else:
|
||||
logging.info("No catalog changes detected")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user