# workhive-caldav-sync/sync.py

from bs4 import BeautifulSoup
import requests
from datetime import datetime, timedelta
from caldav import DAVClient
import hashlib
import re
import os
from dotenv import load_dotenv

# load environment from .env when present
load_dotenv()
import hashlib

# Workhive and credentials (must be set via environment variables)
# Required settings: BASE_URL, WORKHIVE_TOKEN, CALDAV_USER, CALDAV_PASSWORD, CALDAV_LOCATION
BASE_URL = os.environ.get('BASE_URL')
TOKEN = os.environ.get('WORKHIVE_TOKEN')
# name searched for in each schedule entry; override with the NAME variable
NAME = os.environ.get('NAME', 'Jonathan Slivka')

# CalDAV credentials (must be provided via env)
USER = os.environ.get('CALDAV_USER')
PASSWORD = os.environ.get('CALDAV_PASSWORD')
LOCATION = os.environ.get('CALDAV_LOCATION')

# timezone name for the parsed shift datetimes (defaults to US Eastern)
TIMEZONE = os.environ.get('TIMEZONE', 'America/New_York')

# Fail fast with a clear message when a required setting is missing, instead
# of building URLs like "None?facility_id=..." and failing deep inside an
# HTTP call later on.
if not BASE_URL:
    raise SystemExit('BASE_URL not set in environment')
if not TOKEN:
    raise SystemExit('WORKHIVE_TOKEN not set in environment')
if not USER or not PASSWORD or not LOCATION:
    raise SystemExit('CALDAV_USER, CALDAV_PASSWORD, and CALDAV_LOCATION must be set in environment')


# with cookie passed as raw header (avoid requests' cookie encoding issues)
headers = {"Cookie": f"workhive_session={TOKEN}"}

# Work out which weeks to query. Each week is identified by its Monday,
# formatted as YYYY-MM-DD; four consecutive weeks are covered, beginning
# with the week before the current one.
today = datetime.today()
# start from last week to catch any late-posted shifts
monday = today - timedelta(days=today.weekday(), weeks=1)
periods = [
    (monday + timedelta(weeks=week_offset)).strftime("%Y-%m-%d")
    for week_offset in range(4)
]

print("Periods to check: " + ", ".join(periods))

# Facility ids recognised by the schedule endpoint, mapped to display names.
facilities = {
    "fac_8ed0d011c748": "Cary Swim Club",
    "fac_f38dd7211e7e": "Scottish Hills",
}

# One schedule URL per (facility, week) pair, grouped by facility.
urls = []
for facility_id in facilities:
    for week_start in periods:
        urls.append(f"{BASE_URL}?facility_id={facility_id}&period={week_start}")

# Scrape every schedule page and collect the shifts assigned to NAME.
# Each entry appended to `shifts` is a dict with the keys "facility", "date",
# "time", "start_datetime" and "end_datetime" (end may be None when the page
# gives no parsable end time).
shifts = []

for url in urls:
    # The facility and the week are the same for every entry on a page, so
    # derive them once per URL rather than once per matching <li>.
    facility = url.split("facility_id=")[1].split("&")[0]
    facility_name = facilities.get(facility, "Unknown Facility")

    # extract year from the period parameter in the URL so parsed datetimes use correct year
    period_param = None
    if "period=" in url:
        period_param = url.split("period=")[1].split("&")[0]
    period_start = None
    if period_param:
        year = int(period_param.split("-")[0])
        try:
            period_start = datetime.strptime(period_param, "%Y-%m-%d")
        except ValueError:
            # Unexpected period format: keep the year, skip the rollover check below.
            period_start = None
    else:
        year = datetime.today().year

    # A timeout keeps one stalled connection from hanging the whole sync.
    response = requests.get(url, headers=headers, timeout=30)
    if not response.ok:
        # Do not parse an HTTP error page as if it were an empty schedule.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    for li in soup.find_all("li"):
        if NAME not in li.get_text().strip():
            continue

        card = li.find_parent("div", class_="text-white")
        td = li.find_parent('td')
        if card is None or td is None:
            # A matching <li> that is not inside a shift card within a table
            # cell is not a schedule entry; skip it instead of failing on None.
            continue

        # The time range is on the second line of the card's second child node.
        shift_time = list(card.children)[1].get_text().strip().split("\n")[1].strip()

        # Find which column of the row this cell occupies ...
        row = td.find_parent('tr')
        cells = row.find_all(['td', 'th'], recursive=False)
        col_index = next((idx for idx, cell in enumerate(cells) if cell is td), None)
        if col_index is None:
            raise ValueError(f"Could not locate schedule column for shift time: {shift_time!r}")

        # ... and read the date from the header cell of that same column.
        table = row.find_parent('table')
        header_row = table.find('thead').find('tr')
        header_ths = header_row.find_all('th', recursive=False)

        # extract date text from header (<th> may contain month and day on separate lines)
        shift_date_text = header_ths[col_index].get_text().strip()
        month, day = shift_date_text.split("\n")
        day = day.split(" ")[1]
        shift_date = f"{month} {day}"

        #e.g
        #date=Jun 18
        #time=2:50 PM -8:15 PM

        # parse start and end times separately to avoid duplicate regex group names
        parts = [p.strip() for p in shift_time.split('-')]
        start_str = parts[0]
        end_str = parts[1] if len(parts) >= 2 else None

        # try abbreviated month (%b) then full month (%B)
        start_dt = end_dt = None
        for fmt in ("%Y %b %d %I:%M %p", "%Y %B %d %I:%M %p"):
            try:
                start_dt = datetime.strptime(f"{year} {month} {day} {start_str}", fmt)
            except ValueError:
                continue
            if end_str:
                try:
                    end_dt = datetime.strptime(f"{year} {month} {day} {end_str}", fmt)
                except ValueError:
                    # Unparsable end time: keep the start and leave the end unset.
                    end_dt = None
            break

        if start_dt is None:
            raise ValueError(f"Could not parse shift time: {shift_time!r} with date {shift_date!r}")

        # The year came from the week's start date. In a week that spans New
        # Year the later days belong to the following year, so a date that
        # lands about a year away from the week start is moved to the
        # adjacent year.
        if period_start is not None:
            drift = start_dt - period_start
            if drift < timedelta(days=-180):
                year_fix = 1
            elif drift > timedelta(days=180):
                year_fix = -1
            else:
                year_fix = 0
            if year_fix:
                start_dt = start_dt.replace(year=start_dt.year + year_fix)
                if end_dt is not None:
                    end_dt = end_dt.replace(year=end_dt.year + year_fix)

        # A shift that runs past midnight ends on the following day.
        if end_dt is not None and end_dt < start_dt:
            end_dt += timedelta(days=1)

        shifts.append({
            "facility": facility_name,
            "date": shift_date,
            "time": shift_time,
            "start_datetime": start_dt,
            "end_datetime": end_dt,
        })


# Echo each collected shift (facility, date, time range, parsed start/end),
# followed by a blank line, so a run can be checked by eye.
for entry in shifts:
    print(
        f"{entry['facility']} - {entry['date']} - {entry['time']}",
        f"  Start: {entry['start_datetime']}",
        f"  End: {entry['end_datetime']}",
        "",
        sep="\n",
    )

# push to CalDAV server with duplicate checking
# The target calendar is the one whose URL equals CALDAV_LOCATION (ignoring a
# trailing slash). A shift is skipped when the calendar already holds an event
# with the same UID, or with the same summary and start time.
client = DAVClient(LOCATION, username=USER, password=PASSWORD)
cals = client.principal().calendars()
wanted_url = LOCATION.rstrip('/')
work_calendar = None
for c in cals:
    # str() so the comparison works whether the library exposes the URL as a
    # plain string or as a URL object.
    if str(getattr(c, 'url', '')).rstrip('/') == wanted_url:
        work_calendar = c
        break

if not work_calendar:
    print("No CalDAV calendar found; skipping push")
else:
    print(f"Using calendar: {getattr(work_calendar, 'id', 'unknown')}")

    uid_re = re.compile(r'^UID:(.+)$', re.M)
    summary_re = re.compile(r'^SUMMARY:(.+)$', re.M)
    dtstart_re = re.compile(r'^DTSTART(?:;TZID=[^:]+)?:([0-9T]+)', re.M)

    # Index the calendar's existing events once, rather than downloading and
    # re-parsing every event again for each shift.
    known_uids = set()
    known_slots = set()  # (summary, 'YYYYMMDDTHHMMSS') pairs
    for e in work_calendar.events():
        raw = e.data
        txt = raw.decode('utf-8', 'ignore') if isinstance(raw, bytes) else str(raw)

        m = uid_re.search(txt)
        if m:
            known_uids.add(m.group(1).strip())

        msum = summary_re.search(txt)
        mdt = dtstart_re.search(txt)
        if msum and mdt:
            # Keep only the 15-character local timestamp so it compares
            # against the strftime('%Y%m%dT%H%M%S') value built below.
            known_slots.add((msum.group(1).strip(), mdt.group(1).strip()[:15]))

    for shift in shifts:
        summary = f"{shift['facility']} Shift"
        start_dt = shift['start_datetime']
        # a shift without a parsed end time becomes a zero-length event
        end_dt = shift['end_datetime'] or start_dt

        # Deterministic UID: the same shift always maps to the same event.
        uid_src = f"{shift['facility']}|{start_dt.isoformat()}|{end_dt.isoformat()}"
        uid = hashlib.sha1(uid_src.encode()).hexdigest() + "@workhive-sync"

        dtstart_str = start_dt.strftime('%Y%m%dT%H%M%S')
        dtend_str = end_dt.strftime('%Y%m%dT%H%M%S')

        if uid in known_uids or (summary, dtstart_str) in known_slots:
            print(f"Skipping existing event: {summary} at {start_dt}")
            continue

        # Use the configured TIMEZONE instead of a hard-coded zone so the
        # TIMEZONE setting actually takes effect.
        ical = (
            'BEGIN:VCALENDAR\r\n'
            'VERSION:2.0\r\n'
            'PRODID:-//workhive-caldav-sync//EN\r\n'
            'BEGIN:VEVENT\r\n'
            f'UID:{uid}\r\n'
            f'SUMMARY:{summary}\r\n'
            f'DTSTART;TZID={TIMEZONE}:{dtstart_str}\r\n'
            f'DTEND;TZID={TIMEZONE}:{dtend_str}\r\n'
            f'DESCRIPTION:Work shift at {shift["facility"]} on {shift["date"]} from {shift["time"]}\r\n'
            'END:VEVENT\r\n'
            'END:VCALENDAR\r\n'
        )

        work_calendar.add_event(ical)
        # Record the new event so a repeated shift later in this run is
        # detected without another round-trip to the server.
        known_uids.add(uid)
        known_slots.add((summary, dtstart_str))
        print(f"Added event for shift on {shift['date']} at {shift['facility']}")