import os, requests, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Root URL of the site to crawl; crawl() only fetches URLs that begin with
# this exact prefix.
base = "https://amatterofmind.org/"

# Download destination: an "amatterofmind_pdfs" folder next to this script.
# To save to a NAS instead, set nas_path to its mount path, for example
# r"Z:\amatterofmind_pdfs".
_script_dir = os.path.dirname(__file__)
nas_path = os.path.join(_script_dir, "amatterofmind_pdfs")

# Shared crawl state: page URLs already processed by crawl(), and the PDF
# URLs it has collected for download_all().
visited, pdfs = set(), set()

def crawl(url):
    """Walk the site starting at *url* and collect every PDF link found.

    Pages are fetched with an explicit work stack rather than recursion, so
    the link depth of the site cannot exhaust Python's recursion limit.

    Args:
        url: Page URL to start from. It, and every page followed from it,
            is only fetched if it begins with the module-level ``base``
            prefix and has not been visited before.

    Side effects:
        Adds each fetched page URL to the module-level ``visited`` set and
        each discovered PDF URL to the module-level ``pdfs`` set. Fetch or
        parse failures are printed and the crawl continues with the
        remaining pages.
    """
    site_host = urlparse(base).netloc
    pending = [url]

    while pending:
        page = pending.pop()
        if page in visited or not page.startswith(base):
            continue
        visited.add(page)

        try:
            response = requests.get(page, timeout=10)
            # Do not harvest links from 4xx/5xx error pages.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            hrefs = [tag["href"] for tag in soup.find_all("a", href=True)]
        except Exception as e:
            # Best effort: report the failing page and keep crawling.
            print(f"Error crawling {page}: {e}")
            continue

        for href in hrefs:
            try:
                # Everything after the first "#" is a fragment; it names a
                # position inside the document, not a different document,
                # so drop it to avoid re-fetching the same page.
                full = urljoin(page, href).partition("#")[0]
                parts = urlparse(full)
            except ValueError as e:
                # A single malformed href must not discard the page's
                # remaining links.
                print(f"Error crawling {page}: {e}")
                continue

            # Test the path only, case-insensitively, so "x.PDF" and
            # "x.pdf?v=2" are recognised as PDFs instead of being fetched
            # and parsed as HTML.
            if parts.path.lower().endswith(".pdf"):
                pdfs.add(full)
            elif parts.netloc == site_host and full not in visited:
                pending.append(full)

def download_all():
    """Download every URL in the module-level ``pdfs`` set into ``nas_path``.

    The target directory is created if needed. A PDF whose target file
    already exists is skipped, so the script can be re-run to resume. Each
    file is named after the last segment of the URL path; two URLs sharing
    that segment map to the same file and only the first (in sorted order)
    is saved.

    Each download is streamed to a temporary ``.part`` file and renamed into
    place only after it completes without error, so an HTTP error page or an
    interrupted transfer never leaves behind a file that a later run would
    mistake for a finished PDF. Failures are printed and the loop continues
    with the next URL.
    """
    os.makedirs(nas_path, exist_ok=True)

    for pdf_url in sorted(pdfs):
        partial = None
        try:
            # Use the URL path only, so a "?query" suffix does not end up in
            # the filename (where it is invalid on some filesystems).
            name = os.path.basename(urlparse(pdf_url).path)
            if not name:
                print(f"Failed to download {pdf_url}: no filename in URL")
                continue

            filename = os.path.join(nas_path, name)
            if os.path.exists(filename):
                continue

            print(f"Downloading {pdf_url}")
            partial = filename + ".part"
            with requests.get(pdf_url, timeout=15, stream=True) as r:
                # Without this, a 404/500 HTML body would be saved as a PDF
                # and skipped forever after by the exists() check above.
                r.raise_for_status()
                with open(partial, "wb") as f:
                    for chunk in r.iter_content(chunk_size=65536):
                        f.write(chunk)
            os.replace(partial, filename)
        except Exception as e:
            print(f"Failed to download {pdf_url}: {e}")
            if partial is not None:
                try:
                    os.remove(partial)
                except OSError:
                    # Nothing to clean up (or it cannot be removed); the
                    # failure has already been reported above.
                    pass
# Script entry: crawl the site from its root to fill the `pdfs` set, then
# download what was found. The order matters, because download_all() only
# iterates over `pdfs`, which is empty until crawl() has run.
crawl(base)
download_all()
