import csv
import os
from bs4 import BeautifulSoup
from tkinter import Tk, filedialog

# --- Step 1: ask user to select a folder ---
# Keep a handle on the hidden root window so it can be destroyed as soon as
# the dialog closes, instead of leaving an anonymous Tk instance alive for
# the rest of the run.
_root = Tk()
_root.withdraw()
try:
    folder = filedialog.askdirectory(
        title="Select Folder with HTML files"
    )
finally:
    _root.destroy()

if not folder:
    # askdirectory() gives back an empty value when the dialog is cancelled.
    print("❌ No folder selected. Exiting.")
    # `exit()` is injected by the `site` module for interactive sessions and
    # is not guaranteed to exist (e.g. `python -S`); raising SystemExit is the
    # always-available equivalent and keeps the same exit status (0).
    raise SystemExit

# --- Step 2: define CSV schema ---
# Fixed columns come first; the numbered link/label pairs are appended below.
headers = [
    "note_id",
    "Type",
    "Label",
    "URL",
    "Content",
    "Keywords",
    "Media URL",
    "Media Type",
    "Media File Name",
    "Emoji",
    "Position",
]
# link_1, label_1, ..., link_10, label_10
for i in range(1, 11):
    headers.extend(f"{prefix}_{i}" for prefix in ("link", "label"))

# --- Helper: clean cell values to prevent CSV spills ---
def clean_csv_value(val: str) -> str:
    """Return `val` as a single-line string that is safe to place in a CSV cell.

    Any falsy input produces "". Otherwise the value is converted with
    `str()`, every carriage return, newline and tab is replaced by one space,
    and surrounding whitespace is stripped. If the result begins with "=",
    "+", "-" or "@", an apostrophe is prepended so that spreadsheet programs
    do not interpret the cell as a formula (CSV injection).
    """
    if not val:
        return ""

    # One pass that maps CR, LF and TAB each to a single space.
    flatten = str.maketrans("\r\n\t", "   ")
    text = str(val).translate(flatten).strip()

    # Slicing (rather than indexing) keeps this safe when `text` is empty.
    if text[:1] in ("=", "+", "-", "@"):
        return "'" + text
    return text

# --- Helper: derive keyword from filename ---
def derive_keyword(filename: str) -> str:
    """Derive the keyword from a file name.

    The directory part and the file extension are discarded and the rest is
    lower-cased. The keyword is the text between the first occurrence of the
    marker "p_" (or, when "p_" is absent, "my_") and the next underscore; if
    no underscore follows, it runs to the end of the name.

    Args:
        filename: File name or path to inspect.

    Returns:
        The keyword passed through `clean_csv_value`, or "" when neither
        marker occurs in the name.
    """
    # Drop the extension first: otherwise a name with no underscore after the
    # keyword (e.g. "p_cats.html") would yield "cats.html" instead of "cats".
    stem = os.path.splitext(os.path.basename(filename))[0].lower()

    # "p_" takes precedence over "my_"; markers are matched anywhere in the
    # name, not only at its start.
    for marker in ("p_", "my_"):
        if marker in stem:
            after_marker = stem.partition(marker)[2]
            return clean_csv_value(after_marker.partition("_")[0])
    return ""

# --- Helper: turn one <li> element into a CSV row ---
def _li_to_row(li, pos: int, keyword: str) -> list:
    """Build one CSV row, in `headers` column order, from a single <li> tag.

    Args:
        li: The <li> element (as returned by `soup.find_all("li")`).
        pos: 1-based position of the element, written to the Position column.
        keyword: Value written to the Keywords column.

    Returns:
        A list of 31 cells: the 11 fixed columns followed by ten
        (link_n, label_n) pairs.
    """
    anchors = li.find_all("a")
    images = li.find_all("img")

    # First anchor = main Label + URL
    if anchors:
        label = clean_csv_value(anchors[0].get_text(strip=True))
        url = clean_csv_value(anchors[0].get("href", ""))
    else:
        label = url = ""

    # Other anchors = link_n + label_n; anchors beyond the 11th are dropped.
    link_data = [
        (clean_csv_value(a.get("href", "")), clean_csv_value(a.get_text(strip=True)))
        for a in anchors[1:11]
    ]
    # Pad missing slots with blanks so every row has the same width.
    link_data += [("", "")] * (10 - len(link_data))

    # Image handling: only the first <img> is used.
    media_url = clean_csv_value(images[0].get("src", "")) if images else ""
    media_type = "image" if media_url else ""
    # NOTE(review): the last path segment is taken verbatim, so a query string
    # on the src would end up in the file name — confirm whether that matters.
    media_file_name = clean_csv_value(media_url.split("/")[-1]) if media_url else ""

    row = [
        "",  # note_id blank
        "link",  # default Type
        label,
        url,
        clean_csv_value(li.get_text(" ", strip=True)),  # Content
        keyword,  # auto-derived from filename
        media_url,
        media_type,
        media_file_name,
        "",  # Emoji
        pos,
    ]

    # Add link_1,label_1 ... link_10,label_10
    for href, lbl in link_data:
        row.extend((href, lbl))
    return row


# --- Step 3: process all HTML files in folder ---
# Only regular files are considered (a directory named "*.html" would make
# open() fail), and they are sorted because os.listdir() order is arbitrary.
html_files = sorted(
    f
    for f in os.listdir(folder)
    if f.lower().endswith(".html") and os.path.isfile(os.path.join(folder, f))
)

if not html_files:
    print("❌ No HTML files found in the selected folder.")
    # `exit()` comes from the `site` module and is meant for interactive use;
    # SystemExit is always available and keeps the same exit status (0).
    raise SystemExit

for filename in html_files:
    filepath = os.path.join(folder, filename)
    keyword = derive_keyword(filename)

    if not keyword:
        print(f"⚠️ No keyword found in filename: {filename} (Keywords will be blank)")

    # --- Read HTML file ---
    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    # --- Process <li> elements (Position is the 1-based document order) ---
    rows = [
        _li_to_row(li, pos, keyword)
        for pos, li in enumerate(soup.find_all("li"), start=1)
    ]

    # --- Write CSV with BOM for Excel ---
    out_file = os.path.join(folder, os.path.splitext(filename)[0] + "_converted.csv")
    with open(out_file, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)  # force quoting all cells
        writer.writerow(headers)
        writer.writerows(rows)

    print(f"✅ Processed {filename} → {os.path.basename(out_file)} (keyword: '{keyword}')")

print("🎉 All files processed successfully.")
