import csv
import os
from tkinter import Tk, filedialog

from bs4 import BeautifulSoup
# --- Step 0: ask for main keyword ---
# The trimmed answer is written to the "Keywords" column of every row.
mkwrd = input("Main keyword? ").strip()

# --- Step 1: ask user to select the HTML file ---
# The dialog needs a Tk root window; hide it so only the picker is shown,
# and destroy it once the dialog returns so it does not linger.
root = Tk()
root.withdraw()
filename = filedialog.askopenfilename(
    title="Select HTML file",
    filetypes=[("HTML Files", "*.html"), ("All Files", "*.*")],
)
root.destroy()

if not filename:
    print("❌ No file selected. Exiting.")
    # Raise SystemExit directly: the exit() helper is injected by the site
    # module for interactive sessions and should not be relied on in scripts.
    raise SystemExit

# --- Step 2: read HTML file ---
# The file is decoded as UTF-8 and parsed with the built-in "html.parser".
with open(filename, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")
# --- Step 3: define CSV schema ---
# Eleven fixed columns, followed by ten (link_n, label_n) column pairs.
headers = [
    "note_id", "Type", "Label", "URL", "Content", "Keywords",
    "Media URL", "Media Type", "Media File Name", "Emoji", "Position",
]
for i in range(1, 11):
    headers.extend((f"link_{i}", f"label_{i}"))

# Accumulates one list per output row, in the same column order as `headers`.
rows = []
# --- Helper: clean cell values to prevent CSV spills ---
def clean_csv_value(val: object) -> str:
    """Return *val* as a single-line string that is safe to place in a CSV cell.

    Args:
        val: The raw cell value. ``None`` is treated as "no value"; anything
            else is converted with ``str()``.

    Returns:
        The text with carriage returns, newlines and tabs replaced by spaces
        and surrounding whitespace stripped. If the result starts with
        ``=``, ``+``, ``-`` or ``@`` it is prefixed with an apostrophe so a
        spreadsheet application does not evaluate it as a formula.
    """
    # Only None means "missing". Falsy values such as 0 or False are real
    # data and must survive as text rather than being blanked out.
    if val is None:
        return ""
    text = str(val)
    # normalize whitespace so a value never spans multiple physical lines
    text = text.replace("\r", " ").replace("\n", " ").replace("\t", " ").strip()
    # escape dangerous spreadsheet formulas (avoid CSV injection)
    if text.startswith(("=", "+", "-", "@")):
        text = "'" + text
    return text
# --- Step 4: process <li> elements ---
# Build one CSV row per <li> element; Position is the 1-based index of the
# element in document order.
for pos, li in enumerate(soup.find_all("li"), start=1):
    anchors = li.find_all("a")
    images = li.find_all("img")

    # First anchor = main Label + URL
    if anchors:
        first_anchor = anchors[0]
        label = clean_csv_value(first_anchor.get_text(strip=True))
        url = clean_csv_value(first_anchor["href"]) if first_anchor.has_attr("href") else ""
    else:
        label = ""
        url = ""

    # Other anchors = link_n + label_n (up to 10 additional; extras are dropped)
    link_data = []
    for a in anchors[1:11]:
        lbl = clean_csv_value(a.get_text(strip=True))
        href = clean_csv_value(a["href"]) if a.has_attr("href") else ""
        link_data.append((href, lbl))
    # pad missing slots with blanks so every row has all 10 link/label pairs
    link_data.extend([("", "")] * (10 - len(link_data)))

    # Image handling: only the first <img> inside the <li> is used
    if images and images[0].has_attr("src"):
        media_url = clean_csv_value(images[0]["src"])
    else:
        media_url = ""
    media_type = "image" if media_url else ""
    # File name = text after the last "/" of the (already cleaned) media URL
    media_file_name = clean_csv_value(media_url.split("/")[-1]) if media_url else ""

    row = [
        "",  # note_id blank
        "link",  # default Type
        label,
        url,
        clean_csv_value(li.get_text(" ", strip=True)),  # Content
        clean_csv_value(mkwrd),  # Keywords
        media_url,
        media_type,
        media_file_name,
        "",  # Emoji
        pos,
    ]
    # Add link_1,label_1 ... link_10,label_10
    for href, lbl in link_data:
        row.extend([href, lbl])
    rows.append(row)
# --- Step 5: write CSV with BOM for Excel ---
# Strip only the file's own extension. os.path.splitext inspects the last
# path component, so a dot in a parent directory name is never mistaken for
# the extension when the selected file has none.
out_file = os.path.splitext(filename)[0] + "_converted.csv"
# "utf-8-sig" writes a BOM; newline="" lets the csv module control line endings.
with open(out_file, "w", newline="", encoding="utf-8-sig") as csvfile:
    writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)  # force quoting all cells
    writer.writerow(headers)
    writer.writerows(rows)
print(f"✅ Conversion complete. CSV saved as: {out_file}")