import csv
import os
from tkinter import Tk, filedialog

from bs4 import BeautifulSoup
# --- Step 0: ask for main keyword ---
# The keyword is written unchanged into the "Keywords" column of every row.
mkwrd = input("Main keyword? ").strip()

# --- Step 1: ask user to select the HTML file ---
# Keep a reference to the hidden root window so it can be destroyed once the
# dialog has closed instead of lingering for the rest of the run.
root = Tk()
root.withdraw()
filename = filedialog.askopenfilename(
    title="Select HTML file",
    filetypes=[("HTML Files", "*.html"), ("All Files", "*.*")],
)
root.destroy()
if not filename:
    print("❌ No file selected. Exiting.")
    # exit() is injected by the site module and can be missing (python -S,
    # frozen executables); raising SystemExit works everywhere.
    raise SystemExit

# --- Step 2: read HTML file ---
# Parse while the file is still open; BeautifulSoup reads from the handle.
with open(filename, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")
# --- Step 3: define CSV schema ---
# Eleven fixed columns, followed by ten (link_n, label_n) column pairs that
# hold the additional anchors found in each list item.
headers = [
    "note_id", "Type", "Label", "URL", "Content", "Keywords",
    "Media URL", "Media Type", "Media File Name", "Emoji", "Position",
]
for i in range(1, 11):
    headers.extend((f"link_{i}", f"label_{i}"))

# One list per <li> element, in the same column order as `headers`.
rows = []
# --- Step 4: process <li> elements ---
# Each <li> becomes one CSV row. find_all() searches recursively, so a nested
# <li> gets its own row and its text also appears in its parent's Content.
for pos, li in enumerate(soup.find_all("li"), start=1):
    anchors = li.find_all("a")
    images = li.find_all("img")

    # First anchor = main Label + URL (both blank when the item has no <a>)
    if anchors:
        label = anchors[0].get_text(strip=True)
        url = anchors[0].get("href", "")
    else:
        label = url = ""

    # Other anchors = (link_n, label_n) pairs, at most 10 additional anchors
    link_data = [
        (a.get("href", ""), a.get_text(strip=True)) for a in anchors[1:11]
    ]
    # Pad missing slots with blanks so every row has all 10 pairs
    link_data.extend([("", "")] * (10 - len(link_data)))

    # Image handling: only the first <img> in the item is considered
    media_url = images[0].get("src", "") if images else ""
    media_type = "image" if media_url else ""
    # Last "/"-separated segment of the URL; any query string is kept as-is
    media_file_name = media_url.split("/")[-1] if media_url else ""

    row = [
        "",  # note_id blank
        "link",  # default Type
        label,
        url,
        li.get_text(" ", strip=True),  # Content
        mkwrd,  # Keywords
        media_url,
        media_type,
        media_file_name,
        "",  # Emoji
        pos,
    ]
    # Add link_1,label_1 ... link_10,label_10
    for href, lbl in link_data:
        row.extend((href, lbl))
    rows.append(row)
# --- Step 5: write CSV with BOM for Excel ---
# splitext() only strips an extension from the final path component, so a dot
# in a directory name cannot truncate the output path when the selected file
# has no extension.
out_file = os.path.splitext(filename)[0] + "_converted.csv"
# newline="" lets the csv module control line endings; utf-8-sig writes the
# BOM mentioned in the step title.
with open(out_file, "w", newline="", encoding="utf-8-sig") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    writer.writerows(rows)
print(f"✅ Conversion complete. CSV saved as: {out_file}")