"""Convert the <li> entries of HTML files in a chosen folder into CSV files.

The user picks a folder through a Tk directory dialog. For every ``*.html``
file in that folder, each ``<li>`` element becomes one CSV row (label, URL,
text content, first image, and up to ten additional links), and the rows are
written next to the source file as ``<name>_converted.csv``.
"""
import csv
import os

# Number of link_n/label_n column pairs that follow the fixed columns.
MAX_EXTRA_LINKS = 10

# --- CSV schema ---
headers = [
    "note_id", "Type", "Label", "URL", "Content", "Keywords",
    "Media URL", "Media Type", "Media File Name", "Emoji", "Position",
]
for i in range(1, MAX_EXTRA_LINKS + 1):
    headers.append(f"link_{i}")
    headers.append(f"label_{i}")


def clean_csv_value(val: str) -> str:
    """Return *val* as a single-line, spreadsheet-safe cell value.

    Falsy input (``None``, ``""``) yields ``""``. Carriage returns, newlines
    and tabs are each replaced by a space and the result is stripped. A value
    starting with ``=``, ``+``, ``-`` or ``@`` is prefixed with an apostrophe
    so spreadsheet applications do not evaluate it as a formula.
    """
    if not val:
        return ""
    val = str(val)
    val = val.replace("\r", " ").replace("\n", " ").replace("\t", " ").strip()
    if val.startswith(("=", "+", "-", "@")):
        val = "'" + val
    return val


def derive_keyword(filename: str) -> str:
    """Derive the keyword embedded in *filename*.

    The keyword is the text between the first ``p_`` marker (or, when there
    is no ``p_``, the first ``my_`` marker) and the next underscore, taken
    from the lower-cased file name. Returns ``""`` when neither marker is
    present. The result is passed through :func:`clean_csv_value`.
    """
    # Drop the extension first: otherwise a keyword in the last segment keeps
    # it ("p_cats.html" would yield "cats.html" instead of "cats").
    stem = os.path.splitext(os.path.basename(filename))[0].lower()
    keyword = ""
    for marker in ("p_", "my_"):  # "p_" takes precedence over "my_"
        if marker in stem:
            keyword = stem.split(marker, 1)[1].split("_", 1)[0]
            break
    return clean_csv_value(keyword)


def _select_folder() -> str:
    """Show a directory chooser and return the chosen path (falsy if cancelled)."""
    # Imported here so the helpers above stay importable without a GUI toolkit.
    from tkinter import Tk, filedialog

    root = Tk()
    root.withdraw()  # hide the empty root window; only the dialog is shown
    try:
        return filedialog.askdirectory(title="Select Folder with HTML files")
    finally:
        # Release the hidden root window once the dialog is closed.
        root.destroy()


def _build_row(li, pos: int, keyword: str) -> list:
    """Build one CSV row (matching ``headers``) from a parsed ``<li>`` tag."""
    anchors = li.find_all("a")
    images = li.find_all("img")

    # First anchor = main Label + URL.
    label = clean_csv_value(anchors[0].get_text(strip=True)) if anchors else ""
    url = clean_csv_value(anchors[0].get("href")) if anchors else ""

    # Remaining anchors fill link_n/label_n; anything beyond the limit is dropped.
    link_data = [
        (clean_csv_value(a.get("href")), clean_csv_value(a.get_text(strip=True)))
        for a in anchors[1:MAX_EXTRA_LINKS + 1]
    ]
    # Pad missing slots with blanks so every row has the same width.
    link_data += [("", "")] * (MAX_EXTRA_LINKS - len(link_data))

    # Only the first image of the item is recorded.
    media_url = clean_csv_value(images[0].get("src")) if images else ""
    media_type = "image" if media_url else ""
    media_file_name = clean_csv_value(media_url.split("/")[-1]) if media_url else ""

    row = [
        "",  # note_id blank
        "link",  # default Type
        label,
        url,
        clean_csv_value(li.get_text(" ", strip=True)),  # Content
        keyword,  # auto-derived from filename
        media_url,
        media_type,
        media_file_name,
        "",  # Emoji
        pos,
    ]
    for href, lbl in link_data:
        row.extend([href, lbl])
    return row


def _write_csv(out_file: str, rows: list) -> None:
    """Write ``headers`` and *rows* to *out_file* as fully quoted UTF-8 CSV."""
    # "utf-8-sig" emits a BOM so Excel detects the encoding.
    with open(out_file, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)  # force quoting all cells
        writer.writerow(headers)
        writer.writerows(rows)


def main() -> None:
    """Ask for a folder and convert every HTML file in it to a CSV file."""
    # Imported up front so a missing dependency fails before the dialog opens.
    from bs4 import BeautifulSoup

    # --- Step 1: ask user to select a folder ---
    folder = _select_folder()
    if not folder:
        print("❌ No folder selected. Exiting.")
        return

    # --- Step 2: collect the HTML files (regular files only, stable order) ---
    html_files = sorted(
        f for f in os.listdir(folder)
        if f.lower().endswith(".html") and os.path.isfile(os.path.join(folder, f))
    )
    if not html_files:
        print("❌ No HTML files found in the selected folder.")
        return

    # --- Step 3: process all HTML files in folder ---
    for filename in html_files:
        filepath = os.path.join(folder, filename)
        keyword = derive_keyword(filename)
        if not keyword:
            print(f"⚠️ No keyword found in filename: {filename} (Keywords will be blank)")

        # --- Read HTML file ---
        with open(filepath, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")

        # --- Process <li> elements (Position is 1-based document order) ---
        rows = [
            _build_row(li, pos, keyword)
            for pos, li in enumerate(soup.find_all("li"), start=1)
        ]

        # --- Write CSV next to the source file ---
        out_file = os.path.join(folder, filename.rsplit(".", 1)[0] + "_converted.csv")
        _write_csv(out_file, rows)
        print(f"✅ Processed {filename} → {os.path.basename(out_file)} (keyword: '{keyword}')")

    print("🎉 All files processed successfully.")


if __name__ == "__main__":
    main()