"""Convert the <li> items of a user-selected HTML file into a CSV file.

The script asks for a main keyword, lets the user pick an HTML file through a
Tk file dialog, turns every <li> element into one CSV row and writes the
result next to the input file as ``<name>_converted.csv``.
"""
import csv
import os
import posixpath
from urllib.parse import urlsplit

# Number of additional anchors per <li> exported as link_n / label_n columns.
MAX_EXTRA_LINKS = 10

# Leading characters that spreadsheet applications interpret as a formula.
FORMULA_PREFIXES = ("=", "+", "-", "@")

BASE_HEADERS = [
    "note_id", "Type", "Label", "URL", "Content", "Keywords",
    "Media URL", "Media Type", "Media File Name", "Emoji", "Position",
]


def build_headers() -> list:
    """Return the CSV header row: the fixed columns plus link_n/label_n pairs."""
    columns = list(BASE_HEADERS)
    for i in range(1, MAX_EXTRA_LINKS + 1):
        columns.extend((f"link_{i}", f"label_{i}"))
    return columns


# --- Step 3: define CSV schema ---
HEADERS = build_headers()


# --- Helper: clean cell values to prevent CSV spills ---
def clean_csv_value(val: str) -> str:
    """Return *val* as a single-line string that is safe to place in a cell.

    Falsy input (``None``, ``""``) yields an empty string. Carriage returns,
    newlines and tabs are replaced by spaces and the result is stripped.
    Values starting with ``=``, ``+``, ``-`` or ``@`` are prefixed with an
    apostrophe so spreadsheets do not evaluate them as formulas.
    """
    if not val:
        return ""  # keep empty
    val = str(val)
    # normalize whitespace
    val = val.replace("\r", " ").replace("\n", " ").replace("\t", " ").strip()
    # escape dangerous Excel formulas (avoid CSV injection)
    if val.startswith(FORMULA_PREFIXES):
        val = "'" + val
    return val


def media_file_name_from_url(media_url: str) -> str:
    """Return the last path segment of *media_url*, or "" when there is none.

    Only the path component is used, so a query string or fragment
    (``a.png?w=100#x``) does not end up in the file name.
    """
    if not media_url:
        return ""
    try:
        path = urlsplit(media_url).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. a broken IPv6 host);
        # fall back to the plain last segment rather than aborting the run.
        path = media_url
    return posixpath.basename(path)


def derive_output_path(filename: str) -> str:
    """Return the CSV path for *filename*: its extension replaced by
    ``_converted.csv``.

    ``os.path.splitext`` only looks at the final path component, so a dot in
    a directory name is never mistaken for the file extension.
    """
    root, _ext = os.path.splitext(filename)
    return root + "_converted.csv"


def build_row(li, pos: int, keyword: str) -> list:
    """Build one CSV row from a single <li> element.

    *li* is expected to be a BeautifulSoup tag (the code relies on its
    ``find_all``, ``get`` and ``get_text`` methods). *pos* is the 1-based
    position of the element and *keyword* fills the "Keywords" column.
    """
    anchors = li.find_all("a")
    images = li.find_all("img")

    # First anchor = main Label + URL
    if anchors:
        label = clean_csv_value(anchors[0].get_text(strip=True))
        url = clean_csv_value(anchors[0].get("href"))
    else:
        label = url = ""

    # Other anchors = link_n + label_n (up to MAX_EXTRA_LINKS additional)
    link_data = [
        (clean_csv_value(a.get("href")), clean_csv_value(a.get_text(strip=True)))
        for a in anchors[1:1 + MAX_EXTRA_LINKS]
    ]
    # pad missing slots with blanks so every row has the same width
    link_data.extend([("", "")] * (MAX_EXTRA_LINKS - len(link_data)))

    # Image handling: only the first <img> of the item is exported
    media_url = clean_csv_value(images[0].get("src")) if images else ""
    media_type = "image" if media_url else ""
    media_file_name = clean_csv_value(media_file_name_from_url(media_url))

    row = [
        "",      # note_id blank
        "link",  # default Type
        label,
        url,
        clean_csv_value(li.get_text(" ", strip=True)),  # Content
        clean_csv_value(keyword),                       # Keywords
        media_url,
        media_type,
        media_file_name,
        "",      # Emoji
        pos,
    ]
    # Add link_1,label_1 ... link_10,label_10
    for href, lbl in link_data:
        row.extend([href, lbl])
    return row


# --- Step 4: process <li> elements ---
def build_rows(soup, keyword: str) -> list:
    """Return one CSV row per <li> element found in *soup*, in document order."""
    return [
        build_row(li, pos, keyword)
        for pos, li in enumerate(soup.find_all("li"), start=1)
    ]


# --- Step 1: ask user to select the HTML file ---
def ask_for_html_file() -> str:
    """Open a file dialog and return the chosen path (empty if cancelled)."""
    # Imported here so the pure helpers above stay importable on machines
    # without a display or without tkinter installed.
    from tkinter import Tk, filedialog

    root = Tk()
    root.withdraw()  # hide the empty root window; only the dialog is shown
    try:
        return filedialog.askopenfilename(
            title="Select HTML file",
            filetypes=[("HTML Files", "*.html"), ("All Files", "*.*")],
        )
    finally:
        root.destroy()  # release the hidden root window


# --- Step 5: write CSV with BOM for Excel ---
def write_csv(out_file: str, rows: list) -> None:
    """Write HEADERS and *rows* to *out_file* as UTF-8 with BOM, all cells quoted."""
    with open(out_file, "w", newline="", encoding="utf-8-sig") as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)  # force quoting all cells
        writer.writerow(HEADERS)
        writer.writerows(rows)


def main() -> None:
    """Run the interactive conversion: prompt, pick file, parse, write CSV."""
    # Imported first so a missing dependency is reported before any prompt.
    from bs4 import BeautifulSoup

    # --- Step 0: ask for main keyword ---
    mkwrd = input("Main keyword? ").strip()

    filename = ask_for_html_file()
    if not filename:
        print("❌ No file selected. Exiting.")
        return

    # --- Step 2: read HTML file ---
    with open(filename, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    rows = build_rows(soup, mkwrd)

    out_file = derive_output_path(filename)
    write_csv(out_file, rows)

    print(f"✅ Conversion complete. CSV saved as: {out_file}")


if __name__ == "__main__":
    main()