#!/usr/bin/env python3
# Clean version: tg_fetch_default_truedelta_with_previews.py
import argparse, os, sys, json, csv, html, datetime as dt, re, mimetypes
from pathlib import Path
from urllib.parse import urlparse, parse_qs
from telethon import TelegramClient
from telethon.errors import RPCError
from telethon.tl.types import Message

# `requests` is an optional dependency: when the import fails the name is bound
# to None, and the preview helpers below check for that before doing any HTTP.
try:
    import requests
except Exception:
    requests = None

# Best effort: ask stdout/stderr to encode as UTF-8 and substitute characters
# that cannot be encoded. Any failure here is ignored and the streams are left
# as they were.
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    sys.stderr.reconfigure(encoding="utf-8", errors="replace")
except Exception:
    pass

# python-dotenv is optional too. load_dotenv() runs at import time, i.e. before
# the os.getenv() lookups made when the argument parser is built.
# NOTE(review): presumably this populates os.environ from a local .env file
# (main's error messages mention ".env") -- confirm against python-dotenv docs.
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

def env(name, default=None, cast=str):
    """Look up environment variable *name* and optionally convert it.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is not set.
        cast: Callable applied to the value that was found (or to *default*);
            a falsy *cast* returns the value unconverted.

    Returns:
        None when neither the variable nor a default is available, otherwise
        the (possibly converted) value.
    """
    raw = os.environ.get(name, default)
    if raw is None:
        return None
    if not cast:
        return raw
    return cast(raw)

def parse_date(s):
    """Parse a ``YYYY-MM-DD`` or ``YYYY-MM-DDTHH:MM`` string.

    Returns:
        None for an empty/None input, otherwise a naive ``datetime`` (no
        tzinfo is attached here).

    Raises:
        ValueError: if *s* matches neither accepted format.
    """
    if not s:
        return None
    accepted_formats = ("%Y-%m-%d", "%Y-%m-%dT%H:%M")
    for pattern in accepted_formats:
        try:
            parsed = dt.datetime.strptime(s, pattern)
        except ValueError:
            # Not this layout; try the next one.
            continue
        return parsed
    raise ValueError(f"Bad date: {s} (use YYYY-MM-DD or YYYY-MM-DDTHH:MM)")

def _reaction_label(reaction) -> str:
    """Return a short printable label for one reaction value.

    Handles a plain string, an object exposing ``emoticon`` (standard emoji)
    or one exposing ``document_id`` (custom emoji); anything else falls back
    to ``str(reaction)``.
    """
    emoticon = getattr(reaction, "emoticon", None)
    if emoticon:
        return str(emoticon)
    document_id = getattr(reaction, "document_id", None)
    if document_id is not None:
        return f"custom:{document_id}"
    return str(reaction)


def to_row(m: Message, media_path: str = ""):
    """Flatten one message into a plain dict used by the JSON/CSV/HTML writers.

    Args:
        m: Message object; only attribute access is used (``id``, ``date``,
            ``from_id``, ``message``, ``reply_to_msg_id``, ``views``,
            ``forwards``, ``reactions``, ``media``, ``entities``).
        media_path: Path of the downloaded attachment, or "" when none.

    Returns:
        dict with the keys id, date, sender_id, text, reply_to_msg_id, views,
        forwards, reactions, media, media_path, entities. Missing values are
        stored as "" so every column is always present.
    """
    peer = m.from_id
    # The sender peer may carry a user, channel or basic-group id.
    sender_id = (
        getattr(peer, "user_id", None)
        or getattr(peer, "channel_id", None)
        or getattr(peer, "chat_id", None)
        or ""
    )

    reactions = getattr(m, "reactions", None)
    counts = (getattr(reactions, "results", None) or []) if reactions else []
    # Label each reaction by its emoji rather than by the object's repr.
    reaction_text = ",".join(
        f"{_reaction_label(rc.reaction)}: {rc.count}" for rc in counts
    )

    return {
        "id": m.id,
        "date": m.date.isoformat() if m.date else "",
        "sender_id": sender_id,
        "text": m.message or "",
        "reply_to_msg_id": m.reply_to_msg_id or "",
        "views": m.views or "",
        "forwards": m.forwards or "",
        "reactions": reaction_text,
        "media": type(m.media).__name__ if m.media else "",
        "media_path": media_path,
        # Only the type name of the entities container is recorded.
        "entities": type(m.entities).__name__ if m.entities else "",
    }

def ensure_dir(p: str | Path):
    Path(p).mkdir(parents=True, exist_ok=True)
    return Path(p)

def youtube_thumb(url: str) -> str | None:
    try:
        u = urlparse(url)
        host = (u.netloc or "").lower()
        path = u.path or ""
        vid = None
        if "youtube.com" in host:
            if path.startswith("/watch"):
                q = parse_qs(u.query or "")
                vid = (q.get("v") or [None])[0]
            elif path.startswith("/shorts/") or path.startswith("/live/"):
                parts = path.strip("/").split("/")
                if len(parts) >= 2:
                    vid = parts[1]
        elif "youtu.be" in host:
            vid = path.strip("/").split("/")[0] or None
        if vid:
            return f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg"
    except Exception:
        pass
    return None

def get_og_image(url: str, timeout: float = 5.0) -> str | None:
    """Fetch *url* and return the absolute URL of its ``og:image``, if any.

    Args:
        url: Page to inspect.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The image URL resolved against the final (post-redirect) page URL, or
        None when ``requests`` is unavailable, the request fails or returns an
        error status, the response is not HTML, or no ``og:image`` meta tag is
        found.

    Previews are best effort, so every failure is reported as None rather than
    raised.
    """
    if requests is None:
        return None
    # Function-scope import: the file's top-level import only brings in
    # urlparse and parse_qs.
    from urllib.parse import urljoin

    headers = {"User-Agent": "Mozilla/5.0 (compatible; ExportScript/1.0)"}
    try:
        r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
        if not r.ok:
            # Don't scrape error pages for preview images.
            return None
        ctype = (r.headers.get("content-type") or "").lower()
        if "text/html" not in ctype:
            return None
        html_txt = r.text
        # Accept both attribute orders: property/name first, or content first.
        m = re.search(r'<meta\s+(?:property|name)\s*=\s*["\']og:image["\']\s+content\s*=\s*["\']([^"\']+)["\']', html_txt, re.IGNORECASE)
        if not m:
            m = re.search(r'<meta\s+content\s*=\s*["\']([^"\']+)["\']\s+(?:property|name)\s*=\s*["\']og:image["\']', html_txt, re.IGNORECASE)
        if not m:
            return None
        # Attribute values are HTML-encoded in the page source (e.g. "&amp;").
        img = html.unescape(m.group(1)).strip()
        if not img:
            return None
        # urljoin handles absolute, protocol-relative ("//cdn..."),
        # root-relative and path-relative references uniformly.
        return urljoin(r.url or url, img)
    except Exception:
        return None

def normalize_chat_ref(ref):
    """Turn a string holding an optionally signed integer into an ``int``.

    Anything else (usernames, links, values that are not strings, digit
    strings that ``int`` refuses) is returned unchanged.
    """
    if not isinstance(ref, str):
        return ref
    if re.fullmatch(r"-?\d+", ref) is None:
        return ref
    try:
        numeric = int(ref)
    except ValueError:
        return ref
    return numeric

def parse_tme_link(s):
    """Extract ``(chat_id, topic_id)`` from a ``t.me/c/<id>/<topic>`` link.

    The chat id is built by prefixing the numeric path segment with ``-100``.
    Returns ``(None, None)`` for anything that is not such a link, including
    input that cannot be parsed at all.
    """
    no_match = (None, None)
    try:
        parsed = urlparse(s)
        if parsed.netloc not in {"t.me", "telegram.me"}:
            return no_match
        if not parsed.path.startswith("/c/"):
            return no_match
        segments = parsed.path.strip("/").split("/")
        if len(segments) < 3:
            return no_match
        internal_id, topic = segments[1], segments[2]
        if not (internal_id.isdigit() and topic.isdigit()):
            return no_match
        return int("-100" + internal_id), int(topic)
    except Exception:
        return no_match

def write_html_single(out_path: Path, chat_label: str, rows: list[dict], enable_yt=True, enable_og=True, download_previews=False, media_dir: Path | None=None):
    ensure_dir(out_path.parent)
    def maybe_download(url: str) -> str:
        if not download_previews or media_dir is None or requests is None:
            return url
        try:
            resp = requests.get(url, timeout=6, stream=True)
            if resp.status_code != 200:
                return url
            import mimetypes as _m
            ext = _m.guess_extension((resp.headers.get("content-type") or "").split(";")[0].strip()) or ".jpg"
            fname = f"preview_{abs(hash(url))}{ext}"
            ensure_dir(media_dir)
            out = media_dir / fname
            with open(out, "wb") as f:
                for chunk in resp.iter_content(65536):
                    if chunk:
                        f.write(chunk)
            return str(out)
        except Exception:
            return url

    def row_to_html(r):
        body_text = r.get("text") or ""
        body = html.escape(body_text).replace("\n", "<br>")
        meta = []
        if r["sender_id"]: meta.append(f"from: {r['sender_id']}")
        if r["views"]: meta.append(f"views: {r['views']}")
        if r["forwards"]: meta.append(f"fwd: {r['forwards']}")
        if r["reactions"]: meta.append(f"react: {html.escape(r['reactions'])}")
        if r["reply_to_msg_id"]: meta.append(f"reply→{r['reply_to_msg_id']}")
        meta_str = " • ".join(meta)

        previews = []
        for mm in re.finditer(r'https?://\S+', body_text):
            u = mm.group(0).rstrip(").,]")
            thumb = youtube_thumb(u) if enable_yt else None
            if enable_og and not thumb:
                thumb = get_og_image(u)
            if thumb:
                thumb = maybe_download(thumb)
                previews.append(f'<a class="lp" href="{html.escape(u)}" target="_blank" rel="noopener"><img src="{html.escape(thumb)}" alt="link preview"></a>')
        previews_html = "".join(previews)

        media_html = ""
        if r["media_path"]:
            rel = html.escape(r["media_path"])
            lower = rel.lower()
            if lower.endswith((".png",".jpg",".jpeg",".gif",".webp",".bmp")):
                media_html = f'<div class="media"><img src="{rel}" alt="media"></div>'
            elif lower.endswith((".mp4",".webm",".ogg",".mkv",".mov")):
                media_html = f'<div class="media"><video src="{rel}" controls></video></div>'
            else:
                media_html = f'<div class="media"><a href="{rel}">Download attachment</a></div>'

        return f"""
        <article id="m{r['id']}">
          <header>
            <a class="msgid" href="#m{r['id']}">#{r['id']}</a>
            <time>{html.escape(r['date'])}</time>
            <span class="meta">{meta_str}</span>
          </header>
          <div class="text">{body}</div>
          <div class="previews">{previews_html}</div>
          {media_html}
        </article>
        """

    items = "\n".join(row_to_html(r) for r in rows)
    title = f"Telegram export — {html.escape(chat_label)}"
    doc = f"""<!doctype html>
<html lang="en">
<meta charset="utf-8">
<title>{title}</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
body{{font:14px/1.4 system-ui,Segoe UI,Arial,sans-serif;max-width:900px;margin:2rem auto;padding:0 1rem;background:#0b0b0b;color:#eaeaea}}
h1{{font-size:1.4rem;margin:0 0 1rem}}
header.top{{display:flex;gap:1rem;align-items:baseline;justify-content:space-between}}
article{{border:1px solid #2a2a2a;border-radius:14px;padding:12px 14px;margin:12px 0;background:#141414;box-shadow:0 1px 2px rgba(0,0,0,.2)}}
article header{{display:flex;gap:.8rem;align-items:center;font-size:.85rem;color:#bdbdbd}}
article header .msgid{{text-decoration:none;color:#8ab4f8}}
article .text{{margin-top:.3rem;white-space:normal;word-wrap:break-word}}
.media img, .media video{{max-width:100%;height:auto;border-radius:10px;margin-top:.6rem}}
.previews img{{max-width:100%;height:auto;border-radius:10px;margin-top:.6rem;display:block}}
footer{{opacity:.7;font-size:.8rem;margin:2rem 0}}
</style>
<body>
<header class="top">
  <h1>{title}</h1>
  <div>{len(rows)} messages</div>
</header>
{items}
<footer>Generated by tg_fetch_default_truedelta_with_previews.py</footer>
</body>
</html>
"""
    out_path.write_text(doc, encoding="utf-8")

def build_parser():
    """Build the command-line parser for the exporter.

    Credential-related options take their defaults from the environment
    (SESSION, API_ID, API_HASH, PHONE_NUMBER).

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    p = argparse.ArgumentParser(description="Fetch/export Telegram messages from a chat or a forum topic.")
    p.add_argument("--from", dest="chat", default="@TrueDelta", help="Target chat: @username, numeric id (-100...), or t.me/c/.../.... Defaults to @TrueDelta.")
    p.add_argument("--topic-id", type=int, help="Forum topic id (root message id).")
    p.add_argument("--limit", type=int, default=200, help="Max messages (0 = all).")
    p.add_argument("--since", help="Start date (YYYY-MM-DD or YYYY-MM-DDTHH:MM).")
    p.add_argument("--until", help="End date (YYYY-MM-DD or YYYY-MM-DDTHH:MM).")
    p.add_argument("--query", help="Substring filter on text (case-insensitive).")
    p.add_argument("--reverse", action="store_true", help="Oldest first (default newest first).")
    p.add_argument("--out-json", help="Write results to JSON.")
    p.add_argument("--out-csv", help="Write results to CSV.")
    p.add_argument("--print", action="store_true", help="Print to stdout.")
    p.add_argument("--media-dir", help="Download media to this folder (optional).")
    p.add_argument("--out-html", help="Write a single combined HTML file here.")
    p.add_argument("--no-link-previews", action="store_true", help="Disable link previews (YouTube/OG).")
    p.add_argument("--download-previews", action="store_true", help="Download preview thumbnails into media-dir (requires --media-dir).")
    p.add_argument("--session", default=os.getenv("SESSION","tg_session"))
    # argparse runs string defaults through `type` itself, and only when the
    # flag is absent. So a malformed API_ID becomes a normal usage error
    # instead of a traceback while building the parser, and does not get in
    # the way when --api-id is given explicitly. An empty value counts as unset.
    p.add_argument("--api-id", type=int, default=os.getenv("API_ID") or None)
    p.add_argument("--api-hash", default=os.getenv("API_HASH"))
    p.add_argument("--phone", default=os.getenv("PHONE_NUMBER"))
    return p

def main():
    """Command-line entry point: fetch messages and write the requested exports.

    Steps: parse the flags, validate credentials and date filters, log in,
    resolve the target chat (and optional forum topic), iterate its messages
    applying the --since/--until/--query filters, optionally download media,
    then write JSON/CSV/HTML and/or print a one-line summary per message.

    Exits with status 1 on missing credentials, an unparseable date, a failed
    login or an unresolvable chat.
    """
    args = build_parser().parse_args()
    if not args.api_id or not args.api_hash:
        print("ERROR: API_ID/API_HASH missing (.env or flags).", file=sys.stderr)
        sys.exit(1)

    def as_utc(value):
        # Python refuses to order naive against timezone-aware datetimes, and
        # Telethon documents message dates as timezone-aware UTC. The dates
        # typed on the command line are therefore interpreted as UTC, which is
        # also the zone of the exported "date" column.
        if value is not None and value.tzinfo is None:
            return value.replace(tzinfo=dt.timezone.utc)
        return value

    try:
        since_dt = as_utc(parse_date(args.since)) if args.since else None
        until_dt = as_utc(parse_date(args.until)) if args.until else None
    except ValueError as e:
        # A mistyped date is a usage error, not a crash.
        print(f"ERROR: {e}", file=sys.stderr)
        sys.exit(1)
    text_q = args.query.lower() if args.query else None
    lim = None if args.limit == 0 else args.limit

    if args.download_previews and not args.media_dir:
        print("WARNING: --download-previews has no effect without --media-dir.", file=sys.stderr)

    client = TelegramClient(args.session, args.api_id, args.api_hash)

    async def run():
        # NOTE(review): `with client:` below may already start/authorize the
        # session before run() executes, which would make this manual login
        # path redundant -- confirm against the Telethon client docs.
        if not await client.is_user_authorized():
            if not args.phone:
                print("First login requires --phone or PHONE_NUMBER in .env", file=sys.stderr)
                sys.exit(1)
            await client.send_code_request(args.phone)
            code = input("Enter the login code you received in Telegram: ").strip()
            try:
                await client.sign_in(args.phone, code)
            except RPCError as e:
                print(f"Login failed: {e}", file=sys.stderr)
                sys.exit(1)

        # A t.me/c/<id>/<topic> link supplies both the chat id and, unless
        # --topic-id was given explicitly, the topic id.
        chat_ref = args.chat
        if isinstance(chat_ref, str):
            chat_id_from_link, topic_id_from_link = parse_tme_link(chat_ref)
            if chat_id_from_link is not None:
                chat_ref = chat_id_from_link
                if args.topic_id is None and topic_id_from_link is not None:
                    args.topic_id = topic_id_from_link
        chat_ref = normalize_chat_ref(chat_ref)

        try:
            entity = await client.get_entity(chat_ref)
        except Exception as e:
            print(f"Could not resolve chat '{chat_ref}': {e}", file=sys.stderr)
            sys.exit(1)

        results = []
        iter_kwargs = {"entity": entity, "limit": lim, "reverse": args.reverse}
        if args.topic_id:
            iter_kwargs["reply_to"] = args.topic_id

        media_dir = None
        if args.media_dir:
            media_dir = ensure_dir(args.media_dir)

        async for m in client.iter_messages(**iter_kwargs):
            if since_dt and (m.date is None or m.date < since_dt):
                if args.reverse:
                    # Oldest first: too-old messages come first, skip them.
                    continue
                # Newest first: everything that follows is older still.
                break
            if until_dt and (m.date is None or m.date > until_dt):
                if args.reverse:
                    # Oldest first: everything that follows is newer still.
                    break
                continue

            if text_q:
                body = (m.message or "")
                if text_q not in body.lower():
                    continue

            media_path = ""
            if media_dir and m.media:
                try:
                    saved = await client.download_media(m, file=str(media_dir / ""))
                    if saved:
                        media_path = str(Path(saved))
                except Exception as e:
                    # One failed attachment must not abort the whole export.
                    print(f"Media download failed for msg {m.id}: {e}", file=sys.stderr)

            results.append(to_row(m, media_path))

        if args.out_json:
            ensure_dir(Path(args.out_json).parent)
            with open(args.out_json, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
        if args.out_csv:
            ensure_dir(Path(args.out_csv).parent)
            with open(args.out_csv, "w", newline="", encoding="utf-8") as f:
                flds = ["id","date","sender_id","text","reply_to_msg_id","views","forwards","reactions","media","media_path","entities"]
                w = csv.DictWriter(f, fieldnames=flds)
                w.writeheader()
                w.writerows(results)

        if args.out_html:
            write_html_single(
                Path(args.out_html),
                str(chat_ref),
                results,
                enable_yt=not args.no_link_previews,
                enable_og=not args.no_link_previews,
                download_previews=(args.download_previews and args.media_dir is not None),
                media_dir=media_dir
            )

        # Print when asked to, or when no output file was requested at all.
        if args.print or (not args.out_json and not args.out_csv and not args.out_html):
            for r in results:
                preview = (r['text'][:120] if r['text'] else '').replace('\n',' ')
                # Inside the loop: one line per message, nothing for an empty result.
                print(f"[{r['id']}] {r['date']}  {preview}")
        print(f"Fetched {len(results)} messages.", file=sys.stderr)

    with client:
        client.loop.run_until_complete(run())

# Run the exporter only when this file is executed as a script; importing it
# as a module just defines the functions above.
if __name__ == "__main__":
    main()
