"""Encoding equivalences and name remapping.

This module defines:

1. **Directional supersets** for accuracy evaluation: detecting a superset
   encoding when the expected encoding is a subset is correct (e.g., detecting
   UTF-8 when expected is ASCII), but not the reverse.

2. **Bidirectional equivalents**: groups of encodings where detecting any
   member when another member was expected is considered correct.  This
   includes UTF-16/UTF-32 endian variants (which encode the same text with
   different byte order) and ISO-2022-JP branch variants (which are
   compatible extensions of the same base encoding).

3. **Preferred superset mapping** for the ``prefer_superset`` API option:
   replaces detected ISO/subset encoding names with their Windows/CP superset
   equivalents that modern software actually uses.

4. **Compatibility names** for the default ``compat_names=True`` mode: maps
   internal Python codec names to the names chardet 5.x/6.x returned,
   preserving backward compatibility for callers that compare encoding
   strings directly.
"""

from __future__ import annotations

import unicodedata
from collections.abc import Callable

from chardet.pipeline import DetectionDict
from chardet.registry import lookup_encoding

# Directional superset relationships: detecting any of the supersets
# when the expected encoding is the subset counts as correct.
# E.g., expected=ascii, detected=utf-8 -> correct (utf-8 ⊃ ascii).
# But expected=utf-8, detected=ascii -> wrong (ascii ⊄ utf-8).
#
# Note: some subset keys (iso-8859-11) are not in the detection
# registry — the detector never returns them.  They appear here because
# chardet test-suite expected values use these names, so the superset
# mapping is needed for accuracy evaluation only.
SUPERSETS: dict[str, frozenset[str]] = {
    "ASCII": frozenset({"utf-8", "cp1252"}),
    "TIS-620": frozenset({"iso8859-11", "cp874"}),
    "ISO-8859-11": frozenset({"cp874"}),
    "GB2312": frozenset({"gb18030"}),
    "GBK": frozenset({"gb18030"}),
    "Big5": frozenset({"big5hkscs", "cp950"}),
    "Shift_JIS": frozenset({"cp932", "shift_jis_2004"}),
    "Shift-JISX0213": frozenset({"shift_jis_2004"}),
    "EUC-JP": frozenset({"euc_jis_2004"}),
    "EUC-JISX0213": frozenset({"euc_jis_2004"}),
    "EUC-KR": frozenset({"cp949"}),
    "CP037": frozenset({"cp1140"}),
    # ISO-2022-JP subsets: any branch variant is acceptable.
    # In our registry, base ISO-2022-JP is an alias of iso2022_jp_2, so all
    # three extended variants are supersets of the same base.  While the
    # extended variants use different escape sequences for non-basic characters,
    # real-world files rarely use those extensions — the base JIS X 0208
    # character set is shared by all variants and cross-decodes identically.
    # ISO2022-JP-1 and ISO2022-JP-3 use Python codec names (no hyphen between
    # "ISO" and "2022") because they appear as expected values in the test suite,
    # not as canonical chardet output.  They are consumed through
    # _NORMALIZED_SUPERSETS which normalizes via codecs.lookup().
    "ISO-2022-JP": frozenset({"iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"}),
    "ISO2022-JP-1": frozenset({"iso2022_jp_2", "iso2022_jp_ext"}),
    "ISO2022-JP-3": frozenset({"iso2022_jp_2004"}),
    # ISO/Windows superset pairs
    "ISO-8859-1": frozenset({"cp1252"}),
    "ISO-8859-2": frozenset({"cp1250"}),
    "ISO-8859-5": frozenset({"cp1251"}),
    "ISO-8859-6": frozenset({"cp1256"}),
    "ISO-8859-7": frozenset({"cp1253"}),
    "ISO-8859-8": frozenset({"cp1255"}),
    "ISO-8859-9": frozenset({"cp1254"}),
    "ISO-8859-13": frozenset({"cp1257"}),
    # UTF-16/32: bare form (BOM-aware) is interchangeable with either endianness,
    # but LE and BE are NOT interchangeable with each other.
    "UTF-16": frozenset({"utf-16-le", "utf-16-be"}),
    "UTF-16-LE": frozenset({"utf-16"}),
    "UTF-16-BE": frozenset({"utf-16"}),
    "UTF-32": frozenset({"utf-32-le", "utf-32-be"}),
    "UTF-32-LE": frozenset({"utf-32"}),
    "UTF-32-BE": frozenset({"utf-32"}),
}

# Preferred superset name for each encoding, used by the ``should_rename_legacy``
# API option.  When enabled, detected encoding names are replaced with the
# Windows/CP superset that modern software actually uses (browsers, editors,
# etc. treat these ISO subsets as their Windows counterparts).
# Values use display-cased names (e.g. "Windows-1252") to match chardet 6.x output.
PREFERRED_SUPERSET: dict[str, str] = {
    "ascii": "cp1252",
    "euc_kr": "cp949",
    "iso8859-1": "cp1252",
    "iso8859-2": "cp1250",
    "iso8859-5": "cp1251",
    "iso8859-6": "cp1256",
    "iso8859-7": "cp1253",
    "iso8859-8": "cp1255",
    "iso8859-9": "cp1254",
    "iso8859-11": "cp874",
    "iso8859-13": "cp1257",
    "tis-620": "cp874",
}


def _remap_encoding(result: DetectionDict, mapping: dict[str, str]) -> DetectionDict:
    """Replace the encoding name using *mapping*, modifying *result* in-place."""
    enc = result.get("encoding")
    if isinstance(enc, str):
        result["encoding"] = mapping.get(enc, enc)
    return result


def apply_preferred_superset(
    result: DetectionDict,
) -> DetectionDict:
    """Replace the encoding name with its preferred Windows/CP superset.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, PREFERRED_SUPERSET)


# Deprecated alias — kept for external consumers.
apply_legacy_rename = apply_preferred_superset


# Mapping from Python codec names to chardet 5.x/6.x compatible display names.
# Only entries where codec name differs from the compat output are listed.
# Encodings where codec name == compat name (e.g., "ascii", "utf-8") and
# encodings new to v7 have no entry — the codec name passes through unchanged.
_COMPAT_NAMES: dict[str, str] = {
    # 5.x compat — these encodings existed in chardet 5.x with different names
    "big5hkscs": "Big5",
    "cp855": "IBM855",
    "cp866": "IBM866",
    "cp949": "CP949",
    "euc_jis_2004": "EUC-JP",
    "euc_kr": "EUC-KR",
    "gb18030": "GB18030",
    "hz": "HZ-GB-2312",
    "iso2022_jp_2": "ISO-2022-JP",
    "iso2022_kr": "ISO-2022-KR",
    "iso8859-1": "ISO-8859-1",
    "iso8859-5": "ISO-8859-5",
    "iso8859-7": "ISO-8859-7",
    "iso8859-8": "ISO-8859-8",
    "iso8859-9": "ISO-8859-9",
    "johab": "Johab",
    "koi8-r": "KOI8-R",
    "mac-cyrillic": "MacCyrillic",
    "mac-roman": "MacRoman",
    "shift_jis_2004": "SHIFT_JIS",
    "tis-620": "TIS-620",
    "utf-16": "UTF-16",
    "utf-32": "UTF-32",
    "utf-8-sig": "UTF-8-SIG",
    "cp1251": "Windows-1251",
    "cp1252": "Windows-1252",
    "cp1253": "Windows-1253",
    "cp1254": "Windows-1254",
    "cp1255": "Windows-1255",
    # 6.x compat — new in chardet 6.x with different names
    "kz1048": "KZ1048",
    "mac-greek": "MacGreek",
    "mac-iceland": "MacIceland",
    "mac-latin2": "MacLatin2",
    "mac-turkish": "MacTurkish",
}


def apply_compat_names(
    result: DetectionDict,
) -> DetectionDict:
    """Convert internal codec names to chardet 5.x/6.x compatible names.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    """
    return _remap_encoding(result, _COMPAT_NAMES)


# Bidirectional equivalents -- groups where any member is acceptable for any other.
# Bidirectional equivalents -- groups where any member is acceptable for any other.
#
# NOTE: UTF-16/32 endianness is handled via directional SUPERSETS instead,
# because wrong endianness garbles text.  ISO-2022-JP variants remain here
# because base ISO-2022-JP is an alias of iso2022_jp_2 in our registry, so
# the SUPERSETS entries already make all variants interchangeable via the
# shared base.
BIDIRECTIONAL_GROUPS: tuple[tuple[str, ...], ...] = (
    ("iso2022_jp_2", "iso2022_jp_2004", "iso2022_jp_ext"),
)

# Bidirectional language equivalences — groups of ISO 639-1 codes for
# languages that are nearly indistinguishable by statistical detection.
# Detecting any member when another member of the same group was expected
# is considered acceptable.
LANGUAGE_EQUIVALENCES: tuple[tuple[str, ...], ...] = (
    ("sk", "cs"),  # Slovak / Czech — ~85% mutual intelligibility
    (
        "uk",
        "ru",
        "bg",
        "be",
    ),  # East Slavic + Bulgarian — shared Cyrillic, high written overlap
    ("ms", "id"),  # Malay / Indonesian — standardized variants of one language
    (
        "no",
        "da",
        "sv",
    ),  # Scandinavian — mutual intelligibility across the dialect continuum
)


def _build_group_index(
    groups: tuple[tuple[str, ...], ...],
    normalize: Callable[[str], str] = lambda x: x,
) -> dict[str, frozenset[str]]:
    """Build a lookup: key -> frozenset of all equivalent keys in the same group."""
    result: dict[str, frozenset[str]] = {}
    for group in groups:
        normed = frozenset(normalize(n) for n in group)
        for name in group:
            result[normalize(name)] = normed
    return result


_LANGUAGE_EQUIV: dict[str, frozenset[str]] = _build_group_index(LANGUAGE_EQUIVALENCES)


def is_language_equivalent(expected: str, detected: str) -> bool:
    """Check whether *detected* is an acceptable language for *expected*.

    Returns ``True`` when *expected* and *detected* are the same ISO 639-1
    code, or belong to the same equivalence group in
    :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    """
    if expected == detected:
        return True
    group = _LANGUAGE_EQUIV.get(expected)
    return group is not None and detected in group


# Pre-built normalized lookups for fast comparison.
# Built iteratively because multiple SUPERSETS keys can normalize to the same
# canonical name (e.g., Shift_JIS and Shift-JISX0213 both → shift_jis_2004).
# Values are merged (unioned) when keys collide.
_NORMALIZED_SUPERSETS: dict[str, frozenset[str]] = {}
for _subset, _supersets in SUPERSETS.items():
    _key = lookup_encoding(_subset) or _subset
    _normed = frozenset(lookup_encoding(s) or s for s in _supersets)
    _NORMALIZED_SUPERSETS[_key] = _NORMALIZED_SUPERSETS.get(_key, frozenset()) | _normed


_NORMALIZED_BIDIR: dict[str, frozenset[str]] = _build_group_index(
    BIDIRECTIONAL_GROUPS, normalize=lambda n: lookup_encoding(n) or n
)


def is_correct(expected: str | None, detected: str | None) -> bool:
    """Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional byte-order group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False
    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    # 1. Exact match
    if norm_exp == norm_det:
        return True

    # 2. Bidirectional (same byte-order group)
    if norm_exp in _NORMALIZED_BIDIR and norm_det in _NORMALIZED_BIDIR[norm_exp]:
        return True

    # 3. Superset is acceptable (detected is a known superset of expected)
    return (
        norm_exp in _NORMALIZED_SUPERSETS
        and norm_det in _NORMALIZED_SUPERSETS[norm_exp]
    )


def _strip_combining(text: str) -> str:
    """NFKD-normalize *text* and strip all combining marks."""
    nfkd = unicodedata.normalize("NFKD", text)
    return "".join(c for c in nfkd if not unicodedata.combining(c))


# Pre-computed symbol pair lookups for O(1) equivalence checks.
# Both orderings are stored to avoid constructing temporaries per call.
_EQUIVALENT_SYMBOL_PAIRS: frozenset[tuple[str, str]] = frozenset(
    {
        ("¤", "€"),
        ("€", "¤"),
    }
)


def _chars_equivalent(a: str, b: str) -> bool:
    """Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means:
    - Same character, OR
    - Same base letter after stripping combining marks, OR
    - An explicitly listed symbol equivalence (e.g. ¤ ↔ €)
    """
    if a == b:
        return True
    if (a, b) in _EQUIVALENT_SYMBOL_PAIRS:
        return True
    # Compare base letters after stripping combining marks.
    return _strip_combining(a) == _strip_combining(b)


def is_equivalent_detection(
    data: bytes, expected: str | None, detected: str | None
) -> bool:
    """Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    """
    if expected is None:
        return detected is None
    if detected is None:
        return False

    norm_exp = lookup_encoding(expected) or expected.lower()
    norm_det = lookup_encoding(detected) or detected.lower()

    if norm_exp == norm_det:
        return True

    try:
        text_exp = data.decode(norm_exp)
        text_det = data.decode(norm_det)
    except (UnicodeDecodeError, LookupError):
        return False

    if text_exp == text_det:
        return True

    if len(text_exp) != len(text_det):
        return False

    return all(_chars_equivalent(a, b) for a, b in zip(text_exp, text_det, strict=True))