"""Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

import unicodedata

from chardet.pipeline import ASCII_TEXT_BYTES, DETERMINISTIC_CONFIDENCE, DetectionResult

# How many bytes to sample for pattern analysis
_SAMPLE_SIZE = 4096

# Minimum bytes needed for reliable pattern detection
_MIN_BYTES_UTF32 = 16  # 4 full code units
_MIN_BYTES_UTF16 = 10  # 5 full code units

# Minimum fraction of null bytes in the expected position for UTF-16.
# CJK-heavy UTF-16 text (Chinese, Japanese, Korean) can have as few as
# ~4.5% null bytes in the expected position, since CJK codepoints have
# non-zero high bytes.  The validation step (decode + text quality check)
# prevents false positives from binary files at this lower threshold.
_UTF16_MIN_NULL_FRACTION = 0.03

# Minimum text-quality score to accept a UTF-16 candidate when both
# endiannesses show null-byte patterns.  A score of 0.5 corresponds to
# roughly 50% letters with no ASCII bonus (or ~40% with whitespace
# present) — sufficient to distinguish real text from coincidental byte
# patterns.
_MIN_TEXT_QUALITY = 0.5

# Minimum fraction of printable characters for a decoded sample to be
# considered text rather than binary data.
_MIN_PRINTABLE_FRACTION = 0.7

# Maximum null fraction (in the candidate null-byte position) below which
# the data is checked for a null-separator pattern.  If the null fraction
# is below this AND all non-null bytes are printable ASCII, the candidate
# is rejected as a null-separator false positive rather than real UTF-16.
# Real Latin UTF-16 has ~50% nulls; CJK UTF-16 has fewer but non-ASCII
# non-null bytes.  15% is generous — separator data is typically 1-5%.
_NULL_SEPARATOR_MAX_FRACTION = 0.15

# ASCII_TEXT_BYTES plus the null byte — used by the null-separator guard
# to check whether non-null bytes are all printable ASCII.
_NULL_SEPARATOR_ALLOWED: bytes = b"\x00" + ASCII_TEXT_BYTES


def _is_null_separator_pattern(data: bytes, null_frac: float) -> bool:
    """Return True if the data looks like ASCII with null byte separators.

    :param data: The raw byte sample to examine.
    :param null_frac: The positional null fraction for this UTF-16 candidate
        (i.e. fraction of null bytes in even positions for BE, or odd positions
        for LE) — not the total null fraction across all bytes.

    Checks two conditions:
    1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
    2. Every non-null byte is printable ASCII or common whitespace

    When both conditions are met, the nulls are likely field separators
    (e.g. ``find -print0``), not UTF-16 encoding artifacts.
    """
    if null_frac >= _NULL_SEPARATOR_MAX_FRACTION:
        return False
    return not data.translate(None, _NULL_SEPARATOR_ALLOWED)


def detect_utf1632_patterns(data: bytes) -> DetectionResult | None:
    """Detect UTF-32 or UTF-16 encoding from null-byte patterns.

    UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
    """
    sample = data[:_SAMPLE_SIZE]

    if len(sample) < _MIN_BYTES_UTF16:
        return None

    # Check UTF-32 first (more specific pattern)
    result = _check_utf32(sample)
    if result is not None:
        return result

    # Then check UTF-16
    return _check_utf16(sample)


def _check_utf32(data: bytes) -> DetectionResult | None:
    """Check for UTF-32 encoding based on 4-byte unit structure.

    For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
    - UTF-32-BE: the first byte of each 4-byte unit is always 0x00
    - UTF-32-LE: the last byte of each 4-byte unit is always 0x00

    For BMP characters (U+0000 to U+FFFF), additionally:
    - UTF-32-BE: the second byte is also 0x00
    - UTF-32-LE: the third byte is also 0x00
    """
    # Trim to a multiple of 4 bytes (like _check_utf16 trims to even length)
    trimmed_len = len(data) - (len(data) % 4)
    if trimmed_len < _MIN_BYTES_UTF32:
        return None
    data = data[:trimmed_len]

    num_units = trimmed_len // 4

    # UTF-32-BE: first byte of each 4-byte unit must be 0x00
    be_first_null = sum(1 for i in range(0, len(data), 4) if data[i] == 0)
    # Second byte is 0x00 for BMP characters (the vast majority of text)
    be_second_null = sum(1 for i in range(0, len(data), 4) if data[i + 1] == 0)

    if be_first_null == num_units and be_second_null / num_units > 0.5:
        try:
            text = data.decode("utf-32-be")
            if _looks_like_text(text):
                return DetectionResult(
                    encoding="utf-32-be",
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language=None,
                )
        except UnicodeDecodeError:
            pass

    # UTF-32-LE: last byte of each 4-byte unit must be 0x00
    le_last_null = sum(1 for i in range(3, len(data), 4) if data[i] == 0)
    # Third byte is 0x00 for BMP characters
    le_third_null = sum(1 for i in range(2, len(data), 4) if data[i] == 0)

    if le_last_null == num_units and le_third_null / num_units > 0.5:
        try:
            text = data.decode("utf-32-le")
            if _looks_like_text(text):
                return DetectionResult(
                    encoding="utf-32-le",
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language=None,
                )
        except UnicodeDecodeError:
            pass

    return None


def _check_utf16(data: bytes) -> DetectionResult | None:
    """Check for UTF-16 via null-byte patterns in alternating positions.

    UTF-16 encodes each BMP character as two bytes.  For characters whose
    code-point high byte is 0x00 (Latin, digits, basic punctuation, many
    control structures), one of the two bytes in each unit will be a null.
    Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
    fraction of code units still contain at least one null byte.

    Non-UTF-16 single-byte encodings never contain null bytes, so even a
    small null-byte fraction in alternating positions is a strong signal.

    When both endiannesses show null-byte patterns (e.g., Latin text where
    every other byte is null), we disambiguate by decoding both ways and
    comparing text-quality scores.
    """
    sample_len = min(len(data), _SAMPLE_SIZE)
    sample_len -= sample_len % 2
    if sample_len < _MIN_BYTES_UTF16:  # pragma: no cover - caller checks length
        return None

    num_units = sample_len // 2

    # Count null bytes in even positions (UTF-16-BE high byte for ASCII)
    be_null_count = sum(1 for i in range(0, sample_len, 2) if data[i] == 0)
    # Count null bytes in odd positions (UTF-16-LE high byte for ASCII)
    le_null_count = sum(1 for i in range(1, sample_len, 2) if data[i] == 0)

    be_frac = be_null_count / num_units
    le_frac = le_null_count / num_units

    candidates: list[tuple[str, float]] = []
    if le_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
        data[:sample_len], le_frac
    ):
        candidates.append(("utf-16-le", le_frac))
    if be_frac >= _UTF16_MIN_NULL_FRACTION and not _is_null_separator_pattern(
        data[:sample_len], be_frac
    ):
        candidates.append(("utf-16-be", be_frac))

    if not candidates:
        return None

    # If only one candidate, validate and return
    if len(candidates) == 1:
        encoding = candidates[0][0]
        try:
            text = data[:sample_len].decode(encoding)
            if _looks_like_text(text):
                return DetectionResult(
                    encoding=encoding,
                    confidence=DETERMINISTIC_CONFIDENCE,
                    language=None,
                )
        except UnicodeDecodeError:
            pass
        return None

    # Both candidates matched (common for Latin-heavy text where every other
    # byte is null).  Decode both and pick the one with higher text quality.
    best_encoding: str | None = None
    best_quality = -1.0

    for encoding, _ in candidates:
        try:
            text = data[:sample_len].decode(encoding)
        except UnicodeDecodeError:
            continue
        quality = _text_quality(text)
        if quality > best_quality:
            best_quality = quality
            best_encoding = encoding

    if best_encoding is not None and best_quality >= _MIN_TEXT_QUALITY:
        return DetectionResult(
            encoding=best_encoding,
            confidence=DETERMINISTIC_CONFIDENCE,
            language=None,
        )

    return None


def _looks_like_text(text: str) -> bool:
    """Quick check: is decoded text mostly printable characters."""
    if not text:
        return False
    sample = text[:500]
    printable = sum(1 for c in sample if c.isprintable() or c in "\n\r\t")
    return printable / len(sample) > _MIN_PRINTABLE_FRACTION


def _text_quality(text: str, limit: int = 500) -> float:
    """Score how much *text* looks like real human-readable content.

    Returns a score in the range [-1.0, ~1.6).  Higher values indicate
    more natural text.  The practical maximum is 1.5 for all-ASCII-letter
    input (1.6 approaches as sample size grows with all ASCII letters plus
    whitespace).  A score of -1.0 means the content is almost certainly not
    valid text (too many control characters or combining marks).

    Scoring factors:

    * Base score: ratio of Unicode letters (category ``L*``) to sample length.
    * ASCII bonus: additional 0.5x weight for ASCII letters.  This is the
      primary signal for disambiguating endianness — correct decoding of
      Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
    * Space bonus: +0.1 when the sample contains at least one whitespace
      character and is longer than 20 characters.
    * Rejection: returns -1.0 if >10% control characters or >20% combining
      marks (category ``M*``).
    """
    sample = text[:limit]
    n = len(sample)
    if n == 0:  # pragma: no cover - callers always pass non-empty text
        return -1.0

    letters = 0
    marks = 0
    spaces = 0
    controls = 0
    ascii_letters = 0

    for c in sample:
        cat = unicodedata.category(c)
        if cat[0] == "L":
            letters += 1
            if ord(c) < 128:
                ascii_letters += 1
        elif cat[0] == "M":
            marks += 1
        elif cat == "Zs" or c in "\n\r\t":
            spaces += 1
        elif cat[0] == "C":
            controls += 1

    # Reject data with many control characters or combining marks
    if controls / n > 0.1:
        return -1.0
    if marks / n > 0.2:
        return -1.0

    score = letters / n
    # ASCII letters strongly indicate correct endianness
    score += (ascii_letters / n) * 0.5
    # Real text usually contains some whitespace
    if n > 20 and spaces > 0:
        score += 0.1

    return score