"""Stage 1d: UTF-8 structural validation.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
"""

from chardet.pipeline import DetectionResult

# Confidence curve parameters for UTF-8 detection.
# Even a small fraction of valid multi-byte sequences is strong evidence, so
# the curve starts high and ramps linearly with the multi-byte byte ratio.
# Confidence reported as the share of multi-byte bytes approaches zero.
_BASE_CONFIDENCE = 0.80
# Upper bound of the ramp; the reported confidence is clamped to this value.
_MAX_CONFIDENCE = 0.99
# Scale factor for the multi-byte byte ratio: mb_ratio * 6 reaches 1.0 (and so
# saturates the confidence ramp) at ~17% multi-byte content.
_MB_RATIO_SCALE = 6


def detect_utf8(data: bytes) -> DetectionResult | None:
    """Validate UTF-8 byte structure.

    Walks *data* once and checks that every non-ASCII byte belongs to a
    well-formed UTF-8 sequence: a legal lead byte, the right number of
    continuation bytes, no overlong encodings, no UTF-16 surrogates and no
    code points above U+10FFFF.

    A sequence cut off by the end of the buffer (e.g. from ``max_bytes``
    slicing) is tolerated, but only if the bytes that *are* present form a
    valid prefix of a sequence. The truncated tail is not counted towards
    the multi-byte statistics.

    Returns a result only if multi-byte sequences are found (pure ASCII
    is handled by the ASCII stage).

    :param data: The raw byte data to examine.
    :returns: A :class:`DetectionResult` for UTF-8 whose confidence grows
        with the share of multi-byte bytes, or ``None`` if *data* is empty,
        structurally invalid, or contains no complete multi-byte sequence.
    """
    if not data:
        return None

    i = 0
    length = len(data)
    multibyte_sequences = 0
    multibyte_bytes = 0

    while i < length:
        byte = data[i]

        if byte < 0x80:
            i += 1
            continue

        # Determine the expected sequence length from the leading byte, and
        # the range allowed for the *second* byte. The second byte is a plain
        # continuation byte (0x80-0xBF) except after four lead bytes whose
        # range is narrowed to reject overlong forms, surrogates and
        # out-of-range code points.
        second_min = 0x80
        second_max = 0xBF
        # 0xC0-0xC1 are overlong 2-byte encodings of ASCII, so we start at 0xC2.
        if 0xC2 <= byte <= 0xDF:
            seq_len = 2
        elif 0xE0 <= byte <= 0xEF:
            seq_len = 3
            if byte == 0xE0:
                # Second byte must be >= 0xA0 (prevents overlong 3-byte).
                second_min = 0xA0
            elif byte == 0xED:
                # Second byte must be <= 0x9F (prevents UTF-16 surrogates
                # U+D800-U+DFFF).
                second_max = 0x9F
        elif 0xF0 <= byte <= 0xF4:
            seq_len = 4
            if byte == 0xF0:
                # Second byte must be >= 0x90 (prevents overlong 4-byte).
                second_min = 0x90
            elif byte == 0xF4:
                # Second byte must be <= 0x8F (prevents codepoints above
                # U+10FFFF).
                second_max = 0x8F
        else:
            # Invalid start byte (0x80-0xC1, 0xF5-0xFF)
            return None

        # Clamp the sequence end to the buffer so that a truncated final
        # sequence still has every byte that is present validated.
        seq_end = i + seq_len
        truncated = seq_end > length
        if truncated:
            seq_end = length

        # Second byte: lead-specific range (skipped only when the buffer ends
        # right after the lead byte).
        if i + 1 < seq_end and not (second_min <= data[i + 1] <= second_max):
            return None

        # Remaining continuation bytes (must be 0x80-0xBF)
        for j in range(i + 2, seq_end):
            if not (0x80 <= data[j] <= 0xBF):
                return None

        # Truncated final sequence — treat as valid since the bytes seen so
        # far are structurally correct, but do not count it.
        if truncated:
            break

        multibyte_sequences += 1
        multibyte_bytes += seq_len
        i += seq_len

    # Pure ASCII — let the ASCII detector handle it
    if multibyte_sequences == 0:
        return None

    # Confidence scales with the proportion of multi-byte bytes in the data.
    # Even a small amount of valid multi-byte UTF-8 is strong evidence.
    mb_ratio = multibyte_bytes / length
    confidence_range = _MAX_CONFIDENCE - _BASE_CONFIDENCE
    confidence = min(
        _MAX_CONFIDENCE,
        _BASE_CONFIDENCE + confidence_range * min(mb_ratio * _MB_RATIO_SCALE, 1.0),
    )
    return DetectionResult(encoding="utf-8", confidence=confidence, language=None)