"""Stage 1b: charset declaration extraction (HTML/XML/PEP 263)."""
from __future__ import annotations
import re
from chardet.pipeline import DETERMINISTIC_CONFIDENCE, DetectionResult
from chardet.registry import lookup_encoding
_SCAN_LIMIT = 4096
_XML_ENCODING_RE = re.compile(
rb"""<\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]""", re.IGNORECASE
)
_HTML5_CHARSET_RE = re.compile(
rb"""]+charset\s*=\s*['"]?\s*([^\s'">;]+)""", re.IGNORECASE
)
_HTML4_CONTENT_TYPE_RE = re.compile(
rb"""]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)""", re.IGNORECASE
)
# PEP 263: encoding declaration in the first two lines of a Python file.
# https://peps.python.org/pep-0263/
_PEP263_RE = re.compile(rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)", re.MULTILINE)
def _detect_pep263(data: bytes) -> DetectionResult | None:
"""Check the first two lines of *data* for a PEP 263 encoding declaration.
PEP 263 declarations (e.g. ``# -*- coding: utf-8 -*-``) are only valid
on line 1 or line 2 of a Python source file.
:param data: The raw byte data to scan.
:returns: A :class:`DetectionResult` with confidence 0.95, or ``None``.
"""
# PEP 263 requires a '#' comment marker on line 1 or 2.
if b"#" not in data[:200]:
return None
# Extract first two lines only.
first_two_lines = b"\n".join(data.split(b"\n", 2)[:2])
match = _PEP263_RE.search(first_two_lines)
if match:
try:
raw_name = match.group(1).decode("ascii").strip()
except (UnicodeDecodeError, ValueError):
return None
encoding = lookup_encoding(raw_name)
if encoding is not None and _validate_bytes(data, encoding):
return DetectionResult(
encoding=encoding,
confidence=DETERMINISTIC_CONFIDENCE,
language=None,
mime_type="text/x-python",
)
return None
def detect_markup_charset(data: bytes) -> DetectionResult | None:
"""Scan the first bytes of *data* for a charset declaration.
Checks for:
1. ````
2. ````
3. ````
4. PEP 263 ``# -*- coding: ... -*-`` (first two lines only)
:param data: The raw byte data to scan.
:returns: A :class:`DetectionResult` with confidence 0.95, or ``None``.
"""
if not data:
return None
head = data[:_SCAN_LIMIT]
for pattern in (_XML_ENCODING_RE, _HTML5_CHARSET_RE, _HTML4_CONTENT_TYPE_RE):
match = pattern.search(head)
if match:
try:
raw_name = match.group(1).decode("ascii").strip()
except (UnicodeDecodeError, ValueError):
continue
encoding = lookup_encoding(raw_name)
if encoding is not None and _validate_bytes(data, encoding):
mime_type = "text/xml" if pattern is _XML_ENCODING_RE else "text/html"
return DetectionResult(
encoding=encoding,
confidence=DETERMINISTIC_CONFIDENCE,
language=None,
mime_type=mime_type,
)
return _detect_pep263(data)
def _validate_bytes(data: bytes, encoding: str) -> bool:
"""Check that *data* can be decoded under *encoding* without errors.
Only validates the first ``_SCAN_LIMIT`` bytes to avoid decoding a
full 200 kB input just to verify a charset declaration found in the
header.
"""
try:
data[:_SCAN_LIMIT].decode(encoding)
except (UnicodeDecodeError, LookupError, ValueError):
return False
return True