"""Internal shared utilities for chardet.""" from __future__ import annotations import warnings #: Default maximum number of bytes to examine during detection. DEFAULT_MAX_BYTES: int = 200_000 #: Default minimum confidence threshold for filtering results. MINIMUM_THRESHOLD: float = 0.20 #: Default chunk_size value (deprecated, kept for backward-compat signatures). _DEFAULT_CHUNK_SIZE: int = 65_536 def _warn_deprecated_chunk_size(chunk_size: int, stacklevel: int = 3) -> None: """Emit a deprecation warning if *chunk_size* differs from the default.""" if chunk_size != _DEFAULT_CHUNK_SIZE: warnings.warn( "chunk_size is not used in this version of chardet and will be ignored", DeprecationWarning, stacklevel=stacklevel, ) def _validate_max_bytes(max_bytes: int) -> None: """Raise ValueError if *max_bytes* is not a positive integer.""" if isinstance(max_bytes, bool) or not isinstance(max_bytes, int) or max_bytes < 1: msg = "max_bytes must be a positive integer" raise ValueError(msg) def _resolve_prefer_superset( should_rename_legacy: bool, prefer_superset: bool, stacklevel: int = 3 ) -> bool: """Resolve the deprecated *should_rename_legacy* into *prefer_superset*.""" if should_rename_legacy: warnings.warn( "should_rename_legacy is deprecated, use prefer_superset instead", DeprecationWarning, stacklevel=stacklevel, ) return True return prefer_superset #: Mapping from ISO 639-1 language codes to English names. #: Includes ``"und"`` (ISO 639-3 "Undetermined") for use when language is unknown. ISO_TO_LANGUAGE: dict[str, str] = { "ar": "arabic", "be": "belarusian", "bg": "bulgarian", "br": "breton", "cs": "czech", "cy": "welsh", "da": "danish", "de": "german", "el": "greek", "en": "english", "eo": "esperanto", "es": "spanish", "et": "estonian", "fa": "farsi", "fi": "finnish", "fr": "french", "ga": "irish", "gd": "gaelic", "he": "hebrew", "hr": "croatian", "hu": "hungarian", "id": "indonesian", "is": "icelandic", "it": "italian", "ja": "japanese", "kk": "kazakh", "ko": "korean", "lt": "lithuanian", "lv": "latvian", "mk": "macedonian", "ms": "malay", "mt": "maltese", "nl": "dutch", "no": "norwegian", "pl": "polish", "pt": "portuguese", "ro": "romanian", "ru": "russian", "sk": "slovak", "sl": "slovene", "sr": "serbian", "sv": "swedish", "tg": "tajik", "th": "thai", "tr": "turkish", "uk": "ukrainian", "und": "undetermined", "ur": "urdu", "vi": "vietnamese", "zh": "chinese", }