legacy.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. from __future__ import annotations
  2. from typing import TYPE_CHECKING, Any
  3. from warnings import warn
  4. from .api import from_bytes
  5. from .constant import CHARDET_CORRESPONDENCE
  6. # TODO: remove this check when dropping Python 3.7 support
if TYPE_CHECKING:
    # Only static type checkers take this branch; TYPE_CHECKING is False at
    # runtime, so typing_extensions is not imported when the module executes.
    from typing_extensions import TypedDict

    class ResultDict(TypedDict):
        """Shape of the dictionary returned by ``detect()`` in this module."""

        # Name of the detected encoding, or None when no match was found.
        encoding: str | None
        # Detected language; an empty string when the language is "Unknown"
        # or when no match was found.
        language: str
        # 1.0 minus the best match's ``chaos`` value, or None when no match
        # was found.
        confidence: float | None
  13. def detect(
  14. byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
  15. ) -> ResultDict:
  16. """
  17. chardet legacy method
  18. Detect the encoding of the given byte string. It should be mostly backward-compatible.
  19. Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
  20. This function is deprecated and should be used to migrate your project easily, consult the documentation for
  21. further information. Not planned for removal.
  22. :param byte_str: The byte sequence to examine.
  23. :param should_rename_legacy: Should we rename legacy encodings
  24. to their more modern equivalents?
  25. """
  26. if len(kwargs):
  27. warn(
  28. f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
  29. )
  30. if not isinstance(byte_str, (bytearray, bytes)):
  31. raise TypeError( # pragma: nocover
  32. "Expected object of type bytes or bytearray, got: " "{}".format(
  33. type(byte_str)
  34. )
  35. )
  36. if isinstance(byte_str, bytearray):
  37. byte_str = bytes(byte_str)
  38. r = from_bytes(byte_str).best()
  39. encoding = r.encoding if r is not None else None
  40. language = r.language if r is not None and r.language != "Unknown" else ""
  41. confidence = 1.0 - r.chaos if r is not None else None
  42. # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
  43. # but chardet does return 'utf-8-sig' and it is a valid codec name.
  44. if r is not None and encoding == "utf_8" and r.bom:
  45. encoding += "_sig"
  46. if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
  47. encoding = CHARDET_CORRESPONDENCE[encoding]
  48. return {
  49. "encoding": encoding,
  50. "language": language,
  51. "confidence": confidence,
  52. }