_encode.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. from __future__ import annotations
  2. from collections.abc import Sequence
  3. from string import ascii_letters, digits, hexdigits
  4. from urllib.parse import quote as encode_uri_component
  5. ASCII_LETTERS_AND_DIGITS = ascii_letters + digits
  6. ENCODE_DEFAULT_CHARS = ";/?:@&=+$,-_.!~*'()#"
  7. ENCODE_COMPONENT_CHARS = "-_.!~*'()"
  8. encode_cache: dict[str, list[str]] = {}
  9. # Create a lookup array where anything but characters in `chars` string
  10. # and alphanumeric chars is percent-encoded.
  11. def get_encode_cache(exclude: str) -> Sequence[str]:
  12. if exclude in encode_cache:
  13. return encode_cache[exclude]
  14. cache: list[str] = []
  15. encode_cache[exclude] = cache
  16. for i in range(128):
  17. ch = chr(i)
  18. if ch in ASCII_LETTERS_AND_DIGITS:
  19. # always allow unencoded alphanumeric characters
  20. cache.append(ch)
  21. else:
  22. cache.append("%" + ("0" + hex(i)[2:].upper())[-2:])
  23. for i in range(len(exclude)):
  24. cache[ord(exclude[i])] = exclude[i]
  25. return cache
  26. # Encode unsafe characters with percent-encoding, skipping already
  27. # encoded sequences.
  28. #
  29. # - string - string to encode
  30. # - exclude - list of characters to ignore (in addition to a-zA-Z0-9)
  31. # - keepEscaped - don't encode '%' in a correct escape sequence (default: true)
  32. def encode(
  33. string: str, exclude: str = ENCODE_DEFAULT_CHARS, *, keep_escaped: bool = True
  34. ) -> str:
  35. result = ""
  36. cache = get_encode_cache(exclude)
  37. l = len(string) # noqa: E741
  38. i = 0
  39. while i < l:
  40. code = ord(string[i])
  41. # %
  42. if keep_escaped and code == 0x25 and i + 2 < l:
  43. if all(c in hexdigits for c in string[i + 1 : i + 3]):
  44. result += string[i : i + 3]
  45. i += 2
  46. i += 1 # JS for loop statement3
  47. continue
  48. if code < 128:
  49. result += cache[code]
  50. i += 1 # JS for loop statement3
  51. continue
  52. if code >= 0xD800 and code <= 0xDFFF:
  53. if code >= 0xD800 and code <= 0xDBFF and i + 1 < l:
  54. next_code = ord(string[i + 1])
  55. if next_code >= 0xDC00 and next_code <= 0xDFFF:
  56. result += encode_uri_component(string[i] + string[i + 1])
  57. i += 1
  58. i += 1 # JS for loop statement3
  59. continue
  60. result += "%EF%BF%BD"
  61. i += 1 # JS for loop statement3
  62. continue
  63. result += encode_uri_component(string[i])
  64. i += 1 # JS for loop statement3
  65. return result