# _decode.py

  1. from __future__ import annotations
  2. from collections.abc import Sequence
  3. import functools
  4. import re
# Default value of the ``exclude`` argument of ``decode``: escapes of these
# characters are kept percent-encoded instead of being decoded.
DECODE_DEFAULT_CHARS = ";/?:@&=+$,#"
# Empty exclusion set.
# NOTE(review): not referenced in the visible code -- presumably intended to
# be passed as ``exclude`` so that every escape is decoded; confirm with callers.
DECODE_COMPONENT_CHARS = ""
  7. decode_cache: dict[str, list[str]] = {}
  8. def get_decode_cache(exclude: str) -> Sequence[str]:
  9. if exclude in decode_cache:
  10. return decode_cache[exclude]
  11. cache: list[str] = []
  12. decode_cache[exclude] = cache
  13. for i in range(128):
  14. ch = chr(i)
  15. cache.append(ch)
  16. for i in range(len(exclude)):
  17. ch_code = ord(exclude[i])
  18. cache[ch_code] = "%" + ("0" + hex(ch_code)[2:].upper())[-2:]
  19. return cache
  20. # Decode percent-encoded string.
  21. #
  22. def decode(string: str, exclude: str = DECODE_DEFAULT_CHARS) -> str:
  23. cache = get_decode_cache(exclude)
  24. repl_func = functools.partial(repl_func_with_cache, cache=cache)
  25. return re.sub(r"(%[a-f0-9]{2})+", repl_func, string, flags=re.IGNORECASE)
  26. def repl_func_with_cache(match: re.Match, cache: Sequence[str]) -> str:
  27. seq = match.group()
  28. result = ""
  29. i = 0
  30. l = len(seq) # noqa: E741
  31. while i < l:
  32. b1 = int(seq[i + 1 : i + 3], 16)
  33. if b1 < 0x80:
  34. result += cache[b1]
  35. i += 3 # emulate JS for loop statement3
  36. continue
  37. if (b1 & 0xE0) == 0xC0 and (i + 3 < l):
  38. # 110xxxxx 10xxxxxx
  39. b2 = int(seq[i + 4 : i + 6], 16)
  40. if (b2 & 0xC0) == 0x80:
  41. all_bytes = bytes((b1, b2))
  42. try:
  43. result += all_bytes.decode()
  44. except UnicodeDecodeError:
  45. result += "\ufffd" * 2
  46. i += 3
  47. i += 3 # emulate JS for loop statement3
  48. continue
  49. if (b1 & 0xF0) == 0xE0 and (i + 6 < l):
  50. # 1110xxxx 10xxxxxx 10xxxxxx
  51. b2 = int(seq[i + 4 : i + 6], 16)
  52. b3 = int(seq[i + 7 : i + 9], 16)
  53. if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80:
  54. all_bytes = bytes((b1, b2, b3))
  55. try:
  56. result += all_bytes.decode()
  57. except UnicodeDecodeError:
  58. result += "\ufffd" * 3
  59. i += 6
  60. i += 3 # emulate JS for loop statement3
  61. continue
  62. if (b1 & 0xF8) == 0xF0 and (i + 9 < l):
  63. # 111110xx 10xxxxxx 10xxxxxx 10xxxxxx
  64. b2 = int(seq[i + 4 : i + 6], 16)
  65. b3 = int(seq[i + 7 : i + 9], 16)
  66. b4 = int(seq[i + 10 : i + 12], 16)
  67. if (b2 & 0xC0) == 0x80 and (b3 & 0xC0) == 0x80 and (b4 & 0xC0) == 0x80:
  68. all_bytes = bytes((b1, b2, b3, b4))
  69. try:
  70. result += all_bytes.decode()
  71. except UnicodeDecodeError:
  72. result += "\ufffd" * 4
  73. i += 9
  74. i += 3 # emulate JS for loop statement3
  75. continue
  76. result += "\ufffd"
  77. i += 3 # emulate JS for loop statement3
  78. return result