_parse.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. """URL parsing utilities."""
  2. import re
  3. import unicodedata
  4. from functools import lru_cache
  5. from typing import Union
  6. from urllib.parse import scheme_chars, uses_netloc
  7. from ._quoters import QUOTER, UNQUOTER_PLUS
  8. # Leading and trailing C0 control and space to be stripped per WHATWG spec.
  9. # == "".join([chr(i) for i in range(0, 0x20 + 1)])
  10. WHATWG_C0_CONTROL_OR_SPACE = (
  11. "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10"
  12. "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f "
  13. )
  14. # Unsafe bytes to be removed per WHATWG spec
  15. UNSAFE_URL_BYTES_TO_REMOVE = ["\t", "\r", "\n"]
  16. USES_AUTHORITY = frozenset(uses_netloc)
  17. SplitURLType = tuple[str, str, str, str, str]
  18. def split_url(url: str) -> SplitURLType:
  19. """Split URL into parts."""
  20. # Adapted from urllib.parse.urlsplit
  21. # Only lstrip url as some applications rely on preserving trailing space.
  22. # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
  23. url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
  24. for b in UNSAFE_URL_BYTES_TO_REMOVE:
  25. if b in url:
  26. url = url.replace(b, "")
  27. scheme = netloc = query = fragment = ""
  28. i = url.find(":")
  29. if i > 0 and url[0] in scheme_chars:
  30. for c in url[1:i]:
  31. if c not in scheme_chars:
  32. break
  33. else:
  34. scheme, url = url[:i].lower(), url[i + 1 :]
  35. has_hash = "#" in url
  36. has_question_mark = "?" in url
  37. if url[:2] == "//":
  38. delim = len(url) # position of end of domain part of url, default is end
  39. if has_hash and has_question_mark:
  40. delim_chars = "/?#"
  41. elif has_question_mark:
  42. delim_chars = "/?"
  43. elif has_hash:
  44. delim_chars = "/#"
  45. else:
  46. delim_chars = "/"
  47. for c in delim_chars: # look for delimiters; the order is NOT important
  48. wdelim = url.find(c, 2) # find first of this delim
  49. if wdelim >= 0 and wdelim < delim: # if found
  50. delim = wdelim # use earliest delim position
  51. netloc = url[2:delim]
  52. url = url[delim:]
  53. has_left_bracket = "[" in netloc
  54. has_right_bracket = "]" in netloc
  55. if (has_left_bracket and not has_right_bracket) or (
  56. has_right_bracket and not has_left_bracket
  57. ):
  58. raise ValueError("Invalid IPv6 URL")
  59. if has_left_bracket:
  60. bracketed_host = netloc.partition("[")[2].partition("]")[0]
  61. # Valid bracketed hosts are defined in
  62. # https://www.rfc-editor.org/rfc/rfc3986#page-49
  63. # https://url.spec.whatwg.org/
  64. if bracketed_host[0] == "v":
  65. if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
  66. raise ValueError("IPvFuture address is invalid")
  67. elif ":" not in bracketed_host:
  68. raise ValueError("An IPv4 address cannot be in brackets")
  69. if has_hash:
  70. url, _, fragment = url.partition("#")
  71. if has_question_mark:
  72. url, _, query = url.partition("?")
  73. if netloc and not netloc.isascii():
  74. _check_netloc(netloc)
  75. return scheme, netloc, url, query, fragment
  76. def _check_netloc(netloc: str) -> None:
  77. # Adapted from urllib.parse._checknetloc
  78. # looking for characters like \u2100 that expand to 'a/c'
  79. # IDNA uses NFKC equivalence, so normalize for this check
  80. # ignore characters already included
  81. # but not the surrounding text
  82. n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "")
  83. normalized_netloc = unicodedata.normalize("NFKC", n)
  84. if n == normalized_netloc:
  85. return
  86. # Note that there are no unicode decompositions for the character '@' so
  87. # its currently impossible to have test coverage for this branch, however if the
  88. # one should be added in the future we want to make sure its still checked.
  89. for c in "/?#@:": # pragma: no branch
  90. if c in normalized_netloc:
  91. raise ValueError(
  92. f"netloc '{netloc}' contains invalid "
  93. "characters under NFKC normalization"
  94. )
  95. @lru_cache # match the same size as urlsplit
  96. def split_netloc(
  97. netloc: str,
  98. ) -> tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]:
  99. """Split netloc into username, password, host and port."""
  100. if "@" not in netloc:
  101. username: Union[str, None] = None
  102. password: Union[str, None] = None
  103. hostinfo = netloc
  104. else:
  105. userinfo, _, hostinfo = netloc.rpartition("@")
  106. username, have_password, password = userinfo.partition(":")
  107. if not have_password:
  108. password = None
  109. if "[" in hostinfo:
  110. _, _, bracketed = hostinfo.partition("[")
  111. hostname, _, port_str = bracketed.partition("]")
  112. _, _, port_str = port_str.partition(":")
  113. else:
  114. hostname, _, port_str = hostinfo.partition(":")
  115. if not port_str:
  116. return username or None, password, hostname or None, None
  117. try:
  118. port = int(port_str)
  119. except ValueError:
  120. raise ValueError("Invalid URL: port can't be converted to integer")
  121. if not (0 <= port <= 65535):
  122. raise ValueError("Port out of range 0-65535")
  123. return username or None, password, hostname or None, port
  124. def unsplit_result(
  125. scheme: str, netloc: str, url: str, query: str, fragment: str
  126. ) -> str:
  127. """Unsplit a URL without any normalization."""
  128. if netloc or (scheme and scheme in USES_AUTHORITY) or url[:2] == "//":
  129. if url and url[:1] != "/":
  130. url = f"{scheme}://{netloc}/{url}" if scheme else f"{scheme}:{url}"
  131. else:
  132. url = f"{scheme}://{netloc}{url}" if scheme else f"//{netloc}{url}"
  133. elif scheme:
  134. url = f"{scheme}:{url}"
  135. if query:
  136. url = f"{url}?{query}"
  137. return f"{url}#{fragment}" if fragment else url
  138. @lru_cache # match the same size as urlsplit
  139. def make_netloc(
  140. user: Union[str, None],
  141. password: Union[str, None],
  142. host: Union[str, None],
  143. port: Union[int, None],
  144. encode: bool = False,
  145. ) -> str:
  146. """Make netloc from parts.
  147. The user and password are encoded if encode is True.
  148. The host must already be encoded with _encode_host.
  149. """
  150. if host is None:
  151. return ""
  152. ret = host
  153. if port is not None:
  154. ret = f"{ret}:{port}"
  155. if user is None and password is None:
  156. return ret
  157. if password is not None:
  158. if not user:
  159. user = ""
  160. elif encode:
  161. user = QUOTER(user)
  162. if encode:
  163. password = QUOTER(password)
  164. user = f"{user}:{password}"
  165. elif user and encode:
  166. user = QUOTER(user)
  167. return f"{user}@{ret}" if user else ret
  168. def query_to_pairs(query_string: str) -> list[tuple[str, str]]:
  169. """Parse a query given as a string argument.
  170. Works like urllib.parse.parse_qsl with keep empty values.
  171. """
  172. pairs: list[tuple[str, str]] = []
  173. if not query_string:
  174. return pairs
  175. for k_v in query_string.split("&"):
  176. k, _, v = k_v.partition("=")
  177. pairs.append((UNQUOTER_PLUS(k), UNQUOTER_PLUS(v)))
  178. return pairs