_parse.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. # Copyright Joyent, Inc. and other Node contributors.
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a
  4. # copy of this software and associated documentation files (the
  5. # "Software"), to deal in the Software without restriction, including
  6. # without limitation the rights to use, copy, modify, merge, publish,
  7. # distribute, sublicense, and/or sell copies of the Software, and to permit
  8. # persons to whom the Software is furnished to do so, subject to the
  9. # following conditions:
  10. #
  11. # The above copyright notice and this permission notice shall be included
  12. # in all copies or substantial portions of the Software.
  13. #
  14. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  15. # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16. # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
  17. # NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  18. # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  19. # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  20. # USE OR OTHER DEALINGS IN THE SOFTWARE.
  21. # Changes from joyent/node:
  22. #
  23. # 1. No leading slash in paths,
  24. # e.g. in `url.parse('http://foo?bar')` pathname is ``, not `/`
  25. #
  26. # 2. Backslashes are not replaced with slashes,
  27. # so `http:\\example.org\` is treated like a relative path
  28. #
  29. # 3. Trailing colon is treated like a part of the path,
  30. # i.e. in `http://example.org:foo` pathname is `:foo`
  31. #
  32. # 4. Nothing is URL-encoded in the resulting object,
  33. # (in joyent/node some chars in auth and paths are encoded)
  34. #
  35. # 5. `url.parse()` does not have `parseQueryString` argument
  36. #
  37. # 6. Removed extraneous result properties: `host`, `path`, `query`, etc.,
  38. # which can be constructed using other parts of the url.
  39. from __future__ import annotations
  40. from collections import defaultdict
  41. import re
  42. from mdurl._url import URL
  43. # Reference: RFC 3986, RFC 1808, RFC 2396
  44. # define these here so at least they only have to be
  45. # compiled once on the first module load.
  46. PROTOCOL_PATTERN = re.compile(r"^([a-z0-9.+-]+:)", flags=re.IGNORECASE)
  47. PORT_PATTERN = re.compile(r":[0-9]*$")
  48. # Special case for a simple path URL
  49. SIMPLE_PATH_PATTERN = re.compile(r"^(//?(?!/)[^?\s]*)(\?[^\s]*)?$")
  50. # RFC 2396: characters reserved for delimiting URLs.
  51. # We actually just auto-escape these.
  52. DELIMS = ("<", ">", '"', "`", " ", "\r", "\n", "\t")
  53. # RFC 2396: characters not allowed for various reasons.
  54. UNWISE = ("{", "}", "|", "\\", "^", "`") + DELIMS
  55. # Allowed by RFCs, but cause of XSS attacks. Always escape these.
  56. AUTO_ESCAPE = ("'",) + UNWISE
  57. # Characters that are never ever allowed in a hostname.
  58. # Note that any invalid chars are also handled, but these
  59. # are the ones that are *expected* to be seen, so we fast-path
  60. # them.
  61. NON_HOST_CHARS = ("%", "/", "?", ";", "#") + AUTO_ESCAPE
  62. HOST_ENDING_CHARS = ("/", "?", "#")
  63. HOSTNAME_MAX_LEN = 255
  64. HOSTNAME_PART_PATTERN = re.compile(r"^[+a-z0-9A-Z_-]{0,63}$")
  65. HOSTNAME_PART_START = re.compile(r"^([+a-z0-9A-Z_-]{0,63})(.*)$")
  66. # protocols that can allow "unsafe" and "unwise" chars.
  67. # protocols that never have a hostname.
  68. HOSTLESS_PROTOCOL = defaultdict(
  69. bool,
  70. {
  71. "javascript": True,
  72. "javascript:": True,
  73. },
  74. )
  75. # protocols that always contain a // bit.
  76. SLASHED_PROTOCOL = defaultdict(
  77. bool,
  78. {
  79. "http": True,
  80. "https": True,
  81. "ftp": True,
  82. "gopher": True,
  83. "file": True,
  84. "http:": True,
  85. "https:": True,
  86. "ftp:": True,
  87. "gopher:": True,
  88. "file:": True,
  89. },
  90. )
  91. class MutableURL:
  92. def __init__(self) -> None:
  93. self.protocol: str | None = None
  94. self.slashes: bool = False
  95. self.auth: str | None = None
  96. self.port: str | None = None
  97. self.hostname: str | None = None
  98. self.hash: str | None = None
  99. self.search: str | None = None
  100. self.pathname: str | None = None
  101. def parse(self, url: str, slashes_denote_host: bool) -> "MutableURL":
  102. lower_proto = ""
  103. slashes = False
  104. rest = url
  105. # trim before proceeding.
  106. # This is to support parse stuff like " http://foo.com \n"
  107. rest = rest.strip()
  108. if not slashes_denote_host and len(url.split("#")) == 1:
  109. # Try fast path regexp
  110. simple_path = SIMPLE_PATH_PATTERN.match(rest)
  111. if simple_path:
  112. self.pathname = simple_path.group(1)
  113. if simple_path.group(2):
  114. self.search = simple_path.group(2)
  115. return self
  116. proto = ""
  117. proto_match = PROTOCOL_PATTERN.match(rest)
  118. if proto_match:
  119. proto = proto_match.group()
  120. lower_proto = proto.lower()
  121. self.protocol = proto
  122. rest = rest[len(proto) :]
  123. # figure out if it's got a host
  124. # user@server is *always* interpreted as a hostname, and url
  125. # resolution will treat //foo/bar as host=foo,path=bar because that's
  126. # how the browser resolves relative URLs.
  127. if slashes_denote_host or proto or re.search(r"^//[^@/]+@[^@/]+", rest):
  128. slashes = rest.startswith("//")
  129. if slashes and not (proto and HOSTLESS_PROTOCOL[proto]):
  130. rest = rest[2:]
  131. self.slashes = True
  132. if not HOSTLESS_PROTOCOL[proto] and (
  133. slashes or (proto and not SLASHED_PROTOCOL[proto])
  134. ):
  135. # there's a hostname.
  136. # the first instance of /, ?, ;, or # ends the host.
  137. #
  138. # If there is an @ in the hostname, then non-host chars *are* allowed
  139. # to the left of the last @ sign, unless some host-ending character
  140. # comes *before* the @-sign.
  141. # URLs are obnoxious.
  142. #
  143. # ex:
  144. # http://a@b@c/ => user:a@b host:c
  145. # http://a@b?@c => user:a host:c path:/?@c
  146. # v0.12 TODO(isaacs): This is not quite how Chrome does things.
  147. # Review our test case against browsers more comprehensively.
  148. # find the first instance of any hostEndingChars
  149. host_end = -1
  150. for i in range(len(HOST_ENDING_CHARS)):
  151. hec = rest.find(HOST_ENDING_CHARS[i])
  152. if hec != -1 and (host_end == -1 or hec < host_end):
  153. host_end = hec
  154. # at this point, either we have an explicit point where the
  155. # auth portion cannot go past, or the last @ char is the decider.
  156. if host_end == -1:
  157. # atSign can be anywhere.
  158. at_sign = rest.rfind("@")
  159. else:
  160. # atSign must be in auth portion.
  161. # http://a@b/c@d => host:b auth:a path:/c@d
  162. at_sign = rest.rfind("@", 0, host_end + 1)
  163. # Now we have a portion which is definitely the auth.
  164. # Pull that off.
  165. if at_sign != -1:
  166. auth = rest[:at_sign]
  167. rest = rest[at_sign + 1 :]
  168. self.auth = auth
  169. # the host is the remaining to the left of the first non-host char
  170. host_end = -1
  171. for i in range(len(NON_HOST_CHARS)):
  172. hec = rest.find(NON_HOST_CHARS[i])
  173. if hec != -1 and (host_end == -1 or hec < host_end):
  174. host_end = hec
  175. # if we still have not hit it, then the entire thing is a host.
  176. if host_end == -1:
  177. host_end = len(rest)
  178. if host_end > 0 and rest[host_end - 1] == ":":
  179. host_end -= 1
  180. host = rest[:host_end]
  181. rest = rest[host_end:]
  182. # pull out port.
  183. self.parse_host(host)
  184. # we've indicated that there is a hostname,
  185. # so even if it's empty, it has to be present.
  186. self.hostname = self.hostname or ""
  187. # if hostname begins with [ and ends with ]
  188. # assume that it's an IPv6 address.
  189. ipv6_hostname = self.hostname.startswith("[") and self.hostname.endswith(
  190. "]"
  191. )
  192. # validate a little.
  193. if not ipv6_hostname:
  194. hostparts = self.hostname.split(".")
  195. l = len(hostparts) # noqa: E741
  196. i = 0
  197. while i < l:
  198. part = hostparts[i]
  199. if not part:
  200. i += 1 # emulate statement3 in JS for loop
  201. continue
  202. if not HOSTNAME_PART_PATTERN.search(part):
  203. newpart = ""
  204. k = len(part)
  205. j = 0
  206. while j < k:
  207. if ord(part[j]) > 127:
  208. # we replace non-ASCII char with a temporary placeholder
  209. # we need this to make sure size of hostname is not
  210. # broken by replacing non-ASCII by nothing
  211. newpart += "x"
  212. else:
  213. newpart += part[j]
  214. j += 1 # emulate statement3 in JS for loop
  215. # we test again with ASCII char only
  216. if not HOSTNAME_PART_PATTERN.search(newpart):
  217. valid_parts = hostparts[:i]
  218. not_host = hostparts[i + 1 :]
  219. bit = HOSTNAME_PART_START.search(part)
  220. if bit:
  221. valid_parts.append(bit.group(1))
  222. not_host.insert(0, bit.group(2))
  223. if not_host:
  224. rest = ".".join(not_host) + rest
  225. self.hostname = ".".join(valid_parts)
  226. break
  227. i += 1 # emulate statement3 in JS for loop
  228. if len(self.hostname) > HOSTNAME_MAX_LEN:
  229. self.hostname = ""
  230. # strip [ and ] from the hostname
  231. # the host field still retains them, though
  232. if ipv6_hostname:
  233. self.hostname = self.hostname[1:-1]
  234. # chop off from the tail first.
  235. hash = rest.find("#") # noqa: A001
  236. if hash != -1:
  237. # got a fragment string.
  238. self.hash = rest[hash:]
  239. rest = rest[:hash]
  240. qm = rest.find("?")
  241. if qm != -1:
  242. self.search = rest[qm:]
  243. rest = rest[:qm]
  244. if rest:
  245. self.pathname = rest
  246. if SLASHED_PROTOCOL[lower_proto] and self.hostname and not self.pathname:
  247. self.pathname = ""
  248. return self
  249. def parse_host(self, host: str) -> None:
  250. port_match = PORT_PATTERN.search(host)
  251. if port_match:
  252. port = port_match.group()
  253. if port != ":":
  254. self.port = port[1:]
  255. host = host[: -len(port)]
  256. if host:
  257. self.hostname = host
  258. def url_parse(url: URL | str, *, slashes_denote_host: bool = False) -> URL:
  259. if isinstance(url, URL):
  260. return url
  261. u = MutableURL()
  262. u.parse(url, slashes_denote_host)
  263. return URL(
  264. u.protocol, u.slashes, u.auth, u.port, u.hostname, u.hash, u.search, u.pathname
  265. )