# http_parser.py — HTTP/1.x request, response, and payload parsers.
import abc
import asyncio
import re
import string
from contextlib import suppress
from enum import IntEnum
from typing import (
    Any,
    ClassVar,
    Final,
    Generic,
    List,
    Literal,
    NamedTuple,
    Optional,
    Pattern,
    Set,
    Tuple,
    Type,
    TypeVar,
    Union,
)

from multidict import CIMultiDict, CIMultiDictProxy, istr
from yarl import URL

from . import hdrs
from .base_protocol import BaseProtocol
from .compression_utils import HAS_BROTLI, BrotliDecompressor, ZLibDecompressor
from .helpers import (
    _EXC_SENTINEL,
    DEBUG,
    EMPTY_BODY_METHODS,
    EMPTY_BODY_STATUS_CODES,
    NO_EXTENSIONS,
    BaseTimerContext,
    set_exception,
)
from .http_exceptions import (
    BadHttpMessage,
    BadHttpMethod,
    BadStatusLine,
    ContentEncodingError,
    ContentLengthError,
    InvalidHeader,
    InvalidURLError,
    LineTooLong,
    TransferEncodingError,
)
from .http_writer import HttpVersion, HttpVersion10
from .streams import EMPTY_PAYLOAD, StreamReader
from .typedefs import RawHeaders
__all__ = (
    "HeadersParser",
    "HttpParser",
    "HttpRequestParser",
    "HttpResponseParser",
    "RawRequestMessage",
    "RawResponseMessage",
)

# Line separator accepted by the parsers: strict CRLF, or bare LF in lax mode.
_SEP = Literal[b"\r\n", b"\n"]

ASCIISET: Final[Set[str]] = set(string.printable)

# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
# and https://www.rfc-editor.org/rfc/rfc9110.html#name-tokens
#
#     method = token
#     tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
#             "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
#     token = 1*tchar
_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
# RFC 9110 "token" (used for methods and header field names).
TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
# HTTP-version production, e.g. "HTTP/1.1" (single digits per RFC 9112).
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
# Unsigned decimal integer (Content-Length values, status codes).
DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
# Hexadecimal chunk-size field of a chunked transfer-coding body.
HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+")
class RawRequestMessage(NamedTuple):
    """Parsed HTTP request line and headers (the body is parsed separately)."""

    method: str
    path: str
    version: HttpVersion
    headers: "CIMultiDictProxy[str]"
    raw_headers: RawHeaders
    should_close: bool
    compression: Optional[str]
    upgrade: bool
    chunked: bool
    url: URL
class RawResponseMessage(NamedTuple):
    """Parsed HTTP status line and headers (the body is parsed separately)."""

    version: HttpVersion
    code: int
    reason: str
    headers: CIMultiDictProxy[str]
    raw_headers: RawHeaders
    should_close: bool
    compression: Optional[str]
    upgrade: bool
    chunked: bool
# Message type produced by a concrete parser: request or response message.
_MsgT = TypeVar("_MsgT", RawRequestMessage, RawResponseMessage)
class ParseState(IntEnum):
    """How the payload length is determined for the current message."""

    PARSE_NONE = 0
    PARSE_LENGTH = 1
    PARSE_CHUNKED = 2
    PARSE_UNTIL_EOF = 3
class ChunkState(IntEnum):
    """State machine positions for the chunked transfer-coding parser."""

    PARSE_CHUNKED_SIZE = 0
    PARSE_CHUNKED_CHUNK = 1
    PARSE_CHUNKED_CHUNK_EOF = 2
    PARSE_MAYBE_TRAILERS = 3
    PARSE_TRAILERS = 4
class HeadersParser:
    """Parse a list of raw header lines into a case-insensitive multidict.

    Supports deprecated obs-fold line continuations only in lax mode.
    """

    def __init__(
        self,
        max_line_size: int = 8190,
        max_headers: int = 32768,
        max_field_size: int = 8190,
        lax: bool = False,
    ) -> None:
        self.max_line_size = max_line_size
        self.max_headers = max_headers
        self.max_field_size = max_field_size
        self._lax = lax

    def parse_headers(
        self, lines: List[bytes]
    ) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
        """Parse ``lines`` (start-line at index 0, terminated by an empty line).

        Returns the headers as an immutable case-insensitive proxy plus the
        raw (name, value) byte pairs.  Raises InvalidHeader/LineTooLong on
        malformed or oversized fields.
        """
        headers: CIMultiDict[str] = CIMultiDict()
        # note: "raw" does not mean inclusion of OWS before/after the field value
        raw_headers = []

        # index 0 is the request/status line, so header parsing starts at 1
        lines_idx = 1
        line = lines[1]
        line_count = len(lines)

        while line:
            # Parse initial header name : value pair.
            try:
                bname, bvalue = line.split(b":", 1)
            except ValueError:
                raise InvalidHeader(line) from None

            if len(bname) == 0:
                raise InvalidHeader(bname)

            # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
            if {bname[0], bname[-1]} & {32, 9}:  # {" ", "\t"}
                raise InvalidHeader(line)

            bvalue = bvalue.lstrip(b" \t")
            if len(bname) > self.max_field_size:
                raise LineTooLong(
                    "request header name {}".format(
                        bname.decode("utf8", "backslashreplace")
                    ),
                    str(self.max_field_size),
                    str(len(bname)),
                )
            name = bname.decode("utf-8", "surrogateescape")
            if not TOKENRE.fullmatch(name):
                raise InvalidHeader(bname)

            header_length = len(bvalue)

            # next line
            lines_idx += 1
            line = lines[lines_idx]

            # consume continuation lines
            continuation = self._lax and line and line[0] in (32, 9)  # (' ', '\t')

            # Deprecated: https://www.rfc-editor.org/rfc/rfc9112.html#name-obsolete-line-folding
            if continuation:
                bvalue_lst = [bvalue]
                while continuation:
                    header_length += len(line)
                    if header_length > self.max_field_size:
                        raise LineTooLong(
                            "request header field {}".format(
                                bname.decode("utf8", "backslashreplace")
                            ),
                            str(self.max_field_size),
                            str(header_length),
                        )
                    bvalue_lst.append(line)

                    # next line
                    lines_idx += 1
                    if lines_idx < line_count:
                        line = lines[lines_idx]
                        if line:
                            continuation = line[0] in (32, 9)  # (' ', '\t')
                    else:
                        line = b""
                        break
                bvalue = b"".join(bvalue_lst)
            else:
                if header_length > self.max_field_size:
                    raise LineTooLong(
                        "request header field {}".format(
                            bname.decode("utf8", "backslashreplace")
                        ),
                        str(self.max_field_size),
                        str(header_length),
                    )

            bvalue = bvalue.strip(b" \t")
            value = bvalue.decode("utf-8", "surrogateescape")

            # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
            if "\n" in value or "\r" in value or "\x00" in value:
                raise InvalidHeader(bvalue)

            headers.add(name, value)
            raw_headers.append((bname, bvalue))

        return (CIMultiDictProxy(headers), tuple(raw_headers))
  197. def _is_supported_upgrade(headers: CIMultiDictProxy[str]) -> bool:
  198. """Check if the upgrade header is supported."""
  199. return headers.get(hdrs.UPGRADE, "").lower() in {"tcp", "websocket"}
  200. class HttpParser(abc.ABC, Generic[_MsgT]):
  201. lax: ClassVar[bool] = False
  202. def __init__(
  203. self,
  204. protocol: Optional[BaseProtocol] = None,
  205. loop: Optional[asyncio.AbstractEventLoop] = None,
  206. limit: int = 2**16,
  207. max_line_size: int = 8190,
  208. max_headers: int = 32768,
  209. max_field_size: int = 8190,
  210. timer: Optional[BaseTimerContext] = None,
  211. code: Optional[int] = None,
  212. method: Optional[str] = None,
  213. payload_exception: Optional[Type[BaseException]] = None,
  214. response_with_body: bool = True,
  215. read_until_eof: bool = False,
  216. auto_decompress: bool = True,
  217. ) -> None:
  218. self.protocol = protocol
  219. self.loop = loop
  220. self.max_line_size = max_line_size
  221. self.max_headers = max_headers
  222. self.max_field_size = max_field_size
  223. self.timer = timer
  224. self.code = code
  225. self.method = method
  226. self.payload_exception = payload_exception
  227. self.response_with_body = response_with_body
  228. self.read_until_eof = read_until_eof
  229. self._lines: List[bytes] = []
  230. self._tail = b""
  231. self._upgraded = False
  232. self._payload = None
  233. self._payload_parser: Optional[HttpPayloadParser] = None
  234. self._auto_decompress = auto_decompress
  235. self._limit = limit
  236. self._headers_parser = HeadersParser(
  237. max_line_size, max_headers, max_field_size, self.lax
  238. )
  239. @abc.abstractmethod
  240. def parse_message(self, lines: List[bytes]) -> _MsgT: ...
  241. @abc.abstractmethod
  242. def _is_chunked_te(self, te: str) -> bool: ...
  243. def feed_eof(self) -> Optional[_MsgT]:
  244. if self._payload_parser is not None:
  245. self._payload_parser.feed_eof()
  246. self._payload_parser = None
  247. else:
  248. # try to extract partial message
  249. if self._tail:
  250. self._lines.append(self._tail)
  251. if self._lines:
  252. if self._lines[-1] != "\r\n":
  253. self._lines.append(b"")
  254. with suppress(Exception):
  255. return self.parse_message(self._lines)
  256. return None
  257. def feed_data(
  258. self,
  259. data: bytes,
  260. SEP: _SEP = b"\r\n",
  261. EMPTY: bytes = b"",
  262. CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH,
  263. METH_CONNECT: str = hdrs.METH_CONNECT,
  264. SEC_WEBSOCKET_KEY1: istr = hdrs.SEC_WEBSOCKET_KEY1,
  265. ) -> Tuple[List[Tuple[_MsgT, StreamReader]], bool, bytes]:
  266. messages = []
  267. if self._tail:
  268. data, self._tail = self._tail + data, b""
  269. data_len = len(data)
  270. start_pos = 0
  271. loop = self.loop
  272. should_close = False
  273. while start_pos < data_len:
  274. # read HTTP message (request/response line + headers), \r\n\r\n
  275. # and split by lines
  276. if self._payload_parser is None and not self._upgraded:
  277. pos = data.find(SEP, start_pos)
  278. # consume \r\n
  279. if pos == start_pos and not self._lines:
  280. start_pos = pos + len(SEP)
  281. continue
  282. if pos >= start_pos:
  283. if should_close:
  284. raise BadHttpMessage("Data after `Connection: close`")
  285. # line found
  286. line = data[start_pos:pos]
  287. if SEP == b"\n": # For lax response parsing
  288. line = line.rstrip(b"\r")
  289. self._lines.append(line)
  290. start_pos = pos + len(SEP)
  291. # \r\n\r\n found
  292. if self._lines[-1] == EMPTY:
  293. try:
  294. msg: _MsgT = self.parse_message(self._lines)
  295. finally:
  296. self._lines.clear()
  297. def get_content_length() -> Optional[int]:
  298. # payload length
  299. length_hdr = msg.headers.get(CONTENT_LENGTH)
  300. if length_hdr is None:
  301. return None
  302. # Shouldn't allow +/- or other number formats.
  303. # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
  304. # msg.headers is already stripped of leading/trailing wsp
  305. if not DIGITS.fullmatch(length_hdr):
  306. raise InvalidHeader(CONTENT_LENGTH)
  307. return int(length_hdr)
  308. length = get_content_length()
  309. # do not support old websocket spec
  310. if SEC_WEBSOCKET_KEY1 in msg.headers:
  311. raise InvalidHeader(SEC_WEBSOCKET_KEY1)
  312. self._upgraded = msg.upgrade and _is_supported_upgrade(
  313. msg.headers
  314. )
  315. method = getattr(msg, "method", self.method)
  316. # code is only present on responses
  317. code = getattr(msg, "code", 0)
  318. assert self.protocol is not None
  319. # calculate payload
  320. empty_body = code in EMPTY_BODY_STATUS_CODES or bool(
  321. method and method in EMPTY_BODY_METHODS
  322. )
  323. if not empty_body and (
  324. ((length is not None and length > 0) or msg.chunked)
  325. and not self._upgraded
  326. ):
  327. payload = StreamReader(
  328. self.protocol,
  329. timer=self.timer,
  330. loop=loop,
  331. limit=self._limit,
  332. )
  333. payload_parser = HttpPayloadParser(
  334. payload,
  335. length=length,
  336. chunked=msg.chunked,
  337. method=method,
  338. compression=msg.compression,
  339. code=self.code,
  340. response_with_body=self.response_with_body,
  341. auto_decompress=self._auto_decompress,
  342. lax=self.lax,
  343. )
  344. if not payload_parser.done:
  345. self._payload_parser = payload_parser
  346. elif method == METH_CONNECT:
  347. assert isinstance(msg, RawRequestMessage)
  348. payload = StreamReader(
  349. self.protocol,
  350. timer=self.timer,
  351. loop=loop,
  352. limit=self._limit,
  353. )
  354. self._upgraded = True
  355. self._payload_parser = HttpPayloadParser(
  356. payload,
  357. method=msg.method,
  358. compression=msg.compression,
  359. auto_decompress=self._auto_decompress,
  360. lax=self.lax,
  361. )
  362. elif not empty_body and length is None and self.read_until_eof:
  363. payload = StreamReader(
  364. self.protocol,
  365. timer=self.timer,
  366. loop=loop,
  367. limit=self._limit,
  368. )
  369. payload_parser = HttpPayloadParser(
  370. payload,
  371. length=length,
  372. chunked=msg.chunked,
  373. method=method,
  374. compression=msg.compression,
  375. code=self.code,
  376. response_with_body=self.response_with_body,
  377. auto_decompress=self._auto_decompress,
  378. lax=self.lax,
  379. )
  380. if not payload_parser.done:
  381. self._payload_parser = payload_parser
  382. else:
  383. payload = EMPTY_PAYLOAD
  384. messages.append((msg, payload))
  385. should_close = msg.should_close
  386. else:
  387. self._tail = data[start_pos:]
  388. data = EMPTY
  389. break
  390. # no parser, just store
  391. elif self._payload_parser is None and self._upgraded:
  392. assert not self._lines
  393. break
  394. # feed payload
  395. elif data and start_pos < data_len:
  396. assert not self._lines
  397. assert self._payload_parser is not None
  398. try:
  399. eof, data = self._payload_parser.feed_data(data[start_pos:], SEP)
  400. except BaseException as underlying_exc:
  401. reraised_exc = underlying_exc
  402. if self.payload_exception is not None:
  403. reraised_exc = self.payload_exception(str(underlying_exc))
  404. set_exception(
  405. self._payload_parser.payload,
  406. reraised_exc,
  407. underlying_exc,
  408. )
  409. eof = True
  410. data = b""
  411. if eof:
  412. start_pos = 0
  413. data_len = len(data)
  414. self._payload_parser = None
  415. continue
  416. else:
  417. break
  418. if data and start_pos < data_len:
  419. data = data[start_pos:]
  420. else:
  421. data = EMPTY
  422. return messages, self._upgraded, data
  423. def parse_headers(
  424. self, lines: List[bytes]
  425. ) -> Tuple[
  426. "CIMultiDictProxy[str]", RawHeaders, Optional[bool], Optional[str], bool, bool
  427. ]:
  428. """Parses RFC 5322 headers from a stream.
  429. Line continuations are supported. Returns list of header name
  430. and value pairs. Header name is in upper case.
  431. """
  432. headers, raw_headers = self._headers_parser.parse_headers(lines)
  433. close_conn = None
  434. encoding = None
  435. upgrade = False
  436. chunked = False
  437. # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-6
  438. # https://www.rfc-editor.org/rfc/rfc9110.html#name-collected-abnf
  439. singletons = (
  440. hdrs.CONTENT_LENGTH,
  441. hdrs.CONTENT_LOCATION,
  442. hdrs.CONTENT_RANGE,
  443. hdrs.CONTENT_TYPE,
  444. hdrs.ETAG,
  445. hdrs.HOST,
  446. hdrs.MAX_FORWARDS,
  447. hdrs.SERVER,
  448. hdrs.TRANSFER_ENCODING,
  449. hdrs.USER_AGENT,
  450. )
  451. bad_hdr = next((h for h in singletons if len(headers.getall(h, ())) > 1), None)
  452. if bad_hdr is not None:
  453. raise BadHttpMessage(f"Duplicate '{bad_hdr}' header found.")
  454. # keep-alive
  455. conn = headers.get(hdrs.CONNECTION)
  456. if conn:
  457. v = conn.lower()
  458. if v == "close":
  459. close_conn = True
  460. elif v == "keep-alive":
  461. close_conn = False
  462. # https://www.rfc-editor.org/rfc/rfc9110.html#name-101-switching-protocols
  463. elif v == "upgrade" and headers.get(hdrs.UPGRADE):
  464. upgrade = True
  465. # encoding
  466. enc = headers.get(hdrs.CONTENT_ENCODING)
  467. if enc:
  468. enc = enc.lower()
  469. if enc in ("gzip", "deflate", "br"):
  470. encoding = enc
  471. # chunking
  472. te = headers.get(hdrs.TRANSFER_ENCODING)
  473. if te is not None:
  474. if self._is_chunked_te(te):
  475. chunked = True
  476. if hdrs.CONTENT_LENGTH in headers:
  477. raise BadHttpMessage(
  478. "Transfer-Encoding can't be present with Content-Length",
  479. )
  480. return (headers, raw_headers, close_conn, encoding, upgrade, chunked)
  481. def set_upgraded(self, val: bool) -> None:
  482. """Set connection upgraded (to websocket) mode.
  483. :param bool val: new state.
  484. """
  485. self._upgraded = val
class HttpRequestParser(HttpParser[RawRequestMessage]):
    """Read request status line.

    Exception .http_exceptions.BadStatusLine
    could be raised in case of any errors in status line.
    Returns RawRequestMessage.
    """

    def parse_message(self, lines: List[bytes]) -> RawRequestMessage:
        """Parse the request line and headers into a RawRequestMessage."""
        # request line
        line = lines[0].decode("utf-8", "surrogateescape")
        try:
            method, path, version = line.split(" ", maxsplit=2)
        except ValueError:
            raise BadHttpMethod(line) from None

        if len(path) > self.max_line_size:
            raise LineTooLong(
                "Status line is too long", str(self.max_line_size), str(len(path))
            )

        # method
        if not TOKENRE.fullmatch(method):
            raise BadHttpMethod(method)

        # version
        match = VERSRE.fullmatch(version)
        if match is None:
            raise BadStatusLine(line)
        version_o = HttpVersion(int(match.group(1)), int(match.group(2)))

        if method == "CONNECT":
            # authority-form,
            # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.3
            url = URL.build(authority=path, encoded=True)
        elif path.startswith("/"):
            # origin-form,
            # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.1
            path_part, _hash_separator, url_fragment = path.partition("#")
            path_part, _question_mark_separator, qs_part = path_part.partition("?")

            # NOTE: `yarl.URL.build()` is used to mimic what the Cython-based
            # NOTE: parser does, otherwise it results into the same
            # NOTE: HTTP Request-Line input producing different
            # NOTE: `yarl.URL()` objects
            url = URL.build(
                path=path_part,
                query_string=qs_part,
                fragment=url_fragment,
                encoded=True,
            )
        elif path == "*" and method == "OPTIONS":
            # asterisk-form,
            url = URL(path, encoded=True)
        else:
            # absolute-form for proxy maybe,
            # https://datatracker.ietf.org/doc/html/rfc7230#section-5.3.2
            url = URL(path, encoded=True)
            if url.scheme == "":
                # not absolute-form
                raise InvalidURLError(
                    path.encode(errors="surrogateescape").decode("latin1")
                )

        # read headers
        (
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
        ) = self.parse_headers(lines)

        if close is None:  # then the headers weren't set in the request
            if version_o <= HttpVersion10:  # HTTP 1.0 defaults to closing
                close = True
            else:  # HTTP 1.1 defaults to keep-alive
                close = False

        return RawRequestMessage(
            method,
            path,
            version_o,
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
            url,
        )

    def _is_chunked_te(self, te: str) -> bool:
        """Return True if the final transfer-coding is "chunked"; else raise."""
        if te.rsplit(",", maxsplit=1)[-1].strip(" \t").lower() == "chunked":
            return True
        # https://www.rfc-editor.org/rfc/rfc9112#section-6.3-2.4.3
        raise BadHttpMessage("Request has invalid `Transfer-Encoding`")
class HttpResponseParser(HttpParser[RawResponseMessage]):
    """Read response status line and headers.

    BadStatusLine could be raised in case of any errors in status line.
    Returns RawResponseMessage.
    """

    # Lax mode should only be enabled on response parser.
    lax = not DEBUG

    def feed_data(
        self,
        data: bytes,
        SEP: Optional[_SEP] = None,
        *args: Any,
        **kwargs: Any,
    ) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
        # Default to bare-LF separators in lax mode (non-DEBUG builds).
        if SEP is None:
            SEP = b"\r\n" if DEBUG else b"\n"
        return super().feed_data(data, SEP, *args, **kwargs)

    def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
        """Parse the status line and headers into a RawResponseMessage."""
        line = lines[0].decode("utf-8", "surrogateescape")
        try:
            version, status = line.split(maxsplit=1)
        except ValueError:
            raise BadStatusLine(line) from None

        try:
            status, reason = status.split(maxsplit=1)
        except ValueError:
            # Status line without a reason phrase.
            status = status.strip()
            reason = ""

        if len(reason) > self.max_line_size:
            raise LineTooLong(
                "Status line is too long", str(self.max_line_size), str(len(reason))
            )

        # version
        match = VERSRE.fullmatch(version)
        if match is None:
            raise BadStatusLine(line)
        version_o = HttpVersion(int(match.group(1)), int(match.group(2)))

        # The status code is a three-digit ASCII number, no padding
        if len(status) != 3 or not DIGITS.fullmatch(status):
            raise BadStatusLine(line)
        status_i = int(status)

        # read headers
        (
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
        ) = self.parse_headers(lines)

        if close is None:
            if version_o <= HttpVersion10:
                close = True
            # https://www.rfc-editor.org/rfc/rfc9112.html#name-message-body-length
            elif 100 <= status_i < 200 or status_i in {204, 304}:
                close = False
            elif hdrs.CONTENT_LENGTH in headers or hdrs.TRANSFER_ENCODING in headers:
                close = False
            else:
                # https://www.rfc-editor.org/rfc/rfc9112.html#section-6.3-2.8
                close = True

        return RawResponseMessage(
            version_o,
            status_i,
            reason.strip(),
            headers,
            raw_headers,
            close,
            compression,
            upgrade,
            chunked,
        )

    def _is_chunked_te(self, te: str) -> bool:
        # https://www.rfc-editor.org/rfc/rfc9112#section-6.3-2.4.2
        return te.rsplit(",", maxsplit=1)[-1].strip(" \t").lower() == "chunked"
class HttpPayloadParser:
    """Parse a message body (fixed-length, chunked, or read-until-EOF).

    Decoded payload bytes are fed into the supplied StreamReader, optionally
    wrapped in a DeflateBuffer for transparent decompression.
    """

    def __init__(
        self,
        payload: StreamReader,
        length: Optional[int] = None,
        chunked: bool = False,
        compression: Optional[str] = None,
        code: Optional[int] = None,
        method: Optional[str] = None,
        response_with_body: bool = True,
        auto_decompress: bool = True,
        lax: bool = False,
    ) -> None:
        self._length = 0
        self._type = ParseState.PARSE_UNTIL_EOF
        self._chunk = ChunkState.PARSE_CHUNKED_SIZE
        self._chunk_size = 0
        # Partial data carried over between feed_data calls (e.g. a chunk-size
        # line split across TCP segments).
        self._chunk_tail = b""
        self._auto_decompress = auto_decompress
        self._lax = lax
        # True once the payload is fully delivered (or known to be empty).
        self.done = False

        # payload decompression wrapper
        if response_with_body and compression and self._auto_decompress:
            real_payload: Union[StreamReader, DeflateBuffer] = DeflateBuffer(
                payload, compression
            )
        else:
            real_payload = payload

        # payload parser
        if not response_with_body:
            # don't parse payload if it's not expected to be received
            self._type = ParseState.PARSE_NONE
            real_payload.feed_eof()
            self.done = True
        elif chunked:
            self._type = ParseState.PARSE_CHUNKED
        elif length is not None:
            self._type = ParseState.PARSE_LENGTH
            self._length = length
            if self._length == 0:
                real_payload.feed_eof()
                self.done = True

        self.payload = real_payload

    def feed_eof(self) -> None:
        """Handle end of stream; raise if the body is known to be truncated."""
        if self._type == ParseState.PARSE_UNTIL_EOF:
            self.payload.feed_eof()
        elif self._type == ParseState.PARSE_LENGTH:
            raise ContentLengthError(
                "Not enough data for satisfy content length header."
            )
        elif self._type == ParseState.PARSE_CHUNKED:
            raise TransferEncodingError(
                "Not enough data for satisfy transfer length header."
            )

    def feed_data(
        self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";"
    ) -> Tuple[bool, bytes]:
        """Feed body bytes; return (payload_complete, leftover_bytes)."""
        # Read specified amount of bytes
        if self._type == ParseState.PARSE_LENGTH:
            required = self._length
            chunk_len = len(chunk)

            if required >= chunk_len:
                self._length = required - chunk_len
                self.payload.feed_data(chunk, chunk_len)
                if self._length == 0:
                    self.payload.feed_eof()
                    return True, b""
            else:
                self._length = 0
                self.payload.feed_data(chunk[:required], required)
                self.payload.feed_eof()
                return True, chunk[required:]

        # Chunked transfer encoding parser
        elif self._type == ParseState.PARSE_CHUNKED:
            if self._chunk_tail:
                chunk = self._chunk_tail + chunk
                self._chunk_tail = b""

            while chunk:
                # read next chunk size
                if self._chunk == ChunkState.PARSE_CHUNKED_SIZE:
                    pos = chunk.find(SEP)
                    if pos >= 0:
                        i = chunk.find(CHUNK_EXT, 0, pos)
                        if i >= 0:
                            size_b = chunk[:i]  # strip chunk-extensions
                            # Verify no LF in the chunk-extension
                            if b"\n" in (ext := chunk[i:pos]):
                                exc = BadHttpMessage(
                                    f"Unexpected LF in chunk-extension: {ext!r}"
                                )
                                set_exception(self.payload, exc)
                                raise exc
                        else:
                            size_b = chunk[:pos]

                        if self._lax:  # Allow whitespace in lax mode.
                            size_b = size_b.strip()

                        if not re.fullmatch(HEXDIGITS, size_b):
                            exc = TransferEncodingError(
                                chunk[:pos].decode("ascii", "surrogateescape")
                            )
                            set_exception(self.payload, exc)
                            raise exc
                        size = int(bytes(size_b), 16)

                        chunk = chunk[pos + len(SEP) :]
                        if size == 0:  # eof marker
                            self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
                            if self._lax and chunk.startswith(b"\r"):
                                chunk = chunk[1:]
                        else:
                            self._chunk = ChunkState.PARSE_CHUNKED_CHUNK
                            self._chunk_size = size
                            self.payload.begin_http_chunk_receiving()
                    else:
                        self._chunk_tail = chunk
                        return False, b""

                # read chunk and feed buffer
                if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK:
                    required = self._chunk_size
                    chunk_len = len(chunk)

                    if required > chunk_len:
                        self._chunk_size = required - chunk_len
                        self.payload.feed_data(chunk, chunk_len)
                        return False, b""
                    else:
                        self._chunk_size = 0
                        self.payload.feed_data(chunk[:required], required)
                        chunk = chunk[required:]
                        self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF
                        self.payload.end_http_chunk_receiving()

                # toss the CRLF at the end of the chunk
                if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF:
                    if self._lax and chunk.startswith(b"\r"):
                        chunk = chunk[1:]
                    if chunk[: len(SEP)] == SEP:
                        chunk = chunk[len(SEP) :]
                        self._chunk = ChunkState.PARSE_CHUNKED_SIZE
                    else:
                        self._chunk_tail = chunk
                        return False, b""

                # if stream does not contain trailer, after 0\r\n
                # we should get another \r\n otherwise
                # trailers needs to be skipped until \r\n\r\n
                if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS:
                    head = chunk[: len(SEP)]
                    if head == SEP:
                        # end of stream
                        self.payload.feed_eof()
                        return True, chunk[len(SEP) :]
                    # Both CR and LF, or only LF may not be received yet. It is
                    # expected that CRLF or LF will be shown at the very first
                    # byte next time, otherwise trailers should come. The last
                    # CRLF which marks the end of response might not be
                    # contained in the same TCP segment which delivered the
                    # size indicator.
                    if not head:
                        return False, b""
                    if head == SEP[:1]:
                        self._chunk_tail = head
                        return False, b""
                    self._chunk = ChunkState.PARSE_TRAILERS

                # read and discard trailer up to the CRLF terminator
                if self._chunk == ChunkState.PARSE_TRAILERS:
                    pos = chunk.find(SEP)
                    if pos >= 0:
                        chunk = chunk[pos + len(SEP) :]
                        self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
                    else:
                        self._chunk_tail = chunk
                        return False, b""

        # Read all bytes until eof
        elif self._type == ParseState.PARSE_UNTIL_EOF:
            self.payload.feed_data(chunk, len(chunk))

        return False, b""
class DeflateBuffer:
    """DeflateStream decompress stream and feed data into specified stream."""

    decompressor: Any

    def __init__(self, out: StreamReader, encoding: Optional[str]) -> None:
        self.out = out
        # Total compressed bytes received so far.
        self.size = 0
        self.encoding = encoding
        self._started_decoding = False

        self.decompressor: Union[BrotliDecompressor, ZLibDecompressor]
        if encoding == "br":
            if not HAS_BROTLI:  # pragma: no cover
                raise ContentEncodingError(
                    "Can not decode content-encoding: brotli (br). "
                    "Please install `Brotli`"
                )
            self.decompressor = BrotliDecompressor()
        else:
            self.decompressor = ZLibDecompressor(encoding=encoding)

    def set_exception(
        self,
        exc: BaseException,
        exc_cause: BaseException = _EXC_SENTINEL,
    ) -> None:
        # Forward the exception to the wrapped output stream.
        set_exception(self.out, exc, exc_cause)

    def feed_data(self, chunk: bytes, size: int) -> None:
        """Decompress *chunk* and feed the result into the output stream."""
        if not size:
            return

        self.size += size

        # RFC1950
        # bits 0..3 = CM = 0b1000 = 8 = "deflate"
        # bits 4..7 = CINFO = 1..7 = windows size
        if (
            not self._started_decoding
            and self.encoding == "deflate"
            and chunk[0] & 0xF != 8
        ):
            # Change the decoder to decompress incorrectly compressed data
            # Actually we should issue a warning about non-RFC-compliant data.
            self.decompressor = ZLibDecompressor(
                encoding=self.encoding, suppress_deflate_header=True
            )

        try:
            chunk = self.decompressor.decompress_sync(chunk)
        except Exception:
            raise ContentEncodingError(
                "Can not decode content-encoding: %s" % self.encoding
            )

        self._started_decoding = True

        if chunk:
            self.out.feed_data(chunk, len(chunk))

    def feed_eof(self) -> None:
        """Flush the decompressor and propagate EOF to the output stream."""
        chunk = self.decompressor.flush()

        if chunk or self.size > 0:
            self.out.feed_data(chunk, len(chunk))
            # An incomplete deflate stream means the body was truncated.
            if self.encoding == "deflate" and not self.decompressor.eof:
                raise ContentEncodingError("deflate")

        self.out.feed_eof()

    def begin_http_chunk_receiving(self) -> None:
        self.out.begin_http_chunk_receiving()

    def end_http_chunk_receiving(self) -> None:
        self.out.end_http_chunk_receiving()
# Keep references to the pure-Python implementations before (possibly)
# shadowing the public names with the C-accelerated versions below.
HttpRequestParserPy = HttpRequestParser
HttpResponseParserPy = HttpResponseParser
RawRequestMessagePy = RawRequestMessage
RawResponseMessagePy = RawResponseMessage

try:
    if not NO_EXTENSIONS:
        # Prefer the Cython-based parser when extensions are enabled.
        from ._http_parser import (  # type: ignore[import-not-found,no-redef]
            HttpRequestParser,
            HttpResponseParser,
            RawRequestMessage,
            RawResponseMessage,
        )

        HttpRequestParserC = HttpRequestParser
        HttpResponseParserC = HttpResponseParser
        RawRequestMessageC = RawRequestMessage
        RawResponseMessageC = RawResponseMessage
except ImportError:  # pragma: no cover
    # Fall back silently to the pure-Python implementations.
    pass