# _parser.py
  1. """Handwritten parser of dependency specifiers.
  2. The docstring for each __parse_* function contains EBNF-inspired grammar representing
  3. the implementation.
  4. """
  5. from __future__ import annotations
  6. import ast
  7. from typing import NamedTuple, Sequence, Tuple, Union
  8. from ._tokenizer import DEFAULT_RULES, Tokenizer
class Node:
    def __init__(self, value: str) -> None:
        self.value = value

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}('{self}')>"

    def serialize(self) -> str:
        raise NotImplementedError


class Variable(Node):
    def serialize(self) -> str:
        return str(self)


class Value(Node):
    def serialize(self) -> str:
        return f'"{self}"'


class Op(Node):
    def serialize(self) -> str:
        return str(self)
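# Illustrative sketch (comments only, not executed): how the node types above
# serialize when a marker is re-emitted as text.
#
#     Variable("python_version").serialize()  # -> 'python_version'
#     Value("3.8").serialize()                # -> '"3.8"'
#     Op(">=").serialize()                    # -> '>='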
MarkerVar = Union[Variable, Value]
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
MarkerAtom = Union[MarkerItem, Sequence["MarkerAtom"]]
MarkerList = Sequence[Union["MarkerList", MarkerAtom, str]]


class ParsedRequirement(NamedTuple):
    name: str
    url: str
    extras: list[str]
    specifier: str
    marker: MarkerList | None
# --------------------------------------------------------------------------------------
# Recursive descent parser for dependency specifier
# --------------------------------------------------------------------------------------
def parse_requirement(source: str) -> ParsedRequirement:
    return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES))
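# Example usage (illustrative; "mypkg" is a made-up name and the exact field
# values depend on the tokenizer rules in ._tokenizer):
#
#     parse_requirement("mypkg[extra]>=1.0 ; python_version >= '3.8'")
#     # -> ParsedRequirement(name='mypkg', url='', extras=['extra'],
#     #        specifier='>=1.0',
#     #        marker=[(<Variable('python_version')>, <Op('>=')>, <Value('3.8')>)])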
def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement:
    """
    requirement = WS? IDENTIFIER WS? extras WS? requirement_details
    """
    tokenizer.consume("WS")

    name_token = tokenizer.expect(
        "IDENTIFIER", expected="package name at the start of dependency specifier"
    )
    name = name_token.text
    tokenizer.consume("WS")

    extras = _parse_extras(tokenizer)
    tokenizer.consume("WS")

    url, specifier, marker = _parse_requirement_details(tokenizer)
    tokenizer.expect("END", expected="end of dependency specifier")

    return ParsedRequirement(name, url, extras, specifier, marker)


def _parse_requirement_details(
    tokenizer: Tokenizer,
) -> tuple[str, str, MarkerList | None]:
    """
    requirement_details = AT URL (WS requirement_marker?)?
                        | specifier WS? (requirement_marker)?
    """
    specifier = ""
    url = ""
    marker = None

    if tokenizer.check("AT"):
        tokenizer.read()
        tokenizer.consume("WS")

        url_start = tokenizer.position
        url = tokenizer.expect("URL", expected="URL after @").text
        if tokenizer.check("END", peek=True):
            return (url, specifier, marker)

        tokenizer.expect("WS", expected="whitespace after URL")

        # The input might end after whitespace.
        if tokenizer.check("END", peek=True):
            return (url, specifier, marker)

        marker = _parse_requirement_marker(
            tokenizer, span_start=url_start, after="URL and whitespace"
        )
    else:
        specifier_start = tokenizer.position
        specifier = _parse_specifier(tokenizer)
        tokenizer.consume("WS")

        if tokenizer.check("END", peek=True):
            return (url, specifier, marker)

        marker = _parse_requirement_marker(
            tokenizer,
            span_start=specifier_start,
            after=(
                "version specifier"
                if specifier
                else "name and no valid version specifier"
            ),
        )

    return (url, specifier, marker)


def _parse_requirement_marker(
    tokenizer: Tokenizer, *, span_start: int, after: str
) -> MarkerList:
    """
    requirement_marker = SEMICOLON marker WS?
    """
    if not tokenizer.check("SEMICOLON"):
        tokenizer.raise_syntax_error(
            f"Expected end or semicolon (after {after})",
            span_start=span_start,
        )
    tokenizer.read()

    marker = _parse_marker(tokenizer)
    tokenizer.consume("WS")

    return marker
def _parse_extras(tokenizer: Tokenizer) -> list[str]:
    """
    extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)?
    """
    if not tokenizer.check("LEFT_BRACKET", peek=True):
        return []

    with tokenizer.enclosing_tokens(
        "LEFT_BRACKET",
        "RIGHT_BRACKET",
        around="extras",
    ):
        tokenizer.consume("WS")
        extras = _parse_extras_list(tokenizer)
        tokenizer.consume("WS")

    return extras


def _parse_extras_list(tokenizer: Tokenizer) -> list[str]:
    """
    extras_list = identifier (wsp* ',' wsp* identifier)*
    """
    extras: list[str] = []

    if not tokenizer.check("IDENTIFIER"):
        return extras

    extras.append(tokenizer.read().text)

    while True:
        tokenizer.consume("WS")
        if tokenizer.check("IDENTIFIER", peek=True):
            tokenizer.raise_syntax_error("Expected comma between extra names")
        elif not tokenizer.check("COMMA"):
            break

        tokenizer.read()
        tokenizer.consume("WS")

        extra_token = tokenizer.expect("IDENTIFIER", expected="extra name after comma")
        extras.append(extra_token.text)

    return extras
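# Illustrative inputs for the extras grammar above ("mypkg", "security", and
# "tests" are made-up names):
#
#     "mypkg[]"                 -> extras == []
#     "mypkg[security]"         -> extras == ["security"]
#     "mypkg[security , tests]" -> extras == ["security", "tests"]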
def _parse_specifier(tokenizer: Tokenizer) -> str:
    """
    specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS
              | WS? version_many WS?
    """
    with tokenizer.enclosing_tokens(
        "LEFT_PARENTHESIS",
        "RIGHT_PARENTHESIS",
        around="version specifier",
    ):
        tokenizer.consume("WS")
        parsed_specifiers = _parse_version_many(tokenizer)
        tokenizer.consume("WS")

    return parsed_specifiers


def _parse_version_many(tokenizer: Tokenizer) -> str:
    """
    version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)?
    """
    parsed_specifiers = ""
    while tokenizer.check("SPECIFIER"):
        span_start = tokenizer.position
        parsed_specifiers += tokenizer.read().text
        if tokenizer.check("VERSION_PREFIX_TRAIL", peek=True):
            tokenizer.raise_syntax_error(
                ".* suffix can only be used with `==` or `!=` operators",
                span_start=span_start,
                span_end=tokenizer.position + 1,
            )
        if tokenizer.check("VERSION_LOCAL_LABEL_TRAIL", peek=True):
            tokenizer.raise_syntax_error(
                "Local version label can only be used with `==` or `!=` operators",
                span_start=span_start,
                span_end=tokenizer.position,
            )
        tokenizer.consume("WS")
        if not tokenizer.check("COMMA"):
            break
        parsed_specifiers += tokenizer.read().text
        tokenizer.consume("WS")

    return parsed_specifiers
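# Illustrative sketch of what _parse_version_many accumulates: SPECIFIER and COMMA
# token texts are concatenated while the whitespace between them is consumed, so an
# input such as ">=1.0, <2.0" yields a string like ">=1.0,<2.0".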
# --------------------------------------------------------------------------------------
# Recursive descent parser for marker expression
# --------------------------------------------------------------------------------------
def parse_marker(source: str) -> MarkerList:
    return _parse_full_marker(Tokenizer(source, rules=DEFAULT_RULES))


def _parse_full_marker(tokenizer: Tokenizer) -> MarkerList:
    retval = _parse_marker(tokenizer)
    tokenizer.expect("END", expected="end of marker expression")
    return retval


def _parse_marker(tokenizer: Tokenizer) -> MarkerList:
    """
    marker = marker_atom (BOOLOP marker_atom)+
    """
    expression = [_parse_marker_atom(tokenizer)]
    while tokenizer.check("BOOLOP"):
        token = tokenizer.read()
        expr_right = _parse_marker_atom(tokenizer)
        expression.extend((token.text, expr_right))
    return expression
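# Illustrative sketch of the flat list _parse_marker builds (made-up input):
#
#     parse_marker("python_version >= '3.8' and sys_platform == 'linux'")
#     # -> [(<Variable('python_version')>, <Op('>=')>, <Value('3.8')>),
#     #     'and',
#     #     (<Variable('sys_platform')>, <Op('==')>, <Value('linux')>)]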
def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom:
    """
    marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS?
                | WS? marker_item WS?
    """
    tokenizer.consume("WS")
    if tokenizer.check("LEFT_PARENTHESIS", peek=True):
        with tokenizer.enclosing_tokens(
            "LEFT_PARENTHESIS",
            "RIGHT_PARENTHESIS",
            around="marker expression",
        ):
            tokenizer.consume("WS")
            marker: MarkerAtom = _parse_marker(tokenizer)
            tokenizer.consume("WS")
    else:
        marker = _parse_marker_item(tokenizer)
    tokenizer.consume("WS")
    return marker


def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem:
    """
    marker_item = WS? marker_var WS? marker_op WS? marker_var WS?
    """
    tokenizer.consume("WS")
    marker_var_left = _parse_marker_var(tokenizer)
    tokenizer.consume("WS")
    marker_op = _parse_marker_op(tokenizer)
    tokenizer.consume("WS")
    marker_var_right = _parse_marker_var(tokenizer)
    tokenizer.consume("WS")
    return (marker_var_left, marker_op, marker_var_right)


def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar:
    """
    marker_var = VARIABLE | QUOTED_STRING
    """
    if tokenizer.check("VARIABLE"):
        return process_env_var(tokenizer.read().text.replace(".", "_"))
    elif tokenizer.check("QUOTED_STRING"):
        return process_python_str(tokenizer.read().text)
    else:
        tokenizer.raise_syntax_error(
            message="Expected a marker variable or quoted string"
        )


def process_env_var(env_var: str) -> Variable:
    if env_var in ("platform_python_implementation", "python_implementation"):
        return Variable("platform_python_implementation")
    else:
        return Variable(env_var)


def process_python_str(python_str: str) -> Value:
    value = ast.literal_eval(python_str)
    return Value(str(value))
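# Illustrative examples of the two normalization helpers above:
#
#     process_env_var("python_implementation")  # -> Variable("platform_python_implementation")
#     process_env_var("os_name")                # -> Variable("os_name")
#     process_python_str("'cpython'")           # -> Value("cpython")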
def _parse_marker_op(tokenizer: Tokenizer) -> Op:
    """
    marker_op = IN | NOT IN | OP
    """
    if tokenizer.check("IN"):
        tokenizer.read()
        return Op("in")
    elif tokenizer.check("NOT"):
        tokenizer.read()
        tokenizer.expect("WS", expected="whitespace after 'not'")
        tokenizer.expect("IN", expected="'in' after 'not'")
        return Op("not in")
    elif tokenizer.check("OP"):
        return Op(tokenizer.read().text)
    else:
        return tokenizer.raise_syntax_error(
            "Expected marker operator, one of "
            "<=, <, !=, ==, >=, >, ~=, ===, in, not in"
        )