parse.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. """Parser for attributes::
  2. attributes { id = "foo", class = "bar baz",
  3. key1 = "val1", key2 = "val2" }
  4. Adapted from:
  5. https://github.com/jgm/djot/blob/fae7364b86bfce69bc6d5b5eede1f5196d845fd6/djot/attributes.lua#L1
  6. syntax:
  7. attributes <- '{' whitespace* attribute (whitespace attribute)* whitespace* '}'
  8. attribute <- identifier | class | keyval
  9. identifier <- '#' name
  10. class <- '.' name
  11. name <- (nonspace, nonpunctuation other than ':', '_', '-')+
  12. keyval <- key '=' val
  13. key <- (ASCII_ALPHANUM | ':' | '_' | '-')+
  14. val <- bareval | quotedval
  15. bareval <- (ASCII_ALPHANUM | ':' | '_' | '-')+
  16. quotedval <- '"' ([^"] | '\"') '"'
  17. """
  18. from __future__ import annotations
  19. from enum import Enum
  20. import re
  21. from typing import Callable
  22. class State(Enum):
  23. START = 0
  24. SCANNING = 1
  25. SCANNING_ID = 2
  26. SCANNING_CLASS = 3
  27. SCANNING_KEY = 4
  28. SCANNING_VALUE = 5
  29. SCANNING_BARE_VALUE = 6
  30. SCANNING_QUOTED_VALUE = 7
  31. SCANNING_COMMENT = 8
  32. SCANNING_ESCAPED = 9
  33. DONE = 10
  34. REGEX_SPACE = re.compile(r"\s")
  35. REGEX_SPACE_PUNCTUATION = re.compile(r"[\s!\"#$%&'()*+,./;<=>?@[\]^`{|}~]")
  36. REGEX_KEY_CHARACTERS = re.compile(r"[a-zA-Z\d_:-]")
  37. class TokenState:
  38. def __init__(self) -> None:
  39. self._tokens: list[tuple[int, int, str]] = []
  40. self.start: int = 0
  41. def set_start(self, start: int) -> None:
  42. self.start = start
  43. def append(self, start: int, end: int, ttype: str) -> None:
  44. self._tokens.append((start, end, ttype))
  45. def compile(self, string: str) -> dict[str, str]:
  46. """compile the tokens into a dictionary"""
  47. attributes = {}
  48. classes = []
  49. idx = 0
  50. while idx < len(self._tokens):
  51. start, end, ttype = self._tokens[idx]
  52. if ttype == "id":
  53. attributes["id"] = string[start:end]
  54. elif ttype == "class":
  55. classes.append(string[start:end])
  56. elif ttype == "key":
  57. key = string[start:end]
  58. if idx + 1 < len(self._tokens):
  59. start, end, ttype = self._tokens[idx + 1]
  60. if ttype == "value":
  61. if key == "class":
  62. classes.append(string[start:end])
  63. else:
  64. attributes[key] = string[start:end]
  65. idx += 1
  66. idx += 1
  67. if classes:
  68. attributes["class"] = " ".join(classes)
  69. return attributes
  70. def __str__(self) -> str:
  71. return str(self._tokens)
  72. def __repr__(self) -> str:
  73. return repr(self._tokens)
  74. class ParseError(Exception):
  75. def __init__(self, msg: str, pos: int) -> None:
  76. self.pos = pos
  77. super().__init__(msg + f" at position {pos}")
  78. def parse(string: str) -> tuple[int, dict[str, str]]:
  79. """Parse attributes from start of string.
  80. :returns: (length of parsed string, dict of attributes)
  81. """
  82. pos = 0
  83. state: State = State.START
  84. tokens = TokenState()
  85. while pos < len(string):
  86. state = HANDLERS[state](string[pos], pos, tokens)
  87. if state == State.DONE:
  88. return pos, tokens.compile(string)
  89. pos = pos + 1
  90. return pos, tokens.compile(string)
  91. def handle_start(char: str, pos: int, tokens: TokenState) -> State:
  92. if char == "{":
  93. return State.SCANNING
  94. raise ParseError("Attributes must start with '{'", pos)
  95. def handle_scanning(char: str, pos: int, tokens: TokenState) -> State:
  96. if char == " " or char == "\t" or char == "\n" or char == "\r":
  97. return State.SCANNING
  98. if char == "}":
  99. return State.DONE
  100. if char == "#":
  101. tokens.set_start(pos)
  102. return State.SCANNING_ID
  103. if char == "%":
  104. tokens.set_start(pos)
  105. return State.SCANNING_COMMENT
  106. if char == ".":
  107. tokens.set_start(pos)
  108. return State.SCANNING_CLASS
  109. if REGEX_KEY_CHARACTERS.fullmatch(char):
  110. tokens.set_start(pos)
  111. return State.SCANNING_KEY
  112. raise ParseError(f"Unexpected character whilst scanning: {char}", pos)
  113. def handle_scanning_comment(char: str, pos: int, tokens: TokenState) -> State:
  114. if char == "%":
  115. return State.SCANNING
  116. return State.SCANNING_COMMENT
  117. def handle_scanning_id(char: str, pos: int, tokens: TokenState) -> State:
  118. if not REGEX_SPACE_PUNCTUATION.fullmatch(char):
  119. return State.SCANNING_ID
  120. if char == "}":
  121. if (pos - 1) > tokens.start:
  122. tokens.append(tokens.start + 1, pos, "id")
  123. return State.DONE
  124. if REGEX_SPACE.fullmatch(char):
  125. if (pos - 1) > tokens.start:
  126. tokens.append(tokens.start + 1, pos, "id")
  127. return State.SCANNING
  128. raise ParseError(f"Unexpected character whilst scanning id: {char}", pos)
  129. def handle_scanning_class(char: str, pos: int, tokens: TokenState) -> State:
  130. if not REGEX_SPACE_PUNCTUATION.fullmatch(char):
  131. return State.SCANNING_CLASS
  132. if char == "}":
  133. if (pos - 1) > tokens.start:
  134. tokens.append(tokens.start + 1, pos, "class")
  135. return State.DONE
  136. if REGEX_SPACE.fullmatch(char):
  137. if (pos - 1) > tokens.start:
  138. tokens.append(tokens.start + 1, pos, "class")
  139. return State.SCANNING
  140. raise ParseError(f"Unexpected character whilst scanning class: {char}", pos)
  141. def handle_scanning_key(char: str, pos: int, tokens: TokenState) -> State:
  142. if char == "=":
  143. tokens.append(tokens.start, pos, "key")
  144. return State.SCANNING_VALUE
  145. if REGEX_KEY_CHARACTERS.fullmatch(char):
  146. return State.SCANNING_KEY
  147. raise ParseError(f"Unexpected character whilst scanning key: {char}", pos)
  148. def handle_scanning_value(char: str, pos: int, tokens: TokenState) -> State:
  149. if char == '"':
  150. tokens.set_start(pos)
  151. return State.SCANNING_QUOTED_VALUE
  152. if REGEX_KEY_CHARACTERS.fullmatch(char):
  153. tokens.set_start(pos)
  154. return State.SCANNING_BARE_VALUE
  155. raise ParseError(f"Unexpected character whilst scanning value: {char}", pos)
  156. def handle_scanning_bare_value(char: str, pos: int, tokens: TokenState) -> State:
  157. if REGEX_KEY_CHARACTERS.fullmatch(char):
  158. return State.SCANNING_BARE_VALUE
  159. if char == "}":
  160. tokens.append(tokens.start, pos, "value")
  161. return State.DONE
  162. if REGEX_SPACE.fullmatch(char):
  163. tokens.append(tokens.start, pos, "value")
  164. return State.SCANNING
  165. raise ParseError(f"Unexpected character whilst scanning bare value: {char}", pos)
  166. def handle_scanning_escaped(char: str, pos: int, tokens: TokenState) -> State:
  167. return State.SCANNING_QUOTED_VALUE
  168. def handle_scanning_quoted_value(char: str, pos: int, tokens: TokenState) -> State:
  169. if char == '"':
  170. tokens.append(tokens.start + 1, pos, "value")
  171. return State.SCANNING
  172. if char == "\\":
  173. return State.SCANNING_ESCAPED
  174. if char == "{" or char == "}":
  175. raise ParseError(
  176. f"Unexpected character whilst scanning quoted value: {char}", pos
  177. )
  178. if char == "\n":
  179. tokens.append(tokens.start + 1, pos, "value")
  180. return State.SCANNING_QUOTED_VALUE
  181. return State.SCANNING_QUOTED_VALUE
  182. HANDLERS: dict[State, Callable[[str, int, TokenState], State]] = {
  183. State.START: handle_start,
  184. State.SCANNING: handle_scanning,
  185. State.SCANNING_COMMENT: handle_scanning_comment,
  186. State.SCANNING_ID: handle_scanning_id,
  187. State.SCANNING_CLASS: handle_scanning_class,
  188. State.SCANNING_KEY: handle_scanning_key,
  189. State.SCANNING_VALUE: handle_scanning_value,
  190. State.SCANNING_BARE_VALUE: handle_scanning_bare_value,
  191. State.SCANNING_QUOTED_VALUE: handle_scanning_quoted_value,
  192. State.SCANNING_ESCAPED: handle_scanning_escaped,
  193. }