tree.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. """A tree representation of a linear markdown-it token stream.
  2. This module is not part of upstream JavaScript markdown-it.
  3. """
  4. from __future__ import annotations
  5. from collections.abc import Generator, Sequence
  6. import textwrap
  7. from typing import Any, NamedTuple, TypeVar, overload
  8. from .token import Token
  9. class _NesterTokens(NamedTuple):
  10. opening: Token
  11. closing: Token
  12. _NodeType = TypeVar("_NodeType", bound="SyntaxTreeNode")
  13. class SyntaxTreeNode:
  14. """A Markdown syntax tree node.
  15. A class that can be used to construct a tree representation of a linear
  16. `markdown-it-py` token stream.
  17. Each node in the tree represents either:
  18. - root of the Markdown document
  19. - a single unnested `Token`
  20. - a `Token` "_open" and "_close" token pair, and the tokens nested in
  21. between
  22. """
  23. def __init__(
  24. self, tokens: Sequence[Token] = (), *, create_root: bool = True
  25. ) -> None:
  26. """Initialize a `SyntaxTreeNode` from a token stream.
  27. If `create_root` is True, create a root node for the document.
  28. """
  29. # Only nodes representing an unnested token have self.token
  30. self.token: Token | None = None
  31. # Only containers have nester tokens
  32. self.nester_tokens: _NesterTokens | None = None
  33. # Root node does not have self.parent
  34. self._parent: Any = None
  35. # Empty list unless a non-empty container, or unnested token that has
  36. # children (i.e. inline or img)
  37. self._children: list[Any] = []
  38. if create_root:
  39. self._set_children_from_tokens(tokens)
  40. return
  41. if not tokens:
  42. raise ValueError(
  43. "Can only create root from empty token sequence."
  44. " Set `create_root=True`."
  45. )
  46. elif len(tokens) == 1:
  47. inline_token = tokens[0]
  48. if inline_token.nesting:
  49. raise ValueError(
  50. "Unequal nesting level at the start and end of token stream."
  51. )
  52. self.token = inline_token
  53. if inline_token.children:
  54. self._set_children_from_tokens(inline_token.children)
  55. else:
  56. self.nester_tokens = _NesterTokens(tokens[0], tokens[-1])
  57. self._set_children_from_tokens(tokens[1:-1])
  58. def __repr__(self) -> str:
  59. return f"{type(self).__name__}({self.type})"
  60. @overload
  61. def __getitem__(self: _NodeType, item: int) -> _NodeType:
  62. ...
  63. @overload
  64. def __getitem__(self: _NodeType, item: slice) -> list[_NodeType]:
  65. ...
  66. def __getitem__(self: _NodeType, item: int | slice) -> _NodeType | list[_NodeType]:
  67. return self.children[item]
  68. def to_tokens(self: _NodeType) -> list[Token]:
  69. """Recover the linear token stream."""
  70. def recursive_collect_tokens(node: _NodeType, token_list: list[Token]) -> None:
  71. if node.type == "root":
  72. for child in node.children:
  73. recursive_collect_tokens(child, token_list)
  74. elif node.token:
  75. token_list.append(node.token)
  76. else:
  77. assert node.nester_tokens
  78. token_list.append(node.nester_tokens.opening)
  79. for child in node.children:
  80. recursive_collect_tokens(child, token_list)
  81. token_list.append(node.nester_tokens.closing)
  82. tokens: list[Token] = []
  83. recursive_collect_tokens(self, tokens)
  84. return tokens
  85. @property
  86. def children(self: _NodeType) -> list[_NodeType]:
  87. return self._children
  88. @children.setter
  89. def children(self: _NodeType, value: list[_NodeType]) -> None:
  90. self._children = value
  91. @property
  92. def parent(self: _NodeType) -> _NodeType | None:
  93. return self._parent # type: ignore
  94. @parent.setter
  95. def parent(self: _NodeType, value: _NodeType | None) -> None:
  96. self._parent = value
  97. @property
  98. def is_root(self) -> bool:
  99. """Is the node a special root node?"""
  100. return not (self.token or self.nester_tokens)
  101. @property
  102. def is_nested(self) -> bool:
  103. """Is this node nested?.
  104. Returns `True` if the node represents a `Token` pair and tokens in the
  105. sequence between them, where `Token.nesting` of the first `Token` in
  106. the pair is 1 and nesting of the other `Token` is -1.
  107. """
  108. return bool(self.nester_tokens)
  109. @property
  110. def siblings(self: _NodeType) -> Sequence[_NodeType]:
  111. """Get siblings of the node.
  112. Gets the whole group of siblings, including self.
  113. """
  114. if not self.parent:
  115. return [self]
  116. return self.parent.children
  117. @property
  118. def type(self) -> str:
  119. """Get a string type of the represented syntax.
  120. - "root" for root nodes
  121. - `Token.type` if the node represents an unnested token
  122. - `Token.type` of the opening token, with "_open" suffix stripped, if
  123. the node represents a nester token pair
  124. """
  125. if self.is_root:
  126. return "root"
  127. if self.token:
  128. return self.token.type
  129. assert self.nester_tokens
  130. return _removesuffix(self.nester_tokens.opening.type, "_open")
  131. @property
  132. def next_sibling(self: _NodeType) -> _NodeType | None:
  133. """Get the next node in the sequence of siblings.
  134. Returns `None` if this is the last sibling.
  135. """
  136. self_index = self.siblings.index(self)
  137. if self_index + 1 < len(self.siblings):
  138. return self.siblings[self_index + 1]
  139. return None
  140. @property
  141. def previous_sibling(self: _NodeType) -> _NodeType | None:
  142. """Get the previous node in the sequence of siblings.
  143. Returns `None` if this is the first sibling.
  144. """
  145. self_index = self.siblings.index(self)
  146. if self_index - 1 >= 0:
  147. return self.siblings[self_index - 1]
  148. return None
  149. def _add_child(
  150. self,
  151. tokens: Sequence[Token],
  152. ) -> None:
  153. """Make a child node for `self`."""
  154. child = type(self)(tokens, create_root=False)
  155. child.parent = self
  156. self.children.append(child)
  157. def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
  158. """Convert the token stream to a tree structure and set the resulting
  159. nodes as children of `self`."""
  160. reversed_tokens = list(reversed(tokens))
  161. while reversed_tokens:
  162. token = reversed_tokens.pop()
  163. if not token.nesting:
  164. self._add_child([token])
  165. continue
  166. if token.nesting != 1:
  167. raise ValueError("Invalid token nesting")
  168. nested_tokens = [token]
  169. nesting = 1
  170. while reversed_tokens and nesting:
  171. token = reversed_tokens.pop()
  172. nested_tokens.append(token)
  173. nesting += token.nesting
  174. if nesting:
  175. raise ValueError(f"unclosed tokens starting {nested_tokens[0]}")
  176. self._add_child(nested_tokens)
  177. def pretty(
  178. self, *, indent: int = 2, show_text: bool = False, _current: int = 0
  179. ) -> str:
  180. """Create an XML style string of the tree."""
  181. prefix = " " * _current
  182. text = prefix + f"<{self.type}"
  183. if not self.is_root and self.attrs:
  184. text += " " + " ".join(f"{k}={v!r}" for k, v in self.attrs.items())
  185. text += ">"
  186. if (
  187. show_text
  188. and not self.is_root
  189. and self.type in ("text", "text_special")
  190. and self.content
  191. ):
  192. text += "\n" + textwrap.indent(self.content, prefix + " " * indent)
  193. for child in self.children:
  194. text += "\n" + child.pretty(
  195. indent=indent, show_text=show_text, _current=_current + indent
  196. )
  197. return text
  198. def walk(
  199. self: _NodeType, *, include_self: bool = True
  200. ) -> Generator[_NodeType, None, None]:
  201. """Recursively yield all descendant nodes in the tree starting at self.
  202. The order mimics the order of the underlying linear token
  203. stream (i.e. depth first).
  204. """
  205. if include_self:
  206. yield self
  207. for child in self.children:
  208. yield from child.walk(include_self=True)
  209. # NOTE:
  210. # The values of the properties defined below directly map to properties
  211. # of the underlying `Token`s. A root node does not translate to a `Token`
  212. # object, so calling these property getters on a root node will raise an
  213. # `AttributeError`.
  214. #
  215. # There is no mapping for `Token.nesting` because the `is_nested` property
  216. # provides that data, and can be called on any node type, including root.
  217. def _attribute_token(self) -> Token:
  218. """Return the `Token` that is used as the data source for the
  219. properties defined below."""
  220. if self.token:
  221. return self.token
  222. if self.nester_tokens:
  223. return self.nester_tokens.opening
  224. raise AttributeError("Root node does not have the accessed attribute")
  225. @property
  226. def tag(self) -> str:
  227. """html tag name, e.g. \"p\" """
  228. return self._attribute_token().tag
  229. @property
  230. def attrs(self) -> dict[str, str | int | float]:
  231. """Html attributes."""
  232. return self._attribute_token().attrs
  233. def attrGet(self, name: str) -> None | str | int | float:
  234. """Get the value of attribute `name`, or null if it does not exist."""
  235. return self._attribute_token().attrGet(name)
  236. @property
  237. def map(self) -> tuple[int, int] | None:
  238. """Source map info. Format: `tuple[ line_begin, line_end ]`"""
  239. map_ = self._attribute_token().map
  240. if map_:
  241. # Type ignore because `Token`s attribute types are not perfect
  242. return tuple(map_) # type: ignore
  243. return None
  244. @property
  245. def level(self) -> int:
  246. """nesting level, the same as `state.level`"""
  247. return self._attribute_token().level
  248. @property
  249. def content(self) -> str:
  250. """In a case of self-closing tag (code, html, fence, etc.), it
  251. has contents of this tag."""
  252. return self._attribute_token().content
  253. @property
  254. def markup(self) -> str:
  255. """'*' or '_' for emphasis, fence string for fence, etc."""
  256. return self._attribute_token().markup
  257. @property
  258. def info(self) -> str:
  259. """fence infostring"""
  260. return self._attribute_token().info
  261. @property
  262. def meta(self) -> dict[Any, Any]:
  263. """A place for plugins to store an arbitrary data."""
  264. return self._attribute_token().meta
  265. @property
  266. def block(self) -> bool:
  267. """True for block-level tokens, false for inline tokens."""
  268. return self._attribute_token().block
  269. @property
  270. def hidden(self) -> bool:
  271. """If it's true, ignore this element when rendering.
  272. Used for tight lists to hide paragraphs."""
  273. return self._attribute_token().hidden
  274. def _removesuffix(string: str, suffix: str) -> str:
  275. """Remove a suffix from a string.
  276. Replace this with str.removesuffix() from stdlib when minimum Python
  277. version is 3.9.
  278. """
  279. if suffix and string.endswith(suffix):
  280. return string[: -len(suffix)]
  281. return string