utils.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. """Utilities for parsing source text
  2. """
  3. from __future__ import annotations
  4. import re
  5. from typing import Match, TypeVar
  6. from .entities import entities
  7. def charCodeAt(src: str, pos: int) -> int | None:
  8. """
  9. Returns the Unicode value of the character at the specified location.
  10. @param - index The zero-based index of the desired character.
  11. If there is no character at the specified index, NaN is returned.
  12. This was added for compatibility with python
  13. """
  14. try:
  15. return ord(src[pos])
  16. except IndexError:
  17. return None
  18. def charStrAt(src: str, pos: int) -> str | None:
  19. """
  20. Returns the Unicode value of the character at the specified location.
  21. @param - index The zero-based index of the desired character.
  22. If there is no character at the specified index, NaN is returned.
  23. This was added for compatibility with python
  24. """
  25. try:
  26. return src[pos]
  27. except IndexError:
  28. return None
  29. _ItemTV = TypeVar("_ItemTV")
  30. def arrayReplaceAt(
  31. src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
  32. ) -> list[_ItemTV]:
  33. """
  34. Remove element from array and put another array at those position.
  35. Useful for some operations with tokens
  36. """
  37. return src[:pos] + newElements + src[pos + 1 :]
  38. def isValidEntityCode(c: int) -> bool:
  39. # broken sequence
  40. if c >= 0xD800 and c <= 0xDFFF:
  41. return False
  42. # never used
  43. if c >= 0xFDD0 and c <= 0xFDEF:
  44. return False
  45. if ((c & 0xFFFF) == 0xFFFF) or ((c & 0xFFFF) == 0xFFFE):
  46. return False
  47. # control codes
  48. if c >= 0x00 and c <= 0x08:
  49. return False
  50. if c == 0x0B:
  51. return False
  52. if c >= 0x0E and c <= 0x1F:
  53. return False
  54. if c >= 0x7F and c <= 0x9F:
  55. return False
  56. # out of range
  57. if c > 0x10FFFF:
  58. return False
  59. return True
  60. def fromCodePoint(c: int) -> str:
  61. """Convert ordinal to unicode.
  62. Note, in the original Javascript two string characters were required,
  63. for codepoints larger than `0xFFFF`.
  64. But Python 3 can represent any unicode codepoint in one character.
  65. """
  66. return chr(c)
  67. # UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
  68. # ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Two alternatives, matched case-insensitively:
#   group 1 - the punctuation character following a backslash escape
#   group 2 - the text between "&" and ";" of an entity-like sequence
#             (a letter or "#", then 1-31 letters/digits)
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Decimal numeric reference body such as "#35"; group 1 holds the 1-8 digits.
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
# Hexadecimal numeric reference body such as "#x22" (case-insensitive);
# group 1 holds the 1-8 hex digits.
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)
  75. def replaceEntityPattern(match: str, name: str) -> str:
  76. """Convert HTML entity patterns,
  77. see https://spec.commonmark.org/0.30/#entity-references
  78. """
  79. if name in entities:
  80. return entities[name]
  81. code: None | int = None
  82. if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
  83. code = int(pat.group(1), 10)
  84. elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
  85. code = int(pat.group(1), 16)
  86. if code is not None and isValidEntityCode(code):
  87. return fromCodePoint(code)
  88. return match
  89. def unescapeAll(string: str) -> str:
  90. def replacer_func(match: Match[str]) -> str:
  91. escaped = match.group(1)
  92. if escaped:
  93. return escaped
  94. entity = match.group(2)
  95. return replaceEntityPattern(match.group(), entity)
  96. if "\\" not in string and "&" not in string:
  97. return string
  98. return UNESCAPE_ALL_RE.sub(replacer_func, string)
  99. ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
  100. ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")
  101. def stripEscape(string: str) -> str:
  102. """Strip escape \\ characters"""
  103. return ESCAPE_CHAR.sub(r"\1", string)
  104. def escapeHtml(raw: str) -> str:
  105. """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences."""
  106. # like html.escape, but without escaping single quotes
  107. raw = raw.replace("&", "&amp;") # Must be done first!
  108. raw = raw.replace("<", "&lt;")
  109. raw = raw.replace(">", "&gt;")
  110. raw = raw.replace('"', "&quot;")
  111. return raw
  112. # //////////////////////////////////////////////////////////////////////////////
  113. REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")
  114. def escapeRE(string: str) -> str:
  115. string = REGEXP_ESCAPE_RE.sub("\\$&", string)
  116. return string
  117. # //////////////////////////////////////////////////////////////////////////////
  118. def isSpace(code: int | None) -> bool:
  119. """Check if character code is a whitespace."""
  120. return code in (0x09, 0x20)
  121. def isStrSpace(ch: str | None) -> bool:
  122. """Check if character is a whitespace."""
  123. return ch in ("\t", " ")
  124. MD_WHITESPACE = {
  125. 0x09, # \t
  126. 0x0A, # \n
  127. 0x0B, # \v
  128. 0x0C, # \f
  129. 0x0D, # \r
  130. 0x20, # space
  131. 0xA0,
  132. 0x1680,
  133. 0x202F,
  134. 0x205F,
  135. 0x3000,
  136. }
  137. def isWhiteSpace(code: int) -> bool:
  138. r"""Zs (unicode class) || [\t\f\v\r\n]"""
  139. if code >= 0x2000 and code <= 0x200A:
  140. return True
  141. return code in MD_WHITESPACE
  142. # //////////////////////////////////////////////////////////////////////////////
  143. UNICODE_PUNCT_RE = re.compile(
  144. r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]" # noqa: E501
  145. )
  146. # Currently without astral characters support.
  147. def isPunctChar(ch: str) -> bool:
  148. """Check if character is a punctuation character."""
  149. return UNICODE_PUNCT_RE.search(ch) is not None
  150. MD_ASCII_PUNCT = {
  151. 0x21, # /* ! */
  152. 0x22, # /* " */
  153. 0x23, # /* # */
  154. 0x24, # /* $ */
  155. 0x25, # /* % */
  156. 0x26, # /* & */
  157. 0x27, # /* ' */
  158. 0x28, # /* ( */
  159. 0x29, # /* ) */
  160. 0x2A, # /* * */
  161. 0x2B, # /* + */
  162. 0x2C, # /* , */
  163. 0x2D, # /* - */
  164. 0x2E, # /* . */
  165. 0x2F, # /* / */
  166. 0x3A, # /* : */
  167. 0x3B, # /* ; */
  168. 0x3C, # /* < */
  169. 0x3D, # /* = */
  170. 0x3E, # /* > */
  171. 0x3F, # /* ? */
  172. 0x40, # /* @ */
  173. 0x5B, # /* [ */
  174. 0x5C, # /* \ */
  175. 0x5D, # /* ] */
  176. 0x5E, # /* ^ */
  177. 0x5F, # /* _ */
  178. 0x60, # /* ` */
  179. 0x7B, # /* { */
  180. 0x7C, # /* | */
  181. 0x7D, # /* } */
  182. 0x7E, # /* ~ */
  183. }
  184. def isMdAsciiPunct(ch: int) -> bool:
  185. """Markdown ASCII punctuation characters.
  186. ::
  187. !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~
  188. See http://spec.commonmark.org/0.15/#ascii-punctuation-character
  189. Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.
  190. """ # noqa: E501
  191. return ch in MD_ASCII_PUNCT
  192. def normalizeReference(string: str) -> str:
  193. """Helper to unify [reference labels]."""
  194. # Trim and collapse whitespace
  195. #
  196. string = re.sub(r"\s+", " ", string.strip())
  197. # In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug
  198. # fixed in v12 (couldn't find any details).
  199. #
  200. # So treat this one as a special case
  201. # (remove this when node v10 is no longer supported).
  202. #
  203. # if ('ẞ'.toLowerCase() === 'Ṿ') {
  204. # str = str.replace(/ẞ/g, 'ß')
  205. # }
  206. # .toLowerCase().toUpperCase() should get rid of all differences
  207. # between letter variants.
  208. #
  209. # Simple .toLowerCase() doesn't normalize 125 code points correctly,
  210. # and .toUpperCase doesn't normalize 6 of them (list of exceptions:
  211. # İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
  212. # uppercased versions).
  213. #
  214. # Here's an example showing how it happens. Lets take greek letter omega:
  215. # uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ)
  216. #
  217. # Unicode entries:
  218. # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
  219. # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
  220. # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
  221. # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
  222. #
  223. # Case-insensitive comparison should treat all of them as equivalent.
  224. #
  225. # But .toLowerCase() doesn't change ϑ (it's already lowercase),
  226. # and .toUpperCase() doesn't change ϴ (already uppercase).
  227. #
  228. # Applying first lower then upper case normalizes any character:
  229. # '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
  230. #
  231. # Note: this is equivalent to unicode case folding; unicode normalization
  232. # is a different step that is not required here.
  233. #
  234. # Final result should be uppercased, because it's later stored in an object
  235. # (this avoid a conflict with Object.prototype members,
  236. # most notably, `__proto__`)
  237. #
  238. return string.lower().upper()
  239. LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
  240. LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)
  241. def isLinkOpen(string: str) -> bool:
  242. return bool(LINK_OPEN_RE.search(string))
  243. def isLinkClose(string: str) -> bool:
  244. return bool(LINK_CLOSE_RE.search(string))