ucre.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. from uc_micro.categories import Cc, Cf, P, Z
  2. from uc_micro.properties import Any
  3. SRC_ANY = Any.REGEX
  4. SRC_CC = Cc.REGEX
  5. SRC_CF = Cf.REGEX
  6. SRC_P = P.REGEX
  7. SRC_Z = Z.REGEX
  8. # \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation)
  9. SRC_ZPCC = "|".join([SRC_Z, SRC_P, SRC_CC])
  10. # \p{\Z\Cc} (white spaces + control)
  11. SRC_ZCC = "|".join([SRC_Z, SRC_CC])
  12. # Experimental. List of chars, completely prohibited in links
  13. # because can separate it from other part of text
  14. TEXT_SEPARATORS = "[><\uff5c]"
  15. # All possible word characters (everything without punctuation, spaces & controls)
  16. # Defined via punctuation & spaces to save space
  17. # Should be something like \p{\L\N\S\M} (\w but without `_`)
  18. SRC_PSEUDO_LETTER = "(?:(?!" + TEXT_SEPARATORS + "|" + SRC_ZPCC + ")" + SRC_ANY + ")"
  19. # The same as abothe but without [0-9]
  20. # var SRC_PSEUDO_LETTER_non_d = '(?:(?![0-9]|' + SRC_ZPCC + ')' + SRC_ANY + ')'
  21. # =============================================================================
  22. SRC_IP4 = (
  23. "(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|"
  24. + "2[0-4][0-9]|[01]?[0-9][0-9]?)"
  25. )
  26. # Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch.
  27. SRC_AUTH = "(?:(?:(?!" + SRC_ZCC + "|[@/\\[\\]()]).)+@)?"
  28. SRC_PORT = (
  29. "(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?"
  30. )
  31. # Allow anything in markdown spec, forbid quote (") at the first position
  32. # because emails enclosed in quotes are far more common
  33. SRC_EMAIL_NAME = '[\\-:&=\\+\\$,\\.a-zA-Z0-9_][\\-:&=\\+\\$,\\"\\.a-zA-Z0-9_]*'
  34. SRC_XN = "xn--[a-z0-9\\-]{1,59}"
  35. # More to read about domain names
  36. # http:#serverfault.com/questions/638260/
  37. # Allow letters & digits (http:#test1)
  38. SRC_DOMAIN_ROOT = "(?:" + SRC_XN + "|" + SRC_PSEUDO_LETTER + "{1,63}" + ")"
  39. SRC_DOMAIN = (
  40. "(?:"
  41. + SRC_XN
  42. + "|"
  43. + "(?:"
  44. + SRC_PSEUDO_LETTER
  45. + ")"
  46. + "|"
  47. + "(?:"
  48. + SRC_PSEUDO_LETTER
  49. + "(?:-|"
  50. + SRC_PSEUDO_LETTER
  51. + "){0,61}"
  52. + SRC_PSEUDO_LETTER
  53. + ")"
  54. + ")"
  55. )
  56. SRC_HOST = (
  57. "(?:"
  58. +
  59. # Don't need IP check, because digits are already allowed in normal domain names
  60. # SRC_IP4 +
  61. # '|' +
  62. "(?:(?:(?:"
  63. + SRC_DOMAIN
  64. + ")\\.)*"
  65. + SRC_DOMAIN # _root
  66. + ")"
  67. + ")"
  68. )
  69. TPL_HOST_FUZZY = (
  70. "(?:" + SRC_IP4 + "|" + "(?:(?:(?:" + SRC_DOMAIN + ")\\.)+(?:%TLDS%))" + ")"
  71. )
  72. TPL_HOST_NO_IP_FUZZY = "(?:(?:(?:" + SRC_DOMAIN + ")\\.)+(?:%TLDS%))"
  73. # =============================================================================
  74. # Rude test fuzzy links by host, for quick deny
  75. TPL_HOST_FUZZY_TEST = (
  76. "localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:" + SRC_ZPCC + "|>|$))"
  77. )
  78. def _re_host_terminator(opts):
  79. src_host_terminator = (
  80. "(?=$|"
  81. + TEXT_SEPARATORS
  82. + "|"
  83. + SRC_ZPCC
  84. + ")"
  85. + "(?!"
  86. + ("-(?!--)|" if opts.get("---") else "-|")
  87. + "_|:\\d|\\.-|\\.(?!$|"
  88. + SRC_ZPCC
  89. + "))"
  90. )
  91. return src_host_terminator
  92. def _re_src_path(opts):
  93. src_path = (
  94. "(?:"
  95. + "[/?#]"
  96. + "(?:"
  97. + "(?!"
  98. + SRC_ZCC
  99. + "|"
  100. + TEXT_SEPARATORS
  101. + "|[()[\\]{}.,\"'?!\\-;]).|"
  102. + "\\[(?:(?!"
  103. + SRC_ZCC
  104. + "|\\]).)*\\]|"
  105. + "\\((?:(?!"
  106. + SRC_ZCC
  107. + "|[)]).)*\\)|"
  108. + "\\{(?:(?!"
  109. + SRC_ZCC
  110. + "|[}]).)*\\}|"
  111. + '\\"(?:(?!'
  112. + SRC_ZCC
  113. + '|["]).)+\\"|'
  114. + "\\'(?:(?!"
  115. + SRC_ZCC
  116. + "|[']).)+\\'|"
  117. + "\\'(?="
  118. + SRC_PSEUDO_LETTER
  119. + "|[-])|"
  120. + "\\.{2,}[a-zA-Z0-9%/&]|"
  121. # google has many dots in "google search" links (#66, #81).
  122. # github has ... in commit range links,
  123. # ReSTRICT to
  124. # - english
  125. # - percent-encoded
  126. # - parts of file path
  127. # - params separator
  128. # until more examples found.
  129. + "\\.(?!"
  130. + SRC_ZCC
  131. + "|[.]|$)|"
  132. + ("\\-(?!--(?:[^-]|$))(?:-*)|" if opts.get("---") else "\\-+|")
  133. + ",(?!"
  134. + SRC_ZCC
  135. + "|$)|" # allow `,,,` in paths
  136. + ";(?!"
  137. + SRC_ZCC
  138. + "|$)|" # allow `,,,` in paths
  139. + "\\!+(?!"
  140. + SRC_ZCC
  141. + "|[!]|$)|" # allow `!!!` in paths, but not at the end
  142. + "\\?(?!"
  143. + SRC_ZCC
  144. + "|[?]|$)"
  145. + ")+"
  146. + "|\\/"
  147. + ")?"
  148. )
  149. return src_path
  150. def build_re(opts):
  151. """Build regex
  152. Args:
  153. opts (dict): options
  154. Return:
  155. dict: dict of regex string
  156. """
  157. SRC_HOST_STRICT = SRC_HOST + _re_host_terminator(opts)
  158. TPL_HOST_FUZZY_STRICT = TPL_HOST_FUZZY + _re_host_terminator(opts)
  159. SRC_HOST_PORT_STRICT = SRC_HOST + SRC_PORT + _re_host_terminator(opts)
  160. TPL_HOST_PORT_FUZZY_STRICT = TPL_HOST_FUZZY + SRC_PORT + _re_host_terminator(opts)
  161. TPL_HOST_PORT_NO_IP_FUZZY_STRICT = (
  162. TPL_HOST_NO_IP_FUZZY + SRC_PORT + _re_host_terminator(opts)
  163. )
  164. TPL_EMAIL_FUZZY = (
  165. "(^|"
  166. + TEXT_SEPARATORS
  167. + '|"|\\(|'
  168. + SRC_ZCC
  169. + ")"
  170. + "("
  171. + SRC_EMAIL_NAME
  172. + "@"
  173. + TPL_HOST_FUZZY_STRICT
  174. + ")"
  175. )
  176. regex = {
  177. "src_Any": SRC_ANY,
  178. "src_Cc": SRC_CC,
  179. "src_Cf": SRC_CF,
  180. "src_Z": SRC_Z,
  181. "src_P": SRC_P,
  182. "src_ZPCc": SRC_ZPCC,
  183. "src_ZCc": SRC_ZCC,
  184. "src_pseudo_letter": SRC_PSEUDO_LETTER,
  185. "src_ip4": SRC_IP4,
  186. "src_auth": SRC_AUTH,
  187. "src_port": SRC_PORT,
  188. "src_host_terminator": _re_host_terminator(opts),
  189. "src_path": _re_src_path(opts),
  190. "src_email_name": SRC_EMAIL_NAME,
  191. "src_xn": SRC_XN,
  192. "src_domain_root": SRC_DOMAIN_ROOT,
  193. "src_domain": SRC_DOMAIN,
  194. "src_host": SRC_HOST,
  195. "tpl_host_fuzzy": TPL_HOST_FUZZY,
  196. "tpl_host_no_ip_fuzzy": TPL_HOST_NO_IP_FUZZY,
  197. "src_host_strict": SRC_HOST_STRICT,
  198. "tpl_host_fuzzy_strict": TPL_HOST_FUZZY_STRICT,
  199. "src_host_port_strict": SRC_HOST_PORT_STRICT,
  200. "tpl_host_port_fuzzy_strict": TPL_HOST_PORT_FUZZY_STRICT,
  201. "tpl_host_port_no_ip_fuzzy_strict": TPL_HOST_PORT_FUZZY_STRICT,
  202. # Main rules
  203. "tpl_host_fuzzy_test": TPL_HOST_FUZZY_TEST,
  204. "tpl_email_fuzzy": TPL_EMAIL_FUZZY,
  205. # Fuzzy link can't be prepended with .:/\- and non punctuation.
  206. # but can start with > (markdown blockquote)
  207. "tpl_link_fuzzy": (
  208. "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|"
  209. + SRC_ZPCC
  210. + "))"
  211. + "((?![$+<=>^`|\uff5c])"
  212. + TPL_HOST_PORT_FUZZY_STRICT
  213. + _re_src_path(opts)
  214. + ")"
  215. ),
  216. # Fuzzy link can't be prepended with .:/\- and non punctuation.
  217. # but can start with > (markdown blockquote)
  218. "tpl_link_no_ip_fuzzy": (
  219. "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|"
  220. + SRC_ZPCC
  221. + "))"
  222. + "((?![$+<=>^`|\uff5c])"
  223. + TPL_HOST_PORT_NO_IP_FUZZY_STRICT
  224. + _re_src_path(opts)
  225. + ")"
  226. ),
  227. }
  228. return regex