import copy
import re
import types

from .ucre import build_re

# py>=37: re.Pattern, else: _sre.SRE_Pattern
RE_TYPE = type(re.compile(r""))


def _escape_re(string):
    return re.sub(r"([.?*+^$[\]\\(){}|-])", r"\\\1", string)


def _index_of(text, search_value):
    try:
        result = text.index(search_value)
    except ValueError:
        result = -1
    return result


class SchemaError(Exception):
    """Linkify schema error"""

    def __init__(self, name, val):
        message = "(LinkifyIt) Invalid schema '{}': '{}'".format(name, val)
        super().__init__(message)


class Match:
    """Match result.

    Attributes:
        schema (str): Prefix (protocol) for matched string.
        index (int): First position of matched string.
        last_index (int): Next position after matched string.
        raw (str): Matched string.
        text (str): Normalized text of matched string.
        url (str): Normalized url of matched string.

    Args:
        linkifyit (:class:`linkify_it.main.LinkifyIt`): LinkifyIt object
        shift (int): text search position

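    Example (illustrative sketch; instances are created by
    :meth:`linkify_it.main.LinkifyIt.match`, not constructed directly)::

        from linkify_it.main import LinkifyIt

        found = LinkifyIt().match("See https://github.com for details.")[0]
        # found.schema                       -> "https:"
        # found.raw / found.text / found.url -> "https://github.com"
        # found.index and found.last_index bound the raw match inside the text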
    """

    def __repr__(self):
        return "{}.{}({!r})".format(
            self.__class__.__module__, self.__class__.__name__, self.__dict__
        )

    def __init__(self, linkifyit, shift):
        start = linkifyit._index
        end = linkifyit._last_index
        text = linkifyit._text_cache[start:end]

        self.schema = linkifyit._schema.lower()
        self.index = start + shift
        self.last_index = end + shift
        self.raw = text
        self.text = text
        self.url = text


class LinkifyIt:
    """Creates a new linkifier instance with optional additional schemas.

    By default understands:

    - ``http(s)://...``, ``ftp://...``, ``mailto:...`` & ``//...`` links
    - "fuzzy" links and emails (example.com, foo@bar.com).

    ``schemas`` is a dict where each key/value pair describes protocol/rule:

    - **key** - link prefix (usually a protocol name with ``:`` at the end,
      ``skype:`` for example). `linkify-it` makes sure that the prefix is not
      preceded by an alphanumeric char; only whitespace and punctuation are allowed.
    - **value** - rule to check the tail after the link prefix

      - *str* - just an alias to an existing rule
      - *dict*

        - *validate* - either a ``re.Pattern``, a regex ``str`` (starting with
          ``^`` and not including the link prefix itself), or a validator
          ``function`` which, given arguments *self*, *text* and *pos*, returns
          the length of a match in *text* starting at index *pos*. *pos* is the
          index right after the link prefix.
        - *normalize* - optional function to normalize the text & url of the
          matched result (for example, for @twitter mentions).

    ``options`` is a dict:

    - **fuzzy_link** - recognize URLs without the ``http(s):`` prefix.
      Default ``True``.
    - **fuzzy_ip** - allow IPs in fuzzy links above. Can conflict with some
      texts like version numbers. Default ``False``.
    - **fuzzy_email** - recognize emails without the ``mailto:`` prefix.
    - **---** - set ``True`` to terminate links on ``---`` (if it's considered
      a long dash).

    Args:
        schemas (dict): Optional. Additional schemas to validate (prefix/validator)
        options (dict): { fuzzy_link | fuzzy_email | fuzzy_ip: True | False }.
            Default: {"fuzzy_link": True, "fuzzy_email": True, "fuzzy_ip": False}.

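    Example (usage sketch; the ``git:`` schema and the option value below are
    illustrative, not defaults)::

        from linkify_it.main import LinkifyIt

        linkify = LinkifyIt(
            schemas={"git:": "http:"},   # alias "git:" to the built-in "http:" rule
            options={"fuzzy_ip": True},  # also linkify bare IP hosts
        )
        linkify.test("Clone from git://example.com/repo.git")  # expected: True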
    """

    def _validate_http(self, text, pos):
        tail = text[pos:]

        if not self.re.get("http"):
            # compile lazily, because "host"-containing variables can change on
            # tlds update.
            self.re["http"] = (
                "^\\/\\/"
                + self.re["src_auth"]
                + self.re["src_host_port_strict"]
                + self.re["src_path"]
            )

        founds = re.search(self.re["http"], tail, flags=re.IGNORECASE)
        if founds:
            return len(founds.group())

        return 0

    def _validate_double_slash(self, text, pos):
        tail = text[pos:]

        if not self.re.get("not_http"):
            # compile lazily, because "host"-containing variables can change on
            # tlds update.
            self.re["not_http"] = (
                "^"
                + self.re["src_auth"]
                + "(?:localhost|(?:(?:"
                + self.re["src_domain"]
                + ")\\.)+"
                + self.re["src_domain_root"]
                + ")"
                + self.re["src_port"]
                + self.re["src_host_terminator"]
                + self.re["src_path"]
            )

        founds = re.search(self.re["not_http"], tail, flags=re.IGNORECASE)
        if founds:
            if pos >= 3 and text[pos - 3] == ":":
                return 0
            if pos >= 3 and text[pos - 3] == "/":
                return 0
            return len(founds.group(0))

        return 0

    def _validate_mailto(self, text, pos):
        tail = text[pos:]

        if not self.re.get("mailto"):
            self.re["mailto"] = (
                "^" + self.re["src_email_name"] + "@" + self.re["src_host_strict"]
            )

        founds = re.search(self.re["mailto"], tail, flags=re.IGNORECASE)
        if founds:
            return len(founds.group(0))

        return 0

    def _reset_scan_cache(self):
        self._index = -1
        self._text_cache = ""

    def _create_validator(self, regex):
        def func(text, pos):
            tail = text[pos:]

            if isinstance(regex, str):
                founds = re.search(regex, tail, flags=re.IGNORECASE)
            else:
                # re.Pattern
                founds = re.search(regex, tail)
            if founds:
                return len(founds.group(0))

            return 0

        return func

    def _create_normalizer(self):
        def func(match):
            self.normalize(match)

        return func

    def _create_match(self, shift):
        match = Match(self, shift)
        self._compiled[match.schema]["normalize"](match)
        return match

    def __init__(self, schemas=None, options=None):
        self.default_options = {
            "fuzzy_link": True,
            "fuzzy_email": True,
            "fuzzy_ip": False,
        }

        self.default_schemas = {
            "http:": {"validate": self._validate_http},
            "https:": "http:",
            "ftp:": "http:",
            "//": {"validate": self._validate_double_slash},
            "mailto:": {"validate": self._validate_mailto},
        }

        # RE pattern for 2-character tlds (autogenerated by ./support/tlds_2char_gen.js)
        self.tlds_2ch_src_re = "a[cdefgilmnoqrstuwxz]|b[abdefghijmnorstvwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[abdefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghijklmnortuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]"  # noqa: E501

        # DON'T try to make PRs with changes. Extend TLDs with LinkifyIt.tlds() instead
        self.tlds_default = "biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф".split(  # noqa: E501
            "|"
        )

        if options:
            self.default_options.update(options)
            self._opts = self.default_options
        else:
            self._opts = self.default_options

        # Cache last tested result. Used to skip repeating steps on next `match` call.
        self._index = -1
        self._last_index = -1  # Next scan position
        self._schema = ""
        self._text_cache = ""

        if schemas:
            self.default_schemas.update(schemas)
            self._schemas = self.default_schemas
        else:
            self._schemas = self.default_schemas

        self._compiled = {}

        self._tlds = self.tlds_default
        self._tlds_replaced = False

        self.re = {}

        self._compile()

    def _compile(self):
        """Schemas compiler. Build regexps."""

        # Load & clone RE patterns.
        self.re = build_re(self._opts)

        # Define dynamic patterns
        tlds = copy.deepcopy(self._tlds)

        self._on_compile()

        if not self._tlds_replaced:
            tlds.append(self.tlds_2ch_src_re)
        tlds.append(self.re["src_xn"])

        self.re["src_tlds"] = "|".join(tlds)

        def untpl(tpl):
            return tpl.replace("%TLDS%", self.re["src_tlds"])

        self.re["email_fuzzy"] = untpl(self.re["tpl_email_fuzzy"])
        self.re["link_fuzzy"] = untpl(self.re["tpl_link_fuzzy"])
        self.re["link_no_ip_fuzzy"] = untpl(self.re["tpl_link_no_ip_fuzzy"])
        self.re["host_fuzzy_test"] = untpl(self.re["tpl_host_fuzzy_test"])

        #
        # Compile each schema
        #
        aliases = []

        self._compiled = {}

        for name, val in self._schemas.items():
            # skip disabled methods
            if val is None:
                continue

            compiled = {"validate": None, "link": None}

            self._compiled[name] = compiled

            if isinstance(val, dict):
                if isinstance(val.get("validate"), RE_TYPE):
                    compiled["validate"] = self._create_validator(val.get("validate"))
                elif isinstance(val.get("validate"), str):
                    compiled["validate"] = self._create_validator(val.get("validate"))
                elif isinstance(val.get("validate"), types.MethodType):
                    compiled["validate"] = val.get("validate")
                # Add custom handler
                elif isinstance(val.get("validate"), types.FunctionType):
                    setattr(LinkifyIt, "func", val.get("validate"))
                    compiled["validate"] = self.func
                else:
                    raise SchemaError(name, val)

                if isinstance(val.get("normalize"), types.MethodType):
                    compiled["normalize"] = val.get("normalize")
                # Add custom handler
                elif isinstance(val.get("normalize"), types.FunctionType):
                    setattr(LinkifyIt, "func", val.get("normalize"))
                    compiled["normalize"] = self.func
                elif not val.get("normalize"):
                    compiled["normalize"] = self._create_normalizer()
                else:
                    raise SchemaError(name, val)

                continue

            if isinstance(val, str):
                aliases.append(name)
                continue

            raise SchemaError(name, val)

        #
        # Compile postponed aliases
        #
        for alias in aliases:
            if not self._compiled.get(self._schemas.get(alias)):
                continue

            self._compiled[alias]["validate"] = self._compiled[self._schemas[alias]][
                "validate"
            ]
            self._compiled[alias]["normalize"] = self._compiled[self._schemas[alias]][
                "normalize"
            ]

        # Fake record for guessed links
        self._compiled[""] = {"validate": None, "normalize": self._create_normalizer()}

        #
        # Build schema condition
        #
        slist = "|".join(
            [
                _escape_re(name)
                for name, val in self._compiled.items()
                if len(name) > 0 and val
            ]
        )

        re_schema_test = (
            "(^|(?!_)(?:[><\uff5c]|" + self.re["src_ZPCc"] + "))(" + slist + ")"
        )

        # (?!_) cause 1.5x slowdown
        self.re["schema_test"] = re_schema_test
        self.re["schema_search"] = re_schema_test
        self.re["schema_at_start"] = "^" + self.re["schema_search"]

        self.re["pretest"] = (
            "(" + re_schema_test + ")|(" + self.re["host_fuzzy_test"] + ")|@"
        )

        # Cleanup
        self._reset_scan_cache()

    def add(self, schema, definition):
        """Add a new rule definition. (chainable)

        See :class:`linkify_it.main.LinkifyIt` init description for details.
        ``schema`` is a link prefix (``skype:``, for example), and ``definition``
        is a ``str`` to alias to another schema, or a ``dict`` with ``validate``
        and optionally ``normalize`` definitions. To disable an existing rule,
        use ``.add(<schema>, None)``.

        Args:
            schema (str): rule name (fixed pattern prefix)
            definition (``str``, ``dict`` or ``None``): schema definition

        Return:
            :class:`linkify_it.main.LinkifyIt`

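        Example (sketch; the ``skype:`` pattern below is illustrative)::

            linkify = LinkifyIt()
            # a regex str is checked against the tail right after the prefix
            linkify.add("skype:", {"validate": r"^[a-z][a-z0-9_.,-]{2,31}"})
            # disable a built-in rule
            linkify.add("ftp:", None)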
        """
        self._schemas[schema] = definition
        self._compile()
        return self

    def set(self, options):
        """Override default options. (chainable)

        Missing properties are not changed.

        Args:
            options (dict): ``keys``: [``fuzzy_link`` | ``fuzzy_email`` | ``fuzzy_ip``].
                ``values``: [``True`` | ``False``]

        Return:
            :class:`linkify_it.main.LinkifyIt`

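        Example (sketch; only the given keys are updated)::

            linkify = LinkifyIt()
            linkify.set({"fuzzy_ip": True})  # leaves fuzzy_link / fuzzy_email as-is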
        """
        self._opts.update(options)
        return self

    def test(self, text):
        """Searches for a linkifiable pattern and returns ``True`` on success or
        ``False`` on fail.

        Args:
            text (str): text to search

        Returns:
            bool: ``True`` if a linkable pattern was found, otherwise ``False``.

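        Example (illustrative)::

            linkify = LinkifyIt()
            linkify.test("Site github.com!")  # expected: True (fuzzy link)
            linkify.test("no links here")     # expected: False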
        """
        self._text_cache = text
        self._index = -1

        if not len(text):
            return False

        if re.search(self.re["schema_test"], text, flags=re.IGNORECASE):
            regex = self.re["schema_search"]
            last_index = 0
            matched_iter = re.finditer(regex, text[last_index:], flags=re.IGNORECASE)
            for matched in matched_iter:
                last_index = matched.end(0)
                m = (matched.group(), matched.groups()[0], matched.groups()[1])
                length = self.test_schema_at(text, m[2], last_index)
                if length:
                    self._schema = m[2]
                    self._index = matched.start(0) + len(m[1])
                    self._last_index = matched.start(0) + len(m[0]) + length
                    break

        if self._opts.get("fuzzy_link") and self._compiled.get("http:"):
            # guess schemaless links
            matched_tld = re.search(
                self.re["host_fuzzy_test"], text, flags=re.IGNORECASE
            )
            if matched_tld:
                tld_pos = matched_tld.start(0)
            else:
                tld_pos = -1
            if tld_pos >= 0:
                # if tld is located after found link - no need to check fuzzy pattern
                if self._index < 0 or tld_pos < self._index:
                    if self._opts.get("fuzzy_ip"):
                        pattern = self.re["link_fuzzy"]
                    else:
                        pattern = self.re["link_no_ip_fuzzy"]

                    ml = re.search(pattern, text, flags=re.IGNORECASE)
                    if ml:
                        shift = ml.start(0) + len(ml.groups()[0])

                        if self._index < 0 or shift < self._index:
                            self._schema = ""
                            self._index = shift
                            self._last_index = ml.start(0) + len(ml.group())

        if self._opts.get("fuzzy_email") and self._compiled.get("mailto:"):
            # guess schemaless emails
            at_pos = _index_of(text, "@")
            if at_pos >= 0:
                # We can't skip this check, because these cases are possible:
                # 192.168.1.1@gmail.com, my.in@example.com
                me = re.search(self.re["email_fuzzy"], text, flags=re.IGNORECASE)
                if me:
                    shift = me.start(0) + len(me.groups()[0])
                    next_shift = me.start(0) + len(me.group())

                    if (
                        self._index < 0
                        or shift < self._index
                        or (shift == self._index and next_shift > self._last_index)
                    ):
                        self._schema = "mailto:"
                        self._index = shift
                        self._last_index = next_shift

        return self._index >= 0

    def pretest(self, text):
        """Very quick check that can give false positives.

        Returns ``True`` if a link MAY exist in the text. Can be used as a speed
        optimization when you only need to know that a link is definitely NOT present.

        Args:
            text (str): text to search

        Returns:
            bool: ``True`` if a linkable pattern might be present, otherwise ``False``.

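        Example (sketch; a positive pretest still needs :meth:`test` or
        :meth:`match` to confirm)::

            linkify = LinkifyIt()
            text = "mail me at user@example.com"
            if linkify.pretest(text):        # cheap check, may be a false positive
                found = linkify.match(text)  # precise (and slower) scan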
        """
        if re.search(self.re["pretest"], text, flags=re.IGNORECASE):
            return True

        return False

    def test_schema_at(self, text, name, position):
        """Similar to :meth:`linkify_it.main.LinkifyIt.test` but checks only a
        specific protocol tail, exactly at the given position.

        Args:
            text (str): text to scan
            name (str): rule (schema) name
            position (int): position to check the tail from (right after the prefix)

        Returns:
            int: length of found pattern (0 on fail).

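        Example (illustrative)::

            linkify = LinkifyIt()
            text = "mailto:user@example.com"
            # check the "mailto:" tail starting right after the prefix (position 7)
            length = linkify.test_schema_at(text, "mailto:", 7)
            # length is the matched tail length, or 0 on fail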
        """
        # If not supported schema check requested - terminate
        if not self._compiled.get(name.lower()):
            return 0
        return self._compiled.get(name.lower()).get("validate")(text, position)

    def match(self, text):
        """Returns a ``list`` of found link descriptions or ``None`` on fail.

        We strongly recommend to use :meth:`linkify_it.main.LinkifyIt.test`
        first, for best speed.

        Args:
            text (str): text to search

        Returns:
            ``list`` or ``None``: Result match description:

            * **schema** - link schema; can be empty for fuzzy links, or ``//``
              for protocol-neutral links.
            * **index** - offset of the start of the matched text
            * **last_index** - offset right after the matched text
            * **raw** - matched text
            * **text** - normalized text
            * **url** - link, generated from the matched text

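        Example (illustrative sketch)::

            linkify = LinkifyIt()
            found = linkify.match("Site github.com and mail user@example.com")
            for m in found or []:
                print(m.schema, m.raw, m.url)
            # expected output, roughly:
            #          github.com        http://github.com
            # mailto:  user@example.com  mailto:user@example.com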
        """
        shift = 0
        result = []

        # try to take previous element from cache, if .test() called before
        if self._index >= 0 and self._text_cache == text:
            result.append(self._create_match(shift))
            shift = self._last_index

        # Cut head if cache was used
        tail = text[shift:] if shift else text

        # Scan string until end reached
        while self.test(tail):
            result.append(self._create_match(shift))

            tail = tail[self._last_index :]
            shift += self._last_index

        if len(result):
            return result

        return None

    def match_at_start(self, text):
        """Returns a fully-formed (not fuzzy) link if it starts at the beginning
        of the string, and ``None`` otherwise.

        Args:
            text (str): text to search

        Returns:
            ``Match`` or ``None``

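        Example (illustrative)::

            linkify = LinkifyIt()
            linkify.match_at_start("http://example.com rest")  # Match for the URL
            linkify.match_at_start("see http://example.com")   # None (not at start)
            linkify.match_at_start("example.com")               # None (fuzzy links don't count)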
        """
        # Reset scan cache
        self._text_cache = text
        self._index = -1

        if not len(text):
            return None

        founds = re.search(self.re["schema_at_start"], text, flags=re.IGNORECASE)
        if not founds:
            return None

        m = (founds.group(), founds.groups()[0], founds.groups()[1])
        length = self.test_schema_at(text, m[2], len(m[0]))
        if not length:
            return None

        self._schema = m[2]
        self._index = founds.start(0) + len(m[1])
        self._last_index = founds.start(0) + len(m[0]) + length
        return self._create_match(0)

    def tlds(self, list_tlds, keep_old=False):
        """Load (or merge) a new tlds list. (chainable)

        These are used for fuzzy links (without a prefix) to avoid false positives.
        By default this algorithm is used:

        * hostnames with any 2-letter root zone are ok.
        * biz|com|edu|gov|net|org|pro|web|xxx|aero|asia|coop|info|museum|name|shop|рф
          are ok.
        * encoded (`xn--...`) root zones are ok.

        If the list is replaced, then 2-char root zones are checked by exact match only.

        Args:
            list_tlds (list or str): ``list of tlds`` or ``tlds string``
            keep_old (bool): merge with the current list if ``True``
                (``False`` by default)

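        Example (sketch; ``"onion"`` is just an illustrative root zone)::

            linkify = LinkifyIt()
            linkify.test("site.onion")            # expected: False, unknown root zone
            linkify.tlds("onion", keep_old=True)  # merge with the default list
            linkify.test("site.onion")            # expected: True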
        """
        _list = list_tlds if isinstance(list_tlds, list) else [list_tlds]

        if not keep_old:
            self._tlds = _list
            self._tlds_replaced = True
            self._compile()
            return self

        self._tlds.extend(_list)
        self._tlds = sorted(list(set(self._tlds)), reverse=True)

        self._compile()
        return self

    def normalize(self, match):
        """Default normalizer (if the schema does not define its own).

        Args:
            match (:class:`linkify_it.main.Match`): Match result

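        Example (effect of the default normalizer; illustrative)::

            linkify = LinkifyIt()
            linkify.match("github.com")[0].url        # -> "http://github.com"
            linkify.match("user@example.com")[0].url  # -> "mailto:user@example.com"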
        """
        if not match.schema:
            match.url = "http://" + match.url

        if match.schema == "mailto:" and not re.search(
            "^mailto:", match.url, flags=re.IGNORECASE
        ):
            match.url = "mailto:" + match.url

    def _on_compile(self):
        """Override to modify basic RegExp-s."""
        pass