# Copyright 2019 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
r"""A drop-in replacement for the re module.

It uses RE2 under the hood, of course, so various PCRE features
(e.g. backreferences, look-around assertions) are not supported.

See https://github.com/google/re2/wiki/Syntax for the canonical
reference, but known syntactic "gotchas" relative to Python are:

  * PCRE supports \Z and \z; RE2 supports \z; Python supports \z,
    but calls it \Z. You must rewrite \Z to \z in pattern strings.

Known differences between this module's API and the re module's API:

  * The error class does not provide any error information as attributes.
  * The Options class replaces the re module's flags with RE2's options as
    gettable/settable properties. Please see re2.h for their documentation.
  * The pattern string and the input string do not have to be the same type.
    Any str will be encoded to UTF-8.
  * The pattern string cannot be str if the options specify Latin-1 encoding.

This module's LRU cache contains a maximum of 128 regular expression objects.
Each regular expression object's underlying RE2 object uses a maximum of 8MiB
of memory (by default). Hence, this module's LRU cache uses a maximum of 1GiB
of memory (by default), but in most cases, it should use much less than that.
"""
# start delvewheel patch
def _delvewheel_patch_1_7_0():
  import os
  libs_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir, 'google_re2.libs'))
  if os.path.isdir(libs_dir):
    os.add_dll_directory(libs_dir)


_delvewheel_patch_1_7_0()
del _delvewheel_patch_1_7_0
# end delvewheel patch

import codecs
import functools
import itertools

from . import _re2

# pybind11 translates C++ exceptions to Python exceptions.
# We use that same Python exception class for consistency.
error = _re2.Error
class Options(_re2.RE2.Options):

  __slots__ = ()

  NAMES = (
      'max_mem',
      'encoding',
      'posix_syntax',
      'longest_match',
      'log_errors',
      'literal',
      'never_nl',
      'dot_nl',
      'never_capture',
      'case_sensitive',
      'perl_classes',
      'word_boundary',
      'one_line',
  )
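# Illustrative sketch of the Options class above (hypothetical values): each
# name in NAMES is a gettable/settable property mapped onto RE2's options, and
# an Options instance can be passed to compile() or to any module-level
# function via the `options` keyword:
#
#   opts = Options()
#   opts.case_sensitive = False    # analogous to re.IGNORECASE
#   opts.max_mem = 16 << 20        # allow the RE2 object up to 16MiB
#   pattern = compile(r'hello \w+', options=opts)
#   pattern.search('HELLO world')  # matches despite the case difference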
def compile(pattern, options=None):
  if isinstance(pattern, _Regexp):
    if options:
      raise error('pattern is already compiled, so '
                  'options may not be specified')
    pattern = pattern._pattern
  options = options or Options()
  values = tuple(getattr(options, name) for name in Options.NAMES)
  return _Regexp._make(pattern, values)


def search(pattern, text, options=None):
  return compile(pattern, options=options).search(text)


def match(pattern, text, options=None):
  return compile(pattern, options=options).match(text)


def fullmatch(pattern, text, options=None):
  return compile(pattern, options=options).fullmatch(text)


def finditer(pattern, text, options=None):
  return compile(pattern, options=options).finditer(text)


def findall(pattern, text, options=None):
  return compile(pattern, options=options).findall(text)


def split(pattern, text, maxsplit=0, options=None):
  return compile(pattern, options=options).split(text, maxsplit)


def subn(pattern, repl, text, count=0, options=None):
  return compile(pattern, options=options).subn(repl, text, count)


def sub(pattern, repl, text, count=0, options=None):
  return compile(pattern, options=options).sub(repl, text, count)


def _encode(t):
  return t.encode(encoding='utf-8')


def _decode(b):
  return b.decode(encoding='utf-8')


def escape(pattern):
  if isinstance(pattern, str):
    encoded_pattern = _encode(pattern)
    escaped = _re2.RE2.QuoteMeta(encoded_pattern)
    decoded_escaped = _decode(escaped)
    return decoded_escaped
  else:
    escaped = _re2.RE2.QuoteMeta(pattern)
    return escaped


def purge():
  return _Regexp._make.cache_clear()
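# Note on caching (a sketch of the behaviour implemented by _Regexp._make
# below): compile() funnels every call through an LRU cache keyed on the
# pattern and the option values, so repeatedly compiling the same pattern is
# cheap, and purge() above simply clears that cache:
#
#   compile(r'\d+') is compile(r'\d+')   # True - same cached object
#   purge()
#   compile(r'\d+')                      # recompiled after the purge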
_Anchor = _re2.RE2.Anchor
_NULL_SPAN = (-1, -1)


class _Regexp(object):

  __slots__ = ('_pattern', '_regexp')

  @classmethod
  @functools.lru_cache(typed=True)
  def _make(cls, pattern, values):
    options = Options()
    for name, value in zip(Options.NAMES, values):
      setattr(options, name, value)
    return cls(pattern, options)

  def __init__(self, pattern, options):
    self._pattern = pattern
    if isinstance(self._pattern, str):
      if options.encoding == Options.Encoding.LATIN1:
        raise error('string type of pattern is str, but '
                    'encoding specified in options is LATIN1')
      encoded_pattern = _encode(self._pattern)
      self._regexp = _re2.RE2(encoded_pattern, options)
    else:
      self._regexp = _re2.RE2(self._pattern, options)
    if not self._regexp.ok():
      raise error(self._regexp.error())

  def __getstate__(self):
    options = {name: getattr(self.options, name) for name in Options.NAMES}
    return self._pattern, options

  def __setstate__(self, state):
    pattern, options = state
    values = tuple(options[name] for name in Options.NAMES)
    other = _Regexp._make(pattern, values)
    self._pattern = other._pattern
    self._regexp = other._regexp
  def _match(self, anchor, text, pos=None, endpos=None):
    pos = 0 if pos is None else max(0, min(pos, len(text)))
    endpos = len(text) if endpos is None else max(0, min(endpos, len(text)))
    if pos > endpos:
      return
    if isinstance(text, str):
      encoded_text = _encode(text)
      encoded_pos = _re2.CharLenToBytes(encoded_text, 0, pos)
      if endpos == len(text):
        # This is the common case.
        encoded_endpos = len(encoded_text)
      else:
        encoded_endpos = encoded_pos + _re2.CharLenToBytes(
            encoded_text, encoded_pos, endpos - pos)
      decoded_offsets = {0: 0}
      last_offset = 0
      while True:
        spans = self._regexp.Match(anchor, encoded_text, encoded_pos,
                                   encoded_endpos)
        if spans[0] == _NULL_SPAN:
          break

        # This algorithm is linear in the length of encoded_text. Specifically,
        # no matter how many groups there are for a given regular expression or
        # how many iterations through the loop there are for a given generator,
        # this algorithm uses a single, straightforward pass over encoded_text.
        offsets = sorted(set(itertools.chain(*spans)))
        if offsets[0] == -1:
          offsets = offsets[1:]
        # Discard the rest of the items because they are useless now - and we
        # could accumulate one item per str offset in the pathological case!
        decoded_offsets = {last_offset: decoded_offsets[last_offset]}
        for offset in offsets:
          decoded_offsets[offset] = (
              decoded_offsets[last_offset] +
              _re2.BytesToCharLen(encoded_text, last_offset, offset))
          last_offset = offset

        def decode(span):
          if span == _NULL_SPAN:
            return span
          return decoded_offsets[span[0]], decoded_offsets[span[1]]

        decoded_spans = [decode(span) for span in spans]
        yield _Match(self, text, pos, endpos, decoded_spans)

        if encoded_pos == encoded_endpos:
          break
        elif encoded_pos == spans[0][1]:
          # We matched the empty string at encoded_pos and would be stuck, so
          # in order to make forward progress, increment the str offset.
          encoded_pos += _re2.CharLenToBytes(encoded_text, encoded_pos, 1)
        else:
          encoded_pos = spans[0][1]
    else:
      while True:
        spans = self._regexp.Match(anchor, text, pos, endpos)
        if spans[0] == _NULL_SPAN:
          break
        yield _Match(self, text, pos, endpos, spans)
        if pos == endpos:
          break
        elif pos == spans[0][1]:
          # We matched the empty string at pos and would be stuck, so in order
          # to make forward progress, increment the bytes offset.
          pos += 1
        else:
          pos = spans[0][1]
  def search(self, text, pos=None, endpos=None):
    return next(self._match(_Anchor.UNANCHORED, text, pos, endpos), None)

  def match(self, text, pos=None, endpos=None):
    return next(self._match(_Anchor.ANCHOR_START, text, pos, endpos), None)

  def fullmatch(self, text, pos=None, endpos=None):
    return next(self._match(_Anchor.ANCHOR_BOTH, text, pos, endpos), None)

  def finditer(self, text, pos=None, endpos=None):
    return self._match(_Anchor.UNANCHORED, text, pos, endpos)

  def findall(self, text, pos=None, endpos=None):
    empty = type(text)()
    items = []
    for match in self.finditer(text, pos, endpos):
      if not self.groups:
        item = match.group()
      elif self.groups == 1:
        item = match.groups(default=empty)[0]
      else:
        item = match.groups(default=empty)
      items.append(item)
    return items

  def _split(self, cb, text, maxsplit=0):
    if maxsplit < 0:
      return [text], 0
    elif maxsplit > 0:
      matchiter = itertools.islice(self.finditer(text), maxsplit)
    else:
      matchiter = self.finditer(text)
    pieces = []
    end = 0
    numsplit = 0
    for match in matchiter:
      pieces.append(text[end:match.start()])
      pieces.extend(cb(match))
      end = match.end()
      numsplit += 1
    pieces.append(text[end:])
    return pieces, numsplit

  def split(self, text, maxsplit=0):
    cb = lambda match: [match[group] for group in range(1, self.groups + 1)]
    pieces, _ = self._split(cb, text, maxsplit)
    return pieces

  def subn(self, repl, text, count=0):
    cb = lambda match: [repl(match) if callable(repl) else match.expand(repl)]
    empty = type(text)()
    pieces, numsplit = self._split(cb, text, count)
    joined_pieces = empty.join(pieces)
    return joined_pieces, numsplit

  def sub(self, repl, text, count=0):
    joined_pieces, _ = self.subn(repl, text, count)
    return joined_pieces
  @property
  def pattern(self):
    return self._pattern

  @property
  def options(self):
    return self._regexp.options()

  @property
  def groups(self):
    return self._regexp.NumberOfCapturingGroups()

  @property
  def groupindex(self):
    groups = self._regexp.NamedCapturingGroups()
    if isinstance(self._pattern, str):
      decoded_groups = [(_decode(group), index) for group, index in groups]
      return dict(decoded_groups)
    else:
      return dict(groups)

  @property
  def programsize(self):
    return self._regexp.ProgramSize()

  @property
  def reverseprogramsize(self):
    return self._regexp.ReverseProgramSize()

  @property
  def programfanout(self):
    return self._regexp.ProgramFanout()

  @property
  def reverseprogramfanout(self):
    return self._regexp.ReverseProgramFanout()

  def possiblematchrange(self, maxlen):
    ok, min, max = self._regexp.PossibleMatchRange(maxlen)
    if not ok:
      raise error('failed to compute match range')
    return min, max
class _Match(object):

  __slots__ = ('_regexp', '_text', '_pos', '_endpos', '_spans')

  def __init__(self, regexp, text, pos, endpos, spans):
    self._regexp = regexp
    self._text = text
    self._pos = pos
    self._endpos = endpos
    self._spans = spans

  # Python prioritises three-digit octal numbers over group escapes.
  # For example, \100 should not be handled the same way as \g<10>0.
  _OCTAL_RE = compile('\\\\[0-7][0-7][0-7]')

  # Python supports \1 through \99 (inclusive) and \g<...> syntax.
  _GROUP_RE = compile('\\\\[1-9][0-9]?|\\\\g<\\w+>')
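  # Illustrative examples of the precedence described above (hypothetical
  # templates): in expand()/sub() templates, '\100' is treated as the octal
  # escape for '@' rather than as group 10 followed by '0', whereas '\g<10>0'
  # unambiguously means group 10 followed by a literal '0'. Named groups use
  # the same \g<...> syntax, e.g. '\g<word>'.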
  @classmethod
  @functools.lru_cache(typed=True)
  def _split(cls, template):
    if isinstance(template, str):
      backslash = '\\'
    else:
      backslash = b'\\'
    empty = type(template)()
    pieces = [empty]
    index = template.find(backslash)
    while index != -1:
      piece, template = template[:index], template[index:]
      pieces[-1] += piece
      octal_match = cls._OCTAL_RE.match(template)
      group_match = cls._GROUP_RE.match(template)
      if (not octal_match) and group_match:
        index = group_match.end()
        piece, template = template[:index], template[index:]
        pieces.extend((piece, empty))
      else:
        # 2 isn't enough for \o, \x, \N, \u and \U escapes, but none of those
        # should contain backslashes, so break them here and then fix them at
        # the beginning of the next loop iteration or right before returning.
        index = 2
        piece, template = template[:index], template[index:]
        pieces[-1] += piece
      index = template.find(backslash)
    pieces[-1] += template
    return pieces
  def expand(self, template):
    if isinstance(template, str):
      unescape = codecs.unicode_escape_decode
    else:
      unescape = codecs.escape_decode
    empty = type(template)()
    # Make a copy so that we don't clobber the cached pieces!
    pieces = list(self._split(template))
    for index, piece in enumerate(pieces):
      if not index % 2:
        pieces[index], _ = unescape(piece)
      else:
        if len(piece) <= 3:  # \1 through \99 (inclusive)
          group = int(piece[1:])
        else:  # \g<...>
          group = piece[3:-1]
          try:
            group = int(group)
          except ValueError:
            pass
        pieces[index] = self.__getitem__(group) or empty
    joined_pieces = empty.join(pieces)
    return joined_pieces
  def __getitem__(self, group):
    if not isinstance(group, int):
      try:
        group = self._regexp.groupindex[group]
      except KeyError:
        raise IndexError('bad group name')
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    span = self._spans[group]
    if span == _NULL_SPAN:
      return None
    return self._text[span[0]:span[1]]

  def group(self, *groups):
    if not groups:
      groups = (0,)
    items = (self.__getitem__(group) for group in groups)
    return next(items) if len(groups) == 1 else tuple(items)

  def groups(self, default=None):
    items = []
    for group in range(1, self._regexp.groups + 1):
      item = self.__getitem__(group)
      items.append(default if item is None else item)
    return tuple(items)

  def groupdict(self, default=None):
    items = []
    for group, index in self._regexp.groupindex.items():
      item = self.__getitem__(index)
      items.append((group, default) if item is None else (group, item))
    return dict(items)

  def start(self, group=0):
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    return self._spans[group][0]

  def end(self, group=0):
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    return self._spans[group][1]

  def span(self, group=0):
    if not 0 <= group <= self._regexp.groups:
      raise IndexError('bad group index')
    return self._spans[group]

  @property
  def re(self):
    return self._regexp

  @property
  def string(self):
    return self._text

  @property
  def pos(self):
    return self._pos

  @property
  def endpos(self):
    return self._endpos

  @property
  def lastindex(self):
    max_end = -1
    max_group = None
    # We look for the rightmost right parenthesis by keeping the first group
    # that ends at max_end because that is the leftmost/outermost group when
    # there are nested groups!
    for group in range(1, self._regexp.groups + 1):
      end = self._spans[group][1]
      if max_end < end:
        max_end = end
        max_group = group
    return max_group

  @property
  def lastgroup(self):
    max_group = self.lastindex
    if not max_group:
      return None
    for group, index in self._regexp.groupindex.items():
      if max_group == index:
        return group
    return None
class Set(object):
  """A Pythonic wrapper around RE2::Set."""

  __slots__ = ('_set')

  def __init__(self, anchor, options=None):
    options = options or Options()
    self._set = _re2.Set(anchor, options)

  @classmethod
  def SearchSet(cls, options=None):
    return cls(_Anchor.UNANCHORED, options=options)

  @classmethod
  def MatchSet(cls, options=None):
    return cls(_Anchor.ANCHOR_START, options=options)

  @classmethod
  def FullMatchSet(cls, options=None):
    return cls(_Anchor.ANCHOR_BOTH, options=options)

  def Add(self, pattern):
    if isinstance(pattern, str):
      encoded_pattern = _encode(pattern)
      index = self._set.Add(encoded_pattern)
    else:
      index = self._set.Add(pattern)
    if index == -1:
      raise error('failed to add %r to Set' % pattern)
    return index

  def Compile(self):
    if not self._set.Compile():
      raise error('failed to compile Set')

  def Match(self, text):
    if isinstance(text, str):
      encoded_text = _encode(text)
      matches = self._set.Match(encoded_text)
    else:
      matches = self._set.Match(text)
    return matches or None
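# Illustrative Set usage (hypothetical patterns): a Set matches a batch of
# patterns simultaneously and reports which of them matched, using the indices
# returned from Add(); Match() returns None when nothing matched:
#
#   s = Set.SearchSet()
#   s.Add('cat')         # -> 0
#   s.Add('dog')         # -> 1
#   s.Compile()
#   s.Match('hot dog')   # -> [1]
#   s.Match('goldfish')  # -> None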
class Filter(object):
  """A Pythonic wrapper around FilteredRE2."""

  __slots__ = ('_filter', '_patterns')

  def __init__(self):
    self._filter = _re2.Filter()
    self._patterns = []

  def Add(self, pattern, options=None):
    options = options or Options()
    if isinstance(pattern, str):
      encoded_pattern = _encode(pattern)
      index = self._filter.Add(encoded_pattern, options)
    else:
      index = self._filter.Add(pattern, options)
    if index == -1:
      raise error('failed to add %r to Filter' % pattern)
    self._patterns.append(pattern)
    return index

  def Compile(self):
    if not self._filter.Compile():
      raise error('failed to compile Filter')

  def Match(self, text, potential=False):
    if isinstance(text, str):
      encoded_text = _encode(text)
      matches = self._filter.Match(encoded_text, potential)
    else:
      matches = self._filter.Match(text, potential)
    return matches or None

  def re(self, index):
    if not 0 <= index < len(self._patterns):
      raise IndexError('bad index')
    proxy = object.__new__(_Regexp)
    proxy._pattern = self._patterns[index]
    proxy._regexp = self._filter.GetRE2(index)
    return proxy
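# Illustrative Filter usage (hypothetical patterns): Match() returns the
# indices of patterns that matched (or None), and passing potential=True asks
# only which patterns could potentially match; re(index) recovers a pattern
# object for a given index:
#
#   f = Filter()
#   f.Add(r'\bcat\b')            # -> 0
#   f.Add(r'\bdog\b')            # -> 1
#   f.Compile()
#   f.Match('the dog barked')    # -> [1]
#   f.re(1).pattern              # -> r'\bdog\b'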