gitwildmatch.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. """
  2. This module implements Git's wildmatch pattern matching which itself is derived
  3. from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
  4. """
  5. import re
  6. import warnings
  7. from typing import (
  8. AnyStr,
  9. Optional, # Replaced by `X | None` in 3.10.
  10. Tuple) # Replaced by `tuple` in 3.9.
  11. from .. import util
  12. from ..pattern import RegexPattern
# Byte-string patterns are decoded with this codec before they are parsed, and
# the regex produced from them is encoded back with the same codec.
_BYTES_ENCODING = 'latin1'
"""
The encoding to use when parsing a byte string pattern.
"""

# Name of the named regex group that wraps the slash separating a matched
# directory from whatever follows it in the generated regular expressions.
_DIR_MARK = 'ps_d'
"""
The regex group name for the directory marker. This is only used by
:class:`GitIgnoreSpec`.
"""
  22. class GitWildMatchPatternError(ValueError):
  23. """
  24. The :class:`GitWildMatchPatternError` indicates an invalid git wild match
  25. pattern.
  26. """
  27. pass
  28. class GitWildMatchPattern(RegexPattern):
  29. """
  30. The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
  31. pattern.
  32. """
  33. # Keep the dict-less class hierarchy.
  34. __slots__ = ()
  35. @classmethod
  36. def pattern_to_regex(
  37. cls,
  38. pattern: AnyStr,
  39. ) -> Tuple[Optional[AnyStr], Optional[bool]]:
  40. """
  41. Convert the pattern into a regular expression.
  42. *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
  43. regular expression.
  44. Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
  45. :data:`None`); and whether matched files should be included (:data:`True`),
  46. excluded (:data:`False`), or if it is a null-operation (:data:`None`).
  47. """
  48. if isinstance(pattern, str):
  49. return_type = str
  50. elif isinstance(pattern, bytes):
  51. return_type = bytes
  52. pattern = pattern.decode(_BYTES_ENCODING)
  53. else:
  54. raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")
  55. original_pattern = pattern
  56. if pattern.endswith('\\ '):
  57. # EDGE CASE: Spaces can be escaped with backslash. If a pattern that ends
  58. # with backslash followed by a space, only strip from left.
  59. pattern = pattern.lstrip()
  60. else:
  61. pattern = pattern.strip()
  62. if pattern.startswith('#'):
  63. # A pattern starting with a hash ('#') serves as a comment (neither
  64. # includes nor excludes files). Escape the hash with a back-slash to match
  65. # a literal hash (i.e., '\#').
  66. regex = None
  67. include = None
  68. elif pattern == '/':
  69. # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does
  70. # not match any file.
  71. regex = None
  72. include = None
  73. elif pattern:
  74. if pattern.startswith('!'):
  75. # A pattern starting with an exclamation mark ('!') negates the pattern
  76. # (exclude instead of include). Escape the exclamation mark with a
  77. # back-slash to match a literal exclamation mark (i.e., '\!').
  78. include = False
  79. # Remove leading exclamation mark.
  80. pattern = pattern[1:]
  81. else:
  82. include = True
  83. # Allow a regex override for edge cases that cannot be handled through
  84. # normalization.
  85. override_regex = None
  86. # Split pattern into segments.
  87. pattern_segs = pattern.split('/')
  88. # Check whether the pattern is specifically a directory pattern before
  89. # normalization.
  90. is_dir_pattern = not pattern_segs[-1]
  91. # Normalize pattern to make processing easier.
  92. # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
  93. # sequence down to one double-asterisk. Iterate over the segments in
  94. # reverse and remove the duplicate double asterisks as we go.
  95. for i in range(len(pattern_segs) - 1, 0, -1):
  96. prev = pattern_segs[i-1]
  97. seg = pattern_segs[i]
  98. if prev == '**' and seg == '**':
  99. del pattern_segs[i]
  100. if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
  101. # EDGE CASE: The '**/' pattern should match everything except individual
  102. # files in the root directory. This case cannot be adequately handled
  103. # through normalization. Use the override.
  104. override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'
  105. if not pattern_segs[0]:
  106. # A pattern beginning with a slash ('/') will only match paths directly
  107. # on the root directory instead of any descendant paths. So, remove
  108. # empty first segment to make pattern relative to root.
  109. del pattern_segs[0]
  110. elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
  111. # A single pattern without a beginning slash ('/') will match any
  112. # descendant path. This is equivalent to "**/{pattern}". So, prepend
  113. # with double-asterisks to make pattern relative to root.
  114. # - EDGE CASE: This also holds for a single pattern with a trailing
  115. # slash (e.g. dir/).
  116. if pattern_segs[0] != '**':
  117. pattern_segs.insert(0, '**')
  118. else:
  119. # EDGE CASE: A pattern without a beginning slash ('/') but contains at
  120. # least one prepended directory (e.g. "dir/{pattern}") should not match
  121. # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1).
  122. pass
  123. if not pattern_segs:
  124. # After resolving the edge cases, we end up with no pattern at all. This
  125. # must be because the pattern is invalid.
  126. raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")
  127. if not pattern_segs[-1] and len(pattern_segs) > 1:
  128. # A pattern ending with a slash ('/') will match all descendant paths if
  129. # it is a directory but not if it is a regular file. This is equivalent
  130. # to "{pattern}/**". So, set last segment to a double-asterisk to
  131. # include all descendants.
  132. pattern_segs[-1] = '**'
  133. if override_regex is None:
  134. # Build regular expression from pattern.
  135. output = ['^']
  136. need_slash = False
  137. end = len(pattern_segs) - 1
  138. for i, seg in enumerate(pattern_segs):
  139. if seg == '**':
  140. if i == 0 and i == end:
  141. # A pattern consisting solely of double-asterisks ('**') will
  142. # match every path.
  143. output.append(f'[^/]+(?:/.*)?')
  144. elif i == 0:
  145. # A normalized pattern beginning with double-asterisks
  146. # ('**') will match any leading path segments.
  147. output.append('(?:.+/)?')
  148. need_slash = False
  149. elif i == end:
  150. # A normalized pattern ending with double-asterisks ('**') will
  151. # match any trailing path segments.
  152. if is_dir_pattern:
  153. output.append(f'(?P<{_DIR_MARK}>/).*')
  154. else:
  155. output.append(f'/.*')
  156. else:
  157. # A pattern with inner double-asterisks ('**') will match multiple
  158. # (or zero) inner path segments.
  159. output.append('(?:/.+)?')
  160. need_slash = True
  161. elif seg == '*':
  162. # Match single path segment.
  163. if need_slash:
  164. output.append('/')
  165. output.append('[^/]+')
  166. if i == end:
  167. # A pattern ending without a slash ('/') will match a file or a
  168. # directory (with paths underneath it). E.g., "foo" matches "foo",
  169. # "foo/bar", "foo/bar/baz", etc.
  170. output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')
  171. need_slash = True
  172. else:
  173. # Match segment glob pattern.
  174. if need_slash:
  175. output.append('/')
  176. try:
  177. output.append(cls._translate_segment_glob(seg))
  178. except ValueError as e:
  179. raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e
  180. if i == end:
  181. # A pattern ending without a slash ('/') will match a file or a
  182. # directory (with paths underneath it). E.g., "foo" matches "foo",
  183. # "foo/bar", "foo/bar/baz", etc.
  184. output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')
  185. need_slash = True
  186. output.append('$')
  187. regex = ''.join(output)
  188. else:
  189. # Use regex override.
  190. regex = override_regex
  191. else:
  192. # A blank pattern is a null-operation (neither includes nor excludes
  193. # files).
  194. regex = None
  195. include = None
  196. if regex is not None and return_type is bytes:
  197. regex = regex.encode(_BYTES_ENCODING)
  198. return regex, include
  199. @staticmethod
  200. def _translate_segment_glob(pattern: str) -> str:
  201. """
  202. Translates the glob pattern to a regular expression. This is used in the
  203. constructor to translate a path segment glob pattern to its corresponding
  204. regular expression.
  205. *pattern* (:class:`str`) is the glob pattern.
  206. Returns the regular expression (:class:`str`).
  207. """
  208. # NOTE: This is derived from `fnmatch.translate()` and is similar to the
  209. # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.
  210. escape = False
  211. regex = ''
  212. i, end = 0, len(pattern)
  213. while i < end:
  214. # Get next character.
  215. char = pattern[i]
  216. i += 1
  217. if escape:
  218. # Escape the character.
  219. escape = False
  220. regex += re.escape(char)
  221. elif char == '\\':
  222. # Escape character, escape next character.
  223. escape = True
  224. elif char == '*':
  225. # Multi-character wildcard. Match any string (except slashes), including
  226. # an empty string.
  227. regex += '[^/]*'
  228. elif char == '?':
  229. # Single-character wildcard. Match any single character (except a
  230. # slash).
  231. regex += '[^/]'
  232. elif char == '[':
  233. # Bracket expression wildcard. Except for the beginning exclamation
  234. # mark, the whole bracket expression can be used directly as regex, but
  235. # we have to find where the expression ends.
  236. # - "[][!]" matches ']', '[' and '!'.
  237. # - "[]-]" matches ']' and '-'.
  238. # - "[!]a-]" matches any character except ']', 'a' and '-'.
  239. j = i
  240. # Pass bracket expression negation.
  241. if j < end and (pattern[j] == '!' or pattern[j] == '^'):
  242. j += 1
  243. # Pass first closing bracket if it is at the beginning of the
  244. # expression.
  245. if j < end and pattern[j] == ']':
  246. j += 1
  247. # Find closing bracket. Stop once we reach the end or find it.
  248. while j < end and pattern[j] != ']':
  249. j += 1
  250. if j < end:
  251. # Found end of bracket expression. Increment j to be one past the
  252. # closing bracket:
  253. #
  254. # [...]
  255. # ^ ^
  256. # i j
  257. #
  258. j += 1
  259. expr = '['
  260. if pattern[i] == '!':
  261. # Bracket expression needs to be negated.
  262. expr += '^'
  263. i += 1
  264. elif pattern[i] == '^':
  265. # POSIX declares that the regex bracket expression negation "[^...]"
  266. # is undefined in a glob pattern. Python's `fnmatch.translate()`
  267. # escapes the caret ('^') as a literal. Git supports the using a
  268. # caret for negation. Maintain consistency with Git because that is
  269. # the expected behavior.
  270. expr += '^'
  271. i += 1
  272. # Build regex bracket expression. Escape slashes so they are treated
  273. # as literal slashes by regex as defined by POSIX.
  274. expr += pattern[i:j].replace('\\', '\\\\')
  275. # Add regex bracket expression to regex result.
  276. regex += expr
  277. # Set i to one past the closing bracket.
  278. i = j
  279. else:
  280. # Failed to find closing bracket, treat opening bracket as a bracket
  281. # literal instead of as an expression.
  282. regex += '\\['
  283. else:
  284. # Regular character, escape it for regex.
  285. regex += re.escape(char)
  286. if escape:
  287. raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")
  288. return regex
  289. @staticmethod
  290. def escape(s: AnyStr) -> AnyStr:
  291. """
  292. Escape special characters in the given string.
  293. *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
  294. escape, usually before adding it to a ".gitignore".
  295. Returns the escaped string (:class:`str` or :class:`bytes`).
  296. """
  297. if isinstance(s, str):
  298. return_type = str
  299. string = s
  300. elif isinstance(s, bytes):
  301. return_type = bytes
  302. string = s.decode(_BYTES_ENCODING)
  303. else:
  304. raise TypeError(f"s:{s!r} is not a unicode or byte string.")
  305. # Reference: https://git-scm.com/docs/gitignore#_pattern_format
  306. meta_characters = r"[]!*#?"
  307. out_string = "".join("\\" + x if x in meta_characters else x for x in string)
  308. if return_type is bytes:
  309. return out_string.encode(_BYTES_ENCODING)
  310. else:
  311. return out_string
# Register `GitWildMatchPattern` under the name "gitwildmatch".
# NOTE(review): presumably this lets the class be looked up by that name
# elsewhere in the package -- confirm against `util.register_pattern`.
util.register_pattern('gitwildmatch', GitWildMatchPattern)
  313. class GitIgnorePattern(GitWildMatchPattern):
  314. """
  315. The :class:`GitIgnorePattern` class is deprecated by :class:`GitWildMatchPattern`.
  316. This class only exists to maintain compatibility with v0.4.
  317. """
  318. def __init__(self, *args, **kw) -> None:
  319. """
  320. Warn about deprecation.
  321. """
  322. self._deprecated()
  323. super(GitIgnorePattern, self).__init__(*args, **kw)
  324. @staticmethod
  325. def _deprecated() -> None:
  326. """
  327. Warn about deprecation.
  328. """
  329. warnings.warn((
  330. "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
  331. "('gitwildmatch') instead."
  332. ), DeprecationWarning, stacklevel=3)
  333. @classmethod
  334. def pattern_to_regex(cls, *args, **kw):
  335. """
  336. Warn about deprecation.
  337. """
  338. cls._deprecated()
  339. return super(GitIgnorePattern, cls).pattern_to_regex(*args, **kw)
# Register the deprecated `GitIgnorePattern` class under the name "gitignore"
# for backward compatibility with v0.4.
util.register_pattern('gitignore', GitIgnorePattern)