# Source listing: glob.py (3.2 KB)

import os
import re
  3. _default_seps = os.sep + str(os.altsep) * bool(os.altsep)
  4. class Translator:
  5. """
  6. >>> Translator('xyz')
  7. Traceback (most recent call last):
  8. ...
  9. AssertionError: Invalid separators
  10. >>> Translator('')
  11. Traceback (most recent call last):
  12. ...
  13. AssertionError: Invalid separators
  14. """
  15. seps: str
  16. def __init__(self, seps: str = _default_seps):
  17. assert seps and set(seps) <= set(_default_seps), "Invalid separators"
  18. self.seps = seps
  19. def translate(self, pattern):
  20. """
  21. Given a glob pattern, produce a regex that matches it.
  22. """
  23. return self.extend(self.match_dirs(self.translate_core(pattern)))
  24. def extend(self, pattern):
  25. r"""
  26. Extend regex for pattern-wide concerns.
  27. Apply '(?s:)' to create a non-matching group that
  28. matches newlines (valid on Unix).
  29. Append '\Z' to imply fullmatch even when match is used.
  30. """
  31. return rf'(?s:{pattern})\Z'
  32. def match_dirs(self, pattern):
  33. """
  34. Ensure that zipfile.Path directory names are matched.
  35. zipfile.Path directory names always end in a slash.
  36. """
  37. return rf'{pattern}[/]?'
  38. def translate_core(self, pattern):
  39. r"""
  40. Given a glob pattern, produce a regex that matches it.
  41. >>> t = Translator()
  42. >>> t.translate_core('*.txt').replace('\\\\', '')
  43. '[^/]*\\.txt'
  44. >>> t.translate_core('a?txt')
  45. 'a[^/]txt'
  46. >>> t.translate_core('**/*').replace('\\\\', '')
  47. '.*/[^/][^/]*'
  48. """
  49. self.restrict_rglob(pattern)
  50. return ''.join(map(self.replace, separate(self.star_not_empty(pattern))))
  51. def replace(self, match):
  52. """
  53. Perform the replacements for a match from :func:`separate`.
  54. """
  55. return match.group('set') or (
  56. re.escape(match.group(0))
  57. .replace('\\*\\*', r'.*')
  58. .replace('\\*', rf'[^{re.escape(self.seps)}]*')
  59. .replace('\\?', r'[^/]')
  60. )
  61. def restrict_rglob(self, pattern):
  62. """
  63. Raise ValueError if ** appears in anything but a full path segment.
  64. >>> Translator().translate('**foo')
  65. Traceback (most recent call last):
  66. ...
  67. ValueError: ** must appear alone in a path segment
  68. """
  69. seps_pattern = rf'[{re.escape(self.seps)}]+'
  70. segments = re.split(seps_pattern, pattern)
  71. if any('**' in segment and segment != '**' for segment in segments):
  72. raise ValueError("** must appear alone in a path segment")
  73. def star_not_empty(self, pattern):
  74. """
  75. Ensure that * will not match an empty segment.
  76. """
  77. def handle_segment(match):
  78. segment = match.group(0)
  79. return '?*' if segment == '*' else segment
  80. not_seps_pattern = rf'[^{re.escape(self.seps)}]+'
  81. return re.sub(not_seps_pattern, handle_segment, pattern)
  82. def separate(pattern):
  83. """
  84. Separate out character sets to avoid translating their contents.
  85. >>> [m.group(0) for m in separate('*.txt')]
  86. ['*.txt']
  87. >>> [m.group(0) for m in separate('a[?]txt')]
  88. ['a', '[?]', 'txt']
  89. """
  90. return re.finditer(r'([^\[]+)|(?P<set>[\[].*?[\]])|([\[][^\]]*$)', pattern)