# tar.py
  1. import logging
  2. import tarfile
  3. import fsspec
  4. from fsspec.archive import AbstractArchiveFileSystem
  5. from fsspec.compression import compr
  6. from fsspec.utils import infer_compression
  7. typemap = {b"0": "file", b"5": "directory"}
  8. logger = logging.getLogger("tar")
  9. class TarFileSystem(AbstractArchiveFileSystem):
  10. """Compressed Tar archives as a file-system (read-only)
  11. Supports the following formats:
  12. tar.gz, tar.bz2, tar.xz
  13. """
  14. root_marker = ""
  15. protocol = "tar"
  16. cachable = False
  17. def __init__(
  18. self,
  19. fo="",
  20. index_store=None,
  21. target_options=None,
  22. target_protocol=None,
  23. compression=None,
  24. **kwargs,
  25. ):
  26. super().__init__(**kwargs)
  27. target_options = target_options or {}
  28. if isinstance(fo, str):
  29. self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
  30. fo = self.of.open() # keep the reference
  31. # Try to infer compression.
  32. if compression is None:
  33. name = None
  34. # Try different ways to get hold of the filename. `fo` might either
  35. # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
  36. # `fsspec.AbstractFileSystem` instance.
  37. try:
  38. # Amended io.BufferedReader or similar.
  39. # This uses a "protocol extension" where original filenames are
  40. # propagated to archive-like filesystems in order to let them
  41. # infer the right compression appropriately.
  42. if hasattr(fo, "original"):
  43. name = fo.original
  44. # fsspec.LocalFileOpener
  45. elif hasattr(fo, "path"):
  46. name = fo.path
  47. # io.BufferedReader
  48. elif hasattr(fo, "name"):
  49. name = fo.name
  50. # fsspec.AbstractFileSystem
  51. elif hasattr(fo, "info"):
  52. name = fo.info()["name"]
  53. except Exception as ex:
  54. logger.warning(
  55. f"Unable to determine file name, not inferring compression: {ex}"
  56. )
  57. if name is not None:
  58. compression = infer_compression(name)
  59. logger.info(f"Inferred compression {compression} from file name {name}")
  60. if compression is not None:
  61. # TODO: tarfile already implements compression with modes like "'r:gz'",
  62. # but then would seek to offset in the file work?
  63. fo = compr[compression](fo)
  64. self._fo_ref = fo
  65. self.fo = fo # the whole instance is a context
  66. self.tar = tarfile.TarFile(fileobj=self.fo)
  67. self.dir_cache = None
  68. self.index_store = index_store
  69. self.index = None
  70. self._index()
  71. def _index(self):
  72. # TODO: load and set saved index, if exists
  73. out = {}
  74. for ti in self.tar:
  75. info = ti.get_info()
  76. info["type"] = typemap.get(info["type"], "file")
  77. name = ti.get_info()["name"].rstrip("/")
  78. out[name] = (info, ti.offset_data)
  79. self.index = out
  80. # TODO: save index to self.index_store here, if set
  81. def _get_dirs(self):
  82. if self.dir_cache is not None:
  83. return
  84. # This enables ls to get directories as children as well as files
  85. self.dir_cache = {
  86. dirname: {"name": dirname, "size": 0, "type": "directory"}
  87. for dirname in self._all_dirnames(self.tar.getnames())
  88. }
  89. for member in self.tar.getmembers():
  90. info = member.get_info()
  91. info["name"] = info["name"].rstrip("/")
  92. info["type"] = typemap.get(info["type"], "file")
  93. self.dir_cache[info["name"]] = info
  94. def _open(self, path, mode="rb", **kwargs):
  95. if mode != "rb":
  96. raise ValueError("Read-only filesystem implementation")
  97. details, offset = self.index[path]
  98. if details["type"] != "file":
  99. raise ValueError("Can only handle regular files")
  100. return self.tar.extractfile(path)