# local.py -- fsspec filesystem implementation for local storage

import datetime
import io
import logging
import os
import os.path as osp
import shutil
import stat
import tempfile

from fsspec import AbstractFileSystem
from fsspec.compression import compr
from fsspec.core import get_compression
from fsspec.utils import isfilelike, stringify_path
  13. logger = logging.getLogger("fsspec.local")
  14. class LocalFileSystem(AbstractFileSystem):
  15. """Interface to files on local storage
  16. Parameters
  17. ----------
  18. auto_mkdir: bool
  19. Whether, when opening a file, the directory containing it should
  20. be created (if it doesn't already exist). This is assumed by pyarrow
  21. code.
  22. """
  23. root_marker = "/"
  24. protocol = "file", "local"
  25. local_file = True
  26. def __init__(self, auto_mkdir=False, **kwargs):
  27. super().__init__(**kwargs)
  28. self.auto_mkdir = auto_mkdir
  29. @property
  30. def fsid(self):
  31. return "local"
  32. def mkdir(self, path, create_parents=True, **kwargs):
  33. path = self._strip_protocol(path)
  34. if self.exists(path):
  35. raise FileExistsError(path)
  36. if create_parents:
  37. self.makedirs(path, exist_ok=True)
  38. else:
  39. os.mkdir(path, **kwargs)
  40. def makedirs(self, path, exist_ok=False):
  41. path = self._strip_protocol(path)
  42. os.makedirs(path, exist_ok=exist_ok)
  43. def rmdir(self, path):
  44. path = self._strip_protocol(path)
  45. os.rmdir(path)
  46. def ls(self, path, detail=False, **kwargs):
  47. path = self._strip_protocol(path)
  48. path_info = self.info(path)
  49. infos = []
  50. if path_info["type"] == "directory":
  51. with os.scandir(path) as it:
  52. for f in it:
  53. try:
  54. # Only get the info if requested since it is a bit expensive (the stat call inside)
  55. # The strip_protocol is also used in info() and calls make_path_posix to always return posix paths
  56. info = self.info(f) if detail else self._strip_protocol(f.path)
  57. infos.append(info)
  58. except FileNotFoundError:
  59. pass
  60. else:
  61. infos = [path_info] if detail else [path_info["name"]]
  62. return infos
  63. def info(self, path, **kwargs):
  64. if isinstance(path, os.DirEntry):
  65. # scandir DirEntry
  66. out = path.stat(follow_symlinks=False)
  67. link = path.is_symlink()
  68. if path.is_dir(follow_symlinks=False):
  69. t = "directory"
  70. elif path.is_file(follow_symlinks=False):
  71. t = "file"
  72. else:
  73. t = "other"
  74. size = out.st_size
  75. if link:
  76. try:
  77. out2 = path.stat(follow_symlinks=True)
  78. size = out2.st_size
  79. except OSError:
  80. size = 0
  81. path = self._strip_protocol(path.path)
  82. else:
  83. # str or path-like
  84. path = self._strip_protocol(path)
  85. out = os.stat(path, follow_symlinks=False)
  86. link = stat.S_ISLNK(out.st_mode)
  87. if link:
  88. out = os.stat(path, follow_symlinks=True)
  89. size = out.st_size
  90. if stat.S_ISDIR(out.st_mode):
  91. t = "directory"
  92. elif stat.S_ISREG(out.st_mode):
  93. t = "file"
  94. else:
  95. t = "other"
  96. result = {
  97. "name": path,
  98. "size": size,
  99. "type": t,
  100. "created": out.st_ctime,
  101. "islink": link,
  102. }
  103. for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
  104. result[field] = getattr(out, f"st_{field}")
  105. if link:
  106. result["destination"] = os.readlink(path)
  107. return result
  108. def lexists(self, path, **kwargs):
  109. return osp.lexists(path)
  110. def cp_file(self, path1, path2, **kwargs):
  111. path1 = self._strip_protocol(path1)
  112. path2 = self._strip_protocol(path2)
  113. if self.auto_mkdir:
  114. self.makedirs(self._parent(path2), exist_ok=True)
  115. if self.isfile(path1):
  116. shutil.copyfile(path1, path2)
  117. elif self.isdir(path1):
  118. self.mkdirs(path2, exist_ok=True)
  119. else:
  120. raise FileNotFoundError(path1)
  121. def isfile(self, path):
  122. path = self._strip_protocol(path)
  123. return os.path.isfile(path)
  124. def isdir(self, path):
  125. path = self._strip_protocol(path)
  126. return os.path.isdir(path)
  127. def get_file(self, path1, path2, callback=None, **kwargs):
  128. if isfilelike(path2):
  129. with open(path1, "rb") as f:
  130. shutil.copyfileobj(f, path2)
  131. else:
  132. return self.cp_file(path1, path2, **kwargs)
  133. def put_file(self, path1, path2, callback=None, **kwargs):
  134. return self.cp_file(path1, path2, **kwargs)
  135. def mv(self, path1, path2, **kwargs):
  136. path1 = self._strip_protocol(path1)
  137. path2 = self._strip_protocol(path2)
  138. shutil.move(path1, path2)
  139. def link(self, src, dst, **kwargs):
  140. src = self._strip_protocol(src)
  141. dst = self._strip_protocol(dst)
  142. os.link(src, dst, **kwargs)
  143. def symlink(self, src, dst, **kwargs):
  144. src = self._strip_protocol(src)
  145. dst = self._strip_protocol(dst)
  146. os.symlink(src, dst, **kwargs)
  147. def islink(self, path) -> bool:
  148. return os.path.islink(self._strip_protocol(path))
  149. def rm_file(self, path):
  150. os.remove(self._strip_protocol(path))
  151. def rm(self, path, recursive=False, maxdepth=None):
  152. if not isinstance(path, list):
  153. path = [path]
  154. for p in path:
  155. p = self._strip_protocol(p)
  156. if self.isdir(p):
  157. if not recursive:
  158. raise ValueError("Cannot delete directory, set recursive=True")
  159. if osp.abspath(p) == os.getcwd():
  160. raise ValueError("Cannot delete current working directory")
  161. shutil.rmtree(p)
  162. else:
  163. os.remove(p)
  164. def unstrip_protocol(self, name):
  165. name = self._strip_protocol(name) # normalise for local/win/...
  166. return f"file://{name}"
  167. def _open(self, path, mode="rb", block_size=None, **kwargs):
  168. path = self._strip_protocol(path)
  169. if self.auto_mkdir and "w" in mode:
  170. self.makedirs(self._parent(path), exist_ok=True)
  171. return LocalFileOpener(path, mode, fs=self, **kwargs)
  172. def touch(self, path, truncate=True, **kwargs):
  173. path = self._strip_protocol(path)
  174. if self.auto_mkdir:
  175. self.makedirs(self._parent(path), exist_ok=True)
  176. if self.exists(path):
  177. os.utime(path, None)
  178. else:
  179. open(path, "a").close()
  180. if truncate:
  181. os.truncate(path, 0)
  182. def created(self, path):
  183. info = self.info(path=path)
  184. return datetime.datetime.fromtimestamp(
  185. info["created"], tz=datetime.timezone.utc
  186. )
  187. def modified(self, path):
  188. info = self.info(path=path)
  189. return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
  190. @classmethod
  191. def _parent(cls, path):
  192. path = cls._strip_protocol(path)
  193. if os.sep == "/":
  194. # posix native
  195. return path.rsplit("/", 1)[0] or "/"
  196. else:
  197. # NT
  198. path_ = path.rsplit("/", 1)[0]
  199. if len(path_) <= 3:
  200. if path_[1:2] == ":":
  201. # nt root (something like c:/)
  202. return path_[0] + ":/"
  203. # More cases may be required here
  204. return path_
  205. @classmethod
  206. def _strip_protocol(cls, path):
  207. path = stringify_path(path)
  208. if path.startswith("file://"):
  209. path = path[7:]
  210. elif path.startswith("file:"):
  211. path = path[5:]
  212. elif path.startswith("local://"):
  213. path = path[8:]
  214. elif path.startswith("local:"):
  215. path = path[6:]
  216. path = make_path_posix(path)
  217. if os.sep != "/":
  218. # This code-path is a stripped down version of
  219. # > drive, path = ntpath.splitdrive(path)
  220. if path[1:2] == ":":
  221. # Absolute drive-letter path, e.g. X:\Windows
  222. # Relative path with drive, e.g. X:Windows
  223. drive, path = path[:2], path[2:]
  224. elif path[:2] == "//":
  225. # UNC drives, e.g. \\server\share or \\?\UNC\server\share
  226. # Device drives, e.g. \\.\device or \\?\device
  227. if (index1 := path.find("/", 2)) == -1 or (
  228. index2 := path.find("/", index1 + 1)
  229. ) == -1:
  230. drive, path = path, ""
  231. else:
  232. drive, path = path[:index2], path[index2:]
  233. else:
  234. # Relative path, e.g. Windows
  235. drive = ""
  236. path = path.rstrip("/") or cls.root_marker
  237. return drive + path
  238. else:
  239. return path.rstrip("/") or cls.root_marker
  240. def _isfilestore(self):
  241. # Inheriting from DaskFileSystem makes this False (S3, etc. were)
  242. # the original motivation. But we are a posix-like file system.
  243. # See https://github.com/dask/dask/issues/5526
  244. return True
  245. def chmod(self, path, mode):
  246. path = stringify_path(path)
  247. return os.chmod(path, mode)
  248. def make_path_posix(path):
  249. """Make path generic and absolute for current OS"""
  250. if not isinstance(path, str):
  251. if isinstance(path, (list, set, tuple)):
  252. return type(path)(make_path_posix(p) for p in path)
  253. else:
  254. path = stringify_path(path)
  255. if not isinstance(path, str):
  256. raise TypeError(f"could not convert {path!r} to string")
  257. if os.sep == "/":
  258. # Native posix
  259. if path.startswith("/"):
  260. # most common fast case for posix
  261. return path
  262. elif path.startswith("~"):
  263. return osp.expanduser(path)
  264. elif path.startswith("./"):
  265. path = path[2:]
  266. elif path == ".":
  267. path = ""
  268. return f"{os.getcwd()}/{path}"
  269. else:
  270. # NT handling
  271. if path[0:1] == "/" and path[2:3] == ":":
  272. # path is like "/c:/local/path"
  273. path = path[1:]
  274. if path[1:2] == ":":
  275. # windows full path like "C:\\local\\path"
  276. if len(path) <= 3:
  277. # nt root (something like c:/)
  278. return path[0] + ":/"
  279. path = path.replace("\\", "/")
  280. return path
  281. elif path[0:1] == "~":
  282. return make_path_posix(osp.expanduser(path))
  283. elif path.startswith(("\\\\", "//")):
  284. # windows UNC/DFS-style paths
  285. return "//" + path[2:].replace("\\", "/")
  286. elif path.startswith(("\\", "/")):
  287. # windows relative path with root
  288. path = path.replace("\\", "/")
  289. return f"{osp.splitdrive(os.getcwd())[0]}{path}"
  290. else:
  291. path = path.replace("\\", "/")
  292. if path.startswith("./"):
  293. path = path[2:]
  294. elif path == ".":
  295. path = ""
  296. return f"{make_path_posix(os.getcwd())}/{path}"
  297. def trailing_sep(path):
  298. """Return True if the path ends with a path separator.
  299. A forward slash is always considered a path separator, even on Operating
  300. Systems that normally use a backslash.
  301. """
  302. # TODO: if all incoming paths were posix-compliant then separator would
  303. # always be a forward slash, simplifying this function.
  304. # See https://github.com/fsspec/filesystem_spec/pull/1250
  305. return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
  306. class LocalFileOpener(io.IOBase):
  307. def __init__(
  308. self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
  309. ):
  310. logger.debug("open file: %s", path)
  311. self.path = path
  312. self.mode = mode
  313. self.fs = fs
  314. self.f = None
  315. self.autocommit = autocommit
  316. self.compression = get_compression(path, compression)
  317. self.blocksize = io.DEFAULT_BUFFER_SIZE
  318. self._open()
  319. def _open(self):
  320. if self.f is None or self.f.closed:
  321. if self.autocommit or "w" not in self.mode:
  322. self.f = open(self.path, mode=self.mode)
  323. if self.compression:
  324. compress = compr[self.compression]
  325. self.f = compress(self.f, mode=self.mode)
  326. else:
  327. # TODO: check if path is writable?
  328. i, name = tempfile.mkstemp()
  329. os.close(i) # we want normal open and normal buffered file
  330. self.temp = name
  331. self.f = open(name, mode=self.mode)
  332. if "w" not in self.mode:
  333. self.size = self.f.seek(0, 2)
  334. self.f.seek(0)
  335. self.f.size = self.size
  336. def _fetch_range(self, start, end):
  337. # probably only used by cached FS
  338. if "r" not in self.mode:
  339. raise ValueError
  340. self._open()
  341. self.f.seek(start)
  342. return self.f.read(end - start)
  343. def __setstate__(self, state):
  344. self.f = None
  345. loc = state.pop("loc", None)
  346. self.__dict__.update(state)
  347. if "r" in state["mode"]:
  348. self.f = None
  349. self._open()
  350. self.f.seek(loc)
  351. def __getstate__(self):
  352. d = self.__dict__.copy()
  353. d.pop("f")
  354. if "r" in self.mode:
  355. d["loc"] = self.f.tell()
  356. else:
  357. if not self.f.closed:
  358. raise ValueError("Cannot serialise open write-mode local file")
  359. return d
  360. def commit(self):
  361. if self.autocommit:
  362. raise RuntimeError("Can only commit if not already set to autocommit")
  363. shutil.move(self.temp, self.path)
  364. def discard(self):
  365. if self.autocommit:
  366. raise RuntimeError("Cannot discard if set to autocommit")
  367. os.remove(self.temp)
  368. def readable(self) -> bool:
  369. return True
  370. def writable(self) -> bool:
  371. return "r" not in self.mode
  372. def read(self, *args, **kwargs):
  373. return self.f.read(*args, **kwargs)
  374. def write(self, *args, **kwargs):
  375. return self.f.write(*args, **kwargs)
  376. def tell(self, *args, **kwargs):
  377. return self.f.tell(*args, **kwargs)
  378. def seek(self, *args, **kwargs):
  379. return self.f.seek(*args, **kwargs)
  380. def seekable(self, *args, **kwargs):
  381. return self.f.seekable(*args, **kwargs)
  382. def readline(self, *args, **kwargs):
  383. return self.f.readline(*args, **kwargs)
  384. def readlines(self, *args, **kwargs):
  385. return self.f.readlines(*args, **kwargs)
  386. def close(self):
  387. return self.f.close()
  388. def truncate(self, size=None) -> int:
  389. return self.f.truncate(size)
  390. @property
  391. def closed(self):
  392. return self.f.closed
  393. def fileno(self):
  394. return self.raw.fileno()
  395. def flush(self) -> None:
  396. self.f.flush()
  397. def __iter__(self):
  398. return self.f.__iter__()
  399. def __getattr__(self, item):
  400. return getattr(self.f, item)
  401. def __enter__(self):
  402. self._incontext = True
  403. return self
  404. def __exit__(self, exc_type, exc_value, traceback):
  405. self._incontext = False
  406. self.f.__exit__(exc_type, exc_value, traceback)