smb.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. """
  2. This module contains SMBFileSystem class responsible for handling access to
  3. Windows Samba network shares by using package smbprotocol
  4. """
  5. import datetime
  6. import re
  7. import uuid
  8. from stat import S_ISDIR, S_ISLNK
  9. import smbclient
  10. import smbprotocol.exceptions
  11. from .. import AbstractFileSystem
  12. from ..utils import infer_storage_options
  13. # ! pylint: disable=bad-continuation
  14. class SMBFileSystem(AbstractFileSystem):
  15. """Allow reading and writing to Windows and Samba network shares.
  16. When using `fsspec.open()` for getting a file-like object the URI
  17. should be specified as this format:
  18. ``smb://workgroup;user:password@server:port/share/folder/file.csv``.
  19. Example::
  20. >>> import fsspec
  21. >>> with fsspec.open(
  22. ... 'smb://myuser:mypassword@myserver.com/' 'share/folder/file.csv'
  23. ... ) as smbfile:
  24. ... df = pd.read_csv(smbfile, sep='|', header=None)
  25. Note that you need to pass in a valid hostname or IP address for the host
  26. component of the URL. Do not use the Windows/NetBIOS machine name for the
  27. host component.
  28. The first component of the path in the URL points to the name of the shared
  29. folder. Subsequent path components will point to the directory/folder/file.
  30. The URL components ``workgroup`` , ``user``, ``password`` and ``port`` may be
  31. optional.
  32. .. note::
  33. For working this source require `smbprotocol`_ to be installed, e.g.::
  34. $ pip install smbprotocol
  35. # or
  36. # pip install smbprotocol[kerberos]
  37. .. _smbprotocol: https://github.com/jborean93/smbprotocol#requirements
  38. Note: if using this with the ``open`` or ``open_files``, with full URLs,
  39. there is no way to tell if a path is relative, so all paths are assumed
  40. to be absolute.
  41. """
  42. protocol = "smb"
  43. # pylint: disable=too-many-arguments
  44. def __init__(
  45. self,
  46. host,
  47. port=None,
  48. username=None,
  49. password=None,
  50. timeout=60,
  51. encrypt=None,
  52. share_access=None,
  53. register_session_retries=4,
  54. register_session_retry_wait=1,
  55. register_session_retry_factor=10,
  56. auto_mkdir=False,
  57. **kwargs,
  58. ):
  59. """
  60. You can use _get_kwargs_from_urls to get some kwargs from
  61. a reasonable SMB url.
  62. Authentication will be anonymous or integrated if username/password are not
  63. given.
  64. Parameters
  65. ----------
  66. host: str
  67. The remote server name/ip to connect to
  68. port: int or None
  69. Port to connect with. Usually 445, sometimes 139.
  70. username: str or None
  71. Username to connect with. Required if Kerberos auth is not being used.
  72. password: str or None
  73. User's password on the server, if using username
  74. timeout: int
  75. Connection timeout in seconds
  76. encrypt: bool
  77. Whether to force encryption or not, once this has been set to True
  78. the session cannot be changed back to False.
  79. share_access: str or None
  80. Specifies the default access applied to file open operations
  81. performed with this file system object.
  82. This affects whether other processes can concurrently open a handle
  83. to the same file.
  84. - None (the default): exclusively locks the file until closed.
  85. - 'r': Allow other handles to be opened with read access.
  86. - 'w': Allow other handles to be opened with write access.
  87. - 'd': Allow other handles to be opened with delete access.
  88. register_session_retries: int
  89. Number of retries to register a session with the server. Retries are not performed
  90. for authentication errors, as they are considered as invalid credentials and not network
  91. issues. If set to negative value, no register attempts will be performed.
  92. register_session_retry_wait: int
  93. Time in seconds to wait between each retry. Number must be non-negative.
  94. register_session_retry_factor: int
  95. Base factor for the wait time between each retry. The wait time
  96. is calculated using exponential function. For factor=1 all wait times
  97. will be equal to `register_session_retry_wait`. For any number of retries,
  98. the last wait time will be equal to `register_session_retry_wait` and for retries>1
  99. the first wait time will be equal to `register_session_retry_wait / factor`.
  100. Number must be equal to or greater than 1. Optimal factor is 10.
  101. auto_mkdir: bool
  102. Whether, when opening a file, the directory containing it should
  103. be created (if it doesn't already exist). This is assumed by pyarrow
  104. and zarr-python code.
  105. """
  106. super().__init__(**kwargs)
  107. self.host = host
  108. self.port = port
  109. self.username = username
  110. self.password = password
  111. self.timeout = timeout
  112. self.encrypt = encrypt
  113. self.temppath = kwargs.pop("temppath", "")
  114. self.share_access = share_access
  115. self.register_session_retries = register_session_retries
  116. if register_session_retry_wait < 0:
  117. raise ValueError(
  118. "register_session_retry_wait must be a non-negative integer"
  119. )
  120. self.register_session_retry_wait = register_session_retry_wait
  121. if register_session_retry_factor < 1:
  122. raise ValueError(
  123. "register_session_retry_factor must be a positive "
  124. "integer equal to or greater than 1"
  125. )
  126. self.register_session_retry_factor = register_session_retry_factor
  127. self.auto_mkdir = auto_mkdir
  128. self._connect()
  129. @property
  130. def _port(self):
  131. return 445 if self.port is None else self.port
  132. def _connect(self):
  133. import time
  134. if self.register_session_retries <= -1:
  135. return
  136. retried_errors = []
  137. wait_time = self.register_session_retry_wait
  138. n_waits = (
  139. self.register_session_retries - 1
  140. ) # -1 = No wait time after the last retry
  141. factor = self.register_session_retry_factor
  142. # Generate wait times for each retry attempt.
  143. # Wait times are calculated using exponential function. For factor=1 all wait times
  144. # will be equal to `wait`. For any number of retries the last wait time will be
  145. # equal to `wait` and for retries>2 the first wait time will be equal to `wait / factor`.
  146. wait_times = iter(
  147. factor ** (n / n_waits - 1) * wait_time for n in range(0, n_waits + 1)
  148. )
  149. for attempt in range(self.register_session_retries + 1):
  150. try:
  151. smbclient.register_session(
  152. self.host,
  153. username=self.username,
  154. password=self.password,
  155. port=self._port,
  156. encrypt=self.encrypt,
  157. connection_timeout=self.timeout,
  158. )
  159. return
  160. except (
  161. smbprotocol.exceptions.SMBAuthenticationError,
  162. smbprotocol.exceptions.LogonFailure,
  163. ):
  164. # These exceptions should not be repeated, as they clearly indicate
  165. # that the credentials are invalid and not a network issue.
  166. raise
  167. except ValueError as exc:
  168. if re.findall(r"\[Errno -\d+]", str(exc)):
  169. # This exception is raised by the smbprotocol.transport:Tcp.connect
  170. # and originates from socket.gaierror (OSError). These exceptions might
  171. # be raised due to network instability. We will retry to connect.
  172. retried_errors.append(exc)
  173. else:
  174. # All another ValueError exceptions should be raised, as they are not
  175. # related to network issues.
  176. raise
  177. except Exception as exc:
  178. # Save the exception and retry to connect. This except might be dropped
  179. # in the future, once all exceptions suited for retry are identified.
  180. retried_errors.append(exc)
  181. if attempt < self.register_session_retries:
  182. time.sleep(next(wait_times))
  183. # Raise last exception to inform user about the connection issues.
  184. # Note: Should we use ExceptionGroup to raise all exceptions?
  185. raise retried_errors[-1]
  186. @classmethod
  187. def _strip_protocol(cls, path):
  188. return infer_storage_options(path)["path"]
  189. @staticmethod
  190. def _get_kwargs_from_urls(path):
  191. # smb://workgroup;user:password@host:port/share/folder/file.csv
  192. out = infer_storage_options(path)
  193. out.pop("path", None)
  194. out.pop("protocol", None)
  195. return out
  196. def mkdir(self, path, create_parents=True, **kwargs):
  197. wpath = _as_unc_path(self.host, path)
  198. if create_parents:
  199. smbclient.makedirs(wpath, exist_ok=False, port=self._port, **kwargs)
  200. else:
  201. smbclient.mkdir(wpath, port=self._port, **kwargs)
  202. def makedirs(self, path, exist_ok=False):
  203. if _share_has_path(path):
  204. wpath = _as_unc_path(self.host, path)
  205. smbclient.makedirs(wpath, exist_ok=exist_ok, port=self._port)
  206. def rmdir(self, path):
  207. if _share_has_path(path):
  208. wpath = _as_unc_path(self.host, path)
  209. smbclient.rmdir(wpath, port=self._port)
  210. def info(self, path, **kwargs):
  211. wpath = _as_unc_path(self.host, path)
  212. stats = smbclient.stat(wpath, port=self._port, **kwargs)
  213. if S_ISDIR(stats.st_mode):
  214. stype = "directory"
  215. elif S_ISLNK(stats.st_mode):
  216. stype = "link"
  217. else:
  218. stype = "file"
  219. res = {
  220. "name": path + "/" if stype == "directory" else path,
  221. "size": stats.st_size,
  222. "type": stype,
  223. "uid": stats.st_uid,
  224. "gid": stats.st_gid,
  225. "time": stats.st_atime,
  226. "mtime": stats.st_mtime,
  227. }
  228. return res
  229. def created(self, path):
  230. """Return the created timestamp of a file as a datetime.datetime"""
  231. wpath = _as_unc_path(self.host, path)
  232. stats = smbclient.stat(wpath, port=self._port)
  233. return datetime.datetime.fromtimestamp(stats.st_ctime, tz=datetime.timezone.utc)
  234. def modified(self, path):
  235. """Return the modified timestamp of a file as a datetime.datetime"""
  236. wpath = _as_unc_path(self.host, path)
  237. stats = smbclient.stat(wpath, port=self._port)
  238. return datetime.datetime.fromtimestamp(stats.st_mtime, tz=datetime.timezone.utc)
  239. def ls(self, path, detail=True, **kwargs):
  240. unc = _as_unc_path(self.host, path)
  241. listed = smbclient.listdir(unc, port=self._port, **kwargs)
  242. dirs = ["/".join([path.rstrip("/"), p]) for p in listed]
  243. if detail:
  244. dirs = [self.info(d) for d in dirs]
  245. return dirs
  246. # pylint: disable=too-many-arguments
  247. def _open(
  248. self,
  249. path,
  250. mode="rb",
  251. block_size=-1,
  252. autocommit=True,
  253. cache_options=None,
  254. **kwargs,
  255. ):
  256. """
  257. block_size: int or None
  258. If 0, no buffering, 1, line buffering, >1, buffer that many bytes
  259. Notes
  260. -----
  261. By specifying 'share_access' in 'kwargs' it is possible to override the
  262. default shared access setting applied in the constructor of this object.
  263. """
  264. if self.auto_mkdir and "w" in mode:
  265. self.makedirs(self._parent(path), exist_ok=True)
  266. bls = block_size if block_size is not None and block_size >= 0 else -1
  267. wpath = _as_unc_path(self.host, path)
  268. share_access = kwargs.pop("share_access", self.share_access)
  269. if "w" in mode and autocommit is False:
  270. temp = _as_temp_path(self.host, path, self.temppath)
  271. return SMBFileOpener(
  272. wpath, temp, mode, port=self._port, block_size=bls, **kwargs
  273. )
  274. return smbclient.open_file(
  275. wpath,
  276. mode,
  277. buffering=bls,
  278. share_access=share_access,
  279. port=self._port,
  280. **kwargs,
  281. )
  282. def copy(self, path1, path2, **kwargs):
  283. """Copy within two locations in the same filesystem"""
  284. wpath1 = _as_unc_path(self.host, path1)
  285. wpath2 = _as_unc_path(self.host, path2)
  286. if self.auto_mkdir:
  287. self.makedirs(self._parent(path2), exist_ok=True)
  288. smbclient.copyfile(wpath1, wpath2, port=self._port, **kwargs)
  289. def _rm(self, path):
  290. if _share_has_path(path):
  291. wpath = _as_unc_path(self.host, path)
  292. stats = smbclient.stat(wpath, port=self._port)
  293. if S_ISDIR(stats.st_mode):
  294. smbclient.rmdir(wpath, port=self._port)
  295. else:
  296. smbclient.remove(wpath, port=self._port)
  297. def mv(self, path1, path2, recursive=None, maxdepth=None, **kwargs):
  298. wpath1 = _as_unc_path(self.host, path1)
  299. wpath2 = _as_unc_path(self.host, path2)
  300. smbclient.rename(wpath1, wpath2, port=self._port, **kwargs)
  301. def _as_unc_path(host, path):
  302. rpath = path.replace("/", "\\")
  303. unc = f"\\\\{host}{rpath}"
  304. return unc
  305. def _as_temp_path(host, path, temppath):
  306. share = path.split("/")[1]
  307. temp_file = f"/{share}{temppath}/{uuid.uuid4()}"
  308. unc = _as_unc_path(host, temp_file)
  309. return unc
  310. def _share_has_path(path):
  311. parts = path.count("/")
  312. if path.endswith("/"):
  313. return parts > 2
  314. return parts > 1
  315. class SMBFileOpener:
  316. """writes to remote temporary file, move on commit"""
  317. def __init__(self, path, temp, mode, port=445, block_size=-1, **kwargs):
  318. self.path = path
  319. self.temp = temp
  320. self.mode = mode
  321. self.block_size = block_size
  322. self.kwargs = kwargs
  323. self.smbfile = None
  324. self._incontext = False
  325. self.port = port
  326. self._open()
  327. def _open(self):
  328. if self.smbfile is None or self.smbfile.closed:
  329. self.smbfile = smbclient.open_file(
  330. self.temp,
  331. self.mode,
  332. port=self.port,
  333. buffering=self.block_size,
  334. **self.kwargs,
  335. )
  336. def commit(self):
  337. """Move temp file to definitive on success."""
  338. # TODO: use transaction support in SMB protocol
  339. smbclient.replace(self.temp, self.path, port=self.port)
  340. def discard(self):
  341. """Remove the temp file on failure."""
  342. smbclient.remove(self.temp, port=self.port)
  343. def __fspath__(self):
  344. return self.path
  345. def __iter__(self):
  346. return self.smbfile.__iter__()
  347. def __getattr__(self, item):
  348. return getattr(self.smbfile, item)
  349. def __enter__(self):
  350. self._incontext = True
  351. return self.smbfile.__enter__()
  352. def __exit__(self, exc_type, exc_value, traceback):
  353. self._incontext = False
  354. self.smbfile.__exit__(exc_type, exc_value, traceback)