import base64
import urllib.parse

import requests
import requests.exceptions
from requests.adapters import HTTPAdapter, Retry

from fsspec import AbstractFileSystem
from fsspec.spec import AbstractBufferedFile

class DatabricksException(Exception):
    """
    Helper class for exceptions raised in this module.
    """

    def __init__(self, error_code, message):
        """Create a new DatabricksException."""
        super().__init__(message)

        self.error_code = error_code
        self.message = message

class DatabricksFileSystem(AbstractFileSystem):
    """
    Get access to the Databricks filesystem implementation over HTTP.
    Can be used inside and outside of a Databricks cluster.
    """

    def __init__(self, instance, token, **kwargs):
        """
        Create a new DatabricksFileSystem.

        Parameters
        ----------
        instance: str
            The instance URL of the Databricks cluster.
            For example, for an Azure Databricks cluster this has the form
            adb-<some-number>.<two digits>.azuredatabricks.net.
        token: str
            Your personal access token. Find out more here:
            https://docs.databricks.com/dev-tools/api/latest/authentication.html
        """
        self.instance = instance
        self.token = token
        self.session = requests.Session()
        self.retries = Retry(
            total=10,
            backoff_factor=0.05,
            status_forcelist=[408, 429, 500, 502, 503, 504],
        )

        self.session.mount("https://", HTTPAdapter(max_retries=self.retries))
        self.session.headers.update({"Authorization": f"Bearer {self.token}"})

        super().__init__(**kwargs)
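    # A minimal usage sketch; the instance URL and token below are placeholders,
    # not real credentials:
    #
    #   fs = DatabricksFileSystem(
    #       instance="adb-1234567890123456.7.azuredatabricks.net",
    #       token="<personal-access-token>",
    #   )
    #   fs.ls("/FileStore", detail=False)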
    def ls(self, path, detail=True, **kwargs):
        """
        List the contents of the given path.

        Parameters
        ----------
        path: str
            Absolute path to list
        detail: bool
            Return not only the list of filenames,
            but also additional information on file sizes
            and types.
        """
        out = self._ls_from_cache(path)
        if not out:
            try:
                r = self._send_to_api(
                    method="get", endpoint="list", json={"path": path}
                )
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    raise FileNotFoundError(e.message) from e

                raise
            files = r["files"]
            out = [
                {
                    "name": o["path"],
                    "type": "directory" if o["is_dir"] else "file",
                    "size": o["file_size"],
                }
                for o in files
            ]
            self.dircache[path] = out

        if detail:
            return out
        return [o["name"] for o in out]
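    # For illustration, each entry returned by ls(detail=True) is a dict of the
    # form {"name": "/FileStore/example.csv", "type": "file", "size": 1234}
    # (the path and size here are made up).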
    def makedirs(self, path, exist_ok=True):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        exist_ok: bool
            If false, check whether the folder already exists before creating it
            (and raise a FileExistsError if it does).
        """
        if not exist_ok:
            try:
                # If the following succeeds, the path is already present
                self._send_to_api(
                    method="get", endpoint="get-status", json={"path": path}
                )

                raise FileExistsError(f"Path {path} already exists")
            except DatabricksException as e:
                if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                    pass

        try:
            self._send_to_api(method="post", endpoint="mkdirs", json={"path": path})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(path))
    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create a given absolute path and all of its parents.

        Parameters
        ----------
        path: str
            Absolute path to create
        create_parents: bool
            Whether to create all parents or not.
            "False" is not implemented so far.
        """
        if not create_parents:
            raise NotImplementedError

        self.mkdirs(path, **kwargs)
    def rm(self, path, recursive=False, **kwargs):
        """
        Remove the file or folder at the given absolute path.

        Parameters
        ----------
        path: str
            Absolute path of the file or folder to remove
        recursive: bool
            Recursively delete all files in a folder.
        """
        try:
            self._send_to_api(
                method="post",
                endpoint="delete",
                json={"path": path, "recursive": recursive},
            )
        except DatabricksException as e:
            # This is not really an exception, it just means
            # not everything was deleted so far
            if e.error_code == "PARTIAL_DELETE":
                # Retry the deletion; the recursive call also refreshes the cache.
                self.rm(path=path, recursive=recursive)
                return
            elif e.error_code == "IO_ERROR":
                # Using the same exception as the os module would use here
                raise OSError(e.message) from e

            raise
        self.invalidate_cache(self._parent(path))
    def mv(
        self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs
    ):
        """
        Move a source to a destination path.

        A note from the original [databricks API manual]
        (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move):

        When moving a large number of files, the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented so far.
        maxdepth:
            Not implemented so far.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))
    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Override the base class method to make sure a DatabricksFile is created.

        All arguments are copied from the base method.
        Only the default block size is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)
    def _send_to_api(self, method, endpoint, json):
        """
        Send the given json to the DBFS API
        using a get or post request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which HTTP method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API will return a json, also in case of an exception.
        # We want to preserve this information as well as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract json error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e from None

            raise DatabricksException(**exception_json) from e

        return r.json()
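    # As an example (the path is only illustrative), a call like
    #   self._send_to_api(method="get", endpoint="list", json={"path": "/FileStore"})
    # issues a GET request to https://<instance>/api/2.0/dbfs/list; a non-2xx
    # response is turned into a DatabricksException carrying the API's
    # error_code and message.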
    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        with every write during this transaction.
        The handle is active for 10 minutes - after that, a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exists at this location, either overwrite
            it or raise an exception.
        """
        try:
            r = self._send_to_api(
                method="post",
                endpoint="create",
                json={"path": path, "overwrite": overwrite},
            )
            return r["handle"]
        except DatabricksException as e:
            if e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message) from e

            raise
    def _close_handle(self, handle):
        """
        Close a handle, which was opened by :func:`_create_handle`.

        Parameters
        ----------
        handle: str
            Which handle to close.
        """
        try:
            self._send_to_api(method="post", endpoint="close", json={"handle": handle})
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e

            raise
    def _add_data(self, handle, data):
        """
        Upload data to an already opened file handle
        (opened by :func:`_create_handle`).
        The maximum allowed data size is 1MB after
        conversion to base64.
        Remember to close the handle when you are finished.

        Parameters
        ----------
        handle: str
            Which handle to upload data to.
        data: bytes
            Block of data to add to the handle.
        """
        data = base64.b64encode(data).decode()
        try:
            self._send_to_api(
                method="post",
                endpoint="add-block",
                json={"handle": handle, "data": data},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED":
                raise ValueError(e.message) from e

            raise
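    # Taken together, a write is a three-step protocol (sketch; the path and
    # payload are hypothetical):
    #
    #   handle = self._create_handle("/FileStore/out.bin", overwrite=True)
    #   self._add_data(handle, b"block of at most 1MB")
    #   self._close_handle(handle)
    #
    # DatabricksFile below drives this sequence via _initiate_upload and
    # _upload_chunk.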
    def _get_data(self, path, start, end):
        """
        Download a block of data in bytes from the given absolute path,
        covering the range [start, end).
        The maximum number of allowed bytes to read is 1MB.

        Parameters
        ----------
        path: str
            Absolute path to download data from
        start: int
            Start position of the block
        end: int
            End position of the block
        """
        try:
            r = self._send_to_api(
                method="get",
                endpoint="read",
                json={"path": path, "offset": start, "length": end - start},
            )
            return base64.b64decode(r["data"])
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message) from e
            elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]:
                raise ValueError(e.message) from e

            raise
    def invalidate_cache(self, path=None):
        if path is None:
            self.dircache.clear()
        else:
            self.dircache.pop(path, None)
        super().invalidate_cache(path)

class DatabricksFile(AbstractBufferedFile):
    """
    Helper class for files referenced in the DatabricksFileSystem.
    """

    DEFAULT_BLOCK_SIZE = 1 * 2**20  # only allowed block size

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        **kwargs,
    ):
        """
        Create a new instance of the DatabricksFile.

        The blocksize needs to be the default one.
        """
        if block_size is None or block_size == "default":
            block_size = self.DEFAULT_BLOCK_SIZE

        assert block_size == self.DEFAULT_BLOCK_SIZE, (
            f"Only the default block size is allowed, not {block_size}"
        )

        super().__init__(
            fs,
            path,
            mode=mode,
            block_size=block_size,
            autocommit=autocommit,
            cache_type=cache_type,
            cache_options=cache_options or {},
            **kwargs,
        )
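    # Typical use goes through the filesystem rather than this class directly.
    # Given a DatabricksFileSystem instance ``fs`` (the path is a placeholder):
    #
    #   with fs.open("/FileStore/out.bin", "wb") as f:
    #       f.write(b"some bytes")
    #
    # fsspec's AbstractBufferedFile machinery then calls _initiate_upload,
    # _upload_chunk and _fetch_range defined below.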
    def _initiate_upload(self):
        """Internal function to start a file upload"""
        self.handle = self.fs._create_handle(self.path)

    def _upload_chunk(self, final=False):
        """Internal function to add a chunk of data to a started upload"""
        self.buffer.seek(0)
        data = self.buffer.getvalue()

        data_chunks = [
            data[start:end] for start, end in self._to_sized_blocks(len(data))
        ]

        for data_chunk in data_chunks:
            self.fs._add_data(handle=self.handle, data=data_chunk)

        if final:
            self.fs._close_handle(handle=self.handle)
            return True
    def _fetch_range(self, start, end):
        """Internal function to download a block of data"""
        return_buffer = b""
        length = end - start
        for chunk_start, chunk_end in self._to_sized_blocks(length, start):
            return_buffer += self.fs._get_data(
                path=self.path, start=chunk_start, end=chunk_end
            )

        return return_buffer
    def _to_sized_blocks(self, length, start=0):
        """Helper function to split a range of the given length into block-sized chunks"""
        end = start + length
        for data_chunk in range(start, end, self.blocksize):
            data_start = data_chunk
            data_end = min(end, data_chunk + self.blocksize)
            yield data_start, data_end
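    # For instance, with the default blocksize of 2**20 bytes,
    # list(self._to_sized_blocks(2_500_000)) would give
    # [(0, 1048576), (1048576, 2097152), (2097152, 2500000)],
    # keeping every request within the 1MB limits described above.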