123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267 |
- import base64
- import requests
- from ..spec import AbstractFileSystem
- from ..utils import infer_storage_options
- from .memory import MemoryFile
- # TODO: add GIST backend, would be very similar
- class GithubFileSystem(AbstractFileSystem):
- """Interface to files in github
- An instance of this class provides the files residing within a remote github
- repository. You may specify a point in the repos history, by SHA, branch
- or tag (default is current master).
- For files less than 1 MB in size, file content is returned directly in a
- MemoryFile. For larger files, or for files tracked by git-lfs, file content
- is returned as an HTTPFile wrapping the ``download_url`` provided by the
- GitHub API.
- When using fsspec.open, allows URIs of the form:
- - "github://path/file", in which case you must specify org, repo and
- may specify sha in the extra args
- - 'github://org:repo@/precip/catalog.yml', where the org and repo are
- part of the URI
- - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
- ``sha`` can be the full or abbreviated hex of the commit you want to fetch
- from, or a branch or tag name (so long as it doesn't contain special characters
- like "/", "?", which would have to be HTTP-encoded).
- For authorised access, you must provide username and token, which can be made
- at https://github.com/settings/tokens
- """
- url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
- content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
- protocol = "github"
- timeout = (60, 60) # connect, read timeouts
- def __init__(
- self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
- ):
- super().__init__(**kwargs)
- self.org = org
- self.repo = repo
- if (username is None) ^ (token is None):
- raise ValueError("Auth required both username and token")
- self.username = username
- self.token = token
- if timeout is not None:
- self.timeout = timeout
- if sha is None:
- # look up default branch (not necessarily "master")
- u = "https://api.github.com/repos/{org}/{repo}"
- r = requests.get(
- u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
- )
- r.raise_for_status()
- sha = r.json()["default_branch"]
- self.root = sha
- self.ls("")
- try:
- from .http import HTTPFileSystem
- self.http_fs = HTTPFileSystem(**kwargs)
- except ImportError:
- self.http_fs = None
- @property
- def kw(self):
- if self.username:
- return {"auth": (self.username, self.token)}
- return {}
- @classmethod
- def repos(cls, org_or_user, is_org=True):
- """List repo names for given org or user
- This may become the top level of the FS
- Parameters
- ----------
- org_or_user: str
- Name of the github org or user to query
- is_org: bool (default True)
- Whether the name is an organisation (True) or user (False)
- Returns
- -------
- List of string
- """
- r = requests.get(
- f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
- timeout=cls.timeout,
- )
- r.raise_for_status()
- return [repo["name"] for repo in r.json()]
- @property
- def tags(self):
- """Names of tags in the repo"""
- r = requests.get(
- f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
- timeout=self.timeout,
- **self.kw,
- )
- r.raise_for_status()
- return [t["name"] for t in r.json()]
- @property
- def branches(self):
- """Names of branches in the repo"""
- r = requests.get(
- f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
- timeout=self.timeout,
- **self.kw,
- )
- r.raise_for_status()
- return [t["name"] for t in r.json()]
- @property
- def refs(self):
- """Named references, tags and branches"""
- return {"tags": self.tags, "branches": self.branches}
- def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
- """List files at given path
- Parameters
- ----------
- path: str
- Location to list, relative to repo root
- detail: bool
- If True, returns list of dicts, one per file; if False, returns
- list of full filenames only
- sha: str (optional)
- List at the given point in the repo history, branch or tag name or commit
- SHA
- _sha: str (optional)
- List this specific tree object (used internally to descend into trees)
- """
- path = self._strip_protocol(path)
- if path == "":
- _sha = sha or self.root
- if _sha is None:
- parts = path.rstrip("/").split("/")
- so_far = ""
- _sha = sha or self.root
- for part in parts:
- out = self.ls(so_far, True, sha=sha, _sha=_sha)
- so_far += "/" + part if so_far else part
- out = [o for o in out if o["name"] == so_far]
- if not out:
- raise FileNotFoundError(path)
- out = out[0]
- if out["type"] == "file":
- if detail:
- return [out]
- else:
- return path
- _sha = out["sha"]
- if path not in self.dircache or sha not in [self.root, None]:
- r = requests.get(
- self.url.format(org=self.org, repo=self.repo, sha=_sha),
- timeout=self.timeout,
- **self.kw,
- )
- if r.status_code == 404:
- raise FileNotFoundError(path)
- r.raise_for_status()
- types = {"blob": "file", "tree": "directory"}
- out = [
- {
- "name": path + "/" + f["path"] if path else f["path"],
- "mode": f["mode"],
- "type": types[f["type"]],
- "size": f.get("size", 0),
- "sha": f["sha"],
- }
- for f in r.json()["tree"]
- if f["type"] in types
- ]
- if sha in [self.root, None]:
- self.dircache[path] = out
- else:
- out = self.dircache[path]
- if detail:
- return out
- else:
- return sorted([f["name"] for f in out])
- def invalidate_cache(self, path=None):
- self.dircache.clear()
- @classmethod
- def _strip_protocol(cls, path):
- opts = infer_storage_options(path)
- if "username" not in opts:
- return super()._strip_protocol(path)
- return opts["path"].lstrip("/")
- @staticmethod
- def _get_kwargs_from_urls(path):
- opts = infer_storage_options(path)
- if "username" not in opts:
- return {}
- out = {"org": opts["username"], "repo": opts["password"]}
- if opts["host"]:
- out["sha"] = opts["host"]
- return out
- def _open(
- self,
- path,
- mode="rb",
- block_size=None,
- cache_options=None,
- sha=None,
- **kwargs,
- ):
- if mode != "rb":
- raise NotImplementedError
- # construct a url to hit the GitHub API's repo contents API
- url = self.content_url.format(
- org=self.org, repo=self.repo, path=path, sha=sha or self.root
- )
- # make a request to this API, and parse the response as JSON
- r = requests.get(url, timeout=self.timeout, **self.kw)
- if r.status_code == 404:
- raise FileNotFoundError(path)
- r.raise_for_status()
- content_json = r.json()
- # if the response's content key is not empty, try to parse it as base64
- if content_json["content"]:
- content = base64.b64decode(content_json["content"])
- # as long as the content does not start with the string
- # "version https://git-lfs.github.com/"
- # then it is probably not a git-lfs pointer and we can just return
- # the content directly
- if not content.startswith(b"version https://git-lfs.github.com/"):
- return MemoryFile(None, None, content)
- # we land here if the content was not present in the first response
- # (regular file over 1MB or git-lfs tracked file)
- # in this case, we get let the HTTPFileSystem handle the download
- if self.http_fs is None:
- raise ImportError(
- "Please install fsspec[http] to access github files >1 MB "
- "or git-lfs tracked files."
- )
- return self.http_fs.open(
- content_json["download_url"],
- mode=mode,
- block_size=block_size,
- cache_options=cache_options,
- **kwargs,
- )
|