github.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
import base64
from urllib.parse import quote

import requests

from ..spec import AbstractFileSystem
from ..utils import infer_storage_options
from .memory import MemoryFile
# TODO: add GIST backend, would be very similar
  7. class GithubFileSystem(AbstractFileSystem):
  8. """Interface to files in github
  9. An instance of this class provides the files residing within a remote github
  10. repository. You may specify a point in the repos history, by SHA, branch
  11. or tag (default is current master).
  12. For files less than 1 MB in size, file content is returned directly in a
  13. MemoryFile. For larger files, or for files tracked by git-lfs, file content
  14. is returned as an HTTPFile wrapping the ``download_url`` provided by the
  15. GitHub API.
  16. When using fsspec.open, allows URIs of the form:
  17. - "github://path/file", in which case you must specify org, repo and
  18. may specify sha in the extra args
  19. - 'github://org:repo@/precip/catalog.yml', where the org and repo are
  20. part of the URI
  21. - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included
  22. ``sha`` can be the full or abbreviated hex of the commit you want to fetch
  23. from, or a branch or tag name (so long as it doesn't contain special characters
  24. like "/", "?", which would have to be HTTP-encoded).
  25. For authorised access, you must provide username and token, which can be made
  26. at https://github.com/settings/tokens
  27. """
  28. url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
  29. content_url = "https://api.github.com/repos/{org}/{repo}/contents/{path}?ref={sha}"
  30. protocol = "github"
  31. timeout = (60, 60) # connect, read timeouts
  32. def __init__(
  33. self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
  34. ):
  35. super().__init__(**kwargs)
  36. self.org = org
  37. self.repo = repo
  38. if (username is None) ^ (token is None):
  39. raise ValueError("Auth required both username and token")
  40. self.username = username
  41. self.token = token
  42. if timeout is not None:
  43. self.timeout = timeout
  44. if sha is None:
  45. # look up default branch (not necessarily "master")
  46. u = "https://api.github.com/repos/{org}/{repo}"
  47. r = requests.get(
  48. u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
  49. )
  50. r.raise_for_status()
  51. sha = r.json()["default_branch"]
  52. self.root = sha
  53. self.ls("")
  54. try:
  55. from .http import HTTPFileSystem
  56. self.http_fs = HTTPFileSystem(**kwargs)
  57. except ImportError:
  58. self.http_fs = None
  59. @property
  60. def kw(self):
  61. if self.username:
  62. return {"auth": (self.username, self.token)}
  63. return {}
  64. @classmethod
  65. def repos(cls, org_or_user, is_org=True):
  66. """List repo names for given org or user
  67. This may become the top level of the FS
  68. Parameters
  69. ----------
  70. org_or_user: str
  71. Name of the github org or user to query
  72. is_org: bool (default True)
  73. Whether the name is an organisation (True) or user (False)
  74. Returns
  75. -------
  76. List of string
  77. """
  78. r = requests.get(
  79. f"https://api.github.com/{['users', 'orgs'][is_org]}/{org_or_user}/repos",
  80. timeout=cls.timeout,
  81. )
  82. r.raise_for_status()
  83. return [repo["name"] for repo in r.json()]
  84. @property
  85. def tags(self):
  86. """Names of tags in the repo"""
  87. r = requests.get(
  88. f"https://api.github.com/repos/{self.org}/{self.repo}/tags",
  89. timeout=self.timeout,
  90. **self.kw,
  91. )
  92. r.raise_for_status()
  93. return [t["name"] for t in r.json()]
  94. @property
  95. def branches(self):
  96. """Names of branches in the repo"""
  97. r = requests.get(
  98. f"https://api.github.com/repos/{self.org}/{self.repo}/branches",
  99. timeout=self.timeout,
  100. **self.kw,
  101. )
  102. r.raise_for_status()
  103. return [t["name"] for t in r.json()]
  104. @property
  105. def refs(self):
  106. """Named references, tags and branches"""
  107. return {"tags": self.tags, "branches": self.branches}
  108. def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
  109. """List files at given path
  110. Parameters
  111. ----------
  112. path: str
  113. Location to list, relative to repo root
  114. detail: bool
  115. If True, returns list of dicts, one per file; if False, returns
  116. list of full filenames only
  117. sha: str (optional)
  118. List at the given point in the repo history, branch or tag name or commit
  119. SHA
  120. _sha: str (optional)
  121. List this specific tree object (used internally to descend into trees)
  122. """
  123. path = self._strip_protocol(path)
  124. if path == "":
  125. _sha = sha or self.root
  126. if _sha is None:
  127. parts = path.rstrip("/").split("/")
  128. so_far = ""
  129. _sha = sha or self.root
  130. for part in parts:
  131. out = self.ls(so_far, True, sha=sha, _sha=_sha)
  132. so_far += "/" + part if so_far else part
  133. out = [o for o in out if o["name"] == so_far]
  134. if not out:
  135. raise FileNotFoundError(path)
  136. out = out[0]
  137. if out["type"] == "file":
  138. if detail:
  139. return [out]
  140. else:
  141. return path
  142. _sha = out["sha"]
  143. if path not in self.dircache or sha not in [self.root, None]:
  144. r = requests.get(
  145. self.url.format(org=self.org, repo=self.repo, sha=_sha),
  146. timeout=self.timeout,
  147. **self.kw,
  148. )
  149. if r.status_code == 404:
  150. raise FileNotFoundError(path)
  151. r.raise_for_status()
  152. types = {"blob": "file", "tree": "directory"}
  153. out = [
  154. {
  155. "name": path + "/" + f["path"] if path else f["path"],
  156. "mode": f["mode"],
  157. "type": types[f["type"]],
  158. "size": f.get("size", 0),
  159. "sha": f["sha"],
  160. }
  161. for f in r.json()["tree"]
  162. if f["type"] in types
  163. ]
  164. if sha in [self.root, None]:
  165. self.dircache[path] = out
  166. else:
  167. out = self.dircache[path]
  168. if detail:
  169. return out
  170. else:
  171. return sorted([f["name"] for f in out])
  172. def invalidate_cache(self, path=None):
  173. self.dircache.clear()
  174. @classmethod
  175. def _strip_protocol(cls, path):
  176. opts = infer_storage_options(path)
  177. if "username" not in opts:
  178. return super()._strip_protocol(path)
  179. return opts["path"].lstrip("/")
  180. @staticmethod
  181. def _get_kwargs_from_urls(path):
  182. opts = infer_storage_options(path)
  183. if "username" not in opts:
  184. return {}
  185. out = {"org": opts["username"], "repo": opts["password"]}
  186. if opts["host"]:
  187. out["sha"] = opts["host"]
  188. return out
  189. def _open(
  190. self,
  191. path,
  192. mode="rb",
  193. block_size=None,
  194. cache_options=None,
  195. sha=None,
  196. **kwargs,
  197. ):
  198. if mode != "rb":
  199. raise NotImplementedError
  200. # construct a url to hit the GitHub API's repo contents API
  201. url = self.content_url.format(
  202. org=self.org, repo=self.repo, path=path, sha=sha or self.root
  203. )
  204. # make a request to this API, and parse the response as JSON
  205. r = requests.get(url, timeout=self.timeout, **self.kw)
  206. if r.status_code == 404:
  207. raise FileNotFoundError(path)
  208. r.raise_for_status()
  209. content_json = r.json()
  210. # if the response's content key is not empty, try to parse it as base64
  211. if content_json["content"]:
  212. content = base64.b64decode(content_json["content"])
  213. # as long as the content does not start with the string
  214. # "version https://git-lfs.github.com/"
  215. # then it is probably not a git-lfs pointer and we can just return
  216. # the content directly
  217. if not content.startswith(b"version https://git-lfs.github.com/"):
  218. return MemoryFile(None, None, content)
  219. # we land here if the content was not present in the first response
  220. # (regular file over 1MB or git-lfs tracked file)
  221. # in this case, we get let the HTTPFileSystem handle the download
  222. if self.http_fs is None:
  223. raise ImportError(
  224. "Please install fsspec[http] to access github files >1 MB "
  225. "or git-lfs tracked files."
  226. )
  227. return self.http_fs.open(
  228. content_json["download_url"],
  229. mode=mode,
  230. block_size=block_size,
  231. cache_options=cache_options,
  232. **kwargs,
  233. )