240 lines
7.8 KiB
Python
240 lines
7.8 KiB
Python
import requests
|
|
|
|
import fsspec
|
|
|
|
from ..spec import AbstractFileSystem
|
|
from ..utils import infer_storage_options
|
|
from .memory import MemoryFile
|
|
|
|
# TODO: add GIST backend, would be very similar
|
|
|
|
|
|
class GithubFileSystem(AbstractFileSystem):
    """Interface to files in github

    An instance of this class provides the files residing within a remote github
    repository. You may specify a point in the repos history, by SHA, branch
    or tag (default is the repository's default branch, which is looked up
    through the github API when no ``sha`` is given).

    Given that code files tend to be small, and that github does not support
    retrieving partial content, we always fetch whole files.

    When using fsspec.open, allows URIs of the form:

    - "github://path/file", in which case you must specify org, repo and
      may specify sha in the extra args
    - 'github://org:repo@/precip/catalog.yml', where the org and repo are
      part of the URI
    - 'github://org:repo@sha/precip/catalog.yml', where the sha is also included

    ``sha`` can be the full or abbreviated hex of the commit you want to fetch
    from, or a branch or tag name (so long as it doesn't contain special characters
    like "/", "?", which would have to be HTTP-encoded).

    For authorised access, you must provide username and token, which can be made
    at https://github.com/settings/tokens
    """

    # API endpoint listing one git tree object (a single directory level)
    url = "https://api.github.com/repos/{org}/{repo}/git/trees/{sha}"
    # endpoint serving raw file content at a given ref
    rurl = "https://raw.githubusercontent.com/{org}/{repo}/{sha}/{path}"
    protocol = "github"
    timeout = (60, 60)  # connect, read timeouts

    def __init__(
        self, org, repo, sha=None, username=None, token=None, timeout=None, **kwargs
    ):
        """Connect to one repository at one point in its history

        Parameters
        ----------
        org: str
            Github organisation or user owning the repository
        repo: str
            Repository name
        sha: str (optional)
            Commit SHA, branch or tag to use as the root; if not given, the
            repository's ``default_branch`` is requested from the API
        username, token: str (optional)
            Credentials for authorised access; both or neither must be given
        timeout: tuple or number (optional)
            Overrides the class-level ``timeout`` passed to ``requests``
        **kwargs:
            Passed to the parent class constructor

        Raises
        ------
        ValueError
            If only one of ``username`` and ``token`` is provided
        """
        super().__init__(**kwargs)
        self.org = org
        self.repo = repo
        if (username is None) ^ (token is None):
            raise ValueError("Auth required both username and token")
        self.username = username
        self.token = token
        if timeout is not None:
            self.timeout = timeout
        if sha is None:
            # look up default branch (not necessarily "master")
            u = "https://api.github.com/repos/{org}/{repo}"
            r = requests.get(
                u.format(org=org, repo=repo), timeout=self.timeout, **self.kw
            )
            r.raise_for_status()
            sha = r.json()["default_branch"]

        self.root = sha
        # list the root immediately so a bad org/repo/sha fails at construction
        self.ls("")

    @property
    def kw(self):
        """Extra keyword arguments for ``requests`` calls (auth, if configured)"""
        if self.username:
            return {"auth": (self.username, self.token)}
        return {}

    @classmethod
    def repos(cls, org_or_user, is_org=True):
        """List repo names for given org or user

        This may become the top level of the FS

        A single unauthenticated request is made; pagination links in the
        response are not followed.

        Parameters
        ----------
        org_or_user: str
            Name of the github org or user to query
        is_org: bool (default True)
            Whether the name is an organisation (True) or user (False)

        Returns
        -------
        List of string
        """
        kind = "orgs" if is_org else "users"
        r = requests.get(
            f"https://api.github.com/{kind}/{org_or_user}/repos",
            timeout=cls.timeout,
        )
        r.raise_for_status()
        return [repo["name"] for repo in r.json()]

    def _list_ref_names(self, kind):
        """Return the ``name`` of each entry the API lists under ``/<kind>``"""
        r = requests.get(
            f"https://api.github.com/repos/{self.org}/{self.repo}/{kind}",
            timeout=self.timeout,
            **self.kw,
        )
        r.raise_for_status()
        return [t["name"] for t in r.json()]

    @property
    def tags(self):
        """Names of tags in the repo"""
        return self._list_ref_names("tags")

    @property
    def branches(self):
        """Names of branches in the repo"""
        return self._list_ref_names("branches")

    @property
    def refs(self):
        """Named references, tags and branches"""
        return {"tags": self.tags, "branches": self.branches}

    def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
        """List files at given path

        Parameters
        ----------
        path: str
            Location to list, relative to repo root
        detail: bool
            If True, returns list of dicts, one per file; if False, returns
            list of full filenames only
        sha: str (optional)
            List at the given point in the repo history, branch or tag name or commit
            SHA
        _sha: str (optional)
            List this specific tree object (used internally to descend into trees)

        Returns
        -------
        List of dicts (``detail=True``) with keys name, mode, type, size and
        sha, or sorted list of names. If ``path`` is a file, a one-element list
        describing that file.

        Raises
        ------
        FileNotFoundError
            If the path does not exist, or passes through a file as if it
            were a directory
        """
        path = self._strip_protocol(path)
        if path == "":
            _sha = sha or self.root
        if _sha is None:
            # Walk down from the root one component at a time, since the trees
            # API is addressed by tree SHA rather than by path.
            parts = path.rstrip("/").split("/")
            so_far = ""
            _sha = sha or self.root
            last = len(parts) - 1
            for i, part in enumerate(parts):
                out = self.ls(so_far, True, sha=sha, _sha=_sha)
                so_far += "/" + part if so_far else part
                out = [o for o in out if o["name"] == so_far]
                if not out:
                    raise FileNotFoundError(path)
                out = out[0]
                if out["type"] == "file":
                    if i != last:
                        # a file cannot contain the remaining components
                        raise FileNotFoundError(path)
                    if detail:
                        return [out]
                    else:
                        # always a list, matching every other return path
                        return [out["name"]]
                _sha = out["sha"]
        if path not in self.dircache or sha not in [self.root, None]:
            r = requests.get(
                self.url.format(org=self.org, repo=self.repo, sha=_sha),
                timeout=self.timeout,
                **self.kw,
            )
            if r.status_code == 404:
                raise FileNotFoundError(path)
            r.raise_for_status()
            types = {"blob": "file", "tree": "directory"}
            out = [
                {
                    "name": path + "/" + f["path"] if path else f["path"],
                    "mode": f["mode"],
                    "type": types[f["type"]],
                    "size": f.get("size", 0),
                    "sha": f["sha"],
                }
                for f in r.json()["tree"]
                if f["type"] in types
            ]
            # only listings at the instance's own root ref are cached; listings
            # for an explicitly different sha are always fetched fresh
            if sha in [self.root, None]:
                self.dircache[path] = out
        else:
            out = self.dircache[path]
        if detail:
            return out
        else:
            return sorted([f["name"] for f in out])

    def invalidate_cache(self, path=None):
        """Drop all cached directory listings (``path`` is ignored)"""
        self.dircache.clear()

    @classmethod
    def _strip_protocol(cls, path):
        """Reduce a URL or path to the path within the repository

        If the URL carries an ``org:repo@`` part (reported as "username" by
        ``infer_storage_options``), the path component is returned without
        leading slashes; otherwise the parent class implementation is used.
        """
        opts = infer_storage_options(path)
        if "username" not in opts:
            return super()._strip_protocol(path)
        return opts["path"].lstrip("/")

    @staticmethod
    def _get_kwargs_from_urls(path):
        """Extract constructor arguments from a ``github://org:repo@sha/...`` URL

        Returns an empty dict when the URL has no ``org:repo@`` part; otherwise
        a dict with "org", "repo" and, if the host position is non-empty, "sha".
        """
        opts = infer_storage_options(path)
        if "username" not in opts:
            return {}
        out = {"org": opts["username"], "repo": opts["password"]}
        if opts["host"]:
            out["sha"] = opts["host"]
        return out

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        sha=None,
        **kwargs,
    ):
        """Fetch a whole file and return it as an in-memory file object

        Parameters
        ----------
        path: str
            File location relative to the repo root
        mode: str
            Only "rb" is supported
        sha: str (optional)
            Ref to read from instead of the instance's root
        block_size, autocommit, cache_options, **kwargs:
            Accepted for interface compatibility and ignored

        Raises
        ------
        NotImplementedError
            For any mode other than "rb"
        FileNotFoundError
            If the raw content endpoint answers 404
        """
        if mode != "rb":
            raise NotImplementedError
        url = self.rurl.format(
            org=self.org, repo=self.repo, path=path, sha=sha or self.root
        )
        r = requests.get(url, timeout=self.timeout, **self.kw)
        if r.status_code == 404:
            raise FileNotFoundError(path)
        r.raise_for_status()
        return MemoryFile(None, None, r.content)

    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """Fetch the contents of one or more files at the instance's root ref

        The downloads are delegated to the "http" filesystem using the raw
        content URLs.

        Parameters
        ----------
        path: str or list of str
            Path(s) to fetch; expanded with ``expand_path``
        recursive: bool
            Passed to ``expand_path``
        on_error: "raise", "omit" or "return"
            What to do with a path whose download failed: raise the exception,
            leave the path out of the result, or store the exception as the
            value for that path

        Returns
        -------
        dict mapping each expanded path to its bytes (or to an exception
        instance when ``on_error="return"``)
        """
        paths = self.expand_path(path, recursive=recursive)
        # expand_path yields plain path strings
        urls = {
            p: self.rurl.format(org=self.org, repo=self.repo, path=p, sha=self.root)
            for p in paths
        }
        fs = fsspec.filesystem("http")
        # collect failures as values, then apply the caller's policy below
        data = fs.cat(list(urls.values()), on_error="return")
        out = {}
        for p, u in urls.items():
            value = data[u]
            if isinstance(value, Exception):
                if on_error == "raise":
                    raise value
                if on_error == "omit":
                    continue
            out[p] = value
        return out
|