472 lines
15 KiB
Python
472 lines
15 KiB
Python
|
import datetime
|
||
|
import io
|
||
|
import logging
|
||
|
import os
|
||
|
import os.path as osp
|
||
|
import shutil
|
||
|
import stat
|
||
|
import tempfile
|
||
|
|
||
|
from fsspec import AbstractFileSystem
|
||
|
from fsspec.compression import compr
|
||
|
from fsspec.core import get_compression
|
||
|
from fsspec.utils import isfilelike, stringify_path
|
||
|
|
||
|
logger = logging.getLogger("fsspec.local")
|
||
|
|
||
|
|
||
|
class LocalFileSystem(AbstractFileSystem):
|
||
|
"""Interface to files on local storage
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
auto_mkdir: bool
|
||
|
Whether, when opening a file, the directory containing it should
|
||
|
be created (if it doesn't already exist). This is assumed by pyarrow
|
||
|
code.
|
||
|
"""
|
||
|
|
||
|
root_marker = "/"
|
||
|
protocol = "file", "local"
|
||
|
local_file = True
|
||
|
|
||
|
def __init__(self, auto_mkdir=False, **kwargs):
|
||
|
super().__init__(**kwargs)
|
||
|
self.auto_mkdir = auto_mkdir
|
||
|
|
||
|
@property
|
||
|
def fsid(self):
|
||
|
return "local"
|
||
|
|
||
|
def mkdir(self, path, create_parents=True, **kwargs):
|
||
|
path = self._strip_protocol(path)
|
||
|
if self.exists(path):
|
||
|
raise FileExistsError(path)
|
||
|
if create_parents:
|
||
|
self.makedirs(path, exist_ok=True)
|
||
|
else:
|
||
|
os.mkdir(path, **kwargs)
|
||
|
|
||
|
def makedirs(self, path, exist_ok=False):
|
||
|
path = self._strip_protocol(path)
|
||
|
os.makedirs(path, exist_ok=exist_ok)
|
||
|
|
||
|
def rmdir(self, path):
|
||
|
path = self._strip_protocol(path)
|
||
|
os.rmdir(path)
|
||
|
|
||
|
def ls(self, path, detail=False, **kwargs):
|
||
|
path = self._strip_protocol(path)
|
||
|
info = self.info(path)
|
||
|
if info["type"] == "directory":
|
||
|
with os.scandir(path) as it:
|
||
|
infos = [self.info(f) for f in it]
|
||
|
else:
|
||
|
infos = [info]
|
||
|
|
||
|
if not detail:
|
||
|
return [i["name"] for i in infos]
|
||
|
return infos
|
||
|
|
||
|
def info(self, path, **kwargs):
|
||
|
if isinstance(path, os.DirEntry):
|
||
|
# scandir DirEntry
|
||
|
out = path.stat(follow_symlinks=False)
|
||
|
link = path.is_symlink()
|
||
|
if path.is_dir(follow_symlinks=False):
|
||
|
t = "directory"
|
||
|
elif path.is_file(follow_symlinks=False):
|
||
|
t = "file"
|
||
|
else:
|
||
|
t = "other"
|
||
|
|
||
|
size = out.st_size
|
||
|
if link:
|
||
|
try:
|
||
|
out2 = path.stat(follow_symlinks=True)
|
||
|
size = out2.st_size
|
||
|
except OSError:
|
||
|
size = 0
|
||
|
path = self._strip_protocol(path.path)
|
||
|
else:
|
||
|
# str or path-like
|
||
|
path = self._strip_protocol(path)
|
||
|
out = os.stat(path, follow_symlinks=False)
|
||
|
link = stat.S_ISLNK(out.st_mode)
|
||
|
if link:
|
||
|
out = os.stat(path, follow_symlinks=True)
|
||
|
size = out.st_size
|
||
|
if stat.S_ISDIR(out.st_mode):
|
||
|
t = "directory"
|
||
|
elif stat.S_ISREG(out.st_mode):
|
||
|
t = "file"
|
||
|
else:
|
||
|
t = "other"
|
||
|
result = {
|
||
|
"name": path,
|
||
|
"size": size,
|
||
|
"type": t,
|
||
|
"created": out.st_ctime,
|
||
|
"islink": link,
|
||
|
}
|
||
|
for field in ["mode", "uid", "gid", "mtime", "ino", "nlink"]:
|
||
|
result[field] = getattr(out, f"st_{field}")
|
||
|
if link:
|
||
|
result["destination"] = os.readlink(path)
|
||
|
return result
|
||
|
|
||
|
def lexists(self, path, **kwargs):
|
||
|
return osp.lexists(path)
|
||
|
|
||
|
def cp_file(self, path1, path2, **kwargs):
|
||
|
path1 = self._strip_protocol(path1)
|
||
|
path2 = self._strip_protocol(path2)
|
||
|
if self.auto_mkdir:
|
||
|
self.makedirs(self._parent(path2), exist_ok=True)
|
||
|
if self.isfile(path1):
|
||
|
shutil.copyfile(path1, path2)
|
||
|
elif self.isdir(path1):
|
||
|
self.mkdirs(path2, exist_ok=True)
|
||
|
else:
|
||
|
raise FileNotFoundError(path1)
|
||
|
|
||
|
def isfile(self, path):
|
||
|
path = self._strip_protocol(path)
|
||
|
return os.path.isfile(path)
|
||
|
|
||
|
def isdir(self, path):
|
||
|
path = self._strip_protocol(path)
|
||
|
return os.path.isdir(path)
|
||
|
|
||
|
def get_file(self, path1, path2, callback=None, **kwargs):
|
||
|
if isfilelike(path2):
|
||
|
with open(path1, "rb") as f:
|
||
|
shutil.copyfileobj(f, path2)
|
||
|
else:
|
||
|
return self.cp_file(path1, path2, **kwargs)
|
||
|
|
||
|
def put_file(self, path1, path2, callback=None, **kwargs):
|
||
|
return self.cp_file(path1, path2, **kwargs)
|
||
|
|
||
|
def mv(self, path1, path2, **kwargs):
|
||
|
path1 = self._strip_protocol(path1)
|
||
|
path2 = self._strip_protocol(path2)
|
||
|
shutil.move(path1, path2)
|
||
|
|
||
|
def link(self, src, dst, **kwargs):
|
||
|
src = self._strip_protocol(src)
|
||
|
dst = self._strip_protocol(dst)
|
||
|
os.link(src, dst, **kwargs)
|
||
|
|
||
|
def symlink(self, src, dst, **kwargs):
|
||
|
src = self._strip_protocol(src)
|
||
|
dst = self._strip_protocol(dst)
|
||
|
os.symlink(src, dst, **kwargs)
|
||
|
|
||
|
def islink(self, path) -> bool:
|
||
|
return os.path.islink(self._strip_protocol(path))
|
||
|
|
||
|
def rm_file(self, path):
|
||
|
os.remove(self._strip_protocol(path))
|
||
|
|
||
|
def rm(self, path, recursive=False, maxdepth=None):
|
||
|
if not isinstance(path, list):
|
||
|
path = [path]
|
||
|
|
||
|
for p in path:
|
||
|
p = self._strip_protocol(p)
|
||
|
if self.isdir(p):
|
||
|
if not recursive:
|
||
|
raise ValueError("Cannot delete directory, set recursive=True")
|
||
|
if osp.abspath(p) == os.getcwd():
|
||
|
raise ValueError("Cannot delete current working directory")
|
||
|
shutil.rmtree(p)
|
||
|
else:
|
||
|
os.remove(p)
|
||
|
|
||
|
def unstrip_protocol(self, name):
|
||
|
name = self._strip_protocol(name) # normalise for local/win/...
|
||
|
return f"file://{name}"
|
||
|
|
||
|
def _open(self, path, mode="rb", block_size=None, **kwargs):
|
||
|
path = self._strip_protocol(path)
|
||
|
if self.auto_mkdir and "w" in mode:
|
||
|
self.makedirs(self._parent(path), exist_ok=True)
|
||
|
return LocalFileOpener(path, mode, fs=self, **kwargs)
|
||
|
|
||
|
def touch(self, path, truncate=True, **kwargs):
|
||
|
path = self._strip_protocol(path)
|
||
|
if self.auto_mkdir:
|
||
|
self.makedirs(self._parent(path), exist_ok=True)
|
||
|
if self.exists(path):
|
||
|
os.utime(path, None)
|
||
|
else:
|
||
|
open(path, "a").close()
|
||
|
if truncate:
|
||
|
os.truncate(path, 0)
|
||
|
|
||
|
def created(self, path):
|
||
|
info = self.info(path=path)
|
||
|
return datetime.datetime.fromtimestamp(
|
||
|
info["created"], tz=datetime.timezone.utc
|
||
|
)
|
||
|
|
||
|
def modified(self, path):
|
||
|
info = self.info(path=path)
|
||
|
return datetime.datetime.fromtimestamp(info["mtime"], tz=datetime.timezone.utc)
|
||
|
|
||
|
@classmethod
|
||
|
def _parent(cls, path):
|
||
|
path = cls._strip_protocol(path)
|
||
|
if os.sep == "/":
|
||
|
# posix native
|
||
|
return path.rsplit("/", 1)[0] or "/"
|
||
|
else:
|
||
|
# NT
|
||
|
path_ = path.rsplit("/", 1)[0]
|
||
|
if len(path_) <= 3:
|
||
|
if path_[1:2] == ":":
|
||
|
# nt root (something like c:/)
|
||
|
return path_[0] + ":/"
|
||
|
# More cases may be required here
|
||
|
return path_
|
||
|
|
||
|
@classmethod
|
||
|
def _strip_protocol(cls, path):
|
||
|
path = stringify_path(path)
|
||
|
if path.startswith("file://"):
|
||
|
path = path[7:]
|
||
|
elif path.startswith("file:"):
|
||
|
path = path[5:]
|
||
|
elif path.startswith("local://"):
|
||
|
path = path[8:]
|
||
|
elif path.startswith("local:"):
|
||
|
path = path[6:]
|
||
|
|
||
|
path = make_path_posix(path)
|
||
|
if os.sep != "/":
|
||
|
# This code-path is a stripped down version of
|
||
|
# > drive, path = ntpath.splitdrive(path)
|
||
|
if path[1:2] == ":":
|
||
|
# Absolute drive-letter path, e.g. X:\Windows
|
||
|
# Relative path with drive, e.g. X:Windows
|
||
|
drive, path = path[:2], path[2:]
|
||
|
elif path[:2] == "//":
|
||
|
# UNC drives, e.g. \\server\share or \\?\UNC\server\share
|
||
|
# Device drives, e.g. \\.\device or \\?\device
|
||
|
if (index1 := path.find("/", 2)) == -1 or (
|
||
|
index2 := path.find("/", index1 + 1)
|
||
|
) == -1:
|
||
|
drive, path = path, ""
|
||
|
else:
|
||
|
drive, path = path[:index2], path[index2:]
|
||
|
else:
|
||
|
# Relative path, e.g. Windows
|
||
|
drive = ""
|
||
|
|
||
|
path = path.rstrip("/") or cls.root_marker
|
||
|
return drive + path
|
||
|
|
||
|
else:
|
||
|
return path.rstrip("/") or cls.root_marker
|
||
|
|
||
|
def _isfilestore(self):
|
||
|
# Inheriting from DaskFileSystem makes this False (S3, etc. were)
|
||
|
# the original motivation. But we are a posix-like file system.
|
||
|
# See https://github.com/dask/dask/issues/5526
|
||
|
return True
|
||
|
|
||
|
def chmod(self, path, mode):
|
||
|
path = stringify_path(path)
|
||
|
return os.chmod(path, mode)
|
||
|
|
||
|
|
||
|
def make_path_posix(path):
|
||
|
"""Make path generic and absolute for current OS"""
|
||
|
if not isinstance(path, str):
|
||
|
if isinstance(path, (list, set, tuple)):
|
||
|
return type(path)(make_path_posix(p) for p in path)
|
||
|
else:
|
||
|
path = stringify_path(path)
|
||
|
if not isinstance(path, str):
|
||
|
raise TypeError(f"could not convert {path!r} to string")
|
||
|
if os.sep == "/":
|
||
|
# Native posix
|
||
|
if path.startswith("/"):
|
||
|
# most common fast case for posix
|
||
|
return path
|
||
|
elif path.startswith("~"):
|
||
|
return osp.expanduser(path)
|
||
|
elif path.startswith("./"):
|
||
|
path = path[2:]
|
||
|
elif path == ".":
|
||
|
path = ""
|
||
|
return f"{os.getcwd()}/{path}"
|
||
|
else:
|
||
|
# NT handling
|
||
|
if path[0:1] == "/" and path[2:3] == ":":
|
||
|
# path is like "/c:/local/path"
|
||
|
path = path[1:]
|
||
|
if path[1:2] == ":":
|
||
|
# windows full path like "C:\\local\\path"
|
||
|
if len(path) <= 3:
|
||
|
# nt root (something like c:/)
|
||
|
return path[0] + ":/"
|
||
|
path = path.replace("\\", "/")
|
||
|
return path
|
||
|
elif path[0:1] == "~":
|
||
|
return make_path_posix(osp.expanduser(path))
|
||
|
elif path.startswith(("\\\\", "//")):
|
||
|
# windows UNC/DFS-style paths
|
||
|
return "//" + path[2:].replace("\\", "/")
|
||
|
elif path.startswith(("\\", "/")):
|
||
|
# windows relative path with root
|
||
|
path = path.replace("\\", "/")
|
||
|
return f"{osp.splitdrive(os.getcwd())[0]}{path}"
|
||
|
else:
|
||
|
path = path.replace("\\", "/")
|
||
|
if path.startswith("./"):
|
||
|
path = path[2:]
|
||
|
elif path == ".":
|
||
|
path = ""
|
||
|
return f"{make_path_posix(os.getcwd())}/{path}"
|
||
|
|
||
|
|
||
|
def trailing_sep(path):
|
||
|
"""Return True if the path ends with a path separator.
|
||
|
|
||
|
A forward slash is always considered a path separator, even on Operating
|
||
|
Systems that normally use a backslash.
|
||
|
"""
|
||
|
# TODO: if all incoming paths were posix-compliant then separator would
|
||
|
# always be a forward slash, simplifying this function.
|
||
|
# See https://github.com/fsspec/filesystem_spec/pull/1250
|
||
|
return path.endswith(os.sep) or (os.altsep is not None and path.endswith(os.altsep))
|
||
|
|
||
|
|
||
|
class LocalFileOpener(io.IOBase):
|
||
|
def __init__(
|
||
|
self, path, mode, autocommit=True, fs=None, compression=None, **kwargs
|
||
|
):
|
||
|
logger.debug("open file: %s", path)
|
||
|
self.path = path
|
||
|
self.mode = mode
|
||
|
self.fs = fs
|
||
|
self.f = None
|
||
|
self.autocommit = autocommit
|
||
|
self.compression = get_compression(path, compression)
|
||
|
self.blocksize = io.DEFAULT_BUFFER_SIZE
|
||
|
self._open()
|
||
|
|
||
|
def _open(self):
|
||
|
if self.f is None or self.f.closed:
|
||
|
if self.autocommit or "w" not in self.mode:
|
||
|
self.f = open(self.path, mode=self.mode)
|
||
|
if self.compression:
|
||
|
compress = compr[self.compression]
|
||
|
self.f = compress(self.f, mode=self.mode)
|
||
|
else:
|
||
|
# TODO: check if path is writable?
|
||
|
i, name = tempfile.mkstemp()
|
||
|
os.close(i) # we want normal open and normal buffered file
|
||
|
self.temp = name
|
||
|
self.f = open(name, mode=self.mode)
|
||
|
if "w" not in self.mode:
|
||
|
self.size = self.f.seek(0, 2)
|
||
|
self.f.seek(0)
|
||
|
self.f.size = self.size
|
||
|
|
||
|
def _fetch_range(self, start, end):
|
||
|
# probably only used by cached FS
|
||
|
if "r" not in self.mode:
|
||
|
raise ValueError
|
||
|
self._open()
|
||
|
self.f.seek(start)
|
||
|
return self.f.read(end - start)
|
||
|
|
||
|
def __setstate__(self, state):
|
||
|
self.f = None
|
||
|
loc = state.pop("loc", None)
|
||
|
self.__dict__.update(state)
|
||
|
if "r" in state["mode"]:
|
||
|
self.f = None
|
||
|
self._open()
|
||
|
self.f.seek(loc)
|
||
|
|
||
|
def __getstate__(self):
|
||
|
d = self.__dict__.copy()
|
||
|
d.pop("f")
|
||
|
if "r" in self.mode:
|
||
|
d["loc"] = self.f.tell()
|
||
|
else:
|
||
|
if not self.f.closed:
|
||
|
raise ValueError("Cannot serialise open write-mode local file")
|
||
|
return d
|
||
|
|
||
|
def commit(self):
|
||
|
if self.autocommit:
|
||
|
raise RuntimeError("Can only commit if not already set to autocommit")
|
||
|
shutil.move(self.temp, self.path)
|
||
|
|
||
|
def discard(self):
|
||
|
if self.autocommit:
|
||
|
raise RuntimeError("Cannot discard if set to autocommit")
|
||
|
os.remove(self.temp)
|
||
|
|
||
|
def readable(self) -> bool:
|
||
|
return True
|
||
|
|
||
|
def writable(self) -> bool:
|
||
|
return "r" not in self.mode
|
||
|
|
||
|
def read(self, *args, **kwargs):
|
||
|
return self.f.read(*args, **kwargs)
|
||
|
|
||
|
def write(self, *args, **kwargs):
|
||
|
return self.f.write(*args, **kwargs)
|
||
|
|
||
|
def tell(self, *args, **kwargs):
|
||
|
return self.f.tell(*args, **kwargs)
|
||
|
|
||
|
def seek(self, *args, **kwargs):
|
||
|
return self.f.seek(*args, **kwargs)
|
||
|
|
||
|
def seekable(self, *args, **kwargs):
|
||
|
return self.f.seekable(*args, **kwargs)
|
||
|
|
||
|
def readline(self, *args, **kwargs):
|
||
|
return self.f.readline(*args, **kwargs)
|
||
|
|
||
|
def readlines(self, *args, **kwargs):
|
||
|
return self.f.readlines(*args, **kwargs)
|
||
|
|
||
|
def close(self):
|
||
|
return self.f.close()
|
||
|
|
||
|
def truncate(self, size=None) -> int:
|
||
|
return self.f.truncate(size)
|
||
|
|
||
|
@property
|
||
|
def closed(self):
|
||
|
return self.f.closed
|
||
|
|
||
|
def fileno(self):
|
||
|
return self.raw.fileno()
|
||
|
|
||
|
def flush(self) -> None:
|
||
|
self.f.flush()
|
||
|
|
||
|
def __iter__(self):
|
||
|
return self.f.__iter__()
|
||
|
|
||
|
def __getattr__(self, item):
|
||
|
return getattr(self.f, item)
|
||
|
|
||
|
def __enter__(self):
|
||
|
self._incontext = True
|
||
|
return self
|
||
|
|
||
|
def __exit__(self, exc_type, exc_value, traceback):
|
||
|
self._incontext = False
|
||
|
self.f.__exit__(exc_type, exc_value, traceback)
|