125 lines
4.0 KiB
Python
125 lines
4.0 KiB
Python
|
import logging
|
||
|
import tarfile
|
||
|
|
||
|
import fsspec
|
||
|
from fsspec.archive import AbstractArchiveFileSystem
|
||
|
from fsspec.compression import compr
|
||
|
from fsspec.utils import infer_compression
|
||
|
|
||
|
# Maps the raw tar header type flag to the entry type reported by this
# filesystem; flags missing from this table are looked up with a "file" default.
typemap = {
    tarfile.REGTYPE: "file",
    tarfile.DIRTYPE: "directory",
}
|
||
|
|
||
|
# Module-level logger; registered under the fixed name "tar".
logger = logging.getLogger(name="tar")
|
||
|
|
||
|
|
||
|
class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz
    """

    root_marker = ""
    protocol = "tar"
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """Open a tar archive and build an index of its members.

        Parameters
        ----------
        fo : str or file-like
            The archive. A string is opened with ``fsspec.open`` using
            ``target_protocol`` and ``target_options``; anything else is used
            directly as the file object to read from.
        index_store : optional
            Stored on the instance as ``index_store``; not otherwise used by
            this class yet (see the TODO notes in ``_index``).
        target_options : dict, optional
            Extra keyword arguments for ``fsspec.open`` when ``fo`` is a string.
        target_protocol : str, optional
            Protocol passed to ``fsspec.open`` when ``fo`` is a string.
        compression : str, optional
            Key into ``fsspec.compression.compr`` selecting the decompressor
            that wraps the file object. If ``None``, it is inferred from the
            file name when a string file name can be determined.
        **kwargs
            Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)
        target_options = target_options or {}

        if isinstance(fo, str):
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()  # keep the reference

        # Try to infer compression.
        if compression is None:
            name = self._guess_filename(fo)

            # The discovered name is not guaranteed to be a string (a file
            # opened from a descriptor exposes an integer ``name``), and the
            # inference works on the file name's extension, so only strings
            # are passed on.
            if isinstance(name, str):
                compression = infer_compression(name)
                logger.info(
                    "Inferred compression %s from file name %s", compression, name
                )
            elif name is not None:
                logger.debug(
                    "Not inferring compression from non-string file name %r", name
                )

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "'r:gz'",
            #  but then would seek to offset in the file work?
            fo = compr[compression](fo)

        self._fo_ref = fo
        self.fo = fo  # the whole instance is a context
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()

    @staticmethod
    def _guess_filename(fo):
        """Return the file name associated with ``fo``, or ``None`` if unknown.

        `fo` might either be a `fsspec.LocalFileOpener`, an `io.BufferedReader`
        or an `fsspec.AbstractFileSystem` instance. The first attribute that
        exists on ``fo`` wins, even if its value is ``None``. Any error raised
        while probing is logged as a warning and ``None`` is returned.
        """
        try:
            # "original": amended io.BufferedReader or similar. This uses a
            #   "protocol extension" where original filenames are propagated to
            #   archive-like filesystems in order to let them infer the right
            #   compression appropriately.
            # "path": fsspec.LocalFileOpener
            # "name": io.BufferedReader
            for attr in ("original", "path", "name"):
                if hasattr(fo, attr):
                    return getattr(fo, attr)

            # fsspec.AbstractFileSystem
            if hasattr(fo, "info"):
                return fo.info()["name"]
        except Exception as ex:
            logger.warning(
                "Unable to determine file name, not inferring compression: %s", ex
            )
        return None

    def _index(self):
        """Scan the archive and store ``{name: (info, data offset)}`` in ``self.index``.

        Names have trailing slashes removed, and the raw tar type flag is
        translated through ``typemap`` (defaulting to ``"file"``).
        """
        # TODO: load and set saved index, if exists
        out = {}
        for ti in self.tar:
            info = ti.get_info()
            # Normalise the name the same way ``_get_dirs`` does so both
            # structures agree on keys and on the stored ``name`` field.
            info["name"] = info["name"].rstrip("/")
            info["type"] = typemap.get(info["type"], "file")
            out[info["name"]] = (info, ti.offset_data)

        self.index = out
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``self.dir_cache`` on first use; later calls do nothing."""
        if self.dir_cache is not None:
            return

        # This enables ls to get directories as children as well as files
        self.dir_cache = {
            dirname: {"name": dirname, "size": 0, "type": "directory"}
            for dirname in self._all_dirnames(self.tar.getnames())
        }
        # Entries that are real archive members replace the placeholder
        # directory records created above.
        for member in self.tar.getmembers():
            info = member.get_info()
            info["name"] = info["name"].rstrip("/")
            info["type"] = typemap.get(info["type"], "file")
            self.dir_cache[info["name"]] = info

    def _open(self, path, mode="rb", **kwargs):
        """Return a readable file object for the archive member ``path``.

        Raises
        ------
        ValueError
            If ``mode`` is not ``"rb"``, or the member is not indexed as a
            regular file.
        KeyError
            If ``path`` is not present in the index.
        """
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        details, _ = self.index[path]
        if details["type"] != "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)
|