Zarr Cache Helper

GitHub Link to Code.

Zarr cache manager for memory-efficient trajectory storage.

Handles conversion from trajectory files to optimized Zarr format using md.iterload().

class mdxplain.trajectory.helper.dask_trajectory_helper.zarr_cache_helper.ZarrCacheHelper(chunk_size: int = 1000, compression: str = 'lz4', cache_dir: str = './cache')

Manages Zarr cache files for efficient trajectory storage and access.

Uses md.iterload() for memory-efficient conversion and stores trajectories in optimized Zarr format, chunked into chunk_size frames (1000 by default).

__init__(chunk_size: int = 1000, compression: str = 'lz4', cache_dir: str = './cache')

Initialize Zarr cache manager.

Parameters

chunk_size : int, default=1000

Number of frames per chunk (optimized for DaskZarr)

compression : str, default='lz4'

Compression algorithm for Zarr storage

cache_dir : str, default='./cache'

Default cache directory for zarr files

Returns

None

Initializes Zarr cache manager

get_cache_path(trajectory_file: str, cache_dir: str | None = None, traj_name: str | None = None) → str

Generate cache path for trajectory file.

Parameters

trajectory_file : str

Path to trajectory file

cache_dir : str, optional

Directory for cache files (default: ./cache)

traj_name : str, optional

Explicit name for the trajectory. If None, the file stem is used.

Returns

str

Path to Zarr cache file

Examples

>>> cache_manager = ZarrCacheHelper()
>>> # Using file stem as name (default)
>>> path = cache_manager.get_cache_path('/data/run1.xtc')
>>> # Output format: './cache/run1_<hash>.dask.zarr'
>>> # Using an explicit trajectory name
>>> path = cache_manager.get_cache_path('/data/run1.xtc', traj_name='A_Y4R_hpp_run1')
>>> # Output format: './cache/A_Y4R_hpp_run1_<hash>.dask.zarr'
cache_exists(cache_path: str) → bool

Check if valid Zarr cache exists.

Parameters

cache_path : str

Path to Zarr cache file

Returns

bool

True if valid cache exists

Raises

Exception

If cache file is corrupted or unreadable (errors bubble up for debugging)

Examples

>>> cache_manager = ZarrCacheHelper()
>>> cache_path = './cache/trajectory.dask.zarr'
>>> if cache_manager.cache_exists(cache_path):
...     print("Cache found!")
... else:
...     print("Need to create cache")
create_cache(trajectory_file: str, topology_file: str | None, cache_path: str) → Dict[str, Any]

Create Zarr cache from trajectory file using md.iterload().

Parameters

trajectory_file : str

Path to trajectory file

topology_file : str, optional

Path to topology file

cache_path : str

Path for Zarr cache file

Returns

dict

Metadata about the cached trajectory

Raises

FileNotFoundError

If trajectory_file or topology_file doesn’t exist

ValueError

If trajectory file format is unsupported

OSError

If cache directory is not writable

Examples

>>> cache_manager = ZarrCacheHelper(chunk_size=1000)
>>> metadata = cache_manager.create_cache(
...     'trajectory.xtc', 'topology.pdb', 'cache/traj.zarr'
... )
>>> print(f"Cached {metadata['n_frames']} frames")
load_cache_metadata(cache_path: str) → Dict[str, Any]

Load metadata from Zarr cache.

Parameters

cache_path : str

Path to Zarr cache file

Returns

dict

Trajectory metadata

Raises

FileNotFoundError

If cache file doesn’t exist

KeyError

If metadata is missing from cache file

Examples

>>> cache_manager = ZarrCacheHelper()
>>> metadata = cache_manager.load_cache_metadata('cache/traj.zarr')
>>> print(f"Cache contains {metadata['n_frames']} frames")
get_or_create_cache(trajectory_file: str, topology_file: str | None = None, cache_path: str | None = None) → Tuple[str, Dict[str, Any]]

Get existing cache or create new one.

Parameters

trajectory_file : str

Path to trajectory file

topology_file : str, optional

Path to topology file

cache_path : str, optional

Custom cache path

Returns

tuple

(cache_path, metadata)

Raises

FileNotFoundError

If trajectory_file doesn’t exist

ValueError

If trajectory file format is unsupported

OSError

If cache directory is not writable

Examples

>>> cache_manager = ZarrCacheHelper()
>>> cache_path, metadata = cache_manager.get_or_create_cache(
...     'trajectory.xtc', 'topology.pdb'
... )
>>> print(f"Using cache at {cache_path}")
>>> print(f"Contains {metadata['n_frames']} frames")
static store_topology(store: Group, topology: Topology, compressor: Any | None = None) → None

Store topology in Zarr store using pickle serialization.

Parameters

store : zarr.Group

Target zarr store to store topology in

topology : md.Topology

MDTraj topology object to serialize and store

compressor : object, optional

Compression codec for topology storage

Returns

None

Stores topology as pickled array in zarr store

Examples

>>> import zarr
>>> import mdtraj as md
>>> store = zarr.open('trajectory.zarr', mode='w')
>>> topology = md.load_topology('protein.pdb')
>>> ZarrCacheHelper.store_topology(store, topology)
static load_topology(store: Group) → Topology

Load topology from Zarr store.

Parameters

store : zarr.Group

Zarr store containing pickled topology data

Returns

md.Topology

Loaded MDTraj topology object

Examples

>>> import zarr
>>> store = zarr.open('trajectory.zarr', mode='r')
>>> topology = ZarrCacheHelper.load_topology(store)
>>> print(f"Loaded {topology.n_atoms} atoms")
create_cache_from_mdtraj(mdtraj: Trajectory, cache_path: str | None = None) → Tuple[str, dict]

Create Zarr cache directly from MDTraj trajectory object.

Parameters

mdtraj : md.Trajectory

MDTraj trajectory object to cache

cache_path : str, optional

Path for cache. If None, creates temporary cache.

Returns

tuple

(cache_path, metadata_dict) containing cache location and info

Examples

>>> import mdtraj as md
>>> traj = md.load('trajectory.xtc', top='topology.pdb')
>>> cache_helper = ZarrCacheHelper()
>>> cache_path, metadata = cache_helper.create_cache_from_mdtraj(traj)