Source code for pycmplot.resources

"""
pycmplot.resources
==================

Centralised configuration for external reference files that cannot be
bundled with the package distribution (large gene-info TSVs, liftover
chain files, etc.).

Resolution order
----------------
Resource paths are resolved in the following priority order for each
attribute:

1. **Explicit argument** — pass a :class:`ResourceConfig` instance with
   the desired path directly to any function that accepts a *resources*
   parameter.
2. **Environment variable** — set the corresponding variable before
   running pycmplot:

   .. code-block:: bash

       export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain.gz
       export PYCMPLOT_CHAIN_HG18_HG38=/path/to/hg18ToHg38.over.chain.gz
       export PYCMPLOT_GENEINFO_HG38=/path/to/Homo_sapiens.GRCh38.geneinfo.tsv.gz
       export PYCMPLOT_GENEINFO_HG19=/path/to/Homo_sapiens.GRCh37.geneinfo.tsv.gz

3. **Bundled default** — pycmplot ships with the required files in the
   ``pycmplot/data/`` package directory; they are used automatically when
   neither of the above is set.

Examples
--------
Override a single resource while using defaults for the rest:

>>> from pycmplot.resources import ResourceConfig
>>> cfg = ResourceConfig(chain_hg19_hg38="/my/custom.over.chain.gz")
>>> # pass cfg to any function that accepts a resources argument:
>>> from pycmplot.liftover import liftover_position
>>> df_lifted = liftover_position(df, resources=cfg)
"""

from __future__ import annotations

import atexit
import os
from contextlib import ExitStack
from dataclasses import dataclass, field
from importlib.resources import files, as_file
from pathlib import Path

# define _env
def _env(var: str, default: str | None = None) -> str | None:
    return os.environ.get(var, default)

def _pkg_data(filename: str) -> str:
    """Return the location of *filename* inside the ``pycmplot.data`` package.

    The package is located with :func:`importlib.resources.files` and the
    joined entry is converted to a string. No check is made that the file
    actually exists.
    """
    data_root = files("pycmplot.data")
    return str(data_root.joinpath(filename))



[docs] @dataclass class ResourceConfig: """Paths to external reference files used by pycmplot. Dataclass grouping the on-disk resources required by pycmplot: - ``chain_hg19_hg38`` -- UCSC LiftOver chain file for hg19 to hg38 conversion. Resolved from ``PYCMPLOT_CHAIN_HG19_HG38`` or the bundled ``hg19ToHg38.over.chain.gz``. - ``chain_hg18_hg38`` -- UCSC LiftOver chain file for hg18 to hg38 conversion. Resolved from ``PYCMPLOT_CHAIN_HG18_HG38`` or the bundled ``hg18ToHg38.over.chain.gz``. Only required when any input summary statistics file carries a ``hg18`` build label. - ``geneinfo_hg38`` -- Ensembl gene-info TSV for GRCh38, used for nearest-gene annotation. Resolved from ``PYCMPLOT_GENEINFO_HG38`` or the bundled ``Homo_sapiens.GRCh38.geneinfo.tsv.gz``. - ``geneinfo_hg19`` -- Ensembl gene-info TSV for GRCh37, used when input data carry a hg19 build label. Resolved from ``PYCMPLOT_GENEINFO_HG19`` or the bundled ``Homo_sapiens.GRCh37.geneinfo.tsv.gz``. All three attributes default to values resolved from environment variables or the bundled ``pycmplot/data/`` directory via :func:`importlib.resources.files`. Override individual attributes to use custom file locations. Examples -------- Use all bundled defaults: >>> from pycmplot.resources import ResourceConfig >>> cfg = ResourceConfig() Override the hg38 gene-info file: >>> cfg = ResourceConfig( ... geneinfo_hg38="/data/custom_GRCh38_genes.tsv.gz" ... 
) """ chain_hg19_hg38: str | None = field( default_factory=lambda: _env( "PYCMPLOT_CHAIN_HG19_HG38", _pkg_data("hg19ToHg38.over.chain.gz"), ) ) chain_hg18_hg38: str | None = field( default_factory=lambda: _env( "PYCMPLOT_CHAIN_HG18_HG38", _pkg_data("hg18ToHg38.over.chain.gz"), ) ) geneinfo_hg38: str | None = field( default_factory=lambda: _env( "PYCMPLOT_GENEINFO_HG38", _pkg_data("Homo_sapiens.GRCh38.geneinfo.tsv.gz"), ) ) geneinfo_hg19: str | None = field( default_factory=lambda: _env( "PYCMPLOT_GENEINFO_HG19", _pkg_data("Homo_sapiens.GRCh37.geneinfo.tsv.gz"), ) ) #featuresinfo: str | None = field( # default_factory=lambda: _env( # "PYCMPLOT_FEATURESINFO", # _pkg_data("featuresinfo.tsv.gz"), # ) #)
[docs] def require(self, attr: str) -> str: """Return the path for *attr*, raising a clear :exc:`FileNotFoundError` if the attribute is unset or the path does not exist. First checks whether the attribute value is ``None``; if so, raises :exc:`FileNotFoundError` with a message indicating which environment variable to set. Then verifies that the resolved path exists on disk, falling back to :func:`importlib.resources.files` package-data resolution before raising if neither succeeds. Parameters ---------- attr : str Name of the :class:`ResourceConfig` attribute to retrieve, e.g. ``'chain_hg19_hg38'``, ``'geneinfo_hg38'``, ``'geneinfo_hg19'``. Returns ------- str Absolute file path as a string. Raises ------ FileNotFoundError If the attribute is ``None`` or the resolved path does not exist. Examples -------- >>> from pycmplot.resources import ResourceConfig >>> cfg = ResourceConfig() >>> chain = cfg.require("chain_hg19_hg38") >>> chain.endswith(".over.chain.gz") True """ val = getattr(self, attr) if val is None: env_var = { "chain_hg19_hg38": "PYCMPLOT_CHAIN_HG19_HG38", "chain_hg18_hg38": "PYCMPLOT_CHAIN_HG18_HG38", "geneinfo_hg38": "PYCMPLOT_GENEINFO_HG38", "geneinfo_hg19": "PYCMPLOT_GENEINFO_HG19", #"featuresinfo": "PYCMPLOT_FEATURESINFO", }.get(attr, attr.upper()) raise FileNotFoundError( f"Resource '{attr}' is not configured.\n" f"Set the environment variable {env_var} or pass a " f"ResourceConfig('{attr}'='/path/to/file') to the function." ) path = Path(val) if path.exists(): return str(path) # fallback: try importlib resource resolution try: resource = files("pycmplot.data") / Path(val).name with as_file(resource) as real_path: if Path(real_path).exists(): return str(real_path) except Exception: pass raise FileNotFoundError( f"Resource file not found: {val}\n" f"Check the path set for '{attr}'." )
# Shared default configuration, created once when this module is imported.
# Environment variables are read at that moment, so they must be set before
# the import for this instance to pick them up.
default_resources = ResourceConfig()