Source code for pycmplot.liftover

"""
pycmplot.liftover
=================

Genome coordinate liftover utilities (hg18 → hg38 and hg19 → hg38).

The :class:`pyliftover.LiftOver` objects are initialised **lazily** — they
are created on first use and cached in a module-level dictionary, so
importing this module never triggers a file-not-found error even if the
chain files have not been configured yet.

Supported conversions
---------------------
pycmplot harmonises input coordinates to GRCh38. Two source assemblies are
supported:

* ``hg19`` / GRCh37 → GRCh38 (default, bundled chain file)
* ``hg18`` / NCBI36 → GRCh38 (bundled chain file; used when input rows
  carry a ``hg18`` build label)

Resource configuration
----------------------
Chain file paths are resolved through
:class:`~pycmplot.resources.ResourceConfig`.  By default, bundled chain
files are used (``pycmplot/data/hg19ToHg38.over.chain.gz`` and
``pycmplot/data/hg18ToHg38.over.chain.gz``).  They can be overridden by
setting the environment variables:

.. code-block:: bash

    export PYCMPLOT_CHAIN_HG19_HG38=/path/to/hg19ToHg38.over.chain.gz
    export PYCMPLOT_CHAIN_HG18_HG38=/path/to/hg18ToHg38.over.chain.gz
"""

from __future__ import annotations

import logging
from typing import Optional

import numpy as np
import pandas as pd

from pycmplot.resources import ResourceConfig, default_resources
from pycmplot.constants import hg38_chr_lengths

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Lazy singleton — one LiftOver object per chain file path
# ---------------------------------------------------------------------------
_lo_cache: dict[str, object] = {}


def _get_liftover(chain_path: str):
    """Return a cached :class:`~pyliftover.LiftOver` for *chain_path*.

    Loads the chain file on first call and stores the resulting
    :class:`~pyliftover.LiftOver` instance in a module-level dict.  Subsequent
    calls with the same *chain_path* return the cached object without re-reading
    the file.

    Parameters
    ----------
    chain_path : str
        Absolute path to a UCSC-format ``.over.chain`` (or ``.over.chain.gz``)
        file.

    Returns
    -------
    pyliftover.LiftOver
        A ready-to-use liftover object for the specified chain file.
    """

    if chain_path not in _lo_cache:
        from pyliftover import LiftOver  # deferred import

        logger.info("Loading LiftOver chain file: %s", chain_path)
        _lo_cache[chain_path] = LiftOver(chain_path)
    return _lo_cache[chain_path]


# ---------------------------------------------------------------------------
# Public helpers
# ---------------------------------------------------------------------------


[docs]
def liftover_hg19_to_hg38(
    chrom: str,
    pos: int,
    resources: Optional[ResourceConfig] = None,
) -> Optional[int]:
    """Convert a single hg19 position to its hg38 equivalent.

    Uses a lazily loaded and cached :class:`~pyliftover.LiftOver` object backed
    by the chain file specified in *resources*.  When multiple hg38 mappings
    exist for a given position, the one with the highest chain score is returned.

    Parameters
    ----------
    chrom : str
        Chromosome name **without** the ``'chr'`` prefix (e.g. ``'1'``,
        ``'X'``).  The prefix is added internally before querying pyliftover.
    pos : int
        0-based hg19 position, as expected by :class:`pyliftover.LiftOver`.
    resources : ResourceConfig | Target Build Version, optional
        :class:`~pycmplot.resources.ResourceConfig` instance.  Falls back to
        :data:`~pycmplot.resources.default_resources` when ``None``.

    Returns
    -------
    int or None
        Corresponding 0-based hg38 position, or ``None`` if the position
        could not be mapped (unmapped region, chromosome gap, or deleted
        sequence).

    Notes
    -----
    pyliftover uses **0-based** coordinates (BED convention).  GWAS summary
    statistics files typically use **1-based** coordinates (VCF/Ensembl
    convention).  The caller (:func:`liftover_position`) is responsible for any
    coordinate-system adjustment.

    See Also
    --------
    liftover_position :
        Applies :func:`liftover_hg19_to_hg38` row-wise to a full DataFrame.

    Examples
    --------
    >>> from pycmplot.liftover import liftover_hg19_to_hg38
    >>> new_pos = liftover_hg19_to_hg38("11", 5246695)
    >>> new_pos
    5225465
    """

    if resources is None:
        resources = default_resources

    chain_path = resources.require("chain_hg19_hg38")
    lo = _get_liftover(chain_path)

    results = lo.convert_coordinate(f"chr{chrom}", pos)
    if not results:
        return None
    # pyliftover returns sorted by chain score; take the best hit
    _new_chrom, new_pos, _strand, _score = results[0]
    return new_pos




[docs]
def liftover_hg18_to_hg38(
    chrom: str,
    pos: int,
    resources: Optional[ResourceConfig] = None,
) -> Optional[int]:
    """Convert a single hg18 (NCBI36) position to its hg38 equivalent.

    Uses a lazily loaded and cached :class:`~pyliftover.LiftOver` object
    backed by the hg18→hg38 chain file specified in *resources*.  When
    multiple hg38 mappings exist for a given position, the one with the
    highest chain score is returned.

    Parameters
    ----------
    chrom : str
        Chromosome name **without** the ``'chr'`` prefix (e.g. ``'1'``,
        ``'X'``).  The prefix is added internally before querying
        pyliftover.
    pos : int
        0-based hg18 position, as expected by :class:`pyliftover.LiftOver`.
    resources : ResourceConfig, optional
        :class:`~pycmplot.resources.ResourceConfig` instance.  Falls back
        to :data:`~pycmplot.resources.default_resources` when ``None``.

    Returns
    -------
    int or None
        Corresponding 0-based hg38 position, or ``None`` if the position
        could not be mapped (unmapped region, chromosome gap, or deleted
        sequence).

    See Also
    --------
    liftover_hg19_to_hg38 :
        Equivalent helper for hg19 coordinates.
    liftover_position :
        Applies the appropriate per-row dispatcher to a full DataFrame.
    """

    if resources is None:
        resources = default_resources

    chain_path = resources.require("chain_hg18_hg38")
    lo = _get_liftover(chain_path)

    results = lo.convert_coordinate(f"chr{chrom}", pos)
    if not results:
        return None
    _new_chrom, new_pos, _strand, _score = results[0]
    return new_pos




[docs]
def liftover_position(
    df: pd.DataFrame,
    hg38_chr_limits: dict = None,
    resources: Optional[ResourceConfig] = None,
) -> pd.DataFrame:
    """Liftover all hg18/hg19 rows in *df* to hg38 coordinates.

    Iterates over every row in *df* and dispatches to
    :func:`liftover_hg19_to_hg38` for rows whose ``BUILD`` column equals
    ``'hg19'`` or to :func:`liftover_hg18_to_hg38` for rows whose ``BUILD``
    column equals ``'hg18'``.  Rows with any other build value are passed
    through unchanged.  Rows for which liftover returns ``None`` or ``0``
    (unmappable positions) are silently dropped.

    Two provenance columns are added to the returned DataFrame so that the
    original coordinates remain accessible:

    * ``OLD_POS`` — the pre-liftover base-pair position.
    * ``OLD_BUILD`` — the original build value (``'hg19'``).

    After processing, the ``BUILD`` column is updated to ``'hg38'`` for all
    rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary statistics DataFrame with canonical columns ``CHR``, ``POS``,
        and ``BUILD``.  The ``POS`` column is coerced to ``int`` before
        processing.
    resources : ResourceConfig, optional
        :class:`~pycmplot.resources.ResourceConfig` instance supplying the
        chain file path.  Falls back to
        :data:`~pycmplot.resources.default_resources` when ``None``.

    Returns
    -------
    pandas.DataFrame
        A copy of *df* with:

        * ``POS`` replaced by hg38 coordinates for all hg19 rows.
        * ``BUILD`` set to ``'hg38'`` for all rows.
        * ``OLD_POS`` and ``OLD_BUILD`` columns added.
        * Rows with unmappable positions (new ``POS == 0``) removed.

    See Also
    --------
    liftover_hg19_to_hg38 :
        Single-position conversion function called internally.

    Examples
    --------
    >>> from pycmplot.liftover import liftover_position
    >>> df_hg38 = liftover_position(df)
    >>> df_hg38["BUILD"].unique()
    array(['hg38'], dtype=object)
    >>> "OLD_POS" in df_hg38.columns
    True
    """


    if resources is None:
        resources = default_resources

    if hg38_chr_limits is None:
        hg38_chr_limits = {k.replace("chr",""): v for k, v in hg38_chr_lengths.items()}
        

    df = df.copy()
    df["POS"] = df["POS"].astype(int)

    new_positions: list[Optional[int]] = []
    for chrom, pos, build in zip(df["CHR"], df["POS"], df["BUILD"]):
        if build == "hg19":
            new_positions.append(liftover_hg19_to_hg38(chrom, pos, resources))
        elif build == "hg18":
            new_positions.append(liftover_hg18_to_hg38(chrom, pos, resources))
        else:
            new_positions.append(pos)

    df["OLD_POS"] = df["POS"]
    df["OLD_BUILD"] = df["BUILD"]
    df["BUILD"] = "hg38"
    df["POS"] = new_positions
    df["POS"] = df["POS"].fillna(0).astype(int)

    clean_frames: list[pd.DataFrame] = []
    for chrom in df["CHR"].unique():
        chr_df = df[df["CHR"] == chrom]
        chr_limit = hg38_chr_limits.get(str(chrom))
        if chr_limit is not None:
            chr_df = chr_df[chr_df["POS"] <= chr_limit]
        else:
            logger.warning(
                "Chromosome %r not in hg38 chromosome-length table; "
                "keeping all variants without range check.", chrom,
            )
        clean_frames.append(chr_df)

    if not clean_frames:
        return df.iloc[0:0]

    clean_df = pd.concat(clean_frames, axis=0, ignore_index=True)
    return clean_df[clean_df["POS"] != 0]