Source code for pycmplot.stats

"""
pycmplot.stats
==============

Statistical utilities for identifying independent lead SNPs and locus
boundaries from GWAS summary statistics.

:func:`get_lead_snps` applies greedy distance-based clumping to return one
representative SNP per independent locus.  :func:`get_highlight_snps` extends
that to mark all variants within a locus window, enabling per-locus colouring
on Manhattan plots.

Notes
-----
Both functions operate on a single-trait DataFrame.  When comparing multiple
traits, call them independently per track; lead SNP extraction across traits
is handled by :func:`~pycmplot.io.get_sumstats_and_merged_sector_list`.
"""

from __future__ import annotations

import numpy as np
import pandas as pd



[docs]
def get_lead_snps(
    df: pd.DataFrame,
    signif_threshold: float = 5e-8,
    logp: bool = False,
    window: int = 500_000,
) -> pd.DataFrame:
    """Identify independent lead SNPs by greedy distance-based clumping.

    Starting from the most significant variant, each subsequent variant is
    retained as a new lead only if it lies more than *window* base-pairs from
    all previously accepted leads on the same chromosome.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary statistics with canonical columns ``CHR``, ``POS``, ``P``.
        When *logp* is ``True``, a ``logP`` column (–log₁₀(P)) must also be
        present.
    signif_threshold : float, optional
        Significance cutoff.  When *logp* is ``False``, variants with
        ``P > signif_threshold`` are excluded; when *logp* is ``True``,
        variants with ``logP < -log10(signif_threshold)`` are excluded.
        Default is ``5e-8``.
    logp : bool, optional
        If ``True``, filter and rank by the ``logP`` column (descending)
        instead of ``P`` (ascending).  Default is ``False``.
    window : int, optional
        Clumping window half-width in base-pairs.  A candidate SNP is
        excluded if it falls within *window* bp of any already-accepted lead
        on the same chromosome.  Default is ``500_000`` (500 kb).

    Returns
    -------
    pandas.DataFrame
        Subset of *df* containing only the lead SNPs, one row per independent
        locus, in the order they were selected (most significant first within
        each chromosome).

    Notes
    -----
    This is a **distance-only** approach; it does not use linkage disequilibrium
    information.  Users requiring LD-based clumping should post-process the
    returned table with PLINK or a dedicated LD-clumping tool.

    See Also
    --------
    get_highlight_snps :
        Returns all variants within locus windows and adds an ``in_locus``
        flag column.

    Examples
    --------
    >>> from pycmplot.stats import get_lead_snps
    >>> leads = get_lead_snps(df, signif_threshold=5e-8, logp=True, window=500_000)
    >>> leads[["SNP", "CHR", "POS", "P"]].head()
            SNP CHR       POS           P
    0  rs123456   2  60718043  1.20e-120
    1  rs789012  11   5246696  3.40e-85
    """

    if logp:
        thresh = -np.log10(float(signif_threshold))
        sig = df[df["logP"] >= thresh].copy()
        p_col = "logP"
        ascending = False
    else:
        sig = df[df["P"] <= signif_threshold].copy()
        p_col = "P"
        ascending = True

    sig = sig.sort_values(p_col, ascending=ascending)
    leads: list[pd.Series] = []

    while not sig.empty:
        top = sig.iloc[0]
        leads.append(top)
        sig = sig[
            ~(
                (sig["CHR"] == top["CHR"])
                & (abs(sig["POS"] - top["POS"]) <= window)
            )
        ]

    return pd.DataFrame(leads)




[docs]
def get_highlight_snps(
    df: pd.DataFrame,
    highlight: bool = False,
    highlight_thresh: float = 5e-8,
    logp: bool = False,
    window: int = 500_000,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Mark all variants within *window* bp of a lead SNP.

    Calls :func:`get_lead_snps` to identify independent loci, then sets an
    ``in_locus`` boolean flag on every variant whose chromosomal position falls
    within ±*window* bp of any lead SNP on the same chromosome.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary statistics with canonical columns ``CHR``, ``POS``, ``P``
        (and ``logP`` when *logp* is ``True``).
    highlight_thresh : float, optional
        Significance threshold passed to :func:`get_lead_snps`.  Default is
        ``5e-8``.
    logp : bool, optional
        If ``True``, use the ``logP`` column for thresholding and ranking.
        Default is ``False``.
    window : int, optional
        Half-width of the locus window in base-pairs.  Defaults to
        ``500_000`` (500 kb).

    Returns
    -------
    df_annotated : pandas.DataFrame
        A copy of *df* with an additional boolean column ``in_locus``.
        Variants inside at least one locus window have ``in_locus = True``.
    leads_df : pandas.DataFrame
        The lead-SNP DataFrame returned by :func:`get_lead_snps`.

    See Also
    --------
    get_lead_snps :
        Used internally to identify independent loci.

    Examples
    --------
    >>> from pycmplot.stats import get_highlight_snps
    >>> df_ann, leads = get_highlight_snps(df, highlight_thresh=5e-8)
    >>> df_ann["in_locus"].sum()
    1842
    """

    df = df.copy()

    leads_df = get_lead_snps(
        df=df,
        signif_threshold=highlight_thresh,
        logp=logp,
        window=window,
    )

    if highlight:
        df["in_locus"] = False
        for _, row in leads_df.iterrows():
            min_pos = row["POS"] - window
            max_pos = row["POS"] + window
            chrom = row["CHR"]

            mask = (df["CHR"] == chrom) & (df["POS"] >= min_pos) & (df["POS"] <= max_pos)
            df.loc[mask, "in_locus"] = True

    return df, leads_df