Source code for pycmplot.stats

"""
pycmplot.stats
==============

Statistical utilities for identifying independent lead SNPs and locus
boundaries from GWAS summary statistics.

:func:`get_lead_snps` applies greedy distance-based clumping to return one
representative SNP per independent locus.  :func:`get_highlight_snps` extends
that to mark all variants within a locus window, enabling per-locus colouring
on Manhattan plots.

Notes
-----
Both functions operate on a single-trait DataFrame.  When comparing multiple
traits, call them independently per track; lead SNP extraction across traits
is handled by :func:`~pycmplot.io.get_sumstats_and_merged_sector_list`.
"""

from __future__ import annotations

import numpy as np
import pandas as pd


[docs] def get_lead_snps( df: pd.DataFrame, signif_threshold: float = 5e-8, logp: bool = False, window: int = 500_000, ) -> pd.DataFrame: """Identify independent lead SNPs by greedy distance-based clumping. Starting from the most significant variant, each subsequent variant is retained as a new lead only if it lies more than *window* base-pairs from all previously accepted leads on the same chromosome. Parameters ---------- df : pandas.DataFrame Summary statistics with canonical columns ``CHR``, ``POS``, ``P``. When *logp* is ``True``, a ``logP`` column (–log₁₀(P)) must also be present. signif_threshold : float, optional Significance cutoff. When *logp* is ``False``, variants with ``P > signif_threshold`` are excluded; when *logp* is ``True``, variants with ``logP < -log10(signif_threshold)`` are excluded. Default is ``5e-8``. logp : bool, optional If ``True``, filter and rank by the ``logP`` column (descending) instead of ``P`` (ascending). Default is ``False``. window : int, optional Clumping window half-width in base-pairs. A candidate SNP is excluded if it falls within *window* bp of any already-accepted lead on the same chromosome. Default is ``500_000`` (500 kb). Returns ------- pandas.DataFrame Subset of *df* containing only the lead SNPs, one row per independent locus, in the order they were selected (most significant first within each chromosome). Notes ----- This is a **distance-only** approach; it does not use linkage disequilibrium information. Users requiring LD-based clumping should post-process the returned table with PLINK or a dedicated LD-clumping tool. See Also -------- get_highlight_snps : Returns all variants within locus windows and adds an ``in_locus`` flag column. Examples -------- >>> from pycmplot.stats import get_lead_snps >>> leads = get_lead_snps(df, signif_threshold=5e-8, logp=True, window=500_000) >>> leads[["SNP", "CHR", "POS", "P"]].head() SNP CHR POS P 0 rs123456 2 60718043 1.20e-120 1 rs789012 11 5246696 3.40e-85 """ if logp: thresh = -np.log10(float(signif_threshold)) sig = df[df["logP"] >= thresh].copy() p_col = "logP" ascending = False else: sig = df[df["P"] <= signif_threshold].copy() p_col = "P" ascending = True sig = sig.sort_values(p_col, ascending=ascending) leads: list[pd.Series] = [] while not sig.empty: top = sig.iloc[0] leads.append(top) sig = sig[ ~( (sig["CHR"] == top["CHR"]) & (abs(sig["POS"] - top["POS"]) <= window) ) ] return pd.DataFrame(leads)
[docs] def get_highlight_snps( df: pd.DataFrame, highlight: bool = False, highlight_thresh: float = 5e-8, logp: bool = False, window: int = 500_000, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Mark all variants within *window* bp of a lead SNP. Calls :func:`get_lead_snps` to identify independent loci, then sets an ``in_locus`` boolean flag on every variant whose chromosomal position falls within ±*window* bp of any lead SNP on the same chromosome. Parameters ---------- df : pandas.DataFrame Summary statistics with canonical columns ``CHR``, ``POS``, ``P`` (and ``logP`` when *logp* is ``True``). highlight_thresh : float, optional Significance threshold passed to :func:`get_lead_snps`. Default is ``5e-8``. logp : bool, optional If ``True``, use the ``logP`` column for thresholding and ranking. Default is ``False``. window : int, optional Half-width of the locus window in base-pairs. Defaults to ``500_000`` (500 kb). Returns ------- df_annotated : pandas.DataFrame A copy of *df* with an additional boolean column ``in_locus``. Variants inside at least one locus window have ``in_locus = True``. leads_df : pandas.DataFrame The lead-SNP DataFrame returned by :func:`get_lead_snps`. See Also -------- get_lead_snps : Used internally to identify independent loci. Examples -------- >>> from pycmplot.stats import get_highlight_snps >>> df_ann, leads = get_highlight_snps(df, highlight_thresh=5e-8) >>> df_ann["in_locus"].sum() 1842 """ df = df.copy() leads_df = get_lead_snps( df=df, signif_threshold=highlight_thresh, logp=logp, window=window, ) if highlight: df["in_locus"] = False for _, row in leads_df.iterrows(): min_pos = row["POS"] - window max_pos = row["POS"] + window chrom = row["CHR"] mask = (df["CHR"] == chrom) & (df["POS"] >= min_pos) & (df["POS"] <= max_pos) df.loc[mask, "in_locus"] = True return df, leads_df