Source code for indra_cogex.client.enrichment.signed

# -*- coding: utf-8 -*-

"""A collection of analyses possible on pairs of gene lists (of HGNC identifiers)."""

from pathlib import Path
from textwrap import dedent
from typing import Iterable, Optional

import pandas as pd
import pystow
import scipy.stats

from indra_cogex.client.enrichment.utils import (
    get_negative_stmt_sets,
    get_positive_stmt_sets,
)
from indra_cogex.client.neo4j_client import Neo4jClient, autoclient

HERE = Path(__file__).parent.resolve()



[docs]
@autoclient()
def reverse_causal_reasoning(
    positive_hgnc_ids: Iterable[str],
    negative_hgnc_ids: Iterable[str],
    minimum_size: int = 4,
    alpha: Optional[float] = None,
    keep_insignificant: bool = True,
    *,
    client: Neo4jClient,
    minimum_evidence_count: Optional[int] = None,
    minimum_belief: Optional[float] = None,
) -> pd.DataFrame:
    """Implement the Reverse Causal Reasoning algorithm from
    :ref:`Catlett, N. L., et al. (2013) <ref-causal-reas-references>`.

    Parameters
    ----------
    client :
        A neo4j client
    positive_hgnc_ids :
        A list of positive-signed HGNC gene identifiers
        (e.g., up-regulated genes in a differential gene expression analysis)
    negative_hgnc_ids :
        A list of negative-signed HGNC gene identifiers
        (e.g., down-regulated genes in a differential gene expression analysis)
    minimum_size :
        The minimum number of entities marked as downstream
        of an entity for it to be usable as a hyp
    alpha :
        The cutoff for significance. Defaults to 0.05
    keep_insignificant :
        If false, removes results with a p value less than alpha.
    minimum_evidence_count :
        The minimum number of evidences for a relationship to count it as a regulator.
        Defaults to 1 (i.e., cutoff not applied).
    minimum_belief :
        The minimum belief for a relationship to count it as a regulator.
        Defaults to 0.0 (i.e., cutoff not applied).

    Returns
    -------
    :
        A pandas DataFrame with results for each entity in the graph database


    .. _ref-causal-reas-references:

    References
    ----------
    Catlett, N. L., *et al.* (2013): `Reverse causal reasoning: applying qualitative
    causal knowledge to the interpretation of high-throughput data
    <https://doi.org/10.1186/1471-2105-14-340>`_. BMC Bioinformatics, **14** (1), 340.
    """
    if alpha is None:
        alpha = 0.05
    positive_hgnc_ids = set(positive_hgnc_ids)
    negative_hgnc_ids = set(negative_hgnc_ids)
    database_positive = get_positive_stmt_sets(
        client=client,
        minimum_belief=minimum_belief,
        minimum_evidence_count=minimum_evidence_count,
    )
    database_negative = get_negative_stmt_sets(
        client=client,
        minimum_belief=minimum_belief,
        minimum_evidence_count=minimum_evidence_count,
    )
    entities = set(database_positive).union(database_negative)

    rows = []
    for entity in entities:
        entity_positive: set[str] = database_positive.get(entity, set())
        entity_negative: set[str] = database_negative.get(entity, set())
        if len(entity_positive) + len(entity_negative) < minimum_size:
            continue  # skip this hypothesis
        correct, incorrect, ambiguous = 0, 0, 0
        for hgnc_id in positive_hgnc_ids:
            if hgnc_id in entity_positive and hgnc_id in entity_negative:
                ambiguous += 1
            elif hgnc_id in entity_positive:
                correct += 1
            elif hgnc_id in entity_negative:
                incorrect += 1
            else:
                ambiguous += 1
        for hgnc_id in negative_hgnc_ids:
            if hgnc_id in entity_positive and hgnc_id in entity_negative:
                ambiguous += 1
            elif hgnc_id in entity_positive:
                incorrect += 1
            elif hgnc_id in entity_negative:
                correct += 1
            else:
                ambiguous += 1

        if correct + incorrect:
            res = scipy.stats.binomtest(
                correct, correct + incorrect, alternative="greater"
            )
            res_p = res.pvalue
            res_ambig = scipy.stats.binomtest(
                correct, correct + incorrect + ambiguous, alternative="greater"
            )
            res_ambig_p = res_ambig.pvalue
        else:
            res_p, res_ambig_p = None, None
        rows.append((*entity, correct, incorrect, ambiguous, res_p, res_ambig_p))

    df = pd.DataFrame(
        rows,
        columns=[
            "curie",
            "name",
            "correct",
            "incorrect",
            "ambiguous",
            "binom_pvalue",
            "binom_ambig_pvalue",
        ],
    ).sort_values("binom_pvalue")
    if not keep_insignificant:
        df = df[df["binom_pvalue"] < alpha]
    return df



# Examples taken as top 40 up and down
# genes from dz:135 in CREEDS (prostate cancer)
# fmt: off
EXAMPLE_POSITIVE_HGNC_IDS = [
    "10354", "4141", "1692", "11771", "4932", "12692", "6561", "3999",
    "20768", "10317", "5472", "10372", "12468", "132", "11253", "2198",
    "10304", "10383", "7406", "10401", "10388", "10386", "7028", "10410",
    "4933", "10333", "13312", "2705", "10336", "10610", "3189", "402",
    "11879", "8831", "10371", "2528", "17194", "12458", "11553", "11820",
]
EXAMPLE_NEGATIVE_HGNC_IDS = [
    "5471", "11763", "2192", "2001", "17389", "3972", "10312", "8556",
    "10404", "7035", "7166", "13429", "29213", "6564", "6502", "15476",
    "13347", "20766", "3214", "13388", "3996", "7541", "10417", "4910",
    "2527", "667", "10327", "1546", "6492", "7", "163", "3284", "3774",
    "12437", "8547", "6908", "3218", "10424", "10496", "1595",
]


# fmt: on



[docs]
def main():
    """Demonstrate signed gene list functions."""
    client = Neo4jClient()
    df = reverse_causal_reasoning(
        client=client,
        positive_hgnc_ids=EXAMPLE_POSITIVE_HGNC_IDS,
        negative_hgnc_ids=EXAMPLE_NEGATIVE_HGNC_IDS,
    )
    path = pystow.join("indra", "cogex", "demos", name="rcr_test.tsv")
    df.to_csv(path, sep="\t", index=False)



if __name__ == "__main__":
    main()