# -*- coding: utf-8 -*-
"""A collection of analyses possible on gene lists (of HGNC identifiers)."""
from typing import Collection, Iterable, List, Mapping, Optional, Set, Tuple
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests
from indra_cogex.client.enrichment.utils import (
get_entity_to_regulators,
get_entity_to_targets,
get_go,
get_phenotype_gene_sets,
get_reactome,
get_wikipathways,
)
from indra_cogex.client.neo4j_client import Neo4jClient, autoclient
from indra_cogex.client.queries import get_genes_for_go_term
__all__ = [
"go_ora",
"wikipathways_ora",
"reactome_ora",
"phenotype_ora",
"indra_downstream_ora",
"indra_upstream_ora",
"EXAMPLE_GENE_IDS",
]
# fmt: off
#: This example list comes from human genes associated with COVID-19
#: (https://bgee.org/?page=top_anat#/result/9bbddda9dea22c21edcada56ad552a35cb8e29a7/)
EXAMPLE_GENE_IDS = [
"613", "1116", "1119", "1697", "7067", "2537", "2734", "29517", "8568", "4910", "4931", "4932", "4962", "4983",
"18873", "5432", "5433", "5981", "16404", "5985", "18358", "6018", "6019", "6021", "6118", "6120", "6122",
"6148", "6374", "6378", "6395", "6727", "14374", "8004", "18669", "8912", "30306", "23785", "9253", "9788",
"10498", "10819", "6769", "11120", "11133", "11432", "11584", "18348", "11849", "28948", "11876", "11878",
"11985", "20820", "12647", "20593", "12713"
]
# fmt: on
def _prepare_hypergeometric_test(
query_set: Set[str],
target_set: Set[str],
universe_size: int,
) -> np.ndarray:
"""Prepare the matrix for hypergeometric test calculations.
Parameters
----------
query_set:
Input gene set to test against a target set (e.g., a pathway).
target_set:
The target gene set (e.g., a pathway).
universe_size:
Size of the background gene set.
Returns
-------
:
A 2x2 matrix used as input for Fisher's exact test.
"""
return np.array(
[
[
len(query_set.intersection(target_set)),
len(query_set.difference(target_set)),
],
[
len(target_set.difference(query_set)),
universe_size - len(target_set.union(query_set)),
],
]
)
@autoclient(cache=True)
def count_human_genes(*, client: Neo4jClient) -> int:
"""Count the number of HGNC genes in neo4j.
Parameters
----------
client :
Neo4jClient
Returns
-------
:
Number of HGNC genes
"""
query = """\
MATCH (n:BioEntity)
WHERE n.id STARTS WITH 'hgnc'
AND NOT n.obsolete
RETURN count(n) as count
"""
results = client.query_tx(query)
if results is None:
raise ValueError
return results[0][0]
def gene_ontology_single_ora(
client: Neo4jClient,
go_term: Tuple[str, str],
gene_ids: List[str],
) -> float:
"""Get the *p*-value for the Fisher exact test a given GO term.
1. Look up genes associated with GO term or child terms
2. Run ORA and return results
Parameters
----------
client :
Neo4jClient
go_term :
GO term to test
gene_ids :
List of HGNC gene identifiers
Returns
-------
:
p-value
"""
count = count_human_genes(client=client)
go_gene_ids = {
gene.db_id
for gene in get_genes_for_go_term(
client=client, go_term=go_term, include_indirect=True
)
}
table = _prepare_hypergeometric_test(
query_set=set(gene_ids),
target_set=go_gene_ids,
universe_size=count,
)
return fisher_exact(table, alternative="greater")[1]
def _do_ora(
curie_to_target_sets: Mapping[Tuple[str, str], Set[str]],
query: Iterable[str],
count: int,
method: Optional[str] = "fdr_bh",
alpha: Optional[float] = None,
keep_insignificant: bool = True,
) -> pd.DataFrame:
if alpha is None:
alpha = 0.05
query_set = set(query)
rows = []
for (curie, name), target_set in curie_to_target_sets.items():
table = _prepare_hypergeometric_test(
query_set=query_set,
target_set=target_set,
universe_size=count,
)
_, pvalue = fisher_exact(table, alternative="greater")
rows.append((curie, name, pvalue))
df = pd.DataFrame(rows, columns=["curie", "name", "p"]).sort_values(
"p", ascending=True
)
df["mlp"] = -np.log10(df["p"])
if method:
correction_results = multipletests(
df["p"],
method=method,
is_sorted=True,
alpha=alpha,
)
df["q"] = correction_results[1]
df["mlq"] = -np.log10(df["q"])
df = df.sort_values("q", ascending=True)
if not keep_insignificant:
df = df[df["q"] < alpha]
return df
[docs]
def go_ora(
client: Neo4jClient,
gene_ids: Iterable[str],
background_gene_ids: Optional[Collection[str]] = None,
**kwargs,
) -> pd.DataFrame:
"""Calculate over-representation on all GO terms.
Parameters
----------
client :
Neo4jClient
gene_ids :
List of HGNC gene identifiers
background_gene_ids :
List of HGNC gene identifiers for the background gene set. If not
given, all genes with HGNC IDs are used as the background.
**kwargs :
Additional keyword arguments to pass to _do_ora
Returns
-------
:
DataFrame with columns:
curie, name, p, q, mlp, mlq
"""
count = (
count_human_genes(client=client)
if not background_gene_ids
else len(background_gene_ids)
)
return _do_ora(get_go(client=client), query=gene_ids, count=count, **kwargs)
[docs]
def wikipathways_ora(
client: Neo4jClient,
gene_ids: Iterable[str],
background_gene_ids: Optional[Collection[str]] = None,
**kwargs,
) -> pd.DataFrame:
"""Calculate over-representation on all WikiPathway pathways.
Parameters
----------
client :
Neo4jClient
gene_ids :
List of HGNC gene identifiers
background_gene_ids :
List of HGNC gene identifiers for the background gene set. If not
given, all genes with HGNC IDs are used as the background.
**kwargs :
Additional keyword arguments to pass to _do_ora
Returns
-------
:
DataFrame with columns:
curie, name, p, q, mlp, mlq
"""
count = (
count_human_genes(client=client)
if not background_gene_ids
else len(background_gene_ids)
)
return _do_ora(
get_wikipathways(client=client), query=gene_ids, count=count, **kwargs
)
[docs]
def reactome_ora(
client: Neo4jClient,
gene_ids: Iterable[str],
background_gene_ids: Optional[Collection[str]] = None,
**kwargs,
) -> pd.DataFrame:
"""Calculate over-representation on all Reactome pathways.
Parameters
----------
client :
Neo4jClient
gene_ids :
List of HGNC gene identifiers
background_gene_ids :
List of HGNC gene identifiers for the background gene set. If not
given, all genes with HGNC IDs are used as the background.
**kwargs :
Additional keyword arguments to pass to _do_ora
Returns
-------
:
DataFrame with columns:
curie, name, p, q, mlp, mlq
"""
count = (
count_human_genes(client=client)
if not background_gene_ids
else len(background_gene_ids)
)
return _do_ora(get_reactome(client=client), query=gene_ids, count=count, **kwargs)
[docs]
@autoclient()
def phenotype_ora(
gene_ids: Iterable[str],
background_gene_ids: Optional[Collection[str]] = None,
*,
client: Neo4jClient,
**kwargs,
) -> pd.DataFrame:
"""Calculate over-representation on all HP phenotypes.
Parameters
----------
gene_ids :
List of HGNC gene identifiers
background_gene_ids :
List of HGNC gene identifiers for the background gene set. If not
given, all genes with HGNC IDs are used as the background.
client :
Neo4jClient
**kwargs :
Additional keyword arguments to pass to _do_ora
Returns
-------
:
DataFrame with columns:
curie, name, p, q, mlp, mlq
"""
count = (
count_human_genes(client=client)
if not background_gene_ids
else len(background_gene_ids)
)
return _do_ora(
get_phenotype_gene_sets(client=client), query=gene_ids, count=count, **kwargs
)
[docs]
def indra_downstream_ora(
client: Neo4jClient,
gene_ids: Iterable[str],
background_gene_ids: Optional[Collection[str]] = None,
*,
minimum_evidence_count: Optional[int] = 1,
minimum_belief: Optional[float] = 0.0,
**kwargs,
) -> pd.DataFrame:
"""
Calculate a p-value for each entity in the INDRA database
based on the genes that are causally upstream of it and how
they compare to the query gene set.
Parameters
----------
client :
Neo4jClient
gene_ids :
List of HGNC gene identifiers
background_gene_ids :
List of HGNC gene identifiers for the background gene set. If not
given, all genes with HGNC IDs are used as the background.
minimum_evidence_count :
Minimum number of evidences to consider a causal relationship
minimum_belief :
Minimum belief to consider a causal relationship
**kwargs :
Additional keyword arguments to pass to _do_ora
Returns
-------
:
DataFrame with columns:
curie, name, p, q, mlp, mlq
"""
count = (
count_human_genes(client=client)
if not background_gene_ids
else len(background_gene_ids)
)
return _do_ora(
get_entity_to_regulators(
client=client,
minimum_evidence_count=minimum_evidence_count,
minimum_belief=minimum_belief,
),
query=gene_ids,
count=count,
**kwargs,
)
[docs]
def indra_upstream_ora(
client: Neo4jClient,
gene_ids: Iterable[str],
background_gene_ids: Optional[Collection[str]] = None,
*,
minimum_evidence_count: Optional[int] = 1,
minimum_belief: Optional[float] = 0.0,
**kwargs,
) -> pd.DataFrame:
"""
Calculate a p-value for each entity in the INDRA database
based on the set of genes that it regulates and how
they compare to the query gene set.
Parameters
----------
client :
Neo4jClient
gene_ids :
List of HGNC gene identifiers
background_gene_ids :
List of HGNC gene identifiers for the background gene set. If not
given, all genes with HGNC IDs are used as the background.
minimum_evidence_count :
Minimum number of evidences to consider a causal relationship
minimum_belief :
Minimum belief to consider a causal relationship
**kwargs :
Additional keyword arguments to pass to _do_ora
Returns
-------
:
DataFrame with columns:
curie, name, p, q, mlp, mlq
"""
count = (
count_human_genes(client=client)
if not background_gene_ids
else len(background_gene_ids)
)
return _do_ora(
get_entity_to_targets(
client=client,
minimum_evidence_count=minimum_evidence_count,
minimum_belief=minimum_belief,
),
query=gene_ids,
count=count,
**kwargs,
)
def main():
client = Neo4jClient()
print("\nGO Enrichment\n")
print(
go_ora(client=client, gene_ids=EXAMPLE_GENE_IDS)
.head(15)
.to_markdown(index=False)
)
print("\n## WikiPathways Enrichment\n")
print(
wikipathways_ora(client=client, gene_ids=EXAMPLE_GENE_IDS)
.head(15)
.to_markdown(index=False)
)
print("\n## Reactome Enrichment\n")
print(
reactome_ora(client=client, gene_ids=EXAMPLE_GENE_IDS)
.head(15)
.to_markdown(index=False)
)
print("\n## Phenotype Enrichment\n")
print(
phenotype_ora(client=client, gene_ids=EXAMPLE_GENE_IDS)
.head(15)
.to_markdown(index=False)
)
print("\n## INDRA Upstream Enrichment\n")
print(
indra_upstream_ora(client=client, gene_ids=EXAMPLE_GENE_IDS)
.head(15)
.to_markdown(index=False)
)
print("\n## INDRA Downstream Enrichment\n")
print(
indra_downstream_ora(client=client, gene_ids=EXAMPLE_GENE_IDS)
.head(15)
.to_markdown(index=False)
)
if __name__ == "__main__":
main()