# Source code for indra_cogex.sources.goa

# -*- coding: utf-8 -*-

"""Processor for the Gene Ontology Associations (GOA) database."""

import logging

import pandas as pd

from indra.databases import uniprot_client
from ..processor import Processor
from ...representation import Node, Relation

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Location of the gzipped human GO annotation (GAF) file.
GOA_URL = "http://geneontology.org/gene-associations/goa_human.gaf.gz"

# Evidence codes accepted when filtering annotations; rows carrying any other
# code are discarded by the loader.
EVIDENCE_CODES = {
    # Experimental evidence
    "EXP", "IDA", "IPI", "IMP", "IGI", "IEP",
    # High-throughput experimental evidence
    "HTP", "HDA", "HMP", "HGI", "HEP",
    # Phylogenetically inferred evidence
    "IBA", "IBD",
}


class GoaProcessor(Processor):
    """Processor for the Gene Ontology Associations (GOA) database.

    The annotation table is loaded once at construction time and kept in
    ``self.df``; nodes and relations are derived from its ``GO_ID``,
    ``HGNC_ID`` and ``EC`` columns.
    """

    name = "goa"
    # Annotation table produced by ``load_goa``.
    df: pd.DataFrame
    node_types = ["BioEntity"]

    def __init__(self):
        """Initialize the GOA processor by loading annotations from ``GOA_URL``."""
        self.df = load_goa(GOA_URL)

    def get_nodes(self):
        """Yield a node for each unique GO term, then for each unique HGNC gene.

        :return: A generator of ``Node`` objects labeled ``BioEntity``, in
            the ``GO`` namespace first and the ``HGNC`` namespace second.
        """
        for go_node in self.df["GO_ID"].unique():
            yield Node("GO", go_node, ["BioEntity"])
        for hgnc_id in self.df["HGNC_ID"].unique():
            yield Node("HGNC", hgnc_id, ["BioEntity"])

    def get_relations(self):
        """Yield one ``associated_with`` relation per (GO term, gene) pair.

        Rows are grouped by ``GO_ID`` and ``HGNC_ID``; the evidence codes of
        each group are de-duplicated, sorted and joined with commas so that
        each gene/term pair produces a single relation.

        :return: A generator of ``Relation`` objects pointing from the HGNC
            gene to the GO term.
        """
        rel_type = "associated_with"
        for (go_id, hgnc_id), ecs in self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]:
            # Collapse repeated annotations of the same pair into one
            # deterministic, comma-separated evidence code string.
            all_ecs = ",".join(sorted(set(ecs)))
            data = {"evidence_codes:string": all_ecs, "source": self.name}
            yield Relation("HGNC", hgnc_id, "GO", go_id, rel_type, data)


def load_goa(url: str) -> pd.DataFrame:
    """Get the Gene Ontology Annotations database as a dataframe.

    The file is read as a headerless, tab-separated table in which lines
    starting with ``!`` are treated as comments. Only four columns are kept
    (``UP_ID``, ``Qualifier``, ``GO_ID`` and ``EC``). Rows are dropped when
    ``UP_ID`` has no HGNC mapping, when the qualifier starts with ``NOT``,
    or when the evidence code is not in ``EVIDENCE_CODES``.

    :param url: The URL to the GOA database file.
    :return: The filtered GOA database as a dataframe with the columns
        ``UP_ID``, ``Qualifier``, ``GO_ID``, ``EC`` and ``HGNC_ID``.
    """
    logger.info("Loading GO annotations from %s", url)
    df = pd.read_csv(
        url,
        sep="\t",
        comment="!",
        dtype=str,
        header=None,
        usecols=[1, 3, 4, 6],
        names=[
            "UP_ID",
            "Qualifier",
            "GO_ID",
            "EC",
        ],
    )
    # Map the ID column directly instead of applying a lambda row by row;
    # the values are the same, but no per-row Series has to be built and an
    # empty table still yields a Series.
    df["HGNC_ID"] = df["UP_ID"].map(uniprot_client.get_hgnc_id)
    # Take an explicit copy of the filtered rows so that the assignment below
    # modifies this frame rather than a temporary derived from the original.
    df = df[df["HGNC_ID"].notna()].copy()
    # Assign the filled column back: a chained ``fillna(..., inplace=True)``
    # is not guaranteed to update ``df`` and would leave NaN qualifiers that
    # break the boolean mask below.
    df["Qualifier"] = df["Qualifier"].fillna("")
    df = df[~df["Qualifier"].str.startswith("NOT")]
    df = df[df["EC"].isin(EVIDENCE_CODES)]
    logger.info("Loaded %s rows", len(df))
    return df