Source code for indra_cogex.sources.goa

# -*- coding: utf-8 -*-

"""Processor for the Gene Ontology Associations (GOA) database."""

import logging

import pandas as pd

from indra.databases import uniprot_client
from ..processor import Processor
from ...representation import Node, Relation

logger = logging.getLogger(__name__)

# Location of the gzipped human GO annotation file; GoaProcessor passes this
# URL to load_goa, which reads it as a tab-separated table.
GOA_URL = "http://geneontology.org/gene-associations/goa_human.gaf.gz"
# Evidence codes that load_goa keeps: annotation rows whose "EC" value is not
# in this set are dropped.
# NOTE(review): these look like the experimental, high-throughput and
# phylogenetic GO evidence codes -- confirm against the GO evidence code docs.
EVIDENCE_CODES = {
    "EXP",
    "IDA",
    "IPI",
    "IMP",
    "IGI",
    "IEP",
    "HTP",
    "HDA",
    "HMP",
    "HGI",
    "HEP",
    "IBA",
    "IBD",
}


class GoaProcessor(Processor):
    """Processor turning GOA gene-to-GO-term annotations into nodes and relations."""

    name = "goa"
    df: pd.DataFrame
    node_types = ["BioEntity"]

    def __init__(self):
        """Load the GOA annotation table from ``GOA_URL`` into ``self.df``."""
        self.df = load_goa(GOA_URL)

    def get_nodes(self):
        """Yield one node per unique GO ID, then one per unique HGNC ID."""
        # (node namespace, dataframe column) pairs, in the order they are emitted.
        namespace_columns = (("GO", "GO_ID"), ("HGNC", "HGNC_ID"))
        for namespace, column in namespace_columns:
            for identifier in self.df[column].unique():
                yield Node(namespace, identifier, ["BioEntity"])

    def get_relations(self):
        """Yield one ``associated_with`` relation per (GO ID, HGNC ID) pair.

        The evidence codes of all rows sharing a pair are de-duplicated,
        sorted and stored as a comma-separated string on the relation.
        """
        codes_by_pair = self.df.groupby(["GO_ID", "HGNC_ID"])["EC"]
        for pair, codes in codes_by_pair:
            go_id, hgnc_id = pair
            properties = {
                "evidence_codes:string": ",".join(sorted(set(codes))),
                "source": self.name,
            }
            yield Relation(
                "HGNC", hgnc_id, "GO", go_id, "associated_with", properties
            )
def load_goa(url: str) -> pd.DataFrame:
    """Get the Gene Ontology Annotations database as a dataframe.

    Rows are kept only if their UniProt ID maps to an HGNC ID, their
    qualifier does not start with ``NOT``, and their evidence code is one
    of ``EVIDENCE_CODES``.

    :param url: The URL to the GOA database file.
    :return: The GOA database as a dataframe with the columns ``UP_ID``,
        ``Qualifier``, ``GO_ID``, ``EC`` and ``HGNC_ID``.
    """
    logger.info("Loading GO annotations from %s", url)
    df = pd.read_csv(
        url,
        sep="\t",
        comment="!",
        dtype=str,
        header=None,
        usecols=[1, 3, 4, 6],
        names=[
            "UP_ID",
            "Qualifier",
            "GO_ID",
            "EC",
        ],
    )
    # Element-wise map over the single column that is needed; this avoids
    # materializing a row object per line and is well-defined on an empty
    # table.
    df["HGNC_ID"] = df["UP_ID"].map(uniprot_client.get_hgnc_id)
    # Take an explicit copy of the filtered frame so the column assignment
    # below writes to an independent object instead of relying on a chained
    # in-place update, which is a no-op under pandas Copy-on-Write.
    df = df[df["HGNC_ID"].notna()].copy()
    # Missing qualifiers become "" so the string test below yields a clean
    # boolean mask rather than NaN entries.
    df["Qualifier"] = df["Qualifier"].fillna("")
    # Drop negated annotations.
    df = df[~df["Qualifier"].str.startswith("NOT")]
    df = df[df["EC"].isin(EVIDENCE_CODES)]
    logger.info("Loaded %s rows", len(df))
    return df