Source code for indra_cogex.sources.cellmarker

# -*- coding: utf-8 -*-

"""Processor for the CellMarker database.

.. seealso::

    - Website: http://xteam.xbio.top/CellMarker/
    - Publication: https://doi.org/10.1093/nar/gky900
"""

import logging
from typing import Optional

import pandas as pd
import pyobo
from indra.databases import hgnc_client
from indra.ontology.standardize import standardize_name_db_refs
from indra.statements.agent import get_grounding

from .processor import Processor
from ..representation import Node, Relation

__all__ = [
    "CellMarkerProcessor",
]

logger = logging.getLogger(__name__)

URL = "http://xteam.xbio.top/CellMarker/download/Human_cell_markers.txt"

#: These are manual groundings for cancer types,
#: in case we ever want to include them in the future
CANCER_GROUNDINGS = {
    "Brain Cancer": ("DOID", "DOID:1319"),
    "Vascular Tumour": ("DOID", "DOID:175"),
    "Natural Killer Cell Lymphoma": ("DOID", "DOID:0050743"),
    # Needed to map to less specific mesh term
    "Breast Cancer (circulating)": ("MESH", "D001943"),  # not so specific
    "Gastric Cancer (circulating)": ("MESH", "D013274"),
    "Melanoma (circulating)": ("MESH", "D008545"),
    "Non-small Cell Lung Cancer (circulating)": ("MESH", "D002289"),
    "Prostate Cancer (circulating)": ("MESH", "D011471"),
    "Testicular Germ Cell Tumor (circulating)": ("MESH", "C563236"),
    # Could not find something
    "Oesophageal Cancer": "",
    "B cell type MALT lymphoma": "",
}


[docs]class CellMarkerProcessor(Processor):
    """Processor for the CellMarker database."""

    name = "cellmarker"
    df: pd.DataFrame
    node_types = ["BioEntity"]
    rel_type = "has_marker"

    def __init__(self, df: Optional[pd.DataFrame] = None):
        """Initialize the CellMarker processor."""
        self.df = get_df() if df is None else df

[docs]    def get_nodes(self):  # noqa:D102
        """Get cell, tissue, and gene nodes."""
        for cl_id in sorted(self.df["cl"].unique()):
            yield Node.standardized(
                db_ns="CL",
                db_id=f"CL:{cl_id}",
                name=pyobo.get_name("cl", cl_id),
                labels=["BioEntity"],
            )
        for hgnc_id in sorted(self.df["hgnc"].unique()):
            yield Node.standardized(
                db_ns="HGNC",
                db_id=hgnc_id,
                name=hgnc_client.get_hgnc_name(hgnc_id),
                labels=["BioEntity"],
            )

[docs]    def get_relations(self):  # noqa:D102
        columns = ["cl", "hgnc", "uberon", "pubmed", "marker_resource"]
        for (cl, hgnc), sdf in self.df[columns].groupby(["cl", "hgnc"]):
            all_tissues = set()
            all_pubmeds = set()
            all_markers = set()
            for _, _, uberon, pubmed, marker in sdf.values:
                all_tissues.add(uberon)
                all_pubmeds.add(pubmed)
                all_markers.add(marker)

            data = {
                "pubmed:string[]": _join(all_pubmeds),
                "markers:string[]": _join(all_markers),
                "tissue_uberon_ids": _join(all_tissues),
            }
            yield Relation(
                "CL",
                f"CL:{cl}",
                "HGNC",
                hgnc,
                self.rel_type,
                data,
            )


def get_df(url: str = URL) -> pd.DataFrame:
    """Get the CellMarker dataframe."""
    df = pd.read_csv(url, sep="\t", dtype=str)

    # Remove redundant species type annotation since we're looking at human markers file
    del df["speciesType"]

    # Assert existence of and clean CL identifier
    df = df[df["CellOntologyID"].notna()]
    df["cl"] = df["CellOntologyID"].map(_get_obo_luid)
    del df["CellOntologyID"]
    del df["cellName"]

    # Assert existence of and clean UBERON identifier
    df = df[df["UberonOntologyID"].notna()]
    df["uberon"] = df["UberonOntologyID"].map(_get_obo_luid)
    del df["UberonOntologyID"]
    del df["tissueType"]

    # this is either "Normal cell" or "Cancer cell" and is redundant
    # with df["cancerType"]
    del df["cellType"]

    # Remove non-normal cell types
    df = df[df["cancerType"] == "Normal"]
    del df["cancerType"]

    # not enough information here
    del df["Company"]

    # Redundant of genes
    del df["proteinName"]
    del df["proteinID"]

    # df["cellMarker"] appears to be the un-normalized version o df["geneSymbol"]
    del df["cellMarker"]
    del df["geneSymbol"]

    # split comma-separated lists of entrez gene ids
    df["hgnc_ids"] = df["geneID"].map(_parse_ncbigenes)
    del df["geneID"]

    # take all of the cells that contain lists of HGNC
    # identifirs and make them their own rows
    df = df.explode(["hgnc_ids"])
    df = df.rename(
        columns={
            "PMID": "pubmed",
            "markerResource": "marker_resource",
            "hgnc_ids": "hgnc",
        }
    )
    # Remove any rows where the HGNC ID wasn't mapped to something
    df = df[df["hgnc"].notna()]
    return df


def _get_obo_luid(s: str) -> str:
    return s.split("_", 1)[1]


def _parse_ncbigenes(s):
    """Parse string containing comma-separated NCBIGene identifiers."""
    if pd.isna(s):
        return []
    ncbigene_ids = (ncbigene_id.strip() for ncbigene_id in s.strip().split(","))
    hgnc_ids = (
        hgnc_client.get_hgnc_from_entrez(ncbigene_id)
        for ncbigene_id in ncbigene_ids
    )
    return sorted(hgnc_id for hgnc_id in hgnc_ids if hgnc_id)


def ground_cancer_types(df: pd.DataFrame):
    """
    A function that takes the CellMarker dataframe
    and attempts to ground cancer types.
    """
    import gilda

    rows = []
    for x in sorted(df["cancerType"].unique()):
        if x == "Normal":
            rows.append((x, None, None, None))
            continue

        prefix, identifier = CANCER_GROUNDINGS.get(x) or (None, None)
        if prefix:
            name, db_xrefs = standardize_name_db_refs({prefix: identifier})
        else:
            scored_matches = gilda.ground(x)
            if not scored_matches:
                rows.append((x, None, None, None))
                continue
            prefix = scored_matches[0].term.db
            identifier = scored_matches[0].term.id

            name, db_xrefs = standardize_name_db_refs({prefix: identifier})
            if not db_xrefs:
                # print(f"No standardization for: {x} ({scored_matches[0].term.db}:{scored_matches[0].term.id})")
                rows.append((x, None, None, None))
                continue
        prefix, identifier = get_grounding(db_xrefs)
        rows.append((x, prefix, identifier, name))


def _join(collection):
    # joins data for Neo4j ingestion. In Neo4j 4.4 the separator is `;`.
    # See `https://neo4j.com/docs/operations-manual/4.4/tools/
    # neo4j-admin/neo4j-admin-import/#import-tool-header-format-properties`
    return ";".join(item for item in collection if item and pd.notna(item))


if __name__ == "__main__":
    CellMarkerProcessor.cli()