# Source code for indra_cogex.sources.cbioportal

import re
import tarfile
import logging

import pandas as pd

from pathlib import Path
from typing import Union

import gilda
import pystow
from indra.databases import hgnc_client

from indra_cogex.representation import Node, Relation
from indra_cogex.sources.processor import Processor

logger = logging.getLogger(__name__)


class CcleMutationsProcessor(Processor):
    """Processor for the CCLE mutation table distributed through cBioPortal.

    The table is loaded into ``self.df``. Genes (``Hugo_Symbol`` column) and
    cell lines (``Tumor_Sample_Barcode`` column) are exposed as nodes, and
    each row carrying a protein-level change (``HGVSp_Short`` column) is
    exposed as a ``mutated_in`` relation from the gene to the cell line.
    """

    name = "ccle_mutations"
    node_types = ["BioEntity"]

    def __init__(
        self,
        path: Union[str, Path, None] = None,
    ):
        """Load the mutation table into ``self.df``.

        Parameters
        ----------
        path :
            Location of a tab-separated mutation file. If not given, the
            table is read from the ``ccle_broad_2019/data_mutations.txt``
            member of the archive returned by :func:`get_data`.
        """
        if not path:
            tar_path = get_data()
            with tarfile.open(tar_path, "r") as fh:
                with fh.extractfile("ccle_broad_2019/data_mutations.txt") as fhh:
                    # Lines starting with "#" are treated as comments.
                    self.df = pd.read_csv(fhh, sep="\t", comment="#")
        else:
            if isinstance(path, str):
                path = Path(path)
            self.df = pd.read_csv(path, sep="\t", comment="#")

    def get_nodes(self):
        """Yield one node per gene that maps to HGNC and one per cell line.

        Yields
        ------
        Node
            ``HGNC`` nodes for every gene symbol that
            ``hgnc_client.get_hgnc_id`` can resolve, followed by ``CCLE``
            nodes for every distinct sample barcode.
        """
        # Missing values are dropped before sorting: a NaN (float) mixed in
        # with string symbols would make sorted() raise a TypeError.
        for hgnc_symbol in sorted(set(self.df["Hugo_Symbol"].dropna())):
            hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
            if not hgnc_id:
                continue
            yield Node(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])

        for cell_line in sorted(set(self.df["Tumor_Sample_Barcode"].dropna())):
            yield Node(db_ns="CCLE", db_id=cell_line, labels=["BioEntity"])

    def get_relations(self):
        """Yield a ``mutated_in`` relation for each usable mutation row.

        Rows without an ``HGVSp_Short`` value, or whose gene symbol cannot be
        mapped to an HGNC ID, are skipped.

        Yields
        ------
        Relation
            Relation from the ``HGNC`` gene to the ``CCLE`` cell line, with
            the ``HGVSp_Short`` value and the source name as data.
        """
        # Only three columns are needed, so iterate over them directly
        # instead of materializing a full row Series per record.
        records = zip(
            self.df["Hugo_Symbol"],
            self.df["Tumor_Sample_Barcode"],
            self.df["HGVSp_Short"],
        )
        for hgnc_symbol, cell_line_id, hgvsp_short in records:
            if pd.isna(hgvsp_short):
                continue
            hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
            if not hgnc_id:
                continue
            yield Relation(
                source_ns="HGNC",
                source_id=hgnc_id,
                target_ns="CCLE",
                target_id=cell_line_id,
                rel_type="mutated_in",
                data={"HGVSp_Short": hgvsp_short, "source": "ccle"},
            )


class CcleCnaProcessor(Processor):
    """Processor for the CCLE copy-number alteration table from cBioPortal.

    The table is loaded into ``self.df``. The ``Hugo_Symbol`` column holds
    the gene and every column after the first is treated as a cell line.
    Non-zero entries are exposed as ``copy_number_altered_in`` relations.
    """

    name = "ccle_cna"
    node_types = ["BioEntity"]

    def __init__(
        self,
        path: Union[str, Path, None] = None,
    ):
        """Load the copy-number table into ``self.df``.

        Parameters
        ----------
        path :
            Location of a tab-separated copy-number file. If not given, the
            table is read from the ``ccle_broad_2019/data_cna.txt`` member
            of the archive returned by :func:`get_data`.
        """
        if not path:
            tar_path = get_data()
            with tarfile.open(tar_path, "r") as fh:
                with fh.extractfile("ccle_broad_2019/data_cna.txt") as fhh:
                    self.df = pd.read_csv(fhh, sep="\t")
        else:
            if isinstance(path, str):
                path = Path(path)
            self.df = pd.read_csv(path, sep="\t")

    def get_nodes(self):
        """Yield one node per gene that maps to HGNC and one per cell line.

        Yields
        ------
        Node
            ``HGNC`` nodes for every gene symbol that
            ``hgnc_client.get_hgnc_id`` can resolve, followed by ``CCLE``
            nodes for every column after the first.
        """
        # Missing symbols are dropped before sorting: a NaN (float) mixed in
        # with string symbols would make sorted() raise a TypeError.
        for hgnc_symbol in sorted(set(self.df["Hugo_Symbol"].dropna())):
            hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
            if not hgnc_id:
                continue
            yield Node(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])

        # NOTE(review): assumes the first column is the only non-cell-line
        # column in the table -- confirm against the data file layout.
        for cell_line in sorted(set(self.df.columns.values[1:])):
            yield Node(db_ns="CCLE", db_id=cell_line, labels=["BioEntity"])

    def get_relations(self):
        """Yield a relation for every non-zero, non-missing table entry.

        Rows whose gene symbol cannot be mapped to an HGNC ID are skipped.

        Yields
        ------
        Relation
            Relation from the ``HGNC`` gene to the ``CCLE`` cell line, with
            the table value and the source name as data.
        """
        cell_lines = self.df.columns.values[1:]
        for _, row in self.df.iterrows():
            hgnc_id = hgnc_client.get_hgnc_id(row["Hugo_Symbol"])
            if not hgnc_id:
                continue
            for cell_line in cell_lines:
                value = row[cell_line]
                # NaN compares unequal to 0, so missing calls have to be
                # excluded explicitly or they would be reported as
                # alterations with a NaN value.
                if pd.isna(value) or value == 0:
                    continue
                yield Relation(
                    source_ns="HGNC",
                    source_id=hgnc_id,
                    target_ns="CCLE",
                    target_id=cell_line,
                    rel_type="copy_number_altered_in",
                    data={"CNA:int": value, "source": "ccle"},
                )


class CcleDrugResponseProcessor(Processor):
    """Processor for the CCLE drug treatment IC50 table from cBioPortal.

    The table is loaded into ``self.df``. Each row describes a drug entry
    (``ENTITY_STABLE_ID``, ``NAME`` and ``DESCRIPTION`` columns are read) and
    the columns from index 5 onward are treated as cell lines. Drugs are
    grounded with Gilda, and cell lines with an IC50 below 10 are exposed as
    ``sensitive_to`` relations to the grounded drug.
    """

    name = "ccle_drug"
    node_types = ["BioEntity"]

    def __init__(self, path: Union[str, Path, None] = None):
        """Load the drug response table into ``self.df``.

        Parameters
        ----------
        path :
            Location of a tab-separated IC50 file. If not given, the table is
            read from the ``ccle_broad_2019/data_drug_treatment_ic50.txt``
            member of the archive returned by :func:`get_data`.
        """
        if not path:
            tar_path = get_data()
            with tarfile.open(tar_path, "r") as fh:
                with fh.extractfile(
                    "ccle_broad_2019/data_drug_treatment_ic50.txt"
                ) as fhh:
                    self.df = pd.read_csv(fhh, sep="\t")
        else:
            if isinstance(path, str):
                path = Path(path)
            self.df = pd.read_csv(path, sep="\t")
        # Maps ENTITY_STABLE_ID -> (db_ns, db_id); filled by
        # get_drug_mappings().
        self.drug_mappings = {}

    def get_nodes(self):
        """Yield one node per grounded drug and one per cell line.

        Yields
        ------
        Node
            A node for every distinct grounding produced by
            :meth:`get_drug_mappings`, followed by a ``CCLE`` node for every
            column from index 5 onward.
        """
        # Several table entries can ground to the same term; emit each
        # grounded drug only once, in first-seen order.
        seen = set()
        for db_ns, db_id in self.get_drug_mappings().values():
            if not (db_ns and db_id) or (db_ns, db_id) in seen:
                continue
            seen.add((db_ns, db_id))
            yield Node(db_ns, db_id, labels=["BioEntity"])

        # NOTE(review): assumes the first five columns hold drug metadata
        # rather than cell lines -- confirm against the data file layout.
        for cell_line in self.df.columns[5:]:
            yield Node("CCLE", cell_line, labels=["BioEntity"])

    def get_relations(self):
        """Yield ``sensitive_to`` relations from cell lines to drugs.

        A relation is produced for each grounded drug and cell line whose
        table value is present and below 10.

        Yields
        ------
        Relation
            Relation from the ``CCLE`` cell line to the grounded drug, with
            the IC50 value and the source name as data.
        """
        # The mappings are normally built by get_nodes(); build them here if
        # that has not happened, so this method does not silently yield
        # nothing when it is called first.
        if not self.drug_mappings:
            self.get_drug_mappings()

        cell_lines = self.df.columns[5:]
        for _, row in self.df.iterrows():
            drug = row["ENTITY_STABLE_ID"]
            drug_ns, drug_id = self.drug_mappings.get(drug, (None, None))
            if not (drug_ns and drug_id):
                continue
            for cell_line in cell_lines:
                ic50 = row[cell_line]
                # NOTE(review): the cutoff of 10 is presumably in the unit
                # used by the data file -- confirm.
                if not pd.isna(ic50) and ic50 < 10:
                    yield Relation(
                        "CCLE",
                        cell_line,
                        drug_ns,
                        drug_id,
                        rel_type="sensitive_to",
                        data={
                            "IC50:float": ic50,
                            "source": "ccle"
                        },
                    )

    def get_drug_mappings(self):
        """Ground every drug entry and cache the result.

        Returns
        -------
        dict
            Mapping from ``ENTITY_STABLE_ID`` to a ``(db_ns, db_id)`` tuple;
            both elements are None when no grounding was found. Entries whose
            name ends in " 1/2" are not included.
        """
        first_half = re.compile(r"^(.+) 1/2$")
        second_half = re.compile(r"^(.+) 2/2$")
        synonyms = re.compile(r"Synonyms:(.+)")

        self.drug_mappings = {}
        for _, row in self.df.iterrows():
            # We skip ones of the form "Afatinib 1/2" because we use the
            # corresponding "Afatinib 2/2" entries instead.
            if first_half.match(row["NAME"]):
                continue
            elif second_half.match(row["NAME"]):
                # Drop the trailing "-<suffix>" of the stable ID to get the
                # text to ground.
                to_ground = [row["ENTITY_STABLE_ID"].rsplit("-", 1)[0]]
            else:
                to_ground = [row["ENTITY_STABLE_ID"]]

            # A missing description is read as NaN (a float), which
            # re.search would reject; only search actual strings.
            description = row["DESCRIPTION"]
            match = (
                synonyms.search(description)
                if isinstance(description, str)
                else None
            )
            if match:
                syns = match.groups()[0]
                if syns != "None":
                    to_ground += [syn.strip() for syn in syns.split(",")]

            db_ns, db_id = self.ground_drug(to_ground)
            self.drug_mappings[row["ENTITY_STABLE_ID"]] = (db_ns, db_id)
        return self.drug_mappings

    def ground_drug(self, names):
        """Return the top Gilda grounding for the first name that matches.

        Parameters
        ----------
        names :
            Candidate names, tried in order.

        Returns
        -------
        tuple
            ``(db_ns, db_id)`` of the first match of the first name that
            Gilda grounds, or ``(None, None)`` if none of the names match.
        """
        for name in names:
            matches = gilda.ground(name)
            if matches:
                top_term = matches[0].term
                return top_term.db, top_term.id
        logger.info("Could not match %s", names)
        return None, None


def get_data():
    """Return the local path of the CCLE (Broad, 2019) cBioPortal archive.

    The lookup is delegated to ``pystow.ensure`` under the
    ``indra``/``cogex``/``cbioportal`` keys; pystow is expected to download
    the archive from the cBioPortal datahub when it is not available locally.

    Returns
    -------
    :
        Whatever ``pystow.ensure`` returns for the archive (used by callers
        as the argument to ``tarfile.open``).
    """
    archive_name = "ccle_broad_2019.tar.gz"
    return pystow.ensure(
        "indra",
        "cogex",
        "cbioportal",
        name=archive_name,
        url="https://cbioportal-datahub.s3.amazonaws.com/ccle_broad_2019.tar.gz",
    )