Source code for indra_cogex.sources.cbioportal

import re
import tarfile
import logging

import pandas as pd

from pathlib import Path
from typing import Union

import gilda
import pystow
from indra.databases import hgnc_client

from indra_cogex.representation import Node, Relation
from indra_cogex.sources.processor import Processor

logger = logging.getLogger(__name__)


class CcleMutationsProcessor(Processor):
    """Processor that turns the CCLE mutation table into nodes and relations.

    The table is loaded into ``self.df`` with pandas, either from a
    user-supplied tab-separated file or from the
    ``ccle_broad_2019/data_mutations.txt`` member of the archive returned by
    ``get_data()``. ``#`` is passed to pandas as the comment character.
    """

    name = "ccle_mutations"
    node_types = ["BioEntity"]

    def __init__(
        self,
        path: Union[str, Path, None] = None,
    ):
        """Load the mutation table.

        Parameters
        ----------
        path :
            Location of a tab-separated mutations file. When falsy, the table
            is read out of the archive provided by ``get_data()`` instead.
        """
        if path:
            # Strings are promoted to Path objects; anything else is handed
            # to pandas unchanged.
            table_source = Path(path) if isinstance(path, str) else path
            self.df = pd.read_csv(table_source, sep="\t", comment="#")
            return

        archive_path = get_data()
        with tarfile.open(archive_path, "r") as archive:
            member_name = "ccle_broad_2019/data_mutations.txt"
            with archive.extractfile(member_name) as table_file:
                self.df = pd.read_csv(table_file, sep="\t", comment="#")

    def get_nodes(self):
        """Yield gene nodes followed by cell line nodes.

        Gene nodes are emitted in sorted order of the distinct values of the
        ``Hugo_Symbol`` column, skipping symbols for which
        ``hgnc_client.get_hgnc_id`` returns nothing. Cell line nodes are
        emitted in sorted order of the distinct ``Tumor_Sample_Barcode``
        values.
        """
        for symbol in sorted(set(self.df["Hugo_Symbol"])):
            gene_id = hgnc_client.get_hgnc_id(symbol)
            if gene_id:
                yield Node(db_ns="HGNC", db_id=gene_id, labels=["BioEntity"])

        for sample_id in sorted(set(self.df["Tumor_Sample_Barcode"])):
            yield Node(db_ns="CCLE", db_id=sample_id, labels=["BioEntity"])

    def get_relations(self):
        """Yield a ``mutated_in`` relation for each usable table row.

        Rows whose ``HGVSp_Short`` value is missing, or whose gene symbol
        cannot be mapped to an HGNC ID, are skipped.
        """
        for _, record in self.df.iterrows():
            protein_change = record["HGVSp_Short"]
            if pd.isna(protein_change):
                continue

            gene_id = hgnc_client.get_hgnc_id(record["Hugo_Symbol"])
            sample_id = record["Tumor_Sample_Barcode"]
            if not gene_id:
                continue

            yield Relation(
                source_ns="HGNC",
                source_id=gene_id,
                target_ns="CCLE",
                target_id=sample_id,
                rel_type="mutated_in",
                data={"HGVSp_Short": protein_change, "source": "ccle"},
            )
class CcleCnaProcessor(Processor):
    """Processor that turns the CCLE copy number table into nodes and relations.

    The table is loaded into ``self.df`` with pandas, either from a
    user-supplied tab-separated file or from the
    ``ccle_broad_2019/data_cna.txt`` member of the archive returned by
    ``get_data()``. The code reads gene symbols from the ``Hugo_Symbol``
    column and treats every column after the first one as a cell line.
    """

    name = "ccle_cna"
    node_types = ["BioEntity"]

    def __init__(
        self,
        path: Union[str, Path, None] = None,
    ):
        """Load the copy number table.

        Parameters
        ----------
        path :
            Location of a tab-separated copy number file. When falsy, the
            table is read out of the archive provided by ``get_data()``.
        """
        if not path:
            tar_path = get_data()
            with tarfile.open(tar_path, "r") as fh:
                with fh.extractfile("ccle_broad_2019/data_cna.txt") as fhh:
                    self.df = pd.read_csv(fhh, sep="\t")
        else:
            if isinstance(path, str):
                path = Path(path)
            self.df = pd.read_csv(path, sep="\t")

    def get_nodes(self):
        """Yield gene nodes followed by cell line nodes.

        Gene nodes come from the distinct ``Hugo_Symbol`` values (sorted),
        skipping symbols that ``hgnc_client.get_hgnc_id`` cannot map. Cell
        line nodes come from the sorted names of all columns after the first.
        """
        # Collect the distinct gene symbols of the copy number table
        for hgnc_symbol in sorted(set(self.df["Hugo_Symbol"])):
            hgnc_id = hgnc_client.get_hgnc_id(hgnc_symbol)
            if not hgnc_id:
                continue
            yield Node(db_ns="HGNC", db_id=hgnc_id, labels=["BioEntity"])

        for cell_line in sorted(set(self.df.columns.values[1:])):
            yield Node(db_ns="CCLE", db_id=cell_line, labels=["BioEntity"])

    def get_relations(self):
        """Yield a ``copy_number_altered_in`` relation per altered table cell.

        A cell is considered altered when its value is present and not equal
        to zero. Rows whose gene symbol cannot be mapped to an HGNC ID are
        skipped entirely.
        """
        # The set of cell line columns does not change between rows.
        cell_lines = self.df.columns.values[1:]
        for _, row in self.df.iterrows():
            hgnc_id = hgnc_client.get_hgnc_id(row["Hugo_Symbol"])
            if not hgnc_id:
                continue
            for cell_line in cell_lines:
                cna_value = row[cell_line]
                # NaN != 0 evaluates to True, so missing measurements have to
                # be excluded explicitly or they would be reported as
                # alterations carrying a NaN value.
                if pd.isna(cna_value) or cna_value == 0:
                    continue
                yield Relation(
                    source_ns="HGNC",
                    source_id=hgnc_id,
                    target_ns="CCLE",
                    target_id=cell_line,
                    rel_type="copy_number_altered_in",
                    data={"CNA:int": cna_value, "source": "ccle"},
                )
class CcleDrugResponseProcessor(Processor):
    """Processor that turns the CCLE drug IC50 table into nodes and relations.

    The table is loaded into ``self.df`` with pandas, either from a
    user-supplied tab-separated file or from the
    ``ccle_broad_2019/data_drug_treatment_ic50.txt`` member of the archive
    returned by ``get_data()``. The code reads the ``ENTITY_STABLE_ID``,
    ``NAME`` and ``DESCRIPTION`` columns and treats every column from the
    sixth one onwards as a cell line.
    """

    name = "ccle_drug"
    node_types = ["BioEntity"]

    # Patterns for paired entries such as "Afatinib 1/2" / "Afatinib 2/2",
    # and for the synonym list embedded in the description text.
    _FIRST_OF_PAIR_RE = re.compile(r"^(.+) 1/2$")
    _SECOND_OF_PAIR_RE = re.compile(r"^(.+) 2/2$")
    _SYNONYMS_RE = re.compile(r"Synonyms:(.+)")

    def __init__(self, path: Union[str, Path, None] = None):
        """Load the drug response table.

        Parameters
        ----------
        path :
            Location of a tab-separated IC50 file. When falsy, the table is
            read out of the archive provided by ``get_data()``.
        """
        if not path:
            tar_path = get_data()
            with tarfile.open(tar_path, "r") as fh:
                with fh.extractfile(
                    "ccle_broad_2019/data_drug_treatment_ic50.txt"
                ) as fhh:
                    self.df = pd.read_csv(fhh, sep="\t")
        else:
            if isinstance(path, str):
                path = Path(path)
            self.df = pd.read_csv(path, sep="\t")
        # Filled by get_drug_mappings(): stable ID -> (db_ns, db_id)
        self.drug_mappings = {}

    def get_nodes(self):
        """Yield grounded drug nodes followed by cell line nodes.

        Drug mappings are recomputed via ``get_drug_mappings()``. Each
        distinct grounding is yielded once, in first-seen order, even when
        several table rows ground to the same entity.
        """
        drugs = self.get_drug_mappings()
        seen = set()
        for db_ns, db_id in drugs.values():
            if not (db_ns and db_id) or (db_ns, db_id) in seen:
                continue
            seen.add((db_ns, db_id))
            yield Node(db_ns, db_id, labels=["BioEntity"])

        for cell_line in self.df.columns[5:]:
            yield Node("CCLE", cell_line, labels=["BioEntity"])

    def get_relations(self):
        """Yield ``sensitive_to`` relations from cell lines to grounded drugs.

        A relation is produced for every table cell that is present and
        strictly below the hard-coded cutoff of 10, provided the row's drug
        could be grounded.
        """
        # Build the mappings on demand so that this method also works when
        # get_nodes() has not been consumed beforehand.
        if not self.drug_mappings:
            self.get_drug_mappings()

        cell_lines = self.df.columns[5:]
        for _, row in self.df.iterrows():
            drug = row["ENTITY_STABLE_ID"]
            drug_ns, drug_id = self.drug_mappings.get(drug, (None, None))
            if not (drug_ns and drug_id):
                continue
            for cell_line in cell_lines:
                ic50 = row[cell_line]
                if pd.isna(ic50) or not ic50 < 10:
                    continue
                yield Relation(
                    "CCLE",
                    cell_line,
                    drug_ns,
                    drug_id,
                    rel_type="sensitive_to",
                    data={"IC50:float": ic50, "source": "ccle"},
                )

    def get_drug_mappings(self):
        """Ground every drug row and cache the result on the instance.

        Returns
        -------
        dict
            Maps each processed ``ENTITY_STABLE_ID`` to a ``(db_ns, db_id)``
            tuple, which is ``(None, None)`` when grounding failed. The same
            dict is stored as ``self.drug_mappings``.
        """
        self.drug_mappings = {}
        for _, row in self.df.iterrows():
            # We skip ones of the form "Afatinib 1/2" because we use the
            # corresponding "Afatinib 2/2" entries instead.
            if self._FIRST_OF_PAIR_RE.match(row["NAME"]):
                continue
            elif self._SECOND_OF_PAIR_RE.match(row["NAME"]):
                # Drop the final "-<suffix>" part of the stable ID
                to_ground = [row["ENTITY_STABLE_ID"].rsplit("-", 1)[0]]
            else:
                to_ground = [row["ENTITY_STABLE_ID"]]
            match = self._SYNONYMS_RE.search(row["DESCRIPTION"])
            if match:
                syns = match.groups()[0]
                # Strip first so that surrounding whitespace around the
                # "None" sentinel does not turn it into a name to ground.
                if syns.strip() != "None":
                    to_ground += [syn.strip() for syn in syns.split(",")]
            db_ns, db_id = self.ground_drug(to_ground)
            self.drug_mappings[row["ENTITY_STABLE_ID"]] = (db_ns, db_id)
        return self.drug_mappings

    def ground_drug(self, names):
        """Return the top Gilda grounding of the first name that matches.

        Parameters
        ----------
        names :
            Candidate names, tried in order.

        Returns
        -------
        tuple
            ``(db_ns, db_id)`` of the first match returned by
            ``gilda.ground`` for the first name with any match, or
            ``(None, None)`` if no name produced a match.
        """
        for name in names:
            matches = gilda.ground(name)
            if matches:
                db_ns, db_id = matches[0].term.db, matches[0].term.id
                return db_ns, db_id
        # Logger arguments are only formatted if the record is emitted.
        logger.info("Could not match %s", names)
        return None, None
def get_data():
    """Make sure the CCLE Broad 2019 archive is available through pystow.

    Returns
    -------
    :
        Whatever ``pystow.ensure`` returns for ``ccle_broad_2019.tar.gz``
        under the ``indra/cogex/cbioportal`` module; the processors in this
        file pass that value to ``tarfile.open``.
    """
    archive_name = "ccle_broad_2019.tar.gz"
    return pystow.ensure(
        "indra",
        "cogex",
        "cbioportal",
        name=archive_name,
        url="https://cbioportal-datahub.s3.amazonaws.com/ccle_broad_2019.tar.gz",
    )