Source code for indra_cogex.sources.chembl

# -*- coding: utf-8 -*-

"""Processor for ChEMBL."""

import logging
from typing import Iterable, Optional

import bioversions
import chembl_downloader
from tqdm import tqdm

from indra_cogex.representation import Node, Relation
from indra_cogex.sources.processor import Processor

logger = logging.getLogger(__name__)

#: SQL for ChEMBL to get the distinct molecules (ChEMBL identifier and
#: preferred name) that have at least one row in the drug indication table.
#: This is a plain string, not an f-string: nothing is interpolated into it.
MOLECULE_SQL = """
SELECT DISTINCT
    MOLECULE_DICTIONARY.chembl_id,
    MOLECULE_DICTIONARY.pref_name
FROM MOLECULE_DICTIONARY
JOIN DRUG_INDICATION ON MOLECULE_DICTIONARY.molregno == DRUG_INDICATION.molregno
"""

#: SQL for ChEMBL to get indications: one row per join match between a
#: molecule and a drug indication, giving the molecule's ChEMBL identifier,
#: the indication's MeSH identifier, and the ``max_phase_for_ind`` column.
#: This is a plain string, not an f-string: nothing is interpolated into it.
SQL = """
SELECT
    MOLECULE_DICTIONARY.chembl_id,
    DRUG_INDICATION.mesh_id,
    DRUG_INDICATION.max_phase_for_ind
FROM MOLECULE_DICTIONARY
JOIN DRUG_INDICATION ON MOLECULE_DICTIONARY.molregno == DRUG_INDICATION.molregno
"""


class ChemblIndicationsProcessor(Processor):
    """A processor for ChEMBL indications.

    On construction, the processor queries ChEMBL for molecule/indication
    pairs and caches one standardized node per chemical and per indication.
    Nodes are then exposed through :meth:`get_nodes` and the
    ``has_indication`` relations through :meth:`get_relations`.
    """

    name = "chembl"
    node_types = ["BioEntity"]

    def __init__(self, version: Optional[str] = None):
        """Query ChEMBL and cache the chemical and indication nodes.

        Parameters
        ----------
        version :
            The ChEMBL version to query. If not given (or empty), the
            version reported by ``bioversions.get_version("chembl")`` is used.
        """
        self.version = version or bioversions.get_version("chembl")
        # Rows of (chembl_id, mesh_id, max_phase_for_ind), see ``SQL``.
        self.df = chembl_downloader.query(SQL, version=self.version)
        # Use the resolved ``self.version`` rather than the possibly-None
        # ``version`` argument so both queries target the same release.
        chemical_df = chembl_downloader.query(MOLECULE_SQL, version=self.version)

        # Map from ChEMBL identifier to the standardized chemical node.
        self.chemicals = {
            chembl_id: Node.standardized(
                db_ns="CHEMBL",
                db_id=chembl_id,
                name=chembl_name,
                labels=["BioEntity"],
            )
            for chembl_id, chembl_name in tqdm(
                chemical_df.values, unit_scale=True, desc="caching chemicals"
            )
        }
        # Map from MeSH identifier to the standardized indication node.
        self.indications = {
            mesh_id: Node.standardized(
                db_ns="MESH", db_id=mesh_id, labels=["BioEntity"]
            )
            for mesh_id in tqdm(
                self.df.mesh_id.unique(), unit_scale=True, desc="caching indications"
            )
        }

    def get_nodes(self) -> Iterable[Node]:
        """Iterate over ChEMBL chemicals and indications.

        Chemicals are yielded first, then indications. A node is skipped if
        another node with the same ``grounding()`` was already yielded.

        Yields
        ------
        :
            Each cached node whose grounding has not been seen before.
        """
        yielded = set()
        # Same order as before: all chemicals, then all indications, sharing
        # one set of seen groundings across both caches.
        for cache in (self.chemicals, self.indications):
            for node in cache.values():
                grnd = node.grounding()
                if grnd not in yielded:
                    yield node
                    yielded.add(grnd)

    def get_relations(self) -> Iterable[Relation]:
        """Iterate over ChEMBL indication annotations.

        Yields
        ------
        :
            One ``has_indication`` relation per row of the indications query,
            pointing from the cached chemical node to the cached indication
            node, annotated with the source name, the maximum phase for the
            indication, and the ChEMBL version.

        Raises
        ------
        KeyError
            If a row refers to a ChEMBL or MeSH identifier that is missing
            from the node caches built in ``__init__``.
        """
        for chembl_id, mesh_id, max_phase in self.df.values:
            chemical = self.chemicals[chembl_id]
            indication = self.indications[mesh_id]
            yield Relation(
                chemical.db_ns,
                chemical.db_id,
                indication.db_ns,
                indication.db_id,
                "has_indication",
                {
                    "source": self.name,
                    "max_phase:float": max_phase,
                    "version": self.version,
                },
            )