# -*- coding: utf-8 -*-
"""Processor for ChEMBL."""
import logging
from typing import Iterable, Optional
import bioversions
import chembl_downloader
from tqdm import tqdm
from indra_cogex.representation import Node, Relation
from indra_cogex.sources.processor import Processor
logger = logging.getLogger(__name__)

# NOTE: the queries below are plain string literals (not f-strings) because
# nothing is interpolated into them. ``==`` is kept as written; SQLite accepts
# it as a synonym for ``=``.

#: SQL for ChEMBL to get molecules that have indications. The inner join
#: keeps only molecules with at least one DRUG_INDICATION row. Column order
#: matters: rows are unpacked positionally as (chembl_id, pref_name).
MOLECULE_SQL = """
SELECT DISTINCT
MOLECULE_DICTIONARY.chembl_id,
MOLECULE_DICTIONARY.pref_name
FROM MOLECULE_DICTIONARY
JOIN DRUG_INDICATION ON MOLECULE_DICTIONARY.molregno == DRUG_INDICATION.molregno
"""

#: SQL for ChEMBL to get indications. Column order matters: rows are
#: unpacked positionally as (chembl_id, mesh_id, max_phase_for_ind).
SQL = """
SELECT
MOLECULE_DICTIONARY.chembl_id,
DRUG_INDICATION.mesh_id,
DRUG_INDICATION.max_phase_for_ind
FROM MOLECULE_DICTIONARY
JOIN DRUG_INDICATION ON MOLECULE_DICTIONARY.molregno == DRUG_INDICATION.molregno
"""
class ChemblIndicationsProcessor(Processor):
    """A processor for ChEMBL indications.

    Queries ChEMBL for the molecules that have at least one row in
    ``DRUG_INDICATION`` and for the MeSH identifiers they are annotated
    with, caches a node for each, and emits one ``has_indication`` relation
    per molecule-indication row.

    Parameters
    ----------
    version :
        The ChEMBL version to query. If not given (or empty), the version
        returned by ``bioversions.get_version("chembl")`` is used.
    """

    # Identifier of this source; also written to each relation's "source".
    name = "chembl"
    # Labels of the nodes produced by this processor.
    node_types = ["BioEntity"]

    def __init__(self, version: Optional[str] = None):
        """Run both ChEMBL queries and cache the resulting nodes.

        Parameters
        ----------
        version :
            The ChEMBL version to query; resolved through ``bioversions``
            when omitted.
        """
        self.version = version or bioversions.get_version("chembl")
        # One row per (chembl_id, mesh_id, max_phase_for_ind) annotation.
        self.df = chembl_downloader.query(SQL, version=self.version)
        # Use the *resolved* version here too, so both queries are pinned to
        # the same ChEMBL release that is recorded on the relations. Passing
        # the raw ``version`` (possibly None) could let the two queries
        # resolve different releases and leave ``self.chemicals`` missing
        # keys that ``get_relations`` looks up.
        chemical_df = chembl_downloader.query(MOLECULE_SQL, version=self.version)
        # Chemical nodes keyed by ChEMBL identifier.
        self.chemicals = {
            chembl_id: Node.standardized(
                db_ns="CHEMBL",
                db_id=chembl_id,
                name=chembl_name,
                labels=["BioEntity"],
            )
            for chembl_id, chembl_name in tqdm(
                chemical_df.values, unit_scale=True, desc="caching chemicals"
            )
        }
        # Indication nodes keyed by MeSH identifier; no name is passed here.
        self.indications = {
            mesh_id: Node.standardized(
                db_ns="MESH", db_id=mesh_id, labels=["BioEntity"]
            )
            for mesh_id in tqdm(
                self.df.mesh_id.unique(), unit_scale=True, desc="caching indications"
            )
        }

    def get_nodes(self) -> Iterable[Node]:
        """Iterate over ChEMBL chemicals and indications.

        Yields
        ------
        :
            Every cached chemical node, then every cached indication node,
            skipping any node whose ``grounding()`` was already yielded.
        """
        yielded = set()
        # Chemicals first, then indications, sharing one "seen" set so that
        # de-duplication also applies across the two caches.
        for cache in (self.chemicals, self.indications):
            for node in cache.values():
                grnd = node.grounding()
                if grnd in yielded:
                    continue
                yielded.add(grnd)
                yield node

    def get_relations(self) -> Iterable[Relation]:
        """Iterate over ChEMBL indication annotations.

        Yields
        ------
        :
            One ``has_indication`` relation per row of the indications
            query, pointing from the chemical node to the indication node
            and carrying the source name, the row's maximum phase, and the
            ChEMBL version as properties.
        """
        # Rows are unpacked positionally, following the SELECT column order.
        for chembl_id, mesh_id, max_phase in self.df.values:
            # Go through the cached nodes so the relation endpoints use the
            # same namespace/identifier as the nodes from ``get_nodes``.
            chemical = self.chemicals[chembl_id]
            indication = self.indications[mesh_id]
            yield Relation(
                chemical.db_ns,
                chemical.db_id,
                indication.db_ns,
                indication.db_id,
                "has_indication",
                {
                    "source": self.name,
                    "max_phase:float": max_phase,
                    "version": self.version,
                },
            )