Source code for indra_cogex.sources.clinicaltrials

"""This module implements input for ClinicalTrials.gov data

NOTE: ClinicalTrials.gov are working on a more modern API that is currently
in Beta: https://beta.clinicaltrials.gov/data-about-studies/learn-about-api
Once this API is released, we should switch to using it. The instructions for
using the current/old API are below.

To obtain the custom download for ingest, do the following

1. Go to https://clinicaltrials.gov/api/gui/demo/simple_study_fields

2. Enter the following in the form:

expr=
fields=NCTId,BriefTitle,Condition,ConditionMeshTerm,ConditionMeshId,InterventionName,InterventionType,InterventionMeshTerm,InterventionMeshId
min_rnk=1
max_rnk=500000  # or any number larger than the current number of studies
fmt=csv

3. Send Request

4. Enter the captcha characters into the text box and then press enter
(make sure to use the enter key and not press any buttons).

5. The website will display "please wait… " for a couple of minutes, finally,
the Save to file button will be active.

6. Click the Save to file button to download the response as a txt file.

7. Rename the txt file to clinical_trials.csv and then compress it as
gzip clinical_trials.csv to get clinical_trials.csv.gz, then place
this file into <pystow home>/indra/cogex/clinicaltrials/
"""

import logging
from collections import Counter
from pathlib import Path
from typing import Union

import gilda
import pandas as pd
import pystow
import tqdm

from indra.databases import mesh_client
from indra_cogex.sources.processor import Processor
from indra_cogex.representation import Node, Relation


logger = logging.getLogger(__name__)


[docs]class ClinicaltrialsProcessor(Processor):
    name = "clinicaltrials"
    node_types = ["BioEntity", "ClinicalTrial"]

    def __init__(self, path: Union[str, Path, None] = None):
        default_path = pystow.join(
            "indra",
            "cogex",
            "clinicaltrials",
            name="clinical_trials.csv.gz",
        )

        if not path:
            path = default_path
        elif isinstance(path, str):
            path = Path(path)

        self.df = pd.read_csv(path, sep=",", skiprows=10)
        self.has_trial_cond_ns = []
        self.has_trial_cond_id = []
        self.has_trial_nct = []
        self.tested_in_int_ns = []
        self.tested_in_int_id = []
        self.tested_in_nct = []

        self.problematic_mesh_ids = []

    def ground_condition(self, condition):
        matches = gilda.ground(condition)
        matches = [
            match
            for match in matches
            if match.term.db in {"MESH", "DOID", "EFO", "HP", "GO"}
        ]
        if matches:
            return matches[0].term
        return None

    def ground_drug(self, drug):
        matches = gilda.ground(drug)
        if matches:
            return matches[0].term
        return None

[docs]    def get_nodes(self):
        for index, row in tqdm.tqdm(self.df.iterrows(), total=len(self.df)):
            found_disease_gilda = False
            for condition in str(row["Condition"]).split("|"):
                cond_term = self.ground_condition(condition)
                if cond_term:
                    self.has_trial_nct.append(row["NCTId"])
                    self.has_trial_cond_ns.append(cond_term.db)
                    self.has_trial_cond_id.append(cond_term.id)
                    yield Node(
                        db_ns=cond_term.db, db_id=cond_term.id, labels=["BioEntity"]
                    )
                    found_disease_gilda = True
            if not found_disease_gilda and not pd.isna(row["ConditionMeshId"]):
                for mesh_id, mesh_term in zip(row["ConditionMeshId"].split("|"),
                                              row["ConditionMeshTerm"].split("|")):
                    correct_mesh_id = get_correct_mesh_id(mesh_id, mesh_term)
                    if not correct_mesh_id:
                        self.problematic_mesh_ids.append((mesh_id, mesh_term))
                        continue
                    self.has_trial_nct.append(row["NCTId"])
                    self.has_trial_cond_ns.append("MESH")
                    self.has_trial_cond_id.append(correct_mesh_id)
                    yield Node(db_ns="MESH", db_id=correct_mesh_id, labels=["BioEntity"])

            # We first try grounding the names with Gilda, if any match, we
            # use it, if there are no matches, we go by provided MeSH ID
            found_drug_gilda = False
            for int_name, int_type in zip(
                str(row["InterventionName"]).split("|"),
                str(row["InterventionType"]).split("|"),
            ):
                if int_type == "Drug":
                    drug_term = self.ground_drug(int_name)
                    if drug_term:
                        self.tested_in_int_ns.append(drug_term.db)
                        self.tested_in_int_id.append(drug_term.id)
                        self.tested_in_nct.append(row["NCTId"])
                        yield Node(
                            db_ns=drug_term.db, db_id=drug_term.id, labels=["BioEntity"]
                        )
                        found_drug_gilda = True
            # If there is no Gilda much but there are some MeSH IDs given
            if not found_drug_gilda and not pd.isna(row["InterventionMeshId"]):
                for mesh_id, mesh_term in zip(row["InterventionMeshId"].split("|"),
                                              row["InterventionMeshTerm"].split("|")):
                    correct_mesh_id = get_correct_mesh_id(mesh_id, mesh_term)
                    if not correct_mesh_id:
                        self.problematic_mesh_ids.append((mesh_id, mesh_term))
                        continue
                    self.tested_in_int_ns.append("MESH")
                    self.tested_in_int_id.append(correct_mesh_id)
                    self.tested_in_nct.append(row["NCTId"])
                    yield Node(db_ns="MESH", db_id=correct_mesh_id, labels=["BioEntity"])

        for nctid in set(self.tested_in_nct) | set(self.has_trial_nct):
            yield Node(db_ns="CLINICALTRIALS", db_id=nctid, labels=["ClinicalTrial"])

        logger.info('Problematic MeSH IDs: %s' % str(
            Counter(self.problematic_mesh_ids).most_common()))

[docs]    def get_relations(self):
        added = set()
        for cond_ns, cond_id, target_id in zip(
            self.has_trial_cond_ns, self.has_trial_cond_id, self.has_trial_nct
        ):
            if (cond_ns, cond_id, target_id) in added:
                continue
            added.add((cond_ns, cond_id, target_id))
            yield Relation(
                source_ns=cond_ns,
                source_id=cond_id,
                target_ns="CLINICALTRIALS",
                target_id=target_id,
                rel_type="has_trial",
            )
        added = set()
        for int_ns, int_id, target_id in zip(
            self.tested_in_int_ns, self.tested_in_int_id, self.tested_in_nct
        ):
            if (int_ns, int_id, target_id) in added:
                continue
            added.add((int_ns, int_id, target_id))
            yield Relation(
                source_ns=int_ns,
                source_id=int_id,
                target_ns="CLINICALTRIALS",
                target_id=target_id,
                rel_type="tested_in",
            )


def get_correct_mesh_id(mesh_id, mesh_term=None):
    # A proxy for checking whether something is a valid MeSH term is
    # to look up its name
    if mesh_client.get_mesh_name(mesh_id, offline=True):
        return mesh_id
    # A common issue is with zero padding, where 9 digits are used
    # instead of the correct 6, and we can remove the extra zeros
    # to get a valid ID
    else:
        short_id = mesh_id[0] + mesh_id[4:]
        if mesh_client.get_mesh_name(short_id, offline=True):
            return short_id
    # Another pattern is one where the MeSH ID is simply invalid but the
    # corresponding MeSH term allows us to get a valid ID via reverse
    # ID lookup - done here as grounding just to not have to assume
    # perfect / up to date naming conventions in the source data.
    if mesh_term:
        matches = gilda.ground(mesh_term, namespaces=['MESH'])
        if len(matches) == 1:
            for k, v in matches[0].get_groundings():
                if k == 'MESH':
                    return v
    return None