Source code for indra_cogex.sources.interpro

# -*- coding: utf-8 -*-

"""Processor for the InterPro database.

This was added in https://github.com/bgyori/indra_cogex/pull/125.

.. seealso:: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/
"""

import gzip
import logging
from collections import defaultdict
from pathlib import Path
from typing import Iterable, Mapping, Optional, Set, Tuple

import pandas as pd
import pystow
import requests
from protmapper import uniprot_client
from tqdm import tqdm

from ..processor import Processor
from ...representation import Node, Relation

__all__ = [
    "InterproProcessor",
]

logger = logging.getLogger(__name__)

INTERPRO_ENTRIES_URL = (
    "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list"
)
INTERPRO_SHORT_NAMES_URL = (
    "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/short_names.dat"
)
INTERPRO_TREE_URL = (
    "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/ParentChildTreeFile.txt"
)
INTERPRO_PROTEINS_URL = (
    "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz"
)
INTERPRO_GO_URL = (
    "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro2go"
)


[docs]class InterproProcessor(Processor): """Processor for Interpro.""" name = "interpro" node_types = ["BioEntity"] def __init__(self, force: bool = False): """Initialize the InterPro processor.""" self.version = get_current_version() self.entries_df = get_entries_df(module=self.module) self.interpro_ids = set(self.entries_df["ENTRY_AC"]) self.parents = get_parent_to_children(force=force, module=self.module) self.interpro_to_goa = get_interpro_to_goa(force=force, module=self.module) interpro_to_proteins = get_interpro_to_proteins( force=force, interpro_ids=self.interpro_ids, module=self.module ) interpro_to_genes = defaultdict(set) for interpro_id, uniprot_ids in interpro_to_proteins.items(): for uniprot_id, start, end in uniprot_ids: hgnc_id = uniprot_client.get_hgnc_id(uniprot_id) if hgnc_id is not None: # there are a lot of TrEMBL entries, these will return none interpro_to_genes[interpro_id].add((hgnc_id, start, end)) self.interpro_to_genes = dict(interpro_to_genes)
[docs] def get_nodes(self): # noqa:D102 unique_go = set() unique_hgnc = set() for interpro_id, _type, name, short_name in self.entries_df.values: yield Node( "IP", interpro_id, ["BioEntity"], dict(name=name, short_name=short_name, version=self.version), ) unique_go.update(self.interpro_to_goa.get(interpro_id, set())) unique_hgnc.update( hgnc_id for hgnc_id, _, _ in self.interpro_to_genes.get(interpro_id, set()) ) for go_id in sorted(unique_go): yield Node("GO", go_id, ["BioEntity"]) for hgnc_id in sorted(unique_hgnc, key=int): yield Node("HGNC", hgnc_id, ["BioEntity"])
[docs] def get_relations(self): # noqa:D102 for interpro_id in sorted(self.interpro_ids): for child_interpro_id in sorted(self.parents.get(interpro_id, set())): yield Relation( "IP", child_interpro_id, "IP", interpro_id, "isa" ) for go_id in sorted(self.interpro_to_goa.get(interpro_id, [])): yield Relation("IP", interpro_id, "GO", go_id, "associated_with") for hgnc_id, start, end in sorted( self.interpro_to_genes.get(interpro_id, []), key=lambda t: int(t[0]) ): yield Relation( "HGNC", hgnc_id, "IP", interpro_id, "has_domain", {"start:int": start, "end:int": end, "version": self.version}, )
def get_entries_df(*, force: bool = False, module: pystow.Module) -> pd.DataFrame: """Get a dataframe of InterPro entries, filtered to domains.""" short_names_df = module.ensure_csv( url=INTERPRO_SHORT_NAMES_URL, read_csv_kwargs=dict( header=None, names=("ENTRY_AC", "ENTRY_SHORT_NAME"), ), force=force, ) df = module.ensure_csv( url=INTERPRO_ENTRIES_URL, read_csv_kwargs=dict( skiprows=1, names=("ENTRY_AC", "ENTRY_TYPE", "ENTRY_NAME"), ), force=force, ) # Filter to entry types that represent domains df = df[df["ENTRY_TYPE"] == "Domain"] df = df.merge(short_names_df, on="ENTRY_AC", how="left") return df def get_parent_to_children( *, force: bool = False, module: pystow.Module ) -> Mapping[str, Set[str]]: """The a mapping from parent InterPro ID to list of children InterPro IDs.""" path = module.ensure(url=INTERPRO_TREE_URL, force=force) with open(path) as file: return _help_parent_to_children(file) def _help_parent_to_children(lines: Iterable[str]) -> Mapping[str, Set[str]]: lines = [line.strip().split("::", 1)[0] for line in lines] parent_to_children = defaultdict(set) for parent_id, child_id in _unroll_tree(_format_tree(lines)): parent_to_children[parent_id].add(child_id) return dict(parent_to_children) def _unroll_tree(d): for parent, subdict in d.items(): for child, subsubdict in subdict.items(): yield parent, child yield from _unroll_tree(subdict) def _format_tree(lines): return { parent: _format_tree(child_lines) for parent, child_lines in descend(lines).items() } def descend(lines): if not lines: return {} groups = {} # Assume the first one is always a root. parent, *lines = lines group = [] for line in lines: if line.startswith("--"): group.append(line[2:]) else: # save the previous group groups[parent] = group # set a new parent parent = line group = [] # don't forget about the last one groups[parent] = group return groups def _count_leading_dashes(s: str) -> int: """Count the number of leading dashes on a string.""" for position, element in enumerate(s): if element != "-": return position raise ValueError def get_interpro_to_proteins( *, force: bool = False, interpro_ids, module: pystow.Module ) -> Mapping[str, Set[Tuple[str, int, int]]]: """Get a mapping from InterPro identifiers to a set of UniProt identifiers.""" cache_path = module.join(name="protein2ipr_human.tsv") if cache_path.is_file(): rv = defaultdict(set) with cache_path.open() as file: for line in file: interpro_id, uniprot_id, start, end = line.strip().split("\t") rv[interpro_id].add((uniprot_id, int(start), int(end))) return dict(rv) path = module.ensure(url=INTERPRO_PROTEINS_URL, force=force) with gzip.open(path, "rt") as file: rv = _read_ipr2protein(file, interpro_ids) with cache_path.open("w") as file: for interpro_id, uniprot_ids in tqdm( sorted(rv.items()), unit_scale=True, desc="Writing human subset", ): for uniprot_id, start, end in sorted(uniprot_ids): print(interpro_id, uniprot_id, start, end, sep="\t", file=file) return rv def _read_ipr2protein(file, interpro_ids): interpro_to_uniprots = defaultdict(set) for line in tqdm( file, unit_scale=True, unit="line", desc="Processing ipr2protein", total=1_216_508_710, ): uniprot_id, interpro_id, _name, _xref, start, end = line.split("\t") if interpro_id not in interpro_ids: continue if uniprot_client.is_human(uniprot_id): interpro_to_uniprots[interpro_id].add((uniprot_id, int(start), int(end))) return dict(interpro_to_uniprots) def get_interpro_to_goa( *, force: bool = False, module: Optional[pystow.Module] = None, path: Optional[Path] = None, ) -> Mapping[str, Set[str]]: """Get a mapping from InterPro identifiers to sets of GO id/name pairs..""" if path is None: if module is None: raise ValueError path = module.ensure(url=INTERPRO_GO_URL, name="interpro2go.tsv", force=force) interpro_to_go_annotations = defaultdict(set) with path.open() as file: for line in file: line = line.strip() if line[0] == "!": continue interpro_id, go_id = process_go_mapping_line(line) interpro_to_go_annotations[interpro_id].add(go_id) return dict(interpro_to_go_annotations) def process_go_mapping_line(line: str) -> Tuple[str, str]: """Process a GO mapping file line. Example lines: .. code-block:: !date: 2022/10/05 11:07:08 !Mapping of InterPro entries to GO !external resource: http://www.ebi.ac.uk/interpro !citation: Blum et al. (2021) Nucl. Acids Res. 49:D344–D354 !contact:interhelp@ebi.ac.uk! InterPro:IPR000003 Retinoid X receptor/HNF4 > GO:DNA binding ; GO:0003677 InterPro:IPR000003 Retinoid X receptor/HNF4 > GO:nuclear steroid receptor activity ; GO:0003707 """ line = line[len("InterPro:") :] line, go_id = (part.strip() for part in line.rsplit(";", 1)) line, _go_name = (part.strip() for part in line.rsplit(">", 1)) interpro_id, _interpro_name = (part.strip() for part in line.split(" ", 1)) return interpro_id, go_id def get_current_version() -> str: """Get the latest version of InterPro.""" res = requests.get( "https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt" ) release_lines = [ line.strip() for line in res.text.splitlines() if line.startswith("Release ") ] # pick the second line line = release_lines[1] return line[len("Release ")].split(",", 1)[0]