# Source code for gsnn.proc.bio


# NOTE: ``omnipath`` probes ``omnipathdb.org`` at package-import time, and
# ``pypath`` pulls in a similarly heavy graph of resources. Both are only
# needed inside the public helpers below, so we import them lazily (i.e.
# inside the function bodies). That way ``import gsnn.proc.bio`` (and the
# transitive ``import gsnn`` triggered by ``gsnn/__init__.py``) stays cheap
# and -- crucially -- silent on hosts with no egress to omnipathdb.org.
import re

import pandas as pd 
import numpy as np


_UNIPROT_RE = re.compile(
    r"^[OPQ][0-9][A-Z0-9]{3}[0-9]|^[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}$"
)

_PROTEIN_PREFIX = 'PROTEIN__'
_COMPLEX_PREFIX = 'COMPLEX__'
_EDGE_COLS = ('source', 'target', 'edge_type', 'source_uniprot', 'target_uniprot')
_DEDUP_COLS = ('source', 'target', 'edge_type')


def _is_complex_node(node):
    """Tell whether ``node`` names a multi-protein complex.

    A node counts as a complex when it is a string in the ``PROTEIN__``
    namespace whose remainder contains an underscore.  OmniPath builds
    complex names by joining the HGNC gene symbols of the members with
    ``'_'`` (the ``components_genesymbols`` field of its ``complexes``
    endpoint), e.g. ``PROTEIN__AEBP2_EED_EZH2_RBBP4_SUZ12`` for PRC2.

    Non-string input yields ``False``.
    """
    in_protein_namespace = isinstance(node, str) and node.startswith(_PROTEIN_PREFIX)
    return in_protein_namespace and '_' in node[len(_PROTEIN_PREFIX):]


def _expand_complex_members(node):
    """List the ``PROTEIN__<member>`` nodes that make up ``node``.

    For a complex node the underscore-joined symbols are split apart and
    each one is re-prefixed.  Anything else comes back untouched, wrapped
    in a one-element list.
    """
    if not _is_complex_node(node):
        return [node]
    symbols = node[len(_PROTEIN_PREFIX):].split('_')
    return [f'{_PROTEIN_PREFIX}{symbol}' for symbol in symbols]


def _node_uniprot_lookup(df, prefix):
    """Last-seen accession per prefixed node from either endpoint."""
    as_src = df.loc[df['source'].str.startswith(prefix, na=False)].groupby('source')['source_uniprot'].last()
    as_tgt = df.loc[df['target'].str.startswith(prefix, na=False)].groupby('target')['target_uniprot'].last()
    return as_src.combine_first(as_tgt)


def _build_func_nodes_df(func_df):
    """Build per-node metadata from a standardized edge table."""
    nodes = np.unique(func_df['src'].tolist() + func_df['dst'].tolist())
    from_src = func_df.groupby('src', sort=False)['source_uniprot'].last()
    from_dst = func_df.groupby('dst', sort=False)['target_uniprot'].last()
    uniprot = from_src.combine_first(from_dst)
    return pd.DataFrame({
        'func_name': nodes,
        'uniprot': [uniprot.get(n, pd.NA) for n in nodes],
        'gene_symbol': [n.split('__', 1)[1] if '__' in n else n for n in nodes],
    })


def build_uniprot_symbol_map(func_edges):
    """All (uniprot, gene_symbol, func_name) triples observed on any edge endpoint.

    Parameters
    ----------
    func_edges : pd.DataFrame
        The second return value of :func:`get_bio_interactions`. Must contain
        columns ``src``, ``dst``, ``source_uniprot``, ``target_uniprot``.
        The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Columns (in order): ``uniprot``, ``gene_symbol``, ``func_name``,
        ``node_kind`` (``'PROTEIN'`` or ``'RNA'``). One row per unique
        ``(uniprot, func_name)`` pair, sorted by ``func_name`` then
        ``uniprot``. Many-to-many: a single uniprot may appear with multiple
        func_names, and a single func_name may carry multiple uniprots.

    Notes
    -----
    * Only nodes in the ``PROTEIN__`` and ``RNA__`` namespaces are kept.
    * Excludes synthetic ``COMPLEX:...`` accessions emitted by the OmniPath
      complexes endpoint (they are not real UniProt IDs).
    * Excludes miRBase-style identifiers carried on miRNA edges (anything
      that does not look like a UniProt accession; see ``_UNIPROT_RE``).
    * Pure pandas; no network I/O.
    * The unique uniprots here are a strict superset of the well-formed
      UniProt accessions in ``func_nodes['uniprot']`` (i.e. those matching
      ``_UNIPROT_RE``), because ``func_nodes`` keeps only one accession per
      node via ``.last()`` while this table retains every distinct
      ``(uniprot, func_name)`` pair seen on any edge endpoint.
    """
    # Stack both endpoints into one long (func_name, uniprot) table.
    endpoint_frames = [
        func_edges.loc[:, [node_col, acc_col]].rename(
            columns={node_col: 'func_name', acc_col: 'uniprot'}
        )
        for node_col, acc_col in (('src', 'source_uniprot'), ('dst', 'target_uniprot'))
    ]
    pairs = pd.concat(endpoint_frames, ignore_index=True)
    pairs = pairs.dropna(subset=['func_name', 'uniprot'])

    # ``assign`` returns a fresh frame, so no column is ever written onto a
    # filtered slice (which is what raised SettingWithCopyWarning before).
    pairs = pairs.assign(
        func_name=pairs['func_name'].astype(str),
        uniprot=pairs['uniprot'].astype(str).str.strip(),
    )

    is_protein = pairs['func_name'].str.startswith('PROTEIN__')
    is_rna = pairs['func_name'].str.startswith('RNA__')
    looks_uniprot = pairs['uniprot'].str.match(_UNIPROT_RE)
    keep = (is_protein | is_rna) & looks_uniprot

    pairs = pairs.loc[keep].assign(
        node_kind=np.where(is_protein[keep], 'PROTEIN', 'RNA'),
        gene_symbol=lambda kept: kept['func_name'].str.split('__', n=1).str[1],
    )

    return (
        pairs[['uniprot', 'gene_symbol', 'func_name', 'node_kind']]
        .drop_duplicates()
        .sort_values(['func_name', 'uniprot'])
        .reset_index(drop=True)
    )


def _strip_alias_suffix(name):
    """Drop semicolon-separated alias names and keep only the first entry.

    Several miRNA-resource tables exposed through OmniPath carry the
    miRBase legacy "previous IDs" string in the gene-symbol columns, with
    multiple historical names of the same mature miRNA joined by
    ``';'`` (e.g. ``HSA-MIR-675B;HSA-MIR-675*``).  These are aliases of a
    single entity, not distinct molecules, so we collapse them down to
    the first listed name.
    """
    if not isinstance(name, str) or ';' not in name:
        return name
    if '__' in name:
        prefix, _, rest = name.partition('__')
        return f'{prefix}__{rest.split(";")[0]}'
    return name.split(';')[0]


def _filter_curation_quality(df, min_n_references=None, min_curation_effort=None,
                             dataset='', verbose=True):
    """Drop OmniPath edges below literature-support thresholds.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw interaction table returned by ``omnipath``.
    min_n_references : int or None
        Minimum number of PubMed references supporting an edge.
    min_curation_effort : int or None
        Minimum OmniPath curation-effort score for an edge.
    dataset : str
        Label used in verbose progress messages.
    verbose : bool
        Whether to print summary statistics.

    Returns
    -------
    pandas.DataFrame
    """
    if min_n_references is None and min_curation_effort is None:
        return df

    n_before = len(df)
    out = df

    if min_n_references is not None:
        if 'n_references' not in out.columns:
            if verbose:
                print(
                    f'\t{dataset}: no n_references column; dropping all {n_before} edges '
                    f'(min_n_references={min_n_references})'
                )
            return out.iloc[0:0].copy()
        out = out.loc[out['n_references'].fillna(0).ge(min_n_references)]

    if min_curation_effort is not None:
        if 'curation_effort' not in out.columns:
            if verbose:
                print(
                    f'\t{dataset}: no curation_effort column; dropping all {len(out)} edges '
                    f'(min_curation_effort={min_curation_effort})'
                )
            return out.iloc[0:0].copy()
        out = out.loc[out['curation_effort'].fillna(0).ge(min_curation_effort)]

    if verbose and len(out) < n_before:
        print(
            f'\t{dataset}: curation filter kept {len(out)}/{n_before} edges '
            f'(min_n_references={min_n_references}, min_curation_effort={min_curation_effort})'
        )

    return out.reset_index(drop=True)


def _standardize_interactions(df, src_name, tgt_name, src_prefix, tgt_prefix, edge_type,
                              min_n_references=None, min_curation_effort=None,
                              dataset='', verbose=True):
    """Turn a raw OmniPath table into the standard GSNN edge layout.

    The table is first passed through the curation filter.  The raw
    ``source``/``target`` accession columns are then renamed to
    ``source_uniprot``/``target_uniprot`` and new prefixed ``source`` /
    ``target`` node columns are built from ``src_name`` / ``tgt_name``.
    The result has exactly the ``_EDGE_COLS`` columns; if no edge survives
    the filter an empty frame with those columns is returned.
    """
    filtered = _filter_curation_quality(
        df, min_n_references, min_curation_effort, dataset=dataset, verbose=verbose,
    )
    if filtered.empty:
        return pd.DataFrame(columns=list(_EDGE_COLS))

    accession_names = {'source': 'source_uniprot', 'target': 'target_uniprot'}
    edges = filtered.rename(columns=accession_names).copy()

    # When node ids are the accessions themselves, follow the rename.
    src_col = 'source_uniprot' if src_name == 'source' else src_name
    tgt_col = 'target_uniprot' if tgt_name == 'target' else tgt_name

    edges['source'] = [src_prefix + value for value in edges[src_col]]
    edges['target'] = [tgt_prefix + value for value in edges[tgt_col]]
    edges['edge_type'] = edge_type
    return edges[list(_EDGE_COLS)]


def _apply_complex_handling(df, mode, verbose=True):
    """Apply the chosen protein-complex policy to an edge table.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge dataframe with columns ``_EDGE_COLS``.
    mode : str
        One of ``{'none', 'remove', 'expand', 'link'}``:

            * ``'none'``   - return ``df`` unchanged (legacy behaviour).
            * ``'remove'`` - drop every edge with a complex at either end.
            * ``'expand'`` - substitute each complex by its members, so a
              complex-level edge becomes one edge per member (cartesian
              product when both ends are complexes).  Member pairs that
              would form a self-loop are skipped.
            * ``'link'``   - move complex nodes into the ``COMPLEX__``
              namespace and append
              ``PROTEIN__<member> -> COMPLEX__<...>`` edges of type
              ``'assembly'``.
    verbose : bool, optional
        Whether to print summary statistics.

    Returns
    -------
    pandas.DataFrame
        For every mode except ``'none'`` the index is reset; ``'expand'``
        and ``'link'`` additionally collapse duplicates on ``_DEDUP_COLS``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the four supported values.
    """
    if mode == 'none':
        return df

    offset = len(_PROTEIN_PREFIX)
    complex_source = df['source'].apply(_is_complex_node)
    complex_target = df['target'].apply(_is_complex_node)
    touches_complex = complex_source | complex_target
    n_complex_edges = int(touches_complex.sum())

    if mode == 'remove':
        if verbose:
            print(f'\tcomplex_handling=remove: dropping {n_complex_edges} edges that involve a complex')
        return df.loc[~touches_complex].reset_index(drop=True)

    if mode == 'expand':
        untouched = df.loc[~touches_complex]
        fanned = []
        for _, edge in df.loc[touches_complex].iterrows():
            member_sources = _expand_complex_members(edge['source'])
            member_targets = _expand_complex_members(edge['target'])
            # Every member pair inherits the accessions and type of the
            # complex-level edge it came from.
            fanned.extend(
                {
                    'source': member_src,
                    'target': member_dst,
                    'edge_type': edge['edge_type'],
                    'source_uniprot': edge['source_uniprot'],
                    'target_uniprot': edge['target_uniprot'],
                }
                for member_src in member_sources
                for member_dst in member_targets
                if member_src != member_dst
            )
        fanned_df = pd.DataFrame(fanned, columns=list(_EDGE_COLS))
        result = pd.concat([untouched, fanned_df], axis=0, ignore_index=True)
        result = result.drop_duplicates(subset=list(_DEDUP_COLS)).reset_index(drop=True)
        if verbose:
            print(
                f'\tcomplex_handling=expand: {n_complex_edges} complex-level edges '
                f'fanned out into {len(fanned_df)} member-level edges'
            )
        return result

    if mode == 'link':
        # Accession recorded for each complex, keyed by its underscore-joined
        # member symbols.  Rows are walked in order, so the last row wins.
        accession_of = {}
        for _, edge in df.iterrows():
            for node_col, acc_col in (('source', 'source_uniprot'), ('target', 'target_uniprot')):
                node = edge[node_col]
                if _is_complex_node(node):
                    accession_of[node[offset:]] = edge[acc_col]

        def _to_complex_namespace(node):
            return _COMPLEX_PREFIX + node[offset:] if _is_complex_node(node) else node

        linked = df.copy()
        for col in ('source', 'target'):
            linked[col] = linked[col].apply(_to_complex_namespace)

        complex_nodes = sorted(
            set(linked.loc[linked['source'].str.startswith(_COMPLEX_PREFIX), 'source'])
            | set(linked.loc[linked['target'].str.startswith(_COMPLEX_PREFIX), 'target'])
        )
        member_accession = _node_uniprot_lookup(linked, _PROTEIN_PREFIX)

        assembly = []
        for complex_node in complex_nodes:
            key = complex_node[len(_COMPLEX_PREFIX):]
            complex_acc = accession_of[key]
            symbols = key.split('_')
            # A ``COMPLEX:acc1_acc2_...`` accession is split into per-member
            # parts; they are used only if the count matches the symbols.
            if isinstance(complex_acc, str) and complex_acc.startswith('COMPLEX:'):
                parts = complex_acc[len('COMPLEX:'):].split('_')
            else:
                parts = []
            fallback = dict(zip(symbols, parts)) if len(symbols) == len(parts) else {}
            for symbol in symbols:
                member_acc = member_accession.get(_PROTEIN_PREFIX + symbol)
                if member_acc is None:
                    member_acc = fallback.get(symbol)
                assembly.append({
                    'source': _PROTEIN_PREFIX + symbol,
                    'target': complex_node,
                    'edge_type': 'assembly',
                    'source_uniprot': member_acc,
                    'target_uniprot': complex_acc,
                })

        assembly_df = pd.DataFrame(assembly, columns=list(_EDGE_COLS))
        result = pd.concat([linked, assembly_df], axis=0, ignore_index=True)
        result = result.drop_duplicates(subset=list(_DEDUP_COLS)).reset_index(drop=True)
        if verbose:
            print(
                f'\tcomplex_handling=link: {len(complex_nodes)} complexes linked via '
                f'{len(assembly_df)} PROTEIN->COMPLEX assembly edges'
            )
        return result

    raise ValueError(
        f"complex_handling must be one of ['none', 'remove', 'expand', 'link'], got {mode!r}"
    )


def get_bio_interactions(undirected=False,
                         include_tf_mirna=False,
                         include_pathway_extra=False,
                         include_kinase_extra=False,
                         include_ligrec_extra=False,
                         include_collecTRI=False,
                         include_dorothea=True,
                         include_omnipath=True,
                         dorothea_levels=['A', 'B'],
                         gene_symbol=True,
                         complex_handling='link',
                         min_n_references=None,
                         min_curation_effort=None,
                         return_uniprot_map=False,
                         verbose=True):
    r"""
    Retrieve and standardise directed biological interactions from the
    OmniPath knowledge base suite.

    The function downloads, harmonises and concatenates several curated
    interaction resources that are exposed through the *omnipath* Python
    package and converts them into a single DataFrame with unified node
    identifiers.  Each identifier is prefixed with the molecular entity
    type so that the downstream GSNN pipeline can easily distinguish
    between RNA and protein nodes:

        * ``PROTEIN__<gene_symbol>``
        * ``RNA__<gene_symbol>``

    In addition, an explicit *translation* edge (``RNA → PROTEIN``) is
    created for every gene that is found in both the RNA and the protein
    namespace.

    Parameters
    ----------
    undirected : bool, optional (default=False)
        If ``True``, the graph is made undirected by adding a reverse edge
        for every existing interaction.  The reverse edge swaps
        ``source``/``target`` and ``source_uniprot``/``target_uniprot`` and
        keeps the ``edge_type``.
    include_tf_mirna : bool, optional (default=False)
        Whether to augment the graph with TF-miRNA and miRNA-target
        interactions.
    include_pathway_extra : bool, optional (default=False)
        Whether to include additional pathway interactions that lack direct
        literature support.
    include_kinase_extra : bool, optional (default=False)
        Whether to include additional kinase-substrate interactions that
        lack direct literature support.
    include_ligrec_extra : bool, optional (default=False)
        Whether to include additional ligand-receptor interactions that
        lack direct literature support.
    include_collecTRI : bool, optional (default=False)
        Whether to include CollecTRI transcription-factor regulon
        interactions.
    include_dorothea : bool, optional (default=True)
        Whether to include DoRothEA transcription-factor regulon
        interactions.
    include_omnipath : bool, optional (default=True)
        Whether to include curated OmniPath protein-protein interactions.
    dorothea_levels : list[str], optional (default=['A', 'B'])
        Confidence levels to retain from the DoRothEA transcription-factor
        regulon resource. Valid levels are ``['A', 'B', 'C', 'D']``.
        The value is only forwarded to ``omnipath``; it is never mutated
        here.
    gene_symbol : bool, optional (default=True)
        If ``True`` the identifiers are returned as HGNC gene symbols.
        Otherwise uniprot gene identifiers are used.
    complex_handling : {'none', 'remove', 'expand', 'link'}, optional
        How to deal with protein-complex entities (OmniPath encodes
        complexes as underscore-concatenated member gene symbols, e.g.
        ``PROTEIN__AEBP2_EED_EZH2_RBBP4_SUZ12`` for PRC2):

            * ``'none'``   - leave complex nodes untouched (legacy
              behaviour, kept for backwards compatibility).
            * ``'remove'`` - drop every edge that involves a complex.
            * ``'expand'`` - replace each complex with its constituent
              members, fanning out one edge per member.  This recovers
              gene-level coverage at the cost of introducing approximate
              member-level edges that were not literally curated.
            * ``'link'``   - rename complex nodes into a dedicated
              ``COMPLEX__`` namespace and add explicit
              ``PROTEIN__<member> -> COMPLEX__<...>`` "assembly" edges,
              so the GSNN can learn complex activity from member activity
              while preserving the unit-level semantics of the curated
              interaction.
    min_n_references : int or None, optional (default=None)
        If set, retain only edges supported by at least this many PubMed
        references (OmniPath ``n_references`` field).  Datasets that do
        not expose the column are dropped entirely when this filter is
        active.
    min_curation_effort : int or None, optional (default=None)
        If set, retain only edges whose OmniPath curation-effort score is
        at least this value.  Datasets that do not expose the column are
        dropped entirely when this filter is active.
    return_uniprot_map : bool, optional (default=False)
        If ``True``, return a third element: the many-to-many UniProt
        mapping table produced by :func:`build_uniprot_symbol_map`.
    verbose : bool, optional (default=True)
        Whether to print progress updates.

    Returns
    -------
    pandas.DataFrame
        One row per function-graph node with columns ``['func_name', 'uniprot',
        'gene_symbol']``.  ``func_name`` is the prefixed node id (e.g.
        ``PROTEIN__TP53``); ``gene_symbol`` is the suffix after ``__``;
        ``uniprot`` is the last-seen OmniPath accession for that node.
    pandas.DataFrame
        DataFrame with columns ``['src', 'dst', 'edge_type', 'source_uniprot',
        'target_uniprot']`` describing the directed interaction graph.
        ``source_uniprot`` and ``target_uniprot`` are the OmniPath ``source`` /
        ``target`` accession columns (typically UniProt; complexes may use
        ``COMPLEX:...``; miRNA resources may use miRBase IDs).  A handful of
        synthetic ``assembly`` edges may lack a member-level ``source_uniprot``.
    pandas.DataFrame, optional
        Returned only when ``return_uniprot_map=True``.  The many-to-many
        UniProt ↔ gene-symbol ↔ func_name mapping table from
        :func:`build_uniprot_symbol_map`.

    Raises
    ------
    ValueError
        If ``complex_handling`` is not one of the supported modes.  The
        check runs before anything is downloaded.

    Notes
    -----
    When ``verbose`` is set the function prints the number of automatically
    generated translation edges.  Depending on the local cache state, the
    first call may take a few seconds because the interaction tables are
    lazily downloaded from the OmniPath server.

    Duplicate edges are collapsed on ``(src, dst, edge_type)`` only; if
    multiple accessions map to the same symbol after alias stripping, the
    first row is kept.  The same key is used after reverse edges are added
    for ``undirected=True``.

    Several miRNA-related tables surface miRBase legacy alias strings of
    the form ``"HSA-MIR-675B;HSA-MIR-675*"`` -- multiple historical names
    of the *same* mature miRNA joined by ``';'``.  Only the first alias
    is retained as the canonical node identifier.

    Examples
    --------
    >>> from gsnn.proc.bio import get_bio_interactions
    >>> nodes, edges = get_bio_interactions(undirected=True, include_tf_mirna=True)
    >>> nodes.columns.tolist(), edges.shape
    """
    # Fail fast: reject a bad mode before spending time on downloads.
    if complex_handling not in ('none', 'remove', 'expand', 'link'):
        raise ValueError(
            "complex_handling must be one of ['none', 'remove', 'expand', 'link'], "
            f"got {complex_handling!r}"
        )

    import omnipath as op  # lazy: avoids network probes at gsnn import time

    if verbose:
        print('loading omnipath interactions...')
        if min_n_references is not None or min_curation_effort is not None:
            print(
                f'\tapplying curation filters: min_n_references={min_n_references}, '
                f'min_curation_effort={min_curation_effort}'
            )

    # (enabled, progress label, ``op.interactions`` class name, extra query
    #  kwargs, source prefix, target prefix, edge type / dataset label).
    # Class names are resolved lazily so that a resource missing from the
    # installed ``omnipath`` only matters when it is actually requested.
    resources = (
        (include_dorothea, 'dorothea', 'Dorothea',
         {'dorothea_levels': dorothea_levels}, 'PROTEIN__', 'RNA__', 'dorothea'),
        (include_omnipath, 'omnipath', 'OmniPath',
         {}, 'PROTEIN__', 'PROTEIN__', 'omnipath'),
        (include_collecTRI, 'collectri', 'CollecTRI',
         {}, 'PROTEIN__', 'RNA__', 'collectri'),
        (include_pathway_extra, 'pathways_extra', 'PathwayExtra',
         {}, 'PROTEIN__', 'PROTEIN__', 'pathways_extra'),
        (include_kinase_extra, 'kinase_extra', 'KinaseExtra',
         {}, 'PROTEIN__', 'PROTEIN__', 'kinase_extra'),
        (include_ligrec_extra, 'ligrec_extra', 'LigRecExtra',
         {}, 'PROTEIN__', 'PROTEIN__', 'ligrec_extra'),
        (include_tf_mirna, 'TF-miRNA', 'TFmiRNA',
         {}, 'PROTEIN__', 'RNA__', 'tf_mirna'),
        (include_tf_mirna, 'miRNA', 'miRNA',
         {}, 'RNA__', 'RNA__', 'mirna'),
    )

    # Pass 1: download every requested table.
    raw_tables = []
    for enabled, label, query_cls, extra, src_prefix, tgt_prefix, edge_type in resources:
        if not enabled:
            continue
        if verbose:
            print(f'\t{label}...')
        table = getattr(op.interactions, query_cls)().get(
            organism='human', genesymbol=gene_symbol, **extra,
        )
        raw_tables.append((table, src_prefix, tgt_prefix, edge_type))

    if gene_symbol:
        src_name = 'source_genesymbol'
        tgt_name = 'target_genesymbol'
    else:
        src_name = 'source'
        tgt_name = 'target'

    # Pass 2: filter and bring every table into the common edge layout.
    interactions = [
        _standardize_interactions(
            table, src_name, tgt_name, src_prefix, tgt_prefix, edge_type,
            min_n_references=min_n_references,
            min_curation_effort=min_curation_effort,
            dataset=edge_type,
            verbose=verbose,
        )
        for table, src_prefix, tgt_prefix, edge_type in raw_tables
    ]

    if not interactions:
        base_df = pd.DataFrame(columns=list(_EDGE_COLS))
    else:
        base_df = pd.concat(interactions, axis=0, ignore_index=True)

    # Collapse miRBase legacy alias strings (semicolon-separated) down to
    # a single canonical name so e.g. "RNA__HSA-MIR-675B;HSA-MIR-675*" and
    # "RNA__HSA-MIR-675B" are treated as the same node.
    base_df['source'] = base_df['source'].apply(_strip_alias_suffix)
    base_df['target'] = base_df['target'].apply(_strip_alias_suffix)
    base_df = base_df.drop_duplicates(subset=list(_DEDUP_COLS)).reset_index(drop=True)

    base_df = _apply_complex_handling(base_df, complex_handling, verbose=verbose)

    rna_uniprot = _node_uniprot_lookup(base_df, 'RNA__')
    protein_uniprot = _node_uniprot_lookup(base_df, 'PROTEIN__')

    # Translation edges: one RNA -> PROTEIN edge per symbol present in both
    # namespaces.
    rna_symbols = set()
    protein_symbols = set()
    for node in base_df['source'].tolist() + base_df['target'].tolist():
        kind, separator, symbol = node.partition('__')
        if not separator:
            continue
        if kind == 'RNA':
            rna_symbols.add(symbol)
        elif kind == 'PROTEIN':
            protein_symbols.add(symbol)
    # Sorted so the edge order does not depend on set iteration order.
    shared_symbols = sorted(rna_symbols & protein_symbols)
    trans_sources = ['RNA__' + symbol for symbol in shared_symbols]
    trans_targets = ['PROTEIN__' + symbol for symbol in shared_symbols]
    trans = pd.DataFrame({
        'source': trans_sources,
        'target': trans_targets,
        'edge_type': 'translation',
        'source_uniprot': rna_uniprot.reindex(trans_sources).values,
        'target_uniprot': protein_uniprot.reindex(trans_targets).values,
    })

    if verbose:
        print('# of translation edges (RNA->PROTEIN):', len(trans))

    # combine all edges
    func_df = pd.concat([base_df, trans], axis=0, ignore_index=True)

    if undirected:
        if verbose:
            print('transforming to undirected (adding reverse edges)')
        # Swap direction via column selection.  The selection order MUST line
        # up position-by-position with ``_EDGE_COLS`` (source, target,
        # edge_type, source_uniprot, target_uniprot): the previous order put
        # an accession into ``edge_type`` and the edge type into
        # ``target_uniprot`` on every reverse edge.
        reverse = func_df[
            ['target', 'source', 'edge_type', 'target_uniprot', 'source_uniprot']
        ].copy()
        reverse.columns = list(_EDGE_COLS)
        func_df = pd.concat((func_df, reverse), axis=0, ignore_index=True)
        # Only a missing endpoint or edge type invalidates an edge; a missing
        # accession (e.g. on synthetic assembly edges) does not, so directed
        # and undirected graphs cover the same interactions.
        func_df = func_df.dropna(subset=list(_DEDUP_COLS))
        func_df = func_df.drop_duplicates(subset=list(_DEDUP_COLS)).reset_index(drop=True)

    # rename source to src and target to dst
    func_df = func_df.rename(columns={'source': 'src', 'target': 'dst'})

    func_nodes = _build_func_nodes_df(func_df)

    if return_uniprot_map:
        return func_nodes, func_df, build_uniprot_symbol_map(func_df)
    return func_nodes, func_df




def uniprot2symbol(uniprot_ids, allow='1:m', drop_na=True):
    r"""Map UniProt accession IDs to HGNC gene symbols using PyPath.

    A convenience wrapper around :pyfunc:`pypath.utils.mapping.map_name` that
    translates protein accessions into their corresponding gene symbols.

    Two mapping strategies are available (`allow`):

        1. ``'1:m'`` - keep **all** gene symbols associated with a UniProt ID
           (one-to-many, default).
        2. ``'1:1'`` - keep only **one** gene symbol per UniProt ID
           (one-to-one).  The alphabetically first symbol is chosen so the
           result is reproducible between runs.

    Args:
        uniprot_ids (Sequence[str] or pandas.Series): Iterable of UniProt
            accession IDs.  Duplicate IDs are collapsed to the unique set for
            the lookup, but the returned :class:`~pandas.DataFrame` contains one
            row per *combination* of accession and gene symbol.
        allow (str, optional): Mapping strategy; must be either ``'1:m'`` or
            ``'1:1'``.  Defaults to ``'1:m'``.
        drop_na (bool, optional): If ``True``, drop accessions that could not
            be mapped.  If ``False`` they are kept with ``gene_symbol`` set to
            ``None``.  Defaults to ``True``.

    Returns:
        pandas.DataFrame: A two-column DataFrame with

            * ``'uniprot_id'`` - UniProt accession (str)
            * ``'gene_symbol'`` - Gene symbol (str) or *None* if the accession
              could not be mapped (only with ``drop_na=False``).

    Raises:
        AssertionError: If ``allow`` is not ``'1:m'`` or ``'1:1'``.

    Example:
        Illustrative output; the actual symbols depend on the PyPath mapping
        tables.

        >>> from gsnn.proc.bio import uniprot2symbol
        >>> ids = pd.Series(['P28482', 'P60484', 'INVALID'])
        >>> uniprot2symbol(ids, allow='1:m', drop_na=False)
           uniprot_id gene_symbol
        0     INVALID        None
        1      P28482       MAPK1
        2      P60484        PTEN
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    # The exception type is kept for backward compatibility.  Validating first
    # also avoids the heavy PyPath import on bad input.
    if allow not in ('1:m', '1:1'):
        raise AssertionError('allow must be one of ["1:m", "1:1"]')

    from pypath.utils import mapping  # lazy: avoids network probes at gsnn import time

    records = []
    for uniprot_id in np.unique(uniprot_ids):
        # ``map_name`` yields an unordered collection; sort for determinism.
        symbols = sorted(mapping.map_name(uniprot_id, 'uniprot', 'genesymbol'))
        if not symbols:
            records.append((uniprot_id, None))
        elif allow == '1:1':
            records.append((uniprot_id, symbols[0]))
        else:
            records.extend((uniprot_id, symbol) for symbol in symbols)

    u2s = pd.DataFrame(records, columns=['uniprot_id', 'gene_symbol'])

    if drop_na:
        u2s = u2s.dropna()

    return u2s.drop_duplicates()



def symbol2uniprot(gene_symbols, allow='1:m', drop_na=True):
    r"""Map gene symbols to UniProt accession IDs using PyPath.

    A convenience wrapper around :pyfunc:`pypath.utils.mapping.map_name` that
    translates gene symbols into their corresponding UniProt accession IDs.

    Two mapping strategies are available (`allow`):

        1. ``'1:m'`` - keep **all** UniProt IDs associated with a gene symbol
           (one-to-many, default).
        2. ``'1:1'`` - keep only **one** UniProt ID per gene symbol
           (one-to-one).  The alphabetically first accession is chosen so the
           result is reproducible between runs.

    Args:
        gene_symbols (Sequence[str] or pandas.Series): Iterable of gene symbols.
            Duplicate symbols are collapsed to the unique set for the lookup,
            but the returned :class:`~pandas.DataFrame` contains one row per
            *combination* of symbol and UniProt ID.
        allow (str, optional): Mapping strategy; must be either ``'1:m'`` or
            ``'1:1'``.  Defaults to ``'1:m'``.
        drop_na (bool, optional): If ``True``, drop symbols that could not be
            mapped.  If ``False`` they are kept with ``uniprot_id`` set to
            ``None``.  Defaults to ``True``.

    Returns:
        pandas.DataFrame: A two-column DataFrame with, in this order,

            * ``'uniprot_id'`` - UniProt accession (str) or *None* if the symbol
              could not be mapped (only with ``drop_na=False``).
            * ``'gene_symbol'`` - Gene symbol (str)

    Raises:
        AssertionError: If ``allow`` is not ``'1:m'`` or ``'1:1'``.

    Example:
        Illustrative output; the actual accessions depend on the PyPath
        mapping tables.

        >>> from gsnn.proc.bio import symbol2uniprot
        >>> symbols = pd.Series(['MAPK1', 'PTEN', 'INVALID'])
        >>> symbol2uniprot(symbols, allow='1:1', drop_na=False)
          uniprot_id gene_symbol
        0       None     INVALID
        1     P28482       MAPK1
        2     P60484        PTEN
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    # The exception type is kept for backward compatibility.  Validating first
    # also avoids the heavy PyPath import on bad input.
    if allow not in ('1:m', '1:1'):
        raise AssertionError('allow must be one of ["1:m", "1:1"]')

    from pypath.utils import mapping  # lazy: avoids network probes at gsnn import time

    records = []
    for symbol in np.unique(gene_symbols):
        # ``map_name`` yields an unordered collection; sort for determinism.
        accessions = sorted(mapping.map_name(symbol, 'genesymbol', 'uniprot'))
        if not accessions:
            records.append((None, symbol))
        elif allow == '1:1':
            records.append((accessions[0], symbol))
        else:
            records.extend((accession, symbol) for accession in accessions)

    s2u = pd.DataFrame(records, columns=['uniprot_id', 'gene_symbol'])

    if drop_na:
        s2u = s2u.dropna()

    return s2u.drop_duplicates()


def ensg2symbol(ensg_ids, allow='1:m', drop_na=True):
    r"""Map Ensembl gene IDs (ENSG) to HGNC gene symbols using PyPath.

    A convenience wrapper around :pyfunc:`pypath.utils.mapping.map_name` that
    translates Ensembl gene IDs into their corresponding HGNC gene symbols.

    Two mapping strategies are available (`allow`):

        1. ``'1:m'`` - keep **all** gene symbols associated with an Ensembl ID
           (one-to-many, default).
        2. ``'1:1'`` - keep only **one** gene symbol per Ensembl ID
           (one-to-one).  The alphabetically first symbol is chosen so the
           result is reproducible between runs.

    Args:
        ensg_ids (Sequence[str] or pandas.Series): Iterable of Ensembl gene
            IDs. Duplicate IDs are collapsed to the unique set for the lookup,
            but the returned :class:`~pandas.DataFrame` contains one row per
            *combination* of Ensembl ID and gene symbol.
        allow (str, optional): Mapping strategy; must be either ``'1:m'`` or
            ``'1:1'``.  Defaults to ``'1:m'``.
        drop_na (bool, optional): If ``True``, drop rows where the Ensembl ID
            could not be mapped to a gene symbol. Defaults to ``True``.

    Returns:
        pandas.DataFrame: A two-column DataFrame with

            * ``'ensg_id'`` - Ensembl gene ID (str)
            * ``'gene_symbol'`` - HGNC gene symbol (str) or *None* if the
              Ensembl ID could not be mapped (only with ``drop_na=False``).

    Raises:
        AssertionError: If ``allow`` is not ``'1:m'`` or ``'1:1'``.

    Example:
        Illustrative output; ``'INVALID'`` is absent because ``drop_na``
        defaults to ``True``.

        >>> from gsnn.proc.bio import ensg2symbol
        >>> ensgs = pd.Series(['ENSG00000100030', 'ENSG00000171862', 'INVALID'])
        >>> ensg2symbol(ensgs, allow='1:m').head()
                   ensg_id gene_symbol
        0  ENSG00000100030       MAPK1
        1  ENSG00000171862        PTEN
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    # The exception type is kept for backward compatibility.  Validating first
    # also avoids the heavy PyPath import on bad input.
    if allow not in ('1:m', '1:1'):
        raise AssertionError('allow must be one of ["1:m", "1:1"]')

    from pypath.utils import mapping  # lazy: avoids network probes at gsnn import time

    records = []
    for ensg_id in np.unique(ensg_ids):
        # ``map_name`` yields an unordered collection; sort for determinism.
        symbols = sorted(mapping.map_name(ensg_id, 'ensg', 'genesymbol'))
        if not symbols:
            records.append((ensg_id, None))
        elif allow == '1:1':
            records.append((ensg_id, symbols[0]))
        else:
            records.extend((ensg_id, symbol) for symbol in symbols)

    e2s = pd.DataFrame(records, columns=['ensg_id', 'gene_symbol'])

    if drop_na:
        e2s = e2s.dropna()

    return e2s.drop_duplicates()