# Source code for nmf_models.utils

"""Module containing useful functions for intNMF module."""

import numpy as np
from nmf_models.nmf_models_mod_updates import intNMF
from typing import Optional, Union, Mapping  # Special
import pandas as pd
import muon as mu
import anndata as ad
import os

def get_top_features(nmf_model: "intNMF",
                     topics: Optional[list] = None,
                     n_features: Optional[int] = 1,
                     modality: Optional[str] = 'rna',
                     mode: Optional[str] = 'abs'):
    """Get the highest ranked features from selected topics.

    Parameters
    ----------
    nmf_model : intNMF
        intNMF model with loadings to rank (.fit has been run). The
        attributes ``k`` and either ``phi_rna``/``rna_features`` or
        ``phi_atac``/``atac_features`` are read.
    topics : list[int], default None (all topics)
        Positional indices of the topics to get top features for.
    n_features : int, default 1
        Number of features returned per topic.
    modality : str, 'rna' | 'atac', default 'rna'
        Modality from which to return features.
    mode : str, 'abs' | 'diff', default 'abs'
        Select features based on the absolute loading value ('abs') or on
        the difference between the topic's loading and the mean loading of
        all other topics ('diff').

    Returns
    -------
    list
        Flat list of feature labels from the selected modality: the top
        ``n_features`` of each requested topic, concatenated in the order
        of ``topics``. A feature can appear more than once when it ranks
        highly in several topics. If the model carries no feature labels,
        the integer column positions are returned instead.

    Raises
    ------
    ValueError
        If ``modality`` is not 'rna' or 'atac', or ``mode`` is not 'abs'
        or 'diff'.
    """
    if modality == 'rna':
        phi = nmf_model.phi_rna
        col_names = nmf_model.rna_features
    elif modality == 'atac':
        phi = nmf_model.phi_atac
        col_names = nmf_model.atac_features
    else:
        # Previously this only printed a hint and then crashed on the
        # unbound ``phi``; fail early with an explicit error instead.
        raise ValueError(
            "select modality from rna or atac (got {!r})".format(modality))

    if mode not in ('abs', 'diff'):
        # An unknown mode used to yield an empty list silently.
        raise ValueError(
            "select mode from abs or diff (got {!r})".format(mode))

    if col_names is None:
        # Best effort: continue with pandas' default integer column labels.
        print('Missing feature labels')

    # One row per factor, one column per feature.
    phi_df = pd.DataFrame(
        phi,
        columns=col_names,
        index=['Factor {}'.format(i) for i in range(nmf_model.k)])

    if topics is None:
        topics = range(nmf_model.k)

    top_features = []
    for topic in topics:
        scores = phi_df.iloc[topic, :]
        if mode == 'diff':
            # Contrast the topic against the average of every other topic.
            others_mean = phi_df.drop(phi_df.index[topic], axis=0).mean(axis=0)
            scores = scores - others_mean
        ranked = scores.sort_values(ascending=False)
        top_features.extend(list(ranked.iloc[:n_features].index))

    return top_features

def load_multiome(file: str, labels: Optional[str] = None):
    """Load multiome data from a .h5, .h5ad or .h5mu file.

    Parameters
    ----------
    file : str
        Location of the multiome dataset. Should be .h5 (output of
        cell ranger), .h5ad or .h5mu. For .h5ad input, ``var['feature_types']``
        is used to split the features into 'GEX' (rna) and 'ATAC' (atac).
    labels : str, default None
        Location of cell labels. Should be a tab-separated file with a
        header row and the cell identifiers in the first column.

    Returns
    -------
    muon.MuData object, a multimodal data container built on anndata.
    When ``labels`` is given, only labelled cells are kept and ``.obs``
    holds the label table.

    Raises
    ------
    ValueError
        If the file extension is not one of .h5, .h5ad or .h5mu.
    """
    _, extension = os.path.splitext(file)
    if extension == '.h5':
        mu_data = mu.read_10x_h5(file)
    elif extension == '.h5ad':
        h5ad = ad.read_h5ad(file)
        feature_types = h5ad.var['feature_types']
        rna = h5ad[:, feature_types == 'GEX']
        atac = h5ad[:, feature_types == 'ATAC']
        mu_data = mu.MuData({'rna': rna, 'atac': atac})
        mu_data.update()
        mu.pp.intersect_obs(mu_data)
    elif extension == '.h5mu':
        mu_data = mu.read(file)
    else:
        # Previously an unknown extension left ``mu_data`` unbound and the
        # function failed later with an unrelated UnboundLocalError.
        raise ValueError(
            "Unsupported file extension {!r}; expected .h5, .h5ad or .h5mu"
            .format(extension))

    # If there are labels for the dataset, load them and remove cells
    # without a label.
    if labels is None:
        print('no labels')
    else:
        meta = pd.read_csv(labels, sep="\t", header=0, index_col=0)
        mu.pp.filter_obs(mu_data, meta.index.values)
        # Align the label table to the remaining cells: the label file may
        # list cells in a different order or contain cells absent from the
        # data, and a plain assignment would mislabel cells or fail.
        mu_data.obs = meta.loc[mu_data.obs_names]

    return mu_data