Skip to content

Transcriptome

chromatinhd.data.Transcriptome

Bases: Flow

A transcriptome containing counts for each gene in each cell.

Source code in src/chromatinhd/data/transcriptome/transcriptome.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
class Transcriptome(Flow):
    """
    A transcriptome containing counts for each gene in each cell.

    Gene annotations are kept in `var` (indexed by gene id), cell annotations
    in `obs` (indexed by cell id) and the count matrices in `layers`, which
    are indexed as `[cells, genes]`.
    """

    var: pd.DataFrame = TSV(index_name="gene")
    obs: pd.DataFrame = TSV(index_name="cell")

    adata = Stored()
    "AnnData object containing the transcriptome data."

    def gene_id(self, symbol, column="symbol", optional=False, found=False):
        """
        Get the gene id for a given gene symbol.

        Parameters:
            symbol:
                A single value or a list-like of values to look up in `var[column]`.
            column:
                Column of `var` in which `symbol` is searched.
            optional:
                If True, values that do not occur in `var[column]` are dropped
                before the lookup instead of triggering an error.
            found:
                If True, return a series reindexed by `symbol` that holds the
                first gene id per value and a missing value for unknown ones.
                No check for unknown values is made in this mode.

        Returns:
            The gene id(s) stored in the `gene` index of `var`.

        Raises:
            AssertionError: if a requested value is not present in `var[column]`
                (the error carries the set of unknown values).
        """
        if found:
            gene_id = self.var.reset_index().groupby(column).first().reindex(symbol)["gene"]
            return gene_id
        if optional:
            symbol = pd.Series(symbol)[pd.Series(symbol).isin(self.var[column])]

        assert all(pd.Series(symbol).isin(self.var[column])), set(
            pd.Series(symbol)[~pd.Series(symbol).isin(self.var[column])]
        )
        return self.var.reset_index("gene").set_index(column).loc[symbol]["gene"]

    def symbol(self, gene_id, column="symbol"):
        """
        Get the gene symbol for a given gene ID (e.g. Ensembl ID).

        Parameters:
            gene_id:
                A single gene id or a list-like of gene ids from `var.index`.
            column:
                Column of `var` from which the value is read.

        Raises:
            AssertionError: if a gene id is not present in `var.index`
                (the error carries the set of unknown ids).
        """
        assert all(pd.Series(gene_id).isin(self.var.index)), set(
            pd.Series(gene_id)[~pd.Series(gene_id).isin(self.var.index)]
        )
        return self.var.loc[gene_id][column]

    def gene_ix(self, symbol):
        """
        Get the gene index for a given gene symbol.

        The index is the row position of the gene in `var`. As a side effect
        the `ix` column of `var` is (re)written with these positions.

        Parameters:
            symbol:
                A single symbol or a list-like of symbols from `var["symbol"]`.

        Raises:
            AssertionError: if a symbol is not present in `var["symbol"]`
                (the error carries the set of unknown symbols).
        """
        self.var["ix"] = np.arange(self.var.shape[0])
        assert all(pd.Series(symbol).isin(self.var["symbol"])), set(
            pd.Series(symbol)[~pd.Series(symbol).isin(self.var["symbol"])]
        )
        return self.var.reset_index("gene").set_index("symbol").loc[symbol]["ix"]

    @classmethod
    def from_adata(
        cls,
        adata: sc.AnnData,
        path: Union[pathlib.Path, str] = None,
        overwrite=False,
    ):
        """
        Create a Transcriptome object from an AnnData object.

        Every layer of `adata`, as well as `adata.X` (stored under the key
        `"X"`), is converted to a dense little-endian float32 array.

        Parameters:
            adata:
                Anndata object containing the transcriptome data.
            path:
                Folder in which the transcriptome data will be stored.
            overwrite:
                Whether to overwrite the data if it already exists.
        """

        transcriptome = cls(path=path, reset=overwrite)
        transcriptome.adata = adata

        for k, v in adata.layers.items():
            if sparse.is_scipysparse(v):
                v = np.array(v.todense())
            transcriptome.layers[k] = v.astype("<f4")
        if sparse.is_scipysparse(adata.X):
            v = np.array(adata.X.todense()).astype("<f4")
        else:
            v = adata.X.astype("<f4")
        transcriptome.layers["X"] = v
        transcriptome.var = adata.var
        transcriptome.obs = adata.obs
        return transcriptome

    @property
    def X(self):
        """
        The first layer reported by `self.layers.keys()`.

        NOTE(review): this is the first stored layer, which is not necessarily
        the one named "X" that the setter writes to - confirm this is intended.
        """
        return self.layers[list(self.layers.keys())[0]]

    @X.setter
    def X(self, value):
        self.layers["X"] = value

    layers = StoredDict(Tensorstore, kwargs=dict(dtype="<f4"))
    "Dictionary of layers, such as raw, normalized and imputed data."

    def filter_genes(self, genes, path=None):
        """
        Create a new transcriptome that only contains the given genes.

        As a side effect the `ix` column of `var` is (re)written with the row
        position of each gene.

        Parameters:
            genes:
                Gene ids (labels of `var.index`) to keep.
            path:
                Forwarded to `Transcriptome.create` as the `path` of the new object.
        """

        self.var["ix"] = np.arange(self.var.shape[0])
        gene_ixs = self.var["ix"].loc[genes]

        layers = {}
        for k, v in self.layers.items():
            layers[k] = v[:, gene_ixs]
        X = self.X[:, gene_ixs]

        return Transcriptome.create(
            var=self.var.loc[genes],
            obs=self.obs,
            X=X,
            layers=layers,
            path=path,
        )

    def filter_cells(self, cells, path=None):
        """
        Create a new transcriptome that only contains the given cells.

        As a side effect the `ix` column of `obs` is (re)written with the row
        position of each cell. The AnnData object is subset as well when
        `self.o.adata.exists(self)` reports one.

        Parameters:
            cells:
                Cell ids (labels of `obs.index`) to keep.
            path:
                Forwarded to `Transcriptome.create` as the `path` of the new object.
        """

        self.obs["ix"] = np.arange(self.obs.shape[0])
        cell_ixs = self.obs["ix"].loc[cells]

        layers = {}
        for k, v in self.layers.items():
            layers[k] = v[cell_ixs, :]
        X = self.X[cell_ixs, :]

        if self.o.adata.exists(self):
            adata = self.adata[cell_ixs, :]
        else:
            adata = None

        return Transcriptome.create(var=self.var, obs=self.obs.loc[cells], X=X, layers=layers, path=path, adata=adata)

    def get_X(self, gene_ids, layer=None):
        """
        Get the counts for a given set of genes.

        Parameters:
            gene_ids:
                A single gene id (string) or a list-like of gene ids from `var.index`.
            layer:
                Name of the layer to read from. If None, `self.X` is used.

        Returns:
            The selected columns of the layer. Sparse results are converted to
            a dense array.

        Raises:
            KeyError: if a gene id is not present in `var.index`.
        """

        if isinstance(gene_ids, str):
            gene_ixs = self.var.index.get_loc(gene_ids)
        else:
            gene_ixs = self.var.index.get_indexer(gene_ids)
            # get_indexer marks unknown labels with -1; using that as a column
            # index would address the wrong gene instead of reporting the problem.
            not_found = gene_ixs == -1
            if not_found.any():
                missing = [gene for gene, absent in zip(gene_ids, not_found) if absent]
                raise KeyError(f"Gene ids not found in var.index: {missing}")

        if layer is None:
            value = self.X[:, gene_ixs]
        else:
            value = self.layers[layer][:, gene_ixs]

        if sparse.is_scipysparse(value):
            value = np.array(value.todense())
            if isinstance(gene_ids, str):
                # a single gene yields one column; return it as a flat vector
                value = value[:, 0]
        return value

    def get_diffexp(self, key="rank_genes_groups", groups=None):
        """
        Call the module-level `get_diffexp` helper with the stored AnnData object.

        Parameters:
            key:
                Passed through to the helper unchanged.
            groups:
                Passed through to the helper unchanged.
        """
        return get_diffexp(self.adata, key, groups)

adata = Stored() class-attribute instance-attribute

AnnData object containing the transcriptome data.

layers = StoredDict(Tensorstore, kwargs=dict(dtype='<f4')) class-attribute instance-attribute

Dictionary of layers, such as raw, normalized and imputed data.

filter_cells(cells, path=None)

Filter cells

Parameters:

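| Name | Type | Description | Default |
| --- | --- | --- | --- |
| cells | | Labels of the cells (entries of the `obs` index) to keep in the new transcriptome. | required |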
Source code in src/chromatinhd/data/transcriptome/transcriptome.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def filter_cells(self, cells, path=None):
    """
    Build a new transcriptome restricted to the requested cells.

    The `ix` column of `obs` is (re)written with each cell's row position as
    part of the lookup.

    Parameters:
        cells:
            Cell labels (entries of `obs.index`) to keep.
        path:
            Forwarded to `Transcriptome.create` as `path`.
    """

    # Translate cell labels into row positions of the stored matrices.
    self.obs["ix"] = np.arange(len(self.obs))
    selected_rows = self.obs["ix"].loc[cells]

    subset_layers = {name: matrix[selected_rows, :] for name, matrix in self.layers.items()}
    subset_X = self.X[selected_rows, :]

    # Carry the AnnData object along only when one is reported as existing.
    subset_adata = self.adata[selected_rows, :] if self.o.adata.exists(self) else None

    return Transcriptome.create(
        var=self.var,
        obs=self.obs.loc[cells],
        X=subset_X,
        layers=subset_layers,
        path=path,
        adata=subset_adata,
    )

filter_genes(genes, path=None)

Filter genes

Parameters:

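| Name | Type | Description | Default |
| --- | --- | --- | --- |
| genes | | Labels of the genes (entries of the `var` index) to keep in the new transcriptome. | required |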
Source code in src/chromatinhd/data/transcriptome/transcriptome.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def filter_genes(self, genes, path=None):
    """
    Build a new transcriptome restricted to the requested genes.

    The `ix` column of `var` is (re)written with each gene's row position as
    part of the lookup.

    Parameters:
        genes:
            Gene labels (entries of `var.index`) to keep.
        path:
            Forwarded to `Transcriptome.create` as `path`.
    """

    # Translate gene labels into column positions of the stored matrices.
    self.var["ix"] = np.arange(len(self.var))
    selected_columns = self.var["ix"].loc[genes]

    subset_layers = {name: matrix[:, selected_columns] for name, matrix in self.layers.items()}
    subset_X = self.X[:, selected_columns]

    return Transcriptome.create(
        var=self.var.loc[genes],
        obs=self.obs,
        X=subset_X,
        layers=subset_layers,
        path=path,
    )

from_adata(adata, path=None, overwrite=False) classmethod

Create a Transcriptome object from an AnnData object.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| adata | AnnData | AnnData object containing the transcriptome data. | required |
| path | Union[Path, str] | Folder in which the transcriptome data will be stored. | None |
| overwrite | | Whether to overwrite the data if it already exists. | False |
Source code in src/chromatinhd/data/transcriptome/transcriptome.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
@classmethod
def from_adata(
    cls,
    adata: sc.AnnData,
    path: Union[pathlib.Path, str] = None,
    overwrite=False,
):
    """
    Build a Transcriptome from an AnnData object.

    Each layer of `adata` and `adata.X` (stored under the key `"X"`) is
    densified when sparse and cast to little-endian float32.

    Parameters:
        adata:
            AnnData object containing the transcriptome data.
        path:
            Folder in which the transcriptome data will be stored.
        overwrite:
            Whether to overwrite the data if it already exists.
    """

    result = cls(path=path, reset=overwrite)
    result.adata = adata

    # Copy the named layers first, then the main matrix under the key "X".
    for layer_name, matrix in adata.layers.items():
        dense = np.array(matrix.todense()) if sparse.is_scipysparse(matrix) else matrix
        result.layers[layer_name] = dense.astype("<f4")

    main_matrix = adata.X
    if sparse.is_scipysparse(main_matrix):
        main_matrix = np.array(main_matrix.todense())
    result.layers["X"] = main_matrix.astype("<f4")

    result.var = adata.var
    result.obs = adata.obs
    return result

gene_id(symbol, column='symbol', optional=False, found=False)

Get the gene id for a given gene symbol.

Source code in src/chromatinhd/data/transcriptome/transcriptome.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def gene_id(self, symbol, column="symbol", optional=False, found=False):
    """
    Look up the gene id that belongs to one or more gene symbols.

    Parameters:
        symbol:
            A single value or a list-like of values searched in `var[column]`.
        column:
            Column of `var` that holds the values to match against.
        optional:
            If True, values absent from `var[column]` are dropped before the lookup.
        found:
            If True, return a series reindexed by `symbol` holding the first
            gene id per value and a missing value for unknown ones; no check
            for unknown values is made.

    Raises:
        AssertionError: if a requested value is absent from `var[column]`.
    """
    if found:
        first_gene_per_value = self.var.reset_index().groupby(column).first()
        return first_gene_per_value.reindex(symbol)["gene"]

    if optional:
        candidates = pd.Series(symbol)
        symbol = candidates[candidates.isin(self.var[column])]

    requested = pd.Series(symbol)
    is_known = requested.isin(self.var[column])
    # The assertion message lists the values that could not be matched.
    assert all(is_known), set(requested[~is_known])

    genes_by_value = self.var.reset_index("gene").set_index(column)
    return genes_by_value.loc[symbol]["gene"]

gene_ix(symbol)

Get the gene index for a given gene symbol.

Source code in src/chromatinhd/data/transcriptome/transcriptome.py
65
66
67
68
69
70
71
72
73
def gene_ix(self, symbol):
    """
    Look up the row position in `var` of one or more gene symbols.

    The `ix` column of `var` is (re)written with the row positions as part
    of the lookup.

    Parameters:
        symbol:
            A single symbol or a list-like of symbols from `var["symbol"]`.

    Raises:
        AssertionError: if a symbol is absent from `var["symbol"]`.
    """
    self.var["ix"] = np.arange(len(self.var))

    requested = pd.Series(symbol)
    is_known = requested.isin(self.var["symbol"])
    # The assertion message lists the symbols that could not be matched.
    assert all(is_known), set(requested[~is_known])

    rows_by_symbol = self.var.reset_index("gene").set_index("symbol")
    return rows_by_symbol.loc[symbol]["ix"]

get_X(gene_ids, layer=None)

Get the counts for a given set of genes.

Source code in src/chromatinhd/data/transcriptome/transcriptome.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def get_X(self, gene_ids, layer=None):
    """
    Get the counts for a given set of genes.

    Parameters:
        gene_ids:
            A single gene id (string) or a list-like of gene ids from `var.index`.
        layer:
            Name of the layer to read from. If None, `self.X` is used.

    Returns:
        The selected columns of the layer. Sparse results are converted to a
        dense array.

    Raises:
        KeyError: if a gene id is not present in `var.index`.
    """

    if isinstance(gene_ids, str):
        gene_ixs = self.var.index.get_loc(gene_ids)
    else:
        gene_ixs = self.var.index.get_indexer(gene_ids)
        # get_indexer marks unknown labels with -1; using that as a column
        # index would address the wrong gene instead of reporting the problem.
        not_found = gene_ixs == -1
        if not_found.any():
            missing = [gene for gene, absent in zip(gene_ids, not_found) if absent]
            raise KeyError(f"Gene ids not found in var.index: {missing}")

    if layer is None:
        value = self.X[:, gene_ixs]
    else:
        value = self.layers[layer][:, gene_ixs]

    if sparse.is_scipysparse(value):
        value = np.array(value.todense())
        if isinstance(gene_ids, str):
            # a single gene yields one column; return it as a flat vector
            value = value[:, 0]
    return value

symbol(gene_id, column='symbol')

Get the gene symbol for a given gene ID (e.g. Ensembl ID).

Source code in src/chromatinhd/data/transcriptome/transcriptome.py
56
57
58
59
60
61
62
63
def symbol(self, gene_id, column="symbol"):
    """
    Look up the symbol that belongs to one or more gene ids (e.g. Ensembl IDs).

    Parameters:
        gene_id:
            A single gene id or a list-like of gene ids from `var.index`.
        column:
            Column of `var` from which the value is read.

    Raises:
        AssertionError: if a gene id is absent from `var.index`.
    """
    requested = pd.Series(gene_id)
    is_known = requested.isin(self.var.index)
    # The assertion message lists the gene ids that could not be matched.
    assert all(is_known), set(requested[~is_known])

    matching_rows = self.var.loc[gene_id]
    return matching_rows[column]