Skip to content

Clustering

chromatinhd.data.clustering.Clustering

Bases: Flow

Source code in src/chromatinhd/data/clustering/clustering.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
class Clustering(Flow):
    """
    Assignment of cells to clusters, plus a table of per-cluster information.

    `from_labels` builds an instance from a sequence of per-cell labels.
    """

    # NOTE(review): annotated as a DataFrame, but `from_labels` assigns a
    # categorical pd.Series to this attribute -- confirm the intended type.
    labels: pd.DataFrame = Stored()
    "Labels for each cell."

    # `from_labels` fills this with the integer category codes of `labels`.
    indices: np.array = Tensorstore(dtype="<i4")
    "Indices for each cell."

    # Indexed by cluster; `from_labels` always writes a "label" column.
    var: pd.DataFrame = StoredDataFrame(index_name="cluster")
    "Information for each cluster, such as a label, color, ..."

    @classmethod
    def from_labels(
        cls,
        labels: pd.Series,
        var: pd.DataFrame = None,
        path: PathLike = None,
        overwrite=False,
    ) -> Clustering:
        """
        Build a Clustering from per-cell cluster labels.

        Parameters:
            labels:
                Labels for each cell, with index corresponding to cell names.
                Input that is not already a categorical Series is converted
                to one.
            var:
                Optional table with information for each cluster. It is
                reindexed to the label categories and its "label" column is
                set to the category names. When omitted, a table with
                "label" and "n_cells" columns is generated.
            path:
                Folder where the clustering information will be stored.
            overwrite:
                Whether to overwrite the clustering information if it already
                exists. Also forwarded to the constructor as `reset`.

        Returns:
            Clustering object. When `overwrite` is false and labels already
            exist for it, the object is returned without being modified.

        """
        clustering = cls(path, reset=overwrite)

        # Keep whatever is already stored unless the caller asked to replace it.
        if not overwrite:
            if clustering.o.labels.exists(clustering):
                return clustering

        # Normalise the input to a categorical Series.
        if isinstance(labels, pd.Series):
            cell_labels = labels
            if cell_labels.dtype.name != "category":
                cell_labels = cell_labels.astype("category")
        else:
            cell_labels = pd.Series(labels).astype("category")

        categories = cell_labels.cat.categories
        clustering.labels = cell_labels
        clustering.indices = cell_labels.cat.codes.values

        if var is not None:
            # NOTE(review): a "label" column supplied by the caller is
            # overwritten here, and no "n_cells" column is added in this
            # branch -- confirm that both are intended.
            cluster_table = var.reindex(categories)
            cluster_table["label"] = categories
        else:
            cluster_table = (
                pd.DataFrame({"cluster": categories, "label": categories})
                .set_index("cluster")
                .loc[categories]
            )
            # Assignment aligns the counts on the cluster index.
            cluster_table["n_cells"] = cell_labels.value_counts()

        clustering.var = cluster_table
        return clustering

    @property
    def n_clusters(self):
        """Number of categories in `labels`."""
        categories = self.labels.cat.categories
        return len(categories)

    # temporarily link cluster_info to var
    @property
    def cluster_info(self):
        """Alias for `var`."""
        return self.var

    @cluster_info.setter
    def cluster_info(self, cluster_info):
        self.var = cluster_info

indices: np.array = Tensorstore(dtype='<i4') class-attribute instance-attribute

Indices for each cell.

labels: pd.DataFrame = Stored() class-attribute instance-attribute

Labels for each cell.

var: pd.DataFrame = StoredDataFrame(index_name='cluster') class-attribute instance-attribute

Information for each cluster, such as a label, color, ...

from_labels(labels, var=None, path=None, overwrite=False) classmethod

Create a Clustering object from a series of labels.

Parameters:

Name Type Description Default
labels Series

Series of labels for each cell, with index corresponding to cell names.

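required
var DataFrame

Optional table with information for each cluster. It is reindexed to the label categories and its label column is set to the category names; when omitted, a table with label and n_cells columns is generated.

None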
path PathLike

Folder where the clustering information will be stored.

None
overwrite

Whether to overwrite the clustering information if it already exists.

False

Returns:

Type Description
Clustering

Clustering object.

Source code in src/chromatinhd/data/clustering/clustering.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
@classmethod
def from_labels(
    cls,
    labels: pd.Series,
    var: pd.DataFrame = None,
    path: PathLike = None,
    overwrite=False,
) -> Clustering:
    """
    Build a Clustering from per-cell cluster labels.

    Parameters:
        labels:
            Labels for each cell, with index corresponding to cell names.
            Input that is not already a categorical Series is converted
            to one.
        var:
            Optional table with information for each cluster. It is
            reindexed to the label categories and its "label" column is
            set to the category names. When omitted, a table with
            "label" and "n_cells" columns is generated.
        path:
            Folder where the clustering information will be stored.
        overwrite:
            Whether to overwrite the clustering information if it already
            exists. Also forwarded to the constructor as `reset`.

    Returns:
        Clustering object. When `overwrite` is false and labels already
        exist for it, the object is returned without being modified.

    """
    clustering = cls(path, reset=overwrite)

    # Keep whatever is already stored unless the caller asked to replace it.
    if not overwrite:
        if clustering.o.labels.exists(clustering):
            return clustering

    # Normalise the input to a categorical Series.
    if isinstance(labels, pd.Series):
        cell_labels = labels
        if cell_labels.dtype.name != "category":
            cell_labels = cell_labels.astype("category")
    else:
        cell_labels = pd.Series(labels).astype("category")

    categories = cell_labels.cat.categories
    clustering.labels = cell_labels
    clustering.indices = cell_labels.cat.codes.values

    if var is not None:
        # NOTE(review): a "label" column supplied by the caller is
        # overwritten here, and no "n_cells" column is added in this
        # branch -- confirm that both are intended.
        cluster_table = var.reindex(categories)
        cluster_table["label"] = categories
    else:
        cluster_table = (
            pd.DataFrame({"cluster": categories, "label": categories})
            .set_index("cluster")
            .loc[categories]
        )
        # Assignment aligns the counts on the cluster index.
        cluster_table["n_cells"] = cell_labels.value_counts()

    clustering.var = cluster_table
    return clustering