Skip to content

Folds

chromatinhd.data.folds.Folds

Bases: Flow

Folds of multiple cell and region combinations

Source code in src/chromatinhd/data/folds/folds.py
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class Folds(Flow):
    """
    Folds of multiple cell and region combinations
    """

    # NOTE(review): annotated as ``dict``, but both samplers below assign a list
    # with one dictionary per fold.
    folds: dict = Stored()
    """The folds"""

    def sample_cells(
        self,
        fragments: Fragments,
        n_folds: int,
        n_repeats: int = 1,
        overwrite: bool = False,
        seed: int = 1,
    ):
        """
        Sample cells into folds

        For every repeat the cells are shuffled and cut into `n_folds` consecutive
        bins. Each fold holds out one bin, of which the first half becomes the
        validation set and the second half the test set; all other cells are used
        for training.

        Parameters:
            fragments:
                the fragments, only `n_cells` is used
            n_folds:
                the number of folds
            n_repeats:
                the number of repeats
            overwrite:
                whether to overwrite existing folds
            seed:
                multiplied by the repeat index to seed the random generator of
                each repeat. Note that the first repeat (index 0) is therefore
                always seeded with 0.

        Returns:
            The object itself, with `folds` set to a list containing
            `n_folds * n_repeats` dictionaries with keys `cells_train`,
            `cells_validation`, `cells_test` and `repeat`.
        """
        if not overwrite and self.get("folds").exists(self):
            return self

        folds = []

        for repeat_ix in range(n_repeats):
            generator = np.random.RandomState(repeat_ix * seed)

            cells_all = generator.permutation(fragments.n_cells)

            # bin index of every position in the shuffled cell order
            cell_bins = np.floor((np.arange(len(cells_all)) / (len(cells_all) / n_folds)))

            for i in range(n_folds):
                cells_train = cells_all[cell_bins != i]
                cells_validation_test = cells_all[cell_bins == i]
                cells_validation = cells_validation_test[: (len(cells_validation_test) // 2)]
                cells_test = cells_validation_test[(len(cells_validation_test) // 2) :]

                folds.append(
                    {
                        "cells_train": cells_train,
                        "cells_validation": cells_validation,
                        "cells_test": cells_test,
                        "repeat": repeat_ix,
                    }
                )
        self.folds = folds

        return self

    def sample_cellxregion(
        self,
        fragments: Fragments,
        n_folds: int,
        n_repeats: int = 1,
        stratify_by_chromosome=True,
        overwrite: bool = False,
    ):
        """
        Sample cells and regions into folds

        For every repeat the cells are shuffled and cut into `n_folds` bins, and
        the regions are assigned to `n_folds` bins as well. Fold `i` holds out cell
        bin `i` and region bin `i`. The held-out cells are split in half into
        validation (first half) and test (second half); the held-out regions are
        shuffled and then split in half the same way.

        Parameters:
            fragments:
                the fragments, `n_cells`, `n_regions` and (when stratifying)
                `regions.coordinates` are used
            n_folds:
                the number of folds
            n_repeats:
                the number of repeats, the repeat index seeds the random generator
            stratify_by_chromosome:
                whether to bin regions by chromosome, so that all regions of a
                chromosome end up in the same bin. The chromosome is read from the
                `chr` column of the region coordinates, or `chrom` if there is no
                `chr` column. If False, regions are binned by their position in
                the region order.
            overwrite:
                whether to overwrite existing folds

        Returns:
            The object itself, with `folds` set to a list containing
            `n_folds * n_repeats` dictionaries with keys `cells_train`,
            `cells_validation`, `cells_test`, `regions_train`,
            `regions_validation`, `regions_test` and `repeat`.
        """
        if not overwrite and self.get("folds").exists(self):
            return self

        folds = []

        for repeat_ix in range(n_repeats):
            generator = np.random.RandomState(repeat_ix)

            cells_all = generator.permutation(fragments.n_cells)

            # bin index of every position in the shuffled cell order
            cell_bins = np.floor((np.arange(len(cells_all)) / (len(cells_all) / n_folds)))

            regions_all = np.arange(fragments.n_regions)

            if stratify_by_chromosome:
                coordinates = fragments.regions.coordinates
                chr_column = "chr" if "chr" in coordinates.columns else "chrom"
                # Use string labels for both the values and the categories. Mixing
                # string values with raw (e.g. integer) categories would give every
                # region the code -1, so that no region is ever held out.
                chromosomes = coordinates[chr_column].astype(str)
                chr_order = generator.permutation(chromosomes.unique())
                region_chrs = pd.Categorical(chromosomes, categories=chr_order).codes
                region_bins = np.floor((region_chrs / (len(chr_order) / n_folds))).astype(int)
            else:
                region_bins = np.floor((np.arange(len(regions_all)) / (len(regions_all) / n_folds)))

            for i in range(n_folds):
                cells_train = cells_all[cell_bins != i]
                cells_validation_test = cells_all[cell_bins == i]
                cells_validation = cells_validation_test[: (len(cells_validation_test) // 2)]
                cells_test = cells_validation_test[(len(cells_validation_test) // 2) :]

                regions_train = regions_all[region_bins != i]
                # shuffle so that validation and test regions are not split by region order
                regions_validation_test = generator.permutation(regions_all[region_bins == i])
                regions_validation = regions_validation_test[: (len(regions_validation_test) // 2)]
                regions_test = regions_validation_test[(len(regions_validation_test) // 2) :]

                folds.append(
                    {
                        "cells_train": cells_train,
                        "cells_validation": cells_validation,
                        "cells_test": cells_test,
                        "regions_train": regions_train,
                        "regions_validation": regions_validation,
                        "regions_test": regions_test,
                        "repeat": repeat_ix,
                    }
                )
        self.folds = folds
        return self

    def __getitem__(self, ix):
        """Return the fold stored at index `ix`."""
        return self.folds[ix]

    def __len__(self):
        """Return the number of stored folds."""
        return len(self.folds)

folds: dict = Stored() class-attribute instance-attribute

The folds

sample_cells(fragments, n_folds, n_repeats=1, overwrite=False, seed=1)

Sample cells into folds

Parameters:

Name Type Description Default
fragments Fragments

the fragments

required
n_folds int

the number of folds

required
n_repeats int

the number of repeats

1
overwrite bool

whether to overwrite existing folds

False
seed int

multiplied by the repeat index to seed the random generator of each repeat

1
Source code in src/chromatinhd/data/folds/folds.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def sample_cells(
    self,
    fragments: Fragments,
    n_folds: int,
    n_repeats: int = 1,
    overwrite: bool = False,
    seed: int = 1,
):
    """
    Sample cells into folds

    Every repeat shuffles the cells and cuts the shuffled order into `n_folds`
    bins. Each fold holds out one bin: its first half is used for validation,
    its second half for testing, and the cells of all other bins for training.

    Parameters:
        fragments:
            the fragments, only `n_cells` is used
        n_folds:
            the number of folds
        n_repeats:
            the number of repeats
        overwrite:
            whether to overwrite existing folds
        seed:
            multiplied by the repeat index to seed the random generator of
            each repeat, so the first repeat (index 0) is always seeded with 0

    Returns:
        The object itself, with `folds` set to a list containing
        `n_folds * n_repeats` dictionaries.
    """
    if not overwrite and self.get("folds").exists(self):
        return self

    sampled = []

    for repeat_ix in range(n_repeats):
        rng = np.random.RandomState(repeat_ix * seed)
        shuffled = rng.permutation(fragments.n_cells)

        # bin index of every position in the shuffled order
        n_shuffled = len(shuffled)
        position_bins = np.floor(np.arange(n_shuffled) / (n_shuffled / n_folds))

        for fold_ix in range(n_folds):
            held_out = shuffled[position_bins == fold_ix]
            half = len(held_out) // 2

            sampled.append(
                dict(
                    cells_train=shuffled[position_bins != fold_ix],
                    cells_validation=held_out[:half],
                    cells_test=held_out[half:],
                    repeat=repeat_ix,
                )
            )

    self.folds = sampled
    return self

sample_cellxregion(fragments, n_folds, n_repeats=1, stratify_by_chromosome=True, overwrite=False)

Sample cells and regions into folds

Parameters:

Name Type Description Default
fragments Fragments

the fragments

required
n_folds int

the number of folds

required
n_repeats int

the number of repeats

1
overwrite bool

whether to overwrite existing folds

False
Source code in src/chromatinhd/data/folds/folds.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def sample_cellxregion(
    self,
    fragments: Fragments,
    n_folds: int,
    n_repeats: int = 1,
    stratify_by_chromosome=True,
    overwrite: bool = False,
):
    """
    Sample cells and regions into folds

    For every repeat the cells are shuffled and cut into `n_folds` bins, and the
    regions are assigned to `n_folds` bins as well. Fold `i` holds out cell bin
    `i` and region bin `i`. The held-out cells are split in half into validation
    (first half) and test (second half); the held-out regions are shuffled and
    then split in half the same way.

    Parameters:
        fragments:
            the fragments, `n_cells`, `n_regions` and (when stratifying)
            `regions.coordinates` are used
        n_folds:
            the number of folds
        n_repeats:
            the number of repeats, the repeat index seeds the random generator
        stratify_by_chromosome:
            whether to bin regions by chromosome, so that all regions of a
            chromosome end up in the same bin. The chromosome is read from the
            `chr` column of the region coordinates, or `chrom` if there is no
            `chr` column. If False, regions are binned by their position in the
            region order.
        overwrite:
            whether to overwrite existing folds

    Returns:
        The object itself, with `folds` set to a list containing
        `n_folds * n_repeats` dictionaries with keys `cells_train`,
        `cells_validation`, `cells_test`, `regions_train`, `regions_validation`,
        `regions_test` and `repeat`.
    """
    if not overwrite and self.get("folds").exists(self):
        return self

    folds = []

    for repeat_ix in range(n_repeats):
        generator = np.random.RandomState(repeat_ix)

        cells_all = generator.permutation(fragments.n_cells)

        # bin index of every position in the shuffled cell order
        cell_bins = np.floor((np.arange(len(cells_all)) / (len(cells_all) / n_folds)))

        regions_all = np.arange(fragments.n_regions)

        if stratify_by_chromosome:
            coordinates = fragments.regions.coordinates
            chr_column = "chr" if "chr" in coordinates.columns else "chrom"
            # Use string labels for both the values and the categories. Mixing
            # string values with raw (e.g. integer) categories would give every
            # region the code -1, so that no region is ever held out.
            chromosomes = coordinates[chr_column].astype(str)
            chr_order = generator.permutation(chromosomes.unique())
            region_chrs = pd.Categorical(chromosomes, categories=chr_order).codes
            region_bins = np.floor((region_chrs / (len(chr_order) / n_folds))).astype(int)
        else:
            region_bins = np.floor((np.arange(len(regions_all)) / (len(regions_all) / n_folds)))

        for i in range(n_folds):
            cells_train = cells_all[cell_bins != i]
            cells_validation_test = cells_all[cell_bins == i]
            cells_validation = cells_validation_test[: (len(cells_validation_test) // 2)]
            cells_test = cells_validation_test[(len(cells_validation_test) // 2) :]

            regions_train = regions_all[region_bins != i]
            # shuffle so that validation and test regions are not split by region order
            regions_validation_test = generator.permutation(regions_all[region_bins == i])
            regions_validation = regions_validation_test[: (len(regions_validation_test) // 2)]
            regions_test = regions_validation_test[(len(regions_validation_test) // 2) :]

            folds.append(
                {
                    "cells_train": cells_train,
                    "cells_validation": cells_validation,
                    "cells_test": cells_test,
                    "regions_train": regions_train,
                    "regions_validation": regions_validation,
                    "regions_test": regions_test,
                    "repeat": repeat_ix,
                }
            )
    self.folds = folds
    return self