
Fragments

chromatinhd.loaders.fragments.Fragments

Basic loader for fragments. This requires either `regionxcell_indptr` (for a Fragments) or `regionxcell_fragmentixs_indptr` (for a FragmentsView) to be present.
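
As a rough, non-authoritative sketch of what these pointer arrays look like (inferred from the `load()` code below, using a hypothetical toy dataset): a Fragments object exposes a flat CSR-style pointer of length `n_regions * n_cells + 1`, whereas a FragmentsView stores an explicit `[start, end)` pair per region x cell combination.

```python
import numpy as np

# Hypothetical toy layout for illustration only: 2 regions x 3 cells, 7 fragments in total.
# Fragments: CSR-style pointers; fragments of combination i live in [indptr[i], indptr[i + 1]).
regionxcell_indptr = np.array([0, 2, 2, 3, 5, 6, 7])  # length n_regions * n_cells + 1

# FragmentsView: one explicit [start, end) pair per region x cell combination.
regionxcell_indptr_view = np.stack([regionxcell_indptr[:-1], regionxcell_indptr[1:]], axis=1)

# Fragment indices of combination i, in both layouts (here: array([3, 4])):
i = 3
np.arange(regionxcell_indptr[i], regionxcell_indptr[i + 1])
np.arange(*regionxcell_indptr_view[i])
```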

Example
```python
loader = Fragments(fragments, cellxregion_batch_size=1000)
minibatch = Minibatch(cells_oi=np.arange(100), regions_oi=np.arange(100))
data = loader.load(minibatch)
data.coordinates
```
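
Below is a minimal sketch of how a training loop might iterate over minibatches with this loader, reusing `fragments`, `loader`, `Minibatch`, and `np` from the example above; the chunk sizes are purely illustrative and chosen so that each minibatch stays within `cellxregion_batch_size`.

```python
import numpy as np

# Purely illustrative chunking: 100 cells x 10 regions = 1000 combinations per minibatch,
# which stays within the cellxregion_batch_size=1000 used when constructing the loader above.
for cell_start in range(0, fragments.n_cells, 100):
    for region_start in range(0, fragments.n_regions, 10):
        minibatch = Minibatch(
            cells_oi=np.arange(cell_start, min(cell_start + 100, fragments.n_cells)),
            regions_oi=np.arange(region_start, min(region_start + 10, fragments.n_regions)),
        )
        data = loader.load(minibatch)
        # data.coordinates has shape (n_fragments, 2): left and right cut site per fragment
        data = data.to("cpu")  # or a GPU device; FragmentsResult.to() moves its tensors
```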
Source code in src/chromatinhd/loaders/fragments.py (lines 127-313)
class Fragments:
    """
    Basic loader for fragments. This requires either `regionxcell_indptr` (for a Fragments) or `regionxcell_fragmentixs_indptr` (for a FragmentsView) to be present.

    Example:
        ```
        loader = Fragments(fragments, cellxregion_batch_size=1000)
        minibatch = Minibatch(cells_oi=np.arange(100), regions_oi=np.arange(100))
        data = loader.load(minibatch)
        data.coordinates
        ```
    """

    cellxregion_batch_size: int

    preloaded = False

    out_coordinates: torch.Tensor
    out_regionmapping: torch.Tensor
    out_local_cellxregion_ix: torch.Tensor

    n_regions: int
    is_view: bool

    def __init__(
        self,
        fragments: Union[chromatinhd.data.fragments.Fragments, chromatinhd.data.fragments.FragmentsView],
        cellxregion_batch_size: int,
        n_fragment_per_cellxregion: int = None,
        buffer_size_multiplier=10,  # increase this if crashing
        provide_libsize=False,
        provide_multiplets=True,
    ):
        """
        Parameters:
            fragments: Fragments object
            cellxregion_batch_size: maximum number of cell x region combinations that will be loaded
            n_fragment_per_cellxregion: estimated number of fragments per cell x region combination, used for preallocation
        """
        self.cellxregion_batch_size = cellxregion_batch_size

        # store auxiliary information
        window = fragments.regions.window
        self.window = window

        # create buffers for coordinates
        if n_fragment_per_cellxregion is None:
            n_fragment_per_cellxregion = fragments.estimate_fragment_per_cellxregion()
        fragment_buffer_size = int(n_fragment_per_cellxregion * cellxregion_batch_size * buffer_size_multiplier)
        self.fragment_buffer_size = fragment_buffer_size

        self.n_regions = fragments.n_regions

        # set up readers and determine if we are dealing with a view or not
        if isinstance(fragments, chromatinhd.data.fragments.view.FragmentsView):
            self.regionxcell_indptr_reader = fragments.regionxcell_indptr.open_reader()
            self.coordinates_reader = fragments.coordinates.open_reader()

            if "strand" in fragments.regions.coordinates.columns:
                self.region_strands = fragments.regions.coordinates["strand"].values
            else:
                self.region_strands = np.ones((len(fragments.regions.coordinates),), dtype=np.int8)
            self.region_centers = (fragments.regions.coordinates["start"] - fragments.regions.window[0]).values * (
                self.region_strands == 1
            ).astype(int) + (fragments.regions.coordinates["end"] + fragments.regions.window[0]).values * (
                self.region_strands == -1
            ).astype(
                int
            )
            self.is_view = True
        # elif isinstance(fragments, chromatinhd.data.fragments.Fragments):
        else:
            self.is_view = False
            # raise ValueError("fragments must be either a Fragments or FragmentsView object", type(fragments))
            self.regionxcell_indptr_reader = fragments.regionxcell_indptr.open_reader(
                {"context": {"data_copy_concurrency": {"limit": 1}}}
            )
            self.coordinates_reader = fragments.coordinates.open_reader(
                {
                    "context": {
                        "data_copy_concurrency": {"limit": 1},
                    }
                }
            )

        self.n_cells = fragments.n_cells

        self.provide_multiplets = provide_multiplets
        self.provide_libsize = provide_libsize

        if provide_libsize:
            library_size = fragments.libsize
            self.library_size = torch.from_numpy((library_size - library_size.mean()) / library_size.std()).float()

    def preload(self):
        self.out_fragmentixs = np.zeros((self.fragment_buffer_size,), dtype=np.int64)
        self.out_local_cellxregion_ix = np.zeros((self.fragment_buffer_size,), dtype=np.int64)

        self.preloaded = True

    def load(self, minibatch: Minibatch) -> FragmentsResult:
        """
        Load a minibatch of fragments.

        Parameters:
            minibatch: Minibatch object

        Returns:
            The loaded fragments
        """
        if not self.preloaded:
            self.preload()

        if (len(minibatch.cells_oi) * len(minibatch.regions_oi)) > self.cellxregion_batch_size:
            raise ValueError(
                "Too many cell x region requested, increase cellxregion_batch_size at loader initialization"
            )

        if self.is_view:
            # load the fragment indices using pointers to the regionxcell fragment indices
            regionxcell_ixs = (minibatch.regions_oi * self.n_cells + minibatch.cells_oi[:, None]).flatten()
            n_fragments = fragments_helpers.multiple_arange(
                np.array(self.regionxcell_indptr_reader[regionxcell_ixs, 0]),
                np.array(self.regionxcell_indptr_reader[regionxcell_ixs, 1]),
                self.out_fragmentixs,
                self.out_local_cellxregion_ix,
            )

            assert n_fragments < self.fragment_buffer_size, "fragment buffer size too small"

            regionxcell_fragmentixs = self.out_fragmentixs[:n_fragments]
            local_cellxregion_ix = self.out_local_cellxregion_ix[:n_fragments]

            regionmapping = minibatch.regions_oi[local_cellxregion_ix % minibatch.n_regions]
            coordinates = self.coordinates_reader[regionxcell_fragmentixs]  # this is typically the slowest part by far

            # center coordinates around region centers, flip based on strandedness
            coordinates = (coordinates - self.region_centers[regionmapping][:, None]) * self.region_strands[
                regionmapping
            ][:, None]
        else:
            regionxcell_ixs = (minibatch.regions_oi * self.n_cells + minibatch.cells_oi[:, None]).flatten()
            n_fragments = fragments_helpers.multiple_arange(
                self.regionxcell_indptr_reader[regionxcell_ixs],
                self.regionxcell_indptr_reader[regionxcell_ixs + 1],
                self.out_fragmentixs,
                self.out_local_cellxregion_ix,
            )
            regionxcell_fragmentixs = np.resize(self.out_fragmentixs, n_fragments)
            coordinates = self.coordinates_reader[regionxcell_fragmentixs]  # this is typically the slowest part by far
            local_cellxregion_ix = np.resize(self.out_local_cellxregion_ix, n_fragments)
            regionmapping = minibatch.regions_oi[local_cellxregion_ix % minibatch.n_regions]

        # multiplets
        if self.provide_multiplets:
            from chromatinhd.utils.numpy import indices_to_indptr

            indptr = indices_to_indptr(local_cellxregion_ix, minibatch.n_cells * minibatch.n_regions)
            indptr_diff = np.diff(indptr)

            doublets = np.where(indptr_diff == 2)[0]  # cell x region combinations with exactly two fragments
            doublet_idx = torch.from_numpy(np.stack([indptr[doublets], indptr[doublets] + 1], -1).flatten())

            triplets = np.where(indptr_diff == 3)[0]  # cell x region combinations with exactly three fragments
            triplet_idx = np.stack([indptr[triplets], indptr[triplets] + 1, indptr[triplets] + 2], -1).flatten()
        else:
            doublet_idx = None
            triplet_idx = None

        # libsize
        if self.provide_libsize:
            libsize = self.library_size[minibatch.cells_oi]
        else:
            libsize = None

        return FragmentsResult(
            coordinates=torch.from_numpy(coordinates),
            local_cellxregion_ix=torch.from_numpy(local_cellxregion_ix),
            n_fragments=n_fragments,
            regionmapping=torch.from_numpy(regionmapping),
            window=self.window,
            n_total_regions=self.n_regions,
            cells_oi=minibatch.cells_oi,
            regions_oi=minibatch.regions_oi,
            doublet_idx=doublet_idx,
            libsize=libsize,
        )

__init__(fragments, cellxregion_batch_size, n_fragment_per_cellxregion=None, buffer_size_multiplier=10, provide_libsize=False, provide_multiplets=True)

Parameters:

    fragments (Union[Fragments, FragmentsView], required): Fragments object
    cellxregion_batch_size (int, required): maximum number of cell x region combinations that will be loaded
    n_fragment_per_cellxregion (int, default None): estimated number of fragments per cell x region combination, used for preallocation
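
For intuition, the buffer preallocation in `__init__` works out roughly like this (illustrative numbers, not library defaults):

```python
# Illustrative numbers only.
cellxregion_batch_size = 1000    # max cell x region combinations per minibatch
n_fragment_per_cellxregion = 5   # estimate; by default fragments.estimate_fragment_per_cellxregion()
buffer_size_multiplier = 10      # headroom; increase if the buffer-size assertion in load() trips

fragment_buffer_size = int(n_fragment_per_cellxregion * cellxregion_batch_size * buffer_size_multiplier)
# -> 50000 preallocated slots for fragment indices and local cell x region indices
```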
Source code in src/chromatinhd/loaders/fragments.py (lines 151-219)
def __init__(
    self,
    fragments: Union[chromatinhd.data.fragments.Fragments, chromatinhd.data.fragments.FragmentsView],
    cellxregion_batch_size: int,
    n_fragment_per_cellxregion: int = None,
    buffer_size_multiplier=10,  # increase this if crashing
    provide_libsize=False,
    provide_multiplets=True,
):
    """
    Parameters:
        fragments: Fragments object
        cellxregion_batch_size: maximum number of cell x region combinations that will be loaded
        n_fragment_per_cellxregion: estimated number of fragments per cell x region combination, used for preallocation
    """
    self.cellxregion_batch_size = cellxregion_batch_size

    # store auxiliary information
    window = fragments.regions.window
    self.window = window

    # create buffers for coordinates
    if n_fragment_per_cellxregion is None:
        n_fragment_per_cellxregion = fragments.estimate_fragment_per_cellxregion()
    fragment_buffer_size = int(n_fragment_per_cellxregion * cellxregion_batch_size * buffer_size_multiplier)
    self.fragment_buffer_size = fragment_buffer_size

    self.n_regions = fragments.n_regions

    # set up readers and determine if we are dealing with a view or not
    if isinstance(fragments, chromatinhd.data.fragments.view.FragmentsView):
        self.regionxcell_indptr_reader = fragments.regionxcell_indptr.open_reader()
        self.coordinates_reader = fragments.coordinates.open_reader()

        if "strand" in fragments.regions.coordinates.columns:
            self.region_strands = fragments.regions.coordinates["strand"].values
        else:
            self.region_strands = np.ones((len(fragments.regions.coordinates),), dtype=np.int8)
        self.region_centers = (fragments.regions.coordinates["start"] - fragments.regions.window[0]).values * (
            self.region_strands == 1
        ).astype(int) + (fragments.regions.coordinates["end"] + fragments.regions.window[0]).values * (
            self.region_strands == -1
        ).astype(
            int
        )
        self.is_view = True
    # elif isinstance(fragments, chromatinhd.data.fragments.Fragments):
    else:
        self.is_view = False
        # raise ValueError("fragments must be either a Fragments or FragmentsView object", type(fragments))
        self.regionxcell_indptr_reader = fragments.regionxcell_indptr.open_reader(
            {"context": {"data_copy_concurrency": {"limit": 1}}}
        )
        self.coordinates_reader = fragments.coordinates.open_reader(
            {
                "context": {
                    "data_copy_concurrency": {"limit": 1},
                }
            }
        )

    self.n_cells = fragments.n_cells

    self.provide_multiplets = provide_multiplets
    self.provide_libsize = provide_libsize

    if provide_libsize:
        library_size = fragments.libsize
        self.library_size = torch.from_numpy((library_size - library_size.mean()) / library_size.std()).float()

load(minibatch)

Load a minibatch of fragments.

Parameters:

    minibatch (Minibatch, required): Minibatch object

Returns:

    FragmentsResult: The loaded fragments
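
The index bookkeeping in `load()` flattens the requested cells and regions cell-major (cells outer, regions inner). A small sketch, mirroring the `% n_regions` expression in `load()` and the `local_cell_ix` / `local_region_ix` properties of FragmentsResult, shows how the local index maps back to cells and regions:

```python
import numpy as np

# Illustrative: 3 cells and 2 regions requested in a minibatch.
cells_oi = np.array([10, 11, 12])
regions_oi = np.array([4, 7])
n_regions = len(regions_oi)

# local_cellxregion_ix enumerates cell x region combinations, cells outer, regions inner:
local_cellxregion_ix = np.arange(len(cells_oi) * n_regions)   # 0..5

local_region_ix = local_cellxregion_ix % n_regions            # [0, 1, 0, 1, 0, 1]
local_cell_ix = local_cellxregion_ix // n_regions             # [0, 0, 1, 1, 2, 2]
regionmapping = regions_oi[local_region_ix]                   # [4, 7, 4, 7, 4, 7]
cellmapping = cells_oi[local_cell_ix]                         # [10, 10, 11, 11, 12, 12]
```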

Source code in src/chromatinhd/loaders/fragments.py (lines 227-313)
def load(self, minibatch: Minibatch) -> FragmentsResult:
    """
    Load a minibatch of fragments.

    Parameters:
        minibatch: Minibatch object

    Returns:
        The loaded fragments
    """
    if not self.preloaded:
        self.preload()

    if (len(minibatch.cells_oi) * len(minibatch.regions_oi)) > self.cellxregion_batch_size:
        raise ValueError(
            "Too many cell x region requested, increase cellxregion_batch_size at loader initialization"
        )

    if self.is_view:
        # load the fragment indices using pointers to the regionxcell fragment indices
        regionxcell_ixs = (minibatch.regions_oi * self.n_cells + minibatch.cells_oi[:, None]).flatten()
        n_fragments = fragments_helpers.multiple_arange(
            np.array(self.regionxcell_indptr_reader[regionxcell_ixs, 0]),
            np.array(self.regionxcell_indptr_reader[regionxcell_ixs, 1]),
            self.out_fragmentixs,
            self.out_local_cellxregion_ix,
        )

        assert n_fragments < self.fragment_buffer_size, "fragment buffer size too small"

        regionxcell_fragmentixs = self.out_fragmentixs[:n_fragments]
        local_cellxregion_ix = self.out_local_cellxregion_ix[:n_fragments]

        regionmapping = minibatch.regions_oi[local_cellxregion_ix % minibatch.n_regions]
        coordinates = self.coordinates_reader[regionxcell_fragmentixs]  # this is typically the slowest part by far

        # center coordinates around region centers, flip based on strandedness
        coordinates = (coordinates - self.region_centers[regionmapping][:, None]) * self.region_strands[
            regionmapping
        ][:, None]
    else:
        regionxcell_ixs = (minibatch.regions_oi * self.n_cells + minibatch.cells_oi[:, None]).flatten()
        n_fragments = fragments_helpers.multiple_arange(
            self.regionxcell_indptr_reader[regionxcell_ixs],
            self.regionxcell_indptr_reader[regionxcell_ixs + 1],
            self.out_fragmentixs,
            self.out_local_cellxregion_ix,
        )
        regionxcell_fragmentixs = np.resize(self.out_fragmentixs, n_fragments)
        coordinates = self.coordinates_reader[regionxcell_fragmentixs]  # this is typically the slowest part by far
        local_cellxregion_ix = np.resize(self.out_local_cellxregion_ix, n_fragments)
        regionmapping = minibatch.regions_oi[local_cellxregion_ix % minibatch.n_regions]

    # multiplets
    if self.provide_multiplets:
        from chromatinhd.utils.numpy import indices_to_indptr

        indptr = indices_to_indptr(local_cellxregion_ix, minibatch.n_cells * minibatch.n_regions)
        indptr_diff = np.diff(indptr)

        doublets = np.where(indptr_diff == 2)[0]  # cell x region combinations with exactly two fragments
        doublet_idx = torch.from_numpy(np.stack([indptr[doublets], indptr[doublets] + 1], -1).flatten())

        triplets = np.where(indptr_diff == 3)[0]  # cell x region combinations with exactly three fragments
        triplet_idx = np.stack([indptr[triplets], indptr[triplets] + 1, indptr[triplets] + 2], -1).flatten()
    else:
        doublet_idx = None
        triplet_idx = None

    # libsize
    if self.provide_libsize:
        libsize = self.library_size[minibatch.cells_oi]
    else:
        libsize = None

    return FragmentsResult(
        coordinates=torch.from_numpy(coordinates),
        local_cellxregion_ix=torch.from_numpy(local_cellxregion_ix),
        n_fragments=n_fragments,
        regionmapping=torch.from_numpy(regionmapping),
        window=self.window,
        n_total_regions=self.n_regions,
        cells_oi=minibatch.cells_oi,
        regions_oi=minibatch.regions_oi,
        doublet_idx=doublet_idx,
        libsize=libsize,
    )

chromatinhd.loaders.fragments.Cuts

Bases: Fragments

Source code in src/chromatinhd/loaders/fragments.py (lines 339-368)
class Cuts(Fragments):
    def load(self, minibatch: Minibatch) -> CutsResult:
        """
        Load a minibatch of cuts.

        Parameters:
            minibatch: Minibatch object

        Returns:
            The loaded cut sites
        """
        result = super().load(minibatch)

        cut_coordinates = result.coordinates.flatten()

        n_cuts_per_fragment = result.coordinates.shape[1]
        local_cellxregion_ix = result.local_cellxregion_ix.expand(n_cuts_per_fragment, -1).T.flatten()

        # selected = np.random.rand(len(cut_coordinates)) < 0.2
        # cut_coordinates = cut_coordinates[selected]
        # local_cellxregion_ix = local_cellxregion_ix[selected]

        return CutsResult(
            coordinates=cut_coordinates,
            local_cellxregion_ix=local_cellxregion_ix,
            n_regions=len(minibatch.regions_oi),
            n_fragments=result.n_fragments,
            n_cuts=result.n_fragments * n_cuts_per_fragment,
            window=self.window,
        )

load(minibatch)

Load a minibatch of cuts.

Parameters:

    minibatch (Minibatch, required): Minibatch object

Returns:

    CutsResult: The loaded cut sites
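
A minimal usage sketch for the Cuts loader, reusing `fragments` and `minibatch` from the Fragments example above:

```python
cuts_loader = Cuts(fragments, cellxregion_batch_size=1000)
result = cuts_loader.load(minibatch)

# Each fragment contributes its two cut sites, flattened into 1-D tensors:
assert result.n_cuts == 2 * result.n_fragments
result.coordinates           # shape (n_cuts,)
result.local_cellxregion_ix  # repeated once per cut, aligned with coordinates
```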

Source code in src/chromatinhd/loaders/fragments.py (lines 340-368)
def load(self, minibatch: Minibatch) -> CutsResult:
    """
    Load a minibatch of cuts.

    Parameters:
        minibatch: Minibatch object

    Returns:
        The loaded cut sites
    """
    result = super().load(minibatch)

    cut_coordinates = result.coordinates.flatten()

    n_cuts_per_fragment = result.coordinates.shape[1]
    local_cellxregion_ix = result.local_cellxregion_ix.expand(n_cuts_per_fragment, -1).T.flatten()

    # selected = np.random.rand(len(cut_coordinates)) < 0.2
    # cut_coordinates = cut_coordinates[selected]
    # local_cellxregion_ix = local_cellxregion_ix[selected]

    return CutsResult(
        coordinates=cut_coordinates,
        local_cellxregion_ix=local_cellxregion_ix,
        n_regions=len(minibatch.regions_oi),
        n_fragments=result.n_fragments,
        n_cuts=result.n_fragments * n_cuts_per_fragment,
        window=self.window,
    )

chromatinhd.loaders.fragments.FragmentsResult dataclass

Source code in src/chromatinhd/loaders/fragments.py (lines 31-102)
@dataclasses.dataclass
class FragmentsResult:
    coordinates: torch.Tensor
    "Coordinates of the left and right cut site for each fragment"

    local_cellxregion_ix: torch.Tensor
    "Local cell x region index"

    n_fragments: int
    "Number of fragments"

    regionmapping: torch.Tensor = None
    "Mapping from local cell x region index to region index"

    cells_oi: np.ndarray = None
    "Cells of interest"

    regions_oi: np.ndarray = None
    "Regions of interest"

    window: np.ndarray = None
    "Window of the region"

    n_total_regions: int = None
    "Total number of regions"

    localcellxregion_ix: torch.Tensor = None
    "Local cell x region index, in the same order as cells_oi and regions_oi"

    doublet_idx: torch.Tensor = None
    "Indices of doublets"

    libsize: torch.Tensor = None
    "Library size for each cell"

    @property
    def n_cells(self):
        return len(self.cells_oi)

    @property
    def n_regions(self):
        return len(self.regions_oi)

    def to(self, device):
        self.coordinates = self.coordinates.to(device)
        self.local_cellxregion_ix = self.local_cellxregion_ix.to(device)
        if self.regionmapping is not None:
            self.regionmapping = self.regionmapping.to(device)
        if self.libsize is not None:
            self.libsize = self.libsize.to(device)
        return self

    @property
    def local_region_ix(self):
        return self.local_cellxregion_ix % self.n_regions

    @property
    def local_cell_ix(self):
        return torch.div(self.local_cellxregion_ix, self.n_regions, rounding_mode="floor")

    def filter_fragments(self, fragments_oi):
        assert len(fragments_oi) == self.n_fragments
        return FragmentsResult(
            coordinates=self.coordinates[fragments_oi],
            local_cellxregion_ix=self.local_cellxregion_ix[fragments_oi],
            regionmapping=self.regionmapping[fragments_oi],
            n_fragments=fragments_oi.sum(),
            cells_oi=self.cells_oi,
            regions_oi=self.regions_oi,
            window=self.window,
            n_total_regions=self.n_total_regions,
        )

Attributes:

    cells_oi (np.ndarray, default None): Cells of interest
    coordinates (torch.Tensor): Coordinates of the left and right cut site for each fragment
    doublet_idx (torch.Tensor, default None): Indices of doublets
    libsize (torch.Tensor, default None): Library size for each cell
    local_cellxregion_ix (torch.Tensor): Local cell x region index
    localcellxregion_ix (torch.Tensor, default None): Local cell x region index, in the same order as cells_oi and regions_oi
    n_fragments (int): Number of fragments
    n_total_regions (int, default None): Total number of regions
    regionmapping (torch.Tensor, default None): Mapping from local cell x region index to region index
    regions_oi (np.ndarray, default None): Regions of interest
    window (np.ndarray, default None): Window of the region
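
A short sketch of post-processing a FragmentsResult, e.g. restricting to fragments whose left cut site falls in a sub-window (`data` as returned by `Fragments.load` above; the window bounds are illustrative):

```python
import torch

# Keep only fragments whose left cut site lies in [-500, 500), then move to a device.
fragments_oi = (data.coordinates[:, 0] >= -500) & (data.coordinates[:, 0] < 500)
subset = data.filter_fragments(fragments_oi)

device = "cuda" if torch.cuda.is_available() else "cpu"
subset = subset.to(device)

# Decompose the local index back into cell and region within the minibatch:
subset.local_cell_ix, subset.local_region_ix
```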

chromatinhd.loaders.fragments.CutsResult dataclass

Source code in src/chromatinhd/loaders/fragments.py (lines 316-336)
@dataclasses.dataclass
class CutsResult:
    coordinates: torch.Tensor
    local_cellxregion_ix: torch.Tensor
    n_regions: int
    n_fragments: int
    n_cuts: int
    window: np.ndarray

    @property
    def local_region_ix(self):
        return self.local_cellxregion_ix % self.n_regions

    @property
    def local_cell_ix(self):
        return torch.div(self.local_cellxregion_ix, self.n_regions, rounding_mode="floor")

    def to(self, device):
        self.coordinates = self.coordinates.to(device)
        self.local_cellxregion_ix = self.local_cellxregion_ix.to(device)
        return self
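
Analogously for a CutsResult (continuing the Cuts sketch above, and assuming `window` is a `[start, end]` pair as used elsewhere in the loader):

```python
# Per-cut decomposition into minibatch-local cell and region indices:
cut_cell_ix = result.local_cell_ix      # local_cellxregion_ix // n_regions
cut_region_ix = result.local_region_ix  # local_cellxregion_ix % n_regions

# Illustrative binning of cut sites over the region-relative window:
n_bins = 20
bin_width = (result.window[1] - result.window[0]) / n_bins
bin_ix = ((result.coordinates - result.window[0]) / bin_width).long().clamp(0, n_bins - 1)
```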