Fragments

chromatinhd.data.fragments.Fragments

Bases: Flow

Fragments positioned within regions. Fragments are sorted by region, then by cell, and then by position within the region (left cut site).

The object can also store several precalculated tensors that are used for efficient loading of fragments. See create_regionxcell_indptr for more information.
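A typical workflow creates the object from a fragments file and then precomputes these pointers. A minimal sketch, assuming a CellRanger-style fragments.tsv.gz and an existing `Regions` object (the paths and barcodes below are hypothetical):

import pandas as pd
import chromatinhd as chd

# cell metadata; the index must match the barcodes in the fragments file
obs = pd.DataFrame(index=["AAACCC-1", "AAAGGG-1"])

fragments = chd.data.fragments.Fragments.from_fragments_tsv(
    "fragments.tsv.gz",
    regions=regions,  # an existing chd.data.Regions object, created elsewhere
    obs=obs,
    path="dataset/fragments",
)
fragments.create_regionxcell_indptr()  # precompute region x cell pointers for fast loading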

Source code in src/chromatinhd/data/fragments/fragments.py
class Fragments(Flow):
    """
    Fragments positioned within regions. Fragments are sorted by region, then by cell, and then by position within the region (left cut site).

    The object can also store several precalculated tensors that are used for efficient loading of fragments. See create_regionxcell_indptr for more information.
    """

    regions: Regions = Linked()
    """The regions in which (part of) the fragments are located and centered."""

    coordinates: TensorstoreInstance = Tensorstore(dtype="<i4", chunks=[100000, 2], compression=None, shape=[0, 2])
    """Coordinates of the two cut sites."""

    mapping: TensorstoreInstance = Tensorstore(dtype="<i4", chunks=[100000, 2], compression="blosc", shape=[0, 2])
    """Mapping of a fragment to a cell (first column) and a region (second column)"""

    regionxcell_indptr: np.ndarray = Tensorstore(dtype="<i8", chunks=[100000], compression="blosc")
    """Index pointers to the regionxcell fragment positions"""

    def create_regionxcell_indptr(self, overwrite=False) -> Fragments:
        """
        Creates pointers to each individual region x cell combination from the mapping tensor.

        Returns:
            The same object with `regionxcell_indptr` populated
        """

        if self.o.regionxcell_indptr.exists(self) and not overwrite:
            return self

        regionxcell_ix = self.mapping[:, 1] * self.n_cells + self.mapping[:, 0]

        if not (np.diff(regionxcell_ix) >= 0).all():
            raise ValueError("Fragments should be ordered by regionxcell (ascending)")

        if not self.mapping[:, 0].max() < self.n_cells:
            raise ValueError("First column of mapping should be smaller than the number of cells")

        n_regionxcell = self.n_regions * self.n_cells
        regionxcell_indptr = np.pad(np.cumsum(np.bincount(regionxcell_ix, minlength=n_regionxcell), 0), (1, 0))
        assert self.coordinates.shape[0] == regionxcell_indptr[-1]
        if not (np.diff(regionxcell_indptr) >= 0).all():
            raise ValueError("Fragments should be ordered by regionxcell (ascending)")
        self.regionxcell_indptr[:] = regionxcell_indptr

        return self

    var: pd.DataFrame = TSV()
    """DataFrame containing information about regions."""

    obs: pd.DataFrame = TSV()
    """DataFrame containing information about cells."""

    @functools.cached_property
    def n_regions(self):
        """Number of regions"""
        return self.var.shape[0]

    @functools.cached_property
    def n_cells(self):
        """Number of cells"""
        return self.obs.shape[0]

    @property
    def local_cellxregion_ix(self):
        """Cell-major (cell x region) index of each fragment."""
        return self.mapping[:, 0] * self.n_regions + self.mapping[:, 1]

    def estimate_fragment_per_cellxregion(self) -> int:
        """Estimated average number of fragments per cell x region combination, rounded up."""
        return math.ceil(self.coordinates.shape[0] / self.n_cells / self.n_regions)

    @class_or_instancemethod
    def from_fragments_tsv(
        cls,
        fragments_file: PathLike,
        regions: Regions,
        obs: pd.DataFrame,
        cell_column: str = None,
        path: PathLike = None,
        overwrite: bool = False,
        reuse: bool = True,
        batch_size: int = 1_000_000,
    ) -> Fragments:
        """
        Create a Fragments object from a fragments tsv file

        Parameters:
            fragments_file:
                Location of the fragments tab-separated file created by e.g. CellRanger or sinto
            obs:
                DataFrame containing information about cells.
                The index should be the cell names as present in the fragments file.
                Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
            regions:
                Regions from which the fragments will be extracted.
            cell_column:
                Column name in the `obs` DataFrame containing the cell names.
                If None, the index of the `obs` DataFrame is used.
            path:
                Folder in which the fragments data will be stored.
            overwrite:
                Whether to overwrite the data if it already exists.
            reuse:
                Whether to reuse existing data if it exists.
            batch_size:
                Number of fragments to process before saving. Lower this number if you run out of memory.
        Returns:
            A new Fragments object
        """

        if isinstance(fragments_file, str):
            fragments_file = pathlib.Path(fragments_file)
        if isinstance(path, str):
            path = pathlib.Path(path)
        if not fragments_file.exists():
            raise FileNotFoundError(f"File {fragments_file} does not exist")
        if path is not None and not overwrite and path.exists() and not reuse:
            raise FileExistsError(
                f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
            )

        # regions information
        var = pd.DataFrame(index=regions.coordinates.index)
        var["ix"] = np.arange(var.shape[0])

        # cell information
        obs = obs.copy()
        obs["ix"] = np.arange(obs.shape[0])
        if cell_column is None:
            cell_to_cell_ix = obs["ix"].to_dict()
        else:
            cell_to_cell_ix = obs.set_index(cell_column)["ix"].to_dict()

        self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

        # read the fragments file
        try:
            import pysam
        except ImportError as e:
            raise ImportError(
                "pysam is required to read fragments files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
            ) from e
        fragments_tabix = pysam.TabixFile(str(fragments_file))

        # process regions
        pbar = tqdm.tqdm(
            enumerate(regions.coordinates.iterrows()),
            total=regions.coordinates.shape[0],
            leave=False,
            desc="Processing fragments",
        )

        self.mapping.open_creator()
        self.coordinates.open_creator()

        mapping_processed = []
        coordinates_processed = []

        for region_ix, (region_id, region_info) in pbar:
            pbar.set_description(f"{region_id}")

            strand = region_info["strand"]
            if "tss" in region_info:
                tss = region_info["tss"]
            else:
                tss = region_info["start"]

            coordinates_raw, mapping_raw = _fetch_fragments_region(
                fragments_tabix=fragments_tabix,
                chrom=region_info["chrom"],
                start=region_info["start"],
                end=region_info["end"],
                tss=tss,
                strand=strand,
                cell_to_cell_ix=cell_to_cell_ix,
                region_ix=region_ix,
            )

            mapping_processed.append(mapping_raw)
            coordinates_processed.append(coordinates_raw)

            if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
                self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
                self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
                mapping_processed = []
                coordinates_processed = []

            del mapping_raw
            del coordinates_raw

        if len(mapping_processed) > 0:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

        return self

    @class_or_instancemethod
    def from_multiple_fragments_tsv(
        cls,
        fragments_files: list[PathLike],
        regions: Regions,
        obs: pd.DataFrame,
        cell_column: str = "cell_original",
        batch_column: str = "batch",
        path: PathLike = None,
        overwrite: bool = False,
        reuse: bool = True,
        batch_size: int = 1_000_000,
    ) -> Fragments:
        """
        Create a Fragments object from multiple fragments tsv files

        Parameters:
            fragments_files:
                Locations of the fragments tab-separated files created by e.g. CellRanger or sinto, one per batch
            obs:
                DataFrame containing information about cells.
                The index should be the cell names as present in the fragments file.
                Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
            regions:
                Regions from which the fragments will be extracted.
            cell_column:
                Column name in the `obs` DataFrame containing the cell names.
            batch_column:
                Column name in the `obs` DataFrame containing the batch index of each cell.
                Batch index i corresponds to the i-th file in `fragments_files`.
            path:
                Folder in which the fragments data will be stored.
            overwrite:
                Whether to overwrite the data if it already exists.
            reuse:
                Whether to reuse existing data if it exists.
            batch_size:
                Number of fragments to process before saving. Lower this number if you run out of memory.
        Returns:
            A new Fragments object
        """

        if not isinstance(fragments_files, (list, tuple)):
            fragments_files = [fragments_files]

        fragments_files_ = []
        for fragments_file in fragments_files:
            if isinstance(fragments_file, str):
                fragments_file = pathlib.Path(fragments_file)
            if not fragments_file.exists():
                raise FileNotFoundError(f"File {fragments_file} does not exist")
            fragments_files_.append(fragments_file)
        fragments_files = fragments_files_

        if isinstance(path, str):
            path = pathlib.Path(path)
        if path is not None and not overwrite and path.exists() and not reuse:
            raise FileExistsError(
                f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
            )

        # regions information
        var = pd.DataFrame(index=regions.coordinates.index)
        var["ix"] = np.arange(var.shape[0])

        # cell information
        obs = obs.copy()
        obs["ix"] = np.arange(obs.shape[0])

        if batch_column is None:
            raise ValueError("batch_column should be specified")
        if batch_column not in obs.columns:
            raise ValueError(f"Column {batch_column} not in obs")
        if obs[batch_column].dtype != "int":
            raise ValueError(f"Column {batch_column} should be an integer column")
        if not obs[batch_column].max() == len(fragments_files) - 1:
            raise ValueError(f"Column {batch_column} should contain values between 0 and {len(fragments_files) - 1}")
        cell_to_cell_ix_batches = [
            obs.loc[obs[batch_column] == batch].set_index(cell_column)["ix"]
            for batch in range(len(fragments_files))
        ]

        self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

        # read the fragments file
        try:
            import pysam
        except ImportError as e:
            raise ImportError(
                "pysam is required to read fragments files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
            ) from e
        fragments_tabix_batches = [pysam.TabixFile(str(fragments_file)) for fragments_file in fragments_files]

        # process regions
        pbar = tqdm.tqdm(
            enumerate(regions.coordinates.iterrows()),
            total=regions.coordinates.shape[0],
            leave=False,
            desc="Processing fragments",
        )

        self.mapping.open_creator()
        self.coordinates.open_creator()

        mapping_processed = []
        coordinates_processed = []

        for region_ix, (region_id, region_info) in pbar:
            pbar.set_description(f"{region_id}")

            strand = region_info["strand"]
            if "tss" in region_info:
                tss = region_info["tss"]
            else:
                tss = region_info["start"]

            mapping_raw = []
            coordinates_raw = []

            for fragments_tabix, cell_to_cell_ix in zip(fragments_tabix_batches, cell_to_cell_ix_batches):
                coordinates_raw_batch, mapping_raw_batch = _fetch_fragments_region(
                    fragments_tabix=fragments_tabix,
                    chrom=region_info["chrom"],
                    start=region_info["start"],
                    end=region_info["end"],
                    tss=tss,
                    strand=strand,
                    cell_to_cell_ix=cell_to_cell_ix,
                    region_ix=region_ix,
                )
                mapping_raw.append(mapping_raw_batch)
                coordinates_raw.append(coordinates_raw_batch)

            mapping_raw = np.concatenate(mapping_raw)
            coordinates_raw = np.concatenate(coordinates_raw)

            # sort by region, then cell, then left cut site coordinate
            sorted_idx = np.lexsort((coordinates_raw[:, 0], mapping_raw[:, 0], mapping_raw[:, 1]))
            mapping_raw = mapping_raw[sorted_idx]
            coordinates_raw = coordinates_raw[sorted_idx]

            mapping_processed.append(mapping_raw)
            coordinates_processed.append(coordinates_raw)

            if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
                self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
                self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
                mapping_processed = []
                coordinates_processed = []

            del mapping_raw
            del coordinates_raw

        if len(mapping_processed) > 0:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

        return self

    @class_or_instancemethod
    def from_alignments(
        cls,
        obs: pd.DataFrame,
        regions: Regions,
        file_column: str = "path",
        alignment_column: str = None,
        remove_duplicates: bool = None,
        path: PathLike = None,
        overwrite: bool = False,
        reuse: bool = True,
        batch_size: int = 10_000_000,
        paired: bool = True,
    ) -> Fragments:
        """
        Create a Fragments object from multiple alignment (bam/sam) files, each pertaining to a single cell or sample

        Parameters:
            obs:
                DataFrame containing information about cells/samples.
                The index will be used as the name of the cell for future reference.
                The DataFrame should contain a column with the path to the alignment file (specified in the file_column) or a column with the alignment object itself (specified in the alignment_column).
                Any additional data about cells/samples can be stored in this dataframe as well.
            regions:
                Regions from which the fragments will be extracted.
            file_column:
                Column name in the `obs` DataFrame containing the path to the alignment file.
            alignment_column:
                Column name in the `obs` DataFrame containing the alignment object.
                If None, the alignment object will be loaded using the `file_column`.
            remove_duplicates:
                Whether to remove duplicate fragments within a sample or cell. This is commonly done for single-cell data, but not necessarily for bulk data. If the data is paired, duplicates will be removed by default. If the data is single-end, duplicates will not be removed by default.
            path:
                Folder in which the fragments data will be stored.
            overwrite:
                Whether to overwrite the data if it already exists.
            reuse:
                Whether to reuse existing data if it exists.
            batch_size:
                Number of fragments to process before saving. Lower this number if you run out of memory. Increase the number if you want to speed up the process, particularly if disk I/O is slow.
            paired:
                Whether the reads are paired-end or single-end. If paired, the coordinates of the two cut sites will be stored. If single-end, only the coordinate of one cut site will be stored. Note that this also affects the default value of `remove_duplicates`.
        """
        # regions information
        var = pd.DataFrame(index=regions.coordinates.index)
        var["ix"] = np.arange(var.shape[0])

        # check path and overwrite
        if path is not None:
            if isinstance(path, str):
                path = pathlib.Path(path)
            if not overwrite and path.exists() and not reuse:
                raise FileExistsError(
                    f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
                )
        if overwrite:
            reuse = False

        self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

        if reuse:
            return self

        # load alignment files
        try:
            import pysam
        except ImportError as e:
            raise ImportError(
                "pysam is required to read alignment files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
            ) from e

        if alignment_column is None:
            if file_column not in obs.columns:
                raise ValueError(f"Column {file_column} not in obs")
            alignments = {}
            for cell, cell_info in obs.iterrows():
                alignments[cell] = pysam.Samfile(cell_info[file_column], "rb")
        else:
            if alignment_column not in obs.columns:
                raise ValueError(f"Column {alignment_column} not in obs")
            alignments = obs[alignment_column].to_dict()

        # process regions
        pbar = tqdm.tqdm(
            enumerate(regions.coordinates.iterrows()),
            total=regions.coordinates.shape[0],
            leave=False,
            desc="Processing fragments",
        )

        self.mapping.open_creator()
        self.coordinates.open_creator()

        mapping_processed = []
        coordinates_processed = []

        for region_ix, (region_id, region_info) in pbar:
            pbar.set_description(f"{region_id}")

            chrom = region_info["chrom"]
            start = region_info["start"]
            end = region_info["end"]
            strand = region_info["strand"]
            if "tss" in region_info:
                tss = region_info["tss"]
            else:
                tss = region_info["start"]

            # process cell/sample
            for cell_ix, (cell_id, alignment) in enumerate(alignments.items()):
                if paired:
                    coordinates_raw = _process_paired(
                        alignment=alignment,
                        chrom=chrom,
                        start=start,
                        end=end,
                        remove_duplicates=True if remove_duplicates is None else remove_duplicates,
                    )
                    coordinates_raw = (np.array(coordinates_raw).reshape(-1, 2).astype(np.int32) - tss) * strand
                else:
                    coordinates_raw = _process_single(
                        alignment=alignment,
                        chrom=chrom,
                        start=start,
                        end=end,
                        remove_duplicates=False if remove_duplicates is None else remove_duplicates,
                    )
                    coordinates_raw = (np.array(coordinates_raw).reshape(-1, 1).astype(np.int32) - tss) * strand

                mapping_raw = np.stack(
                    [
                        np.repeat(cell_ix, len(coordinates_raw)),
                        np.repeat(region_ix, len(coordinates_raw)),
                    ],
                    axis=1,
                )

                # sort by region, then cell, then left cut site coordinate
                sorted_idx = np.lexsort((coordinates_raw[:, 0], mapping_raw[:, 0], mapping_raw[:, 1]))
                mapping_raw = mapping_raw[sorted_idx]
                coordinates_raw = coordinates_raw[sorted_idx]

                if len(mapping_raw) > 0:
                    mapping_processed.append(mapping_raw)
                    coordinates_processed.append(coordinates_raw)

                if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
                    self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
                    self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
                    mapping_processed = []
                    coordinates_processed = []

                del mapping_raw
                del coordinates_raw

        # add final fragments
        if len(mapping_processed) > 0:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

        return self

    def filter_regions(self, regions: Regions, path: PathLike = None, overwrite=True) -> Fragments:
        """
        Filter based on new regions

        Parameters:
            regions:
                Regions to keep; must be a subset of the existing regions.
        Returns:
            A new Fragments object
        """

        # check if new regions are a subset of the existing ones
        if not regions.coordinates.index.isin(self.regions.coordinates.index).all():
            raise ValueError("New regions should be a subset of the existing ones")

        # filter region info
        self.regions.coordinates["ix"] = np.arange(self.regions.coordinates.shape[0])
        regions.coordinates["ix"] = self.regions.coordinates["ix"].loc[regions.coordinates.index]

        var = self.regions.coordinates.copy()
        var["original_ix"] = np.arange(var.shape[0])
        var = var.loc[regions.coordinates.index].copy()
        var["ix"] = np.arange(var.shape[0])

        # filter coordinates/mapping
        fragments_oi = np.isin(self.mapping[:, 1], regions.coordinates["ix"])

        mapping = self.mapping[fragments_oi].copy()
        mapping[:, 1] = var.set_index("original_ix").loc[mapping[:, 1], "ix"].values
        coordinates = self.coordinates[fragments_oi].copy()

        # Sort `coordinates` and `mapping` according to `mapping`
        sorted_idx = np.lexsort((coordinates[:, 0], mapping[:, 0], mapping[:, 1]))
        mapping = mapping[sorted_idx]
        coordinates = coordinates[sorted_idx]

        fragments = Fragments.create(
            coordinates=coordinates, mapping=mapping, regions=regions, var=var, obs=self.obs, path=path, reset=overwrite
        )

        return fragments

    def filter_cells(self, cells, path: PathLike = None) -> Fragments:
        """
        Filter based on new cells

        Parameters:
            cells:
                Cells to keep; must be a subset of the existing cells.
        Returns:
            A new Fragments object
        """

        # check if new cells are a subset of the existing ones
        if not pd.Series(cells).isin(self.obs.index).all():
            raise ValueError("New cells should be a subset of the existing ones")

        # filter region info
        self.obs["ix"] = np.arange(self.obs.shape[0])
        obs = self.obs.loc[cells].copy()
        obs["original_ix"] = self.obs["ix"].loc[obs.index]
        obs["ix"] = np.arange(obs.shape[0])

        # filter coordinates/mapping
        fragments_oi = np.isin(self.mapping[:, 0], obs["original_ix"])

        mapping = self.mapping[fragments_oi].copy()
        mapping[:, 0] = obs.set_index("original_ix").loc[mapping[:, 0], "ix"].values
        coordinates = self.coordinates[fragments_oi]

        # Sort `coordinates` and `mapping` according to `mapping`
        sorted_idx = np.lexsort((coordinates[:, 0], mapping[:, 0], mapping[:, 1]))
        mapping = mapping[sorted_idx]
        coordinates = coordinates[sorted_idx]

        return Fragments.create(
            coordinates=coordinates, mapping=mapping, regions=self.regions, var=self.var, obs=obs, path=path
        )

    @property
    def counts(self):
        """
        Counts of fragments per cell x region
        """
        cellxregion_ix = self.mapping[:, 0] * self.n_regions + self.mapping[:, 1]
        counts = np.bincount(cellxregion_ix, minlength=self.n_cells * self.n_regions).reshape(
            self.n_cells, self.n_regions
        )
        return counts

    @property
    def nonzero(self):
        return self.counts > 0

    _single_region_cache = None

    def get_cache(self, region_oi):
        """Build (or return a cached copy of) the `regionxcell_indptr` and `coordinates` for a single region."""
        if self._single_region_cache is None:
            self._single_region_cache = {}
        if region_oi not in self._single_region_cache:
            region = self.var.index.get_loc(region_oi)
            regionxcell_indptr = self.regionxcell_indptr.open_reader(
                {"context": {"data_copy_concurrency": {"limit": 1}}}
            )[region * self.n_cells : (region + 1) * self.n_cells + 1]
            coordinates = self.coordinates.open_reader(
                {
                    "context": {
                        "data_copy_concurrency": {"limit": 1},
                    }
                }
            )[regionxcell_indptr[0] : regionxcell_indptr[-1]]
            regionxcell_indptr = regionxcell_indptr - regionxcell_indptr[0]

            self._single_region_cache[region_oi] = {
                "regionxcell_indptr": regionxcell_indptr,
                "coordinates": coordinates,
            }
        return self._single_region_cache[region_oi]

    _libsize = None

    @property
    def libsize(self):
        """Total number of fragments per cell (library size)."""
        if self._libsize is None:
            self._libsize = np.bincount(self.mapping[:, 0], minlength=self.n_cells)
        return self._libsize

coordinates: TensorstoreInstance = Tensorstore(dtype='<i4', chunks=[100000, 2], compression=None, shape=[0, 2])

Coordinates of the two cut sites.

counts (property)

Counts of fragments per cell x region.

mapping: TensorstoreInstance = Tensorstore(dtype='<i4', chunks=[100000, 2], compression='blosc', shape=[0, 2])

Mapping of a fragment to a cell (first column) and a region (second column).

n_cells (cached property)

Number of cells.

n_regions (cached property)

Number of regions.

obs: pd.DataFrame = TSV()

DataFrame containing information about cells.

regions: Regions = Linked()

The regions in which (part of) the fragments are located and centered.

regionxcell_indptr: np.ndarray = Tensorstore(dtype='<i8', chunks=[100000], compression='blosc')

Index pointers to the regionxcell fragment positions.

var: pd.DataFrame = TSV()

DataFrame containing information about regions.
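The `mapping` and `coordinates` tensors are row-aligned: row i of both describes the same fragment. A small illustrative sketch, assuming `fragments` is an existing Fragments object:

import numpy as np

mapping = fragments.mapping[:]          # (n_fragments, 2): cell ix, region ix
coordinates = fragments.coordinates[:]  # (n_fragments, 2): left and right cut site

# all fragments of cell 0 within region 3, with their region-centered cut sites
mask = (mapping[:, 0] == 0) & (mapping[:, 1] == 3)
print(coordinates[mask])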

create_regionxcell_indptr(overwrite=False)

Creates pointers to each individual region x cell combination from the mapping tensor.

Returns:
    Fragments: The same object with `regionxcell_indptr` populated.
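The populated array follows the CSR-style indptr convention: the fragments of the combination with flat index k = region_ix * n_cells + cell_ix occupy rows indptr[k]:indptr[k + 1]. A short sketch of how the pointers can be used once created:

import numpy as np

region_ix, cell_ix = 3, 0
k = region_ix * fragments.n_cells + cell_ix

indptr = fragments.regionxcell_indptr[:]
coords = fragments.coordinates[indptr[k] : indptr[k + 1]]  # cut sites of this region x cell

counts = np.diff(indptr)  # fragment count per region x cell combination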

filter_cells(cells, path=None)

Filter based on new cells.

Parameters:
    cells:
        Cells to keep; must be a subset of the existing cells.

Returns:
    A new Fragments object.

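A hedged usage sketch; the cell ids are hypothetical and must be present in `fragments.obs.index`:

cells_oi = ["AAACCC-1", "AAAGGG-1"]
fragments_subset = fragments.filter_cells(cells_oi, path="dataset/fragments_cells_subset")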

filter_regions(regions, path=None, overwrite=True)

Filter based on new regions.

Parameters:
    regions (Regions):
        Regions to keep; must be a subset of the existing regions.

Returns:
    A new Fragments object.

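A hedged usage sketch, assuming `regions_subset` is a Regions object covering a subset of `fragments.regions` (created elsewhere):

fragments_subset = fragments.filter_regions(regions_subset, path="dataset/fragments_regions_subset")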

from_alignments(obs, regions, file_column='path', alignment_column=None, remove_duplicates=None, path=None, overwrite=False, reuse=True, batch_size=10_000_000, paired=True)

Create a Fragments object from multiple alignment (bam/sam) files, each pertaining to a single cell or sample.

Parameters:
    obs (DataFrame, required):
        DataFrame containing information about cells/samples. The index will be used as the name of the cell for future reference. The DataFrame should contain a column with the path to the alignment file (specified in `file_column`) or a column with the alignment object itself (specified in `alignment_column`). Any additional data about cells/samples can be stored in this dataframe as well.
    regions (Regions, required):
        Regions from which the fragments will be extracted.
    file_column (str, default 'path'):
        Column name in the `obs` DataFrame containing the path to the alignment file.
    alignment_column (str, default None):
        Column name in the `obs` DataFrame containing the alignment object. If None, the alignment object will be loaded using the `file_column`.
    remove_duplicates (bool, default None):
        Whether to remove duplicate fragments within a sample or cell. This is commonly done for single-cell data, but not necessarily for bulk data. If the data is paired-end, duplicates are removed by default; if single-end, they are kept by default.
    path (PathLike, default None):
        Folder in which the fragments data will be stored.
    overwrite (bool, default False):
        Whether to overwrite the data if it already exists.
    reuse (bool, default True):
        Whether to reuse existing data if it exists.
    batch_size (int, default 10_000_000):
        Number of fragments to process before saving. Lower this number if you run out of memory; increase it to speed up processing, particularly if disk I/O is slow.
    paired (bool, default True):
        Whether the reads are paired-end or single-end. If paired, the coordinates of the two cut sites are stored; if single-end, only one cut site is stored. This also affects the default value of `remove_duplicates`.
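A hedged sketch for sample-level BAM files; the file names and the `regions` object are illustrative assumptions:

import pandas as pd
import chromatinhd as chd

obs = pd.DataFrame(
    {"path": ["sample_a.bam", "sample_b.bam"]},  # one alignment file per sample (hypothetical)
    index=["sample_a", "sample_b"],
)

fragments = chd.data.fragments.Fragments.from_alignments(
    obs,
    regions=regions,  # an existing chd.data.Regions object
    path="dataset/fragments_bulk",
    paired=True,      # paired-end: duplicates are removed by default
)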

from_fragments_tsv(fragments_file, regions, obs, cell_column=None, path=None, overwrite=False, reuse=True, batch_size=1_000_000)

Create a Fragments object from a fragments tsv file.

Parameters:
    fragments_file (PathLike, required):
        Location of the fragments tab-separated file created by e.g. CellRanger or sinto.
    obs (DataFrame, required):
        DataFrame containing information about cells. The index should be the cell names as present in the fragments file. Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
    regions (Regions, required):
        Regions from which the fragments will be extracted.
    cell_column (str, default None):
        Column name in the `obs` DataFrame containing the cell names. If None, the index of the `obs` DataFrame is used.
    path (PathLike, default None):
        Folder in which the fragments data will be stored.
    overwrite (bool, default False):
        Whether to overwrite the data if it already exists.
    reuse (bool, default True):
        Whether to reuse existing data if it exists.
    batch_size (int, default 1_000_000):
        Number of fragments to process before saving. Lower this number if you run out of memory.

Returns:
    A new Fragments object.

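A hedged sketch showing the `cell_column` variant; the barcodes and file name are hypothetical:

import pandas as pd
import chromatinhd as chd

obs = pd.DataFrame({"barcode": ["AAACCC-1", "AAAGGG-1"]}, index=["cell_0", "cell_1"])

fragments = chd.data.fragments.Fragments.from_fragments_tsv(
    "fragments.tsv.gz",
    regions=regions,        # an existing chd.data.Regions object
    obs=obs,
    cell_column="barcode",  # barcodes as they appear in the fragments file
    path="dataset/fragments",
)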

from_multiple_fragments_tsv(fragments_files, regions, obs, cell_column='cell_original', batch_column='batch', path=None, overwrite=False, reuse=True, batch_size=1_000_000)

Create a Fragments object from multiple fragments tsv files.

Parameters:
    fragments_files (list[PathLike], required):
        Locations of the fragments tab-separated files created by e.g. CellRanger or sinto, one per batch.
    obs (DataFrame, required):
        DataFrame containing information about cells. The index should be the cell names as present in the fragments files. Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
    regions (Regions, required):
        Regions from which the fragments will be extracted.
    cell_column (str, default 'cell_original'):
        Column name in the `obs` DataFrame containing the cell names.
    batch_column (str, default 'batch'):
        Column name in the `obs` DataFrame containing the batch index of each cell; batch index i corresponds to the i-th file in `fragments_files`.
    path (PathLike, default None):
        Folder in which the fragments data will be stored.
    overwrite (bool, default False):
        Whether to overwrite the data if it already exists.
    reuse (bool, default True):
        Whether to reuse existing data if it exists.
    batch_size (int, default 1_000_000):
        Number of fragments to process before saving. Lower this number if you run out of memory.

Returns:
    A new Fragments object.

Source code in src/chromatinhd/data/fragments/fragments.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
@class_or_instancemethod
def from_multiple_fragments_tsv(
    cls,
    fragments_files: [PathLike],
    regions: Regions,
    obs: pd.DataFrame,
    cell_column: str = "cell_original",
    batch_column: str = "batch",
    path: PathLike = None,
    overwrite: bool = False,
    reuse: bool = True,
    batch_size: int = 1e6,
) -> Fragments:
    """
    Create a Fragments object from multiple fragments tsv file

    Parameters:
        fragments_files:
            Location of the fragments tab-separate file created by e.g. CellRanger or sinto
        obs:
            DataFrame containing information about cells.
            The index should be the cell names as present in the fragments file.
            Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
        regions:
            Regions from which the fragments will be extracted.
        cell_column:
            Column name in the `obs` DataFrame containing the cell names.
        batch_column:
            Column name in the `obs` DataFrame containing the batch indices.
            If None, will default to batch
        path:
            Folder in which the fragments data will be stored.
        overwrite:
            Whether to overwrite the data if it already exists.
        reuse:
            Whether to reuse existing data if it exists.
        batch_size:
            Number of fragments to process before saving. Lower this number if you run out of memory.
    Returns:
        A new Fragments object
    """

    if not isinstance(fragments_files, (list, tuple)):
        fragments_files = [fragments_files]

    fragments_files_ = []
    for fragments_file in fragments_files:
        if isinstance(fragments_file, str):
            fragments_file = pathlib.Path(fragments_file)
        if not fragments_file.exists():
            raise FileNotFoundError(f"File {fragments_file} does not exist")
        fragments_files_.append(fragments_file)
    fragments_files = fragments_files_

    if not overwrite and path.exists() and not reuse:
        raise FileExistsError(
            f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
        )

    # regions information
    var = pd.DataFrame(index=regions.coordinates.index)
    var["ix"] = np.arange(var.shape[0])

    # cell information
    obs = obs.copy()
    obs["ix"] = np.arange(obs.shape[0])

    if batch_column is None:
        raise ValueError("batch_column should be specified")
    if batch_column not in obs.columns:
        raise ValueError(f"Column {batch_column} not in obs")
    if obs[batch_column].dtype != "int":
        raise ValueError(f"Column {batch_column} should be an integer column")
    if not obs[batch_column].max() == len(fragments_files) - 1:
        raise ValueError(f"Column {batch_column} should contain values between 0 and {len(fragments_files) - 1}")
    cell_to_cell_ix_batches = [
        obs.loc[obs[batch_column] == batch].set_index(cell_column)["ix"] for batch in obs[batch_column].unique()
    ]

    self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

    # read the fragments file
    try:
        import pysam
    except ImportError as e:
        raise ImportError(
            "pysam is required to read fragments files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
        ) from e
    fragments_tabix_batches = [pysam.TabixFile(str(fragments_file)) for fragments_file in fragments_files]

    # process regions
    pbar = tqdm.tqdm(
        enumerate(regions.coordinates.iterrows()),
        total=regions.coordinates.shape[0],
        leave=False,
        desc="Processing fragments",
    )

    self.mapping.open_creator()
    self.coordinates.open_creator()

    mapping_processed = []
    coordinates_processed = []

    for region_ix, (region_id, region_info) in pbar:
        pbar.set_description(f"{region_id}")

        strand = region_info["strand"]
        if "tss" in region_info:
            tss = region_info["tss"]
        else:
            tss = region_info["start"]

        mapping_raw = []
        coordinates_raw = []

        for fragments_tabix, cell_to_cell_ix in zip(fragments_tabix_batches, cell_to_cell_ix_batches):
            coordinates_raw_batch, mapping_raw_batch = _fetch_fragments_region(
                fragments_tabix=fragments_tabix,
                chrom=region_info["chrom"],
                start=region_info["start"],
                end=region_info["end"],
                tss=tss,
                strand=strand,
                cell_to_cell_ix=cell_to_cell_ix,
                region_ix=region_ix,
            )
            mapping_raw.append(mapping_raw_batch)
            coordinates_raw.append(coordinates_raw_batch)

        mapping_raw = np.concatenate(mapping_raw)
        coordinates_raw = np.concatenate(coordinates_raw)

        # sort by region, coordinate (of left cut sites), and cell
        sorted_idx = np.lexsort((coordinates_raw[:, 0], mapping_raw[:, 0], mapping_raw[:, 1]))
        mapping_raw = mapping_raw[sorted_idx]
        coordinates_raw = coordinates_raw[sorted_idx]

        mapping_processed.append(mapping_raw)
        coordinates_processed.append(coordinates_raw)

        if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
            mapping_processed = []
            coordinates_processed = []

        del mapping_raw
        del coordinates_raw

    if len(mapping_processed) > 0:
        self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
        self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

    return self
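
For orientation, a minimal usage sketch of this constructor follows. This is illustrative only: the classmethod name `from_multiple_fragments_file` is an assumption (use the name shown in the signature above), and `regions` is assumed to be an existing Regions object.

import pathlib

import pandas as pd
import chromatinhd as chd

# One row per cell; both column names below are illustrative
obs = pd.DataFrame(
    {
        "cell": ["AAACCTGA-1", "AAACGGGT-1"],
        "batch": [0, 1],  # integer batch index, one per fragments file
    }
)

# Hypothetical method name; see the signature documented above
fragments = chd.data.fragments.Fragments.from_multiple_fragments_file(
    ["batch0_fragments.tsv.gz", "batch1_fragments.tsv.gz"],  # tabix-indexed
    regions=regions,  # an existing chromatinhd Regions object
    obs=obs,
    cell_column="cell",
    batch_column="batch",
    path=pathlib.Path("./fragments"),
)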

chromatinhd.data.fragments.FragmentsView

Bases: Flow

A view of fragments based on regions that are a subset of the parent fragments. In a typical use case, the parent contains fragments for all chromosomes, while the view focuses on specific regions.

Only fragments that are fully contained within a region (i.e., both left and right cut sites fall inside it) will be selected.
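
A minimal sketch of creating a view, assuming `fragments` is an existing Fragments object covering whole chromosomes and `regions` is a Regions object whose coordinates contain a "chrom" column:

import chromatinhd as chd

view = chd.data.fragments.FragmentsView.from_fragments(
    parent=fragments,  # Fragments over e.g. whole chromosomes
    regions=regions,  # the regions to focus on
    parentregion_column="chrom",  # links each region to a parent region
)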

Source code in src/chromatinhd/data/fragments/view.py
class FragmentsView(Flow):
    """
    A view of fragments based on regions that are a subset of the parent fragments. In a typical use case, the parent contains fragments for all chromosomes, while the view focuses on specific regions.

    Only fragments that are fully contained within a region (i.e., both left and right cut sites fall inside it) will be selected.
    """

    regionxcell_indptr: TensorstoreInstance = Tensorstore(
        dtype="<i8", chunks=[100000], compression="blosc", shape=(0, 2)
    )
    """Index of fragments in the parent fragments object"""

    parent: Fragments = Linked()
    """The parent fragments object from which this view is created"""

    regions: Regions = Linked()
    """The regions object"""

    parentregion_column: str = Stored()

    @classmethod
    def from_fragments(
        cls,
        parent: Fragments,
        regions: Regions,
        parentregion_column: str = "chrom",
        obs: pd.DataFrame = None,
        var: pd.DataFrame = None,
        path: PathLike = None,
        overwrite: bool = False,
    ):
        """
        Creates a fragments view from a parent fragments object and a regions object

        Parameters:
            parent:
                Parent fragments object. If a fragments view is provided, the parent of the parent will be used.
            regions:
                Regions object
            obs:
                DataFrame containing information about cells, will be copied from the fragments object if not provided
            parentregion_column:
                Column in the regions coordinates that links each new region to the regions of the original fragments. This is typically the chromosome column. This column should be present in both `parent.regions.coordinates` and `regions.coordinates`
            path:
                Path to store the fragments view
        """

        if isinstance(parent, FragmentsView):
            while isinstance(parent, FragmentsView):
                parent = parent.parent
        if not isinstance(parent, Fragments):
            raise ValueError("parent should be a Fragments object")
        if not isinstance(regions, Regions):
            raise ValueError("regions should be a Regions object", regions.__class__)

        # dummy proofing
        if parentregion_column not in regions.coordinates.columns:
            raise ValueError(
                f"Column {parentregion_column} not in regions coordinates. Available columns are {regions.coordinates.columns}"
            )
        if parentregion_column not in parent.regions.coordinates.columns:
            raise ValueError(
                f"Column {parentregion_column} not in fragments regions coordinates. Available columns are {parent.regions.coordinates.columns}"
            )
        if not (regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])).all():
            raise ValueError(
                f"Not all regions are present in the parent fragments. Missing regions: {regions.coordinates[parentregion_column][~regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])]}"
            )

        self = cls.create(
            parent=parent,
            regions=regions,
            path=path,
            parentregion_column=parentregion_column,
            obs=parent.obs if obs is None else obs,
            var=regions.coordinates if var is None else var,
            reset=overwrite,
        )

        return self

    def create_regionxcell_indptr(
        self,
        inclusive: tuple = (True, False),
        overwrite=True,
    ) -> FragmentsView:
        """
        Create index pointers (left and right) for the fragments associated with each regionxcell combination

        Parameters:
            inclusive:
                Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
            overwrite:
                Whether to overwrite the existing index pointers.

        Returns:
            Same object but with the `regionxcell_indptr` populated

        """
        if self.regionxcell_indptr.exists() and not overwrite:
            return self

        mapping = self.parent.mapping[:]
        coordinates = self.parent.coordinates[:]

        # convert regions in parent to parent region ixs
        parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

        # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
        regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

        # index pointers from parentregions to fragments
        parentregion_fragment_indptr = indices_to_indptr(mapping[:, 1], self.parent.n_regions)

        pbar = tqdm.tqdm(
            total=len(self.regions.coordinates),
            leave=False,
            desc="Processing regions",
        )

        self.regions.coordinates["ix"] = np.arange(len(self.regions))

        for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
            # extract in which parent region we need to look
            parentregion_ix = parentregion_to_parentregion_ix(parentregion)

            parentregion_start_ix, parentregion_end_ix = (
                parentregion_fragment_indptr[parentregion_ix],
                parentregion_fragment_indptr[parentregion_ix + 1],
            )

            # extract parent's mapping and coordinates
            coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]
            cellmapping_parentregion = mapping[parentregion_start_ix:parentregion_end_ix, 0]

            cell_indptr_parentregion = indices_to_indptr(cellmapping_parentregion, self.n_cells, dtype=np.int64)

            for region_id, region in subcoordinates.iterrows():
                region_ix = region["ix"]

                if pbar is not None:
                    pbar.update(1)
                    pbar.set_description(f"Processing region {region_id}")

                # extract fragments lying within the region
                # depending on the strand, we either need to include the start or the end
                region["start_inclusive"] = region["start"] + (region["strand"] == -1)
                region["end_inclusive"] = region["end"] + (region["strand"] == -1)

                if inclusive == (True, False):
                    fragments_oi = (coordinates_parentregion[:, 0] >= region["start_inclusive"]) & (
                        coordinates_parentregion[:, 0] < region["end_inclusive"]
                    )
                    fragments_excluded_left = coordinates_parentregion[:, 0] < region["start_inclusive"]
                else:
                    raise NotImplementedError("For now, only left-inclusive fragments are supported")

                n_excluded_left = np.bincount(cellmapping_parentregion[fragments_excluded_left], minlength=self.n_cells)
                n_included = np.bincount(cellmapping_parentregion[fragments_oi], minlength=self.n_cells)

                cell_indptr_left = cell_indptr_parentregion[:-1] + n_excluded_left
                cell_indptr_right = cell_indptr_parentregion[:-1] + n_excluded_left + n_included

                cell_indptr = np.stack([cell_indptr_left, cell_indptr_right], axis=1)

                regionxcell_indptr[region_ix * self.n_cells : (region_ix + 1) * self.n_cells] = (
                    cell_indptr + parentregion_start_ix
                )

        regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
            shape=(len(self.regions.coordinates) * len(self.obs), 2)
        )
        regionxcell_indptr_writer[:] = regionxcell_indptr

        return self

    def create_regionxcell_indptr2(
        self,
        inclusive: tuple = (True, False),
        overwrite=True,
    ) -> FragmentsView:
        """
        Create index pointers (left and right) for the fragments associated with each regionxcell combination. This implementation is faster if there are many samples with a lot of fragments (e.g. minibulk data).

        Parameters:
            inclusive:
                Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
            overwrite:
                Whether to overwrite the existing index pointers.

        Returns:
            Same object but with the `regionxcell_indptr` populated

        """
        if self.regionxcell_indptr.exists() and not overwrite:
            return self

        mapping = self.parent.mapping[:]
        coordinates = self.parent.coordinates[:]

        # convert regions in parent to parent region ixs
        parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

        # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
        regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

        # index pointers from parentregions to fragments
        parent_regionxcell_indptr = indices_to_indptr(
            mapping[:, 1] * self.parent.n_cells + mapping[:, 0], self.parent.n_regions * self.parent.n_cells
        )

        pbar = tqdm.tqdm(
            total=len(self.regions.coordinates) * len(self.obs),
            leave=False,
            desc="Processing regions",
        )

        self.regions.coordinates["ix"] = np.arange(len(self.regions))
        self.regions.coordinates["start_inclusive"] = self.regions.coordinates["start"] + (
            self.regions.coordinates["strand"] == -1
        )
        self.regions.coordinates["end_inclusive"] = self.regions.coordinates["end"] + (
            self.regions.coordinates["strand"] == -1
        )

        lastupdate = time.time()
        i = 0

        for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
            # extract in which parent region we need to look
            parentregion_ix = parentregion_to_parentregion_ix(parentregion)

            for cell_ix in range(self.n_cells):
                # extract the parent region coordinates per cell
                parentregionxcell_ix = parentregion_ix * self.parent.n_cells + cell_ix
                parentregion_start_ix, parentregion_end_ix = parent_regionxcell_indptr[
                    parentregionxcell_ix : parentregionxcell_ix + 2
                ]

                coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]

                for region_id, region in subcoordinates.iterrows():
                    region_ix = region["ix"]
                    regionxcell_ix = region_ix * self.n_cells + cell_ix

                    if (pbar is not None) and (time.time() - lastupdate > 1):
                        pbar.update(i + 1)
                        i = 0
                        pbar.set_description(f"Processing region {region_id} {cell_ix}")
                        lastupdate = time.time()
                    else:
                        i += 1

                    # extract fragments lying within the region
                    if inclusive == (True, False):
                        n_excluded_left = np.searchsorted(coordinates_parentregion[:, 0], region["start_inclusive"])
                        n_included = (
                            np.searchsorted(coordinates_parentregion[:, 0], region["end_inclusive"]) - n_excluded_left
                        )
                    else:
                        raise NotImplementedError("For now, only left-inclusive fragments are supported")

                    regionxcell_indptr[regionxcell_ix] = [
                        parentregion_start_ix + n_excluded_left,
                        parentregion_start_ix + n_excluded_left + n_included,
                    ]

        regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
            shape=(len(self.regions.coordinates) * len(self.obs), 2)
        )
        regionxcell_indptr_writer[:] = regionxcell_indptr

        return self

    def filter_regions(self, regions: Regions, path: PathLike = None, overwrite=True) -> Fragments:
        """
        Filter based on new regions

        Parameters:
            regions:
                Regions to filter.
        Returns:
            A new Fragments object
        """

        # check if new regions are a subset of the existing ones
        if not regions.coordinates.index.isin(self.regions.coordinates.index).all():
            raise ValueError("New regions should be a subset of the existing ones")

        # create new fragments
        fragments = FragmentsView.create(
            parent=self.parent,
            regions=regions,
            path=path,
            parentregion_column=self.parentregion_column,
            obs=self.obs,
            var=regions.coordinates,
            reset=overwrite,
        )

        return fragments

    var: pd.DataFrame = TSV()
    """DataFrame containing information about regions."""

    obs: pd.DataFrame = TSV()
    """DataFrame containing information about cells."""

    @functools.cached_property
    def n_regions(self):
        """Number of regions"""
        return self.var.shape[0]

    @functools.cached_property
    def n_cells(self):
        """Number of cells"""
        return self.obs.shape[0]

    def estimate_fragment_per_cellxregion(self) -> int:
        """
        Estimate the expected number of fragments per regionxcell combination. This is used to estimate the buffer size for loading fragments.
        """
        return math.ceil((self.regionxcell_indptr[:, 1] - self.regionxcell_indptr[:, 0]).astype(float).mean())

    @property
    def coordinates(self):
        """
        Coordinates of the fragments, equal to the parent coordinates
        """
        return self.parent.coordinates

    @property
    def mapping(self):
        """
        Mapping of the fragments, equal to the parent mapping
        """
        return self.parent.mapping

    @property
    def counts(self):
        """
        Number of fragments per region and cell
        """
        return (self.regionxcell_indptr[:, 1] - self.regionxcell_indptr[:, 0]).reshape(self.regions.n_regions, -1).T

    _cache = None

    def get_cache(self, region_oi):
        """
        Get the cache for a specific region
        """

        if self._cache is None:
            self._cache = {}

        if region_oi in self._cache:
            return self._cache[region_oi]

        region_ix = self.regions.coordinates.index.get_loc(region_oi)
        regionxcell_ixs = region_ix * self.n_cells + np.arange(self.n_cells)

        indptrs = self.regionxcell_indptr[regionxcell_ixs]

        coordinates_reader = self.parent.coordinates.open_reader()

        n = []
        i = 0
        coordinates = []
        for start, end in indptrs:
            coordinates.append(coordinates_reader[start:end])
            i += end - start
            n.append(end - start)

        coordinates = np.concatenate(coordinates)
        local_cellxregion_ix = np.repeat(np.arange(len(indptrs)), n)
        regionxcell_indptr = indices_to_indptr(local_cellxregion_ix, len(indptrs), dtype=np.int64)

        self._cache[region_oi] = {
            "regionxcell_indptr": regionxcell_indptr,
            "coordinates": coordinates,
        }

        return self._cache[region_oi]

    _libsize = None

    @property
    def libsize(self):
        if self._libsize is None:
            self._libsize = np.bincount(self.mapping[:, 0], minlength=self.n_cells)
        return self._libsize

coordinates property

Coordinates of the fragments, equal to the parent coordinates

counts property

Number of fragments per region and cell

mapping property

Mapping of the fragments, equal to the parent mapping

n_cells cached property

Number of cells

n_regions cached property

Number of regions

obs: pd.DataFrame = TSV() class-attribute instance-attribute

DataFrame containing information about cells.

parent: Fragments = Linked() class-attribute instance-attribute

The parent fragments object from which this view is created

regions: Regions = Linked() class-attribute instance-attribute

The regions object

regionxcell_indptr: TensorstoreInstance = Tensorstore(dtype='<i8', chunks=[100000], compression='blosc', shape=(0, 2)) class-attribute instance-attribute

Index of fragments in the parent fragments object

var: pd.DataFrame = TSV() class-attribute instance-attribute

DataFrame containing information about regions.

create_regionxcell_indptr(inclusive=(True, False), overwrite=True)

Create index pointers (left and right) for the fragments associated with each regionxcell combination

Parameters:

    inclusive (tuple, default (True, False)):
        Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only (True, False) is supported.
    overwrite (default True):
        Whether to overwrite the existing index pointers.

Returns:

    FragmentsView: Same object but with the regionxcell_indptr populated
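
A short sketch of how the populated pointers can be used; `region_ix` and `cell_ix` are illustrative indices:

view = view.create_regionxcell_indptr()

region_ix, cell_ix = 0, 0
start, end = view.regionxcell_indptr[region_ix * view.n_cells + cell_ix]
fragment_coordinates = view.coordinates[start:end]  # left/right cut sites from the parent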

Source code in src/chromatinhd/data/fragments/view.py
def create_regionxcell_indptr(
    self,
    inclusive: tuple = (True, False),
    overwrite=True,
) -> FragmentsView:
    """
    Create index pointers (left and right) for the fragments associated with each regionxcell combination

    Parameters:
        inclusive:
            Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
        overwrite:
            Whether to overwrite the existing index pointers.

    Returns:
        Same object but with the `regionxcell_indptr` populated

    """
    if self.regionxcell_indptr.exists() and not overwrite:
        return self

    mapping = self.parent.mapping[:]
    coordinates = self.parent.coordinates[:]

    # convert regions in parent to parent region ixs
    parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

    # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
    regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

    # index pointers from parentregions to fragments
    parentregion_fragment_indptr = indices_to_indptr(mapping[:, 1], self.parent.n_regions)

    pbar = tqdm.tqdm(
        total=len(self.regions.coordinates),
        leave=False,
        desc="Processing regions",
    )

    self.regions.coordinates["ix"] = np.arange(len(self.regions))

    for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
        # extract in which parent region we need to look
        parentregion_ix = parentregion_to_parentregion_ix(parentregion)

        parentregion_start_ix, parentregion_end_ix = (
            parentregion_fragment_indptr[parentregion_ix],
            parentregion_fragment_indptr[parentregion_ix + 1],
        )

        # extract parent's mapping and coordinates
        coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]
        cellmapping_parentregion = mapping[parentregion_start_ix:parentregion_end_ix, 0]

        cell_indptr_parentregion = indices_to_indptr(cellmapping_parentregion, self.n_cells, dtype=np.int64)

        for region_id, region in subcoordinates.iterrows():
            region_ix = region["ix"]

            if pbar is not None:
                pbar.update(1)
                pbar.set_description(f"Processing region {region_id}")

            # extract fragments lying within the region
            # depending on the strand, we either need to include the start or the end
            region["start_inclusive"] = region["start"] + (region["strand"] == -1)
            region["end_inclusive"] = region["end"] + (region["strand"] == -1)

            if inclusive == (True, False):
                fragments_oi = (coordinates_parentregion[:, 0] >= region["start_inclusive"]) & (
                    coordinates_parentregion[:, 0] < region["end_inclusive"]
                )
                fragments_excluded_left = coordinates_parentregion[:, 0] < region["start_inclusive"]
            else:
                raise NotImplementedError("For now, only left-inclusive fragments are supported")

            n_excluded_left = np.bincount(cellmapping_parentregion[fragments_excluded_left], minlength=self.n_cells)
            n_included = np.bincount(cellmapping_parentregion[fragments_oi], minlength=self.n_cells)

            cell_indptr_left = cell_indptr_parentregion[:-1] + n_excluded_left
            cell_indptr_right = cell_indptr_parentregion[:-1] + n_excluded_left + n_included

            cell_indptr = np.stack([cell_indptr_left, cell_indptr_right], axis=1)

            regionxcell_indptr[region_ix * self.n_cells : (region_ix + 1) * self.n_cells] = (
                cell_indptr + parentregion_start_ix
            )

    regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
        shape=(len(self.regions.coordinates) * len(self.obs), 2)
    )
    regionxcell_indptr_writer[:] = regionxcell_indptr

    return self

create_regionxcell_indptr2(inclusive=(True, False), overwrite=True)

Create index pointers (left and right) for the fragments associated with each regionxcell combination. This implementation is faster if there are many samples with a lot of fragments (e.g. minibulk data).

Parameters:

    inclusive (tuple, default (True, False)):
        Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only (True, False) is supported.
    overwrite (default True):
        Whether to overwrite the existing index pointers.

Returns:

    FragmentsView: Same object but with the regionxcell_indptr populated
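
To make the pointer layout concrete, a tiny numeric illustration with invented values:

import numpy as np

# 2 regions x 2 cells: one [start, end) pair per regionxcell combination,
# indexing into the parent's fragment arrays
regionxcell_indptr = np.array([[0, 5], [5, 9], [12, 20], [20, 31]])
n_fragments = regionxcell_indptr[:, 1] - regionxcell_indptr[:, 0]  # array([ 5,  4,  8, 11])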

Source code in src/chromatinhd/data/fragments/view.py
def create_regionxcell_indptr2(
    self,
    inclusive: tuple = (True, False),
    overwrite=True,
) -> FragmentsView:
    """
    Create index pointers (left and right) for the fragments associated with each regionxcell combination. This implementation is faster if there are many samples with a lot of fragments (e.g. minibulk data).

    Parameters:
        inclusive:
            Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
        overwrite:
            Whether to overwrite the existing index pointers.

    Returns:
        Same object but with the `regionxcell_indptr` populated

    """
    if self.regionxcell_indptr.exists() and not overwrite:
        return self

    mapping = self.parent.mapping[:]
    coordinates = self.parent.coordinates[:]

    # convert regions in parent to parent region ixs
    parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

    # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
    regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

    # index pointers from parentregions to fragments
    parent_regionxcell_indptr = indices_to_indptr(
        mapping[:, 1] * self.parent.n_cells + mapping[:, 0], self.parent.n_regions * self.parent.n_cells
    )

    pbar = tqdm.tqdm(
        total=len(self.regions.coordinates) * len(self.obs),
        leave=False,
        desc="Processing regions",
    )

    self.regions.coordinates["ix"] = np.arange(len(self.regions))
    self.regions.coordinates["start_inclusive"] = self.regions.coordinates["start"] + (
        self.regions.coordinates["strand"] == -1
    )
    self.regions.coordinates["end_inclusive"] = self.regions.coordinates["end"] + (
        self.regions.coordinates["strand"] == -1
    )

    lastupdate = time.time()
    i = 0

    for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
        # extract in which parent region we need to look
        parentregion_ix = parentregion_to_parentregion_ix(parentregion)

        for cell_ix in range(self.n_cells):
            # extract the parent region coordinates per cell
            parentregionxcell_ix = parentregion_ix * self.parent.n_cells + cell_ix
            parentregion_start_ix, parentregion_end_ix = parent_regionxcell_indptr[
                parentregionxcell_ix : parentregionxcell_ix + 2
            ]

            coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]

            for region_id, region in subcoordinates.iterrows():
                region_ix = region["ix"]
                regionxcell_ix = region_ix * self.n_cells + cell_ix

                if (pbar is not None) and (time.time() - lastupdate > 1):
                    pbar.update(i + 1)
                    i = 0
                    pbar.set_description(f"Processing region {region_id} {cell_ix}")
                    lastupdate = time.time()
                else:
                    i += 1

                # extract fragments lying within the region
                if inclusive == (True, False):
                    n_excluded_left = np.searchsorted(coordinates_parentregion[:, 0], region["start_inclusive"])
                    n_included = (
                        np.searchsorted(coordinates_parentregion[:, 0], region["end_inclusive"]) - n_excluded_left
                    )
                else:
                    raise NotImplementedError("For now, only left-inclusive fragments are supported")

                regionxcell_indptr[regionxcell_ix] = [
                    parentregion_start_ix + n_excluded_left,
                    parentregion_start_ix + n_excluded_left + n_included,
                ]

    regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
        shape=(len(self.regions.coordinates) * len(self.obs), 2)
    )
    regionxcell_indptr_writer[:] = regionxcell_indptr

    return self

estimate_fragment_per_cellxregion()

Estimate the expected number of fragments per regionxcell combination. This is used to estimate the buffer size for loading fragments.
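
For example (a sketch; the value is the rounded-up mean fragment count over all regionxcell combinations):

mean_fragments_per_regionxcell = view.estimate_fragment_per_cellxregion()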

Source code in src/chromatinhd/data/fragments/view.py
def estimate_fragment_per_cellxregion(self) -> int:
    """
    Estimate the expected number of fragments per regionxcell combination. This is used to estimate the buffer size for loading fragments.
    """
    return math.ceil((self.regionxcell_indptr[:, 1] - self.regionxcell_indptr[:, 0]).astype(float).mean())

filter_regions(regions, path=None, overwrite=True)

Filter based on new regions

Parameters:

    regions (Regions, required):
        Regions to filter.

Returns:

    A new Fragments object
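
A minimal sketch, assuming `subset_regions` is a Regions object whose index is a subset of the view's regions:

subview = view.filter_regions(subset_regions, path="fragments_subset")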

Source code in src/chromatinhd/data/fragments/view.py
def filter_regions(self, regions: Regions, path: PathLike = None, overwrite=True) -> Fragments:
    """
    Filter based on new regions

    Parameters:
        regions:
            Regions to filter.
    Returns:
        A new Fragments object
    """

    # check if new regions are a subset of the existing ones
    if not regions.coordinates.index.isin(self.regions.coordinates.index).all():
        raise ValueError("New regions should be a subset of the existing ones")

    # create new fragments
    fragments = FragmentsView.create(
        parent=self.parent,
        regions=regions,
        path=path,
        parentregion_column=self.parentregion_column,
        obs=self.obs,
        var=regions.coordinates,
        reset=overwrite,
    )

    return fragments

from_fragments(parent, regions, parentregion_column='chrom', obs=None, var=None, path=None, overwrite=False) classmethod

Creates a fragments view from a parent fragments object and a regions object

Parameters:

    parent (Fragments, required):
        Parent fragments object. If a fragments view is provided, the parent of the parent will be used.
    regions (Regions, required):
        Regions object
    obs (DataFrame, default None):
        DataFrame containing information about cells, will be copied from the fragments object if not provided
    parentregion_column (str, default 'chrom'):
        Column in the regions coordinates that links each new region to the regions of the original fragments. This is typically the chromosome column. This column should be present in both parent.regions.coordinates and regions.coordinates
    path (PathLike, default None):
        Path to store the fragments view
Source code in src/chromatinhd/data/fragments/view.py
@classmethod
def from_fragments(
    cls,
    parent: Fragments,
    regions: Regions,
    parentregion_column: str = "chrom",
    obs: pd.DataFrame = None,
    var: pd.DataFrame = None,
    path: PathLike = None,
    overwrite: bool = False,
):
    """
    Creates a fragments view from a parent fragments object and a regions object

    Parameters:
        parent:
            Parent fragments object. If a fragments view is provided, the parent of the parent will be used.
        regions:
            Regions object
        obs:
            DataFrame containing information about cells, will be copied from the fragments object if not provided
        parentregion_column:
            Column in the regions coordinates that links each new region to the regions of the original fragments. This is typically the chromosome column. This column should be present in both `parent.regions.coordinates` and `regions.coordinates`
        path:
            Path to store the fragments view
    """

    if isinstance(parent, FragmentsView):
        while isinstance(parent, FragmentsView):
            parent = parent.parent
    if not isinstance(parent, Fragments):
        raise ValueError("parent should be a Fragments object")
    if not isinstance(regions, Regions):
        raise ValueError("regions should be a Regions object", regions.__class__)

    # dummy proofing
    if parentregion_column not in regions.coordinates.columns:
        raise ValueError(
            f"Column {parentregion_column} not in regions coordinates. Available columns are {regions.coordinates.columns}"
        )
    if parentregion_column not in parent.regions.coordinates.columns:
        raise ValueError(
            f"Column {parentregion_column} not in fragments regions coordinates. Available columns are {parent.regions.coordinates.columns}"
        )
    if not (regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])).all():
        raise ValueError(
            f"Not all regions are present in the parent fragments. Missing regions: {regions.coordinates[parentregion_column][~regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])]}"
        )

    self = cls.create(
        parent=parent,
        regions=regions,
        path=path,
        parentregion_column=parentregion_column,
        obs=parent.obs if obs is None else obs,
        var=regions.coordinates if var is None else var,
        reset=overwrite,
    )

    return self

get_cache(region_oi)

Get the cache for a specific region

Source code in src/chromatinhd/data/fragments/view.py
def get_cache(self, region_oi):
    """
    Get the cache for a specific region
    """

    if self._cache is None:
        self._cache = {}

    if region_oi in self._cache:
        return self._cache[region_oi]

    region_ix = self.regions.coordinates.index.get_loc(region_oi)
    regionxcell_ixs = region_ix * self.n_cells + np.arange(self.n_cells)

    indptrs = self.regionxcell_indptr[regionxcell_ixs]

    coordinates_reader = self.parent.coordinates.open_reader()

    n = []
    i = 0
    coordinates = []
    for start, end in indptrs:
        coordinates.append(coordinates_reader[start:end])
        i += end - start
        n.append(end - start)

    coordinates = np.concatenate(coordinates)
    local_cellxregion_ix = np.repeat(np.arange(len(indptrs)), n)
    regionxcell_indptr = indices_to_indptr(local_cellxregion_ix, len(indptrs), dtype=np.int64)

    self._cache[region_oi] = {
        "regionxcell_indptr": regionxcell_indptr,
        "coordinates": coordinates,
    }

    return self._cache[region_oi]