Fragments

chromatinhd.data.fragments.Fragments

Bases: Flow

Fragments positioned within regions. Fragments are sorted by region, then by cell, and then by position within the region (left cut site).

The object can also store several precalculated tensors that are used for efficient loading of fragments. See create_regionxcell_indptr for more information.
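A typical workflow creates the object from a fragments file and then precomputes these pointers. A minimal sketch, assuming a CellRanger-style fragments.tsv.gz and an existing `Regions` object (the paths and barcodes below are hypothetical):

import pandas as pd
import chromatinhd as chd

# cell metadata; the index must match the barcodes in the fragments file
obs = pd.DataFrame(index=["AAACCC-1", "AAAGGG-1"])

fragments = chd.data.fragments.Fragments.from_fragments_tsv(
    "fragments.tsv.gz",
    regions=regions,  # an existing chd.data.Regions object, created elsewhere
    obs=obs,
    path="dataset/fragments",
)
fragments.create_regionxcell_indptr()  # precompute region x cell pointers for fast loading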

Source code in src/chromatinhd/data/fragments/fragments.py
class Fragments(Flow):
    """
    Fragments positioned within regions. Fragments are sorted by region, then by cell, and then by position within the region (left cut site).

    The object can also store several precalculated tensors that are used for efficient loading of fragments. See create_regionxcell_indptr for more information.
    """

    regions: Regions = Linked()
    """The regions in which (part of) the fragments are located and centered."""

    coordinates: TensorstoreInstance = Tensorstore(dtype="<i4", chunks=[100000, 2], compression=None, shape=[0, 2])
    """Coordinates of the two cut sites."""

    mapping: TensorstoreInstance = Tensorstore(dtype="<i4", chunks=[100000, 2], compression="blosc", shape=[0, 2])
    """Mapping of a fragment to a cell (first column) and a region (second column)"""

    regionxcell_indptr: np.ndarray = Tensorstore(dtype="<i8", chunks=[100000], compression="blosc")
    """Index pointers to the regionxcell fragment positions"""

    def create_regionxcell_indptr(self, overwrite=False) -> Fragments:
        """
        Creates pointers to each individual region x cell combination from the mapping tensor.

        Returns:
            The same object with `regionxcell_indptr` populated
        """

        if self.o.regionxcell_indptr.exists(self) and not overwrite:
            return self

        regionxcell_ix = self.mapping[:, 1] * self.n_cells + self.mapping[:, 0]

        if not (np.diff(regionxcell_ix) >= 0).all():
            raise ValueError("Fragments should be ordered by regionxcell (ascending)")

        if not self.mapping[:, 0].max() < self.n_cells:
            raise ValueError("First column of mapping should be smaller than the number of cells")

        n_regionxcell = self.n_regions * self.n_cells
        regionxcell_indptr = np.pad(np.cumsum(np.bincount(regionxcell_ix, minlength=n_regionxcell), 0), (1, 0))
        assert self.coordinates.shape[0] == regionxcell_indptr[-1]
        if not (np.diff(regionxcell_indptr) >= 0).all():
            raise ValueError("Fragments should be ordered by regionxcell (ascending)")
        self.regionxcell_indptr[:] = regionxcell_indptr

        return self

    var: pd.DataFrame = TSV()
    """DataFrame containing information about regions."""

    obs: pd.DataFrame = TSV()
    """DataFrame containing information about cells."""

    @functools.cached_property
    def n_regions(self):
        """Number of regions"""
        return self.var.shape[0]

    @functools.cached_property
    def n_cells(self):
        """Number of cells"""
        return self.obs.shape[0]

    @property
    def local_cellxregion_ix(self):
        """Cell-major (cell x region) index of each fragment."""
        return self.mapping[:, 0] * self.n_regions + self.mapping[:, 1]

    def estimate_fragment_per_cellxregion(self) -> int:
        """Estimated average number of fragments per cell x region combination, rounded up."""
        return math.ceil(self.coordinates.shape[0] / self.n_cells / self.n_regions)

    @class_or_instancemethod
    def from_fragments_tsv(
        cls,
        fragments_file: PathLike,
        regions: Regions,
        obs: pd.DataFrame,
        cell_column: str = None,
        path: PathLike = None,
        overwrite: bool = False,
        reuse: bool = True,
        batch_size: int = 1_000_000,
    ) -> Fragments:
        """
        Create a Fragments object from a fragments tsv file

        Parameters:
            fragments_file:
                Location of the fragments tab-separated file created by e.g. CellRanger or sinto
            obs:
                DataFrame containing information about cells.
                The index should be the cell names as present in the fragments file.
                Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
            regions:
                Regions from which the fragments will be extracted.
            cell_column:
                Column name in the `obs` DataFrame containing the cell names.
                If None, the index of the `obs` DataFrame is used.
            path:
                Folder in which the fragments data will be stored.
            overwrite:
                Whether to overwrite the data if it already exists.
            reuse:
                Whether to reuse existing data if it exists.
            batch_size:
                Number of fragments to process before saving. Lower this number if you run out of memory.
        Returns:
            A new Fragments object
        """

        if isinstance(fragments_file, str):
            fragments_file = pathlib.Path(fragments_file)
        if isinstance(path, str):
            path = pathlib.Path(path)
        if not fragments_file.exists():
            raise FileNotFoundError(f"File {fragments_file} does not exist")
        if path is not None and not overwrite and path.exists() and not reuse:
            raise FileExistsError(
                f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
            )

        # regions information
        var = pd.DataFrame(index=regions.coordinates.index)
        var["ix"] = np.arange(var.shape[0])

        # cell information
        obs = obs.copy()
        obs["ix"] = np.arange(obs.shape[0])
        if cell_column is None:
            cell_to_cell_ix = obs["ix"].to_dict()
        else:
            cell_to_cell_ix = obs.set_index(cell_column)["ix"].to_dict()

        self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

        # read the fragments file
        try:
            import pysam
        except ImportError as e:
            raise ImportError(
                "pysam is required to read fragments files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
            ) from e
        fragments_tabix = pysam.TabixFile(str(fragments_file))

        # process regions
        pbar = tqdm.tqdm(
            enumerate(regions.coordinates.iterrows()),
            total=regions.coordinates.shape[0],
            leave=False,
            desc="Processing fragments",
        )

        self.mapping.open_creator()
        self.coordinates.open_creator()

        mapping_processed = []
        coordinates_processed = []

        for region_ix, (region_id, region_info) in pbar:
            pbar.set_description(f"{region_id}")

            strand = region_info["strand"]
            if "tss" in region_info:
                tss = region_info["tss"]
            else:
                tss = region_info["start"]

            coordinates_raw, mapping_raw = _fetch_fragments_region(
                fragments_tabix=fragments_tabix,
                chrom=region_info["chrom"],
                start=region_info["start"],
                end=region_info["end"],
                tss=tss,
                strand=strand,
                cell_to_cell_ix=cell_to_cell_ix,
                region_ix=region_ix,
            )

            mapping_processed.append(mapping_raw)
            coordinates_processed.append(coordinates_raw)

            if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
                self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
                self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
                mapping_processed = []
                coordinates_processed = []

            del mapping_raw
            del coordinates_raw

        if len(mapping_processed) > 0:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

        return self

    @class_or_instancemethod
    def from_multiple_fragments_tsv(
        cls,
        fragments_files: list[PathLike],
        regions: Regions,
        obs: pd.DataFrame,
        cell_column: str = "cell_original",
        batch_column: str = "batch",
        path: PathLike = None,
        overwrite: bool = False,
        reuse: bool = True,
        batch_size: int = 1_000_000,
    ) -> Fragments:
        """
        Create a Fragments object from multiple fragments tsv files

        Parameters:
            fragments_files:
                Locations of the fragments tab-separated files created by e.g. CellRanger or sinto, one per batch
            obs:
                DataFrame containing information about cells.
                The index should be the cell names as present in the fragments file.
                Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
            regions:
                Regions from which the fragments will be extracted.
            cell_column:
                Column name in the `obs` DataFrame containing the cell names.
            batch_column:
                Column name in the `obs` DataFrame containing the batch index of each cell.
                Batch index i corresponds to the i-th file in `fragments_files`.
            path:
                Folder in which the fragments data will be stored.
            overwrite:
                Whether to overwrite the data if it already exists.
            reuse:
                Whether to reuse existing data if it exists.
            batch_size:
                Number of fragments to process before saving. Lower this number if you run out of memory.
        Returns:
            A new Fragments object
        """

        if not isinstance(fragments_files, (list, tuple)):
            fragments_files = [fragments_files]

        fragments_files_ = []
        for fragments_file in fragments_files:
            if isinstance(fragments_file, str):
                fragments_file = pathlib.Path(fragments_file)
            if not fragments_file.exists():
                raise FileNotFoundError(f"File {fragments_file} does not exist")
            fragments_files_.append(fragments_file)
        fragments_files = fragments_files_

        if isinstance(path, str):
            path = pathlib.Path(path)
        if path is not None and not overwrite and path.exists() and not reuse:
            raise FileExistsError(
                f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
            )

        # regions information
        var = pd.DataFrame(index=regions.coordinates.index)
        var["ix"] = np.arange(var.shape[0])

        # cell information
        obs = obs.copy()
        obs["ix"] = np.arange(obs.shape[0])

        if batch_column is None:
            raise ValueError("batch_column should be specified")
        if batch_column not in obs.columns:
            raise ValueError(f"Column {batch_column} not in obs")
        if obs[batch_column].dtype != "int":
            raise ValueError(f"Column {batch_column} should be an integer column")
        if not obs[batch_column].max() == len(fragments_files) - 1:
            raise ValueError(f"Column {batch_column} should contain values between 0 and {len(fragments_files) - 1}")
        cell_to_cell_ix_batches = [
            obs.loc[obs[batch_column] == batch].set_index(cell_column)["ix"]
            for batch in range(len(fragments_files))
        ]

        self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

        # read the fragments file
        try:
            import pysam
        except ImportError as e:
            raise ImportError(
                "pysam is required to read fragments files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
            ) from e
        fragments_tabix_batches = [pysam.TabixFile(str(fragments_file)) for fragments_file in fragments_files]

        # process regions
        pbar = tqdm.tqdm(
            enumerate(regions.coordinates.iterrows()),
            total=regions.coordinates.shape[0],
            leave=False,
            desc="Processing fragments",
        )

        self.mapping.open_creator()
        self.coordinates.open_creator()

        mapping_processed = []
        coordinates_processed = []

        for region_ix, (region_id, region_info) in pbar:
            pbar.set_description(f"{region_id}")

            strand = region_info["strand"]
            if "tss" in region_info:
                tss = region_info["tss"]
            else:
                tss = region_info["start"]

            mapping_raw = []
            coordinates_raw = []

            for fragments_tabix, cell_to_cell_ix in zip(fragments_tabix_batches, cell_to_cell_ix_batches):
                coordinates_raw_batch, mapping_raw_batch = _fetch_fragments_region(
                    fragments_tabix=fragments_tabix,
                    chrom=region_info["chrom"],
                    start=region_info["start"],
                    end=region_info["end"],
                    tss=tss,
                    strand=strand,
                    cell_to_cell_ix=cell_to_cell_ix,
                    region_ix=region_ix,
                )
                mapping_raw.append(mapping_raw_batch)
                coordinates_raw.append(coordinates_raw_batch)

            mapping_raw = np.concatenate(mapping_raw)
            coordinates_raw = np.concatenate(coordinates_raw)

            # sort by region, then cell, then left cut site coordinate
            sorted_idx = np.lexsort((coordinates_raw[:, 0], mapping_raw[:, 0], mapping_raw[:, 1]))
            mapping_raw = mapping_raw[sorted_idx]
            coordinates_raw = coordinates_raw[sorted_idx]

            mapping_processed.append(mapping_raw)
            coordinates_processed.append(coordinates_raw)

            if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
                self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
                self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
                mapping_processed = []
                coordinates_processed = []

            del mapping_raw
            del coordinates_raw

        if len(mapping_processed) > 0:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

        return self

    @class_or_instancemethod
    def from_alignments(
        cls,
        obs: pd.DataFrame,
        regions: Regions,
        file_column: str = "path",
        alignment_column: str = None,
        remove_duplicates: bool = None,
        path: PathLike = None,
        overwrite: bool = False,
        reuse: bool = True,
        batch_size: int = 10_000_000,
        paired: bool = True,
    ) -> Fragments:
        """
        Create a Fragments object from multiple alignment (bam/sam) files, each pertaining to a single cell or sample

        Parameters:
            obs:
                DataFrame containing information about cells/samples.
                The index will be used as the name of the cell for future reference.
                The DataFrame should contain a column with the path to the alignment file (specified in the file_column) or a column with the alignment object itself (specified in the alignment_column).
                Any additional data about cells/samples can be stored in this dataframe as well.
            regions:
                Regions from which the fragments will be extracted.
            file_column:
                Column name in the `obs` DataFrame containing the path to the alignment file.
            alignment_column:
                Column name in the `obs` DataFrame containing the alignment object.
                If None, the alignment object will be loaded using the `file_column`.
            remove_duplicates:
                Whether to remove duplicate fragments within a sample or cell. This is commonly done for single-cell data, but not necessarily for bulk data. If the data is paired, duplicates will be removed by default. If the data is single-end, duplicates will not be removed by default.
            path:
                Folder in which the fragments data will be stored.
            overwrite:
                Whether to overwrite the data if it already exists.
            reuse:
                Whether to reuse existing data if it exists.
            batch_size:
                Number of fragments to process before saving. Lower this number if you run out of memory. Increase the number if you want to speed up the process, particularly if disk I/O is slow.
            paired:
                Whether the reads are paired-end or single-end. If paired, the coordinates of the two cut sites will be stored. If single-end, only the coordinate of one cut site will be stored. Note that this also affects the default value of `remove_duplicates`.
        """
        # regions information
        var = pd.DataFrame(index=regions.coordinates.index)
        var["ix"] = np.arange(var.shape[0])

        # check path and overwrite
        if path is not None:
            if isinstance(path, str):
                path = pathlib.Path(path)
            if not overwrite and path.exists() and not reuse:
                raise FileExistsError(
                    f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
                )
        if overwrite:
            reuse = False

        self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

        if reuse:
            return self

        # load alignment files
        try:
            import pysam
        except ImportError as e:
            raise ImportError(
                "pysam is required to read alignment files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
            ) from e

        if alignment_column is None:
            if file_column not in obs.columns:
                raise ValueError(f"Column {file_column} not in obs")
            alignments = {}
            for cell, cell_info in obs.iterrows():
                alignments[cell] = pysam.Samfile(cell_info[file_column], "rb")
        else:
            if alignment_column not in obs.columns:
                raise ValueError(f"Column {alignment_column} not in obs")
            alignments = obs[alignment_column].to_dict()

        # process regions
        pbar = tqdm.tqdm(
            enumerate(regions.coordinates.iterrows()),
            total=regions.coordinates.shape[0],
            leave=False,
            desc="Processing fragments",
        )

        self.mapping.open_creator()
        self.coordinates.open_creator()

        mapping_processed = []
        coordinates_processed = []

        for region_ix, (region_id, region_info) in pbar:
            pbar.set_description(f"{region_id}")

            chrom = region_info["chrom"]
            start = region_info["start"]
            end = region_info["end"]
            strand = region_info["strand"]
            if "tss" in region_info:
                tss = region_info["tss"]
            else:
                tss = region_info["start"]

            # process cell/sample
            for cell_ix, (cell_id, alignment) in enumerate(alignments.items()):
                if paired:
                    coordinates_raw = _process_paired(
                        alignment=alignment,
                        chrom=chrom,
                        start=start,
                        end=end,
                        remove_duplicates=True if remove_duplicates is None else remove_duplicates,
                    )
                    coordinates_raw = (np.array(coordinates_raw).reshape(-1, 2).astype(np.int32) - tss) * strand
                else:
                    coordinates_raw = _process_single(
                        alignment=alignment,
                        chrom=chrom,
                        start=start,
                        end=end,
                        remove_duplicates=False if remove_duplicates is None else remove_duplicates,
                    )
                    coordinates_raw = (np.array(coordinates_raw).reshape(-1, 1).astype(np.int32) - tss) * strand

                mapping_raw = np.stack(
                    [
                        np.repeat(cell_ix, len(coordinates_raw)),
                        np.repeat(region_ix, len(coordinates_raw)),
                    ],
                    axis=1,
                )

                # sort by region, then cell, then left cut site coordinate
                sorted_idx = np.lexsort((coordinates_raw[:, 0], mapping_raw[:, 0], mapping_raw[:, 1]))
                mapping_raw = mapping_raw[sorted_idx]
                coordinates_raw = coordinates_raw[sorted_idx]

                if len(mapping_raw) > 0:
                    mapping_processed.append(mapping_raw)
                    coordinates_processed.append(coordinates_raw)

                if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
                    self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
                    self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
                    mapping_processed = []
                    coordinates_processed = []

                del mapping_raw
                del coordinates_raw

        # add final fragments
        if len(mapping_processed) > 0:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

        return self

    def filter_regions(self, regions: Regions, path: PathLike = None, overwrite=True) -> Fragments:
        """
        Filter based on new regions

        Parameters:
            regions:
                Regions to keep; must be a subset of the existing regions.
        Returns:
            A new Fragments object
        """

        # check if new regions are a subset of the existing ones
        if not regions.coordinates.index.isin(self.regions.coordinates.index).all():
            raise ValueError("New regions should be a subset of the existing ones")

        # filter region info
        self.regions.coordinates["ix"] = np.arange(self.regions.coordinates.shape[0])
        regions.coordinates["ix"] = self.regions.coordinates["ix"].loc[regions.coordinates.index]

        var = self.regions.coordinates.copy()
        var["original_ix"] = np.arange(var.shape[0])
        var = var.loc[regions.coordinates.index].copy()
        var["ix"] = np.arange(var.shape[0])

        # filter coordinates/mapping
        fragments_oi = np.isin(self.mapping[:, 1], regions.coordinates["ix"])

        mapping = self.mapping[fragments_oi].copy()
        mapping[:, 1] = var.set_index("original_ix").loc[mapping[:, 1], "ix"].values
        coordinates = self.coordinates[fragments_oi].copy()

        # Sort `coordinates` and `mapping` according to `mapping`
        sorted_idx = np.lexsort((coordinates[:, 0], mapping[:, 0], mapping[:, 1]))
        mapping = mapping[sorted_idx]
        coordinates = coordinates[sorted_idx]

        fragments = Fragments.create(
            coordinates=coordinates, mapping=mapping, regions=regions, var=var, obs=self.obs, path=path, reset=overwrite
        )

        return fragments

    def filter_cells(self, cells, path: PathLike = None) -> Fragments:
        """
        Filter based on new cells

        Parameters:
            cells:
                Cells to keep; must be a subset of the existing cells.
        Returns:
            A new Fragments object
        """

        # check if new cells are a subset of the existing ones
        if not pd.Series(cells).isin(self.obs.index).all():
            raise ValueError("New cells should be a subset of the existing ones")

        # filter region info
        self.obs["ix"] = np.arange(self.obs.shape[0])
        obs = self.obs.loc[cells].copy()
        obs["original_ix"] = self.obs["ix"].loc[obs.index]
        obs["ix"] = np.arange(obs.shape[0])

        # filter coordinates/mapping
        fragments_oi = np.isin(self.mapping[:, 0], obs["original_ix"])

        mapping = self.mapping[fragments_oi].copy()
        mapping[:, 0] = obs.set_index("original_ix").loc[mapping[:, 0], "ix"].values
        coordinates = self.coordinates[fragments_oi]

        # Sort `coordinates` and `mapping` according to `mapping`
        sorted_idx = np.lexsort((coordinates[:, 0], mapping[:, 0], mapping[:, 1]))
        mapping = mapping[sorted_idx]
        coordinates = coordinates[sorted_idx]

        return Fragments.create(
            coordinates=coordinates, mapping=mapping, regions=self.regions, var=self.var, obs=obs, path=path
        )

    @property
    def counts(self):
        """
        Counts of fragments per cell x region
        """
        cellxregion_ix = self.mapping[:, 0] * self.n_regions + self.mapping[:, 1]
        counts = np.bincount(cellxregion_ix, minlength=self.n_cells * self.n_regions).reshape(
            self.n_cells, self.n_regions
        )
        return counts

    @property
    def nonzero(self):
        return self.counts > 0

    _single_region_cache = None

    def get_cache(self, region_oi):
        """Build (or return a cached copy of) the `regionxcell_indptr` and `coordinates` for a single region."""
        if self._single_region_cache is None:
            self._single_region_cache = {}
        if region_oi not in self._single_region_cache:
            region = self.var.index.get_loc(region_oi)
            regionxcell_indptr = self.regionxcell_indptr.open_reader(
                {"context": {"data_copy_concurrency": {"limit": 1}}}
            )[region * self.n_cells : (region + 1) * self.n_cells + 1]
            coordinates = self.coordinates.open_reader(
                {
                    "context": {
                        "data_copy_concurrency": {"limit": 1},
                    }
                }
            )[regionxcell_indptr[0] : regionxcell_indptr[-1]]
            regionxcell_indptr = regionxcell_indptr - regionxcell_indptr[0]

            self._single_region_cache[region_oi] = {
                "regionxcell_indptr": regionxcell_indptr,
                "coordinates": coordinates,
            }
        return self._single_region_cache[region_oi]

    _libsize = None

    @property
    def libsize(self):
        """Total number of fragments per cell (library size)."""
        if self._libsize is None:
            self._libsize = np.bincount(self.mapping[:, 0], minlength=self.n_cells)
        return self._libsize

coordinates: TensorstoreInstance = Tensorstore(dtype='<i4', chunks=[100000, 2], compression=None, shape=[0, 2])

Coordinates of the two cut sites.

counts (property)

Counts of fragments per cell x region.

mapping: TensorstoreInstance = Tensorstore(dtype='<i4', chunks=[100000, 2], compression='blosc', shape=[0, 2])

Mapping of a fragment to a cell (first column) and a region (second column).

n_cells (cached property)

Number of cells.

n_regions (cached property)

Number of regions.

obs: pd.DataFrame = TSV()

DataFrame containing information about cells.

regions: Regions = Linked()

The regions in which (part of) the fragments are located and centered.

regionxcell_indptr: np.ndarray = Tensorstore(dtype='<i8', chunks=[100000], compression='blosc')

Index pointers to the regionxcell fragment positions.

var: pd.DataFrame = TSV()

DataFrame containing information about regions.
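The `mapping` and `coordinates` tensors are row-aligned: row i of both describes the same fragment. A small illustrative sketch, assuming `fragments` is an existing Fragments object:

import numpy as np

mapping = fragments.mapping[:]          # (n_fragments, 2): cell ix, region ix
coordinates = fragments.coordinates[:]  # (n_fragments, 2): left and right cut site

# all fragments of cell 0 within region 3, with their region-centered cut sites
mask = (mapping[:, 0] == 0) & (mapping[:, 1] == 3)
print(coordinates[mask])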

create_regionxcell_indptr(overwrite=False)

Creates pointers to each individual region x cell combination from the mapping tensor.

Returns:
    Fragments: The same object with `regionxcell_indptr` populated.
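The populated array follows the CSR-style indptr convention: the fragments of the combination with flat index k = region_ix * n_cells + cell_ix occupy rows indptr[k]:indptr[k + 1]. A short sketch of how the pointers can be used once created:

import numpy as np

region_ix, cell_ix = 3, 0
k = region_ix * fragments.n_cells + cell_ix

indptr = fragments.regionxcell_indptr[:]
coords = fragments.coordinates[indptr[k] : indptr[k + 1]]  # cut sites of this region x cell

counts = np.diff(indptr)  # fragment count per region x cell combination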

filter_cells(cells, path=None)

Filter based on new cells.

Parameters:
    cells:
        Cells to keep; must be a subset of the existing cells.

Returns:
    A new Fragments object.

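A hedged usage sketch; the cell ids are hypothetical and must be present in `fragments.obs.index`:

cells_oi = ["AAACCC-1", "AAAGGG-1"]
fragments_subset = fragments.filter_cells(cells_oi, path="dataset/fragments_cells_subset")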

filter_regions(regions, path=None, overwrite=True)

Filter based on new regions.

Parameters:
    regions (Regions):
        Regions to keep; must be a subset of the existing regions.

Returns:
    A new Fragments object.

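A hedged usage sketch, assuming `regions_subset` is a Regions object covering a subset of `fragments.regions` (created elsewhere):

fragments_subset = fragments.filter_regions(regions_subset, path="dataset/fragments_regions_subset")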

from_alignments(obs, regions, file_column='path', alignment_column=None, remove_duplicates=None, path=None, overwrite=False, reuse=True, batch_size=10_000_000, paired=True)

Create a Fragments object from multiple alignment (bam/sam) files, each pertaining to a single cell or sample.

Parameters:
    obs (DataFrame, required):
        DataFrame containing information about cells/samples. The index will be used as the name of the cell for future reference. The DataFrame should contain a column with the path to the alignment file (specified in `file_column`) or a column with the alignment object itself (specified in `alignment_column`). Any additional data about cells/samples can be stored in this dataframe as well.
    regions (Regions, required):
        Regions from which the fragments will be extracted.
    file_column (str, default 'path'):
        Column name in the `obs` DataFrame containing the path to the alignment file.
    alignment_column (str, default None):
        Column name in the `obs` DataFrame containing the alignment object. If None, the alignment object will be loaded using the `file_column`.
    remove_duplicates (bool, default None):
        Whether to remove duplicate fragments within a sample or cell. This is commonly done for single-cell data, but not necessarily for bulk data. If the data is paired-end, duplicates are removed by default; if single-end, they are kept by default.
    path (PathLike, default None):
        Folder in which the fragments data will be stored.
    overwrite (bool, default False):
        Whether to overwrite the data if it already exists.
    reuse (bool, default True):
        Whether to reuse existing data if it exists.
    batch_size (int, default 10_000_000):
        Number of fragments to process before saving. Lower this number if you run out of memory; increase it to speed up processing, particularly if disk I/O is slow.
    paired (bool, default True):
        Whether the reads are paired-end or single-end. If paired, the coordinates of the two cut sites are stored; if single-end, only one cut site is stored. This also affects the default value of `remove_duplicates`.
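A hedged sketch for sample-level BAM files; the file names and the `regions` object are illustrative assumptions:

import pandas as pd
import chromatinhd as chd

obs = pd.DataFrame(
    {"path": ["sample_a.bam", "sample_b.bam"]},  # one alignment file per sample (hypothetical)
    index=["sample_a", "sample_b"],
)

fragments = chd.data.fragments.Fragments.from_alignments(
    obs,
    regions=regions,  # an existing chd.data.Regions object
    path="dataset/fragments_bulk",
    paired=True,      # paired-end: duplicates are removed by default
)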

from_fragments_tsv(fragments_file, regions, obs, cell_column=None, path=None, overwrite=False, reuse=True, batch_size=1_000_000)

Create a Fragments object from a fragments tsv file.

Parameters:
    fragments_file (PathLike, required):
        Location of the fragments tab-separated file created by e.g. CellRanger or sinto.
    obs (DataFrame, required):
        DataFrame containing information about cells. The index should be the cell names as present in the fragments file. Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
    regions (Regions, required):
        Regions from which the fragments will be extracted.
    cell_column (str, default None):
        Column name in the `obs` DataFrame containing the cell names. If None, the index of the `obs` DataFrame is used.
    path (PathLike, default None):
        Folder in which the fragments data will be stored.
    overwrite (bool, default False):
        Whether to overwrite the data if it already exists.
    reuse (bool, default True):
        Whether to reuse existing data if it exists.
    batch_size (int, default 1_000_000):
        Number of fragments to process before saving. Lower this number if you run out of memory.

Returns:
    A new Fragments object.

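A hedged sketch showing the `cell_column` variant; the barcodes and file name are hypothetical:

import pandas as pd
import chromatinhd as chd

obs = pd.DataFrame({"barcode": ["AAACCC-1", "AAAGGG-1"]}, index=["cell_0", "cell_1"])

fragments = chd.data.fragments.Fragments.from_fragments_tsv(
    "fragments.tsv.gz",
    regions=regions,        # an existing chd.data.Regions object
    obs=obs,
    cell_column="barcode",  # barcodes as they appear in the fragments file
    path="dataset/fragments",
)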

from_multiple_fragments_tsv(fragments_files, regions, obs, cell_column='cell_original', batch_column='batch', path=None, overwrite=False, reuse=True, batch_size=1_000_000)

Create a Fragments object from multiple fragments tsv files.

Parameters:
    fragments_files (list[PathLike], required):
        Locations of the fragments tab-separated files created by e.g. CellRanger or sinto, one per batch.
    obs (DataFrame, required):
        DataFrame containing information about cells. The index should be the cell names as present in the fragments files. Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
    regions (Regions, required):
        Regions from which the fragments will be extracted.
    cell_column (str, default 'cell_original'):
        Column name in the `obs` DataFrame containing the cell names.
    batch_column (str, default 'batch'):
        Column name in the `obs` DataFrame containing the batch index of each cell; batch index i corresponds to the i-th file in `fragments_files`.
    path (PathLike, default None):
        Folder in which the fragments data will be stored.
    overwrite (bool, default False):
        Whether to overwrite the data if it already exists.
    reuse (bool, default True):
        Whether to reuse existing data if it exists.
    batch_size (int, default 1_000_000):
        Number of fragments to process before saving. Lower this number if you run out of memory.

Returns:
    A new Fragments object.

Source code in src/chromatinhd/data/fragments/fragments.py
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
@class_or_instancemethod
def from_multiple_fragments_tsv(
    cls,
    fragments_files: [PathLike],
    regions: Regions,
    obs: pd.DataFrame,
    cell_column: str = "cell_original",
    batch_column: str = "batch",
    path: PathLike = None,
    overwrite: bool = False,
    reuse: bool = True,
    batch_size: int = 1e6,
) -> Fragments:
    """
    Create a Fragments object from multiple fragments tsv file

    Parameters:
        fragments_files:
            Location of the fragments tab-separate file created by e.g. CellRanger or sinto
        obs:
            DataFrame containing information about cells.
            The index should be the cell names as present in the fragments file.
            Alternatively, the column containing cell ids can be specified using the `cell_column` argument.
        regions:
            Regions from which the fragments will be extracted.
        cell_column:
            Column name in the `obs` DataFrame containing the cell names.
        batch_column:
            Column name in the `obs` DataFrame containing the batch indices.
            If None, will default to batch
        path:
            Folder in which the fragments data will be stored.
        overwrite:
            Whether to overwrite the data if it already exists.
        reuse:
            Whether to reuse existing data if it exists.
        batch_size:
            Number of fragments to process before saving. Lower this number if you run out of memory.
    Returns:
        A new Fragments object
    """

    if not isinstance(fragments_files, (list, tuple)):
        fragments_files = [fragments_files]

    fragments_files_ = []
    for fragments_file in fragments_files:
        if isinstance(fragments_file, str):
            fragments_file = pathlib.Path(fragments_file)
        if not fragments_file.exists():
            raise FileNotFoundError(f"File {fragments_file} does not exist")
        fragments_files_.append(fragments_file)
    fragments_files = fragments_files_

    if not overwrite and path.exists() and not reuse:
        raise FileExistsError(
            f"Folder {path} already exists, use `overwrite=True` to overwrite, or `reuse=True` to reuse existing data"
        )

    # regions information
    var = pd.DataFrame(index=regions.coordinates.index)
    var["ix"] = np.arange(var.shape[0])

    # cell information
    obs = obs.copy()
    obs["ix"] = np.arange(obs.shape[0])

    if batch_column is None:
        raise ValueError("batch_column should be specified")
    if batch_column not in obs.columns:
        raise ValueError(f"Column {batch_column} not in obs")
    if obs[batch_column].dtype != "int":
        raise ValueError(f"Column {batch_column} should be an integer column")
    if not obs[batch_column].max() == len(fragments_files) - 1:
        raise ValueError(f"Column {batch_column} should contain values between 0 and {len(fragments_files) - 1}")
    cell_to_cell_ix_batches = [
        obs.loc[obs[batch_column] == batch].set_index(cell_column)["ix"] for batch in obs[batch_column].unique()
    ]

    self = cls.create(path=path, obs=obs, var=var, regions=regions, reset=overwrite)

    # read the fragments file
    try:
        import pysam
    except ImportError as e:
        raise ImportError(
            "pysam is required to read fragments files. Install using `pip install pysam` or `conda install -c bioconda pysam`"
        ) from e
    fragments_tabix_batches = [pysam.TabixFile(str(fragments_file)) for fragments_file in fragments_files]

    # process regions
    pbar = tqdm.tqdm(
        enumerate(regions.coordinates.iterrows()),
        total=regions.coordinates.shape[0],
        leave=False,
        desc="Processing fragments",
    )

    self.mapping.open_creator()
    self.coordinates.open_creator()

    mapping_processed = []
    coordinates_processed = []

    for region_ix, (region_id, region_info) in pbar:
        pbar.set_description(f"{region_id}")

        strand = region_info["strand"]
        if "tss" in region_info:
            tss = region_info["tss"]
        else:
            tss = region_info["start"]

        mapping_raw = []
        coordinates_raw = []

        for fragments_tabix, cell_to_cell_ix in zip(fragments_tabix_batches, cell_to_cell_ix_batches):
            coordinates_raw_batch, mapping_raw_batch = _fetch_fragments_region(
                fragments_tabix=fragments_tabix,
                chrom=region_info["chrom"],
                start=region_info["start"],
                end=region_info["end"],
                tss=tss,
                strand=strand,
                cell_to_cell_ix=cell_to_cell_ix,
                region_ix=region_ix,
            )
            mapping_raw.append(mapping_raw_batch)
            coordinates_raw.append(coordinates_raw_batch)

        mapping_raw = np.concatenate(mapping_raw)
        coordinates_raw = np.concatenate(coordinates_raw)

        # sort by region, coordinate (of left cut sites), and cell
        sorted_idx = np.lexsort((coordinates_raw[:, 0], mapping_raw[:, 0], mapping_raw[:, 1]))
        mapping_raw = mapping_raw[sorted_idx]
        coordinates_raw = coordinates_raw[sorted_idx]

        mapping_processed.append(mapping_raw)
        coordinates_processed.append(coordinates_raw)

        if sum(mapping_raw.shape[0] for mapping_raw in mapping_processed) >= batch_size:
            self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
            self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))
            mapping_processed = []
            coordinates_processed = []

        del mapping_raw
        del coordinates_raw

    if len(mapping_processed) > 0:
        self.mapping.extend(np.concatenate(mapping_processed).astype(np.int32))
        self.coordinates.extend(np.concatenate(coordinates_processed).astype(np.int32))

    return self
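
For orientation, a minimal usage sketch of this constructor follows. This is illustrative only: the classmethod name `from_multiple_fragments_file` is an assumption (use the name shown in the signature above), and `regions` is assumed to be an existing Regions object.

import pathlib

import pandas as pd
import chromatinhd as chd

# One row per cell; both column names below are illustrative
obs = pd.DataFrame(
    {
        "cell": ["AAACCTGA-1", "AAACGGGT-1"],
        "batch": [0, 1],  # integer batch index, one per fragments file
    }
)

# Hypothetical method name; see the signature documented above
fragments = chd.data.fragments.Fragments.from_multiple_fragments_file(
    ["batch0_fragments.tsv.gz", "batch1_fragments.tsv.gz"],  # tabix-indexed
    regions=regions,  # an existing chromatinhd Regions object
    obs=obs,
    cell_column="cell",
    batch_column="batch",
    path=pathlib.Path("./fragments"),
)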

chromatinhd.data.fragments.FragmentsView

Bases: Flow

A view of fragments based on regions that are a subset of the parent fragments. In a typical use case, the parent contains fragments for all chromosomes, while the view focuses on specific regions.

Only fragments that are fully contained within a region (i.e., both left and right cut sites fall inside it) will be selected.
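
A minimal sketch of creating a view, assuming `fragments` is an existing Fragments object covering whole chromosomes and `regions` is a Regions object whose coordinates contain a "chrom" column:

import chromatinhd as chd

view = chd.data.fragments.FragmentsView.from_fragments(
    parent=fragments,  # Fragments over e.g. whole chromosomes
    regions=regions,  # the regions to focus on
    parentregion_column="chrom",  # links each region to a parent region
)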

Source code in src/chromatinhd/data/fragments/view.py
class FragmentsView(Flow):
    """
    A view of fragments based on regions that are a subset of the parent fragments. In a typical use case, the parent contains fragments for all chromosomes, while the view focuses on specific regions.

    Only fragments that are fully contained within a region (i.e., both left and right cut sites fall inside it) will be selected.
    """

    regionxcell_indptr: TensorstoreInstance = Tensorstore(
        dtype="<i8", chunks=[100000], compression="blosc", shape=(0, 2)
    )
    """Index of fragments in the parent fragments object"""

    parent: Fragments = Linked()
    """The parent fragments object from which this view is created"""

    regions: Regions = Linked()
    """The regions object"""

    parentregion_column: str = Stored()

    @classmethod
    def from_fragments(
        cls,
        parent: Fragments,
        regions: Regions,
        parentregion_column: str = "chrom",
        obs: pd.DataFrame = None,
        var: pd.DataFrame = None,
        path: PathLike = None,
        overwrite: bool = False,
    ):
        """
        Creates a fragments view from a parent fragments object and a regions object

        Parameters:
            parent:
                Parent fragments object. If a fragments view is provided, the parent of the parent will be used.
            regions:
                Regions object
            obs:
                DataFrame containing information about cells, will be copied from the fragments object if not provided
            parentregion_column:
                Column in the regions coordinates that links each new region to the regions of the original fragments. This is typically the chromosome column. This column should be present in both `parent.regions.coordinates` and `regions.coordinates`
            path:
                Path to store the fragments view
        """

        if isinstance(parent, FragmentsView):
            while isinstance(parent, FragmentsView):
                parent = parent.parent
        if not isinstance(parent, Fragments):
            raise ValueError("parent should be a Fragments object")
        if not isinstance(regions, Regions):
            raise ValueError("regions should be a Regions object", regions.__class__)

        # dummy proofing
        if parentregion_column not in regions.coordinates.columns:
            raise ValueError(
                f"Column {parentregion_column} not in regions coordinates. Available columns are {regions.coordinates.columns}"
            )
        if parentregion_column not in parent.regions.coordinates.columns:
            raise ValueError(
                f"Column {parentregion_column} not in fragments regions coordinates. Available columns are {parent.regions.coordinates.columns}"
            )
        if not (regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])).all():
            raise ValueError(
                f"Not all regions are present in the parent fragments. Missing regions: {regions.coordinates[parentregion_column][~regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])]}"
            )

        self = cls.create(
            parent=parent,
            regions=regions,
            path=path,
            parentregion_column=parentregion_column,
            obs=parent.obs if obs is None else obs,
            var=regions.coordinates if var is None else var,
            reset=overwrite,
        )

        return self

    def create_regionxcell_indptr(
        self,
        inclusive: tuple = (True, False),
        overwrite=True,
    ) -> FragmentsView:
        """
        Create index pointers (left and right) for the fragments associated with each regionxcell combination

        Parameters:
            inclusive:
                Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
            overwrite:
                Whether to overwrite the existing index pointers.

        Returns:
            Same object but with the `regionxcell_indptr` populated

        """
        if self.regionxcell_indptr.exists() and not overwrite:
            return self

        mapping = self.parent.mapping[:]
        coordinates = self.parent.coordinates[:]

        # convert regions in parent to parent region ixs
        parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

        # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
        regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

        # index pointers from parentregions to fragments
        parentregion_fragment_indptr = indices_to_indptr(mapping[:, 1], self.parent.n_regions)

        pbar = tqdm.tqdm(
            total=len(self.regions.coordinates),
            leave=False,
            desc="Processing regions",
        )

        self.regions.coordinates["ix"] = np.arange(len(self.regions))

        for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
            # extract in which parent region we need to look
            parentregion_ix = parentregion_to_parentregion_ix(parentregion)

            parentregion_start_ix, parentregion_end_ix = (
                parentregion_fragment_indptr[parentregion_ix],
                parentregion_fragment_indptr[parentregion_ix + 1],
            )

            # extract parent's mapping and coordinates
            coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]
            cellmapping_parentregion = mapping[parentregion_start_ix:parentregion_end_ix, 0]

            cell_indptr_parentregion = indices_to_indptr(cellmapping_parentregion, self.n_cells, dtype=np.int64)

            for region_id, region in subcoordinates.iterrows():
                region_ix = region["ix"]

                if pbar is not None:
                    pbar.update(1)
                    pbar.set_description(f"Processing region {region_id}")

                # extract fragments lying within the region
                # depending on the strand, we either need to include the start or the end
                region["start_inclusive"] = region["start"] + (region["strand"] == -1)
                region["end_inclusive"] = region["end"] + (region["strand"] == -1)

                if inclusive == (True, False):
                    fragments_oi = (coordinates_parentregion[:, 0] >= region["start_inclusive"]) & (
                        coordinates_parentregion[:, 0] < region["end_inclusive"]
                    )
                    fragments_excluded_left = coordinates_parentregion[:, 0] < region["start_inclusive"]
                else:
                    raise NotImplementedError("For now, only left-inclusive fragments are supported")

                n_excluded_left = np.bincount(cellmapping_parentregion[fragments_excluded_left], minlength=self.n_cells)
                n_included = np.bincount(cellmapping_parentregion[fragments_oi], minlength=self.n_cells)

                cell_indptr_left = cell_indptr_parentregion[:-1] + n_excluded_left
                cell_indptr_right = cell_indptr_parentregion[:-1] + n_excluded_left + n_included

                cell_indptr = np.stack([cell_indptr_left, cell_indptr_right], axis=1)

                regionxcell_indptr[region_ix * self.n_cells : (region_ix + 1) * self.n_cells] = (
                    cell_indptr + parentregion_start_ix
                )

        regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
            shape=(len(self.regions.coordinates) * len(self.obs), 2)
        )
        regionxcell_indptr_writer[:] = regionxcell_indptr

        return self

    def create_regionxcell_indptr2(
        self,
        inclusive: tuple = (True, False),
        overwrite=True,
    ) -> FragmentsView:
        """
        Create index pointers (left and right) for the fragments associated with each regionxcell combination. This implementation is faster if there are many samples with a lot of fragments (e.g. minibulk data).

        Parameters:
            inclusive:
                Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
            overwrite:
                Whether to overwrite the existing index pointers.

        Returns:
            Same object but with the `regionxcell_indptr` populated

        """
        if self.regionxcell_indptr.exists() and not overwrite:
            return self

        mapping = self.parent.mapping[:]
        coordinates = self.parent.coordinates[:]

        # convert regions in parent to parent region ixs
        parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

        # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
        regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

        # index pointers from parentregions to fragments
        parent_regionxcell_indptr = indices_to_indptr(
            mapping[:, 1] * self.parent.n_cells + mapping[:, 0], self.parent.n_regions * self.parent.n_cells
        )

        pbar = tqdm.tqdm(
            total=len(self.regions.coordinates) * len(self.obs),
            leave=False,
            desc="Processing regions",
        )

        self.regions.coordinates["ix"] = np.arange(len(self.regions))
        self.regions.coordinates["start_inclusive"] = self.regions.coordinates["start"] + (
            self.regions.coordinates["strand"] == -1
        )
        self.regions.coordinates["end_inclusive"] = self.regions.coordinates["end"] + (
            self.regions.coordinates["strand"] == -1
        )

        lastupdate = time.time()
        i = 0

        for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
            # extract in which parent region we need to look
            parentregion_ix = parentregion_to_parentregion_ix(parentregion)

            for cell_ix in range(self.n_cells):
                # extract the parent region coordinates per cell
                parentregionxcell_ix = parentregion_ix * self.parent.n_cells + cell_ix
                parentregion_start_ix, parentregion_end_ix = parent_regionxcell_indptr[
                    parentregionxcell_ix : parentregionxcell_ix + 2
                ]

                coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]

                for region_id, region in subcoordinates.iterrows():
                    region_ix = region["ix"]
                    regionxcell_ix = region_ix * self.n_cells + cell_ix

                    if (pbar is not None) and (time.time() - lastupdate > 1):
                        pbar.update(i + 1)
                        i = 0
                        pbar.set_description(f"Processing region {region_id} {cell_ix}")
                        lastupdate = time.time()
                    else:
                        i += 1

                    # extract fragments lying within the region
                    if inclusive == (True, False):
                        n_excluded_left = np.searchsorted(coordinates_parentregion[:, 0], region["start_inclusive"])
                        n_included = (
                            np.searchsorted(coordinates_parentregion[:, 0], region["end_inclusive"]) - n_excluded_left
                        )
                    else:
                        raise NotImplementedError("For now, only left-inclusive fragments are supported")

                    regionxcell_indptr[regionxcell_ix] = [
                        parentregion_start_ix + n_excluded_left,
                        parentregion_start_ix + n_excluded_left + n_included,
                    ]

        regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
            shape=(len(self.regions.coordinates) * len(self.obs), 2)
        )
        regionxcell_indptr_writer[:] = regionxcell_indptr

        return self

    def filter_regions(self, regions: Regions, path: PathLike = None, overwrite=True) -> Fragments:
        """
        Filter based on new regions

        Parameters:
            regions:
                Regions to filter.
        Returns:
            A new Fragments object
        """

        # check if new regions are a subset of the existing ones
        if not regions.coordinates.index.isin(self.regions.coordinates.index).all():
            raise ValueError("New regions should be a subset of the existing ones")

        # create new fragments
        fragments = FragmentsView.create(
            parent=self.parent,
            regions=regions,
            path=path,
            parentregion_column=self.parentregion_column,
            obs=self.obs,
            var=regions.coordinates,
            reset=overwrite,
        )

        return fragments

    var: pd.DataFrame = TSV()
    """DataFrame containing information about regions."""

    obs: pd.DataFrame = TSV()
    """DataFrame containing information about cells."""

    @functools.cached_property
    def n_regions(self):
        """Number of regions"""
        return self.var.shape[0]

    @functools.cached_property
    def n_cells(self):
        """Number of cells"""
        return self.obs.shape[0]

    def estimate_fragment_per_cellxregion(self) -> int:
        """
        Estimate the expected number of fragments per regionxcell combination. This is used to estimate the buffer size for loading fragments.
        """
        return math.ceil((self.regionxcell_indptr[:, 1] - self.regionxcell_indptr[:, 0]).astype(float).mean())

    @property
    def coordinates(self):
        """
        Coordinates of the fragments, equal to the parent coordinates
        """
        return self.parent.coordinates

    @property
    def mapping(self):
        """
        Mapping of the fragments, equal to the parent mapping
        """
        return self.parent.mapping

    @property
    def counts(self):
        """
        Number of fragments per region and cell
        """
        return (self.regionxcell_indptr[:, 1] - self.regionxcell_indptr[:, 0]).reshape(self.regions.n_regions, -1).T

    _cache = None

    def get_cache(self, region_oi):
        """
        Get the cache for a specific region
        """

        if self._cache is None:
            self._cache = {}

        if region_oi in self._cache:
            return self._cache[region_oi]

        region_ix = self.regions.coordinates.index.get_loc(region_oi)
        regionxcell_ixs = region_ix * self.n_cells + np.arange(self.n_cells)

        indptrs = self.regionxcell_indptr[regionxcell_ixs]

        coordinates_reader = self.parent.coordinates.open_reader()

        n = []
        i = 0
        coordinates = []
        for start, end in indptrs:
            coordinates.append(coordinates_reader[start:end])
            i += end - start
            n.append(end - start)

        coordinates = np.concatenate(coordinates)
        local_cellxregion_ix = np.repeat(np.arange(len(indptrs)), n)
        regionxcell_indptr = indices_to_indptr(local_cellxregion_ix, len(indptrs), dtype=np.int64)

        self._cache[region_oi] = {
            "regionxcell_indptr": regionxcell_indptr,
            "coordinates": coordinates,
        }

        return self._cache[region_oi]

    _libsize = None

    @property
    def libsize(self):
        if self._libsize is None:
            self._libsize = np.bincount(self.mapping[:, 0], minlength=self.n_cells)
        return self._libsize

coordinates property

Coordinates of the fragments, equal to the parent coordinates

counts property

Number of fragments per region and cell

mapping property

Mapping of the fragments, equal to the parent mapping

n_cells cached property

Number of cells

n_regions cached property

Number of regions

obs: pd.DataFrame = TSV() class-attribute instance-attribute

DataFrame containing information about cells.

parent: Fragments = Linked() class-attribute instance-attribute

The parent fragments object from which this view is created

regions: Regions = Linked() class-attribute instance-attribute

The regions object

regionxcell_indptr: TensorstoreInstance = Tensorstore(dtype='<i8', chunks=[100000], compression='blosc', shape=(0, 2)) class-attribute instance-attribute

Index of fragments in the parent fragments object

var: pd.DataFrame = TSV() class-attribute instance-attribute

DataFrame containing information about regions.

create_regionxcell_indptr(inclusive=(True, False), overwrite=True)

Create index pointers (left and right) for the fragments associated with each regionxcell combination

Parameters:

    inclusive (tuple, default (True, False)):
        Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only (True, False) is supported.
    overwrite (default True):
        Whether to overwrite the existing index pointers.

Returns:

    FragmentsView: Same object but with the regionxcell_indptr populated
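
A short sketch of how the populated pointers can be used; `region_ix` and `cell_ix` are illustrative indices:

view = view.create_regionxcell_indptr()

region_ix, cell_ix = 0, 0
start, end = view.regionxcell_indptr[region_ix * view.n_cells + cell_ix]
fragment_coordinates = view.coordinates[start:end]  # left/right cut sites from the parent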

Source code in src/chromatinhd/data/fragments/view.py
def create_regionxcell_indptr(
    self,
    inclusive: tuple = (True, False),
    overwrite=True,
) -> FragmentsView:
    """
    Create index pointers (left and right) for the fragments associated with each regionxcell combination

    Parameters:
        inclusive:
            Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
        overwrite:
            Whether to overwrite the existing index pointers.

    Returns:
        Same object but with the `regionxcell_indptr` populated

    """
    if self.regionxcell_indptr.exists() and not overwrite:
        return self

    mapping = self.parent.mapping[:]
    coordinates = self.parent.coordinates[:]

    # convert regions in parent to parent region ixs
    parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

    # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
    regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

    # index pointers from parentregions to fragments
    parentregion_fragment_indptr = indices_to_indptr(mapping[:, 1], self.parent.n_regions)

    pbar = tqdm.tqdm(
        total=len(self.regions.coordinates),
        leave=False,
        desc="Processing regions",
    )

    self.regions.coordinates["ix"] = np.arange(len(self.regions))

    for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
        # extract in which parent region we need to look
        parentregion_ix = parentregion_to_parentregion_ix(parentregion)

        parentregion_start_ix, parentregion_end_ix = (
            parentregion_fragment_indptr[parentregion_ix],
            parentregion_fragment_indptr[parentregion_ix + 1],
        )

        # extract parent's mapping and coordinates
        coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]
        cellmapping_parentregion = mapping[parentregion_start_ix:parentregion_end_ix, 0]

        cell_indptr_parentregion = indices_to_indptr(cellmapping_parentregion, self.n_cells, dtype=np.int64)

        for region_id, region in subcoordinates.iterrows():
            region_ix = region["ix"]

            if pbar is not None:
                pbar.update(1)
                pbar.set_description(f"Processing region {region_id}")

            # extract fragments lying within the region
            # depending on the strand, we either need to include the start or the end
            region["start_inclusive"] = region["start"] + (region["strand"] == -1)
            region["end_inclusive"] = region["end"] + (region["strand"] == -1)

            if inclusive == (True, False):
                fragments_oi = (coordinates_parentregion[:, 0] >= region["start_inclusive"]) & (
                    coordinates_parentregion[:, 0] < region["end_inclusive"]
                )
                fragments_excluded_left = coordinates_parentregion[:, 0] < region["start_inclusive"]
            else:
                raise NotImplementedError("For now, only left-inclusive fragments are supported")

            n_excluded_left = np.bincount(cellmapping_parentregion[fragments_excluded_left], minlength=self.n_cells)
            n_included = np.bincount(cellmapping_parentregion[fragments_oi], minlength=self.n_cells)

            cell_indptr_left = cell_indptr_parentregion[:-1] + n_excluded_left
            cell_indptr_right = cell_indptr_parentregion[:-1] + n_excluded_left + n_included

            cell_indptr = np.stack([cell_indptr_left, cell_indptr_right], axis=1)

            regionxcell_indptr[region_ix * self.n_cells : (region_ix + 1) * self.n_cells] = (
                cell_indptr + parentregion_start_ix
            )

    regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
        shape=(len(self.regions.coordinates) * len(self.obs), 2)
    )
    regionxcell_indptr_writer[:] = regionxcell_indptr

    return self

create_regionxcell_indptr2(inclusive=(True, False), overwrite=True)

Create index pointers (left and right) for the fragments associated with each regionxcell combination. This implementation is faster if there are many samples with a lot of fragments (e.g. minibulk data).

Parameters:

    inclusive (tuple, default (True, False)):
        Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only (True, False) is supported.
    overwrite (default True):
        Whether to overwrite the existing index pointers.

Returns:

    FragmentsView: Same object but with the regionxcell_indptr populated
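
To make the pointer layout concrete, a tiny numeric illustration with invented values:

import numpy as np

# 2 regions x 2 cells: one [start, end) pair per regionxcell combination,
# indexing into the parent's fragment arrays
regionxcell_indptr = np.array([[0, 5], [5, 9], [12, 20], [20, 31]])
n_fragments = regionxcell_indptr[:, 1] - regionxcell_indptr[:, 0]  # array([ 5,  4,  8, 11])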

Source code in src/chromatinhd/data/fragments/view.py
def create_regionxcell_indptr2(
    self,
    inclusive: tuple = (True, False),
    overwrite=True,
) -> FragmentsView:
    """
    Create index pointers (left and right) for the fragments associated with each regionxcell combination. This implementation is faster if there are many samples with a lot of fragments (e.g. minibulk data).

    Parameters:
        inclusive:
            Tuple indicating whether the left and right cut sites, respectively, must fall within the region for a fragment to be included. Currently only `(True, False)` (left cut site within the region) is supported.
        overwrite:
            Whether to overwrite the existing index pointers.

    Returns:
        Same object but with the `regionxcell_indptr` populated

    """
    if self.regionxcell_indptr.exists() and not overwrite:
        return self

    mapping = self.parent.mapping[:]
    coordinates = self.parent.coordinates[:]

    # convert regions in parent to parent region ixs
    parentregion_to_parentregion_ix = self.parent.regions.coordinates.index.get_loc

    # in-memory buffer for the index pointers; the tensorstore itself is (re)created at the end
    regionxcell_indptr = np.zeros((len(self.regions.coordinates) * len(self.obs), 2), dtype=np.int64)

    # index pointers from parentregions to fragments
    parent_regionxcell_indptr = indices_to_indptr(
        mapping[:, 1] * self.parent.n_cells + mapping[:, 0], self.parent.n_regions * self.parent.n_cells
    )

    pbar = tqdm.tqdm(
        total=len(self.regions.coordinates) * len(self.obs),
        leave=False,
        desc="Processing regions",
    )

    self.regions.coordinates["ix"] = np.arange(len(self.regions))
    self.regions.coordinates["start_inclusive"] = self.regions.coordinates["start"] + (
        self.regions.coordinates["strand"] == -1
    )
    self.regions.coordinates["end_inclusive"] = self.regions.coordinates["end"] + (
        self.regions.coordinates["strand"] == -1
    )

    lastupdate = time.time()
    i = 0

    for parentregion, subcoordinates in self.regions.coordinates.groupby(self.parentregion_column):
        # extract in which parent region we need to look
        parentregion_ix = parentregion_to_parentregion_ix(parentregion)

        for cell_ix in range(self.n_cells):
            # extract the parent region coordinates per cell
            parentregionxcell_ix = parentregion_ix * self.parent.n_cells + cell_ix
            parentregion_start_ix, parentregion_end_ix = parent_regionxcell_indptr[
                parentregionxcell_ix : parentregionxcell_ix + 2
            ]

            coordinates_parentregion = coordinates[parentregion_start_ix:parentregion_end_ix]

            for region_id, region in subcoordinates.iterrows():
                region_ix = region["ix"]
                regionxcell_ix = region_ix * self.n_cells + cell_ix

                if (pbar is not None) and (time.time() - lastupdate > 1):
                    pbar.update(i + 1)
                    i = 0
                    pbar.set_description(f"Processing region {region_id} {cell_ix}")
                    lastupdate = time.time()
                else:
                    i += 1

                # extract fragments lying within the region
                if inclusive == (True, False):
                    n_excluded_left = np.searchsorted(coordinates_parentregion[:, 0], region["start_inclusive"])
                    n_included = (
                        np.searchsorted(coordinates_parentregion[:, 0], region["end_inclusive"]) - n_excluded_left
                    )
                else:
                    raise NotImplementedError("For now, only left-inclusive fragments are supported")

                regionxcell_indptr[regionxcell_ix] = [
                    parentregion_start_ix + n_excluded_left,
                    parentregion_start_ix + n_excluded_left + n_included,
                ]

    regionxcell_indptr_writer = self.regionxcell_indptr.open_creator(
        shape=(len(self.regions.coordinates) * len(self.obs), 2)
    )
    regionxcell_indptr_writer[:] = regionxcell_indptr

    return self

estimate_fragment_per_cellxregion()

Estimate the expected number of fragments per regionxcell combination. This is used to estimate the buffer size for loading fragments.
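
For example (a sketch; the value is the rounded-up mean fragment count over all regionxcell combinations):

mean_fragments_per_regionxcell = view.estimate_fragment_per_cellxregion()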

Source code in src/chromatinhd/data/fragments/view.py
def estimate_fragment_per_cellxregion(self) -> int:
    """
    Estimate the expected number of fragments per regionxcell combination. This is used to estimate the buffer size for loading fragments.
    """
    return math.ceil((self.regionxcell_indptr[:, 1] - self.regionxcell_indptr[:, 0]).astype(float).mean())

filter_regions(regions, path=None, overwrite=True)

Filter based on new regions

Parameters:

    regions (Regions, required):
        Regions to filter.

Returns:

    A new Fragments object
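
A minimal sketch, assuming `subset_regions` is a Regions object whose index is a subset of the view's regions:

subview = view.filter_regions(subset_regions, path="fragments_subset")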

Source code in src/chromatinhd/data/fragments/view.py
def filter_regions(self, regions: Regions, path: PathLike = None, overwrite=True) -> Fragments:
    """
    Filter based on new regions

    Parameters:
        regions:
            Regions to filter.
    Returns:
        A new Fragments object
    """

    # check if new regions are a subset of the existing ones
    if not regions.coordinates.index.isin(self.regions.coordinates.index).all():
        raise ValueError("New regions should be a subset of the existing ones")

    # create new fragments
    fragments = FragmentsView.create(
        parent=self.parent,
        regions=regions,
        path=path,
        parentregion_column=self.parentregion_column,
        obs=self.obs,
        var=regions.coordinates,
        reset=overwrite,
    )

    return fragments

from_fragments(parent, regions, parentregion_column='chrom', obs=None, var=None, path=None, overwrite=False) classmethod

Creates a fragments view from a parent fragments object and a regions object

Parameters:

    parent (Fragments, required):
        Parent fragments object. If a fragments view is provided, the parent of the parent will be used.
    regions (Regions, required):
        Regions object
    obs (DataFrame, default None):
        DataFrame containing information about cells, will be copied from the fragments object if not provided
    parentregion_column (str, default 'chrom'):
        Column in the regions coordinates that links each new region to the regions of the original fragments. This is typically the chromosome column. This column should be present in both parent.regions.coordinates and regions.coordinates
    path (PathLike, default None):
        Path to store the fragments view
Source code in src/chromatinhd/data/fragments/view.py
@classmethod
def from_fragments(
    cls,
    parent: Fragments,
    regions: Regions,
    parentregion_column: str = "chrom",
    obs: pd.DataFrame = None,
    var: pd.DataFrame = None,
    path: PathLike = None,
    overwrite: bool = False,
):
    """
    Creates a fragments view from a parent fragments object and a regions object

    Parameters:
        parent:
            Parent fragments object. If a fragments view is provided, the parent of the parent will be used.
        regions:
            Regions object
        obs:
            DataFrame containing information about cells, will be copied from the fragments object if not provided
        parentregion_column:
            Column in the regions coordinates that links each new region to the regions of the original fragments. This is typically the chromosome column. This column should be present in both `parent.regions.coordinates` and `regions.coordinates`
        path:
            Path to store the fragments view
    """

    if isinstance(parent, FragmentsView):
        while isinstance(parent, FragmentsView):
            parent = parent.parent
    if not isinstance(parent, Fragments):
        raise ValueError("parent should be a Fragments object")
    if not isinstance(regions, Regions):
        raise ValueError("regions should be a Regions object", regions.__class__)

    # dummy proofing
    if parentregion_column not in regions.coordinates.columns:
        raise ValueError(
            f"Column {parentregion_column} not in regions coordinates. Available columns are {regions.coordinates.columns}"
        )
    if parentregion_column not in parent.regions.coordinates.columns:
        raise ValueError(
            f"Column {parentregion_column} not in fragments regions coordinates. Available columns are {parent.regions.coordinates.columns}"
        )
    if not (regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])).all():
        raise ValueError(
            f"Not all regions are present in the parent fragments. Missing regions: {regions.coordinates[parentregion_column][~regions.coordinates[parentregion_column].isin(parent.regions.coordinates[parentregion_column])]}"
        )

    self = cls.create(
        parent=parent,
        regions=regions,
        path=path,
        parentregion_column=parentregion_column,
        obs=parent.obs if obs is None else obs,
        var=regions.coordinates if var is None else var,
        reset=overwrite,
    )

    return self

get_cache(region_oi)

Get the cache for a specific region

Source code in src/chromatinhd/data/fragments/view.py
def get_cache(self, region_oi):
    """
    Get the cache for a specific region
    """

    if self._cache is None:
        self._cache = {}

    if region_oi in self._cache:
        return self._cache[region_oi]

    region_ix = self.regions.coordinates.index.get_loc(region_oi)
    regionxcell_ixs = region_ix * self.n_cells + np.arange(self.n_cells)

    indptrs = self.regionxcell_indptr[regionxcell_ixs]

    coordinates_reader = self.parent.coordinates.open_reader()

    n = []
    i = 0
    coordinates = []
    for start, end in indptrs:
        coordinates.append(coordinates_reader[start:end])
        i += end - start
        n.append(end - start)

    coordinates = np.concatenate(coordinates)
    local_cellxregion_ix = np.repeat(np.arange(len(indptrs)), n)
    regionxcell_indptr = indices_to_indptr(local_cellxregion_ix, len(indptrs), dtype=np.int64)

    self._cache[region_oi] = {
        "regionxcell_indptr": regionxcell_indptr,
        "coordinates": coordinates,
    }

    return self._cache[region_oi]