Skip to content

Regions

chromatinhd.data.Regions

Bases: Flow

Regions in the genome

Source code in src/chromatinhd/data/regions.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
class Regions(Flow):
    """
    Regions in the genome
    """

    coordinates = TSV(columns=["chrom", "start", "end"])
    "Coordinates dataframe of the regions, with columns chrom, start, end"

    window = Stored()

    @classmethod
    def from_transcripts(
        cls,
        transcripts: pd.DataFrame,
        window: [list, np.ndarray],
        path: PathLike = None,
        max_n_regions: Optional[int] = None,
        overwrite=True,
    ) -> Regions:
        """
        Create regions from a dataframe of transcripts,
        using a specified window around each transcription start site.

        Parameters:
            transcripts:
                Dataframe of transcripts, with columns chrom, start, end, strand, transcript
            window:
                Window around each transcription start site. Should be a 2-element array, e.g. [-10000, 10000]
            path:
                Folder in which the regions data will be stored
            max_n_regions:
                Maximum number of region to use. If None, all regions are used.
        Returns:
            Regions
        """
        transcripts["tss"] = transcripts["start"] * (transcripts["strand"] == 1) + transcripts["end"] * (transcripts["strand"] == -1)

        regions = transcripts[["chrom", "tss", "transcript"]].copy()

        regions["strand"] = transcripts["strand"]
        regions["positive_strand"] = (regions["strand"] == 1).astype(int)
        regions["negative_strand"] = (regions["strand"] == -1).astype(int)
        regions["chrom"] = transcripts.loc[regions.index, "chrom"]

        regions["start"] = (regions["tss"] + window[0] * (regions["strand"] == 1) - window[1] * (regions["strand"] == -1)).astype(int)
        regions["end"] = (regions["tss"] + window[1] * (regions["strand"] == -1) - window[0] * (regions["strand"] == 1)).astype(int)

        if max_n_regions is not None:
            regions = regions.iloc[:max_n_regions]

        return cls.create(
            path=path,
            coordinates=regions[["chrom", "start", "end", "tss", "strand", "transcript"]],
            window=window,
            reset=overwrite,
        )

    def filter(self, region_ids: List[str], path: PathLike = None, overwrite=True) -> Regions:
        """
        Select a subset of regions

        Parameters:
            region_ids:
                Genes to filter. Should be a pandas Series with the index being the ensembl transcript ids.
            path:
                Path to store the filtered regions
        Returns:
            Regions with only the specified region_ids
        """

        return Regions.create(coordinates=self.coordinates.loc[region_ids], window=self.window, path=path, reset=overwrite)

    @property
    def window_width(self):
        if self.window is None:
            return None
        return self.window[1] - self.window[0]

    region_width = window_width
    width = window_width
    "Width of the regions, None if regions do not have a fixed width"

    @classmethod
    def from_chromosomes_file(cls, chromosomes_file: PathLike, path: PathLike = None, filter_chromosomes=True, overwrite: bool = True) -> Regions:
        """
        Create regions based on a chromosomes file, e.g. hg38.chrom.sizes

        Parameters:
            chromosomes_file:
                Path to chromosomes file, tab separated, with columns chrom, size
            path:
                Folder in which the regions data will be stored
        Returns:
            Regions
        """

        chromosomes = pd.read_csv(chromosomes_file, sep="\t", names=["chrom", "size"])
        chromosomes["start"] = 0
        chromosomes["end"] = chromosomes["size"]
        chromosomes["strand"] = 1
        chromosomes = chromosomes[["chrom", "start", "end", "strand"]]
        chromosomes = chromosomes.set_index("chrom", drop=False)
        chromosomes.index.name = "region"

        if filter_chromosomes:
            chromosomes = chromosomes.loc[~chromosomes["chrom"].isin(["chrM", "chrMT"])]
            chromosomes = chromosomes.loc[~chromosomes["chrom"].str.contains("_")]
            chromosomes = chromosomes.loc[~chromosomes["chrom"].str.contains(r"\.")]

        chromosomes = chromosomes.sort_values("chrom")

        return cls.create(
            path=path,
            coordinates=chromosomes,
            window=None,
            reset=overwrite,
        )

    @functools.cached_property
    def n_regions(self):
        return self.coordinates.shape[0]

    @functools.cached_property
    def region_lengths(self):
        return (self.coordinates["end"] - self.coordinates["start"]).values

    @functools.cached_property
    def region_starts(self):
        return self.coordinates["start"].values

    @functools.cached_property
    def cumulative_region_lengths(self):
        return np.pad(np.cumsum(self.coordinates["end"].values - self.coordinates["start"].values), (1, 0))

    @property
    def var(self):
        return self.coordinates

    @var.setter
    def var(self, value):
        self.coordinates = value

    def __len__(self):
        return self.n_regions

coordinates = TSV(columns=['chrom', 'start', 'end']) class-attribute instance-attribute

Coordinates dataframe of the regions, with columns chrom, start, end

width = window_width class-attribute instance-attribute

Width of the regions, None if regions do not have a fixed width

filter(region_ids, path=None, overwrite=True)

Select a subset of regions

Parameters:

Name Type Description Default
region_ids List[str]

Genes to filter. Should be a pandas Series with the index being the ensembl transcript ids.

required
path PathLike

Path to store the filtered regions

None

Returns: Regions with only the specified region_ids

Source code in src/chromatinhd/data/regions.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def filter(self, region_ids: List[str], path: PathLike = None, overwrite=True) -> Regions:
    """
    Select a subset of regions

    Parameters:
        region_ids:
            Genes to filter. Should be a pandas Series with the index being the ensembl transcript ids.
        path:
            Path to store the filtered regions
    Returns:
        Regions with only the specified region_ids
    """

    return Regions.create(coordinates=self.coordinates.loc[region_ids], window=self.window, path=path, reset=overwrite)

from_chromosomes_file(chromosomes_file, path=None, filter_chromosomes=True, overwrite=True) classmethod

Create regions based on a chromosomes file, e.g. hg38.chrom.sizes

Parameters:

Name Type Description Default
chromosomes_file PathLike

Path to chromosomes file, tab separated, with columns chrom, size

required
path PathLike

Folder in which the regions data will be stored

None

Returns: Regions

Source code in src/chromatinhd/data/regions.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
@classmethod
def from_chromosomes_file(cls, chromosomes_file: PathLike, path: PathLike = None, filter_chromosomes=True, overwrite: bool = True) -> Regions:
    """
    Create regions based on a chromosomes file, e.g. hg38.chrom.sizes

    Parameters:
        chromosomes_file:
            Path to chromosomes file, tab separated, with columns chrom, size
        path:
            Folder in which the regions data will be stored
    Returns:
        Regions
    """

    chromosomes = pd.read_csv(chromosomes_file, sep="\t", names=["chrom", "size"])
    chromosomes["start"] = 0
    chromosomes["end"] = chromosomes["size"]
    chromosomes["strand"] = 1
    chromosomes = chromosomes[["chrom", "start", "end", "strand"]]
    chromosomes = chromosomes.set_index("chrom", drop=False)
    chromosomes.index.name = "region"

    if filter_chromosomes:
        chromosomes = chromosomes.loc[~chromosomes["chrom"].isin(["chrM", "chrMT"])]
        chromosomes = chromosomes.loc[~chromosomes["chrom"].str.contains("_")]
        chromosomes = chromosomes.loc[~chromosomes["chrom"].str.contains(r"\.")]

    chromosomes = chromosomes.sort_values("chrom")

    return cls.create(
        path=path,
        coordinates=chromosomes,
        window=None,
        reset=overwrite,
    )

from_transcripts(transcripts, window, path=None, max_n_regions=None, overwrite=True) classmethod

Create regions from a dataframe of transcripts, using a specified window around each transcription start site.

Parameters:

Name Type Description Default
transcripts DataFrame

Dataframe of transcripts, with columns chrom, start, end, strand, transcript

required
window [list, ndarray]

Window around each transcription start site. Should be a 2-element array, e.g. [-10000, 10000]

required
path PathLike

Folder in which the regions data will be stored

None
max_n_regions Optional[int]

Maximum number of region to use. If None, all regions are used.

None

Returns: Regions

Source code in src/chromatinhd/data/regions.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@classmethod
def from_transcripts(
    cls,
    transcripts: pd.DataFrame,
    window: [list, np.ndarray],
    path: PathLike = None,
    max_n_regions: Optional[int] = None,
    overwrite=True,
) -> Regions:
    """
    Create regions from a dataframe of transcripts,
    using a specified window around each transcription start site.

    Parameters:
        transcripts:
            Dataframe of transcripts, with columns chrom, start, end, strand, transcript
        window:
            Window around each transcription start site. Should be a 2-element array, e.g. [-10000, 10000]
        path:
            Folder in which the regions data will be stored
        max_n_regions:
            Maximum number of region to use. If None, all regions are used.
    Returns:
        Regions
    """
    transcripts["tss"] = transcripts["start"] * (transcripts["strand"] == 1) + transcripts["end"] * (transcripts["strand"] == -1)

    regions = transcripts[["chrom", "tss", "transcript"]].copy()

    regions["strand"] = transcripts["strand"]
    regions["positive_strand"] = (regions["strand"] == 1).astype(int)
    regions["negative_strand"] = (regions["strand"] == -1).astype(int)
    regions["chrom"] = transcripts.loc[regions.index, "chrom"]

    regions["start"] = (regions["tss"] + window[0] * (regions["strand"] == 1) - window[1] * (regions["strand"] == -1)).astype(int)
    regions["end"] = (regions["tss"] + window[1] * (regions["strand"] == -1) - window[0] * (regions["strand"] == 1)).astype(int)

    if max_n_regions is not None:
        regions = regions.iloc[:max_n_regions]

    return cls.create(
        path=path,
        coordinates=regions[["chrom", "start", "end", "tss", "strand", "transcript"]],
        window=window,
        reset=overwrite,
    )

chromatinhd.data.regions.select_tss_from_fragments(transcripts, fragments_file, window=(-100, 100))

Select the TSS with the most fragments within a window of the TSS

Parameters:

Name Type Description Default
transcripts DataFrame

Dataframe of transcripts, with columns chrom, tss, ensembl_gene_id.

required
fragments_file PathLike

Path to fragments file

required
window [ndarray, tuple]

Window around the TSS to count fragments

(-100, 100)

Returns: Dataframe of transcripts, with columns chrom, tss and n_fragments, with index being the gene id

Source code in src/chromatinhd/data/regions.py
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
def select_tss_from_fragments(transcripts: pd.DataFrame, fragments_file: PathLike, window: [np.ndarray, tuple] = (-100, 100)) -> pd.DataFrame:
    """
    Select the TSS with the most fragments within a window of the TSS

    Parameters:
        transcripts:
            Dataframe of transcripts, with columns chrom, tss, ensembl_gene_id.
        fragments_file:
            Path to fragments file
        window:
            Window around the TSS to count fragments
    Returns:
        Dataframe of transcripts, with columns chrom, tss and n_fragments, with index being the gene id
    """
    if not ([col in transcripts.columns for col in ["chrom", "tss", "ensembl_gene_id"]]):
        raise ValueError("Transcripts should have columns chrom, tss, ensembl_gene_id. ")

    try:
        import pysam
    except ImportError:
        raise ImportError("Please install the pysam package `pip install pysam` or `conda install pysam`")

    try:
        fragments_tabix = pysam.TabixFile(str(fragments_file))
    except OSError as error:
        raise ValueError("fragments file is not indexed, please run `pysam.tabix_index(file, preset = 'bed')`") from error

    nfrags = []
    for chrom, tss in tqdm.tqdm(zip(transcripts["chrom"], transcripts["tss"]), total=transcripts.shape[0]):
        frags = list(fragments_tabix.fetch(chrom, tss + window[0], tss + window[1]))
        nfrags.append(len(frags))
    transcripts["n_fragments"] = nfrags
    selected_transcripts = transcripts.reset_index().sort_values("n_fragments", ascending=False).groupby("ensembl_gene_id").first()
    selected_transcripts.index.name = "gene"
    return selected_transcripts