Tessl Tile for pypi/scanpy@1.11.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

analysis-tools.md data-io.md datasets.md external-tools.md index.md preprocessing.md queries.md spatial-analysis.md utilities.md visualization.md

docs/preprocessing.md

0
# Preprocessing
1

2
Scanpy's preprocessing module provides a comprehensive pipeline for preparing raw single-cell data for downstream analysis. This includes quality control, filtering, normalization, scaling, feature selection, and dimensionality reduction.
3

4
## Capabilities
5

6
### Quality Control and Filtering
7

8
Calculate quality control metrics and filter cells and genes based on various criteria.
9

10
```python { .api }
11
def calculate_qc_metrics(adata, expr_type='counts', var_type='genes', qc_vars=(), percent_top=(50, 100, 200, 500), log1p=True, inplace=False):
12
    """
13
    Calculate quality control metrics for cells and genes.
14
    
15
    Parameters:
16
    - adata (AnnData): Annotated data object
17
    - expr_type (str): Name for expression type in metrics
18
    - var_type (str): Name for variable type in metrics  
19
    - qc_vars (list, optional): List of gene sets to calculate percentages
20
    - percent_top (list, optional): Calculate percentage of top expressed genes
21
    - log1p (bool): Also compute log1p-transformed versions of the metrics (set to False to skip them)
22
    - inplace (bool): Modify adata in place
23
    
24
    Returns:
25
    None or tuple of (obs_metrics, var_metrics) DataFrames: QC metrics (if not inplace)
26
    """
27

28
def filter_cells(adata, min_counts=None, min_genes=None, max_counts=None, max_genes=None, inplace=True, copy=False):
29
    """
30
    Filter cells based on counts and numbers of genes expressed.
31
    
32
    Parameters:
33
    - adata (AnnData): Annotated data object
34
    - min_counts (int, optional): Minimum number of counts per cell
35
    - min_genes (int, optional): Minimum number of genes per cell
36
    - max_counts (int, optional): Maximum number of counts per cell
37
    - max_genes (int, optional): Maximum number of genes per cell
38
    - inplace (bool): Modify adata in place
39
    - copy (bool): Return copy instead of modifying
40
    
41
    Returns:
42
    AnnData or None: Filtered object (if copy=True)
43
    """
44

45
def filter_genes(adata, min_counts=None, min_cells=None, max_counts=None, max_cells=None, inplace=True, copy=False):
46
    """
47
    Filter genes based on counts and numbers of cells expressed.
48
    
49
    Parameters:
50
    - adata (AnnData): Annotated data object
51
    - min_counts (int, optional): Minimum number of counts per gene
52
    - min_cells (int, optional): Minimum number of cells expressing gene
53
    - max_counts (int, optional): Maximum number of counts per gene
54
    - max_cells (int, optional): Maximum number of cells expressing gene
55
    - inplace (bool): Modify adata in place
56
    - copy (bool): Return copy instead of modifying
57
    
58
    Returns:
59
    AnnData or None: Filtered object (if copy=True)
60
    """
61
```
62

63
### Normalization and Transformation
64

65
Normalize and transform count data to make it suitable for analysis.
66

67
```python { .api }
68
def normalize_total(adata, target_sum=None, exclude_highly_expressed=False, max_fraction=0.05, key_added=None, layer=None, inplace=True):
69
    """
70
    Normalize counts per cell to a common library size.
71
    
72
    Parameters:
73
    - adata (AnnData): Annotated data object
74
    - target_sum (float, optional): Target sum for normalization (default: median)
75
    - exclude_highly_expressed (bool): Exclude highly expressed genes from calculation
76
    - max_fraction (float): Maximum fraction of total counts for a gene
77
    - key_added (str, optional): Key to add normalization factors to obs
78
    - layer (str, optional): Layer to normalize
79
    - inplace (bool): Modify adata in place
80
    
81
    Returns:
82
    dict or None: Normalized matrix under 'X' and per-cell factors under 'norm_factor' (if not inplace)
83
    """
84

85
def normalize_per_cell(adata, counts_per_cell_after=None, counts_per_cell=None, key_n_counts='n_counts', copy=False):
86
    """
87
    Normalize total counts per cell (deprecated - use normalize_total).
88
    
89
    Parameters:
90
    - adata (AnnData): Annotated data object
91
    - counts_per_cell_after (float, optional): Target counts per cell
92
    - counts_per_cell (array, optional): Current counts per cell
93
    - key_n_counts (str): Key for count information
94
    - copy (bool): Return copy
95
    
96
    Returns:
97
    AnnData or None: Normalized object (if copy=True)
98
    """
99

100
def log1p(adata, base=None, copy=False, chunked=False, chunk_size=None, layer=None, obsm=None):
101
    """
102
    Logarithmize the data matrix: X = log(X + 1).
103
    
104
    Parameters:
105
    - adata (AnnData): Annotated data object
106
    - base (float, optional): Base for logarithm (default: natural log)
107
    - copy (bool): Return copy
108
    - chunked (bool): Process in chunks for large datasets
109
    - chunk_size (int, optional): Size of chunks
110
    - layer (str, optional): Layer to transform
111
    - obsm (str, optional): Obsm key to transform
112
    
113
    Returns:
114
    AnnData or None: Transformed object (if copy=True)
115
    """
116

117
def sqrt(adata, copy=False):
118
    """
119
    Square root transform the data matrix.
120
    
121
    Parameters:
122
    - adata (AnnData): Annotated data object
123
    - copy (bool): Return copy
124
    
125
    Returns:
126
    AnnData or None: Transformed object (if copy=True)
127
    """
128
```
129

130
### Scaling and Centering
131

132
Scale and center data for downstream analysis.
133

134
```python { .api }
135
def scale(adata, zero_center=True, max_value=None, copy=False, layer=None, obsm=None):
136
    """
137
    Scale data matrix to unit variance and optionally zero mean.
138
    
139
    Parameters:
140
    - adata (AnnData): Annotated data object
141
    - zero_center (bool): Center data to zero mean
142
    - max_value (float, optional): Clip values to maximum
143
    - copy (bool): Return copy
144
    - layer (str, optional): Layer to scale
145
    - obsm (str, optional): Obsm key to scale
146
    
147
    Returns:
148
    AnnData or None: Scaled object (if copy=True)
149
    """
150
```
151

152
### Feature Selection
153

154
Identify highly variable genes and other feature selection methods.
155

156
```python { .api }
157
def highly_variable_genes(adata, layer=None, n_top_genes=None, min_disp=0.5, max_disp=np.inf, min_mean=0.0125, max_mean=3, span=0.3, n_bins=20, flavor='seurat', subset=False, inplace=True, batch_key=None):
158
    """
159
    Identify highly variable genes across cells.
160
    
161
    Parameters:
162
    - adata (AnnData): Annotated data object
163
    - layer (str, optional): Layer to use for calculation
164
    - n_top_genes (int, optional): Number of top genes to select
165
    - min_disp (float): Minimum dispersion
166
    - max_disp (float): Maximum dispersion
167
    - min_mean (float): Minimum mean expression
168
    - max_mean (float): Maximum mean expression
169
    - span (float): Span for LOWESS fit
170
    - n_bins (int): Number of bins for binning
171
    - flavor (str): Method for calculation ('seurat', 'cell_ranger', 'seurat_v3')
172
    - subset (bool): Subset to highly variable genes
173
    - inplace (bool): Modify adata in place
174
    - batch_key (str, optional): Key in obs; if given, highly variable genes are selected within each batch separately and then merged
175
    
176
    Returns:
177
    AnnData or None: Modified object (if not inplace)
178
    """
179

180
def filter_genes_dispersion(adata, flavor='seurat', min_disp=None, max_disp=None, min_mean=None, max_mean=None, n_top_genes=None, log=True, subset=True, copy=False):
181
    """
182
    Filter genes by dispersion (deprecated - use highly_variable_genes).
183
    
184
    Parameters:
185
    - adata (AnnData): Annotated data object
186
    - flavor (str): Method for calculation
187
    - min_disp (float, optional): Minimum dispersion
188
    - max_disp (float, optional): Maximum dispersion
189
    - min_mean (float, optional): Minimum mean
190
    - max_mean (float, optional): Maximum mean
191
    - n_top_genes (int, optional): Number of top genes
192
    - log (bool): Log transform before calculation
193
    - subset (bool): Subset data
194
    - copy (bool): Return copy
195
    
196
    Returns:
197
    AnnData or None: Filtered object (if copy=True)
198
    """
199
```
200

201
### Dimensionality Reduction
202

203
Perform principal component analysis for dimensionality reduction.
204

205
```python { .api }
206
def pca(adata, n_comps=50, zero_center=True, svd_solver=None, random_state=0, return_info=False, use_highly_variable=None, dtype='float32', copy=False, chunked=False, chunk_size=None):
207
    """
208
    Principal component analysis.
209
    
210
    Parameters:
211
    - adata (AnnData): Annotated data object
212
    - n_comps (int): Number of principal components to compute
213
    - zero_center (bool): Zero center the data
214
    - svd_solver (str, optional): SVD solver ('arpack', 'randomized', 'auto')
215
    - random_state (int): Random seed
216
    - return_info (bool): Return additional information
217
    - use_highly_variable (bool, optional): Use only highly variable genes
218
    - dtype (str): Data type for computation
219
    - copy (bool): Return copy
220
    - chunked (bool): Process in chunks
221
    - chunk_size (int, optional): Chunk size
222
    
223
    Returns:
224
    AnnData or None: Object with PCA results (if copy=True)
225
    """
226
```
227

228
### Neighborhood Graph Construction
229

230
Compute neighborhood graphs for downstream analysis.
231

232
```python { .api }
233
def neighbors(adata, n_neighbors=15, n_pcs=None, use_rep=None, knn=True, method='umap', transformer=None, metric='euclidean', metric_kwds={}, random_state=0, key_added=None, copy=False):
234
    """
235
    Compute the nearest neighbors distance matrix and neighborhood graph.
236
    
237
    Parameters:
238
    - adata (AnnData): Annotated data object
239
    - n_neighbors (int): Number of nearest neighbors
240
    - n_pcs (int, optional): Number of PCs to use
241
    - use_rep (str, optional): Representation to use ('X_pca', etc.)
242
    - knn (bool): Use k-nearest neighbors
243
    - method (str): Method for connectivity ('umap', 'gauss')
244
    - transformer (object, optional): Custom transformer
245
    - metric (str): Distance metric
246
    - metric_kwds (dict): Additional metric parameters
247
    - random_state (int): Random seed
248
    - key_added (str, optional): Key for storing results
249
    - copy (bool): Return copy
250
    
251
    Returns:
252
    AnnData or None: Object with neighbors graph (if copy=True)
253
    """
254
```
255

256
### Batch Effect Correction
257

258
Correct for batch effects and technical variation.
259

260
```python { .api }
261
def combat(adata, key='batch', covariates=None, inplace=True):
262
    """
263
    ComBat batch effect correction.
264
    
265
    Parameters:
266
    - adata (AnnData): Annotated data object
267
    - key (str): Key in obs containing batch information
268
    - covariates (list, optional): Additional covariates to preserve
269
    - inplace (bool): Modify adata in place
270
    
271
    Returns:
272
    numpy.ndarray or None: Batch-corrected data matrix (if not inplace); otherwise adata.X is updated
273
    """
274
```
275

276
### Doublet Detection
277

278
Detect potential cell doublets using Scrublet.
279

280
```python { .api }
281
def scrublet(adata, adata_sim=None, sim_doublet_ratio=2.0, n_neighbors=None, expected_doublet_rate=0.05, stdev_doublet_rate=0.02, synthetic_doublet_umi_subsampling=1.0, knn_dist_metric='euclidean', normalize_variance=True, log_transform=False, mean_center=True, n_prin_comps=30, use_approx_neighbors=None, get_doublet_neighbor_parents=False, random_state=0, copy=False):
282
    """
283
    Predict cell doublets using Scrublet.
284
    
285
    Parameters:
286
    - adata (AnnData): Annotated data object
287
    - adata_sim (AnnData, optional): Simulated doublets
288
    - sim_doublet_ratio (float): Ratio of simulated doublets
289
    - n_neighbors (int, optional): Number of neighbors for KNN graph
290
    - expected_doublet_rate (float): Expected doublet rate
291
    - stdev_doublet_rate (float): Standard deviation of doublet rate
292
    - synthetic_doublet_umi_subsampling (float): UMI subsampling rate
293
    - knn_dist_metric (str): Distance metric for KNN
294
    - normalize_variance (bool): Normalize variance
295
    - log_transform (bool): Log transform data
296
    - mean_center (bool): Mean center data
297
    - n_prin_comps (int): Number of principal components
298
    - use_approx_neighbors (bool): Use approximate neighbors
299
    - get_doublet_neighbor_parents (bool): Get doublet neighbor parents
300
    - random_state (int): Random seed
301
    - copy (bool): Return copy
302
    
303
    Returns:
304
    AnnData or None: Object with doublet scores (if copy=True)
305
    """
306

307
def scrublet_simulate_doublets(adata, sim_doublet_ratio=2.0, synthetic_doublet_umi_subsampling=1.0, random_state=0):
308
    """
309
    Simulate doublets for Scrublet analysis.
310
    
311
    Parameters:
312
    - adata (AnnData): Annotated data object
313
    - sim_doublet_ratio (float): Ratio of simulated doublets
314
    - synthetic_doublet_umi_subsampling (float): UMI subsampling rate
315
    - random_state (int): Random seed
316
    
317
    Returns:
318
    AnnData: Simulated doublets
319
    """
320
```
321

322
### Utility Functions
323

324
Additional preprocessing utilities.
325

326
```python { .api }
327
def downsample_counts(adata, counts_per_cell=None, total_counts=None, random_state=0, replace=False, copy=False):
328
    """
329
    Downsample counts per cell.
330
    
331
    Parameters:
332
    - adata (AnnData): Annotated data object
333
    - counts_per_cell (int, optional): Target counts per cell
334
    - total_counts (int, optional): Total target counts
335
    - random_state (int): Random seed
336
    - replace (bool): Sample with replacement
337
    - copy (bool): Return copy
338
    
339
    Returns:
340
    AnnData or None: Downsampled object (if copy=True)
341
    """
342

343
def sample(adata, n_obs=None, fraction=None, copy=False, random_state=0):
344
    """
345
    Sample observations from the data.
346
    
347
    Parameters:
348
    - adata (AnnData): Annotated data object
349
    - n_obs (int, optional): Number of observations to sample
350
    - fraction (float, optional): Fraction of observations to sample
351
    - copy (bool): Return copy
352
    - random_state (int): Random seed
353
    
354
    Returns:
355
    AnnData or None: Sampled object (if copy=True)
356
    """
357

358
def regress_out(adata, keys, n_jobs=None, copy=False):
359
    """
360
    Regress out unwanted sources of variation.
361
    
362
    Parameters:
363
    - adata (AnnData): Annotated data object
364
    - keys (list): Keys in obs to regress out
365
    - n_jobs (int, optional): Number of parallel jobs
366
    - copy (bool): Return copy
367
    
368
    Returns:
369
    AnnData or None: Corrected object (if copy=True)
370
    """
371
```
372

373
### Recipe Functions
374

375
Predefined preprocessing workflows based on published methods.
376

377
```python { .api }
378
def recipe_seurat(adata, log=True, plot=False, copy=False):
379
    """
380
    Seurat-like preprocessing recipe.
381
    
382
    Parameters:
383
    - adata (AnnData): Annotated data object
384
    - log (bool): Apply log transformation
385
    - plot (bool): Generate plots
386
    - copy (bool): Return copy
387
    
388
    Returns:
389
    AnnData or None: Preprocessed object (if copy=True)
390
    """
391

392
def recipe_weinreb17(adata, log=True, mean_threshold=0.01, cv_threshold=2, n_top_genes=1000, plot=True, copy=False):
393
    """
394
    Preprocessing recipe from Weinreb et al. 2017.
395
    
396
    Parameters:
397
    - adata (AnnData): Annotated data object
398
    - log (bool): Apply log transformation
399
    - mean_threshold (float): Mean expression threshold
400
    - cv_threshold (float): Coefficient of variation threshold
401
    - n_top_genes (int): Number of top genes to select
402
    - plot (bool): Generate plots
403
    - copy (bool): Return copy
404
    
405
    Returns:
406
    AnnData or None: Preprocessed object (if copy=True)
407
    """
408

409
def recipe_zheng17(adata, n_top_genes=1000, log=True, plot=False, copy=False):
410
    """
411
    Preprocessing recipe from Zheng et al. 2017.
412
    
413
    Parameters:
414
    - adata (AnnData): Annotated data object
415
    - n_top_genes (int): Number of top genes to select
416
    - log (bool): Apply log transformation  
417
    - plot (bool): Generate plots
418
    - copy (bool): Return copy
419
    
420
    Returns:
421
    AnnData or None: Preprocessed object (if copy=True)
422
    """
423
```
424

425
## Usage Examples
426

427
### Basic Preprocessing Pipeline
428

429
```python
430
import scanpy as sc
431
import numpy as np
432

433
# Load data
434
adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/hg19/')
435

436
# Basic filtering
437
sc.pp.filter_cells(adata, min_genes=200)  # filter cells
438
sc.pp.filter_genes(adata, min_cells=3)   # filter genes
439

440
# Calculate QC metrics
441
adata.var['mt'] = adata.var_names.str.startswith('MT-')
442
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
443

444
# Filter based on QC metrics
445
adata = adata[adata.obs.n_genes_by_counts < 2500, :]
446
adata = adata[adata.obs.pct_counts_mt < 20, :]
447

448
# Normalization and log transformation
449
sc.pp.normalize_total(adata, target_sum=1e4)
450
sc.pp.log1p(adata)
451

452
# Feature selection
453
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
454
adata.raw = adata  # save the full data
455
adata = adata[:, adata.var.highly_variable]
456

457
# Scaling
458
sc.pp.scale(adata, max_value=10)
459

460
# PCA
461
sc.pp.pca(adata, svd_solver='arpack')
462

463
# Neighborhood graph
464
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
465
```
466

467
### Recipe-based Preprocessing
468

469
```python
470
# Use Seurat-like preprocessing
471
adata = sc.read_10x_mtx('data/')
472
sc.pp.recipe_seurat(adata, log=True, plot=False)
473
```

Version

Tile

Files

docs/preprocessing.md

docs/preprocessing.md