0
# Preprocessing
1
2
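Scanpy's preprocessing module (`scanpy.pp`) provides a comprehensive pipeline for preparing raw single-cell data for downstream analysis. This includes quality control, filtering, normalization, scaling, feature selection, dimensionality reduction, neighborhood graph construction, batch effect correction, doublet detection, and predefined preprocessing recipes.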
3
4
## Capabilities
5
6
### Quality Control and Filtering
7
8
Calculate quality control metrics and filter cells and genes based on various criteria.
9
10
```python { .api }
11
def calculate_qc_metrics(adata, expr_type='counts', var_type='genes', qc_vars=None, percent_top=None, log1p=False, inplace=False):
12
"""
13
Calculate quality control metrics for cells and genes.
14
15
Parameters:
16
- adata (AnnData): Annotated data object
17
- expr_type (str): Name for expression type in metrics
18
- var_type (str): Name for variable type in metrics
19
- qc_vars (list, optional): Names of boolean columns in adata.var (e.g. ['mt']) marking gene sets for which per-cell count percentages are computed
20
- percent_top (list, optional): Calculate percentage of top expressed genes
21
- log1p (bool): Additionally compute log1p-transformed versions of the metrics
22
- inplace (bool): Modify adata in place
23
24
Returns:
25
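None or tuple: (obs_metrics, var_metrics) as two pandas DataFrames if not inplace; otherwise None and the metrics are added to adata.obs and adata.var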
26
"""
27
28
def filter_cells(adata, min_counts=None, min_genes=None, max_counts=None, max_genes=None, inplace=True, copy=False):
29
"""
30
Filter cells based on counts and numbers of genes expressed.
31
32
Parameters:
33
- adata (AnnData): Annotated data object
34
- min_counts (int, optional): Minimum number of counts per cell
35
- min_genes (int, optional): Minimum number of genes per cell
36
- max_counts (int, optional): Maximum number of counts per cell
37
- max_genes (int, optional): Maximum number of genes per cell
38
- inplace (bool): Modify adata in place
39
- copy (bool): Return copy instead of modifying
40
41
Returns:
42
AnnData, tuple, or None: Filtered copy if copy=True; (boolean cell mask, per-cell counts or gene numbers) if inplace=False; otherwise None
43
"""
44
45
def filter_genes(adata, min_counts=None, min_cells=None, max_counts=None, max_cells=None, inplace=True, copy=False):
46
"""
47
Filter genes based on counts and numbers of cells expressed.
48
49
Parameters:
50
- adata (AnnData): Annotated data object
51
- min_counts (int, optional): Minimum number of counts per gene
52
- min_cells (int, optional): Minimum number of cells expressing gene
53
- max_counts (int, optional): Maximum number of counts per gene
54
- max_cells (int, optional): Maximum number of cells expressing gene
55
- inplace (bool): Modify adata in place
56
- copy (bool): Return copy instead of modifying
57
58
Returns:
59
AnnData, tuple, or None: Filtered copy if copy=True; (boolean gene mask, per-gene counts or cell numbers) if inplace=False; otherwise None
60
"""
61
```
62
63
### Normalization and Transformation
64
65
Normalize and transform count data to make it suitable for analysis.
66
67
```python { .api }
68
def normalize_total(adata, target_sum=None, exclude_highly_expressed=False, max_fraction=0.05, key_added=None, layer=None, inplace=True):
69
"""
70
Normalize counts per cell to a common library size.
71
72
Parameters:
73
- adata (AnnData): Annotated data object
74
- target_sum (float, optional): Total count each cell is scaled to; if None, the median of total counts per cell before normalization is used
75
- exclude_highly_expressed (bool): Exclude highly expressed genes from calculation
76
- max_fraction (float): Maximum fraction of total counts for a gene
77
- key_added (str, optional): Key to add normalization factors to obs
78
- layer (str, optional): Layer to normalize
79
- inplace (bool): Modify adata in place
80
81
Returns:
82
dict or None: Dictionary with the normalized matrix and normalization factors (if not inplace); otherwise None and adata is updated
83
"""
84
85
def normalize_per_cell(adata, counts_per_cell_after=None, counts_per_cell=None, key_n_counts='n_counts', copy=False):
86
"""
87
Normalize total counts per cell (deprecated - use normalize_total).
88
89
Parameters:
90
- adata (AnnData): Annotated data object
91
- counts_per_cell_after (float, optional): Target counts per cell
92
- counts_per_cell (array, optional): Current counts per cell
93
- key_n_counts (str): Key for count information
94
- copy (bool): Return copy
95
96
Returns:
97
AnnData or None: Normalized object (if copy=True)
98
"""
99
100
def log1p(adata, base=None, copy=False, chunked=False, chunk_size=None, layer=None, obsm=None):
101
"""
102
Logarithmize the data matrix: X = log(X + 1).
103
104
Parameters:
105
- adata (AnnData): Annotated data object
106
- base (float, optional): Base for logarithm (default: natural log)
107
- copy (bool): Return copy
108
- chunked (bool): Process in chunks for large datasets
109
- chunk_size (int, optional): Size of chunks
110
- layer (str, optional): Layer to transform
111
- obsm (str, optional): Obsm key to transform
112
113
Returns:
114
AnnData or None: Transformed object (if copy=True)
115
"""
116
117
def sqrt(adata, copy=False):
118
"""
119
Square root transform the data matrix.
120
121
Parameters:
122
- adata (AnnData): Annotated data object
123
- copy (bool): Return copy
124
125
Returns:
126
AnnData or None: Transformed object (if copy=True)
127
"""
128
```
129
130
### Scaling and Centering
131
132
Scale and center data for downstream analysis.
133
134
```python { .api }
135
def scale(adata, zero_center=True, max_value=None, copy=False, layer=None, obsm=None):
136
"""
137
Scale data matrix to unit variance and optionally zero mean.
138
139
Parameters:
140
- adata (AnnData): Annotated data object
141
- zero_center (bool): Center data to zero mean
142
- max_value (float, optional): Clip values to maximum
143
- copy (bool): Return copy
144
- layer (str, optional): Layer to scale
145
- obsm (str, optional): Obsm key to scale
146
147
Returns:
148
AnnData or None: Scaled object (if copy=True)
149
"""
150
```
151
152
### Feature Selection
153
154
Identify highly variable genes and other feature selection methods.
155
156
```python { .api }
157
def highly_variable_genes(adata, layer=None, n_top_genes=None, min_disp=0.5, max_disp=np.inf, min_mean=0.0125, max_mean=3, span=0.3, n_bins=20, flavor='seurat', subset=False, inplace=True, batch_key=None):
158
"""
159
Identify highly variable genes across cells.
160
161
Parameters:
162
- adata (AnnData): Annotated data object
163
- layer (str, optional): Layer to use for calculation
164
- n_top_genes (int, optional): Number of top genes to select
165
- min_disp (float): Minimum dispersion
166
- max_disp (float): Maximum dispersion
167
- min_mean (float): Minimum mean expression
168
- max_mean (float): Maximum mean expression
169
- span (float): Span for LOWESS fit
170
- n_bins (int): Number of bins for binning
171
- flavor (str): Method for calculation ('seurat', 'cell_ranger', 'seurat_v3')
172
- subset (bool): Subset to highly variable genes
173
- inplace (bool): Modify adata in place
174
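- batch_key (str, optional): Key in obs identifying batches; highly variable genes are selected within each batch separately and then merged (this does not correct batch effects)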
175
176
Returns:
177
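pandas.DataFrame or None: Per-gene statistics including the highly_variable flag (if not inplace); otherwise None and the fields are written to adata.var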
178
"""
179
180
def filter_genes_dispersion(adata, flavor='seurat', min_disp=None, max_disp=None, min_mean=None, max_mean=None, n_top_genes=None, log=True, subset=True, copy=False):
181
"""
182
Filter genes by dispersion (deprecated - use highly_variable_genes).
183
184
Parameters:
185
- adata (AnnData): Annotated data object
186
- flavor (str): Method for calculation
187
- min_disp (float, optional): Minimum dispersion
188
- max_disp (float, optional): Maximum dispersion
189
- min_mean (float, optional): Minimum mean
190
- max_mean (float, optional): Maximum mean
191
- n_top_genes (int, optional): Number of top genes
192
- log (bool): Use the logarithm of the mean-to-variance ratio (dispersion)
193
- subset (bool): Subset data
194
- copy (bool): Return copy
195
196
Returns:
197
AnnData or None: Filtered object (if copy=True)
198
"""
199
```
200
201
### Dimensionality Reduction
202
203
Perform principal component analysis for dimensionality reduction.
204
205
```python { .api }
206
def pca(adata, n_comps=50, zero_center=True, svd_solver=None, random_state=0, return_info=False, use_highly_variable=None, dtype='float32', copy=False, chunked=False, chunk_size=None):
207
"""
208
Principal component analysis.
209
210
Parameters:
211
- adata (AnnData): Annotated data object
212
- n_comps (int): Number of principal components to compute
213
- zero_center (bool): Zero center the data
214
- svd_solver (str, optional): SVD solver ('arpack', 'randomized', 'auto')
215
- random_state (int): Random seed
216
- return_info (bool): Return additional information
217
- use_highly_variable (bool, optional): Use only highly variable genes
218
- dtype (str): Data type for computation
219
- copy (bool): Return copy
220
- chunked (bool): Process in chunks
221
- chunk_size (int, optional): Chunk size
222
223
Returns:
224
AnnData or None: Object with PCA results (if copy=True)
225
"""
226
```
227
228
### Neighborhood Graph Construction
229
230
Compute neighborhood graphs for downstream analysis.
231
232
```python { .api }
233
def neighbors(adata, n_neighbors=15, n_pcs=None, use_rep=None, knn=True, method='umap', transformer=None, metric='euclidean', metric_kwds={}, random_state=0, key_added=None, copy=False):
234
"""
235
Compute the nearest neighbors distance matrix and neighborhood graph.
236
237
Parameters:
238
- adata (AnnData): Annotated data object
239
- n_neighbors (int): Number of nearest neighbors
240
- n_pcs (int, optional): Number of PCs to use
241
- use_rep (str, optional): Representation to use ('X_pca', etc.)
242
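- knn (bool): If True, restrict the graph to the n_neighbors nearest neighbors (hard threshold); if False, use a Gaussian kernel that down-weights more distant neighbors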
243
- method (str): Method for connectivity ('umap', 'gauss')
244
- transformer (object, optional): Custom transformer
245
- metric (str): Distance metric
246
- metric_kwds (dict): Additional metric parameters
247
- random_state (int): Random seed
248
- key_added (str, optional): Key for storing results
249
- copy (bool): Return copy
250
251
Returns:
252
AnnData or None: Object with neighbors graph (if copy=True)
253
"""
254
```
255
256
### Batch Effect Correction
257
258
Correct for batch effects and technical variation.
259
260
```python { .api }
261
def combat(adata, key='batch', covariates=None, inplace=True):
262
"""
263
ComBat batch effect correction.
264
265
Parameters:
266
- adata (AnnData): Annotated data object
267
- key (str): Key in obs containing batch information
268
- covariates (list, optional): Additional covariates to preserve
269
- inplace (bool): Modify adata in place
270
271
Returns:
272
numpy.ndarray or None: Batch-corrected data matrix (if not inplace); otherwise None and adata.X is overwritten with the corrected values
273
"""
274
```
275
276
### Doublet Detection
277
278
Detect potential cell doublets using Scrublet.
279
280
```python { .api }
281
def scrublet(adata, adata_sim=None, sim_doublet_ratio=2.0, n_neighbors=None, expected_doublet_rate=0.05, stdev_doublet_rate=0.02, synthetic_doublet_umi_subsampling=1.0, knn_dist_metric='euclidean', normalize_variance=True, log_transform=False, mean_center=True, n_prin_comps=30, use_approx_neighbors=True, get_doublet_neighbor_parents=False, random_state=0, copy=False):
282
"""
283
Predict cell doublets using Scrublet.
284
285
Parameters:
286
- adata (AnnData): Annotated data object
287
- adata_sim (AnnData, optional): Simulated doublets
288
- sim_doublet_ratio (float): Ratio of simulated doublets
289
- n_neighbors (int, optional): Number of neighbors for KNN graph
290
- expected_doublet_rate (float): Expected doublet rate
291
- stdev_doublet_rate (float): Standard deviation of doublet rate
292
- synthetic_doublet_umi_subsampling (float): UMI subsampling rate
293
- knn_dist_metric (str): Distance metric for KNN
294
- normalize_variance (bool): Normalize variance
295
- log_transform (bool): Log transform data
296
- mean_center (bool): Mean center data
297
- n_prin_comps (int): Number of principal components
298
- use_approx_neighbors (bool): Use approximate neighbors
299
- get_doublet_neighbor_parents (bool): Get doublet neighbor parents
300
- random_state (int): Random seed
301
- copy (bool): Return copy
302
303
Returns:
304
AnnData or None: Object with doublet scores (if copy=True)
305
"""
306
307
def scrublet_simulate_doublets(adata, sim_doublet_ratio=2.0, synthetic_doublet_umi_subsampling=1.0, random_state=0):
308
"""
309
Simulate doublets for Scrublet analysis.
310
311
Parameters:
312
- adata (AnnData): Annotated data object
313
- sim_doublet_ratio (float): Ratio of simulated doublets
314
- synthetic_doublet_umi_subsampling (float): UMI subsampling rate
315
- random_state (int): Random seed
316
317
Returns:
318
AnnData: Simulated doublets
319
"""
320
```
321
322
### Utility Functions
323
324
Additional preprocessing utilities.
325
326
```python { .api }
327
def downsample_counts(adata, counts_per_cell=None, total_counts=None, random_state=0, replace=False, copy=False):
328
"""
329
Downsample counts per cell.
330
331
Parameters:
332
- adata (AnnData): Annotated data object
333
- counts_per_cell (int, optional): Target counts per cell
334
- total_counts (int, optional): Total target counts
335
- random_state (int): Random seed
336
- replace (bool): Sample with replacement
337
- copy (bool): Return copy
338
339
Returns:
340
AnnData or None: Downsampled object (if copy=True)
341
"""
342
343
def sample(adata, n_obs=None, fraction=None, copy=False, random_state=0):
344
"""
345
Sample observations from the data.
346
347
Parameters:
348
- adata (AnnData): Annotated data object
349
- n_obs (int, optional): Number of observations to sample
350
- fraction (float, optional): Fraction of observations to sample
351
- copy (bool): Return copy
352
- random_state (int): Random seed
353
354
Returns:
355
AnnData or None: Sampled object (if copy=True)
356
"""
357
358
def regress_out(adata, keys, n_jobs=None, copy=False):
359
"""
360
Regress out unwanted sources of variation.
361
362
Parameters:
363
- adata (AnnData): Annotated data object
364
- keys (list): Keys in obs to regress out
365
- n_jobs (int, optional): Number of parallel jobs
366
- copy (bool): Return copy
367
368
Returns:
369
AnnData or None: Corrected object (if copy=True)
370
"""
371
```
372
373
### Recipe Functions
374
375
Predefined preprocessing workflows based on published methods.
376
377
```python { .api }
378
def recipe_seurat(adata, log=True, plot=True, copy=False):
379
"""
380
Seurat-like preprocessing recipe.
381
382
Parameters:
383
- adata (AnnData): Annotated data object
384
- log (bool): Apply log transformation
385
- plot (bool): Generate plots
386
- copy (bool): Return copy
387
388
Returns:
389
AnnData or None: Preprocessed object (if copy=True)
390
"""
391
392
def recipe_weinreb17(adata, log=True, mean_threshold=0.01, cv_threshold=2, n_top_genes=1000, plot=True, copy=False):
393
"""
394
Preprocessing recipe from Weinreb et al. 2017.
395
396
Parameters:
397
- adata (AnnData): Annotated data object
398
- log (bool): Apply log transformation
399
- mean_threshold (float): Mean expression threshold
400
- cv_threshold (float): Coefficient of variation threshold
401
- n_top_genes (int): Number of top genes to select
402
- plot (bool): Generate plots
403
- copy (bool): Return copy
404
405
Returns:
406
AnnData or None: Preprocessed object (if copy=True)
407
"""
408
409
def recipe_zheng17(adata, n_top_genes=1000, log=True, plot=True, copy=False):
410
"""
411
Preprocessing recipe from Zheng et al. 2017.
412
413
Parameters:
414
- adata (AnnData): Annotated data object
415
- n_top_genes (int): Number of top genes to select
416
- log (bool): Apply log transformation
417
- plot (bool): Generate plots
418
- copy (bool): Return copy
419
420
Returns:
421
AnnData or None: Preprocessed object (if copy=True)
422
"""
423
```
424
425
## Usage Examples
426
427
### Basic Preprocessing Pipeline
428
429
```python
430
import scanpy as sc
431
import numpy as np
432
433
# Load data
434
adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/hg19/')
435
436
# Basic filtering
437
sc.pp.filter_cells(adata, min_genes=200) # filter cells
438
sc.pp.filter_genes(adata, min_cells=3) # filter genes
439
440
# Calculate QC metrics
441
adata.var['mt'] = adata.var_names.str.startswith('MT-')
442
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
443
444
# Filter based on QC metrics
445
adata = adata[adata.obs.n_genes_by_counts < 2500, :]
446
adata = adata[adata.obs.pct_counts_mt < 20, :]
447
448
# Normalization and log transformation
449
sc.pp.normalize_total(adata, target_sum=1e4)
450
sc.pp.log1p(adata)
451
452
# Feature selection
453
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
454
adata.raw = adata # save the full data
455
adata = adata[:, adata.var.highly_variable]
456
457
# Scaling
458
sc.pp.scale(adata, max_value=10)
459
460
# PCA
461
sc.pp.pca(adata, svd_solver='arpack')
462
463
# Neighborhood graph
464
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
465
```
466
467
### Recipe-based Preprocessing
468
469
```python
470
# Use Seurat-like preprocessing
471
adata = sc.read_10x_mtx('data/')
472
sc.pp.recipe_seurat(adata, log=True, plot=False)
473
```