0
# Data Types and Missing Data
1
2
Extension data types, missing data handling, and type conversion utilities including nullable integer/boolean types, categorical data, and advanced missing value operations.
3
4
## Core Imports
5
6
```python
7
import pandas as pd
8
from pandas import isna, notna, Categorical, NA
9
```
10
11
## Capabilities
12
13
### Missing Data Detection
14
15
Functions to detect missing (and non-missing) values in scalars and array-like pandas objects.
16
17
```python { .api }
18
def isna(obj):
19
"""
20
Detect missing values for an array-like object.
21
22
Parameters:
23
- obj: scalar or array-like, object to check for null or missing values
24
25
Returns:
26
bool or array-like of bool, boolean mask indicating missing values
27
"""
28
29
def isnull(obj):
30
"""
31
Detect missing values for an array-like object.
32
33
Alias for isna().
34
35
Parameters:
36
- obj: scalar or array-like, object to check for null or missing values
37
38
Returns:
39
bool or array-like of bool, boolean mask indicating missing values
40
"""
41
42
def notna(obj):
43
"""
44
Detect existing (non-missing) values.
45
46
Parameters:
47
- obj: scalar or array-like, object to check for non-null values
48
49
Returns:
50
bool or array-like of bool, boolean mask indicating non-missing values
51
"""
52
53
def notnull(obj):
54
"""
55
Detect existing (non-missing) values.
56
57
Alias for notna().
58
59
Parameters:
60
- obj: scalar or array-like, object to check for non-null values
61
62
Returns:
63
bool or array-like of bool, boolean mask indicating non-missing values
64
"""
65
```
66
67
### Categorical Data
68
69
Categorical data type for efficient storage and computation of repetitive data.
70
71
```python { .api }
72
class Categorical:
73
def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False):
74
"""
75
Represent a categorical variable in classic R / S-plus fashion.
76
77
Parameters:
78
- values: list-like, values for the categorical
79
- categories: Index-like, unique categories for this categorical
80
- ordered: bool, whether categories have meaningful order
81
- dtype: CategoricalDtype, dtype for the categorical
82
"""
83
84
def add_categories(self, new_categories, inplace=False):
85
"""Add new categories."""
86
87
def remove_categories(self, removals, inplace=False):
88
"""Remove categories."""
89
90
def rename_categories(self, new_categories, inplace=False):
91
"""Rename categories."""
92
93
def reorder_categories(self, new_categories, ordered=None, inplace=False):
94
"""Reorder categories."""
95
96
def remove_unused_categories(self, inplace=False):
97
"""Remove categories not in use."""
98
99
def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
100
"""Set categories to specified new_categories."""
101
102
def as_ordered(self, inplace=False):
103
"""Set Categorical to be ordered."""
104
105
def as_unordered(self, inplace=False):
106
"""Set Categorical to be unordered."""
107
108
@property
109
def categories(self):
110
"""The categories of this categorical."""
111
112
@property
113
def ordered(self):
114
"""Whether the categories have an ordered relationship."""
115
116
@property
117
def codes(self):
118
"""The category codes of this categorical."""
119
120
def value_counts(self, sort=True, ascending=False, dropna=True):
121
"""Return counts of each category."""
122
123
class CategoricalDtype:
124
def __init__(self, categories=None, ordered=None):
125
"""
126
Type for categorical data with categories and ordered attributes.
127
128
Parameters:
129
- categories: sequence, categories for the dtype
130
- ordered: bool, whether the categories are ordered
131
"""
132
133
@property
134
def categories(self):
135
"""Categorical categories."""
136
137
@property
138
def ordered(self):
139
"""Whether categories are ordered."""
140
```
141
142
### Extension Data Types
143
144
Specialized data types that extend pandas' capabilities beyond NumPy types.
145
146
```python { .api }
147
class StringDtype:
148
def __init__(self, storage=None):
149
"""
150
Extension dtype for string data.
151
152
Parameters:
153
- storage: str, storage type ('python' or 'pyarrow')
154
"""
155
156
class BooleanDtype:
157
def __init__(self):
158
"""Extension dtype for boolean data with missing value support."""
159
160
class Int8Dtype:
161
def __init__(self):
162
"""Extension dtype for nullable 8-bit integer data."""
163
164
class Int16Dtype:
165
def __init__(self):
166
"""Extension dtype for nullable 16-bit integer data."""
167
168
class Int32Dtype:
169
def __init__(self):
170
"""Extension dtype for nullable 32-bit integer data."""
171
172
class Int64Dtype:
173
def __init__(self):
174
"""Extension dtype for nullable 64-bit integer data."""
175
176
class UInt8Dtype:
177
def __init__(self):
178
"""Extension dtype for nullable 8-bit unsigned integer data."""
179
180
class UInt16Dtype:
181
def __init__(self):
182
"""Extension dtype for nullable 16-bit unsigned integer data."""
183
184
class UInt32Dtype:
185
def __init__(self):
186
"""Extension dtype for nullable 32-bit unsigned integer data."""
187
188
class UInt64Dtype:
189
def __init__(self):
190
"""Extension dtype for nullable 64-bit unsigned integer data."""
191
192
class Float32Dtype:
193
def __init__(self):
194
"""Extension dtype for nullable 32-bit floating point data."""
195
196
class Float64Dtype:
197
def __init__(self):
198
"""Extension dtype for nullable 64-bit floating point data."""
199
200
class PeriodDtype:
201
def __init__(self, freq=None):
202
"""
203
Extension dtype for Period data.
204
205
Parameters:
206
- freq: str or DateOffset, frequency of the Period
207
"""
208
209
class IntervalDtype:
210
def __init__(self, subtype=None, closed=None):
211
"""
212
Extension dtype for Interval data.
213
214
Parameters:
215
- subtype: str or numpy dtype, subtype of interval
216
- closed: str, whether intervals are closed ('left', 'right', 'both', 'neither')
217
"""
218
219
class DatetimeTZDtype:
220
def __init__(self, unit='ns', tz=None):
221
"""
222
Extension dtype for timezone-aware datetime data.
223
224
Parameters:
225
- tz: str or tzinfo, timezone information
226
- unit: str, unit of precision ('ns', 'us', 'ms', 's')
227
"""
228
229
class SparseDtype:
230
def __init__(self, dtype=numpy.float64, fill_value=None):
231
"""
232
Extension dtype for sparse data.
233
234
Parameters:
235
- dtype: str, numpy.dtype, ExtensionDtype, the dtype of non-sparse values
236
- fill_value: scalar, value used for sparse locations
237
"""
238
```
239
240
### Arrow Integration
241
242
Apache Arrow-backed data types for improved performance and interoperability.
243
244
```python { .api }
245
class ArrowDtype:
246
def __init__(self, pyarrow_dtype):
247
"""
248
Extension dtype for PyArrow data types.
249
250
Parameters:
251
- pyarrow_dtype: pyarrow.DataType, PyArrow data type
252
"""
253
254
@property
255
def pyarrow_dtype(self):
256
"""Return the PyArrow data type."""
257
258
@property
259
def name(self):
260
"""Return the name of the data type."""
261
262
@property
263
def type(self):
264
"""Return the scalar type for the array."""
265
```
266
267
### Array Creation and Conversion
268
269
Functions to create pandas arrays and convert between different array types.
270
271
```python { .api }
272
def array(data, dtype=None, copy=True):
273
"""
274
Create an ExtensionArray from the input data.
275
276
Parameters:
277
- data: Sequence, 1-dimensional list, Series, Index, or ExtensionArray
278
- dtype: str, np.dtype, or ExtensionDtype, dtype for the array
279
- copy: bool, whether to copy the data
280
281
Returns:
282
ExtensionArray, newly created array
283
"""
284
285
def factorize(values, sort=False, na_sentinel=-1, use_na_sentinel=True, size_hint=None):
286
"""
287
Encode the object as an enumerated type or categorical variable.
288
289
Parameters:
290
- values: sequence, 1-d array-like
291
- sort: bool, sort uniques
292
- na_sentinel: int, value to mark missing values in the codes (deprecated since pandas 1.5 and removed in 2.0; prefer use_na_sentinel)
293
- use_na_sentinel: bool, use na_sentinel for missing values
294
- size_hint: int, hint to the hashtable sizer
295
296
Returns:
297
tuple of (codes, uniques)
298
"""
299
300
def unique(values):
301
"""
302
Return unique values based on a hash table.
303
304
Parameters:
305
- values: 1d array-like
306
307
Returns:
308
ndarray or ExtensionArray, unique values
309
"""
310
311
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True):
312
"""
313
Compute a histogram of the 1D array values.
314
315
Parameters:
316
- values: 1d array-like
317
- sort: bool, sort the result by counts (frequencies) rather than keeping the order of appearance
318
- ascending: bool, sort in ascending order
319
- normalize: bool, return relative frequencies
320
- bins: int, rather than count values, group them into half-open bins
321
- dropna: bool, don't include counts of NaN
322
323
Returns:
324
Series
325
"""
326
```
327
328
### Type Checking Functions
329
330
Functions to check data types and properties of pandas objects.
331
332
```python { .api }
333
# Available in pandas.api.types
334
def infer_dtype(value, skipna=True):
335
"""
336
Efficiently infer the type of the passed value (a scalar or list-like of values).
337
338
Parameters:
339
- value: object, object whose type is to be inferred
340
- skipna: bool, ignore NaN values when inferring type
341
342
Returns:
343
str, type of the object
344
"""
345
346
def is_any_real_numeric_dtype(arr_or_dtype):
347
"""Check whether the provided array or dtype is a real number data type."""
348
349
def is_bool_dtype(arr_or_dtype):
350
"""Check whether the provided array or dtype is a boolean data type."""
351
352
def is_categorical_dtype(arr_or_dtype):
353
"""Check whether the provided array or dtype is Categorical data type."""
354
355
def is_complex_dtype(arr_or_dtype):
356
"""Check whether the provided array or dtype is a complex data type."""
357
358
def is_datetime64_any_dtype(arr_or_dtype):
359
"""Check whether the provided array or dtype is a datetime64 data type, either timezone-naive or timezone-aware."""
360
361
def is_datetime64_dtype(arr_or_dtype):
362
"""Check whether the provided array or dtype is a timezone-naive datetime64 data type (any resolution, not only nanoseconds)."""
363
364
def is_datetime64_ns_dtype(arr_or_dtype):
365
"""Check whether the provided array or dtype is datetime64[ns] data type."""
366
367
def is_datetime64tz_dtype(arr_or_dtype):
368
"""Check whether the provided array or dtype has a timezone-aware datetime64 data type."""
369
370
def is_extension_array_dtype(arr_or_dtype):
371
"""Check whether the provided array or dtype is an extension data type."""
372
373
def is_float_dtype(arr_or_dtype):
374
"""Check whether the provided array or dtype is a float data type."""
375
376
def is_integer_dtype(arr_or_dtype):
377
"""Check whether the provided array or dtype is an integer data type."""
378
379
def is_interval_dtype(arr_or_dtype):
380
"""Check whether the provided array or dtype is Interval data type."""
381
382
def is_numeric_dtype(arr_or_dtype):
383
"""Check whether the provided array or dtype is a numeric data type."""
384
385
def is_object_dtype(arr_or_dtype):
386
"""Check whether the provided array or dtype is object data type."""
387
388
def is_period_dtype(arr_or_dtype):
389
"""Check whether the provided array or dtype is Period data type."""
390
391
def is_signed_integer_dtype(arr_or_dtype):
392
"""Check whether the provided array or dtype is a signed integer data type."""
393
394
def is_string_dtype(arr_or_dtype):
395
"""Check whether the provided array or dtype is a string data type."""
396
397
def is_timedelta64_dtype(arr_or_dtype):
398
"""Check whether the provided array or dtype is timedelta64 data type."""
399
400
def is_timedelta64_ns_dtype(arr_or_dtype):
401
"""Check whether the provided array or dtype is timedelta64[ns] data type."""
402
403
def is_unsigned_integer_dtype(arr_or_dtype):
404
"""Check whether the provided array or dtype is an unsigned integer data type."""
405
406
def pandas_dtype(dtype):
407
"""
408
Convert input into a pandas only dtype object or a numpy dtype object.
409
410
Parameters:
411
- dtype: object to be converted
412
413
Returns:
414
np.dtype or pandas dtype
415
"""
416
```
417
418
### Extension Arrays
419
420
Specialized array classes that provide the foundation for extension data types.
421
422
```python { .api }
423
class BooleanArray:
424
def __init__(self, values, mask, copy=False):
425
"""
426
Array of boolean (True/False) data with missing values.
427
428
Parameters:
429
- values: numpy.ndarray, boolean array
430
- mask: numpy.ndarray, boolean array indicating missing values
431
- copy: bool, copy the input arrays
432
"""
433
434
class IntegerArray:
435
def __init__(self, values, mask, copy=False):
436
"""
437
Array of integer values with missing value support.
438
439
Parameters:
440
- values: numpy.ndarray, integer array
441
- mask: numpy.ndarray, boolean array indicating missing values
442
- copy: bool, copy the input arrays
443
"""
444
445
class FloatingArray:
446
def __init__(self, values, mask, copy=False):
447
"""
448
Array of floating point values with missing value support.
449
450
Parameters:
451
- values: numpy.ndarray, float array
452
- mask: numpy.ndarray, boolean array indicating missing values
453
- copy: bool, copy the input arrays
454
"""
455
456
class StringArray:
457
def __init__(self, values, copy=False):
458
"""
459
Extension array for string data in a pandas Series or DataFrame.
460
461
Parameters:
462
- values: array-like, sequence of strings
463
- copy: bool, copy the input array
464
"""
465
466
class IntervalArray:
467
def __init__(self, data, closed=None, dtype=None, copy=False, verify_integrity=True):
468
"""
469
Pandas array for interval data that are closed on the same side.
470
471
Parameters:
472
- data: array-like (1-dimensional), array of Interval objects
473
- closed: str, whether intervals are closed ('left', 'right', 'both', 'neither')
474
- dtype: IntervalDtype, dtype for the IntervalArray
475
- copy: bool, copy the input data
476
- verify_integrity: bool, verify data integrity
477
"""
478
479
class PeriodArray:
480
def __init__(self, values, dtype=None, freq=None, copy=False):
481
"""
482
Pandas array for storing Period data.
483
484
Parameters:
485
- values: Union[PeriodArray, Series[period], ndarray[int], PeriodIndex]
486
- dtype: PeriodDtype, optional
487
- freq: str or period object, frequency
488
- copy: bool, copy the input data
489
"""
490
491
class DatetimeArray:
492
def __init__(self, values, dtype=None, freq=None, copy=False):
493
"""
494
Pandas array for datetime64 data.
495
496
Parameters:
497
- values: Series, Index, DatetimeArray, ndarray
498
- dtype: numpy.dtype or DatetimeTZDtype
499
- freq: str or Offset
500
- copy: bool, copy the input data
501
"""
502
503
class TimedeltaArray:
504
def __init__(self, values, dtype=None, freq=None, copy=False):
505
"""
506
Pandas array for timedelta64 data.
507
508
Parameters:
509
- values: array-like, sequence of timedelta-like objects
510
- dtype: numpy.dtype
511
- freq: str or Offset
512
- copy: bool, copy the input data
513
"""
514
515
class SparseArray:
516
def __init__(self, data, sparse_index=None, fill_value=None, kind='integer', dtype=None, copy=False):
517
"""
518
An ExtensionArray for storing sparse data.
519
520
Parameters:
521
- data: array-like or scalar
522
- sparse_index: SparseIndex, locations of non-fill_value entries
523
- fill_value: scalar, entries matching this value are omitted from representation
524
- kind: str, sparse index kind ('integer' or 'block')
525
- dtype: numpy.dtype
526
- copy: bool, copy the input data
527
"""
528
```
529
530
## Advanced Type Operations
531
532
### Categorical Utilities
533
534
```python { .api }
535
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
536
"""
537
Combine list-like of Categorical-like into a single Categorical.
538
539
Parameters:
540
- to_union: list-like, Categorical, CategoricalIndex, or Series with categorical dtype
541
- sort_categories: bool, sort resulting categories
542
- ignore_order: bool, ignore the ordered attribute of the inputs; when True the result is an unordered Categorical
543
544
Returns:
545
Categorical
546
"""
547
548
def concat_categoricals(to_concat, axis=0, join='outer', ignore_index=False):
549
"""
550
Concatenate Categoricals.
551
552
Parameters:
553
- to_concat: list of Categoricals
554
- axis: int, axis to concatenate along
555
- join: str, join method for categories
556
- ignore_index: bool, reset index in result
557
558
Returns:
559
Categorical
560
"""
561
```
562
563
### Nullable Integer Construction
564
565
```python { .api }
566
# Constructor functions for nullable integer arrays
567
def Int8Array(values, mask=None, copy=False):
568
"""Construct Int8Array."""
569
570
def Int16Array(values, mask=None, copy=False):
571
"""Construct Int16Array."""
572
573
def Int32Array(values, mask=None, copy=False):
574
"""Construct Int32Array."""
575
576
def Int64Array(values, mask=None, copy=False):
577
"""Construct Int64Array."""
578
579
def UInt8Array(values, mask=None, copy=False):
580
"""Construct UInt8Array."""
581
582
def UInt16Array(values, mask=None, copy=False):
583
"""Construct UInt16Array."""
584
585
def UInt32Array(values, mask=None, copy=False):
586
"""Construct UInt32Array."""
587
588
def UInt64Array(values, mask=None, copy=False):
589
"""Construct UInt64Array."""
590
```
591
592
## Types
593
594
```python { .api }
595
# Missing value sentinels
596
NA: object # Pandas missing value for extension dtypes
597
NaT: object # Not-a-Time for datetime/timedelta
598
599
# Extension dtype base classes
600
class ExtensionDtype:
601
"""Base class for custom data types."""
602
603
@property
604
def name(self):
605
"""Return a string representation of the dtype."""
606
607
@property
608
def type(self):
609
"""Return the scalar type for the array."""
610
611
@classmethod
612
def construct_from_string(cls, string):
613
"""Construct this type from a string."""
614
615
# Categorical ordering
616
CategoricalOrdering = bool
617
618
# Dtype inference results
619
InferredType = Literal[
620
'boolean', 'integer', 'floating', 'complex', 'string', 'unicode',
621
'mixed', 'mixed-integer', 'mixed-integer-float', 'decimal',
622
'datetime', 'datetime64', 'timedelta', 'timedelta64',
623
'period', 'categorical', 'interval', 'bytes', 'empty'
624
]
625
626
# Arrow dtype string representations
627
ArrowDtypeStr = str # PyArrow dtype string like 'int64[pyarrow]'
628
629
# Sparse array kinds
630
SparseKind = Literal['integer', 'block']
631
```