# Column-Level Comparison Utilities

Low-level functions for comparing individual columns and performing specialized comparisons. They are useful for building custom comparison logic and for integrating with other data-processing workflows.

## Capabilities

### Pandas Column Comparison

Functions for comparing individual Pandas Series, with tolerance support and specialized handling for different data types.

```python { .api }
def columns_equal(
    col_1: pd.Series[Any],
    col_2: pd.Series[Any],
    rel_tol: float = 0,
    abs_tol: float = 0,
    ignore_spaces: bool = False,
    ignore_case: bool = False
) -> pd.Series[bool]:
    """
    Compare two Pandas Series element-wise.

    Parameters:
    - col_1: First Series to compare
    - col_2: Second Series to compare
    - rel_tol: Relative tolerance for numeric comparisons
    - abs_tol: Absolute tolerance for numeric comparisons
    - ignore_spaces: Strip whitespace from string values
    - ignore_case: Ignore case in string comparisons

    Returns:
    Boolean Series indicating element-wise equality
    """

def compare_string_and_date_columns(
    col_1: pd.Series[Any],
    col_2: pd.Series[Any]
) -> pd.Series[bool]:
    """
    Compare string and date columns with specialized logic.

    Parameters:
    - col_1: First Series (string or date type)
    - col_2: Second Series (string or date type)

    Returns:
    Boolean Series indicating element-wise equality
    """

def calculate_max_diff(col_1: pd.Series[Any], col_2: pd.Series[Any]) -> float:
    """
    Calculate the maximum absolute difference between numeric columns.

    Parameters:
    - col_1: First numeric Series
    - col_2: Second numeric Series

    Returns:
    Maximum absolute difference as float
    """
```

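The tolerance semantics above can be sketched with `numpy.isclose`. This is a simplified illustration, not datacompy's actual implementation: `columns_equal_sketch` is a hypothetical name, and the real function's handling of mixed types and strings is more involved.

```python
import numpy as np
import pandas as pd

def columns_equal_sketch(col_1, col_2, rel_tol=0.0, abs_tol=0.0):
    """Element-wise numeric equality with tolerances; a pair of nulls counts as equal."""
    both_null = col_1.isna() & col_2.isna()
    # Fill nulls with different sentinels so a null never matches a real value.
    close = np.isclose(
        col_1.fillna(np.inf), col_2.fillna(-np.inf),
        rtol=rel_tol, atol=abs_tol,
    )
    return pd.Series(close, index=col_1.index) | both_null

col1 = pd.Series([1.0, 2.0, 3.0, None])
col2 = pd.Series([1.05, 2.5, 3.0, None])
print(columns_equal_sketch(col1, col2, abs_tol=0.1).tolist())
# [True, False, True, True]
```

Note that `np.isclose` uses `|a - b| <= atol + rtol * |b|`, so the relative tolerance is taken against the second operand.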
### DataFrame Utility Functions

Helper functions for DataFrame manipulation and analysis during comparison operations.

```python { .api }
def get_merged_columns(
    original_df: pd.DataFrame,
    merged_df: pd.DataFrame,
    suffix: str
) -> List[str]:
    """
    Get column names from a merged DataFrame, accounting for the merge suffix.

    Parameters:
    - original_df: Original DataFrame before merge
    - merged_df: DataFrame after merge operation
    - suffix: Suffix used in merge operation

    Returns:
    List of column names with the specified suffix
    """

def generate_id_within_group(
    dataframe: pd.DataFrame,
    join_columns: List[str]
) -> pd.Series[int]:
    """
    Generate unique identifiers within groups for deduplication.

    Parameters:
    - dataframe: DataFrame to generate IDs for
    - join_columns: Columns defining the groups

    Returns:
    Series of integer IDs unique within each group
    """
```

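The within-group ID described above can be approximated with pandas' `groupby(...).cumcount()`. This is a minimal sketch under that assumption (`generate_id_within_group_sketch` is a hypothetical stand-in; the real function also has to deal with null join keys):

```python
import pandas as pd

def generate_id_within_group_sketch(dataframe, join_columns):
    # Number rows 0, 1, 2, ... within each join-key group, so duplicate
    # keys can be told apart during a merge.
    return dataframe.groupby(join_columns).cumcount()

df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'A'], 'value': [1, 2, 3, 4, 5]})
print(generate_id_within_group_sketch(df, ['group']).tolist())
# [0, 1, 0, 1, 2]
```
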
### String Processing Utilities

Functions for normalizing string data during comparisons.

```python { .api }
def normalize_string_column(
    column: pd.Series,
    ignore_spaces: bool,
    ignore_case: bool
) -> pd.Series:
    """
    Normalize a string column for comparison.

    Parameters:
    - column: String Series to normalize
    - ignore_spaces: Strip leading/trailing whitespace
    - ignore_case: Convert to lowercase

    Returns:
    Normalized string Series
    """
```

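The two flags map directly onto pandas string methods. A minimal sketch of that behavior (`normalize_string_column_sketch` is a hypothetical stand-in, not the library function):

```python
import pandas as pd

def normalize_string_column_sketch(column, ignore_spaces, ignore_case):
    # Apply the two optional normalizations before comparing.
    if ignore_spaces:
        column = column.str.strip()
    if ignore_case:
        column = column.str.lower()
    return column

s = pd.Series([' Alice ', 'BOB', 'charlie'])
print(normalize_string_column_sketch(s, True, True).tolist())
# ['alice', 'bob', 'charlie']
```
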
### Polars Column Comparison

Specialized functions for comparing Polars Series with optimized performance.

```python { .api }
def columns_equal(
    col_1: pl.Series,
    col_2: pl.Series,
    rel_tol: float = 0,
    abs_tol: float = 0,
    ignore_spaces: bool = False,
    ignore_case: bool = False
) -> pl.Series:
    """
    Compare two Polars Series element-wise.

    Parameters:
    - col_1: First Polars Series to compare
    - col_2: Second Polars Series to compare
    - rel_tol: Relative tolerance for numeric comparisons
    - abs_tol: Absolute tolerance for numeric comparisons
    - ignore_spaces: Strip whitespace from string values
    - ignore_case: Ignore case in string comparisons

    Returns:
    Boolean Polars Series indicating element-wise equality
    """

def calculate_max_diff(col_1: pl.Series, col_2: pl.Series) -> float:
    """
    Calculate the maximum absolute difference between numeric Polars Series.

    Parameters:
    - col_1: First numeric Polars Series
    - col_2: Second numeric Polars Series

    Returns:
    Maximum absolute difference as float
    """

def normalize_string_column(
    column: pl.Series,
    ignore_spaces: bool,
    ignore_case: bool
) -> pl.Series:
    """
    Normalize a Polars string Series for comparison.

    Parameters:
    - column: String Polars Series to normalize
    - ignore_spaces: Strip leading/trailing whitespace
    - ignore_case: Convert to lowercase

    Returns:
    Normalized string Polars Series
    """
```

### Spark Column Comparison

Functions for comparing Spark DataFrame columns using SQL expressions.

```python { .api }
def columns_equal(
    dataframe: pyspark.sql.DataFrame,
    col_1: str,
    col_2: str,
    rel_tol: float = 0,
    abs_tol: float = 0,
    ignore_spaces: bool = False,
    ignore_case: bool = False
) -> pyspark.sql.Column:
    """
    Create a Spark SQL Column expression for comparing two columns.

    Parameters:
    - dataframe: Spark DataFrame containing the columns
    - col_1: Name of first column to compare
    - col_2: Name of second column to compare
    - rel_tol: Relative tolerance for numeric comparisons
    - abs_tol: Absolute tolerance for numeric comparisons
    - ignore_spaces: Strip whitespace from string values
    - ignore_case: Ignore case in string comparisons

    Returns:
    Spark Column expression evaluating to boolean
    """

def calculate_max_diff(
    dataframe: pyspark.sql.DataFrame,
    col_1: str,
    col_2: str
) -> float:
    """
    Calculate the maximum absolute difference between numeric columns.

    Parameters:
    - dataframe: Spark DataFrame containing the columns
    - col_1: Name of first column
    - col_2: Name of second column

    Returns:
    Maximum absolute difference as float
    """

def calculate_null_diff(
    dataframe: pyspark.sql.DataFrame,
    col_1: str,
    col_2: str
) -> int:
    """
    Calculate differences in null values between columns.

    Parameters:
    - dataframe: Spark DataFrame containing the columns
    - col_1: Name of first column
    - col_2: Name of second column

    Returns:
    Count of null value differences as int
    """
```

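The null-diff count can be illustrated engine-agnostically. Assuming it counts rows where exactly one of the two columns is null (an assumption about the semantics, shown here in pandas rather than Spark so it runs standalone):

```python
import pandas as pd

def null_diff_sketch(df, col_1, col_2):
    # Count rows where exactly one side is null (XOR of the null masks);
    # rows where both sides are null or both are present don't count.
    return int((df[col_1].isna() ^ df[col_2].isna()).sum())

df = pd.DataFrame({'col1': [1.0, None, None], 'col2': [None, None, 3.0]})
print(null_diff_sketch(df, 'col1', 'col2'))  # 2
```
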
### Snowflake Column Comparison

Functions for comparing Snowflake DataFrame columns with cloud-optimized processing.

```python { .api }
def columns_equal(
    dataframe: sp.DataFrame,
    col_1: str,
    col_2: str,
    col_match: str,
    rel_tol: float = 0,
    abs_tol: float = 0,
    ignore_spaces: bool = False
) -> sp.DataFrame:
    """
    Compare columns in a Snowflake DataFrame.

    Parameters:
    - dataframe: Snowpark DataFrame containing the columns
    - col_1: Name of first column to compare
    - col_2: Name of second column to compare
    - col_match: Name of column to store match results
    - rel_tol: Relative tolerance for numeric comparisons
    - abs_tol: Absolute tolerance for numeric comparisons
    - ignore_spaces: Strip whitespace from string values

    Returns:
    Snowpark DataFrame with comparison results
    """

def calculate_max_diff(
    dataframe: sp.DataFrame,
    col_1: str,
    col_2: str
) -> float:
    """
    Calculate the maximum absolute difference between numeric columns.

    Parameters:
    - dataframe: Snowpark DataFrame containing the columns
    - col_1: Name of first column
    - col_2: Name of second column

    Returns:
    Maximum absolute difference as float
    """

def calculate_null_diff(
    dataframe: sp.DataFrame,
    col_1: str,
    col_2: str
) -> int:
    """
    Calculate differences in null values between columns.

    Parameters:
    - dataframe: Snowpark DataFrame containing the columns
    - col_1: Name of first column
    - col_2: Name of second column

    Returns:
    Count of null value differences as int
    """
```

## Usage Examples

### Basic Column Comparison

```python
import pandas as pd
import datacompy

# Create test Series
col1 = pd.Series([1.0, 2.0, 3.0, 4.0, None])
col2 = pd.Series([1.1, 2.0, 3.2, 4.0, None])

# Compare with tolerance
matches = datacompy.columns_equal(col1, col2, abs_tol=0.1)
print(matches)  # [True, True, False, True, True]

# Calculate maximum difference
max_diff = datacompy.calculate_max_diff(col1, col2)
print(f"Maximum difference: {max_diff}")  # 0.2
```

### String Column Comparison

```python
import pandas as pd
import datacompy

# String data with case and space variations
col1 = pd.Series(['Alice', 'Bob ', 'CHARLIE', 'david'])
col2 = pd.Series(['alice', 'Bob', 'charlie', 'David'])

# Case-sensitive comparison
strict_matches = datacompy.columns_equal(col1, col2)
print(strict_matches)  # [False, False, False, False]

# Case-insensitive with space normalization
flexible_matches = datacompy.columns_equal(
    col1, col2,
    ignore_case=True,
    ignore_spaces=True
)
print(flexible_matches)  # [True, True, True, True]
```

### Polars Column Operations

```python
import polars as pl
import datacompy

# Create Polars Series
col1 = pl.Series([1.0, 2.0, 3.0, 4.0])
col2 = pl.Series([1.1, 2.0, 3.2, 4.0])

# Compare with tolerance
matches = datacompy.columns_equal(col1, col2, abs_tol=0.1)
print(matches)

# String normalization
str_col = pl.Series([' Alice ', 'BOB', 'charlie'])
normalized = datacompy.normalize_string_column(
    str_col,
    ignore_spaces=True,
    ignore_case=True
)
print(normalized)  # ['alice', 'bob', 'charlie']
```

378
379
### Spark Column Expressions
380
381
```python
from pyspark.sql import SparkSession
import datacompy

spark = SparkSession.builder.appName("DataComPy").getOrCreate()

# Create DataFrame
df = spark.createDataFrame([
    (1, 1.0, 1.1),
    (2, 2.0, 2.0),
    (3, 3.0, 3.2)
], ['id', 'col1', 'col2'])

# Create comparison expression
comparison_expr = datacompy.columns_equal(
    df, 'col1', 'col2',
    abs_tol=0.1
)

# Apply comparison
result = df.withColumn('matches', comparison_expr)
result.show()

# Calculate maximum difference
max_diff = datacompy.calculate_max_diff(df, 'col1', 'col2')
print(f"Max difference: {max_diff}")
```

### DataFrame Utility Usage

```python
import pandas as pd
import datacompy

# Example DataFrame with duplicates
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'A'],
    'value': [1, 2, 3, 4, 5]
})

# Generate unique IDs within each group
ids = datacompy.generate_id_within_group(df, ['group'])
print(ids)  # [0, 1, 0, 1, 2]

# Example of merged column extraction; the shared column 'b'
# gets suffixed in the merge, while the join key 'a' does not
df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df2 = pd.DataFrame({'a': [1, 2], 'b': [5, 6]})
merged = df1.merge(df2, on='a', suffixes=('_left', '_right'))

left_cols = datacompy.get_merged_columns(df1, merged, '_left')
print(left_cols)  # ['a', 'b_left']
```

## Type Constants

```python { .api }
# Polars type identifiers
STRING_TYPE: List[str] = ["String", "Utf8"]
LIST_TYPE: List[str] = ["List", "Array"]

# Spark numeric types (internal)
NUMERIC_SPARK_TYPES: List  # Internal Spark numeric type list

# Snowflake numeric types (internal)
NUMERIC_SNOWPARK_TYPES: List  # Internal Snowpark numeric type list
```

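These constants can be used to branch comparison logic on a dtype's name. A hypothetical check in plain Python (the helper name is illustrative, not part of the library):

```python
STRING_TYPE = ["String", "Utf8"]  # Polars string dtype names, as defined above

def is_string_dtype_name(dtype_name: str) -> bool:
    # Dispatch string-specific comparison when the dtype name matches.
    return dtype_name in STRING_TYPE

print(is_string_dtype_name("Utf8"))   # True
print(is_string_dtype_name("Int64"))  # False
```
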
These utility functions provide the building blocks for custom comparison logic and can be combined to create specialized comparison workflows tailored to specific data analysis needs.