0
# Data Utilities
1
2
Data processing utilities including frequency conversion, merging, date calculations, and pandas extensions for financial time series manipulation. Essential tools for data preparation and transformation in quantitative analysis.
3
4
## Capabilities
5
6
### Data Merging and Combination
7
8
Utilities for combining multiple time series and handling data alignment.
9
10
```python { .api }
11
def merge(*series):
12
"""
13
Merge multiple Series and/or DataFrames with proper alignment.
14
15
Parameters:
16
- *series: Variable number of pandas Series or DataFrames to merge
17
18
Returns:
19
pd.DataFrame: Merged data with aligned dates
20
"""
21
22
def drop_duplicate_cols(df):
23
"""
24
Remove duplicate columns keeping the one with longest history.
25
26
Parameters:
27
- df (pd.DataFrame): DataFrame potentially containing duplicate columns
28
29
Returns:
30
pd.DataFrame: DataFrame with duplicate columns removed
31
"""
32
```
33
34
### Frequency Conversion
35
36
Convert time series between different frequencies while preserving financial data characteristics.
37
38
```python { .api }
39
def to_monthly(series, method="ffill", how="end"):
40
"""
41
Convert time series to monthly frequency.
42
43
Parameters:
44
- series (pd.Series or pd.DataFrame): Input time series
45
- method (str): Resampling method ('ffill', 'mean', 'last', etc.)
46
- how (str): How to handle period boundaries ('end', 'start')
47
48
Returns:
49
pd.Series or pd.DataFrame: Monthly frequency data
50
"""
51
52
def asfreq_actual(series, freq, method="ffill", how="end", normalize=False):
53
"""
54
Resample to specified frequency keeping actual dates (not period ends).
55
56
Parameters:
57
- series (pd.Series): Input time series
58
- freq (str): Target frequency ('D', 'W', 'M', 'Y', etc.)
59
- method (str): Fill method for missing values
60
- how (str): Period boundary handling
61
- normalize (bool): Normalize timestamps to midnight
62
63
Returns:
64
pd.Series: Resampled series with actual dates
65
"""
66
```
67
68
### Date and Time Calculations
69
70
Financial date calculations and business day utilities.
71
72
```python { .api }
73
def year_frac(start, end):
74
"""
75
Calculate year fraction between two dates using ACT/365 convention.
76
77
Parameters:
78
- start (datetime): Start date
79
- end (datetime): End date
80
81
Returns:
82
float: Year fraction (e.g., 0.25 for 3 months)
83
"""
84
85
def get_num_days_required(offset, period="d", perc_required=0.90, annualization_factor=252):
86
"""
87
Estimate number of days required for reliable statistics calculation.
88
89
Parameters:
90
- offset (str): Time offset (e.g., '1Y', '6M', '3M')
91
- period (str): Period type ('d' for daily, 'm' for monthly)
92
- perc_required (float): Required data percentage (default: 0.90)
93
- annualization_factor (int): Trading days per year (default: 252)
94
95
Returns:
96
int: Estimated required number of days
97
"""
98
99
def calc_mtd(daily_prices, monthly_prices):
100
"""
101
Calculate month-to-date returns from daily and monthly price series.
102
103
Parameters:
104
- daily_prices (pd.Series): Daily price series
105
- monthly_prices (pd.Series): Monthly price series
106
107
Returns:
108
pd.Series: Month-to-date returns
109
"""
110
111
def calc_ytd(daily_prices, yearly_prices):
112
"""
113
Calculate year-to-date returns from daily and yearly price series.
114
115
Parameters:
116
- daily_prices (pd.Series): Daily price series
117
- yearly_prices (pd.Series): Yearly price series
118
119
Returns:
120
pd.Series: Year-to-date returns
121
"""
122
```
123
124
### Data Processing Utilities
125
126
General data transformation and cleaning functions.
127
128
```python { .api }
129
def annualize(returns, durations, one_year=365.0):
130
"""
131
Annualize returns using actual durations.
132
133
Parameters:
134
- returns (array-like): Return values
135
- durations (array-like): Duration of each return in days
136
- one_year (float): Days in one year (default: 365.0)
137
138
Returns:
139
array-like: Annualized returns
140
"""
141
142
def deannualize(returns, nperiods):
143
"""
144
Convert annualized returns to different frequency basis.
145
146
Parameters:
147
- returns (array-like): Annualized returns
148
- nperiods (int): Number of periods per year for target frequency
149
150
Returns:
151
array-like: Returns converted to target frequency
152
"""
153
154
def infer_freq(data):
155
"""
156
Infer most likely frequency from time series index.
157
158
Parameters:
159
- data (pd.Series or pd.DataFrame): Time series data
160
161
Returns:
162
str: Inferred frequency string (e.g., 'D', 'M', 'Y')
163
"""
164
165
def infer_nperiods(data, annualization_factor=None):
166
"""
167
Infer number of periods for annualization based on data frequency.
168
169
Parameters:
170
- data (pd.Series or pd.DataFrame): Time series data
171
- annualization_factor (int): Override annualization factor (default: None)
172
173
Returns:
174
int: Number of periods for annualization
175
"""
176
```
177
178
### Visualization Utilities
179
180
Plotting functions for financial data visualization.
181
182
```python { .api }
183
def plot_heatmap(data, title="Heatmap", show_legend=True, show_labels=True, label_fmt=".2f", vmin=None, vmax=None, figsize=None, label_color="w", cmap="RdBu", **kwargs):
184
"""
185
Plot data as a heatmap with customizable formatting.
186
187
Parameters:
188
- data (pd.DataFrame): Data to plot
189
- title (str): Plot title
190
- show_legend (bool): Whether to show color legend
191
- show_labels (bool): Whether to show value labels on cells
192
- label_fmt (str): Format string for value labels
193
- vmin, vmax (float): Color scale bounds
194
- figsize (tuple): Figure size
195
- label_color (str): Color for value labels
196
- cmap (str): Colormap name
197
- **kwargs: Additional matplotlib arguments
198
199
Returns:
200
matplotlib objects for further customization
201
"""
202
203
def plot_corr_heatmap(data, **kwargs):
204
"""
205
Plot correlation matrix as heatmap with sensible defaults.
206
207
Parameters:
208
- data (pd.DataFrame): Data for correlation calculation
209
- **kwargs: Arguments passed to plot_heatmap()
210
211
Returns:
212
matplotlib objects for further customization
213
"""
214
```
215
216
### Data Formatting Utilities
217
218
Financial number formatting and display functions for reports and presentations.
219
220
```python { .api }
221
def fmtp(number):
222
"""
223
Format number as percentage with 2 decimal places.
224
225
Parameters:
226
- number (float): Number to format (0.1523 becomes '15.23%')
227
228
Returns:
229
str: Formatted percentage string, returns '-' for NaN values
230
"""
231
232
def fmtpn(number):
233
"""
234
Format number as percentage without % symbol.
235
236
Parameters:
237
- number (float): Number to format (0.1523 becomes '15.23')
238
239
Returns:
240
str: Formatted percentage number, returns '-' for NaN values
241
"""
242
243
def fmtn(number):
244
"""
245
Format number as float with 2 decimal places.
246
247
Parameters:
248
- number (float): Number to format
249
250
Returns:
251
str: Formatted float string, returns '-' for NaN values
252
"""
253
254
def as_percent(item, digits=2):
255
"""
256
Format pandas Series/DataFrame values as percentages.
257
258
Parameters:
259
- item (pd.Series or pd.DataFrame): Data to format
260
- digits (int): Number of decimal places (default: 2)
261
262
Returns:
263
pd.Series or pd.DataFrame: Formatted with percentage strings
264
"""
265
266
def as_format(item, format_str=".2f"):
267
"""
268
Apply format string to pandas Series/DataFrame values.
269
270
Parameters:
271
- item (pd.Series or pd.DataFrame): Data to format
272
- format_str (str): Python format string (default: '.2f')
273
274
Returns:
275
pd.Series or pd.DataFrame: Formatted strings
276
"""
277
```
278
279
### Data Parsing and Cleaning
280
281
Utilities for parsing arguments and cleaning ticker symbols.
282
283
```python { .api }
284
def parse_arg(arg):
285
"""
286
Parse flexible argument input (string, list, tuple, CSV).
287
288
Parameters:
289
- arg (str, list, tuple): Input to parse
290
Can be 'AAPL,MSFT,GOOGL' or ['AAPL', 'MSFT'] or ('AAPL', 'MSFT')
291
292
Returns:
293
list: Parsed and cleaned list of arguments
294
"""
295
296
def clean_ticker(ticker):
297
"""
298
Clean ticker symbol for consistent usage.
299
300
Removes non-alphanumeric characters and converts to lowercase.
301
Handles cases like '^VIX' -> 'vix', 'SPX Index' -> 'spx'
302
303
Parameters:
304
- ticker (str): Raw ticker symbol
305
306
Returns:
307
str: Cleaned ticker symbol
308
"""
309
310
def clean_tickers(tickers):
311
"""
312
Apply clean_ticker to multiple tickers.
313
314
Parameters:
315
- tickers (list): List of ticker symbols
316
317
Returns:
318
list: List of cleaned ticker symbols
319
"""
320
```
321
322
### Data Scaling and Transformation
323
324
Advanced data transformation utilities.
325
326
```python { .api }
327
def scale(val, src, dst):
328
"""
329
Scale value from source range to destination range with clipping.
330
331
Parameters:
332
- val (float): Value to scale
333
- src (tuple): Source range (min, max)
334
- dst (tuple): Destination range (min, max)
335
336
Returns:
337
float: Scaled value, clipped to destination bounds
338
"""
339
340
def get_freq_name(period):
341
"""
342
Get human-readable name for pandas frequency string.
343
344
Parameters:
345
- period (str): Pandas frequency code ('D', 'M', 'Y', etc.)
346
347
Returns:
348
str or None: Human-readable frequency name or None if unknown
349
"""
350
```
351
352
### Pandas Extension System
353
354
Core function that enables method chaining on pandas objects.
355
356
```python { .api }
357
def extend_pandas():
358
"""
359
Extend pandas Series and DataFrame with all ffn functions as methods.
360
361
This function is called automatically when ffn is imported, enabling
362
method chaining like: prices.to_returns().calc_sharpe()
363
364
Returns:
365
None (modifies pandas classes in-place)
366
"""
367
```
368
369
## Usage Examples
370
371
### Data Merging and Alignment
372
373
```python
374
import ffn
375
import pandas as pd
376
377
# Download data with different date ranges
378
spy_data = ffn.get('SPY', start='2020-01-01', end='2023-12-31')
379
vti_data = ffn.get('VTI', start='2020-06-01', end='2023-06-30') # Shorter range
380
bond_data = ffn.get('BND', start='2019-01-01', end='2023-12-31') # Longer range
381
382
# Merge with automatic alignment
383
merged_data = ffn.merge(spy_data, vti_data, bond_data)
384
print(f"Merged data shape: {merged_data.shape}")
385
print(f"Date range: {merged_data.index[0]} to {merged_data.index[-1]}")
386
387
# Handle duplicate columns scenario
388
duplicate_data = pd.concat([spy_data, spy_data.rename(columns={'SPY': 'SPY_dup'})], axis=1)
389
print(f"Before dedup: {duplicate_data.columns.tolist()}")
390
391
clean_data = ffn.drop_duplicate_cols(duplicate_data)
392
print(f"After dedup: {clean_data.columns.tolist()}")
393
```
394
395
### Frequency Conversion
396
397
```python
398
import ffn
399
400
# Download daily data
401
daily_prices = ffn.get('AAPL,MSFT', start='2020-01-01')
402
daily_returns = ffn.to_returns(daily_prices).dropna()
403
404
# Convert to monthly
405
monthly_prices = ffn.to_monthly(daily_prices, method="last")
406
monthly_returns = ffn.to_returns(monthly_prices).dropna()
407
408
print(f"Daily data points: {len(daily_prices)}")
409
print(f"Monthly data points: {len(monthly_prices)}")
410
411
# Custom frequency conversion
412
weekly_prices = ffn.asfreq_actual(daily_prices['AAPL'], freq='W', method='last')
413
print(f"Weekly data points: {len(weekly_prices)}")
414
415
# Performance comparison across frequencies
416
daily_sharpe = ffn.calc_sharpe(daily_returns['AAPL'], rf=0.02)
417
monthly_sharpe = ffn.calc_sharpe(monthly_returns['AAPL'], rf=0.02)
418
419
print(f"Daily Sharpe: {daily_sharpe:.3f}")
420
print(f"Monthly Sharpe: {monthly_sharpe:.3f}")
421
```
422
423
### Date Calculations and Business Days
424
425
```python
426
import ffn
427
import pandas as pd
428
from datetime import datetime
429
430
# Calculate year fractions
431
start_date = datetime(2020, 1, 1)
432
end_date = datetime(2020, 12, 31)
433
year_fraction = ffn.year_frac(start_date, end_date)
434
print(f"Year fraction for 2020: {year_fraction:.4f}")
435
436
# Quarter calculation
437
q1_end = datetime(2020, 3, 31)
438
q1_fraction = ffn.year_frac(start_date, q1_end)
439
print(f"Q1 2020 year fraction: {q1_fraction:.4f}")
440
441
# Estimate required data for reliable statistics
442
days_1y = ffn.get_num_days_required('1Y', perc_required=0.95)
443
days_6m = ffn.get_num_days_required('6M', perc_required=0.90)
444
days_3m = ffn.get_num_days_required('3M', perc_required=0.85)
445
446
print(f"Days needed for 1Y analysis (95% req): {days_1y}")
447
print(f"Days needed for 6M analysis (90% req): {days_6m}")
448
print(f"Days needed for 3M analysis (85% req): {days_3m}")
449
```
450
451
### Return Annualization
452
453
```python
454
import ffn
455
import numpy as np
456
457
# Download data and calculate returns
458
prices = ffn.get('SPY', start='2020-01-01')['SPY']
459
returns = ffn.to_returns(prices).dropna()
460
461
# Calculate holding period durations
462
durations = np.ones(len(returns)) # Daily returns = 1 day each
463
464
# Annualize using actual durations
465
annualized_returns = ffn.annualize(returns, durations, one_year=365)
466
print(f"First 5 annualized daily returns: {annualized_returns[:5]}")
467
468
# Convert annualized returns to monthly basis
469
monthly_equivalent = ffn.deannualize(annualized_returns, nperiods=12)
470
print(f"Monthly equivalent returns (first 5): {monthly_equivalent[:5]}")
471
472
# Compare methodologies
473
annual_return_compound = (1 + returns).prod() ** (252 / len(returns)) - 1
474
annual_return_ffn = ffn.calc_cagr(prices)
475
476
print(f"Compound annual return: {annual_return_compound:.4f}")
477
print(f"FFN CAGR: {annual_return_ffn:.4f}")
478
```
479
480
### Visualization Examples
481
482
```python
483
import ffn
484
import matplotlib.pyplot as plt
485
486
# Download multi-asset data
487
assets = ['SPY', 'QQQ', 'IWM', 'EFA', 'EEM']
488
prices = ffn.get(assets, start='2020-01-01')
489
returns = ffn.to_returns(prices).dropna()
490
491
# Correlation heatmap
492
correlation_matrix = returns.corr()
493
ffn.plot_corr_heatmap(correlation_matrix, title='Asset Correlation Matrix',
494
figsize=(8, 6))
495
plt.show()
496
497
# Custom heatmap for return statistics
498
stats_data = pd.DataFrame({
499
'Mean': returns.mean() * 252, # Annualized
500
'Volatility': returns.std() * np.sqrt(252),
501
'Sharpe': [ffn.calc_sharpe(returns[col], rf=0.02) for col in returns.columns]
502
}).T
503
504
ffn.plot_heatmap(stats_data, title='Asset Statistics Heatmap',
505
label_fmt='.3f', cmap='viridis', figsize=(10, 4))
506
plt.show()
507
508
# Monthly return heatmap
509
monthly_returns = ffn.to_monthly(prices, method='last')
510
monthly_ret = ffn.to_returns(monthly_returns).dropna()
511
512
# Reshape for heatmap (years vs months)
513
monthly_pivot = monthly_ret['SPY'].groupby([monthly_ret.index.year, monthly_ret.index.month]).first().unstack()
514
monthly_pivot.columns = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
515
'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
516
517
ffn.plot_heatmap(monthly_pivot, title='SPY Monthly Returns by Year',
518
label_fmt='.1%', cmap='RdYlGn', figsize=(12, 8))
519
plt.show()
520
```
521
522
### Method Chaining with Pandas Extensions
523
524
```python
525
import ffn
526
527
# FFN automatically extends pandas with all functions
528
prices = ffn.get('AAPL,MSFT', start='2020-01-01')
529
530
# Method chaining examples
531
portfolio_analysis = (prices
532
.rebase(100) # Rebase to 100
533
.to_returns() # Convert to returns
534
.dropna() # Clean data
535
.to_monthly(method='last') # Convert to monthly
536
.merge(ffn.get('SPY', start='2020-01-01').to_returns().dropna().to_monthly()) # Add benchmark
537
)
538
539
print(f"Chained analysis shape: {portfolio_analysis.shape}")
540
541
# Complex chaining for performance metrics
542
performance_metrics = {}
543
for asset in ['AAPL', 'MSFT']:
544
metrics = (prices[asset]
545
.to_returns()
546
.dropna()
547
.calc_perf_stats(rf=0.02)
548
)
549
performance_metrics[asset] = {
550
'CAGR': metrics.cagr,
551
'Sharpe': metrics.sharpe,
552
'Max DD': metrics.max_drawdown
553
}
554
555
import pandas as pd
556
metrics_df = pd.DataFrame(performance_metrics).T
557
print("Performance Metrics via Chaining:")
558
print(metrics_df.round(3))
559
560
# Data pipeline with chaining
561
def create_analysis_pipeline(tickers, start_date, benchmark='SPY'):
562
"""Complete analysis pipeline using method chaining."""
563
564
# Core data
565
data = (ffn.get(tickers, start=start_date)
566
.dropna()
567
.rebase(100)
568
)
569
570
# Returns analysis
571
returns = (data
572
.to_returns()
573
.dropna()
574
)
575
576
# Add benchmark
577
benchmark_returns = (ffn.get(benchmark, start=start_date)
578
.to_returns()
579
.dropna()
580
)
581
benchmark_returns.columns = ['Benchmark']
582
583
# Final dataset
584
final_data = ffn.merge(returns, benchmark_returns)
585
586
return {
587
'prices': data,
588
'returns': final_data,
589
'correlations': final_data.corr(),
590
'performance': {col: ffn.calc_perf_stats(data[col] if col in data.columns
591
else ffn.to_price_index(final_data[col]))
592
for col in final_data.columns}
593
}
594
595
# Run pipeline
596
pipeline_results = create_analysis_pipeline(['AAPL', 'MSFT'], '2020-01-01')
597
print(f"Pipeline created {len(pipeline_results['performance'])} performance analyses")
598
```