0
# Statistical Analysis
1
2
Advanced statistical tools for quantitative research, including clustering algorithms, bootstrap resampling, and momentum calculations. Together they provide analytical capabilities for financial data exploration and strategy development.
3
4
## Capabilities
5
6
### Clustering Analysis
7
8
Identify groups of assets with similar return patterns using machine learning clustering algorithms.
9
10
```python { .api }
11
def calc_clusters(returns, n=None, plot=False):
12
"""
13
Perform K-means clustering on assets based on return correlations.
14
15
Parameters:
16
- returns (pd.DataFrame): Return series for multiple assets
17
- n (int): Number of clusters (default: None for automatic selection)
18
- plot (bool): Whether to generate cluster visualization plots (default: False)
19
20
Returns:
21
dict: Clustering results with cluster assignments and centroids
22
"""
23
24
def calc_ftca(returns, threshold=0.5):
25
"""
26
Fast Threshold Clustering Algorithm for asset grouping.
27
28
Parameters:
29
- returns (pd.DataFrame): Return series for multiple assets
30
- threshold (float): Correlation threshold for clustering (default: 0.5)
31
32
Returns:
33
dict: Cluster assignments and statistics
34
"""
35
```
36
37
### Momentum Analysis
38
39
Calculate momentum-based signals and probabilistic momentum indicators.
40
41
```python { .api }
42
def calc_prob_mom(returns, other_returns):
43
"""
44
Calculate probabilistic momentum comparing two return series.
45
46
Parameters:
47
- returns (pd.Series): Primary return series
48
- other_returns (pd.Series): Comparison return series
49
50
Returns:
51
float: Probabilistic momentum score (the usage example below treats 0.5 as neutral)
52
"""
53
```
54
55
### Bootstrap Resampling
56
57
Statistical resampling techniques for robust metric estimation and confidence intervals.
58
59
```python { .api }
60
def resample_returns(returns, func, seed=0, num_trials=100):
61
"""
62
Bootstrap resample returns and calculate statistics with confidence intervals.
63
64
Parameters:
65
- returns (pd.Series or pd.DataFrame): Return series to resample
66
- func (callable): Function to apply to each resampled dataset
67
- seed (int): Random seed for reproducibility (default: 0)
68
- num_trials (int): Number of bootstrap trials (default: 100)
69
70
Returns:
71
dict: Bootstrap results including mean, std, and confidence intervals
72
"""
73
```
74
75
### Rolling Analysis
76
77
Apply functions over rolling windows for time-varying analysis.
78
79
```python { .api }
80
def rollapply(data, window, fn):
81
"""
82
Apply function over rolling window.
83
84
Parameters:
85
- data (pd.Series or pd.DataFrame): Input data
86
- window (int): Rolling window size
87
- fn (callable): Function to apply to each window
88
89
Returns:
90
pd.Series or pd.DataFrame: Rolling function results
91
"""
92
```
93
94
### Statistical Utilities
95
96
Data transformation and statistical processing functions.
97
98
```python { .api }
99
def winsorize(x, axis=0, limits=0.01):
100
"""
101
Winsorize values to reduce impact of outliers.
102
103
Parameters:
104
- x (array-like): Input data
105
- axis (int): Axis along which to winsorize (default: 0)
106
- limits (float or tuple): Winsorization limits as fraction (default: 0.01)
107
108
Returns:
109
array-like: Winsorized data
110
"""
111
112
def rescale(x, min=0.0, max=1.0, axis=0):
113
"""
114
Rescale values to fit within specified range.
115
116
Parameters:
117
- x (array-like): Input data
118
- min (float): Minimum value for rescaling (default: 0.0)
119
- max (float): Maximum value for rescaling (default: 1.0)
120
- axis (int): Axis along which to rescale (default: 0)
121
122
Returns:
123
array-like: Rescaled data
124
"""
125
```
126
127
### Time Series Analysis
128
129
Frequency analysis and period estimation utilities.
130
131
```python { .api }
132
def infer_freq(data):
133
"""
134
Infer most likely frequency from time series index.
135
136
Parameters:
137
- data (pd.Series or pd.DataFrame): Time series data
138
139
Returns:
140
str: Inferred frequency string (e.g., 'D', 'M', 'Y')
141
"""
142
143
def infer_nperiods(data, annualization_factor=None):
144
"""
145
Infer number of periods for annualization based on data frequency.
146
147
Parameters:
148
- data (pd.Series or pd.DataFrame): Time series data
149
- annualization_factor (int): Override annualization factor (default: None)
150
151
Returns:
152
int: Number of periods for annualization
153
"""
154
```
155
156
## Usage Examples
157
158
### Asset Clustering Analysis
159
160
```python
161
import ffn
162
import matplotlib.pyplot as plt
163
164
# Download sector ETF data
165
sector_etfs = ['XLK', 'XLF', 'XLE', 'XLV', 'XLI', 'XLP', 'XLY', 'XLU', 'XLB']
166
prices = ffn.get(sector_etfs, start='2020-01-01')
167
returns = ffn.to_returns(prices).dropna()
168
169
# Perform clustering analysis
170
clusters = ffn.calc_clusters(returns, n=3, plot=True)
171
print("Cluster Assignments:")
172
for cluster_id, assets in clusters['clusters'].items():
173
print(f"Cluster {cluster_id}: {assets}")
174
175
# Fast threshold clustering
176
ftca_results = ffn.calc_ftca(returns, threshold=0.6)
177
print(f"\nFTCA found {len(ftca_results)} clusters")
178
179
# Analyze cluster characteristics
180
for i, cluster in enumerate(clusters['clusters'].values()):
181
cluster_returns = returns[cluster].mean(axis=1)
182
cluster_vol = cluster_returns.std() * (252**0.5)
183
print(f"Cluster {i} Volatility: {cluster_vol:.3f}")
184
```
185
186
### Momentum Analysis
187
188
```python
189
import ffn
import matplotlib.pyplot as plt
import pandas as pd
190
191
# Download market data
192
prices = ffn.get('SPY,QQQ,IWM', start='2020-01-01')
193
returns = ffn.to_returns(prices).dropna()
194
195
# Calculate momentum signals
196
spy_qqq_mom = ffn.calc_prob_mom(returns['SPY'], returns['QQQ'])
197
spy_iwm_mom = ffn.calc_prob_mom(returns['SPY'], returns['IWM'])
198
qqq_iwm_mom = ffn.calc_prob_mom(returns['QQQ'], returns['IWM'])
199
200
print(f"SPY vs QQQ Momentum: {spy_qqq_mom:.3f}")
201
print(f"SPY vs IWM Momentum: {spy_iwm_mom:.3f}")
202
print(f"QQQ vs IWM Momentum: {qqq_iwm_mom:.3f}")
203
204
# Rolling momentum analysis
205
window = 63 # Quarterly
206
rolling_mom = []
207
for i in range(window, len(returns)):
208
period_returns = returns.iloc[i-window:i]
209
mom_score = ffn.calc_prob_mom(period_returns['SPY'], period_returns['QQQ'])
210
rolling_mom.append(mom_score)
211
212
rolling_mom_series = pd.Series(rolling_mom, index=returns.index[window:])
213
rolling_mom_series.plot(title='Rolling SPY vs QQQ Momentum', figsize=(12, 6))
214
plt.axhline(y=0.5, color='r', linestyle='--', label='Neutral')
215
plt.legend()
216
plt.show()
217
```
218
219
### Bootstrap Analysis
220
221
```python
222
import ffn
223
import numpy as np
224
225
# Download and prepare data
226
prices = ffn.get('AAPL', start='2020-01-01')['AAPL']
227
returns = ffn.to_returns(prices).dropna()
228
229
# Bootstrap Sharpe ratio analysis
230
def calc_sharpe_wrapper(ret_series):
231
return ffn.calc_sharpe(ret_series, rf=0.02)
232
233
sharpe_bootstrap = ffn.resample_returns(returns, calc_sharpe_wrapper,
234
seed=42, num_trials=1000)
235
236
print("Bootstrap Sharpe Ratio Results:")
237
print(f"Mean: {sharpe_bootstrap['mean']:.3f}")
238
print(f"Std: {sharpe_bootstrap['std']:.3f}")
239
print(f"95% CI: [{sharpe_bootstrap['ci_lower']:.3f}, {sharpe_bootstrap['ci_upper']:.3f}]")
240
241
# Bootstrap maximum drawdown
242
def calc_max_dd_wrapper(ret_series):
243
price_series = ffn.to_price_index(ret_series)
244
return ffn.calc_max_drawdown(price_series)
245
246
dd_bootstrap = ffn.resample_returns(returns, calc_max_dd_wrapper,
247
seed=42, num_trials=1000)
248
249
print(f"\nBootstrap Max Drawdown Results:")
250
print(f"Mean: {dd_bootstrap['mean']:.3f}")
251
print(f"95% CI: [{dd_bootstrap['ci_lower']:.3f}, {dd_bootstrap['ci_upper']:.3f}]")
252
```
253
254
### Rolling Window Analysis
255
256
```python
257
import ffn
import matplotlib.pyplot as plt
258
259
# Download data
260
prices = ffn.get('SPY,TLT', start='2015-01-01')
261
returns = ffn.to_returns(prices).dropna()
262
263
# Rolling correlation analysis
264
def rolling_corr(window_data):
265
return window_data.corr().iloc[0, 1]
266
267
rolling_corr_60d = ffn.rollapply(returns, window=60, fn=rolling_corr)
268
rolling_corr_252d = ffn.rollapply(returns, window=252, fn=rolling_corr)
269
270
# Plot rolling correlations
271
fig, ax = plt.subplots(figsize=(12, 6))
272
rolling_corr_60d.plot(label='60-Day Rolling Correlation', ax=ax)
273
rolling_corr_252d.plot(label='252-Day Rolling Correlation', ax=ax)
274
plt.title('SPY-TLT Rolling Correlation')
275
plt.ylabel('Correlation')
276
plt.legend()
277
plt.grid(True)
278
plt.show()
279
280
# Rolling Sharpe ratio
281
def rolling_sharpe(window_data):
282
return ffn.calc_sharpe(window_data['SPY'], rf=0.02)
283
284
rolling_sharpe_252d = ffn.rollapply(returns, window=252, fn=rolling_sharpe)
285
rolling_sharpe_252d.plot(title='SPY Rolling 1-Year Sharpe Ratio', figsize=(12, 6))
286
plt.ylabel('Sharpe Ratio')
287
plt.grid(True)
288
plt.show()
289
```
290
291
### Data Preprocessing and Outlier Treatment
292
293
```python
294
import ffn
295
import numpy as np
296
297
# Download volatile asset data
298
prices = ffn.get('TSLA', start='2020-01-01')['TSLA']
299
returns = ffn.to_returns(prices).dropna()
300
301
print("Original Return Statistics:")
302
print(f"Mean: {returns.mean():.4f}")
303
print(f"Std: {returns.std():.4f}")
304
print(f"Skewness: {returns.skew():.3f}")
305
print(f"Min: {returns.min():.4f}")
306
print(f"Max: {returns.max():.4f}")
307
308
# Winsorize extreme returns
309
winsorized_returns = ffn.winsorize(returns, limits=0.05) # 5% winsorization
310
311
print(f"\nWinsorized Return Statistics:")
312
print(f"Mean: {winsorized_returns.mean():.4f}")
313
print(f"Std: {winsorized_returns.std():.4f}")
314
print(f"Skewness: {winsorized_returns.skew():.3f}")
315
print(f"Min: {winsorized_returns.min():.4f}")
316
print(f"Max: {winsorized_returns.max():.4f}")
317
318
# Rescale returns to [-1, 1] range
319
rescaled_returns = ffn.rescale(returns, min=-1, max=1)
320
321
print(f"\nRescaled Return Range:")
322
print(f"Min: {rescaled_returns.min():.4f}")
323
print(f"Max: {rescaled_returns.max():.4f}")
324
325
# Compare performance metrics
326
original_sharpe = ffn.calc_sharpe(returns, rf=0.02)
327
winsorized_sharpe = ffn.calc_sharpe(winsorized_returns, rf=0.02)
328
329
print(f"\nSharpe Ratio Comparison:")
330
print(f"Original: {original_sharpe:.3f}")
331
print(f"Winsorized: {winsorized_sharpe:.3f}")
332
```
333
334
### Advanced Statistical Analysis Pipeline
335
336
```python
337
import ffn
338
import pandas as pd
339
340
def comprehensive_analysis(tickers, start_date='2020-01-01'):
341
"""Comprehensive statistical analysis pipeline."""
342
343
# Download and prepare data
344
prices = ffn.get(tickers, start=start_date)
345
returns = ffn.to_returns(prices).dropna()
346
347
results = {}
348
349
# 1. Clustering analysis
350
if len(returns.columns) > 2:
351
clusters = ffn.calc_clusters(returns, n=min(3, len(returns.columns)//2))
352
results['clusters'] = clusters
353
354
# 2. Bootstrap statistics for each asset
355
bootstrap_results = {}
356
for asset in returns.columns:
357
asset_returns = returns[asset]
358
359
# Bootstrap Sharpe
360
sharpe_boot = ffn.resample_returns(asset_returns,
361
lambda x: ffn.calc_sharpe(x, rf=0.02),
362
num_trials=500)
363
bootstrap_results[asset] = sharpe_boot
364
365
results['bootstrap'] = bootstrap_results
366
367
# 3. Rolling correlations (if multiple assets)
368
if len(returns.columns) > 1:
369
rolling_corrs = {}
370
for i, asset1 in enumerate(returns.columns):
371
for asset2 in returns.columns[i+1:]:
372
pair_returns = returns[[asset1, asset2]]
373
rolling_corr = ffn.rollapply(pair_returns, 60,
374
lambda x: x.corr().iloc[0,1])
375
rolling_corrs[f"{asset1}_{asset2}"] = rolling_corr
376
377
results['rolling_correlations'] = rolling_corrs
378
379
# 4. Frequency analysis
380
freq_info = {
381
'inferred_frequency': ffn.infer_freq(returns),
382
'nperiods': ffn.infer_nperiods(returns)
383
}
384
results['frequency_analysis'] = freq_info
385
386
return results
387
388
# Run comprehensive analysis
389
analysis_results = comprehensive_analysis(['AAPL', 'MSFT', 'GOOGL', 'AMZN'])
390
391
print("Comprehensive Analysis Results:")
392
print(f"Clusters found: {len(analysis_results.get('clusters', {}).get('clusters', {}))}")
393
print(f"Bootstrap samples: {len(list(analysis_results['bootstrap'].values())[0]['samples'])}")
394
print(f"Rolling correlations tracked: {len(analysis_results.get('rolling_correlations', {}))}")
395
print(f"Data frequency: {analysis_results['frequency_analysis']['inferred_frequency']}")
396
```