or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

character-data.md core-data-models.md data-io.md index.md simulation.md tree-analysis.md visualization-interop.md

docs/character-data.md

0

# Character Data & Evolution

1

2

Character matrices for molecular and morphological data, state alphabets, and evolutionary models. DendroPy supports DNA, RNA, protein, restriction sites, standard morphological, and continuous character data with comprehensive state alphabet management.

3

4

## Capabilities

5

6

### State Alphabets

7

8

Classes defining the possible character states for different data types.

9

10

```python { .api }

11

class StateAlphabet:

12

"""

13

Base class for character state alphabets.

14

15

Parameters:

16

- fundamental_states: Core states (excluding ambiguous states)

17

- ambiguous_states: States representing multiple fundamental states

18

"""

19

20

def __init__(self, fundamental_states=None, ambiguous_states=None): ...

21

22

def __len__(self):

23

"""Number of fundamental states."""

24

25

def __iter__(self):

26

"""Iterate over all states."""

27

28

def __contains__(self, state):

29

"""Check if state is in alphabet."""

30

31

def state_for_symbol(self, symbol):

32

"""Get state object for symbol."""

33

34

def symbol_for_state(self, state):

35

"""Get symbol for state object."""

36

37

def fundamental_states(self):

38

"""Iterator over fundamental states only."""

39

40

def ambiguous_states(self):

41

"""Iterator over ambiguous states only."""

42

43

# Specialized alphabet classes

44

class DnaStateAlphabet(StateAlphabet):

45

"""DNA state alphabet with A, C, G, T and ambiguity codes."""

46

47

class RnaStateAlphabet(StateAlphabet):

48

"""RNA state alphabet with A, C, G, U and ambiguity codes."""

49

50

class NucleotideStateAlphabet(StateAlphabet):

51

"""General nucleotide alphabet (DNA or RNA)."""

52

53

class ProteinStateAlphabet(StateAlphabet):

54

"""Protein state alphabet with 20 amino acids and ambiguity codes."""

55

56

class BinaryStateAlphabet(StateAlphabet):

57

"""Binary state alphabet (0, 1)."""

58

59

class RestrictionSitesStateAlphabet(StateAlphabet):

60

"""Restriction sites alphabet (0=absent, 1=present)."""

61

62

class InfiniteSitesStateAlphabet(StateAlphabet):

63

"""Infinite sites alphabet for phylogenetic analysis."""

64

65

# Predefined alphabet instances

66

DNA_STATE_ALPHABET: DnaStateAlphabet

67

RNA_STATE_ALPHABET: RnaStateAlphabet

68

NUCLEOTIDE_STATE_ALPHABET: NucleotideStateAlphabet

69

PROTEIN_STATE_ALPHABET: ProteinStateAlphabet

70

BINARY_STATE_ALPHABET: BinaryStateAlphabet

71

RESTRICTION_SITES_STATE_ALPHABET: RestrictionSitesStateAlphabet

72

INFINITE_SITES_STATE_ALPHABET: InfiniteSitesStateAlphabet

73

74

def new_standard_state_alphabet(symbols):

75

"""

76

Create custom standard morphological state alphabet.

77

78

Parameters:

79

- symbols: String or list of state symbols

80

81

Returns:

82

StateAlphabet: Custom alphabet with specified symbols

83

"""

84

```

85

86

### Character Sequences

87

88

Classes representing individual character sequences (rows in alignment matrices).

89

90

```python { .api }

91

class CharacterDataSequence:

92

"""

93

Base class for character data sequences.

94

95

Parameters:

96

- taxon: Associated Taxon object

97

- values: Sequence of character states

98

"""

99

100

def __init__(self, taxon=None, values=None): ...

101

102

def __len__(self):

103

"""Length of sequence."""

104

105

def __iter__(self):

106

"""Iterate over character states."""

107

108

def __getitem__(self, index):

109

"""Get character state at position."""

110

111

def __setitem__(self, index, value):

112

"""Set character state at position."""

113

114

def append(self, value):

115

"""Append character state to sequence."""

116

117

def extend(self, values):

118

"""Extend sequence with multiple states."""

119

120

def symbols_as_string(self):

121

"""Return sequence as string of symbols."""

122

123

# Specific sequence types

124

class DnaCharacterDataSequence(CharacterDataSequence):

125

"""DNA character sequence with nucleotide states."""

126

127

class RnaCharacterDataSequence(CharacterDataSequence):

128

"""RNA character sequence with nucleotide states."""

129

130

class NucleotideCharacterDataSequence(CharacterDataSequence):

131

"""General nucleotide character sequence."""

132

133

class ProteinCharacterDataSequence(CharacterDataSequence):

134

"""Protein character sequence with amino acid states."""

135

136

class StandardCharacterDataSequence(CharacterDataSequence):

137

"""Standard morphological character sequence."""

138

139

class RestrictionSitesCharacterDataSequence(CharacterDataSequence):

140

"""Restriction sites character sequence."""

141

142

class InfiniteSitesCharacterDataSequence(CharacterDataSequence):

143

"""Infinite sites character sequence."""

144

145

class ContinuousCharacterDataSequence(CharacterDataSequence):

146

"""Continuous (quantitative) character sequence."""

147

```

148

149

### Character Matrices

150

151

Classes representing character data matrices (alignments) with multiple sequences.

152

153

```python { .api }

154

class CharacterMatrix:

155

"""

156

Base class for character data matrices.

157

158

Parameters:

159

- taxon_namespace: TaxonNamespace for matrix taxa

160

- default_state_alphabet: StateAlphabet for character states

161

"""

162

163

def __init__(self, taxon_namespace=None, default_state_alphabet=None): ...

164

165

@classmethod

166

def get(cls, **kwargs):

167

"""Read character matrix from external source."""

168

169

def read(self, **kwargs):

170

"""Read data from external source into existing matrix."""

171

172

def write(self, **kwargs):

173

"""Write matrix to external destination."""

174

175

# Matrix access and manipulation

176

def __len__(self):

177

"""Number of sequences (taxa) in matrix."""

178

179

def __iter__(self):

180

"""Iterate over taxon-sequence pairs."""

181

182

def __getitem__(self, taxon):

183

"""Get sequence for specific taxon."""

184

185

def __setitem__(self, taxon, sequence):

186

"""Set sequence for specific taxon."""

187

188

def __contains__(self, taxon):

189

"""Check if taxon has sequence in matrix."""

190

191

def __delitem__(self, taxon):

192

"""Remove taxon and its sequence from matrix."""

193

194

def new_sequence(self, taxon, values=None):

195

"""Create new sequence for taxon."""

196

197

def add_sequence(self, sequence):

198

"""Add existing sequence to matrix."""

199

200

def remove_sequences(self, taxa):

201

"""Remove sequences for specified taxa."""

202

203

def keep_chars(self, indices):

204

"""Keep only characters at specified indices."""

205

206

def remove_chars(self, indices):

207

"""Remove characters at specified indices."""

208

209

# Matrix properties

210

def max_sequence_size(self):

211

"""Length of longest sequence in matrix."""

212

213

def sequence_size_is_uniform(self):

214

"""Check if all sequences have same length."""

215

216

def pack(self, pad_to_size=None):

217

"""Pad sequences to uniform length."""

218

219

def concatenate(self, other_matrices):

220

"""Concatenate with other character matrices."""

221

222

def export_character_indices(self, indices):

223

"""Export the characters at the specified indices as a new matrix."""

224

225

def export_character_subset(self, character_set):

226

"""Export the characters in the specified character set as a new matrix."""

227

228

# Molecular sequence matrices

229

class DnaCharacterMatrix(CharacterMatrix):

230

"""DNA sequence alignment matrix."""

231

232

def __init__(self, **kwargs): ...

233

234

def nucleotide_frequencies(self):

235

"""Calculate nucleotide frequencies across matrix."""

236

237

def gc_content(self):

238

"""Calculate GC content of matrix."""

239

240

class RnaCharacterMatrix(CharacterMatrix):

241

"""RNA sequence alignment matrix."""

242

243

class NucleotideCharacterMatrix(CharacterMatrix):

244

"""General nucleotide sequence matrix."""

245

246

class ProteinCharacterMatrix(CharacterMatrix):

247

"""Protein sequence alignment matrix."""

248

249

def amino_acid_frequencies(self):

250

"""Calculate amino acid frequencies."""

251

252

# Morphological matrices

253

class StandardCharacterMatrix(CharacterMatrix):

254

"""Standard morphological character matrix."""

255

256

class BinaryCharacterMatrix(CharacterMatrix):

257

"""Binary character matrix (0/1 states)."""

258

259

class RestrictionSitesCharacterMatrix(CharacterMatrix):

260

"""Restriction sites presence/absence matrix."""

261

262

class InfiniteSitesCharacterMatrix(CharacterMatrix):

263

"""Infinite sites character matrix."""

264

265

# Quantitative data

266

class ContinuousCharacterMatrix(CharacterMatrix):

267

"""Continuous (quantitative) character matrix."""

268

269

def mean_vector(self):

270

"""Calculate mean values for each character."""

271

272

def variance_vector(self):

273

"""Calculate variance for each character."""

274

275

def covariance_matrix(self):

276

"""Calculate character covariance matrix."""

277

```

278

279

### Character Evolution Models

280

281

Classes for modeling discrete character evolution along phylogenetic trees.

282

283

```python { .api }

284

class DiscreteCharacterEvolutionModel:

285

"""

286

General discrete character evolution model.

287

288

Parameters:

289

- state_alphabet: StateAlphabet defining possible states

290

- stationary_freqs: Equilibrium state frequencies

291

- rate_matrix: Instantaneous rate matrix

292

"""

293

294

def __init__(self, state_alphabet=None, **kwargs): ...

295

296

def p_matrix(self, edge_length):

297

"""Calculate transition probability matrix for given time."""

298

299

def stationary_sample(self, rng=None):

300

"""Sample character state from equilibrium distribution."""

301

302

class Hky85(DiscreteCharacterEvolutionModel):

303

"""

304

HKY85 nucleotide substitution model.

305

306

Parameters:

307

- kappa: Transition/transversion rate ratio

308

- base_freqs: Equilibrium base frequencies [A, C, G, T]

309

"""

310

311

def __init__(self, kappa=1.0, base_freqs=None): ...

312

313

class Jc69(DiscreteCharacterEvolutionModel):

314

"""

315

Jukes-Cantor 69 nucleotide substitution model.

316

317

All substitution rates equal, equal base frequencies.

318

"""

319

320

def __init__(self): ...

321

322

class DiscreteCharacterEvolver:

323

"""

324

Engine for evolving discrete characters on trees.

325

326

Parameters:

327

- seq_model: DiscreteCharacterEvolutionModel

328

- seq_len: Length of sequences to simulate

329

"""

330

331

def __init__(self, seq_model=None, seq_len=None): ...

332

333

def evolve_states(self, tree, seq_len=None, rng=None):

334

"""

335

Simulate character evolution on tree.

336

337

Parameters:

338

- tree: Tree for simulation

339

- seq_len: Number of characters to simulate

340

- rng: Random number generator

341

342

Returns:

343

CharacterMatrix: Simulated character data

344

"""

345

```

346

347

### Character Simulation Functions

348

349

Functions for simulating character evolution under various models.

350

351

```python { .api }

352

def simulate_discrete_char_dataset(tree, seq_len, **kwargs):

353

"""

354

Simulate discrete character dataset on tree.

355

356

Parameters:

357

- tree: Tree for character simulation

358

- seq_len: Number of characters to simulate

359

- char_model: Character evolution model

360

- mutation_rate: Overall mutation rate

361

- rng: Random number generator

362

363

Returns:

364

CharacterMatrix: Simulated character data

365

"""

366

367

def simulate_discrete_chars(tree, char_model, seq_len, **kwargs):

368

"""

369

Simulate discrete characters with specified model.

370

371

Parameters:

372

- tree: Phylogenetic tree

373

- char_model: DiscreteCharacterEvolutionModel

374

- seq_len: Sequence length

375

- rng: Random number generator

376

377

Returns:

378

CharacterMatrix: Simulated alignment

379

"""

380

381

def hky85_chars(tree, seq_len, kappa=1.0, base_freqs=None, **kwargs):

382

"""

383

Simulate DNA sequences under HKY85 model.

384

385

Parameters:

386

- tree: Phylogenetic tree with branch lengths

387

- seq_len: Length of sequences to simulate

388

- kappa: Transition/transversion rate ratio

389

- base_freqs: Base frequencies [A, C, G, T]

390

- mutation_rate: Mutation rate multiplier

391

- rng: Random number generator

392

393

Returns:

394

DnaCharacterMatrix: Simulated DNA alignment

395

"""

396

397

def evolve_continuous_char(tree, char_matrix, **kwargs):

398

"""

399

Evolve continuous characters using Brownian motion.

400

401

Parameters:

402

- tree: Phylogenetic tree

403

- char_matrix: Initial continuous character values

404

- rate: Rate of character evolution

405

- rng: Random number generator

406

407

Returns:

408

ContinuousCharacterMatrix: Evolved character data

409

"""

410

```

411

412

### Character Data Conversion

413

414

Functions for converting between character data types and formats.

415

416

```python { .api }

417

def concatenate_matrices(matrices, taxon_namespace=None):

418

"""

419

Concatenate multiple character matrices.

420

421

Parameters:

422

- matrices: List of CharacterMatrix objects

423

- taxon_namespace: Target TaxonNamespace

424

425

Returns:

426

CharacterMatrix: Concatenated matrix

427

"""

428

429

def standardize_taxon_namespace(matrices, taxon_namespace=None):

430

"""

431

Standardize taxon namespace across multiple matrices.

432

433

Parameters:

434

- matrices: List of CharacterMatrix objects

435

- taxon_namespace: Target TaxonNamespace

436

437

Returns:

438

None (modifies matrices in place)

439

"""

440

441

def convert_dna_to_protein(dna_matrix, genetic_code=None):

442

"""

443

Translate DNA matrix to protein matrix.

444

445

Parameters:

446

- dna_matrix: DnaCharacterMatrix to translate

447

- genetic_code: Genetic code for translation

448

449

Returns:

450

ProteinCharacterMatrix: Translated sequences

451

"""

452

```

453

454

### Character Statistics

455

456

Functions for calculating statistics on character data.

457

458

```python { .api }

459

def char_state_frequencies(char_matrix, gap_as_missing=True):

460

"""

461

Calculate character state frequencies.

462

463

Parameters:

464

- char_matrix: CharacterMatrix to analyze

465

- gap_as_missing: Treat gaps as missing data

466

467

Returns:

468

dict: State frequencies across matrix

469

"""

470

471

def pairwise_sequence_distances(char_matrix, distance_fn=None):

472

"""

473

Calculate pairwise distances between sequences.

474

475

Parameters:

476

- char_matrix: CharacterMatrix for distance calculation

477

- distance_fn: Distance function (default: p-distance)

478

479

Returns:

480

dict: Pairwise distance matrix

481

"""

482

483

def invariant_sites_proportion(char_matrix):

484

"""

485

Calculate proportion of invariant sites.

486

487

Parameters:

488

- char_matrix: CharacterMatrix to analyze

489

490

Returns:

491

float: Proportion of sites with no variation

492

"""

493

494

def segregating_sites_count(char_matrix):

495

"""

496

Count number of segregating (variable) sites.

497

498

Parameters:

499

- char_matrix: CharacterMatrix to analyze

500

501

Returns:

502

int: Number of variable sites

503

"""

504

```