or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-processing.md distributed-processing.md index.md utilities.md

core-processing.md docs/

0

# Core XML Processing

1

2

Essential classes for parsing MediaWiki XML dumps into structured Python objects with streaming iteration support. These classes form the foundation of mwxml's memory-efficient processing approach.

3

4

## Capabilities

5

6

### Dump Processing

7

8

The main entry point for processing MediaWiki XML dumps, providing access to site information and iterators for pages and log items.

9

10

```python { .api }

11

class Dump:

12

"""

13

XML Dump Iterator containing site metadata and page/log item iterators.

14

15

Attributes:

16

- site_info: SiteInfo object with metadata from <siteinfo> block

17

- pages: Iterator of Page elements

18

- log_items: Iterator of LogItem elements

19

- items: Iterator of both Page and LogItem elements

20

"""

21

22

@classmethod

23

def from_file(cls, f):

24

"""

25

Constructs a Dump from a file pointer.

26

27

Parameters:

28

- f: Plain text file pointer containing XML to process

29

30

Returns: Dump instance

31

"""

32

33

@classmethod

34

def from_page_xml(cls, page_xml):

35

"""

36

Constructs a Dump from a <page> block.

37

38

Parameters:

39

- page_xml: String or file containing <page> block XML to process

40

41

Returns: Dump instance

42

"""

43

44

def __iter__(self):

45

"""Returns iterator over items (pages and log items)."""

46

47

def __next__(self):

48

"""Returns next item from iterator."""

49

```

50

51

**Usage Example:**

52

53

```python

54

import mwxml

55

56

# Process from file

57

with open("dump.xml") as f:

58

dump = mwxml.Dump.from_file(f)

59

60

# Access site information

61

print(f"Site: {dump.site_info.name}")

62

print(f"Database: {dump.site_info.dbname}")

63

64

# Process all items (pages and log items)

65

for item in dump:

66

if isinstance(item, mwxml.Page):

67

print(f"Page: {item.title}")

68

elif isinstance(item, mwxml.LogItem):

69

print(f"Log: {item.type}")

70

71

# Process from page XML fragment

72

page_xml = """<page>

73

<title>Test Page</title>

74

<id>123</id>

75

<revision>

76

<id>456</id>

77

<text>Page content</text>

78

</revision>

79

</page>"""

80

81

dump = mwxml.Dump.from_page_xml(page_xml)

82

```

83

84

### Page Processing

85

86

Represents individual pages with metadata and revision iterators for memory-efficient processing of page histories.

87

88

```python { .api }

89

class Page:

90

"""

91

Page metadata and Revision iterator.

92

93

Attributes (inherited from mwtypes.Page):

94

- id: Page ID (int)

95

- title: Page title (str)

96

- namespace: Namespace ID (int)

97

- redirect: Redirect target title (str | None)

98

- restrictions: List of restriction strings (list[str])

99

"""

100

101

@classmethod

102

def from_element(cls, element, namespace_map=None):

103

"""

104

Constructs Page from XML element.

105

106

Parameters:

107

- element: XML element representing <page>

108

- namespace_map: Optional mapping of namespace names to Namespace objects

109

110

Returns: Page instance

111

"""

112

113

def __iter__(self):

114

"""Returns iterator over page revisions."""

115

116

def __next__(self):

117

"""Returns next revision from iterator."""

118

```

119

120

**Usage Example:**

121

122

```python

123

# Iterate through pages in dump

124

for page in dump.pages:

125

print(f"Processing page: {page.title} (ID: {page.id})")

126

print(f"Namespace: {page.namespace}")

127

128

if page.redirect:

129

print(f"Redirects to: {page.redirect}")

130

131

# Process all revisions for this page

132

revision_count = 0

133

for revision in page:

134

revision_count += 1

135

print(f" Revision {revision.id} at {revision.timestamp}")

136

137

print(f"Total revisions: {revision_count}")

138

```

139

140

### Revision Processing

141

142

Represents individual revisions with complete metadata, user information, and content data.

143

144

```python { .api }

145

class Revision:

146

"""

147

Revision metadata and text content.

148

149

Attributes (inherited from mwtypes.Revision):

150

- id: Revision ID (int)

151

- timestamp: Revision timestamp (Timestamp)

152

- user: User who made the revision (User | None)

153

- minor: Whether this is a minor edit (bool)

154

- parent_id: Parent revision ID (int | None)

155

- comment: Edit comment (str | None)

156

- deleted: Deletion status information (Deleted)

157

- slots: Content slots containing text and metadata (Slots)

158

"""

159

160

@classmethod

161

def from_element(cls, element):

162

"""

163

Constructs Revision from XML element.

164

165

Parameters:

166

- element: XML element representing <revision>

167

168

Returns: Revision instance

169

"""

170

```

171

172

**Usage Example:**

173

174

```python

175

for page in dump.pages:

176

for revision in page:

177

print(f"Revision {revision.id} by {revision.user.text if revision.user else 'Anonymous'}")

178

print(f"Timestamp: {revision.timestamp}")

179

print(f"Minor edit: {revision.minor}")

180

181

if revision.comment:

182

print(f"Comment: {revision.comment}")

183

184

# Access revision content

185

if revision.slots and revision.slots.main:

186

main_content = revision.slots.main

187

if main_content.text:

188

print(f"Text length: {len(main_content.text)}")

189

print(f"Content model: {main_content.model}")

190

print(f"Format: {main_content.format}")

191

```

192

193

## Error Handling

194

195

All parsing operations can raise `MalformedXML` exceptions when the XML structure doesn't match the expected MediaWiki dump format.

196

197

```python { .api }

198

class MalformedXML(Exception):

199

"""

200

Raised when an XML dump file is not formatted as expected.

201

202

This exception is raised during parsing when:

203

- Required XML elements are missing

204

- XML structure doesn't match MediaWiki dump schema

205

- Unexpected XML elements are encountered

206

- XML parsing errors occur

207

"""

208

```

209

210

**Error Handling Example:**

211

212

```python

213

import mwxml

214

from mwxml.errors import MalformedXML

215

216

try:

217

dump = mwxml.Dump.from_file(open("dump.xml"))

218

for page in dump:

219

for revision in page:

220

print(f"Processing revision {revision.id}")

221

except MalformedXML as e:

222

print(f"XML format error: {e}")

223

except FileNotFoundError as e:

224

print(f"File not found: {e}")

225

except Exception as e:

226

print(f"Unexpected error: {e}")

227

```

228

229

### Site Information Processing

230

231

Contains metadata about the MediaWiki site from the `<siteinfo>` block, including site name, database name, and namespace configuration.

232

233

```python { .api }

234

class SiteInfo:

235

"""

236

Site metadata from <siteinfo> block.

237

238

Attributes:

239

- name: Site name (str | None)

240

- dbname: Database name (str | None)

241

- base: Base URL (str | None)

242

- generator: Generator information (str | None)

243

- case: Case sensitivity setting (str | None)

244

- namespaces: List of Namespace objects (list[Namespace] | None)

245

"""

246

247

@classmethod

248

def from_element(cls, element):

249

"""

250

Constructs SiteInfo from XML element.

251

252

Parameters:

253

- element: XML element representing <siteinfo>

254

255

Returns: SiteInfo instance

256

"""

257

```

258

259

**Usage Example:**

260

261

```python

262

site_info = dump.site_info

263

264

print(f"Site name: {site_info.name}")

265

print(f"Database: {site_info.dbname}")

266

print(f"Base URL: {site_info.base}")

267

print(f"Generator: {site_info.generator}")

268

269

# Process namespaces

270

if site_info.namespaces:

271

print("Namespaces:")

272

for ns in site_info.namespaces:

273

print(f" {ns.id}: {ns.name}")

274

```

275

276

### Log Item Processing

277

278

Represents log entries for administrative actions and events in the wiki.

279

280

```python { .api }

281

class LogItem:

282

"""

283

Log entry metadata for administrative actions.

284

285

Attributes (inherited from mwtypes.LogItem):

286

- id: Log item ID (int)

287

- timestamp: Event timestamp (Timestamp)

288

- comment: Log comment (str | None)

289

- user: User who performed the action (User | None)

290

- page: Page affected by the action (Page | None)

291

- type: Log type (str | None)

292

- action: Specific action performed (str | None)

293

- text: Additional text data (str | None)

294

- params: Action parameters (str | None)

295

- deleted: Deletion status information (Deleted)

296

"""

297

298

@classmethod

299

def from_element(cls, element, namespace_map=None):

300

"""

301

Constructs LogItem from XML element.

302

303

Parameters:

304

- element: XML element representing <logitem>

305

- namespace_map: Optional mapping of namespace names to Namespace objects

306

307

Returns: LogItem instance

308

"""

309

```

310

311

**Usage Example:**

312

313

```python

314

# Process log items from dump

315

for log_item in dump.log_items:

316

print(f"Log {log_item.id}: {log_item.type}/{log_item.action}")

317

print(f"Timestamp: {log_item.timestamp}")

318

319

if log_item.user:

320

print(f"User: {log_item.user.text}")

321

322

if log_item.page:

323

print(f"Page: {log_item.page.title}")

324

325

if log_item.comment:

326

print(f"Comment: {log_item.comment}")

327

328

if log_item.params:

329

print(f"Parameters: {log_item.params}")

330

```