A set of utilities for processing MediaWiki XML dump data efficiently with streaming and distributed processing capabilities.
—
This section covers the essential classes for parsing MediaWiki XML dumps into structured Python objects with streaming iteration support. These classes form the foundation of mwxml's memory-efficient processing approach.
`Dump` is the main entry point for processing MediaWiki XML dumps, providing access to site information and iterators for pages and log items.
# Interface summary only: each method body below is just its docstring.
class Dump:
    """
    XML Dump Iterator containing site metadata and page/log item iterators.

    Attributes:
    - site_info: SiteInfo object with metadata from <siteinfo> block
    - pages: Iterator of Page elements
    - log_items: Iterator of LogItem elements
    - items: Iterator of both Page and LogItem elements
    """

    @classmethod
    def from_file(cls, f):
        """
        Constructs a Dump from a file pointer.

        Parameters:
        - f: Plain text file pointer containing XML to process

        Returns: Dump instance
        """

    @classmethod
    def from_page_xml(cls, page_xml):
        """
        Constructs a Dump from a <page> block.

        Parameters:
        - page_xml: String or file containing <page> block XML to process

        Returns: Dump instance
        """

    def __iter__(self):
        """Returns iterator over items (pages and log items)."""

    def __next__(self):
        """Returns next item from iterator."""


# Usage Example:
import mwxml

# Process from file.
# The page describes mwxml as a streaming parser, so every read from the dump
# is kept inside the ``with`` block, while the file is still open.
with open("dump.xml") as f:
    dump = mwxml.Dump.from_file(f)

    # Access site information
    print(f"Site: {dump.site_info.name}")
    print(f"Database: {dump.site_info.dbname}")

    # Process all items (pages and log items)
    for item in dump:
        if isinstance(item, mwxml.Page):
            print(f"Page: {item.title}")
        elif isinstance(item, mwxml.LogItem):
            print(f"Log: {item.type}")

# Process from page XML fragment
page_xml = """<page>
<title>Test Page</title>
<id>123</id>
<revision>
<id>456</id>
<text>Page content</text>
</revision>
</page>"""
dump = mwxml.Dump.from_page_xml(page_xml)

# Represents individual pages with metadata and revision iterators for
# memory-efficient processing of page histories.
# Interface summary only: each method body below is just its docstring.
class Page:
    """
    Page metadata and Revision iterator.

    Attributes (inherited from mwtypes.Page):
    - id: Page ID (int)
    - title: Page title (str)
    - namespace: Namespace ID (int)
    - redirect: Redirect target title (str | None)
    - restrictions: List of restriction strings (list[str])
    """

    @classmethod
    def from_element(cls, element, namespace_map=None):
        """
        Constructs Page from XML element.

        Parameters:
        - element: XML element representing <page>
        - namespace_map: Optional mapping of namespace names to Namespace objects

        Returns: Page instance
        """

    def __iter__(self):
        """Returns iterator over page revisions."""

    def __next__(self):
        """Returns next revision from iterator."""


# Usage Example:
# Iterate through pages in dump
# (``dump`` is the Dump instance created in the earlier example.)
for page in dump.pages:
    print(f"Processing page: {page.title} (ID: {page.id})")
    print(f"Namespace: {page.namespace}")
    if page.redirect:
        print(f"Redirects to: {page.redirect}")

    # Process all revisions for this page
    revision_count = 0
    for revision in page:
        revision_count += 1
        print(f" Revision {revision.id} at {revision.timestamp}")
    print(f"Total revisions: {revision_count}")

# Represents individual revisions with complete metadata, user information,
# and content data.
# Interface summary only: the method body below is just its docstring.
class Revision:
    """
    Revision metadata and text content.

    Attributes (inherited from mwtypes.Revision):
    - id: Revision ID (int)
    - timestamp: Revision timestamp (Timestamp)
    - user: User who made the revision (User | None)
    - minor: Whether this is a minor edit (bool)
    - parent_id: Parent revision ID (int | None)
    - comment: Edit comment (str | None)
    - deleted: Deletion status information (Deleted)
    - slots: Content slots containing text and metadata (Slots)
    """

    @classmethod
    def from_element(cls, element):
        """
        Constructs Revision from XML element.

        Parameters:
        - element: XML element representing <revision>

        Returns: Revision instance
        """


# Usage Example:
# ``dump`` is the Dump instance created in the earlier example.
for page in dump.pages:
    for revision in page:
        # ``user`` may be None (see the Revision attributes), so fall back
        # to a placeholder name.
        print(f"Revision {revision.id} by {revision.user.text if revision.user else 'Anonymous'}")
        print(f"Timestamp: {revision.timestamp}")
        print(f"Minor edit: {revision.minor}")
        if revision.comment:
            print(f"Comment: {revision.comment}")

        # Access revision content
        if revision.slots and revision.slots.main:
            main_content = revision.slots.main
            if main_content.text:
                print(f"Text length: {len(main_content.text)}")
            # Model and format are reported whether or not text is present.
            print(f"Content model: {main_content.model}")
            print(f"Format: {main_content.format}")

# All parsing operations can raise MalformedXML exceptions when the XML
# structure doesn't match expected MediaWiki dump format.
class MalformedXML(Exception):
    """
    Thrown when XML dump file is not formatted as expected.

    This exception is raised during parsing when:
    - Required XML elements are missing
    - XML structure doesn't match MediaWiki dump schema
    - Unexpected XML elements are encountered
    - XML parsing errors occur
    """


# Error Handling Example:
import mwxml
from mwxml.errors import MalformedXML

try:
    # ``with`` closes the file even when parsing fails part-way through.
    with open("dump.xml") as f:
        dump = mwxml.Dump.from_file(f)
        # Iterate ``dump.pages`` rather than ``dump``: iterating the dump
        # itself yields log items as well, and LogItem is not documented as
        # iterable, so the inner revision loop would break on them.
        for page in dump.pages:
            for revision in page:
                print(f"Processing revision {revision.id}")
except MalformedXML as e:
    print(f"XML format error: {e}")
except FileNotFoundError as e:
    print(f"File not found: {e}")
except Exception as e:
    # Last-resort handler for this top-level example script.
    print(f"Unexpected error: {e}")

# Contains metadata about the MediaWiki site from the <siteinfo> block,
# including site name, database name, and namespace configuration.
# Interface summary only: the method body below is just its docstring.
class SiteInfo:
    """
    Site metadata from <siteinfo> block.

    Attributes:
    - name: Site name (str | None)
    - dbname: Database name (str | None)
    - base: Base URL (str | None)
    - generator: Generator information (str | None)
    - case: Case sensitivity setting (str | None)
    - namespaces: List of Namespace objects (list[Namespace] | None)
    """

    @classmethod
    def from_element(cls, element):
        """
        Constructs SiteInfo from XML element.

        Parameters:
        - element: XML element representing <siteinfo>

        Returns: SiteInfo instance
        """


# Usage Example:
# ``dump`` is the Dump instance created in the earlier example.
site_info = dump.site_info
print(f"Site name: {site_info.name}")
print(f"Database: {site_info.dbname}")
print(f"Base URL: {site_info.base}")
print(f"Generator: {site_info.generator}")

# Process namespaces
# ``namespaces`` may be None (see the SiteInfo attributes), hence the guard.
if site_info.namespaces:
    print("Namespaces:")
    for ns in site_info.namespaces:
        print(f" {ns.id}: {ns.name}")

# Represents log entries for administrative actions and events in the wiki.
# Interface summary only: the method body below is just its docstring.
class LogItem:
    """
    Log entry metadata for administrative actions.

    Attributes (inherited from mwtypes.LogItem):
    - id: Log item ID (int)
    - timestamp: Event timestamp (Timestamp)
    - comment: Log comment (str | None)
    - user: User who performed the action (User | None)
    - page: Page affected by the action (Page | None)
    - type: Log type (str | None)
    - action: Specific action performed (str | None)
    - text: Additional text data (str | None)
    - params: Action parameters (str | None)
    - deleted: Deletion status information (Deleted)
    """

    @classmethod
    def from_element(cls, element, namespace_map=None):
        """
        Constructs LogItem from XML element.

        Parameters:
        - element: XML element representing <logitem>
        - namespace_map: Optional mapping of namespace names to Namespace objects

        Returns: LogItem instance
        """


# Usage Example:
# Process log items from dump
# (``dump`` is the Dump instance created in the earlier example.)
for log_item in dump.log_items:
    print(f"Log {log_item.id}: {log_item.type}/{log_item.action}")
    print(f"Timestamp: {log_item.timestamp}")
    # The fields below are documented as optional (``| None``), so each one
    # is printed only when present.
    if log_item.user:
        print(f"User: {log_item.user.text}")
    if log_item.page:
        print(f"Page: {log_item.page.title}")
    if log_item.comment:
        print(f"Comment: {log_item.comment}")
    if log_item.params:
        print(f"Parameters: {log_item.params}")

# Install with Tessl CLI
npx tessl i tessl/pypi-mwxml