tessl/pypi-wikipedia-api

Python wrapper for Wikipedia's API that provides easy access to page content, sections, links, categories, and translations

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Categories

Name: tessl/pypi-wikipedia-api
Author: tessl

Work with Wikipedia's category system including page categories and category membership. Categories provide hierarchical organization of Wikipedia content, enabling discovery of related articles and navigation through topical groupings.

Capabilities

Page Categories

Access categories that a page belongs to, providing topical classification and related content discovery.

class WikipediaPage:
    @property
    def categories(self) -> dict[str, "WikipediaPage"]:
        """
        Get categories that this page belongs to.

        Returns:
            Dictionary mapping category names to WikipediaPage objects.
            Keys are category names (including the "Category:" prefix),
            values are WikipediaPage instances representing category pages.
        """
        # The return annotation quotes "WikipediaPage" because the class name
        # is not bound yet while its own body is being executed.

Usage Examples

import wikipediaapi

wiki = wikipediaapi.Wikipedia('MyApp/1.0', 'en')
page = wiki.page('Machine_learning')

# Mapping of category name -> category page for this article
categories = page.categories
print(f"Page belongs to {len(categories)} categories")

# Print namespace and title for every category the page is in
for category_name, category_page in categories.items():
    print(f"Category: {category_name}")
    print(f"  Namespace: {category_page.namespace}")  # Should be 14 for categories
    print(f"  Title: {category_page.title}")

# Sort category names into subject-matter and maintenance buckets
subject_keywords = ('computer science', 'algorithms', 'artificial intelligence')
maintenance_keywords = ('articles', 'pages', 'wikipedia')

subject_categories = []
maintenance_categories = []

for cat_name in categories:
    lowered = cat_name.lower()
    if any(keyword in lowered for keyword in subject_keywords):
        # Subject keywords win over maintenance keywords
        subject_categories.append(cat_name)
        continue
    if any(keyword in lowered for keyword in maintenance_keywords):
        maintenance_categories.append(cat_name)

print(f"Subject categories: {len(subject_categories)}")
for name in subject_categories:
    print(f"  - {name}")

print(f"Maintenance categories: {len(maintenance_categories)}")

Category Members

For category pages, access all pages that belong to that category, enabling exploration of topically related content.

class WikipediaPage:
    @property
    def categorymembers(self) -> dict[str, "WikipediaPage"]:
        """
        Get pages that belong to this category (for category pages).

        Returns:
            Dictionary mapping page titles to WikipediaPage objects.
            Keys are page titles, values are WikipediaPage instances.
            Only meaningful for pages in the Category namespace.
        """
        # The return annotation quotes "WikipediaPage" because the class name
        # is not bound yet while its own body is being executed.

Usage Examples

# Load a category page explicitly in the Category namespace
# (reuses the `wiki` client created in the earlier example)
category_page = wiki.page('Category:Machine_learning', ns=wikipediaapi.Namespace.CATEGORY)

if category_page.exists():
    members = category_page.categorymembers
    print(f"Category has {len(members)} members")

    # Bucket member titles by the namespace of the member page
    articles, subcategories, other = [], [], []

    for title, member_page in members.items():
        member_ns = member_page.namespace
        if member_ns == wikipediaapi.Namespace.MAIN:
            bucket = articles
        elif member_ns == wikipediaapi.Namespace.CATEGORY:
            bucket = subcategories
        else:
            bucket = other
        bucket.append(title)

    print(f"Articles: {len(articles)}")
    print(f"Subcategories: {len(subcategories)}")
    print(f"Other: {len(other)}")

    # Preview the first ten articles
    print("\nSome articles in this category:")
    for article_title in articles[:10]:
        print(f"  - {article_title}")

    # Preview the first five subcategories
    print("\nSubcategories:")
    for subcat_title in subcategories[:5]:
        print(f"  - {subcat_title}")

# Direct category exploration
physics_cat = wiki.page('Category:Physics', ns=wikipediaapi.Namespace.CATEGORY)
if physics_cat.exists():
    physics_members = physics_cat.categorymembers
    print(f"Physics category has {len(physics_members)} members")

    # Find notable physics articles
    for title, page in physics_members.items():
        if page.namespace != wikipediaapi.Namespace.MAIN:
            continue
        # Could check page.summary or other properties to assess importance
        if len(title) < 30:  # Simple heuristic for main topics
            print(f"Physics article: {title}")

Category Hierarchy Navigation

Navigate through Wikipedia's category hierarchy to explore related topics and subcategories.

Usage Examples

def explore_category_hierarchy(category_name, max_depth=2, current_depth=0, wiki=None):
    """Recursively print a category's articles and subcategories.

    Args:
        category_name: Category title, including the "Category:" prefix.
        max_depth: Recursion stops as soon as ``current_depth`` reaches this
            value, so it is the number of levels that get printed.
        current_depth: Depth of this call; it drives the indentation of the
            output. Top-level callers normally leave it at 0.
        wiki: Optional ``wikipediaapi.Wikipedia`` client to reuse. When
            omitted, one client is created and handed to every recursive
            call instead of building a new client per category.

    Returns:
        None. Results are printed.
    """
    if current_depth >= max_depth:
        return

    if wiki is None:
        wiki = wikipediaapi.Wikipedia('MyApp/1.0', 'en')
    category_page = wiki.page(category_name, ns=wikipediaapi.Namespace.CATEGORY)

    if not category_page.exists():
        return

    indent = "  " * current_depth
    print(f"{indent}Category: {category_name}")

    # Split members into plain articles and subcategories; members in any
    # other namespace are ignored here.
    articles = []
    subcategories = []
    for title, member_page in category_page.categorymembers.items():
        if member_page.namespace == wikipediaapi.Namespace.MAIN:
            articles.append(title)
        elif member_page.namespace == wikipediaapi.Namespace.CATEGORY:
            subcategories.append(title)

    print(f"{indent}  Articles: {len(articles)}")
    print(f"{indent}  Subcategories: {len(subcategories)}")

    # Show some articles
    for article in articles[:3]:
        print(f"{indent}    - {article}")

    # Recurse into subcategories, sharing the same client
    for subcat in subcategories[:3]:  # Limit to prevent too much output
        explore_category_hierarchy(subcat, max_depth, current_depth + 1, wiki)

# Explore computer science hierarchy
explore_category_hierarchy('Category:Computer_science', max_depth=3)

Category-Based Content Discovery

Use categories to discover related content and analyze topical relationships.

Usage Examples

def find_related_articles_via_categories(page_title, min_shared_categories=2):
    """Find articles that share categories with the given page.

    Args:
        page_title: Title of the page to start from.
        min_shared_categories: Minimum number of this page's categories an
            article must also belong to in order to be returned.

    Returns:
        A list of ``(article_title, shared_count, shared_categories)`` tuples,
        sorted by ``shared_count`` in descending order. ``shared_categories``
        is the set of category names the article shares with the page.
        An empty list is returned when the page does not exist.
    """
    wiki = wikipediaapi.Wikipedia('MyApp/1.0', 'en')
    page = wiki.page(page_title)

    if not page.exists():
        return []

    # The page is a member of each of its own categories and must not be
    # reported as related to itself. Member titles use spaces, so an
    # underscore-style argument such as 'Neural_network' would never match
    # them; compare against every spelling of the page's own title instead.
    own_titles = {page_title, page_title.replace('_', ' '), page.title}

    # article title -> set of category names it shares with the page
    related_articles = {}

    for category_name, category_page in page.categories.items():
        if not category_page.exists():
            continue
        for member_title, member_page in category_page.categorymembers.items():
            if member_page.namespace != wikipediaapi.Namespace.MAIN:
                continue
            if member_title in own_titles:
                continue
            related_articles.setdefault(member_title, set()).add(category_name)

    # Keep only articles that share enough categories
    highly_related = [
        (article_title, len(shared_cats), shared_cats)
        for article_title, shared_cats in related_articles.items()
        if len(shared_cats) >= min_shared_categories
    ]

    # Sort by number of shared categories, most first
    highly_related.sort(key=lambda entry: entry[1], reverse=True)
    return highly_related

def analyze_category_overlap(page1_title, page2_title):
    """Compare the category sets of two pages.

    Returns None if either page does not exist. Otherwise returns a dict
    with the sorted shared categories, the sorted categories unique to each
    page, and the ratio of shared categories to all categories of both pages
    (0 when neither page has any category).
    """
    wiki = wikipediaapi.Wikipedia('MyApp/1.0', 'en')

    first = wiki.page(page1_title)
    second = wiki.page(page2_title)

    if not first.exists() or not second.exists():
        return None

    first_cats = set(first.categories)
    second_cats = set(second.categories)

    common = first_cats & second_cats
    combined = first_cats | second_cats

    # Guard against dividing by zero when both pages have no categories
    denominator = len(combined) or 1

    return {
        'shared_categories': sorted(common),
        'only_in_first': sorted(first_cats - second_cats),
        'only_in_second': sorted(second_cats - first_cats),
        'similarity_ratio': len(common) / denominator
    }

# Look up articles that share at least two categories with "Neural network"
related = find_related_articles_via_categories('Neural_network', min_shared_categories=2)
print(f"Found {len(related)} highly related articles:")
for article, shared_count, shared_cats in related[:10]:
    print(f"  {article} (shares {shared_count} categories)")
    # Show first 3 shared categories
    for position, cat in enumerate(shared_cats):
        if position >= 3:
            break
        print(f"    - {cat}")

# Measure how much the categories of two pages overlap
comparison = analyze_category_overlap('Machine_learning', 'Deep_learning')
if comparison:
    shared_names = comparison['shared_categories']
    print(f"Similarity ratio: {comparison['similarity_ratio']:.2f}")
    print(f"Shared categories: {len(shared_names)}")
    for cat in shared_names[:5]:
        print(f"  - {cat}")

Category Filtering and Analysis

Advanced category filtering and analysis for content organization.

Usage Examples

def _has_word_starting_with(words, prefixes):
    """Return True if any word in ``words`` starts with one of ``prefixes``."""
    return any(word.startswith(prefixes) for word in words)


def filter_categories_by_type(page_title):
    """Group a page's categories into rough types using keyword heuristics.

    Args:
        page_title: Title of the page whose categories are classified.

    Returns:
        None if the page does not exist. Otherwise a dict with the keys
        'subject', 'geographic', 'temporal', 'maintenance' and 'other', each
        holding a list of category names. A category lands in the first
        matching group, checked in the order maintenance, geographic,
        temporal, subject; anything unmatched goes to 'other'.
    """
    import re  # function-scope import keeps this example self-contained

    wiki = wikipediaapi.Wikipedia('MyApp/1.0', 'en')
    page = wiki.page(page_title)

    if not page.exists():
        return None

    categorized = {
        'subject': [],       # Subject matter categories
        'geographic': [],    # Geographic categories
        'temporal': [],      # Time-based categories
        'maintenance': [],   # Wikipedia maintenance categories
        'other': []
    }

    maintenance_prefixes = ('articles', 'pages', 'wikipedia', 'cleanup')
    geographic_prefixes = ('country', 'city', 'region', 'american', 'european')
    temporal_prefixes = ('century', 'year', 'decade', 'era')
    subject_keywords = ('science', 'mathematics', 'computer', 'physics')

    for cat_name in page.categories:
        cat_lower = cat_name.lower()

        # Match these keywords at the start of a word rather than anywhere in
        # the name: plain substring tests misfile "particles" ('articles'),
        # "electricity" ('city') and "general" ('era').
        words = re.findall(r'\w+', cat_lower)

        if _has_word_starting_with(words, maintenance_prefixes):
            categorized['maintenance'].append(cat_name)
        elif _has_word_starting_with(words, geographic_prefixes):
            categorized['geographic'].append(cat_name)
        elif _has_word_starting_with(words, temporal_prefixes):
            categorized['temporal'].append(cat_name)
        elif any(keyword in cat_lower for keyword in subject_keywords):
            # Subject keywords stay substring-based so compound terms such as
            # "astrophysics" or "neuroscience" still count as subjects.
            categorized['subject'].append(cat_name)
        else:
            categorized['other'].append(cat_name)

    return categorized

def get_category_statistics(category_name):
    """Count the members of a category by kind.

    Returns None if the category page does not exist. Otherwise returns a
    dict with the total member count, the number of articles, subcategories
    and other members, and a 'member_types' dict that counts members per
    namespace under keys of the form "Namespace_<ns>".
    """
    wiki = wikipediaapi.Wikipedia('MyApp/1.0', 'en')
    category_page = wiki.page(category_name, ns=wikipediaapi.Namespace.CATEGORY)

    if not category_page.exists():
        return None

    members = category_page.categorymembers

    article_count = 0
    subcategory_count = 0
    other_count = 0
    per_namespace = {}

    # Only the member pages are needed here, not their titles
    for member_page in members.values():
        ns = member_page.namespace
        if ns == wikipediaapi.Namespace.MAIN:
            article_count += 1
        elif ns == wikipediaapi.Namespace.CATEGORY:
            subcategory_count += 1
        else:
            other_count += 1

        label = f"Namespace_{ns}"
        per_namespace[label] = per_namespace.get(label, 0) + 1

    return {
        'total_members': len(members),
        'articles': article_count,
        'subcategories': subcategory_count,
        'other': other_count,
        'member_types': per_namespace
    }

# Classify the categories of one page and preview each non-empty group
category_analysis = filter_categories_by_type('Quantum_computing')
if category_analysis:
    for cat_type, cats in category_analysis.items():
        if not cats:
            continue
        print(f"{cat_type.title()} categories ({len(cats)}):")
        for cat in cats[:3]:  # Show first 3
            print(f"  - {cat}")

# Summarize the members of one category
stats = get_category_statistics('Category:Artificial_intelligence')
if stats:
    print("Category statistics:")
    for label, key in (
        ("Total members", 'total_members'),
        ("Articles", 'articles'),
        ("Subcategories", 'subcategories'),
        ("Other", 'other'),
    ):
        print(f"  {label}: {stats[key]}")

Install with Tessl CLI