A Python library for scraping the Google search engine.
npx @tessl/cli install tessl/pypi-googlesearch-python@1.3.0

A Python library for scraping the Google search engine using web scraping techniques. It leverages requests for HTTP communication and BeautifulSoup4 for HTML parsing to extract search results, titles, URLs, and descriptions from Google's search pages.
pip install googlesearch-python

from googlesearch import search

For advanced search results with structured data:

from googlesearch import search, SearchResult

For user agent utilities:

from googlesearch import get_useragent

from googlesearch import search
# Simple search - returns URLs only
for url in search("Python programming", num_results=10):
    print(url)
# Advanced search - returns SearchResult objects with structured data
for result in search("Python programming", num_results=10, advanced=True):
print(f"Title: {result.title}")
print(f"URL: {result.url}")
print(f"Description: {result.description}")
print("---")
# Search with language and region settings
for url in search("Python programming", lang="en", region="us", num_results=5):
    print(url)

Performs Google search queries with extensive customization options, including result count control, language and region selection, proxy support, and safe search toggling.
def search(
    term: str,
    num_results: int = 10,
    lang: str = "en",
    proxy: str = None,
    advanced: bool = False,
    sleep_interval: int = 0,
    timeout: int = 5,
    safe: str = "active",
    ssl_verify: bool = None,
    region: str = None,
    start_num: int = 0,
    unique: bool = False
):
"""
Search the Google search engine and yield results.
Parameters:
- term: Search query string
- num_results: Number of results to return (default: 10)
- lang: Language code for search results (default: "en")
- proxy: HTTP/HTTPS proxy URL (optional)
- advanced: Return SearchResult objects instead of URLs (default: False)
- sleep_interval: Sleep time between requests in seconds (default: 0)
- timeout: Request timeout in seconds (default: 5)
- safe: Safe search setting - "active" or None (default: "active")
- ssl_verify: SSL certificate verification (optional)
- region: Country code for region-specific results (optional)
- start_num: Starting result number for pagination (default: 0)
- unique: Filter duplicate URLs (default: False)
Yields:
- str: URLs when advanced=False
- SearchResult: Result objects when advanced=True
Examples:
Basic search returning URLs:
>>> for url in search("machine learning", num_results=5):
... print(url)
Advanced search with structured results:
>>> for result in search("AI research", advanced=True, num_results=3):
... print(f"{result.title}: {result.url}")
Search with language and region:
>>> for url in search("café", lang="fr", region="fr", num_results=5):
... print(url)
Search with proxy and SSL settings:
>>> proxy_url = "http://proxy.example.com:8080"
>>> for url in search("secure search", proxy=proxy_url, ssl_verify=False):
... print(url)
Paginated search with rate limiting:
>>> for url in search("large dataset", num_results=200, sleep_interval=2):
... print(url)
"""Generates random user agent strings for HTTP requests to improve request diversity and reduce detection.
def get_useragent() -> str:
"""
Generate a random user agent string mimicking Lynx browser format.
The user agent string components:
- Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
- libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
- SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
- OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
Returns:
str: A randomly generated user agent string in the format:
"Lynx/x.y.z libwww-FM/x.y SSL-MM/x.y OpenSSL/x.y.z"
Examples:
>>> agent = get_useragent()
>>> print(agent)
"Lynx/2.8.1 libwww-FM/2.14 SSL-MM/1.4 OpenSSL/1.2.7"
"""class SearchResult:
"""
    Data structure for advanced search results containing structured information
    about each search result, including URL, title, and description.
    """

    def __init__(self, url: str, title: str, description: str):
        """
        Initialize a SearchResult object.

        Parameters:
        - url: The result URL
        - title: The result title
        - description: The result description/snippet
        """
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self) -> str:
        """
        Return string representation of the SearchResult.

        Returns:
        str: String representation in the format:
        "SearchResult(url={url}, title={title}, description={description})"
        """

Use standard language codes like "en" (English), "fr" (French), "de" (German), "es" (Spanish), "ja" (Japanese), etc. for the lang parameter.
Use country codes like "us" (United States), "uk" (United Kingdom), "ca" (Canada), "au" (Australia), etc. for the region parameter.
"active": Enable safe search filtering (default)None: Disable safe search filteringSupports both HTTP and HTTPS proxies:
The proxy parameter supports both HTTP and HTTPS proxies:

# HTTP proxy
proxy = "http://proxy.example.com:8080"
# HTTPS proxy
proxy = "https://proxy.example.com:8080"
# Proxy with authentication
proxy = "http://user:pass@proxy.example.com:8080"The library may raise the following exceptions:
The library may raise exceptions from the underlying requests library, such as requests.exceptions.Timeout when a request times out and requests.exceptions.RequestException for other network errors.

Example error handling:
import requests
from googlesearch import search
try:
    results = list(search("example query", num_results=10, timeout=10))
except requests.exceptions.Timeout:
    print("Request timed out")
except requests.exceptions.RequestException as e:
    print(f"Network error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

To avoid being blocked by Google:
- Set sleep_interval to 1-5 seconds for large result sets

Example with rate limiting:
# Good practice for large result sets
for url in search("large query", num_results=100, sleep_interval=2):
    print(url)

The package requires requests and beautifulsoup4.