Google Cloud Natural Language API client library, providing sentiment analysis, entity recognition, text classification, and content moderation capabilities.
Categorizes text documents into predefined classification categories, enabling automated content organization and filtering based on subject matter and themes. The classification system can identify topics, genres, and content types to help with content management, routing, and analysis at scale.
Analyzes the provided text and assigns it to relevant predefined categories with confidence scores.
def classify_text(
    self,
    request: Optional[Union[ClassifyTextRequest, dict]] = None,
    *,
    document: Optional[Document] = None,
    retry: OptionalRetry = gapic_v1.method.DEFAULT,
    timeout: Union[float, object] = gapic_v1.method.DEFAULT,
    metadata: Sequence[Tuple[str, Union[str, bytes]]] = ()
) -> ClassifyTextResponse:
    """Classify a document into predefined content categories.

    Args:
        request: The full request object containing the document and
            options; mutually exclusive with the keyword fields below.
        document: Input document for classification (convenience
            alternative to building ``request`` by hand).
        retry: Retry configuration for the request.
        timeout: Request timeout in seconds.
        metadata: Additional metadata to send with the request.

    Returns:
        ClassifyTextResponse containing the matched classification
        categories, each with a confidence score.
    """


from google.cloud import language
# Initialize the client (credentials are taken from the environment /
# Application Default Credentials).
client = language.LanguageServiceClient()
# Create the input document; classification needs enough text to work with —
# very short inputs may return no categories.
document = language.Document(
    content="""
The latest advancements in artificial intelligence and machine learning
are revolutionizing how we approach data analysis and predictive modeling.
Neural networks and deep learning algorithms are becoming increasingly
sophisticated, enabling more accurate predictions and insights from
complex datasets.
""",
    type_=language.Document.Type.PLAIN_TEXT
)
# Classify text via the v1-style request dict.
response = client.classify_text(
    request={"document": document}
)
# Print each matched category with its confidence score.
print("Classification Results:")
for category in response.categories:
    print(f"Category: {category.name}")
    print(f"Confidence: {category.confidence:.3f}")
print()

# --- Message shapes used by classify_text (reference) ---

class ClassifyTextRequest:
    """Request message for LanguageService.ClassifyText."""
    document: Document
    classification_model_options: ClassificationModelOptions  # v1/v1beta2 only


class ClassifyTextResponse:
    """Response message holding the matched categories."""
    categories: MutableSequence[ClassificationCategory]


class ClassificationCategory:
    """Represents a classification category with confidence score."""
    name: str  # Category name (hierarchical path)
    confidence: float  # Confidence score [0.0, 1.0]


class ClassificationModelOptions:
    """Configuration options for the classification model."""

    class V1Model(proto.Message):
        pass

    class V2Model(proto.Message):
        pass

    v1_model: V1Model  # Use V1 classification model
    v2_model: V2Model  # Use V2 classification model

# Classification categories follow a hierarchical structure using forward slashes:
# Top-level categories include:
#   /Arts & Entertainment, /Autos & Vehicles, /Beauty & Fitness,
#   /Books & Literature, /Business & Industrial, /Computers & Electronics,
#   /Finance, /Food & Drink, /Games, /Health, /Hobbies & Leisure,
#   /Home & Garden, /Internet & Telecom, /Jobs & Education, /Law & Government,
#   /News, /Online Communities, /People & Society, /Pets & Animals,
#   /Real Estate, /Reference, /Science, /Shopping, /Sports, /Travel
# Deeper (more specific) category examples:
#   /Computers & Electronics/Software
#   /Computers & Electronics/Software/Business Software
#   /Arts & Entertainment/Movies
#   /Arts & Entertainment/Music & Audio
#   /Science/Computer Science
#   /Business & Industrial/Advertising & Marketing

def classify_and_rank_categories(client, text, min_confidence=0.1):
    """Classify ``text`` and return its categories ranked by confidence.

    Args:
        client: A LanguageServiceClient instance.
        text: Plain-text content to classify.
        min_confidence: Categories scoring below this threshold are dropped.

    Returns:
        list: Categories with ``confidence >= min_confidence``, sorted from
        most to least confident.
    """
    document = language.Document(
        content=text,
        type_=language.Document.Type.PLAIN_TEXT
    )
    response = client.classify_text(
        request={"document": document}
    )
    # Keep only sufficiently confident categories, best first.
    return sorted(
        (cat for cat in response.categories if cat.confidence >= min_confidence),
        key=lambda cat: cat.confidence,
        reverse=True,
    )
# Usage: rank every category scoring above a 10% confidence floor.
text = """
Machine learning algorithms are transforming healthcare by enabling
early disease detection through medical imaging analysis. Artificial
intelligence systems can now identify patterns in X-rays, MRIs, and
CT scans that might be missed by human radiologists.
"""
categories = classify_and_rank_categories(client, text, min_confidence=0.1)
print("All Categories (above 10% confidence):")
# Loop body continues on the next line of the file.
for cat in categories:
print(f"{cat.name}: {cat.confidence:.3f}")

def classify_multiple_documents(client, documents):
    """Classify each text in ``documents``, collecting per-document results.

    A failure on one document is recorded in its result entry instead of
    raised, so the rest of the batch still runs.

    Args:
        client: A LanguageServiceClient instance.
        documents: Iterable of plain-text strings.

    Returns:
        list[dict]: One entry per input with 'document_index',
        'text_preview', 'categories' and, on failure, an 'error' message.
    """
    results = []
    for i, doc_text in enumerate(documents):
        # Shared preview for both the success and failure entries.
        preview = doc_text[:100] + "..." if len(doc_text) > 100 else doc_text
        try:
            # Document construction inside the try so a bad input is
            # reported for this entry instead of aborting the whole batch.
            document = language.Document(
                content=doc_text,
                type_=language.Document.Type.PLAIN_TEXT
            )
            response = client.classify_text(
                request={"document": document}
            )
            results.append({
                'document_index': i,
                'text_preview': preview,
                'categories': [
                    {'name': category.name, 'confidence': category.confidence}
                    for category in response.categories
                ]
            })
        except Exception as e:  # best-effort batch: record and continue
            results.append({
                'document_index': i,
                'text_preview': preview,
                'error': str(e),
                'categories': []
            })
    return results
# Usage: classify a small batch and print per-document results.
documents = [
    "Stock market analysis and investment strategies for portfolio management.",
    "Latest updates in artificial intelligence and machine learning research.",
    "Healthy cooking recipes for vegetarian and vegan diets.",
    "Professional basketball game highlights and player statistics."
]
batch_results = classify_multiple_documents(client, documents)
for result in batch_results:
    print(f"Document {result['document_index']}: {result['text_preview']}")
    if 'error' in result:
        # Per-document failure captured by classify_multiple_documents.
        print(f"  Error: {result['error']}")
    else:
        for cat in result['categories']:
            print(f"  {cat['name']}: {cat['confidence']:.3f}")
print()

def group_by_top_level_category(categories):
    """Group categories by their top-level parent.

    A category name is a slash-delimited path such as
    '/Science/Computer Science'; its top level is '/Science'.

    Args:
        categories: Iterable of objects with a ``name`` attribute.

    Returns:
        dict: top-level path -> list of categories under it.
    """
    grouped = {}
    for category in categories:
        # Names normally start with '/', so split()[0] is ''; fall back to
        # the raw name when there is no slash at all.
        parts = category.name.split('/')
        top_level = '/' + parts[1] if len(parts) > 1 else category.name
        grouped.setdefault(top_level, []).append(category)
    return grouped
def get_most_specific_categories(categories, max_categories=3):
    """Return the deepest, highest-confidence categories.

    Candidates are ranked first by path depth (number of '/' separators in
    the name), then by confidence, both descending; the top
    ``max_categories`` entries are returned.
    """
    def rank(category):
        # Under reverse=True, deeper paths win; confidence breaks ties.
        return (category.name.count('/'), category.confidence)

    ranked = sorted(categories, key=rank, reverse=True)
    return ranked[:max_categories]
# Usage (reuses the module-level `client` and `document` from the first example)
response = client.classify_text(request={"document": document})
# Group results by top-level category.
grouped_categories = group_by_top_level_category(response.categories)
print("Categories grouped by top-level:")
for top_level, cats in grouped_categories.items():
    print(f"{top_level}:")
    for cat in cats:
        print(f"  {cat.name}: {cat.confidence:.3f}")
    print()
# Get the most specific (deepest) categories.
specific_categories = get_most_specific_categories(response.categories)
print("Most specific categories:")
for cat in specific_categories:
    # Depth = number of '/' separators in the category path.
    depth = cat.name.count('/')
print(f"{cat.name} (depth: {depth}): {cat.confidence:.3f}")

class ContentOrganizer:
    """Route texts into coarse content buckets using classify_text results."""

    def __init__(self, client):
        """
        Args:
            client: A LanguageServiceClient instance.
        """
        self.client = client
        # Bucket name -> category-path prefixes that map into it.
        self.category_mapping = {
            'technology': ['/Computers & Electronics', '/Science'],
            'business': ['/Business & Industrial', '/Finance'],
            'entertainment': ['/Arts & Entertainment', '/Games'],
            'health': ['/Health', '/Beauty & Fitness'],
            'lifestyle': ['/Home & Garden', '/Food & Drink', '/Hobbies & Leisure'],
            'news': ['/News', '/Law & Government'],
            'education': ['/Jobs & Education', '/Reference', '/Books & Literature'],
            'travel': ['/Travel'],
            'sports': ['/Sports'],
            'other': []  # Catch-all for unmatched categories
        }

    def organize_content(self, text):
        """Classify ``text`` and pick the bucket of its strongest match.

        Returns:
            tuple: (bucket_name, matched_categories); matched_categories
            lists every {'bucket', 'category', 'confidence'} match found.
        """
        document = language.Document(
            content=text,
            type_=language.Document.Type.PLAIN_TEXT
        )
        response = self.client.classify_text(
            request={"document": document}
        )
        if not response.categories:
            return 'other', []
        # The bucket of the single highest-confidence matched category wins.
        best_bucket = 'other'
        best_confidence = 0
        matched_categories = []
        for category in response.categories:
            for bucket, prefixes in self.category_mapping.items():
                for prefix in prefixes:
                    if category.name.startswith(prefix):
                        if category.confidence > best_confidence:
                            best_bucket = bucket
                            best_confidence = category.confidence
                        matched_categories.append({
                            'bucket': bucket,
                            'category': category.name,
                            'confidence': category.confidence
                        })
                        break  # first matching prefix of this bucket is enough
        return best_bucket, matched_categories

    def get_bucket_statistics(self, texts):
        """Classify ``texts`` and tally how many land in each bucket.

        Returns:
            tuple: (bucket_counts, bucket_examples) — document count per
            bucket, and up to three example texts (with matches) per bucket.
        """
        bucket_counts = {bucket: 0 for bucket in self.category_mapping}
        bucket_examples = {bucket: [] for bucket in self.category_mapping}
        for text in texts:
            bucket, categories = self.organize_content(text)
            bucket_counts[bucket] += 1
            if len(bucket_examples[bucket]) < 3:  # Store up to 3 examples
                bucket_examples[bucket].append({
                    'text': text[:50] + "..." if len(text) > 50 else text,
                    'categories': categories
                })
        return bucket_counts, bucket_examples
# Usage: bucket a handful of sample texts and show the distribution.
organizer = ContentOrganizer(client)
sample_texts = [
    "Latest developments in quantum computing and artificial intelligence.",
    "Investment strategies for stock market volatility and portfolio management.",
    "Delicious pasta recipes with organic ingredients and wine pairings.",
    "Professional soccer match analysis and player performance statistics.",
    "Breaking news about government policy changes and legal implications."
]
bucket_counts, bucket_examples = organizer.get_bucket_statistics(sample_texts)
print("Content Distribution:")
for bucket, count in bucket_counts.items():
    if count > 0:  # only report non-empty buckets
        print(f"{bucket}: {count} documents")
        for example in bucket_examples[bucket]:
print(f"  - {example['text']}")

def classify_with_specific_model(client, text, model_version='v2'):
    """Classify ``text`` with an explicitly selected classification model.

    Only the v1/v1beta2 API surfaces support choosing the model version.

    Args:
        client: A language_v1 LanguageServiceClient.
        text: Plain-text content to classify.
        model_version: 'v1' selects the legacy model; any other value
            selects the v2 model.

    Returns:
        The matched classification categories.
    """
    document = language_v1.Document(
        content=text,
        type_=language_v1.Document.Type.PLAIN_TEXT
    )
    # Configure which classification model the service should use.
    if model_version == 'v1':
        model_options = language_v1.ClassificationModelOptions(
            v1_model=language_v1.ClassificationModelOptions.V1Model()
        )
    else:  # default: v2
        model_options = language_v1.ClassificationModelOptions(
            v2_model=language_v1.ClassificationModelOptions.V2Model()
        )
    response = client.classify_text(
        request={
            "document": document,
            "classification_model_options": model_options
        }
    )
    return response.categories
# Usage (only with v1/v1beta2 clients)
# v1_categories = classify_with_specific_model(client, text, 'v1')
# v2_categories = classify_with_specific_model(client, text, 'v2')

def analyze_classification_confidence(client, texts, thresholds=(0.1, 0.3, 0.5, 0.7)):
    """Measure how classification coverage changes with the confidence cutoff.

    Args:
        client: A LanguageServiceClient instance.
        texts: Iterable of plain-text strings to classify.
        thresholds: Confidence cutoffs to evaluate (tuple default avoids the
            shared-mutable-default pitfall; lists still work).

    Returns:
        dict: threshold -> {'classified_count', 'unclassified_count',
        'avg_categories_per_doc', 'total_categories'}.
    """
    results = {
        threshold: {
            'classified_count': 0,
            'unclassified_count': 0,
            'avg_categories_per_doc': 0,
            'total_categories': 0
        }
        for threshold in thresholds
    }
    for text in texts:
        try:
            # Document built inside the try so a bad input counts as
            # unclassified instead of aborting the whole analysis.
            document = language.Document(
                content=text,
                type_=language.Document.Type.PLAIN_TEXT
            )
            response = client.classify_text(
                request={"document": document}
            )
        except Exception:  # best-effort: a failed call counts as unclassified
            for threshold in thresholds:
                results[threshold]['unclassified_count'] += 1
            continue
        for threshold in thresholds:
            passing = [
                cat for cat in response.categories
                if cat.confidence >= threshold
            ]
            if passing:
                results[threshold]['classified_count'] += 1
                results[threshold]['total_categories'] += len(passing)
            else:
                results[threshold]['unclassified_count'] += 1
    # Derive the per-document averages for thresholds that classified anything.
    for threshold in thresholds:
        classified = results[threshold]['classified_count']
        if classified > 0:
            results[threshold]['avg_categories_per_doc'] = (
                results[threshold]['total_categories'] / classified
            )
    return results
# Usage: compare classification coverage across confidence thresholds.
texts = [
    "Advanced machine learning techniques for predictive analytics.",
    "Gourmet cooking with seasonal vegetables and herbs.",
    "Financial planning strategies for retirement savings.",
    "Professional basketball playoffs and championship predictions."
]
confidence_analysis = analyze_classification_confidence(client, texts)
print("Classification Analysis by Confidence Threshold:")
for threshold, stats in confidence_analysis.items():
    print(f"Threshold {threshold}:")
    print(f"  Classified: {stats['classified_count']}")
    print(f"  Unclassified: {stats['unclassified_count']}")
    print(f"  Avg categories per doc: {stats['avg_categories_per_doc']:.2f}")
print()

from google.api_core import exceptions

# Robust invocation: catch the specific API errors classify_text can raise.
response = None
try:
    response = client.classify_text(
        request={"document": document},
        timeout=15.0
    )
except exceptions.InvalidArgument as e:
    print(f"Invalid document: {e}")
    # Common causes: empty document, unsupported language, insufficient content
except exceptions.ResourceExhausted:
    print("API quota exceeded")
except exceptions.DeadlineExceeded:
    print("Request timed out")
except exceptions.GoogleAPIError as e:
    print(f"API error: {e}")
# Handle no classification results. The `response is not None` guard avoids a
# NameError when the call above failed and `response` was never assigned.
if response is not None and not response.categories:
    print("No classification categories found - document may be too short or ambiguous")
# Install with Tessl CLI:
npx tessl i tessl/pypi-google-cloud-language