Python Client for Video Intelligence API that enables developers to make videos searchable and discoverable by extracting metadata through machine learning.
—
Comprehensive configuration options for different AI detection capabilities. Each feature can be fine-tuned with specific parameters and thresholds to optimize results for different use cases.
Core features available for video analysis, each providing different types of AI-powered insights.
class Feature(Enum):
    """
    Video annotation feature.

    Values:
        FEATURE_UNSPECIFIED: Unspecified feature
        LABEL_DETECTION: Label detection - detect objects, such as dog or flower
        SHOT_CHANGE_DETECTION: Shot change detection
        EXPLICIT_CONTENT_DETECTION: Explicit content detection
        FACE_DETECTION: Human face detection
        SPEECH_TRANSCRIPTION: Speech transcription
        TEXT_DETECTION: OCR text detection and tracking
        OBJECT_TRACKING: Object detection and tracking
        LOGO_RECOGNITION: Logo detection, tracking, and recognition
        PERSON_DETECTION: Person detection
    """

    FEATURE_UNSPECIFIED = 0
    LABEL_DETECTION = 1
    SHOT_CHANGE_DETECTION = 2
    EXPLICIT_CONTENT_DETECTION = 3
    FACE_DETECTION = 4
    # NOTE: values 5, 8, 10, 11 and 13 are intentionally unassigned here;
    # the numbering mirrors the proto enum, which skips those tags.
    SPEECH_TRANSCRIPTION = 6
    TEXT_DETECTION = 7
    OBJECT_TRACKING = 9
    LOGO_RECOGNITION = 12
    PERSON_DETECTION = 14

# Main configuration object that allows fine-tuning of different analysis features.
class VideoContext:
    """
    Video context and/or feature-specific parameters.

    Attributes:
        segments: Video segments to annotate. If unspecified, each video
            is treated as a single segment.
        label_detection_config: Config for LABEL_DETECTION.
        shot_change_detection_config: Config for SHOT_CHANGE_DETECTION.
        explicit_content_detection_config: Config for EXPLICIT_CONTENT_DETECTION.
        face_detection_config: Config for FACE_DETECTION.
        speech_transcription_config: Config for SPEECH_TRANSCRIPTION.
        text_detection_config: Config for TEXT_DETECTION.
        object_tracking_config: Config for OBJECT_TRACKING.
        person_detection_config: Config for PERSON_DETECTION.
    """

    segments: MutableSequence[VideoSegment]
    label_detection_config: LabelDetectionConfig
    shot_change_detection_config: ShotChangeDetectionConfig
    explicit_content_detection_config: ExplicitContentDetectionConfig
    face_detection_config: FaceDetectionConfig
    speech_transcription_config: SpeechTranscriptionConfig
    text_detection_config: TextDetectionConfig
    object_tracking_config: ObjectTrackingConfig
    person_detection_config: PersonDetectionConfig

# Configure how labels (objects, activities, concepts) are detected in videos.
class LabelDetectionConfig:
    """
    Configuration options for the LABEL_DETECTION feature.

    Attributes:
        label_detection_mode: Which labels should be detected with
            LABEL_DETECTION, in addition to video-level or segment-level
            labels.
        stationary_camera: Whether the video has been shot from a
            stationary (non-moving) camera.
        model: Model to use for label detection. Supported values:
            "builtin/stable", "builtin/latest".
        frame_confidence_threshold: Confidence threshold for frame-level
            label detection (0.0-1.0).
        video_confidence_threshold: Confidence threshold for video-level
            label detection (0.0-1.0).
    """

    label_detection_mode: LabelDetectionMode
    stationary_camera: bool
    model: str
    frame_confidence_threshold: float
    video_confidence_threshold: float
class LabelDetectionMode(Enum):
    """
    Label detection mode.

    Values:
        LABEL_DETECTION_MODE_UNSPECIFIED: Unspecified
        SHOT_MODE: Detect shot-level labels
        FRAME_MODE: Detect frame-level labels
        SHOT_AND_FRAME_MODE: Detect both shot-level and frame-level labels
    """

    LABEL_DETECTION_MODE_UNSPECIFIED = 0
    SHOT_MODE = 1
    FRAME_MODE = 2
    SHOT_AND_FRAME_MODE = 3

# Configure detection and tracking of human faces in videos.
class FaceDetectionConfig:
    """
    Config for FACE_DETECTION.

    Attributes:
        model: Model to use for face detection. Supported values:
            "builtin/stable", "builtin/latest".
        include_bounding_boxes: Whether bounding boxes are included in the
            face annotation output.
        include_attributes: Whether to enable face attributes detection,
            such as glasses, dark_glasses, mouth_open etc.
    """

    model: str
    include_bounding_boxes: bool
    include_attributes: bool

# Configure detection and tracking of objects throughout the video.
class ObjectTrackingConfig:
    """
    Config for OBJECT_TRACKING.

    Attributes:
        model: Model to use for object tracking. Supported values:
            "builtin/stable", "builtin/latest".
    """

    model: str

# Configure detection of explicit or inappropriate content.
class ExplicitContentDetectionConfig:
    """
    Config for EXPLICIT_CONTENT_DETECTION.

    Attributes:
        model: Model to use for explicit content detection. Supported
            values: "builtin/stable", "builtin/latest".
    """

    model: str

# Configure speech-to-text transcription with language and context options.
class SpeechTranscriptionConfig:
    """
    Configuration options for the SPEECH_TRANSCRIPTION feature.

    Attributes:
        language_code: Required. BCP-47 language tag of the language spoken
            in the audio (e.g., "en-US").
        max_alternatives: Maximum number of recognition hypotheses to be
            returned.
        filter_profanity: If set to true, the server will attempt to filter
            out profanities.
        speech_contexts: A means to provide context to assist the speech
            recognition.
        enable_automatic_punctuation: If set to true, adds punctuation to
            recognition result hypotheses.
        audio_tracks: For file formats that contain multiple audio tracks,
            this field controls which track should be transcribed.
        enable_speaker_diarization: If true, enable speaker detection for
            each recognized word.
        diarization_speaker_count: If speaker_diarization is enabled, set
            this field to specify the number of speakers.
        enable_word_confidence: If true, the top result includes a list of
            words and the confidence for those words.
    """

    language_code: str
    max_alternatives: int
    filter_profanity: bool
    speech_contexts: MutableSequence[SpeechContext]
    enable_automatic_punctuation: bool
    audio_tracks: MutableSequence[int]
    enable_speaker_diarization: bool
    diarization_speaker_count: int
    enable_word_confidence: bool
class SpeechContext:
    """
    Provides "hints" to the speech recognizer to favor specific words and
    phrases in the results.

    Attributes:
        phrases: A list of strings containing words and phrases "hints" so
            that the speech recognition is more likely to recognize them.
    """

    phrases: MutableSequence[str]

# Configure optical character recognition (OCR) for detecting text in videos.
class TextDetectionConfig:
    """
    Config for TEXT_DETECTION.

    Attributes:
        language_hints: Language hint can be specified if the language to
            be detected is known a priori.
        model: Model to use for text detection. Supported values:
            "builtin/stable", "builtin/latest".
    """

    language_hints: MutableSequence[str]
    model: str

# Configure detection and tracking of people in videos.
class PersonDetectionConfig:
    """
    Config for PERSON_DETECTION.

    Attributes:
        include_bounding_boxes: Whether bounding boxes are included in the
            person detection annotation output.
        include_pose_landmarks: Whether to enable pose landmarks detection.
        include_attributes: Whether to enable person attributes detection,
            such as cloth color.
    """

    include_bounding_boxes: bool
    include_pose_landmarks: bool
    include_attributes: bool

# Configure detection of shot boundaries and scene changes.
class ShotChangeDetectionConfig:
    """
    Config for SHOT_CHANGE_DETECTION.

    Attributes:
        model: Model to use for shot change detection. Supported values:
            "builtin/stable", "builtin/latest".
    """

    model: str


class Likelihood(Enum):
    """
    Bucketized representation of likelihood.

    Values:
        LIKELIHOOD_UNSPECIFIED: Unspecified likelihood
        VERY_UNLIKELY: Very unlikely
        UNLIKELY: Unlikely
        POSSIBLE: Possible
        LIKELY: Likely
        VERY_LIKELY: Very likely
    """

    LIKELIHOOD_UNSPECIFIED = 0
    VERY_UNLIKELY = 1
    UNLIKELY = 2
    POSSIBLE = 3
    LIKELY = 4
    VERY_LIKELY = 5
class VideoSegment:
    """
    Video segment.

    Attributes:
        start_time_offset: Time-offset, relative to the beginning of the
            video, corresponding to the start of the segment.
        end_time_offset: Time-offset, relative to the beginning of the
            video, corresponding to the end of the segment.
    """

    start_time_offset: duration_pb2.Duration
    end_time_offset: duration_pb2.Duration

from google.cloud import videointelligence
# Create client
client = videointelligence.VideoIntelligenceServiceClient()

# Configure multiple features with custom settings.
# Only the 10s-50s portion of the video is analyzed via `segments`.
video_context = videointelligence.VideoContext(
    segments=[
        videointelligence.VideoSegment(
            start_time_offset={"seconds": 10},
            end_time_offset={"seconds": 50},
        )
    ],
    label_detection_config=videointelligence.LabelDetectionConfig(
        label_detection_mode=videointelligence.LabelDetectionMode.SHOT_AND_FRAME_MODE,
        stationary_camera=True,
        model="builtin/latest",
        frame_confidence_threshold=0.7,
        video_confidence_threshold=0.8,
    ),
    face_detection_config=videointelligence.FaceDetectionConfig(
        model="builtin/latest",
        include_bounding_boxes=True,
        include_attributes=True,
    ),
    speech_transcription_config=videointelligence.SpeechTranscriptionConfig(
        language_code="en-US",
        enable_automatic_punctuation=True,
        enable_speaker_diarization=True,
        diarization_speaker_count=2,
        enable_word_confidence=True,
    ),
)

# Annotate video with custom configuration.
operation = client.annotate_video(
    request={
        "features": [
            videointelligence.Feature.LABEL_DETECTION,
            videointelligence.Feature.FACE_DETECTION,
            videointelligence.Feature.SPEECH_TRANSCRIPTION,
        ],
        "input_uri": "gs://your-bucket/your-video.mp4",
        "video_context": video_context,
    }
)

# Block until the long-running operation completes (up to 10 minutes).
result = operation.result(timeout=600)

from google.cloud import videointelligence
client = videointelligence.VideoIntelligenceServiceClient()

# Configure text detection (OCR) for multiple languages.
text_config = videointelligence.TextDetectionConfig(
    language_hints=["en", "fr", "es"],  # English, French, Spanish
    model="builtin/latest",
)

video_context = videointelligence.VideoContext(
    text_detection_config=text_config
)

operation = client.annotate_video(
    request={
        "features": [videointelligence.Feature.TEXT_DETECTION],
        "input_uri": "gs://your-bucket/multilingual-video.mp4",
        "video_context": video_context,
    }
)

# Block until the long-running operation completes (up to 5 minutes).
result = operation.result(timeout=300)

from google.cloud import videointelligence
client = videointelligence.VideoIntelligenceServiceClient()

# Configure person detection with all optional outputs enabled.
person_config = videointelligence.PersonDetectionConfig(
    include_bounding_boxes=True,
    include_pose_landmarks=True,
    include_attributes=True,
)

video_context = videointelligence.VideoContext(
    person_detection_config=person_config
)

operation = client.annotate_video(
    request={
        "features": [videointelligence.Feature.PERSON_DETECTION],
        "input_uri": "gs://your-bucket/sports-video.mp4",
        "video_context": video_context,
    }
)

# Block until the long-running operation completes (up to ~7 minutes).
result = operation.result(timeout=400)

from google.cloud import videointelligence
client = videointelligence.VideoIntelligenceServiceClient()

# Configure explicit content detection.
explicit_config = videointelligence.ExplicitContentDetectionConfig(
    model="builtin/latest"
)

video_context = videointelligence.VideoContext(
    explicit_content_detection_config=explicit_config
)

operation = client.annotate_video(
    request={
        "features": [videointelligence.Feature.EXPLICIT_CONTENT_DETECTION],
        "input_uri": "gs://your-bucket/content-to-moderate.mp4",
        "video_context": video_context,
    }
)

# Block until the long-running operation completes (up to 5 minutes).
result = operation.result(timeout=300)

# Check explicit content results: each analyzed frame carries a bucketized
# pornography likelihood (see the Likelihood enum) and a time offset.
for annotation_result in result.annotation_results:
    explicit_annotation = annotation_result.explicit_annotation
    for frame in explicit_annotation.frames:
        likelihood = frame.pornography_likelihood
        time_offset = frame.time_offset.total_seconds()
        print(f"Frame at {time_offset}s: {likelihood.name}")

# Install with Tessl CLI
npx tessl i tessl/pypi-google-cloud-videointelligence