# Computer Vision

Comprehensive vision AI capabilities covering image generation, image analysis, and multimodal understanding, delivered through specialized models: Imagen for generation and dedicated vision models for understanding tasks.

## Capabilities

### Image Generation

Generate high-quality images from text prompts using Imagen models, with support for editing and upscaling existing images.

```python { .api }
class ImageGenerationModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'ImageGenerationModel': ...

    def generate_images(
        self,
        prompt: str,
        negative_prompt: Optional[str] = None,
        number_of_images: int = 1,
        aspect_ratio: Optional[str] = None,
        safety_filter_level: Optional[str] = None,
        person_generation: Optional[str] = None,
        **kwargs
    ) -> ImageGenerationResponse: ...

    def edit_image(
        self,
        prompt: str,
        base_image: Optional[Image] = None,
        mask: Optional[Image] = None,
        edit_mode: Optional[str] = None,
        **kwargs
    ) -> ImageGenerationResponse: ...

    def upscale_image(
        self,
        image: Image,
        new_size: int = 2048,
        upscale_factor: Optional[int] = None,
        **kwargs
    ) -> ImageGenerationResponse: ...
```
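
The editing and upscaling entry points return the same `ImageGenerationResponse` container declared above. A minimal sketch, assuming the model version supports masked editing; the file paths and the `"inpainting-insert"` mode string are illustrative, not confirmed values:

```python
from vertexai.vision_models import ImageGenerationModel, Image

model = ImageGenerationModel.from_pretrained("imagegeneration@006")

# Masked edit: replace the region selected by the mask.
# "inpainting-insert" is an assumed mode value; check the model
# version's documentation for its supported edit modes.
edited = model.edit_image(
    prompt="Replace the sky with a dramatic thunderstorm",
    base_image=Image.load_from_file("base.png"),
    mask=Image.load_from_file("mask.png"),
    edit_mode="inpainting-insert",
)
edited[0].save("edited.png")

# Upscale the first edited result to a larger resolution.
upscaled = model.upscale_image(image=edited[0], new_size=2048)
```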

### Image Understanding

Analyze and understand image content with captioning and visual question answering, including a combined model that supports both.

```python { .api }
class ImageCaptioningModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'ImageCaptioningModel': ...

    def get_captions(
        self,
        image: Image,
        number_of_results: int = 1,
        language: str = "en",
        output_gcs_uri: Optional[str] = None,
        **kwargs
    ) -> List[str]: ...

class ImageQnAModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'ImageQnAModel': ...

    def ask_question(
        self,
        image: Image,
        question: str,
        number_of_results: int = 1,
        **kwargs
    ) -> List[str]: ...

class ImageTextModel(ImageCaptioningModel, ImageQnAModel):
    """Combined image understanding model with both captioning and Q&A capabilities."""
    pass
```
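
Because `ImageTextModel` inherits both capabilities, either the combined model or the dedicated classes can be used. A brief sketch of standalone captioning with the `language` parameter; the `"fr"` code assumes French is among the captioning model's supported languages:

```python
from vertexai.vision_models import ImageCaptioningModel, Image

model = ImageCaptioningModel.from_pretrained("imagetext@001")
image = Image.load_from_file("photo.jpg")

# Three caption candidates in French ("fr" is an assumed-supported code).
captions = model.get_captions(image, number_of_results=3, language="fr")
for caption in captions:
    print(caption)
```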

### Multimodal Embeddings

Generate vector embeddings from images, videos, and text in a shared space for similarity search and other multimodal applications; usage examples appear below, and a video-segment sketch follows the response types.

```python { .api }
class MultiModalEmbeddingModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'MultiModalEmbeddingModel': ...

    def get_embeddings(
        self,
        image: Optional[Image] = None,
        video: Optional[Video] = None,
        contextual_text: Optional[str] = None,
        dimension: Optional[int] = None,
        video_segment_config: Optional[VideoSegmentConfig] = None,
        **kwargs
    ) -> MultiModalEmbeddingResponse: ...
```

### Media Data Types

Media wrappers for images and videos, loadable from raw bytes, local files, or Cloud Storage URIs, with saving and display helpers.

```python { .api }
class Image:
    def __init__(self, image_bytes: Optional[bytes] = None, gcs_uri: Optional[str] = None): ...

    @staticmethod
    def load_from_file(location: str) -> 'Image': ...

    def show(self, figsize: Tuple[int, int] = (10, 10)) -> None: ...
    def save(self, location: str) -> None: ...

    @property
    def _size(self) -> Optional[Tuple[int, int]]: ...
    @property
    def _mime_type(self) -> Optional[str]: ...

class Video:
    def __init__(self, video_bytes: Optional[bytes] = None, gcs_uri: Optional[str] = None): ...

    @staticmethod
    def load_from_file(location: str) -> 'Video': ...

    def save(self, location: str) -> None: ...

class GeneratedImage(Image):
    """Image generated by ImageGenerationModel with metadata."""
    @property
    def generation_parameters(self) -> Optional[Dict[str, Any]]: ...

    def save(self, location: str, include_generation_parameters: bool = True) -> None: ...
```
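
Both media classes accept either raw bytes or a Cloud Storage URI (only one of the two should be supplied). A short sketch with placeholder paths and bucket names:

```python
from vertexai.vision_models import Image, Video

# Construct an Image from raw bytes read off disk...
with open("photo.jpg", "rb") as f:
    img_from_bytes = Image(image_bytes=f.read())

# ...or by reference to a Cloud Storage object (placeholder URI).
img_from_gcs = Image(gcs_uri="gs://my-bucket/photo.jpg")

# load_from_file is the convenience path for local files.
video = Video.load_from_file("clip.mp4")
```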

### Response Types

Structured responses containing generated images and analysis results with comprehensive metadata.

```python { .api }
class ImageGenerationResponse:
    """Container for generated images."""
    def __init__(self, images: List[GeneratedImage]): ...

    @property
    def images(self) -> List[GeneratedImage]: ...

    def __iter__(self) -> Iterator[GeneratedImage]: ...
    def __getitem__(self, index: int) -> GeneratedImage: ...

class MultiModalEmbeddingResponse:
    """Container for multimodal embedding results."""
    @property
    def image_embedding(self) -> Optional[List[float]]: ...
    @property
    def video_embeddings(self) -> Optional[List[VideoEmbedding]]: ...
    @property
    def text_embedding(self) -> Optional[List[float]]: ...

class VideoEmbedding:
    """Embedding for a video segment."""
    @property
    def start_offset_sec(self) -> float: ...
    @property
    def end_offset_sec(self) -> float: ...
    @property
    def embedding(self) -> List[float]: ...

class VideoSegmentConfig:
    """Configuration for video segment processing."""
    def __init__(
        self,
        start_offset_sec: int = 0,
        end_offset_sec: int = 120,
        interval_sec: int = 16
    ): ...
```
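
Putting `VideoSegmentConfig` together with `get_embeddings`: a hedged sketch that embeds the first two minutes of a clip in 16-second segments, assuming `VideoSegmentConfig` is importable alongside the model classes and using a placeholder file name:

```python
from vertexai.vision_models import (
    MultiModalEmbeddingModel,
    Video,
    VideoSegmentConfig,
)

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")
video = Video.load_from_file("clip.mp4")

response = model.get_embeddings(
    video=video,
    video_segment_config=VideoSegmentConfig(
        start_offset_sec=0,
        end_offset_sec=120,
        interval_sec=16,
    ),
)

# One embedding per processed segment, tagged with its time bounds.
for segment in response.video_embeddings:
    print(segment.start_offset_sec, segment.end_offset_sec, len(segment.embedding))
```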

## Usage Examples

**Generate images:**
```python
from vertexai.vision_models import ImageGenerationModel

model = ImageGenerationModel.from_pretrained("imagegeneration@006")
response = model.generate_images(
    prompt="A serene mountain landscape at sunset",
    number_of_images=2,
    aspect_ratio="16:9"
)

for i, image in enumerate(response.images):
    image.save(f"generated_image_{i}.png")
```

**Image understanding:**
```python
from vertexai.vision_models import ImageTextModel, Image

model = ImageTextModel.from_pretrained("imagetext@001")
image = Image.load_from_file("photo.jpg")

# Get captions
captions = model.get_captions(image, number_of_results=3)
print("Captions:", captions)

# Ask questions (returns a list of candidate answers)
answers = model.ask_question(image, "What objects are in this image?")
print("Answers:", answers)
```

**Multimodal embeddings:**
```python
from vertexai.vision_models import MultiModalEmbeddingModel, Image

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")
image = Image.load_from_file("image.jpg")

response = model.get_embeddings(
    image=image,
    contextual_text="A beautiful landscape",
    dimension=512
)

print(f"Image embedding dimension: {len(response.image_embedding)}")
```
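
The image and text vectors above share one embedding space, so the similarity search mentioned earlier reduces to a vector comparison. A minimal cosine-similarity sketch using numpy, which is not part of the vision API:

```python
import numpy as np

def cosine_similarity(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Compare the image to its contextual text from the response above.
score = cosine_similarity(response.image_embedding, response.text_embedding)
print(f"Image-text similarity: {score:.3f}")
```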