<!-- docs/vision.md -->

# Computer Vision

Comprehensive vision AI capabilities including image generation, analysis, and multimodal understanding through specialized models like Imagen for generation and vision models for understanding tasks.

## Capabilities

### Image Generation

Generate high-quality images from text prompts using Imagen models with advanced editing capabilities.

```python { .api }
class ImageGenerationModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'ImageGenerationModel': ...

    def generate_images(
        self,
        prompt: str,
        negative_prompt: Optional[str] = None,
        number_of_images: int = 1,
        aspect_ratio: Optional[str] = None,
        safety_filter_level: Optional[str] = None,
        person_generation: Optional[str] = None,
        **kwargs
    ) -> ImageGenerationResponse: ...

    def edit_image(
        self,
        prompt: str,
        base_image: Optional[Image] = None,
        mask: Optional[Image] = None,
        edit_mode: Optional[str] = None,
        **kwargs
    ) -> ImageGenerationResponse: ...

    def upscale_image(
        self,
        image: Image,
        new_size: int = 2048,
        upscale_factor: Optional[int] = None,
        **kwargs
    ) -> ImageGenerationResponse: ...
```

### Image Understanding

Analyze and understand image content with captioning, question answering, and classification capabilities.

```python { .api }
class ImageCaptioningModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'ImageCaptioningModel': ...

    def get_captions(
        self,
        image: Image,
        number_of_results: int = 1,
        language: str = "en",
        output_gcs_uri: Optional[str] = None,
        **kwargs
    ) -> List[str]: ...

class ImageQnAModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'ImageQnAModel': ...

    def ask_question(
        self,
        image: Image,
        question: str,
        number_of_results: int = 1,
        **kwargs
    ) -> List[str]: ...

class ImageTextModel(ImageCaptioningModel, ImageQnAModel):
    """Combined image understanding model with both captioning and Q&A capabilities."""
    pass
```

### Multimodal Embeddings

Generate vector embeddings from images, videos, and text for similarity search and multimodal applications.

```python { .api }
class MultiModalEmbeddingModel:
    @classmethod
    def from_pretrained(cls, model_name: str) -> 'MultiModalEmbeddingModel': ...

    def get_embeddings(
        self,
        image: Optional[Image] = None,
        video: Optional[Video] = None,
        contextual_text: Optional[str] = None,
        dimension: Optional[int] = None,
        video_segment_config: Optional[VideoSegmentConfig] = None,
        **kwargs
    ) -> MultiModalEmbeddingResponse: ...
```

### Media Data Types

Comprehensive media handling for images and videos with flexible loading and processing options.

```python { .api }
class Image:
    def __init__(self, image_bytes: bytes = None, gcs_uri: str = None): ...

    @staticmethod
    def load_from_file(location: str) -> 'Image': ...

    def show(self, figsize: Tuple[int, int] = (10, 10)) -> None: ...
    def save(self, location: str) -> None: ...

    @property
    def _size(self) -> Optional[Tuple[int, int]]: ...
    @property
    def _mime_type(self) -> Optional[str]: ...

class Video:
    def __init__(self, video_bytes: bytes = None, gcs_uri: str = None): ...

    @staticmethod
    def load_from_file(location: str) -> 'Video': ...

    def save(self, location: str) -> None: ...

class GeneratedImage(Image):
    """Image generated by ImageGenerationModel with metadata."""
    @property
    def generation_parameters(self) -> Optional[Dict[str, Any]]: ...

    def save(self, location: str, include_generation_parameters: bool = True) -> None: ...
```

### Response Types

Structured responses containing generated images and analysis results with comprehensive metadata.

```python { .api }
class ImageGenerationResponse:
    """Container for generated images."""
    def __init__(self, images: List[GeneratedImage]): ...

    @property
    def images(self) -> List[GeneratedImage]: ...

    def __iter__(self) -> Iterator[GeneratedImage]: ...
    def __getitem__(self, index: int) -> GeneratedImage: ...

class MultiModalEmbeddingResponse:
    """Container for multimodal embedding results."""
    @property
    def image_embedding(self) -> Optional[List[float]]: ...
    @property
    def video_embeddings(self) -> Optional[List[VideoEmbedding]]: ...
    @property
    def text_embedding(self) -> Optional[List[float]]: ...

class VideoEmbedding:
    """Embedding for a video segment."""
    @property
    def start_offset_sec(self) -> float: ...
    @property
    def end_offset_sec(self) -> float: ...
    @property
    def embedding(self) -> List[float]: ...

class VideoSegmentConfig:
    """Configuration for video segment processing."""
    def __init__(
        self,
        start_offset_sec: int = 0,
        end_offset_sec: int = 120,
        interval_sec: int = 16
    ): ...
```

## Usage Examples

**Generate images:**
```python
from vertexai.vision_models import ImageGenerationModel

model = ImageGenerationModel.from_pretrained("imagegeneration@006")
response = model.generate_images(
    prompt="A serene mountain landscape at sunset",
    number_of_images=2,
    aspect_ratio="16:9"
)

for i, image in enumerate(response.images):
    image.save(f"generated_image_{i}.png")
```

**Image understanding:**
```python
from vertexai.vision_models import ImageTextModel, Image

model = ImageTextModel.from_pretrained("imagetext@001")
image = Image.load_from_file("photo.jpg")

# Get captions
captions = model.get_captions(image, number_of_results=3)
print("Captions:", captions)

# Ask questions
answer = model.ask_question(image, "What objects are in this image?")
print("Answer:", answer)
```

**Multimodal embeddings:**
```python
from vertexai.vision_models import MultiModalEmbeddingModel, Image

model = MultiModalEmbeddingModel.from_pretrained("multimodalembedding@001")
image = Image.load_from_file("image.jpg")

response = model.get_embeddings(
    image=image,
    contextual_text="A beautiful landscape",
    dimension=512
)

print(f"Image embedding dimension: {len(response.image_embedding)}")
```