or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

article-data.md · configuration.md · core-extraction.md · index.md · media-extraction.md

docs/media-extraction.md

0

# Media Extraction

1

2

Goose3 provides image and video extraction capabilities, with support for metadata, dimensions, and embedded content from various platforms. It automatically identifies and extracts media elements from article content, including the main image and any embedded videos.

3

4

## Capabilities

5

6

### Image Classes

7

8

Classes representing extracted images with metadata and storage capabilities.

9

10

```python { .api }

11

class Image:

12

def __init__(self):

13

"""Initialize image container."""

14

15

@property

16

def src(self) -> str:

17

"""Image source URL."""

18

19

@property

20

def width(self) -> int:

21

"""Image width in pixels."""

22

23

@property

24

def height(self) -> int:

25

"""Image height in pixels."""

26

27

@property

28

def top_image_node(self):

29

"""DOM node of the image element."""

30

31

@property

32

def confidence_score(self) -> float:

33

"""Confidence score for image extraction (0.0-1.0)."""

34

35

@property

36

def extraction_type(self) -> str:

37

"""Type of extraction used (e.g., 'bestGuess', 'linkTag', 'openGraph')."""

38

39

@property

40

def bytes(self) -> int:

41

"""Size of the image in bytes."""

42

43

class ImageDetails:

44

"""Detailed image information and metadata container."""

45

def __init__(self):

46

"""Initialize detailed image information."""

47

48

# Contains extended image metadata and analysis results

49

50

class LocallyStoredImage:

51

"""Container for locally stored/cached images."""

52

def __init__(self):

53

"""Initialize local image storage container."""

54

55

# Manages local storage paths and cached image data

56

```

57

58

### Video Classes

59

60

Classes representing extracted video content from embedded players and media platforms.

61

62

```python { .api }

63

class Video:

64

def __init__(self):

65

"""Initialize video container."""

66

67

@property

68

def src(self) -> str:

69

"""Video source URL."""

70

71

@property

72

def embed_code(self) -> str:

73

"""HTML embed code for the video."""

74

75

@property

76

def embed_type(self) -> str:

77

"""Type of embed (e.g., 'iframe', 'object')."""

78

79

@property

80

def width(self) -> int:

81

"""Video width in pixels."""

82

83

@property

84

def height(self) -> int:

85

"""Video height in pixels."""

86

87

@property

88

def provider(self) -> str:

89

"""Video provider/platform name."""

90

```

91

92

### Image Extraction Usage

93

94

Basic image access:

95

96

```python

97

from goose3 import Goose

98

99

g = Goose()

100

article = g.extract(url='https://example.com/article')

101

102

# Access main article image

103

if article.top_image:

104

image = article.top_image

105

print(f"Image URL: {image.src}")

106

print(f"Dimensions: {image.width}x{image.height}")

107

108

# Check if valid dimensions were extracted

109

if image.width and image.height:

110

aspect_ratio = image.width / image.height

111

print(f"Aspect ratio: {aspect_ratio:.2f}")

112

else:

113

print("Dimensions not available")

114

else:

115

print("No main image found in article")

116

```

117

118

Image fetching configuration:

119

120

```python

121

from goose3 import Goose

122

123

# Enable image fetching and local storage

124

config = {

125

'enable_image_fetching': True,

126

'local_storage_path': '/tmp/goose_images'

127

}

128

129

g = Goose(config)

130

article = g.extract(url='https://example.com/article')

131

132

if article.top_image:

133

print(f"Image fetched and stored: {article.top_image.src}")

134

```

135

136

### Video Extraction Usage

137

138

Basic video access:

139

140

```python

141

from goose3 import Goose

142

143

g = Goose()

144

article = g.extract(url='https://example.com/article')

145

146

# Access embedded videos

147

if article.movies:

148

print(f"Found {len(article.movies)} videos")

149

150

for i, video in enumerate(article.movies):

151

print(f"\nVideo {i+1}:")

152

print(f" Source: {video.src}")

153

print(f" Type: {video.embed_type}")

154

print(f" Dimensions: {video.width}x{video.height}")

155

print(f" Embed code: {video.embed_code[:100]}...")

156

else:

157

print("No videos found in article")

158

```

159

160

Working with video embeds:

161

162

```python

163

article = g.extract(url='https://example.com/article')

164

165

for video in article.movies:

166

if video.embed_type == 'iframe':

167

# Handle iframe embeds

168

print(f"Iframe video: {video.src}")

169

print(f"Embed HTML: {video.embed_code}")

170

elif video.embed_type == 'object':

171

# Handle object embeds

172

print(f"Object video: {video.src}")

173

174

# Check video platform

175

if 'youtube.com' in video.src or 'youtu.be' in video.src:

176

print("YouTube video detected")

177

elif 'vimeo.com' in video.src:

178

print("Vimeo video detected")

179

elif 'kewego.com' in video.src:

180

print("Kewego video detected")

181

```

182

183

### Platform-Specific Video Support

184

185

Goose3 supports extraction from various video platforms:

186

187

```python

188

# YouTube video extraction

189

article = g.extract(url='https://example.com/article-with-youtube')

190

for video in article.movies:

191

if 'youtube' in video.src.lower():

192

print(f"YouTube video ID can be extracted from: {video.src}")

193

194

# Vimeo video extraction

195

article = g.extract(url='https://example.com/article-with-vimeo')

196

for video in article.movies:

197

if 'vimeo' in video.src.lower():

198

print(f"Vimeo video: {video.src}")

199

200

# Generic iframe embeds

201

for video in article.movies:

202

if video.embed_type == 'iframe':

203

print(f"Generic iframe embed: {video.embed_code}")

204

```

205

206

### Media Validation

207

208

Checking media availability and quality:

209

210

```python

211

from goose3 import Goose

212

213

g = Goose({'enable_image_fetching': True})

214

article = g.extract(url='https://example.com/article')

215

216

# Validate main image

217

if article.top_image:

218

image = article.top_image

219

220

# Check if image has valid URL

221

if image.src and image.src.startswith(('http://', 'https://')):

222

print(f"Valid image URL: {image.src}")

223

224

# Check dimensions

225

if image.width and image.height:

226

if image.width >= 300 and image.height >= 200:

227

print("Image meets minimum size requirements")

228

else:

229

print("Image is quite small")

230

else:

231

print("Image dimensions not available")

232

else:

233

print("Invalid or missing image URL")

234

235

# Validate videos

236

valid_videos = []

237

for video in article.movies:

238

if video.src and video.embed_code:

239

if any(platform in video.src.lower()

240

for platform in ['youtube', 'vimeo', 'dailymotion']):

241

valid_videos.append(video)

242

243

print(f"Found {len(valid_videos)} valid videos from known platforms")

244

```

245

246

### Error Handling for Media

247

248

```python

249

from goose3 import Goose

250

251

try:

252

g = Goose({

253

'enable_image_fetching': True,

254

'local_storage_path': '/tmp/goose_images'

255

})

256

article = g.extract(url='https://example.com/article')

257

258

# Safe media access

259

image_available = bool(article.top_image and article.top_image.src)

260

videos_available = bool(article.movies)

261

262

print(f"Media extraction results - Images: {image_available}, Videos: {videos_available}")

263

264

except Exception as e:

265

print(f"Media extraction error: {e}")

266

# Continue with article text even if media extraction fails

267

print(f"Article title: {article.title if 'article' in locals() else 'N/A'}")

268

```