# Media Extraction

Goose3 provides image and video extraction with support for metadata, dimensions, and embedded content from various platforms. It automatically identifies and extracts media elements from article content, including main images and embedded videos.

## Capabilities

### Image Classes

Classes representing extracted images with metadata and storage capabilities.

```python { .api }
class Image:
    """Extracted image together with its metadata.

    API listing only: each property below documents an attribute exposed
    on the image object; the bodies are intentionally empty.
    """

    def __init__(self):
        """Initialize image container."""

    @property
    def src(self) -> str:
        """Image source URL."""

    @property
    def width(self) -> int:
        """Image width in pixels."""

    @property
    def height(self) -> int:
        """Image height in pixels."""

    @property
    def top_image_node(self):
        """DOM node of the image element."""

    @property
    def confidence_score(self) -> float:
        """Confidence score for image extraction (0.0-1.0)."""

    @property
    def extraction_type(self) -> str:
        """Type of extraction used (e.g., 'bestGuess', 'linkTag', 'openGraph')."""

    @property
    def bytes(self) -> int:
        """Size of the image in bytes."""
class ImageDetails:
    """Detailed image information and metadata container."""

    def __init__(self):
        """Initialize detailed image information."""

    # Contains extended image metadata and analysis results
class LocallyStoredImage:
    """Container for locally stored/cached images."""

    def __init__(self):
        """Initialize local image storage container."""

    # Manages local storage paths and cached image data
```

### Video Classes

Classes representing extracted video content from embedded players and media platforms.

```python { .api }
class Video:
    """Embedded video extracted from an article.

    API listing only: each property below documents an attribute exposed
    on the video object; the bodies are intentionally empty.
    """

    def __init__(self):
        """Initialize video container."""

    @property
    def src(self) -> str:
        """Video source URL."""

    @property
    def embed_code(self) -> str:
        """HTML embed code for the video."""

    @property
    def embed_type(self) -> str:
        """Type of embed (e.g., 'iframe', 'object')."""

    @property
    def width(self) -> int:
        """Video width in pixels."""

    @property
    def height(self) -> int:
        """Video height in pixels."""

    @property
    def provider(self) -> str:
        """Video provider/platform name."""
```

### Image Extraction Usage

Basic image access:

```python
from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/article')

# Access main article image
if article.top_image:
    image = article.top_image
    print(f"Image URL: {image.src}")

    # Check if valid dimensions were extracted before printing or dividing,
    # so missing values are never shown as if they were real dimensions.
    if image.width and image.height:
        print(f"Dimensions: {image.width}x{image.height}")
        aspect_ratio = image.width / image.height
        print(f"Aspect ratio: {aspect_ratio:.2f}")
    else:
        print("Dimensions not available")
else:
    print("No main image found in article")
```

Image fetching configuration:

```python
from goose3 import Goose

# Enable image fetching and local storage
config = {
    'enable_image_fetching': True,
    'local_storage_path': '/tmp/goose_images',
}

g = Goose(config)
article = g.extract(url='https://example.com/article')

# top_image is only reported when extraction found a main image
if article.top_image:
    print(f"Image fetched and stored: {article.top_image.src}")
```

### Video Extraction Usage

Basic video access:

```python
from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/article')

# Access embedded videos
if article.movies:
    print(f"Found {len(article.movies)} videos")

    # start=1 gives human-friendly numbering without manual i+1 arithmetic
    for i, video in enumerate(article.movies, start=1):
        print(f"\nVideo {i}:")
        print(f" Source: {video.src}")
        print(f" Type: {video.embed_type}")
        print(f" Dimensions: {video.width}x{video.height}")
        # Fall back to '' so slicing cannot fail when no embed code is present
        print(f" Embed code: {(video.embed_code or '')[:100]}...")
else:
    print("No videos found in article")
```

Working with video embeds:

```python
from goose3 import Goose

g = Goose()
article = g.extract(url='https://example.com/article')

for video in article.movies:
    # Normalize a missing source to '' so the substring checks below
    # cannot raise on a non-string value.
    src = video.src or ''

    if video.embed_type == 'iframe':
        # Handle iframe embeds
        print(f"Iframe video: {video.src}")
        print(f"Embed HTML: {video.embed_code}")
    elif video.embed_type == 'object':
        # Handle object embeds
        print(f"Object video: {video.src}")

    # Check video platform
    if 'youtube.com' in src or 'youtu.be' in src:
        print("YouTube video detected")
    elif 'vimeo.com' in src:
        print("Vimeo video detected")
    elif 'kewego.com' in src:
        print("Kewego video detected")
```

### Platform-Specific Video Support

Goose3 supports extraction from various video platforms:

```python
from goose3 import Goose

g = Goose()

# YouTube video extraction
article = g.extract(url='https://example.com/article-with-youtube')
for video in article.movies:
    # (video.src or '') keeps .lower() safe when a video has no source URL
    if 'youtube' in (video.src or '').lower():
        print(f"YouTube video ID can be extracted from: {video.src}")

# Vimeo video extraction
article = g.extract(url='https://example.com/article-with-vimeo')
for video in article.movies:
    if 'vimeo' in (video.src or '').lower():
        print(f"Vimeo video: {video.src}")

# Generic iframe embeds (iterates the most recently extracted article)
for video in article.movies:
    if video.embed_type == 'iframe':
        print(f"Generic iframe embed: {video.embed_code}")
```

### Media Validation

Checking media availability and quality:

```python
from goose3 import Goose

# Thresholds and platform list used by the checks below
MIN_WIDTH = 300
MIN_HEIGHT = 200
KNOWN_PLATFORMS = ('youtube', 'vimeo', 'dailymotion')

g = Goose({'enable_image_fetching': True})
article = g.extract(url='https://example.com/article')

# Validate main image
if article.top_image:
    image = article.top_image

    # Check if image has valid URL
    if image.src and image.src.startswith(('http://', 'https://')):
        print(f"Valid image URL: {image.src}")

        # Check dimensions
        if image.width and image.height:
            if image.width >= MIN_WIDTH and image.height >= MIN_HEIGHT:
                print("Image meets minimum size requirements")
            else:
                print("Image is quite small")
        else:
            print("Image dimensions not available")
    else:
        print("Invalid or missing image URL")

# Validate videos: keep those that have both a source and an embed code
# and whose source mentions one of the known platforms.
valid_videos = [
    video
    for video in article.movies
    if video.src
    and video.embed_code
    and any(platform in video.src.lower() for platform in KNOWN_PLATFORMS)
]

print(f"Found {len(valid_videos)} valid videos from known platforms")
```

### Error Handling for Media

```python
from goose3 import Goose

# Bind the name before the try block so the handler can tell whether
# extraction produced an article, without inspecting locals() and without
# picking up a stale `article` left over from earlier code.
article = None

try:
    g = Goose({
        'enable_image_fetching': True,
        'local_storage_path': '/tmp/goose_images',
    })
    article = g.extract(url='https://example.com/article')

    # Safe media access
    image_available = bool(article.top_image and article.top_image.src)
    videos_available = bool(article.movies)

    print(f"Media extraction results - Images: {image_available}, Videos: {videos_available}")

except Exception as e:
    print(f"Media extraction error: {e}")
    # Continue with article text even if media extraction fails
    title = article.title if article is not None else 'N/A'
    print(f"Article title: {title}")
```