Tessl Tile for pypi/goose3@3.1.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

article-data.md configuration.md core-extraction.md index.md media-extraction.md

docs/media-extraction.md

0
# Media Extraction
1

2
Goose3 provides image and video extraction with support for metadata, dimensions, and embedded content from various platforms. It automatically identifies and extracts media elements from article content, including the main image and embedded videos.
3

4
## Capabilities
5

6
### Image Classes
7

8
Classes representing extracted images with metadata and storage capabilities.
9

10
```python { .api }
11
class Image:
12
    def __init__(self):
13
        """Initialize image container."""
14
        
15
    @property
16
    def src(self) -> str:
17
        """Image source URL."""
18
        
19
    @property
20
    def width(self) -> int:
21
        """Image width in pixels."""
22
        
23
    @property
24
    def height(self) -> int:
25
        """Image height in pixels."""
26
        
27
    @property
28
    def top_image_node(self):
29
        """DOM node of the image element."""
30
        
31
    @property
32
    def confidence_score(self) -> float:
33
        """Confidence score for image extraction (0.0-1.0)."""
34
        
35
    @property
36
    def extraction_type(self) -> str:
37
        """Type of extraction used (e.g., 'bestGuess', 'linkTag', 'openGraph')."""
38
        
39
    @property
40
    def bytes(self) -> int:
41
        """Size of the image in bytes."""
42

43
class ImageDetails:
44
    """Detailed image information and metadata container."""
45
    def __init__(self):
46
        """Initialize detailed image information."""
47
        
48
    # Contains extended image metadata and analysis results
49

50
class LocallyStoredImage:
51
    """Container for locally stored/cached images."""
52
    def __init__(self):
53
        """Initialize local image storage container."""
54
        
55
    # Manages local storage paths and cached image data
56
```
57

58
### Video Classes
59

60
Classes representing extracted video content from embedded players and media platforms.
61

62
```python { .api }
63
class Video:
64
    def __init__(self):
65
        """Initialize video container."""
66
        
67
    @property
68
    def src(self) -> str:
69
        """Video source URL."""
70
        
71
    @property
72
    def embed_code(self) -> str:
73
        """HTML embed code for the video."""
74
        
75
    @property
76
    def embed_type(self) -> str:
77
        """Type of embed (e.g., 'iframe', 'object')."""
78
        
79
    @property
80
    def width(self) -> int:
81
        """Video width in pixels."""
82
        
83
    @property
84
    def height(self) -> int:
85
        """Video height in pixels."""
86
        
87
    @property
88
    def provider(self) -> str:
89
        """Video provider/platform name."""
90
```
91

92
### Image Extraction Usage
93

94
Basic image access:
95

96
```python
97
from goose3 import Goose
98

99
g = Goose()
100
article = g.extract(url='https://example.com/article')
101

102
# Access main article image
103
if article.top_image:
104
    image = article.top_image
105
    print(f"Image URL: {image.src}")
106
    print(f"Dimensions: {image.width}x{image.height}")
107
    
108
    # Check if valid dimensions were extracted
109
    if image.width and image.height:
110
        aspect_ratio = image.width / image.height
111
        print(f"Aspect ratio: {aspect_ratio:.2f}")
112
    else:
113
        print("Dimensions not available")
114
else:
115
    print("No main image found in article")
116
```
117

118
Image fetching configuration:
119

120
```python
121
from goose3 import Goose
122

123
# Enable image fetching and local storage
124
config = {
125
    'enable_image_fetching': True,
126
    'local_storage_path': '/tmp/goose_images'
127
}
128

129
g = Goose(config)
130
article = g.extract(url='https://example.com/article')
131

132
if article.top_image:
133
    print(f"Image fetched and stored: {article.top_image.src}")
134
```
135

136
### Video Extraction Usage
137

138
Basic video access:
139

140
```python
141
from goose3 import Goose
142

143
g = Goose()
144
article = g.extract(url='https://example.com/article')
145

146
# Access embedded videos
147
if article.movies:
148
    print(f"Found {len(article.movies)} videos")
149
    
150
    for i, video in enumerate(article.movies):
151
        print(f"\nVideo {i+1}:")
152
        print(f"  Source: {video.src}")
153
        print(f"  Type: {video.embed_type}")
154
        print(f"  Dimensions: {video.width}x{video.height}")
155
        print(f"  Embed code: {video.embed_code[:100]}...")
156
else:
157
    print("No videos found in article")
158
```
159

160
Working with video embeds (this example reuses the `g` instance created in the previous example):
161

162
```python
163
article = g.extract(url='https://example.com/article')
164

165
for video in article.movies:
166
    if video.embed_type == 'iframe':
167
        # Handle iframe embeds
168
        print(f"Iframe video: {video.src}")
169
        print(f"Embed HTML: {video.embed_code}")
170
    elif video.embed_type == 'object':
171
        # Handle object embeds
172
        print(f"Object video: {video.src}")
173
    
174
    # Check video platform
175
    if 'youtube.com' in video.src or 'youtu.be' in video.src:
176
        print("YouTube video detected")
177
    elif 'vimeo.com' in video.src:
178
        print("Vimeo video detected")
179
    elif 'kewego.com' in video.src:
180
        print("Kewego video detected")
181
```
182

183
### Platform-Specific Video Support
184

185
Goose3 supports extraction from various video platforms (these examples reuse the `g` instance created earlier):
186

187
```python
188
# YouTube video extraction
189
article = g.extract(url='https://example.com/article-with-youtube')
190
for video in article.movies:
191
    if 'youtube' in video.src.lower():
192
        print(f"YouTube video ID can be extracted from: {video.src}")
193

194
# Vimeo video extraction  
195
article = g.extract(url='https://example.com/article-with-vimeo')
196
for video in article.movies:
197
    if 'vimeo' in video.src.lower():
198
        print(f"Vimeo video: {video.src}")
199

200
# Generic iframe embeds
201
for video in article.movies:
202
    if video.embed_type == 'iframe':
203
        print(f"Generic iframe embed: {video.embed_code}")
204
```
205

206
### Media Validation
207

208
Checking media availability and quality:
209

210
```python
211
from goose3 import Goose
212

213
g = Goose({'enable_image_fetching': True})
214
article = g.extract(url='https://example.com/article')
215

216
# Validate main image
217
if article.top_image:
218
    image = article.top_image
219
    
220
    # Check if image has valid URL
221
    if image.src and image.src.startswith(('http://', 'https://')):
222
        print(f"Valid image URL: {image.src}")
223
        
224
        # Check dimensions
225
        if image.width and image.height:
226
            if image.width >= 300 and image.height >= 200:
227
                print("Image meets minimum size requirements")
228
            else:
229
                print("Image is quite small")
230
        else:
231
            print("Image dimensions not available")
232
    else:
233
        print("Invalid or missing image URL")
234

235
# Validate videos
236
valid_videos = []
237
for video in article.movies:
238
    if video.src and video.embed_code:
239
        if any(platform in video.src.lower() 
240
               for platform in ['youtube', 'vimeo', 'dailymotion']):
241
            valid_videos.append(video)
242

243
print(f"Found {len(valid_videos)} valid videos from known platforms")
244
```
245

246
### Error Handling for Media
247

248
```python
249
from goose3 import Goose
250

251
try:
252
    g = Goose({
253
        'enable_image_fetching': True,
254
        'local_storage_path': '/tmp/goose_images'
255
    })
256
    article = g.extract(url='https://example.com/article')
257
    
258
    # Safe media access
259
    image_available = bool(article.top_image and article.top_image.src)
260
    videos_available = bool(article.movies)
261
    
262
    print(f"Media extraction results - Images: {image_available}, Videos: {videos_available}")
263
    
264
except Exception as e:
265
    print(f"Media extraction error: {e}")
266
    # Continue with article text even if media extraction fails
267
    print(f"Article title: {article.title if 'article' in locals() else 'N/A'}")
268
```

Version

Tile

Files

docs/media-extraction.md

docs/media-extraction.md