Tessl Tile for pypi/goose3@3.1.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

article-data.md configuration.md core-extraction.md index.md media-extraction.md

docs/article-data.md

0
# Article Data
1

2
The Article class is a rich data structure containing all content, metadata, and media extracted from a web page. It provides comprehensive property access to the extracted information, including text content, metadata, images, videos, and structural data.
3

4
## Capabilities
5

6
### Article Class
7

8
Main container for all extracted article information with read-only properties providing access to content and metadata.
9

10
```python { .api }
11
class Article:
12
    def __init__(self):
13
        """Initialize empty article container."""
14
        
15
    # Content properties
16
    @property
17
    def title(self) -> str:
18
        """Article title extracted from page."""
19
        
20
    @property  
21
    def cleaned_text(self) -> str:
22
        """Main article text content, cleaned and formatted."""
23
        
24
    @property
25
    def meta_description(self) -> str:
26
        """Meta description from page metadata."""
27
        
28
    @property
29
    def meta_lang(self) -> str:
30
        """Language metadata from page."""
31
        
32
    @property
33
    def meta_favicon(self) -> str:
34
        """Favicon URL extracted from page."""
35
        
36
    @property
37
    def meta_keywords(self) -> str:
38
        """Meta keywords from page metadata."""
39
        
40
    @property
41
    def meta_encoding(self) -> list:
42
        """Character encoding information."""
43
        
44
    # URL and domain properties
45
    @property
46
    def canonical_link(self) -> str:
47
        """Canonical URL from page metadata."""
48
        
49
    @property
50
    def domain(self) -> str:
51
        """Domain name of the article source."""
52
        
53
    @property
54
    def final_url(self) -> str:
55
        """Final resolved URL after redirects."""
56
        
57
    @property
58
    def link_hash(self) -> str:
59
        """Hash of the article URL."""
60
        
61
    # Content structure properties
62
    @property
63
    def top_node(self):
64
        """Main content DOM node (parser-specific object)."""
65
        
66
    @property
67
    def top_node_raw_html(self) -> str:
68
        """Raw HTML of the main content area."""
69
        
70
    @property
71
    def raw_html(self) -> str:
72
        """Original HTML of the entire page."""
73
        
74
    @property
75
    def doc(self):
76
        """Parsed document object (parser-specific)."""
77
        
78
    @property
79
    def raw_doc(self):
80
        """Raw document object before processing."""
81
        
82
    # Media properties
83
    @property
84
    def top_image(self) -> Image:
85
        """Main article image object."""
86
        
87
    @property
88
    def movies(self) -> list:
89
        """List of Video objects for embedded videos."""
90
        
91
    # Structured data properties
92
    @property
93
    def tags(self) -> list:
94
        """List of article tags extracted from page."""
95
        
96
    @property
97
    def opengraph(self) -> dict:
98
        """OpenGraph metadata as dictionary."""
99
        
100
    @property
101
    def tweets(self) -> list:
102
        """List of embedded tweets."""
103
        
104
    @property
105
    def links(self) -> list:
106
        """List of links found in article content."""
107
        
108
    @property
109
    def authors(self) -> list:
110
        """List of article authors."""
111
        
112
    @property
113
    def schema(self):
114
        """Schema.org structured data from page."""
115
        
116
    # Date properties
117
    @property
118
    def publish_date(self) -> str:
119
        """Publication date as string."""
120
        
121
    @property
122
    def publish_datetime_utc(self):
123
        """Publication datetime in UTC (datetime object)."""
124
        
125
    # Additional data
126
    @property
127
    def additional_data(self) -> dict:
128
        """Additional extracted data as dictionary."""
129
        
130
    @property
131
    def infos(self) -> dict:
132
        """Extraction information and statistics."""
133
```
134

135
### Property Usage Examples
136

137
Accessing article content:
138

139
```python
140
from goose3 import Goose
141

142
g = Goose()
143
article = g.extract(url='https://example.com/article')
144

145
# Basic content
146
print(f"Title: {article.title}")
147
print(f"Text length: {len(article.cleaned_text)} characters")
148
print(f"Description: {article.meta_description}")
149

150
# Metadata
151
print(f"Language: {article.meta_lang}")
152
print(f"Domain: {article.domain}")
153
print(f"Final URL: {article.final_url}")
154
print(f"Keywords: {article.meta_keywords}")
155
```
156

157
Working with images:
158

159
```python
160
article = g.extract(url='https://example.com/article')
161

162
if article.top_image:
163
    print(f"Main image: {article.top_image.src}")
164
    print(f"Image dimensions: {article.top_image.width}x{article.top_image.height}")
165
else:
166
    print("No main image found")
167
```
168

169
Accessing embedded media:
170

171
```python
172
article = g.extract(url='https://example.com/article')
173

174
# Videos
175
if article.movies:
176
    for video in article.movies:
177
        print(f"Video source: {video.src}")
178
        print(f"Embed code: {video.embed_code}")
179
        print(f"Video type: {video.embed_type}")
180

181
# Tweets
182
if article.tweets:
183
    print(f"Found {len(article.tweets)} embedded tweets")
184
```
185

186
Working with structured data:
187

188
```python
189
article = g.extract(url='https://example.com/article')
190

191
# OpenGraph data
192
if article.opengraph:
193
    og_title = article.opengraph.get('title')
194
    og_image = article.opengraph.get('image')
195
    print(f"OG Title: {og_title}")
196
    print(f"OG Image: {og_image}")
197

198
# Tags and categories
199
if article.tags:
200
    print(f"Tags: {', '.join(article.tags)}")
201

202
# Author information
203
if article.authors:
204
    print(f"Authors: {', '.join(article.authors)}")
205

206
# Schema.org data
207
if article.schema:
208
    print(f"Schema data available: {type(article.schema)}")
209
```
210

211
Date and time information:
212

213
```python
214
article = g.extract(url='https://example.com/article')
215

216
if article.publish_date:
217
    print(f"Published: {article.publish_date}")
218

219
if article.publish_datetime_utc:
220
    print(f"Published (UTC): {article.publish_datetime_utc}")
221
```
222

223
Raw content access:
224

225
```python
226
article = g.extract(url='https://example.com/article')
227

228
# Raw HTML content
229
print(f"Page HTML length: {len(article.raw_html)} characters")
230
print(f"Main content HTML: {article.top_node_raw_html[:200]}...")
231

232
# Extraction statistics
233
if article.infos:
234
    print(f"Extraction info: {article.infos}")
235

236
# Additional extracted data
237
if article.additional_data:
238
    print(f"Additional data keys: {list(article.additional_data.keys())}")
239
```
240

241
### Data Availability
242

243
Not all properties will have values for every article. Always check for empty values:
244

245
```python
246
article = g.extract(url='https://example.com/article')
247

248
# Safe access patterns
249
title = article.title or "No title found"
250
description = article.meta_description or "No description available"
251

252
# Check for media
253
has_image = article.top_image is not None
254
has_videos = bool(article.movies)
255
has_authors = bool(article.authors)
256

257
print(f"Content available - Image: {has_image}, Videos: {has_videos}, Authors: {has_authors}")
258
```

Version

Tile

Files

docs/article-data.md

docs/article-data.md