or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

article-data.md configuration.md core-extraction.md index.md media-extraction.md

docs/article-data.md

0

# Article Data

1

2

The Article class is a rich data structure containing all content, metadata, and media extracted from a web page. Its read-only properties expose the extracted information, including text content, metadata, images, videos, and structural data.

3

4

## Capabilities

5

6

### Article Class

7

8

Main container for all extracted article information with read-only properties providing access to content and metadata.

9

10

```python { .api }

11

class Article:

12

    def __init__(self):

13

        """Initialize empty article container."""

14

15

    # Content properties

16

    @property

17

    def title(self) -> str:

18

        """Article title extracted from page."""

19

20

    @property

21

    def cleaned_text(self) -> str:

22

        """Main article text content, cleaned and formatted."""

23

24

    @property

25

    def meta_description(self) -> str:

26

        """Meta description from page metadata."""

27

28

    @property

29

    def meta_lang(self) -> str:

30

        """Language metadata from page."""

31

32

    @property

33

    def meta_favicon(self) -> str:

34

        """Favicon URL extracted from page."""

35

36

    @property

37

    def meta_keywords(self) -> str:

38

        """Meta keywords from page metadata."""

39

40

    @property

41

    def meta_encoding(self) -> list:

42

        """Character encoding information."""

43

44

    # URL and domain properties

45

    @property

46

    def canonical_link(self) -> str:

47

        """Canonical URL from page metadata."""

48

49

    @property

50

    def domain(self) -> str:

51

        """Domain name of the article source."""

52

53

    @property

54

    def final_url(self) -> str:

55

        """Final resolved URL after redirects."""

56

57

    @property

58

    def link_hash(self) -> str:

59

        """Hash of the article URL."""

60

61

    # Content structure properties

62

    @property

63

    def top_node(self):

64

        """Main content DOM node (parser-specific object)."""

65

66

    @property

67

    def top_node_raw_html(self) -> str:

68

        """Raw HTML of the main content area."""

69

70

    @property

71

    def raw_html(self) -> str:

72

        """Original HTML of the entire page."""

73

74

    @property

75

    def doc(self):

76

        """Parsed document object (parser-specific)."""

77

78

    @property

79

    def raw_doc(self):

80

        """Raw document object before processing."""

81

82

    # Media properties

83

    @property

84

    def top_image(self) -> Image:

85

        """Main article image object."""

86

87

    @property

88

    def movies(self) -> list:

89

        """List of Video objects for embedded videos."""

90

91

    # Structured data properties

92

    @property

93

    def tags(self) -> list:

94

        """List of article tags extracted from page."""

95

96

    @property

97

    def opengraph(self) -> dict:

98

        """OpenGraph metadata as dictionary."""

99

100

    @property

101

    def tweets(self) -> list:

102

        """List of embedded tweets."""

103

104

    @property

105

    def links(self) -> list:

106

        """List of links found in article content."""

107

108

    @property

109

    def authors(self) -> list:

110

        """List of article authors."""

111

112

    @property

113

    def schema(self):

114

        """Schema.org structured data from page."""

115

116

    # Date properties

117

    @property

118

    def publish_date(self) -> str:

119

        """Publication date as string."""

120

121

    @property

122

    def publish_datetime_utc(self):

123

        """Publication datetime in UTC (datetime object)."""

124

125

    # Additional data

126

    @property

127

    def additional_data(self) -> dict:

128

        """Additional extracted data as dictionary."""

129

130

    @property

131

    def infos(self) -> dict:

132

        """Extraction information and statistics."""

133

```

134

135

### Property Usage Examples

136

137

Accessing article content:

138

139

```python

140

from goose3 import Goose

141

142

g = Goose()

143

article = g.extract(url='https://example.com/article')

144

145

# Basic content

146

print(f"Title: {article.title}")

147

print(f"Text length: {len(article.cleaned_text)} characters")

148

print(f"Description: {article.meta_description}")

149

150

# Metadata

151

print(f"Language: {article.meta_lang}")

152

print(f"Domain: {article.domain}")

153

print(f"Final URL: {article.final_url}")

154

print(f"Keywords: {article.meta_keywords}")

155

```

156

157

Working with images:

158

159

```python

160

article = g.extract(url='https://example.com/article')

161

162

if article.top_image:

163

    print(f"Main image: {article.top_image.src}")

164

    print(f"Image dimensions: {article.top_image.width}x{article.top_image.height}")

165

else:

166

    print("No main image found")

167

```

168

169

Accessing embedded media:

170

171

```python

172

article = g.extract(url='https://example.com/article')

173

174

# Videos

175

if article.movies:

176

    for video in article.movies:

177

        print(f"Video source: {video.src}")

178

        print(f"Embed code: {video.embed_code}")

179

        print(f"Video type: {video.embed_type}")

180

181

# Tweets

182

if article.tweets:

183

    print(f"Found {len(article.tweets)} embedded tweets")

184

```

185

186

Working with structured data:

187

188

```python

189

article = g.extract(url='https://example.com/article')

190

191

# OpenGraph data

192

if article.opengraph:

193

    og_title = article.opengraph.get('title')

194

    og_image = article.opengraph.get('image')

195

    print(f"OG Title: {og_title}")

196

    print(f"OG Image: {og_image}")

197

198

# Tags and categories

199

if article.tags:

200

    print(f"Tags: {', '.join(article.tags)}")

201

202

# Author information

203

if article.authors:

204

    print(f"Authors: {', '.join(article.authors)}")

205

206

# Schema.org data

207

if article.schema:

208

    print(f"Schema data available: {type(article.schema)}")

209

```

210

211

Date and time information:

212

213

```python

214

article = g.extract(url='https://example.com/article')

215

216

if article.publish_date:

217

    print(f"Published: {article.publish_date}")

218

219

if article.publish_datetime_utc:

220

    print(f"Published (UTC): {article.publish_datetime_utc}")

221

```

222

223

Raw content access:

224

225

```python

226

article = g.extract(url='https://example.com/article')

227

228

# Raw HTML content

229

print(f"Page HTML length: {len(article.raw_html)} characters")

230

print(f"Main content HTML: {article.top_node_raw_html[:200]}...")

231

232

# Extraction statistics

233

if article.infos:

234

    print(f"Extraction info: {article.infos}")

235

236

# Additional extracted data

237

if article.additional_data:

238

    print(f"Additional data keys: {list(article.additional_data.keys())}")

239

```

240

241

### Data Availability

242

243

Not all properties will have values for every article. Always check for empty values:

244

245

```python

246

article = g.extract(url='https://example.com/article')

247

248

# Safe access patterns

249

title = article.title or "No title found"

250

description = article.meta_description or "No description available"

251

252

# Check for media

253

has_image = article.top_image is not None

254

has_videos = bool(article.movies)

255

has_authors = bool(article.authors)

256

257

print(f"Content available - Image: {has_image}, Videos: {has_videos}, Authors: {has_authors}")

258

```