or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli-interface.md error-handling.md index.md simple-api.md sitemap-index.md sitemap-parsing.md sitemap-streams.md validation-utilities.md xml-validation.md

docs/sitemap-parsing.md

0

# Sitemap Parsing

1

2

Functionality for parsing existing XML sitemaps back into JavaScript objects for analysis and manipulation. These utilities allow you to read and process existing sitemaps from files or streams.

3

4

## Capabilities

5

6

### Sitemap Parsing Functions

7

8

High-level functions for parsing complete sitemaps into arrays of items.

9

10

```typescript { .api }

11

/**

12

* Parse XML sitemap and resolve with array of sitemap items

13

* @param xml - Readable stream containing XML sitemap data

14

* @returns Promise resolving to array of parsed sitemap items

15

*/

16

function parseSitemap(xml: Readable): Promise<SitemapItem[]>;

17

18

/**

19

* Parse XML sitemap index and resolve with array of index items

20

* @param xml - Readable stream containing XML sitemap index data

21

* @returns Promise resolving to array of parsed index items

22

*/

23

function parseSitemapIndex(xml: Readable): Promise<IndexItem[]>;

24

```

25

26

**Usage Examples:**

27

28

```typescript

29

import { parseSitemap, parseSitemapIndex } from "sitemap";

30

import { createReadStream } from "fs";

31

32

// Parse sitemap from file

33

const sitemapItems = await parseSitemap(

34

createReadStream("./sitemap.xml")

35

);

36

37

console.log("Found", sitemapItems.length, "URLs");

38

sitemapItems.forEach(item => {

39

console.log(`${item.url} - ${item.changefreq} - ${item.priority}`);

40

});

41

42

// Parse sitemap index

43

const indexItems = await parseSitemapIndex(

44

createReadStream("./sitemap-index.xml")

45

);

46

47

console.log("Found", indexItems.length, "sitemaps");

48

indexItems.forEach(item => {

49

console.log(`${item.url} - ${item.lastmod}`);

50

});

51

```

52

53

### XMLToSitemapItemStream

54

55

Transform stream for parsing XML sitemap data into individual sitemap items.

56

57

```typescript { .api }

58

/**

59

* Transform stream that converts XML sitemap data into SitemapItem objects

60

* Use this to parse existing sitemaps into config options compatible with this library

61

*/

62

class XMLToSitemapItemStream extends Transform {

63

constructor(opts?: XMLToSitemapItemStreamOptions);

64

65

/** Error handling level */

66

level: ErrorLevel;

67

68

/** Logger function for warnings and errors */

69

logger: Logger;

70

71

/** Current parsing error */

72

error: Error | null;

73

74

/** SAX stream parser */

75

saxStream: SAXStream;

76

}

77

78

interface XMLToSitemapItemStreamOptions extends TransformOptions {

79

/** Error handling level for validation */

80

level?: ErrorLevel;

81

82

/** Custom logger function or false to disable logging */

83

logger?: Logger | false;

84

}

85

86

type Logger = (

87

level: 'warn' | 'error' | 'info' | 'log',

88

...message: Parameters<Console['log']>[0]

89

) => void;

90

```

91

92

**Usage Examples:**

93

94

```typescript

95

import { XMLToSitemapItemStream, ErrorLevel, SitemapItem } from "sitemap";

96

import { createReadStream } from "fs";

97

98

// Parse with strict error handling

99

const parser = new XMLToSitemapItemStream({

100

level: ErrorLevel.THROW

101

});

102

103

const sitemapItems: SitemapItem[] = [];

104

105

createReadStream("sitemap.xml")

106

.pipe(parser)

107

.on('data', (item: SitemapItem) => {

108

sitemapItems.push(item);

109

})

110

.on('end', () => {

111

console.log("Parsed", sitemapItems.length, "items");

112

})

113

.on('error', (error) => {

114

console.error("Parse error:", error);

115

});

116

117

// Parse with custom logging

118

const customParser = new XMLToSitemapItemStream({

119

level: ErrorLevel.WARN,

120

logger: (level, ...args) => {

121

console.log(`[${level.toUpperCase()}]`, ...args);

122

}

123

});

124

```

125

126

### XMLToSitemapIndexStream

127

128

Transform stream for parsing XML sitemap index data into index items.

129

130

```typescript { .api }

131

/**

132

* Transform stream that converts XML sitemap index data into IndexItem objects

133

* Use this to parse existing sitemap indices into config options

134

*/

135

class XMLToSitemapIndexStream extends Transform {

136

constructor(opts?: XMLToSitemapIndexItemStreamOptions);

137

138

/** Error handling level */

139

level: ErrorLevel;

140

141

/** Logger function */

142

logger: Logger;

143

144

/** SAX stream parser */

145

saxStream: SAXStream;

146

}

147

148

interface XMLToSitemapIndexItemStreamOptions extends TransformOptions {

149

/** Error handling level for validation */

150

level?: ErrorLevel;

151

152

/** Custom logger function or false to disable logging */

153

logger?: Logger | false;

154

}

155

```

156

157

**Usage Examples:**

158

159

```typescript

160

import { XMLToSitemapIndexStream, IndexItem } from "sitemap";

161

import { createReadStream } from "fs";

162

163

const indexParser = new XMLToSitemapIndexStream();

164

const indexItems: IndexItem[] = [];

165

166

createReadStream("sitemap-index.xml")

167

.pipe(indexParser)

168

.on('data', (item: IndexItem) => {

169

indexItems.push(item);

170

})

171

.on('end', () => {

172

console.log("Found sitemaps:", indexItems.map(i => i.url));

173

});

174

```

175

176

### JSON Conversion Streams

177

178

Transform streams for converting parsed objects to JSON format.

179

180

```typescript { .api }

181

/**

182

* Transform stream that converts sitemap items to JSON format

183

* @param opts - Stream options; `opts.lineSeparated` selects newline-separated entries instead of comma-separated

184

*/

185

class ObjectStreamToJSON extends Transform {

186

constructor(opts?: ObjectStreamToJSONOptions);

187

188

/** Whether to use line-separated JSON */

189

lineSeparated: boolean;

190

191

/** Whether first item has been written */

192

firstWritten: boolean;

193

}

194

195

interface ObjectStreamToJSONOptions extends TransformOptions {

196

/** Whether to separate entries by newline instead of comma */

197

lineSeparated: boolean;

198

}

199

200

/**

201

* Transform stream that converts index items to JSON format

202

*/

203

class IndexObjectStreamToJSON extends Transform {

204

constructor(opts?: IndexObjectStreamToJSONOptions);

205

206

/** Whether to use line-separated JSON */

207

lineSeparated: boolean;

208

209

/** Whether first item has been written */

210

firstWritten: boolean;

211

}

212

213

interface IndexObjectStreamToJSONOptions extends TransformOptions {

214

/** Whether to separate entries by newline instead of comma */

215

lineSeparated: boolean;

216

}

217

```

218

219

**Usage Examples:**

220

221

```typescript

222

import {

223

XMLToSitemapItemStream,

224

ObjectStreamToJSON

225

} from "sitemap";

226

import { createReadStream, createWriteStream } from "fs";

227

228

// Convert sitemap XML to JSON array

229

createReadStream("sitemap.xml")

230

.pipe(new XMLToSitemapItemStream())

231

.pipe(new ObjectStreamToJSON({ lineSeparated: false }))

232

.pipe(createWriteStream("sitemap.json"));

233

234

// Convert to line-separated JSON (JSONL)

235

createReadStream("sitemap.xml")

236

.pipe(new XMLToSitemapItemStream())

237

.pipe(new ObjectStreamToJSON({ lineSeparated: true }))

238

.pipe(createWriteStream("sitemap.jsonl"));

239

```

240

241

## Advanced Parsing Features

242

243

### Supported Sitemap Extensions

244

245

The parser supports all standard sitemap extensions:

246

247

- **Images**: `<image:image>` elements with captions, titles, geo-location

248

- **Videos**: `<video:video>` elements with thumbnails, descriptions, metadata

249

- **News**: `<news:news>` elements with publication data

250

- **Alternate Languages**: `<xhtml:link>` elements for multilingual content

251

- **Mobile**: `<mobile:mobile>` elements (legacy)

252

253

### Error Handling in Parsing

254

255

```typescript

256

import { XMLToSitemapItemStream, ErrorLevel } from "sitemap";

257

258

// Different error handling strategies

259

const silentParser = new XMLToSitemapItemStream({

260

level: ErrorLevel.SILENT // Ignore all validation errors

261

});

262

263

const warningParser = new XMLToSitemapItemStream({

264

level: ErrorLevel.WARN // Log warnings but continue

265

});

266

267

const strictParser = new XMLToSitemapItemStream({

268

level: ErrorLevel.THROW // Throw on any validation error

269

});

270

```

271

272

### Custom Validation During Parsing

273

274

```typescript

275

import { XMLToSitemapItemStream, ErrorLevel } from "sitemap";

276

277

const parser = new XMLToSitemapItemStream({

278

level: ErrorLevel.WARN,

279

logger: (level, ...args) => {

280

// Custom validation logic

281

if (level === 'warn' && args[0].includes('unhandled')) {

282

console.warn("Found unsupported sitemap element:", ...args);

283

}

284

}

285

});

286

```

287

288

## Complete Parsing Example

289

290

```typescript

291

import {

292

parseSitemap,

293

SitemapStream,

294

streamToPromise

295

} from "sitemap";

296

import { createReadStream } from "fs";

297

298

async function processSitemap() {

299

// Parse existing sitemap

300

const items = await parseSitemap(createReadStream("input.xml"));

301

302

// Filter and modify items

303

const filteredItems = items

304

.filter(item => item.priority && item.priority > 0.5)

305

.map(item => ({

306

...item,

307

lastmod: new Date().toISOString(),

308

changefreq: "weekly" as const

309

}));

310

311

// Generate new sitemap

312

const newSitemap = new SitemapStream({

313

hostname: "https://example.com"

314

});

315

316

filteredItems.forEach(item => newSitemap.write(item));

317

newSitemap.end();

318

319

const xmlBuffer = await streamToPromise(newSitemap);

320

console.log(xmlBuffer.toString());

321

}

322

```