or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

entity-parsing.md · geographic-data.md · index.md · input-format.md · tweet-model.md · user-model.md

docs/entity-parsing.md

0

# Entity Parsing

1

2

Extraction and parsing of entities from tweet text, including hashtags, URLs, user mentions, media attachments, and stock symbols.

3

4

## Capabilities

5

6

### Entities Container

7

8

Container class that holds all parsed entities from tweet text, providing structured access to hashtags, URLs, user mentions, media, and symbols.

9

10

```java { .api }

11

/**

12

* Container for all entities parsed from tweet text.

13

* Provides structured access to hashtags, URLs, mentions, media, and symbols.

14

*/

15

public class Entities {

16

17

/**

18

* Default constructor that initializes all entity lists.

19

*/

20

public Entities();

21

22

/**

23

* Get the list of hashtags found in the tweet text.

24

* @return List of HashTags objects

25

*/

26

public List<HashTags> getHashtags();

27

28

/**

29

* Set the list of hashtags found in the tweet text.

30

* @param hashtags List of HashTags objects

31

*/

32

public void setHashtags(List<HashTags> hashtags);

33

34

/**

35

* Get the list of URLs found in the tweet text.

36

* @return List of URL objects

37

*/

38

public List<URL> getUrls();

39

40

/**

41

* Set the list of URLs found in the tweet text.

42

* @param urls List of URL objects

43

*/

44

public void setUrls(List<URL> urls);

45

46

/**

47

* Get the list of user mentions found in the tweet text.

48

* @return List of UserMention objects

49

*/

50

public List<UserMention> getUser_mentions();

51

52

/**

53

* Set the list of user mentions found in the tweet text.

54

* @param user_mentions List of UserMention objects

55

*/

56

public void setUser_mentions(List<UserMention> user_mentions);

57

58

/**

59

* Get the list of media attachments in the tweet.

60

* @return List of Media objects

61

*/

62

public List<Media> getMedia();

63

64

/**

65

* Set the list of media attachments in the tweet.

66

* @param media List of Media objects

67

*/

68

public void setMedia(List<Media> media);

69

70

/**

71

* Get the list of stock symbols found in the tweet text.

72

* @return List of Symbol objects

73

*/

74

public List<Symbol> getSymbols();

75

76

/**

77

* Set the list of stock symbols found in the tweet text.

78

* @param symbols List of Symbol objects

79

*/

80

public void setSymbols(List<Symbol> symbols);

81

}

82

```

83

84

### HashTags

85

86

Hashtag entities parsed from tweet text with position information and cleaned text content.

87

88

```java { .api }

89

/**

90

* Represents hashtags parsed from tweet text.

91

* Contains the hashtag text and its position indices in the original text.

92

*/

93

public class HashTags {

94

95

/**

96

* Get the position indices of this hashtag in the tweet text.

97

* @return Array of [start, end] positions

98

*/

99

public long[] getIndices();

100

101

/**

102

* Set the position indices of this hashtag in the tweet text.

103

* @param indices Array of [start, end] positions

104

*/

105

public void setIndices(long[] indices);

106

107

/**

108

* Set the position indices of this hashtag in the tweet text.

109

* @param start Starting position in tweet text

110

* @param end Ending position in tweet text

111

*/

112

public void setIndices(long start, long end);

113

114

/**

115

* Get the hashtag text without the # symbol.

116

* @return Hashtag text

117

*/

118

public String getText();

119

120

/**

121

* Set the hashtag text, optionally removing the leading # symbol.

122

* @param text Hashtag text

123

* @param hashExist Whether the text includes the # symbol

124

*/

125

public void setText(String text, boolean hashExist);

126

}

127

```

128

129

### URL Entities

130

131

URL entities found in tweet text with expanded and display versions.

132

133

```java { .api }

134

/**

135

* Represents URLs found in tweet text.

136

* Contains original, expanded, and display versions of URLs.

137

*/

138

public class URL {

139

140

/**

141

* Get the position indices of this URL in the tweet text.

142

* @return Array of [start, end] positions

143

*/

144

public long[] getIndices();

145

146

/**

147

* Set the position indices of this URL in the tweet text.

148

* @param indices Array of [start, end] positions

149

*/

150

public void setIndices(long[] indices);

151

152

/**

153

* Get the original URL as it appears in the tweet.

154

* @return Original URL (usually shortened)

155

*/

156

public String getUrl();

157

158

/**

159

* Set the original URL as it appears in the tweet.

160

* @param url Original URL (usually shortened)

161

*/

162

public void setUrl(String url);

163

164

/**

165

* Get the expanded/resolved URL.

166

* @return Full expanded URL

167

*/

168

public String getExpanded_url();

169

170

/**

171

* Set the expanded/resolved URL.

172

* @param expanded_url Full expanded URL

173

*/

174

public void setExpanded_url(String expanded_url);

175

176

/**

177

* Get the display URL shown to users.

178

* @return Display-friendly URL

179

*/

180

public String getDisplay_url();

181

182

/**

183

* Set the display URL shown to users.

184

* @param display_url Display-friendly URL

185

*/

186

public void setDisplay_url(String display_url);

187

}

188

```

189

190

### UserMention

191

192

User mention entities (@username) found in tweet text.

193

194

```java { .api }

195

/**

196

* Represents user mentions (@username) found in tweet text.

197

* Contains user information and position data.

198

*/

199

public class UserMention {

200

201

/**

202

* Get the position indices of this mention in the tweet text.

203

* @return Array of [start, end] positions

204

*/

205

public long[] getIndices();

206

207

/**

208

* Set the position indices of this mention in the tweet text.

209

* @param indices Array of [start, end] positions

210

*/

211

public void setIndices(long[] indices);

212

213

/**

214

* Get the mentioned user's ID.

215

* @return User ID

216

*/

217

public long getId();

218

219

/**

220

* Set the mentioned user's ID.

221

* @param id User ID

222

*/

223

public void setId(long id);

224

225

/**

226

* Get the mentioned user's ID as string.

227

* @return User ID as string

228

*/

229

public String getId_str();

230

231

/**

232

* Set the mentioned user's ID as string (computed from numeric ID).

233

*/

234

public void setId_str();

235

236

/**

237

* Get the mentioned user's screen name.

238

* @return Screen name without @ symbol

239

*/

240

public String getScreen_name();

241

242

/**

243

* Set the mentioned user's screen name.

244

* @param screen_name Screen name without @ symbol

245

*/

246

public void setScreen_name(String screen_name);

247

248

/**

249

* Get the mentioned user's display name.

250

* @return Display name

251

*/

252

public String getName();

253

254

/**

255

* Set the mentioned user's display name.

256

* @param name Display name

257

*/

258

public void setName(String name);

259

}

260

```

261

262

### Media

263

264

Media attachment entities including images and videos.

265

266

```java { .api }

267

/**

268

* Represents media attachments (images, videos) in tweets.

269

* Contains URLs, dimensions, and metadata for media content.

270

*/

271

public class Media {

272

273

/**

274

* Get the position indices of this media in the tweet text.

275

* @return Array of [start, end] positions

276

*/

277

public long[] getIndices();

278

279

/**

280

* Set the position indices of this media in the tweet text.

281

* @param indices Array of [start, end] positions

282

*/

283

public void setIndices(long[] indices);

284

285

/**

286

* Get the media ID.

287

* @return Media ID

288

*/

289

public long getId();

290

291

/**

292

* Set the media ID.

293

* @param id Media ID

294

*/

295

public void setId(long id);

296

297

/**

298

* Get the media ID as string.

299

* @return Media ID as string

300

*/

301

public String getId_str();

302

303

/**

304

* Set the media ID as string.

305

* @param id_str Media ID as string

306

*/

307

public void setId_str(String id_str);

308

309

/**

310

* Get the media URL.

311

* @return Media URL

312

*/

313

public String getMedia_url();

314

315

/**

316

* Set the media URL.

317

* @param media_url Media URL

318

*/

319

public void setMedia_url(String media_url);

320

321

/**

322

* Get the HTTPS media URL.

323

* @return HTTPS media URL

324

*/

325

public String getMedia_url_https();

326

327

/**

328

* Set the HTTPS media URL.

329

* @param media_url_https HTTPS media URL

330

*/

331

public void setMedia_url_https(String media_url_https);

332

333

/**

334

* Get the display URL for this media.

335

* @return Display URL

336

*/

337

public String getDisplay_url();

338

339

/**

340

* Set the display URL for this media.

341

* @param display_url Display URL

342

*/

343

public void setDisplay_url(String display_url);

344

345

/**

346

* Get the expanded URL for this media.

347

* @return Expanded URL

348

*/

349

public String getExpanded_url();

350

351

/**

352

* Set the expanded URL for this media.

353

* @param expanded_url Expanded URL

354

*/

355

public void setExpanded_url(String expanded_url);

356

357

/**

358

* Get the original URL that was extracted from the tweet.

359

* @return Original URL

360

*/

361

public String getUrl();

362

363

/**

364

* Set the original URL that was extracted from the tweet.

365

* @param url Original URL

366

*/

367

public void setUrl(String url);

368

369

/**

370

* Get the media type (photo, video, etc.).

371

* @return Media type

372

*/

373

public String getType();

374

375

/**

376

* Set the media type (photo, video, etc.).

377

* @param type Media type

378

*/

379

public void setType(String type);

380

381

/**

382

* Get the available sizes for this media.

383

* @return Map of size names to Size objects

384

*/

385

public Map<String, Size> getSizes();

386

387

/**

388

* Set the available sizes for this media.

389

* @param sizes Map of size names to Size objects

390

*/

391

public void setSizes(Map<String, Size> sizes);

392

}

393

```

394

395

### Symbol

396

397

Stock symbol entities ($SYMBOL) found in tweet text.

398

399

```java { .api }

400

/**

401

* Represents stock symbols ($SYMBOL) found in tweet text.

402

* Contains symbol text and position information.

403

*/

404

public class Symbol {

405

406

/**

407

* Get the position indices of this symbol in the tweet text.

408

* @return Array of [start, end] positions

409

*/

410

public long[] getIndices();

411

412

/**

413

* Set the position indices of this symbol in the tweet text.

414

* @param indices Array of [start, end] positions

415

*/

416

public void setIndices(long[] indices);

417

418

/**

419

* Get the stock symbol text without the $ symbol.

420

* @return Stock symbol text

421

*/

422

public String getText();

423

424

/**

425

* Set the stock symbol text.

426

* @param text Stock symbol text

427

*/

428

public void setText(String text);

429

}

430

```

431

432

### Size

433

434

Media size information for different image/video dimensions.

435

436

```java { .api }

437

/**

438

* Represents size information for media attachments.

439

* Contains dimensions and resize information for different media sizes.

440

*/

441

public class Size {

442

443

/**

444

* Constructor with size dimensions and resize method.

445

* @param width Width in pixels

446

* @param height Height in pixels

447

* @param resize Resize method

448

*/

449

public Size(long width, long height, String resize);

450

451

/**

452

* Get the width of this media size.

453

* @return Width in pixels

454

*/

455

public long getWidth();

456

457

/**

458

* Set the width of this media size.

459

* @param width Width in pixels

460

*/

461

public void setWidth(long width);

462

463

/**

464

* Get the height of this media size.

465

* @return Height in pixels

466

*/

467

public long getHeight();

468

469

/**

470

* Set the height of this media size.

471

* @param height Height in pixels

472

*/

473

public void setHeight(long height);

474

475

/**

476

* Get the resize method for this media size.

477

* @return Resize method (fit, crop, etc.)

478

*/

479

public String getResize();

480

481

/**

482

* Set the resize method for this media size.

483

* @param resize Resize method (fit, crop, etc.)

484

*/

485

public void setResize(String resize);

486

}

487

```

488

489

**Usage Examples:**

490

491

```java

492

import org.apache.flink.contrib.tweetinputformat.model.tweet.entities.*;

493

import java.util.List;

494

import java.util.Map;

495

496

// Process all entities in a tweet

497

Tweet tweet = // ... get tweet

498

Entities entities = tweet.getEntities();

499

500

// Extract hashtags

501

List<HashTags> hashtags = entities.getHashtags();

502

for (HashTags tag : hashtags) {

503

System.out.println("Hashtag: #" + tag.getText());

504

long[] indices = tag.getIndices();

505

System.out.println("Position: " + indices[0] + "-" + indices[1]);

506

}

507

508

// Extract URLs

509

List<URL> urls = entities.getUrls();

510

for (URL url : urls) {

511

System.out.println("URL: " + url.getUrl());

512

System.out.println("Expanded: " + url.getExpanded_url());

513

System.out.println("Display: " + url.getDisplay_url());

514

}

515

516

// Extract user mentions

517

List<UserMention> mentions = entities.getUser_mentions();

518

for (UserMention mention : mentions) {

519

System.out.println("Mentioned: @" + mention.getScreen_name());

520

System.out.println("Name: " + mention.getName());

521

}

522

523

// Extract media

524

List<Media> mediaList = entities.getMedia();

525

for (Media media : mediaList) {

526

System.out.println("Media type: " + media.getType());

527

System.out.println("Media URL: " + media.getMedia_url_https());

528

529

// Check available sizes

530

Map<String, Size> sizes = media.getSizes();

531

if (sizes.containsKey("large")) {

532

Size largeSize = sizes.get("large");

533

System.out.printf("Large size: %dx%d%n", largeSize.getWidth(), largeSize.getHeight());

534

}

535

}

536

537

// Extract stock symbols

538

List<Symbol> symbols = entities.getSymbols();

539

for (Symbol symbol : symbols) {

540

System.out.println("Stock symbol: $" + symbol.getText());

541

}

542

```

543

544

## Entity Analysis Patterns

545

546

Common patterns for analyzing entities in stream processing:

547

548

```java

549

// Popular hashtags analysis

550

tweets.flatMap(tweet -> {

551

return tweet.getEntities().getHashtags().stream()

552

.map(HashTags::getText)

553

.collect(Collectors.toList());

554

}).countByValue();

555

556

// URL domain analysis

557

tweets.flatMap(tweet -> {

558

return tweet.getEntities().getUrls().stream()

559

.map(url -> extractDomain(url.getExpanded_url()))

560

.collect(Collectors.toList());

561

});

562

563

// User mention network analysis

564

tweets.flatMap(tweet -> {

565

String author = tweet.getUser().getScreen_name();

566

return tweet.getEntities().getUser_mentions().stream()

567

.map(mention -> new UserInteraction(author, mention.getScreen_name()))

568

.collect(Collectors.toList());

569

});

570

571

// Media type distribution

572

tweets.filter(tweet -> !tweet.getEntities().getMedia().isEmpty())

573

.map(tweet -> tweet.getEntities().getMedia().get(0).getType())

574

.countByValue();

575

576

// Stock symbol tracking

577

tweets.filter(tweet -> !tweet.getEntities().getSymbols().isEmpty())

578

.flatMap(tweet -> tweet.getEntities().getSymbols().stream()

579

.map(Symbol::getText)

580

.collect(Collectors.toList()));

581

```

582

583

## Position-Based Text Extraction

584

585

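Using entity indices to extract text segments. The example below treats the indices as UTF-16 `char` offsets, which is correct for text without supplementary characters; the Twitter API reports entity indices in Unicode code points, so for tweets containing emoji or other supplementary characters the offsets may first need converting with `String.offsetByCodePoints`: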

586

587

```java

588

public String extractEntityText(String tweetText, long[] indices) {

589

int start = (int) indices[0];

590

int end = (int) indices[1];

591

return tweetText.substring(start, end);

592

}

593

594

// Example usage

595

String tweetText = tweet.getText();

596

for (HashTags hashtag : tweet.getEntities().getHashtags()) {

597

String hashtagWithSymbol = extractEntityText(tweetText, hashtag.getIndices());

598

// hashtagWithSymbol will include the # symbol

599

System.out.println("Full hashtag: " + hashtagWithSymbol);

600

System.out.println("Clean text: " + hashtag.getText());

601

}

602

```