0
# Entity Parsing
1
2
Extraction and parsing of entities from tweet text, including hashtags, URLs, user mentions, media attachments, and stock symbols.
3
4
## Capabilities
5
6
### Entities Container
7
8
Container class that holds all parsed entities from tweet text, providing structured access to hashtags, URLs, user mentions, media, and symbols.
9
10
```java { .api }
11
/**
12
* Container for all entities parsed from tweet text.
13
* Provides structured access to hashtags, URLs, mentions, media, and symbols.
14
*/
15
public class Entities {
16
17
/**
18
* Default constructor that initializes all entity lists.
19
*/
20
public Entities();
21
22
/**
23
* Get the list of hashtags found in the tweet text.
24
* @return List of HashTags objects
25
*/
26
public List<HashTags> getHashtags();
27
28
/**
29
* Set the list of hashtags found in the tweet text.
30
* @param hashtags List of HashTags objects
31
*/
32
public void setHashtags(List<HashTags> hashtags);
33
34
/**
35
* Get the list of URLs found in the tweet text.
36
* @return List of URL objects
37
*/
38
public List<URL> getUrls();
39
40
/**
41
* Set the list of URLs found in the tweet text.
42
* @param urls List of URL objects
43
*/
44
public void setUrls(List<URL> urls);
45
46
/**
47
* Get the list of user mentions found in the tweet text.
48
* @return List of UserMention objects
49
*/
50
public List<UserMention> getUser_mentions();
51
52
/**
53
* Set the list of user mentions found in the tweet text.
54
* @param user_mentions List of UserMention objects
55
*/
56
public void setUser_mentions(List<UserMention> user_mentions);
57
58
/**
59
* Get the list of media attachments in the tweet.
60
* @return List of Media objects
61
*/
62
public List<Media> getMedia();
63
64
/**
65
* Set the list of media attachments in the tweet.
66
* @param media List of Media objects
67
*/
68
public void setMedia(List<Media> media);
69
70
/**
71
* Get the list of stock symbols found in the tweet text.
72
* @return List of Symbol objects
73
*/
74
public List<Symbol> getSymbols();
75
76
/**
77
* Set the list of stock symbols found in the tweet text.
78
* @param symbols List of Symbol objects
79
*/
80
public void setSymbols(List<Symbol> symbols);
81
}
82
```
83
84
### HashTags
85
86
Hashtag entities parsed from tweet text with position information and cleaned text content.
87
88
```java { .api }
89
/**
90
* Represents hashtags parsed from tweet text.
91
* Contains the hashtag text and its position indices in the original text.
92
*/
93
public class HashTags {
94
95
/**
96
* Get the position indices of this hashtag in the tweet text.
97
* @return Array of [start, end] positions
98
*/
99
public long[] getIndices();
100
101
/**
102
* Set the position indices of this hashtag in the tweet text.
103
* @param indices Array of [start, end] positions
104
*/
105
public void setIndices(long[] indices);
106
107
/**
108
* Set the position indices of this hashtag in the tweet text.
109
* @param start Starting position in tweet text
110
* @param end Ending position in tweet text
111
*/
112
public void setIndices(long start, long end);
113
114
/**
115
* Get the hashtag text without the # symbol.
116
* @return Hashtag text
117
*/
118
public String getText();
119
120
/**
121
* Set the hashtag text, optionally processing to remove # symbol.
122
* @param text Hashtag text
123
* @param hashExist Whether the text includes the # symbol
124
*/
125
public void setText(String text, boolean hashExist);
126
}
127
```
128
129
### URL Entities
130
131
URL entities found in tweet text with expanded and display versions.
132
133
```java { .api }
134
/**
135
* Represents URLs found in tweet text.
136
* Contains original, expanded, and display versions of URLs.
137
*/
138
public class URL {
139
140
/**
141
* Get the position indices of this URL in the tweet text.
142
* @return Array of [start, end] positions
143
*/
144
public long[] getIndices();
145
146
/**
147
* Set the position indices of this URL in the tweet text.
148
* @param indices Array of [start, end] positions
149
*/
150
public void setIndices(long[] indices);
151
152
/**
153
* Get the original URL as it appears in the tweet.
154
* @return Original URL (usually shortened)
155
*/
156
public String getUrl();
157
158
/**
159
* Set the original URL as it appears in the tweet.
160
* @param url Original URL (usually shortened)
161
*/
162
public void setUrl(String url);
163
164
/**
165
* Get the expanded/resolved URL.
166
* @return Full expanded URL
167
*/
168
public String getExpanded_url();
169
170
/**
171
* Set the expanded/resolved URL.
172
* @param expanded_url Full expanded URL
173
*/
174
public void setExpanded_url(String expanded_url);
175
176
/**
177
* Get the display URL shown to users.
178
* @return Display-friendly URL
179
*/
180
public String getDisplay_url();
181
182
/**
183
* Set the display URL shown to users.
184
* @param display_url Display-friendly URL
185
*/
186
public void setDisplay_url(String display_url);
187
}
188
```
189
190
### UserMention
191
192
User mention entities (@username) found in tweet text.
193
194
```java { .api }
195
/**
196
* Represents user mentions (@username) found in tweet text.
197
* Contains user information and position data.
198
*/
199
public class UserMention {
200
201
/**
202
* Get the position indices of this mention in the tweet text.
203
* @return Array of [start, end] positions
204
*/
205
public long[] getIndices();
206
207
/**
208
* Set the position indices of this mention in the tweet text.
209
* @param indices Array of [start, end] positions
210
*/
211
public void setIndices(long[] indices);
212
213
/**
214
* Get the mentioned user's ID.
215
* @return User ID
216
*/
217
public long getId();
218
219
/**
220
* Set the mentioned user's ID.
221
* @param id User ID
222
*/
223
public void setId(long id);
224
225
/**
226
* Get the mentioned user's ID as string.
227
* @return User ID as string
228
*/
229
public String getId_str();
230
231
/**
232
* Set the mentioned user's ID as string (computed from numeric ID).
233
*/
234
public void setId_str();
235
236
/**
237
* Get the mentioned user's screen name.
238
* @return Screen name without @ symbol
239
*/
240
public String getScreen_name();
241
242
/**
243
* Set the mentioned user's screen name.
244
* @param screen_name Screen name without @ symbol
245
*/
246
public void setScreen_name(String screen_name);
247
248
/**
249
* Get the mentioned user's display name.
250
* @return Display name
251
*/
252
public String getName();
253
254
/**
255
* Set the mentioned user's display name.
256
* @param name Display name
257
*/
258
public void setName(String name);
259
}
260
```
261
262
### Media
263
264
Media attachment entities including images and videos.
265
266
```java { .api }
267
/**
268
* Represents media attachments (images, videos) in tweets.
269
* Contains URLs, dimensions, and metadata for media content.
270
*/
271
public class Media {
272
273
/**
274
* Get the position indices of this media in the tweet text.
275
* @return Array of [start, end] positions
276
*/
277
public long[] getIndices();
278
279
/**
280
* Set the position indices of this media in the tweet text.
281
* @param indices Array of [start, end] positions
282
*/
283
public void setIndices(long[] indices);
284
285
/**
286
* Get the media ID.
287
* @return Media ID
288
*/
289
public long getId();
290
291
/**
292
* Set the media ID.
293
* @param id Media ID
294
*/
295
public void setId(long id);
296
297
/**
298
* Get the media ID as string.
299
* @return Media ID as string
300
*/
301
public String getId_str();
302
303
/**
304
* Set the media ID as string.
305
* @param id_str Media ID as string
306
*/
307
public void setId_str(String id_str);
308
309
/**
310
* Get the media URL.
311
* @return Media URL
312
*/
313
public String getMedia_url();
314
315
/**
316
* Set the media URL.
317
* @param media_url Media URL
318
*/
319
public void setMedia_url(String media_url);
320
321
/**
322
* Get the HTTPS media URL.
323
* @return HTTPS media URL
324
*/
325
public String getMedia_url_https();
326
327
/**
328
* Set the HTTPS media URL.
329
* @param media_url_https HTTPS media URL
330
*/
331
public void setMedia_url_https(String media_url_https);
332
333
/**
334
* Get the display URL for this media.
335
* @return Display URL
336
*/
337
public String getDisplay_url();
338
339
/**
340
* Set the display URL for this media.
341
* @param display_url Display URL
342
*/
343
public void setDisplay_url(String display_url);
344
345
/**
346
* Get the expanded URL for this media.
347
* @return Expanded URL
348
*/
349
public String getExpanded_url();
350
351
/**
352
* Set the expanded URL for this media.
353
* @param expanded_url Expanded URL
354
*/
355
public void setExpanded_url(String expanded_url);
356
357
/**
358
* Get the original URL that was extracted from the tweet.
359
* @return Original URL
360
*/
361
public String getUrl();
362
363
/**
364
* Set the original URL that was extracted from the tweet.
365
* @param url Original URL
366
*/
367
public void setUrl(String url);
368
369
/**
370
* Get the media type (photo, video, etc.).
371
* @return Media type
372
*/
373
public String getType();
374
375
/**
376
* Set the media type (photo, video, etc.).
377
* @param type Media type
378
*/
379
public void setType(String type);
380
381
/**
382
* Get the available sizes for this media.
383
* @return Map of size names to Size objects
384
*/
385
public Map<String, Size> getSizes();
386
387
/**
388
* Set the available sizes for this media.
389
* @param sizes Map of size names to Size objects
390
*/
391
public void setSizes(Map<String, Size> sizes);
392
}
393
```
394
395
### Symbol
396
397
Stock symbol entities ($SYMBOL) found in tweet text.
398
399
```java { .api }
400
/**
401
* Represents stock symbols ($SYMBOL) found in tweet text.
402
* Contains symbol text and position information.
403
*/
404
public class Symbol {
405
406
/**
407
* Get the position indices of this symbol in the tweet text.
408
* @return Array of [start, end] positions
409
*/
410
public long[] getIndices();
411
412
/**
413
* Set the position indices of this symbol in the tweet text.
414
* @param indices Array of [start, end] positions
415
*/
416
public void setIndices(long[] indices);
417
418
/**
419
* Get the stock symbol text without the $ symbol.
420
* @return Stock symbol text
421
*/
422
public String getText();
423
424
/**
425
* Set the stock symbol text.
426
* @param text Stock symbol text
427
*/
428
public void setText(String text);
429
}
430
```
431
432
### Size
433
434
Media size information for different image/video dimensions.
435
436
```java { .api }
437
/**
438
* Represents size information for media attachments.
439
* Contains dimensions and resize information for different media sizes.
440
*/
441
public class Size {
442
443
/**
444
* Constructor with size dimensions and resize method.
445
* @param width Width in pixels
446
* @param height Height in pixels
447
* @param resize Resize method
448
*/
449
public Size(long width, long height, String resize);
450
451
/**
452
* Get the width of this media size.
453
* @return Width in pixels
454
*/
455
public long getWidth();
456
457
/**
458
* Set the width of this media size.
459
* @param width Width in pixels
460
*/
461
public void setWidth(long width);
462
463
/**
464
* Get the height of this media size.
465
* @return Height in pixels
466
*/
467
public long getHeight();
468
469
/**
470
* Set the height of this media size.
471
* @param height Height in pixels
472
*/
473
public void setHeight(long height);
474
475
/**
476
* Get the resize method for this media size.
477
* @return Resize method (fit, crop, etc.)
478
*/
479
public String getResize();
480
481
/**
482
* Set the resize method for this media size.
483
* @param resize Resize method (fit, crop, etc.)
484
*/
485
public void setResize(String resize);
486
}
487
```
488
489
**Usage Examples:**
490
491
```java
492
import org.apache.flink.contrib.tweetinputformat.model.tweet.entities.*;
493
import java.util.List;
494
import java.util.Map;
495
496
// Process all entities in a tweet
497
Tweet tweet = // ... get tweet
498
Entities entities = tweet.getEntities();
499
500
// Extract hashtags
501
List<HashTags> hashtags = entities.getHashtags();
502
for (HashTags tag : hashtags) {
503
System.out.println("Hashtag: #" + tag.getText());
504
long[] indices = tag.getIndices();
505
System.out.println("Position: " + indices[0] + "-" + indices[1]);
506
}
507
508
// Extract URLs
509
List<URL> urls = entities.getUrls();
510
for (URL url : urls) {
511
System.out.println("URL: " + url.getUrl());
512
System.out.println("Expanded: " + url.getExpanded_url());
513
System.out.println("Display: " + url.getDisplay_url());
514
}
515
516
// Extract user mentions
517
List<UserMention> mentions = entities.getUser_mentions();
518
for (UserMention mention : mentions) {
519
System.out.println("Mentioned: @" + mention.getScreen_name());
520
System.out.println("Name: " + mention.getName());
521
}
522
523
// Extract media
524
List<Media> mediaList = entities.getMedia();
525
for (Media media : mediaList) {
526
System.out.println("Media type: " + media.getType());
527
System.out.println("Media URL: " + media.getMedia_url_https());
528
529
// Check available sizes
530
Map<String, Size> sizes = media.getSizes();
531
if (sizes.containsKey("large")) {
532
Size largeSize = sizes.get("large");
533
System.out.printf("Large size: %dx%d%n", largeSize.getWidth(), largeSize.getHeight());
534
}
535
}
536
537
// Extract stock symbols
538
List<Symbol> symbols = entities.getSymbols();
539
for (Symbol symbol : symbols) {
540
System.out.println("Stock symbol: $" + symbol.getText());
541
}
542
```
543
544
## Entity Analysis Patterns
545
546
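Common patterns for analyzing entities in stream processing. The snippets are illustrative: `tweets` stands for your stream of `Tweet` objects, `extractDomain` and `UserInteraction` are user-defined helpers that are not part of this library, and `Collectors` requires `import java.util.stream.Collectors;`: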
547
548
```java
549
// Popular hashtags analysis
550
tweets.flatMap(tweet -> {
551
return tweet.getEntities().getHashtags().stream()
552
.map(HashTags::getText)
553
.collect(Collectors.toList());
554
}).countByValue();
555
556
// URL domain analysis
557
tweets.flatMap(tweet -> {
558
return tweet.getEntities().getUrls().stream()
559
.map(url -> extractDomain(url.getExpanded_url()))
560
.collect(Collectors.toList());
561
});
562
563
// User mention network analysis
564
tweets.flatMap(tweet -> {
565
String author = tweet.getUser().getScreen_name();
566
return tweet.getEntities().getUser_mentions().stream()
567
.map(mention -> new UserInteraction(author, mention.getScreen_name()))
568
.collect(Collectors.toList());
569
});
570
571
// Media type distribution
572
tweets.filter(tweet -> !tweet.getEntities().getMedia().isEmpty())
573
.map(tweet -> tweet.getEntities().getMedia().get(0).getType())
574
.countByValue();
575
576
// Stock symbol tracking
577
tweets.filter(tweet -> !tweet.getEntities().getSymbols().isEmpty())
578
.flatMap(tweet -> tweet.getEntities().getSymbols().stream()
579
.map(Symbol::getText)
580
.collect(Collectors.toList()));
581
```
582
583
## Position-Based Text Extraction
584
585
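Using entity indices to extract text segments. Note that Twitter reports indices as Unicode code point offsets, which differ from Java `String` (UTF-16) positions when the text contains emoji or other supplementary characters: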
586
587
```java
588
public String extractEntityText(String tweetText, long[] indices) {
589
int start = (int) indices[0];
590
int end = (int) indices[1];
591
return tweetText.substring(start, end);
592
}
593
594
// Example usage
595
String tweetText = tweet.getText();
596
for (HashTags hashtag : tweet.getEntities().getHashtags()) {
597
String hashtagWithSymbol = extractEntityText(tweetText, hashtag.getIndices());
598
// hashtagWithSymbol will include the # symbol
599
System.out.println("Full hashtag: " + hashtagWithSymbol);
600
System.out.println("Clean text: " + hashtag.getText());
601
}
602
```