# Search and Analytics Containers

Specialized containers for search engines, analytics platforms, and data processing including Elasticsearch, OpenSearch, ClickHouse, and vector databases for full-text search, analytics, and AI/ML workloads.

## Capabilities

### Elasticsearch Container

Elasticsearch distributed search and analytics engine container with configurable cluster settings and security options.

```python { .api }
class ElasticSearchContainer:
    def __init__(
        self,
        image: str = "elasticsearch:8.8.0",
        port: int = 9200,
        **kwargs: Any
    ):
        """
        Initialize Elasticsearch container.

        Args:
            image: Elasticsearch Docker image
            port: HTTP port (default 9200)
            **kwargs: Additional container options
        """

    def get_url(self) -> str:
        """
        Get Elasticsearch HTTP URL.

        Returns:
            Elasticsearch HTTP URL string
        """
```

### OpenSearch Container

OpenSearch distributed search and analytics engine container with dashboard support and security configuration.

```python { .api }
class OpenSearchContainer:
    def __init__(
        self,
        image: str = "opensearchproject/opensearch:latest",
        port: int = 9200,
        **kwargs: Any
    ):
        """
        Initialize OpenSearch container.

        Args:
            image: OpenSearch Docker image
            port: HTTP port (default 9200)
            **kwargs: Additional container options
        """

    def get_url(self) -> str:
        """
        Get OpenSearch HTTP URL.

        Returns:
            OpenSearch HTTP URL string
        """
```

### Vector Database Containers

Modern vector databases for similarity search, embeddings, and AI/ML applications.

```python { .api }
class ChromaContainer:
    def __init__(
        self,
        image: str = "chromadb/chroma:latest",
        port: int = 8000,
        **kwargs: Any
    ):
        """
        Initialize Chroma vector database container.

        Args:
            image: Chroma Docker image
            port: HTTP port (default 8000)
            **kwargs: Additional container options
        """

    def get_url(self) -> str:
        """
        Get Chroma HTTP URL.

        Returns:
            Chroma HTTP URL string
        """

class WeaviateContainer:
    def __init__(
        self,
        image: str = "semitechnologies/weaviate:latest",
        port: int = 8080,
        **kwargs: Any
    ):
        """
        Initialize Weaviate vector database container.

        Args:
            image: Weaviate Docker image
            port: HTTP port (default 8080)
            **kwargs: Additional container options
        """

    def get_url(self) -> str:
        """
        Get Weaviate HTTP URL.

        Returns:
            Weaviate HTTP URL string
        """

class QdrantContainer:
    def __init__(
        self,
        image: str = "qdrant/qdrant:latest",
        port: int = 6333,
        **kwargs: Any
    ):
        """
        Initialize Qdrant vector database container.

        Args:
            image: Qdrant Docker image
            port: HTTP port (default 6333)
            **kwargs: Additional container options
        """

    def get_url(self) -> str:
        """
        Get Qdrant HTTP URL.

        Returns:
            Qdrant HTTP URL string
        """

class MilvusContainer:
    def __init__(
        self,
        image: str = "milvusdb/milvus:latest",
        port: int = 19530,
        **kwargs: Any
    ):
        """
        Initialize Milvus vector database container.

        Args:
            image: Milvus Docker image
            port: gRPC port (default 19530)
            **kwargs: Additional container options
        """

    def get_connection_args(self) -> dict:
        """
        Get Milvus connection arguments.

        Returns:
            Dictionary with host and port for Milvus client
        """
```
167
168
### Analytics Database Containers
169
170
High-performance analytics and columnar databases for OLAP workloads.
171
172
```python { .api }
173
class ClickHouseContainer:
174
def __init__(
175
self,
176
image: str = "clickhouse/clickhouse-server:latest",
177
port: int = 8123,
178
username: str = "default",
179
password: str = "",
180
dbname: str = "default",
181
**kwargs: Any
182
):
183
"""
184
Initialize ClickHouse container.
185
186
Args:
187
image: ClickHouse Docker image
188
port: HTTP port (default 8123)
189
username: Database username
190
password: Database password
191
dbname: Database name
192
**kwargs: Additional container options
193
"""
194
195
def get_connection_url(self) -> str:
196
"""
197
Get ClickHouse connection URL.
198
199
Returns:
200
ClickHouse connection URL string
201
"""
202
203
class TrinoContainer:
204
def __init__(
205
self,
206
image: str = "trinodb/trino:latest",
207
port: int = 8080,
208
**kwargs: Any
209
):
210
"""
211
Initialize Trino distributed query engine container.
212
213
Args:
214
image: Trino Docker image
215
port: HTTP port (default 8080)
216
**kwargs: Additional container options
217
"""
218
219
def get_connection_url(self) -> str:
220
"""
221
Get Trino connection URL.
222
223
Returns:
224
Trino connection URL string
225
"""
226
```
227
228
## Usage Examples
229
230
### Elasticsearch Full-Text Search
231
232
```python
233
from testcontainers.elasticsearch import ElasticSearchContainer
234
from elasticsearch import Elasticsearch
235
236
with ElasticSearchContainer("elasticsearch:8.8.0") as es_container:
237
# Get Elasticsearch client
238
es_url = es_container.get_url()
239
es_client = Elasticsearch([es_url])
240
241
# Wait for cluster to be ready
242
es_client.cluster.health(wait_for_status="yellow", timeout="30s")
243
244
# Create an index
245
index_name = "test_index"
246
es_client.indices.create(index=index_name, ignore=400)
247
248
# Index some documents
249
documents = [
250
{"title": "Elasticsearch Guide", "content": "Learn about search and analytics"},
251
{"title": "Python Testing", "content": "Unit testing with containers"},
252
{"title": "Data Analytics", "content": "Big data processing and analysis"}
253
]
254
255
for i, doc in enumerate(documents, 1):
256
es_client.index(index=index_name, id=i, body=doc)
257
258
# Refresh index
259
es_client.indices.refresh(index=index_name)
260
261
# Search documents
262
search_query = {
263
"query": {
264
"match": {
265
"content": "analytics"
266
}
267
}
268
}
269
270
results = es_client.search(index=index_name, body=search_query)
271
print(f"Found {results['hits']['total']['value']} matching documents")
272
273
for hit in results['hits']['hits']:
274
print(f"- {hit['_source']['title']}: {hit['_score']}")
275
```
276
277
### Vector Database with Chroma
278
279
```python
280
from testcontainers.chroma import ChromaContainer
281
import chromadb
282
import numpy as np
283
284
with ChromaContainer() as chroma_container:
285
# Get Chroma client
286
chroma_url = chroma_container.get_url()
287
client = chromadb.HttpClient(host=chroma_url.split("://")[1].split(":")[0],
288
port=int(chroma_url.split(":")[2]))
289
290
# Create collection
291
collection = client.create_collection("test_collection")
292
293
# Add embeddings
294
embeddings = [
295
[0.1, 0.2, 0.3, 0.4],
296
[0.5, 0.6, 0.7, 0.8],
297
[0.9, 0.1, 0.2, 0.3]
298
]
299
300
documents = [
301
"First document about AI",
302
"Second document about machine learning",
303
"Third document about data science"
304
]
305
306
ids = ["doc1", "doc2", "doc3"]
307
308
collection.add(
309
embeddings=embeddings,
310
documents=documents,
311
ids=ids
312
)
313
314
# Query similar vectors
315
query_embedding = [0.1, 0.25, 0.35, 0.45]
316
results = collection.query(
317
query_embeddings=[query_embedding],
318
n_results=2
319
)
320
321
print("Similar documents:")
322
for i, doc in enumerate(results['documents'][0]):
323
distance = results['distances'][0][i]
324
print(f"- {doc} (distance: {distance:.4f})")
325
```
326
327
### ClickHouse Analytics
328
329
```python
330
from testcontainers.clickhouse import ClickHouseContainer
331
import clickhouse_driver
332
333
with ClickHouseContainer() as clickhouse:
334
# Connect to ClickHouse
335
connection_url = clickhouse.get_connection_url()
336
client = clickhouse_driver.Client.from_url(connection_url)
337
338
# Create table for analytics
339
client.execute("""
340
CREATE TABLE IF NOT EXISTS events (
341
timestamp DateTime,
342
user_id UInt32,
343
event_type String,
344
value Float64
345
) ENGINE = MergeTree()
346
ORDER BY timestamp
347
""")
348
349
# Insert sample data
350
import datetime
351
import random
352
353
events_data = []
354
base_time = datetime.datetime.now()
355
356
for i in range(1000):
357
events_data.append((
358
base_time + datetime.timedelta(minutes=i),
359
random.randint(1, 100),
360
random.choice(['click', 'view', 'purchase']),
361
random.uniform(1.0, 100.0)
362
))
363
364
client.execute(
365
"INSERT INTO events (timestamp, user_id, event_type, value) VALUES",
366
events_data
367
)
368
369
# Run analytics queries
370
# Daily event counts
371
daily_stats = client.execute("""
372
SELECT
373
toDate(timestamp) as date,
374
event_type,
375
count() as events,
376
sum(value) as total_value
377
FROM events
378
GROUP BY date, event_type
379
ORDER BY date, event_type
380
""")
381
382
print("Daily event statistics:")
383
for date, event_type, count, total in daily_stats:
384
print(f"{date} {event_type}: {count} events, total value: {total:.2f}")
385
386
# Top users by activity
387
top_users = client.execute("""
388
SELECT
389
user_id,
390
count() as activity_count,
391
sum(value) as total_value
392
FROM events
393
GROUP BY user_id
394
ORDER BY activity_count DESC
395
LIMIT 5
396
""")
397
398
print("\nTop users by activity:")
399
for user_id, count, total in top_users:
400
print(f"User {user_id}: {count} events, total value: {total:.2f}")
401
```
402
403
### Multi-Engine Search Setup
404
405
```python
406
from testcontainers.elasticsearch import ElasticSearchContainer
407
from testcontainers.opensearch import OpenSearchContainer
408
from testcontainers.chroma import ChromaContainer
409
from testcontainers.core.network import Network
410
411
# Create network for search engines
412
with Network() as network:
413
# Start multiple search engines
414
with ElasticSearchContainer() as elasticsearch, \
415
OpenSearchContainer() as opensearch, \
416
ChromaContainer() as chroma:
417
418
# Connect to network
419
elasticsearch.with_network(network).with_network_aliases("elasticsearch")
420
opensearch.with_network(network).with_network_aliases("opensearch")
421
chroma.with_network(network).with_network_aliases("chroma")
422
423
# Get service URLs
424
es_url = elasticsearch.get_url()
425
os_url = opensearch.get_url()
426
chroma_url = chroma.get_url()
427
428
print(f"Elasticsearch: {es_url}")
429
print(f"OpenSearch: {os_url}")
430
print(f"Chroma: {chroma_url}")
431
432
# Use multiple search engines for different use cases
433
# Elasticsearch for structured search
434
# OpenSearch for log analytics
435
# Chroma for vector similarity search
436
```
437
438
### Trino Distributed Query Engine
439
440
```python
441
from testcontainers.trino import TrinoContainer
442
import trino
443
444
with TrinoContainer() as trino_container:
445
connection_url = trino_container.get_connection_url()
446
447
# Connect to Trino
448
conn = trino.dbapi.connect(
449
host=connection_url.split("://")[1].split(":")[0],
450
port=int(connection_url.split(":")[2]),
451
user="test"
452
)
453
454
cursor = conn.cursor()
455
456
# Query information schema
457
cursor.execute("SHOW CATALOGS")
458
catalogs = cursor.fetchall()
459
print("Available catalogs:")
460
for catalog in catalogs:
461
print(f"- {catalog[0]}")
462
463
# Create memory table for testing
464
cursor.execute("""
465
CREATE TABLE memory.default.sales AS
466
SELECT * FROM (VALUES
467
('2023-01-01', 'Product A', 100.0),
468
('2023-01-02', 'Product B', 150.0),
469
('2023-01-03', 'Product A', 200.0)
470
) AS t(date, product, amount)
471
""")
472
473
# Query the data
474
cursor.execute("""
475
SELECT product, sum(amount) as total_sales
476
FROM memory.default.sales
477
GROUP BY product
478
ORDER BY total_sales DESC
479
""")
480
481
results = cursor.fetchall()
482
print("\nSales by product:")
483
for product, total in results:
484
print(f"{product}: ${total}")
485
```
486
487
### Vector Similarity Search Comparison
488
489
```python
490
from testcontainers.chroma import ChromaContainer
491
from testcontainers.weaviate import WeaviateContainer
492
from testcontainers.qdrant import QdrantContainer
493
import numpy as np
494
495
# Generate sample embeddings
496
def generate_embeddings(n_docs=100, dim=384):
497
"""Generate random embeddings for testing."""
498
return np.random.random((n_docs, dim)).tolist()
499
500
embeddings = generate_embeddings()
501
documents = [f"Document {i}" for i in range(len(embeddings))]
502
503
# Test with multiple vector databases
504
with ChromaContainer() as chroma, \
505
WeaviateContainer() as weaviate, \
506
QdrantContainer() as qdrant:
507
508
print("Testing vector similarity search across databases...")
509
510
# Chroma setup
511
import chromadb
512
chroma_client = chromadb.HttpClient(host="localhost", port=8000) # Simplified
513
chroma_collection = chroma_client.create_collection("test")
514
chroma_collection.add(
515
embeddings=embeddings,
516
documents=documents,
517
ids=[str(i) for i in range(len(documents))]
518
)
519
520
# Query all databases with same vector
521
query_vector = embeddings[0] # Use first document as query
522
523
# Chroma query
524
chroma_results = chroma_collection.query(
525
query_embeddings=[query_vector],
526
n_results=5
527
)
528
529
print(f"Chroma found {len(chroma_results['documents'][0])} similar documents")
530
531
# Compare performance and results
532
print("Vector database comparison complete")
533
```