0
# Service Level Monitoring
1
2
Comprehensive service-level monitoring for managing services and Service Level Objectives (SLOs) in Google Cloud Monitoring. This enables service-oriented monitoring with SLI definitions, error budgets, and service health tracking for modern microservices architectures.
3
4
## Capabilities
5
6
### Service Management
7
8
Manage the complete lifecycle of services including creation, updates, retrieval, and deletion.
9
10
```python { .api }
11
class ServiceMonitoringServiceClient:
12
def create_service(
13
self,
14
request=None,
15
*,
16
parent: str = None,
17
service=None,
18
retry=None,
19
timeout=None,
20
metadata=()
21
) -> service.Service:
22
"""
23
Create a Service.
24
25
Args:
26
request: The request object or dict equivalent
27
parent: Required. Project name in format 'projects/[PROJECT_ID]'
28
service: Required. The Service to create
29
retry: Retry configuration
30
timeout: Request timeout in seconds
31
metadata: Additional metadata
32
33
Returns:
34
Created Service object
35
"""
36
37
def get_service(
38
self,
39
request=None,
40
*,
41
name: str = None,
42
retry=None,
43
timeout=None,
44
metadata=()
45
) -> service.Service:
46
"""
47
Get the named Service.
48
49
Args:
50
request: The request object or dict equivalent
51
name: Required. Service name in format 'projects/[PROJECT_ID]/services/[SERVICE_ID]'
52
retry: Retry configuration
53
timeout: Request timeout in seconds
54
metadata: Additional metadata
55
56
Returns:
57
Service object
58
"""
59
60
def list_services(
61
self,
62
request=None,
63
*,
64
parent: str = None,
65
retry=None,
66
timeout=None,
67
metadata=()
68
) -> pagers.ListServicesPager:
69
"""
70
List Services for this Metrics Scope.
71
72
Args:
73
request: The request object or dict equivalent
74
parent: Required. Project name
75
retry: Retry configuration
76
timeout: Request timeout in seconds
77
metadata: Additional metadata
78
79
Returns:
80
Pager for iterating over Service objects
81
"""
82
83
def update_service(
84
self,
85
request=None,
86
*,
87
service=None,
88
retry=None,
89
timeout=None,
90
metadata=()
91
) -> service.Service:
92
"""
93
Update this Service.
94
95
Args:
96
request: The request object or dict equivalent
97
service: Required. Updated Service
98
retry: Retry configuration
99
timeout: Request timeout in seconds
100
metadata: Additional metadata
101
102
Returns:
103
Updated Service object
104
"""
105
106
def delete_service(
107
self,
108
request=None,
109
*,
110
name: str = None,
111
retry=None,
112
timeout=None,
113
metadata=()
114
) -> None:
115
"""
116
Soft delete this Service.
117
118
Args:
119
request: The request object or dict equivalent
120
name: Required. Service name to delete
121
retry: Retry configuration
122
timeout: Request timeout in seconds
123
metadata: Additional metadata
124
"""
125
```
126
127
### Service Level Objective Management
128
129
Manage Service Level Objectives (SLOs) for tracking service reliability and performance.
130
131
```python { .api }
132
class ServiceMonitoringServiceClient:
133
def create_service_level_objective(
134
self,
135
request=None,
136
*,
137
parent: str = None,
138
service_level_objective=None,
139
retry=None,
140
timeout=None,
141
metadata=()
142
) -> service.ServiceLevelObjective:
143
"""
144
Create a ServiceLevelObjective for the given Service.
145
146
Args:
147
request: The request object or dict equivalent
148
parent: Required. Service name
149
service_level_objective: Required. The SLO to create
150
retry: Retry configuration
151
timeout: Request timeout in seconds
152
metadata: Additional metadata
153
154
Returns:
155
Created ServiceLevelObjective object
156
"""
157
158
def get_service_level_objective(
159
self,
160
request=None,
161
*,
162
name: str = None,
163
retry=None,
164
timeout=None,
165
metadata=()
166
) -> service.ServiceLevelObjective:
167
"""
168
Get a ServiceLevelObjective by name.
169
170
Args:
171
request: The request object or dict equivalent
172
name: Required. SLO name
173
retry: Retry configuration
174
timeout: Request timeout in seconds
175
metadata: Additional metadata
176
177
Returns:
178
ServiceLevelObjective object
179
"""
180
181
def list_service_level_objectives(
182
self,
183
request=None,
184
*,
185
parent: str = None,
186
retry=None,
187
timeout=None,
188
metadata=()
189
) -> pagers.ListServiceLevelObjectivesPager:
190
"""
191
List the ServiceLevelObjectives for the given Service.
192
193
Args:
194
request: The request object or dict equivalent
195
parent: Required. Service name
196
retry: Retry configuration
197
timeout: Request timeout in seconds
198
metadata: Additional metadata
199
200
Returns:
201
Pager for iterating over ServiceLevelObjective objects
202
"""
203
204
def update_service_level_objective(
205
self,
206
request=None,
207
*,
208
service_level_objective=None,
209
retry=None,
210
timeout=None,
211
metadata=()
212
) -> service.ServiceLevelObjective:
213
"""
214
Update the given ServiceLevelObjective.
215
216
Args:
217
request: The request object or dict equivalent
218
service_level_objective: Required. Updated SLO
219
retry: Retry configuration
220
timeout: Request timeout in seconds
221
metadata: Additional metadata
222
223
Returns:
224
Updated ServiceLevelObjective object
225
"""
226
227
def delete_service_level_objective(
228
self,
229
request=None,
230
*,
231
name: str = None,
232
retry=None,
233
timeout=None,
234
metadata=()
235
) -> None:
236
"""
237
Delete the given ServiceLevelObjective.
238
239
Args:
240
request: The request object or dict equivalent
241
name: Required. SLO name to delete
242
retry: Retry configuration
243
timeout: Request timeout in seconds
244
metadata: Additional metadata
245
"""
246
```
247
248
## Data Types
249
250
### Service
251
252
Represents a service for monitoring purposes.
253
254
```python { .api }
255
class Service:
256
name: str # Resource name
257
display_name: str # Human-readable name
258
custom: Service.Custom # Custom service definition
259
app_engine: Service.AppEngine # App Engine service
260
cloud_endpoints: Service.CloudEndpoints # Cloud Endpoints service
261
cluster_istio: Service.ClusterIstio # Istio service mesh
262
mesh_istio: Service.MeshIstio # Istio mesh service
263
istio_canonical_service: Service.IstioCanonicalService # Canonical Istio service
264
cloud_run: Service.CloudRun # Cloud Run service
265
gke_namespace: Service.GkeNamespace # GKE namespace service
266
gke_workload: Service.GkeWorkload # GKE workload service
267
gke_service: Service.GkeService # GKE service
268
telemetry: Service.Telemetry # Telemetry configuration
269
user_labels: Dict[str, str] # User-defined labels
270
271
class Service.Custom:
272
# Empty marker message (no fields): designates a user-defined service that matches none of the other service types
273
274
class Service.CloudRun:
275
service_name: str # Cloud Run service name
276
location: str # Cloud Run service location
277
278
class Service.Telemetry:
279
resource_name: str # Resource name for telemetry
280
```
281
282
### ServiceLevelObjective
283
284
Represents a Service Level Objective definition.
285
286
```python { .api }
287
class ServiceLevelObjective:
288
name: str # Resource name
289
display_name: str # Human-readable name
290
service_level_indicator: ServiceLevelIndicator # SLI definition
291
goal: float # SLO target (0.0 to 1.0)
292
rolling_period: Duration # Rolling period for SLO
293
calendar_period: CalendarPeriod # Calendar period for SLO
294
user_labels: Dict[str, str] # User-defined labels
295
296
class ServiceLevelIndicator:
297
basic_sli: BasicSli # Basic SLI definition
298
request_based: RequestBasedSli # Request-based SLI
299
windows_based: WindowsBasedSli # Windows-based SLI
300
301
class BasicSli:
302
method: List[str] # HTTP methods to monitor
303
location: List[str] # Locations to monitor
304
version: List[str] # Versions to monitor
305
availability: BasicSli.AvailabilityCriteria # Availability criteria
306
latency: BasicSli.LatencyCriteria # Latency criteria
307
308
class RequestBasedSli:
309
good_total_ratio: TimeSeriesRatio # Good events vs total events
310
distribution_cut: DistributionCut # Distribution-based SLI
311
312
class WindowsBasedSli:
313
good_bad_metric_filter: str # Metric filter for good/bad windows
314
good_total_ratio_threshold: WindowsBasedSli.PerformanceThreshold  # Window is good when its performance meets the threshold
315
metric_mean_in_range: WindowsBasedSli.MetricRange  # Metric mean within range
316
metric_sum_in_range: WindowsBasedSli.MetricRange  # Metric sum within range
317
window_period: Duration # Window period
318
```
319
320
### Request and Response Types
321
322
```python { .api }
323
class CreateServiceRequest:
324
parent: str # Required. Project name
325
service_id: str # Service ID
326
service: Service # Required. Service to create
327
328
class GetServiceRequest:
329
name: str # Required. Service name
330
331
class ListServicesRequest:
332
parent: str # Required. Project name
333
filter: str # Filter expression
334
page_size: int # Maximum results per page
335
page_token: str # Page token
336
337
class ListServicesResponse:
338
services: List[Service] # Services
339
next_page_token: str # Next page token
340
341
class UpdateServiceRequest:
342
service: Service # Required. Updated service
343
update_mask: FieldMask # Fields to update
344
345
class DeleteServiceRequest:
346
name: str # Required. Service name to delete
347
348
class CreateServiceLevelObjectiveRequest:
349
parent: str # Required. Service name
350
service_level_objective_id: str # SLO ID
351
service_level_objective: ServiceLevelObjective # Required. SLO to create
352
353
class GetServiceLevelObjectiveRequest:
354
name: str # Required. SLO name
355
view: ServiceLevelObjective.View # View type
356
357
class ListServiceLevelObjectivesRequest:
358
parent: str # Required. Service name
359
filter: str # Filter expression
360
page_size: int # Maximum results per page
361
page_token: str # Page token
362
363
class ListServiceLevelObjectivesResponse:
364
service_level_objectives: List[ServiceLevelObjective] # SLOs
365
next_page_token: str # Next page token
366
367
class UpdateServiceLevelObjectiveRequest:
368
service_level_objective: ServiceLevelObjective # Required. Updated SLO
369
update_mask: FieldMask # Fields to update
370
371
class DeleteServiceLevelObjectiveRequest:
372
name: str # Required. SLO name to delete
373
```
374
375
## Usage Examples
376
377
### Creating a Custom Service
378
379
```python
380
from google.cloud.monitoring import ServiceMonitoringServiceClient
381
from google.cloud.monitoring_v3.types import Service
382
383
client = ServiceMonitoringServiceClient()
384
project_name = f"projects/{project_id}"
385
386
# Create a custom service
387
service_obj = Service()
388
service_obj.display_name = "Web Frontend Service"
389
390
# Mark this as a custom service (Service.Custom is an empty message with no fields to set)
391
custom_service = Service.Custom()
392
service_obj.custom = custom_service
393
394
# Add user labels
395
service_obj.user_labels["team"] = "frontend"
396
service_obj.user_labels["environment"] = "production"
397
398
created_service = client.create_service(
399
request={"parent": project_name,
400
"service": service_obj,
401
"service_id": "web-frontend"}  # service_id is only settable through the request object
402
)
403
print(f"Created service: {created_service.name}")
404
```
405
406
### Creating a Cloud Run Service
407
408
```python
409
# Create Cloud Run service
410
cloud_run_service = Service()
411
cloud_run_service.display_name = "API Service"
412
413
# Configure Cloud Run service
414
cloud_run = Service.CloudRun()
415
cloud_run.service_name = "api-service"
416
cloud_run.location = "us-central1"
417
cloud_run_service.cloud_run = cloud_run
418
419
created_cloud_run = client.create_service(
420
request={"parent": project_name,
421
"service": cloud_run_service,
422
"service_id": "api-service"}  # service_id is only settable through the request object
423
)
424
print(f"Created Cloud Run service: {created_cloud_run.name}")
425
```
426
427
### Creating a Service Level Objective
428
429
```python
430
from google.cloud.monitoring_v3.types import (
431
ServiceLevelObjective, ServiceLevelIndicator, BasicSli
432
)
433
from google.protobuf.duration_pb2 import Duration
434
435
service_name = f"projects/{project_id}/services/web-frontend"
436
437
# Create SLO for availability
438
slo = ServiceLevelObjective()
439
slo.display_name = "Web Frontend Availability SLO"
440
slo.goal = 0.995 # 99.5% availability target
441
442
# Define rolling period (30 days)
443
rolling_period = Duration()
444
rolling_period.seconds = 30 * 24 * 60 * 60 # 30 days
445
slo.rolling_period = rolling_period
446
447
# Define Service Level Indicator
448
sli = ServiceLevelIndicator()
449
basic_sli = BasicSli()
450
451
# Configure availability criteria
452
availability = BasicSli.AvailabilityCriteria()
453
basic_sli.availability = availability
454
sli.basic_sli = basic_sli
455
slo.service_level_indicator = sli
456
457
# Add user labels
458
slo.user_labels["tier"] = "critical"
459
slo.user_labels["team"] = "frontend"
460
461
created_slo = client.create_service_level_objective(
462
request={"parent": service_name,
463
"service_level_objective": slo,
464
"service_level_objective_id": "availability-slo"}  # the ID is only settable through the request object
465
)
466
print(f"Created SLO: {created_slo.name}")
467
print(f"Target: {created_slo.goal * 100}%")
468
```
469
470
### Creating a Latency SLO
471
472
```python
473
# Create SLO for latency
474
latency_slo = ServiceLevelObjective()
475
latency_slo.display_name = "Web Frontend Latency SLO"
476
latency_slo.goal = 0.90 # 90% of requests under threshold
477
478
# Rolling period (7 days)
479
rolling_period = Duration()
480
rolling_period.seconds = 7 * 24 * 60 * 60
481
latency_slo.rolling_period = rolling_period
482
483
# Define latency SLI
484
latency_sli = ServiceLevelIndicator()
485
latency_basic = BasicSli()
486
487
# Configure latency criteria (500ms threshold)
488
latency_criteria = BasicSli.LatencyCriteria()
489
# Assign a whole Duration: the field reads back as an immutable datetime.timedelta, so its parts cannot be set in place
490
latency_criteria.threshold = Duration(nanos=500000000)  # 500ms
491
latency_basic.latency = latency_criteria
492
493
latency_sli.basic_sli = latency_basic
494
latency_slo.service_level_indicator = latency_sli
495
496
created_latency_slo = client.create_service_level_objective(
497
request={"parent": service_name,
498
"service_level_objective": latency_slo,
499
"service_level_objective_id": "latency-slo"}  # the ID is only settable through the request object
500
)
501
print(f"Created latency SLO: {created_latency_slo.name}")
502
```
503
504
### Listing Services and SLOs
505
506
```python
507
# List all services
508
print("Services:")
509
for service in client.list_services(parent=project_name):
510
print(f"- {service.display_name}: {service.name}")
511
512
# List SLOs for each service
513
print(f" SLOs:")
514
for slo in client.list_service_level_objectives(parent=service.name):
515
print(f" - {slo.display_name}: {slo.goal * 100}% target")
516
517
# Filter services by label
518
filter_expr = 'user_labels.environment="production"'
519
print(f"\nProduction services:")
520
for service in client.list_services(request={"parent": project_name, "filter": filter_expr}):
521
print(f"- {service.display_name}")
522
```
523
524
### Updating a Service
525
526
```python
527
from google.protobuf import field_mask_pb2
528
529
# Get existing service
530
service_name = f"projects/{project_id}/services/web-frontend"
531
service = client.get_service(name=service_name)
532
533
# Update service properties
534
service.display_name = "Updated Web Frontend Service"
535
service.user_labels["version"] = "v2.0"
536
537
# Create field mask for selective update
538
update_mask = field_mask_pb2.FieldMask()
539
update_mask.paths.extend(["display_name", "user_labels"])
540
541
updated_service = client.update_service(
542
request={"service": service,
543
"update_mask": update_mask}  # update_mask is only settable through the request object
544
)
545
print(f"Updated service: {updated_service.display_name}")
546
```
547
548
### Working with Request-Based SLIs
549
550
```python
551
from google.cloud.monitoring_v3.types import RequestBasedSli, TimeSeriesRatio
552
553
# Create request-based SLO
554
request_slo = ServiceLevelObjective()
555
request_slo.display_name = "Error Rate SLO"
556
request_slo.goal = 0.999 # 99.9% success rate
557
558
# Rolling period
559
rolling_period = Duration()
560
rolling_period.seconds = 28 * 24 * 60 * 60 # 28 days
561
request_slo.rolling_period = rolling_period
562
563
# Define request-based SLI
564
request_sli = ServiceLevelIndicator()
565
request_based = RequestBasedSli()
566
567
# Configure good vs total ratio
568
ratio = TimeSeriesRatio()
569
ratio.good_service_filter = 'project="my-project" AND service_name="api-service" AND response_code_class="2xx"'
570
ratio.total_service_filter = 'project="my-project" AND service_name="api-service"'
571
request_based.good_total_ratio = ratio
572
573
request_sli.request_based = request_based
574
request_slo.service_level_indicator = request_sli
575
576
created_request_slo = client.create_service_level_objective(
577
request={"parent": service_name,
578
"service_level_objective": request_slo,
579
"service_level_objective_id": "error-rate-slo"}  # the ID is only settable through the request object
580
)
581
print(f"Created request-based SLO: {created_request_slo.name}")
582
```
583
584
### Deleting Services and SLOs
585
586
```python
587
# Delete SLO
588
slo_name = f"projects/{project_id}/services/web-frontend/serviceLevelObjectives/availability-slo"
589
client.delete_service_level_objective(name=slo_name)
590
print(f"Deleted SLO: {slo_name}")
591
592
# Delete service (soft delete)
593
service_name = f"projects/{project_id}/services/web-frontend"
594
client.delete_service(name=service_name)
595
print(f"Deleted service: {service_name}")
596
```
597
598
### Async Service Operations
599
600
```python
601
import asyncio
602
from google.cloud.monitoring import ServiceMonitoringServiceAsyncClient
603
604
async def manage_services():
605
client = ServiceMonitoringServiceAsyncClient()
606
project_name = f"projects/{project_id}"
607
608
# List services asynchronously
609
async for service in await client.list_services(parent=project_name):
610
print(f"Async service: {service.display_name}")
611
612
# List SLOs for each service
613
async for slo in await client.list_service_level_objectives(parent=service.name):
614
print(f" Async SLO: {slo.display_name}")
615
616
asyncio.run(manage_services())
617
```
618
619
## Resource Path Helpers
620
621
```python { .api }
622
class ServiceMonitoringServiceClient:
623
@staticmethod
624
def service_path(project: str, service: str) -> str:
625
"""Returns a fully-qualified service string."""
626
627
@staticmethod
628
def service_level_objective_path(
629
project: str,
630
service: str,
631
service_level_objective: str
632
) -> str:
633
"""Returns a fully-qualified service_level_objective string."""
634
635
@staticmethod
636
def parse_service_path(path: str) -> Dict[str, str]:
637
"""Parses a service path into its component segments."""
638
```
639
640
## Error Handling
641
642
Service monitoring operations can raise specific exceptions:
643
644
```python
645
from google.api_core import exceptions
646
from google.cloud.monitoring import ServiceMonitoringServiceClient
647
648
client = ServiceMonitoringServiceClient()
649
650
try:
651
service = client.get_service(name="invalid/path")
652
except exceptions.NotFound:
653
print("Service not found")
654
except exceptions.InvalidArgument as e:
655
print(f"Invalid service configuration: {e}")
656
except exceptions.PermissionDenied:
657
print("Insufficient permissions")
658
except exceptions.FailedPrecondition as e:
659
print(f"Service is not in a valid state for this operation: {e}")
660
```