
# Apache Spark SQL - Catalog Operations

## Capabilities

### Database and Namespace Management

- Manage multiple databases and namespaces within Spark's metastore for logical data organization
- Create, drop, and list databases with configurable properties and location settings
- Switch between databases and manage current database context for query execution
- Handle database-level permissions and access control through metastore integration

### Table and View Management Operations

- Create, drop, and manage both managed and external tables with comprehensive metadata support
- Handle temporary and global temporary views for session-scoped and cross-session data sharing
- Support for table creation with custom storage formats, partitioning, and bucketing strategies
- Manage table properties, statistics, and optimization hints for query performance tuning

### Function Registry and Management

- Register and manage user-defined functions (UDFs) and user-defined aggregate functions (UDAFs)
- Handle both temporary and persistent function registrations with namespace scoping
- Support for function overloading and parameter type checking for type-safe operations
- Enable function discovery and introspection for development and debugging workflows

### Metadata Discovery and Introspection

- Query comprehensive metadata about databases, tables, columns, and functions through programmatic APIs
- Support for schema discovery and data lineage tracking across tables and views
- Handle table statistics and partition information for query optimization and monitoring
- Enable catalog browsing and exploration for data governance and documentation purposes

## API Reference

### Catalog Class

```scala { .api }
abstract class Catalog {
  // Current database operations
  def currentDatabase: String
  def setCurrentDatabase(dbName: String): Unit

  // Database management
  def listDatabases(): Dataset[Database]
  def listDatabases(pattern: String): Dataset[Database]
  def databaseExists(dbName: String): Boolean
  def getDatabase(dbName: String): Database
  def createDatabase(dbName: String, description: String, location: String): Unit
  def dropDatabase(dbName: String): Unit
  def dropDatabase(dbName: String, ignoreIfNotExists: Boolean, cascade: Boolean): Unit

  // Table management
  def listTables(): Dataset[Table]
  def listTables(dbName: String): Dataset[Table]
  def listTables(dbName: String, pattern: String): Dataset[Table]
  def getTable(tableName: String): Table
  def getTable(dbName: String, tableName: String): Table
  def tableExists(tableName: String): Boolean
  def tableExists(dbName: String, tableName: String): Boolean
  def createTable(tableName: String, path: String): DataFrame
  def createTable(tableName: String, path: String, source: String): DataFrame
  def createTable(tableName: String, source: String, schema: StructType, options: Map[String, String]): DataFrame
  def dropTempView(viewName: String): Boolean
  def dropGlobalTempView(viewName: String): Boolean

  // Column information
  def listColumns(tableName: String): Dataset[Column]
  def listColumns(dbName: String, tableName: String): Dataset[Column]

  // Function management
  def listFunctions(): Dataset[Function]
  def listFunctions(dbName: String): Dataset[Function]
  def listFunctions(dbName: String, pattern: String): Dataset[Function]
  def getFunction(functionName: String): Function
  def getFunction(dbName: String, functionName: String): Function
  def functionExists(functionName: String): Boolean
  def functionExists(dbName: String, functionName: String): Boolean

  // Caching operations
  def cacheTable(tableName: String): Unit
  def cacheTable(tableName: String, storageLevel: StorageLevel): Unit
  def uncacheTable(tableName: String): Unit
  def clearCache(): Unit
  def isCached(tableName: String): Boolean
  def refreshTable(tableName: String): Unit
  def refreshByPath(path: String): Unit

  // Recovery operations
  def recoverPartitions(tableName: String): Unit
}
```

### Database Metadata

```scala { .api }
case class Database(
    name: String,
    description: String,
    locationUri: String) extends DefinedByConstructorParams {

  override def toString: String = {
    s"Database[name='$name', description='$description', path='$locationUri']"
  }
}
```

### Table Metadata

```scala { .api }
case class Table(
    name: String,
    database: String,
    description: String,
    tableType: String,
    isTemporary: Boolean) extends DefinedByConstructorParams {

  override def toString: String = {
    s"Table[name='$name', database='$database', description='$description', " +
      s"tableType='$tableType', isTemporary='$isTemporary']"
  }
}
```

### Column Metadata

```scala { .api }
case class Column(
    name: String,
    description: String,
    dataType: String,
    nullable: Boolean,
    isPartition: Boolean,
    isBucket: Boolean) extends DefinedByConstructorParams {

  override def toString: String = {
    s"Column[name='$name', description='$description', dataType='$dataType', " +
      s"nullable='$nullable', isPartition='$isPartition', isBucket='$isBucket']"
  }
}
```

### Function Metadata

```scala { .api }
case class Function(
    name: String,
    database: String,
    description: String,
    className: String,
    isTemporary: Boolean) extends DefinedByConstructorParams {

  override def toString: String = {
    s"Function[name='$name', database='$database', description='$description', " +
      s"className='$className', isTemporary='$isTemporary']"
  }
}
```

### Table Creation Options

```scala { .api }
// Table creation with DataFrameWriter
class DataFrameWriter[T] {
  def saveAsTable(tableName: String): Unit
  def insertInto(tableName: String): Unit

  // V2 table operations
  def writeTo(tableName: String): DataFrameWriterV2[T]
}

// Advanced table creation
class DataFrameWriterV2[T] {
  def create(): Unit
  def replace(): Unit
  def createOrReplace(): Unit
  def append(): Unit
  def overwrite(): Unit
  def overwritePartitions(): Unit

  // Table properties
  def tableProperty(property: String, value: String): DataFrameWriterV2[T]
  def partitionedBy(column: Column, columns: Column*): DataFrameWriterV2[T]
  def using(provider: String): DataFrameWriterV2[T]
}
```

### Storage Level for Caching

```scala { .api }
object StorageLevel {
  val NONE: StorageLevel
  val DISK_ONLY: StorageLevel
  val DISK_ONLY_2: StorageLevel
  val DISK_ONLY_3: StorageLevel
  val MEMORY_ONLY: StorageLevel
  val MEMORY_ONLY_2: StorageLevel
  val MEMORY_ONLY_SER: StorageLevel
  val MEMORY_ONLY_SER_2: StorageLevel
  val MEMORY_AND_DISK: StorageLevel
  val MEMORY_AND_DISK_2: StorageLevel
  val MEMORY_AND_DISK_SER: StorageLevel
  val MEMORY_AND_DISK_SER_2: StorageLevel
  val OFF_HEAP: StorageLevel
}
```

## Usage Examples

### Database Management

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.catalog._

val spark = SparkSession.builder()
  .appName("Catalog Operations Demo")
  .enableHiveSupport() // Enable Hive metastore support
  .getOrCreate()

// Current database operations
println(s"Current database: ${spark.catalog.currentDatabase}")

// List all databases
val databases = spark.catalog.listDatabases()
databases.show()

databases.collect().foreach { db =>
  println(s"Database: ${db.name}, Description: ${db.description}, Location: ${db.locationUri}")
}

// Filter databases by pattern
val testDatabases = spark.catalog.listDatabases("test*")
testDatabases.show()

// Check if database exists
val dbExists = spark.catalog.databaseExists("analytics")
println(s"Analytics database exists: $dbExists")

// Create a new database
spark.catalog.createDatabase(
  dbName = "analytics",
  description = "Analytics and reporting database",
  location = "s3a://my-bucket/analytics/"
)

// Get database information
val analyticsDB = spark.catalog.getDatabase("analytics")
println(s"Analytics DB: ${analyticsDB.name} at ${analyticsDB.locationUri}")

// Switch to different database
spark.catalog.setCurrentDatabase("analytics")
println(s"Switched to database: ${spark.catalog.currentDatabase}")

// Drop database (with cascade to drop all tables)
spark.catalog.dropDatabase("old_database", ignoreIfNotExists = true, cascade = true)
```

### Table Management and Discovery

```scala
// List all tables in current database
val tables = spark.catalog.listTables()
tables.show()

// List tables in specific database
val analyticsTables = spark.catalog.listTables("analytics")
analyticsTables.show()

// Filter tables by pattern
val salesTables = spark.catalog.listTables("analytics", "sales*")
salesTables.show()

// Get detailed table information
tables.collect().foreach { table =>
  println(s"Table: ${table.database}.${table.name}")
  println(s"  Type: ${table.tableType}, Temporary: ${table.isTemporary}")
  println(s"  Description: ${table.description}")
  println()
}

// Check if table exists
val tableExists = spark.catalog.tableExists("sales_data")
val dbTableExists = spark.catalog.tableExists("analytics", "user_events")
println(s"sales_data exists: $tableExists")
println(s"analytics.user_events exists: $dbTableExists")

// Get specific table metadata
if (spark.catalog.tableExists("analytics", "sales_data")) {
  val salesTable = spark.catalog.getTable("analytics", "sales_data")
  println(s"Sales table: ${salesTable.name} in ${salesTable.database}")
  println(s"Table type: ${salesTable.tableType}")
  println(s"Is temporary: ${salesTable.isTemporary}")
}

// Create external table
val externalTable = spark.catalog.createTable(
  tableName = "external_sales",
  path = "s3a://data-lake/sales/",
  source = "parquet"
)

// Create table with schema and options
import org.apache.spark.sql.types._
val salesSchema = StructType(Array(
  StructField("transaction_id", StringType, nullable = false),
  StructField("customer_id", StringType, nullable = false),
  StructField("product_id", StringType, nullable = false),
  StructField("amount", DecimalType(10, 2), nullable = false),
  StructField("transaction_date", DateType, nullable = false),
  StructField("region", StringType, nullable = false)
))

val managedTable = spark.catalog.createTable(
  tableName = "managed_sales",
  source = "delta",
  schema = salesSchema,
  options = Map(
    "path" -> "/path/to/delta/sales",
    "delta.autoOptimize.optimizeWrite" -> "true",
    "delta.autoOptimize.autoCompact" -> "true"
  )
)
```

### Column Information and Schema Discovery

```scala
// List columns for a table
val salesColumns = spark.catalog.listColumns("analytics", "sales_data")
salesColumns.show(truncate = false)

// Get detailed column information
salesColumns.collect().foreach { column =>
  println(s"Column: ${column.name}")
  println(s"  Data Type: ${column.dataType}")
  println(s"  Nullable: ${column.nullable}")
  println(s"  Is Partition: ${column.isPartition}")
  println(s"  Is Bucket: ${column.isBucket}")
  println(s"  Description: ${column.description}")
  println()
}

// Analyze table schema programmatically
def analyzeTableSchema(dbName: String, tableName: String): Unit = {
  val columns = spark.catalog.listColumns(dbName, tableName).collect()

  println(s"Schema Analysis for $dbName.$tableName:")
  println(s"Total columns: ${columns.length}")

  val partitionColumns = columns.filter(_.isPartition)
  val bucketColumns = columns.filter(_.isBucket)
  val nullableColumns = columns.filter(_.nullable)

  println(s"Partition columns: ${partitionColumns.map(_.name).mkString(", ")}")
  println(s"Bucket columns: ${bucketColumns.map(_.name).mkString(", ")}")
  println(s"Nullable columns: ${nullableColumns.length}/${columns.length}")

  // Group by data type
  val typeGroups = columns.groupBy(_.dataType)
  typeGroups.foreach { case (dataType, cols) =>
    println(s"$dataType: ${cols.map(_.name).mkString(", ")}")
  }
}

analyzeTableSchema("analytics", "sales_data")

// Compare schemas between tables
def compareSchemas(db1: String, table1: String, db2: String, table2: String): Unit = {
  val schema1 = spark.catalog.listColumns(db1, table1).collect().map(c => c.name -> c.dataType).toMap
  val schema2 = spark.catalog.listColumns(db2, table2).collect().map(c => c.name -> c.dataType).toMap

  val commonColumns = schema1.keySet.intersect(schema2.keySet)
  val onlyInFirst = schema1.keySet -- schema2.keySet
  val onlyInSecond = schema2.keySet -- schema1.keySet

  println(s"Schema Comparison: $db1.$table1 vs $db2.$table2")
  println(s"Common columns: ${commonColumns.size}")
  println(s"Only in first: ${onlyInFirst.mkString(", ")}")
  println(s"Only in second: ${onlyInSecond.mkString(", ")}")

  // Check for type mismatches in common columns
  val typeMismatches = commonColumns.filter(col => schema1(col) != schema2(col))
  if (typeMismatches.nonEmpty) {
    println("Type mismatches:")
    typeMismatches.foreach { col =>
      println(s"  $col: ${schema1(col)} vs ${schema2(col)}")
    }
  }
}
```

### Function Management

```scala
// List all functions
val allFunctions = spark.catalog.listFunctions()
allFunctions.show()

// List functions in specific database
val analyticsFunctions = spark.catalog.listFunctions("analytics")
analyticsFunctions.show()

// Filter functions by pattern
val mathFunctions = spark.catalog.listFunctions("default", "*math*")
mathFunctions.show()

// Get function information
allFunctions.collect().foreach { func =>
  println(s"Function: ${func.database}.${func.name}")
  println(s"  Class: ${func.className}")
  println(s"  Temporary: ${func.isTemporary}")
  println(s"  Description: ${func.description}")
  println()
}

// Check if function exists
val funcExists = spark.catalog.functionExists("my_custom_function")
val dbFuncExists = spark.catalog.functionExists("analytics", "sales_metrics")
println(s"my_custom_function exists: $funcExists")
println(s"analytics.sales_metrics exists: $dbFuncExists")

// Get specific function details
if (spark.catalog.functionExists("default", "substring")) {
  val substringFunc = spark.catalog.getFunction("default", "substring")
  println(s"Substring function: ${substringFunc.name}")
  println(s"Class: ${substringFunc.className}")
}

// Register custom function (example)
spark.udf.register("calculate_tax", (amount: Double, rate: Double) => amount * rate)

// Verify registration
val taxFuncExists = spark.catalog.functionExists("calculate_tax")
println(s"calculate_tax function registered: $taxFuncExists")

// Function discovery and documentation
def documentFunctions(databaseName: String): Unit = {
  val functions = spark.catalog.listFunctions(databaseName).collect()

  println(s"Function Documentation for Database: $databaseName")
  println("=" * 50)

  val grouped = functions.groupBy(_.isTemporary)

  println("PERSISTENT FUNCTIONS:")
  grouped.getOrElse(false, Array()).foreach { func =>
    println(s"  ${func.name}: ${func.description}")
  }

  println("\nTEMPORARY FUNCTIONS:")
  grouped.getOrElse(true, Array()).foreach { func =>
    println(s"  ${func.name}: ${func.description}")
  }
}

documentFunctions("default")
```

### Table Caching Operations

```scala
import org.apache.spark.storage.StorageLevel

// Create sample data for caching examples
val salesData = Seq(
  ("TXN001", "CUST001", "PROD001", 100.50, "2023-01-15", "US"),
  ("TXN002", "CUST002", "PROD002", 250.75, "2023-01-16", "UK"),
  ("TXN003", "CUST003", "PROD001", 180.25, "2023-01-17", "CA")
).toDF("transaction_id", "customer_id", "product_id", "amount", "transaction_date", "region")

// Save as table for caching examples
salesData.write
  .mode("overwrite")
  .saveAsTable("sales_cache_demo")

// Basic table caching
spark.catalog.cacheTable("sales_cache_demo")

// Check if table is cached
val isCached = spark.catalog.isCached("sales_cache_demo")
println(s"sales_cache_demo is cached: $isCached")

// Cache with specific storage level
spark.catalog.cacheTable("sales_cache_demo", StorageLevel.MEMORY_AND_DISK_SER)

// Cache multiple tables with different strategies
val largeTables = spark.catalog.listTables().filter(_.name.contains("large")).collect()
largeTables.foreach { table =>
  spark.catalog.cacheTable(table.name, StorageLevel.DISK_ONLY)
  println(s"Cached large table: ${table.name} to disk only")
}

val frequentTables = spark.catalog.listTables().filter(_.name.contains("frequent")).collect()
frequentTables.foreach { table =>
  spark.catalog.cacheTable(table.name, StorageLevel.MEMORY_ONLY)
  println(s"Cached frequent table: ${table.name} to memory only")
}

// Uncache specific table
spark.catalog.uncacheTable("sales_cache_demo")
println(s"sales_cache_demo is cached after uncache: ${spark.catalog.isCached("sales_cache_demo")}")

// Clear all cached tables
spark.catalog.clearCache()
println("All cached tables cleared")

// Refresh table metadata (useful after external changes)
spark.catalog.refreshTable("sales_cache_demo")

// Refresh by path (for external tables)
spark.catalog.refreshByPath("/path/to/external/data")

// Cache management utility
def manageCacheForDatabase(databaseName: String, cacheStrategy: String): Unit = {
  val tables = spark.catalog.listTables(databaseName).collect()

  cacheStrategy.toLowerCase match {
    case "memory" =>
      tables.foreach { table =>
        spark.catalog.cacheTable(s"${table.database}.${table.name}", StorageLevel.MEMORY_ONLY)
        println(s"Cached ${table.database}.${table.name} in memory")
      }

    case "disk" =>
      tables.foreach { table =>
        spark.catalog.cacheTable(s"${table.database}.${table.name}", StorageLevel.DISK_ONLY)
        println(s"Cached ${table.database}.${table.name} on disk")
      }

    case "mixed" =>
      tables.foreach { table =>
        spark.catalog.cacheTable(s"${table.database}.${table.name}", StorageLevel.MEMORY_AND_DISK)
        println(s"Cached ${table.database}.${table.name} with memory and disk")
      }

    case "clear" =>
      tables.foreach { table =>
        if (spark.catalog.isCached(s"${table.database}.${table.name}")) {
          spark.catalog.uncacheTable(s"${table.database}.${table.name}")
          println(s"Uncached ${table.database}.${table.name}")
        }
      }
  }
}

// Cache all tables in analytics database in memory and disk
manageCacheForDatabase("analytics", "mixed")
```

### Temporary Views Management

```scala
// Create temporary views
salesData.createOrReplaceTempView("temp_sales")
salesData.createGlobalTempView("global_sales")

// List all tables including temporary views
val allTables = spark.catalog.listTables().collect()
allTables.filter(_.isTemporary).foreach { table =>
  println(s"Temporary view: ${table.name} in database: ${table.database}")
}

// Access global temporary views (in global_temp database)
val globalTempTables = spark.catalog.listTables("global_temp").collect()
globalTempTables.foreach { table =>
  println(s"Global temporary view: ${table.name}")
}

// Drop temporary views
val tempDropped = spark.catalog.dropTempView("temp_sales")
val globalTempDropped = spark.catalog.dropGlobalTempView("global_sales")
println(s"Temporary view dropped: $tempDropped")
println(s"Global temporary view dropped: $globalTempDropped")

// Temporary view lifecycle management
def manageTemporaryViews(sessionName: String): Unit = {
  // Create session-specific temporary views
  salesData.filter($"region" === "US")
    .createOrReplaceTempView(s"${sessionName}_us_sales")

  salesData.filter($"region" === "UK")
    .createOrReplaceTempView(s"${sessionName}_uk_sales")

  // List session views
  val sessionViews = spark.catalog.listTables().filter(_.name.startsWith(sessionName)).collect()
  println(s"Session views for $sessionName:")
  sessionViews.foreach(view => println(s"  - ${view.name}"))

  // Cleanup function
  def cleanup(): Unit = {
    sessionViews.foreach { view =>
      spark.catalog.dropTempView(view.name)
      println(s"Dropped temporary view: ${view.name}")
    }
  }

  // Register cleanup for shutdown
  sys.addShutdownHook(cleanup())
}

manageTemporaryViews("analytics_session")
```

### Partition Recovery and Maintenance

```scala
// Create partitioned table for recovery demo
val partitionedSales = salesData.withColumn("year", year(to_date($"transaction_date")))
  .withColumn("month", month(to_date($"transaction_date")))

partitionedSales.write
  .mode("overwrite")
  .partitionBy("year", "month")
  .saveAsTable("partitioned_sales")

// Simulate adding partitions externally (outside Spark)
// In practice, this would be done by external processes

// Recover partitions to sync metastore with filesystem
spark.catalog.recoverPartitions("partitioned_sales")
println("Recovered partitions for partitioned_sales table")

// Comprehensive catalog maintenance
def performCatalogMaintenance(): Unit = {
  println("Starting catalog maintenance...")

  // Get all databases
  val databases = spark.catalog.listDatabases().collect()

  databases.foreach { db =>
    println(s"Maintaining database: ${db.name}")

    // Get all tables in database
    val tables = spark.catalog.listTables(db.name).collect()

    tables.foreach { table =>
      if (!table.isTemporary) {
        try {
          // Refresh table metadata
          spark.catalog.refreshTable(s"${table.database}.${table.name}")

          // Recover partitions for partitioned tables
          val columns = spark.catalog.listColumns(table.database, table.name).collect()
          val hasPartitions = columns.exists(_.isPartition)

          if (hasPartitions) {
            spark.catalog.recoverPartitions(s"${table.database}.${table.name}")
            println(s"  Recovered partitions for ${table.database}.${table.name}")
          }

        } catch {
          case e: Exception =>
            println(s"  Error maintaining ${table.database}.${table.name}: ${e.getMessage}")
        }
      }
    }
  }

  println("Catalog maintenance completed")
}

// Run maintenance
performCatalogMaintenance()

// Catalog health check
def catalogHealthCheck(): Unit = {
  println("Catalog Health Check")
  println("=" * 20)

  val databases = spark.catalog.listDatabases().collect()
  println(s"Total databases: ${databases.length}")

  databases.foreach { db =>
    val tables = spark.catalog.listTables(db.name).collect()
    val tempTables = tables.count(_.isTemporary)
    val permanentTables = tables.length - tempTables

    println(s"Database ${db.name}: $permanentTables permanent, $tempTables temporary tables")

    // Check for tables with issues
    tables.filter(!_.isTemporary).foreach { table =>
      try {
        val columns = spark.catalog.listColumns(table.database, table.name).collect()
        val partitionCount = columns.count(_.isPartition)
        val bucketCount = columns.count(_.isBucket)

        if (partitionCount > 0 || bucketCount > 0) {
          println(s"  ${table.name}: $partitionCount partition columns, $bucketCount bucket columns")
        }

      } catch {
        case e: Exception =>
          println(s"  WARNING: Cannot access ${table.name}: ${e.getMessage}")
      }
    }
  }

  val allFunctions = spark.catalog.listFunctions().collect()
  val tempFunctions = allFunctions.count(_.isTemporary)
  val permanentFunctions = allFunctions.length - tempFunctions

  println(s"Total functions: $permanentFunctions permanent, $tempFunctions temporary")
}

catalogHealthCheck()
```