0
# Character Encoding
1
2
Character encoding and decoding support with UTF-8 and ISO-8859-1 charsets, providing encoder/decoder abstractions for text processing.
3
4
## Capabilities
5
6
### Charset Classes
7
8
Abstract character set representations with encoder and decoder factory methods.
9
10
```kotlin { .api }
11
/**
12
* Abstract character set representation.
13
* Platform-specific implementation providing encoding and decoding functionality.
14
*/
15
abstract class Charset {
16
/** Name of the character set (e.g., "UTF-8", "ISO-8859-1") */
17
abstract val name: String
18
19
/**
20
* Create a new encoder for this charset.
21
* Encoders convert character sequences to byte sequences.
22
* @return new CharsetEncoder instance
23
*/
24
abstract fun newEncoder(): CharsetEncoder
25
26
/**
27
* Create a new decoder for this charset.
28
* Decoders convert byte sequences to character sequences.
29
* @return new CharsetDecoder instance
30
*/
31
abstract fun newDecoder(): CharsetDecoder
32
33
companion object {
34
/**
35
* Get a charset by name.
36
* @param name charset name (case-insensitive)
37
* @return Charset instance
38
* @throws UnsupportedCharsetException if charset is not supported
39
*/
40
fun forName(name: String): Charset
41
42
/**
43
* Check if a charset is supported on this platform.
44
* @param charset charset name to check
45
* @return true if charset is supported
46
*/
47
fun isSupported(charset: String): Boolean
48
}
49
}
50
```
51
52
**Usage Examples:**
53
54
```kotlin
55
import io.ktor.utils.io.charsets.*
56
57
// Get charset by name
58
val utf8 = Charset.forName("UTF-8")
59
val latin1 = Charset.forName("ISO-8859-1")
60
61
// Check charset support
62
val isSupported = Charset.isSupported("UTF-16") // Platform-dependent
63
64
// Create encoders and decoders
65
val encoder = utf8.newEncoder()
66
val decoder = utf8.newDecoder()
67
68
println("Charset: ${utf8.name}")
69
```
70
71
### CharsetEncoder Class
72
73
Character sequence to byte sequence encoder with configurable encoding options.
74
75
```kotlin { .api }
76
/**
77
* Encoder for converting character sequences to byte sequences.
78
* Platform-specific implementation optimized for the target charset.
79
*/
80
abstract class CharsetEncoder {
81
/** The charset this encoder converts to */
82
abstract val charset: Charset
83
84
/**
85
* Encode a character sequence to a byte array.
86
* @param input character sequence to encode
87
* @param fromIndex starting character index (inclusive)
88
* @param toIndex ending character index (exclusive)
89
* @return encoded byte array
90
*/
91
fun encodeToByteArray(
92
input: CharSequence,
93
fromIndex: Int = 0,
94
toIndex: Int = input.length
95
): ByteArray
96
97
/**
98
* Encode a character sequence to a ByteReadPacket.
99
* @param input character sequence to encode
100
* @param fromIndex starting character index (inclusive)
101
* @param toIndex ending character index (exclusive)
102
* @return encoded bytes as a packet
103
*/
104
fun encode(
105
input: CharSequence,
106
fromIndex: Int = 0,
107
toIndex: Int = input.length
108
): ByteReadPacket
109
}
110
```
111
112
**Usage Examples:**
113
114
```kotlin
115
import io.ktor.utils.io.charsets.*
116
import io.ktor.utils.io.core.*
117
118
// Encode text to bytes
119
val encoder = Charsets.UTF_8.newEncoder()
120
121
// Encode to byte array
122
val text = "Hello, 世界! 🌍"
123
val bytes = encoder.encodeToByteArray(text)
124
println("Encoded ${text.length} characters to ${bytes.size} bytes")
125
126
// Encode partial text
127
val partialBytes = encoder.encodeToByteArray(text, fromIndex = 0, toIndex = 5)
128
129
// Encode to packet
130
val packet = encoder.encode(text)
131
val firstByte = packet.readByte()
132
val remainingBytes = packet.readRemaining()
133
134
// Encode with different charsets
135
val utf8Encoder = Charsets.UTF_8.newEncoder()
136
val latin1Encoder = Charsets.ISO_8859_1.newEncoder()
137
138
val utf8Bytes = utf8Encoder.encodeToByteArray("Hello")
139
val latin1Bytes = latin1Encoder.encodeToByteArray("Hello")
140
141
println("UTF-8: ${utf8Bytes.size} bytes")
142
println("ISO-8859-1: ${latin1Bytes.size} bytes")
143
```
144
145
### CharsetDecoder Class
146
147
Byte sequence to character sequence decoder with error handling and streaming support.
148
149
```kotlin { .api }
150
/**
151
* Decoder for converting byte sequences to character sequences.
152
* Platform-specific implementation with error handling for malformed input.
153
*/
154
abstract class CharsetDecoder {
155
/** The charset this decoder converts from */
156
abstract val charset: Charset
157
158
/**
159
* Decode bytes from input to a string.
160
* @param input byte input stream
161
* @param max maximum characters to decode
162
* @return decoded string
163
*/
164
fun decode(input: Input, max: Int = Int.MAX_VALUE): String
165
166
/**
167
* Decode bytes from input and append to destination.
168
* @param input byte input stream
169
* @param dst destination to append decoded characters
170
* @param max maximum characters to decode
171
* @return number of characters decoded
172
*/
173
fun decode(input: Input, dst: Appendable, max: Int = Int.MAX_VALUE): Int
174
175
/**
176
* Decode exactly the specified number of bytes to a string.
177
* @param input byte input stream
178
* @param inputLength exact number of bytes to read
179
* @return decoded string
180
* @throws EOFException if not enough bytes available
181
*/
182
fun decodeExactBytes(input: Input, inputLength: Int): String
183
}
184
```
185
186
**Usage Examples:**
187
188
```kotlin
189
import io.ktor.utils.io.charsets.*
190
import io.ktor.utils.io.core.*
191
192
// Decode bytes to text
193
val decoder = Charsets.UTF_8.newDecoder()
194
195
// Create input from byte array
196
val bytes = "Hello, 世界! 🌍".toByteArray(Charsets.UTF_8)
197
val input = ByteReadPacket(bytes)
198
199
// Decode to string
200
val decodedText = decoder.decode(input)
201
println("Decoded: $decodedText")
202
203
// Decode with character limit
204
val limitedInput = ByteReadPacket(bytes)
205
val partialText = decoder.decode(limitedInput, max = 10)
206
207
// Decode to appendable
208
val output = StringBuilder()
209
val bytesInput = ByteReadPacket(bytes)
210
val charactersDecoded = decoder.decode(bytesInput, output)
211
println("Decoded $charactersDecoded characters: ${output}")
212
213
// Decode exact byte count
214
val exactInput = ByteReadPacket("Test".toByteArray())
215
val exactText = decoder.decodeExactBytes(exactInput, inputLength = 4)
216
217
// Handle different encodings
218
val utf8Decoder = Charsets.UTF_8.newDecoder()
219
val latin1Decoder = Charsets.ISO_8859_1.newDecoder()
220
221
val testBytes = byteArrayOf(0x48, 0x65, 0x6C, 0x6C, 0x6F) // "Hello"
222
223
val utf8Result = utf8Decoder.decode(ByteReadPacket(testBytes))
224
val latin1Result = latin1Decoder.decode(ByteReadPacket(testBytes))
225
226
println("UTF-8 decoded: $utf8Result")
227
println("ISO-8859-1 decoded: $latin1Result")
228
```
229
230
### Standard Charsets
231
232
Pre-configured charset instances for commonly used character encodings.
233
234
```kotlin { .api }
235
/**
236
* Standard charset constants for commonly used character encodings.
237
*/
238
object Charsets {
239
/** UTF-8 character encoding */
240
val UTF_8: Charset
241
242
/** ISO-8859-1 (Latin-1) character encoding */
243
val ISO_8859_1: Charset
244
}
245
```
246
247
**Usage Examples:**
248
249
```kotlin
250
import io.ktor.utils.io.charsets.*
251
252
// Use standard charsets
253
val utf8 = Charsets.UTF_8
254
val latin1 = Charsets.ISO_8859_1
255
256
println("UTF-8 name: ${utf8.name}")
257
println("ISO-8859-1 name: ${latin1.name}")
258
259
// Create encoders for standard charsets
260
val utf8Encoder = Charsets.UTF_8.newEncoder()
261
val utf8Decoder = Charsets.UTF_8.newDecoder()
262
263
val latin1Encoder = Charsets.ISO_8859_1.newEncoder()
264
val latin1Decoder = Charsets.ISO_8859_1.newDecoder()
265
266
// Compare encoding results
267
val text = "Hello World"
268
val utf8Bytes = utf8Encoder.encodeToByteArray(text)
269
val latin1Bytes = latin1Encoder.encodeToByteArray(text)
270
271
println("Text: '$text'")
272
println("UTF-8 bytes: ${utf8Bytes.size}")
273
println("ISO-8859-1 bytes: ${latin1Bytes.size}")
274
```
275
276
### Exception Classes
277
278
Exception types for character encoding and decoding error handling.
279
280
```kotlin { .api }
281
/**
282
* Base exception for malformed input during character encoding/decoding.
283
*/
284
abstract class MalformedInputException(message: String) : Throwable(message)
285
286
/**
287
* Exception thrown when a line is too long during text processing.
288
* Extends MalformedInputException for consistent error handling.
289
*/
290
class TooLongLineException(message: String) : MalformedInputException(message)
291
```
292
293
**Usage Examples:**
294
295
```kotlin
296
import io.ktor.utils.io.charsets.*
297
import io.ktor.utils.io.core.*
298
299
// Handle encoding exceptions
300
/**
 * Encodes [text] with a fresh encoder obtained from [charset].
 *
 * @param text string to encode
 * @param charset charset whose encoder performs the conversion
 * @return the encoded bytes, or `null` if encoding raised a [MalformedInputException]
 */
fun safeEncode(text: String, charset: Charset): ByteArray? =
    try {
        charset.newEncoder().encodeToByteArray(text)
    } catch (malformed: MalformedInputException) {
        println("Failed to encode text: ${malformed.message}")
        null
    }
309
310
// Handle decoding exceptions
311
/**
 * Decodes [bytes] with a fresh decoder obtained from [charset].
 *
 * @param bytes raw bytes to decode
 * @param charset charset whose decoder performs the conversion
 * @return the decoded string, or `null` if decoding failed
 */
fun safeDecode(bytes: ByteArray, charset: Charset): String? {
    return try {
        val decoder = charset.newDecoder()
        val input = ByteReadPacket(bytes)
        decoder.decode(input)
    } catch (e: TooLongLineException) {
        // TooLongLineException extends MalformedInputException, so the more
        // specific handler has to come first or it can never be reached.
        println("Line too long: ${e.message}")
        null
    } catch (e: MalformedInputException) {
        println("Failed to decode bytes: ${e.message}")
        null
    }
}
324
325
// Usage
326
val validText = "Hello World"
327
val validBytes = safeEncode(validText, Charsets.UTF_8)
328
val decodedText = validBytes?.let { safeDecode(it, Charsets.UTF_8) }
329
330
// Handle malformed input
331
val malformedBytes = byteArrayOf(0xFF.toByte(), 0xFE.toByte()) // Invalid UTF-8
332
val result = safeDecode(malformedBytes, Charsets.UTF_8) // Returns null
333
```
334
335
### Text Processing Extensions
336
337
Extension functions for common text encoding and decoding operations.
338
339
```kotlin { .api }
340
/**
341
* Convert string to byte array using the given charset (UTF-8 by default).
342
* @param charset character encoding to use (default UTF-8)
343
* @return encoded byte array
344
*/
345
fun String.toByteArray(charset: Charset = Charsets.UTF_8): ByteArray
346
347
/**
348
* Convert byte array to string using the given charset (UTF-8 by default).
349
* @param charset character encoding to use (default UTF-8)
350
* @return decoded string
351
*/
352
fun ByteArray.toString(charset: Charset = Charsets.UTF_8): String
353
354
/**
355
* Write UTF-8 text to output.
356
* @param text string to write
357
*/
358
fun Output.writeText(text: String)
359
360
/**
361
* Write UTF-8 text followed by line separator.
362
* @param text string to write
363
*/
364
fun Output.writeTextLine(text: String)
365
366
/**
367
* Read UTF-8 text from input.
368
* @param min minimum characters to read
369
* @param max maximum characters to read
370
* @return decoded string
371
*/
372
fun Input.readText(min: Int = 0, max: Int = Int.MAX_VALUE): String
373
374
/**
375
* Read exactly the specified number of UTF-8 characters.
376
* @param exactCharacters exact number of characters to read
377
* @return decoded string
378
*/
379
fun Input.readTextExact(exactCharacters: Int): String
380
```
381
382
**Usage Examples:**
383
384
```kotlin
385
import io.ktor.utils.io.charsets.*
386
import io.ktor.utils.io.core.*
387
388
// String and byte array conversions
389
val text = "Hello, 世界!"
390
val utf8Bytes = text.toByteArray(Charsets.UTF_8)
391
val latin1Bytes = text.toByteArray(Charsets.ISO_8859_1)
392
393
val decodedFromUtf8 = utf8Bytes.toString(Charsets.UTF_8)
394
val decodedFromLatin1 = latin1Bytes.toString(Charsets.ISO_8859_1)
395
396
// Writing text to packets
397
val packet = buildPacket {
398
writeText("Line 1")
399
writeTextLine("Line 2 with newline")
400
writeText("Line 3")
401
}
402
403
// Reading text from packets
404
val input = ByteReadPacket("Hello World Test".toByteArray())
// Do the bounded read first: readText() without limits consumes everything that
// is left in the packet, so a later readText(min = 1, ...) would find no input.
val limitedText = input.readText(min = 1, max = 5) // first 5 characters
val allText = input.readText() // everything that remains after the bounded read
407
408
// Exact character reading
409
val exactInput = ByteReadPacket("Exact".toByteArray())
410
val exactText = exactInput.readTextExact(5) // Reads exactly 5 characters
411
412
// Integration with I/O operations
413
/**
 * Encodes [text] as UTF-8 and writes the complete byte array to [channel].
 *
 * @param channel destination channel
 * @param text string to write
 */
suspend fun writeTextToChannel(channel: ByteWriteChannel, text: String) {
    text.toByteArray(Charsets.UTF_8).let { payload ->
        channel.writeFully(payload, 0, payload.size)
    }
}
417
418
/**
 * Reads up to [maxBytes] bytes that are currently available on [channel] and
 * decodes them as UTF-8.
 *
 * @param channel source channel
 * @param maxBytes upper bound on the number of bytes read in this call
 * @return the decoded text, or an empty string when nothing was read
 */
suspend fun readTextFromChannel(channel: ByteReadChannel, maxBytes: Int): String {
    val buffer = ByteArray(maxBytes)
    val bytesRead = channel.readAvailable(buffer, 0, buffer.size)

    // readAvailable reports -1 once the channel is closed; copyOf(-1) would throw,
    // so treat "nothing read" as empty text instead.
    if (bytesRead <= 0) return ""

    return buffer.copyOf(bytesRead).toString(Charsets.UTF_8)
}
423
```
424
425
### Advanced Character Encoding Usage
426
427
Complex scenarios involving multiple charsets, streaming, and error recovery.
428
429
```kotlin { .api }
430
// Multi-charset text processor
431
/**
 * Decodes byte arrays using a charset selected by name, with a Latin-1 fallback
 * for input that the selected charset rejects.
 */
class MultiCharsetProcessor {
    // Keys are lower-case; lookups lower-case the requested name first.
    private val charsets = mapOf(
        "utf-8" to Charsets.UTF_8,
        "iso-8859-1" to Charsets.ISO_8859_1,
        // detectEncoding() may answer "ascii"; ASCII is a subset of UTF-8, so the
        // UTF-8 charset decodes it and the detected label can be passed straight
        // to processText().
        "ascii" to Charsets.UTF_8
    )

    /**
     * Decodes [input] using the charset registered under [charsetName].
     *
     * @param input bytes to decode
     * @param charsetName charset label, matched case-insensitively
     * @return decoded text; if the selected decoder reports malformed input the
     *         bytes are decoded as ISO-8859-1 instead
     * @throws IllegalArgumentException if [charsetName] is not registered
     */
    fun processText(input: ByteArray, charsetName: String): String {
        val charset = charsets[charsetName.lowercase()]
            ?: throw IllegalArgumentException("Unsupported charset: $charsetName")

        val decoder = charset.newDecoder()
        val byteInput = ByteReadPacket(input)

        return try {
            decoder.decode(byteInput)
        } catch (e: MalformedInputException) {
            // Fallback to Latin-1 for binary data; a fresh packet is built because
            // the first one may already be partially consumed.
            val fallbackDecoder = Charsets.ISO_8859_1.newDecoder()
            val fallbackInput = ByteReadPacket(input)
            fallbackDecoder.decode(fallbackInput)
        }
    }

    /**
     * Simplified encoding detection.
     *
     * @param bytes data to inspect
     * @return "utf-8" when the data starts with the bytes EF BB BF, "ascii" when
     *         no byte is negative (which includes empty input), otherwise
     *         "iso-8859-1"
     */
    fun detectEncoding(bytes: ByteArray): String {
        return when {
            bytes.size >= 3 &&
                bytes[0] == 0xEF.toByte() &&
                bytes[1] == 0xBB.toByte() &&
                bytes[2] == 0xBF.toByte() -> "utf-8"

            bytes.all { it >= 0 } -> "ascii"
            else -> "iso-8859-1"
        }
    }
}
467
468
// Streaming text converter
469
/**
 * Re-encodes a byte stream from [sourceCharset] into [targetCharset].
 */
class StreamingTextConverter(
    private val sourceCharset: Charset,
    private val targetCharset: Charset
) {
    /**
     * Decodes [input] in chunks of at most 1024 characters and writes each chunk,
     * re-encoded, to [output] until [input] is exhausted.
     *
     * When a [MalformedInputException] is raised, one byte of [input] is discarded
     * (if any remain) and processing continues.
     *
     * @param input source of bytes in [sourceCharset]
     * @param output destination for bytes in [targetCharset]
     */
    fun convert(input: Input, output: Output) {
        val sourceDecoder = sourceCharset.newDecoder()
        val targetEncoder = targetCharset.newEncoder()

        while (true) {
            if (input.endOfInput) return
            try {
                val chunk = sourceDecoder.decode(input, max = 1024)
                output.writePacket(targetEncoder.encode(chunk))
            } catch (malformed: MalformedInputException) {
                // Skip invalid bytes
                if (input.endOfInput) return
                input.discard(1)
            }
        }
    }
}
491
```