0
# REUSE Information Processing
1
2
The REUSE information processing system provides data structures and functions for handling licensing and copyright information. The core `ReuseInfo` class and supporting utilities enable extraction, manipulation, and analysis of REUSE compliance data.
3
4
## Capabilities
5
6
### Core Data Structures
7
8
The foundational data classes for REUSE information handling.
9
10
```python { .api }
11
@dataclass(frozen=True)
12
class ReuseInfo:
13
"""
14
Simple dataclass holding licensing and copyright information.
15
16
Attributes:
17
spdx_expressions: set[Expression] - SPDX license expressions
18
copyright_lines: set[str] - Copyright statements
19
contributor_lines: set[str] - Contributor information
20
path: Optional[str] - File path this info applies to
21
source_path: Optional[str] - Source file where info was found
22
source_type: Optional[SourceType] - Type of source containing the info
23
"""
24
spdx_expressions: set[Expression] = field(default_factory=set)
25
copyright_lines: set[str] = field(default_factory=set)
26
contributor_lines: set[str] = field(default_factory=set)
27
path: Optional[str] = None
28
source_path: Optional[str] = None
29
source_type: Optional[SourceType] = None
30
```
31
32
```python { .api }
33
class SourceType(Enum):
34
"""
35
Enumeration representing types of sources for license information.
36
"""
37
DOT_LICENSE = "dot-license" # A .license file containing license information
38
FILE_HEADER = "file-header" # A file header containing license information
39
DEP5 = "dep5" # A .reuse/dep5 file containing license information
40
REUSE_TOML = "reuse-toml" # A REUSE.toml file containing license information
41
```
42
43
### ReuseInfo Manipulation
44
45
Methods for creating, copying, and combining ReuseInfo instances.
46
47
```python { .api }
48
def copy(self, **kwargs: Any) -> ReuseInfo:
49
"""
50
Return a copy of ReuseInfo, replacing the values of attributes with
51
the values from kwargs.
52
53
Args:
54
**kwargs: Attribute values to replace
55
56
Returns:
57
New ReuseInfo instance with updated attributes
58
59
Raises:
60
KeyError: If kwargs contains non-existent attributes
61
"""
62
63
def union(self, value: ReuseInfo) -> ReuseInfo:
64
"""
65
Return a new instance of ReuseInfo where all set attributes are equal
66
to the union of the set in self and the set in value.
67
68
All non-set attributes are set to their values in self.
69
70
Args:
71
value: ReuseInfo instance to union with
72
73
Returns:
74
New ReuseInfo instance with combined data
75
"""
76
77
def __or__(self, value: ReuseInfo) -> ReuseInfo:
78
"""Union operator support (| operator)."""
79
return self.union(value)
80
```
81
82
**Usage Examples:**
83
84
```python
85
from reuse import ReuseInfo, SourceType
86
87
# Create basic ReuseInfo
88
info1 = ReuseInfo(
89
copyright_lines={"2023 Jane Doe"},
90
source_path="example.py",
91
source_type=SourceType.FILE_HEADER
92
)
93
94
# Create another with different data
95
info2 = ReuseInfo(
96
copyright_lines={"2023 John Smith"},
97
spdx_expressions={"MIT"}
98
)
99
100
# Copy with modifications
101
modified_info = info1.copy(
102
copyright_lines={"2024 Jane Doe"},
103
spdx_expressions={"Apache-2.0"}
104
)
105
106
# Union two ReuseInfo instances
107
combined = info1.union(info2)
108
print(f"Combined copyrights: {combined.copyright_lines}")
109
# Output: Combined copyrights: {'2023 Jane Doe', '2023 John Smith'} (set order may vary)
110
111
# Using union operator
112
combined_alt = info1 | info2 # Same as info1.union(info2)
113
```
114
115
### Content Analysis Methods
116
117
Methods for analyzing ReuseInfo content and compliance status.
118
119
```python { .api }
120
def contains_copyright_or_licensing(self) -> bool:
121
"""
122
Check if either spdx_expressions or copyright_lines is non-empty.
123
124
Returns:
125
True if the instance contains copyright or licensing information
126
"""
127
128
def contains_copyright_xor_licensing(self) -> bool:
129
"""
130
Check if exactly one of spdx_expressions or copyright_lines is non-empty.
131
132
Returns:
133
True if contains exactly one type of information (copyright XOR licensing)
134
"""
135
136
def contains_info(self) -> bool:
137
"""
138
Check if any field except path, source_path and source_type is non-empty.
139
140
Returns:
141
True if the instance contains any substantive REUSE information
142
"""
143
144
def __bool__(self) -> bool:
145
"""
146
Check if any attributes have values.
147
148
Returns:
149
True if any attribute is truthy
150
"""
151
```
152
153
**Usage Examples:**
154
155
```python
156
# Create ReuseInfo instances for testing
157
empty_info = ReuseInfo()
158
copyright_only = ReuseInfo(copyright_lines={"2023 Jane Doe"})
159
license_only = ReuseInfo(spdx_expressions={"MIT"})
160
complete_info = ReuseInfo(
161
copyright_lines={"2023 Jane Doe"},
162
spdx_expressions={"MIT"}
163
)
164
165
# Test content analysis methods
166
print(f"Empty has info: {empty_info.contains_info()}") # False
167
print(f"Copyright only has copyright or license: {copyright_only.contains_copyright_or_licensing()}") # True
168
print(f"License only has copyright XOR license: {license_only.contains_copyright_xor_licensing()}") # True
169
print(f"Complete info has copyright or license: {complete_info.contains_copyright_or_licensing()}") # True
170
print(f"Complete info has copyright XOR license: {complete_info.contains_copyright_xor_licensing()}") # False
171
172
# Boolean evaluation
173
print(f"Empty info is truthy: {bool(empty_info)}") # False
174
print(f"Complete info is truthy: {bool(complete_info)}") # True
175
```
176
177
### Content Extraction Functions
178
179
Functions for extracting REUSE information from text content and files.
180
181
```python { .api }
182
def extract_reuse_info(text: str) -> ReuseInfo:
183
"""
184
Extract REUSE info from text content.
185
186
Args:
187
text: Text content to analyze for REUSE information
188
189
Returns:
190
ReuseInfo instance containing extracted information
191
192
Note:
193
Searches for SPDX license identifiers, copyright statements,
194
and contributor information using pattern matching.
195
"""
196
197
def reuse_info_of_file(path: Path) -> ReuseInfo:
198
"""
199
Get REUSE info for specific file.
200
201
Args:
202
path: File path to analyze
203
204
Returns:
205
ReuseInfo instance for the file
206
207
Raises:
208
FileNotFoundError: If file doesn't exist
209
UnicodeDecodeError: If file can't be decoded as text
210
"""
211
212
def contains_reuse_info(text: str) -> bool:
213
"""
214
Check if text contains REUSE information.
215
216
Args:
217
text: Text content to check
218
219
Returns:
220
True if text contains REUSE licensing or copyright information
221
"""
222
```
223
224
**Usage Examples:**
225
226
```python
227
from pathlib import Path

# reuse_info_of_file is used at the bottom of this example, so it has to be
# imported together with the two text-based helpers.
from reuse.extract import (
    contains_reuse_info,
    extract_reuse_info,
    reuse_info_of_file,
)

# Sample source text carrying REUSE tags in its header comment
file_content = '''
# SPDX-FileCopyrightText: 2023 Jane Doe <jane@example.com>
# SPDX-License-Identifier: MIT

def hello_world():
    print("Hello, World!")
'''

# Extract from text content
info = extract_reuse_info(file_content)
print(f"Extracted licenses: {info.spdx_expressions}")
print(f"Extracted copyrights: {info.copyright_lines}")

# Check if content has REUSE info
has_info = contains_reuse_info(file_content)
print(f"Contains REUSE info: {has_info}")

# Extract from file (only when the example file is actually present)
if Path("example.py").exists():
    file_info = reuse_info_of_file(Path("example.py"))
    print(f"File REUSE info: {file_info}")
251
```
252
253
### Text Processing Utilities
254
255
Utility functions for processing and manipulating text content.
256
257
```python { .api }
258
def find_spdx_tag(text: str, pattern: re.Pattern) -> Iterator[str]:
259
"""
260
Find SPDX tags in text using regex pattern.
261
262
Args:
263
text: Text to search
264
pattern: Compiled regex pattern for SPDX tags
265
266
Yields:
267
str: SPDX tag values found in text
268
"""
269
270
def filter_ignore_block(text: str) -> str:
271
"""
272
Filter ignored blocks from text.
273
274
Args:
275
text: Input text potentially containing ignore blocks
276
277
Returns:
278
Text with ignore blocks removed
279
280
Note:
281
Removes sections marked with REUSE-IgnoreStart/REUSE-IgnoreEnd comments.
282
"""
283
284
def detect_line_endings(text: str) -> str:
285
"""
286
Detect line ending style in text.
287
288
Args:
289
text: Text content to analyze
290
291
Returns:
292
Line ending character(s) detected ('\\n', '\\r\\n', or '\\r')
293
"""
294
```
295
296
**Usage Examples:**
297
298
```python
299
import re
300
from reuse.extract import find_spdx_tag, filter_ignore_block, detect_line_endings
301
302
# Find SPDX license identifiers
303
license_pattern = re.compile(r'SPDX-License-Identifier:\s*([^\n\r]*)')
304
text_with_licenses = "SPDX-License-Identifier: MIT\nSPDX-License-Identifier: GPL-3.0"
305
306
for license_id in find_spdx_tag(text_with_licenses, license_pattern):
307
print(f"Found license: {license_id}")
308
309
# Filter ignore blocks
310
text_with_ignore = '''
311
Some content
312
# REUSE-IgnoreStart
313
This content should be ignored
314
# REUSE-IgnoreEnd
315
More content
316
'''
317
318
filtered = filter_ignore_block(text_with_ignore)
319
print(f"Filtered text: {filtered}")
320
321
# Detect line endings
322
unix_text = "Line 1\nLine 2\n"
323
windows_text = "Line 1\r\nLine 2\r\n"
324
325
print(f"Unix endings: {repr(detect_line_endings(unix_text))}") # '\n'
326
print(f"Windows endings: {repr(detect_line_endings(windows_text))}") # '\r\n'
327
```
328
329
### Binary File Handling
330
331
Functions for handling binary files and extracting text content.
332
333
```python { .api }
334
def decoded_text_from_binary(binary_data: bytes) -> str:
335
"""
336
Extract text from binary file data.
337
338
Args:
339
binary_data: Raw binary data from file
340
341
Returns:
342
Decoded text content
343
344
Raises:
345
UnicodeDecodeError: If binary data cannot be decoded as text
346
347
Note:
348
Attempts multiple encoding strategies (UTF-8, Latin-1, etc.)
349
and handles byte order marks (BOM).
350
"""
351
```
352
353
**Usage Examples:**
354
355
```python
356
# extract_reuse_info is called below, so import it alongside the decoder.
from reuse.extract import decoded_text_from_binary, extract_reuse_info

# Read binary file and decode
with open("example.py", "rb") as f:
    binary_data = f.read()

try:
    text_content = decoded_text_from_binary(binary_data)
    # Now extract REUSE info from text
    info = extract_reuse_info(text_content)
except UnicodeDecodeError:
    print("File is not text or uses unsupported encoding")
368
```
369
370
## Complete REUSE Information Processing Example
371
372
```python
373
from reuse import ReuseInfo, SourceType
374
from reuse.extract import extract_reuse_info, contains_reuse_info
375
from pathlib import Path
376
377
def process_file_reuse_info(file_path: Path) -> dict:
    """Collect the REUSE information of one file into a plain dictionary.

    Args:
        file_path: Path of the file to analyze. It is read as UTF-8 text.

    Returns:
        A dictionary with the keys ``file``, ``has_reuse_info``,
        ``licenses``, ``copyrights``, ``contributors`` and
        ``compliance_status``. The list values are sorted so that the
        output is stable between runs. ``compliance_status`` is one of:

        * ``"compliant"``: licensing and copyright information both found
        * ``"partial"``: exactly one of the two was found
        * ``"missing_info"``: neither was found
        * ``"error"``: the file could not be read or decoded; the message
          is stored under the additional ``error`` key
    """
    result = {
        "file": str(file_path),
        "has_reuse_info": False,
        "licenses": [],
        "copyrights": [],
        "contributors": [],
        "compliance_status": "unknown",
    }

    # Only the file access can raise the errors handled here, so keep the
    # try body limited to it. OSError covers FileNotFoundError as well as
    # PermissionError and IsADirectoryError.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        result["error"] = str(e)
        result["compliance_status"] = "error"
        return result

    # Cheap pre-check before running the full extraction
    if not contains_reuse_info(content):
        result["compliance_status"] = "missing_info"
        return result

    info = extract_reuse_info(content)

    if info.contains_info():
        result["has_reuse_info"] = True
        # The attributes are sets; sort them for deterministic output.
        result["licenses"] = sorted(str(expr) for expr in info.spdx_expressions)
        result["copyrights"] = sorted(info.copyright_lines)
        result["contributors"] = sorted(info.contributor_lines)

    # Neither -> missing, exactly one -> partial, both -> compliant
    if not info.contains_copyright_or_licensing():
        result["compliance_status"] = "missing_info"
    elif info.contains_copyright_xor_licensing():
        result["compliance_status"] = "partial"
    else:
        result["compliance_status"] = "compliant"

    return result
424
425
# Usage
426
file_analysis = process_file_reuse_info(Path("src/example.py"))
427
print(f"File: {file_analysis['file']}")
428
print(f"Compliance: {file_analysis['compliance_status']}")
429
print(f"Licenses: {file_analysis['licenses']}")
430
print(f"Copyrights: {file_analysis['copyrights']}")
431
```