Comprehensive Chinese character to Pinyin conversion library with intelligent word segmentation and multiple output styles
—
Command-line interfaces for batch processing, format conversion, and integration with shell scripts and automation workflows.
The primary command-line tool for converting Chinese text to pinyin with full option support.
pypinyin [options] [input_text]

# Basic usage
pypinyin "中国" # Basic conversion with default options
pypinyin "中国" --style tone # Specify output style
pypinyin "中国" --style tone3 # Tone numbers after pinyin
# Style options
pypinyin "中国" --style normal # No tones
pypinyin "中国" --style initials # Initial consonants only
pypinyin "中国" --style finals # Final vowels only
pypinyin "中国" --style first_letter # First letters only
# Advanced options
pypinyin "银行" --heteronym # Show all pronunciations
pypinyin "text" --errors ignore # Skip unrecognized characters
pypinyin "女" --v-to-u # Convert v to ü
# Output formatting
pypinyin "中国" --separator "_" # Custom separator
pypinyin "中国" --no-tone-num # Disable tone numbers in numeric styles

# Simple conversion
$ pypinyin "中华人民共和国"
zhōng huá rén mín gòng hé guó
# Different styles
$ pypinyin "中华人民共和国" --style normal
zhong hua ren min gong he guo
$ pypinyin "中华人民共和国" --style tone3
zhong1 hua2 ren2 min2 gong4 he2 guo2
$ pypinyin "中华人民共和国" --style first_letter
z h r m g h g
# Heteronym support
$ pypinyin "银行" --heteronym
yín háng,xíng
# Pipe support
$ echo "北京大学" | pypinyin
běi jīng dà xué
# File processing
$ pypinyin < input.txt > output.txt
$ cat chinese_text.txt | pypinyin --style tone3 > pinyin_output.txt

Execute pypinyin as a Python module for integration with Python workflows.
python -m pypinyin [options] [input_text]

# Direct module execution
$ python -m pypinyin "中国"
zhōng guó
# With Python options
$ python -m pypinyin "中国" --style tone3
zhong1 guo2
# Environment variable support
$ export PYPINYIN_STYLE=normal
$ python -m pypinyin "中国"
zhong guo

Specialized tools for converting between different tone representation formats.
python -m pypinyin.tools.toneconvert [action] [input]

# Convert tone marks to tone numbers
$ echo "zhōng guó" | python -m pypinyin.tools.toneconvert to_tone3
zhong1 guo2
# Convert tone numbers to tone marks
$ echo "zhong1 guo2" | python -m pypinyin.tools.toneconvert to_tone
zhōng guó
# Remove tones entirely
$ echo "zhōng guó" | python -m pypinyin.tools.toneconvert to_normal
zhong guo
# Convert between number formats (tone number after the vowel -> tone number after the syllable)
$ echo "zho1ng guo2" | python -m pypinyin.tools.toneconvert to_tone3
zhong1 guo2
# Batch file processing
$ python -m pypinyin.tools.toneconvert to_tone3 < input_with_tones.txt > output_with_numbers.txt

Common patterns for integrating pypinyin into shell scripts and automation:
#!/bin/bash
# Process multiple files: convert every *.txt file in the current directory
# to toneless pinyin, writing <name>_pinyin.txt next to each input.
for file in *.txt; do
  # With no matches the glob stays literal ("*.txt"); skip anything that is
  # not a regular file instead of failing on the input redirection.
  [[ -f "$file" ]] || continue
  # Outputs also match *.txt; skip them so a re-run does not convert the
  # results of an earlier run into <name>_pinyin_pinyin.txt.
  [[ "$file" == *_pinyin.txt ]] && continue
  echo "Processing $file..."
  pypinyin --style normal < "$file" > "${file%.txt}_pinyin.txt"
done
# Create a searchable, de-duplicated first-letter index for a text file.
# Arguments: $1 - path to the input text file (fed to pypinyin on stdin)
# Outputs:   writes the sorted, unique tokens to <input minus .txt>_index.txt
create_pinyin_index() {
  local input_file="$1"
  local index_file
  index_file="${input_file%.txt}_index.txt"

  # Put one token per line, then sort and drop duplicates in a single step.
  pypinyin --style first_letter < "$input_file" \
    | tr ' ' '\n' \
    | sort -u > "$index_file"
}
# URL slug generation
# Arguments: $1 - title text
# Outputs:   pypinyin's output on stdout, requested with --style normal and
#            "-" as the separator
generate_url_slug() {
  local chinese_title="$1"
  # printf, not echo: echo would treat a title such as "-n" or "-e" as an
  # option and send nothing useful to pypinyin.
  printf '%s\n' "$chinese_title" | pypinyin --style normal --separator "-"
}
# Example usage: build a slug for a sample title and print it.
chinese_title="北京大学计算机科学"
url_slug="$(generate_url_slug "${chinese_title}")"
# Presumably prints one token per character, e.g.
# bei-jing-da-xue-ji-suan-ji-ke-xue (confirm against the installed CLI).
echo "URL slug: ${url_slug}"

# Efficient processing of large text corpora:
# Process large files with progress indication: convert a file line by line,
# reporting progress every 100 lines.
# Arguments: $1 - input file
#            $2 - output file (converted lines are appended to it)
#            $3 - pypinyin style (default: normal)
# Outputs:   status and progress messages on stdout
# Returns:   1 if the input file cannot be read, 0 otherwise
process_large_file() {
  local input_file="$1"
  local output_file="$2"
  local style="${3:-normal}"
  local total_lines current_line=0 line

  if [[ ! -r "$input_file" ]]; then
    echo "Cannot read input file: $input_file" >&2
    return 1
  fi

  echo "Processing $input_file with style $style..."

  # Count lines for progress. awk's NR also counts a final line that has no
  # trailing newline, which wc -l would miss.
  total_lines=$(awk 'END { print NR }' < "$input_file")

  # "|| [[ -n $line ]]" keeps a last line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    current_line=$((current_line + 1))
    # printf, not echo, so lines such as "-n" are passed through verbatim.
    printf '%s\n' "$line" | pypinyin --style "$style" >> "$output_file"

    # Progress indicator
    if ((current_line % 100 == 0)); then
      echo "Progress: $current_line/$total_lines lines"
    fi
  done < "$input_file"
}
# Parallel processing for multiple files: one background job per file.
# Arguments: $1 - pypinyin style; remaining arguments - input files
# Outputs:   per-file start/completion messages on stdout; each result is
#            written to <file minus .txt>_<style>.txt
# Returns:   0 if every file converted, 1 if any job failed
parallel_process() {
  local style="$1"
  shift
  local files=("$@")
  local file pid
  local -a pids=()
  local failed=0

  for file in "${files[@]}"; do
    (
      echo "Starting $file..."
      # Exit the job with the failure status so the parent can detect it,
      # rather than reporting "Completed" for a file that was not converted.
      pypinyin --style "$style" < "$file" > "${file%.txt}_${style}.txt" || exit
      echo "Completed $file"
    ) &
    pids+=("$!")
  done

  # A bare `wait` discards job statuses; wait on each PID to collect them.
  for pid in "${pids[@]}"; do
    wait "$pid" || failed=1
  done

  if ((failed)); then
    echo "Some files failed to process" >&2
    return 1
  fi
  echo "All files processed"
}
# Usage: convert three files concurrently with the "normal" style.
parallel_process normal file1.txt file2.txt file3.txt

# Integration with common Unix text processing tools:
# Extract and convert Chinese text from mixed content into a frequency table.
# Arguments: $1 - input file
# Outputs:   writes "<count> <token>" lines, most frequent first, to
#            chinese_word_frequency.txt in the current directory
# Requires a grep that supports -P (PCRE) for the \x{...} code-point range.
extract_and_convert() {
  local source_file="$1"

  # Pull out runs of characters in U+4E00..U+9FFF, convert them, split the
  # result into one token per line, then count and rank the tokens.
  grep -oP '[\x{4e00}-\x{9fff}]+' "$source_file" \
    | pypinyin --style normal \
    | tr ' ' '\n' \
    | sort \
    | uniq -c \
    | sort -nr > chinese_word_frequency.txt
}
# Create a pronunciation dictionary from text.
# Arguments: $1 - input file
# Outputs:   writes one "<phrase> -> <pinyin>" line per unique phrase to
#            pronunciation_dict.txt in the current directory
# Requires a grep that supports -P (PCRE) for the \x{...} code-point range.
create_pronunciation_dict() {
  local source_file="$1"

  # A phrase is a run of two or more characters in U+4E00..U+9FFF; each
  # distinct phrase is converted once with the tone style.
  grep -oP '[\x{4e00}-\x{9fff}]{2,}' "$source_file" \
    | sort -u \
    | while read -r phrase; do
        echo "$phrase -> $(echo "$phrase" | pypinyin --style tone)"
      done > pronunciation_dict.txt
}
# Search text by pinyin: print the lines of a file whose pinyin matches a
# search term.
# Arguments: $1 - search term (converted with pypinyin --style normal)
#            $2 - text file to search
# Outputs:   matching original lines on stdout
search_by_pinyin() {
  local search_term="$1"
  local text_file="$2"
  local search_pattern line line_pinyin

  # Convert the search term to a pinyin pattern, joining syllables with ".*".
  # tr cannot do this: it maps single characters, so tr ' ' '.*' only turned
  # each space into "." and never emitted the "*".
  search_pattern=$(printf '%s\n' "$search_term" | pypinyin --style normal)
  search_pattern=${search_pattern// /.*}

  # Find matching lines, including a last line without a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line_pinyin=$(printf '%s\n' "$line" | pypinyin --style normal)
    # -e keeps a pattern that starts with "-" from being parsed as an option.
    if printf '%s\n' "$line_pinyin" | grep -q -e "$search_pattern"; then
      printf '%s\n' "$line"
    fi
  done < "$text_file"
}

# Environment variables and configuration options:
# Environment variable configuration
# NOTE(review): confirm the installed pypinyin CLI actually reads these.
PYPINYIN_STYLE=tone3      # Default style
PYPINYIN_SEPARATOR="_"    # Default separator
PYPINYIN_ERRORS=ignore    # Error handling strategy
export PYPINYIN_STYLE PYPINYIN_SEPARATOR PYPINYIN_ERRORS

# Configuration file support (if available)
# The delimiter is quoted so the body is written literally, with no expansion.
cat <<'EOF' > ~/.pypinyinrc
style=normal
separator=-
heteronym=false
v_to_u=true
EOF
# Use configuration in scripts: load ~/.pypinyinrc into the current shell
# when the file exists.
# Globals:  whatever variables the rc file assigns (e.g. style, separator)
# Outputs:  a confirmation message on stdout when the file was loaded
# Returns:  0 when the file is absent or loaded; non-zero if sourcing fails
# NOTE: `source` runs the file as shell code, so the file must be trusted.
load_config() {
  if [[ -f ~/.pypinyinrc ]]; then
    # Do not claim success when the rc file could not be sourced.
    # shellcheck source=/dev/null
    source ~/.pypinyinrc || return
    echo "Loaded configuration from ~/.pypinyinrc"
  fi
}

# Robust error handling for production workflows:
# Safe pypinyin execution with error handling: convert one string, retrying
# on failure.
# Arguments: $1 - text to convert
#            $2 - pypinyin style (default: normal)
# Outputs:   converted text on stdout; retry/failure notices on stderr
# Returns:   0 on success, 1 after max_retries failed attempts
safe_pypinyin() {
  local input="$1"
  local style="${2:-normal}"
  local -r max_retries=3
  local retry_count=0
  local result

  while ((retry_count < max_retries)); do
    # pypinyin's own stderr is discarded on purpose; failures are reported
    # below. printf (not echo) passes inputs such as "-n" through verbatim.
    if result=$(printf '%s\n' "$input" | pypinyin --style "$style" 2>/dev/null); then
      printf '%s\n' "$result"
      return 0
    fi

    # Plain assignment: ((retry_count++)) evaluates to 0 on the first pass,
    # returns status 1, and would abort a caller running under `set -e`.
    retry_count=$((retry_count + 1))
    echo "Retry $retry_count/$max_retries for: $input" >&2

    # Only pause when another attempt will follow.
    if ((retry_count < max_retries)); then
      sleep 1
    fi
  done

  echo "Failed to process: $input" >&2
  return 1
}
# Validate Chinese text before processing.
# Arguments: $1 - text to check
# Outputs:   warnings on stderr
# Returns:   1 when no character in U+4E00..U+9FFF is found, 0 otherwise
#            (an over-long text only triggers a warning)
# Requires a grep that supports -P (PCRE) for the \x{...} code-point range.
validate_chinese_text() {
  local text="$1"

  # Accept the text as soon as one character in the range is present.
  if echo "$text" | grep -qP '[\x{4e00}-\x{9fff}]'; then
    # Over-long input is reported but still accepted.
    if [[ ${#text} -gt 1000 ]]; then
      echo "Warning: Text very long (${#text} chars): $text" >&2
    fi
    return 0
  fi

  echo "Warning: No Chinese characters found in: $text" >&2
  return 1
}
# Complete processing function with validation: convert the text only when
# validate_chinese_text accepts it.
# Arguments: $1 - text to convert
#            $2 - pypinyin style (default: normal)
# Outputs:   whatever safe_pypinyin prints; a skip notice on stderr when the
#            input is rejected
# Returns:   safe_pypinyin's status, or 1 when validation rejects the input
process_with_validation() {
  local input="$1"
  local style="${2:-normal}"

  if ! validate_chinese_text "$input"; then
    echo "Skipping invalid input: $input" >&2
    return 1
  fi

  safe_pypinyin "$input" "$style"
}

# Install with Tessl CLI
npx tessl i tessl/pypi-pypinyin