Python字符串操作高级技巧

1. 字符串高级格式化

1.1 f-string 格式化（Python 3.6+）

f-string 是 Python 3.6 引入的字符串格式化方法，性能高且语法简洁。

name = "Alice"
age = 28
height = 1.65

# Basic usage: interpolate variables; ".2f" renders the float with two decimals
print(f"姓名: {name}, 年龄: {age}, 身高: {height:.2f}米")
# Output: 姓名: Alice, 年龄: 28, 身高: 1.65米

# Arbitrary expressions are evaluated inside the braces
print(f"{name}明年将 {age + 1} 岁")
# Output: Alice明年将 29 岁

# Method calls are allowed inside the braces as well
print(f"姓名大写: {name.upper()}")
# Output: 姓名大写: ALICE

# Number formatting
number = 1234567.89123
print(f"千位分隔: {number:,}")  # Output: 千位分隔: 1,234,567.89123
print(f"科学计数法: {number:.2e}")  # Output: 科学计数法: 1.23e+06

1.2 format() 方法高级用法

str.format() 与 f-string 共用同一套格式规格（对齐、填充、精度、进制等），并支持位置参数、关键字参数和 ** 解包，适合模板字符串与数据分开定义的场景。

# Alignment and padding. The width counts characters (code points), not
# terminal display columns, so wide CJK characters may not line up visually.
# Quotes in the outputs below are added only to make the padding visible.
print("{:>15}".format("右对齐"))  # Output: '            右对齐' (12 spaces + 3 chars)
print("{:<15}".format("左对齐"))  # Output: '左对齐            ' (3 chars + 12 spaces)
print("{:^15}".format("居中"))    # Output: '      居中       ' (6 spaces + 2 chars + 7 spaces)
print("{:*^15}".format("居中"))   # Output: '******居中*******' (6 + 7: the odd fill char goes right)

# Number formatting
pi = 3.1415926
print("圆周率: {:.2f}".format(pi))  # Output: 圆周率: 3.14
print("十六进制: {0:x}".format(255))  # Output: 十六进制: ff
print("二进制: {0:b}".format(10))     # Output: 二进制: 1010

# Formatting from a dict: ** unpacks its keys as keyword arguments
person = {"name": "Bob", "age": 35}
print("姓名: {name}, 年龄: {age}".format(**person))
# Output: 姓名: Bob, 年龄: 35

2. 字符串编码与解码

2.1 理解编码与解码

Python 3 中的 str 是 Unicode 码点序列（文本），bytes 是字节序列；两者之间需要通过 encode()/decode() 并指定具体编码（如 UTF-8、GBK）进行转换。

text = "中文示例"

# Encode the string into a byte sequence
utf8_bytes = text.encode("utf-8")
print(utf8_bytes)  # Output: b'\xe4\xb8\xad\xe6\x96\x87\xe7\xa4\xba\xe4\xbe\x8b'

# Decode the bytes back into a string
decoded_text = utf8_bytes.decode("utf-8")
print(decoded_text)  # Output: 中文示例

# The same text in a different encoding yields different bytes
gbk_bytes = text.encode("gbk")
print(gbk_bytes)  # Output: b'\xd6\xd0\xce\xc4\xca\xbe\xc0\xfd'

# Error handling
try:
    # Deliberately decode with the wrong codec: none of these bytes is ASCII
    print(utf8_bytes.decode("ascii"))
except UnicodeDecodeError as e:
    print(f"解码错误: {e}")
    # Output: 解码错误: 'ascii' codec can't decode byte 0xe4 in position 0: ordinal not in range(128)

# Using the errors= parameter instead of catching the exception
print(utf8_bytes.decode("ascii", errors="replace"))  # each undecodable byte becomes U+FFFD (12 of them here)
print(utf8_bytes.decode("ascii", errors="ignore"))   # undecodable bytes are dropped: prints an empty line

3. 正则表达式高级应用

3.1 常用正则表达式模式

正则表达式是处理复杂字符串匹配的利器。

import re

text = "联系电话: 138-0013-8000, 备用: 15012345678, 邮箱: contact@example.com"

# Match mobile numbers written as 11 consecutive digits: a leading 1, a second
# digit in 3-9, then nine more digits. The dash-separated number is not matched.
phone_pattern = r"1[3-9]\d{9}"
phones = re.findall(phone_pattern, text)
print(phones)  # Output: ['15012345678']

# Match phone numbers written with dash separators (3-4-4 digits)
phone_pattern2 = r"\d{3}-\d{4}-\d{4}"
phones2 = re.findall(phone_pattern2, text)
print(phones2)  # Output: ['138-0013-8000']

# Match e-mail addresses.
# The TLD class must be [A-Za-z]: inside a character class "|" is a literal
# pipe, not alternation, so [A-Z|a-z] would also accept "|" in the TLD.
email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
emails = re.findall(email_pattern, text)
print(emails)  # Output: ['contact@example.com']

# Substitution: mask the first two digit groups of a dash-separated number;
# the trailing four digits are not part of the match and stay visible.
anonymized = re.sub(r"\d{3}-\d{4}", "***-****", text)
print(anonymized)
# Output: 联系电话: ***-****-8000, 备用: 15012345678, 邮箱: contact@example.com

# Group extraction: each parenthesised group captures one date component
date_text = "日期: 2023-08-15, 时间: 14:30"
date_pattern = r"(\d{4})-(\d{2})-(\d{2})"
match = re.search(date_pattern, date_text)
if match:
    # groups() returns the captured substrings as strings, in pattern order
    year, month, day = match.groups()
    print(f"年: {year}, 月: {month}, 日: {day}")
    # Output: 年: 2023, 月: 08, 日: 15

4. 高级字符转换

4.1 maketrans() 和 translate()

str.maketrans() 用于构建字符映射表，str.translate() 按该映射表一次遍历完成多个单字符的替换或删除；整段子串的替换则需要借助正则表达式等其他手段。

# Build a translation table: each character of the first string maps to the
# character at the same position in the second string
trans_table = str.maketrans("aeiou", "12345")
text = "This is an example of translation."
print(text.translate(trans_table))
# Output: Th3s 3s 1n 2x1mpl2 4f tr1nsl1t34n.

# Delete specific characters: the third argument lists the characters to remove
remove_table = str.maketrans("", "", "!?.,;:")
text = "Hello! How are you? I'm fine, thanks."
print(text.translate(remove_table))
# Output: Hello How are you I'm fine thanks
# (the apostrophe is kept because it is not in the removal list)

# A more complex replacement: whole substrings instead of single characters.
# Keys are the substrings to find; values are their replacements.
replace_dict = {
    "apple": "orange",
    "banana": "grape",
    "cherry": "berry"
}
text = "I like apple, banana and cherry."

# Custom helper for replacing several substrings in one pass
def multiple_replace(text, replace_dict):
    """Replace every occurrence of each key of ``replace_dict`` in ``text``.

    All keys are combined into a single regular expression, so the text is
    scanned once and a replacement value is never re-scanned for further
    matches (``{"a": "b", "b": "a"}`` swaps the two letters).

    Args:
        text: The string to process.
        replace_dict: Mapping of literal substrings to their replacements.
            Keys are escaped, so regex metacharacters are matched literally.

    Returns:
        A new string with the replacements applied; ``text`` unchanged when
        ``replace_dict`` is empty.
    """
    if not replace_dict:
        # An empty alternation compiles to a pattern that matches the empty
        # string at every position, and looking up "" would raise KeyError.
        return text
    # Longest keys first: alternation picks the first alternative that
    # matches, so a short key that prefixes a longer one ("app" vs "apple")
    # must not be tried before the longer key.
    keys = sorted(replace_dict, key=len, reverse=True)
    pattern = re.compile("|".join(map(re.escape, keys)))
    return pattern.sub(lambda m: replace_dict[m.group(0)], text)

print(multiple_replace(text, replace_dict))
# Output: I like orange, grape and berry.