所有中文处理操作必须遵循以下编码规则:
# ✅ Correct: pass encoding="utf-8" explicitly on every open() call
with open("file.txt", "r", encoding="utf-8") as f:
    content = f.read()
with open("output.txt", "w", encoding="utf-8") as f:
    f.write("中文内容")
# ❌ Wrong: relies on the system default encoding (GBK by default on Windows,
# which garbles the text). Kept only as a counter-example; do not copy.
with open("file.txt", "r") as f:  # may be decoded as GBK
    content = f.read()
import csv
# Write CSV. utf-8-sig is required so that Excel opens the file correctly.
with open("output.csv", "w", encoding="utf-8-sig", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["姓名", "年龄", "城市"])
    writer.writerow(["张三", 28, "北京"])
# Read CSV. newline="" is passed here as well: the csv module expects to do
# its own newline handling for readers too, otherwise line breaks embedded in
# quoted fields are not interpreted correctly.
with open("input.csv", "r", encoding="utf-8-sig", newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
import json
data = {"名称": "测试", "描述": "中文内容"}
# Write: ensure_ascii=False keeps the Chinese characters as-is instead of
# escaping them to \uXXXX sequences.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)
# Read the file back with the same explicit encoding.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
import sys
import io
# Make standard output emit UTF-8.
# Prefer reconfigure(): it changes the encoding of the existing stream in
# place, so settings that are not passed here keep their current values.
# Fall back to re-wrapping the underlying byte buffer only when reconfigure()
# is unavailable, and leave stdout untouched if it exposes neither (for
# example when it has been replaced by an object without a byte buffer).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
elif hasattr(sys.stdout, "buffer"):
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
print("中文测试")  # prints the Chinese text without mojibake
import chardet
def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        file_path: Path of the file to inspect. It is opened in binary mode,
            so no decoding happens before detection.
        sample_size: Maximum number of bytes read from the start of the file
            and passed to ``chardet.detect``. Defaults to 10000, the value
            that used to be hard-coded.

    Returns:
        The value stored under the ``"encoding"`` key of the dict returned by
        ``chardet.detect``.
        NOTE(review): chardet presumably reports ``None`` here when it cannot
        decide (e.g. for an empty file) -- confirm, and handle that case at
        the call site before passing the result to ``open()``.
    """
    with open(file_path, "rb") as f:
        raw = f.read(sample_size)
    return chardet.detect(raw)["encoding"]
# Usage example: detect the encoding of a file and print the result.
enc = detect_encoding("unknown.txt")
print(f"检测到编码: {enc}")
# pip install jieba
import jieba
# Segment a Chinese sentence into words; the expected result is shown next to
# the print call below.
text = "我爱北京天安门"
words = jieba.lcut(text)
print(words) # ['我', '爱', '北京', '天安门']
# Add an entry to the custom dictionary.
# NOTE(review): this runs after the lcut() call above, so it presumably only
# influences later segmentation calls -- confirm against the jieba docs.
jieba.add_word("天安门")
# pip install opencc-python-reimplemented
from opencc import OpenCC
# Simplified -> Traditional: build a converter with the "s2t" configuration.
cc_s2t = OpenCC("s2t")
traditional = cc_s2t.convert("我爱你中国")
print(traditional) # 我愛你中國
# Traditional -> Simplified: the reverse direction uses the "t2s" configuration.
cc_t2s = OpenCC("t2s")
simplified = cc_t2s.convert("我愛你中國")
print(simplified) # 我爱你中国
import re
def clean_chinese_text(text):
    """Clean Chinese text: drop invisible characters and normalise whitespace.

    Steps:
      1. Remove non-printable characters (control characters and the like).
      2. Collapse every run of whitespace (spaces, tabs, newlines, full-width
         spaces) into a single ASCII space.
      3. Strip leading and trailing whitespace.

    Chinese characters and Chinese punctuation are printable and are kept.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Imported locally so the function does not depend on a module-level import.
    import unicodedata

    def _keep(ch):
        # Whitespace is kept for now; it is normalised in the next step.
        if ch.isspace() or ch.isprintable():
            return True
        # Non-printable code points above U+4E00 are still kept, so characters
        # this Python's Unicode tables do not classify as printable survive.
        # Format characters (category Cf, e.g. the U+FEFF byte-order mark) are
        # the exception: they are invisible and used to slip through here.
        return ord(ch) > 0x4E00 and unicodedata.category(ch) != "Cf"

    visible = "".join(ch for ch in text if _keep(ch))
    # Collapse whitespace after filtering, so that two spaces which were only
    # separated by a removed character do not end up as a double space.
    return re.sub(r"\s+", " ", visible).strip()
# Usage example: tabs and newlines become single spaces and the surrounding
# whitespace is stripped; the expected result is shown next to the call.
text = " hello 你好 \t世界!\n\r"
print(clean_chinese_text(text)) # "hello 你好 世界!"
# pip install reportlab
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
# Register a Chinese font (the key step for avoiding garbled text in the PDF).
# The path below is the Windows system font location; it has to be adjusted
# on other systems.
pdfmetrics.registerFont(TTFont("SimSun", "C:/Windows/Fonts/simsun.ttc"))
# Create an A4 document and select the registered font by the name it was
# registered under ("SimSun") before drawing any text.
c = canvas.Canvas("中文文档.pdf", pagesize=A4)
c.setFont("SimSun", 12)
c.drawString(50, 750, "这是一个中文 PDF 文档")
c.drawString(50, 730, "使用 SimSun 字体确保中文正常显示")
# Write the PDF file.
c.save()
# pip install openpyxl
from openpyxl import Workbook
# Create a workbook and use its active worksheet.
wb = Workbook()
ws = wb.active
# Chinese text is used for the sheet title, the cell values and the file name.
ws.title = "中文工作表"
# Header row.
ws["A1"] = "姓名"
ws["B1"] = "描述"
# First data row.
ws["A2"] = "张三"
ws["B2"] = "中文测试数据"
wb.save("中文文件.xlsx")
import requests
# Fetch the resource. A timeout is passed so that an unresponsive server
# cannot block the call indefinitely.
resp = requests.get("https://example.com/api/data", timeout=10)
# Override whatever encoding requests selected for the response, so that
# resp.text is always decoded as UTF-8 (this is a forced override, not a
# fallback).
resp.encoding = "utf-8"
data = resp.text
chinese-utils/
├── SKILL.md # 本文件:技能定义和说明
├── .skillhub.json # SkillHub 元数据
└── scripts/
└── encoding_helper.py # 编码辅助工具
要将此技能发布到 SkillHub,请使用以下命令:
# 1. Change into the skill directory (the directory that directly contains SKILL.md)
cd chinese-utils
# 2. Publish
skillhub publish
- CSV 文件使用 utf-8-sig 编码,以便 Excel 正确识别中文。
- JSON 序列化时使用 ensure_ascii=False,保留原始中文。

共 1 个版本