> 版本: v3.22
> 适用: 所有支持文件处理与长文本生成的 AI 智能体
> 核心能力: 根据用户提供的模板 + 多格式材料,生成格式一致、内容完整、事实准确的成品文档
> 工作流: 输入 → 分析 → 确认 → 填充 → 自检 → 交付
本 Skill 依赖以下 Python 库:
| 库名 | 用途 | 安装命令 |
|---|---|---|
| python-docx | Word 文档读写 | pip install python-docx |
| openpyxl | Excel 读取 | pip install openpyxl |
快速安装: pip install python-docx openpyxl
你是文档生成专家。根据用户提供的模板和参考材料,生成格式与模板完全一致、内容基于材料事实的成品文档。
绝对原则:
用户会提供 2~4 类输入(模板与材料为必填,其余为选填):
| 输入项 | 要求 | 说明 |
|---|---|---|
| 模板 | 必填 | Word(.docx) 文件 |
| 材料 | 必填 | Excel/CSV/文本等 |
| 占位符规则 | 选填 | 自定义正则表达式 |
| 输出要求 | 选填 | 格式、页数、风格等 |
操作指令:
输出四部分分析结果,完成后暂停等待用户确认:
列出模板的所有层级结构(按原模板层级缩进)
将模板中的"空白/占位/需填处"分类:
| 可直接替换的字段 | 占位符标记 | 需要创作的章节 | 当前状态 |
|---|---|---|---|
| 公司名称 | {公司名称} | 项目背景 | 空段落 |
| 签约日期 | 【日期】 | 实施建议 | 仅标题 |
占位符识别规则:
{xxx}、【xxx】、{{xxx}}、________、XXX、待定、TBD、To Be Filled、[Your Company]
根据模板类型给出填充策略
检测是否包含目录域,提醒用户更新
分析完成后输出:
【第二步:模板分析结果】
A. 模板结构概览: [展示结构]
B. 字段分类清单: [展示表格]
C. 创作建议: [策略说明]
D. 目录提示: [如有目录则提示]
---
请确认是否开始执行填充? (Y/A/N)
等待用户输入:
核心操作(使用下方附录的 Python 代码):
- replace_preserve_format() 函数
- scan_and_replace_hidden() 扫描页眉/页脚/文本框
- detect_conflicts_during_fill() 处理多材料冲突

替换级别:
填充完成后,执行 7 项自检:
【第五步:量化自检报告】
1. 占位符清除检查
通过: 未发现未替换的占位符
2. 隐藏内容检查
页眉: N 处 / 页脚: N 处 / 文本框: N 处
3. 格式一致性检查
通过: 抽检 N 个 Run,全部一致
4. 段落样式检查
通过: 所有段落样式与模板一致
5. 事实核查
通过: 所有数据可在材料中找到出处
6. 目录检查
通过: 模板不含自动目录
7. Run 级替换失败检查
通过: 所有占位符均通过 Run 级或跨 Run 级替换
---
综合判定: 可交付 / 需修正
保存成品文档,提供下载链接,并附上自检报告。
文件命名规则: <原模板名>_已填充_<日期>.docx
以下所有 Python 函数均已实现,AI 可直接使用或根据需求修改。
"""
通用模板填充与改写 SKILL v3.22 - 核心函数
"""
from docx import Document
from docx.text.paragraph import Paragraph
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
import re
def _read_font_props(run):
    """Collect the explicitly set font attributes of a run.

    Only attributes whose value is not ``None`` are included. The colour is
    read from ``run.font.color.rgb`` and stored under ``'color_rgb'``; any
    error while reading it is ignored so the remaining properties are still
    returned.

    Returns:
        dict with any of the keys ``name``, ``size``, ``bold``, ``italic``,
        ``underline`` and ``color_rgb``.
    """
    font = run.font
    simple_attrs = ('name', 'size', 'bold', 'italic', 'underline')
    candidates = ((attr, getattr(font, attr, None)) for attr in simple_attrs)
    props = {attr: value for attr, value in candidates if value is not None}

    # Colour access goes through a nested object and is treated as best-effort.
    try:
        rgb = font.color.rgb
    except Exception:
        rgb = None
    if rgb is not None:
        props['color_rgb'] = rgb
    return props
def _apply_font_props(run, props):
    """Write previously captured font attributes back onto a run.

    Args:
        run: object exposing a ``font`` attribute.
        props: dict as produced by ``_read_font_props``; only the keys that
            are present are applied.

    Setting the colour is best-effort: any error raised while assigning
    ``font.color.rgb`` is ignored.
    """
    font = run.font
    for attr in ('name', 'size', 'bold', 'italic', 'underline'):
        if attr in props:
            setattr(font, attr, props[attr])

    if 'color_rgb' not in props:
        return
    try:
        font.color.rgb = props['color_rgb']
    except Exception:
        pass
def replace_preserve_format(para, old_text, new_text):
    """Replace ``old_text`` in a paragraph while keeping run-level formatting.

    Two strategies are tried in order:

    1. Run level: the first run whose own text contains ``old_text`` has every
       occurrence inside that run replaced.
    2. Cross-run level: when the text is split over several runs, the
       replacement is written into the run where the match starts, the matched
       tail is trimmed from the run where it ends, and the runs in between are
       removed from the paragraph.

    Only one run (level 1) or one occurrence (level 2) is handled per call.

    Args:
        para: paragraph object exposing ``text`` and ``runs``.
        old_text: text to look for; an empty value is treated as "not found".
        new_text: replacement; ``None`` is treated as an empty string and other
            values are converted with ``str()``.

    Returns:
        ``(success, level)`` where ``level`` is one of ``"run-level"``,
        ``"cross-run-level"``, ``"not-found"``, ``"cannot-locate-runs"`` or
        ``"cannot-replace"``.
    """
    # An empty needle is "in" every string and would be inserted between
    # every character by str.replace, so reject it up front.
    if not old_text or old_text not in para.text:
        return False, "not-found"
    new_text = "" if new_text is None else str(new_text)

    # Work on one snapshot of the runs. Indexing para.runs again after a run
    # element has been removed shifts every later index, which made the old
    # loop blank/remove the wrong runs.
    runs = list(para.runs)

    # Level 1: the placeholder lives entirely inside a single run (best case).
    for run in runs:
        if old_text in run.text:
            fp = _read_font_props(run)
            run.text = run.text.replace(old_text, new_text)
            _apply_font_props(run, fp)
            return True, "run-level"

    # Level 2: the placeholder is split across several runs.
    runs_text = [r.text for r in runs]
    full_text = "".join(runs_text)
    idx = full_text.find(old_text)
    if idx == -1:
        # para.text contained the text but the runs do not.
        return False, "cannot-replace"
    end_idx = idx + len(old_text)

    # Map the character span [idx, end_idx) onto run indexes and offsets.
    start_run = end_run = -1
    start_off = end_off = 0
    cur = 0
    for i, rt in enumerate(runs_text):
        s, e = cur, cur + len(rt)
        cur = e
        if start_run == -1 and s <= idx < e:
            start_run, start_off = i, idx - s
        if s < end_idx <= e:
            end_run, end_off = i, end_idx - s
    if start_run == -1 or end_run == -1:
        return False, "cannot-locate-runs"

    sr = runs[start_run]
    sfp = _read_font_props(sr)
    prefix = sr.text[:start_off]
    if end_run != start_run:
        # The replacement takes on the formatting of the run where the
        # placeholder starts.
        sr.text = prefix + new_text
        _apply_font_props(sr, sfp)
        er = runs[end_run]
        er.text = er.text[end_off:]
        # Runs strictly between start and end held only placeholder text.
        for middle in runs[start_run + 1:end_run]:
            middle.text = ""
            parent = middle._element.getparent()
            if parent is not None:
                parent.remove(middle._element)
    else:
        sr.text = prefix + new_text + sr.text[end_off:]
        _apply_font_props(sr, sfp)
    return True, "cross-run-level"
def replace_in_table_cell(cell, old_text, new_text):
    """Run ``replace_preserve_format`` on every paragraph of a table cell.

    All paragraphs are processed, so a placeholder that appears in several
    paragraphs of the same cell is replaced in each of them rather than only
    in the first one.

    Args:
        cell: table cell object exposing ``paragraphs``.
        old_text: text to look for.
        new_text: replacement text.

    Returns:
        ``(True, level)`` with the level reported for the first paragraph that
        was changed, or ``(False, "not-found")`` when nothing was replaced.
    """
    first_level = None
    for para in cell.paragraphs:
        ok, lvl = replace_preserve_format(para, old_text, new_text)
        if ok and first_level is None:
            first_level = lvl
    if first_level is None:
        return False, "not-found"
    return True, first_level
def _iter_textbox_elements(doc):
    """Yield the paragraph and table-cell XML elements found in text boxes.

    For every ``w:txbxContent`` element under the document body this yields
    ``("para", element)`` for each ``w:p`` below it, followed by
    ``("cell", element)`` for each ``w:tc`` of each ``w:tr`` of each ``w:tbl``
    below it.

    Iteration is best-effort: any exception raised while walking the tree ends
    the generator silently.
    """
    try:
        body = doc.element.body
        for content in body.iter(qn('w:txbxContent')):
            yield from (("para", p) for p in content.iter(qn('w:p')))
            for table in content.iter(qn('w:tbl')):
                for tr in table.iter(qn('w:tr')):
                    yield from (("cell", tc) for tc in tr.iter(qn('w:tc')))
    except Exception:
        pass
def _get_element_text(element):
    """Return the concatenated text of all ``w:t`` nodes under ``element``.

    Nodes whose text is empty or ``None`` contribute nothing.
    """
    pieces = []
    for node in element.iter(qn('w:t')):
        if node.text:
            pieces.append(node.text)
    return ''.join(pieces)
def _replace_in_element(element, old_text, new_text):
    """Replace ``old_text`` inside the ``w:t`` nodes of an XML element.

    When every occurrence lies inside a single ``w:t`` node the replacement is
    done node by node, which leaves the surrounding run structure (and thus
    the character formatting) untouched. Only when an occurrence spans several
    nodes is all text collapsed into the first ``w:t`` node; in that case the
    formatting of the later runs is lost.

    Args:
        element: XML element (paragraph or table cell) to search.
        old_text: text to look for; an empty value never matches.
        new_text: replacement text.

    Returns:
        ``True`` if a replacement was made, otherwise ``False``.
    """
    if not old_text:
        return False
    tnodes = list(element.iter(qn('w:t')))
    if not tnodes:
        return False

    texts = [tn.text or '' for tn in tnodes]
    joined = ''.join(texts)
    if old_text not in joined:
        return False

    # Without xml:space="preserve" leading/trailing blanks written into a
    # w:t node are not kept.
    space_attr = qn('xml:space')

    # Preferred path: all occurrences sit inside individual nodes.
    in_node_hits = sum(t.count(old_text) for t in texts)
    if in_node_hits == joined.count(old_text):
        for tn, t in zip(tnodes, texts):
            if old_text in t:
                tn.text = t.replace(old_text, new_text)
                tn.set(space_attr, 'preserve')
        return True

    # Fallback: at least one occurrence spans nodes. Put the whole replaced
    # text into the first node and empty the others.
    for tn in tnodes[1:]:
        tn.text = ""
    tnodes[0].text = joined.replace(old_text, new_text)
    tnodes[0].set(space_attr, 'preserve')
    return True
def _replace_in_story(story, replace_map):
    """Apply ``replace_map`` to one header/footer and return the hit count."""
    hits = 0
    for para in story.paragraphs:
        for old, new in replace_map.items():
            if replace_preserve_format(para, old, new)[0]:
                hits += 1
    for tbl in story.tables:
        for row in tbl.rows:
            for cell in row.cells:
                for old, new in replace_map.items():
                    if replace_in_table_cell(cell, old, new)[0]:
                        hits += 1
    return hits


def scan_and_replace_hidden(doc, replace_map):
    """Replace placeholders in headers, footers and body text boxes.

    For every section the primary, first-page and even-page header and footer
    are processed, skipping those linked to the previous section. Text boxes
    are taken from ``_iter_textbox_elements(doc)`` and replaced at XML level.

    Args:
        doc: document object exposing ``sections``.
        replace_map: dict mapping placeholder text to replacement text.

    Returns:
        dict with the number of successful replacements per area:
        ``{"header": int, "footer": int, "textbox": int}``.
    """
    stats = {"header": 0, "footer": 0, "textbox": 0}

    for section in doc.sections:
        stories = (
            ("header", (section.header,
                        section.first_page_header,
                        section.even_page_header)),
            ("footer", (section.footer,
                        section.first_page_footer,
                        section.even_page_footer)),
        )
        for area, parts in stories:
            for part in parts:
                # Linked parts are skipped here; only unlinked ones are edited.
                if part and not part.is_linked_to_previous:
                    stats[area] += _replace_in_story(part, replace_map)

    # Text boxes are handled best-effort: a failure here must not discard the
    # header/footer work already done, so errors are deliberately swallowed.
    # NOTE(review): only text boxes under the document body are visited;
    # text boxes inside headers/footers are not covered.
    try:
        for _, elem in _iter_textbox_elements(doc):
            for old, new in replace_map.items():
                if _replace_in_element(elem, old, new):
                    stats["textbox"] += 1
    except Exception:
        pass
    return stats
def scan_placeholders_full(doc, custom_patterns=None):
    """Scan a document for placeholder text that was left unreplaced.

    Covered areas: body paragraphs, body tables, and the paragraphs/tables of
    every header and footer that is not linked to the previous section.

    Built-in patterns: ``{x}`` / ``{{x}}`` (reported once), ``【x】``, ``待定``,
    runs of three or more underscores, and the words ``XXX``, ``TBD``,
    ``Placeholder`` and ``To Be Filled``. Matching is case-insensitive.

    Args:
        doc: document object exposing ``paragraphs``, ``tables`` and
            ``sections``.
        custom_patterns: optional iterable of extra regular expressions.

    Returns:
        list of dicts with keys ``type`` (``"body"``, ``"table"``,
        ``"header"`` or ``"footer"``), ``text`` (the matched text) and
        ``context`` (the match plus up to 10 characters on each side).
    """
    # \b is not used for the word patterns: CJK characters count as word
    # characters, so "\bXXX\b" never matches inside text such as "金额XXX元".
    not_before = r'(?<![0-9A-Za-z_])'
    not_after = r'(?![0-9A-Za-z_])'
    patterns = [
        # Double braces first so "{{x}}" is not also reported as "{x}".
        r'\{\{[^{}]+\}\}|\{[^{}]+\}',
        r'【[^【】]+】',
        r'待定',
        r'_{3,}',
        not_before + r'XXX' + not_after,
        not_before + r'TBD' + not_after,
        not_before + r'Placeholder' + not_after,
        not_before + r'To Be Filled' + not_after,
    ]
    if custom_patterns:
        patterns.extend(custom_patterns)
    compiled = [re.compile(p, re.IGNORECASE) for p in patterns]

    findings = []

    def scan_text(text, src):
        for rx in compiled:
            for m in rx.finditer(text):
                findings.append({
                    "type": src, "text": m.group(0),
                    "context": text[max(0, m.start() - 10):m.end() + 10]
                })

    def scan_story(story, para_src, cell_src):
        for para in story.paragraphs:
            scan_text(para.text, para_src)
        for table in story.tables:
            for row in table.rows:
                for cell in row.cells:
                    scan_text(cell.text, cell_src)

    scan_story(doc, "body", "table")

    for section in doc.sections:
        groups = (
            ("header", (section.header,
                        section.first_page_header,
                        section.even_page_header)),
            ("footer", (section.footer,
                        section.first_page_footer,
                        section.even_page_footer)),
        )
        for src, parts in groups:
            for part in parts:
                if part and not part.is_linked_to_previous:
                    scan_story(part, src, src)
    return findings
def detect_toc(doc):
    """Report whether the document appears to contain a table of contents.

    Three signals are checked in order:

    1. any ``w:instrText`` node under the body whose text contains the word
       ``TOC`` (complex field),
    2. any ``w:fldSimple`` node under the body whose ``w:instr`` attribute
       contains the word ``TOC`` (simple field),
    3. any paragraph in ``doc.paragraphs`` whose style name starts with
       ``TOC``.

    Returns:
        ``True`` if any signal is found, otherwise ``False``.
    """
    toc_word = re.compile(r'\bTOC\b')
    body = doc.element.body

    # Walk all descendants with iter(): instruction text is nested inside
    # runs (and possibly inside other containers), so a direct-children
    # lookup on the paragraph never finds it.
    for instr in body.iter(qn('w:instrText')):
        if instr.text and toc_word.search(instr.text):
            return True

    for simple in body.iter(qn('w:fldSimple')):
        if toc_word.search(simple.get(qn('w:instr')) or ''):
            return True

    # Fallback: paragraphs formatted with a TOC style.
    for para in doc.paragraphs:
        style = para.style
        if style is not None and style.name and style.name.startswith('TOC'):
            return True
    return False
"""
材料数据解析函数
"""
import openpyxl
import csv
import re
def _build_headers(header_row):
    """Turn the first sheet row into a list of unique, non-empty column names."""
    headers = []
    used = set()
    last_seen = None
    for i, cell in enumerate(header_row):
        if cell is not None:
            last_seen = str(cell).strip()
        # A None cell inherits the nearest header to its left (merged header
        # cells); blank or missing names fall back to a positional name.
        base = last_seen or f"col_{i+1}"
        name, n = base, 1
        while name in used:
            n += 1
            name = f"{base}_{n}"
        used.add(name)
        headers.append(name)
    return headers


def parse_excel_data(rows_data):
    """Convert spreadsheet rows into field/value mappings.

    The first row of each sheet is used as the header. Header cells that are
    ``None`` inherit the nearest header to their left; repeated names are made
    unique with a ``_2``, ``_3`` ... suffix so that no column overwrites
    another; blank names become ``col_<n>``. Cell values are converted to
    stripped strings, ``None`` becoming ``""``.

    When more than one sheet is given, each value is stored under
    ``"<sheet>.<field>"`` and additionally under the plain field name if that
    name is not already taken in the row.

    Args:
        rows_data: ``{sheet_name: [row_tuple, ...]}``, or a plain list/tuple of
            row tuples (treated as a single sheet named ``"Sheet1"``).

    Returns:
        ``(text, results)`` where ``results`` maps ``"<sheet>_row<n>"`` (``n``
        starts at 2 for the first data row) to the row dict, and ``text`` is a
        line-oriented rendering of ``results``.
    """
    if isinstance(rows_data, (list, tuple)):
        rows_data = {"Sheet1": rows_data}
    multi_sheet = len(rows_data) > 1

    results = {}
    for sname, rows in rows_data.items():
        # Materialise so that generators of rows can be indexed.
        rows = list(rows) if rows is not None else []
        if not rows:
            continue
        headers = _build_headers(rows[0])
        for ridx, row in enumerate(rows[1:], start=2):
            rd = {}
            for cidx, cv in enumerate(row):
                value = "" if cv is None else str(cv).strip()
                fn = headers[cidx] if cidx < len(headers) else f"col_{cidx+1}"
                if multi_sheet:
                    rd[f"{sname}.{fn}"] = value
                    if fn not in rd:
                        rd[fn] = value
                else:
                    rd[fn] = value
            if rd:
                results[f"{sname}_row{ridx}"] = rd

    parts = []
    for k, d in results.items():
        parts.append(f"--- {k} ---")
        parts.extend(f"{f}: {v}" for f, v in d.items())
    return "\n".join(parts), results
def parse_csv_data(csv_text):
    """Render CSV text as ``field: value`` lines, using row 1 as field names.

    The text is parsed with the ``csv`` module from a file-like object, so
    quoted fields may contain line breaks. A leading byte-order mark is
    dropped so it does not end up in the first field name. Empty values and
    values beyond the last header column are omitted from the output.

    Args:
        csv_text: the CSV content as a string.

    Returns:
        A newline-joined string starting with ``"CSV Headers: ..."`` followed
        by one ``"--- Row <n> ---"`` section per data row (``n`` starts at 2),
        or ``""`` when there is no content.
    """
    import io

    if not csv_text:
        return ""
    # newline='' leaves line endings untouched so the csv parser can tell a
    # record separator from a line break inside a quoted field.
    stream = io.StringIO(csv_text.lstrip('\ufeff'), newline='')
    rows = list(csv.reader(stream))
    if not rows:
        return ""

    hdrs = [h.strip() for h in rows[0]]
    parts = ["CSV Headers: " + ", ".join(hdrs)]
    for ridx, row in enumerate(rows[1:], start=2):
        parts.append(f"--- Row {ridx} ---")
        for name, value in zip(hdrs, row):
            value = value.strip()
            if value:
                parts.append(f"{name}: {value}")
    return "\n".join(parts)
def parse_markdown_data(md_text):
    """Flatten Markdown text into an indented outline.

    Line handling:
      * ``#`` .. ``######`` headings become ``"[Section] <title>"``, indented
        by one space per heading level above the first.
      * list items introduced by ``- ``, ``* `` or ``+ `` become
        ``" * <item>"``.
      * any other non-blank line is kept, stripped, with one leading space.
      * blank lines are dropped.

    Args:
        md_text: Markdown source as a string.

    Returns:
        The outline as a single newline-joined string.
    """
    bullet_markers = ('- ', '* ', '+ ')
    parts = []
    for line in md_text.split("\n"):
        heading = re.match(r'^(#{1,6})\s+(.+)', line)
        if heading:
            level = len(heading.group(1))
            parts.append(" " * (level - 1) + "[Section] " + heading.group(2).strip())
            continue
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith(bullet_markers):
            parts.append(" * " + stripped[2:])
        else:
            parts.append(" " + stripped)
    return "\n".join(parts)
"""
工具函数: 优先级、冲突检测
"""
import re
def get_priority(name, ext_type=""):
    """Score how authoritative a material is, from its name and file type.

    Scoring starts at 50 and is adjusted as follows:
      * +50 if the name contains a contract/official keyword,
      * +30 if it contains a confirmation/approval keyword,
      * -10 if it contains a draft/discussion keyword (drafts must rank below
        otherwise identical material, not above it),
      * +10 for ``pdf``, +5 for ``docx``/``doc``/``xlsx``/``xls``.

    Args:
        name: file name or title of the material; matched case-insensitively.
        ext_type: file extension, with or without a leading dot, any case.

    Returns:
        int score clamped to the range 0-100.
    """
    score = 50
    nl = str(name).lower()

    high = ['contract', 'agreement', 'official', '合同', '协议', '官方', '正式']
    if any(k in nl for k in high):
        score += 50
    med = ['confirm', 'approval', '确认', '确认函', '批复']
    if any(k in nl for k in med):
        score += 30
    low = ['draft', 'discussion', '草稿', '初稿', '讨论']
    if any(k in nl for k in low):
        score -= 10

    # Accept both "pdf" and ".pdf".
    et = str(ext_type).strip().lower().lstrip('.')
    if et == 'pdf':
        score += 10
    elif et in ('docx', 'doc', 'xlsx', 'xls'):
        score += 5
    return max(0, min(100, score))
def detect_conflicts_during_fill(field_name, candidates):
    """Pick a value for a field from several sources and report conflicts.

    Args:
        field_name: name of the field being filled.
        candidates: list of ``(value, source, priority)`` tuples.

    Returns:
        ``(resolved_value, conflict_log)``:
          * no candidates -> ``(None, message)``,
          * one candidate, or all candidates agree -> ``(value, None)``,
          * otherwise the value with the highest priority plus a log listing
            every candidate in descending priority. If a different value
            shares the top priority, the log ends with an extra warning line
            because the automatic choice is then arbitrary.
    """
    if not candidates:
        return None, f"Field `{field_name}` has no candidates"
    if len(candidates) == 1:
        return candidates[0][0], None

    # sorted() is stable, so among equal priorities input order is kept.
    sorted_c = sorted(candidates, key=lambda x: x[2], reverse=True)
    top_val, _, top_pri = sorted_c[0]

    # Compare with == instead of building a set so unhashable values work.
    if all(c[0] == top_val for c in sorted_c):
        return top_val, None

    lines = [f"[CONFLICT] `{field_name}` has multiple values:"]
    lines.extend(f" - {v} (src: {s}, pri: {p})" for v, s, p in sorted_c)
    lines.append(f"-> Auto-selected: {top_val} (highest pri)")
    if any(c[2] == top_pri and c[0] != top_val for c in sorted_c[1:]):
        lines.append("[WARN] Top priority is a tie between different values; "
                     "manual confirmation required")
    return top_val, "\n".join(lines) + "\n"
# Bracket-style markers ({x}, [x], (x), %x%, #x#), full-width 【x】 markers,
# and runs of three or more underscores.
_PLACEHOLDER_RE = re.compile(
    r'[{\[(%#][^}\])%#\n]{1,50}[}\])%#]+'
    r'|【[^】\n]{1,50}】'
    r'|_{3,}'
)


def is_placeholder(text):
    """Return ``True`` if ``text`` looks like it contains a placeholder.

    Recognised forms: 1-50 characters wrapped in ``{}``, ``[]``, ``()``,
    ``%`` or ``#``; 1-50 characters wrapped in ``【】``; or three or more
    consecutive underscores. Non-string input (for example ``None``) is never
    a placeholder.
    """
    if not isinstance(text, str):
        return False
    return _PLACEHOLDER_RE.search(text) is not None
# 完整使用示例
from docx import Document
# 假设上述函数已定义或从附录复制
def fill_template(template_path, material_data, output_path, placeholder_keys=None):
    """Fill a .docx template from a dict of material values and save it.

    Steps: load the template, build the placeholder -> value map, replace in
    body paragraphs, body tables and hidden content (headers, footers, text
    boxes), scan for leftover placeholders, check for a table of contents,
    then save.

    Placeholders whose material value is missing (or ``None``) are left in
    place instead of being replaced by an empty string, so that the leftover
    scan can report them.

    Args:
        template_path: path of the template document.
        material_data: dict of material values.
        output_path: path the filled document is written to.
        placeholder_keys: optional dict mapping placeholder text to the key to
            read from ``material_data``. Defaults to ``{公司名称}`` ->
            ``company``, ``{日期}`` -> ``date``, ``{金额}`` -> ``amount``.

    Returns:
        The list of leftover placeholder findings from
        ``scan_placeholders_full`` (empty when none remain).
    """
    if placeholder_keys is None:
        placeholder_keys = {
            "{公司名称}": "company",
            "{日期}": "date",
            "{金额}": "amount",
        }

    # 1. Load the template.
    doc = Document(template_path)

    # 2. Build the replacement map. Values are converted to str because the
    #    replacement helpers work on strings (amounts may arrive as numbers).
    replace_map = {
        placeholder: str(material_data[key])
        for placeholder, key in placeholder_keys.items()
        if material_data.get(key) is not None
    }

    # 3. Body paragraphs.
    for para in doc.paragraphs:
        for old, new in replace_map.items():
            replace_preserve_format(para, old, new)

    # 4. Body tables.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for old, new in replace_map.items():
                    replace_in_table_cell(cell, old, new)

    # 5. Hidden content (headers / footers / text boxes).
    scan_and_replace_hidden(doc, replace_map)

    # 6. Self-check: report placeholders that are still present.
    remaining = scan_placeholders_full(doc)
    if remaining:
        print(f"警告: 发现 {len(remaining)} 个未替换占位符")
        for item in remaining:
            print(f" - {item['type']}: {item['text']}")

    # 7. A table of contents has to be refreshed by the user in Word.
    if detect_toc(doc):
        print("提示: 文档包含目录,请手动更新 (Ctrl+A -> F9)")

    # 8. Save.
    doc.save(output_path)
    print(f"已保存: {output_path}")
    return remaining
# 使用
# fill_template("合同模板.docx", {"company": "ABC科技", "date": "2024-01-01"}, "输出.docx")
当任何智能体加载此 SKILL 后,必须在首次交互时向用户声明:
> 我已加载「通用模板填充与改写 SKILL v3.22」。请上传您的模板文件(支持单个模板)和参考材料(支持 Excel/CSV/文本等格式)。
>
> 我将按照"输入→分析→确认→填充→自检→交付"的六步工作流为您生成成品文档。支持以下能力:
> - 隐藏内容扫描: 自动扫描并替换页眉、页脚、文本框中的占位符
> - 多格式材料: 自动识别 Excel 表头、CSV 字段、Markdown 层级
> - 格式保留: 使用 Run 级替换,最大化保留原文格式
> - 冲突检测: 多材料冲突时按权威性分级自动处理
>
> ⚠️ 能力边界: 当前智能体不支持直接识别图片。建议优先使用 Excel、Word 等文本格式材料。
共 1 个版本