Batch rename CNKI-downloaded PDF papers to YYYYMMDD_Title.pdf using Playwright-based browser automation.
Extracts the exact online publication date from CNKI search results, detects degree type for theses, and
renames files uniformly.
Output files use the YYYYMMDD_Title.pdf format, with _硕 (master's) or _博 (doctorate) suffixes for theses.
The host machine must have:
# Python 3.8+ with Playwright
pip install playwright
playwright install chromium
The skill automates a real Chromium browser via Playwright to search CNKI for each paper and extract
precise publication metadata. The workflow is:
For each PDF without an exact YYYYMMDD date prefix:
1. Extract paper title from filename (remove _Author suffix and existing date prefix)
2. Open CNKI search page: https://kns.cnki.net/kns8s/search
3. Type title into search input, click search
4. Parse first result: date from td.date, paper type from td.data
5. Rename file to YYYYMMDD_Title[_硕|_博].pdf
6. Pace requests to avoid captcha (1.5-3s between searches, 5-10s between batches)
CNKI-downloaded PDFs are typically named 论文标题_作者名.pdf. The extraction logic:
import re
def extract_title(filepath):
    """Derive the paper title to search for from a PDF file path.

    Strips, in order: the file extension, a leading 4-8 digit date prefix
    (``YYYY_`` .. ``YYYYMMDD_``), a trailing ``_硕``/``_博`` degree marker, and
    otherwise a trailing ``_Author`` segment of at most 6 characters.

    Args:
        filepath: Path (or bare file name) of the PDF.

    Returns:
        The title portion of the file name.
    """
    import os  # local import: this snippet otherwise only imports `re`

    # splitext instead of [:-4] so a name without ".pdf" is not truncated.
    stem = os.path.splitext(os.path.basename(filepath))[0]
    stem = re.sub(r'^\d{4,8}_', '', stem)  # Remove existing date prefix

    # A trailing degree marker is not part of the title. Leaving it in would
    # make the search query "Title_硕" and the next rename "..._硕_硕.pdf".
    for marker in ('_硕', '_博'):
        if stem.endswith(marker) and len(stem) > len(marker):
            return stem[:-len(marker)]

    # Remove author suffix (last underscore + short name)
    head, sep, tail = stem.rpartition('_')
    if sep and len(tail) <= 6 and tail not in ('硕', '博'):
        return head
    return stem
CNKI search results show dates in various formats. Parse them all into YYYYMMDD:
import re
def parse_date(date_str):
    """Normalize a CNKI date string to ``YYYYMMDD``, ``YYYYMM`` or ``YYYY``.

    The most precise form that can be extracted is returned. A month outside
    1-12 or a day outside 1-31 is rejected and parsing falls back to the next
    coarser form, so an issue number such as "第24期" is never emitted as a
    month.

    Args:
        date_str: Raw text of the date cell; may be empty or None.

    Returns:
        The digits-only date string, or None when no 4-digit year is found.
    """
    if not date_str:
        return None

    # "2024-03-15" / "2024-3-15", then "2024年3月15日"
    for pattern in (
        r'(\d{4})-(\d{1,2})-(\d{1,2})',
        r'(\d{4})年(\d{1,2})月(\d{1,2})日',
    ):
        m = re.search(pattern, date_str)
        if m and 1 <= int(m.group(2)) <= 12 and 1 <= int(m.group(3)) <= 31:
            return f"{m.group(1)}{m.group(2).zfill(2)}{m.group(3).zfill(2)}"

    # "2024年3月" or "2024年第3期" (fallback: no day). The issue number is
    # used as the month only when it is a plausible month.
    m = re.search(r'(\d{4})年(?:第)?(\d{1,2})(?:月|期)', date_str)
    if m and 1 <= int(m.group(2)) <= 12:
        return f"{m.group(1)}{m.group(2).zfill(2)}"

    # "2024年" (fallback: year only)
    m = re.search(r'(\d{4})年', date_str)
    if m:
        return m.group(1)

    # Bare year
    m = re.search(r'(\d{4})', date_str)
    return m.group(1) if m else None
CNKI search results include a database type column (td.data) showing "期刊", "硕士", "博士":
def detect_paper_type(db_text):
    """Map the text of the CNKI database-type cell to a degree marker.

    Returns '博' when the text contains "博士", '硕' when it contains "硕士",
    and None otherwise (journal paper, no suffix).
    """
    # Ordered pairs: the doctoral keyword is checked before the master's one.
    for keyword, marker in (('博士', '博'), ('硕士', '硕')):
        if keyword in db_text:
            return marker
    return None  # Journal paper, no suffix
Below is the production-ready script. It works in multiple passes:
"""
CNKI Batch PDF Rename Tool
Renames PDFs to YYYYMMDD_Title[_硕|_博].pdf using Playwright + CNKI search
"""
import asyncio
import re
import os
from playwright.async_api import async_playwright
# Tunables for the batch run.
# NOTE(review): BASE_PATH is not read by any code visible in this file - confirm it is still needed.
BASE_PATH = None  # Set by caller: target directory path
BATCH_SIZE = 15  # Number of papers searched before the longer batch pause
BATCH_PAUSE = 10  # Seconds to wait between batches
SEARCH_DELAY = 2.0  # Seconds to wait after each individual search
def extract_title(filepath):
    """Derive the paper title to search for from a PDF file path.

    Strips, in order: the file extension, a leading 4-8 digit date prefix
    (``YYYY_`` .. ``YYYYMMDD_``), a trailing ``_硕``/``_博`` degree marker, and
    otherwise a trailing ``_Author`` segment of at most 6 characters.

    Args:
        filepath: Path (or bare file name) of the PDF.

    Returns:
        The title portion of the file name.
    """
    # splitext instead of [:-4] so a name without ".pdf" is not truncated.
    stem = os.path.splitext(os.path.basename(filepath))[0]
    stem = re.sub(r'^\d{4,8}_', '', stem)

    # A trailing degree marker is not part of the title. Leaving it in would
    # make the search query "Title_硕" and the next rename "..._硕_硕.pdf".
    for marker in ('_硕', '_博'):
        if stem.endswith(marker) and len(stem) > len(marker):
            return stem[:-len(marker)]

    # Drop a short trailing segment, assumed to be the author name.
    head, sep, tail = stem.rpartition('_')
    if sep and len(tail) <= 6 and tail not in ('硕', '博'):
        return head
    return stem
def parse_date(date_str):
    """Normalize a CNKI date string to ``YYYYMMDD``, ``YYYYMM`` or ``YYYY``.

    The most precise form that can be extracted is returned. A month outside
    1-12 or a day outside 1-31 is rejected and parsing falls back to the next
    coarser form, so an issue number such as "第24期" is never emitted as a
    month.

    Args:
        date_str: Raw text of the date cell; may be empty or None.

    Returns:
        The digits-only date string, or None when no 4-digit year is found.
    """
    if not date_str:
        return None

    # Full dates: "2024-03-15" / "2024-3-15", then "2024年3月15日".
    for pattern in (
        r'(\d{4})-(\d{1,2})-(\d{1,2})',
        r'(\d{4})年(\d{1,2})月(\d{1,2})日',
    ):
        m = re.search(pattern, date_str)
        if m and 1 <= int(m.group(2)) <= 12 and 1 <= int(m.group(3)) <= 31:
            return f"{m.group(1)}{m.group(2).zfill(2)}{m.group(3).zfill(2)}"

    # Year + month, or year + issue number used as the month only when it
    # is a plausible month: "2024年3月", "2024年第3期".
    m = re.search(r'(\d{4})年(?:第)?(\d{1,2})(?:月|期)', date_str)
    if m and 1 <= int(m.group(2)) <= 12:
        return f"{m.group(1)}{m.group(2).zfill(2)}"

    # Year only: "2024年".
    m = re.search(r'(\d{4})年', date_str)
    if m:
        return m.group(1)

    # Last resort: the first run of four digits.
    m = re.search(r'(\d{4})', date_str)
    return m.group(1) if m else None
def detect_paper_type(db_text):
    """Return the degree marker implied by the CNKI database-type text.

    '博' for text containing "博士", '硕' for text containing "硕士", and None
    for anything else.
    """
    if '博士' in db_text:
        return '博'
    return '硕' if '硕士' in db_text else None
# Characters that are not allowed in file names on common file systems.
# ASCII ':' becomes the full-width colon U+FF1A so titles stay readable;
# everything else becomes '_'.
_UNSAFE_FILENAME_CHARS = str.maketrans({
    '/': '_',
    '\\': '_',
    '*': '_',
    '?': '_',
    '"': '_',
    '<': '_',
    '>': '_',
    '|': '_',
    ':': '\uff1a',
})


def sanitize_filename(name):
    """Replace characters that are invalid in file names.

    Args:
        name: Proposed file name component (no directory part).

    Returns:
        ``name`` with ``/ \\ * ? " < > |`` replaced by ``_`` and ``:``
        replaced by the full-width colon.
    """
    return name.translate(_UNSAFE_FILENAME_CHARS)
_CNKI_SEARCH_URL = 'https://kns.cnki.net/kns8s/search'


def _collect_pending_pdfs(target_dir, upgrade_only):
    """Return (path, title) pairs for the PDFs under target_dir that need a date."""
    pending = []
    for root, _dirs, files in os.walk(target_dir):
        for fname in files:
            # Skip non-PDFs and "~$"-prefixed temporary files.
            if not fname.endswith('.pdf') or fname.startswith('~$'):
                continue
            prefix = re.match(r'^(\d+)_', fname)
            digits = len(prefix.group(1)) if prefix else 0
            if upgrade_only:
                # Only YYYYMM papers (exactly 6 digits)
                wanted = digits == 6
            else:
                # Papers without a full 8-digit date prefix
                wanted = digits < 8
            if wanted:
                fpath = os.path.join(root, fname)
                pending.append((fpath, extract_title(fpath)))
    return pending


async def _open_search_page(page):
    """Load the CNKI search page, pausing for a manual solve if the CAPTCHA shows."""
    await page.goto(_CNKI_SEARCH_URL, wait_until='domcontentloaded', timeout=20000)
    await page.wait_for_timeout(1500)

    captcha = await page.query_selector('#tcaptcha_transform_dy')
    if not captcha:
        return
    box = await captcha.bounding_box()
    # The CAPTCHA element is treated as showing when its box has y >= 0.
    if box and box['y'] >= 0:
        print(" CAPTCHA detected. Pausing 30s for manual solve...")
        await page.wait_for_timeout(30000)
        await page.goto(_CNKI_SEARCH_URL, wait_until='domcontentloaded', timeout=20000)
        await page.wait_for_timeout(2000)


async def _find_result(page, title, upgrade_only):
    """Search CNKI for title and return (date, db_text) of the first usable row, or None."""
    await _open_search_page(page)

    search_input = await page.wait_for_selector('input.search-input', timeout=10000)
    await search_input.fill(title)
    await page.wait_for_timeout(400)

    search_btn = await page.query_selector('input.search-btn')
    if search_btn:
        await search_btn.click()
    else:
        await page.press('input.search-input', 'Enter')
    await page.wait_for_timeout(2500)

    rows = await page.query_selector_all('.result-table-list tbody tr')
    if not rows:
        rows = await page.query_selector_all('tr')

    for row in rows:
        # Only the DOM reads are best-effort: a row that cannot be read is
        # skipped. Nothing else is inside this try, so a failure later on is
        # never mistaken for "this row is unusable".
        try:
            date_el = await row.query_selector('td.date')
            if not date_el:
                continue
            db_el = await row.query_selector('td.data')
            date_text = (await date_el.inner_text()).strip()
            db_text = (await db_el.inner_text()).strip() if db_el else ""
        except Exception:
            continue

        date = parse_date(date_text)
        if not date:
            continue
        # For upgrade mode, only accept a date better than YYYYMM.
        # NOTE(review): falling through to a later row may pick a different
        # paper; rows are not checked against the searched title.
        if upgrade_only and len(date) <= 6:
            continue
        return date, db_text
    return None


def _rename_with_date(fpath, title, date, db_text, thesis_label):
    """Rename fpath to date_Title[_硕|_博].pdf; return False if the target name is taken."""
    degree = detect_paper_type(db_text) if thesis_label else None
    suffix = f"_{degree}" if degree else ""
    new_name = f"{date}_{sanitize_filename(title)}{suffix}.pdf"
    new_path = os.path.join(os.path.dirname(fpath), new_name)
    if fpath != new_path:
        # os.rename can silently replace an existing file; never clobber
        # another paper that already carries this name.
        if os.path.exists(new_path):
            print(f" Skipped, target already exists: {new_name}")
            return False
        os.rename(fpath, new_path)
    print(f" -> {date}{suffix}")
    return True


async def cnki_batch_rename(target_dir, thesis_label=True, upgrade_only=False):
    """
    Batch rename PDFs in target_dir using CNKI search.

    Each pending PDF is searched by title on CNKI in a visible Chromium
    window, and the first result row with a usable date supplies the new
    name. A failure on one paper (timeout, rename error) is reported and the
    run continues with the next paper. An existing file is never overwritten.

    Args:
        target_dir: Path to directory containing PDFs (searched recursively)
        thesis_label: If True, append _硕/_博 suffix for degree papers
        upgrade_only: If True, only process papers that already have YYYYMM prefix

    Returns:
        Number of papers for which a date was found and applied.
    """
    pdfs = _collect_pending_pdfs(target_dir, upgrade_only)
    if not pdfs:
        print("All papers have precise dates. Nothing to do.")
        return 0

    total = len(pdfs)
    print(f"Papers to process: {total}")
    success = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            ctx = await browser.new_context(
                user_agent=(
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
                )
            )
            page = await ctx.new_page()

            # Visit CNKI homepage first to establish session
            await page.goto('https://www.cnki.net/', wait_until='domcontentloaded', timeout=30000)
            await page.wait_for_timeout(2000)

            for batch_start in range(0, total, BATCH_SIZE):
                batch = pdfs[batch_start:batch_start + BATCH_SIZE]
                for idx, (fpath, title) in enumerate(batch, start=batch_start + 1):
                    print(f"[{idx}/{total}] {title[:60]}")
                    try:
                        hit = await _find_result(page, title, upgrade_only)
                        if hit is None:
                            print(f" No match found on CNKI")
                        else:
                            date, db_text = hit
                            if _rename_with_date(fpath, title, date, db_text, thesis_label):
                                success += 1
                    except Exception as e:
                        # Per-paper boundary: report and move on to the next paper.
                        print(f" Error: {str(e)[:80]}")

                    # Pace between searches
                    await page.wait_for_timeout(int(SEARCH_DELAY * 1000))

                # Pause between batches
                if batch_start + BATCH_SIZE < total:
                    print(f" (batch pause {BATCH_PAUSE}s)...")
                    await page.wait_for_timeout(BATCH_PAUSE * 1000)
        finally:
            # Close the browser even if session setup or pacing raised.
            await browser.close()

    print(f"Done. Renamed: {success}/{total}")
    return success
# CLI entry point
if __name__ == "__main__":
    import sys

    # First CLI argument is the directory to process; default to the
    # current working directory when none is given.
    cli_args = sys.argv[1:]
    target = cli_args[0] if cli_args else os.getcwd()
    asyncio.run(cnki_batch_rename(target))
python cnki_batch_rename.py "/path/to/papers"
# In the script, set upgrade_only=True and call:
asyncio.run(cnki_batch_rename("/path/to/papers", upgrade_only=True))
CNKI uses Tencent slider CAPTCHA. The script handles it by:
Checking #tcaptcha_transform_dy visibility; if triggered, it pauses 30s for a manual solve.
If CAPTCHA appears frequently, increase SEARCH_DELAY to 3-5 seconds and reduce BATCH_SIZE to 8-10.
The skill scripts are stored as standalone Python files that agents can read and execute.
When invoked, the agent should:
python -c "from playwright.async_api import async_playwright; print('OK')"

| Element | Selector | Notes |
|---|---|---|
| Search input | input.search-input | id=txt_search |
| Search button | input.search-btn | type="button" |
| Result rows | .result-table-list tbody tr | Each row = one paper |
| Title link | td.name a.fz14 | Paper title |
| Date | td.date | Format: "2024-03-15" or "2024年3期" |
| Database type | td.data | "期刊", "硕士", "博士" |
| CAPTCHA | #tcaptcha_transform_dy | Visible when getBoundingClientRect().top >= 0 |
.result-table-list
tbody
tr
td.seq # Row number
td.name
a.fz14 # Title + detail URL
td.author
a.KnowledgeNetLink # Author names
td.source
a # Journal name
td.date # "2025-01-15" or "2025年1期"
td.data # "期刊" | "硕士" | "博士"
td.quote # Citation count
td.download # Download count
共 1 个版本