Files
2026-06-19 17:31:51 +02:00

213 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""
split_titlepage.py
Splits a Quarto-manuscript-rendered .docx into two files:
- <output>-titlepage.docx : title, authors, affiliations (front matter)
- <output>-body.docx : everything from the first section break onward
Why this works:
Quarto's manuscript/scholarly docx title-block template emits a section
break (<w:sectPr> inside a paragraph's <w:pPr>) immediately after the
title-block content, before the body begins. This script finds that first
section break in word/document.xml and splits there.
Safety check:
Before writing output, the script verifies that the "title page" portion
actually contains the document's title and at least one author name.
Both are passed explicitly on the command line (--title, --author/--authors);
the script does not read the source .qmd itself.
If this check fails, the script aborts with a clear error rather than
silently producing a wrong split - getting this wrong has real
deanonymization consequences for blind peer review.
This script is self-contained (standard library only: zipfile, re, shutil)
and does NOT depend on any external docx-editing toolkit. It works directly
with the docx ZIP container, replacing only word/document.xml in each of
two copies of the original archive.
Usage:
python split_titlepage.py INPUT.docx OUTDIR \
--title "Title" --author "Jane Doe" [--author "John Q. Doe" ...] \
[--authors "Jane Doe, John Q. Doe"] [--basename NAME] [--allow-no-author-match]
Exit codes:
0 success
1 split point not found
2 safety check failed (title/author not found in detected title page)
3 other error (bad args, file not found)
"""
import argparse
import html
import re
import shutil
import sys
import zipfile
from pathlib import Path
# Matches one whole <w:p>...</w:p> paragraph that carries a <w:sectPr>
# element. The tempered "(?!</w:p>)." step stops the lazy scan for
# <w:sectPr> from running past the closing tag of the paragraph it started in.
SECTPR_PATTERN = re.compile(
    r'<w:p\b[^>]*>(?:(?!</w:p>).)*?<w:sectPr\b.*?</w:sectPr>.*?</w:p>',
    re.DOTALL,
)
# Note on the trailing sectPr: a <w:sectPr> that sits directly under
# <w:body> (outside any paragraph, as the body's last element) describes the
# final/only section of the document. It is not an explicit section break and
# must not be used as a split point. Only a sectPr found inside a paragraph
# marks a break that occurs before the end of the document, and that is the
# only kind this pattern can match.


def find_first_section_break(document_xml: str) -> "tuple[int, int] | None":
    """
    Locate the first paragraph that contains a section break.

    Returns the (start, end) character offsets of the SECTPR_PATTERN match
    within document_xml, or None when the pattern does not match anywhere.
    """
    hit = SECTPR_PATTERN.search(document_xml)
    return hit.span() if hit else None
def strip_tags_for_text_check(xml_fragment: str) -> str:
    """Reduce an XML fragment to plain text for a containment check.

    Every tag is replaced by a space, XML character/entity references
    (``&amp;``, ``&lt;``, ``&#8212;`` ...) are decoded, and runs of whitespace
    are collapsed to a single space with leading/trailing whitespace removed.

    This is deliberately crude: text that is split across tags in the middle
    of a word comes out with a space inserted at the split.
    """
    # Strip tags first so that an escaped "&lt;b&gt;" in the text is not
    # mistaken for markup once it has been decoded.
    text = re.sub(r'<[^>]+>', ' ', xml_fragment)
    # Decode entities so a title like "Tom & Jerry" (stored as "Tom &amp; Jerry")
    # can be matched against the plain string given on the command line.
    text = html.unescape(text)
    # Collapse whitespace last: decoding may itself yield whitespace characters.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def read_document_xml(docx_path: Path) -> str:
    """Return the text of word/document.xml from the docx at docx_path.

    The archive member is read as bytes and decoded as UTF-8. The zipfile
    module raises KeyError when the archive has no word/document.xml member.
    """
    with zipfile.ZipFile(docx_path, "r") as archive:
        raw_bytes = archive.read("word/document.xml")
    return raw_bytes.decode("utf-8")
def write_docx_with_replaced_document_xml(
    source_docx: Path, new_document_xml: str, dest_docx: Path
) -> None:
    """Write dest_docx as a copy of source_docx with a new word/document.xml.

    Every archive member is copied across in its original order using its
    original ZipInfo record; only the payload of word/document.xml is swapped
    for new_document_xml encoded as UTF-8.
    """
    with zipfile.ZipFile(source_docx, "r") as reader, \
            zipfile.ZipFile(dest_docx, "w", zipfile.ZIP_DEFLATED) as writer:
        for entry in reader.infolist():
            # The original payload is always read, even for the member that
            # is about to be replaced.
            payload = reader.read(entry.filename)
            is_main_document = entry.filename == "word/document.xml"
            if is_main_document:
                payload = new_document_xml.encode("utf-8")
            writer.writestr(entry, payload)
def main():
    """Command-line entry point: split a rendered .docx into title page and body.

    Parses the CLI arguments, finds the first in-paragraph section break in
    word/document.xml, checks that the text up to and including that break
    contains the expected title (and at least one expected author, when any
    were supplied), then writes ``<basename>-titlepage.docx`` and
    ``<basename>-body.docx`` into the output directory.

    Terminates via ``sys.exit`` with:
        1  no section break found in the document body
        2  safety check failed
        3  input missing or unreadable, or <w:body> could not be located
    Returns normally on success.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_docx", type=Path, help="Rendered anonymized .docx")
    parser.add_argument("outdir", type=Path, help="Directory to write split output into")
    parser.add_argument("--title", required=True, help="Document title, for the safety check")
    parser.add_argument(
        "--author", action="append", default=[],
        help="Author name for the safety check; repeat flag for multiple authors"
    )
    parser.add_argument(
        "--authors", default=None,
        help="Comma-separated author names for the safety check "
             "(simpler alternative to repeating --author from a Taskfile)"
    )
    parser.add_argument(
        "--basename", default=None,
        help="Base name for output files (default: input filename stem)"
    )
    parser.add_argument(
        "--allow-no-author-match", action="store_true",
        help="Downgrade the author-match safety check from fatal to a warning"
    )
    args = parser.parse_args()

    # Merge repeated --author flags with the comma-separated --authors list.
    authors = list(args.author)
    if args.authors:
        authors.extend(a.strip() for a in args.authors.split(",") if a.strip())

    # is_file() (rather than exists()) also rejects a directory up front.
    if not args.input_docx.is_file():
        print(f"ERROR: input file not found: {args.input_docx}", file=sys.stderr)
        sys.exit(3)

    basename = args.basename or args.input_docx.stem
    args.outdir.mkdir(parents=True, exist_ok=True)
    print(f"Splitting title page from: {args.input_docx}")

    try:
        xml = read_document_xml(args.input_docx)
    except KeyError:
        print("ERROR: word/document.xml not found inside the docx - is this a valid .docx?",
              file=sys.stderr)
        sys.exit(3)
    except zipfile.BadZipFile as err:
        # A file that is not a zip archive at all is an input error, not a crash.
        print(f"ERROR: {args.input_docx} is not a valid .docx (zip) archive: {err}",
              file=sys.stderr)
        sys.exit(3)
    except UnicodeDecodeError as err:
        print(f"ERROR: word/document.xml is not valid UTF-8: {err}", file=sys.stderr)
        sys.exit(3)

    body_match = re.search(r"<w:body\b[^>]*>(.*)</w:body>", xml, re.DOTALL)
    if not body_match:
        print("ERROR: could not locate <w:body> in document.xml", file=sys.stderr)
        sys.exit(3)
    body_start, body_end = body_match.span(1)
    body_content = xml[body_start:body_end]

    split = find_first_section_break(body_content)
    if split is None:
        print(
            "ERROR: no section break found in document body. "
            "Expected Quarto's manuscript docx template to emit a "
            "section break after the title block. Aborting split — "
            "check that the source document actually came from the "
            "manuscript title-block template.",
            file=sys.stderr,
        )
        sys.exit(1)

    # The title page keeps the section-break paragraph itself; the body is
    # everything after it.
    _, split_end = split
    titlepage_fragment = body_content[:split_end]
    body_fragment = body_content[split_end:]

    # --- Safety check ---
    titlepage_text = strip_tags_for_text_check(titlepage_fragment)

    def collapse_ws(value: str) -> str:
        # The extracted text has whitespace runs collapsed to single spaces,
        # so the expected strings must be normalized the same way to match.
        return re.sub(r'\s+', ' ', value).strip()

    expected_title = collapse_ws(args.title)
    title_found = expected_title != "" and expected_title in titlepage_text
    authors_found = [
        name for name in (collapse_ws(a) for a in authors)
        if name and name in titlepage_text
    ]
    authors_ok = not authors or bool(authors_found)

    print(f" Detected title page text (first 200 chars): {titlepage_text[:200]!r}")
    if not authors:
        print("WARNING: no --author/--authors given; the safety check can only "
              "verify the title.", file=sys.stderr)

    problems = []
    if not title_found:
        problems.append(f"title {args.title!r} not found in detected title-page text")
    if not authors_ok:
        problems.append(
            f"none of the expected authors {authors!r} found in detected title-page text"
        )

    if problems:
        msg = (
            "SAFETY CHECK FAILED: the detected 'title page' section does not "
            "appear to contain the expected title/author metadata:\n - "
            + "\n - ".join(problems)
            + "\nThis usually means the section-break detection found the "
            "wrong split point. Refusing to write output to avoid a silent "
            "deanonymization risk."
        )
        # --allow-no-author-match covers the author check only: a missing
        # title stays fatal even when the flag is set.
        if args.allow_no_author_match and title_found:
            print(f"WARNING: {msg}\n(Continuing anyway because --allow-no-author-match was set.)",
                  file=sys.stderr)
        else:
            print(f"ERROR: {msg}", file=sys.stderr)
            sys.exit(2)
    else:
        print(f" Safety check passed (title found: {title_found}, authors found: {authors_found})")

    def build_full_xml(body_inner: str) -> str:
        # Re-wrap a body fragment in everything outside the original body content.
        return xml[:body_start] + body_inner + xml[body_end:]

    titlepage_out = args.outdir / f"{basename}-titlepage.docx"
    body_out = args.outdir / f"{basename}-body.docx"
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(titlepage_fragment), titlepage_out
    )
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(body_fragment), body_out
    )
    print(f"Done.\n Title page -> {titlepage_out}\n Body -> {body_out}")


if __name__ == "__main__":
    main()