213 lines
8.0 KiB
Python
213 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
split_titlepage.py
|
|
|
|
Splits a Quarto-manuscript-rendered .docx into two files:
|
|
- <output>-titlepage.docx : title, authors, affiliations (front matter)
|
|
- <output>-body.docx : everything from the first section break onward
|
|
|
|
Why this works:
|
|
Quarto's manuscript/scholarly docx title-block template emits a section
|
|
break (<w:sectPr> inside a paragraph's <w:pPr>) immediately after the
|
|
title-block content, before the body begins. This script finds that first
|
|
section break in word/document.xml and splits there.
|
|
|
|
Safety check:
|
|
Before writing output, the script verifies that the "title page" portion
actually contains the document's title and, when author names are
supplied, at least one of them. Both are passed explicitly on the
command line (--title, --author / --authors).
If this check fails, the script aborts with a clear error rather than
silently producing a wrong split - getting this wrong has real
deanonymization consequences for blind peer review.
|
|
|
|
This script is self-contained (standard library only: argparse, pathlib,
re, sys, zipfile) and does NOT depend on any external docx-editing toolkit.
It works directly with the docx ZIP container, replacing only
word/document.xml in each of two copies of the original archive.
|
|
|
|
Usage:
|
|
    python split_titlepage.py INPUT.docx OUTDIR \
        --title "Title" --author "Jane Doe" [--author "John Q. Doe" ...] \
        [--authors "Jane Doe, John Q. Doe"] [--basename NAME] \
        [--allow-no-author-match]
|
|
|
|
Exit codes:
|
|
0 success
|
|
1 split point not found
|
|
2 safety check failed (title/author not found in detected title page)
|
|
3 other error (bad args, file not found)
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import shutil
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
|
|
# Matches one complete <w:p>...</w:p> paragraph that carries a section break
# (<w:sectPr>) somewhere inside it.
#
# A bare sectPr also legitimately appears as the LAST element of body,
# as a direct child of <w:body> (not inside a paragraph) -- that one
# describes the final/only section and is NOT a split point on its own.
# We only want sectPr that appears INSIDE a paragraph's pPr, which marks
# an explicit section break before the end of the document.
#
# Every "any character" step is guarded by (?!</w:p>) so the match can never
# run past the closing tag of the paragraph it started in. Without that guard
# a self-closing <w:sectPr/> would make the search for </w:sectPr> leak into a
# later paragraph and move the split point.
SECTPR_PATTERN = re.compile(
    # Opening paragraph tag; (?<!/) rejects the self-closing form <w:p/>,
    # which has no content and therefore cannot hold a section break.
    r'<w:p\b[^>]*(?<!/)>'
    # Anything inside the same paragraph that precedes the section break.
    r'(?:(?!</w:p>).)*?'
    # The section break itself: either self-closing (<w:sectPr .../>) or an
    # open/close pair whose content stays inside this paragraph.
    r'<w:sectPr\b(?:[^>]*/>|(?:(?!</w:p>).)*?</w:sectPr>)'
    # Remainder of the paragraph up to and including its closing tag.
    r'(?:(?!</w:p>).)*?'
    r'</w:p>',
    re.DOTALL,
)
|
|
|
|
|
|
def find_first_section_break(document_xml: str) -> "tuple[int, int] | None":
    """Locate the first paragraph that carries a section break.

    Searches ``document_xml`` with the module-level ``SECTPR_PATTERN``.

    Returns:
        The ``(start, end)`` character offsets of the first matching
        paragraph, or ``None`` when the pattern does not match anywhere.
    """
    hit = SECTPR_PATTERN.search(document_xml)
    return hit.span() if hit else None
|
|
|
|
|
|
def strip_tags_for_text_check(xml_fragment: str) -> str:
    """Reduce an XML fragment to plain text for a containment check.

    This is deliberately crude: it is only meant to answer "does this
    fragment visibly contain string X", not to be a faithful XML-to-text
    conversion.

    Steps, in order:
      1. Every tag is replaced by a single space, so text from neighbouring
         elements is never glued together.
      2. XML character/entity references are decoded (``&amp;`` -> ``&``,
         ``&#233;`` -> ``é``), so text is compared the way a reader sees it.
         This runs after tag removal so an escaped ``&lt;b&gt;`` in the text
         is not mistaken for markup.
      3. Whitespace runs are collapsed to one space and the ends trimmed.

    Args:
        xml_fragment: Raw XML markup.

    Returns:
        The whitespace-normalised, entity-decoded text content.
    """
    # Local import keeps this fix self-contained; the module's import block
    # does not bring in ``html``.
    from html import unescape

    text = re.sub(r'<[^>]+>', ' ', xml_fragment)
    text = unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
def read_document_xml(docx_path: Path) -> str:
    """Return the text of ``word/document.xml`` from a .docx archive.

    Args:
        docx_path: Path to the .docx (ZIP) file to open for reading.

    Returns:
        The archive member ``word/document.xml`` decoded as UTF-8.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    with zipfile.ZipFile(docx_path, mode="r") as archive:
        raw_bytes = archive.read("word/document.xml")
        return raw_bytes.decode("utf-8")
|
|
|
|
|
|
def write_docx_with_replaced_document_xml(
    source_docx: Path, new_document_xml: str, dest_docx: Path
) -> None:
    """Write a copy of ``source_docx`` whose ``word/document.xml`` is replaced.

    Every member of the source archive is written to the destination in its
    original order using its original ``ZipInfo``. The only difference is
    that the payload of ``word/document.xml`` becomes ``new_document_xml``
    encoded as UTF-8.

    Args:
        source_docx: Existing .docx archive to copy from.
        new_document_xml: Replacement content for ``word/document.xml``.
        dest_docx: Path of the archive to create (opened in write mode).
    """
    with zipfile.ZipFile(source_docx, "r") as reader, \
            zipfile.ZipFile(dest_docx, "w", zipfile.ZIP_DEFLATED) as writer:
        for entry in reader.infolist():
            # The original payload is always read, even for the member that
            # is about to be replaced.
            payload = reader.read(entry.filename)
            is_main_document = entry.filename == "word/document.xml"
            writer.writestr(
                entry,
                new_document_xml.encode("utf-8") if is_main_document else payload,
            )
|
|
|
|
|
|
def _collapse_ws(text: str) -> str:
    """Collapse each whitespace run in ``text`` to one space and trim the ends."""
    return " ".join(text.split())


def _parse_args(argv: "list[str] | None") -> argparse.Namespace:
    """Parse the command line and merge ``--author`` / ``--authors``.

    Args:
        argv: Argument list without the program name, or ``None`` to let
            argparse read ``sys.argv[1:]``.

    Returns:
        The parsed namespace; ``args.author`` holds the combined author list
        from repeated ``--author`` flags plus the comma-separated ``--authors``.

    Raises:
        SystemExit: With status 3 on a usage error, or status 0 for ``--help``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("input_docx", type=Path, help="Rendered anonymized .docx")
    parser.add_argument("outdir", type=Path, help="Directory to write split output into")
    parser.add_argument("--title", required=True, help="Document title, for the safety check")
    parser.add_argument(
        "--author", action="append", default=[],
        help="Author name for the safety check; repeat flag for multiple authors"
    )
    parser.add_argument(
        "--authors", default=None,
        help="Comma-separated author names for the safety check "
             "(simpler alternative to repeating --author from a Taskfile)"
    )
    parser.add_argument(
        "--basename", default=None,
        help="Base name for output files (default: input filename stem)"
    )
    parser.add_argument(
        "--allow-no-author-match", action="store_true",
        help="Downgrade the author-match safety check from fatal to a warning"
    )

    try:
        args = parser.parse_args(argv)
    except SystemExit as exc:
        # argparse exits with status 2 on a usage error, but this script
        # reserves 2 for "safety check failed"; bad arguments are status 3.
        if exc.code in (0, None):
            raise
        raise SystemExit(3) from None

    authors = list(args.author)
    if args.authors:
        authors.extend(name.strip() for name in args.authors.split(",") if name.strip())
    args.author = authors
    return args


def main(argv=None):
    """Split a rendered .docx into a title-page file and a body file.

    Args:
        argv: Optional argument list (without the program name). Defaults to
            ``None``, in which case ``sys.argv[1:]`` is used.

    Exit statuses (raised via ``SystemExit``; a normal return means success):
        1: no section break found in the document body.
        2: safety check failed (title/author not found in the title page).
        3: bad arguments, missing input file, or unreadable/invalid .docx.
    """
    args = _parse_args(argv)

    if not args.input_docx.exists():
        print(f"ERROR: input file not found: {args.input_docx}", file=sys.stderr)
        sys.exit(3)

    basename = args.basename or args.input_docx.stem
    args.outdir.mkdir(parents=True, exist_ok=True)

    print(f"Splitting title page from: {args.input_docx}")

    try:
        xml = read_document_xml(args.input_docx)
    except KeyError:
        print("ERROR: word/document.xml not found inside the docx - is this a valid .docx?",
              file=sys.stderr)
        sys.exit(3)
    except (zipfile.BadZipFile, UnicodeDecodeError, OSError) as exc:
        # Not a ZIP archive, not UTF-8, or not readable: report it as a
        # status-3 error instead of an uncaught traceback.
        print(f"ERROR: could not read {args.input_docx} as a .docx archive: {exc}",
              file=sys.stderr)
        sys.exit(3)

    # Greedy (.*) so the capture runs to the LAST </w:body> in the file.
    body_match = re.search(r"<w:body\b[^>]*>(.*)</w:body>", xml, re.DOTALL)
    if not body_match:
        print("ERROR: could not locate <w:body> in document.xml", file=sys.stderr)
        sys.exit(3)
    body_start, body_end = body_match.span(1)
    body_content = xml[body_start:body_end]

    split = find_first_section_break(body_content)
    if split is None:
        print(
            "ERROR: no section break found in document body. "
            "Expected Quarto's manuscript docx template to emit a "
            "section break after the title block. Aborting split — "
            "check that the source document actually came from the "
            "manuscript title-block template.",
            file=sys.stderr,
        )
        sys.exit(1)

    # The paragraph holding the section break stays with the title page;
    # everything after it becomes the body.
    _, split_end = split
    titlepage_fragment = body_content[:split_end]
    body_fragment = body_content[split_end:]

    # --- Safety check ---
    titlepage_text = strip_tags_for_text_check(titlepage_fragment)

    # Normalise the expected strings like the extracted text, so a title
    # passed with line breaks or doubled spaces still matches.
    expected_title = _collapse_ws(args.title)
    title_found = expected_title != "" and expected_title in titlepage_text
    authors_found = [
        a for a in args.author
        if _collapse_ws(a) and _collapse_ws(a) in titlepage_text
    ]

    print(f" Detected title page text (first 200 chars): {titlepage_text[:200]!r}")

    if not args.author:
        # With no names to look for, only the title can be verified.
        print("WARNING: no --author/--authors given; the safety check can only verify the title.",
              file=sys.stderr)

    problems = []
    fatal = False
    if not title_found:
        problems.append(f"title {args.title!r} not found in detected title-page text")
        # --allow-no-author-match covers the author check only; a missing
        # title always aborts.
        fatal = True
    if args.author and not authors_found:
        problems.append(
            f"none of the expected authors {args.author!r} found in detected title-page text"
        )
        if not args.allow_no_author_match:
            fatal = True

    if problems:
        msg = (
            "SAFETY CHECK FAILED: the detected 'title page' section does not "
            "appear to contain the expected title/author metadata:\n - "
            + "\n - ".join(problems)
            + "\nThis usually means the section-break detection found the "
            "wrong split point. Refusing to write output to avoid a silent "
            "deanonymization risk."
        )
        if fatal:
            print(f"ERROR: {msg}", file=sys.stderr)
            sys.exit(2)
        print(f"WARNING: {msg}\n(Continuing anyway because --allow-no-author-match was set.)",
              file=sys.stderr)
    else:
        print(f" Safety check passed (title found: {title_found}, authors found: {authors_found})")

    def build_full_xml(body_inner: str) -> str:
        # Re-wrap a body fragment in the original XML before/after <w:body>.
        return xml[:body_start] + body_inner + xml[body_end:]

    titlepage_out = args.outdir / f"{basename}-titlepage.docx"
    body_out = args.outdir / f"{basename}-body.docx"

    # NOTE(review): every archive member other than word/document.xml is
    # copied unchanged into BOTH outputs, so parts such as docProps/core.xml
    # are not scrubbed here - confirm the input carries no identifying
    # metadata outside the document body.
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(titlepage_fragment), titlepage_out
    )
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(body_fragment), body_out
    )

    print(f"Done.\n Title page -> {titlepage_out}\n Body -> {body_out}")


if __name__ == "__main__":
    main()