#!/usr/bin/env python3
"""
split_titlepage.py

Splits a Quarto-manuscript-rendered .docx into two files:
  - <basename>-titlepage.docx : title, authors, affiliations (front matter)
  - <basename>-body.docx      : everything after the first section break

Why this works:
  Quarto's manuscript/scholarly docx title-block template emits a section
  break (<w:sectPr> inside a paragraph's <w:pPr>) immediately after the
  title-block content, before the body begins. This script finds that first
  section break in word/document.xml and splits there.

Safety check:
  Before writing output, the script verifies the "title page" portion
  actually contains the document's title and at least one of the expected
  author names. Both are passed explicitly on the command line (typically by
  a caller that reads them from the source .qmd YAML front matter). If this
  check fails, the script aborts with a clear error rather than silently
  producing a wrong split - getting this wrong has real deanonymization
  consequences for blind peer review.

This script is self-contained (standard library only) and does NOT depend on
any external docx-editing toolkit. It works directly with the docx ZIP
container, replacing only word/document.xml in each of two copies of the
original archive.

Usage:
    python split_titlepage.py INPUT.docx OUTDIR \\
        --title "Title" --author "Jane Doe" [--author "John Q. Doe" ...]

Exit codes:
    0  success
    1  split point not found
    2  safety check failed (title/author not found in detected title page)
    3  other error (bad args, file not found, not a valid .docx)
"""

import argparse
import html
import re
import shutil
import sys
import zipfile
from pathlib import Path

# Matches one complete paragraph (<w:p ...> ... </w:p>) that contains a
# <w:sectPr> element.
#
# A bare sectPr also legitimately appears as the LAST element of body,
# as a direct child of <w:body> (not inside a paragraph) -- that one
# describes the final/only section and is NOT a split point on its own.
# We only want sectPr that appears INSIDE a paragraph's pPr, which marks
# an explicit section break before the end of the document.
SECTPR_PATTERN = re.compile(
    # Opening <w:p> tag. The lookahead keeps <w:pPr>, <w:pStyle>, ... from
    # matching; the lookbehind rejects a self-closing <w:p .../>.
    r'<w:p(?=[\s>])[^>]*(?<!/)>'
    # Paragraph content, never crossing this paragraph's closing tag ...
    r'(?:(?!</w:p>).)*?'
    # ... up to a section-properties element (but not e.g. <w:sectPrChange>)
    r'<w:sectPr\b'
    # ... and then on to the end of the same paragraph.
    r'.*?</w:p>',
    re.DOTALL,
)

# Locates the inner content of <w:body>. The greedy group runs to the LAST
# closing tag, so the whole body is captured.
_BODY_PATTERN = re.compile(r'<w:body\b[^>]*>(.*)</w:body>', re.DOTALL)


def _collapse_whitespace(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()


def _fail(message: str, code: int) -> None:
    """Print *message* to stderr and terminate with exit status *code*."""
    print(message, file=sys.stderr)
    sys.exit(code)


def find_first_section_break(document_xml: str) -> "tuple[int, int] | None":
    """
    Returns (start, end) character offsets of the first paragraph
    containing a section break, or None if not found.
    """
    match = SECTPR_PATTERN.search(document_xml)
    if match is None:
        return None
    return match.span()


def strip_tags_for_text_check(xml_fragment: str) -> str:
    """
    Crude tag stripping for a plain-text containment check.

    Every tag is replaced by a space, XML character/entity references are
    decoded (so "&amp;" in the XML compares equal to "&" in a title given on
    the command line), and whitespace runs are collapsed to single spaces.
    """
    text = re.sub(r'<[^>]+>', ' ', xml_fragment)
    # Decode AFTER removing tags so an escaped "&lt;" is never taken for markup.
    text = html.unescape(text)
    return _collapse_whitespace(text)


def read_document_xml(docx_path: Path) -> str:
    """
    Return word/document.xml from the docx at *docx_path*, decoded as UTF-8.

    Raises KeyError if the archive has no word/document.xml member and
    zipfile.BadZipFile if the file is not a ZIP archive.
    """
    with zipfile.ZipFile(docx_path, "r") as z:
        return z.read("word/document.xml").decode("utf-8")


def write_docx_with_replaced_document_xml(
    source_docx: Path, new_document_xml: str, dest_docx: Path
) -> None:
    """Copy source_docx to dest_docx, replacing only word/document.xml."""
    with zipfile.ZipFile(source_docx, "r") as src:
        with zipfile.ZipFile(dest_docx, "w", zipfile.ZIP_DEFLATED) as dst:
            for item in src.infolist():
                if item.filename == "word/document.xml":
                    data = new_document_xml.encode("utf-8")
                else:
                    data = src.read(item.filename)
                # Passing the original ZipInfo keeps each member's name and
                # metadata as they were in the source archive.
                dst.writestr(item, data)


def main(argv: "list[str] | None" = None) -> None:
    """
    Command-line entry point.

    argv is the list of command-line arguments (without the program name);
    when None, argparse reads sys.argv[1:]. Returns normally on success and
    exits via SystemExit with code 1, 2 or 3 on failure (see the module
    docstring for the meaning of each code).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage / exit-code tables of the module docstring intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("input_docx", type=Path, help="Rendered anonymized .docx")
    parser.add_argument("outdir", type=Path, help="Directory to write split output into")
    parser.add_argument("--title", required=True, help="Document title, for the safety check")
    parser.add_argument(
        "--author", action="append", default=[],
        help="Author name for the safety check; repeat flag for multiple authors"
    )
    parser.add_argument(
        "--authors", default=None,
        help="Comma-separated author names for the safety check "
             "(simpler alternative to repeating --author from a Taskfile)"
    )
    parser.add_argument(
        "--basename", default=None,
        help="Base name for output files (default: input filename stem)"
    )
    parser.add_argument(
        "--allow-no-author-match", action="store_true",
        help="Downgrade the author-match safety check from fatal to a warning"
    )
    args = parser.parse_args(argv)

    # Merge both ways of passing authors into one list.
    authors = list(args.author)
    if args.authors:
        authors.extend(a.strip() for a in args.authors.split(",") if a.strip())
    args.author = authors

    if not args.input_docx.is_file():
        _fail(f"ERROR: input file not found: {args.input_docx}", 3)

    basename = args.basename or args.input_docx.stem

    print(f"Splitting title page from: {args.input_docx}")

    try:
        xml = read_document_xml(args.input_docx)
    except KeyError:
        _fail(
            "ERROR: word/document.xml not found inside the docx - is this a valid .docx?",
            3,
        )
    except zipfile.BadZipFile:
        _fail(
            f"ERROR: {args.input_docx} is not a ZIP archive - is this a valid .docx?",
            3,
        )

    body_match = _BODY_PATTERN.search(xml)
    if not body_match:
        _fail("ERROR: could not locate <w:body> in document.xml", 3)

    body_start, body_end = body_match.span(1)
    body_content = xml[body_start:body_end]

    split = find_first_section_break(body_content)
    if split is None:
        _fail(
            "ERROR: no section break found in document body. "
            "Expected Quarto's manuscript docx template to emit a "
            "section break after the title block. Aborting split — "
            "check that the source document actually came from the "
            "manuscript title-block template.",
            1,
        )

    # The section-break paragraph itself stays with the title page.
    _, split_end = split
    titlepage_fragment = body_content[:split_end]
    body_fragment = body_content[split_end:]

    # --- Safety check ---
    titlepage_text = strip_tags_for_text_check(titlepage_fragment)
    # Normalize the expected strings the same way as the extracted text so
    # stray double spaces or line breaks cannot cause a false mismatch.
    expected_title = _collapse_whitespace(args.title)
    title_found = expected_title != "" and expected_title in titlepage_text
    expected_authors = [_collapse_whitespace(a) for a in args.author]
    authors_found = [a for a in expected_authors if a and a in titlepage_text]

    print(f"  Detected title page text (first 200 chars): {titlepage_text[:200]!r}")

    problems = []
    if not title_found:
        problems.append(f"title {args.title!r} not found in detected title-page text")
    if args.author and not authors_found:
        problems.append(
            f"none of the expected authors {args.author!r} found in detected title-page text"
        )

    if problems:
        msg = (
            "SAFETY CHECK FAILED: the detected 'title page' section does not "
            "appear to contain the expected title/author metadata:\n  - "
            + "\n  - ".join(problems)
            + "\nThis usually means the section-break detection found the "
            "wrong split point. Refusing to write output to avoid a silent "
            "deanonymization risk."
        )
        # The override is documented as covering the AUTHOR check only, so a
        # missing title stays fatal even when the flag is given.
        if args.allow_no_author_match and title_found:
            print(
                f"WARNING: {msg}\n(Continuing anyway because --allow-no-author-match was set.)",
                file=sys.stderr,
            )
        else:
            _fail(f"ERROR: {msg}", 2)
    else:
        print(f"  Safety check passed (title found: {title_found}, authors found: {authors_found})")

    def build_full_xml(body_inner: str) -> str:
        # Re-wrap a body fragment in the original document prologue/epilogue.
        return xml[:body_start] + body_inner + xml[body_end:]

    # Create the output directory only once every check has passed.
    args.outdir.mkdir(parents=True, exist_ok=True)
    titlepage_out = args.outdir / f"{basename}-titlepage.docx"
    body_out = args.outdir / f"{basename}-body.docx"

    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(titlepage_fragment), titlepage_out
    )
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(body_fragment), body_out
    )

    print(f"Done.\n  Title page -> {titlepage_out}\n  Body       -> {body_out}")


if __name__ == "__main__":
    main()