initial commit

2026-06-19 17:31:51 +02:00
commit ef4936b10c
49 changed files with 4554 additions and 0 deletions
@@ -0,0 +1,213 @@
+#!/usr/bin/env python3
+"""
+split_titlepage.py
+
+Splits a Quarto-manuscript-rendered .docx into two files:
+  - <output>-titlepage.docx : title, authors, affiliations (front matter)
+  - <output>-body.docx      : everything from the first section break onward
+
+Why this works:
+Quarto's manuscript/scholarly docx title-block template emits a section
+break (<w:sectPr> inside a paragraph's <w:pPr>) immediately after the
+title-block content, before the body begins. This script finds that first
+section break in word/document.xml and splits there.
+
+Safety check:
+Before writing output, the script verifies the "title page" portion
+actually contains the document's title and at least one author name
+(read from the source .qmd YAML front matter, or passed explicitly).
+If this check fails, the script aborts with a clear error rather than
+silently producing a wrong split - getting this wrong has real
+deanonymization consequences for blind peer review.
+
+This script is self-contained (standard library only: zipfile, re, shutil)
+and does NOT depend on any external docx-editing toolkit. It works directly
+with the docx ZIP container, replacing only word/document.xml in each of
+two copies of the original archive.
+
+Usage:
+    python split_titlepage.py INPUT.docx OUTDIR \
+        --title "Title" --author "Jane Doe" [--author "John Q. Doe" ...]
+
+Exit codes:
+    0  success
+    1  split point not found
+    2  safety check failed (title/author not found in detected title page)
+    3  other error (bad args, file not found)
+"""
+
+import argparse
+import re
+import shutil
+import sys
+import zipfile
+from pathlib import Path
+
+
+# Matches one complete <w:p>...</w:p> paragraph that carries a section break,
+# i.e. a <w:sectPr> nested INSIDE the paragraph (in its <w:pPr>).
+#
+# A bare sectPr also legitimately appears as the LAST element of body,
+# as a direct child of <w:body> (not inside a paragraph) -- that one
+# describes the final/only section and is NOT a split point on its own.
+# We only want sectPr that appears INSIDE a paragraph's pPr, which marks
+# an explicit section break before the end of the document.
+#
+# Pattern walk-through:
+#   <w:p\b[^>]*>        paragraph start tag (\b keeps <w:pPr> etc. out)
+#   (?:(?!</w:p>).)*?   any text that does not leave this paragraph
+#   <w:sectPr\b         start of the section properties, followed by either
+#     [^>]*/>             a self-closing element (<w:sectPr/>), or
+#     .*?</w:sectPr>      content up to the matching end tag
+#   .*?</w:p>           rest of the paragraph up to its end tag
+#
+# The self-closing alternative matters: without it, the lazy ".*?" would run
+# past the end of the paragraph looking for a later </w:sectPr>.
+SECTPR_PATTERN = re.compile(
+    r'<w:p\b[^>]*>(?:(?!</w:p>).)*?<w:sectPr\b(?:[^>]*/>|.*?</w:sectPr>).*?</w:p>',
+    re.DOTALL,
+)
+
+
+def find_first_section_break(document_xml: str) -> "tuple[int, int] | None":
+    """
+    Locate the first paragraph that carries a section break.
+
+    Searches ``document_xml`` with ``SECTPR_PATTERN`` and returns the
+    ``(start, end)`` character offsets of the first match, or ``None``
+    when the pattern does not match anywhere in the text.
+    """
+    hit = SECTPR_PATTERN.search(document_xml)
+    return hit.span() if hit else None
+
+
+def strip_tags_for_text_check(xml_fragment: str) -> str:
+    """
+    Reduce an XML fragment to plain text for a substring containment check.
+
+    Every tag is replaced by a space, XML character/entity references
+    (``&amp;``, ``&lt;``, ``&#8217;`` ...) are decoded, and runs of
+    whitespace are collapsed to a single space. Without the decoding step
+    a title such as ``R&D`` (stored as ``R&amp;D`` in the XML) could never
+    be found in the result.
+
+    This is deliberately crude: it is not an XML parser and is only meant
+    to feed the title/author safety check.
+    """
+    # Function-scope import keeps this block self-contained.
+    from html import unescape
+
+    text = re.sub(r'<[^>]+>', ' ', xml_fragment)
+    # Decode only AFTER the tags are gone, so an escaped "&lt;...&gt;" in the
+    # document text is not mistaken for markup and stripped.
+    text = unescape(text)
+    return re.sub(r'\s+', ' ', text).strip()
+
+
+def read_document_xml(docx_path: Path) -> str:
+    """Return the UTF-8 decoded contents of ``word/document.xml`` in ``docx_path``."""
+    with zipfile.ZipFile(docx_path, "r") as archive:
+        raw_bytes = archive.read("word/document.xml")
+    return raw_bytes.decode("utf-8")
+
+
+def write_docx_with_replaced_document_xml(
+    source_docx: Path, new_document_xml: str, dest_docx: Path
+) -> None:
+    """
+    Write a copy of ``source_docx`` to ``dest_docx`` with a new main part.
+
+    Every archive member is re-written using its original ZipInfo entry;
+    only the payload of ``word/document.xml`` is swapped for
+    ``new_document_xml`` (encoded as UTF-8).
+
+    NOTE(review): all other members are copied unchanged, so anything they
+    contain (e.g. document properties) ends up in the copy as well - confirm
+    that is acceptable for the anonymized body file.
+    """
+    with zipfile.ZipFile(source_docx, "r") as reader, \
+            zipfile.ZipFile(dest_docx, "w", zipfile.ZIP_DEFLATED) as writer:
+        for entry in reader.infolist():
+            original_bytes = reader.read(entry.filename)
+            is_main_part = entry.filename == "word/document.xml"
+            writer.writestr(
+                entry,
+                new_document_xml.encode("utf-8") if is_main_part else original_bytes,
+            )
+
+
+def main():
+    """
+    Command-line entry point: split INPUT.docx into title page and body.
+
+    Steps:
+      1. Parse arguments and merge --author / --authors into one list.
+      2. Load word/document.xml and isolate the content of <w:body>.
+      3. Find the first paragraph-level section break; everything up to and
+         including that paragraph is the title page, the rest is the body.
+      4. Safety check: the title-page text must contain the title and, when
+         authors were supplied, at least one of them.
+      5. Write <basename>-titlepage.docx and <basename>-body.docx to OUTDIR.
+
+    Exits through sys.exit with:
+      1  no section break found
+      2  safety check failed
+      3  input missing, not a readable .docx, or no <w:body> found
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("input_docx", type=Path, help="Rendered anonymized .docx")
+    parser.add_argument("outdir", type=Path, help="Directory to write split output into")
+    parser.add_argument("--title", required=True, help="Document title, for the safety check")
+    parser.add_argument(
+        "--author", action="append", default=[],
+        help="Author name for the safety check; repeat flag for multiple authors"
+    )
+    parser.add_argument(
+        "--authors", default=None,
+        help="Comma-separated author names for the safety check "
+             "(simpler alternative to repeating --author from a Taskfile)"
+    )
+    parser.add_argument(
+        "--basename", default=None,
+        help="Base name for output files (default: input filename stem)"
+    )
+    parser.add_argument(
+        "--allow-no-author-match", action="store_true",
+        help="Downgrade the author-match safety check from fatal to a warning"
+    )
+    args = parser.parse_args()
+
+    # Merge repeated --author flags with the comma-separated --authors list.
+    authors = list(args.author)
+    if args.authors:
+        authors.extend(a.strip() for a in args.authors.split(",") if a.strip())
+    args.author = authors
+
+    # is_file() (rather than exists()) also rejects a directory path.
+    if not args.input_docx.is_file():
+        print(f"ERROR: input file not found: {args.input_docx}", file=sys.stderr)
+        sys.exit(3)
+
+    basename = args.basename or args.input_docx.stem
+
+    print(f"Splitting title page from: {args.input_docx}")
+
+    try:
+        xml = read_document_xml(args.input_docx)
+    except KeyError:
+        print("ERROR: word/document.xml not found inside the docx - is this a valid .docx?",
+              file=sys.stderr)
+        sys.exit(3)
+    except zipfile.BadZipFile:
+        # A file that is not a ZIP container at all (e.g. a renamed .doc).
+        print(f"ERROR: {args.input_docx} is not a ZIP archive - is this a valid .docx?",
+              file=sys.stderr)
+        sys.exit(3)
+
+    body_match = re.search(r"<w:body\b[^>]*>(.*)</w:body>", xml, re.DOTALL)
+    if not body_match:
+        print("ERROR: could not locate <w:body> in document.xml", file=sys.stderr)
+        sys.exit(3)
+    body_start, body_end = body_match.span(1)
+    body_content = xml[body_start:body_end]
+
+    split = find_first_section_break(body_content)
+    if split is None:
+        print(
+            "ERROR: no section break found in document body. "
+            "Expected Quarto's manuscript docx template to emit a "
+            "section break after the title block. Aborting split — "
+            "check that the source document actually came from the "
+            "manuscript title-block template.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # The paragraph holding the section break stays with the title page.
+    _, split_end = split
+    titlepage_fragment = body_content[:split_end]
+    body_fragment = body_content[split_end:]
+
+    # --- Safety check ---
+    titlepage_text = strip_tags_for_text_check(titlepage_fragment)
+
+    def _collapse(value: str) -> str:
+        # Same whitespace normalisation the title-page text went through, so
+        # a needle with a newline or double space can still match.
+        return re.sub(r"\s+", " ", value).strip()
+
+    expected_title = _collapse(args.title)
+    title_found = expected_title != "" and expected_title in titlepage_text
+    authors_found = [
+        a for a in args.author if _collapse(a) and _collapse(a) in titlepage_text
+    ]
+
+    print(f"  Detected title page text (first 200 chars): {titlepage_text[:200]!r}")
+
+    problems = []
+    if not title_found:
+        problems.append(f"title {args.title!r} not found in detected title-page text")
+    if args.author and not authors_found:
+        problems.append(
+            f"none of the expected authors {args.author!r} found in detected title-page text"
+        )
+
+    if problems:
+        msg = (
+            "SAFETY CHECK FAILED: the detected 'title page' section does not "
+            "appear to contain the expected title/author metadata:\n  - "
+            + "\n  - ".join(problems)
+            + "\nThis usually means the section-break detection found the "
+            "wrong split point. Refusing to write output to avoid a silent "
+            "deanonymization risk."
+        )
+        # --allow-no-author-match only covers the author check: a missing
+        # title stays fatal even when the flag is set.
+        if args.allow_no_author_match and title_found:
+            print(f"WARNING: {msg}\n(Continuing anyway because --allow-no-author-match was set.)",
+                  file=sys.stderr)
+        else:
+            print(f"ERROR: {msg}", file=sys.stderr)
+            sys.exit(2)
+    else:
+        print(f"  Safety check passed (title found: {title_found}, authors found: {authors_found})")
+
+    def build_full_xml(body_inner: str) -> str:
+        # Re-wrap a body fragment in the original prolog/epilog of document.xml.
+        return xml[:body_start] + body_inner + xml[body_end:]
+
+    # Create OUTDIR only now, so an aborted run leaves no empty directory.
+    args.outdir.mkdir(parents=True, exist_ok=True)
+    titlepage_out = args.outdir / f"{basename}-titlepage.docx"
+    body_out = args.outdir / f"{basename}-body.docx"
+
+    write_docx_with_replaced_document_xml(
+        args.input_docx, build_full_xml(titlepage_fragment), titlepage_out
+    )
+    write_docx_with_replaced_document_xml(
+        args.input_docx, build_full_xml(body_fragment), body_out
+    )
+
+    print(f"Done.\n  Title page -> {titlepage_out}\n  Body       -> {body_out}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+"""
+task_utils.py — cross-platform file operations for the Taskfile pipeline.
+
+Replaces shell-specific commands (PowerShell Compress-Archive, Remove-Item,
+etc.) with plain Python stdlib calls that behave identically on Windows,
+Linux, and macOS. Called from Task as:
+
+    python ../resources/scripts/task_utils.py <subcommand> [args...]
+
+Subcommands:
+    zip SRC_DIR DEST_ZIP
+        Zip the contents of SRC_DIR into DEST_ZIP (overwrites if it exists).
+        Fails with a clear message (exit 1) if SRC_DIR doesn't exist.
+
+    zip-if-exists SRC_DIR DEST_ZIP
+        Same as `zip`, but exits 0 with a warning (no error) if SRC_DIR
+        doesn't exist, instead of failing. Used for optional things like
+        a project's data/ folder.
+
+    clean-project PROJECT_DIR
+        Remove _output/, .quarto/, and any *_files/*_cache directories
+        found anywhere under PROJECT_DIR. Safe to call even if nothing
+        exists yet.
+
+    copy-if-exists SRC DEST
+        Copy a single file from SRC to DEST if SRC exists; otherwise
+        print a warning and exit 0 (does not fail the pipeline).
+
+    today
+        Print today's date as YYYY-MM-DD (used for the finalized/ folder
+        name). No platform-specific date command needed.
+"""
+
+import shutil
+import sys
+import zipfile
+from datetime import date
+from pathlib import Path
+
+
+def cmd_zip(src_dir: str, dest_zip: str, allow_missing: bool) -> int:
+    """
+    Zip every file found under ``src_dir`` into ``dest_zip``.
+
+    Member names inside the archive are paths relative to ``src_dir``.
+    An existing ``dest_zip`` is replaced, and its parent directory is
+    created when needed.
+
+    Args:
+        src_dir: directory whose files are archived recursively.
+        dest_zip: path of the archive to create.
+        allow_missing: when true, a missing or empty ``src_dir`` only
+            prints a warning and returns 0; otherwise an error is printed
+            to stderr and 1 is returned.
+
+    Returns:
+        0 on success (or a tolerated skip), 1 on error.
+    """
+    src = Path(src_dir)
+    dest = Path(dest_zip)
+
+    # is_dir() is already False for a path that does not exist.
+    if not src.is_dir():
+        msg = f"'{src}' does not exist or is not a directory"
+        if allow_missing:
+            print(f"WARNING: {msg} — skipping zip of {dest}")
+            return 0
+        print(f"ERROR: {msg} — did you render first?", file=sys.stderr)
+        return 1
+
+    # If the archive lives inside src_dir, a previous copy of it must not be
+    # packed into the new archive (it is deleted and re-created below).
+    # Sorting makes the member order independent of directory listing order.
+    dest_resolved = dest.resolve()
+    files = sorted(
+        p for p in src.rglob("*")
+        if p.is_file() and p.resolve() != dest_resolved
+    )
+    if not files:
+        msg = f"'{src}' exists but contains no files"
+        if allow_missing:
+            print(f"WARNING: {msg} — skipping zip of {dest}")
+            return 0
+        print(f"ERROR: {msg}", file=sys.stderr)
+        return 1
+
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    if dest.exists():
+        dest.unlink()
+
+    with zipfile.ZipFile(dest, "w", zipfile.ZIP_DEFLATED) as zf:
+        for f in files:
+            zf.write(f, f.relative_to(src))
+
+    print(f"Created {dest} ({len(files)} files)")
+    return 0
+
+
+def _rmtree_reporting(path: Path) -> None:
+    """Best-effort removal of a directory tree that reports the real outcome."""
+    shutil.rmtree(path, ignore_errors=True)
+    if path.exists():
+        # ignore_errors=True swallowed a failure (locked file, not a dir, ...).
+        print(f"WARNING: could not remove {path}", file=sys.stderr)
+    else:
+        print(f"Removed {path}")
+
+
+def cmd_clean_project(project_dir: str) -> int:
+    """
+    Remove render artefacts below ``project_dir``.
+
+    Deletes ``_output`` and ``.quarto`` directly under ``project_dir``, then
+    every directory matching ``*_files`` or ``*_cache`` anywhere below it.
+    Removal is best effort: failures are reported as warnings and the
+    function still returns 0, so it is safe to call when nothing exists.
+
+    Returns:
+        Always 0.
+    """
+    root = Path(project_dir)
+
+    for name in ("_output", ".quarto"):
+        target = root / name
+        if target.exists():
+            _rmtree_reporting(target)
+
+    for pattern in ("*_files", "*_cache"):
+        # Snapshot the matches first: deleting directories while rglob is
+        # still walking the tree would mutate what it iterates over.
+        for match in sorted(root.rglob(pattern)):
+            # A nested match may already be gone with its parent.
+            if match.is_dir():
+                _rmtree_reporting(match)
+
+    return 0
+
+def cmd_clean_zips(project_dir: str) -> int:
+    """
+    Delete every ``*.zip`` file found anywhere below ``project_dir``.
+
+    Files are removed with ``Path.unlink``; ``shutil.rmtree`` only works on
+    directories, so using it on a file silently deletes nothing. Removal is
+    best effort: a file that cannot be deleted produces a warning on stderr
+    and the function still returns 0.
+
+    Returns:
+        Always 0.
+    """
+    root = Path(project_dir)
+
+    # Snapshot the matches so the tree is not modified while being walked.
+    for match in sorted(root.rglob("*.zip")):
+        if not match.is_file():
+            continue
+        try:
+            match.unlink()
+        except OSError as exc:
+            print(f"WARNING: could not remove {match}: {exc}", file=sys.stderr)
+        else:
+            print(f"Removed {match}")
+
+    return 0
+
+def cmd_copy_if_exists(src: str, dest: str) -> int:
+    """
+    Copy ``src`` to ``dest`` when ``src`` exists; otherwise just warn.
+
+    The parent directory of ``dest`` is created if necessary. A missing
+    source is not an error: a warning is printed and 0 is returned.
+
+    Returns:
+        Always 0.
+    """
+    source, target = Path(src), Path(dest)
+
+    if source.exists():
+        target.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(source, target)
+        print(f"Copied {source} -> {target}")
+    else:
+        print(f"WARNING: '{source}' not found, skipping copy")
+
+    return 0
+
+
+def cmd_today() -> int:
+    """Print today's local date in ISO format (YYYY-MM-DD) and return 0."""
+    today = date.today()
+    # str() of a date object is its isoformat() representation.
+    print(today)
+    return 0
+
+
+def main() -> int:
+    """
+    Dispatch ``sys.argv`` to the matching ``cmd_*`` function.
+
+    Prints the module docstring to stderr and returns 1 when no subcommand
+    is given or the subcommand is unknown; prints a usage line and returns
+    1 when the operand count is wrong. Otherwise returns the handler's
+    exit code.
+    """
+    argv = sys.argv[1:]
+    if not argv:
+        print(__doc__, file=sys.stderr)
+        return 1
+
+    subcommand, *args = argv
+
+    # subcommand -> (required operand count, usage line, handler).
+    # A count of None means the operands are not checked at all.
+    dispatch = {
+        "zip": (
+            2,
+            "usage: task_utils.py zip SRC_DIR DEST_ZIP",
+            lambda a: cmd_zip(a[0], a[1], allow_missing=False),
+        ),
+        "zip-if-exists": (
+            2,
+            "usage: task_utils.py zip-if-exists SRC_DIR DEST_ZIP",
+            lambda a: cmd_zip(a[0], a[1], allow_missing=True),
+        ),
+        "clean-project": (
+            1,
+            "usage: task_utils.py clean-project PROJECT_DIR",
+            lambda a: cmd_clean_project(a[0]),
+        ),
+        "clean-zips": (
+            1,
+            "usage: task_utils.py clean-zips PROJECT_DIR",
+            lambda a: cmd_clean_zips(a[0]),
+        ),
+        "copy-if-exists": (
+            2,
+            "usage: task_utils.py copy-if-exists SRC DEST",
+            lambda a: cmd_copy_if_exists(a[0], a[1]),
+        ),
+        "today": (None, "", lambda a: cmd_today()),
+    }
+
+    entry = dispatch.get(subcommand)
+    if entry is None:
+        print(f"ERROR: unknown subcommand '{subcommand}'", file=sys.stderr)
+        print(__doc__, file=sys.stderr)
+        return 1
+
+    expected_count, usage, handler = entry
+    if expected_count is not None and len(args) != expected_count:
+        print(usage, file=sys.stderr)
+        return 1
+
+    return handler(args)
+
+
+if __name__ == "__main__":
+    sys.exit(main())