initial commit

This commit is contained in:
2026-06-19 17:31:51 +02:00
commit ef4936b10c
49 changed files with 4554 additions and 0 deletions
+213
View File
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
split_titlepage.py
Splits a Quarto-manuscript-rendered .docx into two files:
- <output>-titlepage.docx : title, authors, affiliations (front matter)
- <output>-body.docx : everything from the first section break onward
Why this works:
Quarto's manuscript/scholarly docx title-block template emits a section
break (<w:sectPr> inside a paragraph's <w:pPr>) immediately after the
title-block content, before the body begins. This script finds that first
section break in word/document.xml and splits there.
Safety check:
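Before writing output, the script verifies the "title page" portion
actually contains the document's title and at least one of the given
author names (passed explicitly via --title and --author / --authors;
this script does not read the source .qmd itself, and the author check
is skipped when no author is given).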
If this check fails, the script aborts with a clear error rather than
silently producing a wrong split - getting this wrong has real
deanonymization consequences for blind peer review.
This script is self-contained (standard library only: zipfile, re, shutil)
and does NOT depend on any external docx-editing toolkit. It works directly
with the docx ZIP container, replacing only word/document.xml in each of
two copies of the original archive.
Usage:
python split_titlepage.py INPUT.docx OUTDIR \
--title "Title" --author "Jane Doe" [--author "John Q. Doe" ...]
Exit codes:
0 success
1 split point not found
2 safety check failed (title/author not found in detected title page)
3 other error (bad args, file not found)
"""
import argparse
import re
import shutil
import sys
import zipfile
from pathlib import Path
# Matches one whole paragraph (<w:p ...> ... </w:p>) that contains a
# <w:sectPr> ... </w:sectPr> element. Piece by piece:
#   <w:p\b[^>]*>          opening paragraph tag; the \b stops <w:pPr> and
#                         other longer tag names from being taken for <w:p>
#   (?:(?!</w:p>).)*?     advance lazily without stepping over the closing
#                         tag of that paragraph
#   <w:sectPr\b.*?</w:sectPr>
#                         the section-properties element; an explicit closing
#                         tag is required, so a self-closing <w:sectPr/> is
#                         not recognised as a complete element by itself
#   .*?</w:p>             the rest, up to the next closing paragraph tag
# re.DOTALL lets "." span newlines inside document.xml.
SECTPR_PATTERN = re.compile(
    r'<w:p\b[^>]*>(?:(?!</w:p>).)*?<w:sectPr\b.*?</w:sectPr>.*?</w:p>',
    re.DOTALL,
)
# A bare sectPr also legitimately appears as the LAST element of body,
# as a direct child of <w:body> (not inside a paragraph) -- that one
# describes the final/only section and is NOT a split point on its own.
# We only want sectPr that appears INSIDE a paragraph's pPr, which marks
# an explicit section break before the end of the document.
def find_first_section_break(document_xml: str) -> "tuple[int, int] | None":
    """
    Locate the first paragraph that carries a section break.

    Searches ``document_xml`` with ``SECTPR_PATTERN`` and returns the
    ``(start, end)`` character offsets of the first match, or ``None``
    when the pattern does not match anywhere.
    """
    hit = SECTPR_PATTERN.search(document_xml)
    return hit.span() if hit is not None else None
def strip_tags_for_text_check(xml_fragment: str) -> str:
    """Reduce an XML fragment to plain text for a containment check.

    Steps, in order:
      1. every ``<...>`` tag is replaced by a single space;
      2. XML character/entity references (``&amp;``, ``&lt;``, ``&#233;`` ...)
         are decoded, so text such as "Risk & Reward" compares equal to the
         way it is written on the command line;
      3. runs of whitespace are collapsed to one space and the result is
         stripped.

    Entities are decoded only AFTER the tags are removed, so literal text
    like ``&lt;b&gt;`` is not mistaken for markup.

    Note: because each tag becomes a space, a word that is split across two
    runs (``<w:t>Tit</w:t>...<w:t>le</w:t>``) comes out as "Tit le". That
    can only make the containment check fail, never pass wrongly.
    """
    # Local import: the file's import block does not bring in ``html``.
    import html

    text = re.sub(r'<[^>]+>', ' ', xml_fragment)
    text = html.unescape(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def read_document_xml(docx_path: Path) -> str:
    """Return the ``word/document.xml`` member of a .docx, decoded as UTF-8.

    Raises ``KeyError`` (from ``zipfile``) when the archive has no such
    member.
    """
    with zipfile.ZipFile(docx_path, "r") as archive:
        raw_bytes = archive.read("word/document.xml")
    return raw_bytes.decode("utf-8")
def write_docx_with_replaced_document_xml(
    source_docx: Path, new_document_xml: str, dest_docx: Path
) -> None:
    """Write ``dest_docx`` as a copy of ``source_docx`` with a new main part.

    Every archive member is re-written in its original order, reusing its
    original ``ZipInfo``; only the payload of ``word/document.xml`` is
    swapped for ``new_document_xml`` encoded as UTF-8.
    """
    with zipfile.ZipFile(source_docx, "r") as src, \
            zipfile.ZipFile(dest_docx, "w", zipfile.ZIP_DEFLATED) as dst:
        for entry in src.infolist():
            # The member is always read, even the one that gets replaced.
            original_bytes = src.read(entry.filename)
            is_main_part = entry.filename == "word/document.xml"
            payload = new_document_xml.encode("utf-8") if is_main_part else original_bytes
            dst.writestr(entry, payload)
def main():
    """Command-line entry point: split a rendered .docx at its first section break.

    Workflow:
      1. Parse arguments and merge ``--author`` / ``--authors`` into one list.
      2. Read ``word/document.xml`` from the input archive and isolate the
         content of ``<w:body>``.
      3. Split that content right after the first paragraph that carries a
         section break.
      4. Safety check: the title-page half must contain the expected title
         and, when authors were given, at least one of them.
      5. Write ``<basename>-titlepage.docx`` and ``<basename>-body.docx``
         into the output directory.

    Exits through ``sys.exit`` with:
        1  no section break found
        2  safety check failed
        3  input file missing, not a readable zip archive, or lacking
           ``word/document.xml`` / ``<w:body>``

    ``--allow-no-author-match`` downgrades ONLY an author mismatch to a
    warning; a missing title is always fatal.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_docx", type=Path, help="Rendered anonymized .docx")
    parser.add_argument("outdir", type=Path, help="Directory to write split output into")
    parser.add_argument("--title", required=True, help="Document title, for the safety check")
    parser.add_argument(
        "--author", action="append", default=[],
        help="Author name for the safety check; repeat flag for multiple authors"
    )
    parser.add_argument(
        "--authors", default=None,
        help="Comma-separated author names for the safety check "
             "(simpler alternative to repeating --author from a Taskfile)"
    )
    parser.add_argument(
        "--basename", default=None,
        help="Base name for output files (default: input filename stem)"
    )
    parser.add_argument(
        "--allow-no-author-match", action="store_true",
        help="Downgrade the author-match safety check from fatal to a warning"
    )
    args = parser.parse_args()

    # Merge repeated --author flags with the comma-separated --authors list.
    authors = list(args.author)
    if args.authors:
        authors.extend(a.strip() for a in args.authors.split(",") if a.strip())
    args.author = authors

    if not args.input_docx.exists():
        print(f"ERROR: input file not found: {args.input_docx}", file=sys.stderr)
        sys.exit(3)

    basename = args.basename or args.input_docx.stem
    args.outdir.mkdir(parents=True, exist_ok=True)
    print(f"Splitting title page from: {args.input_docx}")

    try:
        xml = read_document_xml(args.input_docx)
    except KeyError:
        print("ERROR: word/document.xml not found inside the docx - is this a valid .docx?",
              file=sys.stderr)
        sys.exit(3)
    except zipfile.BadZipFile:
        # Without this, a non-zip input ends in a traceback with exit status 1,
        # which is indistinguishable from "split point not found".
        print(f"ERROR: {args.input_docx} is not a readable zip archive - is this a valid .docx?",
              file=sys.stderr)
        sys.exit(3)

    body_match = re.search(r"<w:body\b[^>]*>(.*)</w:body>", xml, re.DOTALL)
    if not body_match:
        print("ERROR: could not locate <w:body> in document.xml", file=sys.stderr)
        sys.exit(3)
    body_start, body_end = body_match.span(1)
    body_content = xml[body_start:body_end]

    split = find_first_section_break(body_content)
    if split is None:
        print(
            "ERROR: no section break found in document body. "
            "Expected Quarto's manuscript docx template to emit a "
            "section break after the title block. Aborting split — "
            "check that the source document actually came from the "
            "manuscript title-block template.",
            file=sys.stderr,
        )
        sys.exit(1)

    # The paragraph holding the section break stays with the title page;
    # the body file starts right after it.
    _, split_end = split
    titlepage_fragment = body_content[:split_end]
    body_fragment = body_content[split_end:]

    # --- Safety check ---
    def collapse_ws(value: str) -> str:
        # The extracted text has its whitespace collapsed, so the expected
        # strings must be normalised the same way to be comparable.
        return re.sub(r'\s+', ' ', value).strip()

    titlepage_text = strip_tags_for_text_check(titlepage_fragment)
    expected_title = collapse_ws(args.title)
    title_found = expected_title != "" and expected_title in titlepage_text
    authors_found = [
        a for a in args.author
        if collapse_ws(a) and collapse_ws(a) in titlepage_text
    ]
    print(f" Detected title page text (first 200 chars): {titlepage_text[:200]!r}")

    problems = []
    if not title_found:
        problems.append(f"title {args.title!r} not found in detected title-page text")
    if args.author and not authors_found:
        problems.append(
            f"none of the expected authors {args.author!r} found in detected title-page text"
        )

    if problems:
        msg = (
            "SAFETY CHECK FAILED: the detected 'title page' section does not "
            "appear to contain the expected title/author metadata:\n - "
            + "\n - ".join(problems)
            + "\nThis usually means the section-break detection found the "
            "wrong split point. Refusing to write output to avoid a silent "
            "deanonymization risk."
        )
        # Only the author check may be downgraded. If the title is missing
        # too, the split point is suspect and the run must still abort.
        if args.allow_no_author_match and title_found:
            print(f"WARNING: {msg}\n(Continuing anyway because --allow-no-author-match was set.)",
                  file=sys.stderr)
        else:
            print(f"ERROR: {msg}", file=sys.stderr)
            sys.exit(2)
    else:
        print(f" Safety check passed (title found: {title_found}, authors found: {authors_found})")

    def build_full_xml(body_inner: str) -> str:
        # Re-wrap a body fragment in everything that surrounded <w:body>'s
        # content in the original document.xml.
        return xml[:body_start] + body_inner + xml[body_end:]

    titlepage_out = args.outdir / f"{basename}-titlepage.docx"
    body_out = args.outdir / f"{basename}-body.docx"
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(titlepage_fragment), titlepage_out
    )
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(body_fragment), body_out
    )
    print(f"Done.\n Title page -> {titlepage_out}\n Body -> {body_out}")


if __name__ == "__main__":
    main()
+168
View File
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
"""
task_utils.py — cross-platform file operations for the Taskfile pipeline.
Replaces shell-specific commands (PowerShell Compress-Archive, Remove-Item,
etc.) with plain Python stdlib calls that behave identically on Windows,
Linux, and macOS. Called from Task as:
python ../resources/scripts/task_utils.py <subcommand> [args...]
Subcommands:
zip SRC_DIR DEST_ZIP
Zip the contents of SRC_DIR into DEST_ZIP (overwrites if it exists).
Fails with a clear message (exit 1) if SRC_DIR doesn't exist.
zip-if-exists SRC_DIR DEST_ZIP
Same as `zip`, but exits 0 with a warning (no error) if SRC_DIR
doesn't exist, instead of failing. Used for optional things like
a project's data/ folder.
clean-project PROJECT_DIR
Remove _output/, .quarto/, and any *_files/*_cache directories
found anywhere under PROJECT_DIR. Safe to call even if nothing
exists yet.
clean-zips PROJECT_DIR
Delete every *.zip file found anywhere under PROJECT_DIR.
copy-if-exists SRC DEST
Copy a single file from SRC to DEST if SRC exists; otherwise
print a warning and exit 0 (does not fail the pipeline).
today
Print today's date as YYYY-MM-DD (used for the finalized/ folder
name). No platform-specific date command needed.
"""
import shutil
import sys
import zipfile
from datetime import date
from pathlib import Path
def cmd_zip(src_dir: str, dest_zip: str, allow_missing: bool) -> int:
    """Zip the files under ``src_dir`` into ``dest_zip``.

    Args:
        src_dir: Directory whose files (recursively) are archived; member
            names are stored relative to this directory.
        dest_zip: Archive to create. Parent directories are created and an
            existing file at this path is replaced.
        allow_missing: When true, a missing or empty ``src_dir`` is only a
            warning (return 0); otherwise it is an error (return 1).

    Returns:
        0 on success or on a tolerated skip, 1 on error.

    The destination archive is never added to itself, even when it lives
    inside ``src_dir``. Members are written in sorted path order so the
    archive layout does not depend on directory-listing order.
    """
    src = Path(src_dir)
    dest = Path(dest_zip)
    if not src.is_dir():
        msg = f"'{src}' does not exist or is not a directory"
        if allow_missing:
            print(f"WARNING: {msg} — skipping zip of {dest}")
            return 0
        print(f"ERROR: {msg} — did you render first?", file=sys.stderr)
        return 1

    # Exclude the archive itself: if DEST_ZIP sits inside SRC_DIR and already
    # exists, it would otherwise be listed here and then read back while it
    # is being written.
    dest_resolved = dest.resolve()
    files = sorted(
        p for p in src.rglob("*")
        if p.is_file() and p.resolve() != dest_resolved
    )
    if not files:
        msg = f"'{src}' exists but contains no files"
        if allow_missing:
            print(f"WARNING: {msg} — skipping zip of {dest}")
            return 0
        print(f"ERROR: {msg}", file=sys.stderr)
        return 1

    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        dest.unlink()
    with zipfile.ZipFile(dest, "w", zipfile.ZIP_DEFLATED) as zf:
        for f in files:
            zf.write(f, f.relative_to(src).as_posix())
    print(f"Created {dest} ({len(files)} files)")
    return 0
def _remove_dir_best_effort(target: Path) -> None:
    """Try to delete ``target`` recursively and report what actually happened."""
    shutil.rmtree(target, ignore_errors=True)
    # rmtree(ignore_errors=True) never raises, so check the result instead of
    # claiming success unconditionally.
    if target.exists():
        print(f"WARNING: could not fully remove {target}", file=sys.stderr)
    else:
        print(f"Removed {target}")


def cmd_clean_project(project_dir: str) -> int:
    """Remove render artefacts below ``project_dir``.

    Deletes ``_output`` and ``.quarto`` directly under the project root, and
    every directory matching ``*_files`` or ``*_cache`` anywhere beneath it.
    Removal is best-effort: failures are reported as warnings on stderr and
    do not change the return value.

    Returns:
        Always 0.
    """
    root = Path(project_dir)
    for name in ("_output", ".quarto"):
        target = root / name
        if target.exists():
            _remove_dir_best_effort(target)
    for pattern in ("*_files", "*_cache"):
        # Snapshot the matches first: directories are deleted inside the
        # loop, and the tree should not change under a live rglob generator.
        for match in list(root.rglob(pattern)):
            # A nested match may already be gone with its parent; is_dir()
            # is then False and it is skipped.
            if match.is_dir():
                _remove_dir_best_effort(match)
    return 0
def cmd_clean_zips(project_dir: str) -> int:
    """Delete every ``*.zip`` file found anywhere under ``project_dir``.

    Only regular files are deleted; a directory whose name happens to end in
    ``.zip`` is left alone. A file that cannot be deleted produces a warning
    on stderr and does not change the return value.

    Returns:
        Always 0.
    """
    root = Path(project_dir)
    # Snapshot the matches so files are not deleted under a live generator.
    for match in list(root.rglob("*.zip")):
        if not match.is_file():
            continue
        try:
            # shutil.rmtree() only handles directories; with
            # ignore_errors=True it silently leaves regular files in place.
            match.unlink()
        except OSError as exc:
            print(f"WARNING: could not remove {match}: {exc}", file=sys.stderr)
        else:
            print(f"Removed {match}")
    return 0
def cmd_copy_if_exists(src: str, dest: str) -> int:
    """Copy ``src`` to ``dest`` when ``src`` exists; otherwise just warn.

    Parent directories of ``dest`` are created as needed and the copy is
    made with ``shutil.copy2``. A missing source is not treated as an error.

    Returns:
        Always 0.
    """
    source, target = Path(src), Path(dest)
    if source.exists():
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
        print(f"Copied {source} -> {target}")
    else:
        print(f"WARNING: '{source}' not found, skipping copy")
    return 0
def cmd_today() -> int:
    """Print the current local date as YYYY-MM-DD and return 0."""
    # str() of a date object is its ISO 8601 form (same as .isoformat()).
    iso_stamp = str(date.today())
    print(iso_stamp)
    return 0
def main() -> int:
    """Dispatch ``sys.argv`` to the matching ``cmd_*`` function.

    ``sys.argv[1]`` selects the subcommand and the remaining items are its
    arguments. With no subcommand, or an unknown one, the module docstring
    is printed to stderr. A wrong number of arguments prints that
    subcommand's usage line to stderr.

    Returns:
        The selected command's return value, or 1 for any usage error.
    """
    argv = sys.argv
    if len(argv) < 2:
        print(__doc__, file=sys.stderr)
        return 1

    subcommand, args = argv[1], argv[2:]

    # "today" takes no arguments and does not validate any extras.
    if subcommand == "today":
        return cmd_today()

    # subcommand -> (required argument count, usage line, handler).
    # Handlers are lambdas so the cmd_* names are only looked up when called.
    dispatch = {
        "zip": (
            2,
            "usage: task_utils.py zip SRC_DIR DEST_ZIP",
            lambda a: cmd_zip(a[0], a[1], allow_missing=False),
        ),
        "zip-if-exists": (
            2,
            "usage: task_utils.py zip-if-exists SRC_DIR DEST_ZIP",
            lambda a: cmd_zip(a[0], a[1], allow_missing=True),
        ),
        "clean-project": (
            1,
            "usage: task_utils.py clean-project PROJECT_DIR",
            lambda a: cmd_clean_project(a[0]),
        ),
        "clean-zips": (
            1,
            "usage: task_utils.py clean-zips PROJECT_DIR",
            lambda a: cmd_clean_zips(a[0]),
        ),
        "copy-if-exists": (
            2,
            "usage: task_utils.py copy-if-exists SRC DEST",
            lambda a: cmd_copy_if_exists(a[0], a[1]),
        ),
    }

    entry = dispatch.get(subcommand)
    if entry is None:
        print(f"ERROR: unknown subcommand '{subcommand}'", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        return 1

    required_count, usage_line, handler = entry
    if len(args) != required_count:
        print(usage_line, file=sys.stderr)
        return 1
    return handler(args)


if __name__ == "__main__":
    sys.exit(main())