initial commit
This commit is contained in:
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
split_titlepage.py
|
||||
|
||||
Splits a Quarto-manuscript-rendered .docx into two files:
|
||||
- <output>-titlepage.docx : title, authors, affiliations (front matter)
|
||||
- <output>-body.docx : everything from the first section break onward
|
||||
|
||||
Why this works:
|
||||
Quarto's manuscript/scholarly docx title-block template emits a section
|
||||
break (<w:sectPr> inside a paragraph's <w:pPr>) immediately after the
|
||||
title-block content, before the body begins. This script finds that first
|
||||
section break in word/document.xml and splits there.
|
||||
|
||||
Safety check:
|
||||
Before writing output, the script verifies the "title page" portion
|
||||
actually contains the document's title and at least one author name
|
||||
(passed explicitly via --title and --author/--authors; this script does not
itself read the source .qmd YAML front matter).
|
||||
If this check fails, the script aborts with a clear error rather than
|
||||
silently producing a wrong split - getting this wrong has real
|
||||
deanonymization consequences for blind peer review.
|
||||
|
||||
This script is self-contained (standard library only: argparse, pathlib, re, sys, zipfile)
|
||||
and does NOT depend on any external docx-editing toolkit. It works directly
|
||||
with the docx ZIP container, replacing only word/document.xml in each of
|
||||
two copies of the original archive.
|
||||
|
||||
Usage:
|
||||
python split_titlepage.py INPUT.docx OUTDIR \
|
||||
--title "Title" --author "Jane Doe" [--author "John Q. Doe" ...]
|
||||
|
||||
Exit codes:
|
||||
0 success
|
||||
1 split point not found
|
||||
2 safety check failed (title/author not found in detected title page)
|
||||
3 other error (bad args, file not found)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Matches one complete <w:p>...</w:p> element that carries a <w:sectPr>
# somewhere inside it. The pieces below concatenate into a single pattern.
SECTPR_PATTERN = re.compile(
    r'<w:p\b[^>]*>'               # paragraph start tag; \b keeps <w:pPr> from matching
    r'(?:(?!</w:p>).)*?'          # paragraph content, never stepping over a </w:p>
    r'<w:sectPr\b.*?</w:sectPr>'  # the section-properties element
    r'.*?</w:p>',                 # remainder up to the nearest paragraph end tag
    flags=re.DOTALL,              # '.' must also match the newlines inside the XML
)
|
||||
|
||||
# A bare sectPr also legitimately appears as the LAST element of body,
|
||||
# as a direct child of <w:body> (not inside a paragraph) -- that one
|
||||
# describes the final/only section and is NOT a split point on its own.
|
||||
# We only want sectPr that appears INSIDE a paragraph's pPr, which marks
|
||||
# an explicit section break before the end of the document.
|
||||
|
||||
|
||||
def find_first_section_break(document_xml: str) -> "tuple[int, int] | None":
    """
    Locate the first paragraph in *document_xml* that SECTPR_PATTERN matches.

    Returns the (start, end) character offsets of that paragraph, or None
    when the pattern matches nowhere in the text.
    """
    hit = SECTPR_PATTERN.search(document_xml)
    return hit.span() if hit else None
|
||||
|
||||
|
||||
def strip_tags_for_text_check(xml_fragment: str) -> str:
    """Reduce an XML fragment to plain text for a substring containment check.

    Every tag is replaced by a space, character/entity references are decoded
    (so ``&amp;`` in the XML compares equal to ``&`` in a title or author
    name), and runs of whitespace are collapsed to a single space.

    This is deliberately crude: because each tag becomes a space, a word that
    the XML splits across two adjacent tags comes out with a space inside it.

    Returns the resulting text with leading/trailing whitespace removed; an
    empty or tag-only fragment yields "".
    """
    import html  # local import so this function does not depend on file-level imports

    without_tags = re.sub(r'<[^>]+>', ' ', xml_fragment)
    # Decode only after the tags are gone, so a decoded '<' can never be
    # mistaken for markup by the tag-stripping regex above.
    decoded = html.unescape(without_tags)
    return re.sub(r'\s+', ' ', decoded).strip()
|
||||
|
||||
|
||||
def read_document_xml(docx_path: Path) -> str:
    """Return the archive member word/document.xml of *docx_path* as UTF-8 text.

    zipfile raises KeyError when the archive has no member of that name.
    """
    with zipfile.ZipFile(docx_path, mode="r") as archive:
        raw = archive.read("word/document.xml")
    return raw.decode("utf-8")
|
||||
|
||||
|
||||
def write_docx_with_replaced_document_xml(
    source_docx: Path, new_document_xml: str, dest_docx: Path
) -> None:
    """Write dest_docx as a copy of source_docx with a new word/document.xml.

    Every archive member is copied across using its original ZipInfo entry;
    only the bytes of word/document.xml are swapped for *new_document_xml*
    encoded as UTF-8.
    """
    with zipfile.ZipFile(source_docx, "r") as src, \
            zipfile.ZipFile(dest_docx, "w", zipfile.ZIP_DEFLATED) as dst:
        for entry in src.infolist():
            payload = src.read(entry.filename)
            is_main_part = entry.filename == "word/document.xml"
            dst.writestr(
                entry,
                new_document_xml.encode("utf-8") if is_main_part else payload,
            )
|
||||
|
||||
|
||||
class _SplitArgumentParser(argparse.ArgumentParser):
    """ArgumentParser whose usage errors exit with status 3 instead of 2.

    argparse exits with status 2 on bad arguments, but this script's
    documented exit codes reserve 2 for "safety check failed" and assign
    bad arguments to 3.
    """

    def error(self, message):
        self.print_usage(sys.stderr)
        self.exit(3, f"{self.prog}: error: {message}\n")


def _collapse_ws(text: str) -> str:
    """Collapse whitespace runs to single spaces and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()


def main():
    """Split a rendered .docx into title-page and body files at the first section break.

    Reads the command line, locates the first paragraph-level section break
    inside <w:body>, verifies that the part before it contains the expected
    title (and at least one expected author, when any were supplied), and
    writes <basename>-titlepage.docx and <basename>-body.docx into OUTDIR.

    Exits through sys.exit with:
        1  no section break found
        2  safety check failed
        3  bad arguments, missing/unreadable input, or output would
           overwrite the input
    Returns None on success.
    """
    parser = _SplitArgumentParser(description=__doc__)
    parser.add_argument("input_docx", type=Path, help="Rendered anonymized .docx")
    parser.add_argument("outdir", type=Path, help="Directory to write split output into")
    parser.add_argument("--title", required=True, help="Document title, for the safety check")
    parser.add_argument(
        "--author", action="append", default=[],
        help="Author name for the safety check; repeat flag for multiple authors"
    )
    parser.add_argument(
        "--authors", default=None,
        help="Comma-separated author names for the safety check "
             "(simpler alternative to repeating --author from a Taskfile)"
    )
    parser.add_argument(
        "--basename", default=None,
        help="Base name for output files (default: input filename stem)"
    )
    parser.add_argument(
        "--allow-no-author-match", action="store_true",
        help="Downgrade the author-match safety check from fatal to a warning"
    )
    args = parser.parse_args()

    # Merge repeated --author flags with the comma-separated --authors value.
    authors = list(args.author)
    if args.authors:
        authors.extend(a.strip() for a in args.authors.split(",") if a.strip())
    args.author = authors

    if not args.input_docx.exists():
        print(f"ERROR: input file not found: {args.input_docx}", file=sys.stderr)
        sys.exit(3)

    basename = args.basename or args.input_docx.stem
    args.outdir.mkdir(parents=True, exist_ok=True)

    print(f"Splitting title page from: {args.input_docx}")

    try:
        xml = read_document_xml(args.input_docx)
    except KeyError:
        print("ERROR: word/document.xml not found inside the docx - is this a valid .docx?",
              file=sys.stderr)
        sys.exit(3)
    except (zipfile.BadZipFile, UnicodeDecodeError, OSError) as exc:
        # Without this, a non-zip or unreadable input ends in a traceback and
        # exit status 1, which is documented as "split point not found".
        print(f"ERROR: could not read {args.input_docx} as a .docx: {exc}", file=sys.stderr)
        sys.exit(3)

    body_match = re.search(r"<w:body\b[^>]*>(.*)</w:body>", xml, re.DOTALL)
    if not body_match:
        print("ERROR: could not locate <w:body> in document.xml", file=sys.stderr)
        sys.exit(3)
    body_start, body_end = body_match.span(1)
    body_content = xml[body_start:body_end]

    split = find_first_section_break(body_content)
    if split is None:
        print(
            "ERROR: no section break found in document body. "
            "Expected Quarto's manuscript docx template to emit a "
            "section break after the title block. Aborting split — "
            "check that the source document actually came from the "
            "manuscript title-block template.",
            file=sys.stderr,
        )
        sys.exit(1)

    # The section-break paragraph itself stays with the title page.
    _, split_end = split
    titlepage_fragment = body_content[:split_end]
    body_fragment = body_content[split_end:]

    # --- Safety check ---
    titlepage_text = strip_tags_for_text_check(titlepage_fragment)
    # The extracted text has its whitespace collapsed, so the expected strings
    # are normalized the same way before the containment test.
    wanted_title = _collapse_ws(args.title)
    title_found = wanted_title != "" and wanted_title in titlepage_text
    authors_found = [
        a for a in args.author
        if _collapse_ws(a) and _collapse_ws(a) in titlepage_text
    ]

    print(f"  Detected title page text (first 200 chars): {titlepage_text[:200]!r}")

    if not args.author:
        print(
            "WARNING: no author names were supplied (--author/--authors); "
            "the author part of the safety check was skipped.",
            file=sys.stderr,
        )

    problems = []
    if not title_found:
        problems.append(f"title {args.title!r} not found in detected title-page text")
    if args.author and not authors_found:
        problems.append(
            f"none of the expected authors {args.author!r} found in detected title-page text"
        )

    if problems:
        msg = (
            "SAFETY CHECK FAILED: the detected 'title page' section does not "
            "appear to contain the expected title/author metadata:\n  - "
            + "\n  - ".join(problems)
            + "\nThis usually means the section-break detection found the "
            "wrong split point. Refusing to write output to avoid a silent "
            "deanonymization risk."
        )
        # The flag only covers the author match (per its help text); a missing
        # title is always fatal.
        if args.allow_no_author_match and title_found:
            print(f"WARNING: {msg}\n(Continuing anyway because --allow-no-author-match was set.)",
                  file=sys.stderr)
        else:
            print(f"ERROR: {msg}", file=sys.stderr)
            sys.exit(2)
    else:
        print(f"  Safety check passed (title found: {title_found}, authors found: {authors_found})")

    def build_full_xml(body_inner: str) -> str:
        # Re-wrap a body fragment in everything that surrounded <w:body>'s content.
        return xml[:body_start] + body_inner + xml[body_end:]

    titlepage_out = args.outdir / f"{basename}-titlepage.docx"
    body_out = args.outdir / f"{basename}-body.docx"

    # Opening an output path for writing truncates it; if it is the input
    # file, the input would be destroyed before it has been read.
    source_resolved = args.input_docx.resolve()
    for dest in (titlepage_out, body_out):
        if dest.resolve() == source_resolved:
            print(
                f"ERROR: output {dest} would overwrite the input file; "
                "choose a different OUTDIR or --basename",
                file=sys.stderr,
            )
            sys.exit(3)

    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(titlepage_fragment), titlepage_out
    )
    write_docx_with_replaced_document_xml(
        args.input_docx, build_full_xml(body_fragment), body_out
    )

    print(f"Done.\n  Title page -> {titlepage_out}\n  Body       -> {body_out}")
|
||||
|
||||
|
||||
# Run the splitter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
task_utils.py — cross-platform file operations for the Taskfile pipeline.
|
||||
|
||||
Replaces shell-specific commands (PowerShell Compress-Archive, Remove-Item,
|
||||
etc.) with plain Python stdlib calls that behave identically on Windows,
|
||||
Linux, and macOS. Called from Task as:
|
||||
|
||||
python ../resources/scripts/task_utils.py <subcommand> [args...]
|
||||
|
||||
Subcommands:
|
||||
zip SRC_DIR DEST_ZIP
|
||||
Zip the contents of SRC_DIR into DEST_ZIP (overwrites if it exists).
|
||||
Fails with a clear message (exit 1) if SRC_DIR doesn't exist.
|
||||
|
||||
zip-if-exists SRC_DIR DEST_ZIP
|
||||
Same as `zip`, but exits 0 with a warning (no error) if SRC_DIR
|
||||
doesn't exist, instead of failing. Used for optional things like
|
||||
a project's data/ folder.
|
||||
|
||||
clean-project PROJECT_DIR
|
||||
Remove _output/, .quarto/, and any *_files/*_cache directories
|
||||
found anywhere under PROJECT_DIR. Safe to call even if nothing
exists yet.

clean-zips PROJECT_DIR
Remove every *.zip file found anywhere under PROJECT_DIR.
|
||||
|
||||
copy-if-exists SRC DEST
|
||||
Copy a single file from SRC to DEST if SRC exists; otherwise
|
||||
print a warning and exit 0 (does not fail the pipeline).
|
||||
|
||||
today
|
||||
Print today's date as YYYY-MM-DD (used for the finalized/ folder
|
||||
name). No platform-specific date command needed.
|
||||
"""
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
import zipfile
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def cmd_zip(src_dir: str, dest_zip: str, allow_missing: bool) -> int:
    """Zip the files under SRC_DIR into DEST_ZIP, replacing any existing archive.

    Archive member names are paths relative to SRC_DIR. The destination
    archive itself is never added, even when it lives under SRC_DIR, and
    members are written in sorted path order.

    Args:
        src_dir: directory whose files are archived (searched recursively).
        dest_zip: path of the archive to create; parent directories are
            created as needed.
        allow_missing: when true, a missing/non-directory SRC_DIR or one with
            nothing to archive only prints a warning and returns 0.

    Returns:
        0 on success (or on a tolerated skip), 1 when there is nothing to zip
        and allow_missing is false.
    """
    src = Path(src_dir)
    dest = Path(dest_zip)

    # is_dir() is already False for a path that does not exist.
    if not src.is_dir():
        msg = f"'{src}' does not exist or is not a directory"
        if allow_missing:
            print(f"WARNING: {msg} — skipping zip of {dest}")
            return 0
        print(f"ERROR: {msg} — did you render first?", file=sys.stderr)
        return 1

    # Leave out the destination archive: if a previous run left it under SRC_DIR
    # it would be listed here, deleted below, and then the half-written new
    # archive would be added to itself.
    dest_resolved = dest.resolve()
    files = sorted(
        p for p in src.rglob("*")
        if p.is_file() and p.resolve() != dest_resolved
    )
    if not files:
        msg = f"'{src}' exists but contains no files"
        if allow_missing:
            print(f"WARNING: {msg} — skipping zip of {dest}")
            return 0
        print(f"ERROR: {msg}", file=sys.stderr)
        return 1

    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        dest.unlink()

    with zipfile.ZipFile(dest, "w", zipfile.ZIP_DEFLATED) as zf:
        for f in files:
            zf.write(f, f.relative_to(src))

    print(f"Created {dest} ({len(files)} files)")
    return 0
|
||||
|
||||
|
||||
def _remove_dir(target: Path) -> None:
    """Best-effort recursive delete of *target* that reports what actually happened."""
    shutil.rmtree(target, ignore_errors=True)
    # rmtree swallows every error here, so check the result before claiming success.
    if target.exists():
        print(f"WARNING: could not fully remove {target}", file=sys.stderr)
    else:
        print(f"Removed {target}")


def cmd_clean_project(project_dir: str) -> int:
    """Remove build artefacts under PROJECT_DIR.

    Deletes PROJECT_DIR/_output and PROJECT_DIR/.quarto, then every directory
    named ``*_files`` or ``*_cache`` found anywhere below PROJECT_DIR.
    Deletion is best-effort: a path that cannot be removed produces a warning
    on stderr rather than a failure.

    Returns:
        Always 0, including when PROJECT_DIR does not exist.
    """
    root = Path(project_dir)
    if not root.is_dir():
        return 0

    for name in ("_output", ".quarto"):
        target = root / name
        if target.exists():
            _remove_dir(target)

    for pattern in ("*_files", "*_cache"):
        # Collect the matches before deleting anything, so the tree is not
        # modified while rglob is still walking it.
        matches = [p for p in root.rglob(pattern) if p.is_dir()]
        for match in matches:
            # A nested match may already be gone along with its parent.
            if match.is_dir():
                _remove_dir(match)

    return 0
|
||||
|
||||
def cmd_clean_zips(project_dir: str) -> int:
    """Delete every ``*.zip`` file found anywhere under PROJECT_DIR.

    Only regular files are removed; a directory whose name ends in ``.zip``
    is left alone. Deletion is best-effort: a file that cannot be removed
    produces a warning on stderr rather than a failure.

    Returns:
        Always 0, including when PROJECT_DIR does not exist.
    """
    root = Path(project_dir)
    if not root.is_dir():
        return 0

    # Collect first so the tree is not modified while rglob is walking it.
    archives = [p for p in root.rglob("*.zip") if p.is_file()]
    for archive in archives:
        try:
            # unlink, not shutil.rmtree: rmtree only removes directories and,
            # with ignore_errors=True, silently does nothing for a file.
            archive.unlink()
        except OSError as exc:
            print(f"WARNING: could not remove {archive}: {exc}", file=sys.stderr)
        else:
            print(f"Removed {archive}")

    return 0
|
||||
|
||||
def cmd_copy_if_exists(src: str, dest: str) -> int:
    """Copy SRC to DEST when SRC exists; otherwise print a warning.

    DEST's parent directories are created as needed and the copy is made with
    shutil.copy2. Returns 0 in both the copied and the skipped case.
    """
    source, target = Path(src), Path(dest)

    if source.exists():
        target.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(source, target)
        print(f"Copied {source} -> {target}")
    else:
        print(f"WARNING: '{source}' not found, skipping copy")

    return 0
|
||||
|
||||
|
||||
def cmd_today() -> int:
    """Print today's local date in ISO form (YYYY-MM-DD) and return 0."""
    stamp = date.today().isoformat()
    print(stamp)
    return 0
|
||||
|
||||
|
||||
def main() -> int:
    """Dispatch ``sys.argv`` to the matching subcommand and return its exit status.

    Returns 1 after printing the module docstring when no subcommand is given
    or the subcommand is unknown, 1 after printing a usage line when the
    operand count is wrong, and otherwise whatever the subcommand returns.
    """
    if len(sys.argv) < 2:
        print(__doc__, file=sys.stderr)
        return 1

    subcommand, *args = sys.argv[1:]

    # subcommand -> (required operand count, usage line, runner).
    # A count of None means the operands are not checked at all.
    commands = {
        "zip": (
            2,
            "usage: task_utils.py zip SRC_DIR DEST_ZIP",
            lambda a: cmd_zip(a[0], a[1], allow_missing=False),
        ),
        "zip-if-exists": (
            2,
            "usage: task_utils.py zip-if-exists SRC_DIR DEST_ZIP",
            lambda a: cmd_zip(a[0], a[1], allow_missing=True),
        ),
        "clean-project": (
            1,
            "usage: task_utils.py clean-project PROJECT_DIR",
            lambda a: cmd_clean_project(a[0]),
        ),
        "clean-zips": (
            1,
            "usage: task_utils.py clean-zips PROJECT_DIR",
            lambda a: cmd_clean_zips(a[0]),
        ),
        "copy-if-exists": (
            2,
            "usage: task_utils.py copy-if-exists SRC DEST",
            lambda a: cmd_copy_if_exists(a[0], a[1]),
        ),
        "today": (None, "", lambda a: cmd_today()),
    }

    entry = commands.get(subcommand)
    if entry is None:
        print(f"ERROR: unknown subcommand '{subcommand}'", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        return 1

    expected, usage, run = entry
    if expected is not None and len(args) != expected:
        print(usage, file=sys.stderr)
        return 1
    return run(args)
|
||||
|
||||
|
||||
# Script entry point: main()'s integer result becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user