initial commit

This commit is contained in:
2026-06-19 17:31:51 +02:00
commit ef4936b10c
49 changed files with 4554 additions and 0 deletions
+2273
View File
File diff suppressed because it is too large Load Diff
+176
View File
@@ -0,0 +1,176 @@
--[[
affiliation-blocks generate title components
Copyright © 2017–2021 Albert Krewinkel
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
]]
local List = require 'pandoc.List'
local utils = require 'pandoc.utils'
local stringify = utils.stringify
--- Default marks attached to author names.
-- When FORMAT is exactly 'latex' the marks are raw LaTeX snippets; for every
-- other format they are plain text symbols. Each mark is a list of inlines so
-- that it can be passed directly to `pandoc.Superscript`.
-- (The variable used to be declared twice; the redundant forward declaration
-- has been dropped.)
local default_marks = {
  corresponding_author = FORMAT == 'latex'
    and {pandoc.RawInline('latex', '*')}
    -- An empty string here would render an invisible superscript; use the
    -- envelope symbol as the visible corresponding-author mark.
    or {pandoc.Str '✉'},
  equal_contributor = FORMAT == 'latex'
    and {pandoc.RawInline('latex', '$\\dagger{}$')}
    or {pandoc.Str '*'},
}
--- Concatenate a sequence of lists into a single list, placing the items of
-- `elem` between every two consecutive lists.
-- @param lists sequence of lists to join
-- @param elem list of separator items
-- @return a new List; empty when `lists` is empty
local function intercalate(lists, elem)
  local joined = List:new{}
  local total = #lists
  for pos = 1, total do
    -- The separator goes in front of every list except the first one.
    if pos > 1 then
      joined:extend(elem)
    end
    joined:extend(lists[pos])
  end
  return joined
end
--- Check whether the given author is a corresponding author.
-- An author counts as corresponding only when both the `correspondence`
-- and the `email` fields are set.
-- @return the author's `email` value when `correspondence` is truthy,
--   otherwise the (falsy) value of `correspondence` itself
local function is_corresponding_author(author)
  if not author.correspondence then
    return author.correspondence
  end
  return author.email
end
--- Create inlines for a single author (includes all author notes).
-- Returns a function that renders one author as a copy of the author's name
-- followed by a superscript. The superscript holds, in this order and
-- separated by commas: the equal-contributor mark, one entry per item in
-- `author.institute`, and the corresponding-author mark.
-- @param get_mark function mapping a mark name to a list of inlines
local function author_inline_generator (get_mark)
  local function render_author (author)
    local marks = List:new{}

    if author.equal_contributor then
      marks[#marks + 1] = get_mark 'equal_contributor'
    end

    for _, idx in ipairs(author.institute) do
      -- Table values go through `stringify`; anything else is converted
      -- with `tostring`.
      local label
      if type(idx) == 'table' then
        label = stringify(idx)
      else
        label = tostring(idx)
      end
      marks[#marks + 1] = {pandoc.Str(label)}
    end

    if is_corresponding_author(author) then
      marks[#marks + 1] = get_mark 'corresponding_author'
    end

    -- Work on a clone so the name stored in the metadata is not modified.
    local inlines = List.clone(author.name)
    inlines[#inlines + 1] =
      pandoc.Superscript(intercalate(marks, {pandoc.Str ','}))
    return inlines
  end

  return render_author
end
--- Predicate: return the author's `equal_contributor` field unchanged
-- (truthy when the author is flagged as an equal contributor).
local function is_equal_contributor (author)
  local flag = author.equal_contributor
  return flag
end
--- Create equal contributors note.
-- @param authors list of author objects
-- @param mark function mapping a mark name to a list of inlines
-- @return a List holding one paragraph (mark, space, explanatory sentence),
--   or nil when no author is flagged as an equal contributor
local function create_equal_contributors_block(authors, mark)
  local first_equal = List:new(authors):find_if(is_equal_contributor)
  if first_equal then
    return List:new{
      pandoc.Para{
        pandoc.Superscript(mark'equal_contributor'),
        pandoc.Space(),
        pandoc.Str 'These authors contributed equally to this work.',
      }
    }
  end
  return nil
end
--- Generate a block list of all affiliations, marked with arabic numbers.
-- Every affiliation becomes one line: its 1-based position in the list as a
-- superscript, a space, then the affiliation's `name` inlines. Lines are
-- joined with line breaks inside a single paragraph.
-- @return a table holding that one paragraph
local function create_affiliations_blocks(affiliations)
  local function numbered_line (affil, i)
    local prefix = List:new{
      pandoc.Superscript{pandoc.Str(tostring(i))},
      pandoc.Space(),
    }
    return prefix .. affil.name
  end

  local lines = List:new(affiliations):map(numbered_line)
  local paragraph = pandoc.Para(intercalate(lines, {pandoc.LineBreak()}))
  return {paragraph}
end
--- Generate a block element containing the correspondence information.
-- Each corresponding author is rendered as a `mailto:` link whose text is
-- "<name> <<email>>"; the links are joined with ", " and prefixed with the
-- corresponding-author mark and the word "Correspondence:".
-- @param authors list of author objects
-- @param mark function mapping a mark name to a list of inlines
-- @return a table holding one paragraph, or nil when no author is a
--   corresponding author
local function create_correspondence_blocks(authors, mark)
  local links = List:new{}
  for _, author in ipairs(authors) do
    if is_corresponding_author(author) then
      local label = List:new(
        author.name .. List:new{pandoc.Space(), pandoc.Str '<'} ..
        author.email .. List:new{pandoc.Str '>'}
      )
      local target = 'mailto:' .. pandoc.utils.stringify(author.email)
      -- Wrap the link in a table so `intercalate` can treat it as a list.
      links[#links + 1] = {pandoc.Link(label, target)}
    end
  end

  if #links == 0 then
    return nil
  end

  local prefix = List:new{
    pandoc.Superscript(mark'corresponding_author'),
    pandoc.Space(),
    pandoc.Str'Correspondence:',
    pandoc.Space()
  }
  local separator = List:new{pandoc.Str',', pandoc.Space()}
  local paragraph = pandoc.Para(prefix .. intercalate(links, separator))
  return {paragraph}
end
--- Generate a list of inlines containing all authors.
-- Authors are rendered with `author_inline_generator` and joined as
-- "A", "A and B" or "A, B, and C".
-- @param authors list of author objects (may be nil or empty)
-- @param mark function mapping a mark name to a list of inlines
-- @return a List of inlines; empty when there are no authors
local function create_authors_inlines(authors, mark)
  local inlines = List:new(authors):map(author_inline_generator(mark))

  -- Take the final author off the list; it is appended after the conjunction.
  local last_author = table.remove(inlines)
  if last_author == nil then
    -- No authors at all: nothing to render (previously this raised an error).
    return List:new{}
  end

  local and_str = List:new{pandoc.Space(), pandoc.Str'and', pandoc.Space()}
  local result = intercalate(inlines, {pandoc.Str ',', pandoc.Space()})

  -- `#inlines` now counts the authors that precede the last one.
  if #inlines == 1 then
    -- Exactly two authors: "A and B", without a comma.
    result:extend(and_str)
  elseif #inlines > 1 then
    -- Three or more authors: "A, B, and C".
    result:extend(List:new{pandoc.Str ","} .. and_str)
  end

  result:extend(last_author)
  return result
end
--- Document-level filter function.
-- Prepends the equal-contributors note, the affiliations paragraph and the
-- correspondence paragraph to the document body and replaces the `author`
-- metadata with formatted values.
local function add_author_info (doc)
  local meta = doc.meta

  local function mark (mark_name)
    return default_marks[mark_name]
  end

  local body = List:new{}
  body:extend(create_equal_contributors_block(doc.meta.author, mark) or {})
  body:extend(create_affiliations_blocks(doc.meta.institute) or {})
  body:extend(create_correspondence_blocks(doc.meta.author, mark) or {})
  body:extend(doc.blocks)

  -- Overwrite authors with formatted values. Most formats get a single,
  -- formatted run of inlines. LaTeX output, however, looks nicer if the
  -- authors are provided as a list.
  if FORMAT:match 'latex' then
    meta.author =
      pandoc.MetaList(doc.meta.author):map(author_inline_generator(mark))
  else
    meta.author =
      pandoc.MetaInlines(create_authors_inlines(doc.meta.author, mark))
  end

  -- Institute info is now baked into the affiliations block.
  meta.institute = nil

  return pandoc.Pandoc(body, meta)
end

return {
  {
    Pandoc = add_author_info
  }
}
@@ -0,0 +1,2 @@
authors:
- name: Anonymous
@@ -0,0 +1,27 @@
authors:
- name: John Doe
affiliations:
- ref: jdct
corresponding: true
email: john.doe@jdct.edu
orcid: 0000-1111-2222-3333
equal-contributor: true
- name: John Roe
affiliations:
- ref: jdct
orcid: 0000-3333-2222-1111
- name: Jane Roe
affiliations:
- ref: jdct
- ref: iot
orcid: 0000-2222-1111-3333
equal-contributor: true
affiliations:
- id: jdct
name: John Doe Center for Technology, John Doe University, Doetown, Germany.
- id: iot
name: Institute of Technology, John Doe University, Doetown, Germany.
filters:
- ../resources/authors_block/authors-block.lua
+71
View File
@@ -0,0 +1,71 @@
--[[
authors-block affiliations block extension for quarto
Copyright (c) 2023 Lorenz A. Kapsner
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
]]
local List = require 'pandoc.List'
-- [import]
local from_utils = require "utils"
local normalize_affiliations = from_utils.normalize_affiliations
local normalize_authors = from_utils.normalize_authors
local normalize_latex_authors = from_utils.normalize_latex_authors
local from_authors = require "from_author_info_blocks"
local default_marks = from_authors.default_marks
local create_equal_contributors_block = from_authors.create_equal_contributors_block
local create_affiliations_blocks = from_authors.create_affiliations_blocks
local create_correspondence_blocks = from_authors.create_correspondence_blocks
local is_corresponding_author = from_authors.is_corresponding_author
local author_inline_generator = from_authors.author_inline_generator
local create_authors_inlines = from_authors.create_authors_inlines
-- [/import]
-- This is the main part: the global `Pandoc` filter function.
-- It prepends the equal-contributors note, the affiliations paragraph and the
-- correspondence paragraph to the document body, normalizes the author and
-- affiliation metadata, and replaces `meta.author` with formatted inlines.
function Pandoc(doc)
  local meta = doc.meta

  -- Support both `authors:` and `author:` YAML keys; return the document
  -- unchanged when there is no author list whose first entry has a name.
  local authors = meta.authors or meta.author
  if authors == nil then
    return doc
  end
  local first_author = authors[1]
  if first_author == nil or first_author.name == nil then
    return doc
  end
  meta.authors = List:new(authors)

  local function mark (mark_name)
    return default_marks[mark_name]
  end

  -- The generated blocks are built from the metadata as it is at this point,
  -- i.e. before the normalization further down, and are placed in front of
  -- the original document blocks.
  local blocks = List:new{}
  blocks:extend(create_equal_contributors_block(meta.authors, mark) or {})
  blocks:extend(create_affiliations_blocks(meta.affiliations) or {})
  blocks:extend(create_correspondence_blocks(meta.authors, mark) or {})
  blocks:extend(doc.blocks)

  -- Record on every author the result of the corresponding-author check.
  for _, author in ipairs(meta.authors) do
    author.test = is_corresponding_author(author)
  end

  meta.affiliations = normalize_affiliations(meta.affiliations)
  meta.author = meta.authors:map(normalize_authors(meta.affiliations))

  -- Overwrite authors with formatted values: a single, formatted run of
  -- inlines is stored in `meta.author` for every output format.
  meta.author = pandoc.MetaInlines(create_authors_inlines(meta.author, mark))

  -- Institute info is now baked into the affiliations block.
  meta.affiliations = nil

  return pandoc.Pandoc(blocks, meta)
end
@@ -0,0 +1,201 @@
-- https://github.com/pandoc/lua-filters/commit/ca72210b453cc0d045360e0ae36448d019d7dfbf
--[[
affiliation-blocks generate title components
Copyright © 2017–2021 Albert Krewinkel
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
]]
-- from @kapsner
-- [import]
local from_utils = require "utils"
local has_key = from_utils.has_key
-- [/import]
local M = {}
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
local List = require 'pandoc.List'
local utils = require 'pandoc.utils'
local stringify = utils.stringify
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Default marks attached to author names.
-- When FORMAT is exactly 'latex' the marks are raw LaTeX snippets; for every
-- other format they are plain text symbols. Each mark is a list of inlines so
-- that it can be passed directly to `pandoc.Superscript`.
-- (The variable used to be declared twice; the redundant forward declaration
-- has been dropped.)
local default_marks = {
  corresponding_author = FORMAT == 'latex'
    and {pandoc.RawInline('latex', '*')}
    -- An empty string here would render an invisible superscript; use the
    -- envelope symbol as the visible corresponding-author mark.
    or {pandoc.Str '✉'},
  equal_contributor = FORMAT == 'latex'
    and {pandoc.RawInline('latex', '$\\dagger{}$')}
    or {pandoc.Str '*'},
}
M.default_marks = default_marks
-- modified by @kapsner
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Predicate for equal contributors.
-- @return the value of `author.attributes["equal-contributor"]`, or nil when
--   the author has no `attributes` entry
local function is_equal_contributor(author)
  if not has_key(author, "attributes") then
    return nil
  end
  return author.attributes["equal-contributor"]
end
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Create equal contributors note.
-- @param authors list of author objects
-- @param mark function mapping a mark name to a list of inlines
-- @return a List holding one paragraph (mark, space, explanatory sentence),
--   or nil when no author is flagged as an equal contributor
local function create_equal_contributors_block(authors, mark)
  local first_equal = List:new(authors):find_if(is_equal_contributor)
  if first_equal then
    return List:new{
      pandoc.Para{
        pandoc.Superscript(mark'equal_contributor'),
        pandoc.Space(),
        pandoc.Str 'These authors contributed equally to this work.',
      }
    }
  end
  return nil
end
M.create_equal_contributors_block = create_equal_contributors_block
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Concatenate a sequence of lists into a single list, placing the items of
-- `elem` between every two consecutive lists.
-- @param lists sequence of lists to join
-- @param elem list of separator items
-- @return a new List; empty when `lists` is empty
local function intercalate(lists, elem)
  local joined = List:new{}
  local total = #lists
  for pos = 1, total do
    -- The separator goes in front of every list except the first one.
    if pos > 1 then
      joined:extend(elem)
    end
    joined:extend(lists[pos])
  end
  return joined
end
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Check whether the given author is a corresponding author.
-- @return the author's `email` value when the author has an `attributes`
--   entry whose "corresponding" field is truthy; nil otherwise
local function is_corresponding_author(author)
  local flagged = has_key(author, "attributes")
    and author.attributes["corresponding"]
  if flagged then
    return author.email
  end
  return nil
end
M.is_corresponding_author = is_corresponding_author
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Generate a block element containing the correspondence information.
-- Each corresponding author is rendered as a `mailto:` link whose text is
-- "<name> <<email>>"; the links are joined with ", " and prefixed with the
-- corresponding-author mark and the word "Correspondence:".
-- @param authors list of author objects
-- @param mark function mapping a mark name to a list of inlines
-- @return a table holding one paragraph, or nil when no author is a
--   corresponding author
local function create_correspondence_blocks(authors, mark)
  local links = List:new{}
  for _, author in ipairs(authors) do
    if is_corresponding_author(author) then
      local label = List:new(
        -- modified by @kapsner: the name inlines are read from `name.literal`
        author.name.literal .. List:new{pandoc.Space(), pandoc.Str '<'} ..
        author.email .. List:new{pandoc.Str '>'}
      )
      local target = 'mailto:' .. utils.stringify(author.email)
      -- Wrap the link in a table so `intercalate` can treat it as a list.
      links[#links + 1] = {pandoc.Link(label, target)}
    end
  end

  if #links == 0 then
    return nil
  end

  local prefix = List:new{
    pandoc.Superscript(mark'corresponding_author'),
    pandoc.Space(),
    pandoc.Str'Correspondence:',
    pandoc.Space()
  }
  local separator = List:new{pandoc.Str',', pandoc.Space()}
  local paragraph = pandoc.Para(prefix .. intercalate(links, separator))
  return {paragraph}
end
M.create_correspondence_blocks = create_correspondence_blocks
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Create inlines for a single author (includes all author notes).
-- Returns a function that builds a superscript holding, in this order and
-- separated by commas: the equal-contributor mark, one entry per item in
-- `author.affiliations`, and the corresponding-author mark.
-- When FORMAT matches 'latex' the superscript is appended in place to
-- `author.name.literal` and the author object itself is returned; otherwise
-- a copy of `author.name.literal` with the superscript appended is returned.
-- @param get_mark function mapping a mark name to a list of inlines
local function author_inline_generator (get_mark)
  local function render_author (author)
    local marks = List:new{}

    -- modified by @kapsner: the flag lives under `author.attributes`
    if has_key(author, "attributes")
        and author.attributes["equal-contributor"] then
      marks[#marks + 1] = get_mark 'equal_contributor'
    end

    for _, idx in ipairs(author.affiliations) do
      -- Table values go through `stringify`; anything else is converted
      -- with `tostring`.
      local label
      if type(idx) == 'table' then
        label = stringify(idx)
      else
        label = tostring(idx)
      end
      marks[#marks + 1] = {pandoc.Str(label)}
    end

    if is_corresponding_author(author) then
      marks[#marks + 1] = get_mark 'corresponding_author'
    end

    local superscript = pandoc.Superscript(intercalate(marks, {pandoc.Str ','}))

    -- modified by @kapsner
    if FORMAT:match 'latex' then
      -- Mutates the author's own name inlines.
      local literal = author.name.literal
      literal[#literal + 1] = superscript
      return author
    end

    local inlines = List.clone(author.name.literal)
    inlines[#inlines + 1] = superscript
    return inlines
  end

  return render_author
end
M.author_inline_generator = author_inline_generator
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Generate a list of inlines containing all authors.
-- Authors are rendered with `author_inline_generator`; all but the last are
-- joined with ", ". Two authors are connected by " and ", three or more by
-- ", and " before the last author.
-- @param authors list of author objects
-- @param mark function mapping a mark name to a list of inlines
local function create_authors_inlines(authors, mark)
  local rendered = List:new(authors):map(author_inline_generator(mark))

  -- Detach the final author; it is appended after the conjunction.
  local final_author = rendered[#rendered]
  rendered[#rendered] = nil

  local joined = intercalate(rendered, {pandoc.Str ',', pandoc.Space()})
  local conjunction = List:new{pandoc.Space(), pandoc.Str'and', pandoc.Space()}

  local author_count = #authors
  if author_count == 2 then
    joined:extend(conjunction)
  elseif author_count > 2 then
    joined:extend(List:new{pandoc.Str ","} .. conjunction)
  end

  joined:extend(final_author)
  return joined
end
M.create_authors_inlines = create_authors_inlines
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/author-info-blocks/author-info-blocks.lua
--- Generate a block list of all affiliations.
-- Every affiliation becomes one line: its `number` field as a superscript, a
-- space, then the affiliation's `name` inlines. Lines are joined with line
-- breaks inside a single paragraph.
-- @return a table holding that one paragraph
local function create_affiliations_blocks(affiliations)
  local function numbered_line (affil)
    local prefix = List:new{
      pandoc.Superscript{pandoc.Str(affil.number)},
      pandoc.Space(),
    }
    return prefix .. affil.name
  end

  local lines = List:new(affiliations):map(numbered_line)
  local paragraph = pandoc.Para(intercalate(lines, {pandoc.LineBreak()}))
  return {paragraph}
end
M.create_affiliations_blocks = create_affiliations_blocks
return M
@@ -0,0 +1,59 @@
--[[
ScholarlyMeta normalize author/affiliation meta variables
Copyright (c) 2017-2021 Albert Krewinkel, Robert Winkler
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
]]
local List = require 'pandoc.List'
local utils = require 'pandoc.utils'
local stringify = utils.stringify
local M = {}
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/scholarly-metadata/scholarly-metadata.lua
--- Returns a function which checks whether an object has the given ID.
-- The returned predicate compares the object's `id` field with `id` using `==`.
local function has_id(id)
  local function matches (x)
    return x.id == id
  end
  return matches
end
-- taken from https://github.com/pandoc/lua-filters/blob/1660794b991c3553968beb993f5aabb99b317584/scholarly-metadata/scholarly-metadata.lua
--- Resolve institute placeholders to the index of the matching institute.
-- @param institute nil, a single string/number reference, or a list of
--   references
-- @param known_institutes List of institute objects carrying `id` and
--   `index` fields
-- @return a List with one `pandoc.MetaString` per reference, holding the
--   stringified `index` of the institute whose `id` equals the stringified
--   reference
-- Raises an error when a reference does not match any known institute.
local function resolve_institutes(institute, known_institutes)
  local unresolved_institutes
  if institute == nil then
    unresolved_institutes = {}
  elseif type(institute) == "string" or type(institute) == "number" then
    unresolved_institutes = {institute}
  else
    unresolved_institutes = institute
  end

  local result = List:new{}
  for i, inst in ipairs(unresolved_institutes) do
    -- this has been modified by @kapsner
    --result[i] =
    --  known_institutes[tonumber(inst)] or
    --  known_institutes:find_if(has_id(pandoc.utils.stringify(inst))) or
    --  to_named_object(inst)
    local inst_id = stringify(inst)
    -- Keep the lookup result local; it used to leak into the global scope.
    local known = known_institutes:find_if(has_id(inst_id))
    if known == nil then
      -- Fail with a readable message rather than an opaque
      -- "attempt to index a nil value".
      error(string.format(
        "affiliation reference '%s' does not match the id of any known affiliation",
        inst_id
      ))
    end
    result[i] = pandoc.MetaString(stringify(known.index))
  end
  return result
end
M.resolve_institutes = resolve_institutes
return M
+62
View File
@@ -0,0 +1,62 @@
--[[
authors-block affiliations block extension for quarto
Copyright (c) 2023 Lorenz A. Kapsner
Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted, provided that the above copyright notice
and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
]]
local List = require 'pandoc.List'
local utils = require 'pandoc.utils'
local stringify = utils.stringify
-- [import]
local from_scholarly = require "from_scholarly_metadata"
local resolve_institutes = from_scholarly.resolve_institutes
-- [/import]
local M = {}
-- from @kapsner
--- Annotate every affiliation with its position and a stringified id.
-- Each affiliation object is modified in place: `index` is set to its
-- 1-based position in the list and `id` is replaced by a MetaString of its
-- stringified previous value.
-- @return a List of the (modified) affiliation objects
local function normalize_affiliations(affiliations)
  local function annotate (affil, position)
    affil.index = pandoc.MetaInlines(pandoc.Str(tostring(position)))
    affil.id = pandoc.MetaString(stringify(affil.id))
    return affil
  end
  return List:new(affiliations):map(annotate)
end
M.normalize_affiliations = normalize_affiliations
-- from https://stackoverflow.com/a/2282547
--- Report whether `set[key]` is anything other than nil.
-- Note that a stored `false` value still counts as present.
local function has_key(set, key)
  local value = set[key]
  return value ~= nil
end
M.has_key = has_key
-- from @kapsner
--- Build a function that normalizes a single author object in place.
-- The returned function sets `auth.id` to a MetaString of the stringified
-- author name and replaces `auth.affiliations` with the result of
-- `resolve_institutes` against the given affiliations.
-- @param affiliations list of known affiliations passed to `resolve_institutes`
-- @return function taking an author object and returning that same object
local function normalize_authors(affiliations)
  local function normalize_one (auth)
    local resolved = resolve_institutes(auth.affiliations, affiliations)
    auth.id = pandoc.MetaString(stringify(auth.name))
    auth.affiliations = resolved
    return auth
  end
  return normalize_one
end
M.normalize_authors = normalize_authors
return M
Binary file not shown.
+19
View File
@@ -0,0 +1,19 @@
@article{knuth84,
author = {Knuth, Donald E.},
title = {Literate Programming},
year = {1984},
issue_date = {May 1984},
publisher = {Oxford University Press, Inc.},
address = {USA},
volume = {27},
number = {2},
issn = {0010-4620},
url = {https://doi.org/10.1093/comjnl/27.2.97},
doi = {10.1093/comjnl/27.2.97},
journal = {Comput. J.},
month = may,
pages = {97--111},
numpages = {15}
}
+213
View File
@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""
split_titlepage.py
Splits a Quarto-manuscript-rendered .docx into two files:
- <output>-titlepage.docx : title, authors, affiliations (front matter)
- <output>-body.docx : everything from the first section break onward
Why this works:
Quarto's manuscript/scholarly docx title-block template emits a section
break (<w:sectPr> inside a paragraph's <w:pPr>) immediately after the
title-block content, before the body begins. This script finds that first
section break in word/document.xml and splits there.
Safety check:
Before writing output, the script verifies the "title page" portion
actually contains the document's title and at least one author name
(read from the source .qmd YAML front matter, or passed explicitly).
If this check fails, the script aborts with a clear error rather than
silently producing a wrong split - getting this wrong has real
deanonymization consequences for blind peer review.
This script is self-contained (standard library only: zipfile, re, shutil)
and does NOT depend on any external docx-editing toolkit. It works directly
with the docx ZIP container, replacing only word/document.xml in each of
two copies of the original archive.
Usage:
python split_titlepage.py INPUT.docx OUTDIR \
--title "Title" --author "Jane Doe" [--author "John Q. Doe" ...]
Exit codes:
0 success
1 split point not found
2 safety check failed (title/author not found in detected title page)
3 other error (bad args, file not found)
"""
import argparse
import re
import shutil
import sys
import zipfile
from pathlib import Path
# Matches one complete <w:p>...</w:p> paragraph that contains a <w:sectPr>
# element. The tempered dot `(?:(?!</w:p>).)*?` keeps the search for
# <w:sectPr> from running past the end of the paragraph it started in.
#
# A bare sectPr also legitimately appears as the LAST element of body,
# as a direct child of <w:body> (not inside a paragraph) -- that one
# describes the final/only section and is NOT a split point on its own.
# We only want sectPr that appears INSIDE a paragraph, which marks
# an explicit section break before the end of the document.
SECTPR_PATTERN = re.compile(
    r'<w:p\b[^>]*>(?:(?!</w:p>).)*?<w:sectPr\b.*?</w:sectPr>.*?</w:p>',
    flags=re.DOTALL,
)
def find_first_section_break(document_xml: str) -> "tuple[int, int] | None":
    """Locate the first paragraph that carries a section break.

    Args:
        document_xml: XML text to search with ``SECTPR_PATTERN``.

    Returns:
        The ``(start, end)`` character offsets of the first match, or
        ``None`` when the pattern does not match anywhere.
    """
    hit = SECTPR_PATTERN.search(document_xml)
    return hit.span() if hit else None
def strip_tags_for_text_check(xml_fragment: str) -> str:
    """Reduce an XML fragment to plain text for a containment check.

    Every ``<...>`` tag is replaced by a single space, runs of whitespace
    are then collapsed to one space, and the result is stripped. This is
    deliberately crude: it is not an XML parser.
    """
    without_tags = re.sub(r'<[^>]+>', ' ', xml_fragment)
    collapsed = re.sub(r'\s+', ' ', without_tags)
    return collapsed.strip()
def read_document_xml(docx_path: Path) -> str:
    """Return ``word/document.xml`` from the given docx, decoded as UTF-8.

    Raises:
        KeyError: if the archive has no ``word/document.xml`` member
            (raised by ``ZipFile.read``).
    """
    with zipfile.ZipFile(docx_path, "r") as archive:
        raw = archive.read("word/document.xml")
        return raw.decode("utf-8")
def write_docx_with_replaced_document_xml(
    source_docx: Path, new_document_xml: str, dest_docx: Path
) -> None:
    """Copy source_docx to dest_docx, replacing only word/document.xml.

    Every member of the source archive is written to the destination under
    its original ``ZipInfo``; the content of ``word/document.xml`` is
    swapped for ``new_document_xml`` encoded as UTF-8.
    """
    with zipfile.ZipFile(source_docx, "r") as src, \
            zipfile.ZipFile(dest_docx, "w", zipfile.ZIP_DEFLATED) as dst:
        for member in src.infolist():
            original = src.read(member.filename)
            is_document = member.filename == "word/document.xml"
            payload = new_document_xml.encode("utf-8") if is_document else original
            dst.writestr(member, payload)
def main():
    """Command-line entry point: split a rendered docx at its first section break.

    Exits with status 3 for a missing/invalid input, 1 when no section break
    is found, and 2 when the safety check fails (unless downgraded to a
    warning by ``--allow-no-author-match``).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_docx", type=Path, help="Rendered anonymized .docx")
    parser.add_argument("outdir", type=Path, help="Directory to write split output into")
    parser.add_argument("--title", required=True, help="Document title, for the safety check")
    parser.add_argument(
        "--author", action="append", default=[],
        help="Author name for the safety check; repeat flag for multiple authors"
    )
    parser.add_argument(
        "--authors", default=None,
        help="Comma-separated author names for the safety check "
             "(simpler alternative to repeating --author from a Taskfile)"
    )
    parser.add_argument(
        "--basename", default=None,
        help="Base name for output files (default: input filename stem)"
    )
    parser.add_argument(
        "--allow-no-author-match", action="store_true",
        help="Downgrade the author-match safety check from fatal to a warning"
    )
    args = parser.parse_args()

    # Merge the repeated --author flags with the comma-separated --authors value.
    expected_authors = list(args.author)
    if args.authors:
        expected_authors += [
            name.strip() for name in args.authors.split(",") if name.strip()
        ]
    args.author = expected_authors

    if not args.input_docx.exists():
        print(f"ERROR: input file not found: {args.input_docx}", file=sys.stderr)
        sys.exit(3)

    basename = args.basename or args.input_docx.stem
    args.outdir.mkdir(parents=True, exist_ok=True)

    print(f"Splitting title page from: {args.input_docx}")

    try:
        xml = read_document_xml(args.input_docx)
    except KeyError:
        print("ERROR: word/document.xml not found inside the docx - is this a valid .docx?",
              file=sys.stderr)
        sys.exit(3)

    # Work only on the inner content of <w:body>; everything around it is
    # copied unchanged into both output documents.
    body_match = re.search(r"<w:body\b[^>]*>(.*)</w:body>", xml, re.DOTALL)
    if not body_match:
        print("ERROR: could not locate <w:body> in document.xml", file=sys.stderr)
        sys.exit(3)
    body_start, body_end = body_match.span(1)
    body_content = xml[body_start:body_end]

    split = find_first_section_break(body_content)
    if split is None:
        print(
            "ERROR: no section break found in document body. "
            "Expected Quarto's manuscript docx template to emit a "
            "section break after the title block. Aborting split — "
            "check that the source document actually came from the "
            "manuscript title-block template.",
            file=sys.stderr,
        )
        sys.exit(1)

    # The section-break paragraph itself stays with the title page.
    split_end = split[1]
    titlepage_fragment = body_content[:split_end]
    body_fragment = body_content[split_end:]

    # --- Safety check ---
    titlepage_text = strip_tags_for_text_check(titlepage_fragment)
    wanted_title = args.title.strip()
    title_found = wanted_title != "" and wanted_title in titlepage_text
    authors_found = [
        name for name in args.author
        if name.strip() and name.strip() in titlepage_text
    ]

    print(f" Detected title page text (first 200 chars): {titlepage_text[:200]!r}")

    problems = []
    if not title_found:
        problems.append(f"title {args.title!r} not found in detected title-page text")
    if args.author and not authors_found:
        problems.append(
            f"none of the expected authors {args.author!r} found in detected title-page text"
        )

    if not problems:
        print(f" Safety check passed (title found: {title_found}, authors found: {authors_found})")
    else:
        header = (
            "SAFETY CHECK FAILED: the detected 'title page' section does not "
            "appear to contain the expected title/author metadata:\n - "
        )
        footer = (
            "\nThis usually means the section-break detection found the "
            "wrong split point. Refusing to write output to avoid a silent "
            "deanonymization risk."
        )
        msg = header + "\n - ".join(problems) + footer
        if not args.allow_no_author_match:
            print(f"ERROR: {msg}", file=sys.stderr)
            sys.exit(2)
        # NOTE(review): the flag downgrades every collected problem, including
        # a missing title, although its help text only mentions the
        # author-match check — confirm this is intended.
        print(f"WARNING: {msg}\n(Continuing anyway because --allow-no-author-match was set.)",
              file=sys.stderr)

    def build_full_xml(body_inner: str) -> str:
        """Re-embed a body fragment between the original document prefix and suffix."""
        return xml[:body_start] + body_inner + xml[body_end:]

    titlepage_out = args.outdir / f"{basename}-titlepage.docx"
    body_out = args.outdir / f"{basename}-body.docx"

    # Title page first, then body.
    for fragment, target in (
        (titlepage_fragment, titlepage_out),
        (body_fragment, body_out),
    ):
        write_docx_with_replaced_document_xml(
            args.input_docx, build_full_xml(fragment), target
        )

    print(f"Done.\n Title page -> {titlepage_out}\n Body -> {body_out}")


if __name__ == "__main__":
    main()
+168
View File
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
"""
task_utils.py — cross-platform file operations for the Taskfile pipeline.
Replaces shell-specific commands (PowerShell Compress-Archive, Remove-Item,
etc.) with plain Python stdlib calls that behave identically on Windows,
Linux, and macOS. Called from Task as:
python ../resources/scripts/task_utils.py <subcommand> [args...]
Subcommands:
zip SRC_DIR DEST_ZIP
Zip the contents of SRC_DIR into DEST_ZIP (overwrites if it exists).
Fails with a clear message (exit 1) if SRC_DIR doesn't exist.
zip-if-exists SRC_DIR DEST_ZIP
Same as `zip`, but exits 0 with a warning (no error) if SRC_DIR
doesn't exist, instead of failing. Used for optional things like
a project's data/ folder.
clean-project PROJECT_DIR
Remove _output/, .quarto/, and any *_files/*_cache directories
found anywhere under PROJECT_DIR. Safe to call even if nothing
exists yet.
copy-if-exists SRC DEST
Copy a single file from SRC to DEST if SRC exists; otherwise
print a warning and exit 0 (does not fail the pipeline).
today
Print today's date as YYYY-MM-DD (used for the finalized/ folder
name). No platform-specific date command needed.
"""
import shutil
import sys
import zipfile
from datetime import date
from pathlib import Path
def cmd_zip(src_dir: str, dest_zip: str, allow_missing: bool) -> int:
    """Zip the files under ``src_dir`` into ``dest_zip``.

    Args:
        src_dir: Directory whose files are archived (recursively). Entries
            are stored with paths relative to this directory.
        dest_zip: Archive to create; an existing file is replaced and
            missing parent directories are created.
        allow_missing: When true, a missing/non-directory or file-less
            ``src_dir`` produces a warning and exit status 0, not an
            error with exit status 1.

    Returns:
        0 on success (or on a tolerated skip), 1 on error.
    """
    src = Path(src_dir)
    dest = Path(dest_zip)

    if not src.is_dir():
        msg = f"'{src}' does not exist or is not a directory"
        if allow_missing:
            print(f"WARNING: {msg} — skipping zip of {dest}")
            return 0
        print(f"ERROR: {msg} — did you render first?", file=sys.stderr)
        return 1

    # Leave the destination archive out of the input set: when DEST_ZIP lives
    # inside SRC_DIR and already exists, it would otherwise be added to
    # itself while it is being written. Sorting keeps the entry order
    # independent of the directory listing order.
    dest_resolved = dest.resolve()
    files = sorted(
        p for p in src.rglob("*")
        if p.is_file() and p.resolve() != dest_resolved
    )
    if not files:
        msg = f"'{src}' exists but contains no files"
        if allow_missing:
            print(f"WARNING: {msg} — skipping zip of {dest}")
            return 0
        print(f"ERROR: {msg}", file=sys.stderr)
        return 1

    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        dest.unlink()

    with zipfile.ZipFile(dest, "w", zipfile.ZIP_DEFLATED) as zf:
        for f in files:
            zf.write(f, f.relative_to(src))

    print(f"Created {dest} ({len(files)} files)")
    return 0
def cmd_clean_project(project_dir: str) -> int:
    """Delete render artefacts below ``project_dir``.

    Removes ``_output`` and ``.quarto`` directly under the project directory
    (when present), then every directory anywhere below it whose name matches
    ``*_files`` or ``*_cache``. Removal errors are ignored. Always returns 0.
    """
    root = Path(project_dir)

    for target in (root / "_output", root / ".quarto"):
        if not target.exists():
            continue
        shutil.rmtree(target, ignore_errors=True)
        print(f"Removed {target}")

    for pattern in ("*_files", "*_cache"):
        for candidate in root.rglob(pattern):
            if not candidate.is_dir():
                continue
            shutil.rmtree(candidate, ignore_errors=True)
            print(f"Removed {candidate}")

    return 0
def cmd_clean_zips(project_dir: str) -> int:
    """Delete every ``*.zip`` file found anywhere below ``project_dir``.

    Files are removed with ``Path.unlink``; ``shutil.rmtree`` only works on
    directories, so the previous implementation silently failed on every zip
    while still reporting it as removed. Removal stays best-effort: a file
    that cannot be deleted is reported on stderr and skipped.

    Returns:
        Always 0.
    """
    root = Path(project_dir)
    # Collect matches first so deletions cannot interfere with the traversal.
    for match in list(root.rglob("*.zip")):
        if not match.is_file():
            continue
        try:
            match.unlink()
        except OSError as exc:
            print(f"WARNING: could not remove {match}: {exc}", file=sys.stderr)
            continue
        print(f"Removed {match}")
    return 0
def cmd_copy_if_exists(src: str, dest: str) -> int:
    """Copy ``src`` to ``dest`` when ``src`` exists.

    A missing source only produces a warning. Missing parent directories of
    ``dest`` are created, and the copy is made with ``shutil.copy2``.
    Always returns 0.
    """
    src_path, dest_path = Path(src), Path(dest)

    if src_path.exists():
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
        print(f"Copied {src_path} -> {dest_path}")
    else:
        print(f"WARNING: '{src_path}' not found, skipping copy")

    return 0
def cmd_today() -> int:
    """Print today's date in ISO format (YYYY-MM-DD) and return 0."""
    stamp = date.today().isoformat()
    print(stamp)
    return 0
def main() -> int:
    """Dispatch ``sys.argv`` to the matching subcommand.

    Returns:
        The subcommand's exit status, or 1 when no subcommand is given, the
        subcommand is unknown, or it receives the wrong number of arguments.
    """
    argv = sys.argv
    if len(argv) < 2:
        print(__doc__, file=sys.stderr)
        return 1

    subcommand, args = argv[1], argv[2:]

    # `today` takes no arguments and performs no argument-count check.
    if subcommand == "today":
        return cmd_today()

    # subcommand -> (required argument count, usage line, handler)
    dispatch = {
        "zip": (
            2,
            "usage: task_utils.py zip SRC_DIR DEST_ZIP",
            lambda a: cmd_zip(a[0], a[1], allow_missing=False),
        ),
        "zip-if-exists": (
            2,
            "usage: task_utils.py zip-if-exists SRC_DIR DEST_ZIP",
            lambda a: cmd_zip(a[0], a[1], allow_missing=True),
        ),
        "clean-project": (
            1,
            "usage: task_utils.py clean-project PROJECT_DIR",
            lambda a: cmd_clean_project(a[0]),
        ),
        "clean-zips": (
            1,
            "usage: task_utils.py clean-zips PROJECT_DIR",
            lambda a: cmd_clean_zips(a[0]),
        ),
        "copy-if-exists": (
            2,
            "usage: task_utils.py copy-if-exists SRC DEST",
            lambda a: cmd_copy_if_exists(a[0], a[1]),
        ),
    }

    entry = dispatch.get(subcommand)
    if entry is None:
        print(f"ERROR: unknown subcommand '{subcommand}'", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        return 1

    arity, usage, handler = entry
    if len(args) != arity:
        print(usage, file=sys.stderr)
        return 1
    return handler(args)


if __name__ == "__main__":
    sys.exit(main())