diff --git a/docs/src/Submakefile b/docs/src/Submakefile index d3102f8ecd5..c221572f6df 100644 --- a/docs/src/Submakefile +++ b/docs/src/Submakefile @@ -573,7 +573,7 @@ $(DOC_OUT_HTML)/pdf/index.html: $(PDF_TARGETS) ../scripts/make-docs-pdf-index @mkdir -p $(dir $@) $(Q)../scripts/make-docs-pdf-index -htmldocs: svgs_made_from_dots .htmldoc-stamp checkref_en +htmldocs: svgs_made_from_dots .htmldoc-stamp .dedup-images-stamp checkref_en # When translations are enabled, the .adoc files in $(L)/ are produced by # the translateddocs target (po4a). Teach make how to ask for them: the @@ -595,6 +595,13 @@ endif .htmldoc-stamp: .copy-asciidoc-stamp $(DOC_DIR)/.gen_complist-stamp $(HTML_TARGETS) .images-stamp .include-stamp $(DOC_OUT_HTML)/asciidoctor.css $(DOC_OUT_HTML)/rouge-github.css .lang-switcher-stamp touch $@ +# Collapse byte-identical images into a shared image/ tree and rewrite refs. +# Runs last; the tool preserves HTML mtimes so a second `make htmldocs` is a +# no-op. +.dedup-images-stamp: $(DOC_SRCDIR)/tools/dedup-images.py .htmldoc-stamp + $(Q)python3 $(DOC_SRCDIR)/tools/dedup-images.py --html-root $(DOC_OUT_HTML) --apply + @touch $@ + # Inject the whole-document sidebar/topbar and grey out missing language- # switcher entries. Runs last (depends on every HTML target) and is # idempotent. Gated on BUILD_DOCS_HTML, not translations: the sidebar comes @@ -705,7 +712,7 @@ checkref_en: $(DOC_DIR)/.checkref-english-stamp # (w3c-linkchecker disables file:// URIs), so the tree may carry accumulated # broken links. Report them without breaking the build for now; drop this # flag once the backlog is cleared so regressions fail the build again. 
-$(DOC_DIR)/.checkref-english-stamp: $(DOC_TARGETS_HTML_EN) $(DOC_OUT_HTML)/en/index.html $(DOC_OUT_HTML)/en/gcode.html .htmldoc-stamp +$(DOC_DIR)/.checkref-english-stamp: $(DOC_TARGETS_HTML_EN) $(DOC_OUT_HTML)/en/index.html $(DOC_OUT_HTML)/en/gcode.html .htmldoc-stamp .dedup-images-stamp $(DOC_SRCDIR)/checkref --warn-on-failure English $(filter %.html,$^) @touch $@ @@ -1313,6 +1320,7 @@ docclean: -rm -f $(DOC_SRCDIR)/*/*.html -rm -rf $(DOC_FONT_DIR) -rm -f .htmldoc-stamp + -rm -f .dedup-images-stamp -rm -f .copy-asciidoc-stamp -rm -f .adoc-images-stamp -rm -f .html-images-stamp diff --git a/docs/src/tools/dedup-images.py b/docs/src/tools/dedup-images.py new file mode 100644 index 00000000000..d7be5e83858 --- /dev/null +++ b/docs/src/tools/dedup-images.py @@ -0,0 +1,377 @@ +#!/usr/bin/env python3 +# dedup-images.py -- deduplicate images in the built LinuxCNC HTML docs tree. +# +# The build copies every referenced image into every language tree (and every +# topic that references a shared image), storing the same bytes many times. +# This rewrites the tree to: +# html/image/ generic images, one copy per unique content +# html/<lang>/image/ only the images a language actually translates +# and rewrites every <img src> and click-to-enlarge <a href> to match. Images +# are matched by SHA-256, so byte-identical copies collapse to a single file. +# +# Dry-run by default; --apply rewrites in place; --check only verifies refs. +# After --apply it re-resolves every reference and fails if any is broken. It +# is idempotent and touches only the output tree, so it can run post-build. + +import argparse +import hashlib +import os +import re +import sys + +IMAGE_EXT = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp') +# Refs the build leaves alone: external URLs, data URIs, absolute paths, logo. 
SKIP_REF = re.compile(r'^(https?:|data:|/|#)|lcnc-docs\.svg', re.IGNORECASE)
# Matches src="..." and href="..." attributes.  The group names are part of
# the contract: every consumer reads m.group('attr') and m.group('val').
REF_RE = re.compile(r'(?P<attr>\b(?:src|href))="(?P<val>[^"]+)"', re.IGNORECASE)

DEFAULT_CANONICAL_LANG = 'en'
DEFAULT_IMAGE_DIR = 'image'

# Upper bound on the number of broken references listed in a failure report.
_MAX_REPORTED = 50


def log(msg):
    """Write one diagnostic line to stderr."""
    sys.stderr.write(msg + '\n')


def human(n):
    """Format byte count *n* with binary units.

    Values below 1024 are printed as whole bytes; larger values get one
    decimal place.  GiB is the largest unit, so bigger values stay in GiB.
    """
    for unit in ('B', 'KiB', 'MiB', 'GiB'):
        if n < 1024 or unit == 'GiB':
            return ('%.1f %s' % (n, unit)) if unit != 'B' else ('%d B' % n)
        n /= 1024.0


def sha256_file(path):
    """Return the hex SHA-256 digest of the file at *path*, read in 64 KiB chunks."""
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 16), b''):
            h.update(chunk)
    return h.hexdigest()


def _ref_path(val):
    """Return *val* without its ``#fragment`` / ``?query`` suffix."""
    return val.split('#', 1)[0].split('?', 1)[0]


def is_image_ref(val):
    """True if a src/href value is an in-tree image we should manage."""
    if SKIP_REF.search(val):
        return False
    return _ref_path(val).lower().endswith(IMAGE_EXT)


def discover_langs(html_root, image_dir):
    """Return the sorted names of all sub-directories of *html_root* except *image_dir*.

    NOTE(review): every directory is treated as a language tree, including
    non-language output directories if the build creates any -- confirm that
    is intended, or pass --langs explicitly.
    """
    return [name for name in sorted(os.listdir(html_root))
            if name != image_dir
            and os.path.isdir(os.path.join(html_root, name))]


def rel_posix(path, start):
    """Return *path* relative to *start*, using forward slashes."""
    return os.path.relpath(path, start).replace(os.sep, '/')


class Plan:
    """Computed relocation plan for the whole tree.

    Call build_inventory() and then classify() to populate it; the other
    methods only read the resulting tables.
    """

    def __init__(self, html_root, langs, image_dir, canonical_lang):
        self.html_root = html_root
        self.langs = langs
        self.image_dir = image_dir
        self.canonical_lang = canonical_lang
        self.inventory = {}       # inventory[lang][P] = (abspath, sha, size); P lang-relative posix
        self.generic_hash = {}    # generic_hash[P] = sha of the generic (en or majority) content
        self.canonical_path = {}  # canonical_path[sha] = posix path under html_root
        self.dest_abs = {}        # dest_abs[(lang, P)] = absolute destination after relocation
        self.total_bytes = 0
        self.unique_bytes = 0

    def _lang_root(self, lang):
        """Return the directory of language tree *lang*."""
        return os.path.join(self.html_root, lang)

    def _under_image_dir(self, p_posix):
        """True if the posix path is the image dir itself or lies below it."""
        # exclude the new layout (root image/ and <lang>/image/) so re-runs are no-ops
        return p_posix == self.image_dir or p_posix.startswith(self.image_dir + '/')

    def build_inventory(self):
        """Hash every image file in every language tree into self.inventory.

        Symlinks and files already below the image dir are skipped.
        """
        for lang in self.langs:
            root = self._lang_root(lang)
            entries = self.inventory[lang] = {}
            for dirpath, _dirnames, filenames in os.walk(root):
                for fn in filenames:
                    if not fn.lower().endswith(IMAGE_EXT):
                        continue
                    ap = os.path.join(dirpath, fn)
                    if os.path.islink(ap):
                        continue
                    P = rel_posix(ap, root)
                    if self._under_image_dir(P):
                        continue  # already-relocated image; leave it
                    sha = sha256_file(ap)
                    size = os.path.getsize(ap)
                    entries[P] = (ap, sha, size)

    def classify(self):
        """Decide the destination of every inventoried file.

        Fills generic_hash, canonical_path and dest_abs, and accumulates
        total_bytes / unique_bytes.  Requires self.inventory to hold an
        entry for every language in self.langs.
        """
        # All logical paths seen in any language.
        all_paths = set()
        for lang in self.langs:
            all_paths.update(self.inventory[lang])

        # Generic content for each path: the canonical lang's bytes, else majority.
        cl = self.canonical_lang
        for P in all_paths:
            if cl in self.inventory and P in self.inventory[cl]:
                self.generic_hash[P] = self.inventory[cl][P][1]
                continue
            counts = {}
            for lang in self.langs:
                e = self.inventory[lang].get(P)
                if e:
                    counts[e[1]] = counts.get(e[1], 0) + 1
            # majority; deterministic tie-break by hash
            self.generic_hash[P] = min(counts.items(),
                                       key=lambda kv: (-kv[1], kv[0]))[0]

        # One canonical path per generic content (keyed by hash, so identical
        # bytes anywhere collapse to one file).  Representative = shortest then
        # lexicographically first path holding that content.
        reps = {}  # sha -> (depth, P)
        for P in sorted(all_paths):
            g = self.generic_hash[P]
            has = any(self.inventory[l].get(P) and self.inventory[l][P][1] == g
                      for l in self.langs)
            if not has:
                continue
            key = (P.count('/'), P)
            cur = reps.get(g)
            if cur is None or key < cur:
                reps[g] = key
        for sha, (_, P) in reps.items():
            self.canonical_path[sha] = self.image_dir + '/' + P

        # Per-(lang, P) destination + byte accounting.
        seen_unique = set()  # keys whose size has already been counted
        for lang in self.langs:
            for P, (ap, sha, size) in self.inventory[lang].items():
                self.total_bytes += size
                if sha == self.generic_hash.get(P):
                    dest = os.path.join(self.html_root,
                                        *self.canonical_path[sha].split('/'))
                    key = ('G', sha)
                else:
                    # translated / language-specific image
                    dest = os.path.join(self._lang_root(lang), self.image_dir,
                                        *P.split('/'))
                    key = ('S', lang, sha, P)
                self.dest_abs[(lang, P)] = dest
                if key not in seen_unique:
                    seen_unique.add(key)
                    self.unique_bytes += size

    # ---- reference resolution -------------------------------------------
    def resolve_ref(self, page_abs, lang, val):
        """Map a src/href value on a page to its (P, dest_abs) or None."""
        page_dir = os.path.dirname(page_abs)
        target_old = os.path.normpath(os.path.join(page_dir, _ref_path(val)))
        P = rel_posix(target_old, self._lang_root(lang))
        if P.startswith('../') or self._under_image_dir(P):
            return None  # outside the language tree, or already relocated
        if (lang, P) not in self.dest_abs:
            return None
        return P, self.dest_abs[(lang, P)]

    def new_ref_value(self, page_abs, dest_abs, original_val):
        """Return the page-relative posix path to *dest_abs*, keeping the original suffix."""
        new = rel_posix(dest_abs, os.path.dirname(page_abs))
        # preserve any #fragment/?query suffix
        suffix = original_val[len(_ref_path(original_val)):]
        return new + suffix


def iter_html(html_root, langs):
    """Yield (lang, absolute path) for every .html/.htm file in each language tree."""
    for lang in langs:
        for dirpath, _, filenames in os.walk(os.path.join(html_root, lang)):
            for fn in filenames:
                if fn.lower().endswith(('.html', '.htm')):
                    yield lang, os.path.join(dirpath, fn)


def rewrite_html_file(plan, lang, page_abs, apply_changes):
    """Rewrite managed image references in one HTML page.

    Returns the number of references that change.  The file is written only
    when *apply_changes* is true and at least one reference changes.
    """
    # surrogateescape + newline='' round-trip undecodable bytes and the
    # original line endings unchanged, so only the references differ.
    with open(page_abs, 'r', encoding='utf-8', errors='surrogateescape',
              newline='') as f:
        text = f.read()
    changed = 0

    def repl(m):
        nonlocal changed
        val = m.group('val')
        if not is_image_ref(val):
            return m.group(0)
        r = plan.resolve_ref(page_abs, lang, val)
        if r is None:
            return m.group(0)
        _, dest_abs = r
        newval = plan.new_ref_value(page_abs, dest_abs, val)
        if newval == val:
            return m.group(0)
        changed += 1
        return '%s="%s"' % (m.group('attr'), newval)

    new_text = REF_RE.sub(repl, text)
    if apply_changes and changed:
        st = os.stat(page_abs)
        with open(page_abs, 'w', encoding='utf-8', errors='surrogateescape',
                  newline='') as f:
            f.write(new_text)
        # Preserve mtime so an incremental `make` does not re-run the build.
        # Nanosecond values avoid the rounding of float timestamps.
        os.utime(page_abs, ns=(st.st_atime_ns, st.st_mtime_ns))
    return changed


def relocate_files(plan):
    """Copy every image to its planned destination, then remove the originals.

    Afterwards, empty directories in the language trees are pruned, except
    directories named like the image dir.
    """
    import shutil

    # Preferred source per content hash: the canonical language first, then
    # the remaining languages in plan order.  The canonical language may be
    # absent from the scanned trees, hence .get().
    cl = plan.canonical_lang
    source_by_sha = {}
    for lang in [cl] + [l for l in plan.langs if l != cl]:
        for ap, sha, _ in plan.inventory.get(lang, {}).values():
            source_by_sha.setdefault(sha, ap)

    # 1) copy into the new layout, one copy per destination
    placed = set()
    for (lang, P), dest in plan.dest_abs.items():
        if dest in placed:
            continue
        ap, sha, _ = plan.inventory[lang][P]
        if sha == plan.generic_hash.get(P):
            src = source_by_sha.get(sha, ap)
        else:
            src = ap
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        if os.path.abspath(src) != os.path.abspath(dest):
            shutil.copy2(src, dest)
        placed.add(dest)

    # 2) delete the originals
    placed_abs = {os.path.abspath(d) for d in placed}
    for lang in plan.langs:
        for ap, _, _ in plan.inventory[lang].values():
            if os.path.abspath(ap) in placed_abs:
                continue
            try:
                os.remove(ap)
            except FileNotFoundError:
                pass

    # 3) prune empty directories (bottom-up), never the new image dirs
    for lang in plan.langs:
        for dirpath, _dirnames, _filenames in os.walk(plan._lang_root(lang),
                                                      topdown=False):
            if os.path.basename(dirpath) == plan.image_dir:
                continue
            try:
                if not os.listdir(dirpath):
                    os.rmdir(dirpath)
            except OSError:
                pass  # best effort: leave the directory in place


def verify(plan):
    """Re-resolve every image reference; return list of broken (page, val)."""
    broken = []
    for lang, page in iter_html(plan.html_root, plan.langs):
        with open(page, 'r', encoding='utf-8', errors='surrogateescape') as f:
            text = f.read()
        page_dir = os.path.dirname(page)
        for m in REF_RE.finditer(text):
            val = m.group('val')
            if not is_image_ref(val):
                continue
            target = os.path.normpath(os.path.join(page_dir, _ref_path(val)))
            if not os.path.isfile(target):
                broken.append((page, val))
    return broken


def _report_broken(header, broken, html_root):
    """Log *header* (formatted with the count) and up to _MAX_REPORTED entries."""
    log(header % len(broken))
    for page, val in broken[:_MAX_REPORTED]:
        log(' %s -> %s' % (rel_posix(page, html_root), val))
    hidden = len(broken) - _MAX_REPORTED
    if hidden > 0:
        log(' ... and %d more' % hidden)


def main():
    """Command-line entry point.

    Returns the process exit status: 0 on success, 1 when broken image
    references are found, 2 on a usage error.
    """
    ap = argparse.ArgumentParser(
        description='Deduplicate images in a built HTML docs tree.',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('--html-root', required=True,
                    help='built HTML tree root (e.g. docs/build/html)')
    ap.add_argument('--apply', action='store_true',
                    help='rewrite the tree in place (default: dry-run report)')
    ap.add_argument('--check', action='store_true',
                    help='only verify that every image reference resolves')
    ap.add_argument('--langs', default='',
                    help='comma-separated language dirs (default: autodetect)')
    ap.add_argument('--canonical-lang', default=DEFAULT_CANONICAL_LANG,
                    help='language whose images are the generic default (default: en)')
    ap.add_argument('--image-dir', default=DEFAULT_IMAGE_DIR,
                    help='name of the shared image directory (default: image)')
    ap.add_argument('-v', '--verbose', action='store_true')
    args = ap.parse_args()

    html_root = os.path.abspath(args.html_root)
    if not os.path.isdir(html_root):
        log('error: not a directory: %s' % html_root)
        return 2

    langs = ([l for l in args.langs.split(',') if l] if args.langs
             else discover_langs(html_root, args.image_dir))
    if not langs:
        log('error: no language directories found under %s' % html_root)
        return 2

    plan = Plan(html_root, langs, args.image_dir, args.canonical_lang)

    if args.check:
        broken = verify(plan)
        if broken:
            _report_broken('FAIL: %d broken image reference(s):', broken, html_root)
            return 1
        log('OK: all image references resolve')
        return 0

    log('Scanning %d language tree(s): %s' % (len(langs), ', '.join(langs)))
    plan.build_inventory()
    plan.classify()

    n_files = sum(len(plan.inventory[l]) for l in langs)
    n_generic = len(plan.canonical_path)
    root_image_prefix = os.path.join(html_root, args.image_dir) + os.sep
    image_marker = os.sep + args.image_dir + os.sep
    n_specific = sum(1 for dest in plan.dest_abs.values()
                     if image_marker in dest
                     and not dest.startswith(root_image_prefix))
    saved = plan.total_bytes - plan.unique_bytes

    log('')
    log('Image files (with duplication): %d, %s' % (n_files, human(plan.total_bytes)))
    log('Unique after dedup: %s' % human(plan.unique_bytes))
    log('Generic images (root image/): %d' % n_generic)
    log('Language-specific (translated): %d' % n_specific)
    log('Space reclaimed: %s (%.1f%%)'
        % (human(saved), 100.0 * saved / plan.total_bytes if plan.total_bytes else 0))

    # count reference rewrites
    total_refs = 0
    for lang, page in iter_html(html_root, langs):
        n_page = rewrite_html_file(plan, lang, page, apply_changes=args.apply)
        if args.verbose and n_page:
            log(' %s: %d reference(s)' % (rel_posix(page, html_root), n_page))
        total_refs += n_page

    if not args.apply:
        log('')
        log('Dry run: %d image reference(s) would be rewritten across the HTML.' % total_refs)
        log('Re-run with --apply to perform the changes.')
        return 0

    log('')
    log('Rewrote %d image reference(s).' % total_refs)
    relocate_files(plan)
    broken = verify(plan)
    if broken:
        _report_broken('ERROR: %d broken image reference(s) after apply:', broken, html_root)
        return 1
    log('Verification passed: all image references resolve.')
    return 0


if __name__ == '__main__':
    sys.exit(main())