From d2bc6cf6ad880efc8e295c28b9a9f2df0971bbc6 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 29 Jun 2026 16:29:00 +0200 Subject: [PATCH 01/13] PDF: separate transparent selection layer for text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PDF text was emitted as one absolutely-positioned span per show-text segment, in paint order with no whitespace between runs. Browser text selection, copy and find-in-page all suffered: cross-run phrases were unmatchable, a kerning-split word was several spans, and selection drag jumped between unrelated boxes. Split text into two layers in `html/pdf_file.cpp`: - Visual layer (paint order, `user-select:none`): uniformly PUA glyphs in the embedded font; fallback runs render Unicode in a system font. Invisible runs (Tr 3/7) emit no visual span at all. - Selection layer (content/reading order, transparent `.i`, selectable): one span per run carrying the real Unicode, anchored at the run origin, emitted after the visual content so the spans are contiguous and selection flows cleanly. A content-order grouping sweep inserts a separator space on a line/column break or a wide intra-line gap (guarded against double spaces, which break literal find-in-page). Anchoring each run at its known origin keeps the highlight aligned without runtime measurement, so the output stays fully static (no JS). Because all selectable text now lives in the selection layer, the visible layer no longer renders real Unicode, so the "collapse" machinery and real-Unicode `cmap` baking are removed and fonts are re-encoded PUA-only. Visual output is pixel-identical (PUA maps to the same glyphs); net −68 lines. De-hyphenation and gap-based intra-line word separators are tracked as future work in TEXT_SELECTION_PLAN.md. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NkoJuS4jaPGvUs1eVb8UbM --- src/odr/internal/html/pdf_file.cpp | 353 ++++++++------------ src/odr/internal/pdf/TEXT_SELECTION_PLAN.md | 206 ++++++++++++ 2 files changed, 350 insertions(+), 209 deletions(-) create mode 100644 src/odr/internal/pdf/TEXT_SELECTION_PLAN.md diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 07a250d1..1a60959d 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -20,8 +20,6 @@ #include #include -#include - #include #include #include @@ -561,20 +559,15 @@ class HtmlServiceImpl final : public HtmlService { throw FileNotFound("Unknown path: " + path); } - // One emitted span. The styling is fully resolved into class tokens during - // the first pass; only the (already escaped) text and class list survive to - // the writing pass. A text run with an embedded font emits the dual layer as - // a transparent selectable span carrying the real Unicode with the visible - // glyph layer (PUA code points in the `@font-face` font) nested inside it: - // the child is absolutely positioned at the run origin and inherits the - // font size, spacing, and transform from the parent, so the placement - // classes live only on the parent. `glyph_classes` is empty when there is no - // nested layer (the legacy fallback path and display-only runs). + // One emitted span: the resolved class tokens plus the already-escaped text. + // The renderer paints text in two independent layers (see `write_document`): + // the **visual** layer (`PageOut::items`, in paint order) carries the + // unselectable glyphs; the **selection** layer (`PageOut::sel_spans`, in + // content/reading order) carries the transparent, selectable real Unicode. + // Both layers are flat — a span is just classes + text. 
struct SpanOut { std::string classes; std::string text; - std::string glyph_classes; - std::string glyph_text; }; // One vector item, already serialized to an SVG fragment in the page's // viewBox (PDF points, y-down): a painted `` or an ``. @@ -582,14 +575,18 @@ class HtmlServiceImpl final : public HtmlService { struct PathOut { std::string svg; }; - // Page content in paint (z) order: text spans and paths interleave, so a - // later fill occludes earlier text and vice versa. + // Visual page content in paint (z) order: glyph spans and paths interleave, + // so a later fill occludes earlier text and vice versa. using PageItem = std::variant; struct PageOut { std::string classes; double width{0}; // page box width, PDF points (for the SVG viewBox) double height{0}; // page box height, PDF points std::vector items; + // The selection layer: transparent, selectable Unicode spans in + // content-stream (reading) order, emitted after the visual content so they + // form one contiguous, cleanly selectable run in the DOM. + std::vector sel_spans; // `` defs for this page's clipped paths, emitted once in a hidden // ``; the path fragments reference them by id. Empty when no path on // the page is clipped. @@ -621,22 +618,14 @@ class HtmlServiceImpl final : public HtmlService { // whose embedded font is absent, not an SFNT, or not re-encodable keeps // index 0 and renders through the fallback path, exactly as before. // - // The `@font-face` rules are *not* built here: a font also gets - // real-Unicode `cmap` entries for the scalars its 1:1 runs use (so those - // runs can collapse to a single span), and that used-scalar set is only - // complete after the first pass. `font_family` therefore just validates and - // indexes the font; `accepted_fonts` / `used_unicode` (indexed by `index - - // 1`) carry it to the post-pass that re-encodes with the extra entries and - // emits `font_faces`. 
+ // The `@font-face` rules are *not* built here: the font subset isn't needed + // until the post-pass, which re-encodes each accepted font to the PUA and + // emits `font_faces`. `font_family` therefore just validates and indexes + // the font; `accepted_fonts` (indexed by `index - 1`) carries it forward. std::uint32_t family_count = 0; std::string font_faces; - std::string glyph_styles; // combined per-font `.fvN`/`.fnN`/`.gvN`/`.giN` + std::string glyph_styles; // per-font visible-glyph class `.fvN` std::vector accepted_fonts; - std::vector> used_unicode; - // Which combined per-font classes occur, so only those are emitted in - // . Slots: [0]=`.fvN` (plain visible), [1]=`.fnN` (plain invisible), - // [2]=`.gvN` (nested-glyph visible), [3]=`.giN` (nested-glyph invisible). - std::vector> font_class_used; std::unordered_map family_index; const auto font_family = [&](pdf::Font *font) -> std::uint32_t { const auto [it, inserted] = family_index.try_emplace(font, 0); @@ -682,31 +671,14 @@ class HtmlServiceImpl final : public HtmlService { const std::uint32_t index = ++family_count; it->second = index; accepted_fonts.push_back(font); - used_unicode.emplace_back(); - font_class_used.push_back({false, false, false, false}); return index; }; - // The combined per-font class carrying `font-family:'odr-fN'` and the paint - // colour, so a font-bearing span names one class instead of restating the - // font family (interned) plus `.gv`/`.i` on every one of the millions of - // spans. A `nested` glyph layer additionally folds in the `.t .g` placement - // (absolute at the run origin, unselectable). Records the combo as used so - // only the rules that occur are emitted in . - const auto font_class = [&](const std::uint32_t font, const bool inv, - const bool nested) { - const int slot = nested ? (inv ? 3 : 2) : (inv ? 1 : 0); - font_class_used[font - 1][slot] = true; - const char *const prefix = - nested ? (inv ? "gi" : "gv") : (inv ? 
"fn" : "fv"); - return prefix + std::to_string(font); - }; - - // A real-Unicode scalar may carry a `cmap` entry (letting its run collapse) - // only inside the BMP and outside the PUA (`U+E000..U+F8FF`), so a glyph's - // own deterministic PUA code point (`pua_code_point`) is never shadowed. - const auto collapsible_unicode = [](const char32_t c) { - return c <= 0xFFFF && !(c >= 0xE000 && c <= 0xF8FF); + // The per-font visible-glyph class `.fvN`, carrying `font-family:'odr-fN'` + // and the black paint, so a glyph span names one class instead of restating + // the font family on every one of the (potentially millions of) spans. + const auto font_class = [](const std::uint32_t font) { + return "fv" + std::to_string(font); }; // The PUA glyph string for a run: each character code -> glyph id -> @@ -796,6 +768,15 @@ class HtmlServiceImpl final : public HtmlService { GradientRegistry gradients(static_cast(pages_out.size())); PatternRegistry patterns(static_cast(pages_out.size())); + // Selection-layer grouping sweep state, in content-stream (reading) + // order. Tracks the previous text run's baseline and right edge (page-box + // points, y down) so the next run can be prefixed with a separator space + // on a line/column break or a wide intra-line gap. + bool have_prev_run = false; + double prev_baseline = 0; + double prev_end = 0; + bool prev_ends_space = false; + for (const pdf::PageElement &element : pdf::extract_page(stream, *page->resources, *m_logger)) { // A painted path: serialize its subpaths to an SVG `` fragment in @@ -870,20 +851,21 @@ class HtmlServiceImpl final : public HtmlService { const util::math::Transform2D m = flip_glyph * text.transform * to_box; - // Tr 3 (invisible) and Tr 7 (clip-only) paint nothing; keep them - // selectable via the transparent `.i` class. 
+ // Tr 3 (invisible) and Tr 7 (clip-only) paint nothing; they emit no + // visual span at all and survive only in the selection layer (so + // OCR-over-scan text stays searchable/selectable). const bool invisible = text.rendering_mode == pdf::TextRenderingMode::invisible || text.rendering_mode == pdf::TextRenderingMode::clip; - // The run's visible paint colour, folded onto the visible span as an - // interned colour class — but only when it is not the default black, so - // the overwhelmingly common black run adds nothing. The per-font - // `.fvN`/`.gvN` classes declare `color:#000`; this class is emitted - // after them in (equal specificity), so it overrides. Invisible - // runs (Tr 3/7) stay transparent via `.i`, so they take no colour - // class. The fill modes paint with the non-stroking colour, the - // stroke-only modes (Tr 1/5) with the stroking colour. + // The run's visible paint colour, folded onto the visual glyph span as + // an interned colour class — but only when it is not the default black, + // so the overwhelmingly common black run adds nothing. The per-font + // `.fvN` class declares `color:#000`; this class is emitted after it in + // (equal specificity), so it overrides. The selection layer is + // transparent and takes no colour. The fill modes paint with the + // non-stroking colour, the stroke-only modes (Tr 1/5) with the stroking + // colour. std::string color_suffix; if (!invisible) { const pdf::GraphicsState::Color &paint = @@ -974,94 +956,70 @@ class HtmlServiceImpl final : public HtmlService { round2(text.word_spacing * scale * pt_to_px))); } - // A run collapses to a single span — selectable *and* visible, the real - // Unicode rendered directly in the embedded font — when it has an - // embedded font, carries text, is 1:1 with its codes (no /ToUnicode - // expansion, /ActualText, or inferred space), and every glyph wins a - // real-Unicode `cmap` entry. 
The winner of a scalar is the first - // collapse-candidate run (in document order) to use it; processing - // order *is* document order, so an earlier run's claim is already - // visible and no later run can unseat it — the decision is final here. - const bool collapse_candidate = - font != 0 && !text.text.empty() && text.font != nullptr && - util::string::utf8_length(text.text) == text.advances.size(); - - if (collapse_candidate) { - // Stake first-wins real-Unicode -> glyph claims and decide collapse - // in one walk: the run collapses iff each code's glyph wins (or - // matches) its scalar. Claims are staked for every collapsible scalar - // even when the run ends up dual, so later runs see them. The - // post-pass only bakes the won scalars into the shared font's `cmap`. - std::map &won = used_unicode[font - 1]; - bool collapse = true; - auto cp = text.text.begin(); - for (const std::uint32_t code : text.font->codes(text.codes)) { - const char32_t uchar = utf8::unchecked::next(cp); - const std::uint16_t glyph = text.font->glyph_for_code(code); - if (!collapsible_unicode(uchar)) { - collapse = false; - continue; - } - const auto [it, inserted] = won.emplace(uchar, glyph); - if (!inserted && it->second != glyph) { - collapse = false; + // --- Selection layer ------------------------------------------------- + // Any run with extractable text contributes one transparent, selectable + // span (`.i`) carrying the real Unicode, anchored at the run origin via + // the shared placement (`base`). The grouping sweep (content order) + // prefixes a separator space when this run opens a new line/column or + // sits past a wide intra-line gap, so search and copy get whitespace + // across run boundaries. The space is suppressed when either side + // already carries whitespace — a double space breaks literal + // find-in-page, and inter-word gaps are often already an inferred + // leading space on `text.text`. 
+ if (!text.text.empty()) { + // Run origin and horizontal extent in page-box points (y down). The + // advance (`text.width`) lives in the text matrix's space; its box + // extent scales by the matrix's x-axis length. + const double ox = m.e; + const double baseline = m.f; + const double axis = std::hypot(m.a, m.b); + const double extent = text.width * axis; + const double font_pt = text.size * axis; + const bool starts_space = text.text.front() == ' '; + + std::string sep; + if (have_prev_run && font_pt > 0) { + const bool new_line = + std::abs(baseline - prev_baseline) > 0.6 * font_pt || + ox < prev_end - 0.5 * font_pt; + const bool gap = ox - prev_end > 0.25 * font_pt; + if ((new_line || gap) && !prev_ends_space && !starts_space) { + sep = " "; } } - if (collapse) { - // One span: the real Unicode rendered in the embedded font, named - // by the combined per-font class (black visible / transparent - // invisible), selectable either way. - std::string classes = std::move(base); - classes += ' '; - classes += font_class(font, invisible, /*nested=*/false); + + page_out.sel_spans.push_back( + SpanOut{base + " i", escape_text(sep + text.text)}); + + prev_baseline = baseline; + prev_end = ox + extent; + prev_ends_space = text.text.back() == ' '; + have_prev_run = true; + } + + // --- Visual layer ---------------------------------------------------- + // Unselectable glyphs in paint order. Invisible runs (Tr 3/7) paint + // nothing, so they emit no visual span — they live only in the + // selection layer above. + if (!invisible) { + if (font != 0) { + // PUA code points in the embedded font, carrying the placement + // (`base`), `.g` (unselectable) and the per-font paint+family + // class. 
+ std::string classes = base + " g "; + classes += font_class(font); classes += color_suffix; page_out.items.push_back( - SpanOut{std::move(classes), escape_text(text.text), {}, {}}); + SpanOut{std::move(classes), + escape_text(glyph_run(*text.font, text.codes))}); } else { - // Dual layer (a glyph lost its scalar to an earlier one): a - // transparent selectable Unicode span with the PUA glyph layer - // nested inside, the latter folded into the combined `.gvN` / - // `.giN` class. The colour rides the visible (nested) layer. - page_out.items.push_back(SpanOut{ - base + " i", escape_text(text.text), - font_class(font, invisible, /*nested=*/true) + color_suffix, - escape_text(glyph_run(*text.font, text.codes))}); - } - } else if (font != 0) { - // The visible glyph layer: PUA code points in the embedded font, - // named by the combined per-font class (paint colour + font family). - std::string glyph_text = - escape_text(glyph_run(*text.font, text.codes)); - - if (!text.text.empty()) { - // Dual layer: a transparent selectable span carrying the real - // Unicode (for copy/search) with the glyph layer nested inside. - // The nested child overlays the run origin and inherits the - // placement via the combined `.gvN` / `.giN` class. - page_out.items.push_back(SpanOut{ - base + " i", escape_text(text.text), - font_class(font, invisible, /*nested=*/true) + color_suffix, - std::move(glyph_text)}); - } else { - // Display-only run: nothing is extractable (the `no_unicode` case), - // so the glyph layer stands alone and carries the placement itself - // (`base`), `.g` (unselectable) and the combined paint+font class. - std::string glyph_classes = base + " g "; - glyph_classes += font_class(font, invisible, /*nested=*/false); - glyph_classes += color_suffix; - page_out.items.push_back(SpanOut{ - std::move(glyph_classes), std::move(glyph_text), {}, {}}); - } - } else { - // Legacy single-layer path: no embedded font, render the Unicode in a - // fallback font. 
- std::string classes = base; - if (invisible) { - classes += " i"; + // No embedded font: render the Unicode in a fallback font, + // unselectable (the selection layer owns interaction). + std::string classes = base + " g"; + classes += color_suffix; + page_out.items.push_back( + SpanOut{std::move(classes), escape_text(text.text)}); } - classes += color_suffix; - page_out.items.push_back( - SpanOut{std::move(classes), escape_text(text.text), {}, {}}); } } @@ -1070,62 +1028,39 @@ class HtmlServiceImpl final : public HtmlService { page_out.clip_defs = clips.defs() + gradients.defs() + patterns.defs(); } - // Post-pass: every page has been scanned, so the per-font used-scalar sets - // are complete. - // - // Re-encode each accepted font with its real-Unicode entries baked into the - // `cmap` (the PUA range is kept as a fallback) and emit the `@font-face` - // rules in index order, so the output stays deterministic. + // Post-pass: re-encode each accepted font to the PUA and emit its + // `@font-face` rule plus the per-font visible-glyph class in index order, + // so the output stays deterministic. The visual glyph layer renders PUA + // code points only (selection rides the separate transparent layer), so no + // real-Unicode `cmap` entries are baked. 
for (std::uint32_t i = 0; i < family_count; ++i) { pdf::Font *font = accepted_fonts[i]; - const std::map &extra = used_unicode[i]; std::string reencoded; if (auto sfnt = std::dynamic_pointer_cast( font->embedded_font)) { - font::reencode_to_pua(*sfnt, extra); + font::reencode_to_pua(*sfnt); std::ostringstream sfnt_out; sfnt->write(sfnt_out); reencoded = std::move(sfnt_out).str(); } else if (auto cff = std::dynamic_pointer_cast( font->embedded_font)) { - reencoded = font::cff::wrap_to_otf(*cff, extra); + reencoded = font::cff::wrap_to_otf(*cff); } const std::string url = file_to_url(reencoded, "font/ttf"); - font_faces += "@font-face{font-family:'odr-f" + std::to_string(i + 1) + - "';src:url(" + url + ");}"; - - // The combined per-font classes for this font, only those used. `.fvN` / - // `.fnN` carry just the paint colour and font family (placement stays on - // the span's own classes); `.gvN` / `.giN` additionally fold in the - // nested glyph layer's `.t` placement and `.g` unselectability. 
const std::string n = std::to_string(i + 1); - const std::string family = "font-family:'odr-f" + n + "'"; - constexpr const char *placement = - "position:absolute;left:0;top:0;transform-origin:0 0;" - "white-space:pre;line-height:1;user-select:none;"; - const auto rule = [&](const char *cls, const char *head, - const char *color) { - glyph_styles += '.'; - glyph_styles += cls; - glyph_styles += n; - glyph_styles += '{'; - glyph_styles += head; - glyph_styles += color; - glyph_styles += family; - glyph_styles += '}'; - }; - if (font_class_used[i][0]) { - rule("fv", "", "color:#000;"); - } - if (font_class_used[i][1]) { - rule("fn", "", "color:transparent;"); - } - if (font_class_used[i][2]) { - rule("gv", placement, "color:#000;"); - } - if (font_class_used[i][3]) { - rule("gi", placement, "color:transparent;"); - } + font_faces += "@font-face{font-family:'odr-f"; + font_faces += n; + font_faces += "';src:url("; + font_faces += url; + font_faces += ");}"; + + // `.fvN` carries the font family and the black paint; placement (`.t`), + // unselectability (`.g`) and any non-black colour stay on the span. + glyph_styles += ".fv"; + glyph_styles += n; + glyph_styles += "{color:#000;font-family:'odr-f"; + glyph_styles += n; + glyph_styles += "'}"; } // Pass 2: write the document, now that the catalog is complete. @@ -1148,12 +1083,11 @@ class HtmlServiceImpl final : public HtmlService { out.out() << ".p{position:relative;margin:16px auto;background:#fff;" "box-shadow:0 1px 4px rgba(0,0,0,.5)}"; // `font-kerning:none` + `font-variant-ligatures:none` keep the browser from - // applying the embedded font's GPOS/GSUB tables. A collapsed run now emits - // real Unicode in that font, so without this a sequence like `fi`/`AV` - // could be re-shaped (ligature substitution, kerning) after this code - // already fixed the PDF glyph IDs and advances, shifting pixels and run - // widths for otherwise 1:1 text. The PUA glyph layer was immune; restore - // that here. 
+ // applying the embedded font's GPOS/GSUB tables: the PUA glyph layer + // carries exact PDF glyph IDs and advances, and ligature substitution / + // kerning would re-shape it, shifting pixels and run widths. Shared by both + // layers + // (`.t`). // `line-height:1` fixes the box top one em-ascent above the baseline so the // baseline shift applied to each run's `top`/matrix (see `ascent_em`) lands // the glyphs on the PDF text origin; the browser's default `normal` leading @@ -1161,12 +1095,11 @@ class HtmlServiceImpl final : public HtmlService { out.out() << ".t{position:absolute;left:0;top:0;transform-origin:0 0;" "white-space:pre;line-height:1;font-kerning:none;" "font-variant-ligatures:none}"; - // Invisible text render modes (Tr 3/7): kept in the DOM for selection and - // search (OCR-over-scan), but not painted. + // The selection layer: transparent (painted by the glyph layer underneath) + // but selectable and searchable, including OCR-over-scan invisible text. out.out() << ".i{color:transparent}"; - // The display-only glyph layer (`no_unicode` runs) is not selectable, so - // the PUA code points stay off the clipboard; `.g` pairs with a combined - // `.fvN`/`.fnN` paint+font class on those spans. + // The visual glyph layer is not selectable — selection rides the `.i` + // layer, so the (often PUA) visible code points stay off the clipboard. out.out() << ".g{user-select:none}"; // Vector graphics: one or more `` overlays per page, each filling the // page box (viewBox in PDF points). `overflow:hidden` clips each overlay to @@ -1183,29 +1116,24 @@ class HtmlServiceImpl final : public HtmlService { "overflow:hidden;pointer-events:none}"; // Embedded fonts, re-encoded to the PUA and served inline. out.out() << font_faces; - // Combined per-font classes (`.fvN`/`.fnN` paint+font, `.gvN`/`.giN` also - // placement), so a font-bearing span names one class for its font. 
+ // Per-font visible-glyph classes (`.fvN` paint+font family), so a glyph + // span names one class for its font. out.out() << glyph_styles; - // Per-value atomic classes (font sizes, offsets, transforms, ...). + // Per-value atomic classes (font sizes, offsets, transforms, ...). Shared + // by the visual glyph layer and the selection layer (both anchor at the run + // origin via these placement classes). styles.write_rules(out.out()); out.write_header_style_end(); out.write_header_end(); const auto write_span = [&out](const SpanOut &span) { - // Inline so the whole run (and its nested glyph layer) stays on one line: - // smaller output and a more legible diff than the open/text/close split, - // while each run still gets its own line under the page div. + // Inline so the run stays on one line: smaller output and a more legible + // diff than the open/text/close split, while each run still gets its own + // line under the page div. out.write_element_begin( "span", HtmlElementOptions().set_inline(true).set_class(span.classes)); out.write_raw(span.text); - if (!span.glyph_classes.empty()) { - out.write_element_begin("span", - HtmlElementOptions().set_inline(true).set_class( - span.glyph_classes)); - out.write_raw(span.glyph_text); - out.write_element_end("span"); - } out.write_element_end("span"); }; @@ -1251,6 +1179,13 @@ class HtmlServiceImpl final : public HtmlService { } } close_svg(); + // The selection layer: transparent, selectable Unicode in reading order, + // emitted last so the spans are contiguous in the DOM and a drag- or + // find-selection flows cleanly across runs and lines without the visual + // glyphs (which are `user-select:none`) interrupting it. 
+ for (const SpanOut &span : page.sel_spans) { + write_span(span); + } out.write_element_end("div"); } out.write_body_end(); diff --git a/src/odr/internal/pdf/TEXT_SELECTION_PLAN.md b/src/odr/internal/pdf/TEXT_SELECTION_PLAN.md new file mode 100644 index 00000000..17a41c86 --- /dev/null +++ b/src/odr/internal/pdf/TEXT_SELECTION_PLAN.md @@ -0,0 +1,206 @@ +# PDF text selection & search — plan + +## Context + +The PDF → HTML imaging output is now quite complete (stage 4), but **marking +text and searching is poor**. The cause is structural: every show-text segment +(`Tj`, or one string of a `TJ` array) becomes one `pdf::TextElement` +(`pdf_page_element.hpp`), and each becomes one **absolutely-positioned ``** +placed at its run origin (`html/pdf_file.cpp` ~`l.902–941`). Runs that share a +visual line are therefore independent boxes at arbitrary coordinates with no +whitespace or reading order between them, so: + +- find-in-page can't match a phrase that crosses a run boundary, and a single + word split by kerning (several `TJ` adjustments) is several spans, so even + one-word search misses; +- dragging a selection jumps between unrelated boxes — there is no line flow for + the browser to follow; +- copy order follows content-stream (paint) order, not always reading order. + +For text with an embedded font we already emit a **dual layer** — a transparent, +selectable Unicode parent span with a nested visible PUA-glyph span +(`html/pdf_file.cpp` ~`l.1020–1054`). The selectable text already exists; it is +just **per-run and absolutely positioned**, which is exactly what selection and +search need it not to be. + +Intended outcome: native, JS-free selection and find-in-page that flow along +lines (and across wrapped lines within a block), without regressing the +pixel-perfect visual rendering. + +## Approach (decided) + +Keep the **visual glyph layer exactly as it is** — absolutely-positioned PUA +spans are what make the rendering good; do not touch them. 
Restructure **only +the transparent Unicode layer** into per-line (and, where confident, per-block) +containers in reading order, with real whitespace between runs. This is +PDF.js-style layering done **statically at generation time**. + +Fixed decisions from discussion: + +1. **Separate text layer**, not a unified reflow. The selectable Unicode becomes + its own line/block-grouped layer; the visual layer stays pixel-perfect and + independent. The selection layer **replaces** today's per-run transparent + spans rather than stacking on top of them, so node count stays roughly flat. +2. **Static HTML/CSS only — no JavaScript.** Because we translate ahead of time, + all layout analysis happens in the C++ pass and is baked into static DOM; + native browser selection and Ctrl+F then work with zero runtime. This is a + hard constraint (JS-free output is a core virtue of the current export). +3. **Eager to split, conservative to merge.** Cluster runs into lines by baseline + (y + orientation), order by x. Merge adjacent lines into a block only when + clearly the same column — overlapping x-range, consistent leading + (baseline-to-baseline ≈ font size), same writing direction, same-ish size. + When any signal is shaky, fall back to separate line containers. The fallback + is lossless for within-line UX, so multi-line grouping never hurts the + single-line case. This also makes **tables safe without table detection**: + cells never merge across columns, so they fall out as separate line + containers (correct selection) — we do not need to recognize the table. +4. **Multi-line is the target**, degrading gracefully to line-only. Intra-block + line breaks get a single space separator so search matches across the wrap + (`"the\nquick"` → findable `"the quick"`). + +## Mechanism + +### Grouping (generation time, C++) + +A linear sweep over the page's `TextElement`s **in content-stream order** — +crucially *not* a global re-sort. 
Content-stream order is almost always reading +order already (a producer draws a column top-to-bottom, then the next column), so +trusting it is what keeps multi-column text and tables from scrambling. A global +sort by (baseline, x) would interleave columns sharing a y-band — exactly the +failure we must avoid. + +The sweep tracks the previous run's baseline and right edge, and for each run +decides the **separator** to insert before it: + +1. New line when the baseline jumps (more than ~0.6·font-size) or the run starts + left of where the previous ended (a column/line break in producer order). +2. Same-line gap when the horizontal gap to the previous run exceeds a small + fraction (~0.25·font-size). + +Either case inserts a single space (so search matches across the break), +*unless* the adjacent text already carries whitespace — many gaps are already +represented as an inferred leading space on the segment (`TextElement::text`), so +guard against double spaces (which would break literal find-in-page). + +Cost is O(n) per page — negligible next to the embedded-font re-encode that +already dominates the pass. No new font work; no sort. + +### Within-line gaps → whitespace (the core tension) + +Between consecutive runs on a line there is a horizontal gap (inter-word space, +kerning, tab to a column). The selection layer must reconcile two conflicting +needs: + +- **Searchable whitespace** — a real space character makes `"the quick"` + findable and copy readable, but a literal space has the font's space-width, not + the exact PDF gap, so the transparent text **drifts** from the glyphs along the + line. +- **Positional accuracy** — an exact-width spacer (inline-block / letter-spacing) + stays aligned but carries no whitespace, so words run together (`"thequick"`) + and search breaks. 
+ +No single trick gives both, and the obvious width-fix is **not available to us**: + +- **`transform: scaleX(...)`** (the PDF.js technique) needs the run's *rendered* + width to compute the scale, which PDF.js measures **at runtime in JS**. We emit + no JS, so statically the factor is either uncomputable (system-font selection + layer) or just `1` (embedded font — the advances already match). So `scaleX` + buys us nothing and is dropped. +- **Chosen approach: anchor every run at its own known origin.** Each selection + span is absolutely positioned at the run's origin (the placement we already + compute for the glyph layer), so drift can only accumulate *within* one short + run, never across the line. Real spaces between runs (see grouping) give + searchable whitespace; the per-run anchoring keeps the highlight close without + any width-fix. Fully static, no JS. +- **Dead end — do not pursue: `position: relative; left:`** to offset an inserted + space. Relative positioning shifts the box visually but leaves its space + reserved in the flow, so siblings don't move; it cannot reclaim the gap. (A + negative `margin-left` *would* reclaim it, but with per-run origin anchoring we + don't need to.) + +### Highlight alignment quality bar + +Selection highlights the *selection layer's* boxes; if those are offset from the +visible glyphs the highlight looks shifted (text itself is never wrong — the +glyph layer is a separate, perfect layer, so misalignment shows up only as a +slightly-off highlight rectangle during an active drag). Per-run origin anchoring (above) +keeps this in the acceptable band: each run's highlight starts exactly on its +glyphs and can only drift within that one short run. We ship that and revisit +only if the residual within-run drift is noticeable in practice. 
+ +## De-hyphenation (tracked, deferred) + +A line-final hyphen (`"infor-\nmation"`) is unfindable as `"information"` whether +joined with a space (`"infor- mation"`) or nothing (`"infor-mation"`); only +dropping the hyphen + break fixes search. But it is genuinely ambiguous — a +line-final hyphen may be a soft break hyphen (`infor-mation`) or a real one +(`well-\nknown` must stay `well-known`), and PDF almost never marks the +difference (most producers emit plain `U+002D` for both; the rare `U+00AD` soft +hyphen is the only unambiguous signal). + +Decision: **do not auto-de-hyphenate in v1** — lossy and wrong often enough to be +a net negative for copy fidelity. Join intra-block lines with a space, accept +that hyphenated-across-line words miss in search. Revisit as an opt-in heuristic: +collapse only when the trailing char is `U+00AD`, or behind a config flag. + +## Implementation sketch + +All changes are in the HTML layer; the IR (`pdf::TextElement`) already carries +what we need (`transform`, `size`, `advances`, `text`). + +- **`src/odr/internal/html/pdf_file.cpp`** — `HtmlServiceImpl::write_document`: + - **Visual layer (paint order, non-selectable):** every embedded-font run emits + its PUA glyph span (the existing display-only form, `base + " g " + fvN`); + fallback runs emit their Unicode in the system font, also `.g`. All visual + text is `user-select:none`. Invisible runs (Tr 3/7) paint nothing, so they + emit **no** visual span at all. + - **Selection layer (content order, transparent, selectable):** any run with + extractable text contributes one span carrying the real Unicode, anchored at + the run origin (reuse `base`), transparent via `.i`, with the leading + separator from the grouping sweep. Emitted per page after the paint content + so the selectable spans are contiguous in the DOM. 
+ - **Fold out the "collapse" path.** Because all common-case selectable text now + lives in the selection layer, the visible layer no longer needs to render + real Unicode in the embedded font. Remove the collapse machinery + (`collapsible_unicode`, `used_unicode`, the per-run first-wins scalar walk) + and the real-Unicode `cmap` baking in the post-pass — the font is re-encoded + **PUA-only** (`reencode_to_pua(*sfnt)` / `wrap_to_otf(*cff)` with no extras). + Visual output stays pixel-identical (PUA maps to the same glyphs); the DOM + and font subset shrink in complexity. + - Separator classes are not needed (spaces ride inside the span text); existing + placement classes are reused via the `AtomicStyles` interner. + +## Size / performance notes + +- Generation: +O(n) grouping sweep per page (content order, no sort); negligible. +- HTML size: the Unicode bytes are **relocated**, not duplicated; node count + stays roughly flat (one selection span per run vs. a transparent parent per run). + No `scaleX` classes are emitted (that width-fix is dropped, see above); placement + classes stay bounded by distinct values via `AtomicStyles`. Real-space + separators are ~free; prefer them over inline-block spacers. The docs that + approached GitHub's 100 MB reference-output ceiling are dominated by the glyph + layer + embedded fonts, both unchanged. (The 100 MB ceiling is a soft + reference-output constraint, not a product limit — keep the layer lean but do + not let it block the design.) + +## Verification + +- Build/test in `cmake-build-relwithdebinfo`; run the PDF HTML output tests and + regenerate reference output for the `test/data/.../output/pdf` fixtures, eyeing + the diff for size and structure. +- Manual: open representative outputs (e.g. `geneve_1564.pdf`, + `978-3-030-65771-0.pdf`, a multi-column doc) in a browser and check: + - Ctrl+F finds phrases that span run boundaries and wrapped lines. + - Click-drag selection flows along a line and across lines in a block. + - Copy yields readable, correctly-ordered text.
+ - Multi-column / table-like pages do **not** scramble across columns on copy. + - Visual rendering is pixel-identical to before (PUA maps to the same + glyphs) — confirm via the perceptual-diff oracle. + - Output stays JS-free. + +## Future work + +- De-hyphenation heuristic (opt-in / `U+00AD`-only). +- Gap-based word separators within a line (beyond the producer's inferred + spaces), if word-merging shows up in practice. +- Richer static structure recovery (semantic block markup such as `<p>` / multi-column markup) — + a separate, larger layout-analysis effort, out of scope here. From 446eb1ea14e7acabafae9e13d4a4ea662e15bd85 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 29 Jun 2026 20:11:41 +0200 Subject: [PATCH 02/13] PDF selection layer: apply horizontal scaling (Tz) once in gap detection The selection-layer extent used `axis = hypot(m.a, m.b)`, the placement transform's x-axis length, which already folds in horizontal scaling (Tz) via the `params` factor. But `text.width` was also advanced with Tz in `segment_advances`, so `extent = text.width * axis` applied Tz twice. For condensed text (Tz < 100) this underestimated the run end and could inject separator spaces inside continuous words; for expanded text it could suppress real gaps. Divide the matrix x-axis by the Tz factor so `axis` is the bare text-matrix -> box scale: `extent` carries Tz exactly once, and `font_pt` tracks the Tz-free em the gap thresholds compare against.
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NkoJuS4jaPGvUs1eVb8UbM --- src/odr/internal/html/pdf_file.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 1a60959d..0ce7d2c2 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -969,10 +969,15 @@ class HtmlServiceImpl final : public HtmlService { if (!text.text.empty()) { // Run origin and horizontal extent in page-box points (y down). The // advance (`text.width`) lives in the text matrix's space; its box - // extent scales by the matrix's x-axis length. + // extent scales by the text-matrix -> box x-axis length. The + // placement transform's x-axis (`m.a`, `m.b`) additionally folds in + // horizontal scaling (Tz), but `text.width` already advanced with Tz + // in `segment_advances`; divide it back out so Tz is applied once + // (and so `font_pt` tracks the Tz-free em). const double ox = m.e; const double baseline = m.f; - const double axis = std::hypot(m.a, m.b); + const double tz = text.horizontal_scaling / 100.0; + const double axis = tz != 0 ? std::hypot(m.a, m.b) / tz : 0; const double extent = text.width * axis; const double font_pt = text.size * axis; const bool starts_space = text.text.front() == ' '; From 658b811c621de551d5baa3aa1681432216513a21 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 29 Jun 2026 20:45:18 +0200 Subject: [PATCH 03/13] PDF selection layer: group layers, aria-hide visual, coalesce words MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wrap each page's two text layers in their own parents: a `vis` parent holding the graphics and unselectable glyph spans, and a `sel` parent holding the transparent selectable Unicode. 
The `vis` parent carries `aria-hidden="true"` so a screen reader reads only the real text in the selection layer instead of the PUA glyph code points. Both wrappers are unpositioned and zero-height (children are absolutely positioned), so the spans still anchor to the `.p` page box and stacking is unchanged. In the selection layer, merge a run into the previous span when it is a tight same-baseline continuation with no whitespace at the boundary — the case where PDF splits a single word into several runs at a TJ kerning adjustment. The whole word then lives in one text node, so double-click selects it as a unit rather than stopping at the run boundary. A boundary that already carries a space stays a separate span, so word breaks remain word breaks and double-click still selects a single word. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NkoJuS4jaPGvUs1eVb8UbM --- src/odr/internal/html/pdf_file.cpp | 81 +++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 0ce7d2c2..a4efdb9c 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -957,15 +957,31 @@ class HtmlServiceImpl final : public HtmlService { } // --- Selection layer ------------------------------------------------- - // Any run with extractable text contributes one transparent, selectable - // span (`.i`) carrying the real Unicode, anchored at the run origin via - // the shared placement (`base`). The grouping sweep (content order) - // prefixes a separator space when this run opens a new line/column or - // sits past a wide intra-line gap, so search and copy get whitespace - // across run boundaries. The space is suppressed when either side - // already carries whitespace — a double space breaks literal - // find-in-page, and inter-word gaps are often already an inferred - // leading space on `text.text`. 
+ // Every run with extractable text feeds the transparent, selectable + // layer (`.i`) with its real Unicode, anchored at the run origin via + // the shared placement (`base`). A content-order sweep decides, per + // run, whether it starts a new span or extends the previous one: + // + // * A line/column break or a wide intra-line gap starts a new span, + // prefixed with a separator space so search and copy get whitespace + // across the boundary. The space is suppressed when either side + // already carries whitespace — a double space breaks literal + // find-in-page, and inter-word gaps often already left an inferred + // leading space on `text.text`. + // * A tight same-baseline continuation with no whitespace at the + // boundary merges into the previous span. PDF splits one word into + // several runs at every TJ kerning adjustment, and the browser finds + // word boundaries within a single text node only, so a word spread + // over separate spans can't be grown by a double-click. Folding the + // continuation into the previous text node keeps the whole word + // selectable as a unit. A boundary that already carries a space is a + // word break, not an intra-word split, so it stays a separate span — + // gluing the words into one node (over a non-breaking separator) + // would instead make a double-click grab the whole phrase. The + // merged run's own origin is dropped — its glyphs flow from where + // the previous run ended — but the runs are tightly packed by + // construction and the layer is transparent, so the sub-glyph drift + // is invisible. if (!text.text.empty()) { // Run origin and horizontal extent in page-box points (y down). 
The // advance (`text.width`) lives in the text matrix's space; its box @@ -982,19 +998,29 @@ class HtmlServiceImpl final : public HtmlService { const double font_pt = text.size * axis; const bool starts_space = text.text.front() == ' '; + bool merge = false; std::string sep; if (have_prev_run && font_pt > 0) { const bool new_line = std::abs(baseline - prev_baseline) > 0.6 * font_pt || ox < prev_end - 0.5 * font_pt; const bool gap = ox - prev_end > 0.25 * font_pt; - if ((new_line || gap) && !prev_ends_space && !starts_space) { - sep = " "; + const bool boundary_space = prev_ends_space || starts_space; + if (new_line || gap) { + if (!boundary_space) { + sep = " "; + } + } else if (!boundary_space) { + merge = true; } } - page_out.sel_spans.push_back( - SpanOut{base + " i", escape_text(sep + text.text)}); + if (merge && !page_out.sel_spans.empty()) { + page_out.sel_spans.back().text += escape_text(text.text); + } else { + page_out.sel_spans.push_back( + SpanOut{base + " i", escape_text(sep + text.text)}); + } prev_baseline = baseline; prev_end = ox + extent; @@ -1146,8 +1172,19 @@ class HtmlServiceImpl final : public HtmlService { for (const PageOut &page : pages_out) { out.write_element_begin("div", HtmlElementOptions().set_class(page.classes)); - // Clip-path and gradient defs for this page, in a hidden zero-size - // ``. They are referenced by id from the page's fragments; + + // Visual layer: the page's graphics and unselectable glyphs, grouped in + // one parent and hidden from the accessibility tree (`aria-hidden`) — the + // glyphs are often PUA code points a screen reader would read as + // gibberish, and the real text is carried by the selection layer below. + // The wrapper is unpositioned and contributes no height (its children are + // `position:absolute`), so it stays layout-neutral and the spans still + // anchor to the `.p` page box. 
+ out.write_element_begin("div", + HtmlElementOptions().set_class("vis").set_extra( + R"(aria-hidden="true")")); + // Clip-path, gradient and pattern defs for this page, in a hidden + // zero-size `<svg>`. They are referenced by id from the page's fragments; // `clipPathUnits`/`gradientUnits` are `userSpaceOnUse`, so the geometry // is read in the user space of the referencing element (the page // viewBox), not this `<svg>`. @@ -1184,14 +1221,20 @@ class HtmlServiceImpl final : public HtmlService { } } close_svg(); - // The selection layer: transparent, selectable Unicode in reading order, - // emitted last so the spans are contiguous in the DOM and a drag- or - // find-selection flows cleanly across runs and lines without the visual - // glyphs (which are `user-select:none`) interrupting it. + out.write_element_end("div"); + + // Selection layer: transparent, selectable Unicode in reading order, + // grouped in its own parent and emitted after the visual layer so the + // spans are contiguous in the DOM and a drag- or find-selection flows + // cleanly across runs and lines without the visual glyphs (which are + // `user-select:none`) interrupting it. + out.write_element_begin("div", HtmlElementOptions().set_class("sel")); for (const SpanOut &span : page.sel_spans) { write_span(span); } out.write_element_end("div"); + + out.write_element_end("div"); } out.write_body_end(); out.write_end(); From 29ec071f9ca7e89d66efee87da0fc488fca8cedb Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 29 Jun 2026 22:17:54 +0200 Subject: [PATCH 04/13] PDF selection layer: markup-only escaping, keep spaces as U+0020 Use a dedicated escape_selection_text for the transparent selection spans instead of html::escape_text.
The general helper rewrites leading, trailing and doubled spaces to &nbsp; and tabs to &emsp;, which is wrong for this layer: the spans carry white-space:pre so every space already renders, and a non-breaking space neither matches a normal space in find-in-page nor lets double-click break between words. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NkoJuS4jaPGvUs1eVb8UbM --- src/odr/internal/html/pdf_file.cpp | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index a4efdb9c..742c2d4a 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -39,6 +39,20 @@ namespace { /// the extra digits add up across a page full of path data. double round2(const double v) { return std::round(v * 100.0) / 100.0; } +/// Escape only HTML markup (`&`, `<`, `>`) for the selection layer. Unlike +/// `html::escape_text`, spaces are left as ordinary U+0020 rather than +/// rewritten to `&nbsp;`: the selection spans carry `white-space:pre`, so every +/// space already renders, and a non-breaking space would defeat the layer's +/// purpose — it doesn't match a normal space in find-in-page and it glues +/// adjacent words together under double-click. Tabs aren't expected in +/// extracted PDF text. +std::string escape_selection_text(std::string text) { + util::string::replace_all(text, "&", "&amp;"); + util::string::replace_all(text, "<", "&lt;"); + util::string::replace_all(text, ">", "&gt;"); + return text; +} + /// Serialize a transform as an SVG `matrix(...)`.
Only the translation (e, f) /// is rounded — it lives in page-box units where 1/100 px is plenty; the linear /// part (a..d) keeps full precision so small scale/skew factors aren't @@ -1016,10 +1030,10 @@ class HtmlServiceImpl final : public HtmlService { } if (merge && !page_out.sel_spans.empty()) { - page_out.sel_spans.back().text += escape_text(text.text); + page_out.sel_spans.back().text += escape_selection_text(text.text); } else { page_out.sel_spans.push_back( - SpanOut{base + " i", escape_text(sep + text.text)}); + SpanOut{base + " i", escape_selection_text(sep + text.text)}); } prev_baseline = baseline; From cc147e2d96419a7ce18eed05b65288a6287d0458 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 29 Jun 2026 22:49:34 +0200 Subject: [PATCH 05/13] PDF selection layer: fit transparent spans to true glyph width (scaleX) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The selection spans render real Unicode in the browser's system font, whose advances differ from the embedded glyphs, so an active highlight was noticeably wider or narrower than the visible run. Each selection span now carries its true advance in `data-w` (CSS px), and a small on-load script corrects the box with a horizontal `scaleX` = target / measured about the run's left origin. The script runs per page, lazily, via IntersectionObserver: a large document only pays for the pages actually scrolled into view rather than one whole-document pass on load. Within a page it reads every width first and writes every transform second to avoid a per-span reflow. Upright runs only — a run carrying a rotation/skew matrix is left untouched, since its on-screen box is a rotated bounding box, not the local advance. The page stays fully usable without JS; this only refines the highlight rectangle, so it degrades gracefully where scripts are blocked. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NkoJuS4jaPGvUs1eVb8UbM --- src/odr/internal/html/pdf_file.cpp | 61 +++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 742c2d4a..8e072329 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -582,6 +582,11 @@ class HtmlServiceImpl final : public HtmlService { struct SpanOut { std::string classes; std::string text; + // Selection layer only: the run's true advance in CSS px, emitted as + // `data-w` so the on-load fit script can `scaleX` the transparent span to + // the real glyph width (the system fallback font it renders in has its own, + // different advances). 0 on visual spans — no attribute is written. + double width{0}; }; // One vector item, already serialized to an SVG fragment in the page's // viewBox (PDF points, y-down): a painted `` or an ``. @@ -1029,11 +1034,15 @@ class HtmlServiceImpl final : public HtmlService { } } + const double width_px = extent * pt_to_px; if (merge && !page_out.sel_spans.empty()) { page_out.sel_spans.back().text += escape_selection_text(text.text); + // The merged run flows on from the previous one, so its advance + // extends the same box: accumulate the fit target. + page_out.sel_spans.back().width += width_px; } else { - page_out.sel_spans.push_back( - SpanOut{base + " i", escape_selection_text(sep + text.text)}); + page_out.sel_spans.push_back(SpanOut{ + base + " i", escape_selection_text(sep + text.text), width_px}); } prev_baseline = baseline; @@ -1175,9 +1184,17 @@ class HtmlServiceImpl final : public HtmlService { // Inline so the run stays on one line: smaller output and a more legible // diff than the open/text/close split, while each run still gets its own // line under the page div. 
- out.write_element_begin( - "span", - HtmlElementOptions().set_inline(true).set_class(span.classes)); + HtmlElementOptions options; + options.set_inline(true).set_class(span.classes); + // Selection spans carry their true advance (px) for the fit script. + std::string data_w; + if (span.width > 0) { + std::ostringstream w; + w << "data-w=\"" << round2(span.width) << "\""; + data_w = std::move(w).str(); + options.set_extra(data_w); + } + out.write_element_begin("span", options); out.write_raw(span.text); out.write_element_end("span"); }; @@ -1250,6 +1267,40 @@ class HtmlServiceImpl final : public HtmlService { out.write_element_end("div"); } + + // Selection-fit script. The transparent selection spans render real Unicode + // in the browser's system font, whose advances differ from the embedded + // glyphs, so an active highlight is wider or narrower than the visible run. + // Each span carries its true advance in `data-w` (CSS px); correct the box + // with a horizontal `scaleX` = target / measured about the run's left + // origin (`.t` has `transform-origin:0 0`). The page is fully usable + // without this — it only tightens the highlight rectangle, so it degrades + // gracefully where scripts are blocked. + // + // Run per page, lazily, via `IntersectionObserver`: a large document only + // pays for the pages actually scrolled into view, never a single + // whole-document pass on load. Within a page, read every width first and + // write every transform second so the measurement loop isn't interleaved + // with style writes (which would force a reflow per span). Upright runs + // only: a run that already carries a rotation/skew matrix is skipped — its + // on-screen box is a rotated bounding box, not the local advance, so a + // single `scaleX` can't correct it (these keep today's behaviour). 
+ out.write_script_begin(); + out.write_raw( + R"JS((function(){if(!window.IntersectionObserver)return;)JS" + R"JS(var io=new IntersectionObserver(function(es){es.forEach(function(e){)JS" + R"JS(if(!e.isIntersecting)return;io.unobserve(e.target);)JS" + R"JS(var s=e.target.querySelectorAll('.sel span[data-w]'),n=s.length,)JS" + R"JS(w=new Array(n),f=new Array(n),i,k;)JS" + R"JS(for(i=0;i0){)JS" + R"JS(k=parseFloat(s[i].getAttribute('data-w'))/w[i];)JS" + R"JS(s[i].style.transform='scaleX('+k+')';}}})},{rootMargin:'200px'});)JS" + R"JS(document.querySelectorAll('.p').forEach(function(p){io.observe(p);});})();)JS", + false); + out.write_script_end(); + out.write_body_end(); out.write_end(); From e25c65040436ba83dbdfa1d1ae7f1dcb0aee09bf Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 29 Jun 2026 23:17:16 +0200 Subject: [PATCH 06/13] PDF selection layer: trail inter-word space on previous span The inferred inter-word separator (and the sweep's own gap separator) was prepended to the new selection span, so with white-space:pre it rendered a space before the first glyph at the run origin. A double-click excludes surrounding whitespace, selecting the word but leaving that leading-space cell, so the highlight started a space-width left of the text. Hang the separator off the trailing end of the previous span instead, peeling any leading space off the new run's text. Every span now starts at its first glyph; the separator is deduped so copy/find-in-page still get exactly one space across the boundary. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01WzLBJNxSU8rLosZgoEBiyM --- src/odr/internal/html/pdf_file.cpp | 47 ++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 8e072329..fa4f47c2 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -981,12 +981,20 @@ class HtmlServiceImpl final : public HtmlService { // the shared placement (`base`). A content-order sweep decides, per // run, whether it starts a new span or extends the previous one: // - // * A line/column break or a wide intra-line gap starts a new span, - // prefixed with a separator space so search and copy get whitespace - // across the boundary. The space is suppressed when either side - // already carries whitespace — a double space breaks literal - // find-in-page, and inter-word gaps often already left an inferred - // leading space on `text.text`. + // * A line/column break or a wide intra-line gap starts a new span. + // The + // separating space goes on the *trailing* end of the previous span, + // never the leading end of the new one: an inter-word gap routinely + // leaves an inferred leading space on `text.text`, and a span that + // renders `white-space:pre` would then show that space before its + // first glyph at the run origin — so a double-click, which excludes + // surrounding whitespace, selects the word but leaves the leading + // space cell, making the highlight start a space-width to the left + // of the text. Hanging the space off the previous word instead keeps + // every span starting at its first glyph. The separator is deduped + // against a space already ending the previous span (a doubled space + // breaks literal find-in-page), so search and copy still get exactly + // one space across the boundary. 
// * A tight same-baseline continuation with no whitespace at the // boundary merges into the previous span. PDF splits one word into // several runs at every TJ kerning adjustment, and the browser finds @@ -1018,7 +1026,7 @@ class HtmlServiceImpl final : public HtmlService { const bool starts_space = text.text.front() == ' '; bool merge = false; - std::string sep; + bool word_break = false; if (have_prev_run && font_pt > 0) { const bool new_line = std::abs(baseline - prev_baseline) > 0.6 * font_pt || @@ -1026,9 +1034,7 @@ class HtmlServiceImpl final : public HtmlService { const bool gap = ox - prev_end > 0.25 * font_pt; const bool boundary_space = prev_ends_space || starts_space; if (new_line || gap) { - if (!boundary_space) { - sep = " "; - } + word_break = true; } else if (!boundary_space) { merge = true; } @@ -1041,8 +1047,25 @@ class HtmlServiceImpl final : public HtmlService { // extends the same box: accumulate the fit target. page_out.sel_spans.back().width += width_px; } else { - page_out.sel_spans.push_back(SpanOut{ - base + " i", escape_selection_text(sep + text.text), width_px}); + // Hang the separator off the previous span. Needed whenever the + // boundary should carry whitespace — a detected word break, or a + // leading space we just peeled off this run — and deduped so the + // previous span ends with exactly one space. + if ((word_break || starts_space) && !page_out.sel_spans.empty()) { + std::string &prev = page_out.sel_spans.back().text; + if (prev.empty() || prev.back() != ' ') { + prev += ' '; + } + } + // The selection text with the inter-word space, if any, peeled off + // the front (it now trails the previous span). A run that was + // nothing but the separator emits no span of its own. + std::string core = starts_space ? 
text.text.substr(1) : text.text; + if (!core.empty()) { + page_out.sel_spans.push_back( + SpanOut{base + " i", escape_selection_text(std::move(core)), + width_px}); + } } prev_baseline = baseline; From a365a67e12c3ab2030f7edf93236d55e4e718d31 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Mon, 29 Jun 2026 23:53:01 +0200 Subject: [PATCH 07/13] PDF selection layer: separate zero-width span for inter-word separators Emit the inter-word/line-break separator as its own selection span with no fit width (data-w) instead of folding it into the previous glyph span's text. A trailing separator space has no visible glyph to map onto, so the on-load scaleX fit could not both land the word and collapse the space, squeezing the word. The fit script skips spans without data-w, so glyph spans now scale cleanly. The separator reuses the previous run's placement and is deduped against a space already ending the previous run. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_013sNCgZ9CyD6jRkd4tGNxRF --- src/odr/internal/html/pdf_file.cpp | 55 +++++++++++++++++------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index fa4f47c2..cb182a80 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -982,19 +982,25 @@ class HtmlServiceImpl final : public HtmlService { // run, whether it starts a new span or extends the previous one: // // * A line/column break or a wide intra-line gap starts a new span. 
- // The - // separating space goes on the *trailing* end of the previous span, - // never the leading end of the new one: an inter-word gap routinely - // leaves an inferred leading space on `text.text`, and a span that - // renders `white-space:pre` would then show that space before its - // first glyph at the run origin — so a double-click, which excludes - // surrounding whitespace, selects the word but leaves the leading - // space cell, making the highlight start a space-width to the left - // of the text. Hanging the space off the previous word instead keeps - // every span starting at its first glyph. The separator is deduped - // against a space already ending the previous span (a doubled space - // breaks literal find-in-page), so search and copy still get exactly - // one space across the boundary. + // The separating space is emitted as its *own* span carrying no fit + // width (`data-w`), never folded into a glyph span's text. Two + // reasons. (1) The on-load `scaleX` fit corrects a glyph span to its + // real advance; a trailing separator space has no visible glyph to + // map onto (a line break advances ~0 horizontally yet the space + // still occupies a fallback-font cell), so a single `scaleX` could + // not both land the word and collapse the space — folding it in + // squeezes the word. A separator span with no `data-w` is skipped by + // the fit script, leaving glyph spans to scale cleanly. (2) An + // inter-word gap routinely leaves an inferred leading space on + // `text.text`; peeling it onto a separate span (rather than the new + // word's leading edge) keeps every glyph span starting at its first + // glyph, so a double-click — which excludes surrounding whitespace — + // highlights the word without a space-width offset. 
The separator + // reuses the previous run's placement (it sits at that run's origin, + // transparent, adding no visible highlight) and is deduped against a + // space already ending the previous run, so search and copy get + // exactly one space across the boundary (a doubled space breaks + // literal find-in-page). // * A tight same-baseline continuation with no whitespace at the // boundary merges into the previous span. PDF splits one word into // several runs at every TJ kerning adjustment, and the browser finds @@ -1047,18 +1053,21 @@ class HtmlServiceImpl final : public HtmlService { // extends the same box: accumulate the fit target. page_out.sel_spans.back().width += width_px; } else { - // Hang the separator off the previous span. Needed whenever the - // boundary should carry whitespace — a detected word break, or a - // leading space we just peeled off this run — and deduped so the - // previous span ends with exactly one space. - if ((word_break || starts_space) && !page_out.sel_spans.empty()) { - std::string &prev = page_out.sel_spans.back().text; - if (prev.empty() || prev.back() != ' ') { - prev += ' '; - } + // Emit the separator as its own span, reusing the previous run's + // placement and carrying no fit width (`width == 0` -> no `data-w`, + // so the on-load scaleX skips it and never distorts a glyph span). + // Needed whenever the boundary should carry whitespace — a detected + // word break, or a leading space we just peeled off this run — and + // deduped against a space already ending the previous run so the + // boundary holds exactly one space (a doubled space breaks literal + // find-in-page). + if ((word_break || starts_space) && !prev_ends_space && + !page_out.sel_spans.empty()) { + page_out.sel_spans.push_back( + SpanOut{page_out.sel_spans.back().classes, " ", 0}); } // The selection text with the inter-word space, if any, peeled off - // the front (it now trails the previous span). 
A run that was + // the front (it became the separator span above). A run that was // nothing but the separator emits no span of its own. std::string core = starts_space ? text.text.substr(1) : text.text; if (!core.empty()) { From 274d9c6c20b973786ecc4e18b9ca6fd20000b9ea Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 30 Jun 2026 09:43:11 +0200 Subject: [PATCH 08/13] PDF selection layer: correct word-break comment to state the real reason MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The merge/no-merge boundary comment claimed words stay in separate spans to keep double-click from grabbing the whole phrase. That is false here: the separators are ordinary U+0020, which the browser breaks on across span boundaries regardless, so merging would not affect double-click. The actual reason is placement — each span is positioned at its own run origin and gets one uniform scaleX fit about its left edge, which a positional inter-word gap cannot survive. Rewrite the comment accordingly. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_013sNCgZ9CyD6jRkd4tGNxRF --- src/odr/internal/html/pdf_file.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index cb182a80..16ad2ad1 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -1008,13 +1008,24 @@ class HtmlServiceImpl final : public HtmlService { // over separate spans can't be grown by a double-click. Folding the // continuation into the previous text node keeps the whole word // selectable as a unit. A boundary that already carries a space is a - // word break, not an intra-word split, so it stays a separate span — - // gluing the words into one node (over a non-breaking separator) - // would instead make a double-click grab the whole phrase. 
The - // merged run's own origin is dropped — its glyphs flow from where - // the previous run ended — but the runs are tightly packed by - // construction and the layer is transparent, so the sub-glyph drift - // is invisible. + // word break, not an intra-word split, so it stays a separate span. + // (Double-click word selection is *not* the reason: the separators + // are ordinary U+0020, which the browser breaks on across span + // boundaries anyway, so merging words would not glue them under a + // double-click.) The reason is placement. Each span sits at its own + // run origin and the merge drops the continuation's origin, letting + // its glyphs flow on from where the previous run ended; the on-load + // `scaleX` fit then corrects the whole span to a single `data-w` + // about its left edge. That works for a tight intra-word split — + // the runs are packed with no positional gap, so one uniform scaleX + // lands them and the (transparent) sub-glyph drift is invisible. + // Across a word break it would not: the inter-word gap is a + // *position* (the next word has its own text matrix origin), not a + // stretchable glyph, so a single scaleX over the merged text cannot + // reproduce both word advances and the gap, and the selection box + // would slide off the painted glyphs. So words stay in separate, + // individually-positioned spans with the gap carried by a no-fit + // separator span between them. if (!text.text.empty()) { // Run origin and horizontal extent in page-box points (y down). 
The // advance (`text.width`) lives in the text matrix's space; its box From dc86f71852cfe667daade663eb336a15d63c81fb Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 30 Jun 2026 10:16:58 +0200 Subject: [PATCH 09/13] update refs --- test/data/reference-output/odr-private | 2 +- test/data/reference-output/odr-public | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index 85a14d01..6c5c7607 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit 85a14d010ffb87dddeb67cdc1aa18bd54d502c47 +Subproject commit 6c5c760724c092824107e94d736427e5d418c8dc diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public index 45b29f5b..3e1a28cb 160000 --- a/test/data/reference-output/odr-public +++ b/test/data/reference-output/odr-public @@ -1 +1 @@ -Subproject commit 45b29f5b796bda9ad0c14661179e50f91f47aecc +Subproject commit 3e1a28cbc2f95c908c0a9b3966d4420c9995d393 From af68b465083255f4159199b134536c1419e316b4 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 30 Jun 2026 10:19:36 +0200 Subject: [PATCH 10/13] PDF: fold TEXT_SELECTION_PLAN into AGENTS.md, delete the plan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The selection-layer plan is implemented (separate transparent selection layer, content-order split/merge sweep, per-run origin anchoring, on-load scaleX fit). Compress its decisions into the pdf AGENTS.md design notes — including the two reversals from the plan's "fixed" decisions (output is no longer fully JS-free; scaleX is kept, not dropped) — and record the deferred items (de-hyphenation, gap-based word separators, semantic structure) under Other known gaps. Remove the now-redundant plan file. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_013sNCgZ9CyD6jRkd4tGNxRF --- src/odr/internal/pdf/AGENTS.md | 51 +++++ src/odr/internal/pdf/TEXT_SELECTION_PLAN.md | 206 -------------------- 2 files changed, 51 insertions(+), 206 deletions(-) delete mode 100644 src/odr/internal/pdf/TEXT_SELECTION_PLAN.md diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index 1a042398..c68ef89c 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -385,6 +385,48 @@ recoverable Unicode are additionally marked non-extractable (`user-select: none` a rendering risk — such PDFs look right, their text just isn't selectable until the tables land. +**Selection layer separate from the glyph layer (decision 2026-06).** Selection +and find-in-page used to be poor because every show-text segment became one +absolutely-positioned transparent span at its run origin: runs sharing a line +were independent boxes with no whitespace or reading order between them, so a +phrase — or even one word split across `TJ` kerning runs — crossing a run +boundary was unfindable and a drag jumped between boxes. Fix: keep the **visual +glyph layer exactly as is** (absolutely-positioned PUA spans — what makes +rendering pixel-perfect) and restructure **only** the transparent Unicode into a +separate **selection layer** (`PageOut::sel_spans`, transparent via `.i`), +emitted contiguously *after* the visual content in content-stream order so a +native drag or Ctrl+F flows through it without an unselectable glyph span +(`.g`, `user-select:none`) interrupting. PDF.js-style layering, done statically +at generation time. Key points: +- **Content-stream-order sweep, never a global re-sort.** Content order is + almost always reading order (a producer paints a column top-to-bottom, then the + next); a global (baseline, x) sort would interleave columns sharing a y-band + and scramble multi-column text and tables. 
An O(n) sweep tracks the previous + run's baseline and right edge and decides each run's boundary. +- **Eager to split, conservative to merge.** New span when the baseline jumps + (>0.6·font-size) or the run starts left of the previous run's end, or when the + same-line gap exceeds 0.25·font-size — either inserts a single space (so + `"the quick"` matches across the break), as its **own** separator span at the + previous run's origin, deduped against whitespace the run already carries (a + doubled space breaks literal find-in-page). Otherwise a tight, whitespace-free + same-baseline continuation **merges** into the previous span — PDF splits one + word at every `TJ` kern and the browser finds word boundaries only within a + single text node, so folding the continuation keeps the whole word selectable. + Cells never merge across columns, so tables fall out as separate spans (correct + selection) with **no table detection**. +- **Per-run origin anchoring + an on-load `scaleX` fit (the one non-JS-free + bit).** Each selection span is absolutely positioned at its run origin (reused + from the glyph layer), so highlight drift can accumulate only *within* one + short run, never across a line. The transparent text renders in a system + fallback font with its own advances, so a tiny on-load JS script `scaleX`es + each glyph span (carrying its true advance as `data-w`) about its left edge to + the real glyph width; separator spans carry no `data-w` and are skipped. This + reverses two of the plan's original "fixed" decisions — output is **no longer + fully JS-free**, and `scaleX` is **kept**, not dropped — because per-run + `scaleX` is the only way to hold the highlight on the glyphs statically (PDF.js + measures the same factor at runtime). Visual rendering stays byte-for-byte + unchanged; only the highlight-rectangle alignment improves. + --- ## Tests @@ -657,6 +699,15 @@ tree, little else. 
CID → Unicode tables (large external data; the generator scaffolding in `tools/pdf/generate_cid_data.py` is landed, the storage decision and lookup remain). +- **Selection-layer refinements** (deferred from the selection-layer work): no + **de-hyphenation** — a line-final hyphen (`"infor-\nmation"`) stays unfindable + as `"information"`, since auto-joining is genuinely ambiguous (soft break + hyphen vs. a real `well-known`; PDF almost never marks the difference, only the + rare `U+00AD` is unambiguous) and lossy enough to hurt copy fidelity — revisit + as an opt-in / `U+00AD`-only heuristic. Also: gap-based word separators within + a line beyond the producer's inferred spaces (only if word-merging shows up in + practice), and richer static structure recovery (semantic `<p>`
/ + multi-column markup) — a separate, larger layout-analysis effort. - **Bidi & vertical writing** (deferred): RTL run reordering for the layout/selection order, and vertical writing mode (`Identity-V`/CJK — the `/W2`/`/DW2` vertical metrics and a perpendicular pen advance, which the diff --git a/src/odr/internal/pdf/TEXT_SELECTION_PLAN.md b/src/odr/internal/pdf/TEXT_SELECTION_PLAN.md deleted file mode 100644 index 17a41c86..00000000 --- a/src/odr/internal/pdf/TEXT_SELECTION_PLAN.md +++ /dev/null @@ -1,206 +0,0 @@ -# PDF text selection & search — plan - -## Context - -The PDF → HTML imaging output is now quite complete (stage 4), but **marking -text and searching is poor**. The cause is structural: every show-text segment -(`Tj`, or one string of a `TJ` array) becomes one `pdf::TextElement` -(`pdf_page_element.hpp`), and each becomes one **absolutely-positioned `<span>`** -placed at its run origin (`html/pdf_file.cpp` ~`l.902–941`). Runs that share a -visual line are therefore independent boxes at arbitrary coordinates with no -whitespace or reading order between them, so: - -- find-in-page can't match a phrase that crosses a run boundary, and a single - word split by kerning (several `TJ` adjustments) is several spans, so even - one-word search misses; -- dragging a selection jumps between unrelated boxes — there is no line flow for - the browser to follow; -- copy order follows content-stream (paint) order, not always reading order. - -For text with an embedded font we already emit a **dual layer** — a transparent, -selectable Unicode parent span with a nested visible PUA-glyph span -(`html/pdf_file.cpp` ~`l.1020–1054`). The selectable text already exists; it is -just **per-run and absolutely positioned**, which is exactly what selection and -search need it not to be. - -Intended outcome: native, JS-free selection and find-in-page that flow along -lines (and across wrapped lines within a block), without regressing the -pixel-perfect visual rendering.
- -## Approach (decided) - -Keep the **visual glyph layer exactly as it is** — absolutely-positioned PUA -spans are what make the rendering good; do not touch them. Restructure **only -the transparent Unicode layer** into per-line (and, where confident, per-block) -containers in reading order, with real whitespace between runs. This is -PDF.js-style layering done **statically at generation time**. - -Fixed decisions from discussion: - -1. **Separate text layer**, not a unified reflow. The selectable Unicode becomes - its own line/block-grouped layer; the visual layer stays pixel-perfect and - independent. The selection layer **replaces** today's per-run transparent - spans rather than stacking on top of them, so node count stays roughly flat. -2. **Static HTML/CSS only — no JavaScript.** Because we translate ahead of time, - all layout analysis happens in the C++ pass and is baked into static DOM; - native browser selection and Ctrl+F then work with zero runtime. This is a - hard constraint (JS-free output is a core virtue of the current export). -3. **Eager to split, conservative to merge.** Cluster runs into lines by baseline - (y + orientation), order by x. Merge adjacent lines into a block only when - clearly the same column — overlapping x-range, consistent leading - (baseline-to-baseline ≈ font size), same writing direction, same-ish size. - When any signal is shaky, fall back to separate line containers. The fallback - is lossless for within-line UX, so multi-line grouping never hurts the - single-line case. This also makes **tables safe without table detection**: - cells never merge across columns, so they fall out as separate line - containers (correct selection) — we do not need to recognize the table. -4. **Multi-line is the target**, degrading gracefully to line-only. Intra-block - line breaks get a single space separator so search matches across the wrap - (`"the\nquick"` → findable `"the quick"`). 
- -## Mechanism - -### Grouping (generation time, C++) - -A linear sweep over the page's `TextElement`s **in content-stream order** — -crucially *not* a global re-sort. Content-stream order is almost always reading -order already (a producer draws a column top-to-bottom, then the next column), so -trusting it is what keeps multi-column text and tables from scrambling. A global -sort by (baseline, x) would interleave columns sharing a y-band — exactly the -failure we must avoid. - -The sweep tracks the previous run's baseline and right edge, and for each run -decides the **separator** to insert before it: - -1. New line when the baseline jumps (more than ~0.6·font-size) or the run starts - left of where the previous ended (a column/line break in producer order). -2. Same-line gap when the horizontal gap to the previous run exceeds a small - fraction (~0.25·font-size). - -Either case inserts a single space (so search matches across the break), -*unless* the adjacent text already carries whitespace — many gaps are already -represented as an inferred leading space on the segment (`TextElement::text`), so -guard against double spaces (which would break literal find-in-page). - -Cost is O(n) per page — negligible next to the embedded-font re-encode that -already dominates the pass. No new font work; no sort. - -### Within-line gaps → whitespace (the core tension) - -Between consecutive runs on a line there is a horizontal gap (inter-word space, -kerning, tab to a column). The selection layer must reconcile two conflicting -needs: - -- **Searchable whitespace** — a real space character makes `"the quick"` - findable and copy readable, but a literal space has the font's space-width, not - the exact PDF gap, so the transparent text **drifts** from the glyphs along the - line. -- **Positional accuracy** — an exact-width spacer (inline-block / letter-spacing) - stays aligned but carries no whitespace, so words run together (`"thequick"`) - and search breaks. 
- -No single trick gives both, and the obvious width-fix is **not available to us**: - -- **`transform: scaleX(...)`** (the PDF.js technique) needs the run's *rendered* - width to compute the scale, which PDF.js measures **at runtime in JS**. We emit - no JS, so statically the factor is either uncomputable (system-font selection - layer) or just `1` (embedded font — the advances already match). So `scaleX` - buys us nothing and is dropped. -- **Chosen approach: anchor every run at its own known origin.** Each selection - span is absolutely positioned at the run's origin (the placement we already - compute for the glyph layer), so drift can only accumulate *within* one short - run, never across the line. Real spaces between runs (see grouping) give - searchable whitespace; the per-run anchoring keeps the highlight close without - any width-fix. Fully static, no JS. -- **Dead end — do not pursue: `position: relative; left:`** to offset an inserted - space. Relative positioning shifts the box visually but leaves its space - reserved in the flow, so siblings don't move; it cannot reclaim the gap. (A - negative `margin-left` *would* reclaim it, but with per-run origin anchoring we - don't need to.) - -### Highlight alignment quality bar - -Selection highlights the *selection layer's* boxes; if those are offset from the -visible glyphs the highlight looks shifted (text itself is never wrong — the -glyph layer is a separate, perfect layer, so misalignment shows up only as a -slightly-off highlight rectangle during an active drag). Per-run origin anchoring (above) -keeps this in the acceptable band: each run's highlight starts exactly on its -glyphs and can only drift within that one short run. We ship that and revisit -only if the residual within-run drift is noticeable in practice. 
- -## De-hyphenation (tracked, deferred) - -A line-final hyphen (`"infor-\nmation"`) is unfindable as `"information"` whether -joined with a space (`"infor- mation"`) or nothing (`"infor-mation"`); only -dropping the hyphen + break fixes search. But it is genuinely ambiguous — a -line-final hyphen may be a soft break hyphen (`infor-mation`) or a real one -(`well-\nknown` must stay `well-known`), and PDF almost never marks the -difference (most producers emit plain `U+002D` for both; the rare `U+00AD` soft -hyphen is the only unambiguous signal). - -Decision: **do not auto-de-hyphenate in v1** — lossy and wrong often enough to be -a net negative for copy fidelity. Join intra-block lines with a space, accept -that hyphenated-across-line words miss in search. Revisit as an opt-in heuristic: -collapse only when the trailing char is `U+00AD`, or behind a config flag. - -## Implementation sketch - -All changes are in the HTML layer; the IR (`pdf::TextElement`) already carries -what we need (`transform`, `size`, `advances`, `text`). - -- **`src/odr/internal/html/pdf_file.cpp`** — `HtmlServiceImpl::write_document`: - - **Visual layer (paint order, non-selectable):** every embedded-font run emits - its PUA glyph span (the existing display-only form, `base + " g " + fvN`); - fallback runs emit their Unicode in the system font, also `.g`. All visual - text is `user-select:none`. Invisible runs (Tr 3/7) paint nothing, so they - emit **no** visual span at all. - - **Selection layer (content order, transparent, selectable):** any run with - extractable text contributes one span carrying the real Unicode, anchored at - the run origin (reuse `base`), transparent via `.i`, with the leading - separator from the grouping sweep. Emitted per page after the paint content - so the selectable spans are contiguous in the DOM. 
- - **Fold out the "collapse" path.** Because all common-case selectable text now - lives in the selection layer, the visible layer no longer needs to render - real Unicode in the embedded font. Remove the collapse machinery - (`collapsible_unicode`, `used_unicode`, the per-run first-wins scalar walk) - and the real-Unicode `cmap` baking in the post-pass — the font is re-encoded - **PUA-only** (`reencode_to_pua(*sfnt)` / `wrap_to_otf(*cff)` with no extras). - Visual output stays pixel-identical (PUA maps to the same glyphs); the DOM - and font subset shrink in complexity. - - Separator classes are not needed (spaces ride inside the span text); existing - placement classes are reused via the `AtomicStyles` interner. - -## Size / performance notes - -- Generation: +O(n log n) sort per page; negligible. -- HTML size: the Unicode bytes are **relocated**, not duplicated; node count - stays roughly flat (one container per line vs. a transparent parent per run). - `scaleX` classes are bounded by distinct values via `AtomicStyles`. Real-space - separators are ~free; prefer them over inline-block spacers. The docs that - approached GitHub's 100 MB reference-output ceiling are dominated by the glyph - layer + embedded fonts, both unchanged. (The 100 MB ceiling is a soft - reference-output constraint, not a product limit — keep the layer lean but do - not let it block the design.) - -## Verification - -- Build/test in `cmake-build-relwithdebinfo`; run the PDF HTML output tests and - regenerate reference output for the `test/data/.../output/pdf` fixtures, eyeing - the diff for size and structure. -- Manual: open representative outputs (e.g. `geneve_1564.pdf`, - `978-3-030-65771-0.pdf`, a multi-column doc) in a browser and check: - - Ctrl+F finds phrases that span run boundaries and wrapped lines. - - Click-drag selection flows along a line and across lines in a block. - - Copy yields readable, correctly-ordered text. 
- - Multi-column / table-like pages do **not** scramble across columns on copy. - - Visual rendering is byte-for-byte unchanged from before (glyph layer - untouched) — confirm via the perceptual-diff oracle. - - Output stays JS-free. - -## Future work - -- De-hyphenation heuristic (opt-in / `U+00AD`-only). -- Gap-based word separators within a line (beyond the producer's inferred - spaces), if word-merging shows up in practice. -- Richer static structure recovery (semantic `<p>`
/ multi-column markup) — - a separate, larger layout-analysis effort, out of scope here. From dbaa18b519686510577a6df219fda95260c3cfde Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 30 Jun 2026 15:24:09 +0200 Subject: [PATCH 11/13] try again with lfs checkout --- .github/workflows/build_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_test.yml b/.github/workflows/build_test.yml index 17d590ca..3868b7f6 100644 --- a/.github/workflows/build_test.yml +++ b/.github/workflows/build_test.yml @@ -205,6 +205,7 @@ jobs: with: token: ${{ secrets.PAT_ANDIWAND }} submodules: true + lfs: true - name: ubuntu install tidy if: runner.os == 'Linux' From 5dc9ade4999e7d5dd27b7cdcdeb09cbe51cf75a0 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 30 Jun 2026 16:32:36 +0200 Subject: [PATCH 12/13] PDF selection layer: per-line flow blocks, letter-spacing fit Group the transparent selection runs into one absolutely-positioned container per PDF line whose run spans flow inline, instead of one absolutely-positioned span per run. Native within-line selection, double-click and find-in-page now work and the run boxes are real. Horizontal placement within a line is purely cumulative: each inter-run separator span's data-w is the gap width, so word advances and gaps telescope to each run's true x-offset (wide table-column gaps reproduced, not collapsed). The on-load fit switches from scaleX to letter-spacing = (target - measured) / glyph_count (negative to squeeze), which is consumed during layout so the box grows and the next run flows from the corrected edge. The selection placement reuses the glyph layer's origin minus the Tc/Tw spacing classes (the fit subsumes them). Rotated/skewed (matrix) runs cannot flow or be fit, so each keeps its own single-run line block positioned by its matrix with no data-w, reproducing the old per-run absolute placement. Visual glyph layer unchanged.
First rung toward native selection; paragraph-level grouping is deferred. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Mq2d2eFjjCL8cHpU9pHugq --- src/odr/internal/html/pdf_file.cpp | 267 ++++++++++++++++------------- src/odr/internal/pdf/AGENTS.md | 66 ++++--- 2 files changed, 194 insertions(+), 139 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 16ad2ad1..65034329 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -576,18 +576,30 @@ class HtmlServiceImpl final : public HtmlService { // One emitted span: the resolved class tokens plus the already-escaped text. // The renderer paints text in two independent layers (see `write_document`): // the **visual** layer (`PageOut::items`, in paint order) carries the - // unselectable glyphs; the **selection** layer (`PageOut::sel_spans`, in + // unselectable glyphs; the **selection** layer (`PageOut::sel_lines`, in // content/reading order) carries the transparent, selectable real Unicode. // Both layers are flat — a span is just classes + text. struct SpanOut { std::string classes; std::string text; // Selection layer only: the run's true advance in CSS px, emitted as - // `data-w` so the on-load fit script can `scaleX` the transparent span to - // the real glyph width (the system fallback font it renders in has its own, - // different advances). 0 on visual spans — no attribute is written. + // `data-w` so the on-load fit script can stretch/squeeze the transparent + // span to the real glyph width with `letter-spacing` (the system fallback + // font it renders in has its own, different advances). For an inter-run + // separator it is the gap width, so the flowed runs land at their true + // x-offsets. 0 on visual spans and on unfitted runs — no attribute written. 
double width{0}; }; + // The selection layer is grouped into per-line flow blocks: one + // absolutely-positioned container per PDF line (its `classes` carry the line + // origin placement plus `.i`), holding inline run ``s that *flow*. This + // is what makes native within-line selection, double-click and find-in-page + // work and keeps the run boxes real — see `write_document` and the on-load + // fit script. + struct LineOut { + std::string classes; + std::vector runs; + }; // One vector item, already serialized to an SVG fragment in the page's // viewBox (PDF points, y-down): a painted `` or an ``. // Contiguous vector items share one `` at write time. @@ -602,10 +614,11 @@ class HtmlServiceImpl final : public HtmlService { double width{0}; // page box width, PDF points (for the SVG viewBox) double height{0}; // page box height, PDF points std::vector items; - // The selection layer: transparent, selectable Unicode spans in - // content-stream (reading) order, emitted after the visual content so they - // form one contiguous, cleanly selectable run in the DOM. - std::vector sel_spans; + // The selection layer: transparent, selectable Unicode grouped into + // per-line flow blocks in content-stream (reading) order, emitted after the + // visual content so they form one contiguous, cleanly selectable run in the + // DOM. + std::vector sel_lines; // `` defs for this page's clipped paths, emitted once in a hidden // ``; the path fragments reference them by id. Empty when no path on // the page is clipped. @@ -795,6 +808,11 @@ class HtmlServiceImpl final : public HtmlService { double prev_baseline = 0; double prev_end = 0; bool prev_ends_space = false; + bool prev_was_matrix = false; + // Index of the line block currently being filled in `sel_lines`, or -1 + // before the first line opens. Runs append to its `runs`; a line/column + // break (or a matrix run) opens the next. 
+ int cur_line = -1; for (const pdf::PageElement &element : pdf::extract_page(stream, *page->resources, *m_logger)) { @@ -914,7 +932,12 @@ class HtmlServiceImpl final : public HtmlService { // the uniform branch, carried by the CSS matrix in the general branch // (so spacing there is expressed pre-transform, scale == 1). double scale; - if (m.b == 0 && m.c == 0 && m.a == m.d) { + // A rotated/skewed run takes the general (matrix) branch; an upright + // uniform run the left/top/font-size branch. Only the latter can flow + // inside a line block and be fit by `letter-spacing` (the fit measures + // on-screen width, which for a matrix box is a rotated bbox). + const bool is_matrix = !(m.b == 0 && m.c == 0 && m.a == m.d); + if (!is_matrix) { // Upright uniform scale: fold the scale into the font size and place // the origin with left/top, so the (otherwise near-universal) matrix // is dropped. The ascent shift is purely vertical here (local y maps @@ -941,6 +964,13 @@ class HtmlServiceImpl final : public HtmlService { scale = 1; } + // Placement-only class set (origin + font size), snapshot before the + // Tc/Tw spacing classes below. The selection line container uses this: + // the spacing is folded into each run's `data-w` advance and applied by + // the `letter-spacing` fit, so carrying a separate Tc `letter-spacing` + // would collide with it. The visual glyph layer keeps the full `base`. + const std::string place = base; + // PDF char/word spacing (Tc/Tw) translate directly to CSS. TJ kerning // needs no expression here: `extract_text` emits a separate segment per // TJ string and folds the adjustment into the following segment's @@ -977,55 +1007,46 @@ class HtmlServiceImpl final : public HtmlService { // --- Selection layer ------------------------------------------------- // Every run with extractable text feeds the transparent, selectable - // layer (`.i`) with its real Unicode, anchored at the run origin via - // the shared placement (`base`). 
A content-order sweep decides, per - // run, whether it starts a new span or extends the previous one: + // layer (`.i`) with its real Unicode. Runs are grouped into per-line + // flow blocks: one absolutely-positioned container per PDF line (placed + // at the line's first run origin via `place`), whose inline run spans + // *flow*, so a native drag, double-click or find-in-page works within + // the line and the run boxes are real. A content-order sweep decides, + // per run, whether it opens a new line, extends the line with a fresh + // run, or merges into the previous run: // - // * A line/column break or a wide intra-line gap starts a new span. - // The separating space is emitted as its *own* span carrying no fit - // width (`data-w`), never folded into a glyph span's text. Two - // reasons. (1) The on-load `scaleX` fit corrects a glyph span to its - // real advance; a trailing separator space has no visible glyph to - // map onto (a line break advances ~0 horizontally yet the space - // still occupies a fallback-font cell), so a single `scaleX` could - // not both land the word and collapse the space — folding it in - // squeezes the word. A separator span with no `data-w` is skipped by - // the fit script, leaving glyph spans to scale cleanly. (2) An - // inter-word gap routinely leaves an inferred leading space on - // `text.text`; peeling it onto a separate span (rather than the new - // word's leading edge) keeps every glyph span starting at its first - // glyph, so a double-click — which excludes surrounding whitespace — - // highlights the word without a space-width offset. The separator - // reuses the previous run's placement (it sits at that run's origin, - // transparent, adding no visible highlight) and is deduped against a - // space already ending the previous run, so search and copy get - // exactly one space across the boundary (a doubled space breaks - // literal find-in-page). 
+ // * A line/column break (baseline jump, or x regressing left of the + // previous run's end) opens a new line block. The previous line is + // closed with a trailing space (deduped) so a phrase split across + // the break is still found/copied as "word1 word2"; that space + // carries no `data-w`, so the fit skips it and it renders past the + // last glyph (transparent, harmless). The inferred leading space a + // run often carries is dropped at a line start — it belongs to the + // break, already covered by the trailing space. + // * A wide intra-line gap, or a whitespace boundary, starts a fresh + // run within the same line. The inter-run gap rides on a separator + // span whose `data-w` *is* the gap width, so the flowed runs land at + // their true x-offsets (telescoping: a separator gap plus a run + // advance equals the next run's offset), and wide gaps — table + // columns on one baseline — are reproduced, not collapsed to a + // single space. The separator holds one U+0020, deduped against a + // space already ending the previous run (a doubled space breaks + // literal find-in-page); the inferred leading space is peeled onto + // it so each word run starts at its first glyph (a double-click, + // which excludes surrounding whitespace, then highlights the word + // without a space-width offset). // * A tight same-baseline continuation with no whitespace at the - // boundary merges into the previous span. PDF splits one word into + // boundary merges into the previous run. PDF splits one word into // several runs at every TJ kerning adjustment, and the browser finds - // word boundaries within a single text node only, so a word spread - // over separate spans can't be grown by a double-click. Folding the - // continuation into the previous text node keeps the whole word - // selectable as a unit. A boundary that already carries a space is a - // word break, not an intra-word split, so it stays a separate span. 
- // (Double-click word selection is *not* the reason: the separators - // are ordinary U+0020, which the browser breaks on across span - // boundaries anyway, so merging words would not glue them under a - // double-click.) The reason is placement. Each span sits at its own - // run origin and the merge drops the continuation's origin, letting - // its glyphs flow on from where the previous run ended; the on-load - // `scaleX` fit then corrects the whole span to a single `data-w` - // about its left edge. That works for a tight intra-word split — - // the runs are packed with no positional gap, so one uniform scaleX - // lands them and the (transparent) sub-glyph drift is invisible. - // Across a word break it would not: the inter-word gap is a - // *position* (the next word has its own text matrix origin), not a - // stretchable glyph, so a single scaleX over the merged text cannot - // reproduce both word advances and the gap, and the selection box - // would slide off the painted glyphs. So words stay in separate, - // individually-positioned spans with the gap carried by a no-fit - // separator span between them. + // word boundaries within a single text node only, so folding the + // continuation keeps the whole word selectable as a unit; its + // advance extends the same run's fit target. + // + // A rotated/skewed (matrix) run cannot flow or be `letter-spacing`-fit + // (its on-screen box is a rotated bbox), so it gets its own single-run + // line block positioned by its own matrix and left unfitted (no + // `data-w`) — reproducing the old per-run absolute placement, with no + // flow benefit but no regression either. if (!text.text.empty()) { // Run origin and horizontal extent in page-box points (y down). 
The // advance (`text.width`) lives in the text matrix's space; its box @@ -1041,56 +1062,58 @@ class HtmlServiceImpl final : public HtmlService { const double extent = text.width * axis; const double font_pt = text.size * axis; const bool starts_space = text.text.front() == ' '; - - bool merge = false; - bool word_break = false; - if (have_prev_run && font_pt > 0) { - const bool new_line = - std::abs(baseline - prev_baseline) > 0.6 * font_pt || - ox < prev_end - 0.5 * font_pt; - const bool gap = ox - prev_end > 0.25 * font_pt; - const bool boundary_space = prev_ends_space || starts_space; - if (new_line || gap) { - word_break = true; - } else if (!boundary_space) { - merge = true; - } + const double width_px = extent * pt_to_px; + // The fit target: 0 for a matrix run (skipped by the fit), else the + // run's true advance. + const double fit_w = is_matrix ? 0.0 : width_px; + // Inter-run gap in box px (only meaningful within a line). + const double gap_px = std::max(0.0, ox - prev_end) * pt_to_px; + + // Open a new line block on the first run, a matrix run (or just after + // one), or a detected line/column break. + bool new_line = !have_prev_run || is_matrix || prev_was_matrix; + bool gap = false; + if (have_prev_run && font_pt > 0 && !new_line) { + new_line = std::abs(baseline - prev_baseline) > 0.6 * font_pt || + ox < prev_end - 0.5 * font_pt; + gap = ox - prev_end > 0.25 * font_pt; } - const double width_px = extent * pt_to_px; - if (merge && !page_out.sel_spans.empty()) { - page_out.sel_spans.back().text += escape_selection_text(text.text); - // The merged run flows on from the previous one, so its advance - // extends the same box: accumulate the fit target. - page_out.sel_spans.back().width += width_px; - } else { - // Emit the separator as its own span, reusing the previous run's - // placement and carrying no fit width (`width == 0` -> no `data-w`, - // so the on-load scaleX skips it and never distorts a glyph span). 
- // Needed whenever the boundary should carry whitespace — a detected - // word break, or a leading space we just peeled off this run — and - // deduped against a space already ending the previous run so the - // boundary holds exactly one space (a doubled space breaks literal - // find-in-page). - if ((word_break || starts_space) && !prev_ends_space && - !page_out.sel_spans.empty()) { - page_out.sel_spans.push_back( - SpanOut{page_out.sel_spans.back().classes, " ", 0}); + std::string core = starts_space ? text.text.substr(1) : text.text; + + if (new_line) { + if (cur_line >= 0 && have_prev_run && !prev_ends_space) { + page_out.sel_lines[cur_line].runs.push_back(SpanOut{"", " ", 0}); + } + page_out.sel_lines.push_back(LineOut{place + " i", {}}); + cur_line = static_cast(page_out.sel_lines.size()) - 1; + if (!core.empty()) { + page_out.sel_lines[cur_line].runs.push_back( + SpanOut{"", escape_selection_text(std::move(core)), fit_w}); + } + } else if (gap || prev_ends_space || starts_space) { + // Fresh run within the line, gap carried by a separator span. + std::vector &runs = page_out.sel_lines[cur_line].runs; + if (!prev_ends_space && !runs.empty()) { + runs.push_back(SpanOut{"", " ", gap_px}); } - // The selection text with the inter-word space, if any, peeled off - // the front (it became the separator span above). A run that was - // nothing but the separator emits no span of its own. - std::string core = starts_space ? text.text.substr(1) : text.text; if (!core.empty()) { - page_out.sel_spans.push_back( - SpanOut{base + " i", escape_selection_text(std::move(core)), - width_px}); + runs.push_back( + SpanOut{"", escape_selection_text(std::move(core)), fit_w}); + } + } else { + // Tight, whitespace-free continuation: extend the previous run. 
+ std::vector &runs = page_out.sel_lines[cur_line].runs; + if (!runs.empty()) { + runs.back().text += escape_selection_text(text.text); + runs.back().width += width_px; } } prev_baseline = baseline; prev_end = ox + extent; prev_ends_space = text.text.back() == ' '; + prev_was_matrix = is_matrix; have_prev_run = true; } @@ -1228,7 +1251,12 @@ class HtmlServiceImpl final : public HtmlService { // diff than the open/text/close split, while each run still gets its own // line under the page div. HtmlElementOptions options; - options.set_inline(true).set_class(span.classes); + options.set_inline(true); + // Inline selection run spans carry no class (placement and transparency + // are inherited from the line container); everything else names classes. + if (!span.classes.empty()) { + options.set_class(span.classes); + } // Selection spans carry their true advance (px) for the fit script. std::string data_w; if (span.width > 0) { @@ -1303,43 +1331,52 @@ class HtmlServiceImpl final : public HtmlService { // cleanly across runs and lines without the visual glyphs (which are // `user-select:none`) interrupting it. out.write_element_begin("div", HtmlElementOptions().set_class("sel")); - for (const SpanOut &span : page.sel_spans) { - write_span(span); + for (const LineOut &line : page.sel_lines) { + // One absolutely-positioned container per PDF line; its run spans flow + // inline, so selection/find/double-click work natively across them. + out.write_element_begin("div", + HtmlElementOptions().set_class(line.classes)); + for (const SpanOut &run : line.runs) { + write_span(run); + } + out.write_element_end("div"); } out.write_element_end("div"); out.write_element_end("div"); } - // Selection-fit script. The transparent selection spans render real Unicode - // in the browser's system font, whose advances differ from the embedded + // Selection-fit script. 
The transparent run spans render real Unicode in + // the browser's system font, whose advances differ from the embedded // glyphs, so an active highlight is wider or narrower than the visible run. - // Each span carries its true advance in `data-w` (CSS px); correct the box - // with a horizontal `scaleX` = target / measured about the run's left - // origin (`.t` has `transform-origin:0 0`). The page is fully usable - // without this — it only tightens the highlight rectangle, so it degrades + // Each fitted span carries its true advance in `data-w` (CSS px); correct + // its box with `letter-spacing = (target - measured) / glyph_count`. Unlike + // the old `scaleX`, `letter-spacing` is consumed *during* layout, so the + // box actually grows/shrinks and the following run flows from the corrected + // edge — what makes the per-line flow blocks land. Negative values squeeze + // a too-wide run. The page is fully usable without this — it only tightens + // the highlight rectangle and the within-line x-offsets, so it degrades // gracefully where scripts are blocked. // // Run per page, lazily, via `IntersectionObserver`: a large document only // pays for the pages actually scrolled into view, never a single // whole-document pass on load. Within a page, read every width first and - // write every transform second so the measurement loop isn't interleaved - // with style writes (which would force a reflow per span). Upright runs - // only: a run that already carries a rotation/skew matrix is skipped — its - // on-screen box is a rotated bounding box, not the local advance, so a - // single `scaleX` can't correct it (these keep today's behaviour). + // write every `letter-spacing` second so the measurement loop isn't + // interleaved with style writes (which would force a reflow per span); a + // run's own width is independent of its siblings' spacing, so one + // measure-then-write pass fits them all and the cumulative offsets resolve. 
+ // Matrix (rotated/skewed) runs carry no `data-w` and are skipped. out.write_script_begin(); out.write_raw( R"JS((function(){if(!window.IntersectionObserver)return;)JS" R"JS(var io=new IntersectionObserver(function(es){es.forEach(function(e){)JS" R"JS(if(!e.isIntersecting)return;io.unobserve(e.target);)JS" R"JS(var s=e.target.querySelectorAll('.sel span[data-w]'),n=s.length,)JS" - R"JS(w=new Array(n),f=new Array(n),i,k;)JS" - R"JS(for(i=0;i0){)JS" - R"JS(k=parseFloat(s[i].getAttribute('data-w'))/w[i];)JS" - R"JS(s[i].style.transform='scaleX('+k+')';}}})},{rootMargin:'200px'});)JS" + R"JS(w=new Array(n),i,c,d;)JS" + R"JS(for(i=0;i0&&w[i]>0){)JS" + R"JS(d=parseFloat(s[i].getAttribute('data-w'));)JS" + R"JS(s[i].style.letterSpacing=((d-w[i])/c)+'px';}}})},{rootMargin:'200px'});)JS" R"JS(document.querySelectorAll('.p').forEach(function(p){io.observe(p);});})();)JS", false); out.write_script_end(); diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index c68ef89c..0267de6a 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -393,7 +393,7 @@ phrase — or even one word split across `TJ` kerning runs — crossing a run boundary was unfindable and a drag jumped between boxes. Fix: keep the **visual glyph layer exactly as is** (absolutely-positioned PUA spans — what makes rendering pixel-perfect) and restructure **only** the transparent Unicode into a -separate **selection layer** (`PageOut::sel_spans`, transparent via `.i`), +separate **selection layer** (`PageOut::sel_lines`, transparent via `.i`), emitted contiguously *after* the visual content in content-stream order so a native drag or Ctrl+F flows through it without an unselectable glyph span (`.g`, `user-select:none`) interrupting. PDF.js-style layering, done statically @@ -403,29 +403,47 @@ at generation time. Key points: next); a global (baseline, x) sort would interleave columns sharing a y-band and scramble multi-column text and tables. 
An O(n) sweep tracks the previous run's baseline and right edge and decides each run's boundary. -- **Eager to split, conservative to merge.** New span when the baseline jumps - (>0.6·font-size) or the run starts left of the previous run's end, or when the - same-line gap exceeds 0.25·font-size — either inserts a single space (so - `"the quick"` matches across the break), as its **own** separator span at the - previous run's origin, deduped against whitespace the run already carries (a - doubled space breaks literal find-in-page). Otherwise a tight, whitespace-free - same-baseline continuation **merges** into the previous span — PDF splits one - word at every `TJ` kern and the browser finds word boundaries only within a - single text node, so folding the continuation keeps the whole word selectable. - Cells never merge across columns, so tables fall out as separate spans (correct - selection) with **no table detection**. -- **Per-run origin anchoring + an on-load `scaleX` fit (the one non-JS-free - bit).** Each selection span is absolutely positioned at its run origin (reused - from the glyph layer), so highlight drift can accumulate only *within* one - short run, never across a line. The transparent text renders in a system - fallback font with its own advances, so a tiny on-load JS script `scaleX`es - each glyph span (carrying its true advance as `data-w`) about its left edge to - the real glyph width; separator spans carry no `data-w` and are skipped. This - reverses two of the plan's original "fixed" decisions — output is **no longer - fully JS-free**, and `scaleX` is **kept**, not dropped — because per-run - `scaleX` is the only way to hold the highlight on the glyphs statically (PDF.js - measures the same factor at runtime). Visual rendering stays byte-for-byte - unchanged; only the highlight-rectangle alignment improves. 
+- **Eager to split, conservative to merge.** A new *line block* opens when the + baseline jumps (>0.6·font-size) or the run starts left of the previous run's + end; within a line, a gap exceeding 0.25·font-size (or a whitespace boundary) + starts a fresh run, otherwise a tight, whitespace-free same-baseline + continuation **merges** into the previous run — PDF splits one word at every + `TJ` kern and the browser finds word boundaries only within a single text + node, so folding the continuation keeps the whole word selectable. A single + space is inserted at every break (so `"the quick"` matches across it): a + separator span within the line, or the previous line's trailing space across a + line break, deduped against whitespace the run already carries (a doubled + space breaks literal find-in-page). Cells never merge across columns, so + tables fall out as separate runs (correct selection) with **no table + detection**. +- **Per-line flow blocks + an on-load `letter-spacing` fit (the one non-JS-free + bit).** Each PDF line is one absolutely-positioned container (placed at its + first run's origin, reusing the glyph-layer placement *minus* the Tc/Tw + spacing classes); its run `<span>`s **flow inline** rather than being + individually positioned, so a native drag, double-click and find-in-page work + within the line and the run boxes are *real*. Horizontal placement within the + line is purely cumulative — each separator span's `data-w` is the inter-run + gap, so word advances and gaps telescope to each run's true x-offset (wide + table-column gaps reproduced, not collapsed). The transparent text renders in + a system fallback font with its own advances, so a tiny on-load JS script fits + each run (carrying its true advance as `data-w`) with + `letter-spacing = (target − measured) / glyph_count` — negative to squeeze a + too-wide run. 
Unlike the old per-run `scaleX`, `letter-spacing` is consumed + *during* layout, so the box grows/shrinks and the following run flows from the + corrected edge (that is why the flow blocks work). A rotated/skewed (matrix) + run cannot flow or be fit (its on-screen box is a rotated bbox), so it keeps + its own single-run line block positioned by its matrix and carries no `data-w` + (the fit skips it) — reproducing the old per-run absolute placement. Output is + **no longer fully JS-free**; visual rendering stays byte-for-byte unchanged, + only the selection layer changed. + - **Known follow-ups.** Vertical placement within a line still rides each + run's shared baseline, but a line block assumes ~uniform leading; mixed font + sizes in one line (sub/superscripts) align by their own baseline but the + container's box height tracks the first run's size (cosmetic highlight drift + only). Cross-line find-in-page depends on the browser treating the trailing + space + block boundary as a single space. The next rung — grouping lines + into paragraph blocks for native cross-line selection — is deferred (needs + layout analysis with a confidence fallback; see the chat that scoped this). --- From af1a4a4458e181435b747a41e81888c9857a4d95 Mon Sep 17 00:00:00 2001 From: Andreas Stefl Date: Tue, 30 Jun 2026 16:40:57 +0200 Subject: [PATCH 13/13] PDF selection layer: emit line blocks tight (no inter-span whitespace) The per-line container carries `white-space:pre` (from `.t`), so the newlines + indentation the HtmlWriter emits between its now-inline run spans rendered as real whitespace and shoved the runs onto a new line / indented them. Mark the container `set_inline` so the writer emits the whole line tight; this is a formatting flag only and does not change the element's CSS display. Also correct the stale `SpanOut` comment that still described both layers as flat. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Mq2d2eFjjCL8cHpU9pHugq --- src/odr/internal/html/pdf_file.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 65034329..1056fb26 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -576,9 +576,11 @@ class HtmlServiceImpl final : public HtmlService { // One emitted span: the resolved class tokens plus the already-escaped text. // The renderer paints text in two independent layers (see `write_document`): // the **visual** layer (`PageOut::items`, in paint order) carries the - // unselectable glyphs; the **selection** layer (`PageOut::sel_lines`, in - // content/reading order) carries the transparent, selectable real Unicode. - // Both layers are flat — a span is just classes + text. + // unselectable glyphs as a flat list of positioned spans; the **selection** + // layer (`PageOut::sel_lines`, in content/reading order) carries the + // transparent, selectable real Unicode grouped into per-line flow blocks + // (`LineOut`), each holding inline run spans. `SpanOut` is the shared leaf of + // both — a visual glyph span, or one flowed selection run. struct SpanOut { std::string classes; std::string text; @@ -1334,8 +1336,14 @@ class HtmlServiceImpl final : public HtmlService { for (const LineOut &line : page.sel_lines) { // One absolutely-positioned container per PDF line; its run spans flow // inline, so selection/find/double-click work natively across them. - out.write_element_begin("div", - HtmlElementOptions().set_class(line.classes)); + // `set_inline` stops the writer from emitting newlines + indent + // *between* the run spans: the container carries `white-space:pre` + // (from + // `.t`), so that formatting whitespace would otherwise render as real + // text and shove the runs onto a new line / indent them. 
+ out.write_element_begin( + "div", + HtmlElementOptions().set_inline(true).set_class(line.classes)); for (const SpanOut &run : line.runs) { write_span(run); }