Skip to content
Open
32 changes: 32 additions & 0 deletions src/odr/html.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,21 @@ enum class HtmlTableGridlines {
hard,
};

/// @brief PDF text rendering mode.
///
/// Selects how text is emitted in PDF→HTML output.
///
/// - `dual_layer`: A visual layer (paint order, embedded PUA glyphs) and a
/// separate transparent selection/search layer (reading order, real Unicode).
/// Similar to pdf.js. No JavaScript required.
/// - `single_layer`: A single combined layer where every glyph is mapped to
/// Unicode via frequency analysis. Similar to pdf2htmlEX. No JavaScript
/// required.
enum class PdfTextMode {
dual_layer,
single_layer,
};

/// @brief HTML configuration.
struct HtmlConfig {
// document output file names
Expand Down Expand Up @@ -106,6 +121,23 @@ struct HtmlConfig {
std::string background_image_format{"png"};
double background_image_dpi{144.0};

// PDF text mode
PdfTextMode pdf_text_mode{PdfTextMode::dual_layer};
// `dual_layer`'s invisible selection-layer text is rendered in a local
// system font (tried in order; the first that resolves wins) rather than
// the embedded PDF font, so its natural width rarely matches the
// PDF-derived box width CSS `text-justify` is asked to fill (justify can
// only add spacing, never compress).
// `pdf_dual_layer_fallback_font_size_adjust` is applied as that @font-face's
// `size-adjust` (0-1, written out as a percent) to shrink the fallback font's
// metrics toward the PDF's, leaving less — ideally no — gap for justify to
// compress instead of stretch into. Safe to underestimate (justify then just
// spreads characters further; harmless on an invisible layer) but not to
// overestimate (the excess is clipped, not shrunk).
std::vector<std::string> pdf_dual_layer_fallback_fonts{
"Arial", "Helvetica", "Liberation Sans", "DejaVu Sans", "Nimbus Sans"};
double pdf_dual_layer_fallback_font_size_adjust{0.5};

// drm options
bool no_drm{false};

Expand Down
18 changes: 11 additions & 7 deletions src/odr/internal/font/cff_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,9 @@ std::string cff::wrap_to_otf(const CffFont &font,
const std::map<char32_t, std::uint16_t> &extra) {
const std::uint16_t glyphs = font.glyph_count();

// The uniform PUA re-encode: pua_code_point(glyph) -> glyph over
// every glyph. serialize_cmap throws if a code point is beyond the BMP, which
// also bounds the glyph count to the PUA capacity.
// The uniform PUA re-encode: pua_code_point(glyph) -> glyph over every glyph.
// Glyphs past the 6400-slot BMP PUA overflow into Supplementary PUA-A, and
// serialize_cmap emits a format-12 subtable to cover them.
std::map<char32_t, std::uint16_t> pua;
for (std::uint16_t glyph = 0; glyph < glyphs; ++glyph) {
pua[pua_code_point(glyph)] = glyph;
Expand Down Expand Up @@ -184,10 +184,14 @@ std::string cff::wrap_to_otf(const CffFont &font,
tables.emplace_back("cmap", serialize_cmap(pua));
tables.emplace_back("name", serialize_name(font.name()));
tables.emplace_back("post", serialize_post());
tables.emplace_back("OS/2",
serialize_os2(font.units_per_em(), bbox.y_min, bbox.y_max,
static_cast<std::uint16_t>(first),
static_cast<std::uint16_t>(last)));
// OS/2 usFirst/usLastCharIndex are u16; a beyond-BMP PUA code point (large
// glyph counts overflow into Supplementary PUA-A) is clamped to 0xFFFF.
tables.emplace_back(
"OS/2",
serialize_os2(
font.units_per_em(), bbox.y_min, bbox.y_max,
static_cast<std::uint16_t>(std::min<char32_t>(first, 0xffff)),
static_cast<std::uint16_t>(std::min<char32_t>(last, 0xffff))));

std::ostringstream out;
build_sfnt(out, 0x4f54544f /* 'OTTO' */, std::move(tables));
Expand Down
86 changes: 73 additions & 13 deletions src/odr/internal/font/sfnt_transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@
#include <algorithm>
#include <bit>
#include <cstdint>
#include <limits>
#include <map>
#include <ostream>
#include <ranges>
#include <stdexcept>
#include <utility>

namespace odr::internal::font {
Expand All @@ -18,8 +18,15 @@ namespace {

namespace bs = util::byte_string;

constexpr char32_t pua_base = 0xe000;
constexpr std::uint16_t pua_capacity = 0xf8ff - 0xe000 + 1; // 6400
// Glyphs are re-encoded to Private Use Area code points, filling the BMP PUA
// first and overflowing into Supplementary PUA-A (Plane 15). A uint16 glyph id
// (max 65535) offset past the 6400-slot BMP PUA tops out at U+FE6FF, well
// inside PUA-A's 65534 slots, so Supplementary PUA-B is never needed.
constexpr char32_t pua_base = 0xe000; // BMP PUA start
constexpr std::uint16_t pua_bmp_capacity = 0xf8ff - 0xe000 + 1; // 6400
constexpr char32_t pua_supp_a_base = 0xf0000; // PUA-A (Plane 15)
constexpr std::uint32_t pua_capacity =
pua_bmp_capacity + (0xffffd - 0xf0000 + 1); // 6400 + 65534 = 71934

void pad4(std::string &s) {
while (s.size() % 4 != 0) {
Expand Down Expand Up @@ -67,8 +74,14 @@ SearchHints search_hints(const std::uint16_t count, const std::uint16_t unit) {

namespace odr::internal {

namespace bs = util::byte_string;

char32_t font::pua_code_point(const std::uint16_t glyph) noexcept {
return pua_base + glyph;
if (glyph < pua_bmp_capacity) {
return pua_base + glyph;
}
// Overflow past the BMP PUA into Supplementary PUA-A (U+F0000..U+FFFFD).
return pua_supp_a_base + (glyph - pua_bmp_capacity);
}

void font::build_sfnt(std::ostream &out, const std::uint32_t sfnt_version,
Expand Down Expand Up @@ -134,7 +147,59 @@ void font::build_sfnt(std::ostream &out, const std::uint32_t sfnt_version,
}
}

/// Format-12 `cmap` subtable (segmented coverage): sequential map groups over
/// the full Unicode range, each `[startCharCode, endCharCode]` mapping to
/// `startGlyphID + (code - startCharCode)`. Used when the map reaches beyond
/// the BMP (glyphs overflowing into Supplementary PUA-A), which format 4 cannot
/// express. Wrapped in a (Windows, Unicode full repertoire) encoding record.
static std::string
serialize_cmap_format12(const std::map<char32_t, std::uint16_t> &map) {
struct Group {
std::uint32_t start_code;
std::uint32_t end_code;
std::uint32_t start_glyph;
};
std::vector<Group> groups;
for (const auto &[code, glyph] : map) {
if (!groups.empty() && code == groups.back().end_code + 1 &&
glyph == groups.back().start_glyph +
(groups.back().end_code - groups.back().start_code) + 1) {
groups.back().end_code = code; // extend the current lockstep run
} else {
groups.push_back({code, code, glyph});
}
}

std::string sub;
bs::put_u16_be(sub, 12); // format
bs::put_u16_be(sub, 0); // reserved
bs::put_u32_be(sub,
static_cast<std::uint32_t>(16 + 12 * groups.size())); // len
bs::put_u32_be(sub, 0); // language
bs::put_u32_be(sub, static_cast<std::uint32_t>(groups.size()));
for (const auto &g : groups) {
bs::put_u32_be(sub, g.start_code);
bs::put_u32_be(sub, g.end_code);
bs::put_u32_be(sub, g.start_glyph);
}

std::string cmap;
bs::put_u16_be(cmap, 0); // version
bs::put_u16_be(cmap, 1); // numTables
bs::put_u16_be(cmap, 3); // platformID (Windows)
bs::put_u16_be(cmap, 10); // encodingID (Unicode full repertoire)
bs::put_u32_be(cmap, 12); // offset to the subtable
cmap += sub;
return cmap;
}

std::string font::serialize_cmap(const std::map<char32_t, std::uint16_t> &map) {
// Format 4 tops out at the BMP; a map that overflows into the Supplementary
// PUA needs format 12's 32-bit code ranges instead.
if (!map.empty() && map.rbegin()->first > 0xffff) {
return serialize_cmap_format12(map);
}

// A format-4 segment: a contiguous code range [start, end] whose glyph is
// `code + delta` (mod 2^16), i.e. idRangeOffset = 0.
struct Segment {
Expand All @@ -144,11 +209,6 @@ std::string font::serialize_cmap(const std::map<char32_t, std::uint16_t> &map) {
};
std::vector<Segment> segments;
for (const auto &[code, glyph] : map) {
if (code > 0xffff) {
throw std::runtime_error(
"sfnt: serialize_cmap supports only BMP code points (format 4); "
"beyond-BMP coverage (format 12) is a follow-up");
}
const auto c = static_cast<std::uint16_t>(code);
const auto delta = static_cast<std::uint16_t>(glyph - c);
if (!segments.empty() && c == segments.back().end + 1 &&
Expand Down Expand Up @@ -272,10 +332,10 @@ std::string font::serialize_os2(const std::uint16_t units_per_em,

void font::reencode_to_pua(sfnt::SfntFont &font,
const std::map<char32_t, std::uint16_t> &extra) {
if (font.glyph_count() > pua_capacity) {
throw std::runtime_error(
"sfnt_transform: glyph count exceeds BMP PUA capacity");
}
// A uint16 glyph id always fits: `pua_code_point` maps the BMP PUA first
// (6400 slots) then overflows into Supplementary PUA-A, whose combined
// `pua_capacity` (71934) exceeds any 16-bit glyph count.
static_assert(std::numeric_limits<std::uint16_t>::max() < pua_capacity);

std::map<char32_t, std::uint16_t> map;
for (std::uint16_t glyph = 0; glyph < font.glyph_count(); ++glyph) {
Expand Down
Loading
Loading