diff --git a/src/odr/html.hpp b/src/odr/html.hpp index 9dd92bd4..21df78a0 100644 --- a/src/odr/html.hpp +++ b/src/odr/html.hpp @@ -65,6 +65,21 @@ enum class HtmlTableGridlines { hard, }; +/// @brief PDF text rendering mode. +/// +/// Selects how text is emitted in PDF→HTML output. +/// +/// - `dual_layer`: A visual layer (paint order, embedded PUA glyphs) and a +/// separate transparent selection/search layer (reading order, real Unicode). +/// Similar to pdf.js. No JavaScript required. +/// - `single_layer`: A single combined layer where every glyph is mapped to +/// Unicode via frequency analysis. Similar to pdf2htmlEX. No JavaScript +/// required. +enum class PdfTextMode { + dual_layer, + single_layer, +}; + /// @brief HTML configuration. struct HtmlConfig { // document output file names @@ -106,6 +121,23 @@ struct HtmlConfig { std::string background_image_format{"png"}; double background_image_dpi{144.0}; + // PDF text mode + PdfTextMode pdf_text_mode{PdfTextMode::dual_layer}; + // `dual_layer`'s invisible selection-layer text is rendered in a local + // system font (tried in order; the first that resolves wins) rather than + // the embedded PDF font, so its natural width rarely matches the + // PDF-derived box width CSS `text-justify` is asked to fill (justify can + // only add spacing, never compress). + // `pdf_dual_layer_fallback_font_size_adjust` is applied as that @font-face's + // `size-adjust` (0-1, written out as a percent) to shrink the fallback font's + // metrics toward the PDF's, leaving less — ideally no — gap for justify to + // compress instead of stretch into. Safe to underestimate (justify then just + // spreads characters further; harmless on an invisible layer) but not to + // overestimate (the excess is clipped, not shrunk). + std::vector pdf_dual_layer_fallback_fonts{ + "Arial", "Helvetica", "Liberation Sans", "DejaVu Sans", "Nimbus Sans"}; + double pdf_dual_layer_fallback_font_size_adjust{0.5}; + // drm options bool no_drm{false}; diff --git a/src/odr/internal/font/cff_transform.cpp b/src/odr/internal/font/cff_transform.cpp index 62a4cd6e..dd03d41f 100644 --- a/src/odr/internal/font/cff_transform.cpp +++ b/src/odr/internal/font/cff_transform.cpp @@ -145,9 +145,9 @@ std::string cff::wrap_to_otf(const CffFont &font, const std::map &extra) { const std::uint16_t glyphs = font.glyph_count(); - // The uniform PUA re-encode: pua_code_point(glyph) -> glyph over - // every glyph. serialize_cmap throws if a code point is beyond the BMP, which - // also bounds the glyph count to the PUA capacity. + // The uniform PUA re-encode: pua_code_point(glyph) -> glyph over every glyph. + // Glyphs past the 6400-slot BMP PUA overflow into Supplementary PUA-A, and + // serialize_cmap emits a format-12 subtable to cover them. std::map pua; for (std::uint16_t glyph = 0; glyph < glyphs; ++glyph) { pua[pua_code_point(glyph)] = glyph; @@ -184,10 +184,14 @@ std::string cff::wrap_to_otf(const CffFont &font, tables.emplace_back("cmap", serialize_cmap(pua)); tables.emplace_back("name", serialize_name(font.name())); tables.emplace_back("post", serialize_post()); - tables.emplace_back("OS/2", - serialize_os2(font.units_per_em(), bbox.y_min, bbox.y_max, - static_cast(first), - static_cast(last))); + // OS/2 usFirst/usLastCharIndex are u16; a beyond-BMP PUA code point (large + // glyph counts overflow into Supplementary PUA-A) is clamped to 0xFFFF. + tables.emplace_back( + "OS/2", + serialize_os2( + font.units_per_em(), bbox.y_min, bbox.y_max, + static_cast(std::min(first, 0xffff)), + static_cast(std::min(last, 0xffff)))); std::ostringstream out; build_sfnt(out, 0x4f54544f /* 'OTTO' */, std::move(tables)); diff --git a/src/odr/internal/font/sfnt_transform.cpp b/src/odr/internal/font/sfnt_transform.cpp index f59dfa09..e9ef53a7 100644 --- a/src/odr/internal/font/sfnt_transform.cpp +++ b/src/odr/internal/font/sfnt_transform.cpp @@ -6,10 +6,10 @@ #include #include #include +#include #include #include #include -#include #include namespace odr::internal::font { @@ -18,8 +18,15 @@ namespace { namespace bs = util::byte_string; -constexpr char32_t pua_base = 0xe000; -constexpr std::uint16_t pua_capacity = 0xf8ff - 0xe000 + 1; // 6400 +// Glyphs are re-encoded to Private Use Area code points, filling the BMP PUA +// first and overflowing into Supplementary PUA-A (Plane 15). A uint16 glyph id +// (max 65535) offset past the 6400-slot BMP PUA tops out at U+FE6FF, well +// inside PUA-A's 65534 slots, so Supplementary PUA-B is never needed. +constexpr char32_t pua_base = 0xe000; // BMP PUA start +constexpr std::uint16_t pua_bmp_capacity = 0xf8ff - 0xe000 + 1; // 6400 +constexpr char32_t pua_supp_a_base = 0xf0000; // PUA-A (Plane 15) +constexpr std::uint32_t pua_capacity = + pua_bmp_capacity + (0xffffd - 0xf0000 + 1); // 6400 + 65534 = 71934 void pad4(std::string &s) { while (s.size() % 4 != 0) { @@ -67,8 +74,14 @@ SearchHints search_hints(const std::uint16_t count, const std::uint16_t unit) { namespace odr::internal { +namespace bs = util::byte_string; + char32_t font::pua_code_point(const std::uint16_t glyph) noexcept { - return pua_base + glyph; + if (glyph < pua_bmp_capacity) { + return pua_base + glyph; + } + // Overflow past the BMP PUA into Supplementary PUA-A (U+F0000..U+FFFFD). + return pua_supp_a_base + (glyph - pua_bmp_capacity); } void font::build_sfnt(std::ostream &out, const std::uint32_t sfnt_version, @@ -134,7 +147,59 @@ void font::build_sfnt(std::ostream &out, const std::uint32_t sfnt_version, } } +/// Format-12 `cmap` subtable (segmented coverage): sequential map groups over +/// the full Unicode range, each `[startCharCode, endCharCode]` mapping to +/// `startGlyphID + (code - startCharCode)`. Used when the map reaches beyond +/// the BMP (glyphs overflowing into Supplementary PUA-A), which format 4 cannot +/// express. Wrapped in a (Windows, Unicode full repertoire) encoding record. +static std::string +serialize_cmap_format12(const std::map &map) { + struct Group { + std::uint32_t start_code; + std::uint32_t end_code; + std::uint32_t start_glyph; + }; + std::vector groups; + for (const auto &[code, glyph] : map) { + if (!groups.empty() && code == groups.back().end_code + 1 && + glyph == groups.back().start_glyph + + (groups.back().end_code - groups.back().start_code) + 1) { + groups.back().end_code = code; // extend the current lockstep run + } else { + groups.push_back({code, code, glyph}); + } + } + + std::string sub; + bs::put_u16_be(sub, 12); // format + bs::put_u16_be(sub, 0); // reserved + bs::put_u32_be(sub, + static_cast(16 + 12 * groups.size())); // len + bs::put_u32_be(sub, 0); // language + bs::put_u32_be(sub, static_cast(groups.size())); + for (const auto &g : groups) { + bs::put_u32_be(sub, g.start_code); + bs::put_u32_be(sub, g.end_code); + bs::put_u32_be(sub, g.start_glyph); + } + + std::string cmap; + bs::put_u16_be(cmap, 0); // version + bs::put_u16_be(cmap, 1); // numTables + bs::put_u16_be(cmap, 3); // platformID (Windows) + bs::put_u16_be(cmap, 10); // encodingID (Unicode full repertoire) + bs::put_u32_be(cmap, 12); // offset to the subtable + cmap += sub; + return cmap; +} + std::string font::serialize_cmap(const std::map &map) { + // Format 4 tops out at the BMP; a map that overflows into the Supplementary + // PUA needs format 12's 32-bit code ranges instead. + if (!map.empty() && map.rbegin()->first > 0xffff) { + return serialize_cmap_format12(map); + } + // A format-4 segment: a contiguous code range [start, end] whose glyph is // `code + delta` (mod 2^16), i.e. idRangeOffset = 0. struct Segment { @@ -144,11 +209,6 @@ std::string font::serialize_cmap(const std::map &map) { }; std::vector segments; for (const auto &[code, glyph] : map) { - if (code > 0xffff) { - throw std::runtime_error( - "sfnt: serialize_cmap supports only BMP code points (format 4); " - "beyond-BMP coverage (format 12) is a follow-up"); - } const auto c = static_cast(code); const auto delta = static_cast(glyph - c); if (!segments.empty() && c == segments.back().end + 1 && @@ -272,10 +332,10 @@ std::string font::serialize_os2(const std::uint16_t units_per_em, void font::reencode_to_pua(sfnt::SfntFont &font, const std::map &extra) { - if (font.glyph_count() > pua_capacity) { - throw std::runtime_error( - "sfnt_transform: glyph count exceeds BMP PUA capacity"); - } + // A uint16 glyph id always fits: `pua_code_point` maps the BMP PUA first + // (6400 slots) then overflows into Supplementary PUA-A, whose combined + // `pua_capacity` (71934) exceeds any 16-bit glyph count. + static_assert(std::numeric_limits::max() < pua_capacity); std::map map; for (std::uint16_t glyph = 0; glyph < font.glyph_count(); ++glyph) { diff --git a/src/odr/internal/html/pdf_file.cpp b/src/odr/internal/html/pdf_file.cpp index 07a250d1..30f45201 100644 --- a/src/odr/internal/html/pdf_file.cpp +++ b/src/odr/internal/html/pdf_file.cpp @@ -24,11 +24,13 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -41,6 +43,9 @@ namespace { /// the extra digits add up across a page full of path data. double round2(const double v) { return std::round(v * 100.0) / 100.0; } +constexpr double pt_to_px = 96.0 / 72.0; +constexpr double pt_to_in = 1.0 / 72.0; + /// Serialize a transform as an SVG `matrix(...)`. Only the translation (e, f) /// is rounded — it lives in page-box units where 1/100 px is plenty; the linear /// part (a..d) keeps full precision so small scale/skew factors aren't @@ -53,16 +58,18 @@ std::string svg_matrix(const util::math::Transform2D &m) { return std::move(f).str(); } +/// Clamp a colour component in [0, 1] to an 8-bit channel value. +int to255(const double v) { + return static_cast(std::lround(std::clamp(v, 0.0, 1.0) * 255.0)); +} + /// Convert a PDF device color to a CSS `rgb(...)` string. Non-device color /// spaces (Separation/ICCBased/… — stage 4.4) and the unknown space fall back /// to black, the PDF initial color. std::string device_color_to_css(const pdf::GraphicsState::Color &color) { - const auto to255 = [](const double v) { - return static_cast(std::lround(std::clamp(v, 0.0, 1.0) * 255.0)); - }; - int r = 0; - int g = 0; - int b = 0; + std::int32_t r = 0; + std::int32_t g = 0; + std::int32_t b = 0; switch (color.space) { case pdf::ColorSpace::device_grey: r = g = b = to255(color.grey); @@ -94,9 +101,6 @@ std::string device_color_to_css(const pdf::GraphicsState::Color &color) { /// Convert an sRGB triple in [0, 1] (a shading colour stop) to a CSS /// `rgb(...)`. std::string rgb_to_css(const std::array &rgb) { - const auto to255 = [](const double v) { - return static_cast(std::lround(std::clamp(v, 0.0, 1.0) * 255.0)); - }; std::ostringstream s; s << "rgb(" << to255(rgb[0]) << ',' << to255(rgb[1]) << ',' << to255(rgb[2]) << ')'; @@ -188,9 +192,8 @@ std::string svg_path_fragment(const pdf::PathElement &path, // miter join: SVG defaults the limit to 4, PDF to 10 — state it. f << " stroke-miterlimit=\"" << round2(path.miter_limit) << '"'; } - const bool dashed = - std::any_of(path.dash_array.begin(), path.dash_array.end(), - [](const double v) { return v > 0; }); + const bool dashed = std::ranges::any_of( + path.dash_array, [](const double v) { return v > 0; }); if (dashed) { f << " stroke-dasharray=\""; for (std::size_t i = 0; i < path.dash_array.size(); ++i) { @@ -223,14 +226,11 @@ std::string svg_image_fragment(const pdf::ImageElement &image, if (image.data.empty()) { return {}; } - // image natural box [0,1] (y-down) -> PDF unit square (y-up) -> user -> box. constexpr util::math::Transform2D flip = util::math::Transform2D::scaling_translation(1, -1, 0, 1); const util::math::Transform2D m = flip * image.transform * to_box; std::ostringstream f; - // The clip wraps the image in a transform-free `` rather than sitting on - // the ``: see the function comment. if (!clip_id.empty()) { f << ""; } @@ -430,7 +430,6 @@ class PatternRegistry : public DefsRegistry { // Tile content is laid out in pattern space (identity page transform); the // y-flip and placement live in `patternTransform`. - const util::math::Transform2D identity; std::ostringstream tile; for (const pdf::PageElement &element : pdf::extract_page(pattern.content, *pattern.resources, logger)) { @@ -440,9 +439,9 @@ class PatternRegistry : public DefsRegistry { painted.fill_color = fill_color; painted.stroke_color = fill_color; } - tile << svg_path_fragment(painted, identity, "", ""); + tile << svg_path_fragment(painted, util::math::Transform2D(), "", ""); } else if (const auto *image = std::get_if(&element)) { - tile << svg_image_fragment(*image, identity, ""); + tile << svg_image_fragment(*image, util::math::Transform2D(), ""); } } @@ -472,13 +471,12 @@ class PatternRegistry : public DefsRegistry { }; /// Deduplicates CSS declarations into atomic, single-property classes. PDF text -/// emits one absolutely-positioned span per glyph run, and the same font sizes, -/// offsets and spacings recur across the (potentially millions of) spans. -/// Writing each declaration inline bloats the document — the Bluetooth Core -/// spec reference output crossed GitHub's 100 MB file limit. Instead, every -/// distinct declaration is registered once here, named `` in +/// emits one absolutely-positioned line block per detected line, and the same +/// font sizes, offsets and spacings recur across the (potentially millions of) +/// elements. Writing each declaration inline bloats the document. Instead, +/// every distinct declaration is registered once here, named `` in /// first-seen order (e.g. `f1`, `f2` for font sizes, `t1` for a top offset), -/// emitted once in , and referenced by class on each span. This is +/// emitted once in , and referenced by class on each element. This is /// representation-only: the computed style of every element is unchanged. class AtomicStyles { public: @@ -506,7 +504,8 @@ class AtomicStyles { } private: - // Node-based map: pointers stored in `m_order` stay valid across insertions. + /// Node-based map: pointers stored in `m_order` stay valid across + /// insertions. std::unordered_map m_class_by_declaration; std::unordered_map m_count_by_prefix; std::vector *> m_order; @@ -530,7 +529,6 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return true; } - return false; } @@ -538,7 +536,6 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return "text/html"; } - throw FileNotFound("Unknown path: " + path); } @@ -548,7 +545,6 @@ class HtmlServiceImpl final : public HtmlService { write_document(writer); return; } - throw FileNotFound("Unknown path: " + path); } @@ -557,578 +553,1146 @@ class HtmlServiceImpl final : public HtmlService { if (path == "document.html") { return write_document(out); } - throw FileNotFound("Unknown path: " + path); } - // One emitted span. The styling is fully resolved into class tokens during - // the first pass; only the (already escaped) text and class list survive to - // the writing pass. A text run with an embedded font emits the dual layer as - // a transparent selectable span carrying the real Unicode with the visible - // glyph layer (PUA code points in the `@font-face` font) nested inside it: - // the child is absolutely positioned at the run origin and inherits the - // font size, spacing, and transform from the parent, so the placement - // classes live only on the parent. `glyph_classes` is empty when there is no - // nested layer (the legacy fallback path and display-only runs). - struct SpanOut { + HtmlResources write_document(HtmlWriter &out) const { + if (config().pdf_text_mode == PdfTextMode::single_layer) { + return write_document_single_layer(out); + } + return write_document_dual_layer(out); + } + + // ========================================================================= + // DUAL-LAYER MODE + // ========================================================================= + // + // Two separate layers per page: + // + // Visual layer (`