`) in paint order; individual runs flow
+ // inline within the block, each nudged by a `margin-left` (the gen-time gap
+ // from the previous run's right edge, matching the embedded font's advance
+ // exactly when the font's `hmtx` matches the PDF `/Widths`).
+ //
+ // Unicode mapping uses frequency analysis: a pre-pass over all pages counts
+ // (uchar, glyph) co-occurrences per font and picks the most frequent glyph per
+ // uchar (ties: lower glyph id); the post-pass bakes the winners into the cmap,
+ // so the common case wins rather than whichever run happened to come first.
+ //
+ // Clean runs (all (uchar, glyph) pairs match the frequency winner) render
+ // the real Unicode directly in the embedded font — natively findable and
+ // selectable. Unclean visible runs paint their glyphs via
+ // `::before{content:attr(data-g)}` CSS generated content (kept out of the
+ // DOM text stream so they never break find mid-word), with a zero-width
+ // `display:inline-block; overflow:hidden` overlay carrying the real Unicode
+ // alongside. No-unicode runs show only the glyph. Invisible (Tr 3/7) and
+ // fallback (no embedded font) runs render the real Unicode as ordinary text.
+
+ struct SingleRunOut { // One text run inside a single-layer line block.
+ std::string margin; ///< "" or an interned `margin-left` class (only set on runs that continue a line)
+ std::string color; ///< "" or a colour class name (no leading space)
+ std::string text; ///< real Unicode (HTML-escaped), may be empty
+ std::string glyph_data; ///< PUA glyph string (non-empty → unclean); written as the `data-g` attribute
+ };
+ struct SingleLineOut { // One `.t` line block together with the runs that flow inside it.
+ std::string classes; ///< "t" + placement + font-size [+ letter/word spacing] [+ " i"]; does NOT include `font_class`
+ std::string font_class; ///< "" or the per-font family+colour class; appended to `classes` when the block is written
+ std::vector
runs; ///< runs in the order they were pushed. NOTE(review): the element type appears lost in extraction — presumably SingleRunOut; confirm
+ };
+ struct SinglePathOut { // One graphic item of a page.
+ std::string svg; ///< SVG fragment received from the `handle_graphic_element` sink; written raw
+ };
+ using SingleItem = std::variant; // NOTE(review): the alternatives appear lost in extraction — presumably SinglePathOut and SingleLineOut (both are pushed into `items`); confirm their order
+
+ struct SinglePageOut { // Everything the HTML-writing pass needs for one page.
+ std::string classes; ///< page class string produced by `begin_page`
+ double width{0}; ///< page box width produced by `begin_page`
+ double height{0}; ///< page box height produced by `begin_page`
+ std::vector items; ///< line blocks and paths in emission order. NOTE(review): the element type appears lost in extraction — presumably SingleItem; confirm
+ std::string clip_defs; ///< concatenated clip, gradient and pattern defs for this page
+ };
+
+ HtmlResources write_document_single_layer(HtmlWriter &out) const {
+ HtmlResources resources;
+
+ const auto &pdf_file =
+ dynamic_cast(*m_pdf_file.impl());
+ pdf::DocumentParser parser = pdf_file.create_parser(*m_logger);
+ const std::unique_ptr document = parser.parse_document();
+ const std::vector pages = document->collect_pages();
+
+ // ---- Font registration ------------------------------------------------
+ // A real-Unicode scalar gets a cmap entry only inside the BMP and outside
+ // the PUA (`U+E000..U+F8FF`), so glyph-deterministic PUA code points are
+ // never shadowed.
+ const auto collapsible_unicode = [](const char32_t c) {
+ return c <= 0xFFFF && !(c >= 0xE000 && c <= 0xF8FF);
+ };
+
+ std::uint32_t family_count = 0;
+ std::string font_faces;
+ std::string font_styles; // ".fvN{...}" / ".fnN{...}"
+ std::vector accepted_fonts;
+ // Per-font, per-uchar, per-glyph occurrence count (pre-pass).
+ // Indexed by font_index - 1.
+ std::vector>>
+ glyph_freq;
+ // Per-font winning uchar→glyph mapping (derived from glyph_freq).
+ std::vector> used_unicode;
+ // Which per-font classes are used: [0]=fv (visible), [1]=fn (invisible).
+ std::vector> font_class_used;
+ std::unordered_map family_index;
+
+ const auto font_family = [&](pdf::Font *font) {
+ return intern_font(family_index, family_count, font, [&](std::uint32_t) {
+ accepted_fonts.push_back(font);
+ glyph_freq.emplace_back();
+ used_unicode.emplace_back();
+ font_class_used.push_back({false, false});
+ });
+ };
+
+ AtomicStyles styles;
+ const auto add_class = [&styles](std::string &classes,
+ const std::string &prefix,
+ std::string declaration) {
+ classes += ' ';
+ classes += styles.intern(prefix, std::move(declaration));
+ };
+
+ // Build the page streams once (reused for both the pre-pass and main pass).
+ std::vector page_streams;
+ page_streams.reserve(pages.size());
+ for (pdf::Page *page : pages) {
+ std::string stream;
+ for (const auto &ref : page->contents_reference) {
+ stream += parser.read_decoded_stream(ref);
+ stream += '\n';
+ }
+ page_streams.push_back(std::move(stream));
+ }
+
+ // ---- Pre-pass: frequency analysis ------------------------------------
+ // Count (uchar, glyph) co-occurrences per font over all pages. The main
+ // pass then uses the frequency winner for each uchar instead of first-come-
+ // first-serve, so the most common glyph shape wins its cmap entry.
+ // This re-runs `extract_page` on every page (the main pass parses each a
+ // second time) — a deliberate tradeoff: re-parsing the already-decoded
+ // stream is cheap next to buffering every page's element list in memory.
+ for (std::size_t pi = 0; pi < pages.size(); ++pi) {
+ pdf::Page *page = pages[pi];
+ for (const pdf::PageElement &element :
+ pdf::extract_page(page_streams[pi], *page->resources, *m_logger)) {
+ const auto *text = std::get_if(&element);
+ if (text == nullptr || text->text.empty() || text->font == nullptr) {
+ continue;
+ }
+ const std::uint32_t font = font_family(text->font);
+ if (font == 0) {
+ continue;
+ }
+ // Only collapsible-candidate runs contribute to frequency counts.
+ if (util::string::utf8_length(text->text) != text->advances.size()) {
continue;
}
+ auto cp = text->text.begin();
+ for (const std::uint32_t code : text->font->codes(text->codes)) {
+ const char32_t uchar = utf8::unchecked::next(cp);
+ if (!collapsible_unicode(uchar)) {
+ continue;
+ }
+ const std::uint16_t glyph = text->font->glyph_for_code(code);
+ ++glyph_freq[font - 1][uchar][glyph];
+ }
+ }
+ }
- // An image XObject: an `` placed by the CTM, in the page ``
- // alongside the paths (so it layers by paint order).
- if (const auto *image = std::get_if(&element)) {
- const std::string clip_id = clips.register_clip(image->clip, to_box);
- std::string fragment = svg_image_fragment(*image, to_box, clip_id);
- if (!fragment.empty()) {
- page_out.items.push_back(PathOut{std::move(fragment)});
+ // Compute the frequency winner for each (font, uchar): the glyph with the
+ // highest count becomes the cmap entry. Ties broken by lower glyph id.
+ for (std::uint32_t fi = 0; fi < family_count; ++fi) {
+ for (const auto &[uchar, counts] : glyph_freq[fi]) {
+ std::uint16_t best_glyph = 0;
+ std::uint32_t best_count = 0;
+ for (const auto &[glyph, count] : counts) {
+ if (count > best_count ||
+ (count == best_count && glyph < best_glyph)) {
+ best_glyph = glyph;
+ best_count = count;
}
+ }
+ used_unicode[fi][uchar] = best_glyph;
+ }
+ }
+
+ // ---- Main pass (pass 1): build page structures -----------------------
+ std::vector pages_out;
+ pages_out.reserve(pages.size());
+
+ for (std::size_t pi = 0; pi < pages.size(); ++pi) {
+ pdf::Page *page = pages[pi];
+ const PageBox pb = begin_page(*page, add_class);
+ const double width = pb.width;
+ const double height = pb.height;
+ const util::math::Transform2D &to_box = pb.to_box;
+
+ SinglePageOut &page_out = pages_out.emplace_back();
+ page_out.classes = pb.classes;
+ page_out.width = width;
+ page_out.height = height;
+
+ ClipRegistry clips(static_cast(pages_out.size()));
+ GradientRegistry gradients(static_cast(pages_out.size()));
+ PatternRegistry patterns(static_cast(pages_out.size()));
+
+ std::int32_t cur_line = -1;
+ std::string cur_flow_key;
+ bool prev_was_matrix = false;
+ double prev_end = 0;
+ double prev_baseline = 0;
+ double prev_font_pt = 0;
+ const auto close_line = [&] { cur_line = -1; };
+
+ for (const pdf::PageElement &element :
+ pdf::extract_page(page_streams[pi], *page->resources, *m_logger)) {
+ if (handle_graphic_element(
+ element, to_box, width, height, clips, gradients, patterns,
+ *m_logger, [&] { close_line(); },
+ [&](std::string frag) {
+ page_out.items.push_back(SinglePathOut{std::move(frag)});
+ })) {
continue;
}
const pdf::TextElement &text = std::get(element);
- // TODO(clip text): the active clip is not applied to text. Paths carry
- // a clip snapshot realized as an SVG ``, but text is emitted
- // as HTML spans that the clipPath cannot reach, so clipped text paints
- // outside its region. See STAGE4_PLAN.md "4.3 — Clipping" follow-up.
- // The font index is non-zero when an embedded font lets us render
- // the actual glyphs; 0 falls through to the legacy path.
+ // TODO(clip text): clip not applied to text; see STAGE4_PLAN.md.
const std::uint32_t font =
text.font != nullptr ? font_family(text.font) : 0;
- // Without an embedded font, an empty `text` has nothing to show: a code
- // with no recoverable Unicode (`no_unicode`) or an `/ActualText`-
- // suppressed segment. With one, the glyphs still render (PUA layer).
if (text.text.empty() && font == 0) {
continue;
}
- const util::math::Transform2D m = flip_glyph * text.transform * to_box;
-
- // Tr 3 (invisible) and Tr 7 (clip-only) paint nothing; keep them
- // selectable via the transparent `.i` class.
- const bool invisible =
- text.rendering_mode == pdf::TextRenderingMode::invisible ||
- text.rendering_mode == pdf::TextRenderingMode::clip;
-
- // The run's visible paint colour, folded onto the visible span as an
- // interned colour class — but only when it is not the default black, so
- // the overwhelmingly common black run adds nothing. The per-font
- // `.fvN`/`.gvN` classes declare `color:#000`; this class is emitted
- // after them in (equal specificity), so it overrides. Invisible
- // runs (Tr 3/7) stay transparent via `.i`, so they take no colour
- // class. The fill modes paint with the non-stroking colour, the
- // stroke-only modes (Tr 1/5) with the stroking colour.
- std::string color_suffix;
- if (!invisible) {
- const pdf::GraphicsState::Color &paint =
- (text.rendering_mode == pdf::TextRenderingMode::stroke ||
- text.rendering_mode == pdf::TextRenderingMode::stroke_clip)
- ? text.stroke_color
- : text.fill_color;
- if (std::string css = device_color_to_css(paint);
- css != "rgb(0,0,0)") {
- color_suffix = ' ' + styles.intern("k", "color:" + std::move(css));
+ const auto [m, invisible, is_matrix, asc, scale, ox, baseline, extent,
+ font_pt, font_size_px] = run_geometry(text, to_box);
+ const double cs_px = round2(text.char_spacing * scale * pt_to_px);
+ const double ws_px = round2(text.word_spacing * scale * pt_to_px);
+ const std::string color_suffix = color_class(text, invisible, styles);
+
+ // ---- Run payload ------------------------------------------------
+ // Decide whether this is a clean collapse (real unicode renders
+ // directly via the frequency-winner cmap entries) or unclean (glyph
+ // painted via generated content, unicode as overlay).
+ SingleRunOut run;
+ // `color_suffix` carries a leading space for the dual-layer paths that
+ // concatenate it onto a class string; the single-layer run stores just
+ // the class name.
+ run.color =
+ color_suffix.empty() ? std::string() : color_suffix.substr(1);
+
+ if (font == 0 || invisible) {
+ // Fallback / invisible: render real unicode directly.
+ run.text = escape_markup(text.text);
+ } else {
+ // Check collapse: 1:1 text↔codes and every (uchar, glyph) matches
+ // the frequency winner.
+ // `font != 0` here already implies `text.font != nullptr`.
+ bool collapse =
+ !text.text.empty() &&
+ util::string::utf8_length(text.text) == text.advances.size();
+ if (collapse) {
+ const std::map &won =
+ used_unicode[font - 1];
+ auto cp = text.text.begin();
+ for (const std::uint32_t code : text.font->codes(text.codes)) {
+ const char32_t uchar = utf8::unchecked::next(cp);
+ const std::uint16_t glyph = text.font->glyph_for_code(code);
+ const auto it = won.find(uchar);
+ if (!collapsible_unicode(uchar) || it == won.end() ||
+ it->second != glyph) {
+ collapse = false;
+ break;
+ }
+ }
+ }
+ if (collapse) {
+ run.text = escape_markup(text.text);
+ } else {
+ run.glyph_data = glyph_run_str(*text.font, text.codes);
+ run.text = escape_markup(text.text); // overlay (empty=no_unicode)
}
}
-
- // Placement and spacing are shared by both layers of a run; build them
- // once on `base`.
- std::string base = "t";
-
- // Place by the baseline: PDF's text origin (`m.e`, `m.f`) is the glyph
- // baseline, but a CSS span anchors its box top, which sits one ascent
- // above the baseline. Shift the origin up by the ascent along the run's
- // local y axis so the baseline lands on the PDF origin.
- const double asc = ascent_em(text);
-
- // Tc/Tw are absolute text-space lengths (not scaled by the font size).
- // One text-space unit is `scale * pt_to_px` CSS px, where `scale` is
- // the linear factor we apply to the glyphs: folded into `font-size` in
- // the uniform branch, carried by the CSS matrix in the general branch
- // (so spacing there is expressed pre-transform, scale == 1).
- double scale;
- if (m.b == 0 && m.c == 0 && m.a == m.d) {
- // Upright uniform scale: fold the scale into the font size and place
- // the origin with left/top, so the (otherwise near-universal) matrix
- // is dropped. The ascent shift is purely vertical here (local y maps
- // to box y, scaled by `m.a`).
- add_class(base, "l", px_decl("left", round2(m.e * pt_to_px)));
- add_class(
- base, "t",
- px_decl("top", round2((m.f - asc * m.a * text.size) * pt_to_px)));
- add_class(base, "f",
- px_decl("font-size", round2(m.a * text.size * pt_to_px)));
- scale = m.a;
- } else {
- // The ascent shift is `asc` em down the local y axis, whose direction
- // in the box is the matrix's (c, d) column; subtract it from the
- // translation so the baseline, not the box top, lands on the origin.
- const double ascent_px = asc * text.size * pt_to_px;
- std::ostringstream tm;
- tm << "transform:matrix(" << m.a << "," << m.b << "," << m.c << ","
- << m.d << "," << round2(m.e * pt_to_px - m.c * ascent_px) << ","
- << round2(m.f * pt_to_px - m.d * ascent_px) << ")";
- add_class(base, "m", std::move(tm).str());
- add_class(base, "f",
- px_decl("font-size", round2(text.size * pt_to_px)));
- scale = 1;
+ if (run.text.empty() && run.glyph_data.empty()) {
+ continue; // invisible no_unicode
}
- // PDF char/word spacing (Tc/Tw) translate directly to CSS. TJ kerning
- // needs no expression here: `extract_text` emits a separate segment per
- // TJ string and folds the adjustment into the following segment's
- // `transform`, so a segment only carries its constant spacing. Emitted
- // only when non-zero to keep the (overwhelmingly common) unspaced span
- // small.
- //
- // CSS letter-/word-spacing key on the *rendered* string's character and
- // space boundaries, but PDF Tc/Tw advance the text matrix per raw code
- // (Tw only on a simple font's single-byte 0x20; ISO 32000-1 9.3.3). The
- // two coincide only when the rendered run is 1:1 with the codes. The
- // glyph layer always is (one PUA code point per code, `font != 0`); the
- // Unicode text layer is not when a /ToUnicode CMap expands a code into
- // several characters (ligatures), /ActualText substitutes text, or a
- // space was inferred — there CSS would insert gaps the segment advances
- // never accounted for, splitting glyphs and drifting the next
- // absolutely-positioned segment. Gate emission on that correspondence;
- // word spacing additionally never applies to a composite font.
- const bool spacing_one_to_one =
- font != 0 ||
- (text.font != nullptr &&
- util::string::utf8_length(text.text) == text.advances.size());
- if (text.char_spacing != 0 && spacing_one_to_one) {
- add_class(base, "s",
- px_decl("letter-spacing",
- round2(text.char_spacing * scale * pt_to_px)));
- }
- if (text.word_spacing != 0 && spacing_one_to_one &&
- !(text.font != nullptr && text.font->composite)) {
- add_class(base, "w",
- px_decl("word-spacing",
- round2(text.word_spacing * scale * pt_to_px)));
+ // ---- Flow grouping -----------------------------------------------
+ std::ostringstream fk;
+ fk << font << '|' << invisible << '|' << font_size_px << '|' << cs_px
+ << '|' << ws_px;
+ const std::string flow_key = std::move(fk).str();
+ bool new_line = is_matrix || prev_was_matrix || cur_line < 0 ||
+ flow_key != cur_flow_key;
+ double margin_px = 0;
+ if (!new_line && prev_font_pt > 0) {
+ if (starts_new_line(baseline, prev_baseline, ox, prev_end,
+ prev_font_pt)) {
+ new_line = true;
+ } else {
+ margin_px = round2((ox - prev_end) * pt_to_px);
+ }
}
- // A run collapses to a single span — selectable *and* visible, the real
- // Unicode rendered directly in the embedded font — when it has an
- // embedded font, carries text, is 1:1 with its codes (no /ToUnicode
- // expansion, /ActualText, or inferred space), and every glyph wins a
- // real-Unicode `cmap` entry. The winner of a scalar is the first
- // collapse-candidate run (in document order) to use it; processing
- // order *is* document order, so an earlier run's claim is already
- // visible and no later run can unseat it — the decision is final here.
- const bool collapse_candidate =
- font != 0 && !text.text.empty() && text.font != nullptr &&
- util::string::utf8_length(text.text) == text.advances.size();
-
- if (collapse_candidate) {
- // Stake first-wins real-Unicode -> glyph claims and decide collapse
- // in one walk: the run collapses iff each code's glyph wins (or
- // matches) its scalar. Claims are staked for every collapsible scalar
- // even when the run ends up dual, so later runs see them. The
- // post-pass only bakes the won scalars into the shared font's `cmap`.
- std::map &won = used_unicode[font - 1];
- bool collapse = true;
- auto cp = text.text.begin();
- for (const std::uint32_t code : text.font->codes(text.codes)) {
- const char32_t uchar = utf8::unchecked::next(cp);
- const std::uint16_t glyph = text.font->glyph_for_code(code);
- if (!collapsible_unicode(uchar)) {
- collapse = false;
- continue;
- }
- const auto [it, inserted] = won.emplace(uchar, glyph);
- if (!inserted && it->second != glyph) {
- collapse = false;
- }
+ if (new_line) {
+ std::string base = "t";
+ add_position_classes(base, add_class, m, is_matrix, ox, baseline,
+ asc * text.size);
+ add_class(base, "f", px_decl("font-size", font_size_px));
+ const bool spacing_one_to_one =
+ font != 0 ||
+ (text.font != nullptr &&
+ util::string::utf8_length(text.text) == text.advances.size());
+ if (text.char_spacing != 0 && spacing_one_to_one) {
+ add_class(base, "s", px_decl("letter-spacing", cs_px));
}
- if (collapse) {
- // One span: the real Unicode rendered in the embedded font, named
- // by the combined per-font class (black visible / transparent
- // invisible), selectable either way.
- std::string classes = std::move(base);
- classes += ' ';
- classes += font_class(font, invisible, /*nested=*/false);
- classes += color_suffix;
- page_out.items.push_back(
- SpanOut{std::move(classes), escape_text(text.text), {}, {}});
- } else {
- // Dual layer (a glyph lost its scalar to an earlier one): a
- // transparent selectable Unicode span with the PUA glyph layer
- // nested inside, the latter folded into the combined `.gvN` /
- // `.giN` class. The colour rides the visible (nested) layer.
- page_out.items.push_back(SpanOut{
- base + " i", escape_text(text.text),
- font_class(font, invisible, /*nested=*/true) + color_suffix,
- escape_text(glyph_run(*text.font, text.codes))});
+ if (text.word_spacing != 0 && spacing_one_to_one &&
+ !(text.font != nullptr && text.font->composite)) {
+ // `ws` = word-spacing everywhere (`w` is width in the dual layer).
+ add_class(base, "ws", px_decl("word-spacing", ws_px));
}
- } else if (font != 0) {
- // The visible glyph layer: PUA code points in the embedded font,
- // named by the combined per-font class (paint colour + font family).
- std::string glyph_text =
- escape_text(glyph_run(*text.font, text.codes));
-
- if (!text.text.empty()) {
- // Dual layer: a transparent selectable span carrying the real
- // Unicode (for copy/search) with the glyph layer nested inside.
- // The nested child overlays the run origin and inherits the
- // placement via the combined `.gvN` / `.giN` class.
- page_out.items.push_back(SpanOut{
- base + " i", escape_text(text.text),
- font_class(font, invisible, /*nested=*/true) + color_suffix,
- std::move(glyph_text)});
- } else {
- // Display-only run: nothing is extractable (the `no_unicode` case),
- // so the glyph layer stands alone and carries the placement itself
- // (`base`), `.g` (unselectable) and the combined paint+font class.
- std::string glyph_classes = base + " g ";
- glyph_classes += font_class(font, invisible, /*nested=*/false);
- glyph_classes += color_suffix;
- page_out.items.push_back(SpanOut{
- std::move(glyph_classes), std::move(glyph_text), {}, {}});
+ if (font == 0 && invisible) {
+ base += " i";
+ }
+
+ SingleLineOut line;
+ line.classes = std::move(base);
+ if (font != 0) {
+ line.font_class = font_class(font_class_used, font, invisible);
}
+ line.runs.push_back(std::move(run));
+ page_out.items.push_back(std::move(line));
+ cur_line = static_cast(page_out.items.size()) - 1;
+ cur_flow_key = flow_key;
} else {
- // Legacy single-layer path: no embedded font, render the Unicode in a
- // fallback font.
- std::string classes = base;
- if (invisible) {
- classes += " i";
+ if (margin_px != 0) {
+ run.margin = styles.intern("ml", px_decl("margin-left", margin_px));
}
- classes += color_suffix;
- page_out.items.push_back(
- SpanOut{std::move(classes), escape_text(text.text), {}, {}});
+ std::get(page_out.items[cur_line])
+ .runs.push_back(std::move(run));
}
+
+ prev_end = ox + extent;
+ prev_baseline = baseline;
+ prev_font_pt = font_pt;
+ prev_was_matrix = is_matrix;
}
- // Clip-path, gradient and pattern defs share the page's hidden
- // ``.
page_out.clip_defs = clips.defs() + gradients.defs() + patterns.defs();
}
- // Post-pass: every page has been scanned, so the per-font used-scalar sets
- // are complete.
- //
- // Re-encode each accepted font with its real-Unicode entries baked into the
- // `cmap` (the PUA range is kept as a fallback) and emit the `@font-face`
- // rules in index order, so the output stays deterministic.
+ // ---- Post-pass: re-encode fonts with frequency-winner cmap entries ---
for (std::uint32_t i = 0; i < family_count; ++i) {
- pdf::Font *font = accepted_fonts[i];
- const std::map &extra = used_unicode[i];
- std::string reencoded;
- if (auto sfnt = std::dynamic_pointer_cast(
- font->embedded_font)) {
- font::reencode_to_pua(*sfnt, extra);
- std::ostringstream sfnt_out;
- sfnt->write(sfnt_out);
- reencoded = std::move(sfnt_out).str();
- } else if (auto cff = std::dynamic_pointer_cast(
- font->embedded_font)) {
- reencoded = font::cff::wrap_to_otf(*cff, extra);
- }
- const std::string url = file_to_url(reencoded, "font/ttf");
- font_faces += "@font-face{font-family:'odr-f" + std::to_string(i + 1) +
- "';src:url(" + url + ");}";
-
- // The combined per-font classes for this font, only those used. `.fvN` /
- // `.fnN` carry just the paint colour and font family (placement stays on
- // the span's own classes); `.gvN` / `.giN` additionally fold in the
- // nested glyph layer's `.t` placement and `.g` unselectability.
- const std::string n = std::to_string(i + 1);
- const std::string family = "font-family:'odr-f" + n + "'";
- constexpr const char *placement =
- "position:absolute;left:0;top:0;transform-origin:0 0;"
- "white-space:pre;line-height:1;user-select:none;";
- const auto rule = [&](const char *cls, const char *head,
- const char *color) {
- glyph_styles += '.';
- glyph_styles += cls;
- glyph_styles += n;
- glyph_styles += '{';
- glyph_styles += head;
- glyph_styles += color;
- glyph_styles += family;
- glyph_styles += '}';
+ write_font_face(*accepted_fonts[i], i, used_unicode[i],
+ font_class_used[i], font_faces, font_styles);
+ }
+
+ // ---- Pass 2: write HTML ---------------------------------------------
+ write_header_common(out, font_faces, font_styles, styles, [&] {
+ // Invisible text render modes (Tr 3/7).
+ out.out() << ".i{color:transparent}";
+ // Unclean glyphs painted via CSS generated content (`data-g` attr), kept
+ // out of the DOM text stream so they never break find/double-click.
+ out.out() << ".gl::before{content:attr(data-g)}";
+ // Zero-width inline-block overlay carrying the real Unicode of an unclean
+ // run: invisible, zero-width, but still found/selected in reading order.
+ // `inline-block` lets `width:0` apply (a regular inline box ignores it);
+ // `overflow:hidden` clips the invisible text to zero width.
+ out.out() << ".ov{display:inline-block;width:0;overflow:hidden;"
+ "color:transparent;vertical-align:baseline}";
+ });
+
+ // Helper: derive a run's leading-span class from `head` prefix plus its
+ // optional margin-left and colour override.
+ const auto run_class = [](const SingleRunOut &run, const char *head) {
+ std::string cls = head;
+ const auto add = [&](const std::string &t) {
+ if (t.empty()) {
+ return;
+ }
+ if (!cls.empty()) {
+ cls += ' ';
+ }
+ cls += t;
};
- if (font_class_used[i][0]) {
- rule("fv", "", "color:#000;");
+ add(run.margin);
+ add(run.color);
+ return cls;
+ };
+
+ const auto write_line = [&](const SingleLineOut &line) {
+ std::string classes = line.classes;
+ if (!line.font_class.empty()) {
+ classes += ' ';
+ classes += line.font_class;
}
- if (font_class_used[i][1]) {
- rule("fn", "", "color:transparent;");
+ out.write_element_begin(
+ "div", HtmlElementOptions().set_inline(true).set_class(classes));
+ for (const SingleRunOut &run : line.runs) {
+ if (run.glyph_data.empty()) {
+ // Clean / invisible / fallback: real Unicode renders directly.
+ const std::string cls = run_class(run, "");
+ if (cls.empty()) {
+ out.write_raw(run.text);
+ } else {
+ out.write_element_begin(
+ "span", HtmlElementOptions().set_inline(true).set_class(cls));
+ out.write_raw(run.text);
+ out.write_element_end("span");
+ }
+ } else {
+ // Unclean: glyph via generated content, real-unicode overlay.
+ out.write_element_begin(
+ "span", HtmlElementOptions()
+ .set_inline(true)
+ .set_class(run_class(run, "gl"))
+ .set_attributes(HtmlAttributesVector{
+ {std::string("data-g"), run.glyph_data}}));
+ out.write_element_end("span");
+ if (!run.text.empty()) {
+ out.write_element_begin(
+ "span", HtmlElementOptions().set_inline(true).set_class("ov"));
+ out.write_raw(run.text);
+ out.write_element_end("span");
+ }
+ }
}
- if (font_class_used[i][2]) {
- rule("gv", placement, "color:#000;");
+ out.write_element_end("div");
+ };
+
+ out.write_body_begin();
+ for (const SinglePageOut &page : pages_out) {
+ out.write_element_begin("div",
+ HtmlElementOptions().set_class(page.classes));
+ write_page_items(out, page.clip_defs, page.items, page.width, page.height,
+ write_line);
+ out.write_element_end("div");
+ }
+ out.write_body_end();
+ out.write_end();
+
+ return resources;
+ }
+
+ static std::string px_decl(const char *property, double value) { // Formats the CSS declaration `property:VALUEpx`.
+ std::ostringstream s;
+ s << property << ':' << value << "px"; // default stream formatting of `value`; the visible callers pass pre-rounded values
+ return std::move(s).str();
+ }
+
+ /// Returns true when a run at (`ox`, `baseline`) starts a new visual line
+ /// instead of continuing the previous run. Two triggers: the baseline moved
+ /// by more than 0.6 × `prev_font_pt` (in either direction), or the origin lies
+ /// more than 0.5 × `prev_font_pt` left of `prev_end`, the previous run's right
+ /// edge — a carriage return. Shared by all three layers (visual, selection,
+ /// single) so the heuristic, always measured against the *previous* run's
+ /// `prev_font_pt`, cannot drift between them. Callers gate on `prev_font_pt > 0`.
+ static bool starts_new_line(const double baseline, const double prev_baseline,
+ const double ox, const double prev_end,
+ const double prev_font_pt) {
+ return std::abs(baseline - prev_baseline) > 0.6 * prev_font_pt ||
+ ox < prev_end - 0.5 * prev_font_pt;
+ }
+
+ /// The per-run geometry derived from a `TextElement` and the page's `to_box`
+ /// transform. Identical in every text mode, so it lives in one place and no
+ /// call site can compute it differently. Member order is load-bearing: the
+ /// single-layer writer unpacks it positionally with a structured binding.
+ struct RunGeometry {
+ util::math::Transform2D m; ///< glyph space -> page box (y-down, px later)
+ bool invisible; ///< Tr 3/7 — paints nothing, selectable only
+ bool is_matrix; ///< anything but an upright, positive uniform scale -> CSS matrix path
+ double asc; ///< ascent in em, as returned by `ascent_em`
+ double scale; ///< `m.a` on the axis-aligned path, 1 on the matrix path
+ double ox; ///< origin x (baseline left) in pt, i.e. `m.e`
+ double baseline; ///< origin y (baseline) in pt, i.e. `m.f`
+ double extent; ///< advance width in pt (`text.width` × axis scale)
+ double font_pt; ///< font size along the advance axis in pt (`text.size` × axis scale)
+ double font_size_px; ///< CSS font-size in px, rounded with `round2`
+ };
+
+ static RunGeometry run_geometry(const pdf::TextElement &text, // Derives one run's `RunGeometry` from its text state.
+ const util::math::Transform2D &to_box) {
+ constexpr util::math::Transform2D flip_glyph =
+ util::math::Transform2D::scaling(1, -1); // mirror y before applying the text and page transforms
+ const util::math::Transform2D m = flip_glyph * text.transform * to_box;
+ const bool invisible =
+ text.rendering_mode == pdf::TextRenderingMode::invisible ||
+ text.rendering_mode == pdf::TextRenderingMode::clip;
+ // `m.a > 0` keeps the axis-aligned fast path from swallowing a pure 180°
+ // rotation (a = d = -1, b = c = 0), which would otherwise feed a negative
+ // `m.a` into `font_size_px` and the left/top math.
+ const bool is_matrix = !(m.b == 0 && m.c == 0 && m.a == m.d && m.a > 0);
+ const double tz = text.horizontal_scaling / 100.0; // horizontal scaling is stored as a percentage
+ const double axis = tz != 0 ? std::hypot(m.a, m.b) / tz : 0; // length of the (a, b) axis with Tz divided out; 0 guards the division
+ return RunGeometry{
+ .m = m,
+ .invisible = invisible,
+ .is_matrix = is_matrix,
+ .asc = ascent_em(text),
+ .scale = is_matrix ? 1.0 : m.a, // the matrix path carries the scale in the CSS matrix itself
+ .ox = m.e,
+ .baseline = m.f,
+ .extent = text.width * axis,
+ .font_pt = text.size * axis,
+ .font_size_px =
+ round2((is_matrix ? text.size : m.a * text.size) * pt_to_px), // scale folded into font-size only on the fast path
+ };
+ }
+
+ /// The colour class suffix (with a leading space) for a run's paint colour,
+ /// or "" when the run is invisible or its CSS colour is exactly `rgb(0,0,0)`.
+ /// Interns `color:<css>` in `styles` under prefix "k". Shared by both modes.
+ static std::string color_class(const pdf::TextElement &text,
+ const bool invisible, AtomicStyles &styles) {
+ if (invisible) {
+ return {}; // invisible runs take no colour class
+ }
+ const pdf::GraphicsState::Color &paint = // stroke-only modes paint with the stroking colour, all others with the fill colour
+ (text.rendering_mode == pdf::TextRenderingMode::stroke ||
+ text.rendering_mode == pdf::TextRenderingMode::stroke_clip)
+ ? text.stroke_color
+ : text.fill_color;
+ std::string css = device_color_to_css(paint);
+ if (css == "rgb(0,0,0)") {
+ return {}; // black gets no class of its own
+ }
+ return ' ' + styles.intern("k", "color:" + std::move(css));
+ }
+
+ /// The page-box geometry (dimensions, the page `to_box` transform and the
+ /// `.p x# y#` class string) shared by both modes' page setup. Produced by
+ /// `begin_page`, whose `add_class` interns the width/height declarations.
+ struct PageBox {
+ double width; ///< media box x-extent (entry 2 minus entry 0)
+ double height; ///< media box y-extent (entry 3 minus entry 1)
+ util::math::Transform2D to_box; ///< moves the box origin to (0, 0) and flips y about `height`
+ std::string classes; ///< "p" plus the interned width ("x") and height ("y") classes
+ };
+
+ template // NOTE(review): the template parameter list appears lost in extraction — presumably <typename AddClass>; confirm
+ static PageBox begin_page(const pdf::Page &page, AddClass &&add_class) { // Builds the PageBox for `page` from its media box.
+ const pdf::Array &page_box = page.media_box.as_array(); // read as [x0, y0, x1, y1]
+ const double box_x0 = page_box[0].as_real();
+ const double box_y0 = page_box[1].as_real();
+ const double width = page_box[2].as_real() - box_x0;
+ const double height = page_box[3].as_real() - box_y0;
+
+ std::string classes = "p";
+ {
+ std::ostringstream w;
+ w << "width:" << width * pt_to_in << "in"; // page size is declared in CSS inches
+ add_class(classes, "x", std::move(w).str());
+ std::ostringstream h;
+ h << "height:" << height * pt_to_in << "in";
+ add_class(classes, "y", std::move(h).str());
+ }
+
+ const util::math::Transform2D to_box = // shift the box origin to (0, 0), then flip y about `height`
+ util::math::Transform2D::translation(-box_x0, -box_y0) *
+ util::math::Transform2D::scaling_translation(1, -1, 0, height);
+
+ return {width, height, to_box, std::move(classes)};
+ }
+
+ /// Returns the 1-based font family index for `font`, or 0 when it is unusable.
+ /// The verdict is cached in `family_index` (rejections as 0), so
+ /// `font_is_usable` runs once per font. The first acceptance bumps `family_count`
+ /// and calls `on_accept(index)` so callers can grow parallel per-font arrays.
+ template // NOTE(review): the template parameter list appears lost in extraction — presumably <typename OnAccept>; confirm
+ static std::uint32_t intern_font(
+ std::unordered_map &family_index, // NOTE(review): key/value types appear lost in extraction — presumably pdf::Font* -> std::uint32_t; confirm
+ std::uint32_t &family_count, pdf::Font *font, OnAccept &&on_accept) {
+ const auto [it, inserted] = family_index.try_emplace(font, 0); // 0 doubles as the "rejected" marker
+ if (!inserted) {
+ return it->second; // seen before: reuse the cached index (or 0)
+ }
+ if (!font_is_usable(*font)) { // dereferences `font`: callers must pass a non-null pointer
+ return 0;
+ }
+ const std::uint32_t index = ++family_count;
+ it->second = index;
+ on_accept(index);
+ return index;
+ }
+
+ /// Writes a page's `` clips and its paint-order body — an SVG
+ /// open/close dance around a `variant` item list — via
+ /// `write_line`. The structure is identical in both modes; only the line and
+ /// path types and the line writer differ.
+ template
+ static void
+ write_page_items(HtmlWriter &out, const std::string &clip_defs,
+ const std::vector> &items,
+ const double width, const double height,
+ WriteLine &&write_line) {
+ if (!clip_defs.empty()) {
+ out.write_raw(""
+ "");
+ out.write_raw(clip_defs);
+ out.write_raw(" ");
+ }
+ bool svg_open = false;
+ const auto close_svg = [&] {
+ if (svg_open) {
+ out.write_raw(" ");
+ svg_open = false;
}
- if (font_class_used[i][3]) {
- rule("gi", placement, "color:transparent;");
+ };
+ for (const std::variant &item : items) {
+ if (const auto *path = std::get_if(&item)) {
+ if (!svg_open) {
+ std::ostringstream open;
+ open << "";
+ out.write_raw(std::move(open).str());
+ svg_open = true;
+ }
+ out.write_raw(path->svg);
+ } else {
+ close_svg();
+ write_line(std::get(item));
}
}
+ close_svg();
+ }
- // Pass 2: write the document, now that the catalog is complete.
+ /// Writes the document/head prologue shared by both modes: the constant
+ /// `body`/`.p`/`.t` rules, then `write_mode_css()` for the mode-specific
+ /// rules, then the constant `.s` rule, the font faces/styles and the interned
+ /// atomic rules. Returns right after `write_header_end()`; no body is opened.
+ template
+ static void write_header_common(HtmlWriter &out,
+ const std::string &font_faces,
+ const std::string &font_styles,
+ const AtomicStyles &styles,
+ WriteModeCss &&write_mode_css) {
out.write_begin();
out.write_header_begin();
out.write_header_charset("UTF-8");
@@ -1136,132 +1700,233 @@ class HtmlServiceImpl final : public HtmlService {
out.write_header_title("odr");
out.write_header_viewport(
"width=device-width,initial-scale=1.0,user-scalable=yes");
- // Constant per-page and per-glyph styling lives in classes so it is not
- // repeated inline on every one of the (potentially millions of) spans.
out.write_header_style_begin();
- // Page presentation: a neutral backdrop with each page as a centered white
- // sheet and a soft drop shadow, mirroring the familiar PDF-viewer look.
- // This is purely cosmetic chrome around the page box; the
- // absolutely-positioned spans inside are unaffected (they anchor to the
- // `.p` box, which keeps `position:relative`).
out.out() << "body{margin:0;background:#525659}";
out.out() << ".p{position:relative;margin:16px auto;background:#fff;"
"box-shadow:0 1px 4px rgba(0,0,0,.5)}";
- // `font-kerning:none` + `font-variant-ligatures:none` keep the browser from
- // applying the embedded font's GPOS/GSUB tables. A collapsed run now emits
- // real Unicode in that font, so without this a sequence like `fi`/`AV`
- // could be re-shaped (ligature substitution, kerning) after this code
- // already fixed the PDF glyph IDs and advances, shifting pixels and run
- // widths for otherwise 1:1 text. The PUA glyph layer was immune; restore
- // that here.
- // `line-height:1` fixes the box top one em-ascent above the baseline so the
- // baseline shift applied to each run's `top`/matrix (see `ascent_em`) lands
- // the glyphs on the PDF text origin; the browser's default `normal` leading
- // would add an unknown offset.
+ // `.t`: shared base for all absolutely-positioned line blocks.
out.out() << ".t{position:absolute;left:0;top:0;transform-origin:0 0;"
"white-space:pre;line-height:1;font-kerning:none;"
"font-variant-ligatures:none}";
- // Invisible text render modes (Tr 3/7): kept in the DOM for selection and
- // search (OCR-over-scan), but not painted.
- out.out() << ".i{color:transparent}";
- // The display-only glyph layer (`no_unicode` runs) is not selectable, so
- // the PUA code points stay off the clipboard; `.g` pairs with a combined
- // `.fvN`/`.fnN` paint+font class on those spans.
- out.out() << ".g{user-select:none}";
- // Vector graphics: one or more `` overlays per page, each filling the
- // page box (viewBox in PDF points). `overflow:hidden` clips each overlay to
- // the page box, matching a PDF viewer: content drawn outside the MediaBox
- // (e.g. a background rectangle that bleeds past the left edge) is never
- // visible, and without this it spills into the centered page's margin.
- // In-page clip paths are honoured via per-path `clip-path` (the page's
- // `` defs are emitted in a hidden `` above).
- // `preserveAspectRatio:none` keeps the points->box mapping exact.
- // `pointer-events:none` so a full-page overlay painted after text
- // (paint order) does not swallow selection/clicks over its transparent
- // areas — the graphics are decorative, the text layer owns interaction.
+ write_mode_css(); // caller-supplied rules are emitted between the `.t` and `.s` rules
+ // `.s`: SVG overlay covering the page box (visual graphics layer).
out.out() << ".s{position:absolute;left:0;top:0;width:100%;height:100%;"
"overflow:hidden;pointer-events:none}";
- // Embedded fonts, re-encoded to the PUA and served inline.
out.out() << font_faces;
- // Combined per-font classes (`.fvN`/`.fnN` paint+font, `.gvN`/`.giN` also
- // placement), so a font-bearing span names one class for its font.
- out.out() << glyph_styles;
- // Per-value atomic classes (font sizes, offsets, transforms, ...).
+ out.out() << font_styles;
styles.write_rules(out.out());
out.write_header_style_end();
out.write_header_end();
+ }
- const auto write_span = [&out](const SpanOut &span) {
- // Inline so the whole run (and its nested glyph layer) stays on one line:
- // smaller output and a more legible diff than the open/text/close split,
- // while each run still gets its own line under the page div.
- out.write_element_begin(
- "span",
- HtmlElementOptions().set_inline(true).set_class(span.classes));
- out.write_raw(span.text);
- if (!span.glyph_classes.empty()) {
- out.write_element_begin("span",
- HtmlElementOptions().set_inline(true).set_class(
- span.glyph_classes));
- out.write_raw(span.glyph_text);
- out.write_element_end("span");
+ /// Appends the placement classes of a text run's line block.
+ ///
+ /// An axis-aligned run (`is_matrix == false`) gets an `l` class (`left`) and
+ /// a `t` class (`top`), both in px. A rotated/skewed run gets a single `m`
+ /// class carrying a CSS `matrix(...)` transform, re-anchored to the run's
+ /// baseline by `ascent_pt`. Shared by the visual, selection and single-layer
+ /// line blocks, which all position runs the same way.
+ ///
+ /// @param classes   class list being built; only ever handed to `add_class`
+ /// @param add_class callable invoked as `add_class(classes, prefix, declaration)`
+ /// @param m         the run's transform
+ /// @param is_matrix selects the `m` form instead of the `l`/`t` pair
+ /// @param ox        horizontal origin, multiplied by `pt_to_px` for `left`
+ /// @param baseline  baseline position; the ascent is subtracted before scaling
+ /// @param ascent_pt ascent used to move from the baseline to the box top
+ template <typename AddClass>
+ static void add_position_classes(std::string &classes, AddClass &&add_class,
+                                  const util::math::Transform2D &m,
+                                  const bool is_matrix, const double ox,
+                                  const double baseline,
+                                  const double ascent_pt) {
+   if (is_matrix) {
+     // Shift the translation by the ascent along the matrix's second column
+     // (c, d), so the transformed box top ends up one ascent above the
+     // baseline.
+     const double ascent_px = ascent_pt * pt_to_px;
+     const util::math::Transform2D px_m{m.a,
+                                        m.b,
+                                        m.c,
+                                        m.d,
+                                        m.e * pt_to_px - m.c * ascent_px,
+                                        m.f * pt_to_px - m.d * ascent_px};
+     add_class(classes, "m", "transform:" + svg_matrix(px_m));
+     return;
+   }
+   add_class(classes, "l", px_decl("left", round2(ox * pt_to_px)));
+   // NOTE(review): the ascent is scaled by `m.a` here while the matrix branch
+   // uses the (c, d) column; presumably axis-aligned runs are uniformly scaled
+   // so `m.a == m.d` — confirm against the caller.
+   add_class(classes, "t",
+             px_decl("top", round2((baseline - ascent_pt * m.a) * pt_to_px)));
+ }
+
+ /// Whether `font`'s embedded program can be re-encoded (SFNT PUA re-cmap or
+ /// CFF->OTF wrap) without throwing. Runs the real encode path (the SFNT is
+ /// also serialised to a scratch stream) so failures surface here, not in the
+ /// post-pass. The SFNT's cmap is put back afterwards; the CFF probe is stateless.
+ static bool font_is_usable(const pdf::Font &font) {
+ if (const auto sfnt = std::dynamic_pointer_cast(
+ font.embedded_font)) {
+ std::map original_cmap = sfnt->cmap(); // snapshot: the probe below rewrites the cmap
+ bool usable = false;
+ try {
+ font::reencode_to_pua(*sfnt);
+ std::ostringstream sfnt_out; // scratch sink; the bytes are discarded
+ sfnt->write(sfnt_out);
+ usable = true;
+ } catch (...) { // any failure only means "not usable"; nothing is rethrown
+ usable = false;
}
- out.write_element_end("span");
+ sfnt->set_cmap(std::move(original_cmap)); // runs after success and failure alike
+ return usable;
+ }
+ if (const auto cff =
+ std::dynamic_pointer_cast(font.embedded_font)) {
+ try {
+ (void)font::cff::wrap_to_otf(*cff); // result intentionally dropped; only the absence of a throw matters
+ return true;
+ } catch (...) {
+ return false;
+ }
+ }
+ return false; // neither cast matched (this includes a null `embedded_font`)
+ }
+
+ /// Returns the class name `fv<font>` (or `fn<font>` when `inv`) and flags it
+ /// in `font_class_used[font - 1]` so the post-pass emits the matching rule.
+ static std::string
+ font_class(std::vector> &font_class_used,
+ const std::uint32_t font, const bool inv) {
+ font_class_used[font - 1][inv ? 1 : 0] = true; // slot 0 = visible, 1 = invisible. NOTE(review): `font` is 1-based; 0 would wrap to a huge index — presumably filtered by callers, confirm
+ return (inv ? "fn" : "fv") + std::to_string(font);
+ }
+
+ /// Re-encodes `font`'s embedded program (SFNT PUA re-cmap or CFF->OTF wrap,
+ /// passing `extra_unicode` through to the encoder) and appends one
+ /// `@font-face` rule to `font_faces` plus the `.fvN`/`.fnN` rules to
+ /// `font_styles`; `class_used[0]`/`[1]` gate the visible/invisible rule.
+ static void
+ write_font_face(pdf::Font &font, const std::uint32_t index,
+ const std::map &extra_unicode,
+ const std::array &class_used,
+ std::string &font_faces, std::string &font_styles) {
+ std::string reencoded; // NOTE(review): stays empty when neither cast below matches, yet a face is still emitted
+ if (const auto sfnt = std::dynamic_pointer_cast(
+ font.embedded_font)) {
+ font::reencode_to_pua(*sfnt, extra_unicode); // mutates the SFNT in place before it is serialised
+ std::ostringstream sfnt_out;
+ sfnt->write(sfnt_out);
+ reencoded = std::move(sfnt_out).str();
+ } else if (const auto cff = std::dynamic_pointer_cast(
+ font.embedded_font)) {
+ reencoded = font::cff::wrap_to_otf(*cff, extra_unicode);
+ }
+ const std::string url = file_to_url(reencoded, "font/ttf"); // same MIME label for the SFNT and the CFF->OTF payload
+ const std::string n = std::to_string(index + 1); // NOTE(review): treats `index` as 0-based, while `font_class` names classes after a 1-based id — confirm callers pass id - 1
+ font_faces += "@font-face{font-family:'odr-f";
+ font_faces += n;
+ font_faces += "';src:url(";
+ font_faces += url;
+ font_faces += ");}";
+ const auto rule = [&](const char *cls, const char *color) { // appends `.<cls><n>{<color>font-family:'odr-f<n>'}`
+ font_styles += '.';
+ font_styles += cls;
+ font_styles += n;
+ font_styles += '{';
+ font_styles += color;
+ font_styles += "font-family:'odr-f";
+ font_styles += n;
+ font_styles += "'}";
};
+ if (class_used[0]) { // visible variant was referenced
+ rule("fv", "color:#000;");
+ }
+ if (class_used[1]) { // invisible variant was referenced
+ rule("fn", "color:transparent;");
+ }
+ }
- out.write_body_begin();
- for (const PageOut &page : pages_out) {
- out.write_element_begin("div",
- HtmlElementOptions().set_class(page.classes));
- // Clip-path and gradient defs for this page, in a hidden zero-size
- // ``. They are referenced by id from the page's fragments;
- // `clipPathUnits`/`gradientUnits` are `userSpaceOnUse`, so the geometry
- // is read in the user space of the referencing element (the page
- // viewBox), not this ``.
- if (!page.clip_defs.empty()) {
- out.write_raw(
- ""
- "");
- out.write_raw(page.clip_defs);
- out.write_raw(" ");
+ static double ascent_em(const pdf::TextElement &text) { // ascent of the run's font as a fraction of the em, clamped to [0.5, 1.2]
+ double em = 0.8; // fallback when neither source below is available
+ if (text.font != nullptr && text.font->descriptor_ascent) { // first choice: the font's `descriptor_ascent`, used as-is
+ em = *text.font->descriptor_ascent;
+ } else if (text.font != nullptr && text.font->embedded_font != nullptr) { // second choice: top of the embedded font's bounding box
+ const std::uint16_t units = text.font->embedded_font->units_per_em();
+ if (units != 0) { // a zero units-per-em would divide by zero; keep the fallback instead
+ em = static_cast(
+ text.font->embedded_font->bounding_box().y_max) /
+ units;
}
- // Walk the page's elements in paint order, coalescing contiguous paths
- // into a single `` so spans and vector graphics layer by DOM order.
- bool svg_open = false;
- const auto close_svg = [&] {
- if (svg_open) {
- out.write_raw(" ");
- svg_open = false;
- }
- };
- for (const PageItem &item : page.items) {
- if (const auto *path = std::get_if(&item)) {
- if (!svg_open) {
- std::ostringstream open;
- open << "";
- out.write_raw(std::move(open).str());
- svg_open = true;
- }
- out.write_raw(path->svg);
- } else {
- close_svg();
- write_span(std::get(item));
+ }
+ return std::clamp(em, 0.5, 1.2); // bounds whichever source was used, including the fallback
+ }
+
+ /// Builds the PUA glyph string for `codes`: every character code yielded by
+ /// `font.codes(codes)` is mapped to its glyph, the glyph to its
+ /// `font::pua_code_point`, and that code point is appended to the result.
+ static std::string glyph_run_str(const pdf::Font &font,
+                                  const std::string &codes) {
+   std::string glyphs;
+   auto &&decoded = font.codes(codes);
+   for (const std::uint32_t code : decoded) {
+     const auto glyph = font.glyph_for_code(code);
+     util::string::append_c32(font::pua_code_point(glyph), glyphs);
+   }
+   return glyphs;
+ }
+
+ /// Escapes only the three markup-significant characters (`&`, `<`, `>`) for
+ /// the selection / unicode text. Deliberately *not* `html::escape_text`: that
+ /// substitutes spaces with `&nbsp;`, which browsers treat as a distinct
+ /// character from U+0020 and so breaks find-in-page word matching and
+ /// double-click word selection across the very text these layers exist to
+ /// make selectable.
+ ///
+ /// @param s text to escape (taken by value and edited in place)
+ /// @return `s` with `&`, `<` and `>` replaced by their character references
+ static std::string escape_markup(std::string s) {
+   // `&` must go first: the references inserted for `<` and `>` contain `&`
+   // themselves and would otherwise be escaped a second time.
+   util::string::replace_all(s, "&", "&amp;");
+   util::string::replace_all(s, "<", "&lt;");
+   util::string::replace_all(s, ">", "&gt;");
+   return s;
+ }
+
+ /// Handles path/shading/image elements common to both rendering modes.
+ /// Calls `close_line()` and then `push_svg(fragment)` only when a non-empty
+ /// fragment is produced. Returns true when the element was a graphic
+ /// (caller should `continue`), false when it is a text element.
+ template
+ static bool handle_graphic_element(
+ const pdf::PageElement &element, const util::math::Transform2D &to_box,
+ double width, double height, ClipRegistry &clips,
+ GradientRegistry &gradients, PatternRegistry &patterns, Logger &logger,
+ CloseLine &&close_line, PushSvg &&push_svg) {
+ if (const auto *path = std::get_if(&element)) {
+ const std::string clip_id = clips.register_clip(path->clip, to_box); // registered before the fragment is built, so also when it turns out empty
+ std::string fill_url_id; // stays empty when the path has neither a shading nor a pattern fill
+ if (path->fill_shading != nullptr) { // a shading fill takes precedence over a pattern fill
+ fill_url_id = gradients.register_gradient(
+ *path->fill_shading, path->shading_transform * to_box);
+ } else if (path->fill_pattern != nullptr) {
+ fill_url_id = patterns.register_pattern(
+ *path->fill_pattern, path->pattern_transform * to_box,
+ path->fill_color, logger);
+ }
+ if (std::string frag =
+ svg_path_fragment(*path, to_box, clip_id, fill_url_id);
+ !frag.empty()) {
+ close_line();
+ push_svg(std::move(frag));
+ }
+ return true;
+ }
+ if (const auto *shading = std::get_if(&element)) {
+ if (shading->shading != nullptr) { // without a shading object nothing is emitted, but the element still counts as a graphic
+ const std::string clip_id = clips.register_clip(shading->clip, to_box);
+ const std::string gradient_id = gradients.register_gradient(
+ *shading->shading, shading->transform * to_box);
+ if (std::string frag =
+ svg_shading_fragment(gradient_id, clip_id, width, height);
+ !frag.empty()) {
+ close_line();
+ push_svg(std::move(frag));
}
}
- close_svg();
- out.write_element_end("div");
+ return true;
}
- out.write_body_end();
- out.write_end();
-
- return resources;
+ if (const auto *image = std::get_if(&element)) {
+ const std::string clip_id = clips.register_clip(image->clip, to_box);
+ if (std::string frag = svg_image_fragment(*image, to_box, clip_id);
+ !frag.empty()) {
+ close_line();
+ push_svg(std::move(frag));
+ }
+ return true;
+ }
+ return false; // none of the three graphic alternatives matched
}
protected:
PdfFile m_pdf_file;
-
HtmlViews m_views;
};
diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private
index 85a14d01..3e9e7d31 160000
--- a/test/data/reference-output/odr-private
+++ b/test/data/reference-output/odr-private
@@ -1 +1 @@
-Subproject commit 85a14d010ffb87dddeb67cdc1aa18bd54d502c47
+Subproject commit 3e9e7d31b0082a37e2605f6c9237876df8172be3
diff --git a/test/data/reference-output/odr-public b/test/data/reference-output/odr-public
index 45b29f5b..2eaeb9db 160000
--- a/test/data/reference-output/odr-public
+++ b/test/data/reference-output/odr-public
@@ -1 +1 @@
-Subproject commit 45b29f5b796bda9ad0c14661179e50f91f47aecc
+Subproject commit 2eaeb9dbc2db95fbdb8a4b76b58d3976fafaab38
diff --git a/test/src/html_output_test.cpp b/test/src/html_output_test.cpp
index 3b51705c..ffc675e3 100644
--- a/test/src/html_output_test.cpp
+++ b/test/src/html_output_test.cpp
@@ -164,26 +164,44 @@ TEST_P(HtmlOutputTests, html_meta) {
fs::copy_options::recursive |
fs::copy_options::overwrite_existing);
- HtmlConfig config(output_path);
- config.embed_images = true;
- config.embed_shipped_resources = false;
- config.resource_path = resource_path;
- config.relative_resource_paths = true;
- config.editable = true;
- config.spreadsheet_limit = TableDimensions(4000, 500);
- config.format_html = true;
- config.html_indent = 1;
- config.html_indent_string = "\t";
-
- std::string output_path_tmp = output_path + "/tmp";
- std::filesystem::create_directories(output_path_tmp);
- HtmlService service = html::translate(file, output_path_tmp, config);
- Html html = service.bring_offline(output_path);
- std::filesystem::remove_all(output_path_tmp);
-
- for (const HtmlPage &html_page : html.pages()) {
- EXPECT_TRUE(fs::is_regular_file(html_page.path));
- EXPECT_LT(0, fs::file_size(html_page.path));
+ const auto write_offline = [&](const std::string &out_path,
+ const bool single_layer) {
+ HtmlConfig config(out_path);
+ config.embed_images = true;
+ config.embed_shipped_resources = false;
+ config.resource_path = resource_path;
+ config.relative_resource_paths = true;
+ config.editable = true;
+ config.spreadsheet_limit = TableDimensions(4000, 500);
+ config.format_html = true;
+ config.html_indent = 1;
+ config.html_indent_string = "\t";
+ if (single_layer) {
+ config.pdf_text_mode = PdfTextMode::single_layer;
+ }
+
+ const std::string output_path_tmp = out_path + "/tmp";
+ fs::create_directories(out_path);
+ std::filesystem::create_directories(output_path_tmp);
+ HtmlService service = html::translate(file, output_path_tmp, config);
+ Html html = service.bring_offline(out_path);
+ std::filesystem::remove_all(output_path_tmp);
+
+ for (const HtmlPage &html_page : html.pages()) {
+ EXPECT_TRUE(fs::is_regular_file(html_page.path));
+ EXPECT_LT(0, fs::file_size(html_page.path));
+ }
+ };
+
+ write_offline(output_path, /*single_layer=*/false);
+
+ // PDFs default to `PdfTextMode::dual_layer`. To keep the single-layer path
+ // under reference-output coverage too, emit it alongside the dual-layer
+ // output for one representative PDF under a `-single` suffix (mirroring the
+ // `-poppler` engine-suffix convention).
+ if (test_file.short_path == "odr-private/pdf/978-3-030-65771-0.pdf" &&
+ engine == DecoderEngine::odr) {
+ write_offline(output_path + "-single", /*single_layer=*/true);
}
}
diff --git a/test/src/internal/font/sfnt_transform.cpp b/test/src/internal/font/sfnt_transform.cpp
index 6ee02b79..46ea4fad 100644
--- a/test/src/internal/font/sfnt_transform.cpp
+++ b/test/src/internal/font/sfnt_transform.cpp
@@ -178,9 +178,24 @@ TEST(SfntTransform, serialize_cmap_round_trips_multiple_segments) {
EXPECT_EQ(parsed.glyph_for_code_point('C'), 0); // gap between the segments
}
-TEST(SfntTransform, serialize_cmap_rejects_beyond_bmp) {
- const std::map map{{0x1f600, 1}};
- EXPECT_THROW((void)serialize_cmap(map), std::runtime_error);
+TEST(SfntTransform, serialize_cmap_format12_round_trips_beyond_bmp) {
+ // A beyond-BMP code point forces a format-12 subtable. Mixes a BMP entry
+ // ('A'), a Supplementary PUA-A run (two consecutive codes/glyphs), and a
+ // lone high code point (U+10FFFD), exercising the group builder on all three.
+ const std::map map{
+ {'A', 1}, {0xf0000, 2}, {0xf0001, 3}, {0x10fffd, 9}};
+ const std::string font =
+ build_font(0x00010000, {{"head", head_table()},
+ {"maxp", maxp_table(10)}, // NOTE(review): presumably the glyph count, which must cover glyph id 9 — confirm
+ {"hhea", hhea_table(0)},
+ {"cmap", serialize_cmap(map)}});
+
+ const sfnt::SfntFont parsed = parse(font); // round trip: look the mappings up in the re-parsed font
+ EXPECT_EQ(parsed.glyph_for_code_point('A'), 1);
+ EXPECT_EQ(parsed.glyph_for_code_point(0xf0000), 2);
+ EXPECT_EQ(parsed.glyph_for_code_point(0xf0001), 3);
+ EXPECT_EQ(parsed.glyph_for_code_point(0x10fffd), 9);
+ EXPECT_EQ(parsed.glyph_for_code_point(0xf0002), 0); // gap after the run
}
TEST(SfntTransform, reencode_mutates_the_font_in_place) {
@@ -257,9 +272,24 @@ TEST(SfntTransform, write_preserves_passthrough_tables_and_checksum) {
EXPECT_EQ(parsed.name(), "TestFont");
}
-TEST(SfntTransform, reencode_rejects_too_many_glyphs) {
+TEST(SfntTransform, reencode_overflows_into_supplementary_pua) {
+ // A font with more glyphs than the 6400-slot BMP PUA (U+E000..U+F8FF) is
+ // re-encoded by spilling the overflow into Supplementary PUA-A; the writer
+ // emits a format-12 cmap so the beyond-BMP code points round-trip.
sfnt::SfntFont font = parse(sample_font(7000));
- EXPECT_THROW(reencode_to_pua(font), std::runtime_error);
+ reencode_to_pua(font);
+
+ // Glyph 0 maps to U+E000 (BMP PUA); glyph 6400 is the first overflow, U+F0000.
+ EXPECT_EQ(pua_code_point(0), 0xe000u);
+ EXPECT_EQ(pua_code_point(6400), 0xf0000u);
+
+ std::ostringstream out;
+ font.write(out);
+ const sfnt::SfntFont parsed = parse(std::move(out).str()); // re-parse the written bytes to check the cmap that was actually emitted
+ EXPECT_EQ(parsed.glyph_for_code_point(pua_code_point(0)), 0);
+ EXPECT_EQ(parsed.glyph_for_code_point(pua_code_point(6399)), 6399); // last glyph before the overflow
+ EXPECT_EQ(parsed.glyph_for_code_point(pua_code_point(6400)), 6400); // first glyph in Supplementary PUA-A
+ EXPECT_EQ(parsed.glyph_for_code_point(pua_code_point(6999)), 6999); // last glyph of the sample font
}
TEST(SfntTransform, write_synthesizes_post_when_absent) {