diff --git a/CMakeLists.txt b/CMakeLists.txt index 7dafea1d..d0067b11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,9 +198,14 @@ set(ODR_SOURCE_FILES "src/odr/internal/pdf/pdf_object_parser.cpp" "src/odr/internal/pdf/pdf_page_text.cpp" + "src/odr/internal/font/cff_builder.cpp" "src/odr/internal/font/cff_font.cpp" "src/odr/internal/font/cff_standard_strings.cpp" "src/odr/internal/font/cff_transform.cpp" + "src/odr/internal/font/type1_charstring.cpp" + "src/odr/internal/font/type1_crypt.cpp" + "src/odr/internal/font/type1_font.cpp" + "src/odr/internal/font/type1_transform.cpp" "src/odr/internal/font/sfnt_font.cpp" "src/odr/internal/font/sfnt_transform.cpp" "src/odr/internal/font/font_file.cpp" diff --git a/src/odr/internal/font/cff_builder.cpp b/src/odr/internal/font/cff_builder.cpp new file mode 100644 index 00000000..b92cb24c --- /dev/null +++ b/src/odr/internal/font/cff_builder.cpp @@ -0,0 +1,187 @@ +#include + +#include + +#include +#include +#include + +namespace odr::internal::font::cff { + +namespace { + +/// A CFF DICT integer in the compact encoding (used for widths / bbox). +void dict_int(std::string &s, const std::int32_t v) { + if (v >= -107 && v <= 107) { + s += static_cast(v + 139); + } else if (v >= 108 && v <= 1131) { + const std::int32_t u = v - 108; + s += static_cast((u >> 8) + 247); + s += static_cast(u & 0xff); + } else if (v >= -1131 && v <= -108) { + const std::int32_t u = -v - 108; + s += static_cast((u >> 8) + 251); + s += static_cast(u & 0xff); + } else if (v >= -32768 && v <= 32767) { + s += static_cast(28); + util::byte_string::put_u16_be(s, static_cast(v)); + } else { + s += static_cast(29); + s += static_cast((v >> 24) & 0xff); + s += static_cast((v >> 16) & 0xff); + s += static_cast((v >> 8) & 0xff); + s += static_cast(v & 0xff); + } +} + +/// A CFF DICT integer in the fixed 5-byte form (`29 + int32`), so an operand +/// whose value (an offset) is not yet known can be sized before it is filled. 
+void dict_int_fixed(std::string &s, const std::int32_t v) { + s += static_cast(29); + s += static_cast((v >> 24) & 0xff); + s += static_cast((v >> 16) & 0xff); + s += static_cast((v >> 8) & 0xff); + s += static_cast(v & 0xff); +} + +void dict_operator(std::string &s, const std::int32_t op) { + if (op >= 1200) { + s += static_cast(12); + s += static_cast(op - 1200); + } else { + s += static_cast(op); + } +} + +/// Serialize a CFF INDEX from its members. +std::string build_index(const std::vector &members) { + std::string out; + util::byte_string::put_u16_be(out, + static_cast(members.size())); + if (members.empty()) { + return out; // count 0: no offSize/offsets + } + std::uint32_t total = 1; + for (const std::string &m : members) { + total += static_cast(m.size()); + } + const std::uint8_t off_size = total <= 0xff ? 1 + : total <= 0xffff ? 2 + : total <= 0xffffff ? 3 + : 4; + out += static_cast(off_size); + const auto put_off = [&](const std::uint32_t off) { + for (std::int32_t i = off_size - 1; i >= 0; --i) { + out += static_cast((off >> (8 * i)) & 0xff); + } + }; + std::uint32_t offset = 1; + put_off(offset); + for (const std::string &m : members) { + offset += static_cast(m.size()); + put_off(offset); + } + for (const std::string &m : members) { + out += m; + } + return out; +} + +} // namespace + +} // namespace odr::internal::font::cff + +namespace odr::internal::font { + +std::string cff::build_cff(const std::string_view name, + const std::vector &glyphs, + const double default_width, + const double nominal_width, const FontBBox bbox) { + // CharStrings INDEX (one Type2 charstring per glyph). + std::vector charstrings; + charstrings.reserve(glyphs.size()); + for (const BuilderGlyph &glyph : glyphs) { + charstrings.push_back(glyph.charstring); + } + const std::string charstrings_index = build_index(charstrings); + + // String INDEX: every glyph name gets a custom SID (391 + position). 
Glyph 0 + // is the implicit `.notdef` (SID 0), so its name is not stored; the charset + // lists SIDs for glyphs 1..n-1. + std::vector strings; + for (std::size_t i = 1; i < glyphs.size(); ++i) { + strings.push_back(glyphs[i].name); + } + const std::string string_index = build_index(strings); + + // Format-0 charset: SID per glyph 1..n-1. + std::string charset; + charset += static_cast(0); // format 0 + for (std::size_t i = 1; i < glyphs.size(); ++i) { + util::byte_string::put_u16_be(charset, + static_cast(391 + (i - 1))); + } + + // Private DICT: defaultWidthX (20), nominalWidthX (21). + std::string private_dict; + dict_int(private_dict, static_cast(default_width)); + dict_operator(private_dict, 20); + dict_int(private_dict, static_cast(nominal_width)); + dict_operator(private_dict, 21); + + const std::string name_index = + build_index({std::string(name.empty() ? "ODRType1" : name)}); + const std::string global_subrs = build_index({}); + + // Top DICT, with the offsets to charset / CharStrings / Private filled once + // the layout is known. Fixed-width offset integers keep the size constant. 
+ const auto top_dict = [&](const std::uint32_t charset_off, + const std::uint32_t charstrings_off, + const std::uint32_t private_off) { + std::string d; + dict_int(d, bbox.x_min); + dict_int(d, bbox.y_min); + dict_int(d, bbox.x_max); + dict_int(d, bbox.y_max); + dict_operator(d, 5); // FontBBox + dict_int_fixed(d, static_cast(charset_off)); + dict_operator(d, 15); // charset + dict_int_fixed(d, static_cast(charstrings_off)); + dict_operator(d, 17); // CharStrings + dict_int_fixed(d, static_cast(private_dict.size())); + dict_int_fixed(d, static_cast(private_off)); + dict_operator(d, 18); // Private [size offset] + return d; + }; + + const std::string top_dict_probe = build_index({top_dict(0, 0, 0)}); + constexpr std::uint32_t header_size = 4; + const auto prefix = static_cast( + header_size + name_index.size() + top_dict_probe.size() + + string_index.size() + global_subrs.size()); + // Layout after the prefix: CharStrings, charset, Private. + const std::uint32_t charstrings_off = prefix; + const std::uint32_t charset_off = + charstrings_off + static_cast(charstrings_index.size()); + const std::uint32_t private_off = + charset_off + static_cast(charset.size()); + + const std::string top_dict_index = + build_index({top_dict(charset_off, charstrings_off, private_off)}); + + std::string out; + out += static_cast(1); // major + out += static_cast(0); // minor + out += static_cast(4); // hdrSize + out += static_cast(4); // offSize (absolute offsets; legacy/unused) + out += name_index; + out += top_dict_index; + out += string_index; + out += global_subrs; + out += charstrings_index; + out += charset; + out += private_dict; + return out; +} + +} // namespace odr::internal::font diff --git a/src/odr/internal/font/cff_builder.hpp b/src/odr/internal/font/cff_builder.hpp new file mode 100644 index 00000000..cf8f6ac5 --- /dev/null +++ b/src/odr/internal/font/cff_builder.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include + +#include +#include +#include + +namespace 
odr::internal::font::cff { + +/// One glyph for the CFF builder: its PostScript name and its **Type2** +/// charstring (already translated from Type1, if applicable). +struct BuilderGlyph { + std::string name; + std::string charstring; +}; + +/// Serialize a name-keyed CFF font from Type2 charstrings. +/// +/// Assembles the minimal CFF a `CffFont` reader (and, after wrapping, a +/// browser) needs: Header, Name INDEX, Top DICT (FontBBox + +/// charset/CharStrings/Private offsets), String INDEX (every glyph name, SID +/// 391+), an empty Global Subr INDEX, the CharStrings INDEX, a format-0 charset +/// and a Private DICT +/// (`defaultWidthX`/`nominalWidthX`). Glyph 0 is the implicit `.notdef`; the +/// caller orders @p glyphs so glyph 0 is `.notdef`. +/// +/// This is the assembly target for the Type1 -> CFF path (stage 3.5): the +/// translated Type2 charstrings go in here, the result feeds `CffFont` + +/// `wrap_to_otf` (3.4). No `FontMatrix` is emitted, so the font is 1000 +/// units/em (the Type1 default); a non-default matrix is a follow-up. +/// +/// Offsets in the Top DICT use the fixed-width 5-byte integer form so the +/// layout resolves in a single pass. +[[nodiscard]] std::string build_cff(std::string_view name, + const std::vector &glyphs, + double default_width, double nominal_width, + FontBBox bbox); + +} // namespace odr::internal::font::cff diff --git a/src/odr/internal/font/type1_charstring.cpp b/src/odr/internal/font/type1_charstring.cpp new file mode 100644 index 00000000..0ccdccb7 --- /dev/null +++ b/src/odr/internal/font/type1_charstring.cpp @@ -0,0 +1,428 @@ +#include + +#include +#include +#include +#include +#include + +namespace odr::internal::font::type1 { + +namespace { + +// Type1 charstring operators (single byte; 12 = escape to a two-byte op). 
enum T1 : std::int32_t {
  t1_hstem = 1,
  t1_vstem = 3,
  t1_vmoveto = 4,
  t1_rlineto = 5,
  t1_hlineto = 6,
  t1_vlineto = 7,
  t1_rrcurveto = 8,
  t1_closepath = 9,
  t1_callsubr = 10,
  t1_return = 11,
  t1_hsbw = 13,
  t1_endchar = 14,
  t1_rmoveto = 21,
  t1_hmoveto = 22,
  t1_vhcurveto = 30,
  t1_hvcurveto = 31,
  // Two-byte operators (escape byte 12, then N) are numbered 1200 + N.
  t1_dotsection = 1200,
  t1_vstem3 = 1201,
  t1_hstem3 = 1202,
  t1_seac = 1206,
  t1_sbw = 1207,
  t1_div = 1212,
  t1_callothersubr = 1216,
  t1_pop = 1217,
  t1_setcurrentpoint = 1233,
};

/// Encode an integer operand in the Type2 charstring number forms.
///
/// Uses the 1-byte form for [-107, 107], the 2-byte forms for
/// [108, 1131] / [-1131, -108], and the 3-byte shortint form (`28 + int16`)
/// otherwise. Values outside the 16-bit range cannot be represented by any
/// Type2 integer form; they saturate to the nearest representable value
/// instead of wrapping around to an unrelated number.
void emit_int(std::string &out, const std::int32_t v) {
  if (v >= -107 && v <= 107) {
    out += static_cast<char>(v + 139);
    return;
  }
  if (v >= 108 && v <= 1131) {
    const std::int32_t u = v - 108;
    out += static_cast<char>((u >> 8) + 247);
    out += static_cast<char>(u & 0xff);
    return;
  }
  if (v >= -1131 && v <= -108) {
    const std::int32_t u = -v - 108;
    out += static_cast<char>((u >> 8) + 251);
    out += static_cast<char>(u & 0xff);
    return;
  }
  // shortint: saturate first so the two's-complement bytes stay meaningful.
  const std::int32_t clamped = v < -32768 ? -32768 : (v > 32767 ? 32767 : v);
  const auto bits = static_cast<std::uint16_t>(clamped);
  out += static_cast<char>(28);
  out += static_cast<char>(bits >> 8);
  out += static_cast<char>(bits & 0xff);
}

/// Encode a (possibly fractional) operand: an integer form when whole and in
/// range, else the Type2 16.16 fixed form (`255 + int32`).
///
/// The 16.16 form covers roughly [-32768, 32768); magnitudes beyond that
/// saturate to the nearest representable fixed value. NaN is encoded as 0.
void emit_num(std::string &out, const double v) {
  if (std::isnan(v)) {
    emit_int(out, 0);
    return;
  }
  if (v == std::floor(v) && v >= -32768 && v <= 32767) {
    emit_int(out, static_cast<std::int32_t>(v));
    return;
  }
  // Round in floating point and clamp before narrowing, so the conversion to
  // a 32-bit integer can never overflow (this also handles +/-infinity).
  double scaled = std::round(v * 65536.0);
  if (scaled > 2147483647.0) {
    scaled = 2147483647.0;
  } else if (scaled < -2147483648.0) {
    scaled = -2147483648.0;
  }
  const auto bits =
      static_cast<std::uint32_t>(static_cast<std::int32_t>(scaled));
  out += static_cast<char>(255);
  out += static_cast<char>((bits >> 24) & 0xff);
  out += static_cast<char>((bits >> 16) & 0xff);
  out += static_cast<char>((bits >> 8) & 0xff);
  out += static_cast<char>(bits & 0xff);
}

/// The translation state machine. Walks the Type1 charstring (recursing through
/// `callsubr`), emitting a Type2 charstring.
+class Translator { +public: + explicit Translator(const std::vector &subrs) : m_subrs(subrs) {} + + Type2Charstring run(const std::string_view charstring) { + execute(charstring, 0); + if (!m_ended) { + m_out += static_cast(t1_endchar); + } + return {std::move(m_out), m_width, m_has_width}; + } + +private: + // Emit the pending width (once) ahead of the first stem/move/endchar's + // operands, as the Type2 width does. nominalWidthX is 0 in the built CFF, so + // the width is the absolute advance. + void emit_width() { + if (m_width_pending) { + emit_int(m_out, m_width); + m_width_pending = false; + } + } + + void flush_stack() { + for (const double v : m_stack) { + emit_num(m_out, v); + } + m_stack.clear(); + } + + // Emit width + operands + a one-byte operator, clearing the stack. + void emit_op(const std::int32_t op) { + emit_width(); + flush_stack(); + m_out += static_cast(op); + } + + void execute(const std::string_view cs, const std::int32_t depth) { + if (depth > 16 || m_ended) { + return; + } + std::size_t p = 0; + while (p < cs.size() && !m_ended) { + const auto b = static_cast(cs[p]); + if (b >= 32) { + // operand + double value = 0.0; + if (b <= 246) { + value = static_cast(b) - 139; + p += 1; + } else if (b <= 250) { + value = (static_cast(b) - 247) * 256 + + static_cast(cs[p + 1]) + 108; + p += 2; + } else if (b <= 254) { + value = -(static_cast(b) - 251) * 256 - + static_cast(cs[p + 1]) - 108; + p += 2; + } else { // 255: Type1 32-bit integer + value = static_cast( + (static_cast(cs[p + 1]) << 24) | + (static_cast(cs[p + 2]) << 16) | + (static_cast(cs[p + 3]) << 8) | + static_cast(cs[p + 4])); + p += 5; + } + m_stack.push_back(value); + continue; + } + std::int32_t op = b; + ++p; + if (b == 12) { + op = 1200 + static_cast(cs[p]); + ++p; + } + handle(op, depth); + } + } + + void handle(const std::int32_t op, const std::int32_t depth) { + switch (op) { + case t1_hsbw: + if (m_stack.size() >= 2) { + m_sbx = m_stack[0]; + m_width = 
static_cast(m_stack[1]); + m_has_width = true; + m_width_pending = true; + m_sbx_pending = true; + } + m_stack.clear(); + break; + case t1_sbw: + if (m_stack.size() >= 4) { + m_sbx = m_stack[0]; + m_width = static_cast(m_stack[2]); + m_has_width = true; + m_width_pending = true; + m_sbx_pending = true; + } + m_stack.clear(); + break; + + case t1_rmoveto: + if (m_flex_active) { + collect_flex_point(); + } else { + if (m_sbx_pending && !m_stack.empty()) { + m_stack[0] += m_sbx; + m_sbx_pending = false; + } + emit_op(t1_rmoveto); + } + break; + case t1_hmoveto: + if (m_flex_active) { + collect_flex_point(); + } else { + // hmoveto has no y; the side bearing adds an x, so keep it hmoveto. + if (m_sbx_pending && !m_stack.empty()) { + m_stack[0] += m_sbx; + } + m_sbx_pending = false; + emit_op(t1_hmoveto); + } + break; + case t1_vmoveto: + if (m_flex_active) { + collect_flex_point(); + } else if (m_sbx_pending && m_sbx != 0.0) { + // A side bearing adds an x offset, which vmoveto cannot carry: promote + // to rmoveto(sbx, dy). + const double dy = m_stack.empty() ? 0.0 : m_stack[0]; + m_stack = {m_sbx, dy}; + m_sbx_pending = false; + emit_op(t1_rmoveto); + } else { + m_sbx_pending = false; + emit_op(t1_vmoveto); + } + break; + + case t1_hstem: + case t1_vstem: + case t1_rlineto: + case t1_hlineto: + case t1_vlineto: + case t1_rrcurveto: + case t1_vhcurveto: + case t1_hvcurveto: + emit_op(op); // identical opcodes/semantics in Type2 + break; + + case t1_closepath: + case t1_dotsection: + case t1_vstem3: + case t1_hstem3: + case t1_setcurrentpoint: + m_stack.clear(); // dropped (implicit / hints / no-op) + break; + + case t1_div: + if (m_stack.size() >= 2) { + const double b = m_stack.back(); + m_stack.pop_back(); + const double a = m_stack.back(); + m_stack.pop_back(); + m_stack.push_back(b != 0.0 ? 
a / b : 0.0); + } + break; + + case t1_callsubr: { + if (m_stack.empty()) { + break; + } + const auto index = static_cast(m_stack.back()); + m_stack.pop_back(); + if (index >= 0 && index < static_cast(m_subrs.size())) { + execute(m_subrs[index], depth + 1); + } + break; + } + case t1_return: + break; // end of the current subr + + case t1_callothersubr: + handle_othersubr(); + break; + case t1_pop: + // Push the value the matching callothersubr left on the PS stack. + if (!m_ps_stack.empty()) { + m_stack.push_back(m_ps_stack.back()); + m_ps_stack.pop_back(); + } else { + m_stack.push_back(0.0); + } + break; + + case t1_seac: + emit_seac(); + break; + + case t1_endchar: + emit_op(t1_endchar); + m_ended = true; + break; + + default: + m_stack.clear(); // unknown: skip + break; + } + } + + // OtherSubr dispatch: flex (1 start / 2 add-point / 0 end) and hint + // replacement (3). The Type1 convention is `arg1..argN N othersubr# + // callothersubr`, so the operand stack top is the othersubr number, below it + // the argument count, below that the arguments. + void handle_othersubr() { + if (m_stack.size() < 2) { + m_stack.clear(); + return; + } + const auto othersubr = static_cast(m_stack.back()); + m_stack.pop_back(); + const auto argc = static_cast(m_stack.back()); + m_stack.pop_back(); + + std::vector args; + for (std::int32_t i = 0; i < argc && !m_stack.empty(); ++i) { + args.push_back(m_stack.back()); + m_stack.pop_back(); + } + // args is reversed (top first); restore call order. + std::reverse(args.begin(), args.end()); + + switch (othersubr) { + case 1: // flex start + m_flex_active = true; + m_flex_points.clear(); + break; + case 2: // flex add-point marker (the rmoveto already collected the point) + break; + case 0: // flex end: emit two curves from the collected points + emit_flex(); + m_flex_active = false; + // OtherSubr 0 leaves the end x,y on the PS stack for `pop pop + // setcurrentpoint`. 
+ if (args.size() >= 3) { + m_ps_stack.push_back(args[2]); // y (popped second) + m_ps_stack.push_back(args[1]); // x (popped first) + } + break; + case 3: // hint replacement: result is the subr number, used by callsubr + m_ps_stack.push_back(args.empty() ? 3.0 : args[0]); + break; + default: + // Unknown OtherSubr: make the arguments available to subsequent pops. + for (auto it = args.rbegin(); it != args.rend(); ++it) { + m_ps_stack.push_back(*it); + } + break; + } + } + + // During flex the 7 points arrive as `dx dy rmoveto`; collect their deltas. + void collect_flex_point() { + const double dx = m_stack.size() >= 2 ? m_stack[m_stack.size() - 2] : 0.0; + const double dy = m_stack.empty() ? 0.0 : m_stack.back(); + m_flex_points.push_back({dx, dy}); + m_stack.clear(); + } + + // Emit the flex as two rrcurvetos. Point 1 is the reference point; points + // 2..7 are the two beziers. The first curve's leading delta folds in the + // reference delta (point 2 relative to the pre-flex point = d1 + d2). + void emit_flex() { + if (m_flex_points.size() < 7) { + m_flex_points.clear(); + return; + } + const auto &d = m_flex_points; + emit_width(); + emit_num(m_out, d[1].x + d[0].x); + emit_num(m_out, d[1].y + d[0].y); + emit_num(m_out, d[2].x); + emit_num(m_out, d[2].y); + emit_num(m_out, d[3].x); + emit_num(m_out, d[3].y); + m_out += static_cast(t1_rrcurveto); + emit_num(m_out, d[4].x); + emit_num(m_out, d[4].y); + emit_num(m_out, d[5].x); + emit_num(m_out, d[5].y); + emit_num(m_out, d[6].x); + emit_num(m_out, d[6].y); + m_out += static_cast(t1_rrcurveto); + m_flex_points.clear(); + } + + // seac: asb adx ady bchar achar. Emit the Type2 deprecated endchar-seac form + // `adx' ady bchar achar endchar`, adjusting adx for the accent side bearing. 
+ void emit_seac() { + if (m_stack.size() >= 5) { + const double asb = m_stack[0]; + const double adx = m_stack[1]; + const double ady = m_stack[2]; + const double bchar = m_stack[3]; + const double achar = m_stack[4]; + m_stack.clear(); + emit_width(); + emit_num(m_out, adx - asb + m_sbx); + emit_num(m_out, ady); + emit_num(m_out, bchar); + emit_num(m_out, achar); + m_out += static_cast(t1_endchar); + } + m_ended = true; + } + + struct Point { + double x; + double y; + }; + + const std::vector &m_subrs; + std::string m_out; + std::vector m_stack; + std::vector m_ps_stack; + + std::int32_t m_width{}; + bool m_has_width{}; + bool m_width_pending{}; + double m_sbx{}; + bool m_sbx_pending{}; + bool m_ended{}; + + bool m_flex_active{}; + std::vector m_flex_points; +}; + +} // namespace + +} // namespace odr::internal::font::type1 + +namespace odr::internal::font { + +type1::Type2Charstring type1::to_type2(const std::string_view type1, + const std::vector &subrs) { + return Translator(subrs).run(type1); +} + +} // namespace odr::internal::font diff --git a/src/odr/internal/font/type1_charstring.hpp b/src/odr/internal/font/type1_charstring.hpp new file mode 100644 index 00000000..86c333ac --- /dev/null +++ b/src/odr/internal/font/type1_charstring.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include +#include + +namespace odr::internal::font::type1 { + +/// The result of translating a Type1 charstring to Type2 (CFF). +struct Type2Charstring { + std::string charstring; ///< the Type2 charstring (no leading width) + std::int32_t width{}; ///< advance width from `hsbw`/`sbw`, in glyph units + bool has_width{}; ///< whether an `hsbw`/`sbw` set the width +}; + +/// Translate one **decrypted** Type1 charstring to a Type2 (CFF) charstring. 
+/// +/// Type1 and Type2 share most path operators; this flattens `callsubr` +/// (inlining @p subrs), folds `div`, lifts the `hsbw` side bearing into the +/// first move, drops Type1-only hint operators (`dotsection`, `*stem3`, hint +/// replacement) and translates the flex and `seac` OtherSubr mechanisms. The +/// advance width (`hsbw`) is returned separately rather than baked into the +/// charstring, so the caller emits it against the CFF `nominalWidthX`. +/// +/// Best-effort and display-oriented: hints are dropped (they affect rendering +/// quality, not glyph shape), and unknown operators are skipped. +[[nodiscard]] Type2Charstring to_type2(std::string_view type1, + const std::vector &subrs); + +} // namespace odr::internal::font::type1 diff --git a/src/odr/internal/font/type1_crypt.cpp b/src/odr/internal/font/type1_crypt.cpp new file mode 100644 index 00000000..4a31b73c --- /dev/null +++ b/src/odr/internal/font/type1_crypt.cpp @@ -0,0 +1,98 @@ +#include + +#include +#include +#include + +namespace odr::internal::font::type1 { + +namespace { + +constexpr std::uint16_t c1 = 52845; +constexpr std::uint16_t c2 = 22719; + +[[nodiscard]] bool is_hex_digit(const std::uint8_t c) { + return std::isxdigit(c) != 0; +} + +/// Hex-decode @p in, skipping whitespace; stops at the first non-hex, non-space +/// byte (the binary `eexec` form never reaches here). +[[nodiscard]] std::string hex_decode(const std::string_view in) { + std::string out; + std::int32_t high = -1; + for (const char ch : in) { + const auto c = static_cast(ch); + if (std::isspace(c) != 0) { + continue; + } + if (!is_hex_digit(c)) { + break; + } + const std::int32_t value = (c <= '9') ? c - '0' + : (c <= 'F') ? 
c - 'A' + 10 + : c - 'a' + 10; + if (high < 0) { + high = value; + } else { + out += static_cast((high << 4) | value); + high = -1; + } + } + return out; +} + +/// Whether @p eexec is the ASCII-hex form: the first four non-space bytes are +/// all hex digits (Type1 spec 7.2 — the binary form is detected as not-this). +[[nodiscard]] bool looks_like_hex(const std::string_view eexec) { + std::int32_t seen = 0; + for (const char ch : eexec) { + const auto c = static_cast(ch); + if (std::isspace(c) != 0) { + continue; + } + if (!is_hex_digit(c)) { + return false; + } + if (++seen == 4) { + return true; + } + } + return false; +} + +} // namespace + +} // namespace odr::internal::font::type1 + +namespace odr::internal::font { + +std::string type1::decrypt(const std::string_view cipher, + const std::uint16_t key, const std::size_t skip) { + std::uint16_t r = key; + std::string out; + out.reserve(cipher.size()); + for (const char ch : cipher) { + const auto c = static_cast(ch); + out += static_cast(c ^ (r >> 8)); + r = static_cast((c + r) * c1 + c2); + } + if (skip >= out.size()) { + return {}; + } + return out.substr(skip); +} + +std::string type1::decrypt_eexec(const std::string_view eexec) { + if (looks_like_hex(eexec)) { + const std::string binary = hex_decode(eexec); + return decrypt(binary, 55665, 4); + } + return decrypt(eexec, 55665, 4); +} + +std::string type1::decrypt_charstring(const std::string_view charstring, + const std::size_t len_iv) { + return decrypt(charstring, 4330, len_iv); +} + +} // namespace odr::internal::font diff --git a/src/odr/internal/font/type1_crypt.hpp b/src/odr/internal/font/type1_crypt.hpp new file mode 100644 index 00000000..779f9b3e --- /dev/null +++ b/src/odr/internal/font/type1_crypt.hpp @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include +#include + +namespace odr::internal::font::type1 { + +// Type1 (Adobe Type 1 Font Format) `eexec` / charstring decryption. 
+// +// Both the `eexec`-encrypted portion of the font program and each individual +// charstring use the same stream cipher with different keys: a 16-bit running +// key `R`, constants c1 = 52845 / c2 = 22719, where each plaintext byte is +// `cipher ^ (R >> 8)` and `R = (cipher + R) * c1 + c2` (mod 2^16). The leading +// @p skip bytes of plaintext are random padding and discarded. + +/// Decrypt @p cipher with the running-key cipher seeded at @p key, discarding +/// the first @p skip plaintext bytes. +[[nodiscard]] std::string decrypt(std::string_view cipher, std::uint16_t key, + std::size_t skip); + +/// Decrypt the `eexec` section (key 55665, 4 random bytes). Accepts either the +/// binary form (PDF `/FontFile`, the `/Length2` portion) or the ASCII-hex form +/// (PFA fonts): when the section's leading bytes are all hex digits/whitespace +/// it is hex-decoded first. +[[nodiscard]] std::string decrypt_eexec(std::string_view eexec); + +/// Decrypt one charstring (key 4330), discarding @p len_iv leading bytes +/// (the font's `/lenIV`, default 4). +[[nodiscard]] std::string decrypt_charstring(std::string_view charstring, + std::size_t len_iv = 4); + +} // namespace odr::internal::font::type1 diff --git a/src/odr/internal/font/type1_font.cpp b/src/odr/internal/font/type1_font.cpp new file mode 100644 index 00000000..2e4075b0 --- /dev/null +++ b/src/odr/internal/font/type1_font.cpp @@ -0,0 +1,281 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace odr::internal::font::type1 { + +namespace { + +[[nodiscard]] bool is_ps_space(const char c) { + return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f' || + c == '\0'; +} + +/// Skip PostScript whitespace starting at @p p. 
+[[nodiscard]] std::size_t skip_space(const std::string_view s, std::size_t p) { + while (p < s.size() && is_ps_space(s[p])) { + ++p; + } + return p; +} + +/// Read a whitespace-delimited token starting at @p p; advances @p p past it. +[[nodiscard]] std::string_view read_token(const std::string_view s, + std::size_t &p) { + p = skip_space(s, p); + const std::size_t begin = p; + while (p < s.size() && !is_ps_space(s[p])) { + ++p; + } + return s.substr(begin, p - begin); +} + +[[nodiscard]] bool parse_int(const std::string_view token, std::int32_t &out) { + const char *begin = token.data(); + const char *end = begin + token.size(); + const auto [ptr, ec] = std::from_chars(begin, end, out); + return ec == std::errc() && ptr == end; +} + +[[nodiscard]] double parse_double(const std::string_view token) { + // std::from_chars for double is not universally available; std::stod needs a + // null-terminated copy. + try { + return std::stod(std::string(token)); + } catch (const std::exception &) { + return 0.0; + } +} + +/// Parse the numbers inside the next `[...]` or `{...}` after @p key in @p s. +[[nodiscard]] std::vector +parse_number_array(const std::string_view s, const std::string_view key) { + std::vector out; + const std::size_t k = s.find(key); + if (k == std::string_view::npos) { + return out; + } + std::size_t open = s.find_first_of("[{", k); + if (open == std::string_view::npos) { + return out; + } + const std::size_t close = s.find_first_of("]}", open); + if (close == std::string_view::npos) { + return out; + } + std::size_t p = open + 1; + while (p < close) { + const std::string_view token = read_token(s.substr(0, close), p); + if (token.empty()) { + break; + } + out.push_back(parse_double(token)); + } + return out; +} + +/// Read the `length` binary bytes of an `RD`/`-|` value: at @p p sits the +/// length integer, then the RD operator, then exactly one space, then the +/// bytes. 
On success returns the bytes and advances @p p past them; on failure +/// returns nullopt. +[[nodiscard]] std::optional +read_rd_binary(const std::string_view s, std::size_t &p) { + std::size_t q = p; + const std::string_view length_token = read_token(s, q); + std::int32_t length = 0; + if (!parse_int(length_token, length) || length < 0) { + return std::nullopt; + } + const std::string_view rd = read_token(s, q); // "RD" or "-|" + if (rd != "RD" && rd != "-|") { + return std::nullopt; + } + // Exactly one space separates the RD operator from the binary data. + if (q >= s.size()) { + return std::nullopt; + } + ++q; // the single delimiter space + if (q + static_cast(length) > s.size()) { + return std::nullopt; + } + const std::string_view bytes = s.substr(q, static_cast(length)); + p = q + static_cast(length); + return bytes; +} + +} // namespace + +bool Type1Font::is_type1(const std::string_view data) { + if (data.size() >= 2 && static_cast(data[0]) == 0x80) { + return true; // PFB segment marker + } + return data.substr(0, 256).find("%!PS-AdobeFont") != std::string_view::npos || + data.substr(0, 256).find("%!FontType1") != std::string_view::npos; +} + +Type1Font::Type1Font(std::string_view data) { + // Strip PFB segment framing if present: each segment is `0x80 type len32le` + // followed by `len` bytes (type 1 = ASCII, 2 = binary, 3 = EOF). 
+ std::string unframed; + if (!data.empty() && static_cast(data[0]) == 0x80) { + std::size_t p = 0; + while (p + 6 <= data.size() && static_cast(data[p]) == 0x80) { + const std::uint8_t type = static_cast(data[p + 1]); + if (type == 3) { + break; + } + const auto len = + util::byte::from_little_endian(data.substr(p + 2, 4)); + p += 6; + if (p + len > data.size()) { + break; + } + unframed.append(data.substr(p, len)); + p += len; + } + data = unframed; + } + + const std::size_t eexec = data.find("eexec"); + if (eexec == std::string_view::npos) { + throw std::runtime_error("type1: no eexec section"); + } + + parse_clear(data.substr(0, eexec)); + + // The encrypted blob begins after `eexec` and its trailing whitespace. + const std::size_t blob = skip_space(data, eexec + 5); + const std::string decrypted = decrypt_eexec(data.substr(blob)); + parse_private(decrypted); + + if (m_glyphs.empty()) { + throw std::runtime_error("type1: no /CharStrings"); + } +} + +void Type1Font::parse_clear(const std::string_view clear) { + if (const std::size_t k = clear.find("/FontName"); + k != std::string_view::npos) { + std::size_t p = clear.find('/', k + 1); + if (p != std::string_view::npos) { + ++p; + m_name = std::string(read_token(clear, p)); + } + } + + if (const std::vector matrix = + parse_number_array(clear, "/FontMatrix"); + matrix.size() == 6) { + m_font_matrix = {matrix[0], matrix[1], matrix[2], + matrix[3], matrix[4], matrix[5]}; + } + if (const std::vector bbox = parse_number_array(clear, "/FontBBox"); + bbox.size() == 4) { + m_font_bbox = { + static_cast(bbox[0]), static_cast(bbox[1]), + static_cast(bbox[2]), static_cast(bbox[3])}; + } + + // /Encoding: `StandardEncoding def`, or a custom array built with + // `dup / put` lines. 
+ const std::size_t enc = clear.find("/Encoding"); + if (enc != std::string_view::npos) { + const std::string_view after = clear.substr(enc); + if (after.substr(0, 64).find("StandardEncoding") != + std::string_view::npos) { + m_standard_encoding = true; + } else { + m_standard_encoding = false; + std::size_t p = 0; + while ((p = after.find("dup ", p)) != std::string_view::npos) { + std::size_t q = p + 4; + std::int32_t code = 0; + const std::string_view code_token = read_token(after, q); + const std::size_t slash = after.find('/', q); + if (parse_int(code_token, code) && slash != std::string_view::npos) { + std::size_t r = slash + 1; + m_encoding[code] = std::string(read_token(after, r)); + } + p = q; + } + } + } +} + +void Type1Font::parse_private(const std::string_view decrypted) { + std::int32_t len_iv = 4; + if (const std::size_t k = decrypted.find("/lenIV"); + k != std::string_view::npos) { + std::size_t p = k + 6; + std::int32_t value = 0; + if (parse_int(read_token(decrypted, p), value)) { + len_iv = value; + } + } + m_len_iv = len_iv; + + // /Subrs: entries `dup RD NP`. + if (const std::size_t k = decrypted.find("/Subrs"); + k != std::string_view::npos) { + std::size_t p = k; + while ((p = decrypted.find("dup ", p)) != std::string_view::npos) { + // Stop when /CharStrings starts (Subrs precede it). + const std::size_t cs = decrypted.find("/CharStrings"); + if (cs != std::string_view::npos && p > cs) { + break; + } + std::size_t q = p + 4; + std::int32_t index = 0; + if (!parse_int(read_token(decrypted, q), index) || index < 0) { + p += 4; + continue; + } + const std::optional bytes = + read_rd_binary(decrypted, q); + if (!bytes.has_value()) { + p += 4; + continue; + } + if (static_cast(m_subrs.size()) <= index) { + m_subrs.resize(index + 1); + } + m_subrs[index] = decrypt_charstring(*bytes, len_iv); + p = q; + } + } + + // /CharStrings: entries `/ RD ND`. 
+ const std::size_t cs = decrypted.find("/CharStrings"); + if (cs == std::string_view::npos) { + return; + } + const std::size_t begin = decrypted.find("begin", cs); + std::size_t p = (begin == std::string_view::npos) ? cs : begin + 5; + while (p < decrypted.size()) { + const std::size_t slash = decrypted.find('/', p); + if (slash == std::string_view::npos) { + break; + } + std::size_t q = slash + 1; + std::string name(read_token(decrypted, q)); + const std::optional bytes = read_rd_binary(decrypted, q); + if (!bytes.has_value()) { + // Not a charstring entry (e.g. `end`); advance past this slash. + p = slash + 1; + continue; + } + m_glyphs.push_back({std::move(name), decrypt_charstring(*bytes, len_iv)}); + p = q; + } +} + +} // namespace odr::internal::font::type1 diff --git a/src/odr/internal/font/type1_font.hpp b/src/odr/internal/font/type1_font.hpp new file mode 100644 index 00000000..68f3a492 --- /dev/null +++ b/src/odr/internal/font/type1_font.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace odr::internal::font::type1 { + +/// One glyph of a Type1 font: its PostScript name and its **decrypted** Type1 +/// charstring (charstring encryption removed, `/lenIV` leading bytes dropped). +struct Glyph { + std::string name; + std::string charstring; +}; + +/// @brief Parses an Adobe Type1 font program into its decrypted parts. +/// +/// A Type1 program has three sections: a clear-text header (font dictionary up +/// to `eexec`), an `eexec`-encrypted private portion (`/Subrs`, +/// `/CharStrings`), and a zero-padded trailer. This reads the header for +/// `/FontMatrix`, +/// `/FontBBox`, `/Encoding` and `/FontName`, decrypts the `eexec` section +/// (`type1_crypt`) and extracts every glyph's decrypted charstring plus the +/// `/Subrs`. It does **not** yet interpret the charstrings — that is the +/// Type1 -> Type2 translation that follows, feeding 3.4's CFF -> OTF path. 
+/// +/// Throws `std::runtime_error` when the program has no `eexec` section or no +/// `/CharStrings`. +class Type1Font { +public: + /// Cheap magic test: the PostScript font sentinel (`%!PS-AdobeFont`, + /// `%!FontType1`) or a PFB segment marker (`0x80`). + [[nodiscard]] static bool is_type1(std::string_view data); + + /// Parse @p data (the raw `/FontFile` bytes, PFB markers stripped if + /// present). + explicit Type1Font(std::string_view data); + + [[nodiscard]] std::string_view name() const noexcept { return m_name; } + /// The `/FontMatrix` (defaults to `[0.001 0 0 0.001 0 0]`). + [[nodiscard]] const util::math::Transform2D &font_matrix() const noexcept { + return m_font_matrix; + } + [[nodiscard]] FontBBox font_bbox() const noexcept { return m_font_bbox; } + + /// `/Encoding` as code -> glyph name. Empty when the font uses + /// `StandardEncoding` (see `standard_encoding`). + [[nodiscard]] const std::map & + encoding() const noexcept { + return m_encoding; + } + [[nodiscard]] bool standard_encoding() const noexcept { + return m_standard_encoding; + } + + /// Decrypted glyphs in declaration order. + [[nodiscard]] const std::vector &glyphs() const noexcept { + return m_glyphs; + } + /// Decrypted `/Subrs`, indexed by subr number. 
+ [[nodiscard]] const std::vector &subrs() const noexcept { + return m_subrs; + } + +private: + void parse_clear(std::string_view clear); + void parse_private(std::string_view decrypted); + + std::string m_name; + util::math::Transform2D m_font_matrix{0.001, 0.0, 0.0, 0.001, 0.0, 0.0}; + FontBBox m_font_bbox{}; + std::map m_encoding; + bool m_standard_encoding{true}; + std::vector m_glyphs; + std::vector m_subrs; + std::int32_t m_len_iv{4}; +}; + +} // namespace odr::internal::font::type1 diff --git a/src/odr/internal/font/type1_transform.cpp b/src/odr/internal/font/type1_transform.cpp new file mode 100644 index 00000000..a519d0f3 --- /dev/null +++ b/src/odr/internal/font/type1_transform.cpp @@ -0,0 +1,49 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace odr::internal::font { + +std::string type1::to_cff(const Type1Font &font) { + // Order glyphs with `.notdef` at index 0 (CFF requires it). Translate each + // Type1 charstring to Type2; the width rides in the charstring (the CFF + // builder uses nominalWidthX = 0). + std::vector glyphs; + glyphs.reserve(font.glyphs().size() + 1); + + const auto translate = [&](const Glyph &glyph) { + Type2Charstring t2 = to_type2(glyph.charstring, font.subrs()); + glyphs.push_back({glyph.name, std::move(t2.charstring)}); + }; + + // .notdef first. 
+ std::size_t notdef = font.glyphs().size(); + for (std::size_t i = 0; i < font.glyphs().size(); ++i) { + if (font.glyphs()[i].name == ".notdef") { + notdef = i; + break; + } + } + if (notdef < font.glyphs().size()) { + translate(font.glyphs()[notdef]); + } else { + glyphs.push_back({".notdef", std::string(1, static_cast(14))}); + } + for (std::size_t i = 0; i < font.glyphs().size(); ++i) { + if (i != notdef) { + translate(font.glyphs()[i]); + } + } + + return cff::build_cff(font.name(), glyphs, /*default_width=*/0, + /*nominal_width=*/0, font.font_bbox()); +} + +} // namespace odr::internal::font diff --git a/src/odr/internal/font/type1_transform.hpp b/src/odr/internal/font/type1_transform.hpp new file mode 100644 index 00000000..4a45a81a --- /dev/null +++ b/src/odr/internal/font/type1_transform.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace odr::internal::font::type1 { + +class Type1Font; + +/// Convert a parsed Type1 font to a **bare CFF** font program: translate every +/// glyph's charstring to Type2 (`to_type2`, flattening the font's `/Subrs`) and +/// assemble via the CFF builder, with `.notdef` placed at glyph 0. +/// +/// Returns the CFF bytes (not a `cff::CffFont`): a `CffFont` is the +/// parse-and-keep-the-bytes reader, so producing one means parsing this output +/// back — the caller does that (`CffFont{to_cff(font)}`), then `wrap_to_otf` +/// wraps it for the browser, so an embedded Type1 font reuses the entire 3.4 +/// CFF path. Mirrors `cff::wrap_to_otf`, which likewise emits bytes. +[[nodiscard]] std::string to_cff(const Type1Font &font); + +} // namespace odr::internal::font::type1 diff --git a/src/odr/internal/pdf/AGENTS.md b/src/odr/internal/pdf/AGENTS.md index c11e9c4b..70059c14 100644 --- a/src/odr/internal/pdf/AGENTS.md +++ b/src/odr/internal/pdf/AGENTS.md @@ -116,7 +116,7 @@ Experimental and not production-quality. forms) for composite fonts. 
`Font::advance_width(code)` returns the advance in text-space units (glyph-space / 1000), falling back to `/MissingWidth` or `/DW`. Codes outside the corpus are interpreted as CIDs for composite fonts (identity); - AFM widths for the non-embedded standard-14 fonts are stage 3. + AFM widths for the non-embedded standard-14 fonts are stage 4. - **Embedded font programs** (stage 3.3): a TrueType `/FontFile2` (a simple font, or a composite `CIDFontType2`) is decoded into an `abstract::Font` (`SfntFont`) and held on `Font::embedded_font`; an explicit `/CIDToGIDMap` stream is read into @@ -549,8 +549,9 @@ rendering. (`Identity-V`/CJK, the `/W2`/`/DW2` metrics and a perpendicular pen advance). No corpus fixture needs it yet; tracked under *Other known gaps* alongside the legacy-CJK CMap work (both wait on a real file). -- **Intra-segment glyph shaping** (browser fallback) and **AFM widths for the - non-embedded standard-14 fonts** — folded into **stage 3**. `/BBox` clipping, +- **Intra-segment glyph shaping** (browser fallback) — folded into **stage 3**; + **AFM widths for the non-embedded standard-14 fonts** — folded into **stage 4** + with non-embedded substitution. `/BBox` clipping, `/MCID`-driven structure-tree reordering and `/Alt` (stage 5), and precise baseline placement (needs font ascent metrics) are likewise deferred. @@ -588,8 +589,9 @@ alongside. The embedded-font reverse map (above) reads Unicode from it, the OTF `head`/`hhea`/`hmtx`/`OS/2` from it, the re-encoder assigns PUA code points from its glyph count. 
-**Decision (2026-06-19): standalone-first, uniform PUA, Type3 stays in-stage.** -Three sequencing/scope choices fix the sub-stage plan below: +**Decision (2026-06-19, revised 2026-06-24): standalone-first, uniform PUA; +Type3 + non-embedded deferred to stage 4.** +Two sequencing/scope choices fix the sub-stage plan below: - **Standalone-first.** Fonts ship as a library deliverable — a `FontFile` `DecodedFile` plus a specimen-page HTML view, with font-only tests — *before* being wired into PDF output. The specimen page's glyph grid must show *every* @@ -604,9 +606,12 @@ Three sequencing/scope choices fix the sub-stage plan below: display/text decoupling holds (see *Design decisions*). Runs with no recoverable Unicode are additionally marked non-extractable (`user-select: none`, `aria-hidden`). -- **Type3 stays in stage 3.** Type3 glyphs are drawing procedures, so they need - path → SVG rendering that otherwise belongs to stage 4; a minimal path → SVG - capability is pulled forward into sub-stage 3.6 rather than waiting on stage 4. +- **Type3 + non-embedded deferred to stage 4.** Type3 glyphs are drawing + procedures needing path → SVG rendering, which belongs to stage 4; rather than + pull a minimal path → SVG slice forward into stage 3, Type3 now rides stage + 4's vector machinery. Non-embedded standard-14 substitution (and its AFM + widths) goes with it, so stage 3 ends at the embedded-program flavors + (TrueType / CFF / Type1). **Sub-stages.** Ordered so each lands an independently testable (and, from 3.2, viewable) artifact; the risky core — read a font, produce a sanitizer-clean, @@ -676,11 +681,8 @@ implementation. translation, build a CFF, reuse 3.4's CFF → OTF path. Reverse map via charstring glyph names → AGL. The hardest single piece, but precisely specified (Adobe T1 spec; pdf.js as reference). 
-- **3.6 — Type3 + non-embedded fonts.** Type3 glyph procedures (mini content - streams, already tokenized by the operator parser) → SVG glyphs via a minimal - path → SVG capability pulled forward from stage 4. Plus non-embedded fonts: - substitute the standard 14 + common names with CSS fallbacks and metrics from - `/Widths` (AFM widths for the standard-14 — closes stage 2's deferred item). +Type3 and non-embedded fonts were a sub-stage 3.6 here; they are deferred to +**stage 4** (the path → SVG machinery they need lives there). Stage 3 ends at 3.5. **Mechanisms & guards (ride through 3.1–3.5).** - **Broken-font long tail.** Real embedded fonts are routinely malformed, and @@ -707,6 +709,19 @@ stage exists to avoid. fill rules, stroke parameters, transforms; clipping → nested ``; tiling patterns → `` (form-XObject machinery from stage 2); axial/radial shadings (types 2/3) → `linearGradient`/`radialGradient`. +- **Type3 fonts** (deferred from stage 3): glyph char procs are mini content + streams (already tokenized by the operator parser) → SVG glyphs via the same + path → SVG machinery; each glyph placed at the text transform (CTM × `Tm` × + `/FontMatrix`, font size folded in), Unicode for selection from the stage-1 + chain. No glyph program → no PUA re-encode and no reverse map, so the + dual-layer model holds (SVG glyph layer + transparent Unicode layer). +- **Non-embedded fonts** (deferred from stage 3): a font with no `/FontFile*` is + substituted, not rendered — map the standard 14 (Helvetica/Times/Courier + + Symbol/ZapfDingbats) and common names to CSS `font-family` fallback stacks + (serif/sans/mono by flags + name, bold/italic from `/Flags` and the name); + drive placement from `/Widths`, with **AFM widths** for the standard 14 (which + usually ship none — closes stage 2's deferred item) as a generated data table. + Glyph shapes are the browser's fallback font. 
- **Images**: `DCTDecode` → `` JPEG pass-through; Flate/LZW raster → PNG encode; inline images (`BI`/`ID`/`EI` — currently not even tokenized correctly past `ID`); image masks and SMasks later. diff --git a/src/odr/internal/pdf/pdf_document_parser.cpp b/src/odr/internal/pdf/pdf_document_parser.cpp index cae04371..372db331 100644 --- a/src/odr/internal/pdf/pdf_document_parser.cpp +++ b/src/odr/internal/pdf/pdf_document_parser.cpp @@ -5,6 +5,8 @@ #include #include +#include +#include #include #include #include @@ -14,11 +16,11 @@ #include #include +#include #include #include #include #include -#include #include #include #include @@ -276,9 +278,10 @@ util::math::Transform2D parse_matrix(DocumentParser &parser, Object object) { /// interface: `/FontFile2` (TrueType / `CIDFontType2`) -> `SfntFont`, and /// `/FontFile3` (CFF / `Type1C` / `CIDFontType0C`, or OpenType-CFF) -> either /// an `SfntFont` (when the program is already a full SFNT, `/Subtype -/// /OpenType`) or a bare `CffFont`. `/FontFile` (Type1) is not yet read and -/// leaves `font.embedded_font` null, so such fonts keep rendering through the -/// fallback path. A malformed font is logged and left null. +/// /OpenType`) or a bare `CffFont`. `/FontFile` (Type1) is translated to a CFF +/// (`type1::to_cff`) and read as a `CffFont`, so it reuses the whole CFF path. +/// A malformed font is logged and leaves `font.embedded_font` null, so such +/// fonts keep rendering through the fallback path. 
void load_embedded_font(DocumentParser &parser, const Dictionary &descriptor, Font &font) { try { @@ -286,8 +289,8 @@ void load_embedded_font(DocumentParser &parser, const Dictionary &descriptor, descriptor["FontFile2"].is_reference()) { std::string data = parser.read_decoded_stream(descriptor["FontFile2"].as_reference()); - font.embedded_font = std::make_shared( - std::make_unique(std::move(data))); + font.embedded_font = + std::make_shared(std::move(data)); } else if (descriptor.has_key("FontFile3") && descriptor["FontFile3"].is_reference()) { std::string data = @@ -295,12 +298,21 @@ void load_embedded_font(DocumentParser &parser, const Dictionary &descriptor, // The program may be a full SFNT (`/Subtype /OpenType`) or a bare CFF // (`Type1C` / `CIDFontType0C`); dispatch on the magic. if (font::sfnt::SfntFont::is_sfnt(data)) { - font.embedded_font = std::make_shared( - std::make_unique(std::move(data))); + font.embedded_font = + std::make_shared(std::move(data)); } else { font.embedded_font = std::make_shared(std::move(data)); } + } else if (descriptor.has_key("FontFile") && + descriptor["FontFile"].is_reference()) { + // Type1 (`/FontFile`): translate the font to a CFF, then read it as a + // CffFont so the whole CFF path (re-encode / wrap / reverse map) applies. 
+ const std::string data = + parser.read_decoded_stream(descriptor["FontFile"].as_reference()); + const font::type1::Type1Font type1_font(data); + font.embedded_font = + std::make_shared(font::type1::to_cff(type1_font)); } } catch (const std::exception &e) { ODR_WARNING(parser.logger(), diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 95fe4cf8..975180f4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -55,6 +55,9 @@ add_executable(odr_test "src/internal/pdf/pdf_test_file_builder.cpp" "src/internal/font/cff_font.cpp" + "src/internal/font/type1_charstring.cpp" + "src/internal/font/type1_crypt.cpp" + "src/internal/font/type1_font.cpp" "src/internal/font/sfnt_font.cpp" "src/internal/font/sfnt_transform.cpp" "src/internal/font/font_file.cpp" diff --git a/test/data/reference-output/odr-private b/test/data/reference-output/odr-private index 94645d83..ee4516d9 160000 --- a/test/data/reference-output/odr-private +++ b/test/data/reference-output/odr-private @@ -1 +1 @@ -Subproject commit 94645d834da9efe65c244631248d62f9fae1a2a4 +Subproject commit ee4516d9f60086677262a1e68076e7230bc8c662 diff --git a/test/src/internal/font/cff_font.cpp b/test/src/internal/font/cff_font.cpp index dfd71cf9..9a5b9000 100644 --- a/test/src/internal/font/cff_font.cpp +++ b/test/src/internal/font/cff_font.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -392,6 +393,37 @@ TEST(CffFontTest, IsCffMagic) { EXPECT_FALSE(CffFont::is_cff("not a font")); } +TEST(CffFontTest, BuildCffRoundTripsThroughReader) { + using odr::internal::font::cff::build_cff; + using odr::internal::font::cff::BuilderGlyph; + + // Type2 charstrings: .notdef = endchar; "A" = width-operand 50 then endchar + // (50 -> single byte 50 + 139 = 0xBD; endchar = 0x0E). 
+ std::vector glyphs = { + {".notdef", std::string("\x0e", 1)}, + {"A", std::string("\xbd\x0e", 2)}, + }; + const std::string cff_bytes = + build_cff("MyType1", glyphs, /*default_width=*/0, /*nominal_width=*/100, + FontBBox{0, -200, 700, 800}); + + const CffFont font{cff_bytes}; + EXPECT_EQ(font.format(), FontFormat::cff); + EXPECT_EQ(font.name(), "MyType1"); + EXPECT_EQ(font.glyph_count(), 2); + EXPECT_FALSE(font.is_cid_keyed()); + EXPECT_EQ(font.glyph_name(1), "A"); + EXPECT_EQ(font.bounding_box().x_max, 700); + // explicit charstring width: nominalWidthX (100) + 50. + EXPECT_EQ(font.advance_width(1), 150); + // no explicit width: defaultWidthX (0). + EXPECT_EQ(font.advance_width(0), 0); + + // The built CFF wraps into a loadable OTTO (3.4 path) end to end. + const std::string otf = odr::internal::font::cff::wrap_to_otf(font); + EXPECT_TRUE(odr::internal::font::sfnt::SfntFont::is_sfnt(otf)); +} + TEST(CffFontTest, WrapsToLoadableOtf) { using namespace odr::internal::font; const CffFont cff{build_cff()}; diff --git a/test/src/internal/font/type1_charstring.cpp b/test/src/internal/font/type1_charstring.cpp new file mode 100644 index 00000000..b79e2a22 --- /dev/null +++ b/test/src/internal/font/type1_charstring.cpp @@ -0,0 +1,122 @@ +#include + +#include + +#include +#include + +using namespace odr::internal::font::type1; + +namespace { + +/// Encode an integer in the Type1/Type2 shared number forms (no 28/255 needed +/// for the small values used here). 
+void num(std::string &s, const int v) { + if (v >= -107 && v <= 107) { + s += static_cast(v + 139); + } else if (v >= 108 && v <= 1131) { + const int u = v - 108; + s += static_cast((u >> 8) + 247); + s += static_cast(u & 0xff); + } else if (v >= -1131 && v <= -108) { + const int u = -v - 108; + s += static_cast((u >> 8) + 251); + s += static_cast(u & 0xff); + } +} + +void op(std::string &s, const int o) { s += static_cast(o); } + +} // namespace + +TEST(Type1CharstringTest, HsbwWidthAndSideBearing) { + // sbx=10 wx=200 hsbw 100 0 rmoveto 50 50 rlineto endchar + std::string t1; + num(t1, 10); + num(t1, 200); + op(t1, 13); // hsbw + num(t1, 100); + num(t1, 0); + op(t1, 21); // rmoveto + num(t1, 50); + num(t1, 50); + op(t1, 5); // rlineto + op(t1, 14); // endchar + + const Type2Charstring out = to_type2(t1, {}); + EXPECT_TRUE(out.has_width); + EXPECT_EQ(out.width, 200); + + // Type2: [width 200][dx 100+sbx 10 = 110][dy 0] rmoveto [50][50] rlineto + // endchar. + std::string expected; + num(expected, 200); // width prepended + num(expected, 110); // 100 + side bearing 10 + num(expected, 0); + op(expected, 21); // rmoveto + num(expected, 50); + num(expected, 50); + op(expected, 5); // rlineto + op(expected, 14); // endchar + EXPECT_EQ(out.charstring, expected); +} + +TEST(Type1CharstringTest, FlattensCallSubr) { + // subr 0: 50 50 rlineto return + std::string subr0; + num(subr0, 50); + num(subr0, 50); + op(subr0, 5); // rlineto + op(subr0, 11); // return + + // 0 0 hsbw 0 0 rmoveto 0 callsubr endchar + std::string t1; + num(t1, 0); + num(t1, 0); + op(t1, 13); // hsbw + num(t1, 0); + num(t1, 0); + op(t1, 21); // rmoveto + num(t1, 0); + op(t1, 10); // callsubr 0 + op(t1, 14); // endchar + + const Type2Charstring out = to_type2(t1, {subr0}); + + // The subr's rlineto is inlined; expect width(0) rmoveto, then rlineto, then + // endchar. 
+ std::string expected; + num(expected, 0); // width + num(expected, 0); + num(expected, 0); + op(expected, 21); // rmoveto + num(expected, 50); + num(expected, 50); + op(expected, 5); // rlineto (from subr) + op(expected, 14); // endchar + EXPECT_EQ(out.charstring, expected); +} + +TEST(Type1CharstringTest, FoldsDiv) { + // 0 0 hsbw 600 2 div 0 rmoveto endchar -> dx = 300 + std::string t1; + num(t1, 0); + num(t1, 0); + op(t1, 13); // hsbw + num(t1, 600); + num(t1, 2); + t1 += static_cast(12); + t1 += static_cast(12); // div + num(t1, 0); + op(t1, 21); // rmoveto + op(t1, 14); // endchar + + const Type2Charstring out = to_type2(t1, {}); + std::string expected; + num(expected, 0); // width + num(expected, 300); // 600 / 2 + num(expected, 0); + op(expected, 21); // rmoveto + op(expected, 14); // endchar + EXPECT_EQ(out.charstring, expected); +} diff --git a/test/src/internal/font/type1_crypt.cpp b/test/src/internal/font/type1_crypt.cpp new file mode 100644 index 00000000..7bfbd4d8 --- /dev/null +++ b/test/src/internal/font/type1_crypt.cpp @@ -0,0 +1,65 @@ +#include + +#include + +#include +#include + +using namespace odr::internal::font::type1; + +namespace { + +/// Independent reference implementation of the Type1 *encryption* (the inverse +/// of `decrypt`), so the round-trip tests are not circular: this codes the +/// cipher forwards (plaintext -> ciphertext), `decrypt` codes it backwards. 
+std::string encrypt(const std::string &plain, std::uint16_t r, + const std::string &random_prefix) { + constexpr std::uint16_t c1 = 52845; + constexpr std::uint16_t c2 = 22719; + std::string out; + const std::string full = random_prefix + plain; + for (const char ch : full) { + const auto p = static_cast(ch); + const auto cipher = static_cast(p ^ (r >> 8)); + out += static_cast(cipher); + r = static_cast((cipher + r) * c1 + c2); + } + return out; +} + +} // namespace + +TEST(Type1CryptTest, EexecRoundTrip) { + const std::string plain = "/Private 10 dict dup begin"; + const std::string cipher = encrypt(plain, 55665, "ABCD"); + EXPECT_EQ(decrypt_eexec(cipher), plain); +} + +TEST(Type1CryptTest, CharstringRoundTrip) { + const std::string plain("\x0d\x0e\xff\x00\x01", 5); // hsbw-ish bytes + const std::string cipher = encrypt(plain, 4330, "wxyz"); + EXPECT_EQ(decrypt_charstring(cipher), plain); // default lenIV = 4 +} + +TEST(Type1CryptTest, CharstringHonoursLenIv) { + const std::string plain = "hello"; + const std::string cipher = encrypt(plain, 4330, ""); + EXPECT_EQ(decrypt_charstring(cipher, 0), plain); +} + +TEST(Type1CryptTest, EexecAcceptsHexForm) { + const std::string plain = "dup /CharStrings"; + const std::string binary = encrypt(plain, 55665, "0000"); + // Hex-encode the binary eexec (PFA form), with whitespace the decoder skips. 
+ std::string hex; + const char *digits = "0123456789abcdef"; + for (std::size_t i = 0; i < binary.size(); ++i) { + const auto b = static_cast(binary[i]); + hex += digits[b >> 4]; + hex += digits[b & 0x0f]; + if (i % 8 == 7) { + hex += '\n'; + } + } + EXPECT_EQ(decrypt_eexec(hex), plain); +} diff --git a/test/src/internal/font/type1_font.cpp b/test/src/internal/font/type1_font.cpp new file mode 100644 index 00000000..22155762 --- /dev/null +++ b/test/src/internal/font/type1_font.cpp @@ -0,0 +1,134 @@ +#include +#include + +#include +#include +#include + +#include + +#include +#include + +using namespace odr::internal::font::type1; + +namespace { + +/// Forward Type1 cipher (the inverse of `decrypt`), so the test builds a real +/// encrypted program rather than trusting the decryptor. +std::string encrypt(const std::string &plain, std::uint16_t r, + const std::string &random_prefix) { + constexpr std::uint16_t c1 = 52845; + constexpr std::uint16_t c2 = 22719; + std::string out; + for (const char ch : random_prefix + plain) { + const auto p = static_cast(ch); + const auto cipher = static_cast(p ^ (r >> 8)); + out += static_cast(cipher); + r = static_cast((cipher + r) * c1 + c2); + } + return out; +} + +/// A `/name len RD ND` charstring entry, the charstring encrypted with +/// the charstring key (4330) and a 4-byte lenIV prefix. +std::string charstring_entry(const std::string &name, + const std::string &plain_charstring) { + // The 4-byte lenIV prefix must be a real 4 NUL bytes — a "\x00\x00\x00\x00" + // string literal would be empty (the first NUL terminates it). + const std::string enc = encrypt(plain_charstring, 4330, std::string(4, '\0')); + return "/" + name + " " + std::to_string(enc.size()) + " RD " + enc + " ND\n"; +} + +/// Assemble a minimal but well-formed Type1 program: a clear header (with a +/// custom /Encoding and /FontMatrix) and an eexec-encrypted private section +/// holding two glyphs and one subr. 
+std::string build_type1() { + std::string clear = "%!PS-AdobeFont-1.0: TestType1 001.000\n" + "/FontName /TestType1 def\n" + "/FontMatrix [0.001 0 0 0.001 0 0] readonly def\n" + "/FontBBox {0 -200 700 800} readonly def\n" + "/Encoding 256 array\n" + "0 1 255 {1 index exch /.notdef put} for\n" + "dup 65 /A put\n" + "dup 66 /B put\n" + "readonly def\n" + "currentdict end\n" + "currentfile eexec\n"; + + std::string private_section = "dup /Private 16 dict dup begin\n" + "/lenIV 4 def\n" + "/Subrs 1 array\n"; + private_section += "dup 0 "; + { + const std::string subr = + encrypt(std::string("\x0b", 1), 4330, std::string(4, '\0')); // return + private_section += std::to_string(subr.size()) + " RD " + subr + " NP\n"; + } + private_section += "ND\n" + "2 index /CharStrings 2 dict dup begin\n"; + // .notdef-ish + two named glyphs. Charstring bytes are arbitrary here: the + // parser does not interpret them, it only extracts them. + private_section += charstring_entry("A", std::string("\x8b\x8b\x0d\x0e", 4)); + private_section += charstring_entry("B", std::string("\xf0\x0d\x0e", 3)); + private_section += "end\nend\n"; + + std::string program = clear; + program += encrypt(private_section, 55665, "wxyz"); + // Trailer (would be 512 zeros + cleartomark in a real font); the parser + // tolerates trailing data, so a short stub is enough. 
+ program += std::string(8, '\0'); + return program; +} + +} // namespace + +TEST(Type1FontTest, IsType1Magic) { + EXPECT_TRUE(Type1Font::is_type1(build_type1())); + EXPECT_FALSE(Type1Font::is_type1("not a font program at all")); +} + +TEST(Type1FontTest, ParsesHeaderAndEncoding) { + const Type1Font font{build_type1()}; + + EXPECT_EQ(font.name(), "TestType1"); + EXPECT_FALSE(font.standard_encoding()); + EXPECT_DOUBLE_EQ(font.font_matrix().a, 0.001); + EXPECT_DOUBLE_EQ(font.font_matrix().d, 0.001); + EXPECT_EQ(font.font_bbox().y_min, -200); + EXPECT_EQ(font.font_bbox().x_max, 700); + + EXPECT_EQ(font.encoding().at(65), "A"); + EXPECT_EQ(font.encoding().at(66), "B"); +} + +TEST(Type1FontTest, DecryptsCharstringsAndSubrs) { + const Type1Font font{build_type1()}; + + ASSERT_EQ(font.glyphs().size(), 2u); + EXPECT_EQ(font.glyphs()[0].name, "A"); + EXPECT_EQ(font.glyphs()[0].charstring, std::string("\x8b\x8b\x0d\x0e", 4)); + EXPECT_EQ(font.glyphs()[1].name, "B"); + EXPECT_EQ(font.glyphs()[1].charstring, std::string("\xf0\x0d\x0e", 3)); + + ASSERT_EQ(font.subrs().size(), 1u); + EXPECT_EQ(font.subrs()[0], std::string("\x0b", 1)); // return +} + +TEST(Type1FontTest, ConvertsToLoadableCff) { + namespace cff = odr::internal::font::cff; + namespace sfnt = odr::internal::font::sfnt; + + const Type1Font type1_font{build_type1()}; + const std::string cff_bytes = to_cff(type1_font); + + const cff::CffFont font{cff_bytes}; + EXPECT_EQ(font.format(), odr::FontFormat::cff); + // .notdef (synthesized, since the test font has none) + A + B. + EXPECT_EQ(font.glyph_count(), 3); + EXPECT_EQ(font.glyph_name(1), "A"); + EXPECT_EQ(font.glyph_name(2), "B"); + + // The converted CFF wraps into a browser-loadable OTTO (the 3.4 path). + EXPECT_TRUE(sfnt::SfntFont::is_sfnt(cff::wrap_to_otf(font))); +}