diff --git a/Cargo.lock b/Cargo.lock index b6456c841063..12be6b1cc435 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -240,9 +240,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.37" +version = "1.2.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65193589c6404eb80b450d618eaf9a2cafaaafd57ecce47370519ef674a7bd44" +checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" dependencies = [ "find-msvc-tools", "jobserver", @@ -416,6 +416,7 @@ dependencies = [ "tree-sitter", "tree-sitter-json", "tree-sitter-ql", + "yeast", "zstd", ] @@ -754,9 +755,9 @@ dependencies = [ [[package]] name = "find-msvc-tools" -version = "0.1.1" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "fixedbitset" @@ -2853,9 +2854,9 @@ dependencies = [ [[package]] name = "tree-sitter" -version = "0.25.9" +version = "0.26.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccd2a058a86cfece0bf96f7cce1021efef9c8ed0e892ab74639173e5ed7a34fa" +checksum = "887bd495d0582c5e3e0d8ece2233666169fa56a9644d172fc22ad179ab2d0538" dependencies = [ "cc", "regex", @@ -2891,6 +2892,16 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8" +[[package]] +name = "tree-sitter-python" +version = "0.23.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-ql" version = "0.23.1" @@ -3367,6 +3378,29 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" +[[package]] +name = "yeast" +version = "0.1.0" +dependencies = [ + "clap", + "serde", + "serde_json", + "serde_yaml", + "tree-sitter", + "tree-sitter-python", + "tree-sitter-ruby", + "yeast-macros", +] + +[[package]] +name = "yeast-macros" +version = "0.1.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "yoke" version = "0.8.0" diff --git a/Cargo.toml b/Cargo.toml index 58a755340b9c..1e2be0d9ca56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,8 @@ resolver = "2" members = [ "shared/tree-sitter-extractor", + "shared/yeast", + "shared/yeast-macros", "ruby/extractor", "rust/extractor", "rust/extractor/macros", diff --git a/MODULE.bazel b/MODULE.bazel index 16b4a4691f8a..e7474e9a393f 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -141,14 +141,16 @@ use_repo( "vendor_ts__serde-1.0.228", "vendor_ts__serde_json-1.0.145", "vendor_ts__serde_with-3.14.1", + "vendor_ts__serde_yaml-0.9.34-deprecated", "vendor_ts__syn-2.0.106", "vendor_ts__toml-0.9.7", "vendor_ts__tracing-0.1.41", "vendor_ts__tracing-flame-0.2.0", "vendor_ts__tracing-subscriber-0.3.20", - "vendor_ts__tree-sitter-0.25.9", + "vendor_ts__tree-sitter-0.26.8", "vendor_ts__tree-sitter-embedded-template-0.25.0", "vendor_ts__tree-sitter-json-0.24.8", + "vendor_ts__tree-sitter-python-0.23.6", "vendor_ts__tree-sitter-ql-0.23.1", "vendor_ts__tree-sitter-ruby-0.23.1", "vendor_ts__triomphe-0.1.14", diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.bazel index a5cfeccdcea8..27d36c221ea4 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.bazel @@ -529,6 +529,18 @@ alias( tags = ["manual"], ) +alias( + name = "serde_yaml-0.9.34+deprecated", + actual = "@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml", + tags = ["manual"], +) + +alias( + name = "serde_yaml", 
+ actual = "@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml", + tags = ["manual"], +) + alias( name = "syn-2.0.106", actual = "@vendor_ts__syn-2.0.106//:syn", @@ -590,14 +602,14 @@ alias( ) alias( - name = "tree-sitter-0.25.9", - actual = "@vendor_ts__tree-sitter-0.25.9//:tree_sitter", + name = "tree-sitter-0.26.8", + actual = "@vendor_ts__tree-sitter-0.26.8//:tree_sitter", tags = ["manual"], ) alias( name = "tree-sitter", - actual = "@vendor_ts__tree-sitter-0.25.9//:tree_sitter", + actual = "@vendor_ts__tree-sitter-0.26.8//:tree_sitter", tags = ["manual"], ) @@ -625,6 +637,18 @@ alias( tags = ["manual"], ) +alias( + name = "tree-sitter-python-0.23.6", + actual = "@vendor_ts__tree-sitter-python-0.23.6//:tree_sitter_python", + tags = ["manual"], +) + +alias( + name = "tree-sitter-python", + actual = "@vendor_ts__tree-sitter-python-0.23.6//:tree_sitter_python", + tags = ["manual"], +) + alias( name = "tree-sitter-ql-0.23.1", actual = "@vendor_ts__tree-sitter-ql-0.23.1//:tree_sitter_ql", diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.cc-1.2.37.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.cc-1.2.61.bazel similarity index 98% rename from misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.cc-1.2.37.bazel rename to misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.cc-1.2.61.bazel index c747c1c3c4fc..4071cd592439 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.cc-1.2.37.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.cc-1.2.61.bazel @@ -96,9 +96,9 @@ rust_library( "@rules_rust//rust/platform:x86_64-unknown-uefi": [], "//conditions:default": ["@platforms//:incompatible"], }), - version = "1.2.37", + version = "1.2.61", deps = [ - "@vendor_ts__find-msvc-tools-0.1.1//:find_msvc_tools", + "@vendor_ts__find-msvc-tools-0.1.9//:find_msvc_tools", "@vendor_ts__jobserver-0.1.34//:jobserver", "@vendor_ts__shlex-1.3.0//:shlex", ] + select({ diff --git 
a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.find-msvc-tools-0.1.1.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.find-msvc-tools-0.1.9.bazel similarity index 99% rename from misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.find-msvc-tools-0.1.1.bazel rename to misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.find-msvc-tools-0.1.9.bazel index 8fc8c9a81e26..12cd2ecbe0cc 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.find-msvc-tools-0.1.1.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.find-msvc-tools-0.1.9.bazel @@ -93,5 +93,5 @@ rust_library( "@rules_rust//rust/platform:x86_64-unknown-uefi": [], "//conditions:default": ["@platforms//:incompatible"], }), - version = "0.1.1", + version = "0.1.9", ) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.iana-time-zone-haiku-0.1.2.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.iana-time-zone-haiku-0.1.2.bazel index fe120c046896..582c31e06fb7 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.iana-time-zone-haiku-0.1.2.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.iana-time-zone-haiku-0.1.2.bazel @@ -154,7 +154,7 @@ cargo_build_script( version = "0.1.2", visibility = ["//visibility:private"], deps = [ - "@vendor_ts__cc-1.2.37//:cc", + "@vendor_ts__cc-1.2.61//:cc", ], ) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-0.25.9.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-0.26.8.bazel similarity index 97% rename from misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-0.25.9.bazel rename to misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-0.26.8.bazel index fd97779c54c6..98cdfaa4d2fd 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-0.25.9.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-0.26.8.bazel @@ -101,12 +101,12 @@ rust_library( 
"@rules_rust//rust/platform:x86_64-unknown-uefi": [], "//conditions:default": ["@platforms//:incompatible"], }), - version = "0.25.9", + version = "0.26.8", deps = [ "@vendor_ts__regex-1.11.3//:regex", "@vendor_ts__regex-syntax-0.8.6//:regex_syntax", "@vendor_ts__streaming-iterator-0.1.9//:streaming_iterator", - "@vendor_ts__tree-sitter-0.25.9//:build_script_build", + "@vendor_ts__tree-sitter-0.26.8//:build_script_build", "@vendor_ts__tree-sitter-language-0.1.5//:tree_sitter_language", ], ) @@ -164,10 +164,10 @@ cargo_build_script( "noclippy", "norustfmt", ], - version = "0.25.9", + version = "0.26.8", visibility = ["//visibility:private"], deps = [ - "@vendor_ts__cc-1.2.37//:cc", + "@vendor_ts__cc-1.2.61//:cc", "@vendor_ts__serde_json-1.0.145//:serde_json", ], ) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-embedded-template-0.25.0.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-embedded-template-0.25.0.bazel index c9dd60b03c0e..57cf6cd860bf 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-embedded-template-0.25.0.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-embedded-template-0.25.0.bazel @@ -155,7 +155,7 @@ cargo_build_script( version = "0.25.0", visibility = ["//visibility:private"], deps = [ - "@vendor_ts__cc-1.2.37//:cc", + "@vendor_ts__cc-1.2.61//:cc", ], ) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-json-0.24.8.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-json-0.24.8.bazel index 6b9a7bf0582b..245539d02e73 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-json-0.24.8.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-json-0.24.8.bazel @@ -155,7 +155,7 @@ cargo_build_script( version = "0.24.8", visibility = ["//visibility:private"], deps = [ - "@vendor_ts__cc-1.2.37//:cc", + "@vendor_ts__cc-1.2.61//:cc", ], ) diff --git 
a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-python-0.23.6.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-python-0.23.6.bazel new file mode 100644 index 000000000000..fc7a098193a4 --- /dev/null +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-python-0.23.6.bazel @@ -0,0 +1,166 @@ +############################################################################### +# @generated +# DO NOT MODIFY: This file is auto-generated by a crate_universe tool. To +# regenerate this file, run the following: +# +# bazel run @@//misc/bazel/3rdparty:vendor_tree_sitter_extractors +############################################################################### + +load( + "@rules_rust//cargo:defs.bzl", + "cargo_build_script", + "cargo_toml_env_vars", +) +load("@rules_rust//rust:defs.bzl", "rust_library") + +package(default_visibility = ["//visibility:public"]) + +cargo_toml_env_vars( + name = "cargo_toml_env_vars", + src = "Cargo.toml", +) + +rust_library( + name = "tree_sitter_python", + srcs = glob( + include = ["**/*.rs"], + allow_empty = True, + ), + compile_data = glob( + include = ["**"], + allow_empty = True, + exclude = [ + "**/* *", + ".tmp_git_root/**/*", + "BUILD", + "BUILD.bazel", + "WORKSPACE", + "WORKSPACE.bazel", + ], + ), + crate_root = "bindings/rust/lib.rs", + edition = "2021", + rustc_env_files = [ + ":cargo_toml_env_vars", + ], + rustc_flags = [ + "--cap-lints=allow", + ], + tags = [ + "cargo-bazel", + "crate-name=tree-sitter-python", + "manual", + "noclippy", + "norustfmt", + ], + target_compatible_with = select({ + "@rules_rust//rust/platform:aarch64-apple-darwin": [], + "@rules_rust//rust/platform:aarch64-apple-ios": [], + "@rules_rust//rust/platform:aarch64-apple-ios-sim": [], + "@rules_rust//rust/platform:aarch64-linux-android": [], + "@rules_rust//rust/platform:aarch64-pc-windows-msvc": [], + "@rules_rust//rust/platform:aarch64-unknown-fuchsia": [], + 
"@rules_rust//rust/platform:aarch64-unknown-linux-gnu": [], + "@rules_rust//rust/platform:aarch64-unknown-nixos-gnu": [], + "@rules_rust//rust/platform:aarch64-unknown-nto-qnx710": [], + "@rules_rust//rust/platform:aarch64-unknown-uefi": [], + "@rules_rust//rust/platform:arm-unknown-linux-gnueabi": [], + "@rules_rust//rust/platform:arm-unknown-linux-musleabi": [], + "@rules_rust//rust/platform:armv7-linux-androideabi": [], + "@rules_rust//rust/platform:armv7-unknown-linux-gnueabi": [], + "@rules_rust//rust/platform:i686-apple-darwin": [], + "@rules_rust//rust/platform:i686-linux-android": [], + "@rules_rust//rust/platform:i686-pc-windows-msvc": [], + "@rules_rust//rust/platform:i686-unknown-freebsd": [], + "@rules_rust//rust/platform:i686-unknown-linux-gnu": [], + "@rules_rust//rust/platform:powerpc-unknown-linux-gnu": [], + "@rules_rust//rust/platform:riscv32imc-unknown-none-elf": [], + "@rules_rust//rust/platform:riscv64gc-unknown-linux-gnu": [], + "@rules_rust//rust/platform:riscv64gc-unknown-none-elf": [], + "@rules_rust//rust/platform:s390x-unknown-linux-gnu": [], + "@rules_rust//rust/platform:thumbv7em-none-eabi": [], + "@rules_rust//rust/platform:thumbv8m.main-none-eabi": [], + "@rules_rust//rust/platform:wasm32-unknown-emscripten": [], + "@rules_rust//rust/platform:wasm32-unknown-unknown": [], + "@rules_rust//rust/platform:wasm32-wasip1": [], + "@rules_rust//rust/platform:wasm32-wasip1-threads": [], + "@rules_rust//rust/platform:wasm32-wasip2": [], + "@rules_rust//rust/platform:x86_64-apple-darwin": [], + "@rules_rust//rust/platform:x86_64-apple-ios": [], + "@rules_rust//rust/platform:x86_64-linux-android": [], + "@rules_rust//rust/platform:x86_64-pc-windows-msvc": [], + "@rules_rust//rust/platform:x86_64-unknown-freebsd": [], + "@rules_rust//rust/platform:x86_64-unknown-fuchsia": [], + "@rules_rust//rust/platform:x86_64-unknown-linux-gnu": [], + "@rules_rust//rust/platform:x86_64-unknown-nixos-gnu": [], + "@rules_rust//rust/platform:x86_64-unknown-none": 
[], + "@rules_rust//rust/platform:x86_64-unknown-uefi": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + version = "0.23.6", + deps = [ + "@vendor_ts__tree-sitter-language-0.1.5//:tree_sitter_language", + "@vendor_ts__tree-sitter-python-0.23.6//:build_script_build", + ], +) + +cargo_build_script( + name = "_bs", + srcs = glob( + include = ["**/*.rs"], + allow_empty = True, + ), + compile_data = glob( + include = ["**"], + allow_empty = True, + exclude = [ + "**/* *", + "**/*.rs", + ".tmp_git_root/**/*", + "BUILD", + "BUILD.bazel", + "WORKSPACE", + "WORKSPACE.bazel", + ], + ), + crate_name = "build_script_build", + crate_root = "bindings/rust/build.rs", + data = glob( + include = ["**"], + allow_empty = True, + exclude = [ + "**/* *", + ".tmp_git_root/**/*", + "BUILD", + "BUILD.bazel", + "WORKSPACE", + "WORKSPACE.bazel", + ], + ), + edition = "2021", + pkg_name = "tree-sitter-python", + rustc_env_files = [ + ":cargo_toml_env_vars", + ], + rustc_flags = [ + "--cap-lints=allow", + ], + tags = [ + "cargo-bazel", + "crate-name=tree-sitter-python", + "manual", + "noclippy", + "norustfmt", + ], + version = "0.23.6", + visibility = ["//visibility:private"], + deps = [ + "@vendor_ts__cc-1.2.61//:cc", + ], +) + +alias( + name = "build_script_build", + actual = ":_bs", + tags = ["manual"], +) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ql-0.23.1.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ql-0.23.1.bazel index 0b7ce3a9a29c..6d68d04cf007 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ql-0.23.1.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ql-0.23.1.bazel @@ -155,7 +155,7 @@ cargo_build_script( version = "0.23.1", visibility = ["//visibility:private"], deps = [ - "@vendor_ts__cc-1.2.37//:cc", + "@vendor_ts__cc-1.2.61//:cc", ], ) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ruby-0.23.1.bazel 
b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ruby-0.23.1.bazel index f939b4b9493c..855b75133117 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ruby-0.23.1.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.tree-sitter-ruby-0.23.1.bazel @@ -155,7 +155,7 @@ cargo_build_script( version = "0.23.1", visibility = ["//visibility:private"], deps = [ - "@vendor_ts__cc-1.2.37//:cc", + "@vendor_ts__cc-1.2.61//:cc", ], ) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.zstd-sys-2.0.16+zstd.1.5.7.bazel b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.zstd-sys-2.0.16+zstd.1.5.7.bazel index 32a06f947886..f2a70faf4640 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.zstd-sys-2.0.16+zstd.1.5.7.bazel +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/BUILD.zstd-sys-2.0.16+zstd.1.5.7.bazel @@ -165,7 +165,7 @@ cargo_build_script( version = "2.0.16+zstd.1.5.7", visibility = ["//visibility:private"], deps = [ - "@vendor_ts__cc-1.2.37//:cc", + "@vendor_ts__cc-1.2.61//:cc", "@vendor_ts__pkg-config-0.3.32//:pkg_config", ], ) diff --git a/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl b/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl index bf11bc6c82fa..d1da77819f3d 100644 --- a/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl +++ b/misc/bazel/3rdparty/tree_sitter_extractors_deps/defs.bzl @@ -303,7 +303,7 @@ _NORMAL_DEPENDENCIES = { "serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"), "tracing": Label("@vendor_ts__tracing-0.1.41//:tracing"), "tracing-subscriber": Label("@vendor_ts__tracing-subscriber-0.3.20//:tracing_subscriber"), - "tree-sitter": Label("@vendor_ts__tree-sitter-0.25.9//:tree_sitter"), + "tree-sitter": Label("@vendor_ts__tree-sitter-0.26.8//:tree_sitter"), "tree-sitter-embedded-template": Label("@vendor_ts__tree-sitter-embedded-template-0.25.0//:tree_sitter_embedded_template"), "tree-sitter-ruby": 
Label("@vendor_ts__tree-sitter-ruby-0.23.1//:tree_sitter_ruby"), }, @@ -381,10 +381,28 @@ _NORMAL_DEPENDENCIES = { "serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"), "tracing": Label("@vendor_ts__tracing-0.1.41//:tracing"), "tracing-subscriber": Label("@vendor_ts__tracing-subscriber-0.3.20//:tracing_subscriber"), - "tree-sitter": Label("@vendor_ts__tree-sitter-0.25.9//:tree_sitter"), + "tree-sitter": Label("@vendor_ts__tree-sitter-0.26.8//:tree_sitter"), "zstd": Label("@vendor_ts__zstd-0.13.3//:zstd"), }, }, + "shared/yeast": { + _COMMON_CONDITION: { + "clap": Label("@vendor_ts__clap-4.5.48//:clap"), + "serde": Label("@vendor_ts__serde-1.0.228//:serde"), + "serde_json": Label("@vendor_ts__serde_json-1.0.145//:serde_json"), + "serde_yaml": Label("@vendor_ts__serde_yaml-0.9.34-deprecated//:serde_yaml"), + "tree-sitter": Label("@vendor_ts__tree-sitter-0.26.8//:tree_sitter"), + "tree-sitter-python": Label("@vendor_ts__tree-sitter-python-0.23.6//:tree_sitter_python"), + "tree-sitter-ruby": Label("@vendor_ts__tree-sitter-ruby-0.23.1//:tree_sitter_ruby"), + }, + }, + "shared/yeast-macros": { + _COMMON_CONDITION: { + "proc-macro2": Label("@vendor_ts__proc-macro2-1.0.101//:proc_macro2"), + "quote": Label("@vendor_ts__quote-1.0.41//:quote"), + "syn": Label("@vendor_ts__syn-2.0.106//:syn"), + }, + }, } _NORMAL_ALIASES = { @@ -411,6 +429,14 @@ _NORMAL_ALIASES = { _COMMON_CONDITION: { }, }, + "shared/yeast": { + _COMMON_CONDITION: { + }, + }, + "shared/yeast-macros": { + _COMMON_CONDITION: { + }, + }, } _NORMAL_DEV_DEPENDENCIES = { @@ -431,6 +457,10 @@ _NORMAL_DEV_DEPENDENCIES = { "tree-sitter-ql": Label("@vendor_ts__tree-sitter-ql-0.23.1//:tree_sitter_ql"), }, }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _NORMAL_DEV_ALIASES = { @@ -448,6 +478,10 @@ _NORMAL_DEV_ALIASES = { _COMMON_CONDITION: { }, }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _PROC_MACRO_DEPENDENCIES = { @@ -463,6 +497,10 @@ _PROC_MACRO_DEPENDENCIES = { }, 
"shared/tree-sitter-extractor": { }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _PROC_MACRO_ALIASES = { @@ -478,6 +516,10 @@ _PROC_MACRO_ALIASES = { }, "shared/tree-sitter-extractor": { }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _PROC_MACRO_DEV_DEPENDENCIES = { @@ -493,6 +535,10 @@ _PROC_MACRO_DEV_DEPENDENCIES = { }, "shared/tree-sitter-extractor": { }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _PROC_MACRO_DEV_ALIASES = { @@ -510,6 +556,10 @@ _PROC_MACRO_DEV_ALIASES = { _COMMON_CONDITION: { }, }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _BUILD_DEPENDENCIES = { @@ -525,6 +575,10 @@ _BUILD_DEPENDENCIES = { }, "shared/tree-sitter-extractor": { }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _BUILD_ALIASES = { @@ -540,6 +594,10 @@ _BUILD_ALIASES = { }, "shared/tree-sitter-extractor": { }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _BUILD_PROC_MACRO_DEPENDENCIES = { @@ -555,6 +613,10 @@ _BUILD_PROC_MACRO_DEPENDENCIES = { }, "shared/tree-sitter-extractor": { }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _BUILD_PROC_MACRO_ALIASES = { @@ -570,6 +632,10 @@ _BUILD_PROC_MACRO_ALIASES = { }, "shared/tree-sitter-extractor": { }, + "shared/yeast": { + }, + "shared/yeast-macros": { + }, } _CONDITIONS = { @@ -923,12 +989,12 @@ def crate_repositories(): maybe( http_archive, - name = "vendor_ts__cc-1.2.37", - sha256 = "65193589c6404eb80b450d618eaf9a2cafaaafd57ecce47370519ef674a7bd44", + name = "vendor_ts__cc-1.2.61", + sha256 = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d", type = "tar.gz", - urls = ["https://static.crates.io/crates/cc/1.2.37/download"], - strip_prefix = "cc-1.2.37", - build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.cc-1.2.37.bazel"), + urls = ["https://static.crates.io/crates/cc/1.2.61/download"], + strip_prefix = "cc-1.2.61", + build_file = 
Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.cc-1.2.61.bazel"), ) maybe( @@ -1373,12 +1439,12 @@ def crate_repositories(): maybe( http_archive, - name = "vendor_ts__find-msvc-tools-0.1.1", - sha256 = "7fd99930f64d146689264c637b5af2f0233a933bef0d8570e2526bf9e083192d", + name = "vendor_ts__find-msvc-tools-0.1.9", + sha256 = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582", type = "tar.gz", - urls = ["https://static.crates.io/crates/find-msvc-tools/0.1.1/download"], - strip_prefix = "find-msvc-tools-0.1.1", - build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.find-msvc-tools-0.1.1.bazel"), + urls = ["https://static.crates.io/crates/find-msvc-tools/0.1.9/download"], + strip_prefix = "find-msvc-tools-0.1.9", + build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.find-msvc-tools-0.1.9.bazel"), ) maybe( @@ -3363,12 +3429,12 @@ def crate_repositories(): maybe( http_archive, - name = "vendor_ts__tree-sitter-0.25.9", - sha256 = "ccd2a058a86cfece0bf96f7cce1021efef9c8ed0e892ab74639173e5ed7a34fa", + name = "vendor_ts__tree-sitter-0.26.8", + sha256 = "887bd495d0582c5e3e0d8ece2233666169fa56a9644d172fc22ad179ab2d0538", type = "tar.gz", - urls = ["https://static.crates.io/crates/tree-sitter/0.25.9/download"], - strip_prefix = "tree-sitter-0.25.9", - build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-0.25.9.bazel"), + urls = ["https://static.crates.io/crates/tree-sitter/0.26.8/download"], + strip_prefix = "tree-sitter-0.26.8", + build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-0.26.8.bazel"), ) maybe( @@ -3401,6 +3467,16 @@ def crate_repositories(): build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-language-0.1.5.bazel"), ) + maybe( + http_archive, + name = "vendor_ts__tree-sitter-python-0.23.6", + sha256 = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04", + type = 
"tar.gz", + urls = ["https://static.crates.io/crates/tree-sitter-python/0.23.6/download"], + strip_prefix = "tree-sitter-python-0.23.6", + build_file = Label("//misc/bazel/3rdparty/tree_sitter_extractors_deps:BUILD.tree-sitter-python-0.23.6.bazel"), + ) + maybe( http_archive, name = "vendor_ts__tree-sitter-ql-0.23.1", @@ -4152,13 +4228,15 @@ def crate_repositories(): struct(repo = "vendor_ts__serde-1.0.228", is_dev_dep = False), struct(repo = "vendor_ts__serde_json-1.0.145", is_dev_dep = False), struct(repo = "vendor_ts__serde_with-3.14.1", is_dev_dep = False), + struct(repo = "vendor_ts__serde_yaml-0.9.34-deprecated", is_dev_dep = False), struct(repo = "vendor_ts__syn-2.0.106", is_dev_dep = False), struct(repo = "vendor_ts__toml-0.9.7", is_dev_dep = False), struct(repo = "vendor_ts__tracing-0.1.41", is_dev_dep = False), struct(repo = "vendor_ts__tracing-flame-0.2.0", is_dev_dep = False), struct(repo = "vendor_ts__tracing-subscriber-0.3.20", is_dev_dep = False), - struct(repo = "vendor_ts__tree-sitter-0.25.9", is_dev_dep = False), + struct(repo = "vendor_ts__tree-sitter-0.26.8", is_dev_dep = False), struct(repo = "vendor_ts__tree-sitter-embedded-template-0.25.0", is_dev_dep = False), + struct(repo = "vendor_ts__tree-sitter-python-0.23.6", is_dev_dep = False), struct(repo = "vendor_ts__tree-sitter-ruby-0.23.1", is_dev_dep = False), struct(repo = "vendor_ts__triomphe-0.1.14", is_dev_dep = False), struct(repo = "vendor_ts__ungrammar-1.16.1", is_dev_dep = False), diff --git a/ql/extractor/src/extractor.rs b/ql/extractor/src/extractor.rs index 8383d8424eee..66096442e85f 100644 --- a/ql/extractor/src/extractor.rs +++ b/ql/extractor/src/extractor.rs @@ -29,24 +29,28 @@ pub fn run(options: Options) -> std::io::Result<()> { prefix: "ql", ts_language: tree_sitter_ql::LANGUAGE.into(), node_types: tree_sitter_ql::NODE_TYPES, + desugar: None, file_globs: vec!["*.ql".into(), "*.qll".into()], }, simple::LanguageSpec { prefix: "dbscheme", ts_language: 
tree_sitter_ql_dbscheme::LANGUAGE.into(), node_types: tree_sitter_ql_dbscheme::NODE_TYPES, + desugar: None, file_globs: vec!["*.dbscheme".into()], }, simple::LanguageSpec { prefix: "json", ts_language: tree_sitter_json::LANGUAGE.into(), node_types: tree_sitter_json::NODE_TYPES, + desugar: None, file_globs: vec!["*.json".into(), "*.jsonl".into(), "*.jsonc".into()], }, simple::LanguageSpec { prefix: "blame", ts_language: tree_sitter_blame::LANGUAGE.into(), node_types: tree_sitter_blame::NODE_TYPES, + desugar: None, file_globs: vec!["*.blame".into()], }, ], diff --git a/ql/extractor/src/generator.rs b/ql/extractor/src/generator.rs index 650e11c138b8..96ce5319dd19 100644 --- a/ql/extractor/src/generator.rs +++ b/ql/extractor/src/generator.rs @@ -21,18 +21,22 @@ pub fn run(options: Options) -> std::io::Result<()> { Language { name: "QL".to_owned(), node_types: tree_sitter_ql::NODE_TYPES, + desugar: None, }, Language { name: "Dbscheme".to_owned(), node_types: tree_sitter_ql_dbscheme::NODE_TYPES, + desugar: None, }, Language { name: "Blame".to_owned(), node_types: tree_sitter_blame::NODE_TYPES, + desugar: None, }, Language { name: "JSON".to_owned(), node_types: tree_sitter_json::NODE_TYPES, + desugar: None, }, ]; diff --git a/ruby/extractor/src/extractor.rs b/ruby/extractor/src/extractor.rs index 6807d09e9bec..4849f473ccbc 100644 --- a/ruby/extractor/src/extractor.rs +++ b/ruby/extractor/src/extractor.rs @@ -123,6 +123,7 @@ pub fn run(options: Options) -> std::io::Result<()> { &path, &source, &[], + None, ); let (ranges, line_breaks) = scan_erb( @@ -211,6 +212,7 @@ pub fn run(options: Options) -> std::io::Result<()> { &path, &source, &code_ranges, + None, ); std::fs::create_dir_all(src_archive_file.parent().unwrap())?; if needs_conversion { diff --git a/ruby/extractor/src/generator.rs b/ruby/extractor/src/generator.rs index de1d0dbfd7e3..0430afd103e7 100644 --- a/ruby/extractor/src/generator.rs +++ b/ruby/extractor/src/generator.rs @@ -21,10 +21,12 @@ pub fn run(options: 
Options) -> std::io::Result<()> { Language { name: "Ruby".to_owned(), node_types: tree_sitter_ruby::NODE_TYPES, + desugar: None, }, Language { name: "Erb".to_owned(), node_types: tree_sitter_embedded_template::NODE_TYPES, + desugar: None, }, ]; diff --git a/shared/tree-sitter-extractor/BUILD.bazel b/shared/tree-sitter-extractor/BUILD.bazel index ee19d8e5c778..5539641e90e9 100644 --- a/shared/tree-sitter-extractor/BUILD.bazel +++ b/shared/tree-sitter-extractor/BUILD.bazel @@ -12,7 +12,9 @@ rust_library( compile_data = [ "src/generator/prefix.dbscheme", ], - deps = all_crate_deps(), + deps = all_crate_deps() + [ + "//shared/yeast", + ], ) alias( diff --git a/shared/tree-sitter-extractor/Cargo.toml b/shared/tree-sitter-extractor/Cargo.toml index d02f02fd5888..1ad18a6df5a5 100644 --- a/shared/tree-sitter-extractor/Cargo.toml +++ b/shared/tree-sitter-extractor/Cargo.toml @@ -20,6 +20,7 @@ serde_json = "1.0" chrono = { version = "0.4.42", features = ["serde"] } num_cpus = "1.17.0" zstd = "0.13.3" +yeast = { path = "../yeast" } [dev-dependencies] tree-sitter-ql = "0.23.1" diff --git a/shared/tree-sitter-extractor/src/extractor/mod.rs b/shared/tree-sitter-extractor/src/extractor/mod.rs index 0ace38318810..0c3e13660817 100644 --- a/shared/tree-sitter-extractor/src/extractor/mod.rs +++ b/shared/tree-sitter-extractor/src/extractor/mod.rs @@ -18,6 +18,82 @@ use tree_sitter::{Language, Node, Parser, Range, Tree}; pub mod simple; +/// Trait abstracting over tree-sitter and yeast node types for extraction. +trait AstNode { + fn kind(&self) -> &str; + fn is_named(&self) -> bool; + fn is_missing(&self) -> bool; + fn is_error(&self) -> bool; + fn is_extra(&self) -> bool; + fn start_position(&self) -> tree_sitter::Point; + fn end_position(&self) -> tree_sitter::Point; + fn byte_range(&self) -> std::ops::Range; + fn end_byte(&self) -> usize { + self.byte_range().end + } + /// For yeast nodes with synthetic content, return it. Otherwise None. 
+ fn opt_string_content(&self) -> Option { + None + } +} + +impl<'a> AstNode for Node<'a> { + fn kind(&self) -> &str { + Node::kind(self) + } + fn is_named(&self) -> bool { + Node::is_named(self) + } + fn is_missing(&self) -> bool { + Node::is_missing(self) + } + fn is_error(&self) -> bool { + Node::is_error(self) + } + fn is_extra(&self) -> bool { + Node::is_extra(self) + } + fn start_position(&self) -> tree_sitter::Point { + Node::start_position(self) + } + fn end_position(&self) -> tree_sitter::Point { + Node::end_position(self) + } + fn byte_range(&self) -> std::ops::Range { + Node::byte_range(self) + } +} + +impl AstNode for yeast::Node { + fn kind(&self) -> &str { + yeast::Node::kind(self) + } + fn is_named(&self) -> bool { + yeast::Node::is_named(self) + } + fn is_missing(&self) -> bool { + yeast::Node::is_missing(self) + } + fn is_error(&self) -> bool { + yeast::Node::is_error(self) + } + fn is_extra(&self) -> bool { + yeast::Node::is_extra(self) + } + fn start_position(&self) -> tree_sitter::Point { + yeast::Node::start_position(self) + } + fn end_position(&self) -> tree_sitter::Point { + yeast::Node::end_position(self) + } + fn byte_range(&self) -> std::ops::Range { + yeast::Node::byte_range(self) + } + fn opt_string_content(&self) -> Option { + yeast::Node::opt_string_content(self) + } +} + /// Sets the tracing level based on the environment variables /// `RUST_LOG` and `CODEQL_VERBOSITY` (prioritized in that order), /// falling back to `warn` if neither is set. @@ -204,6 +280,11 @@ pub fn location_label(writer: &mut trap::Writer, location: trap::Location) -> tr } /// Extracts the source file at `path`, which is assumed to be canonicalized. +/// When `yeast_runner` is `Some`, the parsed tree is first transformed +/// through the supplied yeast `Runner` before TRAP extraction. Building the +/// `Runner` (which parses YAML and constructs the schema) is the caller's +/// responsibility, allowing it to be done once and shared across files. 
+#[allow(clippy::too_many_arguments)] pub fn extract( language: &Language, language_prefix: &str, @@ -214,6 +295,7 @@ pub fn extract( path: &Path, source: &[u8], ranges: &[Range], + yeast_runner: Option<&yeast::Runner<'_>>, ) { let path_str = file_paths::normalize_and_transform_path(path, transformer); let span = tracing::span!( @@ -236,13 +318,20 @@ pub fn extract( source, diagnostics_writer, trap_writer, - // TODO: should we handle path strings that are not valid UTF8 better? &path_str, file_label, language_prefix, schema, ); - traverse(&tree, &mut visitor); + + if let Some(yeast_runner) = yeast_runner { + let ast = yeast_runner + .run_from_tree(&tree) + .unwrap_or_else(|e| panic!("Desugaring failed for {path_str}: {e}")); + traverse_yeast(&ast, &mut visitor); + } else { + traverse(&tree, &mut visitor); + } parser.reset(); } @@ -329,11 +418,11 @@ impl<'a> Visitor<'a> { ); } - fn record_parse_error_for_node( + fn record_parse_error_for_node( &mut self, message: &str, args: &[diagnostics::MessageArg], - node: Node, + node: &N, status_page: bool, ) { let loc = location_for(self, self.file_label, node); @@ -357,7 +446,7 @@ impl<'a> Visitor<'a> { self.record_parse_error(loc_label, &mesg); } - fn enter_node(&mut self, node: Node) -> bool { + fn enter_node(&mut self, node: &N) -> bool { if node.is_missing() { self.record_parse_error_for_node( "A parse error occurred (expected {} symbol). Check the syntax of the file. 
If the file is invalid, correct the error or {} the file from analysis.", @@ -383,7 +472,7 @@ impl<'a> Visitor<'a> { true } - fn leave_node(&mut self, field_name: Option<&'static str>, node: Node) { + fn leave_node(&mut self, field_name: Option<&'static str>, node: &N) { if node.is_error() || node.is_missing() { return; } @@ -434,7 +523,7 @@ impl<'a> Visitor<'a> { fields, name: table_name, } => { - if let Some(args) = self.complex_node(&node, fields, &child_nodes, id) { + if let Some(args) = self.complex_node(node, fields, &child_nodes, id) { self.trap_writer.add_tuple( &self.ast_node_location_table_name, vec![trap::Arg::Label(id), trap::Arg::Label(loc_label)], @@ -495,9 +584,9 @@ impl<'a> Visitor<'a> { } } - fn complex_node( + fn complex_node( &mut self, - node: &Node, + node: &N, fields: &[Field], child_nodes: &[ChildNode], parent_id: trap::Label, @@ -529,7 +618,7 @@ impl<'a> Visitor<'a> { diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)), diagnostics::MessageArg::Code(&format!("{:?}", field.type_info)), ], - *node, + node, false, ); } @@ -541,7 +630,7 @@ impl<'a> Visitor<'a> { diagnostics::MessageArg::Code(child_node.field_name.unwrap_or("child")), diagnostics::MessageArg::Code(&format!("{:?}", child_node.type_name)), ], - *node, + node, false, ); } @@ -566,7 +655,7 @@ impl<'a> Visitor<'a> { node.kind(), column_name ); - self.record_parse_error_for_node(&error_message, &[], *node, false); + self.record_parse_error_for_node(&error_message, &[], node, false); } } Storage::Table { @@ -582,7 +671,7 @@ impl<'a> Visitor<'a> { diagnostics::MessageArg::Code(node.kind()), diagnostics::MessageArg::Code(table_name), ], - *node, + node, false, ); break; @@ -639,15 +728,21 @@ impl<'a> Visitor<'a> { } // Emit a slice of a source file as an Arg. 
-fn sliced_source_arg(source: &[u8], n: Node) -> trap::Arg { - let range = n.byte_range(); - trap::Arg::String(String::from_utf8_lossy(&source[range.start..range.end]).into_owned()) +fn sliced_source_arg(source: &[u8], n: &N) -> trap::Arg { + trap::Arg::String(n.opt_string_content().unwrap_or_else(|| { + let range = n.byte_range(); + String::from_utf8_lossy(&source[range.start..range.end]).into_owned() + })) } // Emit a pair of `TrapEntry`s for the provided node, appropriately calibrated. // The first is the location and label definition, and the second is the // 'Located' entry. -fn location_for(visitor: &mut Visitor, file_label: trap::Label, n: Node) -> trap::Location { +fn location_for( + visitor: &mut Visitor, + file_label: trap::Label, + n: &N, +) -> trap::Location { // Tree-sitter row, column values are 0-based while CodeQL starts // counting at 1. In addition Tree-sitter's row and column for the // end position are exclusive while CodeQL's end positions are inclusive. @@ -715,6 +810,28 @@ fn location_for(visitor: &mut Visitor, file_label: trap::Label, n: Node) -> trap fn traverse(tree: &Tree, visitor: &mut Visitor) { let cursor = &mut tree.walk(); + visitor.enter_node(&cursor.node()); + let mut recurse = true; + loop { + if recurse && cursor.goto_first_child() { + recurse = visitor.enter_node(&cursor.node()); + } else { + visitor.leave_node(cursor.field_name(), &cursor.node()); + + if cursor.goto_next_sibling() { + recurse = visitor.enter_node(&cursor.node()); + } else if cursor.goto_parent() { + recurse = false; + } else { + break; + } + } + } +} + +fn traverse_yeast(tree: &yeast::Ast, visitor: &mut Visitor) { + use yeast::Cursor; + let mut cursor = tree.walk(); visitor.enter_node(cursor.node()); let mut recurse = true; loop { diff --git a/shared/tree-sitter-extractor/src/extractor/simple.rs b/shared/tree-sitter-extractor/src/extractor/simple.rs index b8446d02f892..6fcd29b03443 100644 --- a/shared/tree-sitter-extractor/src/extractor/simple.rs +++ 
b/shared/tree-sitter-extractor/src/extractor/simple.rs @@ -7,11 +7,17 @@ use std::path::{Path, PathBuf}; use crate::diagnostics; use crate::node_types; +use yeast; pub struct LanguageSpec { pub prefix: &'static str, pub ts_language: tree_sitter::Language, pub node_types: &'static str, + /// Optional yeast desugaring configuration. When set, the parsed + /// tree is rewritten through yeast before TRAP extraction. The + /// config's `output_node_types_yaml` (if set) provides the schema + /// used both at runtime (for the rewriter) and for TRAP validation. + pub desugar: Option, pub file_globs: Vec, } @@ -85,9 +91,35 @@ impl Extractor { .collect(); let mut schemas = vec![]; + let mut yeast_runners = Vec::new(); for lang in &self.languages { - let schema = node_types::read_node_types_str(lang.prefix, lang.node_types)?; + let effective_node_types: String = + match lang.desugar.as_ref().and_then(|c| c.output_node_types_yaml) { + Some(yaml) => yeast::node_types_yaml::convert(yaml).map_err(|e| { + std::io::Error::other(format!( + "Failed to convert YAML node-types to JSON for {}: {e}", + lang.prefix + )) + })?, + None => lang.node_types.to_string(), + }; + let schema = node_types::read_node_types_str(lang.prefix, &effective_node_types)?; schemas.push(schema); + + // Build the yeast runner once per language so the YAML schema + // isn't re-parsed for every file. 
+ let yeast_runner = lang + .desugar + .as_ref() + .map(|config| yeast::Runner::from_config(lang.ts_language.clone(), config)) + .transpose() + .map_err(|e| { + std::io::Error::other(format!( + "Failed to build desugaring runner for {}: {e}", + lang.prefix + )) + })?; + yeast_runners.push(yeast_runner); } // Construct a single globset containing all language globs, @@ -162,6 +194,7 @@ impl Extractor { &path, &source, &[], + yeast_runners[i].as_ref(), ); std::fs::create_dir_all(src_archive_file.parent().unwrap())?; std::fs::copy(&path, &src_archive_file)?; diff --git a/shared/tree-sitter-extractor/src/generator/language.rs b/shared/tree-sitter-extractor/src/generator/language.rs index f0b0ed1790f2..a95f750b5727 100644 --- a/shared/tree-sitter-extractor/src/generator/language.rs +++ b/shared/tree-sitter-extractor/src/generator/language.rs @@ -1,4 +1,9 @@ pub struct Language { pub name: String, pub node_types: &'static str, + /// Optional yeast desugaring configuration. When set with an + /// `output_node_types_yaml`, the generator uses that YAML for the + /// dbscheme/QL library instead of `node_types`. The `rules` field is + /// unused at code-generation time; only the schema matters. 
+ pub desugar: Option, } diff --git a/shared/tree-sitter-extractor/src/generator/mod.rs b/shared/tree-sitter-extractor/src/generator/mod.rs index 78e9e4a0b694..d2521c51b3ec 100644 --- a/shared/tree-sitter-extractor/src/generator/mod.rs +++ b/shared/tree-sitter-extractor/src/generator/mod.rs @@ -6,6 +6,7 @@ use std::io::Write; use std::path::PathBuf; use crate::node_types; +use yeast; pub mod dbscheme; pub mod language; @@ -68,7 +69,20 @@ pub fn generate( let token_name = format!("{}_token", &prefix); let tokeninfo_name = format!("{}_tokeninfo", &prefix); let reserved_word_name = format!("{}_reserved_word", &prefix); - let nodes = node_types::read_node_types_str(&prefix, language.node_types)?; + let effective_node_types: String = match language + .desugar + .as_ref() + .and_then(|c| c.output_node_types_yaml) + { + Some(yaml) => yeast::node_types_yaml::convert(yaml).map_err(|e| { + std::io::Error::other(format!( + "Failed to convert YAML node-types to JSON for {}: {e}", + language.name + )) + })?, + None => language.node_types.to_string(), + }; + let nodes = node_types::read_node_types_str(&prefix, &effective_node_types)?; let (dbscheme_entries, mut ast_node_members, token_kinds) = convert_nodes(&nodes); ast_node_members.insert(&token_name); writeln!(&mut dbscheme_writer, "/*- {} dbscheme -*/", language.name)?; diff --git a/shared/tree-sitter-extractor/tests/integration_test.rs b/shared/tree-sitter-extractor/tests/integration_test.rs index 2b243ff7945b..694eb526f394 100644 --- a/shared/tree-sitter-extractor/tests/integration_test.rs +++ b/shared/tree-sitter-extractor/tests/integration_test.rs @@ -13,6 +13,7 @@ fn simple_extractor() { prefix: "ql", ts_language: tree_sitter_ql::LANGUAGE.into(), node_types: tree_sitter_ql::NODE_TYPES, + desugar: None, file_globs: vec!["*.qll".into()], }; diff --git a/shared/tree-sitter-extractor/tests/multiple_languages.rs b/shared/tree-sitter-extractor/tests/multiple_languages.rs index 2e45e56754a3..e345eec58280 100644 --- 
a/shared/tree-sitter-extractor/tests/multiple_languages.rs +++ b/shared/tree-sitter-extractor/tests/multiple_languages.rs @@ -13,12 +13,14 @@ fn multiple_language_extractor() { prefix: "ql", ts_language: tree_sitter_ql::LANGUAGE.into(), node_types: tree_sitter_ql::NODE_TYPES, + desugar: None, file_globs: vec!["*.qll".into()], }; let lang_json = simple::LanguageSpec { prefix: "json", ts_language: tree_sitter_json::LANGUAGE.into(), node_types: tree_sitter_json::NODE_TYPES, + desugar: None, file_globs: vec!["*.json".into(), "*Jsonfile".into()], }; diff --git a/shared/yeast-macros/BUILD.bazel b/shared/yeast-macros/BUILD.bazel new file mode 100644 index 000000000000..71bc6eb288ad --- /dev/null +++ b/shared/yeast-macros/BUILD.bazel @@ -0,0 +1,12 @@ +load("@rules_rust//rust:defs.bzl", "rust_proc_macro") +load("//misc/bazel/3rdparty/tree_sitter_extractors_deps:defs.bzl", "aliases", "all_crate_deps") + +exports_files(["Cargo.toml"]) + +rust_proc_macro( + name = "yeast-macros", + srcs = glob(["src/**/*.rs"]), + aliases = aliases(), + visibility = ["//visibility:public"], + deps = all_crate_deps(), +) diff --git a/shared/yeast-macros/Cargo.toml b/shared/yeast-macros/Cargo.toml new file mode 100644 index 000000000000..30c82d03b6eb --- /dev/null +++ b/shared/yeast-macros/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "yeast-macros" +version = "0.1.0" +edition = "2021" + +[lib] +proc-macro = true + +[dependencies] +proc-macro2 = "1.0" +quote = "1.0" +syn = "2.0" diff --git a/shared/yeast-macros/src/lib.rs b/shared/yeast-macros/src/lib.rs new file mode 100644 index 000000000000..0c264ee13c80 --- /dev/null +++ b/shared/yeast-macros/src/lib.rs @@ -0,0 +1,105 @@ +use proc_macro::TokenStream; +use proc_macro2::TokenStream as TokenStream2; + +mod parse; + +/// Proc macro for constructing a `QueryNode` from a tree-sitter-inspired pattern. 
+/// +/// # Syntax +/// +/// ```text +/// (_) - match any named node (skips unnamed tokens) +/// (kind) - match a named node of the given kind +/// ("literal") - match an unnamed token by its text +/// (kind field: (pattern)) - match with named field +/// (kind (pat) (pat)...) - match unnamed children (after all fields) +/// (pattern) @capture - capture the matched node +/// (pattern)* @capture - capture each repeated match +/// (pattern)? - zero or one +/// ``` +#[proc_macro] +pub fn query(input: TokenStream) -> TokenStream { + let input2: TokenStream2 = input.into(); + match parse::parse_query_top(input2) { + Ok(output) => output.into(), + Err(err) => err.to_compile_error().into(), + } +} + +/// Build a single AST node from a template, returning its `Id`. +/// +/// # Template syntax +/// +/// ```text +/// (kind "literal") - leaf with static content +/// (kind #{expr}) - leaf with computed content (expr.to_string()) +/// (kind $fresh) - leaf with auto-generated unique name +/// {expr} - embed a Rust expression returning Id +/// {..expr} - splice an iterable of Id (in child/field position) +/// field: {..expr} - splice into a named field +/// ``` +/// +/// Can be called with an explicit context or using the implicit context +/// from an enclosing `rule!`: +/// +/// ```text +/// tree!(ctx, (kind ...)) // explicit BuildCtx +/// tree!((kind ...)) // implicit context from rule! +/// ``` +#[proc_macro] +pub fn tree(input: TokenStream) -> TokenStream { + let input2: TokenStream2 = input.into(); + match parse::parse_tree_top(input2) { + Ok(output) => output.into(), + Err(err) => err.to_compile_error().into(), + } +} + +/// Build a list of AST nodes from a template, returning `Vec`. +/// +/// Like `tree!` but returns `Vec` and supports multiple top-level +/// elements. All syntax from `tree!` is available. +/// +/// Can be called with an explicit context or using the implicit context +/// from an enclosing `rule!`: +/// +/// ```text +/// trees!(ctx, (node1 ...) 
(node2 ...)) // explicit BuildCtx +/// trees!((node1 ...) (node2 ...)) // implicit context from rule! +/// ``` +#[proc_macro] +pub fn trees(input: TokenStream) -> TokenStream { + let input2: TokenStream2 = input.into(); + match parse::parse_trees_top(input2) { + Ok(output) => output.into(), + Err(err) => err.to_compile_error().into(), + } +} + +/// Define a desugaring rule with query and transform in one declaration. +/// +/// ```text +/// rule!( +/// (query_pattern field: (_) @name (kind)* @repeated (_)? @optional) +/// => +/// (output_template field: {name} {..repeated}) +/// ) +/// +/// // Shorthand: captures become fields on the output node +/// rule!((query ...) => output_kind) +/// ``` +/// +/// Captures become Rust variables automatically: +/// - `@name` (no quantifier) → `name: Id` +/// - `@name` (after `*`/`+`) → `name: Vec` +/// - `@name` (after `?`) → `name: Option` +/// +/// `tree!` and `trees!` can be used without explicit context inside `{...}`. +#[proc_macro] +pub fn rule(input: TokenStream) -> TokenStream { + let input2: TokenStream2 = input.into(); + match parse::parse_rule_top(input2) { + Ok(output) => output.into(), + Err(err) => err.to_compile_error().into(), + } +} diff --git a/shared/yeast-macros/src/parse.rs b/shared/yeast-macros/src/parse.rs new file mode 100644 index 000000000000..f8554f3178ca --- /dev/null +++ b/shared/yeast-macros/src/parse.rs @@ -0,0 +1,838 @@ +use proc_macro2::{Delimiter, Ident, Literal, Span, TokenStream, TokenTree}; +use quote::quote; +use std::iter::Peekable; + +type Tokens = Peekable; +type Result = std::result::Result; + +// --------------------------------------------------------------------------- +// Query parsing +// --------------------------------------------------------------------------- + +/// Top-level entry: parse a single query node from the full input. 
+pub fn parse_query_top(input: TokenStream) -> Result { + let mut tokens = input.into_iter().peekable(); + let result = parse_query_node(&mut tokens)?; + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned(tok, "unexpected token after query")); + } + Ok(result) +} + +/// Parse a single query node (possibly with a trailing `@capture`). +fn parse_query_node(tokens: &mut Tokens) -> Result { + let base = parse_query_atom(tokens)?; + // Check for trailing @capture + if peek_is_at(tokens) { + tokens.next(); // consume @ + let capture_name = expect_ident(tokens, "expected capture name after @")?; + let name_str = capture_name.to_string(); + Ok(quote! { + yeast::query::QueryNode::Capture { + capture: #name_str, + node: Box::new(#base), + } + }) + } else { + Ok(base) + } +} + +/// Parse a query atom: `(kind fields...)` or `(kind fields... bare_children...)`. +/// Does not handle `@capture` — that's handled by the caller as a postfix. +fn parse_query_atom(tokens: &mut Tokens) -> Result { + match tokens.peek() { + None => Err(syn::Error::new( + Span::call_site(), + "unexpected end of query", + )), + Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Parenthesis => { + let group = expect_group(tokens, Delimiter::Parenthesis)?; + let mut inner = group.stream().into_iter().peekable(); + let result = parse_query_node_inner(&mut inner)?; + if let Some(tok) = inner.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected token in query node", + )); + } + Ok(result) + } + Some(tok) => Err(syn::Error::new_spanned( + tok.clone(), + "expected `(` in query; use `(_) @name` to capture a wildcard", + )), + } +} + +/// Parse the inside of a parenthesized query node: `kind fields...` or `_` or `"lit"`. 
+fn parse_query_node_inner(tokens: &mut Tokens) -> Result { + match tokens.peek() { + None => Err(syn::Error::new( + Span::call_site(), + "empty parenthesized group in query", + )), + Some(TokenTree::Ident(id)) if *id == "_" => { + tokens.next(); + Ok(quote! { yeast::query::QueryNode::Any() }) + } + Some(TokenTree::Literal(_)) => { + let lit = expect_literal(tokens)?; + Ok(quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } }) + } + Some(TokenTree::Ident(_)) => { + let kind = expect_ident(tokens, "expected node kind")?; + let kind_str = kind.to_string(); + let fields = parse_query_fields(tokens)?; + Ok(quote! { + yeast::query::QueryNode::Node { + kind: #kind_str, + children: vec![#(#fields),*], + } + }) + } + Some(tok) => Err(syn::Error::new_spanned( + tok.clone(), + "expected node kind, `_`, or string literal", + )), + } +} + +/// Parse zero or more field specifications and trailing bare patterns. +/// Named fields: `name: pattern` or `name*: (list...)`. +/// Bare patterns (no field name) become implicit `child` field entries. +fn parse_query_fields(tokens: &mut Tokens) -> Result> { + let mut fields = Vec::new(); + while tokens.peek().is_some() { + if peek_is_field(tokens) { + let field_name = expect_ident(tokens, "expected field name")?; + let field_str = field_name.to_string(); + + expect_punct(tokens, ':', "expected `:` after field name")?; + + let child = parse_query_node(tokens)?; + fields.push(quote! { + (#field_str, vec![yeast::query::QueryListElem::SingleNode(#child)]) + }); + } else { + // Bare patterns — collect as implicit `child` field + let elems = parse_query_list(tokens)?; + if !elems.is_empty() { + fields.push(quote! { + ("child", vec![#(#elems),*]) + }); + } + break; + } + } + Ok(fields) +} + +/// Parse a list of query elements (bare children). +/// Each element is a node pattern, possibly followed by `*`, `+`, `?`. 
+fn parse_query_list(tokens: &mut Tokens) -> Result> { + let mut elems = Vec::new(); + while tokens.peek().is_some() { + // Check for parenthesized group + if peek_is_group(tokens, Delimiter::Parenthesis) { + let group = expect_group(tokens, Delimiter::Parenthesis)?; + let mut inner = group.stream().into_iter().peekable(); + + // Check for repetition after the group + if peek_is_repetition(tokens) { + let rep = expect_repetition(tokens)?; + // Determine if the group is a single node pattern or a list + // of patterns. If it starts with an identifier (node kind) or + // `_`, treat it as a single repeated node. Otherwise, parse + // as a repeated list of sub-patterns. + let is_single_node = matches!(inner.peek(), Some(TokenTree::Ident(_))); + if is_single_node { + let node = parse_query_node_inner(&mut inner)?; + let elem = quote! { + yeast::query::QueryListElem::Repeated { + children: vec![yeast::query::QueryListElem::SingleNode(#node)], + rep: #rep, + } + }; + let elem = maybe_wrap_list_capture(tokens, elem)?; + elems.push(elem); + } else { + let sub_elems = parse_query_list(&mut inner)?; + let elem = quote! { + yeast::query::QueryListElem::Repeated { + children: vec![#(#sub_elems),*], + rep: #rep, + } + }; + let elem = maybe_wrap_list_capture(tokens, elem)?; + elems.push(elem); + } + } else { + // Single parenthesized node, possibly followed by @capture + let node = parse_query_node_inner(&mut inner)?; + let node = maybe_wrap_capture(tokens, node)?; + elems.push(quote! { + yeast::query::QueryListElem::SingleNode(#node) + }); + } + continue; + } + + // Check for string literal (unnamed node) + if peek_is_literal(tokens) { + let lit = expect_literal(tokens)?; + let node = quote! { yeast::query::QueryNode::UnnamedNode { kind: #lit } }; + let elem = maybe_wrap_repetition( + tokens, + quote! 
{ + yeast::query::QueryListElem::SingleNode(#node) + }, + )?; + elems.push(elem); + continue; + } + + // Check for bare _ (wildcard), possibly followed by @capture + if peek_is_underscore(tokens) { + tokens.next(); + let node = quote! { yeast::query::QueryNode::Any() }; + let node = maybe_wrap_capture(tokens, node)?; + let elem = maybe_wrap_repetition( + tokens, + quote! { + yeast::query::QueryListElem::SingleNode(#node) + }, + )?; + elems.push(elem); + continue; + } + + break; + } + Ok(elems) +} + +// --------------------------------------------------------------------------- +// tree! / trees! parsing — direct code generation against BuildCtx +// --------------------------------------------------------------------------- + +const IMPLICIT_CTX: &str = "__yeast_ctx"; + +/// Determine the context identifier: either explicit `ctx,` or the implicit +/// `__yeast_ctx` from an enclosing `rule!`. +fn parse_ctx_or_implicit(tokens: &mut Tokens) -> Ident { + // Check if first token is an ident followed by a comma + let mut lookahead = tokens.clone(); + let is_explicit = matches!(lookahead.next(), Some(TokenTree::Ident(_))) + && matches!(lookahead.next(), Some(TokenTree::Punct(p)) if p.as_char() == ','); + + if is_explicit { + let ctx = expect_ident(tokens, "").unwrap(); + let _ = tokens.next(); // consume comma + ctx + } else { + Ident::new(IMPLICIT_CTX, Span::call_site()) + } +} + +/// Parse `tree!(ctx, (template))` or `tree!((template))` — returns single `Id`. +pub fn parse_tree_top(input: TokenStream) -> Result { + let mut tokens = input.into_iter().peekable(); + let ctx = parse_ctx_or_implicit(&mut tokens); + + let first = parse_direct_node(&mut tokens, &ctx)?; + + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected tokens after tree! template; use trees! for multiple nodes", + )); + } + + Ok(quote! { { #first } }) +} + +/// Parse `trees!(ctx, ...)` or `trees!(...)` — returns `Vec`. 
+pub fn parse_trees_top(input: TokenStream) -> Result { + let mut tokens = input.into_iter().peekable(); + let ctx = parse_ctx_or_implicit(&mut tokens); + let items = parse_direct_list(&mut tokens, &ctx)?; + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected token after trees! template", + )); + } + Ok(quote! { + { + let mut __nodes: Vec = Vec::new(); + #(#items)* + __nodes + } + }) +} + +/// Parse a single node template and generate code that returns an `Id`. +/// Handles: `(kind fields... children...)` and `{expr}`. +fn parse_direct_node(tokens: &mut Tokens, ctx: &Ident) -> Result { + match tokens.peek() { + Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Brace => { + let group = expect_group(tokens, Delimiter::Brace)?; + let expr = group.stream(); + Ok(quote! { #expr }) + } + Some(TokenTree::Group(g)) if g.delimiter() == Delimiter::Parenthesis => { + let group = expect_group(tokens, Delimiter::Parenthesis)?; + let mut inner = group.stream().into_iter().peekable(); + parse_direct_node_inner(&mut inner, ctx) + } + Some(tok) => Err(syn::Error::new_spanned( + tok.clone(), + "expected `(` or `{` in tree template", + )), + None => Err(syn::Error::new( + Span::call_site(), + "unexpected end of tree template", + )), + } +} + +/// Parse the inside of a parenthesized node: `kind fields... children...` +/// or `kind "literal"` or `kind $fresh`. +fn parse_direct_node_inner(tokens: &mut Tokens, ctx: &Ident) -> Result { + let kind = expect_ident(tokens, "expected node kind")?; + let kind_str = kind.to_string(); + + // Check for (kind "literal") + if peek_is_literal(tokens) { + let lit = expect_literal(tokens)?; + return Ok(quote! { #ctx.literal(#kind_str, #lit) }); + } + + // Check for (kind #{expr}) — computed literal, expr converted via .to_string() + if peek_is_hash(tokens) { + tokens.next(); // consume # + let group = expect_group(tokens, Delimiter::Brace)?; + let expr = group.stream(); + return Ok(quote! 
{ #ctx.literal(#kind_str, &(#expr).to_string()) }); + } + + // Check for (kind $fresh) + if peek_is_dollar(tokens) { + tokens.next(); + let name = expect_ident(tokens, "expected fresh variable name after $")?; + let name_str = name.to_string(); + return Ok(quote! { #ctx.fresh(#kind_str, #name_str) }); + } + + // Parse named fields + let mut stmts = Vec::new(); + let mut field_args = Vec::new(); + let mut field_counter = 0usize; + + // Named fields — compute each value into a temp, then reference it + while peek_is_field(tokens) { + let field_name = expect_ident(tokens, "expected field name")?; + let field_str = field_name.to_string(); + expect_punct(tokens, ':', "expected `:` after field name")?; + let temp = Ident::new( + &format!("__field_{field_str}_{field_counter}"), + Span::call_site(), + ); + field_counter += 1; + + // Check for field: {..expr} — splice a Vec into the field + if peek_is_group(tokens, Delimiter::Brace) { + let group_clone = tokens.clone().next().unwrap(); + if let TokenTree::Group(g) = &group_clone { + let mut inner_check = g.stream().into_iter(); + let is_splice = matches!(inner_check.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.') + && matches!(inner_check.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.'); + if is_splice { + let group = expect_group(tokens, Delimiter::Brace)?; + let mut inner = group.stream().into_iter().peekable(); + inner.next(); // consume first . + inner.next(); // consume second . + let expr: proc_macro2::TokenStream = inner.collect(); + stmts.push(quote! { let #temp: Vec = #expr; }); + field_args.push(quote! { (#field_str, #temp) }); + continue; + } + } + } + + let value = parse_direct_node(tokens, ctx)?; + stmts.push(quote! { let #temp = #value; }); + field_args.push(quote! { (#field_str, vec![#temp]) }); + } + + // After all named fields, no other tokens are allowed. + // Output templates require all children to be in named fields. 
+ if let Some(tok) = tokens.peek() { + return Err(syn::Error::new_spanned( + tok.clone(), + "expected named field (`name:`) or end of node template; \ + output templates do not support unnamed children", + )); + } + + Ok(quote! { + { + #(#stmts)* + #ctx.node(#kind_str, vec![#(#field_args),*]) + } + }) +} + +/// Parse the top-level list of a `trees!` template. +/// Each item is a node template or `{expr}` splice. +fn parse_direct_list(tokens: &mut Tokens, ctx: &Ident) -> Result> { + let mut items = Vec::new(); + while tokens.peek().is_some() { + if peek_is_group(tokens, Delimiter::Parenthesis) { + let group = expect_group(tokens, Delimiter::Parenthesis)?; + let mut inner = group.stream().into_iter().peekable(); + + // Regular node + let node = parse_direct_node_inner(&mut inner, ctx)?; + items.push(quote! { __nodes.push(#node); }); + continue; + } + + // {expr} or {..expr} — single node or splice + if peek_is_group(tokens, Delimiter::Brace) { + let group = expect_group(tokens, Delimiter::Brace)?; + let mut inner = group.stream().into_iter().peekable(); + if peek_is_dotdot(&inner) { + inner.next(); // consume first . + inner.next(); // consume second . + let expr: TokenStream = inner.collect(); + items.push(quote! { __nodes.extend(#expr); }); + } else { + let expr = group.stream(); + items.push(quote! { __nodes.push(#expr); }); + } + continue; + } + + break; + } + Ok(items) +} + +// --------------------------------------------------------------------------- +// rule! parsing +// --------------------------------------------------------------------------- + +/// A captured variable from a query pattern. 
+struct CaptureInfo { + name: String, + multiplicity: CaptureMultiplicity, +} + +#[derive(Clone, Copy, PartialEq)] +enum CaptureMultiplicity { + /// Exactly one match (bare pattern or after no quantifier) + Single, + /// Zero or one match (after `?`) + Optional, + /// Zero or more matches (after `*` or `+`, or inside a repeated group) + Repeated, +} + +/// Walk a token stream and extract all `@name` captures, noting whether +/// they appear after `*` or `+` (repeated) or not. +fn extract_captures(stream: &TokenStream) -> Vec { + let mut captures = Vec::new(); + extract_captures_inner( + &mut stream.clone().into_iter().peekable(), + &mut captures, + CaptureMultiplicity::Single, + ); + captures +} + +fn extract_captures_inner( + tokens: &mut Tokens, + captures: &mut Vec, + parent_mult: CaptureMultiplicity, +) { + let mut last_mult = CaptureMultiplicity::Single; + while let Some(tok) = tokens.next() { + match tok { + TokenTree::Group(g) => { + let mut inner = g.stream().into_iter().peekable(); + let group_mult = match tokens.peek() { + Some(TokenTree::Punct(p)) if p.as_char() == '*' || p.as_char() == '+' => { + CaptureMultiplicity::Repeated + } + Some(TokenTree::Punct(p)) if p.as_char() == '?' 
=> { + CaptureMultiplicity::Optional + } + _ => CaptureMultiplicity::Single, + }; + last_mult = group_mult; + let child_mult = if parent_mult == CaptureMultiplicity::Repeated + || group_mult == CaptureMultiplicity::Repeated + { + CaptureMultiplicity::Repeated + } else if parent_mult == CaptureMultiplicity::Optional + || group_mult == CaptureMultiplicity::Optional + { + CaptureMultiplicity::Optional + } else { + CaptureMultiplicity::Single + }; + extract_captures_inner(&mut inner, captures, child_mult); + } + TokenTree::Punct(p) if p.as_char() == '@' => { + if let Some(TokenTree::Ident(name)) = tokens.next() { + let mult = if parent_mult == CaptureMultiplicity::Repeated + || last_mult == CaptureMultiplicity::Repeated + { + CaptureMultiplicity::Repeated + } else if parent_mult == CaptureMultiplicity::Optional + || last_mult == CaptureMultiplicity::Optional + { + CaptureMultiplicity::Optional + } else { + CaptureMultiplicity::Single + }; + captures.push(CaptureInfo { + name: name.to_string(), + multiplicity: mult, + }); + } + last_mult = CaptureMultiplicity::Single; + } + TokenTree::Punct(p) if matches!(p.as_char(), '*' | '+' | '?') => { + // Keep last_mult — the @capture follows + } + _ => { + last_mult = CaptureMultiplicity::Single; + } + } + } +} + +/// Parse `rule!( query => transform )`. 
+pub fn parse_rule_top(input: TokenStream) -> Result { + let mut tokens = input.into_iter().peekable(); + + // Collect query tokens up to `=>` + let mut query_tokens = Vec::new(); + loop { + match tokens.peek() { + None => return Err(syn::Error::new(Span::call_site(), "expected `=>` in rule!")), + Some(TokenTree::Punct(p)) if p.as_char() == '=' => { + let eq = tokens.next().unwrap(); + match tokens.peek() { + Some(TokenTree::Punct(p)) if p.as_char() == '>' => { + tokens.next(); // consume > + break; + } + _ => { + query_tokens.push(eq); + continue; + } + } + } + _ => { + query_tokens.push(tokens.next().unwrap()); + } + } + } + + let query_stream: TokenStream = query_tokens.into_iter().collect(); + + // Extract captures from query + let captures = extract_captures(&query_stream); + + // Parse query + let query_code = parse_query_top(query_stream.clone())?; + + // Generate capture bindings + let ctx_ident = Ident::new(IMPLICIT_CTX, Span::call_site()); + let bindings: Vec = captures + .iter() + .map(|cap| { + let name = Ident::new(&cap.name, Span::call_site()); + let name_str = &cap.name; + match cap.multiplicity { + CaptureMultiplicity::Repeated => { + quote! { let #name: Vec = __captures.get_all(#name_str); } + } + CaptureMultiplicity::Optional => { + quote! { let #name: Option = __captures.get_opt(#name_str); } + } + CaptureMultiplicity::Single => { + quote! { let #name: usize = __captures.get_var(#name_str).unwrap(); } + } + } + }) + .collect(); + + // Parse transform: either shorthand `=> kind_name` or full `=> (template ...)` + let transform_body = if peek_is_field(&mut tokens) && { + // Shorthand form: bare identifier = output node kind. + // Auto-generate template from captures. 
+ let mut lookahead = tokens.clone(); + lookahead.next(); // skip ident + lookahead.peek().is_none() // nothing after = shorthand + } { + let output_kind = expect_ident(&mut tokens, "expected output node kind")?; + let output_kind_str = output_kind.to_string(); + + // Generate field assignments from captures + let field_stmts: Vec = captures + .iter() + .map(|cap| { + let name = Ident::new(&cap.name, Span::call_site()); + let name_str = &cap.name; + match cap.multiplicity { + CaptureMultiplicity::Repeated => quote! { + let __field_id = #ctx_ident.ast.field_id_for_name(#name_str) + .unwrap_or_else(|| panic!("field '{}' not found", #name_str)); + __fields.insert(__field_id, #name); + }, + CaptureMultiplicity::Optional => quote! { + let __field_id = #ctx_ident.ast.field_id_for_name(#name_str) + .unwrap_or_else(|| panic!("field '{}' not found", #name_str)); + if let Some(__id) = #name { + __fields.entry(__field_id).or_insert_with(Vec::new).push(__id); + } + }, + CaptureMultiplicity::Single => quote! { + let __field_id = #ctx_ident.ast.field_id_for_name(#name_str) + .unwrap_or_else(|| panic!("field '{}' not found", #name_str)); + __fields.entry(__field_id).or_insert_with(Vec::new).push(#name); + }, + } + }) + .collect(); + + quote! { + let __kind = #ctx_ident.ast.id_for_node_kind(#output_kind_str) + .unwrap_or_else(|| panic!("node kind '{}' not found", #output_kind_str)); + let mut __fields = std::collections::BTreeMap::new(); + #(#field_stmts)* + let __id = #ctx_ident.ast.create_node_with_range( + __kind, + yeast::NodeContent::DynamicString(String::new()), + __fields, + true, + __source_range, + ); + vec![__id] + } + } else { + // Full template form + let transform_items = parse_direct_list(&mut tokens, &ctx_ident)?; + + if let Some(tok) = tokens.next() { + return Err(syn::Error::new_spanned( + tok, + "unexpected token after rule! transform", + )); + } + + quote! { + let mut __nodes: Vec = Vec::new(); + #(#transform_items)* + __nodes + } + }; + + Ok(quote! 
{ + { + let __query = #query_code; + yeast::Rule::new(__query, Box::new(|__ast: &mut yeast::Ast, __captures: yeast::captures::Captures, __fresh: &yeast::tree_builder::FreshScope, __source_range: Option| { + #(#bindings)* + let mut #ctx_ident = yeast::build::BuildCtx::with_source_range(__ast, &__captures, __fresh, __source_range); + #transform_body + })) + } + }) +} + +// --------------------------------------------------------------------------- +// Token utilities +// --------------------------------------------------------------------------- + +fn peek_is_at(tokens: &mut Tokens) -> bool { + matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '@') +} + +fn peek_is_literal(tokens: &mut Tokens) -> bool { + matches!(tokens.peek(), Some(TokenTree::Literal(_))) +} + +fn peek_is_dollar(tokens: &mut Tokens) -> bool { + matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '$') +} + +fn peek_is_hash(tokens: &mut Tokens) -> bool { + matches!(tokens.peek(), Some(TokenTree::Punct(p)) if p.as_char() == '#') +} + +/// Check for `..` (two consecutive dot punctuation tokens). +fn peek_is_dotdot(tokens: &Tokens) -> bool { + let mut lookahead = tokens.clone(); + matches!(lookahead.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.') + && matches!(lookahead.next(), Some(TokenTree::Punct(p)) if p.as_char() == '.') +} + +fn peek_is_underscore(tokens: &mut Tokens) -> bool { + matches!(tokens.peek(), Some(TokenTree::Ident(id)) if *id == "_") +} + +/// Check if the next tokens form a field specification (ident followed by `:` or `*:`). +/// A bare identifier (other than `_`) at this position is always a field name, since +/// bare child patterns must start with `(`, `@`, `"literal"`, or `_`. 
+fn peek_is_field(tokens: &mut Tokens) -> bool { + matches!(tokens.peek(), Some(TokenTree::Ident(id)) if *id != "_") +} + +fn peek_is_group(tokens: &mut Tokens, delim: Delimiter) -> bool { + matches!(tokens.peek(), Some(TokenTree::Group(g)) if g.delimiter() == delim) +} + +fn peek_is_repetition(tokens: &mut Tokens) -> bool { + matches!(tokens.peek(), Some(TokenTree::Punct(p)) if matches!(p.as_char(), '*' | '+' | '?')) +} + +fn expect_ident(tokens: &mut Tokens, msg: &str) -> Result { + match tokens.next() { + Some(TokenTree::Ident(id)) => Ok(id), + Some(tok) => Err(syn::Error::new_spanned(tok, msg)), + None => Err(syn::Error::new(Span::call_site(), msg)), + } +} + +fn expect_literal(tokens: &mut Tokens) -> Result { + match tokens.next() { + Some(TokenTree::Literal(lit)) => Ok(lit), + Some(tok) => Err(syn::Error::new_spanned(tok, "expected string literal")), + None => Err(syn::Error::new( + Span::call_site(), + "expected string literal", + )), + } +} + +fn expect_punct(tokens: &mut Tokens, ch: char, msg: &str) -> Result<()> { + match tokens.next() { + Some(TokenTree::Punct(p)) if p.as_char() == ch => Ok(()), + Some(tok) => Err(syn::Error::new_spanned(tok, msg)), + None => Err(syn::Error::new(Span::call_site(), msg)), + } +} + +fn expect_group(tokens: &mut Tokens, delim: Delimiter) -> Result { + match tokens.next() { + Some(TokenTree::Group(g)) if g.delimiter() == delim => Ok(g), + Some(tok) => Err(syn::Error::new_spanned( + tok, + format!("expected {delim:?} group"), + )), + None => Err(syn::Error::new( + Span::call_site(), + format!("expected {delim:?} group"), + )), + } +} + +fn expect_repetition(tokens: &mut Tokens) -> Result { + match tokens.next() { + Some(TokenTree::Punct(p)) => match p.as_char() { + '*' => Ok(quote! { yeast::query::Rep::ZeroOrMore }), + '+' => Ok(quote! { yeast::query::Rep::OneOrMore }), + '?' => Ok(quote! 
{ yeast::query::Rep::ZeroOrOne }), + _ => Err(syn::Error::new(p.span(), "expected `*`, `+`, or `?`")), + }, + Some(tok) => Err(syn::Error::new_spanned( + tok, + "expected repetition quantifier", + )), + None => Err(syn::Error::new( + Span::call_site(), + "expected repetition quantifier", + )), + } +} + +fn maybe_wrap_capture(tokens: &mut Tokens, base: TokenStream) -> Result { + if peek_is_at(tokens) { + tokens.next(); // consume @ + let name = expect_ident(tokens, "expected capture name after @")?; + let name_str = name.to_string(); + Ok(quote! { + yeast::query::QueryNode::Capture { + capture: #name_str, + node: Box::new(#base), + } + }) + } else { + Ok(base) + } +} + +fn maybe_wrap_repetition(tokens: &mut Tokens, single: TokenStream) -> Result { + if peek_is_repetition(tokens) { + let rep = expect_repetition(tokens)?; + Ok(quote! { + yeast::query::QueryListElem::Repeated { + children: vec![#single], + rep: #rep, + } + }) + } else { + Ok(single) + } +} + +/// If `@name` follows a Repeated list element, wrap each child SingleNode +/// inside the repetition with a Capture. This matches tree-sitter semantics +/// where `(_)* @name` captures each matched node. +fn maybe_wrap_list_capture(tokens: &mut Tokens, elem: TokenStream) -> Result { + if peek_is_at(tokens) { + tokens.next(); + let name = expect_ident(tokens, "expected capture name after @")?; + let name_str = name.to_string(); + // Re-parse the element isn't practical, so we generate a wrapper + // that creates a new Repeated with each child wrapped in a capture. + // The simplest approach: generate code that the runtime can interpret. + // Actually, the capture annotation on repeated elements is best handled + // by re-generating the Repeated with captures injected. + // For now, assume the common case: the repetition contains a single + // SingleNode child, and we wrap that node in a capture. + Ok(quote! 
{ + { + let __rep = #elem; + match __rep { + yeast::query::QueryListElem::Repeated { children, rep } => { + yeast::query::QueryListElem::Repeated { + children: children.into_iter().map(|child| { + match child { + yeast::query::QueryListElem::SingleNode(node) => { + yeast::query::QueryListElem::SingleNode( + yeast::query::QueryNode::Capture { + capture: #name_str, + node: Box::new(node), + } + ) + } + other => other, + } + }).collect(), + rep, + } + } + other => other, + } + } + }) + } else { + Ok(elem) + } +} diff --git a/shared/yeast/BUILD.bazel b/shared/yeast/BUILD.bazel new file mode 100644 index 000000000000..fe0b01bb87bd --- /dev/null +++ b/shared/yeast/BUILD.bazel @@ -0,0 +1,18 @@ +load("@rules_rust//rust:defs.bzl", "rust_library") +load("//misc/bazel/3rdparty/tree_sitter_extractors_deps:defs.bzl", "aliases", "all_crate_deps") + +exports_files(["Cargo.toml"]) + +rust_library( + name = "yeast", + srcs = glob( + ["src/**/*.rs"], + exclude = ["src/bin/**"], + ), + aliases = aliases(), + proc_macro_deps = [ + "//shared/yeast-macros", + ], + visibility = ["//visibility:public"], + deps = all_crate_deps(), +) diff --git a/shared/yeast/Cargo.toml b/shared/yeast/Cargo.toml new file mode 100644 index 000000000000..166887c324cf --- /dev/null +++ b/shared/yeast/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "yeast" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.4.10", features = ["derive"] } +serde = { version = "1.0.193", features = ["derive"] } +serde_json = "1.0.108" +serde_yaml = "0.9" +tree-sitter = ">= 0.23.0" +yeast-macros = { path = "../yeast-macros" } + +tree-sitter-ruby = "0.23" +tree-sitter-python = "0.23" diff --git a/shared/yeast/doc/node-types-yaml.md b/shared/yeast/doc/node-types-yaml.md new file mode 100644 index 000000000000..b887f5a82bbf --- /dev/null +++ b/shared/yeast/doc/node-types-yaml.md @@ -0,0 +1,241 @@ +# YAML Node Types Format + +The YAML node-types format is a human-friendly alternative to tree-sitter's 
+`node-types.json`. It can be converted to and from JSON using the +`node_types_yaml` tool. + +## Overview + +A YAML node-types file has three top-level sections: + +```yaml +supertypes: + # Abstract union types + +named: + # Concrete AST nodes and leaf tokens + +unnamed: + # Punctuation and keyword tokens +``` + +All three sections are optional. If omitted, they default to empty. + +## Supertypes + +Supertypes are abstract groupings of node types (unions). Each supertype maps +to a list of its members: + +```yaml +supertypes: + _expression: + - assignment + - binary + - identifier + - call +``` + +This corresponds to the following JSON: + +```json +{ + "type": "_expression", + "named": true, + "subtypes": [ + { "type": "assignment", "named": true }, + { "type": "binary", "named": true }, + { "type": "identifier", "named": true }, + { "type": "call", "named": true } + ] +} +``` + +Members are resolved as named or unnamed using the +[type reference rules](#type-references) described below. + +## Named nodes + +Named nodes are concrete AST node types. Each entry is a node kind mapping to +its fields. A node with no fields (a leaf token like `identifier`) uses an +empty value: + +```yaml +named: + identifier: + constant: +``` + +```json +{"type": "identifier", "named": true, "fields": {}}, +{"type": "constant", "named": true, "fields": {}} +``` + +### Fields + +Each field has a name, a multiplicity suffix, and a list of allowed types. 
+ +| Suffix | Meaning | JSON `multiple` | JSON `required` | +| ------ | ------------ | --------------- | --------------- | +| (none) | exactly one | `false` | `true` | +| `?` | zero or one | `false` | `false` | +| `+` | one or more | `true` | `true` | +| `*` | zero or more | `true` | `false` | + +Example: + +```yaml +named: + assignment: + left: _lhs + right: _expression +``` + +```json +{ + "type": "assignment", + "named": true, + "fields": { + "left": { + "multiple": false, + "required": true, + "types": [{ "type": "_lhs", "named": true }] + }, + "right": { + "multiple": false, + "required": true, + "types": [{ "type": "_expression", "named": true }] + } + } +} +``` + +A field with multiple allowed types uses a list: + +```yaml +named: + binary: + left: [_expression, _simple_numeric] + operator: ["!=", "+", "&&"] + right: _expression +``` + +A singleton list can be written as a bare value (as shown with `right` above). + +### Unnamed children + +Unnamed children (nodes that appear as children without a field name) are +specified using the special `$children` field name, with the same suffixes: + +```yaml +named: + argument_list: + $children*: [_expression, block_argument, splat_argument] +``` + +```json +{ + "type": "argument_list", + "named": true, + "fields": {}, + "children": { + "multiple": true, + "required": false, + "types": [ + { "type": "_expression", "named": true }, + { "type": "block_argument", "named": true }, + { "type": "splat_argument", "named": true } + ] + } +} +``` + +## Unnamed tokens + +Unnamed tokens are punctuation, operators, and keywords that appear in the +parse tree but don't have their own AST node type. They are listed as simple +strings: + +```yaml +unnamed: + - "=" + - "end" + - "+" + - "&&" +``` + +```json +{"type": "=", "named": false}, +{"type": "end", "named": false}, +{"type": "+", "named": false}, +{"type": "&&", "named": false} +``` + +When converting to YAML, unnamed tokens are always wrapped in quotes for +visual clarity. 
This is purely cosmetic — YAML treats `end` and `"end"` as +the same string. + +## Type references + +When a type name appears in a field's type list or a supertype's member list, +it needs to be resolved as either named or unnamed. The rules are: + +1. If the name only appears in `named` or `supertypes`, it is **named**. +2. If the name only appears in `unnamed`, it is **unnamed**. +3. If the name appears in both, it defaults to **named**. +4. To explicitly reference an unnamed type in the ambiguous case, use the + map form: + +```yaml +named: + example: + field: { unnamed: foo } +``` + +In practice, ambiguity is rare — names like `end`, `+`, `if` are almost +always only unnamed, while names like `identifier`, `assignment` are only +named. + +## Complete example + +```yaml +supertypes: + _expression: + - assignment + - binary + - identifier + +named: + assignment: + left: _expression + right?: _expression + binary: + left: [_expression, _simple_numeric] + operator: ["!=", "+"] + right: _expression + argument_list: + $children*: [_expression, block_argument] + identifier: + constant: + +unnamed: + - "!=" + - "+" + - "=" + - "end" +``` + +## CLI usage + +Convert YAML to JSON: + +``` +node_types_yaml input.yaml > node-types.json +``` + +Convert JSON to YAML: + +``` +node_types_yaml --from-json node-types.json > node-types.yaml +``` + +Both commands also accept input from stdin if no file argument is given. diff --git a/shared/yeast/doc/yeast.md b/shared/yeast/doc/yeast.md new file mode 100644 index 000000000000..d49ff96f11df --- /dev/null +++ b/shared/yeast/doc/yeast.md @@ -0,0 +1,329 @@ +# YEAST — YEAST Elaborates Abstract Syntax Trees + +YEAST is a framework for transforming tree-sitter parse trees before they are +extracted into a CodeQL database. It sits between the tree-sitter parser and +the TRAP extractor, rewriting parts of the AST according to declarative rules. 
+ +## Motivation + +Tree-sitter grammars describe the **concrete syntax** of a language — every +keyword, operator, and punctuation token appears in the parse tree. CodeQL +analyses often prefer a **simplified abstract syntax** where syntactic sugar +has been removed. YEAST bridges this gap by desugaring the tree-sitter output +into a cleaner form before extraction. + +For example, Ruby's `for x in list do ... end` is syntactic sugar for +`list.each { |x| ... }`. A YEAST rule can rewrite the former into the latter +so that CodeQL queries only need to reason about the `.each` form. + +## Architecture + +``` +Source code + │ + ▼ +┌──────────────┐ +│ tree-sitter │ Parse source into a concrete syntax tree +│ parser │ +└──────┬───────┘ + │ tree_sitter::Tree + ▼ +┌──────────────┐ +│ YEAST │ Apply desugaring rules, producing a new AST +│ Runner │ +└──────┬───────┘ + │ yeast::Ast + ▼ +┌──────────────┐ +│ TRAP │ Walk the (possibly rewritten) AST and emit TRAP tuples +│ extractor │ +└──────────────┘ +``` + +The entry point is `extract()` in the shared tree-sitter extractor. When +called with a non-empty `rules` vector, the parsed tree is run through the +YEAST `Runner` before TRAP extraction; with an empty `rules` vector the +tree is extracted unchanged. + +## How desugaring works + +A YEAST `Rule` has two parts: + +1. A **query** that matches nodes in the AST using a tree-sitter-inspired + pattern language. +2. A **transform** that produces replacement nodes from the match captures. + +The `Runner` applies rules by walking the tree top-down. At each node, it +tries each rule in order. If a rule's query matches, the node is replaced by +the transform's output, and the rules are re-applied to the result. If no +rule matches, the node is kept and its children are processed recursively. + +A rule can replace one node with zero nodes (deletion), one node (rewriting), +or multiple nodes (expansion). 
+ +## Query language + +Queries use a syntax inspired by +[tree-sitter queries](https://tree-sitter.github.io/tree-sitter/using-parsers/queries/index.html), +written inside the `yeast::query!()` proc macro. + +### Node patterns + +```rust +// Match any named node +(_) + +// Match a node of a specific kind +(assignment) + +// Match an unnamed token by its text +("end") +``` + +### Fields + +```rust +// Match a node with specific fields +(assignment + left: (identifier) @lhs + right: (_) @rhs +) +``` + +Fields are matched by name. Unmentioned fields are ignored — the pattern +`(assignment left: (_) @x)` matches any `assignment` node regardless of +what's in `right`. + +### Captures + +Captures bind matched nodes to names for use in the transform. A capture +`@name` always follows the pattern it captures: + +```rust +(identifier) @name // capture an identifier node +(_) @value // capture any named node +(identifier)* @items // capture each repeated match +``` + +### Unnamed children + +Patterns that appear after all named fields match unnamed (positional) +children. Named node patterns like `(_)` automatically skip unnamed tokens +(keywords, operators, punctuation), matching tree-sitter semantics: + +```rust +(for + pattern: (_) @pat // named field + value: (in (_) @val) // "in" token is skipped automatically + body: (do (_)* @body) // "do" and "end" tokens skipped +) +``` + +### Repetitions + +```rust +(_)* // zero or more +(_)+ // one or more +(_)? // zero or one +(identifier)* @names // capture each repeated match +``` + +## Template language + +Templates construct new AST nodes using the `tree!` and `trees!` macros. +All children in a template must be in named fields — output AST nodes are +always fully fielded. + +When used inside a `rule!` macro, the context is implicit — no explicit +`BuildCtx` argument is needed. When used standalone, they take a `BuildCtx` +as the first argument: + +```rust +// Inside rule! 
— implicit context, captures are Rust variables
+yeast::rule!(
+    (assignment left: (_) @left right: (_) @right)
+    =>
+    (assignment left: {right} right: {left})
+);
+
+// Standalone — explicit context
+let fresh = yeast::tree_builder::FreshScope::new();
+let mut ctx = BuildCtx::new(ast, &captures, &fresh);
+let id = yeast::tree!(ctx,
+    (assignment
+        left: {ctx.capture("lhs")}
+        right: {ctx.capture("rhs")}
+    )
+);
+```
+
+### `tree!` — build a single node
+
+`tree!(...)` returns a single node `Id`:
+
+```rust
+yeast::tree!(ctx,
+    (assignment
+        left: {ctx.capture("lhs")}
+        right: {ctx.capture("rhs")}
+    )
+)
+```
+
+### `trees!` — build multiple nodes
+
+`trees!(...)` returns `Vec<Id>`:
+
+```rust
+yeast::trees!(ctx,
+    (assignment left: {tmp} right: {right})
+    {..body}
+)
+```
+
+### Literal nodes
+
+`(kind "text")` creates a leaf node with fixed text content:
+
+```rust
+(identifier "each") // an identifier node whose text is "each"
+```
+
+### Computed literals
+
+`(kind #{expr})` creates a leaf node whose content is `expr.to_string()`:
+
+```rust
+(integer #{i}) // an integer node with the value of i
+(identifier #{name}) // an identifier from a Rust variable
+```
+
+### Fresh identifiers
+
+`(kind $name)` creates a leaf node with an auto-generated unique name. All
+occurrences of the same `$name` within one `BuildCtx` share the same value:
+
+```rust
+(block
+    parameters: (block_parameters
+        (identifier $tmp) // generates e.g. "$tmp-0"
+    )
+    body: (block_body
+        (assignment
+            left: {pat}
+            right: (identifier $tmp) // same "$tmp-0" value
+        )
+    )
+)
+```
+
+### Embedded Rust expressions
+
+`{expr}` embeds a Rust expression that returns a single node `Id`:
+
+```rust
+(assignment
+    left: {some_node_id} // insert a pre-built node
+    right: {rhs} // insert a captured value (inside rule!)
+)
+```
+
+`{..expr}` splices a `Vec<Id>` (or any iterable of `Id`):
+
+```rust
+yeast::trees!(ctx,
+    (assignment left: {tmp} right: {right})
+    {..extra_nodes} // splice a Vec<Id>
+)
+```
+
+Inside `rule!`, captures are Rust variables, so `{name}` inserts a
+single capture (`Id`) and `{..name}` splices a repeated capture
+(`Vec<Id>`).
+
+## Complete example: for-loop desugaring
+
+This rule rewrites Ruby's `for pat in val do body end` into
+`val.each { |tmp| pat = tmp; body }`:
+
+```rust
+let for_rule = yeast::rule!(
+    (for
+        pattern: (_) @pat
+        value: (in (_) @val)
+        body: (do (_)* @body)
+    )
+    =>
+    (call
+        receiver: {val}
+        method: (identifier "each")
+        block: (block
+            parameters: (block_parameters
+                (identifier $tmp)
+            )
+            body: (block_body
+                (assignment
+                    left: {pat}
+                    right: (identifier $tmp)
+                )
+                {..body}
+            )
+        )
+    )
+);
+```
+
+Captures from the query (`@pat`, `@val`, `@body`) become Rust variables
+automatically: single captures bind as `Id`, repeated captures (after
+`*` or `+`) as `Vec<Id>`, and optional captures (after `?`) as
+`Option<Id>`.
+
+## The `rule!` macro
+
+`rule!` combines a query and a transform into a single declaration:
+
+```rust
+// Full template form
+yeast::rule!(
+    (query_pattern field: (_) @capture)
+    =>
+    (output_template field: {capture})
+)
+
+// Shorthand form — captures become fields on the output node
+yeast::rule!(
+    (query_pattern field: (_) @capture)
+    => output_kind
+)
+```
+
+The shorthand `=> kind` form auto-generates the template, mapping each
+capture name to a field of the same name on the output node.
+
+## Integration with the extractor
+
+A YEAST desugaring pass is configured with a [`DesugaringConfig`], which
+carries the rules and an optional output node-types schema (in YAML
+format).
Attach it to a language spec to enable rewriting: + +```rust +let desugar = yeast::DesugaringConfig::new(my_rules) + .with_output_node_types_yaml(include_str!("output-node-types.yml")); + +let lang = simple::LanguageSpec { + prefix: "ruby", + ts_language: tree_sitter_ruby::LANGUAGE.into(), + node_types: tree_sitter_ruby::NODE_TYPES, + desugar: Some(desugar), + file_globs: vec!["*.rb".into()], +}; +``` + +The same YAML node-types is used for both the runtime yeast `Schema` (so +rules can refer to output-only kinds and fields) and TRAP validation (it +is converted to JSON internally). + +For the dbscheme/QL code generator, set `Language::desugar` to a +`DesugaringConfig` carrying the same YAML; the generator converts it to +JSON for downstream code generation. The `rules` field of the config is +unused at code-generation time. diff --git a/shared/yeast/src/bin/main.rs b/shared/yeast/src/bin/main.rs new file mode 100644 index 000000000000..975c8e8b25f5 --- /dev/null +++ b/shared/yeast/src/bin/main.rs @@ -0,0 +1,26 @@ +use clap::Parser; + +#[derive(Parser)] +#[clap(name = "yeast", about = "yeast elaborates abstract syntax trees")] +struct Cli { + file: String, + #[clap(default_value = "ruby")] + language: String, +} + +fn get_language(language: &str) -> tree_sitter::Language { + match language { + "ruby" => tree_sitter_ruby::LANGUAGE.into(), + "python" => tree_sitter_python::LANGUAGE.into(), + _ => panic!("Unsupported language: {language}"), + } +} + +fn main() { + let args = Cli::parse(); + let language = get_language(&args.language); + let source = std::fs::read_to_string(&args.file).unwrap(); + let runner = yeast::Runner::new(language, &[]); + let ast = runner.run(&source).unwrap(); + println!("{}", ast.print(&source, ast.get_root())); +} diff --git a/shared/yeast/src/bin/node_types_yaml.rs b/shared/yeast/src/bin/node_types_yaml.rs new file mode 100644 index 000000000000..bc392ecb1f66 --- /dev/null +++ b/shared/yeast/src/bin/node_types_yaml.rs @@ -0,0 +1,51 @@ +use 
clap::Parser; +use std::io::Read; + +#[derive(Parser)] +#[clap( + name = "node-types-yaml", + about = "Convert between YAML and JSON node-types formats" +)] +struct Cli { + /// Input file (reads from stdin if not provided) + input: Option, + + /// Convert from JSON to YAML (default is YAML to JSON) + #[arg(long)] + from_json: bool, +} + +fn main() { + let args = Cli::parse(); + + let input = match &args.input { + Some(path) => std::fs::read_to_string(path).unwrap_or_else(|e| { + eprintln!("Error reading {path}: {e}"); + std::process::exit(1); + }), + None => { + let mut buf = String::new(); + std::io::stdin() + .read_to_string(&mut buf) + .unwrap_or_else(|e| { + eprintln!("Error reading stdin: {e}"); + std::process::exit(1); + }); + buf + } + }; + + let result = if args.from_json { + yeast::node_types_yaml::convert_from_json(&input) + } else { + yeast::node_types_yaml::convert(&input) + }; + + match result { + Ok(output) => print!("{output}"), + Err(e) => { + eprintln!("Error: {e}"); + std::process::exit(1); + } + } +} diff --git a/shared/yeast/src/build.rs b/shared/yeast/src/build.rs new file mode 100644 index 000000000000..bee4c4f7d034 --- /dev/null +++ b/shared/yeast/src/build.rs @@ -0,0 +1,91 @@ +use std::collections::BTreeMap; + +use crate::captures::Captures; +use crate::tree_builder::FreshScope; +use crate::{Ast, FieldId, Id, NodeContent}; + +/// Context for building new AST nodes during a transformation. +/// +/// Used by the `tree!` and `trees!` macros. Holds a mutable reference to the +/// AST, a reference to the captures from a query match, and a `FreshScope` for +/// generating unique identifiers. +pub struct BuildCtx<'a> { + pub ast: &'a mut Ast, + pub captures: &'a Captures, + pub fresh: &'a FreshScope, + /// Source range of the matched node, inherited by synthetic nodes. 
+ pub source_range: Option, +} + +impl<'a> BuildCtx<'a> { + pub fn new(ast: &'a mut Ast, captures: &'a Captures, fresh: &'a FreshScope) -> Self { + Self { + ast, + captures, + fresh, + source_range: None, + } + } + + pub fn with_source_range( + ast: &'a mut Ast, + captures: &'a Captures, + fresh: &'a FreshScope, + source_range: Option, + ) -> Self { + Self { + ast, + captures, + fresh, + source_range, + } + } + + /// Look up a capture variable, returning its node Id. + pub fn capture(&self, name: &str) -> Id { + self.captures + .get_var(name) + .unwrap_or_else(|e| panic!("build: {e}")) + } + + /// Get all values of a repeated capture variable. + pub fn capture_all(&self, name: &str) -> Vec { + self.captures.get_all(name) + } + + /// Create a named AST node with the given kind and fields. + pub fn node(&mut self, kind: &str, fields: Vec<(&str, Vec)>) -> Id { + let kind_id = self + .ast + .id_for_node_kind(kind) + .unwrap_or_else(|| panic!("build: node kind '{kind}' not found")); + let mut field_map: BTreeMap> = BTreeMap::new(); + for (name, ids) in fields { + let field_id = self + .ast + .field_id_for_name(name) + .unwrap_or_else(|| panic!("build: field '{name}' not found")); + field_map.entry(field_id).or_default().extend(ids); + } + self.ast.create_node_with_range( + kind_id, + NodeContent::DynamicString(String::new()), + field_map, + true, + self.source_range, + ) + } + + /// Create a leaf node with a fixed string content. + pub fn literal(&mut self, kind: &'static str, value: &str) -> Id { + self.ast + .create_named_token_with_range(kind, value.to_string(), self.source_range) + } + + /// Create a leaf node with an auto-generated unique name. 
+ pub fn fresh(&mut self, kind: &'static str, name: &str) -> Id { + let generated = self.fresh.resolve(name); + self.ast + .create_named_token_with_range(kind, generated, self.source_range) + } +} diff --git a/shared/yeast/src/captures.rs b/shared/yeast/src/captures.rs new file mode 100644 index 000000000000..a92c5096e94e --- /dev/null +++ b/shared/yeast/src/captures.rs @@ -0,0 +1,105 @@ +use std::collections::{BTreeMap, BTreeSet}; + +use crate::Id; + +#[derive(Debug, Clone)] +pub struct Captures { + captures: BTreeMap<&'static str, Vec>, +} + +impl Default for Captures { + fn default() -> Self { + Self::new() + } +} + +impl Captures { + pub fn new() -> Self { + Captures { + captures: BTreeMap::new(), + } + } + + pub fn get_var(&self, key: &str) -> Result { + let ids = self.captures.get(key); + if let Some(ids) = ids { + if ids.len() == 1 { + Ok(ids[0]) + } else { + Err(format!( + "Variable {} has {} matches, use * to allow repetition", + key, + ids.len() + )) + } + } else { + Err(format!("No variable named {key}")) + } + } + + /// Get all values of a capture variable (for repeated captures). + pub fn get_all(&self, key: &str) -> Vec { + self.captures.get(key).cloned().unwrap_or_default() + } + + /// Get an optional capture variable. Returns None if unmatched, + /// Some(id) if matched exactly once. 
+ pub fn get_opt(&self, key: &str) -> Option { + self.captures + .get(key) + .and_then(|ids| if ids.len() == 1 { Some(ids[0]) } else { None }) + } + + pub fn insert(&mut self, key: &'static str, id: Id) { + self.captures.entry(key).or_default().push(id); + } + + pub fn map_captures(&mut self, kind: &str, f: &mut impl FnMut(Id) -> Id) { + if let Some(ids) = self.captures.get_mut(kind) { + for id in ids { + *id = f(*id); + } + } + } + pub fn map_captures_to(&mut self, from: &str, to: &'static str, f: &mut impl FnMut(Id) -> Id) { + if let Some(from_ids) = self.captures.get(from) { + let new_values = from_ids.iter().copied().map(f).collect(); + self.captures.insert(to, new_values); + } + } + + pub fn merge(&mut self, other: &Captures) { + for (key, ids) in &other.captures { + self.captures.entry(key).or_default().extend(ids); + } + } + + pub fn un_star<'a>( + &'a self, + children: &'a BTreeSet<&'static str>, + ) -> Result + 'a, String> { + let mut id_iter = children.iter(); + + if let Some(fst) = id_iter.next() { + let repeats = self + .captures + .get(fst) + .ok_or_else(|| format!("No variable named {fst}"))? 
+ .len(); + // TODO: better error on missing capture + if id_iter.any(|id| self.captures.get(id).map(Vec::len).unwrap_or(0) != repeats) { + return Err("Repeated captures must have the same number of matches".to_string()); + } + Ok((0..repeats).map(move |iter| { + let mut new_vars: Captures = Captures::new(); + for id in children { + let child_capture = self.captures.get(id).unwrap()[iter]; + new_vars.captures.insert(id, vec![child_capture]); + } + new_vars + })) + } else { + Err("Repeated captures must have at least one capture".to_string()) + } + } +} diff --git a/shared/yeast/src/cursor.rs b/shared/yeast/src/cursor.rs new file mode 100644 index 000000000000..ef5f6d94f259 --- /dev/null +++ b/shared/yeast/src/cursor.rs @@ -0,0 +1,8 @@ +pub trait Cursor<'a, T, N, F> { + fn node(&self) -> &'a N; + fn field_id(&self) -> Option; + fn field_name(&self) -> Option<&'static str>; + fn goto_first_child(&mut self) -> bool; + fn goto_next_sibling(&mut self) -> bool; + fn goto_parent(&mut self) -> bool; +} diff --git a/shared/yeast/src/dump.rs b/shared/yeast/src/dump.rs new file mode 100644 index 000000000000..99ba019cc3ea --- /dev/null +++ b/shared/yeast/src/dump.rs @@ -0,0 +1,181 @@ +use std::fmt::Write; + +use crate::{Ast, Node, NodeContent, CHILD_FIELD}; + +/// Options for controlling AST dump output. +pub struct DumpOptions { + /// Whether to include source locations in the output. + pub show_locations: bool, + /// Whether to include source text for leaf nodes. + pub show_content: bool, +} + +impl Default for DumpOptions { + fn default() -> Self { + Self { + show_locations: false, + show_content: true, + } + } +} + +/// Dump a yeast AST as a human-readable indented text format. 
+/// +/// Output format: +/// ```text +/// program +/// assignment +/// left: +/// left_assignment_list +/// identifier "x" +/// identifier "y" +/// right: +/// call +/// method: +/// identifier "foo" +/// ``` +pub fn dump_ast(ast: &Ast, root: usize, source: &str) -> String { + dump_ast_with_options(ast, root, source, &DumpOptions::default()) +} + +pub fn dump_ast_with_options( + ast: &Ast, + root: usize, + source: &str, + options: &DumpOptions, +) -> String { + let mut out = String::new(); + dump_node(ast, root, source, options, 0, &mut out); + out +} + +fn dump_node( + ast: &Ast, + id: usize, + source: &str, + options: &DumpOptions, + indent: usize, + out: &mut String, +) { + let node = match ast.get_node(id) { + Some(n) => n, + None => return, + }; + + let prefix = " ".repeat(indent); + + // Node kind + write!(out, "{}{}", prefix, node.kind_name()).unwrap(); + + // Location + if options.show_locations { + let start = node.start_position(); + let end = node.end_position(); + write!( + out, + " [{},{}]-[{},{}]", + start.row + 1, + start.column + 1, + end.row + 1, + end.column + 1 + ) + .unwrap(); + } + + // Content for leaf nodes + if options.show_content && node.is_named() && is_leaf(node) { + let content = node_content(node, source); + if !content.is_empty() { + write!(out, " {content:?}").unwrap(); + } + } + + writeln!(out).unwrap(); + + // Named fields first + for (&field_id, children) in &node.fields { + if field_id == CHILD_FIELD { + continue; // Handle unnamed children last + } + let field_name = ast.field_name_for_id(field_id).unwrap_or("?"); + if children.len() == 1 { + write!(out, "{prefix} {field_name}:").unwrap(); + // Inline single child + let child = ast.get_node(children[0]); + if child.is_some_and(is_leaf) { + write!(out, " ").unwrap(); + dump_node_inline(ast, children[0], source, options, out); + } else { + writeln!(out).unwrap(); + dump_node(ast, children[0], source, options, indent + 2, out); + } + } else { + writeln!(out, "{prefix} 
{field_name}:").unwrap(); + for &child_id in children { + dump_node(ast, child_id, source, options, indent + 2, out); + } + } + } + + // Unnamed children — skip unnamed tokens (keywords, punctuation) + if let Some(children) = node.fields.get(&CHILD_FIELD) { + for &child_id in children { + if let Some(child) = ast.get_node(child_id) { + if child.is_named() { + dump_node(ast, child_id, source, options, indent + 1, out); + } + } + } + } +} + +/// Dump a leaf node inline (no newline prefix, caller provides context). +fn dump_node_inline(ast: &Ast, id: usize, source: &str, options: &DumpOptions, out: &mut String) { + let node = match ast.get_node(id) { + Some(n) => n, + None => return, + }; + + write!(out, "{}", node.kind_name()).unwrap(); + + if options.show_locations { + let start = node.start_position(); + let end = node.end_position(); + write!( + out, + " [{},{}]-[{},{}]", + start.row + 1, + start.column + 1, + end.row + 1, + end.column + 1 + ) + .unwrap(); + } + + if options.show_content && node.is_named() { + let content = node_content(node, source); + if !content.is_empty() { + write!(out, " {content:?}").unwrap(); + } + } + + writeln!(out).unwrap(); +} + +fn is_leaf(node: &Node) -> bool { + node.fields.is_empty() +} + +fn node_content(node: &Node, source: &str) -> String { + match &node.content { + NodeContent::DynamicString(s) if !s.is_empty() => s.clone(), + _ => { + let range = node.byte_range(); + if range.start < source.len() && range.end <= source.len() { + source[range.start..range.end].to_string() + } else { + String::new() + } + } + } +} diff --git a/shared/yeast/src/lib.rs b/shared/yeast/src/lib.rs new file mode 100644 index 000000000000..46629e198406 --- /dev/null +++ b/shared/yeast/src/lib.rs @@ -0,0 +1,727 @@ +use std::collections::BTreeMap; + +extern crate self as yeast; + +use serde::Serialize; +use serde_json::{json, Value}; + +pub mod build; +pub mod captures; +pub mod cursor; +pub mod dump; +pub mod node_types_yaml; +pub mod query; +mod range; 
+pub mod schema; +pub mod tree_builder; +mod visitor; + +pub use yeast_macros::{query, rule, tree, trees}; + +use captures::Captures; +pub use cursor::Cursor; +use query::QueryNode; + +/// Node ids are indexes into the arena +type Id = usize; + +/// Field and Kind ids are provided by tree-sitter +type FieldId = u16; +type KindId = u16; + +pub const CHILD_FIELD: u16 = u16::MAX; + +#[derive(Debug)] +pub struct AstCursor<'a> { + ast: &'a Ast, + /// A stack of parents, along with iterators for their children + parents: Vec<(&'a Node, ChildrenIter<'a>)>, + node: &'a Node, +} + +impl<'a> AstCursor<'a> { + pub fn new(ast: &'a Ast) -> Self { + // TODO: handle non-zero root + let node = ast.get_node(ast.root).unwrap(); + Self { + ast, + parents: vec![], + node, + } + } + + fn goto_next_sibling_opt(&mut self) -> Option<()> { + self.node = self.parents.last_mut()?.1.next()?; + Some(()) + } + + fn goto_first_child_opt(&mut self) -> Option<()> { + let parent = self.node; + let mut children = ChildrenIter::new(self.ast, parent); + let first_child = children.next()?; + self.node = first_child; + self.parents.push((parent, children)); + Some(()) + } + + fn goto_parent_opt(&mut self) -> Option<()> { + self.node = self.parents.pop()?.0; + Some(()) + } +} +impl<'a> Cursor<'a, Ast, Node, FieldId> for AstCursor<'a> { + fn node(&self) -> &'a Node { + self.node + } + + fn field_id(&self) -> Option { + let (_, children) = self.parents.last()?; + children.current_field() + } + + fn field_name(&self) -> Option<&'static str> { + if self.field_id() == Some(CHILD_FIELD) { + None + } else { + self.field_id() + .and_then(|id| self.ast.field_name_for_id(id)) + } + } + + fn goto_first_child(&mut self) -> bool { + self.goto_first_child_opt().is_some() + } + + fn goto_next_sibling(&mut self) -> bool { + self.goto_next_sibling_opt().is_some() + } + + fn goto_parent(&mut self) -> bool { + self.goto_parent_opt().is_some() + } +} + +/// An iterator over all the child nodes of a node. 
+#[derive(Debug)] +struct ChildrenIter<'a> { + ast: &'a Ast, + current_field: Option, + fields: std::collections::btree_map::Iter<'a, FieldId, Vec>, + field_children: Option>, +} + +impl<'a> ChildrenIter<'a> { + fn new(ast: &'a Ast, node: &'a Node) -> Self { + Self { + ast, + current_field: None, + fields: node.fields.iter(), + field_children: None, + } + } + + fn get_node(&self, id: Id) -> &'a Node { + self.ast.get_node(id).unwrap() + } + + fn current_field(&self) -> Option { + self.current_field + } +} + +impl<'a> Iterator for ChildrenIter<'a> { + type Item = &'a Node; + + fn next(&mut self) -> Option { + match self.field_children.as_mut() { + None => match self.fields.next() { + Some((field, children)) => { + self.current_field = Some(*field); + self.field_children = Some(children.iter()); + self.next() + } + None => None, + }, + Some(children) => match children.next() { + None => match self.fields.next() { + None => None, + Some((field, children)) => { + self.current_field = Some(*field); + self.field_children = Some(children.iter()); + self.next() + } + }, + Some(child_id) => Some(self.get_node(*child_id)), + }, + } + } +} + +/// Our AST +pub struct Ast { + root: Id, + nodes: Vec, + schema: schema::Schema, +} + +impl std::fmt::Debug for Ast { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Ast") + .field("root", &self.root) + .field("nodes", &self.nodes.len()) + .finish() + } +} + +impl Ast { + /// Construct an AST from a TS tree + pub fn from_tree(language: tree_sitter::Language, tree: &tree_sitter::Tree) -> Self { + let schema = schema::Schema::from_language(&language); + Self::from_tree_with_schema(schema, tree, &language) + } + + pub fn from_tree_with_schema( + schema: schema::Schema, + tree: &tree_sitter::Tree, + language: &tree_sitter::Language, + ) -> Self { + let mut visitor = visitor::Visitor::new(language.clone()); + visitor.visit(tree); + + visitor.build_with_schema(schema) + } + + pub fn walk(&self) -> 
AstCursor { + AstCursor::new(self) + } + + pub fn nodes(&self) -> &[Node] { + &self.nodes + } + + pub fn get_root(&self) -> Id { + self.root + } + + pub fn set_root(&mut self, root: Id) { + self.root = root; + } + + pub fn get_node(&self, id: Id) -> Option<&Node> { + self.nodes.get(id) + } + + pub fn print(&self, source: &str, root_id: Id) -> Value { + let root = &self.nodes()[root_id]; + self.print_node(root, source) + } + + pub fn create_node( + &mut self, + kind: KindId, + content: NodeContent, + fields: BTreeMap>, + is_named: bool, + ) -> Id { + self.create_node_with_range(kind, content, fields, is_named, None) + } + + pub fn create_node_with_range( + &mut self, + kind: KindId, + content: NodeContent, + fields: BTreeMap>, + is_named: bool, + source_range: Option, + ) -> Id { + let id = self.nodes.len(); + self.nodes.push(Node { + id, + kind, + kind_name: self.schema.node_kind_for_id(kind).unwrap(), + fields, + content, + is_missing: false, + is_error: false, + is_extra: false, + is_named, + source_range, + }); + id + } + + pub fn create_named_token(&mut self, kind: &'static str, content: String) -> Id { + self.create_named_token_with_range(kind, content, None) + } + + pub fn create_named_token_with_range( + &mut self, + kind: &'static str, + content: String, + source_range: Option, + ) -> Id { + let kind_id = self.schema.id_for_node_kind(kind).unwrap_or_else(|| { + panic!("create_named_token: node kind '{kind}' not found in schema") + }); + let id = self.nodes.len(); + self.nodes.push(Node { + id, + kind: kind_id, + kind_name: kind, + is_named: true, + is_missing: false, + is_error: false, + source_range, + is_extra: false, + fields: BTreeMap::new(), + content: NodeContent::DynamicString(content), + }); + id + } + + pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { + self.schema.field_name_for_id(id) + } + + pub fn field_id_for_name(&self, name: &str) -> Option { + self.schema.field_id_for_name(name) + } + + /// Print a node for debugging + 
fn print_node(&self, node: &Node, source: &str) -> Value { + let fields: BTreeMap<&'static str, Vec> = node + .fields + .iter() + .map(|(field_id, nodes)| { + let field_name = if field_id == &CHILD_FIELD { + "rest" + } else { + self.field_name_for_id(*field_id).unwrap() + }; + let nodes: Vec = nodes + .iter() + .map(|id| self.print_node(self.get_node(*id).unwrap(), source)) + .collect(); + (field_name, nodes) + }) + .collect(); + let mut value = BTreeMap::new(); + let kind = self.schema.node_kind_for_id(node.kind).unwrap(); + let content = match &node.content { + NodeContent::Range(range) => source[range.start_byte..range.end_byte].to_string(), + NodeContent::String(s) => s.to_string(), + NodeContent::DynamicString(s) => s.clone(), + }; + if fields.is_empty() { + value.insert(kind, json!(content)); + } else { + let mut fields: BTreeMap<_, _> = + fields.into_iter().map(|(k, v)| (k, json!(v))).collect(); + fields.insert("content", json!(content)); + value.insert(kind, json!(fields)); + } + json!(value) + } + + pub fn id_for_node_kind(&self, kind: &str) -> Option { + let id = self.schema.id_for_node_kind(kind).unwrap_or(0); + if id == 0 { + None + } else { + Some(id) + } + } + + fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { + let id = self.schema.id_for_unnamed_node_kind(kind).unwrap_or(0); + if id == 0 { + None + } else { + Some(id) + } + } +} + +/// A node in our AST +#[derive(PartialEq, Eq, Debug, Clone, Serialize)] +pub struct Node { + id: Id, + kind: KindId, + kind_name: &'static str, + pub(crate) fields: BTreeMap>, + pub(crate) content: NodeContent, + /// For synthetic nodes, the source range of the original node they + /// were desugared from. Used for location information in TRAP output. 
+ #[serde(skip)] + source_range: Option, + is_named: bool, + is_missing: bool, + is_extra: bool, + is_error: bool, +} + +impl Node { + pub fn id(&self) -> Id { + self.id + } + + pub fn kind(&self) -> &'static str { + self.kind_name + } + + pub fn kind_name(&self) -> &'static str { + self.kind_name + } + + pub fn is_named(&self) -> bool { + self.is_named + } + + pub fn is_missing(&self) -> bool { + self.is_missing + } + + pub fn is_extra(&self) -> bool { + self.is_extra + } + + pub fn is_error(&self) -> bool { + self.is_error + } + + fn fake_point(&self) -> tree_sitter::Point { + tree_sitter::Point { row: 0, column: 0 } + } + + pub fn start_position(&self) -> tree_sitter::Point { + match self.content { + NodeContent::Range(range) => range.start_point, + _ => self + .source_range + .map_or_else(|| self.fake_point(), |r| r.start_point), + } + } + + pub fn end_position(&self) -> tree_sitter::Point { + match self.content { + NodeContent::Range(range) => range.end_point, + _ => self + .source_range + .map_or_else(|| self.fake_point(), |r| r.end_point), + } + } + + pub fn start_byte(&self) -> usize { + match self.content { + NodeContent::Range(range) => range.start_byte, + _ => self.source_range.map_or(0, |r| r.start_byte), + } + } + + pub fn end_byte(&self) -> usize { + match self.content { + NodeContent::Range(range) => range.end_byte, + _ => self.source_range.map_or(0, |r| r.end_byte), + } + } + + pub fn byte_range(&self) -> std::ops::Range { + self.start_byte()..self.end_byte() + } + + pub fn opt_string_content(&self) -> Option { + match &self.content { + NodeContent::Range(_range) => None, + NodeContent::String(s) => Some(s.to_string()), + NodeContent::DynamicString(s) => Some(s.to_string()), + } + } +} + +/// The contents of a node is either a range in the original source file, +/// or a new string if the node is synthesized. 
+#[derive(PartialEq, Eq, Debug, Clone, Serialize)] +pub enum NodeContent { + Range(#[serde(with = "range::Range")] tree_sitter::Range), + String(&'static str), + DynamicString(String), +} + +impl From<&'static str> for NodeContent { + fn from(value: &'static str) -> Self { + NodeContent::String(value) + } +} + +impl From for NodeContent { + fn from(value: tree_sitter::Range) -> Self { + NodeContent::Range(value) + } +} + +/// The transform function for a rule: takes the AST, captured variables, a +/// fresh-name scope, and the source range of the matched node, and returns +/// the IDs of the replacement nodes. +pub type Transform = Box< + dyn Fn(&mut Ast, Captures, &tree_builder::FreshScope, Option) -> Vec + + Send + + Sync, +>; + +pub struct Rule { + query: QueryNode, + transform: Transform, +} + +impl Rule { + pub fn new(query: QueryNode, transform: Transform) -> Self { + Self { query, transform } + } + + fn try_rule( + &self, + ast: &mut Ast, + node: Id, + fresh: &tree_builder::FreshScope, + ) -> Result>, String> { + let mut captures = Captures::new(); + if self.query.do_match(ast, node, &mut captures)? { + fresh.next_scope(); + let source_range = ast.get_node(node).and_then(|n| match n.content { + NodeContent::Range(r) => Some(r), + _ => n.source_range, + }); + Ok(Some((self.transform)(ast, captures, fresh, source_range))) + } else { + Ok(None) + } + } +} + +const MAX_REWRITE_DEPTH: usize = 100; + +/// Index of rules by their root query kind for fast lookup. +struct RuleIndex<'a> { + /// Rules indexed by root node kind name. + by_kind: BTreeMap<&'static str, Vec<&'a Rule>>, + /// Rules with wildcard queries (Any) that apply to all nodes. 
+ wildcard: Vec<&'a Rule>, +} + +impl<'a> RuleIndex<'a> { + fn new(rules: &'a [Rule]) -> Self { + let mut by_kind: BTreeMap<&'static str, Vec<&'a Rule>> = BTreeMap::new(); + let mut wildcard = Vec::new(); + for rule in rules { + match rule.query.root_kind() { + Some(kind) => by_kind.entry(kind).or_default().push(rule), + None => wildcard.push(rule), + } + } + Self { by_kind, wildcard } + } + + fn rules_for_kind(&self, kind: &str) -> impl Iterator { + self.by_kind + .get(kind) + .into_iter() + .flat_map(|v| v.iter()) + .chain(self.wildcard.iter()) + } +} + +fn apply_rules( + rules: &[Rule], + ast: &mut Ast, + id: Id, + fresh: &tree_builder::FreshScope, +) -> Result, String> { + let index = RuleIndex::new(rules); + apply_rules_inner(&index, ast, id, fresh, 0) +} + +fn apply_rules_inner( + index: &RuleIndex, + ast: &mut Ast, + id: Id, + fresh: &tree_builder::FreshScope, + rewrite_depth: usize, +) -> Result, String> { + if rewrite_depth > MAX_REWRITE_DEPTH { + return Err(format!( + "Desugaring exceeded maximum rewrite depth ({MAX_REWRITE_DEPTH}). \ + This likely indicates a non-terminating rule cycle." + )); + } + + let node_kind = ast.get_node(id).map(|n| n.kind()).unwrap_or(""); + for rule in index.rules_for_kind(node_kind) { + if let Some(result_node) = rule.try_rule(ast, id, fresh)? 
{ + let mut results = Vec::new(); + for node in result_node { + results.extend(apply_rules_inner( + index, + ast, + node, + fresh, + rewrite_depth + 1, + )?); + } + return Ok(results); + } + } + + // Collect fields before recursing (avoids borrowing ast immutably during mutation) + let field_entries: Vec<(FieldId, Vec)> = ast.nodes[id] + .fields + .iter() + .map(|(&fid, children)| (fid, children.clone())) + .collect(); + + // recursively descend into all the fields + // Child traversal does not increment rewrite depth + let mut changed = false; + let mut new_fields = BTreeMap::new(); + for (field_id, children) in field_entries { + let mut new_children = Vec::new(); + for child_id in children { + let result = apply_rules_inner(index, ast, child_id, fresh, rewrite_depth)?; + if result.len() != 1 || result[0] != child_id { + changed = true; + } + new_children.extend(result); + } + new_fields.insert(field_id, new_children); + } + + if !changed { + return Ok(vec![id]); + } + + let mut node = ast.nodes[id].clone(); + node.fields = new_fields; + node.id = ast.nodes.len(); + ast.nodes.push(node); + Ok(vec![ast.nodes.len() - 1]) +} + +/// Configuration for a desugaring pass: a set of rules and an optional +/// output node-types schema (in YAML format). +/// +/// When attached to a `LanguageSpec` (in the shared tree-sitter extractor), +/// enables yeast-based AST rewriting before TRAP extraction. The same YAML +/// is used both to validate TRAP output (via JSON conversion) and to +/// resolve output-only node kinds and fields at runtime. +pub struct DesugaringConfig { + /// Rules to apply during desugaring. + pub rules: Vec, + /// Output node-types in YAML format. If `None`, the input grammar's + /// node types are used (i.e. the desugared AST has the same node types + /// as the tree-sitter grammar). 
+ pub output_node_types_yaml: Option<&'static str>, +} + +impl DesugaringConfig { + pub fn new(rules: Vec) -> Self { + Self { + rules, + output_node_types_yaml: None, + } + } + + pub fn with_output_node_types_yaml(mut self, yaml: &'static str) -> Self { + self.output_node_types_yaml = Some(yaml); + self + } + + /// Build the yeast `Schema` for this config, given the input language. + /// If `output_node_types_yaml` is `None`, returns the schema derived from + /// the input grammar. + pub fn build_schema(&self, language: &tree_sitter::Language) -> Result { + match self.output_node_types_yaml { + Some(yaml) => node_types_yaml::schema_from_yaml_with_language(yaml, language), + None => Ok(schema::Schema::from_language(language)), + } + } +} + +pub struct Runner<'a> { + language: tree_sitter::Language, + schema: schema::Schema, + rules: &'a [Rule], +} + +impl<'a> Runner<'a> { + /// Create a runner using the input grammar's schema for output. + pub fn new(language: tree_sitter::Language, rules: &'a [Rule]) -> Self { + let schema = schema::Schema::from_language(&language); + Self { + language, + schema, + rules, + } + } + + /// Create a runner with separate input language and output schema. + pub fn with_schema( + language: tree_sitter::Language, + schema: &schema::Schema, + rules: &'a [Rule], + ) -> Self { + Self { + language, + schema: schema.clone(), + rules, + } + } + + /// Create a runner from a [`DesugaringConfig`]. 
+ pub fn from_config( + language: tree_sitter::Language, + config: &'a DesugaringConfig, + ) -> Result { + let schema = config.build_schema(&language)?; + Ok(Self { + language, + schema, + rules: &config.rules, + }) + } + + pub fn run_from_tree(&self, tree: &tree_sitter::Tree) -> Result { + let fresh = tree_builder::FreshScope::new(); + let mut ast = Ast::from_tree_with_schema(self.schema.clone(), tree, &self.language); + let root = ast.get_root(); + let res = apply_rules(self.rules, &mut ast, root, &fresh)?; + if res.len() != 1 { + return Err(format!( + "Expected exactly one result node, got {}", + res.len() + )); + } + ast.set_root(res[0]); + Ok(ast) + } + + pub fn run(&self, input: &str) -> Result { + let fresh = tree_builder::FreshScope::new(); + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&self.language) + .map_err(|e| format!("Failed to set language: {e}"))?; + let tree = parser + .parse(input, None) + .ok_or_else(|| "Failed to parse input".to_string())?; + let mut ast = Ast::from_tree_with_schema(self.schema.clone(), &tree, &self.language); + let root = ast.get_root(); + let res = apply_rules(self.rules, &mut ast, root, &fresh)?; + if res.len() != 1 { + return Err(format!( + "Expected exactly one result node, got {}", + res.len() + )); + } + ast.set_root(res[0]); + Ok(ast) + } +} diff --git a/shared/yeast/src/node_types_yaml.rs b/shared/yeast/src/node_types_yaml.rs new file mode 100644 index 000000000000..d321ba8a2cf0 --- /dev/null +++ b/shared/yeast/src/node_types_yaml.rs @@ -0,0 +1,722 @@ +/// Converts a YAML node-types file to the tree-sitter `node-types.json` format. +/// +/// # YAML format +/// +/// ```yaml +/// supertypes: +/// _expression: +/// - assignment +/// - binary +/// +/// named: +/// assignment: +/// left: _lhs +/// right: _expression +/// identifier: +/// +/// unnamed: +/// - "+" +/// - "end" +/// ``` +/// +/// See the crate-level docs for the full format specification. 
+use std::collections::{BTreeMap, BTreeSet}; +use std::fmt::Write; + +use serde::Deserialize; +use serde_json::json; + +/// Top-level YAML structure. +#[derive(Deserialize, Default)] +struct YamlNodeTypes { + #[serde(default)] + supertypes: BTreeMap>, + #[serde(default)] + named: BTreeMap>>, + #[serde(default)] + unnamed: Vec, +} + +/// A reference to a node type. Can be: +/// - a plain string (resolved by looking up named vs unnamed) +/// - a map `{unnamed: "name"}` to force unnamed interpretation +#[derive(Deserialize, Debug, Clone)] +#[serde(untagged)] +enum TypeRef { + Name(String), + Explicit { unnamed: String }, +} + +/// A field value: either a single type ref or a list of them. +#[derive(Deserialize, Debug, Clone)] +#[serde(untagged)] +enum TypeRefOrList { + Single(TypeRef), + List(Vec), +} + +impl TypeRefOrList { + fn into_vec(self) -> Vec { + match self { + TypeRefOrList::Single(t) => vec![t], + TypeRefOrList::List(v) => v, + } + } +} + +/// Parsed field name: base name + multiplicity markers. +struct FieldSpec { + name: Option, // None for $children + multiple: bool, + required: bool, +} + +fn parse_field_name(raw: &str) -> FieldSpec { + let is_children = + raw == "$children" || raw == "$children?" || raw == "$children*" || raw == "$children+"; + + let suffix = raw.chars().last().filter(|c| matches!(c, '?' | '*' | '+')); + + let (multiple, required) = match suffix { + Some('?') => (false, false), + Some('*') => (true, false), + Some('+') => (true, true), + _ => (false, true), // bare field name = required, single + }; + + let name = if is_children { + None + } else { + let base = raw.trim_end_matches(['?', '*', '+']); + Some(base.to_string()) + }; + + FieldSpec { + name, + multiple, + required, + } +} + +/// Resolve a TypeRef to a (type, named) pair, given the sets of known named +/// and unnamed types. 
+fn resolve_type_ref( + type_ref: &TypeRef, + named_types: &BTreeSet, + unnamed_types: &BTreeSet, +) -> serde_json::Value { + match type_ref { + TypeRef::Explicit { unnamed } => { + json!({"type": unnamed, "named": false}) + } + TypeRef::Name(name) => { + let is_named = named_types.contains(name); + let is_unnamed = unnamed_types.contains(name); + + if is_named && is_unnamed { + // Ambiguous: default to named + json!({"type": name, "named": true}) + } else if is_unnamed { + json!({"type": name, "named": false}) + } else { + // Named, or unknown (assume named) + json!({"type": name, "named": true}) + } + } + } +} + +/// Convert YAML string to node-types JSON string. +pub fn convert(yaml_input: &str) -> Result { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + + // Build the sets of known named and unnamed types for resolution. + let mut named_types = BTreeSet::new(); + for name in yaml.supertypes.keys() { + named_types.insert(name.clone()); + } + for name in yaml.named.keys() { + named_types.insert(name.clone()); + } + let unnamed_types: BTreeSet = yaml.unnamed.iter().cloned().collect(); + + let mut output = Vec::new(); + + // 1. Supertypes + for (name, members) in &yaml.supertypes { + let subtypes: Vec<_> = members + .iter() + .map(|m| resolve_type_ref(m, &named_types, &unnamed_types)) + .collect(); + output.push(json!({ + "type": name, + "named": true, + "subtypes": subtypes, + })); + } + + // 2. 
Named nodes + for (name, fields_opt) in &yaml.named { + let fields_map = match fields_opt { + None => { + // Leaf token: no fields, no children, no subtypes + output.push(json!({ + "type": name, + "named": true, + "fields": {}, + })); + continue; + } + Some(m) if m.is_empty() => { + output.push(json!({ + "type": name, + "named": true, + "fields": {}, + })); + continue; + } + Some(m) => m, + }; + + let mut json_fields = serde_json::Map::new(); + let mut json_children: Option = None; + + for (raw_field_name, type_refs) in fields_map { + let spec = parse_field_name(raw_field_name); + let types: Vec<_> = type_refs + .clone() + .into_vec() + .iter() + .map(|t| resolve_type_ref(t, &named_types, &unnamed_types)) + .collect(); + + // Cloning to make the borrow checker happy + let field_info = json!({ + "multiple": spec.multiple, + "required": spec.required, + "types": types, + }); + + if spec.name.is_none() { + // $children + json_children = Some(field_info); + } else { + json_fields.insert(spec.name.unwrap(), field_info); + } + } + + let mut entry = json!({ + "type": name, + "named": true, + "fields": json_fields, + }); + + if let Some(children) = json_children { + entry + .as_object_mut() + .unwrap() + .insert("children".to_string(), children); + } + + output.push(entry); + } + + // 3. Unnamed tokens + for name in &yaml.unnamed { + output.push(json!({ + "type": name, + "named": false, + })); + } + + serde_json::to_string_pretty(&output).map_err(|e| format!("Failed to serialize JSON: {e}")) +} + +/// Build a Schema from a YAML node-types string. +/// Registers all node kinds and field names found in the YAML. 
+pub fn schema_from_yaml(yaml_input: &str) -> Result { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + + let mut schema = crate::schema::Schema::new(); + + // Register all supertypes as node kinds + for name in yaml.supertypes.keys() { + schema.register_kind(name); + } + + // Register named node kinds and their fields + for (name, fields_opt) in &yaml.named { + schema.register_kind(name); + if let Some(fields) = fields_opt { + for raw_field_name in fields.keys() { + let spec = parse_field_name(raw_field_name); + if let Some(field_name) = &spec.name { + schema.register_field(field_name); + } + } + } + } + + // Register unnamed tokens as node kinds + for name in &yaml.unnamed { + schema.register_unnamed_kind(name); + } + + Ok(schema) +} + +/// Build a Schema from a YAML string, extending a tree-sitter Language. +/// The Schema inherits all field/kind names from the Language, plus any +/// additional ones defined in the YAML. 
+pub fn schema_from_yaml_with_language( + yaml_input: &str, + language: &tree_sitter::Language, +) -> Result { + let yaml: YamlNodeTypes = + serde_yaml::from_str(yaml_input).map_err(|e| format!("Failed to parse YAML: {e}"))?; + + let mut schema = crate::schema::Schema::from_language(language); + + // Register supertypes + for name in yaml.supertypes.keys() { + schema.register_kind(name); + } + + // Register named node kinds and their fields + for (name, fields_opt) in &yaml.named { + schema.register_kind(name); + if let Some(fields) = fields_opt { + for raw_field_name in fields.keys() { + let spec = parse_field_name(raw_field_name); + if let Some(field_name) = &spec.name { + schema.register_field(field_name); + } + } + } + } + + // Register unnamed tokens + for name in &yaml.unnamed { + schema.register_unnamed_kind(name); + } + + Ok(schema) +} + +// --------------------------------------------------------------------------- +// JSON → YAML conversion +// --------------------------------------------------------------------------- + +/// JSON node-types structures (mirrors tree-sitter's format). +#[derive(Deserialize)] +struct JsonNodeInfo { + #[serde(rename = "type")] + kind: String, + named: bool, + #[serde(default)] + fields: BTreeMap, + children: Option, + #[serde(default)] + subtypes: Vec, +} + +#[derive(Deserialize)] +struct JsonNodeType { + #[serde(rename = "type")] + kind: String, + named: bool, +} + +#[derive(Deserialize)] +struct JsonFieldInfo { + multiple: bool, + required: bool, + types: Vec, +} + +/// Convert a tree-sitter node-types.json string to the YAML format. +pub fn convert_from_json(json_input: &str) -> Result { + let nodes: Vec = + serde_json::from_str(json_input).map_err(|e| format!("Failed to parse JSON: {e}"))?; + + // Collect all named and unnamed types for disambiguation decisions. 
+ let mut all_named: BTreeSet = BTreeSet::new(); + let mut all_unnamed: BTreeSet = BTreeSet::new(); + for node in &nodes { + if node.named { + all_named.insert(node.kind.clone()); + } else { + all_unnamed.insert(node.kind.clone()); + } + } + + let mut supertypes: BTreeMap> = BTreeMap::new(); + let mut named: BTreeMap>> = BTreeMap::new(); + let mut unnamed: Vec = Vec::new(); + + for node in nodes { + if !node.named { + unnamed.push(node.kind); + continue; + } + + if !node.subtypes.is_empty() { + supertypes.insert(node.kind, node.subtypes); + continue; + } + + if node.fields.is_empty() && node.children.is_none() { + // Leaf token + named.insert(node.kind, None); + } else { + let mut fields = BTreeMap::new(); + for (name, info) in node.fields { + fields.insert(name, info); + } + if let Some(children) = node.children { + fields.insert("$children".to_string(), children); + } + named.insert(node.kind, Some(fields)); + } + } + + // Now emit YAML + let mut out = String::new(); + + // Supertypes + if !supertypes.is_empty() { + writeln!(out, "supertypes:").unwrap(); + for (name, members) in &supertypes { + writeln!(out, " {name}:").unwrap(); + for member in members { + let ref_str = format_type_ref(&member.kind, member.named, &all_named, &all_unnamed); + writeln!(out, " - {ref_str}").unwrap(); + } + } + writeln!(out).unwrap(); + } + + // Named + if !named.is_empty() { + writeln!(out, "named:").unwrap(); + for (name, fields_opt) in &named { + match fields_opt { + None => { + writeln!(out, " {name}:").unwrap(); + } + Some(fields) => { + writeln!(out, " {name}:").unwrap(); + for (field_name, info) in fields { + let suffix = field_suffix(info.multiple, info.required); + let yaml_name = if field_name == "$children" { + format!("$children{suffix}") + } else { + format!("{field_name}{suffix}") + }; + + let type_refs: Vec = info + .types + .iter() + .map(|t| format_type_ref(&t.kind, t.named, &all_named, &all_unnamed)) + .collect(); + + if type_refs.len() == 1 { + writeln!(out, " 
{yaml_name}: {}", type_refs[0]).unwrap(); + } else { + let list = type_refs + .iter() + .map(|s| s.as_str()) + .collect::>() + .join(", "); + writeln!(out, " {yaml_name}: [{list}]").unwrap(); + } + } + } + } + } + writeln!(out).unwrap(); + } + + // Unnamed + if !unnamed.is_empty() { + writeln!(out, "unnamed:").unwrap(); + for name in &unnamed { + writeln!(out, " - {}", force_quote(name)).unwrap(); + } + } + + Ok(out) +} + +fn field_suffix(multiple: bool, required: bool) -> &'static str { + match (multiple, required) { + (false, true) => "", + (false, false) => "?", + (true, true) => "+", + (true, false) => "*", + } +} + +/// Format a type reference for YAML output. Uses the disambiguation rule: +/// plain string if unambiguous, `{unnamed: name}` if the name exists as both +/// named and unnamed and we need the unnamed interpretation. +fn format_type_ref( + kind: &str, + named: bool, + all_named: &BTreeSet, + _all_unnamed: &BTreeSet, +) -> String { + if named { + quote_yaml(kind) + } else { + let is_also_named = all_named.contains(kind); + if is_also_named { + format!("{{unnamed: {}}}", force_quote(kind)) + } else { + force_quote(kind) + } + } +} + +/// Always wrap in double quotes. Used for unnamed node references so they're +/// visually distinct from named ones — YAML treats both forms as equivalent strings. +fn force_quote(s: &str) -> String { + format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) +} + +/// Quote a YAML string value if it contains special characters or could be +/// misinterpreted. +fn quote_yaml(s: &str) -> String { + let needs_quoting = s.is_empty() + || s.contains(|c: char| { + matches!( + c, + ':' | '{' + | '}' + | '[' + | ']' + | ',' + | '&' + | '*' + | '#' + | '?' + | '|' + | '-' + | '<' + | '>' + | '=' + | '!' 
+ | '%' + | '@' + | '`' + | '"' + | '\'' + ) + }) + || s.starts_with(' ') + || s.ends_with(' ') + || s == "true" + || s == "false" + || s == "null" + || s == "yes" + || s == "no" + || s.parse::().is_ok(); + + if needs_quoting { + format!("\"{}\"", s.replace('\\', "\\\\").replace('"', "\\\"")) + } else { + s.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_basic_conversion() { + let yaml = r#" +supertypes: + _expression: + - assignment + - binary + +named: + assignment: + left: _lhs + right: _expression + binary: + left: [_expression, _simple_numeric] + operator: ["!=", "+"] + right: _expression + argument_list: + $children*: [_expression, block_argument] + identifier: + +unnamed: + - "!=" + - "+" + - "end" +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + + // Check supertype + let expr = &result[0]; + assert_eq!(expr["type"], "_expression"); + assert_eq!(expr["named"], true); + assert_eq!(expr["subtypes"].as_array().unwrap().len(), 2); + + // Check assignment + let assign = result.iter().find(|n| n["type"] == "assignment").unwrap(); + assert_eq!(assign["fields"]["left"]["required"], true); + assert_eq!(assign["fields"]["left"]["multiple"], false); + assert_eq!(assign["fields"]["left"]["types"][0]["type"], "_lhs"); + assert_eq!(assign["fields"]["left"]["types"][0]["named"], true); + + // Check binary.operator — "!=" and "+" should resolve to unnamed + let binary = result.iter().find(|n| n["type"] == "binary").unwrap(); + let op_types = binary["fields"]["operator"]["types"].as_array().unwrap(); + assert_eq!(op_types[0]["type"], "!="); + assert_eq!(op_types[0]["named"], false); + assert_eq!(op_types[1]["type"], "+"); + assert_eq!(op_types[1]["named"], false); + + // Check argument_list has children, not a field + let arg_list = result + .iter() + .find(|n| n["type"] == "argument_list") + .unwrap(); + assert!(arg_list.get("children").is_some()); + 
assert_eq!(arg_list["children"]["multiple"], true); + assert_eq!(arg_list["children"]["required"], false); + + // Check identifier is a leaf + let ident = result.iter().find(|n| n["type"] == "identifier").unwrap(); + assert_eq!(ident["fields"].as_object().unwrap().len(), 0); + + // Check unnamed tokens + let end = result.iter().find(|n| n["type"] == "end").unwrap(); + assert_eq!(end["named"], false); + } + + #[test] + fn test_explicit_unnamed_disambiguation() { + let yaml = r#" +named: + foo: + field: [{unnamed: bar}] + +unnamed: + - bar +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + let foo = result.iter().find(|n| n["type"] == "foo").unwrap(); + assert_eq!(foo["fields"]["field"]["types"][0]["named"], false); + } + + #[test] + fn test_field_suffixes() { + let yaml = r#" +named: + test_node: + required_single: foo + optional_single?: foo + required_multiple+: foo + optional_multiple*: foo +"#; + + let json_str = convert(yaml).unwrap(); + let result: Vec = serde_json::from_str(&json_str).unwrap(); + let node = result.iter().find(|n| n["type"] == "test_node").unwrap(); + let fields = node["fields"].as_object().unwrap(); + + assert_eq!(fields["required_single"]["required"], true); + assert_eq!(fields["required_single"]["multiple"], false); + + assert_eq!(fields["optional_single"]["required"], false); + assert_eq!(fields["optional_single"]["multiple"], false); + + assert_eq!(fields["required_multiple"]["required"], true); + assert_eq!(fields["required_multiple"]["multiple"], true); + + assert_eq!(fields["optional_multiple"]["required"], false); + assert_eq!(fields["optional_multiple"]["multiple"], true); + } + + #[test] + fn test_json_to_yaml() { + let json = r#"[ + {"type": "_expression", "named": true, "subtypes": [ + {"type": "assignment", "named": true}, + {"type": "identifier", "named": true} + ]}, + {"type": "assignment", "named": true, "fields": { + "left": {"multiple": false, "required": true, 
"types": [ + {"type": "_expression", "named": true} + ]}, + "right": {"multiple": false, "required": false, "types": [ + {"type": "_expression", "named": true} + ]} + }, "children": { + "multiple": true, "required": false, "types": [ + {"type": "identifier", "named": true} + ] + }}, + {"type": "identifier", "named": true, "fields": {}}, + {"type": "=", "named": false}, + {"type": "end", "named": false} + ]"#; + + let yaml = convert_from_json(json).unwrap(); + + // Verify key structures are present + assert!(yaml.contains("supertypes:")); + assert!(yaml.contains("_expression:")); + assert!(yaml.contains("named:")); + assert!(yaml.contains("assignment:")); + assert!(yaml.contains("left:")); + assert!(yaml.contains("right?:")); + assert!(yaml.contains("$children*:")); + assert!(yaml.contains("identifier:")); + assert!(yaml.contains("unnamed:")); + assert!(yaml.contains("\"=\"")); + assert!(yaml.contains("end")); + } + + #[test] + fn test_round_trip() { + let yaml_input = r#" +supertypes: + _expression: + - assignment + - identifier + +named: + assignment: + left: _expression + right?: _expression + $children*: identifier + identifier: + +unnamed: + - "=" + - end +"#; + + // YAML → JSON → YAML + let json = convert(yaml_input).unwrap(); + let yaml_output = convert_from_json(&json).unwrap(); + // YAML → JSON again (should be identical) + let json2 = convert(&yaml_output).unwrap(); + + let v1: serde_json::Value = serde_json::from_str(&json).unwrap(); + let v2: serde_json::Value = serde_json::from_str(&json2).unwrap(); + assert_eq!(v1, v2); + } +} diff --git a/shared/yeast/src/query.rs b/shared/yeast/src/query.rs new file mode 100644 index 000000000000..223b34569190 --- /dev/null +++ b/shared/yeast/src/query.rs @@ -0,0 +1,228 @@ +use crate::{captures::Captures, Ast, Id}; + +#[derive(Debug, Clone)] +pub enum QueryNode { + Any(), + Node { + kind: &'static str, + children: Vec<(&'static str, Vec)>, + }, + UnnamedNode { + kind: &'static str, + }, + Capture { + capture: 
&'static str, + node: Box, + }, +} + +impl QueryNode { + /// Returns the root node kind this query matches, if it's specific. + /// Returns None for wildcards (Any) and captures wrapping wildcards. + pub fn root_kind(&self) -> Option<&'static str> { + match self { + QueryNode::Node { kind, .. } => Some(kind), + QueryNode::UnnamedNode { kind } => Some(kind), + QueryNode::Capture { node, .. } => node.root_kind(), + QueryNode::Any() => None, + } + } +} + +#[derive(Debug, Clone)] +pub enum QueryListElem { + Repeated { + children: Vec, + rep: Rep, + }, + SingleNode(QueryNode), +} + +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum Rep { + ZeroOrMore, + OneOrMore, + ZeroOrOne, +} + +impl QueryNode { + /// Returns true if this query only matches named nodes (not unnamed tokens). + /// Used to skip unnamed children in positional matching, matching tree-sitter + /// semantics where `(_)` only matches named nodes. + fn matches_named_only(&self) -> bool { + match self { + QueryNode::Any() => true, + QueryNode::Node { .. } => true, + QueryNode::UnnamedNode { .. } => false, + QueryNode::Capture { node, .. } => node.matches_named_only(), + } + } + + pub fn do_match(&self, ast: &Ast, node: Id, matches: &mut Captures) -> Result { + match self { + QueryNode::Any() => Ok(true), + QueryNode::Node { kind, children } => { + let node = ast.get_node(node).unwrap(); + let target_kind = ast + .id_for_node_kind(kind) + .ok_or_else(|| format!("Node kind {kind} not found in language"))?; + if node.kind != target_kind { + return Ok(false); + } + for (field, field_children) in children { + let field_id = ast + .field_id_for_name(field) + .ok_or_else(|| format!("Field {field} not found in language"))?; + let empty = Vec::new(); + let mut child_iter = + node.fields.get(&field_id).unwrap_or(&empty).iter().cloned(); + if !match_children(field_children.iter(), ast, &mut child_iter, matches)? 
{ + return Ok(false); + } + } + Ok(true) + } + QueryNode::UnnamedNode { kind } => { + let node = ast.get_node(node).unwrap(); + let target_kind = ast + .id_for_unnamed_node_kind(kind) + .ok_or_else(|| format!("unnamed Node kind {kind} not found in language"))?; + Ok(node.kind == target_kind) + } + QueryNode::Capture { + capture, + node: sub_query, + } => { + let matched = sub_query.do_match(ast, node, matches)?; + if matched { + matches.insert(capture, node); + } + Ok(matched) + } + } + } +} + +fn match_children<'a>( + child_matchers: impl Iterator, + ast: &Ast, + remaining_children: &mut (impl Iterator + Clone), + matches: &mut Captures, +) -> Result { + for child in child_matchers { + if !child.do_match(ast, remaining_children, matches)? { + return Ok(false); + } + } + Ok(true) +} + +impl QueryListElem { + fn do_match( + &self, + ast: &Ast, + remaining_children: &mut (impl Iterator + Clone), + matches: &mut Captures, + ) -> Result { + match self { + QueryListElem::Repeated { children, rep } => { + if children.is_empty() { + // Empty repetition always succeeds without consuming + return Ok(*rep != Rep::OneOrMore); + } + + let mut iters = 0; + + loop { + let matches_initial = matches.clone(); + let start = remaining_children.clone(); + let start_next = start.clone().next(); + if !match_children(children.iter(), ast, remaining_children, matches)? { + *remaining_children = start; + *matches = matches_initial; + break; + } + // Guard against zero-width matches: if the iterator + // didn't advance, break to avoid infinite looping. 
+ let current_next = remaining_children.clone().next(); + if start_next == current_next { + break; + } + iters += 1; + if *rep == Rep::ZeroOrOne { + break; + } + } + if *rep == Rep::OneOrMore && iters == 0 { + // We didn't match any children but we were supposed to + Ok(false) + } else { + Ok(true) + } + } + QueryListElem::SingleNode(sub_query) => { + if sub_query.matches_named_only() { + // Skip unnamed children, matching tree-sitter semantics + // where (_) only matches named nodes. + loop { + match remaining_children.next() { + Some(child) => { + let node = ast.get_node(child).unwrap(); + if node.is_named() { + return sub_query.do_match(ast, child, matches); + } + // Skip unnamed child, continue to next + } + None => return Ok(false), + } + } + } else if let Some(child) = remaining_children.next() { + sub_query.do_match(ast, child, matches) + } else { + Ok(false) + } + } + } + } +} + +#[cfg(test)] +mod tests { + use crate::query::*; + #[test] + fn it_works() { + let query1: QueryNode = yeast::query!((_)); + println!("{query1:?}"); + let query2 = yeast::query!((foo)); + println!("{query2:?}"); + let query3 = yeast::query!((foo child: (_))); + println!("{query3:?}"); + let query4 = yeast::query!((foo (_)*)); + println!("{query4:?}"); + let query5: QueryNode = yeast::query!((foo (_)*)); + println!("{query5:?}"); + let query6: QueryNode = yeast::query!((_) @bar); + println!("{query6:?}"); + let query7: QueryNode = yeast::query!((foo child: (_) @bar)); + println!("{query7:?}"); + let query8: QueryNode = yeast::query!( + (assignment + left: (element_reference + object: (_) @obj + (_) @index + ) + right: (_) @rhs + ) + ); + println!("{query8:?}"); + let query9 = yeast::query!( + (program + child: (assignment + left: (_) @left + right: (_) @right + ) + ) + ); + println!("{query9:?}"); + } +} diff --git a/shared/yeast/src/range.rs b/shared/yeast/src/range.rs new file mode 100644 index 000000000000..ec670b438d56 --- /dev/null +++ b/shared/yeast/src/range.rs @@ -0,0 +1,21 
@@ +//! (de)-serialize helpers for tree_sitter::Range + +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize)] +#[serde(remote = "tree_sitter::Point")] +pub struct Point { + pub row: usize, + pub column: usize, +} + +#[derive(Serialize, Deserialize)] +#[serde(remote = "tree_sitter::Range")] +pub struct Range { + pub start_byte: usize, + pub end_byte: usize, + #[serde(with = "Point")] + pub start_point: tree_sitter::Point, + #[serde(with = "Point")] + pub end_point: tree_sitter::Point, +} diff --git a/shared/yeast/src/schema.rs b/shared/yeast/src/schema.rs new file mode 100644 index 000000000000..0a33fd6e0ed4 --- /dev/null +++ b/shared/yeast/src/schema.rs @@ -0,0 +1,167 @@ +use std::collections::BTreeMap; + +use crate::{FieldId, KindId, CHILD_FIELD}; + +/// A schema defining node kinds and field names for the output AST. +/// Built from a node-types.yml file, independent of any tree-sitter grammar. +/// +/// # Memory management +/// +/// `register_field`/`register_kind`/`register_unnamed_kind` use `Box::leak` +/// to obtain `&'static str` names. This is intentional: the `&'static str` +/// names appear pervasively in `Node`, `AstCursor`, query patterns, and the +/// extractor's TRAP output, where adding a lifetime would propagate widely. +/// +/// The leak is bounded by the number of distinct kind/field names registered. +/// Schemas are expected to be constructed once per process (e.g. at extractor +/// startup) and reused. Repeated construction in long-running processes will +/// leak memory unboundedly and should be avoided. 
+#[derive(Clone)] +pub struct Schema { + field_ids: BTreeMap, + field_names: BTreeMap, + next_field_id: FieldId, + kind_ids: BTreeMap, + unnamed_kind_ids: BTreeMap, + kind_names: BTreeMap, + next_kind_id: KindId, +} + +impl Default for Schema { + fn default() -> Self { + Self::new() + } +} + +impl Schema { + pub fn new() -> Self { + Self { + field_ids: BTreeMap::new(), + field_names: BTreeMap::new(), + next_field_id: 1, // 0 is reserved + kind_ids: BTreeMap::new(), + unnamed_kind_ids: BTreeMap::new(), + kind_names: BTreeMap::new(), + next_kind_id: 1, // 0 is reserved + } + } + + /// Create a schema from a tree-sitter language, importing all its + /// known field and kind names. + pub fn from_language(language: &tree_sitter::Language) -> Self { + let mut schema = Self::new(); + // Import all field names, preserving tree-sitter's IDs + for id in 1..=language.field_count() as u16 { + if let Some(name) = language.field_name_for_id(id) { + schema.field_ids.insert(name.to_string(), id); + schema.field_names.insert(id, name); + if id >= schema.next_field_id { + schema.next_field_id = id + 1; + } + } + } + // Import all node kind names, preserving tree-sitter's IDs. + // Track named and unnamed variants separately. + // For named kinds, use the canonical ID from id_for_node_kind(name, true) + // since some languages have multiple IDs for the same named kind. 
+ for id in 0..language.node_kind_count() as u16 { + if let Some(name) = language.node_kind_for_id(id) { + if !name.is_empty() { + let is_named = language.node_kind_is_named(id); + if is_named { + let canonical_id = language.id_for_node_kind(name, true); + if canonical_id != 0 && !schema.kind_ids.contains_key(name) { + schema.kind_ids.insert(name.to_string(), canonical_id); + schema.kind_names.insert(canonical_id, name); + } + } else { + // For unnamed kinds, only insert if we don't already have one + // (some languages have multiple unnamed IDs for the same text) + schema + .unnamed_kind_ids + .entry(name.to_string()) + .or_insert(id); + } + // Always track the name for any ID we encounter + schema.kind_names.entry(id).or_insert(name); + if id >= schema.next_kind_id { + schema.next_kind_id = id + 1; + } + } + } + } + schema + } + + /// Register a field name, returning its ID. + /// If already registered, returns the existing ID. + pub fn register_field(&mut self, name: &str) -> FieldId { + if name == "child" { + return CHILD_FIELD; + } + if let Some(&id) = self.field_ids.get(name) { + return id; + } + let id = self.next_field_id; + assert!(id < CHILD_FIELD, "too many fields"); + self.next_field_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.field_ids.insert(name.to_string(), id); + self.field_names.insert(id, leaked); + id + } + + /// Register a named node kind name, returning its ID. + /// If already registered, returns the existing ID. + pub fn register_kind(&mut self, name: &str) -> KindId { + if let Some(&id) = self.kind_ids.get(name) { + return id; + } + let id = self.next_kind_id; + self.next_kind_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + id + } + + /// Register an unnamed token kind (e.g. `"="`, `"end"`), returning its ID. + /// If already registered, returns the existing ID. 
+ pub fn register_unnamed_kind(&mut self, name: &str) -> KindId { + if let Some(&id) = self.unnamed_kind_ids.get(name) { + return id; + } + let id = self.next_kind_id; + self.next_kind_id += 1; + let leaked: &'static str = Box::leak(name.to_string().into_boxed_str()); + self.unnamed_kind_ids.insert(name.to_string(), id); + self.kind_names.insert(id, leaked); + id + } + + pub fn field_id_for_name(&self, name: &str) -> Option { + if name == "child" { + return Some(CHILD_FIELD); + } + self.field_ids.get(name).copied() + } + + pub fn field_name_for_id(&self, id: FieldId) -> Option<&'static str> { + if id == CHILD_FIELD { + return Some("child"); + } + self.field_names.get(&id).copied() + } + + pub fn id_for_node_kind(&self, kind: &str) -> Option { + self.kind_ids.get(kind).copied() + } + + pub fn id_for_unnamed_node_kind(&self, kind: &str) -> Option { + self.unnamed_kind_ids.get(kind).copied() + } + + pub fn node_kind_for_id(&self, id: KindId) -> Option<&'static str> { + self.kind_names.get(&id).copied() + } +} diff --git a/shared/yeast/src/tree_builder.rs b/shared/yeast/src/tree_builder.rs new file mode 100644 index 000000000000..c735c272d283 --- /dev/null +++ b/shared/yeast/src/tree_builder.rs @@ -0,0 +1,43 @@ +use std::cell::Cell; +use std::collections::BTreeMap; + +/// Tracks fresh identifier generation during a single tree-building operation. +/// All occurrences of the same `$name` within one build share the same generated value. 
+pub struct FreshScope { + counter: Cell, + resolved: std::cell::RefCell>, +} + +impl Default for FreshScope { + fn default() -> Self { + Self::new() + } +} + +impl FreshScope { + pub fn new() -> Self { + Self { + counter: Cell::new(0), + resolved: std::cell::RefCell::new(BTreeMap::new()), + } + } + + pub fn resolve(&self, name: &str) -> String { + self.resolved + .borrow_mut() + .entry(name.to_string()) + .or_insert_with(|| { + let id = self.counter.get(); + self.counter.set(id + 1); + format!("${name}-{id}") + }) + .clone() + } + + /// Clear resolved names but keep the counter. Called between rule + /// applications so that `$tmp` in different rules gets different values + /// while the counter increases monotonically. + pub fn next_scope(&self) { + self.resolved.borrow_mut().clear(); + } +} diff --git a/shared/yeast/src/visitor.rs b/shared/yeast/src/visitor.rs new file mode 100644 index 000000000000..655aa01e6b3e --- /dev/null +++ b/shared/yeast/src/visitor.rs @@ -0,0 +1,111 @@ +use std::collections::BTreeMap; +use tree_sitter::{Language, Tree}; + +use crate::{Ast, Id, Node, NodeContent, CHILD_FIELD}; + +#[derive(Debug)] +struct VisitorNode { + inner: Node, + parent: Option, +} + +/// A type that can walk a TS tree and produce an `Ast`. 
+#[derive(Debug)] +pub(crate) struct Visitor { + nodes: Vec, + current: Option, + language: Language, +} + +impl Visitor { + pub fn new(language: Language) -> Self { + Self { + nodes: Vec::new(), + current: None, + language, + } + } + + pub fn visit(&mut self, tree: &Tree) { + let cursor = &mut tree.walk(); + self.enter_node(cursor.node()); + let mut recurse = true; + loop { + if recurse && cursor.goto_first_child() { + recurse = self.enter_node(cursor.node()); + } else { + self.leave_node(cursor.field_name(), cursor.node()); + + if cursor.goto_next_sibling() { + recurse = self.enter_node(cursor.node()); + } else if cursor.goto_parent() { + recurse = false; + } else { + break; + } + } + } + } + + pub fn build_with_schema(self, schema: crate::schema::Schema) -> Ast { + Ast { + root: self.nodes[0].inner.id, + schema, + nodes: self.nodes.into_iter().map(|n| n.inner).collect(), + } + } + + fn add_node(&mut self, n: tree_sitter::Node<'_>, content: NodeContent, is_named: bool) -> Id { + let id = self.nodes.len(); + self.nodes.push(VisitorNode { + inner: Node { + id, + kind: self.language.id_for_node_kind(n.kind(), is_named), + kind_name: n.kind(), + content, + fields: BTreeMap::new(), + is_missing: n.is_missing(), + is_named: n.is_named(), + is_extra: n.is_extra(), + is_error: n.is_error(), + source_range: None, + }, + parent: self.current, + }); + id + } + + fn enter_node(&mut self, node: tree_sitter::Node<'_>) -> bool { + let id = self.add_node(node, node.range().into(), node.is_named()); + self.current = Some(id); + true + } + + fn leave_node(&mut self, field_name: Option<&'static str>, _node: tree_sitter::Node<'_>) { + let node = self.current.map(|i| &self.nodes[i]).unwrap(); + let node_id = node.inner.id; + let node_parent = node.parent; + + if let Some(parent_id) = node.parent { + let parent = self.nodes.get_mut(parent_id).unwrap(); + if let Some(field) = field_name { + let field_id = self.language.field_id_for_name(field).unwrap().get(); + parent + .inner + 
.fields + .entry(field_id) + .or_default() + .push(node_id); + } else { + parent + .inner + .fields + .entry(CHILD_FIELD) + .or_default() + .push(node_id); + } + } + + self.current = node_parent; + } +} diff --git a/shared/yeast/tests/node-types.yml b/shared/yeast/tests/node-types.yml new file mode 100644 index 000000000000..8416d377ee80 --- /dev/null +++ b/shared/yeast/tests/node-types.yml @@ -0,0 +1,73 @@ +# Output node types for yeast test rules. +# Inspired by tree-sitter-ruby, but with all children in named fields +# (no unnamed children). This represents the desugared output schema. + +named: + program: + stmt*: [assignment, call, identifier, for, first_node, second_node] + + assignment: + left: [identifier, left_assignment_list] + right: [identifier, integer, call, element_reference] + + left_assignment_list: + item*: identifier + + element_reference: + object: identifier + index: [integer, identifier] + + for: + pattern: [identifier, left_assignment_list] + value: in + body: do + + in: + value: [identifier, call] + + do: + stmt*: [assignment, identifier, call] + + call: + receiver: [identifier, call] + method: identifier + arguments?: argument_list + block?: block + + argument_list: + argument*: [identifier, integer, call] + + block: + parameters: block_parameters + body: block_body + + block_parameters: + parameter*: identifier + + block_body: + stmt*: [assignment, identifier, call] + + identifier: + integer: + + # Output-only kinds, used by tests of chained desugaring rules. + # Neither exists in the input tree-sitter-ruby grammar. + first_node: + left: [identifier, integer] + right: [identifier, integer] + + second_node: + left: [identifier, integer] + right: [identifier, integer] + +unnamed: + - "=" + - "," + - "(" + - ")" + - "for" + - "in" + - "do" + - "end" + - "|" + - "." 
diff --git a/shared/yeast/tests/test.rs b/shared/yeast/tests/test.rs new file mode 100644 index 000000000000..e4485857bff1 --- /dev/null +++ b/shared/yeast/tests/test.rs @@ -0,0 +1,454 @@ +#![cfg(test)] + +use yeast::dump::dump_ast; +use yeast::*; + +const OUTPUT_SCHEMA_YAML: &str = include_str!("node-types.yml"); + +/// Helper: parse Ruby source with no rules, return dump. +fn parse_and_dump(input: &str) -> String { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run(input).unwrap(); + dump_ast(&ast, ast.get_root(), input) +} + +/// Helper: parse Ruby source with a custom output schema and rules, return dump. +fn run_and_dump(input: &str, rules: Vec) -> String { + let lang: tree_sitter::Language = tree_sitter_ruby::LANGUAGE.into(); + let schema = + yeast::node_types_yaml::schema_from_yaml_with_language(OUTPUT_SCHEMA_YAML, &lang).unwrap(); + let runner = Runner::with_schema(lang, &schema, &rules); + let ast = runner.run(input).unwrap(); + dump_ast(&ast, ast.get_root(), input) +} + +/// Assert that a dump equals the expected string, treating the expected +/// string as an indented multiline literal: leading/trailing blank lines +/// are stripped, and the common leading indentation is removed from every +/// line. This lets test assertions place the first line at the same +/// indentation as the rest of the body. +#[track_caller] +fn assert_dump_eq(actual: &str, expected: &str) { + let min_indent = expected + .lines() + .filter(|l| !l.trim().is_empty()) + .map(|l| l.len() - l.trim_start().len()) + .min() + .unwrap_or(0); + let dedented: String = expected + .lines() + .map(|l| { + if l.len() >= min_indent { + &l[min_indent..] 
+ } else { + l + } + }) + .collect::>() + .join("\n"); + assert_eq!(actual.trim(), dedented.trim()); +} + +// ---- Parsing tests ---- + +#[test] +fn test_parse_assignment() { + let dump = parse_and_dump("x = 1"); + assert_dump_eq( + &dump, + r#" + program + assignment + left: identifier "x" + right: integer "1" + "#, + ); +} + +#[test] +fn test_parse_multiple_assignment() { + let dump = parse_and_dump("x, y = foo()"); + assert_dump_eq( + &dump, + r#" + program + assignment + left: + left_assignment_list + identifier "x" + identifier "y" + right: + call + arguments: + argument_list + method: identifier "foo" + "#, + ); +} + +#[test] +fn test_parse_for_loop() { + let dump = parse_and_dump("for x in list do\n y\nend"); + assert_dump_eq( + &dump, + r#" + program + for + body: + do + identifier "y" + pattern: identifier "x" + value: + in + identifier "list" + "#, + ); +} + +// ---- Query tests ---- + +#[test] +fn test_query_match() { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x = 1").unwrap(); + + let query = yeast::query!( + (program + child: (assignment + left: (_) @left + right: (_) @right + ) + ) + ); + + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, ast.get_root(), &mut captures).unwrap(); + assert!(matched); + assert!(captures.get_var("left").is_ok()); + assert!(captures.get_var("right").is_ok()); +} + +#[test] +fn test_query_no_match() { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x = 1").unwrap(); + + let query = yeast::query!( + (program + child: (call + method: (_) @m + ) + ) + ); + + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, ast.get_root(), &mut captures).unwrap(); + assert!(!matched); +} + +#[test] +fn test_query_repeated_capture() { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x, y, z = 1").unwrap(); + + let query = 
yeast::query!( + (assignment + left: (left_assignment_list + (identifier)* @names + ) + ) + ); + + // Match against the assignment node (first named child of program) + let mut cursor = AstCursor::new(&ast); + cursor.goto_first_child(); + let assignment_id = cursor.node().id(); + + let mut captures = yeast::captures::Captures::new(); + let matched = query.do_match(&ast, assignment_id, &mut captures).unwrap(); + assert!(matched); + assert_eq!(captures.get_all("names").len(), 3); +} + +// ---- Tree builder tests ---- + +#[test] +fn test_tree_builder() { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let mut ast = runner.run("x = 1").unwrap(); + let input = "x = 1"; + + let query = yeast::query!( + (program + child: (assignment + left: (_) @left + right: (_) @right + ) + ) + ); + + let mut captures = yeast::captures::Captures::new(); + query.do_match(&ast, ast.get_root(), &mut captures).unwrap(); + + // Swap left and right + let fresh = yeast::tree_builder::FreshScope::new(); + let mut ctx = yeast::build::BuildCtx::new(&mut ast, &captures, &fresh); + let new_id = yeast::tree!(ctx, + (program + child: (assignment + left: {ctx.capture("right")} + right: {ctx.capture("left")} + ) + ) + ); + + let dump = dump_ast(ctx.ast, new_id, input); + assert_dump_eq( + &dump, + r#" + program + assignment + left: integer "1" + right: identifier "x" + "#, + ); +} + +// ---- Rule tests ---- + +// These rules use field names from node-types.yml, which extends the +// tree-sitter-ruby grammar with named fields for nodes that only have +// unnamed children in tree-sitter (e.g. block_body.stmt, block_parameters.parameter). 
+fn ruby_rules() -> Vec { + let assign_rule = yeast::rule!( + (assignment + left: (left_assignment_list + (identifier)* @left + ) + right: (_) @right + ) + => + (assignment + left: (identifier $tmp) + right: {right} + ) + {..left.iter().enumerate().map(|(i, &lhs)| + yeast::tree!( + (assignment + left: {lhs} + right: (element_reference + object: (identifier $tmp) + index: (integer #{i}) + ) + ) + ) + )} + ); + + let for_rule = yeast::rule!( + (for + pattern: (_) @pat + value: (in (_) @val) + body: (do (_)* @body) + ) + => + (call + receiver: {val} + method: (identifier "each") + block: (block + parameters: (block_parameters + parameter: (identifier $tmp) + ) + body: (block_body + stmt: (assignment + left: {pat} + right: (identifier $tmp) + ) + stmt: {..body} + ) + ) + ) + ); + + vec![assign_rule, for_rule] +} + +#[test] +fn test_desugar_multiple_assignment() { + let dump = run_and_dump("x, y = e", ruby_rules()); + assert_dump_eq( + &dump, + r#" + program + assignment + left: identifier "$tmp-0" + right: identifier "e" + assignment + left: identifier "x" + right: + element_reference + object: identifier "$tmp-0" + index: integer "0" + assignment + left: identifier "y" + right: + element_reference + object: identifier "$tmp-0" + index: integer "1" + "#, + ); +} + +#[test] +fn test_desugar_for_loop() { + let dump = run_and_dump("for x in list do\n y\nend", ruby_rules()); + assert_dump_eq( + &dump, + r#" + program + call + block: + block + body: + block_body + stmt: + assignment + left: identifier "x" + right: identifier "$tmp-0" + identifier "y" + parameters: + block_parameters + parameter: identifier "$tmp-0" + method: identifier "each" + receiver: identifier "list" + "#, + ); +} + +#[test] +fn test_shorthand_rule() { + let rule = yeast::rule!( + (assignment + left: (_) @method + right: (_) @receiver + ) + => call + ); + + let dump = run_and_dump("x = 1", vec![rule]); + assert_dump_eq( + &dump, + r#" + program + call + method: identifier "x" + receiver: integer "1" + 
"#, + ); +} + +#[test] +fn test_chained_rules_output_only_kind() { + // Exercise rule chaining where an intermediate kind exists only in the + // output schema (not in the input tree-sitter grammar): + // assignment → first_node (input → output-only) + // first_node → second_node (output-only → output-only) + // The matcher must look up `first_node` against the schema, which only + // knows about it via the YAML node-types file. + let assignment_to_first = yeast::rule!( + (assignment + left: (_) @left + right: (_) @right + ) + => first_node + ); + let first_to_second = yeast::rule!( + (first_node + left: (_) @left + right: (_) @right + ) + => second_node + ); + + let dump = run_and_dump("x = 1", vec![assignment_to_first, first_to_second]); + assert_dump_eq( + &dump, + r#" + program + second_node + left: identifier "x" + right: integer "1" + "#, + ); +} + +// ---- Cursor tests ---- + +#[test] +fn test_cursor_navigation() { + let runner = Runner::new(tree_sitter_ruby::LANGUAGE.into(), &[]); + let ast = runner.run("x = 1").unwrap(); + let mut cursor = AstCursor::new(&ast); + + // Start at root + assert_eq!(cursor.node().kind(), "program"); + + // Go to first child (assignment) + assert!(cursor.goto_first_child()); + assert_eq!(cursor.node().kind(), "assignment"); + + // No sibling + assert!(!cursor.goto_next_sibling()); + + // Go to first child of assignment + assert!(cursor.goto_first_child()); + assert!(cursor.node().is_named()); + + // Go back up + assert!(cursor.goto_parent()); + assert_eq!(cursor.node().kind(), "assignment"); + + assert!(cursor.goto_parent()); + assert_eq!(cursor.node().kind(), "program"); + + // Can't go further up + assert!(!cursor.goto_parent()); +} + +#[test] +fn test_desugar_for_with_multiple_assignment() { + let dump = run_and_dump("for a, b in list do\n x\nend", ruby_rules()); + assert_dump_eq( + &dump, + r#" + program + call + block: + block + body: + block_body + stmt: + assignment + left: identifier "$tmp-1" + right: identifier "$tmp-0" 
+ assignment + left: identifier "a" + right: + element_reference + object: identifier "$tmp-1" + index: integer "0" + assignment + left: identifier "b" + right: + element_reference + object: identifier "$tmp-1" + index: integer "1" + identifier "x" + parameters: + block_parameters + parameter: identifier "$tmp-0" + method: identifier "each" + receiver: identifier "list" + "#, + ); +}